diff --git a/Makefile b/Makefile index 5cf96bf10..da17c1fea 100644 --- a/Makefile +++ b/Makefile @@ -178,11 +178,19 @@ openocd.checkout: fi openocd.build: openocd.checkout - cd utils/openocd && ./bootstrap && ./configure --enable-jtag_dpi --prefix=$(INSTALL_DIR)/openocd && make && make install + cd utils/openocd && ./bootstrap && ./configure --enable-jtag_dpi --prefix=$(INSTALL_DIR)/openocd && $(MAKE) && $(MAKE) install openocd.clean: rm -rf $(INSTALL_DIR)/openocd tools/openocd +PROFILER_V2_DIR = $(GAP_SDK_HOME)/tools/profiler_v2 +PROFILER_V2_BUILD_DIR = $(GAP_SDK_HOME)/build/profiler_v2 + +profiler_v2: + cmake -S $(PROFILER_V2_DIR) -B $(PROFILER_V2_BUILD_DIR) + cmake --build $(PROFILER_V2_BUILD_DIR) + cmake --install $(PROFILER_V2_BUILD_DIR) --prefix $(INSTALL_DIR) + profiler: $(MAKE) -C tools/profiler all mkdir -p $(INSTALL_DIR)/bin diff --git a/configs/common.sh b/configs/common.sh index c030d35a7..ce8f600f5 100644 --- a/configs/common.sh +++ b/configs/common.sh @@ -76,7 +76,8 @@ export PYTHONPATH=$GAP_SDK_HOME/gvsoc/gvsoc/engine/python:$PYTHONPATH export PATH="$GAP_SDK_HOME/utils/gaptest":$PATH # Audio framework -export PYTHONPATH=$GAP_SDK_HOME/tools/audio-framework/frontends/python_graph_generator:$GAP_SDK_HOME/tools/audio-framework/components:$PYTHONPATH +export GAP_AUDIO_FRAMEWORK_HOME=$GAP_SDK_HOME/tools/audio-framework +export PYTHONPATH=$GAP_AUDIO_FRAMEWORK_HOME/frontends/python_graph_generator:$GAP_AUDIO_FRAMEWORK_HOME/components:$PYTHONPATH # Autotiler diff --git a/configs/gapuino_v3.sh b/configs/gapuino_v3.sh index 2db43eac9..ff20523ca 100644 --- a/configs/gapuino_v3.sh +++ b/configs/gapuino_v3.sh @@ -23,4 +23,6 @@ export OPENOCD_CABLE=interface/ftdi/gapuino_ftdi.cfg export GAPY_TARGET=gapuino_v3 +export PLPTEST_DEFAULT_PROPERTIES="chip=gap8_v3 chip_family=gap8 board=gapuino_v3 duration=50 test_duration=50" + source $GAP_SDK_HOME/configs/common.sh diff --git a/doc/conf.py b/doc/conf.py index f9bc1c7da..d672e4da7 100644 --- a/doc/conf.py +++ 
b/doc/conf.py @@ -35,6 +35,7 @@ def configure_doxyfile(file_in, file_out, replace_dict): "../rtos/pmsis/pmsis_api/include/pmsis/rtos/", "../rtos/pmsis/pmsis_api/include/pmsis/cluster/", "../rtos/pmsis/pmsis_api/include/pmsis/platforms/", + "../rtos/pmsis/pmsis_api/include/pmsis/", "../rtos/pmsis/pmsis_bsp/include/", "source/reference/builtins/headers/", ] @@ -86,6 +87,10 @@ def configure_doxyfile(file_in, file_out, replace_dict): html_theme = "sphinx_rtd_theme" html_logo = "_static/logo.png" +html_theme_options = { + 'navigation_depth' : -1, +} + # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". diff --git a/examples/autotiler/FFTL1/FFTRunTest.c b/examples/autotiler/FFTL1/FFTRunTest.c index b8e892448..340333255 100644 --- a/examples/autotiler/FFTL1/FFTRunTest.c +++ b/examples/autotiler/FFTL1/FFTRunTest.c @@ -4,6 +4,12 @@ #define pmsis_exit(a) exit(a) #endif +#ifndef SILENT + #define PRINTF printf +#else + #define PRINTF(...) 
((void) 0) +#endif + #define __XSTR(__s) __STR(__s) #define __STR(__s) #__s #include @@ -21,7 +27,8 @@ #endif #define STACK_SIZE 2048 typedef void (*FFTFun_T )(void *Data, void *Twiddles, signed char *shift, unsigned int Nfft, unsigned int Inverse); - +PI_L2 int PERF_ARR[6][3][2]; +PI_L2 float MSE_ARR[6][2]; short int *InBuff_q16; float *InBuff_f32, *InBuff_f32R4, *OutBuff_f32; @@ -58,14 +65,14 @@ float MSE_f32(float* real, float* calc, int Size){ void CallFFT(int Nfft, int Type){ // FFT: reset buffers, run and check mse - int start, elapsed, elapsedFFT, Q; + int start, elapsed, elapsedFFT, Q = 0; FFT_InstallArg_T ArgIns; FFT_Arg_T FFTArg; AT_L2_EVENT DmaR_Evt1; - void (*FFTFun)(FFT_Arg_T*); - void (*SwapFun)(SwapSamples_Arg_T*); + void (*FFTFun)(FFT_Arg_T*) = 0; + void (*SwapFun)(SwapSamples_Arg_T*) = 0; char *FFTDataType = 0; - void *InBuff; + void *InBuff = 0; ArgIns.Nfft = Nfft; ArgIns.Radix = ((Nfft)==64 || (Nfft)==256 || (Nfft)==1024)?4:2; @@ -96,11 +103,11 @@ void CallFFT(int Nfft, int Type){ FFTDataType = "Q16"; switch (Nfft) { case 64: ArgIns.Twiddles = R4_Twiddles_fix_64; ArgIns.SwapLUT = R4_SwapTable_fix_64; Q = 10; break; - case 128: ArgIns.Twiddles = R2_Twiddles_fix_128; ArgIns.SwapLUT = R2_SwapTable_fix_128; Q = 8; break; - case 256: ArgIns.Twiddles = R4_Twiddles_fix_256; ArgIns.SwapLUT = R4_SwapTable_fix_256; Q = 8; break; - case 512: ArgIns.Twiddles = R2_Twiddles_fix_512; ArgIns.SwapLUT = R2_SwapTable_fix_512; Q = 6; break; - case 1024: ArgIns.Twiddles = R4_Twiddles_fix_1024; ArgIns.SwapLUT = R4_SwapTable_fix_1024; Q = 6; break; - case 2048: ArgIns.Twiddles = R2_Twiddles_fix_2048; ArgIns.SwapLUT = R2_SwapTable_fix_2048; Q = 4; break; + case 128: ArgIns.Twiddles = R2_Twiddles_fix_128; ArgIns.SwapLUT = R2_SwapTable_fix_128; Q = 7; break; + case 256: ArgIns.Twiddles = R4_Twiddles_fix_256; ArgIns.SwapLUT = R4_SwapTable_fix_256; Q = 6; break; + case 512: ArgIns.Twiddles = R2_Twiddles_fix_512; ArgIns.SwapLUT = R2_SwapTable_fix_512; Q = 5; break; + case 
1024: ArgIns.Twiddles = R4_Twiddles_fix_1024; ArgIns.SwapLUT = R4_SwapTable_fix_1024; Q = 4; break; + case 2048: ArgIns.Twiddles = R2_Twiddles_fix_2048; ArgIns.SwapLUT = R2_SwapTable_fix_2048; Q = 3; break; } if (ArgIns.Radix == 2) FFTFun = &Radix2FFT_DIF_Par_Fix16; else FFTFun = &Radix4FFT_DIF_Par_Fix16; @@ -139,24 +146,32 @@ void CallFFT(int Nfft, int Type){ __CALL((*FFTFun), &FFTArg); AT_FORK(gap_ncore(), (void *) (*SwapFun), (void *) &SwapArg); __CALL((*SwapFun), &SwapArg); - elapsed = gap_cl_readhwtimer() - start; printf("| %4d | %3s %6s | %6d | %5d | %6d", Nfft, FFTDataType, ArgIns.Radix==2?"Radix2":"Radix4", elapsedFFT, elapsed, elapsed+elapsedFFT); + elapsed = gap_cl_readhwtimer() - start; + + PERF_ARR[Nfft/128][Type][0] = elapsedFFT; + PERF_ARR[Nfft/128][Type][1] = elapsed; + + + PRINTF("| %4d | %3s %6s | %6d | %5d | %6d", Nfft, FFTDataType, ArgIns.Radix==2?"Radix2":"Radix4", elapsedFFT, elapsed, elapsed+elapsedFFT); #if !defined(__EMUL__) && defined(PERF_ALL) - printf(" | %7d | %7d | %7d | %8d | %7d |", pi_perf_read(PI_PERF_INSTR), pi_perf_read(PI_PERF_ACTIVE_CYCLES), pi_perf_read(PI_PERF_TCDM_CONT), pi_perf_read(PI_PERF_LD_STALL), pi_perf_read(PI_PERF_IMISS)); + PRINTF(" | %7d | %7d | %7d | %8d | %7d |", pi_perf_read(PI_PERF_INSTR), pi_perf_read(PI_PERF_ACTIVE_CYCLES), pi_perf_read(PI_PERF_TCDM_CONT), pi_perf_read(PI_PERF_LD_STALL), pi_perf_read(PI_PERF_IMISS)); #else - printf(" | | | | | |"); + PRINTF(" | | | | | |"); #endif if (Type == 0) { - printf(" |\n"); - // printf("\nOutFFT%d_f32 = np.array([\n", Nfft); for(int i=0;i<(Nfft); i++) printf("%f%+fj, ", InBuff_f32[2*i], InBuff_f32[2*i+1]); printf("])\n"); + PRINTF(" |\n"); + // PRINTF("\nOutFFT%d_f32 = np.array([\n", Nfft); for(int i=0;i<(Nfft); i++) PRINTF("%f%+fj, ", InBuff_f32[2*i], InBuff_f32[2*i+1]); PRINTF("])\n"); } else if (Type == 1) { - // printf("\nOutFFT%d_q16 = np.array([\n", Nfft); for(int i=0;i<(Nfft); i++) printf("%d%+dj, ", ((short int*)InBuff_q16)[2*i], ((short 
int*)InBuff_q16)[2*i+1]); printf("])\n"); - printf(" %f |\n", MSE_16(InBuff_f32, (short int*) InBuff_q16, Nfft, Q)); + // PRINTF("\nOutFFT%d_q16 = np.array([\n", Nfft); for(int i=0;i<(Nfft); i++) PRINTF("%d%+dj, ", ((short int*)InBuff_q16)[2*i], ((short int*)InBuff_q16)[2*i+1]); PRINTF("])\n"); + MSE_ARR[Nfft/128][0] = MSE_16(InBuff_f32, (short int*) InBuff_q16, Nfft, Q); + PRINTF(" %f |\n", MSE_ARR[Nfft/128][0]); } else if (Type == 2) { #ifdef __gap9__ - // printf("\nOutFFT%d_f16 = np.array([\n", Nfft); for(int i=0;i<(Nfft); i++) printf("%f%+fj, ", ((f16*)OutBuff)[2*i], ((f16*)OutBuff)[2*i+1]); printf("])\n"); - printf(" %f |\n", MSE_f16(InBuff_f32, (f16 *) InBuff_f16, Nfft)); + // PRINTF("\nOutFFT%d_f16 = np.array([\n", Nfft); for(int i=0;i<(Nfft); i++) PRINTF("%f%+fj, ", ((f16*)OutBuff)[2*i], ((f16*)OutBuff)[2*i+1]); PRINTF("])\n"); + MSE_ARR[Nfft/128][1] = MSE_f16(InBuff_f32, (f16 *) InBuff_f16, Nfft); + PRINTF(" %f |\n", MSE_ARR[Nfft/128][1]); #else - printf("\n"); + PRINTF("\n"); #endif } } @@ -171,31 +186,31 @@ static void RunFFT() #endif gap_cl_resethwtimer(); int start, elapsed, timef32; - printf("Initializing inputs....\n"); + PRINTF("Initializing inputs....\n"); //InitData4 (InDataQ16, MAXDIM, 37, 15, 23, 73, 0.1, 0.5, 0.6, 0.8); //InitData4_float(InDataf32, MAXDIM, 37, 15, 23, 73, 0.1, 0.5, 0.6, 0.8); #ifdef __gap9__ for (int i=0; i 0.016) { + printf("Error: MSE too large for %d FFT Q16\n", FFTBins); + printf("Test FAILED\n"); + pmsis_exit(-1); + } + #ifdef __gap9__ + if (MSE_ARR[FFTBins/128][1] > 0.000048) { + printf("Error: MSE too large for %d FFT F16\n", FFTBins); + printf("Test FAILED\n"); + pmsis_exit(-1); + } + #endif + FFTBins *= 2; + } + + printf("Test PASSED\n"); pmsis_exit(0); } diff --git a/examples/autotiler/FFTL1/Makefile b/examples/autotiler/FFTL1/Makefile index 56a7a8a6a..f9886f037 100644 --- a/examples/autotiler/FFTL1/Makefile +++ b/examples/autotiler/FFTL1/Makefile @@ -1,7 +1,7 @@ # User Test #------------------------------------ 
-PMSIS_OS?=pulpos +#PMSIS_OS?=pulpos APP = test APP_SRCS += FFTRunTest.c $(AT_HOME)/DSP_Libraries/FFT_Library.c $(AT_HOME)/DSP_Libraries/LUT_Tables/TwiddlesDef.c $(AT_HOME)/DSP_Libraries/LUT_Tables/SwapTablesDef.c APP_INC += diff --git a/examples/pmsis/periph/i2c/i2c_scan/Makefile b/examples/pmsis/periph/i2c/i2c_scan/Makefile index c210b96ed..ad43981fc 100644 --- a/examples/pmsis/periph/i2c/i2c_scan/Makefile +++ b/examples/pmsis/periph/i2c/i2c_scan/Makefile @@ -7,14 +7,11 @@ APP_INC += APP_CFLAGS += -runner_args =--trace=corruptor --trace-level=trace -#runner_args =--trace=board.*i2c --trace-level=trace -#runner_args =--trace=board.*i2c:gvsoc.log --trace-level=trace -#runner_args =--trace=eeprom -#runner_args +=--trace-level=trace +ifeq '$(platform)' 'gvsoc' # Overwrite the default target so that GVSOC simulates our board # First name is the class name, second one is the python module export GAPY_PY_TARGET=My_board@my_board +endif # Append current directory to python path so that it finds our board and module export PYTHONPATH:=$(CURDIR):$(PYTHONPATH) diff --git a/gvsoc/gvsoc/bin/pulp-pc-info b/gvsoc/gvsoc/bin/pulp-pc-info index 62bff025e..ab816b189 100755 --- a/gvsoc/gvsoc/bin/pulp-pc-info +++ b/gvsoc/gvsoc/bin/pulp-pc-info @@ -63,12 +63,15 @@ toolchain = os.environ.get('PULP_RISCV_GCC_TOOLCHAIN_CI') if toolchain is None: toolchain = os.environ.get('PULP_RISCV_GCC_TOOLCHAIN') -if toolchain is not None: - readelf = toolchain + '/bin/riscv32-unknown-elf-readelf' - addr2line = toolchain + '/bin/riscv32-unknown-elf-addr2line' -else: - readelf = 'riscv32-unknown-elf-readelf' - addr2line = 'riscv32-unknown-elf-addr2line' +# if toolchain is not None: +# readelf = toolchain + '/bin/riscv32-unknown-elf-readelf' +# addr2line = toolchain + '/bin/riscv32-unknown-elf-addr2line' +# else: +# readelf = 'riscv32-unknown-elf-readelf' +# addr2line = 'riscv32-unknown-elf-addr2line' + +readelf = 'readelf' +addr2line = 'addr2line' process = Popen((readelf + ' -s %s' % 
args.file).split(), stdin=PIPE, stdout=PIPE) diff --git a/gvsoc/gvsoc/engine/include/gv/power.hpp b/gvsoc/gvsoc/engine/include/gv/power.hpp index df4853d35..2f43e6049 100644 --- a/gvsoc/gvsoc/engine/include/gv/power.hpp +++ b/gvsoc/gvsoc/engine/include/gv/power.hpp @@ -148,9 +148,41 @@ namespace vp */ void setup(double temp, double volt, double freq); + /** + * @brief Turn on a power source + * + * This power source should be turned on when its power domain is turned on, in order to start consuming power + */ + void turn_on(); + + /** + * @brief Turn off a power source + * + * This power source should be turned off when its power domain is turned off, in order to stop consuming power + */ + void turn_off(); + + /** + * @brief Turn on a power source + * + * This power source should be turned on when its power domain is turned on, in order to start consuming power + */ + void turn_dynamic_power_on(); + + /** + * @brief Turn off a power source + * + * This power source should be turned off when its power domain is turned off, in order to stop consuming power + */ + void turn_dynamic_power_off(); + private: - Linear_table *table = NULL; // Table of power values for all supported temperatures and voltages - // imported from the json configuration given when trace was initialized. + void check(); + + Linear_table *dyn_table = NULL; // Table of power values for all supported temperatures and voltages + // imported from the json configuration given when trace was initialized. + Linear_table *leakage_table = NULL; // Table of power values for all supported temperatures and voltages + // imported from the json configuration given when trace was initialized. double quantum; // Current quantumm of energy, for quantum-based power consumption. // The current value is estimated depending on voltage and temperature according // to the provided json configuration. @@ -162,7 +194,12 @@ namespace vp // to the provided json configuration. 
component *top; // Top component containing the power source power_trace *trace; // Power trace where the power consumption should be reported. - bool is_on = false; // True is the source is on and backgroun-power and leakage should be reported + bool is_dynamic_power_started = false; // True is the source consuming dynamic backgroun power + bool is_leakage_power_started = false; // True is the source should start consuming leakage power + bool is_on = false; // True is the power domain containing the power source is on and backgroun-power and leakage should be reported + bool is_dynamic_power_on = false; // True is the power domain containing the power source is on and backgroun-power and leakage should be reported + bool dynamic_power_is_on_sync = false; + bool leakage_power_is_on_sync = false; }; @@ -327,12 +364,16 @@ namespace vp // power consumed. void account_leakage_power(); - // Check if the current amount of cycle energy is not for the current cycle + // Check if the current amount of power due to quantum of energies + // is not for the current cycle // (by checking the timestamp), and if not, reset it to zero. - inline void flush_dynamic_energy_for_cycle(); + inline void flush_quantum_power_for_cycle(); - // Get the amount of energy spent in the current cycle - inline double get_dynamic_energy_for_cycle(); + // Get the average power of the current cycle due to quantums of energy + inline double get_quantum_power_for_cycle(); + + // Get the energy spent in the current cycle due to quantums of energy + inline double get_quantum_energy_for_cycle(); // Return the total amount of dynamic energy spent since the beginning // of the report windows (since report_start was called) @@ -363,7 +404,7 @@ namespace vp int64_t curent_cycle_timestamp; // Timestamp of the current cycle, used to compute energy spent in the // current cycle. As soon as current time is different, the timestamp // is set to current time and the current energy is set to 0. 
- double dynamic_energy_for_cycle; // Amount of energy spent in the current cycle. + double quantum_power_for_cycle; // Power spent by quentum of energy in the current cycle. // It is increased everytime a quantum of energy is // spent and reset to zero when the current cycle is // over. It is mostly used to compute the instant power @@ -468,6 +509,15 @@ namespace vp */ vp::power::power_trace *get_power_trace() { return &this->power_trace; } + /** + * @brief Set power supply state + * + * This sets the power supply for this component and all his childs. + * + * @param state Supply state + */ + void power_supply_set_all(int state); + protected: /** * @brief Get the report energy from childs object @@ -516,10 +566,15 @@ namespace vp // Get instant power for this component and the whole hierarchy below him. double get_power_from_self_and_childs(); + // Set power supply state + static void power_supply_sync(void *_this, int state); + component ⊤ // Component containing the power component object vp::power::power_trace power_trace; // Default power trace of this component std::vector traces; // Vector of power traces of this component + std::vector sources; // Vector of power sources of this component power::engine *engine = NULL; // Power engine + vp::wire_slave power_port; // Slave port for setting power supply state }; @@ -542,6 +597,8 @@ namespace vp */ engine(vp::component *top); + ~engine(); + /** * @brief Start power report generation * @@ -570,6 +627,8 @@ namespace vp std::vector traces; // Vector of all traces. 
vp::component *top; // Top component of the simulated architecture + + FILE *file; // File where the power reports are dumped }; }; diff --git a/gvsoc/gvsoc/engine/include/vp/component.hpp b/gvsoc/gvsoc/engine/include/vp/component.hpp index dd336a587..44b05070c 100644 --- a/gvsoc/gvsoc/engine/include/vp/component.hpp +++ b/gvsoc/gvsoc/engine/include/vp/component.hpp @@ -397,6 +397,7 @@ namespace vp { { friend class component_clock; + friend class vp::power::component_power; public: component(js::config *config); @@ -410,6 +411,7 @@ namespace vp { virtual void quit(int status) {} virtual void pre_reset() {} virtual void reset(bool active) {} + virtual void power_supply_set(int state) {} virtual void load() {} virtual void elab(); virtual void run() {} @@ -568,6 +570,14 @@ namespace vp { vp::component *__gv_create(std::string config_path, struct gv_conf *gv_conf); + class top + { + public: + component *top_instance; + power::engine *power_engine; + private: + }; + }; #endif diff --git a/gvsoc/gvsoc/engine/include/vp/power/power_source.hpp b/gvsoc/gvsoc/engine/include/vp/power/power_source.hpp index 10bb99434..bcea94c18 100644 --- a/gvsoc/gvsoc/engine/include/vp/power/power_source.hpp +++ b/gvsoc/gvsoc/engine/include/vp/power/power_source.hpp @@ -25,51 +25,78 @@ #include "vp/vp_data.hpp" +inline void vp::power::power_source::turn_on() +{ + this->is_on = true; + this->is_dynamic_power_on = true; + this->check(); +} + + +inline void vp::power::power_source::turn_off() +{ + this->is_on = false; + this->is_dynamic_power_on = false; + this->check(); +} + +inline void vp::power::power_source::turn_dynamic_power_on() +{ + this->is_dynamic_power_on = true; + this->check(); +} + +inline void vp::power::power_source::turn_dynamic_power_off() +{ + this->is_dynamic_power_on = false; + this->check(); +} + inline void vp::power::power_source::leakage_power_start() { - // Only start accounting leakage if not already done and if leakage is defined - if (!this->is_on && this->leakage 
!= -1) + // Only start if leakage is defined + if (this->leakage != -1) { - this->trace->inc_leakage_power(this->leakage); + this->is_leakage_power_started = true; + this->check(); } - this->is_on = true; } inline void vp::power::power_source::leakage_power_stop() { - // Only stop accounting leakage if not already done and if leakage is defined - if (this->is_on && this->leakage != -1) + // Only stop if leakage is defined + if (this->leakage != -1) { - this->trace->inc_leakage_power(-this->leakage); + this->is_leakage_power_started = false; + this->check(); } - this->is_on = false; } inline void vp::power::power_source::dynamic_power_start() { - // Only start accounting background power if not already done and if it is is defined - if (!this->is_on && this->background_power != -1) + // Only start accounting background power if it is is defined + if (this->background_power != -1) { - this->trace->inc_dynamic_power(this->background_power); + this->is_dynamic_power_started = true; + this->check(); } - this->is_on = true; } inline void vp::power::power_source::dynamic_power_stop() { - // Only stop accounting background power if not already done and if it is is defined - if (this->is_on && this->background_power != -1) + // Only stop accounting background power if it is is defined + if (this->background_power != -1) { - this->trace->inc_dynamic_power(-this->background_power); + this->is_dynamic_power_started = false; + this->check(); } - this->is_on = false; } @@ -77,7 +104,7 @@ inline void vp::power::power_source::dynamic_power_stop() inline void vp::power::power_source::account_energy_quantum() { // Only account energy is a quantum is defined - if (this->quantum != -1) + if (this->is_on && this->is_dynamic_power_on && this->quantum != -1) { this->trace->inc_dynamic_energy(this->quantum); } diff --git a/gvsoc/gvsoc/engine/include/vp/power/power_trace.hpp b/gvsoc/gvsoc/engine/include/vp/power/power_trace.hpp index 6d833599b..fe33eaa79 100644 --- 
a/gvsoc/gvsoc/engine/include/vp/power/power_trace.hpp +++ b/gvsoc/gvsoc/engine/include/vp/power/power_trace.hpp @@ -32,25 +32,43 @@ inline double vp::power::power_trace::get_power() -inline double vp::power::power_trace::get_dynamic_energy_for_cycle() +inline double vp::power::power_trace::get_quantum_power_for_cycle() { // First check if the current energy is for an old cycle - this->flush_dynamic_energy_for_cycle(); + this->flush_quantum_power_for_cycle(); // And return the current total - return this->dynamic_energy_for_cycle; + return this->quantum_power_for_cycle; } +inline double vp::power::power_trace::get_quantum_energy_for_cycle() +{ + double power = this->get_quantum_power_for_cycle(); -inline void vp::power::power_trace::flush_dynamic_energy_for_cycle() + if (power != 0) + { + return power * this->top->get_period(); + } + + return 0; +} + + + +inline void vp::power::power_trace::flush_quantum_power_for_cycle() { // Clear the current total if it is not for the current cycle - if (this->curent_cycle_timestamp < this->top->get_time()) + if (this->quantum_power_for_cycle && this->curent_cycle_timestamp < this->top->get_time()) { - this->curent_cycle_timestamp = this->top->get_time(); - this->dynamic_energy_for_cycle = 0; + if (this->parent) + { + this->parent->inc_dynamic_power(-this->quantum_power_for_cycle); + } + this->quantum_power_for_cycle = 0; } + + this->curent_cycle_timestamp = this->top->get_time(); } diff --git a/gvsoc/gvsoc/engine/include/vp/trace/event_dumper.hpp b/gvsoc/gvsoc/engine/include/vp/trace/event_dumper.hpp index 4f5da18ea..6340ba80d 100644 --- a/gvsoc/gvsoc/engine/include/vp/trace/event_dumper.hpp +++ b/gvsoc/gvsoc/engine/include/vp/trace/event_dumper.hpp @@ -71,7 +71,7 @@ namespace vp { class Event_dumper { public: - Event_dumper(vp::component *comp) : comp(comp) {} + Event_dumper(vp::component *comp) : comp(comp) { this->user_vcd = NULL; } Event_trace *get_trace(string trace_name, string file_name, int width, bool is_real=false, 
bool is_string=false); Event_trace *get_trace_real(string trace_name, string file_name); Event_trace *get_trace_string(string trace_name, string file_name); @@ -83,6 +83,7 @@ namespace vp { private: std::map event_traces; std::map event_files; + gv::Vcd_user *user_vcd; }; class Vcd_file : public Event_file diff --git a/gvsoc/gvsoc/engine/python/gv/gvsoc.py b/gvsoc/gvsoc/engine/python/gv/gvsoc.py index 04f900465..d711756d8 100644 --- a/gvsoc/gvsoc/engine/python/gv/gvsoc.py +++ b/gvsoc/gvsoc/engine/python/gv/gvsoc.py @@ -146,7 +146,8 @@ def conf(self): def __gen_debug_info(self, full_config, gvsoc_config): for binary in full_config.get('**/debug_binaries').get_dict(): - if os.system('pulp-pc-info --file %s --all-file %s' % (binary.replace('.debugInfo', ''), binary)) != 0: + if os.system('gen-debug-info %s %s' % (binary.replace('.debugInfo', ''), binary)) != 0: + # if os.system('pulp-pc-info --file %s --all-file %s' % (binary.replace('.debugInfo', ''), binary)) != 0: raise errors.InputError('Error while generating debug symbols information, make sure the toolchain and the binaries are accessible ') diff --git a/gvsoc/gvsoc/engine/src/launcher.cpp b/gvsoc/gvsoc/engine/src/launcher.cpp index 63908857c..1dc3b727a 100644 --- a/gvsoc/gvsoc/engine/src/launcher.cpp +++ b/gvsoc/gvsoc/engine/src/launcher.cpp @@ -43,6 +43,7 @@ class Gvsoc_launcher : public gv::Gvsoc private: + void *handler; vp::component *instance; }; @@ -53,19 +54,20 @@ gv::Gvsoc *gv::gvsoc_new() void Gvsoc_launcher::open(std::string config_path) { - this->instance = vp::__gv_create(config_path, NULL); + this->handler = vp::__gv_create(config_path, NULL); + this->instance = ((vp::top *)this->handler)->top_instance; - gv_start((void *)this->instance); + gv_start(this->handler); } void Gvsoc_launcher::close() { - gv_destroy((void *)this->instance); + gv_destroy(this->handler); } void Gvsoc_launcher::run() { - gv_step((void *)this->instance, 0); + gv_step(this->handler, 0); } int64_t Gvsoc_launcher::stop() @@ 
-76,7 +78,7 @@ int64_t Gvsoc_launcher::stop() int64_t Gvsoc_launcher::step(int64_t duration) { - gv_step((void *)this->instance, duration); + gv_step(this->handler, duration); return 0; } @@ -92,10 +94,10 @@ void Gvsoc_launcher::vcd_bind(gv::Vcd_user *user) void Gvsoc_launcher::event_add(std::string path, bool is_regex) { - + this->instance->traces.get_trace_manager()->conf_trace(1, path, 1); } void Gvsoc_launcher::event_exclude(std::string path, bool is_regex) { - + this->instance->traces.get_trace_manager()->conf_trace(1, path, 0); } diff --git a/gvsoc/gvsoc/engine/src/power/component_power.cpp b/gvsoc/gvsoc/engine/src/power/component_power.cpp index 310a00b7a..d8930b94c 100644 --- a/gvsoc/gvsoc/engine/src/power/component_power.cpp +++ b/gvsoc/gvsoc/engine/src/power/component_power.cpp @@ -41,6 +41,9 @@ void vp::power::component_power::build() { this->get_engine()->reg_trace(trace); } + + this->power_port.set_sync_meth(&vp::power::component_power::power_supply_sync); + this->top.new_slave_port(this, "power_supply", &this->power_port); } @@ -69,6 +72,8 @@ int vp::power::component_power::new_power_source(std::string name, power_source source->setup(VP_POWER_DEFAULT_TEMP, VP_POWER_DEFAULT_VOLT, VP_POWER_DEFAULT_FREQ); + this->sources.push_back(source); + return 0; } @@ -150,3 +155,51 @@ void vp::power::component_power::dump_child_traces(FILE *file, double total) x->power.dump(file, total); } } + +void vp::power::component_power::power_supply_sync(void *__this, int state) +{ + vp::power::component_power *_this = (vp::power::component_power *)__this; + _this->power_supply_set_all(state); +} + + +void vp::power::component_power::power_supply_set_all(int state) +{ + this->top.power_supply_set(state); + + for (auto &x : this->top.childs) + { + x->power.power_supply_set_all(state); + } + + + + if (state >= 2) + { + for (auto &x : this->sources) + { + if (state == 3) + { + x->turn_dynamic_power_on(); + } + else + { + x->turn_dynamic_power_off(); + } + } + } + else + { + 
for (auto &x : this->sources) + { + if (state == 1) + { + x->turn_on(); + } + else + { + x->turn_off(); + } + } + } +} diff --git a/gvsoc/gvsoc/engine/src/power/power_engine.cpp b/gvsoc/gvsoc/engine/src/power/power_engine.cpp index c7661f6a9..19652c069 100644 --- a/gvsoc/gvsoc/engine/src/power/power_engine.cpp +++ b/gvsoc/gvsoc/engine/src/power/power_engine.cpp @@ -46,14 +46,11 @@ void vp::power::engine::start_capture() void vp::power::engine::stop_capture() { // When stopping, dump recursively all traces to a file - FILE *file = fopen("power_report.csv", "w"); - if (file == NULL) + + if (this->file) { - // vp_warning_always(&this->warning, "Failed to open power report file (path: %s)\n", "power_report.csv"); - return; + this->top->dump_traces_recursive(file); } - - this->top->dump_traces_recursive(file); } @@ -64,4 +61,19 @@ vp::power::engine::engine(vp::component *top) // Declare power service, each component will ask the connection to it top->new_service("power", this); + + this->file = fopen("power_report.csv", "w"); + if (this->file == NULL) + { + //vp_warning_always(&this->warning, "Failed to open power report file (path: %s)\n", "power_report.csv"); + } +} + + +vp::power::engine::~engine() +{ + if (this->file) + { + fclose(this->file); + } } diff --git a/gvsoc/gvsoc/engine/src/power/power_source.cpp b/gvsoc/gvsoc/engine/src/power/power_source.cpp index 579e27f07..c28ded02a 100644 --- a/gvsoc/gvsoc/engine/src/power/power_source.cpp +++ b/gvsoc/gvsoc/engine/src/power/power_source.cpp @@ -30,15 +30,15 @@ void vp::power::power_source::setup(double temp, double volt, double freq) // dynamic background power or leakage if they are defined, which is the case if they are not -1 if (this->quantum != -1) { - this->quantum = this->table->get(temp, volt, freq); + this->quantum = this->dyn_table->get(temp, volt, freq); } if (this->background_power != -1) { - this->background_power = this->table->get(temp, volt, freq); + this->background_power = this->dyn_table->get(temp, 
volt, freq); } if (this->leakage != -1) { - this->leakage = this->table->get(temp, volt, freq); + this->leakage = this->leakage_table->get(temp, volt, freq); } } @@ -130,7 +130,14 @@ int vp::power::power_source::init(component *top, std::string name, js::config * return -1; } - this->table = new Linear_table(values); + if (is_leakage) + { + this->leakage_table = new Linear_table(values); + } + else + { + this->dyn_table = new Linear_table(values); + } } else { @@ -147,3 +154,44 @@ int vp::power::power_source::init(component *top, std::string name, js::config * return 0; } + + +void vp::power::power_source::check() +{ + bool leakage_power_is_on = this->is_on && this->is_leakage_power_started; + bool dynamic_power_is_on = this->is_on && this->is_dynamic_power_on && this->is_dynamic_power_started; + + if (this->dynamic_power_is_on_sync != dynamic_power_is_on) + { + if (this->background_power) + { + if (dynamic_power_is_on) + { + this->trace->inc_dynamic_power(this->background_power); + } + else + { + this->trace->inc_dynamic_power(-this->background_power); + } + } + + this->dynamic_power_is_on_sync = dynamic_power_is_on; + } + + if (this->leakage_power_is_on_sync != leakage_power_is_on) + { + if (this->leakage) + { + if (leakage_power_is_on) + { + this->trace->inc_leakage_power(this->leakage); + } + else + { + this->trace->inc_leakage_power(-this->leakage); + } + } + + this->leakage_power_is_on_sync = leakage_power_is_on; + } +} diff --git a/gvsoc/gvsoc/engine/src/power/power_trace.cpp b/gvsoc/gvsoc/engine/src/power/power_trace.cpp index 3bda6a8b4..c5e5001b8 100644 --- a/gvsoc/gvsoc/engine/src/power/power_trace.cpp +++ b/gvsoc/gvsoc/engine/src/power/power_trace.cpp @@ -28,7 +28,7 @@ int vp::power::power_trace::init(component *top, std::string name, vp::power::po { this->top = top; top->traces.new_trace_event_real(name, &this->trace); - this->dynamic_energy_for_cycle = 0; + this->quantum_power_for_cycle = 0; this->report_dynamic_energy = 0; this->report_leakage_energy 
= 0; this->curent_cycle_timestamp = 0; @@ -37,7 +37,7 @@ int vp::power::power_trace::init(component *top, std::string name, vp::power::po if (parent == NULL) { vp::component *component = top->get_parent(); - if (component) + if (component && component->get_path() != "") { parent = component->power.get_power_trace(); } @@ -73,10 +73,13 @@ void vp::power::power_trace::trace_handler(void *__this, vp::clock_event *event) void vp::power::power_trace::report_start() { + this->account_dynamic_power(); + this->account_leakage_power(); + // Since the report start may be triggered in the middle of several events // for power consumptions, include what has already be accounted // in the same cycle. - this->report_dynamic_energy = this->get_dynamic_energy_for_cycle(); + this->report_dynamic_energy = this->get_quantum_energy_for_cycle(); this->report_leakage_energy = 0; this->report_start_timestamp = this->top->get_time(); } @@ -93,12 +96,9 @@ void vp::power::power_trace::get_report_energy(double *dynamic, double *leakage) void vp::power::power_trace::get_report_power(double *dynamic, double *leakage) { - double childs_dynamic = 0, childs_leakage = 0; - // To get the power on the report window, we just get the total energy and divide by the window duration - this->top->power.get_report_energy_from_childs(&childs_dynamic, &childs_leakage); - *dynamic = (childs_dynamic + this->get_report_dynamic_energy()) / (this->top->get_time() - this->report_start_timestamp); - *leakage = (childs_leakage + this->get_report_leakage_energy()) / (this->top->get_time() - this->report_start_timestamp); + *dynamic = (this->get_report_dynamic_energy()) / (this->top->get_time() - this->report_start_timestamp); + *leakage = (this->get_report_leakage_energy()) / (this->top->get_time() - this->report_start_timestamp); } @@ -127,43 +127,25 @@ void vp::power::power_trace::dump_vcd_trace() if (this->top->get_path() == "") return; - double power = 0.0; - // To dump the VCD trace, we need to compute the 
instant power, since this is what is reported. // This is easy for background and leakage power. For enery quantum, we get the amount of energy for the current // cycle and compute the instant power using the clock engine period. - // Some component do not have clocks. They cannot use energy quantum but they can still use background - // power and leakage - if (this->top->get_clock()) - { - int64_t period = this->top->get_period(); - if (period != 0) - { - power += this->get_dynamic_energy_for_cycle() / period; - } - } + double quantum_power = this->get_quantum_power_for_cycle(); double power_background = this->current_dynamic_power + this->current_leakage_power; // Also account the power from childs since VCD traces are hierarchical - double childs_power = this->top->power.get_power_from_childs(); - this->current_power = power + power_background + childs_power; + this->current_power = quantum_power + power_background; // Dump the instant power to trace this->trace.event_real(current_power); // If there was a contribution from energy quantum, schedule an event in the next cycle so that we dump again // the trace since teh quantum implicitely disappears and overal power is modified - if (!this->trace_event->is_enqueued() && power > 0) + if (!this->trace_event->is_enqueued() && quantum_power > 0) { this->top->event_enqueue(this->trace_event, 1); } - - // Notify the parent that this trace was dumped so that the upper traces can be dumped as well - if (this->parent) - { - this->parent->dump_vcd_trace(); - } } @@ -213,15 +195,26 @@ void vp::power::power_trace::account_leakage_power() void vp::power::power_trace::inc_dynamic_energy(double quantum) { + if (this->top->get_period() == 0) + { + return; + } + // Since we need to account the energy for the current amount of the cycle, check if it needs to be flushed - this->flush_dynamic_energy_for_cycle(); + this->flush_quantum_power_for_cycle(); // Then account it to both the total amount and to the cycle amount - 
this->dynamic_energy_for_cycle += quantum; + double power = quantum / this->top->get_period(); + this->quantum_power_for_cycle += power; this->report_dynamic_energy += quantum; - // Redump VCD trace since teh instant power is impacted + // Redump VCD trace since the instant power is impacted this->dump_vcd_trace(); + + if (this->parent) + { + this->parent->inc_dynamic_power(power); + } } @@ -235,14 +228,24 @@ void vp::power::power_trace::inc_dynamic_power(double power_incr) this->account_dynamic_power(); this->current_dynamic_power += power_incr; - // Redump VCD trace since teh instant power is impacted + // Redump VCD trace since the instant power is impacted this->dump_vcd_trace(); + + if (this->parent) + { + this->parent->inc_dynamic_power(power_incr); + } } void vp::power::power_trace::inc_leakage_power(double power_incr) { + // TODO this is wasting time and should be removed once fake component such as time domain and trace domain + // are not in the component hierarchy anymore + if (this->top->get_path() == "") + return; + // Leakage and dynamic are handled differently since they are reported separately, // In both cases, first compute the power on current period, start a new one, // and change the power so that it is constant over the period, to properly @@ -250,6 +253,11 @@ void vp::power::power_trace::inc_leakage_power(double power_incr) this->account_leakage_power(); this->current_leakage_power += power_incr; - // Redump VCD trace since teh instant power is impacted + // Redump VCD trace since the instant power is impacted this->dump_vcd_trace(); + + if (this->parent) + { + this->parent->inc_leakage_power(power_incr); + } } diff --git a/gvsoc/gvsoc/engine/src/trace/event.cpp b/gvsoc/gvsoc/engine/src/trace/event.cpp index ae0f0475a..579880226 100644 --- a/gvsoc/gvsoc/engine/src/trace/event.cpp +++ b/gvsoc/gvsoc/engine/src/trace/event.cpp @@ -102,6 +102,11 @@ vp::Event_trace *vp::Event_dumper::get_trace(string trace_name, string file_name trace = new 
Event_trace(trace_name, event_file, width, is_real, is_string); event_traces[trace_name] = trace; + + if (this->user_vcd) + { + trace->set_vcd_user(this->user_vcd); + } } return trace; @@ -135,6 +140,8 @@ void vp::Event_dumper::close() void vp::Event_dumper::set_vcd_user(gv::Vcd_user *user) { + this->user_vcd = user; + for (auto const& x : event_traces) { x.second->set_vcd_user(user); diff --git a/gvsoc/gvsoc/engine/src/vp.cpp b/gvsoc/gvsoc/engine/src/vp.cpp index 271e0078a..f733d8e3d 100644 --- a/gvsoc/gvsoc/engine/src/vp.cpp +++ b/gvsoc/gvsoc/engine/src/vp.cpp @@ -67,6 +67,7 @@ char vp_error[VP_ERROR_SIZE]; static Gv_proxy *proxy = NULL; + uint64_t vp::reg::get_field(int offset, int width) { uint64_t value = 0; @@ -329,6 +330,7 @@ void vp::component_clock::clk_reg(component *_this, component *clock) } } + void vp::component::reset_all(bool active, bool from_itf) { // Small hack to not propagate the reset from top level if the reset has @@ -1686,12 +1688,15 @@ vp::component *vp::__gv_create(std::string config_path, struct gv_conf *gv_conf) vp::component *instance = constructor(js_config); - new vp::power::engine(instance); + vp::top *top = new vp::top(); + + top->top_instance = instance; + top->power_engine = new vp::power::engine(instance); instance->set_vp_config(gv_config); instance->set_gv_conf(gv_conf); - return instance; + return (vp::component *)top; } @@ -1708,7 +1713,8 @@ extern "C" void gv_destroy(void *arg) extern "C" void gv_start(void *arg) { - vp::component *instance = (vp::component *)arg; + vp::top *top = (vp::top *)arg; + vp::component *instance = (vp::component *)top->top_instance; instance->pre_pre_build(); instance->pre_build(); @@ -1736,7 +1742,8 @@ extern "C" void gv_start(void *arg) extern "C" void gv_step(void *arg, int64_t timestamp) { - vp::component *instance = (vp::component *)arg; + vp::top *top = (vp::top *)arg; + vp::component *instance = (vp::component *)top->top_instance; instance->step(timestamp); } @@ -1744,7 +1751,8 @@ extern 
"C" void gv_step(void *arg, int64_t timestamp) extern "C" int64_t gv_time(void *arg) { - vp::component *instance = (vp::component *)arg; + vp::top *top = (vp::top *)arg; + vp::component *instance = (vp::component *)top->top_instance; return instance->get_time_engine()->get_next_event_time(); } @@ -1989,9 +1997,10 @@ vp::time_event *vp::time_scheduler::enqueue(time_event *event, int64_t time) -extern "C" int gv_run(void *_instance) +extern "C" int gv_run(void *arg) { - vp::component *instance = (vp::component *)_instance; + vp::top *top = (vp::top *)arg; + vp::component *instance = (vp::component *)top->top_instance; if (!proxy) { @@ -2014,9 +2023,10 @@ extern "C" void gv_init(struct gv_conf *gv_conf) } -extern "C" void gv_stop(void *_instance, int retval) +extern "C" void gv_stop(void *arg, int retval) { - vp::component *instance = (vp::component *)_instance; + vp::top *top = (vp::top *)arg; + vp::component *instance = (vp::component *)top->top_instance; if (proxy) { @@ -2024,6 +2034,8 @@ extern "C" void gv_stop(void *_instance, int retval) } instance->stop(); + + delete top->power_engine; } @@ -2059,6 +2071,7 @@ void vp::fatal(const char *fmt, ...) 
extern "C" void *gv_chip_pad_bind(void *handle, char *name, int ext_handle) { - vp::component *instance = (vp::component *)handle; + vp::top *top = (vp::top *)handle; + vp::component *instance = (vp::component *)top->top_instance; return instance->external_bind(name, "", (void *)(long)ext_handle); } diff --git a/gvsoc/gvsoc/engine/vp/trace_domain_impl.cpp b/gvsoc/gvsoc/engine/vp/trace_domain_impl.cpp index 76f4757b3..9cd9aa576 100644 --- a/gvsoc/gvsoc/engine/vp/trace_domain_impl.cpp +++ b/gvsoc/gvsoc/engine/vp/trace_domain_impl.cpp @@ -474,7 +474,7 @@ void trace_domain::conf_trace(int event, std::string path_str, bool enabled) if (trace != NULL) { if (event) - { + { if (enabled) { vp::Event_trace *event_trace; diff --git a/gvsoc/gvsoc/models/cpu/iss/vp/include/iss_wrapper.hpp b/gvsoc/gvsoc/models/cpu/iss/vp/include/iss_wrapper.hpp index f7e234e1c..9ae3c3fe4 100644 --- a/gvsoc/gvsoc/models/cpu/iss/vp/include/iss_wrapper.hpp +++ b/gvsoc/gvsoc/models/cpu/iss/vp/include/iss_wrapper.hpp @@ -113,6 +113,7 @@ class iss_wrapper : public vp::component, vp::Gdbserver_core vp::wire_slave irq_req_itf; vp::wire_master irq_ack_itf; + vp::wire_master busy_itf; vp::wire_master flush_cache_req_itf; vp::wire_slave flush_cache_ack_itf; @@ -142,8 +143,7 @@ class iss_wrapper : public vp::component, vp::Gdbserver_core vp::reg_1 do_step; std::vector insn_groups_power; - vp::power::power_source clock_gated_power; - vp::power::power_source leakage_power; + vp::power::power_source background_power; vp::trace state_event; vp::trace pc_trace_event; diff --git a/gvsoc/gvsoc/models/cpu/iss/vp/src/iss_wrapper.cpp b/gvsoc/gvsoc/models/cpu/iss/vp/src/iss_wrapper.cpp index 69df9a4e0..9825fa0df 100644 --- a/gvsoc/gvsoc/models/cpu/iss/vp/src/iss_wrapper.cpp +++ b/gvsoc/gvsoc/models/cpu/iss/vp/src/iss_wrapper.cpp @@ -287,6 +287,11 @@ void iss_wrapper::clock_sync(void *__this, bool active) _this->clock_active = active; + if (_this->busy_itf.is_bound()) + { + _this->busy_itf.sync(active); + } + // TODO 
this could be better handler is the clock would be taken into // account in the core state machine uint8_t value = active && _this->is_active_reg.get(); @@ -1330,8 +1335,7 @@ int iss_wrapper::build() this->insn_groups_power.resize(1); power.new_power_source("power_insn", &this->insn_groups_power[0], this->get_js_config()->get("**/insn")); } - power.new_power_source("power_clock_gated", &clock_gated_power, this->get_js_config()->get("**/clock_gated")); - power.new_power_source("leakage", &leakage_power, this->get_js_config()->get("**/leakage")); + power.new_power_source("background", &background_power, this->get_js_config()->get("**/power_models/background")); data.set_resp_meth(&iss_wrapper::data_response); data.set_grant_meth(&iss_wrapper::data_grant); @@ -1354,6 +1358,8 @@ int iss_wrapper::build() new_slave_port("irq_req", &irq_req_itf); new_master_port("irq_ack", &irq_ack_itf); + new_master_port("busy", &busy_itf); + fetchen_itf.set_sync_meth(&iss_wrapper::fetchen_sync); new_slave_port("fetchen", &fetchen_itf); @@ -1424,14 +1430,6 @@ void iss_wrapper::start() iss_register_debug_info(this, x->get_str().c_str()); } - if (this->get_js_config()->get("**/binaries") != NULL) - { - for (auto x:this->get_js_config()->get("**/binaries")->get_elems()) - { - this->binaries_trace_event.event_string("static enable " + x->get_str()); - } - } - trace.msg("ISS start (fetch: %d, is_active: %d, boot_addr: 0x%lx)\n", fetch_enable_reg.get(), is_active_reg.get(), get_config_int("boot_addr")); #ifdef USE_TRDB @@ -1439,7 +1437,8 @@ void iss_wrapper::start() INIT_LIST_HEAD(&this->trdb_packet_list); #endif - this->leakage_power.leakage_power_start(); + this->background_power.leakage_power_start(); + this->background_power.dynamic_power_start(); this->gdbserver = (vp::Gdbserver_engine *)this->get_service("gdbserver"); @@ -1495,6 +1494,14 @@ void iss_wrapper::reset(bool active) this->halted.set(true); } + if (this->get_js_config()->get("**/binaries") != NULL) + { + for (auto 
x:this->get_js_config()->get("**/binaries")->get_elems()) + { + this->binaries_trace_event.event_string("static enable " + x->get_str()); + } + } + check_state(); } } diff --git a/gvsoc/gvsoc/models/devices/CMakeLists.txt b/gvsoc/gvsoc/models/devices/CMakeLists.txt index d87ee0b38..cfb0dccc5 100644 --- a/gvsoc/gvsoc/models/devices/CMakeLists.txt +++ b/gvsoc/gvsoc/models/devices/CMakeLists.txt @@ -4,6 +4,7 @@ add_subdirectory(hyperbus) add_subdirectory(i2c) add_subdirectory(jtag) add_subdirectory(sound) +add_subdirectory(gpio) add_subdirectory(spiflash) add_subdirectory(testbench) add_subdirectory(uart) diff --git a/gvsoc/gvsoc/models/devices/gpio/CMakeLists.txt b/gvsoc/gvsoc/models/devices/gpio/CMakeLists.txt new file mode 100644 index 000000000..f2aa7d662 --- /dev/null +++ b/gvsoc/gvsoc/models/devices/gpio/CMakeLists.txt @@ -0,0 +1,6 @@ + +set(GPIO_PREFIX "devices/gpio") + +vp_model(NAME fxl6408 + PREFIX ${GPIO_PREFIX} + SOURCES "fxl6408.cpp") diff --git a/gvsoc/gvsoc/models/devices/gpio/fxl6408.cpp b/gvsoc/gvsoc/models/devices/gpio/fxl6408.cpp new file mode 100644 index 000000000..f148aa47d --- /dev/null +++ b/gvsoc/gvsoc/models/devices/gpio/fxl6408.cpp @@ -0,0 +1,471 @@ +/* + * Copyright (C) 2020 GreenWaves Technologies + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * Authors: Germain Haugou, GreenWaves Technologies (germain.haugou@greenwaves-technologies.com) + */ + + +#include +#include + + + +typedef enum +{ + I2C_STATE_WAIT_START, + I2C_STATE_WAIT_ADDRESS, + I2C_STATE_GET_DATA, + I2C_STATE_SAMPLE_DATA, + I2C_STATE_ACK, + I2C_STATE_READ_ACK +} I2c_state_e; + + +class Fxl6408 : public vp::component +{ +public: + Fxl6408(js::config *config); + + int build(); + +protected: + static void i2c_sync(void *__this, int scl, int sda); + void i2c_start(unsigned int address, bool is_read); + void i2c_handle_byte(uint8_t byte); + void i2c_stop(); + void i2c_get_data(); + void i2c_send_byte(uint8_t byte); + + void handle_reg_write(uint8_t address, uint8_t value); + uint8_t handle_reg_read(uint8_t address); + + void start(); + + vp::trace trace; + vp::i2c_master i2c_itf; + + unsigned int device_address; + + bool i2c_being_addressed; + unsigned int i2c_address; + uint8_t i2c_pending_data; + bool i2c_is_read; + I2c_state_e i2c_state; + int i2c_pending_bits; + int i2c_prev_sda; + int i2c_prev_scl; + unsigned int i2c_pending_send_byte; + uint8_t reg_address; + bool waiting_reg_address; + + uint8_t device_id; + uint8_t io_dir; + uint8_t output_state; + uint8_t output_high_z; + uint8_t input_default_state; + uint8_t pull_enable; + uint8_t pull_down_up; + uint8_t input_status; + uint8_t interrupt_mask; + uint8_t interrupt_status; +}; + + +Fxl6408::Fxl6408(js::config *config) + : vp::component(config) +{ +} + + +void Fxl6408::start() +{ + this->i2c_itf.sync(1, 1); +} + + +void Fxl6408::i2c_sync(void *__this, int scl, int sda) +{ + Fxl6408 *_this = (Fxl6408 *)__this; + + _this->trace.msg(vp::trace::LEVEL_TRACE, "I2C sync (scl: %d, sda: %d)\n", scl, sda); + + int sdo = 1; + + if (scl == 1 && _this->i2c_prev_sda != sda) + { + if (_this->i2c_prev_sda == 1) + { + _this->trace.msg(vp::trace::LEVEL_TRACE, "Detected start\n"); + + _this->i2c_state = I2C_STATE_WAIT_ADDRESS; + _this->i2c_address = 0; + _this->i2c_pending_bits = 8; + } + else + 
{ + _this->i2c_state = I2C_STATE_WAIT_START; + _this->i2c_stop(); + } + goto end; + } + + if (!_this->i2c_prev_scl && scl) + { + switch (_this->i2c_state) + { + case I2C_STATE_WAIT_START: + { + sdo = 1; + break; + } + + case I2C_STATE_WAIT_ADDRESS: + { + if (_this->i2c_pending_bits > 1) + { + _this->i2c_address = (_this->i2c_address << 1) | sda; + _this->trace.msg(vp::trace::LEVEL_TRACE, "Received address bit (bit: %d, address: 0x%x, pending_bits: %d)\n", sda, _this->i2c_address, _this->i2c_pending_bits); + } + else + { + _this->i2c_is_read = sda; + } + _this->i2c_pending_bits--; + if (_this->i2c_pending_bits == 0) + { + _this->i2c_start(_this->i2c_address, _this->i2c_is_read); + _this->i2c_state = I2C_STATE_ACK; + _this->i2c_pending_bits = 8; + } + break; + } + + case I2C_STATE_SAMPLE_DATA: + { + _this->i2c_pending_data = (_this->i2c_pending_data << 1) | sda; + _this->trace.msg(vp::trace::LEVEL_TRACE, "Sampling data (bit: %d, pending_value: 0x%x, pending_bits: %d)\n", sda, _this->i2c_pending_data, _this->i2c_pending_bits); + _this->i2c_pending_bits--; + if (_this->i2c_pending_bits == 0) + { + _this->i2c_pending_bits = 8; + _this->i2c_handle_byte(_this->i2c_pending_data); + _this->i2c_state = I2C_STATE_ACK; + } + break; + } + + case I2C_STATE_ACK: { + _this->trace.msg(vp::trace::LEVEL_TRACE, "Ack (being_addressed: %d)\n", _this->i2c_being_addressed); + if (_this->i2c_being_addressed) + { + if (_this->i2c_is_read) + { + _this->i2c_state = I2C_STATE_GET_DATA; + _this->i2c_pending_bits = 8; + _this->i2c_get_data(); + } + else + { + _this->i2c_state = I2C_STATE_SAMPLE_DATA; + } + } + else + { + _this->i2c_state = I2C_STATE_WAIT_START; + } + + break; + } + + case I2C_STATE_READ_ACK: { + _this->i2c_state = I2C_STATE_WAIT_START; + break; + } + } + } + + if (_this->i2c_prev_scl && !scl) + { + switch (_this->i2c_state) + { + case I2C_STATE_ACK: + { + _this->trace.msg(vp::trace::LEVEL_TRACE, "Ack (being_addressed: %d)\n", _this->i2c_being_addressed); + sdo = 
!_this->i2c_being_addressed; + break; + } + + case I2C_STATE_READ_ACK: + { + _this->trace.msg(vp::trace::LEVEL_TRACE, "Read ack\n"); + sdo = 0; + break; + } + + case I2C_STATE_GET_DATA: + { + sdo = (_this->i2c_pending_send_byte >> 7) & 1; + _this->trace.msg(vp::trace::LEVEL_TRACE, "Sending bit (bit: %d, pending_value: 0x%x, pending_bits: %d)\n", sdo, _this->i2c_pending_send_byte, _this->i2c_pending_bits); + _this->i2c_pending_send_byte <<= 1; + _this->i2c_pending_bits--; + if (_this->i2c_pending_bits == 0) + { + _this->i2c_state = I2C_STATE_READ_ACK; + } + break; + } + } + } + +end: + if (_this->i2c_prev_scl && !scl) + { + _this->trace.msg(vp::trace::LEVEL_TRACE, "Sync sda (value: %d)\n", sdo); + _this->i2c_itf.sync(1, sdo); + } + _this->i2c_prev_sda = sda; + _this->i2c_prev_scl = scl; +} + +void Fxl6408::i2c_start(unsigned int address, bool is_read) +{ + this->trace.msg(vp::trace::LEVEL_TRACE, "Received header (address: 0x%x, is_read: %d)\n", address, is_read); + + this->i2c_being_addressed = address == this->device_address; + if (this->i2c_being_addressed && is_read) + { + this->i2c_send_byte(this->handle_reg_read(this->reg_address)); + } +} + +void Fxl6408::i2c_handle_byte(uint8_t byte) +{ + this->trace.msg(vp::trace::LEVEL_TRACE, "Handle byte (value: 0x%x)\n", byte); + + if (this->waiting_reg_address) + { + this->reg_address = byte; + this->waiting_reg_address = false; + } + else + { + this->handle_reg_write(this->reg_address, byte); + this->waiting_reg_address = true; + } +} + +void Fxl6408::i2c_stop() +{ + this->trace.msg(vp::trace::LEVEL_TRACE, "Received stop bit\n"); + +} + +void Fxl6408::i2c_get_data() +{ + this->trace.msg(vp::trace::LEVEL_TRACE, "Getting data\n"); +} + +void Fxl6408::i2c_send_byte(uint8_t byte) +{ + this->i2c_pending_send_byte = byte; +} + + +void Fxl6408::handle_reg_write(uint8_t address, uint8_t value) +{ + this->trace.msg(vp::trace::LEVEL_TRACE, "Register write (address: 0x%x, value: 0x%x)\n", address, value); + + switch (address) + { 
+ case 0x01: + { + this->trace.msg(vp::trace::LEVEL_INFO, "Writing register (name: %s, value: 0x%x)\n", "Device ID & Ctrl", value); + this->device_id = value; + break; + } + case 0x03: + { + this->trace.msg(vp::trace::LEVEL_INFO, "Writing register (name: %s, value: 0x%x)\n", "IO Direction", value); + this->io_dir = value; + break; + } + case 0x05: + { + this->trace.msg(vp::trace::LEVEL_INFO, "Writing register (name: %s, value: 0x%x)\n", "Output State", value); + this->output_state = value; + break; + } + case 0x07: + { + this->trace.msg(vp::trace::LEVEL_INFO, "Writing register (name: %s, value: 0x%x)\n", "Output High-Z", value); + this->output_high_z = value; + break; + } + case 0x09: + { + this->trace.msg(vp::trace::LEVEL_INFO, "Writing register (name: %s, value: 0x%x)\n", "Input Default State", value); + this->input_default_state = value; + break; + } + case 0x0B: + { + this->trace.msg(vp::trace::LEVEL_INFO, "Writing register (name: %s, value: 0x%x)\n", "Pull Enable", value); + this->pull_enable = value; + break; + } + case 0x0D: + { + this->trace.msg(vp::trace::LEVEL_INFO, "Writing register (name: %s, value: 0x%x)\n", "Pull-Down/Pull-Up", value); + this->pull_down_up = value; + break; + } + case 0x0F: + { + this->trace.msg(vp::trace::LEVEL_INFO, "Writing register (name: %s, value: 0x%x)\n", "Input Status", value); + this->input_status = value; + break; + } + case 0x11: + { + this->trace.msg(vp::trace::LEVEL_INFO, "Writing register (name: %s, value: 0x%x)\n", "Interrupt Mask", value); + this->interrupt_mask = value; + break; + } + case 0x13: + { + this->trace.msg(vp::trace::LEVEL_INFO, "Writing register (name: %s, value: 0x%x)\n", "interrupt Status", value); + this->interrupt_status = value; + break; + } + default: + this->trace.force_warning("Writing invalid register (address: 0x%x)\n", address); + break; + } + +} + + +uint8_t Fxl6408::handle_reg_read(uint8_t address) +{ + this->trace.msg(vp::trace::LEVEL_DEBUG, "Register read (address: 0x%x)\n", address); + + 
uint8_t value = 0xFF; + + switch (address) + { + case 0x01: + { + value = this->device_id; + this->trace.msg(vp::trace::LEVEL_INFO, "Reading register (name: %s, value: 0x%x)\n", "Device ID & Ctrl", value); + break; + } + case 0x03: + { + value = this->io_dir; + this->trace.msg(vp::trace::LEVEL_INFO, "Reading register (name: %s, value: 0x%x)\n", "IO Direction", value); + break; + } + case 0x05: + { + value = this->output_state; + this->trace.msg(vp::trace::LEVEL_INFO, "Reading register (name: %s, value: 0x%x)\n", "Output State", value); + break; + } + case 0x07: + { + value = this->output_high_z; + this->trace.msg(vp::trace::LEVEL_INFO, "Reading register (name: %s, value: 0x%x)\n", "Output High-Z", value); + break; + } + case 0x09: + { + value = this->input_default_state; + this->trace.msg(vp::trace::LEVEL_INFO, "Reading register (name: %s, value: 0x%x)\n", "Input Default State", value); + break; + } + case 0x0B: + { + value = this->pull_enable; + this->trace.msg(vp::trace::LEVEL_INFO, "Reading register (name: %s, value: 0x%x)\n", "Pull Enable", value); + break; + } + case 0x0D: + { + value = this->pull_down_up; + this->trace.msg(vp::trace::LEVEL_INFO, "Reading register (name: %s, value: 0x%x)\n", "Pull-Down/Pull-Up", value); + break; + } + case 0x0F: + { + value = this->input_status; + this->trace.msg(vp::trace::LEVEL_INFO, "Reading register (name: %s, value: 0x%x)\n", "Input Status", value); + break; + } + case 0x11: + { + value = this->interrupt_mask; + this->trace.msg(vp::trace::LEVEL_INFO, "Reading register (name: %s, value: 0x%x)\n", "Interrupt Mask", value); + break; + } + case 0x13: + { + value = this->interrupt_status; + this->trace.msg(vp::trace::LEVEL_INFO, "Reading register (name: %s, value: 0x%x)\n", "Interrupt Status", value); + break; + } + default: + this->trace.force_warning("Reading invalid register (address: 0x%x)\n", address); + break; + } + + return value; +} + + +int Fxl6408::build() +{ + traces.new_trace("trace", &trace, vp::DEBUG); + + 
this->i2c_itf.set_sync_meth(&Fxl6408::i2c_sync); + this->new_master_port("i2c", &this->i2c_itf); + + this->i2c_state = I2C_STATE_WAIT_START; + this->i2c_prev_sda = 1; + this->i2c_prev_scl = 1; + this->i2c_being_addressed = false; + this->device_address = 0x43; + this->waiting_reg_address = true; + + this->device_id = 0xC2; + this->io_dir = 0x00; + this->output_state = 0x00; + this->output_high_z = 0xFF; + this->input_default_state = 0x00; + this->pull_enable = 0xFF; + this->pull_down_up = 0x00; + this->input_status = 0xFF; + this->interrupt_mask = 0x00; + this->interrupt_status = 0xFF; + + return 0; +} + + +extern "C" vp::component *vp_constructor(js::config *config) +{ + return new Fxl6408(config); +} diff --git a/gvsoc/gvsoc/models/devices/i2c/helper/i2c_helper.cpp b/gvsoc/gvsoc/models/devices/i2c/helper/i2c_helper.cpp index 22ed95106..7f024bb02 100644 --- a/gvsoc/gvsoc/models/devices/i2c/helper/i2c_helper.cpp +++ b/gvsoc/gvsoc/models/devices/i2c/helper/i2c_helper.cpp @@ -15,27 +15,29 @@ * along with this program. If not, see . */ +// The same library is compiled with same flags for all gvsoc mode (normal, debug and system verilog) +// Force trace support to be able to have them. +#define VP_TRACE_ACTIVE 1 + #include "i2c_helper.hpp" #include #include -//#define I2C_HELPER_DEBUG(...) (fprintf(stderr, "[I2C-HLP] " __VA_ARGS__)) -#define I2C_HELPER_DEBUG(...) 
- namespace { void null_callback(i2c_operation_e id, i2c_status_e status, int value) { (void) id; (void) status; (void) value; - I2C_HELPER_DEBUG("null callback: id=%d, status=%d, value=%d\n", - id, status, value); + //this->trace.msg(vp::trace::LEVEL_TRACE, "null callback: id=%d, status=%d, value=%d\n", + // id, status, value); } } I2C_helper::I2C_helper(vp::component* parent, vp::i2c_master* itf, - i2c_enqueue_event_fn_t enqueue_event, i2c_cancel_event_fn_t cancel_event) : + i2c_enqueue_event_fn_t enqueue_event, i2c_cancel_event_fn_t cancel_event, + std::string trace_path) : parent(parent), itf(itf), enqueue_event(enqueue_event), @@ -51,28 +53,24 @@ I2C_helper::I2C_helper(vp::component* parent, vp::i2c_master* itf, is_starting(false), is_stopping(false), is_clock_enabled(false), - is_clock_low(false), + clock_value(1), is_driving_scl(false), is_driving_sda(false), cb_master_operation(null_callback), clock_event(parent, this, I2C_helper::st_clock_event_handler), - data_event(parent, this, I2C_helper::st_data_event_handler) + fsm_event(parent, this, I2C_helper::fsm_event_handler) { assert(NULL != this->parent); assert(NULL != this->itf); - I2C_HELPER_DEBUG("Initializing helper interface\n"); -} + parent->traces.new_trace(trace_path + "/i2c_helper", &this->trace, vp::DEBUG); -void I2C_helper::st_data_event_handler(void* __this, vp::clock_event* event) -{ - assert(NULL != __this); - assert(NULL != event); + this->trace.msg(vp::trace::LEVEL_TRACE, "Initializing helper interface\n"); - I2C_HELPER_DEBUG("st_data_event_handler: none\n"); - I2C_helper* _this = (I2C_helper*) __this; - _this->desired_sda = _this->expected_bit_value; - _this->sync_pins(); + this->pending_data_bits = 0; + this->fsm_waiting = false; + this->input_scl = 1; + this->input_sda = 1; } void I2C_helper::st_clock_event_handler(void* __this, vp::clock_event* event) @@ -80,29 +78,70 @@ void I2C_helper::st_clock_event_handler(void* __this, vp::clock_event* event) assert(NULL != __this); assert(NULL != 
event); - I2C_HELPER_DEBUG("st_clock_event_handler: none\n"); I2C_helper* _this = (I2C_helper*) __this; _this->clock_event_handler(event); } + +void I2C_helper::clock_toggle(void) +{ + if (this->is_clock_enabled) + { + this->clock_value ^= 1; + + this->enqueue_clock_toggle(); + } +} + +void I2C_helper::enqueue_clock_toggle(void) +{ + if (this->is_clock_enabled) + { + if (this->clock_event.is_enqueued()) + { + this->cancel_event(&this->clock_event); + } + + const uint64_t delay = this->clock_value ? this->delay_low_ps : this->delay_high_ps; + this->enqueue_event(&this->clock_event, delay); + } +} + + +void I2C_helper::fsm_enqueue_event(int64_t delay) +{ + if (!this->fsm_event.is_enqueued()) + { + this->enqueue_event(&this->fsm_event, delay); + } +} + + +void I2C_helper::fsm_event_handler(void *__this, vp::clock_event* event) +{ + I2C_helper* _this = (I2C_helper *) __this; + + _this->fsm_waiting = false; + _this->fsm_step(); +} + + void I2C_helper::clock_event_handler(vp::clock_event* event) { assert(NULL != event); - I2C_HELPER_DEBUG("clock_event_handler: none\n"); /* clock toggling */ if (this->is_clock_enabled) { - if (this->is_clock_low) + this->trace.msg(vp::trace::LEVEL_TRACE, "Toggling clock (value: %d)\n", this->clock_value); + if (this->clock_value) { - I2C_HELPER_DEBUG("clock_event_handler: LOW (switch to high)\n"); /* switch to high */ this->desired_scl = 1; this->sync_pins(); } else { - I2C_HELPER_DEBUG("clock_event_handler: HIGH (switch to low)\n"); /* switch to low */ this->desired_scl = 0; this->sync_pins(); @@ -112,29 +151,23 @@ void I2C_helper::clock_event_handler(vp::clock_event* event) void I2C_helper::register_callback(i2c_callback_t callback) { - I2C_HELPER_DEBUG("register_callback: none\n"); + this->trace.msg(vp::trace::LEVEL_TRACE, "register_callback: none\n"); this->cb_master_operation = callback; } -void I2C_helper::update_pins(int scl, int sda) -{ - this->fsm_step(scl, sda); -} - - void I2C_helper::sync_pins(void) { - int res_scl = 
this->is_driving_scl ? this->desired_scl : 1; - int res_sda = this->is_driving_sda ? this->desired_sda : 1; + int res_scl = this->internal_state != I2C_INTERNAL_IDLE ? this->desired_scl : 1; + int res_sda = this->internal_state != I2C_INTERNAL_IDLE ? this->desired_sda : 1; - I2C_HELPER_DEBUG("sync_pins: scl=%d, sda=%d\n", res_scl, res_sda); + this->trace.msg(vp::trace::LEVEL_TRACE, "Synchronizing pins (scl:%d, sda:%d)\n", res_scl, res_sda); this->itf->sync(res_scl, res_sda); } void I2C_helper::set_timings(uint64_t delay_low_ps, uint64_t delay_high_ps) { - I2C_HELPER_DEBUG("set_timings: delay_low_ps=%ld, delay_high_ps=%ld\n", + this->trace.msg(vp::trace::LEVEL_TRACE, "set_timings: delay_low_ps=%ld, delay_high_ps=%ld\n", delay_low_ps, delay_high_ps); this->delay_low_ps = delay_low_ps; @@ -143,35 +176,28 @@ void I2C_helper::set_timings(uint64_t delay_low_ps, uint64_t delay_high_ps) void I2C_helper::send_start(void) { - I2C_HELPER_DEBUG("send_start: none\n"); - if (!this->is_busy()) - { - I2C_HELPER_DEBUG("send_start: sda=%d, scl=%d\n", this->sda, this->scl); - I2C_HELPER_DEBUG("send_start: this=%p\n", (void*) this); - this->is_driving_scl = true; - this->is_driving_sda = true; - this->desired_scl = 1; - this->desired_sda = 0; - this->sync_pins(); - this->start_clock(); - } - else - { - this->is_starting = true; - } + this->trace.msg(vp::trace::LEVEL_TRACE, "Request to send start\n"); + + this->is_starting = true; + this->fsm_enqueue_event(1); +} + +void I2C_helper::release_pins(void) +{ + } -bool I2C_helper::is_busy(void) +void I2C_helper::update_pins(int scl, int sda) { - return (this->internal_state != I2C_INTERNAL_IDLE); + this->input_scl = scl; + this->input_sda = sda; + this->fsm_enqueue_event(1); } void I2C_helper::send_address(int addr, bool is_write, bool is_10bits) { - I2C_HELPER_DEBUG("send_address: addr=%d, is_write=%s, is_10bits=%s\n", - addr, - is_write ? "true" : "false", - is_10bits ? 
"true" : "false"); + this->trace.msg(vp::trace::LEVEL_TRACE, "Request to send address (addr: 0x%x, is_write:%d, is_10bits:%d)\n", + addr, is_write, is_10bits); //TODO support 10 bits mode assert(!is_10bits); @@ -180,337 +206,274 @@ void I2C_helper::send_address(int addr, bool is_write, bool is_10bits) this->send_data(addr_byte); } +void I2C_helper::send_ack(bool ack) +{ + // I2C_HELPER_DEBUG("send_ack: ack=%s\n", ack ? "true" : "false"); + // //TODO + // this->expected_bit_value = ack ? 0 : 1; + // this->is_driving_sda = 1; + // this->enqueue_data_change(this->expected_bit_value); +} + void I2C_helper::send_data(int byte) { - I2C_HELPER_DEBUG("send_data: byte=%d\n", byte); - // TODO verify that we are in data mode ? + this->trace.msg(vp::trace::LEVEL_TRACE, "Request to send data (value: 0x%x)\n", byte); - /* load byte in sending queue */ - for (int i = 7; i >= 0; i--) + if (this->pending_data_bits) { - int bit = (byte >> i) & 1; - I2C_HELPER_DEBUG("push to send bit queue:%d\n", bit); - this->send_bit_queue.push(bit); + this->trace.force_warning("Trying to send data while there is already one pending\n"); } - - /* enqueue data change if clock is low, - * else will be done automatically at next falling scl */ - if (this->is_clock_low && this->internal_state == I2C_INTERNAL_DATA) + else { - I2C_HELPER_DEBUG("Directly enqueueing!\n"); - this->expected_bit_value = this->send_bit_queue.front(); - this->send_bit_queue.pop(); - this->is_driving_sda = true; - this->enqueue_data_change(this->expected_bit_value); + this->pending_data = byte; + this->pending_data_bits = 8; + this->fsm_enqueue_event(1); } } -void I2C_helper::send_ack(bool ack) -{ - I2C_HELPER_DEBUG("send_ack: ack=%s\n", ack ? "true" : "false"); - //TODO - this->expected_bit_value = ack ? 
0 : 1; - this->is_driving_sda = 1; - this->enqueue_data_change(this->expected_bit_value); -} - void I2C_helper::send_stop(void) { - I2C_HELPER_DEBUG("send_stop: none\n"); - if(this->is_busy()) - { - this->is_stopping = true; - this->is_driving_sda = true; - this->expected_bit_value = 0; - this->enqueue_data_change(this->expected_bit_value); - } -} - -void I2C_helper::release_pins(void) -{ - // release everything that could hold the bus - this->empty_queues(); - this->stop_clock(); + this->trace.msg(vp::trace::LEVEL_TRACE, "Request to stop\n"); + this->is_stopping = true; } void I2C_helper::start_clock(void) { - I2C_HELPER_DEBUG("Starting clock\n"); + this->trace.msg(vp::trace::LEVEL_TRACE, "Starting clock\n"); //start high then loop(low -> high) this->is_clock_enabled = true; - this->is_clock_low = false; + this->clock_value = this->scl ^ 1; this->enqueue_clock_toggle(); } void I2C_helper::stop_clock(void) { - I2C_HELPER_DEBUG("Stop clock\n"); + this->trace.msg(vp::trace::LEVEL_TRACE, "Stopping clock\n"); this->is_clock_enabled =false; this->cancel_event(&this->clock_event); +} + +std::string I2C_helper::get_state_name(i2c_internal_state_e state) +{ + switch (state) + { + case I2C_INTERNAL_IDLE: return "idle"; + case I2C_INTERNAL_WAIT_START: return "wait_start"; + case I2C_INTERNAL_WAIT_STOP: return "wait_stop"; + case I2C_INTERNAL_START: return "start"; + case I2C_INTERNAL_WAIT_DATA: return "wait_data"; + case I2C_INTERNAL_DATA: return "data"; + case I2C_INTERNAL_DATA_READ: return "data_read"; + case I2C_INTERNAL_ACK: return "ack"; + case I2C_INTERNAL_STOP_CLOCK: return "stop_clock"; + case I2C_INTERNAL_STOP_CLOCK_WAIT: return "stop_clock_wait"; + case I2C_INTERNAL_RESTART: return "restart"; + case I2C_INTERNAL_STOP_0: return "stop_0"; + case I2C_INTERNAL_STOP_1: return "stop_1"; + default: return "unknown"; + } +} - this->desired_scl = 1; - this->is_driving_scl = false; - this->is_driving_sda = false; - this->enqueue_data_change(1); +void 
I2C_helper::send_data_bit() +{ + int bit = (this->pending_data >> 7) & 1; + this->trace.msg(vp::trace::LEVEL_TRACE, "Sending bit (bit: %d)\n", bit); + this->desired_sda = bit; + this->pending_data <<= 1; + this->pending_data_bits--; } -void I2C_helper::fsm_step(int input_scl, int input_sda) + +void I2C_helper::fsm_step() { - bool scl_rising = (input_scl == 1 && this->scl == 0); - bool scl_falling = (input_scl == 0 && this->scl == 1); - bool scl_steady = (input_scl == this->scl); + if (this->fsm_waiting) + { + return; + } + + this->trace.msg(vp::trace::LEVEL_TRACE, "FSM update (state: %s, prev_scl: %d, prev_sda: %d, scl: %d, sda: %d)\n", + this->get_state_name(this->internal_state).c_str(), this->scl, this->sda, this->input_scl, this->input_sda); - bool sda_rising = (input_sda == 1 && this->sda == 0); - bool sda_falling = (input_sda == 0 && this->sda == 1); - I2C_HELPER_DEBUG("\n\n\n"); - I2C_HELPER_DEBUG("fsm_step: input_scl=%d, input_sda=%d\n", input_scl, input_sda); - I2C_HELPER_DEBUG("fsm_step: scl=%d, this->scl=%d\n", input_scl, this->scl); - I2C_HELPER_DEBUG("fsm_step: sda=%d, this->sda=%d\n", input_sda, this->sda); - I2C_HELPER_DEBUG("fsm_step: this=%p\n", (void*) this); + bool scl_rising = (this->input_scl == 1 && this->scl == 0); + bool scl_falling = (this->input_scl == 0 && this->scl == 1); + bool scl_steady = (this->input_scl == this->scl); - this->scl = input_scl; - this->sda = input_sda; + bool sda_rising = (this->input_sda == 1 && this->sda == 0); + bool sda_falling = (this->input_sda == 0 && this->sda == 1); + + this->scl = this->input_scl; + this->sda =this-> input_sda; /* clock management */ if (!scl_steady) { - /* manages clock synchronization and clock stretching automatically */ - if (scl_rising) - { - this->is_clock_low = false; - } - else if (scl_falling) - { - this->is_clock_low = true; - } - - if (this->is_clock_enabled) - { - this->enqueue_clock_toggle(); - } + // Renqueue a clock toggle each time it toggles + this->clock_toggle(); } - /* 
I2C logic */ - if (scl_steady) + switch (this->internal_state) { - /* START/STOP detection */ - if (this->scl == 1) - { - if (sda_falling && !this->is_busy()) + case I2C_INTERNAL_IDLE: + if (is_starting) { + this->trace.msg(vp::trace::LEVEL_TRACE, "Waiting start\n"); + this->internal_state = I2C_INTERNAL_WAIT_START; this->is_starting = false; + this->desired_sda = 0; + this->desired_scl = 1; } - else if (sda_rising && this->is_busy()) + break; + + case I2C_INTERNAL_WAIT_START: + if (scl_steady && sda_falling) { - this->internal_state = I2C_INTERNAL_IDLE; - this->is_stopping = false; - /* stop clock */ - this->stop_clock(); - this->empty_queues(); + this->trace.msg(vp::trace::LEVEL_TRACE, "Detected start, waiting for data\n"); + this->internal_state = I2C_INTERNAL_WAIT_DATA; + this->cb_master_operation(I2C_OP_START, I2C_STATUS_OK, 0); + } + break; - I2C_HELPER_DEBUG("STOP DETECTED\n"); + case I2C_INTERNAL_WAIT_STOP: + if (scl_steady && sda_rising) + { + this->trace.msg(vp::trace::LEVEL_TRACE, "Detected stop\n"); + this->internal_state = I2C_INTERNAL_IDLE; this->cb_master_operation(I2C_OP_STOP, I2C_STATUS_OK, 0); } - } - } - else if (!this->is_busy() && scl_falling && this->sda == 0) - { - /* propagate start condition */ - this->internal_state = I2C_INTERNAL_START; + break; - this->sda_rise = this ->sda; - this->empty_queues(); - I2C_HELPER_DEBUG("START DETECTED\n"); - this->cb_master_operation(I2C_OP_START, I2C_STATUS_OK, 0); - } - else if (this->is_busy()) - { - /* sampling bit*/ - if (scl_rising) + case I2C_INTERNAL_WAIT_DATA: + if (this->pending_data_bits) + { + this->trace.msg(vp::trace::LEVEL_TRACE, "Detected data, starting clock\n"); + this->internal_state = I2C_INTERNAL_DATA; + } + this->start_clock(); + break; + + case I2C_INTERNAL_DATA: { - I2C_HELPER_DEBUG("SCL rising\n"); - I2C_HELPER_DEBUG("fsm_step: sampling rising bit\n"); - this->sda_rise = this->sda; - //TODO add check expected_bit_value - if (is_stopping) + if (scl_falling) { - this->is_driving_sda 
= true; - this->expected_bit_value = 1; - this->enqueue_data_change(this->expected_bit_value); + this->send_data_bit(); } - else if (is_starting) + else if (scl_rising) { - /* drive sda pin down */ - this->is_driving_sda = true; - this->expected_bit_value = 0; - this->enqueue_data_change(this->expected_bit_value); + if (this->pending_data_bits == 0) + { + this->internal_state = I2C_INTERNAL_ACK; + } } + break; + } - if (this->is_driving_sda && this->desired_sda != this->sda && this->desired_sda != 0) + case I2C_INTERNAL_DATA_READ: + { + if (scl_rising) { - // we lost arbitration - i2c_operation_e operation = I2C_OP_DATA; + this->pending_data = (this->pending_data << 1) | this->sda; + this->trace.msg(vp::trace::LEVEL_TRACE, "Sampled data (bit: %d, pending_value: 0x%x, pending_bits: %d)\n", this->sda, this->pending_data, this->pending_data_bits); + this->pending_data_bits--; - if (this->is_stopping) - { - operation = I2C_OP_STOP; - } - else if (this->internal_state == I2C_INTERNAL_DATA) + if (this->pending_data_bits == 0) { - operation = I2C_OP_DATA; - } - else if (this->internal_state == I2C_INTERNAL_ACK) - { - operation = I2C_OP_ACK; + this->cb_master_operation(I2C_OP_DATA, I2C_STATUS_OK, this->pending_data); + this->internal_state = I2C_INTERNAL_ACK; } - - this->cb_master_operation( - operation, - I2C_STATUS_ERROR_ARBITRATION, - 0); } + break; } - else if (scl_falling) + + case I2C_INTERNAL_ACK: { - I2C_HELPER_DEBUG("SCL falling\n"); - I2C_HELPER_DEBUG("INTERNAL_STATE = %d\n", this->internal_state); - if (this->internal_state != I2C_INTERNAL_START) + if (scl_rising) { - if (this->sda_rise == this->sda) - { - this->recv_bit_queue.push(this->sda); - } - else - { - //TODO framing error ? 
- //TODO empty queue - I2C_HELPER_DEBUG("FRAMING ERROR!, sda_rise=%d, sda=%d\n", this->sda_rise, this->sda); - this->cb_master_operation(I2C_OP_STOP, I2C_STATUS_ERROR_FRAMING, 0); - } + int ack = this->sda; + + this->trace.msg(vp::trace::LEVEL_TRACE, "Sampled ack (value: %d)\n", ack); + const i2c_status_e status = (ack == 1) ? I2C_STATUS_KO : I2C_STATUS_OK; + this->cb_master_operation(I2C_OP_ACK, status, 0); + this->internal_state = I2C_INTERNAL_STOP_CLOCK; } - else + break; + } + + case I2C_INTERNAL_STOP_CLOCK: + { + if (scl_falling) { - this->internal_state = I2C_INTERNAL_DATA; + this->desired_sda = 1; + this->stop_clock(); + this->internal_state = I2C_INTERNAL_STOP_CLOCK_WAIT; + this->fsm_waiting = true; + this->fsm_enqueue_event(this->delay_high_ps); } + break; + } - if (is_stopping) + case I2C_INTERNAL_STOP_CLOCK_WAIT: + { + if (this->pending_data_bits) { - this->is_driving_sda = true; - this->expected_bit_value = 0; - this->enqueue_data_change(this->expected_bit_value); + // We must continue immediately with another byte of data + // Send a bit now since there is no falling edge and let the usual + // state continue + this->send_data_bit(); + this->start_clock(); + this->internal_state = I2C_INTERNAL_DATA; + } - else if (is_starting) + else if (this->is_starting) { - this->is_driving_sda = true; - this->expected_bit_value = 1; - this->enqueue_data_change(this->expected_bit_value); + this->internal_state = I2C_INTERNAL_RESTART; + this->is_starting = false; + this->desired_scl = 1; + this->fsm_waiting = true; + this->fsm_enqueue_event(this->delay_high_ps); } - else if (this->internal_state == I2C_INTERNAL_DATA) + else if (this->is_stopping) { - /* send data */ - if (!this->send_bit_queue.empty()) - { - assert(this->send_bit_queue.size() <= 8); - - int bit = this->send_bit_queue.front(); - this->send_bit_queue.pop(); - this->expected_bit_value = bit; - - this->is_driving_sda = true; - this->enqueue_data_change(this->expected_bit_value); - } - else - { - /* 
release sda pin */ - this->is_driving_sda = false; - this->enqueue_data_change(this->expected_bit_value); - } - - /* receiving data */ - if (this->recv_bit_queue.size() == 8) - { - int byte = 0; - /* full byte received */ - for (int i = 0; i < 8; i++) - { - int bit = this->recv_bit_queue.front(); - this->recv_bit_queue.pop(); - byte = byte << 1 | bit; - } - assert(this->recv_bit_queue.empty()); - - I2C_HELPER_DEBUG("fsm_step: byte received=%d\n", byte); - - this->internal_state = I2C_INTERNAL_ACK; - this->empty_queues(); - - this->cb_master_operation(I2C_OP_DATA, I2C_STATUS_OK, byte); - } + this->internal_state = I2C_INTERNAL_STOP_0; + this->is_stopping = false; + this->desired_sda = 0; + this->fsm_waiting = true; + this->fsm_enqueue_event(this->delay_high_ps); } - else if (this->internal_state == I2C_INTERNAL_ACK) + else { - if (this->recv_bit_queue.size() == 1) - { - const int bit = this->recv_bit_queue.front(); - this->recv_bit_queue.pop(); - - I2C_HELPER_DEBUG("fsm_step: ACK received=%d\n", bit); - - const i2c_status_e status = (bit == 1) ? I2C_STATUS_KO : I2C_STATUS_OK; - assert(this->recv_bit_queue.empty()); - - this->internal_state = I2C_INTERNAL_DATA; - this->empty_queues(); - - /* release sda pin */ - this->is_driving_sda = false; - this->expected_bit_value = 1; - this->enqueue_data_change(this->expected_bit_value); - - this->cb_master_operation(I2C_OP_ACK, status, 0); - - } + this->start_clock(); + this->pending_data_bits = 8; + this->internal_state = I2C_INTERNAL_DATA_READ; } + break; } - } -} -void I2C_helper::enqueue_clock_toggle(void) -{ - I2C_HELPER_DEBUG("enqueue_clock_toggle: clock_low=%s\n", - this->is_clock_low ? 
"true" : "false"); - if (this->is_clock_enabled) - { - if (this->clock_event.is_enqueued()) + case I2C_INTERNAL_STOP_0: { - this->cancel_event(&this->clock_event); + this->internal_state = I2C_INTERNAL_STOP_1; + this->desired_scl = 1; + this->fsm_waiting = true; + this->fsm_enqueue_event(this->delay_high_ps); + break; } - const uint64_t delay = this->is_clock_low ? this->delay_low_ps : this->delay_high_ps; - this->enqueue_event(&this->clock_event, delay); - } -} - -void I2C_helper::enqueue_data_change(int new_sda) -{ - I2C_HELPER_DEBUG("enqueue_data_change: %d\n", new_sda); - if (!this->data_event.is_enqueued()) - { - this->enqueue_event(&this->data_event, 1); - } -} + case I2C_INTERNAL_STOP_1: + { + this->internal_state = I2C_INTERNAL_WAIT_STOP; + this->desired_sda = 1; + break; + } -void I2C_helper::empty_queues(void) -{ - while(!this->send_bit_queue.empty()) - { - this->send_bit_queue.pop(); + case I2C_INTERNAL_RESTART: + { + this->trace.msg(vp::trace::LEVEL_TRACE, "Waiting start\n"); + this->internal_state = I2C_INTERNAL_WAIT_START; + this->desired_sda = 0; + break; + } } - while(!this->recv_bit_queue.empty()) - { - this->recv_bit_queue.pop(); - } + this->sync_pins(); } diff --git a/gvsoc/gvsoc/models/devices/i2c/helper/i2c_helper.hpp b/gvsoc/gvsoc/models/devices/i2c/helper/i2c_helper.hpp index a9e863471..cb8289c98 100644 --- a/gvsoc/gvsoc/models/devices/i2c/helper/i2c_helper.hpp +++ b/gvsoc/gvsoc/models/devices/i2c/helper/i2c_helper.hpp @@ -41,9 +41,18 @@ typedef enum { typedef enum { I2C_INTERNAL_IDLE, + I2C_INTERNAL_WAIT_START, + I2C_INTERNAL_WAIT_STOP, I2C_INTERNAL_START, + I2C_INTERNAL_WAIT_DATA, I2C_INTERNAL_DATA, + I2C_INTERNAL_DATA_READ, I2C_INTERNAL_ACK, + I2C_INTERNAL_STOP_CLOCK, + I2C_INTERNAL_STOP_CLOCK_WAIT, + I2C_INTERNAL_RESTART, + I2C_INTERNAL_STOP_0, + I2C_INTERNAL_STOP_1, } i2c_internal_state_e; typedef std::function i2c_callback_t; @@ -66,7 +75,7 @@ typedef std::function i2c_cancel_event_fn_t; */ class I2C_helper { public: - 
I2C_helper(vp::component* parent, vp::i2c_master* itf, i2c_enqueue_event_fn_t event, i2c_cancel_event_fn_t cancel_event); + I2C_helper(vp::component* parent, vp::i2c_master* itf, i2c_enqueue_event_fn_t event, i2c_cancel_event_fn_t cancel_event, std::string trace_path=""); // TO be called when pin values change void update_pins(int scl, int sda); @@ -106,7 +115,7 @@ class I2C_helper { /******************/ /* Static methods */ /******************/ - static void st_data_event_handler(void* __this, vp::clock_event* event); + static void fsm_event_handler(void* __this, vp::clock_event* event); static void st_clock_event_handler(void* __this, vp::clock_event* event); static void i2c_sync(void *__this, int scl, int sda); @@ -117,13 +126,17 @@ class I2C_helper { void start_clock(void); void stop_clock(void); + void clock_toggle(void); void enqueue_clock_toggle(void); void enqueue_data_change(int new_sda); + void fsm_enqueue_event(int64_t delay); + void send_data_bit(); - void fsm_step(int scl, int sda); + void fsm_step(); void sync_pins(void); - void empty_queues(void); + + std::string get_state_name(i2c_internal_state_e state); /*************/ /* Externals */ @@ -149,7 +162,7 @@ class I2C_helper { /* Runtime data */ /****************/ vp::clock_event clock_event; - vp::clock_event data_event; + vp::clock_event fsm_event; i2c_internal_state_e internal_state; @@ -164,8 +177,6 @@ class I2C_helper { int sda_rise; /* sda sampled on scl rising edge */ - std::queue send_bit_queue; - std::queue recv_bit_queue; int expected_bit_value; /* checked when scl is rising */ bool check_sent; @@ -175,5 +186,16 @@ class I2C_helper { bool is_starting; bool is_clock_enabled; - bool is_clock_low; /* tell if clock is in low or high state */ + int clock_value; + + vp::trace trace; + + int ack_value; + + uint8_t pending_data; + int pending_data_bits; + bool fsm_waiting; + + int input_scl; + int input_sda; }; diff --git a/gvsoc/gvsoc/models/devices/i2c/i2c_bus.cpp 
b/gvsoc/gvsoc/models/devices/i2c/i2c_bus.cpp index b16036c39..81b0da6bd 100644 --- a/gvsoc/gvsoc/models/devices/i2c/i2c_bus.cpp +++ b/gvsoc/gvsoc/models/devices/i2c/i2c_bus.cpp @@ -46,6 +46,9 @@ class I2c_bus : public vp::component vp::reg_1 bus_scl; vp::reg_1 bus_sda; + + bool pending_resolve; + bool do_resolve; }; @@ -68,6 +71,8 @@ int I2c_bus::build() this->bus_scl.set(1); this->bus_sda.set(1); + this->pending_resolve = false; + return 0; } @@ -76,42 +81,61 @@ void I2c_bus::sync(void *__this, int scl, int sda, int id) { I2c_bus *_this = (I2c_bus *)__this; - _this->trace.msg(vp::trace::LEVEL_TRACE, " => bus update [id=%d]: scl=%d, sda=%d\n", + _this->trace.msg(vp::trace::LEVEL_TRACE, " => bus sync [id=%d]: scl=%d, sda=%d\n", id, scl, sda); /* store incoming values in maps */ _this->i2c_values[id].scl = scl; _this->i2c_values[id].sda = sda; - /* browse all values and compute resulting SCL and SDA */ - int res_scl_value = 1; - int res_sda_value = 1; + _this->do_resolve = true; + + if (_this->pending_resolve) + { + return; + } + + _this->trace.msg(vp::trace::LEVEL_TRACE, " => bus update\n"); - for (std::pair i2c_val : _this->i2c_values) + _this->pending_resolve = true; + + while (_this->do_resolve) { - _this->trace.msg(vp::trace::LEVEL_TRACE, "bus values [id=%d]: scl=%d, sda=%d\n", - i2c_val.first, - i2c_val.second.scl, - i2c_val.second.sda); - if (i2c_val.second.scl == 0) + _this->do_resolve = false; + + /* browse all values and compute resulting SCL and SDA */ + int res_scl_value = 1; + int res_sda_value = 1; + + for (std::pair i2c_val : _this->i2c_values) { - res_scl_value = 0; + _this->trace.msg(vp::trace::LEVEL_TRACE, "bus values [id=%d]: scl=%d, sda=%d\n", + i2c_val.first, + i2c_val.second.scl, + i2c_val.second.sda); + if (i2c_val.second.scl == 0) + { + res_scl_value = 0; + } + if (i2c_val.second.sda == 0) + { + res_sda_value = 0; + } } - if (i2c_val.second.sda == 0) + + /* broadcast the values to all peripherals if needed */ + if (res_scl_value != 
_this->bus_scl.get() || res_sda_value != _this->bus_sda.get()) { - res_sda_value = 0; + /* only propagate changes */ + _this->bus_scl.set(res_scl_value); + _this->bus_sda.set(res_sda_value); + _this->trace.msg(vp::trace::LEVEL_TRACE, "I2C: scl=%d, sda=%d\n", + _this->bus_scl.get(), _this->bus_sda.get()); + _this->in.sync(res_scl_value, res_sda_value); } } - /* broadcast the values to all peripherals if needed */ - if (res_scl_value != _this->bus_scl.get() || res_sda_value != _this->bus_sda.get()) - { - /* only propagate changes */ - _this->bus_scl.set(res_scl_value); - _this->bus_sda.set(res_sda_value); - _this->trace.msg(vp::trace::LEVEL_TRACE, "I2C: scl=%d, sda=%d\n", - _this->bus_scl.get(), _this->bus_sda.get()); - _this->in.sync(res_scl_value, res_sda_value); - } + _this->pending_resolve = false; + _this->trace.msg(vp::trace::LEVEL_TRACE, " => bus update done[id=%d]\n", id); } diff --git a/gvsoc/gvsoc/models/devices/sound/CMakeLists.txt b/gvsoc/gvsoc/models/devices/sound/CMakeLists.txt index fead9b06a..47da2a0d4 100644 --- a/gvsoc/gvsoc/models/devices/sound/CMakeLists.txt +++ b/gvsoc/gvsoc/models/devices/sound/CMakeLists.txt @@ -1,3 +1,4 @@ +add_subdirectory(dac) set(SOUND_PREFIX "devices/sound") diff --git a/gvsoc/gvsoc/models/devices/sound/dac/CMakeLists.txt b/gvsoc/gvsoc/models/devices/sound/dac/CMakeLists.txt new file mode 100644 index 000000000..7eb9285ee --- /dev/null +++ b/gvsoc/gvsoc/models/devices/sound/dac/CMakeLists.txt @@ -0,0 +1,6 @@ + +set(DAC_PREFIX "devices/sound/dac") + +vp_model(NAME ak4332 + PREFIX ${DAC_PREFIX} + SOURCES "ak4332.cpp") diff --git a/gvsoc/gvsoc/models/devices/sound/dac/ak4332.cpp b/gvsoc/gvsoc/models/devices/sound/dac/ak4332.cpp new file mode 100644 index 000000000..5cff4d467 --- /dev/null +++ b/gvsoc/gvsoc/models/devices/sound/dac/ak4332.cpp @@ -0,0 +1,638 @@ +/* + * Copyright (C) 2020 GreenWaves Technologies + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in 
compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Authors: Germain Haugou, GreenWaves Technologies (germain.haugou@greenwaves-technologies.com) + */ + + +#include +#include + + + +typedef enum +{ + I2C_STATE_WAIT_START, + I2C_STATE_WAIT_ADDRESS, + I2C_STATE_GET_DATA, + I2C_STATE_SAMPLE_DATA, + I2C_STATE_ACK, + I2C_STATE_READ_ACK +} I2c_state_e; + + +class Ak4332 : public vp::component +{ +public: + Ak4332(js::config *config); + + int build(); + +protected: + static void i2c_sync(void *__this, int scl, int sda); + void i2c_start(unsigned int address, bool is_read); + void i2c_handle_byte(uint8_t byte); + void i2c_stop(); + void i2c_get_data(); + void i2c_send_byte(uint8_t byte); + + void handle_reg_write(uint8_t address, uint8_t value); + uint8_t handle_reg_read(uint8_t address); + + void start(); + + vp::trace trace; + vp::i2c_master i2c_itf; + + unsigned int device_address; + + bool i2c_being_addressed; + unsigned int i2c_address; + uint8_t i2c_pending_data; + bool i2c_is_read; + I2c_state_e i2c_state; + int i2c_pending_bits; + int i2c_prev_sda; + int i2c_prev_scl; + unsigned int i2c_pending_send_byte; + uint8_t reg_address; + bool waiting_reg_address; + uint8_t power_1; + uint8_t power_2; + uint8_t power_3; + uint8_t power_4; + uint8_t output_mode; + uint8_t clock_mode; + uint8_t digital_filter; + uint8_t dac_mono_mixing; + uint8_t pdm_control; + uint8_t dac_volume_control; + uint8_t hp_volume_control; + uint8_t pll_clk_selection; + uint8_t pll_ref_clk_div_1; + uint8_t pll_ref_clk_div_2; + uint8_t pll_fb_clk_div_1; + uint8_t pll_fb_clk_div_2; + uint8_t 
dac_clk_source; + uint8_t dac_clk_divider; + uint8_t audio_format; + uint8_t pdm_err; + uint8_t dac_adjustment_1; + uint8_t dac_adjustment_2; +}; + + +Ak4332::Ak4332(js::config *config) + : vp::component(config) +{ +} + + +void Ak4332::start() +{ + this->i2c_itf.sync(1, 1); +} + + +void Ak4332::i2c_sync(void *__this, int scl, int sda) +{ + Ak4332 *_this = (Ak4332 *)__this; + + _this->trace.msg(vp::trace::LEVEL_TRACE, "I2C sync (scl: %d, sda: %d)\n", scl, sda); + + int sdo = 1; + + if (scl == 1 && _this->i2c_prev_sda != sda) + { + if (_this->i2c_prev_sda == 1) + { + _this->trace.msg(vp::trace::LEVEL_TRACE, "Detected start\n"); + + _this->i2c_state = I2C_STATE_WAIT_ADDRESS; + _this->i2c_address = 0; + _this->i2c_pending_bits = 8; + } + else + { + _this->i2c_state = I2C_STATE_WAIT_START; + _this->i2c_stop(); + } + goto end; + } + + if (!_this->i2c_prev_scl && scl) + { + switch (_this->i2c_state) + { + case I2C_STATE_WAIT_START: + { + sdo = 1; + break; + } + + case I2C_STATE_WAIT_ADDRESS: + { + if (_this->i2c_pending_bits > 1) + { + _this->i2c_address = (_this->i2c_address << 1) | sda; + _this->trace.msg(vp::trace::LEVEL_TRACE, "Received address bit (bit: %d, address: 0x%x, pending_bits: %d)\n", sda, _this->i2c_address, _this->i2c_pending_bits); + } + else + { + _this->i2c_is_read = sda; + } + _this->i2c_pending_bits--; + if (_this->i2c_pending_bits == 0) + { + _this->i2c_start(_this->i2c_address, _this->i2c_is_read); + _this->i2c_state = I2C_STATE_ACK; + _this->i2c_pending_bits = 8; + } + break; + } + + case I2C_STATE_SAMPLE_DATA: + { + _this->i2c_pending_data = (_this->i2c_pending_data << 1) | sda; + _this->trace.msg(vp::trace::LEVEL_TRACE, "Sampling data (bit: %d, pending_value: 0x%x, pending_bits: %d)\n", sda, _this->i2c_pending_data, _this->i2c_pending_bits); + _this->i2c_pending_bits--; + if (_this->i2c_pending_bits == 0) + { + _this->i2c_pending_bits = 8; + _this->i2c_handle_byte(_this->i2c_pending_data); + _this->i2c_state = I2C_STATE_ACK; + } + break; + } + + 
case I2C_STATE_ACK: { + _this->trace.msg(vp::trace::LEVEL_TRACE, "Ack (being_addressed: %d)\n", _this->i2c_being_addressed); + if (_this->i2c_being_addressed) + { + if (_this->i2c_is_read) + { + _this->i2c_state = I2C_STATE_GET_DATA; + _this->i2c_pending_bits = 8; + _this->i2c_get_data(); + } + else + { + _this->i2c_state = I2C_STATE_SAMPLE_DATA; + } + } + else + { + _this->i2c_state = I2C_STATE_WAIT_START; + } + + break; + } + + case I2C_STATE_READ_ACK: { + _this->i2c_state = I2C_STATE_WAIT_START; + break; + } + } + } + + if (_this->i2c_prev_scl && !scl) + { + switch (_this->i2c_state) + { + case I2C_STATE_ACK: + { + _this->trace.msg(vp::trace::LEVEL_TRACE, "Ack (being_addressed: %d)\n", _this->i2c_being_addressed); + sdo = !_this->i2c_being_addressed; + break; + } + + case I2C_STATE_READ_ACK: + { + _this->trace.msg(vp::trace::LEVEL_TRACE, "Read ack\n"); + sdo = 0; + break; + } + + case I2C_STATE_GET_DATA: + { + sdo = (_this->i2c_pending_send_byte >> 7) & 1; + _this->trace.msg(vp::trace::LEVEL_TRACE, "Sending bit (bit: %d, pending_value: 0x%x, pending_bits: %d)\n", sdo, _this->i2c_pending_send_byte, _this->i2c_pending_bits); + _this->i2c_pending_send_byte <<= 1; + _this->i2c_pending_bits--; + if (_this->i2c_pending_bits == 0) + { + _this->i2c_state = I2C_STATE_READ_ACK; + } + break; + } + } + } + +end: + if (_this->i2c_prev_scl && !scl) + { + _this->trace.msg(vp::trace::LEVEL_TRACE, "Sync sda (value: %d)\n", sdo); + _this->i2c_itf.sync(1, sdo); + } + _this->i2c_prev_sda = sda; + _this->i2c_prev_scl = scl; +} + +void Ak4332::i2c_start(unsigned int address, bool is_read) +{ + this->trace.msg(vp::trace::LEVEL_TRACE, "Received header (address: 0x%x, is_read: %d)\n", address, is_read); + + this->i2c_being_addressed = address == this->device_address; + if (is_read) + { + this->i2c_send_byte(this->handle_reg_read(this->reg_address)); + } +} + +void Ak4332::i2c_handle_byte(uint8_t byte) +{ + this->trace.msg(vp::trace::LEVEL_TRACE, "Handle byte (value: 0x%x)\n", byte); + + 
if (this->waiting_reg_address) + { + this->reg_address = byte; + this->waiting_reg_address = false; + } + else + { + this->handle_reg_write(this->reg_address, byte); + this->waiting_reg_address = true; + } +} + +void Ak4332::i2c_stop() +{ + this->trace.msg(vp::trace::LEVEL_TRACE, "Received stop bit\n"); + +} + +void Ak4332::i2c_get_data() +{ + this->trace.msg(vp::trace::LEVEL_TRACE, "Getting data\n"); +} + +void Ak4332::i2c_send_byte(uint8_t byte) +{ + this->i2c_pending_send_byte = byte; +} + + +void Ak4332::handle_reg_write(uint8_t address, uint8_t value) +{ + this->trace.msg(vp::trace::LEVEL_TRACE, "Register write (address: 0x%x, value: 0x%x)\n", address, value); + + switch (address) + { + case 0x00: + { + this->trace.msg(vp::trace::LEVEL_INFO, "Writing register (name: %s, value: 0x%x)\n", "Power Management 1", value); + this->power_1 = value; + break; + } + case 0x01: + { + this->trace.msg(vp::trace::LEVEL_INFO, "Writing register (name: %s, value: 0x%x)\n", "Power Management 2", value); + this->power_2 = value; + break; + } + case 0x02: + { + this->trace.msg(vp::trace::LEVEL_INFO, "Writing register (name: %s, value: 0x%x)\n", "Power Management 3", value); + this->power_3 = value; + break; + } + case 0x03: + { + this->trace.msg(vp::trace::LEVEL_INFO, "Writing register (name: %s, value: 0x%x)\n", "Power Management 4", value); + this->power_4 = value; + break; + } + case 0x04: + { + this->trace.msg(vp::trace::LEVEL_INFO, "Writing register (name: %s, value: 0x%x)\n", "Output Mode Setting", value); + this->output_mode = value; + break; + } + case 0x05: + { + this->trace.msg(vp::trace::LEVEL_INFO, "Writing register (name: %s, value: 0x%x)\n", "Clock Mode Selection", value); + this->clock_mode = value; + break; + } + case 0x06: + { + this->trace.msg(vp::trace::LEVEL_INFO, "Writing register (name: %s, value: 0x%x)\n", "Digital Filter Selection", value); + this->digital_filter = value; + break; + } + case 0x07: + { + this->trace.msg(vp::trace::LEVEL_INFO, "Writing 
register (name: %s, value: 0x%x)\n", "DAC Mono Mixing", value); + this->dac_mono_mixing = value; + break; + } + case 0x08: + { + this->trace.msg(vp::trace::LEVEL_INFO, "Writing register (name: %s, value: 0x%x)\n", "PDM I/F Control", value); + this->pdm_control = value; + break; + } + case 0x0B: + { + this->trace.msg(vp::trace::LEVEL_INFO, "Writing register (name: %s, value: 0x%x)\n", "DAC Output Volume", value); + this->dac_volume_control = value; + break; + } + case 0x0D: + { + this->trace.msg(vp::trace::LEVEL_INFO, "Writing register (name: %s, value: 0x%x)\n", "HP Volume Control", value); + this->hp_volume_control = value; + break; + } + case 0x0E: + { + this->trace.msg(vp::trace::LEVEL_INFO, "Writing register (name: %s, value: 0x%x)\n", "PLL CLK Source Selection", value); + this->pll_clk_selection = value; + break; + } + case 0x0F: + { + this->trace.msg(vp::trace::LEVEL_INFO, "Writing register (name: %s, value: 0x%x)\n", "PLL Ref CLK Divider 1", value); + this->pll_ref_clk_div_1 = value; + break; + } + case 0x10: + { + this->trace.msg(vp::trace::LEVEL_INFO, "Writing register (name: %s, value: 0x%x)\n", "PLL Ref CLK Divider 2", value); + this->pll_ref_clk_div_2 = value; + break; + } + case 0x11: + { + this->trace.msg(vp::trace::LEVEL_INFO, "Writing register (name: %s, value: 0x%x)\n", "PLL FB CLK Divider 1", value); + this->pll_fb_clk_div_1 = value; + break; + } + case 0x12: + { + this->trace.msg(vp::trace::LEVEL_INFO, "Writing register (name: %s, value: 0x%x)\n", "PLL FB CLK Divider 2", value); + this->pll_fb_clk_div_2 = value; + break; + } + case 0x13: + { + this->trace.msg(vp::trace::LEVEL_INFO, "Writing register (name: %s, value: 0x%x)\n", "DAC CLK Source", value); + this->dac_clk_source = value; + break; + } + case 0x14: + { + this->trace.msg(vp::trace::LEVEL_INFO, "Writing register (name: %s, value: 0x%x)\n", "DAC CLK Divider", value); + this->dac_clk_divider = value; + break; + } + case 0x15: + { + this->trace.msg(vp::trace::LEVEL_INFO, "Writing register 
(name: %s, value: 0x%x)\n", "Audio I/F Format", value); + this->audio_format = value; + break; + } + case 0x17: + { + this->trace.msg(vp::trace::LEVEL_INFO, "Writing register (name: %s, value: 0x%x)\n", "PDMERR", value); + this->pdm_err = value; + break; + } + case 0x26: + { + this->trace.msg(vp::trace::LEVEL_INFO, "Writing register (name: %s, value: 0x%x)\n", "DAC Adjustment 1", value); + this->dac_adjustment_1 = value; + break; + } + case 0x27: + { + this->trace.msg(vp::trace::LEVEL_INFO, "Writing register (name: %s, value: 0x%x)\n", "DAC Adjustment 2", value); + this->dac_adjustment_2 = value; + break; + } + default: + this->trace.force_warning("Writing invalid register (address: 0x%x)\n", address); + break; + } + +} + + +uint8_t Ak4332::handle_reg_read(uint8_t address) +{ + this->trace.msg(vp::trace::LEVEL_DEBUG, "Register read (address: 0x%x)\n", address); + + uint8_t value = 0xFF; + + switch (address) + { + case 0x00: + { + value = this->power_1; + this->trace.msg(vp::trace::LEVEL_INFO, "Reading register (name: %s, value: 0x%x)\n", "Power Management 1", value); + break; + } + case 0x01: + { + value = this->power_2; + this->trace.msg(vp::trace::LEVEL_INFO, "Reading register (name: %s, value: 0x%x)\n", "Power Management 2", value); + break; + } + case 0x02: + { + value = this->power_3; + this->trace.msg(vp::trace::LEVEL_INFO, "Reading register (name: %s, value: 0x%x)\n", "Power Management 3", value); + break; + } + case 0x03: + { + value = this->power_4; + this->trace.msg(vp::trace::LEVEL_INFO, "Reading register (name: %s, value: 0x%x)\n", "Power Management 4", value); + break; + } + case 0x04: + { + value = this->output_mode; + this->trace.msg(vp::trace::LEVEL_INFO, "Reading register (name: %s, value: 0x%x)\n", "Output Mode Setting", value); + break; + } + case 0x05: + { + value = this->clock_mode; + this->trace.msg(vp::trace::LEVEL_INFO, "Reading register (name: %s, value: 0x%x)\n", "Clock Mode Selection", value); + break; + } + case 0x06: + { + value = 
this->digital_filter; + this->trace.msg(vp::trace::LEVEL_INFO, "Reading register (name: %s, value: 0x%x)\n", "Digital Filter Selection", value); + break; + } + case 0x07: + { + value = this->dac_mono_mixing; + this->trace.msg(vp::trace::LEVEL_INFO, "Reading register (name: %s, value: 0x%x)\n", "DAC Mono Mixing", value); + break; + } + case 0x08: + { + value = this->pdm_control; + this->trace.msg(vp::trace::LEVEL_INFO, "Reading register (name: %s, value: 0x%x)\n", "PDM I/F Control", value); + break; + } + case 0x0B: + { + value = this->dac_volume_control; + this->trace.msg(vp::trace::LEVEL_INFO, "Reading register (name: %s, value: 0x%x)\n", "DAC Output Volume", value); + break; + } + case 0x0D: + { + value = this->hp_volume_control; + this->trace.msg(vp::trace::LEVEL_INFO, "Reading register (name: %s, value: 0x%x)\n", "HP Volume Control", value); + break; + } + case 0x0E: + { + value = this->pll_clk_selection; + this->trace.msg(vp::trace::LEVEL_INFO, "Reading register (name: %s, value: 0x%x)\n", "PLL CLK Source Selection", value); + break; + } + case 0x0F: + { + value = this->pll_ref_clk_div_1; + this->trace.msg(vp::trace::LEVEL_INFO, "Reading register (name: %s, value: 0x%x)\n", "PLL Ref CLK Divider 1", value); + break; + } + case 0x10: + { + value = this->pll_ref_clk_div_2; + this->trace.msg(vp::trace::LEVEL_INFO, "Reading register (name: %s, value: 0x%x)\n", "PLL Ref CLK Divider 2", value); + break; + } + case 0x11: + { + value = this->pll_fb_clk_div_1; + this->trace.msg(vp::trace::LEVEL_INFO, "Reading register (name: %s, value: 0x%x)\n", "PLL FB CLK Divider 1", value); + break; + } + case 0x12: + { + value = this->pll_fb_clk_div_2; + this->trace.msg(vp::trace::LEVEL_INFO, "Reading register (name: %s, value: 0x%x)\n", "PLL FB CLK Divider 2", value); + break; + } + case 0x13: + { + value = this->dac_clk_source; + this->trace.msg(vp::trace::LEVEL_INFO, "Reading register (name: %s, value: 0x%x)\n", "DAC CLK Source", value); + break; + } + case 0x14: + { + value = 
this->dac_clk_divider; + this->trace.msg(vp::trace::LEVEL_INFO, "Reading register (name: %s, value: 0x%x)\n", "DAC CLK Divider", value); + break; + } + case 0x15: + { + value = this->audio_format; + this->trace.msg(vp::trace::LEVEL_INFO, "Reading register (name: %s, value: 0x%x)\n", "Audio I/F Format", value); + break; + } + case 0x17: + { + value = this->pdm_err; + this->trace.msg(vp::trace::LEVEL_INFO, "Reading register (name: %s, value: 0x%x)\n", "PDMERR", value); + break; + } + case 0x26: + { + value = this->dac_adjustment_1; + this->trace.msg(vp::trace::LEVEL_INFO, "Reading register (name: %s, value: 0x%x)\n", "DAC Adjustment 1", value); + break; + } + case 0x27: + { + value = this->dac_adjustment_2; + this->trace.msg(vp::trace::LEVEL_INFO, "Reading register (name: %s, value: 0x%x)\n", "DAC Adjustment 2", value); + break; + } + default: + this->trace.force_warning("Reading invalid register (address: 0x%x)\n", address); + break; + } + + return value; +} + + +int Ak4332::build() +{ + traces.new_trace("trace", &trace, vp::DEBUG); + + this->i2c_itf.set_sync_meth(&Ak4332::i2c_sync); + this->new_master_port("i2c", &this->i2c_itf); + + this->i2c_state = I2C_STATE_WAIT_START; + this->i2c_prev_sda = 1; + this->i2c_prev_scl = 1; + this->i2c_being_addressed = false; + this->device_address = 0x10; + this->waiting_reg_address = true; + + this->power_1 = 0x00; + this->power_2 = 0x00; + this->power_3 = 0x00; + this->power_4 = 0x00; + this->output_mode = 0x00; + this->clock_mode = 0x00; + this->digital_filter = 0x00; + this->dac_mono_mixing = 0x00; + this->pdm_control = 0x00; + this->dac_volume_control = 0x19; + this->hp_volume_control = 0x65; + this->pll_clk_selection = 0x00; + this->pll_ref_clk_div_1 = 0x00; + this->pll_ref_clk_div_2 = 0x00; + this->pll_fb_clk_div_1 = 0x00; + this->pll_fb_clk_div_2 = 0x00; + this->dac_clk_source = 0x00; + this->dac_clk_divider = 0x00; + this->audio_format = 0x00; + this->pdm_err = 0x00; + this->dac_adjustment_1 = 0x6C; + 
this->dac_adjustment_2 = 0x40; + + return 0; +} + + +extern "C" vp::component *vp_constructor(js::config *config) +{ + return new Ak4332(config); +} diff --git a/gvsoc/gvsoc/models/devices/testbench/i2s_verif.cpp b/gvsoc/gvsoc/models/devices/testbench/i2s_verif.cpp index 289acf5a6..8d696e332 100644 --- a/gvsoc/gvsoc/models/devices/testbench/i2s_verif.cpp +++ b/gvsoc/gvsoc/models/devices/testbench/i2s_verif.cpp @@ -74,7 +74,7 @@ class Rx_stream_libsnd_file : public Rx_stream class Rx_stream_raw_file : public Rx_stream { public: - Rx_stream_raw_file(Slot *slot, string filepath, int width, bool is_bin); + Rx_stream_raw_file(Slot *slot, string filepath, int width, bool is_bin, pi_testbench_i2s_verif_start_config_file_encoding_type_e encoding); uint32_t get_sample(int channel_id); Slot *slot; @@ -82,13 +82,14 @@ class Rx_stream_raw_file : public Rx_stream FILE *infile; int width; bool is_bin; + pi_testbench_i2s_verif_start_config_file_encoding_type_e encoding; }; class Tx_stream_raw_file : public Tx_stream { public: - Tx_stream_raw_file(Slot *slot, string filepath, int width, bool is_bin); + Tx_stream_raw_file(Slot *slot, string filepath, int width, bool is_bin, pi_testbench_i2s_verif_start_config_file_encoding_type_e encoding); void push_sample(uint32_t sample, int channel_id); Slot *slot; @@ -96,6 +97,7 @@ class Tx_stream_raw_file : public Tx_stream FILE *outfile; int width; bool is_bin; + pi_testbench_i2s_verif_start_config_file_encoding_type_e encoding; }; @@ -565,10 +567,11 @@ void I2s_verif::start(pi_testbench_i2s_verif_start_config_t *config) } -Tx_stream_raw_file::Tx_stream_raw_file(Slot *slot, std::string filepath, int width, bool is_bin) +Tx_stream_raw_file::Tx_stream_raw_file(Slot *slot, std::string filepath, int width, bool is_bin, pi_testbench_i2s_verif_start_config_file_encoding_type_e encoding) { this->width = width; this->is_bin = is_bin; + this->encoding = encoding; this->slot = slot; this->outfile = fopen(filepath.c_str(), "w"); 
this->slot->trace.msg(vp::trace::LEVEL_INFO, "Opening dumper (path: %s)\n", filepath.c_str()); @@ -583,6 +586,17 @@ void Tx_stream_raw_file::push_sample(uint32_t sample, int channel_id) { if (this->is_bin) { + if (this->encoding == PI_TESTBENCH_I2S_VERIF_FILE_ENCODING_TYPE_PLUSMINUS) + { + // Convert encoding from 0/1 to -1/+1 + if (sample == 0) + sample = (uint32_t)-1; + else if (sample == 1) + sample = 1; + else + sample = 0; // Error + } + int nb_bytes = (this->width + 7) / 8; if (fwrite((void *)&sample, nb_bytes, 1, this->outfile) != 1) { @@ -666,10 +680,11 @@ void Tx_stream_libsnd_file::push_sample(uint32_t data, int channel) } -Rx_stream_raw_file::Rx_stream_raw_file(Slot *slot, std::string filepath, int width, bool is_bin) +Rx_stream_raw_file::Rx_stream_raw_file(Slot *slot, std::string filepath, int width, bool is_bin, pi_testbench_i2s_verif_start_config_file_encoding_type_e encoding) { this->width = width; this->is_bin = is_bin; + this->encoding = encoding; this->slot = slot; this->infile = fopen(filepath.c_str(), "r"); if (this->infile == NULL) @@ -685,10 +700,26 @@ uint32_t Rx_stream_raw_file::get_sample(int channel_id) { int nb_bytes = (this->width + 7) / 8; uint32_t result = 0; - if (fread((void *)&result, nb_bytes, 1, this->infile) != 1) + int freadres = fread((void *)&result, nb_bytes, 1, this->infile); + + // this->slot->top->trace.msg(vp::trace::LEVEL_TRACE, "channel_id=%d, nb_bytes=%d, freadres=%d, result=%d\n", channel_id, nb_bytes, freadres, result); + + if (freadres != 1) { return 0; } + + if (this->encoding == PI_TESTBENCH_I2S_VERIF_FILE_ENCODING_TYPE_PLUSMINUS) + { + // Convert encoding from -1/+1 to 0/1 + if ((int32_t)result == -1) + result = 0; + else if ((int32_t)result == 1) + result = 1; + else + result = 0; + } + return result; } else @@ -872,7 +903,12 @@ void Slot::start(pi_testbench_i2s_verif_slot_start_config_t *config, Slot *reuse { if (config->tx_file_dumper.type == PI_TESTBENCH_I2S_VERIF_TX_FILE_DUMPER_TYPE_RAW || 
config->tx_file_dumper.type == PI_TESTBENCH_I2S_VERIF_RX_FILE_READER_TYPE_BIN) { - this->outstream = new Tx_stream_raw_file(this, filepath, config->tx_file_dumper.width, config->tx_file_dumper.type == PI_TESTBENCH_I2S_VERIF_RX_FILE_READER_TYPE_BIN); + this->outstream = new Tx_stream_raw_file( + this, + filepath, + config->tx_file_dumper.width, + config->tx_file_dumper.type == PI_TESTBENCH_I2S_VERIF_RX_FILE_READER_TYPE_BIN, + (pi_testbench_i2s_verif_start_config_file_encoding_type_e)config->tx_file_dumper.encoding); } else { @@ -904,7 +940,12 @@ void Slot::start(pi_testbench_i2s_verif_slot_start_config_t *config, Slot *reuse { if (config->rx_file_reader.type == PI_TESTBENCH_I2S_VERIF_RX_FILE_READER_TYPE_RAW || config->rx_file_reader.type == PI_TESTBENCH_I2S_VERIF_RX_FILE_READER_TYPE_BIN) { - this->instream = new Rx_stream_raw_file(this, filepath, config->rx_file_reader.width, config->rx_file_reader.type == PI_TESTBENCH_I2S_VERIF_RX_FILE_READER_TYPE_BIN); + this->instream = new Rx_stream_raw_file( + this, + filepath, + config->rx_file_reader.width, + config->rx_file_reader.type == PI_TESTBENCH_I2S_VERIF_RX_FILE_READER_TYPE_BIN, + (pi_testbench_i2s_verif_start_config_file_encoding_type_e)config->rx_file_reader.encoding); } else { diff --git a/gvsoc/gvsoc/models/devices/testbench/testbench.cpp b/gvsoc/gvsoc/models/devices/testbench/testbench.cpp index 2975e770d..0d2823038 100644 --- a/gvsoc/gvsoc/models/devices/testbench/testbench.cpp +++ b/gvsoc/gvsoc/models/devices/testbench/testbench.cpp @@ -1353,6 +1353,21 @@ std::string Testbench::handle_command(Gv_proxy *proxy, FILE *req_file, FILE *rep config->rx_file_reader.type = 0; } } + else if (name == "encoding") + { + if (value_str == "asis") + { + config->rx_file_reader.encoding = PI_TESTBENCH_I2S_VERIF_FILE_ENCODING_TYPE_ASIS; + } + else if (value_str == "plusminus") + { + config->rx_file_reader.encoding = PI_TESTBENCH_I2S_VERIF_FILE_ENCODING_TYPE_PLUSMINUS; + } + else + { + config->rx_file_reader.encoding = 
PI_TESTBENCH_I2S_VERIF_FILE_ENCODING_TYPE_ASIS; + } + } } config->type = PI_TESTBENCH_I2S_VERIF_RX_FILE_READER; @@ -1410,6 +1425,21 @@ std::string Testbench::handle_command(Gv_proxy *proxy, FILE *req_file, FILE *rep config->tx_file_dumper.type = 0; } } + else if (name == "encoding") + { + if (value_str == "asis") + { + config->tx_file_dumper.encoding = PI_TESTBENCH_I2S_VERIF_FILE_ENCODING_TYPE_ASIS; + } + else if (value_str == "plusminus") + { + config->tx_file_dumper.encoding = PI_TESTBENCH_I2S_VERIF_FILE_ENCODING_TYPE_PLUSMINUS; + } + else + { + config->tx_file_dumper.encoding = PI_TESTBENCH_I2S_VERIF_FILE_ENCODING_TYPE_ASIS; + } + } } config->type = PI_TESTBENCH_I2S_VERIF_TX_FILE_DUMPER; diff --git a/gvsoc/gvsoc/models/devices/testbench/testbench.hpp b/gvsoc/gvsoc/models/devices/testbench/testbench.hpp index d5830608f..079f25abd 100644 --- a/gvsoc/gvsoc/models/devices/testbench/testbench.hpp +++ b/gvsoc/gvsoc/models/devices/testbench/testbench.hpp @@ -240,6 +240,11 @@ typedef enum PI_TESTBENCH_I2S_VERIF_TX_FILE_DUMPER_TYPE_AU, } pi_testbench_i2s_verif_start_config_tx_file_dumper_type_e; +typedef enum +{ + PI_TESTBENCH_I2S_VERIF_FILE_ENCODING_TYPE_ASIS = 0, // Keep as is (default) + PI_TESTBENCH_I2S_VERIF_FILE_ENCODING_TYPE_PLUSMINUS, // Assume file contains -1/+1 values (usable for PDM only) +} pi_testbench_i2s_verif_start_config_file_encoding_type_e; // This structure can be used to describe what an I2S slot should do typedef struct @@ -261,6 +266,7 @@ typedef struct uint32_t filepath_len; uint8_t type; uint8_t width; + uint8_t encoding; } tx_file_dumper; struct { @@ -269,6 +275,7 @@ typedef struct uint32_t filepath_len; uint8_t type; uint8_t width; + uint8_t encoding; } rx_file_reader; }; diff --git a/gvsoc/gvsoc/models/memory/memory_impl.cpp b/gvsoc/gvsoc/models/memory/memory_impl.cpp index 4f7b748d2..da3edb540 100644 --- a/gvsoc/gvsoc/models/memory/memory_impl.cpp +++ b/gvsoc/gvsoc/models/memory/memory_impl.cpp @@ -40,7 +40,6 @@ class memory : public 
vp::component private: - static void power_callback(void *__this, vp::clock_event *event); static void power_ctrl_sync(void *__this, bool value); vp::trace trace; @@ -60,14 +59,13 @@ class memory : public vp::component bool power_trigger; bool powered_up; - vp::power::power_source idle_power; vp::power::power_source read_8_power; vp::power::power_source read_16_power; vp::power::power_source read_32_power; vp::power::power_source write_8_power; vp::power::power_source write_16_power; vp::power::power_source write_32_power; - vp::power::power_source leakage_power; + vp::power::power_source background_power; vp::clock_event *power_event; int64_t last_access_timestamp; @@ -79,15 +77,6 @@ memory::memory(js::config *config) } -void memory::power_callback(void *__this, vp::clock_event *event) -{ - memory *_this = (memory *)__this; - if (_this->last_access_timestamp < _this->get_time()) - { - _this->idle_power.dynamic_power_start(); - } -} - vp::io_req_status_e memory::req(void *__this, vp::io_req *req) { memory *_this = (memory *)__this; @@ -142,9 +131,6 @@ vp::io_req_status_e memory::req(void *__this, vp::io_req *req) else if (size == 4) _this->read_32_power.account_energy_quantum(); } - - if (!_this->power_event->is_enqueued()) - _this->event_enqueue(_this->power_event, 1); } #ifdef VP_TRACE_ACTIVE @@ -223,8 +209,7 @@ int memory::build() js::config *config = get_js_config()->get("power_trigger"); this->power_trigger = config != NULL && config->get_bool(); - power.new_power_source("leakage", &leakage_power, this->get_js_config()->get("**/leakage")); - power.new_power_source("idle", &idle_power, this->get_js_config()->get("**/idle")); + power.new_power_source("leakage", &background_power, this->get_js_config()->get("**/background")); power.new_power_source("read_8", &read_8_power, this->get_js_config()->get("**/read_8")); power.new_power_source("read_16", &read_16_power, this->get_js_config()->get("**/read_16")); power.new_power_source("read_32", &read_32_power, 
this->get_js_config()->get("**/read_32")); @@ -232,8 +217,6 @@ int memory::build() power.new_power_source("write_16", &write_16_power, this->get_js_config()->get("**/write_16")); power.new_power_source("write_32", &write_32_power, this->get_js_config()->get("**/write_32")); - power_event = this->event_new(memory::power_callback); - return 0; } @@ -287,8 +270,8 @@ void memory::start() } } - this->leakage_power.leakage_power_start(); - this->idle_power.dynamic_power_start(); + this->background_power.leakage_power_start(); + this->background_power.dynamic_power_start(); this->last_access_timestamp = -1; } diff --git a/gvsoc/gvsoc/models/utils/composite_impl.cpp b/gvsoc/gvsoc/models/utils/composite_impl.cpp index 907075e82..d5cf33847 100644 --- a/gvsoc/gvsoc/models/utils/composite_impl.cpp +++ b/gvsoc/gvsoc/models/utils/composite_impl.cpp @@ -37,6 +37,7 @@ class composite : public vp::component int build(); void start(); + void power_supply_set(int state); void dump_traces(FILE *file); @@ -83,6 +84,10 @@ void composite::add_port(std::string name, vp::port *port) this->ports[name] = port; } +void composite::power_supply_set(int state) +{ + //printf("%s power set %d\n", this->get_path().c_str(), state); +} extern "C" vp::component *vp_constructor(js::config *config) diff --git a/gvsoc/gvsoc_gap/models/gap9/CMakeLists.txt b/gvsoc/gvsoc_gap/models/gap9/CMakeLists.txt index 9f4db84d7..693e03c7d 100644 --- a/gvsoc/gvsoc_gap/models/gap9/CMakeLists.txt +++ b/gvsoc/gvsoc_gap/models/gap9/CMakeLists.txt @@ -1 +1,8 @@ add_subdirectory(cpu) + +set(CLUSTER_PREFIX "gap9") + +vp_model(NAME cluster + PREFIX ${CLUSTER_PREFIX} + SOURCES "cluster.cpp" + ) diff --git a/gvsoc/gvsoc_gap/models/gap9/cluster.cpp b/gvsoc/gvsoc_gap/models/gap9/cluster.cpp new file mode 100644 index 000000000..a9da7bf70 --- /dev/null +++ b/gvsoc/gvsoc_gap/models/gap9/cluster.cpp @@ -0,0 +1,232 @@ +/* + * Copyright (C) 2020 GreenWaves Technologies, SAS, ETH Zurich and + * University of Bologna + * + * Licensed 
under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Authors: Germain Haugou, GreenWaves Technologies (germain.haugou@greenwaves-technologies.com) + */ + +#include + + + +class cluster : public vp::component +{ + +public: + cluster(js::config *config); + + vp::port *get_slave_port(std::string name) { return this->ports[name]; } + vp::port *get_master_port(std::string name) { return this->ports[name]; } + + void add_slave_port(std::string name, vp::slave_port *port) { this->add_port(name, port); } + void add_master_port(std::string name, vp::master_port *port) { this->add_port(name, port); } + + int build(); + void start(); + void reset(bool active); + void power_supply_set(int state); + + void dump_traces(FILE *file); + + static void cluster_clock_gating_en_sync(void *__this, bool value); + static void timer_busy_sync(void *__this, bool value, int id); + static void ne16_busy_sync(void *__this, bool value); + static void ico_busy_sync(void *__this, bool value); + static void dma_busy_sync(void *__this, bool value); + static void cores_busy_sync(void *__this, bool value, int id); + +private: + void add_port(std::string name, vp::port *port); + std::map ports; + void check_clock_gating(); + + vp::wire_slave cluster_clock_gating_en_itf; + vp::wire_slave timer_busy_itf[2]; + vp::wire_slave ne16_busy_itf; + vp::wire_slave ico_busy_itf; + vp::wire_slave dma_busy_itf; + vp::wire_slave cores_busy_itf[9]; + + int timer_busy; + bool ne16_busy; + bool ico_busy; + bool 
dma_busy; + int cores_busy; + + bool busy_sync; + + bool clock_gating_en; + + vp::trace trace; + + vp::power::power_source background_power; +}; + + + +cluster::cluster(js::config *config) + : vp::component(config) +{ +} + + +void cluster::dump_traces(FILE *file) +{ + this->power.get_power_trace()->dump(file); +} + + +void cluster::check_clock_gating() +{ + this->trace.msg(vp::trace::LEVEL_DEBUG, "Checking cluster clock gating (timer: 0x%x, ne16: %d, ico: %d, dma: %d, cores: 0x%x)\n", + this->timer_busy, this->ne16_busy, this->ico_busy, this->dma_busy, this->cores_busy); + + bool busy = this->timer_busy || this->ne16_busy || this->ico_busy || this->dma_busy || this->cores_busy; + + if (busy != this->busy_sync) + { + if (this->clock_gating_en && !busy) + { + this->power.power_supply_set_all(2); + } + else + { + this->power.power_supply_set_all(3); + } + this->busy_sync = busy; + } +} + +void cluster::cluster_clock_gating_en_sync(void *__this, bool value) +{ + cluster *_this = (cluster *)__this; + _this->clock_gating_en = value; + _this->check_clock_gating(); +} + +void cluster::timer_busy_sync(void *__this, bool value, int id) +{ + cluster *_this = (cluster *)__this; + _this->timer_busy = (_this->timer_busy & ~(1<check_clock_gating(); +} + +void cluster::ne16_busy_sync(void *__this, bool value) +{ + printf("%d BUSY %d\n", __LINE__, value); + +} + +void cluster::ico_busy_sync(void *__this, bool value) +{ + printf("%d BUSY %d\n", __LINE__, value); + +} + +void cluster::dma_busy_sync(void *__this, bool value) +{ + cluster *_this = (cluster *)__this; + _this->dma_busy = value; + _this->check_clock_gating(); +} + +void cluster::cores_busy_sync(void *__this, bool value, int id) +{ + cluster *_this = (cluster *)__this; + _this->cores_busy = (_this->cores_busy & ~(1<check_clock_gating(); +} + +void cluster::reset(bool active) +{ + if (active) + { + this->busy_sync = false; + this->timer_busy = 0; + this->ne16_busy = 0; + this->ico_busy = 0; + this->dma_busy = 0; + 
this->cores_busy = 0; + this->clock_gating_en = 0; + } +} + + +int cluster::build() +{ + traces.new_trace("trace", &trace, vp::DEBUG); + + this->cluster_clock_gating_en_itf.set_sync_meth(&cluster::cluster_clock_gating_en_sync); + new_slave_port("cluster_clock_gating_en", &this->cluster_clock_gating_en_itf); + + for (int i=0; i<2; i++) + { + this->timer_busy_itf[i].set_sync_meth_muxed(&cluster::timer_busy_sync, i); + new_slave_port("timer" + std::to_string(i) + "_busy", &this->timer_busy_itf[i]); + } + + this->ne16_busy_itf.set_sync_meth(&cluster::ne16_busy_sync); + new_slave_port("ne16_busy", &this->ne16_busy_itf); + + this->ico_busy_itf.set_sync_meth(&cluster::ico_busy_sync); + new_slave_port("ico_busy", &this->ico_busy_itf); + + this->dma_busy_itf.set_sync_meth(&cluster::dma_busy_sync); + new_slave_port("dma_busy", &this->dma_busy_itf); + + for (int i=0; i<9; i++) + { + this->cores_busy_itf[i].set_sync_meth_muxed(&cluster::cores_busy_sync, i); + new_slave_port("core_busy_" + std::to_string(i), &this->cores_busy_itf[i]); + } + + this->power.new_power_source("background", &this->background_power, this->get_js_config()->get("power_models/background")); + + this->create_comps(); + this->create_ports(); + this->create_bindings(); + + return 0; +} + + +void cluster::start() +{ + this->background_power.leakage_power_start(); + this->background_power.dynamic_power_start(); +} + + + +void cluster::add_port(std::string name, vp::port *port) +{ + vp_assert_always(port != NULL, this->get_trace(), "Adding NULL port\n"); + //vp_assert_always(this->ports[name] == NULL, this->get_trace(), "Adding already existing port\n"); + this->ports[name] = port; +} + +void cluster::power_supply_set(int state) +{ + + //printf("%s power set %d\n", this->get_path().c_str(), state); +} + + +extern "C" vp::component *vp_constructor(js::config *config) +{ + return new cluster(config); +} diff --git a/gvsoc/gvsoc_gap/models/pulp/cluster/cluster_ctrl_v2_impl.cpp 
b/gvsoc/gvsoc_gap/models/pulp/cluster/cluster_ctrl_v2_impl.cpp index dc1c3bca0..8e44c267c 100644 --- a/gvsoc/gvsoc_gap/models/pulp/cluster/cluster_ctrl_v2_impl.cpp +++ b/gvsoc/gvsoc_gap/models/pulp/cluster/cluster_ctrl_v2_impl.cpp @@ -68,6 +68,8 @@ class cluster_ctrl : public vp::component uint32_t dbg_halt_mask; uint32_t dbg_halt_status; uint32_t dbg_halt_status_sync; + + vp::wire_master clock_gating_en_itf; }; cluster_ctrl::cluster_ctrl(js::config *config) @@ -111,6 +113,10 @@ vp::io_req_status_e cluster_ctrl::req(void *__this, vp::io_req *req) } else if (offset == ARCHI_CLUSTER_CTRL_CLUSTER_CLK_GATE) { + if (_this->clock_gating_en_itf.is_bound()) + { + _this->clock_gating_en_itf.sync((*data) & 1); + } return vp::IO_REQ_OK; } else if (offset == ARCHI_CLUSTER_CTRL_DBG_STATUS) @@ -258,6 +264,8 @@ int cluster_ctrl::build() in.set_req_meth(&cluster_ctrl::req); new_slave_port("input", &in); + this->new_master_port("clock_gating_en", &this->clock_gating_en_itf); + for (int i = 0; i busy_itf; + vp::power::power_source background_power; + vp::power::power_source active_power; }; void Mchan_channel::reset() @@ -396,8 +402,7 @@ bool Mchan_channel::check_command(Mchan_cmd *cmd) top->trace.msg(vp::trace::LEVEL_TRACE, "Incrementing counter (id: %d, bytes: %d, remaining bytes: %d)\n", current_counter, cmd->size, top->pending_bytes[current_counter]); // Enqueue the command to the core queue - uint8_t one = 1; - this->top->cmd_events[cmd->counter_id].event(&one); + this->top->cmd_start(cmd->counter_id); pending_cmds->push(cmd); @@ -526,6 +531,20 @@ void Mchan_channel::trigger_event(Mchan_cmd *cmd) } } +void mchan::cmd_start(int cmd_id) +{ + uint8_t one = 1; + this->cmd_events[cmd_id].event(&one); + this->nb_cmd_started++; + if (this->nb_cmd_started == 1) + { + if (this->busy_itf.is_bound()) + { + this->active_power.dynamic_power_start(); + this->busy_itf.sync(1); + } + } +} void mchan::ext_grant(void *__this, vp::io_req *req) { @@ -876,6 +895,10 @@ void mchan::send_req() { 
ext_is_stalled = true; } + else + { + trace.force_warning("Got error during transfer (addr: 0x%lx, size: 0x%x)\n", cmd->source, size); + } } void mchan::check_ext_read_handler(void *__this, vp::clock_event *event) @@ -920,8 +943,17 @@ void mchan::check_ext_write_handler(void *__this, vp::clock_event *event) void mchan::handle_cmd_termination(Mchan_cmd *cmd) { - this->cmd_events[cmd->counter_id].event(NULL); - free_command(cmd); + this->cmd_events[cmd->counter_id].event(NULL); + this->nb_cmd_started--; + if (this->nb_cmd_started == 0) + { + if (this->busy_itf.is_bound()) + { + this->active_power.dynamic_power_stop(); + this->busy_itf.sync(0); + } + } + free_command(cmd); } void mchan::account_transfered_bytes(Mchan_cmd *cmd, int bytes) @@ -1148,6 +1180,7 @@ void mchan::check_queue() int mchan::build() { traces.new_trace("trace", &this->trace, vp::DEBUG); + new_master_port("busy", &this->busy_itf); for (int i=0; icmd_events[i], 8); } + this->power.new_power_source("background", &this->background_power, this->get_js_config()->get("**/power_models/background")); + this->power.new_power_source("active", &this->active_power, this->get_js_config()->get("**/power_models/active")); + return 0; } void mchan::start() { + this->background_power.leakage_power_start(); + this->background_power.dynamic_power_start(); } void mchan::reset(bool active) @@ -1187,6 +1225,7 @@ void mchan::reset(bool active) loc_port_ready_cycle[i] = 0; } + this->nb_cmd_started = 0; first_alloc_pending_req = NULL; last_alloc_pending_req = NULL; nb_core_read_cmd = 0; diff --git a/gvsoc/gvsoc_gap/models/pulp/ne16/src/ne16_debug.cpp b/gvsoc/gvsoc_gap/models/pulp/ne16/src/ne16_debug.cpp index 23131fa14..4bff46f71 100644 --- a/gvsoc/gvsoc_gap/models/pulp/ne16/src/ne16_debug.cpp +++ b/gvsoc/gvsoc_gap/models/pulp/ne16/src/ne16_debug.cpp @@ -23,13 +23,13 @@ void Ne16::debug_x_buffer() { if(this->mode_linear) { std::ostringstream stringStream; - stringStream << "x_buffer[32,16] = \n" << 
(this->trace_format?std::hex:std::dec) << std::setw(2) << this->x_buffer_linear << std::dec << "\n"; + stringStream << "x_buffer[32,16] = \n" << (this->trace_format?std::hex:std::dec) << this->x_buffer_linear << std::dec << "\n"; std::string copyOfStr = stringStream.str(); this->trace.msg(vp::trace::LEVEL_DEBUG, copyOfStr.c_str()); } else { std::ostringstream stringStream; - stringStream << "x_buffer[5,5,16] = \n" << (this->trace_format?std::hex:std::dec) << std::setw(2) << this->x_buffer << std::dec << "\n"; + stringStream << "x_buffer[5,5,16] = \n" << (this->trace_format?std::hex:std::dec) << this->x_buffer << std::dec << "\n"; std::string copyOfStr = stringStream.str(); this->trace.msg(vp::trace::LEVEL_DEBUG, copyOfStr.c_str()); } @@ -47,7 +47,7 @@ void Ne16::debug_x_array() { // } // else { std::ostringstream stringStream; - stringStream << "x_array[9,9,16] = \n" << xt::print_options::threshold(10000) << (this->trace_format?std::hex:std::dec) << std::setw(2) << this->x_array << std::dec << "\n"; + stringStream << "x_array[9,9,16] = \n" << xt::print_options::threshold(10000) << (this->trace_format?std::hex:std::dec) << this->x_array << std::dec << "\n"; std::string copyOfStr = stringStream.str(); this->trace.msg(vp::trace::LEVEL_DEBUG, copyOfStr.c_str()); // } @@ -55,7 +55,7 @@ void Ne16::debug_x_array() { void Ne16::debug_accum(){ std::ostringstream stringStream; - stringStream << "accum[9,32] = \n" << (this->trace_format?std::hex:std::dec) << std::setw(8) << xt::cast(this->accum) << std::dec << "\n"; + stringStream << "accum[9,32] = \n" << (this->trace_format?std::hex:std::dec) << xt::cast(this->accum) << std::dec << "\n"; std::string copyOfStr = stringStream.str(); this->trace.msg(vp::trace::LEVEL_DEBUG, copyOfStr.c_str()); } @@ -69,7 +69,7 @@ void Ne16::debug_accum(){ void Ne16::debug_psum_block(){ std::ostringstream stringStream; - stringStream << "psum_block[9,9] = \n" << (this->trace_format?std::hex:std::dec) << std::setw(8) << xt::cast(this->psum_block) 
<< std::dec << "\n"; + stringStream << "psum_block[9,9] = \n" << (this->trace_format?std::hex:std::dec) << xt::cast(this->psum_block) << std::dec << "\n"; std::string copyOfStr = stringStream.str(); this->trace.msg(vp::trace::LEVEL_DEBUG, copyOfStr.c_str()); } diff --git a/gvsoc/gvsoc_gap/models/pulp/pmu/pmu_v4_impl.cpp b/gvsoc/gvsoc_gap/models/pulp/pmu/pmu_v4_impl.cpp index ab9d559a2..a048ba4ef 100644 --- a/gvsoc/gvsoc_gap/models/pulp/pmu/pmu_v4_impl.cpp +++ b/gvsoc/gvsoc_gap/models/pulp/pmu/pmu_v4_impl.cpp @@ -236,6 +236,7 @@ class pmu_icu : public pmu_picl_slave private: pmu *top; vp::wire_master reset_itf; + vp::wire_master power_itf; pmu_icu_state states[16]; int index; int current_supply_state; @@ -706,6 +707,11 @@ void pmu_icu::icu_ctrl_req(bool is_write, uint16_t pwdata) } } + if (this->power_itf.is_bound()) + { + this->power_itf.sync(state->supply == MAESTRO_ICU_SUPPLY_ON); + } + this->current_supply_state = state->supply; top->picl_reply(); @@ -797,6 +803,7 @@ pmu_icu::pmu_icu(pmu *top, int index) : pmu_picl_slave(top), top(top), index(index) { top->new_master_port("icu" + std::to_string(index) + "_reset", &this->reset_itf); + top->new_master_port("icu" + std::to_string(index) + "_power", &this->power_itf); for (int i=0; i<16; i++) { diff --git a/gvsoc/gvsoc_gap/models/pulp/timer/timer_v2_impl.cpp b/gvsoc/gvsoc_gap/models/pulp/timer/timer_v2_impl.cpp index a1c9c4db2..bcc09ba9c 100644 --- a/gvsoc/gvsoc_gap/models/pulp/timer/timer_v2_impl.cpp +++ b/gvsoc/gvsoc_gap/models/pulp/timer/timer_v2_impl.cpp @@ -61,8 +61,10 @@ class timer : public vp::component uint64_t get_compare_value(bool is_64, int counter); uint64_t get_value(bool is_64, int counter); void set_value(bool is_64, int counter, uint64_t new_value); + void set_enable(int counter, bool enabled); vp::wire_master irq_itf[2]; + vp::wire_master busy_itf; vp::clock_slave ref_clock_itf; uint32_t value[2]; @@ -143,6 +145,16 @@ void timer::set_value(bool is_64, int counter, uint64_t new_value) else 
value[counter] = new_value; } +void timer::set_enable(int counter, bool enabled) +{ + this->is_enabled[counter] = enabled; + + if (this->busy_itf.is_bound()) + { + this->busy_itf.sync(enabled); + } +} + void timer::check_state_counter(bool is_64, int counter) { if (is_enabled[counter] && get_compare_value(is_64, counter) == get_value(is_64, counter)) @@ -165,7 +177,7 @@ void timer::check_state_counter(bool is_64, int counter) if (one_shot[counter]) { this->trace.msg(vp::trace::LEVEL_DEBUG, "Reached one-shot end (timer: %d)\n", counter); - is_enabled[counter] = false; + this->set_enable(counter, false); } } @@ -300,7 +312,7 @@ vp::io_req_status_e timer::handle_compare(int counter, uint32_t *data, unsigned void timer::depack_config(int counter, uint32_t configuration) { - is_enabled[counter] = (configuration >> TIMER_CFG_LO_ENABLE_BIT) & 1; + this->set_enable(counter, (configuration >> TIMER_CFG_LO_ENABLE_BIT) & 1); irq_enabled[counter] = (configuration >> TIMER_CFG_LO_IRQEN_BIT) & 1; iem[counter] = (configuration >> TIMER_CFG_LO_IEM_BIT) & 1; cmp_clr[counter] = (configuration >> TIMER_CFG_LO_MODE_BIT) & 1; @@ -385,6 +397,8 @@ int timer::build() new_master_port("irq_itf_0", &irq_itf[0]); new_master_port("irq_itf_1", &irq_itf[1]); + new_master_port("busy", &busy_itf); + ref_clock_itf.set_sync_meth(&timer::ref_clock_sync); new_slave_port("ref_clock", &ref_clock_itf); diff --git a/gvsoc/gvsoc_gap/models/pulp/udma/ffc/udma_ffc_v1.cpp b/gvsoc/gvsoc_gap/models/pulp/udma/ffc/udma_ffc_v1.cpp index f0b448356..c6dd9d543 100644 --- a/gvsoc/gvsoc_gap/models/pulp/udma/ffc/udma_ffc_v1.cpp +++ b/gvsoc/gvsoc_gap/models/pulp/udma/ffc/udma_ffc_v1.cpp @@ -22,9 +22,9 @@ using namespace std::placeholders; /* delay needed to replicate real performances */ -/* this should be 1 because ffc is 1 data/cycle, but 14 is the closest value +/* this should be 1 because ffc is 1 data/cycle, but 2 is the closest value * to replicate performance */ -#define FFC_DELAY_CYCLES (14) +#define 
FFC_DELAY_CYCLES (2) Ffc_periph::Ffc_periph(udma *top, int id, int itf_id) : Udma_periph(top, id) { @@ -47,6 +47,9 @@ Ffc_periph::Ffc_periph(udma *top, int id, int itf_id) : Udma_periph(top, id) /* setup event handlers */ this->event_convert = top->event_new(this, Ffc_periph::handle_event); + + /* Busy signal for VCD tracing */ + this->top->new_reg(itf_name + "/busy", &this->busy, 8); } @@ -56,6 +59,9 @@ void Ffc_periph::reset(bool active) this->rx_channel->reset(active); this->tx_channel->reset(active); + + // Since busy signal is displayed as a state, we need to release it when the FFC is not busy. */ + this->busy.release(); } @@ -96,6 +102,7 @@ vp::io_req_status_e Ffc_periph::custom_req(vp::io_req *req, uint64_t offset) this->trace.msg(vp::trace::LEVEL_TRACE, "Received START access\n"); /* start converting data */ this->enqueue_event(); + this->busy.set(1); break; default: break; @@ -207,6 +214,8 @@ void Ffc_periph::handle_event(void* __this, vp::clock_event* event) { /* done with conversion */ _this->state = FFC_STATE_IDLE; + // Since busy signal is displayed as a state, we need to release it when the FFC is not busy. 
*/ + _this->busy.release(); } else if (!_this->ffc_queue.empty()) { @@ -570,6 +579,7 @@ void Ffc_periph::convert_to_fixed(uint8_t* src, printf("Invalid float type\n"); break; } + this->push_data((uint8_t*) &dst, 4); } break; @@ -583,7 +593,7 @@ void Ffc_periph::enqueue_event(void) { if (!(this->event_convert)->is_enqueued()) { - this->top->get_periph_clock()->enqueue(this->event_convert, FFC_DELAY_CYCLES); + this->top->event_enqueue(this->event_convert, FFC_DELAY_CYCLES); } } diff --git a/gvsoc/gvsoc_gap/models/pulp/udma/ffc/udma_ffc_v1.hpp b/gvsoc/gvsoc_gap/models/pulp/udma/ffc/udma_ffc_v1.hpp index 8b142491f..345c65715 100644 --- a/gvsoc/gvsoc_gap/models/pulp/udma/ffc/udma_ffc_v1.hpp +++ b/gvsoc/gvsoc_gap/models/pulp/udma/ffc/udma_ffc_v1.hpp @@ -281,6 +281,9 @@ class Ffc_periph : public Udma_periph /** FFC TX channel, used to transmit data to the FFC */ Ffc_tx_channel *tx_channel; + + /** Busy signal for VCD tracing */ + vp::reg_8 busy; }; diff --git a/gvsoc/gvsoc_gap/models/pulp/udma/i2c/udma_i2c_v2.cpp b/gvsoc/gvsoc_gap/models/pulp/udma/i2c/udma_i2c_v2.cpp index 0a351268a..aad3c8d80 100644 --- a/gvsoc/gvsoc_gap/models/pulp/udma/i2c/udma_i2c_v2.cpp +++ b/gvsoc/gvsoc_gap/models/pulp/udma/i2c/udma_i2c_v2.cpp @@ -108,6 +108,7 @@ void I2c_tx_channel::handle_pending_word(void *__this, vp::clock_event *event) if (_this->periph->waiting_rx) { _this->periph->prev_scl ^= 1; + _this->periph->trace.msg("Sync (scl: %d, sda: %d)\n", _this->periph->prev_scl, 0); _this->periph->i2c_itf.sync(_this->periph->prev_scl, 0); if (_this->periph->prev_scl) diff --git a/gvsoc/gvsoc_gap/models/pulp/udma/i2c/v4/udma_i2c.cpp b/gvsoc/gvsoc_gap/models/pulp/udma/i2c/v4/udma_i2c.cpp index 8a589314f..411559042 100644 --- a/gvsoc/gvsoc_gap/models/pulp/udma/i2c/v4/udma_i2c.cpp +++ b/gvsoc/gvsoc_gap/models/pulp/udma/i2c/v4/udma_i2c.cpp @@ -38,7 +38,8 @@ I2c_periph::I2c_periph(udma *top, int id, int itf_id) : _2), std::bind(&I2c_periph::i2c_cancel_event, this, - _1) + _1), + "i2c" + 
std::to_string(itf_id) ), is_waiting_i2c_start(false), is_waiting_i2c_data(false), @@ -295,7 +296,7 @@ void I2c_periph::i2c_sync(void *__this, int scl, int sda) void I2c_periph::ucode_handler(ucode_data_t data) { - //I2C_PERIPH_FPRINTF("[I2C] ucode_handler: data.id=%d\n", data.id); + I2C_PERIPH_FPRINTF("[I2C] ucode_handler: data.id=0x%x\n", data.id); switch(data.id) { case CMD_MISC_NOP: @@ -367,11 +368,10 @@ void I2c_periph::ucode_handler(ucode_data_t data) } break; case CMD_LEAD_RECV: - if (this->repeat_downcounter == 0) + if (this->repeat_downcounter > 0) { - this->repeat_downcounter = 1; + this->is_waiting_i2c_data = true; } - this->is_waiting_i2c_data = true; break; case CMD_LEAD_RECV_LAST: // TODO @@ -576,7 +576,7 @@ void I2c_periph::i2c_helper_callback(i2c_operation_e id, i2c_status_e status, in void I2c_periph::i2c_start(void) { - if (!this->i2c_helper.is_busy()) + if (1) //!this->i2c_helper.is_busy()) { I2C_PERIPH_FPRINTF("Sending start directly\n"); this->is_waiting_i2c_start = true; diff --git a/gvsoc/gvsoc_gap/models/pulp/udma/udma_v4_addrgens.cpp b/gvsoc/gvsoc_gap/models/pulp/udma/udma_v4_addrgens.cpp index 43d48b9cb..0dea6be56 100644 --- a/gvsoc/gvsoc_gap/models/pulp/udma/udma_v4_addrgens.cpp +++ b/gvsoc/gvsoc_gap/models/pulp/udma/udma_v4_addrgens.cpp @@ -134,6 +134,7 @@ void Udma_addrgen_linear::cfg_ctrl_req(uint64_t reg_offset, int size, uint8_t *v vp::io_req_status_e Udma_addrgen_linear::access(uint64_t offset, int size, uint8_t *value, bool is_write) { + if (this->regmap.access(offset, size, value, is_write)) return vp::IO_REQ_INVALID; diff --git a/libs/gap_lib/jpeg/cluster.c b/libs/gap_lib/jpeg/cluster.c index fd0644ae4..4f9fd4191 100644 --- a/libs/gap_lib/jpeg/cluster.c +++ b/libs/gap_lib/jpeg/cluster.c @@ -30,8 +30,14 @@ #ifdef PMSIS_DRIVERS #define RT_USER_EVENT (CL_USER_EVENT) #define eu_evt_trig_from_id(x,y) (hal_eu_cluster_evt_trig_set(x,y)) +#if defined(__GAP9__) + #define eu_evt_maskWaitAndClr(x) (hal_cl_eu_evt_mask_wait_and_clear(x)) 
+#else #define eu_evt_maskWaitAndClr(x) (hal_cl_eu_evt_mask_wait_clear(x)) #endif +#else + #define RT_USER_EVENT 6 +#endif #define FLOAT2FIX(f) ((int)((f) * (1 << 11))) #define FIXQ 11 diff --git a/libs/gap_lib/testbench/testbench.h b/libs/gap_lib/testbench/testbench.h index 3eece6c29..5bbd70444 100644 --- a/libs/gap_lib/testbench/testbench.h +++ b/libs/gap_lib/testbench/testbench.h @@ -122,6 +122,12 @@ typedef enum PI_TESTBENCH_I2S_VERIF_RX_FILE_READER } pi_testbench_i2s_verif_start_config_type_e; +typedef enum +{ + PI_TESTBENCH_I2S_VERIF_FILE_ENCODING_TYPE_ASIS = 0, // Keep as is (default) + PI_TESTBENCH_I2S_VERIF_FILE_ENCODING_TYPE_PLUSMINUS, // Assume file contains -1/+1 values (usable for PDM only) +} pi_testbench_i2s_verif_start_config_file_encoding_type_e; + // This structure can be used to describe what an I2S slot should do typedef struct { @@ -142,6 +148,7 @@ typedef struct uint32_t filepath_len; uint8_t type; uint8_t width; + uint8_t encoding; } tx_file_dumper; struct { @@ -150,6 +157,7 @@ typedef struct uint32_t filepath_len; uint8_t type; uint8_t width; + uint8_t encoding; } rx_file_reader; }; diff --git a/libs/gap_lib/testbench/testlib.c b/libs/gap_lib/testbench/testlib.c index 721a38078..36cfc0209 100644 --- a/libs/gap_lib/testbench/testlib.c +++ b/libs/gap_lib/testbench/testlib.c @@ -10,7 +10,6 @@ #include "pmsis.h" #include "testbench.h" #include "testlib.h" -#include #include @@ -1108,128 +1107,3 @@ int i2s_test_stop(i2s_test_t *test) return 0; } - - -void testlib_hyperram_trafficgen_conf_init(testlib_hyperram_trafficgen_config_t *config) -{ - config->transfer_size = 8192; - config->itf = -1; - config->cs = -1; - config->frequency = -1; -} - - -int testlib_hyperram_trafficgen_init(testlib_hyperram_trafficgen_t *data, testlib_hyperram_trafficgen_config_t *config) -{ - struct pi_hyperram_conf conf; - pi_hyperram_conf_init(&conf); - - if (config->itf != -1) - { - conf.hyper_itf = config->itf; - } - if (config->cs != -1) - { - conf.hyper_cs = 
config->cs; - } - if (config->frequency != -1) - { - conf.baudrate = config->frequency; - } - - pi_open_from_conf(&data->dev, &conf); - - if (pi_ram_open(&data->dev)) - goto error0; - - if (pi_ram_alloc(&data->dev, &data->hyper_addr, config->transfer_size)) - goto error1; - - data->transfer_size = config->transfer_size; - - data->buffer = pi_l2_malloc(config->transfer_size); - if (data->buffer == NULL) goto error2; - - for (int i=0; itransfer_size/4; i++) - { - ((uint32_t *)data->buffer)[i] = i; - } - - return 0; - -error2: - pi_ram_free(&data->dev, data->hyper_addr, config->transfer_size); -error1: - pi_ram_close(&data->dev); -error0: - return -1; -} - - -static void testlib_hyperram_callback(void *arg) -{ - testlib_hyperram_trafficgen_t *data = (testlib_hyperram_trafficgen_t *)arg; - - if (data->end) - { - data->pending--; - if (data->pending == 0) - { - pi_task_push(&data->end_task); - } - return; - } - - if (data->is_read) - { - data->is_read = 0; - pi_ram_read_async(&data->dev, data->hyper_addr, data->buffer, data->transfer_size, pi_task_callback(&data->read_task, testlib_hyperram_callback, (void *)data)); - } - else - { - data->is_read = 1; - pi_ram_write_async(&data->dev, data->hyper_addr, data->buffer, data->transfer_size, pi_task_callback(&data->write_task, testlib_hyperram_callback, (void *)data)); - } -} - - -int testlib_hyperram_trafficgen_start(testlib_hyperram_trafficgen_t *data) -{ - data->is_read = 0; - data->end = 0; - data->pending = 2; - pi_task_block(&data->end_task); - pi_ram_write_async(&data->dev, data->hyper_addr, data->buffer, data->transfer_size, pi_task_callback(&data->read_task, testlib_hyperram_callback, (void *)data)); - pi_ram_read_async(&data->dev, data->hyper_addr, data->buffer, data->transfer_size, pi_task_callback(&data->write_task, testlib_hyperram_callback, (void *)data)); - return 0; -} - - -int testlib_hyperram_trafficgen_stop(testlib_hyperram_trafficgen_t *data) -{ - int errors = 0; - - data->end = 1; - - 
pi_task_wait_on(&data->end_task); - - for (int i=0; itransfer_size/4; i++) - { - uint32_t expected = i; - if (expected != ((uint32_t *)data->buffer)[i]) - { - errors++; - } - } - - return errors; -} - - -int testlib_hyperram_trafficgen_deinit(testlib_hyperram_trafficgen_t *data) -{ - pi_ram_free(&data->dev, data->hyper_addr, data->transfer_size); - pi_ram_close(&data->dev); - return 0; -} - diff --git a/libs/gap_lib/testbench/testlib.h b/libs/gap_lib/testbench/testlib.h index 5d9470398..650af5d7e 100644 --- a/libs/gap_lib/testbench/testlib.h +++ b/libs/gap_lib/testbench/testlib.h @@ -13,6 +13,9 @@ #include "pmsis.h" #include "testbench.h" +#include "testlib_i2s.h" +#include "testlib_uart.h" +#include "testlib_i2c.h" #define I2S_SLOT_STATIC_INIT {0} diff --git a/libs/gap_lib/testbench/testlib_hyper.c b/libs/gap_lib/testbench/testlib_hyper.c new file mode 100644 index 000000000..6ac967bc9 --- /dev/null +++ b/libs/gap_lib/testbench/testlib_hyper.c @@ -0,0 +1,139 @@ +/* + * Copyright (C) 2017 GreenWaves Technologies + * All rights reserved. + * + * This software may be modified and distributed under the terms + * of the BSD license. See the LICENSE file for details. 
+ * + */ + +#include "pmsis.h" +#include "testbench.h" +#include "testlib.h" +#include +#include + + +void testlib_hyperram_trafficgen_conf_init(testlib_hyperram_trafficgen_config_t *config) +{ + config->transfer_size = 8192; + config->itf = -1; + config->cs = -1; + config->frequency = -1; +} + + +int testlib_hyperram_trafficgen_init(testlib_hyperram_trafficgen_t *data, testlib_hyperram_trafficgen_config_t *config) +{ + struct pi_hyperram_conf conf; + pi_hyperram_conf_init(&conf); + + if (config->itf != -1) + { + conf.hyper_itf = config->itf; + } + if (config->cs != -1) + { + conf.hyper_cs = config->cs; + } + if (config->frequency != -1) + { + conf.baudrate = config->frequency; + } + + pi_open_from_conf(&data->dev, &conf); + + if (pi_ram_open(&data->dev)) + goto error0; + + if (pi_ram_alloc(&data->dev, &data->hyper_addr, config->transfer_size)) + goto error1; + + data->transfer_size = config->transfer_size; + + data->buffer = pi_l2_malloc(config->transfer_size); + if (data->buffer == NULL) goto error2; + + for (int i=0; itransfer_size/4; i++) + { + ((uint32_t *)data->buffer)[i] = i; + } + + return 0; + +error2: + pi_ram_free(&data->dev, data->hyper_addr, config->transfer_size); +error1: + pi_ram_close(&data->dev); +error0: + return -1; +} + + +static void testlib_hyperram_callback(void *arg) +{ + testlib_hyperram_trafficgen_t *data = (testlib_hyperram_trafficgen_t *)arg; + + if (data->end) + { + data->pending--; + if (data->pending == 0) + { + pi_task_push(&data->end_task); + } + return; + } + + if (data->is_read) + { + data->is_read = 0; + pi_ram_read_async(&data->dev, data->hyper_addr, data->buffer, data->transfer_size, pi_task_callback(&data->read_task, testlib_hyperram_callback, (void *)data)); + } + else + { + data->is_read = 1; + pi_ram_write_async(&data->dev, data->hyper_addr, data->buffer, data->transfer_size, pi_task_callback(&data->write_task, testlib_hyperram_callback, (void *)data)); + } +} + + +int 
testlib_hyperram_trafficgen_start(testlib_hyperram_trafficgen_t *data) +{ + data->is_read = 0; + data->end = 0; + data->pending = 2; + pi_task_block(&data->end_task); + pi_ram_write_async(&data->dev, data->hyper_addr, data->buffer, data->transfer_size, pi_task_callback(&data->read_task, testlib_hyperram_callback, (void *)data)); + pi_ram_read_async(&data->dev, data->hyper_addr, data->buffer, data->transfer_size, pi_task_callback(&data->write_task, testlib_hyperram_callback, (void *)data)); + return 0; +} + + +int testlib_hyperram_trafficgen_stop(testlib_hyperram_trafficgen_t *data) +{ + int errors = 0; + + data->end = 1; + + pi_task_wait_on(&data->end_task); + + for (int i=0; itransfer_size/4; i++) + { + uint32_t expected = i; + if (expected != ((uint32_t *)data->buffer)[i]) + { + errors++; + } + } + + return errors; +} + + +int testlib_hyperram_trafficgen_deinit(testlib_hyperram_trafficgen_t *data) +{ + pi_ram_free(&data->dev, data->hyper_addr, data->transfer_size); + pi_ram_close(&data->dev); + return 0; +} + diff --git a/libs/gap_lib/testbench/testlib_i2c.c b/libs/gap_lib/testbench/testlib_i2c.c new file mode 100644 index 000000000..69ce83bfd --- /dev/null +++ b/libs/gap_lib/testbench/testlib_i2c.c @@ -0,0 +1,110 @@ +/* + * Copyright (C) 2017 GreenWaves Technologies + * All rights reserved. + * + * This software may be modified and distributed under the terms + * of the BSD license. See the LICENSE file for details. 
+ * + */ + +#include "pmsis.h" +#include "testbench.h" +#include "testlib.h" +#include + + +void testlib_i2c_trafficgen_conf_init(testlib_i2c_trafficgen_config_t *config) +{ + config->transfer_size = 32; + config->itf = 0; + config->baudrate = 400000; +} + + +int testlib_i2c_trafficgen_init(testlib_i2c_trafficgen_t *data, testlib_i2c_trafficgen_config_t *config) +{ + struct pi_i2c_conf conf; + + data->transfer_size = config->transfer_size; + data->tx_buffers[0] = pi_l2_malloc(config->transfer_size); + data->tx_buffers[1] = pi_l2_malloc(config->transfer_size); + + if (data->tx_buffers[0] == NULL || data->tx_buffers[1] == NULL) + { + return -1; + } + + for (int i=0; itransfer_size; i++) + { + ((uint8_t *)data->tx_buffers[0])[i] = i; + ((uint8_t *)data->tx_buffers[1])[i] = i; + } + + pi_i2c_conf_init(&conf); + + conf.itf = config->itf; + //conf.max_baudrate = config->baudrate; + conf.cs = 0xA0; + + pi_open_from_conf(&data->dev, &conf); + + if (pi_i2c_open(&data->dev)) + { + return -1; + } + + return 0; +} + + +static void testlib_i2c_tx_callback(void *arg) +{ + testlib_i2c_trafficgen_t *data = (testlib_i2c_trafficgen_t *)arg; + + if (data->end) + { + data->tx_pending--; + if (data->tx_pending == 0) + { + pi_task_push(&data->tx_end_task); + } + return; + } + + pi_i2c_write_async(&data->dev, data->tx_buffers[data->tx_current_task], data->transfer_size, 0, pi_task_irq_callback(&data->tx_tasks[data->tx_current_task], testlib_i2c_tx_callback, (void *)data)); + data->tx_current_task ^= 1; +} + + +int testlib_i2c_trafficgen_start(testlib_i2c_trafficgen_t *data) +{ + data->end = 0; + + data->tx_pending = 2; + data->tx_current_task = 0; + pi_task_block(&data->tx_end_task); + pi_i2c_write_async(&data->dev, data->tx_buffers[0], data->transfer_size, 0, pi_task_irq_callback(&data->tx_tasks[0], testlib_i2c_tx_callback, (void *)data)); + pi_i2c_write_async(&data->dev, data->tx_buffers[1], data->transfer_size, 0, pi_task_irq_callback(&data->tx_tasks[1], testlib_i2c_tx_callback, (void 
*)data)); + + return 0; +} +int testlib_i2c_trafficgen_stop(testlib_i2c_trafficgen_t *data) +{ + int errors = 0; + + data->end = 1; + + pi_task_wait_on(&data->tx_end_task); + + return errors; +} + + +int testlib_i2c_trafficgen_deinit(testlib_i2c_trafficgen_t *data) +{ + pi_l2_free(data->tx_buffers[0], data->transfer_size); + pi_l2_free(data->tx_buffers[1], data->transfer_size); + pi_i2c_close(&data->dev); + return 0; +} + diff --git a/libs/gap_lib/testbench/testlib_i2c.h b/libs/gap_lib/testbench/testlib_i2c.h new file mode 100644 index 000000000..c97cbfd84 --- /dev/null +++ b/libs/gap_lib/testbench/testlib_i2c.h @@ -0,0 +1,40 @@ +/* + * Copyright (C) 2017 GreenWaves Technologies + * All rights reserved. + * + * This software may be modified and distributed under the terms + * of the BSD license. See the LICENSE file for details. + * + */ + +#pragma once + +#include "pmsis.h" + +typedef struct +{ + int itf; + int transfer_size; + int baudrate; +} testlib_i2c_trafficgen_config_t; + + +typedef struct +{ + pi_device_t dev; + uint32_t frame; + void *tx_buffers[2]; + pi_task_t tx_tasks[2]; + int tx_pending; + int tx_current_task; + pi_task_t tx_end_task; + int transfer_size; + int end; +} testlib_i2c_trafficgen_t; + + +void testlib_i2c_trafficgen_conf_init(testlib_i2c_trafficgen_config_t *config); +int testlib_i2c_trafficgen_init(testlib_i2c_trafficgen_t *data, testlib_i2c_trafficgen_config_t *config); +int testlib_i2c_trafficgen_start(testlib_i2c_trafficgen_t *data); +int testlib_i2c_trafficgen_stop(testlib_i2c_trafficgen_t *data); +int testlib_i2c_trafficgen_deinit(testlib_i2c_trafficgen_t *data); diff --git a/libs/gap_lib/testbench/testlib_i2s.c b/libs/gap_lib/testbench/testlib_i2s.c new file mode 100644 index 000000000..80d48128d --- /dev/null +++ b/libs/gap_lib/testbench/testlib_i2s.c @@ -0,0 +1,202 @@ +/* + * Copyright (C) 2017 GreenWaves Technologies + * All rights reserved. 
+ * + * This software may be modified and distributed under the terms + * of the BSD license. See the LICENSE file for details. + * + */ + +#include "pmsis.h" +#include "testbench.h" +#include "testlib.h" +#include + + +void testlib_i2s_trafficgen_conf_init(testlib_i2s_trafficgen_config_t *config) +{ + config->transfer_size = 128; + config->itf = 0; + config->sample_rate = 48000; + config->nb_slots = 16; + config->word_size = 32; +} + + +int testlib_i2s_trafficgen_init(testlib_i2s_trafficgen_t *data, testlib_i2s_trafficgen_config_t *config) +{ + struct pi_i2s_conf i2s_conf; + pi_i2s_conf_init(&i2s_conf); + + i2s_conf.frame_clk_freq = config->sample_rate; + i2s_conf.itf = config->itf; + i2s_conf.word_size = config->word_size; + i2s_conf.channels = config->nb_slots; + i2s_conf.options = PI_I2S_OPT_TDM | PI_I2S_OPT_FULL_DUPLEX; + + pi_open_from_conf(&data->dev, &i2s_conf); + if (pi_i2s_open(&data->dev)) + { + printf("Error opening i2s\n"); + return -3; + } + + struct pi_i2s_channel_conf i2s_slot_conf; + pi_i2s_channel_conf_init(&i2s_slot_conf); + uint16_t frame = (1 << config->nb_slots) - 1; + data->frame = frame; + for (int i=0; inb_slots; i++) + { + i2s_slot_conf.options = PI_I2S_OPT_IS_RX | PI_I2S_OPT_PINGPONG | PI_I2S_OPT_ENABLED; + i2s_slot_conf.word_size = config->word_size; + i2s_slot_conf.block_size = config->transfer_size; + + if (i == 0) + { + data->transfer_size = config->transfer_size * config->nb_slots; + i2s_slot_conf.pingpong_buffers[0] = pi_l2_malloc(data->transfer_size); + i2s_slot_conf.pingpong_buffers[1] = pi_l2_malloc(data->transfer_size); + data->rx_buffers[0] = i2s_slot_conf.pingpong_buffers[0]; + data->rx_buffers[1]= i2s_slot_conf.pingpong_buffers[1]; + + if (i2s_slot_conf.pingpong_buffers[0] == NULL || i2s_slot_conf.pingpong_buffers[1] == NULL) + { + printf("Error allocating memory\n"); + return -1; + } + } + + if (pi_i2s_frame_channel_conf_set(&data->dev, frame, i, &i2s_slot_conf)) + { + printf("Error setting conf channel\n"); + return -4; + } 
+ } + + pi_i2s_channel_conf_init(&i2s_slot_conf); + for (int i=0; inb_slots; i++) + { + i2s_slot_conf.options = PI_I2S_OPT_IS_TX | PI_I2S_OPT_PINGPONG | PI_I2S_OPT_ENABLED; + i2s_slot_conf.word_size = config->word_size; + i2s_slot_conf.block_size = config->transfer_size; + + if (i == 0) + { + i2s_slot_conf.pingpong_buffers[0] = pi_l2_malloc(config->transfer_size * config->nb_slots); + i2s_slot_conf.pingpong_buffers[1] = pi_l2_malloc(config->transfer_size * config->nb_slots); + data->tx_buffers[0] = i2s_slot_conf.pingpong_buffers[0]; + data->tx_buffers[1]= i2s_slot_conf.pingpong_buffers[1]; + + if (i2s_slot_conf.pingpong_buffers[0] == NULL || i2s_slot_conf.pingpong_buffers[1] == NULL) + { + printf("Error allocating memory\n"); + return -1; + } + for (int i=0; itransfer_size * config->nb_slots; i++) + { + ((uint8_t *)i2s_slot_conf.pingpong_buffers[0])[i] = i; + ((uint8_t *)i2s_slot_conf.pingpong_buffers[1])[i] = i; + } + + } + + if (pi_i2s_frame_channel_conf_set(&data->dev, frame, i, &i2s_slot_conf)) + { + printf("Error setting conf channel\n"); + return -4; + } + } + + return 0; +} + + +static void testlib_i2s_rx_callback(void *arg) +{ + testlib_i2s_trafficgen_t *data = (testlib_i2s_trafficgen_t *)arg; + + if (data->end) + { + data->rx_pending--; + if (data->rx_pending == 0) + { + pi_task_push(&data->rx_end_task); + } + return; + } + + pi_i2s_frame_read_async(&data->dev, data->frame, pi_task_irq_callback(&data->rx_tasks[data->rx_current_task], testlib_i2s_rx_callback, (void *)data)); + data->rx_current_task ^= 1; +} + +static void testlib_i2s_tx_callback(void *arg) +{ + testlib_i2s_trafficgen_t *data = (testlib_i2s_trafficgen_t *)arg; + + if (data->end) + { + data->tx_pending--; + if (data->tx_pending == 0) + { + pi_task_push(&data->tx_end_task); + } + return; + } + + pi_i2s_frame_write_async(&data->dev, data->frame, NULL, 0, pi_task_irq_callback(&data->tx_tasks[data->tx_current_task], testlib_i2s_tx_callback, (void *)data)); + data->tx_current_task ^= 1; +} + + 
+int testlib_i2s_trafficgen_start(testlib_i2s_trafficgen_t *data) +{ + data->end = 0; + + data->tx_pending = 2; + data->tx_current_task = 0; + pi_task_block(&data->tx_end_task); + pi_i2s_frame_write_async(&data->dev, data->frame, NULL, 0, pi_task_irq_callback(&data->tx_tasks[0], testlib_i2s_tx_callback, (void *)data)); + pi_i2s_frame_write_async(&data->dev, data->frame, NULL, 0, pi_task_irq_callback(&data->tx_tasks[1], testlib_i2s_tx_callback, (void *)data)); + + data->rx_pending = 2; + data->rx_current_task = 0; + pi_task_block(&data->rx_end_task); + pi_i2s_frame_read_async(&data->dev, data->frame, pi_task_irq_callback(&data->rx_tasks[0], testlib_i2s_rx_callback, (void *)data)); + pi_i2s_frame_read_async(&data->dev, data->frame, pi_task_irq_callback(&data->rx_tasks[1], testlib_i2s_rx_callback, (void *)data)); + + if (pi_i2s_ioctl(&data->dev, PI_I2S_IOCTL_START, NULL)) + { + return -4; + } + return 0; +} + + +int testlib_i2s_trafficgen_stop(testlib_i2s_trafficgen_t *data) +{ + int errors = 0; + + data->end = 1; + + pi_task_wait_on(&data->tx_end_task); + pi_task_wait_on(&data->rx_end_task); + + if (pi_i2s_ioctl(&data->dev, PI_I2S_IOCTL_STOP, NULL)) + { + return -4; + } + + return errors; +} + + +int testlib_i2s_trafficgen_deinit(testlib_i2s_trafficgen_t *data) +{ + pi_l2_free(data->tx_buffers[0], data->transfer_size); + pi_l2_free(data->tx_buffers[1], data->transfer_size); + pi_l2_free(data->rx_buffers[0], data->transfer_size); + pi_l2_free(data->rx_buffers[1], data->transfer_size); + pi_i2s_close(&data->dev); + return 0; +} + diff --git a/libs/gap_lib/testbench/testlib_i2s.h b/libs/gap_lib/testbench/testlib_i2s.h new file mode 100644 index 000000000..2c4a2b2ce --- /dev/null +++ b/libs/gap_lib/testbench/testlib_i2s.h @@ -0,0 +1,47 @@ +/* + * Copyright (C) 2017 GreenWaves Technologies + * All rights reserved. + * + * This software may be modified and distributed under the terms + * of the BSD license. See the LICENSE file for details. 
+ * + */ + +#pragma once + +#include "pmsis.h" + +typedef struct +{ + int itf; + int transfer_size; + int sample_rate; + int nb_slots; + int word_size; +} testlib_i2s_trafficgen_config_t; + + +typedef struct +{ + pi_device_t dev; + uint32_t frame; + void *tx_buffers[2]; + void *rx_buffers[2]; + pi_task_t rx_tasks[2]; + pi_task_t tx_tasks[2]; + int tx_pending; + int tx_current_task; + pi_task_t tx_end_task; + int rx_pending; + int rx_current_task; + pi_task_t rx_end_task; + int transfer_size; + int end; +} testlib_i2s_trafficgen_t; + + +void testlib_i2s_trafficgen_conf_init(testlib_i2s_trafficgen_config_t *config); +int testlib_i2s_trafficgen_init(testlib_i2s_trafficgen_t *data, testlib_i2s_trafficgen_config_t *config); +int testlib_i2s_trafficgen_start(testlib_i2s_trafficgen_t *data); +int testlib_i2s_trafficgen_stop(testlib_i2s_trafficgen_t *data); +int testlib_i2s_trafficgen_deinit(testlib_i2s_trafficgen_t *data); diff --git a/libs/gap_lib/testbench/testlib_uart.c b/libs/gap_lib/testbench/testlib_uart.c new file mode 100644 index 000000000..74b484b3e --- /dev/null +++ b/libs/gap_lib/testbench/testlib_uart.c @@ -0,0 +1,144 @@ +/* + * Copyright (C) 2017 GreenWaves Technologies + * All rights reserved. + * + * This software may be modified and distributed under the terms + * of the BSD license. See the LICENSE file for details. 
+ * + */ + +#include "pmsis.h" +#include "testbench.h" +#include "testlib.h" +#include + + +void testlib_uart_trafficgen_conf_init(testlib_uart_trafficgen_config_t *config) +{ + config->transfer_size = 128; + config->itf = 0; + config->baudrate = 1000000; + config->control_flow = 1; +} + + +int testlib_uart_trafficgen_init(testlib_uart_trafficgen_t *data, testlib_uart_trafficgen_config_t *config) +{ + struct pi_uart_conf conf; + + data->transfer_size = config->transfer_size; + data->rx_buffers[0] = pi_l2_malloc(config->transfer_size); + data->rx_buffers[1] = pi_l2_malloc(config->transfer_size); + data->tx_buffers[0] = pi_l2_malloc(config->transfer_size); + data->tx_buffers[1] = pi_l2_malloc(config->transfer_size); + + if (data->rx_buffers[0] == NULL || data->rx_buffers[1] == NULL || data->tx_buffers[0] == NULL || data->tx_buffers[1] == NULL) + { + return -1; + } + + for (int i=0; itransfer_size; i++) + { + ((uint8_t *)data->tx_buffers[0])[i] = i; + ((uint8_t *)data->tx_buffers[1])[i] = i; + } + + pi_uart_conf_init(&conf); + + conf.use_ctrl_flow = config->control_flow; + conf.enable_tx = 1; + conf.enable_rx = 1; + conf.uart_id = config->itf; + conf.baudrate_bps = config->baudrate; + + pi_open_from_conf(&data->dev, &conf); + + if (pi_uart_open(&data->dev)) + { + return -1; + } + + return 0; +} + + +static void testlib_uart_rx_callback(void *arg) +{ + testlib_uart_trafficgen_t *data = (testlib_uart_trafficgen_t *)arg; + + if (data->end) + { + data->rx_pending--; + if (data->rx_pending == 0) + { + pi_task_push(&data->rx_end_task); + } + return; + } + + pi_uart_read_async(&data->dev, data->rx_buffers[data->rx_current_task], data->transfer_size, pi_task_irq_callback(&data->rx_tasks[data->rx_current_task], testlib_uart_rx_callback, (void *)data)); + data->rx_current_task ^= 1; +} + +static void testlib_uart_tx_callback(void *arg) +{ + testlib_uart_trafficgen_t *data = (testlib_uart_trafficgen_t *)arg; + + if (data->end) + { + data->tx_pending--; + if (data->tx_pending == 
0) + { + pi_task_push(&data->tx_end_task); + } + return; + } + + pi_uart_write_async(&data->dev, data->tx_buffers[data->tx_current_task], data->transfer_size, pi_task_irq_callback(&data->tx_tasks[data->tx_current_task], testlib_uart_tx_callback, (void *)data)); + data->tx_current_task ^= 1; +} + + +int testlib_uart_trafficgen_start(testlib_uart_trafficgen_t *data) +{ + data->end = 0; + + data->tx_pending = 2; + data->tx_current_task = 0; + pi_task_block(&data->tx_end_task); + pi_uart_write_async(&data->dev, data->tx_buffers[0], data->transfer_size, pi_task_irq_callback(&data->tx_tasks[0], testlib_uart_tx_callback, (void *)data)); + pi_uart_write_async(&data->dev, data->tx_buffers[1], data->transfer_size, pi_task_irq_callback(&data->tx_tasks[1], testlib_uart_tx_callback, (void *)data)); + + data->rx_pending = 2; + data->rx_current_task = 0; + pi_task_block(&data->rx_end_task); + pi_uart_read_async(&data->dev, data->rx_buffers[0], data->transfer_size, pi_task_irq_callback(&data->rx_tasks[0], testlib_uart_rx_callback, (void *)data)); + pi_uart_read_async(&data->dev, data->rx_buffers[1], data->transfer_size, pi_task_irq_callback(&data->rx_tasks[1], testlib_uart_rx_callback, (void *)data)); + + return 0; +} + + +int testlib_uart_trafficgen_stop(testlib_uart_trafficgen_t *data) +{ + int errors = 0; + + data->end = 1; + + pi_task_wait_on(&data->tx_end_task); + pi_task_wait_on(&data->rx_end_task); + + return errors; +} + + +int testlib_uart_trafficgen_deinit(testlib_uart_trafficgen_t *data) +{ + pi_l2_free(data->tx_buffers[0], data->transfer_size); + pi_l2_free(data->tx_buffers[1], data->transfer_size); + pi_l2_free(data->rx_buffers[0], data->transfer_size); + pi_l2_free(data->rx_buffers[1], data->transfer_size); + pi_uart_close(&data->dev); + return 0; +} + diff --git a/libs/gap_lib/testbench/testlib_uart.h b/libs/gap_lib/testbench/testlib_uart.h new file mode 100644 index 000000000..f776509a1 --- /dev/null +++ b/libs/gap_lib/testbench/testlib_uart.h @@ -0,0 +1,46 @@ +/* 
+ * Copyright (C) 2017 GreenWaves Technologies + * All rights reserved. + * + * This software may be modified and distributed under the terms + * of the BSD license. See the LICENSE file for details. + * + */ + +#pragma once + +#include "pmsis.h" + +typedef struct +{ + int itf; + int transfer_size; + int baudrate; + int control_flow; +} testlib_uart_trafficgen_config_t; + + +typedef struct +{ + pi_device_t dev; + uint32_t frame; + void *tx_buffers[2]; + void *rx_buffers[2]; + pi_task_t rx_tasks[2]; + pi_task_t tx_tasks[2]; + int tx_pending; + int tx_current_task; + pi_task_t tx_end_task; + int rx_pending; + int rx_current_task; + pi_task_t rx_end_task; + int transfer_size; + int end; +} testlib_uart_trafficgen_t; + + +void testlib_uart_trafficgen_conf_init(testlib_uart_trafficgen_config_t *config); +int testlib_uart_trafficgen_init(testlib_uart_trafficgen_t *data, testlib_uart_trafficgen_config_t *config); +int testlib_uart_trafficgen_start(testlib_uart_trafficgen_t *data); +int testlib_uart_trafficgen_stop(testlib_uart_trafficgen_t *data); +int testlib_uart_trafficgen_deinit(testlib_uart_trafficgen_t *data); diff --git a/libs/openmp/tests/benchmark/testset.cfg b/libs/openmp/tests/benchmark/testset.cfg index daa17de0c..cf85c9156 100644 --- a/libs/openmp/tests/benchmark/testset.cfg +++ b/libs/openmp/tests/benchmark/testset.cfg @@ -1,4 +1,5 @@ from plptest import * +import os TestConfig = c = {} diff --git a/rtos/freeRTOS/vendors/gwt/gap8/pmsis/include/pmsis/implem/drivers/drivers.h b/rtos/freeRTOS/vendors/gwt/gap8/pmsis/include/pmsis/implem/drivers/drivers.h index f5a72ddec..85e653ab6 100644 --- a/rtos/freeRTOS/vendors/gwt/gap8/pmsis/include/pmsis/implem/drivers/drivers.h +++ b/rtos/freeRTOS/vendors/gwt/gap8/pmsis/include/pmsis/implem/drivers/drivers.h @@ -38,6 +38,7 @@ #include "pmsis/chips/gap8/pad.h" #include "pmsis/chips/gap8/gpio.h" #include "pmsis/chips/gap8/pmu.h" +#include "pmsis/chips/gap8/timer.h" /* Drivers. 
*/ #include "pmsis/drivers/cpi.h" diff --git a/rtos/freeRTOS/vendors/gwt/gap8/pmsis/include/pmsis/implem/drivers/timer/timer.h b/rtos/freeRTOS/vendors/gwt/gap8/pmsis/include/pmsis/implem/drivers/timer/timer.h index da1ea6a48..ae15ecdac 100644 --- a/rtos/freeRTOS/vendors/gwt/gap8/pmsis/include/pmsis/implem/drivers/timer/timer.h +++ b/rtos/freeRTOS/vendors/gwt/gap8/pmsis/include/pmsis/implem/drivers/timer/timer.h @@ -32,6 +32,7 @@ #define __PI_TIMER_H__ #include +#include "pmsis/chips/gap8/timer.h" #include "pmsis/targets/target.h" #include "pmsis/implem/hal/hal.h" @@ -56,6 +57,7 @@ *****************************************************************************/ /* @brief Timers. */ +#if 0 typedef enum { SYS_TIMER = 0, /*!< FC_TIMER_0 used as SysTick timer by preemptive RTOS. */ @@ -64,6 +66,8 @@ typedef enum CL_TIMER_0 = 2, /*!< Cluster Timer_Low. */ CL_TIMER_1 = 3 /*!< Cluster Timer_High. */ } timer_e; +#endif +typedef pi_timer_e timer_e; /******************************************************************************* * Function declaration diff --git a/rtos/freeRTOS/vendors/gwt/pmsis/backend/pmsis_backend_native_task_api.c b/rtos/freeRTOS/vendors/gwt/pmsis/backend/pmsis_backend_native_task_api.c index b232c2aa3..15e67449a 100644 --- a/rtos/freeRTOS/vendors/gwt/pmsis/backend/pmsis_backend_native_task_api.c +++ b/rtos/freeRTOS/vendors/gwt/pmsis/backend/pmsis_backend_native_task_api.c @@ -78,18 +78,17 @@ void pi_time_wait_us(int time_us) /* Wait less than 1 ms. 
*/ if (time_us < 1000) { - uint32_t irq = pi_irq_disable(); - uint32_t freq_fc = pi_freq_get(PI_FREQ_DOMAIN_FC); - //uint64_t counter = (uint64_t) (((uint64_t) time_us) * freq_fc) / 1000000; - //uint64_t counter = (uint64_t) (((uint64_t) time_us) * freq_fc); - uint64_t freq = (uint64_t) ((uint64_t) time_us * (uint64_t) freq_fc); - uint32_t counter = (uint32_t) ((freq) >> 20); - //counter >>= 20; /* Div 10^6 */ - //uint32_t counter = (uint32_t) time_us; - //counter = (counter * freq_fc) / 1000000; - //printf("counter=%ld, freq=%ld, time_us=%d\n", counter, freq_fc, time_us); +#ifdef __VEGA__ + int irq = pi_irq_disable(); + for (volatile int i = 0; i < time_us; i++){}; pi_irq_restore(irq); - for (volatile uint32_t i=0; idata[8] = ((delay_us)/ref_clk_us) + (((delay_us)%ref_clk_us) > 0); //printf("ticks: %i\n ref_clk_us: %i\n rem: %i\n",task->data[8] @@ -165,7 +176,6 @@ void pi_task_timer_enqueue(struct pi_task *task, uint32_t delay_us) delayed_task.fifo_tail->next = task; } delayed_task.fifo_tail = task; -#endif } @@ -200,6 +210,7 @@ void __pi_task_timer_irq(void) NVIC_DisableIRQ(FC_IRQ_TIMER0_HI_EVT); } } +#endif /* __GAP8__ */ // return value allows to skip some OS logic when a switch has already been triggered int pi_task_delayed_increment_push(void) diff --git a/rtos/freeRTOS/vendors/gwt/pmsis/include/pmsis/backend/implementation_specific_defines.h b/rtos/freeRTOS/vendors/gwt/pmsis/include/pmsis/backend/implementation_specific_defines.h index c18724f57..02fd95137 100644 --- a/rtos/freeRTOS/vendors/gwt/pmsis/include/pmsis/backend/implementation_specific_defines.h +++ b/rtos/freeRTOS/vendors/gwt/pmsis/include/pmsis/backend/implementation_specific_defines.h @@ -26,7 +26,8 @@ #define pi_data_free(x,y) pmsis_l2_malloc_free(x,y) #define PI_TASK_IMPLEM \ - uint8_t destroy; + uint8_t destroy; \ + uint32_t time; #define PI_TASK_IMPLEM_NB_DATA 9 diff --git a/rtos/freeRTOS/vendors/gwt/pmsis/rtos/include/pmsis/rtos/os/pmsis_task.h 
b/rtos/freeRTOS/vendors/gwt/pmsis/rtos/include/pmsis/rtos/os/pmsis_task.h index 78e06acd8..38b08f218 100644 --- a/rtos/freeRTOS/vendors/gwt/pmsis/rtos/include/pmsis/rtos/os/pmsis_task.h +++ b/rtos/freeRTOS/vendors/gwt/pmsis/rtos/include/pmsis/rtos/os/pmsis_task.h @@ -55,21 +55,6 @@ */ pi_task_t *__pi_task_block(pi_task_t *task); -/** - * \brief Prepare an event task with callback. - * - * This function initializes an instance of event task. - * This event task executes the callback given in argument. - * - * \param callback_task Pointer to event task. - * \param func Callback function. - * \param arg Callback function argument. - * - * \return task This function returns the event task initialized. - */ -pi_task_t *__pi_task_callback(pi_task_t *callback_task, - void (*func)(void *), void *arg); - /** * \brief Wait on an event task. * @@ -110,6 +95,21 @@ void __pi_task_destroy(pi_task_t *task); */ void pi_task_delayed_fifo_enqueue(struct pi_task *task, uint32_t delay_us); + +static inline void __pi_task_push_no_irq(pi_task_t *task) +{ + pmsis_event_push(pmsis_event_get_default_scheduler(), task); +} + +static inline void __pi_task_push_exec_irq_safe(pi_task_t *task) +{ + pi_callback_func_t func = (pi_callback_func_t) task->arg[0]; + void *arg = (void *) task->arg[1]; + func(arg); +} + +void __pi_task_push_locked(pi_task_t * task); + /******************************************************************************* * API implementation ******************************************************************************/ @@ -122,15 +122,24 @@ static inline pi_task_t *pi_task_block(pi_task_t *task) static inline pi_task_t *pi_task_callback(pi_task_t *task, void (*callback)(void*), void *arg) { - return __pi_task_callback(task, callback, arg); + task->id = PI_TASK_CALLBACK_ID; + task->arg[0] = (uintptr_t) callback; + task->arg[1] = (uintptr_t) arg; + task->done = 0; + task->sync_obj = NULL; + //task->destroy = 0; + task->core_id = -1; + task->timeout = 0; + task->next = NULL; + 
return task; } static inline pi_task_t *pi_task_irq_callback(pi_task_t *task, - void (*callback)(void*), void *arg) + void (*callback)(void*), void *arg) { task->id = PI_TASK_IRQ_ID; - task->arg[0] = (uintptr_t)callback; - task->arg[1] = (uintptr_t)arg; + task->arg[0] = (uintptr_t) callback; + task->arg[1] = (uintptr_t) arg; return task; } @@ -139,19 +148,16 @@ static inline void pi_task_wait_on(pi_task_t *task) __pi_task_wait_on(task); } +static inline void pi_task_push_irq_safe(pi_task_t *task) +{ + __pi_task_push_locked(task); +} + static inline void pi_task_push(pi_task_t *task) { - switch (task->id) - { - case PI_TASK_NONE_ID : - pi_task_release(task); - break; - case PI_TASK_CALLBACK_ID : - __pi_task_push(task); - break; - default : - return; - } + uint32_t irq = pi_irq_disable(); + __pi_task_push_locked(task); + pi_irq_restore(irq); } static inline void pi_task_destroy(pi_task_t *task) @@ -169,6 +175,16 @@ static inline int32_t pi_task_transfer_end_result_get(pi_task_t *task) return task->arg[3]; } +static inline int32_t pi_task_status_get(pi_task_t *task) +{ + return task->arg[3]; +} + +static inline void pi_task_status_set(pi_task_t *task, int32_t status) +{ + task->arg[3] = status; +} + static inline void pi_task_timeout_callback_set(pi_task_t *task, pi_callback_func_t func, void *arg) { diff --git a/rtos/freeRTOS/vendors/gwt/pmsis/rtos/os/pmsis_task.c b/rtos/freeRTOS/vendors/gwt/pmsis/rtos/os/pmsis_task.c index ee3ae0893..83bbc2956 100644 --- a/rtos/freeRTOS/vendors/gwt/pmsis/rtos/os/pmsis_task.c +++ b/rtos/freeRTOS/vendors/gwt/pmsis/rtos/os/pmsis_task.c @@ -43,22 +43,7 @@ pi_task_t *__pi_task_block(pi_task_t *callback_task) callback_task->id = PI_TASK_NONE_ID; callback_task->done = 0; pi_sync_obj_init((void *) &(callback_task->sync_obj)); - callback_task->destroy = 1; - callback_task->core_id = -1; - callback_task->timeout = 0; - callback_task->next = NULL; - return callback_task; -} - -pi_task_t *__pi_task_callback(pi_task_t *callback_task, - 
pi_callback_func_t func, void *arg) -{ - callback_task->id = PI_TASK_CALLBACK_ID; - callback_task->arg[0] = (uintptr_t) func; - callback_task->arg[1] = (uintptr_t) arg; - callback_task->done = 0; - callback_task->sync_obj = NULL; - callback_task->destroy = 0; + //callback_task->destroy = 1; callback_task->core_id = -1; callback_task->timeout = 0; callback_task->next = NULL; @@ -67,16 +52,16 @@ pi_task_t *__pi_task_callback(pi_task_t *callback_task, void __pi_task_destroy(pi_task_t *task) { - if (task->destroy) + //if (task->destroy) { - task->destroy = 0; + //task->destroy = 0; // if the mutex is only virtual (e.g. wait on soc event) - hal_compiler_barrier(); + //hal_compiler_barrier(); if (task->sync_obj != NULL) { pi_sync_obj_deinit((void *) &(task->sync_obj)); } - hal_compiler_barrier(); + //hal_compiler_barrier(); } } @@ -96,12 +81,14 @@ void __pi_task_wait_on(pi_task_t *task) __pi_task_destroy(task); } +#if 0 void __pi_task_push(pi_task_t *task) { uint32_t irq = disable_irq(); pmsis_event_push(pmsis_event_get_default_scheduler(), task); restore_irq(irq); } +#endif /******************************************************************************* * API implementation @@ -118,13 +105,34 @@ pi_task_t *pi_task_block_no_mutex(pi_task_t *callback_task) callback_task->id = PI_TASK_NONE_ID; callback_task->done = 0; callback_task->sync_obj = NULL; - callback_task->destroy = 0; + //callback_task->destroy = 0; callback_task->core_id = -1; callback_task->timeout = 0; callback_task->next = NULL; return callback_task; } +void __pi_task_push_locked(pi_task_t * task) +{ + switch (task->id) + { + case PI_TASK_NONE_ID : + pi_task_release(task); + break; + + case PI_TASK_CALLBACK_ID : + __pi_task_push_no_irq(task); + break; + + case PI_TASK_IRQ_ID : + __pi_task_push_exec_irq_safe(task); + break; + + default : + return; + } +} + void pi_task_release(pi_task_t *task) { DEBUG_PRINTF("[%s] releasing task %p\n",__func__,task); diff --git 
a/rtos/freeRTOS/vendors/gwt/rules/freeRTOS_rules.mk b/rtos/freeRTOS/vendors/gwt/rules/freeRTOS_rules.mk index 6e0fc1289..c2225d9c6 100644 --- a/rtos/freeRTOS/vendors/gwt/rules/freeRTOS_rules.mk +++ b/rtos/freeRTOS/vendors/gwt/rules/freeRTOS_rules.mk @@ -408,13 +408,16 @@ $(BIN).size: $(BIN) flash: $(BIN) - gapy $(GAPY_TARGET_OPT) --platform=$(platform) --work-dir=$(BUILDDIR) $(config_args) $(gapy_args) run --flash --force --binary=$(BIN) $(runner_args) + gapy $(GAPY_TARGET_OPT) --platform=$(platform) --work-dir=$(BUILDDIR) $(config_args) $(gapy_args) run --flash --force --binary=$(BIN) $(runner_args) $(WSL_ENV) -flash_fs: $(BIN) +flash_noforce: $(BIN) gapy $(GAPY_TARGET_OPT) --platform=$(platform) --work-dir=$(BUILDDIR) $(config_args) $(gapy_args) run --flash --binary=$(BIN) $(runner_args) +flash_fs: $(BIN) + gapy $(GAPY_TARGET_OPT) --platform=$(platform) --work-dir=$(BUILDDIR) $(config_args) $(gapy_args) run --flash --binary=$(BIN) $(runner_args) $(WSL_ENV) + image: $(BIN) - gapy $(GAPY_TARGET_OPT) --platform=$(platform) --work-dir=$(BUILDDIR) $(config_args) $(gapy_args) run --image --binary=$(BIN) $(runner_args) + gapy $(GAPY_TARGET_OPT) --platform=$(platform) --work-dir=$(BUILDDIR) $(config_args) $(gapy_args) run --image --binary=$(BIN) $(runner_args) $(WSL_ENV) run: $(BIN) gapy $(GAPY_TARGET_OPT) --platform=$(platform) --work-dir=$(BUILDDIR) $(config_args) $(gapy_args) run --exec-prepare --exec --binary=$(BIN) $(runner_args) $(WSL_ENV) diff --git a/rtos/pmsis/pmsis_api/docs/rtos.rst b/rtos/pmsis/pmsis_api/docs/rtos.rst index 35500b4f4..3bb11eed1 100644 --- a/rtos/pmsis/pmsis_api/docs/rtos.rst +++ b/rtos/pmsis/pmsis_api/docs/rtos.rst @@ -9,6 +9,14 @@ Task :private-members: :protected-members: +Event_Task +.......... + +.. doxygengroup:: Event_Task + :members: + :private-members: + :protected-members: + Memory allocation .................
diff --git a/rtos/pmsis/pmsis_api/include/pmsis/chips/gap8/timer.h b/rtos/pmsis/pmsis_api/include/pmsis/chips/gap8/timer.h new file mode 100644 index 000000000..7cb15c00d --- /dev/null +++ b/rtos/pmsis/pmsis_api/include/pmsis/chips/gap8/timer.h @@ -0,0 +1,67 @@ +/* + * Copyright (C) 2021 GreenWaves Technologies + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * \ingroup groupChips + * + * \addtogroup GAP8 + * \{ + * + * \defgroup GAP8_Timers GAP8 Timers + * + * \brief GAP8 Timers + * + * This part enumerates available Timers on chip **GAP8**. + * + * There are a total of 4 timers : + * * 2 on Fabric Controller + * * 2 on Cluster + * + * \addtogroup GAP8_Timers + * \{ + */ + +/** + * \enum pi_timer_e + * + * \brief Timers. + * + * List of available timers. + */ +typedef enum +{ + SYS_TIMER = 0, /*!< FC_TIMER_0 used as SysTick timer by preemptive RTOS. */ + FC_TIMER_0 = 0, /*!< FC Timer_Low. */ + FC_TIMER_1 = 1, /*!< FC Timer_High, can be used as perf counter. */ + CL_TIMER_0 = 2, /*!< Cluster Timer_Low. */ + CL_TIMER_1 = 3 /*!< Cluster Timer_High. 
*/ +} pi_timer_e; + +/** + * \} end of GAP8_Timers + * + * \} end of GAP8 + */ + +#ifdef __cplusplus +} +#endif diff --git a/rtos/pmsis/pmsis_api/include/pmsis/chips/gap9/timer.h b/rtos/pmsis/pmsis_api/include/pmsis/chips/gap9/timer.h new file mode 100644 index 000000000..dfa9b21c5 --- /dev/null +++ b/rtos/pmsis/pmsis_api/include/pmsis/chips/gap9/timer.h @@ -0,0 +1,71 @@ +/* + * Copyright (C) 2021 GreenWaves Technologies + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * \ingroup groupChips + * + * \addtogroup GAP9 + * \{ + * + * \defgroup GAP9_Timers GAP9 Timers + * + * \brief GAP9 Timers + * + * This part enumerates available Timers on chip **GAP9**. + * + * There are a total of 6 timers : + * * 4 on Fabric Controller + * * 2 on Cluster + * + * \addtogroup GAP9_Timers + * \{ + */ + +/** + * \enum pi_timer_e + * + * \brief Timers. + * + * List of available timers. + */ +typedef enum +{ + SYS_TIMER = 0, /*!< FC TIMER_0 used as SysTick timer by preemptive RTOS. */ + FC_TIMER_0 = 0, /*!< FC Timer_0_Low. FC_TIMER_0 and FC_TIMER_1 can be used + together to form a 64 bits timer. */ + FC_TIMER_1 = 1, /*!< FC Timer_0_High. */ + FC_TIMER_2 = 2, /*!< FC Timer_1_Low. FC_TIMER_2 and FC_TIMER_3 can be used + together to form a 64 bits timer. */ + FC_TIMER_3 = 3, /*!< FC Timer_1_High. */ + CL_TIMER_0 = 4, /*!< Cluster Timer_0_Low. */ + CL_TIMER_1 = 5 /*!< Cluster Timer_0_High.
*/ +} pi_timer_e; + +/** + * \} end of GAP9_Timers + * + * \} end of GAP9 + */ + +#ifdef __cplusplus +} +#endif diff --git a/rtos/pmsis/pmsis_api/include/pmsis/drivers/timer.h b/rtos/pmsis/pmsis_api/include/pmsis/drivers/timer.h new file mode 100644 index 000000000..b4bcdc7d2 --- /dev/null +++ b/rtos/pmsis/pmsis_api/include/pmsis/drivers/timer.h @@ -0,0 +1,158 @@ +/* + * Copyright (C) 2021 GreenWaves Technologies + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * \ingroup groupDrivers + * + * \defgroup Timer Timer + * + * \brief Timer + * + * The timer driver includes API to manage different timers available on + * chips, for both Fabric Controller side and Cluster side. + */ + +/** + * \addtogroup Timer + * \{ + */ + + +/** + * \struct pi_timer_conf_s + * + * \brief Timer configuration structure. + * + * This structure is used to pass the desired timer configuration to the + * runtime when opening the device. + */ +struct pi_timer_conf_s +{ + uint32_t time_us; /*!< Timer value to compare. */ + uint8_t timer_id; /*!< Timer ID, refer to \ref pi_timer_e. */ + uint8_t one_shot; /*!< One shot timer, after reaching time_us, timer is disabled. */ + uint8_t irq_en; /*!< Enable timer IRQ. */ + uint8_t clk_src; /*!< Timer clock source: FLL=0, REF_CLK=1. */ + uint8_t timer_reset; /*!< When value is reached: CONTINUE=0, RESET=1. 
*/ + //uint8_t timer_64; /*!< Enable a 64-bit timer, using two 32-bit timers. */ +}; + + +/** + * \brief Initialize a timer configuration with default values. + * + * This function can be called to get default values for all parameters before + * setting some of them. + * The structure containing the configuration must be kept alive until the timer + * device is opened. + * + * \param conf Pointer to the timer configuration. + * + * \note Only FC_TIMER_2 and FC_TIMER_3 timers are available. + */ +void pi_timer_conf_init(struct pi_timer_conf_s *conf); + +/** + * \brief Open a timer. + * + * This function will do all the needed configuration to initialize a timer (on + * FC or Cluster) with given configuration. + * + * \param device Pointer to device structure. + * + * \retval 0 If operation is successful. + * \retval ERRNO An error code otherwise. + * + * \note This function must be called before the timer device can be used. + * \note For preemptive RTOS using time slicing, the FC_Timer_0 should be used as + * SysTick timer. Thus a 64 bit timer can not be used on FC side. + */ +int pi_timer_open(struct pi_device *device); + +/** + * \brief Close an opened timer device. + * + * This function closes a timer device. + * + * \param device Pointer to device structure. + */ +void pi_timer_close(struct pi_device *device); + +/** + * \brief Start a timer. + * + * This function starts a timer. + * + * \param device Pointer to device structure. + * + * \retval 0 If operation is successful. + * \retval ERRNO An error code otherwise. + */ +int pi_timer_start(struct pi_device *device); + +/** + * \brief Stop a timer. + * + * This function stops a timer. + * + * \param device Pointer to device structure. + */ +void pi_timer_stop(struct pi_device *device); + +/** + * \brief Reset a timer counter. + * + * This function resets a timer's counter register. + * + * \param device Pointer to device structure. + * + * \note This function does not reset a timer's configuration.
+ * To fully reset a timer, first stop the timer, then reinitialize + * the timer using pi_timer_init() function. + */ +void pi_timer_reset(struct pi_device *device); + +/** + * \brief Get a timer's counter value. + * + * This function reads the current counter value of a timer and stores it in the + * given buffer. + * + * \param device Pointer to device structure. + * \param value Buffer to store counter value. + * + * \retval 0 If operation is successful. + * \retval ERRNO An error code otherwise. + */ +int pi_timer_current_value_read(struct pi_device *device, uint32_t *value); + +int pi_timer_task_add(struct pi_device *device, uint32_t time_us, pi_task_t *task); + +/** + * \} end of Timer + */ + +#ifdef __cplusplus +} +#endif diff --git a/rtos/pmsis/pmsis_api/include/pmsis/rtos/pi_log.h b/rtos/pmsis/pmsis_api/include/pmsis/rtos/pi_log.h index 4770f588c..e70c60ca6 100644 --- a/rtos/pmsis/pmsis_api/include/pmsis/rtos/pi_log.h +++ b/rtos/pmsis/pmsis_api/include/pmsis/rtos/pi_log.h @@ -410,10 +410,10 @@ static inline int pi_log_default_vprintf(const char *format, va_list list) #define HYPER_DBG(fmt, ...) PI_LOG_DBG(HYPER_TAG, fmt, ##__VA_ARGS__) #define HYPER_TRC(fmt, ...) PI_LOG_TRC(HYPER_TAG, fmt, ##__VA_ARGS__) -#define MRAM_TAG "mram" +//#define MRAM_TAG "mram" #define MRAM_ERR(fmt, ...) PI_LOG_ERR(MRAM_TAG, fmt, ##__VA_ARGS__) #define MRAM_WNG(fmt, ...) PI_LOG_WNG(MRAM_TAG, fmt, ##__VA_ARGS__) -#define MRAM_INF(fmt, ...) PI_LOG_INF(MRAM_TAG, fmt, ##__VA_ARGS__) +#define MRAM_INF(fmt, ...) #define MRAM_DBG(fmt, ...) PI_LOG_DBG(MRAM_TAG, fmt, ##__VA_ARGS__) #define MRAM_TRC(fmt, ...)
PI_LOG_TRC(MRAM_TAG, fmt, ##__VA_ARGS__) diff --git a/rtos/pmsis/pmsis_api/include/pmsis/task.h b/rtos/pmsis/pmsis_api/include/pmsis/task.h index 3d4189d8c..c6ae9ba3f 100644 --- a/rtos/pmsis/pmsis_api/include/pmsis/task.h +++ b/rtos/pmsis/pmsis_api/include/pmsis/task.h @@ -182,13 +182,22 @@ static inline pi_callback_t *pi_callback_init(pi_callback_t *callback, static inline void pi_task_timeout_set(pi_task_t *task, uint32_t timeout_us); /** - * \brief Query result end of transfer. + * \brief Query task status. * - * This function can be used to check the end result of a transfer. + * This function can be used to check if a task completed successfully. * - * \return ERRNO Value corresponding to end of transfer. + * \return ERRNO Value corresponding to task status. */ -static inline int32_t pi_task_transfer_end_result_get(pi_task_t *task); +static inline int32_t pi_task_status_get(pi_task_t *task); + +/** + * \brief Set task status. + * + * This function can be used to tell if a task completed successfully. + * + * \param status Value corresponding to task status. 
+ */ +static inline void pi_task_status_set(pi_task_t *task, int32_t status); /** * @} diff --git a/rtos/pmsis/pmsis_bsp/adc/ads1014.c b/rtos/pmsis/pmsis_bsp/adc/ads1014.c new file mode 100644 index 000000000..45b5835d4 --- /dev/null +++ b/rtos/pmsis/pmsis_bsp/adc/ads1014.c @@ -0,0 +1,349 @@ +#include "pmsis.h" +#include "bsp/bsp.h" +#include "bsp/adc/ads1014.h" + +/**************/ +/* Structures */ +/**************/ + +enum ads1014_registers { + ADS1014_REGISTER_VALUE = 0x0, + ADS1014_REGISTER_CONF = 0x1, + ADS1014_REGISTER_THRESHOLD_LOW = 0x2, + ADS1014_REGISTER_THRESHOLD_HIGH = 0x3, +}; + +typedef union __attribute__((__packed__)) { + struct __attribute__((packed)) { + uint8_t _unused :4; + int16_t value :12; + } reg; + uint16_t value; +} ads1014_register_value_t; + +typedef union __attribute__((__packed__)) { + struct __attribute__((__packed__)) { + enum ads1014_comparator_status comparator_status :2; + enum ads1014_comparator_latch comparator_latch :1; + enum ads1014_comparator_polarity comparator_polarity :1; + enum ads1014_comparator_mode comparator_mode :1; + enum ads1014_data_rate data_rate :3; + enum ads1014_operating_mode operating_mode :1; + enum ads1014_pga pga :3; + uint8_t _unused: 3; + uint8_t converting :1; + } reg; + uint16_t value; +} ads1014_register_conf_t; + +typedef struct { + pi_device_t i2c_device; + ads1014_register_conf_t adc_conf; +} ads1014_data_t; + +/********************/ +/* Static functions */ +/********************/ + +static int +__ads1014_update_configuration(pi_device_t *device) { + ads1014_data_t *data = (ads1014_data_t*) device->data; + + /* set the adc to trigger a conversion (useful in single shot mode) */ + data->adc_conf.reg.converting = 1; + + uint16_t packed_conf = data->adc_conf.value; + uint8_t payload[3] = { ADS1014_REGISTER_CONF, + (packed_conf >> 8) & 0xFF, + packed_conf & 0xFF, + }; + + int status = pi_i2c_write(&data->i2c_device, payload, 3, + PI_I2C_XFER_START | PI_I2C_XFER_STOP); + return status; +}; + +/** + * 
+ * Convert a float value (in mV) to fit in a comparator threshold register + * + * @param[in] pga ads1014 pga setting (to know the scale) + * @param[in] f float to convert + * @param[out] reg register that will contain the final 12 bit value + * + * @return PI_OK if operation was successful, + * an error otherwise + */ +static inline int +__convert_float_to_register_value(enum ads1014_pga pga, float f, + ads1014_register_value_t *reg) +{ + if (NULL == reg) { + return PI_ERR_INVALID_ARG; + } + + /* Change the scale according to the ADC PGA */ + switch(pga) { + case ADS1014_PGA_FSR_6V144: + f /= 3; + break; + case ADS1014_PGA_FSR_4V096: + f /= 2; + break; + case ADS1014_PGA_FSR_2V048: + /* nothing, f *= 1*/ + break; + case ADS1014_PGA_FSR_1V024: + f *= 2; + break; + case ADS1014_PGA_FSR_0V512: + f *= 4; + break; + case ADS1014_PGA_FSR_0V256: /* fallthrough */ + default: + f *= 8; + break; + } + + /* fit into 12 bits */ + int32_t i = f; + /* check if it fits in 12 bits */ + if (__builtin_pulp_clb(i) < 20) { + return PI_ERR_INVALID_STATE; + } + reg->reg.value = f; + + return PI_OK; +} + +/** + * Helper to write a comparator register + * + * @param[in] device pointer to the ads1014 device (not checked) + * @param[in] reg register to write + * @param[in] value value to write in the register + * + * @return PI_OK if operation was succesful, an error code otherwise + */ +static inline int +__write_comparator_register(pi_device_t *device, enum ads1014_registers reg, + ads1014_register_value_t value) +{ + ads1014_data_t *data = (ads1014_data_t*) device->data; + + uint16_t packed_conf = value.value; + uint8_t payload[3] = { reg, + (packed_conf >> 8) & 0xFF, + packed_conf & 0xFF, + }; + + int status = pi_i2c_write(&data->i2c_device, payload, 3, + PI_I2C_XFER_START | PI_I2C_XFER_STOP); + return status; +} + +/*****************/ +/* API functions */ +/*****************/ + +void pi_ads1014_conf_init(struct pi_ads1014_conf *conf) { + if (NULL == conf) { + return; + } + + /* I2C 
related */ + conf->i2c_itf = 1; //TODO retrieve from BSP + conf->i2c_addr = 0x90; //TODO retrieve from BSP + + /* ADC General settings */ + conf->operating_mode = ADS1014_OPERATING_MODE_SINGLE_SHOT; + conf->pga = ADS1014_PGA_FSR_2V048; + conf->data_rate = ADS1014_DATA_RATE_SPS_1600; + + /* ADC comparator settings */ + conf->comparator_status = ADS1014_COMPARATOR_STATUS_DISABLED; + conf->comparator_mode = ADS1014_COMPARATOR_MODE_TRADITIONAL; + conf->comparator_latch = ADS1014_COMPARATOR_LATCH_DISABLED; + conf->comparator_polarity = ADS1014_COMPARATOR_POLARITY_ACTIVE_LOW; +} + +int pi_ads1014_open(pi_device_t *device) { + if (NULL == device) { + return PI_ERR_INVALID_ARG; + } + + struct pi_ads1014_conf *conf = (struct pi_ads1014_conf *) device->config; + + /* allocate memory for runtime data */ + ads1014_data_t *ads1014 = (ads1014_data_t *) pi_l2_malloc(sizeof(ads1014_data_t)); + if (NULL == ads1014) { + return PI_ERR_L2_NO_MEM; + } + device->data = (void*) ads1014; + + /* open bsp */ + //TODO + + /* initialize configuration register */ + { + ads1014->adc_conf.reg.operating_mode = conf->operating_mode; + ads1014->adc_conf.reg.pga = conf->pga; + ads1014->adc_conf.reg.data_rate = conf->data_rate; + ads1014->adc_conf.reg.comparator_status = conf->comparator_status; + ads1014->adc_conf.reg.comparator_mode = conf->comparator_mode; + ads1014->adc_conf.reg.comparator_latch = conf->comparator_latch; + ads1014->adc_conf.reg.comparator_mode = conf->comparator_mode; + ads1014->adc_conf.reg.comparator_polarity = conf->comparator_polarity; + ads1014->adc_conf.reg.converting = 1; /* initial trigger */ + } + + /* initialize I2C device */ + { + struct pi_i2c_conf i2c_conf; + pi_i2c_conf_init(&i2c_conf); + i2c_conf.itf = conf->i2c_itf; + i2c_conf.max_baudrate = 100000; + pi_i2c_conf_set_slave_addr(&i2c_conf, conf->i2c_addr, 0); + + pi_open_from_conf(&ads1014->i2c_device, &i2c_conf); + + if (PI_OK != pi_i2c_open(&ads1014->i2c_device)) { + pi_l2_free(device->data, 
sizeof(ads1014_data_t)); + return PI_ERR_INVALID_STATE; + } + } + + /* ADC configuration */ + int status = __ads1014_update_configuration(device); + if (status != PI_OK) { + pi_i2c_close(&ads1014->i2c_device); + pi_l2_free(device->data, sizeof(ads1014_data_t)); + return PI_ERR_INVALID_STATE; + } + + return PI_OK; +} + +void pi_ads1014_close(pi_device_t *device) { + if (NULL == device || NULL == device->data) { + return; + } + + ads1014_data_t *data = (ads1014_data_t*) device->data; + + /* close devices */ + pi_i2c_close(&data->i2c_device); + + /* free memory */ + pi_l2_free(data, sizeof(ads1014_data_t)); + device->data = NULL; +} + +int pi_ads1014_read(pi_device_t *device, float *value) { + if (NULL == device || NULL == device->data || NULL == value) { + return PI_ERR_INVALID_ARG; + } + + ads1014_data_t *data = (ads1014_data_t*) device->data; + + int status = PI_OK; + + // if ADC is in continuous mode, no need to trigger a conversion. + // else we need to trigger for a single shot measurement + if (data->adc_conf.reg.operating_mode == ADS1014_OPERATING_MODE_SINGLE_SHOT) { + status = __ads1014_update_configuration(device); + if (PI_OK != status) { + return PI_ERR_INVALID_STATE; + } + + //TODO wait for the end of the conversion ?
+ } + + uint8_t write_payload = ADS1014_REGISTER_VALUE; + ads1014_register_value_t result; + + status = pi_i2c_write(&data->i2c_device, &write_payload, 1, + PI_I2C_XFER_START | PI_I2C_XFER_STOP); + + if (status != PI_OK) { + return PI_ERR_INVALID_STATE; + } + + status = pi_i2c_read(&data->i2c_device, (uint8_t*) &result, 2, + PI_I2C_XFER_START | PI_I2C_XFER_STOP); + + if (status != PI_OK) { + return PI_ERR_INVALID_STATE; + } + + result.value= (result.value << 8) | (result.value >> 8); + + /* conversion depending on the pga */ + switch(data->adc_conf.reg.pga) { + case ADS1014_PGA_FSR_6V144: + *value = result.reg.value * 3; + break; + case ADS1014_PGA_FSR_4V096: + *value = result.reg.value * 2; + break; + case ADS1014_PGA_FSR_2V048: + *value = result.reg.value * 1; + break; + case ADS1014_PGA_FSR_1V024: + *value = result.reg.value * 0.5; + break; + case ADS1014_PGA_FSR_0V512: + *value = result.reg.value * 0.25; + break; + case ADS1014_PGA_FSR_0V256: /* fallthrough */ + default: + *value = result.reg.value * 0.125; + break; + } + + return PI_OK; +} + +int pi_ads1014_set_comparator_thresholds(pi_device_t *device, + float threshold_low, float threshold_high) +{ + if (NULL == device || NULL == device->data) { + return PI_ERR_INVALID_ARG; + } + + ads1014_data_t *data = (ads1014_data_t*) device->data; + + ads1014_register_value_t low_th_reg; + ads1014_register_value_t high_th_reg; + + /* Convert threshold to 12bits */ + { + /* return an error if they do not fit inside 12 bits */ + enum ads1014_pga pga = data->adc_conf.reg.pga; + + if (PI_OK != __convert_float_to_register_value(pga, threshold_low, + &low_th_reg)) { + return PI_ERR_INVALID_STATE; + } + + if (PI_OK != __convert_float_to_register_value(pga, threshold_high, + &high_th_reg)) { + return PI_ERR_INVALID_STATE; + } + } + + /* write values to registers */ + int status = __write_comparator_register(device, + ADS1014_REGISTER_THRESHOLD_LOW, low_th_reg); + if (PI_OK != status) { + return status; + } + + status = 
__write_comparator_register(device, + ADS1014_REGISTER_THRESHOLD_HIGH, high_th_reg); + if (PI_OK != status) { + return status; + } + + return PI_OK; +} diff --git a/rtos/pmsis/pmsis_bsp/audio/adc/tlv320.c b/rtos/pmsis/pmsis_bsp/audio/adc/tlv320.c new file mode 100644 index 000000000..1f471ece0 --- /dev/null +++ b/rtos/pmsis/pmsis_bsp/audio/adc/tlv320.c @@ -0,0 +1,180 @@ +/* + * Copyright (C) 2018 GreenWaves Technologies + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Authors: Germain Haugou, GreenWaves Technologies (germain.haugou@greenwaves-technologies.com) + */ + +#include "pmsis.h" +#include "bsp/bsp.h" + + +typedef struct +{ + struct pi_device i2c; +} tlv320_t; + + +static int __pi_tlv320_reg_write(pi_device_t *dev, uint8_t addr, uint8_t value) +{ + uint8_t buffer[2] = { addr, value }; + if (pi_i2c_write(dev, buffer, 2, PI_I2C_XFER_START | PI_I2C_XFER_STOP)) + { + return -1; + } + return 0; +} + + +static uint8_t __pi_tlv320_reg_read(pi_device_t *dev, uint8_t addr) +{ + uint8_t result; + pi_i2c_write_read(dev, &addr, &result, 1, 1); + return result; +} + + +int pi_tlv320_open(struct pi_device *device) +{ + struct pi_tlv320_conf *conf = (struct pi_tlv320_conf *)device->config; + + tlv320_t *tlv320 = (tlv320_t *)pmsis_l2_malloc(sizeof(tlv320_t)); + if (tlv320 == NULL) + { + return -1; + } + + device->data = (void *)tlv320; + + if (bsp_tlv320_open(conf)) + { + goto error; + } + + struct pi_i2c_conf i2c_conf; + pi_i2c_conf_init(&i2c_conf); + 
i2c_conf.itf = conf->i2c_itf; + i2c_conf.max_baudrate = 100000; + pi_i2c_conf_set_slave_addr(&i2c_conf, 0x98, 0); + + pi_open_from_conf(&tlv320->i2c, &i2c_conf); + if (pi_i2c_open(&tlv320->i2c)) goto error; + + //tlv320 write i2c config (and read for debug) + + uint8_t expected , read; + + // Select page 0 (configuration register page) + expected = 0x00; + __pi_tlv320_reg_write(&tlv320->i2c, 0x00, expected); + read = __pi_tlv320_reg_read(&tlv320->i2c, 0x00); + printf("Read page 0x%x \n",read); + if (read == expected) + printf("Page ok \n"); + else printf("Page failed \n",read); + + // Wake-up device by I2C write into P0_R2 using internal AREG + __pi_tlv320_reg_write(&tlv320->i2c, 0x02, 0x81); + expected = 0x81; + __pi_tlv320_reg_write(&tlv320->i2c, 0x02, expected); + read = __pi_tlv320_reg_read(&tlv320->i2c, 0x02); + if (read == expected) + printf("Wake up ok \n"); + else printf("Wake up failed \n",read); + + //wait at least 1ms to complete init + pi_time_wait_us(2000); + + // Enable Input Ch-1,2,3,4 by I2C write into P0_R115 + //__pi_tlv320_reg_write(&tlv320->i2c, 0x73, 0xF0); + //only channel 2 + expected = 0x40; + __pi_tlv320_reg_write(&tlv320->i2c, 0x73, expected); + read = __pi_tlv320_reg_read(&tlv320->i2c, 0x73); + if (read == expected) + printf("Enable channel 2 ok \n"); + else printf("Enable channel 2 failed \n",read); + + // Enable ASI Output Ch-1,2,3,4 slot by I2C write into P0_R116 + //__pi_tlv320_reg_write(&tlv320->i2c, 0x74, 0xF0); + //only channel 2 + expected = 0x40; + __pi_tlv320_reg_write(&tlv320->i2c, 0x74, expected); + read = __pi_tlv320_reg_read(&tlv320->i2c, 0x74); + if (read == expected) + printf("Enable ASI out channel 2 ok \n"); + else printf("Enable ASI out channel 2 failed \n",read); + + // Power-up ADC, MICBIAS and PLL by I2C write into P0_R117 + //__pi_tlv320_reg_write(&tlv320->i2c, 0x75, 0xe0); + //power up only ADC and PLL + expected = 0x60; + __pi_tlv320_reg_write(&tlv320->i2c, 0x75, expected); + read = 
__pi_tlv320_reg_read(&tlv320->i2c, 0x75); + if (read == expected) + printf("Power up ADC and PLL ok \n"); + else printf("Power Up ADC and PLL failed \n",read); + + + // TDM 32bits mode + //__pi_tlv320_reg_write(&tlv320->i2c, 0x7, 0x30); + expected = 0x30; + __pi_tlv320_reg_write(&tlv320->i2c, 0x7, expected); + read = __pi_tlv320_reg_read(&tlv320->i2c, 0x7); + if (read == expected) + printf("TDM 32 bit ok \n"); + else printf("TDM 32 bit failed \n",read); + + // TX OFFSET. set to 1 to match our ws_delay of 1, but not sure it is the right value -- seems to work + expected = 0x1; + __pi_tlv320_reg_write(&tlv320->i2c, 0x8, expected); + read = __pi_tlv320_reg_read(&tlv320->i2c, 0x8); + if (read == expected) + printf("Tx offset 1 ok \n"); + else printf("Tx offset 1 failed \n",read); + + + + //configure AC single - ended + //channel1 + //__pi_tlv320_reg_write(&tlv320->i2c, 0x3C, 0x20); + //channel2 + __pi_tlv320_reg_write(&tlv320->i2c, 0x41, 0x20); + //channel3 + //__pi_tlv320_reg_write(&tlv320->i2c, 0x46, 0x20); + //channel4 + //__pi_tlv320_reg_write(&tlv320->i2c, 0x4B, 0x20); + + return 0; + +error: + pmsis_l2_malloc_free(tlv320, sizeof(tlv320_t)); + return -2; +} + + +void pi_tlv320_close(struct pi_device *device) +{ + tlv320_t *tlv320 = (tlv320_t *)device->data; + pmsis_l2_malloc_free(tlv320, sizeof(tlv320_t)); +} + + +void pi_tlv320_conf_init(struct pi_tlv320_conf *conf) +{ + bsp_tlv320_conf_init(conf); +} + diff --git a/rtos/pmsis/pmsis_bsp/audio/dac/ak4332.c b/rtos/pmsis/pmsis_bsp/audio/dac/ak4332.c new file mode 100644 index 000000000..120457112 --- /dev/null +++ b/rtos/pmsis/pmsis_bsp/audio/dac/ak4332.c @@ -0,0 +1,183 @@ +/* + * Copyright (C) 2018 GreenWaves Technologies + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Authors: Germain Haugou, GreenWaves Technologies (germain.haugou@greenwaves-technologies.com) + */ + +#include "pmsis.h" +#include "bsp/bsp.h" + + +typedef struct +{ + struct pi_device i2c; +} ak4332_t; + + +static int __pi_ak4332_reg_write(pi_device_t *dev, uint8_t addr, uint8_t value) +{ + uint8_t buffer[2] = { addr, value }; + if (pi_i2c_write(dev, buffer, 2, PI_I2C_XFER_START | PI_I2C_XFER_STOP)) + { + return -1; + } + return 0; +} + + +static uint8_t __pi_ak4332_reg_read(pi_device_t *dev, uint8_t addr) +{ + uint8_t result; + pi_i2c_write_read(dev, &addr, &result, 1, 1); + return result; +} + + +int pi_ak4332_open(struct pi_device *device) +{ + struct pi_ak4332_conf *conf = (struct pi_ak4332_conf *)device->config; + + ak4332_t *ak4332 = (ak4332_t *)pmsis_l2_malloc(sizeof(ak4332_t)); + if (ak4332 == NULL) + { + return -1; + } + + device->data = (void *)ak4332; + + if (bsp_ak4332_open(conf)) + { + goto error; + } + + struct pi_i2c_conf i2c_conf; + pi_i2c_conf_init(&i2c_conf); + i2c_conf.itf = conf->i2c_itf; + i2c_conf.max_baudrate = 100000; + pi_i2c_conf_set_slave_addr(&i2c_conf, 0x20, 0); + + pi_open_from_conf(&ak4332->i2c, &i2c_conf); + if (pi_i2c_open(&ak4332->i2c)) goto error; + + // DAC initial settings + __pi_ak4332_reg_write(&ak4332->i2c, 0x26, 0x02); + + __pi_ak4332_reg_write(&ak4332->i2c, 0x27, 0xC0); + + // Select left channel + __pi_ak4332_reg_write(&ak4332->i2c, 0x07, 0x1); + + // Set 32bits samples + __pi_ak4332_reg_write(&ak4332->i2c, 0x15, 0b110); + + // Set 48KHz sampling rate + // and CM to 512 + 
__pi_ak4332_reg_write(&ak4332->i2c, 0x5, 0x2A); + + //Set HP Gain to 0 + __pi_ak4332_reg_write(&ak4332->i2c, 0x0d, 0b101); + //Set DAC volume to max + __pi_ak4332_reg_write(&ak4332->i2c, 0x0b, 0x1F); + + // Configure PLL to take BLCK as input clock + __pi_ak4332_reg_write(&ak4332->i2c, 0x0E, 0x1); + + // Configure DAC to take PLL as input clock + __pi_ak4332_reg_write(&ak4332->i2c, 0x13, 0x1); + + + int pld = 3; + __pi_ak4332_reg_write(&ak4332->i2c, 0x0F, pld >> 8); + __pi_ak4332_reg_write(&ak4332->i2c, 0x10, pld & 0xff); + int plm = 31; + __pi_ak4332_reg_write(&ak4332->i2c, 0x11, plm >> 8); + __pi_ak4332_reg_write(&ak4332->i2c, 0x12, plm & 0xff); + + // set volume to max + //__pi_ak4332_reg_write(&ak4332->i2c, 0x0b, 0x1f); + //__pi_ak4332_reg_write(&ak4332->i2c, 0x0d, 0x7); + + // Power-up PLL + __pi_ak4332_reg_write(&ak4332->i2c, 0x00, 0x1); + + pi_time_wait_us(20000); + + // Power-up PMTIM + __pi_ak4332_reg_write(&ak4332->i2c, 0x00, 0x3); + + // Power-up charge pump for both channels + __pi_ak4332_reg_write(&ak4332->i2c, 0x01, 0x1); + + pi_time_wait_us(65000); + + // Power-up LDO1 + __pi_ak4332_reg_write(&ak4332->i2c, 0x01, 0x31); + + pi_time_wait_us(5000); + + // Power up charge pump 2 + __pi_ak4332_reg_write(&ak4332->i2c, 0x01, 0x33); + + // Power-up DAC + __pi_ak4332_reg_write(&ak4332->i2c, 0x02, 0x1); + + // Power-up Amplifier + __pi_ak4332_reg_write(&ak4332->i2c, 0x03, 0x1); + + return 0; + +error: + pmsis_l2_malloc_free(ak4332, sizeof(ak4332_t)); + return -2; +} + + +int pi_ak4332_set_dac_volume(pi_device_t *device, uint8_t volume) +{ + ak4332_t *ak4332 = (ak4332_t *)device->data; + if (volume > 0x1F) + { + return -1; + } + + return __pi_ak4332_reg_write(&ak4332->i2c, 0x0b, volume); +} + +int pi_ak4332_set_hp_volume(pi_device_t *device, uint8_t volume) +{ + ak4332_t *ak4332 = (ak4332_t *)device->data; + if (volume > 0x7) + { + return -1; + } + + return __pi_ak4332_reg_write(&ak4332->i2c, 0x0d, volume); +} + + +void pi_ak4332_close(struct pi_device *device) 
+{ + ak4332_t *ak4332 = (ak4332_t *)device->data; + pmsis_l2_malloc_free(ak4332, sizeof(ak4332_t)); +} + + +void pi_ak4332_conf_init(struct pi_ak4332_conf *conf) +{ + bsp_ak4332_conf_init(conf); +} + diff --git a/rtos/pmsis/pmsis_bsp/bsp/gap9_evk.c b/rtos/pmsis/pmsis_bsp/bsp/gap9_evk.c index 67df81b68..124ceff6f 100644 --- a/rtos/pmsis/pmsis_bsp/bsp/gap9_evk.c +++ b/rtos/pmsis/pmsis_bsp/bsp/gap9_evk.c @@ -17,7 +17,6 @@ #include "pmsis.h" #include "bsp/bsp.h" -#include "bsp/gap9_v2.h" #include "bsp/camera/himax.h" #include "bsp/flash/hyperflash.h" #include "bsp/ram/hyperram.h" @@ -35,6 +34,19 @@ static void __bsp_init_pads() } } +void bsp_aps25xxxn_conf_init(struct pi_aps25xxxn_conf *conf) +{ + conf->ram_start = CONFIG_APS25XXXN_START; + conf->ram_size = CONFIG_APS25XXXN_SIZE; + conf->spi_itf = CONFIG_APS25XXXN_SPI_ITF; + conf->spi_cs = CONFIG_APS25XXXN_SPI_CS; +} + +int bsp_aps25xxxn_open(struct pi_aps25xxxn_conf *conf) +{ + return 0; +} + int bsp_24xx1025_open(struct pi_24xx1025_conf *conf) { @@ -64,93 +76,20 @@ void bsp_virtual_eeprom_conf_init(struct pi_virtual_eeprom_conf *conf) conf->i2c_itf = CONFIG_VIRTUAL_EEPROM_I2C_ITF; } -void bsp_hyperram_conf_init(struct pi_hyperram_conf *conf) -{ - conf->ram_start = CONFIG_HYPERRAM_START; - conf->ram_size = CONFIG_HYPERRAM_SIZE; - conf->skip_pads_config = 0; - conf->hyper_itf = CONFIG_HYPERRAM_HYPER_ITF; - conf->hyper_cs = CONFIG_HYPERRAM_HYPER_CS; -} - - -int bsp_hyperram_open(struct pi_hyperram_conf *conf) -{ - __bsp_init_pads(); - return 0; -} - - -void bsp_spiram_conf_init(struct pi_spiram_conf *conf) -{ - conf->ram_start = CONFIG_SPIRAM_START; - conf->ram_size = CONFIG_SPIRAM_SIZE; - conf->skip_pads_config = 0; - conf->spi_itf = CONFIG_SPIRAM_SPI_ITF; - conf->spi_cs = CONFIG_SPIRAM_SPI_CS; -} -int bsp_spiram_open(struct pi_spiram_conf *conf) +void bsp_mx25u51245g_conf_init(struct pi_mx25u51245g_conf *conf) { - return 0; -} - - -void bsp_aps25xxxn_conf_init(struct pi_aps25xxxn_conf *conf) -{ - conf->ram_start = 
CONFIG_APS25XXXN_START; - conf->ram_size = CONFIG_APS25XXXN_SIZE; - conf->spi_itf = CONFIG_APS25XXXN_SPI_ITF; - conf->spi_cs = CONFIG_APS25XXXN_SPI_CS; -} - -int bsp_aps25xxxn_open(struct pi_aps25xxxn_conf *conf) -{ - return 0; -} - - -void bsp_atxp032_conf_init(struct pi_atxp032_conf *conf) -{ - conf->spi_itf = CONFIG_ATXP032_SPI_ITF; - conf->spi_cs = CONFIG_ATXP032_SPI_CS; + conf->spi_itf = CONFIG_MX25U51245G_SPI_ITF; + conf->spi_cs = CONFIG_MX25U51245G_SPI_CS; conf->baudrate = 200000000; } -int bsp_atxp032_open(struct pi_atxp032_conf *conf) +int bsp_mx25u51245g_open(struct pi_mx25u51245g_conf *conf) { return 0; } -void bsp_spiflash_conf_init(struct pi_spiflash_conf *conf) -{ - conf->size = CONFIG_SPIFLASH_SIZE; - // sector size is in number of KB - conf->sector_size = CONFIG_SPIFLASH_SECTOR_SIZE; - conf->spi_itf = CONFIG_SPIFLASH_SPI_ITF; - conf->spi_cs = CONFIG_SPIFLASH_SPI_CS; -} - -int bsp_spiflash_open(struct pi_spiflash_conf *conf) -{ - return 0; -} - - -void bsp_hyperflash_conf_init(struct pi_hyperflash_conf *conf) -{ - conf->hyper_itf = CONFIG_HYPERFLASH_HYPER_ITF; - conf->hyper_cs = CONFIG_HYPERFLASH_HYPER_CS; -} - -int bsp_hyperflash_open(struct pi_hyperflash_conf *conf) -{ - __bsp_init_pads(); - return 0; -} - - void bsp_himax_conf_init(struct pi_himax_conf *conf) { @@ -164,22 +103,6 @@ int bsp_himax_open(struct pi_himax_conf *conf) return 0; } -void bsp_nina_b112_conf_init(struct pi_nina_b112_conf *conf) -{ - conf->uart_itf = (uint8_t) CONFIG_NINA_B112_UART_ID; -} - -int bsp_nina_b112_open(struct pi_nina_b112_conf *conf) -{ - return 0; -} - -int bsp_nina_b112_open_old() -{ - __bsp_init_pads(); - return 0; -} - void bsp_init() { } @@ -191,8 +114,25 @@ void pi_bsp_init_profile(int profile) +// This function is automatically called by the OS during init void pi_bsp_init() { + // Set the pads alternate so that we have by default flash/ram and uart + // working. + // Flash and ram are on hyperbus0, pads 0 to 12 included. 
+ // Uart is on pad 44 and 45 +#ifdef __FREERTOS__ + // TODO freertos is setting everything to 0 by default, keep it for now to not break everything + uint32_t pad_values[] = { 0, 0, 0, 0, 0, 0 }; +#else + uint32_t pad_values[] = { 0x54000000, 0x55555555, 0x50555555, 0x55555555, 0x55555555, 0x55555555 }; +#endif + pi_pad_init(pad_values); + + // Since pad 44 and 45 are for i2c, we need to configure the pad mux + pi_pad_set_mux_group(PI_PAD_044, PI_PAD_MUX_GROUP_UART1_RX); + pi_pad_set_mux_group(PI_PAD_045, PI_PAD_MUX_GROUP_UART1_TX); + pi_bsp_init_profile(PI_BSP_PROFILE_DEFAULT); #if defined(CONFIG_GAP9_EVK_AUDIO_ADDON) diff --git a/rtos/pmsis/pmsis_bsp/bsp/gap9_evk_audio_addon.c b/rtos/pmsis/pmsis_bsp/bsp/gap9_evk_audio_addon.c index c7220d13f..cffee3e17 100644 --- a/rtos/pmsis/pmsis_bsp/bsp/gap9_evk_audio_addon.c +++ b/rtos/pmsis/pmsis_bsp/bsp/gap9_evk_audio_addon.c @@ -17,119 +17,127 @@ #include "pmsis.h" #include "bsp/bsp.h" -#include "bsp/gap9_v2.h" #include "bsp/boards/gap9_evk/audio_addon.h" -static pi_device_t __bsp_fxl6408_i2c; -static PI_FC_TINY uint8_t __bsp_fxl6408_is_init; -static PI_FC_TINY uint32_t __bsp_fxl6408_output_state; // Keep as 32bits to lower code footprint - -void bsp_ak4332_conf_init(struct pi_ak4332_conf *conf) -{ - conf->i2c_itf = CONFIG_AK4332_I2C_ITF; - conf->i2s_itf = CONFIG_AK4332_I2S_ITF; -} - - -int bsp_ak4332_open(struct pi_ak4332_conf *conf) -{ - return 0; -} +#if defined(CONFIG_FXL6408) +static pi_device_t __pi_bsp_fxl6408; +#if !defined(__FREERTOS__) +static PI_FC_TINY uint8_t __pi_bsp_fxl6408_is_init; +#else +static uint8_t __pi_bsp_fxl6408_is_init; +#endif -static void __bsp_fxl6408_write_reg(uint8_t addr, uint8_t value) +void bsp_fxl6408_conf_init(struct pi_fxl6408_conf *conf) { - uint8_t buffer[2] = { addr, value }; - pi_i2c_write(&__bsp_fxl6408_i2c, buffer, 2, PI_I2C_XFER_START | PI_I2C_XFER_STOP); + pi_fxl6408_conf_init(conf); + conf->i2c_itf = CONFIG_FXL6408UMX_I2C_ITF; + conf->interrupt_pin = PI_PAD_089; } -static 
uint8_t __bsp_fxl6408_read_reg(uint8_t addr) +static __attribute__((noinline)) int __pi_bsp_fxl6408_do_init() { - uint8_t result; - pi_i2c_write_read(&__bsp_fxl6408_i2c, &addr, &result, 1, 1); - return result; -} - - -static __attribute__((noinline)) int __bsp_fxl6408_do_init() -{ - struct pi_i2c_conf conf; - pi_i2c_conf_init(&conf); - conf.itf = CONFIG_FXL6408UMX_I2C_ITF; - pi_i2c_conf_set_slave_addr(&conf, CONFIG_FXL6408UMX_I2C_ADDR, 0); - - pi_open_from_conf(&__bsp_fxl6408_i2c, &conf); - if (pi_i2c_open(&__bsp_fxl6408_i2c)) + struct pi_fxl6408_conf conf; + pi_fxl6408_conf_init(&conf); + pi_open_from_conf(&__pi_bsp_fxl6408, &conf); + if (pi_fxl6408_open(&__pi_bsp_fxl6408)) { return - 1; } - // Configure GPIO direction (output) for dac and adc - __bsp_fxl6408_write_reg(0x03, (1 << CONFIG_FXL6408UMX_AK4332_GPIO) | (1 << CONFIG_FXL6408UMX_TLV320_GPIO)); - - __bsp_fxl6408_output_state = 0; - __bsp_fxl6408_is_init = 1; + __pi_bsp_fxl6408_is_init = 1; return 0; } - -static inline __attribute__((always_inline)) int __bsp_fxl6408_check_init() +static inline __attribute__((always_inline)) int __pi_bsp_fxl6408_check_init() { - if (!__bsp_fxl6408_is_init) + if (!__pi_bsp_fxl6408_is_init) { - return __bsp_fxl6408_do_init(); + return __pi_bsp_fxl6408_do_init(); } return 0; } +#endif -static void __bsp_fxl6408_gpio_output_state(unsigned int gpio, int state) +#if defined(CONFIG_AK4332) +void bsp_ak4332_conf_init(struct pi_ak4332_conf *conf) { - __bsp_fxl6408_output_state = __BITINSERT_R(__bsp_fxl6408_output_state, state, 1, gpio); - - __bsp_fxl6408_write_reg(0x05, __bsp_fxl6408_output_state); + conf->i2c_itf = CONFIG_AK4332_I2C_ITF; } - -void __bsp_audio_addon_init() +int bsp_ak4332_open(struct pi_ak4332_conf *conf) { - // Initialize this global state here to work well with retentive wake-up - __bsp_fxl6408_is_init = 0; + if (__pi_bsp_fxl6408_check_init()) + { + return -1; + } - // Configure padframe for I2C1 - // TODO should use real pad names when available - 
pi_pad_set_function(PI_PAD_042, 0); - pi_pad_set_function(PI_PAD_043, 0); + pi_fxl6408_gpio_conf_t gpio_conf; + pi_fxl6408_gpio_conf_init(&gpio_conf); + gpio_conf.id = CONFIG_FXL6408UMX_AK4332_GPIO; + gpio_conf.direction = FXL6408_GPIO_DIR_OUTPUT; + gpio_conf.output_state = FXL6408_GPIO_OUTPUT_STATE_HIGH; + + if (PI_OK != pi_fxl6408_gpio_set(&__pi_bsp_fxl6408, &gpio_conf)) + { + return -1; + } + + // Wait at least 1ms after ak4332 power-up + // TODO took 2x as margin, check we can switch back to 1 + pi_time_wait_us(2000); + + return 0; } +#endif -uint8_t pi_bsp_fxl6408_read_id() +#if defined(CONFIG_TLV320) +void bsp_tlv320_conf_init(struct pi_tlv320_conf *conf) { - if (__bsp_fxl6408_check_init()) return 0; - - return __bsp_fxl6408_read_reg(0x01); + conf->i2c_itf = CONFIG_TLV320_I2C_ITF; } - -int pi_bsp_ak4332_power_ctrl(int power_enable) +int bsp_tlv320_open(struct pi_tlv320_conf *conf) { - if (__bsp_fxl6408_check_init()) return -1; + if (__pi_bsp_fxl6408_check_init()) + { + return -1; + } - __bsp_fxl6408_gpio_output_state(CONFIG_FXL6408UMX_AK4332_GPIO, power_enable); + pi_fxl6408_gpio_conf_t gpio_conf; + pi_fxl6408_gpio_conf_init(&gpio_conf); + gpio_conf.id = CONFIG_FXL6408UMX_TLV320_GPIO; + gpio_conf.direction = FXL6408_GPIO_DIR_OUTPUT; + gpio_conf.output_state = FXL6408_GPIO_OUTPUT_STATE_HIGH; + + if (PI_OK != pi_fxl6408_gpio_set(&__pi_bsp_fxl6408, &gpio_conf)) + { + return -1; + } return 0; } +#endif -int pi_bsp_tlv320_power_ctrl(int power_enable) +void __bsp_audio_addon_init() { - if (__bsp_fxl6408_check_init()) return -1; +#if defined(CONFIG_FXL6408) + // Initialize this global state here to work well with retentive wake-up + __pi_bsp_fxl6408_is_init = 0; +#endif - __bsp_fxl6408_gpio_output_state(CONFIG_FXL6408UMX_TLV320_GPIO, power_enable); + // Configure padframe for I2C1 + // TODO should use real pad names when available + pi_pad_set_function(PI_PAD_042, PI_PAD_FUNC0); + pi_pad_set_function(PI_PAD_043, PI_PAD_FUNC0); - return 0; + // configure gpio for 
gpio expander interrupt + pi_pad_set_function(PI_PAD_089, PI_PAD_FUNC1); } diff --git a/rtos/pmsis/pmsis_bsp/docs/adc.rst b/rtos/pmsis/pmsis_bsp/docs/adc.rst new file mode 100644 index 000000000..e033027ad --- /dev/null +++ b/rtos/pmsis/pmsis_bsp/docs/adc.rst @@ -0,0 +1,10 @@ +ADC +--- + +ADS1014 +""""""" + +.. doxygengroup:: ADS1014 + :members: + :private-members: + :protected-members: diff --git a/rtos/pmsis/pmsis_bsp/docs/gpio.rst b/rtos/pmsis/pmsis_bsp/docs/gpio.rst new file mode 100644 index 000000000..c56fd18e7 --- /dev/null +++ b/rtos/pmsis/pmsis_bsp/docs/gpio.rst @@ -0,0 +1,10 @@ +GPIO +---- + +FXL6408 +""""""" + +.. doxygengroup:: FXL6408 + :members: + :private-members: + :protected-members: diff --git a/rtos/pmsis/pmsis_bsp/docs/index.rst b/rtos/pmsis/pmsis_bsp/docs/index.rst index 0db979642..2be0a4424 100644 --- a/rtos/pmsis/pmsis_bsp/docs/index.rst +++ b/rtos/pmsis/pmsis_bsp/docs/index.rst @@ -38,9 +38,11 @@ Drivers .. toctree:: :maxdepth: 2 - ram.rst - flash.rst - camera.rst - fs.rst + adc.rst ble.rst + camera.rst display.rst + flash.rst + fs.rst + gpio.rst + ram.rst diff --git a/rtos/pmsis/pmsis_bsp/flash/mram/mram-v2.c b/rtos/pmsis/pmsis_bsp/flash/mram/mram-v2.c index 2df49b820..6fb99e928 100644 --- a/rtos/pmsis/pmsis_bsp/flash/mram/mram-v2.c +++ b/rtos/pmsis/pmsis_bsp/flash/mram/mram-v2.c @@ -230,11 +230,7 @@ static void pos_mram_handle_pending_tasks(void *arg) } -#ifndef PMSIS_DRIVERS -PI_LOCAL_CODE static void pos_mram_handle_event(int event, void *arg) -#else -static void pos_mram_handle_event(void *arg) -#endif +PI_LOCAL_CODE static void pos_mram_handle_event(uint32_t event, void *arg) { pi_device_t *dev = (pi_device_t *)arg; pos_mram_t *mram = (pos_mram_t *)(pos_mram_t *)dev->data; diff --git a/rtos/pmsis/pmsis_bsp/flash/spiflash/atxp032.c b/rtos/pmsis/pmsis_bsp/flash/spiflash/atxp032.c index de792089f..5b3a14628 100644 --- a/rtos/pmsis/pmsis_bsp/flash/spiflash/atxp032.c +++ b/rtos/pmsis/pmsis_bsp/flash/spiflash/atxp032.c @@ -61,7 +61,7 @@ 
#define ATXP032_PROGRAM_FLAGS (PI_OCTOSPI_FLAG_CMD_SIZE_1 | PI_OCTOSPI_FLAG_ADDR_SIZE_4 | PI_OCTOSPI_FLAG_LINE_OCTO | PI_OCTOSPI_FLAG_CMD_STR | PI_OCTOSPI_FLAG_ADDR_DTR | PI_OCTOSPI_FLAG_DATA_DTR) #define ATXP032_READ_CMD (0x0B | PI_OCTOSPI_CMD_ADDR_EVEN) -#define ATXP032_READ_LATENCY 22 +#define ATXP032_READ_LATENCY 5 #define ATXP032_READ_FLAGS (PI_OCTOSPI_FLAG_CMD_SIZE_1 | PI_OCTOSPI_FLAG_ADDR_SIZE_4 | PI_OCTOSPI_FLAG_LINE_OCTO | PI_OCTOSPI_FLAG_CMD_STR | PI_OCTOSPI_FLAG_ADDR_DTR | PI_OCTOSPI_FLAG_DATA_DTR) #define SECTOR_SIZE (1<<12) @@ -248,7 +248,7 @@ static int atxp032_open(struct pi_device *device) // Activate octospi mode and DTR and unprotect all sectors - uint32_t data = 0x1b880200; + uint32_t data = 0x17880200; pi_octospi_op_conf_t op_ws = { .cmd=ATXP032_WRITE_STATUS_CMD, .latency=ATXP032_WRITE_STATUS_LATENCY_SPI, .flags=ATXP032_WRITE_STATUS_FLAGS_SPI }; pi_octospi_write(&atxp032->octospi_device, 0, &data, 4, &op_ws); @@ -262,7 +262,7 @@ static int atxp032_open(struct pi_device *device) // Activate octospi mode and DTR and unprotect all sectors // Since the UDMA does not support 1 byte address in DDR mode, we pack it into the data - char status_regs[5] = { 0x00, 0x00, 0x02, 0x88, 0x1b }; + char status_regs[5] = { 0x00, 0x00, 0x02, 0x88, 0x17 }; pi_octospi_op_conf_t op_ws = { .cmd=ATXP032_WRITE_STATUS_CMD, .latency=ATXP032_WRITE_STATUS_LATENCY_OCTO, .flags=ATXP032_WRITE_STATUS_FLAGS_OCTO }; pi_octospi_write(&atxp032->octospi_device, 0, status_regs, 5, &op_ws); @@ -271,6 +271,12 @@ static int atxp032_open(struct pi_device *device) // In the spec writing to volatile status register should take 200ns but RTL model take 10us to update it pi_time_wait_us(20); + + if(conf->xip_en) + { + pi_octospi_ioctl(&atxp032->octospi_device, PI_OCTOSPI_IOCTL_SET_XIP_OP, (void *)&atxp032_read_op); + } + return 0; error: diff --git a/rtos/pmsis/pmsis_bsp/flash/spiflash/mx25u51245g.c b/rtos/pmsis/pmsis_bsp/flash/spiflash/mx25u51245g.c new file mode 100644 index 
000000000..44fc4e4ed --- /dev/null +++ b/rtos/pmsis/pmsis_bsp/flash/spiflash/mx25u51245g.c @@ -0,0 +1,900 @@ +/* + * Copyright (C) 2018 GreenWaves Technologies + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Authors: Germain Haugou, GreenWaves Technologies (germain.haugou@greenwaves-technologies.com) + */ + +// Driver for the octo spi flash Adesto MX25U + +#include "pmsis.h" +#include "bsp/bsp.h" +#include "pmsis/drivers/octospi.h" + +#ifndef PI_LOCAL_CODE +#define PI_LOCAL_CODE +#endif + +#if defined(CONFIG_XIP) +// For now we always activate XIP locking since we only have flash which do not support concurrent read and write. +// This has to be deactivated for flash which support it (e.g. 
ATXP064R) +#define MX25U_LOCK_XIP +#endif + +#define MX25U_READ_STATUS_CMD_SPI (0x05 | PI_OCTOSPI_CMD_ADDR_EVEN) +#define MX25U_READ_STATUS_CMD_OCTO (0x05FA | PI_OCTOSPI_CMD_ADDR_EVEN) +#define MX25U_READ_STATUS_LATENCY_SPI 0 +#define MX25U_READ_STATUS_LATENCY_OCTO 4 +#define MX25U_READ_STATUS_FLAGS_SPI (PI_OCTOSPI_FLAG_CMD_SIZE_1 | PI_OCTOSPI_FLAG_ADDR_SIZE_0 | PI_OCTOSPI_FLAG_LINE_SINGLE | PI_OCTOSPI_FLAG_CMD_STR | PI_OCTOSPI_FLAG_ADDR_STR | PI_OCTOSPI_FLAG_DATA_STR) +#define MX25U_READ_STATUS_FLAGS_OCTO (PI_OCTOSPI_FLAG_CMD_SIZE_2 | PI_OCTOSPI_FLAG_ADDR_SIZE_4 | PI_OCTOSPI_FLAG_LINE_OCTO | PI_OCTOSPI_FLAG_CMD_DTR | PI_OCTOSPI_FLAG_ADDR_DTR | PI_OCTOSPI_FLAG_DATA_DTR) + +#define MX25U_WRITE_STATUS_CMD (0x71 | PI_OCTOSPI_CMD_ADDR_EVEN) +#define MX25U_WRITE_STATUS_LATENCY_SPI 0 +#define MX25U_WRITE_STATUS_LATENCY_OCTO 0 +#define MX25U_WRITE_STATUS_FLAGS_SPI (PI_OCTOSPI_FLAG_CMD_SIZE_1 | PI_OCTOSPI_FLAG_ADDR_SIZE_4 | PI_OCTOSPI_FLAG_LINE_SINGLE | PI_OCTOSPI_FLAG_CMD_STR | PI_OCTOSPI_FLAG_ADDR_STR | PI_OCTOSPI_FLAG_DATA_STR) +#define MX25U_WRITE_STATUS_FLAGS_OCTO (PI_OCTOSPI_FLAG_CMD_SIZE_1 | PI_OCTOSPI_FLAG_ADDR_SIZE_0 | PI_OCTOSPI_FLAG_LINE_OCTO | PI_OCTOSPI_FLAG_CMD_STR | PI_OCTOSPI_FLAG_ADDR_DTR | PI_OCTOSPI_FLAG_DATA_DTR) + +#define MX25U_READ_CONFREG_CMD_SPI (0x71 | PI_OCTOSPI_CMD_ADDR_EVEN) +#define MX25U_READ_CONFREG_CMD_OCTO (0x718E | PI_OCTOSPI_CMD_ADDR_EVEN) +#define MX25U_READ_CONFREG_LATENCY_SPI 0 +#define MX25U_READ_CONFREG_LATENCY_OCTO 4 +#define MX25U_READ_CONFREG_FLAGS_SPI (PI_OCTOSPI_FLAG_CMD_SIZE_1 | PI_OCTOSPI_FLAG_ADDR_SIZE_4 | PI_OCTOSPI_FLAG_LINE_SINGLE | PI_OCTOSPI_FLAG_CMD_STR | PI_OCTOSPI_FLAG_ADDR_STR | PI_OCTOSPI_FLAG_DATA_STR) +#define MX25U_READ_CONFREG_FLAGS_OCTO (PI_OCTOSPI_FLAG_CMD_SIZE_2 | PI_OCTOSPI_FLAG_ADDR_SIZE_4 | PI_OCTOSPI_FLAG_LINE_OCTO | PI_OCTOSPI_FLAG_CMD_DTR | PI_OCTOSPI_FLAG_ADDR_DTR | PI_OCTOSPI_FLAG_DATA_DTR) + +#define MX25U_WRITE_CONFREG_CMD (0x72 | PI_OCTOSPI_CMD_ADDR_EVEN) +#define MX25U_WRITE_CONFREG_LATENCY_SPI 0 
+#define MX25U_WRITE_CONFREG_LATENCY_OCTO 0 +#define MX25U_WRITE_CONFREG_FLAGS_SPI (PI_OCTOSPI_FLAG_CMD_SIZE_1 | PI_OCTOSPI_FLAG_ADDR_SIZE_4 | PI_OCTOSPI_FLAG_LINE_SINGLE | PI_OCTOSPI_FLAG_CMD_STR | PI_OCTOSPI_FLAG_ADDR_STR | PI_OCTOSPI_FLAG_DATA_STR) +#define MX25U_WRITE_CONFREG_FLAGS_OCTO (PI_OCTOSPI_FLAG_CMD_SIZE_1 | PI_OCTOSPI_FLAG_ADDR_SIZE_0 | PI_OCTOSPI_FLAG_LINE_OCTO | PI_OCTOSPI_FLAG_CMD_STR | PI_OCTOSPI_FLAG_ADDR_DTR | PI_OCTOSPI_FLAG_DATA_DTR) + +#define MX25U_WRITE_ENABLE_CMD_SPI (0x0006 | PI_OCTOSPI_CMD_ADDR_EVEN) +#define MX25U_WRITE_ENABLE_CMD_OCTO (0x06F9 | PI_OCTOSPI_CMD_ADDR_EVEN) +#define MX25U_WRITE_ENABLE_LATENCY_SPI 0 +#define MX25U_WRITE_ENABLE_LATENCY_OCTO 0 +#define MX25U_WRITE_ENABLE_FLAGS_SPI (PI_OCTOSPI_FLAG_CMD_SIZE_1 | PI_OCTOSPI_FLAG_LINE_SINGLE | PI_OCTOSPI_FLAG_CMD_STR | PI_OCTOSPI_FLAG_DATA_STR) +#define MX25U_WRITE_ENABLE_FLAGS_OCTO (PI_OCTOSPI_FLAG_CMD_SIZE_2 | PI_OCTOSPI_FLAG_LINE_OCTO | PI_OCTOSPI_FLAG_CMD_DTR | PI_OCTOSPI_FLAG_DATA_DTR) + +#define MX25U_ERASE_CMD (0x21DE | PI_OCTOSPI_CMD_ADDR_EVEN) +#define MX25U_ERASE_LATENCY 0 +#define MX25U_ERASE_FLAGS (PI_OCTOSPI_FLAG_CMD_SIZE_2 | PI_OCTOSPI_FLAG_ADDR_SIZE_4 | PI_OCTOSPI_FLAG_LINE_OCTO | PI_OCTOSPI_FLAG_CMD_DTR | PI_OCTOSPI_FLAG_ADDR_DTR | PI_OCTOSPI_FLAG_DATA_DTR) + +#define MX25U_PROGRAM_CMD (0x12ED | PI_OCTOSPI_CMD_ADDR_EVEN) +#define MX25U_PROGRAM_LATENCY 0 +#define MX25U_PROGRAM_FLAGS (PI_OCTOSPI_FLAG_CMD_SIZE_2 | PI_OCTOSPI_FLAG_ADDR_SIZE_4 | PI_OCTOSPI_FLAG_LINE_OCTO | PI_OCTOSPI_FLAG_CMD_DTR | PI_OCTOSPI_FLAG_ADDR_DTR | PI_OCTOSPI_FLAG_DATA_DTR | PI_OCTOSPI_FLAG_DATA_DTR_MSB) + +#define MX25U_READ_CMD (0xEE11 | PI_OCTOSPI_CMD_ADDR_EVEN) +#define MX25U_READ_LATENCY 20 +#define MX25U_READ_FLAGS (PI_OCTOSPI_FLAG_CMD_SIZE_2 | PI_OCTOSPI_FLAG_ADDR_SIZE_4 | PI_OCTOSPI_FLAG_LINE_OCTO | PI_OCTOSPI_FLAG_CMD_DTR | PI_OCTOSPI_FLAG_ADDR_DTR | PI_OCTOSPI_FLAG_DATA_DTR | PI_OCTOSPI_FLAG_DATA_DTR_MSB) + +#define SECTOR_SIZE (1<<12) + +#define STALL_TASK_PROGRAM 0 +#define 
STALL_TASK_ERASE_CHIP 1 +#define STALL_TASK_ERASE_SECTOR 2 +#define STALL_TASK_REG_SET 3 +#define STALL_TASK_REG_GET 4 +#define STALL_TASK_READ 5 +#define STALL_TASK_READ_2D 6 + +typedef struct { + struct pi_device octospi_device; + // Used for communications with mx25u through udma + uint16_t udma_buffer[2]; + + // Waiting queue for common operations (only 1 is handled at the same time) + pi_task_t *waiting_first; + pi_task_t *waiting_last; + + // Task to be enqueued when the on-going operation is done + pi_task_t *pending_task; + + // Waiting queue for erase operation (only 1 is handled at the same time) as it needs a + // second level FSM + pi_task_t *erase_waiting_first; + pi_task_t *erase_waiting_last; + + // Task to be enqueued when the on-going erase operation is done + pi_task_t *erase_task; + + // Task used for internal FSM scheduling for common operations + pi_task_t task; + + // Task used for internal second-level FSM scheduling (for erase operation) + pi_task_t task2; + + // Description of on-going task for common operations. The FSM will keep executing + // until this operation is done + uint32_t pending_octospi_addr; + uint32_t pending_data; + uint32_t pending_size; + + // Description of on-going task for erase operation. 
The FSM will keep executing + // until this operation is done + uint32_t pending_erase_octospi_addr; + uint32_t pending_erase_size; + +} mx25u_t; + + +static pi_octospi_op_conf_t mx25u_erase_op = { .cmd=MX25U_ERASE_CMD, .latency=MX25U_ERASE_LATENCY, .flags=MX25U_ERASE_FLAGS }; + +static pi_octospi_op_conf_t mx25u_program_op = { .cmd=MX25U_PROGRAM_CMD, .latency=MX25U_PROGRAM_LATENCY, .flags=MX25U_PROGRAM_FLAGS }; + +static pi_octospi_op_conf_t mx25u_read_op = { .cmd=MX25U_READ_CMD, .latency=MX25U_READ_LATENCY, .flags=MX25U_READ_FLAGS }; + + + +static void mx25u_program_async(struct pi_device *device, uint32_t octospi_addr, const void *data, uint32_t size, pi_task_t *task); + +static void mx25u_check_program(void *arg); + +static void mx25u_erase_async(struct pi_device *device, uint32_t addr, int size, pi_task_t *task); + +static int mx25u_stall_task(mx25u_t *mx25u, pi_task_t *task, uint32_t id, uint32_t arg0, uint32_t arg1, uint32_t arg2, uint32_t arg3, uint32_t arg4); + +static void mx25u_handle_pending_task(void *arg); + +static void mx25u_erase_chip_async(struct pi_device *device, pi_task_t *task); + +static void mx25u_erase_sector_async(struct pi_device *device, uint32_t addr, pi_task_t *task); + +static void mx25u_set_reg_exec(mx25u_t *mx25u, unsigned int addr, unsigned short value) +{ + mx25u->udma_buffer[0] = value; + //pi_octospi_write(&mx25u->octospi_device, addr, mx25u->udma_buffer, 2); +} + + + +// TODO should be moved to pmsis api +static void pi_task_enqueue(pi_task_t *task) +{ + pi_task_push(task); +} + + + +static unsigned short mx25u_get_reg_exec(mx25u_t *mx25u, unsigned int addr) +{ + //pi_octospi_read(&mx25u->octospi_device, addr, mx25u->udma_buffer, 4); + return mx25u->udma_buffer[0]; +} + + +#ifdef CONFIG_XIP + +PI_LOCAL_CODE static uint32_t mx25u_get_status(mx25u_t *mx25u) +{ + struct pi_task task; + uint32_t data; + pi_octospi_op_conf_t op = { .cmd=MX25U_READ_STATUS_CMD_OCTO, .latency=MX25U_READ_STATUS_LATENCY_OCTO, 
.flags=MX25U_READ_STATUS_FLAGS_OCTO }; + pi_octospi_read_async(&mx25u->octospi_device, 0, &data, 4, &op, pi_task_block(&task)); + pi_task_wait_on_xip(&task); + return data; +} + +#else + +static uint32_t mx25u_get_status(mx25u_t *mx25u) +{ + uint32_t data; + pi_octospi_op_conf_t op = { .cmd=MX25U_READ_STATUS_CMD_OCTO, .latency=MX25U_READ_STATUS_LATENCY_OCTO, .flags=MX25U_READ_STATUS_FLAGS_OCTO }; + pi_octospi_read(&mx25u->octospi_device, 0, &data, 4, &op); + return data; +} + +#endif + + +PI_LOCAL_CODE static int mx25u_is_busy(mx25u_t *mx25u) +{ + uint32_t value = mx25u_get_status(mx25u); + return (value >> 0) & 1; +} + + +static void mx25u_write_enable(mx25u_t *mx25u) +{ + pi_octospi_op_conf_t op = { .cmd=MX25U_WRITE_ENABLE_CMD_OCTO, .latency=MX25U_WRITE_ENABLE_LATENCY_OCTO, .flags=MX25U_WRITE_ENABLE_FLAGS_OCTO }; + int dummy = 0; + pi_octospi_write(&mx25u->octospi_device, 0, &dummy, 0, &op); +} + + + +static int mx25u_open(struct pi_device *device) +{ + struct pi_mx25u51245g_conf *conf = (struct pi_mx25u51245g_conf *)device->config; + + mx25u_t *mx25u = (mx25u_t *)pmsis_l2_malloc(sizeof(mx25u_t)); + if (mx25u == NULL) + { + return -1; + } + + device->data = (void *)mx25u; + + if (bsp_mx25u51245g_open(conf)) + { + goto error; + } + + struct pi_octospi_conf octospi_conf; + pi_octospi_conf_init(&octospi_conf); + + octospi_conf.id = (unsigned char) conf->spi_itf; + octospi_conf.cs = conf->spi_cs; + octospi_conf.type = PI_OCTOSPI_TYPE_FLASH; + octospi_conf.xip_en = conf->xip_en; + octospi_conf.baudrate = conf->baudrate; + + pi_open_from_conf(&mx25u->octospi_device, &octospi_conf); + + int32_t error = pi_octospi_open(&mx25u->octospi_device); + if (error) + { + goto error; + } + + mx25u->pending_task = NULL; + mx25u->waiting_first = NULL; + + mx25u->erase_task = NULL; + mx25u->erase_waiting_first = NULL; + + // Activate DTR octospi mode and DTR + { + pi_octospi_op_conf_t op_we = { .cmd=MX25U_WRITE_ENABLE_CMD_SPI, .latency=MX25U_WRITE_ENABLE_LATENCY_SPI, 
.flags=MX25U_WRITE_ENABLE_FLAGS_SPI }; + pi_octospi_write(&mx25u->octospi_device, 0, NULL, 0, &op_we); + + pi_octospi_op_conf_t op_ws = { .cmd=MX25U_WRITE_CONFREG_CMD, .latency=MX25U_WRITE_CONFREG_LATENCY_SPI, .flags=MX25U_WRITE_CONFREG_FLAGS_SPI }; + uint32_t data = 1 << 1; + pi_octospi_write(&mx25u->octospi_device, 0, &data, 1, &op_ws); + + pi_time_wait_us(60); + } + + return 0; + +error: + pmsis_l2_malloc_free(mx25u, sizeof(mx25u_t)); + return -2; +} + + + +static void mx25u_close(struct pi_device *device) +{ + mx25u_t *mx25u = (mx25u_t *)device->data; + pi_octospi_close(&mx25u->octospi_device); + pmsis_l2_malloc_free(mx25u, sizeof(mx25u_t)); +} + + + +static int32_t mx25u_ioctl(struct pi_device *device, uint32_t cmd, void *arg) +{ + switch (cmd) + { + case PI_FLASH_IOCTL_INFO: + { + struct pi_flash_info *flash_info = (struct pi_flash_info *)arg; + flash_info->sector_size = 1<<18; + // TODO find a way to know what is on the flash, as they may be a boot binary + flash_info->flash_start = flash_info->sector_size; + } + } + return 0; +} + + +void pi_mx25u_deep_sleep_enter(pi_device_t *device) +{ + mx25u_t *mx25u = (mx25u_t *)device->data; + pi_octospi_op_conf_t op_we = { + .cmd=0xB9, + .latency=0, + .flags=PI_OCTOSPI_FLAG_CMD_SIZE_1 | PI_OCTOSPI_FLAG_LINE_OCTO | PI_OCTOSPI_FLAG_CMD_STR | PI_OCTOSPI_FLAG_DATA_DTR + }; + int dummy = 0; + pi_octospi_write(&mx25u->octospi_device, 0, &dummy, 1, &op_we); +} + + +void pi_mx25u_deep_sleep_exit(pi_device_t *device) +{ + mx25u_t *mx25u = (mx25u_t *)device->data; + pi_octospi_op_conf_t op_we = { + .cmd=0xAB, + .latency=0, + .flags=PI_OCTOSPI_FLAG_CMD_SIZE_1 | PI_OCTOSPI_FLAG_LINE_OCTO | PI_OCTOSPI_FLAG_CMD_STR | PI_OCTOSPI_FLAG_DATA_DTR + }; + int dummy = 0; + pi_octospi_write(&mx25u->octospi_device, 0, &dummy, 1, &op_we); +} + + +static void mx25u_reg_set_async(struct pi_device *device, uint32_t addr, uint8_t *value, pi_task_t *task) +{ + mx25u_t *mx25u = (mx25u_t *)device->data; + + if (mx25u_stall_task(mx25u, task, 
STALL_TASK_REG_SET, addr, (uint32_t)value, 0, 0, 0)) + return; + + mx25u_set_reg_exec(mx25u, addr, *(uint16_t *)value); + + mx25u_handle_pending_task(device); +} + + + +static void mx25u_reg_get_async(struct pi_device *device, uint32_t addr, uint8_t *value, pi_task_t *task) +{ + mx25u_t *mx25u = (mx25u_t *)device->data; + + if (mx25u_stall_task(mx25u, task, STALL_TASK_REG_GET, addr, (uint32_t)value, 0, 0, 0)) + return; + + *(uint16_t *)value = mx25u_get_reg_exec(mx25u, addr); + + mx25u_handle_pending_task(device); +} + + + +static void mx25u_read_async(struct pi_device *device, uint32_t addr, void *data, uint32_t size, pi_task_t *task) +{ + mx25u_t *mx25u = (mx25u_t *)device->data; + + if (mx25u_stall_task(mx25u, task, STALL_TASK_READ, addr, (uint32_t)data, size, 0, 0)) + return; + + pi_octospi_read_async(&mx25u->octospi_device, addr, data, size, &mx25u_read_op, pi_task_callback(&mx25u->task, mx25u_handle_pending_task, device)); +} + + + +static void mx25u_read_2d_async(struct pi_device *device, uint32_t addr, void *data, uint32_t size, uint32_t stride, uint32_t length, pi_task_t *task) +{ + mx25u_t *mx25u = (mx25u_t *)device->data; + + if (mx25u_stall_task(mx25u, task, STALL_TASK_READ_2D, addr, (uint32_t)data, size, stride, length)) + { + return; + } + + pi_octospi_read_2d_async(&mx25u->octospi_device, addr, data, size, stride, length, &mx25u_read_op, pi_task_callback(&mx25u->task, mx25u_handle_pending_task, device)); +} + + + +static void mx25u_handle_pending_task(void *arg) +{ + struct pi_device *device = (struct pi_device *)arg; + mx25u_t *mx25u = (mx25u_t *)device->data; + + uint32_t irq = disable_irq(); + + pi_task_enqueue(mx25u->pending_task); + mx25u->pending_task = NULL; + + pi_task_t *task = mx25u->waiting_first; + + if (task) + { + mx25u->waiting_first = task->next; + } + + restore_irq(irq); + + if (task) + { + if (task->data[0] == STALL_TASK_PROGRAM) + { + mx25u_program_async(device, task->data[1], (void *)task->data[2], task->data[3], task); + } + else 
if (task->data[0] == STALL_TASK_ERASE_CHIP) + { + mx25u_erase_chip_async(device, task); + } + else if (task->data[0] == STALL_TASK_ERASE_SECTOR) + { + mx25u_erase_sector_async(device, task->data[1], task); + } + else if (task->data[0] == STALL_TASK_REG_SET) + { + mx25u_reg_set_async(device, task->data[1], (uint8_t *)task->data[2], task); + } + else if (task->data[0] == STALL_TASK_REG_GET) + { + mx25u_reg_get_async(device, task->data[1], (uint8_t *)task->data[2], task); + } + else if (task->data[0] == STALL_TASK_READ) + { + mx25u_read_async(device, task->data[1], (void *)task->data[2], task->data[3], task); + } + else if (task->data[0] == STALL_TASK_READ_2D) + { + mx25u_read_2d_async(device, task->data[1], (void *)task->data[2], task->data[3], task->data[4], task->data[5], task); + } + } +} + + + +static void mx25u_handle_pending_erase_task(void *arg) +{ + struct pi_device *device = (struct pi_device *)arg; + mx25u_t *mx25u = (mx25u_t *)device->data; + + uint32_t irq = disable_irq(); + + pi_task_enqueue(mx25u->erase_task); + mx25u->erase_task = NULL; + + pi_task_t *task = mx25u->erase_waiting_first; + if (task) + { + mx25u->erase_waiting_first = task->next; + } + + restore_irq(irq); + + if (task) + { + mx25u_erase_async(device, task->data[1], task->data[2], task); + } +} + + + +static int mx25u_stall_task(mx25u_t *mx25u, pi_task_t *task, uint32_t id, uint32_t arg0, uint32_t arg1, uint32_t arg2, uint32_t arg3, uint32_t arg4) +{ + uint32_t irq = disable_irq(); + + if (mx25u->pending_task != NULL) + { + task->data[0] = id; + task->data[1] = arg0; + task->data[2] = arg1; + task->data[3] = arg2; + task->data[4] = arg3; + task->data[5] = arg4; + task->next = NULL; + + if (mx25u->waiting_first) + { + mx25u->waiting_last->next = task; + } + else + { + mx25u->waiting_first = task; + } + + mx25u->waiting_last = task; + + restore_irq(irq); + return 1; + } + + mx25u->pending_task = task; + + restore_irq(irq); + return 0; +} + + + +static int mx25u_stall_erase_task(mx25u_t 
*mx25u, pi_task_t *task, uint32_t id, uint32_t arg0, uint32_t arg1, uint32_t arg2) +{ + uint32_t irq = disable_irq(); + + if (mx25u->erase_task != NULL) + { + task->data[0] = id; + task->data[1] = arg0; + task->data[2] = arg1; + task->data[3] = arg2; + task->next = NULL; + + if (mx25u->erase_waiting_first) + { + mx25u->erase_waiting_last->next = task; + } + else + { + mx25u->erase_waiting_first = task; + } + + mx25u->erase_waiting_last = task; + + restore_irq(irq); + return 1; + } + + mx25u->erase_task = task; + + restore_irq(irq); + return 0; +} + + +PI_LOCAL_CODE static void mx25u_program_resume(void *arg) +{ + struct pi_device *device = (struct pi_device *)arg; + mx25u_t *mx25u = (mx25u_t *)device->data; + + if (mx25u->pending_size == 0) + { + mx25u_handle_pending_task(device); + } + else + { + #ifdef MX25U_LOCK_XIP + // When XIP is active and flash does not support concurrent read and write, loop on the program operation until it is done + // Since XIP can not work at the same time. + // On multi-threaded systems, we should also put on hold any request to this driver and resume them after the program operation is done, + // since the octosp drover will let other requests execute between 2 operations, to let other devices being used. + while (mx25u->pending_size > 0) + { + mx25u_write_enable(mx25u); + + unsigned int iter_size = 256 - (mx25u->pending_octospi_addr & 0xff); + if (iter_size > mx25u->pending_size) + iter_size = mx25u->pending_size; + + uint32_t octospi_addr = mx25u->pending_octospi_addr; + uint32_t data = mx25u->pending_data; + + mx25u->pending_octospi_addr += iter_size; + mx25u->pending_data += iter_size; + mx25u->pending_size -= iter_size; + + // In XIP mode, we need to lock XIP refills to avoid having a read while the flash is doing the program operation. 
+ pi_octospi_xip_lock(&mx25u->octospi_device); + + // Even though the operation should be asynchronous, do everything synchronously to avoid XIP refills until the operation is done + struct pi_task task; + pi_octospi_write_async(&mx25u->octospi_device, octospi_addr, (void *)data, iter_size, &mx25u_program_op, pi_task_block(&task)); + pi_task_wait_on_xip(&task); + while (mx25u_is_busy(mx25u)) + { + for (int i=0; i<32768/1000; i++) + { + pos_wait_for_event(1<octospi_device); + } + + mx25u_handle_pending_task(device); + #else + unsigned int iter_size = 256 - (mx25u->pending_octospi_addr & 0xff); + if (iter_size > mx25u->pending_size) + iter_size = mx25u->pending_size; + + uint32_t octospi_addr = mx25u->pending_octospi_addr; + uint32_t data = mx25u->pending_data; + + mx25u->pending_octospi_addr += iter_size; + mx25u->pending_data += iter_size; + mx25u->pending_size -= iter_size; + + mx25u_write_enable(mx25u); + pi_octospi_write_async(&mx25u->octospi_device, octospi_addr, (void *)data, iter_size, &mx25u_program_op, pi_task_callback(&mx25u->task, mx25u_check_program, device)); + #endif + } +} + + + +static void mx25u_check_program(void *arg) +{ + struct pi_device *device = (struct pi_device *)arg; + mx25u_t *mx25u = (mx25u_t *)device->data; + + if (mx25u_is_busy(mx25u)) + { + // Typical buffer programming time is 4ms. 
Note that this could be optimized by taking into account buffer size
+        pi_task_push_delayed_us(pi_task_callback(&mx25u->task, mx25u_check_program, device), 1000);
+    }
+    else
+    {
+        mx25u_program_resume(device);
+    }
+}
+
+
+
+/*
+ * Start a program operation, or queue it when another operation is pending.
+ *
+ * When mx25u_stall_task() reports that the request was queued, return
+ * immediately. Otherwise record the address, source buffer and size as the
+ * pending transfer and hand over to mx25u_program_resume().
+ */
+PI_LOCAL_CODE static void mx25u_program_async(struct pi_device *device, uint32_t octospi_addr, const void *data, uint32_t size, pi_task_t *task)
+{
+    mx25u_t *mx25u = (mx25u_t *)device->data;
+
+    if (mx25u_stall_task(mx25u, task, STALL_TASK_PROGRAM, octospi_addr, (uint32_t)data, size, 0, 0))
+        return;
+
+    mx25u->pending_octospi_addr = octospi_addr;
+    mx25u->pending_data = (uint32_t)data;
+    mx25u->pending_size = size;
+
+    mx25u_program_resume(device);
+}
+
+
+
+
+
+/*
+ * Polling callback for erase operations.
+ *
+ * While mx25u_is_busy() reports busy, re-arm this callback after 10000 us;
+ * once the flash is idle, call mx25u_handle_pending_task().
+ */
+static void mx25u_check_erase(void *arg)
+{
+    struct pi_device *device = (struct pi_device *)arg;
+    mx25u_t *mx25u = (mx25u_t *)device->data;
+
+    if (mx25u_is_busy(mx25u))
+    {
+        // Typical sector erase time is 25ms but keep it short as this time is shorter on some platforms
+        pi_task_push_delayed_us(pi_task_callback(&mx25u->task, mx25u_check_erase, device), 10000);
+    }
+    else
+    {
+        mx25u_handle_pending_task(device);
+    }
+}
+
+
+/*
+ * Chip erase entry point: queue the request if another operation is pending,
+ * otherwise schedule mx25u_check_erase() after 100000 us.
+ */
+static void mx25u_erase_chip_async(struct pi_device *device, pi_task_t *task)
+{
+    mx25u_t *mx25u = (mx25u_t *)device->data;
+
+    if (mx25u_stall_task(mx25u, task, STALL_TASK_ERASE_CHIP, 0, 0, 0, 0, 0))
+        return;
+
+    // NOTE(review): no write-enable and no chip-erase command are sent to the
+    // flash in this function; it only schedules the busy-poll callback.
+    // Confirm whether the command sequence is missing or issued elsewhere.
+
+    pi_task_push_delayed_us(pi_task_callback(&mx25u->task, mx25u_check_erase, device), 100000);
+}
+
+
+/*
+ * Erase the sector containing addr: queue the request if another operation is
+ * pending, otherwise send write-enable followed by the erase operation and
+ * poll for completion through mx25u_check_erase().
+ */
+static void mx25u_erase_sector_async(struct pi_device *device, uint32_t addr, pi_task_t *task)
+{
+    mx25u_t *mx25u = (mx25u_t *)device->data;
+
+    if (mx25u_stall_task(mx25u, task, STALL_TASK_ERASE_SECTOR, addr, 0, 0, 0, 0))
+        return;
+
+    mx25u_write_enable(mx25u);
+
+    // We don't need to send data but UDMA needs at least 1 byte, this will be ignored by the flash
+    // NOTE(review): the size passed below is 0, which contradicts the comment
+    // above; the MX25U_LOCK_XIP path of mx25u_erase_resume() passes 1 for the
+    // same operation. Left unchanged - confirm which value the driver needs.
+    pi_octospi_write_async(&mx25u->octospi_device, addr, mx25u->udma_buffer, 0, &mx25u_erase_op, pi_task_callback(&mx25u->task,
mx25u_check_erase, device)); +} + + + +PI_LOCAL_CODE static void mx25u_erase_resume(void *arg) +{ + struct pi_device *device = (struct pi_device *)arg; + mx25u_t *mx25u = (mx25u_t *)device->data; + + if (mx25u->pending_erase_size == 0) + { + mx25u_handle_pending_erase_task(device); + } + else + { + #ifdef MX25U_LOCK_XIP + // When XIP is active and flash does not support concurrent read and write, loop on the erase operation until it is done + // Since XIP can not work at the same time. + // On multi-threaded systems, we should also put on hold any request to this driver and resume them after the program operation is done, + // since the octosp drover will let other requests execute between 2 operations, to let other devices being used. + while (mx25u->pending_erase_size > 0) + { + mx25u_write_enable(mx25u); + + // In XIP mode, we need to lock XIP refills to avoid having a read while the flash is doing the program operation. + pi_octospi_xip_lock(&mx25u->octospi_device); + + unsigned int iter_size = SECTOR_SIZE - (mx25u->pending_erase_octospi_addr & (SECTOR_SIZE - 1)); + if (iter_size > mx25u->pending_erase_size) + iter_size = mx25u->pending_erase_size; + + uint32_t octospi_addr = mx25u->pending_erase_octospi_addr; + + mx25u->pending_erase_octospi_addr += iter_size; + mx25u->pending_erase_size -= iter_size; + + struct pi_task task; + + // We don't need to send data but UDMA needs at least 1 byte, this will be ignored by the flash + pi_octospi_write_async(&mx25u->octospi_device, octospi_addr, mx25u->udma_buffer, 1, &mx25u_erase_op, pi_task_block(&task)); + pi_task_wait_on_xip(&task); + + while (mx25u_is_busy(mx25u)) + { + for (int i=0; i<32768/100; i++) + { + pos_wait_for_event(1<octospi_device); + } + + mx25u_handle_pending_erase_task(device); + #else + unsigned int iter_size = SECTOR_SIZE - (mx25u->pending_erase_octospi_addr & (SECTOR_SIZE - 1)); + if (iter_size > mx25u->pending_erase_size) + iter_size = mx25u->pending_erase_size; + + uint32_t octospi_addr = 
mx25u->pending_erase_octospi_addr; + mx25u_erase_sector_async(device, octospi_addr, pi_task_callback(&mx25u->task2, mx25u_erase_resume, device)); + + mx25u->pending_erase_octospi_addr += iter_size; + mx25u->pending_erase_size -= iter_size; + #endif + } +} + + + +static void mx25u_erase_async(struct pi_device *device, uint32_t addr, int size, pi_task_t *task) +{ + mx25u_t *mx25u = (mx25u_t *)device->data; + + if (mx25u_stall_erase_task(mx25u, task, 3, addr, size, 0)) + { + return; + } + + mx25u->pending_erase_octospi_addr = addr; + mx25u->pending_erase_size = size; + + mx25u_erase_resume(device); +} + + + +static int mx25u_copy_async(struct pi_device *device, uint32_t flash_addr, void *buffer, uint32_t size, int ext2loc, pi_task_t *task) +{ + if (!ext2loc) + mx25u_program_async(device, flash_addr, buffer, size, task); + else + mx25u_read_async(device, flash_addr, buffer, size, task); + + return 0; +} + + + +static int mx25u_copy_2d_async(struct pi_device *device, uint32_t flash_addr, void *buffer, uint32_t size, uint32_t stride, uint32_t length, int ext2loc, pi_task_t *task) +{ + if (!ext2loc) + return -1; + + mx25u_read_2d_async(device, flash_addr, buffer, size, stride, length, task); + + return 0; +} + +static int mx25u_read(struct pi_device *device, uint32_t pi_flash_addr, void *data, uint32_t size) +{ + pi_task_t task; + mx25u_read_async(device, pi_flash_addr, data, size, pi_task_block(&task)); + pi_task_wait_on(&task); + return 0; +} + + +static int mx25u_program(struct pi_device *device, uint32_t pi_flash_addr, const void *data, uint32_t size) +{ + pi_task_t task; + mx25u_program_async(device, pi_flash_addr, data, size, pi_task_block(&task)); + pi_task_wait_on(&task); + return 0; +} + +static inline int mx25u_erase_chip(struct pi_device *device) +{ + pi_task_t task; + mx25u_erase_chip_async(device, pi_task_block(&task)); + pi_task_wait_on(&task); + return 0; +} + +static inline int mx25u_erase_sector(struct pi_device *device, uint32_t pi_flash_addr) +{ + 
pi_task_t task; + mx25u_erase_sector_async(device, pi_flash_addr, pi_task_block(&task)); + pi_task_wait_on(&task); + return 0; +} + +static inline int mx25u_erase(struct pi_device *device, uint32_t pi_flash_addr, int size) +{ + pi_task_t task; + pi_task_block(&task); + mx25u_erase_async(device, pi_flash_addr, size, &task); + pi_task_wait_on(&task); + return 0; +} + +static inline int mx25u_reg_set(struct pi_device *device, uint32_t pi_flash_addr, uint8_t *value) +{ + pi_task_t task; + mx25u_reg_set_async(device, pi_flash_addr, value, pi_task_block(&task)); + pi_task_wait_on(&task); + return 0; +} + +static inline int mx25u_reg_get(struct pi_device *device, uint32_t pi_flash_addr, uint8_t *value) +{ + pi_task_t task; + mx25u_reg_get_async(device, pi_flash_addr, value, pi_task_block(&task)); + pi_task_wait_on(&task); + return 0; +} + +static inline int mx25u_copy(struct pi_device *device, uint32_t pi_flash_addr, void *buffer, uint32_t size, int ext2loc) +{ + pi_task_t task; + pi_task_block(&task); + if (mx25u_copy_async(device, pi_flash_addr, buffer, size, ext2loc, &task)) + return -1; + pi_task_wait_on(&task); + return 0; +} + +static inline int mx25u_copy_2d(struct pi_device *device, uint32_t pi_flash_addr, void *buffer, uint32_t size, uint32_t stride, uint32_t length, int ext2loc) +{ + pi_task_t task; + pi_task_block(&task); + if (mx25u_copy_2d_async(device, pi_flash_addr, buffer, size, stride, length, ext2loc, &task)) + return -1; + pi_task_wait_on(&task); + return 0; +} + +static pi_flash_api_t mx25u_api = { + .open = &mx25u_open, + .close = &mx25u_close, + .ioctl = &mx25u_ioctl, + .read_async = &mx25u_read_async, + .program_async = &mx25u_program_async, + .erase_chip_async = &mx25u_erase_chip_async, + .erase_sector_async = &mx25u_erase_sector_async, + .erase_async = &mx25u_erase_async, + .reg_set_async = &mx25u_reg_set_async, + .reg_get_async = &mx25u_reg_get_async, + .copy_async = &mx25u_copy_async, + .copy_2d_async = &mx25u_copy_2d_async, + .read = 
&mx25u_read, + .program = &mx25u_program, + .erase_chip = &mx25u_erase_chip, + .erase_sector = &mx25u_erase_sector, + .erase = &mx25u_erase, + .reg_set = &mx25u_reg_set, + .reg_get = &mx25u_reg_get, + .copy = &mx25u_copy, + .copy_2d = &mx25u_copy_2d, +}; + + + +void pi_mx25u51245g_conf_init(struct pi_mx25u51245g_conf *conf) +{ + conf->flash.api = &mx25u_api; + bsp_mx25u51245g_conf_init(conf); + __flash_conf_init(&conf->flash); + conf->xip_en = 0; +} diff --git a/rtos/pmsis/pmsis_bsp/fs/fs.c b/rtos/pmsis/pmsis_bsp/fs/fs.c index 1b79329df..f47fcb449 100644 --- a/rtos/pmsis/pmsis_bsp/fs/fs.c +++ b/rtos/pmsis/pmsis_bsp/fs/fs.c @@ -340,19 +340,21 @@ void __pi_cl_fs_copy_req_exec(void *_req); void __pi_cl_fs_copy_req_done(void *_req) { pi_cl_fs_req_t *req = (pi_cl_fs_req_t *)_req; + pi_cl_fs_req_t *next_req; pi_fs_file_t *file = req->file; pi_fs_data_t *fs = req->file->fs_data; pi_task_t *task = &fs->cl_req_task; - cl_notify_task_done(&(req->copy.done), req->copy.cid); - uint32_t irq = disable_irq(); fs->cluster_reqs_first = (void *)req->callback.next; - req = fs->cluster_reqs_first; + next_req = fs->cluster_reqs_first; restore_irq(irq); - if (req) + + cl_notify_task_done(&(req->copy.done), req->copy.cid); + + if (next_req) { - __pi_cl_fs_copy_req_exec(req); + __pi_cl_fs_copy_req_exec(next_req); } } diff --git a/rtos/pmsis/pmsis_bsp/gpio/fxl6408.c b/rtos/pmsis/pmsis_bsp/gpio/fxl6408.c new file mode 100644 index 000000000..bf26b3544 --- /dev/null +++ b/rtos/pmsis/pmsis_bsp/gpio/fxl6408.c @@ -0,0 +1,466 @@ +/* + * Copyright (C) 2022 GreenWaves Technologies + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use fxl6408 file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "pmsis.h" +#include "bsp/gpio/fxl6408.h" + +/**************/ +/* Structures */ +/**************/ + +typedef enum { + FXL6408_REG_DEVICE_ID_CTRL = 0x1, + FXL6408_REG_IO_DIRECTION = 0x3, + FXL6408_REG_OUTPUT_STATE = 0x5, + FXL6408_REG_OUTPUT_HIGHZ = 0x7, + FXL6408_REG_INPUT_DEFAULT_STATE = 0x9, + FXL6408_REG_PULL_ENABLE = 0xB, + FXL6408_REG_PULL_UP_DOWN = 0xD, + FXL6408_REG_INPUT_STATUS = 0xF, + FXL6408_REG_INTERRUPT_MASK = 0x11, + FXL6408_REG_INTERRUPT_STATUS = 0x13, +} fxl6408_register_e; + +/* values for the IO direction register */ +enum { + __GPIO_MODE_INPUT = 0x0, + __GPIO_MODE_OUTPUT = 0x1, +}; + +/* values for the output highz register */ +enum { + __GPIO_HIGHZ_DISABLED = 0x0, + __GPIO_HIGHZ_ENABLED = 0x1, +}; + +/* values for the output value register */ +enum { + __GPIO_OUTPUT_LOW = 0x0, + __GPIO_OUTPUT_HIGH = 0x1, +}; + +/* values for the interrupt mask register */ +enum { + __GPIO_INTERRUPT_ENABLED = 0x0, + __GPIO_INTERRUPT_DISABLED = 0x1, +}; + +/* values for the input default state register */ +enum { + __GPIO_TRIGGER_RISING = 0x0, + __GPIO_TRIGGER_FALLING = 0x1, +}; + +/* values for the pull enable register */ +enum { + __GPIO_PULL_DISABLED = 0x0, + __GPIO_PULL_ENABLED = 0x1, +}; + +/* values for the pull up down register */ +enum { + __GPIO_PULL_DOWN = 0x0, + __GPIO_PULL_UP = 0x1, +}; + +typedef struct { + pi_device_t i2c; + pi_device_t gpio_irq; + + /* store device registers value locally to avoid reading and writing back */ + uint8_t gpio_dir; + uint8_t gpio_value; + uint8_t gpio_high_z; + uint8_t gpio_trigger; + 
uint8_t gpio_pull_enable;
+    uint8_t gpio_pull_updown;
+    uint8_t gpio_interrupt_mask;
+
+    /* gpio irq status related */
+    pi_task_t gpio_irq_cb;
+    pi_task_t i2c_interrupt_status_cb;
+    /* this value will be set by the i2c write read async */
+    uint8_t i2c_interrupt_status_payload; //1-byte payload
+    uint8_t i2c_interrupt_status;
+    pi_task_t *irq_tasks[8];
+} fxl6408_t;
+
+/********************/
+/* Static functions */
+/********************/
+
+/*
+ * Write one byte to an expander register.
+ *
+ * Sends the register address followed by the value in a single I2C transfer.
+ * Returns 0 on success, -1 when pi_i2c_write() reports an error.
+ */
+static int __pi_fxl6408_reg_write(pi_device_t *dev, fxl6408_register_e addr,
+        uint8_t value)
+{
+    uint8_t buffer[2] = { addr, value };
+    if (pi_i2c_write(dev, buffer, 2, PI_I2C_XFER_START | PI_I2C_XFER_STOP))
+    {
+        return -1;
+    }
+    return 0;
+}
+
+
+/*
+ * Read one byte from an expander register.
+ *
+ * The register address is copied into a real one-byte buffer before being
+ * handed to the I2C driver: passing a pointer to the enum object itself only
+ * sends the right byte when the low-order byte of the enum comes first in
+ * memory. The result is pre-set to 0 so that a transfer which does not fill
+ * the buffer yields a defined value instead of stack garbage.
+ */
+static uint8_t __pi_fxl6408_reg_read(pi_device_t *dev,
+        fxl6408_register_e addr)
+{
+    uint8_t reg = (uint8_t) addr;
+    uint8_t result = 0;
+    pi_i2c_write_read(dev, &reg, &result, 1, 1);
+    return result;
+}
+
+
+/*
+ * Put the expander in a state where it drives no output: all pins as inputs,
+ * all output values 0, all outputs high-Z. The matching shadow fields
+ * (gpio_dir, gpio_value, gpio_high_z) are updated as well.
+ * Returns 0 on success, -1 as soon as one register write fails.
+ */
+static int __pi_fxl6408_reset(fxl6408_t *fxl6408)
+{
+    // To reset the IO expander, just make sure it does not drive gpio outputs
+    // Set all GPIO to input
+    fxl6408->gpio_dir = 0x00;
+    if (__pi_fxl6408_reg_write(&fxl6408->i2c,
+                FXL6408_REG_IO_DIRECTION, fxl6408->gpio_dir)) {
+        return -1;
+    }
+
+    // Set all GPIO output value to 0
+    fxl6408->gpio_value = 0x00;
+    if (__pi_fxl6408_reg_write(&fxl6408->i2c,
+                FXL6408_REG_OUTPUT_STATE, fxl6408->gpio_value)) {
+        return -1;
+    }
+
+    // Set all GPIO to high-Z
+    fxl6408->gpio_high_z = 0xFF;
+    if (__pi_fxl6408_reg_write(&fxl6408->i2c,
+                FXL6408_REG_OUTPUT_HIGHZ, fxl6408->gpio_high_z)) {
+        return -1;
+    }
+
+    return 0;
+}
+
+static void __attribute__((noinline))
+__i2c_interrupt_status_cb(void *arg)
+{
+    fxl6408_t* fxl6408 = (fxl6408_t *) arg;
+    // retrieve the status of the interrupt and
+    // schedule corresponding pi_task
+
+    uint8_t irq_status = fxl6408->i2c_interrupt_status;
+    for (int i = 0; i < 8; i++) {
+        if (irq_status & (1 << i)) {
+            /* irq has been triggered */
+            if (fxl6408->irq_tasks[i] != NULL) {
+                /* schedule the task */
+
pi_task_push(fxl6408->irq_tasks[i]);
+            }
+        }
+    }
+}
+
+/*
+ * Task callback attached to the expander interrupt pin in pi_fxl6408_open().
+ * Starts an asynchronous I2C read of the interrupt status register into
+ * i2c_interrupt_status, with __i2c_interrupt_status_cb as completion task.
+ */
+static void __attribute__((noinline))
+__gpio_irq_cb(void* arg)
+{
+    fxl6408_t* fxl6408 = (fxl6408_t *) arg;
+    // read the interrupt status (async call) will call another callback.
+    pi_task_callback(&fxl6408->i2c_interrupt_status_cb,
+            __i2c_interrupt_status_cb, (void*) fxl6408);
+    fxl6408->i2c_interrupt_status_payload = FXL6408_REG_INTERRUPT_STATUS;
+    pi_i2c_write_read_async(&fxl6408->i2c,
+            &fxl6408->i2c_interrupt_status_payload,
+            &fxl6408->i2c_interrupt_status,
+            1, 1,
+            &fxl6408->i2c_interrupt_status_cb);
+}
+
+/*****************/
+/* API Functions */
+/*****************/
+
+/*
+ * Open the FXL6408 IO expander described by device->config.
+ *
+ * Allocates the driver state, opens the I2C link, resets the expander so it
+ * drives no output, loads the shadow registers, and hooks the interrupt pin.
+ *
+ * Returns 0 on success, -1 when the driver state cannot be allocated and -2
+ * on any later failure. On failure the state is freed and device->data is
+ * cleared.
+ */
+int pi_fxl6408_open(pi_device_t *device)
+{
+    struct pi_fxl6408_conf *conf = (struct pi_fxl6408_conf *) device->config;
+
+    fxl6408_t *fxl6408 = (fxl6408_t *) pmsis_l2_malloc(sizeof(fxl6408_t));
+    if (fxl6408 == NULL)
+    {
+        return -1;
+    }
+
+    device->data = (void *) fxl6408;
+
+    struct pi_i2c_conf i2c_conf;
+    pi_i2c_conf_init(&i2c_conf);
+    i2c_conf.itf = conf->i2c_itf;
+    i2c_conf.max_baudrate = 100000;
+    pi_i2c_conf_set_slave_addr(&i2c_conf, 0x86, 0);
+
+    pi_open_from_conf(&fxl6408->i2c, &i2c_conf);
+    if (pi_i2c_open(&fxl6408->i2c)) goto error;
+
+    /* Reset the IO expander to a known state, in case it was kept on
+     * after chip reset */
+    if (__pi_fxl6408_reset(fxl6408)) goto error2;
+
+    /* The reset above only sets gpio_dir, gpio_value and gpio_high_z. Load
+     * the remaining shadow registers from the device, so that the
+     * read-modify-write updates done in pi_fxl6408_gpio_set() do not start
+     * from whatever the freshly allocated memory happened to contain. */
+    fxl6408->gpio_trigger = __pi_fxl6408_reg_read(&fxl6408->i2c,
+            FXL6408_REG_INPUT_DEFAULT_STATE);
+    fxl6408->gpio_pull_enable = __pi_fxl6408_reg_read(&fxl6408->i2c,
+            FXL6408_REG_PULL_ENABLE);
+    fxl6408->gpio_pull_updown = __pi_fxl6408_reg_read(&fxl6408->i2c,
+            FXL6408_REG_PULL_UP_DOWN);
+    fxl6408->gpio_interrupt_mask = __pi_fxl6408_reg_read(&fxl6408->i2c,
+            FXL6408_REG_INTERRUPT_MASK);
+    fxl6408->i2c_interrupt_status = 0;
+
+    /* initialize gpio input irq tasks */
+    for (int i = 0; i < 8; i++)
+    {
+        fxl6408->irq_tasks[i] = NULL;
+    }
+
+    /* initialize gpio irq callback */
+    {
+        pi_gpio_e gpio_pin = conf->interrupt_pin;
+        struct pi_gpio_conf gpio_conf;
+
+        pi_gpio_conf_init(&gpio_conf);
+        /* NOTE(review): the port is derived from a fixed pad (PI_PAD_089)
+         * while the pin comes from conf->interrupt_pin - confirm this is
+         * valid for every interrupt pin a board may configure. */
+        gpio_conf.port = PI_PAD_089 / 32;
+
+        pi_open_from_conf(&fxl6408->gpio_irq, &gpio_conf);
+        if (PI_OK != pi_gpio_open(&fxl6408->gpio_irq)) {
+            goto error2;
+        }
+
+        pi_gpio_notif_e irq_type = PI_GPIO_NOTIF_FALL;
+        pi_gpio_flags_e cfg_flags = PI_GPIO_INPUT
+            | PI_GPIO_PULL_DISABLE
+            | PI_GPIO_DRIVE_STRENGTH_LOW;
+
+        pi_gpio_pin_configure(&fxl6408->gpio_irq, gpio_pin, cfg_flags);
+        pi_gpio_pin_notif_configure(&fxl6408->gpio_irq, gpio_pin, irq_type);
+
+        pi_task_callback(&fxl6408->gpio_irq_cb, __gpio_irq_cb, (void*) fxl6408);
+
+        if (PI_OK != pi_gpio_pin_task_add(&fxl6408->gpio_irq, gpio_pin,
+                    &fxl6408->gpio_irq_cb, irq_type))
+        {
+            goto error3;
+        }
+    }
+
+    /* clears the reset int else interrupt won't trigger the int pin
+     * (value is not important) */
+    __pi_fxl6408_reg_read(&fxl6408->i2c, FXL6408_REG_DEVICE_ID_CTRL);
+    /* clears the interrupt status (value is not important) */
+    __pi_fxl6408_reg_read(&fxl6408->i2c, FXL6408_REG_INTERRUPT_STATUS);
+
+    return 0;
+
+error3:
+    //pi_gpio_close(&fxl6408->gpio_irq); //not implemented on pulpos2 ?
+error2:
+    pi_i2c_close(&fxl6408->i2c);
+error:
+    pmsis_l2_malloc_free(fxl6408, sizeof(fxl6408_t));
+    /* The state is gone: clear the handle so that the NULL checks in the
+     * other API functions reject this device instead of using freed memory */
+    device->data = NULL;
+    return -2;
+}
+
+/*
+ * Configure one expander pin from gpio_conf.
+ *
+ * Output pins get their direction, high-Z and (when driven) output value
+ * registers written; input pins get their direction, interrupt mask/trigger
+ * and pull registers written. Each register is updated through its shadow
+ * copy in the driver state.
+ *
+ * Returns PI_OK on success, PI_ERR_INVALID_ARG for a NULL argument, an
+ * unopened device or a pin id outside 0..7, and PI_ERR_INVALID_STATE when a
+ * register write fails.
+ */
+int pi_fxl6408_gpio_set(pi_device_t *device, pi_fxl6408_gpio_conf_t *gpio_conf)
+{
+    if (NULL == device || NULL == device->data || NULL == gpio_conf)
+    {
+        return PI_ERR_INVALID_ARG;
+    }
+
+    /* The id is used as a bit position in 8-bit registers and as an index
+     * into irq_tasks[8]; anything else would corrupt the driver state */
+    if (((unsigned int) gpio_conf->id) >= 8u)
+    {
+        return PI_ERR_INVALID_ARG;
+    }
+
+    fxl6408_t *fxl6408 = (fxl6408_t *)device->data;
+
+    if (gpio_conf->direction == FXL6408_GPIO_DIR_OUTPUT)
+    {
+        /* pin is in output mode, only set registers that have an effect */
+        fxl6408->gpio_dir = __BITINSERT_R(fxl6408->gpio_dir,
+                __GPIO_MODE_OUTPUT, 1, gpio_conf->id);
+        if (__pi_fxl6408_reg_write(&fxl6408->i2c, FXL6408_REG_IO_DIRECTION, fxl6408->gpio_dir))
+        {
+            return PI_ERR_INVALID_STATE;
+        }
+
+        if (gpio_conf->output_state == FXL6408_GPIO_OUTPUT_STATE_DISABLED)
+        {
+            fxl6408->gpio_high_z = __BITINSERT_R(fxl6408->gpio_high_z,
+                    __GPIO_HIGHZ_ENABLED, 1, gpio_conf->id);
+        }
+        else {
+            fxl6408->gpio_high_z = __BITINSERT_R(fxl6408->gpio_high_z,
+                    __GPIO_HIGHZ_DISABLED, 1, gpio_conf->id);
+
+            if (gpio_conf->output_state == FXL6408_GPIO_OUTPUT_STATE_LOW)
+            {
+                fxl6408->gpio_value = __BITINSERT_R(fxl6408->gpio_value,
+                        __GPIO_OUTPUT_LOW, 1, gpio_conf->id);
+            }
+            else
+            {
+                fxl6408->gpio_value = __BITINSERT_R(fxl6408->gpio_value,
+                        __GPIO_OUTPUT_HIGH, 1, gpio_conf->id);
+            }
+
+            if
(__pi_fxl6408_reg_write(&fxl6408->i2c, FXL6408_REG_OUTPUT_STATE, fxl6408->gpio_value)) + { + return PI_ERR_INVALID_STATE; + } + } + if (__pi_fxl6408_reg_write(&fxl6408->i2c, FXL6408_REG_OUTPUT_HIGHZ, fxl6408->gpio_high_z)) + { + return PI_ERR_INVALID_STATE; + } + } + else + { + /* pin is in input mode, only set registers that have an effect */ + fxl6408->gpio_dir = __BITINSERT_R(fxl6408->gpio_dir, + __GPIO_MODE_INPUT, 1, gpio_conf->id); + if (__pi_fxl6408_reg_write(&fxl6408->i2c, FXL6408_REG_IO_DIRECTION, fxl6408->gpio_dir)) + { + return PI_ERR_INVALID_STATE; + } + + // input trigger => (disabled, rising, falling) + if (gpio_conf->input_trigger == FXL6408_GPIO_INPUT_TRIGGER_DISABLED) + { + fxl6408->gpio_interrupt_mask = __BITINSERT_R(fxl6408->gpio_interrupt_mask, + __GPIO_INTERRUPT_DISABLED, 1, gpio_conf->id); + } + else { + fxl6408->gpio_interrupt_mask = __BITINSERT_R(fxl6408->gpio_interrupt_mask, + __GPIO_INTERRUPT_ENABLED, 1, gpio_conf->id); + + if (gpio_conf->input_trigger == FXL6408_GPIO_INPUT_TRIGGER_FALLING) + { + fxl6408->gpio_trigger = __BITINSERT_R(fxl6408->gpio_trigger, + __GPIO_TRIGGER_FALLING, 1, gpio_conf->id); + } + else + { + fxl6408->gpio_trigger = __BITINSERT_R(fxl6408->gpio_trigger, + __GPIO_TRIGGER_RISING, 1, gpio_conf->id); + } + + if (__pi_fxl6408_reg_write(&fxl6408->i2c, FXL6408_REG_INPUT_DEFAULT_STATE, + fxl6408->gpio_trigger)) + { + return PI_ERR_INVALID_STATE; + } + + + /* set the irq task */ + if (NULL != gpio_conf->irq_task) + { + fxl6408->irq_tasks[gpio_conf->id] = gpio_conf->irq_task; + } + } + if (__pi_fxl6408_reg_write(&fxl6408->i2c, FXL6408_REG_INTERRUPT_MASK, + fxl6408->gpio_interrupt_mask)) + { + return PI_ERR_INVALID_STATE; + } + + // pull state => (disabled, pull-up, pull-down) + if (gpio_conf->pull_state == FXL6408_GPIO_PULL_STATE_DISABLED) + { + fxl6408->gpio_pull_enable = __BITINSERT_R(fxl6408->gpio_pull_enable, + __GPIO_PULL_DISABLED, 1, gpio_conf->id); + } + else { + fxl6408->gpio_pull_enable = 
__BITINSERT_R(fxl6408->gpio_pull_enable, + __GPIO_PULL_ENABLED, 1, gpio_conf->id); + + if (gpio_conf->pull_state == FXL6408_GPIO_PULL_STATE_DOWN) + { + fxl6408->gpio_pull_updown = __BITINSERT_R(fxl6408->gpio_pull_updown, + __GPIO_PULL_DOWN, 1, gpio_conf->id); + } + else + { + fxl6408->gpio_pull_updown = __BITINSERT_R(fxl6408->gpio_pull_updown, + __GPIO_PULL_UP, 1, gpio_conf->id); + } + + if (__pi_fxl6408_reg_write(&fxl6408->i2c, FXL6408_REG_PULL_UP_DOWN, + fxl6408->gpio_pull_updown)) + { + return PI_ERR_INVALID_STATE; + } + } + if (__pi_fxl6408_reg_write(&fxl6408->i2c, FXL6408_REG_PULL_ENABLE, + fxl6408->gpio_pull_enable)) + { + return PI_ERR_INVALID_STATE; + } + } + + return PI_OK; +} + +int pi_fxl6408_input_status_get(pi_device_t *device, uint8_t *input_status) +{ + if (NULL == device || NULL == device->data || NULL == input_status) + { + return PI_ERR_INVALID_ARG; + } + + fxl6408_t *fxl6408 = (fxl6408_t *)device->data; + *input_status = __pi_fxl6408_reg_read(&fxl6408->i2c, FXL6408_REG_INPUT_STATUS); + + return PI_OK; +} + +int pi_fxl6408_interrupt_status_get(pi_device_t *device, uint8_t *interrupt_status) +{ + if (NULL == device || NULL == device->data || NULL == interrupt_status) + { + return PI_ERR_INVALID_ARG; + } + + fxl6408_t *fxl6408 = (fxl6408_t *)device->data; + *interrupt_status = __pi_fxl6408_reg_read(&fxl6408->i2c, FXL6408_REG_INTERRUPT_STATUS); + + return PI_OK; +} + + +void pi_fxl6408_close(pi_device_t *device) +{ + fxl6408_t *fxl6408 = (fxl6408_t *)device->data; + // Make sure it is not driving anymore any gpio output + __pi_fxl6408_reset(fxl6408); + pi_i2c_close(&fxl6408->i2c); + pmsis_l2_malloc_free(fxl6408, sizeof(fxl6408_t)); +} + + +void pi_fxl6408_conf_init(struct pi_fxl6408_conf *conf) +{ + conf->i2c_itf = 0; + conf->interrupt_pin = 0; +} + +void pi_fxl6408_gpio_conf_init(pi_fxl6408_gpio_conf_t *gpio_conf) +{ + if (NULL == gpio_conf) + { + return; + } + + gpio_conf->id = 0; + + /* set the pin as output high impedance */ + 
gpio_conf->direction = FXL6408_GPIO_DIR_OUTPUT; + gpio_conf->output_state = FXL6408_GPIO_OUTPUT_STATE_DISABLED; + + /* no effect */ + gpio_conf->input_trigger = FXL6408_GPIO_INPUT_TRIGGER_DISABLED; + gpio_conf->pull_state = FXL6408_GPIO_PULL_STATE_DISABLED; + + gpio_conf->irq_task = NULL; +} diff --git a/rtos/pmsis/pmsis_bsp/include/bsp/adc/ads1014.h b/rtos/pmsis/pmsis_bsp/include/bsp/adc/ads1014.h new file mode 100644 index 000000000..df4eb24be --- /dev/null +++ b/rtos/pmsis/pmsis_bsp/include/bsp/adc/ads1014.h @@ -0,0 +1,218 @@ +/* + * Copyright (C) 2021 GreenWaves Technologies + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "pmsis.h" + +#pragma once + +/** + * @addtogroup ADC + * @{ + */ + +/** + * @defgroup ADS1014 ADS1014 + * + * TI ADS1014 Analog-To-Digital-Converter + * + * @warning Support for comparator IRQ handling is not implemented. + */ + +/** + * @addtogroup ADS1014 + * @{ + */ + +/** + * ADS1014 PGA (Programmable Gain Amplifier) values + * + * It is the range of the measured value. + * The absolute value of the measured voltage will never go above the + * power supply voltage value. + */ +enum ads1014_pga { + ADS1014_PGA_FSR_6V144 = 0x0, + ADS1014_PGA_FSR_4V096 = 0x1, + ADS1014_PGA_FSR_2V048 = 0x2, + ADS1014_PGA_FSR_1V024 = 0x3, + ADS1014_PGA_FSR_0V512 = 0x4, + ADS1014_PGA_FSR_0V256 = 0x5, +}; + +/** + * ADS1014 operating mode values + */ +enum ads1014_operating_mode { + /** The ADC measures continuously. 
*/ + ADS1014_OPERATING_MODE_CONTINUOUS = 0x0, + /** The ADC only measures once, and goes back to power-saving mode */ + ADS1014_OPERATING_MODE_SINGLE_SHOT = 0x1, +}; + +/** + * ADS1014 Sampling rate (samples per second) + */ +enum ads1014_data_rate { + ADS1014_DATA_RATE_SPS_128 = 0x0, + ADS1014_DATA_RATE_SPS_250 = 0x1, + ADS1014_DATA_RATE_SPS_490 = 0x2, + ADS1014_DATA_RATE_SPS_920 = 0x3, + ADS1014_DATA_RATE_SPS_1600 = 0x4, + ADS1014_DATA_RATE_SPS_2400 = 0x5, + ADS1014_DATA_RATE_SPS_3300 = 0x6, +}; + +/** + * ADS1014 Comparator mode + */ +enum ads1014_comparator_mode { + /** + * The comparator triggers when the measured value goes above the high + * threshold, and resets when the value goes below the low threshold. + */ + ADS1014_COMPARATOR_MODE_TRADITIONAL = 0x0, + /** + * The comparator triggers if the measured value goes outside the window, + * i.e. above the high threshold or below the low threshold. + */ + ADS1014_COMPARATOR_MODE_WINDOW = 0x1, +}; + +/** + * ADS1014 Alert/Ready comparator pin polarity + * + * Controls the ADC Alert/Ready pin active polarity + */ +enum ads1014_comparator_polarity { + ADS1014_COMPARATOR_POLARITY_ACTIVE_LOW = 0x0, + ADS1014_COMPARATOR_POLARITY_ACTIVE_HIGH = 0x1, +}; + +/** + * ADS1014 comparator latching mode + * + * Determines whether the comparator latches after triggering. + * When the comparator is set to latch, it will only be cleared by reading + * the ADC measured value. + */ +enum ads1014_comparator_latch { + ADS1014_COMPARATOR_LATCH_DISABLED = 0x0, + ADS1014_COMPARATOR_LATCH_ENABLED = 0x1, +}; + +/** + * ADS1014 comparator status + * + * Determines after how many out-of-bounds conversions the comparator will + * trigger. 
+ */
+enum ads1014_comparator_status {
+    /** triggers after 1 conversion */
+    ADS1014_COMPARATOR_STATUS_ASSERT_ONE = 0x0,
+    /** triggers after 2 out-of-bounds conversions */
+    ADS1014_COMPARATOR_STATUS_ASSERT_TWO = 0x1,
+    /** triggers after 3 out-of-bounds conversions */
+    ADS1014_COMPARATOR_STATUS_ASSERT_THREE = 0x2,
+    /** the comparator is disabled */
+    ADS1014_COMPARATOR_STATUS_DISABLED = 0x3,
+};
+
+/* @brief Structure holding ADS1014 configuration */
+struct pi_ads1014_conf
+{
+    /** I2C interface which is connected to the ADC */
+    uint8_t i2c_itf;
+    /** Address of the ADC */
+    uint8_t i2c_addr;
+
+    /** ADC operating mode (single-shot or continuous) */
+    enum ads1014_operating_mode operating_mode;
+    /** range of the measured value */
+    enum ads1014_pga pga;
+    /** sampling rate */
+    enum ads1014_data_rate data_rate;
+
+    /** ADC comparator status (enabled with its trigger condition, or disabled) */
+    enum ads1014_comparator_status comparator_status;
+    /** ADC comparator mode (traditional or window) */
+    enum ads1014_comparator_mode comparator_mode;
+    /** ADC comparator latch setting */
+    enum ads1014_comparator_latch comparator_latch;
+    /** ADC comparator triggered polarity */
+    enum ads1014_comparator_polarity comparator_polarity;
+};
+
+/**
+ * @brief Initialize an ADS1014 configuration with default values.
+ *
+ * The structure containing the configuration must be kept alive until
+ * the device is opened.
+ * It can only be called from fabric-controller side.
+ *
+ * @param[inout] conf Pointer to the device configuration.
+ */
+void pi_ads1014_conf_init(struct pi_ads1014_conf *conf);
+
+/**
+ * Open an ADS1014 device
+ *
+ * @param[inout] device pointer to the ADS1014 device
+ *
+ * @return PI_OK if operation was successful,
+ *         an error code otherwise
+ */
+int pi_ads1014_open(pi_device_t *device);
+
+/**
+ * Close an ADS1014 device
+ *
+ * @param[inout] device pointer to the ADS1014 device
+ */
+void pi_ads1014_close(pi_device_t *device);
+
+
+/**
+ * Read the value measured by the ADC.
+ *
+ * @param[in] device pointer to the ads1014 device
+ * @param[out] value value in mV returned by the ADC
+ *
+ * @return PI_OK if operation was successful,
+ *         an error code otherwise
+ */
+int pi_ads1014_read(pi_device_t *device, float *value);
+
+/**
+ * Set the comparator thresholds (low and high)
+ *
+ * @param[in] device pointer to the ads1014 device
+ * @param[in] threshold_low new value for comparator low threshold (in mV)
+ * @param[in] threshold_high new value for comparator high threshold (in mV)
+ *
+ * @return PI_OK if operation was successful,
+ *         an error code otherwise
+ */
+int pi_ads1014_set_comparator_thresholds(pi_device_t *device,
+        float threshold_low, float threshold_high);
+
+
+/**
+ * @}
+ */
+
+/**
+ * @}
+ */
diff --git a/rtos/pmsis/pmsis_bsp/include/bsp/audio/adc/tlv320.h b/rtos/pmsis/pmsis_bsp/include/bsp/audio/adc/tlv320.h
new file mode 100644
index 000000000..c78aa4cd7
--- /dev/null
+++ b/rtos/pmsis/pmsis_bsp/include/bsp/audio/adc/tlv320.h
@@ -0,0 +1,64 @@
+/*
+ * Copyright (C) 2019 GreenWaves Technologies
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "pmsis.h"
+
+#pragma once
+
+/**
+ * @addtogroup ADC
+ * @{
+ */
+
+/**
+ * @defgroup tlv320 tlv320
+ *
+ * ADC tlv320
+ */
+
+/**
+ * @addtogroup tlv320
+ * @{
+ */
+
+/* @brief Struct holding tlv320 configuration. */
+struct pi_tlv320_conf
+{
+    int i2c_itf;
+};
+
+/**
+ * @brief Initialize a tlv320 configuration with default values.
+ *
+ * The structure containing the configuration must be kept alive until
+ * the device is opened.
+ * It can only be called from fabric-controller side.
+ *
+ * @param conf Pointer to the device configuration.
+ */
+void pi_tlv320_conf_init(struct pi_tlv320_conf *conf);
+
+int pi_tlv320_open(pi_device_t *device);
+
+void pi_tlv320_close(struct pi_device *device);
+
+/**
+ * @}
+ */
+
+/**
+ * @}
+ */
diff --git a/rtos/pmsis/pmsis_bsp/include/bsp/audio/dac/ak4332.h b/rtos/pmsis/pmsis_bsp/include/bsp/audio/dac/ak4332.h
new file mode 100644
index 000000000..f2c3833e0
--- /dev/null
+++ b/rtos/pmsis/pmsis_bsp/include/bsp/audio/dac/ak4332.h
@@ -0,0 +1,104 @@
+/*
+ * Copyright (C) 2019 GreenWaves Technologies
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "pmsis.h"
+
+#pragma once
+
+/**
+ * @addtogroup Dac
+ * @{
+ */
+
+/**
+ * @defgroup Ak4332 Ak4332
+ *
+ * DAC AK4332
+ */
+
+/**
+ * @addtogroup Ak4332
+ * @{
+ */
+
+/* @brief Struct holding ak4332 configuration.
*/ +struct pi_ak4332_conf +{ + int i2c_itf; /*!< I2C interface number where the device is connected. */ +}; + +/** + * @brief Initialize an ak4332 configuration with default values. + * + * The structure containing the configuration must be kept alive until + * the device is opened. + * It can only be called from fabric-controller side. + * + * @param conf Pointer to the device configuration. + */ +void pi_ak4332_conf_init(struct pi_ak4332_conf *conf); + +/** \brief Open an ak4332 device. + * + * This function must be called before the ak4332 device can be used. + * It will do all the needed configuration to make it usable and initialize + * the handle used to refer to this opened device when calling other functions. + * + * \param device A pointer to the device structure of the device to open. + * This structure is allocated by the caller and must be kept alive until the + * device is closed. + * \return 0 if the operation is successful, -1 if there was an error. + */ +int pi_ak4332_open(pi_device_t *device); + +/** + * @brief Set DAC digital input volume + * + * The volume can be set to 0 to mute it or from 0x01 (-12dB) to 0x1F (+3.0dB). + * + * @param device Pointer to the device structure. + * \return 0 if the operation is successful, -1 if there was an error. + */ +int pi_ak4332_set_dac_volume(pi_device_t *device, uint8_t volume); + +/** + * @brief Set headphone amplifier volume + * + * The volume can be set from 0x00 (-10dB) to 0x7 (+4dB). + * + * @param device Pointer to the device structure. + * \return 0 if the operation is successful, -1 if there was an error. + */ +int pi_ak4332_set_hp_volume(pi_device_t *device, uint8_t volume); + +/** \brief Close an opened ak4332 device. + * + * This function can be called to close an opened ak4332 device once it is + * not needed anymore, in order to free all allocated resources. Once this + * function is called, the device is not accessible anymore and must be opened + * again before being used.
+ * + * \param device The device structure of the device to close. + */ +void pi_ak4332_close(struct pi_device *device); + +/** + * @} + */ + +/** + * @} + */ diff --git a/rtos/pmsis/pmsis_bsp/include/bsp/boards/gap9_evk/audio_addon.h b/rtos/pmsis/pmsis_bsp/include/bsp/boards/gap9_evk/audio_addon.h index c8da1cb47..47ed8bfc5 100644 --- a/rtos/pmsis/pmsis_bsp/include/bsp/boards/gap9_evk/audio_addon.h +++ b/rtos/pmsis/pmsis_bsp/include/bsp/boards/gap9_evk/audio_addon.h @@ -18,11 +18,12 @@ #include -#define CONFIG_AK4332 - #define CONFIG_AK4332_I2C_ITF 1 #define CONFIG_AK4332_I2S_ITF 2 +#define CONFIG_TLV320_I2C_ITF 1 +#define CONFIG_TLV320_I2S_ITF 2 + #ifdef __cplusplus extern "C" { #endif @@ -39,7 +40,6 @@ uint8_t pi_bsp_fxl6408_read_id(); void __bsp_audio_addon_init(); #define CONFIG_FXL6408UMX_I2C_ITF 1 -#define CONFIG_FXL6408UMX_I2C_ADDR 0x86 #define CONFIG_FXL6408UMX_AK4332_GPIO 1 #define CONFIG_FXL6408UMX_TLV320_GPIO 2 /// @endcond diff --git a/rtos/pmsis/pmsis_bsp/include/bsp/bsp.h b/rtos/pmsis/pmsis_bsp/include/bsp/bsp.h index 2beabc8ba..bda0383c2 100644 --- a/rtos/pmsis/pmsis_bsp/include/bsp/bsp.h +++ b/rtos/pmsis/pmsis_bsp/include/bsp/bsp.h @@ -178,6 +178,12 @@ void bsp_atxp032_conf_init(struct pi_atxp032_conf *conf); int bsp_atxp032_open(struct pi_atxp032_conf *conf); #endif +#if defined(CONFIG_MX25U51245G) +#include "bsp/flash/mx25u51245g.h" +void bsp_mx25u51245g_conf_init(struct pi_mx25u51245g_conf *conf); +int bsp_mx25u51245g_open(struct pi_mx25u51245g_conf *conf); +#endif + #if defined(CONFIG_NINA_W10) #include "bsp/transport/nina_w10.h" void bsp_nina_w10_conf_init(struct pi_nina_w10_conf *conf); @@ -204,6 +210,19 @@ void bsp_ak4332_conf_init(struct pi_ak4332_conf *conf); int bsp_ak4332_open(struct pi_ak4332_conf *conf); #endif /* CONFIG_AK4332 */ +#if defined(CONFIG_TLV320) +#include "audio/adc/tlv320.h" +void bsp_tlv320_conf_init(struct pi_tlv320_conf *conf); +int bsp_tlv320_open(struct pi_tlv320_conf *conf); +#endif /* CONFIG_TLV320 */ + +#if 
defined(CONFIG_FXL6408) +#include "gpio/fxl6408.h" +void bsp_fxl6408_conf_init(struct pi_fxl6408_conf *conf); +int bsp_fxl6408_open(struct pi_fxl6408_conf *conf); +int bsp_fxl6408_close(struct pi_fxl6408_conf *conf); +#endif /* CONFIG_FXL6408 */ + void bsp_init(); void pi_bsp_init(); diff --git a/rtos/pmsis/pmsis_bsp/include/bsp/flash/mx25u51245g.h b/rtos/pmsis/pmsis_bsp/include/bsp/flash/mx25u51245g.h new file mode 100644 index 000000000..d2cb1fa88 --- /dev/null +++ b/rtos/pmsis/pmsis_bsp/include/bsp/flash/mx25u51245g.h @@ -0,0 +1,75 @@ +/* + * Copyright (C) 2019 GreenWaves Technologies + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __BSP__FLASH__MX25U51245G_H__ +#define __BSP__FLASH__MX25U51245G_H__ + +#include "bsp/flash.h" + +/** + * @addtogroup Flash + * @{ + */ + +/** + * @defgroup Mx25u51245g Mx25u51245g + * + */ + +/** + * @addtogroup Mx25u51245g + * @{ + */ + +/**@{*/ + +/** \struct pi_mx25u51245g_conf + * \brief Mx25u51245g configuration structure. + * + * This structure is used to pass the desired Mx25u51245g configuration to the + * runtime when opening the device. + */ +struct pi_mx25u51245g_conf +{ + struct pi_flash_conf flash; /*!< Generic flash configuration. */ + int spi_itf; /*!< SPI interface where the flash is + connected. */ + int spi_cs; /*!< Chip select where the flash is connected. */ + int xip_en; + uint32_t baudrate; /*!< Baudrate (in bytes/second).
*/ +}; + +/** \brief Initialize an Mx25u51245g configuration with default values. + * + * The structure containing the configuration must be kept alive until the + * mx25u51245g device is opened. + * + * \param conf A pointer to the mx25u51245g configuration. + */ +void pi_mx25u51245g_conf_init(struct pi_mx25u51245g_conf *conf); + + +//!@} + +/** + * @} end of Mx25u51245g + */ + +/** + * @} end of Flash + */ + +#endif diff --git a/rtos/pmsis/pmsis_bsp/include/bsp/gap9_evk.h b/rtos/pmsis/pmsis_bsp/include/bsp/gap9_evk.h index 735d7a0f6..9a0cebbc8 100644 --- a/rtos/pmsis/pmsis_bsp/include/bsp/gap9_evk.h +++ b/rtos/pmsis/pmsis_bsp/include/bsp/gap9_evk.h @@ -22,46 +22,23 @@ #endif #define CONFIG_HIMAX -#define CONFIG_HYPERFLASH #define CONFIG_MRAM -#define CONFIG_HYPERRAM -#define CONFIG_SPIRAM -#define CONFIG_SPIFLASH #define CONFIG_24XX1025 #define CONFIG_APS25XXXN #define CONFIG_VIRTUAL_EEPROM -#define CONFIG_ATXP032 +#define CONFIG_MX25U51245G #define CONFIG_NINA_B112 #define CONFIG_HIMAX_CPI_ITF 0 #define CONFIG_HIMAX_I2C_ITF 0 -#define CONFIG_HYPERFLASH_HYPER_ITF 0 -#define CONFIG_HYPERFLASH_HYPER_CS 1 - -#define CONFIG_HYPERRAM_HYPER_ITF 0 -#define CONFIG_HYPERRAM_HYPER_CS 0 -#define CONFIG_HYPERRAM_START 0 -#define CONFIG_HYPERRAM_SIZE (8<<20) - -#define CONFIG_SPIRAM_SPI_ITF 0 -#define CONFIG_SPIRAM_SPI_CS 0 -#define CONFIG_SPIRAM_START 0 -#define CONFIG_SPIRAM_SIZE (1<<20) - -#define CONFIG_APS25XXXN_SPI_ITF 1 -#define CONFIG_APS25XXXN_SPI_CS 0 +#define CONFIG_APS25XXXN_SPI_ITF 0 +#define CONFIG_APS25XXXN_SPI_CS 1 #define CONFIG_APS25XXXN_START 0 #define CONFIG_APS25XXXN_SIZE (1<<25) -#define CONFIG_ATXP032_SPI_ITF 1 -#define CONFIG_ATXP032_SPI_CS 1 - -#define CONFIG_SPIFLASH_SPI_ITF 0 -#define CONFIG_SPIFLASH_SPI_CS 0 -#define CONFIG_SPIFLASH_START 0 -#define CONFIG_SPIFLASH_SIZE (1<<24) -#define CONFIG_SPIFLASH_SECTOR_SIZE (1<<12) +#define CONFIG_MX25U51245G_SPI_ITF 0 +#define CONFIG_MX25U51245G_SPI_CS 0 #define CONFIG_24XX1025_I2C_ADDR 0xA0 #define 
CONFIG_24XX1025_I2C_ITF 0 @@ -80,4 +57,10 @@ #define GPIO_NINA_PWRON ( PI_PAD_042) #define GPIO_NINA17_DSR ( PI_PAD_043) +#define pi_default_flash_conf pi_mx25u51245g_conf +#define pi_default_flash_conf_init pi_mx25u51245g_conf_init + +#define pi_default_ram_conf pi_aps25xxxn_conf +#define pi_default_ram_conf_init pi_aps25xxxn_conf_init + #endif diff --git a/rtos/pmsis/pmsis_bsp/include/bsp/gap9_v2.h b/rtos/pmsis/pmsis_bsp/include/bsp/gap9_v2.h index e935802f3..7eba5d772 100644 --- a/rtos/pmsis/pmsis_bsp/include/bsp/gap9_v2.h +++ b/rtos/pmsis/pmsis_bsp/include/bsp/gap9_v2.h @@ -45,13 +45,13 @@ #define CONFIG_SPIRAM_START 0 #define CONFIG_SPIRAM_SIZE (1<<20) -#define CONFIG_APS25XXXN_SPI_ITF 1 -#define CONFIG_APS25XXXN_SPI_CS 0 +#define CONFIG_APS25XXXN_SPI_ITF 0 +#define CONFIG_APS25XXXN_SPI_CS 1 #define CONFIG_APS25XXXN_START 0 #define CONFIG_APS25XXXN_SIZE (1<<25) -#define CONFIG_ATXP032_SPI_ITF 1 -#define CONFIG_ATXP032_SPI_CS 1 +#define CONFIG_ATXP032_SPI_ITF 0 +#define CONFIG_ATXP032_SPI_CS 0 #define CONFIG_SPIFLASH_SPI_ITF 0 #define CONFIG_SPIFLASH_SPI_CS 0 @@ -76,4 +76,22 @@ #define GPIO_NINA_PWRON ( PI_PAD_042) #define GPIO_NINA17_DSR ( PI_PAD_043) +#if defined(__PLATFORM_GVSOC__) + +#define pi_default_flash_conf pi_hyperflash_conf +#define pi_default_flash_conf_init pi_hyperflash_conf_init + +#define pi_default_ram_conf pi_hyperram_conf +#define pi_default_ram_conf_init pi_hyperram_conf_init + +#else + +#define pi_default_flash_conf pi_atxp032_conf +#define pi_default_flash_conf_init pi_atxp032_conf_init + +#define pi_default_ram_conf pi_aps25xxxn_conf +#define pi_default_ram_conf_init pi_aps25xxxn_conf_init + +#endif + #endif diff --git a/rtos/pmsis/pmsis_bsp/include/bsp/gpio/fxl6408.h b/rtos/pmsis/pmsis_bsp/include/bsp/gpio/fxl6408.h new file mode 100644 index 000000000..268733dae --- /dev/null +++ b/rtos/pmsis/pmsis_bsp/include/bsp/gpio/fxl6408.h @@ -0,0 +1,196 @@ +/* + * Copyright (C) 2022 GreenWaves Technologies + * + * Licensed under the Apache 
License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "pmsis.h" + +#pragma once + +/** + * @addtogroup GPIO + * @{ + */ + +/** + * @defgroup FXL6408 FXL6408 + * + * I2C Controlled GPIO Expander (8 configurable IOs) + */ + +/** + * @addtogroup FXL6408 + * @{ + */ + +/** @brief Struct holding FXL6408 device configuration. */ +struct pi_fxl6408_conf +{ + int i2c_itf; /*!< I2C interface number where the device is connected. */ + pi_gpio_e interrupt_pin; /*!< interrupt pin */ +}; + +/** + * Direction of a GPIO (input or output) + */ +typedef enum { + FXL6408_GPIO_DIR_INPUT = 0x0, + FXL6408_GPIO_DIR_OUTPUT = 0x1, +} fxl6408_gpio_dir_e; + +/** + * GPIO Output state + * + * This has no effect in input mode. + */ +typedef enum { + /** GPIO is in High-Z (impedance) mode */ + FXL6408_GPIO_OUTPUT_STATE_DISABLED = 0x0, + /** GPIO is in low voltage level, or 0 */ + FXL6408_GPIO_OUTPUT_STATE_LOW = 0x1, + /** GPIO is in high voltage level, or 1 */ + FXL6408_GPIO_OUTPUT_STATE_HIGH = 0x2, +} fxl6408_gpio_output_state_e; + +/** + * GPIO Input trigger conditions (disabled, rising or falling edge) + * + * This has no effect in output mode.
+ */ +typedef enum { + /** Input will trigger on a rising edge */ + FXL6408_GPIO_INPUT_TRIGGER_RISING = 0x0, + /** Input will trigger on a falling edge */ + FXL6408_GPIO_INPUT_TRIGGER_FALLING = 0x1, + /** Input will never trigger */ + FXL6408_GPIO_INPUT_TRIGGER_DISABLED = 0x2, +} fxl6408_gpio_input_trigger_e; + +/** GPIO Pull up/down state + * + * This has no effect in output mode. + */ +typedef enum { + /** GPIO is set to pull down */ + FXL6408_GPIO_PULL_STATE_DOWN = 0x0, + /** GPIO is set to pull up */ + FXL6408_GPIO_PULL_STATE_UP = 0x1, + /** GPIO pull is disabled */ + FXL6408_GPIO_PULL_STATE_DISABLED = 0x2, +} fxl6408_gpio_pull_state_e; + +/** Structure holding the configuration of a FXL6408 GPIO */ +typedef struct { + /** GPIO id(from 0 to 7) */ + uint8_t id; + /** Direction (input or output) */ + fxl6408_gpio_dir_e direction; + /** Output State (disabled/High-Z, 0 or 1) */ + fxl6408_gpio_output_state_e output_state; + /** Input trigger (trigger on falling edge, rising edge or disabled) */ + fxl6408_gpio_input_trigger_e input_trigger; + /** pull state (pull-up, pull-down, disabled) */ + fxl6408_gpio_pull_state_e pull_state; + /** task executed when an irq is detected */ + pi_task_t *irq_task; +} pi_fxl6408_gpio_conf_t; + +/** + * @brief Initialize an FXL6408 configuration with default values. + * + * The structure containing the configuration must be kept alive until + * the device is opened. + * It can only be called from fabric-controller side. + * It is not thread-safe and cannot be called from a pmsis task callback or + * interrupt handler. + * + * @param[inout] conf Pointer to the device configuration. + */ +void pi_fxl6408_conf_init(struct pi_fxl6408_conf *conf); + +/** + * @brief Open an FXL6408 device. + * + * It can only be called from fabric-controller side. + * It is not thread-safe and cannot be called from a pmsis task callback or + * interrupt handler. + * + * @param[inout] device Pointer to the device. 
+ * + * @return 0 if successful or any other value otherwise + */ +int pi_fxl6408_open(pi_device_t *device); + +/** + * Close an FXL6408 device. + * + * @param[inout] device device to be closed + */ +void pi_fxl6408_close(pi_device_t *device); + +/** + * Initialize the configuration of a GPIO (Output, High-Z) + * + * @param[inout] gpio_conf configuration of the gpio + */ +void pi_fxl6408_gpio_conf_init(pi_fxl6408_gpio_conf_t *gpio_conf); + +/** + * @brief Set a GPIO state. + * + * It can only be called from fabric-controller side. + * It is not thread-safe and cannot be called from a pmsis task callback or + * interrupt handler. + * + * @param[in] device Pointer to the device. + * @param[in] gpio_conf GPIO configuration + * @return PI_OK if successful or any other value otherwise + */ +int pi_fxl6408_gpio_set(pi_device_t *device, pi_fxl6408_gpio_conf_t *gpio_conf); + +/** + * Return the current status of inputs + * + * Each bit of the input status is the status of the corresponding + * gpio input. + * + * @param[in] device pointer to the device + * @param[out] input_status value of the input status register + * + * @return PI_OK if operation was successful, an error code otherwise. + */ +int pi_fxl6408_input_status_get(pi_device_t *device, uint8_t *input_status); + +/** + * Return the current status of interrupts + * + * This will clear the interrupt status register. + * + * Each bit of the interrupt status is the status of the corresponding + * gpio interrupt. + * + * @param[in] device pointer to the device + * @param[out] interrupt_status value of the interrupt status register + * + * @return PI_OK if operation was successful, an error code otherwise.
+ */ +int pi_fxl6408_interrupt_status_get(pi_device_t *device, uint8_t *interrupt_status); + +/** + * @} + */ + +/** + * @} + */ diff --git a/rtos/pmsis/pmsis_bsp/ram/hyperram/hyperram.c b/rtos/pmsis/pmsis_bsp/ram/hyperram/hyperram.c index d1ced37ad..853297d77 100644 --- a/rtos/pmsis/pmsis_bsp/ram/hyperram/hyperram.c +++ b/rtos/pmsis/pmsis_bsp/ram/hyperram/hyperram.c @@ -92,6 +92,18 @@ static int hyperram_open(struct pi_device *device) goto error2; } + +#if defined(WINBOND_HYPER) + hyper_crt0_set(REG_ACCESS); + hyper_crt1_set(REG_ACCESS); + hyperram->reg_value = 0x9f10; + pi_hyper_write(&hyperram->hyper_device, 0x1000, &hyperram->reg_value, 2); + pi_hyper_read(&hyperram->hyper_device, 0x1000, &hyperram->reg_value, 2); + //printf("Reg value of 0x80001000 = %lx\n", hyperram->reg_value); + hyper_crt0_set(MEM_ACCESS); + hyper_crt1_set(MEM_ACCESS); +#endif + #if defined(__GAP9__) pi_hyper_ioctl(&hyperram->hyper_device, PI_HYPER_IOCTL_ENABLE_AES, (void*) 0); diff --git a/rtos/pmsis/pmsis_bsp/ram/spiram/aps25xxxn.c b/rtos/pmsis/pmsis_bsp/ram/spiram/aps25xxxn.c index f19c892aa..529b89d71 100644 --- a/rtos/pmsis/pmsis_bsp/ram/spiram/aps25xxxn.c +++ b/rtos/pmsis/pmsis_bsp/ram/spiram/aps25xxxn.c @@ -215,6 +215,10 @@ void pi_aps25xxxn_conf_init(struct pi_aps25xxxn_conf *conf) conf->baudrate = 0; conf->xip_en = 0; conf->reserve_addr_0 = 1; + #if defined(__GAP9__) + conf->ram.aes_conf.enabled = 0; + conf->ram.aes_conf.qk_en = 0; + #endif bsp_aps25xxxn_conf_init(conf); } diff --git a/rtos/pmsis/pmsis_bsp/rules/freertos_bsp_rules.mk b/rtos/pmsis/pmsis_bsp/rules/freertos_bsp_rules.mk index a6e1acd7d..d5bf804f8 100644 --- a/rtos/pmsis/pmsis_bsp/rules/freertos_bsp_rules.mk +++ b/rtos/pmsis/pmsis_bsp/rules/freertos_bsp_rules.mk @@ -19,21 +19,23 @@ include $(PMSIS_BSP_DIR)/src.mk ifeq ($(BOARD_NAME), gapuino) -PMSIS_BSP_SRC = $(GAPUINO_SRC) +PMSIS_BSP_SRC += $(GAPUINO_SRC) else ifeq ($(BOARD_NAME), gapoc_a) -PMSIS_BSP_SRC = $(GAPOC_A_SRC) +PMSIS_BSP_SRC += $(GAPOC_A_SRC) else ifeq 
($(BOARD_NAME), gapoc_a_revb) -PMSIS_BSP_SRC = $(GAPOC_A_SRC) +PMSIS_BSP_SRC += $(GAPOC_A_SRC) else ifeq ($(BOARD_NAME), gapoc_b) -PMSIS_BSP_SRC = $(GAPOC_B_SRC) +PMSIS_BSP_SRC += $(GAPOC_B_SRC) else ifeq ($(BOARD_NAME), gapoc_b_revb) -PMSIS_BSP_SRC = $(GAPOC_B_SRC) +PMSIS_BSP_SRC += $(GAPOC_B_SRC) else ifeq ($(BOARD_NAME), vega) -PMSIS_BSP_SRC = $(VEGA_SRC) +PMSIS_BSP_SRC += $(VEGA_SRC) else ifeq ($(BOARD_NAME), gap9_v2) -PMSIS_BSP_SRC = $(GAP9_SRC) +PMSIS_BSP_SRC += $(GAP9_SRC) else ifeq ($(BOARD_NAME), ai_deck) -PMSIS_BSP_SRC = $(AI_DECK_SRC) +PMSIS_BSP_SRC += $(AI_DECK_SRC) +else ifeq ($(BOARD_NAME), gap9_evk) +PMSIS_BSP_SRC += $(GAP9_EVK_SRC) endif EXCLUDE_FROM_SRCS= transport/transport.c transport/nina_w10/nina_w10.c diff --git a/rtos/pmsis/pmsis_bsp/rules/pulpos/src.mk b/rtos/pmsis/pmsis_bsp/rules/pulpos/src.mk index d01b1dc05..96493e41f 100644 --- a/rtos/pmsis/pmsis_bsp/rules/pulpos/src.mk +++ b/rtos/pmsis/pmsis_bsp/rules/pulpos/src.mk @@ -18,11 +18,6 @@ endif BOARD_PROFILE_UPPER = $(shell echo $(PULPOS_BOARD_PROFILE) | tr 'a-z' 'A-Z') PULP_CFLAGS += -DCONFIG_PROFILE_$(BOARD_PROFILE_UPPER) -ifneq (,$(findstring $(BOARD_FEATURES),audio_addon)) - PULP_SRCS += $(BSP_GAP9_EVK_AUDIO_ADDON) - PULP_CFLAGS += -DCONFIG_GAP9_EVK_AUDIO_ADDON=1 -endif - # BSP is needed if i2s is used to properly configure pads ifeq '$(CONFIG_I2S)' '1' CONFIG_BSP = 1 @@ -116,6 +111,5 @@ ifeq '$(CONFIG_BLE_NINA_B112)' '1' PULP_SRCS += $(BSP_BLE_NINA_B112_SRC) endif -ifeq '$(CONFIG_AK4332)' '1' -PULP_SRCS += $(BSP_AK4332_SRC) -endif +PULP_SRCS += $(PMSIS_BSP_SRC) +PULP_CFLAGS += $(PMSIS_BSP_CFLAGS) \ No newline at end of file diff --git a/rtos/pmsis/pmsis_bsp/src.mk b/rtos/pmsis/pmsis_bsp/src.mk index 58c747924..505fe287f 100644 --- a/rtos/pmsis/pmsis_bsp/src.mk +++ b/rtos/pmsis/pmsis_bsp/src.mk @@ -22,6 +22,9 @@ BSP_HIMAX_SRC = camera/himax/himax.c BSP_HM0360_SRC = camera/hm0360/hm0360.c BSP_BLE_NINA_B112_SRC= ble/ble.c ble/nina_b112/nina_b112.c ble/nina_b112/nina_b112_old.c 
BSP_AK4332_SRC = audio/dac/ak4332.c +BSP_TLV320_SRC = audio/adc/tlv320.c +BSP_FXL6408_SRC = gpio/fxl6408.c +BSP_ADC_ADS1014_SRC = adc/ads1014.c COMMON_SRC = \ $(BSP_FLASH_SRC) \ @@ -57,10 +60,8 @@ GAP9_SRC = \ $(BSP_HIMAX_SRC) \ $(BSP_HYPERFLASH_SRC) \ $(BSP_HYPERRAM_SRC) \ - $(BSP_RAM_SRC) \ $(BSP_MRAM_SRC) \ $(BSP_OSPI_FLASH_SRC) \ - $(BSP_OSPI_RAM_SRC) \ $(BSP_BLE_NINA_B112_SRC) WOLFE_SRC = \ @@ -105,6 +106,11 @@ AI_DECK_SRC = \ $(BSP_SPIFLASH_SRC) \ $(BSP_RAM_SRC) +GAP9_EVK_SRC = \ + $(COMMON_SRC) \ + $(BSP_MRAM_SRC) \ + bsp/gap9_evk.c + GAPOC_A_SRC = \ $(COMMON_SRC) \ bsp/gapoc_a.c \ @@ -152,3 +158,66 @@ GAPOC_B_SRC = \ camera/ov5640/ov5640.c endif # TARGET_CHIP +ifeq '$(BOARD_NAME)' 'gap9_evk' +# Configure the right spi flash +CONFIG_MX25U51245G=1 +CONFIG_APS25XXXN=1 +CONFIG_IO_UART_ITF=1 +CONFIG_IO_UART_BAUDRATE=115200 +endif + +ifeq '$(BOARD_NAME)' 'gap9_v2' +# Configure the right spi flash +CONFIG_ATXP032=1 +CONFIG_HYPERFLASH=1 +CONFIG_HYPERRAM=1 +CONFIG_APS25XXXN=1 +endif + +ifneq (,$(findstring audio_addon,$(BOARD_FEATURES))) + PMSIS_BSP_SRC += $(BSP_GAP9_EVK_AUDIO_ADDON) + PMSIS_BSP_CFLAGS += -DCONFIG_GAP9_EVK_AUDIO_ADDON=1 +endif + +CONFIG_OCTOSPI = 1 +ifeq '$(CONFIG_AK4332)' '1' +PMSIS_BSP_SRC += $(BSP_AK4332_SRC) +CONFIG_FXL6408 = 1 +CONFIG_I2C = 1 +PMSIS_BSP_CFLAGS += -DCONFIG_AK4332=1 +endif + +ifeq '$(CONFIG_TLV320)' '1' +PMSIS_BSP_SRC += $(BSP_TLV320_SRC) +CONFIG_FXL6408 = 1 +CONFIG_I2C = 1 +PMSIS_BSP_CFLAGS += -DCONFIG_TLV320=1 +endif + +ifeq '$(CONFIG_FXL6408)' '1' +PMSIS_BSP_SRC += $(BSP_FXL6408_SRC) +PMSIS_BSP_CFLAGS += -DCONFIG_FXL6408=1 +CONFIG_I2C = 1 +endif + +ifeq '$(CONFIG_MX25U51245G)' '1' +PMSIS_BSP_SRC += flash/spiflash/mx25u51245g.c +CONFIG_FLASH = 1 +CONFIG_OCTOSPI = 1 +endif + +ifeq '$(CONFIG_APS25XXXN)' '1' +PMSIS_BSP_SRC += ram/spiram/aps25xxxn.c +CONFIG_RAM = 1 +CONFIG_OCTOSPI = 1 +endif + +ifeq '$(CONFIG_RAM)' '1' +PMSIS_BSP_SRC += $(BSP_RAM_SRC) +CONFIG_BSP = 1 +endif + +ifeq '$(CONFIG_ADS1014)' '1' +PMSIS_BSP_SRC +=
$(BSP_ADC_ADS1014_SRC) +PMSIS_BSP_CFLAGS += -DCONFIG_ADS1014=1 +endif diff --git a/rtos/pmsis/pmsis_implem/CMakeLists.txt b/rtos/pmsis/pmsis_implem/CMakeLists.txt index c7876bb83..f4eb2da9d 100644 --- a/rtos/pmsis/pmsis_implem/CMakeLists.txt +++ b/rtos/pmsis/pmsis_implem/CMakeLists.txt @@ -1,7 +1,14 @@ # Driver sources LIST(APPEND PMSIS_SRC chips/gap9/drivers/i2s/i2s.c + chips/gap9/drivers/spim/spim.c chips/gap9/drivers/udma/udma_core.c + chips/gap9/drivers/udma/udma_datamove.c + chips/gap9/drivers/udma/udma_ffc.c + chips/gap9/drivers/udma/udma_timeout.c + chips/gap9/drivers/udma/udma_timestamp.c + chips/gap9/drivers/i2c/i2c.c + chips/gap9/drivers/i2c/i2c_slave.c ) add_library(pmsis_implem STATIC ${PMSIS_SRC}) diff --git a/rtos/pmsis/pmsis_implem/chips/gap9/drivers/i2c/i2c.c b/rtos/pmsis/pmsis_implem/chips/gap9/drivers/i2c/i2c.c new file mode 100644 index 000000000..566c8c84a --- /dev/null +++ b/rtos/pmsis/pmsis_implem/chips/gap9/drivers/i2c/i2c.c @@ -0,0 +1,893 @@ +/* + * Copyright (C) 2022 GreenWaves Technologies + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "pmsis.h" +#include "pmsis/drivers/i2c.h" +#include "i2c_internal.h" + + +//#define USE_TIMEOUT 1 + + +i2c_itf_data_t *__pi_i2c_itf_data[UDMA_NB_I2C]; + + +static int __pi_i2c_prepare_write_cmd_buf(i2c_slave_data_t *slave_data, uint32_t *write_buffer, + int size, pi_i2c_xfer_flags_e flags); + +static int __pi_i2c_prepare_read_cmd_buf(i2c_slave_data_t *slave_data, uint32_t *write_buffer, + int size, pi_i2c_xfer_flags_e flags); + +static int __pi_i2c_prepare_dual_cmd_buf(i2c_slave_data_t *slave_data, uint32_t *write_buffer, + int size, pi_i2c_xfer_flags_e flags); + +static int __pi_i2c_prepare_write_read_buf(i2c_slave_data_t *slave_data, + uint32_t *buffer, int size0, int size1); + +static int __pi_i2c_prepare_write_dual_buf(i2c_slave_data_t *slave_data, + uint32_t *buffer, int size0, int size1); + + +static void __pi_i2c_write_exec(i2c_slave_data_t *slave_data, uint8_t *buffer, int size, + pi_i2c_xfer_flags_e flags, pi_task_t *task) +{ + uint16_t slave_addr = slave_data->slave_addr; + i2c_itf_data_t *itf_data = slave_data->itf_data; + +#if defined(USE_TIMEOUT) + // TODO this does not take into account errors. 
Timeout should be restarted with proper time + uint32_t timeout_us = task->timeout; + if (timeout_us) + { + __pi_i2c_timeout_config_set(task, slave_data->itf_data->tx_timeout_id, + slave_data->itf_data->tx_chan_id, timeout_us, + __pi_i2c_timeout_abort, slave_data->itf_data); + } +#endif + int cmd_buf_size = __pi_i2c_prepare_write_cmd_buf(slave_data, itf_data->cmd_buf, + size, flags); + pi_udma_core_lin_enqueue(itf_data->tx_chan_addr, (uint32_t)buffer, size, 0); + pi_udma_core_lin_enqueue(itf_data->cmd_chan_addr, (uint32_t)itf_data->cmd_buf, + cmd_buf_size*sizeof(uint32_t), 0); +} + + +static void __pi_i2c_read_exec(i2c_slave_data_t *slave_data, uint8_t *buffer, int size, + pi_i2c_xfer_flags_e flags, pi_task_t *task) +{ + uint16_t slave_addr = slave_data->slave_addr; + i2c_itf_data_t *itf_data = slave_data->itf_data; + +#if defined(USE_TIMEOUT) + uint32_t timeout_us = task->timeout; + if (timeout_us) + { + __pi_i2c_timeout_config_set(task, slave_data->itf_data->rx_timeout_id, + slave_data->itf_data->rx_chan_id, timeout_us, + __pi_i2c_timeout_abort, slave_data->itf_data); + } +#endif + int cmd_buf_size = __pi_i2c_prepare_read_cmd_buf(slave_data, itf_data->cmd_buf, size, + flags); + pi_udma_core_lin_enqueue(itf_data->rx_chan_addr, (uint32_t)buffer, size, 0); + pi_udma_core_lin_enqueue(itf_data->cmd_chan_addr, (uint32_t)itf_data->cmd_buf, + cmd_buf_size*sizeof(uint32_t), 0); +} + + +static void __pi_i2c_write_read_exec(i2c_slave_data_t *slave_data, void *tx_buffer, + void *rx_buffer, uint32_t tx_size, uint32_t rx_size, pi_task_t *task) +{ + uint16_t slave_addr = slave_data->slave_addr; + i2c_itf_data_t *itf_data = slave_data->itf_data; + int8_t bits = slave_data->is_10_bits; + + pi_udma_core_lin_enqueue(itf_data->tx_chan_addr, (uint32_t)tx_buffer, tx_size, 0); + pi_udma_core_lin_enqueue(itf_data->rx_chan_addr, (uint32_t)rx_buffer, rx_size, 0); + + int cmd_buf_size = __pi_i2c_prepare_write_read_buf(slave_data, itf_data->cmd_buf, + tx_size, rx_size); + 
pi_udma_core_lin_enqueue(itf_data->cmd_chan_addr, (uint32_t)itf_data->cmd_buf, + sizeof(i2c_cmd_t)*cmd_buf_size, 0); +} + +static void __pi_i2c_write_dual_exec(i2c_slave_data_t *slave_data, void *tx_buffer0, + void *tx_buffer1, uint32_t tx_size0, uint32_t tx_size1, pi_task_t *task) +{ + uint16_t slave_addr = slave_data->slave_addr; + i2c_itf_data_t *itf_data = slave_data->itf_data; + + pi_udma_core_lin_enqueue(itf_data->tx_chan_addr, (uint32_t)tx_buffer0, tx_size0, 0); + pi_udma_core_lin_enqueue(itf_data->tx_chan_addr, (uint32_t)tx_buffer1, tx_size1, 0); + int cmd_buf_size = __pi_i2c_prepare_write_dual_buf(slave_data, itf_data->cmd_buf, + tx_size0, tx_size1); + pi_udma_core_lin_enqueue(itf_data->cmd_chan_addr, (uint32_t)itf_data->cmd_buf, + sizeof(i2c_cmd_t)*cmd_buf_size, 0); +} + + +static inline void __pi_i2c_send_request_from_irq(i2c_itf_data_t* itf_data, pi_task_t* task) +{ + int cmd_buf_size; + + pi_device_t *device = (pi_device_t *)task->data[3]; + i2c_slave_data_t *slave_data = (i2c_slave_data_t *)device->data; + + if(task->data[0] == I2C_WRITE) + { + __pi_i2c_write_exec(slave_data, (void*) task->data[1], task->data[2], task->data[4], task); + } + else if(task->data[0] == I2C_READ) + { + __pi_i2c_read_exec(slave_data, (void*) task->data[1], task->data[2], task->data[4], task); + } + else if(task->data[0] == I2C_WRITE_READ) + { + __pi_i2c_write_read_exec(slave_data, (void*)task->data[1], (void *)task->data[2], + task->data[4], task->data[5], task); + } + else if(task->data[0] == I2C_WRITE_DUAL) + { + __pi_i2c_write_dual_exec(slave_data, (void*)task->data[1], (void *)task->data[2], + task->data[4], task->data[5], task); + } +} + +static inline void __pi_i2c_handle_error(int device_id, i2c_itf_data_t* itf_data) +{ + I2C_TRACE("I2C(%d)->lead_error_handler\n", device_id); + uint32_t nack = __pi_i2c_get_event_status(itf_data->base, I2C_STATUS_ERROR_NACK_EVENT); + uint32_t arlo = __pi_i2c_get_event_status(itf_data->base, I2C_STATUS_ERROR_ARLO_EVENT); + uint32_t 
framing = __pi_i2c_get_event_status(itf_data->base, I2C_STATUS_ERROR_FRAMING_EVENT); + if (nack || arlo || framing) + { + // 1) stop cmd, rx and tx leader udma addr gen + pi_udma_core_lin_stop(itf_data->rx_chan_addr); + pi_udma_core_lin_stop(itf_data->tx_chan_addr); + pi_udma_core_lin_stop(itf_data->cmd_chan_addr); + + // 2) clear event, unlock and purge + udma_i2c_status_reg_idx_set(itf_data->base, + (nack << I2C_STATUS_ERROR_NACK_EVENT) | + (arlo << I2C_STATUS_ERROR_ARLO_EVENT) | + (framing << I2C_STATUS_ERROR_FRAMING_EVENT) | + (1 << I2C_FLAG_UNLOCK_EVENT_O) | + (1 << I2C_FLAG_PURGE_EVENT_O)); + } + + // 3) depends on error: + if (nack) + { + I2C_TRACE_ERR("I2C(%d)->lead_error_handler - nack error\n", device_id); + // NACK => report error + itf_data->end_task->data[0] = PI_ERR_I2C_NACK; + itf_data->end_task->arg[3] = 0; + pi_task_push_irq_safe(itf_data->end_task); + itf_data->end_task = NULL; + } + else if (arlo || framing) + { + I2C_TRACE_ERR("I2C(%d)->lead_error_handler - arbitration loss or framing error\n", device_id); + // ARLO and FRAMING error => restart current CMD buffer + __pi_i2c_send_request_from_irq(itf_data, itf_data->end_task); + } +} + +__attribute__((section(".text"))) __noinline +void __pi_i2c_lead_event_handler(uint32_t event, void *arg) +{ + i2c_itf_data_t *itf_data = arg; + int device_id = itf_data->id; + I2C_TRACE("I2C(%d)->lead_event_handler\n", device_id); + + // I2C_FLAG_CMD_EVENT is basically our EOT? 
+ if(__pi_i2c_get_event_status(itf_data->base,I2C_FLAG_CMD_EVENT_I)) + { + // set the return status to OK + itf_data->end_task->data[0] = PI_OK; + itf_data->end_task->arg[3] = 0; + // if it's a mutex, release on the spot + pi_task_push_irq_safe(itf_data->end_task); + itf_data->end_task = NULL; + udma_i2c_status_reg_idx_set(itf_data->base, 1<end_task = next_task; + } + return; +} + +/** + * \brief internal helper function for preparing write command buffer + */ +static int __pi_i2c_prepare_write_cmd_buf(i2c_slave_data_t *slave_data, uint32_t *buffer, + int size, pi_i2c_xfer_flags_e flags) +{ + uint16_t slave_addr = slave_data->slave_addr; + int index = 0; + + if(!(flags & PI_I2C_XFER_NO_START) || (flags & PI_I2C_XFER_RESTART)) + { // generate a start condition + buffer[index++] = slave_data->cfg; + buffer[index++] = I2C_CMD_LEAD_START(1); + } + // slave addr, no rnw bit since it's a write + if(!slave_data->is_10_bits) + { + buffer[index++] = I2C_CMD_LEAD_SEND_IMM_ADDR(0, slave_addr & 0xFE); + } + else + { + slave_addr = slave_addr & 0x3FF; + uint16_t slave_addrh = (((slave_addr>>7)|0)&0x7) | 0xF0; + uint16_t slave_addrl = (slave_addr&0xFF); + buffer[index++] = I2C_CMD_LEAD_SEND_IMM_ADDR(1, (slave_addrh << 8) + | slave_addrl); + } + + buffer[index++] = I2C_CMD_RPT(size); + buffer[index++] = I2C_CMD_MISC_SEND(1); + if(!(flags & PI_I2C_XFER_NO_STOP)) + { + buffer[index++] = I2C_CMD_STOP(1); + } + + if(slave_data->wait_cycles) + { + buffer[index++] = I2C_CMD_RPT(slave_data->wait_cycles); + buffer[index++] = I2C_CMD_MISC_WAIT(1); + } + + buffer[index++] = I2C_CMD_EVENT(1); + + return index; +} + +/** + * \brief internal helper function for preparing read command buffer + */ +static inline int __pi_i2c_prepare_read_cmd_buf(i2c_slave_data_t *slave_data, uint32_t *buffer, + int size, pi_i2c_xfer_flags_e flags) +{ + uint16_t slave_addr = slave_data->slave_addr; + int index = 0; + + if(!(flags & PI_I2C_XFER_NO_START) || (flags & PI_I2C_XFER_RESTART)) + { + buffer[index++] = 
slave_data->cfg; + buffer[index++] = I2C_CMD_LEAD_START(1); + } + + // slave addr + rnw bit + if(!slave_data->is_10_bits) + { + buffer[index++] = I2C_CMD_LEAD_SEND_IMM_ADDR(0, (slave_addr &0xFE)|1); + } + else + { // for 10 bits, need to use write mode first + slave_addr = slave_addr & 0x3FF; + uint16_t slave_addrh = (((slave_addr>>7)|0)&0x7) | 0xF0; + uint16_t slave_addrl = (slave_addr&0xFF); + buffer[index++] = I2C_CMD_LEAD_SEND_IMM_ADDR(1, (slave_addrh << 8) + | slave_addrl); + buffer[index++] = I2C_CMD_LEAD_START(1); + buffer[index++] = I2C_CMD_LEAD_SEND_IMM(slave_addrh|1); + } + buffer[index++] = I2C_CMD_RPT(size - 1); + // receive -1 byte because there is a "last" + buffer[index++] = I2C_CMD_MISC_RECEIVE(1); + buffer[index++] = I2C_CMD_MISC_RECEIVE_LAST(1); + + if(!(flags & PI_I2C_XFER_NO_STOP)) + { + buffer[index++] = I2C_CMD_STOP(1); + } + + if(slave_data->wait_cycles) + { + buffer[index++] = I2C_CMD_RPT(slave_data->wait_cycles); + buffer[index++] = I2C_CMD_MISC_WAIT(1); + } + buffer[index++] = I2C_CMD_EVENT(1); + + return index; +} + + +/** + * \brief internal helper function for preparing write&read command buffer + */ +static int __pi_i2c_prepare_write_read_buf(i2c_slave_data_t *slave_data, + uint32_t *buffer, int size0, int size1) +{ + int index = 0; + + buffer[index++] = slave_data->cfg; + buffer[index++] = I2C_CMD_LEAD_START(1); + // slave addr, no rnw bit since it's a write + buffer[index++] = I2C_CMD_LEAD_SEND_IMM_ADDR(0, slave_data->slave_addr); + buffer[index++] = I2C_CMD_RPT(size0); + buffer[index++] = I2C_CMD_MISC_SEND(1); + buffer[index++] = I2C_CMD_LEAD_START(1); + // slave addr + rnw bit + if(!slave_data->is_10_bits) + { + buffer[index++] = I2C_CMD_LEAD_SEND_IMM_ADDR(0, slave_data->slave_addr|1); + } + else + {// for 10 bits, need to use write mode first + buffer[index++] = I2C_CMD_LEAD_SEND_IMM_ADDR(1, slave_data->slave_addr); + buffer[index++] = I2C_CMD_LEAD_START(1); + buffer[index++] = I2C_CMD_LEAD_SEND_IMM(slave_data->slave_addrh|1); + } 
+ buffer[index++] = I2C_CMD_RPT(size1); + // receive -1 byte because there is a "last" + buffer[index++] = I2C_CMD_MISC_RECEIVE(1); + buffer[index++] = I2C_CMD_MISC_RECEIVE_LAST(1); + buffer[index++] = I2C_CMD_STOP(1); + buffer[index++] = I2C_CMD_EVENT(1); + + return index; +} + + +/** + * \brief internal helper function for preparing write&read command buffer + */ +static int __pi_i2c_prepare_write_dual_buf(i2c_slave_data_t *slave_data, + uint32_t *buffer, int size0, int size1) +{ + int index = 0; + + buffer[index++] = slave_data->cfg; + buffer[index++] = I2C_CMD_LEAD_START(1); + // slave addr, no rnw bit since it's a write + buffer[index++] = I2C_CMD_LEAD_SEND_IMM_ADDR(0, slave_data->slave_addr); + buffer[index++] = I2C_CMD_RPT(size0); + buffer[index++] = I2C_CMD_MISC_SEND(1); + buffer[index++] = I2C_CMD_RPT(size1); + buffer[index++] = I2C_CMD_MISC_SEND(1); + buffer[index++] = I2C_CMD_STOP(1); + buffer[index++] = I2C_CMD_EVENT(1); + + return index; +} + + +static void __pi_i2c_timestamp_enable(i2c_itf_data_t *itf_data, struct pi_i2c_conf *conf) +{ +#if defined(__FREERTOS__) + uint8_t is_rx = conf->ts_ch; + + uint32_t base = UDMA_CTRL_ADDR; + uint8_t evt_id = conf->ts_evt_id; + uint8_t soc_evt = is_rx ? 
SOC_EVENT_UDMA_CHAN_LIN(itf_data->rx_chan_id) : SOC_EVENT_UDMA_CHAN_LIN(itf_data->tx_chan_id); + + uint32_t cfg_evt_val = (udma_ctrl_cfg_event_get(base) & ~(0xFF<config)); + + i2c_slave_data_t *slave_data; + struct pi_i2c_conf *conf = device->config; + i2c_itf_data_t *itf_data = NULL; + + // check interface first + I2C_TRACE("I2C(%d)->lead_open\n", conf->itf); + int irq = disable_irq(); + + slave_data = pi_fc_l1_malloc(sizeof(i2c_slave_data_t)); + if (slave_data == NULL) goto error0; + + if(!(itf_data = __pi_i2c_itf_data[conf->itf])) + { + uint32_t i2c_base = UDMA_I2C_ADDR(conf->itf); + + // prepare itf struct + itf_data = pi_fc_l1_malloc(sizeof(i2c_itf_data_t)); + I2C_TRACE("I2C(%d)->itf_data=%x\n", conf->itf, itf_data); + if(itf_data == NULL) goto error1; + + itf_data->rx_chan_id = pi_udma_core_lin_alloc(); + if (itf_data->rx_chan_id == -1) goto error2; + itf_data->tx_chan_id = pi_udma_core_lin_alloc(); + if (itf_data->tx_chan_id == -1) goto error3; + itf_data->cmd_chan_id = pi_udma_core_lin_alloc(); + if (itf_data->cmd_chan_id == -1) goto error4; + + __pi_i2c_itf_data[conf->itf] = itf_data; + itf_data->id = conf->itf; + itf_data->fifo_head = NULL; + itf_data->end_task = NULL; + itf_data->base = i2c_base; + itf_data->open_nb = 0; + + // disable udma reset before setting regs + udma_ctrl_cfg_rstn_set_set(UDMA_CTRL_ADDR, 1 << UDMA_I2C_ID(conf->itf)); + udma_ctrl_cfg_cg_set_set(UDMA_CTRL_ADDR, 1 << UDMA_I2C_ID(conf->itf)); + + udma_i2c_lead_udma_rx_dest_reg_idx_set(i2c_base, itf_data->rx_chan_id); + udma_i2c_lead_udma_tx_dest_reg_idx_set(i2c_base, itf_data->tx_chan_id); + udma_i2c_udma_cmd_dest_reg_idx_set(i2c_base, itf_data->cmd_chan_id); + + itf_data->rx_chan_addr = pi_udma_core_lin_addr_get(itf_data->rx_chan_id); + itf_data->tx_chan_addr = pi_udma_core_lin_addr_get(itf_data->tx_chan_id); + itf_data->cmd_chan_addr = pi_udma_core_lin_addr_get(itf_data->cmd_chan_id); + +#if defined(USE_TIMEOUT) + itf_data->rx_timeout_id = 0xFF; + itf_data->tx_timeout_id = 0xFF; 
+#endif + + udma_i2c_status_reg_idx_set(i2c_base, 1<itf), + __pi_i2c_lead_event_handler, itf_data); + pi_soc_eu_fc_mask_set(SOC_EVENT_UDMA_I2C_LEAD_EVT(conf->itf)); + } + + slave_data->itf_data = itf_data; + slave_data->is_10_bits = conf->is_10_bits; + slave_data->wait_cycles = conf->wait_cycles; + slave_data->cfg = __i2c_prepare_timing(conf->max_baudrate, pi_freq_get(PI_FREQ_DOMAIN_PERIPH)); + + if (conf->is_10_bits) + { + uint16_t slave_addr = conf->cs & 0x3FF; + uint16_t slave_addrh = (((slave_addr>>7)|0)&0x7) | 0xF0; + uint16_t slave_addrl = (slave_addr&0xFF); + slave_data->slave_addr = (slave_addrh << 8) | slave_addrl; + slave_data->slave_addrh = slave_addrh; + } + else + { + slave_data->slave_addr = conf->cs & 0xFE; + } + + device->data = (void *)slave_data; + + itf_data->open_nb++; + restore_irq(irq); + return 0; + +error4: + pi_udma_core_lin_free(itf_data->tx_chan_id); +error3: + pi_udma_core_lin_free(itf_data->rx_chan_id); +error2: + pi_fc_l1_free(itf_data, sizeof(i2c_itf_data_t)); +error1: + pi_fc_l1_free(slave_data, sizeof(i2c_slave_data_t)); +error0: + restore_irq(irq); + return -1; +} + + +void pi_i2c_close(pi_device_t *device) +{ + i2c_slave_data_t *slave_data = (i2c_slave_data_t *)device->data; + + I2C_TRACE("I2C->i2c_close\n"); + int irq = disable_irq(); + i2c_itf_data_t *itf_data = slave_data->itf_data; + pi_fc_l1_free(slave_data,sizeof(*slave_data)); + + itf_data->open_nb--; + if(itf_data->open_nb == 0) + { + // flush channels + pi_udma_core_lin_free(itf_data->rx_chan_id); + pi_udma_core_lin_free(itf_data->tx_chan_id); + pi_udma_core_lin_free(itf_data->cmd_chan_id); + + pi_soc_eu_fc_mask_clear(SOC_EVENT_UDMA_CHAN_LIN(itf_data->rx_chan_id)); + pi_soc_eu_fc_mask_clear(SOC_EVENT_UDMA_CHAN_LIN(itf_data->tx_chan_id)); + pi_soc_eu_fc_mask_clear(SOC_EVENT_UDMA_CHAN_LIN(itf_data->cmd_chan_id)); + + pi_udma_core_lin_reset(itf_data->rx_chan_addr); + pi_udma_core_lin_reset(itf_data->tx_chan_addr); + pi_udma_core_lin_reset(itf_data->cmd_chan_addr); + + 
pi_soc_eu_fc_mask_clear(SOC_EVENT_UDMA_I2C_LEAD_EVT(itf_data->id)); + + udma_ctrl_cfg_cg_clr_set(UDMA_CTRL_ADDR, 1 << UDMA_I2C_ID(itf_data->id)); + udma_ctrl_cfg_rstn_clr_set(UDMA_CTRL_ADDR, 1 << UDMA_I2C_ID(itf_data->id)); + __pi_i2c_itf_data[itf_data->id] = NULL; + pi_fc_l1_free(itf_data,sizeof(*itf_data)); + } + restore_irq(irq); +} + + +static void __pi_i2c_baudrate_set(i2c_itf_data_t *driver_data) +{ +} + + +#if defined(USE_TIMEOUT) +void __pi_i2c_timeout_abort(void* arg) +{ + i2c_itf_data_t *driver_data = (i2c_itf_data_t*) arg; + uint32_t device_id = driver_data->id; + int irq = pi_irq_disable(); + /* Stop UDMA channels. */ + udma_core_lin_t *udma_core = NULL; + if (driver_data->cmd_chan_id != 0xFF) + { + pi_udma_core_lin_reset(driver_data->rx_chan_addr); + } + if (driver_data->tx_chan_id != 0xFF) + { + if (driver_data->end_task->data[0] == I2C_WRITE) + { + driver_data->end_task->arg[3] = pi_udma_core_lin_bytes_left_get(driver_data->base); + } + pi_udma_core_lin_reset(driver_data->tx_chan_addr); + } + if (driver_data->rx_chan_id != 0xFF) + { + if (driver_data->end_task->data[0] == I2C_READ) + { + driver_data->end_task->arg[3] = pi_udma_core_lin_bytes_left_get(driver_data->base); + } + pi_udma_core_lin_reset(driver_data->rx_chan_addr); + } + + /* Status events clear. */ + uint32_t status_mask = (1 << UDMA_I2C_STATUS_REG_IDX_STATUS_LEAD_UNLOCK_EVENT_O_IDX_BIT | + 1 << UDMA_I2C_STATUS_REG_IDX_STATUS_LEAD_PURGE_EVENT_O_IDX_BIT | + 1 << UDMA_I2C_STATUS_REG_IDX_STATUS_I2C_SOFT_RESET_EVENT_O_IDX_BIT); + udma_i2c_status_reg_idx_set(driver_data->base, status_mask); + + /* Pop current aborted task. 
*/ + driver_data->end_task = NULL; + pi_task_t *next_task = __pi_i2c_drv_fifo_pop(driver_data); + if (next_task) + { + driver_data->end_task = next_task; + __pi_i2c_send_request_from_irq(driver_data, next_task); + } + pi_irq_restore(irq); +} + +static void __pi_i2c_udma_timeout_rx_set(i2c_itf_data_t *driver_data, + uint8_t timeout_id) +{ + driver_data->rx_timeout_id = timeout_id; +} + +static void __pi_i2c_udma_timeout_tx_set(i2c_itf_data_t *driver_data, + uint8_t timeout_id) +{ + driver_data->tx_timeout_id = timeout_id; +} +#endif + +void pi_i2c_ioctl(struct pi_device *device, uint32_t cmd, void *arg) +{ + i2c_slave_data_t *slave_data = (i2c_slave_data_t *)device->data; + + I2C_TRACE("I2C(%d) : ioctl cmd=%lx, arg=%lx\n", slave_data->itf_data->id, cmd, arg); + + uint32_t irq = disable_irq(); + uint8_t udma_timeout_id = 0xFF; + switch (cmd) + { + case PI_I2C_CTRL_SET_MAX_BAUDRATE : + __pi_i2c_baudrate_set(slave_data->itf_data); + break; + + case PI_I2C_IOCTL_ABORT_RX : + __pi_i2c_timeout_abort(slave_data->itf_data); + break; + + case PI_I2C_IOCTL_ABORT_TX : + __pi_i2c_timeout_abort(slave_data->itf_data); + break; + + case PI_I2C_IOCTL_ATTACH_TIMEOUT_RX : +#if defined(USE_TIMEOUT) + udma_timeout_id = (uint32_t) arg; + __pi_i2c_udma_timeout_rx_set(slave_data->itf_data, udma_timeout_id); +#endif + break; + + case PI_I2C_IOCTL_DETACH_TIMEOUT_RX : +#if defined(USE_TIMEOUT) + __pi_i2c_udma_timeout_rx_set(slave_data->itf_data, udma_timeout_id); +#endif + break; + + case PI_I2C_IOCTL_ATTACH_TIMEOUT_TX : +#if defined(USE_TIMEOUT) + udma_timeout_id = (uint32_t) arg; + __pi_i2c_udma_timeout_tx_set(slave_data->itf_data, udma_timeout_id); +#endif + break; + + case PI_I2C_IOCTL_DETACH_TIMEOUT_TX : +#if defined(USE_TIMEOUT) + __pi_i2c_udma_timeout_tx_set(slave_data->itf_data, udma_timeout_id); +#endif + break; + + case PI_I2C_IOCTL_EN_TIMESTAMP : + __pi_i2c_timestamp_enable(slave_data->itf_data, (struct pi_i2c_conf *) arg); + break; + + default : + break; + } + restore_irq(irq); 
+} + + +void pi_i2c_conf_init(pi_i2c_conf_t *conf) +{ + pi_assert(NULL != conf); + + conf->max_baudrate = 400000; + conf->itf = 0; + conf->cs = 0; + conf->wait_cycles = 0; + conf->is_10_bits = 0; +} + +void pi_i2c_conf_set_wait_cycles(struct pi_i2c_conf *conf, uint16_t wait_cycles) +{ + pi_assert(NULL != conf); + + conf->wait_cycles = wait_cycles; +} + +/** accessors **/ +void pi_i2c_conf_set_slave_addr(struct pi_i2c_conf *conf, uint16_t slave_addr, + int8_t is_10_bits) +{ + pi_assert(NULL != conf); + + conf->cs = slave_addr; + conf->is_10_bits = is_10_bits; +} + +int pi_i2c_write(struct pi_device *device, uint8_t *tx_data, int length, pi_i2c_xfer_flags_e flags) +{ + pi_task_t task_block; + pi_task_block(&task_block); + pi_i2c_write_async(device, (void*)tx_data, (uint32_t)length, flags, &task_block); + pi_task_wait_on(&task_block); + return pi_i2c_get_request_status(&task_block); +} + +int pi_i2c_read (struct pi_device *device, uint8_t *rx_buff, int length, + pi_i2c_xfer_flags_e flags) +{ + pi_task_t task_block; + pi_task_block(&task_block); + pi_i2c_read_async(device, (void*)rx_buff, (uint32_t)length, flags, &task_block); + pi_task_wait_on(&task_block); + return pi_i2c_get_request_status(&task_block); +} + +void pi_i2c_read_async(struct pi_device *device, uint8_t *buffer, int size, + pi_i2c_xfer_flags_e flags, pi_task_t *task) +{ + pi_assert(NULL != device); + pi_assert((NULL != rx_buff) && (IS_BUFF_IN_L2(rx_buff))); + pi_assert(0 != length); + pi_assert(NULL != task); + + i2c_slave_data_t *slave_data = (i2c_slave_data_t *)device->data; + + i2c_itf_data_t *itf_data = slave_data->itf_data; + uint16_t slave_addr = slave_data->slave_addr; + + task->data[0] = I2C_READ; + task->data[1] = (uintptr_t)buffer; + task->data[2] = (uintptr_t)size; + task->data[3] = (uintptr_t)device; + task->data[4] = (uintptr_t)flags; + + int irq = disable_irq(); + if (!itf_data->end_task) + { // exec transfer + __pi_i2c_read_exec(slave_data, buffer, size, flags, task); + itf_data->end_task 
= task; + } + else + { + __pi_i2c_drv_fifo_enqueue(itf_data, task); + } + restore_irq(irq); +} + +void pi_i2c_write_async(struct pi_device *device, uint8_t *buffer, int size, + pi_i2c_xfer_flags_e flags, pi_task_t *task) +{ + pi_assert(NULL != device); + pi_assert((NULL != tx_data) && (IS_BUFF_IN_L2(tx_data))); + pi_assert(0 != length); + pi_assert(NULL != task); + + i2c_slave_data_t *slave_data = (i2c_slave_data_t *)device->data; + + int irq = disable_irq(); + i2c_itf_data_t *itf_data = slave_data->itf_data; + + task->data[0] = I2C_WRITE; + task->data[1] = (uintptr_t)buffer; + task->data[2] = (uintptr_t)size; + task->data[3] = (uintptr_t)device; + task->data[4] = (uintptr_t)flags; + + if(!itf_data->end_task) + { // exec transfer + __pi_i2c_write_exec(slave_data, buffer, size, flags, task); + itf_data->end_task = task; + } + else + { + __pi_i2c_drv_fifo_enqueue(itf_data, task); + } + restore_irq(irq); +} + +int pi_i2c_read_timeout(struct pi_device *device, uint8_t *rx_buff, int length, + pi_i2c_xfer_flags_e flags, uint32_t timeout_us) +{ + pi_assert(NULL != device); + pi_assert((NULL != rx_buff) && (IS_BUFF_IN_L2(rx_buff))); + pi_assert(0 != length); + + pi_task_t task_block = {0}; + pi_task_block(&task_block); +#if defined(USE_TIMEOUT) + pi_task_timeout_set(&task_block, timeout_us); +#endif + pi_i2c_read_async(device, rx_buff, length, flags, &task_block); + pi_task_wait_on(&task_block); + int status = pi_task_status_get(&task_block); + //return ((status == -1) ? 
-1 : 0); + return status; +} + + +int pi_i2c_write_timeout(struct pi_device *device, uint8_t *tx_data, int length, + pi_i2c_xfer_flags_e flags, uint32_t timeout_us) +{ + pi_assert(NULL != device); + pi_assert((NULL != tx_data) && (IS_BUFF_IN_L2(tx_data))); + pi_assert(0 != length); + + pi_task_t task_block = {0}; + pi_task_block(&task_block); +#if defined(USE_TIMEOUT) + pi_task_timeout_set(&task_block, timeout_us); +#endif + pi_i2c_write_async(device, tx_data, length, flags, &task_block); + pi_task_wait_on(&task_block); + int status = pi_task_status_get(&task_block); + //return ((status == -1) ? -1 : 0); + return status; +} + +int pi_i2c_get_request_status(pi_task_t* task) +{ + pi_assert(NULL != task); + + return (int) (task->data[0]); +} + + +void pi_i2c_write_read(struct pi_device *device, void *tx_buffer, + void *rx_buffer, uint32_t tx_size, uint32_t rx_size) +{ + pi_task_t task_block; + pi_task_block(&task_block); + pi_i2c_write_read_async(device, tx_buffer, rx_buffer, tx_size, rx_size, &task_block); + pi_task_wait_on(&task_block); +} + + +void pi_i2c_write_read_async(struct pi_device *device, void *tx_buffer, + void *rx_buffer, uint32_t tx_size, uint32_t rx_size, + pi_task_t *task) +{ + pi_assert(NULL != device); + pi_assert((NULL != tx_buffer) && (IS_BUFF_IN_L2(tx_buffer))); + pi_assert((NULL != rx_buffer) && (IS_BUFF_IN_L2(rx_buffer))); + pi_assert(0 != tx_size); + pi_assert(0 != rx_size); + pi_assert(NULL != task); + + i2c_slave_data_t *slave_data = (i2c_slave_data_t *)device->data; + + int irq = disable_irq(); + i2c_itf_data_t *itf_data = slave_data->itf_data; + + task->data[0] = I2C_WRITE_READ; + task->data[1] = (uintptr_t)tx_buffer; + task->data[2] = (uintptr_t)rx_buffer; + task->data[3] = (uintptr_t)device; + task->data[4] = (uintptr_t)tx_size; + task->data[5] = (uintptr_t)rx_size; + + if(!itf_data->end_task) + { + __pi_i2c_write_read_exec(slave_data, tx_buffer, rx_buffer, tx_size, rx_size, task); + itf_data->end_task = task; + } + else + { + 
__pi_i2c_drv_fifo_enqueue(itf_data, task); + } + restore_irq(irq); +} + +void pi_i2c_write_dual_async(struct pi_device *device, void *tx_buffer0, + void *tx_buffer1, uint32_t tx_size0, uint32_t tx_size1, + pi_task_t *task) +{ + pi_assert(NULL != device); + pi_assert((NULL != tx_buffer0) && (IS_BUFF_IN_L2(tx_buffer0))); + pi_assert((NULL != tx_buffer1) && (IS_BUFF_IN_L2(tx_buffer1))); + pi_assert(0 != tx_size0); + pi_assert(0 != tx_size1); + pi_assert(NULL != task); + + i2c_slave_data_t *slave_data = (i2c_slave_data_t *)device->data; + + int irq = disable_irq(); + i2c_itf_data_t *itf_data = slave_data->itf_data; + + task->data[0] = I2C_WRITE_DUAL; + task->data[1] = (uintptr_t)tx_buffer0; + task->data[2] = (uintptr_t)tx_buffer1; + task->data[3] = (uintptr_t)device; + task->data[4] = (uintptr_t)tx_size0; + task->data[5] = (uintptr_t)tx_size1; + + if(!itf_data->end_task) + { + __pi_i2c_write_dual_exec(slave_data, tx_buffer0, tx_buffer1, tx_size0, tx_size1, task); + itf_data->end_task = task; + } + else + { + __pi_i2c_drv_fifo_enqueue(itf_data, task); + } + restore_irq(irq); +} + diff --git a/rtos/pmsis/pmsis_implem/chips/gap9/drivers/i2c/i2c_internal.h b/rtos/pmsis/pmsis_implem/chips/gap9/drivers/i2c/i2c_internal.h new file mode 100644 index 000000000..ed833146c --- /dev/null +++ b/rtos/pmsis/pmsis_implem/chips/gap9/drivers/i2c/i2c_internal.h @@ -0,0 +1,202 @@ +/* + * Copyright (C) 2022 GreenWaves Technologies + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#pragma once
+
+#include "stdlib.h"
+#include "pmsis.h"
+#include "pmsis/drivers/i2c.h"
+#include "udma_i2c.h"
+#include
+
+/* NOTE(review): the enabled I2C_TRACE takes a leading 'level' argument, the disabled one does not - confirm call sites. */
+#if !defined(__FREERTOS__)
+#if !defined(__TRACE_ALL__) && !defined(__TRACE_I2C__)
+#define I2C_TRACE(x...)
+#define I2C_TRACE_ERR(...) ((void) 0)
+#else
+#define I2C_TRACE(level, x...) POS_TRACE(level, "[I2C] " x)
+#define I2C_TRACE_ERR(...) PI_LOG_ERR(__func__, __VA_ARGS__)
+#endif
+#endif
+
+/*! @brief I2C request structure. */
+#define i2c_req_t udma_req_t
+
+#define I2C_READ 0
+#define I2C_WRITE 1
+#define I2C_WRITE_READ 2
+#define I2C_WRITE_DUAL 3
+
+#define I2C_CMD_BUF_SIZE 14
+
+#define I2C_BUF_START_POS 6
+#define I2C_BUF_WRITE_RPT_POS 8
+
+#define I2C_W_R_BUF_WR_POS 2
+#define I2C_W_R_BUF_RESTART_POS 4
+#define RD_BUF_RPT(bits) ((bits) ? 10 : 8)
+#define W_R_BUF_RD(bits) ((bits) ? 8 : 6)
+
+#define RD_BUF_STOP_ID(bits) (RD_BUF_RPT((bits)) + 3)
+#define WR_BUF_STOP_ID 10
+
+#define I2C_SLAVE_GET_ITF(slave) ((slave)->itf_data)
+
+typedef uint32_t i2c_cmd_t;
+
+typedef struct i2c_slave_data
+{
+    struct i2c_itf_data *itf_data; // interface this device was opened on
+    uint32_t cfg;                  // timing command word built by __i2c_prepare_timing()
+    // a slave might answer to up to two addresses, 7 or 10 bits
+    int16_t wait_cycles;           // when non-zero, repeat count of the wait command appended to a transfer
+    uint16_t slave_addr;           // 7-bit mode: conf->cs & 0xFE; 10-bit mode: (slave_addrh << 8) | low address byte
+    uint16_t slave_addrh;          // 10-bit mode only: high address byte
+    int8_t is_10_bits;             // non-zero when 10-bit addressing is used
+} i2c_slave_data_t;
+
+typedef struct i2c_itf_data {
+    uint32_t base;                       // I2C peripheral base address, UDMA_I2C_ADDR(id)
+    pi_task_t *fifo_head;                // oldest queued task, NULL when the queue is empty
+    pi_task_t *fifo_tail;                // newest queued task; only meaningful while fifo_head != NULL
+    pi_task_t *end_task;                 // task currently being executed, NULL when idle
+    i2c_cmd_t cmd_buf[I2C_CMD_BUF_SIZE]; // command word storage (presumably filled by the prepare helpers - TODO confirm)
+    uint32_t rx_chan_addr;               // uDMA linear channel addresses, from pi_udma_core_lin_addr_get()
+    uint32_t tx_chan_addr;
+    uint32_t cmd_chan_addr;
+    // per itf
+    uint8_t open_nb;                     // number of devices currently opened on this interface
+    uint8_t id;                          // interface index (conf->itf)
+    // --- channel event ---
+    int8_t rx_chan_id;                   // uDMA linear channel ids from pi_udma_core_lin_alloc(); -1 is treated as failure
+    int8_t tx_chan_id;
+    int8_t cmd_chan_id;
+    // --- timeout channel ---
+    uint8_t rx_timeout_id;               // 0xFF at open when USE_TIMEOUT is defined; changed through the timeout ioctls
+    uint8_t tx_timeout_id;
+} i2c_itf_data_t;
+
+// Append pi_task at the tail of the pending FIFO. Call with IRQs disabled: the irq handler might pop at the same time
+static inline void __pi_i2c_drv_fifo_enqueue(struct i2c_itf_data *data,
+        pi_task_t *pi_task)
+{
+    if (data->fifo_head)
+    {
+        data->fifo_tail->next = pi_task;
+    }
+    else
+    {
+        data->fifo_head = pi_task;
+    }
+
+    pi_task->next = NULL;
+    data->fifo_tail = pi_task;
+}
+
+// Detach and return the oldest pending task, or NULL when the FIFO is empty
+static inline pi_task_t *__pi_i2c_drv_fifo_pop(struct i2c_itf_data *data)
+{
+    pi_task_t *ret_task = data->fifo_head;
+    if (ret_task)
+    {
+        data->fifo_head = ret_task->next;
+    }
+    return ret_task;
+}
+
+/*
+ * @brief Build the I2C timing command word for a requested maximum baudrate.
+ * The baudrate selects standard / fast / fast-plus mode, which fixes the target
+ * low and high times (ns). A prescaler (input_div) and two 4-bit dividers are then
+ * derived from periph_clock (Hz, truncated to whole MHz) so that
+ * (input_div + 1) * (div + dio) peripheral cycles approximate each target time.
+ * NOTE(review): divL/divH wrap around (unsigned) when a target is shorter than dio cycles.
+ * @return I2C_CMD_TIMING() word: input_div in bits 15:8, divH in bits 7:4, divL in bits 3:0.
+ */
+static inline uint32_t __i2c_prepare_timing(uint32_t max_baudrate,
+        uint32_t periph_clock)
+{
+    pi_i2c_mode_e mode;
+    // TODO: HW, discuss plain divisor, this makes no sense
+    uint32_t input_div = 0, targetL, targetH, divH, divL, cmd;
+
+    // select the mode from the requested baudrate (every value maps to a mode)
+    if(max_baudrate < 200000)
+    {
+        mode = PI_I2C_STD_MODE;
+    }
+    else if(max_baudrate < 750000)
+    {
+        mode = PI_I2C_FAST_MODE;
+    }
+    else
+    {
+        mode = PI_I2C_FAST_MODE_PLUS;
+    }
+
+    // choose target L and H in ns (std = 100KHz, fast mode = 400KHz): aim for the best
+    // frequency without going faster than what the standard allows
+    switch(mode)
+    {
+        case PI_I2C_STD_MODE: // 100KHz - up to 200
+            targetL = 4700; // 4.7 µs
+            targetH = 4000; // 4.0 µs
+            break;
+        case PI_I2C_FAST_MODE: // 400KHz
+            targetL = 1300; // 1.3 µs
+            targetH = 600; // 0.6 µs
+            break;
+        case PI_I2C_FAST_MODE_PLUS: // 1MHz
+            targetL = 500; // 0.5 µs
+            targetH = 260; // 0.260 µs
+            break;
+        default: // not reachable with the mode selection above
+            return PI_FAIL;
+    }
+
+    /* timing method extracted from IP designer python script */
+    uint32_t dio = 3;
+    uint32_t f_pclk_mega = periph_clock / 1000000;
+    input_div = targetL * f_pclk_mega / (1000 * (15 + 1 + dio));
+    divL = targetL * f_pclk_mega / (1000 * (input_div + 1)) - dio;
+    divH = targetH * f_pclk_mega / (1000 * (input_div + 1)) - dio;
+
+    if (divL > 15) // does not fit in 4 bits: bump the prescaler and recompute
+    {
+        input_div = input_div + 1;
+        divL = targetL * f_pclk_mega / (1000 * (input_div + 1)) - dio; // keep the ns * MHz / 1000 scaling used above
+        divH = targetH * f_pclk_mega / (1000 * (input_div + 1)) - dio;
+    }
+
+    cmd = I2C_CMD_TIMING(((input_div&0xFF) << 8) | ((divH&0xF) << 4)
+            | (divL&0xF));
+    return cmd;
+}
+
+void __pi_i2c_timeout_abort(void* arg);
+
+/* FreeRTOS only (no-op otherwise): configure the uDMA timeout for task and register abort_func(arg) as its timeout callback. */
+static inline void __pi_i2c_timeout_config_set(pi_task_t *task, uint8_t timeout_id,
+        uint8_t udma_chan_id, uint32_t timeout_us,
+        pi_callback_func_t abort_func,
+        void *arg)
+{
+#if defined(__FREERTOS__)
+    pi_udma_timeout_config_set(task, timeout_id, udma_chan_id, timeout_us);
+    pi_task_timeout_callback_set(task, abort_func, arg);
+#endif
+}
diff --git a/rtos/pmsis/pmsis_implem/chips/gap9/drivers/i2c/i2c_slave.c b/rtos/pmsis/pmsis_implem/chips/gap9/drivers/i2c/i2c_slave.c
new file mode 100644
index 000000000..25e378d9b
--- /dev/null
+++ b/rtos/pmsis/pmsis_implem/chips/gap9/drivers/i2c/i2c_slave.c
@@ -0,0 +1,387 @@
+/*
+ * Copyright (C) 2022 GreenWaves Technologies
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +#include "pmsis.h" +#include "i2c_internal.h" +#include "pmsis/drivers/i2c_slave.h" +#include "i2c_slave_internal.h" + + +/** Internal defines **/ +#define I2C_SLAVE_ADDR0 0 +#define I2C_SLAVE_ADDR1 1 +#define I2C_ADDR_PUSH_DISABLE 0 +#define I2C_ADDR_PUSH_ENABLE 1 + +struct i2c_slave_itf_data *__global_i2c_slave_itf_data[UDMA_NB_I2C]; + + +static void __pi_i2c_slave_handle_error(struct i2c_slave_itf_data *itf_data) +{ + I2C_TRACE("I2C_slave(%d)->slave_error_handler\n", device_id); + // I2C_STATUS_FOLL_ERROR_ARLO_EVENT & I2C_STATUS_FOLL_ERROR_FRAMING_EVENT + // => clear status, unlock and purge + int device_id = itf_data->id; + uint32_t arlo = __pi_i2c_get_event_status(itf_data->base, I2C_STATUS_FOLL_ERROR_ARLO_EVENT); + uint32_t framing = __pi_i2c_get_event_status(itf_data->base, I2C_STATUS_FOLL_ERROR_FRAMING_EVENT); + if (arlo || framing) + { + I2C_TRACE("I2C_slave(%d)->slave_error_handler - arbitration loss or framing error\n", device_id); + udma_i2c_status_reg_idx_set(itf_data->base, + (arlo << I2C_STATUS_FOLL_ERROR_ARLO_EVENT) | + (framing << I2C_STATUS_FOLL_ERROR_FRAMING_EVENT) | + (1 << I2C_FLAG_FOLL_UNLOCK_EVENT_O) | + (1 << I2C_FLAG_FOLL_PURGE_EVENT_O)); + } +} + +__attribute__((section(".text"))) __noinline +void __pi_i2c_slave_event_handler(uint32_t event, void *arg) +{ + struct i2c_slave_itf_data *itf_data = arg; + int device_id = itf_data->id; + I2C_TRACE("I2C_slave(%d)->slave_event_handler\n", device_id); + + // even for tx, there is a pseudo rx for addr matcher byte + uint8_t *l2_buffer = itf_data->rx_buffer; + + uint8_t addr_byte = *l2_buffer; + uint8_t match = addr_byte >> 6; + l2_buffer++; + + struct pi_i2c_slave_args slave_args = { + .slave_addr = itf_data->addr[match], + .handle = arg, + .ret = PI_FAIL, // if callee does not fill it, consider it a failure + .itf_id = device_id, + }; + + // First check whether we're here because of read or send + // then let the user supplied callback execute + // User will have the responsibility of 
unlocking the udma once callback + // is done + if(__pi_i2c_get_event_status(itf_data->base, I2C_STATUS_FOLL_ERROR_ARLO_EVENT) + ||__pi_i2c_get_event_status(itf_data->base, I2C_STATUS_FOLL_ERROR_FRAMING_EVENT)) + { + // TODO should we execute callback if there is an error ? + __pi_i2c_slave_handle_error(itf_data); + } + if(__pi_i2c_get_event_status(itf_data->base, I2C_STATUS_FOLL_EOF_RCV_EVENT)) + { + uint32_t bytes_left = pi_udma_core_lin_bytes_left_get(itf_data->rx_chan_addr); + uint32_t size = itf_data->rx_buffer_size - bytes_left; + slave_args.nb_bytes = size - 1; // remove push byte + slave_args.l2_buffer = l2_buffer; // take buffer minus first byte + if(itf_data->rx_callback) + { + itf_data->rx_callback(&slave_args); + } + } + if(__pi_i2c_get_event_status(itf_data->base, + I2C_STATUS_FOLL_EOF_SND_EVENT)) + { + uint32_t bytes_left = pi_udma_core_lin_bytes_left_get(itf_data->tx_chan_addr); + uint32_t size = itf_data->tx_buffer_size - bytes_left; + slave_args.nb_bytes = size; + slave_args.l2_buffer = itf_data->tx_buffer; + if(itf_data->tx_callback) + { + itf_data->tx_callback(&slave_args); + } + } + return; +} + +/* + * @brief Internal open of i2c interface + * Caller must provide synchronization + */ +int pi_i2c_slave_open(struct pi_device *device) +{ + pi_assert((NULL != device) && (NULL != device->config)); + struct pi_i2c_slave_conf *conf = device->config; + + struct i2c_slave_itf_data *itf_data = NULL; + // check interface first + I2C_TRACE("I2C_slave(%d)->open_slave\n", conf->itf); + int irq = disable_irq(); + if(!(itf_data = __global_i2c_slave_itf_data[conf->itf])) + { + uint32_t i2c_base = UDMA_I2C_ADDR(conf->itf); + + // prepare itf struct + itf_data = pi_fc_l1_malloc(sizeof(struct i2c_slave_itf_data)); + I2C_TRACE("I2C_slave(%d)->itf_data=%x\n", conf->itf, itf_data); + if(itf_data == NULL) + { + restore_irq(irq); + return -1; + } + __global_i2c_slave_itf_data[conf->itf] = itf_data; + memset(itf_data,0,sizeof(struct i2c_slave_itf_data)); + itf_data->id 
= conf->itf; + itf_data->base = i2c_base; + + // disable udma reset before setting regs + udma_ctrl_cfg_rstn_set_set(UDMA_CTRL_ADDR, 1 << UDMA_I2C_ID(conf->itf)); + udma_ctrl_cfg_cg_set_set(UDMA_CTRL_ADDR, 1 << UDMA_I2C_ID(conf->itf)); + + itf_data->rx_chan_id = pi_udma_core_lin_alloc(); + I2C_TRACE("I2C(%d)->rx chan id = %x\n", conf->itf, itf_data->rx_chan_id); + itf_data->tx_chan_id = pi_udma_core_lin_alloc(); + itf_data->cmd_chan_id = pi_udma_core_lin_alloc(); + udma_i2c_udma_cmd_dest_reg_idx_set(i2c_base, itf_data->cmd_chan_id); + + itf_data->rx_chan_addr = pi_udma_core_lin_addr_get(itf_data->rx_chan_id); + itf_data->tx_chan_addr = pi_udma_core_lin_addr_get(itf_data->tx_chan_id); + itf_data->cmd_chan_addr = pi_udma_core_lin_addr_get(itf_data->cmd_chan_id); + + // Master init procedure + int cmd_buf_id = 0; + uint32_t cmd_buf[8]; + + udma_i2c_status_reg_idx_set(i2c_base, 1<max_baudrate, + pi_freq_get(PI_FREQ_DOMAIN_PERIPH)); + if(conf->addr0 != 0) + { + itf_data->addr[0] = conf->addr0; + cmd_buf[cmd_buf_id++] = CMD_FOLL_ADDR(I2C_SLAVE_ADDR0, + I2C_ADDR_PUSH_ENABLE, conf->addr0_10_bit, conf->addr0, + conf->mask0, conf->sof0, conf->eof0); + } + if(conf->addr1 != 0) + { + itf_data->addr[1] = conf->addr1; + cmd_buf[cmd_buf_id++] = CMD_FOLL_ADDR(I2C_SLAVE_ADDR1, + I2C_ADDR_PUSH_ENABLE, conf->addr1_10_bit, conf->addr1, + conf->mask1, conf->sof1, conf->eof1); + } + cmd_buf[cmd_buf_id++] = I2C_CMD_EVENT(1); + pi_udma_core_lin_enqueue(itf_data->cmd_chan_addr, (uint32_t)cmd_buf, + cmd_buf_id*sizeof(uint32_t), 0); + + while(!__pi_i2c_get_event_status(i2c_base,I2C_FLAG_CMD_EVENT_I)) + { + pi_time_wait_us(1); + } + udma_i2c_status_reg_idx_set(i2c_base, 1<rx_chan_id); + udma_i2c_foll_udma_tx_dest_reg_idx_set(i2c_base, itf_data->tx_chan_id); + pi_fc_event_handler_set(SOC_EVENT_UDMA_I2C_SLAVE_EVT(conf->itf), + __pi_i2c_slave_event_handler, itf_data); + pi_soc_eu_fc_mask_set(SOC_EVENT_UDMA_I2C_SLAVE_EVT(conf->itf)); + + itf_data->rx_callback = conf->rx_callback; + 
itf_data->tx_callback = conf->tx_callback; + + } + device->data = itf_data; + itf_data->open_nb++; + restore_irq(irq); + return 0; +} + +void pi_i2c_slave_close(struct pi_device *device) +{ + pi_assert(NULL != device); + + struct i2c_slave_itf_data *itf_data = (struct i2c_slave_itf_data *)device->data; + + int irq = disable_irq(); + itf_data->open_nb--; + if(itf_data->open_nb == 0) + { + // flush channels + pi_udma_core_lin_free(itf_data->rx_chan_id); + pi_udma_core_lin_free(itf_data->tx_chan_id); + pi_udma_core_lin_free(itf_data->cmd_chan_id); + + pi_soc_eu_fc_mask_clear(SOC_EVENT_UDMA_CHAN_LIN(itf_data->rx_chan_id)); + pi_soc_eu_fc_mask_clear(SOC_EVENT_UDMA_CHAN_LIN(itf_data->tx_chan_id)); + pi_soc_eu_fc_mask_clear(SOC_EVENT_UDMA_CHAN_LIN(itf_data->cmd_chan_id)); + + pi_udma_core_lin_reset(itf_data->rx_chan_addr); + pi_udma_core_lin_reset(itf_data->tx_chan_addr); + pi_udma_core_lin_reset(itf_data->cmd_chan_addr); + + pi_soc_eu_fc_mask_clear(SOC_EVENT_UDMA_I2C_SLAVE_EVT(itf_data->id)); + + // disable udma reset before setting regs + udma_ctrl_cfg_cg_clr_set(ARCHI_UDMA_ADDR, 1 << UDMA_I2C_ID(itf_data->id)); + udma_ctrl_cfg_rstn_clr_set(ARCHI_UDMA_ADDR, 1 << UDMA_I2C_ID(itf_data->id)); + __global_i2c_slave_itf_data[itf_data->id] = NULL; + pi_fc_l1_free(itf_data,sizeof(itf_data)); + } + restore_irq(irq); +} + + +void pi_i2c_slave_conf_init(pi_i2c_slave_conf_t *conf) +{ + pi_assert(NULL != conf); + + conf->max_baudrate = 400000; + conf->itf = 0; + conf->addr0 = 0; + conf->addr1 = 0; + conf->sof0 = 0; + conf->eof0 = 0; + conf->sof1 = 0; + conf->eof1 = 0; + conf->addr0_10_bit = 0; + conf->addr1_10_bit = 0; + conf->mask0 = 0x1F; + conf->mask1 = 0x1F; + conf->addr0 = 0; + conf->addr1 = 0; + conf->rx_callback = NULL; + conf->tx_callback = NULL; +} + + +/** accessors **/ + +void pi_i2c_slave_conf_set_addr0(struct pi_i2c_slave_conf *conf, uint16_t addr, + uint8_t mask, uint8_t is_10_bit, uint8_t eof, uint8_t sof) +{ + pi_assert(NULL != conf); + + conf->addr0 = addr; + 
conf->mask0 = mask; + conf->addr0_10_bit = is_10_bit; + conf->eof0 = eof; + conf->sof0 = sof; +} + +void pi_i2c_slave_conf_set_addr1(struct pi_i2c_slave_conf *conf, uint16_t addr, + uint8_t mask, uint8_t is_10_bit, uint8_t eof, uint8_t sof) +{ + pi_assert(NULL != conf); + + conf->addr1 = addr; + conf->mask1 = mask; + conf->addr1_10_bit = is_10_bit; + conf->eof1 = eof; + conf->sof1 = sof; +} + +void pi_i2c_slave_set_rx_channel(void *handle, + void *l2_addr, uint32_t size) +{ + struct i2c_slave_itf_data *itf_data = (struct i2c_slave_itf_data *)handle; + + pi_assert(NULL != itf_data); + pi_assert((NULL != l2_addr) && (IS_BUFF_IN_L2(l2_addr))); + pi_assert(0 != size); + + int irq = disable_irq(); + pi_udma_core_lin_enqueue(itf_data->rx_chan_addr, (uint32_t)l2_addr, size, 0); + itf_data->rx_buffer = l2_addr; + itf_data->rx_buffer_size = size; + restore_irq(irq); +} + +void pi_i2c_slave_set_tx_channel(void *handle, + void *l2_addr, uint32_t size) +{ + struct i2c_slave_itf_data *itf_data = (struct i2c_slave_itf_data *)handle; + + pi_assert(NULL != itf_data); + pi_assert((NULL != l2_addr) && (IS_BUFF_IN_L2(l2_addr))); + pi_assert(0 != size); + + int irq = disable_irq(); + pi_udma_core_lin_enqueue(itf_data->tx_chan_addr, (uint32_t)l2_addr, size, 0); + itf_data->tx_buffer = l2_addr; + itf_data->tx_buffer_size = size; + restore_irq(irq); +} + +void pi_i2c_slave_unlock(void *handle, int is_rd) +{ + struct i2c_slave_itf_data *itf_data = (struct i2c_slave_itf_data *)handle; + + pi_assert(NULL != handle); + + int device_id = itf_data->id; + if(is_rd) + { + udma_i2c_status_reg_idx_set(itf_data->base,(1<base,(1<rx_chan_addr, (uint32_t)l2_addr, size, 0); + itf_data->rx_buffer = l2_addr; + itf_data->rx_buffer_size = size; + restore_irq(irq); +} + +void pi_i2c_slave_set_tx(void *handle, void *l2_addr, + uint32_t size) +{ + struct i2c_slave_itf_data *itf_data = (struct i2c_slave_itf_data *)handle; + + pi_assert(NULL != itf_data); + pi_assert((NULL != l2_addr) && 
(IS_BUFF_IN_L2(l2_addr))); + pi_assert(0 != size); + + int irq = disable_irq(); + pi_udma_core_lin_enqueue(itf_data->tx_chan_addr, (uint32_t)l2_addr, size, 0); + itf_data->tx_buffer = l2_addr; + itf_data->tx_buffer_size = size; + restore_irq(irq); +} + +void pi_i2c_slave_stop_rx(void *handle) +{ + struct i2c_slave_itf_data *itf_data = (struct i2c_slave_itf_data *)handle; + + pi_assert(NULL != itf_data); + + int irq = disable_irq(); + pi_udma_core_lin_stop(itf_data->rx_chan_addr); + restore_irq(irq); +} + +void pi_i2c_slave_stop_tx(void *handle) +{ + struct i2c_slave_itf_data *itf_data = (struct i2c_slave_itf_data *)handle; + + pi_assert(NULL != itf_data); + + int irq = disable_irq(); + pi_udma_core_lin_stop(itf_data->tx_chan_addr); + restore_irq(irq); +} diff --git a/rtos/pmsis/pmsis_implem/chips/gap9/drivers/i2c/i2c_slave_internal.h b/rtos/pmsis/pmsis_implem/chips/gap9/drivers/i2c/i2c_slave_internal.h new file mode 100644 index 000000000..70ec112cf --- /dev/null +++ b/rtos/pmsis/pmsis_implem/chips/gap9/drivers/i2c/i2c_slave_internal.h @@ -0,0 +1,61 @@ +/* + * Copyright (C) 2022 GreenWaves Technologies + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "stdlib.h" +#include "pmsis.h" +#include "pmsis/drivers/i2c_slave.h" + +#ifndef I2C_DRIVER_DATA_IMPLEM_SPECIFC + #define I2C_DRIVER_DATA_IMPLEM_SPECIFC +#endif + +/*! @brief UART request structure. 
*/ +#define i2c_req_t udma_req_t + +typedef uint32_t i2c_cmd_t; + +struct i2c_slave_itf_data { + uint32_t base; + + // per itf + uint32_t rx_chan_addr; + uint32_t tx_chan_addr; + uint32_t cmd_chan_addr; + + uint8_t open_nb; + uint8_t id; + // --- channel event --- + uint8_t rx_chan_id; + uint8_t tx_chan_id; + uint8_t cmd_chan_id; + // -- addresses to which we answer + uint8_t addr0_10_bit; + uint8_t addr1_10_bit; + uint8_t addr0_mask; + uint8_t addr1_mask; + uint16_t addr[4]; + // in handler callbacks + pi_i2c_callback_t rx_callback; + pi_i2c_callback_t tx_callback; + // buffers + void *rx_buffer; + uint32_t rx_buffer_size; + void *tx_buffer; + uint32_t tx_buffer_size; + I2C_DRIVER_DATA_IMPLEM_SPECIFC +}; diff --git a/rtos/pmsis/pmsis_implem/chips/gap9/drivers/i2c/udma_i2c.h b/rtos/pmsis/pmsis_implem/chips/gap9/drivers/i2c/udma_i2c.h new file mode 100644 index 000000000..02244b182 --- /dev/null +++ b/rtos/pmsis/pmsis_implem/chips/gap9/drivers/i2c/udma_i2c.h @@ -0,0 +1,77 @@ +/* + * Copyright (C) 2022 GreenWaves Technologies + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +typedef enum i2c_mode { + PI_I2C_STD_MODE, + PI_I2C_FAST_MODE, + PI_I2C_FAST_MODE_PLUS, + PI_I2C_NB_MODES +} pi_i2c_mode_e; + +// TEMPORARY, until can be generated +#define I2C_STATUS_FOLL_EOF_RCV_EVENT 2 +#define I2C_STATUS_FOLL_EOF_SND_EVENT 3 +#define I2C_STATUS_FOLL_ERROR_ARLO_EVENT 4 +#define I2C_STATUS_FOLL_ERROR_FRAMING_EVENT 5 +#define I2C_FLAG_CMD_EVENT_I 19 +#define I2C_FLAG_FOLL_UNLOCK_EVENT_O 14 +#define I2C_FLAG_FOLL_PURGE_EVENT_O 15 +#define I2C_STATUS_ERROR_NACK_EVENT 16 +#define I2C_STATUS_ERROR_ARLO_EVENT 17 +#define I2C_STATUS_ERROR_FRAMING_EVENT 18 +#define I2C_FLAG_UNLOCK_EVENT_O 22 +#define I2C_FLAG_PURGE_EVENT_O 23 +#define I2C_FLAG_SOFT_RESET_EVENT 24 +#define I2C_FLAG_PRESC_DIV10_EVENT_O 25 + +#define I2C_CMD_TIMING(T) ((0x10 << 24) | (T)) +#define I2C_CMD_EVENT(T) ((0x41 << 24)) + + +#define CMD_FOLL_ADDR(match_id,push_en,addr_10_bit,slave_addr,mask,sof,eof) \ + ((0x20 << 24) | (match_id << 22) | (push_en << 21) | (eof << 20) \ + | (sof << 19) | (eof << 18) | (sof << 17) | (addr_10_bit << 16) \ + | ((!(addr_10_bit)) << 15) | (mask << 10) \ + | ((addr_10_bit) ? (slave_addr) : ((slave_addr)>>1) << 0)) + +/* enable automatic sending of stop when receiving a nack error */ +#define __I2C_NACK_STOP ((1 << 23)) + + +#define I2C_CMD_LEAD_START(T) ((0x30 << 24) | __I2C_NACK_STOP) +#define I2C_CMD_MISC_WAIT(T) ((0x3<<24)) +#define I2C_CMD_NOP(T) ((0x0<<24)) +// add -1, well, hw guys saving on bits... 
+#define I2C_CMD_RPT(T) ((0x02 << 24) | (((T)&0xFFFF))) +#define I2C_CMD_LEAD_SEND_IMM(T) ((0x32 << 24) | __I2C_NACK_STOP | ((T)&0xFF)) +#define I2C_CMD_LEAD_SEND_IMM_ADDR(IS_10BITS, T) ((0x37 << 24) | __I2C_NACK_STOP | (IS_10BITS << 15) | ((T)&0xFFFF)) +#define I2C_CMD_MISC_RECEIVE(T) ((0x33 << 24) | __I2C_NACK_STOP) +#define I2C_CMD_MISC_RECEIVE_LAST(T) ((0x34 << 24) | __I2C_NACK_STOP) +#define I2C_CMD_MISC_SEND(T) ((0x31 << 24) | __I2C_NACK_STOP) +#define I2C_CMD_STOP(T) ((0x36 << 24) | __I2C_NACK_STOP) +#define I2C_CMD_UDMA_TX_CHAN_CFG(T) (( 0x50 << 24 ) | (T)) +#define I2C_CMD_UDMA_RX_CHAN_CFG(T) (( 0x51 << 24 ) | (T)) + +#define I2C_CHAN_ADDR_REG 0x0 +#define I2C_CHAN_SIZE_REG 0x2 +#define I2C_CHAN_CFG_REG 0x7 + + + +/** get_current value of event in i2c event register. Defined as a macro to work with O0 **/ +#define __pi_i2c_get_event_status(base, event_id) __BITEXTRACT(udma_i2c_status_reg_idx_get(base), 1, event_id) diff --git a/rtos/pmsis/pmsis_implem/chips/gap9/drivers/i2s/i2s.c b/rtos/pmsis/pmsis_implem/chips/gap9/drivers/i2s/i2s.c index 8f0127712..44c869014 100644 --- a/rtos/pmsis/pmsis_implem/chips/gap9/drivers/i2s/i2s.c +++ b/rtos/pmsis/pmsis_implem/chips/gap9/drivers/i2s/i2s.c @@ -21,10 +21,6 @@ #include #include -#if defined(__FREERTOS__) -#define pi_task_push_irq_safe(pi_task) pi_task_push((pi_task)) -#endif /* __FREERTOS__ */ - #define I2S_NB_SLOTS 16 typedef struct @@ -143,11 +139,7 @@ static void *__pi_i2s_ring_buffer_pop(__pi_i2s_slot_t *slot) } -#if !defined(__FREERTOS__) -static void __pi_i2s_handle_rx_frame(int event, void *arg) -#else -static void __pi_i2s_handle_rx_frame(void *arg) -#endif /* __FREERTOS__ */ +static void __pi_i2s_handle_rx_frame(uint32_t event, void *arg) { __pi_i2s_slot_t *frame_slot = (__pi_i2s_slot_t *)arg; __pi_i2s_t *i2s = frame_slot->i2s; @@ -204,7 +196,7 @@ static void __pi_i2s_handle_rx_frame(void *arg) frame_slot->tx_buffer0 = frame_slot->tx_buffer1; frame_slot->tx_buffer1 = __pi_i2s_ring_buffer_pop(frame_slot); - 
+ if (frame_slot->tx_buffer1) { uint32_t frame = frame_slot->frame; @@ -253,11 +245,7 @@ static void __pi_i2s_handle_rx_frame(void *arg) } } -#if !defined(__FREERTOS__) -static void __pi_i2s_handle_tx_frame(int event, void *arg) -#else -static void __pi_i2s_handle_tx_frame(void *arg) -#endif /* __FREERTOS__ */ +static void __pi_i2s_handle_tx_frame(uint32_t event, void *arg) { __pi_i2s_slot_t *frame_slot = (__pi_i2s_slot_t *)arg; __pi_i2s_t *i2s = frame_slot->i2s; @@ -321,11 +309,7 @@ static void __pi_i2s_handle_tx_frame(void *arg) } -#if !defined(__FREERTOS__) -static void __pi_i2s_handle_rx_channel(int event, void *arg) -#else -static void __pi_i2s_handle_rx_channel(void *arg) -#endif /* __FREERTOS__ */ +static void __pi_i2s_handle_rx_channel(uint32_t event, void *arg) { __pi_i2s_slot_t *slot = (__pi_i2s_slot_t *)arg; __pi_i2s_t *i2s = slot->i2s; @@ -390,7 +374,7 @@ static void __pi_i2s_handle_rx_channel(void *arg) slot->tx_buffer0 = slot->tx_buffer1; slot->tx_buffer1 = __pi_i2s_ring_buffer_pop(slot); - + if (slot->tx_buffer1) { uint32_t base = slot->channel_base; @@ -424,11 +408,7 @@ static void __pi_i2s_handle_rx_channel(void *arg) } } -#if !defined(__FREERTOS__) -static void __pi_i2s_handle_tx_channel(int event, void *arg) -#else -static void __pi_i2s_handle_tx_channel(void *arg) -#endif /* __FREERTOS__ */ +static void __pi_i2s_handle_tx_channel(uint32_t event, void *arg) { __pi_i2s_slot_t *slot = (__pi_i2s_slot_t *)arg; __pi_i2s_t *i2s = slot->i2s; @@ -505,8 +485,7 @@ static inline void __pi_i2s_enqueue_buffer(__pi_i2s_slot_t *slot, void *buffer) slot->ring_buffer.buffer[slot->ring_buffer.head++] = buffer; if (slot->ring_buffer.head == slot->ring_buffer.nb_elem) slot->ring_buffer.head = 0; - - + slot->ring_buffer.current_nb_elem++; } @@ -637,7 +616,6 @@ int __pi_i2s_channel_conf_set(struct pi_device *device, uint32_t frame, int slot unsigned int loopback = __BITEXTRACTU(conf->options, 1, PI_I2S_OPT_LOOPBACK_ENA_SHIFT); int ring_buffer_nb_elem = conf->mem_slab 
? conf->mem_slab->num_blocks : 2; int ring_buffer_size = sizeof(void *) * ring_buffer_nb_elem; - int use_buffers = ((conf->pingpong_buffers[0] != NULL || conf->mem_slab) && loopback == 0) || frame; slot = pi_fc_l1_malloc(sizeof(__pi_i2s_slot_t) + ring_buffer_size); if (slot == NULL) @@ -671,7 +649,7 @@ int __pi_i2s_channel_conf_set(struct pi_device *device, uint32_t frame, int slot slot->tx_task1 = NULL; slot->ignore_first_error = 0; - if (use_buffers) + if (conf->asrc_channel == -1 && loopback == 0) { int channel = pi_udma_core_lin_alloc(); if (channel == -1) @@ -819,8 +797,8 @@ int pi_i2s_open(struct pi_device *device) if (i2s->open_count == 1) { int periph_id = UDMA_I2S_ID(itf_id); - udma_ctrl_cfg_rstn_set_set(ARCHI_UDMA_ADDR, 1 << periph_id); - udma_ctrl_cfg_cg_set_set(ARCHI_UDMA_ADDR, 1 << periph_id); + udma_ctrl_cfg_rstn_set_set(UDMA_CTRL_ADDR, 1 << periph_id); + udma_ctrl_cfg_cg_set_set(UDMA_CTRL_ADDR, 1 << periph_id); i2s->itf = itf_id; i2s->errors = 0; @@ -959,8 +937,8 @@ void pi_i2s_close(struct pi_device *device) // And deactivated device int periph_id = UDMA_I2S_ID(i2s->itf); - udma_ctrl_cfg_rstn_clr_set(ARCHI_UDMA_ADDR, 1 << periph_id); - udma_ctrl_cfg_cg_clr_set(ARCHI_UDMA_ADDR, 1 << periph_id); + udma_ctrl_cfg_rstn_clr_set(UDMA_CTRL_ADDR, 1 << periph_id); + udma_ctrl_cfg_cg_clr_set(UDMA_CTRL_ADDR, 1 << periph_id); } } @@ -1024,19 +1002,11 @@ static void __pi_i2s_slot_stop(__pi_i2s_slot_t *slot) if (slot->is_rx) { - #if !defined(__FREERTOS__) __pi_i2s_handle_rx_channel(0, slot); - #else - __pi_i2s_handle_rx_channel(slot); - #endif /* __FREERTOS__ */ } else { - #if !defined(__FREERTOS__) __pi_i2s_handle_tx_channel(0, slot); - #else - __pi_i2s_handle_tx_channel(slot); - #endif /* __FREERTOS__ */ } } @@ -1271,7 +1241,7 @@ int pi_i2s_channel_write_async(struct pi_device *dev, int channel, // them if they are different. 
task->data[0] = 0; - if (!slot->mem_slab) + if (slot->pingpong_buffers[0]) { int buffer_index = slot->current_buffer; mem_block = slot->pingpong_buffers[buffer_index]; diff --git a/rtos/pmsis/pmsis_implem/chips/gap9/drivers/spim/spim.c b/rtos/pmsis/pmsis_implem/chips/gap9/drivers/spim/spim.c new file mode 100644 index 000000000..025d0afad --- /dev/null +++ b/rtos/pmsis/pmsis_implem/chips/gap9/drivers/spim/spim.c @@ -0,0 +1,1204 @@ +/* + * Copyright (C) 2019 GreenWaves Technologies + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/* + * Authors: Germain Haugou, GreenWaves Technologies (germain.haugou@greenwaves-technologies.com) + */ + +#include +#include +#include "pmsis.h" + +#include "spim_v4.h" +#include "chips/gap9/drivers/udma/udma_core.h" +#include "pmsis/task.h" + +#if !defined(__FREERTOS__) +#define SOC_EVENT_UDMA_SPIM_EOT(id) (80 + id + 0) +#else +#define likely(x) (__builtin_expect(x, 1)) +#endif /* __FREERTOS__ */ + +#ifndef SPIM_TRACE +#if !defined(__TRACE_ALL__) && !defined(__TRACE_SPIM__) +#define SPIM_TRACE(x...) +#else +#define SPIM_TRACE(level, x...) 
POS_TRACE(level, "[SPIM] " x) +#endif +#endif + +#define PI_SPIM_UDMA_CMD_SIZE 4 + +typedef struct +{ + uint32_t *temp_buff; + uint32_t addr; + uint32_t size; + uint32_t end; +} pi_spim_pending_transfert_t; + + +typedef struct +{ + uint32_t udma_cmd[PI_SPIM_UDMA_CMD_SIZE]; + uint32_t temp_buff[2]; +} pi_spim_l2_t; + +typedef struct +{ + pi_task_t *pending_copy; + pi_task_t *waiting_first; + pi_task_t *waiting_last; + unsigned int pending_repeat_base; + unsigned int pending_repeat_callback; + unsigned int pending_repeat_asm_callback; + unsigned int pending_repeat_misaligned_size; + unsigned int pending_repeat_misaligned_addr; + unsigned int pending_repeat_misaligned_ram_addr; + unsigned int pending_repeat_misaligned_end; + unsigned int pending_repeat_misaligned_length; + unsigned int pending_repeat_misaligned_stride; + unsigned int pending_repeat_misaligned_2d_size; + unsigned int pending_chunk_size; + unsigned int pending_send_cmd; + unsigned int pending_eot; + void (*pending_callback)(uint32_t event, void *arg); + uint32_t udma_cmd[PI_SPIM_UDMA_CMD_SIZE]; + int open_count; + int id; + int is_slave; + pi_task_t task; + unsigned int pending_repeat_addr; + unsigned int pending_repeat_dup_addr; + unsigned int pending_repeat_send; + unsigned int pending_repeat_flags; + pi_spim_pending_transfert_t pending_transfers[2]; + int pending_transfer_index; + int pending_transfer_read_index; + struct pi_device *pending_repeat_device; + uint32_t periph_base; + uint32_t rx_cmd; + uint32_t tx_cmd; + int channel; + int pending_is_auto; + uint32_t cmd_channel_base; + uint32_t tx_channel_base; + uint32_t rx_channel_base; + int cmd_channel; + int tx_channel; + int rx_channel; +} pi_spim_t; + +#define PI_SPIM_T_PENDING_COPY 0 +#define PI_SPIM_T_WAITING_FIRST 4 +#define PI_SPIM_T_WAITING_LAST 8 +#define PI_SPIM_T_REPEAT_BASE 12 +//#define PI_SPIM_T_REPEAT_LEN 16 +#define PI_SPIM_T_REPEAT_CALLBACK 16 +#define PI_SPIM_T_REPEAT_ASM_CALLBACK 20 + +typedef struct +{ + pi_spim_t *spim; + 
uint32_t rx_cmd; + uint32_t tx_cmd; + uint8_t *receive_addr_ucode; + uint32_t receive_addr_ucode_size; + uint8_t *send_addr_ucode; + uint32_t send_addr_ucode_size; + uint32_t *udma_receive_cmd; + uint32_t *udma_send_cmd; + uint32_t udma_receive_cmd_size; + uint32_t udma_send_cmd_size; + int max_baudrate; + unsigned int cfg; + unsigned int periph_base; + char cs; + char wordsize; + char big_endian; + char channel; + char byte_align; + unsigned char div; + char polarity; + char phase; + uint32_t max_rcv_size; + uint32_t max_snd_size; +} pi_spim_cs_t; + +typedef struct { + unsigned int cmd[4]; +} pi_spim_cmd_t; + + +static PI_L2 pi_spim_t g_spim_data[ARCHI_UDMA_NB_SPIM]; + +void pi_spim_handle_waiting_copy(pi_task_t *task); + +void pi_spim_handle_eot(uint32_t event, void *arg) +{ + pi_spim_t *spim = (pi_spim_t *) arg; + + pi_task_t *task = spim->pending_copy; + spim->pending_copy = NULL; + + /* handle current task end */ + pi_task_push_irq_safe(task); + + task = spim->waiting_first; + if (task) + { + spim->waiting_first = task->next; + pi_spim_handle_waiting_copy(task); + } +} + +void pi_spim_handle_rx_copy(uint32_t event, void *arg) +{ + pi_soc_eu_fc_mask_clear(event); + pi_spim_handle_eot(event, arg); +} + +void pi_spim_handle_tx_copy(uint32_t event, void *arg) +{ + pi_soc_eu_fc_mask_clear(event); + pi_spim_handle_eot(event, arg); +} + + +static int pi_spim_get_div(uint32_t spi_freq) +{ + uint32_t periph_freq = pi_freq_get(PI_FREQ_DOMAIN_PERIPH); + + if (spi_freq >= periph_freq) + { + return 0; + } + else + { + // Round-up the divider to obtain an SPI frequency which is below the maximum + int div = (periph_freq + spi_freq - 1)/ spi_freq; + // The SPIM always divide by 2 once we activate the divider, thus increase by 1 + // in case it is even to not go above the max frequency. 
+ if (div & 1) div += 1; + return div; + } +} + + + +static inline int pi_spim_get_byte_align(int wordsize, int big_endian) +{ + return wordsize == PI_SPI_WORDSIZE_32 && big_endian; +} + +static void pi_spim_apply_conf(pi_spim_cs_t *spim_cs) +{ + if (spim_cs->udma_receive_cmd) + { + spim_cs->udma_receive_cmd[0] = spim_cs->cfg; + spim_cs->udma_receive_cmd[1] = SPI_CMD_SOT(spim_cs->cs); + } + + if (spim_cs->udma_send_cmd) + { + spim_cs->udma_send_cmd[0] = spim_cs->cfg; + spim_cs->udma_send_cmd[1] = SPI_CMD_SOT(spim_cs->cs); + } + + spim_cs->rx_cmd = SPI_CMD_RX_DATA(1, SPI_CMD_4_WORD_PER_TRANSF, 8, 0, 0); + spim_cs->tx_cmd = SPI_CMD_TX_DATA(1, SPI_CMD_4_WORD_PER_TRANSF, 8, 0, 0); +} + +int pi_spi_open(struct pi_device *device) +{ + uint32_t irq = pi_irq_disable(); + + struct pi_spi_conf *conf = (struct pi_spi_conf *) device->config; + + int periph_id = ARCHI_UDMA_SPIM_ID(conf->itf); + + SPIM_TRACE(POS_LOG_INFO, "Opening SPIM device (device: %p, id: %d, cs: %d, max_baudrate: %d, wordsize: %d, big_endian: %d, polarity: %d, phase: %d)\n", + device, conf->itf, conf->cs, conf->max_baudrate, conf->wordsize, conf->big_endian, conf->polarity, conf->phase); + + pi_spim_t *spim = &g_spim_data[conf->itf]; + + pi_spim_cs_t *spim_cs = pmsis_l2_malloc(sizeof(pi_spim_cs_t)); + if (spim_cs == NULL) + { + SPIM_TRACE(POS_LOG_WARNING, "Failed to allocate spim structure\n"); + goto error; + } + + device->data = (void *) spim_cs; + + spim_cs->channel = periph_id; + spim_cs->periph_base = (uint32_t) UDMA_SPIM_ADDR(conf->itf); + spim_cs->spim = spim; + spim_cs->wordsize = conf->wordsize; + spim_cs->big_endian = conf->big_endian; + spim_cs->polarity = conf->polarity; + spim_cs->phase = conf->phase; + spim_cs->max_baudrate = conf->max_baudrate; + spim_cs->cs = conf->cs; + spim_cs->byte_align = pi_spim_get_byte_align(conf->wordsize, conf->big_endian); + spim_cs->max_rcv_size = conf->max_rcv_chunk_size; + spim_cs->max_snd_size = conf->max_snd_chunk_size; + spim_cs->udma_send_cmd = NULL; + 
spim_cs->udma_receive_cmd = NULL; + + int div = pi_spim_get_div(spim_cs->max_baudrate); + spim_cs->div = div; + + spim_cs->cfg = SPI_CMD_CFG(div, conf->polarity, conf->phase); + + spim->open_count++; + if (spim->open_count == 1) + { + spim->is_slave = conf->is_slave; + + spim->rx_channel = pi_udma_core_lin_alloc(); + spim->tx_channel = pi_udma_core_lin_alloc(); + spim->cmd_channel = pi_udma_core_lin_alloc(); + + if (spim->rx_channel < 0 || spim->tx_channel < 0 || spim->cmd_channel < 0) + { + pi_udma_core_lin_free(spim->rx_channel); + pi_udma_core_lin_free(spim->tx_channel); + pi_udma_core_lin_free(spim->cmd_channel); + + SPIM_TRACE(POS_LOG_WARNING, "Failed to allocate channels\n"); + goto error; + } + + spim->rx_channel_base = pi_udma_core_lin_addr_get(spim->rx_channel); + spim->tx_channel_base = pi_udma_core_lin_addr_get(spim->tx_channel); + spim->cmd_channel_base = pi_udma_core_lin_addr_get(spim->cmd_channel); + + + udma_ctrl_cfg_rstn_set_set(ARCHI_UDMA_ADDR, (1 << periph_id)); + udma_ctrl_cfg_cg_set_set(ARCHI_UDMA_ADDR, (1 << periph_id)); + + uint32_t event = SOC_EVENT_UDMA_SPIM_EOT(conf->itf); + + pi_fc_event_handler_set(event, pi_spim_handle_eot, (void *) spim); + pi_soc_eu_fc_mask_set(event); + + udma_spim_rx_dest_set(spim_cs->periph_base, spim->rx_channel); + udma_spim_tx_dest_set(spim_cs->periph_base, spim->tx_channel); + udma_spim_cmd_dest_set(spim_cs->periph_base, spim->cmd_channel); + + udma_spim_config_set(spim_cs->periph_base, UDMA_SPIM_CONFIG_SPI_SLAVE_MODE(conf->is_slave)); + } + + pi_irq_restore(irq); + + return 0; + +error: + pi_irq_restore(irq); + return -1; +} + +static void __pi_spi_timestamp_enable(pi_spim_cs_t *spim_cs, struct pi_spi_conf *conf) +{ + pi_spim_t *spim = spim_cs->spim; + uint8_t is_rx = conf->ts_ch; + + uint32_t base = ARCHI_UDMA_ADDR; + uint8_t evt_id = conf->ts_evt_id; + uint8_t soc_evt = is_rx ? 
spim->rx_channel : spim->tx_channel; + + uint32_t cfg_evt_val = (udma_ctrl_cfg_event_get(base) & ~(0xFF<data; + uint32_t arg = (uint32_t) _arg; + + int polarity = (cmd >> __PI_SPI_CTRL_CPOL_BIT) & 3; + int phase = (cmd >> __PI_SPI_CTRL_CPHA_BIT) & 3; + int set_freq = (cmd >> __PI_SPI_CTRL_SET_MAX_BAUDRATE_BIT) & 1; + int wordsize = (cmd >> __PI_SPI_CTRL_WORDSIZE_BIT) & 3; + int big_endian = (cmd >> __PI_SPI_CTRL_ENDIANNESS_BIT) & 3; + int ts_spi = (cmd >> __PI_SPI_CTRL_SET_TIMESTAMP) & 1; + + if (ts_spi) + { + __pi_spi_timestamp_enable(spim_cs, (struct pi_spi_conf *) _arg); + } + else + { + if (set_freq) + { + spim_cs->max_baudrate = arg; + spim_cs->div = pi_spim_get_div(arg); + } + + if (polarity) + { + spim_cs->polarity = polarity >> 1; + } + if (phase) + { + spim_cs->phase = phase >> 1; + } + if (wordsize) + { + spim_cs->wordsize = wordsize >> 1; + } + if (big_endian) + { + spim_cs->big_endian = big_endian >> 1; + } + + spim_cs->cfg = SPI_CMD_CFG(spim_cs->div, spim_cs->polarity, spim_cs->phase); + spim_cs->byte_align = pi_spim_get_byte_align(spim_cs->wordsize, spim_cs->big_endian); + } + + pi_irq_restore(irq); +} + +void pi_spi_close(struct pi_device *device) +{ + int irq = pi_irq_disable(); + pi_spim_cs_t *spim_cs = (pi_spim_cs_t *) device->data; + pi_spim_t *spim = spim_cs->spim; + + SPIM_TRACE(POS_LOG_INFO, "Closing SPIM device (device: %p)\n", device); + + spim->open_count--; + + if (spim->open_count == 0) + { + // Deactivate SPIM channels + udma_spim_rx_dest_set(spim_cs->periph_base, 0xFF); + udma_spim_tx_dest_set(spim_cs->periph_base, 0xFF); + udma_spim_cmd_dest_set(spim_cs->periph_base, 0xFF); + + // And free them + pi_udma_core_lin_free(spim->rx_channel); + pi_udma_core_lin_free(spim->tx_channel); + pi_udma_core_lin_free(spim->cmd_channel); + + // Deactivate event routing + pi_soc_eu_fc_mask_clear(SOC_EVENT_UDMA_SPIM_EOT(spim->id)); + + // Reactivate clock-gating and reset + // Reactivate clock-gating and reset + udma_ctrl_cfg_cg_clr_set(ARCHI_UDMA_ADDR, 
(1 << spim_cs->channel)); + udma_ctrl_cfg_rstn_clr_set(ARCHI_UDMA_ADDR, (1 << spim_cs->channel)); + } + + pi_l2_free(spim_cs, sizeof(pi_spim_cs_t)); + + pi_irq_restore(irq); +} + + + +static void __attribute__((noinline)) pi_spim_enqueue_to_pending(pi_spim_t *spim, pi_task_t *task, uint32_t data0, uint32_t data1, uint32_t data2, uint32_t data3, uint32_t data4) +{ + task->data[0] = data0; + task->data[1] = data1; + task->data[2] = data2; + task->data[3] = data3; + task->data[4] = data4; + + if (spim->waiting_first) + { + spim->waiting_last->next = task; + } + else + { + spim->waiting_first = task; + } + spim->waiting_last = task; + task->next = NULL; +} + + + +static void __attribute__((noinline)) pi_spim_enqueue_to_pending_7(pi_spim_t *spim, pi_task_t *task, uint32_t data0, uint32_t data1, uint32_t data2, uint32_t data3, uint32_t data4, uint32_t data5, uint32_t data6, uint32_t data7) +{ + task->data[0] = data0; + task->data[1] = data1; + task->data[2] = data2; + task->data[3] = data3; + task->data[4] = data4; + task->data[5] = data5; + task->data[6] = data6; + task->data[7] = data7; + + if (spim->waiting_first) + { + spim->waiting_last->next = task; + } + else + { + spim->waiting_first = task; + } + spim->waiting_last = task; + task->next = NULL; +} + + + +void pi_spi_send_async(struct pi_device *device, void *data, size_t len, pi_spi_flags_e flags, pi_task_t *task) +{ + int irq = pi_irq_disable(); + + SPIM_TRACE(POS_LOG_TRACE, "Sending bitstream (device: %p, buffer: %p, len: 0x%x, flags: 0x%x, task: %p)\n", device, data, len, flags, task); + + pi_spim_cs_t *spim_cs = (pi_spim_cs_t *) device->data; + pi_spim_t *spim = spim_cs->spim; + int qspi = ((flags >> 2) & 0x3) == 1; + int cs_mode = (flags >> 0) & 0x3; + + if (spim->pending_copy) + { + task->data[0] = 0; + task->data[1] = (int)device; + task->data[2] = (int)data; + task->data[3] = len; + task->data[4] = flags; + + if (spim->waiting_first) + { + spim->waiting_last->next = task; + } + else + { + 
spim->waiting_first = task; + } + spim->waiting_last = task; + task->next = NULL; + + goto end; + } + + int buffer_size = (len+7)/8; + + spim->pending_copy = task; + + int size = (len + 7) >> 3; + int endianness = spim_cs->big_endian ? SPI_CMD_MSB_FIRST : SPI_CMD_LSB_FIRST; + + // First enqueue the header with SPI config, cs, and send command. + // The rest will be sent by the assembly code. + // First the user data and finally an epilogue with the EOT command. + + pi_udma_core_lin_enqueue(spim->tx_channel_base, (uint32_t) data, buffer_size, 0); + + if (!spim->is_slave) + { + spim->udma_cmd[0] = spim_cs->cfg; + spim->udma_cmd[1] = SPI_CMD_SOT(spim_cs->cs); + + if (spim_cs->wordsize == PI_SPI_WORDSIZE_8) + { + spim->udma_cmd[2] = SPI_CMD_TX_DATA(len/8, SPI_CMD_4_WORD_PER_TRANSF, 8, qspi, endianness); + } + else if (spim_cs->wordsize == PI_SPI_WORDSIZE_16) + { + spim->udma_cmd[2] = SPI_CMD_TX_DATA(len/16, SPI_CMD_2_WORD_PER_TRANSF, 16, qspi, endianness); + } + else + { + spim->udma_cmd[2] = SPI_CMD_TX_DATA(len/32, SPI_CMD_1_WORD_PER_TRANSF, 32, qspi, endianness); + } + spim->udma_cmd[3] = SPI_CMD_EOT(1, cs_mode == PI_SPI_CS_KEEP); + + pi_udma_core_lin_enqueue(spim->cmd_channel_base, (uint32_t) spim->udma_cmd, 4*4, 0); + } + else + { + spim->udma_cmd[0] = SPI_CMD_SLAVE_TX_DATA(len>>3, 0); + spim->udma_cmd[1] = SPI_CMD_SLAVE_EOT(); + + pi_udma_core_lin_enqueue(spim->cmd_channel_base, (uint32_t) spim->udma_cmd, 8, 0); + } + +end: + pi_irq_restore(irq); +} + + +void pi_spi_send(struct pi_device *device, void *data, size_t len, pi_spi_flags_e flags) +{ + pi_task_t task; + pi_spi_send_async(device, data, len, flags, pi_task_block(&task)); + pi_task_wait_on(&task); +} + + + +void pi_spi_receive_async(struct pi_device *device, void *data, size_t len, pi_spi_flags_e flags, pi_task_t *task) +{ + SPIM_TRACE(POS_LOG_TRACE, "Receive bitstream (device: %p, buffer: %p, len: 0x%x, flags: 0x%x, task: %p)\n", device, data, len, flags, task); + + int irq = pi_irq_disable(); + + 
pi_spim_cs_t *spim_cs = (pi_spim_cs_t *)device->data; + pi_spim_t *spim = spim_cs->spim; + int qspi = ((flags >> 2) & 0x3) == 1; + int cs_mode = (flags >> 0) & 0x3; + + if (spim->pending_copy) + { + task->data[0] = 1; + task->data[1] = (int)device; + task->data[2] = (int)data; + task->data[3] = len; + task->data[4] = flags; + + if (spim->waiting_first) + { + spim->waiting_last->next = task; + } + else + { + spim->waiting_first = task; + } + + spim->waiting_last = task; + task->next = NULL; + + goto end; + } + + spim->pending_copy = task; + + int size = (len + 7) >> 3; + int endianness = spim_cs->big_endian ? SPI_CMD_MSB_FIRST : SPI_CMD_LSB_FIRST; + + pi_udma_core_lin_enqueue(spim->rx_channel_base, (uint32_t) data, size, 0); + + if (!spim->is_slave) + { + spim->udma_cmd[0] = spim_cs->cfg; + spim->udma_cmd[1] = SPI_CMD_SOT(spim_cs->cs); + + if (spim_cs->wordsize == PI_SPI_WORDSIZE_8) + { + spim->udma_cmd[2] = SPI_CMD_RX_DATA(len/8, SPI_CMD_4_WORD_PER_TRANSF, 8, qspi, endianness); + } + else if (spim_cs->wordsize == PI_SPI_WORDSIZE_16) + { + spim->udma_cmd[2] = SPI_CMD_RX_DATA(len/16, SPI_CMD_2_WORD_PER_TRANSF, 16, qspi, endianness); + } + else + { + spim->udma_cmd[2] = SPI_CMD_RX_DATA(len/32, SPI_CMD_1_WORD_PER_TRANSF, 32, qspi, endianness); + } + + spim->udma_cmd[3] = SPI_CMD_EOT(1, cs_mode == PI_SPI_CS_KEEP); + + pi_udma_core_lin_enqueue(spim->cmd_channel_base, (uint32_t) spim->udma_cmd, 4*4, 0); + } + else + { + spim->udma_cmd[0] = SPI_CMD_SLAVE_RX_DATA(len>>3, 0); + spim->udma_cmd[1] = SPI_CMD_SLAVE_EOT(); + + pi_udma_core_lin_enqueue(spim->cmd_channel_base, (uint32_t) spim->udma_cmd, 8, 0); + } + +end: + pi_irq_restore(irq); +} + +void pi_spi_receive(struct pi_device *device, void *data, size_t len, pi_spi_flags_e flags) +{ + pi_task_t task; + pi_spi_receive_async(device, data, len, flags, pi_task_block(&task)); + pi_task_wait_on(&task); +} + + +void pi_spi_transfer_async(struct pi_device *device, void *tx_data, void *rx_data, size_t len, pi_spi_flags_e flags, 
pi_task_t *task) +{ + SPIM_TRACE(POS_LOG_TRACE, "Transfering bitstream (device: %p, tx_buffer: %p, rx_buffer: %p, len: 0x%x, flags: 0x%x, task: %p)\n", device, tx_data, rx_data, len, flags, task); + + int irq = pi_irq_disable(); + + pi_spim_cs_t *spim_cs = (pi_spim_cs_t *)device->data; + pi_spim_t *spim = spim_cs->spim; + int cs_mode = (flags >> 0) & 0x3; + + if (spim->pending_copy) + { + task->data[0] = 2; + task->data[1] = (int)device; + task->data[2] = (int)tx_data; + task->data[3] = (int)rx_data; + task->data[4] = len; + task->data[5] = cs_mode; + + if (spim->waiting_first) + spim->waiting_last->next = task; + else + spim->waiting_first = task; + + spim->waiting_last = task; + task->next = NULL; + + goto end; + } + + //int channel_id = UDMA_CHANNEL_ID(spim_cs->channel); + + int endianness = spim_cs->big_endian ? SPI_CMD_MSB_FIRST : SPI_CMD_LSB_FIRST; + + spim->pending_copy = task; + + int size = (len + 7) >> 3; + + pi_udma_core_lin_enqueue(spim->rx_channel_base, (uint32_t) rx_data, size, 0); + pi_udma_core_lin_enqueue(spim->tx_channel_base, (uint32_t) tx_data, size, 0); + + if (!spim->is_slave) + { + // First enqueue the header with SPI config, cs, and send command. + // The rest will be sent by the assembly code. + // First the user data and finally an epilogue with the EOT command. 
+ spim->udma_cmd[0] = spim_cs->cfg; + spim->udma_cmd[1] = SPI_CMD_SOT(spim_cs->cs); + + if (spim_cs->wordsize == PI_SPI_WORDSIZE_8) + { + spim->udma_cmd[2] = SPI_CMD_FUL(len/8, SPI_CMD_4_WORD_PER_TRANSF, 8, endianness); + } + else if (spim_cs->wordsize == PI_SPI_WORDSIZE_16) + { + spim->udma_cmd[2] = SPI_CMD_FUL(len/16, SPI_CMD_2_WORD_PER_TRANSF, 16, endianness); + } + else + { + spim->udma_cmd[2] = SPI_CMD_FUL(len/32, SPI_CMD_1_WORD_PER_TRANSF, 32, endianness); + } + + spim->udma_cmd[3] = SPI_CMD_EOT(1, cs_mode == PI_SPI_CS_KEEP); + + pi_udma_core_lin_enqueue(spim->cmd_channel_base, (uint32_t) spim->udma_cmd, 4*4, 0); + } + else + { + spim->udma_cmd[0] = SPI_CMD_SLAVE_FUL(len>>3, 0); + spim->udma_cmd[1] = SPI_CMD_SLAVE_EOT(); + + pi_udma_core_lin_enqueue(spim->cmd_channel_base, (uint32_t) spim->udma_cmd, 8, 0); + } + +end: + pi_irq_restore(irq); +} + +void pi_spi_transfer(struct pi_device *device, void *tx_data, void *rx_data, + size_t len, pi_spi_flags_e flags) +{ + pi_task_t task; + pi_spi_transfer_async(device, tx_data, rx_data, len, flags, pi_task_block(&task)); + pi_task_wait_on(&task); +} + +void pi_spi_copy_2d(struct pi_device *device, uint32_t addr, void *data, + uint32_t size, uint32_t stride, uint32_t length, pi_spi_flags_e flags) +{ + pi_task_t task; + pi_spi_copy_2d_async(device, addr, data, size, stride, length, flags, pi_task_block(&task)); + pi_task_wait_on(&task); +} + + +static __attribute__((noinline)) void pi_spim_send_enqueue_transfer(uint32_t event, void *arg) +{ + pi_spim_cs_t *spim_cs = (pi_spim_cs_t *) arg; + pi_spim_t *spim = (pi_spim_t *) spim_cs->spim; + + uint32_t addr = spim->pending_repeat_misaligned_addr; + uint32_t size = spim->pending_repeat_misaligned_size; + uint32_t chunk_size = spim_cs->max_snd_size; + + uint32_t cmd_size; + uint32_t *cmd; + + SPIM_TRACE(POS_LOG_TRACE, "Enqueueing send transfer (addr: 0x%lx, ram_addr: 0x%x, pending size: 0x%lx)\n", addr, spim->pending_repeat_misaligned_ram_addr, size); + + 
memcpy(spim_cs->send_addr_ucode, &spim->pending_repeat_misaligned_ram_addr, spim_cs->send_addr_ucode_size); + + + + cmd_size = spim_cs->udma_send_cmd_size; + cmd = spim_cs->udma_send_cmd; + + void *callback = pi_spim_send_enqueue_transfer; + + if (size > chunk_size) + { + size = chunk_size; + } + + SPIM_TRACE(POS_LOG_TRACE, "Enqueueing aligned send chunk (addr: 0x%x, size: 0x%lx)\n", spim->pending_repeat_misaligned_addr, size); + + spim->pending_repeat_misaligned_ram_addr += size; + spim->pending_repeat_misaligned_addr += size; + spim->pending_repeat_misaligned_size -= size; + + cmd[cmd_size++] = __BITINSERT(spim->tx_cmd, size-1, SPI_CMD_TX_DATA_SIZE_WIDTH, SPI_CMD_TX_DATA_SIZE_OFFSET); + cmd[cmd_size++] = SPI_CMD_EOT(1, 0); + + if (spim->pending_repeat_misaligned_size == 0) + { + if (spim->pending_repeat_misaligned_2d_size > 0) + { + uint32_t size = spim->pending_repeat_misaligned_length; + + if (size > spim->pending_repeat_misaligned_2d_size) + { + size = spim->pending_repeat_misaligned_2d_size; + } + + spim->pending_repeat_misaligned_2d_size -= size; + + spim->pending_repeat_misaligned_size = size; + spim->pending_repeat_misaligned_ram_addr = spim->pending_repeat_misaligned_ram_addr - spim->pending_repeat_misaligned_length + spim->pending_repeat_misaligned_stride; + } + else + { + callback = pi_spim_handle_eot; + } + } + + pi_fc_event_handler_set_func(SOC_EVENT_UDMA_SPIM_EOT(spim->id), callback); + + pi_udma_core_lin_enqueue(spim->cmd_channel_base, (uint32_t) cmd, cmd_size*4, 0); + pi_udma_core_lin_enqueue(spim->tx_channel_base, (uint32_t) addr, size, 0); +} + + + +static void __attribute__((noinline)) pi_spim_send_handle_misaligned(pi_spim_cs_t *spim_cs, uint32_t addr, uint32_t data, uint32_t size, pi_spim_t *spim) +{ + SPIM_TRACE(POS_LOG_TRACE, "Handling SPIM chunk (addr: 0x%lx, size: 0x%lx)\n", addr, size); + + spim->pending_repeat_misaligned_ram_addr = addr; + spim->pending_repeat_misaligned_addr = data; + spim->pending_repeat_misaligned_size = size; + 
spim->pending_repeat_misaligned_2d_size = 0; + + pi_fc_event_handler_set_args(SOC_EVENT_UDMA_SPIM_EOT(spim->id), spim_cs); + + pi_spim_send_enqueue_transfer(0, spim_cs); +} + + +static void __attribute__((noinline)) pi_spim_send_handle_misaligned_2d(pi_spim_cs_t *spim_cs, uint32_t addr, uint32_t data, uint32_t size, uint32_t stride, uint32_t length, pi_spim_t *spim) +{ + SPIM_TRACE(POS_LOG_TRACE, "Sending SPIM 2D chunk (addr: 0x%lx, data: 0x%lx, size: 0x%lx, stride: 0x%lx, length: 0x%lx)\n", addr, data, size, stride, length); + + int transfer_size = size > length ? length : size; + + spim->pending_repeat_misaligned_ram_addr = addr; + spim->pending_repeat_misaligned_addr = data; + spim->pending_repeat_misaligned_size = transfer_size; + spim->pending_repeat_misaligned_length = length; + spim->pending_repeat_misaligned_stride = stride; + spim->pending_repeat_misaligned_2d_size = size - transfer_size; + + pi_fc_event_handler_set_args(SOC_EVENT_UDMA_SPIM_EOT(spim->id), spim_cs); + + pi_spim_send_enqueue_transfer(0, spim_cs); +} + +static __attribute__((noinline)) void pi_spim_receive_enqueue_transfer(uint32_t event, void *arg) +{ + pi_spim_cs_t *spim_cs = (pi_spim_cs_t *) arg; + pi_spim_t *spim = (pi_spim_t *) spim_cs->spim; + + pi_fc_event_handler_set_func(SOC_EVENT_UDMA_SPIM_EOT(spim->id), spim->pending_callback); + + uint32_t addr = spim->pending_repeat_misaligned_addr; + uint32_t size = spim->pending_repeat_misaligned_size; + uint32_t chunk_size = spim_cs->max_rcv_size; + + if (size == 0) + { + return; + } + + uint32_t cmd_size; + uint32_t *cmd; + + SPIM_TRACE(POS_LOG_TRACE, "Enqueueing receive transfer (addr: 0x%lx, ram_addr: 0x%x, pending size: 0x%lx)\n", addr, spim->pending_repeat_misaligned_ram_addr, size); + + memcpy(spim_cs->receive_addr_ucode, &spim->pending_repeat_misaligned_ram_addr, spim_cs->receive_addr_ucode_size); + + cmd_size = spim_cs->udma_receive_cmd_size; + cmd = spim_cs->udma_receive_cmd; + + spim->pending_callback = 
pi_spim_receive_enqueue_transfer; + + if (size > chunk_size) + { + size = chunk_size; + } + else + { + size &= ~0x3; + } + + SPIM_TRACE(POS_LOG_TRACE, "Enqueueing aligned receive chunk (addr: 0x%x, size: 0x%lx)\n", spim->pending_repeat_misaligned_addr, size); + + spim->pending_repeat_misaligned_ram_addr += size; + spim->pending_repeat_misaligned_addr += size; + spim->pending_repeat_misaligned_size -= size; + + cmd[cmd_size++] = __BITINSERT(spim->rx_cmd, size*8-1, SPI_CMD_RX_DATA_SIZE_WIDTH, SPI_CMD_RX_DATA_SIZE_OFFSET); + cmd[cmd_size++] = SPI_CMD_EOT(1, 0); + + if (spim->pending_repeat_misaligned_size == 0) + { + if (spim->pending_repeat_misaligned_2d_size > 0) + { + uint32_t size = spim->pending_repeat_misaligned_length; + + if (size > spim->pending_repeat_misaligned_2d_size) + { + size = spim->pending_repeat_misaligned_2d_size; + } + + spim->pending_repeat_misaligned_2d_size -= size; + + spim->pending_repeat_misaligned_size = size; + spim->pending_repeat_misaligned_ram_addr = spim->pending_repeat_misaligned_ram_addr - spim->pending_repeat_misaligned_length + spim->pending_repeat_misaligned_stride; + } + else + { + spim->pending_callback = pi_spim_handle_eot; + } + } + + + pi_udma_core_lin_enqueue(spim->rx_channel_base, (uint32_t) addr, size, 0); + pi_udma_core_lin_enqueue(spim->cmd_channel_base, (uint32_t) cmd, cmd_size*4, 0); +} + + + +static void __attribute__((noinline)) pi_spim_receive_handle_misaligned(pi_spim_cs_t *spim_cs, uint32_t addr, uint32_t data, uint32_t size, pi_spim_t *spim) +{ + SPIM_TRACE(POS_LOG_TRACE, "Receiving SPIM chunk (addr: 0x%lx, size: 0x%lx)\n", addr, size); + + spim->pending_repeat_misaligned_ram_addr = addr; + spim->pending_repeat_misaligned_addr = data; + spim->pending_repeat_misaligned_size = size; + spim->pending_repeat_misaligned_2d_size = 0; + + pi_fc_event_handler_set_args(SOC_EVENT_UDMA_SPIM_EOT(spim->id), spim_cs); + + pi_spim_receive_enqueue_transfer(0, spim_cs); + + if (spim->pending_repeat_misaligned_size) + { + 
pi_spim_receive_enqueue_transfer(0, spim_cs); + } + else + { + pi_fc_event_handler_set_func(SOC_EVENT_UDMA_SPIM_EOT(spim->id), spim->pending_callback); + } + +} + + + +static void __attribute__((noinline)) pi_spim_receive_handle_misaligned_2d( + pi_spim_cs_t *spim_cs, uint32_t addr, uint32_t data, uint32_t size, + uint32_t stride, uint32_t length, pi_spim_t *spim) +{ + SPIM_TRACE(POS_LOG_TRACE, "Receiving SPIM 2D chunk (addr: 0x%lx, data: 0x%lx, size: 0x%lx, stride: 0x%lx, length: 0x%lx)\n", addr, data, size, stride, length); + + int transfer_size = size > length ? length : size; + + spim->pending_repeat_misaligned_ram_addr = addr; + spim->pending_repeat_misaligned_addr = data; + spim->pending_repeat_misaligned_size = transfer_size; + spim->pending_repeat_misaligned_length = length; + spim->pending_repeat_misaligned_stride = stride; + spim->pending_repeat_misaligned_2d_size = size - transfer_size; + + pi_fc_event_handler_set_args(SOC_EVENT_UDMA_SPIM_EOT(spim->id), spim_cs); + + pi_spim_receive_enqueue_transfer(0, spim_cs); + + if (spim->pending_repeat_misaligned_size) + { + pi_spim_receive_enqueue_transfer(0, spim_cs); + } + else + { + pi_fc_event_handler_set_func(SOC_EVENT_UDMA_SPIM_EOT(spim->id), spim->pending_callback); + } +} + + +void pi_spi_copy_2d_async(struct pi_device *device, uint32_t addr, void *data, + uint32_t size, uint32_t stride, uint32_t length, + pi_spi_flags_e flags, pi_task_t *task) +{ + SPIM_TRACE(POS_LOG_DEBUG, "Copy 2D bitstream (device: %p, ext2loc: %d, addr: 0x%lx, buffer: %p, size: 0x%lx, stride: 0x%lx, length: 0x%lx, flags: 0x%x, task: %p)\n", device, __BITEXTRACT(flags, 1, 5), addr, data, size, stride, length, flags, task); + + pi_spim_cs_t *spim_cs = (pi_spim_cs_t *) device->data; + pi_spim_t *spim = spim_cs->spim; + + uint32_t irq = pi_irq_disable(); + + if (likely(!spim->pending_copy)) + { + int qspi = __BITEXTRACT(flags, 2, 2) == 1; + int cs_mode = __BITEXTRACT(flags, 2, 0); + int ext2loc = __BITEXTRACT(flags, 1, 4); + + 
spim->pending_copy = task; + spim->pending_is_auto = cs_mode == PI_SPI_CS_AUTO; + + if (ext2loc) + { + spim->rx_cmd = __BITINSERT(spim_cs->rx_cmd, qspi, SPI_CMD_RX_DATA_QPI_WIDTH, SPI_CMD_RX_DATA_QPI_OFFSET); + pi_spim_receive_handle_misaligned_2d(spim_cs, addr, (uint32_t) data, size, stride, length, spim); + } + else + { + spim->tx_cmd = __BITINSERT(spim_cs->tx_cmd, qspi, SPI_CMD_TX_DATA_QPI_WIDTH, SPI_CMD_TX_DATA_QPI_OFFSET); + pi_spim_send_handle_misaligned_2d(spim_cs, addr, (uint32_t) data, size, stride, length, spim); + } + } + else + { + pi_spim_enqueue_to_pending_7(spim, task, 4, (int) device, addr, (int) data, size, stride, length, flags); + } + + pi_irq_restore(irq); +} + + + +void pi_spi_copy(struct pi_device *device, uint32_t addr, void *data, + uint32_t size, pi_spi_flags_e flags) +{ + pi_task_t task; + pi_spi_copy_async(device, addr, data, size, flags, pi_task_block(&task)); + pi_task_wait_on(&task); +} + + + +void pi_spi_copy_async(struct pi_device *device, uint32_t addr, void *data, + uint32_t size, pi_spi_flags_e flags, pi_task_t *task) +{ + SPIM_TRACE(POS_LOG_DEBUG, "Copy bitstream (device: %p, ext2loc: %d, addr: 0x%lx, buffer: %p, size: 0x%lx, flags: 0x%x, task: %p)\n", device, __BITEXTRACT(flags, 1, 4), addr, data, size, flags, task); + + pi_spim_cs_t *spim_cs = (pi_spim_cs_t *) device->data; + pi_spim_t *spim = spim_cs->spim; + + uint32_t irq = pi_irq_disable(); + + if (likely(!spim->pending_copy)) + { + int qspi = __BITEXTRACT(flags, 2, 2) == 1; + int cs_mode = __BITEXTRACT(flags, 2, 0); + int ext2loc = __BITEXTRACT(flags, 1, 4); + + spim->pending_copy = task; + spim->pending_is_auto = cs_mode == PI_SPI_CS_AUTO; + + if (ext2loc) + { + spim->rx_cmd = __BITINSERT(spim_cs->rx_cmd, qspi, SPI_CMD_RX_DATA_QPI_WIDTH, SPI_CMD_RX_DATA_QPI_OFFSET); + pi_spim_receive_handle_misaligned(spim_cs, addr, (uint32_t) data, size, spim); + } + else + { + spim->tx_cmd = __BITINSERT(spim_cs->tx_cmd, qspi, SPI_CMD_TX_DATA_QPI_WIDTH, SPI_CMD_TX_DATA_QPI_OFFSET); + 
pi_spim_send_handle_misaligned(spim_cs, addr, (uint32_t) data, size, spim); + } + } + else + { + pi_spim_enqueue_to_pending_7(spim, task, 3, (int) device, addr, (int) data, size, flags, 0, 0); + } + + pi_irq_restore(irq); +} + + + +void *pi_spi_receive_ucode_set(struct pi_device *device, uint8_t *ucode, + uint32_t ucode_size) +{ + pi_spim_cs_t *spim_cs = (pi_spim_cs_t *) device->data; + + if (spim_cs->udma_receive_cmd) + { + pi_l2_free(spim_cs->udma_receive_cmd, (spim_cs->udma_receive_cmd_size + 2)*4); + } + + spim_cs->udma_receive_cmd = pi_l2_malloc(ucode_size + 4*4); + if (spim_cs->udma_receive_cmd == NULL) + { + return NULL; + } + + pi_spim_apply_conf(spim_cs); + + memcpy(&spim_cs->udma_receive_cmd[2], ucode, ucode_size); + spim_cs->udma_receive_cmd_size = 2 + (ucode_size >> 2); + + return (void *)&spim_cs->udma_receive_cmd[2]; +} + + + +void pi_spi_receive_ucode_set_addr_info(struct pi_device *device, uint8_t *ucode, + uint32_t ucode_size) +{ + pi_spim_cs_t *spim_cs = (pi_spim_cs_t *) device->data; + + spim_cs->receive_addr_ucode = ucode; + spim_cs->receive_addr_ucode_size = ucode_size; +} + + + +void *pi_spi_send_ucode_set(struct pi_device *device, uint8_t *ucode, uint32_t ucode_size) +{ + pi_spim_cs_t *spim_cs = (pi_spim_cs_t *)device->data; + + if (spim_cs->udma_send_cmd) + { + pi_l2_free(spim_cs->udma_send_cmd, (spim_cs->udma_send_cmd_size + 2)*4); + } + + spim_cs->udma_send_cmd = pi_l2_malloc(ucode_size + 4*4); + if (spim_cs->udma_send_cmd == NULL) + { + return NULL; + } + + pi_spim_apply_conf(spim_cs); + + memcpy(&spim_cs->udma_send_cmd[2], ucode, ucode_size); + spim_cs->udma_send_cmd_size = 2 + (ucode_size >> 2); + + return (void *)&spim_cs->udma_send_cmd[2]; +} + + + +void pi_spi_send_ucode_set_addr_info(struct pi_device *device, uint8_t *ucode, + uint32_t ucode_size) +{ + pi_spim_cs_t *spim_cs = (pi_spim_cs_t *)device->data; + + spim_cs->send_addr_ucode = ucode; + spim_cs->send_addr_ucode_size = ucode_size; +} + + + +void 
pi_spim_handle_waiting_copy(pi_task_t *task) +{ + if (task->data[0] == 0) + { + pi_spi_send_async((struct pi_device *)task->data[1], (void *)task->data[2], + task->data[3], task->data[4], task); + } + else if (task->data[0] == 1) + { + pi_spi_receive_async((struct pi_device *)task->data[1], (void *)task->data[2], + task->data[3], task->data[4], task); + } + else + { + pi_spi_transfer_async((struct pi_device *)task->data[1], (void *)task->data[2], + (void *)task->data[3], task->data[4], task->data[5], task); + } +} + +void pi_spi_conf_init(struct pi_spi_conf *conf) +{ + conf->wordsize = PI_SPI_WORDSIZE_8; + conf->big_endian = 0; + conf->max_baudrate = 10000000; + conf->cs = -1; + conf->itf = 0; + conf->polarity = 0; + conf->phase = 0; + conf->max_rcv_chunk_size = -1; + conf->max_snd_chunk_size = -1; + conf->is_slave = 0; +} + +#if !defined(__FREERTOS__) +static void __attribute__((constructor)) pi_spim_init() +{ + for (int i=0; i +#include +#include + +static int spi_init(struct device *device) +{ + ARG_UNUSED(device); + + pi_spim_init(); + + return 0; +} + +struct spi_config { +}; + +struct spi_data { +}; + +static const struct spi_config spi_cfg = { +}; + +static struct spi_data spi_data = { +}; + +DEVICE_INIT(spi, "spi", &spi_init, + &spi_data, &spi_cfg, + PRE_KERNEL_2, CONFIG_KERNEL_INIT_PRIORITY_DEVICE); + +#endif diff --git a/rtos/pmsis/pmsis_implem/chips/gap9/drivers/spim/spim_v4.h b/rtos/pmsis/pmsis_implem/chips/gap9/drivers/spim/spim_v4.h new file mode 100644 index 000000000..8a6d56e7c --- /dev/null +++ b/rtos/pmsis/pmsis_implem/chips/gap9/drivers/spim/spim_v4.h @@ -0,0 +1,206 @@ +/* + * Copyright (C) 2019 GreenWaves Technologies + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __ARCHI_UDMA_SPIM_SPIM_V4_H__ +#define __ARCHI_UDMA_SPIM_SPIM_V4_H__ + +// SPI commands IDS definition +#define SPI_CMD_CFG_ID 0 +#define SPI_CMD_SOT_ID 1 +#define SPI_CMD_SEND_CMD_ID 2 +#define SPI_CMD_SEND_BITS_ID 2 +#define SPI_CMD_SEND_ADDR_ID 3 +#define SPI_CMD_DUMMY_ID 4 +#define SPI_CMD_WAIT_ID 5 +#define SPI_CMD_TX_DATA_ID 6 +#define SPI_CMD_RX_DATA_ID 7 +#define SPI_CMD_RPT_ID 8 +#define SPI_CMD_EOT_ID 9 +#define SPI_CMD_RPT_END_ID 10 +#define SPI_CMD_RX_CHECK_ID 11 +#define SPI_CMD_FUL_ID 12 + +// SPI command fields offset, mask, value definition +// SPI commands fields offsets +#define SPI_CMD_ID_OFFSET 28 + +// COMMON definitions +#define SPI_CMD_QPI_ENA 1 +#define SPI_CMD_QPI_DIS 0 +#define SPI_CMD_LSB_FIRST 1 +#define SPI_CMD_MSB_FIRST 0 +#define SPI_CMD_4_WORD_PER_TRANSF 2 +#define SPI_CMD_2_WORD_PER_TRANSF 1 +#define SPI_CMD_1_WORD_PER_TRANSF 0 +#define SPI_CMD_DATA_WITDH(val) (val) +#define SPI_CMD_CMD_SIZE(val) (val) + +// CFG +#define SPI_CMD_CFG_CLK_DIV_OFFSET 0 +#define SPI_CMD_CFG_CLK_DIV_WIDTH 8 +#define SPI_CMD_CFG_CPHA_OFFSET 8 +#define SPI_CMD_CFG_CPOL_OFFSET 9 + +#define SPI_CMD_CFG_CLKDIV(val) (val) +#define SPI_CMD_CFG_CPOL_POS 1 +#define SPI_CMD_CFG_CPOL_NEG 0 +#define SPI_CMD_CFG_CPHA_STD 1 +#define SPI_CMD_CFG_CPHA_OPP 0 + +// SOT +#define SPI_CMD_SOT_CS_OFFSET 0 +#define SPI_CMD_SOT_CS_WIDTH 2 + +#define SPI_CMD_SOT_CS0 0 +#define SPI_CMD_SOT_CS1 1 +#define SPI_CMD_SOT_CS2 2 +#define SPI_CMD_SOT_CS3 3 + +// SEND_CMD +#define SPI_CMD_SEND_CMD_CMD_OFFSET 0 +#define SPI_CMD_SEND_CMD_CMD_WIDTH 16 
+#define SPI_CMD_SEND_CMD_SIZE_OFFSET 16 +#define SPI_CMD_SEND_CMD_SIZE_WIDTH 4 +#define SPI_CMD_SEND_CMD_QPI_OFFSET 27 + +// SEND_BITS +#define SPI_CMD_SEND_BITS_BITS_OFFSET 0 +#define SPI_CMD_SEND_BITS_BITS_WIDTH 16 +#define SPI_CMD_SEND_BITS_SIZE_OFFSET 16 +#define SPI_CMD_SEND_BITS_SIZE_WIDTH 4 +#define SPI_CMD_SEND_BITS_QPI_OFFSET 27 + +// SEND_ADDR +#define SPI_CMD_SEND_ADDR_SIZE_OFFSET 16 +#define SPI_CMD_SEND_ADDR_SIZE_WIDTH 5 +#define SPI_CMD_SEND_ADDR_QPI_OFFSET 27 + +//#define SPI_CMD_SEND_ADDR_VALUE(value) ((((value) & 0xff000000) >> 24) | (((value) & 0xff0000) >> 8) | (((value) & 0xff00) << 8) | (((value) & 0xff) << 24)) +#define SPI_CMD_SEND_ADDR_VALUE(value) (value) + + +// SEND_DUMMY +#define SPI_CMD_DUMMY_CYCLE_OFFSET 16 +#define SPI_CMD_DUMMY_CYCLE_WIDTH 5 + +// TX_DATA +#define SPI_CMD_TX_DATA_SIZE_OFFSET 0 +#define SPI_CMD_TX_DATA_SIZE_WIDTH 16 +#define SPI_CMD_TX_DATA_QPI_OFFSET 27 +#define SPI_CMD_TX_DATA_QPI_WIDTH 1 +#define SPI_CMD_TX_DATA_WORDTRANS_OFFSET 21 +#define SPI_CMD_TX_DATA_WORDTRANS_WIDTH 2 +#define SPI_CMD_TX_DATA_LSBFIRST_OFFSET 26 +#define SPI_CMD_TX_DATA_BITSWORD_OFFSET 16 +#define SPI_CMD_TX_DATA_BITSWORD_WIDTH 5 + +// SLAVE_TX_DATA +#define SPI_CMD_SLAVE_TX_DATA_SIZE_OFFSET 0 +#define SPI_CMD_SLAVE_TX_DATA_SIZE_WIDTH 16 +#define SPI_CMD_SLAVE_TX_DATA_IGNORE_CS_OFFSET 16 +#define SPI_CMD_SLAVE_TX_DATA_IGNORE_CS_WIDTH 1 + + +// RX_DATA +#define SPI_CMD_RX_DATA_SIZE_OFFSET 0 +#define SPI_CMD_RX_DATA_SIZE_WIDTH 16 +#define SPI_CMD_RX_DATA_QPI_OFFSET 27 +#define SPI_CMD_RX_DATA_QPI_WIDTH 1 +#define SPI_CMD_RX_DATA_WORDTRANS_OFFSET 21 +#define SPI_CMD_RX_DATA_WORDTRANS_WIDTH 2 +#define SPI_CMD_RX_DATA_LSBFIRST_OFFSET 26 +#define SPI_CMD_RX_DATA_BITSWORD_OFFSET 16 +#define SPI_CMD_RX_DATA_BITSWORD_WIDTH 5 + + +// RPT +#define SPI_CMD_RPT_NB_OFFSET 0 +#define SPI_CMD_RPT_NB_WIDTH 16 + +// EOT +#define SPI_CMD_EOT_GEN_EVT_OFFSET 0 +#define SPI_CMD_EOT_CS_KEEP_OFFSET 1 + +#define SPI_CMD_EOT_EVENT_ENA 1 +#define SPI_CMD_EOT_EVENT_DIS 0 
+ +// WAIT +#define SPI_CMD_WAIT_EVENT_OFFSET 0 +#define SPI_CMD_WAIT_EVENT_WIDTH 2 + +// RX_CHECK +#define SPI_CMD_RX_CHECK_VALUE_OFFSET 0 +#define SPI_CMD_RX_CHECK_VALUE_WIDTH 16 + +#define SPI_CMD_RX_CHECK_SIZE_OFFSET 16 +#define SPI_CMD_RX_CHECK_SIZE_WIDTH 4 + +#define SPI_CMD_RX_CHECK_MODE_OFFSET 24 +#define SPI_CMD_RX_CHECK_MODE_WIDTH 2 + +#define SPI_CMD_RX_CHECK_BYTE_ALIGN_OFFSET 26 + +#define SPI_CMD_RX_CHECK_QPI_OFFSET 27 + +#define SPI_CMD_RX_CHECK_MODE_MATCH 0 +#define SPI_CMD_RX_CHECK_MODE_ONES 1 +#define SPI_CMD_RX_CHECK_MODE_ZEROS 2 +#define SPI_CMD_RX_CHECK_MODE_MASK 3 + +// FULL DUPLEX +#define SPI_CMD_FUL_SIZE_OFFSET 0 +#define SPI_CMD_FUL_SIZE_WIDTH 16 +#define SPI_CMD_FUL_WORDTRANS_OFFSET 21 +#define SPI_CMD_FUL_WORDTRANS_WIDTH 2 +#define SPI_CMD_FUL_LSBFIRST_OFFSET 26 +#define SPI_CMD_FUL_BITSWORD_OFFSET 16 +#define SPI_CMD_FUL_BITSWORD_WIDTH 5 + +#define SPI_CMD_SETUP_UC_TXRXEN_OFFSET 27 +#define SPI_CMD_SETUP_UC_DS_OFFSET 25 + +// SPI CMD encoding +#define SPI_CMD_CFG(clockDiv,cpol,cpha) ((SPI_CMD_CFG_ID<fifo_tail) + { + // tail insert + itf_data->fifo_tail->next = pi_task; + itf_data->fifo_tail = itf_data->fifo_tail->next; + itf_data->fifo_tail->next = NULL; + } + else + { + // Initialize the list + itf_data->fifo_head = pi_task; + itf_data->fifo_head->next = NULL; + // set the base tail + itf_data->fifo_tail = itf_data->fifo_head; + } + //pi_irq_restore(irq); +} + +static inline pi_task_t* __pi_udma_datamove_task_fifo_pop(pi_udma_datamove_itf_data_t *itf_data) +{ + //uint32_t irq = pi_irq_disable(); + pi_task_t *ret_task = NULL; + if (itf_data->fifo_head != NULL) + { + ret_task = itf_data->fifo_head; + hal_compiler_barrier(); + itf_data->fifo_head = itf_data->fifo_head->next; + if (itf_data->fifo_head == NULL) + { + itf_data->fifo_tail = NULL; + } + } + //pi_irq_restore(irq); + return ret_task; +} + +static inline void __pi_udma_datamove_copy_start(pi_udma_datamove_itf_data_t* itf_data) +{ + if (NULL == itf_data->end_task) + { + return; + } 
+ + uint32_t src = itf_data->end_task->data[0]; + uint32_t dst = itf_data->end_task->data[1]; + uint32_t len = itf_data->end_task->data[2]; + + pi_udma_datamove_data_t* dev_data = (pi_udma_datamove_data_t*) itf_data->end_task->data[3]; + uint32_t udma_ctrl_base = (uint32_t) ARCHI_UDMA_ADDR; + + /* select the rx_channel according to the device configuration */ + int32_t rx_chan = -1; + if (dev_data->dst_trf_cfg.type == PI_UDMA_DATAMOVE_TRF_LINEAR) + { + rx_chan = itf_data->rx_lin_chan_id; + } + else + { + rx_chan = itf_data->rx_2d_chan_id; + } + /* select the tx_channel according to the device configuration */ + int32_t tx_chan = -1; + if (dev_data->src_trf_cfg.type == PI_UDMA_DATAMOVE_TRF_LINEAR) + { + tx_chan = itf_data->tx_lin_chan_id; + } + else + { + tx_chan = itf_data->tx_2d_chan_id; + } + + if (0 == itf_data->device_id) + { + /* set the channels to use */ + //hal_udma_ctrl_datamove0_cfg_set_ids(tx_chan, rx_chan); + udma_ctrl_datamove_cfg_source_id_0_set(udma_ctrl_base, tx_chan); + udma_ctrl_datamove_cfg_dest_id_0_set(udma_ctrl_base, rx_chan); + + /* launch the copy, also activate clock for udma channels */ + //hal_udma_ctrl_datamove0_enable(); + udma_ctrl_datamove0_size_en_set(udma_ctrl_base, 1); + } + else + { + /* set the channels to use */ + //hal_udma_ctrl_datamove1_cfg_set_ids(tx_chan, rx_chan); + udma_ctrl_datamove_cfg_source_id_1_set(udma_ctrl_base, tx_chan); + udma_ctrl_datamove_cfg_dest_id_1_set(udma_ctrl_base, rx_chan); + + /* launch the copy, also activate clock for udma channels */ + //hal_udma_ctrl_datamove1_enable(); + udma_ctrl_datamove1_size_en_set(udma_ctrl_base, 1); + } + + /* setup and launch channels */ + if (dev_data->dst_trf_cfg.type == PI_UDMA_DATAMOVE_TRF_LINEAR) + { + uint32_t udma_core = pi_udma_core_lin_addr_get(rx_chan); + uint32_t config = UDMA_CORE_LIN_ADDRGEN_CFG_CTRL_EN(1); + pi_udma_core_lin_enqueue(udma_core, dst, len, config); + } + else + { + uint32_t udma_core = pi_udma_core_2d_addr_get(rx_chan - UDMA_NB_CHAN_LIN); + 
uint32_t config = UDMA_CORE_2D_ADDRGEN_CFG_CTRL_EN(1); + pi_udma_core_2d_enqueue(udma_core, dst, 0, len, dev_data->dst_trf_cfg.stride, + dev_data->dst_trf_cfg.row_len, config); + } + + if (dev_data->src_trf_cfg.type == PI_UDMA_DATAMOVE_TRF_LINEAR) + { + uint32_t udma_core = pi_udma_core_lin_addr_get(tx_chan); + uint32_t config = UDMA_CORE_LIN_ADDRGEN_CFG_CTRL_EN(1); + pi_udma_core_lin_enqueue(udma_core, src, len, config); + } + else + { + uint32_t udma_core = pi_udma_core_2d_addr_get(tx_chan - UDMA_NB_CHAN_LIN); + uint32_t config = UDMA_CORE_2D_ADDRGEN_CFG_CTRL_EN(1); + pi_udma_core_2d_enqueue(udma_core, src, 0, len, dev_data->src_trf_cfg.stride, + dev_data->src_trf_cfg.row_len, config); + } +} + + +static void __pi_udma_datamove_event_handler(uint32_t event, void* arg) +{ + pi_udma_datamove_itf_data_t* itf_data = (pi_udma_datamove_itf_data_t*) arg; + pi_task_t* task = itf_data->end_task; + uint32_t udma_ctrl_base = (uint32_t) ARCHI_UDMA_ADDR; + + /* stop the DATAMOVE */ + if (0 == itf_data->device_id) + { + //hal_udma_ctrl_datamove0_stop(); + udma_ctrl_datamove0_size_stop_set(udma_ctrl_base, 1); + } + else + { + //hal_udma_ctrl_datamove1_stop(); + udma_ctrl_datamove1_size_stop_set(udma_ctrl_base, 1); + } + + /* handle current task end */ + if (task != NULL) + { + pi_task_push_irq_safe(task); + } + + itf_data->end_task = NULL; + + /* start new task if needed */ + pi_task_t *next_task = __pi_udma_datamove_task_fifo_pop(itf_data); + if (next_task) + { + itf_data->end_task = next_task; + __pi_udma_datamove_copy_start(itf_data); + } +} + +/******************************************************************************* + * API implementation + ******************************************************************************/ + +void pi_udma_datamove_conf_init(pi_udma_datamove_conf_t *conf) +{ + conf->device_id = 0; + conf->src_trf_cfg.type = PI_UDMA_DATAMOVE_TRF_LINEAR; + conf->src_trf_cfg.row_len = 0; + conf->src_trf_cfg.stride = 0; + conf->dst_trf_cfg.type = 
PI_UDMA_DATAMOVE_TRF_LINEAR; + conf->dst_trf_cfg.row_len = 0; + conf->dst_trf_cfg.stride = 0; +} + +int pi_udma_datamove_open(pi_device_t *device) +{ + int status = PI_OK; + uint32_t irq = pi_irq_disable(); + pi_udma_datamove_conf_t *conf = (pi_udma_datamove_conf_t*) device->config; + + if (conf->device_id >= __UDMA_NB_DATAMOVE) + { + pi_irq_restore(irq); + return PI_FAIL; + } + + pi_udma_datamove_itf_data_t* itf_data = g_udma_datamove_itf_data[conf->device_id]; + + if (NULL == itf_data) + { + /* allocate itf data */ + itf_data = pi_fc_l1_malloc(sizeof(pi_udma_datamove_itf_data_t)); + if (NULL == itf_data) + { + pi_irq_restore(irq); + return PI_ERR_NO_MEM; + } + g_udma_datamove_itf_data[conf->device_id] = itf_data; + + /* allocate lin channels */ + itf_data->rx_lin_chan_id = pi_udma_core_lin_alloc(); + itf_data->tx_lin_chan_id = pi_udma_core_lin_alloc(); + if ((0 > itf_data->rx_lin_chan_id) || (0 > itf_data->tx_lin_chan_id)) + { + pi_udma_core_lin_free(itf_data->rx_lin_chan_id); + pi_udma_core_lin_free(itf_data->tx_lin_chan_id); + pi_fc_l1_free(itf_data, sizeof(pi_udma_datamove_itf_data_t)); + g_udma_datamove_itf_data[conf->device_id] = NULL; + pi_irq_restore(irq); + return PI_FAIL; + } + + /* enable lin events */ + pi_soc_eu_fc_mask_set(SOC_EVENT_UDMA_CHAN_LIN(itf_data->rx_lin_chan_id)); + pi_fc_event_handler_set(SOC_EVENT_UDMA_CHAN_LIN(itf_data->rx_lin_chan_id), + __pi_udma_datamove_event_handler, itf_data); + /* allocate 2d channels */ + itf_data->rx_2d_chan_id = pi_udma_core_2d_alloc(); + itf_data->tx_2d_chan_id = pi_udma_core_2d_alloc(); + if ((itf_data->rx_2d_chan_id < 0) || (itf_data->tx_2d_chan_id < 0)) + { + pi_udma_core_2d_free(itf_data->rx_2d_chan_id); + pi_udma_core_2d_free(itf_data->tx_2d_chan_id); + pi_udma_core_lin_free(itf_data->rx_lin_chan_id); + pi_soc_eu_fc_mask_clear(SOC_EVENT_UDMA_CHAN_LIN(itf_data->rx_lin_chan_id)); + pi_udma_core_lin_free(itf_data->tx_lin_chan_id); + pi_fc_l1_free(itf_data, sizeof(pi_udma_datamove_itf_data_t)); + 
g_udma_datamove_itf_data[conf->device_id] = NULL; + pi_irq_restore(irq); + return PI_FAIL; + } + /* enable 2D events */ + pi_soc_eu_fc_mask_set(itf_data->rx_2d_chan_id); + pi_fc_event_handler_set(itf_data->rx_2d_chan_id, + __pi_udma_datamove_event_handler, itf_data); + + /* Initialize itf data */ + itf_data->nb_open = 1; + itf_data->end_task = NULL; + itf_data->fifo_head = NULL; + itf_data->fifo_tail = NULL; + itf_data->device_id = conf->device_id; + } + else + { + itf_data->nb_open++; + } + + /* allocate device data */ + device->data = pi_fc_l1_malloc(sizeof(pi_udma_datamove_data_t)); + if (NULL == device->data) + { + /* the device we are opening initialized the interface */ + /* we need to close it */ + if (itf_data->nb_open == 1) + { + /* clear events, disable IRQs && free allocated udma channels */ + pi_udma_core_2d_free(itf_data->rx_2d_chan_id); + pi_udma_core_2d_free(itf_data->tx_2d_chan_id); + pi_udma_core_lin_free(itf_data->rx_lin_chan_id); + pi_udma_core_lin_free(itf_data->tx_lin_chan_id); + pi_soc_eu_fc_mask_clear(SOC_EVENT_UDMA_CHAN_LIN(itf_data->rx_lin_chan_id)); + pi_soc_eu_fc_mask_clear(SOC_EVENT_UDMA_CHAN_2D((itf_data->rx_2d_chan_id - UDMA_NB_CHAN_LIN))); + /* free itf data */ + g_udma_datamove_itf_data[itf_data->device_id] = NULL; + pi_fc_l1_free(itf_data, sizeof(pi_udma_datamove_itf_data_t)); + } + pi_irq_restore(irq); + return PI_ERR_NO_MEM; + } + + /* initialize device data */ + pi_udma_datamove_data_t* dev_data = (pi_udma_datamove_data_t*) device->data; + dev_data->itf_data = itf_data; + dev_data->src_trf_cfg.type = conf->src_trf_cfg.type; + dev_data->src_trf_cfg.row_len = conf->src_trf_cfg.row_len; + dev_data->src_trf_cfg.stride = conf->src_trf_cfg.stride; + dev_data->dst_trf_cfg.type = conf->dst_trf_cfg.type; + dev_data->dst_trf_cfg.row_len = conf->dst_trf_cfg.row_len; + dev_data->dst_trf_cfg.stride = conf->dst_trf_cfg.stride; + + pi_irq_restore(irq); + return status; +} + +void pi_udma_datamove_close(pi_device_t *device) +{ + uint32_t irq = 
pi_irq_disable(); + pi_udma_datamove_data_t* dev_data = (pi_udma_datamove_data_t*) device->data; + pi_udma_datamove_itf_data_t* itf_data = dev_data->itf_data; + + itf_data->nb_open--; + + if (0 == itf_data->nb_open) + { + /* clear events, disable IRQs && free allocated udma channels */ + pi_udma_core_2d_free(itf_data->rx_2d_chan_id); + pi_udma_core_2d_free(itf_data->tx_2d_chan_id); + pi_udma_core_lin_free(itf_data->rx_lin_chan_id); + pi_udma_core_lin_free(itf_data->tx_lin_chan_id); + pi_soc_eu_fc_mask_clear(SOC_EVENT_UDMA_CHAN_LIN(itf_data->rx_lin_chan_id)); + pi_soc_eu_fc_mask_clear(SOC_EVENT_UDMA_CHAN_2D((itf_data->rx_2d_chan_id - UDMA_NB_CHAN_LIN))); + /* free itf data */ + g_udma_datamove_itf_data[itf_data->device_id] = NULL; + pi_fc_l1_free(itf_data, sizeof(pi_udma_datamove_itf_data_t)); + } + + /* free device data */ + pi_fc_l1_free(dev_data, sizeof(pi_udma_datamove_data_t)); + + pi_irq_restore(irq); +} + +int32_t pi_udma_datamove_copy_async(pi_device_t *device, void* src, void* dst, + uint32_t len, pi_task_t* task) +{ + uint32_t irq = pi_irq_disable(); + + pi_udma_datamove_data_t *dev_data = (pi_udma_datamove_data_t*) device->data; + task->data[0] = (uint32_t) src; + task->data[1] = (uint32_t) dst; + task->data[2] = (uint32_t) len; + task->data[3] = (uint32_t) dev_data; + + pi_udma_datamove_itf_data_t *itf_data = dev_data->itf_data; + + /* if a request is in progress, enqueue this one */ + /* else, execute it */ + if (NULL == itf_data->end_task) + { + itf_data->end_task = task; + __pi_udma_datamove_copy_start(itf_data); + } + else + { + __pi_udma_datamove_task_fifo_enqueue(itf_data, task); + } + + pi_irq_restore(irq); + return PI_OK; +} + +int32_t pi_udma_datamove_copy(pi_device_t *device, void* src, void* dst, uint32_t len) +{ + pi_task_t task; + pi_task_block(&task); + int32_t status = pi_udma_datamove_copy_async(device, src, dst, len, &task); + if (PI_OK == status) + { + pi_task_wait_on(&task); + } + pi_task_destroy(&task); + return status; +} diff --git 
a/rtos/pmsis/pmsis_implem/chips/gap9/drivers/udma/udma_ffc.c b/rtos/pmsis/pmsis_implem/chips/gap9/drivers/udma/udma_ffc.c new file mode 100644 index 000000000..794b0950d --- /dev/null +++ b/rtos/pmsis/pmsis_implem/chips/gap9/drivers/udma/udma_ffc.c @@ -0,0 +1,517 @@ +/* + * Copyright (c) 2020, GreenWaves Technologies, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * o Redistributions of source code must retain the above copyright notice, this list + * of conditions and the following disclaimer. + * + * o Redistributions in binary form must reproduce the above copyright notice, this + * list of conditions and the following disclaimer in the documentation and/or + * other materials provided with the distribution. + * + * o Neither the name of GreenWaves Technologies, Inc. nor the names of its + * contributors may be used to endorse or promote products derived from this + * software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON + * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "pmsis.h" +#include "chips/gap9/drivers/udma/udma_core.h" + +#if !defined(__FREERTOS__) +#include +#endif /* */ + +/******************************************************************************* + * Definitions + ******************************************************************************/ + +/******************************************************************************* + * Driver data + ******************************************************************************/ + +typedef struct pi_ffc_data_s +{ + struct pi_ffc_itf_data_s *itf_data; + pi_ffc_fixed_type_e fixed_type; + uint32_t fixed_scale; + uint32_t fixed_precision; + pi_ffc_float_type_e float_type; + pi_ffc_mode_e mode; + pi_ffc_io_mode_e io_mode; + uint8_t continuous_mode; +} pi_ffc_data_t; + +typedef struct pi_ffc_itf_data_s +{ + pi_task_t *fifo_head; /*!< head of the tasks FIFO */ + pi_task_t *fifo_tail; /*!< tail of the tasks FIFO */ + pi_task_t *end_task; /*!< current callback task */ + void* latest_conf; /*!< last used FFC configuration */ + int32_t rx_chan_id; /*!< RX udma channel */ + int32_t tx_chan_id; /*!< TX udma channel */ + uint8_t device_id; /*!< Device ID */ + int32_t nb_open; /*!< number of devices opened */ +} pi_ffc_itf_data_t; + +static pi_ffc_itf_data_t* g_ffc_itf_data[ARCHI_UDMA_NB_FFC]; + +/******************** + * Static functions + *******************/ + +static inline uint32_t __ffc_compute_udma_lin_shift(uint8_t is_rx, + pi_ffc_mode_e mode, + pi_ffc_float_type_e fl_type, + pi_ffc_fixed_type_e fp_type) +{ + uint32_t fp_shift = 0; + uint32_t fl_shift = 0; + switch (fl_type) + { + case PI_FFC_FLOAT_FP16: + case PI_FFC_FLOAT_BFP16: + fl_shift = 1; + break; + + default: + fl_shift = 2; + break; + } + + switch (fp_type) + { + case PI_FFC_FIXED_8: + fp_shift = 0; + break; + + case PI_FFC_FIXED_16: + fp_shift = 1; + break; + + default: + fp_shift = 2; + break; + } + + if (0 != is_rx) + { + return (PI_FFC_FIXED_TO_FLOAT == mode) ? 
fp_shift : fl_shift; + } + else + { + return (PI_FFC_FLOAT_TO_FIXED == mode) ? fp_shift : fl_shift; + } +} + +// Has to be synchronized with irq_disabled(done in convert_async) +// since irq handler might pop at the same time +static inline void __ffc_drv_fifo_enqueue(pi_ffc_itf_data_t *itf_data, + pi_task_t *pi_task) +{ + //uint32_t irq = pi_irq_disable(); + if (itf_data->fifo_tail) + { + // tail insert + itf_data->fifo_tail->next = pi_task; + itf_data->fifo_tail = itf_data->fifo_tail->next; + itf_data->fifo_tail->next = NULL; + } + else + { + // Initialize the list + itf_data->fifo_head = pi_task; + itf_data->fifo_head->next = NULL; + // set the base tail + itf_data->fifo_tail = itf_data->fifo_head; + } + //pi_irq_restore(irq); +} + +static inline pi_task_t *__ffc_drv_fifo_pop(pi_ffc_itf_data_t *itf_data) +{ + pi_task_t *ret_task = NULL; + if (itf_data->fifo_head != NULL) + { + ret_task = itf_data->fifo_head; + hal_compiler_barrier(); + itf_data->fifo_head = itf_data->fifo_head->next; + if (itf_data->fifo_head == NULL) + { + itf_data->fifo_tail = NULL; + } + } + return ret_task; +} + +static void __pi_ffc_change_continuous_mode(pi_ffc_data_t* dev_data, + uint8_t continuous_mode) +{ + uint32_t base = (uint32_t) UDMA_FFC_ADDR(dev_data->itf_data->device_id); + + if (continuous_mode != dev_data->continuous_mode) + { + if (continuous_mode != 0) + { + udma_ffc_start_set(base, 1); + } + else + { + udma_ffc_start_set(base, 0); + } + dev_data->continuous_mode = continuous_mode; + } +} + +static void __pi_ffc_change_event_source(pi_ffc_itf_data_t* itf_data) +{ + /* change event source according to io mode */ + uint32_t base = (uint32_t) UDMA_FFC_ADDR(itf_data->device_id); + + pi_ffc_data_t* dev_data = (pi_ffc_data_t*) itf_data->end_task->data[3]; + + switch(dev_data->io_mode) + { + case PI_FFC_MEMORY_IN_MEMORY_OUT: + //fallthrough + case PI_FFC_STREAM_IN_MEMORY_OUT: + { + /* use output channel as event source */ + 
pi_soc_eu_fc_mask_clear(SOC_EVENT_UDMA_CHAN_LIN(itf_data->tx_chan_id));
+            pi_soc_eu_fc_mask_set(SOC_EVENT_UDMA_CHAN_LIN(itf_data->rx_chan_id));
+            break;
+        }
+
+        case PI_FFC_MEMORY_IN_STREAM_OUT:
+        {
+            /* use input channel as event source */
+            pi_soc_eu_fc_mask_clear(SOC_EVENT_UDMA_CHAN_LIN(itf_data->rx_chan_id));
+            pi_soc_eu_fc_mask_set(SOC_EVENT_UDMA_CHAN_LIN(itf_data->tx_chan_id));
+            break;
+        }
+
+        case PI_FFC_STREAM_IN_STREAM_OUT:
+        {
+            /* FFC has no control over data flow, no event source */
+            pi_soc_eu_fc_mask_clear(SOC_EVENT_UDMA_CHAN_LIN(itf_data->rx_chan_id));
+            pi_soc_eu_fc_mask_clear(SOC_EVENT_UDMA_CHAN_LIN(itf_data->tx_chan_id));
+            break;
+        }
+
+        default:
+            //invalid choice
+            break;
+    }
+}
+
+/**
+ * Program the FFC registers for the task currently held in end_task.
+ *
+ * The per-device settings are only rewritten when the device attached to the
+ * task (task->data[3]) differs from the one applied last (latest_conf); the
+ * channel destinations and the stream configuration are written every time.
+ * The caller must have set itf_data->end_task to a non-NULL task.
+ */
+static void __pi_ffc_conf_apply(pi_ffc_itf_data_t* itf_data)
+{
+    uint32_t base = (uint32_t) UDMA_FFC_ADDR(itf_data->device_id);
+
+    /* device data travels with the task, see pi_ffc_convert_async() */
+    pi_ffc_data_t* dev_data = (pi_ffc_data_t*) itf_data->end_task->data[3];
+
+    /* reconfigure only when the requesting device changed */
+    if (itf_data->latest_conf != dev_data)
+    {
+        itf_data->latest_conf = dev_data;
+
+        udma_ffc_fp_format_set(base, dev_data->fixed_type);
+        udma_ffc_fl_format_set(base, dev_data->float_type);
+        udma_ffc_fp_prec_set(base, dev_data->fixed_precision);
+        udma_ffc_fp_scale_set(base, dev_data->fixed_scale);
+        udma_ffc_mode_direction_set(base, dev_data->mode);
+        udma_ffc_mode_io_mode_set(base, dev_data->io_mode);
+        __pi_ffc_change_event_source(itf_data);
+    }
+    udma_ffc_rx_dest_set(base, itf_data->rx_chan_id);
+    udma_ffc_tx_dest_set(base, itf_data->tx_chan_id);
+
+    /* set stream as blocking */
+    udma_ctrl_stream_cfg_set((uint32_t) ARCHI_UDMA_ADDR, 1 << (18 + itf_data->device_id));
+}
+
+/**
+ * Start the conversion described by itf_data->end_task.
+ *
+ * Task payload layout: data[0] = source, data[1] = destination,
+ * data[2] = number of elements, data[3] = pi_ffc_data_t of the requester.
+ * Does nothing when no task is pending.
+ */
+static void __pi_ffc_conversion_start(pi_ffc_itf_data_t* itf_data)
+{
+    uint32_t chan_id;
+
+    if (NULL == itf_data->end_task)
+    {
+        return;
+    }
+
+    uint32_t src = itf_data->end_task->data[0];
+    uint32_t dst = itf_data->end_task->data[1];
+    uint32_t len = itf_data->end_task->data[2];
+    pi_ffc_data_t* dev_data = (pi_ffc_data_t*) itf_data->end_task->data[3];
+
+    __pi_ffc_conf_apply(itf_data);
+
+    uint32_t base = (uint32_t) UDMA_FFC_ADDR(itf_data->device_id);
+
+    /* launch the conversion */
+    if (dev_data->continuous_mode != 0)
+    {
+        /* continuous mode: conversion count is 0 and no start is issued here */
+        udma_ffc_conv_num_set(base, 0);
+    }
+    else
+    {
+        udma_ffc_conv_num_set(base, len);
+        udma_ffc_start_set(base, 1);
+    }
+
+    /* setup & launching channels */
+
+    /* io_mode bit 1 clear: a linear channel is enqueued on the destination buffer */
+    if (0 == (dev_data->io_mode & 2))
+    {
+        chan_id = udma_ffc_rx_dest_get(base);
+        uint32_t udma_core = pi_udma_core_lin_addr_get(chan_id);
+        uint32_t rx_shift = __ffc_compute_udma_lin_shift(0, dev_data->mode, dev_data->float_type, dev_data->fixed_type);
+        pi_udma_core_lin_enqueue(udma_core, (uint32_t) dst, len << rx_shift, 0);
+    }
+
+    /* io_mode bit 0 clear: a linear channel is enqueued on the source buffer */
+    if (0 == (dev_data->io_mode & 1))
+    {
+        chan_id = udma_ffc_tx_dest_get(base);
+        uint32_t udma_core = pi_udma_core_lin_addr_get(chan_id);
+        uint32_t tx_shift = __ffc_compute_udma_lin_shift(1, dev_data->mode, dev_data->float_type, dev_data->fixed_type);
+        pi_udma_core_lin_enqueue(udma_core, (uint32_t) src, len << tx_shift, 0);
+    }
+}
+
+/********************
+ * Callback
+ ********************/
+
+/**
+ * uDMA channel event handler: completes the running task and starts the next
+ * queued one, if any.
+ *
+ * @param event  Event number (unused).
+ * @param arg    Interface data registered in pi_ffc_open().
+ */
+static void __pi_ffc_event_handler(uint32_t event, void *arg)
+{
+
+    pi_ffc_itf_data_t* itf_data = (pi_ffc_itf_data_t*) arg;
+    pi_task_t* task = itf_data->end_task;
+
+    /* handle current task end */
+    if (task != NULL)
+    {
+        pi_task_push_irq_safe(task);
+    }
+
+    itf_data->end_task = NULL;
+
+    /* start new task if needed */
+    pi_task_t *next_task = __ffc_drv_fifo_pop(itf_data);
+    if (next_task)
+    {
+        itf_data->end_task = next_task;
+        __pi_ffc_conversion_start(itf_data);
+    }
+}
+
+
+/*******************************************************************************
+ * API implementation
+ ******************************************************************************/
+
+/**
+ * Fill a configuration with the driver defaults: interface 0, float-to-fixed,
+ * memory in / memory out, 32-bit fixed point, scale 0, precision 0, FP32.
+ */
+void pi_ffc_conf_init(pi_ffc_conf_t *conf)
+{
+    conf->itf = 0;
+    conf->mode = PI_FFC_FLOAT_TO_FIXED;
+    conf->io_mode = PI_FFC_MEMORY_IN_MEMORY_OUT;
+    conf->fixed_type = PI_FFC_FIXED_32;
+    conf->fixed_scale = 0;
+    conf->fixed_precision = 0;
+    conf->float_type = PI_FFC_FLOAT_FP32;
+}
+
+/**
+ * Open an FFC device on the interface selected by its configuration.
+ *
+ * The first open of an interface allocates the shared interface data and two
+ * linear uDMA channels; later opens only bump the reference count. All
+ * allocations are performed before any global or hardware state is touched,
+ * so a failure leaves the driver exactly as it was before the call.
+ *
+ * @return PI_OK on success, PI_ERR_NO_MEM if the interface data cannot be
+ *         allocated, PI_FAIL if a uDMA channel cannot be allocated,
+ *         PI_ERR_L2_NO_MEM if the device data cannot be allocated
+ *         (device->data is NULL in that case).
+ */
+int pi_ffc_open(pi_device_t *device)
+{
+    uint32_t irq = pi_irq_disable();
+    pi_ffc_conf_t *conf = (pi_ffc_conf_t*) device->config;
+    pi_ffc_itf_data_t *itf_data = g_ffc_itf_data[conf->itf];
+    const int first_open = (NULL == itf_data);
+    int32_t tx_chan_id = -1;
+    int32_t rx_chan_id = -1;
+
+    if (first_open)
+    {
+        /* allocate itf data */
+        itf_data = pi_fc_l1_malloc(sizeof(pi_ffc_itf_data_t));
+        if (NULL == itf_data)
+        {
+            pi_irq_restore(irq);
+            return PI_ERR_NO_MEM;
+        }
+
+        /* allocate 2 udma lin channels */
+        tx_chan_id = pi_udma_core_lin_alloc();
+        rx_chan_id = pi_udma_core_lin_alloc();
+        if (rx_chan_id < 0 || tx_chan_id < 0)
+        {
+            /* give back whichever channel was obtained */
+            if (tx_chan_id >= 0)
+            {
+                pi_udma_core_lin_free(tx_chan_id);
+            }
+            if (rx_chan_id >= 0)
+            {
+                pi_udma_core_lin_free(rx_chan_id);
+            }
+            pi_fc_l1_free(itf_data, sizeof(pi_ffc_itf_data_t));
+            pi_irq_restore(irq);
+            return PI_FAIL;
+        }
+    }
+
+    /* allocate device data */
+    device->data = pi_fc_l1_malloc(sizeof(pi_ffc_data_t));
+    if (NULL == device->data)
+    {
+        if (first_open)
+        {
+            /* nothing has been published yet: undo the local allocations */
+            pi_udma_core_lin_free(rx_chan_id);
+            pi_udma_core_lin_free(tx_chan_id);
+            pi_fc_l1_free(itf_data, sizeof(pi_ffc_itf_data_t));
+        }
+        pi_irq_restore(irq);
+        return PI_ERR_L2_NO_MEM;
+    }
+
+    if (first_open)
+    {
+        itf_data->rx_chan_id = rx_chan_id;
+        itf_data->tx_chan_id = tx_chan_id;
+
+        /* set both channels to trigger the event handler, io_mode will decide
+         * which one to use */
+        pi_fc_event_handler_set(SOC_EVENT_UDMA_CHAN_LIN(itf_data->rx_chan_id),
+                                __pi_ffc_event_handler, itf_data);
+        pi_fc_event_handler_set(SOC_EVENT_UDMA_CHAN_LIN(itf_data->tx_chan_id),
+                                __pi_ffc_event_handler, itf_data);
+        /* use rx as default */
+        pi_soc_eu_fc_mask_set(SOC_EVENT_UDMA_CHAN_LIN(itf_data->rx_chan_id));
+
+        itf_data->nb_open = 1;
+        itf_data->device_id = conf->itf;
+        itf_data->fifo_head = NULL;
+        itf_data->fifo_tail = NULL;
+        itf_data->end_task = NULL;
+        itf_data->latest_conf = NULL;
+
+        /* disable udma reset before setting regs */
+        uint32_t periph_id = ARCHI_UDMA_FFC_ID(itf_data->device_id);
+        udma_ctrl_cfg_rstn_set_set(ARCHI_UDMA_ADDR, (1 << periph_id));
+        udma_ctrl_cfg_cg_set_set(ARCHI_UDMA_ADDR, (1 << periph_id));
+
+        /* publish the interface only once it is fully set up */
+        g_ffc_itf_data[conf->itf] = itf_data;
+    }
+    else
+    {
+        itf_data->nb_open++;
+    }
+
+    /* initialize device data */
+    pi_ffc_data_t* dev_data = (pi_ffc_data_t*) device->data;
+    dev_data->itf_data = itf_data;
+    dev_data->fixed_type = conf->fixed_type;
+    dev_data->fixed_scale = conf->fixed_scale;
+    dev_data->fixed_precision = conf->fixed_precision;
+    dev_data->float_type = conf->float_type;
+    dev_data->mode = conf->mode;
+    dev_data->io_mode = conf->io_mode;
+    dev_data->continuous_mode = 0; /* continuous mode disabled by default */
+
+    pi_irq_restore(irq);
+    return PI_OK;
+}
+
+/**
+ * Close an FFC device. The last close of an interface resets and frees its
+ * uDMA channels, masks their events, puts the peripheral back under reset and
+ * clock gating, and frees the interface data. Closing a device whose open
+ * failed (device->data == NULL) is a no-op.
+ */
+void pi_ffc_close(pi_device_t *device)
+{
+    uint32_t irq = pi_irq_disable();
+    pi_ffc_data_t *dev_data = (pi_ffc_data_t*) device->data;
+    if (NULL == dev_data)
+    {
+        pi_irq_restore(irq);
+        return;
+    }
+    pi_ffc_itf_data_t* itf_data = dev_data->itf_data;
+
+    /* decrement number of devices opened */
+    itf_data->nb_open--;
+
+    if (itf_data->nb_open == 0)
+    {
+        /* clear events, disable IRQs & free allocated udma channels */
+        pi_udma_core_lin_reset(pi_udma_core_lin_addr_get(itf_data->rx_chan_id));
+        pi_udma_core_lin_reset(pi_udma_core_lin_addr_get(itf_data->tx_chan_id));
+
+        pi_udma_core_lin_free(itf_data->rx_chan_id);
+        pi_udma_core_lin_free(itf_data->tx_chan_id);
+
+        pi_soc_eu_fc_mask_clear(SOC_EVENT_UDMA_CHAN_LIN(itf_data->rx_chan_id));
+        pi_soc_eu_fc_mask_clear(SOC_EVENT_UDMA_CHAN_LIN(itf_data->tx_chan_id));
+
+        uint32_t periph_id = ARCHI_UDMA_FFC_ID(itf_data->device_id);
+        udma_ctrl_cfg_rstn_clr_set(ARCHI_UDMA_ADDR, (1 << periph_id));
+        udma_ctrl_cfg_cg_clr_set(ARCHI_UDMA_ADDR, (1 << periph_id));
+
+        /* free itf data */
+        g_ffc_itf_data[itf_data->device_id] = NULL;
+        pi_fc_l1_free(itf_data, sizeof(pi_ffc_itf_data_t));
+    }
+    /* free device data and drop the stale pointer */
+    pi_fc_l1_free(device->data, sizeof(pi_ffc_data_t));
+    device->data = NULL;
+    pi_irq_restore(irq);
+}
+
+/**
+ * Runtime control of an opened FFC device.
+ *
+ * PI_FFC_IOCTL_SET_IO_MODE takes the new io mode in arg,
+ * PI_FFC_IOCTL_CONTINUOUS_ENABLE takes the continuous-mode flag in arg. Both
+ * invalidate the cached configuration so it is re-applied on the next
+ * conversion.
+ *
+ * @return PI_OK, or PI_FAIL for an unknown command.
+ */
+int32_t pi_ffc_ioctl(pi_device_t *device, uint32_t cmd, void *arg)
+{
+    uint32_t irq = pi_irq_disable();
+    switch (cmd)
+    {
+        case PI_FFC_IOCTL_SET_IO_MODE:
+        {
+            pi_ffc_data_t *dev_data = (pi_ffc_data_t*) device->data;
+            dev_data->io_mode = (pi_ffc_io_mode_e)((uintptr_t) arg);
+            /* make last config invalid */
+            dev_data->itf_data->latest_conf = NULL;
+            break;
+        }
+
+        case PI_FFC_IOCTL_CONTINUOUS_ENABLE:
+        {
+            pi_ffc_data_t *dev_data = (pi_ffc_data_t*) device->data;
+
+            uint8_t continuous_mode = (uint8_t)((uintptr_t) arg);
+            __pi_ffc_change_continuous_mode(dev_data, continuous_mode);
+            /* make last config invalid */
+            dev_data->itf_data->latest_conf = NULL;
+            break;
+        }
+
+        default:
+            pi_irq_restore(irq);
+            return PI_FAIL;
+    }
+    pi_irq_restore(irq);
+    return PI_OK;
+}
+
+/** Blocking conversion: wraps pi_ffc_convert_async() with a blocking task. */
+void pi_ffc_convert(pi_device_t *device, void* src, void* dst, uint16_t size)
+{
+    pi_task_t block;
+    pi_task_block(&block);
+    pi_ffc_convert_async(device, src, dst, size, &block);
+    pi_task_wait_on(&block);
+    pi_task_destroy(&block);
+}
+
+/**
+ * Asynchronous conversion. The request is stored in the task (data[0..3]) and
+ * either started immediately or queued behind the conversion in progress.
+ */
+void pi_ffc_convert_async(pi_device_t* device, void* src, void* dst,
+                          uint16_t size, pi_task_t* task)
+{
+    uint32_t irq = pi_irq_disable();
+
+    pi_ffc_data_t *dev_data = (pi_ffc_data_t*) device->data;
+    task->data[0] = (uint32_t) src;
+    task->data[1] = (uint32_t) dst;
+    task->data[2] = (uint32_t) size;
+    task->data[3] = (uint32_t) dev_data;
+
+    pi_ffc_itf_data_t *itf_data = dev_data->itf_data;
+
+    /* if a request is in progress, enqueue this one */
+    /* else, execute it */
+    if (NULL == itf_data->end_task)
+    {
+        itf_data->end_task = task;
+        __pi_ffc_conversion_start(itf_data);
+    }
+    else
+    {
+        __ffc_drv_fifo_enqueue(itf_data, task);
+    }
+
+    pi_irq_restore(irq);
+}
diff --git a/rtos/pmsis/pmsis_implem/chips/gap9/drivers/udma/udma_timeout.c b/rtos/pmsis/pmsis_implem/chips/gap9/drivers/udma/udma_timeout.c
new file mode 100644
index 000000000..88f7e780c
--- /dev/null
+++ b/rtos/pmsis/pmsis_implem/chips/gap9/drivers/udma/udma_timeout.c
@@ -0,0 +1,349 @@
+/*
+ * Copyright (c) 2021, GreenWaves Technologies, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ *
+ * o Redistributions of source code must retain the above copyright notice, this list
+ * of conditions and the following disclaimer.
+ * + * o Redistributions in binary form must reproduce the above copyright notice, this + * list of conditions and the following disclaimer in the documentation and/or + * other materials provided with the distribution. + * + * o Neither the name of GreenWaves Technologies, Inc. nor the names of its + * contributors may be used to endorse or promote products derived from this + * software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON + * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "pmsis.h" +#include "chips/gap9/drivers/udma/udma_timeout.h" + +/******************************************************************************* + * Definitions + ******************************************************************************/ + +#if !defined(__FREERTOS__) +#define UDMA_NB_TIMEOUT ( 8 ) +#endif /* __FREERTOS__ */ + +/******************************************************************************* + * Driver data + ******************************************************************************/ + +static struct pi_udma_timeout_s *g_udma_timeout[UDMA_NB_TIMEOUT]; + +/******************************************************************************* + * Function declaration + ******************************************************************************/ + +/* Event handler. */ +static void __pi_udma_timeout_event_handler(uint32_t event, void *arg); + +/* Enqueue task in SW fifo. */ +static inline void __pi_udma_timeout_task_push(struct pi_udma_timeout_s *driver_data, + pi_task_t *task); + +/* Pop task from SW fifo. */ +static inline pi_task_t *__pi_udma_timeout_task_pop(struct pi_udma_timeout_s *driver_data); + +/* Start a UDMA timer, when in SW trigger mode. */ +static void __pi_udma_timeout_start(uint8_t timeout_id); + +/* Stop a UDMA timer. */ +static void __pi_udma_timeout_stop(uint8_t timeout_id); + +/******************************************************************************* + * Internal functions + ******************************************************************************/ + +/** TIMEOUT_PRE Register. 
*/
+static inline uint32_t __pi_udma_ctrl_timeout_prescaler_conf_get(uint8_t timeout_id)
+{
+    uint32_t reg_offset = (UDMA_CTRL_TIMEOUT_PRE0_OFFSET + (timeout_id << 3));
+    uint32_t prescaler = GAP_READ((uint32_t) ARCHI_UDMA_ADDR, reg_offset);
+    return prescaler;
+}
+
+/* Return the EN field of the prescaler register (non-zero when running). */
+static inline uint32_t __pi_udma_ctrl_timeout_prescaler_enabled(uint8_t timeout_id)
+{
+    uint32_t prescaler = __pi_udma_ctrl_timeout_prescaler_conf_get(timeout_id);
+    prescaler &= UDMA_CTRL_TIMEOUT_PRE0_EN_MASK;
+    prescaler >>= UDMA_CTRL_TIMEOUT_PRE0_EN_BIT;
+    return prescaler;
+}
+
+/* Overwrite the whole prescaler register with a count and an enable flag. */
+static inline void __pi_udma_ctrl_timeout_prescaler_set(uint8_t timeout_id,
+                                                        uint16_t presc_value,
+                                                        uint8_t enable)
+{
+    uint32_t reg_offset = (UDMA_CTRL_TIMEOUT_PRE0_OFFSET + (timeout_id << 3));
+    uint32_t prescaler = (UDMA_CTRL_TIMEOUT_PRE0_CNT(presc_value) |
+                          UDMA_CTRL_TIMEOUT_PRE0_EN(enable));
+    GAP_WRITE((uint32_t) ARCHI_UDMA_ADDR, reg_offset, prescaler);
+}
+
+/* Read-modify-write: set the CLR field of the prescaler register. */
+static inline void __pi_udma_ctrl_timeout_prescaler_reset(uint8_t timeout_id)
+{
+    uint32_t reg_offset = (UDMA_CTRL_TIMEOUT_PRE0_OFFSET + (timeout_id << 3));
+    udma_ctrl_timeout_pre0_t prescaler = {0};
+    prescaler.raw = GAP_READ((uint32_t) ARCHI_UDMA_ADDR, reg_offset);
+    prescaler.clr = 1;
+    GAP_WRITE((uint32_t) ARCHI_UDMA_ADDR, reg_offset, prescaler.raw);
+}
+
+/* Read-modify-write: set the EN field of the prescaler register. */
+static inline void __pi_udma_ctrl_timeout_prescaler_start(uint8_t timeout_id)
+{
+    uint32_t reg_offset = (UDMA_CTRL_TIMEOUT_PRE0_OFFSET + (timeout_id << 3));
+    udma_ctrl_timeout_pre0_t prescaler = {0};
+    prescaler.raw = GAP_READ((uint32_t) ARCHI_UDMA_ADDR, reg_offset);
+    prescaler.en = 1;
+    GAP_WRITE((uint32_t) ARCHI_UDMA_ADDR, reg_offset, prescaler.raw);
+}
+
+/* Read-modify-write: clear the EN field of the prescaler register. */
+static inline void __pi_udma_ctrl_timeout_prescaler_stop(uint8_t timeout_id)
+{
+    uint32_t reg_offset = (UDMA_CTRL_TIMEOUT_PRE0_OFFSET + (timeout_id << 3));
+    udma_ctrl_timeout_pre0_t prescaler = {0};
+    prescaler.raw = GAP_READ((uint32_t) ARCHI_UDMA_ADDR, reg_offset);
+    prescaler.en = 0;
+    GAP_WRITE((uint32_t) ARCHI_UDMA_ADDR, reg_offset, prescaler.raw);
+}
+
+
+/** TIMEOUT_CHX Register. */
+static inline uint32_t __pi_udma_ctrl_timeout_timeout_get(uint8_t timeout_id)
+{
+    uint32_t reg_offset = (UDMA_CTRL_TIMEOUT_CH0_OFFSET + (timeout_id << 3));
+    uint32_t timeout = GAP_READ((uint32_t) ARCHI_UDMA_ADDR, reg_offset);
+    return timeout;
+}
+
+/* Overwrite the whole channel register: source channel, mode, enable, count. */
+static inline void __pi_udma_ctrl_timeout_timeout_set(uint8_t timeout_id,
+                                                      uint8_t udma_chan_id,
+                                                      uint8_t mode,
+                                                      uint16_t timeout_val,
+                                                      uint8_t enable)
+{
+    uint32_t reg_offset = (UDMA_CTRL_TIMEOUT_CH0_OFFSET + (timeout_id << 3));
+    uint32_t timeout = (UDMA_CTRL_TIMEOUT_CH0_SOURCE_ID(udma_chan_id) |
+                        UDMA_CTRL_TIMEOUT_CH0_MODE(mode) |
+                        UDMA_CTRL_TIMEOUT_CH0_EN(enable) |
+                        UDMA_CTRL_TIMEOUT_CH0_CNT(timeout_val));
+    GAP_WRITE((uint32_t) ARCHI_UDMA_ADDR, reg_offset, timeout);
+}
+
+/* Read-modify-write: update the MODE field of the channel register. */
+static inline void __pi_udma_ctrl_timeout_mode_set(uint8_t timeout_id, uint8_t mode)
+{
+    uint32_t reg_offset = (UDMA_CTRL_TIMEOUT_CH0_OFFSET + (timeout_id << 3));
+    udma_ctrl_timeout_ch0_t timeout = {0};
+    timeout.raw = GAP_READ((uint32_t) ARCHI_UDMA_ADDR, reg_offset);
+    timeout.mode = mode;
+    GAP_WRITE((uint32_t) ARCHI_UDMA_ADDR, reg_offset, timeout.raw);
+}
+
+/* Read-modify-write: set the EN field of the channel register. */
+static inline void __pi_udma_ctrl_timeout_timeout_start(uint8_t timeout_id)
+{
+    uint32_t reg_offset = (UDMA_CTRL_TIMEOUT_CH0_OFFSET + (timeout_id << 3));
+    udma_ctrl_timeout_ch0_t timeout = {0};
+    timeout.raw = GAP_READ((uint32_t) ARCHI_UDMA_ADDR, reg_offset);
+    timeout.en = 1;
+    GAP_WRITE((uint32_t) ARCHI_UDMA_ADDR, reg_offset, timeout.raw);
+}
+
+/* Read-modify-write: clear the EN field of the channel register. */
+static inline void __pi_udma_ctrl_timeout_timeout_stop(uint8_t timeout_id)
+{
+    uint32_t reg_offset = (UDMA_CTRL_TIMEOUT_CH0_OFFSET + (timeout_id << 3));
+    udma_ctrl_timeout_ch0_t timeout = {0};
+    timeout.raw = GAP_READ((uint32_t) ARCHI_UDMA_ADDR, reg_offset);
+    timeout.en = 0;
+    GAP_WRITE((uint32_t) ARCHI_UDMA_ADDR, reg_offset, timeout.raw);
+}
+
+
+/**
+ * Timeout event handler.
+ *
+ * Pops the pending task, stops the timer, then calls the abort callback stored
+ * in task->arg[2] with task->arg[3], stores -1 in task->arg[2] as the transfer
+ * result and releases the task.
+ *
+ * @param event  Event number (unused).
+ * @param arg    Timeout id, passed by value through the pointer.
+ */
+static void __pi_udma_timeout_event_handler(uint32_t event, void *arg)
+{
+    uint32_t timeout_id = (uint32_t) arg;
+    //TIMEOUT_TRACE("Timeout id=%ld event\n", timeout_id);
+    struct pi_udma_timeout_s *driver_data = g_udma_timeout[timeout_id];
+    struct pi_task *task = __pi_udma_timeout_task_pop(driver_data);
+    __pi_udma_timeout_stop(timeout_id);
+    if (task != NULL)
+    {
+        //TIMEOUT_TRACE("Handle task=%lx\n", task);
+        /* Timeout reached, abort transfer. */
+        pi_callback_func_t func = (pi_callback_func_t) task->arg[2];
+        func((void *) task->arg[3]);
+
+        /* Set transfer end result. */
+        task->arg[2] = -1;
+
+        /* Release event task. */
+        pi_task_push(task);
+    }
+}
+
+/**
+ * Append a task to the SW fifo. A head equal to NULL or to the 0xFFFFFFFF
+ * marker written by pi_udma_timeout_alloc() both mean "empty".
+ */
+static inline void __pi_udma_timeout_task_push(struct pi_udma_timeout_s *driver_data,
+                                               pi_task_t *task)
+{
+    uint32_t irq = disable_irq();
+    /* terminate the list so that popping the last task yields NULL */
+    task->next = NULL;
+    if ((driver_data->fifo_head == NULL) || (driver_data->fifo_head == (void *) 0xFFFFFFFF))
+    {
+        driver_data->fifo_head = task;
+    }
+    else
+    {
+        driver_data->fifo_tail->next = task;
+    }
+    driver_data->fifo_tail = task;
+    restore_irq(irq);
+}
+
+/**
+ * Pop the oldest task from the SW fifo.
+ *
+ * @return The task, or NULL when the timeout is not allocated or its fifo is
+ *         empty (head is NULL or still holds the 0xFFFFFFFF marker).
+ */
+static inline pi_task_t *__pi_udma_timeout_task_pop(struct pi_udma_timeout_s *driver_data)
+{
+    pi_task_t *task_return = NULL;
+    if ((driver_data != NULL) &&
+        (driver_data->fifo_head != NULL) &&
+        (driver_data->fifo_head != (void *) 0xFFFFFFFF))
+    {
+        task_return = driver_data->fifo_head;
+        driver_data->fifo_head = driver_data->fifo_head->next;
+    }
+    return task_return;
+}
+
+/* Restart sequence: stop and clear the prescaler, enable the channel, then
+ * enable the prescaler. */
+static void __pi_udma_timeout_start(uint8_t timeout_id)
+{
+    __pi_udma_ctrl_timeout_prescaler_stop(timeout_id);
+    __pi_udma_ctrl_timeout_prescaler_reset(timeout_id);
+    __pi_udma_ctrl_timeout_timeout_start(timeout_id);
+    __pi_udma_ctrl_timeout_prescaler_start(timeout_id);
+}
+
+/* Disable both the prescaler and the channel. */
+static void __pi_udma_timeout_stop(uint8_t timeout_id)
+{
+    __pi_udma_ctrl_timeout_prescaler_stop(timeout_id);
+    __pi_udma_ctrl_timeout_timeout_stop(timeout_id);
+}
+
+/**
+ * Attach a task and a duration to an allocated timeout.
+ *
+ * The duration is converted to ticks of ARCHI_FLL_REF_CLOCK; while the tick
+ * count does not fit in 16 bits it is halved and the prescaler value doubled.
+ * The timer is enabled immediately unless the timeout was allocated in
+ * PI_UDMA_TIMEOUT_MODE_SW_TRIGGER mode.
+ *
+ * @return 0 on success, -11 if the id is out of range, not allocated, or the
+ *         timeout is already running.
+ */
+int32_t pi_udma_timeout_config_set(pi_task_t *task, uint8_t timeout_id,
+                                   uint8_t udma_chan_id, uint32_t timeout_us)
+{
+    int32_t status = 0;
+    if ((timeout_id >= (uint32_t) UDMA_NB_TIMEOUT) || (g_udma_timeout[timeout_id] == NULL))
+    {
+        TIMEOUT_TRACE_ERR("Timeout id=%ld not allocated\n", timeout_id);
+        return -11;
+    }
+    /* Fast clock used by timeout. */
+    float periph_freq = (float) ARCHI_FLL_REF_CLOCK; /* 24MHz. */
+    float nb_tick_us = periph_freq / 1000000.0;
+    float timeout_val = ((float) timeout_us) * nb_tick_us;
+    uint16_t prescaler = 0;
+    TIMEOUT_TRACE("Timeout_%d : configure udma_chan=%d, timeout_us=%ld, task=%lx\n",
+                  timeout_id, udma_chan_id, timeout_us, task);
+    TIMEOUT_TRACE("Periph_freq=%f, timeout_us=%ld, nb_tick_us=%f, timeout_val=%f\n",
+                  periph_freq, timeout_us, nb_tick_us, timeout_val);
+    prescaler = (timeout_val > 0xFFFF);
+    while (timeout_val > 0xFFFF)
+    {
+        prescaler <<= 1;
+        timeout_val /= 2;
+    }
+    if (__pi_udma_ctrl_timeout_prescaler_enabled(timeout_id))
+    {
+        /* Timeout already in use. */
+        TIMEOUT_TRACE_ERR("Timeout id=%ld already in use\n", timeout_id);
+        return -11;
+    }
+
+    __pi_udma_timeout_task_push(g_udma_timeout[timeout_id], task);
+
+    uint32_t mode = g_udma_timeout[timeout_id]->mode;
+    uint8_t enable = (mode != PI_UDMA_TIMEOUT_MODE_SW_TRIGGER);
+    TIMEOUT_TRACE("Timeout_%d : mode=%d, prescaler=%d, timeout=%f, enable=%d\n",
+                  timeout_id, mode, prescaler, timeout_val, enable);
+    __pi_udma_ctrl_timeout_prescaler_set(timeout_id, prescaler, enable);
+    __pi_udma_ctrl_timeout_timeout_set(timeout_id, udma_chan_id, mode,
+                                       (uint16_t) timeout_val, enable);
+    __pi_udma_ctrl_timeout_prescaler_reset(timeout_id);
+    return status;
+}
+
+/* Remove and return the oldest task attached to a timeout (NULL if none). */
+pi_task_t *__pi_udma_timeout_task_remove(uint8_t timeout_id)
+{
+    return __pi_udma_timeout_task_pop(g_udma_timeout[timeout_id]);
+}
+
+/*******************************************************************************
+ * API implementation
+ ******************************************************************************/
+
+/**
+ * Allocate a UDMA timeout.
+ *
+ * A slot is taken when it has no driver struct, or when its struct has a NULL
+ * fifo head. In the second case the existing struct is reused instead of being
+ * overwritten by a fresh allocation, which used to leak it.
+ *
+ * @return The timeout id, -1 if every slot is busy, -11 if allocation failed.
+ */
+int32_t pi_udma_timeout_alloc(pi_udma_timeout_mode_e mode)
+{
+    int32_t timeout_id = -1;
+    for (uint32_t tid = 0; tid < (uint32_t) UDMA_NB_TIMEOUT; tid++)
+    {
+        if ((g_udma_timeout[tid] == NULL) || (g_udma_timeout[tid]->fifo_head == NULL))
+        {
+            if (g_udma_timeout[tid] == NULL)
+            {
+                /* Alloc UDMA timeout struct. */
+                g_udma_timeout[tid] = pi_fc_l1_malloc(sizeof(struct pi_udma_timeout_s));
+                if (g_udma_timeout[tid] == NULL)
+                {
+                    TIMEOUT_TRACE_ERR("Timeout struct alloc failed !\n");
+                    timeout_id = -11;
+                    break;
+                }
+            }
+            /* Marker: allocated, no task queued yet. */
+            g_udma_timeout[tid]->fifo_head = (void *) 0xFFFFFFFF;
+            g_udma_timeout[tid]->mode = mode;
+            /* Set FC event handler. */
+            pi_fc_event_handler_set(SOC_EVENT_UDMA_TIMEOUT(tid),
+                                    __pi_udma_timeout_event_handler,
+                                    (void *) tid);
+            /* Enable SoC events propagation to FC. */
+            pi_soc_eu_fc_mask_set(SOC_EVENT_UDMA_TIMEOUT(tid));
+            timeout_id = tid;
+            TIMEOUT_TRACE("Timeout id=%ld allocated\n", timeout_id);
+            break;
+        }
+    }
+    return timeout_id;
+}
+
+/**
+ * Release a UDMA timeout. Out-of-range ids (including the negative values
+ * returned by a failed pi_udma_timeout_alloc()) and unallocated ids are
+ * ignored.
+ */
+void pi_udma_timeout_free(int32_t timeout_id)
+{
+    TIMEOUT_TRACE("Timeout_%ld : free timeout.\n", timeout_id);
+    if ((timeout_id < 0) || (timeout_id >= (int32_t) UDMA_NB_TIMEOUT) ||
+        (g_udma_timeout[timeout_id] == NULL))
+    {
+        return;
+    }
+    /* Disable SoC events propagation. */
+    pi_soc_eu_fc_mask_clear(SOC_EVENT_UDMA_TIMEOUT(timeout_id));
+    /* Clear FC event handler. */
+    pi_fc_event_handler_clear(SOC_EVENT_UDMA_TIMEOUT(timeout_id));
+    /* Free UDMA timeout struct and forget it, so the slot is never read again
+     * through a dangling pointer. */
+    g_udma_timeout[timeout_id]->fifo_head = NULL;
+    pi_fc_l1_free(g_udma_timeout[timeout_id], sizeof(struct pi_udma_timeout_s));
+    g_udma_timeout[timeout_id] = NULL;
+}
+
+/**
+ * Start or stop a timeout (PI_UDMA_TIMEOUT_IOCTL_START / _STOP).
+ *
+ * @return 0 on success, -1 for an unknown command.
+ */
+int32_t pi_udma_timeout_ioctl(int32_t timeout_id, uint32_t cmd, void *arg)
+{
+    TIMEOUT_TRACE("Timeout_%ld : ioctl cmd=%ld, arg=%lx\n", timeout_id, cmd, arg);
+    int32_t status = 0;
+    switch (cmd)
+    {
+    case PI_UDMA_TIMEOUT_IOCTL_START :
+        __pi_udma_timeout_start(timeout_id);
+        break;
+
+    case PI_UDMA_TIMEOUT_IOCTL_STOP :
+        __pi_udma_timeout_stop(timeout_id);
+        break;
+
+    default :
+        status = -1;
+    }
+    return status;
+}
diff --git a/rtos/pmsis/pmsis_implem/chips/gap9/drivers/udma/udma_timestamp.c b/rtos/pmsis/pmsis_implem/chips/gap9/drivers/udma/udma_timestamp.c
new file mode 100644
index 000000000..dbe04185c
--- /dev/null
+++ b/rtos/pmsis/pmsis_implem/chips/gap9/drivers/udma/udma_timestamp.c
@@ -0,0 +1,399 @@
+/*
+ * Copyright (c) 2020, GreenWaves Technologies, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ *
+ * o Redistributions of source code must retain the above copyright notice, this list
+ * of conditions and the following disclaimer.
+ *
+ * o Redistributions in binary form must reproduce the above copyright notice, this
+ * list of conditions and the following disclaimer in the documentation and/or
+ * other materials provided with the distribution.
+ *
+ * o Neither the name of GreenWaves Technologies, Inc. nor the names of its
+ * contributors may be used to endorse or promote products derived from this
+ * software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "pmsis.h"
+#include "udma_timestamp.h"
+
+/*******************************************************************************
+ * Definitions
+ ******************************************************************************/
+
+/* Fallback definitions used when not building for FreeRTOS; the trace macros
+ * expand to no-ops here. */
+#if !defined(__FREERTOS__)
+#define UDMA_TIMESTAMP_ID(id)         ( ARCHI_UDMA_TS_ID((id)) )
+#define UDMA_TIMESTAMP(id)            ( UDMA_TS_ADDR((id)) )
+#define UDMA_NB_TIMESTAMP             ( ARCHI_UDMA_NB_TS )
+#define TIMESTAMP_TRACE(...)          ( (void) 0 )
+#define TIMESTAMP_TRACE_ERR(...)      ( (void) 0 )
+#endif /* __FREERTOS__ */
+
+/*******************************************************************************
+ * Driver data
+ ******************************************************************************/
+
+static struct pi_udma_timestamp_cnt_t timestamp_cnt[UDMA_NB_TIMESTAMP];
+static struct pi_udma_ts_evt_t ts_evt[UDMA_NB_TIMESTAMP_EVT];
+static struct pi_udma_ts_input_t ts_input[UDMA_NB_TIMESTAMP_INPUT];
+/* Bitmaps of free slots: a set bit means the event / input slot is available. */
+static uint8_t evt_mask = 0xF;
+static uint8_t input_mask = 0xFF;
+
+/*******************************************************************************
+ * Function declaration
+ ******************************************************************************/
+
+/*******************************************************************************
+ * Internal functions
+ ******************************************************************************/
+
+/* Issue the counter-clear command on the timestamp block at 'base'. */
+static void __pi_udma_timestamp_cnt_clr(uint32_t base)
+{
+    udma_timestamp_reg_cmd_cnt_clr_set(base, 1);
+}
+
+/* Issue the counter-stop command on the timestamp block at 'base'. */
+static void __pi_udma_timestamp_cnt_stop(uint32_t base)
+{
+    udma_timestamp_reg_cmd_cnt_stop_set(base, 1);
+}
+
+/* Stop the counter, mark it disabled and write 0 to its external-trigger
+ * (only when a trigger GPIO was configured) and clock-configuration fields. */
+static void __pi_udma_timestamp_cnt_close(struct pi_udma_timestamp_cnt_t *ts)
+{
+    /* Stop the timestamp counter */
+    __pi_udma_timestamp_cnt_stop(ts->base);
+
+    ts->cnt_en = 0;
+
+    /* 0xFF means "no trigger GPIO", see pi_timestamp_conf_init() */
+    if (ts->cnt_trig_gpio != 0xFF)
+    {
+        udma_timestamp_reg_setup_cnt_ext_sel_set(ts->base, 0);
+        udma_timestamp_reg_setup_cnt_ext_type_set(ts->base, 0);
+        udma_timestamp_reg_setup_cnt_ext_en_set(ts->base, 0);
+    }
+
+    udma_timestamp_reg_clk_cfg_clk_mux_set(ts->base, 0);
+    udma_timestamp_reg_clk_cfg_gpio_sel_set(ts->base, 0);
+    udma_timestamp_reg_clk_cfg_pwm_sel_set(ts->base, 0);
+    udma_timestamp_reg_clk_cfg_prescaler_set(ts->base, 0);
+    udma_timestamp_reg_clk_cfg_clk_mux_en_set(ts->base, 0);
+
+    //TODO: clean the ts
+    //memset(ts,0,sizeof(struct pi_udma_timestamp_cnt_t));
+}
+
+/* Write 0xFF to the four compare-event fields of the uDMA control block. */
+static void __pi_udma_evt_cfg_init(void)
+{
+    uint32_t base = ARCHI_UDMA_ADDR;
+    // Init all the cfg event in udma ctrl to 0xFF
+    udma_ctrl_cfg_event_cmp_evt0_set(base, 0xFF);
+    udma_ctrl_cfg_event_cmp_evt1_set(base, 0xFF);
+    udma_ctrl_cfg_event_cmp_evt2_set(base, 0xFF);
+    udma_ctrl_cfg_event_cmp_evt3_set(base, 0xFF);
+}
+
+
+/* Reserve a free event slot from evt_mask and record its index in
+ * evt->ts_evt_id. */
+static int __pi_udma_timestamp_evt_alloc(uint32_t ts_base, pi_timestamp_event_t * evt)
+{
+    uint8_t src_id = 0;
+    if(evt_mask)
+    {
+        src_id = __builtin_pulp_fl1((evt_mask));
+        evt->ts_evt_id = src_id;
+        /* NOTE(review): the next statement is truncated in this copy of the
+         * patch - the text between "1<" and ">ts_evt_id]" is missing. It
+         * presumably held the end of this function and the beginning of
+         * __pi_udma_timestamp_evt_cfg(), which the ioctl dispatcher calls but
+         * which is not defined anywhere in the visible text. Restore it from
+         * the upstream file before applying. */
+        evt_mask &= ~(1<ts_evt_id].dest_id = evt->dest_id;
+
+    /* route the event slot to the requested destination */
+    switch (evt->ts_evt_id)
+    {
+        case 0:
+            udma_timestamp_reg_event_dest_id_evt_0_set(ts_base, evt->dest_id);
+            break;
+        case 1:
+            udma_timestamp_reg_event_dest_id_evt_1_set(ts_base, evt->dest_id);
+            break;
+        case 2:
+            udma_timestamp_reg_event_dest_id_evt_2_set(ts_base, evt->dest_id);
+            break;
+        case 3:
+            udma_timestamp_reg_event_dest_id_evt_3_set(ts_base, evt->dest_id);
+            break;
+        default:
+            TIMESTAMP_TRACE_ERR("Unknown timestamp event numbe= %d\n", evt->ts_evt_id);
+            break;
+    }
+    return 0;
+}
+
+/* Reserve a free input slot from input_mask, mirror the request in ts_input[]
+ * and program the matching chN setup fields (select, type, enable, dest id).
+ * Returns -1 from the "GPIO ID bigger than 63" branch, 0 otherwise. */
+static int __pi_udma_timestamp_input_set(uint32_t base, pi_timestamp_input_t * input)
+{
+
+    uint8_t src_id = 0;
+    if(input_mask)
+    {
+        src_id = __builtin_pulp_fl1((input_mask));
+        ts_input[src_id].ts_input_id = src_id;
+        ts_input[src_id].dest_id = input->dest_id;
+        ts_input[src_id].input_sel = input->input_sel;
+        ts_input[src_id].input_type = input->input_type;
+        input->ts_input_id = src_id;
+        /* NOTE(review): the next statement is truncated in this copy of the
+         * patch - the text between "1<" and ">input_sel" is missing
+         * (presumably the end of this if-block and the start of the
+         * input_sel range check). Restore it from the upstream file. */
+        input_mask &= ~(1<input_sel >> 6)
+    {
+        TIMESTAMP_TRACE_ERR("GPIO ID bigger than 63\n");
+        return -1;
+    }
+    else
+    {
+        /* one identical register sequence per input channel 0..7 */
+        switch (input->ts_input_id)
+        {
+            case 0:
+                udma_timestamp_reg_setup_ch0_input_sel_set(base, input->input_sel);
+                udma_timestamp_reg_setup_ch0_input_type_set(base, input->input_type);
+                udma_timestamp_reg_setup_ch0_input_en_set(base, 1);
+                udma_timestamp_reg_setup_ch0_dest_id_set(base, input->dest_id);
+                break;
+            case 1:
+                udma_timestamp_reg_setup_ch1_input_sel_set(base, input->input_sel);
+                udma_timestamp_reg_setup_ch1_input_type_set(base, input->input_type);
+                udma_timestamp_reg_setup_ch1_input_en_set(base, 1);
+                udma_timestamp_reg_setup_ch1_dest_id_set(base, input->dest_id);
+                break;
+            case 2:
+                udma_timestamp_reg_setup_ch2_input_sel_set(base, input->input_sel);
+                udma_timestamp_reg_setup_ch2_input_type_set(base, input->input_type);
+                udma_timestamp_reg_setup_ch2_input_en_set(base, 1);
+                udma_timestamp_reg_setup_ch2_dest_id_set(base, input->dest_id);
+                break;
+            case 3:
+                udma_timestamp_reg_setup_ch3_input_sel_set(base, input->input_sel);
+                udma_timestamp_reg_setup_ch3_input_type_set(base, input->input_type);
+                udma_timestamp_reg_setup_ch3_input_en_set(base, 1);
+                udma_timestamp_reg_setup_ch3_dest_id_set(base, input->dest_id);
+                break;
+            case 4:
+                udma_timestamp_reg_setup_ch4_input_sel_set(base, input->input_sel);
+                udma_timestamp_reg_setup_ch4_input_type_set(base, input->input_type);
+                udma_timestamp_reg_setup_ch4_input_en_set(base, 1);
+                udma_timestamp_reg_setup_ch4_dest_id_set(base, input->dest_id);
+                break;
+            case 5:
+                udma_timestamp_reg_setup_ch5_input_sel_set(base, input->input_sel);
+                udma_timestamp_reg_setup_ch5_input_type_set(base, input->input_type);
+                udma_timestamp_reg_setup_ch5_input_en_set(base, 1);
+                udma_timestamp_reg_setup_ch5_dest_id_set(base, input->dest_id);
+                break;
+            case 6:
+                udma_timestamp_reg_setup_ch6_input_sel_set(base, input->input_sel);
+                udma_timestamp_reg_setup_ch6_input_type_set(base, input->input_type);
+                udma_timestamp_reg_setup_ch6_input_en_set(base, 1);
+                udma_timestamp_reg_setup_ch6_dest_id_set(base, input->dest_id);
+                break;
+            case 7:
+                udma_timestamp_reg_setup_ch7_input_sel_set(base, input->input_sel);
+                udma_timestamp_reg_setup_ch7_input_type_set(base, input->input_type);
+                udma_timestamp_reg_setup_ch7_input_en_set(base, 1);
+                udma_timestamp_reg_setup_ch7_dest_id_set(base, input->dest_id);
+                break;
+
+            default:
+                break;
+        }
+
+    }
+    return 0;
+}
+
+/*******************************************************************************
+ * API implementation
+
******************************************************************************/
+
+/**
+ * Fill a timestamp configuration with defaults: interface 0, no trigger GPIO
+ * (0xFF), auxiliary-input trigger type, quick reference clock as counter
+ * source, no source id (0xFF), prescaler 0.
+ */
+void pi_timestamp_conf_init(struct pi_timestamp_conf *conf)
+{
+    conf->itf = 0;
+    conf->cnt_trig_gpio = 0xFF;
+    conf->cnt_trig_type = PI_TIMESTAMP_AUX_INPUT;
+    conf->cnt_src = PI_TIMESTAMP_CNT_REF_CLK_QUICK;
+    conf->cnt_src_id = 0xFF;
+    conf->prescaler = 0;
+}
+
+/**
+ * Open the timestamp counter selected by the device configuration.
+ *
+ * device->data is always pointed at the counter state. If the counter is
+ * already enabled its configuration is left untouched; otherwise the
+ * peripheral is taken out of reset / clock gating and the trigger, clock
+ * source and prescaler are programmed from the configuration.
+ */
+void pi_udma_timestamp_open(struct pi_device * device)
+{
+    uint32_t irq = pi_irq_disable();
+    struct pi_timestamp_conf *conf = (struct pi_timestamp_conf *) device -> config;
+    struct pi_udma_timestamp_cnt_t *ts = &timestamp_cnt[conf->itf];
+
+    device->data = (void *)ts;
+
+    if (ts->cnt_en)
+    {
+        TIMESTAMP_TRACE("Timestamp counter already set, ignore this counter init");
+    }
+    else
+    {
+        ts->base = UDMA_TIMESTAMP(conf->itf);
+        /* Disable UDMA CG and reset periph. */
+        uint32_t periph_id = UDMA_TIMESTAMP_ID(conf->itf);
+        udma_ctrl_cfg_rstn_set_set(ARCHI_UDMA_ADDR, (1 << periph_id));
+        udma_ctrl_cfg_cg_set_set(ARCHI_UDMA_ADDR, (1 << periph_id));
+
+        /* 0xFF means no external trigger GPIO */
+        if (conf->cnt_trig_gpio != 0xFF)
+        {
+            udma_timestamp_reg_setup_cnt_ext_sel_set(ts->base, conf->cnt_trig_gpio);
+            udma_timestamp_reg_setup_cnt_ext_type_set(ts->base, conf->cnt_trig_type);
+            udma_timestamp_reg_setup_cnt_ext_en_set(ts->base, 1);
+        }
+
+        /* the clock mux is only programmed for sources other than the SoC clock */
+        if (conf->cnt_src != PI_TIMESTAMP_CNT_SOC_CLK)
+        {
+            udma_timestamp_reg_clk_cfg_clk_mux_set(ts->base, conf->cnt_src);
+            if (conf->cnt_src == PI_TIMESTAMP_CNT_GPIO)
+            {
+                udma_timestamp_reg_clk_cfg_gpio_sel_set(ts->base, conf->cnt_src_id);
+            }
+            else if(conf->cnt_src == PI_TIMESTAMP_CNT_PWM)
+            {
+                udma_timestamp_reg_clk_cfg_pwm_sel_set(ts->base, conf->cnt_src_id);
+            }
+            udma_timestamp_reg_clk_cfg_clk_mux_en_set(ts->base, 1);
+
+        }
+
+        if (conf->prescaler)
+        {
+            udma_timestamp_reg_clk_cfg_prescaler_set(ts->base, conf->prescaler);
+        }
+
+        /* remember the applied configuration */
+        ts->device_id = conf->itf;
+        ts->cnt_trig_gpio = conf->cnt_trig_gpio;
+        ts->cnt_trig_type = conf->cnt_trig_type;
+        ts->cnt_src = conf->cnt_src;
+        ts->cnt_src_id = conf->cnt_src_id;
+        ts->prescaler = conf->prescaler;
+        ts->cnt_en = 1;
+
+        /* Set all the event to 0xFF */
+        __pi_udma_evt_cfg_init();
+    }
+    pi_irq_restore(irq);
+}
+
+/**
+ * Close a timestamp counter: stop and unconfigure it, reset the uDMA compare
+ * events, put the peripheral back under reset / clock gating and mark every
+ * event and input slot as free again.
+ */
+void pi_udma_timestamp_close(struct pi_device *device)
+{
+    uint32_t irq = pi_irq_disable();
+    struct pi_udma_timestamp_cnt_t *ts = (struct pi_udma_timestamp_cnt_t *) device->data;
+
+    __pi_udma_timestamp_cnt_close(ts);
+
+    /* Set all the event to 0xFF */
+    __pi_udma_evt_cfg_init();
+
+    /* Enable UDMA CG and reset periph. */
+    uint32_t periph_id = UDMA_TIMESTAMP_ID(ts->device_id);
+    udma_ctrl_cfg_rstn_clr_set(ARCHI_UDMA_ADDR, (1 << periph_id));
+    udma_ctrl_cfg_cg_clr_set(ARCHI_UDMA_ADDR, (1 << periph_id));
+
+    /* Free all the udma timestamp allocated. */
+    //TODO: clean all the event and input
+    /*
+    for (uint32_t tid = 0; tid < (uint32_t) UDMA_NB_TIMESTAMP_EVT; tid++)
+    {
+        if(ts_evt[tid].soc_evt)
+            soc_eu_prEventMask_setEvent(ts_evt[tid].soc_evt);
+    }
+    */
+
+    evt_mask = 0xF;
+    input_mask = 0xFF;
+    pi_irq_restore(irq);
+}
+
+
+/**
+ * Runtime control of an opened timestamp counter.
+ *
+ * @param device  Opened timestamp device.
+ * @param cmd     One of the PI_UDMA_TIMESTAMP_IOCTL_* commands.
+ * @param arg     Command argument: event descriptor for EVT_ALLOC / SET_EVT,
+ *                input descriptor for SET_INPUT, unused otherwise.
+ * @return 0 on success, the helper's status for EVT_ALLOC / SET_EVT /
+ *         SET_INPUT, -1 for an unknown command. FREE_EVT and FREE_INPUT are
+ *         not implemented yet and return 0.
+ */
+int32_t pi_udma_timestamp_ioctl(struct pi_device *device, uint32_t cmd, void *arg)
+{
+    uint32_t irq = pi_irq_disable();
+    struct pi_udma_timestamp_cnt_t *ts = (struct pi_udma_timestamp_cnt_t *) device->data;
+    int32_t status = 0;
+
+    switch (cmd)
+    {
+    case PI_UDMA_TIMESTAMP_IOCTL_CLR :
+        __pi_udma_timestamp_cnt_clr(ts->base);
+        break;
+
+    case PI_UDMA_TIMESTAMP_IOCTL_STOP :
+        __pi_udma_timestamp_cnt_stop(ts->base);
+        break;
+
+    case PI_UDMA_TIMESTAMP_IOCTL_EVT_ALLOC:
+        /* propagate the result like the other helpers do */
+        status = __pi_udma_timestamp_evt_alloc(ts->base, arg);
+        break;
+
+    case PI_UDMA_TIMESTAMP_IOCTL_SET_EVT :
+        status = __pi_udma_timestamp_evt_cfg(ts->base, arg);
+        break;
+
+    case PI_UDMA_TIMESTAMP_IOCTL_SET_INPUT :
+        status = __pi_udma_timestamp_input_set(ts->base, arg);
+        break;
+
+    // TODO: complete these cases
+    case PI_UDMA_TIMESTAMP_IOCTL_FREE_EVT :
+        break;
+
+    case PI_UDMA_TIMESTAMP_IOCTL_FREE_INPUT :
+        break;
+
+    default :
+        TIMESTAMP_TRACE_ERR("Unknown timestamp command, cmd=%ld\n", cmd);
+        /* an unknown command must not be reported as a success */
+        status = -1;
+    }
+    pi_irq_restore(irq);
+    return status;
+}
+
+/** Not in PMSIS_API. */
+#if 0
+void pi_udma_timestamp_read_async(unsigned char src_type, unsigned char ts_id,
+                                  void *buffer, int32_t size, pi_task_t *task)
+{
+    pos_udma_channel_t *channel = src_type? &ts_evt[ts_id].channel : &ts_input[ts_id].channel;
+
+    uint32_t irq = pi_irq_disable();
+    pos_udma_enqueue(channel, task, (int)buffer, size);
+    pi_irq_restore(irq);
+}
+#endif /* 0 */
diff --git a/rtos/pmsis/pmsis_implem/chips/gap9/drivers/udma/udma_timestamp.h b/rtos/pmsis/pmsis_implem/chips/gap9/drivers/udma/udma_timestamp.h
new file mode 100644
index 000000000..395a9def4
--- /dev/null
+++ b/rtos/pmsis/pmsis_implem/chips/gap9/drivers/udma/udma_timestamp.h
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2020, GreenWaves Technologies, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ *
+ * o Redistributions of source code must retain the above copyright notice, this list
+ * of conditions and the following disclaimer.
+ *
+ * o Redistributions in binary form must reproduce the above copyright notice, this
+ * list of conditions and the following disclaimer in the documentation and/or
+ * other materials provided with the distribution.
+ *
+ * o Neither the name of GreenWaves Technologies, Inc. nor the names of its
+ * contributors may be used to endorse or promote products derived from this
+ * software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON + * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#pragma once + + +/******************************************************************************* + * Definitions + ******************************************************************************/ + +#define UDMA_NB_TIMESTAMP_EVT (4) +#define UDMA_NB_TIMESTAMP_INPUT (8) + +struct pi_udma_timestamp_cnt_t +{ + uint32_t base; /*!< base addr of TS */ + uint8_t device_id; /*!< device ID for timestamp */ + uint8_t cnt_trig_gpio; /*!< gpio number used to trigger the timestamp counter */ + uint8_t cnt_trig_type; /*!< how the gpio triggers the timestamp counter */ + uint8_t cnt_src; /*!< timestamp counter source */ + uint8_t cnt_src_id; /*!< GPIO/PWM number depends on the counter source */ + uint8_t prescaler; /*!< Prescaler for timestamp counter */ + uint8_t cnt_en; /*!< If the counter is enabled */ +}; + +struct pi_udma_ts_evt_t +{ + uint8_t dest_id; /*!< fifo ID for timestamp */ + uint8_t soc_evt; /*!< soc event ID which should be propagated to periph */ + uint8_t ts_evt_id; /*!< The udma cfg evt number */ +}; + +struct pi_udma_ts_input_t +{ + uint8_t dest_id; + uint8_t ts_input_id; /*!< Timestamp input ID, max 8 input. Reg0-7 */ + uint8_t input_sel; /*!< Timestamp input selection: + if input_type=3, then 0-7 are SFU, 8-10 are SAI.
+ Else input sel are GPIO 0-63 */ + uint8_t input_type; /*!< Timestamp input GPIO trigger or input from AUX */ +}; + +/******************************************************************************* + * Driver data + ******************************************************************************/ + +/******************************************************************************* + * Function declaration + ******************************************************************************/ diff --git a/rtos/pmsis/pmsis_implem/include/chips/gap9/drivers/udma/udma_core.h b/rtos/pmsis/pmsis_implem/include/chips/gap9/drivers/udma/udma_core.h index f66c4076e..bb11d8454 100644 --- a/rtos/pmsis/pmsis_implem/include/chips/gap9/drivers/udma/udma_core.h +++ b/rtos/pmsis/pmsis_implem/include/chips/gap9/drivers/udma/udma_core.h @@ -36,9 +36,16 @@ ******************************************************************************/ #if !defined(__FREERTOS__) +#define UDMA_NB_CHAN_LIN ( ARCHI_UDMA_NB_LIN_ADDRGEN ) +#define UDMA_NB_CHAN_2D ( ARCHI_UDMA_NB_2D_ADDRGEN ) +#define UDMA_NB_CHAN_FIFO ( ARCHI_UDMA_NB_FIFO_ADDRGEN ) #define UDMA_CHAN_LIN(id) ( UDMA_LIN_ADDRGEN_ADDR((id)) ) -#define UDMA_CHAN_2D(id) -#define UDMA_CHAN_FIFO(id) +#define UDMA_CHAN_2D(id) ( 0x1A103800 + 0x20 * (id) ) +#define UDMA_CHAN_FIFO(id) ( 0x1A103900 + 0x20 * (id) ) +#define UDMA_CHAN_LIN_ID(id) ( (id) ) +#define UDMA_CHAN_2D_ID(id) ( ARCHI_UDMA_NB_LIN_ADDRGEN + (id) ) +#define UDMA_CHAN_FIFO_ID(id) ( ARCHI_UDMA_NB_LIN_ADDRGEN + ARCHI_UDMA_NB_2D_ADDRGEN + (id) ) +#define SOC_EVENT_UDMA_CHAN_LIN(id) ( (id) ) #endif /* __FREERTOS__ */ @@ -87,11 +94,6 @@ static inline void pi_udma_core_channels_init(void) /** * UDMA_CHANNEL_LINEAR */ -static inline uint32_t pi_udma_core_lin_addr_get(int32_t chan_id) -{ - return UDMA_CHAN_LIN(chan_id); -} - static inline int32_t pi_udma_core_lin_alloc(void) { int32_t chan_id = -1; @@ -119,22 +121,58 @@ static inline void pi_udma_core_lin_free(int32_t chan_id) } } +static inline uint32_t
pi_udma_core_lin_addr_get(int32_t chan_id) +{ + return UDMA_CHAN_LIN(chan_id); +} + +static inline void pi_udma_core_lin_enqueue(uint32_t udma_core_base, + uint32_t buf, + uint32_t size, uint32_t config) +{ + config |= UDMA_CORE_LIN_ADDRGEN_CFG_CTRL_EN(1); + udma_core_lin_addrgen_cfg_sa_buf0_set(udma_core_base, buf); + udma_core_lin_addrgen_cfg_size_set(udma_core_base, size); + udma_core_lin_addrgen_cfg_ctrl_set(udma_core_base, config); +} -/** - * UDMA_CHANNEL_2D - */ -static inline uint32_t pi_udma_core_2d_addr_get(int32_t chan_id) +static inline void pi_udma_core_lin_stop(uint32_t udma_core_base) { - return UDMA_CHAN_2D(chan_id); + uint32_t config = UDMA_CORE_LIN_ADDRGEN_CFG_CTRL_STOP(1); + udma_core_lin_addrgen_cfg_ctrl_set(udma_core_base, config); +} + +static inline void pi_udma_core_lin_reset(uint32_t udma_core_base) +{ + uint32_t config = UDMA_CORE_LIN_ADDRGEN_CFG_CTRL_STOP(1); + /* udma_core_lin_addrgen_cfg_sa_buf0_set(udma_core_base, 0); */ + /* udma_core_lin_addrgen_cfg_sa_buf1_set(udma_core_base, 0); */ + /* udma_core_lin_addrgen_cfg_size_set(udma_core_base, 0); */ + udma_core_lin_addrgen_cfg_ctrl_set(udma_core_base, config); +} + +static inline uint32_t pi_udma_core_lin_curr_addr_get(uint32_t udma_core_base) +{ + return udma_core_lin_addrgen_cfg_curr_addr_get(udma_core_base); } +static inline uint32_t pi_udma_core_lin_bytes_left_get(uint32_t udma_core_base) +{ + return udma_core_lin_addrgen_cfg_bytes_left_get(udma_core_base); +} + + + +/** + * UDMA_CHANNEL_2D + */ static inline int32_t pi_udma_core_2d_alloc(void) { int32_t chan_id = -1; - uint32_t reg_status = __pi_udma_chan_2d; - if (0x0 != reg_status) + if (0x0 != __pi_udma_chan_2d) { - chan_id = __FF1(reg_status); + chan_id = __FF1(__pi_udma_chan_2d); + __pi_udma_chan_2d = __BITCLR_R(__pi_udma_chan_2d, 1, chan_id); return (chan_id + UDMA_CHAN_2D_ID(0)); } return chan_id; @@ -148,22 +186,62 @@ static inline void pi_udma_core_2d_free(int32_t chan_id) } } +static inline uint32_t 
pi_udma_core_2d_addr_get(int32_t chan_id) +{ + return UDMA_CHAN_2D(chan_id); +} -/** - * UDMA_CHANNEL_FIFO - */ -static inline uint32_t pi_udma_core_fifo_addr_get(int32_t chan_id) +static inline void pi_udma_core_2d_enqueue(uint32_t udma_core_base, + uint32_t buf_0, uint32_t buf_1, + uint32_t size, uint32_t stride, + uint32_t length, uint32_t config) { - return UDMA_CHAN_FIFO(chan_id); + config |= UDMA_CORE_2D_ADDRGEN_CFG_CTRL_EN(1); + udma_core_2d_addrgen_cfg_sa_buf0_set(udma_core_base, buf_0); + udma_core_2d_addrgen_cfg_sa_buf1_set(udma_core_base, buf_1); + udma_core_2d_addrgen_cfg_size_set(udma_core_base, size); + udma_core_2d_addrgen_cfg_stride_set(udma_core_base, stride); + udma_core_2d_addrgen_cfg_row_len_set(udma_core_base, length); + udma_core_2d_addrgen_cfg_ctrl_set(udma_core_base, config); +} + +static inline void pi_udma_core_2d_stop(uint32_t udma_core_base) +{ + uint32_t config = UDMA_CORE_2D_ADDRGEN_CFG_CTRL_STOP(1); + udma_core_2d_addrgen_cfg_ctrl_set(udma_core_base, config); +} + +static inline void pi_udma_core_2d_reset(uint32_t udma_core_base) +{ + uint32_t config = UDMA_CORE_2D_ADDRGEN_CFG_CTRL_STOP(1); + /* udma_core_2d_addrgen_cfg_sa_buf0_set(udma_core_base, 0); */ + /* udma_core_2d_addrgen_cfg_sa_buf1_set(udma_core_base, 0); */ + /* udma_core_2d_addrgen_cfg_size_set(udma_core_base, 0); */ + /* udma_core_2d_addrgen_cfg_stride_set(udma_core_base, 0); */ + /* udma_core_2d_addrgen_cfg_row_len_set(udma_core_base, 0); */ + udma_core_2d_addrgen_cfg_ctrl_set(udma_core_base, config); } +static inline uint32_t pi_udma_core_2d_curr_addr_get(uint32_t udma_core_base) +{ + return udma_core_2d_addrgen_cfg_curr_addr_get(udma_core_base); +} + +static inline uint32_t pi_udma_core_2d_bytes_left_get(uint32_t udma_core_base) +{ + return udma_core_2d_addrgen_cfg_bytes_left_get(udma_core_base); +} + +/** + * UDMA_CHANNEL_FIFO + */ static inline int32_t pi_udma_core_fifo_alloc(void) { int32_t chan_id = -1; - uint32_t reg_status = __pi_udma_chan_fifo; - if (0x0 != 
reg_status) + if (0x0 != __pi_udma_chan_fifo) { - chan_id = __FF1(reg_status); + chan_id = __FF1(__pi_udma_chan_fifo); + __pi_udma_chan_fifo = __BITCLR_R(__pi_udma_chan_fifo, 1, chan_id); return (chan_id + UDMA_CHAN_FIFO_ID(0)); } return chan_id; @@ -176,3 +254,37 @@ static inline void pi_udma_core_fifo_free(int32_t chan_id) __pi_udma_chan_fifo = __BITSET_R(__pi_udma_chan_fifo, 1, chan_id - UDMA_CHAN_FIFO_ID(0)); } } + +static inline uint32_t pi_udma_core_fifo_addr_get(int32_t chan_id) +{ + return UDMA_CHAN_FIFO(chan_id); +} + +static inline void pi_udma_core_fifo_enqueue(uint32_t udma_core_base, + uint32_t buf, + uint32_t size, + uint32_t config) +{ + config |= UDMA_CORE_FIFO_CFG_CTRL_EN(1); + udma_core_fifo_cfg_sa_buffer_set(udma_core_base, buf); + udma_core_fifo_cfg_size_set(udma_core_base, size); + udma_core_fifo_cfg_ctrl_set(udma_core_base, config); +} + +static inline void pi_udma_core_fifo_stop(uint32_t udma_core_base) +{ + uint32_t config = UDMA_CORE_FIFO_CFG_CTRL_STOP(1); + udma_core_fifo_cfg_ctrl_set(udma_core_base, config); +} + +static inline void pi_udma_core_fifo_event_enable(uint32_t udma_core_base, + uint8_t enable) +{ + udma_core_fifo_cfg_evt_en_set(udma_core_base, enable); +} + +static inline void pi_udma_core_fifo_event_set_threshold(uint32_t udma_core_base, + uint32_t threshold) +{ + udma_core_fifo_cfg_evt_num_bytes_set(udma_core_base, threshold); +} diff --git a/rtos/pmsis/pmsis_implem/include/chips/gap9/drivers/udma/udma_timeout.h b/rtos/pmsis/pmsis_implem/include/chips/gap9/drivers/udma/udma_timeout.h new file mode 100644 index 000000000..8085819c1 --- /dev/null +++ b/rtos/pmsis/pmsis_implem/include/chips/gap9/drivers/udma/udma_timeout.h @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2020, GreenWaves Technologies, Inc. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * o Redistributions of source code must retain the above copyright notice, this list + * of conditions and the following disclaimer. + * + * o Redistributions in binary form must reproduce the above copyright notice, this + * list of conditions and the following disclaimer in the documentation and/or + * other materials provided with the distribution. + * + * o Neither the name of GreenWaves Technologies, Inc. nor the names of its + * contributors may be used to endorse or promote products derived from this + * software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON + * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#pragma once + + +/******************************************************************************* + * Definitions + ******************************************************************************/ + +struct pi_udma_timeout_s +{ + struct pi_task *fifo_head; + struct pi_task *fifo_tail; + //uint8_t tid; + pi_udma_timeout_mode_e mode; +}; + +/******************************************************************************* + * Driver data + ******************************************************************************/ + +/******************************************************************************* + * Function declaration + ******************************************************************************/ + +int32_t pi_udma_timeout_config_set(pi_task_t *task, uint8_t timeout_id, + uint8_t udma_chan_id, uint32_t timeout_us); + +pi_task_t *__pi_udma_timeout_task_remove(uint8_t timeout_id); diff --git a/rtos/pulp/gap_archi/doc/ips/sfu.rst b/rtos/pulp/gap_archi/doc/ips/sfu.rst new file mode 100644 index 000000000..f83121cb4 --- /dev/null +++ b/rtos/pulp/gap_archi/doc/ips/sfu.rst @@ -0,0 +1,609 @@ +.. + Input file: fe/ips/udma/udma_anc/README.md + +Register map +^^^^^^^^^^^^ + + +Overview +"""""""" + + +Refer to :ref:`GAP9 address map` for the base address to be used. + +.. 
table:: + :align: center + :widths: 40 12 12 90 + + +--------------------------------------------+------+-----+-----------------------------------+ + | Name |Offset|Width| Description | + +============================================+======+=====+===================================+ + |:ref:`GRAPH_PTR` | 0| 32|Pointer to graph configuration | + +--------------------------------------------+------+-----+-----------------------------------+ + |:ref:`GRAPH_CMD` | 4| 32|Graph command register | + +--------------------------------------------+------+-----+-----------------------------------+ + |:ref:`CLOCK_PTR` | 8| 32|Pointer to clock configuration | + +--------------------------------------------+------+-----+-----------------------------------+ + |:ref:`CLOCK_CMD` | 12| 32|Clock command register | + +--------------------------------------------+------+-----+-----------------------------------+ + |:ref:`SFU_STATUS` | 16| 32|Status of graph and clocks commands| + +--------------------------------------------+------+-----+-----------------------------------+ + |:ref:`MEM_IN_STATUS` | 20| 32|Status of memory IN interfaces | + +--------------------------------------------+------+-----+-----------------------------------+ + |:ref:`LIMITER_MUTE` | 24| 32|Limiter mute/unmute control | + +--------------------------------------------+------+-----+-----------------------------------+ + |:ref:`VOLUME_INDEX` | 28| 32|Control of mute/unmute | + +--------------------------------------------+------+-----+-----------------------------------+ + |:ref:`VOLUME_VALUE` | 32| 32|Control of linear volume | + +--------------------------------------------+------+-----+-----------------------------------+ + |:ref:`CLK_MONITOR_0` | 36| 32|Control of clock monitors 0 to 3 | + +--------------------------------------------+------+-----+-----------------------------------+ + |:ref:`CLK_MONITOR_1` | 40| 32|Control of clock monitors 4 to 7 | + 
+--------------------------------------------+------+-----+-----------------------------------+ + |:ref:`OUT_MUTE` | 44| 32|Control of output channel mute | + +--------------------------------------------+------+-----+-----------------------------------+ + |:ref:`AUDIO_CLK_CFG_0`| 48| 32|Control audio clock generator 0 | + +--------------------------------------------+------+-----+-----------------------------------+ + |:ref:`AUDIO_CLK_CFG_1`| 52| 32|Control audio clock generator 1 | + +--------------------------------------------+------+-----+-----------------------------------+ + |:ref:`AUDIO_CLK_CFG_2`| 56| 32|Control audio clock generator 2 | + +--------------------------------------------+------+-----+-----------------------------------+ + |:ref:`AUDIO_CLK_CFG_3`| 60| 32|Control audio clock generator 3 | + +--------------------------------------------+------+-----+-----------------------------------+ + |:ref:`ASRC_RATIO_0` | 64| 32|ASRC0 conversion ratio | + +--------------------------------------------+------+-----+-----------------------------------+ + |:ref:`ASRC_RATIO_1` | 68| 32|ASRC1 conversion ratio | + +--------------------------------------------+------+-----+-----------------------------------+ + |:ref:`ASRC_RATIO_2` | 72| 32|ASRC2 conversion ratio | + +--------------------------------------------+------+-----+-----------------------------------+ + |:ref:`MEM_IN_0_CNT` | 88| 32|Memory input counter 0 | + +--------------------------------------------+------+-----+-----------------------------------+ + |:ref:`MEM_IN_1_CNT` | 92| 32|Memory input counter 1 | + +--------------------------------------------+------+-----+-----------------------------------+ + |:ref:`MEM_IN_2_CNT` | 96| 32|Memory input counter 2 | + +--------------------------------------------+------+-----+-----------------------------------+ + |:ref:`MEM_IN_3_CNT` | 100| 32|Memory input counter 3 | + 
+--------------------------------------------+------+-----+-----------------------------------+ + |:ref:`MEM_IN_4_CNT` | 104| 32|Memory input counter 4 | + +--------------------------------------------+------+-----+-----------------------------------+ + |:ref:`MEM_IN_5_CNT` | 108| 32|Memory input counter 5 | + +--------------------------------------------+------+-----+-----------------------------------+ + |:ref:`MEM_IN_6_CNT` | 112| 32|Memory input counter 6 | + +--------------------------------------------+------+-----+-----------------------------------+ + |:ref:`MEM_IN_7_CNT` | 116| 32|Memory input counter 7 | + +--------------------------------------------+------+-----+-----------------------------------+ + +.. _sfu__GRAPH_PTR: + +GRAPH_PTR +""""""""" + +Pointer to graph configuration + +.. table:: + :align: center + :widths: 13 12 45 24 85 + + +-----+---+-------+-----+----------------------------------------+ + |Bit #|R/W| Name |Reset| Description | + +=====+===+=======+=====+========================================+ + |31:0 |R/W|ADDRESS|0x0 |Address of graph configuration in memory| + +-----+---+-------+-----+----------------------------------------+ + +.. _sfu__GRAPH_CMD: + +GRAPH_CMD +""""""""" + +Graph command register + +.. 
table:: + :align: center + :widths: 13 12 45 24 85 + + +-----+---+-----------+-----+---------------------------------------------+ + |Bit #|R/W| Name |Reset| Description | + +=====+===+===========+=====+=============================================+ + | 0|W |LOAD |0x0 |Write b1 to start graph load | + +-----+---+-----------+-----+---------------------------------------------+ + | 1|W |RECONF |0x0 |Write b1 to start graph reconfiguration | + +-----+---+-----------+-----+---------------------------------------------+ + | 2|W |UNLOAD |0x0 |Write b1 to start graph unload | + +-----+---+-----------+-----+---------------------------------------------+ + | 3|W |SAVE |0x0 |Write b1 to start graph save | + +-----+---+-----------+-----+---------------------------------------------+ + | 4|W |SET_CURRENT|0x0 |Sets the current graph (used for status read)| + +-----+---+-----------+-----+---------------------------------------------+ + +.. _sfu__CLOCK_PTR: + +CLOCK_PTR +""""""""" + +Pointer to clock configuration + +.. table:: + :align: center + :widths: 13 12 45 24 85 + + +-----+---+-------+-----+----------------------------------------+ + |Bit #|R/W| Name |Reset| Description | + +=====+===+=======+=====+========================================+ + |31:0 |R/W|ADDRESS|0x0 |Address of clock configuration in memory| + +-----+---+-------+-----+----------------------------------------+ + +.. _sfu__CLOCK_CMD: + +CLOCK_CMD +""""""""" + +Clock command register + +.. table:: + :align: center + :widths: 13 12 45 24 85 + + +-----+---+------+-----+------------------------------+ + |Bit #|R/W| Name |Reset| Description | + +=====+===+======+=====+==============================+ + | 0|W |LOAD |0x0 |Write b1 to start clock load | + +-----+---+------+-----+------------------------------+ + | 1|W |UNLOAD|0x0 |Write b1 to start clock unload| + +-----+---+------+-----+------------------------------+ + +.. _sfu__SFU_STATUS: + +SFU_STATUS +"""""""""" + +Status of graph and clocks commands + +.. 
table:: + :align: center + :widths: 13 12 45 24 85 + + +-----+---+-------------+-----+----------------------------------------------------------------------------------------------+ + |Bit #|R/W| Name |Reset| Description | + +=====+===+=============+=====+==============================================================================================+ + | 0|R |CLOCK_LOAD |0x0 |Bit is set to 1 when clock load is ongoing | + +-----+---+-------------+-----+----------------------------------------------------------------------------------------------+ + | 1|R |GRAPH_LOAD |0x0 |Bit is set to 1 when graph load is ongoing | + +-----+---+-------------+-----+----------------------------------------------------------------------------------------------+ + | 2|R |GRAPH_UNLOAD |0x0 |Bit is set to 1 when graph unload is ongoing | + +-----+---+-------------+-----+----------------------------------------------------------------------------------------------+ + | 3|R |GRAPH_RECONF |0x0 |Bit is set to 1 when graph reconfiguration is ongoing | + +-----+---+-------------+-----+----------------------------------------------------------------------------------------------+ + | 4|R |GRAPH_SAVE |0x0 |Bit is set to 1 when graph save is ongoing | + +-----+---+-------------+-----+----------------------------------------------------------------------------------------------+ + | 5|R |GRAPH_SET_CUR|0x0 |Bit is set to 1 when current graph is being set | + +-----+---+-------------+-----+----------------------------------------------------------------------------------------------+ + |8:6 |R |ASRC_LOCK |0x0 |Lock status of the 3 ASRCs: bit *i* is set to 1 when frequency tracking of ASRC\ *i* is locked| + +-----+---+-------------+-----+----------------------------------------------------------------------------------------------+ + | 9|R |GRAPH_BUSY |0x0 |Bit is set to 1 when current graph is busy | + 
+-----+---+-------------+-----+----------------------------------------------------------------------------------------------+ + +.. _sfu__MEM_IN_STATUS: + +MEM_IN_STATUS +""""""""""""" + +Status of memory IN interfaces + +.. table:: + :align: center + :widths: 13 12 45 24 85 + + +-----+---+------+-----+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ + |Bit #|R/W| Name |Reset| Description | + +=====+===+======+=====+========================================================================================================================================================================================+ + |7:0 |R/W|STATUS|0x0 |When reading, bit *i* give the status of MemIn interface *i*: b0: interface OK; b1: buffer has ended. Writing b1 to bit *i* restarts the MemIn interface *i* (e.g. after buffer restart)| + +-----+---+------+-----+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ + +.. _sfu__LIMITER_MUTE: + +LIMITER_MUTE +"""""""""""" + +Limiter mute/unmute control + +.. 
table:: + :align: center + :widths: 13 12 45 24 85 + + +-----+---+----------+-----+--------------------------------------------------+ + |Bit #|R/W| Name |Reset| Description | + +=====+===+==========+=====+==================================================+ + | 0|R/W|MUTE_LIM_0|0x0 |Enable mute 0: b0: mute disabled; b1: mute enabled| + +-----+---+----------+-----+--------------------------------------------------+ + | 1|R/W|MUTE_LIM_1|0x0 |Enable mute 1: b0: mute disabled; b1: mute enabled| + +-----+---+----------+-----+--------------------------------------------------+ + | 2|R/W|MUTE_LIM_2|0x0 |Enable mute 2: b0: mute disabled; b1: mute enabled| + +-----+---+----------+-----+--------------------------------------------------+ + | 3|R/W|MUTE_LIM_3|0x0 |Enable mute 3: b0: mute disabled; b1: mute enabled| + +-----+---+----------+-----+--------------------------------------------------+ + | 4|R/W|MUTE_LIM_4|0x0 |Enable mute 4: b0: mute disabled; b1: mute enabled| + +-----+---+----------+-----+--------------------------------------------------+ + | 5|R/W|MUTE_LIM_5|0x0 |Enable mute 5: b0: mute disabled; b1: mute enabled| + +-----+---+----------+-----+--------------------------------------------------+ + +.. _sfu__VOLUME_INDEX: + +VOLUME_INDEX +"""""""""""" + +Control of mute/unmute + +.. table:: + :align: center + :widths: 13 12 45 24 85 + + +-----+---+-----+-----+---------------------------------------------------------+ + |Bit #|R/W|Name |Reset| Description | + +=====+===+=====+=====+=========================================================+ + |4:0 |R/W|INDEX|0x0 |Index of volume setting accessed by VOLUME_VALUE register| + +-----+---+-----+-----+---------------------------------------------------------+ + +.. _sfu__VOLUME_VALUE: + +VOLUME_VALUE +"""""""""""" + +Control of linear volume + +.. 
table:: + :align: center + :widths: 13 12 45 24 85 + + +-----+---+---------+-----+------------------------------------------------------+ + |Bit #|R/W| Name |Reset| Description | + +=====+===+=========+=====+======================================================+ + |25:0 |R/W|VOLUME |0x0 |Value of volume (linear) | + +-----+---+---------+-----+------------------------------------------------------+ + |31:26|R/W|SCALING_V|0x0 |Value in bits for the scaling (bit 5 is the direction)| + +-----+---+---------+-----+------------------------------------------------------+ + +.. _sfu__CLK_MONITOR_0: + +CLK_MONITOR_0 +""""""""""""" + +Control of clock monitors 0 to 3 + +.. table:: + :align: center + :widths: 13 12 45 24 85 + + +-----+---+----+-----+-------------------------------------------------------+ + |Bit #|R/W|Name|Reset| Description | + +=====+===+====+=====+=======================================================+ + |4:0 |R/W|SEL0|0x0 |Monitored clock selector (see Clock select table below)| + +-----+---+----+-----+-------------------------------------------------------+ + |7 |R/W|EN0 |0x0 |Set to b1 to enable monitoring | + +-----+---+----+-----+-------------------------------------------------------+ + |12:8 |R/W|SEL1|0x0 |Monitored clock selector (see Clock select table below)| + +-----+---+----+-----+-------------------------------------------------------+ + |15 |R/W|EN1 |0x0 |Set to b1 to enable monitoring | + +-----+---+----+-----+-------------------------------------------------------+ + |20:16|R/W|SEL2|0x0 |Monitored clock selector (see Clock select table below)| + +-----+---+----+-----+-------------------------------------------------------+ + |23 |R/W|EN2 |0x0 |Set to b1 to enable monitoring | + +-----+---+----+-----+-------------------------------------------------------+ + |28:24|R/W|SEL3|0x0 |Monitored clock selector (see Clock select table below)| + +-----+---+----+-----+-------------------------------------------------------+ + |31 |R/W|EN3 |0x0 |Set
to b1 to enable monitoring | + +-----+---+----+-----+-------------------------------------------------------+ + +.. _sfu__CLK_MONITOR_1: + +CLK_MONITOR_1 +""""""""""""" + +Control of clock monitors 4 to 7 + +.. table:: + :align: center + :widths: 13 12 45 24 85 + + +-----+---+----+-----+-------------------------------------------------------+ + |Bit #|R/W|Name|Reset| Description | + +=====+===+====+=====+=======================================================+ + |4:0 |R/W|SEL0|0x0 |Monitored clock selector (see Clock select table below)| + +-----+---+----+-----+-------------------------------------------------------+ + |7 |R/W|EN0 |0x0 |Set to b1 to enable monitoring | + +-----+---+----+-----+-------------------------------------------------------+ + |12:8 |R/W|SEL1|0x0 |Monitored clock selector (see Clock select table below)| + +-----+---+----+-----+-------------------------------------------------------+ + |15 |R/W|EN1 |0x0 |Set to b1 to enable monitoring | + +-----+---+----+-----+-------------------------------------------------------+ + |20:16|R/W|SEL2|0x0 |Monitored clock selector (see Clock select table below)| + +-----+---+----+-----+-------------------------------------------------------+ + |23 |R/W|EN2 |0x0 |Set to b1 to enable monitoring | + +-----+---+----+-----+-------------------------------------------------------+ + |28:24|R/W|SEL3|0x0 |Monitored clock selector (see Clock select table below)| + +-----+---+----+-----+-------------------------------------------------------+ + |31 |R/W|EN3 |0x0 |Set to b1 to enable monitoring | + +-----+---+----+-----+-------------------------------------------------------+ + +.. _sfu__OUT_MUTE: + +OUT_MUTE +"""""""" + +Control of output channel mute + +..
table:: + :align: center + :widths: 13 12 45 24 85 + + +-----+---+----------+-----+-------------------------------------+ + |Bit #|R/W| Name |Reset| Description | + +=====+===+==========+=====+=====================================+ + |7:0 |R/W|MEM_OUT |0x0 |Mutes corresponding MemOut channel | + +-----+---+----------+-----+-------------------------------------+ + |15:8 |R/W|STREAM_OUT|0x0 |Mutes corresponding StreamOut channel| + +-----+---+----------+-----+-------------------------------------+ + |18:16|R/W|PDM_OUT |0x0 |Mutes corresponding PDMOut channel | + +-----+---+----------+-----+-------------------------------------+ + +.. _sfu__AUDIO_CLK_CFG_0: + +AUDIO_CLK_CFG_0 +""""""""""""""" + +Control audio clock generator 0 + +.. table:: + :align: center + :widths: 13 12 45 24 85 + + +-----+---+----+-----+---------------------------------------------------------+ + |Bit #|R/W|Name|Reset| Description | + +=====+===+====+=====+=========================================================+ + |15:0 |R/W|DIV |0x0 |Division factor for audio clock | + +-----+---+----+-----+---------------------------------------------------------+ + |16 |R/W|EN |0x0 |Enable: b0: audio clock disabled; b1: audio clock enabled| + +-----+---+----+-----+---------------------------------------------------------+ + +.. _sfu__AUDIO_CLK_CFG_1: + +AUDIO_CLK_CFG_1 +""""""""""""""" + +Control audio clock generator 1 + +.. table:: + :align: center + :widths: 13 12 45 24 85 + + +-----+---+----+-----+---------------------------------------------------------+ + |Bit #|R/W|Name|Reset| Description | + +=====+===+====+=====+=========================================================+ + |15:0 |R/W|DIV |0x0 |Division factor for audio clock | + +-----+---+----+-----+---------------------------------------------------------+ + |16 |R/W|EN |0x0 |Enable: b0: audio clock disabled; b1: audio clock enabled| + +-----+---+----+-----+---------------------------------------------------------+ + +.. 
_sfu__AUDIO_CLK_CFG_2: + +AUDIO_CLK_CFG_2 +""""""""""""""" + +Control audio clock generator 2 + +.. table:: + :align: center + :widths: 13 12 45 24 85 + + +-----+---+----+-----+---------------------------------------------------------+ + |Bit #|R/W|Name|Reset| Description | + +=====+===+====+=====+=========================================================+ + |15:0 |R/W|DIV |0x0 |Division factor for audio clock | + +-----+---+----+-----+---------------------------------------------------------+ + |16 |R/W|EN |0x0 |Enable: b0: audio clock disabled; b1: audio clock enabled| + +-----+---+----+-----+---------------------------------------------------------+ + +.. _sfu__AUDIO_CLK_CFG_3: + +AUDIO_CLK_CFG_3 +""""""""""""""" + +Control audio clock generator 3 + +.. table:: + :align: center + :widths: 13 12 45 24 85 + + +-----+---+----+-----+---------------------------------------------------------+ + |Bit #|R/W|Name|Reset| Description | + +=====+===+====+=====+=========================================================+ + |15:0 |R/W|DIV |0x0 |Division factor for audio clock | + +-----+---+----+-----+---------------------------------------------------------+ + |16 |R/W|EN |0x0 |Enable: b0: audio clock disabled; b1: audio clock enabled| + +-----+---+----+-----+---------------------------------------------------------+ + +.. _sfu__ASRC_RATIO_0: + +ASRC_RATIO_0 +"""""""""""" + +ASRC0 conversion ratio + +.. table:: + :align: center + :widths: 13 12 45 24 85 + + +-----+---+-----+-----+----------------+ + |Bit #|R/W|Name |Reset| Description | + +=====+===+=====+=====+================+ + |25:0 |R/W|RATIO|0x0 |Conversion ratio| + +-----+---+-----+-----+----------------+ + +.. _sfu__ASRC_RATIO_1: + +ASRC_RATIO_1 +"""""""""""" + +ASRC1 conversion ratio + +.. 
table:: + :align: center + :widths: 13 12 45 24 85 + + +-----+---+-----+-----+----------------+ + |Bit #|R/W|Name |Reset| Description | + +=====+===+=====+=====+================+ + |25:0 |R/W|RATIO|0x0 |Conversion ratio| + +-----+---+-----+-----+----------------+ + +.. _sfu__ASRC_RATIO_2: + +ASRC_RATIO_2 +"""""""""""" + +ASRC2 conversion ratio + +.. table:: + :align: center + :widths: 13 12 45 24 85 + + +-----+---+-----+-----+----------------+ + |Bit #|R/W|Name |Reset| Description | + +=====+===+=====+=====+================+ + |25:0 |R/W|RATIO|0x0 |Conversion ratio| + +-----+---+-----+-----+----------------+ + +.. _sfu__MEM_IN_0_CNT: + +MEM_IN_0_CNT +"""""""""""" + +Memory input counter 0 + +.. table:: + :align: center + :widths: 13 12 45 24 85 + + +-----+---+----+-----+------------------------------------------------------------------------------+ + |Bit #|R/W|Name|Reset| Description | + +=====+===+====+=====+==============================================================================+ + |20:0 |R |CNT |0x0 |Reports how many samples have been pushed to the SFU from this MemIn interface| + +-----+---+----+-----+------------------------------------------------------------------------------+ + +.. _sfu__MEM_IN_1_CNT: + +MEM_IN_1_CNT +"""""""""""" + +Memory input counter 1 + +.. table:: + :align: center + :widths: 13 12 45 24 85 + + +-----+---+----+-----+------------------------------------------------------------------------------+ + |Bit #|R/W|Name|Reset| Description | + +=====+===+====+=====+==============================================================================+ + |20:0 |R |CNT |0x0 |Reports how many samples have been pushed to the SFU from this MemIn interface| + +-----+---+----+-----+------------------------------------------------------------------------------+ + +.. _sfu__MEM_IN_2_CNT: + +MEM_IN_2_CNT +"""""""""""" + +Memory input counter 2 + +.. 
table:: + :align: center + :widths: 13 12 45 24 85 + + +-----+---+----+-----+------------------------------------------------------------------------------+ + |Bit #|R/W|Name|Reset| Description | + +=====+===+====+=====+==============================================================================+ + |20:0 |R |CNT |0x0 |Reports how many samples have been pushed to the SFU from this MemIn interface| + +-----+---+----+-----+------------------------------------------------------------------------------+ + +.. _sfu__MEM_IN_3_CNT: + +MEM_IN_3_CNT +"""""""""""" + +Memory input counter 3 + +.. table:: + :align: center + :widths: 13 12 45 24 85 + + +-----+---+----+-----+------------------------------------------------------------------------------+ + |Bit #|R/W|Name|Reset| Description | + +=====+===+====+=====+==============================================================================+ + |20:0 |R |CNT |0x0 |Reports how many samples have been pushed to the SFU from this MemIn interface| + +-----+---+----+-----+------------------------------------------------------------------------------+ + +.. _sfu__MEM_IN_4_CNT: + +MEM_IN_4_CNT +"""""""""""" + +Memory input counter 4 + +.. table:: + :align: center + :widths: 13 12 45 24 85 + + +-----+---+----+-----+------------------------------------------------------------------------------+ + |Bit #|R/W|Name|Reset| Description | + +=====+===+====+=====+==============================================================================+ + |20:0 |R |CNT |0x0 |Reports how many samples have been pushed to the SFU from this MemIn interface| + +-----+---+----+-----+------------------------------------------------------------------------------+ + +.. _sfu__MEM_IN_5_CNT: + +MEM_IN_5_CNT +"""""""""""" + +Memory input counter 5 + +.. 
table:: + :align: center + :widths: 13 12 45 24 85 + + +-----+---+----+-----+------------------------------------------------------------------------------+ + |Bit #|R/W|Name|Reset| Description | + +=====+===+====+=====+==============================================================================+ + |20:0 |R |CNT |0x0 |Reports how many samples have been pushed to the SFU from this MemIn interface| + +-----+---+----+-----+------------------------------------------------------------------------------+ + +.. _sfu__MEM_IN_6_CNT: + +MEM_IN_6_CNT +"""""""""""" + +Memory input counter 6 + +.. table:: + :align: center + :widths: 13 12 45 24 85 + + +-----+---+----+-----+------------------------------------------------------------------------------+ + |Bit #|R/W|Name|Reset| Description | + +=====+===+====+=====+==============================================================================+ + |20:0 |R |CNT |0x0 |Reports how many samples have been pushed to the SFU from this MemIn interface| + +-----+---+----+-----+------------------------------------------------------------------------------+ + +.. _sfu__MEM_IN_7_CNT: + +MEM_IN_7_CNT +"""""""""""" + +Memory input counter 7 + +.. table:: + :align: center + :widths: 13 12 45 24 85 + + +-----+---+----+-----+------------------------------------------------------------------------------+ + |Bit #|R/W|Name|Reset| Description | + +=====+===+====+=====+==============================================================================+ + |20:0 |R |CNT |0x0 |Reports how many samples have been pushed to the SFU from this MemIn interface| + +-----+---+----+-----+------------------------------------------------------------------------------+ diff --git a/rtos/pulp/gap_archi/doc/source/sfu.rst b/rtos/pulp/gap_archi/doc/source/sfu.rst new file mode 100644 index 000000000..436fefcbf --- /dev/null +++ b/rtos/pulp/gap_archi/doc/source/sfu.rst @@ -0,0 +1,4 @@ +SFU +--- + +.. 
include:: ../ips/sfu.rst \ No newline at end of file diff --git a/rtos/pulp/pulpos-2/include/pos/implem/soc_event.h b/rtos/pulp/pulpos-2/include/pos/implem/soc_event.h index d5ac42495..9690c6fb8 100644 --- a/rtos/pulp/pulpos-2/include/pos/implem/soc_event.h +++ b/rtos/pulp/pulpos-2/include/pos/implem/soc_event.h @@ -27,25 +27,25 @@ extern volatile PI_FC_TINY unsigned int pos_soc_event_status[ARCHI_SOC_EVENT_NB_ void pos_soc_event_init(); -static inline void pos_soc_event_register_callback_func(unsigned int channel_id, void (*callback)(int, void *)) +static inline void pos_soc_event_register_callback_func(unsigned int event, void (*callback)(uint32_t, void *)) { - pos_soc_event_callback[channel_id] = callback; + pos_soc_event_callback[event] = callback; } -static inline void pos_soc_event_register_callback_arg(unsigned int channel_id, void *arg) +static inline void pos_soc_event_register_callback_arg(unsigned int event, void *arg) { - pos_soc_event_callback_arg[channel_id] = arg; + pos_soc_event_callback_arg[event] = arg; } -static inline void pos_soc_event_register_callback(unsigned int channel_id, void (*callback)(int, void *), void *arg) +static inline void pos_soc_event_register_callback(unsigned int event, void (*callback)(uint32_t, void *), void *arg) { - pos_soc_event_register_callback_func(channel_id, callback); - pos_soc_event_register_callback_arg(channel_id, arg); + pos_soc_event_register_callback_func(event, callback); + pos_soc_event_register_callback_arg(event, arg); } -static inline void pi_fc_event_handler_set(unsigned int channel_id, void (*callback)(int, void *), void *arg) +static inline void pi_fc_event_handler_set(unsigned int event, void (*callback)(uint32_t, void *), void *arg) { - pos_soc_event_register_callback(channel_id, callback, arg); + pos_soc_event_register_callback(event, callback, arg); } static inline void pi_soc_eu_pr_mask_set(int evt) @@ -70,4 +70,14 @@ static inline void pos_soc_event_wait(int event) hal_irq_restore(irq); } 
-#endif \ No newline at end of file +static inline void pi_soc_eu_fc_mask_set(uint32_t event_num) +{ + soc_eu_fc_mask_clr_set(SOC_EU_ADDR, event_num); +} + +static inline void pi_soc_eu_fc_mask_clear(uint32_t event_num) +{ + soc_eu_fc_mask_set_set(SOC_EU_ADDR, event_num); +} + +#endif diff --git a/rtos/pulp/pulpos-2/include/pos/implem/task.h b/rtos/pulp/pulpos-2/include/pos/implem/task.h index a3f705e78..1e34be980 100644 --- a/rtos/pulp/pulpos-2/include/pos/implem/task.h +++ b/rtos/pulp/pulpos-2/include/pos/implem/task.h @@ -40,6 +40,17 @@ static inline void pi_task_destroy(pi_task_t *task) } +static inline int32_t pi_task_status_get(pi_task_t *task) +{ + return task->arg[3]; +} + +static inline void pi_task_status_set(pi_task_t *task, int32_t status) +{ + task->arg[3] = status; +} + + static inline void pi_task_wait_on(struct pi_task *task) { int irq = hal_irq_disable(); diff --git a/rtos/pulp/pulpos-2/kernel/init.c b/rtos/pulp/pulpos-2/kernel/init.c index 09c8e544b..310ede378 100644 --- a/rtos/pulp/pulpos-2/kernel/init.c +++ b/rtos/pulp/pulpos-2/kernel/init.c @@ -72,6 +72,11 @@ static void pos_init_bss() } +void __attribute__((weak)) pi_bsp_init() +{ +} + + void pos_init_start() { INIT_INF("Starting runtime initialization\n"); @@ -106,6 +111,8 @@ void pos_init_start() // Now now the minimal init are done, we can activate interruptions hal_irq_enable(); + pi_bsp_init(); + int retval = main(); exit(retval); diff --git a/rtos/pulp/pulpos-2/rules/pulpos/default_rules.mk b/rtos/pulp/pulpos-2/rules/pulpos/default_rules.mk index a5edf1b22..061e65c1e 100644 --- a/rtos/pulp/pulpos-2/rules/pulpos/default_rules.mk +++ b/rtos/pulp/pulpos-2/rules/pulpos/default_rules.mk @@ -309,6 +309,8 @@ build: $(TARGETS) image: gapy $(GAPY_TARGET_OPT) --platform=$(platform) --work-dir=$(TARGET_BUILD_DIR) $(config_args) $(gapy_args) run --image --binary=$(TARGETS) $(runner_args) +flash_noforce: flash + flash: gapy $(GAPY_TARGET_OPT) --platform=$(platform) --work-dir=$(TARGET_BUILD_DIR) 
$(config_args) $(gapy_args) run --flash --binary=$(TARGETS) $(runner_args) @@ -354,4 +356,4 @@ install-lib: build-lib # @echo " CONFIG_TRACE_LEVEL= Activate traces for the specified level (0=none, 1=fatal, 2=error, 3=warning, 4=info, 5=debug, 6=trace)." # @echo " CONFIG_TRACE_ALL=1 Activate all traces. Other traces can be individually activated with CONFIG_TRACE_." -.PHONY: image flash exec run dis size help clean all conf build-lib install-lib +.PHONY: image flash flash_noforce exec run dis size help clean all conf build-lib install-lib diff --git a/rtos/pulp/pulpos-2/rules/pulpos/src.mk b/rtos/pulp/pulpos-2/rules/pulpos/src.mk index cd77171cc..43feacbed 100644 --- a/rtos/pulp/pulpos-2/rules/pulpos/src.mk +++ b/rtos/pulp/pulpos-2/rules/pulpos/src.mk @@ -30,7 +30,7 @@ endif # HYPER ifeq '$(CONFIG_HYPER)' '1' -ifneq '$(udma/version)' '' +ifneq '$(udma/hyper/version)' '' ifeq '$(TARGET_CHIP_FAMILY)' 'GAP9' HYPER_HAS_ASM = 1 HYPER_HAS_OCTOSPI = 1 diff --git a/tools/autotiler_v3/Autotiler/AutoTilerLibTypes.h b/tools/autotiler_v3/Autotiler/AutoTilerLibTypes.h index b5bdac14e..e3d807233 100644 --- a/tools/autotiler_v3/Autotiler/AutoTilerLibTypes.h +++ b/tools/autotiler_v3/Autotiler/AutoTilerLibTypes.h @@ -916,6 +916,7 @@ typedef struct { uint64_t PaddedSize; /* Total size in bytes or bits of this kernel argument, byte aligned, forced padding taken into account */ uint64_t BitSize; /* Total size in bits of this kernel argument (unaligned) */ uint64_t PaddedBitSize; /* Total size in bits of this kernel argument (unaligned), forced padding taken into account */ + uint64_t Overflow; /* Amount of read overflow in bytes, it can happen on TILED arg when right or bottom padding is greater than the size of the last tile */ KernelArgOneDimDescrT **DimDescr; /* A vector of dimension description outer to inner */ KernelArgOneDimDescrT **IterOrderDimDescr; /* Reordered DimDescr according to Kernel Iteration Order */ int *KerIterDimDescr; /* Indexed by kernel's IterOrder, if in 
DimDescr then position in IterOrderDimDescr otherwise -1 */ @@ -1429,6 +1430,7 @@ typedef struct AGraphNodeList_T { ArgBindingDescr_T *Binding; /* The bindings from which this edge is originating */ GraphEdgeWeb_T *Web; /* Which symbol */ unsigned int Size; /* Size of this symbol as seen in the related kernel argument */ + unsigned int Guard; /* Extra space above size in case related arg can read overflow */ int Offset; /* Offset applied to the base of this symbol in case binding Oper is + or - */ int Channel; /* To which channel this symbol belongs to */ int ChannelDepth; /* Channel depth */ @@ -1457,6 +1459,7 @@ typedef struct { AT_MemLocation_T MemType; unsigned int Address; unsigned int Size; + unsigned int Guard; int LiveFirst; int LiveLast; BoxType_T AllocType; @@ -1484,6 +1487,7 @@ typedef struct AGraphEdgeWeb_T { CKernel_Arg_T *Edge; /* The symbol, CArgs or Locals in the current graph */ unsigned int Index; /* Index of this Symbol */ unsigned int Size; /* Size of this symbol */ + unsigned int Guard; /* Guard on top of Size for this symbol in case of read overflow */ int LiveFirst; /* Graph node index of start life for this symbol */ int LiveLast; /* Graph node index of start stop for this symbol */ Kernel_Arg_T *KerArg; /* This symbol is bounded to this Kernel argument */ diff --git a/tools/autotiler_v3/Autotiler/TilingGenCode.h b/tools/autotiler_v3/Autotiler/TilingGenCode.h index 53f9bc350..b036d5d5a 100644 --- a/tools/autotiler_v3/Autotiler/TilingGenCode.h +++ b/tools/autotiler_v3/Autotiler/TilingGenCode.h @@ -22,6 +22,7 @@ extern void LogicalTileNAddressAndSizeOrig(Kernel_T *Ker, Kernel_Arg_T *Arg, uin extern char *BindOpImage(ArgBindingOper Op); extern char *KernelArgImage(Kernel_T *Ker, Kernel_Arg_T *Arg, CKernel_Arg_T *ArgVal, KernelArgSelect_T ArgSel, KernelIteratorT ArgSpace, KernelIteratorT ItSpace, int *IsInvar); +extern int EvalArgOverflow(Kernel_T *Ker, Kernel_Arg_T *Arg, Object_T *Obj); #endif diff --git 
a/tools/autotiler_v3/CNN_Generators_NE16/CNN_Generators_NE16.c b/tools/autotiler_v3/CNN_Generators_NE16/CNN_Generators_NE16.c index d78ce6039..ef93bf8b1 100644 --- a/tools/autotiler_v3/CNN_Generators_NE16/CNN_Generators_NE16.c +++ b/tools/autotiler_v3/CNN_Generators_NE16/CNN_Generators_NE16.c @@ -422,7 +422,7 @@ Kernel_T *CNN_MM_ConvolutionNE16( int WOffsetCfg = 1; int QuantRightShift = 0; int QuantBits = (NeedReduct)?2:(Abs(Out_DataSize)==2?1:0); // 00: 8bit, 01: 16bit, 10: 32bit --> If tiling the channel input dimension you need to streamin (need 32 bits output) - int QuantNoRect = (Out_DataSize>0)?1:0; + int QuantNoRect = (NeedReduct || (Out_DataSize>0))?1:0; int NormShift = 1; int NormBias = 1; unsigned int DEFAULT_NE16_JOB_CFG = NE16_DefaultConfig(Filter_DataSizeBits, Mode16, StreamoutMode, FilterMode, LinearMode, StridedMode, NormBits, Streamin, \ @@ -676,7 +676,7 @@ static Kernel_T *CNN_ConvolutionNE16_Internal( LayerBandwidth += (Fcx*Fcy*Filter_DataSizeBits*InFeat*(DWConv?1:OutFeat)+7)/8; LayerBandwidth += Bias_DataSize*OutFeat; - if (ConvOper == KOP_CONV && Height == 1 && Fcy == 1) ConvOper = KOP_CONV1D; + if (ConvOper == KOP_CONV && Height == 1 && Fcy == 1 && Fcx > 1) ConvOper = KOP_CONV1D; ConvKerName = CNN_FindMatchingKernelAttr(ConvOper, KOP_NONE, ParFeat, CALL_NE16_KER, Abs(In_DataSize), Abs(Out_DataSize), Bias_DataSize, 0, 4, Fcx, Fcy, Dcx, Dcy, Scx, Scy, &NeedFcx, &NeedFcy, &NeedDcx, &NeedDcy, &NeedScx, &NeedScy, 0); if (ConvKerName==0) GenTilingError("CNN_ConvolutionPoolAct_NE16 Kernel: %s, Can't find a matching Convolution basic kernel", Name); @@ -734,7 +734,7 @@ static Kernel_T *CNN_ConvolutionNE16_Internal( int WOffsetCfg = 1; int QuantRightShift = 0; int QuantBits = (NeedReduct)?2:(Abs(Out_DataSize)==2?1:0); // 00: 8bit, 01: 16bit, 10: 32bit --> If tiling the channel input dimension you need to streamin (need 32 bits output) - int QuantNoRect = (Out_DataSize>0 || Mode16)?1:0; + int QuantNoRect = (NeedReduct || (Out_DataSize>0))?1:0; int 
NormShift = 1; int NormBias = !Mode16; unsigned int DEFAULT_NE16_JOB_CFG = NE16_DefaultConfig(Filter_DataSizeBits, Mode16, StreamoutMode, FilterMode, LinearMode, StridedMode, NormBits, Streamin, \ @@ -966,7 +966,7 @@ int CNN_ConvolutionNE16( ) { - if (Fcx==1 && Fcy==1 && Scx==1 && Scy==1 && Dcx==1 && Dcy==1 && Height==1 && Width==1) { + if (Fcx==1 && Fcy==1 && Height==1 && Width==1) { printf("This is a pointwise on 1x1 input --> Mapping to CNN_Linear_NE16\n"); CNN_LinearAct_NE16(Name, Ctrl, In_DataSize, Out_DataSize, Bias_DataSize, Scale_DataSize, Filter_DataSizeBits, InFeat, OutFeat, KOP_LINEAR, ActOper); return 1; @@ -1063,7 +1063,7 @@ int CNN_ConvolutionNE16( OutDim: Number of outputs LinearOper KOP_LINEAR - ActOper Optional activation function: KOP_NONE, KOP_RELU, KOP_RELUN, KOP_HSWISH, KOP_HSIGMOID, KOP_LEAKYRELU + ActOper Optional activation function: KOP_NONE, KOP_RELU, KOP_RELUN, KOP_HSWISH, KOP_HSIGMOID, KOP_LEAKYRELU, KOP_SIGMOID, KOP_TANH Signature: Name(In, Filter, Bias, Out, Scale, ScaleN, Infos) @@ -1119,7 +1119,7 @@ static Kernel_T *CNN_LinearAct_NE16_Internal( LinearKerName = CNN_FindMatchingKernelAttr(LinearOper, KOP_NONE, 0, CALL_NE16_KER, Abs(In_DataSize), 0, Bias_DataSize, 0,0, 0,0,0,0,0,0, 0,0,0,0,0,0, 0); if (LinearKerName==0) GenTilingError("CNN_LinearAct_NE16 Kernel: %s, Can't find a matching Linear basic kernel: %d %d", Name, Abs(In_DataSize), Bias_DataSize); if (!(ActOper == KOP_NONE || ActOper == KOP_RELU || ActOper == KOP_RELUN || ActOper == KOP_RELUM || ActOper == KOP_HSIGMOID || ActOper == KOP_SIGMOID || ActOper == KOP_HSWISH || ActOper == KOP_LEAKYRELU || ActOper == KOP_TANH)) - GenTilingError("CNN_ConvolutionPoolAct_NE16 Kernel: %s, ActOper, expecting KOP_NONE, KOP_RELU, KOP_RELUN, KOP_RELUM, KOP_HSIGMOID, KOP_HSWISH or KOP_LEAKYRELU", Name); + GenTilingError("CNN_ConvolutionPoolAct_NE16 Kernel: %s, ActOper, expecting KOP_NONE, KOP_RELU, KOP_RELUN, KOP_RELUM, KOP_HSIGMOID, KOP_HSWISH, KOP_LEAKYRELU, KOP_SIGMOID or KOP_TANH", Name); 
/* Also when in/out are 16bits you need to streamout 32bits but here the reduction step will be done in the cluster (KOP_DP_REDUCT) */ int NeedReductScale = Mode16; //Abs(Out_DataSize) == 2; @@ -1155,7 +1155,7 @@ static Kernel_T *CNN_LinearAct_NE16_Internal( int WOffsetCfg = 1; int QuantRightShift = 0; int QuantBits = NeedLinOut?2:((Abs(Out_DataSize)==1)?0:1); // 00: 8bit, 01: 16bit, 10: 32bit - int QuantNoRect = (Out_DataSize>0 || Mode16)?1:0; + int QuantNoRect = (NeedReduct || (Out_DataSize>0))?1:0; int NormBias = !Mode16; int NormShift = 1; unsigned int DEFAULT_NE16_JOB_CFG = NE16_DefaultConfig(Filter_DataSizeBits, Mode16, StreamoutMode, FilterMode, LinearMode, StridedMode, NormBits, Streamin, \ diff --git a/tools/autotiler_v3/CNN_Generators_NE16/RNN_Generators_NE16.c b/tools/autotiler_v3/CNN_Generators_NE16/RNN_Generators_NE16.c index 61db23112..fb4b4693b 100644 --- a/tools/autotiler_v3/CNN_Generators_NE16/RNN_Generators_NE16.c +++ b/tools/autotiler_v3/CNN_Generators_NE16/RNN_Generators_NE16.c @@ -651,7 +651,7 @@ int RNN_Stack_NE16( if (Log) printf("Mapped sequence tile based with %d output size constraint\n", DoConstraint); } else { if (Log) printf("Failed to map sequence tile based with %d output size constraint, relaxing constraint\n", DoConstraint); - DoConstraint = (DoConstraint>16)?DoConstraint/2:0; + DoConstraint = (DoConstraint>16)?DoConstraint/2:1; } } else { if (Ok) { @@ -1128,7 +1128,7 @@ int LSTM_Stack_NE16( if (Log) printf("Mapped sequence tile based with %d output size constraint\n", DoConstraint); } else { if (Log) printf("Failed to map sequence tile based with %d output size constraint, relaxing constraint\n", DoConstraint); - DoConstraint = (DoConstraint>16)?DoConstraint-8:0; + DoConstraint = (DoConstraint>16)?DoConstraint-8:1; } } else { if (Ok) { @@ -1664,7 +1664,7 @@ int GRU_Stack_NE16( if (Log) printf("Mapped sequence tile based with %d output size constraint\n", DoConstraint); } else { if (Log) printf("Failed to map sequence tile based 
with %d output size constraint, relaxing constraint\n", DoConstraint); - DoConstraint = (DoConstraint>16)?DoConstraint-8:0; + DoConstraint = (DoConstraint>16)?DoConstraint-8:1; } } else { if (Ok) { diff --git a/tools/autotiler_v3/CNN_Generators_SQ8/CNN_Generators_SQ8.c b/tools/autotiler_v3/CNN_Generators_SQ8/CNN_Generators_SQ8.c index 0deeb9d8f..44b3c2891 100644 --- a/tools/autotiler_v3/CNN_Generators_SQ8/CNN_Generators_SQ8.c +++ b/tools/autotiler_v3/CNN_Generators_SQ8/CNN_Generators_SQ8.c @@ -448,17 +448,38 @@ void LoadCNN_SQ8_Library() LibKernel("KerParLinearLayer_SQ8", CALL_PARALLEL, 0, "KerLinear_SQ8_T", CNN_Match(CNN_OperList(1, KOP_LINEAR), 0, 1, CNN_Type(1,1,0,0,4), 0,0,0,0,0,0)); /* Linear layer, 8b output with bias and scaling/activation (ReLU, ReLUN) done in a single shot */ - LibKernel("KerParLinearLayerFullFeatB8_SQ8", CALL_PARALLEL, 0, "KerLinear_SQ8_T", CNN_Match(CNN_OperList(1, KOP_LINEAR), CNN_OperList(1, KOP_NONE), 1, CNN_Type(1,1,1,0,1), 0,0,0,0,0,0)); - LibKernel("KerParLinearLayerFullFeatB8_ReLU_SQ8", CALL_PARALLEL, 0, "KerLinear_SQ8_T", CNN_Match(CNN_OperList(1, KOP_LINEAR), CNN_OperList(1, KOP_RELU), 1, CNN_Type(1,1,1,0,1), 0,0,0,0,0,0)); - LibKernel("KerParLinearLayerFullFeatB8_ReLUN_SQ8", CALL_PARALLEL, 0, "KerLinear_SQ8_T", CNN_Match(CNN_OperList(1, KOP_LINEAR), CNN_OperList(1, KOP_RELUN), 1, CNN_Type(1,1,1,0,1), 0,0,0,0,0,0)); - - LibKernel("KerParLinearLayerFullFeatB16_SQ8", CALL_PARALLEL, 0, "KerLinear_SQ8_T", CNN_Match(CNN_OperList(1, KOP_LINEAR), CNN_OperList(1, KOP_NONE), 1, CNN_Type(1,1,2,0,1), 0,0,0,0,0,0)); - LibKernel("KerParLinearLayerFullFeatB16_ReLU_SQ8", CALL_PARALLEL, 0, "KerLinear_SQ8_T", CNN_Match(CNN_OperList(1, KOP_LINEAR), CNN_OperList(1, KOP_RELU), 1, CNN_Type(1,1,2,0,1), 0,0,0,0,0,0)); - LibKernel("KerParLinearLayerFullFeatB16_ReLUN_SQ8", CALL_PARALLEL, 0, "KerLinear_SQ8_T",CNN_Match(CNN_OperList(1, KOP_LINEAR), CNN_OperList(1, KOP_RELUN), 1, CNN_Type(1,1,2,0,1), 0,0,0,0,0,0)); - - 
LibKernel("KerParLinearLayerFullFeatB32_SQ8", CALL_PARALLEL, 0, "KerLinear_SQ8_T", CNN_Match(CNN_OperList(1, KOP_LINEAR), CNN_OperList(1, KOP_NONE), 1, CNN_Type(1,1,4,0,1), 0,0,0,0,0,0)); - LibKernel("KerParLinearLayerFullFeatB32_ReLU_SQ8", CALL_PARALLEL, 0, "KerLinear_SQ8_T", CNN_Match(CNN_OperList(1, KOP_LINEAR), CNN_OperList(1, KOP_RELU), 1, CNN_Type(1,1,4,0,1), 0,0,0,0,0,0)); - LibKernel("KerParLinearLayerFullFeatB32_ReLUN_SQ8", CALL_PARALLEL, 0, "KerLinear_SQ8_T",CNN_Match(CNN_OperList(1, KOP_LINEAR), CNN_OperList(1, KOP_RELUN), 1, CNN_Type(1,1,4,0,1), 0,0,0,0,0,0)); + LibKernel("KerParLinearLayerFullFeatB8_SQ8", CALL_PARALLEL, 0, "KerLinear_SQ8_T", CNN_Match(CNN_OperList(1, KOP_LINEAR), CNN_OperList(1, KOP_NONE), 1, CNN_Type(1,1,1,0,1), 0,0,0,0,0,0)); + LibKernel("KerParLinearLayerFullFeatB8_ReLU_SQ8", CALL_PARALLEL, 0, "KerLinear_SQ8_T", CNN_Match(CNN_OperList(1, KOP_LINEAR), CNN_OperList(1, KOP_RELU), 1, CNN_Type(1,1,1,0,1), 0,0,0,0,0,0)); + LibKernel("KerParLinearLayerFullFeatB8_ReLUN_SQ8", CALL_PARALLEL, 0, "KerLinear_SQ8_T", CNN_Match(CNN_OperList(1, KOP_LINEAR), CNN_OperList(1, KOP_RELUN), 1, CNN_Type(1,1,1,0,1), 0,0,0,0,0,0)); + LibKernel("KerParLinearLayerFullFeatB8_ReLUM_SQ8", CALL_PARALLEL, 0, "KerLinear_SQ8_T", CNN_Match(CNN_OperList(1, KOP_LINEAR), CNN_OperList(1, KOP_RELUM), 1, CNN_Type(1,1,1,0,1), 0,0,0,0,0,0)); + LibKernel("KerParLinearLayerFullFeatB8_ReLUMN_SQ8", CALL_PARALLEL, 0, "KerLinear_SQ8_T", CNN_Match(CNN_OperList(1, KOP_LINEAR), CNN_OperList(1, KOP_RELUMN), 1, CNN_Type(1,1,1,0,1), 0,0,0,0,0,0)); + LibKernel("KerParLinearLayerFullFeatB8_LeakyReLU_SQ8", CALL_PARALLEL, 0, "KerLinear_SQ8_T", CNN_Match(CNN_OperList(1, KOP_LINEAR), CNN_OperList(1, KOP_LEAKYRELU), 1, CNN_Type(1,1,1,0,1), 0,0,0,0,0,0)); + LibKernel("KerParLinearLayerFullFeatB8_HSwish_SQ8", CALL_PARALLEL, 0, "KerLinear_SQ8_T", CNN_Match(CNN_OperList(1, KOP_LINEAR), CNN_OperList(1, KOP_HSWISH), 1, CNN_Type(1,1,1,0,1), 0,0,0,0,0,0)); + 
LibKernel("KerParLinearLayerFullFeatB8_HSigmoid_SQ8", CALL_PARALLEL, 0, "KerLinear_SQ8_T", CNN_Match(CNN_OperList(1, KOP_LINEAR), CNN_OperList(1, KOP_HSIGMOID), 1, CNN_Type(1,1,1,0,1), 0,0,0,0,0,0)); + LibKernel("KerParLinearLayerFullFeatB8_Sigmoid_SQ8", CALL_PARALLEL, 0, "KerLinear_SQ8_T", CNN_Match(CNN_OperList(1, KOP_LINEAR), CNN_OperList(1, KOP_SIGMOID), 1, CNN_Type(1,1,1,0,1), 0,0,0,0,0,0)); + LibKernel("KerParLinearLayerFullFeatB8_Tanh_SQ8", CALL_PARALLEL, 0, "KerLinear_SQ8_T", CNN_Match(CNN_OperList(1, KOP_LINEAR), CNN_OperList(1, KOP_TANH), 1, CNN_Type(1,1,1,0,1), 0,0,0,0,0,0)); + + LibKernel("KerParLinearLayerFullFeatB16_SQ8", CALL_PARALLEL, 0, "KerLinear_SQ8_T", CNN_Match(CNN_OperList(1, KOP_LINEAR), CNN_OperList(1, KOP_NONE), 1, CNN_Type(1,1,2,0,1), 0,0,0,0,0,0)); + LibKernel("KerParLinearLayerFullFeatB16_ReLU_SQ8", CALL_PARALLEL, 0, "KerLinear_SQ8_T", CNN_Match(CNN_OperList(1, KOP_LINEAR), CNN_OperList(1, KOP_RELU), 1, CNN_Type(1,1,2,0,1), 0,0,0,0,0,0)); + LibKernel("KerParLinearLayerFullFeatB16_ReLUN_SQ8", CALL_PARALLEL, 0, "KerLinear_SQ8_T", CNN_Match(CNN_OperList(1, KOP_LINEAR), CNN_OperList(1, KOP_RELUN), 1, CNN_Type(1,1,2,0,1), 0,0,0,0,0,0)); + LibKernel("KerParLinearLayerFullFeatB16_ReLUM_SQ8", CALL_PARALLEL, 0, "KerLinear_SQ8_T", CNN_Match(CNN_OperList(1, KOP_LINEAR), CNN_OperList(1, KOP_RELUM), 1, CNN_Type(1,1,2,0,1), 0,0,0,0,0,0)); + LibKernel("KerParLinearLayerFullFeatB16_ReLUMN_SQ8", CALL_PARALLEL, 0, "KerLinear_SQ8_T", CNN_Match(CNN_OperList(1, KOP_LINEAR), CNN_OperList(1, KOP_RELUMN), 1, CNN_Type(1,1,2,0,1), 0,0,0,0,0,0)); + LibKernel("KerParLinearLayerFullFeatB16_LeakyReLU_SQ8", CALL_PARALLEL, 0, "KerLinear_SQ8_T", CNN_Match(CNN_OperList(1, KOP_LINEAR), CNN_OperList(1, KOP_LEAKYRELU), 1, CNN_Type(1,1,2,0,1), 0,0,0,0,0,0)); + LibKernel("KerParLinearLayerFullFeatB16_HSwish_SQ8", CALL_PARALLEL, 0, "KerLinear_SQ8_T", CNN_Match(CNN_OperList(1, KOP_LINEAR), CNN_OperList(1, KOP_HSWISH), 1, CNN_Type(1,1,2,0,1), 0,0,0,0,0,0)); + 
LibKernel("KerParLinearLayerFullFeatB16_HSigmoid_SQ8", CALL_PARALLEL, 0, "KerLinear_SQ8_T", CNN_Match(CNN_OperList(1, KOP_LINEAR), CNN_OperList(1, KOP_HSIGMOID), 1, CNN_Type(1,1,2,0,1), 0,0,0,0,0,0)); + LibKernel("KerParLinearLayerFullFeatB16_Sigmoid_SQ8", CALL_PARALLEL, 0, "KerLinear_SQ8_T", CNN_Match(CNN_OperList(1, KOP_LINEAR), CNN_OperList(1, KOP_SIGMOID), 1, CNN_Type(1,1,2,0,1), 0,0,0,0,0,0)); + LibKernel("KerParLinearLayerFullFeatB16_Tanh_SQ8", CALL_PARALLEL, 0, "KerLinear_SQ8_T", CNN_Match(CNN_OperList(1, KOP_LINEAR), CNN_OperList(1, KOP_TANH), 1, CNN_Type(1,1,2,0,1), 0,0,0,0,0,0)); + + LibKernel("KerParLinearLayerFullFeatB32_SQ8", CALL_PARALLEL, 0, "KerLinear_SQ8_T", CNN_Match(CNN_OperList(1, KOP_LINEAR), CNN_OperList(1, KOP_NONE), 1, CNN_Type(1,1,4,0,1), 0,0,0,0,0,0)); + LibKernel("KerParLinearLayerFullFeatB32_ReLU_SQ8", CALL_PARALLEL, 0, "KerLinear_SQ8_T", CNN_Match(CNN_OperList(1, KOP_LINEAR), CNN_OperList(1, KOP_RELU), 1, CNN_Type(1,1,4,0,1), 0,0,0,0,0,0)); + LibKernel("KerParLinearLayerFullFeatB32_ReLUN_SQ8", CALL_PARALLEL, 0, "KerLinear_SQ8_T", CNN_Match(CNN_OperList(1, KOP_LINEAR), CNN_OperList(1, KOP_RELUN), 1, CNN_Type(1,1,4,0,1), 0,0,0,0,0,0)); + LibKernel("KerParLinearLayerFullFeatB32_ReLUM_SQ8", CALL_PARALLEL, 0, "KerLinear_SQ8_T", CNN_Match(CNN_OperList(1, KOP_LINEAR), CNN_OperList(1, KOP_RELUM), 1, CNN_Type(1,1,4,0,1), 0,0,0,0,0,0)); + LibKernel("KerParLinearLayerFullFeatB32_ReLUMN_SQ8", CALL_PARALLEL, 0, "KerLinear_SQ8_T", CNN_Match(CNN_OperList(1, KOP_LINEAR), CNN_OperList(1, KOP_RELUMN), 1, CNN_Type(1,1,4,0,1), 0,0,0,0,0,0)); + LibKernel("KerParLinearLayerFullFeatB32_LeakyReLU_SQ8", CALL_PARALLEL, 0, "KerLinear_SQ8_T", CNN_Match(CNN_OperList(1, KOP_LINEAR), CNN_OperList(1, KOP_LEAKYRELU), 1, CNN_Type(1,1,4,0,1), 0,0,0,0,0,0)); + LibKernel("KerParLinearLayerFullFeatB32_HSwish_SQ8", CALL_PARALLEL, 0, "KerLinear_SQ8_T", CNN_Match(CNN_OperList(1, KOP_LINEAR), CNN_OperList(1, KOP_HSWISH), 1, CNN_Type(1,1,4,0,1), 0,0,0,0,0,0)); + 
LibKernel("KerParLinearLayerFullFeatB32_HSigmoid_SQ8", CALL_PARALLEL, 0, "KerLinear_SQ8_T", CNN_Match(CNN_OperList(1, KOP_LINEAR), CNN_OperList(1, KOP_HSIGMOID), 1, CNN_Type(1,1,4,0,1), 0,0,0,0,0,0)); + LibKernel("KerParLinearLayerFullFeatB32_Sigmoid_SQ8", CALL_PARALLEL, 0, "KerLinear_SQ8_T", CNN_Match(CNN_OperList(1, KOP_LINEAR), CNN_OperList(1, KOP_SIGMOID), 1, CNN_Type(1,1,4,0,1), 0,0,0,0,0,0)); + LibKernel("KerParLinearLayerFullFeatB32_Tanh_SQ8", CALL_PARALLEL, 0, "KerLinear_SQ8_T", CNN_Match(CNN_OperList(1, KOP_LINEAR), CNN_OperList(1, KOP_TANH), 1, CNN_Type(1,1,4,0,1), 0,0,0,0,0,0)); /* Convolution or Linear output reduction with per channel scaling and optional activation. Out != In and In Place (IO) */ LibKernel("KerParReduct_CC_SQ8", CALL_PARALLEL, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT), CNN_OperList(1, KOP_NONE), 1, @@ -635,111 +656,281 @@ void LoadCNN_SQ8_Library() /* Matrix Multiplication for 1x1 convolutions with channel scaling and optional ReLU or ReLUN activation */ /* 8b Bias */ - LibKernel("KerParMatMulB8_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL), CNN_OperList(1, KOP_NONE), 1, CNN_Type(1,1,1,0,1), 0,0,0,0,1,1)); - LibKernel("KerParMatMulB8_ReLU_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL), CNN_OperList(1, KOP_RELU), 1, CNN_Type(1,1,1,0,1), 0,0,0,0,1,1)); - LibKernel("KerParMatMulB8_ReLUN_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL), CNN_OperList(1, KOP_RELUN), 1, CNN_Type(1,1,1,0,1), 0,0,0,0,1,1)); + LibKernel("KerParMatMulB8_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL), CNN_OperList(1, KOP_NONE), 1, CNN_Type(1,1,1,0,1), 0,0,0,0,1,1)); + LibKernel("KerParMatMulB8_ReLU_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL), CNN_OperList(1, KOP_RELU), 1, CNN_Type(1,1,1,0,1), 0,0,0,0,1,1)); + LibKernel("KerParMatMulB8_ReLUN_SQ8", CALL_PARALLEL, 0, 
"KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL), CNN_OperList(1, KOP_RELUN), 1, CNN_Type(1,1,1,0,1), 0,0,0,0,1,1)); + LibKernel("KerParMatMulB8_ReLUM_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL), CNN_OperList(1, KOP_RELUM), 1, CNN_Type(1,1,1,0,1), 0,0,0,0,1,1)); + LibKernel("KerParMatMulB8_ReLUMN_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL), CNN_OperList(1, KOP_RELUMN), 1, CNN_Type(1,1,1,0,1), 0,0,0,0,1,1)); + LibKernel("KerParMatMulB8_LeakyReLU_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL), CNN_OperList(1, KOP_LEAKYRELU), 1, CNN_Type(1,1,1,0,1), 0,0,0,0,1,1)); + LibKernel("KerParMatMulB8_HSwish_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL), CNN_OperList(1, KOP_HSWISH), 1, CNN_Type(1,1,1,0,1), 0,0,0,0,1,1)); + LibKernel("KerParMatMulB8_HSigmoid_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL), CNN_OperList(1, KOP_HSIGMOID), 1, CNN_Type(1,1,1,0,1), 0,0,0,0,1,1)); + LibKernel("KerParMatMulB8_Sigmoid_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL), CNN_OperList(1, KOP_SIGMOID), 1, CNN_Type(1,1,1,0,1), 0,0,0,0,1,1)); + LibKernel("KerParMatMulB8_Tanh_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL), CNN_OperList(1, KOP_TANH), 1, CNN_Type(1,1,1,0,1), 0,0,0,0,1,1)); + + LibKernel("KerParMatMulSxSyB8_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL), CNN_OperList(1, KOP_NONE), 1, CNN_Type(1,1,1,0,1), 0,0,0,0,-1,-1)); + LibKernel("KerParMatMulSxSyB8_ReLU_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL), CNN_OperList(1, KOP_RELU), 1, CNN_Type(1,1,1,0,1), 0,0,0,0,-1,-1)); + LibKernel("KerParMatMulSxSyB8_ReLUN_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL), CNN_OperList(1, KOP_RELUN), 1, CNN_Type(1,1,1,0,1), 0,0,0,0,-1,-1)); + 
LibKernel("KerParMatMulSxSyB8_ReLUM_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL), CNN_OperList(1, KOP_RELUM), 1, CNN_Type(1,1,1,0,1), 0,0,0,0,-1,-1)); + LibKernel("KerParMatMulSxSyB8_ReLUMN_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL), CNN_OperList(1, KOP_RELUMN), 1, CNN_Type(1,1,1,0,1), 0,0,0,0,-1,-1)); + LibKernel("KerParMatMulSxSyB8_LeakyReLU_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL), CNN_OperList(1, KOP_LEAKYRELU), 1, CNN_Type(1,1,1,0,1), 0,0,0,0,-1,-1)); + LibKernel("KerParMatMulSxSyB8_HSwish_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL), CNN_OperList(1, KOP_HSWISH), 1, CNN_Type(1,1,1,0,1), 0,0,0,0,-1,-1)); + LibKernel("KerParMatMulSxSyB8_HSigmoid_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL), CNN_OperList(1, KOP_HSIGMOID), 1, CNN_Type(1,1,1,0,1), 0,0,0,0,-1,-1)); + LibKernel("KerParMatMulSxSyB8_Sigmoid_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL), CNN_OperList(1, KOP_SIGMOID), 1, CNN_Type(1,1,1,0,1), 0,0,0,0,-1,-1)); + LibKernel("KerParMatMulSxSyB8_Tanh_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL), CNN_OperList(1, KOP_TANH), 1, CNN_Type(1,1,1,0,1), 0,0,0,0,-1,-1)); - LibKernel("KerParMatMulSxSyB8_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL), CNN_OperList(1, KOP_NONE), 1, CNN_Type(1,1,1,0,1), 0,0,0,0,-1,-1)); - LibKernel("KerParMatMulSxSyB8_ReLU_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL), CNN_OperList(1, KOP_RELU), 1, CNN_Type(1,1,1,0,1), 0,0,0,0,-1,-1)); - LibKernel("KerParMatMulSxSyB8_ReLUN_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL), CNN_OperList(1, KOP_RELUN), 1, CNN_Type(1,1,1,0,1), 0,0,0,0,-1,-1)); /* 16b Bias */ - LibKernel("KerParMatMulB16_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T", 
CNN_Match(CNN_OperList(1, KOP_MATMUL), CNN_OperList(1, KOP_NONE), 1, CNN_Type(1,1,2,0,1), 0,0,0,0,1,1)); - LibKernel("KerParMatMulB16_ReLU_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL), CNN_OperList(1, KOP_RELU), 1, CNN_Type(1,1,2,0,1), 0,0,0,0,1,1)); - LibKernel("KerParMatMulB16_ReLUN_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL), CNN_OperList(1, KOP_RELUN), 1, CNN_Type(1,1,2,0,1), 0,0,0,0,1,1)); - - LibKernel("KerParMatMulSxSyB16_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL), CNN_OperList(1, KOP_NONE), 1, CNN_Type(1,1,2,0,1), 0,0,0,0,-1,-1)); - LibKernel("KerParMatMulSxSyB16_ReLU_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL), CNN_OperList(1, KOP_RELU), 1, CNN_Type(1,1,2,0,1), 0,0,0,0,-1,-1)); - LibKernel("KerParMatMulSxSyB16_ReLUN_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL), CNN_OperList(1, KOP_RELUN), 1, CNN_Type(1,1,2,0,1), 0,0,0,0,-1,-1)); + LibKernel("KerParMatMulB16_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL), CNN_OperList(1, KOP_NONE), 1, CNN_Type(1,1,2,0,1), 0,0,0,0,1,1)); + LibKernel("KerParMatMulB16_ReLU_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL), CNN_OperList(1, KOP_RELU), 1, CNN_Type(1,1,2,0,1), 0,0,0,0,1,1)); + LibKernel("KerParMatMulB16_ReLUN_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL), CNN_OperList(1, KOP_RELUN), 1, CNN_Type(1,1,2,0,1), 0,0,0,0,1,1)); + LibKernel("KerParMatMulB16_ReLUM_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL), CNN_OperList(1, KOP_RELUM), 1, CNN_Type(1,1,2,0,1), 0,0,0,0,1,1)); + LibKernel("KerParMatMulB16_ReLUMN_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL), CNN_OperList(1, KOP_RELUMN), 1, CNN_Type(1,1,2,0,1), 0,0,0,0,1,1)); + LibKernel("KerParMatMulB16_LeakyReLU_SQ8", 
CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL), CNN_OperList(1, KOP_LEAKYRELU), 1, CNN_Type(1,1,2,0,1), 0,0,0,0,1,1)); + LibKernel("KerParMatMulB16_HSwish_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL), CNN_OperList(1, KOP_HSWISH), 1, CNN_Type(1,1,2,0,1), 0,0,0,0,1,1)); + LibKernel("KerParMatMulB16_HSigmoid_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL), CNN_OperList(1, KOP_HSIGMOID), 1, CNN_Type(1,1,2,0,1), 0,0,0,0,1,1)); + LibKernel("KerParMatMulB16_Sigmoid_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL), CNN_OperList(1, KOP_SIGMOID), 1, CNN_Type(1,1,2,0,1), 0,0,0,0,1,1)); + LibKernel("KerParMatMulB16_Tanh_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL), CNN_OperList(1, KOP_TANH), 1, CNN_Type(1,1,2,0,1), 0,0,0,0,1,1)); + + LibKernel("KerParMatMulSxSyB16_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL), CNN_OperList(1, KOP_NONE), 1, CNN_Type(1,1,2,0,1), 0,0,0,0,-1,-1)); + LibKernel("KerParMatMulSxSyB16_ReLU_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL), CNN_OperList(1, KOP_RELU), 1, CNN_Type(1,1,2,0,1), 0,0,0,0,-1,-1)); + LibKernel("KerParMatMulSxSyB16_ReLUN_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL), CNN_OperList(1, KOP_RELUN), 1, CNN_Type(1,1,2,0,1), 0,0,0,0,-1,-1)); + LibKernel("KerParMatMulSxSyB16_ReLUM_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL), CNN_OperList(1, KOP_RELUM), 1, CNN_Type(1,1,2,0,1), 0,0,0,0,-1,-1)); + LibKernel("KerParMatMulSxSyB16_ReLUMN_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL), CNN_OperList(1, KOP_RELUMN), 1, CNN_Type(1,1,2,0,1), 0,0,0,0,-1,-1)); + LibKernel("KerParMatMulSxSyB16_LeakyReLU_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL), CNN_OperList(1, KOP_LEAKYRELU), 1, 
CNN_Type(1,1,2,0,1), 0,0,0,0,-1,-1)); + LibKernel("KerParMatMulSxSyB16_HSwish_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL), CNN_OperList(1, KOP_HSWISH), 1, CNN_Type(1,1,2,0,1), 0,0,0,0,-1,-1)); + LibKernel("KerParMatMulSxSyB16_HSigmoid_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL), CNN_OperList(1, KOP_HSIGMOID), 1, CNN_Type(1,1,2,0,1), 0,0,0,0,-1,-1)); + LibKernel("KerParMatMulSxSyB16_Sigmoid_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL), CNN_OperList(1, KOP_SIGMOID), 1, CNN_Type(1,1,2,0,1), 0,0,0,0,-1,-1)); + LibKernel("KerParMatMulSxSyB16_Tanh_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL), CNN_OperList(1, KOP_TANH), 1, CNN_Type(1,1,2,0,1), 0,0,0,0,-1,-1)); /* 32b Bias or No Bias at all */ - LibKernel("KerParMatMulB32_2x4_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(2, KOP_MATMUL, KOP_MATMUL_NOBIAS), CNN_OperList(1, KOP_NONE), 1, CNN_Type(1,1,4,0,1), 0,0,0,0,1,1)); - LibKernel("KerParMatMulB32_2x4_ReLU_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(2, KOP_MATMUL, KOP_MATMUL_NOBIAS), CNN_OperList(1, KOP_RELU), 1, CNN_Type(1,1,4,0,1), 0,0,0,0,1,1)); - LibKernel("KerParMatMulB32_ReLUN_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(2, KOP_MATMUL, KOP_MATMUL_NOBIAS), CNN_OperList(1, KOP_RELUN), 1, CNN_Type(1,1,4,0,1), 0,0,0,0,1,1)); - - LibKernel("KerParMatMulTransposedB32_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(2, KOP_MATMUL_TRANSPOSED, KOP_MATMUL_NOBIAS_TRANSPOSED), CNN_OperList(1, KOP_NONE), 1, CNN_Type(1,1,4,0,1), 0,0,0,0,1,1)); - LibKernel("KerParMatMulTransposedB32_ReLU_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(2, KOP_MATMUL_TRANSPOSED, KOP_MATMUL_NOBIAS_TRANSPOSED), CNN_OperList(1, KOP_RELU), 1, CNN_Type(1,1,4,0,1), 0,0,0,0,1,1)); - LibKernel("KerParMatMulTransposedB32_ReLUN_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T", 
CNN_Match(CNN_OperList(2, KOP_MATMUL_TRANSPOSED, KOP_MATMUL_NOBIAS_TRANSPOSED), CNN_OperList(1, KOP_RELUN), 1, CNN_Type(1,1,4,0,1), 0,0,0,0,1,1)); - - LibKernel("KerParMatMulB32_2x4_PL_SQ8", CALL_PARALLEL, 0, "KerMatMul_PL_SQ8_T", CNN_Match(CNN_OperList(2, KOP_MATMUL_SCALE_SCALAR, KOP_MATMUL_NOBIAS_SCALE_SCALAR), CNN_OperList(1, KOP_NONE), 1, CNN_Type(1,1,4,0,1), 0,0,0,0,1,1)); - LibKernel("KerParMatMulB32_2x4_ReLU_PL_SQ8", CALL_PARALLEL, 0, "KerMatMul_PL_SQ8_T", CNN_Match(CNN_OperList(2, KOP_MATMUL_SCALE_SCALAR, KOP_MATMUL_NOBIAS_SCALE_SCALAR), CNN_OperList(1, KOP_NONE), 1, CNN_Type(1,1,4,0,1), 0,0,0,0,1,1)); - LibKernel("KerParMatMulB32_2x4_ReLUN_PL_SQ8", CALL_PARALLEL, 0, "KerMatMul_PL_SQ8_T", CNN_Match(CNN_OperList(2, KOP_MATMUL_SCALE_SCALAR, KOP_MATMUL_NOBIAS_SCALE_SCALAR), CNN_OperList(1, KOP_NONE), 1, CNN_Type(1,1,4,0,1), 0,0,0,0,1,1)); - - LibKernel("KerParMatMulTransposedB32_PL_SQ8", CALL_PARALLEL, 0, "KerMatMul_PL_SQ8_T", CNN_Match(CNN_OperList(2, KOP_MATMUL_SCALE_SCALAR_TRANSPOSED, KOP_MATMUL_NOBIAS_SCALE_SCALAR_TRANSPOSED), CNN_OperList(1, KOP_NONE), 1, CNN_Type(1,1,4,0,1), 0,0,0,0,1,1)); - LibKernel("KerParMatMulTransposedB32_ReLU_PL_SQ8", CALL_PARALLEL, 0, "KerMatMul_PL_SQ8_T", CNN_Match(CNN_OperList(2, KOP_MATMUL_SCALE_SCALAR_TRANSPOSED, KOP_MATMUL_NOBIAS_SCALE_SCALAR_TRANSPOSED), CNN_OperList(1, KOP_NONE), 1, CNN_Type(1,1,4,0,1), 0,0,0,0,1,1)); - LibKernel("KerParMatMulTransposedB32_ReLUN_PL_SQ8", CALL_PARALLEL, 0, "KerMatMul_PL_SQ8_T", CNN_Match(CNN_OperList(2, KOP_MATMUL_SCALE_SCALAR_TRANSPOSED, KOP_MATMUL_NOBIAS_SCALE_SCALAR_TRANSPOSED), CNN_OperList(1, KOP_NONE), 1, CNN_Type(1,1,4,0,1), 0,0,0,0,1,1)); - - LibKernel("KerParMatMulSxSyB32_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(2, KOP_MATMUL, KOP_MATMUL_NOBIAS), CNN_OperList(1, KOP_NONE), 1, CNN_Type(1,1,4,0,1), 0,0,0,0,-1,-1)); - LibKernel("KerParMatMulSxSyB32_ReLU_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(2, KOP_MATMUL, KOP_MATMUL_NOBIAS), 
CNN_OperList(1, KOP_RELU), 1, CNN_Type(1,1,4,0,1), 0,0,0,0,-1,-1)); - LibKernel("KerParMatMulSxSyB32_ReLUN_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(2, KOP_MATMUL, KOP_MATMUL_NOBIAS), CNN_OperList(1, KOP_RELUN), 1, CNN_Type(1,1,4,0,1), 0,0,0,0,-1,-1)); + LibKernel("KerParMatMulB32_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(2, KOP_MATMUL, KOP_MATMUL_NOBIAS), CNN_OperList(1, KOP_NONE), 1, CNN_Type(1,1,4,0,1), 0,0,0,0,1,1)); + LibKernel("KerParMatMulB32_ReLU_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(2, KOP_MATMUL, KOP_MATMUL_NOBIAS), CNN_OperList(1, KOP_RELU), 1, CNN_Type(1,1,4,0,1), 0,0,0,0,1,1)); + LibKernel("KerParMatMulB32_ReLUN_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(2, KOP_MATMUL, KOP_MATMUL_NOBIAS), CNN_OperList(1, KOP_RELUN), 1, CNN_Type(1,1,4,0,1), 0,0,0,0,1,1)); + LibKernel("KerParMatMulB32_ReLUM_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(2, KOP_MATMUL, KOP_MATMUL_NOBIAS), CNN_OperList(1, KOP_RELUM), 1, CNN_Type(1,1,4,0,1), 0,0,0,0,1,1)); + LibKernel("KerParMatMulB32_ReLUMN_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(2, KOP_MATMUL, KOP_MATMUL_NOBIAS), CNN_OperList(1, KOP_RELUMN), 1, CNN_Type(1,1,4,0,1), 0,0,0,0,1,1)); + LibKernel("KerParMatMulB32_LeakyReLU_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(2, KOP_MATMUL, KOP_MATMUL_NOBIAS), CNN_OperList(1, KOP_LEAKYRELU), 1, CNN_Type(1,1,4,0,1), 0,0,0,0,1,1)); + LibKernel("KerParMatMulB32_HSwish_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(2, KOP_MATMUL, KOP_MATMUL_NOBIAS), CNN_OperList(1, KOP_HSWISH), 1, CNN_Type(1,1,4,0,1), 0,0,0,0,1,1)); + LibKernel("KerParMatMulB32_HSigmoid_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(2, KOP_MATMUL, KOP_MATMUL_NOBIAS), CNN_OperList(1, KOP_HSIGMOID), 1, CNN_Type(1,1,4,0,1), 0,0,0,0,1,1)); + LibKernel("KerParMatMulB32_Sigmoid_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T", 
CNN_Match(CNN_OperList(2, KOP_MATMUL, KOP_MATMUL_NOBIAS), CNN_OperList(1, KOP_SIGMOID), 1, CNN_Type(1,1,4,0,1), 0,0,0,0,1,1)); + LibKernel("KerParMatMulB32_Tanh_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(2, KOP_MATMUL, KOP_MATMUL_NOBIAS), CNN_OperList(1, KOP_TANH), 1, CNN_Type(1,1,4,0,1), 0,0,0,0,1,1)); + + LibKernel("KerParMatMulTransposedB32_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(2, KOP_MATMUL_TRANSPOSED, KOP_MATMUL_NOBIAS_TRANSPOSED), CNN_OperList(1, KOP_NONE), 1, CNN_Type(1,1,4,0,1), 0,0,0,0,1,1)); + LibKernel("KerParMatMulTransposedB32_ReLU_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(2, KOP_MATMUL_TRANSPOSED, KOP_MATMUL_NOBIAS_TRANSPOSED), CNN_OperList(1, KOP_RELU), 1, CNN_Type(1,1,4,0,1), 0,0,0,0,1,1)); + LibKernel("KerParMatMulTransposedB32_ReLUN_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(2, KOP_MATMUL_TRANSPOSED, KOP_MATMUL_NOBIAS_TRANSPOSED), CNN_OperList(1, KOP_RELUN), 1, CNN_Type(1,1,4,0,1), 0,0,0,0,1,1)); + LibKernel("KerParMatMulTransposedB32_ReLUM_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(2, KOP_MATMUL_TRANSPOSED, KOP_MATMUL_NOBIAS_TRANSPOSED), CNN_OperList(1, KOP_RELUM), 1, CNN_Type(1,1,4,0,1), 0,0,0,0,1,1)); + LibKernel("KerParMatMulTransposedB32_ReLUMN_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(2, KOP_MATMUL_TRANSPOSED, KOP_MATMUL_NOBIAS_TRANSPOSED), CNN_OperList(1, KOP_RELUMN), 1, CNN_Type(1,1,4,0,1), 0,0,0,0,1,1)); + LibKernel("KerParMatMulTransposedB32_LeakyReLU_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(2, KOP_MATMUL_TRANSPOSED, KOP_MATMUL_NOBIAS_TRANSPOSED), CNN_OperList(1, KOP_LEAKYRELU), 1, CNN_Type(1,1,4,0,1), 0,0,0,0,1,1)); + LibKernel("KerParMatMulTransposedB32_HSwish_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(2, KOP_MATMUL_TRANSPOSED, KOP_MATMUL_NOBIAS_TRANSPOSED), CNN_OperList(1, KOP_HSWISH), 1, CNN_Type(1,1,4,0,1), 0,0,0,0,1,1)); + 
LibKernel("KerParMatMulTransposedB32_HSigmoid_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(2, KOP_MATMUL_TRANSPOSED, KOP_MATMUL_NOBIAS_TRANSPOSED), CNN_OperList(1, KOP_HSIGMOID), 1, CNN_Type(1,1,4,0,1), 0,0,0,0,1,1)); + LibKernel("KerParMatMulTransposedB32_Sigmoid_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(2, KOP_MATMUL_TRANSPOSED, KOP_MATMUL_NOBIAS_TRANSPOSED), CNN_OperList(1, KOP_SIGMOID), 1, CNN_Type(1,1,4,0,1), 0,0,0,0,1,1)); + LibKernel("KerParMatMulTransposedB32_Tanh_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(2, KOP_MATMUL_TRANSPOSED, KOP_MATMUL_NOBIAS_TRANSPOSED), CNN_OperList(1, KOP_TANH), 1, CNN_Type(1,1,4,0,1), 0,0,0,0,1,1)); + + LibKernel("KerParMatMulB32_PL_SQ8", CALL_PARALLEL, 0, "KerMatMul_PL_SQ8_T", CNN_Match(CNN_OperList(2, KOP_MATMUL_SCALE_SCALAR, KOP_MATMUL_NOBIAS_SCALE_SCALAR), CNN_OperList(1, KOP_NONE), 1, CNN_Type(1,1,4,0,1), 0,0,0,0,1,1)); + LibKernel("KerParMatMulB32_ReLU_PL_SQ8", CALL_PARALLEL, 0, "KerMatMul_PL_SQ8_T", CNN_Match(CNN_OperList(2, KOP_MATMUL_SCALE_SCALAR, KOP_MATMUL_NOBIAS_SCALE_SCALAR), CNN_OperList(1, KOP_NONE), 1, CNN_Type(1,1,4,0,1), 0,0,0,0,1,1)); + LibKernel("KerParMatMulB32_ReLUN_PL_SQ8", CALL_PARALLEL, 0, "KerMatMul_PL_SQ8_T", CNN_Match(CNN_OperList(2, KOP_MATMUL_SCALE_SCALAR, KOP_MATMUL_NOBIAS_SCALE_SCALAR), CNN_OperList(1, KOP_NONE), 1, CNN_Type(1,1,4,0,1), 0,0,0,0,1,1)); + LibKernel("KerParMatMulB32_ReLUM_PL_SQ8", CALL_PARALLEL, 0, "KerMatMul_PL_SQ8_T", CNN_Match(CNN_OperList(2, KOP_MATMUL_SCALE_SCALAR, KOP_MATMUL_NOBIAS_SCALE_SCALAR), CNN_OperList(1, KOP_RELUM), 1, CNN_Type(1,1,4,0,1), 0,0,0,0,1,1)); + LibKernel("KerParMatMulB32_ReLUMN_PL_SQ8", CALL_PARALLEL, 0, "KerMatMul_PL_SQ8_T", CNN_Match(CNN_OperList(2, KOP_MATMUL_SCALE_SCALAR, KOP_MATMUL_NOBIAS_SCALE_SCALAR), CNN_OperList(1, KOP_RELUMN), 1, CNN_Type(1,1,4,0,1), 0,0,0,0,1,1)); + LibKernel("KerParMatMulB32_LeakyReLU_PL_SQ8", CALL_PARALLEL, 0, "KerMatMul_PL_SQ8_T", CNN_Match(CNN_OperList(2, 
KOP_MATMUL_SCALE_SCALAR, KOP_MATMUL_NOBIAS_SCALE_SCALAR), CNN_OperList(1, KOP_LEAKYRELU), 1, CNN_Type(1,1,4,0,1), 0,0,0,0,1,1)); + LibKernel("KerParMatMulB32_HSwish_PL_SQ8", CALL_PARALLEL, 0, "KerMatMul_PL_SQ8_T", CNN_Match(CNN_OperList(2, KOP_MATMUL_SCALE_SCALAR, KOP_MATMUL_NOBIAS_SCALE_SCALAR), CNN_OperList(1, KOP_HSWISH), 1, CNN_Type(1,1,4,0,1), 0,0,0,0,1,1)); + LibKernel("KerParMatMulB32_HSigmoid_PL_SQ8", CALL_PARALLEL, 0, "KerMatMul_PL_SQ8_T", CNN_Match(CNN_OperList(2, KOP_MATMUL_SCALE_SCALAR, KOP_MATMUL_NOBIAS_SCALE_SCALAR), CNN_OperList(1, KOP_HSIGMOID), 1, CNN_Type(1,1,4,0,1), 0,0,0,0,1,1)); + LibKernel("KerParMatMulB32_Sigmoid_PL_SQ8", CALL_PARALLEL, 0, "KerMatMul_PL_SQ8_T", CNN_Match(CNN_OperList(2, KOP_MATMUL_SCALE_SCALAR, KOP_MATMUL_NOBIAS_SCALE_SCALAR), CNN_OperList(1, KOP_SIGMOID), 1, CNN_Type(1,1,4,0,1), 0,0,0,0,1,1)); + LibKernel("KerParMatMulB32_Tanh_PL_SQ8", CALL_PARALLEL, 0, "KerMatMul_PL_SQ8_T", CNN_Match(CNN_OperList(2, KOP_MATMUL_SCALE_SCALAR, KOP_MATMUL_NOBIAS_SCALE_SCALAR), CNN_OperList(1, KOP_TANH), 1, CNN_Type(1,1,4,0,1), 0,0,0,0,1,1)); + + LibKernel("KerParMatMulTransposedB32_PL_SQ8", CALL_PARALLEL, 0, "KerMatMul_PL_SQ8_T", CNN_Match(CNN_OperList(2, KOP_MATMUL_SCALE_SCALAR_TRANSPOSED, KOP_MATMUL_NOBIAS_SCALE_SCALAR_TRANSPOSED), CNN_OperList(1, KOP_NONE), 1, CNN_Type(1,1,4,0,1), 0,0,0,0,1,1)); + LibKernel("KerParMatMulTransposedB32_ReLU_PL_SQ8", CALL_PARALLEL, 0, "KerMatMul_PL_SQ8_T", CNN_Match(CNN_OperList(2, KOP_MATMUL_SCALE_SCALAR_TRANSPOSED, KOP_MATMUL_NOBIAS_SCALE_SCALAR_TRANSPOSED), CNN_OperList(1, KOP_NONE), 1, CNN_Type(1,1,4,0,1), 0,0,0,0,1,1)); + LibKernel("KerParMatMulTransposedB32_ReLUN_PL_SQ8", CALL_PARALLEL, 0, "KerMatMul_PL_SQ8_T", CNN_Match(CNN_OperList(2, KOP_MATMUL_SCALE_SCALAR_TRANSPOSED, KOP_MATMUL_NOBIAS_SCALE_SCALAR_TRANSPOSED), CNN_OperList(1, KOP_NONE), 1, CNN_Type(1,1,4,0,1), 0,0,0,0,1,1)); + LibKernel("KerParMatMulTransposedB32_ReLUM_PL_SQ8", CALL_PARALLEL, 0, "KerMatMul_PL_SQ8_T", CNN_Match(CNN_OperList(2, 
KOP_MATMUL_SCALE_SCALAR_TRANSPOSED, KOP_MATMUL_NOBIAS_SCALE_SCALAR_TRANSPOSED), CNN_OperList(1, KOP_RELUM), 1, CNN_Type(1,1,4,0,1), 0,0,0,0,1,1)); + LibKernel("KerParMatMulTransposedB32_ReLUMN_PL_SQ8", CALL_PARALLEL, 0, "KerMatMul_PL_SQ8_T", CNN_Match(CNN_OperList(2, KOP_MATMUL_SCALE_SCALAR_TRANSPOSED, KOP_MATMUL_NOBIAS_SCALE_SCALAR_TRANSPOSED), CNN_OperList(1, KOP_RELUMN), 1, CNN_Type(1,1,4,0,1), 0,0,0,0,1,1)); + LibKernel("KerParMatMulTransposedB32_LeakyReLU_PL_SQ8", CALL_PARALLEL, 0, "KerMatMul_PL_SQ8_T", CNN_Match(CNN_OperList(2, KOP_MATMUL_SCALE_SCALAR_TRANSPOSED, KOP_MATMUL_NOBIAS_SCALE_SCALAR_TRANSPOSED), CNN_OperList(1, KOP_LEAKYRELU), 1, CNN_Type(1,1,4,0,1), 0,0,0,0,1,1)); + LibKernel("KerParMatMulTransposedB32_HSwish_PL_SQ8", CALL_PARALLEL, 0, "KerMatMul_PL_SQ8_T", CNN_Match(CNN_OperList(2, KOP_MATMUL_SCALE_SCALAR_TRANSPOSED, KOP_MATMUL_NOBIAS_SCALE_SCALAR_TRANSPOSED), CNN_OperList(1, KOP_HSWISH), 1, CNN_Type(1,1,4,0,1), 0,0,0,0,1,1)); + LibKernel("KerParMatMulTransposedB32_HSigmoid_PL_SQ8", CALL_PARALLEL, 0, "KerMatMul_PL_SQ8_T", CNN_Match(CNN_OperList(2, KOP_MATMUL_SCALE_SCALAR_TRANSPOSED, KOP_MATMUL_NOBIAS_SCALE_SCALAR_TRANSPOSED), CNN_OperList(1, KOP_HSIGMOID), 1, CNN_Type(1,1,4,0,1), 0,0,0,0,1,1)); + LibKernel("KerParMatMulTransposedB32_Sigmoid_PL_SQ8", CALL_PARALLEL, 0, "KerMatMul_PL_SQ8_T", CNN_Match(CNN_OperList(2, KOP_MATMUL_SCALE_SCALAR_TRANSPOSED, KOP_MATMUL_NOBIAS_SCALE_SCALAR_TRANSPOSED), CNN_OperList(1, KOP_SIGMOID), 1, CNN_Type(1,1,4,0,1), 0,0,0,0,1,1)); + LibKernel("KerParMatMulTransposedB32_Tanh_PL_SQ8", CALL_PARALLEL, 0, "KerMatMul_PL_SQ8_T", CNN_Match(CNN_OperList(2, KOP_MATMUL_SCALE_SCALAR_TRANSPOSED, KOP_MATMUL_NOBIAS_SCALE_SCALAR_TRANSPOSED), CNN_OperList(1, KOP_TANH), 1, CNN_Type(1,1,4,0,1), 0,0,0,0,1,1)); + + LibKernel("KerParMatMulSxSyB32_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(2, KOP_MATMUL, KOP_MATMUL_NOBIAS), CNN_OperList(1, KOP_NONE), 1, CNN_Type(1,1,4,0,1), 0,0,0,0,-1,-1)); + 
LibKernel("KerParMatMulSxSyB32_ReLU_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(2, KOP_MATMUL, KOP_MATMUL_NOBIAS), CNN_OperList(1, KOP_RELU), 1, CNN_Type(1,1,4,0,1), 0,0,0,0,-1,-1)); + LibKernel("KerParMatMulSxSyB32_ReLUN_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(2, KOP_MATMUL, KOP_MATMUL_NOBIAS), CNN_OperList(1, KOP_RELUN), 1, CNN_Type(1,1,4,0,1), 0,0,0,0,-1,-1)); + LibKernel("KerParMatMulSxSyB32_ReLUM_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(2, KOP_MATMUL, KOP_MATMUL_NOBIAS), CNN_OperList(1, KOP_RELUM), 1, CNN_Type(1,1,4,0,1), 0,0,0,0,-1,-1)); + LibKernel("KerParMatMulSxSyB32_ReLUMN_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(2, KOP_MATMUL, KOP_MATMUL_NOBIAS), CNN_OperList(1, KOP_RELUMN), 1, CNN_Type(1,1,4,0,1), 0,0,0,0,-1,-1)); + LibKernel("KerParMatMulSxSyB32_LeakyReLU_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(2, KOP_MATMUL, KOP_MATMUL_NOBIAS), CNN_OperList(1, KOP_LEAKYRELU), 1, CNN_Type(1,1,4,0,1), 0,0,0,0,-1,-1)); + LibKernel("KerParMatMulSxSyB32_HSwish_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(2, KOP_MATMUL, KOP_MATMUL_NOBIAS), CNN_OperList(1, KOP_HSWISH), 1, CNN_Type(1,1,4,0,1), 0,0,0,0,-1,-1)); + LibKernel("KerParMatMulSxSyB32_HSigmoid_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(2, KOP_MATMUL, KOP_MATMUL_NOBIAS), CNN_OperList(1, KOP_HSIGMOID), 1, CNN_Type(1,1,4,0,1), 0,0,0,0,-1,-1)); + LibKernel("KerParMatMulSxSyB32_Sigmoid_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(2, KOP_MATMUL, KOP_MATMUL_NOBIAS), CNN_OperList(1, KOP_SIGMOID), 1, CNN_Type(1,1,4,0,1), 0,0,0,0,-1,-1)); + LibKernel("KerParMatMulSxSyB32_Tanh_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(2, KOP_MATMUL, KOP_MATMUL_NOBIAS), CNN_OperList(1, KOP_TANH), 1, CNN_Type(1,1,4,0,1), 0,0,0,0,-1,-1)); /* Mat Mul based convolutions */ /* CHW In and Out tensors, [OutFeat,InFeat,Fy,Fx] weights */ - 
LibKernel("KerPar_MM_Conv1D_SQ8", CALL_PARALLEL, 0, "Ker_MM_Conv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_NONE), 1, CNN_Type(1,1,4,0,1), -1,1,1,1,-1,-1)); - LibKernel("KerPar_MM_Conv1D_ReLU_SQ8", CALL_PARALLEL, 0, "Ker_MM_Conv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_RELU), 1, CNN_Type(1,1,4,0,1), -1,1,1,1,-1,-1)); - LibKernel("KerPar_MM_Conv1D_ReLUN_SQ8", CALL_PARALLEL, 0, "Ker_MM_Conv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_RELUN), 1, CNN_Type(1,1,4,0,1), -1,1,1,1,-1,-1)); - LibKernel("KerPar_MM_Conv1D_LeakyReLU_SQ8", CALL_PARALLEL, 0, "Ker_MM_Conv_SQ8_T",CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_LEAKYRELU), 1, CNN_Type(1,1,4,0,1), -1,1,1,1,-1,-1)); - - LibKernel("KerPar_MM_Conv1D_DxDy_SQ8", CALL_PARALLEL, 0, "Ker_MM_Conv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_NONE), 1, CNN_Type(1,1,4,0,1), -1,1,-1,-1,-1,-1)); - LibKernel("KerPar_MM_Conv1D_DxDy_ReLU_SQ8", CALL_PARALLEL, 0, "Ker_MM_Conv_SQ8_T",CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_NONE), 1, CNN_Type(1,1,4,0,1), -1,1,-1,-1,-1,-1)); - - LibKernel("KerPar_MM_Conv2D_SQ8", CALL_PARALLEL, 0, "Ker_MM_Conv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_NONE), 1, CNN_Type(1,1,4,0,1), -1,-1,1,1,-1,-1)); - LibKernel("KerPar_MM_Conv2D_DxDy_SQ8", CALL_PARALLEL, 0, "Ker_MM_Conv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_NONE), 1, CNN_Type(1,1,4,0,1), -1,-1,-1,-1,-1,-1)); - LibKernel("KerPar_MM_Conv2D_DxDy_ReLU_SQ8", CALL_PARALLEL, 0, "Ker_MM_Conv_SQ8_T",CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_NONE), 1, CNN_Type(1,1,4,0,1), -1,-1,-1,-1,-1,-1)); - + LibKernel("KerPar_MM_Conv1D_SQ8", CALL_PARALLEL, 0, "Ker_MM_Conv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_NONE), 1, CNN_Type(1,1,4,0,1), -1,1,1,1,-1,-1)); + LibKernel("KerPar_MM_Conv1D_ReLU_SQ8", CALL_PARALLEL, 0, "Ker_MM_Conv_SQ8_T", 
CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_RELU), 1, CNN_Type(1,1,4,0,1), -1,1,1,1,-1,-1)); + LibKernel("KerPar_MM_Conv1D_ReLUN_SQ8", CALL_PARALLEL, 0, "Ker_MM_Conv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_RELUN), 1, CNN_Type(1,1,4,0,1), -1,1,1,1,-1,-1)); + LibKernel("KerPar_MM_Conv1D_ReLUM_SQ8", CALL_PARALLEL, 0, "Ker_MM_Conv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_RELUM), 1, CNN_Type(1,1,4,0,1), -1,1,1,1,-1,-1)); + LibKernel("KerPar_MM_Conv1D_ReLUMN_SQ8", CALL_PARALLEL, 0, "Ker_MM_Conv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_RELUMN), 1, CNN_Type(1,1,4,0,1), -1,1,1,1,-1,-1)); + LibKernel("KerPar_MM_Conv1D_LeakyReLU_SQ8", CALL_PARALLEL, 0, "Ker_MM_Conv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_LEAKYRELU), 1, CNN_Type(1,1,4,0,1), -1,1,1,1,-1,-1)); + LibKernel("KerPar_MM_Conv1D_HSwish_SQ8", CALL_PARALLEL, 0, "Ker_MM_Conv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_HSWISH), 1, CNN_Type(1,1,4,0,1), -1,1,1,1,-1,-1)); + LibKernel("KerPar_MM_Conv1D_HSigmoid_SQ8", CALL_PARALLEL, 0, "Ker_MM_Conv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_HSIGMOID), 1, CNN_Type(1,1,4,0,1), -1,1,1,1,-1,-1)); + LibKernel("KerPar_MM_Conv1D_Sigmoid_SQ8", CALL_PARALLEL, 0, "Ker_MM_Conv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_SIGMOID), 1, CNN_Type(1,1,4,0,1), -1,1,1,1,-1,-1)); + LibKernel("KerPar_MM_Conv1D_Tanh_SQ8", CALL_PARALLEL, 0, "Ker_MM_Conv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_TANH), 1, CNN_Type(1,1,4,0,1), -1,1,1,1,-1,-1)); + + LibKernel("KerPar_MM_Conv1D_DxDy_SQ8", CALL_PARALLEL, 0, "Ker_MM_Conv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_NONE), 1, CNN_Type(1,1,4,0,1), -1,1,-1,-1,-1,-1)); + LibKernel("KerPar_MM_Conv1D_DxDy_ReLU_SQ8", CALL_PARALLEL, 0, "Ker_MM_Conv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, 
KOP_RELU), 1, CNN_Type(1,1,4,0,1), -1,1,-1,-1,-1,-1)); + LibKernel("KerPar_MM_Conv1D_DxDy_ReLUN_SQ8", CALL_PARALLEL, 0, "Ker_MM_Conv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_RELUN), 1, CNN_Type(1,1,4,0,1), -1,1,-1,-1,-1,-1)); + LibKernel("KerPar_MM_Conv1D_DxDy_ReLUM_SQ8", CALL_PARALLEL, 0, "Ker_MM_Conv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_RELUM), 1, CNN_Type(1,1,4,0,1), -1,1,-1,-1,-1,-1)); + LibKernel("KerPar_MM_Conv1D_DxDy_ReLUMN_SQ8", CALL_PARALLEL, 0, "Ker_MM_Conv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_RELUMN), 1, CNN_Type(1,1,4,0,1), -1,1,-1,-1,-1,-1)); + LibKernel("KerPar_MM_Conv1D_DxDy_LeakyReLU_SQ8", CALL_PARALLEL, 0, "Ker_MM_Conv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_LEAKYRELU), 1, CNN_Type(1,1,4,0,1), -1,1,-1,-1,-1,-1)); + LibKernel("KerPar_MM_Conv1D_DxDy_HSwish_SQ8", CALL_PARALLEL, 0, "Ker_MM_Conv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_HSWISH), 1, CNN_Type(1,1,4,0,1), -1,1,-1,-1,-1,-1)); + LibKernel("KerPar_MM_Conv1D_DxDy_HSigmoid_SQ8", CALL_PARALLEL, 0, "Ker_MM_Conv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_HSIGMOID), 1, CNN_Type(1,1,4,0,1), -1,1,-1,-1,-1,-1)); + LibKernel("KerPar_MM_Conv1D_DxDy_Sigmoid_SQ8", CALL_PARALLEL, 0, "Ker_MM_Conv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_SIGMOID), 1, CNN_Type(1,1,4,0,1), -1,1,-1,-1,-1,-1)); + LibKernel("KerPar_MM_Conv1D_DxDy_Tanh_SQ8", CALL_PARALLEL, 0, "Ker_MM_Conv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_TANH), 1, CNN_Type(1,1,4,0,1), -1,1,-1,-1,-1,-1)); + + LibKernel("KerPar_MM_Conv2D_SQ8", CALL_PARALLEL, 0, "Ker_MM_Conv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_NONE), 1, CNN_Type(1,1,4,0,1), -1,-1,1,1,-1,-1)); + LibKernel("KerPar_MM_Conv2D_ReLU_SQ8", CALL_PARALLEL, 0, "Ker_MM_Conv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_RELU), 1, 
CNN_Type(1,1,4,0,1), -1,-1,1,1,-1,-1)); + LibKernel("KerPar_MM_Conv2D_ReLUN_SQ8", CALL_PARALLEL, 0, "Ker_MM_Conv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_RELUN), 1, CNN_Type(1,1,4,0,1), -1,-1,1,1,-1,-1)); + LibKernel("KerPar_MM_Conv2D_ReLUM_SQ8", CALL_PARALLEL, 0, "Ker_MM_Conv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_RELUM), 1, CNN_Type(1,1,4,0,1), -1,-1,1,1,-1,-1)); + LibKernel("KerPar_MM_Conv2D_ReLUMN_SQ8", CALL_PARALLEL, 0, "Ker_MM_Conv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_RELUMN), 1, CNN_Type(1,1,4,0,1), -1,-1,1,1,-1,-1)); + LibKernel("KerPar_MM_Conv2D_LeakyReLU_SQ8", CALL_PARALLEL, 0, "Ker_MM_Conv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_LEAKYRELU), 1, CNN_Type(1,1,4,0,1), -1,-1,1,1,-1,-1)); + LibKernel("KerPar_MM_Conv2D_HSwish_SQ8", CALL_PARALLEL, 0, "Ker_MM_Conv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_HSWISH), 1, CNN_Type(1,1,4,0,1), -1,-1,1,1,-1,-1)); + LibKernel("KerPar_MM_Conv2D_HSigmoid_SQ8", CALL_PARALLEL, 0, "Ker_MM_Conv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_HSIGMOID), 1, CNN_Type(1,1,4,0,1), -1,-1,1,1,-1,-1)); + LibKernel("KerPar_MM_Conv2D_Sigmoid_SQ8", CALL_PARALLEL, 0, "Ker_MM_Conv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_SIGMOID), 1, CNN_Type(1,1,4,0,1), -1,-1,1,1,-1,-1)); + LibKernel("KerPar_MM_Conv2D_Tanh_SQ8", CALL_PARALLEL, 0, "Ker_MM_Conv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_TANH), 1, CNN_Type(1,1,4,0,1), -1,-1,1,1,-1,-1)); + + LibKernel("KerPar_MM_Conv2D_DxDy_SQ8", CALL_PARALLEL, 0, "Ker_MM_Conv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_NONE), 1, CNN_Type(1,1,4,0,1), -1,-1,-1,-1,-1,-1)); + LibKernel("KerPar_MM_Conv2D_DxDy_ReLU_SQ8", CALL_PARALLEL, 0, "Ker_MM_Conv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_RELU), 1, CNN_Type(1,1,4,0,1), -1,-1,-1,-1,-1,-1)); + 
LibKernel("KerPar_MM_Conv2D_DxDy_ReLUN_SQ8", CALL_PARALLEL, 0, "Ker_MM_Conv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_RELUN), 1, CNN_Type(1,1,4,0,1), -1,-1,-1,-1,-1,-1)); + LibKernel("KerPar_MM_Conv2D_DxDy_ReLUM_SQ8", CALL_PARALLEL, 0, "Ker_MM_Conv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_RELUM), 1, CNN_Type(1,1,4,0,1), -1,-1,-1,-1,-1,-1)); + LibKernel("KerPar_MM_Conv2D_DxDy_ReLUMN_SQ8", CALL_PARALLEL, 0, "Ker_MM_Conv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_RELUMN), 1, CNN_Type(1,1,4,0,1), -1,-1,-1,-1,-1,-1)); + LibKernel("KerPar_MM_Conv2D_DxDy_LeakyReLU_SQ8", CALL_PARALLEL, 0, "Ker_MM_Conv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_LEAKYRELU), 1, CNN_Type(1,1,4,0,1), -1,-1,-1,-1,-1,-1)); + LibKernel("KerPar_MM_Conv2D_DxDy_HSwish_SQ8", CALL_PARALLEL, 0, "Ker_MM_Conv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_HSWISH), 1, CNN_Type(1,1,4,0,1), -1,-1,-1,-1,-1,-1)); + LibKernel("KerPar_MM_Conv2D_DxDy_HSigmoid_SQ8", CALL_PARALLEL, 0, "Ker_MM_Conv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_HSIGMOID), 1, CNN_Type(1,1,4,0,1), -1,-1,-1,-1,-1,-1)); + LibKernel("KerPar_MM_Conv2D_DxDy_Sigmoid_SQ8", CALL_PARALLEL, 0, "Ker_MM_Conv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_SIGMOID), 1, CNN_Type(1,1,4,0,1), -1,-1,-1,-1,-1,-1)); + LibKernel("KerPar_MM_Conv2D_DxDy_Tanh_SQ8", CALL_PARALLEL, 0, "Ker_MM_Conv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_TANH), 1, CNN_Type(1,1,4,0,1), -1,-1,-1,-1,-1,-1)); /* HWC In and Out tensors, [OutFeat,Fy,Fx,InFeat] weights */ - LibKernel("KerPar_MM_Conv1x1_HWC_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Conv_SQ8_T", - CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_NONE), 1, CNN_Type(1,1,4,0,1), 1,1,1,1,-1,-1)); - LibKernel("KerPar_MM_Conv1x1_ReLU_HWC_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Conv_SQ8_T", - CNN_Match(CNN_OperList(1, 
KOP_MM_CONV), CNN_OperList(1, KOP_RELU), 1, CNN_Type(1,1,4,0,1), 1,1,1,1,-1,-1)); - LibKernel("Ker_MM_Conv1x1_HWC_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Conv_SQ8_T", - CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_NONE), 0, CNN_Type(1,1,4,0,1), 1,1,1,1,-1,-1)); - LibKernel("Ker_MM_Conv1x1_ReLU_HWC_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Conv_SQ8_T", - CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_RELU), 0, CNN_Type(1,1,4,0,1), 1,1,1,1,-1,-1)); - LibKernel("KerPar_MM_Conv1D_HWC_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Conv_SQ8_T", - CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_NONE), 1, CNN_Type(1,1,4,0,1), -1,1,1,1,-1,-1)); - LibKernel("KerPar_MM_Conv1D_DxDy_HWC_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Conv_SQ8_T", - CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_NONE), 1, CNN_Type(1,1,4,0,1), -1,1,-1,-1,-1,-1)); - LibKernel("KerPar_MM_Conv2D_HWC_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Conv_SQ8_T", - CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_NONE), 1, CNN_Type(1,1,4,0,1), -1,-1,1,1,-1,-1)); - LibKernel("KerPar_MM_Conv2D_ReLU_HWC_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Conv_SQ8_T", - CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_RELU), 1, CNN_Type(1,1,4,0,1), -1,-1,1,1,-1,-1)); - LibKernel("KerPar_MM_Conv2D_DxDy_HWC_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Conv_SQ8_T", - CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_NONE), 1, CNN_Type(1,1,4,0,1), -1,-1,-1,-1,-1,-1)); - LibKernel("KerPar_MM_ConvDW2D_HWC_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Conv_SQ8_T", - CNN_Match(CNN_OperList(1, KOP_MM_CONV_DW), CNN_OperList(1, KOP_NONE), 1, CNN_Type(1,1,4,0,1), -1,-1,1,1,-1,-1)); - LibKernel("Ker_MM_Conv2D_HWC_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Conv_SQ8_T", - CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_NONE), 0, CNN_Type(1,1,4,0,1), -1,-1,1,1,-1,-1)); - LibKernel("Ker_MM_Conv2D_ReLU_HWC_SQ8", 
CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Conv_SQ8_T", - CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_RELU), 0, CNN_Type(1,1,4,0,1), -1,-1,1,1,-1,-1)); + LibKernel("KerPar_MM_Conv1x1_HWC_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Conv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_NONE), 1, CNN_Type(1,1,4,0,1), 1,1,1,1,-1,-1)); + LibKernel("KerPar_MM_Conv1x1_ReLU_HWC_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Conv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_RELU), 1, CNN_Type(1,1,4,0,1), 1,1,1,1,-1,-1)); + LibKernel("KerPar_MM_Conv1x1_ReLUN_HWC_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Conv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_RELUN), 1, CNN_Type(1,1,4,0,1), 1,1,1,1,-1,-1)); + LibKernel("KerPar_MM_Conv1x1_ReLUM_HWC_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Conv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_RELUM), 1, CNN_Type(1,1,4,0,1), 1,1,1,1,-1,-1)); + LibKernel("KerPar_MM_Conv1x1_ReLUMN_HWC_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Conv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_RELUMN), 1, CNN_Type(1,1,4,0,1), 1,1,1,1,-1,-1)); + LibKernel("KerPar_MM_Conv1x1_LeakyReLU_HWC_SQ8",CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Conv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_LEAKYRELU), 1, CNN_Type(1,1,4,0,1), 1,1,1,1,-1,-1)); + LibKernel("KerPar_MM_Conv1x1_HSwish_HWC_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Conv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_HSWISH), 1, CNN_Type(1,1,4,0,1), 1,1,1,1,-1,-1)); + LibKernel("KerPar_MM_Conv1x1_HSigmoid_HWC_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Conv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_HSIGMOID), 1, CNN_Type(1,1,4,0,1), 1,1,1,1,-1,-1)); + LibKernel("KerPar_MM_Conv1x1_Sigmoid_HWC_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Conv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, 
KOP_SIGMOID), 1, CNN_Type(1,1,4,0,1), 1,1,1,1,-1,-1)); + LibKernel("KerPar_MM_Conv1x1_Tanh_HWC_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Conv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_TANH), 1, CNN_Type(1,1,4,0,1), 1,1,1,1,-1,-1)); + + LibKernel("Ker_MM_Conv1x1_HWC_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Conv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_NONE), 0, CNN_Type(1,1,4,0,1), 1,1,1,1,-1,-1)); + LibKernel("Ker_MM_Conv1x1_ReLU_HWC_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Conv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_RELU), 0, CNN_Type(1,1,4,0,1), 1,1,1,1,-1,-1)); + LibKernel("Ker_MM_Conv1x1_ReLUN_HWC_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Conv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_RELUN), 0, CNN_Type(1,1,4,0,1), 1,1,1,1,-1,-1)); + LibKernel("Ker_MM_Conv1x1_ReLUM_HWC_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Conv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_RELUM), 0, CNN_Type(1,1,4,0,1), 1,1,1,1,-1,-1)); + LibKernel("Ker_MM_Conv1x1_ReLUMN_HWC_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Conv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_RELUMN), 0, CNN_Type(1,1,4,0,1), 1,1,1,1,-1,-1)); + LibKernel("Ker_MM_Conv1x1_LeakyReLU_HWC_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Conv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_LEAKYRELU), 0, CNN_Type(1,1,4,0,1), 1,1,1,1,-1,-1)); + LibKernel("Ker_MM_Conv1x1_HSwish_HWC_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Conv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_HSWISH), 0, CNN_Type(1,1,4,0,1), 1,1,1,1,-1,-1)); + LibKernel("Ker_MM_Conv1x1_HSigmoid_HWC_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Conv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_HSIGMOID), 0, CNN_Type(1,1,4,0,1), 1,1,1,1,-1,-1)); + LibKernel("Ker_MM_Conv1x1_Sigmoid_HWC_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, 
"Ker_MM_Conv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_SIGMOID), 0, CNN_Type(1,1,4,0,1), 1,1,1,1,-1,-1)); + LibKernel("Ker_MM_Conv1x1_Tanh_HWC_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Conv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_TANH), 0, CNN_Type(1,1,4,0,1), 1,1,1,1,-1,-1)); + + LibKernel("KerPar_MM_Conv1D_HWC_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Conv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_NONE), 1, CNN_Type(1,1,4,0,1), -1,1,1,1,-1,-1)); + LibKernel("KerPar_MM_Conv1D_ReLU_HWC_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Conv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_RELU), 1, CNN_Type(1,1,4,0,1), -1,1,1,1,-1,-1)); + LibKernel("KerPar_MM_Conv1D_ReLUN_HWC_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Conv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_RELUN), 1, CNN_Type(1,1,4,0,1), -1,1,1,1,-1,-1)); + LibKernel("KerPar_MM_Conv1D_ReLUM_HWC_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Conv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_RELUM), 1, CNN_Type(1,1,4,0,1), -1,1,1,1,-1,-1)); + LibKernel("KerPar_MM_Conv1D_ReLUMN_HWC_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Conv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_RELUMN), 1, CNN_Type(1,1,4,0,1), -1,1,1,1,-1,-1)); + LibKernel("KerPar_MM_Conv1D_LeakyReLU_HWC_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Conv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_LEAKYRELU), 1, CNN_Type(1,1,4,0,1), -1,1,1,1,-1,-1)); + LibKernel("KerPar_MM_Conv1D_HSwish_HWC_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Conv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_HSWISH), 1, CNN_Type(1,1,4,0,1), -1,1,1,1,-1,-1)); + LibKernel("KerPar_MM_Conv1D_HSigmoid_HWC_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Conv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_HSIGMOID), 1, CNN_Type(1,1,4,0,1), 
-1,1,1,1,-1,-1)); + LibKernel("KerPar_MM_Conv1D_Sigmoid_HWC_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Conv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_SIGMOID), 1, CNN_Type(1,1,4,0,1), -1,1,1,1,-1,-1)); + LibKernel("KerPar_MM_Conv1D_Tanh_HWC_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Conv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_TANH), 1, CNN_Type(1,1,4,0,1), -1,1,1,1,-1,-1)); + + LibKernel("KerPar_MM_Conv1D_DxDy_HWC_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Conv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_NONE), 1, CNN_Type(1,1,4,0,1), -1,1,-1,-1,-1,-1)); + LibKernel("KerPar_MM_Conv1D_DxDy_ReLU_HWC_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Conv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_RELU), 1, CNN_Type(1,1,4,0,1), -1,1,-1,-1,-1,-1)); + LibKernel("KerPar_MM_Conv1D_DxDy_ReLUN_HWC_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Conv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_RELUN), 1, CNN_Type(1,1,4,0,1), -1,1,-1,-1,-1,-1)); + LibKernel("KerPar_MM_Conv1D_DxDy_ReLUM_HWC_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Conv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_RELUM), 1, CNN_Type(1,1,4,0,1), -1,1,-1,-1,-1,-1)); + LibKernel("KerPar_MM_Conv1D_DxDy_ReLUMN_HWC_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Conv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_RELUMN), 1, CNN_Type(1,1,4,0,1), -1,1,-1,-1,-1,-1)); + LibKernel("KerPar_MM_Conv1D_DxDy_LeakyReLU_HWC_SQ8",CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Conv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_LEAKYRELU), 1, CNN_Type(1,1,4,0,1), -1,1,-1,-1,-1,-1)); + LibKernel("KerPar_MM_Conv1D_DxDy_HSwish_HWC_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Conv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_HSWISH), 1, CNN_Type(1,1,4,0,1), -1,1,-1,-1,-1,-1)); + LibKernel("KerPar_MM_Conv1D_DxDy_HSigmoid_HWC_SQ8", 
CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Conv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_HSIGMOID), 1, CNN_Type(1,1,4,0,1), -1,1,-1,-1,-1,-1)); + LibKernel("KerPar_MM_Conv1D_DxDy_Sigmoid_HWC_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Conv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_SIGMOID), 1, CNN_Type(1,1,4,0,1), -1,1,-1,-1,-1,-1)); + LibKernel("KerPar_MM_Conv1D_DxDy_Tanh_HWC_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Conv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_TANH), 1, CNN_Type(1,1,4,0,1), -1,1,-1,-1,-1,-1)); + + LibKernel("KerPar_MM_Conv2D_HWC_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Conv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_NONE), 1, CNN_Type(1,1,4,0,1), -1,-1,1,1,-1,-1)); + LibKernel("KerPar_MM_Conv2D_ReLU_HWC_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Conv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_RELU), 1, CNN_Type(1,1,4,0,1), -1,-1,1,1,-1,-1)); + LibKernel("KerPar_MM_Conv2D_ReLUN_HWC_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Conv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_RELUN), 1, CNN_Type(1,1,4,0,1), -1,-1,1,1,-1,-1)); + LibKernel("KerPar_MM_Conv2D_ReLUM_HWC_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Conv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_RELUM), 1, CNN_Type(1,1,4,0,1), -1,-1,1,1,-1,-1)); + LibKernel("KerPar_MM_Conv2D_ReLUMN_HWC_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Conv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_RELUMN), 1, CNN_Type(1,1,4,0,1), -1,-1,1,1,-1,-1)); + LibKernel("KerPar_MM_Conv2D_LeakyReLU_HWC_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Conv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_LEAKYRELU), 1, CNN_Type(1,1,4,0,1), -1,-1,1,1,-1,-1)); + LibKernel("KerPar_MM_Conv2D_HSwish_HWC_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Conv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MM_CONV), 
CNN_OperList(1, KOP_HSWISH), 1, CNN_Type(1,1,4,0,1), -1,-1,1,1,-1,-1)); + LibKernel("KerPar_MM_Conv2D_HSigmoid_HWC_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Conv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_HSIGMOID), 1, CNN_Type(1,1,4,0,1), -1,-1,1,1,-1,-1)); + LibKernel("KerPar_MM_Conv2D_Sigmoid_HWC_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Conv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_SIGMOID), 1, CNN_Type(1,1,4,0,1), -1,-1,1,1,-1,-1)); + LibKernel("KerPar_MM_Conv2D_Tanh_HWC_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Conv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_TANH), 1, CNN_Type(1,1,4,0,1), -1,-1,1,1,-1,-1)); + + LibKernel("KerPar_MM_Conv2D_DxDy_HWC_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Conv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_NONE), 1, CNN_Type(1,1,4,0,1), -1,-1,-1,-1,-1,-1)); + LibKernel("KerPar_MM_Conv2D_DxDy_ReLU_HWC_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Conv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_RELU), 1, CNN_Type(1,1,4,0,1), -1,-1,-1,-1,-1,-1)); + LibKernel("KerPar_MM_Conv2D_DxDy_ReLUN_HWC_SQ8",CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Conv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_RELUN), 1, CNN_Type(1,1,4,0,1), -1,-1,-1,-1,-1,-1)); + LibKernel("KerPar_MM_Conv2D_DxDy_ReLUM_HWC_SQ8",CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Conv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_RELUM), 1, CNN_Type(1,1,4,0,1), -1,-1,-1,-1,-1,-1)); + LibKernel("KerPar_MM_Conv2D_DxDy_ReLUMN_HWC_SQ8",CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Conv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_RELUMN), 1, CNN_Type(1,1,4,0,1), -1,-1,-1,-1,-1,-1)); + LibKernel("KerPar_MM_Conv2D_DxDy_LeakyReLU_HWC_SQ8",CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Conv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_LEAKYRELU), 1, CNN_Type(1,1,4,0,1), 
-1,-1,-1,-1,-1,-1)); + LibKernel("KerPar_MM_Conv2D_DxDy_HSwish_HWC_SQ8",CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Conv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_HSWISH), 1, CNN_Type(1,1,4,0,1), -1,-1,-1,-1,-1,-1)); + LibKernel("KerPar_MM_Conv2D_DxDy_HSigmoid_HWC_SQ8",CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Conv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_HSIGMOID), 1, CNN_Type(1,1,4,0,1), -1,-1,-1,-1,-1,-1)); + LibKernel("KerPar_MM_Conv2D_DxDy_Sigmoid_HWC_SQ8",CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Conv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_SIGMOID), 1, CNN_Type(1,1,4,0,1), -1,-1,-1,-1,-1,-1)); + LibKernel("KerPar_MM_Conv2D_DxDy_Tanh_HWC_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Conv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_TANH), 1, CNN_Type(1,1,4,0,1), -1,-1,-1,-1,-1,-1)); + + LibKernel("Ker_MM_Conv2D_HWC_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Conv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_NONE), 0, CNN_Type(1,1,4,0,1), -1,-1,1,1,-1,-1)); + LibKernel("Ker_MM_Conv2D_ReLU_HWC_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Conv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_RELU), 0, CNN_Type(1,1,4,0,1), -1,-1,1,1,-1,-1)); + LibKernel("Ker_MM_Conv2D_ReLUN_HWC_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Conv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_RELUN), 0, CNN_Type(1,1,4,0,1), -1,-1,1,1,-1,-1)); + LibKernel("Ker_MM_Conv2D_ReLUM_HWC_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Conv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_RELUM), 0, CNN_Type(1,1,4,0,1), -1,-1,1,1,-1,-1)); + LibKernel("Ker_MM_Conv2D_ReLUMN_HWC_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Conv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_RELUMN), 0, CNN_Type(1,1,4,0,1), -1,-1,1,1,-1,-1)); + LibKernel("Ker_MM_Conv2D_LeakyReLU_HWC_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, 
"Ker_MM_Conv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_LEAKYRELU), 0, CNN_Type(1,1,4,0,1), -1,-1,1,1,-1,-1)); + LibKernel("Ker_MM_Conv2D_HSwish_HWC_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Conv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_HSWISH), 0, CNN_Type(1,1,4,0,1), -1,-1,1,1,-1,-1)); + LibKernel("Ker_MM_Conv2D_HSigmoid_HWC_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Conv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_HSIGMOID), 0, CNN_Type(1,1,4,0,1), -1,-1,1,1,-1,-1)); + LibKernel("Ker_MM_Conv2D_Sigmoid_HWC_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Conv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_SIGMOID), 0, CNN_Type(1,1,4,0,1), -1,-1,1,1,-1,-1)); + LibKernel("Ker_MM_Conv2D_Tanh_HWC_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Conv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_TANH), 0, CNN_Type(1,1,4,0,1), -1,-1,1,1,-1,-1)); /* Matrix Multiplication for 1x1 convolutions with channel scaling and optional ReLU or ReLUN activation, optimized form when In1 fits entirely into shared L1 */ /* 8b Bias */ - LibKernel("KerParMatMulB8_SF_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL_SM1), CNN_OperList(1, KOP_NONE), 1, CNN_Type(1,1,1,0,1), 0,0,0,0,1,1)); - LibKernel("KerParMatMulB8_ReLU_SF_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL_SM1), CNN_OperList(1, KOP_RELU), 1, CNN_Type(1,1,1,0,1), 0,0,0,0,1,1)); - LibKernel("KerParMatMulB8_ReLUN_SF_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL_SM1), CNN_OperList(1, KOP_RELUN), 1, CNN_Type(1,1,1,0,1), 0,0,0,0,1,1)); + LibKernel("KerParMatMulB8_SF_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL_SM1), CNN_OperList(1, KOP_NONE), 1, CNN_Type(1,1,1,0,1), 0,0,0,0,1,1)); + LibKernel("KerParMatMulB8_ReLU_SF_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, 
KOP_MATMUL_SM1), CNN_OperList(1, KOP_RELU), 1, CNN_Type(1,1,1,0,1), 0,0,0,0,1,1)); + LibKernel("KerParMatMulB8_ReLUN_SF_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL_SM1), CNN_OperList(1, KOP_RELUN), 1, CNN_Type(1,1,1,0,1), 0,0,0,0,1,1)); + LibKernel("KerParMatMulB8_ReLUM_SF_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL_SM1), CNN_OperList(1, KOP_RELUM), 1, CNN_Type(1,1,1,0,1), 0,0,0,0,1,1)); + LibKernel("KerParMatMulB8_ReLUMN_SF_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL_SM1), CNN_OperList(1, KOP_RELUMN), 1, CNN_Type(1,1,1,0,1), 0,0,0,0,1,1)); + LibKernel("KerParMatMulB8_LeakyReLU_SF_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL_SM1), CNN_OperList(1, KOP_LEAKYRELU), 1, CNN_Type(1,1,1,0,1), 0,0,0,0,1,1)); + LibKernel("KerParMatMulB8_HSwish_SF_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL_SM1), CNN_OperList(1, KOP_HSWISH), 1, CNN_Type(1,1,1,0,1), 0,0,0,0,1,1)); + LibKernel("KerParMatMulB8_HSigmoid_SF_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL_SM1), CNN_OperList(1, KOP_HSIGMOID), 1, CNN_Type(1,1,1,0,1), 0,0,0,0,1,1)); + LibKernel("KerParMatMulB8_Sigmoid_SF_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL_SM1), CNN_OperList(1, KOP_SIGMOID), 1, CNN_Type(1,1,1,0,1), 0,0,0,0,1,1)); + LibKernel("KerParMatMulB8_Tanh_SF_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL_SM1), CNN_OperList(1, KOP_TANH), 1, CNN_Type(1,1,1,0,1), 0,0,0,0,1,1)); /* 16b Bias */ - LibKernel("KerParMatMulB16_SF_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL_SM1), CNN_OperList(1, KOP_NONE), 1, CNN_Type(1,1,2,0,1), 0,0,0,0,1,1)); - LibKernel("KerParMatMulB16_ReLU_SF_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL_SM1), CNN_OperList(1, KOP_RELU), 1, CNN_Type(1,1,2,0,1), 
0,0,0,0,1,1)); - LibKernel("KerParMatMulB16_ReLUN_SF_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL_SM1), CNN_OperList(1, KOP_RELUN), 1, CNN_Type(1,1,2,0,1), 0,0,0,0,1,1)); + LibKernel("KerParMatMulB16_SF_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL_SM1), CNN_OperList(1, KOP_NONE), 1, CNN_Type(1,1,2,0,1), 0,0,0,0,1,1)); + LibKernel("KerParMatMulB16_ReLU_SF_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL_SM1), CNN_OperList(1, KOP_RELU), 1, CNN_Type(1,1,2,0,1), 0,0,0,0,1,1)); + LibKernel("KerParMatMulB16_ReLUN_SF_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL_SM1), CNN_OperList(1, KOP_RELUN), 1, CNN_Type(1,1,2,0,1), 0,0,0,0,1,1)); + LibKernel("KerParMatMulB16_ReLUM_SF_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL_SM1), CNN_OperList(1, KOP_RELUM), 1, CNN_Type(1,1,2,0,1), 0,0,0,0,1,1)); + LibKernel("KerParMatMulB16_ReLUMN_SF_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL_SM1), CNN_OperList(1, KOP_RELUMN), 1, CNN_Type(1,1,2,0,1), 0,0,0,0,1,1)); + LibKernel("KerParMatMulB16_LeakyReLU_SF_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL_SM1), CNN_OperList(1, KOP_LEAKYRELU), 1, CNN_Type(1,1,2,0,1), 0,0,0,0,1,1)); + LibKernel("KerParMatMulB16_HSwish_SF_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL_SM1), CNN_OperList(1, KOP_HSWISH), 1, CNN_Type(1,1,2,0,1), 0,0,0,0,1,1)); + LibKernel("KerParMatMulB16_HSigmoid_SF_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL_SM1), CNN_OperList(1, KOP_HSIGMOID), 1, CNN_Type(1,1,2,0,1), 0,0,0,0,1,1)); + LibKernel("KerParMatMulB16_Sigmoid_SF_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL_SM1), CNN_OperList(1, KOP_SIGMOID), 1, CNN_Type(1,1,2,0,1), 0,0,0,0,1,1)); + LibKernel("KerParMatMulB16_Tanh_SF_SQ8", CALL_PARALLEL, 0, 
"KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL_SM1), CNN_OperList(1, KOP_TANH), 1, CNN_Type(1,1,2,0,1), 0,0,0,0,1,1)); /* 32b Bias */ - LibKernel("KerParMatMulB32_SF_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL_SM1), CNN_OperList(1, KOP_NONE), 1, CNN_Type(1,1,4,0,1), 0,0,0,0,1,1)); - //LibKernel("KerParMatMulB32_ReLU_SF_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL_SM1), CNN_OperList(1, KOP_RELU), 1, CNN_Type(1,1,4,0,1), 0,0,0,0,1,1)); - LibKernel("KerParMatMulB32_2x4_ReLU_SF_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL_SM1), CNN_OperList(1, KOP_RELU), 1, CNN_Type(1,1,4,0,1), 0,0,0,0,1,1)); - LibKernel("KerParMatMulB32_ReLUN_SF_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL_SM1), CNN_OperList(1, KOP_RELUN), 1, CNN_Type(1,1,4,0,1), 0,0,0,0,1,1)); - + LibKernel("KerParMatMulB32_SF_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL_SM1), CNN_OperList(1, KOP_NONE), 1, CNN_Type(1,1,4,0,1), 0,0,0,0,1,1)); + LibKernel("KerParMatMulB32_ReLU_SF_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL_SM1), CNN_OperList(1, KOP_RELU), 1, CNN_Type(1,1,4,0,1), 0,0,0,0,1,1)); + LibKernel("KerParMatMulB32_ReLUN_SF_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL_SM1), CNN_OperList(1, KOP_RELUN), 1, CNN_Type(1,1,4,0,1), 0,0,0,0,1,1)); + LibKernel("KerParMatMulB32_ReLUM_SF_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL_SM1), CNN_OperList(1, KOP_RELUM), 1, CNN_Type(1,1,4,0,1), 0,0,0,0,1,1)); + LibKernel("KerParMatMulB32_ReLUMN_SF_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL_SM1), CNN_OperList(1, KOP_RELUMN), 1, CNN_Type(1,1,4,0,1), 0,0,0,0,1,1)); + LibKernel("KerParMatMulB32_LeakyReLU_SF_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL_SM1), CNN_OperList(1, 
KOP_LEAKYRELU), 1, CNN_Type(1,1,4,0,1), 0,0,0,0,1,1)); + LibKernel("KerParMatMulB32_HSwish_SF_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL_SM1), CNN_OperList(1, KOP_HSWISH), 1, CNN_Type(1,1,4,0,1), 0,0,0,0,1,1)); + LibKernel("KerParMatMulB32_HSigmoid_SF_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL_SM1), CNN_OperList(1, KOP_HSIGMOID), 1, CNN_Type(1,1,4,0,1), 0,0,0,0,1,1)); + LibKernel("KerParMatMulB32_Sigmoid_SF_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL_SM1), CNN_OperList(1, KOP_SIGMOID), 1, CNN_Type(1,1,4,0,1), 0,0,0,0,1,1)); + LibKernel("KerParMatMulB32_Tanh_SF_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL_SM1), CNN_OperList(1, KOP_TANH), 1, CNN_Type(1,1,4,0,1), 0,0,0,0,1,1)); /* Matrix by vector multiplication with tensor centric scaling and optional activation */ - LibKernel("KerParMatVectMul_SQ8", CALL_PARALLEL, 0, "KerMat3_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATVECTMUL), CNN_OperList(1, KOP_NONE), 1, CNN_Type(1,1,1,0,1), 0,0,0,0,1,1)); - LibKernel("KerParMatVectMul_ReLU_SQ8", CALL_PARALLEL, 0, "KerMat3_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATVECTMUL), CNN_OperList(1, KOP_RELU), 1, CNN_Type(1,1,1,0,1), 0,0,0,0,1,1)); - LibKernel("KerParMatVectMul_ReLUN_SQ8", CALL_PARALLEL, 0, "KerMat3_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATVECTMUL), CNN_OperList(1, KOP_RELUN), 1, CNN_Type(1,1,1,0,1), 0,0,0,0,1,1)); - LibKernel("KerParMatVectMul_HSigmoid_SQ8", CALL_PARALLEL, 0, "KerMat3_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATVECTMUL), CNN_OperList(1, KOP_HSIGMOID), 1, CNN_Type(1,1,1,0,1), 0,0,0,0,1,1)); - LibKernel("KerParMatVectMul_HSwish_SQ8", CALL_PARALLEL, 0, "KerMat3_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATVECTMUL), CNN_OperList(1, KOP_HSWISH), 1, CNN_Type(1,1,1,0,1), 0,0,0,0,1,1)); - LibKernel("KerParMatVectMul_LeakyReLU_SQ8", CALL_PARALLEL, 0, "KerMat3_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATVECTMUL), CNN_OperList(1, 
KOP_LEAKYRELU), 1, CNN_Type(1,1,1,0,1), 0,0,0,0,1,1)); + LibKernel("KerParMatVectMul_SQ8", CALL_PARALLEL, 0, "KerMat3_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATVECTMUL), CNN_OperList(1, KOP_NONE), 1, CNN_Type(1,1,1,0,1), 0,0,0,0,1,1)); + LibKernel("KerParMatVectMul_ReLU_SQ8", CALL_PARALLEL, 0, "KerMat3_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATVECTMUL), CNN_OperList(1, KOP_RELU), 1, CNN_Type(1,1,1,0,1), 0,0,0,0,1,1)); + LibKernel("KerParMatVectMul_ReLUN_SQ8", CALL_PARALLEL, 0, "KerMat3_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATVECTMUL), CNN_OperList(1, KOP_RELUN), 1, CNN_Type(1,1,1,0,1), 0,0,0,0,1,1)); + LibKernel("KerParMatVectMul_ReLUM_SF_SQ8", CALL_PARALLEL, 0, "KerMat3_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATVECTMUL), CNN_OperList(1, KOP_RELUM), 1, CNN_Type(1,1,1,0,1), 0,0,0,0,1,1)); + LibKernel("KerParMatVectMul_ReLUMN_SF_SQ8", CALL_PARALLEL, 0, "KerMat3_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATVECTMUL), CNN_OperList(1, KOP_RELUMN), 1, CNN_Type(1,1,1,0,1), 0,0,0,0,1,1)); + LibKernel("KerParMatVectMul_LeakyReLU_SF_SQ8", CALL_PARALLEL, 0, "KerMat3_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATVECTMUL), CNN_OperList(1, KOP_LEAKYRELU), 1, CNN_Type(1,1,1,0,1), 0,0,0,0,1,1)); + LibKernel("KerParMatVectMul_HSwish_SF_SQ8", CALL_PARALLEL, 0, "KerMat3_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATVECTMUL), CNN_OperList(1, KOP_HSWISH), 1, CNN_Type(1,1,1,0,1), 0,0,0,0,1,1)); + LibKernel("KerParMatVectMul_HSigmoid_SF_SQ8", CALL_PARALLEL, 0, "KerMat3_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATVECTMUL), CNN_OperList(1, KOP_HSIGMOID), 1, CNN_Type(1,1,1,0,1), 0,0,0,0,1,1)); + LibKernel("KerParMatVectMul_Sigmoid_SF_SQ8", CALL_PARALLEL, 0, "KerMat3_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATVECTMUL), CNN_OperList(1, KOP_SIGMOID), 1, CNN_Type(1,1,1,0,1), 0,0,0,0,1,1)); + LibKernel("KerParMatVectMul_Tanh_SF_SQ8", CALL_PARALLEL, 0, "KerMat3_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATVECTMUL), CNN_OperList(1, KOP_TANH), 1, CNN_Type(1,1,1,0,1), 0,0,0,0,1,1)); /* SoftMax, pre scaling */ 
LibKernel("KerParSoftMax_SQ8", CALL_PARALLEL, 0, "KerSoftMax_SQ8_T", CNN_Match(CNN_OperList(1, KOP_SOFTMAX), 0, -1, CNN_Type(1,0,0,0,2), 0,0,0,0,0,0)); @@ -950,7 +1141,18 @@ void LoadCNN_SQ8_Library() LibKernel("KerParReduct_CC_LeakyReLU_HWC_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT), CNN_OperList(1, KOP_LEAKYRELU), 1, CNN_Type(4,0,0,0,1), 0,0,0,0,0,0)); LibKernel("KerParReduct_CC_Sigmoid_HWC_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT), CNN_OperList(1, KOP_SIGMOID), 1, CNN_Type(4,0,0,0,1), 0,0,0,0,0,0)); LibKernel("KerParReduct_CC_Tanh_HWC_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT), CNN_OperList(1, KOP_TANH), 1, CNN_Type(4,0,0,0,1), 0,0,0,0,0,0)); - + + LibKernel("KerParReduct_CC_HWC_USQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT), CNN_OperList(1, KOP_NONE), 1, CNN_Type(4,0,0,0,-1), 0,0,0,0,0,0)); + LibKernel("KerParReduct_CC_ReLU_HWC_USQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT), CNN_OperList(1, KOP_RELU), 1, CNN_Type(4,0,0,0,-1), 0,0,0,0,0,0)); + LibKernel("KerParReduct_CC_ReLUN_HWC_USQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT), CNN_OperList(1, KOP_RELUN), 1, CNN_Type(4,0,0,0,-1), 0,0,0,0,0,0)); + LibKernel("KerParReduct_CC_ReLUM_HWC_USQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT), CNN_OperList(1, KOP_RELUM), 1, CNN_Type(4,0,0,0,-1), 0,0,0,0,0,0)); + LibKernel("KerParReduct_CC_ReLUMN_HWC_USQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT), CNN_OperList(1, KOP_RELUMN), 1, CNN_Type(4,0,0,0,-1), 0,0,0,0,0,0)); + LibKernel("KerParReduct_CC_HSigmoid_HWC_USQ8", CALL_PARALLEL|CALL_HWC_KER, 0, 
"KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT), CNN_OperList(1, KOP_HSIGMOID), 1, CNN_Type(4,0,0,0,-1), 0,0,0,0,0,0)); + LibKernel("KerParReduct_CC_HSwish_HWC_USQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT), CNN_OperList(1, KOP_HSWISH), 1, CNN_Type(4,0,0,0,-1), 0,0,0,0,0,0)); + LibKernel("KerParReduct_CC_LeakyReLU_HWC_USQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT), CNN_OperList(1, KOP_LEAKYRELU), 1, CNN_Type(4,0,0,0,-1), 0,0,0,0,0,0)); + LibKernel("KerParReduct_CC_Sigmoid_HWC_USQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT), CNN_OperList(1, KOP_SIGMOID), 1, CNN_Type(4,0,0,0,-1), 0,0,0,0,0,0)); + LibKernel("KerParReduct_CC_Tanh_HWC_USQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT), CNN_OperList(1, KOP_TANH), 1, CNN_Type(4,0,0,0,-1), 0,0,0,0,0,0)); + LibKernel("KerParReduct_CC_HWC_SQ16", CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT), CNN_OperList(1, KOP_NONE), 1, CNN_Type(4,0,0,0,2), 0,0,0,0,0,0)); LibKernel("KerParReduct_CC_ReLU_HWC_SQ16", CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT), CNN_OperList(1, KOP_RELU), 1, CNN_Type(4,0,0,0,2), 0,0,0,0,0,0)); LibKernel("KerParReduct_CC_ReLUN_HWC_SQ16", CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT), CNN_OperList(1, KOP_RELUN), 1, CNN_Type(4,0,0,0,2), 0,0,0,0,0,0)); @@ -997,20 +1199,29 @@ void LoadCNN_SQ8_Library() LibKernel("KerReduct_CC_NoScale_Tanh_SQ16", CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_NOSCALE), CNN_OperList(1, KOP_TANH), 1, CNN_Type(4,0,0,0,2), 0,0,0,0,0,0)); /* Unsigned */ - LibKernel("KerReduct_CC_NoScale_USQ8", CALL_PARALLEL|CALL_HWC_KER, 0, 
"KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_NOSCALE),CNN_OperList(1, KOP_NONE), 1, CNN_Type(4,0,0,0,-1), 0,0,0,0,0,0)); - LibKernel("KerReduct_CC_NoScale_ReLU_USQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_NOSCALE),CNN_OperList(1, KOP_RELU), 1, CNN_Type(4,0,0,0,-1), 0,0,0,0,0,0)); - LibKernel("KerReduct_CC_NoScale_ReLUN_USQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_NOSCALE),CNN_OperList(1, KOP_RELUN), 1, CNN_Type(4,0,0,0,-1), 0,0,0,0,0,0)); - LibKernel("KerReduct_CC_NoScale_ReLUM_USQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_NOSCALE),CNN_OperList(1, KOP_RELUM), 1, CNN_Type(4,0,0,0,-1), 0,0,0,0,0,0)); - LibKernel("KerReduct_CC_NoScale_ReLUMN_USQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_NOSCALE),CNN_OperList(1, KOP_RELUMN),1, CNN_Type(4,0,0,0,-1), 0,0,0,0,0,0)); - - LibKernel("KerReduct_CC_NoScale_USQ16", CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_NOSCALE),CNN_OperList(1, KOP_NONE), 1, CNN_Type(4,0,0,0,-2), 0,0,0,0,0,0)); - LibKernel("KerReduct_CC_NoScale_ReLU_USQ16", CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_NOSCALE),CNN_OperList(1, KOP_RELU), 1, CNN_Type(4,0,0,0,-2), 0,0,0,0,0,0)); - LibKernel("KerReduct_CC_NoScale_ReLUN_USQ16", CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_NOSCALE),CNN_OperList(1, KOP_RELUN), 1, CNN_Type(4,0,0,0,-2), 0,0,0,0,0,0)); - LibKernel("KerReduct_CC_NoScale_ReLUM_USQ16", CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_NOSCALE),CNN_OperList(1, KOP_RELUM), 1, CNN_Type(4,0,0,0,-2), 0,0,0,0,0,0)); - LibKernel("KerReduct_CC_NoScale_ReLUMN_USQ16", CALL_PARALLEL|CALL_HWC_KER, 0, 
"KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_NOSCALE),CNN_OperList(1, KOP_RELUMN),1, CNN_Type(4,0,0,0,-2), 0,0,0,0,0,0)); + LibKernel("KerReduct_CC_NoScale_USQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_NOSCALE), CNN_OperList(1, KOP_NONE), 1, CNN_Type(4,0,0,0,-1), 0,0,0,0,0,0)); + LibKernel("KerReduct_CC_NoScale_ReLU_USQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_NOSCALE), CNN_OperList(1, KOP_RELU), 1, CNN_Type(4,0,0,0,-1), 0,0,0,0,0,0)); + LibKernel("KerReduct_CC_NoScale_ReLUN_USQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_NOSCALE), CNN_OperList(1, KOP_RELUN), 1, CNN_Type(4,0,0,0,-1), 0,0,0,0,0,0)); + LibKernel("KerReduct_CC_NoScale_ReLUM_USQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_NOSCALE), CNN_OperList(1, KOP_RELUM), 1, CNN_Type(4,0,0,0,-1), 0,0,0,0,0,0)); + LibKernel("KerReduct_CC_NoScale_ReLUMN_USQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_NOSCALE), CNN_OperList(1, KOP_RELUMN), 1, CNN_Type(4,0,0,0,-1), 0,0,0,0,0,0)); + LibKernel("KerReduct_CC_NoScale_HSigmoid_USQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_NOSCALE), CNN_OperList(1, KOP_HSIGMOID), 1, CNN_Type(4,0,0,0,-1), 0,0,0,0,0,0)); + LibKernel("KerReduct_CC_NoScale_HSwish_USQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_NOSCALE), CNN_OperList(1, KOP_HSWISH), 1, CNN_Type(4,0,0,0,-1), 0,0,0,0,0,0)); + LibKernel("KerReduct_CC_NoScale_LeakyReLU_USQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_NOSCALE), CNN_OperList(1, KOP_LEAKYRELU), 1, CNN_Type(4,0,0,0,-1), 0,0,0,0,0,0)); + LibKernel("KerReduct_CC_NoScale_Sigmoid_USQ8", 
CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_NOSCALE), CNN_OperList(1, KOP_SIGMOID), 1, CNN_Type(4,0,0,0,-1), 0,0,0,0,0,0)); + LibKernel("KerReduct_CC_NoScale_Tanh_USQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_NOSCALE), CNN_OperList(1, KOP_TANH), 1, CNN_Type(4,0,0,0,-1), 0,0,0,0,0,0)); + + LibKernel("KerReduct_CC_NoScale_USQ16", CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_NOSCALE), CNN_OperList(1, KOP_NONE), 1, CNN_Type(4,0,0,0,-2), 0,0,0,0,0,0)); + LibKernel("KerReduct_CC_NoScale_ReLU_USQ16", CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_NOSCALE), CNN_OperList(1, KOP_RELU), 1, CNN_Type(4,0,0,0,-2), 0,0,0,0,0,0)); + LibKernel("KerReduct_CC_NoScale_ReLUN_USQ16", CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_NOSCALE), CNN_OperList(1, KOP_RELUN), 1, CNN_Type(4,0,0,0,-2), 0,0,0,0,0,0)); + LibKernel("KerReduct_CC_NoScale_ReLUM_USQ16", CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_NOSCALE), CNN_OperList(1, KOP_RELUM), 1, CNN_Type(4,0,0,0,-2), 0,0,0,0,0,0)); + LibKernel("KerReduct_CC_NoScale_ReLUMN_USQ16", CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_NOSCALE), CNN_OperList(1, KOP_RELUMN), 1, CNN_Type(4,0,0,0,-2), 0,0,0,0,0,0)); + LibKernel("KerReduct_CC_NoScale_HSigmoid_USQ16", CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_NOSCALE), CNN_OperList(1, KOP_HSIGMOID), 1, CNN_Type(4,0,0,0,-2), 0,0,0,0,0,0)); + LibKernel("KerReduct_CC_NoScale_HSwish_USQ16", CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_NOSCALE), CNN_OperList(1, KOP_HSWISH), 1, CNN_Type(4,0,0,0,-2), 0,0,0,0,0,0)); + 
LibKernel("KerReduct_CC_NoScale_LeakyReLU_USQ16", CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_NOSCALE), CNN_OperList(1, KOP_LEAKYRELU), 1, CNN_Type(4,0,0,0,-2), 0,0,0,0,0,0)); + LibKernel("KerReduct_CC_NoScale_Sigmoid_USQ16", CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_NOSCALE), CNN_OperList(1, KOP_SIGMOID), 1, CNN_Type(4,0,0,0,-2), 0,0,0,0,0,0)); + LibKernel("KerReduct_CC_NoScale_Tanh_USQ16", CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_NOSCALE), CNN_OperList(1, KOP_TANH), 1, CNN_Type(4,0,0,0,-2), 0,0,0,0,0,0)); /* Activations with tensor centric scaling */ - LibKernel("Ker_Scale_SQ8", CALL_PARALLEL, 0, "KerActivation_SQ8_T", CNN_Match(CNN_OperList(1, KOP_SCALE), 0, 0, CNN_Type(1,0,0,0,1), 0,0,0,0,0,0)); LibKernel("Ker_ActNone_SQ8", CALL_PARALLEL, 0, "KerActivation_SQ8_T", CNN_Match(CNN_OperList(1, KOP_ACT_NONE), 0, 0, CNN_Type(1,0,0,0,1), 0,0,0,0,0,0)); LibKernel("Ker_ReLU_SQ8", CALL_PARALLEL, 0, "KerActivation_SQ8_T", CNN_Match(CNN_OperList(1, KOP_RELU), 0, 0, CNN_Type(1,0,0,0,1), 0,0,0,0,0,0)); LibKernel("Ker_ReLUN_SQ8", CALL_PARALLEL, 0, "KerActivation_SQ8_T", CNN_Match(CNN_OperList(1, KOP_RELUN), 0, 0, CNN_Type(1,0,0,0,1), 0,0,0,0,0,0)); @@ -1022,17 +1233,6 @@ void LoadCNN_SQ8_Library() LibKernel("Ker_Sigmoid_SQ8", CALL_PARALLEL, 0, "KerActivation_SQ8_T", CNN_Match(CNN_OperList(1, KOP_SIGMOID), 0, 0, CNN_Type(1,0,0,0,1), 0,0,0,0,0,0)); LibKernel("Ker_Tanh_SQ8", CALL_PARALLEL, 0, "KerActivation_SQ8_T", CNN_Match(CNN_OperList(1, KOP_TANH), 0, 0, CNN_Type(1,0,0,0,1), 0,0,0,0,0,0)); - LibKernel("Ker_ActNone_ScaleIn_SQ8", CALL_PARALLEL, 0, "KerActivation_SQ8_T", CNN_Match(CNN_OperList(1, KOP_ACT_NONE_IN_SCALE), 0, 0, CNN_Type(1,0,0,0,1), 0,0,0,0,0,0)); - LibKernel("Ker_ReLU_ScaleIn_SQ8", CALL_PARALLEL, 0, "KerActivation_SQ8_T", CNN_Match(CNN_OperList(1, KOP_RELU_IN_SCALE), 0, 0, CNN_Type(1,0,0,0,1), 
0,0,0,0,0,0)); - LibKernel("Ker_ReLUN_ScaleIn_SQ8", CALL_PARALLEL, 0, "KerActivation_SQ8_T", CNN_Match(CNN_OperList(1, KOP_RELUN_IN_SCALE), 0, 0, CNN_Type(1,0,0,0,1), 0,0,0,0,0,0)); - LibKernel("Ker_ReLUM_ScaleIn_SQ8", CALL_PARALLEL, 0, "KerActivation_SQ8_T", CNN_Match(CNN_OperList(1, KOP_RELUM_IN_SCALE), 0, 0, CNN_Type(1,0,0,0,1), 0,0,0,0,0,0)); - LibKernel("Ker_ReLUMN_ScaleIn_SQ8", CALL_PARALLEL, 0, "KerActivation_SQ8_T", CNN_Match(CNN_OperList(1, KOP_RELUMN_IN_SCALE), 0, 0, CNN_Type(1,0,0,0,1), 0,0,0,0,0,0)); - LibKernel("Ker_HSigmoid_ScaleIn_SQ8", CALL_PARALLEL, 0, "KerActivation_SQ8_T", CNN_Match(CNN_OperList(1, KOP_HSIGMOID_IN_SCALE), 0, 0, CNN_Type(1,0,0,0,1), 0,0,0,0,0,0)); - LibKernel("Ker_HSwish_ScaleIn_SQ8", CALL_PARALLEL, 0, "KerActivation_SQ8_T", CNN_Match(CNN_OperList(1, KOP_HSWISH_IN_SCALE), 0, 0, CNN_Type(1,0,0,0,1), 0,0,0,0,0,0)); - LibKernel("Ker_LeakyReLU_ScaleIn_SQ8", CALL_PARALLEL, 0, "KerActivation_SQ8_T", CNN_Match(CNN_OperList(1, KOP_LEAKYRELU_IN_SCALE),0, 0, CNN_Type(1,0,0,0,1), 0,0,0,0,0,0)); - LibKernel("Ker_Sigmoid_ScaleIn_SQ8", CALL_PARALLEL, 0, "KerActivation_SQ8_T", CNN_Match(CNN_OperList(1, KOP_SIGMOID_IN_SCALE), 0, 0, CNN_Type(1,0,0,0,1), 0,0,0,0,0,0)); - LibKernel("Ker_Tanh_ScaleIn_SQ8", CALL_PARALLEL, 0, "KerActivation_SQ8_T", CNN_Match(CNN_OperList(1, KOP_TANH_IN_SCALE), 0, 0, CNN_Type(1,0,0,0,1), 0,0,0,0,0,0)); - /* Pooling (Max or Avg) with tensor centric scaling and optional ReLU or ReLUN activation */ LibKernel("KerPool2x2Stride2_SQ8", CALL_PARALLEL, 0, "KerPool_SQ8_T", CNN_Match(CNN_OperList(2, KOP_MAXPOOL, KOP_AVGPOOL), CNN_OperList(1, KOP_NONE), 0, CNN_Type(1,0,0,0,1), 2,2,1,1,2,2)); @@ -1100,7 +1300,7 @@ void LoadCNN_SQ8_Library() Spy: Pooling filter stride y dimension PoolPad: 0: No padding, 1: Zero padding - ActOper: Optional activation function: KOP_NONE, KOP_RELU, KOP_RELUN, KOP_HSWISH, KOP_HSIGMOID, KOP_LEAKYRELU + ActOper: Optional activation function: KOP_NONE, KOP_RELU, KOP_RELUN, KOP_HSWISH, KOP_HSIGMOID, 
KOP_LEAKYRELU, KOP_SIGMOID, KOP_TANH Signature: Name(In, Filter, Bias, Out, Scale, ScaleN, Infos) @@ -1167,7 +1367,7 @@ static Kernel_T *CNN_MM_ConvolutionPoolAct_SQ8_Internal( } if (HWC && Fcy==1 && Fcx==1 && Scy==1 && Scx==1 && Dcy==1 && Dcx==1) - return CNN_MatMulAct_SQ8_Internal(Name, Ctrl, Bias_DataSize, Scale_DataSize, InFeat, Height*Width, OutFeat, InFeat, 0,0,0,0, KOP_MATMUL_TRANSPOSED, ActOper, 0); + return CNN_MatMulAct_SQ8_Internal(Name, Ctrl, Bias_DataSize, Scale_DataSize, 1, InFeat, Height*Width, OutFeat, InFeat, 0,0,0,0, KOP_MATMUL_TRANSPOSED, ActOper, 0); if (ParFeatConv == 2 && HWC && Fcy>1 && (InFeat < 8)) ParFeatConv = 0; @@ -1194,7 +1394,7 @@ static Kernel_T *CNN_MM_ConvolutionPoolAct_SQ8_Internal( GenTilingError("CNN_MM_ConvolutionPoolAct_SQ8 Kernel: %s, ConvOper, expecting KOP_CONV, KOP_CONV_DW", Name); if (!(PoolOper == KOP_NONE || PoolOper == KOP_MAXPOOL || PoolOper == KOP_AVGPOOL)) GenTilingError("CNN_MM_ConvolutionPoolAct_SQ8 Kernel: %s, PoolOper, expecting KOP_NONE, KOP_MAXPOOL or KOP_AVGPOOL", Name); - if (!(ActOper == KOP_NONE || ActOper == KOP_RELU || ActOper == KOP_RELUN || ActOper == KOP_RELUM || ActOper == KOP_RELUMN || ActOper == KOP_HSIGMOID || ActOper == KOP_HSWISH || ActOper == KOP_LEAKYRELU)) + if (!(ActOper == KOP_NONE || ActOper == KOP_RELU || ActOper == KOP_RELUN || ActOper == KOP_RELUM || ActOper == KOP_RELUMN || ActOper == KOP_HSIGMOID || ActOper == KOP_HSWISH || ActOper == KOP_LEAKYRELU || ActOper == KOP_SIGMOID || ActOper == KOP_TANH)) GenTilingError("CNN_MM_ConvolutionPoolAct_SQ8 Kernel: %s, ActOper, expecting KOP_NONE, KOP_RELU, KOP_RELUN, KOP_HSIGMOID, KOP_TANH, KOP_HSWISH or KOP_LEAKYRELU", Name); if (DWConv && (InFeat != OutFeat)) GenTilingError("CNN_ConvolutionPoolAct_NE16 Kernel: %s, Depth wise convolution requested with InFeat:%d != OutFeat:%d", Name, InFeat, OutFeat); @@ -1255,7 +1455,7 @@ static Kernel_T *CNN_MM_ConvolutionPoolAct_SQ8_Internal( &NeedFpx, &NeedFpy, &NeedDpx, &NeedDpy, &NeedSpx, &NeedSpy, 0); else 
if (ActOper) StandAloneAct = 0; else if (PoolOper==KOP_AVGPOOL && ActOper==KOP_NONE && HWC) { - StandAloneAct = 1; ActOper = KOP_SCALE; + StandAloneAct = 1; ActOper = KOP_ACT_NONE; } if (PoolKerName==0) GenTilingError("CNN_MM_ConvolutionPoolAct_SQ8 Kernel: %s, Can't find a matching Pooling %s basic kernel", Name, ActOper?"with linear rectification":""); } @@ -1754,7 +1954,7 @@ static Kernel_T *CNN_HWC_DWConvolutionPoolAct_SQ8_Internal( Spy: Pooling filter stride y dimension PoolPad: 0: No padding, 1: Zero padding - ActOper: Optional activation function: KOP_NONE, KOP_RELU, KOP_RELUN, KOP_HSWISH, KOP_HSIGMOID, KOP_LEAKYRELU, KOP_SIGMOID + ActOper: Optional activation function: KOP_NONE, KOP_RELU, KOP_RELUN, KOP_HSWISH, KOP_HSIGMOID, KOP_LEAKYRELU, KOP_SIGMOID, KOP_TANH Signature: Name(In, Filter, Bias, Out, Scale, ScaleN, Infos) @@ -1844,10 +2044,6 @@ Kernel_T *CNN_ConvolutionPoolAct_SQ8_Internal( if (Ok!=0) return Ok; if (Log) printf("No solution found for im2col scheme, reverting to standard implementation\n"); } - if (Fcx==1 && Fcy==1 && Scx==1 && Scy==1 && Dcx==1 && Dcy==1 && Height==1 && Width==1) { - printf("This is a pointwise on 1x1 input --> Mapping to CNN_Linear_NE16\n"); - return CNN_LinearAct_SQ8_Internal(Name, Ctrl, Bias_DataSize, Scale_DataSize, InFeat, OutFeat, KOP_LINEAR, ActOper); - } if (PoolOper==KOP_NONE) { Fpx=1; Dpx=1; Spx=1; Fpy=1; Dpy=1; Spy=1; @@ -1974,7 +2170,7 @@ Kernel_T *CNN_ConvolutionPoolAct_SQ8_Internal( if (Ok) return Ok; } if (Log) printf("Mapping this convolution to matrix multiplication\n"); - Kernel_T *Ok = CNN_MatMulAct_SQ8_Internal(Name, 0, Bias_DataSize, Scale_DataSize, InFeat, OutFeat, Width*Height, InFeat, Width, Height, Scx, Scy, KOP_MATMUL, ActOper, 1); + Kernel_T *Ok = CNN_MatMulAct_SQ8_Internal(Name, 0, Bias_DataSize, Scale_DataSize, 1, InFeat, OutFeat, Width*Height, InFeat, Width, Height, Scx, Scy, KOP_MATMUL, ActOper, 1); AT_SetKernelCtrl(AT_KERNEL_NOSOLUTION_ERROR, AT_OPT_ON); if (Ok) return Ok; if (Log) 
printf("Mapping this convolution to matrix multiplication FAILED, reverting to standard implementation\n"); @@ -2172,7 +2368,7 @@ Kernel_T *CNN_ConvolutionPoolAct_SQ8_Internal( Spy: Pooling filter stride y dimension PoolPad: 0: No padding, 1: Zero padding - ActOper: Optional activation function: KOP_NONE, KOP_RELU, KOP_RELUN, KOP_HSWISH, KOP_HSIGMOID, KOP_LEAKYRELU, KOP_SIGMOID + ActOper: Optional activation function: KOP_NONE, KOP_RELU, KOP_RELUN, KOP_HSWISH, KOP_HSIGMOID, KOP_LEAKYRELU, KOP_SIGMOID, KOP_TANH Signature: Name(In, Filter, Bias, Out, Scale, ScaleN, Infos) @@ -2227,7 +2423,7 @@ int CNN_GroupedConvolutionPoolAct_SQ8( GenTilingError("CNN_GroupedConvolutionPoolAct_SQ8: Kernel: %s, ConvOper, expecting KOP_NONE, KOP_CONV or KOP_CONV_DW", Name); if (!(PoolOper == KOP_NONE || PoolOper == KOP_MAXPOOL || PoolOper == KOP_AVGPOOL)) GenTilingError("CNN_GroupedConvolutionPoolAct_SQ8: Kernel: %s, PoolOper, expecting KOP_NONE, KOP_MAXPOOL or KOP_AVGPOOL", Name); - if (!(ActOper == KOP_NONE || ActOper == KOP_RELU || ActOper == KOP_RELUN || ActOper == KOP_RELUM || ActOper == KOP_RELUMN || ActOper == KOP_HSIGMOID || ActOper == KOP_HSWISH || ActOper == KOP_LEAKYRELU)) + if (!(ActOper == KOP_NONE || ActOper == KOP_RELU || ActOper == KOP_RELUN || ActOper == KOP_RELUM || ActOper == KOP_RELUMN || ActOper == KOP_HSIGMOID || ActOper == KOP_HSWISH || ActOper == KOP_LEAKYRELU || ActOper == KOP_SIGMOID || ActOper == KOP_TANH)) GenTilingError("CNN_GroupedConvolutionPoolAct_SQ8: Kernel: %s, ActOper, expecting KOP_NONE, KOP_RELU, KOP_RELUN, KOP_HSIGMOID, KOP_TANH, KOP_HSWISH or KOP_LEAKYRELU", Name); CNN_LayerOutputDim(Width, Height, ConvOper, Fcx, Fcy, Dcx, Dcy, Scx, Scy, ConvPad, PoolOper, Fpx, Fpy, Dpx, Dpy, Spx, Spy, PoolPad, &Wc, &Hc, &Wo, &Ho, 0, 0, 0, 0); @@ -2309,7 +2505,7 @@ int CNN_GroupedConvolutionPoolAct_SQ8( Spx: Pooling stride, x dimension Spy: Pooling stride, y dimension - ActOper: Optional activation function: KOP_NONE, KOP_RELU, KOP_RELUN, KOP_HSWISH, 
KOP_HSIGMOID, KOP_LEAKYRELU, KOP_SIGMOID + ActOper: Optional activation function: KOP_NONE, KOP_RELU, KOP_RELUN, KOP_HSWISH, KOP_HSIGMOID, KOP_LEAKYRELU, KOP_SIGMOID, KOP_TANH Signature: Name(In, Out, Infos) @@ -2367,7 +2563,7 @@ Kernel_T * CNN_PoolAct_SQ8_Internal( if (!(PoolOper == KOP_MAXPOOL || PoolOper == KOP_AVGPOOL)) GenTilingError("CNN_Pool_SQ8 Kernel: %s, PoolOper, expecting KOP_MAXPOOL or KOP_AVGPOOL", Name); - if (!(ActOper == KOP_NONE || ActOper == KOP_RELU || ActOper == KOP_RELUN || ActOper == KOP_RELUM || ActOper == KOP_RELUMN || ActOper == KOP_HSIGMOID || ActOper == KOP_HSWISH || ActOper == KOP_LEAKYRELU)) + if (!(ActOper == KOP_NONE || ActOper == KOP_RELU || ActOper == KOP_RELUN || ActOper == KOP_RELUM || ActOper == KOP_RELUMN || ActOper == KOP_HSIGMOID || ActOper == KOP_HSWISH || ActOper == KOP_LEAKYRELU || ActOper == KOP_SIGMOID || ActOper == KOP_TANH)) GenTilingError("CNN_Pool_SQ8 Kernel: %s, ActOper, expecting KOP_NONE, KOP_RELU, KOP_RELUN, KOP_HSIGMOID, KOP_TANH, KOP_HSWISH or KOP_LEAKYRELU", Name); /* Set Kernel characteristics */ @@ -2514,7 +2710,6 @@ Kernel_T * CNN_PoolAct_SQ8_Internal( Height: Number of lines of a given feature map ActOper: KOP_ACT_NONE, KOP_RELU, KOP_RELUN, KOP_HSWISH, KOP_HSIGMOID, KOP_LEAKYRELU, KOP_SIGMOID - KOP_ACT_NONE_IN_SCALE, KOP_RELU_IN_SCALE, KOP_RELUN_IN_SCALE, KOP_HSIGMOID_IN_SCALE, KOP_HSWISH_IN_SCALE, KOP_LEAKYRELU_IN_SCALE, KOP_SIGMOID_IN_SCALE Signature: Name(In, Out, Infos) @@ -2544,9 +2739,8 @@ Kernel_T * CNN_Act_SQ8_Internal( int StandAloneAct = (ActOper!=KOP_NONE); int Log=1; - if (!(ActOper == KOP_ACT_NONE || ActOper == KOP_RELU || ActOper == KOP_RELUN || ActOper == KOP_RELUM || ActOper == KOP_RELUMN || ActOper == KOP_HSIGMOID || ActOper == KOP_HSWISH || ActOper == KOP_LEAKYRELU || ActOper == KOP_SIGMOID || ActOper == KOP_TANH || - ActOper == KOP_ACT_NONE_IN_SCALE || ActOper == KOP_RELU_IN_SCALE || ActOper == KOP_RELUN_IN_SCALE || ActOper == KOP_RELUM_IN_SCALE || ActOper == KOP_RELUMN_IN_SCALE || 
ActOper == KOP_HSIGMOID_IN_SCALE || ActOper == KOP_HSWISH_IN_SCALE || ActOper == KOP_LEAKYRELU_IN_SCALE || ActOper == KOP_SIGMOID_IN_SCALE || ActOper == KOP_TANH_IN_SCALE)) - GenTilingError("CNN_Act_SQ8 Kernel: %s, ActOper, expecting KOP_ACT_NONE, KOP_RELU, KOP_RELUN, KOP_RELUM, KOP_RELUMN, KOP_HSIGMOID, KOP_TANH, KOP_HSWISH or KOP_LEAKYRELU", Name); + if (!(ActOper == KOP_ACT_NONE || ActOper == KOP_RELU || ActOper == KOP_RELUN || ActOper == KOP_RELUM || ActOper == KOP_RELUMN || ActOper == KOP_HSIGMOID || ActOper == KOP_HSWISH || ActOper == KOP_LEAKYRELU || ActOper == KOP_SIGMOID || ActOper == KOP_TANH)) + GenTilingError("CNN_Act_SQ8 Kernel: %s, ActOper, expecting KOP_ACT_NONE, KOP_RELU, KOP_RELUN, KOP_RELUM, KOP_RELUMN, KOP_HSIGMOID, KOP_TANH, KOP_HSWISH, KOP_SIGMOID or KOP_LEAKYRELU", Name); ActKerName = CNN_FindMatchingKernel(ActOper, KOP_NONE, 0, 1, 0, 0, 0, 1, 0,0,0,0,0,0, 0,0,0,0,0,0, 0); if (ActKerName==0) GenTilingError("CNN_Act_SQ8 Kernel: %s, Can't find a matching Activation basic kernel", Name); @@ -2617,7 +2811,7 @@ Kernel_T * CNN_Act_SQ8_Internal( PoolOper: KOP_GLOBAL_MAXPOOL or KOP_GLOBAL_AVGPOOL - ActOper: Optional activation function: KOP_RELU, KOP_RELUN, KOP_HSWISH, KOP_HSIGMOID, KOP_LEAKYRELU, KOP_SIGMOID + ActOper: Optional activation function: KOP_RELU, KOP_RELUN, KOP_HSWISH, KOP_HSIGMOID, KOP_LEAKYRELU, KOP_SIGMOID, KOP_TANH Signature: Name(In, Out, Infos) @@ -2656,7 +2850,7 @@ static Kernel_T *CNN_GlobalPoolAct_SQ8_Interal( if (!(PoolOper == KOP_GLOBAL_MAXPOOL || PoolOper == KOP_GLOBAL_AVGPOOL || PoolOper == KOP_GLOBAL_SUMPOOL)) GenTilingError("CNN_GlobalPoolAct_SQ8 Kernel: %s, PoolOper should be KOP_GLOBAL_MAXPOOL or KOP_GLOBAL_AVGPOOL or KOP_GLOBAL_SUMPOOL", Name); - if (!(ActOper == KOP_NONE || ActOper == KOP_RELU || ActOper == KOP_RELUN || ActOper == KOP_RELUM || ActOper == KOP_RELUMN || ActOper == KOP_HSIGMOID || ActOper == KOP_HSWISH || ActOper == KOP_LEAKYRELU)) + if (!(ActOper == KOP_NONE || ActOper == KOP_RELU || ActOper == KOP_RELUN 
|| ActOper == KOP_RELUM || ActOper == KOP_RELUMN || ActOper == KOP_HSIGMOID || ActOper == KOP_HSWISH || ActOper == KOP_LEAKYRELU || ActOper == KOP_SIGMOID || ActOper == KOP_TANH)) GenTilingError("CNN_GlobalPoolAct_SQ8 Kernel: %s, ActOper, expecting KOP_NONE, KOP_RELU, KOP_RELUN, KOP_HSIGMOID, KOP_TANH, KOP_HSWISH or KOP_LEAKYRELU", Name); PoolKerName = CNN_FindMatchingKernelAttr(PoolOper, ActOper, ParFeat, KerLayout, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); @@ -2835,7 +3029,7 @@ static Kernel_T *CNN_GlobalPoolAct_SQ8_Interal( OutDim: Number of outputs LinearOper KOP_LINEAR - ActOper Optional activation function: KOP_NONE, KOP_RELU, KOP_RELUN, KOP_HSWISH, KOP_HSIGMOID, KOP_LEAKYRELU, KOP_SIGMOID + ActOper Optional activation function: KOP_NONE, KOP_RELU, KOP_RELUN, KOP_HSWISH, KOP_HSIGMOID, KOP_LEAKYRELU, KOP_SIGMOID, KOP_TANH Signature: Name(In, Filter, Bias, Out, Scale, ScaleN, Infos) @@ -3380,24 +3574,11 @@ int CNN_MatAddPaddedAct_SQ8( char *TopName = NULL, *BotName = NULL, *BodyName = NULL; Ok = Ok && CNN_MatAddAct_SQ8(BodyName = AppendNames(Name, "Body"), Ctrl, FeatBody, Width, Height, AddMatOper, ActOper); - KernelOper_T PadActOper; - switch (ActOper) { - case KOP_NONE: PadActOper = KOP_ACT_NONE_IN_SCALE; break; - case KOP_RELU: PadActOper = KOP_RELU_IN_SCALE; break; - case KOP_RELUN: PadActOper = KOP_RELUN_IN_SCALE; break; - case KOP_RELUM: PadActOper = KOP_RELUM_IN_SCALE; break; - case KOP_RELUMN: PadActOper = KOP_RELUMN_IN_SCALE; break; - case KOP_HSIGMOID: PadActOper = KOP_HSIGMOID_IN_SCALE; break; - case KOP_HSWISH: PadActOper = KOP_HSWISH_IN_SCALE; break; - case KOP_LEAKYRELU: PadActOper = KOP_LEAKYRELU_IN_SCALE; break; - case KOP_SIGMOID: PadActOper = KOP_SIGMOID_IN_SCALE; break; - case KOP_TANH: PadActOper = KOP_TANH_IN_SCALE; break; - } if (PadTop) { - Ok = Ok && CNN_Act_SQ8(TopName = AppendNames(Name, "PadTop"), Ctrl, PadTop, Width, Height, PadActOper); + Ok = Ok && CNN_Act_SQ8(TopName = AppendNames(Name, "PadTop"), Ctrl, PadTop, Width, 
Height, ActOper); } if (PadBot) { - Ok = Ok && CNN_Act_SQ8(BotName = AppendNames(Name, "PadBot"), Ctrl, PadBot, Width, Height, PadActOper); + Ok = Ok && CNN_Act_SQ8(BotName = AppendNames(Name, "PadBot"), Ctrl, PadBot, Width, Height, ActOper); } CloseKernelGroupNoMerge(); if (Ok==0) return 0; @@ -3482,7 +3663,7 @@ int CNN_MatAddPaddedAct_SQ8( Height: Height of a In1 MatOper: KOP_MATVECTMUL - ActOper: Optional activation function: KOP_NONE, KOP_RELU, KOP_RELUN, KOP_HSWISH, KOP_HSIGMOID, KOP_LEAKYRELU, KOP_SIGMOID + ActOper: Optional activation function: KOP_NONE, KOP_RELU, KOP_RELUN, KOP_HSWISH, KOP_HSIGMOID, KOP_LEAKYRELU, KOP_SIGMOID, KOP_TANH Signature: Name(In1, In2, Out, Infos) @@ -3692,6 +3873,7 @@ Kernel_T *CNN_MatMulAct_SQ8_Internal( int Bias_DataSize, int Scale_DataSize, + int NBatches, int ColM1, int LineM1, int ColM2, @@ -3754,9 +3936,9 @@ Kernel_T *CNN_MatMulAct_SQ8_Internal( } ColO = ((Width+Scx-1)/Scx) * ((Height+Scy-1)/Scy); - LayerOp += (int64_t) ColM1*ColO*LineM1; - LayerBandwidth += (int64_t) LineM1*(ColM1*ColM2*(1+1)); - LayerBandwidth += (int64_t) LineM1*ColM2*1; + LayerOp += (int64_t) NBatches*ColM1*ColO*LineM1; + LayerBandwidth += (int64_t) NBatches*LineM1*(ColM1*ColM2*(1+1)); + LayerBandwidth += (int64_t) NBatches*LineM1*ColM2*1; LayerBandwidth += (int64_t) LineM1*Bias_DataSize; if (Scy!=1) ConsT0 = Width*Scy; else ConsT0 = 4; @@ -3779,6 +3961,8 @@ Kernel_T *CNN_MatMulAct_SQ8_Internal( } /* First try buffering small objects */ Kernel = UserKernel(Name, + (NBatches>1)? + KernelIterSpace(3, IterFixedSpace(D0, NBatches), IterTiledSpace(T1), IterTiledSpace(T0)): KernelIterSpace(2, IterTiledSpace(T1), IterTiledSpace(T0)), TILE_HOR, CArgs(7, @@ -3823,24 +4007,44 @@ Kernel_T *CNN_MatMulAct_SQ8_Internal( ), ColFirst? 
KerArgs(8, - !Transposed?KerArg("KerBuff",KerArgSpace(1, T1), O_BUFF|O_NTILED, Nbuff*ColM1, 1, 1, 0, 0, 0, 0):AT_NO_KER_ARG, - KerArg("In1", KerArgSpace(1, T0), O_IN|O_DB|O_CONST, ColM1, LineM1, 1, 0, OBJ_CONSTRAINTS_PAD_REM, 8, "In1"), - KerArg("In2", KerArgSpace(1, T1), O_IN|O_DB, ColM2, LineM2, 1, 0, ObjCons|OBJ_CONSTRAINTS_PAD_REM, ConsT0, "In2"), - !NoBias?KerArg("Bias", KerArgSpace(1, TA), O_BUFF|O_IN|O_CONST, 1, SAxis, Bias_DataSize, 0, OBJ_CONSTRAINTS_PAD_REM, 0, "Bias"):AT_NO_KER_ARG, - KerArg("Out", KerArgSpace(1, T1), O_OUT|O_DB, ColO, LineO, 1, 0, OBJ_CONSTRAINTS_TILE_VER|OBJ_CONSTRAINTS_PAD_REM, 0, "Out"), - !ScaleScalar?KerArg("Scale", KerArgSpace(1, TA), O_BUFF|O_IN|O_CONST, 1, SAxis, 1, 0, 0, 0, "Scale"):AT_NO_KER_ARG, - !ScaleScalar?KerArg("ScaleN", KerArgSpace(1, TA), O_BUFF|O_IN|O_CONST, 1, SAxis, 1, 0, 0, 0, "ScaleN"):AT_NO_KER_ARG, - KerArg("Infos", KerArgSpace(1, T1), O_IN|O_BUFF|O_NTILED, 1, 1, AT_INF_DIM*1, 0, 0, 0, "Infos") + !Transposed? + KerArg("KerBuff",KerArgSpace(1, T1), O_BUFF|O_NTILED, Nbuff*ColM1, 1, 1, 0, 0, 0, 0):AT_NO_KER_ARG, + (NBatches>1)? + KerArg("In1", KerArgSpace(2,D0,T0), O_IN|O_DB|O_CONST, ColM1, LineM1, 1, 0, OBJ_CONSTRAINTS_PAD_REM, 8, "In1"): + KerArg("In1", KerArgSpace(1, T0), O_IN|O_DB|O_CONST, ColM1, LineM1, 1, 0, OBJ_CONSTRAINTS_PAD_REM, 8, "In1"), + (NBatches>1)? + KerArg("In2", KerArgSpace(2,D0,T1), O_IN|O_DB, ColM2, LineM2, 1, 0, ObjCons|OBJ_CONSTRAINTS_PAD_REM, ConsT0, "In2"): + KerArg("In2", KerArgSpace(1, T1), O_IN|O_DB, ColM2, LineM2, 1, 0, ObjCons|OBJ_CONSTRAINTS_PAD_REM, ConsT0, "In2"), + !NoBias? + KerArg("Bias", KerArgSpace(1, TA), O_BUFF|O_IN|O_CONST, 1, SAxis, Bias_DataSize, 0, OBJ_CONSTRAINTS_PAD_REM, 0, "Bias"):AT_NO_KER_ARG, + (NBatches>1)? 
+ KerArg("Out", KerArgSpace(2,D0,T1), O_OUT|O_DB, ColO, LineO, 1, 0, OBJ_CONSTRAINTS_TILE_VER|OBJ_CONSTRAINTS_PAD_REM, 0, "Out"): + KerArg("Out", KerArgSpace(1, T1), O_OUT|O_DB, ColO, LineO, 1, 0, OBJ_CONSTRAINTS_TILE_VER|OBJ_CONSTRAINTS_PAD_REM, 0, "Out"), + !ScaleScalar? + KerArg("Scale", KerArgSpace(1, TA), O_BUFF|O_IN|O_CONST, 1, SAxis, 1, 0, 0, 0, "Scale"):AT_NO_KER_ARG, + !ScaleScalar? + KerArg("ScaleN", KerArgSpace(1, TA), O_BUFF|O_IN|O_CONST, 1, SAxis, 1, 0, 0, 0, "ScaleN"):AT_NO_KER_ARG, + KerArg("Infos", KerArgSpace(1, T1), O_IN|O_BUFF|O_NTILED, 1, 1, AT_INF_DIM*1, 0, 0, 0, "Infos") ): KerArgs(8, - !Transposed?KerArg("KerBuff",KerArgSpace(1, T0), O_BUFF|O_NTILED, Nbuff*ColM1, 1, 1, 0, 0, 0, 0):AT_NO_KER_ARG, - KerArg("In1", KerArgSpace(1, T1), O_IN|O_DB|O_CONST, ColM1, LineM1, 1, 0, OBJ_CONSTRAINTS_PAD_REM, 8, "In1"), - KerArg("In2", KerArgSpace(1, T0), O_IN|O_DB, ColM2, LineM2, 1, 0, ObjCons|OBJ_CONSTRAINTS_PAD_REM, ConsT0, "In2"), - !NoBias?KerArg("Bias", KerArgSpace(1, TB), O_BUFF|O_IN|O_CONST, 1, SAxis, Bias_DataSize, 0, OBJ_CONSTRAINTS_PAD_REM, 0, "Bias"):AT_NO_KER_ARG, - KerArg("Out", KerArgSpace(1, T1), O_OUT|O_DB, ColO, LineO, 1, 0, OBJ_CONSTRAINTS_PAD_REM, 0, "Out"), - !ScaleScalar?KerArg("Scale", KerArgSpace(1, TB), O_BUFF|O_IN|O_CONST, 1, SAxis, 1, 0, 0, 0, "Scale"):AT_NO_KER_ARG, - !ScaleScalar?KerArg("ScaleN", KerArgSpace(1, TB), O_BUFF|O_IN|O_CONST, 1, SAxis, 1, 0, 0, 0, "ScaleN"):AT_NO_KER_ARG, - KerArg("Infos", KerArgSpace(1, T0), O_IN|O_BUFF|O_NTILED, 1, 1, AT_INF_DIM*1, 0, 0, 0, "Infos") + !Transposed? + KerArg("KerBuff",KerArgSpace(1, T0), O_BUFF|O_NTILED, Nbuff*ColM1, 1, 1, 0, 0, 0, 0):AT_NO_KER_ARG, + (NBatches>1)? + KerArg("In1", KerArgSpace(2,D0,T1), O_IN|O_DB|O_CONST, ColM1, LineM1, 1, 0, OBJ_CONSTRAINTS_PAD_REM, 8, "In1"): + KerArg("In1", KerArgSpace(1, T1), O_IN|O_DB|O_CONST, ColM1, LineM1, 1, 0, OBJ_CONSTRAINTS_PAD_REM, 8, "In1"), + (NBatches>1)? 
+ KerArg("In2", KerArgSpace(2,D0,T0), O_IN|O_DB, ColM2, LineM2, 1, 0, ObjCons|OBJ_CONSTRAINTS_PAD_REM, ConsT0, "In2"): + KerArg("In2", KerArgSpace(1, T0), O_IN|O_DB, ColM2, LineM2, 1, 0, ObjCons|OBJ_CONSTRAINTS_PAD_REM, ConsT0, "In2"), + !NoBias? + KerArg("Bias", KerArgSpace(1, TB), O_BUFF|O_IN|O_CONST, 1, SAxis, Bias_DataSize, 0, OBJ_CONSTRAINTS_PAD_REM, 0, "Bias"):AT_NO_KER_ARG, + (NBatches>1)? + KerArg("Out", KerArgSpace(2,D0,T1), O_OUT|O_DB, ColO, LineO, 1, 0, OBJ_CONSTRAINTS_PAD_REM, 0, "Out"): + KerArg("Out", KerArgSpace(1, T1), O_OUT|O_DB, ColO, LineO, 1, 0, OBJ_CONSTRAINTS_PAD_REM, 0, "Out"), + !ScaleScalar? + KerArg("Scale", KerArgSpace(1, TB), O_BUFF|O_IN|O_CONST, 1, SAxis, 1, 0, 0, 0, "Scale"):AT_NO_KER_ARG, + !ScaleScalar? + KerArg("ScaleN", KerArgSpace(1, TB), O_BUFF|O_IN|O_CONST, 1, SAxis, 1, 0, 0, 0, "ScaleN"):AT_NO_KER_ARG, + KerArg("Infos", KerArgSpace(1, T0), O_IN|O_BUFF|O_NTILED, 1, 1, AT_INF_DIM*1, 0, 0, 0, "Infos") ) ); if (Kernel) { @@ -4263,6 +4467,10 @@ int CNN_ConvolutionPoolAct_SQ8( KernelOper_T ActOper ) { + if (Fcx==1 && Fcy==1 && Height==1 && Width==1) { + printf("This is a pointwise on 1x1 input --> Mapping to CNN_Linear_NE16\n"); + return CNN_LinearAct_SQ8(Name, Ctrl, Bias_DataSize, Scale_DataSize, InFeat, OutFeat, KOP_LINEAR, ActOper); + } Kernel_T *Ker = 0, *Sol1 = 0, *Sol2 = 0; float K = 0.9; Tile_Orientation_T TileOrientation = TILE_HOR; @@ -4340,7 +4548,11 @@ int CNN_MatAddAct_SQ8(char *Name, CNN_GenControl_T *Ctrl, int Feat, int Width, i } int CNN_MatMulAct_SQ8(char *Name, CNN_GenControl_T *Ctrl, int Bias_DataSize, int Scale_DataSize, int ColM1, int LineM1, int ColM2, int LineM2, int Width, int Height, int Scx, int Scy, KernelOper_T MatMulOper, KernelOper_T ActOper) { - return (CNN_MatMulAct_SQ8_Internal(Name, Ctrl, Bias_DataSize, Scale_DataSize, ColM1, LineM1, ColM2, LineM2, Width, Height, Scx, Scy, MatMulOper, ActOper, 1)!=0); + return (CNN_MatMulAct_SQ8_Internal(Name, Ctrl, Bias_DataSize, Scale_DataSize, 1, ColM1, LineM1, 
ColM2, LineM2, Width, Height, Scx, Scy, MatMulOper, ActOper, 1)!=0); +} + +int CNN_BatchedMatMulAct_SQ8(char *Name, CNN_GenControl_T *Ctrl, int Bias_DataSize, int Scale_DataSize, int NBatches, int ColM1, int LineM1, int ColM2, int LineM2, int Width, int Height, int Scx, int Scy, KernelOper_T MatMulOper, KernelOper_T ActOper) { + return (CNN_MatMulAct_SQ8_Internal(Name, Ctrl, Bias_DataSize, Scale_DataSize, NBatches, ColM1, LineM1, ColM2, LineM2, Width, Height, Scx, Scy, MatMulOper, ActOper, 1)!=0); } int CNN_MatMulSmallM1Act_SQ8(char *Name, CNN_GenControl_T *Ctrl, int Bias_DataSize, int Scale_DataSize, int ColM1, int LineM1, int ColM2, int LineM2, int Width, int Height, int Scx, int Scy, KernelOper_T MatMulOper, KernelOper_T ActOper) { diff --git a/tools/autotiler_v3/CNN_Generators_SQ8/CNN_Generators_SQ8.h b/tools/autotiler_v3/CNN_Generators_SQ8/CNN_Generators_SQ8.h index ba223b55d..ab919a417 100644 --- a/tools/autotiler_v3/CNN_Generators_SQ8/CNN_Generators_SQ8.h +++ b/tools/autotiler_v3/CNN_Generators_SQ8/CNN_Generators_SQ8.h @@ -698,6 +698,29 @@ int CNN_MatMulAct_SQ8( KernelOper_T ActOper ); +int CNN_BatchedMatMulAct_SQ8( + char *Name, + + CNN_GenControl_T *Ctrl, + + int Bias_DataSize, + int Scale_DataSize, + + int NBatches, + int ColM1, + int LineM1, + int ColM2, + int LineM2, + + int Width, + int Height, + int Scx, + int Scy, + + KernelOper_T MatMulOper, + KernelOper_T ActOper + ); + Kernel_T *CNN_MatMulAct_SQ8_Internal( char *Name, @@ -706,6 +729,7 @@ Kernel_T *CNN_MatMulAct_SQ8_Internal( int Bias_DataSize, int Scale_DataSize, + int NBatches, int ColM1, int LineM1, int ColM2, diff --git a/tools/autotiler_v3/CNN_Generators_fp16/CNN_Generators_fp16.c b/tools/autotiler_v3/CNN_Generators_fp16/CNN_Generators_fp16.c index 659e2e34e..9f912d010 100644 --- a/tools/autotiler_v3/CNN_Generators_fp16/CNN_Generators_fp16.c +++ b/tools/autotiler_v3/CNN_Generators_fp16/CNN_Generators_fp16.c @@ -1279,7 +1279,7 @@ Kernel_T *CNN_ConvolutionPoolAct_fp16_Internal( if (Ok!=0) return 
Ok; if (Log) printf("Mapping this convolution to im2col scheme failed, reverting to standard implementation\n"); } - if (Fcx==1 && Fcy==1 && Scx==1 && Scy==1 && Dcx==1 && Dcy==1 && Height==1 && Width==1) { + if (Fcx==1 && Fcy==1 && Height==1 && Width==1) { printf("This is a pointwise on 1x1 input --> Mapping to CNN_Linear_NE16\n"); return CNN_LinearAct_fp16_Internal(Name, Ctrl, InFeat, OutFeat, KOP_LINEAR, ActOper); } diff --git a/tools/autotiler_v3/CNN_Libraries/SSD_BasicKernels.c b/tools/autotiler_v3/CNN_Libraries/SSD_BasicKernels.c index cdce41f3b..b1a395965 100644 --- a/tools/autotiler_v3/CNN_Libraries/SSD_BasicKernels.c +++ b/tools/autotiler_v3/CNN_Libraries/SSD_BasicKernels.c @@ -15,13 +15,6 @@ #include "CNN_BasicKernels.h" #include "SSD_BasicKernels.h" -#ifndef __EMUL__ - #define CL_CRITICAL_ENTER() pi_cl_team_critical_enter() - #define CL_CRITICAL_EXIT() pi_cl_team_critical_exit() -#else - #define CL_CRITICAL_ENTER() - #define CL_CRITICAL_EXIT() -#endif // optimize the division to find the chunk size // equivalent to ceil(KerArg0->W/rt_nb_pe()) inline static unsigned int __attribute__((always_inline)) ChunkSize(unsigned int X) @@ -137,14 +130,14 @@ void Ker_SSD_Decoder(Ker_SSD_Decoder_ArgT *KerArg0 ) boxes_idx = i*num_coords; for (unsigned int j=1; j score_th){ - CL_CRITICAL_ENTER(); + gap_cl_critical_enter(); bbn = KerArg0->bbox_idx[0]++; // printf("Core: %d\tbbox_idx:%d\n", CoreId, KerArg0->bbox_idx[0]); if(bbn > n_max_bb){ // check if we reched n_max_bb - CL_CRITICAL_EXIT(); + gap_cl_critical_exit(); goto exit_double_for; } - CL_CRITICAL_EXIT(); + gap_cl_critical_exit(); // Valid BBOX --> alive bbox[bbn].alive = 1; //Save score always as a Q7 diff --git a/tools/autotiler_v3/CNN_Libraries_SQ8/CNN_Activation_HWC_SQ8.c b/tools/autotiler_v3/CNN_Libraries_SQ8/CNN_Activation_HWC_SQ8.c index df3b6affc..5c5a14412 100644 --- a/tools/autotiler_v3/CNN_Libraries_SQ8/CNN_Activation_HWC_SQ8.c +++ b/tools/autotiler_v3/CNN_Libraries_SQ8/CNN_Activation_HWC_SQ8.c @@ -13,6 
+13,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ +#if 0 #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wextra" @@ -1060,261 +1061,6 @@ void KerParReduct_CC_Tanh_HWC_USQ16(KerConvLinReduct_SQ8_T *Arg) gap_waitbarrier(0); } - -void KerParReduct_CC_HWC_USQ8(KerConvLinReduct_SQ8_T *Arg) - -{ - int Feat = Arg->Feat; - int W = Arg->W; - int H = Arg->H; - unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(H*W), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, H*W); - int * __restrict__ In = (int *__restrict__) Arg->In; - unsigned char * __restrict__ Scale = (unsigned char *__restrict__) Arg->Scale; - unsigned char * __restrict__ ScaleN = (unsigned char *__restrict__) Arg->ScaleN; - unsigned char * __restrict__ Out = (unsigned char *__restrict__) Arg->Out; - signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos; - unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; - int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0]; - int Prenorm = Infos[AT_INF_PRENORM]; - - for (int i=First; iFeat; - int W = Arg->W; - int H = Arg->H; - unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(H*W), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, H*W); - int * __restrict__ In = (int *__restrict__) Arg->In; - unsigned char * __restrict__ Scale = (unsigned char *__restrict__) Arg->Scale; - unsigned char * __restrict__ ScaleN = (unsigned char *__restrict__) Arg->ScaleN; - unsigned char * __restrict__ Out = (unsigned char *__restrict__) Arg->Out; - signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos; - unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; - int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0]; - int Prenorm = Infos[AT_INF_PRENORM]; - - for (int i=First; iFeat; 
- int W = Arg->W; - int H = Arg->H; - unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(H*W), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, H*W); - int * __restrict__ In = (int *__restrict__) Arg->In; - unsigned char * __restrict__ Scale = (unsigned char *__restrict__) Arg->Scale; - unsigned char * __restrict__ ScaleN = (unsigned char *__restrict__) Arg->ScaleN; - unsigned char * __restrict__ Out = (unsigned char *__restrict__) Arg->Out; - signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos; - unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; - int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0]; - int Prenorm = Infos[AT_INF_PRENORM]; - - for (int i=First; iFeat; - int W = Arg->W; - int H = Arg->H; - unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(H*W), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, H*W); - int * __restrict__ In = (int *__restrict__) Arg->In; - unsigned char * __restrict__ Scale = (unsigned char *__restrict__) Arg->Scale; - unsigned char * __restrict__ ScaleN = (unsigned char *__restrict__) Arg->ScaleN; - unsigned char * __restrict__ Out = (unsigned char *__restrict__) Arg->Out; - signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos; - unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; - int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0]; - int Prenorm = Infos[AT_INF_PRENORM]; - - for (int i=First; iFeat; - int W = Arg->W; - int H = Arg->H; - unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(H*W), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, H*W); - int * __restrict__ In = (int *__restrict__) Arg->In; - unsigned char * __restrict__ Scale = (unsigned char *__restrict__) Arg->Scale; - unsigned char * __restrict__ ScaleN = (unsigned char *__restrict__) Arg->ScaleN; - unsigned 
char * __restrict__ Out = (unsigned char *__restrict__) Arg->Out; - signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos; - unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; - int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0]; - int Prenorm = Infos[AT_INF_PRENORM]; - - for (int i=First; iFeat; -// int W = Arg->W; -// int H = Arg->H; -// unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(H*W), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, H*W); -// int * __restrict__ In = (int *__restrict__) Arg->In; -// unsigned char * __restrict__ Scale = (unsigned char *__restrict__) Arg->Scale; -// unsigned char * __restrict__ ScaleN = (unsigned char *__restrict__) Arg->ScaleN; -// unsigned char * __restrict__ Out = (unsigned char *__restrict__) Arg->Out; -// signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos; -// unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; -// int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0]; -// int Prenorm = Infos[AT_INF_PRENORM]; - -// for (int i=First; iFeat; -// int W = Arg->W; -// int H = Arg->H; -// unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(H*W), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, H*W); -// int * __restrict__ In = (int *__restrict__) Arg->In; -// unsigned char * __restrict__ Scale = (unsigned char *__restrict__) Arg->Scale; -// unsigned char * __restrict__ ScaleN = (unsigned char *__restrict__) Arg->ScaleN; -// unsigned char * __restrict__ Out = (unsigned char *__restrict__) Arg->Out; -// signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos; -// unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; -// int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0]; -// 
int Prenorm = Infos[AT_INF_PRENORM]; - -// for (int i=First; iFeat; -// int W = Arg->W; -// int H = Arg->H; -// unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(H*W), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, H*W); -// int * __restrict__ In = (int *__restrict__) Arg->In; -// unsigned char * __restrict__ Scale = (unsigned char *__restrict__) Arg->Scale; -// unsigned char * __restrict__ ScaleN = (unsigned char *__restrict__) Arg->ScaleN; -// unsigned char * __restrict__ Out = (unsigned char *__restrict__) Arg->Out; -// signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos; -// unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; -// int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0]; -// int Prenorm = Infos[AT_INF_PRENORM]; - -// for (int i=First; iFeat; - int W = Arg->W; - int H = Arg->H; - unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(H*W), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, H*W); - int * __restrict__ In = (int *__restrict__) Arg->In; - unsigned char * __restrict__ Scale = (unsigned char *__restrict__) Arg->Scale; - unsigned char * __restrict__ ScaleN = (unsigned char *__restrict__) Arg->ScaleN; - unsigned char * __restrict__ Out = (unsigned char *__restrict__) Arg->Out; - signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos; - unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; - int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0]; - int Prenorm = Infos[AT_INF_PRENORM]; - - for (int i=First; i> 8, ActScale, ActScaleN), 8); - } - } - gap_waitbarrier(0); -} - -void KerParReduct_CC_Tanh_HWC_USQ8(KerConvLinReduct_SQ8_T *Arg) - -{ - int Feat = Arg->Feat; - int W = Arg->W; - int H = Arg->H; - unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(H*W), First = CoreId*ChunkCell, Last = 
Min(First+ChunkCell, H*W); - int * __restrict__ In = (int *__restrict__) Arg->In; - unsigned char * __restrict__ Scale = (unsigned char *__restrict__) Arg->Scale; - unsigned char * __restrict__ ScaleN = (unsigned char *__restrict__) Arg->ScaleN; - unsigned char * __restrict__ Out = (unsigned char *__restrict__) Arg->Out; - signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos; - unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; - int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0]; - int Prenorm = Infos[AT_INF_PRENORM]; - - for (int i=First; i> 8, ActScale, ActScaleN), 7) + 128); - } - } - gap_waitbarrier(0); -} - // void parray(unsigned char * Out, int Feat, int H, int W) { // for (int c=0; c Sin/Sq12 = 2**(-4) / 2**(-12) = 2**(8) --> << 8 - // y in Q15 is then shifted to fit int8 Q7 data --> >> 8 and scaled to the output scale with ActScale - int Acc0N = Acc0 << 8; - Acc0 = AT_SCALE((Sigmoid(Acc0N) >> 8), ActScale, ActScaleN); - int Acc1N = Acc1 << 8; - Acc1 = AT_SCALE((Sigmoid(Acc1N) >> 8), ActScale, ActScaleN); - } - break; - case ACT_TANH: - { - // Assumes input (Acc) in Sq[-8:8] = 16 / 256 = 2**(-4) - // y = Sigmoid(x) expects x in Q12 --> Sin/Sq12 = 2**(-4) / 2**(-12) = 2**(8) --> << 8 - // y in Q15 is then shifted to fit int8 Q7 data --> >> 8 and scaled to the output scale with ActScale - int Acc0N = Acc0 << 8; - Acc0 = AT_SCALE((Tanh(Acc0N) >> 8), ActScale, ActScaleN); - int Acc1N = Acc1 << 8; - Acc1 = AT_SCALE((Tanh(Acc1N) >> 8), ActScale, ActScaleN); - } - break; - } - Out[2*i] = gap_clip(Acc0, 7), Out[2*i+1] = gap_clip(Acc1, 7); - } - if (N&0x1) { - unsigned int i=N-1; - int Acc0 = In[i]; - switch (Activation) { - case ACT_NONE: Acc0 = AT_SCALE(Acc0, ActScale, ActScaleN); break; - case ACT_RELU: Acc0 = AT_SCALE(Max(0, Acc0), ActScale, ActScaleN); break; - case ACT_RELUM: Acc0 = AT_SCALE(Max(A0, Acc0), ActScale, ActScaleN); break; - case 
ACT_RELUMN: Acc0 = AT_SCALE(Min(B0, Max(A0, Acc0)), ActScale, ActScaleN); break; - case ACT_RELUN: Acc0 = AT_SCALE(AT_CLIP_POS(Acc0, A0), ActScale, ActScaleN); break; - case ACT_HSIGMOID: Acc0 = AT_SCALE(AT_CLIP_POS(Acc0 + B0, A0) * C0, ActScale, ActScaleN); break; - case ACT_HSWISH: Acc0 = AT_SCALE(AT_CLIP_POS(Acc0 + B0, A0) * C0 * Acc0, ActScale, ActScaleN); break; - case ACT_LEAKYRELU: - { - int Neg0 = gap_bitextractu(Acc0, 1, 31), Pos0 = !Neg0; - int Acc0N = AT_NORM(Acc0 * A0, 7); - Acc0 = AT_SCALE((Neg0*Acc0N+Pos0*Acc0), ActScale, ActScaleN); - // Acc0 = AT_SCALE(((Acc0<0) ? AT_NORM((Acc0 * A0), 7):Acc0), ActScale, ActScaleN); - } - break; - case ACT_SIGMOID: - { - int Acc0N = Acc0 << 8; - Acc0 = AT_SCALE((Sigmoid(Acc0N) >> 8), ActScale, ActScaleN); - } - break; - case ACT_TANH: - { - int Acc0N = Acc0 << 8; - Acc0 = AT_SCALE((Tanh(Acc0N) >> 8), ActScale, ActScaleN); - } - break; - } - Out[i] = gap_clip(Acc0, 7); - } -} - -/* - * Standalone activation variant with Scale = 1.0 -*/ -static void Ker_ActivationScale1_SQ8( - signed char * __restrict__ In, - signed char * __restrict__ Out, - unsigned int N, - CNN_ActivationOper_T Activation, - int A0, - int B0 - ) - -{ - for (unsigned int i=0; i Sin/Sq12 = 2**(-4) / 2**(-12) = 2**(8) --> << 8 - // y in Q15 is then shifted to fit int8 Q7 data --> >> 8 and scaled to the output scale with ActScale - int Acc0N = Acc0 << 8; - Acc0 = AT_SCALE((Sigmoid(Acc0N) >> 8), ActScale, ActScaleN); - int Acc1N = Acc1 << 8; - Acc1 = AT_SCALE((Sigmoid(Acc1N) >> 8), ActScale, ActScaleN); - } - break; - case ACT_TANH: - { - // Assumes input (Acc) in Sq[-8:8] = 16 / 256 = 2**(-4) - // y = Sigmoid(x) expects x in Q12 --> Sin/Sq12 = 2**(-4) / 2**(-12) = 2**(8) --> << 8 - // y in Q15 is then shifted to fit int8 Q7 data --> >> 8 and scaled to the output scale with ActScale - int Acc0N = Acc0 << 8; - Acc0 = AT_SCALE((Tanh(Acc0N) >> 8), ActScale, ActScaleN); - int Acc1N = Acc1 << 8; - Acc1 = AT_SCALE((Tanh(Acc1N) >> 8), ActScale, ActScaleN); - } 
- break; - } - Out[2*i] = gap_clip(Acc0, 7), Out[2*i+1] = gap_clip(Acc1, 7); - } - if (N&0x1) { - unsigned int i=N-1; - int Acc0 = gap_clip(AT_SCALE(In[i], Scale, ScaleN), 7); - switch (Activation) { - case ACT_NONE: Acc0 = AT_SCALE(Acc0, ActScale, ActScaleN); break; - case ACT_RELU: Acc0 = AT_SCALE(Max(0, Acc0), ActScale, ActScaleN); break; - case ACT_RELUN: Acc0 = AT_SCALE(AT_CLIP_POS(Acc0, A0), ActScale, ActScaleN); break; - case ACT_RELUM: Acc0 = AT_SCALE(Max(A0, Acc0), ActScale, ActScaleN); break; - case ACT_RELUMN: Acc0 = AT_SCALE(Min(B0, Max(A0, Acc0)), ActScale, ActScaleN); break; - case ACT_HSIGMOID: Acc0 = AT_SCALE(AT_CLIP_POS(Acc0 + B0, A0) * C0, ActScale, ActScaleN); break; - case ACT_HSWISH: Acc0 = AT_SCALE(AT_CLIP_POS(Acc0 + B0, A0) * C0 * Acc0, ActScale, ActScaleN); break; - case ACT_LEAKYRELU: - { - int Neg0 = gap_bitextractu(Acc0, 1, 31), Pos0 = !Neg0; - int Acc0N = AT_NORM(Acc0 * A0, 7); - Acc0 = AT_SCALE((Neg0*Acc0N+Pos0*Acc0), ActScale, ActScaleN); - // Acc0 = AT_SCALE(((Acc0<0) ? 
AT_NORM((Acc0 * A0), 7):Acc0), ActScale, ActScaleN); - } - break; - case ACT_SIGMOID: - { - int Acc0N = Acc0 << 8; - Acc0 = AT_SCALE((Sigmoid(Acc0N) >> 8), ActScale, ActScaleN); - } - break; - case ACT_TANH: - { - int Acc0N = Acc0 << 8; - Acc0 = AT_SCALE((Tanh(Acc0N) >> 8), ActScale, ActScaleN); - } - break; - } - Out[i] = gap_clip(Acc0, 7); - } -} - -/* - * Standalone activation variant with Scale = 1.0 -*/ -static void Ker_ActivationScale1_ScaleIn_SQ8( - signed char * __restrict__ In, - signed char * __restrict__ Out, - unsigned int Scale, - unsigned int ScaleN, - unsigned int N, - CNN_ActivationOper_T Activation, - int A0, - int B0 - ) - -{ - for (unsigned int i=0; i> 8), ActScale, ActScaleN); - } - break; - case ACT_TANH: - { - int Acc0N = Acc0 << 8; - Acc0 = AT_SCALE((Tanh(Acc0N) >> 8), ActScale, ActScaleN); - } - break; - } - Out[i] = gap_clip(Acc0, 7); - } -} - -/* - * Conv/Linear DP scaling followed by an optional activation, Out buffer is different from In Buffer -*/ -/*static void KerReduct_Activation_SQ8( - int * __restrict__ In, - signed char * __restrict__ Out, - unsigned int N, - unsigned int Scale, - unsigned int ScaleN, - CNN_ActivationOper_T Activation, - unsigned int ActScale, unsigned int ActScaleN, int A0, int B0, int C0 - ) - -{ - for (unsigned int i=0; i> 8), ActScale, ActScaleN); - } - break; - case ACT_TANH: - { - int Acc0N = Acc0 << 8; - Acc0 = AT_SCALE((Tanh(Acc0N) >> 8), ActScale, ActScaleN); - } - break; - } - Out[i*Feat] = gap_clip(Acc0, 7); - } -} - -/* - * Conv/Linear DP scaling followed by an optional activation, variant for ScaleAct=1.0, Out buffer is different from In Buffer -*/ -static void KerReduct_ActivationScale1_SQ8( - int * __restrict__ In, - signed char * __restrict__ Out, - unsigned int N, - unsigned int Scale, - unsigned int ScaleN, - CNN_ActivationOper_T Activation, - int A0, int B0, int C0 - ) - -{ - for (unsigned int i=0; i Sin/Sq12 = 2**(-4) / 2**(-12) = 2**(8) --> << 8 - // y in Q15 is then shifted to fit int8 Q7 
data --> >> 8 and scaled to the output scale with ActScale - int Acc0N = Acc0 << 8; - Acc0 = AT_SCALE((Sigmoid(Acc0N) >> 8), ActScale, ActScaleN); - } - break; - case ACT_TANH: - { - int Acc0N = Acc0 << 8; - Acc0 = AT_SCALE((Tanh(Acc0N) >> 8), ActScale, ActScaleN); - } - break; - } - Out[i] = gap_clip(Acc0, 7); - } -} - -/* - * Conv/Linear DP scaling followed by an optional activation, variant for ActScale=1.0, In place version - * Input is 32b int output is 8b -*/ -static void KerReductIO_ActivationScale1_SQ8( - signed char *__restrict__ Out, - int *__restrict__ In, - unsigned int N, - unsigned int Scale, - unsigned int ScaleN, - CNN_ActivationOper_T Activation, - int A0, int B0, int C0 - ) - -{ - for (unsigned int i=0; i Sin/Sq12 = 2**(-4) / 2**(-12) = 2**(8) --> << 8 - // y in Q15 is then shifted to fit int8 Q7 data --> >> 8 and scaled to the output scale with ActScale - int Acc0N = Acc0 << 8; - Acc0 = AT_SCALE((Sigmoid(Acc0N) >> 8), ActScale, ActScaleN); - int Acc1N = Acc1 << 8; - Acc1 = AT_SCALE((Sigmoid(Acc1N) >> 8), ActScale, ActScaleN); - } - break; - case ACT_TANH: - { - // Assumes input (Acc) in Sq[-8:8] = 16 / 256 = 2**(-4) - // y = Sigmoid(x) expects x in Q12 --> Sin/Sq12 = 2**(-4) / 2**(-12) = 2**(8) --> << 8 - // y in Q15 is then shifted to fit int8 Q7 data --> >> 8 and scaled to the output scale with ActScale - int Acc0N = Acc0 << 8; - Acc0 = AT_SCALE((Tanh(Acc0N) >> 8), ActScale, ActScaleN); - int Acc1N = Acc1 << 8; - Acc1 = AT_SCALE((Tanh(Acc1N) >> 8), ActScale, ActScaleN); - } - break; - } - Out[2*i] = gap_clip(Acc0, 7); Out[2*i+1] = gap_clip(Acc1, 7); - } - if (N&0x1) { - int Acc0 = gap_clip(AT_SCALE(In[N-1], Scale, ScaleN), 7); - switch (Activation) { - case ACT_NONE: - break; - case ACT_RELU: - Acc0 = AT_SCALE(Max(0, Acc0), ActScale, ActScaleN); - break; - case ACT_RELUN: - Acc0 = AT_SCALE(AT_CLIP_POS(Acc0, A0), ActScale, ActScaleN); - break; - case ACT_RELUM: - Acc0 = AT_SCALE(Max(A0, Acc0), ActScale, ActScaleN); - break; - case ACT_RELUMN: - 
Acc0 = AT_SCALE(Min(B0, Max(A0, Acc0)), ActScale, ActScaleN); - break; - case ACT_HSIGMOID: - Acc0 = AT_SCALE(AT_CLIP_POS(Acc0 + B0, A0) * C0, ActScale, ActScaleN); - break; - case ACT_HSWISH: - Acc0 = AT_SCALE(AT_CLIP_POS(Acc0 + B0, A0) * C0 * Acc0, ActScale, ActScaleN); - break; - case ACT_LEAKYRELU: - { - int Neg0 = gap_bitextractu(Acc0, 1, 31), Pos0 = !Neg0; - int Acc0N = AT_NORM(Acc0 * A0, 7); - Acc0 = AT_SCALE((Neg0*Acc0N+Pos0*Acc0), ActScale, ActScaleN); - - // Acc0 = AT_SCALE(((Acc0<0) ? AT_NORM(Acc0 * A0, 7):Acc0), ActScale, ActScaleN); - } - break; - case ACT_SIGMOID: - { - int Acc0N = Acc0 << 8; - Acc0 = AT_SCALE((Sigmoid(Acc0N) >> 8), ActScale, ActScaleN); - } - break; - case ACT_TANH: - { - int Acc0N = Acc0 << 8; - Acc0 = AT_SCALE((Tanh(Acc0N) >> 8), ActScale, ActScaleN); - } - break; - } - Out[N-1] = gap_clip(Acc0, 7); - } -} - -/* - * Conv/Linear DP scaling followed by an optional activation, variant for ActScale=1.0, Out buffer is different from In Buffer - * Partial unroll to avoid load use penalty -*/ -static void _KerReduct_ActivationScale1_SQ8( - int * __restrict__ In, - signed char * __restrict__ Out, - unsigned int N, - unsigned int Scale, - unsigned int ScaleN, - CNN_ActivationOper_T Activation, - int A0, int B0, int C0 - ) - -{ - for (unsigned int i=0; i<(N/2); i++) { - int Acc0 = gap_clip(AT_SCALE(In[2*i+0], Scale, ScaleN), 7); - int Acc1 = gap_clip(AT_SCALE(In[2*i+1], Scale, ScaleN), 7); - - switch (Activation) { - case ACT_NONE: - break; - case ACT_RELU: - Acc0 = Max(0, Acc0); - Acc1 = Max(0, Acc1); - break; - case ACT_RELUN: - Acc0 = AT_CLIP_POS(Acc0, A0); - Acc1 = AT_CLIP_POS(Acc1, A0); - break; - case ACT_RELUM: - Acc0 = Max(A0, Acc0); - Acc1 = Max(A0, Acc1); - break; - case ACT_RELUMN: - Acc0 = Min(B0, Max(A0, Acc0)); - Acc1 = Min(B0, Max(A0, Acc1)); - break; - } - Out[2*i] = Acc0; Out[2*i+1] = Acc1; - } - if (N&0x1) { - int Acc0 = gap_clip(AT_SCALE(In[N-1], Scale, ScaleN), 7); - switch (Activation) { - case ACT_NONE: - break; - case 
ACT_RELU: - Acc0 = Max(0, Acc0); - break; - case ACT_RELUN: - Acc0 = AT_CLIP_POS(Acc0, A0); - break; - case ACT_RELUM: - Acc0 = Max(A0, Acc0); - break; - case ACT_RELUMN: - Acc0 = Min(B0, Max(A0, Acc0)); - break; - } - Out[N-1] = Acc0; - } -} - -/* - * Conv/Linear DP scaling followed by an optional activation, In place version - * Input is 32b int output is 8b - * Partially unrolled version to avoid load use penalty -*/ -static void _KerReductIO_Activation_SQ8( - signed char * __restrict__ Out, - int *__restrict__ In, - unsigned int N, - unsigned int Scale, - unsigned int ScaleN, - CNN_ActivationOper_T Activation, - unsigned int ActScale, unsigned int ActScaleN, int A0, int B0, int C0 - ) - -{ - for (unsigned int i=0; i<(N/2); i++) { - int Acc0 = gap_clip(AT_SCALE(In[2*i+0], Scale, ScaleN), 7); - int Acc1 = gap_clip(AT_SCALE(In[2*i+1], Scale, ScaleN), 7); - switch (Activation) { - case ACT_NONE: - break; - case ACT_RELU: - Acc0 = AT_SCALE(Max(0, Acc0), ActScale, ActScaleN); - Acc1 = AT_SCALE(Max(0, Acc1), ActScale, ActScaleN); - break; - case ACT_RELUN: - Acc0 = AT_SCALE(AT_CLIP_POS(Acc0, A0), ActScale, ActScaleN); - Acc1 = AT_SCALE(AT_CLIP_POS(Acc1, A0), ActScale, ActScaleN); - break; - case ACT_RELUM: - Acc0 = AT_SCALE(Max(A0, Acc0), ActScale, ActScaleN); - Acc1 = AT_SCALE(Max(A0, Acc1), ActScale, ActScaleN); - break; - case ACT_RELUMN: - Acc0 = AT_SCALE(Min(B0, Max(A0, Acc0)), ActScale, ActScaleN); - Acc1 = AT_SCALE(Min(B0, Max(A0, Acc1)), ActScale, ActScaleN); - break; - case ACT_HSIGMOID: - Acc0 = AT_SCALE(AT_CLIP_POS(Acc0 + B0, A0) * C0, ActScale, ActScaleN); - Acc1 = AT_SCALE(AT_CLIP_POS(Acc1 + B0, A0) * C0, ActScale, ActScaleN); - break; - case ACT_HSWISH: - Acc0 = AT_SCALE(AT_CLIP_POS(Acc0 + B0, A0) * C0 * Acc0, ActScale, ActScaleN); - Acc1 = AT_SCALE(AT_CLIP_POS(Acc1 + B0, A0) * C0 * Acc1, ActScale, ActScaleN); - break; - case ACT_LEAKYRELU: - { - int Neg0 = gap_bitextractu(Acc0, 1, 31), Pos0 = !Neg0; - int Neg1 = gap_bitextractu(Acc1, 1, 31), Pos1 = 
!Neg1; - int Acc0N = AT_NORM(Acc0 * A0, 7); - int Acc1N = AT_NORM(Acc1 * A0, 7); - Acc0 = AT_SCALE((Neg0*Acc0N+Pos0*Acc0), ActScale, ActScaleN); - Acc1 = AT_SCALE((Neg1*Acc1N+Pos1*Acc1), ActScale, ActScaleN); - - // Acc0 = AT_SCALE(((Acc0<0) ? AT_NORM(Acc0 * A0, 7):Acc0), ActScale, ActScaleN); - // Acc1 = AT_SCALE(((Acc1<0) ? AT_NORM(Acc1 * A0, 7):Acc1), ActScale, ActScaleN); - } - break; - case ACT_SIGMOID: - { - // Assumes input (Acc) in Sq[-8:8] = 16 / 256 = 2**(-4) - // y = Sigmoid(x) expects x in Q12 --> Sin/Sq12 = 2**(-4) / 2**(-12) = 2**(8) --> << 8 - // y in Q15 is then shifted to fit int8 Q7 data --> >> 8 and scaled to the output scale with ActScale - int Acc0N = Acc0 << 8; - Acc0 = AT_SCALE((Sigmoid(Acc0N) >> 8), ActScale, ActScaleN); - int Acc1N = Acc1 << 8; - Acc1 = AT_SCALE((Sigmoid(Acc1N) >> 8), ActScale, ActScaleN); - } - break; - case ACT_TANH: - { - // Assumes input (Acc) in Sq[-8:8] = 16 / 256 = 2**(-4) - // y = Sigmoid(x) expects x in Q12 --> Sin/Sq12 = 2**(-4) / 2**(-12) = 2**(8) --> << 8 - // y in Q15 is then shifted to fit int8 Q7 data --> >> 8 and scaled to the output scale with ActScale - int Acc0N = Acc0 << 8; - Acc0 = AT_SCALE((Tanh(Acc0N) >> 8), ActScale, ActScaleN); - int Acc1N = Acc1 << 8; - Acc1 = AT_SCALE((Tanh(Acc1N) >> 8), ActScale, ActScaleN); - } - break; - } - Out[2*i] = gap_clip(Acc0, 7); Out[2*i+1] = gap_clip(Acc1, 7); - } - if (N&0x1) { - int Acc0 = gap_clip(AT_SCALE(In[N-1], Scale, ScaleN), 7); - switch (Activation) { - case ACT_NONE: - break; - case ACT_RELU: - Acc0 = AT_SCALE(Max(0, Acc0), ActScale, ActScaleN); - break; - case ACT_RELUN: - Acc0 = AT_SCALE(AT_CLIP_POS(Acc0, A0), ActScale, ActScaleN); - break; - case ACT_RELUM: - Acc0 = AT_SCALE(Max(A0, Acc0), ActScale, ActScaleN); - break; - case ACT_RELUMN: - Acc0 = AT_SCALE(Min(B0, Max(A0, Acc0)), ActScale, ActScaleN); - break; - case ACT_HSIGMOID: - Acc0 = AT_SCALE(AT_CLIP_POS(Acc0 + B0, A0) * C0, ActScale, ActScaleN); - break; - case ACT_HSWISH: - Acc0 = 
AT_SCALE(AT_CLIP_POS(Acc0 + B0, A0) * C0 * Acc0, ActScale, ActScaleN); - break; - case ACT_LEAKYRELU: - { - int Neg0 = gap_bitextractu(Acc0, 1, 31), Pos0 = !Neg0; - int Acc0N = AT_NORM(Acc0 * A0, 7); - Acc0 = AT_SCALE((Neg0*Acc0N+Pos0*Acc0), ActScale, ActScaleN); - - // Acc0 = AT_SCALE(((Acc0<0) ? AT_NORM(Acc0 * A0, 7):Acc0), ActScale, ActScaleN); - } - break; - case ACT_SIGMOID: - { - int Acc0N = Acc0 << 8; - Acc0 = AT_SCALE((Sigmoid(Acc0N) >> 8), ActScale, ActScaleN); - } - break; - case ACT_TANH: - { - int Acc0N = Acc0 << 8; - Acc0 = AT_SCALE((Tanh(Acc0N) >> 8), ActScale, ActScaleN); - } - break; - } - Out[N-1] = gap_clip(Acc0, 7); - } -} - -/* - * Conv/Linear DP scaling followed by an optional activation, Variant for ActScale=1.0, In place version - * Input is 32b int output is 8b - * Partially unrolled version to avoid load use penalty -*/ -static void _KerReductIO_ActivationScale1_SQ8( - signed char *__restrict__ Out, - int *__restrict__ In, - unsigned int N, - unsigned int Scale, - unsigned int ScaleN, - CNN_ActivationOper_T Activation, - int A0, int B0, int C0 - ) - -{ - for (unsigned int i=0; i<(N/2); i++) { - int Acc0 = gap_clip(AT_SCALE(In[2*i+0], Scale, ScaleN), 7); - int Acc1 = gap_clip(AT_SCALE(In[2*i+1], Scale, ScaleN), 7); - switch (Activation) { - case ACT_NONE: - break; - case ACT_RELU: - Acc0 = Max(0, Acc0); - Acc1 = Max(0, Acc1); - break; - case ACT_RELUN: - Acc0 = AT_CLIP_POS(Acc0, A0); - Acc1 = AT_CLIP_POS(Acc1, A0); - break; - case ACT_RELUM: - Acc0 = Max(A0, Acc0); - Acc1 = Max(A0, Acc1); - break; - case ACT_RELUMN: - Acc0 = Min(B0, Max(A0, Acc0)); - Acc1 = Min(B0, Max(A0, Acc1)); - break; - } - Out[2*i] = Acc0; Out[2*i+1] = Acc1; - } - if (N&0x1) { - int Acc0 = gap_clip(AT_SCALE(In[N-1], Scale, ScaleN), 7); - switch (Activation) { - case ACT_NONE: - break; - case ACT_RELU: - Acc0 = Max(0, Acc0); - break; - case ACT_RELUN: - Acc0 = AT_CLIP_POS(Acc0, A0); - break; - case ACT_RELUM: - Acc0 = Max(A0, Acc0); - break; - case ACT_RELUMN: - Acc0 = 
Min(B0, Max(A0, Acc0)); - break; - } - Out[N-1] = Acc0; - } -} - -/* - * Buffer compaction, scattered by chunk size groups of 8b moved to a contiguous representation through a parallel reduction tree -*/ -static void __attribute__ ((noinline)) KerReductIO_Compact_SQ8(int *__restrict__ In, unsigned int Size, unsigned int CoreId, unsigned int ChunkCell) - -{ - unsigned int U = gap_ncore()/2, Log2Core = gap_fl1(gap_ncore()), A = 2, B = 1; - for (int k=0; kW*Arg->H*Arg->Feat, CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, S); \ + decl(in_d_type * __restrict__, In) = decl((in_d_type *__restrict__), Arg->In); \ + decl(out_d_type * __restrict__, Out) = decl((out_d_type *__restrict__), Arg->Out); \ + signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos; \ + unsigned int Size = Max(0, Last-First); \ + unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; \ + int A0 = arr_at_as(Infos, AT_INF_A0, p_type); int B0 = arr_at_as(Infos, AT_INF_B0, p_type); int C0 = arr_at_as(Infos, AT_INF_C0, p_type); \ +\ + for (unsigned int i=First; iFeat; \ + unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, S); \ + int * __restrict__ In = (int *__restrict__) Arg->In; \ + unsigned char * __restrict__ Scale = (unsigned char *__restrict__) Arg->Scale; \ + unsigned char * __restrict__ ScaleN = (unsigned char *__restrict__) Arg->ScaleN; \ + decl(d_type * __restrict__, Out) = decl((d_type *__restrict__), Arg->Out); \ + signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos; \ + unsigned int Size = Arg->W*Arg->H; \ + unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; \ + int A0 = arr_at_as(Infos, AT_INF_A0, p_type); int B0 = arr_at_as(Infos, AT_INF_B0, p_type); int C0 = arr_at_as(Infos, AT_INF_C0, 
p_type); \ + int Prenorm = arr_at_as(Infos, AT_INF_PRENORM, p_type); \ +\ + for (unsigned int c=First; cFeat; \ + unsigned S = Arg->W*Arg->H; \ + unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, S); \ + int * __restrict__ In = (int *__restrict__) Arg->In; \ + unsigned char * __restrict__ Scale = (unsigned char *__restrict__) Arg->Scale; \ + unsigned char * __restrict__ ScaleN = (unsigned char *__restrict__) Arg->ScaleN; \ + decl(d_type * __restrict__, Out) = decl((d_type *__restrict__), Arg->Out); \ + signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos; \ + unsigned int Size = Max(0, Last-First); \ + unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; \ + int A0 = arr_at_as(Infos, AT_INF_A0, p_type); int B0 = arr_at_as(Infos, AT_INF_B0, p_type); int C0 = arr_at_as(Infos, AT_INF_C0, p_type); \ + int Prenorm = arr_at_as(Infos, AT_INF_PRENORM, p_type); \ +\ + for (unsigned int c=0; cFeat; \ + unsigned int Size = Arg->W*Arg->H; \ + unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, S); \ + int * __restrict__ In = (int *__restrict__) Arg->In; \ + unsigned char * __restrict__ Scale = (unsigned char *__restrict__) Arg->Scale; \ + unsigned char * __restrict__ ScaleN = (unsigned char *__restrict__) Arg->ScaleN; \ + signed char *__restrict__ Out = (signed char *__restrict__)(In+First*Size); \ + signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos; \ + unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; \ + int A0 = arr_at_as(Infos, AT_INF_A0, p_type); int B0 = arr_at_as(Infos, AT_INF_B0, p_type); int C0 = arr_at_as(Infos, AT_INF_C0, p_type); \ + int Prenorm = arr_at_as(Infos, AT_INF_PRENORM, p_type); \ +\ + for (unsigned int c=First; cFeat); \ +} while(0); + 
+#define KER_REDUCT_IO_ACT_CHW(Activation, d_type, p_type, n_bits, is_unsigned) \ +do { \ + unsigned int Feat = Arg->Feat; \ + unsigned int S = Arg->W*Arg->H; \ + unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, S); \ + int * __restrict__ InOut = (int *__restrict__) Arg->In; \ + unsigned char * __restrict__ Scale = (unsigned char *__restrict__) Arg->Scale; \ + unsigned char * __restrict__ ScaleN = (unsigned char *__restrict__) Arg->ScaleN; \ + signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos; \ + unsigned int Size = Max(0, Last-First); \ + unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; \ + int A0 = arr_at_as(Infos, AT_INF_A0, p_type); int B0 = arr_at_as(Infos, AT_INF_B0, p_type); int C0 = arr_at_as(Infos, AT_INF_C0, p_type); \ + int Prenorm = arr_at_as(Infos, AT_INF_PRENORM, p_type); \ + \ + for (unsigned int c=0; cFeat; \ + unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(Feat), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, Feat); \ + int * __restrict__ In = (int *__restrict__) Arg->In; \ + unsigned char * __restrict__ Scale = (unsigned char *__restrict__) Arg->Scale; \ + unsigned char * __restrict__ ScaleN = (unsigned char *__restrict__) Arg->ScaleN; \ + decl(d_type * __restrict__, Out) = decl((d_type *__restrict__), Arg->Out); \ + signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos; \ + unsigned int Size = Arg->W*Arg->H; \ + unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; \ + int A0 = arr_at_as(Infos, AT_INF_A0, p_type); int B0 = arr_at_as(Infos, AT_INF_B0, p_type); int C0 = arr_at_as(Infos, AT_INF_C0, p_type); \ + int Prenorm = arr_at_as(Infos, AT_INF_PRENORM, p_type); \ +\ + for (unsigned int c=First; cFeat; \ + unsigned S = Arg->W*Arg->H; \ + unsigned int CoreId = gap_coreid(), 
ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, S); \ + int * __restrict__ In = (int *__restrict__) Arg->In; \ + unsigned char * __restrict__ Scale = (unsigned char *__restrict__) Arg->Scale; \ + unsigned char * __restrict__ ScaleN = (unsigned char *__restrict__) Arg->ScaleN; \ + decl(d_type * __restrict__, Out) = decl((d_type *__restrict__), Arg->Out); \ + signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos; \ + unsigned int Size = Max(0, Last-First); \ + unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; \ + int A0 = arr_at_as(Infos, AT_INF_A0, p_type); int B0 = arr_at_as(Infos, AT_INF_B0, p_type); int C0 = arr_at_as(Infos, AT_INF_C0, p_type); \ + int Prenorm = arr_at_as(Infos, AT_INF_PRENORM, p_type); \ +\ + for (unsigned int c=0; cFeat; \ + unsigned S = Arg->W*Arg->H; \ + unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, S); \ + int * __restrict__ In = (int *__restrict__) Arg->In; \ + unsigned char * __restrict__ Scale = (unsigned char *__restrict__) Arg->Scale; \ + unsigned char * __restrict__ ScaleN = (unsigned char *__restrict__) Arg->ScaleN; \ + decl(d_type * __restrict__, Out) = decl((d_type *__restrict__), Arg->Out); \ + signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos; \ + unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; \ + int A0 = arr_at_as(Infos, AT_INF_A0, p_type); int B0 = arr_at_as(Infos, AT_INF_B0, p_type); int C0 = arr_at_as(Infos, AT_INF_C0, p_type); \ + int Prenorm = arr_at_as(Infos, AT_INF_PRENORM, p_type); \ +\ + for (unsigned int i=First; iFeat; \ + unsigned S = Arg->W*Arg->H; \ + unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, S); \ + int * __restrict__ In = (int *__restrict__) Arg->In; 
\ + unsigned char * __restrict__ Scale = (unsigned char *__restrict__) Arg->Scale; \ + unsigned char * __restrict__ ScaleN = (unsigned char *__restrict__) Arg->ScaleN; \ + decl(d_type * __restrict__, Out) = decl((d_type *__restrict__), Arg->Out); \ + signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos; \ + unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; \ + int A0 = arr_at_as(Infos, AT_INF_A0, p_type); int B0 = arr_at_as(Infos, AT_INF_B0, p_type); int C0 = arr_at_as(Infos, AT_INF_C0, p_type); \ + int Prenorm = arr_at_as(Infos, AT_INF_PRENORM, p_type); \ +\ + for (unsigned int i=First; iFeat; - unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, S); - int * __restrict__ In = (int *__restrict__) Arg->In; - unsigned char * __restrict__ Scale = (unsigned char *__restrict__) Arg->Scale; - unsigned char * __restrict__ ScaleN = (unsigned char *__restrict__) Arg->ScaleN; - signed char * __restrict__ Out = (signed char *__restrict__) Arg->Out; - signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos; - unsigned int Size = Arg->W*Arg->H; - unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; - int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0]; - - for (int c=First; cFeat; - unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, S); - int * __restrict__ In = (int *__restrict__) Arg->In; - unsigned char * __restrict__ Scale = (unsigned char *__restrict__) Arg->Scale; - unsigned char * __restrict__ ScaleN = (unsigned char *__restrict__) Arg->ScaleN; - signed char * __restrict__ Out = (signed char *__restrict__) Arg->Out; - signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos; - unsigned int Size = Arg->W*Arg->H; - unsigned int 
ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; - int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0]; - - for (int c=First; cFeat; - unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, S); - int * __restrict__ In = (int *__restrict__) Arg->In; - unsigned char * __restrict__ Scale = (unsigned char *__restrict__) Arg->Scale; - unsigned char * __restrict__ ScaleN = (unsigned char *__restrict__) Arg->ScaleN; - signed char * __restrict__ Out = (signed char *__restrict__) Arg->Out; - signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos; - unsigned int Size = Arg->W*Arg->H; - unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; - int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0]; - - for (int c=First; cFeat; - unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, S); - int * __restrict__ In = (int *__restrict__) Arg->In; - unsigned char * __restrict__ Scale = (unsigned char *__restrict__) Arg->Scale; - unsigned char * __restrict__ ScaleN = (unsigned char *__restrict__) Arg->ScaleN; - signed char * __restrict__ Out = (signed char *__restrict__) Arg->Out; - signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos; - unsigned int Size = Arg->W*Arg->H; - unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; - int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0]; - - for (int c=First; cFeat; - unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, S); - int * __restrict__ In = (int *__restrict__) Arg->In; - unsigned char * __restrict__ Scale = (unsigned char *__restrict__) Arg->Scale; - unsigned 
char * __restrict__ ScaleN = (unsigned char *__restrict__) Arg->ScaleN; - signed char * __restrict__ Out = (signed char *__restrict__) Arg->Out; - signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos; - unsigned int Size = Arg->W*Arg->H; - unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; - int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0]; - - for (int c=First; cFeat; - unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, S); - int * __restrict__ In = (int *__restrict__) Arg->In; - unsigned char * __restrict__ Scale = (unsigned char *__restrict__) Arg->Scale; - unsigned char * __restrict__ ScaleN = (unsigned char *__restrict__) Arg->ScaleN; - signed char * __restrict__ Out = (signed char *__restrict__) Arg->Out; - signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos; - unsigned int Size = Arg->W*Arg->H; - unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; - int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0]; - - for (int c=First; cFeat; - unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, S); - int * __restrict__ In = (int *__restrict__) Arg->In; - unsigned char * __restrict__ Scale = (unsigned char *__restrict__) Arg->Scale; - unsigned char * __restrict__ ScaleN = (unsigned char *__restrict__) Arg->ScaleN; - signed char * __restrict__ Out = (signed char *__restrict__) Arg->Out; - signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos; - unsigned int Size = Arg->W*Arg->H; - unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; - int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0]; - - for (int c=First; 
cFeat; - unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, S); - int * __restrict__ In = (int *__restrict__) Arg->In; - unsigned char * __restrict__ Scale = (unsigned char *__restrict__) Arg->Scale; - unsigned char * __restrict__ ScaleN = (unsigned char *__restrict__) Arg->ScaleN; - signed char * __restrict__ Out = (signed char *__restrict__) Arg->Out; - signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos; - unsigned int Size = Arg->W*Arg->H; - unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; - int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0]; - - for (int c=First; cFeat; - unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, S); - int * __restrict__ In = (int *__restrict__) Arg->In; - unsigned char * __restrict__ Scale = (unsigned char *__restrict__) Arg->Scale; - unsigned char * __restrict__ ScaleN = (unsigned char *__restrict__) Arg->ScaleN; - signed char * __restrict__ Out = (signed char *__restrict__) Arg->Out; - signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos; - unsigned int Size = Arg->W*Arg->H; - unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; - int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0]; - - for (int c=First; cFeat; - unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, S); - int * __restrict__ In = (int *__restrict__) Arg->In; - unsigned char * __restrict__ Scale = (unsigned char *__restrict__) Arg->Scale; - unsigned char * __restrict__ ScaleN = (unsigned char *__restrict__) Arg->ScaleN; - signed char * __restrict__ Out = (signed char *__restrict__) Arg->Out; - signed char * __restrict__ Infos = (signed char *__restrict__) 
Arg->Infos; - unsigned int Size = Arg->W*Arg->H; - unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; - int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0]; - - for (int c=First; cFeat; - unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, S); - int * __restrict__ In = (int *__restrict__) Arg->In; - unsigned char * __restrict__ Scale = (unsigned char *__restrict__) Arg->Scale; - unsigned char * __restrict__ ScaleN = (unsigned char *__restrict__) Arg->ScaleN; - signed char * __restrict__ Out = (signed char *__restrict__) Arg->Out; - signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos; - unsigned int W = Arg->W, H = Arg->H; - unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; - int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0]; - - for (int c=First; cFeat; - unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, S); - int * __restrict__ In = (int *__restrict__) Arg->In; - unsigned char * __restrict__ Scale = (unsigned char *__restrict__) Arg->Scale; - unsigned char * __restrict__ ScaleN = (unsigned char *__restrict__) Arg->ScaleN; - signed char * __restrict__ Out = (signed char *__restrict__) Arg->Out; - signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos; - unsigned int W = Arg->W, H = Arg->H; - unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; - int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0]; - - for (int c=First; cFeat; - unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, S); - int * __restrict__ In = (int *__restrict__) Arg->In; - unsigned char * 
__restrict__ Scale = (unsigned char *__restrict__) Arg->Scale; - unsigned char * __restrict__ ScaleN = (unsigned char *__restrict__) Arg->ScaleN; - signed char * __restrict__ Out = (signed char *__restrict__) Arg->Out; - signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos; - unsigned int W = Arg->W, H = Arg->H; - unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; - int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0]; - - for (int c=First; cFeat; - unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, S); - int * __restrict__ In = (int *__restrict__) Arg->In; - unsigned char * __restrict__ Scale = (unsigned char *__restrict__) Arg->Scale; - unsigned char * __restrict__ ScaleN = (unsigned char *__restrict__) Arg->ScaleN; - signed char * __restrict__ Out = (signed char *__restrict__) Arg->Out; - signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos; - unsigned int W = Arg->W, H = Arg->H; - unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; - int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0]; - - for (int c=First; cFeat; - unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, S); - int * __restrict__ In = (int *__restrict__) Arg->In; - unsigned char * __restrict__ Scale = (unsigned char *__restrict__) Arg->Scale; - unsigned char * __restrict__ ScaleN = (unsigned char *__restrict__) Arg->ScaleN; - signed char * __restrict__ Out = (signed char *__restrict__) Arg->Out; - signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos; - unsigned int W = Arg->W, H = Arg->H; - unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; - int A0 = 
Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0]; - - for (int c=First; cFeat; - unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, S); - int * __restrict__ In = (int *__restrict__) Arg->In; - unsigned char * __restrict__ Scale = (unsigned char *__restrict__) Arg->Scale; - unsigned char * __restrict__ ScaleN = (unsigned char *__restrict__) Arg->ScaleN; - signed char * __restrict__ Out = (signed char *__restrict__) Arg->Out; - signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos; - unsigned int W = Arg->W, H = Arg->H; - unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; - int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0]; - - for (int c=First; cFeat; - unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, S); - int * __restrict__ In = (int *__restrict__) Arg->In; - unsigned char * __restrict__ Scale = (unsigned char *__restrict__) Arg->Scale; - unsigned char * __restrict__ ScaleN = (unsigned char *__restrict__) Arg->ScaleN; - signed char * __restrict__ Out = (signed char *__restrict__) Arg->Out; - signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos; - unsigned int W = Arg->W, H = Arg->H; - unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; - int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0]; - - for (int c=First; cFeat; - unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, S); - int * __restrict__ In = (int *__restrict__) Arg->In; - unsigned char * __restrict__ Scale = (unsigned char *__restrict__) Arg->Scale; - unsigned char * __restrict__ ScaleN = (unsigned char *__restrict__) Arg->ScaleN; - signed char * __restrict__ Out = (signed char 
*__restrict__) Arg->Out; - signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos; - unsigned int W = Arg->W, H = Arg->H; - unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; - int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0]; - - for (int c=First; cFeat; - unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, S); - int * __restrict__ In = (int *__restrict__) Arg->In; - unsigned char * __restrict__ Scale = (unsigned char *__restrict__) Arg->Scale; - unsigned char * __restrict__ ScaleN = (unsigned char *__restrict__) Arg->ScaleN; - signed char * __restrict__ Out = (signed char *__restrict__) Arg->Out; - signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos; - unsigned int W = Arg->W, H = Arg->H; - unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; - int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0]; - - for (int c=First; cFeat; - unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, S); - int * __restrict__ In = (int *__restrict__) Arg->In; - unsigned char * __restrict__ Scale = (unsigned char *__restrict__) Arg->Scale; - unsigned char * __restrict__ ScaleN = (unsigned char *__restrict__) Arg->ScaleN; - signed char * __restrict__ Out = (signed char *__restrict__) Arg->Out; - signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos; - unsigned int W = Arg->W, H = Arg->H; - unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; - int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0]; - - for (int c=First; cFeat; - unsigned int Size = Arg->W*Arg->H; - unsigned int W = Arg->W, H = Arg->H; - unsigned int CoreId = 
gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, S); - int * __restrict__ In = (int *__restrict__) Arg->In; - signed char *__restrict__ Out = (signed char *__restrict__)(In+First*Size); - unsigned char * __restrict__ Scale = (unsigned char *__restrict__) Arg->Scale; - unsigned char * __restrict__ ScaleN = (unsigned char *__restrict__) Arg->ScaleN; - signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos; - unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; - int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0]; - - S = Size*Max(0, Last-First); - for (int c=First; cFeat); -} - -void KerParReductIO_CC_ReLU_SQ8(KerConvLinReduct_SQ8_T *Arg) - -{ - unsigned int S = Arg->Feat; - unsigned int Size = Arg->W*Arg->H; - unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, S); - int * __restrict__ In = (int *__restrict__) Arg->In; - signed char *__restrict__ Out = (signed char *__restrict__)(In+First*Size); - unsigned char * __restrict__ Scale = (unsigned char *__restrict__) Arg->Scale; - unsigned char * __restrict__ ScaleN = (unsigned char *__restrict__) Arg->ScaleN; - signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos; - unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; - int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0]; - - S = Size*Max(0, Last-First); - for (int c=First; cFeat); -} - -void KerParReductIO_CC_ReLUN_SQ8(KerConvLinReduct_SQ8_T *Arg) - -{ - unsigned int S = Arg->Feat; - unsigned int Size = Arg->W*Arg->H; - unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, S); - int * __restrict__ In = (int *__restrict__) Arg->In; - signed char *__restrict__ Out = (signed char 
*__restrict__)(In+First*Size); - unsigned char * __restrict__ Scale = (unsigned char *__restrict__) Arg->Scale; - unsigned char * __restrict__ ScaleN = (unsigned char *__restrict__) Arg->ScaleN; - signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos; - unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; - int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0]; - - S = Size*Max(0, Last-First); - for (int c=First; cFeat); -} - -void KerParReductIO_CC_ReLUM_SQ8(KerConvLinReduct_SQ8_T *Arg) - -{ - unsigned int S = Arg->Feat; - unsigned int Size = Arg->W*Arg->H; - unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, S); - int * __restrict__ In = (int *__restrict__) Arg->In; - signed char *__restrict__ Out = (signed char *__restrict__)(In+First*Size); - unsigned char * __restrict__ Scale = (unsigned char *__restrict__) Arg->Scale; - unsigned char * __restrict__ ScaleN = (unsigned char *__restrict__) Arg->ScaleN; - signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos; - unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; - int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0]; - - S = Size*Max(0, Last-First); - for (int c=First; cFeat); -} - -void KerParReductIO_CC_ReLUMN_SQ8(KerConvLinReduct_SQ8_T *Arg) - -{ - unsigned int S = Arg->Feat; - unsigned int Size = Arg->W*Arg->H; - unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, S); - int * __restrict__ In = (int *__restrict__) Arg->In; - signed char *__restrict__ Out = (signed char *__restrict__)(In+First*Size); - unsigned char * __restrict__ Scale = (unsigned char *__restrict__) Arg->Scale; - unsigned char * __restrict__ ScaleN = (unsigned char *__restrict__) Arg->ScaleN; - signed 
char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos; - unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; - int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0]; - - S = Size*Max(0, Last-First); - for (int c=First; cFeat); -} - -void KerParReductIO_CC_HSigmoid_SQ8(KerConvLinReduct_SQ8_T *Arg) - -{ - unsigned int S = Arg->Feat; - unsigned int Size = Arg->W*Arg->H; - unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, S); - int * __restrict__ In = (int *__restrict__) Arg->In; - signed char *__restrict__ Out = (signed char *__restrict__)(In+First*Size); - unsigned char * __restrict__ Scale = (unsigned char *__restrict__) Arg->Scale; - unsigned char * __restrict__ ScaleN = (unsigned char *__restrict__) Arg->ScaleN; - signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos; - unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; - int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0]; - - S = Size*Max(0, Last-First); - for (int c=First; cFeat); -} - -void KerParReductIO_CC_HSwish_SQ8(KerConvLinReduct_SQ8_T *Arg) - -{ - unsigned int S = Arg->Feat; - unsigned int Size = Arg->W*Arg->H; - unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, S); - int * __restrict__ In = (int *__restrict__) Arg->In; - signed char *__restrict__ Out = (signed char *__restrict__)(In+First*Size); - unsigned char * __restrict__ Scale = (unsigned char *__restrict__) Arg->Scale; - unsigned char * __restrict__ ScaleN = (unsigned char *__restrict__) Arg->ScaleN; - signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos; - unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; - int A0 
= Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0]; - - S = Size*Max(0, Last-First); - for (int c=First; cFeat); -} - -void KerParReductIO_CC_LeakyReLU_SQ8(KerConvLinReduct_SQ8_T *Arg) - -{ - unsigned int S = Arg->Feat; - unsigned int Size = Arg->W*Arg->H; - unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, S); - int * __restrict__ In = (int *__restrict__) Arg->In; - signed char *__restrict__ Out = (signed char *__restrict__)(In+First*Size); - unsigned char * __restrict__ Scale = (unsigned char *__restrict__) Arg->Scale; - unsigned char * __restrict__ ScaleN = (unsigned char *__restrict__) Arg->ScaleN; - signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos; - unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; - int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0]; - - S = Size*Max(0, Last-First); - for (int c=First; cFeat); -} - -void KerParReductIO_CC_Sigmoid_SQ8(KerConvLinReduct_SQ8_T *Arg) - -{ - unsigned int S = Arg->Feat; - unsigned int Size = Arg->W*Arg->H; - unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, S); - int * __restrict__ In = (int *__restrict__) Arg->In; - signed char *__restrict__ Out = (signed char *__restrict__)(In+First*Size); - unsigned char * __restrict__ Scale = (unsigned char *__restrict__) Arg->Scale; - unsigned char * __restrict__ ScaleN = (unsigned char *__restrict__) Arg->ScaleN; - signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos; - unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; - int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0]; - - S = Size*Max(0, Last-First); - for (int c=First; cFeat); -} - -void KerParReductIO_CC_Tanh_SQ8(KerConvLinReduct_SQ8_T *Arg) - -{ 
- unsigned int S = Arg->Feat; - unsigned int Size = Arg->W*Arg->H; - unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, S); - int * __restrict__ In = (int *__restrict__) Arg->In; - signed char *__restrict__ Out = (signed char *__restrict__)(In+First*Size); - unsigned char * __restrict__ Scale = (unsigned char *__restrict__) Arg->Scale; - unsigned char * __restrict__ ScaleN = (unsigned char *__restrict__) Arg->ScaleN; - signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos; - unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; - int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0]; - - S = Size*Max(0, Last-First); - for (int c=First; cFeat); -} - -/* Input Scaling and reduction to 8b then channel centric activation, Out location != In location. Features are evaluated one after the other in parallel */ -void KerReduct_CC_SQ8(KerConvLinReduct_SQ8_T *Arg) - -{ - unsigned int Feat = Arg->Feat; - unsigned S = Arg->W*Arg->H; - unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, S); - int * __restrict__ In = (int *__restrict__) Arg->In; - unsigned char * __restrict__ Scale = (unsigned char *__restrict__) Arg->Scale; - unsigned char * __restrict__ ScaleN = (unsigned char *__restrict__) Arg->ScaleN; - signed char * __restrict__ Out = (signed char *__restrict__) Arg->Out; - signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos; - unsigned int Size = Max(0, Last-First); - unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; - int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0]; - - for (int c=0; cFeat; - unsigned S = Arg->W*Arg->H; - unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last = 
Min(First+ChunkCell, S); - int * __restrict__ In = (int *__restrict__) Arg->In; - unsigned char * __restrict__ Scale = (unsigned char *__restrict__) Arg->Scale; - unsigned char * __restrict__ ScaleN = (unsigned char *__restrict__) Arg->ScaleN; - signed char * __restrict__ Out = (signed char *__restrict__) Arg->Out; - signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos; - unsigned int Size = Max(0, Last-First); - unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; - int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0]; - - for (int c=0; cFeat; - unsigned S = Arg->W*Arg->H; - unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, S); - int * __restrict__ In = (int *__restrict__) Arg->In; - unsigned char * __restrict__ Scale = (unsigned char *__restrict__) Arg->Scale; - unsigned char * __restrict__ ScaleN = (unsigned char *__restrict__) Arg->ScaleN; - signed char * __restrict__ Out = (signed char *__restrict__) Arg->Out; - signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos; - unsigned int Size = Max(0, Last-First); - unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; - int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0]; - - for (int c=0; cFeat; - unsigned S = Arg->W*Arg->H; - unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, S); - int * __restrict__ In = (int *__restrict__) Arg->In; - unsigned char * __restrict__ Scale = (unsigned char *__restrict__) Arg->Scale; - unsigned char * __restrict__ ScaleN = (unsigned char *__restrict__) Arg->ScaleN; - signed char * __restrict__ Out = (signed char *__restrict__) Arg->Out; - signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos; - unsigned int Size = Max(0, 
Last-First); - unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; - int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0]; - - for (int c=0; cFeat; - unsigned S = Arg->W*Arg->H; - unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, S); - int * __restrict__ In = (int *__restrict__) Arg->In; - unsigned char * __restrict__ Scale = (unsigned char *__restrict__) Arg->Scale; - unsigned char * __restrict__ ScaleN = (unsigned char *__restrict__) Arg->ScaleN; - signed char * __restrict__ Out = (signed char *__restrict__) Arg->Out; - signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos; - unsigned int Size = Max(0, Last-First); - unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; - int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0]; - - for (int c=0; cFeat; - unsigned S = Arg->W*Arg->H; - unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, S); - int * __restrict__ In = (int *__restrict__) Arg->In; - unsigned char * __restrict__ Scale = (unsigned char *__restrict__) Arg->Scale; - unsigned char * __restrict__ ScaleN = (unsigned char *__restrict__) Arg->ScaleN; - signed char * __restrict__ Out = (signed char *__restrict__) Arg->Out; - signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos; - unsigned int Size = Max(0, Last-First); - unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; - int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0]; - - for (int c=0; cFeat; - unsigned S = Arg->W*Arg->H; - unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, S); - int * __restrict__ In = (int 
*__restrict__) Arg->In; - unsigned char * __restrict__ Scale = (unsigned char *__restrict__) Arg->Scale; - unsigned char * __restrict__ ScaleN = (unsigned char *__restrict__) Arg->ScaleN; - signed char * __restrict__ Out = (signed char *__restrict__) Arg->Out; - signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos; - unsigned int Size = Max(0, Last-First); - unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; - int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0]; - - for (int c=0; cFeat; - unsigned S = Arg->W*Arg->H; - unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, S); - int * __restrict__ In = (int *__restrict__) Arg->In; - unsigned char * __restrict__ Scale = (unsigned char *__restrict__) Arg->Scale; - unsigned char * __restrict__ ScaleN = (unsigned char *__restrict__) Arg->ScaleN; - signed char * __restrict__ Out = (signed char *__restrict__) Arg->Out; - signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos; - unsigned int Size = Max(0, Last-First); - unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; - int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0]; - - for (int c=0; cFeat; - unsigned S = Arg->W*Arg->H; - unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, S); - int * __restrict__ In = (int *__restrict__) Arg->In; - unsigned char * __restrict__ Scale = (unsigned char *__restrict__) Arg->Scale; - unsigned char * __restrict__ ScaleN = (unsigned char *__restrict__) Arg->ScaleN; - signed char * __restrict__ Out = (signed char *__restrict__) Arg->Out; - signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos; - unsigned int Size = Max(0, Last-First); - unsigned int ActScale = ((unsigned char 
*)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; - int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0]; - - for (int c=0; cFeat; - unsigned S = Arg->W*Arg->H; - unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, S); - int * __restrict__ In = (int *__restrict__) Arg->In; - unsigned char * __restrict__ Scale = (unsigned char *__restrict__) Arg->Scale; - unsigned char * __restrict__ ScaleN = (unsigned char *__restrict__) Arg->ScaleN; - signed char * __restrict__ Out = (signed char *__restrict__) Arg->Out; - signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos; - unsigned int Size = Max(0, Last-First); - unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; - int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0]; - - for (int c=0; cFeat; - unsigned int S = Arg->W*Arg->H; - unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, S); - int * __restrict__ InOut = (int *__restrict__) Arg->In; - unsigned char * __restrict__ Scale = (unsigned char *__restrict__) Arg->Scale; - unsigned char * __restrict__ ScaleN = (unsigned char *__restrict__) Arg->ScaleN; - signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos; - unsigned int Size = Max(0, Last-First); - unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; - int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0]; - - for (int c=0; cFeat; - unsigned int S = Arg->W*Arg->H; - unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, S); - int * __restrict__ InOut = (int *__restrict__) Arg->In; - unsigned char * __restrict__ Scale = (unsigned char *__restrict__) Arg->Scale; - unsigned 
char * __restrict__ ScaleN = (unsigned char *__restrict__) Arg->ScaleN; - signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos; - unsigned int Size = Max(0, Last-First); - unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; - int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0]; - - for (int c=0; cFeat; - unsigned int S = Arg->W*Arg->H; - unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, S); - int * __restrict__ InOut = (int *__restrict__) Arg->In; - unsigned char * __restrict__ Scale = (unsigned char *__restrict__) Arg->Scale; - unsigned char * __restrict__ ScaleN = (unsigned char *__restrict__) Arg->ScaleN; - signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos; - unsigned int Size = Max(0, Last-First); - unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; - int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0]; - - for (int c=0; cFeat; - unsigned int S = Arg->W*Arg->H; - unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, S); - int * __restrict__ InOut = (int *__restrict__) Arg->In; - unsigned char * __restrict__ Scale = (unsigned char *__restrict__) Arg->Scale; - unsigned char * __restrict__ ScaleN = (unsigned char *__restrict__) Arg->ScaleN; - signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos; - unsigned int Size = Max(0, Last-First); - unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; - int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0]; - - for (int c=0; cFeat; - unsigned int S = Arg->W*Arg->H; - unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last = 
Min(First+ChunkCell, S); - int * __restrict__ InOut = (int *__restrict__) Arg->In; - unsigned char * __restrict__ Scale = (unsigned char *__restrict__) Arg->Scale; - unsigned char * __restrict__ ScaleN = (unsigned char *__restrict__) Arg->ScaleN; - signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos; - unsigned int Size = Max(0, Last-First); - unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; - int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0]; - - for (int c=0; cFeat; - unsigned int S = Arg->W*Arg->H; - unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, S); - int * __restrict__ InOut = (int *__restrict__) Arg->In; - unsigned char * __restrict__ Scale = (unsigned char *__restrict__) Arg->Scale; - unsigned char * __restrict__ ScaleN = (unsigned char *__restrict__) Arg->ScaleN; - signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos; - unsigned int Size = Max(0, Last-First); - unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; - int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0]; - - for (int c=0; cFeat; - unsigned int S = Arg->W*Arg->H; - unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, S); - int * __restrict__ InOut = (int *__restrict__) Arg->In; - unsigned char * __restrict__ Scale = (unsigned char *__restrict__) Arg->Scale; - unsigned char * __restrict__ ScaleN = (unsigned char *__restrict__) Arg->ScaleN; - signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos; - unsigned int Size = Max(0, Last-First); - unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; - int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = 
Infos[AT_INF_C0]; - - for (int c=0; cFeat; - unsigned int S = Arg->W*Arg->H; - unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, S); - int * __restrict__ InOut = (int *__restrict__) Arg->In; - unsigned char * __restrict__ Scale = (unsigned char *__restrict__) Arg->Scale; - unsigned char * __restrict__ ScaleN = (unsigned char *__restrict__) Arg->ScaleN; - signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos; - unsigned int Size = Max(0, Last-First); - unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; - int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0]; - - for (int c=0; cFeat; - unsigned int S = Arg->W*Arg->H; - unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, S); - int * __restrict__ InOut = (int *__restrict__) Arg->In; - unsigned char * __restrict__ Scale = (unsigned char *__restrict__) Arg->Scale; - unsigned char * __restrict__ ScaleN = (unsigned char *__restrict__) Arg->ScaleN; - signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos; - unsigned int Size = Max(0, Last-First); - unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; - int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0]; - - for (int c=0; cFeat; - unsigned int S = Arg->W*Arg->H; - unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, S); - int * __restrict__ InOut = (int *__restrict__) Arg->In; - unsigned char * __restrict__ Scale = (unsigned char *__restrict__) Arg->Scale; - unsigned char * __restrict__ ScaleN = (unsigned char *__restrict__) Arg->ScaleN; - signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos; - unsigned int Size = Max(0, Last-First); - unsigned int 
ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; - int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0]; - - for (int c=0; cW*Arg->H*Arg->Feat, CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, S); - signed char * __restrict__ In = (signed char *__restrict__) Arg->In; - signed char * __restrict__ Out = (signed char *__restrict__) Arg->Out; - signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos; - unsigned int Size = Max(0, Last-First); - unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; - - - Ker_Activation_SQ8(In+First, Out+First, Size, ACT_NONE, ActScale, ActScaleN, 0, 0, 0); - gap_waitbarrier(0); +void KerParReduct_CC_CHW2HWC_ReLUM_SQ8(KerConvLinReduct_SQ8_T *Arg) { + KER_PAR_REDUCT_ACT_CHW2HWC(ACT_RELUM, signed char, unsigned char, 8, 0); } -void Ker_Scale_SQ8(KerActivation_SQ8_T *Arg) - -{ - unsigned int S = Arg->W*Arg->H*Arg->Feat, CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, S); - signed char * __restrict__ In = (signed char *__restrict__) Arg->In; - signed char * __restrict__ Out = (signed char *__restrict__) Arg->Out; - signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos; - unsigned int Size = Max(0, Last-First); - unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; - +void KerParReduct_CC_CHW2HWC_ReLUMN_SQ8(KerConvLinReduct_SQ8_T *Arg) { + KER_PAR_REDUCT_ACT_CHW2HWC(ACT_RELUMN, signed char, unsigned char, 8, 0); +} - Ker_Activation_SQ8(In+First, Out+First, Size, ACT_NONE, ActScale, ActScaleN, 0, 0, 0); - gap_waitbarrier(0); +void KerParReduct_CC_CHW2HWC_HSigmoid_SQ8(KerConvLinReduct_SQ8_T *Arg) { + KER_PAR_REDUCT_ACT_CHW2HWC(ACT_HSIGMOID, signed char, unsigned char, 8, 0); } -void 
Ker_ReLU_SQ8(KerActivation_SQ8_T *Arg) +void KerParReduct_CC_CHW2HWC_HSwish_SQ8(KerConvLinReduct_SQ8_T *Arg) { + KER_PAR_REDUCT_ACT_CHW2HWC(ACT_HSWISH, signed char, unsigned char, 8, 0); +} -{ - unsigned int S = Arg->W*Arg->H*Arg->Feat, CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, S); - signed char * __restrict__ In = (signed char *__restrict__) Arg->In; - signed char * __restrict__ Out = (signed char *__restrict__) Arg->Out; - signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos; - unsigned int Size = Max(0, Last-First); - unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; - int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0]; +void KerParReduct_CC_CHW2HWC_LeakyReLU_SQ8(KerConvLinReduct_SQ8_T *Arg) { + KER_PAR_REDUCT_ACT_CHW2HWC(ACT_LEAKYRELU, signed char, unsigned char, 8, 0); +} +void KerParReduct_CC_CHW2HWC_Sigmoid_SQ8(KerConvLinReduct_SQ8_T *Arg) { + KER_PAR_REDUCT_ACT_CHW2HWC(ACT_SIGMOID, signed char, unsigned char, 8, 0); +} - if (ActScale) Ker_Activation_SQ8(In+First, Out+First, Size, ACT_RELU, ActScale, ActScaleN, A0, B0, C0); - else Ker_ActivationScale1_SQ8(In+First, Out+First, Size, ACT_RELU, A0, B0); - gap_waitbarrier(0); +void KerParReduct_CC_CHW2HWC_Tanh_SQ8(KerConvLinReduct_SQ8_T *Arg) { + KER_PAR_REDUCT_ACT_CHW2HWC(ACT_TANH, signed char, unsigned char, 8, 0); } -void Ker_ReLUN_SQ8(KerActivation_SQ8_T *Arg) +/* + * Input Scaling and reduction to 8b then channel centric activation, Out location = In location. 
Features are evaluated in parallel +*/ +void KerParReductIO_CC_SQ8(KerConvLinReduct_SQ8_T *Arg) { + KER_PAR_REDUCT_IO_ACT_CHW(ACT_NONE, signed char, unsigned char, 8, 0); +} -{ - unsigned int S = Arg->W*Arg->H*Arg->Feat, CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, S); - signed char * __restrict__ In = (signed char *__restrict__) Arg->In; - signed char * __restrict__ Out = (signed char *__restrict__) Arg->Out; - signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos; - unsigned int Size = Max(0, Last-First); - unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; - int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0]; +void KerParReductIO_CC_ReLU_SQ8(KerConvLinReduct_SQ8_T *Arg) { + KER_PAR_REDUCT_IO_ACT_CHW(ACT_RELU, signed char, unsigned char, 8, 0); +} +void KerParReductIO_CC_ReLUN_SQ8(KerConvLinReduct_SQ8_T *Arg) { + KER_PAR_REDUCT_IO_ACT_CHW(ACT_RELUN, signed char, unsigned char, 8, 0); +} - if (ActScale) Ker_Activation_SQ8(In+First, Out+First, Size, ACT_RELUN, ActScale, ActScaleN, A0, B0, C0); - else Ker_ActivationScale1_SQ8(In+First, Out+First, Size, ACT_RELUN, A0, B0); - gap_waitbarrier(0); +void KerParReductIO_CC_ReLUM_SQ8(KerConvLinReduct_SQ8_T *Arg) { + KER_PAR_REDUCT_IO_ACT_CHW(ACT_RELUM, signed char, unsigned char, 8, 0); } -void Ker_ReLUM_SQ8(KerActivation_SQ8_T *Arg) +void KerParReductIO_CC_ReLUMN_SQ8(KerConvLinReduct_SQ8_T *Arg) { + KER_PAR_REDUCT_IO_ACT_CHW(ACT_RELUMN, signed char, unsigned char, 8, 0); +} -{ - unsigned int S = Arg->W*Arg->H*Arg->Feat, CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, S); - signed char * __restrict__ In = (signed char *__restrict__) Arg->In; - signed char * __restrict__ Out = (signed char *__restrict__) Arg->Out; - signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos; - unsigned 
int Size = Max(0, Last-First); - unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; - int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0]; +void KerParReductIO_CC_HSigmoid_SQ8(KerConvLinReduct_SQ8_T *Arg) { + KER_PAR_REDUCT_IO_ACT_CHW(ACT_HSIGMOID, signed char, unsigned char, 8, 0); +} +void KerParReductIO_CC_HSwish_SQ8(KerConvLinReduct_SQ8_T *Arg) { + KER_PAR_REDUCT_IO_ACT_CHW(ACT_HSWISH, signed char, unsigned char, 8, 0); +} - if (ActScale) Ker_Activation_SQ8(In+First, Out+First, Size, ACT_RELUM, ActScale, ActScaleN, A0, B0, C0); - else Ker_ActivationScale1_SQ8(In+First, Out+First, Size, ACT_RELUM, A0, B0); - gap_waitbarrier(0); +void KerParReductIO_CC_LeakyReLU_SQ8(KerConvLinReduct_SQ8_T *Arg) { + KER_PAR_REDUCT_IO_ACT_CHW(ACT_LEAKYRELU, signed char, unsigned char, 8, 0); } -void Ker_ReLUMN_SQ8(KerActivation_SQ8_T *Arg) +void KerParReductIO_CC_Sigmoid_SQ8(KerConvLinReduct_SQ8_T *Arg) { + KER_PAR_REDUCT_IO_ACT_CHW(ACT_SIGMOID, signed char, unsigned char, 8, 0); +} -{ - unsigned int S = Arg->W*Arg->H*Arg->Feat, CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, S); - signed char * __restrict__ In = (signed char *__restrict__) Arg->In; - signed char * __restrict__ Out = (signed char *__restrict__) Arg->Out; - signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos; - unsigned int Size = Max(0, Last-First); - unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; - int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0]; +void KerParReductIO_CC_Tanh_SQ8(KerConvLinReduct_SQ8_T *Arg) { + KER_PAR_REDUCT_IO_ACT_CHW(ACT_TANH, signed char, unsigned char, 8, 0); +} - if (ActScale) Ker_Activation_SQ8(In+First, Out+First, Size, ACT_RELUMN, ActScale, ActScaleN, A0, B0, C0); - else Ker_ActivationScale1_SQ8(In+First, Out+First, Size, 
ACT_RELUMN, A0, B0); - gap_waitbarrier(0); +/* + * Input Scaling and reduction to 8b then channel centric activation, Out location != In location. Features are evaluated one after the other in parallel +*/ +void KerReduct_CC_SQ8(KerConvLinReduct_SQ8_T *Arg) { + KER_REDUCT_ACT_CHW(ACT_NONE, signed char, unsigned char, 8, 0); } -void Ker_HSigmoid_SQ8(KerActivation_SQ8_T *Arg) - -{ - unsigned int S = Arg->W*Arg->H*Arg->Feat, CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, S); - signed char * __restrict__ In = (signed char *__restrict__) Arg->In; - signed char * __restrict__ Out = (signed char *__restrict__) Arg->Out; - signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos; - unsigned int Size = Max(0, Last-First); - unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; - int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0]; +void KerReduct_CC_ReLU_SQ8(KerConvLinReduct_SQ8_T *Arg) { + KER_REDUCT_ACT_CHW(ACT_RELU, signed char, unsigned char, 8, 0); +} +void KerReduct_CC_ReLUN_SQ8(KerConvLinReduct_SQ8_T *Arg) { + KER_REDUCT_ACT_CHW(ACT_RELUN, signed char, unsigned char, 8, 0); +} - Ker_Activation_SQ8(In+First, Out+First, Size, ACT_HSIGMOID, ActScale, ActScaleN, A0, B0, C0); - gap_waitbarrier(0); +void KerReduct_CC_ReLUM_SQ8(KerConvLinReduct_SQ8_T *Arg) { + KER_REDUCT_ACT_CHW(ACT_RELUM, signed char, unsigned char, 8, 0); } -void Ker_HSwish_SQ8(KerActivation_SQ8_T *Arg) +void KerReduct_CC_ReLUMN_SQ8(KerConvLinReduct_SQ8_T *Arg) { + KER_REDUCT_ACT_CHW(ACT_RELUMN, signed char, unsigned char, 8, 0); +} -{ - unsigned int S = Arg->W*Arg->H*Arg->Feat, CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, S); - signed char * __restrict__ In = (signed char *__restrict__) Arg->In; - signed char * __restrict__ Out = (signed char *__restrict__) Arg->Out; - signed char * 
__restrict__ Infos = (signed char *__restrict__) Arg->Infos; - unsigned int Size = Max(0, Last-First); - unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; - int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0]; +void KerReduct_CC_HSigmoid_SQ8(KerConvLinReduct_SQ8_T *Arg) { + KER_REDUCT_ACT_CHW(ACT_HSIGMOID, signed char, unsigned char, 8, 0); +} +void KerReduct_CC_HSwish_SQ8(KerConvLinReduct_SQ8_T *Arg) { + KER_REDUCT_ACT_CHW(ACT_HSWISH, signed char, unsigned char, 8, 0); +} - Ker_Activation_SQ8(In+First, Out+First, Size, ACT_HSWISH, ActScale, ActScaleN, A0, B0, C0); - gap_waitbarrier(0); +void KerReduct_CC_LeakyReLU_SQ8(KerConvLinReduct_SQ8_T *Arg) { + KER_REDUCT_ACT_CHW(ACT_LEAKYRELU, signed char, unsigned char, 8, 0); } -void Ker_LeakyReLU_SQ8(KerActivation_SQ8_T *Arg) +void KerReduct_CC_Sigmoid_SQ8(KerConvLinReduct_SQ8_T *Arg) { + KER_REDUCT_ACT_CHW(ACT_SIGMOID, signed char, unsigned char, 8, 0); +} -{ - unsigned int S = Arg->W*Arg->H*Arg->Feat, CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, S); - signed char * __restrict__ In = (signed char *__restrict__) Arg->In; - signed char * __restrict__ Out = (signed char *__restrict__) Arg->Out; - signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos; - unsigned int Size = Max(0, Last-First); - unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; - int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0]; +void KerReduct_CC_Tanh_SQ8(KerConvLinReduct_SQ8_T *Arg) { + KER_REDUCT_ACT_CHW(ACT_TANH, signed char, unsigned char, 8, 0); +} +/* + * Input Scaling and reduction to 8b then channel centric activation, Out location = In location. 
Features are evaluated one after the other in parallel +*/ +void KerReductIO_CC_SQ8(KerConvLinReduct_SQ8_T *Arg) { + KER_REDUCT_IO_ACT_CHW(ACT_NONE, signed char, unsigned char, 8, 0); +} - Ker_Activation_SQ8(In+First, Out+First, Size, ACT_LEAKYRELU, ActScale, ActScaleN, A0, B0, C0); - gap_waitbarrier(0); +void KerReductIO_CC_ReLU_SQ8(KerConvLinReduct_SQ8_T *Arg) { + KER_REDUCT_IO_ACT_CHW(ACT_RELU, signed char, unsigned char, 8, 0); } -void Ker_Sigmoid_SQ8(KerActivation_SQ8_T *Arg) +void KerReductIO_CC_ReLUN_SQ8(KerConvLinReduct_SQ8_T *Arg) { + KER_REDUCT_IO_ACT_CHW(ACT_RELUN, signed char, unsigned char, 8, 0); +} -{ - unsigned int S = Arg->W*Arg->H*Arg->Feat, CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, S); - signed char * __restrict__ In = (signed char *__restrict__) Arg->In; - signed char * __restrict__ Out = (signed char *__restrict__) Arg->Out; - signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos; - unsigned int Size = Max(0, Last-First); - unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; - int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0]; +void KerReductIO_CC_ReLUM_SQ8(KerConvLinReduct_SQ8_T *Arg) { + KER_REDUCT_IO_ACT_CHW(ACT_RELUM, signed char, unsigned char, 8, 0); +} +void KerReductIO_CC_ReLUMN_SQ8(KerConvLinReduct_SQ8_T *Arg) { + KER_REDUCT_IO_ACT_CHW(ACT_RELUMN, signed char, unsigned char, 8, 0); +} - Ker_Activation_SQ8(In+First, Out+First, Size, ACT_SIGMOID, ActScale, ActScaleN, A0, B0, C0); - gap_waitbarrier(0); +void KerReductIO_CC_HSigmoid_SQ8(KerConvLinReduct_SQ8_T *Arg) { + KER_REDUCT_IO_ACT_CHW(ACT_HSIGMOID, signed char, unsigned char, 8, 0); } -void Ker_Tanh_SQ8(KerActivation_SQ8_T *Arg) +void KerReductIO_CC_HSwish_SQ8(KerConvLinReduct_SQ8_T *Arg) { + KER_REDUCT_IO_ACT_CHW(ACT_HSWISH, signed char, unsigned char, 8, 0); +} -{ - unsigned int S = Arg->W*Arg->H*Arg->Feat, 
CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, S); - signed char * __restrict__ In = (signed char *__restrict__) Arg->In; - signed char * __restrict__ Out = (signed char *__restrict__) Arg->Out; - signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos; - unsigned int Size = Max(0, Last-First); - unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; - int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0]; +void KerReductIO_CC_LeakyReLU_SQ8(KerConvLinReduct_SQ8_T *Arg) { + KER_REDUCT_IO_ACT_CHW(ACT_LEAKYRELU, signed char, unsigned char, 8, 0); +} +void KerReductIO_CC_Sigmoid_SQ8(KerConvLinReduct_SQ8_T *Arg) { + KER_REDUCT_IO_ACT_CHW(ACT_SIGMOID, signed char, unsigned char, 8, 0); +} - Ker_Activation_SQ8(In+First, Out+First, Size, ACT_TANH, ActScale, ActScaleN, A0, B0, C0); - gap_waitbarrier(0); +void KerReductIO_CC_Tanh_SQ8(KerConvLinReduct_SQ8_T *Arg) { + KER_REDUCT_IO_ACT_CHW(ACT_TANH, signed char, unsigned char, 8, 0); } /* - * Standalone Scaled Activation with Extra Scale before activation, Features are evaluated one after the other in parallel + * Standalone Scaled Activation, No reduction with Scale[c] ScaleN[c] - All the elements can be evaluated in parallel */ -void Ker_ActNone_ScaleIn_SQ8(KerActivation_SQ8_T *Arg) - -{ - unsigned int S = Arg->W*Arg->H*Arg->Feat, CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, S); - signed char * __restrict__ In = (signed char *__restrict__) Arg->In; - signed char * __restrict__ Out = (signed char *__restrict__) Arg->Out; - signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos; - unsigned int Size = Max(0, Last-First); - unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; - unsigned int In1Scale = ((unsigned char 
*)Arg->Infos)[AT_INF_IN1SCALE], In1ScaleN = ((unsigned char *)Arg->Infos)[AT_INF_IN1SCALEN]; - int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0]; - - - if (ActScale) Ker_Activation_ScaleIn_SQ8(In+First, Out+First, In1Scale, In1ScaleN, Size, ACT_NONE, ActScale, ActScaleN, A0, B0, C0); - else Ker_ActivationScale1_ScaleIn_SQ8(In+First, Out+First, In1Scale, In1ScaleN, Size, ACT_NONE, A0, B0); - gap_waitbarrier(0); +void Ker_ActNone_SQ8(KerActivation_SQ8_T *Arg) { + KER_ACT(ACT_NONE, signed char, signed char, unsigned char, 8, 0); } -void Ker_ReLU_ScaleIn_SQ8(KerActivation_SQ8_T *Arg) - -{ - unsigned int S = Arg->W*Arg->H*Arg->Feat, CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, S); - signed char * __restrict__ In = (signed char *__restrict__) Arg->In; - signed char * __restrict__ Out = (signed char *__restrict__) Arg->Out; - signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos; - unsigned int Size = Max(0, Last-First); - unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; - unsigned int In1Scale = ((unsigned char *)Arg->Infos)[AT_INF_IN1SCALE], In1ScaleN = ((unsigned char *)Arg->Infos)[AT_INF_IN1SCALEN]; - int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0]; - - - if (ActScale) Ker_Activation_ScaleIn_SQ8(In+First, Out+First, In1Scale, In1ScaleN, Size, ACT_RELU, ActScale, ActScaleN, A0, B0, C0); - else Ker_ActivationScale1_ScaleIn_SQ8(In+First, Out+First, In1Scale, In1ScaleN, Size, ACT_RELU, A0, B0); - gap_waitbarrier(0); +void Ker_ReLU_SQ8(KerActivation_SQ8_T *Arg) { + KER_ACT(ACT_RELU, signed char, signed char, unsigned char, 8, 0); } -void Ker_ReLUN_ScaleIn_SQ8(KerActivation_SQ8_T *Arg) - -{ - unsigned int S = Arg->W*Arg->H*Arg->Feat, CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, S); - signed char * __restrict__ In = (signed 
char *__restrict__) Arg->In; - signed char * __restrict__ Out = (signed char *__restrict__) Arg->Out; - signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos; - unsigned int Size = Max(0, Last-First); - unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; - unsigned int In1Scale = ((unsigned char *)Arg->Infos)[AT_INF_IN1SCALE], In1ScaleN = ((unsigned char *)Arg->Infos)[AT_INF_IN1SCALEN]; - int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0]; - - - if (ActScale) Ker_Activation_ScaleIn_SQ8(In+First, Out+First, In1Scale, In1ScaleN, Size, ACT_RELUN, ActScale, ActScaleN, A0, B0, C0); - else Ker_ActivationScale1_ScaleIn_SQ8(In+First, Out+First, In1Scale, In1ScaleN, Size, ACT_RELUN, A0, B0); - gap_waitbarrier(0); +void Ker_ReLUN_SQ8(KerActivation_SQ8_T *Arg) { + KER_ACT(ACT_RELUN, signed char, signed char, unsigned char, 8, 0); } -void Ker_ReLUM_ScaleIn_SQ8(KerActivation_SQ8_T *Arg) - -{ - unsigned int S = Arg->W*Arg->H*Arg->Feat, CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, S); - signed char * __restrict__ In = (signed char *__restrict__) Arg->In; - signed char * __restrict__ Out = (signed char *__restrict__) Arg->Out; - signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos; - unsigned int Size = Max(0, Last-First); - unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; - unsigned int In1Scale = ((unsigned char *)Arg->Infos)[AT_INF_IN1SCALE], In1ScaleN = ((unsigned char *)Arg->Infos)[AT_INF_IN1SCALEN]; - int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0]; - - - if (ActScale) Ker_Activation_ScaleIn_SQ8(In+First, Out+First, In1Scale, In1ScaleN, Size, ACT_RELUM, ActScale, ActScaleN, A0, B0, C0); - else Ker_ActivationScale1_ScaleIn_SQ8(In+First, Out+First, In1Scale, In1ScaleN, Size, ACT_RELUM, A0, 
B0); - gap_waitbarrier(0); +void Ker_ReLUM_SQ8(KerActivation_SQ8_T *Arg) { + KER_ACT(ACT_RELUM, signed char, signed char, unsigned char, 8, 0); } -void Ker_ReLUMN_ScaleIn_SQ8(KerActivation_SQ8_T *Arg) - -{ - unsigned int S = Arg->W*Arg->H*Arg->Feat, CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, S); - signed char * __restrict__ In = (signed char *__restrict__) Arg->In; - signed char * __restrict__ Out = (signed char *__restrict__) Arg->Out; - signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos; - unsigned int Size = Max(0, Last-First); - unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; - unsigned int In1Scale = ((unsigned char *)Arg->Infos)[AT_INF_IN1SCALE], In1ScaleN = ((unsigned char *)Arg->Infos)[AT_INF_IN1SCALEN]; - int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0]; - - - if (ActScale) Ker_Activation_ScaleIn_SQ8(In+First, Out+First, In1Scale, In1ScaleN, Size, ACT_RELUMN, ActScale, ActScaleN, A0, B0, C0); - else Ker_ActivationScale1_ScaleIn_SQ8(In+First, Out+First, In1Scale, In1ScaleN, Size, ACT_RELUMN, A0, B0); - gap_waitbarrier(0); +void Ker_ReLUMN_SQ8(KerActivation_SQ8_T *Arg) { + KER_ACT(ACT_RELUMN, signed char, signed char, unsigned char, 8, 0); } -void Ker_HSigmoid_ScaleIn_SQ8(KerActivation_SQ8_T *Arg) - -{ - unsigned int S = Arg->W*Arg->H*Arg->Feat, CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, S); - signed char * __restrict__ In = (signed char *__restrict__) Arg->In; - signed char * __restrict__ Out = (signed char *__restrict__) Arg->Out; - signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos; - unsigned int Size = Max(0, Last-First); - unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; - unsigned int In1Scale = ((unsigned char 
*)Arg->Infos)[AT_INF_IN1SCALE], In1ScaleN = ((unsigned char *)Arg->Infos)[AT_INF_IN1SCALEN]; - int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0]; - - - Ker_Activation_ScaleIn_SQ8(In+First, Out+First, In1Scale, In1ScaleN, Size, ACT_HSIGMOID, ActScale, ActScaleN, A0, B0, C0); - gap_waitbarrier(0); +void Ker_HSigmoid_SQ8(KerActivation_SQ8_T *Arg) { + KER_ACT(ACT_HSIGMOID, signed char, signed char, unsigned char, 8, 0); } -void Ker_HSwish_ScaleIn_SQ8(KerActivation_SQ8_T *Arg) - -{ - unsigned int S = Arg->W*Arg->H*Arg->Feat, CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, S); - signed char * __restrict__ In = (signed char *__restrict__) Arg->In; - signed char * __restrict__ Out = (signed char *__restrict__) Arg->Out; - signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos; - unsigned int Size = Max(0, Last-First); - unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; - unsigned int In1Scale = ((unsigned char *)Arg->Infos)[AT_INF_IN1SCALE], In1ScaleN = ((unsigned char *)Arg->Infos)[AT_INF_IN1SCALEN]; - int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0]; - - - Ker_Activation_ScaleIn_SQ8(In+First, Out+First, In1Scale, In1ScaleN, Size, ACT_HSWISH, ActScale, ActScaleN, A0, B0, C0); - gap_waitbarrier(0); +void Ker_HSwish_SQ8(KerActivation_SQ8_T *Arg) { + KER_ACT(ACT_HSWISH, signed char, signed char, unsigned char, 8, 0); } -void Ker_LeakyReLU_ScaleIn_SQ8(KerActivation_SQ8_T *Arg) - -{ - unsigned int S = Arg->W*Arg->H*Arg->Feat, CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, S); - signed char * __restrict__ In = (signed char *__restrict__) Arg->In; - signed char * __restrict__ Out = (signed char *__restrict__) Arg->Out; - signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos; - unsigned int Size = Max(0, Last-First); 
- unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; - unsigned int In1Scale = ((unsigned char *)Arg->Infos)[AT_INF_IN1SCALE], In1ScaleN = ((unsigned char *)Arg->Infos)[AT_INF_IN1SCALEN]; - int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0]; - - - Ker_Activation_ScaleIn_SQ8(In+First, Out+First, In1Scale, In1ScaleN, Size, ACT_LEAKYRELU, ActScale, ActScaleN, A0, B0, C0); - gap_waitbarrier(0); +void Ker_LeakyReLU_SQ8(KerActivation_SQ8_T *Arg) { + KER_ACT(ACT_LEAKYRELU, signed char, signed char, unsigned char, 8, 0); } -void Ker_Sigmoid_ScaleIn_SQ8(KerActivation_SQ8_T *Arg) - -{ - unsigned int S = Arg->W*Arg->H*Arg->Feat, CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, S); - signed char * __restrict__ In = (signed char *__restrict__) Arg->In; - signed char * __restrict__ Out = (signed char *__restrict__) Arg->Out; - signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos; - unsigned int Size = Max(0, Last-First); - unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; - unsigned int In1Scale = ((unsigned char *)Arg->Infos)[AT_INF_IN1SCALE], In1ScaleN = ((unsigned char *)Arg->Infos)[AT_INF_IN1SCALEN]; - int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0]; - - - Ker_Activation_ScaleIn_SQ8(In+First, Out+First, In1Scale, In1ScaleN, Size, ACT_SIGMOID, ActScale, ActScaleN, A0, B0, C0); - gap_waitbarrier(0); +void Ker_Sigmoid_SQ8(KerActivation_SQ8_T *Arg) { + KER_ACT(ACT_SIGMOID, signed char, signed char, unsigned char, 8, 0); } -void Ker_Tanh_ScaleIn_SQ8(KerActivation_SQ8_T *Arg) - -{ - unsigned int S = Arg->W*Arg->H*Arg->Feat, CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, S); - signed char * __restrict__ In = (signed char *__restrict__) Arg->In; - signed char * 
__restrict__ Out = (signed char *__restrict__) Arg->Out; - signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos; - unsigned int Size = Max(0, Last-First); - unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; - unsigned int In1Scale = ((unsigned char *)Arg->Infos)[AT_INF_IN1SCALE], In1ScaleN = ((unsigned char *)Arg->Infos)[AT_INF_IN1SCALEN]; - int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0]; +void Ker_Tanh_SQ8(KerActivation_SQ8_T *Arg) { + KER_ACT(ACT_TANH, signed char, signed char, unsigned char, 8, 0); +} +/* + * from int32 to 8/16bits + optional Activation - Reduction with Scale[c] ScaleN[c] - All the elements can be evaluated in parallel +*/ - Ker_Activation_ScaleIn_SQ8(In+First, Out+First, In1Scale, In1ScaleN, Size, ACT_TANH, ActScale, ActScaleN, A0, B0, C0); - gap_waitbarrier(0); +/* ------------------------------------------------------ Signed 8 bits ------------------------------------------------------ */ +void KerReduct_CC_NoScale_SQ8(KerConvLinReduct_SQ8_T *Arg) { + KER_ACT(ACT_NONE, int, signed char, unsigned char, 8, 0); } - -/* - * Input Scaling and reduction to 8b then channel cnetric activation, Out location != In location. 
Features are evaluated in parallel -*/ -void KerReduct_CC_NoScale_SQ8(KerConvLinReduct_SQ8_T *Arg) - -{ - int Feat = Arg->Feat; - int W = Arg->W; - int H = Arg->H; - unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(H*W*Feat), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, Feat*H*W); - int * __restrict__ In = (int *__restrict__) Arg->In; - signed char * __restrict__ Out = (signed char *__restrict__) Arg->Out; - signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos; - unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; - int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0]; - - for (int i=First; iFeat; - int W = Arg->W; - int H = Arg->H; - unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(H*W*Feat), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, Feat*H*W); - int * __restrict__ In = (int *__restrict__) Arg->In; - signed char * __restrict__ Out = (signed char *__restrict__) Arg->Out; - signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos; - unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; - int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0]; +void KerReduct_CC_NoScale_ReLUM_SQ8(KerConvLinReduct_SQ8_T *Arg) { + KER_ACT(ACT_RELUM, int, signed char, unsigned char, 8, 0); +} - for (int i=First; iFeat; - int W = Arg->W; - int H = Arg->H; - unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(H*W*Feat), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, Feat*H*W); - int * __restrict__ In = (int *__restrict__) Arg->In; - signed char * __restrict__ Out = (signed char *__restrict__) Arg->Out; - signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos; - unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; - int A0 = 
Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0]; +void KerReduct_CC_NoScale_HSwish_SQ8(KerConvLinReduct_SQ8_T *Arg) { + KER_ACT(ACT_HSWISH, int, signed char, unsigned char, 8, 0); +} - for (int i=First; iFeat; - int W = Arg->W; - int H = Arg->H; - unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(H*W*Feat), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, Feat*H*W); - int * __restrict__ In = (int *__restrict__) Arg->In; - signed char * __restrict__ Out = (signed char *__restrict__) Arg->Out; - signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos; - unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; - int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0]; - for (int i=First; iFeat; - int W = Arg->W; - int H = Arg->H; - unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(H*W*Feat), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, Feat*H*W); - int * __restrict__ In = (int *__restrict__) Arg->In; - unsigned char * __restrict__ Scale = (unsigned char *__restrict__) Arg->Scale; - unsigned char * __restrict__ ScaleN = (unsigned char *__restrict__) Arg->ScaleN; - signed char * __restrict__ Out = (signed char *__restrict__) Arg->Out; - signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos; - unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; - int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0]; +void KerReduct_CC_NoScale_ReLUM_SQ16(KerConvLinReduct_SQ8_T *Arg) { + KER_ACT(ACT_RELUM, int, signed short, unsigned short, 16, 0); +} - for (int i=First; iFeat; - int W = Arg->W; - int H = Arg->H; - unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(H*W*Feat), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, Feat*H*W); - int * __restrict__ In = (int *__restrict__) Arg->In; - signed char * __restrict__ Out = 
(signed char *__restrict__) Arg->Out; - signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos; - unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; - int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0]; +void KerReduct_CC_NoScale_LeakyReLU_SQ16(KerConvLinReduct_SQ8_T *Arg) { + KER_ACT(ACT_LEAKYRELU, int, signed short, unsigned short, 16, 0); +} - for (int i=First; iFeat; - int W = Arg->W; - int H = Arg->H; - unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(H*W*Feat), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, Feat*H*W); - int * __restrict__ In = (int *__restrict__) Arg->In; - signed char * __restrict__ Out = (signed char *__restrict__) Arg->Out; - signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos; - unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; - int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0]; - for (int i=First; iFeat; - int W = Arg->W; - int H = Arg->H; - unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(H*W*Feat), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, Feat*H*W); - int * __restrict__ In = (int *__restrict__) Arg->In; - signed char * __restrict__ Out = (signed char *__restrict__) Arg->Out; - signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos; - unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; - int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0]; - - for (int i=First; iFeat; - int W = Arg->W; - int H = Arg->H; - unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(H*W*Feat), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, Feat*H*W); - int * __restrict__ In = (int *__restrict__) Arg->In; - signed char * __restrict__ Out = (signed char *__restrict__) Arg->Out; - 
signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos; - unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; - int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0]; +void KerReduct_CC_NoScale_ReLUN_USQ8(KerConvLinReduct_SQ8_T *Arg) { + KER_ACT(ACT_RELUN, int, unsigned char, unsigned char, 8, 1); +} - for (int i=First; i> 8), ActScale, ActScaleN), 7); - } - gap_waitbarrier(0); +void KerReduct_CC_NoScale_ReLUM_USQ8(KerConvLinReduct_SQ8_T *Arg) { + KER_ACT(ACT_RELUM, int, unsigned char, unsigned char, 8, 1); } -void KerReduct_CC_NoScale_Tanh_SQ8(KerConvLinReduct_SQ8_T *Arg) +void KerReduct_CC_NoScale_ReLUMN_USQ8(KerConvLinReduct_SQ8_T *Arg) { + KER_ACT(ACT_RELUMN, int, unsigned char, unsigned char, 8, 1); +} -{ - int Feat = Arg->Feat; - int W = Arg->W; - int H = Arg->H; - unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(H*W*Feat), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, Feat*H*W); - int * __restrict__ In = (int *__restrict__) Arg->In; - signed char * __restrict__ Out = (signed char *__restrict__) Arg->Out; - signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos; - unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; - int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0]; +void KerReduct_CC_NoScale_HSigmoid_USQ8(KerConvLinReduct_SQ8_T *Arg) { + KER_ACT(ACT_HSIGMOID, int, unsigned char, unsigned char, 8, 1); +} - for (int i=First; i> 8), ActScale, ActScaleN), 7); - } - gap_waitbarrier(0); +void KerReduct_CC_NoScale_HSwish_USQ8(KerConvLinReduct_SQ8_T *Arg) { + KER_ACT(ACT_HSWISH, int, unsigned char, unsigned char, 8, 1); } +void KerReduct_CC_NoScale_LeakyReLU_USQ8(KerConvLinReduct_SQ8_T *Arg) { + KER_ACT(ACT_LEAKYRELU, int, unsigned char, unsigned char, 8, 1); +} -/* - * Input Scaling and reduction to 8b then channel cnetric 
activation, Out location != In location. Features are evaluated in parallel -*/ -void KerReduct_CC_NoScale_SQ16(KerConvLinReduct_SQ8_T *Arg) - -{ - int Feat = Arg->Feat; - int W = Arg->W; - int H = Arg->H; - unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(H*W*Feat), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, Feat*H*W); - int * __restrict__ In = (int *__restrict__) Arg->In; - signed short * __restrict__ Out = (signed short *__restrict__) Arg->Out; - signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos; - unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; - int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0]; - - for (int i=First; iFeat; - int W = Arg->W; - int H = Arg->H; - unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(H*W*Feat), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, Feat*H*W); - int * __restrict__ In = (int *__restrict__) Arg->In; - signed short * __restrict__ Out = (signed short *__restrict__) Arg->Out; - signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos; - unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; - int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0]; - for (int i=First; iFeat; - int W = Arg->W; - int H = Arg->H; - unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(H*W*Feat), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, Feat*H*W); - int * __restrict__ In = (int *__restrict__) Arg->In; - signed short * __restrict__ Out = (signed short *__restrict__) Arg->Out; - signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos; - unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; - int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0]; +void 
KerReduct_CC_NoScale_ReLU_USQ16(KerConvLinReduct_SQ8_T *Arg) { + KER_ACT(ACT_RELU, int, unsigned short, unsigned short, 16, 1); +} - for (int i=First; iFeat; - int W = Arg->W; - int H = Arg->H; - unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(H*W*Feat), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, Feat*H*W); - int * __restrict__ In = (int *__restrict__) Arg->In; - signed short * __restrict__ Out = (signed short *__restrict__) Arg->Out; - signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos; - unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; - int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0]; +void KerReduct_CC_NoScale_HSigmoid_USQ16(KerConvLinReduct_SQ8_T *Arg) { + KER_ACT(ACT_HSIGMOID, int, unsigned short, unsigned short, 16, 1); +} - for (int i=First; iFeat; - int W = Arg->W; - int H = Arg->H; - unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(H*W*Feat), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, Feat*H*W); - int * __restrict__ In = (int *__restrict__) Arg->In; - unsigned char * __restrict__ Scale = (unsigned char *__restrict__) Arg->Scale; - unsigned char * __restrict__ ScaleN = (unsigned char *__restrict__) Arg->ScaleN; - signed short * __restrict__ Out = (signed short *__restrict__) Arg->Out; - signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos; - unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; - int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0]; +void KerReduct_CC_NoScale_Tanh_USQ16(KerConvLinReduct_SQ8_T *Arg) { + KER_ACT(ACT_TANH, int, unsigned short, unsigned short, 16, 1); +} - for (int i=First; iFeat; - int W = Arg->W; - int H = Arg->H; - unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(H*W*Feat), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, Feat*H*W); - int * 
__restrict__ In = (int *__restrict__) Arg->In; - signed short * __restrict__ Out = (signed short *__restrict__) Arg->Out; - signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos; - unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; - int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0]; +void KerParReduct_CC_ReLU_HWC_SQ8(KerConvLinReduct_SQ8_T *Arg) { + KER_PAR_REDUCT_ACT_HWC(ACT_RELU, signed char, unsigned char, 8, 0); +} - for (int i=First; iFeat; - int W = Arg->W; - int H = Arg->H; - unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(H*W*Feat), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, Feat*H*W); - int * __restrict__ In = (int *__restrict__) Arg->In; - signed short * __restrict__ Out = (signed short *__restrict__) Arg->Out; - signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos; - unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; - int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0]; +void KerParReduct_CC_ReLUMN_HWC_SQ8(KerConvLinReduct_SQ8_T *Arg) { + KER_PAR_REDUCT_ACT_HWC(ACT_RELUMN, signed char, unsigned char, 8, 0); +} - for (int i=First; iFeat; - int W = Arg->W; - int H = Arg->H; - unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(H*W*Feat), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, Feat*H*W); - int * __restrict__ In = (int *__restrict__) Arg->In; - signed short * __restrict__ Out = (signed short *__restrict__) Arg->Out; - signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos; - unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; - int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0]; - - for (int i=First; iFeat; - int W = Arg->W; - int H = Arg->H; - unsigned int CoreId = gap_coreid(), 
ChunkCell = ChunkSize(H*W*Feat), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, Feat*H*W); - int * __restrict__ In = (int *__restrict__) Arg->In; - signed short * __restrict__ Out = (signed short *__restrict__) Arg->Out; - signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos; - unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; - int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0]; +void KerParReduct_CC_LeakyReLU_HWC_SQ8(KerConvLinReduct_SQ8_T *Arg) { + KER_PAR_REDUCT_ACT_HWC(ACT_LEAKYRELU, signed char, unsigned char, 8, 0); +} - for (int i=First; iFeat; - int W = Arg->W; - int H = Arg->H; - unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(H*W*Feat), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, Feat*H*W); - int * __restrict__ In = (int *__restrict__) Arg->In; - signed short * __restrict__ Out = (signed short *__restrict__) Arg->Out; - signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos; - unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; - int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0]; +/* ----------------------------------------------------- UnSigned 8 bits ----------------------------------------------------- */ +void KerParReduct_CC_HWC_USQ8(KerConvLinReduct_SQ8_T *Arg) { + KER_PAR_REDUCT_ACT_HWC(ACT_NONE, unsigned char, unsigned char, 8, 1); +} - for (int i=First; iFeat; - int W = Arg->W; - int H = Arg->H; - unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(H*W*Feat), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, Feat*H*W); - int * __restrict__ In = (int *__restrict__) Arg->In; - unsigned char * __restrict__ Out = (unsigned char *__restrict__) Arg->Out; - signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos; - unsigned int ActScale = ((unsigned char 
*)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; - int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0]; - - for (int i=First; iFeat; - int W = Arg->W; - int H = Arg->H; - unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(H*W*Feat), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, Feat*H*W); - int * __restrict__ In = (int *__restrict__) Arg->In; - unsigned char * __restrict__ Out = (unsigned char *__restrict__) Arg->Out; - signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos; - unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; - int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0]; +void KerParReduct_CC_LeakyReLU_HWC_USQ8(KerConvLinReduct_SQ8_T *Arg) { + KER_PAR_REDUCT_ACT_HWC(ACT_LEAKYRELU, unsigned char, unsigned char, 8, 1); +} - for (int i=First; iFeat; - int W = Arg->W; - int H = Arg->H; - unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(H*W*Feat), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, Feat*H*W); - int * __restrict__ In = (int *__restrict__) Arg->In; - unsigned char * __restrict__ Scale = (unsigned char *__restrict__) Arg->Scale; - unsigned char * __restrict__ ScaleN = (unsigned char *__restrict__) Arg->ScaleN; - unsigned char * __restrict__ Out = (unsigned char *__restrict__) Arg->Out; - signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos; - unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; - int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0]; - - for (int i=First; iFeat; - int W = Arg->W; - int H = Arg->H; - unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(H*W*Feat), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, Feat*H*W); - int * __restrict__ In = (int *__restrict__) Arg->In; - unsigned char * __restrict__ Out = (unsigned char 
*__restrict__) Arg->Out; - signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos; - unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; - int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0]; +/* ----------------------------------------------------- Signed 16 bits ---------------------------------------------------- */ +void KerParReduct_CC_HWC_SQ16(KerConvLinReduct_SQ8_T *Arg) { + KER_PAR_REDUCT_ACT_HWC(ACT_NONE, signed short, unsigned short, 16, 0); +} - for (int i=First; iFeat; - int W = Arg->W; - int H = Arg->H; - unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(H*W*Feat), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, Feat*H*W); - int * __restrict__ In = (int *__restrict__) Arg->In; - unsigned char * __restrict__ Out = (unsigned char *__restrict__) Arg->Out; - signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos; - unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; - int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0]; +void KerParReduct_CC_ReLUMN_HWC_SQ16(KerConvLinReduct_SQ8_T *Arg) { + KER_PAR_REDUCT_ACT_HWC(ACT_RELUMN, signed short, unsigned short, 16, 0); +} - for (int i=First; iFeat; - int W = Arg->W; - int H = Arg->H; - unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(H*W*Feat), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, Feat*H*W); - int * __restrict__ In = (int *__restrict__) Arg->In; - unsigned short * __restrict__ Out = (unsigned short *__restrict__) Arg->Out; - signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos; - unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; - int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0]; +void KerParReduct_CC_Sigmoid_HWC_SQ16(KerConvLinReduct_SQ8_T 
*Arg) { + KER_PAR_REDUCT_ACT_HWC(ACT_SIGMOID, signed short, unsigned short, 16, 0); +} - for (int i=First; iFeat; - int W = Arg->W; - int H = Arg->H; - unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(H*W*Feat), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, Feat*H*W); - int * __restrict__ In = (int *__restrict__) Arg->In; - unsigned short * __restrict__ Out = (unsigned short *__restrict__) Arg->Out; - signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos; - unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; - int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0]; +/* ----------------------------------------------------- UnSigned 16 bits ---------------------------------------------------- */ +void KerParReduct_CC_HWC_USQ16(KerConvLinReduct_SQ8_T *Arg) { + KER_PAR_REDUCT_ACT_HWC(ACT_NONE, unsigned short, unsigned short, 16, 1); +} - for (int i=First; iFeat; - int W = Arg->W; - int H = Arg->H; - unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(H*W*Feat), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, Feat*H*W); - int * __restrict__ In = (int *__restrict__) Arg->In; - unsigned short * __restrict__ Out = (unsigned short *__restrict__) Arg->Out; - signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos; - unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; - int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0]; +void KerParReduct_CC_ReLUM_HWC_USQ16(KerConvLinReduct_SQ8_T *Arg) { + KER_PAR_REDUCT_ACT_HWC(ACT_RELUM, unsigned short, unsigned short, 16, 1); +} - for (int i=First; iFeat; - int W = Arg->W; - int H = Arg->H; - unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(H*W*Feat), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, Feat*H*W); - int * __restrict__ In = (int *__restrict__) Arg->In; - unsigned short * 
__restrict__ Out = (unsigned short *__restrict__) Arg->Out; - signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos; - unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; - int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0]; +void KerParReduct_CC_HSwish_HWC_USQ16(KerConvLinReduct_SQ8_T *Arg) { + KER_PAR_REDUCT_ACT_HWC(ACT_HSWISH, unsigned short, unsigned short, 16, 1); +} - for (int i=First; iFeat; - int W = Arg->W; - int H = Arg->H; - unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(H*W*Feat), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, Feat*H*W); - int * __restrict__ In = (int *__restrict__) Arg->In; - unsigned short * __restrict__ Out = (unsigned short *__restrict__) Arg->Out; - signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos; - unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; - int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0]; - for (int i=First; iIn; @@ -330,6 +333,9 @@ void KerParLinearLayerFullFeatB8_SQ8(KerLinear_SQ8_T *Arg) unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(OutDim), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, OutDim); v4s * __restrict__ VectIn = (v4s *) In; + unsigned char * Infos = Arg->Infos; + unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; + int A0 = *((unsigned char *) &Infos[AT_INF_A0]); int B0 = *((unsigned char *) &Infos[AT_INF_B0]); int C0 = *((unsigned char *) &Infos[AT_INF_C0]); for (int i=First; iIn; - unsigned int InDim = Arg->InDim, OutDim = Arg->OutDim; - const signed char * __restrict__ Weights = Arg->Weights; - const signed char * __restrict__ Bias = Arg->Bias; - unsigned int NormBias = ((unsigned char *)Arg->Infos)[AT_INF_BIASN]; - unsigned char *Scale = Arg->Scale; - unsigned char 
*ScaleN = Arg->ScaleN; - signed char * __restrict__ Out = (signed char * __restrict__) Arg->Out; +void KerParLinearLayerFullFeatB8_ReLU_SQ8(KerLinear_SQ8_T *Arg) { + KerParLinearLayerFullFeatB8_SQ8_act(Arg, ACT_RELU); +} - unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(OutDim), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, OutDim); - v4s * __restrict__ VectIn = (v4s *) In; +void KerParLinearLayerFullFeatB8_ReLUN_SQ8(KerLinear_SQ8_T *Arg) { + KerParLinearLayerFullFeatB8_SQ8_act(Arg, ACT_RELUN); +} - for (int i=First; iIn; - unsigned int InDim = Arg->InDim, OutDim = Arg->OutDim; - const signed char * __restrict__ Weights = Arg->Weights; - const signed char * __restrict__ Bias = Arg->Bias; - unsigned int NormBias = ((unsigned char *)Arg->Infos)[AT_INF_BIASN]; - unsigned char *Scale = Arg->Scale; - unsigned char *ScaleN = Arg->ScaleN; - int A0 = Arg->Infos[AT_INF_A0]; - signed char * __restrict__ Out = (signed char * __restrict__) Arg->Out; +void KerParLinearLayerFullFeatB8_LeakyReLU_SQ8(KerLinear_SQ8_T *Arg) { + KerParLinearLayerFullFeatB8_SQ8_act(Arg, ACT_LEAKYRELU); +} - unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(OutDim), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, OutDim); - v4s * __restrict__ VectIn = (v4s *) In; +void KerParLinearLayerFullFeatB8_HSigmoid_SQ8(KerLinear_SQ8_T *Arg) { + KerParLinearLayerFullFeatB8_SQ8_act(Arg, ACT_HSIGMOID); +} - for (int i=First; iIn; unsigned int InDim = Arg->InDim, OutDim = Arg->OutDim; @@ -423,6 +412,9 @@ void KerParLinearLayerFullFeatB16_SQ8(KerLinear_SQ8_T *Arg) unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(OutDim), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, OutDim); v4s * __restrict__ VectIn = (v4s *) In; + unsigned char * Infos = Arg->Infos; + unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; + int A0 = *((unsigned char *) &Infos[AT_INF_A0]); int B0 = *((unsigned char *) 
&Infos[AT_INF_B0]); int C0 = *((unsigned char *) &Infos[AT_INF_C0]); for (int i=First; iIn; - unsigned int InDim = Arg->InDim, OutDim = Arg->OutDim; - const signed char * __restrict__ Weights = Arg->Weights; - const short int * __restrict__ Bias = Arg->Bias; - unsigned int NormBias = ((unsigned char *)Arg->Infos)[AT_INF_BIASN]; - unsigned char *Scale = Arg->Scale; - unsigned char *ScaleN = Arg->ScaleN; - signed char * __restrict__ Out = (signed char * __restrict__) Arg->Out; +void KerParLinearLayerFullFeatB16_ReLU_SQ8(KerLinear_SQ8_T *Arg) { + KerParLinearLayerFullFeatB16_SQ8_act(Arg, ACT_RELU); +} - unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(OutDim), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, OutDim); - v4s * __restrict__ VectIn = (v4s *) In; +void KerParLinearLayerFullFeatB16_ReLUN_SQ8(KerLinear_SQ8_T *Arg) { + KerParLinearLayerFullFeatB16_SQ8_act(Arg, ACT_RELUN); +} - for (int i=First; iIn; - unsigned int InDim = Arg->InDim, OutDim = Arg->OutDim; - const signed char * __restrict__ Weights = Arg->Weights; - const short int * __restrict__ Bias = Arg->Bias; - unsigned int NormBias = ((unsigned char *)Arg->Infos)[AT_INF_BIASN]; - unsigned char *Scale = Arg->Scale; - unsigned char *ScaleN = Arg->ScaleN; - int A0 = Arg->Infos[AT_INF_A0]; - signed char * __restrict__ Out = (signed char * __restrict__) Arg->Out; +void KerParLinearLayerFullFeatB16_LeakyReLU_SQ8(KerLinear_SQ8_T *Arg) { + KerParLinearLayerFullFeatB16_SQ8_act(Arg, ACT_LEAKYRELU); +} - unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(OutDim), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, OutDim); - v4s * __restrict__ VectIn = (v4s *) In; +void KerParLinearLayerFullFeatB16_HSigmoid_SQ8(KerLinear_SQ8_T *Arg) { + KerParLinearLayerFullFeatB16_SQ8_act(Arg, ACT_HSIGMOID); +} - for (int i=First; iIn; unsigned int InDim = Arg->InDim, OutDim = Arg->OutDim; @@ -513,6 +487,9 @@ void KerParLinearLayerFullFeatB32_SQ8(KerLinear_SQ8_T *Arg) unsigned char *Scale = Arg->Scale; 
unsigned char *ScaleN = Arg->ScaleN; signed char * __restrict__ Out = (signed char * __restrict__) Arg->Out; + unsigned char * Infos = Arg->Infos; + unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; + int A0 = *((unsigned char *) &Infos[AT_INF_A0]); int B0 = *((unsigned char *) &Infos[AT_INF_B0]); int C0 = *((unsigned char *) &Infos[AT_INF_C0]); unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(OutDim), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, OutDim); v4s * __restrict__ VectIn = (v4s *) In; @@ -527,70 +504,51 @@ void KerParLinearLayerFullFeatB32_SQ8(KerLinear_SQ8_T *Arg) } if (InDim&0x4) Acc = gap_sumdotp4(VectIn[InDim/4-1], W[InDim/4-1], Acc); for (int j=4*(InDim/4); jIn; - unsigned int InDim = Arg->InDim, OutDim = Arg->OutDim; - const signed char * __restrict__ Weights = Arg->Weights; - const int * __restrict__ Bias = Arg->Bias; - unsigned int NormBias = ((unsigned char *)Arg->Infos)[AT_INF_BIASN]; - unsigned char *Scale = Arg->Scale; - unsigned char *ScaleN = Arg->ScaleN; - signed char * __restrict__ Out = (signed char * __restrict__) Arg->Out; +void KerParLinearLayerFullFeatB32_ReLU_SQ8(KerLinear_SQ8_T *Arg) { + KerParLinearLayerFullFeatB32_SQ8_act(Arg, ACT_RELU); +} - unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(OutDim), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, OutDim); - v4s * __restrict__ VectIn = (v4s *) In; +void KerParLinearLayerFullFeatB32_ReLUN_SQ8(KerLinear_SQ8_T *Arg) { + KerParLinearLayerFullFeatB32_SQ8_act(Arg, ACT_RELUN); +} - for (int i=First; iIn; - unsigned int InDim = Arg->InDim, OutDim = Arg->OutDim; - const signed char * __restrict__ Weights = Arg->Weights; - const int * __restrict__ Bias = Arg->Bias; - unsigned int NormBias = ((unsigned char *)Arg->Infos)[AT_INF_BIASN]; - unsigned char *Scale = Arg->Scale; - unsigned char *ScaleN = Arg->ScaleN; - int A0 = Arg->Infos[AT_INF_A0]; - signed char * __restrict__ Out = (signed 
char * __restrict__) Arg->Out; +void KerParLinearLayerFullFeatB32_LeakyReLU_SQ8(KerLinear_SQ8_T *Arg) { + KerParLinearLayerFullFeatB32_SQ8_act(Arg, ACT_LEAKYRELU); +} - unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(OutDim), First = CoreId*ChunkCell, Last = Min(First+ChunkCell, OutDim); - v4s * __restrict__ VectIn = (v4s *) In; +void KerParLinearLayerFullFeatB32_HSigmoid_SQ8(KerLinear_SQ8_T *Arg) { + KerParLinearLayerFullFeatB32_SQ8_act(Arg, ACT_HSIGMOID); +} - for (int i=First; iTotalInFeatures, InFeatures = Arg->InFeatures, OutFeatures = Arg->OutFeatures; - for (unsigned int of=0; ofOutFirstCol; signed char * __restrict__ BufferColIn2 = Arg->BufferColIn2; int ColFirst = Arg->ColFirst; + unsigned char * Infos = (unsigned char *) Arg->Infos; + unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; + int A0 = *((unsigned char *) &Infos[AT_INF_A0]); int B0 = *((unsigned char *) &Infos[AT_INF_B0]); int C0 = *((unsigned char *) &Infos[AT_INF_C0]); unsigned int H_In2 = W_In1; unsigned int H_Out = H_In1; @@ -702,7 +707,11 @@ void KerParMatMulB8_SQ8(KerMatMul_SQ8_T *Arg) S3 += V0 * BufferColIn2[i+3*H_In2]; } unsigned int Sc = Scale[Line], ScN = ScaleN[Line]; - v4s R = gap_pack4(gap_clip(AT_SCALE(S0, Sc, ScN), 7), gap_clip(AT_SCALE(S1, Sc, ScN), 7), gap_clip(AT_SCALE(S2, Sc, ScN), 7), gap_clip(AT_SCALE(S3, Sc, ScN), 7)); + S0 = AT_SCALE(S0, Sc, ScN); ACT_SWITCH(S0, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + S1 = AT_SCALE(S1, Sc, ScN); ACT_SWITCH(S1, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + S2 = AT_SCALE(S2, Sc, ScN); ACT_SWITCH(S2, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + S3 = AT_SCALE(S3, Sc, ScN); ACT_SWITCH(S3, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + v4s R = gap_pack4(gap_clip(S0, 7), gap_clip(S1, 7), gap_clip(S2, 7), gap_clip(S3, 7)); *((v4s *) (Out+(Line+OffLine)*W_Out+4*Col+0+OffCol)) = R; } gap_waitbarrier(0); @@ -730,8 +739,10 @@ void 
KerParMatMulB8_SQ8(KerMatMul_SQ8_T *Arg) S1 += V0 * BufferColIn2[i+1*H_In2]; } unsigned int Sc = Scale[Line], ScN = ScaleN[Line]; - Out[(Line+OffLine)*W_Out+2*Col+0+OffCol] = gap_clip(AT_SCALE(S0, Sc, ScN), 7); - Out[(Line+OffLine)*W_Out+2*Col+1+OffCol] = gap_clip(AT_SCALE(S1, Sc, ScN), 7); + S0 = AT_SCALE(S0, Sc, ScN); ACT_SWITCH(S0, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + S1 = AT_SCALE(S1, Sc, ScN); ACT_SWITCH(S1, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + Out[(Line+OffLine)*W_Out+2*Col+0+OffCol] = gap_clip(S0, 7); + Out[(Line+OffLine)*W_Out+2*Col+1+OffCol] = gap_clip(S1, 7); } gap_waitbarrier(0); } @@ -754,136 +765,172 @@ void KerParMatMulB8_SQ8(KerMatMul_SQ8_T *Arg) S0 += V0 * BufferColIn2[i]; } unsigned int Sc = Scale[Line], ScN = ScaleN[Line]; - Out[(Line+OffLine)*W_Out+1*Col+0+OffCol] = gap_clip(AT_SCALE(S0, Sc, ScN), 7); + S0 = AT_SCALE(S0, Sc, ScN); ACT_SWITCH(S0, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + Out[(Line+OffLine)*W_Out+1*Col+0+OffCol] = gap_clip(S0, 7); } gap_waitbarrier(0); } } -void KerParMatMulB8_ReLU_SQ8(KerMatMul_SQ8_T *Arg) +void KerParMatMulB8_SQ8(KerMatMul_SQ8_T *Arg) { + KerParMatMulB8_SQ8_act(Arg, ACT_NONE); +} + +void KerParMatMulB8_ReLU_SQ8(KerMatMul_SQ8_T *Arg) { + KerParMatMulB8_SQ8_act(Arg, ACT_RELU); +} + +void KerParMatMulB8_ReLUN_SQ8(KerMatMul_SQ8_T *Arg) { + KerParMatMulB8_SQ8_act(Arg, ACT_RELUN); +} + +void KerParMatMulB8_ReLUM_SQ8(KerMatMul_SQ8_T *Arg) { + KerParMatMulB8_SQ8_act(Arg, ACT_RELUM); +} + +void KerParMatMulB8_ReLUMN_SQ8(KerMatMul_SQ8_T *Arg) { + KerParMatMulB8_SQ8_act(Arg, ACT_RELUMN); +} + +void KerParMatMulB8_LeakyReLU_SQ8(KerMatMul_SQ8_T *Arg) { + KerParMatMulB8_SQ8_act(Arg, ACT_LEAKYRELU); +} + +void KerParMatMulB8_HSwish_SQ8(KerMatMul_SQ8_T *Arg) { + KerParMatMulB8_SQ8_act(Arg, ACT_HSWISH); +} + +void KerParMatMulB8_HSigmoid_SQ8(KerMatMul_SQ8_T *Arg) { + KerParMatMulB8_SQ8_act(Arg, ACT_HSIGMOID); +} + +void KerParMatMulB8_Sigmoid_SQ8(KerMatMul_SQ8_T *Arg) { + 
KerParMatMulB8_SQ8_act(Arg, ACT_SIGMOID); +} + +void KerParMatMulB8_Tanh_SQ8(KerMatMul_SQ8_T *Arg) { + KerParMatMulB8_SQ8_act(Arg, ACT_TANH); +} + +static inline void __attribute__((always_inline)) KerParMatMulSxSyB8_SQ8_act( + KerMatMul_SQ8_T *Arg, + CNN_ActivationOper_T Activation) { - /* - Column buffer has to be sized in order to be able to accomodate up to 4 columns of size H_In2 - */ - signed char * __restrict__ In1 = Arg->In1; - unsigned int W_In1 = Arg->W_In1; - unsigned int H_In1 = Arg->H_In1; - signed char * __restrict__ In2 = Arg->In2; - unsigned int W_In2 = Arg->W_In2; - signed char * __restrict__ Bias = Arg->Bias; - signed char * __restrict__ Out = Arg->Out; - unsigned int W_Out = Arg->W_Out; +/* + In1 is usually the Conv1x1 filter set, e,g In1 is [OutFeat][InFeat] + In2 is [InFeat][Width*Height] + + When we receive tiles In2 and if StrideY is != 1 tile is always [OutFeat][K*(Width*Scy)] +*/ + signed char * __restrict__ In1 = Arg->In1; + unsigned int W_In1 = Arg->W_In1; + unsigned int H_In1 = Arg->H_In1; + signed char * __restrict__ In2 = Arg->In2; + unsigned int W_In2 = Arg->W_In2; + signed char * __restrict__ Bias = Arg->Bias; + signed char * __restrict__ Out = Arg->Out; + unsigned int W_Out = Arg->W_Out; + int Pi = Arg->OutFirstCol; + signed char *BufferColIn2 = Arg->BufferColIn2; + unsigned int NormBias = Arg->NormBias; + int Wi = Arg->W, Hi = Arg->H; + int Sx = Arg->Sx, Sy = Arg->Sy; + int ColFirst = Arg->ColFirst; unsigned char * __restrict__ Scale = Arg->Scale; unsigned char * __restrict__ ScaleN = Arg->ScaleN; - unsigned int NormBias = Arg->NormBias; - unsigned int OutFirstCol = Arg->OutFirstCol; - signed char * __restrict__ BufferColIn2 = Arg->BufferColIn2; - int ColFirst = Arg->ColFirst; + unsigned char * Infos = (unsigned char *) Arg->Infos; + unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; + int A0 = *((unsigned char *) &Infos[AT_INF_A0]); int B0 = *((unsigned char 
*) &Infos[AT_INF_B0]); int C0 = *((unsigned char *) &Infos[AT_INF_C0]); - unsigned int H_In2 = W_In1; - unsigned int H_Out = H_In1; - unsigned int Line, Col, i; - v4s * __restrict__ VBuff0 = (v4s *) BufferColIn2; - v4s * __restrict__ VBuff1 = (v4s *) (BufferColIn2+H_In2); - v4s * __restrict__ VBuff2 = (v4s *) (BufferColIn2+2*H_In2); - v4s * __restrict__ VBuff3 = (v4s *) (BufferColIn2+3*H_In2); + unsigned int H_In2 = W_In1; + unsigned int H_Out = H_In1; - unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(H_In1), First = CoreId*ChunkCell, Last = Min(H_In1, First+ChunkCell); - unsigned int C = ChunkSize(H_In2), F = CoreId*C, L = Min(H_In2, F+C); - int OffLine = 0, OffCol = 0; + int Wo = (Wi+Sx-1)/Sx, Ho = (Hi+Sy-1)/Sy; + int Oo, OffLine; + int At, F=0, L = W_In2; - if (ColFirst) OffLine = OutFirstCol; else OffCol = OutFirstCol; - for (Col=0; Col0) { + for (i=Fi;iH_In1; signed char * __restrict__ In2 = Arg->In2; unsigned int W_In2 = Arg->W_In2; - signed char * __restrict__ Bias = Arg->Bias; + short int * __restrict__ Bias = Arg->Bias; signed char * __restrict__ Out = Arg->Out; unsigned int W_Out = Arg->W_Out; unsigned char * __restrict__ Scale = Arg->Scale; @@ -903,7 +950,9 @@ void KerParMatMulB8_ReLUN_SQ8(KerMatMul_SQ8_T *Arg) unsigned int OutFirstCol = Arg->OutFirstCol; signed char * __restrict__ BufferColIn2 = Arg->BufferColIn2; int ColFirst = Arg->ColFirst; - int A0 = Arg->Infos[AT_INF_A0]; + unsigned char * Infos = (unsigned char *) Arg->Infos; + unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; + int A0 = *((unsigned char *) &Infos[AT_INF_A0]); int B0 = *((unsigned char *) &Infos[AT_INF_B0]); int C0 = *((unsigned char *) &Infos[AT_INF_C0]); unsigned int H_In2 = W_In1; unsigned int H_Out = H_In1; @@ -948,8 +997,11 @@ void KerParMatMulB8_ReLUN_SQ8(KerMatMul_SQ8_T *Arg) S3 += V0 * BufferColIn2[i+3*H_In2]; } unsigned int Sc = Scale[Line], ScN = ScaleN[Line]; - v4s R = 
gap_pack4(gap_clip(AT_CLIP_POS(AT_SCALE(S0, Sc, ScN), A0), 7), gap_clip(AT_CLIP_POS(AT_SCALE(S1, Sc, ScN), A0), 7), - gap_clip(AT_CLIP_POS(AT_SCALE(S2, Sc, ScN), A0), 7), gap_clip(AT_CLIP_POS(AT_SCALE(S3, Sc, ScN), A0), 7)); + S0 = AT_SCALE(S0, Sc, ScN); ACT_SWITCH(S0, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + S1 = AT_SCALE(S1, Sc, ScN); ACT_SWITCH(S1, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + S2 = AT_SCALE(S2, Sc, ScN); ACT_SWITCH(S2, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + S3 = AT_SCALE(S3, Sc, ScN); ACT_SWITCH(S3, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + v4s R = gap_pack4(gap_clip(S0, 7), gap_clip(S1, 7), gap_clip(S2, 7), gap_clip(S3, 7)); *((v4s *) (Out+(Line+OffLine)*W_Out+4*Col+0+OffCol)) = R; } gap_waitbarrier(0); @@ -977,8 +1029,10 @@ void KerParMatMulB8_ReLUN_SQ8(KerMatMul_SQ8_T *Arg) S1 += V0 * BufferColIn2[i+1*H_In2]; } unsigned int Sc = Scale[Line], ScN = ScaleN[Line]; - Out[(Line+OffLine)*W_Out+2*Col+0+OffCol] = gap_clip(AT_CLIP_POS(AT_SCALE(S0, Sc, ScN), A0), 7); - Out[(Line+OffLine)*W_Out+2*Col+1+OffCol] = gap_clip(AT_CLIP_POS(AT_SCALE(S1, Sc, ScN), A0), 7); + S0 = AT_SCALE(S0, Sc, ScN); ACT_SWITCH(S0, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + S1 = AT_SCALE(S1, Sc, ScN); ACT_SWITCH(S1, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + Out[(Line+OffLine)*W_Out+2*Col+0+OffCol] = gap_clip(S0, 7); + Out[(Line+OffLine)*W_Out+2*Col+1+OffCol] = gap_clip(S1, 7); } gap_waitbarrier(0); } @@ -1001,13 +1055,57 @@ void KerParMatMulB8_ReLUN_SQ8(KerMatMul_SQ8_T *Arg) S0 += V0 * BufferColIn2[i]; } unsigned int Sc = Scale[Line], ScN = ScaleN[Line]; - Out[(Line+OffLine)*W_Out+1*Col+0+OffCol] = gap_clip(AT_CLIP_POS(AT_SCALE(S0, Sc, ScN), A0), 7); + S0 = AT_SCALE(S0, Sc, ScN); ACT_SWITCH(S0, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + Out[(Line+OffLine)*W_Out+1*Col+0+OffCol] = gap_clip(S0, 7); } gap_waitbarrier(0); } } -void KerParMatMulSxSyB8_SQ8(KerMatMul_SQ8_T *Arg) +void 
KerParMatMulB16_SQ8(KerMatMul_SQ8_T *Arg) { + KerParMatMulB16_SQ8_act(Arg, ACT_NONE); +} + +void KerParMatMulB16_ReLU_SQ8(KerMatMul_SQ8_T *Arg) { + KerParMatMulB16_SQ8_act(Arg, ACT_RELU); +} + +void KerParMatMulB16_ReLUN_SQ8(KerMatMul_SQ8_T *Arg) { + KerParMatMulB16_SQ8_act(Arg, ACT_RELUN); +} + +void KerParMatMulB16_ReLUM_SQ8(KerMatMul_SQ8_T *Arg) { + KerParMatMulB16_SQ8_act(Arg, ACT_RELUM); +} + +void KerParMatMulB16_ReLUMN_SQ8(KerMatMul_SQ8_T *Arg) { + KerParMatMulB16_SQ8_act(Arg, ACT_RELUMN); +} + +void KerParMatMulB16_LeakyReLU_SQ8(KerMatMul_SQ8_T *Arg) { + KerParMatMulB16_SQ8_act(Arg, ACT_LEAKYRELU); +} + +void KerParMatMulB16_HSwish_SQ8(KerMatMul_SQ8_T *Arg) { + KerParMatMulB16_SQ8_act(Arg, ACT_HSWISH); +} + +void KerParMatMulB16_HSigmoid_SQ8(KerMatMul_SQ8_T *Arg) { + KerParMatMulB16_SQ8_act(Arg, ACT_HSIGMOID); +} + +void KerParMatMulB16_Sigmoid_SQ8(KerMatMul_SQ8_T *Arg) { + KerParMatMulB16_SQ8_act(Arg, ACT_SIGMOID); +} + +void KerParMatMulB16_Tanh_SQ8(KerMatMul_SQ8_T *Arg) { + KerParMatMulB16_SQ8_act(Arg, ACT_TANH); +} + + +static inline void __attribute__((always_inline)) KerParMatMulSxSyB16_SQ8_act( + KerMatMul_SQ8_T *Arg, + CNN_ActivationOper_T Activation) { /* @@ -1021,7 +1119,7 @@ void KerParMatMulSxSyB8_SQ8(KerMatMul_SQ8_T *Arg) unsigned int H_In1 = Arg->H_In1; signed char * __restrict__ In2 = Arg->In2; unsigned int W_In2 = Arg->W_In2; - signed char * __restrict__ Bias = Arg->Bias; + short int * __restrict__ Bias = Arg->Bias; signed char * __restrict__ Out = Arg->Out; unsigned int W_Out = Arg->W_Out; int Pi = Arg->OutFirstCol; @@ -1032,6 +1130,9 @@ void KerParMatMulSxSyB8_SQ8(KerMatMul_SQ8_T *Arg) int ColFirst = Arg->ColFirst; unsigned char * __restrict__ Scale = Arg->Scale; unsigned char * __restrict__ ScaleN = Arg->ScaleN; + unsigned char * Infos = (unsigned char *) Arg->Infos; + unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; + int A0 = *((unsigned char *) 
&Infos[AT_INF_A0]); int B0 = *((unsigned char *) &Infos[AT_INF_B0]); int C0 = *((unsigned char *) &Infos[AT_INF_C0]); unsigned int H_In2 = W_In1; unsigned int H_Out = H_In1; @@ -1062,7 +1163,8 @@ void KerParMatMulSxSyB8_SQ8(KerMatMul_SQ8_T *Arg) if (W_In1&0x4) S = gap_sumdotp4(VIn1[W_In1/4-1], VBuff[W_In1/4-1], S); for (i=(W_In1/4)*4; iIn1; - unsigned int W_In1 = Arg->W_In1; - unsigned int H_In1 = Arg->H_In1; - signed char * __restrict__ In2 = Arg->In2; - unsigned int W_In2 = Arg->W_In2; - signed char * __restrict__ Bias = Arg->Bias; - signed char * __restrict__ Out = Arg->Out; - unsigned int W_Out = Arg->W_Out; - int Pi = Arg->OutFirstCol; - signed char *BufferColIn2 = Arg->BufferColIn2; - unsigned int NormBias = Arg->NormBias; - int Wi = Arg->W, Hi = Arg->H; - int Sx = Arg->Sx, Sy = Arg->Sy; - int ColFirst = Arg->ColFirst; - unsigned char * __restrict__ Scale = Arg->Scale; - unsigned char * __restrict__ ScaleN = Arg->ScaleN; +void KerParMatMulSxSyB16_ReLUN_SQ8(KerMatMul_SQ8_T *Arg) { + KerParMatMulSxSyB16_SQ8_act(Arg, ACT_RELUN); +} - unsigned int H_In2 = W_In1; - unsigned int H_Out = H_In1; +void KerParMatMulSxSyB16_ReLUM_SQ8(KerMatMul_SQ8_T *Arg) { + KerParMatMulSxSyB16_SQ8_act(Arg, ACT_RELUM); +} - int Wo = (Wi+Sx-1)/Sx, Ho = (Hi+Sy-1)/Sy; - int Oo, OffLine; - int At, F=0, L = W_In2; - - unsigned int Line, Col, i; - v4s *VBuff = (v4s *) BufferColIn2; - - unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(H_In1), First = CoreId*ChunkCell, Last = Min(H_In1, First+ChunkCell); - unsigned int Ci = ChunkSize(H_In2), Fi = CoreId*Ci, Li = Min(H_In2, Fi+Ci); - - At = 0; OffLine = 0; Oo = 0; - if (ColFirst) OffLine = Pi; else Oo = Pi; - - while (L>0) { - for (i=Fi;iIn1; - unsigned int W_In1 = Arg->W_In1; - unsigned int H_In1 = Arg->H_In1; - signed char * __restrict__ In2 = Arg->In2; - unsigned int W_In2 = Arg->W_In2; - signed char * __restrict__ Bias = Arg->Bias; - signed char * __restrict__ Out = Arg->Out; - unsigned int W_Out = Arg->W_Out; - int Pi = 
Arg->OutFirstCol; - signed char *BufferColIn2 = Arg->BufferColIn2; - unsigned int NormBias = Arg->NormBias; - int Wi = Arg->W, Hi = Arg->H; - int Sx = Arg->Sx, Sy = Arg->Sy; - int ColFirst = Arg->ColFirst; - unsigned char * __restrict__ Scale = Arg->Scale; - unsigned char * __restrict__ ScaleN = Arg->ScaleN; - int A0 = Arg->Infos[AT_INF_A0]; - - unsigned int H_In2 = W_In1; - unsigned int H_Out = H_In1; - - int Wo = (Wi+Sx-1)/Sx, Ho = (Hi+Sy-1)/Sy; - int Oo, OffLine; - int At, F=0, L = W_In2; +void KerParMatMulSxSyB16_LeakyReLU_SQ8(KerMatMul_SQ8_T *Arg) { + KerParMatMulSxSyB16_SQ8_act(Arg, ACT_LEAKYRELU); +} - unsigned int Line, Col, i; - v4s *VBuff = (v4s *) BufferColIn2; +void KerParMatMulSxSyB16_HSwish_SQ8(KerMatMul_SQ8_T *Arg) { + KerParMatMulSxSyB16_SQ8_act(Arg, ACT_HSWISH); +} - unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(H_In1), First = CoreId*ChunkCell, Last = Min(H_In1, First+ChunkCell); - unsigned int Ci = ChunkSize(H_In2), Fi = CoreId*Ci, Li = Min(H_In2, Fi+Ci); +void KerParMatMulSxSyB16_HSigmoid_SQ8(KerMatMul_SQ8_T *Arg) { + KerParMatMulSxSyB16_SQ8_act(Arg, ACT_HSIGMOID); +} - At = 0; OffLine = 0; Oo = 0; - if (ColFirst) OffLine = Pi; else Oo = Pi; +void KerParMatMulSxSyB16_Sigmoid_SQ8(KerMatMul_SQ8_T *Arg) { + KerParMatMulSxSyB16_SQ8_act(Arg, ACT_SIGMOID); +} - while (L>0) { - for (i=Fi;iH_In1; signed char * __restrict__ In2 = Arg->In2; unsigned int W_In2 = Arg->W_In2; - short int * __restrict__ Bias = Arg->Bias; + int * __restrict__ Bias = Arg->Bias; signed char * __restrict__ Out = Arg->Out; unsigned int W_Out = Arg->W_Out; unsigned char * __restrict__ Scale = Arg->Scale; @@ -1233,6 +1242,9 @@ void KerParMatMulB16_SQ8(KerMatMul_SQ8_T *Arg) unsigned int OutFirstCol = Arg->OutFirstCol; signed char * __restrict__ BufferColIn2 = Arg->BufferColIn2; int ColFirst = Arg->ColFirst; + unsigned char * Infos = (unsigned char *) Arg->Infos; + unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char 
*)Infos)[AT_INF_ACTSCALEN]; + int A0 = *((unsigned char *) &Infos[AT_INF_A0]); int B0 = *((unsigned char *) &Infos[AT_INF_B0]); int C0 = *((unsigned char *) &Infos[AT_INF_C0]); unsigned int H_In2 = W_In1; unsigned int H_Out = H_In1; @@ -1243,6 +1255,7 @@ void KerParMatMulB16_SQ8(KerMatMul_SQ8_T *Arg) v4s * __restrict__ VBuff3 = (v4s *) (BufferColIn2+3*H_In2); unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(H_In1), First = CoreId*ChunkCell, Last = Min(H_In1, First+ChunkCell); + unsigned int Iter = (Last>First)?(Last-First):0; unsigned int C = ChunkSize(H_In2), F = CoreId*C, L = Min(H_In2, F+C); int OffLine = 0, OffCol = 0; @@ -1259,27 +1272,87 @@ void KerParMatMulB16_SQ8(KerMatMul_SQ8_T *Arg) BufferColIn2[i+3*H_In2] = X3; // In2[i*W_In2+4*Col+3]; } gap_waitbarrier(0); - for (Line=First; LineIn1; - unsigned int W_In1 = Arg->W_In1; - unsigned int H_In1 = Arg->H_In1; - signed char * __restrict__ In2 = Arg->In2; - unsigned int W_In2 = Arg->W_In2; - short int * __restrict__ Bias = Arg->Bias; - signed char * __restrict__ Out = Arg->Out; - unsigned int W_Out = Arg->W_Out; +/* + In1 is usually the Conv1x1 filter set, e,g In1 is [OutFeat][InFeat] + In2 is [InFeat][Width*Height] + + When we receive tiles In2 and if StrideY is != 1 tile is always [OutFeat][K*(Width*Scy)] +*/ + signed char * __restrict__ In1 = Arg->In1; + unsigned int W_In1 = Arg->W_In1; + unsigned int H_In1 = Arg->H_In1; + signed char * __restrict__ In2 = Arg->In2; + unsigned int W_In2 = Arg->W_In2; + int * __restrict__ Bias = Arg->Bias; + signed char * __restrict__ Out = Arg->Out; + unsigned int W_Out = Arg->W_Out; + int Pi = Arg->OutFirstCol; + signed char *BufferColIn2 = Arg->BufferColIn2; + unsigned int NormBias = Arg->NormBias; + int Wi = Arg->W, Hi = Arg->H; + int Sx = Arg->Sx, Sy = Arg->Sy; + int ColFirst = Arg->ColFirst; unsigned char * __restrict__ Scale = Arg->Scale; unsigned char * __restrict__ ScaleN = Arg->ScaleN; - unsigned int NormBias = Arg->NormBias; - unsigned int OutFirstCol = 
Arg->OutFirstCol; - signed char * __restrict__ BufferColIn2 = Arg->BufferColIn2; - int ColFirst = Arg->ColFirst; + unsigned char * Infos = (unsigned char *) Arg->Infos; + unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; + int A0 = *((unsigned char *) &Infos[AT_INF_A0]); int B0 = *((unsigned char *) &Infos[AT_INF_B0]); int C0 = *((unsigned char *) &Infos[AT_INF_C0]); - unsigned int H_In2 = W_In1; - unsigned int H_Out = H_In1; - unsigned int Line, Col, i; - v4s * __restrict__ VBuff0 = (v4s *) BufferColIn2; - v4s * __restrict__ VBuff1 = (v4s *) (BufferColIn2+H_In2); - v4s * __restrict__ VBuff2 = (v4s *) (BufferColIn2+2*H_In2); - v4s * __restrict__ VBuff3 = (v4s *) (BufferColIn2+3*H_In2); + unsigned int H_In2 = W_In1; + unsigned int H_Out = H_In1; - unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(H_In1), First = CoreId*ChunkCell, Last = Min(H_In1, First+ChunkCell); - unsigned int C = ChunkSize(H_In2), F = CoreId*C, L = Min(H_In2, F+C); - int OffLine = 0, OffCol = 0; - - if (ColFirst) OffLine = OutFirstCol; else OffCol = OutFirstCol; - for (Col=0; ColIn1; - unsigned int W_In1 = Arg->W_In1; - unsigned int H_In1 = Arg->H_In1; - signed char * __restrict__ In2 = Arg->In2; - unsigned int W_In2 = Arg->W_In2; - short int * __restrict__ Bias = Arg->Bias; - signed char * __restrict__ Out = Arg->Out; - unsigned int W_Out = Arg->W_Out; - unsigned char * __restrict__ Scale = Arg->Scale; - unsigned char * __restrict__ ScaleN = Arg->ScaleN; - unsigned int NormBias = Arg->NormBias; - unsigned int OutFirstCol = Arg->OutFirstCol; - signed char * __restrict__ BufferColIn2 = Arg->BufferColIn2; - int ColFirst = Arg->ColFirst; - int A0 = Arg->Infos[AT_INF_A0]; - - unsigned int H_In2 = W_In1; - unsigned int H_Out = H_In1; - unsigned int Line, Col, i; - v4s * __restrict__ VBuff0 = (v4s *) BufferColIn2; - v4s * __restrict__ VBuff1 = (v4s *) (BufferColIn2+H_In2); - v4s * __restrict__ VBuff2 = (v4s *) 
(BufferColIn2+2*H_In2); - v4s * __restrict__ VBuff3 = (v4s *) (BufferColIn2+3*H_In2); - - unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(H_In1), First = CoreId*ChunkCell, Last = Min(H_In1, First+ChunkCell); - unsigned int C = ChunkSize(H_In2), F = CoreId*C, L = Min(H_In2, F+C); - int OffLine = 0, OffCol = 0; - - if (ColFirst) OffLine = OutFirstCol; else OffCol = OutFirstCol; - for (Col=0; ColIn1; - unsigned int W_In1 = Arg->W_In1; - unsigned int H_In1 = Arg->H_In1; - signed char * __restrict__ In2 = Arg->In2; - unsigned int W_In2 = Arg->W_In2; - short int * __restrict__ Bias = Arg->Bias; - signed char * __restrict__ Out = Arg->Out; - unsigned int W_Out = Arg->W_Out; - int Pi = Arg->OutFirstCol; - signed char *BufferColIn2 = Arg->BufferColIn2; - unsigned int NormBias = Arg->NormBias; - int Wi = Arg->W, Hi = Arg->H; - int Sx = Arg->Sx, Sy = Arg->Sy; - int ColFirst = Arg->ColFirst; - unsigned char * __restrict__ Scale = Arg->Scale; - unsigned char * __restrict__ ScaleN = Arg->ScaleN; - - unsigned int H_In2 = W_In1; - unsigned int H_Out = H_In1; - - int Wo = (Wi+Sx-1)/Sx, Ho = (Hi+Sy-1)/Sy; - int Oo, OffLine; - int At, F=0, L = W_In2; + int Wo = (Wi+Sx-1)/Sx, Ho = (Hi+Sy-1)/Sy; + int Oo, OffLine; + int At, F=0, L = W_In2; unsigned int Line, Col, i; v4s *VBuff = (v4s *) BufferColIn2; @@ -1629,7 +1509,7 @@ void KerParMatMulSxSyB16_SQ8(KerMatMul_SQ8_T *Arg) gap_waitbarrier(0); for (Line=First; LineIn1; - unsigned int W_In1 = Arg->W_In1; - unsigned int H_In1 = Arg->H_In1; - signed char * __restrict__ In2 = Arg->In2; - unsigned int W_In2 = Arg->W_In2; - short int * __restrict__ Bias = Arg->Bias; - signed char * __restrict__ Out = Arg->Out; - unsigned int W_Out = Arg->W_Out; - int Pi = Arg->OutFirstCol; - signed char *BufferColIn2 = Arg->BufferColIn2; - unsigned int NormBias = Arg->NormBias; - int Wi = Arg->W, Hi = Arg->H; - int Sx = Arg->Sx, Sy = Arg->Sy; - int ColFirst = Arg->ColFirst; - unsigned char * __restrict__ Scale = Arg->Scale; - unsigned char * 
__restrict__ ScaleN = Arg->ScaleN; +void KerParMatMulSxSyB32_ReLUN_SQ8(KerMatMul_SQ8_T *Arg) { + KerParMatMulSxSyB32_SQ8_act(Arg, ACT_RELUN); +} - unsigned int H_In2 = W_In1; - unsigned int H_Out = H_In1; +void KerParMatMulSxSyB32_ReLUM_SQ8(KerMatMul_SQ8_T *Arg) { + KerParMatMulSxSyB32_SQ8_act(Arg, ACT_RELUM); +} - int Wo = (Wi+Sx-1)/Sx, Ho = (Hi+Sy-1)/Sy; - int Oo, OffLine; - int At, F=0, L = W_In2; +void KerParMatMulSxSyB32_ReLUMN_SQ8(KerMatMul_SQ8_T *Arg) { + KerParMatMulSxSyB32_SQ8_act(Arg, ACT_RELUMN); +} - unsigned int Line, Col, i; - v4s *VBuff = (v4s *) BufferColIn2; +void KerParMatMulSxSyB32_LeakyReLU_SQ8(KerMatMul_SQ8_T *Arg) { + KerParMatMulSxSyB32_SQ8_act(Arg, ACT_LEAKYRELU); +} - unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(H_In1), First = CoreId*ChunkCell, Last = Min(H_In1, First+ChunkCell); - unsigned int Ci = ChunkSize(H_In2), Fi = CoreId*Ci, Li = Min(H_In2, Fi+Ci); +void KerParMatMulSxSyB32_HSwish_SQ8(KerMatMul_SQ8_T *Arg) { + KerParMatMulSxSyB32_SQ8_act(Arg, ACT_HSWISH); +} - At = 0; OffLine = 0; Oo = 0; - if (ColFirst) OffLine = Pi; else Oo = Pi; +void KerParMatMulSxSyB32_HSigmoid_SQ8(KerMatMul_SQ8_T *Arg) { + KerParMatMulSxSyB32_SQ8_act(Arg, ACT_HSIGMOID); +} - while (L>0) { - for (i=Fi;iIn1; unsigned int W_In1 = Arg->W_In1; unsigned int H_In1 = Arg->H_In1; signed char * __restrict__ In2 = Arg->In2; - unsigned int W_In2 = Arg->W_In2; - short int * __restrict__ Bias = Arg->Bias; + unsigned int H_In2 = Arg->W_In2; + unsigned int W_In2 = W_In1; + signed char * __restrict__ Bias = Arg->Bias; signed char * __restrict__ Out = Arg->Out; - unsigned int W_Out = Arg->W_Out; - int Pi = Arg->OutFirstCol; - signed char *BufferColIn2 = Arg->BufferColIn2; - unsigned int NormBias = Arg->NormBias; - int Wi = Arg->W, Hi = Arg->H; - int Sx = Arg->Sx, Sy = Arg->Sy; - int ColFirst = Arg->ColFirst; unsigned char * __restrict__ Scale = Arg->Scale; unsigned char * __restrict__ ScaleN = Arg->ScaleN; - int A0 = Arg->Infos[AT_INF_A0]; + unsigned int NormBias 
= Arg->NormBias; + unsigned int CoreId = gap_coreid(), Chunk = ChunkSize(H_In2), First = Chunk*CoreId, Last = Min(First+Chunk, H_In2); + unsigned int Iter = (Last>First)?(Last-First):0; + unsigned char * Infos = (unsigned char *) Arg->Infos; + unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; + int A0 = *((unsigned char *) &Infos[AT_INF_A0]); int B0 = *((unsigned char *) &Infos[AT_INF_B0]); int C0 = *((unsigned char *) &Infos[AT_INF_C0]); - unsigned int H_In2 = W_In1; - unsigned int H_Out = H_In1; + for (int i=0; i0) { - for (i=Fi;iIn1; - unsigned int W_In1 = Arg->W_In1; - unsigned int H_In1 = Arg->H_In1; - signed char * __restrict__ In2 = Arg->In2; - unsigned int W_In2 = Arg->W_In2; - int * __restrict__ Bias = Arg->Bias; - signed char * __restrict__ Out = Arg->Out; - unsigned int W_Out = Arg->W_Out; + signed char * __restrict__ In1 = Arg->In1; + unsigned int W_In1 = Arg->W_In1; + unsigned int H_In1 = Arg->H_In1; + signed char * __restrict__ In2 = Arg->In2; + unsigned int H_In2 = Arg->W_In2; + unsigned int W_In2 = W_In1; + short int * __restrict__ Bias = Arg->Bias; + signed char * __restrict__ Out = Arg->Out; unsigned char * __restrict__ Scale = Arg->Scale; unsigned char * __restrict__ ScaleN = Arg->ScaleN; unsigned int NormBias = Arg->NormBias; - unsigned int OutFirstCol = Arg->OutFirstCol; - signed char * __restrict__ BufferColIn2 = Arg->BufferColIn2; - int ColFirst = Arg->ColFirst; - - unsigned int H_In2 = W_In1; - unsigned int H_Out = H_In1; - unsigned int Line, Col, i; - v4s * __restrict__ VBuff0 = (v4s *) BufferColIn2; - v4s * __restrict__ VBuff1 = (v4s *) (BufferColIn2+H_In2); - v4s * __restrict__ VBuff2 = (v4s *) (BufferColIn2+2*H_In2); - v4s * __restrict__ VBuff3 = (v4s *) (BufferColIn2+3*H_In2); - - unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(H_In1), First = CoreId*ChunkCell, Last = Min(H_In1, First+ChunkCell); - unsigned int C = ChunkSize(H_In2), F = CoreId*C, L = 
Min(H_In2, F+C); - int OffLine = 0, OffCol = 0; + unsigned int CoreId = gap_coreid(), Chunk = ChunkSize(H_In2), First = Chunk*CoreId, Last = Min(First+Chunk, H_In2); + unsigned int Iter = (Last>First)?(Last-First):0; + unsigned char * Infos = (unsigned char *) Arg->Infos; + unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; + int A0 = *((unsigned char *) &Infos[AT_INF_A0]); int B0 = *((unsigned char *) &Infos[AT_INF_B0]); int C0 = *((unsigned char *) &Infos[AT_INF_C0]); - if (ColFirst) OffLine = OutFirstCol; else OffCol = OutFirstCol; - for (Col=0; ColIn1; - unsigned int W_In1 = Arg->W_In1; - unsigned int H_In1 = Arg->H_In1; - signed char * __restrict__ In2 = Arg->In2; - unsigned int W_In2 = Arg->W_In2; - int * __restrict__ Bias = Arg->Bias; - signed char * __restrict__ Out = Arg->Out; - unsigned int W_Out = Arg->W_Out; + signed char * __restrict__ In1 = Arg->In1; + unsigned int W_In1 = Arg->W_In1; + unsigned int H_In1 = Arg->H_In1; + signed char * __restrict__ In2 = Arg->In2; + unsigned int H_In2 = Arg->W_In2; + unsigned int W_In2 = W_In1; + int * __restrict__ Bias = Arg->Bias; + signed char * __restrict__ Out = Arg->Out; unsigned char * __restrict__ Scale = Arg->Scale; unsigned char * __restrict__ ScaleN = Arg->ScaleN; unsigned int NormBias = Arg->NormBias; - unsigned int OutFirstCol = Arg->OutFirstCol; - signed char * __restrict__ BufferColIn2 = Arg->BufferColIn2; - int ColFirst = Arg->ColFirst; - - unsigned int H_In2 = W_In1; - unsigned int H_Out = H_In1; - unsigned int Line, Col, i; - v4s * __restrict__ VBuff0 = (v4s *) BufferColIn2; - v4s * __restrict__ VBuff1 = (v4s *) (BufferColIn2+H_In2); - v4s * __restrict__ VBuff2 = (v4s *) (BufferColIn2+2*H_In2); - v4s * __restrict__ VBuff3 = (v4s *) (BufferColIn2+3*H_In2); - - unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(H_In1), First = CoreId*ChunkCell, Last = Min(H_In1, First+ChunkCell); - unsigned int Iter = 
(Last>First)?(Last-First):0; - unsigned int C = ChunkSize(H_In2), F = CoreId*C, L = Min(H_In2, F+C); - int OffLine = 0, OffCol = 0; + unsigned int CoreId = gap_coreid(), Chunk = ChunkSize(H_In2), First = Chunk*CoreId, Last = Min(First+Chunk, H_In2); + unsigned int Iter = (Last>First)?(Last-First):0; + unsigned char * Infos = (unsigned char *) Arg->Infos; + unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; + int A0 = *((unsigned char *) &Infos[AT_INF_A0]); int B0 = *((unsigned char *) &Infos[AT_INF_B0]); int C0 = *((unsigned char *) &Infos[AT_INF_C0]); - if (ColFirst) OffLine = OutFirstCol; else OffCol = OutFirstCol; - for (Col=0; ColIn1; - unsigned int W_In1 = Arg->W_In1; - unsigned int H_In1 = Arg->H_In1; - signed char * __restrict__ In2 = Arg->In2; - unsigned int W_In2 = Arg->W_In2; - int * __restrict__ Bias = Arg->Bias; - signed char * __restrict__ Out = Arg->Out; - unsigned int W_Out = Arg->W_Out; - unsigned char * __restrict__ Scale = Arg->Scale; - unsigned char * __restrict__ ScaleN = Arg->ScaleN; - unsigned int NormBias = Arg->NormBias; - unsigned int OutFirstCol = Arg->OutFirstCol; - signed char * __restrict__ BufferColIn2 = Arg->BufferColIn2; - int ColFirst = Arg->ColFirst; +void KerParMatMulB32_ReLU_SF_SQ8(KerMatMul_SQ8_T *Arg) { + KerParMatMulB32_SF_SQ8_act(Arg, ACT_RELU); +} - unsigned int H_In2 = W_In1; - unsigned int H_Out = H_In1; - unsigned int Line, Col, i; - v4s * __restrict__ VBuff0 = (v4s *) BufferColIn2; - v4s * __restrict__ VBuff1 = (v4s *) (BufferColIn2+H_In2); - v4s * __restrict__ VBuff2 = (v4s *) (BufferColIn2+2*H_In2); - v4s * __restrict__ VBuff3 = (v4s *) (BufferColIn2+3*H_In2); +void KerParMatMulB32_ReLUN_SF_SQ8(KerMatMul_SQ8_T *Arg) { + KerParMatMulB32_SF_SQ8_act(Arg, ACT_RELUN); +} - unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(H_In1), First = CoreId*ChunkCell, Last = Min(H_In1, First+ChunkCell); - unsigned int C = ChunkSize(H_In2), F = CoreId*C, L = 
Min(H_In2, F+C); - int OffLine = 0, OffCol = 0; +void KerParMatMulB32_ReLUM_SF_SQ8(KerMatMul_SQ8_T *Arg) { + KerParMatMulB32_SF_SQ8_act(Arg, ACT_RELUM); +} - if (ColFirst) OffLine = OutFirstCol; else OffCol = OutFirstCol; - for (Col=0; ColIn1; + signed char * __restrict__ In2 = Arg->In2; + signed char * __restrict__ Out = Arg->Out; + int W = Arg->W; + int H = Arg->H; + unsigned int Scale = ((unsigned char *)(Arg->Infos))[AT_INF_SCALE]; + unsigned int ScaleN = ((unsigned char *)(Arg->Infos))[AT_INF_SCALEN]; + unsigned char * Infos = (unsigned char *) Arg->Infos; + unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; + int A0 = *((unsigned char *) &Infos[AT_INF_A0]); int B0 = *((unsigned char *) &Infos[AT_INF_B0]); int C0 = *((unsigned char *) &Infos[AT_INF_C0]); + + unsigned int CoreId = gap_coreid(), Chunk = ChunkSize(Arg->Feat), First = Chunk*CoreId, Last = Min(First+Chunk, Arg->Feat); + + if (Scale) + for (int i=First; iIn1; - unsigned int W_In1 = Arg->W_In1; - unsigned int H_In1 = Arg->H_In1; - signed char * __restrict__ In2 = Arg->In2; - unsigned int W_In2 = Arg->W_In2; - int * __restrict__ Bias = Arg->Bias; - signed char * __restrict__ Out = Arg->Out; - unsigned int W_Out = Arg->W_Out; - unsigned char * __restrict__ Scale = Arg->Scale; - unsigned char * __restrict__ ScaleN = Arg->ScaleN; - unsigned int NormBias = Arg->NormBias; - unsigned int OutFirstCol = Arg->OutFirstCol; - signed char * __restrict__ BufferColIn2 = Arg->BufferColIn2; - int ColFirst = Arg->ColFirst; +void KerParMatVectMul_ReLU_SQ8(KerMat3_SQ8_T *Arg) { + KerParMatVectMul_SQ8_act(Arg, ACT_RELU); +} - unsigned int H_In2 = W_In1; - unsigned int H_Out = H_In1; - unsigned int Line, Col, i; - v4s * __restrict__ VBuff0 = (v4s *) BufferColIn2; - v4s * __restrict__ VBuff1 = (v4s *) (BufferColIn2+H_In2); - v4s * __restrict__ VBuff2 = (v4s *) (BufferColIn2+2*H_In2); - v4s * __restrict__ VBuff3 = (v4s *) (BufferColIn2+3*H_In2); +void 
KerParMatVectMul_ReLUN_SQ8(KerMat3_SQ8_T *Arg) { + KerParMatVectMul_SQ8_act(Arg, ACT_RELUN); +} - unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(H_In1), First = CoreId*ChunkCell, Last = Min(H_In1, First+ChunkCell); - unsigned int Iter = (Last>First)?(Last-First):0; - unsigned int C = ChunkSize(H_In2), F = CoreId*C, L = Min(H_In2, F+C); - int OffLine = 0, OffCol = 0; +void KerParMatVectMul_ReLUM_SQ8(KerMat3_SQ8_T *Arg) { + KerParMatVectMul_SQ8_act(Arg, ACT_RELUM); +} - if (ColFirst) OffLine = OutFirstCol; else OffCol = OutFirstCol; - for (Col=0; ColIn1; - unsigned int W_In1 = Arg->W_In1; - unsigned int H_In1 = Arg->H_In1; - signed char * __restrict__ In2 = Arg->In2; - unsigned int W_In2 = Arg->W_In2; - int * __restrict__ Bias = Arg->Bias; - signed char * __restrict__ Out = Arg->Out; - unsigned int W_Out = Arg->W_Out; - unsigned char * __restrict__ Scale = Arg->Scale; - unsigned char * __restrict__ ScaleN = Arg->ScaleN; - unsigned int NormBias = Arg->NormBias; - unsigned int OutFirstCol = Arg->OutFirstCol; - signed char * __restrict__ BufferColIn2 = Arg->BufferColIn2; - int ColFirst = Arg->ColFirst; - int A0 = Arg->Infos[AT_INF_A0]; +void KerParMatVectMul_HSwish_SQ8(KerMat3_SQ8_T *Arg) { + KerParMatVectMul_SQ8_act(Arg, ACT_HSWISH); +} - unsigned int H_In2 = W_In1; - unsigned int H_Out = H_In1; - unsigned int Line, Col, i; - v4s * __restrict__ VBuff0 = (v4s *) BufferColIn2; - v4s * __restrict__ VBuff1 = (v4s *) (BufferColIn2+H_In2); - v4s * __restrict__ VBuff2 = (v4s *) (BufferColIn2+2*H_In2); - v4s * __restrict__ VBuff3 = (v4s *) (BufferColIn2+3*H_In2); +void KerParMatVectMul_HSigmoid_SQ8(KerMat3_SQ8_T *Arg) { + KerParMatVectMul_SQ8_act(Arg, ACT_HSIGMOID); +} - unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(H_In1), First = CoreId*ChunkCell, Last = Min(H_In1, First+ChunkCell); - unsigned int C = ChunkSize(H_In2), F = CoreId*C, L = Min(H_In2, F+C); - int OffLine = 0, OffCol = 0; +void KerParMatVectMul_Sigmoid_SQ8(KerMat3_SQ8_T *Arg) { + 
KerParMatVectMul_SQ8_act(Arg, ACT_SIGMOID); +} - if (ColFirst) OffLine = OutFirstCol; else OffCol = OutFirstCol; - for (Col=0; ColIn1; - unsigned int W_In1 = Arg->W_In1; - unsigned int H_In1 = Arg->H_In1; - signed char * __restrict__ In2 = Arg->In2; - unsigned int W_In2 = Arg->W_In2; - int * __restrict__ Bias = Arg->Bias; - signed char * __restrict__ Out = Arg->Out; - unsigned int W_Out = Arg->W_Out; - int Pi = Arg->OutFirstCol; - signed char *BufferColIn2 = Arg->BufferColIn2; - unsigned int NormBias = Arg->NormBias; - int Wi = Arg->W, Hi = Arg->H; - int Sx = Arg->Sx, Sy = Arg->Sy; - int ColFirst = Arg->ColFirst; - unsigned char * __restrict__ Scale = Arg->Scale; - unsigned char * __restrict__ ScaleN = Arg->ScaleN; - - unsigned int H_In2 = W_In1; - unsigned int H_Out = H_In1; - - int Wo = (Wi+Sx-1)/Sx, Ho = (Hi+Sy-1)/Sy; - int Oo, OffLine; - int At, F=0, L = W_In2; - - unsigned int Line, Col, i; - v4s *VBuff = (v4s *) BufferColIn2; - - unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(H_In1), First = CoreId*ChunkCell, Last = Min(H_In1, First+ChunkCell); - unsigned int Ci = ChunkSize(H_In2), Fi = CoreId*Ci, Li = Min(H_In2, Fi+Ci); + signed char * __restrict__ In1 = Arg->In1; + signed char * __restrict__ In2 = Arg->In2; + signed char * __restrict__ Out = Arg->Out; + int W = Arg->W; + int H = Arg->H; + int Feat = Arg->Feat; + unsigned int Scale = ((unsigned char *)(Arg->Infos))[AT_INF_SCALE]; + unsigned int ScaleN = ((unsigned char *)(Arg->Infos))[AT_INF_SCALEN]; + unsigned char * Infos = (unsigned char *) Arg->Infos; + unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; + int A0 = *((unsigned char *) &Infos[AT_INF_A0]); int B0 = *((unsigned char *) &Infos[AT_INF_B0]); int C0 = *((unsigned char *) &Infos[AT_INF_C0]); + int S = W*H; - At = 0; OffLine = 0; Oo = 0; - if (ColFirst) OffLine = Pi; else Oo = Pi; + unsigned int CoreId = gap_coreid(), Chunk = ChunkSize(Feat), First = Chunk*CoreId, 
Last = Min(First+Chunk, Feat); - while (L>0) { - for (i=Fi;iIn1; - unsigned int W_In1 = Arg->W_In1; - unsigned int H_In1 = Arg->H_In1; - signed char * __restrict__ In2 = Arg->In2; - unsigned int W_In2 = Arg->W_In2; - int * __restrict__ Bias = Arg->Bias; - signed char * __restrict__ Out = Arg->Out; - unsigned int W_Out = Arg->W_Out; - int Pi = Arg->OutFirstCol; - signed char *BufferColIn2 = Arg->BufferColIn2; - unsigned int NormBias = Arg->NormBias; - int Wi = Arg->W, Hi = Arg->H; - int Sx = Arg->Sx, Sy = Arg->Sy; - int ColFirst = Arg->ColFirst; - unsigned char * __restrict__ Scale = Arg->Scale; - unsigned char * __restrict__ ScaleN = Arg->ScaleN; - - unsigned int H_In2 = W_In1; - unsigned int H_Out = H_In1; - - int Wo = (Wi+Sx-1)/Sx, Ho = (Hi+Sy-1)/Sy; - int Oo, OffLine; - int At, F=0, L = W_In2; - - unsigned int Line, Col, i; - v4s *VBuff = (v4s *) BufferColIn2; - - unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(H_In1), First = CoreId*ChunkCell, Last = Min(H_In1, First+ChunkCell); - unsigned int Ci = ChunkSize(H_In2), Fi = CoreId*Ci, Li = Min(H_In2, Fi+Ci); - - At = 0; OffLine = 0; Oo = 0; - if (ColFirst) OffLine = Pi; else Oo = Pi; - - while (L>0) { - for (i=Fi;iIn1; - unsigned int W_In1 = Arg->W_In1; - unsigned int H_In1 = Arg->H_In1; - signed char * __restrict__ In2 = Arg->In2; - unsigned int W_In2 = Arg->W_In2; - int * __restrict__ Bias = Arg->Bias; - signed char * __restrict__ Out = Arg->Out; - unsigned int W_Out = Arg->W_Out; - int Pi = Arg->OutFirstCol; - signed char *BufferColIn2 = Arg->BufferColIn2; - unsigned int NormBias = Arg->NormBias; - int Wi = Arg->W, Hi = Arg->H; - int Sx = Arg->Sx, Sy = Arg->Sy; - int ColFirst = Arg->ColFirst; - unsigned char * __restrict__ Scale = Arg->Scale; - unsigned char * __restrict__ ScaleN = Arg->ScaleN; - int A0 = Arg->Infos[AT_INF_A0]; - - unsigned int H_In2 = W_In1; - unsigned int H_Out = H_In1; - - int Wo = (Wi+Sx-1)/Sx, Ho = (Hi+Sy-1)/Sy; - int Oo, OffLine; - int At, F=0, L = W_In2; - - unsigned int Line, 
Col, i; - v4s *VBuff = (v4s *) BufferColIn2; - - unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(H_In1), First = CoreId*ChunkCell, Last = Min(H_In1, First+ChunkCell); - unsigned int Ci = ChunkSize(H_In2), Fi = CoreId*Ci, Li = Min(H_In2, Fi+Ci); - - At = 0; OffLine = 0; Oo = 0; - if (ColFirst) OffLine = Pi; else Oo = Pi; - - while (L>0) { - for (i=Fi;iIn1; - unsigned int W_In1 = Arg->W_In1; - unsigned int H_In1 = Arg->H_In1; - signed char * __restrict__ In2 = Arg->In2; - unsigned int H_In2 = Arg->W_In2; - unsigned int W_In2 = W_In1; - signed char * __restrict__ Bias = Arg->Bias; - signed char * __restrict__ Out = Arg->Out; - unsigned char * __restrict__ Scale = Arg->Scale; - unsigned char * __restrict__ ScaleN = Arg->ScaleN; - unsigned int NormBias = Arg->NormBias; - unsigned int CoreId = gap_coreid(), Chunk = ChunkSize(H_In2), First = Chunk*CoreId, Last = Min(First+Chunk, H_In2); - unsigned int Iter = (Last>First)?(Last-First):0; +void KerParMatVectMul_ReLUM_HWC_SQ8(KerMat3_SQ8_T *Arg) { + KerParMatVectMul_HWC_SQ8_act(Arg, ACT_RELUM); +} - for (int i=0; iIn1; - unsigned int W_In1 = Arg->W_In1; - unsigned int H_In1 = Arg->H_In1; - signed char * __restrict__ In2 = Arg->In2; - unsigned int H_In2 = Arg->W_In2; - unsigned int W_In2 = W_In1; - signed char * __restrict__ Bias = Arg->Bias; - signed char * __restrict__ Out = Arg->Out; - unsigned char * __restrict__ Scale = Arg->Scale; - unsigned char * __restrict__ ScaleN = Arg->ScaleN; - unsigned int NormBias = Arg->NormBias; - unsigned int CoreId = gap_coreid(), Chunk = ChunkSize(H_In2), First = Chunk*CoreId, Last = Min(First+Chunk, H_In2); - unsigned int Iter = (Last>First)?(Last-First):0; - - for (int i=0; iIn1; - unsigned int W_In1 = Arg->W_In1; - unsigned int H_In1 = Arg->H_In1; - signed char * __restrict__ In2 = Arg->In2; - unsigned int H_In2 = Arg->W_In2; - unsigned int W_In2 = W_In1; - signed char * __restrict__ Bias = Arg->Bias; - signed char * __restrict__ Out = Arg->Out; - unsigned char * __restrict__ 
Scale = Arg->Scale; - unsigned char * __restrict__ ScaleN = Arg->ScaleN; - unsigned int NormBias = Arg->NormBias; - unsigned int CoreId = gap_coreid(), Chunk = ChunkSize(H_In2), First = Chunk*CoreId, Last = Min(First+Chunk, H_In2); - unsigned int Iter = (Last>First)?(Last-First):0; - int A0 = Arg->Infos[AT_INF_A0]; - - for (int i=0; iIn1; - unsigned int W_In1 = Arg->W_In1; - unsigned int H_In1 = Arg->H_In1; - signed char * __restrict__ In2 = Arg->In2; - unsigned int H_In2 = Arg->W_In2; - unsigned int W_In2 = W_In1; - short int * __restrict__ Bias = Arg->Bias; - signed char * __restrict__ Out = Arg->Out; - unsigned char * __restrict__ Scale = Arg->Scale; - unsigned char * __restrict__ ScaleN = Arg->ScaleN; - unsigned int NormBias = Arg->NormBias; - unsigned int CoreId = gap_coreid(), Chunk = ChunkSize(H_In2), First = Chunk*CoreId, Last = Min(First+Chunk, H_In2); - unsigned int Iter = (Last>First)?(Last-First):0; - - for (int i=0; iIn1; - unsigned int W_In1 = Arg->W_In1; - unsigned int H_In1 = Arg->H_In1; - signed char * __restrict__ In2 = Arg->In2; - unsigned int H_In2 = Arg->W_In2; - unsigned int W_In2 = W_In1; - short int * __restrict__ Bias = Arg->Bias; - signed char * __restrict__ Out = Arg->Out; - unsigned char * __restrict__ Scale = Arg->Scale; - unsigned char * __restrict__ ScaleN = Arg->ScaleN; - unsigned int NormBias = Arg->NormBias; - unsigned int CoreId = gap_coreid(), Chunk = ChunkSize(H_In2), First = Chunk*CoreId, Last = Min(First+Chunk, H_In2); - unsigned int Iter = (Last>First)?(Last-First):0; - - for (int i=0; iIn1; - unsigned int W_In1 = Arg->W_In1; - unsigned int H_In1 = Arg->H_In1; - signed char * __restrict__ In2 = Arg->In2; - unsigned int H_In2 = Arg->W_In2; - unsigned int W_In2 = W_In1; - short int * __restrict__ Bias = Arg->Bias; - signed char * __restrict__ Out = Arg->Out; - unsigned char * __restrict__ Scale = Arg->Scale; - unsigned char * __restrict__ ScaleN = Arg->ScaleN; - unsigned int NormBias = Arg->NormBias; - unsigned int CoreId = 
gap_coreid(), Chunk = ChunkSize(H_In2), First = Chunk*CoreId, Last = Min(First+Chunk, H_In2); - unsigned int Iter = (Last>First)?(Last-First):0; - int A0 = Arg->Infos[AT_INF_A0]; - - for (int i=0; iIn1; - unsigned int W_In1 = Arg->W_In1; - unsigned int H_In1 = Arg->H_In1; - signed char * __restrict__ In2 = Arg->In2; - unsigned int H_In2 = Arg->W_In2; - unsigned int W_In2 = W_In1; - int * __restrict__ Bias = Arg->Bias; - signed char * __restrict__ Out = Arg->Out; - unsigned char * __restrict__ Scale = Arg->Scale; - unsigned char * __restrict__ ScaleN = Arg->ScaleN; - unsigned int NormBias = Arg->NormBias; - unsigned int CoreId = gap_coreid(), Chunk = ChunkSize(H_In2), First = Chunk*CoreId, Last = Min(First+Chunk, H_In2); - unsigned int Iter = (Last>First)?(Last-First):0; - - for (int i=0; iIn1; - unsigned int W_In1 = Arg->W_In1; - unsigned int H_In1 = Arg->H_In1; - signed char * __restrict__ In2 = Arg->In2; - unsigned int H_In2 = Arg->W_In2; - unsigned int W_In2 = W_In1; - int * __restrict__ Bias = Arg->Bias; - signed char * __restrict__ Out = Arg->Out; - unsigned char * __restrict__ Scale = Arg->Scale; - unsigned char * __restrict__ ScaleN = Arg->ScaleN; - unsigned int NormBias = Arg->NormBias; - unsigned int CoreId = gap_coreid(), Chunk = ChunkSize(H_In2), First = Chunk*CoreId, Last = Min(First+Chunk, H_In2); - unsigned int Iter = (Last>First)?(Last-First):0; - - for (int i=0; iIn1; - unsigned int W_In1 = Arg->W_In1; - unsigned int H_In1 = Arg->H_In1; - signed char * __restrict__ In2 = Arg->In2; - unsigned int H_In2 = Arg->W_In2; - unsigned int W_In2 = W_In1; - int * __restrict__ Bias = Arg->Bias; - signed char * __restrict__ Out = Arg->Out; - unsigned char * __restrict__ Scale = Arg->Scale; - unsigned char * __restrict__ ScaleN = Arg->ScaleN; - unsigned int NormBias = Arg->NormBias; - unsigned int CoreId = gap_coreid(), Chunk = ChunkSize(H_In2), First = Chunk*CoreId, Last = Min(First+Chunk, H_In2); - unsigned int Iter = (Last>First)?(Last-First):0; - - for (int 
i=0; iIn1; - unsigned int W_In1 = Arg->W_In1; - unsigned int H_In1 = Arg->H_In1; - signed char * __restrict__ In2 = Arg->In2; - unsigned int H_In2 = Arg->W_In2; - unsigned int W_In2 = W_In1; - int * __restrict__ Bias = Arg->Bias; - signed char * __restrict__ Out = Arg->Out; - unsigned char * __restrict__ Scale = Arg->Scale; - unsigned char * __restrict__ ScaleN = Arg->ScaleN; - unsigned int NormBias = Arg->NormBias; - unsigned int CoreId = gap_coreid(), Chunk = ChunkSize(H_In2), First = Chunk*CoreId, Last = Min(First+Chunk, H_In2); - unsigned int Iter = (Last>First)?(Last-First):0; - int A0 = Arg->Infos[AT_INF_A0]; - - for (int i=0; iIn1; - signed char * __restrict__ In2 = Arg->In2; - signed char * __restrict__ Out = Arg->Out; - int W = Arg->W; - int H = Arg->H; - unsigned int Scale = ((unsigned char *)(Arg->Infos))[AT_INF_SCALE]; - unsigned int ScaleN = ((unsigned char *)(Arg->Infos))[AT_INF_SCALEN]; - - unsigned int CoreId = gap_coreid(), Chunk = ChunkSize(Arg->Feat), First = Chunk*CoreId, Last = Min(First+Chunk, Arg->Feat); - - if (Scale) - for (int i=First; iIn1; - signed char * __restrict__ In2 = Arg->In2; - signed char * __restrict__ Out = Arg->Out; - int W = Arg->W; - int H = Arg->H; - int Feat = Arg->Feat; - unsigned int Scale = ((unsigned char *)(Arg->Infos))[AT_INF_SCALE]; - unsigned int ScaleN = ((unsigned char *)(Arg->Infos))[AT_INF_SCALEN]; - int S = W*H; - - unsigned int CoreId = gap_coreid(), Chunk = ChunkSize(Feat), First = Chunk*CoreId, Last = Min(First+Chunk, Feat); - - if (Scale) - for (int i=First; iIn1; - signed char * __restrict__ In2 = Arg->In2; - signed char * __restrict__ Out = Arg->Out; - int W = Arg->W; - int H = Arg->H; - unsigned int Scale = ((unsigned char *)(Arg->Infos))[AT_INF_SCALE]; - unsigned int ScaleN = ((unsigned char *)(Arg->Infos))[AT_INF_SCALEN]; - - unsigned int CoreId = gap_coreid(), Chunk = ChunkSize(Arg->Feat), First = Chunk*CoreId, Last = Min(First+Chunk, Arg->Feat); - - if (Scale) - for (int i=First; iIn1; - signed 
char * __restrict__ In2 = Arg->In2; - signed char * __restrict__ Out = Arg->Out; - int W = Arg->W; - int H = Arg->H; - int A0 = Arg->Infos[AT_INF_A0]; - unsigned int Scale = ((unsigned char *)(Arg->Infos))[AT_INF_SCALE]; - unsigned int ScaleN = ((unsigned char *)(Arg->Infos))[AT_INF_SCALEN]; - - unsigned int CoreId = gap_coreid(), Chunk = ChunkSize(Arg->Feat), First = Chunk*CoreId, Last = Min(First+Chunk, Arg->Feat); - - if (Scale) - for (int i=First; iIn1; - signed char * __restrict__ In2 = Arg->In2; - signed char * __restrict__ Out = Arg->Out; - int W = Arg->W; - int H = Arg->H; - unsigned int ActScale = ((unsigned char *)(Arg->Infos))[AT_INF_ACTSCALE]; - unsigned int ActScaleN = ((unsigned char *)(Arg->Infos))[AT_INF_ACTSCALEN]; - int A0 = Arg->Infos[AT_INF_A0]; - int B0 = Arg->Infos[AT_INF_B0]; - int C0 = Arg->Infos[AT_INF_C0]; - unsigned int Scale = ((unsigned char *)(Arg->Infos))[AT_INF_SCALE]; - unsigned int ScaleN = ((unsigned char *)(Arg->Infos))[AT_INF_SCALEN]; - - unsigned int CoreId = gap_coreid(), Chunk = ChunkSize(Arg->Feat), First = Chunk*CoreId, Last = Min(First+Chunk, Arg->Feat); - - if (Scale) - for (int i=First; iIn1; - signed char * __restrict__ In2 = Arg->In2; - signed char * __restrict__ Out = Arg->Out; - int W = Arg->W; - int H = Arg->H; - unsigned int ActScale = ((unsigned char *)(Arg->Infos))[AT_INF_ACTSCALE]; - unsigned int ActScaleN = ((unsigned char *)(Arg->Infos))[AT_INF_ACTSCALEN]; - int A0 = Arg->Infos[AT_INF_A0]; - int B0 = Arg->Infos[AT_INF_B0]; - int C0 = Arg->Infos[AT_INF_C0]; - unsigned int Scale = ((unsigned char *)(Arg->Infos))[AT_INF_SCALE]; - unsigned int ScaleN = ((unsigned char *)(Arg->Infos))[AT_INF_SCALEN]; - - unsigned int CoreId = gap_coreid(), Chunk = ChunkSize(Arg->Feat), First = Chunk*CoreId, Last = Min(First+Chunk, Arg->Feat); - - if (Scale) - for (int i=First; iIn1; - signed char * __restrict__ In2 = Arg->In2; - signed char * __restrict__ Out = Arg->Out; - int W = Arg->W; - int H = Arg->H; - unsigned int ActScale = 
((unsigned char *)(Arg->Infos))[AT_INF_ACTSCALE]; - unsigned int ActScaleN = ((unsigned char *)(Arg->Infos))[AT_INF_ACTSCALEN]; - int A0 = Arg->Infos[AT_INF_A0]; - int B0 = Arg->Infos[AT_INF_B0]; - int C0 = Arg->Infos[AT_INF_C0]; - unsigned int Scale = ((unsigned char *)(Arg->Infos))[AT_INF_SCALE]; - unsigned int ScaleN = ((unsigned char *)(Arg->Infos))[AT_INF_SCALEN]; - - unsigned int CoreId = gap_coreid(), Chunk = ChunkSize(Arg->Feat), First = Chunk*CoreId, Last = Min(First+Chunk, Arg->Feat); - - if (Scale) - for (int i=First; iIn1; - unsigned int W_In1 = Arg->W_In1; - unsigned int H_In1 = Arg->H_In1; - signed char * __restrict__ In2 = Arg->In2; - unsigned int W_In2 = Arg->W_In2; - signed char * __restrict__ Out = Arg->Out; - unsigned int W_Out = Arg->W_Out; - unsigned char Scale = (unsigned char) Arg->Infos[AT_INF_OUTSCALE]; - unsigned char ScaleN = (unsigned char) Arg->Infos[AT_INF_OUTSCALEN]; - unsigned int OutFirstCol = Arg->OutFirstCol; - signed char * __restrict__ BufferColIn2 = Arg->BufferColIn2; - int ColFirst = Arg->ColFirst; - - unsigned int H_In2 = W_In1; - unsigned int H_Out = H_In1; - unsigned int Line, Col, i; - v4s * __restrict__ VBuff0 = (v4s *) BufferColIn2; - v4s * __restrict__ VBuff1 = (v4s *) (BufferColIn2+H_In2); - v4s * __restrict__ VBuff2 = (v4s *) (BufferColIn2+2*H_In2); - v4s * __restrict__ VBuff3 = (v4s *) (BufferColIn2+3*H_In2); - - unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(H_In1), First = CoreId*ChunkCell, Last = Min(H_In1, First+ChunkCell); - unsigned int Iter = (Last>First)?(Last-First):0; - unsigned int C = ChunkSize(H_In2), F = CoreId*C, L = Min(H_In2, F+C); - int OffLine = 0, OffCol = 0; - - if (ColFirst) OffLine = OutFirstCol; else OffCol = OutFirstCol; - for (Col=0; ColIn1; - unsigned int W_In1 = Arg->W_In1; - unsigned int H_In1 = Arg->H_In1; - signed char * __restrict__ In2 = Arg->In2; - unsigned int W_In2 = Arg->W_In2; - signed char * __restrict__ Out = Arg->Out; - unsigned int W_Out = Arg->W_Out; - unsigned 
int OutFirstCol = Arg->OutFirstCol; - signed char * __restrict__ BufferColIn2 = Arg->BufferColIn2; - int ColFirst = Arg->ColFirst; - unsigned char Scale = (unsigned char) Arg->Infos[AT_INF_OUTSCALE]; - unsigned char ScaleN = (unsigned char) Arg->Infos[AT_INF_OUTSCALEN]; - - unsigned int H_In2 = W_In1; - unsigned int H_Out = H_In1; - unsigned int Line, Col, i; - v4s * __restrict__ VBuff0 = (v4s *) BufferColIn2; - v4s * __restrict__ VBuff1 = (v4s *) (BufferColIn2+H_In2); - v4s * __restrict__ VBuff2 = (v4s *) (BufferColIn2+2*H_In2); - v4s * __restrict__ VBuff3 = (v4s *) (BufferColIn2+3*H_In2); - - unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(H_In1), First = CoreId*ChunkCell, Last = Min(H_In1, First+ChunkCell); - unsigned int Iter = (Last>First)?(Last-First):0; - unsigned int C = ChunkSize(H_In2), F = CoreId*C, L = Min(H_In2, F+C); - int OffLine = 0, OffCol = 0; - - if (ColFirst) OffLine = OutFirstCol; else OffCol = OutFirstCol; - for (Col=0; ColIn1; - unsigned int W_In1 = Arg->W_In1; - unsigned int H_In1 = Arg->H_In1; - signed char * __restrict__ In2 = Arg->In2; - unsigned int W_In2 = Arg->W_In2; - signed char * __restrict__ Out = Arg->Out; - unsigned int W_Out = Arg->W_Out; - unsigned int OutFirstCol = Arg->OutFirstCol; - signed char * __restrict__ BufferColIn2 = Arg->BufferColIn2; - int ColFirst = Arg->ColFirst; - int A0 = Arg->Infos[AT_INF_A0]; - unsigned char Scale = (unsigned char) Arg->Infos[AT_INF_OUTSCALE]; - unsigned char ScaleN = (unsigned char) Arg->Infos[AT_INF_OUTSCALEN]; - - unsigned int H_In2 = W_In1; - unsigned int H_Out = H_In1; - unsigned int Line, Col, i; - v4s * __restrict__ VBuff0 = (v4s *) BufferColIn2; - v4s * __restrict__ VBuff1 = (v4s *) (BufferColIn2+H_In2); - v4s * __restrict__ VBuff2 = (v4s *) (BufferColIn2+2*H_In2); - v4s * __restrict__ VBuff3 = (v4s *) (BufferColIn2+3*H_In2); - - unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(H_In1), First = CoreId*ChunkCell, Last = Min(H_In1, First+ChunkCell); - unsigned int Iter 
= (Last>First)?(Last-First):0; - unsigned int C = ChunkSize(H_In2), F = CoreId*C, L = Min(H_In2, F+C); - int OffLine = 0, OffCol = 0; - - if (ColFirst) OffLine = OutFirstCol; else OffCol = OutFirstCol; - for (Col=0; ColIn1; - unsigned int W_In1 = Arg->W_In1; - unsigned int H_In1 = Arg->H_In1; - signed char * __restrict__ In2 = Arg->In2; - unsigned int W_In2 = Arg->W_In2; - signed char * __restrict__ Out = Arg->Out; - int * __restrict__ Bias = Arg->Bias; - unsigned int NormBias = Arg->NormBias; - unsigned int W_Out = Arg->W_Out; - unsigned char Scale = (unsigned char) Arg->Infos[AT_INF_OUTSCALE]; - unsigned char ScaleN = (unsigned char) Arg->Infos[AT_INF_OUTSCALEN]; - unsigned int OutFirstCol = Arg->OutFirstCol; - signed char * __restrict__ BufferColIn2 = Arg->BufferColIn2; - int ColFirst = Arg->ColFirst; - - unsigned int H_In2 = W_In1; - unsigned int H_Out = H_In1; - unsigned int Line, Col, i; - v4s * __restrict__ VBuff0 = (v4s *) BufferColIn2; - v4s * __restrict__ VBuff1 = (v4s *) (BufferColIn2+H_In2); - v4s * __restrict__ VBuff2 = (v4s *) (BufferColIn2+2*H_In2); - v4s * __restrict__ VBuff3 = (v4s *) (BufferColIn2+3*H_In2); - - unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(H_In1), First = CoreId*ChunkCell, Last = Min(H_In1, First+ChunkCell); - unsigned int Iter = (Last>First)?(Last-First):0; - unsigned int C = ChunkSize(H_In2), F = CoreId*C, L = Min(H_In2, F+C); - int OffLine = 0, OffCol = 0; - - if (ColFirst) OffLine = OutFirstCol; else OffCol = OutFirstCol; - for (Col=0; ColIn1; - unsigned int W_In1 = Arg->W_In1; - unsigned int H_In1 = Arg->H_In1; - signed char * __restrict__ In2 = Arg->In2; - unsigned int W_In2 = Arg->W_In2; - signed char * __restrict__ Out = Arg->Out; - int * __restrict__ Bias = Arg->Bias; - unsigned int NormBias = Arg->NormBias; - unsigned int W_Out = Arg->W_Out; - unsigned int OutFirstCol = Arg->OutFirstCol; - signed char * __restrict__ BufferColIn2 = Arg->BufferColIn2; - int ColFirst = Arg->ColFirst; - unsigned char Scale = 
(unsigned char) Arg->Infos[AT_INF_OUTSCALE]; - unsigned char ScaleN = (unsigned char) Arg->Infos[AT_INF_OUTSCALEN]; - - unsigned int H_In2 = W_In1; - unsigned int H_Out = H_In1; - unsigned int Line, Col, i; - v4s * __restrict__ VBuff0 = (v4s *) BufferColIn2; - v4s * __restrict__ VBuff1 = (v4s *) (BufferColIn2+H_In2); - v4s * __restrict__ VBuff2 = (v4s *) (BufferColIn2+2*H_In2); - v4s * __restrict__ VBuff3 = (v4s *) (BufferColIn2+3*H_In2); - - unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(H_In1), First = CoreId*ChunkCell, Last = Min(H_In1, First+ChunkCell); - unsigned int Iter = (Last>First)?(Last-First):0; - unsigned int C = ChunkSize(H_In2), F = CoreId*C, L = Min(H_In2, F+C); - int OffLine = 0, OffCol = 0; - - if (ColFirst) OffLine = OutFirstCol; else OffCol = OutFirstCol; - for (Col=0; ColIn1; - unsigned int W_In1 = Arg->W_In1; - unsigned int H_In1 = Arg->H_In1; - signed char * __restrict__ In2 = Arg->In2; - unsigned int W_In2 = Arg->W_In2; - signed char * __restrict__ Out = Arg->Out; - int * __restrict__ Bias = Arg->Bias; - unsigned int NormBias = Arg->NormBias; - unsigned int W_Out = Arg->W_Out; - unsigned int OutFirstCol = Arg->OutFirstCol; - signed char * __restrict__ BufferColIn2 = Arg->BufferColIn2; - int ColFirst = Arg->ColFirst; - int A0 = Arg->Infos[AT_INF_A0]; - unsigned char Scale = (unsigned char) Arg->Infos[AT_INF_OUTSCALE]; - unsigned char ScaleN = (unsigned char) Arg->Infos[AT_INF_OUTSCALEN]; - - unsigned int H_In2 = W_In1; - unsigned int H_Out = H_In1; - unsigned int Line, Col, i; - v4s * __restrict__ VBuff0 = (v4s *) BufferColIn2; - v4s * __restrict__ VBuff1 = (v4s *) (BufferColIn2+H_In2); - v4s * __restrict__ VBuff2 = (v4s *) (BufferColIn2+2*H_In2); - v4s * __restrict__ VBuff3 = (v4s *) (BufferColIn2+3*H_In2); - - unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(H_In1), First = CoreId*ChunkCell, Last = Min(H_In1, First+ChunkCell); - unsigned int Iter = (Last>First)?(Last-First):0; - unsigned int C = ChunkSize(H_In2), F = 
CoreId*C, L = Min(H_In2, F+C); - int OffLine = 0, OffCol = 0; - - if (ColFirst) OffLine = OutFirstCol; else OffCol = OutFirstCol; - for (Col=0; ColIn1; - unsigned int W_In1 = Arg->W_In1; - unsigned int H_In1 = Arg->H_In1; - signed char * __restrict__ In2 = Arg->In2; - unsigned int W_In2 = Arg->W_In2; - int * __restrict__ Bias = Arg->Bias; - signed char * __restrict__ Out = Arg->Out; - unsigned int W_Out = Arg->W_Out; - unsigned char * __restrict__ Scale = Arg->Scale; - unsigned char * __restrict__ ScaleN = Arg->ScaleN; - unsigned int NormBias = Arg->NormBias; - unsigned int OutFirstCol = Arg->OutFirstCol; - int ColFirst = Arg->ColFirst; - - unsigned int H_In2 = W_In1; - unsigned int H_Out = H_In1; - unsigned int Line, Col, i; - - unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(H_In1), First = CoreId*ChunkCell, Last = Min(H_In1, First+ChunkCell); - unsigned int Iter = (Last>First)?(Last-First):0; - int OffLine = 0, OffCol = 0; - - if (ColFirst) OffLine = OutFirstCol; else OffCol = OutFirstCol; - signed char *pOut = Out + W_Out*OffLine + OffCol; - for (Line=0; LineIn1; - unsigned int W_In1 = Arg->W_In1; - unsigned int H_In1 = Arg->H_In1; - signed char * __restrict__ In2 = Arg->In2; - unsigned int W_In2 = Arg->W_In2; - int * __restrict__ Bias = Arg->Bias; - signed char * __restrict__ Out = Arg->Out; - unsigned int W_Out = Arg->W_Out; - unsigned char * __restrict__ Scale = Arg->Scale; - unsigned char * __restrict__ ScaleN = Arg->ScaleN; - unsigned int NormBias = Arg->NormBias; - unsigned int OutFirstCol = Arg->OutFirstCol; - int ColFirst = Arg->ColFirst; - - unsigned int H_In2 = W_In1; - unsigned int H_Out = H_In1; - unsigned int Line, Col, i; - - unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(H_In1), First = CoreId*ChunkCell, Last = Min(H_In1, First+ChunkCell); - unsigned int Iter = (Last>First)?(Last-First):0; - int OffLine = 0, OffCol = 0; - - if (ColFirst) OffLine = OutFirstCol; else OffCol = OutFirstCol; - signed char *pOut = Out + 
W_Out*OffLine + OffCol; - for (Line=0; LineH_In1; signed char * __restrict__ In2 = Arg->In2; unsigned int W_In2 = Arg->W_In2; - int * __restrict__ Bias = Arg->Bias; signed char * __restrict__ Out = Arg->Out; unsigned int W_Out = Arg->W_Out; - unsigned char * __restrict__ Scale = Arg->Scale; - unsigned char * __restrict__ ScaleN = Arg->ScaleN; - unsigned int NormBias = Arg->NormBias; + unsigned char Scale = (unsigned char) Arg->Infos[AT_INF_OUTSCALE]; + unsigned char ScaleN = (unsigned char) Arg->Infos[AT_INF_OUTSCALEN]; unsigned int OutFirstCol = Arg->OutFirstCol; + signed char * __restrict__ BufferColIn2 = Arg->BufferColIn2; int ColFirst = Arg->ColFirst; - int A0 = Arg->Infos[AT_INF_A0]; + unsigned char * Infos = (unsigned char *) Arg->Infos; + unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; + int A0 = *((unsigned char *) &Infos[AT_INF_A0]); int B0 = *((unsigned char *) &Infos[AT_INF_B0]); int C0 = *((unsigned char *) &Infos[AT_INF_C0]); unsigned int H_In2 = W_In1; unsigned int H_Out = H_In1; unsigned int Line, Col, i; + v4s * __restrict__ VBuff0 = (v4s *) BufferColIn2; + v4s * __restrict__ VBuff1 = (v4s *) (BufferColIn2+H_In2); + v4s * __restrict__ VBuff2 = (v4s *) (BufferColIn2+2*H_In2); + v4s * __restrict__ VBuff3 = (v4s *) (BufferColIn2+3*H_In2); unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(H_In1), First = CoreId*ChunkCell, Last = Min(H_In1, First+ChunkCell); unsigned int Iter = (Last>First)?(Last-First):0; + unsigned int C = ChunkSize(H_In2), F = CoreId*C, L = Min(H_In2, F+C); int OffLine = 0, OffCol = 0; if (ColFirst) OffLine = OutFirstCol; else OffCol = OutFirstCol; - signed char *pOut = Out + W_Out*OffLine + OffCol; - for (Line=0; LineH_In1; signed char * __restrict__ In2 = Arg->In2; unsigned int W_In2 = Arg->W_In2; - int * __restrict__ Bias = Arg->Bias; signed char * __restrict__ Out = Arg->Out; + int * __restrict__ Bias = Arg->Bias; + unsigned int NormBias = 
Arg->NormBias; unsigned int W_Out = Arg->W_Out; unsigned char Scale = (unsigned char) Arg->Infos[AT_INF_OUTSCALE]; unsigned char ScaleN = (unsigned char) Arg->Infos[AT_INF_OUTSCALEN]; - unsigned int NormBias = Arg->NormBias; unsigned int OutFirstCol = Arg->OutFirstCol; + signed char * __restrict__ BufferColIn2 = Arg->BufferColIn2; int ColFirst = Arg->ColFirst; + unsigned char * Infos = (unsigned char *) Arg->Infos; + unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; + int A0 = *((unsigned char *) &Infos[AT_INF_A0]); int B0 = *((unsigned char *) &Infos[AT_INF_B0]); int C0 = *((unsigned char *) &Infos[AT_INF_C0]); unsigned int H_In2 = W_In1; unsigned int H_Out = H_In1; unsigned int Line, Col, i; - - unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(H_In1), First = CoreId*ChunkCell, Last = Min(H_In1, First+ChunkCell); - unsigned int Iter = (Last>First)?(Last-First):0; - int OffLine = 0, OffCol = 0; - - if (ColFirst) OffLine = OutFirstCol; else OffCol = OutFirstCol; - signed char *pOut = Out + W_Out*OffLine + OffCol; - for (Line=0; LineFirst)?(Last-First):0; + unsigned int C = ChunkSize(H_In2), F = CoreId*C, L = Min(H_In2, F+C); + int OffLine = 0, OffCol = 0; + + if (ColFirst) OffLine = OutFirstCol; else OffCol = OutFirstCol; + for (Col=0; ColBias; signed char * __restrict__ Out = Arg->Out; unsigned int W_Out = Arg->W_Out; - unsigned char Scale = (unsigned char) Arg->Infos[AT_INF_OUTSCALE]; - unsigned char ScaleN = (unsigned char) Arg->Infos[AT_INF_OUTSCALEN]; + unsigned char * __restrict__ Scale = Arg->Scale; + unsigned char * __restrict__ ScaleN = Arg->ScaleN; unsigned int NormBias = Arg->NormBias; unsigned int OutFirstCol = Arg->OutFirstCol; int ColFirst = Arg->ColFirst; + unsigned char * Infos = (unsigned char *) Arg->Infos; + unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; + int A0 = *((unsigned char *) 
&Infos[AT_INF_A0]); int B0 = *((unsigned char *) &Infos[AT_INF_B0]); int C0 = *((unsigned char *) &Infos[AT_INF_C0]); unsigned int H_In2 = W_In1; unsigned int H_Out = H_In1; @@ -5700,10 +2729,16 @@ void KerParMatMulTransposedB32_ReLU_PL_SQ8(KerMatMul_PL_SQ8_T *Arg) S6 += V1 * pIn2[i+2*H_In2]; S7 += V1 * pIn2[i+3*H_In2]; } - v4s R1 = gap_pack4(AT_CLIP_POS_IMM(AT_SCALE(S0, Scale, ScaleN), 7), AT_CLIP_POS_IMM(AT_SCALE(S1, Scale, ScaleN), 7), - AT_CLIP_POS_IMM(AT_SCALE(S2, Scale, ScaleN), 7), AT_CLIP_POS_IMM(AT_SCALE(S3, Scale, ScaleN), 7)); - v4s R2 = gap_pack4(AT_CLIP_POS_IMM(AT_SCALE(S4, Scale, ScaleN), 7), AT_CLIP_POS_IMM(AT_SCALE(S5, Scale, ScaleN), 7), - AT_CLIP_POS_IMM(AT_SCALE(S6, Scale, ScaleN), 7), AT_CLIP_POS_IMM(AT_SCALE(S7, Scale, ScaleN), 7)); + S0 = AT_SCALE(S0, Scale[4*Col ], ScaleN[4*Col ]); ACT_SWITCH(S0, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + S1 = AT_SCALE(S1, Scale[4*Col+1], ScaleN[4*Col+1]); ACT_SWITCH(S1, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + S2 = AT_SCALE(S2, Scale[4*Col+2], ScaleN[4*Col+2]); ACT_SWITCH(S2, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + S3 = AT_SCALE(S3, Scale[4*Col+3], ScaleN[4*Col+3]); ACT_SWITCH(S3, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + S4 = AT_SCALE(S4, Scale[4*Col ], ScaleN[4*Col ]); ACT_SWITCH(S4, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + S5 = AT_SCALE(S5, Scale[4*Col+1], ScaleN[4*Col+1]); ACT_SWITCH(S5, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + S6 = AT_SCALE(S6, Scale[4*Col+2], ScaleN[4*Col+2]); ACT_SWITCH(S6, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + S7 = AT_SCALE(S7, Scale[4*Col+3], ScaleN[4*Col+3]); ACT_SWITCH(S7, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + v4s R1 = gap_pack4(gap_clip(S0, 7), gap_clip(S1, 7), gap_clip(S2, 7), gap_clip(S3, 7)); + v4s R2 = gap_pack4(gap_clip(S4, 7), gap_clip(S5, 7), gap_clip(S6, 7), gap_clip(S7, 7)); *((v4s *) (pOut+(l1 )*W_Out+4*Col)) = R1; *((v4s *) (pOut+(l1+1)*W_Out+4*Col)) = R2; pIn2 
+= 4*H_In2; @@ -5727,8 +2762,10 @@ void KerParMatMulTransposedB32_ReLU_PL_SQ8(KerMatMul_PL_SQ8_T *Arg) S0 += V0 * pIn2[i]; S1 += V1 * pIn2[i]; } - pOut[(l1 )*W_Out + Col] = AT_CLIP_POS_IMM(AT_SCALE(S0, Scale, ScaleN), 7); - pOut[(l1+1)*W_Out + Col] = AT_CLIP_POS_IMM(AT_SCALE(S1, Scale, ScaleN), 7); + S0 = AT_SCALE(S0, Scale[Col], ScaleN[Col]); ACT_SWITCH(S0, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + S1 = AT_SCALE(S1, Scale[Col], ScaleN[Col]); ACT_SWITCH(S1, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + pOut[(l1 )*W_Out + Col] = gap_clip(S0, 7); + pOut[(l1+1)*W_Out + Col] = gap_clip(S1, 7); pIn2 += H_In2; } } @@ -5741,12 +2778,12 @@ void KerParMatMulTransposedB32_ReLU_PL_SQ8(KerMatMul_PL_SQ8_T *Arg) v4s * __restrict__ VBuff1 = (v4s *) (pIn2+H_In2); v4s * __restrict__ VBuff2 = (v4s *) (pIn2+2*H_In2); v4s * __restrict__ VBuff3 = (v4s *) (pIn2+3*H_In2); - int S0=0, S1=0, S2=0, S3=0; + int S0=0, S1=0, S2=0, S3=0, S4=0, S5=0, S6=0, S7=0; if (Bias) { - S0 = (Bias[4*Col]<NormBias; unsigned int OutFirstCol = Arg->OutFirstCol; int ColFirst = Arg->ColFirst; - int A0 = Arg->Infos[AT_INF_A0]; + unsigned char * Infos = (unsigned char *) Arg->Infos; + unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; + int A0 = *((unsigned char *) &Infos[AT_INF_A0]); int B0 = *((unsigned char *) &Infos[AT_INF_B0]); int C0 = *((unsigned char *) &Infos[AT_INF_C0]); unsigned int H_In2 = W_In1; unsigned int H_Out = H_In1; @@ -5869,10 +2955,16 @@ void KerParMatMulTransposedB32_ReLUN_PL_SQ8(KerMatMul_PL_SQ8_T *Arg) S6 += V1 * pIn2[i+2*H_In2]; S7 += V1 * pIn2[i+3*H_In2]; } - v4s R1 = gap_pack4(AT_CLIP_POS(AT_SCALE(S0, Scale, ScaleN), A0), AT_CLIP_POS(AT_SCALE(S1, Scale, ScaleN), A0), - AT_CLIP_POS(AT_SCALE(S2, Scale, ScaleN), A0), AT_CLIP_POS(AT_SCALE(S3, Scale, ScaleN), A0)); - v4s R2 = gap_pack4(AT_CLIP_POS(AT_SCALE(S4, Scale, ScaleN), A0), AT_CLIP_POS(AT_SCALE(S5, Scale, ScaleN), A0), - 
AT_CLIP_POS(AT_SCALE(S6, Scale, ScaleN), A0), AT_CLIP_POS(AT_SCALE(S7, Scale, ScaleN), A0)); + S0 = AT_SCALE(S0, Scale, ScaleN); ACT_SWITCH(S0, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + S1 = AT_SCALE(S1, Scale, ScaleN); ACT_SWITCH(S1, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + S2 = AT_SCALE(S2, Scale, ScaleN); ACT_SWITCH(S2, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + S3 = AT_SCALE(S3, Scale, ScaleN); ACT_SWITCH(S3, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + S4 = AT_SCALE(S4, Scale, ScaleN); ACT_SWITCH(S4, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + S5 = AT_SCALE(S5, Scale, ScaleN); ACT_SWITCH(S5, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + S6 = AT_SCALE(S6, Scale, ScaleN); ACT_SWITCH(S6, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + S7 = AT_SCALE(S7, Scale, ScaleN); ACT_SWITCH(S7, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + v4s R1 = gap_pack4(gap_clip(S0, 7), gap_clip(S1, 7), gap_clip(S2, 7), gap_clip(S3, 7)); + v4s R2 = gap_pack4(gap_clip(S4, 7), gap_clip(S5, 7), gap_clip(S6, 7), gap_clip(S7, 7)); *((v4s *) (pOut+(l1 )*W_Out+4*Col)) = R1; *((v4s *) (pOut+(l1+1)*W_Out+4*Col)) = R2; pIn2 += 4*H_In2; @@ -5896,8 +2988,10 @@ void KerParMatMulTransposedB32_ReLUN_PL_SQ8(KerMatMul_PL_SQ8_T *Arg) S0 += V0 * pIn2[i]; S1 += V1 * pIn2[i]; } - pOut[(l1 )*W_Out + Col] = AT_CLIP_POS(AT_SCALE(S0, Scale, ScaleN), A0); - pOut[(l1+1)*W_Out + Col] = AT_CLIP_POS(AT_SCALE(S1, Scale, ScaleN), A0); + S0 = AT_SCALE(S0, Scale, ScaleN); ACT_SWITCH(S0, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + S1 = AT_SCALE(S1, Scale, ScaleN); ACT_SWITCH(S1, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + pOut[(l1 )*W_Out + Col] = gap_clip(S0, 7); + pOut[(l1+1)*W_Out + Col] = gap_clip(S1, 7); pIn2 += H_In2; } } @@ -5935,8 +3029,11 @@ void KerParMatMulTransposedB32_ReLUN_PL_SQ8(KerMatMul_PL_SQ8_T *Arg) S2 += V0 * pIn2[i+2*H_In2]; S3 += V0 * pIn2[i+3*H_In2]; } - v4s R1 = gap_pack4(AT_CLIP_POS(AT_SCALE(S0, 
Scale, ScaleN), A0), AT_CLIP_POS(AT_SCALE(S1, Scale, ScaleN), A0), - AT_CLIP_POS(AT_SCALE(S2, Scale, ScaleN), A0), AT_CLIP_POS(AT_SCALE(S3, Scale, ScaleN), A0)); + S0 = AT_SCALE(S0, Scale, ScaleN); ACT_SWITCH(S0, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + S1 = AT_SCALE(S1, Scale, ScaleN); ACT_SWITCH(S1, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + S2 = AT_SCALE(S2, Scale, ScaleN); ACT_SWITCH(S2, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + S3 = AT_SCALE(S3, Scale, ScaleN); ACT_SWITCH(S3, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + v4s R1 = gap_pack4(gap_clip(S0, 7), gap_clip(S1, 7), gap_clip(S2, 7), gap_clip(S3, 7)); *((v4s *) (pOut+(l1 )*W_Out+4*Col)) = R1; pIn2 += 4*H_In2; } @@ -5955,10 +3052,52 @@ void KerParMatMulTransposedB32_ReLUN_PL_SQ8(KerMatMul_PL_SQ8_T *Arg) int V0 = In1[(l1 )*W_In1 + i]; S0 += V0 * pIn2[i]; } - pOut[(l1 )*W_Out + Col] = AT_CLIP_POS(AT_SCALE(S0, Scale, ScaleN), A0); + S0 = AT_SCALE(S0, Scale, ScaleN); ACT_SWITCH(S0, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + pOut[(l1 )*W_Out + Col] = gap_clip(S0, 7); pIn2 += H_In2; } } gap_waitbarrier(0); } +void KerParMatMulTransposedB32_PL_SQ8(KerMatMul_PL_SQ8_T *Arg) { + KerParMatMulTransposedB32_PL_SQ8_act(Arg, ACT_NONE); +} + +void KerParMatMulTransposedB32_ReLU_PL_SQ8(KerMatMul_PL_SQ8_T *Arg) { + KerParMatMulTransposedB32_PL_SQ8_act(Arg, ACT_RELU); +} + +void KerParMatMulTransposedB32_ReLUN_PL_SQ8(KerMatMul_PL_SQ8_T *Arg) { + KerParMatMulTransposedB32_PL_SQ8_act(Arg, ACT_RELUN); +} + +void KerParMatMulTransposedB32_ReLUM_PL_SQ8(KerMatMul_PL_SQ8_T *Arg) { + KerParMatMulTransposedB32_PL_SQ8_act(Arg, ACT_RELUM); +} + +void KerParMatMulTransposedB32_ReLUMN_PL_SQ8(KerMatMul_PL_SQ8_T *Arg) { + KerParMatMulTransposedB32_PL_SQ8_act(Arg, ACT_RELUMN); +} + +void KerParMatMulTransposedB32_LeakyReLU_PL_SQ8(KerMatMul_PL_SQ8_T *Arg) { + KerParMatMulTransposedB32_PL_SQ8_act(Arg, ACT_LEAKYRELU); +} + +void KerParMatMulTransposedB32_HSwish_PL_SQ8(KerMatMul_PL_SQ8_T 
*Arg) { + KerParMatMulTransposedB32_PL_SQ8_act(Arg, ACT_HSWISH); +} + +void KerParMatMulTransposedB32_HSigmoid_PL_SQ8(KerMatMul_PL_SQ8_T *Arg) { + KerParMatMulTransposedB32_PL_SQ8_act(Arg, ACT_HSIGMOID); +} + +void KerParMatMulTransposedB32_Sigmoid_PL_SQ8(KerMatMul_PL_SQ8_T *Arg) { + KerParMatMulTransposedB32_PL_SQ8_act(Arg, ACT_SIGMOID); +} + +void KerParMatMulTransposedB32_Tanh_PL_SQ8(KerMatMul_PL_SQ8_T *Arg) { + KerParMatMulTransposedB32_PL_SQ8_act(Arg, ACT_TANH); +} + +#pragma GCC diagnostic pop diff --git a/tools/autotiler_v3/CNN_Libraries_SQ8/CNN_MatMul_Conv_SQ8.c b/tools/autotiler_v3/CNN_Libraries_SQ8/CNN_MatMul_Conv_SQ8.c index 81e4e8db1..e90a8d3b4 100644 --- a/tools/autotiler_v3/CNN_Libraries_SQ8/CNN_MatMul_Conv_SQ8.c +++ b/tools/autotiler_v3/CNN_Libraries_SQ8/CNN_MatMul_Conv_SQ8.c @@ -16,6 +16,7 @@ #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wsign-compare" +#pragma GCC diagnostic ignored "-Wswitch" #include #include "CNN_BasicKernels_SQ8.h" @@ -53,8 +54,14 @@ static int LastDefinedOutput(int DimIn, int F, int PadL, int Stride, int D) } // #define OLD -void KerPar_MM_Conv1D_SQ8( - Ker_MM_Conv_SQ8_T *Arg + +/* + * 1D Convolutional Kernels kernels based on MatMul (im2col: ColBuff) with CHW inout tensor order + * Optional Activation fused applied to the 32bits accumulator -> ACT_SWITCH defined in CNN_BasicKernels_SQ8.h + */ +static inline void __attribute__((always_inline)) KerPar_MM_Conv1D_SQ8_act( + Ker_MM_Conv_SQ8_T *Arg, + CNN_ActivationOper_T Activation ) { @@ -73,6 +80,9 @@ void KerPar_MM_Conv1D_SQ8( signed char * __restrict__ ColBuff = Arg->ColBuff; int Wo = Arg->Wo, Ho = Arg->Ho; + unsigned char * Infos = Arg->Infos; + unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; + int A0 = *((unsigned char *) &Infos[AT_INF_A0]); int B0 = *((unsigned char *) &Infos[AT_INF_B0]); int C0 = *((unsigned char *) &Infos[AT_INF_C0]); /* ColBuff must be large enough to accomodate 
Align(Fx*InFeat, 8) elements */ v4s * __restrict__ VBuff = (v4s *) ColBuff; @@ -149,7 +159,8 @@ void KerPar_MM_Conv1D_SQ8( S0 = gap_sumdotp4(V1, C1, S0); } unsigned int Sc = Scale[Line], ScN = ScaleN[Line]; - Out[Line*Wo*Ho + l*Wo + c] = gap_clip(AT_SCALE(S0, Sc, ScN), 7); + S0 = AT_SCALE(S0, Sc, ScN); ACT_SWITCH(S0, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + Out[Line*Wo*Ho + l*Wo + c] = gap_clip(S0, 7); } gap_waitbarrier(0); } @@ -157,81 +168,54 @@ void KerPar_MM_Conv1D_SQ8( } } -static void __attribute__ ((noinline)) MatMul2Out( - signed char *__restrict__ pI, - signed char *__restrict__ pC, - int *__restrict__ pBias, - unsigned char *__restrict__ pSc, - unsigned char *__restrict__ pScN, - signed char *__restrict__ pOut0, - signed char *__restrict__ pOut1, - unsigned int InFeat, - unsigned int IterOut, - unsigned int NormBias - ) +void KerPar_MM_Conv1D_SQ8(Ker_MM_Conv_SQ8_T *Arg) { + KerPar_MM_Conv1D_SQ8_act(Arg, ACT_NONE); +} -{ - for (int i=0; i<(IterOut/4); i++) { - signed char *pIn0 = pI, *pIn1 = pIn0 + InFeat, - *pC0 = pC, *pC1 = pC0+InFeat, *pC2 = pC1+InFeat, *pC3 = pC2+InFeat; - pC+=4; - int S00 = (*pBias)< ACT_SWITCH defined in CNN_BasicKernels_SQ8.h + */ +static inline void __attribute__((always_inline)) KerPar_MM_Conv1x1_HWC_SQ8_act( + Ker_MM_Conv_SQ8_T *Arg, + CNN_ActivationOper_T Activation ) { @@ -251,6 +235,9 @@ void KerPar_MM_Conv1x1_HWC_SQ8( unsigned char * __restrict__ ScaleN = Arg->ScaleN; int Wo = Arg->Wo, Ho = Arg->Ho; + unsigned char * Infos = Arg->Infos; + unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; + int A0 = *((unsigned char *) &Infos[AT_INF_A0]); int B0 = *((unsigned char *) &Infos[AT_INF_B0]); int C0 = *((unsigned char *) &Infos[AT_INF_C0]); unsigned int CoreId = gap_coreid(); unsigned int ChunkCell = ChunkSize(OutFeat), First = CoreId*ChunkCell, Last = Min(OutFeat, First+ChunkCell); @@ -294,17 +281,25 @@ void KerPar_MM_Conv1x1_HWC_SQ8( } unsigned 
int Sc, ScN; Sc = *pSc; ScN = *pScN; pSc++; pScN++; - *pOut0 = gap_clip(AT_SCALE(S00, Sc, ScN), 7); pOut0++; - *pOut1 = gap_clip(AT_SCALE(S01, Sc, ScN), 7); pOut1++; - Sc = *pSc; ScN = *pScN; pSc++; pScN++; - *pOut0 = gap_clip(AT_SCALE(S10, Sc, ScN), 7); pOut0++; - *pOut1 = gap_clip(AT_SCALE(S11, Sc, ScN), 7); pOut1++; - Sc = *pSc; ScN = *pScN; pSc++; pScN++; - *pOut0 = gap_clip(AT_SCALE(S20, Sc, ScN), 7); pOut0++; - *pOut1 = gap_clip(AT_SCALE(S21, Sc, ScN), 7); pOut1++; - Sc = *pSc; ScN = *pScN; pSc++; pScN++; - *pOut0 = gap_clip(AT_SCALE(S30, Sc, ScN), 7); pOut0++; - *pOut1 = gap_clip(AT_SCALE(S31, Sc, ScN), 7); pOut1++; + S00 = AT_SCALE(S00, Sc, ScN); ACT_SWITCH(S00, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + S01 = AT_SCALE(S01, Sc, ScN); ACT_SWITCH(S01, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + *pOut0 = gap_clip(S00, 7); pOut0++; + *pOut1 = gap_clip(S01, 7); pOut1++; + Sc = *pSc; ScN = *pScN; pSc++; pScN++; + S10 = AT_SCALE(S10, Sc, ScN); ACT_SWITCH(S10, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + S11 = AT_SCALE(S11, Sc, ScN); ACT_SWITCH(S11, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + *pOut0 = gap_clip(S10, 7); pOut0++; + *pOut1 = gap_clip(S11, 7); pOut1++; + Sc = *pSc; ScN = *pScN; pSc++; pScN++; + S20 = AT_SCALE(S20, Sc, ScN); ACT_SWITCH(S20, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + S21 = AT_SCALE(S21, Sc, ScN); ACT_SWITCH(S21, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + *pOut0 = gap_clip(S20, 7); pOut0++; + *pOut1 = gap_clip(S21, 7); pOut1++; + Sc = *pSc; ScN = *pScN; pSc++; pScN++; + S30 = AT_SCALE(S30, Sc, ScN); ACT_SWITCH(S30, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + S31 = AT_SCALE(S31, Sc, ScN); ACT_SWITCH(S31, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + *pOut0 = gap_clip(S30, 7); pOut0++; + *pOut1 = gap_clip(S31, 7); pOut1++; } for (int i=4*(IterOut/4); iIn; - int W = Arg->W, H = Arg->H; - signed char *__restrict__ Filter = Arg->Filter; - int Sx = Arg->Sx, Sy = 
Arg->Sy; - unsigned int InFeat = Arg->InFeat, OutFeat = Arg->OutFeat; +void KerPar_MM_Conv1x1_ReLUN_HWC_SQ8(Ker_MM_Conv_SQ8_T *Arg) { + KerPar_MM_Conv1x1_HWC_SQ8_act(Arg, ACT_RELUN); +} - int * __restrict__ Bias = Arg->Bias; - int NormBias = Arg->Infos[AT_INF_BIASN]; - signed char * __restrict__ Out = Arg->Out; - unsigned char * __restrict__ Scale = Arg->Scale; - unsigned char * __restrict__ ScaleN = Arg->ScaleN; +void KerPar_MM_Conv1x1_ReLUM_HWC_SQ8(Ker_MM_Conv_SQ8_T *Arg) { + KerPar_MM_Conv1x1_HWC_SQ8_act(Arg, ACT_RELUM); +} - int Wo = Arg->Wo, Ho = Arg->Ho; +void KerPar_MM_Conv1x1_ReLUMN_HWC_SQ8(Ker_MM_Conv_SQ8_T *Arg) { + KerPar_MM_Conv1x1_HWC_SQ8_act(Arg, ACT_RELUMN); +} - unsigned int CoreId = gap_coreid(); - unsigned int ChunkCell = ChunkSize(OutFeat), First = CoreId*ChunkCell, Last = Min(OutFeat, First+ChunkCell); - int IterOut = Last - First; +void KerPar_MM_Conv1x1_LeakyReLU_HWC_SQ8(Ker_MM_Conv_SQ8_T *Arg) { + KerPar_MM_Conv1x1_HWC_SQ8_act(Arg, ACT_LEAKYRELU); +} - int PosL = 0; - for (int l=0; l ACT_SWITCH defined in CNN_BasicKernels_SQ8.h + */ +static inline void __attribute__((always_inline)) Ker_MM_Conv1x1_HWC_SQ8_act( + Ker_MM_Conv_SQ8_T *Arg, + CNN_ActivationOper_T Activation ) { @@ -576,6 +462,9 @@ void Ker_MM_Conv1x1_HWC_SQ8( unsigned char * __restrict__ ScaleN = Arg->ScaleN; int Wo = Arg->Wo, Ho = Arg->Ho; + unsigned char * Infos = Arg->Infos; + unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; + int A0 = *((unsigned char *) &Infos[AT_INF_A0]); int B0 = *((unsigned char *) &Infos[AT_INF_B0]); int C0 = *((unsigned char *) &Infos[AT_INF_C0]); unsigned int CoreId = gap_coreid(); unsigned int ChunkCell = ChunkSize(Ho), First = Min(Ho, CoreId*ChunkCell), Last = Min(Ho, First+ChunkCell); @@ -618,17 +507,25 @@ void Ker_MM_Conv1x1_HWC_SQ8( } unsigned int Sc, ScN; Sc = *pSc; ScN = *pScN; pSc++; pScN++; - *pOut0 = gap_clip(AT_SCALE(S00, Sc, ScN), 7); pOut0++; - *pOut1 = 
gap_clip(AT_SCALE(S01, Sc, ScN), 7); pOut1++; - Sc = *pSc; ScN = *pScN; pSc++; pScN++; - *pOut0 = gap_clip(AT_SCALE(S10, Sc, ScN), 7); pOut0++; - *pOut1 = gap_clip(AT_SCALE(S11, Sc, ScN), 7); pOut1++; - Sc = *pSc; ScN = *pScN; pSc++; pScN++; - *pOut0 = gap_clip(AT_SCALE(S20, Sc, ScN), 7); pOut0++; - *pOut1 = gap_clip(AT_SCALE(S21, Sc, ScN), 7); pOut1++; - Sc = *pSc; ScN = *pScN; pSc++; pScN++; - *pOut0 = gap_clip(AT_SCALE(S30, Sc, ScN), 7); pOut0++; - *pOut1 = gap_clip(AT_SCALE(S31, Sc, ScN), 7); pOut1++; + S00 = AT_SCALE(S00, Sc, ScN); ACT_SWITCH(S00, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + S01 = AT_SCALE(S01, Sc, ScN); ACT_SWITCH(S01, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + *pOut0 = gap_clip(S00, 7); pOut0++; + *pOut1 = gap_clip(S01, 7); pOut1++; + Sc = *pSc; ScN = *pScN; pSc++; pScN++; + S10 = AT_SCALE(S10, Sc, ScN); ACT_SWITCH(S10, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + S11 = AT_SCALE(S11, Sc, ScN); ACT_SWITCH(S11, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + *pOut0 = gap_clip(S10, 7); pOut0++; + *pOut1 = gap_clip(S11, 7); pOut1++; + Sc = *pSc; ScN = *pScN; pSc++; pScN++; + S20 = AT_SCALE(S20, Sc, ScN); ACT_SWITCH(S20, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + S21 = AT_SCALE(S21, Sc, ScN); ACT_SWITCH(S21, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + *pOut0 = gap_clip(S20, 7); pOut0++; + *pOut1 = gap_clip(S21, 7); pOut1++; + Sc = *pSc; ScN = *pScN; pSc++; pScN++; + S30 = AT_SCALE(S30, Sc, ScN); ACT_SWITCH(S30, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + S31 = AT_SCALE(S31, Sc, ScN); ACT_SWITCH(S31, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + *pOut0 = gap_clip(S30, 7); pOut0++; + *pOut1 = gap_clip(S31, 7); pOut1++; } for (int i=4*(OutFeat/4); i ACT_SWITCH defined in CNN_BasicKernels_SQ8.h + */ +static inline void __attribute__((always_inline)) KerPar_MM_Conv1D_HWC_SQ8_act( + Ker_MM_Conv_SQ8_T *Arg, + CNN_ActivationOper_T Activation ) { @@ -726,184 +675,27 @@ void 
Ker_MM_Conv1x1_ReLU_HWC_SQ8( signed char *__restrict__ In = Arg->In; int W = Arg->W, H = Arg->H; signed char *__restrict__ Filter = Arg->Filter; - int Sx = Arg->Sx, Sy = Arg->Sy; - unsigned int InFeat = Arg->InFeat, OutFeat = Arg->OutFeat; + int Fx = Arg->Fx, Sx = Arg->Sx, Sy = Arg->Sy; + int PadL = Arg->Pad[0]; + int InFeat = Arg->InFeat, OutFeat = Arg->OutFeat; int * __restrict__ Bias = Arg->Bias; int NormBias = Arg->Infos[AT_INF_BIASN]; signed char * __restrict__ Out = Arg->Out; unsigned char * __restrict__ Scale = Arg->Scale; unsigned char * __restrict__ ScaleN = Arg->ScaleN; + signed char * __restrict__ ColBuff = Arg->ColBuff; int Wo = Arg->Wo, Ho = Arg->Ho; + unsigned char * Infos = Arg->Infos; + unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; + int A0 = *((unsigned char *) &Infos[AT_INF_A0]); int B0 = *((unsigned char *) &Infos[AT_INF_B0]); int C0 = *((unsigned char *) &Infos[AT_INF_C0]); - unsigned int CoreId = gap_coreid(); - unsigned int ChunkCell = ChunkSize(Ho), First = Min(Ho, CoreId*ChunkCell), Last = Min(Ho, First+ChunkCell); - int IterOut = Last - First; - - for (int l=First; lIn; - int W = Arg->W, H = Arg->H; - signed char *__restrict__ Filter = Arg->Filter; - int Fx = Arg->Fx, Sx = Arg->Sx, Sy = Arg->Sy; - int PadL = Arg->Pad[0]; - int InFeat = Arg->InFeat, OutFeat = Arg->OutFeat; - - int * __restrict__ Bias = Arg->Bias; - int NormBias = Arg->Infos[AT_INF_BIASN]; - signed char * __restrict__ Out = Arg->Out; - unsigned char * __restrict__ Scale = Arg->Scale; - unsigned char * __restrict__ ScaleN = Arg->ScaleN; - signed char * __restrict__ ColBuff = Arg->ColBuff; - - int Wo = Arg->Wo, Ho = Arg->Ho; - - /* ColBuff must be large enough to accomodate Align(Fx*InFeat, 8) elements */ - v4s * __restrict__ VBuff = (v4s *) ColBuff; - unsigned int W_In1 = InFeat*Fx; - unsigned int CoreId = gap_coreid(), C = ChunkSize(InFeat), F = Min(CoreId*C, InFeat), L = Min(InFeat, F+C); - unsigned 
int ChunkCell = ChunkSize(OutFeat), First = CoreId*ChunkCell, Last = Min(OutFeat, First+ChunkCell); + /* ColBuff must be large enough to accomodate Align(Fx*InFeat, 8) elements */ + v4s * __restrict__ VBuff = (v4s *) ColBuff; + unsigned int W_In1 = InFeat*Fx; + unsigned int CoreId = gap_coreid(), C = ChunkSize(InFeat), F = Min(CoreId*C, InFeat), L = Min(InFeat, F+C); + unsigned int ChunkCell = ChunkSize(OutFeat), First = CoreId*ChunkCell, Last = Min(OutFeat, First+ChunkCell); int Tail = 2*((W_In1+7)/8); signed char * __restrict__ ColBuff1 = ColBuff + 4*Tail; @@ -992,10 +784,16 @@ void KerPar_MM_Conv1D_HWC_SQ8( S3 += V0*C3; S7 += V1*C3; pIn++; pIn1++; pC0++; pC1++; pC2++; pC3++; } - v4s R1 = gap_pack4(gap_clip(AT_SCALE(S0, pSc[4*Line ], pScN[4*Line ]), 7), gap_clip(AT_SCALE(S1, pSc[4*Line+1], pScN[4*Line+1]), 7), - gap_clip(AT_SCALE(S2, pSc[4*Line+2], pScN[4*Line+2]), 7), gap_clip(AT_SCALE(S3, pSc[4*Line+3], pScN[4*Line+3]), 7)); - v4s R2 = gap_pack4(gap_clip(AT_SCALE(S4, pSc[4*Line ], pScN[4*Line ]), 7), gap_clip(AT_SCALE(S5, pSc[4*Line+1], pScN[4*Line+1]), 7), - gap_clip(AT_SCALE(S6, pSc[4*Line+2], pScN[4*Line+2]), 7), gap_clip(AT_SCALE(S7, pSc[4*Line+3], pScN[4*Line+3]), 7)); + S0 = AT_SCALE(S0, pSc[4*Line ], pScN[4*Line ]); ACT_SWITCH(S0, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + S1 = AT_SCALE(S1, pSc[4*Line+1], pScN[4*Line+1]); ACT_SWITCH(S1, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + S2 = AT_SCALE(S2, pSc[4*Line+2], pScN[4*Line+2]); ACT_SWITCH(S2, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + S3 = AT_SCALE(S3, pSc[4*Line+3], pScN[4*Line+3]); ACT_SWITCH(S3, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + S4 = AT_SCALE(S4, pSc[4*Line ], pScN[4*Line ]); ACT_SWITCH(S4, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + S5 = AT_SCALE(S5, pSc[4*Line+1], pScN[4*Line+1]); ACT_SWITCH(S5, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + S6 = AT_SCALE(S6, pSc[4*Line+2], pScN[4*Line+2]); ACT_SWITCH(S6, Activation, ActScale, 
ActScaleN, A0, B0, C0, 8, 0); + S7 = AT_SCALE(S7, pSc[4*Line+3], pScN[4*Line+3]); ACT_SWITCH(S7, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + v4s R1 = gap_pack4(gap_clip(S0, 7), gap_clip(S1, 7), gap_clip(S2, 7), gap_clip(S3, 7)); + v4s R2 = gap_pack4(gap_clip(S4, 7), gap_clip(S5, 7), gap_clip(S6, 7), gap_clip(S7, 7)); *((v4s *) (pOut0+4*Line)) = R1; *((v4s *) (pOut1+4*Line)) = R2; } @@ -1013,8 +811,10 @@ void KerPar_MM_Conv1D_HWC_SQ8( S0 += V0*C0; S4 += V1*C0; pIn++; pIn1++; pC++; } - *(pOut0+i) = gap_clip(AT_SCALE(S0, pSc[i], pScN[i]), 7); - *(pOut1+i) = gap_clip(AT_SCALE(S4, pSc[i], pScN[i]), 7); + S0 = AT_SCALE(S0, pSc[i], pScN[i]); ACT_SWITCH(S0, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + S4 = AT_SCALE(S4, pSc[i], pScN[i]); ACT_SWITCH(S4, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + *(pOut0+i) = gap_clip(S0, 7); + *(pOut1+i) = gap_clip(S4, 7); } gap_waitbarrier(0); } @@ -1068,8 +868,11 @@ void KerPar_MM_Conv1D_HWC_SQ8( S3 += V0*C3; pIn++; pC0++; pC1++; pC2++; pC3++; } - v4s R1 = gap_pack4(gap_clip(AT_SCALE(S0, pSc[4*Line ], pScN[4*Line ]), 7), gap_clip(AT_SCALE(S1, pSc[4*Line+1], pScN[4*Line+1]), 7), - gap_clip(AT_SCALE(S2, pSc[4*Line+2], pScN[4*Line+2]), 7), gap_clip(AT_SCALE(S3, pSc[4*Line+3], pScN[4*Line+3]), 7)); + S0 = AT_SCALE(S0, pSc[4*Line ], pScN[4*Line ]); ACT_SWITCH(S0, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + S1 = AT_SCALE(S1, pSc[4*Line+1], pScN[4*Line+1]); ACT_SWITCH(S1, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + S2 = AT_SCALE(S2, pSc[4*Line+2], pScN[4*Line+2]); ACT_SWITCH(S2, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + S3 = AT_SCALE(S3, pSc[4*Line+3], pScN[4*Line+3]); ACT_SWITCH(S3, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + v4s R1 = gap_pack4(gap_clip(S0, 7), gap_clip(S1, 7), gap_clip(S2, 7), gap_clip(S3, 7)); *((v4s *) (pOut0+4*Line)) = R1; } for (int i=4*(IterOut/4); i ACT_SWITCH defined in CNN_BasicKernels_SQ8.h + */ +static inline void __attribute__((always_inline)) 
KerPar_MM_Conv1D_DxDy_SQ8_act( + Ker_MM_Conv_SQ8_T *Arg, + CNN_ActivationOper_T Activation ) { @@ -1113,6 +963,9 @@ void KerPar_MM_Conv1D_DxDy_SQ8( signed char * __restrict__ ColBuff = Arg->ColBuff; int Wo = Arg->Wo, Ho = Arg->Ho; + unsigned char * Infos = Arg->Infos; + unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; + int A0 = *((unsigned char *) &Infos[AT_INF_A0]); int B0 = *((unsigned char *) &Infos[AT_INF_B0]); int C0 = *((unsigned char *) &Infos[AT_INF_C0]); /* ColBuff must be large enough to accomodate Align(Fx*InFeat, 8) elements */ v4s * __restrict__ VBuff = (v4s *) ColBuff; @@ -1157,7 +1010,8 @@ void KerPar_MM_Conv1D_DxDy_SQ8( S0 = gap_sumdotp4(V1, C1, S0); } unsigned int Sc = Scale[Line], ScN = ScaleN[Line]; - Out[Line*Wo*Ho + l*Wo + c] = gap_clip(AT_SCALE(S0, Sc, ScN), 7); + S0 = AT_SCALE(S0, Sc, ScN); ACT_SWITCH(S0, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + Out[Line*Wo*Ho + l*Wo + c] = gap_clip(S0, 7); } gap_waitbarrier(0); } @@ -1165,8 +1019,54 @@ void KerPar_MM_Conv1D_DxDy_SQ8( } } -void KerPar_MM_Conv1D_DxDy_HWC_SQ8( - Ker_MM_Conv_SQ8_T *Arg +void KerPar_MM_Conv1D_DxDy_SQ8(Ker_MM_Conv_SQ8_T *Arg) { + KerPar_MM_Conv1D_DxDy_SQ8_act(Arg, ACT_NONE); +} + +void KerPar_MM_Conv1D_DxDy_ReLU_SQ8(Ker_MM_Conv_SQ8_T *Arg) { + KerPar_MM_Conv1D_DxDy_SQ8_act(Arg, ACT_RELU); +} + +void KerPar_MM_Conv1D_DxDy_ReLUN_SQ8(Ker_MM_Conv_SQ8_T *Arg) { + KerPar_MM_Conv1D_DxDy_SQ8_act(Arg, ACT_RELUN); +} + +void KerPar_MM_Conv1D_DxDy_ReLUM_SQ8(Ker_MM_Conv_SQ8_T *Arg) { + KerPar_MM_Conv1D_DxDy_SQ8_act(Arg, ACT_RELUM); +} + +void KerPar_MM_Conv1D_DxDy_ReLUMN_SQ8(Ker_MM_Conv_SQ8_T *Arg) { + KerPar_MM_Conv1D_DxDy_SQ8_act(Arg, ACT_RELUMN); +} + +void KerPar_MM_Conv1D_DxDy_LeakyReLU_SQ8(Ker_MM_Conv_SQ8_T *Arg) { + KerPar_MM_Conv1D_DxDy_SQ8_act(Arg, ACT_LEAKYRELU); +} + +void KerPar_MM_Conv1D_DxDy_HSwish_SQ8(Ker_MM_Conv_SQ8_T *Arg) { + KerPar_MM_Conv1D_DxDy_SQ8_act(Arg, ACT_HSWISH); +} + +void 
KerPar_MM_Conv1D_DxDy_HSigmoid_SQ8(Ker_MM_Conv_SQ8_T *Arg) { + KerPar_MM_Conv1D_DxDy_SQ8_act(Arg, ACT_HSIGMOID); +} + +void KerPar_MM_Conv1D_DxDy_Sigmoid_SQ8(Ker_MM_Conv_SQ8_T *Arg) { + KerPar_MM_Conv1D_DxDy_SQ8_act(Arg, ACT_SIGMOID); +} + +void KerPar_MM_Conv1D_DxDy_Tanh_SQ8(Ker_MM_Conv_SQ8_T *Arg) { + KerPar_MM_Conv1D_DxDy_SQ8_act(Arg, ACT_TANH); +} + + +/* + * 1D Convolutional Kernels with Dilation kernels based on MatMul (im2col: ColBuff) with HWC inout tensor order + * Optional Activation fused applied to the 32bits accumulator -> ACT_SWITCH defined in CNN_BasicKernels_SQ8.h + */ +static inline void __attribute__((always_inline)) KerPar_MM_Conv1D_DxDy_HWC_SQ8_act( + Ker_MM_Conv_SQ8_T *Arg, + CNN_ActivationOper_T Activation ) { @@ -1189,6 +1089,10 @@ void KerPar_MM_Conv1D_DxDy_HWC_SQ8( int Wo = Arg->Wo, Ho = Arg->Ho; + unsigned char * Infos = Arg->Infos; + unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; + int A0 = *((unsigned char *) &Infos[AT_INF_A0]); int B0 = *((unsigned char *) &Infos[AT_INF_B0]); int C0 = *((unsigned char *) &Infos[AT_INF_C0]); + /* ColBuff must be large enough to accomodate Align(Fx*InFeat, 8) elements */ v4s * __restrict__ VBuff = (v4s *) ColBuff; unsigned int W_In1 = InFeat*Fx; @@ -1306,10 +1210,16 @@ void KerPar_MM_Conv1D_DxDy_HWC_SQ8( S3 += V0*C3; S7 += V1*C3; pIn++; pIn1++; pC0++; pC1++; pC2++; pC3++; } - v4s R1 = gap_pack4(gap_clip(AT_SCALE(S0, pSc[4*Line ], pScN[4*Line ]), 7), gap_clip(AT_SCALE(S1, pSc[4*Line+1], pScN[4*Line+1]), 7), - gap_clip(AT_SCALE(S2, pSc[4*Line+2], pScN[4*Line+2]), 7), gap_clip(AT_SCALE(S3, pSc[4*Line+3], pScN[4*Line+3]), 7)); - v4s R2 = gap_pack4(gap_clip(AT_SCALE(S4, pSc[4*Line ], pScN[4*Line ]), 7), gap_clip(AT_SCALE(S5, pSc[4*Line+1], pScN[4*Line+1]), 7), - gap_clip(AT_SCALE(S6, pSc[4*Line+2], pScN[4*Line+2]), 7), gap_clip(AT_SCALE(S7, pSc[4*Line+3], pScN[4*Line+3]), 7)); + S0 = AT_SCALE(S0, pSc[4*Line ], pScN[4*Line ]); 
ACT_SWITCH(S0, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + S1 = AT_SCALE(S1, pSc[4*Line+1], pScN[4*Line+1]); ACT_SWITCH(S1, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + S2 = AT_SCALE(S2, pSc[4*Line+2], pScN[4*Line+2]); ACT_SWITCH(S2, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + S3 = AT_SCALE(S3, pSc[4*Line+3], pScN[4*Line+3]); ACT_SWITCH(S3, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + S4 = AT_SCALE(S4, pSc[4*Line ], pScN[4*Line ]); ACT_SWITCH(S4, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + S5 = AT_SCALE(S5, pSc[4*Line+1], pScN[4*Line+1]); ACT_SWITCH(S5, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + S6 = AT_SCALE(S6, pSc[4*Line+2], pScN[4*Line+2]); ACT_SWITCH(S6, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + S7 = AT_SCALE(S7, pSc[4*Line+3], pScN[4*Line+3]); ACT_SWITCH(S7, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + v4s R1 = gap_pack4(gap_clip(S0, 7), gap_clip(S1, 7), gap_clip(S2, 7), gap_clip(S3, 7)); + v4s R2 = gap_pack4(gap_clip(S4, 7), gap_clip(S5, 7), gap_clip(S6, 7), gap_clip(S7, 7)); *((v4s *) (pOut0+4*Line)) = R1; *((v4s *) (pOut1+4*Line)) = R2; } @@ -1327,8 +1237,10 @@ void KerPar_MM_Conv1D_DxDy_HWC_SQ8( S0 += V0*C0; S4 += V1*C0; pIn++; pIn1++; pC++; } - *(pOut0+i) = gap_clip(AT_SCALE(S0, pSc[i], pScN[i]), 7); - *(pOut1+i) = gap_clip(AT_SCALE(S4, pSc[i], pScN[i]), 7); + S0 = AT_SCALE(S0, pSc[i], pScN[i]); ACT_SWITCH(S0, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + S4 = AT_SCALE(S4, pSc[i], pScN[i]); ACT_SWITCH(S4, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + *(pOut0+i) = gap_clip(S0, 7); + *(pOut1+i) = gap_clip(S4, 7); } gap_waitbarrier(0); } @@ -1393,8 +1305,11 @@ void KerPar_MM_Conv1D_DxDy_HWC_SQ8( S3 += V0*C3; pIn++; pC0++; pC1++; pC2++; pC3++; } - v4s R1 = gap_pack4(gap_clip(AT_SCALE(S0, pSc[4*Line ], pScN[4*Line ]), 7), gap_clip(AT_SCALE(S1, pSc[4*Line+1], pScN[4*Line+1]), 7), - gap_clip(AT_SCALE(S2, pSc[4*Line+2], pScN[4*Line+2]), 7), gap_clip(AT_SCALE(S3, 
pSc[4*Line+3], pScN[4*Line+3]), 7)); + S0 = AT_SCALE(S0, pSc[4*Line ], pScN[4*Line ]); ACT_SWITCH(S0, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + S1 = AT_SCALE(S1, pSc[4*Line+1], pScN[4*Line+1]); ACT_SWITCH(S1, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + S2 = AT_SCALE(S2, pSc[4*Line+2], pScN[4*Line+2]); ACT_SWITCH(S2, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + S3 = AT_SCALE(S3, pSc[4*Line+3], pScN[4*Line+3]); ACT_SWITCH(S3, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + v4s R1 = gap_pack4(gap_clip(S0, 7), gap_clip(S1, 7), gap_clip(S2, 7), gap_clip(S3, 7)); *((v4s *) (pOut0+4*Line)) = R1; } for (int i=4*(IterOut/4); iIn; - int W = Arg->W, H = Arg->H; - signed char *__restrict__ Filter = Arg->Filter; - int Fx = Arg->Fx, Sx = Arg->Sx, Sy = Arg->Sy, Dx = Arg->Dx; - int PadL = Arg->Pad[0]; - int InFeat = Arg->InFeat, OutFeat = Arg->OutFeat; +void KerPar_MM_Conv1D_DxDy_ReLU_HWC_SQ8(Ker_MM_Conv_SQ8_T *Arg) { + KerPar_MM_Conv1D_DxDy_HWC_SQ8_act(Arg, ACT_RELU); +} - int * __restrict__ Bias = Arg->Bias; - int NormBias = Arg->Infos[AT_INF_BIASN]; - signed char * __restrict__ Out = Arg->Out; - unsigned char * __restrict__ Scale = Arg->Scale; - unsigned char * __restrict__ ScaleN = Arg->ScaleN; - signed char * __restrict__ ColBuff = Arg->ColBuff; +void KerPar_MM_Conv1D_DxDy_ReLUN_HWC_SQ8(Ker_MM_Conv_SQ8_T *Arg) { + KerPar_MM_Conv1D_DxDy_HWC_SQ8_act(Arg, ACT_RELUN); +} - int Wo = Arg->Wo, Ho = Arg->Ho; +void KerPar_MM_Conv1D_DxDy_ReLUM_HWC_SQ8(Ker_MM_Conv_SQ8_T *Arg) { + KerPar_MM_Conv1D_DxDy_HWC_SQ8_act(Arg, ACT_RELUM); +} - /* ColBuff must be large enough to accomodate Align(Fx*InFeat, 8) elements */ - v4s * __restrict__ VBuff = (v4s *) ColBuff; - unsigned int W_In1 = InFeat*Fx; - unsigned int CoreId = gap_coreid(), C = ChunkSize(InFeat), F = Min(CoreId*C, InFeat), L = Min(InFeat, F+C); - unsigned int ChunkCell = ChunkSize(OutFeat), First = CoreId*ChunkCell, Last = Min(OutFeat, First+ChunkCell); +void 
KerPar_MM_Conv1D_DxDy_ReLUMN_HWC_SQ8(Ker_MM_Conv_SQ8_T *Arg) { + KerPar_MM_Conv1D_DxDy_HWC_SQ8_act(Arg, ACT_RELUMN); +} - int Tail = 2*((W_In1+7)/8); - ((int *)ColBuff)[Tail-1] = 0; ((int *)ColBuff)[Tail-2] = 0; - - int DFx = Dx*(Fx-1)+1; - // int Prec=10; - int InvDx = ((1< ACT_SWITCH defined in CNN_BasicKernels_SQ8.h + */ +static inline void __attribute__((always_inline)) KerPar_MM_Conv2D_SQ8_act( + Ker_MM_Conv_SQ8_T *Arg, + CNN_ActivationOper_T Activation ) { signed char *__restrict__ In = Arg->In; int W = Arg->W, H = Arg->H; signed char *__restrict__ Filter = Arg->Filter; - int Fx = Arg->Fx, Sx = Arg->Sx, Sy = Arg->Sy; - int PadL = Arg->Pad[0]; + int Fx = Arg->Fx, Sx = Arg->Sx; + int Fy = Arg->Fy, Sy = Arg->Sy; + int PadL = Arg->Pad[0], PadT = Arg->Pad[2]; int InFeat = Arg->InFeat, OutFeat = Arg->OutFeat; int * __restrict__ Bias = Arg->Bias; int NormBias = Arg->Infos[AT_INF_BIASN]; @@ -1503,67 +1397,105 @@ void KerPar_MM_Conv1D_ReLU_SQ8( signed char * __restrict__ ColBuff = Arg->ColBuff; int Wo = Arg->Wo, Ho = Arg->Ho; + unsigned char * Infos = Arg->Infos; + unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; + int A0 = *((unsigned char *) &Infos[AT_INF_A0]); int B0 = *((unsigned char *) &Infos[AT_INF_B0]); int C0 = *((unsigned char *) &Infos[AT_INF_C0]); + /* ColBuff must be large enough to accomodate Align(Fx*InFeat, 8) elements */ v4s * __restrict__ VBuff = (v4s *) ColBuff; - unsigned int W_In1 = InFeat*Fx; + unsigned int W_In1 = InFeat*Fx*Fy; unsigned int CoreId = gap_coreid(), C = ChunkSize(InFeat), F = Min(CoreId*C, InFeat), L = Min(InFeat, F+C); unsigned int ChunkCell = ChunkSize(OutFeat), First = CoreId*ChunkCell, Last = Min(OutFeat, First+ChunkCell); + int FS = Fx*Fy; int Tail = 2*((W_In1+7)/8); ((int *)ColBuff)[Tail-1] = 0; ((int *)ColBuff)[Tail-2] = 0; - int PosL = 0; + int PosL = Arg->FirstTile?(-PadT):0; int Iter = L-F; - int Iter1 = Iter*Fx; + int Iter1 = Iter*FS; + + // 
printf("If: %3d, Of: %3d, W: %3d, H: %3d, Wo: %3d, Ho: %3d, PosL: %d\n", InFeat, OutFeat, W, H, Wo, Ho, PosL); for (int l=0; l4) { if (Size&0x2) { if (Size&0x1) { for (int f=F; f=2) { if (Size&0x4) { for (int f=F; f> %d = %d\n", Line, l, c, S0, Sc, ScN, gap_clip(AT_SCALE(S0, Sc, ScN), 7)); + S0 = AT_SCALE(S0, Sc, ScN); ACT_SWITCH(S0, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + Out[Line*Wo*Ho + l*Wo + c] = gap_clip(S0, 7); } gap_waitbarrier(0); } PosL += Sy; } + gap_waitbarrier(0); } -void KerPar_MM_Conv1D_ReLUN_SQ8( - Ker_MM_Conv_SQ8_T *Arg - ) +void KerPar_MM_Conv2D_SQ8(Ker_MM_Conv_SQ8_T *Arg) { + KerPar_MM_Conv2D_SQ8_act(Arg, ACT_NONE); +} -{ - signed char *__restrict__ In = Arg->In; - int W = Arg->W, H = Arg->H; - signed char *__restrict__ Filter = Arg->Filter; - int Fx = Arg->Fx, Sx = Arg->Sx, Sy = Arg->Sy; - int PadL = Arg->Pad[0]; - int InFeat = Arg->InFeat, OutFeat = Arg->OutFeat; - int * __restrict__ Bias = Arg->Bias; - int NormBias = Arg->Infos[AT_INF_BIASN]; - signed char * __restrict__ Out = Arg->Out; - unsigned char * __restrict__ Scale = Arg->Scale; - unsigned char * __restrict__ ScaleN = Arg->ScaleN; - signed char * __restrict__ ColBuff = Arg->ColBuff; - int Wo = Arg->Wo, Ho = Arg->Ho; - int A0 = Arg->Infos[AT_INF_A0]; +void KerPar_MM_Conv2D_ReLU_SQ8(Ker_MM_Conv_SQ8_T *Arg) { + KerPar_MM_Conv2D_SQ8_act(Arg, ACT_RELU); +} - /* ColBuff must be large enough to accomodate Align(Fx*InFeat, 8) elements */ - v4s * __restrict__ VBuff = (v4s *) ColBuff; - unsigned int W_In1 = InFeat*Fx; - unsigned int CoreId = gap_coreid(), C = ChunkSize(InFeat), F = Min(CoreId*C, InFeat), L = Min(InFeat, F+C); - unsigned int ChunkCell = ChunkSize(OutFeat), First = CoreId*ChunkCell, Last = Min(OutFeat, First+ChunkCell); +void KerPar_MM_Conv2D_ReLUN_SQ8(Ker_MM_Conv_SQ8_T *Arg) { + KerPar_MM_Conv2D_SQ8_act(Arg, ACT_RELUN); +} - int Tail = 2*((W_In1+7)/8); - ((int *)ColBuff)[Tail-1] = 0; ((int *)ColBuff)[Tail-2] = 0; - int PosL = 0; - int Iter = L-F; - int Iter1 = 
Iter*Fx; - for (int l=0; l4) { - if (Size&0x2) { - if (Size&0x1) { - for (int f=F; f=2) { - if (Size&0x4) { - for (int f=F; fIn; - int W = Arg->W, H = Arg->H; - signed char *__restrict__ Filter = Arg->Filter; - int Fx = Arg->Fx, Sx = Arg->Sx, Sy = Arg->Sy; - int PadL = Arg->Pad[0]; - int InFeat = Arg->InFeat, OutFeat = Arg->OutFeat; - int * __restrict__ Bias = Arg->Bias; - int NormBias = Arg->Infos[AT_INF_BIASN]; - signed char * __restrict__ Out = Arg->Out; - unsigned char * __restrict__ Scale = Arg->Scale; - unsigned char * __restrict__ ScaleN = Arg->ScaleN; - signed char * __restrict__ ColBuff = Arg->ColBuff; - int Wo = Arg->Wo, Ho = Arg->Ho; - int A0 = Arg->Infos[AT_INF_A0]; - unsigned int ActScale = ((unsigned char *)Arg->Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Arg->Infos)[AT_INF_ACTSCALEN]; +void KerPar_MM_Conv2D_LeakyReLU_SQ8(Ker_MM_Conv_SQ8_T *Arg) { + KerPar_MM_Conv2D_SQ8_act(Arg, ACT_LEAKYRELU); +} - /* ColBuff must be large enough to accomodate Align(Fx*InFeat, 8) elements */ - v4s * __restrict__ VBuff = (v4s *) ColBuff; - unsigned int W_In1 = InFeat*Fx; - unsigned int CoreId = gap_coreid(), C = ChunkSize(InFeat), F = Min(CoreId*C, InFeat), L = Min(InFeat, F+C); - unsigned int ChunkCell = ChunkSize(OutFeat), First = CoreId*ChunkCell, Last = Min(OutFeat, First+ChunkCell); +void KerPar_MM_Conv2D_HSwish_SQ8(Ker_MM_Conv_SQ8_T *Arg) { + KerPar_MM_Conv2D_SQ8_act(Arg, ACT_HSWISH); +} - int Tail = 2*((W_In1+7)/8); - ((int *)ColBuff)[Tail-1] = 0; ((int *)ColBuff)[Tail-2] = 0; - int PosL = 0; - int Iter = L-F; - int Iter1 = Iter*Fx; - for (int l=0; l4) { - if (Size&0x2) { - if (Size&0x1) { - for (int f=F; f=2) { - if (Size&0x4) { - for (int f=F; f ACT_SWITCH defined in CNN_BasicKernels_SQ8.h + */ +static inline void __attribute__((always_inline)) KerPar_MM_Conv2D_HWC_SQ8_act( + Ker_MM_Conv_SQ8_T *Arg, + CNN_ActivationOper_T Activation) { + /* + For HWC weights (4D Tensor) are expected to be organized as [OutFeat x Fy x Fx x InFeat] + */ signed char 
*__restrict__ In = Arg->In; int W = Arg->W, H = Arg->H; signed char *__restrict__ Filter = Arg->Filter; @@ -1794,922 +1580,307 @@ void KerPar_MM_Conv2D_SQ8( unsigned char * __restrict__ ScaleN = Arg->ScaleN; signed char * __restrict__ ColBuff = Arg->ColBuff; int Wo = Arg->Wo, Ho = Arg->Ho; - int A0 = Arg->Infos[AT_INF_A0]; + unsigned char * Infos = Arg->Infos; + unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; + int A0 = *((unsigned char *) &Infos[AT_INF_A0]); int B0 = *((unsigned char *) &Infos[AT_INF_B0]); int C0 = *((unsigned char *) &Infos[AT_INF_C0]); /* ColBuff must be large enough to accomodate Align(Fx*InFeat, 8) elements */ v4s * __restrict__ VBuff = (v4s *) ColBuff; unsigned int W_In1 = InFeat*Fx*Fy; - unsigned int CoreId = gap_coreid(), C = ChunkSize(InFeat), F = Min(CoreId*C, InFeat), L = Min(InFeat, F+C); - unsigned int ChunkCell = ChunkSize(OutFeat), First = CoreId*ChunkCell, Last = Min(OutFeat, First+ChunkCell); - - int FS = Fx*Fy; - int Tail = 2*((W_In1+7)/8); - ((int *)ColBuff)[Tail-1] = 0; ((int *)ColBuff)[Tail-2] = 0; - int PosL = Arg->FirstTile?(-PadT):0; - int Iter = L-F; - int Iter1 = Iter*FS; - - // printf("If: %3d, Of: %3d, W: %3d, H: %3d, Wo: %3d, Ho: %3d, PosL: %d\n", InFeat, OutFeat, W, H, Wo, Ho, PosL); - for (int l=0; l4) { - if (Size&0x2) { - if (Size&0x1) { - for (int f=F; f=2) { - if (Size&0x4) { - for (int f=F; f> %d = %d\n", Line, l, c, S0, Sc, ScN, gap_clip(AT_SCALE(S0, Sc, ScN), 7)); - Out[Line*Wo*Ho + l*Wo + c] = gap_clip(AT_SCALE(S0, Sc, ScN), 7); - } - gap_waitbarrier(0); - } - PosL += Sy; - } - gap_waitbarrier(0); -} - -void KerPar_MM_Conv2D_HWC_SQ8( - Ker_MM_Conv_SQ8_T *Arg - ) - -{ - /* - For HWC weights (4D Tensor) are expected to be organized as [OutFeat x Fy x Fx x InFeat] - */ - signed char *__restrict__ In = Arg->In; - int W = Arg->W, H = Arg->H; - signed char *__restrict__ Filter = Arg->Filter; - int Fx = Arg->Fx, Sx = Arg->Sx; - int Fy = 
Arg->Fy, Sy = Arg->Sy; - int PadL = Arg->Pad[0], PadT = Arg->Pad[2]; - int InFeat = Arg->InFeat, OutFeat = Arg->OutFeat; - int * __restrict__ Bias = Arg->Bias; - int NormBias = Arg->Infos[AT_INF_BIASN]; - signed char * __restrict__ Out = Arg->Out; - unsigned char * __restrict__ Scale = Arg->Scale; - unsigned char * __restrict__ ScaleN = Arg->ScaleN; - signed char * __restrict__ ColBuff = Arg->ColBuff; - int Wo = Arg->Wo, Ho = Arg->Ho; - int A0 = Arg->Infos[AT_INF_A0]; - - /* ColBuff must be large enough to accomodate Align(Fx*InFeat, 8) elements */ - v4s * __restrict__ VBuff = (v4s *) ColBuff; - unsigned int W_In1 = InFeat*Fx*Fy; - unsigned int CoreId = gap_coreid(), C = ChunkSize(InFeat), F = Min(CoreId*C, InFeat), L = Min(InFeat, F+C); - unsigned int ChunkCell = ChunkSize(OutFeat), First = CoreId*ChunkCell, Last = Min(OutFeat, First+ChunkCell); - - int FS = Fx*Fy; - int Tail = 2*((W_In1+7)/8); - - signed char * __restrict__ ColBuff1 = ColBuff + 4*Tail; - ((int *)ColBuff)[Tail-1] = 0; ((int *)ColBuff)[Tail-2] = 0; - ((int *)ColBuff1)[Tail-1] = 0; ((int *)ColBuff1)[Tail-2] = 0; - int PosL = Arg->FirstTile?(-PadT):0; - - int Iter = L-F; - int Iter1 = Iter*FS; - int IterOut = Max(0, Last - First); - for (int l=0; l=4) { - for (int f=0; f<(Iter/4); f++) - for (int j=Tb; j=2) { - if (Iter&0x2) - for (int j=Tb; j0) { - for (int j=Tb; j=4) { - for (int f=0; f<(Iter/4); f++) - for (int j=Tb; j=2) { - if (Iter&0x2) - for (int j=Tb; j0) - for (int j=Tb; jIn; - int W = Arg->W, H = Arg->H; - signed char *__restrict__ Filter = Arg->Filter; - int Fx = Arg->Fx, Sx = Arg->Sx; - int Fy = Arg->Fy, Sy = Arg->Sy; - int PadL = Arg->Pad[0], PadT = Arg->Pad[2]; - int InFeat = Arg->InFeat, OutFeat = Arg->OutFeat; - int * __restrict__ Bias = Arg->Bias; - int NormBias = Arg->Infos[AT_INF_BIASN]; - signed char * __restrict__ Out = Arg->Out; - unsigned char * __restrict__ Scale = Arg->Scale; - unsigned char * __restrict__ ScaleN = Arg->ScaleN; - signed char * __restrict__ ColBuff = 
Arg->ColBuff; - int Wo = Arg->Wo, Ho = Arg->Ho; - int A0 = Arg->Infos[AT_INF_A0]; - - /* ColBuff must be large enough to accomodate Align(Fx*InFeat, 8) elements */ - v4s * __restrict__ VBuff = (v4s *) ColBuff; - unsigned int W_In1 = InFeat*Fx*Fy; - unsigned int CoreId = gap_coreid(), C = ChunkSize(InFeat), F = Min(CoreId*C, InFeat), L = Min(InFeat, F+C); - unsigned int ChunkCell = ChunkSize(OutFeat), First = CoreId*ChunkCell, Last = Min(OutFeat, First+ChunkCell); - - int FS = Fx*Fy; - int Tail = 2*((W_In1+7)/8); - - signed char * __restrict__ ColBuff1 = ColBuff + 4*Tail; - ((int *)ColBuff)[Tail-1] = 0; ((int *)ColBuff)[Tail-2] = 0; - ((int *)ColBuff1)[Tail-1] = 0; ((int *)ColBuff1)[Tail-2] = 0; - int PosL = Arg->FirstTile?(-PadT):0; - - int Iter = L-F; - int Iter1 = Iter*FS; - int IterOut = Max(0, Last - First); - for (int l=0; l=4) { - for (int f=0; f<(Iter/4); f++) - for (int j=Tb; j=2) { - if (Iter&0x2) - for (int j=Tb; j0) { - for (int j=Tb; j=4) { - for (int f=0; f<(Iter/4); f++) - for (int j=Tb; j=2) { - if (Iter&0x2) - for (int j=Tb; j0) - for (int j=Tb; jIn; - int W = Arg->W, H = Arg->H; - signed char *__restrict__ Filter = Arg->Filter; - int Fx = Arg->Fx, Sx = Arg->Sx; - int Fy = Arg->Fy, Sy = Arg->Sy; - int PadL = Arg->Pad[0], PadT = Arg->Pad[2]; - int InFeat = Arg->InFeat, OutFeat = Arg->OutFeat; - int * __restrict__ Bias = Arg->Bias; - int NormBias = Arg->Infos[AT_INF_BIASN]; - signed char * __restrict__ Out = Arg->Out; - unsigned char * __restrict__ Scale = Arg->Scale; - unsigned char * __restrict__ ScaleN = Arg->ScaleN; - signed char * __restrict__ ColBuff = Arg->ColBuff; - signed char * __restrict__ ColBuff1; - int Wo = Arg->Wo, Ho = Arg->Ho; - int A0 = Arg->Infos[AT_INF_A0]; - - - unsigned int W_In1 = InFeat*Fx*Fy; - unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(Wo), First = CoreId*ChunkCell, Last = Min(Wo, First+ChunkCell); + unsigned int CoreId = gap_coreid(), C = ChunkSize(InFeat), F = Min(CoreId*C, InFeat), L = Min(InFeat, F+C); + 
unsigned int ChunkCell = ChunkSize(OutFeat), First = CoreId*ChunkCell, Last = Min(OutFeat, First+ChunkCell); int FS = Fx*Fy; + int Tail = 2*((W_In1+7)/8); + + signed char * __restrict__ ColBuff1 = ColBuff + 4*Tail; + ((int *)ColBuff)[Tail-1] = 0; ((int *)ColBuff)[Tail-2] = 0; + ((int *)ColBuff1)[Tail-1] = 0; ((int *)ColBuff1)[Tail-2] = 0; int PosL = Arg->FirstTile?(-PadT):0; - int Iter = InFeat; + int Iter = L-F; int Iter1 = Iter*FS; - int IterOut = OutFeat; - int IterW = Max(0, Last-First); - ColBuff += 2*CoreId*InFeat*FS; - ColBuff1 = ColBuff + InFeat*FS; + int IterOut = Max(0, Last - First); for (int l=0; l=4) { for (int f=0; f<(Iter/4); f++) for (int j=Tb; j=2) { if (Iter&0x2) for (int j=Tb; j0) { for (int j=Tb; j=4) { for (int f=0; f<(Iter/4); f++) - for (int j=Tb; j=2) { if (Iter&0x2) - for (int j=Tb; j0) { - for (int j=Tb; j0) + for (int j=Tb; j Each Core has 2 im2col buffer + * Optional Activation fused applied to the 32bits accumulator -> ACT_SWITCH defined in CNN_BasicKernels_SQ8.h + */ +static inline void __attribute__((always_inline)) Ker_MM_Conv2D_HWC_SQ8_act( + Ker_MM_Conv_SQ8_T *Arg, + CNN_ActivationOper_T Activation ) { @@ -2731,7 +1902,9 @@ void Ker_MM_Conv2D_ReLU_HWC_SQ8( signed char * __restrict__ ColBuff = Arg->ColBuff; signed char * __restrict__ ColBuff1; int Wo = Arg->Wo, Ho = Arg->Ho; - int A0 = Arg->Infos[AT_INF_A0]; + unsigned char * Infos = Arg->Infos; + unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; + int A0 = *((unsigned char *) &Infos[AT_INF_A0]); int B0 = *((unsigned char *) &Infos[AT_INF_B0]); int C0 = *((unsigned char *) &Infos[AT_INF_C0]); unsigned int W_In1 = InFeat*Fx*Fy; @@ -2848,17 +2021,25 @@ void Ker_MM_Conv2D_ReLU_HWC_SQ8( } unsigned int Sc, ScN; Sc = *pSc; ScN = *pScN; pSc++; pScN++; - *pOut0 = AT_CLIP_POS_IMM(AT_SCALE(S00, Sc, ScN), 7); pOut0++; - *pOut1 = AT_CLIP_POS_IMM(AT_SCALE(S01, Sc, ScN), 7); pOut1++; - Sc = *pSc; ScN = *pScN; pSc++; pScN++; 
- *pOut0 = AT_CLIP_POS_IMM(AT_SCALE(S10, Sc, ScN), 7); pOut0++; - *pOut1 = AT_CLIP_POS_IMM(AT_SCALE(S11, Sc, ScN), 7); pOut1++; - Sc = *pSc; ScN = *pScN; pSc++; pScN++; - *pOut0 = AT_CLIP_POS_IMM(AT_SCALE(S20, Sc, ScN), 7); pOut0++; - *pOut1 = AT_CLIP_POS_IMM(AT_SCALE(S21, Sc, ScN), 7); pOut1++; - Sc = *pSc; ScN = *pScN; pSc++; pScN++; - *pOut0 = AT_CLIP_POS_IMM(AT_SCALE(S30, Sc, ScN), 7); pOut0++; - *pOut1 = AT_CLIP_POS_IMM(AT_SCALE(S31, Sc, ScN), 7); pOut1++; + S00 = AT_SCALE(S00, Sc, ScN); ACT_SWITCH(S00, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + S01 = AT_SCALE(S01, Sc, ScN); ACT_SWITCH(S01, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + *pOut0 = gap_clip(S00, 7); pOut0++; + *pOut1 = gap_clip(S01, 7); pOut1++; + Sc = *pSc; ScN = *pScN; pSc++; pScN++; + S10 = AT_SCALE(S10, Sc, ScN); ACT_SWITCH(S10, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + S11 = AT_SCALE(S11, Sc, ScN); ACT_SWITCH(S11, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + *pOut0 = gap_clip(S10, 7); pOut0++; + *pOut1 = gap_clip(S11, 7); pOut1++; + Sc = *pSc; ScN = *pScN; pSc++; pScN++; + S20 = AT_SCALE(S20, Sc, ScN); ACT_SWITCH(S20, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + S21 = AT_SCALE(S21, Sc, ScN); ACT_SWITCH(S21, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + *pOut0 = gap_clip(S20, 7); pOut0++; + *pOut1 = gap_clip(S21, 7); pOut1++; + Sc = *pSc; ScN = *pScN; pSc++; pScN++; + S30 = AT_SCALE(S30, Sc, ScN); ACT_SWITCH(S30, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + S31 = AT_SCALE(S31, Sc, ScN); ACT_SWITCH(S31, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + *pOut0 = gap_clip(S30, 7); pOut0++; + *pOut1 = gap_clip(S31, 7); pOut1++; } for (int Line=4*(IterOut/4); Line ACT_SWITCH defined in CNN_BasicKernels_SQ8.h + */ +static inline void __attribute__((always_inline)) KerPar_MM_Conv2D_DxDy_SQ8_act( + Ker_MM_Conv_SQ8_T *Arg, + CNN_ActivationOper_T Activation ) { @@ -3005,7 +2238,9 @@ void KerPar_MM_Conv2D_DxDy_SQ8( unsigned char * 
__restrict__ ScaleN = Arg->ScaleN; signed char * __restrict__ ColBuff = Arg->ColBuff; int Wo = Arg->Wo, Ho = Arg->Ho; - int A0 = Arg->Infos[AT_INF_A0]; + unsigned char * Infos = Arg->Infos; + unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; + int A0 = *((unsigned char *) &Infos[AT_INF_A0]); int B0 = *((unsigned char *) &Infos[AT_INF_B0]); int C0 = *((unsigned char *) &Infos[AT_INF_C0]); /* ColBuff must be large enough to accomodate Align(Fx*InFeat, 8) elements */ v4s * __restrict__ VBuff = (v4s *) ColBuff; @@ -3057,7 +2292,8 @@ void KerPar_MM_Conv2D_DxDy_SQ8( S0 = gap_sumdotp4(V1, C1, S0); } unsigned int Sc = Scale[Line], ScN = ScaleN[Line]; - Out[Line*Wo*Ho + l*Wo + c] = gap_clip(AT_SCALE(S0, Sc, ScN), 7); + S0 = AT_SCALE(S0, Sc, ScN); ACT_SWITCH(S0, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + Out[Line*Wo*Ho + l*Wo + c] = gap_clip(S0, 7); } gap_waitbarrier(0); } @@ -3066,8 +2302,54 @@ void KerPar_MM_Conv2D_DxDy_SQ8( gap_waitbarrier(0); } -void KerPar_MM_Conv2D_DxDy_HWC_SQ8( - Ker_MM_Conv_SQ8_T *Arg +void KerPar_MM_Conv2D_DxDy_SQ8(Ker_MM_Conv_SQ8_T *Arg) { + KerPar_MM_Conv2D_DxDy_SQ8_act(Arg, ACT_NONE); +} + +void KerPar_MM_Conv2D_DxDy_ReLU_SQ8(Ker_MM_Conv_SQ8_T *Arg) { + KerPar_MM_Conv2D_DxDy_SQ8_act(Arg, ACT_RELU); +} + +void KerPar_MM_Conv2D_DxDy_ReLUN_SQ8(Ker_MM_Conv_SQ8_T *Arg) { + KerPar_MM_Conv2D_DxDy_SQ8_act(Arg, ACT_RELUN); +} + +void KerPar_MM_Conv2D_DxDy_ReLUM_SQ8(Ker_MM_Conv_SQ8_T *Arg) { + KerPar_MM_Conv2D_DxDy_SQ8_act(Arg, ACT_RELUM); +} + +void KerPar_MM_Conv2D_DxDy_ReLUMN_SQ8(Ker_MM_Conv_SQ8_T *Arg) { + KerPar_MM_Conv2D_DxDy_SQ8_act(Arg, ACT_RELUMN); +} + +void KerPar_MM_Conv2D_DxDy_LeakyReLU_SQ8(Ker_MM_Conv_SQ8_T *Arg) { + KerPar_MM_Conv2D_DxDy_SQ8_act(Arg, ACT_LEAKYRELU); +} + +void KerPar_MM_Conv2D_DxDy_HSwish_SQ8(Ker_MM_Conv_SQ8_T *Arg) { + KerPar_MM_Conv2D_DxDy_SQ8_act(Arg, ACT_HSWISH); +} + +void KerPar_MM_Conv2D_DxDy_HSigmoid_SQ8(Ker_MM_Conv_SQ8_T *Arg) { + 
KerPar_MM_Conv2D_DxDy_SQ8_act(Arg, ACT_HSIGMOID); +} + +void KerPar_MM_Conv2D_DxDy_Sigmoid_SQ8(Ker_MM_Conv_SQ8_T *Arg) { + KerPar_MM_Conv2D_DxDy_SQ8_act(Arg, ACT_SIGMOID); +} + +void KerPar_MM_Conv2D_DxDy_Tanh_SQ8(Ker_MM_Conv_SQ8_T *Arg) { + KerPar_MM_Conv2D_DxDy_SQ8_act(Arg, ACT_TANH); +} + + +/* + * 2D Convolutional Kernels with Dilation kernels based on MatMul (im2col: ColBuff) with HWC inout tensor order + * Optional Activation fused applied to the 32bits accumulator -> ACT_SWITCH defined in CNN_BasicKernels_SQ8.h + */ +static inline void __attribute__((always_inline)) KerPar_MM_Conv2D_DxDy_HWC_SQ8_act( + Ker_MM_Conv_SQ8_T *Arg, + CNN_ActivationOper_T Activation ) { @@ -3088,7 +2370,9 @@ void KerPar_MM_Conv2D_DxDy_HWC_SQ8( unsigned char * __restrict__ ScaleN = Arg->ScaleN; signed char * __restrict__ ColBuff = Arg->ColBuff; int Wo = Arg->Wo, Ho = Arg->Ho; - int A0 = Arg->Infos[AT_INF_A0]; + unsigned char * Infos = Arg->Infos; + unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; + int A0 = *((unsigned char *) &Infos[AT_INF_A0]); int B0 = *((unsigned char *) &Infos[AT_INF_B0]); int C0 = *((unsigned char *) &Infos[AT_INF_C0]); /* ColBuff must be large enough to accomodate Align(Fx*InFeat, 8) elements */ v4s * __restrict__ VBuff = (v4s *) ColBuff; @@ -3206,10 +2490,16 @@ This part is more efficient but NOT WORKING ???? 
TOCHECK S3 += V0*C3; S7 += V1*C3; pIn++; pIn1++; pC0++; pC1++; pC2++; pC3++; } - v4s R1 = gap_pack4(gap_clip(AT_SCALE(S0, pSc[4*Line ], pScN[4*Line ]), 7), gap_clip(AT_SCALE(S1, pSc[4*Line+1], pScN[4*Line+1]), 7), - gap_clip(AT_SCALE(S2, pSc[4*Line+2], pScN[4*Line+2]), 7), gap_clip(AT_SCALE(S3, pSc[4*Line+3], pScN[4*Line+3]), 7)); - v4s R2 = gap_pack4(gap_clip(AT_SCALE(S4, pSc[4*Line ], pScN[4*Line ]), 7), gap_clip(AT_SCALE(S5, pSc[4*Line+1], pScN[4*Line+1]), 7), - gap_clip(AT_SCALE(S6, pSc[4*Line+2], pScN[4*Line+2]), 7), gap_clip(AT_SCALE(S7, pSc[4*Line+3], pScN[4*Line+3]), 7)); + S0 = AT_SCALE(S0, pSc[4*Line ], pScN[4*Line ]); ACT_SWITCH(S0, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + S1 = AT_SCALE(S1, pSc[4*Line+1], pScN[4*Line+1]); ACT_SWITCH(S1, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + S2 = AT_SCALE(S2, pSc[4*Line+2], pScN[4*Line+2]); ACT_SWITCH(S2, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + S3 = AT_SCALE(S3, pSc[4*Line+3], pScN[4*Line+3]); ACT_SWITCH(S3, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + S4 = AT_SCALE(S4, pSc[4*Line ], pScN[4*Line ]); ACT_SWITCH(S4, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + S5 = AT_SCALE(S5, pSc[4*Line+1], pScN[4*Line+1]); ACT_SWITCH(S5, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + S6 = AT_SCALE(S6, pSc[4*Line+2], pScN[4*Line+2]); ACT_SWITCH(S6, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + S7 = AT_SCALE(S7, pSc[4*Line+3], pScN[4*Line+3]); ACT_SWITCH(S7, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + v4s R1 = gap_pack4(gap_clip(S0, 7), gap_clip(S1, 7), gap_clip(S2, 7), gap_clip(S3, 7)); + v4s R2 = gap_pack4(gap_clip(S4, 7), gap_clip(S5, 7), gap_clip(S6, 7), gap_clip(S7, 7)); *((v4s *) (pOut0+4*Line)) = R1; *((v4s *) (pOut1+4*Line)) = R2; } @@ -3227,8 +2517,10 @@ This part is more efficient but NOT WORKING ???? 
TOCHECK S0 += V0*C0; S4 += V1*C0; pIn++; pIn1++; pC++; } - *(pOut0+i) = gap_clip(AT_SCALE(S0, pSc[i], pScN[i]), 7); - *(pOut1+i) = gap_clip(AT_SCALE(S4, pSc[i], pScN[i]), 7); + S0 = AT_SCALE(S0, pSc[i], pScN[i]); ACT_SWITCH(S0, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + S4 = AT_SCALE(S4, pSc[i], pScN[i]); ACT_SWITCH(S4, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + *(pOut0+i) = gap_clip(S0, 7); + *(pOut1+i) = gap_clip(S4, 7); } gap_waitbarrier(0); } @@ -3278,13 +2570,17 @@ This part is more efficient but NOT WORKING ???? TOCHECK } unsigned int Sc, ScN; Sc = *pSc; ScN = *pScN; pSc++; pScN++; - *pOut0 = gap_clip(AT_SCALE(S0, Sc, ScN), 7); pOut0++; + S0 = AT_SCALE(S0, Sc, ScN); ACT_SWITCH(S0, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + *pOut0 = gap_clip(S0, 7); pOut0++; Sc = *pSc; ScN = *pScN; pSc++; pScN++; - *pOut0 = gap_clip(AT_SCALE(S1, Sc, ScN), 7); pOut0++; + S1 = AT_SCALE(S1, Sc, ScN); ACT_SWITCH(S1, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + *pOut0 = gap_clip(S1, 7); pOut0++; Sc = *pSc; ScN = *pScN; pSc++; pScN++; - *pOut0 = gap_clip(AT_SCALE(S2, Sc, ScN), 7); pOut0++; + S2 = AT_SCALE(S2, Sc, ScN); ACT_SWITCH(S2, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + *pOut0 = gap_clip(S2, 7); pOut0++; Sc = *pSc; ScN = *pScN; pSc++; pScN++; - *pOut0 = gap_clip(AT_SCALE(S3, Sc, ScN), 7); pOut0++; + S3 = AT_SCALE(S3, Sc, ScN); ACT_SWITCH(S3, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0); + *pOut0 = gap_clip(S3, 7); pOut0++; } for (int i=4*(IterOut/4); iIn; - int W = Arg->W, H = Arg->H; - signed char *__restrict__ Filter = Arg->Filter; - int Fx = Arg->Fx, Sx = Arg->Sx, Dx = Arg->Dx; - int Fy = Arg->Fy, Sy = Arg->Sy, Dy = Arg->Dy; - int PadL = Arg->Pad[0], PadT = Arg->Pad[2]; - int InFeat = Arg->InFeat, OutFeat = Arg->OutFeat; - int * __restrict__ Bias = Arg->Bias; - int NormBias = Arg->Infos[AT_INF_BIASN]; - signed char * __restrict__ Out = Arg->Out; - unsigned char * __restrict__ Scale = Arg->Scale; - unsigned char * 
__restrict__ ScaleN = Arg->ScaleN; - signed char * __restrict__ ColBuff = Arg->ColBuff; - int Wo = Arg->Wo, Ho = Arg->Ho; - int A0 = Arg->Infos[AT_INF_A0]; - - /* ColBuff must be large enough to accomodate Align(Fx*InFeat, 8) elements */ - v4s * __restrict__ VBuff = (v4s *) ColBuff; - unsigned int W_In1 = InFeat*Fx*Fy; - unsigned int CoreId = gap_coreid(), C = ChunkSize(InFeat), F = Min(CoreId*C, InFeat), L = Min(InFeat, F+C); - unsigned int ChunkCell = ChunkSize(OutFeat), First = CoreId*ChunkCell, Last = Min(OutFeat, First+ChunkCell); - - int FS = Fx*Fy; - int Tail = 2*((W_In1+7)/8); - ((int *)ColBuff)[Tail-1] = 0; ((int *)ColBuff)[Tail-2] = 0; - int PosL = Arg->FirstTile?(-PadT):0; - int DFx = Dx*(Fx-1)+1, DFy = Dy*(Fy-1)+1; - // int Prec=10; - int InvDx = ((1<In; - int W = Arg->W, H = Arg->H; - signed char *__restrict__ Filter = Arg->Filter; - int Fx = Arg->Fx, Sx = Arg->Sx, Sy = Arg->Sy; - int PadL = Arg->Pad[0]; - int InFeat = Arg->InFeat, OutFeat = Arg->OutFeat; - - int * __restrict__ Bias = Arg->Bias; - int NormBias = Arg->Infos[AT_INF_BIASN]; - signed char * __restrict__ Out = Arg->Out; - unsigned char * __restrict__ Scale = Arg->Scale; - unsigned char * __restrict__ ScaleN = Arg->ScaleN; - signed char * __restrict__ ColBuff = Arg->ColBuff; - - int Wo = Arg->Wo, Ho = Arg->Ho; +void KerPar_MM_Conv2D_DxDy_ReLU_HWC_SQ8(Ker_MM_Conv_SQ8_T *Arg) { + KerPar_MM_Conv2D_DxDy_HWC_SQ8_act(Arg, ACT_RELU); +} - /* ColBuff must be large enough to accomodate ((((InFeat/NCores)+3)/4)*4)*8 elements */ - v4s * __restrict__ VBuff = (v4s *) ColBuff; - unsigned int CoreId = gap_coreid(), C = ChunkSize(InFeat), F = Min(CoreId*C, InFeat), L = Min(InFeat, F+C); - v4s M0 = (v4s){-1,0,0,0}, M1 = (v4s){0,-1,0,0}, M2 = (v4s){0,0,-1,0}, M3 = (v4s){0,0,0,-1}; +void KerPar_MM_Conv2D_DxDy_ReLUN_HWC_SQ8(Ker_MM_Conv_SQ8_T *Arg) { + KerPar_MM_Conv2D_DxDy_HWC_SQ8_act(Arg, ACT_RELUN); +} - int PosL = 0; - int Iter = L-F; - for (int l=0; l=4) { - for (int f=0; f<(Iter/4); f++) - for (int i=Lb; 
i=2) { - if (Iter&0x2) for (int i=Lb; i0) - for (int i=Lb; i fx*Infeat + f */ - for (int f=0; f<(Iter/4); f++) { - v4s B = ((v4s *)(Bias + F))[f]; - int S0 = B[0], S1 = B[1], S2 = B[2], S3 = B[3]; - for (int i=0; iIn; - int W = Arg->W, H = Arg->H; - signed char *__restrict__ Filter = Arg->Filter; - int Fx = Arg->Fx, Sx = Arg->Sx; - int Fy = Arg->Fy, Sy = Arg->Sy; - int PadL = Arg->Pad[0], PadT = Arg->Pad[2]; - int InFeat = Arg->InFeat, OutFeat = Arg->OutFeat; - int * __restrict__ Bias = Arg->Bias; - int NormBias = Arg->Infos[AT_INF_BIASN]; - signed char * __restrict__ Out = Arg->Out; - unsigned char * __restrict__ Scale = Arg->Scale; - unsigned char * __restrict__ ScaleN = Arg->ScaleN; - signed char * __restrict__ ColBuff = Arg->ColBuff; - int Wo = Arg->Wo, Ho = Arg->Ho; - int A0 = Arg->Infos[AT_INF_A0]; +void KerPar_MM_Conv2D_DxDy_LeakyReLU_HWC_SQ8(Ker_MM_Conv_SQ8_T *Arg) { + KerPar_MM_Conv2D_DxDy_HWC_SQ8_act(Arg, ACT_LEAKYRELU); +} - /* ColBuff must be large enough to accomodate Align(Fx*InFeat, 8) elements */ - v4s * __restrict__ VBuff = (v4s *) ColBuff; - unsigned int W_In1 = InFeat*Fx*Fy; - unsigned int CoreId = gap_coreid(), C = ChunkSize(InFeat), F = Min(CoreId*C, InFeat), L = Min(InFeat, F+C); - //unsigned int ChunkCell = ChunkSize(OutFeat), First = CoreId*ChunkCell, Last = Min(OutFeat, First+ChunkCell); +void KerPar_MM_Conv2D_DxDy_HSwish_HWC_SQ8(Ker_MM_Conv_SQ8_T *Arg) { + KerPar_MM_Conv2D_DxDy_HWC_SQ8_act(Arg, ACT_HSWISH); +} - int FS = Fx*Fy; - int PosL = Arg->FirstTile?(-PadT):0; +void KerPar_MM_Conv2D_DxDy_HSigmoid_HWC_SQ8(Ker_MM_Conv_SQ8_T *Arg) { + KerPar_MM_Conv2D_DxDy_HWC_SQ8_act(Arg, ACT_HSIGMOID); +} - for (int l=0; l CxHW: InFeatxFyxFx - for (int c=F; c>1; + int iLog4N = (gap_fl1(N_fft))>>1; v2s *DataV = (v2s *) Data; v2s *CoeffV = (v2s *) Twiddles; unsigned int CoreId; @@ -580,7 +580,7 @@ void Radix4FFT_DIF_Par_Fix32(FFT_Arg_T *Arg) int iCnt1, iCnt2, iCnt3, iL, iM, iQ, iA, iB, iC, iD; - unsigned int iLog4N = (gap_fl1(N_fft))>>1; + int iLog4N = 
(gap_fl1(N_fft))>>1; unsigned int CoreId; int First, Last, Chunk; @@ -656,7 +656,7 @@ void Radix4FFT_DIF_Par_f16(FFT_Arg_T *Arg) int iCnt1, iCnt2, iCnt3, iL, iM, iQ, iA, iB, iC, iD; - unsigned int iLog4N = (gap_fl1(N_fft))>>1; + int iLog4N = (gap_fl1(N_fft))>>1; F16V_DSP *DataV = (F16V_DSP *) Data; F16V_DSP *CoeffV = (F16V_DSP *) Twiddles; unsigned int CoreId; @@ -746,7 +746,7 @@ void Radix4FFT_DIF_Par_f32(FFT_Arg_T *Arg) int iCnt1, iCnt2, iCnt3, iL, iM, iQ, iA, iB, iC, iD; - unsigned int iLog4N = (gap_fl1(N_fft))>>1; + int iLog4N = (gap_fl1(N_fft))>>1; unsigned int CoreId; int First, Last, Chunk; @@ -1428,7 +1428,7 @@ void Radix2FFT_DIF_Par_Fix32_Scal(FFT_scal_Arg_T *Arg) // reset the shift table Chunk = N_fft/nbcore; First = CoreId*Chunk; Last = Min( First+Chunk,N_fft); - for (int i = First; i < Last; i++) shift_fft[i]=0; + for (unsigned int i = First; i < Last; i++) shift_fft[i]=0; gap_waitbarrier(0); // compute fft @@ -1656,14 +1656,17 @@ void RFFT_DIF_Par_Fix16(RFFT_Arg_T *Arg){ if (CoreId == 0){ xBR = pB[0][0]; xBI = pB[0][1]; - xAR = pA[0][0]; - xAI = pA[0][1]; // real(tw * (xB - xA)) = twR * (xBR - xAR) - twI * (xBI - xAI); // imag(tw * (xB - xA)) = twI * (xBR - xAR) + twR * (xBI - xAI); - RFFT_Out[0][0] = ( xBR + xAR + xBI + xAI ) >> 2; - RFFT_Out[0][1] = ( xAI - xBI + xBR - xAR ) >> 2; // XA(1) = 1/2*( U1 - imag(U2) + i*( U1 +imag(U2) )); + RFFT_Out[0][0] = ( xBR + xBI ) >> 1; + RFFT_Out[0][1] = 0; + + // Gr(N) = Gr(0) - Gi(0) + // Gi(N) = 0 + RFFT_Out[k+1][0] = ( xBR - xBI ); + RFFT_Out[k+1][1] = 0; } gap_waitbarrier(0); @@ -1698,16 +1701,6 @@ void RFFT_DIF_Par_Fix16(RFFT_Arg_T *Arg){ t2 = gap_add2div4( xA, gap_cplxconj(xB)); RFFT_Out[i] = gap_cplxmuls(tw, t1) + t2; } - if (CoreId == 0){ - xBR = pB[-(k-1)][0]; - xBI = pB[-(k-1)][1]; - xAR = pA[ (k-1)][0]; - xAI = pA[ (k-1)][1]; - RFFT_Out[k][0] = ( xBR + xAR - xBI - xAI ) >> 2; - // TODO - CHECK - // RFFT_Out[k][1] = ( xAI - xBI - xBR + xAR ) >> 2; - RFFT_Out[k][1] = 0; - } gap_waitbarrier(0); #ifdef 
PRINTDEB if (CoreId==0){ @@ -1854,14 +1847,14 @@ void RFFT_DIF_Par_f16(RFFT_Arg_T *Arg){ if (CoreId == 0){ xBR = pB[0][0]; xBI = pB[0][1]; - xAR = pA[0][0]; - xAI = pA[0][1]; // real(tw * (xB - xA)) = twR * (xBR - xAR) - twI * (xBI - xAI); // imag(tw * (xB - xA)) = twI * (xBR - xAR) + twR * (xBI - xAI); - RFFT_Out[0][0] = 0.5f * ( xBR + xAR + xBI + xAI ); - RFFT_Out[0][1] = 0.5f * ( xAI - xBI + xBR - xAR ); + RFFT_Out[0][0] = xBR + xBI; + RFFT_Out[0][1] = 0.0f; // XA(1) = 1/2*( U1 - imag(U2) + i*( U1 +imag(U2) )); + RFFT_Out[k+1][0] = xBR - xBI; + RFFT_Out[k+1][1] = 0.0f; } gap_waitbarrier(0); @@ -1897,15 +1890,6 @@ void RFFT_DIF_Par_f16(RFFT_Arg_T *Arg){ t2 = t2 + xA; RFFT_Out[i] = (CplxMult_f16(tw, t1) + t2) * (F16V_DSP) {0.5f, 0.5f}; } - if (CoreId == 0){ - xBR = pB[-(k-1)][0]; - xBI = pB[-(k-1)][1]; - xAR = pA[(k-1)][0]; - xAI = pA[(k-1)][1]; - RFFT_Out[k][0] = 0.5f * ( xBR + xAR - xBI - xAI ); - // RFFT_Out[k][1] = 0.5f * ( xAI - xBI - xBR + xAR ); - RFFT_Out[k][1] = 0.0f; - } gap_waitbarrier(0); #ifdef PRINTDEB if (CoreId==0){ @@ -1947,10 +1931,10 @@ void RFFT_DIF_Par_f32(RFFT_Arg_T *Arg){ if (CoreId == 0){ xBR = pB[0]; xBI = pB[1]; - xAR = pA[0]; - xAI = pA[1]; - RFFT_Out[0] = 0.5f * ( xBR + xAR + xBI + xAI ); - RFFT_Out[1] = 0.5f * ( xAI - xBI + xBR - xAR ); + RFFT_Out[0] = xBR + xBI; + RFFT_Out[1] = 0.0f; + RFFT_Out[2*(k+1)] = xBR - xBI; + RFFT_Out[2*(k+1)+1] = 0.0f; } gap_waitbarrier(0); @@ -1998,16 +1982,6 @@ void RFFT_DIF_Par_f32(RFFT_Arg_T *Arg){ RFFT_Out[2*i] = 0.5f * (xAR + xBR + p0 + p3 ); //xAR RFFT_Out[2*i+1] = 0.5f * (xAI - xBI + p1 - p2 ); //xAI - // printf("%d %f %f\n", i, RFFT_Out[2*i] ,RFFT_Out[2*i+1] ); - } - if (CoreId == 0){ - xBR = pB[-2*(k-1)]; - xBI = pB[-2*(k-1)+1]; - xAR = pA[2*(k-1)]; - xAI = pA[2*(k-1)+1]; - RFFT_Out[2*k] = 0.5f * ( xBR + xAR - xBI - xAI ); - RFFT_Out[2*k+1] = 0.0f; - // RFFT_Out[2*k+1] = 0.5f * ( xAI - xBI - xBR + xAR ); } gap_waitbarrier(0); #ifdef PRINTDEB diff --git 
a/tools/autotiler_v3/DSP_Libraries/MfccBasicKernels.c b/tools/autotiler_v3/DSP_Libraries/MfccBasicKernels.c index f1f31e2cf..e00ff23c0 100644 --- a/tools/autotiler_v3/DSP_Libraries/MfccBasicKernels.c +++ b/tools/autotiler_v3/DSP_Libraries/MfccBasicKernels.c @@ -35,7 +35,7 @@ void MelFilterBank_Fix32(MelFilterBank_T *Arg) unsigned short int *__restrict__ Mel_Coeffs = (unsigned short int *__restrict__) Arg->Mel_Coeffs; signed char *__restrict__ shift_buff = (signed char *__restrict__) Arg->shift_buff; fbank_type_t *__restrict__ Mel_FilterBank = (fbank_type_t *__restrict__) Arg->Mel_FilterBank; - short int Mel_NBanks = Arg->Mel_NBanks; + unsigned int Mel_NBanks = (unsigned int) Arg->Mel_NBanks; short int Mel_Coeff_Dyn = Arg->Mel_Coeff_dyn; unsigned int Chunk, First, Last, CoreId=gap_coreid(); @@ -86,7 +86,7 @@ void MelFilterBank_Fix32_Scal(MelFilterBank_T *Arg) signed char *__restrict__ shift_fft = (signed char *__restrict__) Arg->shift_fft; short int *__restrict__ Mel_Coeffs = (short int *__restrict__) Arg->Mel_Coeffs; fbank_type_t *__restrict__ Mel_FilterBank = (fbank_type_t *__restrict__) Arg->Mel_FilterBank; - short int Mel_NBanks = Arg->Mel_NBanks; + unsigned int Mel_NBanks = (unsigned int) Arg->Mel_NBanks; short int Mel_Coeff_Dyn = Arg->Mel_Coeff_dyn; signed char IsMagSquared = Arg->IsMagSquared; int MUL_EXP = IsMagSquared?2:1; @@ -112,7 +112,7 @@ void MelFilterBank_Fix32_Scal(MelFilterBank_T *Arg) } // align the block scaling on the min , compute the max value in the block for (k=0, j=Mel_FilterBank[i].Start; k<(unsigned int) NonZeroItems; j++, k++) { - int TMP = FramePower[j] >> (MUL_EXP * (shift_fft[j] - min_shift)); + unsigned int TMP = FramePower[j] >> (MUL_EXP * (shift_fft[j] - min_shift)); if (TMP > (unsigned int) maxin) maxin = TMP; } @@ -149,7 +149,7 @@ void MelFilterBank_f16(MelFilterBank_T *Arg) F16_DSP *__restrict__ Mel_Spectr = (F16_DSP *__restrict__) Arg->MelSpectr; F16_DSP *__restrict__ Mel_Coeffs = (F16_DSP *__restrict__) Arg->Mel_Coeffs; 
fbank_type_t *__restrict__ Mel_FilterBank = (fbank_type_t *__restrict__) Arg->Mel_FilterBank; - short int Mel_NBanks = Arg->Mel_NBanks; + unsigned int Mel_NBanks = (unsigned int) Arg->Mel_NBanks; unsigned int Chunk, First, Last, CoreId=gap_coreid(); //Chunk = ChunkSize(Mel_NBanks); @@ -183,7 +183,7 @@ void MelFilterBank_f32(MelFilterBank_T *Arg) float *__restrict__ Mel_Spectr = (float *__restrict__) Arg->MelSpectr; float *__restrict__ Mel_Coeffs = (float *__restrict__) Arg->Mel_Coeffs; fbank_type_t *__restrict__ Mel_FilterBank = (fbank_type_t *__restrict__) Arg->Mel_FilterBank; - short int Mel_NBanks = Arg->Mel_NBanks; + unsigned int Mel_NBanks = (unsigned int) Arg->Mel_NBanks; unsigned int Chunk, First, Last, CoreId=gap_coreid(); //Chunk = ChunkSize(Mel_NBanks); @@ -211,7 +211,7 @@ void MelFilterBank_f32(MelFilterBank_T *Arg) void MFCC_ComputeLog_Fix32(MFCC_Log_T *Arg) { - int i; + unsigned int i; int size = Arg->FrameSize; unsigned int *frameIn = (unsigned int *) Arg->FrameIn; short int *frameOut = (short int *) Arg->FrameOut; @@ -269,7 +269,7 @@ void MFCC_ComputeLog_Fix32(MFCC_Log_T *Arg) void MFCC_ComputeLog_Fix32_Scal(MFCC_Log_T *Arg) { - int i; + unsigned int i; int size = Arg->FrameSize; unsigned int *frameIn = (unsigned int *) Arg->FrameIn; short int *frameOut = (short int *) Arg->FrameOut; @@ -327,7 +327,7 @@ void MFCC_ComputeLog_Fix32_Scal(MFCC_Log_T *Arg) void MFCC_ComputeLog_f16( MFCC_LogF_T *Arg) { - int i; + unsigned int i; int size = Arg->FrameSize; F16_DSP *frameIn = (F16_DSP *) Arg->FrameIn; F16_DSP *frameOut = (F16_DSP *) Arg->FrameOut; @@ -360,7 +360,7 @@ void MFCC_ComputeLog_f16( MFCC_LogF_T *Arg) void MFCC_ComputeLog_f32(MFCC_LogF_T *Arg) { - int i; + unsigned int i; int size = Arg->FrameSize; float *frameIn = (float *) Arg->FrameIn; float *frameOut = (float *) Arg->FrameOut; @@ -392,7 +392,7 @@ void MFCC_ComputeLog_f32(MFCC_LogF_T *Arg) void MFCC_ComputeDB_Fix32(MFCC_Log_T *Arg) { - int i; + unsigned int i; int size = Arg->FrameSize; unsigned 
int *frameIn = (unsigned int *) Arg->FrameIn; short int *frameOut = (short int *) Arg->FrameOut; @@ -450,7 +450,7 @@ void MFCC_ComputeDB_Fix32(MFCC_Log_T *Arg) void MFCC_ComputeDB_Fix32_Scal(MFCC_Log_T *Arg) { - int i; + unsigned int i; int size = Arg->FrameSize; unsigned int *frameIn = (unsigned int *) Arg->FrameIn; short int *frameOut = (short int *) Arg->FrameOut; @@ -508,7 +508,7 @@ void MFCC_ComputeDB_Fix32_Scal(MFCC_Log_T *Arg) void MFCC_ComputeDB_f16( MFCC_LogF_T *Arg) { - int i; + unsigned int i; int size = Arg->FrameSize; F16_DSP *frameIn = (F16_DSP *) Arg->FrameIn; F16_DSP *frameOut = (F16_DSP *) Arg->FrameOut; @@ -540,7 +540,7 @@ void MFCC_ComputeDB_f16( MFCC_LogF_T *Arg) void MFCC_ComputeDB_f32(MFCC_LogF_T *Arg) { - int i; + unsigned int i; int size = Arg->FrameSize; float *frameIn = (float *) Arg->FrameIn; float *frameOut = (float *) Arg->FrameOut; @@ -577,7 +577,7 @@ void norm_clip_16(Norm_Clip_args_T *Args) short int Norm = Args->Norm; int N = Args->N; - int i; + unsigned int i; unsigned int Chunk, First, Last, CoreId=gap_coreid(); if (CoreId==0) { @@ -604,7 +604,7 @@ void norm_clip_32_melspect(MFCC_Clip_32_T *Args) { unsigned int Chunk, First, Last, CoreId=gap_coreid(); if (CoreId==0){ - for (i=0; i<(unsigned int)N; i++) { + for (i=0; i> Norm):0; @@ -635,12 +635,12 @@ void norm_clip_32_melspect_scal(MFCC_Clip_32_T *Args) if (CoreId==0){ if (IsMagSquared){ - for (i=0; i<(unsigned int)N; i++) { + for (i=0; i> Norm):0; } } else { - for (i=0; i<(unsigned int)N; i++) { + for (i=0; i> Norm):0; } @@ -660,8 +660,8 @@ void MFCC_ComputeDCT_II_Fix16(DCT_II_Arg_T *Args) v2s * in_dct = (v2s * __restrict__ ) Args->Data; short int * DCTCoeff = (short int * __restrict__) Args->DCTCoeff; short int * FeatList = (short int * __restrict__ ) Args->FeatList; - short int NDCT = Args->n_dct; - short int NInputs = Args->n_input; + unsigned int NDCT = (unsigned int) Args->n_dct; + unsigned int NInputs = (unsigned int) Args->n_input; unsigned int Chunk, First, Last, 
CoreId=gap_coreid(); @@ -692,8 +692,8 @@ void MFCC_ComputeDCT_II_f16(DCT_II_Arg_T *Args) F16V_DSP * in_dct = (F16V_DSP * __restrict__ ) Args->Data; F16_DSP * FeatList = (F16_DSP * __restrict__ ) Args->FeatList; F16_DSP * DCTCoeff = (F16_DSP * __restrict__) Args->DCTCoeff; - short int NDCT = Args->n_dct; - short int NInputs = Args->n_input; + unsigned int NDCT = (unsigned int) Args->n_dct; + unsigned int NInputs = (unsigned int) Args->n_input; unsigned int Chunk, First, Last, CoreId=gap_coreid(); @@ -725,8 +725,8 @@ void MFCC_ComputeDCT_II_f32(DCT_II_Arg_T *Args) float * in_dct = (float * __restrict__ ) Args->Data; float * FeatList = (float * __restrict__ ) Args->FeatList; float * DCTCoeff = (float * __restrict__) Args->DCTCoeff; - short int NDCT = Args->n_dct; - short int NInputs = Args->n_input; + unsigned int NDCT = (unsigned int) Args->n_dct; + unsigned int NInputs = (unsigned int) Args->n_input; unsigned int Chunk, First, Last, CoreId=gap_coreid(); diff --git a/tools/autotiler_v3/DSP_Libraries/PreProcessing.c b/tools/autotiler_v3/DSP_Libraries/PreProcessing.c index 8d39cca00..e4dae722e 100644 --- a/tools/autotiler_v3/DSP_Libraries/PreProcessing.c +++ b/tools/autotiler_v3/DSP_Libraries/PreProcessing.c @@ -27,7 +27,7 @@ void get_max(PreEmphasis_T *Arg) maxin[CoreId]=0; if (CoreId==0) maxin[0] = Abs(Arg->Prev); - for (int j=First;jmaxin[CoreId]) maxin[CoreId]=Abs((int)Frame[j]); } gap_waitbarrier(0); diff --git a/tools/autotiler_v3/Emulation/GapSystem.h b/tools/autotiler_v3/Emulation/GapSystem.h index 558bb6688..4729a268e 100644 --- a/tools/autotiler_v3/Emulation/GapSystem.h +++ b/tools/autotiler_v3/Emulation/GapSystem.h @@ -98,6 +98,8 @@ static int Private_call(void (*fn)(void *), void * arg, __event_cb * event) #define gap_setupbarrier(BarN, CoreM) #define gap_waitbarrier(BarN) #define gap_waitbarrier_cc(BarN) +#define gap_cl_critical_enter() +#define gap_cl_critical_exit() #define rt_event_sched_init(x) #define rt_event_alloc(x,y) 0 @@ -190,7 +192,8 @@ static 
inline void __cl_dma_memcpy_2d(uint32_t ext, uint32_t loc, uint16_t size, #define gap_waitbarrier_cc() eu_bar_trig_wait_clr(eu_bar_addr(1)) #endif - +#define gap_cl_critical_enter() pi_cl_team_critical_enter() +#define gap_cl_critical_exit() pi_cl_team_critical_exit() #endif diff --git a/tools/autotiler_v3/Makefile b/tools/autotiler_v3/Makefile index dc12e7fd7..0abf4ba65 100644 --- a/tools/autotiler_v3/Makefile +++ b/tools/autotiler_v3/Makefile @@ -1,4 +1,4 @@ -TILER_VER=4.3.0 +TILER_VER=4.3.1 export TILER_LIB=libtile.${TILER_VER}.a ifdef GAP_SDK_HOME export TILER_URL=$(GAP_SDK_HOME)/.tiler_url diff --git a/tools/autotiler_v3/version.cfg b/tools/autotiler_v3/version.cfg index 4a4782aee..332f897c0 100644 --- a/tools/autotiler_v3/version.cfg +++ b/tools/autotiler_v3/version.cfg @@ -3,7 +3,7 @@ { "version": "autotiler-v3", "magicNum": 718930176, - "git-hash": "de73fc4e0db316fa61057a8e1c9cfde47a75b6c0" + "git-hash": "de88fbeb3017c0db55f1e86e49cce5a0160ccbe5" } ] } \ No newline at end of file diff --git a/tools/jenkins/gap_sdk_version.txt b/tools/jenkins/gap_sdk_version.txt index f8268c0f2..59f52fae3 100644 --- a/tools/jenkins/gap_sdk_version.txt +++ b/tools/jenkins/gap_sdk_version.txt @@ -1 +1 @@ -9240e025d9f6a0efa51ad259adea0ae1287f6610 +9af2d93598d20541f4c18ba45e2124b767be2388 diff --git a/tools/nntool/execution/graph_executer.py b/tools/nntool/execution/graph_executer.py index 9e15d7393..297ea50e3 100644 --- a/tools/nntool/execution/graph_executer.py +++ b/tools/nntool/execution/graph_executer.py @@ -73,7 +73,7 @@ def execute_qnoq_iterator(self, G = self._G saved_outputs = {} - for node in G.dfs(): + for node in G.topological_sort(): step_idx = node.step_idx if step_idx_limit is not None and step_idx > step_idx_limit: break @@ -166,7 +166,7 @@ def execute_iterator(self, if not silent: LOG.info("execute uncached: quantization mode %s", qmode) ExecutionProgress.start() - for node in G.dfs(): + for node in G.topological_sort(): step_idx = node.step_idx if 
step_idx_limit is not None and step_idx > step_idx_limit: break diff --git a/tools/nntool/execution/kernels/float/matrix_operations.py b/tools/nntool/execution/kernels/float/matrix_operations.py index b2bbd97eb..fa84e469c 100644 --- a/tools/nntool/execution/kernels/float/matrix_operations.py +++ b/tools/nntool/execution/kernels/float/matrix_operations.py @@ -130,7 +130,7 @@ def execute(cls, params, in_tensors = qrec.prepare_inputs(params, in_tensors, ktype="float") if isinstance(params, MatMulTransposedParameters): - mat1, mat2 = in_tensors[0], np.transpose(in_tensors[1], (1, 0)) + mat1, mat2 = in_tensors[0], np.swapaxes(in_tensors[1], -2, -1) else: mat1, mat2 = in_tensors[0], in_tensors[1] diff --git a/tools/nntool/execution/kernels/float/ssd_postprocess.py b/tools/nntool/execution/kernels/float/ssd_postprocess.py index 591ae806b..516abe6b2 100644 --- a/tools/nntool/execution/kernels/float/ssd_postprocess.py +++ b/tools/nntool/execution/kernels/float/ssd_postprocess.py @@ -106,7 +106,10 @@ def execute(cls, params, # out_boxes, out_scores, out_classes = cls.nms( # params, qrec, decoded_bboxes, valid_scores) # out_count = np.array([sum(out_classes != 0)]) - return qrec.get_outputs(params, [out_boxes, out_classes, out_scores], ktype="float") + outputs = [out_boxes, out_classes, out_scores] + if params.output_detection_count: + outputs.append(np.array([out_idx])) + return qrec.get_outputs(params, outputs, ktype="float") @params_type(NMSParameters) @qrec_type('float') diff --git a/tools/nntool/execution/kernels/quant/activations.py b/tools/nntool/execution/kernels/quant/activations.py index 2556d0f88..b146a8da9 100644 --- a/tools/nntool/execution/kernels/quant/activations.py +++ b/tools/nntool/execution/kernels/quant/activations.py @@ -71,13 +71,13 @@ def execute(cls, params, params, in_tensors, ktype="symmetric")[0] # compute_in_out_scale(qrec) in_tensor = in_tensor.astype(np.int32) - neg_in = at_norm(in_tensor * leak_mult_gen_factor_q7(params), 7) + neg_in = 
at_norm((in_tensor) * qrec.cache["leak_factor"], 7) in_tensor = in_tensor * (in_tensor > 0) + neg_in * (in_tensor < 0) scale_mul_biases_q = qrec.cache['scale_mul_biases_q'] - in_tensor = scale_mul_biases_q.apply_scales(in_tensor) - if qrec.out_qs[0] != qrec.in_qs[0]: - return qrec.get_outputs(params, [qrec.out_qs[0].reduce_from(in_tensor, qrec.in_qs[0])], ktype="symmetric") - return qrec.get_outputs(params, [in_tensor], ktype="symmetric") + in_tensor = scale_mul_biases_q.apply_scales(in_tensor) + qrec.cache["zero_point"] + #if qrec.out_qs[0] != qrec.in_qs[0]: + # return qrec.get_outputs(params, [qrec.out_qs[0].reduce_from(in_tensor, qrec.in_qs[0])], ktype="symmetric") + return qrec.get_outputs(params, [qrec.out_qs[0].clip(in_tensor)], ktype="symmetric") def sigmoid(params, @@ -147,29 +147,7 @@ def hsigmoid_mult_gen_factors(params, qrec): return fac_1, upper_bound, lower_bound -@params_type(HSigmoidActivationParameters) -@qrec_type('scaled') -class HSigmoidSymmetricMult(KernelBase): - @classmethod - def execute(cls, params, - in_tensors, - qrec: QRec, - **kwargs): - in_tensor = qrec.prepare_inputs( - params, in_tensors, ktype="symmetric")[0] - offset = qrec.cache['offset'] - upper_bound = qrec.cache['upper_bound'] - mult = qrec.cache['mult'] - lower_bound = qrec.in_qs[0].zero_point - in_tensor = in_tensor.astype(np.int32) - in_tensor_relued = np.minimum(np.maximum( - in_tensor + offset, lower_bound), upper_bound) * mult - scale_mul_biases_q = qrec.cache['scale_mul_biases_q'] - in_tensor = scale_mul_biases_q.apply_scales(in_tensor_relued) - return qrec.get_outputs(params, - [in_tensor], - ktype="symmetric") @params_type(HSigmoidActivationParameters) @@ -230,14 +208,18 @@ def execute(cls, params, params, in_tensors, ktype="symmetric")[0] if in_tensor.dtype == np.int8: in_tensor = in_tensor.astype(np.int32) << 8 - - output = sigmoid_lut(in_tensor, q16_out=qrec.out_qs[0].dtype == np.uint16) - if qrec.out_qs[0].dtype == np.int8: - # compute_in_out_scale(qrec, 
extra_scale=QType.Pow2( - # bits=32, q=7, signed=True).scale/qrec.in_qs[0].scale) - output >>= 8 + elif in_tensor.dtype == np.uint8: + in_tensor = in_tensor.astype(np.int32) - qrec.in_qs[0].zero_point + in_tensor <<= 8 + elif in_tensor.dtype == np.uint16: + in_tensor = in_tensor.astype(np.int32) - qrec.in_qs[0].zero_point + else: + in_tensor = in_tensor.astype(np.int32) + + out_q15 = sigmoid_lut(in_tensor) scale_mul_biases_q = qrec.cache['scale_mul_biases_q'] - output = scale_mul_biases_q.apply_scales(output) + outp = scale_mul_biases_q.apply_scales(out_q15) + qrec.cache['zero_point'] + output = qrec.out_qs[0].clip(outp) return qrec.get_outputs(params, [output], ktype="symmetric") @@ -279,12 +261,22 @@ def execute(cls, params, **kwargs): in_tensor = qrec.prepare_inputs( params, in_tensors, ktype="symmetric")[0] - out_q15 = tanh_lut(in_tensor.astype(np.int32) << 8) + if in_tensor.dtype == np.int8: + in_tensor = in_tensor.astype(np.int32) << 8 + elif in_tensor.dtype == np.uint8: + in_tensor = in_tensor.astype(np.int32) - qrec.cache['zero_point'] + in_tensor <<= 8 + elif in_tensor.dtype == np.uint16: + in_tensor = in_tensor.astype(np.int32) - qrec.cache['zero_point'] + else: + in_tensor = in_tensor.astype(np.int32) + + out_q15 = tanh_lut(in_tensor) # compute_in_out_scale(qrec, extra_scale=QType.Pow2( # bits=32, q=7, signed=True).scale/qrec.in_qs[0].scale) scale_mul_biases_q = qrec.cache['scale_mul_biases_q'] - output = scale_mul_biases_q.apply_scales(out_q15 >> 8) - + outp = scale_mul_biases_q.apply_scales(out_q15) + qrec.out_qs[0].zero_point + output = qrec.out_qs[0].clip(outp) return qrec.get_outputs(params, [output], ktype="symmetric") @@ -316,15 +308,6 @@ def execute(cls, params, ktype="symmetric") -def hswish_mult_gen_factors(qrec): - in_q = qrec.in_qs[0] - fac_1 = in_q.quantize(np.array([3.])) - # The scale of the result is actually in in_scale * in_scale since it is multiplied by itself - compute_in_out_scale(qrec, extra_scale=qrec.in_qs[0].scale * 1/6) - 
upper_bound = in_q.quantize([6.]) - lower_bound = in_q.quantize([0.]) - return fac_1, upper_bound, lower_bound - @params_type(HSwishActivationParameters) @qrec_type('scaled') @@ -336,18 +319,50 @@ def execute(cls, params, **kwargs): in_tensor = qrec.prepare_inputs( params, in_tensors, ktype="symmetric")[0] - fac_1, upper_bound, lower_bound = hswish_mult_gen_factors(qrec) in_tensor = in_tensor.astype(np.int32) + + offset = qrec.cache['offset'] + upper_bound = qrec.cache['upper_bound'] + zero_point = qrec.cache['zero_point'] + in_tensor_relued = np.minimum(np.maximum( - in_tensor + fac_1, lower_bound), upper_bound) + in_tensor + offset, 0), upper_bound) scale_mul_biases_q = qrec.cache['scale_mul_biases_q'] in_tensor = scale_mul_biases_q.apply_scales( in_tensor * in_tensor_relued) + in_tensor += zero_point + in_tensor = qrec.out_qs[0].clip(in_tensor) return qrec.get_outputs(params, [in_tensor], ktype="symmetric") +@params_type(HSigmoidActivationParameters) +@qrec_type('scaled') +class HSigmoidSymmetricMult(KernelBase): + @classmethod + def execute(cls, params, + in_tensors, + qrec: QRec, + **kwargs): + in_tensor = qrec.prepare_inputs( + params, in_tensors, ktype="symmetric")[0] + in_tensor = in_tensor.astype(np.int32) + + offset = qrec.cache['offset'] + upper_bound = qrec.cache['upper_bound'] + zero_point = qrec.cache['zero_point'] + + in_tensor_relued = np.minimum(np.maximum( + in_tensor + offset, 0), upper_bound) + scale_mul_biases_q = qrec.cache['scale_mul_biases_q'] + in_tensor = scale_mul_biases_q.apply_scales(in_tensor_relued) + in_tensor += zero_point + in_tensor = qrec.out_qs[0].clip(in_tensor) + return qrec.get_outputs(params, + [in_tensor], + ktype="symmetric") + @params_type(HSwishActivationParameters) @qrec_type('symmetric') class HSwishSymmetric(KernelBase): diff --git a/tools/nntool/execution/kernels/quant/dsp_preprocessing.py b/tools/nntool/execution/kernels/quant/dsp_preprocessing.py index c93a0bb24..ddf0263a1 100644 --- 
a/tools/nntool/execution/kernels/quant/dsp_preprocessing.py +++ b/tools/nntool/execution/kernels/quant/dsp_preprocessing.py @@ -16,13 +16,14 @@ import logging import numpy as np +from execution.kernels.kernel_base import KernelBase, params_type, qrec_type from graph.types import (MFCCPreprocessingParameters, RFFT2DPreprocessingParameters) -from execution.kernels.kernel_base import KernelBase, params_type, qrec_type from quantization.new_qrec import QRec from utils.at_norm import at_norm from utils.fft_quant import (Rad2_FFT_DIF_Fix16, Rad4_FFT_DIF_Fix16, RFFT_Step_Fix16, SwapSamples) +from utils.numpy_helpers import np_asscalar from utils.pow_sqrt import (LN_2_1F15, LN_10_INV_Q10, LOG10_2, gap_fl1, logn_17_15, sqrt_17_15) @@ -84,7 +85,7 @@ def melspectrogram_step(cls, params, in_data, filterbanks_sparsity, filterbank_c max_in = np.max(in_data[start:start+nonzero_items]) logn_items = gap_fl1(nonzero_items) shift0 = gap_fl1(max_in) if max_in else 0 - shift = np.asscalar( + shift = np_asscalar( np.int32(shift0 + mel_coeff_q + logn_items - 31 if shift0 + mel_coeff_q + logn_items > 31 else 0)) melbin = 0 @@ -130,9 +131,11 @@ def execute(cls, params, qrec: QRec, **kwargs): in_data = in_tensors[0] - fft_twiddles = np.stack([in_tensors[2][::2], in_tensors[2][1::2]], axis=0) + fft_twiddles = np.stack( + [in_tensors[2][::2], in_tensors[2][1::2]], axis=0) swap_table = in_tensors[3] - rfft_twiddles = np.stack([in_tensors[4][::2], in_tensors[4][1::2]], axis=0) + rfft_twiddles = np.stack( + [in_tensors[4][::2], in_tensors[4][1::2]], axis=0) spectrograms = [] for frame_idx in range(params.n_frames): @@ -164,9 +167,11 @@ def execute(cls, params, in_tensors, qrec: QRec, **kwargs): - fft_twiddles = np.stack([in_tensors[2][::2], in_tensors[2][1::2]], axis=0) + fft_twiddles = np.stack( + [in_tensors[2][::2], in_tensors[2][1::2]], axis=0) swap_table = in_tensors[3] - rfft_twiddles = np.stack([in_tensors[4][::2], in_tensors[4][1::2]], axis=0) + rfft_twiddles = np.stack( + 
[in_tensors[4][::2], in_tensors[4][1::2]], axis=0) mel_filterbank_sparsity_mat = in_tensors[5] mel_filterbank_coeff = in_tensors[6] diff --git a/tools/nntool/execution/kernels/quant/matrix_operations.py b/tools/nntool/execution/kernels/quant/matrix_operations.py index 2116a2315..ba4c84ed6 100644 --- a/tools/nntool/execution/kernels/quant/matrix_operations.py +++ b/tools/nntool/execution/kernels/quant/matrix_operations.py @@ -193,7 +193,7 @@ def execute(cls, params, in_tensors = [in_tensor.astype(np.int32) for in_tensor in qrec.prepare_inputs( params, in_tensors, ktype="symmetric")] if isinstance(params, MatMulTransposedParameters): - mat1, mat2 = in_tensors[0], np.transpose(in_tensors[1], (1, 0)) + mat1, mat2 = in_tensors[0], np.swapaxes(in_tensors[1], -2, -1) else: mat1, mat2 = in_tensors[0], in_tensors[1] @@ -208,9 +208,10 @@ def execute(cls, params, biases = 0 out_tensor = np.matmul(mat1, mat2) + biases + out_rank = len(out_tensor.shape) mul_biases_q = qrec.cache['mul_biases_q'] scale_axis = None if len(mul_biases_q.scale) == 1 else \ - (1 if isinstance(params, MatMulTransposedParameters) else 0) + (out_rank-1 if isinstance(params, MatMulTransposedParameters) else out_rank-2) out_tensor = mul_biases_q.apply_scales(out_tensor, scale_axis) return qrec.get_outputs(params, [out_tensor], ktype="symmetric") @@ -228,7 +229,7 @@ def execute(cls, params, params, in_tensors, ktype="symmetric")] if isinstance(params, MatMulTransposedParameters): - mat1, mat2 = in_tensors[0], np.transpose(in_tensors[1], (1, 0)) + mat1, mat2 = in_tensors[0], np.swapaxes(in_tensors[1], -2, -1) else: mat1, mat2 = in_tensors[0], in_tensors[1] diff --git a/tools/nntool/execution/kernels/quant/ssd_postprocess.py b/tools/nntool/execution/kernels/quant/ssd_postprocess.py index 911701401..3cb5a609f 100644 --- a/tools/nntool/execution/kernels/quant/ssd_postprocess.py +++ b/tools/nntool/execution/kernels/quant/ssd_postprocess.py @@ -131,7 +131,10 @@ def execute(cls, params, # params, qrec, offsets, 
anchors, scores, anchors_type='centers') # out_boxes, out_scores, out_classes = cls.nms(params, qrec, decoded_bboxes, valid_scores) # out_count = np.array([sum(out_classes != 0)]) - return qrec.get_outputs(params, [out_boxes, out_classes, out_scores], ktype="symmetric") + outputs = [out_boxes, out_classes, out_scores] + if params.output_detection_count: + outputs.append(np.array([out_idx], dtype=np.int32)) + return qrec.get_outputs(params, outputs, ktype="symmetric") @params_type(NMSParameters) @qrec_type('scaled') diff --git a/tools/nntool/expressions/symbolic/basic.py b/tools/nntool/expressions/symbolic/basic.py index c7917ff48..cdb9f9c2e 100644 --- a/tools/nntool/expressions/symbolic/basic.py +++ b/tools/nntool/expressions/symbolic/basic.py @@ -13,6 +13,8 @@ # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see . +import logging + import numpy as np from bfloat16 import bfloat16 from quantization.qtype import DTYPE_GAP_CTYPE @@ -22,6 +24,8 @@ from .symbol import (Constant, Rational, c_headers, copy_props, environment, handles, handlesr, nargs) +LOG = logging.getLogger('nntool.'+__name__) + @nargs(2) @handles('__add__') @@ -178,6 +182,7 @@ class GapAbs(Abs): def _c_expr(self, *args, **kwargs): return "gap_abs(%s)" % (args[0]) + @nargs(1) class Round(Function): @@ -271,6 +276,7 @@ def _py_expr(self, *args, **kwargs): def _c_expr(self, *args, **kwargs): return "sqrtf(%s)" % (args[0],) + @nargs(1) @c_headers('') class RSqrt(Function): @@ -284,6 +290,7 @@ def _py_expr(self, *args, **kwargs): def _c_expr(self, *args, **kwargs): return "1.0f/sqrtf(%s)" % (args[0],) + @nargs(1) @c_headers('') class Log(Function): @@ -353,13 +360,14 @@ def _py_expr(self, *args, **kwargs): def _c_expr(self, *args, **kwargs): return f"square({args[0]}))" + @nargs(2) @c_headers('') class Pow(Function): def _impl(self, *args, **kwargs): if any(b < 0 and e < 1 for b, e in np.broadcast(*args)): - raise ValueError( + LOG.warning( 
'fractional powers are being passed to a negative base for Pow operator') return np.power(args[0], args[1], dtype=self.dtype) @@ -563,6 +571,7 @@ def _eval(self, *args, **kwargs): return self._eval_float_to_quant(*args, **kwargs) return self._eval_quant_to_float(*args, **kwargs) + @nargs(2) class SquaredDifference(CompoundFunction): def _eval(self, *args, **kwargs): diff --git a/tools/nntool/expressions/symbolic/function_collection.py b/tools/nntool/expressions/symbolic/function_collection.py index d4c7e9774..ff6717ebd 100644 --- a/tools/nntool/expressions/symbolic/function_collection.py +++ b/tools/nntool/expressions/symbolic/function_collection.py @@ -123,6 +123,10 @@ def ops(self): def c_header_set(self): return set().union(*[func.c_header_set for func in self._functions.values()]) + def set_var_shapes(self): + for var, func in self.functions.items(): + var.shape = func.shape + @staticmethod def split_indexes(unique_axis_groups): uaq = sorted(unique_axis_groups, key=len) diff --git a/tools/nntool/expressions/symbolic/q15_quantization/clip_norm.py b/tools/nntool/expressions/symbolic/q15_quantization/clip_norm.py index b3fd805e7..76fc4d5ed 100644 --- a/tools/nntool/expressions/symbolic/q15_quantization/clip_norm.py +++ b/tools/nntool/expressions/symbolic/q15_quantization/clip_norm.py @@ -63,6 +63,7 @@ class Norm(Function): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) + x=0 def _impl(self, *args, **kwargs): dtype = self.dtype diff --git a/tools/nntool/expressions/symbolic/q15_quantization/handlers.py b/tools/nntool/expressions/symbolic/q15_quantization/handlers.py index 530f5f8b8..4cd591872 100644 --- a/tools/nntool/expressions/symbolic/q15_quantization/handlers.py +++ b/tools/nntool/expressions/symbolic/q15_quantization/handlers.py @@ -295,7 +295,8 @@ def _quantize(cls, prod_q, 15), max_val=prod_scale, min_val=-prod_scale) if prod_q > 15: qsym = Norm(sym_cls(*in_syms, dtype=np.int32), - QuantizedConstant(prod_q - 15)) + 
QuantizedConstant(prod_q - 15), + dtype=np.int32) else: qsym = sym_cls(*in_syms) return (qsym, out_qrec) diff --git a/tools/nntool/expressions/symbolic/q15_quantization/scale_quantized.py b/tools/nntool/expressions/symbolic/q15_quantization/scale_quantized.py index 76a4d77bd..30d812ee4 100644 --- a/tools/nntool/expressions/symbolic/q15_quantization/scale_quantized.py +++ b/tools/nntool/expressions/symbolic/q15_quantization/scale_quantized.py @@ -54,7 +54,7 @@ def _c_expr(self, *args, **kwargs): @c_headers('"Gap.h"') @copy_props('from_qrec', 'to_qrec', 'num_bits') class ScaleQuantized(CompoundFunction): - def __init__(self, *args, from_qrec=None, to_qrec=None, num_bits=15, **kwargs): + def __init__(self, *args, from_qrec=None, to_qrec=None, num_bits=8, **kwargs): self._from_qrec = from_qrec self._to_qrec = to_qrec self._qbias, self._qnorm = None, None @@ -130,7 +130,8 @@ def _eval(self, *args, **kwargs): ), #pylint: disable=invalid-unary-operand-type QuantizedConstant(-qnorm, dtype=np.int8), - name=self.name + name=self.name, + dtype=self._to_qrec.dtype ) elif qnorm > 0: sym = Norm( @@ -140,7 +141,8 @@ def _eval(self, *args, **kwargs): dtype=self._to_qrec.dtype ), QuantizedConstant(qnorm, dtype=np.int8), - name=self.name + name=self.name, + dtype=self._to_qrec.dtype ) else: sym = Mul( diff --git a/tools/nntool/generation/at_generators/cnn_convolution_pool_relu.py b/tools/nntool/generation/at_generators/cnn_convolution_pool_relu.py index 258fdc484..c5709b71c 100644 --- a/tools/nntool/generation/at_generators/cnn_convolution_pool_relu.py +++ b/tools/nntool/generation/at_generators/cnn_convolution_pool_relu.py @@ -139,6 +139,8 @@ def gen_activation_op(activation): aop = "KOP_LEAKYRELU" elif activation == "sigmoid": aop = "KOP_SIGMOID" + elif activation == "tanh": + aop = "KOP_TANH" else: raise NotImplementedError("activation type %s not implemented" % activation) return aop diff --git a/tools/nntool/generation/at_types/at_params.py 
b/tools/nntool/generation/at_types/at_params.py index a075d4c19..0ba4358cd 100644 --- a/tools/nntool/generation/at_types/at_params.py +++ b/tools/nntool/generation/at_types/at_params.py @@ -42,22 +42,22 @@ def gen_activation_op(activation, force_relu=False, asymmetric=False): else: aop = "KOP_RELUN" if not force_relu else "KOP_RELU" elif activation == "hsigmoid": - assert not asymmetric, 'asymmetric not supported' + #assert not asymmetric, 'asymmetric not supported' aop = "KOP_HSIGMOID" elif activation == "htanh": - assert not asymmetric, 'asymmetric not supported' + #assert not asymmetric, 'asymmetric not supported' aop = "KOP_HTANH" elif activation == "swish" or activation == "hswish": - assert not asymmetric, 'asymmetric not supported' + #assert not asymmetric, 'asymmetric not supported' aop = "KOP_HSWISH" elif activation == "leaky": - assert not asymmetric, 'asymmetric not supported' + #assert not asymmetric, 'asymmetric not supported' aop = "KOP_LEAKYRELU" elif activation == "sigmoid": - assert not asymmetric, 'asymmetric not supported' + #assert not asymmetric, 'asymmetric not supported' aop = "KOP_SIGMOID" elif activation == "tanh": - assert not asymmetric, 'asymmetric not supported' + #assert not asymmetric, 'asymmetric not supported' aop = "KOP_TANH" else: raise NotImplementedError("activation type %s not implemented" % activation) diff --git a/tools/nntool/generation/code_generator.py b/tools/nntool/generation/code_generator.py index 0490a2809..3a243be1c 100644 --- a/tools/nntool/generation/code_generator.py +++ b/tools/nntool/generation/code_generator.py @@ -18,15 +18,17 @@ import numpy as np from bfloat16 import bfloat16 from expressions.symbolic.kernel_codegen import BasicKernel -from generation.new_generators import ne16 +from graph.manipulations.dimensions import add_dimensions from graph.types import (ConcatParameters, ConstantInputParameters, InputParameters, OutputParameters, ReshapeParameters, SplitParameters, TransposeParameters) -from 
graph.types.lstm import LSTMParameters -from graph.types.others import CopyParameters, QuantizeParameters +from graph.types.base import NNEdge +from graph.types.fusions import FusionBase +from graph.types.others import CopyParameters, NoOPParameters, QuantizeParameters from graph.types.rnn import RNNBaseParameters from utils.node_id import NodeId +from generation.gen_utils import ModelGenerationInternalError from generation.generator_decorators import RegisteredGeneratorsMixin # pylint: disable=wildcard-import,unused-wildcard-import from generation.generators import * @@ -117,6 +119,17 @@ class CodeGenerator(NewGenerator, RegisteredGeneratorsMixin): def __init__(self, G, naming_convension, opts=None): super().__init__() self.G = G + # this generates a view of the graph with all nodes that are not generated removed + self.hidden_graph = G.with_hidden_nodes( + lambda node: node.no_model_code, + edge_class=NNEdge + ) + self.sorted_nodes = sorted( + self.hidden_graph.nodes(), key=lambda node: node.step_idx) + naming_convension.G = self.hidden_graph + # the edge parameters are generated from the graph with the hidden nodes but the dimensions + # are not updated. 
They are read from the nodes + add_dimensions(self.hidden_graph, update_graph=False) self.naming_convension = naming_convension self.name_cache = NameCache() self.bindings = [] @@ -133,14 +146,22 @@ def __init__(self, G, naming_convension, opts=None): self.opts.update(opts) if self.opts['include_project_header']: self.include_files.append(self.project_name + '.h') - has_vcd = False - for step in G.graph_state.steps: - node = step['node'] - if node.at_options.vcd_trace_on is not None: - has_vcd = True - if has_vcd: + if any(step and step['node'].at_options.vcd_trace_on is not None + for step in G.graph_state.steps): self.include_files.append('hal/gvsoc/gvsoc.h') + @property + def output_nodes(self): + for node in sorted(self.hidden_graph.outputs(), key=lambda node: node.step_idx): + if isinstance(node, OutputParameters): + yield node + + @property + def input_nodes(self): + for node in sorted(self.hidden_graph.inputs(), key=lambda node: node.step_idx): + if isinstance(node, InputParameters): + yield node + @property def project_name(self): return self.naming_convension.get_project_name() @@ -160,8 +181,9 @@ def get_edge_name(self, eparams): def get_node_name(self, params, target): try: return self.name_cache[params][target] - except: - raise ValueError(f"Name Cache: {params.name} {target} not found") + except KeyError as ex: + raise ModelGenerationInternalError( + f"Name Cache: {params.name} {target} not found") from ex def memory_device_generator(self, indent=0): self.opts['memory_devices'].set_l2_ram_ext_managed( @@ -200,66 +222,36 @@ def binding_generator(self, indent=0): return str(code_block) @staticmethod - def real_up_connection(G, eparams, set_real=False): - while isinstance(eparams.creating_node, ReshapeParameters) or \ - (isinstance(eparams.creating_node, TransposeParameters) and eparams.creating_node.does_nothing()): - set_real = True - eparams = G.in_edges(eparams.creating_node.name)[0].params - return eparams, set_real - - @staticmethod - def 
real_down_connection(G, eparams): + def get_output(G, eparams): oedges = G.indexed_out_edges(eparams.creating_node.name)[ eparams.creating_node_idx] - while any(isinstance(oedge.to_node, ReshapeParameters) or \ - (isinstance(oedge.to_node, TransposeParameters) and oedge.to_node.does_nothing()) for oedge in oedges): - if len(oedges) > 1: - raise NotImplementedError('multiple edges on ungenerated node') - oedges = G.out_edges(oedges[0].to_node.name) + if len(oedges) != 1 or not isinstance(oedges[0].to_node, OutputParameters): + return None return oedges[0] def local_generator(self, indent=0): - edges = set(edge.params for edge in self.G.edges()) + edges = set(edge.params for edge in self.hidden_graph.edges()) sorted_edges = list(edges) sorted_edges.sort(key=lambda eparams: eparams.creating_step) for eparams in sorted_edges: - # check if the following real node is an output - if isinstance(eparams.creating_node, ConcatParameters): - rout_edge = self.real_down_connection(self.G, eparams) - if isinstance(rout_edge.to_node, OutputParameters): - rout_eparams = rout_edge.params - cname = self.naming_convension.get_edge_name(rout_eparams.creating_node, - rout_eparams.creating_step, - rout_eparams.edge_type, - rout_eparams.edge_order) - LOG.info("edge from step %s %s is not used and is replaced with edge to step %s:%s %s cname: %s", - eparams.creating_node.step_idx, eparams.creating_node.name, - rout_eparams.creating_node.name, rout_eparams.creating_node.step_idx, - rout_eparams.creating_step, cname) - self.name_cache.set(eparams, 'edge', cname) - continue - - rin_eparams, set_real = self.real_up_connection(self.G, eparams) - if rin_eparams.edge_type == "out": - # The edge was marked as an output so find the real edge down - rin_eparams = self.real_down_connection( - self.G, rin_eparams).params + if eparams.edge_type == "out": + # The edge was marked as an output so find the real output edge + oedges = self.hidden_graph.indexed_out_edges(eparams.creating_node.name)[ + 
eparams.creating_node_idx] + oedges = list(filter(lambda edge: isinstance( + edge.to_node, OutputParameters), oedges)) + if not oedges: + raise ModelGenerationInternalError( + f'output edge created by {eparams.creating_node.name}:{eparams.creating_node_idx} ' + f'is not connected to an output - {" ".join(edge.to_node.name for edge in oedges)}') + if len(oedges) > 1: + raise ModelGenerationInternalError( + f'output edge created by {eparams.creating_node.name}:{eparams.creating_node_idx} ' + f'is connected to more than one output - {" ".join(edge.to_node.name for edge in oedges)}') + + rin_eparams = oedges[0].params self.name_cache.set(eparams, 'edge', rin_eparams.name) continue - else: - if set_real: - # Code will not be generated for reshape or empty transpose so the input to the - # following node is the input to this node - cname = self.naming_convension.get_edge_name(rin_eparams.creating_node, - rin_eparams.creating_step, - rin_eparams.edge_type, - rin_eparams.edge_order) - LOG.info("edge from step %s %s is not used and is replaced with edge from step %s:%s %s cname: %s", - eparams.creating_node.step_idx, eparams.creating_node.name, - rin_eparams.creating_node.name, rin_eparams.creating_node.step_idx, - rin_eparams.creating_step, cname) - self.name_cache.set(eparams, 'edge', cname) - continue cname = self.naming_convension.get_edge_name(eparams.creating_node, eparams.creating_step, @@ -304,7 +296,7 @@ def local_generator(self, indent=0): return str(code_block) def stack_generator(self, indent=0): - edges = set(edge.params for edge in self.G.edges()) + edges = set(edge.params for edge in self.hidden_graph.edges()) sorted_edges = list(edges) sorted_edges.sort(key=lambda eparams: eparams.creating_step) concat_edges = list([eparams for eparams in sorted_edges if isinstance( @@ -313,15 +305,15 @@ def stack_generator(self, indent=0): node = eparams.creating_node cname_out = self.name_cache[eparams]['edge'] in_edge_names = [self.name_cache[edge.params]['edge'] - for 
edge in self.G.indexed_in_edges(node.name)] + for edge in self.hidden_graph.indexed_in_edges(node.name)] self.stacked_tensors.append(TensorStack(cname_out, in_edge_names)) - split_nodes = [node for node in self.G.nodes( + split_nodes = [node for node in self.hidden_graph.nodes( ) if isinstance(node, SplitParameters)] for split_node in split_nodes: - eparams_in = self.G.in_edges(split_node.name)[0].params + eparams_in = self.hidden_graph.in_edges(split_node.name)[0].params eparams_out = [ - edge_bundle[0].params for edge_bundle in self.G.indexed_out_edges(split_node.name)] + edge_bundle[0].params for edge_bundle in self.hidden_graph.indexed_out_edges(split_node.name)] cname_in = self.name_cache[eparams_in]['edge'] cnames_out = [self.name_cache[eparams]['edge'] for eparams in eparams_out] @@ -359,22 +351,27 @@ def global_generator(self, indent=0): def generate_outputs(self): outputs = set() - count_outputs = 0 - for node in self.G.output_nodes(): + for node in self.output_nodes: qrec = self.G.quantization[NodeId(node)] - for edge in self.G.in_edges(node.name): - if isinstance(edge.from_node, (LSTMParameters, )) and count_outputs: + for edge in self.hidden_graph.in_edges(node.name): + if isinstance(edge.from_node, (RNNBaseParameters, )) and edge.from_idx > 0: continue - eparams, _ = self.real_up_connection(self.G, edge.params) + eparams = edge.params if eparams in outputs: continue eparams.edge_type = "out" outputs.add(eparams) self.execute_phase("outputs", node, qrec, edge) - count_outputs += 1 + + def sorted_nodes_and_fusions(self): + for node in self.sorted_nodes: + if isinstance(node, FusionBase) and node.quantize_internals: + for fnode in node.contained_nodes(): + yield node, fnode + yield node, None def generate_constants(self): - for _, pnode, _, fnode in self.G.nodes_iterator(): + for pnode, fnode in self.sorted_nodes_and_fusions(): anode = pnode if not fnode else fnode qrec = self.G.quantization.get(NodeId(pnode, fnode)) if not 
self.new_execute_phase("globals", anode, qrec, pnode, fnode): @@ -382,9 +379,9 @@ def generate_constants(self): def generate_inputs(self): inputs = set() - for node in self.G.input_nodes(): + for node in self.input_nodes: qrec = self.G.quantization[NodeId(node)] - for edge in self.G.out_edges(node.name): + for edge in self.hidden_graph.out_edges(node.name): eparams = edge.params if eparams in inputs: continue @@ -474,22 +471,22 @@ def get_node_cname(self, node): def kernel_generator(self, indent=0): code_block = CodeBlock(starting_indent=indent) - for _, node, _, _ in self.G.nodes_iterator(yield_fusions=False): + for node in self.sorted_nodes: name = node.name cname = self.get_node_cname(node) if node.at_options.vcd_trace_on is not None: self.add_vcd_trace_binding(cname, node.at_options.vcd_trace_on) self.name_cache.set(node, 'node', cname) - in_eparams = self.G.get_in_params(name) - out_eparams = self.G.get_out_params(name) + in_eparams = [edge.params if edge else None + for edge in self.hidden_graph.indexed_in_edges(name)] + out_eparams = [edge_bundle[0].params if edge_bundle else None + for edge_bundle in self.hidden_graph.indexed_out_edges(name)] try: qrec = self.G.quantization[NodeId(node)] except KeyError as err: LOG.error("Quantization record not found for node %s", node.name) raise err - if isinstance(node, ReshapeParameters): - continue - if isinstance(node, TransposeParameters) and node.does_nothing(): + if node.no_model_code: continue elif isinstance(node, (InputParameters, OutputParameters)): continue @@ -507,7 +504,8 @@ def kernel_generator(self, indent=0): in_eparams, out_eparams, cname) if not (self.new_execute_phase("kernels", node, qrec, in_eparams, out_eparams, cname) or self.execute_phase("kernels", node, qrec, in_eparams, out_eparams, cname)): - raise NotImplementedError(f"Don't know how to generate kernel for parameter type {node.name} {node.CLS_OP_NAME}. 
" + raise NotImplementedError("Don't know how to generate kernel for parameter type " + f"{node.name} {node.CLS_OP_NAME}. " "Perhaps you need to run fusions -a expression_matcher.") # if self.opts['generate_checksums']: @@ -527,16 +525,16 @@ def add_vcd_trace_binding(self, cname, enable): before=True)) def add_checksum_binding(self, cname, name, step_idx, eparams, before): - node = self.G[name] + node = self.hidden_graph[name] if before: size = node.in_dims[0].size() else: size = node.out_dims[0].size() self.bindings.append( FunctionBindingList(cname, - checksum_func(self.G, name), + checksum_func(self.hidden_graph, name), Imm(step_idx), - Imm(calc_value_checksum(self.G, name)), + Imm(calc_value_checksum(self.hidden_graph, name)), GArgEdge(eparams[0]), Imm(size), before=before) @@ -578,9 +576,7 @@ def load_basic_kernel_library(self, indent=0): def header_generator(self, indent=0): code_block = CodeBlock(starting_indent=indent) - for _, node, _, fnode in self.G.nodes_iterator(): - if fnode: - continue + for node in self.sorted_nodes: cname = self.name_cache[node]['node'] qrec = self.G.quantization[NodeId(node)] code_block.comment(cname) @@ -687,7 +683,7 @@ def expressions_user_kernel_source_generator(self, indent=0): def generate_main_appl_inout_def(self, test_inputs=None, test_outputs=None, indent=0): code_block = CodeBlock(starting_indent=indent) code_block.write("/* Inputs */") - for i, node in enumerate(self.G.input_nodes()): + for i, node in enumerate(self.input_nodes): if node.at_options.allocate or node.at_options.extern_input_pointer: continue nodeq = self.G.quantization[NodeId(node, None)].out_qs[0] @@ -703,7 +699,7 @@ def generate_main_appl_inout_def(self, test_inputs=None, test_outputs=None, inde code_block.write( f"L2_MEM {CTYPE[nodeq.ctype]} {node.name.capitalize()}[{node.out_dims[0].size()}];") code_block.write("/* Outputs */") - for node in self.G.output_nodes(): + for node in self.output_nodes: if node.at_options.allocate: continue nodeq = 
self.G.quantization[NodeId(node, None)].out_qs[0] @@ -711,7 +707,7 @@ def generate_main_appl_inout_def(self, test_inputs=None, test_outputs=None, inde f"L2_MEM {CTYPE[nodeq.ctype]} {node.name.capitalize()}[{node.out_dims[0].size()}];") if test_outputs: - for out_n, outp in zip(self.G.output_nodes(), test_outputs): + for out_n, outp in zip(self.output_nodes, test_outputs): code_block.write( 'L2_MEM {} {}_gt[] = {{{}}};', dtype2ctype(outp), @@ -722,15 +718,13 @@ def generate_main_appl_inout_def(self, test_inputs=None, test_outputs=None, inde def gen_inout_list(self): inout_str = "" - for node in self.G.input_nodes(): + for node in self.input_nodes: if node.at_options.allocate or node.at_options.extern_input_pointer: continue inout_str += f"{node.name.capitalize()}, " - rnn_present = any([isinstance(node, RNNBaseParameters) - for node in self.G.nodes()]) - if rnn_present: + if self.hidden_graph.nodes(node_classes=RNNBaseParameters): inout_str += "1, " - for node in self.G.output_nodes(): + for node in self.output_nodes: if node.at_options.allocate: continue inout_str += f"{node.name.capitalize()}, " @@ -739,7 +733,7 @@ def gen_inout_list(self): def generate_output_check(self, tol=0.0, indent=0): code = CodeBlock(starting_indent=indent) code.write('int errors;') - for idx, out_node in enumerate(self.G.output_nodes()): + for out_node in self.output_nodes: out_sz = out_node.out_dims[0].size() nodeq = self.G.quantization[NodeId(out_node, None)].out_qs[0] dtype = "%f" if nodeq.is_floating else "%d" @@ -753,8 +747,9 @@ def generate_output_check(self, tol=0.0, indent=0): f"{dtype2ctype(nodeq)} diff = {out_node.name.capitalize()}[j] - " f"{out_node.name.capitalize()}_gt[j];") code.write("diff = (diff>0)?diff:(-diff);") - code.write(f"if (diff > max_diff) max_diff = diff;") - code.write(f'if (diff > {nodeq.quantize(np.array(tol)).item()}) {{') + code.write("if (diff > max_diff) max_diff = diff;") + code.write( + f'if (diff > {nodeq.quantize(np.array(tol)).item()}) {{') else: 
code.write( f'if ({out_node.name.capitalize()}[j] != {out_node.name.capitalize()}_gt[j]) {{') diff --git a/tools/nntool/generation/default_appl_main_template.py b/tools/nntool/generation/default_appl_main_template.py index c6493a947..f0a3a5e5f 100644 --- a/tools/nntool/generation/default_appl_main_template.py +++ b/tools/nntool/generation/default_appl_main_template.py @@ -99,7 +99,8 @@ def generate_main_appl_template(G, gen, test_inputs=None, test_outputs=None, tol printf("Call cluster\\n"); #ifndef __EMUL__ - struct pi_cluster_task task = {0}; + struct pi_cluster_task task; + pi_cluster_task(&task,NULL,NULL); task.entry = cluster; task.arg = NULL; task.stack_size = (unsigned int) STACK_SIZE; @@ -186,7 +187,7 @@ def generate_main_appl_make(G, gen, quantized, open_args=""): TRAINED_MODEL = ${os.path.split(G.graph_identity.filename)[1]} -MODEL_EXPRESSIONS = ${"$(MODEL_BUILD)/" + gen.opts['basic_kernel_source_file'] if gen.G.has_expressions else ""} +MODEL_EXPRESSIONS = ${"$(MODEL_BUILD)/" + gen.opts['basic_kernel_source_file']} NNTOOL_EXTRA_FLAGS += ${open_args} ${"MODEL_QUANTIZED=1" if quantized else ""} @@ -225,7 +226,7 @@ def generate_main_appl_make_atproject(G, gen, quantized, model_path): AT_MODEL_PATH=${model_path} -MODEL_EXPRESSIONS = ${gen.opts['basic_kernel_source_file'] if gen.G.has_expressions else ""} +MODEL_EXPRESSIONS = ${gen.opts['basic_kernel_source_file']} ${"MODEL_QUANTIZED=1" if quantized else ""} diff --git a/tools/nntool/generation/gen_utils.py b/tools/nntool/generation/gen_utils.py index 0744c27ab..c2207782a 100644 --- a/tools/nntool/generation/gen_utils.py +++ b/tools/nntool/generation/gen_utils.py @@ -13,6 +13,11 @@ # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see . 
+import os + +from utils.exception import NNToolInternelError + + def at_bits(qtype): if qtype is None: return 0 @@ -32,3 +37,11 @@ def at_q(qtype): def at_bits_and_q(qtype): return "{}, {}".format(at_bits(qtype), qtype.q) + +def write_empty(model_directory, model_file, comment): + model_path = os.path.join(model_directory, model_file) + with open(model_path, "w") as output_fp: + output_fp.write(f"/** {comment}\n**/") + +class ModelGenerationInternalError(NNToolInternelError): + pass diff --git a/tools/nntool/generation/generators/globals/constant_input_generator.py b/tools/nntool/generation/generators/globals/constant_input_generator.py index 526cd8755..7ac6691b4 100644 --- a/tools/nntool/generation/generators/globals/constant_input_generator.py +++ b/tools/nntool/generation/generators/globals/constant_input_generator.py @@ -18,11 +18,13 @@ from generation.at_types.constant_info import ConstantInfo from generation.at_types.tc_arg_info import (GlobalArgInfo, GlobalResetArgInfo, InputArgInfo) +from generation.gen_utils import ModelGenerationInternalError from generation.generator_decorators import (QREC_FLOAT, QREC_MULT8, QREC_POW2, generation_function) from graph.types import ConstantInputParameters -from graph.types.fusions import ConvFusionParameters, LinearFusionParameters -from graph.types.linear import FcParameters +from graph.types.fusions import (ConvFusionParameters, + LinearFusionParameters, + MatMulOpFusionParameters) from utils.node_id import NodeId from utils.numpy_helpers import interleave, packbits @@ -83,13 +85,15 @@ def constant_input_globals_generator(gen, node, qrec, pnode, fnode) -> bool: if qtype.attr.ne16_biases: to_node = gen.G.out_edges(pnode.name)[0].to_node - if isinstance(to_node, (ConvFusionParameters, LinearFusionParameters)): + if isinstance(to_node, (ConvFusionParameters, LinearFusionParameters, MatMulOpFusionParameters)): cnodes = to_node.contained_nodes() quants = [gen.G.quantization[NodeId( to_node, fnode)] for fnode in cnodes] 
filter_qrec = quants[0] else: filter_qrec = gen.G.quantization[NodeId(to_node)] + if 'mul_biases_q' not in filter_qrec.cache: + raise ModelGenerationInternalError(f"mul_biases_q not found in qrec for {to_node.name}") mul_qbiases = filter_qrec.cache['mul_biases_q'].qbiases mul_qnorms = filter_qrec.cache['mul_biases_q'].qnorms value = np.where(mul_qnorms > 0, diff --git a/tools/nntool/generation/naming_convension.py b/tools/nntool/generation/naming_convension.py index cc05166c2..60d10abd8 100644 --- a/tools/nntool/generation/naming_convension.py +++ b/tools/nntool/generation/naming_convension.py @@ -28,9 +28,16 @@ class NamingConvension(ABC): - def __init__(self, G): - self.G = G - self.multi_out_edges = {} + def __init__(self, G=None): + self._G = G + + @property + def G(self): + return self._G + + @G.setter + def G(self, val): + self._G = val @abstractmethod def get_node_name(self, node_name, step_idx, params): diff --git a/tools/nntool/generation/new_generators/general/expressions.py b/tools/nntool/generation/new_generators/general/expressions.py index 2f4c64573..3752f18b3 100644 --- a/tools/nntool/generation/new_generators/general/expressions.py +++ b/tools/nntool/generation/new_generators/general/expressions.py @@ -25,7 +25,7 @@ @paramstype(ExpressionFusionParameters) -class GenCopyParameters(GeneratorBase, InOutBindingsMixin): +class GenExpressionParameters(GeneratorBase, InOutBindingsMixin): @classmethod def globals_generator(cls, gen, node, qrec, pnode, fnode) -> bool: return True diff --git a/tools/nntool/generation/new_generators/helpers/act_infos.py b/tools/nntool/generation/new_generators/helpers/act_infos.py index a1defd826..c47d75ffe 100644 --- a/tools/nntool/generation/new_generators/helpers/act_infos.py +++ b/tools/nntool/generation/new_generators/helpers/act_infos.py @@ -13,13 +13,14 @@ # along with this program. If not, see . 
import numpy as np - +from generation.gen_utils import ModelGenerationInternalError from graph.types import (HSigmoidActivationParameters, HSwishActivationParameters, LeakyActivationParameters, ReluActivationParameters, SigmoidActivationParameters, SoftMaxParameters, TanHActivationParameters) from graph.types.activations import HTanHActivationParameters + def gen_act_infos(act_params, act_q): comment = "" if isinstance(act_params, ReluActivationParameters): @@ -27,42 +28,20 @@ def gen_act_infos(act_params, act_q): 'actscale': act_q.cache['scale_mul_biases_q'].qbiases.astype(np.uint8), 'actscalen': act_q.cache['scale_mul_biases_q'].qnorms.astype(np.uint8) } - if act_params.upper_bound is None: # or fnode is not None: - if act_q.in_qs[0].zero_point == 0: - contents.update({ - 'a0': np.uint8(0), - 'b0': np.uint8(0), - 'c0': np.uint8(0), - }) - else: - contents.update({ - 'a0': act_q.in_qs[0].zero_point.astype(act_q.in_qs[0].dtype), - 'b0': np.uint8(0), - 'c0': np.uint8(0), - }) - else: - if act_q.in_qs[0].zero_point == 0: - contents.update({ - 'a0': act_q.in_qs[0].quantize(act_params.upper_bound), - 'b0': np.uint8(0), - 'c0': np.uint8(0), - }) - else: - contents.update({ - 'a0': act_q.in_qs[0].zero_point.astype(act_q.in_qs[0].dtype), - 'b0': act_q.in_qs[0].quantize(act_params.upper_bound), - 'c0': np.uint8(0), - }) + contents.update({ + 'a0': act_q.cache['lower_bound'] if "lower_bound" in act_q.cache else np.uint8(0), + 'b0': act_q.cache['upper_bound'] if "upper_bound" in act_q.cache else np.uint8(0), + 'c0': np.uint8(0), + }) elif isinstance(act_params, (HSigmoidActivationParameters, HSwishActivationParameters)): - # currently combines all scaling factors into one scale and shift - assert act_q.in_qs[0].zero_point == 0 and act_q.out_qs[0].zero_point == 0, "asymmetric not supported" + # mult factor is combined into scale contents = { 'actscale': act_q.cache['scale_mul_biases_q'].qbiases.astype(np.uint8), 'actscalen': 
act_q.cache['scale_mul_biases_q'].qnorms.astype(np.uint8), 'a0': act_q.cache['upper_bound'], 'b0': act_q.cache['offset'], - 'c0': act_q.cache['mult'] + 'c0': act_q.cache['zero_point'] } elif isinstance(act_params, SoftMaxParameters): assert act_q.in_qs[0].zero_point == 0 and act_q.out_qs[0].zero_point == 0, "asymmetric not supported" @@ -71,27 +50,25 @@ def gen_act_infos(act_params, act_q): 'bias_sm': act_q.cache['bias_sm'] } elif isinstance(act_params, LeakyActivationParameters): - assert act_q.in_qs[0].zero_point == 0 and act_q.out_qs[0].zero_point == 0, "asymmetric not supported" + #assert act_q.in_qs[0].zero_point == 0 and act_q.out_qs[0].zero_point == 0, "asymmetric not supported" contents = { 'actscale': act_q.cache['scale_mul_biases_q'].qbiases.astype(np.uint8), 'actscalen': act_q.cache['scale_mul_biases_q'].qnorms.astype(np.uint8), 'a0': act_q.cache['leak_factor'], - 'b0': np.uint8(0), + 'b0': act_q.cache['zero_point'], 'c0': np.uint8(0), } - elif isinstance(act_params, (SigmoidActivationParameters, TanHActivationParameters, HTanHActivationParameters)): + elif isinstance(act_params, (SigmoidActivationParameters, TanHActivationParameters)): contents = { 'actscale': act_q.cache['scale_mul_biases_q'].qbiases.astype(np.uint8), 'actscalen': act_q.cache['scale_mul_biases_q'].qnorms.astype(np.uint8), - 'a0': np.uint8(0), + 'a0': act_q.cache["zero_point"], 'b0': np.uint8(0), 'c0': np.uint8(0), } else: - raise NotImplementedError( - "activation type not implemented in model generator") + raise ModelGenerationInternalError( + f"activation type {act_params.__class__.__name__} not implemented in model generator") comment += f"in: {act_q.in_qs[0].scale[0]:.5f} out: {act_q.out_qs[0].scale[0]:.5f} " - comment += f"actscale: {contents['actscale']} actscalen: {contents['actscalen']} " - comment += f"A0: {contents['a0']} B0: {contents['b0']} C0: {contents['c0']}" return contents, comment diff --git a/tools/nntool/generation/new_generators/mult8/conv_pool_mult8.py 
b/tools/nntool/generation/new_generators/mult8/conv_pool_mult8.py index 492df0a58..6e197e35e 100644 --- a/tools/nntool/generation/new_generators/mult8/conv_pool_mult8.py +++ b/tools/nntool/generation/new_generators/mult8/conv_pool_mult8.py @@ -77,12 +77,11 @@ def globals_generator(cls, gen, node, qrec, pnode, fnode) -> bool: return False - comment = f"BiasQ: {0}" + infos_comment infos['BIASN'] = np.int8(0) # BiasQ + conv_mul_bias = filt_q.cache.get('mul_biases_q') + infos['PRENORM'] = np.uint8(conv_mul_bias.pre_normalization if isinstance(conv_mul_bias, MultMulBiasScaleQType) else 0) if filt_q.cache.get('ne16'): - conv_mul_bias = filt_q.cache.get('mul_biases_q') - infos['PRENORM'] = np.uint8(conv_mul_bias.pre_normalization if isinstance(conv_mul_bias, MultMulBiasScaleQType) else 0) infos['NE16_PADVAL'] = np.atleast_1d(filt_q.in_qs[0].zero_point).astype(filt_q.in_qs[0].dtype) infos['NE16_WOFFSET'] = -np.array(filt_q.in_qs[1].zero_point).astype(np.int32) infos_len = 'NE16_DIM' @@ -91,7 +90,8 @@ def globals_generator(cls, gen, node, qrec, pnode, fnode) -> bool: infos_encoder = SQ8ActInfos() - contents = infos_encoder.gen_infos_array(infos_len, **infos) + contents, new_comment = infos_encoder.gen_infos_array(infos_len, **infos) + comment = infos_comment + new_comment cname, file_name = gen_constant(gen, pnode, fnode, INFOS) const_info = ConstantInfo(file_name, QType.Pow2(bits=8, q=0, signed=True), contents=contents) diff --git a/tools/nntool/generation/new_generators/mult8/linear_mult8.py b/tools/nntool/generation/new_generators/mult8/linear_mult8.py index 1a829c160..9c6364661 100644 --- a/tools/nntool/generation/new_generators/mult8/linear_mult8.py +++ b/tools/nntool/generation/new_generators/mult8/linear_mult8.py @@ -60,13 +60,12 @@ def globals_generator(cls, gen, node, qrec, pnode, fnode) -> bool: else: return False - comment = f"BiasQ: {0}" + infos_comment infos['BIASN'] = np.int8(0) # BiasQ + conv_mul_bias = filt_q.cache.get('mul_biases_q') + infos['PRENORM'] = 
np.uint8(conv_mul_bias.pre_normalization if isinstance( + conv_mul_bias, MultMulBiasScaleQType) else 0) if filt_q.cache.get('ne16'): - conv_mul_bias = filt_q.cache.get('mul_biases_q') - infos['PRENORM'] = np.uint8(conv_mul_bias.pre_normalization if isinstance( - conv_mul_bias, MultMulBiasScaleQType) else 0) infos['NE16_PADVAL'] = np.atleast_1d( filt_q.in_qs[0].zero_point).astype(np.uint16) infos['NE16_WOFFSET'] = - \ @@ -76,7 +75,8 @@ def globals_generator(cls, gen, node, qrec, pnode, fnode) -> bool: infos_len = 'DIM' infos_encoder = SQ8ActInfos() - contents = infos_encoder.gen_infos_array(infos_len, **infos) + contents, new_comment = infos_encoder.gen_infos_array(infos_len, **infos) + comment = infos_comment + new_comment cname, file_name = gen_constant(gen, pnode, fnode, INFOS) const_info = ConstantInfo(file_name, QType.Pow2( diff --git a/tools/nntool/generation/new_generators/mult8/matadd_mult8.py b/tools/nntool/generation/new_generators/mult8/matadd_mult8.py index 325e5cd9d..507a69e59 100644 --- a/tools/nntool/generation/new_generators/mult8/matadd_mult8.py +++ b/tools/nntool/generation/new_generators/mult8/matadd_mult8.py @@ -61,7 +61,7 @@ def globals_generator(cls, gen, node, qrec, pnode, fnode) -> bool: add_node = pnode add_quant = qrec infos = {} - acomments = "" + acomments = "no activation - " infos.update({ 'IN1SCALE': add_quant.cache['scale_in_mul_biases_q'].qbiases, @@ -69,7 +69,6 @@ def globals_generator(cls, gen, node, qrec, pnode, fnode) -> bool: 'OUTSCALE': add_quant.cache['scale_mul_biases_q'].qbiases, 'OUTSCALEN': add_quant.cache['scale_mul_biases_q'].qnorms }) - comments = " ".join(f'{k}: {infos[k]}' for k in ['IN1SCALE', 'IN1SCALEN', 'OUTSCALE', 'OUTSCALEN']) + acomments if not add_quant.in_qs[0].signed: infos['ADD_BIAS'] = add_quant.cache['add_bias_offset'] infos_len = 'ASYM_ADD_DIM' @@ -78,7 +77,8 @@ def globals_generator(cls, gen, node, qrec, pnode, fnode) -> bool: infos_encoder = SQ8ActInfos() - contents = 
infos_encoder.gen_infos_array(infos_len, **infos) + contents, new_comment = infos_encoder.gen_infos_array(infos_len, **infos) + comments = acomments + new_comment cname, file_name = gen_constant(gen, pnode, pnode, INFOS) const_info = ConstantInfo(file_name, QType.Pow2( diff --git a/tools/nntool/generation/new_generators/mult8/matmul_mult8.py b/tools/nntool/generation/new_generators/mult8/matmul_mult8.py index d8dedd71b..cdb59f43b 100644 --- a/tools/nntool/generation/new_generators/mult8/matmul_mult8.py +++ b/tools/nntool/generation/new_generators/mult8/matmul_mult8.py @@ -23,6 +23,7 @@ from generation.at_types.tc_arg_info import GlobalArgInfo from generation.bindings import (CommentBindingList, GNodeArgEdge, GNodeArgNode, NodeBindingList) +from generation.gen_utils import ModelGenerationInternalError from generation.generators.globals.global_names import (INFOS, MULSCALE, MULSHIFT) from generation.generators.kernels.autotiler_kernel import NewAutoTilerKernel @@ -75,10 +76,10 @@ def globals_generator(cls, gen, node, qrec, pnode, fnode) -> bool: infos['OUTSCALE'] = mul_qrec.cache['mul_biases_q'].qbiases[0] infos['OUTSCALEN'] = mul_qrec.cache['mul_biases_q'].qnorms[0] + conv_mul_bias = mul_qrec.cache.get('mul_biases_q') + infos['PRENORM'] = np.uint8(conv_mul_bias.pre_normalization if isinstance(conv_mul_bias, MultMulBiasScaleQType) else 0) + if mul_qrec.cache.get('ne16'): - conv_mul_bias = mul_qrec.cache.get('mul_biases_q') - infos['PRENORM'] = np.uint8(conv_mul_bias.pre_normalization if isinstance( - conv_mul_bias, MultMulBiasScaleQType) else 0) infos['NE16_PADVAL'] = np.atleast_1d( mul_qrec.in_qs[0].zero_point).astype(mul_qrec.in_qs[0].dtype) infos['NE16_WOFFSET'] = - \ @@ -88,7 +89,8 @@ def globals_generator(cls, gen, node, qrec, pnode, fnode) -> bool: infos_len = 'DIM' infos_encoder = SQ8ActInfos() - contents = infos_encoder.gen_infos_array(infos_len, **infos) + contents, new_comment = infos_encoder.gen_infos_array(infos_len, **infos) + comment += new_comment 
cname, file_name = gen_constant(gen, pnode, mul_node, INFOS) const_info = ConstantInfo(file_name, QType.Pow2(bits=8, q=0, signed=True), contents=contents) @@ -184,12 +186,41 @@ def set_matmul_bindings(gen, in_eparams, out_eparams, cname, node, node_q, out_q GNodeArgEdge(out_eparams[0], "GNA_OUT"), GNodeArgNode(node, INFOS))) +def calculate_dimensions(node_name, matmul_params): + in1_shape = tuple(matmul_params.in_dims[0].shape) + in2_shape = tuple(matmul_params.in_dims[1].shape) + + rank1 = len(in1_shape) + rank2 = len(in2_shape) + + if rank2 <= 2: + if rank2 == 1: + # TODO - is this correct if transposed? + in2_shape = (in2_shape[0], 1) + channels = 1 + if rank1 > 2 and not all(dim == 1 for dim in in1_shape[:-2]): + in1_shape = (int(np.prod(in1_shape[:-1])), in1_shape[-1]) + elif rank1 == 1: + in1_shape = (1, in1_shape[0]) + elif rank1 == rank2 and in1_shape[:-2] == in2_shape[:-2]: + channels = np.prod(in1_shape[:-2]) + LOG.warning(f'Matmul over batches is not yet properly generated - output will not be correct') + else: + raise ModelGenerationInternalError( + f'{node_name} Invalid dimensions for matmul kernel {in1_shape} {in2_shape}') + + height_1 = in1_shape[-2] + width_1 = in1_shape[-1] + height_2 = in2_shape[-2] + width_2 = in2_shape[-1] + return height_1,width_1, height_2, width_2, channels + class MatMulKernel(NewAutoTilerKernel): CALL_TEMPLATE = '''// generator for {node_name} -CNN_MatMulAct_SQ8("{cname}", {gen_ctrl}, {bias_datasize}, 1, - {width_1}, {height_1}, {width_2}, {height_2}, - 0, 0, 1, 1, {matmul_op}, {act_op}); +CNN_BatchedMatMulAct_SQ8("{cname}", {gen_ctrl}, {bias_datasize}, 1, + {batch_size}, {width_1}, {height_1}, {width_2}, {height_2}, + 0, 0, 1, 1, {matmul_op}, {act_op}); ''' def __init__(self, node_name, cname, matmul_params, matmul_qrec, act_params, gen_ctrl=None, force_relu=True): @@ -205,10 +236,7 @@ def __init__(self, node_name, cname, matmul_params, matmul_qrec, act_params, gen else: act_op = 'KOP_NONE' - height_1 = 
matmul_params.in_dims[0][0] - width_1 = matmul_params.in_dims[0][1] - height_2 = matmul_params.in_dims[1][0] - width_2 = matmul_params.in_dims[1][1] + height_1, width_1, height_2, width_2, batch_size = calculate_dimensions(node_name, matmul_params) if len(matmul_params.in_dims) == 3: bias_datasize = at_bits(matmul_qrec.in_qs[2]) @@ -222,8 +250,7 @@ def __init__(self, node_name, cname, matmul_params, matmul_qrec, act_params, gen if isinstance(matmul_params, MatMulTransposedParameters): matmul_op += '_TRANSPOSED' - height_2 = matmul_params.in_dims[1][1] - width_2 = matmul_params.in_dims[1][0] + height_2, width_2 = width_2, height_2 # attributes affecting generation attrs = { @@ -231,6 +258,7 @@ def __init__(self, node_name, cname, matmul_params, matmul_qrec, act_params, gen 'width_1': width_1, 'height_2': height_2, 'width_2': width_2, + 'batch_size': batch_size, 'bias_datasize': bias_datasize, 'matmul_op': matmul_op, 'act_op': act_op @@ -243,6 +271,7 @@ def __init__(self, node_name, cname, matmul_params, matmul_qrec, act_params, gen } super().__init__(attrs, extra_attrs, gen_ctrl=gen_ctrl) + class MatMulKernelNE16(NewAutoTilerKernel): CALL_TEMPLATE = '''// generator for {node_name} CNN_MatMulAct_NE16("{cname}", {gen_ctrl}, {in1_datasize}, {out_datasize}, {bias_datasize}, {in2_datasize_bits}, @@ -263,10 +292,7 @@ def __init__(self, node_name, cname, matmul_params, matmul_qrec, act_params, gen else: act_op = 'KOP_NONE' - height_1 = matmul_params.in_dims[0][0] - width_1 = matmul_params.in_dims[0][1] - height_2 = matmul_params.in_dims[1][1] - width_2 = matmul_params.in_dims[1][0] + height_1, width_1, width_2, height_2, channels = calculate_dimensions(node_name, matmul_params) bias_datasize = at_bits(matmul_qrec.in_qs[2]) in1_datasize = at_bits(matmul_qrec.in_qs[0]) in2_datasize_bits = matmul_qrec.in_qs[1].bits diff --git a/tools/nntool/generation/new_generators/mult8/padded_matadd_mult8.py b/tools/nntool/generation/new_generators/mult8/padded_matadd_mult8.py index 
8a17f468a..c6de99bd2 100644 --- a/tools/nntool/generation/new_generators/mult8/padded_matadd_mult8.py +++ b/tools/nntool/generation/new_generators/mult8/padded_matadd_mult8.py @@ -14,6 +14,9 @@ # along with this program. If not, see . import logging +from quantization.multiplicative.scaling_qtypes import MultMulBiasScaleQType + +import numpy as np from generation.at_types.at_params import NO_ACTIVATION, gen_activation_op from generation.at_types.constant_info import ConstantInfo @@ -42,9 +45,6 @@ class PaddedMatAddSQ8Generator(GeneratorBase): @classmethod def globals_generator(cls, gen, node, qrec, pnode, fnode) -> bool: - cnodes = node.contained_nodes() - quants = [gen.G.quantization[NodeId(node, fnode)] for fnode in cnodes] - cnodes = pnode.contained_nodes() quants = [gen.G.quantization[NodeId(pnode, cnode)] for cnode in cnodes] @@ -63,10 +63,8 @@ def globals_generator(cls, gen, node, qrec, pnode, fnode) -> bool: 'OUTSCALE': quants[1].cache['scale_mul_biases_q'].qbiases, 'OUTSCALEN': quants[1].cache['scale_mul_biases_q'].qnorms }) - comments = " ".join(f'{k}: {infos1[k]}' for k in ['IN1SCALE', 'IN1SCALEN', 'OUTSCALE', 'OUTSCALEN']) + f" {acomments}" - infos_encoder = SQ8ActInfos() - contents = infos_encoder.gen_infos_array("DIM", **infos1) + contents, comments = infos_encoder.gen_infos_array('DIM', **infos1) cname, file_name = gen_constant(gen, pnode, pnode, INFOS) const_info = ConstantInfo(file_name, QType.Pow2( @@ -77,15 +75,17 @@ def globals_generator(cls, gen, node, qrec, pnode, fnode) -> bool: const_info=const_info, comment=comments)) - - infos.update({ - 'IN1SCALE': quants[1].cache['scale_mul_biases_q'].qbiases, - 'IN1SCALEN': quants[1].cache['scale_mul_biases_q'].qnorms - }) - comments = " ".join(f'{k}: {infos[k]}' for k in ['IN1SCALE', 'IN1SCALEN']) + f" {acomments}" - + # Padded part needs to apply out scale of the matadd + act scale + double_scale = MultMulBiasScaleQType( + dtype=np.uint8, + scale=quants[1].cache['scale_mul_biases_q'].scale * 
quants[2].cache['scale_mul_biases_q'].scale \ + if len(cnodes) == 3 else \ + quants[1].cache['scale_mul_biases_q'].scale + ) + infos['actscale'] = double_scale.qbiases + infos['actscalen'] = double_scale.qnorms infos_encoder = SQ8ActInfos() - contents = infos_encoder.gen_infos_array("DIM", **infos) + contents, comments = infos_encoder.gen_infos_array('DIM', **infos) cname, file_name = gen_constant(gen, pnode, cnodes[0], INFOS, extra_name='Pad') const_info = ConstantInfo(file_name, QType.Pow2( diff --git a/tools/nntool/generation/new_generators/mult8/pool_mult8.py b/tools/nntool/generation/new_generators/mult8/pool_mult8.py index 717aa5729..2294ea81c 100644 --- a/tools/nntool/generation/new_generators/mult8/pool_mult8.py +++ b/tools/nntool/generation/new_generators/mult8/pool_mult8.py @@ -82,7 +82,8 @@ def globals_generator(cls, gen, node, qrec, pnode, fnode) -> bool: infos['GLOBAL_SUM_SCALEN'] = pool_q.cache['scale_mul_biases_q'].qnorms infos_encoder = SQ8ActInfos() - contents = infos_encoder.gen_infos_array('DIM', **infos) + contents, new_comment = infos_encoder.gen_infos_array('DIM', **infos) + comment += new_comment cname, file_name = gen_constant(gen, pnode, pnode, INFOS) const_info = ConstantInfo(file_name, QType.Pow2( @@ -103,7 +104,7 @@ def bindings_generator(cls, gen, node, qrec, in_eparams, out_eparams, cname) -> gen, in_eparams, out_eparams, cname, node, qrec) return True return False - elif isinstance(node, (GlobalPoolingParameters, PoolingParameters)): + elif isinstance(node, (GlobalPoolingParameters, PoolingParameters, ActivationParameters)): cls.set_in_out_infos_bindings( gen, in_eparams, out_eparams, cname, node, qrec) else: diff --git a/tools/nntool/generation/new_generators/mult8/softmax_mult.py b/tools/nntool/generation/new_generators/mult8/softmax_mult.py index cb9d34ac3..98cf23325 100644 --- a/tools/nntool/generation/new_generators/mult8/softmax_mult.py +++ b/tools/nntool/generation/new_generators/mult8/softmax_mult.py @@ -45,10 +45,11 @@ def 
globals_generator(cls, gen, node, qrec, pnode, fnode) -> bool: infos = { 'BIASL_SM': np.uint8(15 + np.ceil(np.log2(qrec.in_qs[0].scale))) } - comment = f"in: {qrec.in_qs[0].scale[0]:.5f} out: {qrec.out_qs[0].scale[0]:.5f} NORM: {infos['BIASL_SM']}" + comment = f"in: {qrec.in_qs[0].scale[0]:.5f} out: {qrec.out_qs[0].scale[0]:.5f} " infos_encoder = SQ8ActInfos() - contents = infos_encoder.gen_infos_array('DIM', **infos) + contents, new_comment = infos_encoder.gen_infos_array('DIM', **infos) + comment += new_comment cname, file_name = gen_constant(gen, pnode, pnode, INFOS) const_info = ConstantInfo(file_name, QType.Pow2(bits=8, q=0, signed=True), contents=contents) @@ -95,7 +96,7 @@ def __init__(self, node_name, cname, params, qrec, gen_ctrl=None): # attributes affecting generation attrs = { 'size': in_dim.size(), - 'width': in_dim.size()/in_dim.shape[axis], + 'width': in_dim.size()//in_dim.shape[axis], 'height': in_dim.shape[axis], 'softmax_op': softmax_op } diff --git a/tools/nntool/graph/manipulations/__init__.py b/tools/nntool/graph/manipulations/__init__.py index f7ce2002b..e69de29bb 100644 --- a/tools/nntool/graph/manipulations/__init__.py +++ b/tools/nntool/graph/manipulations/__init__.py @@ -1,19 +0,0 @@ -# Copyright (C) 2020 GreenWaves Technologies, SAS - -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU Affero General Public License as -# published by the Free Software Foundation, either version 3 of the -# License, or (at your option) any later version. - -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU Affero General Public License for more details. - -# You should have received a copy of the GNU Affero General Public License -# along with this program. If not, see . 
- -from .dimensions import add_dimensions -from .adjust_order import adjust_order -from .liveness import calculate_liveness -from .balance_filter import balance_filter, balance_all_filters diff --git a/tools/nntool/graph/manipulations/adjust_base.py b/tools/nntool/graph/manipulations/adjust_base.py index 98ff7f199..6881a1444 100644 --- a/tools/nntool/graph/manipulations/adjust_base.py +++ b/tools/nntool/graph/manipulations/adjust_base.py @@ -66,7 +66,7 @@ def apply_input_trans(self, G, node, trans: list, index=None): if node.in_dims_hint: node.in_dims_hint[idx] = apply_transpose(node.in_dims_hint[idx], trans) nid = NodeId(node) - if G.quantization: + if G.quantization and nid in G.quantization: G.quantization.copy_qrec(node, 'in', idx, params) def apply_output_trans(self, G, node, trans: list, index=None): @@ -86,7 +86,8 @@ def apply_output_trans(self, G, node, trans: list, index=None): ) if node.out_dims_hint: node.out_dims_hint[idx] = apply_transpose(node.out_dims_hint[idx], self.invert(trans)) - if G.quantization: + nid = NodeId(node) + if G.quantization and nid in G.quantization: G.quantization.copy_qrec(node, 'out', idx, params) @staticmethod diff --git a/tools/nntool/graph/manipulations/dimensions.py b/tools/nntool/graph/manipulations/dimensions.py index f883d4d02..a6c79b97d 100644 --- a/tools/nntool/graph/manipulations/dimensions.py +++ b/tools/nntool/graph/manipulations/dimensions.py @@ -13,12 +13,13 @@ # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see . 
-from graph.verify import verify_graph import logging from typing import Sequence from generation.naming_convension import (DefaultNamingConvension, NamingConvension) +from utils.graph import GraphView +# from graph.verify import verify_graph from ..dim import Dim, MissMatchedInputsError, MoreThanOneInputError from ..types import (ConcatParameters, ConstantInputParameters, EdgeParameters, @@ -30,9 +31,10 @@ def set_out_edges_multi(G, node: Parameters, dims: Sequence[Dim], step_idx: int, - naming_convension: NamingConvension, edge_type: str = "in_out"): + naming_convension: NamingConvension, update_graph, edge_type: str = "in_out"): # clone the dims first so that the edge dims are the same objects as the node output dims - dims = node.set_output_size(dims) + if update_graph: + dims = node.set_output_size(dims) out_edges = G.indexed_out_edges(node) is_multi_out = len(out_edges) > 1 for edge_idx, edge_group in enumerate(out_edges): @@ -49,17 +51,20 @@ def set_out_edges_multi(G, node: Parameters, dims: Sequence[Dim], step_idx: int, def set_out_edges_one(G, node: Parameters, dim: Dim, step_idx: int, - naming_convension: NamingConvension, edge_type: str = "in_out"): + naming_convension: NamingConvension, update_graph, edge_type: str = "in_out"): ename = naming_convension.get_edge_name(node, step_idx, edge_type) eparams = EdgeParameters(ename, dim, node, 0, step_idx, edge_type) for edge in G.out_edges(node.name): assert edge.from_idx == 0, "Only for use with nodes that have one output" edge.params = eparams LOG.debug("%s %s", node.name, ename) - eparams.dims = node.set_output_size([dim])[0] + if update_graph: + eparams.dims = node.set_output_size([dim])[0] + else: + eparams.dims = node.out_dims[0] -def validate_one_in_edge(G, node: Parameters, expect_named: bool = True): +def validate_one_in_edge(G, node: Parameters, update_graph, expect_named: bool = True): edges = G.in_edges(node.name) if len(edges) != 1: if len(edges) > 1: @@ -70,11 +75,12 @@ def 
validate_one_in_edge(G, node: Parameters, expect_named: bool = True): assert eparams is not None, "edge parameters not yet set" assert not expect_named or eparams.dims.has_keys( ['c', 'h', 'w']), "dimensions not yet set" - eparams.dims = node.set_input_size([eparams.dims])[0] + if update_graph: + eparams.dims = node.set_input_size([eparams.dims])[0] return eparams -def validate_multi_in_edge(G, node: Parameters, expect_named: bool = True): +def validate_multi_in_edge(G, node: Parameters, update_graph, expect_named: bool = True): dims = [] for edge in G.indexed_in_edges(node.name): if edge is None: @@ -85,64 +91,77 @@ def validate_multi_in_edge(G, node: Parameters, expect_named: bool = True): assert not expect_named or eparams.dims.has_keys( ['c', 'h', 'w']), "dimensions not yet set" dims.append(eparams.dims) - try: - dims = node.set_input_size(dims) - except MissMatchedInputsError as exc: - raise ValueError(f'missmatched inputs on node {node.name}') from exc + if update_graph: + try: + dims = node.set_input_size(dims) + except MissMatchedInputsError as exc: + raise ValueError(f'missmatched inputs on node {node.name}') from exc return dims def add_dimensions_concat(G, node: Parameters, step_idx: int, - naming_convension: NamingConvension, indexes): + naming_convension: NamingConvension, + indexes, update_graph): del indexes - in_dims = validate_multi_in_edge(G, node, expect_named=False) - out_dims = node.get_output_size(in_dims) - set_out_edges_one(G, node, out_dims[0], step_idx, naming_convension) + in_dims = validate_multi_in_edge(G, node, update_graph, expect_named=False) + if update_graph: + out_dims = node.get_output_size(in_dims) + else: + out_dims = node.out_dims + set_out_edges_one(G, node, out_dims[0], step_idx, naming_convension, update_graph ) def add_dimensions_constant(G, node: Parameters, step_idx: int, - naming_convension: NamingConvension, indexes): + naming_convension: NamingConvension, indexes, update_graph): node.index = indexes['constant'] 
indexes['constant'] += 1 constant_dims = node.get_output_size(None) set_out_edges_one(G, node, constant_dims[0], step_idx, - naming_convension, edge_type="in") + naming_convension, update_graph, edge_type="in") def add_dimensions_input(G, node: Parameters, step_idx: int, - naming_convension: NamingConvension, indexes): + naming_convension: NamingConvension, indexes, update_graph): node.index = indexes['input'] indexes['input'] += 1 input_dims = node.get_output_size(None) node.set_input_size(input_dims) set_out_edges_one(G, node, input_dims[0], step_idx, - naming_convension, edge_type="in") + naming_convension, update_graph , edge_type="in") def add_dimensions_output(G, node: Parameters, step_idx: int, - naming_convension: NamingConvension, indexes): + naming_convension: NamingConvension, indexes, update_graph): node.index = indexes['output'] indexes['output'] += 1 - eparams = validate_one_in_edge(G, node, expect_named=False) + eparams = validate_one_in_edge(G, node, update_graph, expect_named=False) eparams.edge_type = "out" eparams.name = naming_convension.get_edge_name(node, step_idx, "out") # set the dimensions of the output node - node.set_output_size(node.get_output_size([eparams.dims])) + if update_graph: + node.set_output_size(node.get_output_size([eparams.dims])) def add_dimensions_unknown_single(G, node: Parameters, step_idx: int, - naming_convension: NamingConvension, indexes): + naming_convension: NamingConvension, indexes, update_graph): del indexes - eparams = validate_one_in_edge(G, node, expect_named=False) - out_dims = node.get_output_size([eparams.in_dims]) - set_out_edges_one(G, node, out_dims[0], step_idx, naming_convension) + eparams = validate_one_in_edge(G, node, update_graph, expect_named=False) + if update_graph: + out_dims = node.get_output_size([eparams.in_dims]) + else: + out_dims = node.out_dims + set_out_edges_one(G, node, out_dims[0], step_idx, naming_convension, update_graph) def add_dimensions_unknown(G, node: Parameters, step_idx: 
int, - naming_convension: NamingConvension): - in_dims = validate_multi_in_edge(G, node, expect_named=False) - set_out_edges_multi(G, node, node.get_output_size(in_dims), - step_idx, naming_convension) + naming_convension: NamingConvension, update_graph): + in_dims = validate_multi_in_edge(G, node, update_graph, expect_named=False) + if update_graph: + out_dims = node.get_output_size(in_dims) + else: + out_dims = node.out_dims + set_out_edges_multi(G, node, out_dims, + step_idx, naming_convension, update_graph) OP_ROUTINES = { @@ -154,7 +173,7 @@ def add_dimensions_unknown(G, node: Parameters, step_idx: int, } -def add_dimensions(G, naming_convension: NamingConvension = None) -> list: +def add_dimensions(G: GraphView, naming_convension: NamingConvension = None, update_graph=True) -> list: """ Walks graph setting all edge names and dimensions """ if naming_convension is None: @@ -171,15 +190,21 @@ def add_dimensions(G, naming_convension: NamingConvension = None) -> list: # else "b" + (str(node.step_idx) if node.step_idx else node.name))) LOG.debug("inputs: %s", [node.name for node in inputs]) - for node in G.dfs(inputs): + def add_step(step, idx): + if len(steps) <= idx: + steps.extend([None] * (idx + 1 - len(steps))) + steps[idx] = step + + for node in G.topological_sort(inputs): LOG.debug("add dimensions to: %s", node.name) - node.step_idx = len(steps) - steps.append({'node': node}) + if update_graph: + node.step_idx = len(steps) + add_step({'node': node}, node.step_idx) if node.__class__ in OP_ROUTINES: OP_ROUTINES[node.__class__]( - G, node, node.step_idx, naming_convension, indexes) + G, node, node.step_idx, naming_convension, indexes, update_graph) else: - add_dimensions_unknown(G, node, node.step_idx, naming_convension) + add_dimensions_unknown(G, node, node.step_idx, naming_convension, update_graph) set_aliases(G) # verify_graph(G, throw_exception=True) return steps diff --git a/tools/nntool/graph/manipulations/eliminate_transposes/eliminate_transposes.py 
b/tools/nntool/graph/manipulations/eliminate_transposes/eliminate_transposes.py index e0276fd36..e1f13ee60 100644 --- a/tools/nntool/graph/manipulations/eliminate_transposes/eliminate_transposes.py +++ b/tools/nntool/graph/manipulations/eliminate_transposes/eliminate_transposes.py @@ -1,4 +1,4 @@ -# Copyright (C) 2021 GreenWaves Technologies, SAS +# Copyright (C) 2021, 2022 GreenWaves Technologies, SAS # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU Affero General Public License as @@ -13,29 +13,26 @@ # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see . -from functools import reduce import logging from collections.abc import MutableSet from copy import deepcopy from typing import Iterator, Sequence -from graph.dim import Dim -from graph.types import (BinaryOpParameters, ConcatParameters, - ConstantInputParameters, FcParameters, - InputParameters, LinearFusionParameters, - OutputParameters, PadParameters, ReshapeParameters, - ReverseParameters, StridedSliceParameters, - TransposeParameters, ActivationParameters) -from graph.types.base import NNEdge, SensitiveToOrder -from graph.types.others import CopyParameters, UnaryOpParameters -from graph.types.tensor_arithmetic import Broadcastable -from utils.compatible_transposes import (find_all_compatible_transposes, - find_combination) +from graph.types import (ActivationParameters, BinaryOpParameters, + Broadcastable, ConcatParameters, + ConstantInputParameters, CopyParameters, FcParameters, + GlobalPoolingParameters, InputParameters, + LinearFusionParameters, NNEdge, OutputParameters, + PadParameters, PowOpParameters, ReshapeParameters, + ReverseParameters, SensitiveToOrder, + StridedSliceParameters, TransposeParameters, + UnaryOpParameters) +from utils.compatible_transposes import reverse_reshape from utils.graph import Node -from utils.graph_utils.copy_expressions import do_transpose from utils.node_id 
import NodeId -from .eliminate_transposes_actions import (Action, DeleteReshapeAction, +from .eliminate_transposes_actions import (Action, CantContinueError, + DeleteReshapeAction, DeleteTransposeAction, EndActionDown, EndActionUp, InsertReshapeAction, @@ -46,11 +43,11 @@ SetReshapeAction, SetTransposeAction, SwitchBatchLinearAction, - TransposePad, TransposeReverse, + TransposePad, + TransposeReverse, TransposeSlidedSlice) -from .transpose_helpers import (apply_transpose, get_reshape_transpose, - identity_transpose, reshape_is_transpose, - reverse_transpose, reverses_transpose, +from .transpose_helpers import (apply_transpose, identity_transpose, + reverse_transpose, reverses_transpose_up, transpose_does_nothing) LOG = logging.getLogger("nntool." + __name__) @@ -67,7 +64,7 @@ def debug(msg): TRANSIENT_ACTIONS = { PadParameters: TransposePad, ReverseParameters: TransposeReverse, - StridedSliceParameters: TransposeSlidedSlice + StridedSliceParameters: TransposeSlidedSlice, } NODES_TO_EXPLORE_UP = { @@ -77,10 +74,6 @@ def debug(msg): } -class CantContinueError(Exception): - pass - - class TransposeHistory(): def __init__(self, node, from_shape=None, transpose=None, to_shape=None) -> None: self.node = node @@ -140,13 +133,13 @@ def visited_down(self, node, idx=None) -> bool: def visit_up(self, node, idx): val = self._nodes.setdefault(node, set()) - val.add('up{idx}') + val.add(f'up{idx}') def visited_up(self, node, idx=None) -> set: visited = self._nodes.get(node, set()) if idx is None: return any(k.startswith('up') for k in visited) - return 'up{idx}' in visited or 'up*' in visited + return f'up{idx}' in visited or 'up*' in visited def visited_direction(self, direction, idx, node) -> bool: return f'{direction}{idx}' in self._nodes.get(node, set()) @@ -180,55 +173,6 @@ def __repr__(self) -> str: return "{" + ",".join(f"{repr(node)}: {visited}" for node, visited in self._nodes.items()) + "}" -def is_broadcasted(from_shape, to_shape): - from_len = len(from_shape) - 
to_len = len(to_shape) - if from_len >= to_len: - return False - return tuple(([1] * (to_len - from_len)) + list(from_shape)) == tuple(to_shape) - - -def expand_to_len(trans, length): - extra = length-len(trans) - return tuple(list(range(extra)) + [dim + extra for dim in trans]) - - -def reverse_reshape(trans, from_shape, to_shape): - """reverses the effect of this reshape on the transpose""" - # if the from_shape -> to_shape is actually a broadcast reshape - # i.e. 4, 10, 1 -> 1, 4, 10, 1 we absolutely need to keep the order 4, 10, 1 in - # the transpose however the 2 1s in the result are ambiguous so handle this as a - # (simple) special case. Just expand the transpose with no transpose at the start - # and expand_len + original transpose dim at the end - if len(from_shape) == 0 or len(to_shape) == 0: - return None - if is_broadcasted(from_shape, to_shape): - return expand_to_len(trans, len(to_shape)) - - return next(iter([t for t in find_all_compatible_transposes(find_combination(from_shape, to_shape), trans) - if len(t) == len(to_shape)]), None) - - -def none_or_idx(trans, idx): - return None if trans[idx] is None else idx - - -def reverse_broadcast(old_shape, new_shape, transpose): - old_shape_idx = new_shape_idx = 0 - res_pos = {} - while old_shape_idx < len(old_shape) or new_shape_idx < len(new_shape): - if old_shape_idx < len(old_shape) and old_shape[old_shape_idx] == new_shape[new_shape_idx]: - res_pos[old_shape_idx] = new_shape_idx - old_shape_idx += 1 - new_shape_idx += 1 - elif new_shape_idx < len(new_shape) and new_shape[new_shape_idx] == 1: - new_shape_idx += 1 - else: - raise ValueError( - f'reverse broadcast not possible between {old_shape} and {new_shape}') - return tuple([res_pos[idx] for idx in transpose] + [idx for idx, _ in enumerate(new_shape) if idx not in res_pos.values()]) - - def requires_reshape(trans1, trans2, dim): """Checks if layout shape doesn't change but a reshape is necessary due to 1 position""" if (tuple(dim.shape) != 
tuple(dim.layout_shape) and @@ -240,56 +184,38 @@ def requires_reshape(trans1, trans2, dim): return False -def strip_nones(trans): - return [i for i in trans if i is not None] +def check_for_null_transpose(node, transpose): + if transpose is None: + raise CantContinueError(f"can't continue at {node.name}") # @IgnoreException -def broadcast_reduce(out_shape, in_shape, transpose): - """Looking at a broadcasted input that has a lower rank than out_shape find - the equivalent transpose to the transpose on the broadcasted shape before - the broadcast +def check_continue(visited_nodes: VisitedNodes, cur_visited_nodes: VisitedNodes, exclude_nodes, node, direction, idx): + """Checks to see if we should skip visiting node on edge Args: - out_shape (Sequence): The full shape of the output of the broadcasted operation - in_shape (Sequence): The shape of the unbroadcasted input - transpose (Sequence): The transpose on the output + visited_nodes (VisitedNodes): All nodes visited in previous eliminations + cur_visited_nodes (VisitedNodes): Nodes visited on this branch + exclude_nodes (Sequence[Parameters]): Don't visit these nodes + node (Parameters): Node on edge + direction (str): direction of visit 'down' or 'up' + idx (int): edge index + + Raises: + CantContinueError: Fail this transpose test Returns: - Tuple: The in shape, the broadcasted in shape, the equivalent transpose + bool: True if skip False if visit """ - diff_shape = len(out_shape) - len(in_shape) - # broadcast shape with nones - exp_in_shape = ([None] * diff_shape) + list(range(len(in_shape))) - # apply the reverse of the transpose. now we have the broadcasted shape before the transpose - transpose_exp_in_shape = apply_transpose( - exp_in_shape, reverse_transpose(transpose)) - # strip the nones and reverse the result. 
This gives the transpose of the unbroadcasted shape - new_transpose = reverse_transpose(strip_nones(transpose_exp_in_shape)) - new_shape = ([1] * diff_shape) + in_shape - return in_shape, new_shape, new_transpose - - -def broadcast_expand(out_shape, in_shape, transpose): - diff_shape = len(out_shape) - len(in_shape) - exp_in_shape = ([None] * diff_shape) + list(range(len(in_shape))) - transpose_exp_in_shape = apply_transpose(exp_in_shape, transpose) - new_transpose = list(range(diff_shape)) + \ - [dim + diff_shape for dim in transpose] - new_shape = ([1] * diff_shape) + in_shape - return in_shape, new_shape, new_transpose - - -def check_for_null_transpose(node, transpose): - if transpose is None: - raise CantContinueError(f"can't continue at {node.name}") # @IgnoreException - - -def check_continue(visited_nodes: VisitedNodes, cur_visited_nodes: VisitedNodes, exclude_nodes, node, direction, idx): all_visited = visited_nodes | cur_visited_nodes - if direction == 'up' and all_visited.visited_down(node): - return True - if direction == 'down' and all_visited.visited_up(node): - raise CantContinueError() # @IgnoreException + # if the node is sensitive to order then even if we have already visited it down + # we must visit it up and vice versa so that we maybe insert a reshape/transpose after it + if not isinstance(node, SensitiveToOrder): + if direction == 'up' and all_visited.visited_down(node): + # trying to visit node that was already visited in the other direction. + return True + if direction == 'down' and all_visited.visited_up(node): + # trying to visit node that was already visited in the other direction. 
+ return True if all_visited.visited_direction(direction, idx, node): raise CantContinueError() # @IgnoreException if node in exclude_nodes: @@ -297,16 +223,11 @@ def check_continue(visited_nodes: VisitedNodes, cur_visited_nodes: VisitedNodes, return False -def strip_leading_ones(shape, in_len): - res = [] - seen_dim = False - for dim in shape: - if seen_dim: - res.append(dim) - elif dim != 1: - res.append(dim) - seen_dim = True - return res +def strip_leading_dim(shape, dim=1): + res = list(shape.copy()) + while len(res) > 1 and res[0] == dim: + res.pop(0) + return tuple(res) def compute_max_shape(dims): @@ -343,9 +264,9 @@ def search_down(G, node, exclude_nodes, visited_nodes: VisitedNodes, in_edge, node : The node to look at visited_nodes : Nodes already traversed in_edge : The edge we are arriving on at this node - transpose_history : A history of the reshapes passed that did not allow us to determine the transpose - transpose : The current transpose being propagated. Can be None to indicate that we cannot translate - the transpose via that reshape + transpose_history : A history of the reshapes passed that did not allow us + to determine the transpose. Transposes + are in the downwards direction. 
Returns: A tuple of a list of actions and a list of nodes traversed @@ -385,7 +306,7 @@ def search_down(G, node, exclude_nodes, visited_nodes: VisitedNodes, in_edge, # if arriving on a broadcasted input the transpose needs to be expanded # since the transpose is only acting on the broadcasted dimensions no reshape is necessary - if isinstance(node, Broadcastable) and len(in_shape) != node.out_dims[0].rank: + if isinstance(node, (Broadcastable, PowOpParameters)) and len(in_shape) != node.out_dims[0].rank: check_for_null_transpose(node, transpose) # This could be an expression so need to broadcaset the output max_shape = compute_max_shape(node.out_dims) @@ -414,15 +335,15 @@ def search_down(G, node, exclude_nodes, visited_nodes: VisitedNodes, in_edge, if len(edge_in_shape) != len(max_shape): # strip the broadcasted axis from the transpose b_axes = broadcasted_axes(edge_in_shape, max_shape) - transpose_without_broadcast = strip_axes_from_transpose( - reverse_transpose(transpose), b_axes) - # from shape will be the old shape with the unbroadcasted transpose + # Transpose moving down through the broadcast - strip the broadcast off it + transpose_without_broadcast = strip_axes_from_transpose(transpose, b_axes) + # from shape will be the old shape with the reversed unbroadcasted transpose - i.e. 
going up from_shape = apply_transpose( - edge_in_shape, transpose_without_broadcast) - # to shape is the broadcasted input shape with the transpose with the leading ones removed + edge_in_shape, reverse_transpose(transpose_without_broadcast)) + # to shape is the broadcasted input shape with the reverse transpose with the leading ones removed broadcasted_shape = ([1] * len(b_axes)) + list(edge_in_shape) - to_shape = strip_leading_ones(apply_transpose( - broadcasted_shape, reverse_transpose(transpose)), len(from_shape)) + to_shape = strip_leading_dim(apply_transpose( + broadcasted_shape, reverse_transpose(transpose))) # if they are not equal insert a reshape if from_shape != to_shape: info( @@ -456,27 +377,36 @@ def search_down(G, node, exclude_nodes, visited_nodes: VisitedNodes, in_edge, if filter_node.batch_size > 1: info( f"rejected {node.name} - multibatch linear layer - inserting transpose {transpose}") - return [InsertTransposeAction(node, direction='in', idx=in_edge.to_idx, transpose=transpose), EndActionDown(node)], cur_visited_nodes + return [ + InsertTransposeAction( + node, direction='in', idx=in_edge.to_idx, transpose=transpose), + EndActionDown(node)], cur_visited_nodes info( f"accepted {node.name} - linear layer reorder input - {transpose}") qrec = G.quantization and G.quantization[NodeId(node)] - return cur_actions + [ReorderLinearAction.in_from_history(node, transpose_history, qrec), EndActionDown(node)], cur_visited_nodes + return cur_actions + [ + ReorderLinearAction.in_from_history(node, transpose_history, qrec), + EndActionDown(node)], cur_visited_nodes if isinstance(node, TransposeParameters): - # TODO - Might be able to get rid of this and check history check_for_null_transpose(node, transpose) - if reverses_transpose(transpose, node.transpose, node.out_dims[0]): + reverses_transpose, old_shape = reverses_transpose_up(transpose, node.transpose, node.out_dims[0]) + if reverses_transpose: info( f"accepted {node.name} - transpose {node.transpose} 
reversed in by {transpose} on {node.in_dims[0]}") - reshape = requires_reshape( - transpose, node.transpose, node.in_dims[0]) - if reshape: + if old_shape: + reshape = (old_shape, node.out_dims[0].shape) info(f"requires reshape {reshape[0]} -> {reshape[1]}") + else: + reshape = None return [DeleteTransposeAction(node, reshape=reshape), EndActionDown(node)], cur_visited_nodes new_transpose = apply_transpose(transpose, node.transpose) info( - f"rejected {node.name} - transpose - does not reverse - absorbing {transpose} into {node.transpose} -> {new_transpose}") - return [SetTransposeAction(node, new_transpose), EndActionDown(node)], cur_visited_nodes + f"rejected {node.name} - transpose - does not reverse - absorbing {transpose} " + f"into {node.transpose} -> {new_transpose}") + return [ + SetTransposeAction(node, new_transpose), + EndActionDown(node)], cur_visited_nodes if isinstance(node, OutputParameters): # TODO - Might be able to get rid of this and check history @@ -484,7 +414,10 @@ def search_down(G, node, exclude_nodes, visited_nodes: VisitedNodes, in_edge, if node.fixed_order: info( f"rejected {node.name} - fixed order output - inserting transpose {transpose}") - return [InsertTransposeAction(node, direction='in', idx=in_edge.to_idx, transpose=transpose), EndActionDown(node)], cur_visited_nodes + return [ + InsertTransposeAction( + node, direction='in', idx=in_edge.to_idx, transpose=transpose), + EndActionDown(node)], cur_visited_nodes info( f"accepted {node.name} - output without fixed order - transpose output {transpose}") # No change here since the output dimensions will be computed by the shape inference @@ -493,23 +426,20 @@ def search_down(G, node, exclude_nodes, visited_nodes: VisitedNodes, in_edge, if isinstance(node, StridedSliceParameters) and node.slice_shape != node.out_shape: # strided slice that is also reshaping check_for_null_transpose(node, transpose) - new_transpose = reverse_transpose(reverse_reshape( - reverse_transpose(transpose), 
node.slice_shape, node.out_shape)) + new_transpose, from_shape, to_shape = reverse_reshape( + transpose, node.slice_shape, node.out_shape) if new_transpose is None: info( - f"rejected {node.name} - transpose out - does not reverse - inserting transpose {transpose}") + f"rejected {node.name} - cannot pass slice reshape - inserting transpose {transpose}") return [InsertTransposeAction(node, direction='in', idx=in_edge.to_idx, transpose=transpose), EndActionDown(node)], cur_visited_nodes cur_actions.append(TransposeSlidedSlice( - node, reverse_transpose(transpose), transpose_out=reverse_transpose(new_transpose), dir="down")) + node, transpose, out_shape=to_shape, dir="down")) if identity_transpose(new_transpose): return cur_actions + [EndActionDown(node)], cur_visited_nodes - from_shape = do_transpose(reverse_transpose( - transpose), node.slice_shape) if transpose is not None else None - transpose_history = transpose_history + \ [TransposeHistory(node, node.slice_shape, new_transpose, node.out_shape)] @@ -521,49 +451,21 @@ def search_down(G, node, exclude_nodes, visited_nodes: VisitedNodes, in_edge, elif isinstance(node, ReshapeParameters): # TODO - Might be able to get rid of this and check history check_for_null_transpose(node, transpose) - if reshape_is_transpose(node.old_shape.shape, node.shape.shape): - # if the reshape looks like a transpose then treat it as one. 
THe reshape rewriter sometimes gets - # the order wrong in this case - old_transpose = get_reshape_transpose( - node.old_shape.shape, node.shape.shape) - if reverses_transpose(transpose, old_transpose): - cur_actions += [ - DeleteReshapeAction( - node - ) - ] - return cur_actions + [ - DeleteReshapeAction( - node - ), - EndActionDown(node)], cur_visited_nodes - new_transpose = apply_transpose( - transpose, old_transpose) - info( - f"pass reshape that is transpose {node.name} down trans: old {transpose} new {new_transpose} shape: old {node.old_shape} new {node.shape}") - # insert an action to rewrite the reshape shapes - from_shape = apply_transpose( - node.old_shape.shape, reverse_transpose(transpose)) - to_shape = apply_transpose( - node.shape.shape, reverse_transpose(transpose)) - else: - # the transpose that we are actually applying is the reverse of the transpose that we are propagating down - # So we reverse the transpose before evaluating the reshape and then reverse the result - new_transpose = reverse_transpose(reverse_reshape( - reverse_transpose(transpose), node.old_shape, node.shape)) + new_transpose, from_shape, to_shape = reverse_reshape( + transpose, node.old_shape, node.shape) + info( + f"pass reshape {node.name} down trans: old {transpose} new {new_transpose} " + f"shape: old {node.old_shape} new {node.shape}") + + if new_transpose is None and len(node.shape) > 1: info( - f"pass reshape {node.name} down trans: old {transpose} new {new_transpose} shape: old {node.old_shape} new {node.shape}") + f"rejected {node.name} - cannot pass reshape - inserting transpose {transpose}") + return [ + InsertTransposeAction( + node, direction='in', idx=in_edge.to_idx, transpose=transpose), + EndActionDown(node)], cur_visited_nodes - if new_transpose is None and len(node.shape) > 1: - info( - f"rejected {node.name} - transpose out - does not reverse - inserting transpose {transpose}") - return [InsertTransposeAction(node, direction='in', idx=in_edge.to_idx, 
transpose=transpose), EndActionDown(node)], cur_visited_nodes - - # insert an action to rewrite the reshape shapes - from_shape = apply_transpose(node.old_shape.shape, - reverse_transpose(transpose)) if transpose is not None else None - to_shape = apply_transpose(node.shape.shape, reverse_transpose( - new_transpose)) if new_transpose is not None else None + # insert an action to rewrite the reshape shapes info(f"rewrite reshape to {from_shape}->{to_shape}") if from_shape is None or to_shape is None or from_shape != to_shape: cur_actions += [ @@ -589,16 +491,21 @@ def search_down(G, node, exclude_nodes, visited_nodes: VisitedNodes, in_edge, if new_transpose is None: try: - return continue_down(G, node, exclude_nodes, visited_nodes, cur_visited_nodes.copy(), cur_actions.copy(), transpose_history, new_transpose) + return continue_down(G, node, exclude_nodes, visited_nodes, cur_visited_nodes.copy(), + cur_actions.copy(), transpose_history, new_transpose) except CantContinueError as ex: if transpose is None: raise ex info( - f"rejected {node.name} - transpose out - does not reverse - inserting transpose {transpose}") - return [InsertTransposeAction(node, direction='in', idx=in_edge.to_idx, transpose=transpose), EndActionDown(node)], cur_visited_nodes + f"rejected {node.name} - cannot continue {ex} - inserting transpose {transpose}") + return [ + InsertTransposeAction( + node, direction='in', idx=in_edge.to_idx, transpose=transpose), + EndActionDown(node)], cur_visited_nodes transpose = new_transpose - return continue_down(G, node, exclude_nodes, visited_nodes, cur_visited_nodes, cur_actions, transpose_history, transpose) + return continue_down(G, node, exclude_nodes, visited_nodes, cur_visited_nodes, + cur_actions, transpose_history, transpose) def continue_down(G, node, exclude_nodes, visited_nodes, cur_visited_nodes, cur_actions, transpose_history, transpose): @@ -621,7 +528,8 @@ def search_up(G, node, exclude_nodes, visited_nodes, out_edge, transpose_history info( 
f'accepted {node.name} - single dimension transpose') return [EndActionUp(node)], cur_visited_nodes - if isinstance(node, SensitiveToOrder) and transpose_does_nothing(reverse_transpose(transpose), node.out_dims[out_edge.from_idx].shape): + if (isinstance(node, SensitiveToOrder) and + transpose_does_nothing(reverse_transpose(transpose), node.out_dims[out_edge.from_idx].shape)): new_shape = apply_transpose( node.out_dims[out_edge.from_idx].shape, reverse_transpose(transpose)) # could be that the transpose does nothing to the data layout but still changes the positions of @@ -643,7 +551,10 @@ def search_up(G, node, exclude_nodes, visited_nodes, out_edge, transpose_history check_for_null_transpose(node, transpose) info( f'rejected {node.name} - sensitive to order - inserting transpose {transpose}') - return [InsertTransposeAction(node, direction='out', idx=out_edge.from_idx, out_edge=out_edge, transpose=reverse_transpose(transpose)), EndActionUp(node)], cur_visited_nodes + return [ + InsertTransposeAction(node, direction='out', idx=out_edge.from_idx, + out_edge=out_edge, transpose=reverse_transpose(transpose)), + EndActionUp(node)], cur_visited_nodes cur_actions = [] @@ -664,7 +575,9 @@ def search_up(G, node, exclude_nodes, visited_nodes, out_edge, transpose_history exclude_nodes, visited_nodes | cur_visited_nodes, edge, - [TransposeHistory(node, node.out_dims[edge.from_idx], transpose, apply_transpose(node.out_dims[edge.from_idx], transpose))]) + [ + TransposeHistory(node, node.out_dims[edge.from_idx], transpose, + apply_transpose(node.out_dims[edge.from_idx], transpose))]) cur_visited_nodes |= visited_down_nodes cur_actions += new_actions @@ -680,15 +593,20 @@ def search_up(G, node, exclude_nodes, visited_nodes, out_edge, transpose_history f"accepted {node.name} - linear layer switch batch dimension") return cur_actions + [SwitchBatchLinearAction(node), EndActionUp(node)], cur_visited_nodes info(f"rejected {node.name} - batched linear") - return 
[InsertTransposeAction(node, direction='out', idx=out_edge.from_idx, out_edge=out_edge, transpose=reverse_transpose(transpose)), EndActionUp(node)], cur_visited_nodes + return [ + InsertTransposeAction(node, direction='out', idx=out_edge.from_idx, + out_edge=out_edge, transpose=reverse_transpose(transpose)), + EndActionUp(node)], cur_visited_nodes info(f"accepted {node.name} - linear layer reorder output") qrec = G.quantization and G.quantization[NodeId(node)] - return cur_actions + [ReorderLinearAction.out_from_history(node, transpose_history, qrec), EndActionUp(node)], cur_visited_nodes + return cur_actions + [ + ReorderLinearAction.out_from_history( + node, transpose_history, qrec), + EndActionUp(node)], cur_visited_nodes # Transpose may reverse the propagated transpose or be reordered if isinstance(node, TransposeParameters): check_for_null_transpose(node, transpose) - # TODO - in_dims or out_dims - 99% sure in_dims if tuple(node.transpose) == tuple(transpose): info( f"accepted {node.name} - transpose {node.transpose} equals {transpose} on {node.in_dims[0]}") @@ -696,11 +614,18 @@ def search_up(G, node, exclude_nodes, visited_nodes, out_edge, transpose_history node.transpose, transpose, node.out_dims[0]) if reshape: info(f"requires reshape {reshape[0]} -> {reshape[1]}") - return cur_actions + [DeleteTransposeAction(node, reshape=reshape), EndActionUp(node)], cur_visited_nodes - # TODO - This should merge with the existing Transpose - new_transpose = apply_transpose(node.transpose, transpose) + return cur_actions + [ + DeleteTransposeAction(node, reshape=reshape), EndActionUp(node)], cur_visited_nodes + + # absorb transpose in a -> tranpose T1 -> b -> existing trans node T2 -> c + # a -> TNew -> c + # Apply reversed T1 to T2 + + new_transpose = apply_transpose( + node.transpose, reverse_transpose(transpose)) info( - f"rejected {node.name} - transpose - does not reverse - absorbing {transpose} into {node.transpose} -> {new_transpose}") + f"rejected {node.name} - 
transpose - does not reverse - absorbing " + f"{transpose} into {node.transpose} -> {new_transpose}") return [SetTransposeAction(node, new_transpose), EndActionDown(node)], cur_visited_nodes # Input can be reordered if not frozen @@ -708,30 +633,48 @@ def search_up(G, node, exclude_nodes, visited_nodes, out_edge, transpose_history check_for_null_transpose(node, transpose) if node.fixed_order: info(f"rejected {node.name} - fixed order input") - return [InsertTransposeAction(node, direction='out', idx=out_edge.from_idx, out_edge=out_edge, transpose=transpose), EndActionUp(node)], cur_visited_nodes + return [ + InsertTransposeAction(node, direction='out', idx=out_edge.from_idx, + out_edge=out_edge, transpose=transpose), EndActionUp(node)], cur_visited_nodes info( f"accepted {node.name} - input without fixed order - transpose input {reverse_transpose(transpose)}") - return cur_actions + [ReorderInputDims.from_history(node, transpose_history, transpose=reverse_transpose(transpose)), EndActionUp(node)], cur_visited_nodes + return cur_actions + [ + ReorderInputDims.from_history( + node, transpose_history, transpose=reverse_transpose(transpose)), + EndActionUp(node)], cur_visited_nodes # Constant can be reordered if isinstance(node, ConstantInputParameters): check_for_null_transpose(node, transpose) info( f"accepted {node.name} - constant input - transpose constant {transpose}") - return cur_actions + [ReorderConstantInput.from_history(node, transpose_history, transpose=reverse_transpose(transpose)), EndActionUp(node)], cur_visited_nodes + return cur_actions + [ + ReorderConstantInput.from_history( + node, transpose_history, transpose=reverse_transpose(transpose)), + EndActionUp(node)], cur_visited_nodes # Conditions that can pass through the Transpose if isinstance(node, StridedSliceParameters) and node.changes_shape: - reversed_below = reverse_transpose(transpose) - reversed_above = reverse_broadcast( - node.out_shape, node.post_slice_shape, reversed_below) - 
new_transpose = reverse_transpose(reversed_above) + # special case for a strided slice that also has a reshape + check_for_null_transpose(node, transpose) + new_transpose, from_shape, to_shape = reverse_reshape( + transpose, node.slice_shape, node.out_shape, going_up=True) + if new_transpose is None: + info( + f"rejected {node.name} - cannot pass slice reshape - inserting transpose {transpose}") + return [InsertTransposeAction(node, direction='out', idx=0, transpose=reverse_transpose(transpose)), + EndActionDown(node)], cur_visited_nodes + + cur_actions.append(TransposeSlidedSlice( + node, reverse_transpose(transpose), out_shape=to_shape)) + + if identity_transpose(new_transpose): + return cur_actions + [EndActionUp(node)], cur_visited_nodes + transpose_history = transpose_history + \ [TransposeHistory(node, node.out_shape, - new_transpose, node.post_slice_shape)] - cur_actions.append( - TransposeSlidedSlice(node, reversed_above, "up", transpose)) + new_transpose, node.in_dims[0].shape)] transpose = new_transpose elif node.__class__ in TRANSIENT_ACTIONS: check_for_null_transpose(node, transpose) @@ -741,26 +684,28 @@ def search_up(G, node, exclude_nodes, visited_nodes, out_edge, transpose_history elif isinstance(node, ReshapeParameters): check_for_null_transpose(node, transpose) # TODO - may eliminate - new_transpose = reverse_reshape(reverse_transpose( - transpose), node.shape, node.old_shape) + # reversed transpose is being propagated up + new_transpose, from_shape, to_shape = reverse_reshape( + transpose, node.old_shape, node.shape, going_up=True) # if the upwards shape has one dimension we keep going since we want to find # nodes such as a linear layer that can reorder their output filters # This could be extended to recurrent layers for the inner dimension info( - f"pass reshape {node.name} up trans: old {transpose} new {new_transpose} shape: {node.old_shape} -> {node.shape}") + f"pass reshape {node.name} up trans: old {transpose} new {new_transpose} " + 
f"shape: {node.old_shape} -> {node.shape}") if new_transpose is None and len(node.old_shape) > 1: - info(f"rejected {node.name} - transpose in - does not reverse") - return [InsertTransposeAction(node, direction='out', idx=out_edge.from_idx, out_edge=out_edge, transpose=reverse_transpose(transpose)), EndActionUp(node)], cur_visited_nodes + info(f"rejected {node.name} - cannot pass reshape - inserting transpose {transpose}") + # since we are going up the transpose is in the up direction so needs to be reversed + return [ + InsertTransposeAction(node, direction='out', idx=out_edge.from_idx, + out_edge=out_edge, transpose=reverse_transpose(transpose)), + EndActionUp(node)], cur_visited_nodes # insert an action to rewrite the reshape shapes - from_shape = node.old_shape.calc_transpose( - new_transpose) if new_transpose is not None else None - to_shape = node.shape.calc_transpose( - reverse_transpose(transpose)) if transpose is not None else None transpose_history = transpose_history + \ [TransposeHistory(node, node.shape, new_transpose, node.old_shape)] info(f"rewrite reshape to {from_shape}->{to_shape}") - if from_shape is None or to_shape is None or from_shape.shape != to_shape.shape: + if from_shape is None or to_shape is None or from_shape != to_shape: cur_actions.extend([ SetReshapeAction( node, @@ -781,16 +726,21 @@ def search_up(G, node, exclude_nodes, visited_nodes, out_edge, transpose_history if new_transpose is None: try: # @IgnoreException - return continue_up(G, node, exclude_nodes, visited_nodes, cur_visited_nodes.copy(), cur_actions.copy(), transpose_history, transpose) + return continue_up(G, node, exclude_nodes, visited_nodes, cur_visited_nodes.copy(), + cur_actions.copy(), transpose_history, transpose) except CantContinueError as ex: if transpose is None: raise ex - info(f"rejected {node.name} - transpose in - does not reverse") - return [InsertTransposeAction(node, direction='out', idx=out_edge.from_idx, out_edge=out_edge, 
transpose=reverse_transpose(transpose)), EndActionUp(node)], cur_visited_nodes + info(f"rejected {node.name} - cannot continue {ex} - inserting transpose {transpose}") + return [ + InsertTransposeAction(node, direction='out', idx=out_edge.from_idx, + out_edge=out_edge, transpose=reverse_transpose(transpose)), + EndActionUp(node)], cur_visited_nodes transpose = new_transpose # Continue to visit upwards - return continue_up(G, node, exclude_nodes, visited_nodes, cur_visited_nodes, cur_actions, transpose_history, transpose) + return continue_up(G, node, exclude_nodes, visited_nodes, cur_visited_nodes, + cur_actions, transpose_history, transpose) def continue_up(G, node, exclude_nodes, visited_nodes, cur_visited_nodes, cur_actions, transpose_history, transpose): @@ -802,19 +752,18 @@ def continue_up(G, node, exclude_nodes, visited_nodes, cur_visited_nodes, cur_ac if check_continue(visited_nodes, cur_visited_nodes, exclude_nodes, edge.from_node, 'up', edge.from_idx): continue edge_in_shape = node.in_dims[edge.to_idx].shape - if isinstance(node, Broadcastable) and len(edge_in_shape) != node.out_dims[0].rank: + if isinstance(node, (Broadcastable, PowOpParameters)) and len(edge_in_shape) != node.out_dims[0].rank: max_shape = compute_max_shape(node.out_dims) b_axes = broadcasted_axes(edge_in_shape, max_shape) - transpose_without_broadcast = strip_axes_from_transpose( - reverse_transpose(transpose), b_axes) + transpose_without_broadcast = strip_axes_from_transpose(transpose, b_axes) # from shape will be the old shape with the unbroadcasted transpose from_shape = apply_transpose( - edge_in_shape, transpose_without_broadcast) + edge_in_shape, reverse_transpose(transpose_without_broadcast)) # to shape is the broadcasted input shape with the transpose with the leading ones removed broadcasted_shape = ([1] * len(b_axes)) + list(edge_in_shape) - to_shape = strip_leading_ones(apply_transpose( - broadcasted_shape, reverse_transpose(transpose)), len(from_shape)) + to_shape = 
strip_leading_dim(apply_transpose( + broadcasted_shape, reverse_transpose(transpose))) # if they are not equal insert a reshape if from_shape != to_shape: info( @@ -896,7 +845,8 @@ def combine_transposes(G): for tstart, tend in trans_pairs: new_transpose = apply_transpose(tstart.transpose, tend.transpose) info( - f'combine transposes {tstart.name} and {tend.name} {tstart.transpose} & {tend.transpose} -> {new_transpose}') + f'combine transposes {tstart.name} and {tend.name} {tstart.transpose} & ' + f'{tend.transpose} -> {new_transpose}') tstart.transpose = new_transpose G.remove_and_reconnect(tend, edge_class=NNEdge) @@ -948,10 +898,12 @@ def delete_step_idx(G, action: DeleteTransposeAction): return G.in_edges(action.node)[0].from_node.step_idx -def eliminate_transposes(G, debug_function=None, steps=None, single_step=False, do_silly=True): +def eliminate_transposes(G, debug_function=None, steps=None, single_step=False, do_silly=True, only_up=False): info("eliminating unnecessary transposes") found_results = True pass_count = 0 + # keep trying to eliminate until we can't do more + # This should not loop since there is a bias in pushing transposes down while found_results: if steps is not None: if pass_count >= steps: @@ -965,7 +917,8 @@ def eliminate_transposes(G, debug_function=None, steps=None, single_step=False, visited_nodes = set() actions = [] info(f"search for transposes +++ STEP {pass_count}") - transposes = G.nodes(node_classes=TransposeParameters) + transposes = sorted( + G.nodes(node_classes=TransposeParameters), key=lambda node: node.name) while transposes: transpose_node = transposes.pop(0) if transpose_node in visited_nodes: @@ -998,6 +951,8 @@ def eliminate_transposes(G, debug_function=None, steps=None, single_step=False, cur_actions_up.insert(0, DeleteTransposeAction(transpose_node)) # search down for elimination try: + if only_up: + raise CantContinueError cur_visited_down = VisitedNodes() cur_visited_down.visit_down(transpose_node, 0) 
cur_actions_down = [] @@ -1032,7 +987,7 @@ def eliminate_transposes(G, debug_function=None, steps=None, single_step=False, down_count = count_eliminated(cur_actions_down) # if the count is zero then the transpose has been eliminated however # 1 is better than 0 since another real transpose was deleted rather than a reorder etc - # always choose up before down since up is where we will transpose constants rather than reshaping them + # always favor up before down since up is where we will transpose constants if up_count > 0 and up_count >= down_count: info( f'found elimination for {transpose_node.name} upwards - {up_count} eliminated') @@ -1042,7 +997,7 @@ def eliminate_transposes(G, debug_function=None, steps=None, single_step=False, visited_nodes.add(transpose_node) if single_step or steps is not None: break - # if transpose cannot be removed upwards movement push the transpose down if it actually moved + # if transpose cannot be removed upwards push the transpose down if it actually moved elif down_count > 0 or (down_count == 0 and transpose_moved(G, cur_actions_down)): info( f'found elimination for {transpose_node.name} downwards - {down_count} eliminated') @@ -1053,8 +1008,7 @@ def eliminate_transposes(G, debug_function=None, steps=None, single_step=False, if single_step or steps is not None: break else: - info( - f'no elimination for {transpose_node.name} found') + info(f'no elimination for {transpose_node.name} found') if found_results: info("eliminate transposes") diff --git a/tools/nntool/graph/manipulations/eliminate_transposes/eliminate_transposes_actions.py b/tools/nntool/graph/manipulations/eliminate_transposes/eliminate_transposes_actions.py index 55927a224..6a0d8c462 100644 --- a/tools/nntool/graph/manipulations/eliminate_transposes/eliminate_transposes_actions.py +++ b/tools/nntool/graph/manipulations/eliminate_transposes/eliminate_transposes_actions.py @@ -1,4 +1,4 @@ -# Copyright (C) 2020 GreenWaves Technologies, SAS +# Copyright (C) 2020, 2022 
GreenWaves Technologies, SAS # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU Affero General Public License as @@ -28,12 +28,19 @@ LOG = logging.getLogger("nntool." + __name__) + +class CantContinueError(Exception): + pass + + def info(msg): LOG.info(msg) + def debug(msg): LOG.debug(msg) + class Action(ABC): def __init__(self, node) -> None: self.node = node @@ -162,6 +169,7 @@ def _execute(self, node, G): def __str__(self) -> str: return f"insert reshape at {self.node.name}:{self.direction}_{self.idx} in {self.in_shape} out {self.out_shape}" + def make_dim(shape): if shape is None: return shape @@ -169,6 +177,7 @@ def make_dim(shape): return shape.clone() return Dim.unnamed(shape) + class SetReshapeAction(Action): def __init__(self, node, in_shape=None, out_shape=None) -> None: super(SetReshapeAction, self).__init__(node) @@ -196,21 +205,22 @@ def __str__(self) -> str: class TransposeSlidedSlice(Action): - def __init__(self, node, transpose_in, dir=None, transpose_out=None) -> None: + def __init__(self, node, transpose, dir=None, out_shape=None) -> None: super(TransposeSlidedSlice, self).__init__(node) - self.transpose_in = tuple(transpose_in) - if transpose_out is None: - self.transpose_out = self.transpose_in - else: - self.transpose_out = tuple(transpose_out) + self.transpose = tuple(transpose) + self.shape_out = out_shape def _execute(self, node, G): info(f"{self}") - node.act_slice = [node.act_slice[idx] for idx in self.transpose_in] - node.out_shape = [node.out_shape[idx] for idx in self.transpose_out] + node.act_slice = apply_transpose(node.act_slice, self.transpose) + if self.shape_out is not None: + node.out_shape = self.shape_out + else: + node.out_shape = apply_transpose(node.out_shape, self.transpose) def __str__(self) -> str: - return "%s transpose slided slice parameters with %s/%s" % (self.node.name, self.transpose_in, self.transpose_out) + out_shape = "unchanged" if self.shape_out is None else 
f"changed to {self.shape_out}" + return f"{self.node.name} transpose slided slice parameters with {self.transpose} out shape {out_shape}" class TransposePad(Action): @@ -227,19 +237,26 @@ def __str__(self) -> str: return "%s transpose pad parameters with %s" % (self.node.name, self.transpose) -class TransposeReverse(Action): +class TransposeAxisBase(Action): def __init__(self, node, transpose, dir=None) -> None: - super(TransposeReverse, self).__init__(node) + super(TransposeAxisBase, self).__init__(node) self.transpose = tuple(transpose) def _execute(self, node, G): info(f"{self}") node.axis = self.transpose[node.axis] + +class TransposeReverse(TransposeAxisBase): def __str__(self) -> str: return "%s transpose reverse parameters with %s" % (self.node.name, self.transpose) +class TransposeGlobalPool(TransposeAxisBase): + def __str__(self) -> str: + return "%s transpose global pool parameters with %s" % (self.node.name, self.transpose) + + class TransposeInputBase(Action): def __init__(self, node, transpose, dir=None) -> None: super(TransposeInputBase, self).__init__(node) @@ -360,25 +377,35 @@ def __str__(self) -> str: class ReorderLinearAction(Action): - def __init__(self, node, direction, transpose, shape, qrec=None) -> None: + def __init__(self, node, direction, transpose, shape, set_reshape_shape=None, qrec=None) -> None: super(ReorderLinearAction, self).__init__(node) self.direction = direction self.shape = shape self.transpose = tuple(transpose) self.qrec = qrec + self.set_reshape_shape = set_reshape_shape @classmethod - def from_history(cls, node, history, qrec, dir): + def from_history(cls, node, history, qrec, direction): # Find the first entry in the transpose history that actually has a transpose - first_valid_entry = next(iter([rec - for rec in reversed(history) - if rec.transpose])) + entry_idx, first_valid_entry = next(iter([(idx, rec) for idx, rec in enumerate(reversed(history)) + if rec.transpose])) # arriving from the top the transpose is in the 
down direction and from the # bottom in the up direction so in both cases we need to reverse it transpose = tuple(reverse_transpose(first_valid_entry.transpose)) # shape closest to the node shape = tuple(first_valid_entry.to_shape) - return cls(node, dir, transpose, shape, qrec=qrec) + set_reshape_shape = None + # if direction == "out": + # first_reshape = next(iter([elem.node for elem + # in list(reversed(history))[:entry_idx] if isinstance(elem.node, ReshapeParameters)]), None) + # if first_reshape: + # if shape != tuple(first_reshape.shape.shape): + # raise CantContinueError(f'reshape {first_reshape.name} after linear {node.name} has ' + # f'incorrect out shape {first_reshape.shape.shape} to apply transpose {transpose}') + # set_reshape_shape = (first_reshape, apply_transpose(first_reshape.shape.shape, transpose)) + + return cls(node, direction, transpose, shape, set_reshape_shape=set_reshape_shape, qrec=qrec) @classmethod def out_from_history(cls, node, history, qrec): @@ -390,7 +417,8 @@ def in_from_history(cls, node, history, qrec): def _execute(self, node, G): info(f"{self}") - filter_node = node.contained_filters()[0] if isinstance(node, LinearFusionParameters) else node + filter_node = node.contained_filters()[0] if isinstance( + node, LinearFusionParameters) else node in_edges = G.indexed_in_edges(node.name) weights_node = in_edges[1].from_node if self.direction == "in": @@ -425,7 +453,11 @@ def _execute(self, node, G): list(self.transpose) ), biases_node.value.shape) - nid = NodeId(node, filter_node) if isinstance(node, LinearFusionParameters) else NodeId(node) + nid = NodeId(node, filter_node) if isinstance( + node, LinearFusionParameters) else NodeId(node) + if self.set_reshape_shape: + self.set_reshape_shape[0].shape = Dim.unnamed( + self.set_reshape_shape[1]) # since the output channel order has changed we need to make channel scaled qrec match this if G.quantization and nid in G.quantization: qrec = G.quantization[nid] @@ -446,8 +478,6 @@ def 
_execute(self, node, G): if len(qrec.in_qs) > 2: fqrec.in_qs[2] = qrec.in_qs[2] - - def __str__(self) -> str: return "reorder linear layer %s %s with shape %s transposed %s" % (self.node.name, self.direction, self.shape, self.transpose) diff --git a/tools/nntool/graph/manipulations/eliminate_transposes/transpose_helpers.py b/tools/nntool/graph/manipulations/eliminate_transposes/transpose_helpers.py index 00b865e59..f043bd65a 100644 --- a/tools/nntool/graph/manipulations/eliminate_transposes/transpose_helpers.py +++ b/tools/nntool/graph/manipulations/eliminate_transposes/transpose_helpers.py @@ -22,20 +22,6 @@ def reverse_transpose(trans): return [trans.index(idx) for idx in range(len(trans))] -def reverses_transpose(trans1, trans2, dim=None): - """Checks if one transpose reverses another. If a dim is provided then - look if the transpose sequence produces an equivalent dim to cope with 1s in - dimensions.""" - if trans1 is None or trans2 is None: - return False - if dim and dim.layout_shape == dim.calc_transpose(trans1).calc_transpose(trans2).layout_shape: - return True - for idx, val in enumerate(trans1): - if trans2[val] != idx: - return False - return True - - def identity_transpose(trans): if trans is None: return False @@ -46,6 +32,35 @@ def apply_transpose(elems, trans): return [elems[i] for i in trans] +def strip_ones(shape): + return tuple(dim for dim in shape if dim != 1) + + +def reverses_transpose_up(trans1, trans2, dim=None): + """trans1->trans2->dim + 1) without dim do the transposes cancel + 2) with dim to the transposes cancel considering layout shape (i.e. 
without 1s in shape""" + if dim is not None and not isinstance(dim, tuple): + dim = tuple(dim.shape) + if trans1 is None or trans2 is None: + return False, None + if identity_transpose(apply_transpose(trans1, trans2)): + return True, None + if dim is not None: + # apply dim -> reverse t2 -> reverse t1 + # strip 1s and see if it is the same + layout_shape_after = strip_ones(dim) + shape_before = apply_transpose( + apply_transpose(dim, reverse_transpose(trans2)), + reverse_transpose(trans1)) + return strip_ones(shape_before) == layout_shape_after, shape_before + return False, None + + +def indexes_of(trans1, trans2): + return [trans1.index(i) for i in trans2] + + def transpose_does_nothing(transpose, shape): if transpose is None: return False @@ -57,10 +72,6 @@ def reduce_mask(mask): return reduce_mask(mask) == tmask -def strip_ones(shape): - return tuple(dim for dim in shape if dim != 1) - - def reshape_is_transpose(old_shape, new_shape): # TODO - check the order of the non 1 dimensions if len(old_shape) != len(new_shape): diff --git a/tools/nntool/graph/manipulations/extract.py b/tools/nntool/graph/manipulations/extract.py index 24bdec766..ffb07dc93 100644 --- a/tools/nntool/graph/manipulations/extract.py +++ b/tools/nntool/graph/manipulations/extract.py @@ -51,7 +51,6 @@ def extract_node(G: NNGraph, keep_node: Parameters): if node not in keep_nodes and node.name in G: LOG.info(f'remove {node.name}') G.remove(node) - G.reset_inout_counts() for edge in in_edges: input_node = G.add_input(edge.from_node.out_dims[edge.from_idx]) G.add_edge(NNEdge(input_node, keep_node, to_idx=edge.to_idx)) diff --git a/tools/nntool/graph/manipulations/liveness.py b/tools/nntool/graph/manipulations/liveness.py index 66e8a1aae..0c3959d83 100644 --- a/tools/nntool/graph/manipulations/liveness.py +++ b/tools/nntool/graph/manipulations/liveness.py @@ -15,6 +15,8 @@ from typing import Mapping, Sequence +from graph.types.input_output import InputBaseParameters, InputParameters, 
OutputParameters + def calculate_liveness(G, steps: Sequence[Mapping]) -> Mapping[str, Mapping]: liveness = {} for i, step in enumerate(steps): @@ -23,7 +25,7 @@ def calculate_liveness(G, steps: Sequence[Mapping]) -> Mapping[str, Mapping]: step['start'] = [] step['end'] = [] # input nodes create tensors - if G.is_input(node): + if isinstance(node, InputBaseParameters): edges = G.out_edges(node.name) if edges: assert all(edge.from_idx == 0 for edge in edges), "inputs should create a single tensor" @@ -40,7 +42,7 @@ def calculate_liveness(G, steps: Sequence[Mapping]) -> Mapping[str, Mapping]: assert live is not None, "Inputs to node must have already been created" if live['end'] < i: live['end'] = i - if G.is_output(node): + if isinstance(node, OutputParameters): live['is_output'] = True # check what we create for edge in G.out_edges(node.name): diff --git a/tools/nntool/graph/manipulations/set_aliases.py b/tools/nntool/graph/manipulations/set_aliases.py index dd7ff1614..94bbf8192 100644 --- a/tools/nntool/graph/manipulations/set_aliases.py +++ b/tools/nntool/graph/manipulations/set_aliases.py @@ -34,13 +34,7 @@ def walk_up(G, edge, concat_node): edge.to_node.name, edge.to_idx) edge.params.is_alias = True node = edge.from_node - if isinstance(node, ReshapeParameters): - # since it is a reshape it can only have one input - return walk_up(G, G.in_edges(node.name)[0], concat_node) - if isinstance(node, TransposeParameters): - if not node.does_nothing(): - return False - # since it is a reshape it can only have one input + if node.no_model_code: return walk_up(G, G.in_edges(node.name)[0], concat_node) if isinstance(node, SplitParameters): LOG.warning("split node %s is directly connected to concat node %s", @@ -61,12 +55,7 @@ def walk_down(G, edge, split_node): edge.params.is_alias = True node = edge.to_node errors = False - if isinstance(node, ReshapeParameters): - for edge in G.out_edges(node.name): - errors = errors or walk_down(G, edge, split_node) - elif 
isinstance(node, TransposeParameters): - if not node.does_nothing(): - return errors + if node.no_model_code: for edge in G.out_edges(node.name): errors = errors or walk_down(G, edge, split_node) elif isinstance(node, ConcatParameters): diff --git a/tools/nntool/graph/matches/fusions.py b/tools/nntool/graph/matches/fusions.py new file mode 100644 index 000000000..544a30ad1 --- /dev/null +++ b/tools/nntool/graph/matches/fusions.py @@ -0,0 +1,49 @@ +# Copyright (C) 2022 GreenWaves Technologies, SAS + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . 
+ +from graph.matches.matches import get_matches +from graph.types.constant_input import ConstantInputParameters +from quantization.verify_quantization import verify_quantization +from quantization.quantizer.new_quantizer import NewQuantizer + + +def fusions(nngraph, *match_names, no_postprocess: bool = False): + state = ConstantInputParameters.save_compression_state(nngraph) + try: + match_group = get_matches(*match_names) + while match_group: + match_group.match(nngraph) + nngraph.add_dimensions() + if no_postprocess: + break + if match_group.run_qtune: + quantizer = NewQuantizer(nngraph) + quantizer.quantize() + if match_group.run_adjust: + nngraph.adjust_order() + if match_group.run_again: + match_group = get_matches(*match_group.run_again) + else: + match_group = None + + if nngraph.quantization and verify_quantization(nngraph): + quantizer = NewQuantizer(nngraph) + quantizer.quantize() + problems = verify_quantization(nngraph) + if problems: + problems = "\n".join(problems) + raise ValueError(f'quantization issue after fusions\n{problems}\n') + finally: + ConstantInputParameters.restore_compression_state(nngraph, state) diff --git a/tools/nntool/graph/matches/match_utils.py b/tools/nntool/graph/matches/match_utils.py index bdffdc756..6ebb02ee4 100644 --- a/tools/nntool/graph/matches/match_utils.py +++ b/tools/nntool/graph/matches/match_utils.py @@ -30,7 +30,7 @@ def search_down(G, edge, for_node_classes, can_pass=None, can_pass_fn=None, edge multi_on_target (bool, optional): Allow target to have multiple edges. Defaults to True. 
Returns: - Optional[Sequence[Edge]]: Edges found or None + Optional[Sequence[Edge]]: Edges found or None """ if edge_list is None: edge_list = [] @@ -71,18 +71,19 @@ def search_up(G, edge, for_node_classes, can_pass=None, can_pass_fn=None, edge_l Args: G (NNGraph): Graph - edge (Edge): Edge to look down + edge (Edge): Edge to look up for_node_classes (Union[Parameters, Tuple[Parameters]]): Node class or classes to look for can_pass (Union[Parameters, Tuple[Parameters]], optional): Will pass through this node class or classes. Defaults to None. can_pass_fn (Callable, optional): function with graph and node as parameters. Should return True if search can pass this node. Defaults to None. - follow_multi (str, optional): Follow multi edge outputs. Defaults to empty string which means don't follow can be same or any. + follow_multi (str, optional): Follow multi edge outputs. Defaults to empty string which means don't + follow can be same or any. follow_first (bool, optional): Only follow first input. Defaults to True. multi_on_target (bool, optional): Allow target to have multiple edges. Defaults to True. Returns: - Optional[Sequence[Edge]]: Edges found or None + Optional[Sequence[Edge]]: Edges found or None """ if edge_list is None: edge_list = [] diff --git a/tools/nntool/graph/matches/matcher.py b/tools/nntool/graph/matches/matcher.py index a2914b99a..809d595a6 100644 --- a/tools/nntool/graph/matches/matcher.py +++ b/tools/nntool/graph/matches/matcher.py @@ -15,11 +15,10 @@ import logging from abc import ABC, abstractmethod -from typing import Generator, Sequence +from typing import Sequence +from utils.graph import GraphView, MatchNode from utils.node_id import NodeId -from utils.graph import GraphView, MatchNode, Node - LOG = logging.getLogger("nntool." 
+ __name__) @@ -55,6 +54,18 @@ def __init__(self, identity: str = None): def name(self): return self.NAME + @property + def run_again(self): + return self.RUN_AGAIN_ON_MATCH + + @property + def run_qtune(self): + return self.RUN_QTUNE_ON_MATCH + + @property + def run_adjust(self): + return self.RUN_ADJUST_ON_MATCH + @staticmethod def remove_quantization(G, node): if G.quantization: @@ -83,7 +94,7 @@ def description(val): @staticmethod def needs_valid_dimension(val): - return Matcher.property_register("DESCRIPTION", val) + return Matcher.property_register("NEEDS_VALID_DIMENSION", val) @staticmethod def modifies_dimensions(val): @@ -102,12 +113,14 @@ def run_again_on_match(*args): return Matcher.property_register("RUN_AGAIN_ON_MATCH", args) @staticmethod - def run_qtune_on_match(val): - return Matcher.property_register("RUN_QTUNE_ON_MATCH", val) + def run_qtune_on_match(cls): + setattr(cls, 'RUN_QTUNE_ON_MATCH', True) + return cls @staticmethod - def run_adjust_on_match(val): - return Matcher.property_register("RUN_ADJUST_ON_MATCH", val) + def run_adjust_on_match(cls): + setattr(cls, 'RUN_ADJUST_ON_MATCH', True) + return cls @staticmethod def groups(*args): @@ -134,59 +147,6 @@ def deco(cls): groups = Matcher.groups -class DontReplaceError(Exception): - pass - - -class DefaultMatcher(Matcher): - @abstractmethod - def match_function(self, G: GraphView) -> Generator[GraphView, None, None]: - pass - - @abstractmethod - def replace_function(self, G: GraphView, subgraph: GraphView) -> Node: - pass - - def _match(self, G: GraphView, set_identity: bool = True, **kwargs) -> bool: - replaced = True - has_modified_graph = False - while replaced: - replaced = False - for subgraph in self.match_function(G): - # TODO - Save in and out edges here since the replace function may modify the - # subgraph - in_edges = [in_edge for input_node in subgraph.inputs() - for in_edge in G.in_edges(input_node.name)] - out_edges = [out_edge for output_node in subgraph.outputs() - for out_edge in 
G.out_edges(output_node.name)] - try: - replacement, edge_in_mapping, edge_out_mapping = self.replace_function( - G, subgraph) - if replacement is None: - G.remove_fragment(subgraph) - has_modified_graph = True - elif isinstance(replacement, Node): - # use saved in and out edges - G.replace_fragment(subgraph, - replacement, - frag_in_edges=in_edges, - frag_out_edges=out_edges, - edge_in_mapping=edge_in_mapping, - edge_out_mapping=edge_out_mapping) - has_modified_graph = True - else: - raise TypeError( - "unexcepted return value from replace_function") - replaced = True - break - except DontReplaceError: - pass - - if set_identity: - self.set_identity(G) - - return has_modified_graph - # This can be used to define groups of matches to be selected # from the command line @@ -195,52 +155,56 @@ class MatchGroup(Matcher): def __init__(self, *args: Sequence[Matcher], identity: str = None): super().__init__(identity) - self.matches = list(args) + self._matches = {match.name: match for match in args} + self._matches_pending = [] + self._adjust_pending = False + self._qtune_pending = False + + @property + def run_again(self): + return self._matches_pending + + @property + def run_qtune(self): + return self._qtune_pending + + @property + def run_adjust(self): + return self._adjust_pending def add_match(self, match: Matcher): - self.matches.append(match) + self._matches.append(match) def _match(self, G: GraphView, set_identity: bool = True, **kwargs): # Note: assumption is that dimensions are valid when a match is called found_match = True dimensions_set = True + self._matches_pending = [] + self._adjust_pending = False + self._qtune_pending = False while found_match: found_match = False - for match_instance in self.matches: - LOG.debug("fusions - start %s", match_instance.name) - if match_instance.NEEDS_VALID_DIMENSION and not dimensions_set: + matches = list(self._matches.values()) + while matches: + match = matches.pop(0) + LOG.debug("fusions - start %s", match.name) + if 
match.NEEDS_VALID_DIMENSION and not dimensions_set: G.add_dimensions(quiet=True) dimensions_set = True - has_modified_graph = match_instance.match( + has_modified_graph = match.match( G, set_identity=False, group_identity=self._identity) if has_modified_graph: - LOG.info("++ fusion %s modified graph", match_instance.name) + LOG.info("++ fusion %s modified graph", match.name) found_match = True G.add_dimensions(quiet=True) + for required_match in match.run_again: + if match not in self._matches_pending: + self._matches_pending.append(required_match) + self._adjust_pending = self._adjust_pending or match.run_adjust + if G.quantization: + self._qtune_pending = self._qtune_pending or match.run_qtune + if dimensions_set and has_modified_graph: dimensions_set = False if set_identity: self.set_identity(G) - - -def find_forward(G: GraphView, edge, find_node_classes, skip_node_classes=None, find_skip=None): - if find_skip is None: - find_skip = [find_node_classes, skip_node_classes] - for idx, elem in enumerate(find_skip): - if elem is not None and not isinstance(elem, tuple): - if isinstance(elem, list): - find_skip[idx] = tuple(elem) - else: - find_skip[idx] = tuple([elem]) - if isinstance(edge.to_node, find_skip[0]): - return [[edge]] - if skip_node_classes and isinstance(edge.to_node, find_skip[0]): - res = [] - for out_edge in G.out_edges(edge.to_node.name): - edge_lists = find_forward(G, out_edge, find_node_classes, - find_skip=find_skip) - if not edge_lists: - continue - res.extend([[edge] + edge_list for edge_list in edge_lists]) - return res - return [] diff --git a/tools/nntool/graph/matches/matchers/concat_slice.py b/tools/nntool/graph/matches/matchers/concat_slice.py new file mode 100644 index 000000000..ef6fb82b6 --- /dev/null +++ b/tools/nntool/graph/matches/matchers/concat_slice.py @@ -0,0 +1,186 @@ +# Copyright (C) 2020 GreenWaves Technologies, SAS + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU 
Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . + +import logging +from functools import reduce + +from graph.matches.match_utils import search_up +from graph.types import ConcatParameters, NNEdge +from graph.types.others import (NoOPParameters, ReshapeParameters, + StridedSliceParameters) +from utils.compatible_transposes import find_combination +from utils.graph import GraphView + +from ..matcher import (Matcher, description, groups, match_name, + needs_valid_dimension, run_qtune_on_match) + +LOG = logging.getLogger("nntool." 
+ __name__) + + +def info(msg): + LOG.info(msg) + +def validate_slice(node: StridedSliceParameters): + if any(sl[2] != 1 for sl in node.act_slice): + return [None] * 2 + slices_axes = node.slices_axes + if len(slices_axes) != 1: + return [None] * 2 + axis = slices_axes[0] + act_slice = node.act_slice[axis] + return axis, (act_slice[0], act_slice[1]-act_slice[0]) + +def up_from_slice(G, edge, axis, shape, remove_nodes=None, removing=True, reshape=None): + if remove_nodes is None: + remove_nodes = [] + if removing: + remove_nodes.append(edge.to_node) + node = edge.from_node + if isinstance(node, ConcatParameters): + if axis != node.axis: + return [None] * 4 + offsets = node.offsets + if shape[0] not in offsets: + return [None] * 4 + index = offsets.index(shape[0]) + length = node.in_dims[index].shape[node.axis] + if shape[1] != length: + return [None] * 4 + return (remove_nodes, edge, index, reshape) + else: + if isinstance(node, ReshapeParameters): + if reshape is None: + reshape = node + combinations = find_combination(node.shape, node.old_shape) + combination = next(iter([comb for comb in combinations if (axis,) in comb]), None) + if combination is None: + return [None] * 4 + axis = combination.index((axis,)) + elif not isinstance(node, NoOPParameters): + return [None] * 4 + return up_from_slice( + G, + G.in_edges(node)[0], + axis, + shape, + remove_nodes=remove_nodes, + removing=removing and len(G.out_edges(node)) == 1, + reshape=reshape + ) + +@groups('*') +@run_qtune_on_match +@needs_valid_dimension(True) +@match_name("concat_slice") +@description("removes slices after concats that match an input of the concat") +class ConcatSliceMatch(Matcher): + + def _match(self, G: GraphView, set_identity: bool = True, **kwargs) -> bool: + has_modified_graph = False + concat_slice_edges = {} + for slice_node in G.nodes(node_classes=StridedSliceParameters): + # validate that slice is positive unit stride on a single axis + axis, shape = validate_slice(slice_node) + if axis 
is None: + continue + # search up for a concat + remove_nodes, concat_edge, concat_in_idx, reshape = up_from_slice( + G, + G.in_edges(slice_node)[0], + axis, + shape) + if not remove_nodes: + continue + concat = concat_edge.from_node + concat_slice_edges.setdefault( + concat, []).append((slice_node, remove_nodes, concat_in_idx, reshape)) + if not concat_slice_edges: + return False + + for concat, slices in concat_slice_edges.items(): + for (slice_node, remove_nodes, concat_in_idx, reshape) in slices: + self.eliminate_slice(G, concat, slice_node, remove_nodes, concat_in_idx, reshape) + if not G.out_edges(concat): + has_modified_graph = True + concat_in_edges = G.in_edges(concat) + info(f"removing concat {concat.name}") + if concat_in_edges: + self.resolve_unused_concat_in_edges( + G, concat, concat_in_edges) + if concat.name in G: + G.remove(concat) + + if set_identity: + self.set_identity(G) + + return has_modified_graph + + def search_delete_nodes_up(self, G, edge): + out_edges = G.out_edges(edge.from_node) + if len(out_edges) > 1: + return [] + return [edge.from_node] + self.search_delete_nodes_up(G, G.in_edges(edge.from_node)[0]) + + @staticmethod + def edge_list_str(G, edges): + edge_list = list(reversed([edge.to_node.name for edge in edges])) + if len(G.out_edges(edges[-1].from_node)) == 1: + edge_list.insert(0, f'{edges[-1].from_node.name} (removed)') + else: + edge_list.insert(0, edges[-1].from_node.name) + return "->".join(edge_list) + + def resolve_unused_concat_in_edges(self, G, concat, concat_in_edges): + for edge in concat_in_edges: + nodes = self.search_delete_nodes_up(G, edge) + if nodes: + info(f"removing unused nodes {' '.join(node.name for node in nodes)}") + for node in nodes: + if node.name in G: + G.remove(node) + + def eliminate_slice(self, G, concat, slice_node, remove_nodes, concat_in_idx, reshape_in): + concat_in_edge = G.indexed_in_edges(concat)[concat_in_idx] + node_idx = (concat_in_edge.from_node, concat_in_edge.from_idx) + 
info(f'removing slice {slice_node.name} and connecting {concat_in_edge.from_node.name}:{concat_in_edge.from_idx} to its edges') + if reshape_in: + reshape = ReshapeParameters( + G.unique_name(f'{slice_node.name}_reshape'), + old_shape=concat_in_edge.from_node.out_dims[concat_in_edge.from_idx].shape, + shape=slice_node.out_shape) + elif slice_node.changes_shape: + reshape = ReshapeParameters( + G.unique_name(f'{slice_node.name}_reshape'), + old_shape=slice_node.post_slice_shape, + shape=slice_node.out_shape) + else: + reshape = None + if reshape: + G.add_edge( + NNEdge( + from_node=node_idx[0], + from_idx=node_idx[1], + to_node=reshape)) + node_idx = (reshape, 0) + + slice_out_edges = G.out_edges(slice_node) + for rnode in remove_nodes: + if rnode.name in G: + G.remove(rnode) + if slice_node.name in G: + G.remove(slice_node) + for edge in slice_out_edges: + G.add_edge(NNEdge(from_node=node_idx[0], from_idx=node_idx[1], + to_node=edge.to_node, to_idx=edge.to_idx)) diff --git a/tools/nntool/graph/matches/matchers/concat_split.py b/tools/nntool/graph/matches/matchers/concat_split.py index 7adb7f978..e6a362b0c 100644 --- a/tools/nntool/graph/matches/matchers/concat_split.py +++ b/tools/nntool/graph/matches/matchers/concat_split.py @@ -15,15 +15,10 @@ import logging -from graph.dim import Dim from graph.types import ConcatParameters, NNEdge, SplitParameters -from graph.types.others import (CopyParameters, NoOPParameters, - ReshapeParameters, TransposeParameters) from utils.graph import GraphView -from utils.node_id import NodeId -from ..match_utils import search_down -from ..matcher import Matcher, description, groups, match_name, run_before +from ..matcher import Matcher, description, groups, match_name LOG = logging.getLogger("nntool." 
+ __name__) @@ -35,7 +30,7 @@ class ConcatSplitMatch(Matcher): def _match(self, G: GraphView, set_identity: bool = True, **kwargs) -> bool: has_modified_graph = False - for split_node in set([node for node in G.nodes() if isinstance(node, SplitParameters)]): + for split_node in G.nodes(node_classes=SplitParameters): in_edges = G.in_edges(split_node.name) if len(in_edges) > 1: continue @@ -70,120 +65,3 @@ def _match(self, G: GraphView, set_identity: bool = True, **kwargs) -> bool: self.set_identity(G) return has_modified_graph - - -def reduce_slices(slices, shapes): - res_slice = [] - res_shape = [] - for slice_axis, shape_axis in zip(zip(*slices), zip(*shapes)): - if slice_axis[0] == slice_axis[1]: - res_slice.append(slice_axis[0]) - res_shape.append(shape_axis[0]) - else: - res_slice.append( - (slice_axis[0][0], - slice_axis[-1][1], - slice_axis[0][2])) - res_shape.append(sum(shape_axis)) - return res_slice, res_shape - - -def remove_edges(G, edges): - if not edges: - return - edges = edges.copy() - while len(edges) > 1: - edge = edges.pop(0) - G.remove(edge.to_node) - if G.quantization: - nid = NodeId(edge.to_node) - if nid in G.quantization: - del G.quantization[nid] - try: - G.remove_edge(edges[0]) # @IgnoreException - except KeyError: - pass - - -@groups('*') -@match_name("split_concat") -@run_before('remove_noops', 'remove_copies') -@description("removes splits that go to concats where all the out edges of the split are in sequence in the concat") -class SplitConcatMatch(Matcher): - def _match(self, G: GraphView, set_identity: bool = True, **kwargs) -> bool: - edge_groups = [] - for node in G.nodes(node_classes=SplitParameters): - cur_group = None - for out_edge_bundle in G.indexed_out_edges(node): - if len(out_edge_bundle) == 1: - out_edge = out_edge_bundle[0] - concat_node_edges = search_down( - G, out_edge, ConcatParameters, - can_pass=(CopyParameters, NoOPParameters, - ReshapeParameters), - can_pass_fn=lambda _, node: isinstance(node, TransposeParameters) 
and node.does_nothing) - if concat_node_edges: - if cur_group: - this_concat_edge = concat_node_edges[-1] - last_concat_edge = cur_group[-1][-1] - if this_concat_edge.to_node == last_concat_edge.to_node and this_concat_edge.to_idx == last_concat_edge.to_idx + 1: - cur_group.append(concat_node_edges) - continue - if len(cur_group) > 1: - edge_groups.append(cur_group) - cur_group = [concat_node_edges] - continue - if cur_group: - if len(cur_group) > 1: - edge_groups.append(cur_group) - cur_group = None - if cur_group: - if len(cur_group) > 1: - edge_groups.append(cur_group) - cur_group = None - # we leave the splits and concats after this since they will be cleared up by remove_noops - for edge_group in edge_groups: - split_node = edge_group[0][0].from_node - concat_node = edge_group[0][-1].to_node - from_idx = edge_group[0][0].from_idx - to_idx = edge_group[-1][0].from_idx - from_concat_idx = edge_group[0][-1].to_idx - to_concat_idx = edge_group[1][-1].to_idx - LOG.info( - f"combining outputs {from_idx}:{to_idx} on split node {split_node.name} followed by concat {concat_node.name}") - # combine slices and shapes on edges in group - new_slice, new_shape = reduce_slices( - split_node.act_slices[from_idx:to_idx+1], - split_node.out_shapes[from_idx:to_idx+1] - ) - new_concat_shape = Dim.combine( - [concat_node.in_dims[idx] - for idx in range(from_concat_idx, to_concat_idx+1)], - concat_node.axis) - split_node.act_slices = split_node.act_slices[:from_idx] + [ - new_slice] + split_node.act_slices[to_idx+1:] - # the slice may need to reshape since we will remove everything in between - split_node.out_shapes = split_node.out_shapes[:from_idx] + [ - new_concat_shape.shape] + split_node.out_shapes[to_idx+1:] - - # remove all edges and intermediate nodes on all edge groups - for edge_list in edge_group: - remove_edges(G, edge_list) - # add back a direct edge to the first idx - G.add_edge(NNEdge(from_node=split_node, from_idx=edge_group[0][0].from_idx, to_node=concat_node, 
to_idx=edge_group[0][-1].to_idx)) - out_edge_bundles = G.indexed_out_edges(split_node) - # move edges beyond the edge group after the first index - for offset, edge_list in enumerate(out_edge_bundles[to_idx+1:]): - assert len(edge_list) == 1 - edge = edge_list[0] - G.remove_edge(edge) - G.add_edge(NNEdge.clone(edge, from_idx=from_idx+1+offset)) - # reindex the in edges in the concat - from_idx = edge_group[0][-1].to_idx - to_idx = edge_group[-1][-1].to_idx - in_edges = G.indexed_in_edges(concat_node) - for offset, in_edge in enumerate(in_edges[to_idx+1:]): - G.remove_edge(in_edge) - G.add_edge(NNEdge.clone(in_edge, to_idx=from_idx+1+offset)) - - return bool(edge_groups) diff --git a/tools/nntool/graph/matches/matchers/copy_on_outputs.py b/tools/nntool/graph/matches/matchers/copy_on_outputs.py index cbb64195b..3a4462842 100644 --- a/tools/nntool/graph/matches/matchers/copy_on_outputs.py +++ b/tools/nntool/graph/matches/matchers/copy_on_outputs.py @@ -15,9 +15,7 @@ import logging -from graph.matches.matchers.insert_copies import find_real_in_edge -from graph.types import (CopyParameters, NNEdge, OutputParameters, - ReshapeParameters, TransposeParameters) +from graph.types import CopyParameters, NNEdge, OutputParameters from utils.graph import GraphView from utils.node_id import NodeId @@ -29,7 +27,7 @@ def search_down(G, edge): node = edge.to_node - if isinstance(node, ReshapeParameters) or (isinstance(node, TransposeParameters) and node.does_nothing): + if node.no_model_code: res = [] for out_edge in G.out_edges(node): res.extend(search_down(G, out_edge)) @@ -49,7 +47,7 @@ def search_up(G, edge): if out_edge == edge: continue res.extend(search_down(G, out_edge)) - if isinstance(node, (OutputParameters, ReshapeParameters)) or (isinstance(node, TransposeParameters) and node.does_nothing): + if node.no_model_code: edge = G.in_edges(node)[0] res.extend(search_up(G, edge)) return res diff --git a/tools/nntool/graph/matches/matchers/copy_on_split_inputs.py 
b/tools/nntool/graph/matches/matchers/copy_on_split_inputs.py deleted file mode 100644 index 4b0be9606..000000000 --- a/tools/nntool/graph/matches/matchers/copy_on_split_inputs.py +++ /dev/null @@ -1,61 +0,0 @@ -# Copyright (C) 2020 GreenWaves Technologies, SAS - -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU Affero General Public License as -# published by the Free Software Foundation, either version 3 of the -# License, or (at your option) any later version. - -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU Affero General Public License for more details. - -# You should have received a copy of the GNU Affero General Public License -# along with this program. If not, see . - -from graph.matches.matchers.insert_copies import find_real_in_edge -import logging - -from graph.types import InputParameters, ReshapeParameters, ConstantInputParameters -from graph.types.others import ConcatParameters, CopyParameters, SplitParameters -from utils.graph import GraphView - -from ..matcher import Matcher, groups, match_name, description, modifies_dimensions, run_after - -LOG = logging.getLogger("nntool." 
+ __name__) - - -def search_up_for_input(G, node, going_up=None): - if going_up is None or isinstance(node, ReshapeParameters): - return search_up_for_input(G, G.in_edges(node.name)[0].from_node, going_up=True) - if isinstance(node, (InputParameters, ConstantInputParameters)): - return node - return None - -@match_name("copy_on_split_inputs") -@description("Insert copy on inputs that could be in a tensor stack") -@modifies_dimensions(True) -@groups('*') -class CopyOnSplitInputs(Matcher): - - def _match(self, G: GraphView, set_identity: bool = True, **kwargs): - - candidates = [node for node in G.nodes(node_classes=(SplitParameters, ConcatParameters))] - need_a_copy_edges = [] - for node in candidates: - for idx, edge in enumerate(G.indexed_in_edges(node.name)): - real_from_node, _ = find_real_in_edge(G, edge) - if isinstance(real_from_node, (InputParameters, ConstantInputParameters)): - need_a_copy_edges.append((edge, idx)) - has_modified_graph = False - for edge in need_a_copy_edges: - LOG.info( - "Insert copy on split input %s", edge[0].to_node.name) - has_modified_graph = True - cnode = CopyParameters(G.unique_name(f'{edge[0].to_node.name}_copy')) - G.insert_node_at_edge(cnode, edge[0]) - if G.quantization: - G.quantization.copy_qrec(edge[0].to_node, 'in', 0, cnode) - if set_identity: - self.set_identity(G) - return has_modified_graph diff --git a/tools/nntool/graph/matches/matchers/duplicate_constants.py b/tools/nntool/graph/matches/matchers/duplicate_constants.py index 6aebf13cb..bd4a103c3 100644 --- a/tools/nntool/graph/matches/matchers/duplicate_constants.py +++ b/tools/nntool/graph/matches/matchers/duplicate_constants.py @@ -22,7 +22,7 @@ LOG = logging.getLogger("nntool." 
+ __name__) -@match_name("match_duplicate_constants") +@match_name("duplicate_constants") @description("""Find constants that are linked to more than one node and duplicate them""") @run_before('*') @groups('symmetric', 'scaled') diff --git a/tools/nntool/graph/matches/matchers/duplicate_operations.py b/tools/nntool/graph/matches/matchers/duplicate_operations.py index 4ac00b591..6072c389f 100644 --- a/tools/nntool/graph/matches/matchers/duplicate_operations.py +++ b/tools/nntool/graph/matches/matchers/duplicate_operations.py @@ -12,32 +12,37 @@ # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see . -from graph.dim import Dim import logging from copy import deepcopy from functools import partial from itertools import groupby import numpy as np +from graph.dim import Dim from graph.types import SplitParameters from graph.types.base import ComparableParameters, NNEdge from utils.graph import GraphView -from ..matcher import Matcher, description, groups, match_name, run_before +from ..matcher import Matcher, description, groups, match_name, run_before, run_qtune_on_match LOG = logging.getLogger("nntool." 
+ __name__) -@match_name("match_duplicate_operations") + +@match_name("duplicate_operations") @description("""Removes operations that are duplicates on the same edge""") @run_before("*") @groups('symmetric', 'scaled') +@run_qtune_on_match class MatchDuplicateOperations(Matcher): + def __init__(self, identity: str = None, limit_to_dest_classes=None): + super().__init__(identity) + self._limit_to_dest_classes = limit_to_dest_classes def _match(self, G: GraphView, set_identity: bool = True, **kwargs): - if G.quantization: - LOG.warning( - 'match_duplicate_operations does not handle quantized graphs') - return False + # if G.quantization: + # LOG.warning( + # 'duplicate_operations does not handle quantized graphs') + # return False def same_source_edge_fn(x): return f"{x.from_node.__hash__()}##{x.from_idx}" @@ -53,6 +58,11 @@ def same_dest_edge(x): # all have the same origin same_source_edges = [elem for elem in same_source_edges if len(elem) > 1] + if self._limit_to_dest_classes: + same_source_edges = list( + filter( + lambda edges: all(isinstance(edge.to_node, self._limit_to_dest_classes) for edge in edges), + same_source_edges)) same_dest_edges = [] same_dest_group_edges = [] @@ -63,7 +73,7 @@ def same_dest_edge(x): first = same_source_edge.pop(0) others = list(filter(partial(lambda x, y: x.to_node != y.to_node and y.to_node.is_same_operation_as(G, - x.to_node), first), same_source_edge)) + x.to_node), first), same_source_edge)) if others: same_dest_edges.append(tuple([first] + others)) for other in others: @@ -83,10 +93,11 @@ def same_dest_edge(x): while same_dest_edges: edge_set = same_dest_edges.pop(0) keep_node = edge_set[0].to_node - other_edge_sets = [edges for edges in same_dest_edges if any(edge.to_node == keep_node for edge in edges)] + other_edge_sets = [edges for edges in same_dest_edges if any( + edge.to_node == keep_node for edge in edges)] for other_edge_set in other_edge_sets: same_dest_edges.remove(other_edge_set) - + nodes_to_delete = set() for 
edge_set in [edge_set] + other_edge_sets: for edge in edge_set: @@ -95,13 +106,13 @@ def same_dest_edge(x): continue nodes_to_delete.add(other_node) for out_edge in G.out_edges(other_node): - G.add_edge(NNEdge(from_node=keep_node, to_node=out_edge.to_node, to_idx=out_edge.to_idx)) + G.add_edge( + NNEdge(from_node=keep_node, to_node=out_edge.to_node, to_idx=out_edge.to_idx)) LOG.info( f'removed duplicates {",".join(node.name for node in nodes_to_delete)} to {keep_node.name}') for node in nodes_to_delete: G.remove(node) - - + # # all are multiple edges that go to something comparable # for edge_set in same_dest_edges: @@ -145,7 +156,8 @@ def same_dest_edge(x): out_edges = G.out_edges(first_node.name) for edge in out_edges: G.remove_edge(edge) - G.add_edge(NNEdge(from_node=split1, from_idx=out_num, to_node=edge.to_node, to_idx=edge.to_idx)) + G.add_edge(NNEdge(from_node=split1, from_idx=out_num, + to_node=edge.to_node, to_idx=edge.to_idx)) G.add_edge(NNEdge(from_node=first_node, to_node=split1)) # first split output goes to original output for other in edge_set[1::]: @@ -169,7 +181,8 @@ def same_dest_edge(x): G.remove(weights_other) G.remove(biases_other) for edge in out_edges: - G.add_edge(NNEdge(from_node=split1, from_idx=out_num, to_node=edge.to_node, to_idx=edge.to_idx)) + G.add_edge(NNEdge(from_node=split1, from_idx=out_num, + to_node=edge.to_node, to_idx=edge.to_idx)) LOG.info( f'merged convolutions {",".join(dup_nodes)} into {first_node.name}') if not found_more: diff --git a/tools/nntool/graph/matches/matchers/duplicate_operations_out.py b/tools/nntool/graph/matches/matchers/duplicate_operations_out.py index 1700adb7c..98078efe8 100644 --- a/tools/nntool/graph/matches/matchers/duplicate_operations_out.py +++ b/tools/nntool/graph/matches/matchers/duplicate_operations_out.py @@ -23,7 +23,7 @@ LOG = logging.getLogger("nntool." 
+ __name__) -@match_name("match_duplicate_operations_out") +@match_name("duplicate_operations_out") @description("""Removes operations that are duplicates on the same out edge""") @run_before("*") @groups('*') @@ -37,7 +37,8 @@ def explore(self, G, nodes, result=None): out_edges_bundle = [G.indexed_out_edges(node.name) for node in nodes] if any(len(out_edges) != 1 or len(out_edges[0]) != 1 for out_edges in out_edges_bundle): return result - if any(not isinstance(node, ComparableParameters) or not node.is_same_operation_as(G, nodes[0]) + # node == nodes[0] added since node could be a multi input expression + if any(not isinstance(node, ComparableParameters) or not node.is_same_operation_as(G, nodes[0]) or node == nodes[0] for node in nodes[1::]): return result if not result: diff --git a/tools/nntool/graph/matches/matchers/equalize_sym_mult_concats.py b/tools/nntool/graph/matches/matchers/equalize_sym_mult_concats.py deleted file mode 100644 index a3aa280ac..000000000 --- a/tools/nntool/graph/matches/matchers/equalize_sym_mult_concats.py +++ /dev/null @@ -1,91 +0,0 @@ -# Copyright (C) 2020 GreenWaves Technologies, SAS - -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU Affero General Public License as -# published by the Free Software Foundation, either version 3 of the -# License, or (at your option) any later version. - -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU Affero General Public License for more details. -# You should have received a copy of the GNU Affero General Public License -# along with this program. If not, see . 
- -import logging - -from graph.types import (ConcatParameters, ReshapeParameters, SplitParameters, - TransposeParameters) -from graph.types.base import FilterParameters -from utils.graph import Edge, GraphView -from utils.node_id import NodeId - -from ..matcher import Matcher, description, groups, match_name, modifies_dimensions - -LOG = logging.getLogger("nntool." + __name__) - -CAN_PASS = ( - ReshapeParameters, - TransposeParameters, - SplitParameters -) - -# TODO - This match should be rewritten to use the quantizer - - -def set_in_scale(qrec, index, scale): - in_q = qrec.in_qs[index] - assert qrec.ktype.startswith( - 'scaled'), "not supported on other quantization types" - in_q.scale = scale - - -def set_out_scale(node, qrec, index, scale): - out_q = qrec.out_qs[index] - assert qrec.ktype.startswith( - 'scaled'), "not supported on other quantization types" - if isinstance(node, FilterParameters): - assert index == 0, "trying to set strange index on filter quantization record" - out_q.scale = scale - qrec.cache['mul_biases_q'].scale = qrec.in_qs[0].scale * \ - qrec.in_qs[1].scale / out_q.scale - else: - out_q.scale = scale - - -def propagate_qtype_up(G, qtype, edge: Edge): - LOG.info("propagating scale up from node %s to node %s", - edge.to_node.name, edge.from_node.name) - qrec_out = G.quantization[NodeId(edge.from_node)] - set_out_scale(edge.from_node, qrec_out, edge.from_idx, qtype.scale) - qrec_in = G.quantization[NodeId(edge.to_node)] - set_in_scale(qrec_in, edge.to_idx, qtype.scale) - if isinstance(edge.from_node, CAN_PASS): - for edge in G.in_edges(edge.from_node.name): - propagate_qtype_up(G, qtype, edge) - -@match_name("equalize_sm_concats") -@description("""Equalize input quantization of concats with symmetric multiplicative quantization""") -@groups('symmetric') -@modifies_dimensions(False) -class EqualizeSymmetricMultiplicativeQuantivedConcats(Matcher): - - def _match(self, G: GraphView, set_identity: bool = True, **kwargs): - if not G.quantization: 
- return - concats = [node for node in G.nodes() if isinstance(node, - ConcatParameters)] - qrecs = [G.quantization[NodeId(node)] for node in concats] - if not all(qrec.ktype.startswith('scaled') for qrec in qrecs): - return - for concat, qrec in zip(concats, qrecs): - out_q = qrec.out_qs[0] - for edge in G.in_edges(concat.name): - in_q = qrec.in_qs[edge.to_idx] - if in_q != out_q: - propagate_qtype_up(G, out_q, edge) - - if set_identity: - self.set_identity(G) - - return False diff --git a/tools/nntool/graph/matches/matchers/expression_matcher.py b/tools/nntool/graph/matches/matchers/expression_matcher.py index cd1f8c5d8..4f25528f1 100644 --- a/tools/nntool/graph/matches/matchers/expression_matcher.py +++ b/tools/nntool/graph/matches/matchers/expression_matcher.py @@ -250,7 +250,7 @@ def find_connected_groups(G): @match_name("expression_matcher") @description("Groups piecewise expressions for kernel generation") -@run_after('expand_transposes') +@run_after('*') @needs_valid_dimension(True) class ExpressionMatcher(Matcher): diff --git a/tools/nntool/graph/matches/matchers/filt_bigger_than_in.py b/tools/nntool/graph/matches/matchers/filter_bigger_than_input.py similarity index 100% rename from tools/nntool/graph/matches/matchers/filt_bigger_than_in.py rename to tools/nntool/graph/matches/matchers/filter_bigger_than_input.py diff --git a/tools/nntool/graph/matches/matchers/find_asymmetric_quantization.py b/tools/nntool/graph/matches/matchers/find_asymmetric_quantization.py deleted file mode 100644 index 411a35991..000000000 --- a/tools/nntool/graph/matches/matchers/find_asymmetric_quantization.py +++ /dev/null @@ -1,203 +0,0 @@ -# Copyright (C) 2020 GreenWaves Technologies, SAS - -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU Affero General Public License as -# published by the Free Software Foundation, either version 3 of the -# License, or (at your option) any later version. 
- -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU Affero General Public License for more details. - -# You should have received a copy of the GNU Affero General Public License -# along with this program. If not, see . -# import logging - -# from graph.matches.matcher import Matcher -# from graph.types import (ActivationParameters, ConcatParameters, -# ConstantInputParameters, Conv2DParameters, -# ConvFusionParameters, FcParameters, -# GlobalPoolingParameters, InputParameters, -# MatrixAddParameters, OutputParameters, -# PoolingParameters, ReshapeParameters) - -# from utils.graph import GraphView -# from utils.node_id import NodeId - -# LOG = logging.getLogger("nntool." + __name__) - -# CAN_CHANGE_OUTPUT = ( -# InputParameters, ConstantInputParameters, Conv2DParameters, -# ConvFusionParameters, FcParameters, MatrixAddParameters -# ) - -# CAN_CHANGE_INPUT = ( -# OutputParameters, Conv2DParameters, ConvFusionParameters, -# FcParameters, MatrixAddParameters -# ) - -# CAN_PROPAGATE_INPUT = ( -# GlobalPoolingParameters, ReshapeParameters, ConcatParameters, ActivationParameters, PoolingParameters -# ) - -# ARE_MULTI_INPUT = ( -# ConcatParameters -# ) - -# class FindAsymmetricQuantization(Matcher): -# NAME = "find_asymmetric_quantization" -# DESCRIPTION = """Find nodes that can have asymmetric quantization. Must run after padding has been fused.""" - -# def can_change_input(self, G, node, exclude=None): -# """Returns None or a list of tuples of (node, multi_input_node) where node is an -# input of multi_input_node. An empty list is a confirmed string. A list that contains -# multi input nodes needs to be reconciled. 
An empty list means that this node -# cannot be changed.""" - -# if isinstance(node, CAN_PROPAGATE_INPUT): -# if exclude and node in exclude: -# return None -# nodes = [] -# for succ in [succ -# for succs in G.successors(node.name) -# for succ in succs]: -# can_change = self.can_change_input(G, succ, exclude=exclude) -# if can_change is None: -# return None -# nodes += can_change -# if isinstance(succ, ARE_MULTI_INPUT): -# nodes.append((node, succ)) -# return nodes -# if not isinstance(node, CAN_CHANGE_INPUT): -# return None -# if isinstance(node, ConvFusionParameters): -# filters = node.contained_filters() -# if len(filters) == 1 and not filters[0].padding.has_padding: -# return [] -# else: -# return None -# if isinstance(node, Conv2DParameters): -# return None if node.padding.has_padding else [] -# return [] - -# def can_change_output(self, node): -# return isinstance(node, CAN_CHANGE_OUTPUT) - -# def validate_multi_input(self, G, input_dict): -# # {start_node: [(pred, mi_node), ..]} -# mi_nodes = {} -# # index all of the predecessor nodes by mi node -# for pr_node, mi_node in [match for matches in input_dict.values() for match in matches]: -# pr_node_set = mi_nodes.get(mi_node) -# if pr_node_set is None: -# pr_node_set = set() -# mi_nodes[mi_node] = pr_node_set -# pr_node_set.add(pr_node) -# bad_mi_nodes = [] -# # check that all the predecessors were OK -# for mi_node, pr_nodes in mi_nodes.items(): -# if not all(node in pr_nodes for node in G.predecessors(mi_node)): -# bad_mi_nodes.append(mi_node) -# start_nodes = [] -# # find the records that have bad nodes in them -# if bad_mi_nodes: -# for start_node, matches in input_dict.items(): -# if any(mi_node in bad_mi_nodes for _, mi_node in matches): -# start_nodes.append(start_nodes) -# for start_node in start_nodes: -# del input_dict[start_node] -# matches = self.can_change_input(G, start_node, exclude=bad_mi_nodes) -# if matches is not None: -# assert len(matches) == 0 -# input_dict[start_node] = [] -# return 
input_dict - -# def change_output_to_async(self, G, node, idx): -# if isinstance(node, ConvFusionParameters): -# changing = False -# for fnode in node.contained_nodes(): -# if changing: -# nid = NodeId(node, fnode) -# qrec = G.quantization[nid] -# if isinstance(qrec.in_qs[0], SymmetricMultQTypeWrapper): -# qrec.in_qs[0] = qrec.in_qs[0].wrapped -# if isinstance(qrec.out_qs[0], SymmetricMultQTypeWrapper): -# qrec.out_qs[0] = qrec.out_qs[0].wrapped -# elif isinstance(fnode, (Conv2DParameters, FcParameters)): -# changing = True -# nid = NodeId(node, fnode) -# qrec = G.quantization[nid] -# if isinstance(qrec.out_qs[0], SymmetricMultQTypeWrapper): -# qrec.out_qs[0] = qrec.out_qs[0].wrapped - -# nid = NodeId(node) -# qrec = G.quantization[nid] -# if isinstance(qrec.out_qs[idx], SymmetricMultQTypeWrapper): -# qrec.out_qs[idx] = qrec.out_qs[idx].wrapped - -# def change_input_to_async(self, G, node, idx): -# if isinstance(node, ConvFusionParameters): -# for fnode in node.contained_nodes(): -# nid = NodeId(node, fnode) -# qrec = G.quantization[nid] -# if isinstance(fnode, (Conv2DParameters, FcParameters)): -# if isinstance(qrec.in_qs[0], SymmetricMultQTypeWrapper): -# qrec.in_qs[0] = qrec.in_qs[0].wrapped -# qrec.in_qs[2].link(qrec.in_qs[1], qrec.in_qs[0]) -# return -# if isinstance(qrec.in_qs[0], SymmetricMultQTypeWrapper): -# qrec.in_qs[0] = qrec.in_qs[0].wrapped -# if isinstance(qrec.out_qs[0], SymmetricMultQTypeWrapper): -# qrec.out_qs[0] = qrec.out_qs[0].wrapped - -# nid = NodeId(node) -# qrec = G.quantization[nid] -# if isinstance(qrec.in_qs[idx], SymmetricMultQTypeWrapper): -# qrec.in_qs[idx] = qrec.in_qs[idx].wrapped -# if isinstance(node, (Conv2DParameters, FcParameters)): -# qrec.in_qs[2].link(qrec.in_qs[1], qrec.in_qs[idx]) -# if isinstance(node, OutputParameters) and isinstance(qrec.out_qs[0], SymmetricMultQTypeWrapper): -# qrec.out_qs[0] = qrec.out_qs[0].wrapped - -# def do_change(self, G, node, idx=0): -# self.change_output_to_async(G, node, idx) -# for edge in 
G.out_edges(node.name): -# if isinstance(edge.to_node, CAN_PROPAGATE_INPUT): -# self.change_input_to_async(G, edge.to_node, edge.to_idx) -# self.do_change(G, edge.to_node, edge.from_idx) -# else: -# assert isinstance(edge.to_node, CAN_CHANGE_INPUT) -# if isinstance(edge.to_node, ConvFusionParameters): -# filters = edge.to_node.contained_filters() -# assert len(filters) == 1 and not filters[0].padding.has_padding -# if isinstance(edge.to_node, Conv2DParameters): -# assert not edge.to_node.padding.has_padding -# self.change_input_to_async(G, edge.to_node, edge.to_idx) - -# def _match(self, G: GraphView, set_identity: bool = True, **kwargs): -# if not G.quantization: -# return -# input_dict = {} -# for node in G.nodes(): -# if not self.can_change_output(node): -# continue -# all_matches = [] -# for succ in [succ for succs in G.successors(node.name) for succ in succs]: -# matches = self.can_change_input(G, succ) -# if matches is None: -# all_matches = None -# break -# all_matches += matches -# if all_matches is None: -# continue -# input_dict[node] = all_matches - -# input_dict = self.validate_multi_input(G, input_dict) -# for node in input_dict: -# # all nodes that can currently change output have one output -# self.do_change(G, node) - -# if set_identity: -# self.set_identity(G) -# return False diff --git a/tools/nntool/graph/matches/matchers/match_external_bias.py b/tools/nntool/graph/matches/matchers/fuse_external_bias.py similarity index 80% rename from tools/nntool/graph/matches/matchers/match_external_bias.py rename to tools/nntool/graph/matches/matchers/fuse_external_bias.py index 213a99851..fe258b10f 100644 --- a/tools/nntool/graph/matches/matchers/match_external_bias.py +++ b/tools/nntool/graph/matches/matchers/fuse_external_bias.py @@ -20,9 +20,8 @@ MatrixAddParameters, MatrixMulParameters, NNEdge) from graph.types.others import ReshapeParameters from utils.graph import GraphView -from utils.node_id import NodeId -from ..matcher import Matcher, match_name, 
description, groups, run_before +from ..matcher import Matcher, match_name, description, groups, run_before, run_qtune_on_match LOG = logging.getLogger("nntool." + __name__) @@ -33,10 +32,13 @@ MatrixMulParameters: (np.multiply, True) } + @match_name('fuse_external_bias') @description('Fuse bias addition after filter with filter bias') @groups('scaled', 'symmetric') -@run_before('match_op_activation', 'move_pooling_scale8', 'move_activations_pow2', 'move_activations_scale8') +@run_before('fuse_op_activation_scale8', 'fuse_op_activation_pow2', 'move_pooling_scale8', + 'move_activations_up') +@run_qtune_on_match class MatchExternalBias(Matcher): def _match(self, G: GraphView, set_identity: bool = True, **kwargs): @@ -77,37 +79,35 @@ def _match(self, G: GraphView, set_identity: bool = True, **kwargs): LOG.warning('could not absorb %s into %s', const_node.name, filter_node.name) break - # If there is quantization then essentially the output of the filter - # takes the quantization of the output of the operation. 
- # The biases will not change since their quantization depends on the weights - # and input - fnid = NodeId(filter_node) - opnid = NodeId(op_node) - if G.quantization and (fnid in G.quantization or opnid in G.quantization): - if not (fnid in G.quantization and opnid in G.quantization): - LOG.warning( - 'could not absorb %s into %s - graph is partially quantized', const_node.name, filter_node.name) - break - fqrec = G.quantization[fnid] - opqrec = G.quantization[opnid] - fqrec.out_qs[0] = opqrec.out_qs[0] has_modified_graph = True LOG.info("fusing bias in %s into %s", const_node.name, filter_node.name) self.fuse_bias(G, filter_node, other_idx, op, flat_value, 2) if weights_and_biases: - # TODO - need to adjust weights quantization here LOG.info("fusing multiplicative bias in %s into %s", const_node.name, filter_node.name) self.fuse_bias(G, filter_node, other_idx, op, flat_value, 1) - out_edges = G.out_edges(op_node.name) + # save out edges and remove the mul or add + out_edges = G.out_edges(op_node) G.remove(op_node) if remove_constant: G.remove(const_node) + + # it's possible that there is a broadcast on the op from the constant + # if there is insert a reshape since it will no longer happen + in_shape = tuple(op_node.in_dims[out_edge.to_idx].shape) + out_shape = tuple(op_node.out_dims[0].shape) from_node = seen_reshape[-1] if seen_reshape else filter_node + if in_shape != out_shape: + reshape = ReshapeParameters(G.unique_name(f'{op_node.name}_reshape'), + old_shape=in_shape, shape=out_shape) + G.add_edge(NNEdge(from_node=from_node, to_node=reshape)) + from_node = reshape + + # connect up the output nodes for edge in out_edges: G.add_edge(NNEdge(from_node=from_node, to_node=edge.to_node, to_idx=edge.to_idx)) diff --git a/tools/nntool/graph/matches/matchers/match_external_bias_matmul.py b/tools/nntool/graph/matches/matchers/fuse_external_bias_matmul.py similarity index 92% rename from tools/nntool/graph/matches/matchers/match_external_bias_matmul.py rename to 
tools/nntool/graph/matches/matchers/fuse_external_bias_matmul.py index 04aa887c8..03c50aee0 100644 --- a/tools/nntool/graph/matches/matchers/match_external_bias_matmul.py +++ b/tools/nntool/graph/matches/matchers/fuse_external_bias_matmul.py @@ -21,7 +21,7 @@ RemoveUnnecessaryQuantizeOperators from graph.types import (ConstantInputParameters, MatrixAddParameters, MatrixMulParameters, NNEdge) -from graph.types.tensor_arithmetic import MatMulOpParameters +from graph.types.tensor_arithmetic import MatMulOpParameters, MatMulTransposedParameters from quantization.quantizer.new_quantizer import NewQuantizer from utils.graph import GraphView from utils.node_id import NodeId @@ -65,7 +65,7 @@ def reverse_matmul(G: GraphView, params): @match_name('fuse_external_bias_matmul') @description('Fuse bias addition after matmul') @groups('scaled', 'symmetric') -@run_before('match_op_activation', 'move_pooling_scale8', 'move_activations_pow2', 'move_activations_scale8', 'fuse_op_activation_scale8', 'fuse_op_activation_pow2') +@run_before('fuse_op_activation_scale8', 'fuse_op_activation_pow2', 'move_pooling_scale8', 'move_activations_up', 'fuse_op_activation_scale8', 'fuse_op_activation_pow2') class MatchExternalBiasMatmul(Matcher): def _match(self, G: GraphView, set_identity: bool = True, **kwargs): @@ -98,10 +98,7 @@ def _match(self, G: GraphView, set_identity: bool = True, **kwargs): flat_value = const_node.dqvalue.flatten() out_shape = matmul.out_dims[0].shape - if len(out_shape) != 2: - raise ValueError( - f'strange outputs shape of {out_shape} for matmul {params.name}') - if len(flat_value) != out_shape[0] and len(flat_value) != out_shape[1]: + if len(flat_value) != out_shape[-1] and len(flat_value) != out_shape[-2]: LOG.info("can't fuse %s into %s - value shape is not correct for bias", const_node.name, matmul.name) break @@ -110,7 +107,7 @@ def _match(self, G: GraphView, set_identity: bool = True, **kwargs): out_node = seen_reshape[-1] if seen_reshape else matmul if 
isinstance(op_node, MatrixAddParameters): if has_bias: - if len(flat_value.shape) != len(matmul.in_dims[2]): + if len(flat_value) != matmul.in_dims[2].size(): LOG.info( "can't fuse %s into %s - bias shape is not the same", const_node.name, matmul.name) break @@ -120,7 +117,7 @@ def _match(self, G: GraphView, set_identity: bool = True, **kwargs): "folding additive bias from %s into existing bias on %s", op_node.name, matmul.name) bias_node.value = bias_node.dqvalue + flat_value else: - if len(flat_value) != out_shape[1]: + if len(flat_value) != out_shape[-1]: # matmul needs to be transposed to fuse this in_nodes, trans_node = reverse_matmul(G, matmul) if seen_reshape: diff --git a/tools/nntool/graph/matches/matchers/fuse_gap_convs.py b/tools/nntool/graph/matches/matchers/fuse_gap_convs.py new file mode 100644 index 000000000..1b4c368cd --- /dev/null +++ b/tools/nntool/graph/matches/matchers/fuse_gap_convs.py @@ -0,0 +1,226 @@ +# Copyright (C) 2020 GreenWaves Technologies, SAS + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . 
+ +import logging +from copy import deepcopy + +from graph.types import (ActivationParameters, Conv2DParameters, + ConvFusionParameters, HSigmoidActivationParameters, + HSwishActivationParameters, LeakyActivationParameters, + NNEdge, PoolingParameters, ReluActivationParameters, + SigmoidActivationParameters) +from graph.types.activations import (HTanHActivationParameters, + TanHActivationParameters) +from graph.types.base import NNNodeRef +from graph.types.fusions import FusionInputParameters, FusionOutputParameters +from utils.graph import GraphView, NodeRef + +from ..matcher import (Matcher, description, groups, match_name, + run_adjust_on_match, run_qtune_on_match) + +LOG = logging.getLogger("nntool." + __name__) + +VALID_ACTIVATIONS_SQ8 = ( + ReluActivationParameters, + LeakyActivationParameters, + HSigmoidActivationParameters, + HSwishActivationParameters, + SigmoidActivationParameters, + TanHActivationParameters, + HTanHActivationParameters +) + +VALID_FUSIONS_SQ8 = ( + 'conv_active', + 'conv_max_active', + 'conv_average_active', + 'conv_active_max', +) + +VALID_ACTIVATIONS_POW2 = ( + ReluActivationParameters, + LeakyActivationParameters, + HSigmoidActivationParameters, + HSwishActivationParameters, + SigmoidActivationParameters +) + +VALID_FUSIONS_POW2 = ( + 'conv_active', + 'conv_max_active', + 'conv_average_active', + 'conv_active_max', +) + + +class MergeStopError(Exception): + pass + + +class MergeAbortError(Exception): + pass +class NewFusionMatch(): + def __init__(self, valid_activations, valid_fusions) -> None: + self.fusion = None + self.conv = None + self.pool = None + self.active = None + self.valid_activations = valid_activations + self.valid_fusions = valid_fusions + self.order = [] + self.nodes_in_fusion = 0 + + @classmethod + def from_node(cls, G, node, valid_activations, valid_fusions) -> 'NewFusionMatch': + matcher = cls(valid_activations, valid_fusions) + try: + matcher.add_node(node) + while node: + edges = G.out_edges(node) + if len(edges) 
> 1: + break + node = edges[0].to_node + matcher.add_node(node) + except MergeStopError: + pass + except MergeAbortError: + return None + return matcher + + @staticmethod + def calc_fusion_type(contents, pool_type=False): + return '_'.join(['conv' if isinstance(params, Conv2DParameters) + else 'active' if isinstance(params, ActivationParameters) + else params.pool_type if pool_type else 'pool' for params in contents]) + + def can_add(self, node): + fusion_type = self.calc_fusion_type( + self.order + [node], pool_type=True) + return any(valid_fusion.startswith(fusion_type) for valid_fusion in self.valid_fusions) + + def add_node(self, params, in_fusion=False): + if in_fusion: + self.nodes_in_fusion += 1 + if isinstance(params, ConvFusionParameters): + if self.fusion: + raise MergeStopError() # @IgnoreException + self.fusion = params + try: + for cnode in params.contained_nodes(): + self.add_node(cnode, in_fusion=True) + except MergeStopError: # @IgnoreException + raise MergeAbortError() + elif isinstance(params, Conv2DParameters): + if self.conv or not self.can_add(params): + raise MergeStopError() # @IgnoreException + self.order.append(params) + self.conv = params + elif isinstance(params, self.valid_activations): + if self.active or not self.can_add(params): + raise MergeStopError() # @IgnoreException + self.order.append(params) + self.active = params + elif isinstance(params, PoolingParameters): + if self.pool or not self.can_add(params): + raise MergeStopError() # @IgnoreException + self.order.append(params) + self.pool = params + else: + raise MergeStopError() # @IgnoreException + + @property + def can_fuse(self): + return (self.calc_fusion_type(self.order, pool_type=True) in self.valid_fusions + and len(self.order) > self.nodes_in_fusion) + + def fuse(self, G: GraphView): + fusion_outputs = G.out_edges(self.order[-1]) + if self.fusion is None: + fuse_node_name = G.unique_name(self.conv.name + '_fusion') + subg = GraphView() + inputs = [NNNodeRef(subg, 
FusionInputParameters(f'{fuse_node_name}_in_{idx}', + idx=idx, dims=self.conv.in_dims[0].shape), 0) for idx in range(3)] + in_edges = G.indexed_in_edges(self.conv) + else: + fuse_node_name = self.fusion.name + subg = self.fusion.subgraph + subg_output = subg.outputs() + assert len(subg_output) == 1 + inputs = [NNNodeRef(subg, subg.in_edges(subg_output[0])[0].from_node, 0)] + subg.remove_all(subg_output) + in_edges = None + nodes_to_fuse = self.order[self.nodes_in_fusion:] + LOG.info(f'fusing nodes {",".join(node.name for node in nodes_to_fuse)}' + f' into {fuse_node_name}') + node = None + while nodes_to_fuse: + node = nodes_to_fuse.pop(0) + G.remove(node) + inputs = [node(*inputs)] + FusionOutputParameters( + f'{fuse_node_name}_out_0', + dims=node.out_dims[0].shape)(*inputs) + if not self.fusion: + fusion = ConvFusionParameters( + fuse_node_name, + fusion_type=self.fusion_type, + subgraph=subg, + in_dims_hint=self.conv.in_dims_hint, + out_dims_hint=self.conv.out_dims_hint, + in_dims=deepcopy(self.conv.in_dims), + out_dims=deepcopy(self.order[-1].out_dims), + inout_set=True) + for edge in in_edges: + G.add_edge(edge.clone(to_node=fusion)) + else: # in the fusion case the outputs will already be removed since the node after was removed + fusion = self.fusion + for edge in fusion_outputs: + G.add_edge(edge.clone(from_node=fusion)) + if G.quantization: + for node in self.order[self.nodes_in_fusion:]: + G.quantization.move_to_fusion(node, fusion) + + + @property + def fusion_type(self): + return self.calc_fusion_type(self.order) + + +@groups('*') +@match_name("fuse_gap_convs") +@run_qtune_on_match +@description('Fuse convolutions, pools and activations to match GAP AutoTiler operations') +class MatchAllGapConv(Matcher): + def _match(self, G: GraphView, set_identity: bool = True, **kwargs): + has_modified_graph = False + group_identity = kwargs.get('group_identity') + if group_identity == 'pow2_match_group': + valid_activations = VALID_ACTIVATIONS_POW2 + valid_fusions = 
VALID_FUSIONS_POW2 + else: + valid_activations = VALID_ACTIVATIONS_SQ8 + valid_fusions = VALID_FUSIONS_SQ8 + + for conv_node in G.nodes(node_classes=(Conv2DParameters, ConvFusionParameters)): + matcher = NewFusionMatch.from_node(G, conv_node, valid_activations, valid_fusions) + if not matcher or not matcher.can_fuse: + continue + has_modified_graph = True + matcher.fuse(G) + + if set_identity: + self.set_identity(G) + + return has_modified_graph diff --git a/tools/nntool/graph/matches/matchers/match_gap_linear.py b/tools/nntool/graph/matches/matchers/fuse_gap_linear.py similarity index 89% rename from tools/nntool/graph/matches/matchers/match_gap_linear.py rename to tools/nntool/graph/matches/matchers/fuse_gap_linear.py index 634bcb62b..4ba1624e6 100644 --- a/tools/nntool/graph/matches/matchers/match_gap_linear.py +++ b/tools/nntool/graph/matches/matchers/fuse_gap_linear.py @@ -19,11 +19,11 @@ HSwishActivationParameters, LeakyActivationParameters, LinearFusionParameters, NNEdge, ReluActivationParameters, SigmoidActivationParameters) -from quantization.new_qrec import QRec +from graph.types.activations import TanHActivationParameters from utils.graph import GraphView -from utils.node_id import NodeId -from ..matcher import Matcher, description, groups, match_name +from ..matcher import (Matcher, description, groups, match_name, + run_adjust_on_match, run_qtune_on_match) LOG = logging.getLogger("nntool." 
+ __name__) @@ -32,7 +32,8 @@ LeakyActivationParameters, HSigmoidActivationParameters, HSwishActivationParameters, - SigmoidActivationParameters + SigmoidActivationParameters, + TanHActivationParameters, ) VALID_ACTIVATIONS_POW2 = ( @@ -80,6 +81,8 @@ def move_stats_to_fusion(fusion, stats): @groups('*') +@run_qtune_on_match +@run_adjust_on_match @match_name("fuse_gap_linear") @description('Fuse linear layers and activations to match GAP AutoTiler operations') class MatchGapLinear(Matcher): @@ -124,14 +127,8 @@ def _match(self, G: GraphView, set_identity: bool = True, **kwargs): input_mapping=input_mapping, output_mapping=output_mapping) if G.quantization: - # TODO - stats - qrecs = G.quantization.get_all(pnode.contained_nodes()) - if qrecs: - prec = QRec.copy_ktype( - qrecs[0], in_qs=qrecs[0].in_qs, out_qs=qrecs[-1].out_qs) - for node in pnode.contained_nodes(): - G.quantization.move_to_fusion(node, pnode) - G.quantization[NodeId(pnode)] = prec + for node in pnode.contained_nodes(): + G.quantization.move_to_fusion(node, pnode) in_edges = G.in_edges(node_list.linear.name) out_edges = G.out_edges(last_node.name) for node in node_list.order: diff --git a/tools/nntool/graph/matches/matchers/match_gap_pool.py b/tools/nntool/graph/matches/matchers/fuse_gap_pool.py similarity index 84% rename from tools/nntool/graph/matches/matchers/match_gap_pool.py rename to tools/nntool/graph/matches/matchers/fuse_gap_pool.py index 8b0314d5a..db7aa2997 100644 --- a/tools/nntool/graph/matches/matchers/match_gap_pool.py +++ b/tools/nntool/graph/matches/matchers/fuse_gap_pool.py @@ -14,20 +14,17 @@ # along with this program. If not, see . 
import logging -from copy import deepcopy -import numpy as np from graph.types import (HSigmoidActivationParameters, HSwishActivationParameters, LeakyActivationParameters, NNEdge, PoolingParameters, ReluActivationParameters, SigmoidActivationParameters) from graph.types.fusions import ActivationFusion from graph.types.global_pooling import GlobalPoolingParameters -from quantization.new_qrec import QRec from utils.graph import GraphView -from utils.node_id import NodeId -from ..matcher import Matcher, description, groups, match_name, run_after +from ..matcher import (Matcher, description, groups, match_name, + run_adjust_on_match, run_after, run_qtune_on_match) LOG = logging.getLogger("nntool." + __name__) @@ -86,7 +83,10 @@ def fusion_type(self): @groups('*') + @match_name("fuse_gap_pool") +@run_qtune_on_match +@run_adjust_on_match @description('Fuse pooling layers and activations to match GAP AutoTiler operations') @run_after('fuse_gap_convs') class MatchGapPool(Matcher): @@ -135,19 +135,8 @@ def _match(self, G: GraphView, set_identity: bool = True, **kwargs): output_mapping=output_mapping) if G.quantization: # TODO - stats - qrecs = G.quantization.get_all(pnode.contained_nodes()) - if qrecs: - prec = QRec.copy_ktype( - qrecs[0], in_qs=qrecs[0].in_qs, out_qs=qrecs[-1].out_qs) - for node in pnode.contained_nodes(): - G.quantization.move_to_fusion(node, pnode) - if isinstance(node, GlobalPoolingParameters): - # Global pooling fused with activations need to have only the activation scale - G.quantization[NodeId(pnode, node)].out_qs[0] = deepcopy( - G.quantization[NodeId(pnode, node)].in_qs[0]) - G.quantization[NodeId( - pnode, node)].out_qs[0].dtype = np.int32 - G.quantization[NodeId(pnode)] = prec + for node in pnode.contained_nodes(): + G.quantization.move_to_fusion(node, pnode) in_edges = G.in_edges(node_list.pool.name) out_edges = G.out_edges(last_node.name) for node in node_list.order: diff --git a/tools/nntool/graph/matches/matchers/matscale.py 
b/tools/nntool/graph/matches/matchers/fuse_matscale.py similarity index 100% rename from tools/nntool/graph/matches/matchers/matscale.py rename to tools/nntool/graph/matches/matchers/fuse_matscale.py diff --git a/tools/nntool/graph/matches/matchers/match_op_activation.py b/tools/nntool/graph/matches/matchers/fuse_op_activation.py similarity index 92% rename from tools/nntool/graph/matches/matchers/match_op_activation.py rename to tools/nntool/graph/matches/matchers/fuse_op_activation.py index 406dbde5c..11dd896f3 100644 --- a/tools/nntool/graph/matches/matchers/match_op_activation.py +++ b/tools/nntool/graph/matches/matchers/fuse_op_activation.py @@ -13,7 +13,6 @@ # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see . -from graph.types.tensor_arithmetic import MatMulTransposedParameters import logging from abc import abstractproperty @@ -22,14 +21,13 @@ GlobalPoolingParameters, HSigmoidActivationParameters, HSwishActivationParameters, LeakyActivationParameters, MatMulOpFusionParameters, MatMulOpParameters, - MatrixAddParameters, NNEdge, - PoolingParameters, ReluActivationParameters, - SigmoidActivationParameters) -from quantization.new_qrec import QRec + MatrixAddParameters, NNEdge, PoolingParameters, + ReluActivationParameters, SigmoidActivationParameters) +from graph.types.tensor_arithmetic import MatMulTransposedParameters from utils.graph import GraphView from utils.node_id import NodeId -from ..matcher import Matcher, description, groups, match_name, run_after +from ..matcher import Matcher, description, groups, match_name, run_after, run_qtune_on_match, run_adjust_on_match LOG = logging.getLogger("nntool." 
+ __name__) @@ -134,6 +132,8 @@ def fusion_type(self): @run_after('fuse_gap_pool', 'fuse_external_bias_matmul') +@run_qtune_on_match +@run_adjust_on_match class MatchOpActivation(Matcher): @abstractproperty @@ -179,13 +179,8 @@ def _match(self, G: GraphView, set_identity: bool = True, **kwargs): output_mapping=output_mapping) if G.quantization: # TODO - stats - qrecs = G.quantization.get_all(pnode.contained_nodes()) - if qrecs: - prec = QRec.copy_ktype( - qrecs[0], in_qs=qrecs[0].in_qs, out_qs=qrecs[-1].out_qs) - for fnode in pnode.contained_nodes(): - G.quantization.move_to_fusion(fnode, pnode) - G.quantization[NodeId(pnode)] = prec + for fnode in pnode.contained_nodes(): + G.quantization.move_to_fusion(fnode, pnode) in_edges = G.in_edges(node_list.node.name) out_edges = G.out_edges(last_node.name) for snode in node_list.order: diff --git a/tools/nntool/graph/matches/matchers/fuse_pad.py b/tools/nntool/graph/matches/matchers/fuse_pad.py index d5826ce5b..ed00d72eb 100644 --- a/tools/nntool/graph/matches/matchers/fuse_pad.py +++ b/tools/nntool/graph/matches/matchers/fuse_pad.py @@ -53,7 +53,7 @@ def expand_padding(from_shape, to_shape, padding): @match_name('fuse_pad') @description('Fuse pad operation to subsequent Convolution or Pool') @groups('*') -@run_before('match_gap_conv', 'match_gap_pool') +@run_before('fuse_gap_convs', 'fuse_gap_pool') class MatchFusePad(Matcher): @staticmethod def remove_padding(shape, padding): diff --git a/tools/nntool/graph/matches/matchers/match_channel_padded_add.py b/tools/nntool/graph/matches/matchers/fuse_padded_add.py similarity index 91% rename from tools/nntool/graph/matches/matchers/match_channel_padded_add.py rename to tools/nntool/graph/matches/matchers/fuse_padded_add.py index a650765a4..e4f8ba545 100644 --- a/tools/nntool/graph/matches/matchers/match_channel_padded_add.py +++ b/tools/nntool/graph/matches/matchers/fuse_padded_add.py @@ -22,7 +22,7 @@ from utils.graph import GraphView from utils.node_id import NodeId -from 
..matcher import Matcher, match_name, description, groups +from ..matcher import Matcher, match_name, description, groups, run_before, run_qtune_on_match LOG = logging.getLogger("nntool." + __name__) @@ -64,6 +64,8 @@ def fusion_type(self): @match_name('fuse_padded_add') @description('Fuse convolutions, pools and activations to match GAP AutoTiler operations') +@run_before('fuse_op_activation_scale8') +@run_qtune_on_match @groups('scaled') class MatchPadAddAct(Matcher): @@ -109,14 +111,8 @@ def _match(self, G: GraphView, set_identity: bool = True, **kwargs): input_mapping=input_mapping, output_mapping=output_mapping) if G.quantization: - qrecs = G.quantization.get_all(pnode.contained_nodes()) - # TODO - stats - if qrecs: - prec = QRec.copy_ktype( - qrecs[1], in_qs=qrecs[1].in_qs, out_qs=qrecs[-1].out_qs) - for node in pnode.contained_nodes(): - G.quantization.move_to_fusion(node, pnode) - G.quantization[NodeId(pnode)] = prec + for node in pnode.contained_nodes(): + G.quantization.move_to_fusion(node, pnode) if padded_input_idx == 0: in_edges = G.in_edges(node_list.pad.name) + \ G.indexed_in_edges(node_list.add.name)[1::] diff --git a/tools/nntool/graph/matches/matchers/insert_copies.py b/tools/nntool/graph/matches/matchers/insert_copies.py index 29b53c275..dce28ef08 100644 --- a/tools/nntool/graph/matches/matchers/insert_copies.py +++ b/tools/nntool/graph/matches/matchers/insert_copies.py @@ -16,8 +16,8 @@ import logging from copy import deepcopy -from graph.types import (ConcatParameters, CopyParameters, InputParameters, - NNEdge, NoOPParameters, OutputParameters, +from graph.types import (ConcatParameters, CopyParameters, InputParameters, RNNBaseParameters, + NNEdge, NoOPParameters, OutputParameters, ConstantInputParameters, ReshapeParameters, SplitParameters, TransposeParameters) from quantization.new_qrec import QRec @@ -30,34 +30,30 @@ LOG = logging.getLogger("nntool." 
+ __name__) -def find_real_in_edge(G, edge): - from_node = edge.from_node - if isinstance(from_node, ReshapeParameters): - res = find_real_in_edge(G, G.in_edges(from_node.name)[0]) - return res - if isinstance(from_node, NoOPParameters): - res = find_real_in_edge(G, G.in_edges(from_node.name)[0]) - return res - if isinstance(from_node, TransposeParameters): - _, real_transpose = from_node.real_shape() - if len(real_transpose) <= 1: - res = find_real_in_edge(G, G.in_edges(from_node.name)[0]) - return res - return (edge.from_node, edge.from_idx) +class VisitEdge(): + def __init__(self, edge, direction) -> None: + self.edge = edge + self.direction = direction + + def __eq__(self, other: object) -> bool: + return isinstance(other, VisitEdge) and self.edge == other.edge + + def __hash__(self) -> int: + return self.edge.__hash__() @match_name('insert_copies') -@description('insert copy nodes on edges that link splits to concats') -@run_after('insert_transposes') +@description('insert copy nodes on edges that link stacked tensors between themselves or to inputs or outputs') @groups('*') @needs_valid_dimension(True) class MatchInsertCopies(Matcher): @staticmethod def can_pass_node(node): - return (isinstance(node, (ReshapeParameters, NoOPParameters)) or - isinstance(node, TransposeParameters) and node.does_nothing) + # nodes that do not generate any kernel + return node.no_model_code def find_split_concat_down(self, G, edge): + # search for a split or concat on any downward edge if isinstance(edge.to_node, (SplitParameters, ConcatParameters)): return True elif self.can_pass_node(edge.to_node): @@ -76,7 +72,7 @@ def search_up_for_duplicate(self, G, edge): for out_edge in out_edges: if self.find_split_concat_down(G, out_edge): return out_edge - elif self.can_pass_node(edge.to_node): + elif self.can_pass_node(edge.from_node): return self.search_up_for_duplicate(G, G.in_edges(edge.from_node)[0]) return None @@ -91,15 +87,22 @@ def insert_copy_at_edge(G, edge): if 
G.quantization and nid in G.quantization: qrec = G.quantization[nid] qtype = deepcopy(qrec.out_qs[edge.from_idx]) - QRec.copy_ktype(qrec, in_qs=[qtype], out_qs=[qtype]) + G.quantization[NodeId(copy_node)] = QRec.copy_ktype(qrec, in_qs=[qtype], out_qs=[qtype]) def find_common_in_edges(self, G: GraphView): - # Look for splits and concats that share a common in edge where a copy is necessary - nodes = G.nodes(node_classes=(SplitParameters, ConcatParameters)) + # Look for splits and concats that share a common in edge + # the split is a stacked tensor and the concat is an alias in a different stack + # a copy is always necessary + # RNNBaseParameters are also here since they create a UserKernelGroup where their + # input can already be in a stack so causes a tiler error + nodes = G.nodes(node_classes=(SplitParameters, + ConcatParameters, RNNBaseParameters)) has_modified_graph = False while nodes: node = nodes.pop(0) - for in_edge in G.in_edges(node): + for in_edge in G.indexed_in_edges(node): + if isinstance(node, RNNBaseParameters) and in_edge.to_idx > 0: + break # find another edge that would be generated as the same edge # with a concat/split on it. 
If found then insert a copy # and search again on that node to find others @@ -119,38 +122,87 @@ def search_up_for(self, G, edge, node_class): return self.search_up_for(G, G.in_edges(edge.from_node)[0], node_class) return None - def insert_copy_split_to_output_or_concat(self, G): - # insert copys between splits and outputs or concats - nodes = G.nodes(node_classes=(ConcatParameters, OutputParameters)) - has_modified_graph = False - while nodes: - node = nodes.pop(0) - for edge in G.in_edges(node): - split_edge = self.search_up_for(G, edge, SplitParameters) - if split_edge is None: - continue - has_modified_graph = True - self.insert_copy_at_edge(G, split_edge) - return has_modified_graph + def on_same_edge_as(self, G, node, node_class, visited=None, last_direction=None, start_edge=None): + if visited is None: + visited = set() + if start_edge: + to_visit = {VisitEdge(start_edge, 'up')} + else: + to_visit = set(VisitEdge(edge, 'up') for edge in G.in_edges(node)) + + while to_visit: + visited_edge = to_visit.pop() + visit_node = visited_edge.edge.from_node if visited_edge.direction == "up" else visited_edge.edge.to_node + visited.add(visited_edge) + # if node class is a tuple of class and direction then see if we are visiting that side of the class + # this copes with splits and concats that are converted to stacks in the model. + # A stack alias cannot be an output. 
The aliases will be the output side of a split and the input side + # of a concat + if isinstance(node_class[0], tuple): + if visited_edge.direction == "up": + if any(pair[1] == "down" and isinstance(visit_node, pair[0]) for pair in node_class): + return visited_edge.edge + else: + if any(pair[1] == "up" and isinstance(visit_node, pair[0]) for pair in node_class): + return visited_edge.edge + elif isinstance(visit_node, node_class): + return visited_edge.edge + if visited_edge.direction == "up": + if self.can_pass_node(visit_node): + # if arriving upwards on the node and can pass visit all its edges that we have not visited + to_visit |= (set(VisitEdge(edge, 'up') + for edge in G.in_edges(visit_node)) - visited) + to_visit |= (set(VisitEdge(edge, 'down') + for edge in G.out_edges(visit_node)) - visited) + else: + # can't pass but must still visit all the edges on the same out idx + to_visit |= (set(VisitEdge(edge, 'down') + for edge in G.indexed_out_edges(visit_node)[visited_edge.edge.from_idx]) - visited) + else: + # the other up edges case is not here since if it is a concat it would already have + # triggered the return + if self.can_pass_node(visit_node): + to_visit |= (set(VisitEdge(edge, 'down') + for edge in G.out_edges(visit_node)) - visited) + return None - def insert_copy_input_or_split_to_concat(self, G): - # insert copies between inputs or splits and concats - nodes = G.nodes(node_classes=(ConcatParameters)) + COPIES_BETWEEN_CLSES = [ + # any stack -> any stack (different memory - must always copy) + {'from': (SplitParameters, ConcatParameters), 'to': ( + ConcatParameters, SplitParameters)}, + # stacked memory -> stack in user kernel group + {'from': (SplitParameters, ConcatParameters), 'to': (RNNBaseParameters,)}, + # stacked alias -> output (name is lost since it is an alias - could be fixed in AT) + {'from': ((SplitParameters, 'down'), (ConcatParameters, 'up')), 'to': (OutputParameters,)}, + # # stacked tensor -> alias (different memory - must always 
copy) + # input -> stacked tensor (can be fixed in AT) + # input -> stacked alias (name is lost since it is an alias - could be fixed in AT) + {'from': (InputParameters,), 'to': ( + ConcatParameters, SplitParameters)}, + # constant -> stacked alias (I guess the constant could be loaded here in some cases by AT) + # constant -> stack (not sure if this works or not. Including for safety. If it doesn't work it could.) + {'from': (ConstantInputParameters,), 'to': ( + ConcatParameters, SplitParameters)}, + ] + + def insert_copies_between(self, G, from_clses, to_clses): + # insert copys between splits and outputs or concats + nodes = G.nodes(node_classes=to_clses) has_modified_graph = False while nodes: node = nodes.pop(0) - for edge in G.in_edges(node): - input_edge = self.search_up_for( - G, edge, (SplitParameters, InputParameters)) - if input_edge is None: - continue - has_modified_graph = True - self.insert_copy_at_edge(G, input_edge) + for edge in G.indexed_in_edges(node): + found_edge = self.on_same_edge_as(G, node, from_clses, start_edge=edge) + if found_edge: + has_modified_graph = True + self.insert_copy_at_edge(G, found_edge) return has_modified_graph def _match(self, G: GraphView, set_identity: bool = True, **kwargs): - has_modified_graph = self.insert_copy_input_or_split_to_concat(G) - has_modified_graph |= self.insert_copy_split_to_output_or_concat(G) + has_modified_graph = False + for clses in self.COPIES_BETWEEN_CLSES: + has_modified_graph |= self.insert_copies_between( + G, clses['from'], clses['to']) has_modified_graph |= self.find_common_in_edges(G) return has_modified_graph diff --git a/tools/nntool/graph/matches/matchers/match_gap_conv.py b/tools/nntool/graph/matches/matchers/match_gap_conv.py deleted file mode 100644 index cbb5ca865..000000000 --- a/tools/nntool/graph/matches/matchers/match_gap_conv.py +++ /dev/null @@ -1,171 +0,0 @@ -# Copyright (C) 2020 GreenWaves Technologies, SAS - -# This program is free software: you can redistribute it and/or 
modify -# it under the terms of the GNU Affero General Public License as -# published by the Free Software Foundation, either version 3 of the -# License, or (at your option) any later version. - -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU Affero General Public License for more details. - -# You should have received a copy of the GNU Affero General Public License -# along with this program. If not, see . - -from graph.types.activations import TanHActivationParameters -import logging -from copy import deepcopy - -from graph.types import (ActivationParameters, Conv2DParameters, - ConvFusionParameters, HSigmoidActivationParameters, - HSwishActivationParameters, LeakyActivationParameters, - NNEdge, PoolingParameters, ReluActivationParameters, - SigmoidActivationParameters) -from quantization.new_qrec import QRec -from utils.graph import GraphView -from utils.node_id import NodeId - -from ..matcher import Matcher, description, groups, match_name - -LOG = logging.getLogger("nntool." 
+ __name__) - -VALID_ACTIVATIONS_SQ8 = ( - ReluActivationParameters, - LeakyActivationParameters, - HSigmoidActivationParameters, - HSwishActivationParameters, - SigmoidActivationParameters, - TanHActivationParameters -) - -VALID_ACTIVATIONS_POW2 = ( - ReluActivationParameters, - LeakyActivationParameters, - HSigmoidActivationParameters, - HSwishActivationParameters, - SigmoidActivationParameters -) - - -class FusionMatch(): - def __init__(self, valid_activations) -> None: - self.conv = None - self.pool = None - self.active = None - self.tensor_order = None - self.valid_activations = valid_activations - self.order = [] - - def add_node(self, params): - if isinstance(params, Conv2DParameters): - if self.conv: - return None - self.tensor_order = params.ker_out_order[0] - self.order.append(params) - self.conv = params - return self - elif isinstance(params, self.valid_activations): - if self.active: - return None - self.order.append(params) - self.active = params - return self - elif isinstance(params, PoolingParameters): - if self.pool: - return None - if self.tensor_order != params.ker_in_order[0]: - return None - self.order.append(params) - self.pool = params - return self - else: - return None - - @property - def fusion_type(self): - return '_'.join(['conv' if isinstance(params, Conv2DParameters) - else 'active' if isinstance(params, ActivationParameters) - else 'pool' for params in self.order]) - - -@groups('*') -@match_name("fuse_gap_convs") -@description('Fuse convolutions, pools and activations to match GAP AutoTiler operations') -class MatchAllGapConv(Matcher): - - def get_node_list(self, G, params, valid_activations, result=None): - if result is None: - result = FusionMatch(valid_activations) - if not result.add_node(params): - return result - out_edges = G.out_edges(params.name) - if len(out_edges) > 1: - return result - return self.get_node_list(G, out_edges[0].to_node, valid_activations, result=result) - - def _match(self, G: GraphView, set_identity: bool 
= True, **kwargs): - has_modified_graph = False - group_identity = kwargs.get('group_identity') - if group_identity == 'pow2_match_group': - valid_activations = VALID_ACTIVATIONS_POW2 - else: - valid_activations = VALID_ACTIVATIONS_SQ8 - for conv_node in [params for params in G.nodes() if isinstance(params, Conv2DParameters)]: - node_list = self.get_node_list(G, conv_node, valid_activations) - if node_list is None or len(node_list.order) < 2: - continue - if node_list.fusion_type == 'conv_active_pool': - if node_list.pool.pool_type == "average": - node_list.order = node_list.order[:2:] - node_list.pool = None - elif node_list.fusion_type == 'conv_pool_active': - # NOTE: This is only for old POW2 kernels - SQ8 can handle this - if node_list.pool.pool_type == "average" and node_list.active.activation != "relu": - continue - LOG.info("fusing nodes %s", ",".join( - (node.name for node in node_list.order))) - has_modified_graph = True - subgraph = GraphView() - last_node = None - for node in node_list.order: - if last_node is not None: - subgraph.add_edge( - NNEdge(from_node=last_node, to_node=node)) - last_node = node - input_mapping = [[(node_list.conv, idx)] for idx in range(3)] - output_mapping = [(last_node, 0)] - pnode = ConvFusionParameters( - node_list.conv.name + '_fusion', - fusion_type=node_list.fusion_type, - subgraph=subgraph, - in_dims_hint=node_list.conv.in_dims_hint, - out_dims_hint=node_list.conv.out_dims_hint, - in_dims=deepcopy(node_list.conv.in_dims), - out_dims=deepcopy(node_list.order[-1].out_dims), - input_mapping=input_mapping, - output_mapping=output_mapping) - if G.quantization: - qrecs = G.quantization.get_all(pnode.contained_nodes()) - if qrecs: - # TODO - stats - prec = QRec.copy_ktype( - qrecs[0], in_qs=deepcopy(qrecs[0].in_qs), out_qs=deepcopy(qrecs[-1].out_qs)) - for node in pnode.contained_nodes(): - G.quantization.move_to_fusion(node, pnode) - G.quantization[NodeId(pnode)] = prec - in_edges = G.in_edges(node_list.conv.name) - out_edges 
= G.out_edges(last_node.name) - for node in node_list.order: - G.remove(node) - for edge in in_edges: - G.add_edge(NNEdge(edge.from_node, pnode, - from_idx=edge.from_idx, to_idx=edge.to_idx)) - for edge in out_edges: - G.add_edge(NNEdge(pnode, edge.to_node, - from_idx=edge.from_idx, to_idx=edge.to_idx)) - - if set_identity: - self.set_identity(G) - - return has_modified_graph diff --git a/tools/nntool/graph/matches/matchers/find_hsigmoid.py b/tools/nntool/graph/matches/matchers/match_hsigmoid.py similarity index 100% rename from tools/nntool/graph/matches/matchers/find_hsigmoid.py rename to tools/nntool/graph/matches/matchers/match_hsigmoid.py diff --git a/tools/nntool/graph/matches/matchers/match_matmul_add_bias.py b/tools/nntool/graph/matches/matchers/match_matmul_add_bias.py deleted file mode 100644 index 16e4b6b2c..000000000 --- a/tools/nntool/graph/matches/matchers/match_matmul_add_bias.py +++ /dev/null @@ -1,140 +0,0 @@ -# Copyright (C) 2020 GreenWaves Technologies, SAS - -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU Affero General Public License as -# published by the Free Software Foundation, either version 3 of the -# License, or (at your option) any later version. - -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU Affero General Public License for more details. - -# You should have received a copy of the GNU Affero General Public License -# along with this program. If not, see . 
- -# import logging - -# from graph.types import (ActivationParameters, ConstantInputParameters, -# MatMulOpFusionParameters, MatMulOpParameters, -# MatrixAddParameters, NNEdge) -# from quantization.new_qrec import QRec -# from utils.graph import GraphView -# from utils.node_id import NodeId - -# from ..matcher import Matcher, groups, match_name, description - -# LOG = logging.getLogger("nntool." + __name__) - - -# class FusionMatch(): -# def __init__(self) -> None: -# self.matmul = None -# self.add = None -# self.active = None -# self.order = [] - -# def add_node(self, params, G): -# if isinstance(params, MatMulOpParameters): -# if self.matmul: -# return None -# self.order.append(params) -# self.matmul = params -# return self -# elif isinstance(params, ActivationParameters): -# if self.active: -# return None -# self.order.append(params) -# self.active = params -# return self -# elif isinstance(params, MatrixAddParameters): -# if self.add or self.active: -# return None -# can_fuse = False -# for in_edge in G.in_edges(params.name): -# can_fuse = can_fuse or isinstance( -# in_edge.from_node, ConstantInputParameters) -# if not can_fuse: -# return None -# self.order.append(params) -# self.add = params -# return self -# else: -# return None - -# @property -# def fusion_type(self): -# return '_'.join(['matmul' if isinstance(params, MatMulOpParameters) -# else 'with_bias' if isinstance(params, MatrixAddParameters) -# else 'active' for params in self.order]) - -# @groups('*') -# @match_name("fuse_gap_matmul") -# @description('Fuse matmul layers with optional add and/or activations to match GAP AutoTiler operations') -# class MatchMatMulAddBias(Matcher): - -# def get_node_list(self, G, params, result=None): -# if result is None: -# result = FusionMatch() -# if not result.add_node(params, G): -# return result -# out_edges = G.out_edges(params.name) -# if len(out_edges) > 1: -# return result -# return self.get_node_list(G, out_edges[0].to_node, result=result) - -# def 
_match(self, G: GraphView, set_identity: bool = True, **kwargs): -# has_modified_graph = False -# for matmul_node in [params for params in G.nodes() if isinstance(params, MatMulOpParameters)]: -# node_list = self.get_node_list(G, matmul_node) -# if node_list is None or len(node_list.order) < 2: -# continue -# LOG.info("fusing nodes %s", ",".join( -# (node.name for node in node_list.order))) -# has_modified_graph = True -# subgraph = GraphView() -# if node_list.active is not None: -# subgraph.add_edge( -# NNEdge(from_node=node_list.matmul, to_node=node_list.active)) -# input_mapping = [[(node_list.matmul, idx)] for idx in range(2)] -# if node_list.add: -# input_mapping += [[(node_list.matmul, 2)]] -# output_mapping = [(node_list.active, 0)] if node_list.active else [ -# (node_list.matmul, 0)] -# pnode = MatMulOpFusionParameters( -# node_list.matmul.name + '_fusion', -# fusion_type=node_list.fusion_type, -# subgraph=subgraph, -# input_mapping=input_mapping, -# output_mapping=output_mapping) -# if G.quantization: -# # if there are quantization stats then clear them. 
They need to be created again -# G.quantization.stats = None -# qrecs = G.quantization.get_all(pnode.contained_nodes()) -# if qrecs: -# prec = QRec.copy_ktype( -# qrecs[0], in_qs=qrecs[0].in_qs, out_qs=qrecs[-1].out_qs) -# for node in pnode.contained_nodes(): -# G.quantization.move_to_fusion(node, pnode) -# G.quantization[NodeId(pnode)] = prec -# in_edges = G.in_edges(node_list.matmul.name) -# if node_list.add: -# bias_edge = [add_edge for add_edge in G.in_edges(node_list.add.name) if isinstance( -# add_edge.from_node, ConstantInputParameters)][0] -# out_edges = G.out_edges(node_list.order[-1].name) -# for node in node_list.order: -# G.remove(node) -# for edge in in_edges: -# G.add_edge(NNEdge(edge.from_node, pnode, -# from_idx=edge.from_idx, to_idx=edge.to_idx)) -# if node_list.add: -# G.add_edge(NNEdge(bias_edge.from_node, pnode, -# from_idx=bias_edge.from_idx, to_idx=2)) -# for edge in out_edges: -# G.add_edge(NNEdge(pnode, edge.to_node, -# from_idx=edge.from_idx, to_idx=edge.to_idx)) - -# if set_identity: -# self.set_identity(G) - -# return has_modified_graph diff --git a/tools/nntool/graph/matches/matchers/match_transpose_matmul.py b/tools/nntool/graph/matches/matchers/match_transpose_matmul.py index b9ddb5c92..1b7cc5d53 100644 --- a/tools/nntool/graph/matches/matchers/match_transpose_matmul.py +++ b/tools/nntool/graph/matches/matchers/match_transpose_matmul.py @@ -13,30 +13,23 @@ # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see . 
-from graph.types.tensor_arithmetic import MatMulTransposedParameters import logging -from abc import abstractproperty -from graph.types import (TransposeParameters, ActivationParameters, - BroadcastableActivationFusion, - GlobalPoolingParameters, HSigmoidActivationParameters, - HSwishActivationParameters, LeakyActivationParameters, - MatMulOpFusionParameters, MatMulOpParameters, - MatrixAddParameters, NNEdge, - PoolingParameters, ReluActivationParameters, - SigmoidActivationParameters) -from quantization.new_qrec import QRec +from graph.manipulations.eliminate_transposes.transpose_helpers import \ + identity_transpose +from graph.types import MatMulOpParameters, NNEdge, TransposeParameters +from graph.types.tensor_arithmetic import MatMulTransposedParameters from utils.graph import GraphView -from utils.node_id import NodeId -from ..matcher import Matcher, description, groups, match_name, run_after, run_before +from ..matcher import (Matcher, description, groups, match_name, run_after, + run_before) LOG = logging.getLogger("nntool." 
+ __name__) @run_after('fuse_external_bias_matmul') @run_before('fuse_op_activation_scale8', 'fuse_op_activation_pow2') @groups('*') -@match_name("match_trans_matmul") +@match_name("match_transpose_matmul") @description("spots Transpose followed by matmul and generates the proper matmul generator") class MatchTransMatMul(Matcher): @@ -56,6 +49,11 @@ def _match(self, G: GraphView, set_identity: bool = True, **kwargs): trans_node = in_edges[1].from_node if not isinstance(trans_node, TransposeParameters): continue + transpose = tuple(trans_node.transpose) + if not identity_transpose(transpose[:-2]): + continue + if transpose[-2:] != (len(transpose) - 1, len(transpose) - 2): + continue if isinstance(node, MatMulTransposedParameters): new_node = MatMulOpParameters(node.name) else: diff --git a/tools/nntool/graph/matches/matchers/move_node_up.py b/tools/nntool/graph/matches/matchers/move_node_up.py index f3cd828fd..9522f7dcf 100644 --- a/tools/nntool/graph/matches/matchers/move_node_up.py +++ b/tools/nntool/graph/matches/matchers/move_node_up.py @@ -19,12 +19,12 @@ MatrixAddParameters, MatrixMulParameters, NNEdge, PoolingParameters, ReluActivationParameters, ReshapeParameters, TransposeParameters, MatMulTransposedParameters) -from graph.types.others import ReverseParameters, StridedSliceParameters +from graph.types.others import QuantizeParameters, ReverseParameters, StridedSliceParameters from graph.types.tensor_arithmetic import MatMulOpParameters from utils.graph import GraphView from utils.node_id import NodeId -from ..matcher import Matcher, match_name, groups, run_before, description, needs_valid_dimension +from ..matcher import Matcher, match_name, groups, run_before, description, needs_valid_dimension, run_qtune_on_match LOG = logging.getLogger("nntool." 
+ __name__) @@ -56,7 +56,8 @@ def find_home_for_node(self, G, node, first=True): raise LocationNotFoundError() # @IgnoreException # Concat can have multiple inputs that must all acccept moved node if isinstance(node, ConcatParameters): - for in_edge in G.in_edges(node): + # important to use indexed here so the order is always the same + for in_edge in G.indexed_in_edges(node): yield from self.find_home_for_node(G, in_edge.from_node, first=False) @@ -140,10 +141,11 @@ def _match(self, G: GraphView, set_identity: bool = True, **kwargs): "Should be run before match_gap_ * fusions.") @needs_valid_dimension(True) @run_before('fuse_gap_convs', 'fuse_gap_linear', 'fuse_gap_pool', 'fuse_op_activation_scale8', 'fuse_op_activation_pow2') +@run_qtune_on_match class MoveActivationsMatcherScale8(MoveNodeUpMatcher): ValidNodesToPass = (ReshapeParameters, StridedSliceParameters, ReverseParameters, - TransposeParameters, ConcatParameters) + TransposeParameters, ConcatParameters, QuantizeParameters) ValidFusions = (Conv2DParameters, FcParameters, PoolingParameters, GlobalPoolingParameters, MatrixAddParameters, MatrixMulParameters, MatMulOpParameters, MatMulTransposedParameters) @@ -159,8 +161,7 @@ class MoveActivationsMatcherScale8(MoveNodeUpMatcher): @run_before('fuse_gap_convs', 'fuse_gap_linear', 'fuse_gap_pool', 'fuse_op_activation_scale8') class MoveMaxPoolMatcherScale8(MoveNodeUpMatcher): - ValidNodesToPass = (ReshapeParameters, TransposeParameters, - ReluActivationParameters, ConcatParameters) + ValidNodesToPass = (ReluActivationParameters,) ValidFusions = (Conv2DParameters, FcParameters) ValidNodes = (lambda node: isinstance( node, PoolingParameters) and node.pool_type == "max",) diff --git a/tools/nntool/graph/matches/matchers/propagate_rnn_sym_mult_qrec.py b/tools/nntool/graph/matches/matchers/propagate_rnn_sym_mult_qrec.py deleted file mode 100644 index 3303b64af..000000000 --- a/tools/nntool/graph/matches/matchers/propagate_rnn_sym_mult_qrec.py +++ /dev/null @@ -1,44 
+0,0 @@ -# # This program is free software: you can redistribute it and/or modify -# # it under the terms of the GNU Affero General Public License as -# # published by the Free Software Foundation, either version 3 of the -# # License, or (at your option) any later version. - -# # This program is distributed in the hope that it will be useful, -# # but WITHOUT ANY WARRANTY; without even the implied warranty of -# # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# # GNU Affero General Public License for more details. - -# # You should have received a copy of the GNU Affero General Public License -# # along with this program. If not, see . - -# from graph.types import RNNBaseParameters -# from utils.graph import GraphView -# from utils.node_id import NodeId - -# from ..matcher import Matcher, groups, match_name, description -# from .equalize_sym_mult_concats import propagate_qtype_up - - -# @groups('scaled') -# @match_name("propagate_up_rnn_in_qs") -# @description("After quantization of rnn their in_q and out_q are the same " -# "so in_q may be changed and we need to propagate it up") -# class PropagateUpRNNInputQ(Matcher): - -# def _match(self, G: GraphView, set_identity: bool = True, **kwargs): -# if not G.quantization: -# return -# rnns = [node for node in G.nodes() if isinstance( -# node, RNNBaseParameters)] -# qrecs = [G.quantization[NodeId(node)] for node in rnns] -# for rnn, qrec in zip(rnns, qrecs): -# in_idx = rnn.INPUT_NAMES.index('input') -# in_edge = [edge for edge in G.in_edges( -# rnn.name) if edge.to_idx == in_idx][0] -# in_q = qrec.in_qs[in_idx] -# propagate_qtype_up(G, in_q, in_edge) - -# if set_identity: -# self.set_identity(G) - -# return False diff --git a/tools/nntool/graph/matches/matchers/reduce_max_to_pool.py b/tools/nntool/graph/matches/matchers/reduce_max_to_pool.py deleted file mode 100644 index 55b51139c..000000000 --- a/tools/nntool/graph/matches/matchers/reduce_max_to_pool.py +++ /dev/null @@ -1,38 +0,0 @@ -# Copyright (C) 
2020 GreenWaves Technologies, SAS - -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU Affero General Public License as -# published by the Free Software Foundation, either version 3 of the -# License, or (at your option) any later version. - -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU Affero General Public License for more details. - -# You should have received a copy of the GNU Affero General Public License -# along with this program. If not, see . - -# from graph.types import ReduceMaxParameters, GlobalPoolingParameters -# from utils.graph import GraphView -# from .matcher import DefaultMatcher, MatchNode, DontReplaceError - -# class MatchReduceMax(DefaultMatcher): -# NAME = 'match_reduce_max_nodes' -# DESCRIPTION = 'Match reduce max nodes and replace them with GlobalMaxPooling' - -# def match_function(self, G: GraphView): -# sub = GraphView() -# sub.add_node(MatchNode('0', matcher=lambda node: -# isinstance(node, ReduceMaxParameters))) -# return G.match_fragment(sub) - -# def replace_function(self, G: GraphView, subgraph: GraphView): -# reduce_max_node = list(subgraph.nodes())[0] - -# for idx, in_dim in enumerate(reduce_max_node.in_dims[0].shape): -# if idx > 0 and idx not in reduce_max_node.axis and in_dim == 1: -# raise DontReplaceError() -# return GlobalPoolingParameters(reduce_max_node.name + "_GLOBAL_MAXPOOL", pool_type='max', -# in_dims_hint=reduce_max_node.in_dims_hint, -# out_dims_hint=reduce_max_node.out_dims_hint), None, None diff --git a/tools/nntool/graph/matches/matchers/remove_copies.py b/tools/nntool/graph/matches/matchers/remove_copies.py index 302b9321a..69dba7e5c 100644 --- a/tools/nntool/graph/matches/matchers/remove_copies.py +++ b/tools/nntool/graph/matches/matchers/remove_copies.py @@ -33,7 +33,7 @@ @description("Remove 
unnecessary copies") @modifies_dimensions(True) @groups('*') -@run_after('expand_transposes', 'remove_noops') +@run_after('remove_noops') class RemoveCopies(Matcher): def _match(self, G: GraphView, set_identity: bool = True, **kwargs): @@ -47,15 +47,13 @@ def _match(self, G: GraphView, set_identity: bool = True, **kwargs): G, out_edges[0], (OutputParameters, InputParameters, ConstantInputParameters, SplitParameters, ConcatParameters), - can_pass=(ReshapeParameters, NoOPParameters), - can_pass_fn=lambda G, node: isinstance(node, TransposeParameters) and node.does_nothing, + can_pass_fn=lambda G, node: node.no_model_code, follow_multi=True) and search_up( G, G.in_edges(node)[0], (InputParameters, OutputParameters, ConstantInputParameters, SplitParameters, ConcatParameters), - can_pass=(ReshapeParameters, NoOPParameters), - can_pass_fn=lambda G, node: isinstance(node, TransposeParameters) and node.does_nothing, + can_pass_fn=lambda G, node: node.no_model_code, follow_multi=True)): continue nodes_to_remove.append(node) diff --git a/tools/nntool/graph/matches/matchers/remove_noops.py b/tools/nntool/graph/matches/matchers/remove_noops.py index d469cccf2..3f902fd85 100644 --- a/tools/nntool/graph/matches/matchers/remove_noops.py +++ b/tools/nntool/graph/matches/matchers/remove_noops.py @@ -15,9 +15,7 @@ import logging -from graph.types import NNEdge, NoOPParameters -from graph.types.others import (ConcatParameters, ReshapeParameters, - SplitParameters, TransposeParameters) +from graph.types import NNEdge from utils.graph import GraphView from ..matcher import Matcher, description, groups, match_name, run_before @@ -34,30 +32,9 @@ @groups('symmetric', 'scaled') class RemoveNoOPs(Matcher): - @staticmethod - def one_inedge(G, node, idx=None): - in_edges = G.in_edges(node) - return len(in_edges) == 1 and (idx is None or in_edges[0].to_idx == idx) - - @staticmethod - def one_outedge(G, node, idx=None): - out_edges = G.out_edges(node) - return len(out_edges) == 1 and (idx is 
None or out_edges[0].from_idx == idx) - - @staticmethod - def one_in_and_outedge(G, node, idx=None): - return RemoveNoOPs.one_inedge(G, node, idx=idx) and RemoveNoOPs.one_outedge(G, node, idx=idx) - - @staticmethod - def node_does_nothing(G, node): - return (isinstance(node, NoOPParameters) or - isinstance(node, TransposeParameters) and node.transpose is None or - isinstance(node, ReshapeParameters) and node.old_shape == node.shape or - (isinstance(node, (ConcatParameters, SplitParameters)) and RemoveNoOPs.one_in_and_outedge(G, node, idx=0))) - def _match(self, G: GraphView, set_identity: bool = True, **kwargs) -> bool: has_modified_graph = False - for node in [node for node in G.nodes() if self.node_does_nothing(G, node)]: + for node in [node for node in G.nodes() if node.does_nothing]: has_modified_graph = True in_edge = G.in_edges(node.name)[0] G.remove_edge(in_edge) diff --git a/tools/nntool/graph/matches/matchers/remove_reshapes.py b/tools/nntool/graph/matches/matchers/remove_reshapes.py index 573d056da..1d9defb7b 100644 --- a/tools/nntool/graph/matches/matchers/remove_reshapes.py +++ b/tools/nntool/graph/matches/matchers/remove_reshapes.py @@ -43,7 +43,7 @@ def validate_reshape(G, reshape): return False candidate = edge.to_node if isinstance(candidate, TransposeParameters): - if not candidate.does_nothing(): + if not candidate.no_model_code: return False out_shape = tuple(candidate.out_dims[0].shape) else: @@ -56,12 +56,13 @@ def validate_reshape(G, reshape): return (reshape, candidates, out_shape) def _match(self, G: GraphView, set_identity: bool = True, **kwargs): - modified_graph = True - while modified_graph: - modified_graph = False + modified_graph = False + found_reshapes = True + while found_reshapes: + found_reshapes = False for reshape in G.nodes(node_classes=(ReshapeParameters,)): if reshape.shape.shape == reshape.old_shape.shape: - modified_graph = True + found_reshapes = modified_graph = True LOG.info('removing reshape that does nothing %s', 
reshape.name) G.remove_and_reconnect(reshape, edge_class=NNEdge) nid = NodeId(reshape) @@ -72,7 +73,7 @@ def _match(self, G: GraphView, set_identity: bool = True, **kwargs): res = self.validate_reshape(G, reshape) if res: LOG.info('unnecessary reshape found after %s', reshape.name) - modified_graph = True + found_reshapes = modified_graph = True (reshape, candidates, out_shape) = res for candidate in candidates: LOG.info( diff --git a/tools/nntool/graph/matches/matchers/remove_ssd_output.py b/tools/nntool/graph/matches/matchers/remove_ssd_output.py new file mode 100644 index 000000000..7e77b587e --- /dev/null +++ b/tools/nntool/graph/matches/matchers/remove_ssd_output.py @@ -0,0 +1,49 @@ +# Copyright (C) 2020, 2022 GreenWaves Technologies, SAS + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . + +import logging +from copy import deepcopy + +from graph.types import SSDDetectorParameters +from utils.graph import GraphView + +from ..matcher import (Matcher, description, groups, match_name, run_qtune_on_match, + needs_valid_dimension) + +LOG = logging.getLogger("nntool." + __name__) + + +@match_name('remove_ssd_output') +@description('remove the 4th output on the ssd detector - num detections. 
This is necessary for ' + 'operation with GAP kernels') +@groups('*') +@needs_valid_dimension(True) +@run_qtune_on_match +class RemoveSSDOutput(Matcher): + + def _match(self, G: GraphView, set_identity: bool = True, **kwargs): + has_modified_graph = False + for node in G.nodes(node_classes=SSDDetectorParameters): + if not node.output_detection_count: + continue + has_modified_graph = True + LOG.info(f'removing detection count output on {node.name}') + edges_below = G.indexed_out_edges(node)[3] + for edge in edges_below: + G.remove_below(edge.to_node) + G.remove(edge.to_node) + node.output_detection_count = False + + return has_modified_graph diff --git a/tools/nntool/graph/matches/matchers/match_reversed_rnn.py b/tools/nntool/graph/matches/matchers/rnn_reverse.py similarity index 100% rename from tools/nntool/graph/matches/matchers/match_reversed_rnn.py rename to tools/nntool/graph/matches/matchers/rnn_reverse.py diff --git a/tools/nntool/graph/matches/matchers/match_rnn_unpack.py b/tools/nntool/graph/matches/matchers/rnn_unpack.py similarity index 100% rename from tools/nntool/graph/matches/matchers/match_rnn_unpack.py rename to tools/nntool/graph/matches/matchers/rnn_unpack.py diff --git a/tools/nntool/graph/matches/matchers/slice_to_split.py b/tools/nntool/graph/matches/matchers/slice_to_split.py index 20f601e88..53ab42b5a 100644 --- a/tools/nntool/graph/matches/matchers/slice_to_split.py +++ b/tools/nntool/graph/matches/matchers/slice_to_split.py @@ -118,7 +118,7 @@ def slices_to_sizes(slices_and_shapes, shape_rest): @ match_name("slice_to_split") @ description("collects slices from a single node and converts to a single split") -@ run_before('unused_concats') +@ run_before('remove_noops', 'insert_copies') @ groups('*') class SliceToSplitMatch(Matcher): @ staticmethod diff --git a/tools/nntool/graph/matches/matchers/split_concat.py b/tools/nntool/graph/matches/matchers/split_concat.py new file mode 100644 index 000000000..ef2615cda --- /dev/null +++ 
b/tools/nntool/graph/matches/matchers/split_concat.py @@ -0,0 +1,149 @@ +# Copyright (C) 2020 GreenWaves Technologies, SAS + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . + +import logging + +from graph.dim import Dim +from graph.types import ConcatParameters, NNEdge, SplitParameters +from graph.types.others import (CopyParameters, NoOPParameters, + ReshapeParameters, TransposeParameters) +from utils.graph import GraphView +from utils.node_id import NodeId + +from ..match_utils import search_down +from ..matcher import Matcher, description, groups, match_name, run_before + +LOG = logging.getLogger("nntool." 
+ __name__) + + +def reduce_slices(slices, shapes): + res_slice = [] + res_shape = [] + for slice_axis, shape_axis in zip(zip(*slices), zip(*shapes)): + if slice_axis[0] == slice_axis[1]: + res_slice.append(slice_axis[0]) + res_shape.append(shape_axis[0]) + else: + res_slice.append( + (slice_axis[0][0], + slice_axis[-1][1], + slice_axis[0][2])) + res_shape.append(sum(shape_axis)) + return res_slice, res_shape + + +def remove_edges(G, edges): + if not edges: + return + edges = edges.copy() + while len(edges) > 1: + edge = edges.pop(0) + G.remove(edge.to_node) + if G.quantization: + nid = NodeId(edge.to_node) + if nid in G.quantization: + del G.quantization[nid] + try: + G.remove_edge(edges[0]) # @IgnoreException + except KeyError: + pass + + +@groups('*') +@match_name("split_concat") +@run_before('remove_noops', 'remove_copies') +@description("removes splits that go to concats where all the out edges of the split are in sequence in the concat") +class SplitConcatMatch(Matcher): + def _match(self, G: GraphView, set_identity: bool = True, **kwargs) -> bool: + edge_groups = [] + for node in G.nodes(node_classes=SplitParameters): + cur_group = None + for out_edge_bundle in G.indexed_out_edges(node): + if len(out_edge_bundle) == 1: + out_edge = out_edge_bundle[0] + concat_node_edges = search_down( + G, out_edge, ConcatParameters, + can_pass=(CopyParameters,), + can_pass_fn=lambda _, node: node.no_model_code) + if concat_node_edges: + if cur_group: + this_concat_edge = concat_node_edges[-1] + last_concat_edge = cur_group[-1][-1] + if (this_concat_edge.to_node == last_concat_edge.to_node and + this_concat_edge.to_idx == last_concat_edge.to_idx + 1): + cur_group.append(concat_node_edges) + continue + if len(cur_group) > 1: + edge_groups.append(cur_group) + cur_group = [concat_node_edges] + continue + if cur_group: + if len(cur_group) > 1: + edge_groups.append(cur_group) + cur_group = None + if cur_group: + if len(cur_group) > 1: + edge_groups.append(cur_group) + cur_group = 
None + # we leave the splits and concats after this since they will be cleared up by remove_noops + for edge_group in edge_groups: + split_node = edge_group[0][0].from_node + concat_node = edge_group[0][-1].to_node + from_idx = edge_group[0][0].from_idx + to_idx = edge_group[-1][0].from_idx + from_concat_idx = edge_group[0][-1].to_idx + to_concat_idx = edge_group[1][-1].to_idx + LOG.info( + f"combining outputs {from_idx}:{to_idx} on split node {split_node.name} " + f"followed by concat {concat_node.name}") + # combine slices and shapes on edges in group + new_slice, new_shape = reduce_slices( + split_node.act_slices[from_idx:to_idx+1], + split_node.out_shapes[from_idx:to_idx+1] + ) + new_concat_shape = Dim.combine( + [concat_node.in_dims[idx] + for idx in range(from_concat_idx, to_concat_idx+1)], + concat_node.axis) + split_node.act_slices = split_node.act_slices[:from_idx] + [ + new_slice] + split_node.act_slices[to_idx+1:] + # the slice may need to reshape since we will remove everything in between + split_node.out_shapes = split_node.out_shapes[:from_idx] + [ + new_concat_shape.shape] + split_node.out_shapes[to_idx+1:] + + # remove all edges and intermediate nodes on all edge groups + for edge_list in edge_group: + remove_edges(G, edge_list) + # add back a direct edge to the first idx + G.add_edge(NNEdge(from_node=split_node, + from_idx=edge_group[0][0].from_idx, + to_node=concat_node, + to_idx=edge_group[0][-1].to_idx)) + out_edge_bundles = G.indexed_out_edges(split_node) + # move edges beyond the edge group after the first index + for offset, edge_list in enumerate(out_edge_bundles[to_idx+1:]): + assert len(edge_list) == 1 + edge = edge_list[0] + G.remove_edge(edge) + G.add_edge(NNEdge.clone(edge, from_idx=from_idx+1+offset)) + # reindex the in edges in the concat + from_idx = edge_group[0][-1].to_idx + to_idx = edge_group[-1][-1].to_idx + in_edges = G.indexed_in_edges(concat_node) + for offset, in_edge in enumerate(in_edges[to_idx+1:]): + 
G.remove_edge(in_edge) + G.add_edge(NNEdge.clone(in_edge, to_idx=from_idx+1+offset)) + + return bool(edge_groups) diff --git a/tools/nntool/graph/matches/matches.py b/tools/nntool/graph/matches/matches.py index 451dc92fe..fe1130680 100644 --- a/tools/nntool/graph/matches/matches.py +++ b/tools/nntool/graph/matches/matches.py @@ -17,32 +17,19 @@ import logging -from graph.matches.matcher import Matcher, MatchGroup +from graph.matches.matcher import Matcher, MatchGroup, match_name, description from utils.subclasses import get_all_subclasses from .matchers import * LOG = logging.getLogger("nntool." + __name__) - -def general_validation(match: Matcher): - if match.DESCRIPTION is None: - LOG.warning('matcher %s has no description', match.NAME) - if match.NAME is None: - raise ValueError(f'match {match.NAME} has no name') - if '*' in match.RUN_BEFORE and '*' in match.RUN_AFTER: - raise ValueError( - f'match {match.NAME} has wildcard in run_before and run_after') - return match - - -ALL_MATCHERS = [general_validation(match_class) for match_class in get_all_subclasses(Matcher) - if match_class.NAME is not None] +ALL_MATCHERS = {} def select_matchers(group=None): - return [match_class for match_class in ALL_MATCHERS - if (group is None or '*' in match_class.GROUPS or group in match_class.GROUPS)] + return [match_class for match_class in ALL_MATCHERS.values() + if ('*' in match_class.GROUPS or group in match_class.GROUPS)] def order_matchers(matchers): @@ -75,22 +62,39 @@ def select_sorted_matcher_instances(group=None): def get_fusions(): return sorted( [(match_class.NAME, match_class.DESCRIPTION) - for match_class in ALL_MATCHERS], + for match_class in ALL_MATCHERS.values()], key=lambda x: x[0]) +@match_name("pow2_match_group") +@description("a selection of matches that are relevant for POW2 quantized graphs") +class POW2MatchGroup(MatchGroup): + def __init__(self): + super().__init__(*select_sorted_matcher_instances(group='symmetric'), + identity='pow2_match_group') + + 
+@match_name("scaled_match_group") +@description("a selection of matches that are relevant for scaled quantized graphs") +class ScaledMatchGroup(MatchGroup): + def __init__(self): + super().__init__(*select_sorted_matcher_instances(group='scaled'), + identity='scaled_match_group') + + def get_pow2_match_group(): - return MatchGroup( - *select_sorted_matcher_instances(group='symmetric'), - identity="pow2_match_group" - ) + return POW2MatchGroup() def get_scale8_match_group(): - return MatchGroup( - *select_sorted_matcher_instances(group='scaled'), - identity="std_match_group" - ) + return ScaledMatchGroup() + + +def get_matches(*match_names, identity="custom"): + not_found = set(match_names) - set(ALL_MATCHERS) + if not_found: + raise ValueError(f'matches {" ".join(not_found)} not found') + return MatchGroup(*[ALL_MATCHERS[name]() for name in match_names], identity=identity) def get_fusion(name): @@ -98,8 +102,10 @@ def get_fusion(name): return get_pow2_match_group() if name in ["std_match_group", "scale8_match_group"]: return get_scale8_match_group() - match_class = next((match_class for match_class in select_matchers() - if match_class.NAME == name), None) - if match_class is not None: - return match_class() + if name in ALL_MATCHERS: + return ALL_MATCHERS[name]() return None + + +ALL_MATCHERS.update({match_class.NAME: match_class for match_class in get_all_subclasses(Matcher) + if match_class.NAME is not None}) diff --git a/tools/nntool/graph/nngraph.py b/tools/nntool/graph/nngraph.py index 830f49739..4448d913b 100644 --- a/tools/nntool/graph/nngraph.py +++ b/tools/nntool/graph/nngraph.py @@ -1,4 +1,4 @@ -# Copyright (C) 2020 GreenWaves Technologies, SAS +# Copyright (C) 2020, 2022 GreenWaves Technologies, SAS # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU Affero General Public License as @@ -16,30 +16,33 @@ import logging import os import re -from typing import Generator, Sequence, Union +from typing import 
Callable, Generator, Sequence, Tuple, Union import numpy as np from quantization.quantization_set import QuantizationSet from reports.graph_reporter import GraphReporter -from reports.quantization_reporter import QuantizationReporter from utils.graph import Graph, Node from utils.node_id import NodeId from utils.tabular import TextTableRenderer from graph.dim import Dim -from graph.dump_tensor import PrintDumper, dump_tensor from graph.graph_identity import GraphIdentity -from graph.manipulations import (add_dimensions, adjust_order, - balance_all_filters, calculate_liveness) -from graph.manipulations.balance_filter import balance_filter_with_constants +from graph.manipulations.adjust_order import adjust_order +from graph.manipulations.balance_filter import (balance_all_filters, + balance_filter_with_constants) +from graph.manipulations.dimensions import add_dimensions +from graph.manipulations.liveness import calculate_liveness +from graph.matches.fusions import fusions from graph.types import (ConstantInputParameters, InputBaseParameters, InputParameters, MultiplicativeBiasParameters, OutputParameters, ResizerParameters, RNNBaseParameters, SSDDetectorParameters) -from graph.types.base import NNNodeRef +from graph.types.base import NNEdge, NNNodeRef, Parameters from graph.types.dsp_preprocessing import DSPParameters from graph.types.expression_fusion import ExpressionFusionParameters -from graph.types.fusions import ActivationFusionBase, FilterFusionBase, FusionBase, MatMulOpFusionParameters, PaddedAddFusionParameters +from graph.types.fusions import (ActivationFusionBase, FilterFusionBase, + FusionBase, MatMulOpFusionParameters, + PaddedAddFusionParameters) LOG = logging.getLogger("nntool." 
+ __name__) @@ -80,78 +83,140 @@ def liveness(self): def liveness(self, val): self._state['liveness'] = val - @property - def has_quantization_info(self): - return self._state['quantization'] - - @has_quantization_info.setter - def has_quantization_info(self, val): - self._state['quantization'] = val - class NNGraph(Graph): def __init__(self, model=None, name=None, filename=None): - super().__init__() - - self.model = model - self.num_inputs = 0 - self.num_outputs = 0 - self.num_constants = 0 - self.node_options = {} - self.num_rinputs = 0 - self.num_routputs = 0 + attrs = { + 'model': model, + 'node_options': {}, - self.graph_state = NNGraphState() - - self.load_function = None - self.graphname = name - self.graph_identity = GraphIdentity(filename) - self._info = { 'quantization': None, + 'has_quantized_parameters': False, + 'graphname': name, + 'graph_state': NNGraphState(), + 'graph_identity': GraphIdentity(filename) } + super().__init__(**attrs) + + INVALID_CHARS = re.compile(r'[^A-Za-z0-9_]') + + @staticmethod + def valid_c_identifier(val: str) -> str: + return NNGraph.INVALID_CHARS.sub('_', val) + + @property + def _edge_class(self): + return NNEdge + + @property + def name(self) -> str: + """Returns the name of the graph potentially modified to be a valid C identifier + + Returns: + str: The graph name + """ + if self._attr['graphname'] is None: + base, _ = os.path.splitext( + os.path.basename(self._attr['graph_identity'].filename)) + return self.valid_c_identifier(base) + return self.valid_c_identifier(self._attr['graphname']) + + @name.setter + def name(self, val: str): + """Sets the name of the graph + + Args: + val (str): The name of the graph + """ + self._attr['graphname'] = val + + @property + def model(self): + """The original model that generated the NNTool graph + + Returns: + Any: The model file (TFLite or ONNX graph descriptor) + """ + return self._attr['model'] + + @property + def num_inputs(self) -> int: + """Current number of inputs + + 
Returns: + int: Number of inputs + """ + return len(self.nodes(node_classes=InputParameters)) + + @property + def num_outputs(self) -> int: + """Current number of outputs + + Returns: + int: Number of outputs + """ + return len(self.nodes(node_classes=OutputParameters)) @property - def info(self): - return self._info + def num_constants(self) -> int: + """Current number of constant inputs - @info.setter - def info(self, val): - self._info = val + Returns: + int: Number of constant inputs + """ + return len(self.nodes(node_classes=ConstantInputParameters)) + + @property + def node_options(self) -> dict: + return self._attr['node_options'] @property - def quantization(self) -> QuantizationSet: - return self._info.get('quantization') + def quantization(self) -> Union[QuantizationSet, None]: + """Current graph Quantization + + Returns: + Union[QuantizationSet, None]: quantization set + """ + return self._attr['quantization'] @quantization.setter - def quantization(self, val: QuantizationSet): - self._info['quantization'] = val + def quantization(self, val: Union[QuantizationSet, None]): + """Sets or clears the quantization + + Args: + val (Union[QuantizationSet, None]): quantization set + """ + self._attr['quantization'] = val @property def has_quantized_parameters(self) -> bool: - return self._info.get('has_quantized_parameters') + """Graph was imported with quantized parameters + + Returns: + bool: quantized parameters or not + """ + return self._attr['has_quantized_parameters'] @has_quantized_parameters.setter def has_quantized_parameters(self, val: bool): - self._info['has_quantized_parameters'] = val + """Graph was imported with quantized parameters - INVALID_CHARS = re.compile(r'[^A-Za-z0-9_]') - - @staticmethod - def valid_c_identifier(val: str) -> str: - return NNGraph.INVALID_CHARS.sub('_', val) + Args: + val (bool): quantized parameters or not + """ + self._attr['has_quantized_parameters'] = val + @property + def graph_state(self) -> NNGraphState: + return 
self._attr['graph_state'] @property - def name(self) -> str: - if self.graphname is None: - base, _ = os.path.splitext( - os.path.basename(self.graph_identity.filename)) - return self.valid_c_identifier(base) - return self.valid_c_identifier(self.graphname) + def graph_identity(self) -> GraphIdentity: + return self._attr['graph_identity'] @property def inputs_dim(self) -> list: @@ -161,165 +226,256 @@ def inputs_dim(self) -> list: def outputs_dim(self) -> list: return [out_node.out_dims[0].shape for out_node in self.output_nodes()] - @name.setter - def name(self, val): - self.graphname = val - @property - def has_ssd_postprocess(self): + def has_ssd_postprocess(self) -> bool: + """Graph has SSD detector nodes + + Returns: + bool: True if present + """ return self.has_node_type(SSDDetectorParameters) @property - def has_resizer(self): + def has_resizer(self) -> bool: + """Graph has resizer nodes + + Returns: + bool: True if present + """ return self.has_node_type(ResizerParameters) @property - def has_expressions(self): - return self.has_node_type(ExpressionFusionParameters) - - def has_rnn(self, ktype=None, ne16=False): - nodes = self.nodes(node_classes=RNNBaseParameters) - if not nodes: - return False - if ktype is not None and not any(self.quantization[NodeId(node)].ktype == ktype for node in nodes): - return False - if ne16 and not any(self.quantization[NodeId(node)].cache.get('ne16') for node in nodes): - return False - return True + def has_expressions(self) -> bool: + """Graph has compiled expressions + Returns: + bool: True if present + """ + return self.has_node_type(ExpressionFusionParameters) @property - def has_dsp(self): + def has_dsp(self) -> bool: + """Graph has DSP nodes + + Returns: + bool: True if present + """ return self.has_node_type(DSPParameters) @property - def all_expressions(self): - return self.all_node_types(ExpressionFusionParameters) + def all_expressions(self) -> Sequence[ExpressionFusionParameters]: + """All the expression nodes in 
the graph + + Returns: + Sequence[ExpressionFusionParameters]: List of nodes + """ + return self.nodes(node_classes=ExpressionFusionParameters) @property - def nodes_by_step_idx(self): + def nodes_by_step_idx(self) -> Sequence[Parameters]: + """All the nodes in the graph ordered by execution order + + Returns: + Sequence[Parameters]: List of nodes + """ return [step['node'] for step in self.graph_state.steps] @property - def nodes_by_step_idx_with_fusions(self): + def nodes_by_step_idx_with_fusions(self) -> Sequence[Parameters]: + """Nodes ordered by execution order but also including internal nodes + for fusions + + Returns: + Sequence[Parameters]: List of nodes + """ nodes = [] for step in self.graph_state.steps: node = step['node'] - if isinstance(node, (FilterFusionBase, ActivationFusionBase, PaddedAddFusionParameters, MatMulOpFusionParameters)): + if isinstance(node, (FilterFusionBase, ActivationFusionBase, + PaddedAddFusionParameters, MatMulOpFusionParameters)): nodes.extend(node.contained_nodes()) nodes.append(node) return nodes - def has_node_type(self, node_type): - return any(isinstance(node, node_type) for node in self.nodes()) + @property + def total_ops(self) -> int: + """Estimated total operations in the graph - def all_node_types(self, node_type): - return [node for node in self.nodes() if isinstance(node, node_type)] - - def set_load_function(self, func): - self.load_function = func - - def load_tensors(self, file=None): - assert self.load_function - self.load_function(self, file) - - def get_in_params(self, name: str) -> set: - in_edges = self.in_edges(name) - if not in_edges: - return in_edges - in_edges.sort(key=lambda edge: edge.to_idx) - res = [] - in_idx = 0 - for real_idx in range(max(edge.to_idx for edge in in_edges) + 1): - if real_idx == in_edges[in_idx].to_idx: - res.append(in_edges[in_idx].params) - in_idx += 1 - else: - res.append(None) + Returns: + int: Number of operations + """ + tot_ops = 0 + for node in self.nodes(): + ops = 
node.compute_load() + tot_ops += ops if ops else 0 + return tot_ops - return res + def has_rnn(self, ktype: str = None, ne16: bool = False) -> bool: + """Graph has RNN nodes - def get_out_params(self, name: str) -> set: - out_edges = self.indexed_out_edges(name) - return [edge_list[0].params for edge_list in out_edges] + Args: + ktype (str, optional): kernel type to match or all. Defaults to None. + ne16 (bool, optional): match nodes that will map to ne16 kernels. Defaults to False. - def all_inputs(self) -> Generator[Node, None, None]: - return (node for node in self.nodes() if isinstance(node, (InputBaseParameters))) + Returns: + bool: True if present + """ + nodes = self.nodes(node_classes=RNNBaseParameters) + if not nodes: + return False + if ktype is not None and not any(self.quantization[NodeId(node)].ktype == ktype for node in nodes): + return False + if ne16 and not any(self.quantization[NodeId(node)].cache.get('ne16') for node in nodes): + return False + return True + + def has_node_type(self, node_type: Parameters) -> bool: + """Returns True if graph contains node type + + Args: + node_type (Parameters): Node class + + Returns: + bool: True if present + """ + return any(isinstance(node, node_type) for node in self.nodes()) def inputs_and_constants(self) -> Generator[Node, None, None]: + """Iterate over all inputs and constants + + Returns: + Generator[Node]: a generator for all nodes + """ return (node for node in self.nodes() if isinstance(node, InputBaseParameters)) def input_nodes(self) -> Generator[Node, None, None]: + """Iterate over all inputs + + Returns: + Generator[Node]: a generator for all nodes + """ return (node for node in self.nodes() if isinstance(node, InputParameters)) def output_nodes(self) -> Generator[Node, None, None]: - return (node for node in self.nodes() if isinstance(node, OutputParameters)) + """Iterate over all outputs - def is_input(self, node_name: Union[str, Node]) -> bool: - if isinstance(node_name, str): - return 
isinstance(self[node_name], InputParameters) - return isinstance(node_name, InputParameters) + Returns: + Generator[Node]: a generator for all nodes + """ + return (node for node in self.nodes() if isinstance(node, OutputParameters)) - def is_output(self, node_name: Union[str, Node]) -> bool: - if isinstance(node_name, str): - return isinstance(self[node_name], OutputParameters) - return isinstance(node_name, OutputParameters) + def add_input(self, dim: Union[Dim, Tuple[int]], name: str = None, **kwargs) -> NNNodeRef: + """Create an input node. If a name is not supplied then one will be automatically chosen. - def reset_inout_counts(self): - self.num_inputs = 0 - self.num_outputs = 0 - self.num_constants = 0 + Args: + dim (Union[Dim, Tuple[int]]): Input dimension + name (str, optional): Node name. Defaults to None. - def add_input(self, dim: Dim, name=None, **kwargs) -> InputParameters: - self.num_inputs += 1 - node_name = "input_"+str(self.num_inputs) if not name else name + Returns: + NNNodeRef: Reference to created node in graph + """ + node_name = self.unique_name( + f"input_{self.num_inputs + 1}") if not name else name node = InputParameters(node_name, dims=dim, **kwargs) self.add_node(node) - return NNNodeRef(node, 0, self) - - def add_constant(self, dim: Dim, name: str = None, - adjust_transpose=None, is_mutated=False, - is_intermediate=False, short_name=None) -> ConstantInputParameters: - self.num_constants += 1 - node_name = name if name else "constant_"+str(self.num_constants) + return NNNodeRef(self, node, 0) + + def add_constant(self, dim: Union[Dim, Tuple[int]] = None, + name: str = None, + value: np.ndarray = None, + adjust_transpose: Sequence[int]=None, + is_mutated=False, + is_intermediate=False, + short_name: str=None) -> NNNodeRef: + """Creates a constant node + + Args: + dim (Union[Dim, Tuple[int]], optional): Dimension of constant if not supplied then a value must be. Defaults to None. + name (str, optional): Optional name. 
A unique one will be created if None. Defaults to None. + value (np.ndarray, optional): Numpy array with value. Defaults to None. + adjust_transpose (Sequence[int], optional): Adjust will transpose the value using this transpose. Defaults to None. + is_mutated (bool, optional): Constant is both an input and an output. Defaults to False. + is_intermediate (bool, optional): Constant is marked as intermediate at import. Defaults to False. + short_name (str, optional): Preferred short name for model generation. Defaults to None. + + Returns: + NNNodeRef: A reference to the Node in the Graph + """ + node_name = name if name else self.unique_name(f"constant_{self.num_constants}") node = ConstantInputParameters(node_name, dims=dim, + value=value, adjust_transpose=adjust_transpose, is_intermediate=is_intermediate, is_mutated=is_mutated, short_name=short_name) self.add_node(node) - return NNNodeRef(node, 0, self) - - def variable_in_edges(self, node_name): - return list([edge for edge in self.in_edges(node_name) - if not isinstance(edge.from_node, ConstantInputParameters)]) + return NNNodeRef(self, node, 0) def add_output(self, name=None) -> OutputParameters: - self.num_outputs += 1 - node_name = "output_"+str(self.num_outputs) if name is None else name + """Create an output node. If a name is not supplied then one will be automatically chosen. + + Args: + name (str, optional): Node name. Defaults to None. + + Returns: + OutputParameters: Created node + """ + node_name = self.unique_name( + f"output_{self.num_outputs + 1}") if name is None else name node = OutputParameters(node_name) self.add_node(node) return node def nodes_iterator(self, yield_fusions=True): + """Yields a tuple of length 4 with the step idx and parameters of each node. Optionally + when in a fusion yields tuples containing the fusion internal step id and node for each internal + node. + + Args: + yield_fusions (bool, optional): Whether to yield fusion nodes. Defaults to True. 
+ + Yields: + [Tuple[int, Parameters, Optional[int], Optional[Parameters]]]: Tuple containing node_idx, node, fusion_idx, fusion_node + """ for step_idx, step in enumerate(self.graph_state.steps): node = step['node'] - if isinstance(node, (FusionBase)) and not isinstance(node, ExpressionFusionParameters): - if yield_fusions: + if yield_fusions: + if isinstance(node, (FusionBase)) and not isinstance(node, ExpressionFusionParameters): for fusion_idx, fnode in enumerate(node.contained_nodes()): yield (step_idx, node, fusion_idx, fnode) - yield (step_idx, node, None, None) - else: - yield (step_idx, node, None, None) - - def adjust_order(self, reshape_weights=True, postprocess=True, debug_function=None, steps=None, single_step=False): + yield (step_idx, node, None, None) + + def adjust_order(self, reshape_weights=True, no_postprocess=False, debug_function: Callable=None, steps: int=None, single_step=False): + """Adjusts tensor order to match selected kernels + + Args: + reshape_weights (bool, optional): Whether weights should be modified to remove transposes. Defaults to True. + no_postprocess (bool, optional): Whether post processing such as transpose elimination is run. Defaults to False. + debug_function (Callable, optional): Function to be called after each transpose elimination step. Defaults to None. + steps (int, optional): Number of elimination steps to run. Defaults to None. + single_step (bool, optional): Execute only one transpose elimination step in each cycle. Defaults to False. 
+ """ adjust_order(self, reshape_weights=reshape_weights, - postprocess=postprocess, debug_function=debug_function, + postprocess=not no_postprocess, debug_function=debug_function, steps=steps, single_step=single_step) LOG.info("adjusted order") self.graph_identity.is_adjusted = True + def fusions(self, *match_names, no_postprocess: bool = False): + """Run matchers on the graph + + Args: + match_names (str): Names of matches to apply + no_postprocess (bool, optional): Do not execute postprocessing such as transpose elimination. Defaults to False. + """ + fusions(self, *match_names, no_postprocess=no_postprocess) + def add_dimensions(self, quiet=False): + """Add dimensions to the graph and calculate execution order and liveness + + Args: + quiet (bool, optional): Do not log progress. Defaults to False. + """ if not quiet: LOG.info("update graph dimensions") self.graph_state.steps = add_dimensions(self) @@ -329,7 +485,17 @@ def add_dimensions(self, quiet=False): self, self.graph_state.steps) - def balance_filters(self, step_idx=None, precision_threshold=0.20): + def balance_filters(self, step_idx: int=None, precision_threshold=0.20): + """Experimental filter balancing routines + + Args: + step_idx (int, optional): Step to balance. Defaults to None. + precision_threshold (float, optional): Precision threshold. Defaults to 0.20. 
+ + Raises: + ValueError: Bad parameters + NotImplementedError: Bad graph structure + """ if step_idx is not None: if step_idx > len(self.graph_state.steps) or step_idx < 0: raise ValueError("step idx out of range") @@ -352,62 +518,13 @@ def balance_filters(self, step_idx=None, precision_threshold=0.20): else: balance_all_filters(self, precision_threshold=precision_threshold) - def print_intermediates(self, outputs, limit=None, width=8, - precision=4, channel=None, order=None, - checksum=False, print_constants=False): - def print_step(step, outs, index): - node = step['node'] - if checksum: - for out_idx, out in enumerate(outs): - if isinstance(node, ConstantInputParameters): - continue - print(f"S{index} - {node.name}\n\tChecksum = {np.sum(out) if out.dtype != np.uint8 else np.sum(out.astype(np.int8))}") - else: - print(node.name) - for out_idx, out in enumerate(outs): - dims = node.out_dims[out_idx] - if order is not None and dims.is_named and order != dims.order and all(k in dims.order - for k in order): - transpose = dims.transpose_to_order(order) - out = out.transpose(transpose) - if channel is not None: - out = out[channel:channel+1:1, ...] 
- dump_tensor(out, PrintDumper( - out, width=width, precision=precision)) - - if limit is not None: - print_step(self.graph_state.steps[limit], outputs[limit], limit) - else: - for idx, out in enumerate(outputs): - print_step(self.graph_state.steps[idx], out, idx) - print() - - def qshow(self): - tab = QuantizationReporter().report(self, self.quantization) - renderer = TextTableRenderer(150) - tab.render(renderer) - return renderer.get_output() - - def merge(self, other: 'NNGraph'): - if self != other: - for edge in other.edges: - self.add_edge(edge) - return self - - def __getitem__(self, key): + def __getitem__(self, key) -> Parameters: if isinstance(key, int): return self.nodes_by_step_idx[key] return super().__getitem__(key) - def __repr__(self): + def __repr__(self) -> str: tab = GraphReporter().report(self) renderer = TextTableRenderer(150) tab.render(renderer) return renderer.get_output() - - def total_ops(self): - tot_ops = 0 - for node in self.nodes(): - ops = node.compute_load() - tot_ops += ops if ops else 0 - return tot_ops diff --git a/tools/nntool/graph/types/activations.py b/tools/nntool/graph/types/activations.py index 503a7a25c..59f75d7b9 100644 --- a/tools/nntool/graph/types/activations.py +++ b/tools/nntool/graph/types/activations.py @@ -226,10 +226,6 @@ class TanHActivationParameters(ActivationParameters, CanFuseToExpression): def can_equalize(self): return False - def should_fuse(self, node_set, qrec=None): - # TODO - TanH is only supported in an expression currently - return True - @expression_op(Sigmoid) @cls_op_name('sigmoid') diff --git a/tools/nntool/graph/types/base.py b/tools/nntool/graph/types/base.py index 0dc86024d..e80eb5223 100644 --- a/tools/nntool/graph/types/base.py +++ b/tools/nntool/graph/types/base.py @@ -20,6 +20,7 @@ from expressions.symbolic.symbol import Symbol from generation.at_types.gen_ctrl import CTRL_FEATURES, GenCtrl from graph.dim import Dim, PadDim, StrideDim +from stats.ranges_utils import collect_stat from 
utils.graph import Edge, Node, NodeRef from utils.option_list import OptionList @@ -64,25 +65,13 @@ def clone_dims(dims: Sequence[Dim], hints: Sequence[Dim]): class NNNodeRef(NodeRef): - def __init__(self, node, idx, G) -> None: - super(NNNodeRef, self).__init__(node) - self._G = G - self._idx = idx - - @property - def G(self): - return self._G - - @property - def ref(self): - return ((self._node, self._idx), self._G) def __getattr__(self, name): return getattr(self._node, name) def __setattr__(self, name, val): if name in ['_node', '_G', '_idx']: - super().__setattr__(name, val) + return super().__setattr__(name, val) return setattr(self._node, name, val) def __hasattr__(self, name): @@ -94,26 +83,14 @@ def __str__(self) -> str: def __repr__(self) -> str: return self._node.__repr__() - def __eq__(self, o: object) -> bool: - if isinstance(o, NNNodeRef): - return super().__eq__(o) - return self._node.__eq__(o) - - def __hash__(self) -> int: - return self._node.__hash__() - - def __call__(self, *args, **kwargs): - raise ValueError("this is already a reference") - class Parameters(Node): CLS_OP_NAME = None NARGS = {1} NOT_GENERATED = False - def __init__(self, name, *args, in_dims_hint=None, out_dims_hint=None, **kwargs): - super().__init__(name, *args, **kwargs) - del args, kwargs + def __init__(self, name, in_dims_hint=None, out_dims_hint=None, **kwargs): + super().__init__(name, **kwargs) self._in_dims = None self._out_dims = None self._in_dims_hint = in_dims_hint @@ -134,17 +111,19 @@ def __str__(self): def __repr__(self): return f'{self.__class__.__name__}({self.name})' - def __call__(self, *args, **kwargs): + @property + def _edge_class(self): + return NNEdge + + @property + def _noderef_class(self): + return NNNodeRef + + def __call__(self, *args, num_outputs=1, **kwargs): # set of number of args if isinstance(self.NARGS, set): if '*' not in self.NARGS and len(args) not in self.NARGS: raise ValueError("incorrect number of arguments") - inputs, fragments = [], 
[] - for arg in args: - if arg is not None and not isinstance(arg, NNNodeRef): - raise ValueError("expecting NNNodeRef") - inputs.append(arg.ref[0] if arg else None) - fragments.append(arg.ref[1] if arg else None) # list of possible inputs passed in kwargs. Things passed in args get # copied to kwargs with their index from the names in nargs @@ -153,32 +132,37 @@ def __call__(self, *args, **kwargs): if idx >= len(self.nargs): raise ValueError('Too many inputs for this node type') kwargs[self.nargs[idx]] = arg - inputs = [] - fragments = [] + args = [] for name in self.nargs: if name in kwargs: ref = kwargs[name] if not isinstance(ref, NNNodeRef): raise ValueError("expecting NNNodeRef") - inputs.append(ref[0]) - fragments.append(ref[1]) + args.append(ref) else: - inputs.append(None) - fragments.append(None) - if inputs[0] is None: + args.append(None) + if args[0] is None: raise ValueError('Expecting at least an input') - fragment = [frag for frag in fragments if frag is not None][0] - if len(fragments) > 1: - for other in fragments[1::]: - if other is not None: - fragment.merge(other) - for to_idx, from_tuple in enumerate(inputs): - if from_tuple is not None: - from_node, from_idx = from_tuple - fragment.add_edge(NNEdge(from_node=from_node, - from_idx=from_idx, to_node=self, to_idx=to_idx)) - return NNNodeRef(self, 0, fragment) + return super().__call__(*args, num_outputs=num_outputs) + + @property + def no_model_code(self) -> bool: + """Returns True if node results in no kernel, global or local generation in model + + Returns: + bool: True if nothing generated + """ + return False + + @property + def does_nothing(self) -> bool: + """Returns True if the node does not modify its input in any way + + Returns: + bool: True if node could be eliminated with no effect + """ + return False @property def graph_label(self): @@ -314,6 +298,9 @@ def can_equalize(self): def op_name(self): return self.CLS_OP_NAME + def details_collector(self, stats, stat, details): + pass + def 
compute_load(self): return None @@ -531,6 +518,8 @@ def __init__(self, *args, filt=None, has_bias=True, use_compressed=False, **kwar self.details = None self.at_options.update_valid_options(CTRL_FEATURES) + def details_collector(self, stats, stat, details): + collect_stat(stat, 'range_acc', details, details_name='acc') class MultiplicativeBiasParameters(FilterParameters): def __init__(self, *args, **kwargs): @@ -538,6 +527,11 @@ def __init__(self, *args, **kwargs): self.has_mul_bias = False self._mul_biases = None + def details_collector(self, stats, stat, details): + super().details_collector(stats, stat, details) + if self.mul_biases: + collect_stat(stat, 'range_pre_mul_bias', details, details_name='pre_mul_bias') + @property def mul_biases(self): return self._mul_biases @@ -645,4 +639,3 @@ def __init__(self, from_node: Union[str, Node], to_node: Union[str, Node], from_idx: int = 0, to_idx: int = 0): super().__init__(from_node, to_node, from_idx, to_idx) self.params = params - diff --git a/tools/nntool/graph/types/constant_input.py b/tools/nntool/graph/types/constant_input.py index 940deecd1..a7a55fb02 100644 --- a/tools/nntool/graph/types/constant_input.py +++ b/tools/nntool/graph/types/constant_input.py @@ -53,7 +53,7 @@ def __init__(self, *args, adjust_transpose=None, is_mutated=False, def __call__(self, graph): if graph.__class__.__name__ != 'NNGraph': raise ValueError('expecting NNGraph as parameter') - return NNNodeRef(self, 0, graph) + return NNNodeRef(graph, self, 0) @classmethod def fake(cls, G, val): diff --git a/tools/nntool/graph/types/expression_fusion.py b/tools/nntool/graph/types/expression_fusion.py index 51d7ac65f..43afedf3b 100644 --- a/tools/nntool/graph/types/expression_fusion.py +++ b/tools/nntool/graph/types/expression_fusion.py @@ -13,6 +13,7 @@ # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see . 
+from copy import deepcopy import logging from collections import Counter @@ -121,6 +122,19 @@ def set_min_max(qrecs, symbol, node): qtype = qrec.out_qs[0] symbol.control.add_min_max(symbol, qtype.min_val, qtype.max_val) + def details_collector(self, stats, stat, details): + if 'expression' in stat: + stat = stat['expression'] + for sym_name, rec in details.items(): + if sym_name == "results": + continue + stat_rec = stat.setdefault( + sym_name, {'min': float('inf'), 'max': float('-inf')}) + stat_rec['min'] = min(stat_rec['min'], rec['min']) + stat_rec['max'] = max(stat_rec['max'], rec['max']) + else: + stat['expression'] = deepcopy(details) + def is_same_operation_as(self, G, other): if not isinstance(other, ExpressionFusionParameters): return False @@ -182,6 +196,8 @@ def get_output_size(self, in_dims): if tuple(in_vars[idx].shape) != shape: in_vars[idx].shape = shape dim_change = True + if dim_change: + self.func_col.set_var_shapes() out_dims = super().get_output_size(in_dims) if dim_change: # if the input shapes haven't changed then the output shapes have not changed out_vars = [self.func_col.variables[name] for name in self.output_symbols] diff --git a/tools/nntool/graph/types/fusions.py b/tools/nntool/graph/types/fusions.py index 01d3d33f0..8e894dbe6 100644 --- a/tools/nntool/graph/types/fusions.py +++ b/tools/nntool/graph/types/fusions.py @@ -19,6 +19,8 @@ from graph.types.others import PadParameters from graph.types.pooling import PoolingParameters from graph.types.tensor_arithmetic import Broadcastable +from utils.graph import GraphView +from utils.node_id import NodeId from ..dim import Dim from .base import (FilterParameters, NNEdge, NodeOptions, Parameters, @@ -69,7 +71,7 @@ class FusionBase(Parameters): fusion_op_name = "!!NOT SET!!" 
quantize_internals = True - def __init__(self, name, *args, fusion_type=None, subgraph=None, + def __init__(self, name, *args, fusion_type=None, subgraph: GraphView=None, input_mapping=None, output_mapping=None, in_dims=None, out_dims=None, @@ -157,11 +159,11 @@ def op_name(self): return self.fusion_op_name @property - def subgraph(self): + def subgraph(self) -> GraphView: return self._subgraph def contained_nodes(self): - return [node for node in self.subgraph.dfs() + return [node for node in self.subgraph.topological_sort() if not isinstance(node, FusionInputOutputParameters)] def get_contained_node(self, name): @@ -197,7 +199,7 @@ def get_parameter_size(self): def get_output_size(self, in_dims): node_out_dims = [] - for node in self.subgraph.dfs(): + for node in self.subgraph.topological_sort(): if isinstance(node, FusionInputParameters): node_in_dims = [self.clone_dim_with_hint( in_dims[node.idx], node.idx)] diff --git a/tools/nntool/graph/types/input_output.py b/tools/nntool/graph/types/input_output.py index 7a67de6a4..9a7ef936f 100644 --- a/tools/nntool/graph/types/input_output.py +++ b/tools/nntool/graph/types/input_output.py @@ -131,7 +131,7 @@ def __init__(self, *args, **kwargs): def __call__(self, graph): if graph.__class__.__name__ != 'NNGraph': raise ValueError('expecting NNGraph as parameter') - return NNNodeRef(self, 0, graph) + return NNNodeRef(graph, self, 0) def verify(self, G): problems = [] diff --git a/tools/nntool/graph/types/others.py b/tools/nntool/graph/types/others.py index 2d033305c..70bacc828 100644 --- a/tools/nntool/graph/types/others.py +++ b/tools/nntool/graph/types/others.py @@ -15,6 +15,7 @@ import logging import math +from functools import reduce import numpy as np from expressions.symbolic.basic import (Abs, Ceil, Cos, Exp, Log, Max, Min, @@ -24,7 +25,7 @@ from utils.real_transpose import real_transpose from .base import (CanFuseToExpression, ComparableParameters, - InsensitiveToQuantization, NNNodeRef, + 
InsensitiveToQuantization, NoSizeChangeParameters, Parameters, SensitiveToOrder, SingleInputAndOutput, cls_op_name, expression_op, nargs, not_generated) @@ -63,7 +64,8 @@ def get_parameter_size(self): def permute(self, val): return [val[i] for i in self.transpose] - def does_nothing(self): + @property + def no_model_code(self): if not self.transpose: return True if not self.in_dims or not self.in_dims[0]: @@ -75,9 +77,13 @@ def does_nothing(self): for idx in trans if shape_idx[idx] is not None] return shape_trans == sorted(shape_trans) + @property + def does_nothing(self) -> bool: + return self._transpose is None + @property def is_not_generated(self): - return self.does_nothing() + return self.does_nothing def is_same_operation_as(self, G, other): if not isinstance(other, TransposeParameters): @@ -174,7 +180,7 @@ def __str__(self): @cls_op_name('quantize') -class QuantizeParameters(Parameters): +class QuantizeParameters(Parameters, ComparableParameters): def __init__(self, *args, from_qtype=None, to_qtype=None, inserted_by_quantizer=False, **kwargs): @@ -187,6 +193,11 @@ def __init__(self, *args, from_qtype=None, to_qtype=None, def get_parameter_size(self): return 0 + def is_same_operation_as(self, G, other): + return (isinstance(other, QuantizeParameters) and + self.from_qtype == other.from_qtype and + self.to_qtype == other.to_qtype) + @property def can_equalize(self): return False @@ -224,10 +235,11 @@ def __str__(self): @not_generated class ConcatParameters(Parameters, SensitiveToOrder): - def __init__(self, *args, axis=None, axis_hint=None, **kwargs): + def __init__(self, *args, axis=None, **kwargs): super(ConcatParameters, self).__init__(*args, **kwargs) + if axis is None: + raise ValueError("axis must be set") self._axis = axis - self._axis_hint = axis_hint @property def graph_label(self): @@ -245,6 +257,10 @@ def axis(self): def axis(self, val): self._axis = val + @property + def does_nothing(self) -> bool: + return self.in_dims and len(self.in_dims) == 1 
+ def get_parameter_size(self): return 0 @@ -252,9 +268,16 @@ def get_parameter_size(self): def can_equalize(self): return False + @property + def offsets(self): + return reduce( + lambda state, in_dim: ( + state[0] + [state[1]], state[1] + in_dim.shape[self.axis]), + self.in_dims, + ([], 0) + )[0] + def get_output_size(self, in_dims): - if in_dims[0].is_named and self._axis_hint: - self._axis = in_dims[0].get_order_idx(self._axis_hint) out_dim = Dim.combine([in_dim for in_dim in in_dims], self.axis) return [out_dim] @@ -281,8 +304,11 @@ def __init__(self, *args, self.axis = axis def __call__(self, *args, **kwargs): - noderef = super(SplitParameters, self).__call__(*args, **kwargs) - return tuple(NNNodeRef(self, i, noderef.ref[1]) for i in range(len(self.act_slices))) + return super().__call__(*args, num_outputs=len(self.act_slices), **kwargs) + + @property + def does_nothing(self) -> bool: + return self.out_dims and len(self.out_dims) == 1 @property def graph_label(self): @@ -308,7 +334,8 @@ def get_splits(in_shape, axis, splits=None, num_splits=None): if splits: if in_shape[axis] is not None and any(split == -1 for split in splits): rest_sz = sum(split for split in splits if split > 0) - splits = (split if split > 0 else in_shape[axis] - rest_sz for split in splits) + splits = (split if split > + 0 else in_shape[axis] - rest_sz for split in splits) for sz in splits: act_slices.append([(in_idx, in_idx + sz, 1) if idx == axis else (0, shape, 1) for idx, shape in enumerate(in_shape) @@ -392,7 +419,6 @@ def can_equalize(self): def __str__(self): return "A %s I %s" % (self.axis, self.indices) - @cls_op_name('strided_slice') class StridedSliceParameters(Parameters, SingleInputAndOutput, ComparableParameters, InsensitiveToQuantization): @@ -403,7 +429,8 @@ def __init__(self, *args, super(StridedSliceParameters, self).__init__(*args, **kwargs) self.act_slice = act_slice - self.slice_shape = tuple(int(abs(math.ceil((sl[1] - sl[0])/sl[2]))) for sl in self.act_slice) + 
self.slice_shape = tuple( + int(abs(math.ceil((sl[1] - sl[0])/sl[2]))) for sl in self.act_slice) self.out_shape = tuple(out_shape) @property @@ -414,6 +441,34 @@ def graph_label(self): def graph_anon_label(self): return ['Slice'] + ["(%s,%s,%s)" % elem for elem in self.act_slice] + @property + def slice_shape(self): + return self._slice_shape + + @slice_shape.setter + def slice_shape(self, val): + self._slice_shape = tuple(val) + + @property + def slices_axes(self): + in_shape = self.in_dims[0].shape + return tuple(idx for idx, shapes in enumerate(zip(self.post_slice_shape, in_shape)) if shapes[0] != shapes[1]) + + @property + def post_slice_shape(self): + old_settings = np.seterr(all='raise') + res = tuple(abs(((sl[1] if sl[1] >= -1 else -1) - sl[0])//sl[2]) for sl in self.act_slice) + np.seterr(**old_settings) + return res + + @property + def changes_shape(self): + return self.post_slice_shape != self.out_shape + + @property + def can_equalize(self): + return False + def numpy_slice(self, arr: np.ndarray): slice_spec = [slice(elem[0], elem[1], elem[2]) for elem in self.act_slice if len(elem) == 3] @@ -447,12 +502,14 @@ def only_slices(self, axis): for idx, dim in enumerate(self.in_dims[0].shape) if axis != idx) @property - def post_slice_shape(self): - return [(sl[1] - sl[0])//sl[2] for sl in self.act_slice] + def does_nothing(self) -> bool: + return self.no_model_code and not self.changes_shape @property - def changes_shape(self): - return len(self.post_slice_shape) > len(self.out_shape) + def no_model_code(self) -> bool: + if not self.in_dims: + return False + return self.post_slice_shape == tuple(self.in_dims[0].shape) def get_parameter_size(self): return 0 @@ -460,10 +517,6 @@ def get_parameter_size(self): def get_output_size(self, in_dims): return [Dim.unnamed(self.out_shape)] - @property - def can_equalize(self): - return False - def __str__(self): return ",".join("(%s,%s,%s)" % elem for elem in self.act_slice) @@ -675,14 +728,19 @@ def __init__(self, 
*args, old_shape=None, shape=None, **kwargs): @property def graph_label(self): - return [self.name, f'{self.old_shape} to {self.shape}'] + return [f'Reshape({self.name})', f'{self.old_shape} to {self.shape}'] @property def graph_anon_label(self): return ['Reshape', f'{self.old_shape} to {self.shape}'] + @property def does_nothing(self): - return self.shape.layout_shape == self.old_shape.layout_shape + return tuple(self.shape.shape) == tuple(self.old_shape.shape) + + @property + def no_model_code(self) -> bool: + return True def get_parameter_size(self): return 0 @@ -691,7 +749,7 @@ def exp_red_pattern(self): """ If the reshape is an expand or reduce dim i.e. adds or removes 1 size axes then return a pattern with True indicating an added axis, False a removed axis and None an unchanged axis""" - if not self.does_nothing(): + if not self.does_nothing: return None res = [] s1 = self._old_shape.shape.copy() @@ -781,6 +839,14 @@ def get_parameter_size(self): def can_equalize(self): return False + @property + def no_model_code(self) -> bool: + return True + + @property + def does_nothing(self) -> bool: + return True + def compute_load(self): return 0 diff --git a/tools/nntool/graph/types/rnn.py b/tools/nntool/graph/types/rnn.py index 12f0912e0..aa3a01fdb 100644 --- a/tools/nntool/graph/types/rnn.py +++ b/tools/nntool/graph/types/rnn.py @@ -18,6 +18,7 @@ from graph.dim import Dim from graph.types import (ConstantInputParameters, NNEdge, Parameters, SensitiveToOrder, SingleInputAndOutput) +from stats.ranges_utils import collect_stat from .base import cls_op_name, nargs @@ -60,6 +61,10 @@ def graph_label(self): def graph_anon_label(self): return ["Filt"] + def details_collector(self, stats, stat, details): + for k in filter(lambda x: x.startswith('range_'), details): + collect_stat(stat, k, details[k]) + def get_parameter_size(self): return 0 diff --git a/tools/nntool/graph/types/ssd.py b/tools/nntool/graph/types/ssd.py index 826b1b73c..a18fe59d4 100644 --- 
a/tools/nntool/graph/types/ssd.py +++ b/tools/nntool/graph/types/ssd.py @@ -17,7 +17,7 @@ from graph.dim import Dim -from .base import NNNodeRef, Parameters, SensitiveToOrder, cls_op_name, nargs +from .base import Parameters, SensitiveToOrder, cls_op_name, nargs LOG = logging.getLogger("nntool." + __name__) @@ -41,14 +41,26 @@ def __init__(self, *args, parameters=None, **kwargs): self.nms_config = {'using_json_config': {'INCLUDE': False, 'json_config_path': ''}, 'using_pipeline_config': {'INCLUDE': False, 'pipeline_config_path': ''}, 'using_params': {'INCLUDE': True, 'params': self._parameters}} + self.at_options.valid_options['NMS_SCORE_THRESHOLD'] = float + self.at_options.valid_options['NMS_IOU_THRESHOLD'] = float + self.nms_score_threshold = self._parameters['nms_score_threshold'] + self.nms_iou_threshold = self._parameters['nms_iou_threshold'] + self._output_detection_count = True def __call__(self, *args, **kwargs): - noderef = super(SSDDetectorParameters, self).__call__(*args, **kwargs) - return tuple(NNNodeRef(self, i, noderef.ref[1]) for i in range(3)) + return super().__call__(*args, num_outputs=4 if self._output_detection_count else 3, **kwargs) def get_parameter_size(self): return 0 + @property + def output_detection_count(self): + return self._output_detection_count + + @output_detection_count.setter + def output_detection_count(self, val): + self._output_detection_count = val + @property def can_equalize(self): return False @@ -69,14 +81,6 @@ def w_scale(self): def h_scale(self): return self._parameters['h_scale'] - @property - def nms_score_threshold(self): - return self._parameters['nms_score_threshold'] - - @nms_score_threshold.setter - def nms_score_threshold(self, val): - self._parameters['nms_score_threshold'] = val - @property def max_bb_before_nms(self): return self._parameters['max_bb_before_nms'] @@ -95,7 +99,19 @@ def use_exp_for_wh_decode(self, val): @property def nms_iou_threshold(self): - return self._parameters['nms_iou_threshold'] + 
return self.at_options.nms_iou_threshold + + @nms_iou_threshold.setter + def nms_iou_threshold(self, val): + self.at_options.nms_iou_threshold = val + + @property + def nms_score_threshold(self): + return self.at_options.nms_score_threshold + + @nms_score_threshold.setter + def nms_score_threshold(self, val): + self.at_options.nms_score_threshold = val @property def max_detections(self): @@ -108,18 +124,18 @@ def max_classes_per_detection(self): def get_output_size(self, in_dims): num_detected_boxes = self._parameters['max_detections'] * \ self._parameters['max_classes_per_detection'] - return [ - Dim(shape=[num_detected_boxes, 4], is_ordered=True), - Dim(shape=[num_detected_boxes], is_ordered=True), - Dim(shape=[num_detected_boxes], is_ordered=True), - Dim(shape=[num_detected_boxes], is_ordered=True), + outputs = [ + Dim.unnamed([num_detected_boxes, 4]), + Dim.unnamed([num_detected_boxes]), + Dim.unnamed([num_detected_boxes]), ] + if self.output_detection_count: + outputs.append(Dim.unnamed([1])) + return outputs def __str__(self): - return "{} SCORE_THR {:.2f} IOU_THR {:.2f}".format( - self.at_options, - self.nms_score_threshold, - self.nms_iou_threshold + return "{}".format( + self.at_options ) @@ -148,10 +164,13 @@ def __init__(self, *args, parameters=None, in_dims_hint=None, out_dims_hint=None self._ker_in_order = [['batch', 'spatial_dim', 'box'], [ 'batch', 'class', 'spatial_dim']] self._ker_out_order = [['spatial_dim', 'index']] + self.at_options.valid_options['NMS_SCORE_THRESHOLD'] = float + self.at_options.valid_options['NMS_IOU_THRESHOLD'] = float + self.nms_score_threshold = self._parameters['nms_score_threshold'] + self.nms_iou_threshold = self._parameters['nms_iou_threshold'] def __call__(self, *args, **kwargs): - noderef = super(NMSParameters, self).__call__(*args, **kwargs) - return tuple(NNNodeRef(self, i, noderef.ref[1]) for i in range(2)) + return super().__call__(*args, num_outputs=2, **kwargs) def get_parameter_size(self): return 0 @@ -160,17 
+179,21 @@ def get_parameter_size(self): def can_equalize(self): return False + @property + def nms_iou_threshold(self): + return self.at_options.nms_iou_threshold + + @nms_iou_threshold.setter + def nms_iou_threshold(self, val): + self.at_options.nms_iou_threshold = val + @property def nms_score_threshold(self): - return self._parameters['nms_score_threshold'] + return self.at_options.nms_score_threshold @nms_score_threshold.setter def nms_score_threshold(self, val): - self._parameters['nms_score_threshold'] = val - - @property - def nms_iou_threshold(self): - return self._parameters['nms_iou_threshold'] + self.at_options.nms_score_threshold = val @property def max_output_boxes_per_class(self): @@ -192,8 +215,6 @@ def get_output_size(self, in_dims): ] def __str__(self): - return "{} SCORE_THR {:.2f} IOU_THR {:.2f}".format( - self.at_options, - self.nms_score_threshold, - self.nms_iou_threshold + return "{}".format( + self.at_options ) diff --git a/tools/nntool/importer/common/broadcast_mixin.py b/tools/nntool/importer/common/broadcast_mixin.py index e23c1bf8e..8a0627dd8 100644 --- a/tools/nntool/importer/common/broadcast_mixin.py +++ b/tools/nntool/importer/common/broadcast_mixin.py @@ -19,11 +19,16 @@ from .provisional_dim import ProvisionalDim +# reduces broadcasted constants on unknown dimensions. 
+# Setting this to false can provoke conception errors in matchers +FIX_CONSTANTS = True class BroadcastMixin(object): @classmethod - def get_broadcasted_shape(cls, x, y): + def get_broadcasted_shape(cls, x, y, is_constant=None): + if is_constant is None: + is_constant = (False, False) if len(x) < len(y): x = ([1] * (len(y) - len(x))) + x elif len(y) < len(x): @@ -34,9 +39,20 @@ def get_broadcasted_shape(cls, x, y): "{} and {} cannot be broadcasted".format(x, y) def broad(elem_x, elem_y): - if elem_x is None or elem_y is None: - return None - return elem_x if elem_y == 1 else elem_y + # if one element is not None then take it since that dimension will be broadcasted + if elem_x is None: + if elem_y is None or (FIX_CONSTANTS and is_constant[1] and elem_y == 1): + return None + else: + return elem_y + else: + if elem_y is None: + if FIX_CONSTANTS and is_constant[0] and elem_x == 1: + return None + else: + return elem_x + else: + return elem_x if elem_y == 1 else elem_y return [broad(elem_x, elem_y) for elem_x, elem_y in zip(x, y)] @classmethod @@ -62,8 +78,9 @@ def _fix_constant_inputs(cls, inputs, shape): @classmethod def implied_broadcast(cls, inputs): + is_constant = [isinstance(inp[0], ConstantInputParameters) for inp in inputs] x = inputs[0][2].shape y = inputs[1][2].shape - shape = cls.get_broadcasted_shape(x, y) + shape = cls.get_broadcasted_shape(x, y, is_constant=is_constant) cls._fix_constant_inputs(inputs, shape) return [ProvisionalDim(shape)] diff --git a/tools/nntool/importer/common/check_batchdim.py b/tools/nntool/importer/common/check_batchdim.py new file mode 100644 index 000000000..07d2dd098 --- /dev/null +++ b/tools/nntool/importer/common/check_batchdim.py @@ -0,0 +1,34 @@ +# Copyright (C) 2022 GreenWaves Technologies, SAS + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at 
your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . + +from graph.types.base import NNEdge +from graph.types.others import ReshapeParameters +from importer.common.provisional_dim import ProvisionalDim + + +def check_batchdim(G, x, valid_name): + x_shape = x[2].shape + if x_shape[0] is not None: + if x_shape[0] != 1: + raise NotImplementedError( + f'{valid_name} pool is on more than one batch. This is not supported') + reshape = ReshapeParameters(G.unique_name(f'{valid_name}_reshape'), + old_shape=tuple(x_shape), shape=tuple(x_shape[1:])) + G.add_edge(NNEdge(from_node=x[0], from_idx=x[1], to_node=reshape)) + x_shape[0] = None + if len(x) == 3: + return (reshape, 0, ProvisionalDim(x_shape)) + return (reshape, 0, ProvisionalDim(x_shape), x[3]) + return x diff --git a/tools/nntool/importer/importer.py b/tools/nntool/importer/importer.py index 035c4c2f6..db57003a4 100644 --- a/tools/nntool/importer/importer.py +++ b/tools/nntool/importer/importer.py @@ -20,8 +20,8 @@ from .tflite2.tflite import NNGraph, TFLiteImporter GRAPH_IMPORTERS = { - 'onnx': {'matches':[r".*\.onnx$"], 'importer':OnnxImporter, 'loader': None}, - 'tflite': {'matches':[r".*\.tflite$"], 'importer':TFLiteImporter, 'loader': None}, + 'onnx': {'matches':[r".*\.onnx$"], 'importer':OnnxImporter}, + 'tflite': {'matches':[r".*\.tflite$"], 'importer':TFLiteImporter}, } class ImportException(Exception): @@ -51,7 +51,6 @@ def create_graph(filename: str, graph_format: str = None, opts: Mapping = None) if re.search(match, filename): importer = v['importer']() graph = importer.create_graph(filename, opts) - graph.set_load_function(v['loader']) return graph raise 
ValueError("Graph importer not found") diff --git a/tools/nntool/importer/onnx/common/handler_helper.py b/tools/nntool/importer/onnx/common/handler_helper.py index 1a0102ea0..e6d32a8bf 100644 --- a/tools/nntool/importer/onnx/common/handler_helper.py +++ b/tools/nntool/importer/onnx/common/handler_helper.py @@ -14,12 +14,14 @@ # along with this program. If not, see . from onnx import defs +from onnx.defs import SchemaError from .. import common # pylint: disable=wildcard-import,unused-wildcard-import from ..handlers.backend import * # noqa from ..handlers.backend_handler import BackendHandler + def get_opset_status(): ops = [] onnx_ops = {} @@ -33,7 +35,8 @@ def get_opset_status(): counts_by_domain = {} for handler in BackendHandler.__subclasses__(): handler.check_cls() - counts_by_domain.setdefault(handler.DOMAIN, [0, onnx_ops.get(handler.DOMAIN, 0)]) + counts_by_domain.setdefault( + handler.DOMAIN, [0, onnx_ops.get(handler.DOMAIN, 0)]) counts_by_domain[handler.DOMAIN][0] += 1 ops.append([ handler.DOMAIN, @@ -47,6 +50,7 @@ def get_opset_status(): ]) return ops, counts_by_domain + def get_all_backend_handlers(opset_dict): """ Get a dict of all backend handler classes. e.g. {'domain': {'Abs': Abs handler class}, ...}, }. @@ -65,14 +69,16 @@ def get_all_backend_handlers(opset_dict): since_version = 1 if defs.has(handler.ONNX_OP, domain=handler.DOMAIN): try: - since_version = defs.get_schema( # @IgnoreException + since_version = defs.get_schema( #@IgnoreException handler.ONNX_OP, domain=handler.DOMAIN, max_inclusive_version=version).since_version - except RuntimeError: - common.logger.debug("Fail to get since_version of %s in domain `%s` " - "with max_inclusive_version=%s. 
Set to 1.", - handler.ONNX_OP, handler.DOMAIN, version) + except (SchemaError, RuntimeError): + versions = sorted([int(ver_func[len('varsion_'):]) for ver_func in dir(handler) if ver_func.startswith('version_')]) + since_version = versions[0] if versions else 1 + common.logger.debug( + f"Fail to load schema of {handler.ONNX_OP} in domain `{handler.DOMAIN}` " + f"with max_inclusive_version=version. Since version set to {since_version}.") else: common.logger.debug("Unknown op %s in domain `%s`.", handler.ONNX_OP, handler.DOMAIN or "ai.onnx") diff --git a/tools/nntool/importer/onnx/handlers/backend/concat_from_sequence.py b/tools/nntool/importer/onnx/handlers/backend/concat_from_sequence.py index 0b4654fdc..bfab732c8 100644 --- a/tools/nntool/importer/onnx/handlers/backend/concat_from_sequence.py +++ b/tools/nntool/importer/onnx/handlers/backend/concat_from_sequence.py @@ -71,11 +71,3 @@ def _common(cls, node, **kwargs): @classmethod def version_11(cls, node, **kwargs): return cls._common(node, **kwargs) - - @classmethod - def version_9(cls, node, **kwargs): - return cls._common(node, **kwargs) - - @classmethod - def version_1(cls, node, **kwargs): - return cls._common(node, **kwargs) diff --git a/tools/nntool/importer/onnx/handlers/backend/equal.py b/tools/nntool/importer/onnx/handlers/backend/equal.py new file mode 100644 index 000000000..fcbaabbeb --- /dev/null +++ b/tools/nntool/importer/onnx/handlers/backend/equal.py @@ -0,0 +1,60 @@ +# Copyright (C) 2020 GreenWaves Technologies, SAS + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . + + +from graph.dim import Dim +from graph.types import ConstantInputParameters +from importer.common.constant_mixin import ConstantMixin +from importer.common.provisional_dim import ProvisionalDim + +from ..backend_handler import BackendHandler +from ..handler import onnx_op + + +@onnx_op("Equal") +class Equal(ConstantMixin, BackendHandler): + + @classmethod + def _common(cls, node, **kwargs): + all_nodes = kwargs['all_nodes'] + G = kwargs['G'] + valid_name = kwargs['valid_name'] + inputs = [all_nodes[inp] for inp in node.input] + x = inputs[0] + x_shape = x[2].shape + if all(cls.is_constant(inp) for inp in inputs): + a = cls.get_constant(inputs[0]) + b = cls.get_constant(inputs[1]) + params = ConstantInputParameters(valid_name, dims=Dim.unnamed(a.shape), value=(a==b)) + else: + raise ValueError("ONNX Equal operator is not implemented") + all_nodes[node.output[0]] = (params, 0, ProvisionalDim(x_shape), None) + return params + + @classmethod + def version_1(cls, node, **kwargs): + return cls._common(node, **kwargs) + + @classmethod + def version_7(cls, node, **kwargs): + return cls._common(node, **kwargs) + + @classmethod + def version_11(cls, node, **kwargs): + return cls._common(node, **kwargs) + + @classmethod + def version_13(cls, node, **kwargs): + return cls._common(node, **kwargs) diff --git a/tools/nntool/importer/onnx/handlers/backend/gather.py b/tools/nntool/importer/onnx/handlers/backend/gather.py index 1a2761e3d..911396330 100644 --- a/tools/nntool/importer/onnx/handlers/backend/gather.py +++ b/tools/nntool/importer/onnx/handlers/backend/gather.py @@ -13,13 +13,14 @@ # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see . 
-import copy -from graph.types.others import StridedSliceParameters + import numpy as np from graph.types import ConstantInputParameters, GatherParameters, NNEdge +from graph.types.others import ReshapeParameters, StridedSliceParameters from importer.common.constant_mixin import ConstantMixin from importer.common.provisional_dim import ProvisionalDim from importer.onnx.common import logger +from utils.numpy_helpers import np_asscalar from ..backend_handler import BackendHandler from ..handler import onnx_op @@ -40,21 +41,31 @@ def _common(cls, node, **kwargs): indices = cls.get_constant(y) axis = node.attrs.get('axis', 0) - pshape = ProvisionalDim(x_shape[:axis:] + list(indices.shape) + x_shape[axis + 1:]) + pshape = ProvisionalDim( + x_shape[:axis:] + list(indices.shape) + x_shape[axis + 1:]) if cls.is_constant(x): x_val = cls.get_constant(x) - logger.info(f"reducing {valid_name} to a constant {cls.print_small(x_val)}") - params = ConstantInputParameters(valid_name, value=np.take(x_val, indices, axis=axis)) + logger.info( + f"reducing {valid_name} to a constant {cls.print_small(x_val)}") + params = ConstantInputParameters(valid_name, value=np.take( + x_val, indices.astype(np.int64), axis=axis)) else: if np.ndim(indices) <= 1: - idx = np.asscalar(indices) - act_slice = tuple([(0, dim, 1) if i != axis else (idx, idx+1, 1) for i, dim in enumerate(x_shape) if dim is not None]) + idx = np_asscalar(indices) + act_slice = tuple([(0, dim, 1) if i != axis else ( + idx, idx+1, 1) for i, dim in enumerate(x_shape) if dim is not None]) out_shape = pshape.known_shape.copy() - params = StridedSliceParameters(valid_name, act_slice=act_slice, out_shape=out_shape) + params = StridedSliceParameters( + valid_name, act_slice=act_slice, out_shape=out_shape) + if params.post_slice_shape == tuple(x[2].known_shape): + params = ReshapeParameters(valid_name, old_shape=tuple( + x[2].known_shape), shape=out_shape) else: axis = cls._trim_axis(axis, x_shape) - params = 
GatherParameters(valid_name, axis=axis, indices=indices) - G.add_edge(NNEdge(from_node=x[0], to_node=params, from_idx=x[1], to_idx=0)) + params = GatherParameters( + valid_name, axis=axis, indices=indices) + G.add_edge( + NNEdge(from_node=x[0], to_node=params, from_idx=x[1], to_idx=0)) all_nodes[node.output[0]] = (params, 0, pshape, x[3]) return params diff --git a/tools/nntool/importer/onnx/handlers/backend/mat_mul_mixin.py b/tools/nntool/importer/onnx/handlers/backend/mat_mul_mixin.py index 72c18169e..3324a1da2 100644 --- a/tools/nntool/importer/onnx/handlers/backend/mat_mul_mixin.py +++ b/tools/nntool/importer/onnx/handlers/backend/mat_mul_mixin.py @@ -101,8 +101,8 @@ def _handle(cls, node, quantized=False, **kwargs): NNEdge(from_node=y[0], to_node=trans2, from_idx=y[1], to_idx=0)) G.add_edge( NNEdge(from_node=trans2, to_node=params, from_idx=0, to_idx=1)) - biases_params = ConstantInputParameters(f'{valid_name}_biases', dims=Dim.unnamed([out_dims[0].shape[1]]), - value=np.zeros((out_dims[0].shape[1]), dtype=np.float32)) + biases_params = ConstantInputParameters(f'{valid_name}_biases', dims=Dim.unnamed([out_dims[0].shape[-1]]), + value=np.zeros((out_dims[0].shape[-1]), dtype=np.float32)) G.add_edge(NNEdge(from_node=biases_params, to_node=params, to_idx=2)) diff --git a/tools/nntool/importer/onnx/handlers/backend/pool_mixin.py b/tools/nntool/importer/onnx/handlers/backend/pool_mixin.py index 93b977116..69544b786 100644 --- a/tools/nntool/importer/onnx/handlers/backend/pool_mixin.py +++ b/tools/nntool/importer/onnx/handlers/backend/pool_mixin.py @@ -17,8 +17,8 @@ from graph.types import GlobalPoolingParameters, PoolingParameters from graph.types.base import NNEdge from importer.common.provisional_dim import ProvisionalDim +from importer.common.check_batchdim import check_batchdim -from ..handler import partial_support, ps_description from .pad_mixin import PadMixin @@ -31,6 +31,7 @@ def pool(cls, node, pool_type=None, copy_qtype=False, **kwargs): valid_name = 
kwargs['valid_name'] inputs = [all_nodes[inp] for inp in node.input] x = inputs[0] + x = check_batchdim(G, x, valid_name) x_shape = x[2].shape x_feature_shape = x_shape[2::] input_rank = len(x_feature_shape) diff --git a/tools/nntool/importer/onnx/handlers/backend/where.py b/tools/nntool/importer/onnx/handlers/backend/where.py new file mode 100644 index 000000000..e7abcc53b --- /dev/null +++ b/tools/nntool/importer/onnx/handlers/backend/where.py @@ -0,0 +1,53 @@ +# Copyright (C) 2020 GreenWaves Technologies, SAS + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . 
+ +import numpy as np +from graph.dim import Dim +from graph.types import ConstantInputParameters +from importer.common.constant_mixin import ConstantMixin +from importer.common.provisional_dim import ProvisionalDim + +from ..backend_handler import BackendHandler +from ..handler import onnx_op + + +@onnx_op("Where") +class Where(ConstantMixin, BackendHandler): + + @classmethod + def _common(cls, node, **kwargs): + all_nodes = kwargs['all_nodes'] + G = kwargs['G'] + valid_name = kwargs['valid_name'] + inputs = [all_nodes[inp] for inp in node.input] + x = inputs[0] + x_shape = x[2].shape + if all(cls.is_constant(inp) for inp in inputs): + condition = cls.get_constant(inputs[0]) + x = cls.get_constant(inputs[1]) + y = cls.get_constant(inputs[2]) + params = ConstantInputParameters(valid_name, dims=Dim.unnamed(x.shape), value=np.where(condition, x, y)) + else: + raise ValueError("ONNX Where operator is not implemented") + all_nodes[node.output[0]] = (params, 0, ProvisionalDim(x_shape), None) + return params + + @classmethod + def version_9(cls, node, **kwargs): + return cls._common(node, **kwargs) + + @classmethod + def version_16(cls, node, **kwargs): + return cls._common(node, **kwargs) diff --git a/tools/nntool/importer/onnx/onnx.py b/tools/nntool/importer/onnx/onnx.py index ad083709a..a89f4d71f 100644 --- a/tools/nntool/importer/onnx/onnx.py +++ b/tools/nntool/importer/onnx/onnx.py @@ -354,8 +354,14 @@ def _import_nodes(self, G, graph, handlers, all_nodes, outputs, **kwargs): from_idx=producer[1])) banned_inputs.update(node.output) continue - - params = handler.handle(OnnxNode(node), all_nodes=all_nodes, vars_dict=vars_dict, + onode = OnnxNode(node) + inputs = [all_nodes[inp] if inp else None for inp in onode.input] + if inputs: + x = inputs[0] + x_shape = x[2].shape + name = hasattr(node, 'name') and getattr(node, 'name') + x=0 + params = handler.handle(onode, all_nodes=all_nodes, vars_dict=vars_dict, G=G, valid_name=self._node_name(node), 
used_tensors=used_tensors, importer=self, **kwargs) if params is None: diff --git a/tools/nntool/importer/tflite2/handlers/backend/concatenation.py b/tools/nntool/importer/tflite2/handlers/backend/concatenation.py index 21cf0dd24..4a9924777 100644 --- a/tools/nntool/importer/tflite2/handlers/backend/concatenation.py +++ b/tools/nntool/importer/tflite2/handlers/backend/concatenation.py @@ -96,7 +96,7 @@ def red_func(x, y): params = ConstantInputParameters(node.name, value=value) else: axis -= sum(1 if dim is None else 0 for dim in pout_shape[:axis:]) - params = ConcatParameters(node.name, axis=axis, axis_hint=None) + params = ConcatParameters(node.name, axis=axis) for idx, inp in enumerate(inputs): inp_node, inp_idx = cls._maybe_insert_reshape(G, inp, inp_shapes[idx], pout_shape) diff --git a/tools/nntool/importer/tflite2/handlers/backend/pack.py b/tools/nntool/importer/tflite2/handlers/backend/pack.py index 57f16a58b..73371e5fb 100644 --- a/tools/nntool/importer/tflite2/handlers/backend/pack.py +++ b/tools/nntool/importer/tflite2/handlers/backend/pack.py @@ -84,7 +84,7 @@ def _common(cls, node: TFLiteNode, **kwargs): G.add_edge(NNEdge(from_node=inputs[0][0], to_node=params, from_idx=inputs[0][1])) else: axis -= sum(1 if dim is None else 0 for dim in pconcat_out_shape[:axis:]) - params = ConcatParameters(node.name, axis=axis, axis_hint=None) + params = ConcatParameters(node.name, axis=axis) # insert reshapes on each input to add concat axis for idx, inp in enumerate(inputs): diff --git a/tools/nntool/importer/tflite2/handlers/backend/pool_mixin.py b/tools/nntool/importer/tflite2/handlers/backend/pool_mixin.py index 0c56a4034..791ce1bc9 100644 --- a/tools/nntool/importer/tflite2/handlers/backend/pool_mixin.py +++ b/tools/nntool/importer/tflite2/handlers/backend/pool_mixin.py @@ -19,6 +19,7 @@ from importer.common.provisional_dim import ProvisionalDim from importer.tflite2.tflite_schema_head.Pool2DOptions import Pool2DOptions from utils.node_id import NodeId +from 
importer.common.check_batchdim import check_batchdim from .filter_pad_mixin import FilterPadMixin @@ -34,6 +35,8 @@ def pool2d(cls, node, pool_type=None, **kwargs): inputs = [all_nodes[inp] for inp in node.input] x = inputs[0] + x = check_batchdim(G, x, node.name) + x = cls.remove_known_batch_dimension(G, x, node) x_shape = x[2].shape in_c = x_shape[1] diff --git a/tools/nntool/importer/tflite2/handlers/backend/resize_mixin.py b/tools/nntool/importer/tflite2/handlers/backend/resize_mixin.py index e61eb9707..b262697be 100644 --- a/tools/nntool/importer/tflite2/handlers/backend/resize_mixin.py +++ b/tools/nntool/importer/tflite2/handlers/backend/resize_mixin.py @@ -16,6 +16,7 @@ from graph.dim import Dim from graph.types.base import NNEdge +from importer.common.check_batchdim import check_batchdim from utils.node_id import NodeId @@ -32,6 +33,7 @@ def _common(cls, node, **kwargs): inputs = [all_nodes[inp] for inp in node.input] x = inputs[0] + x = check_batchdim(G, x, node.name) new_shape = tuple(cls._verify_constant(inputs[1])) params = params_class(node.name, new_shape=new_shape, diff --git a/tools/nntool/importer/tflite2/handlers/backend/tflite_detection_postprocess.py b/tools/nntool/importer/tflite2/handlers/backend/tflite_detection_postprocess.py index f8c23360e..8bbef202c 100644 --- a/tools/nntool/importer/tflite2/handlers/backend/tflite_detection_postprocess.py +++ b/tools/nntool/importer/tflite2/handlers/backend/tflite_detection_postprocess.py @@ -37,15 +37,10 @@ def _common(cls, node: TFLiteNode, **kwargs): opts = kwargs['opts'] all_nodes = kwargs['all_nodes'] importer = kwargs['importer'] - graph_outputs = kwargs['outputs'] - if len(node.output) > 3 and node.output[3] in graph_outputs: - G.remove(graph_outputs[node.output[3]][0]) - del graph_outputs[node.output[3]] inputs = [all_nodes[t] for t in node.input] outputs = [all_nodes.get(node.output[idx]) if idx < len(node.output) else None - for idx in range(3)] - # inp_shapes = [input[2].shape for input in 
inputs] + for idx in range(4)] if 'max_bb_before_nms' not in custom_opts: custom_opts['max_bb_before_nms'] = 300 @@ -79,9 +74,10 @@ def _common(cls, node: TFLiteNode, **kwargs): dtype=np.int16, scale=2**(-14)) o_scores_qtype = node.input[1].qtype o_class_qtype = QType(scale=1, dtype=np.int8) + o_num_detect = QType(scale=1, dtype=np.int8) qrec = QRec.scaled(in_qs=in_qtypes, out_qs=[o_boxes_qtype, o_class_qtype, - o_scores_qtype]) + o_scores_qtype, o_num_detect]) G.quantization[NodeId(params)] = qrec return params diff --git a/tools/nntool/importer/tflite2/handlers/backend/transpose_conv.py b/tools/nntool/importer/tflite2/handlers/backend/transpose_conv.py index c60de0360..6de3241b5 100644 --- a/tools/nntool/importer/tflite2/handlers/backend/transpose_conv.py +++ b/tools/nntool/importer/tflite2/handlers/backend/transpose_conv.py @@ -22,6 +22,7 @@ from importer.tflite2.tflite_schema_head.Padding import Padding from importer.tflite2.tflite_schema_head.TransposeConvOptions import \ TransposeConvOptions +from importer.common.check_batchdim import check_batchdim from ..backend_handler import BackendHandler from ..handler import tflite_op, partial_support, ps_description @@ -43,12 +44,14 @@ def version_1(cls, node: TFLiteNode, **kwargs): inputs = [all_nodes[t] for t in node.input] x = inputs[2] + x = check_batchdim(G, x, node.name) x_shape = x[2].shape in_b, in_h, in_w, in_c = tuple(x_shape) pout_shape = [dim if x_shape[idx] is not None else None for idx, dim in enumerate(cls.get_constant(inputs[0]))] out_b, out_h, out_w, out_c = tuple(pout_shape) + filt = inputs[1] weights_node = filt[0] filt_shape = filt[2].shape diff --git a/tools/nntool/importer/tflite2/remove_concats.py b/tools/nntool/importer/tflite2/remove_concats.py deleted file mode 100644 index c3a3391ca..000000000 --- a/tools/nntool/importer/tflite2/remove_concats.py +++ /dev/null @@ -1,88 +0,0 @@ -# Copyright (C) 2020 GreenWaves Technologies, SAS - -# This program is free software: you can redistribute it 
and/or modify -# it under the terms of the GNU Affero General Public License as -# published by the Free Software Foundation, either version 3 of the -# License, or (at your option) any later version. - -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU Affero General Public License for more details. - -# You should have received a copy of the GNU Affero General Public License -# along with this program. If not, see . -import logging - -from graph.matches.matcher import find_forward -from graph.types import (ConcatParameters, NNEdge, NoOPParameters, - StridedSliceParameters) -from utils.node_id import NodeId - -LOG = logging.getLogger('nntool.' + __name__) - -def remove_concats(G): - concat_nodes = list([node for node in G.nodes() if isinstance(node, ConcatParameters)]) - strided_slices_removed = [] - concats_removed = [] - for node in concat_nodes: - concat_out_edges = G.indexed_out_edges(node.name)[0] - concat_in_edges = G.indexed_in_edges(node.name) - axis_slices = [] - start_idx = 0 - # find the slice patterns that can match inputs - for in_idx, dim in enumerate(node.in_dims): - slice_patterns = [(start_idx, start_idx + dim.shape[node.axis], 1)] - if dim.shape[node.axis] == 1: - # can also match reversed - slice_patterns.append((start_idx, start_idx - 1, -1)) - axis_slices.append(slice_patterns) - start_idx += dim.shape[node.axis] - for out_edge in concat_out_edges: - edge_lists = find_forward(G, out_edge, - StridedSliceParameters, - skip_node_classes=NoOPParameters) - # each list of edges goes to a strided slice - for edge_list in edge_lists: - edge = edge_list[-1] - assert isinstance(edge.to_node, StridedSliceParameters) - ssp = edge.to_node - LOG.info("found strided slice %s", ssp.name) - # must only slice axis of concat - if not ssp.only_slices_axis(node.axis): - LOG.info("rejected: slices more than one 
axis") - continue - # must match a slice pattern on the input - ssp_slice = ssp.act_slice[node.axis] - - in_idx = None - for idx, slice_patterns in enumerate(axis_slices): - if ssp_slice in slice_patterns: - in_idx = idx - break - if in_idx is None: - LOG.info("rejected: slices pattern matching concat not found") - continue - LOG.info("removing slice %s", ssp.name) - strided_slices_removed.append(ssp.name) - # save the out edges - ssp_out_edges = G.out_edges(ssp.name) - in_edge = concat_in_edges[in_idx] - # remove all the nodes including the ssp - for inter_edge in edge_list: - if G.quantization: - del G.quantization[NodeId(inter_edge.to_node)] - G.remove(inter_edge.to_node) - # connect all the ssp out edges to the node on the concat input - - for ssp_out_edge in ssp_out_edges: - G.add_edge(NNEdge(in_edge.from_node, ssp_out_edge.to_node, - from_idx=in_edge.from_idx, - to_idx=ssp_out_edge.to_idx)) - # if the concat now has no out edges remove it - if G.num_out_edges(node.name) == 0: - LOG.info("removing concat %s", node.name) - concats_removed.append(node.name) - G.remove(node) - - return (strided_slices_removed, concats_removed) diff --git a/tools/nntool/importer/tflite2/tflite.py b/tools/nntool/importer/tflite2/tflite.py index d0e35779d..2566e5906 100644 --- a/tools/nntool/importer/tflite2/tflite.py +++ b/tools/nntool/importer/tflite2/tflite.py @@ -40,7 +40,6 @@ from .common import LOG, check from .common.handler_helper import get_all_backend_handlers from .fix_split_in_edges import fix_split_in_edges -from .remove_concats import remove_concats # pylint: disable=E1101 @@ -104,7 +103,6 @@ def create_graph(self, filename, opts): RemoveReshapesBeforeLinear().match(G) # DrawGraphReporter().report(G) G.add_dimensions() - remove_concats(G) if opts['remove_quantize_ops']: RemoveQuantizeOperators().match(G) G.add_dimensions() diff --git a/tools/nntool/interpreter/commands/adjust.py b/tools/nntool/interpreter/commands/adjust.py index f03975ac8..65c51d756 100644 --- 
a/tools/nntool/interpreter/commands/adjust.py +++ b/tools/nntool/interpreter/commands/adjust.py @@ -47,5 +47,5 @@ def do_adjust(self, args): else: steps = None self.G.adjust_order( - postprocess=not args.no_postprocess, steps=steps, single_step=args.individual_step) + no_postprocess=args.no_postprocess, steps=steps, single_step=args.individual_step) self.G.add_dimensions() diff --git a/tools/nntool/interpreter/commands/aquant.py b/tools/nntool/interpreter/commands/aquant.py index 56b2cf8c8..e66fc99e4 100644 --- a/tools/nntool/interpreter/commands/aquant.py +++ b/tools/nntool/interpreter/commands/aquant.py @@ -14,11 +14,14 @@ # along with this program. If not, see . import argparse +import glob import logging import pickle -import glob +from pathlib import Path + from cmd2 import Cmd2ArgumentParser, with_argparser from cmd2.cmd2 import Cmd +from interpreter.commands.qtune import load_options from interpreter.nntool_shell_base import (NNToolShellBase, store_once_in_history) from interpreter.shell_utils import glob_input_files, input_options @@ -50,6 +53,9 @@ class AquantCommand(NNToolShellBase): parser_aquant.add_argument('--stats', completer_method=Cmd.path_complete, help='pickle file containing statistics') + parser_aquant.add_argument('--json', + completer_method=Cmd.path_complete, + help='json file file containing saved quantization options using qtunesave command') add_options_to_parser(parser_aquant) input_options(parser_aquant) @@ -62,6 +68,16 @@ def do_aquant(self, args: argparse.Namespace): stats_collector = ActivationRangesCollector() # if replaying state file then load the activation stats if they are present opts = get_options_from_args(args) + + if args.json: + json_path = Path(args.json) + if not json_path.exists() or not json_path.is_file(): + self.perror(f'{json_path} does not exist or is not a file') + return + json_opts = load_options(json_path) + json_opts.update(opts) + opts = json_opts + state = 
ConstantInputParameters.save_compression_state(self.G) try: if args.stats: diff --git a/tools/nntool/interpreter/commands/compile_at_model.py b/tools/nntool/interpreter/commands/compile_at_model.py index 75db2c9f8..db5e6cc9a 100644 --- a/tools/nntool/interpreter/commands/compile_at_model.py +++ b/tools/nntool/interpreter/commands/compile_at_model.py @@ -111,7 +111,8 @@ def do_compile(self, args): at_gen_srcs.append(os.path.join(TILER_DSP_GENERATOR_PATH, "DSP_Generators.c")) objs = cc.compile( - srcs + at_gen_srcs, + sources=at_gen_srcs + srcs, + output_dir=args.model_dir, debug=1, extra_preargs=["-g"] ) diff --git a/tools/nntool/interpreter/commands/dsp_preprocessing.py b/tools/nntool/interpreter/commands/dsp_preprocessing.py index e8cf8e192..4d9c1e29c 100644 --- a/tools/nntool/interpreter/commands/dsp_preprocessing.py +++ b/tools/nntool/interpreter/commands/dsp_preprocessing.py @@ -13,6 +13,8 @@ # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see . 
+from graph.types.base import NNEdge +from graph.types.others import ReshapeParameters import json import logging @@ -26,9 +28,11 @@ LOG = logging.getLogger("nntool") + class DSPPreprocessingCommand(NNToolShellBase): # GEN COMMAND parser_compile = Cmd2ArgumentParser() + def inputs_choices(self): if self.G is None: return [] @@ -37,8 +41,8 @@ def inputs_choices(self): def dsp_types(self): return [clas.__name__ for clas in DSPParameters.__subclasses__()] - - parser_dsp = Cmd2ArgumentParser("inserts dsp preprocessing node into graphs") + parser_dsp = Cmd2ArgumentParser( + "inserts dsp preprocessing node into graphs") parser_dsp.add_argument('input_node', choices_method=inputs_choices, help='input node name to format') @@ -49,6 +53,10 @@ def dsp_types(self): help='path to the config file for mfcc') parser_dsp.add_argument('--n_fft', type=int, help="n_fft bins") + parser_dsp.add_argument('--n_frames', type=int, + help="number of frames") + parser_dsp.add_argument('--n_fbanks', type=int, + help="number of filter banks") parser_dsp.add_argument('--frame_size', default=None, type=int, help='frame size in samples') parser_dsp.add_argument('--frame_step', default=None, type=int, @@ -68,6 +76,8 @@ def do_dsp_preprocessing(self, args): magsquared = args.magsquared win_fn = args.window_fn preemp_factor = args.preemp_factor + n_frames = args.n_frames + n_fbanks = args.n_fbanks config_dict = None if args.config_json: with open(args.config_json) as json_file: @@ -78,38 +88,42 @@ def do_dsp_preprocessing(self, args): magsquared = config_dict.get("magsquared", magsquared) win_fn = config_dict.get("window_fn", win_fn) preemp_factor = config_dict.get("preemp_factor", preemp_factor) + n_frames = config_dict.get("n_frames", n_frames) + n_fbanks = config_dict.get("n_fbanks", n_fbanks) assert frame_step, "frame_step is required" - spect_shape = self.G[args.input_node].out_dims[0].shape - if len(spect_shape) > 2: - if 1 in spect_shape: - temp = spect_shape[::-1] - temp.remove(1) - 
spect_shape = temp[::-1] - LOG.info(f"spectrogram shape expected as {spect_shape}") - n_frames = spect_shape[-2] - n_fbanks = spect_shape[-1] + assert n_fbanks and n_frames, "n_frames and n_fbanks are required" + org_input_dim = self.G[args.input_node].out_dims[0] + if org_input_dim.size() != (n_frames * n_fbanks): + raise ValueError( + f"Next layer has dimension {org_input_dim} (size: {org_input_dim.size()}) while you are trying to insert a DSP params with output size of {n_fbanks*n_frames} ({n_frames}x{n_fbanks})") LOG.info(f"N FRAMES: {n_frames}") new_input_size = frame_step * (n_frames - 1) + frame_size if args.dsp_node_type == "MFCCPreprocessingParameters": - dsp_params = MFCCPreprocessingParameters("MfccPreprocessing", conf_dict=config_dict) + dsp_params = MFCCPreprocessingParameters( + "MfccPreprocessing", conf_dict=config_dict) win_lut, fft_twiddles, swaptable, rfft_twiddles = dsp_params.gen_fft_twiddles() melfilt_coeff_sparse_node, melfilt_sparsity_node = dsp_params.gen_melfilter() - dct_matrix_node = dsp_params.gen_dct_matrix()(self.G) if dsp_params.n_dct else None - dsp_params_ref = dsp_params(None, win_lut(self.G) if win_lut else win_lut, fft_twiddles(self.G), swaptable(self.G), rfft_twiddles(self.G), melfilt_sparsity_node(self.G), melfilt_coeff_sparse_node(self.G), dct_matrix_node) + dct_matrix_node = dsp_params.gen_dct_matrix()( + self.G) if dsp_params.n_dct else None + dsp_params(None, win_lut(self.G) if win_lut else win_lut, fft_twiddles(self.G), swaptable(self.G), rfft_twiddles( + self.G), melfilt_sparsity_node(self.G), melfilt_coeff_sparse_node(self.G), dct_matrix_node) elif args.dsp_node_type == "RFFT2DPreprocessingParameters": - dsp_params = RFFT2DPreprocessingParameters("RfftPreprocessing", conf_dict=config_dict) + dsp_params = RFFT2DPreprocessingParameters( + "RfftPreprocessing", conf_dict=config_dict) win_lut, fft_twiddles, swaptable, rfft_twiddles = dsp_params.gen_fft_twiddles() - dsp_params_ref = dsp_params(None, win_lut(self.G) if win_lut 
else win_lut, fft_twiddles(self.G), swaptable(self.G), rfft_twiddles(self.G)) + dsp_params(None, win_lut(self.G) if win_lut else win_lut, fft_twiddles( + self.G), swaptable(self.G), rfft_twiddles(self.G)) - new_input_node = InputParameters(args.input_node, dims=Dim.unnamed([new_input_size])) + new_input_node = InputParameters( + args.input_node, dims=Dim.unnamed([new_input_size])) input_node_edge = self.G.out_edges(args.input_node)[0] input_node_edge.from_node.in_dims[0] = Dim.unnamed([new_input_size]) self.G.insert_node_at_edge(dsp_params, input_node_edge) self.G.replace_node(self.G[args.input_node], new_input_node) + dsp_out_dim = dsp_params.get_output_size([Dim.unnamed([new_input_size])])[0] + if dsp_out_dim != org_input_dim: + reshape = ReshapeParameters("reshape_dsp", old_shape=dsp_out_dim.shape, shape=org_input_dim.shape) + self.G.insert_node_after( + dsp_params, reshape, from_idx=0, edge_class=NNEdge) self.G.add_dimensions() - - - - - diff --git a/tools/nntool/interpreter/commands/dump.py b/tools/nntool/interpreter/commands/dump.py index b665bbfda..ec05f2acc 100644 --- a/tools/nntool/interpreter/commands/dump.py +++ b/tools/nntool/interpreter/commands/dump.py @@ -18,22 +18,56 @@ import pickle import numpy as np -from PIL import Image, ImageDraw from cmd2 import Cmd, Cmd2ArgumentParser, with_argparser - from execution.graph_executer import GraphExecuter from execution.quantization_mode import QuantizationMode -from graph.types import SSDDetectorParameters from interpreter.nntool_shell_base import NNToolShellBase, no_history -from interpreter.shell_utils import (glob_input_files, - input_options) +from interpreter.shell_utils import glob_input_files, input_options +from PIL import Image, ImageDraw +from utils.at_norm import get_do_rounding, set_do_rounding from utils.data_importer import import_data from utils.node_id import NodeId -from utils.at_norm import set_do_rounding, get_do_rounding + +from graph.dump_tensor import PrintDumper, dump_tensor +from 
graph.types import ConstantInputParameters, SSDDetectorParameters LOG = logging.getLogger('nntool.'+__name__) +def print_intermediates(G, outputs, limit=None, width=8, + precision=4, channel=None, order=None, + checksum=False, print_constants=False): + def print_step(step, outs, index): + node = step['node'] + if checksum: + for out_idx, out in enumerate(outs): + if isinstance(node, ConstantInputParameters): + continue + checksum_val = np.sum(out) if out.dtype != np.uint8 else np.sum( + out.astype(np.int8)) + print( + f"S{index} - {node.name}\n\tChecksum = {checksum_val}") + else: + print(node.name) + for out_idx, out in enumerate(outs): + dims = node.out_dims[out_idx] + if order is not None and dims.is_named and order != dims.order and all(k in dims.order + for k in order): + transpose = dims.transpose_to_order(order) + out = out.transpose(transpose) + if channel is not None: + out = out[channel:channel+1:1, ...] + dump_tensor(out, PrintDumper( + out, width=width, precision=precision)) + + if limit is not None: + print_step(G.graph_state.steps[limit], outputs[limit], limit) + else: + for idx, out in enumerate(outputs): + print_step(G.graph_state.steps[idx], out, idx) + print() + + class DumpCommand(NNToolShellBase): # DUMP COMMAND parser_dump = Cmd2ArgumentParser() @@ -126,9 +160,9 @@ def do_dump(self, args: argparse.Namespace): if args.pickle or self._in_py or args.save: pickles.append(outputs) else: - self.G.print_intermediates(outputs, limit=step, width=args.number_width, - precision=args.precision, channel=args.channel, - order=['c', 'h', 'w'], checksum=args.checksum) + print_intermediates(self.G, outputs, limit=step, width=args.number_width, + precision=args.precision, channel=args.channel, + order=['c', 'h', 'w'], checksum=args.checksum) if args.visualize_detection: img_in = Image.open(file_per_input[0]).convert('RGBA') diff --git a/tools/nntool/interpreter/commands/fquant.py b/tools/nntool/interpreter/commands/fquant.py index 251d6aeeb..baa78b0d2 100644 --- 
a/tools/nntool/interpreter/commands/fquant.py +++ b/tools/nntool/interpreter/commands/fquant.py @@ -15,9 +15,11 @@ import argparse import logging +from pathlib import Path import numpy as np -from cmd2 import Cmd2ArgumentParser, with_argparser +from cmd2 import Cmd, Cmd2ArgumentParser, with_argparser +from interpreter.commands.qtune import load_options from interpreter.nntool_shell_base import NNToolShellBase from quantization.handlers_helpers import (add_options_to_parser, get_options_from_args) @@ -55,6 +57,9 @@ class FquantCommand(NNToolShellBase): parser_fquant.add_argument('--seed', type=int, default=0, help='numpy random seed, default not set and inputs change every time') + parser_fquant.add_argument('--json', + completer_method=Cmd.path_complete, + help='json file file containing saved quantization options using qtunesave command') add_options_to_parser(parser_fquant) @with_argparser(parser_fquant) @@ -65,6 +70,16 @@ def do_fquant(self, args: argparse.Namespace): weights and input data are avalaible.""" self._check_graph() opts = get_options_from_args(args) + opts = get_options_from_args(args) + if args.json: + json_path = Path(args.json) + if not json_path.exists() or not json_path.is_file(): + self.perror(f'{json_path} does not exist or is not a file') + return + json_opts = load_options(json_path) + json_opts.update(opts) + opts = json_opts + state = ConstantInputParameters.save_compression_state(self.G) try: if self.replaying_history and self.history_stats: diff --git a/tools/nntool/interpreter/commands/fusions.py b/tools/nntool/interpreter/commands/fusions.py index 4fae2751d..bf2bd21ec 100644 --- a/tools/nntool/interpreter/commands/fusions.py +++ b/tools/nntool/interpreter/commands/fusions.py @@ -16,13 +16,8 @@ import texttable from cmd2 import Cmd2ArgumentParser, with_argparser from interpreter.nntool_shell_base import NNToolShellBase -from quantization.quantizer.new_quantizer import NewQuantizer -from quantization.verify_quantization import 
verify_quantization -from graph.matches.matches import (get_fusion, get_fusions, - get_pow2_match_group, - get_scale8_match_group) -from graph.types import ConstantInputParameters +from graph.matches.matches import get_fusions class FusionsCommand(NNToolShellBase): @@ -31,6 +26,9 @@ def fusions_list(self): return [elem[0] for elem in get_fusions()] parser_fusions = Cmd2ArgumentParser("apply fusions to graph") + parser_fusions.add_argument('--no_postprocess', + action='store_true', + help="don't run adjust or qtune or rerun fusions (debugging option)") parser_fustions_exclusive = parser_fusions.add_mutually_exclusive_group() parser_fustions_exclusive.add_argument('-l', '--list', action='store_true', @@ -59,31 +57,17 @@ def do_fusions(self, args): self.ppaged(table.draw()) return self._check_graph() - state = ConstantInputParameters.save_compression_state(self.G) try: if args.apply: - fusions = [get_fusion(name) for name in args.apply] - invalid_names = [args.apply[idx] for idx, fusion in enumerate(fusions) if fusion is None] - if invalid_names: - self.perror(f'fusion{"s" if len(invalid_names) > 1 else ""} {", ".join(invalid_names)} not found') - return + fusions_names = args.apply elif args.pow2: - fusions = [get_pow2_match_group()] + fusions_names = ['pow2_match_group'] elif args.scale8: - fusions = [get_scale8_match_group()] + fusions_names = ['scaled_match_group'] else: - self.perror("No fusion set selected. Nothing to do. Select --pow2 or --scale8.") + self.perror( + "No fusion set selected. Nothing to do. 
Select --pow2 or --scale8.") return - for fusion in fusions: - fusion.match(self.G) - self.G.add_dimensions() - if self.G.quantization and verify_quantization(self.G): - quantizer = NewQuantizer(self.G) - quantizer.quantize() - problems = verify_quantization(self.G) - if problems: - self.perror('quantization issue after fusions') - for problem in problems: - self.perror(problem) - finally: - ConstantInputParameters.restore_compression_state(self.G, state) + self.G.fusions(*fusions_names, no_postprocess=args.no_postprocess) + except ValueError as ex: + self.perror(f'{ex}') diff --git a/tools/nntool/interpreter/commands/gen.py b/tools/nntool/interpreter/commands/gen.py index 6e083a8c8..ccbd77b9e 100644 --- a/tools/nntool/interpreter/commands/gen.py +++ b/tools/nntool/interpreter/commands/gen.py @@ -16,14 +16,19 @@ import argparse import logging import os + from cmd2 import Cmd, Cmd2ArgumentParser, with_argparser -from interpreter.nntool_shell_base import NNToolShellBase, no_history -from utils.data_importer import import_data from execution.graph_executer import GraphExecuter from execution.quantization_mode import QuantizationMode -from generation.default_template import basic_kernel_header_template, basic_kernel_source_template, default_template, dynamic_template, header_template -from generation.naming_convension import DefaultNamingConvension from generation.code_generator import CodeGenerator +from generation.default_template import (basic_kernel_header_template, + basic_kernel_source_template, + default_template, dynamic_template, + header_template) +from generation.gen_utils import write_empty +from generation.naming_convension import DefaultNamingConvension +from interpreter.nntool_shell_base import NNToolShellBase, no_history +from utils.data_importer import import_data LOG = logging.getLogger("nntool") @@ -92,9 +97,11 @@ def do_gen(self, args): self.settings['basic_kernel_source_file'] = args.basic_kernel_source_file 
self.settings['basic_kernel_header_file'] = args.basic_kernel_header_file self.settings['anonymise'] = args.anonymise - os.makedirs(os.path.abspath(self.settings['model_directory']), mode=0o750, exist_ok=True) - os.makedirs(os.path.abspath(self.settings['tensor_directory']), mode=0o750, exist_ok=True) - code_gen = CodeGenerator(self.G, DefaultNamingConvension(self.G, anonymise=args.anonymise), self.settings) + os.makedirs(os.path.abspath( + self.settings['model_directory']), mode=0o750, exist_ok=True) + os.makedirs(os.path.abspath( + self.settings['tensor_directory']), mode=0o750, exist_ok=True) + code_gen = CodeGenerator(self.G, DefaultNamingConvension(anonymise=args.anonymise), self.settings) if self.settings['template_file']: code_template = dynamic_template(self.settings['template_file']) @@ -108,18 +115,28 @@ def do_gen(self, args): if self.G.has_expressions: with open(os.path.join(self.settings['model_directory'], args.basic_kernel_source_file), "w") as output_fp: - output_fp.write(basic_kernel_source_template(self.G, code_generator=code_gen)) + output_fp.write(basic_kernel_source_template( + self.G, code_generator=code_gen)) with open(os.path.join(self.settings['model_directory'], args.basic_kernel_header_file), "w") as output_fp: - output_fp.write(basic_kernel_header_template(self.G, code_generator=code_gen)) + output_fp.write(basic_kernel_header_template( + self.G, code_generator=code_gen)) + else: + write_empty(self.settings['model_directory'], + args.basic_kernel_source_file, "no expressions used") + write_empty(self.settings['model_directory'], + args.basic_kernel_header_file, "no expressions used") else: self.ppaged(code_template(self.G, code_generator=code_gen)) if self.G.has_expressions: - self.ppaged(basic_kernel_source_template(self.G, code_generator=code_gen)) - self.ppaged(basic_kernel_header_template(self.G, code_generator=code_gen)) + self.ppaged(basic_kernel_source_template( + self.G, code_generator=code_gen)) + 
self.ppaged(basic_kernel_header_template( + self.G, code_generator=code_gen)) if args.output_tensors: code_gen.write_constants() if args.header_file: with open(os.path.join(self.settings['model_directory'], args.header_file), "w") as output_fp: - output_fp.write(header_template(self.G, code_generator=code_gen)) + output_fp.write(header_template( + self.G, code_generator=code_gen)) diff --git a/tools/nntool/interpreter/commands/gen_project.py b/tools/nntool/interpreter/commands/gen_project.py index b8202d8b3..83457d548 100644 --- a/tools/nntool/interpreter/commands/gen_project.py +++ b/tools/nntool/interpreter/commands/gen_project.py @@ -36,6 +36,7 @@ from generation.default_template import (basic_kernel_header_template, basic_kernel_source_template, default_template) +from generation.gen_utils import write_empty from generation.naming_convension import DefaultNamingConvension from interpreter.commands.aquant import AquantCommand from interpreter.commands.open import OpenCommand @@ -134,6 +135,11 @@ def do_gen_project(self, args): self._check_quantized() self._check_adjusted() + if "GAP_SDK_HOME" not in os.environ or "NNTOOL_PATH" not in os.environ: + self.perror( + 'you must run "source sourceme.sh" in the GAP SDK before using this command') + return + if args.input_tensors: if args.input_tensors not in self.tensor_store: self.perror( @@ -217,7 +223,7 @@ def do_performance(self, args): self._check_graph() self._check_quantized() self._check_adjusted() - if "GAP_SDK_HOME" not in os.environ: + if "GAP_SDK_HOME" not in os.environ or "NNTOOL_PATH" not in os.environ: self.perror( 'you must run "source sourceme.sh" in the GAP SDK before using this command') return @@ -281,7 +287,7 @@ def do_performance(self, args): self.tensor_store[args.output_tensors] = at_map_tensors( self.G, at_tensor_loader_int(fp)) - match_perf = r" +((?:S\d+|Tota)[^:]+): *Cycles: +(\d+)[^:]+: +(\d+)[^:]+: +([\d<.]+)" + match_perf = r" +((?:S\d+|Tota)[^:]+): *Cycles: +(\d+)[^:]+: +(\d+)[^:]+: 
+(.+)" matcher = re.compile(match_perf) perf = matcher.findall(res.stdout) if not perf: @@ -356,7 +362,9 @@ def process_script(script): if line.startswith('aquant'): # add abs path for input files and try to remake command args = aquant_parser.parse_args(line.rstrip().split(' ')[1:]) - input_files = [os.path.abspath(f) for f in args.input_files if f != ''] + input_files = [os.path.abspath(f) + for f in args.input_files if f != ''] +#pylint: disable=singleton-comparison opts = [f"--{k} {v}" if v != True else f"--{k}" for k, v in vars(args).items() if v and k != 'input_files'] line = " ".join(['aquant'] + opts + input_files) @@ -373,7 +381,7 @@ def gen_project(G, settings, project_folder, script_commands, overwrite=False, p settings['graph_produce_operinfos'] = True code_gen = CodeGenerator( - G, DefaultNamingConvension(G), settings) + G, DefaultNamingConvension(), settings) if not os.path.exists(project_folder): os.mkdir(project_folder) @@ -447,7 +455,7 @@ def gen_project(G, settings, project_folder, script_commands, overwrite=False, p if script_commands[-1] != "save_state": fp.write('save_state\n') if gen_atproject: - code_gen = CodeGenerator(G, DefaultNamingConvension(G), settings) + code_gen = CodeGenerator(G, DefaultNamingConvension(), settings) with open(os.path.join(project_folder, 'Model.c'), "w") as output_fp: output_fp.write(default_template(G, code_generator=code_gen)) if G.has_expressions: @@ -457,6 +465,12 @@ def gen_project(G, settings, project_folder, script_commands, overwrite=False, p with open(os.path.join(project_folder, "Expression_Kernels.h"), "w") as output_fp: output_fp.write(basic_kernel_header_template( G, code_generator=code_gen)) + else: + write_empty(project_folder, "Expression_Kernels.c", + "no expressions used") + write_empty(project_folder, "Expression_Kernels.h", + "no expressions used") + code_gen.write_constants(tensor_directory=project_folder) ignore_function = None if overwrite else skip_existing_files( project_folder) diff --git 
a/tools/nntool/interpreter/commands/open.py b/tools/nntool/interpreter/commands/open.py index 2db6d849f..1e317bb5f 100644 --- a/tools/nntool/interpreter/commands/open.py +++ b/tools/nntool/interpreter/commands/open.py @@ -179,7 +179,11 @@ def do_open(self, args: argparse.Namespace): else: # reset the current graph self._graphs[self._graph_idx] = NO_GRAPH.copy() - self.__open_graph(args) + try: + self.__open_graph(args) + except FileNotFoundError: + self.perror(f'{args.nnfile} not found') + return self._update_prompt() self.py_locals['G'] = self.G diff --git a/tools/nntool/interpreter/commands/qtune.py b/tools/nntool/interpreter/commands/qtune.py index 1eaa78c9f..e7b60a90f 100644 --- a/tools/nntool/interpreter/commands/qtune.py +++ b/tools/nntool/interpreter/commands/qtune.py @@ -75,7 +75,7 @@ def qtune_first_arg_mapper(self, nodestr): parser_tune.add_argument( '--json', completer_method=Cmd.path_complete, - help='json file to save quantization options') + help='json file to load quantization options from') @with_argparser(parser_tune, ns_provider=capture_shell) def do_qtune(self, args): @@ -102,8 +102,7 @@ def reduction(state, x): if not json_path.exists() or not json_path.is_file(): self.perror(f'{json_path} does not exist or is not a file') return - with json_path.open('r') as fp: - options = json.load(fp, cls=JsonSerializableStateDecoder) + options = load_options(json_path) else: options = {} @@ -115,6 +114,17 @@ def reduction(state, x): quantizer.quantize() self.pfeedback('quantization options set') +def load_options(file_path): + with file_path.open('r') as fp: + save_options = json.load(fp, cls=JsonSerializableStateDecoder) + options = save_options['global'] + for node_opt in save_options['nodes']: + if 'node_name' not in node_opt: + raise ValueError('node option missing node id') + options[NodeId(node_opt['node_name'])] = {opt: val for opt, val in node_opt.items() if opt != "node_name"} + return options + + class QTuneSaveCommand(NNToolShellBase): # 
QTUNESAVE COMMAND @@ -126,13 +136,35 @@ class QTuneSaveCommand(NNToolShellBase): @with_argparser(parser_qtune_save) def do_qtunesave(self, args): """ -Save set quantization options.""" +Save set quantization options. + +You can manually edit quantization options in the file. +The global section contains options that will be applied to the whole graph +The nodes array contains options for each node that override the global options +Each nodes entry should be a JSON mapping with a key node_name +node_name should contain a node name or and arry with the fusion name and fusion internal +node name. +""" self._check_graph() self._check_quantized() save_path = Path(args.jsonfile).with_suffix('.json') - options = self.G.quantization.options.copy() - if 'scheme' not in options: - options['scheme'] = self.G.quantization.scheme_priority[0] + save_options = { + "global": {}, + "nodes": [] + } + for optid, opt in self.G.quantization.options.items(): + if isinstance(optid, NodeId): + opt = opt.copy() + if 'qtype_ind' in opt: + del opt['qtype_ind'] + if opt: + opt['node_name'] = optid.id[0] if not optid.id[1] else optid + save_options['nodes'].append(opt) + else: + save_options['global'][optid] = opt + + if 'scheme' not in save_options: + save_options['scheme'] = self.G.quantization.scheme_priority[0] with save_path.open('w') as fp: - json.dump(options, fp, cls=JsonSerializableStateEncoder, indent=2) + json.dump(save_options, fp, cls=JsonSerializableStateEncoder, indent=2) self.pfeedback(f'quantization options saved to {save_path}') diff --git a/tools/nntool/interpreter/commands/remove.py b/tools/nntool/interpreter/commands/remove.py index f5b716a7d..3283f59c8 100644 --- a/tools/nntool/interpreter/commands/remove.py +++ b/tools/nntool/interpreter/commands/remove.py @@ -14,6 +14,7 @@ # along with this program. If not, see . 
import argparse +from functools import reduce from cmd2 import Cmd2ArgumentParser, with_argparser from interpreter.nntool_shell_base import NNToolShellBase @@ -40,6 +41,9 @@ def nodes_choices(self): parser_remove.add_argument('-u', '--up', action='store_true', help='when one node is specified remove it and everything above it') + parser_remove.add_argument('--leave', + action='store_true', + help='when one node is specified only remove what is above or below and not the node itself') @with_argparser(parser_remove) def do_remove(self, args: argparse.Namespace): @@ -51,36 +55,53 @@ def do_remove(self, args: argparse.Namespace): node_from = self.G[args.nodes[0]] if len(args.nodes) == 1: if args.up: - nodes_above = self.G.nodes_above(node_from) - out_edges = self.G.indexed_out_edges(node_from) - nodes_above.add(node_from) + nodes_above = set(self.G.nodes_above(node_from)) + if args.leave: + remove_nodes = nodes_above + inputs_on = [] + dims = node_from.in_dims + for in_edge in self.G.indexed_in_edges(node_from): + if isinstance(in_edge.from_node, ConstantInputParameters): + nodes_above.remove(in_edge.from_node) + else: + inputs_on.append([in_edge]) + else: + dims = node_from.out_dims + remove_nodes = nodes_above | {node_from} + inputs_on = self.G.indexed_out_edges(node_from) + input_names = sorted( - [node.name for node in nodes_above if isinstance(node, InputParameters)]) - self.G.remove_all(nodes_above | {node_from}) - for idx, edge_group in enumerate(out_edges): + [node.name for node in remove_nodes if isinstance(node, InputParameters)]) + self.G.remove_all(remove_nodes) + + for idx, edge_group in enumerate(inputs_on): name = input_names.pop(0) if input_names else None - in_node = self.G.add_input( - node_from.out_dims[idx], name=name) + in_node = self.G.add_input(dims[idx], name=name) self.pfeedback(f'adding input {in_node.name}') for edge in edge_group: self.G.add_edge(NNEdge(from_node=in_node, to_idx=edge.to_idx, to_node=edge.to_node)) else: - nodes_below = 
self.G.nodes_below(node_from) - for node in list(nodes_below): - nodes_below.update(edge.from_node for edge in self.G.in_edges(node) - if isinstance(edge.from_node, ConstantInputParameters)) + nodes_below = set(self.G.nodes_below(node_from)) if self.G.is_vertex_cut(nodes_below): self.perror( f'removing everything below {node_from.name} would split the graph which is not permitted') return - nodes_below.add(node_from) - in_edges = self.G.in_edges(node_from.name) + if args.leave: + remove_nodes = nodes_below + outputs_on = [edge_bundle[0] + for edge_bundle in self.G.indexed_out_edges(node_from)] + else: + input_nodes = {edge.from_node for edge in self.G.in_edges(node_from) + if isinstance(edge.from_node, (InputParameters, ConstantInputParameters))} + remove_nodes = nodes_below | {node_from} | input_nodes + outputs_on = self.G.indexed_in_edges(node_from) output_names = sorted( - [node.name for node in nodes_below if isinstance(node, OutputParameters)]) - self.G.remove_all(nodes_below) - for edge in in_edges: + [node.name for node in remove_nodes if isinstance(node, OutputParameters)]) + + self.G.remove_all(remove_nodes) + for edge in outputs_on: name = output_names.pop(0) if output_names else None out_node = self.G.add_output(name=name) self.pfeedback(f'adding output {out_node.name}') @@ -98,8 +119,11 @@ def do_remove(self, args: argparse.Namespace): f'all paths from {node_from.name} must lead to {node_to.name}') return - edges_from = self.G.indexed_out_edges(node_from) - edges_to = self.G.indexed_in_edges(node_to.name) + edges_from = set(self.G.out_edges(node_from)) + edges_to = set(self.G.in_edges(node_to.name)) + between_edges = reduce(lambda s, x: s|set(self.G.edges(x)), nodes_between, set()) + edges_from = edges_from.intersection(between_edges) + edges_to = edges_to.intersection(between_edges) if len(edges_from) != len(edges_to): self.perror( f"{node_from.name} has a different number of outputs than {node_to.name}'s inputs") diff --git 
a/tools/nntool/interpreter/generator.py b/tools/nntool/interpreter/generator.py index 5a89cc584..c271c2c65 100644 --- a/tools/nntool/interpreter/generator.py +++ b/tools/nntool/interpreter/generator.py @@ -24,6 +24,7 @@ basic_kernel_source_template, default_template, dynamic_template, header_template) +from generation.gen_utils import write_empty from generation.naming_convension import DefaultNamingConvension from interpreter.nntool_shell import NNToolShell @@ -51,7 +52,6 @@ def write_template(G, code_gen, model_directory, model_file, template, template_ with open(model_path, "w") as output_fp: output_fp.write(model) - def generate_code(args): LOG.propagate = False @@ -85,7 +85,7 @@ def generate_code(args): os.makedirs(os.path.abspath(opts['model_directory']), mode=0o750, exist_ok=True) os.makedirs(os.path.abspath(opts['tensor_directory']), mode=0o750, exist_ok=True) - code_gen = CodeGenerator(G, DefaultNamingConvension(G, anonymise=opts.get('anonymise')), opts) + code_gen = CodeGenerator(G, DefaultNamingConvension(anonymise=opts.get('anonymise')), opts) if args.template_file: code_template = dynamic_template(args.template_file) else: @@ -96,6 +96,9 @@ def generate_code(args): opts['basic_kernel_header_file'], basic_kernel_header_template, "kernel headers") write_template(G, code_gen, opts['model_directory'], opts['basic_kernel_source_file'], basic_kernel_source_template, "kernel source") + else: + write_empty(opts['model_directory'], opts['basic_kernel_header_file'], "no expressions used") + write_empty(opts['model_directory'], opts['basic_kernel_source_file'], "no expressions used") if args.header_file: with open(os.path.join(opts['model_directory'], args.header_file), "w") as output_fp: diff --git a/tools/nntool/quantization/clipping.py b/tools/nntool/quantization/clipping.py new file mode 100644 index 000000000..3cd8b8695 --- /dev/null +++ b/tools/nntool/quantization/clipping.py @@ -0,0 +1,99 @@ +# Copyright (C) 2020 GreenWaves Technologies, SAS + +# This 
program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . + +import math + +import numpy as np + +# import scipy.optimize as opt +# sigma=1.0 +# ALPHA_GAUS = {m: opt.minimize_scalar(lambda x: mse_gaussian(x, sigma=sigma, num_bits=m)).x for m in range(2,17,1)} +# b=1. +# ALPHA_LAPLACE = {m: opt.minimize_scalar(lambda x: mse_laplace(x, b=b, num_bits=m)).x for m in range(2,17,1)} + +ALPHA_GAUS = {2: 1.7106351863419305, 3: 2.1515927416420935, 4: 2.559136455058456, 5: 2.9362006203824396, + 6: 3.2869143669161147, 7: 3.615114210893466, 8: 3.924034014599462, 9: 4.216330919936089, + 10: 4.494170448727792, 11: 4.759309171709873, 12: 5.013218066309031, 13: 5.2570849373594974, + 14: 5.491968790304721, 15: 5.7186999893215, 16: 5.937970657819115} + +ALPHA_LAPLACE = {2: 2.830682989304011, 3: 3.89722946313961, 4: 5.028640140480669, 5: 6.204766334217521, + 6: 7.413126215019491, 7: 8.645619949475485, 8: 9.896759823828738, 9: 11.16268502214751, + 10: 12.440591336219248, 11: 13.728384769877623, 12: 15.024464757336403, 13: 16.32758309514459, + 14: 17.6367486184042, 15: 18.95116231019748, 16: 20.270171640301292} +GAUSSIAN_CONST = (0.5 * 0.35) * (1 + (math.pi * math.log(4)) ** 0.5) + + +def get_alpha_laplace(num_bits, stat): + alpha = ALPHA_LAPLACE[num_bits] * stat['b'] + return alpha, stat['b'] + + +def get_alpha_gaus(shape, num_bits, stat): + size = np.prod(shape) + std = ((stat['max'] - stat['min']) * GAUSSIAN_CONST) / \ + ((2 * 
math.log(size)) ** 0.5) + alpha = ALPHA_GAUS[num_bits] * std + return alpha, std + + +def mse_laplace(b, alpha, num_bits): + return 2 * (b ** 2) * np.exp(-alpha / b) + ((alpha ** 2) / (3 * 2 ** (2 * num_bits))) + + +def mse_gaussian(sigma, alpha, num_bits): + clipping_err = (sigma ** 2 + (alpha ** 2)) * (1 - math.erf(alpha / (sigma * np.sqrt(2.0)))) - \ + np.sqrt(2.0 / np.pi) * alpha * sigma * \ + (np.e ** ((-1) * (0.5 * (alpha ** 2)) / sigma ** 2)) + quant_err = (alpha ** 2) / (3 * (2 ** (2 * num_bits))) + return clipping_err + quant_err + + +def alpha2DeltaOffset(self, alpha, max_value, min_value, mean): + max_range = max_value - min_value + if alpha <= 0 or alpha >= max_range / 2: + delta = max_range + else: + delta = 2 * alpha + min_value = max(min_value, mean - delta / 2) + + return delta, min_value + + +def get_clip(shape, num_bits, stat, clip_type): + if clip_type == "laplace": + alpha, _ = get_alpha_laplace(num_bits, stat) + elif clip_type == "gaus": + alpha, _ = get_alpha_gaus(shape, num_bits, stat) + elif clip_type == "mix": + alpha_laplace, b = get_alpha_laplace(num_bits, stat) + alpha_gaus, std = get_alpha_gaus(shape, num_bits, stat) + mse_est_laplace = mse_laplace(b, alpha_laplace, num_bits) + mse_est_gaus = mse_gaussian(std, alpha_gaus, num_bits) + if mse_est_laplace < mse_est_gaus: + alpha = alpha_laplace + else: + alpha = alpha_gaus + elif clip_type == "none": + return stat['min'], stat['max'] + else: + raise ValueError('unknown clip type') + max_range = stat['max'] - stat['min'] + if alpha <= 0 or alpha >= max_range / 2: + return stat['min'], stat['max'] + + min_value = max(stat['min'], stat['mean'] - alpha) + max_value = min_value + 2 * alpha + + return min_value, max_value diff --git a/tools/nntool/quantization/multiplicative/quantizers/activation_mult.py b/tools/nntool/quantization/multiplicative/quantizers/activation_mult.py index ddd6f2607..56c24ed63 100644 --- a/tools/nntool/quantization/multiplicative/quantizers/activation_mult.py +++ 
b/tools/nntool/quantization/multiplicative/quantizers/activation_mult.py @@ -30,35 +30,93 @@ from quantization.new_qrec import QRec from quantization.qtype import QType from quantization.unified_quantization_handler import (in_qs_constraint, - out_qs_constraint, - params_type, priority) + out_qs_constraint,option_constraint, + params_type, options) from ..mult_quantization_handler import MultQuantizionHandler +from quantization.quantizer_options import * LOG = logging.getLogger('nntool.' + __name__) +@options( + FORCE_OUTPUT_SIZE_OPTION, +) class ActivationMultSWBase(MultQuantizionHandler): @classmethod - def _quantize_sw(cls, params, in_qs, stats, in_dtype, out_dtype, **kwargs): + def _quantize_sw(cls, params, in_qs, stats, in_dtype, out_dtype, out_asym, **kwargs): force_out_qs, _ = cls.get_mult_opts(**kwargs) force_out_q = force_out_qs and force_out_qs[0] fusion = kwargs.get('fusion', None) in_q = in_qs[0] - if not fusion and in_dtype == np.int32: - return None + if fusion: + in_dtype = np.int32 + bits = 8 if out_dtype == np.int8 or out_dtype == np.uint8 else 16 if isinstance(params, (HSwishActivationParameters, HSigmoidActivationParameters)): - if in_q.max < params.upper_bound: - # TODO - could do something clever for asymmetric here - in_q = QType.from_min_max_sq(-params.upper_bound, params.upper_bound, - dtype=in_dtype, forced=True) + cls.check_valid_ranges(params, stats, idx=0, dirs='in') + # we need to be able to represent offset and upper_bound in output dtype + # input range should match stats since swish requires the full input range + if fusion: + # in a fusion the output container is smaller than the input container + # The input scale may be too small to represent offset and upper_bound + # in the output dtype + params_qtype = QType.from_min_max_sq( + 0, + np.maximum( + params.upper_bound, + params.offset), + bits=bits, + dtype=out_dtype) + in_q = QType.from_min_max_sq( + stats['range_in'][0]['min'], + stats['range_in'][0]['max'], + dtype=in_dtype) + # 
if params scale is larger then we must reduce precision + if np.all(params_qtype.scale > in_q.scale): + in_q.scale = params_qtype.scale + else: + # outside a fusion our in and out dtype is the same + # so we just need to check that offset and upper_bound can be represented + if in_dtype == np.uint8: + in_dtype = np.int8 + elif in_dtype == np.uint16: + in_dtype = np.int16 + if isinstance(params, HSwishActivationParameters): + lower = stats['range_in'][0]['min'] + upper = np.maximum( + np.maximum( + params.upper_bound, + params.offset), + stats['range_in'][0]['max']) + else: + lower = -params.offset + upper = params.upper_bound + + in_q = QType.from_min_max_sq( + lower, + upper, + dtype=in_dtype) elif isinstance(params, (TanHActivationParameters, SigmoidActivationParameters)): - in_q = QType.from_min_max_sq(-8, 8, dtype=in_dtype, forced=True) + if in_dtype == np.int8: + in_q = QType.from_min_max_sq( + -8, + 8, + dtype=in_dtype, + forced=True) + else: + in_q = QType( + dtype=in_dtype, + scale=pow(2, -12)) + elif isinstance(params, (HTanHActivationParameters, )): + scale = 2 / pow(2, bits) + in_q = QType(scale=scale, dtype=in_dtype, forced=True) + elif isinstance(params, (LeakyActivationParameters, )): + max_out = max(abs(stats['range_out'][0]['max']), abs(stats['range_out'][0]['min'])) + scale = (2 * max_out) / pow(2, bits) + in_q = QType(scale=scale, dtype=in_dtype, forced=True) if force_out_q: - if force_out_q.signed != in_q.signed: - return None if fusion and fusion.fusion_type in ['conv_active_pool', 'conv_active']: if not isinstance(params, (SigmoidActivationParameters, HTanHActivationParameters, HSwishActivationParameters, HSigmoidActivationParameters)): @@ -70,35 +128,43 @@ def _quantize_sw(cls, params, in_qs, stats, in_dtype, out_dtype, **kwargs): else: cls.check_valid_ranges(params, stats, idx=0, dirs='out') - if (isinstance(params, ReluActivationParameters) and params.lower_bound == 0 and - in_q.dtype == np.int8): + if isinstance(params, 
ReluActivationParameters): max_val = params.upper_bound if params.upper_bound else stats['range_out'][0]['max'] o_q = QType.from_min_max_sq(0, max_val, dtype=out_dtype, asymmetric=(in_q.zero_point != 0)) in_q = deepcopy(o_q) - elif isinstance(params, (TanHActivationParameters, SigmoidActivationParameters)): - if out_dtype == np.int8: - o_q = QType(q=7, dtype=np.int8) - elif out_dtype == np.int16: - o_q = QType(q=15, dtype=np.int16) - else: - raise NotImplementedError( - 'int8 and int16 are implemented as output only') + elif isinstance(params, TanHActivationParameters): + o_q = QType.from_min_max_sq( + min_val=-1, max_val=1, dtype=out_dtype, asymmetric=out_asym) + elif isinstance(params, SigmoidActivationParameters): + o_q = QType.from_min_max_sq( + min_val=0, max_val=1, dtype=out_dtype, asymmetric=out_asym) elif isinstance(params, LeakyActivationParameters): o_q = QType.from_min_max_sq(stats['range_out'][0]['min'], stats['range_out'][0]['max'], - dtype=out_dtype) - # force the preceeding filter to clip the negative range - in_q = deepcopy(o_q) + dtype=out_dtype, + asymmetric=out_asym) + in_q.scale = o_q.scale + elif isinstance(params, HSigmoidActivationParameters): + # hsigmoid prefer to output zeropoint 0 to represent 0 - 1 range + o_q = QType.from_min_max_sq( + min_val=0, max_val=1, dtype=out_dtype, asymmetric=out_asym) + elif isinstance(params, HSwishActivationParameters): + # hswish multiplies 0-upper bound range by input so take the upper bound from stats + o_q = QType.from_min_max_sq(stats['range_out'][0]['min'], + stats['range_out'][0]['max'], + dtype=out_dtype, + asymmetric=out_asym) else: o_q = QType.from_min_max_sq(stats['range_out'][0]['min'], stats['range_out'][0]['max'], - dtype=out_dtype) + dtype=out_dtype, + asymmetric=out_asym) qrec = QRec.scaled(in_qs=[in_q], out_qs=[o_q]) - qrec = cls.compute_cache(params, qrec) + qrec = cls.compute_cache(params, qrec, stats) return qrec @classmethod @@ -106,32 +172,39 @@ def get_prefered_input_dtypes(cls, 
params, **kwargs): return [np.int8] @classmethod - def compute_cache(cls, params, qrec): + def compute_cache(cls, params, qrec, stats): scale_mul_biases_q = MultMulBiasScaleQType(dtype=np.uint8) qrec.cache['scale_mul_biases_q'] = scale_mul_biases_q - - if isinstance(params, (SigmoidActivationParameters, TanHActivationParameters)): - scale_mul_biases_q.scale = (math.pow( - 2, -7 if qrec.out_qs[0].dtype == np.int8 else -15)/qrec.out_qs[0].scale) + if isinstance(params, (ReluActivationParameters)): + if params.upper_bound: + qrec.cache['upper_bound'] = qrec.in_qs[0].quantize( + params.upper_bound).astype(qrec.out_qs[0].dtype) + if params.lower_bound: + qrec.cache['lower_bound'] = qrec.in_qs[0].quantize( + params.lower_bound).astype(qrec.out_qs[0].dtype) + scale_mul_biases_q.scale = ( + qrec.in_qs[0].scale/qrec.out_qs[0].scale) + elif isinstance(params, (SigmoidActivationParameters, TanHActivationParameters)): + scale_mul_biases_q.scale = math.pow(2, -15) / qrec.out_qs[0].scale + qrec.cache["zero_point"] = qrec.out_qs[0].zero_point.astype( + qrec.out_qs[0].dtype) elif isinstance(params, (LeakyActivationParameters)): scale_mul_biases_q.scale = ( qrec.in_qs[0].scale/qrec.out_qs[0].scale) qrec.cache['leak_factor'] = np.int8( params.leak_factor*math.pow(2, 7) + 0.5) - elif isinstance(params, HSwishActivationParameters): - scale_mul_biases_q.scale = ( - (qrec.in_qs[0].scale*qrec.in_qs[0].scale * params.mult)/qrec.out_qs[0].scale) - qrec.cache['offset'] = qrec.in_qs[0].quantize(params.offset) - qrec.cache['mult'] = np.int8(1) - qrec.cache['upper_bound'] = qrec.in_qs[0].quantize( - params.upper_bound) - elif isinstance(params, HSigmoidActivationParameters): - scale_mul_biases_q.scale = ( - (qrec.in_qs[0].scale*params.mult)/qrec.out_qs[0].scale) - qrec.cache['offset'] = qrec.in_qs[0].quantize(params.offset) - qrec.cache['mult'] = np.int8(1) + qrec.cache['zero_point'] = qrec.out_qs[0].zero_point.astype(qrec.out_qs[0].dtype) + elif isinstance(params, (HSwishActivationParameters, 
HSigmoidActivationParameters)): + scale = (qrec.in_qs[0].scale * params.mult)/qrec.out_qs[0].scale + if isinstance(params, HSwishActivationParameters): + # HSwish multiplies HSigmoid by input + scale *= qrec.in_qs[0].scale + scale_mul_biases_q.scale = scale + qrec.cache['offset'] = qrec.in_qs[0].quantize( + params.offset).astype(qrec.out_qs[0].dtype) + qrec.cache['zero_point'] = qrec.out_qs[0].zero_point qrec.cache['upper_bound'] = qrec.in_qs[0].quantize( - params.upper_bound) + params.upper_bound).astype(qrec.out_qs[0].dtype) else: scale_mul_biases_q.scale = ( qrec.in_qs[0].scale/qrec.out_qs[0].scale) @@ -139,184 +212,86 @@ def compute_cache(cls, params, qrec): @params_type(ActivationParameters) -@in_qs_constraint({'dtype': np.int8}) +@in_qs_constraint({'dtype': {np.int8, np.int16, np.int32}}) @out_qs_constraint({'dtype': np.int8}) -class ActivationMultSW8x8(ActivationMultSWBase): +@option_constraint(force_output_size={8, None}) +class ActivationMultSW_I_I8(ActivationMultSWBase): @classmethod def _quantize(cls, params, in_qs, stats, **kwargs): - return cls._quantize_sw(params, in_qs, stats, np.int8, np.int8, **kwargs) + return cls._quantize_sw(params, in_qs, stats, in_qs[0].dtype, np.int8, out_asym=False, **kwargs) -@params_type(ActivationParameters) -@in_qs_constraint({'dtype': np.int32}) -@out_qs_constraint({'dtype': np.int8}) -@priority(2) -class ActivationMultSW32x8(ActivationMultSWBase): +@params_type(HSwishActivationParameters, HSigmoidActivationParameters) +@in_qs_constraint({'dtype': {np.int8, np.int16, np.int32}}) +@out_qs_constraint({'dtype': np.uint8}) +@option_constraint(force_output_size={8, None}) +class ActivationMultSW_HSwish_I_U8(ActivationMultSWBase): @classmethod - def _quantize(cls, params, in_qs, stats, **kwargs): - return cls._quantize_sw(params, in_qs, stats, np.int32, np.int8, **kwargs) - + def _get_in_qs_from_stats(cls, params, stats, in_qs, **kwargs): + dtype = in_qs and in_qs[0] and in_qs[0].dtype + if dtype == np.uint16: + dtype = 
np.int16 + else: + dtype = np.int8 + return [QType.from_min_max_sq( + stats['range_in'][0]['min'], + stats['range_in'][0]['max'], + dtype=dtype)] -@params_type(ActivationParameters) -@in_qs_constraint({'dtype': np.int16}) -@out_qs_constraint({'dtype': np.int16}) -@priority(2) -class ActivationMultSW16x16(ActivationMultSWBase): @classmethod def _quantize(cls, params, in_qs, stats, **kwargs): - return cls._quantize_sw(params, in_qs, stats, np.int16, np.int16, **kwargs) + return cls._quantize_sw(params, in_qs, stats, in_qs[0].dtype, np.uint8, out_asym=False, **kwargs) -@params_type(ActivationParameters) -@in_qs_constraint({'dtype': np.int32}) -@out_qs_constraint({'dtype': np.int16}) -@priority(2) -class ActivationMultSW32x16(ActivationMultSWBase): +@params_type(HSwishActivationParameters, HSigmoidActivationParameters) +@in_qs_constraint({'dtype': {np.int8, np.int16, np.int32}}) +@out_qs_constraint({'dtype': np.uint16}) +@option_constraint(force_output_size=16) +class ActivationMultSW_HSwish_I_U16(ActivationMultSWBase): @classmethod - def _quantize(cls, params, in_qs, stats, **kwargs): - return cls._quantize_sw(params, in_qs, stats, np.int32, np.int16, **kwargs) - + def _get_in_qs_from_stats(cls, params, stats, in_qs, **kwargs): + dtype = in_qs and in_qs[0] and in_qs[0].dtype + if dtype == np.uint16: + dtype = np.int16 + else: + dtype = np.int8 + return [QType.from_min_max_sq( + stats['range_in'][0]['min'], + stats['range_in'][0]['max'], + dtype=dtype)] -@params_type(ReluActivationParameters) -@in_qs_constraint({'dtype': {np.uint8, np.uint16, np.int8, np.int16}, 'attr': {'ne16': True}}) -@out_qs_constraint({'dtype': {np.uint8, np.uint16, np.int8, np.int16}}) -@priority(3) -class ActivationMultNe16(MultQuantizionHandler): @classmethod def _quantize(cls, params, in_qs, stats, **kwargs): - force_out_qs, out_dtype = cls.get_mult_opts(**kwargs) - force_out_q = force_out_qs and force_out_qs[0] - in_q = in_qs[0] - - if force_out_q: - # TODO - should uint8 be accepted here if 
it does not scale to the relu? - o_q = deepcopy(force_out_q) - in_q = deepcopy(o_q) - else: - cls.check_valid_ranges(params, stats, idx=0, dirs='out') - upper = (stats['range_out'][0]['max'] if params.upper_bound is None - else params.upper_bound) - in_q = QType.from_min_max_sq( - params.lower_bound, upper, dtype=in_q.dtype, asymmetric=True, - ne16=True, dont_copy_attr=['ne16']) - o_q = deepcopy(in_q) - - scale_mul_biases_q = MultMulBiasScaleQType(dtype=np.uint8) - scale_mul_biases_q.scale = in_q.scale/o_q.scale - qrec = QRec.scaled(in_qs=[in_q], out_qs=[o_q], - ne16=True, scale_mul_biases_q=scale_mul_biases_q) - return qrec - + return cls._quantize_sw(params, in_qs, stats, in_qs[0].dtype, np.uint16, out_asym=False, **kwargs) -@params_type(HSigmoidActivationParameters, HSwishActivationParameters) -@in_qs_constraint({'dtype': {np.uint16, np.int32}}) #, 'attr': {'ne16': True}}) -@out_qs_constraint({'dtype': {np.uint16}}) -@priority(3) -class HSigmoidSwishActivationMultNe16USQ16(MultQuantizionHandler): - DEFAULT_DTYPE = np.uint16 +@params_type(ActivationParameters) +@in_qs_constraint({'dtype': {np.int8, np.int16, np.int32}}) +@out_qs_constraint({'dtype': np.int16}) +@option_constraint(force_output_size=16) +class ActivationMultSW_I_I16(ActivationMultSWBase): @classmethod def _quantize(cls, params, in_qs, stats, **kwargs): - force_out_qs, out_dtype = cls.get_mult_opts(**kwargs) - force_out_q = force_out_qs and force_out_qs[0] - - # input ranged to values that count and upper bound must be representable - # there is an assumption here that params.offset is always less than params.upper_bound - assert params.offset <= params.upper_bound - in_q = in_qs[0] - max_repr = np.maximum(in_q.max, params.upper_bound) - in_q = QType.from_min_max_sq(-max_repr, max_repr, dtype=np.int32, forced=True) - - if force_out_q: - # sigmoid and hswish has to output asymmetric with zero point at zero - if not force_out_q.zero_point_asymmetric_zero: - return None - # if the output has been 
forced then propagate it - out_q = deepcopy(force_out_q) - elif isinstance(params, HSigmoidActivationParameters): - # hsigmoid prefer to output Q16 zeropoint 0 to represent 0 - 1 range - out_q = QType(dtype=np.uint16, scale=pow(2, -16), zero_point=0, - ne16=True, dont_copy_attr=['ne16']) - else: - # hswish multiplies 0-upper bound range by input so take the upper - # bound from stats - upper = stats['range_out'][0]['max'] - max_repr = np.maximum(in_q.max, upper) - in_q = QType.from_min_max_sq(0, upper, dtype=np.int32, - ne16=True, dont_copy_attr=['ne16']) - - qrec = QRec.scaled(in_qs=[in_q], out_qs=[out_q], ne16=True) - scale_mul_biases_q = MultMulBiasScaleQType(dtype=np.uint8) - - if isinstance(params, HSigmoidActivationParameters): - scale_mul_biases_q.scale = ( - (qrec.in_qs[0].scale*params.mult)/qrec.out_qs[0].scale) - elif isinstance(params, HSwishActivationParameters): - scale_mul_biases_q.scale = ( - (qrec.in_qs[0].scale*qrec.in_qs[0].scale*params.mult)/qrec.out_qs[0].scale) - else: - raise ValueError(f"Unexpacted params type {params}") - - qrec.cache['offset'] = qrec.in_qs[0].quantize(params.offset) - qrec.cache['mult'] = np.int16(1) - qrec.cache['upper_bound'] = qrec.in_qs[0].quantize( - params.upper_bound) + return cls._quantize_sw(params, in_qs, stats, in_qs[0].dtype, np.int16, out_asym=False, **kwargs) - qrec.cache['scale_mul_biases_q'] = scale_mul_biases_q - return qrec - -@params_type(SigmoidActivationParameters) -@in_qs_constraint({'dtype': {np.uint16, np.int32}, 'attr': {'ne16': True}}) -@out_qs_constraint({'dtype': {np.uint16}}) -@priority(3) -class SigmoidActivationMultNe16USQ16(MultQuantizionHandler): +@params_type(LeakyActivationParameters, TanHActivationParameters, SigmoidActivationParameters, ReluActivationParameters) +@in_qs_constraint({'dtype': {np.uint8, np.int32}}) +@out_qs_constraint({'dtype': np.uint8}) +@option_constraint(force_output_size={8, None}) +class ActivationMultSW_U_U8(ActivationMultSWBase): + # This handler should be called 
only for NE16 for the moment --> out is asym @classmethod def _quantize(cls, params, in_qs, stats, **kwargs): - force_out_qs, out_dtype = cls.get_mult_opts(**kwargs) - force_out_q = force_out_qs and force_out_qs[0] - in_q = QType(dtype=np.int32, scale=pow(2, -12)) - if force_out_q: - # sigmoid has to output asymmetric with zero point at zero - if not force_out_q.zero_point_asymmetric_zero: - return None - # if the output has been forced then propagate it - out_q = deepcopy(force_out_q) - else: - # 0 to 1 range so prefer Q16 zeropoint 0 - out_q = QType(dtype=np.uint16, scale=pow(2, -16), zero_point=0, - ne16=True, dont_copy_attr=['ne16'], min_val=0, max_val=1) - - scale_mul_biases_q = MultMulBiasScaleQType( - dtype=np.uint8, scale=pow(2, -16)/out_q.scale) - qrec = QRec.scaled(in_qs=[in_q], out_qs=[out_q], - ne16=True, scale_mul_biases_q=scale_mul_biases_q) - return qrec + return cls._quantize_sw(params, in_qs, stats, in_qs[0].dtype, np.uint8, out_asym=True, **kwargs) -@params_type(TanHActivationParameters) -@in_qs_constraint({'dtype': {np.uint16, np.int32}, 'attr': {'ne16': True}}) -@out_qs_constraint({'dtype': {np.uint16}}) -@priority(3) -class TanHActivationMultNe16USQ16(MultQuantizionHandler): +@params_type(LeakyActivationParameters, TanHActivationParameters, SigmoidActivationParameters, ReluActivationParameters) +@in_qs_constraint({'dtype': {np.uint16, np.int32}}) +@out_qs_constraint({'dtype': np.uint16}) +@option_constraint(force_output_size=16) +class ActivationMultSW_U_U16(ActivationMultSWBase): + # This handler should be called only for NE16 for the moment --> out is asym @classmethod def _quantize(cls, params, in_qs, stats, **kwargs): - force_out_qs, out_dtype = cls.get_mult_opts(**kwargs) - force_out_q = force_out_qs and force_out_qs[0] - in_q = QType(dtype=np.int32, scale=pow(2, -12)) - if force_out_q: - # tanh has to output symmetric with zero point at 32768 - if not np.all(np.atleast_1d(force_out_q.zero_point) == 32768): - return None - # if the output 
has been forced then propagate it - out_q = deepcopy(force_out_q) - else: - # -1 to 1 range so prefer Q15 - out_q = QType(dtype=np.uint16, scale=pow(2, -15), zero_point=np.array([32768], dtype=np.uint16), - ne16=True, dont_copy_attr=['ne16'], min_val=-1, max_val=1) - - scale_mul_biases_q = MultMulBiasScaleQType( - dtype=np.uint8, scale=pow(2, -15)/out_q.scale) - qrec = QRec.scaled(in_qs=[in_q], out_qs=[out_q], - ne16=True, scale_mul_biases_q=scale_mul_biases_q) - return qrec + return cls._quantize_sw(params, in_qs, stats, in_qs[0].dtype, np.uint16, out_asym=True, **kwargs) diff --git a/tools/nntool/quantization/multiplicative/quantizers/filter_mult.py b/tools/nntool/quantization/multiplicative/quantizers/filter_mult.py index 6382e89fd..57f1fb857 100644 --- a/tools/nntool/quantization/multiplicative/quantizers/filter_mult.py +++ b/tools/nntool/quantization/multiplicative/quantizers/filter_mult.py @@ -13,15 +13,16 @@ # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see . 
-from graph.types.constant_input import ConstantInputParameters -from graph.types.tensor_arithmetic import MatMulOpParameters, MatMulTransposedParameters import logging from copy import deepcopy import numpy as np from graph.types import (Conv2DParameters, FcParameters, FusionInputParameters, - HSigmoidActivationParameters, PoolingParameters, + HSigmoidActivationParameters, ReluActivationParameters, SigmoidActivationParameters) +from graph.types.constant_input import ConstantInputParameters +from graph.types.tensor_arithmetic import (MatMulTransposedParameters) +from quantization.clipping import get_clip from quantization.multiplicative.quantizers.rnn_mult_ne16 import \ limit_input_precision from quantization.multiplicative.scaling_qtypes import MultMulBiasScaleQType @@ -97,7 +98,8 @@ def check_options(params, opts=None, **kwargs): FORCE_INPUT_SIZE_OPTION, FORCE_OUTPUT_SIZE_OPTION, HWC_OPTION, - MAX_PRECISION_LIMIT_OPTION + MAX_PRECISION_LIMIT_OPTION, + CLIP_TYPE_OPTION ) # pylint: disable=abstract-method class FilterMultBase(MultQuantizionHandler): @@ -210,14 +212,18 @@ def _quantize_sw(cls, params, in_qs, stats, in_out_dtype, **kwargs): if force_out_q: o_q = force_out_q - # can't be forced to something not in_out_dtype - if o_q.dtype != in_out_dtype: + # can't be forced to something not in_out_dtype or int32 + if o_q.dtype != in_out_dtype and o_q.dtype != np.int32: return None LOG.warning(f'node {params.name} output forced to range {o_q.min}/{o_q.max} ' f'{"asymmetric" if o_q.asymmetric else "symmetric"}') else: cls.check_valid_ranges(params, stats, idx=0, dirs='out') - min_val, max_val = stats['range_out'][0]['min'], stats['range_out'][0]['max'] + min_val, max_val = get_clip( + params.out_dims[0].shape, + 8 if in_out_dtype == np.int8 else 16, + stats['range_out'][0], + opts['clip_type']) o_q = QType.from_min_max_sq(min_val=min_val, max_val=max_val, dtype=in_out_dtype, @@ -273,7 +279,7 @@ def can_handle_asymmetric_input(cls, params, **kwargs): 
@params_type(FcParameters, Conv2DParameters) @in_qs_constraint({'dtype': np.int8}) -@out_qs_constraint({'dtype': np.int8}) +@out_qs_constraint({'dtype': set([np.int8, np.int32])}) @option_constraint(check_filter_options(False, input_size={8, None}, output_size={8, None})) class FilterSWMult8x8(FilterSWMultBase): @classmethod @@ -283,7 +289,7 @@ def _quantize(cls, params, in_qs, stats, **kwargs): @params_type(FcParameters, Conv2DParameters) @in_qs_constraint({'dtype': np.int16}) -@out_qs_constraint({'dtype': np.int16}) +@out_qs_constraint({'dtype': set([np.int8, np.int32])}) @option_constraint(check_filter_options(False, input_size={16, None}, output_size={16, None})) class FilterSWMult16x8(FilterSWMultBase): @classmethod @@ -331,9 +337,13 @@ def _quantize_ne16(cls, params, in_qs, stats, input_dtype, **kwargs): f'node {params.name} output forced to range {o_q.min}/{o_q.max}') else: cls.check_valid_ranges(params, stats, idx=0, dirs='out') - min_val, max_val = stats['range_out'][0]['min'], stats['range_out'][0]['max'] force_output_size = opts.get('force_output_size', 8) output_dtype = np.uint8 if force_output_size == 8 else np.uint16 + min_val, max_val = get_clip( + params.out_dims[0].shape, + force_output_size, + stats['range_out'][0], + opts['clip_type']) o_q = QType.from_min_max_sq(min_val=min_val, max_val=max_val, dtype=output_dtype, diff --git a/tools/nntool/quantization/multiplicative/quantizers/generic_fusion_mult.py b/tools/nntool/quantization/multiplicative/quantizers/generic_fusion_mult.py index b6bcf537e..c9767e147 100644 --- a/tools/nntool/quantization/multiplicative/quantizers/generic_fusion_mult.py +++ b/tools/nntool/quantization/multiplicative/quantizers/generic_fusion_mult.py @@ -44,21 +44,21 @@ def _quantize(cls, params, in_qs, stats, **kwargs): @params_type(ActivationFusionBase, MatMulOpFusionParameters, MatScaleFusionParameters, PaddedAddFusionParameters) @in_qs_constraint(MatchAll({'dtype': np.int8})) -@out_qs_constraint(MatchAll({'dtype': 
np.int8})) +#@out_qs_constraint(MatchAll({'dtype': np.int8})) @fusion_handler class GenericFusionMult(GenericFusionMultBase): pass @params_type(ActivationFusionBase, MatMulOpFusionParameters, MatScaleFusionParameters, PaddedAddFusionParameters) @in_qs_constraint(MatchAll({'dtype': np.uint8})) -@out_qs_constraint(MatchAll({'dtype': np.uint8})) +#@out_qs_constraint(MatchAll({'dtype': np.uint8})) @fusion_handler class GenericFusionMultU8(GenericFusionMultBase): pass @params_type(ActivationFusionBase, MatMulOpFusionParameters, MatScaleFusionParameters, PaddedAddFusionParameters) @in_qs_constraint(MatchAll({'dtype': np.uint16})) -@out_qs_constraint(MatchAll({'dtype': np.uint16})) +#@out_qs_constraint(MatchAll({'dtype': np.uint16})) @fusion_handler class GenericFusionMultU16(GenericFusionMultBase): pass diff --git a/tools/nntool/quantization/multiplicative/quantizers/global_pooling_mult.py b/tools/nntool/quantization/multiplicative/quantizers/global_pooling_mult.py index 3a2484562..d32c2009b 100644 --- a/tools/nntool/quantization/multiplicative/quantizers/global_pooling_mult.py +++ b/tools/nntool/quantization/multiplicative/quantizers/global_pooling_mult.py @@ -58,8 +58,8 @@ def _quantize(cls, params, in_qs, stats, **kwargs): if fusion: # Global pooling fused with activations need to have only the activation scale + #o_q = QType(scale=in_q.scale, dtype=np.int32) o_q = deepcopy(in_q) - o_q.dtype = np.int32 elif force_out_q: if force_out_q.zero_point != in_q.zero_point: return None diff --git a/tools/nntool/quantization/multiplicative/quantizers/matmult_mult.py b/tools/nntool/quantization/multiplicative/quantizers/matmult_mult.py index 5b76154df..a09de7233 100644 --- a/tools/nntool/quantization/multiplicative/quantizers/matmult_mult.py +++ b/tools/nntool/quantization/multiplicative/quantizers/matmult_mult.py @@ -128,7 +128,7 @@ def get_min_max(cls, fusion, stats, all_stats, params): @params_type(MatMulOpParameters) @in_qs_constraint({'dtype': set([np.int8])}) 
-@out_qs_constraint({'dtype': set([np.int8])}) +@out_qs_constraint({'dtype': set([np.int8, np.int32])}) @option_constraint(check_filter_options(False, input_size={8, None}, output_size={8, None})) class MatMultMultSW8(MatMultMultBase): @classmethod @@ -150,7 +150,7 @@ def _quantize(cls, params, in_qs, stats, **kwargs): kwargs['graph_update']['requires_adjust'] = True in_q2 = QType.from_array_sq( arr=in2_node.dqvalue, - quantized_dimension=0, + quantized_dimension=len(in2_node.dqvalue.shape) - 2, dtype=np.int8, narrow_range=True, bits=8) @@ -165,7 +165,7 @@ def _quantize(cls, params, in_qs, stats, **kwargs): if force_out_q: o_q = force_out_q # can't be forced to something not np.int8 - if o_q.dtype != np.int8 or o_q.asymmetric: + if (o_q.dtype != np.int8 and o_q.dtype != np.int32) or o_q.asymmetric: return None LOG.warning(f'node {params.name} output forced to range {o_q.min}/{o_q.max} ' f'{"asymmetric" if o_q.asymmetric else "symmetric"}') diff --git a/tools/nntool/quantization/multiplicative/quantizers/ssd_postprocess.py b/tools/nntool/quantization/multiplicative/quantizers/ssd_postprocess.py index 6f2e413ad..667da7a53 100644 --- a/tools/nntool/quantization/multiplicative/quantizers/ssd_postprocess.py +++ b/tools/nntool/quantization/multiplicative/quantizers/ssd_postprocess.py @@ -26,7 +26,7 @@ @in_qs_constraint({'dtype': np.int8}) class SSDDetectorParametersMult(MultQuantizionHandler): @classmethod - def _quantize(cls, params, in_qs, stats, **kwargs): + def _quantize(cls, params: SSDDetectorParameters, in_qs, stats, **kwargs): force_out_qs, _ = cls.get_mult_opts(**kwargs) force_out_q = force_out_qs and force_out_qs[0] if force_out_q: @@ -39,4 +39,7 @@ def _quantize(cls, params, in_qs, stats, **kwargs): dtype=np.int16, scale=2**(-14)) o_scores_qtype = in_qs[1] o_class_qtype = QType(scale=1, dtype=np.int8) - return QRec.scaled(in_qs=in_qs, out_qs=[o_boxes_qtype, o_class_qtype, o_scores_qtype, o_class_qtype]) + outputs = [o_boxes_qtype, o_class_qtype, 
o_scores_qtype] + if params.output_detection_count: + outputs.append(QType(scale=1, dtype=np.int32)) + return QRec.scaled(in_qs=in_qs, out_qs=outputs) diff --git a/tools/nntool/quantization/qtype.py b/tools/nntool/quantization/qtype.py index de8cac419..7d409ba36 100644 --- a/tools/nntool/quantization/qtype.py +++ b/tools/nntool/quantization/qtype.py @@ -15,7 +15,7 @@ import math from copy import deepcopy -from functools import reduce +from functools import cmp_to_key, reduce import numpy as np from bfloat16 import bfloat16 @@ -132,7 +132,7 @@ def divide_ignore(a, b): IGNORE_KEYS = {'ne16', 'to_dict'} -class AttrNamespace: +class AttrNamespace(): def __init__(self, **kwargs): self.__dict__.update(kwargs) @@ -298,6 +298,26 @@ def __setstate__(self, state): setattr(self, '_dtype', STR_DTYPE[state['dtype']]) setattr(self, '_EventEmitter__raw_listeners', {}) + def _encapsulate(self): + res = {} + for k in self.EXPORT: + v = getattr(self, f'_{k}') + if v is None: + continue + if k == "attr": + res[k] = v.__getstate__() + else: + res[k] = v + return res + + @classmethod + def _dencapsulate(cls, val): + if 'attr' in val: + attr = val['attr'] + val['attr'] = AttrNamespace() + val['attr'].__setstate__(attr) + return QType(**val) + @property def zero_point_asymmetric_zero(self): if self.dtype in [np.int8, np.int16, np.int32]: @@ -335,13 +355,6 @@ def attr(self): def Pow2(cls, bits, q, signed, forced=False): return cls(bits=bits, q=q, signed=signed, forced=forced) - def _encapsulate(self): - return {k: getattr(self, f'_{k}') for k in self.EXPORT - if getattr(self, f'_{k}') is not None} - - @classmethod - def _dencapsulate(cls, val): - return QType(**val) def _update_dtype(self): if self._signed is None or self._bits is None: @@ -380,6 +393,27 @@ def forced_dtype(self): def forced_scale(self): return self._forced.get('scale') + @staticmethod + def precision_key(): + """ Returns a key function that compares precision + """ + def cmp(a, b): + a = float(a) + b = float(b) + return 
(a > b) - (a < b) + def cmp_func(q1: QType, q2: QType): + if q1.is_floating: + if q2.is_floating: + return cmp(q1.bits, q2.bits) + else: + return 1 # q1 > q2 + elif q2.is_floating: + return -1 + # lower scale is more precise + return cmp(np.max(q2.scale), np.max(q1.scale)) + return cmp_to_key(cmp_func) + + def set_forced(self, val=True, flags=None): if flags is None: flags = FORCED_FLAGS @@ -528,7 +562,8 @@ def scale(self, val): @property def has_valid_range(self): - return (self._min_val is not None and self._max_val is not None) or self._scale is not None + return ((self._min_val is not None and self._max_val is not None) or + self._scale is not None or self._q is not None) @property def min_val(self): diff --git a/tools/nntool/quantization/quantization_set.py b/tools/nntool/quantization/quantization_set.py index 1e7fe2138..deae94a7e 100644 --- a/tools/nntool/quantization/quantization_set.py +++ b/tools/nntool/quantization/quantization_set.py @@ -185,6 +185,12 @@ def move_to_fusion(self, node: Parameters, new_pnode: Parameters): del self.qset[nid] if self.stats and nid in self.stats: self.stats[fnid] = self.stats[nid] + if self.options and nid in self.options: + pnid = NodeId(new_pnode) + options = self.options[nid] + del self.options[nid] + self.options[fnid] = options + self.options.setdefault(pnid, {}).update(options) def move_to_node(self, node: Parameters, new_pnode: Parameters): nid = NodeId(node) diff --git a/tools/nntool/quantization/quantizer/new_quantizer.py b/tools/nntool/quantization/quantizer/new_quantizer.py index 4bf9aaeeb..f34a497c0 100644 --- a/tools/nntool/quantization/quantizer/new_quantizer.py +++ b/tools/nntool/quantization/quantizer/new_quantizer.py @@ -15,10 +15,13 @@ import logging from functools import reduce -from operator import attrgetter +from graph.matches.matchers.duplicate_operations import \ + MatchDuplicateOperations +from graph.matches.matchers.insert_copies import MatchInsertCopies from graph.matches.matchers.remove_copies 
import RemoveCopies -from graph.matches.matchers.remove_unnecessary_quantize_operators import RemoveUnnecessaryQuantizeOperators +from graph.matches.matchers.remove_unnecessary_quantize_operators import \ + RemoveUnnecessaryQuantizeOperators from graph.types import (FusionBase, FusionInputParameters, FusionOutputParameters, QuantizeParameters) from graph.types.base import NNEdge @@ -287,14 +290,36 @@ def select_qtype_fusion(qtypes): return qtypes[0] raise CantContinueError() - def get_outqtypes_up(self, G, node): + @staticmethod + def most_precise(qtypes, stat): + # reduce to unique qtypes and sort most precise first + sorted_qtypes = sorted( + reduce( + lambda s, x: s if x in s else s + [x], + qtypes, + []), + key=QType.precision_key(), reverse=True) + if sorted_qtypes[0].is_floating: + return sorted_qtypes[0] + assert stat + # here none are float + # choose closest to range with max bits + max_bits = max(x.bits for x in sorted_qtypes) + sorted_qtypes = filter(lambda x: x.bits == max_bits, sorted_qtypes) + sorted_qtypes = sorted( + sorted_qtypes, + key=lambda x: abs(x.min - stat['min']) + abs(x.max - stat['max'])) + return sorted_qtypes[0] + + + def get_outqtypes_up(self, G, node, stat): # this function copes with the conflict on output edges which is the most complicated scenario since # there can be multiple competing forces. This only handles the cases that we have seen in real models # or been able to emulate in synthetic models. 
qtypes = [] - for cur_qtypes, forced_qtypes in [zip(*[(self.get_qtype_forced_up(edge), self.get_conflict_up(edge)) - for edge in edge_bundle]) - for edge_bundle in G.indexed_out_edges(node)]: + for (cur_qtypes, forced_qtypes), edge_idx in [(zip(*[(self.get_qtype_forced_up(edge), self.get_conflict_up(edge)) + for edge in edge_bundle]), idx) + for idx, edge_bundle in enumerate(G.indexed_out_edges(node))]: forced_qtypes = no_nones(forced_qtypes) if not forced_qtypes: qtypes.append(None) @@ -304,30 +329,10 @@ def get_outqtypes_up(self, G, node): qtypes.append(forced_qtypes[0]) continue else: - # more than one output edge - if any(qtype.is_floating for qtype in forced_qtypes): - return sorted(forced_qtypes, key=attrgetter('bits'))[-1] - uniq_cur_qtypes = reduce( - lambda s, x: s if x in s else s + [x], cur_qtypes, []) - if len(uniq_cur_qtypes) == 1: - uniq_cur_qtype = uniq_cur_qtypes[0] - if len(cur_qtypes) > len(forced_qtypes): - qtypes.append(uniq_cur_qtype) - continue - else: - # all outputs are forced. 
we want to keep the one that best represents - # the output so we calculate the maximum overlapping range - # TODO - what about 16 bit versus 8 bit - if the range overlap is similar - # then lower scale should be taken into account - range_diffs = sorted([(qtype, min(qtype.max, uniq_cur_qtype.max_val) - max( - qtype.min, uniq_cur_qtype.min_val)) for qtype in forced_qtypes], key=lambda x: x[1]) - qtypes.append(range_diffs[-1][0]) - continue - cur_qtypes = ",".join(str(qtype) for qtype in cur_qtypes) - forced_qtypes = ",".join(str(qtype) for qtype in forced_qtypes) - raise NotImplementedError( - f'unexpected quantization conflict seen cur {cur_qtypes} forced {forced_qtypes}' - ' - please contact GreenWaves') + edge_stat = stat and stat['range_out'][edge_idx] + qtypes.append(self.most_precise(cur_qtypes + forced_qtypes, edge_stat)) + continue + return qtypes def get_outqtypes_up_fusion(self, G, node): @@ -626,7 +631,7 @@ def elimination_pass_down(self, cur_G, edge, qtype, visited, fusion=None): self.set_qtype_up(edge, qrec.in_qs[edge.to_idx]) if self.is_conflict(edge): if fusion: - raise CantContinueError() + raise CantContinueError() # @IgnoreException if not was_conflict: self.report_conflict(edge) else: @@ -744,13 +749,13 @@ def elimination_fusion_pass_up(self, parent_node, qrecs, in_qs, out_qs): def evaluate(self, cur_G, node, direction, qrecs, fusion=None): in_qs = self.get_inqtypes_down(cur_G, node) - if fusion: - out_qs = self.get_outqtypes_up_fusion(cur_G, node) - else: - out_qs = self.get_outqtypes_up(cur_G, node) nid = NodeId(node) if fusion is None else NodeId(fusion, fnode=node) pnid = NodeId(node) if fusion is None else NodeId(fusion) stat = self._stats.get(nid, None) + if fusion: + out_qs = self.get_outqtypes_up_fusion(cur_G, node) + else: + out_qs = self.get_outqtypes_up(cur_G, node, stat) opts = self.get_options(pnid) scheme_priority = self.get_scheme_priority(pnid) if isinstance(node, FusionBase) and node.quantize_internals: @@ -794,7 +799,8 @@ def 
continue_down(self, cur_G, qrecs, visited, node, qrec, exclude_edge=None, fu if not self.is_conflict(out_edge): continue qrecs.update(self.elimination_pass_down(cur_G, - out_edge, self.get_qtype_down(out_edge), + out_edge, self.get_qtype_down( + out_edge), visited + [node], fusion=fusion)) def continue_up(self, cur_G, qrecs, visited, node, qrec, exclude_edge=None, fusion=None): @@ -882,6 +888,9 @@ def insert_quantizers(self): for out_edge in self._graph.out_edges(qnode): self._qtypes[out_edge] = to_qtype RemoveCopies().match(self._graph) + MatchDuplicateOperations( + limit_to_dest_classes=QuantizeParameters).match(self._graph) + MatchInsertCopies().match(self._graph) def remove_quantizers(self, only_inserted=False): for node in self._graph.nodes(node_classes=QuantizeParameters): diff --git a/tools/nntool/quantization/quantizer/qrec_to_stats.py b/tools/nntool/quantization/quantizer/qrec_to_stats.py index 5c46424ca..5e86998a3 100644 --- a/tools/nntool/quantization/quantizer/qrec_to_stats.py +++ b/tools/nntool/quantization/quantizer/qrec_to_stats.py @@ -30,17 +30,13 @@ def ranges_are_valid(ranges): return not any(rng['min'] is None or rng['max'] is None for rng in ranges if rng is not None) -def build_stat_from_qrec(qrec, node=None): - if qrec is None: - return None - if qrec.in_qs is None or qrec.out_qs is None: - return None +def build_stat_from_qtypes(in_qs, out_qs, node=None): range_in = [None if qtype is None else ({'min': qtype.min_val, 'max': qtype.max_val} if qtype.has_valid_range else {'min': None, 'max': None}) - for qtype in qrec.in_qs] + for qtype in in_qs] range_out = [None if qtype is None else ({'min': qtype.min_val, 'max': qtype.max_val} if qtype and qtype.has_valid_range else {'min': None, 'max': None}) - for qtype in qrec.out_qs] + for qtype in out_qs] range_in_valid = ranges_are_valid(range_in) range_out_valid = ranges_are_valid(range_out) if not range_in_valid or not range_out_valid: @@ -62,6 +58,47 @@ def build_stat_from_qrec(qrec, node=None): } 
+def build_stat_from_qrec(qrec, node=None): + if qrec is None: + return None + if qrec.in_qs is None or qrec.out_qs is None: + return None + return build_stat_from_qtypes(qrec.in_qs, qrec.out_qs, node=node) + + +def build_fusion_stats(stats: dict, fusion: FusionBase): + inputs = fusion.subgraph.inputs() + in_stats = [None] * len(inputs) + for sub_node in inputs: + edge = fusion.subgraph.out_edges(sub_node)[0] + stat = stats.get(NodeId(edge.to_node)) + if stat is None: + in_stats = None + break + range_in = stat['range_in'] + if len(range_in) <= edge.to_idx: + in_stats = None + break + in_stats[sub_node.idx] = range_in[edge.to_idx] + outputs = fusion.subgraph.outputs() + out_stats = [None] * len(outputs) + for sub_node in outputs: + edge = fusion.subgraph.in_edges(sub_node)[0] + stat = stats.get(NodeId(edge.from_node)) + if stat is None: + out_stats = None + break + range_out = stat['range_out'] + if len(range_out) <= edge.from_idx: + out_stats = None + break + out_stats[sub_node.idx] = range_out[edge.from_idx] + return { + 'range_in': in_stats, + 'range_out': out_stats + } + + def build_stat(G, nid, node=None): if not G.quantization: return None @@ -87,6 +124,9 @@ def set_stats(G, current_stats=None, current_options=None): qrec = G.quantization.get( nid) if G.quantization else None stats[nid] = build_stat_from_qrec(qrec) + nid = NodeId(node) + if G.quantization and nid not in G.quantization: + stats[nid] = build_fusion_stats(stats, node) elif isinstance(node, ExpressionFusionParameters): if stats[nid] is None or 'expression' not in stats[nid]: if (G.quantization is None or nid not in G.quantization or G.quantization[nid].cache is None or @@ -96,6 +136,7 @@ def set_stats(G, current_stats=None, current_options=None): stats[nid]['expression'] = G.quantization[nid].cache['expression'] elif isinstance(node, ConstantInputParameters): if G.quantization and nid in G.quantization: - current_options.setdefault(nid, {})['qtype_ind'] = G.quantization[nid].out_qs[0] + 
current_options.setdefault( + nid, {})['qtype_ind'] = G.quantization[nid].out_qs[0] return stats, current_options diff --git a/tools/nntool/quantization/quantizer_options.py b/tools/nntool/quantization/quantizer_options.py index d76d85623..48f114c18 100644 --- a/tools/nntool/quantization/quantizer_options.py +++ b/tools/nntool/quantization/quantizer_options.py @@ -136,6 +136,14 @@ 'default': 'fastfloat' } +CLIP_TYPE_OPTION = { + 'name': 'clip_type', + 'type': str, + 'choices': ['laplace', 'gaus', 'mix', 'none'], + 'help': 'Clipping method for filter output activations min max. laplace or gaussian distribution or choose based on MSE or no clipping', + 'default': 'none' +} + BIAS_SIZE_OPTION = { 'name': 'pow2_biases', 'type': int, diff --git a/tools/nntool/reports/draw_graph_reporter.py b/tools/nntool/reports/draw_graph_reporter.py index ec23e8d3a..da7b40b5c 100644 --- a/tools/nntool/reports/draw_graph_reporter.py +++ b/tools/nntool/reports/draw_graph_reporter.py @@ -17,7 +17,7 @@ from expressions.symbolic.symbol import Constant, Variable from graph.nngraph import NNGraph -from graph.types import ExpressionFusionParameters, FusionBase +from graph.types import ExpressionFusionParameters, FusionBase, Parameters from graph.types.fusions import FusionInputParameters, FusionOutputParameters from graphviz import Digraph, nohtml from quantization.qtype import QType @@ -92,6 +92,12 @@ def insert_tag(idx, tag, names): names[idx] = [f'{tag} {name}'] + names[idx][1::] return + @staticmethod + def get_label(node, anon): + if hasattr(node, 'graph_label'): + return node.graph_anon_label if anon else node.graph_label + return [node.name] + @staticmethod def build_nodebox(node, ports, num_in, num_out, anon=False): trans_in = DrawGraphReporter.get_trans(node, 'in') @@ -106,10 +112,10 @@ def build_nodebox(node, ports, num_in, num_out, anon=False): edges = [ f' {idx if num_in > 1 else ""}{trans[idx] if idx < len(trans) else ""}' for idx in range(num_in)] names.append(edges) - 
names.extend(node.graph_anon_label if anon else node.graph_label) + names.extend(DrawGraphReporter.get_label(node, anon)) else: ports[0] = [f'{node.name}:name'] - names.extend(node.graph_anon_label if anon else node.graph_label) + names.extend(DrawGraphReporter.get_label(node, anon)) DrawGraphReporter.insert_tag(0, f'', names) if num_out > 1 or trans_out: if trans_out: @@ -222,16 +228,18 @@ def in_label(self, G, edge, qrecs, parent=None, to_node=True, from_node=True): if not from_qtype.quantization_equal(qtype): return f'{from_qtype}/{qtype}', True return str(qtype), False - else: + elif isinstance(node, Parameters): if node.in_dims: return self.dim_or_error(node.in_dims, idx) return 'not set', True + return '', False + def report_graph(self, G: NNGraph, dot, all_ports, fake_idx, nodes=None, all_dims=False, anonymise=False, expressions=False, qrecs=None, fusions=False, parent=None): if nodes is None: nodes = set(G.nodes()) - for node in G.dfs(): + for node in G.topological_sort(): if node not in nodes: continue if isinstance(node, (FusionInputParameters)): @@ -239,7 +247,7 @@ def report_graph(self, G: NNGraph, dot, all_ports, fake_idx, nodes=None, all_dim if expressions and isinstance(node, ExpressionFusionParameters): all_ports[node] = self.report_expression( dot, G, node, anonymise=anonymise, report_quantized=expressions == "quantized") - elif fusions and isinstance(node, FusionBase): + elif fusions and isinstance(node, FusionBase) and node.quantize_internals: all_ports[node] = self.report_fusion( dot, G, node, all_ports, fake_idx, all_dims=all_dims, anonymise=anonymise, expressions=expressions, qrecs=qrecs) @@ -251,8 +259,13 @@ def report_graph(self, G: NNGraph, dot, all_ports, fake_idx, nodes=None, all_dim if not isinstance(node, FusionOutputParameters): names = self.build_nodebox( node, ports, num_in_edges, num_out_edges, anon=anonymise) - dot.node(node.name, nohtml(names), shape='record', - xlabel=f"{node.step_idx}" if parent is None else "", color="blue" if 
node.is_not_generated else "black") + if not isinstance(node, Parameters): + dot.node(node.name, nohtml(names), + shape='record', color='black') + else: + dot.node(node.name, nohtml(names), shape='record', + xlabel=f"{node.step_idx}" if parent is None else "", + color="blue" if node.is_not_generated else "black") for edge in G.in_edges(node.name): if edge.from_node not in nodes: if not all_dims: @@ -318,7 +331,7 @@ def report(self, G: NNGraph, nodes=None, graph_format='PDF', all_dims=False, qrecs = None self.init_name_cache() all_ports = {} - graph_name = G.graphname if hasattr(G, 'graphname') else 'graph' + graph_name = G.name if hasattr(G, 'name') else 'graph' dot = Digraph(comment=graph_name, format=graph_format, node_attr={ 'height': '.1'}, edge_attr={'fontsize': '10.0'}) fake_idx = 0 @@ -376,7 +389,8 @@ def report_expression(self, dot: Digraph, G: NNGraph, else: dot.node(var.name, nohtml(var_name), shape='plaintext', fontsize='10.0') - sub.edge(node_id, var.name, xlabel=f'{str_shape(shape)}') + sub.edge( + node_id, var.name, xlabel=f'{str_shape(shape)}', color="red" if shape is None else "black") return [node.input_symbols, node.output_symbols] @@ -424,12 +438,17 @@ def report_symbol(self, dot, symbol, intermediates, anonymise=False): const_name = self.get_next('Const') dot.node(const_name, 'Const' if anonymise else str( symbol.value[0]), shape='oval', fontsize='10.0') - return const_name, None if len(symbol.shape) == 1 else symbol.shape + return const_name, symbol.shape ids_and_shapes = [self.report_symbol(dot, sym, intermediates, anonymise=anonymise) for sym in symbol.contents] func_label = self.get_next( 'Op') if anonymise else symbol.__class__.__name__ dot.node(symbol.name, nohtml(func_label), shape='record') for child_id, shape in ids_and_shapes: - dot.edge(child_id, symbol.name, xlabel=f'{str_shape(shape)}') - return symbol.name, symbol.shape + dot.edge(child_id, symbol.name, + xlabel=f'{str_shape(shape)}', color="red" if shape is None else "black") + 
try: + symbol_shape = symbol.shape + except ValueError: + symbol_shape = None + return symbol.name, symbol_shape diff --git a/tools/nntool/requirements.txt b/tools/nntool/requirements.txt index 3a9417a05..4694226d8 100644 --- a/tools/nntool/requirements.txt +++ b/tools/nntool/requirements.txt @@ -12,7 +12,7 @@ argcomplete==1.10.0 Cython==0.29.21 scikit-image==0.17.2 scikit-learn==0.21.3 -onnx==1.8.0 +onnx==1.10.2 prettytable==0.7.2 iteration-utilities==0.11.0 bfloat16==1.0 diff --git a/tools/nntool/stats/activation_ranges_collector.py b/tools/nntool/stats/activation_ranges_collector.py index 42c8fd593..913187d3f 100644 --- a/tools/nntool/stats/activation_ranges_collector.py +++ b/tools/nntool/stats/activation_ranges_collector.py @@ -18,10 +18,9 @@ import numpy as np from execution.graph_executer import GraphExecuter -from graph.types import (FilterParameters, LSTMParameters, - MultiplicativeBiasParameters, RNNBaseParameters) -from graph.types.expression_fusion import ExpressionFusionParameters from graph.types.fusions import FusionBase, FusionInputParameters +from stats.ranges_utils import collect_stat, update_ranges +from utils.json_serializable import JsonSerializable from utils.node_id import NodeId from .stats_collector import GraphStatsCollector @@ -41,10 +40,75 @@ def update_peraxis(var, arr: np.ndarray): per_axis_elem['max'] = np.maximum( per_axis_elem['max'], arr.max(axis=other_axis)) +class Rolling(JsonSerializable): + def __init__(self) -> None: + self._values = [] -def update_ema(ema, value, decay): - ema = value * decay + (1 - decay) * ema - return ema + def __float__(self): + if not self._values: + return 0 + return float(np.sum(self._values)/len(self._values)) + + def add_val(self, val: float): + self._values.append(val) + + def _encapsulate(self): + return float(self) + + def __mul__(self, other): + return float(self).__mul__(other) + + def __add__(self, other): + return float(self).__add__(other) + + def __truediv__(self, other): + return 
float(self).__truediv__(other) + + def __floordiv__(self, other): + return float(self).__floordiv__(other) + + def __mod__(self, other): + return float(self).__mod__(other) + + def __divmod__(self, other): + return float(self).__divmod__(other) + + def __pow__(self, other): + return float(self).__pow__(other) + + def __sub__(self, other): + return float(self).__sub__(other) + + def __radd__(self, other): + return float(self).__radd__(other) + + def __rsub__(self, other): + return float(self).__rsub__(other) + + def __rmul__(self, other): + return float(self).__rmul__(other) + + def __rtruediv__(self, other): + return float(self).__rtruediv__(other) + + def __rfloordiv__(self, other): + return float(self).__rfloordiv__(other) + + def __rmod__(self, other): + return float(self).__rmod__(other) + + def __rpow__(self, other): + return float(self).__rpow__(other) + + @classmethod + def _dencapsulate(cls, val): + return val + + def __repr__(self) -> str: + return f'{float(self)}' + + def __str__(self) -> str: + return f'{float(self)}' class ActivationRangesCollector(GraphStatsCollector): @@ -55,40 +119,10 @@ def __init__(self, graph_execution=None, use_ema=False, ema_decay=0.999): self.use_ema = use_ema self.ema_decay = ema_decay - def update_expression_ranges(self, stat, details): - if 'expression' in stat: - stat = stat['expression'] - for sym_name, rec in details.items(): - if sym_name == "results": - continue - stat_rec = stat.setdefault( - sym_name, {'min': float('inf'), 'max': float('-inf')}) - stat_rec['min'] = min(stat_rec['min'], rec['min']) - stat_rec['max'] = max(stat_rec['max'], rec['max']) - else: - stat['expression'] = deepcopy(details) - - def collect_stat(self, stat, name, details, details_name=None): - range_stat = stat.get(name) - if not range_stat: - range_stat = {'min': float('inf'), 'max': float('-inf')} - stat[name] = range_stat - if details_name is None: - self.update_ranges( - range_stat, details[name]['min'], details[name]['max']) - else: - 
self.update_ranges( - range_stat, details['min_' + details_name], details['max_' + details_name]) - - def update_ranges(self, range_out, tensor_min, tensor_max): - if self.use_ema and all([range_out['min'] != float('inf'), range_out['max'] != float('-inf')]): - range_out['min'] = update_ema( - range_out['min'], tensor_min, self.ema_decay) - range_out['max'] = update_ema( - range_out['max'], tensor_max, self.ema_decay) - else: - range_out['min'] = min(range_out['min'], tensor_min) - range_out['max'] = max(range_out['max'], tensor_max) + + def collect_stat(self, stat: dict, name, details, details_name=None): + ema_decay = self.ema_decay if self.use_ema else None + collect_stat(stat, name, details, details_name=details_name, ema_decay=ema_decay) def collect_stats(self, G, input_tensors, step_idx=None): if self._graph_execution is None: @@ -113,7 +147,9 @@ def collect_stats(self, G, input_tensors, step_idx=None): { 'min': float('inf'), 'max': float('-inf'), - 'std': 0.0 + 'std': Rolling(), + 'mean': Rolling(), + 'b': Rolling() } for _ in output_tensors] stat = { 'range_in': range_in, @@ -145,26 +181,18 @@ def collect_stats(self, G, input_tensors, step_idx=None): for idx, tensor in enumerate(output_tensors): range_out = stat['range_out'][idx] - self.update_ranges(range_out, tensor.min(), tensor.max()) - range_out['std'] = np.std(tensor) + ema_decay = self.ema_decay if self.use_ema else None + update_ranges(range_out, tensor.min(), tensor.max(), ema_decay=ema_decay) + range_out['std'].add_val(np.std(tensor)) + mean = np.mean(tensor) + range_out['mean'].add_val(mean) + range_out['b'].add_val(np.mean(np.abs(tensor - mean))) update_peraxis(range_out, tensor) - if isinstance(node, FilterParameters): - if details: - self.collect_stat(stat, 'range_acc', - details, details_name='acc') - if isinstance(node, MultiplicativeBiasParameters) and node.has_mul_bias: - self.collect_stat( - stat, 'range_pre_mul_bias', details, details_name='pre_mul_bias') - elif isinstance(node, 
RNNBaseParameters): - if details: - for k in details: - if k.startswith('range_'): - self.collect_stat(stat, k, details) - elif isinstance(node, ExpressionFusionParameters): - if details: - self.update_expression_ranges(stat, details) - elif isinstance(node, FusionBase) and pnode.quantize_internals: + if details: + node.details_collector(self.stats, stat, details) + + if isinstance(node, FusionBase) and pnode.quantize_internals: for inode in node.subgraph.nodes(node_classes=FusionInputParameters): finput_in_stat = stat['range_in'][inode.idx] for edge in node.subgraph.out_edges(inode.name): diff --git a/tools/nntool/stats/ranges_utils.py b/tools/nntool/stats/ranges_utils.py new file mode 100644 index 000000000..0c0bdad70 --- /dev/null +++ b/tools/nntool/stats/ranges_utils.py @@ -0,0 +1,37 @@ +# Copyright (C) 2022 GreenWaves Technologies, SAS + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . 
+ +import numpy as np + + +def update_ema(ema, value, decay): + ema = value * decay + (1 - decay) * ema + return ema + +def update_ranges(range_out, tensor_min, tensor_max, ema_decay=None): + if ema_decay is not None and all([range_out['min'] != float('inf'), range_out['max'] != float('-inf')]): + range_out['min'] = update_ema( + range_out['min'], tensor_min, ema_decay) + range_out['max'] = update_ema( + range_out['max'], tensor_max, ema_decay) + else: + range_out['min'] = min(range_out['min'], tensor_min) + range_out['max'] = max(range_out['max'], tensor_max) + +def collect_stat(stat: dict, name, details, details_name=None, ema_decay=None): + range_stat = stat.setdefault(name, {'min': float('inf'), 'max': float('-inf')}) + postfix = "" if details_name is None else f'_{details_name}' + tensors = tuple(details[f'{key}{postfix}'] for key in ('min', 'max')) + update_ranges(range_stat, *tensors, ema_decay=ema_decay) diff --git a/tools/nntool/utils/compatible_transposes.py b/tools/nntool/utils/compatible_transposes.py index d5c869381..eb3251931 100644 --- a/tools/nntool/utils/compatible_transposes.py +++ b/tools/nntool/utils/compatible_transposes.py @@ -13,8 +13,11 @@ # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see . 
-from typing import Sequence from functools import reduce +from typing import Sequence + +from graph.manipulations.eliminate_transposes.transpose_helpers import ( + apply_transpose, indexes_of, reverse_transpose) def reduce_one(state, num): @@ -85,7 +88,7 @@ def find_first(combination, idx): return fc_idx elif idx in fc_set: # item is not first so no match - raise IndexError() # @IgnoreException + raise IndexError() # @IgnoreException return None @@ -98,7 +101,7 @@ def len_at_start(l, elem): def compatible_transpose(combination, trans): - """Determines if the transpose can be expressed in the combination in fc + """Determines if the transpose can be expressed in the combination in descriptor found by find_combination """ res = [] trans = list(trans) @@ -114,9 +117,9 @@ def compatible_transpose(combination, trans): continue # we have a segment who's first element matches the transpose # the rest of the elements in the segment must be in order - fc_idx = first_idx - while trans and fc_idx < len(combination): - segs = combination[fc_idx] + descriptor_idx = first_idx + while trans and descriptor_idx < len(combination): + segs = combination[descriptor_idx] if segs is not None: # if segs is none then it matches anything # the first seg must match since it was found by find_first # this tests if another seg doesn't match in which case we are @@ -131,16 +134,16 @@ def compatible_transpose(combination, trans): # so no solution if trans[0] != oidx: return False - fc_idx += 1 + descriptor_idx += 1 # this trans element no longer matches the next segment or we # reached the end if trans: trans.pop(0) # add range from first to last on first time around # this will include idxes idxes of Nones after this segment - res += list(range(first_idx, fc_idx)) + res += list(range(first_idx, descriptor_idx)) at_start = len_at_start(combination, None) - # The only bit we won't have matched is a fc that starts with None's so + # The only bit we won't have matched is a descriptor that starts 
with None's so # add those indexes to the start return tuple(list(range(0, at_start)) + res) @@ -154,3 +157,170 @@ def find_all_compatible_transposes(combinations, trans): def find_compatible_transpose(fcs, trans): return next(find_all_compatible_transposes(fcs, trans), None) + + +def expand_to_len(trans, length): + extra = length-len(trans) + return tuple(list(range(extra)) + [dim + extra for dim in trans]) + + +def reduce_to_len(trans, length): + extra = len(trans) - length + return tuple([dim - extra for dim in trans if dim >= extra]) + + +def no_ones(l): + return tuple(elem for elem in l if elem != 1) + + +def ones_shuffled(from_shape, to_shape): + if len(from_shape) != len(to_shape) or no_ones(from_shape) != no_ones(to_shape): + return False + return True + + +def reshape_shuffle_trans(from_shape, to_shape): + from_shape = list(enumerate(from_shape)) + to_shape = list(enumerate(to_shape)) + ones_pos_from = tuple(shape[0] for shape in from_shape if shape[1] == 1) + ones_pos_to = list(shape[0] for shape in to_shape if shape[1] == 1) + idx_to = 0 + idx_from = 0 + idx_from_ones = 0 + trans = [] + for idx_to in range(len(to_shape)): + if idx_to in ones_pos_to: + trans.append(ones_pos_from[idx_from_ones]) + idx_from_ones += 1 + else: + while idx_from in ones_pos_from: + idx_from += 1 + trans.append(idx_from) + idx_from += 1 + return trans + + +def is_broadcasted(from_shape, to_shape): + from_len = len(from_shape) + to_len = len(to_shape) + if from_len >= to_len: + return False + return tuple(([1] * (to_len - from_len)) + list(from_shape)) == tuple(to_shape) + + +def broadcast_transpose(from_shape, to_shape, going_up): + from_len = len(from_shape) + to_len = len(to_shape) + if going_up: + return tuple((idx,) for idx in range(to_len - from_len, to_len)) + return tuple(([None] * (to_len - from_len))+list(range(from_len))) + + +def apply_combination(shape, comb): + res = [] + comb = list(comb) + while comb: + elem = comb.pop(0) + if elem is None: + res.append(1) + else: 
+ res.append(reduce(lambda x, y: x*y, [shape[i] for i in elem])) + return tuple(res) + + +def transpose_combination(comb, trans): + res = [] + comb = list(comb) + while comb: + elem = comb.pop(0) + if elem is None: + res.append(None) + else: + res.append(tuple(trans.index(i) for i in elem)) + return tuple(res) + + +def calc_new_reshape(trans, new_trans, from_shape, to_shape, going_up): + if going_up: + # transpose is in the up direction so is reversed + # we want to apply it in the down direction so reverse it + new_to_shape = apply_transpose(to_shape, reverse_transpose(trans)) + # the from_shape gets the new transpose applied to it - this may result in the reshape being eliminated + # since the shape change that it caused is already in the transpose + # NOTE - Looking at the reshape as a transpose itself is not correct. It is a shuffle not a transpose + # the tensor physical order is not changed unlike a transpose + new_from_shape = apply_transpose( + from_shape, reverse_transpose(new_trans)) + else: + # transpose is in the down direction but we want to pass it through this reshape so + # we want to reverse its effect + new_from_shape = apply_transpose(from_shape, reverse_transpose(trans)) + # the to_shape gets the new transpose applied to it - this may result in the reshape being eliminated + # since the shape change that it caused is already in the transpose + new_to_shape = apply_transpose(to_shape, reverse_transpose(new_trans)) + return ( + tuple(new_trans), + tuple(new_from_shape), + tuple(new_to_shape)) + + +def calc_failure_reshapes(trans, from_shape, to_shape, going_up): + if going_up: + new_to_shape = tuple(apply_transpose( + to_shape, reverse_transpose(trans))) + new_from_shape = None + else: + new_from_shape = tuple(apply_transpose( + from_shape, reverse_transpose(trans))) + new_to_shape = None + return ( + None, + new_from_shape, + new_to_shape) + + +def reverse_reshape(trans, from_shape, to_shape, going_up=False): + """reverses the effect of this 
reshape on the transpose. If going up is set then then + the transpose is in the direction to_shape -> from_shape""" + + if len(from_shape) == 0 or len(to_shape) == 0: + return calc_failure_reshapes(trans, from_shape, to_shape, going_up) + + # if the from_shape -> to_shape is actually a broadcast reshape + # i.e. 4, 10, 1 -> 1, 4, 10, 1 we absolutely need to keep the order 4, 10, 1 in + # the transpose however the 2 1s in the result are ambiguous so handle this as a + # (simple) special case. Just expand the transpose with no transpose at the start + # and expand_len + original transpose dim at the end + if is_broadcasted(from_shape, to_shape): + broad_trans = broadcast_transpose(from_shape, to_shape, going_up) + if going_up: + new_trans = reverse_transpose(reduce_to_len( + reverse_transpose(trans), len(from_shape))) + else: + new_trans = reverse_transpose(expand_to_len( + reverse_transpose(trans), len(to_shape))) + return calc_new_reshape(trans, new_trans, from_shape, to_shape, going_up) + + # consider the shapes in the correct order + shape_order = (to_shape, from_shape) if going_up else ( + from_shape, to_shape) + + if ones_shuffled(shape_order[0], shape_order[1]): + shuffle_trans = reshape_shuffle_trans(shape_order[0], shape_order[1]) + new_trans = apply_transpose(trans, shuffle_trans) + return calc_new_reshape(trans, new_trans, from_shape, to_shape, going_up) + + for combination in find_combination(*shape_order): + if not combination: + continue + # going down we are looking at where we could transpose the reshape combination back up the + # graph in a valid way and then reverse that transpose + # going up re are propagating a reversed transpose so we still need to reverse + reversed_new_trans = compatible_transpose( + combination, reverse_transpose(trans)) + if not reversed_new_trans or len(reversed_new_trans) != len(shape_order[1]): + continue + new_trans = reverse_transpose(reversed_new_trans) + return calc_new_reshape(trans, new_trans, from_shape, 
to_shape, going_up) + + return calc_failure_reshapes(trans, from_shape, to_shape, going_up) diff --git a/tools/nntool/utils/exception.py b/tools/nntool/utils/exception.py new file mode 100644 index 000000000..a4771cd5e --- /dev/null +++ b/tools/nntool/utils/exception.py @@ -0,0 +1,20 @@ +# Copyright (C) 2022 GreenWaves Technologies, SAS + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . + +class NNToolInternelError(Exception): + pass + +class NNToolNotImplementedError(NotImplementedError): + pass diff --git a/tools/nntool/utils/graph.py b/tools/nntool/utils/graph.py index 31423e1eb..e3372f99e 100644 --- a/tools/nntool/utils/graph.py +++ b/tools/nntool/utils/graph.py @@ -1,4 +1,4 @@ -# Copyright (C) 2020 GreenWaves Technologies, SAS +# Copyright (C) 2020, 2022 GreenWaves Technologies, SAS # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU Affero General Public License as @@ -15,9 +15,16 @@ from itertools import zip_longest -from collections import OrderedDict, deque from collections.abc import Iterable, Mapping -from typing import KeysView, Union, Sequence +from typing import Optional, Set, Tuple, Union, Sequence + + +def is_iterable(x): + try: + iter(x) # @IgnoreException + except TypeError: + return False + return True class GraphError(Exception): @@ -46,9 +53,13 @@ class Node(): '''Node class to inherit for nodes''' def __init__(self, 
name: str, *args, **kwargs): - super(Node, self).__init__(*args, **kwargs) + super().__init__(*args, **kwargs) self._name = name + @property + def _noderef_class(self): + return NodeRef + @property def name(self): '''Node name - must not be changed once node is in graph''' @@ -59,18 +70,72 @@ def name(self, name): '''Node name - must not be changed once node is in graph''' self._name = name + def __call__(self, *args, num_outputs=1): + inputs = [] + fragments = set() + for arg in args: + if arg is not None and not isinstance(arg, self._noderef_class): + raise ValueError( + f"expecting {self._noderef_class.__name__} or None") + inputs.append(arg.ref[0] if arg else None) + fragments.add(arg.ref[1] if arg else None) + fragment = next( + iter([frag for frag in fragments if frag is not None]), None) + if fragment is None: + raise ValueError("No inputs") + other_fragments = fragments - {fragment} + + for other in other_fragments: + if hasattr(fragment, 'merge'): + if other is not None: + fragment.merge(other) + else: + raise ValueError('graph has no merge method') + + for to_idx, from_tuple in enumerate(inputs): + if from_tuple is not None: + from_node, from_idx = from_tuple + fragment.add_edge(fragment._edge_class(from_node=from_node, + from_idx=from_idx, + to_node=self, + to_idx=to_idx)) + if num_outputs == 1: + return self._noderef_class(fragment, self, 0) + return tuple(self._noderef_class(fragment, self, idx) for idx in range(num_outputs)) + def __str__(self): return self._name class NodeRef(): - def __init__(self, node) -> None: + def __init__(self, G: "GraphView", node: Node, idx: int) -> None: + self._G = G self._node = node + self._idx = idx + + @property + def G(self) -> "GraphView": + return self._G + + @property + def ref(self) -> Tuple[Tuple[Node, int], "GraphView"]: + return ((self._node, self._idx), self._G) @property - def node(self): + def node(self) -> Node: return self._node + def __eq__(self, o: object) -> bool: + if isinstance(o, NodeRef): + return 
super().__eq__(o) + return self._node.__eq__(o) + + def __hash__(self) -> int: + return self._node.__hash__() + + def __call__(self, *args, **kwargs): + raise ValueError("this is already a reference") + class MatchNode(Node): '''Node class to inherit for node matchers''' @@ -136,6 +201,12 @@ def __init__(self, from_node: Union[str, Node, NodeRef], to_node: Union[str, Nod raise ValueError('expecting int for to_idx') self._link = (from_node, from_idx, to_node, to_idx) + @classmethod + def from_src_to_dest(cls, from_edge, to_edge): + return cls( + from_node=from_edge.from_node, from_idx=from_edge.from_idx, + to_node=to_edge.to_node, to_idx=to_edge.to_idx) + @property def from_node(self): '''Edge start node''' @@ -194,18 +265,53 @@ def __hash__(self): class GraphView(Mapping): - def __init__(self): - self._out_edges = OrderedDict() - self._in_edges = OrderedDict() - self._nodes = OrderedDict() - self._hidden = False - self._hidden_nodes = [] + def __init__(self, **attr): + self._out_edges = {} + self._in_edges = {} + self._nodes = {} + self._attr = attr @classmethod # pylint: disable=unused-argument def clone_factory(cls, G): return cls() + def with_hidden_nodes(self, hidden_fn, edge_class=None): + if edge_class is None: + edge_class = Edge + + def real_up_node(G, edge): + if hidden_fn(edge.from_node): + edges = G.in_edges(edge.from_node) + assert len(edges) == 1 + return real_up_node(G, edges[0]) + return edge.from_node, edge.from_idx + + def copy_node(G, new_graph, node): + for edge in self.in_edges(node): + from_node, from_idx = real_up_node(G, edge) + new_edge = edge_class(from_node=from_node, from_idx=from_idx, + to_node=node, to_idx=edge.to_idx) + if new_graph.has_edge(new_edge): + continue + new_graph.add_edge(new_edge) + copy_node(G, new_graph, from_node) + + new_graph = self.__class__() + setattr(new_graph, '_attr', self._attr) + for node in self.outputs(): + copy_node(self, new_graph, node) + return new_graph + + def has_edge(self, edge): + edges = 
self._in_edges.get(edge.to_node.name) + if not edges: + return False + edges = edges.get(edge.from_node.name) + if not edges: + return False + return edge in edges + def clear(self): '''Clears the graph view''' if self._nodes: @@ -221,6 +327,12 @@ def clone(self) -> 'GraphView': clone._nodes = self._nodes.copy() return clone + def merge(self, other: 'Graph'): + if self != other: + for edge in other.edges: + self.add_edge(edge) + return self + def num_nodes(self): '''Number of nodes len(GraphView) also works''' return len(self) @@ -232,14 +344,8 @@ def num_edges(self): for edge in edge_list) def __add_in_edge(self, edge: Edge, update=False): - edges = self._in_edges.get(edge.to_node.name) - if not edges: - edges = {} - self._in_edges[edge.to_node.name] = edges - edge_list = edges.get(edge.from_node.name) - if edge_list is None: - edge_list = [] - edges[edge.from_node.name] = edge_list + edges = self._in_edges.setdefault(edge.to_node.name, {}) + edge_list = edges.setdefault(edge.from_node.name, []) edge_idx = next((i for i, x in enumerate(edge_list) if x == edge), -1) if edge_idx >= 0: if update: @@ -255,14 +361,8 @@ def __add_in_edge(self, edge: Edge, update=False): edge_list.append(edge) def __add_out_edge(self, edge: Edge, update=False): - edges = self._out_edges.get(edge.from_node.name) - if not edges: - edges = {} - self._out_edges[edge.from_node.name] = edges - edge_list = edges.get(edge.to_node.name) - if edge_list is None: - edge_list = [] - edges[edge.to_node.name] = edge_list + edges = self._out_edges.setdefault(edge.from_node.name, {}) + edge_list = edges.setdefault(edge.to_node.name, []) edge_idx = next((i for i, x in enumerate(edge_list) if x == edge), -1) if edge_idx >= 0: if update: @@ -287,11 +387,6 @@ def verify_edges(self, check_connected=True): def add_edge(self, edge: Edge): '''Adds an edge to the graph''' - hidden_state = self._hidden - if hidden_state: - self._hidden = False - if self._hidden_nodes: - raise ValueError('cannot modify graph when 
nodes are hidden') if isinstance(edge.from_node, str): edge = edge.clone(from_node=self._nodes[edge.from_node]) elif edge.from_node.name not in self._nodes: @@ -304,13 +399,9 @@ def add_edge(self, edge: Edge): self._nodes[edge.to_node.name] = edge.to_node self.__add_in_edge(edge) self.__add_out_edge(edge) - if hidden_state: - self._hidden = True def node(self, node_name): '''Find a node by name. GraphView[node_name] also works''' - if self._hidden and any(node_name == node.name for node in self._hidden_nodes): - raise IndexError(f'{node_name} is hidden') return self[node_name] def insert_node(self, node_to_insert, from_node_name, @@ -318,9 +409,6 @@ def insert_node(self, node_to_insert, from_node_name, node_input_idx=0, node_output_idx=0, edge_class=None): '''Inserts a node between two existing nodes''' - hidden_state = self._hidden - if hidden_state: - self._hidden = False if edge_class is None: edge_class = Edge node_to_insert = resolve_node(node_to_insert) @@ -333,8 +421,6 @@ def insert_node(self, node_to_insert, from_node_name, from_idx=from_idx, to_idx=node_input_idx)) self.add_edge(edge_class(node_to_insert, to_node_name, from_idx=node_output_idx, to_idx=to_idx)) - if hidden_state: - self._hidden = True def edge(self, from_node_name: str, to_node_name: str, from_idx: int = 0, to_idx: int = 0): '''Finds first edge between two nodes - WARNING - probably not good in weird situation @@ -398,7 +484,8 @@ def predecessor_names(self, node_name: str) -> Iterable: def nodes(self, node_classes=None, sort=False): '''All the nodes in the graph. 
GraphView.values() also works.''' if node_classes is not None: - nodes = [node for node in self._nodes.values() if isinstance(node, node_classes)] + nodes = [node for node in self._nodes.values( + ) if isinstance(node, node_classes)] else: nodes = list(self._nodes.values()) if sort: @@ -451,34 +538,48 @@ def connected_nodes(self, node_or_node_name): edge.to_node for edge in self.out_edges(node_or_node_name)) return list(connected_nodes) - def is_vertex_cut(self, node_set, node=None, visited=None): - if visited is None: - visited = set() - if node is None: - inputs = set(self.inputs()) - # choose one input node (or successor) that is not in the node_set - start_node = None - while inputs: - node = inputs.pop() - # if the input node is actually in the set then move past it - # this ensures that if the node_set is at the start of the graph - # and does not divide the graph it is not reported as a cut - if node not in node_set: - start_node = node - break - inputs.update(edge.to_node for edge in self.out_edges(node)) - self.is_vertex_cut(node_set, node=start_node, visited=visited) - return len(visited) < (len(self) - len(node_set)) - # undirected dfs + def _old_undirected_dfs(self, node, stop_at, pass_at, visited): + if node in stop_at or node in visited: + return + if pass_at and node not in pass_at: + return visited.add(node) - for edge in self.out_edges(node): - if edge.to_node in visited | node_set: - continue - self.is_vertex_cut(node_set, node=edge.to_node, visited=visited) + yield node for edge in self.in_edges(node): - if edge.from_node in visited | node_set: - continue - self.is_vertex_cut(node_set, node=edge.from_node, visited=visited) + yield from self._old_undirected_dfs(edge.from_node, stop_at, pass_at, visited) + for edge in self.out_edges(node): + yield from self._old_undirected_dfs(edge.to_node, stop_at, pass_at, visited) + + def old_undirected_dfs(self, stop_at=None, start_at=None, pass_at=None): + if start_at is None: + start_at = list(self.inputs()) 
+ elif is_iterable(start_at): + start_at = list(start_at) + else: + start_at = [start_at] + if stop_at is None: + stop_at = set() + elif not is_iterable(stop_at): + stop_at = {stop_at} + else: + stop_at = set(stop_at) + if pass_at is None: + pass_at = set() + elif not is_iterable(pass_at): + pass_at = {stop_at} + else: + pass_at = set(pass_at) + + visited = set() + while start_at: + yield from self._old_undirected_dfs(start_at.pop(0), stop_at, pass_at, visited) + + def is_vertex_cut(self, node_set): + start_at = next( + iter([node for node in self.nodes() if node not in node_set])) + visited = list(self.old_undirected_dfs( + start_at=start_at, stop_at=node_set)) + return len(visited) < (len(self) - len(node_set)) def nodes_between_in(self, node_from, node_to, node_set, start=True): """Check that the only nodes between from and to are in node set""" @@ -512,25 +613,38 @@ def nodes_between(self, node_from, node_to, visited=None, path=None): visited=visited, path=path + [edge.to_node]) return visited - def nodes_below(self, node, visited=None): + def paths_between(self, node_from, node_to, path=None, topo=None): + if topo is None: + topo = {node: idx for idx, node in enumerate(self.topological_sort())} + path = [] + if node_from == node_to: + return path + if topo[node_from] > topo[node_to]: + return None + found_paths = [] + for edge in self.in_edges(node_to): + up_path = self.paths_between(node_from, edge.from_node, path=[edge] + path, topo=topo) + if up_path is None: + continue + found_paths.append(up_path) + if not found_paths: + return None + if len(found_paths) == 1: + return found_paths[0] + return found_paths + + def nodes_below(self, node): """Return nodes below node not including node""" - if visited is None: - node = resolve_node_or_str(node, G=self) - visited = set() - for edge in self.out_edges(node): - visited.add(edge.to_node) - self.nodes_below(edge.to_node, visited=visited) - return visited + nodes_below = set(self.directed_dfs(node)) + nodes_above = 
set(self.undirected_dfs( + node, start_up=True, stop_down_at=nodes_below)) + return tuple(self.undirected_dfs(node, stop_up_at=nodes_above)) def nodes_above(self, node, visited=None): """Return nodes above node not including node""" - if visited is None: - node = resolve_node_or_str(node, G=self) - visited = set() - for edge in self.in_edges(node): - visited.add(edge.from_node) - self.nodes_above(edge.from_node, visited=visited) - return visited + nodes_above = set(self.directed_dfs(node, go_up=True)) + nodes_below = set(self.undirected_dfs(node, stop_up_at=nodes_above)) + return tuple(self.undirected_dfs(node, stop_down_at=nodes_below, start_up=True)) def nodes_below_are_class(self, node, classes, visited=None): """Check all nodes below are in classes""" @@ -546,6 +660,169 @@ def nodes_below_are_class(self, node, classes, visited=None): return False return True + def directed_dfs(self, + node_or_name: Union[str, Node], + stop_at: Optional[Set[Node]] = None, + go_up: bool = False, + yield_start_node=False, + visited=None): + """Yields all nodes above or below this node searched directed. This is almost a dfs + since it yields in order going down the graph rather than bottom up + + Args: + node_or_name (Union[str, Node]): Node or node name to start at + stop_at (Optional[Set[Node]], optional): Stop at this set of nodes. Defaults to None. + go_up (bool, optional): Go in an upward direction or downwards if False. Defaults to False. 
+ + Yields: + Node: Nodes visited + """ + node = resolve_node_or_str(node_or_name, G=self) + if visited is None: + visited = {node} + started = False + if stop_at is None: + stop_at = {} + else: + started = True + if node in stop_at: + return + if started or yield_start_node: + yield node + if not go_up: + for edge in self.out_edges(node.name): + if edge.to_node in visited: + continue + visited.add(edge.to_node) + yield from self.directed_dfs(edge.to_node, stop_at=stop_at, go_up=go_up, visited=visited) + if go_up: + for edge in self.in_edges(node.name): + if edge.from_node in visited: + continue + visited.add(edge.from_node) + yield from self.directed_dfs(edge.from_node, stop_at=stop_at, go_up=go_up, visited=visited) + + def connected_groups(self): + nodes = set(self.nodes()) + groups = [] + while nodes: + start = nodes.pop() + group = set(self.undirected_dfs(start))|set(self.undirected_dfs(start, start_up=True))|{start} + groups.append(group) + nodes -= group + return groups + + def undirected_dfs(self, + node_or_name: Union[str, Node], + stop_at: Optional[Set[Node]] = None, + stop_down_at: Optional[Set[Node]] = None, + stop_up_at: Optional[Set[Node]] = None, + start_up: bool = False, + yield_start_node: bool = False, + yield_stop_node: bool = False, + visited=None): + """Yields all nodes above or below this node searched undirected. This is almost a dfs + since it yields in order going down the graph rather than bottom up. It also has a few modes + where it is edge direction sensitive for stopping + + Args: + node_or_name (Union[str, Node]): Node or node name to start at + stop_at (Optional[Set[Node]], optional): Stop at this set of nodes. Defaults to None. + stop_down_at (Optional[Set[Node]], optional): Stop at this set of nodes going down. Defaults to None. + stop_up_at (Optional[Set[Node]], optional): Stop at this set of nodes going up. Defaults to None. + start_up (bool, optional): Start in an upward direction or downwards if False. Defaults to False. 
+ + Yields: + Node: Nodes visited + """ + node = resolve_node_or_str(node_or_name, G=self) + if visited is None: + visited = {node} + started = False + if stop_at is None: + stop_at = {} + if stop_down_at is None: + stop_down_at = {} + if stop_up_at is None: + stop_up_at = {} + else: + started = True + if node in stop_at: + if yield_stop_node: + yield node + return + if started or yield_start_node: + yield node + if not start_up or started: + for edge in self.out_edges(node.name): + if edge.to_node in visited or edge.to_node in stop_down_at: + continue + visited.add(edge.to_node) + yield from self.undirected_dfs(edge.to_node, stop_at=stop_at, stop_down_at=stop_down_at, + stop_up_at=stop_up_at, visited=visited) + if start_up or started: + for edge in self.in_edges(node.name): + if edge.from_node in visited or edge.from_node in stop_up_at: + continue + visited.add(edge.from_node) + yield from self.undirected_dfs(edge.from_node, stop_at=stop_at, stop_down_at=stop_down_at, + stop_up_at=stop_up_at, visited=visited) + + def _topological_sort(self, node: Node, visited_edges): + yield node + for edge_bundle in self.indexed_out_edges(node): + for edge in edge_bundle: + visited_edges.add(edge) + if set(self.in_edges(edge.to_node)).issubset(visited_edges): + yield from self._topological_sort(edge.to_node, visited_edges) + + def _topological_sort_reversed(self, node: Node, visited_edges): + yield node + for edge in reversed(self.indexed_in_edges(node)): + visited_edges.add(edge) + if set(self.out_edges(edge.from_node)).issubset(visited_edges): + yield from self._topological_sort_reversed(edge.from_node, visited_edges) + + def topological_sort(self, + start_node_or_nodes: Optional[Union[str, + Node, + Sequence[Union[str, Node]]]] = None, + reverse: bool = False): + """[summary] + + Args: + start_node_or_nodes (Optional[Union[str, Node, Sequence[Union[str, Node]]]], optional): + Optional start node or nodes. Can also be node names. Defaults to None. 
+ reverse (bool, optional): Sort from bottom of the graph up. Tries to be a perfect reverse of order. Defaults to False. + + Raises: + ValueError: Bad parameters given + + Yields: + (Node): Yields nodes in desired sort order + """ + if start_node_or_nodes is None: + if reverse: + nodes = list(reversed(self.outputs())) + else: + nodes = self.inputs() + elif isinstance(start_node_or_nodes, str): + nodes = [self._nodes[start_node_or_nodes]] + elif isinstance(start_node_or_nodes, Iterable): + nodes = [node if isinstance(node, Node) else + self[node] for node in start_node_or_nodes] + else: + raise ValueError('invalid argument') + visited_edges = set() + if reverse: + while nodes: + node = nodes.pop(0) + yield from self._topological_sort_reversed(node, visited_edges) + else: + while nodes: + node = nodes.pop(0) + yield from self._topological_sort(node, visited_edges) + def nodes_above_are_class(self, node, classes, visited=None): """Check all nodes above are in classes""" if visited is None: @@ -617,64 +894,6 @@ def num_out_edges(self, node_or_name: Union[str, Node]) -> int: node_name = resolve_name(node_or_name) return len(self.out_edges(node_name)) - def flood_above(self, node_or_name: Union[str, Node], res=None, in_edge=None): - """Return all nodes above this node including it and those connected to it - - Args: - node (Node): Node to flood - - Returns: - [Sequence[Node]]: Nodes found including node - """ - node = resolve_node_or_str(node_or_name, G=self) - if res is None: - first = True - res = {node} - else: - first = False - for edge in self.in_edges(node.name): - if edge.from_node not in res: - res.add(edge.from_node) - self.flood_above(edge.from_node, res=res, in_edge=edge) - if not first: - for edge in self.out_edges(node.name): - if edge == in_edge: - continue - if edge.to_node != node and edge.to_node not in res: - res.add(edge.to_node) - self.flood_below(edge.to_node, res=res) - return res - - def flood_below(self, node_or_name: Union[str, Node], 
stop_at=None, res=None, out_edge=None): - """Return all nodes below this node including it and those connected to it - - Args: - node (Node): Node to flood - stop_at (Node): Optional node to stop flooding at - Returns: - [Sequence[Node]]: Nodes found including node - """ - node = resolve_node_or_str(node_or_name, G=self) - if stop_at: - stop_at = resolve_node_or_str(stop_at, G=self) - if res is None: - first = True - res = {node, stop_at} if stop_at else {node} - else: - first = False - for edge in self.out_edges(node.name): - if edge.to_node not in res: - res.add(edge.to_node) - self.flood_below(edge.to_node, res=res, out_edge=edge) - if not first: - for edge in self.in_edges(node.name): - if edge == out_edge: - continue - if edge.from_node != node and edge.from_node not in res: - res.add(edge.from_node) - self.flood_above(edge.from_node, res=res) - return res - def remove_all(self, nodes: Sequence[Node]): """Remove all nodes @@ -691,26 +910,20 @@ def remove_all(self, nodes: Sequence[Node]): self.remove(del_node) def remove_below(self, node: Node): - """Remove the nodes below this node. Note: If there are links below this node - that go back above it this will do nothing since all nodes are flooded. Use - keep_between in this case. + """Remove the nodes below this node. Args: node (Node): Remove below this node """ - keep_nodes = self.flood_above(node) - self.remove_all(set(self._nodes.values()) - keep_nodes) + self.remove_all(self.nodes_below(node)) def remove_above(self, node: Node): - """Remove the nodes above this node. Note: If there are links above this node - that go down below it this will do nothing since all nodes are flooded. Use - keep_between in this case. + """Remove the nodes above this node.. 
Args: node (Node): Remove below this node """ - keep_nodes = self.flood_below(node) - self.remove_all(set(self._nodes.values()) - keep_nodes) + self.remove_all(self.nodes_above(node)) def keep_between(self, from_node: Node, to_node: Node): """Remove all nodes that are not between from_node and to_node @@ -719,8 +932,18 @@ def keep_between(self, from_node: Node, to_node: Node): from_node (Node): Remove above this node to_node (Node): Remove below this node """ - keep_nodes = self.flood_below(from_node, stop_at=to_node) - self.remove_all(set(self._nodes.values()) - keep_nodes) + keep_nodes = set( + self.directed_dfs( + from_node, + stop_at={to_node}, + yield_start_node=True) + ) | set( + self.directed_dfs( + to_node, + stop_at={from_node}, + go_up=True, + yield_start_node=True)) + self.remove_all(set(self._nodes.values()) - set(keep_nodes)) def remove(self, node_or_name: Union[str, Node]): '''Removes a node and all its connected edges''' @@ -757,6 +980,10 @@ def edge_match(x): if not self._out_edges[edge.from_node.name][edge.to_node.name]: del self._out_edges[edge.from_node.name][edge.to_node.name] + def remove_edges(self, edges): + for edge in edges: + self.remove_edge(edge) + def edge_in_graph(self, edge): if edge.to_node.name in self._in_edges: edges = self._in_edges[edge.to_node.name] @@ -914,7 +1141,8 @@ def remove_fragment(self, frag: 'Graph'): in_nodes.add(edge.from_node) del frag_in_edges, in_nodes - frag_out_nodes = set((edge.to_node, edge.to_idx) for frag_out_node in frag.outputs(ignore_names=nodes_not_in_graph) + frag_out_nodes = set((edge.to_node, edge.to_idx) + for frag_out_node in frag.outputs(ignore_names=nodes_not_in_graph) for edge in self.out_edges(frag_out_node.name)) assert len(frag_out_nodes) == 1, "doesn't work if more than one output" frag_out_node = list(frag_out_nodes)[0] @@ -946,96 +1174,6 @@ def outputs(self, ignore_names=None): if node_name not in self._out_edges or all(output_name in ignore_names for output_name in 
self._out_edges[node_name])] - def fast_dfs(self): - visited_edges = set() - nodes = deque(self.inputs()) - while nodes: - node = nodes.pop() - node_name = node.name - if node_name in self._in_edges and not set(edge for edge_list in self._in_edges[node_name].values() for edge in edge_list).issubset(visited_edges): - continue - yield node - if node_name not in self._out_edges: - return - for edge_list in self._out_edges[node_name].values(): - for out_edge in edge_list: - visited_edges.add(out_edge) - nodes.append(out_edge.to_node) - - def __revdfs(self, node, condition, visited_nodes, visited_edges, from_node, from_edge): - if not node: - return - if isinstance(node, str): - node = self._nodes[node] - if node not in visited_nodes and\ - (from_node is None or - all((out_edge in visited_edges) for out_edge in self.out_edges(node.name))) and\ - (not condition or condition(self, from_node, node, from_edge)): - - yield node - visited_nodes.add(node) - in_edges = self.in_edges(node.name) - # Edges are visited in a repeatable order - in_edges.sort(key=lambda x: str(x.from_idx) + x.from_node.name + str(x.to_idx), - reverse=True) - for edge in in_edges: - visited_edges.add(edge) - - yield from self.__revdfs(edge.from_node, - condition, - visited_nodes, - visited_edges, - node, - edge) - - def __dfs(self, node, condition, visited_nodes, visited_edges, from_node, from_edge): - if not node: - return - if isinstance(node, str): - node = self._nodes[node] - if node not in visited_nodes and \ - (from_node is None or all((in_edge in visited_edges) for in_edge in self.in_edges(node.name))) and \ - (not condition or condition(self, from_node, node, from_edge)): - yield node - visited_nodes.add(node) - out_edges = self.out_edges(node.name) - # Edges are visited in a repeatable order - out_edges.sort(key=lambda x: str(x.from_idx) + - x.to_node.name + str(x.to_idx)) - for edge in out_edges: - visited_edges.add(edge) - - yield from self.__dfs(edge.to_node, - condition, - visited_nodes, - 
visited_edges, - node, - edge) - - def dfs(self, node_or_name=None, condition=None, reverse=False): - if node_or_name is None: - if reverse: - nodes = list(self.outputs()) - # This isn't really necessary but helps with tests - nodes.reverse() - else: - nodes = self.inputs() - elif isinstance(node_or_name, str): - nodes = [self._nodes[node_or_name]] - elif isinstance(node_or_name, Iterable): - nodes = [node if isinstance(node, Node) else - self[node] for node in node_or_name] - else: - raise TypeError() - - visited_nodes = set() - visited_edges = set() - for node in nodes: - if reverse: - yield from self.__revdfs(node, condition, visited_nodes, visited_edges, None, None) - else: - yield from self.__dfs(node, condition, visited_nodes, visited_edges, None, None) - @staticmethod def match_semantics(edges, match_edge): for edge in edges: @@ -1174,49 +1312,20 @@ def match_down_edge(self, matched_graphview, fragment, graph_edge, return False return True - def match_fragment(self, fragment: 'GraphView', node_or_name: Node = None, allow_extra_edges=False): - """Matches a graph fragment against this graph""" - inputs = fragment.inputs() - - start_points = {} - - def match_start_points(G, from_node, to_node, unused1): - del unused1 - nonlocal inputs, start_points - edge = None if from_node is None else G.edge( - from_node.name, to_node.name) - for fragment_input_node in inputs: - if isinstance(fragment_input_node, MatchNode) and fragment_input_node._match(G, to_node, edge): - start_points[to_node] = fragment_input_node - return True + @property + def _edge_class(self): + return Edge - _ = list(self.dfs(condition=match_start_points, - node_or_name=node_or_name)) - # start points will now be a list of pairs with the start_node name in the graph and the - # corresponding node in the fragment. 
The start points have all matched an input none in the - # fragment - - matched_fragments = [] - while True: - graph_node = next(start_points.__iter__(), None) - if graph_node is None: - return matched_fragments - match_node = start_points[graph_node] - del start_points[graph_node] - - matched_graphview = GraphView() - matched_graphview.add_node(graph_node) - visited_nodes = set() - if self.match_down_node(matched_graphview, fragment, - graph_node, match_node, - visited_nodes, start_points, - allow_extra_edges=allow_extra_edges): - matched_fragments.append(matched_graphview) - return matched_fragments + @property + def _noderef_class(self): + return NodeRef def __eq__(self, other): return set(self.nodes()) == set(other.nodes()) and set(self.edges()) == set(other.edges()) + def __hash__(self) -> int: + return (tuple(self.nodes()), tuple(self.edges())).__hash__() + def __len__(self): return len(self._nodes) diff --git a/tools/nntool/utils/json_serializable.py b/tools/nntool/utils/json_serializable.py index b5bae0632..3a0f3e7f9 100644 --- a/tools/nntool/utils/json_serializable.py +++ b/tools/nntool/utils/json_serializable.py @@ -64,6 +64,10 @@ def default(self, o): '__contents': o.tolist(), '__dtype': o.dtype.name } + if hasattr(o, 'dtype'): + if np.issubdtype(o.dtype, np.bool): + return bool(o) + # Let the base class default method raise the try: return json.JSONEncoder.default(self, o) diff --git a/tools/nntool/utils/maximizer.py b/tools/nntool/utils/maximizer.py index 061e9b6ea..49a51e9b8 100644 --- a/tools/nntool/utils/maximizer.py +++ b/tools/nntool/utils/maximizer.py @@ -25,7 +25,7 @@ def __init__(self, func, var_min, var_max, func_change=None, int_step=False): self._args = tuple() self._int_step = int_step - @lru_cache + @lru_cache(maxsize=128, typed=False) def func(self, var): return self._func(var, *self._args) diff --git a/tools/nntool/utils/numpy_helpers.py b/tools/nntool/utils/numpy_helpers.py index a8a841af3..9641702d9 100644 --- 
a/tools/nntool/utils/numpy_helpers.py +++ b/tools/nntool/utils/numpy_helpers.py @@ -36,3 +36,8 @@ def packbits(value, bits): )[:, 0:bits:].flatten(), bitorder='little' ) + +def np_asscalar(elem): + if isinstance(elem, np.ndarray): + return elem.item() + return elem diff --git a/tools/nntool/utils/process_header.py b/tools/nntool/utils/process_header.py index e6021f3f7..adcb5f261 100644 --- a/tools/nntool/utils/process_header.py +++ b/tools/nntool/utils/process_header.py @@ -168,6 +168,7 @@ def gen_infos_array(self, len_key, **vals): keys = sorted([(key, self.inf(key), self.inf_len(key)) for key in vals], key=lambda x: x[1]) bvals = np.full((self.inf(len_key),), 0, dtype=np.uint8) + comment = "" for key, info, info_len in keys: val = np.atleast_1d(vals[key]) val = val.newbyteorder('>') @@ -177,4 +178,5 @@ def gen_infos_array(self, len_key, **vals): raise ValueError( f'value for {key} is too long {val_len}>{info_len}') bvals[info:info+len(val):1] = val - return bvals + comment += f" {key}: {vals[key]}" + return bvals, comment diff --git a/tools/profiler/.gitignore b/tools/profiler/.gitignore deleted file mode 100644 index a6f8ec3da..000000000 --- a/tools/profiler/.gitignore +++ /dev/null @@ -1,7 +0,0 @@ -*/build/* -docs/ -gui/Makefile -*.qmake.stash -*.debug_info -gui/uic_wrapper.sh -function_statistics.txt diff --git a/tools/profiler/Doxyfile b/tools/profiler/Doxyfile deleted file mode 100644 index 90e2f5b37..000000000 --- a/tools/profiler/Doxyfile +++ /dev/null @@ -1,2427 +0,0 @@ -# Doxyfile 1.8.11 - -# This file describes the settings to be used by the documentation system -# doxygen (www.doxygen.org) for a project. -# -# All text after a double hash (##) is considered a comment and is placed in -# front of the TAG it is preceding. -# -# All text after a single hash (#) is considered a comment and will be ignored. -# The format is: -# TAG = value [value, ...] -# For lists, items can also be appended using: -# TAG += value [value, ...] 
-# Values that contain spaces should be placed between quotes (\" \"). - -#--------------------------------------------------------------------------- -# Project related configuration options -#--------------------------------------------------------------------------- - -# This tag specifies the encoding used for all characters in the config file -# that follow. The default is UTF-8 which is also the encoding used for all text -# before the first occurrence of this tag. Doxygen uses libiconv (or the iconv -# built into libc) for the transcoding. See http://www.gnu.org/software/libiconv -# for the list of possible encodings. -# The default value is: UTF-8. - -DOXYFILE_ENCODING = UTF-8 - -# The PROJECT_NAME tag is a single word (or a sequence of words surrounded by -# double-quotes, unless you are using Doxywizard) that should identify the -# project for which the documentation is generated. This name is used in the -# title of most generated pages and in a few other places. -# The default value is: My Project. - -PROJECT_NAME = "GAP Profiler" - -# The PROJECT_NUMBER tag can be used to enter a project or revision number. This -# could be handy for archiving the generated documentation or if some version -# control system is used. - -PROJECT_NUMBER = - -# Using the PROJECT_BRIEF tag one can provide an optional one line description -# for a project that appears at the top of each page and should give viewer a -# quick idea about the purpose of the project. Keep the description short. - -PROJECT_BRIEF = "A tool to help to optimize GAP programs" - -# With the PROJECT_LOGO tag one can specify a logo or an icon that is included -# in the documentation. The maximum height of the logo should not exceed 55 -# pixels and the maximum width should not exceed 200 pixels. Doxygen will copy -# the logo to the output directory. 
- -PROJECT_LOGO = - -# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) path -# into which the generated documentation will be written. If a relative path is -# entered, it will be relative to the location where doxygen was started. If -# left blank the current directory will be used. - -OUTPUT_DIRECTORY = docs/ - -# If the CREATE_SUBDIRS tag is set to YES then doxygen will create 4096 sub- -# directories (in 2 levels) under the output directory of each output format and -# will distribute the generated files over these directories. Enabling this -# option can be useful when feeding doxygen a huge amount of source files, where -# putting all generated files in the same directory would otherwise causes -# performance problems for the file system. -# The default value is: NO. - -CREATE_SUBDIRS = NO - -# If the ALLOW_UNICODE_NAMES tag is set to YES, doxygen will allow non-ASCII -# characters to appear in the names of generated files. If set to NO, non-ASCII -# characters will be escaped, for example _xE3_x81_x84 will be used for Unicode -# U+3044. -# The default value is: NO. - -ALLOW_UNICODE_NAMES = NO - -# The OUTPUT_LANGUAGE tag is used to specify the language in which all -# documentation generated by doxygen is written. Doxygen will use this -# information to generate all constant output in the proper language. -# Possible values are: Afrikaans, Arabic, Armenian, Brazilian, Catalan, Chinese, -# Chinese-Traditional, Croatian, Czech, Danish, Dutch, English (United States), -# Esperanto, Farsi (Persian), Finnish, French, German, Greek, Hungarian, -# Indonesian, Italian, Japanese, Japanese-en (Japanese with English messages), -# Korean, Korean-en (Korean with English messages), Latvian, Lithuanian, -# Macedonian, Norwegian, Persian (Farsi), Polish, Portuguese, Romanian, Russian, -# Serbian, Serbian-Cyrillic, Slovak, Slovene, Spanish, Swedish, Turkish, -# Ukrainian and Vietnamese. -# The default value is: English. 
- -OUTPUT_LANGUAGE = English - -# If the BRIEF_MEMBER_DESC tag is set to YES, doxygen will include brief member -# descriptions after the members that are listed in the file and class -# documentation (similar to Javadoc). Set to NO to disable this. -# The default value is: YES. - -BRIEF_MEMBER_DESC = YES - -# If the REPEAT_BRIEF tag is set to YES, doxygen will prepend the brief -# description of a member or function before the detailed description -# -# Note: If both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the -# brief descriptions will be completely suppressed. -# The default value is: YES. - -REPEAT_BRIEF = YES - -# This tag implements a quasi-intelligent brief description abbreviator that is -# used to form the text in various listings. Each string in this list, if found -# as the leading text of the brief description, will be stripped from the text -# and the result, after processing the whole list, is used as the annotated -# text. Otherwise, the brief description is used as-is. If left blank, the -# following values are used ($name is automatically replaced with the name of -# the entity):The $name class, The $name widget, The $name file, is, provides, -# specifies, contains, represents, a, an and the. - -ABBREVIATE_BRIEF = - -# If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then -# doxygen will generate a detailed section even if there is only a brief -# description. -# The default value is: NO. - -ALWAYS_DETAILED_SEC = NO - -# If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all -# inherited members of a class in the documentation of that class as if those -# members were ordinary class members. Constructors, destructors and assignment -# operators of the base classes will not be shown. -# The default value is: NO. - -INLINE_INHERITED_MEMB = NO - -# If the FULL_PATH_NAMES tag is set to YES, doxygen will prepend the full path -# before files name in the file list and in the header files. 
If set to NO the -# shortest path that makes the file name unique will be used -# The default value is: YES. - -FULL_PATH_NAMES = YES - -# The STRIP_FROM_PATH tag can be used to strip a user-defined part of the path. -# Stripping is only done if one of the specified strings matches the left-hand -# part of the path. The tag can be used to show relative paths in the file list. -# If left blank the directory from which doxygen is run is used as the path to -# strip. -# -# Note that you can specify absolute paths here, but also relative paths, which -# will be relative from the directory where doxygen is started. -# This tag requires that the tag FULL_PATH_NAMES is set to YES. - -STRIP_FROM_PATH = - -# The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of the -# path mentioned in the documentation of a class, which tells the reader which -# header file to include in order to use a class. If left blank only the name of -# the header file containing the class definition is used. Otherwise one should -# specify the list of include paths that are normally passed to the compiler -# using the -I flag. - -STRIP_FROM_INC_PATH = - -# If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter (but -# less readable) file names. This can be useful is your file systems doesn't -# support long names like on DOS, Mac, or CD-ROM. -# The default value is: NO. - -SHORT_NAMES = NO - -# If the JAVADOC_AUTOBRIEF tag is set to YES then doxygen will interpret the -# first line (until the first dot) of a Javadoc-style comment as the brief -# description. If set to NO, the Javadoc-style will behave just like regular Qt- -# style comments (thus requiring an explicit @brief command for a brief -# description.) -# The default value is: NO. - -JAVADOC_AUTOBRIEF = NO - -# If the QT_AUTOBRIEF tag is set to YES then doxygen will interpret the first -# line (until the first dot) of a Qt-style comment as the brief description. 
If -# set to NO, the Qt-style will behave just like regular Qt-style comments (thus -# requiring an explicit \brief command for a brief description.) -# The default value is: NO. - -QT_AUTOBRIEF = NO - -# The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make doxygen treat a -# multi-line C++ special comment block (i.e. a block of //! or /// comments) as -# a brief description. This used to be the default behavior. The new default is -# to treat a multi-line C++ comment block as a detailed description. Set this -# tag to YES if you prefer the old behavior instead. -# -# Note that setting this tag to YES also means that rational rose comments are -# not recognized any more. -# The default value is: NO. - -MULTILINE_CPP_IS_BRIEF = NO - -# If the INHERIT_DOCS tag is set to YES then an undocumented member inherits the -# documentation from any documented member that it re-implements. -# The default value is: YES. - -INHERIT_DOCS = YES - -# If the SEPARATE_MEMBER_PAGES tag is set to YES then doxygen will produce a new -# page for each member. If set to NO, the documentation of a member will be part -# of the file/class/namespace that contains it. -# The default value is: NO. - -SEPARATE_MEMBER_PAGES = NO - -# The TAB_SIZE tag can be used to set the number of spaces in a tab. Doxygen -# uses this value to replace tabs by spaces in code fragments. -# Minimum value: 1, maximum value: 16, default value: 4. - -TAB_SIZE = 4 - -# This tag can be used to specify a number of aliases that act as commands in -# the documentation. An alias has the form: -# name=value -# For example adding -# "sideeffect=@par Side Effects:\n" -# will allow you to put the command \sideeffect (or @sideeffect) in the -# documentation, which will result in a user-defined paragraph with heading -# "Side Effects:". You can put \n's in the value part of an alias to insert -# newlines. - -ALIASES = - -# This tag can be used to specify a number of word-keyword mappings (TCL only). 
-# A mapping has the form "name=value". For example adding "class=itcl::class" -# will allow you to use the command class in the itcl::class meaning. - -TCL_SUBST = - -# Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C sources -# only. Doxygen will then generate output that is more tailored for C. For -# instance, some of the names that are used will be different. The list of all -# members will be omitted, etc. -# The default value is: NO. - -OPTIMIZE_OUTPUT_FOR_C = NO - -# Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java or -# Python sources only. Doxygen will then generate output that is more tailored -# for that language. For instance, namespaces will be presented as packages, -# qualified scopes will look different, etc. -# The default value is: NO. - -OPTIMIZE_OUTPUT_JAVA = NO - -# Set the OPTIMIZE_FOR_FORTRAN tag to YES if your project consists of Fortran -# sources. Doxygen will then generate output that is tailored for Fortran. -# The default value is: NO. - -OPTIMIZE_FOR_FORTRAN = NO - -# Set the OPTIMIZE_OUTPUT_VHDL tag to YES if your project consists of VHDL -# sources. Doxygen will then generate output that is tailored for VHDL. -# The default value is: NO. - -OPTIMIZE_OUTPUT_VHDL = NO - -# Doxygen selects the parser to use depending on the extension of the files it -# parses. With this tag you can assign which parser to use for a given -# extension. Doxygen has a built-in mapping, but you can override or extend it -# using this tag. The format is ext=language, where ext is a file extension, and -# language is one of the parsers supported by doxygen: IDL, Java, Javascript, -# C#, C, C++, D, PHP, Objective-C, Python, Fortran (fixed format Fortran: -# FortranFixed, free formatted Fortran: FortranFree, unknown formatted Fortran: -# Fortran. In the later case the parser tries to guess whether the code is fixed -# or free formatted code, this is the default for Fortran type files), VHDL. 
For -# instance to make doxygen treat .inc files as Fortran files (default is PHP), -# and .f files as C (default is Fortran), use: inc=Fortran f=C. -# -# Note: For files without extension you can use no_extension as a placeholder. -# -# Note that for custom extensions you also need to set FILE_PATTERNS otherwise -# the files are not read by doxygen. - -EXTENSION_MAPPING = - -# If the MARKDOWN_SUPPORT tag is enabled then doxygen pre-processes all comments -# according to the Markdown format, which allows for more readable -# documentation. See http://daringfireball.net/projects/markdown/ for details. -# The output of markdown processing is further processed by doxygen, so you can -# mix doxygen, HTML, and XML commands with Markdown formatting. Disable only in -# case of backward compatibilities issues. -# The default value is: YES. - -MARKDOWN_SUPPORT = YES - -# When enabled doxygen tries to link words that correspond to documented -# classes, or namespaces to their corresponding documentation. Such a link can -# be prevented in individual cases by putting a % sign in front of the word or -# globally by setting AUTOLINK_SUPPORT to NO. -# The default value is: YES. - -AUTOLINK_SUPPORT = YES - -# If you use STL classes (i.e. std::string, std::vector, etc.) but do not want -# to include (a tag file for) the STL sources as input, then you should set this -# tag to YES in order to let doxygen match functions declarations and -# definitions whose arguments contain STL classes (e.g. func(std::string); -# versus func(std::string) {}). This also make the inheritance and collaboration -# diagrams that involve STL classes more complete and accurate. -# The default value is: NO. - -BUILTIN_STL_SUPPORT = NO - -# If you use Microsoft's C++/CLI language, you should set this option to YES to -# enable parsing support. -# The default value is: NO. 
- -CPP_CLI_SUPPORT = NO - -# Set the SIP_SUPPORT tag to YES if your project consists of sip (see: -# http://www.riverbankcomputing.co.uk/software/sip/intro) sources only. Doxygen -# will parse them like normal C++ but will assume all classes use public instead -# of private inheritance when no explicit protection keyword is present. -# The default value is: NO. - -SIP_SUPPORT = NO - -# For Microsoft's IDL there are propget and propput attributes to indicate -# getter and setter methods for a property. Setting this option to YES will make -# doxygen to replace the get and set methods by a property in the documentation. -# This will only work if the methods are indeed getting or setting a simple -# type. If this is not the case, or you want to show the methods anyway, you -# should set this option to NO. -# The default value is: YES. - -IDL_PROPERTY_SUPPORT = YES - -# If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC -# tag is set to YES then doxygen will reuse the documentation of the first -# member in the group (if any) for the other members of the group. By default -# all members of a group must be documented explicitly. -# The default value is: NO. - -DISTRIBUTE_GROUP_DOC = NO - -# If one adds a struct or class to a group and this option is enabled, then also -# any nested class or struct is added to the same group. By default this option -# is disabled and one has to add nested compounds explicitly via \ingroup. -# The default value is: NO. - -GROUP_NESTED_COMPOUNDS = NO - -# Set the SUBGROUPING tag to YES to allow class member groups of the same type -# (for instance a group of public functions) to be put as a subgroup of that -# type (e.g. under the Public Functions section). Set it to NO to prevent -# subgrouping. Alternatively, this can be done per class using the -# \nosubgrouping command. -# The default value is: YES. 
- -SUBGROUPING = YES - -# When the INLINE_GROUPED_CLASSES tag is set to YES, classes, structs and unions -# are shown inside the group in which they are included (e.g. using \ingroup) -# instead of on a separate page (for HTML and Man pages) or section (for LaTeX -# and RTF). -# -# Note that this feature does not work in combination with -# SEPARATE_MEMBER_PAGES. -# The default value is: NO. - -INLINE_GROUPED_CLASSES = NO - -# When the INLINE_SIMPLE_STRUCTS tag is set to YES, structs, classes, and unions -# with only public data fields or simple typedef fields will be shown inline in -# the documentation of the scope in which they are defined (i.e. file, -# namespace, or group documentation), provided this scope is documented. If set -# to NO, structs, classes, and unions are shown on a separate page (for HTML and -# Man pages) or section (for LaTeX and RTF). -# The default value is: NO. - -INLINE_SIMPLE_STRUCTS = NO - -# When TYPEDEF_HIDES_STRUCT tag is enabled, a typedef of a struct, union, or -# enum is documented as struct, union, or enum with the name of the typedef. So -# typedef struct TypeS {} TypeT, will appear in the documentation as a struct -# with name TypeT. When disabled the typedef will appear as a member of a file, -# namespace, or class. And the struct will be named TypeS. This can typically be -# useful for C code in case the coding convention dictates that all compound -# types are typedef'ed and only the typedef is referenced, never the tag name. -# The default value is: NO. - -TYPEDEF_HIDES_STRUCT = NO - -# The size of the symbol lookup cache can be set using LOOKUP_CACHE_SIZE. This -# cache is used to resolve symbols given their name and scope. Since this can be -# an expensive process and often the same symbol appears multiple times in the -# code, doxygen keeps a cache of pre-resolved symbols. If the cache is too small -# doxygen will become slower. If the cache is too large, memory is wasted. 
The -# cache size is given by this formula: 2^(16+LOOKUP_CACHE_SIZE). The valid range -# is 0..9, the default is 0, corresponding to a cache size of 2^16=65536 -# symbols. At the end of a run doxygen will report the cache usage and suggest -# the optimal cache size from a speed point of view. -# Minimum value: 0, maximum value: 9, default value: 0. - -LOOKUP_CACHE_SIZE = 0 - -#--------------------------------------------------------------------------- -# Build related configuration options -#--------------------------------------------------------------------------- - -# If the EXTRACT_ALL tag is set to YES, doxygen will assume all entities in -# documentation are documented, even if no documentation was available. Private -# class members and static file members will be hidden unless the -# EXTRACT_PRIVATE respectively EXTRACT_STATIC tags are set to YES. -# Note: This will also disable the warnings about undocumented members that are -# normally produced when WARNINGS is set to YES. -# The default value is: NO. - -EXTRACT_ALL = NO - -# If the EXTRACT_PRIVATE tag is set to YES, all private members of a class will -# be included in the documentation. -# The default value is: NO. - -EXTRACT_PRIVATE = NO - -# If the EXTRACT_PACKAGE tag is set to YES, all members with package or internal -# scope will be included in the documentation. -# The default value is: NO. - -EXTRACT_PACKAGE = NO - -# If the EXTRACT_STATIC tag is set to YES, all static members of a file will be -# included in the documentation. -# The default value is: NO. - -EXTRACT_STATIC = NO - -# If the EXTRACT_LOCAL_CLASSES tag is set to YES, classes (and structs) defined -# locally in source files will be included in the documentation. If set to NO, -# only classes defined in header files are included. Does not have any effect -# for Java sources. -# The default value is: YES. - -EXTRACT_LOCAL_CLASSES = YES - -# This flag is only useful for Objective-C code. 
If set to YES, local methods, -# which are defined in the implementation section but not in the interface are -# included in the documentation. If set to NO, only methods in the interface are -# included. -# The default value is: NO. - -EXTRACT_LOCAL_METHODS = NO - -# If this flag is set to YES, the members of anonymous namespaces will be -# extracted and appear in the documentation as a namespace called -# 'anonymous_namespace{file}', where file will be replaced with the base name of -# the file that contains the anonymous namespace. By default anonymous namespace -# are hidden. -# The default value is: NO. - -EXTRACT_ANON_NSPACES = NO - -# If the HIDE_UNDOC_MEMBERS tag is set to YES, doxygen will hide all -# undocumented members inside documented classes or files. If set to NO these -# members will be included in the various overviews, but no documentation -# section is generated. This option has no effect if EXTRACT_ALL is enabled. -# The default value is: NO. - -HIDE_UNDOC_MEMBERS = NO - -# If the HIDE_UNDOC_CLASSES tag is set to YES, doxygen will hide all -# undocumented classes that are normally visible in the class hierarchy. If set -# to NO, these classes will be included in the various overviews. This option -# has no effect if EXTRACT_ALL is enabled. -# The default value is: NO. - -HIDE_UNDOC_CLASSES = NO - -# If the HIDE_FRIEND_COMPOUNDS tag is set to YES, doxygen will hide all friend -# (class|struct|union) declarations. If set to NO, these declarations will be -# included in the documentation. -# The default value is: NO. - -HIDE_FRIEND_COMPOUNDS = NO - -# If the HIDE_IN_BODY_DOCS tag is set to YES, doxygen will hide any -# documentation blocks found inside the body of a function. If set to NO, these -# blocks will be appended to the function's detailed documentation block. -# The default value is: NO. - -HIDE_IN_BODY_DOCS = NO - -# The INTERNAL_DOCS tag determines if documentation that is typed after a -# \internal command is included. 
If the tag is set to NO then the documentation -# will be excluded. Set it to YES to include the internal documentation. -# The default value is: NO. - -INTERNAL_DOCS = NO - -# If the CASE_SENSE_NAMES tag is set to NO then doxygen will only generate file -# names in lower-case letters. If set to YES, upper-case letters are also -# allowed. This is useful if you have classes or files whose names only differ -# in case and if your file system supports case sensitive file names. Windows -# and Mac users are advised to set this option to NO. -# The default value is: system dependent. - -CASE_SENSE_NAMES = YES - -# If the HIDE_SCOPE_NAMES tag is set to NO then doxygen will show members with -# their full class and namespace scopes in the documentation. If set to YES, the -# scope will be hidden. -# The default value is: NO. - -HIDE_SCOPE_NAMES = NO - -# If the HIDE_COMPOUND_REFERENCE tag is set to NO (default) then doxygen will -# append additional text to a page's title, such as Class Reference. If set to -# YES the compound reference will be hidden. -# The default value is: NO. - -HIDE_COMPOUND_REFERENCE= NO - -# If the SHOW_INCLUDE_FILES tag is set to YES then doxygen will put a list of -# the files that are included by a file in the documentation of that file. -# The default value is: YES. - -SHOW_INCLUDE_FILES = YES - -# If the SHOW_GROUPED_MEMB_INC tag is set to YES then Doxygen will add for each -# grouped member an include statement to the documentation, telling the reader -# which file to include in order to use the member. -# The default value is: NO. - -SHOW_GROUPED_MEMB_INC = NO - -# If the FORCE_LOCAL_INCLUDES tag is set to YES then doxygen will list include -# files with double quotes in the documentation rather than with sharp brackets. -# The default value is: NO. - -FORCE_LOCAL_INCLUDES = NO - -# If the INLINE_INFO tag is set to YES then a tag [inline] is inserted in the -# documentation for inline members. -# The default value is: YES. 
- -INLINE_INFO = YES - -# If the SORT_MEMBER_DOCS tag is set to YES then doxygen will sort the -# (detailed) documentation of file and class members alphabetically by member -# name. If set to NO, the members will appear in declaration order. -# The default value is: YES. - -SORT_MEMBER_DOCS = YES - -# If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the brief -# descriptions of file, namespace and class members alphabetically by member -# name. If set to NO, the members will appear in declaration order. Note that -# this will also influence the order of the classes in the class list. -# The default value is: NO. - -SORT_BRIEF_DOCS = NO - -# If the SORT_MEMBERS_CTORS_1ST tag is set to YES then doxygen will sort the -# (brief and detailed) documentation of class members so that constructors and -# destructors are listed first. If set to NO the constructors will appear in the -# respective orders defined by SORT_BRIEF_DOCS and SORT_MEMBER_DOCS. -# Note: If SORT_BRIEF_DOCS is set to NO this option is ignored for sorting brief -# member documentation. -# Note: If SORT_MEMBER_DOCS is set to NO this option is ignored for sorting -# detailed member documentation. -# The default value is: NO. - -SORT_MEMBERS_CTORS_1ST = NO - -# If the SORT_GROUP_NAMES tag is set to YES then doxygen will sort the hierarchy -# of group names into alphabetical order. If set to NO the group names will -# appear in their defined order. -# The default value is: NO. - -SORT_GROUP_NAMES = NO - -# If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be sorted by -# fully-qualified names, including namespaces. If set to NO, the class list will -# be sorted only by class name, not including the namespace part. -# Note: This option is not very useful if HIDE_SCOPE_NAMES is set to YES. -# Note: This option applies only to the class list, not to the alphabetical -# list. -# The default value is: NO. 
- -SORT_BY_SCOPE_NAME = NO - -# If the STRICT_PROTO_MATCHING option is enabled and doxygen fails to do proper -# type resolution of all parameters of a function it will reject a match between -# the prototype and the implementation of a member function even if there is -# only one candidate or it is obvious which candidate to choose by doing a -# simple string match. By disabling STRICT_PROTO_MATCHING doxygen will still -# accept a match between prototype and implementation in such cases. -# The default value is: NO. - -STRICT_PROTO_MATCHING = NO - -# The GENERATE_TODOLIST tag can be used to enable (YES) or disable (NO) the todo -# list. This list is created by putting \todo commands in the documentation. -# The default value is: YES. - -GENERATE_TODOLIST = YES - -# The GENERATE_TESTLIST tag can be used to enable (YES) or disable (NO) the test -# list. This list is created by putting \test commands in the documentation. -# The default value is: YES. - -GENERATE_TESTLIST = YES - -# The GENERATE_BUGLIST tag can be used to enable (YES) or disable (NO) the bug -# list. This list is created by putting \bug commands in the documentation. -# The default value is: YES. - -GENERATE_BUGLIST = YES - -# The GENERATE_DEPRECATEDLIST tag can be used to enable (YES) or disable (NO) -# the deprecated list. This list is created by putting \deprecated commands in -# the documentation. -# The default value is: YES. - -GENERATE_DEPRECATEDLIST= YES - -# The ENABLED_SECTIONS tag can be used to enable conditional documentation -# sections, marked by \if ... \endif and \cond -# ... \endcond blocks. - -ENABLED_SECTIONS = - -# The MAX_INITIALIZER_LINES tag determines the maximum number of lines that the -# initial value of a variable or macro / define can have for it to appear in the -# documentation. If the initializer consists of more lines than specified here -# it will be hidden. Use a value of 0 to hide initializers completely. 
The -# appearance of the value of individual variables and macros / defines can be -# controlled using \showinitializer or \hideinitializer command in the -# documentation regardless of this setting. -# Minimum value: 0, maximum value: 10000, default value: 30. - -MAX_INITIALIZER_LINES = 30 - -# Set the SHOW_USED_FILES tag to NO to disable the list of files generated at -# the bottom of the documentation of classes and structs. If set to YES, the -# list will mention the files that were used to generate the documentation. -# The default value is: YES. - -SHOW_USED_FILES = YES - -# Set the SHOW_FILES tag to NO to disable the generation of the Files page. This -# will remove the Files entry from the Quick Index and from the Folder Tree View -# (if specified). -# The default value is: YES. - -SHOW_FILES = YES - -# Set the SHOW_NAMESPACES tag to NO to disable the generation of the Namespaces -# page. This will remove the Namespaces entry from the Quick Index and from the -# Folder Tree View (if specified). -# The default value is: YES. - -SHOW_NAMESPACES = YES - -# The FILE_VERSION_FILTER tag can be used to specify a program or script that -# doxygen should invoke to get the current version for each file (typically from -# the version control system). Doxygen will invoke the program by executing (via -# popen()) the command command input-file, where command is the value of the -# FILE_VERSION_FILTER tag, and input-file is the name of an input file provided -# by doxygen. Whatever the program writes to standard output is used as the file -# version. For an example see the documentation. - -FILE_VERSION_FILTER = - -# The LAYOUT_FILE tag can be used to specify a layout file which will be parsed -# by doxygen. The layout file controls the global structure of the generated -# output files in an output format independent way. To create the layout file -# that represents doxygen's defaults, run doxygen with the -l option. 
You can -# optionally specify a file name after the option, if omitted DoxygenLayout.xml -# will be used as the name of the layout file. -# -# Note that if you run doxygen from a directory containing a file called -# DoxygenLayout.xml, doxygen will parse it automatically even if the LAYOUT_FILE -# tag is left empty. - -LAYOUT_FILE = - -# The CITE_BIB_FILES tag can be used to specify one or more bib files containing -# the reference definitions. This must be a list of .bib files. The .bib -# extension is automatically appended if omitted. This requires the bibtex tool -# to be installed. See also http://en.wikipedia.org/wiki/BibTeX for more info. -# For LaTeX the style of the bibliography can be controlled using -# LATEX_BIB_STYLE. To use this feature you need bibtex and perl available in the -# search path. See also \cite for info how to create references. - -CITE_BIB_FILES = - -#--------------------------------------------------------------------------- -# Configuration options related to warning and progress messages -#--------------------------------------------------------------------------- - -# The QUIET tag can be used to turn on/off the messages that are generated to -# standard output by doxygen. If QUIET is set to YES this implies that the -# messages are off. -# The default value is: NO. - -QUIET = NO - -# The WARNINGS tag can be used to turn on/off the warning messages that are -# generated to standard error (stderr) by doxygen. If WARNINGS is set to YES -# this implies that the warnings are on. -# -# Tip: Turn warnings on while writing the documentation. -# The default value is: YES. - -WARNINGS = YES - -# If the WARN_IF_UNDOCUMENTED tag is set to YES then doxygen will generate -# warnings for undocumented members. If EXTRACT_ALL is set to YES then this flag -# will automatically be disabled. -# The default value is: YES. 
- -WARN_IF_UNDOCUMENTED = NO - -# If the WARN_IF_DOC_ERROR tag is set to YES, doxygen will generate warnings for -# potential errors in the documentation, such as not documenting some parameters -# in a documented function, or documenting parameters that don't exist or using -# markup commands wrongly. -# The default value is: YES. - -WARN_IF_DOC_ERROR = YES - -# This WARN_NO_PARAMDOC option can be enabled to get warnings for functions that -# are documented, but have no documentation for their parameters or return -# value. If set to NO, doxygen will only warn about wrong or incomplete -# parameter documentation, but not about the absence of documentation. -# The default value is: NO. - -WARN_NO_PARAMDOC = NO - -# If the WARN_AS_ERROR tag is set to YES then doxygen will immediately stop when -# a warning is encountered. -# The default value is: NO. - -WARN_AS_ERROR = NO - -# The WARN_FORMAT tag determines the format of the warning messages that doxygen -# can produce. The string should contain the $file, $line, and $text tags, which -# will be replaced by the file and line number from which the warning originated -# and the warning text. Optionally the format may contain $version, which will -# be replaced by the version of the file (if it could be obtained via -# FILE_VERSION_FILTER) -# The default value is: $file:$line: $text. - -WARN_FORMAT = "$file:$line: $text" - -# The WARN_LOGFILE tag can be used to specify a file to which warning and error -# messages should be written. If left blank the output is written to standard -# error (stderr). - -WARN_LOGFILE = - -#--------------------------------------------------------------------------- -# Configuration options related to the input files -#--------------------------------------------------------------------------- - -# The INPUT tag is used to specify the files and/or directories that contain -# documented source files. You may enter file names like myfile.cpp or -# directories like /usr/src/myproject. 
Separate the files or directories with -# spaces. See also FILE_PATTERNS and EXTENSION_MAPPING -# Note: If this tag is empty the current directory is searched. - -INPUT = - -# This tag can be used to specify the character encoding of the source files -# that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses -# libiconv (or the iconv built into libc) for the transcoding. See the libiconv -# documentation (see: http://www.gnu.org/software/libiconv) for the list of -# possible encodings. -# The default value is: UTF-8. - -INPUT_ENCODING = UTF-8 - -# If the value of the INPUT tag contains directories, you can use the -# FILE_PATTERNS tag to specify one or more wildcard patterns (like *.cpp and -# *.h) to filter out the source-files in the directories. -# -# Note that for custom extensions or not directly supported extensions you also -# need to set EXTENSION_MAPPING for the extension otherwise the files are not -# read by doxygen. -# -# If left blank the following patterns are tested:*.c, *.cc, *.cxx, *.cpp, -# *.c++, *.java, *.ii, *.ixx, *.ipp, *.i++, *.inl, *.idl, *.ddl, *.odl, *.h, -# *.hh, *.hxx, *.hpp, *.h++, *.cs, *.d, *.php, *.php4, *.php5, *.phtml, *.inc, -# *.m, *.markdown, *.md, *.mm, *.dox, *.py, *.pyw, *.f90, *.f, *.for, *.tcl, -# *.vhd, *.vhdl, *.ucf, *.qsf, *.as and *.js. - -FILE_PATTERNS = - -# The RECURSIVE tag can be used to specify whether or not subdirectories should -# be searched for input files as well. -# The default value is: NO. - -RECURSIVE = YES - -# The EXCLUDE tag can be used to specify files and/or directories that should be -# excluded from the INPUT source files. This way you can easily exclude a -# subdirectory from a directory tree whose root is specified with the INPUT tag. -# -# Note that relative paths are relative to the directory from which doxygen is -# run. 
- -EXCLUDE = gui/build backend/build docs - -# The EXCLUDE_SYMLINKS tag can be used to select whether or not files or -# directories that are symbolic links (a Unix file system feature) are excluded -# from the input. -# The default value is: NO. - -EXCLUDE_SYMLINKS = NO - -# If the value of the INPUT tag contains directories, you can use the -# EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude -# certain files from those directories. -# -# Note that the wildcards are matched against the file with absolute path, so to -# exclude all test directories for example use the pattern */test/* - -EXCLUDE_PATTERNS = - -# The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names -# (namespaces, classes, functions, etc.) that should be excluded from the -# output. The symbol name can be a fully qualified name, a word, or if the -# wildcard * is used, a substring. Examples: ANamespace, AClass, -# AClass::ANamespace, ANamespace::*Test -# -# Note that the wildcards are matched against the file with absolute path, so to -# exclude all test directories use the pattern */test/* - -EXCLUDE_SYMBOLS = - -# The EXAMPLE_PATH tag can be used to specify one or more files or directories -# that contain example code fragments that are included (see the \include -# command). - -EXAMPLE_PATH = - -# If the value of the EXAMPLE_PATH tag contains directories, you can use the -# EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp and -# *.h) to filter out the source-files in the directories. If left blank all -# files are included. - -EXAMPLE_PATTERNS = - -# If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be -# searched for input files to be used with the \include or \dontinclude commands -# irrespective of the value of the RECURSIVE tag. -# The default value is: NO. 
- -EXAMPLE_RECURSIVE = NO - -# The IMAGE_PATH tag can be used to specify one or more files or directories -# that contain images that are to be included in the documentation (see the -# \image command). - -IMAGE_PATH = - -# The INPUT_FILTER tag can be used to specify a program that doxygen should -# invoke to filter for each input file. Doxygen will invoke the filter program -# by executing (via popen()) the command: -# -# <filter> <input-file> -# -# where <filter> is the value of the INPUT_FILTER tag, and <input-file> is the -# name of an input file. Doxygen will then use the output that the filter -# program writes to standard output. If FILTER_PATTERNS is specified, this tag -# will be ignored. -# -# Note that the filter must not add or remove lines; it is applied before the -# code is scanned, but not when the output code is generated. If lines are added -# or removed, the anchors will not be placed correctly. -# -# Note that for custom extensions or not directly supported extensions you also -# need to set EXTENSION_MAPPING for the extension otherwise the files are not -# properly processed by doxygen. - -INPUT_FILTER = - -# The FILTER_PATTERNS tag can be used to specify filters on a per file pattern -# basis. Doxygen will compare the file name with each pattern and apply the -# filter if there is a match. The filters are a list of the form: pattern=filter -# (like *.cpp=my_cpp_filter). See INPUT_FILTER for further information on how -# filters are used. If the FILTER_PATTERNS tag is empty or if none of the -# patterns match the file name, INPUT_FILTER is applied. -# -# Note that for custom extensions or not directly supported extensions you also -# need to set EXTENSION_MAPPING for the extension otherwise the files are not -# properly processed by doxygen. - -FILTER_PATTERNS = - -# If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using -# INPUT_FILTER) will also be used to filter the input files that are used for -# producing the source files to browse (i.e.
when SOURCE_BROWSER is set to YES). -# The default value is: NO. - -FILTER_SOURCE_FILES = NO - -# The FILTER_SOURCE_PATTERNS tag can be used to specify source filters per file -# pattern. A pattern will override the setting for FILTER_PATTERN (if any) and -# it is also possible to disable source filtering for a specific pattern using -# *.ext= (so without naming a filter). -# This tag requires that the tag FILTER_SOURCE_FILES is set to YES. - -FILTER_SOURCE_PATTERNS = - -# If the USE_MDFILE_AS_MAINPAGE tag refers to the name of a markdown file that -# is part of the input, its contents will be placed on the main page -# (index.html). This can be useful if you have a project on for instance GitHub -# and want to reuse the introduction page also for the doxygen output. - -USE_MDFILE_AS_MAINPAGE = README.md - -#--------------------------------------------------------------------------- -# Configuration options related to source browsing -#--------------------------------------------------------------------------- - -# If the SOURCE_BROWSER tag is set to YES then a list of source files will be -# generated. Documented entities will be cross-referenced with these sources. -# -# Note: To get rid of all source code in the generated output, make sure that -# also VERBATIM_HEADERS is set to NO. -# The default value is: NO. - -SOURCE_BROWSER = NO - -# Setting the INLINE_SOURCES tag to YES will include the body of functions, -# classes and enums directly into the documentation. -# The default value is: NO. - -INLINE_SOURCES = NO - -# Setting the STRIP_CODE_COMMENTS tag to YES will instruct doxygen to hide any -# special comment blocks from generated source code fragments. Normal C, C++ and -# Fortran comments will always remain visible. -# The default value is: YES. - -STRIP_CODE_COMMENTS = YES - -# If the REFERENCED_BY_RELATION tag is set to YES then for each documented -# function all documented functions referencing it will be listed. -# The default value is: NO. 
- -REFERENCED_BY_RELATION = NO - -# If the REFERENCES_RELATION tag is set to YES then for each documented function -# all documented entities called/used by that function will be listed. -# The default value is: NO. - -REFERENCES_RELATION = NO - -# If the REFERENCES_LINK_SOURCE tag is set to YES and SOURCE_BROWSER tag is set -# to YES then the hyperlinks from functions in REFERENCES_RELATION and -# REFERENCED_BY_RELATION lists will link to the source code. Otherwise they will -# link to the documentation. -# The default value is: YES. - -REFERENCES_LINK_SOURCE = YES - -# If SOURCE_TOOLTIPS is enabled (the default) then hovering a hyperlink in the -# source code will show a tooltip with additional information such as prototype, -# brief description and links to the definition and documentation. Since this -# will make the HTML file larger and loading of large files a bit slower, you -# can opt to disable this feature. -# The default value is: YES. -# This tag requires that the tag SOURCE_BROWSER is set to YES. - -SOURCE_TOOLTIPS = YES - -# If the USE_HTAGS tag is set to YES then the references to source code will -# point to the HTML generated by the htags(1) tool instead of doxygen built-in -# source browser. The htags tool is part of GNU's global source tagging system -# (see http://www.gnu.org/software/global/global.html). You will need version -# 4.8.6 or higher. -# -# To use it do the following: -# - Install the latest version of global -# - Enable SOURCE_BROWSER and USE_HTAGS in the config file -# - Make sure the INPUT points to the root of the source tree -# - Run doxygen as normal -# -# Doxygen will invoke htags (and that will in turn invoke gtags), so these -# tools must be available from the command line (i.e. in the search path). -# -# The result: instead of the source browser generated by doxygen, the links to -# source code will now point to the output of htags. -# The default value is: NO. -# This tag requires that the tag SOURCE_BROWSER is set to YES. 
- -USE_HTAGS = NO - -# If the VERBATIM_HEADERS tag is set the YES then doxygen will generate a -# verbatim copy of the header file for each class for which an include is -# specified. Set to NO to disable this. -# See also: Section \class. -# The default value is: YES. - -VERBATIM_HEADERS = YES - -# If the CLANG_ASSISTED_PARSING tag is set to YES then doxygen will use the -# clang parser (see: http://clang.llvm.org/) for more accurate parsing at the -# cost of reduced performance. This can be particularly helpful with template -# rich C++ code for which doxygen's built-in parser lacks the necessary type -# information. -# Note: The availability of this option depends on whether or not doxygen was -# generated with the -Duse-libclang=ON option for CMake. -# The default value is: NO. - -CLANG_ASSISTED_PARSING = NO - -# If clang assisted parsing is enabled you can provide the compiler with command -# line options that you would normally use when invoking the compiler. Note that -# the include paths will already be set by doxygen for the files and directories -# specified with INPUT and INCLUDE_PATH. -# This tag requires that the tag CLANG_ASSISTED_PARSING is set to YES. - -CLANG_OPTIONS = - -#--------------------------------------------------------------------------- -# Configuration options related to the alphabetical class index -#--------------------------------------------------------------------------- - -# If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index of all -# compounds will be generated. Enable this if the project contains a lot of -# classes, structs, unions or interfaces. -# The default value is: YES. - -ALPHABETICAL_INDEX = YES - -# The COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns in -# which the alphabetical index list will be split. -# Minimum value: 1, maximum value: 20, default value: 5. -# This tag requires that the tag ALPHABETICAL_INDEX is set to YES. 
- -COLS_IN_ALPHA_INDEX = 5 - -# In case all classes in a project start with a common prefix, all classes will -# be put under the same header in the alphabetical index. The IGNORE_PREFIX tag -# can be used to specify a prefix (or a list of prefixes) that should be ignored -# while generating the index headers. -# This tag requires that the tag ALPHABETICAL_INDEX is set to YES. - -IGNORE_PREFIX = - -#--------------------------------------------------------------------------- -# Configuration options related to the HTML output -#--------------------------------------------------------------------------- - -# If the GENERATE_HTML tag is set to YES, doxygen will generate HTML output -# The default value is: YES. - -GENERATE_HTML = YES - -# The HTML_OUTPUT tag is used to specify where the HTML docs will be put. If a -# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of -# it. -# The default directory is: html. -# This tag requires that the tag GENERATE_HTML is set to YES. - -HTML_OUTPUT = html - -# The HTML_FILE_EXTENSION tag can be used to specify the file extension for each -# generated HTML page (for example: .htm, .php, .asp). -# The default value is: .html. -# This tag requires that the tag GENERATE_HTML is set to YES. - -HTML_FILE_EXTENSION = .html - -# The HTML_HEADER tag can be used to specify a user-defined HTML header file for -# each generated HTML page. If the tag is left blank doxygen will generate a -# standard header. -# -# To get valid HTML the header file that includes any scripts and style sheets -# that doxygen needs, which is dependent on the configuration options used (e.g. -# the setting GENERATE_TREEVIEW). It is highly recommended to start with a -# default header using -# doxygen -w html new_header.html new_footer.html new_stylesheet.css -# YourConfigFile -# and then modify the file new_header.html. See also section "Doxygen usage" -# for information on how to generate the default header that doxygen normally -# uses. 
-# Note: The header is subject to change so you typically have to regenerate the -# default header when upgrading to a newer version of doxygen. For a description -# of the possible markers and block names see the documentation. -# This tag requires that the tag GENERATE_HTML is set to YES. - -HTML_HEADER = - -# The HTML_FOOTER tag can be used to specify a user-defined HTML footer for each -# generated HTML page. If the tag is left blank doxygen will generate a standard -# footer. See HTML_HEADER for more information on how to generate a default -# footer and what special commands can be used inside the footer. See also -# section "Doxygen usage" for information on how to generate the default footer -# that doxygen normally uses. -# This tag requires that the tag GENERATE_HTML is set to YES. - -HTML_FOOTER = - -# The HTML_STYLESHEET tag can be used to specify a user-defined cascading style -# sheet that is used by each HTML page. It can be used to fine-tune the look of -# the HTML output. If left blank doxygen will generate a default style sheet. -# See also section "Doxygen usage" for information on how to generate the style -# sheet that doxygen normally uses. -# Note: It is recommended to use HTML_EXTRA_STYLESHEET instead of this tag, as -# it is more robust and this tag (HTML_STYLESHEET) will in the future become -# obsolete. -# This tag requires that the tag GENERATE_HTML is set to YES. - -HTML_STYLESHEET = - -# The HTML_EXTRA_STYLESHEET tag can be used to specify additional user-defined -# cascading style sheets that are included after the standard style sheets -# created by doxygen. Using this option one can overrule certain style aspects. -# This is preferred over using HTML_STYLESHEET since it does not replace the -# standard style sheet and is therefore more robust against future updates. -# Doxygen will copy the style sheet files to the output directory. -# Note: The order of the extra style sheet files is of importance (e.g. 
the last -# style sheet in the list overrules the setting of the previous ones in the -# list). For an example see the documentation. -# This tag requires that the tag GENERATE_HTML is set to YES. - -HTML_EXTRA_STYLESHEET = - -# The HTML_EXTRA_FILES tag can be used to specify one or more extra images or -# other source files which should be copied to the HTML output directory. Note -# that these files will be copied to the base HTML output directory. Use the -# $relpath^ marker in the HTML_HEADER and/or HTML_FOOTER files to load these -# files. In the HTML_STYLESHEET file, use the file name only. Also note that the -# files will be copied as-is; there are no commands or markers available. -# This tag requires that the tag GENERATE_HTML is set to YES. - -HTML_EXTRA_FILES = - -# The HTML_COLORSTYLE_HUE tag controls the color of the HTML output. Doxygen -# will adjust the colors in the style sheet and background images according to -# this color. Hue is specified as an angle on a colorwheel, see -# http://en.wikipedia.org/wiki/Hue for more information. For instance the value -# 0 represents red, 60 is yellow, 120 is green, 180 is cyan, 240 is blue, 300 -# purple, and 360 is red again. -# Minimum value: 0, maximum value: 359, default value: 220. -# This tag requires that the tag GENERATE_HTML is set to YES. - -HTML_COLORSTYLE_HUE = 220 - -# The HTML_COLORSTYLE_SAT tag controls the purity (or saturation) of the colors -# in the HTML output. For a value of 0 the output will use grayscales only. A -# value of 255 will produce the most vivid colors. -# Minimum value: 0, maximum value: 255, default value: 100. -# This tag requires that the tag GENERATE_HTML is set to YES. - -HTML_COLORSTYLE_SAT = 100 - -# The HTML_COLORSTYLE_GAMMA tag controls the gamma correction applied to the -# luminance component of the colors in the HTML output. Values below 100 -# gradually make the output lighter, whereas values above 100 make the output -# darker. 
The value divided by 100 is the actual gamma applied, so 80 represents -# a gamma of 0.8, The value 220 represents a gamma of 2.2, and 100 does not -# change the gamma. -# Minimum value: 40, maximum value: 240, default value: 80. -# This tag requires that the tag GENERATE_HTML is set to YES. - -HTML_COLORSTYLE_GAMMA = 80 - -# If the HTML_TIMESTAMP tag is set to YES then the footer of each generated HTML -# page will contain the date and time when the page was generated. Setting this -# to YES can help to show when doxygen was last run and thus if the -# documentation is up to date. -# The default value is: NO. -# This tag requires that the tag GENERATE_HTML is set to YES. - -HTML_TIMESTAMP = NO - -# If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML -# documentation will contain sections that can be hidden and shown after the -# page has loaded. -# The default value is: NO. -# This tag requires that the tag GENERATE_HTML is set to YES. - -HTML_DYNAMIC_SECTIONS = NO - -# With HTML_INDEX_NUM_ENTRIES one can control the preferred number of entries -# shown in the various tree structured indices initially; the user can expand -# and collapse entries dynamically later on. Doxygen will expand the tree to -# such a level that at most the specified number of entries are visible (unless -# a fully collapsed tree already exceeds this amount). So setting the number of -# entries 1 will produce a full collapsed tree by default. 0 is a special value -# representing an infinite number of entries and will result in a full expanded -# tree by default. -# Minimum value: 0, maximum value: 9999, default value: 100. -# This tag requires that the tag GENERATE_HTML is set to YES. 
- -HTML_INDEX_NUM_ENTRIES = 100 - -# If the GENERATE_DOCSET tag is set to YES, additional index files will be -# generated that can be used as input for Apple's Xcode 3 integrated development -# environment (see: http://developer.apple.com/tools/xcode/), introduced with -# OSX 10.5 (Leopard). To create a documentation set, doxygen will generate a -# Makefile in the HTML output directory. Running make will produce the docset in -# that directory and running make install will install the docset in -# ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find it at -# startup. See http://developer.apple.com/tools/creatingdocsetswithdoxygen.html -# for more information. -# The default value is: NO. -# This tag requires that the tag GENERATE_HTML is set to YES. - -GENERATE_DOCSET = NO - -# This tag determines the name of the docset feed. A documentation feed provides -# an umbrella under which multiple documentation sets from a single provider -# (such as a company or product suite) can be grouped. -# The default value is: Doxygen generated docs. -# This tag requires that the tag GENERATE_DOCSET is set to YES. - -DOCSET_FEEDNAME = "Doxygen generated docs" - -# This tag specifies a string that should uniquely identify the documentation -# set bundle. This should be a reverse domain-name style string, e.g. -# com.mycompany.MyDocSet. Doxygen will append .docset to the name. -# The default value is: org.doxygen.Project. -# This tag requires that the tag GENERATE_DOCSET is set to YES. - -DOCSET_BUNDLE_ID = org.doxygen.Project - -# The DOCSET_PUBLISHER_ID tag specifies a string that should uniquely identify -# the documentation publisher. This should be a reverse domain-name style -# string, e.g. com.mycompany.MyDocSet.documentation. -# The default value is: org.doxygen.Publisher. -# This tag requires that the tag GENERATE_DOCSET is set to YES. 
- -DOCSET_PUBLISHER_ID = org.doxygen.Publisher - -# The DOCSET_PUBLISHER_NAME tag identifies the documentation publisher. -# The default value is: Publisher. -# This tag requires that the tag GENERATE_DOCSET is set to YES. - -DOCSET_PUBLISHER_NAME = Publisher - -# If the GENERATE_HTMLHELP tag is set to YES then doxygen generates three -# additional HTML index files: index.hhp, index.hhc, and index.hhk. The -# index.hhp is a project file that can be read by Microsoft's HTML Help Workshop -# (see: http://www.microsoft.com/en-us/download/details.aspx?id=21138) on -# Windows. -# -# The HTML Help Workshop contains a compiler that can convert all HTML output -# generated by doxygen into a single compiled HTML file (.chm). Compiled HTML -# files are now used as the Windows 98 help format, and will replace the old -# Windows help format (.hlp) on all Windows platforms in the future. Compressed -# HTML files also contain an index, a table of contents, and you can search for -# words in the documentation. The HTML workshop also contains a viewer for -# compressed HTML files. -# The default value is: NO. -# This tag requires that the tag GENERATE_HTML is set to YES. - -GENERATE_HTMLHELP = NO - -# The CHM_FILE tag can be used to specify the file name of the resulting .chm -# file. You can add a path in front of the file if the result should not be -# written to the html output directory. -# This tag requires that the tag GENERATE_HTMLHELP is set to YES. - -CHM_FILE = - -# The HHC_LOCATION tag can be used to specify the location (absolute path -# including file name) of the HTML help compiler (hhc.exe). If non-empty, -# doxygen will try to run the HTML help compiler on the generated index.hhp. -# The file has to be specified with full path. -# This tag requires that the tag GENERATE_HTMLHELP is set to YES. - -HHC_LOCATION = - -# The GENERATE_CHI flag controls if a separate .chi index file is generated -# (YES) or that it should be included in the master .chm file (NO). 
-# The default value is: NO. -# This tag requires that the tag GENERATE_HTMLHELP is set to YES. - -GENERATE_CHI = NO - -# The CHM_INDEX_ENCODING is used to encode HtmlHelp index (hhk), content (hhc) -# and project file content. -# This tag requires that the tag GENERATE_HTMLHELP is set to YES. - -CHM_INDEX_ENCODING = - -# The BINARY_TOC flag controls whether a binary table of contents is generated -# (YES) or a normal table of contents (NO) in the .chm file. Furthermore it -# enables the Previous and Next buttons. -# The default value is: NO. -# This tag requires that the tag GENERATE_HTMLHELP is set to YES. - -BINARY_TOC = NO - -# The TOC_EXPAND flag can be set to YES to add extra items for group members to -# the table of contents of the HTML help documentation and to the tree view. -# The default value is: NO. -# This tag requires that the tag GENERATE_HTMLHELP is set to YES. - -TOC_EXPAND = NO - -# If the GENERATE_QHP tag is set to YES and both QHP_NAMESPACE and -# QHP_VIRTUAL_FOLDER are set, an additional index file will be generated that -# can be used as input for Qt's qhelpgenerator to generate a Qt Compressed Help -# (.qch) of the generated HTML documentation. -# The default value is: NO. -# This tag requires that the tag GENERATE_HTML is set to YES. - -GENERATE_QHP = NO - -# If the QHG_LOCATION tag is specified, the QCH_FILE tag can be used to specify -# the file name of the resulting .qch file. The path specified is relative to -# the HTML output folder. -# This tag requires that the tag GENERATE_QHP is set to YES. - -QCH_FILE = - -# The QHP_NAMESPACE tag specifies the namespace to use when generating Qt Help -# Project output. For more information please see Qt Help Project / Namespace -# (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#namespace). -# The default value is: org.doxygen.Project. -# This tag requires that the tag GENERATE_QHP is set to YES. 
- -QHP_NAMESPACE = org.doxygen.Project - -# The QHP_VIRTUAL_FOLDER tag specifies the namespace to use when generating Qt -# Help Project output. For more information please see Qt Help Project / Virtual -# Folders (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#virtual- -# folders). -# The default value is: doc. -# This tag requires that the tag GENERATE_QHP is set to YES. - -QHP_VIRTUAL_FOLDER = doc - -# If the QHP_CUST_FILTER_NAME tag is set, it specifies the name of a custom -# filter to add. For more information please see Qt Help Project / Custom -# Filters (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#custom- -# filters). -# This tag requires that the tag GENERATE_QHP is set to YES. - -QHP_CUST_FILTER_NAME = - -# The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the -# custom filter to add. For more information please see Qt Help Project / Custom -# Filters (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#custom- -# filters). -# This tag requires that the tag GENERATE_QHP is set to YES. - -QHP_CUST_FILTER_ATTRS = - -# The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this -# project's filter section matches. Qt Help Project / Filter Attributes (see: -# http://qt-project.org/doc/qt-4.8/qthelpproject.html#filter-attributes). -# This tag requires that the tag GENERATE_QHP is set to YES. - -QHP_SECT_FILTER_ATTRS = - -# The QHG_LOCATION tag can be used to specify the location of Qt's -# qhelpgenerator. If non-empty doxygen will try to run qhelpgenerator on the -# generated .qhp file. -# This tag requires that the tag GENERATE_QHP is set to YES. - -QHG_LOCATION = - -# If the GENERATE_ECLIPSEHELP tag is set to YES, additional index files will be -# generated, together with the HTML files, they form an Eclipse help plugin. 
To -# install this plugin and make it available under the help contents menu in -# Eclipse, the contents of the directory containing the HTML and XML files needs -# to be copied into the plugins directory of eclipse. The name of the directory -# within the plugins directory should be the same as the ECLIPSE_DOC_ID value. -# After copying Eclipse needs to be restarted before the help appears. -# The default value is: NO. -# This tag requires that the tag GENERATE_HTML is set to YES. - -GENERATE_ECLIPSEHELP = NO - -# A unique identifier for the Eclipse help plugin. When installing the plugin -# the directory name containing the HTML and XML files should also have this -# name. Each documentation set should have its own identifier. -# The default value is: org.doxygen.Project. -# This tag requires that the tag GENERATE_ECLIPSEHELP is set to YES. - -ECLIPSE_DOC_ID = org.doxygen.Project - -# If you want full control over the layout of the generated HTML pages it might -# be necessary to disable the index and replace it with your own. The -# DISABLE_INDEX tag can be used to turn on/off the condensed index (tabs) at top -# of each HTML page. A value of NO enables the index and the value YES disables -# it. Since the tabs in the index contain the same information as the navigation -# tree, you can set this option to YES if you also set GENERATE_TREEVIEW to YES. -# The default value is: NO. -# This tag requires that the tag GENERATE_HTML is set to YES. - -DISABLE_INDEX = NO - -# The GENERATE_TREEVIEW tag is used to specify whether a tree-like index -# structure should be generated to display hierarchical information. If the tag -# value is set to YES, a side panel will be generated containing a tree-like -# index structure (just like the one that is generated for HTML Help). For this -# to work a browser that supports JavaScript, DHTML, CSS and frames is required -# (i.e. any modern browser). Windows users are probably better off using the -# HTML help feature. 
Via custom style sheets (see HTML_EXTRA_STYLESHEET) one can -# further fine-tune the look of the index. As an example, the default style -# sheet generated by doxygen has an example that shows how to put an image at -# the root of the tree instead of the PROJECT_NAME. Since the tree basically has -# the same information as the tab index, you could consider setting -# DISABLE_INDEX to YES when enabling this option. -# The default value is: NO. -# This tag requires that the tag GENERATE_HTML is set to YES. - -GENERATE_TREEVIEW = NO - -# The ENUM_VALUES_PER_LINE tag can be used to set the number of enum values that -# doxygen will group on one line in the generated HTML documentation. -# -# Note that a value of 0 will completely suppress the enum values from appearing -# in the overview section. -# Minimum value: 0, maximum value: 20, default value: 4. -# This tag requires that the tag GENERATE_HTML is set to YES. - -ENUM_VALUES_PER_LINE = 4 - -# If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be used -# to set the initial width (in pixels) of the frame in which the tree is shown. -# Minimum value: 0, maximum value: 1500, default value: 250. -# This tag requires that the tag GENERATE_HTML is set to YES. - -TREEVIEW_WIDTH = 250 - -# If the EXT_LINKS_IN_WINDOW option is set to YES, doxygen will open links to -# external symbols imported via tag files in a separate window. -# The default value is: NO. -# This tag requires that the tag GENERATE_HTML is set to YES. - -EXT_LINKS_IN_WINDOW = NO - -# Use this tag to change the font size of LaTeX formulas included as images in -# the HTML documentation. When you change the font size after a successful -# doxygen run you need to manually remove any form_*.png images from the HTML -# output directory to force them to be regenerated. -# Minimum value: 8, maximum value: 50, default value: 10. -# This tag requires that the tag GENERATE_HTML is set to YES. 
- -FORMULA_FONTSIZE = 10 - -# Use the FORMULA_TRANSPARENT tag to determine whether or not the images -# generated for formulas are transparent PNGs. Transparent PNGs are not -# supported properly for IE 6.0, but are supported on all modern browsers. -# -# Note that when changing this option you need to delete any form_*.png files in -# the HTML output directory before the changes have effect. -# The default value is: YES. -# This tag requires that the tag GENERATE_HTML is set to YES. - -FORMULA_TRANSPARENT = YES - -# Enable the USE_MATHJAX option to render LaTeX formulas using MathJax (see -# http://www.mathjax.org) which uses client side Javascript for the rendering -# instead of using pre-rendered bitmaps. Use this if you do not have LaTeX -# installed or if you want the formulas to look prettier in the HTML output. When -# enabled you may also need to install MathJax separately and configure the path -# to it using the MATHJAX_RELPATH option. -# The default value is: NO. -# This tag requires that the tag GENERATE_HTML is set to YES. - -USE_MATHJAX = NO - -# When MathJax is enabled you can set the default output format to be used for -# the MathJax output. See the MathJax site (see: -# http://docs.mathjax.org/en/latest/output.html) for more details. -# Possible values are: HTML-CSS (which is slower, but has the best -# compatibility), NativeMML (i.e. MathML) and SVG. -# The default value is: HTML-CSS. -# This tag requires that the tag USE_MATHJAX is set to YES. - -MATHJAX_FORMAT = HTML-CSS - -# When MathJax is enabled you need to specify the location relative to the HTML -# output directory using the MATHJAX_RELPATH option. The destination directory -# should contain the MathJax.js script. For instance, if the mathjax directory -# is located at the same level as the HTML output directory, then -# MATHJAX_RELPATH should be ../mathjax. The default value points to the MathJax -# Content Delivery Network so you can quickly see the result without installing -# MathJax.
However, it is strongly recommended to install a local copy of -# MathJax from http://www.mathjax.org before deployment. -# The default value is: http://cdn.mathjax.org/mathjax/latest. -# This tag requires that the tag USE_MATHJAX is set to YES. - -MATHJAX_RELPATH = http://cdn.mathjax.org/mathjax/latest - -# The MATHJAX_EXTENSIONS tag can be used to specify one or more MathJax -# extension names that should be enabled during MathJax rendering. For example -# MATHJAX_EXTENSIONS = TeX/AMSmath TeX/AMSsymbols -# This tag requires that the tag USE_MATHJAX is set to YES. - -MATHJAX_EXTENSIONS = - -# The MATHJAX_CODEFILE tag can be used to specify a file with javascript pieces -# of code that will be used on startup of the MathJax code. See the MathJax site -# (see: http://docs.mathjax.org/en/latest/output.html) for more details. For an -# example see the documentation. -# This tag requires that the tag USE_MATHJAX is set to YES. - -MATHJAX_CODEFILE = - -# When the SEARCHENGINE tag is enabled doxygen will generate a search box for -# the HTML output. The underlying search engine uses javascript and DHTML and -# should work on any modern browser. Note that when using HTML help -# (GENERATE_HTMLHELP), Qt help (GENERATE_QHP), or docsets (GENERATE_DOCSET) -# there is already a search function so this one should typically be disabled. -# For large projects the javascript based search engine can be slow, then -# enabling SERVER_BASED_SEARCH may provide a better solution. It is possible to -# search using the keyboard; to jump to the search box use <access key> + S -# (what the <access key> is depends on the OS and browser, but it is typically -# <CTRL>, <ALT>/