diff --git a/Makefile b/Makefile
index 5cf96bf10..da17c1fea 100644
--- a/Makefile
+++ b/Makefile
@@ -178,11 +178,19 @@ openocd.checkout:
 	fi
 
 openocd.build: openocd.checkout
-	cd utils/openocd && ./bootstrap && ./configure --enable-jtag_dpi --prefix=$(INSTALL_DIR)/openocd && make && make install
+	cd utils/openocd && ./bootstrap && ./configure --enable-jtag_dpi --prefix=$(INSTALL_DIR)/openocd && $(MAKE) && $(MAKE) install
 
 openocd.clean:
 	rm -rf $(INSTALL_DIR)/openocd tools/openocd
 
+PROFILER_V2_DIR = $(GAP_SDK_HOME)/tools/profiler_v2
+PROFILER_V2_BUILD_DIR = $(GAP_SDK_HOME)/build/profiler_v2
+.PHONY: profiler_v2  # configures, builds and installs tools/profiler_v2 with CMake; a command, not a file
+profiler_v2:
+	cmake -S $(PROFILER_V2_DIR) -B $(PROFILER_V2_BUILD_DIR)
+	cmake --build $(PROFILER_V2_BUILD_DIR)
+	cmake --install $(PROFILER_V2_BUILD_DIR) --prefix $(INSTALL_DIR)
+
 profiler:
 	$(MAKE) -C tools/profiler all
 	mkdir -p $(INSTALL_DIR)/bin
diff --git a/configs/common.sh b/configs/common.sh
index c030d35a7..ce8f600f5 100644
--- a/configs/common.sh
+++ b/configs/common.sh
@@ -76,7 +76,8 @@ export PYTHONPATH=$GAP_SDK_HOME/gvsoc/gvsoc/engine/python:$PYTHONPATH
 export PATH="$GAP_SDK_HOME/utils/gaptest":$PATH
 
 # Audio framework
-export PYTHONPATH=$GAP_SDK_HOME/tools/audio-framework/frontends/python_graph_generator:$GAP_SDK_HOME/tools/audio-framework/components:$PYTHONPATH
+export GAP_AUDIO_FRAMEWORK_HOME=$GAP_SDK_HOME/tools/audio-framework
+export PYTHONPATH=$GAP_AUDIO_FRAMEWORK_HOME/frontends/python_graph_generator:$GAP_AUDIO_FRAMEWORK_HOME/components:$PYTHONPATH
 
 
 # Autotiler
diff --git a/configs/gapuino_v3.sh b/configs/gapuino_v3.sh
index 2db43eac9..ff20523ca 100644
--- a/configs/gapuino_v3.sh
+++ b/configs/gapuino_v3.sh
@@ -23,4 +23,6 @@ export OPENOCD_CABLE=interface/ftdi/gapuino_ftdi.cfg
 
 export GAPY_TARGET=gapuino_v3
 
+export PLPTEST_DEFAULT_PROPERTIES="chip=gap8_v3 chip_family=gap8 board=gapuino_v3 duration=50 test_duration=50"
+
 source $GAP_SDK_HOME/configs/common.sh
diff --git a/doc/conf.py b/doc/conf.py
index f9bc1c7da..d672e4da7 100644
--- a/doc/conf.py
+++ b/doc/conf.py
@@ -35,6 +35,7 @@ def configure_doxyfile(file_in, file_out, replace_dict):
         "../rtos/pmsis/pmsis_api/include/pmsis/rtos/",
         "../rtos/pmsis/pmsis_api/include/pmsis/cluster/",
         "../rtos/pmsis/pmsis_api/include/pmsis/platforms/",
+        "../rtos/pmsis/pmsis_api/include/pmsis/",
         "../rtos/pmsis/pmsis_bsp/include/",
         "source/reference/builtins/headers/",
     ]
@@ -86,6 +87,10 @@ def configure_doxyfile(file_in, file_out, replace_dict):
 html_theme = "sphinx_rtd_theme"
 html_logo = "_static/logo.png"
 
+html_theme_options = {
+        'navigation_depth' : -1,
+}
+
 # Add any paths that contain custom static files (such as style sheets) here,
 # relative to this directory. They are copied after the builtin static files,
 # so a file named "default.css" will overwrite the builtin "default.css".
diff --git a/examples/autotiler/FFTL1/FFTRunTest.c b/examples/autotiler/FFTL1/FFTRunTest.c
index b8e892448..340333255 100644
--- a/examples/autotiler/FFTL1/FFTRunTest.c
+++ b/examples/autotiler/FFTL1/FFTRunTest.c
@@ -4,6 +4,12 @@
 	#define pmsis_exit(a)	exit(a)		
 #endif
 
+#ifdef SILENT
+    #define PRINTF(...) ((void) 0)  /* silent build: trace calls compile to nothing */
+#else
+    #define PRINTF printf           /* default: traces are forwarded to printf */
+#endif
+
 #define __XSTR(__s) __STR(__s)
 #define __STR(__s) #__s
 #include <stdlib.h>
@@ -21,7 +27,8 @@
 #endif
 #define  STACK_SIZE 			   2048
 typedef void (*FFTFun_T )(void *Data, void *Twiddles, signed char *shift, unsigned int Nfft, unsigned int Inverse);
-
+PI_L2 int PERF_ARR[2048/128 + 1][3][2];  // [Nfft/128][Type][0: FFT cycles, 1: swap cycles]; Nfft/128 reaches 16 for a 2048-point FFT
+PI_L2 float MSE_ARR[2048/128 + 1][2];    // [Nfft/128][0: Q16 MSE, 1: F16 MSE]; sized like PERF_ARR so Nfft/128 stays in bounds
 
 short int *InBuff_q16;
 float     *InBuff_f32, *InBuff_f32R4, *OutBuff_f32;
@@ -58,14 +65,14 @@ float MSE_f32(float* real, float* calc, int Size){
 void CallFFT(int Nfft, int Type){
 
 	// FFT: reset buffers, run and check mse
-	int start, elapsed, elapsedFFT, Q;
+	int start, elapsed, elapsedFFT, Q = 0;
 	FFT_InstallArg_T ArgIns;
 	FFT_Arg_T FFTArg;
 	AT_L2_EVENT DmaR_Evt1;
-	void (*FFTFun)(FFT_Arg_T*);
-	void (*SwapFun)(SwapSamples_Arg_T*);
+	void (*FFTFun)(FFT_Arg_T*) = 0;
+	void (*SwapFun)(SwapSamples_Arg_T*) = 0;
 	char *FFTDataType = 0;
-	void *InBuff;
+	void *InBuff = 0;
 
   	ArgIns.Nfft = Nfft; 
   	ArgIns.Radix = ((Nfft)==64 || (Nfft)==256 || (Nfft)==1024)?4:2;
@@ -96,11 +103,11 @@ void CallFFT(int Nfft, int Type){
 			FFTDataType = "Q16";
 			switch (Nfft) {
 				case 64:   ArgIns.Twiddles = R4_Twiddles_fix_64;   ArgIns.SwapLUT = R4_SwapTable_fix_64;   Q = 10; break;
-				case 128:  ArgIns.Twiddles = R2_Twiddles_fix_128;  ArgIns.SwapLUT = R2_SwapTable_fix_128;  Q = 8; break;
-				case 256:  ArgIns.Twiddles = R4_Twiddles_fix_256;  ArgIns.SwapLUT = R4_SwapTable_fix_256;  Q = 8; break;
-				case 512:  ArgIns.Twiddles = R2_Twiddles_fix_512;  ArgIns.SwapLUT = R2_SwapTable_fix_512;  Q = 6; break;
-				case 1024: ArgIns.Twiddles = R4_Twiddles_fix_1024; ArgIns.SwapLUT = R4_SwapTable_fix_1024; Q = 6; break;
-				case 2048: ArgIns.Twiddles = R2_Twiddles_fix_2048; ArgIns.SwapLUT = R2_SwapTable_fix_2048; Q = 4; break;
+				case 128:  ArgIns.Twiddles = R2_Twiddles_fix_128;  ArgIns.SwapLUT = R2_SwapTable_fix_128;  Q = 7; break;
+				case 256:  ArgIns.Twiddles = R4_Twiddles_fix_256;  ArgIns.SwapLUT = R4_SwapTable_fix_256;  Q = 6; break;
+				case 512:  ArgIns.Twiddles = R2_Twiddles_fix_512;  ArgIns.SwapLUT = R2_SwapTable_fix_512;  Q = 5; break;
+				case 1024: ArgIns.Twiddles = R4_Twiddles_fix_1024; ArgIns.SwapLUT = R4_SwapTable_fix_1024; Q = 4; break;
+				case 2048: ArgIns.Twiddles = R2_Twiddles_fix_2048; ArgIns.SwapLUT = R2_SwapTable_fix_2048; Q = 3; break;
 			}
 			if (ArgIns.Radix == 2) FFTFun = &Radix2FFT_DIF_Par_Fix16;
 			else 				   FFTFun = &Radix4FFT_DIF_Par_Fix16;
@@ -139,24 +146,32 @@ void CallFFT(int Nfft, int Type){
   	__CALL((*FFTFun), &FFTArg);
   	AT_FORK(gap_ncore(), (void *) (*SwapFun), (void *) &SwapArg);
   	__CALL((*SwapFun), &SwapArg);
-  	elapsed = gap_cl_readhwtimer() - start; printf("|     %4d | %3s %6s | %6d | %5d | %6d", Nfft, FFTDataType, ArgIns.Radix==2?"Radix2":"Radix4", elapsedFFT, elapsed, elapsed+elapsedFFT);
+  	elapsed = gap_cl_readhwtimer() - start;
+
+  	PERF_ARR[Nfft/128][Type][0] = elapsedFFT;
+  	PERF_ARR[Nfft/128][Type][1] = elapsed;
+  	
+
+  	PRINTF("|     %4d | %3s %6s | %6d | %5d | %6d", Nfft, FFTDataType, ArgIns.Radix==2?"Radix2":"Radix4", elapsedFFT, elapsed, elapsed+elapsedFFT);
 	#if !defined(__EMUL__) && defined(PERF_ALL)
-  	printf(" | %7d | %7d | %7d | %8d | %7d |", pi_perf_read(PI_PERF_INSTR), pi_perf_read(PI_PERF_ACTIVE_CYCLES), pi_perf_read(PI_PERF_TCDM_CONT), pi_perf_read(PI_PERF_LD_STALL), pi_perf_read(PI_PERF_IMISS));
+  	PRINTF(" | %7d | %7d | %7d | %8d | %7d |", pi_perf_read(PI_PERF_INSTR), pi_perf_read(PI_PERF_ACTIVE_CYCLES), pi_perf_read(PI_PERF_TCDM_CONT), pi_perf_read(PI_PERF_LD_STALL), pi_perf_read(PI_PERF_IMISS));
   	#else
-  	printf(" |         |         |         |          |         |");
+  	PRINTF(" |         |         |         |          |         |");
   	#endif
   	if (Type == 0) {
-  		printf("          |\n");
-  		// printf("\nOutFFT%d_f32 = np.array([\n", Nfft); for(int i=0;i<(Nfft); i++) printf("%f%+fj, ", InBuff_f32[2*i], InBuff_f32[2*i+1]); printf("])\n");
+  		PRINTF("          |\n");
+  		// PRINTF("\nOutFFT%d_f32 = np.array([\n", Nfft); for(int i=0;i<(Nfft); i++) PRINTF("%f%+fj, ", InBuff_f32[2*i], InBuff_f32[2*i+1]); PRINTF("])\n");
   	} else if (Type == 1) {
-  		// printf("\nOutFFT%d_q16 = np.array([\n", Nfft); for(int i=0;i<(Nfft); i++) printf("%d%+dj, ", ((short int*)InBuff_q16)[2*i], ((short int*)InBuff_q16)[2*i+1]); printf("])\n");
-  		printf(" %f |\n", MSE_16(InBuff_f32, (short int*) InBuff_q16, Nfft, Q));
+  		// PRINTF("\nOutFFT%d_q16 = np.array([\n", Nfft); for(int i=0;i<(Nfft); i++) PRINTF("%d%+dj, ", ((short int*)InBuff_q16)[2*i], ((short int*)InBuff_q16)[2*i+1]); PRINTF("])\n");
+  		MSE_ARR[Nfft/128][0] = MSE_16(InBuff_f32, (short int*) InBuff_q16, Nfft, Q);
+  		PRINTF(" %f |\n", MSE_ARR[Nfft/128][0]);
   	} else if (Type == 2) {
   		#ifdef __gap9__
-  		// printf("\nOutFFT%d_f16 = np.array([\n", Nfft); for(int i=0;i<(Nfft); i++) printf("%f%+fj, ", ((f16*)OutBuff)[2*i], ((f16*)OutBuff)[2*i+1]); printf("])\n");
-  		printf(" %f |\n", MSE_f16(InBuff_f32, (f16 *) InBuff_f16, Nfft));
+  		// PRINTF("\nOutFFT%d_f16 = np.array([\n", Nfft); for(int i=0;i<(Nfft); i++) PRINTF("%f%+fj, ", ((f16*)OutBuff)[2*i], ((f16*)OutBuff)[2*i+1]); PRINTF("])\n");
+  		MSE_ARR[Nfft/128][1] = MSE_f16(InBuff_f32, (f16 *) InBuff_f16, Nfft);
+  		PRINTF(" %f |\n", MSE_ARR[Nfft/128][1]);
   		#else
-  		printf("\n");
+  		PRINTF("\n");
   		#endif
   	}
 }
@@ -171,31 +186,31 @@ static void RunFFT()
     #endif
     gap_cl_resethwtimer();
     int	start, elapsed, timef32;
-    printf("Initializing inputs....\n");
+    PRINTF("Initializing inputs....\n");
 	//InitData4      (InDataQ16, MAXDIM, 37, 15, 23, 73, 0.1, 0.5, 0.6, 0.8);
 	//InitData4_float(InDataf32, MAXDIM, 37, 15, 23, 73, 0.1, 0.5, 0.6, 0.8);
 	#ifdef __gap9__
 	for (int i=0; i<MAXDIM; i++) InDataf16[i] = (f16) InDataf32[i];
 	#endif
-	printf("Done!\n");
+	PRINTF("Done!\n");
 
   	gap_cl_resethwtimer();
 
   	int FFTBins = 64;
-  	printf("|----------+------------+--------+-------+--------+---------+---------+---------+----------+---------+----------|\n");
-  	printf("| FFT BINS | Type       | FFT    | Swap  | Tot    | Instr   | Act Cyc | TCDM Co | LD Stall | Imiss   | MSE Err  |\n");
-  	printf("|----------+------------+--------+-------+--------+---------+---------+---------+----------+---------+----------|\n");
+  	PRINTF("|----------+------------+--------+-------+--------+---------+---------+---------+----------+---------+----------|\n");
+  	PRINTF("| FFT BINS | Type       | FFT    | Swap  | Tot    | Instr   | Act Cyc | TCDM Co | LD Stall | Imiss   | MSE Err  |\n");
+  	PRINTF("|----------+------------+--------+-------+--------+---------+---------+---------+----------+---------+----------|\n");
 	while (FFTBins < MAXDIM){
-		//printf("FFT: %4d\n", FFTBins);
+		//PRINTF("FFT: %4d\n", FFTBins);
 	  	CallFFT(FFTBins, 0);
 	  	CallFFT(FFTBins, 1);
 	  	#ifdef __gap9__
 	  	CallFFT(FFTBins, 2);
 	  	#endif
 	  	FFTBins *= 2;
-  		printf("|----------+------------+--------+-------+--------+---------+---------+---------+----------+---------+----------|\n");
+  		PRINTF("|----------+------------+--------+-------+--------+---------+---------+---------+----------+---------+----------|\n");
   	}
-  	printf("Finished\n");
+  	PRINTF("Finished\n");
 }
 
 void test_kickoff(void *arg)
@@ -237,7 +252,25 @@ void test_kickoff(void *arg)
 		task.slave_stack_size = (unsigned int) 1048;
 		pi_cluster_send_task(&cluster_dev, &task);
 	#endif
-	printf("Exiting\n");
+
+  	int FFTBins = 64;
+	while (FFTBins < MAXDIM){
+		if (MSE_ARR[FFTBins/128][0] > 0.016) {
+			printf("Error: MSE too large for %d FFT Q16\n", FFTBins);
+			printf("Test FAILED\n");
+			pmsis_exit(-1);
+		}
+		#ifdef __gap9__
+		if (MSE_ARR[FFTBins/128][1] > 0.000048) {
+			printf("Error: MSE too large for %d FFT F16\n", FFTBins);
+			printf("Test FAILED\n");
+			pmsis_exit(-1);
+		}
+		#endif
+	  	FFTBins *= 2;
+	}
+ 
+	printf("Test PASSED\n");
     pmsis_exit(0);
 }
 
diff --git a/examples/autotiler/FFTL1/Makefile b/examples/autotiler/FFTL1/Makefile
index 56a7a8a6a..f9886f037 100644
--- a/examples/autotiler/FFTL1/Makefile
+++ b/examples/autotiler/FFTL1/Makefile
@@ -1,7 +1,7 @@
 # User Test
 #------------------------------------
 
-PMSIS_OS?=pulpos
+#PMSIS_OS?=pulpos
 APP              = test
 APP_SRCS        += FFTRunTest.c $(AT_HOME)/DSP_Libraries/FFT_Library.c $(AT_HOME)/DSP_Libraries/LUT_Tables/TwiddlesDef.c $(AT_HOME)/DSP_Libraries/LUT_Tables/SwapTablesDef.c 
 APP_INC	        += 
diff --git a/examples/pmsis/periph/i2c/i2c_scan/Makefile b/examples/pmsis/periph/i2c/i2c_scan/Makefile
index c210b96ed..ad43981fc 100644
--- a/examples/pmsis/periph/i2c/i2c_scan/Makefile
+++ b/examples/pmsis/periph/i2c/i2c_scan/Makefile
@@ -7,14 +7,11 @@ APP_INC         +=
 APP_CFLAGS      +=
 
 
-runner_args =--trace=corruptor --trace-level=trace
-#runner_args =--trace=board.*i2c --trace-level=trace
-#runner_args =--trace=board.*i2c:gvsoc.log --trace-level=trace
-#runner_args =--trace=eeprom
-#runner_args +=--trace-level=trace
+ifeq '$(platform)' 'gvsoc'
 # Overwrite the default target so that GVSOC simulates our board
 # First name is the class name, second one is the python module
 export GAPY_PY_TARGET=My_board@my_board
+endif
 
 # Append current directory to python path so that it finds our board and module
 export PYTHONPATH:=$(CURDIR):$(PYTHONPATH)
diff --git a/gvsoc/gvsoc/bin/pulp-pc-info b/gvsoc/gvsoc/bin/pulp-pc-info
index 62bff025e..ab816b189 100755
--- a/gvsoc/gvsoc/bin/pulp-pc-info
+++ b/gvsoc/gvsoc/bin/pulp-pc-info
@@ -63,12 +63,15 @@ toolchain = os.environ.get('PULP_RISCV_GCC_TOOLCHAIN_CI')
 if toolchain is None:
 	toolchain = os.environ.get('PULP_RISCV_GCC_TOOLCHAIN')
 
-if toolchain is not None:
-	readelf = toolchain + '/bin/riscv32-unknown-elf-readelf'
-	addr2line = toolchain + '/bin/riscv32-unknown-elf-addr2line'
-else:
-	readelf = 'riscv32-unknown-elf-readelf'
-	addr2line = 'riscv32-unknown-elf-addr2line'
+# if toolchain is not None:
+# 	readelf = toolchain + '/bin/riscv32-unknown-elf-readelf'
+# 	addr2line = toolchain + '/bin/riscv32-unknown-elf-addr2line'
+# else:
+# 	readelf = 'riscv32-unknown-elf-readelf'
+# 	addr2line = 'riscv32-unknown-elf-addr2line'
+
+readelf = 'readelf'
+addr2line = 'addr2line'
 
 process = Popen((readelf + ' -s %s' % args.file).split(), stdin=PIPE, stdout=PIPE)
 
diff --git a/gvsoc/gvsoc/engine/include/gv/power.hpp b/gvsoc/gvsoc/engine/include/gv/power.hpp
index df4853d35..2f43e6049 100644
--- a/gvsoc/gvsoc/engine/include/gv/power.hpp
+++ b/gvsoc/gvsoc/engine/include/gv/power.hpp
@@ -148,9 +148,41 @@ namespace vp
              */
             void setup(double temp, double volt, double freq);
 
+            /**
+             * @brief Turn on a power source
+             *
+             * This power source should be turned on when its power domain is turned on, in order to start consuming power
+             */
+            void turn_on();
+
+            /**
+             * @brief Turn off a power source
+             *
+             * This power source should be turned off when its power domain is turned off, in order to stop consuming power
+             */
+            void turn_off();
+
+            /**
+             * @brief Turn on the dynamic power of a power source
+             *
+             * Unlike turn_on(), this only enables the dynamic power of the source and leaves its global on/off state untouched
+             */
+            void turn_dynamic_power_on();
+
+            /**
+             * @brief Turn off the dynamic power of a power source
+             *
+             * Unlike turn_off(), this only disables the dynamic power of the source and leaves its global on/off state untouched
+             */
+            void turn_dynamic_power_off();
+
         private:
-            Linear_table *table = NULL;  // Table of power values for all supported temperatures and voltages
-                                        // imported from the json configuration given when trace was initialized.
+            void check();
+
+            Linear_table *dyn_table = NULL;  // Table of dynamic power values (energy quantum and background power) for all supported
+                                     // temperatures and voltages, imported from the json configuration given when trace was initialized.
+            Linear_table *leakage_table = NULL;  // Table of leakage power values for all supported temperatures and voltages,
+                                     // imported from the json configuration given when trace was initialized.
             double quantum;          // Current quantumm of energy, for quantum-based power consumption.
                                      // The current value is estimated depending on voltage and temperature according
                                      // to the provided json configuration.
@@ -162,7 +194,12 @@ namespace vp
                                      // to the provided json configuration.
             component *top;          // Top component containing the power source
             power_trace *trace;      // Power trace where the power consumption should be reported.
-            bool is_on = false;      // True is the source is on and backgroun-power and leakage should be reported
+            bool is_dynamic_power_started = false;      // True if the source has been asked to consume dynamic background power
+            bool is_leakage_power_started = false;      // True if the source has been asked to consume leakage power
+            bool is_on = false;      // True if the power domain containing the power source is on and background power and leakage should be reported
+            bool is_dynamic_power_on = false;      // True if dynamic power is enabled for the source; energy quanta are only accounted when this and is_on are both true
+            bool dynamic_power_is_on_sync = false;      // NOTE(review): presumably the dynamic power state last reported by check() - confirm
+            bool leakage_power_is_on_sync = false;      // NOTE(review): presumably the leakage power state last reported by check() - confirm
         };
 
 
@@ -327,12 +364,16 @@ namespace vp
             // power consumed.
             void account_leakage_power();
 
-            // Check if the current amount of cycle energy is not for the current cycle
+            // Check if the current amount of power due to energy quanta
+            // is not for the current cycle
             // (by checking the timestamp), and if not, reset it to zero.
-            inline void flush_dynamic_energy_for_cycle();
+            inline void flush_quantum_power_for_cycle();
 
-            // Get the amount of energy spent in the current cycle
-            inline double get_dynamic_energy_for_cycle();
+            // Get the average power of the current cycle due to energy quanta
+            inline double get_quantum_power_for_cycle();
+
+            // Get the energy spent in the current cycle due to energy quanta
+            inline double get_quantum_energy_for_cycle();
 
             // Return the total amount of dynamic energy spent since the beginning
             // of the report windows (since report_start was called)
@@ -363,7 +404,7 @@ namespace vp
             int64_t curent_cycle_timestamp;   // Timestamp of the current cycle, used to compute energy spent in the
                                               // current cycle. As soon as current time is different, the timestamp
                                               // is set to current time and the current energy is set to 0.
-            double dynamic_energy_for_cycle;  // Amount of energy spent in the current cycle.
+            double quantum_power_for_cycle;  // Power spent by energy quanta in the current cycle.
                                               // It is increased everytime a quantum of energy is
                                               // spent and reset to zero when the current cycle is
                                               // over. It is mostly used to compute the instant power
@@ -468,6 +509,15 @@ namespace vp
              */
             vp::power::power_trace *get_power_trace() { return &this->power_trace; }
 
+            /**
+             * @brief Set power supply state
+             * 
+             * This sets the power supply state for this component and all its children.
+             * 
+             * @param state Supply state
+             */
+            void power_supply_set_all(int state);
+
         protected:
             /**
              * @brief Get the report energy from childs object
@@ -516,10 +566,15 @@ namespace vp
             // Get instant power for this component and the whole hierarchy below him.
             double get_power_from_self_and_childs();
 
+            // Set power supply state
+            static void power_supply_sync(void *_this, int state);
+
             component &top;                                // Component containing the power component object
             vp::power::power_trace power_trace;            // Default power trace of this component
             std::vector<vp::power::power_trace *> traces;  // Vector of power traces of this component
+            std::vector<vp::power::power_source *> sources;  // Vector of power sources of this component
             power::engine *engine = NULL;                  // Power engine
+            vp::wire_slave<int> power_port;                // Slave port for setting power supply state
         };
 
 
@@ -542,6 +597,8 @@ namespace vp
              */
             engine(vp::component *top);
 
+            ~engine();
+
             /**
              * @brief Start power report generation
              * 
@@ -570,6 +627,8 @@ namespace vp
             std::vector<vp::power::power_trace *> traces; // Vector of all traces.
 
             vp::component *top;  // Top component of the simulated architecture
+
+            FILE *file; // File where the power reports are dumped
         };
 
     };
diff --git a/gvsoc/gvsoc/engine/include/vp/component.hpp b/gvsoc/gvsoc/engine/include/vp/component.hpp
index dd336a587..44b05070c 100644
--- a/gvsoc/gvsoc/engine/include/vp/component.hpp
+++ b/gvsoc/gvsoc/engine/include/vp/component.hpp
@@ -397,6 +397,7 @@ namespace vp {
   {
 
     friend class component_clock;
+    friend class vp::power::component_power;
 
   public:
     component(js::config *config);
@@ -410,6 +411,7 @@ namespace vp {
     virtual void quit(int status) {}
     virtual void pre_reset() {}
     virtual void reset(bool active) {}
+    virtual void power_supply_set(int state) {}
     virtual void load() {}
     virtual void elab();
     virtual void run() {}
@@ -568,6 +570,14 @@ namespace vp {
 
   vp::component *__gv_create(std::string config_path, struct gv_conf *gv_conf);
 
+  class top  // NOTE(review): the launcher casts the handle returned by __gv_create to this type - confirm against __gv_create
+  {
+  public:
+      component *top_instance;      // Top component; the launcher keeps it as its instance and configures traces through it
+      power::engine *power_engine;  // Power engine; NOTE(review): presumably the one serving top_instance - confirm
+  private:
+  };
+
 };  
 
 #endif
diff --git a/gvsoc/gvsoc/engine/include/vp/power/power_source.hpp b/gvsoc/gvsoc/engine/include/vp/power/power_source.hpp
index 10bb99434..bcea94c18 100644
--- a/gvsoc/gvsoc/engine/include/vp/power/power_source.hpp
+++ b/gvsoc/gvsoc/engine/include/vp/power/power_source.hpp
@@ -25,51 +25,78 @@
 #include "vp/vp_data.hpp"
 
 
+inline void vp::power::power_source::turn_on()
+{
+    // Switching the source on also enables its dynamic power
+    this->is_on = this->is_dynamic_power_on = true;
+    this->check();
+}
+
+
+inline void vp::power::power_source::turn_off()
+{
+    // Switching the source off also disables its dynamic power
+    this->is_on = this->is_dynamic_power_on = false;
+    this->check();
+}
+
+inline void vp::power::power_source::turn_dynamic_power_on()  // Enable dynamic power only
+{
+    this->is_dynamic_power_on = true;  // Only the dynamic-power flag changes, is_on is left as it is
+    this->check();                     // check() is not visible here; presumably re-evaluates the power reported by the source - TODO confirm
+}
+
+inline void vp::power::power_source::turn_dynamic_power_off()  // Disable dynamic power only
+{
+    this->is_dynamic_power_on = false;  // Only the dynamic-power flag changes, is_on is left as it is
+    this->check();                      // check() is not visible here; presumably re-evaluates the power reported by the source - TODO confirm
+}
+
 
 inline void vp::power::power_source::leakage_power_start()
 {
-    // Only start accounting leakage if not already done and if leakage is defined
-    if (!this->is_on && this->leakage != -1)
+    // Only start if leakage is defined
+    if (this->leakage != -1)
     {
-        this->trace->inc_leakage_power(this->leakage);
+        this->is_leakage_power_started = true;
+        this->check();
     }
-    this->is_on = true;
 }
 
 
 
 inline void vp::power::power_source::leakage_power_stop()
 {
-    // Only stop accounting leakage if not already done and if leakage is defined
-    if (this->is_on && this->leakage != -1)
+    // Only stop if leakage is defined
+    if (this->leakage != -1)
     {
-        this->trace->inc_leakage_power(-this->leakage);
+        this->is_leakage_power_started = false;
+        this->check();
     }
-    this->is_on = false;
 }
 
 
 
 inline void vp::power::power_source::dynamic_power_start()
 {
-    // Only start accounting background power if not already done and if it is is defined
-    if (!this->is_on && this->background_power != -1)
+    // Only start accounting background power if it is defined
+    if (this->background_power != -1)
     {
-        this->trace->inc_dynamic_power(this->background_power);
+        this->is_dynamic_power_started = true;
+        this->check();
     }
-    this->is_on = true;
 }
 
 
 
 inline void vp::power::power_source::dynamic_power_stop()
 {
-    // Only stop accounting background power if not already done and if it is is defined
-    if (this->is_on && this->background_power != -1)
+    // Only stop accounting background power if it is defined
+    if (this->background_power != -1)
     {
-        this->trace->inc_dynamic_power(-this->background_power);
+        this->is_dynamic_power_started = false;
+        this->check();
     }
-    this->is_on = false;
 }
 
 
@@ -77,7 +104,7 @@ inline void vp::power::power_source::dynamic_power_stop()
 inline void vp::power::power_source::account_energy_quantum()
 {
     // Only account energy is a quantum is defined
-    if (this->quantum != -1)
+    if (this->is_on && this->is_dynamic_power_on && this->quantum != -1)
     {
         this->trace->inc_dynamic_energy(this->quantum);
     }
diff --git a/gvsoc/gvsoc/engine/include/vp/power/power_trace.hpp b/gvsoc/gvsoc/engine/include/vp/power/power_trace.hpp
index 6d833599b..fe33eaa79 100644
--- a/gvsoc/gvsoc/engine/include/vp/power/power_trace.hpp
+++ b/gvsoc/gvsoc/engine/include/vp/power/power_trace.hpp
@@ -32,25 +32,43 @@ inline double vp::power::power_trace::get_power()
 
 
 
-inline double vp::power::power_trace::get_dynamic_energy_for_cycle()
+inline double vp::power::power_trace::get_quantum_power_for_cycle()
 {
     // First check if the current energy is for an old cycle
-    this->flush_dynamic_energy_for_cycle();
+    this->flush_quantum_power_for_cycle();
 
     // And return the current total
-    return this->dynamic_energy_for_cycle;
+    return this->quantum_power_for_cycle;
 }
 
 
+inline double vp::power::power_trace::get_quantum_energy_for_cycle()
+{
+    double power = this->get_quantum_power_for_cycle();
 
-inline void vp::power::power_trace::flush_dynamic_energy_for_cycle()
+    if (power != 0)
+    {
+        return power * this->top->get_period();
+    }
+
+    return 0;
+}
+
+
+
+inline void vp::power::power_trace::flush_quantum_power_for_cycle()
 {
     // Clear the current total if it is not for the current cycle
-    if (this->curent_cycle_timestamp < this->top->get_time())
+    if (this->quantum_power_for_cycle && this->curent_cycle_timestamp < this->top->get_time())
     {
-        this->curent_cycle_timestamp = this->top->get_time();
-        this->dynamic_energy_for_cycle = 0;
+        if (this->parent)
+        {
+            this->parent->inc_dynamic_power(-this->quantum_power_for_cycle);
+        }
+        this->quantum_power_for_cycle = 0;
     }
+
+    this->curent_cycle_timestamp = this->top->get_time();
 }
 
 
diff --git a/gvsoc/gvsoc/engine/include/vp/trace/event_dumper.hpp b/gvsoc/gvsoc/engine/include/vp/trace/event_dumper.hpp
index 4f5da18ea..6340ba80d 100644
--- a/gvsoc/gvsoc/engine/include/vp/trace/event_dumper.hpp
+++ b/gvsoc/gvsoc/engine/include/vp/trace/event_dumper.hpp
@@ -71,7 +71,7 @@ namespace vp {
   class Event_dumper
   {
   public:
-    Event_dumper(vp::component *comp) : comp(comp) {}
+    Event_dumper(vp::component *comp) : comp(comp) { this->user_vcd = NULL; }
     Event_trace *get_trace(string trace_name, string file_name, int width, bool is_real=false, bool is_string=false);
     Event_trace *get_trace_real(string trace_name, string file_name);
     Event_trace *get_trace_string(string trace_name, string file_name);
@@ -83,6 +83,7 @@ namespace vp {
   private:
     std::map<std::string, Event_trace *> event_traces;
     std::map<std::string, Event_file *> event_files;
+    gv::Vcd_user *user_vcd;
   };
 
   class Vcd_file : public Event_file
diff --git a/gvsoc/gvsoc/engine/python/gv/gvsoc.py b/gvsoc/gvsoc/engine/python/gv/gvsoc.py
index 04f900465..d711756d8 100644
--- a/gvsoc/gvsoc/engine/python/gv/gvsoc.py
+++ b/gvsoc/gvsoc/engine/python/gv/gvsoc.py
@@ -146,7 +146,8 @@ def conf(self):
 
     def __gen_debug_info(self, full_config, gvsoc_config):
         for binary in full_config.get('**/debug_binaries').get_dict():
-            if os.system('pulp-pc-info --file %s --all-file %s' % (binary.replace('.debugInfo', ''), binary)) != 0:
+            if os.system('gen-debug-info %s %s' % (binary.replace('.debugInfo', ''), binary)) != 0:
+            # if os.system('pulp-pc-info --file %s --all-file %s' % (binary.replace('.debugInfo', ''), binary)) != 0:
                 raise errors.InputError('Error while generating debug symbols information, make sure the toolchain and the binaries are accessible ')
 
 
diff --git a/gvsoc/gvsoc/engine/src/launcher.cpp b/gvsoc/gvsoc/engine/src/launcher.cpp
index 63908857c..1dc3b727a 100644
--- a/gvsoc/gvsoc/engine/src/launcher.cpp
+++ b/gvsoc/gvsoc/engine/src/launcher.cpp
@@ -43,6 +43,7 @@ class Gvsoc_launcher : public gv::Gvsoc
 
 private:
 
+    void *handler;
     vp::component *instance;
 };
 
@@ -53,19 +54,20 @@ gv::Gvsoc *gv::gvsoc_new()
 
 void Gvsoc_launcher::open(std::string config_path)
 {
-    this->instance = vp::__gv_create(config_path, NULL);
+    this->handler = vp::__gv_create(config_path, NULL);
+    this->instance = ((vp::top *)this->handler)->top_instance;
 
-    gv_start((void *)this->instance);
+    gv_start(this->handler);
 }
 
 void Gvsoc_launcher::close()
 {
-    gv_destroy((void *)this->instance);
+    gv_destroy(this->handler);
 }
 
 void Gvsoc_launcher::run()
 {
-    gv_step((void *)this->instance, 0);
+    gv_step(this->handler, 0);
 }
 
 int64_t Gvsoc_launcher::stop()
@@ -76,7 +78,7 @@ int64_t Gvsoc_launcher::stop()
 
 int64_t Gvsoc_launcher::step(int64_t duration)
 {
-    gv_step((void *)this->instance, duration);
+    gv_step(this->handler, duration);
     return 0;
 }
 
@@ -92,10 +94,10 @@ void Gvsoc_launcher::vcd_bind(gv::Vcd_user *user)
 
 void Gvsoc_launcher::event_add(std::string path, bool is_regex)
 {
-
+    this->instance->traces.get_trace_manager()->conf_trace(1, path, 1);
 }
 
 void Gvsoc_launcher::event_exclude(std::string path, bool is_regex)
 {
-
+    this->instance->traces.get_trace_manager()->conf_trace(1, path, 0);
 }
diff --git a/gvsoc/gvsoc/engine/src/power/component_power.cpp b/gvsoc/gvsoc/engine/src/power/component_power.cpp
index 310a00b7a..d8930b94c 100644
--- a/gvsoc/gvsoc/engine/src/power/component_power.cpp
+++ b/gvsoc/gvsoc/engine/src/power/component_power.cpp
@@ -41,6 +41,9 @@ void vp::power::component_power::build()
     {
         this->get_engine()->reg_trace(trace);
     }
+
+    this->power_port.set_sync_meth(&vp::power::component_power::power_supply_sync);
+    this->top.new_slave_port(this, "power_supply", &this->power_port);
 }
 
 
@@ -69,6 +72,8 @@ int vp::power::component_power::new_power_source(std::string name, power_source
 
     source->setup(VP_POWER_DEFAULT_TEMP, VP_POWER_DEFAULT_VOLT, VP_POWER_DEFAULT_FREQ);
 
+    this->sources.push_back(source);
+
     return 0;
 }
 
@@ -150,3 +155,51 @@ void vp::power::component_power::dump_child_traces(FILE *file, double total)
         x->power.dump(file, total);
     }
 }
+
+void vp::power::component_power::power_supply_sync(void *__this, int state)
+{
+    // Port callback: __this is the component_power object, forward the state to its whole hierarchy
+    static_cast<vp::power::component_power *>(__this)->power_supply_set_all(state);
+}
+
+
+void vp::power::component_power::power_supply_set_all(int state)
+{
+    // Apply a power supply state to this component, its sub-hierarchy and its sources.
+    // How the state value is interpreted by the code below:
+    //   3             -> enable dynamic power only
+    //   2 or above 3  -> disable dynamic power only
+    //   1             -> turn the sources on
+    //   anything else -> turn the sources off
+    // (turn_on/turn_off also switch the dynamic power along with the source)
+
+    // Let the owning component react to the new supply state first
+    this->top.power_supply_set(state);
+
+    // Then propagate the same state down the component hierarchy
+    for (auto &child : this->top.childs)
+    {
+        child->power.power_supply_set_all(state);
+    }
+
+    // Finally apply the state to every power source owned by this component
+    for (auto &source : this->sources)
+    {
+        if (state == 3)
+        {
+            source->turn_dynamic_power_on();
+        }
+        else if (state >= 2)
+        {
+            source->turn_dynamic_power_off();
+        }
+        else if (state == 1)
+        {
+            source->turn_on();
+        }
+        else
+        {
+            source->turn_off();
+        }
+    }
+}
diff --git a/gvsoc/gvsoc/engine/src/power/power_engine.cpp b/gvsoc/gvsoc/engine/src/power/power_engine.cpp
index c7661f6a9..19652c069 100644
--- a/gvsoc/gvsoc/engine/src/power/power_engine.cpp
+++ b/gvsoc/gvsoc/engine/src/power/power_engine.cpp
@@ -46,14 +46,11 @@ void vp::power::engine::start_capture()
 void vp::power::engine::stop_capture()
 {
     // When stopping, dump recursively all traces to a file
-    FILE *file = fopen("power_report.csv", "w");
-    if (file == NULL)
+
+    if (this->file)
     {
-        // vp_warning_always(&this->warning, "Failed to open power report file (path: %s)\n", "power_report.csv");
-        return;
+        this->top->dump_traces_recursive(file);
     }
-
-    this->top->dump_traces_recursive(file);
 }
 
 
@@ -64,4 +61,19 @@ vp::power::engine::engine(vp::component *top)
 
     // Declare power service, each component will ask the connection to it
     top->new_service("power", this);
+
+    this->file = fopen("power_report.csv", "w");
+    if (this->file == NULL)
+    {
+        //vp_warning_always(&this->warning, "Failed to open power report file (path: %s)\n", "power_report.csv");
+    }
+}
+
+
+vp::power::engine::~engine()
+{
+    if (this->file)
+    {
+        fclose(this->file);
+    }
 }
diff --git a/gvsoc/gvsoc/engine/src/power/power_source.cpp b/gvsoc/gvsoc/engine/src/power/power_source.cpp
index 579e27f07..c28ded02a 100644
--- a/gvsoc/gvsoc/engine/src/power/power_source.cpp
+++ b/gvsoc/gvsoc/engine/src/power/power_source.cpp
@@ -30,15 +30,15 @@ void vp::power::power_source::setup(double temp, double volt, double freq)
     // dynamic background power or leakage if they are defined, which is the case if they are not -1
     if (this->quantum != -1)
     {
-        this->quantum = this->table->get(temp, volt, freq);
+        this->quantum = this->dyn_table->get(temp, volt, freq);
     }
     if (this->background_power != -1)
     {
-        this->background_power = this->table->get(temp, volt, freq);
+        this->background_power = this->dyn_table->get(temp, volt, freq);
     }
     if (this->leakage != -1)
     {
-        this->leakage = this->table->get(temp, volt, freq);
+        this->leakage = this->leakage_table->get(temp, volt, freq);
     }
 }
 
@@ -130,7 +130,14 @@ int vp::power::power_source::init(component *top, std::string name, js::config *
                     return -1;
                 }
 
-                this->table = new Linear_table(values);
+                if (is_leakage)
+                {
+                    this->leakage_table = new Linear_table(values);
+                }
+                else
+                {
+                    this->dyn_table = new Linear_table(values);
+                }
             }
             else
             {
@@ -147,3 +154,44 @@ int vp::power::power_source::init(component *top, std::string name, js::config *
 
     return 0;  
 }
+
+
+void vp::power::power_source::check()
+{
+    // Compute the state each kind of power should currently be in. Both require the source to be
+    // on and the corresponding power to have been started, dynamic power must also be switched on.
+    const bool leakage_on = this->is_on && this->is_leakage_power_started;
+    const bool dynamic_on = this->is_on && this->is_dynamic_power_on && this->is_dynamic_power_started;
+
+    // Background dynamic power: when the state differs from the one last applied to the trace,
+    // add the background power if it is now on, or remove it if it is now off
+    if (dynamic_on != this->dynamic_power_is_on_sync)
+    {
+        const auto background_value = this->background_power;
+
+        // Nothing to account when the value is zero
+        if (background_value)
+        {
+            this->trace->inc_dynamic_power(dynamic_on ? background_value : -background_value);
+        }
+
+        // Remember the applied state so that the same transition is accounted only once
+        this->dynamic_power_is_on_sync = dynamic_on;
+    }
+
+    // Leakage power: same scheme as above, using the leakage value and its own record of the
+    // state last applied to the trace
+    if (leakage_on != this->leakage_power_is_on_sync)
+    {
+        const auto leakage_value = this->leakage;
+
+        // Nothing to account when the value is zero
+        if (leakage_value)
+        {
+            this->trace->inc_leakage_power(leakage_on ? leakage_value : -leakage_value);
+        }
+
+        // Remember the applied state so that the same transition is accounted only once
+        this->leakage_power_is_on_sync = leakage_on;
+    }
+}
diff --git a/gvsoc/gvsoc/engine/src/power/power_trace.cpp b/gvsoc/gvsoc/engine/src/power/power_trace.cpp
index 3bda6a8b4..c5e5001b8 100644
--- a/gvsoc/gvsoc/engine/src/power/power_trace.cpp
+++ b/gvsoc/gvsoc/engine/src/power/power_trace.cpp
@@ -28,7 +28,7 @@ int vp::power::power_trace::init(component *top, std::string name, vp::power::po
 {
     this->top = top;
     top->traces.new_trace_event_real(name, &this->trace);
-    this->dynamic_energy_for_cycle = 0;
+    this->quantum_power_for_cycle = 0;
     this->report_dynamic_energy = 0;
     this->report_leakage_energy = 0;
     this->curent_cycle_timestamp = 0;
@@ -37,7 +37,7 @@ int vp::power::power_trace::init(component *top, std::string name, vp::power::po
     if (parent == NULL)
     {
         vp::component *component = top->get_parent();
-        if (component)
+        if (component && component->get_path() != "")
         {
             parent = component->power.get_power_trace();
         }
@@ -73,10 +73,13 @@ void vp::power::power_trace::trace_handler(void *__this, vp::clock_event *event)
 
 void vp::power::power_trace::report_start()
 {
+    this->account_dynamic_power();
+    this->account_leakage_power();
+
     // Since the report start may be triggered in the middle of several events
     // for power consumptions, include what has already be accounted
     // in the same cycle.
-    this->report_dynamic_energy = this->get_dynamic_energy_for_cycle();
+    this->report_dynamic_energy = this->get_quantum_energy_for_cycle();
     this->report_leakage_energy = 0;
     this->report_start_timestamp = this->top->get_time();
 }
@@ -93,12 +96,9 @@ void vp::power::power_trace::get_report_energy(double *dynamic, double *leakage)
 
 void vp::power::power_trace::get_report_power(double *dynamic, double *leakage)
 {
-    double childs_dynamic = 0, childs_leakage = 0;
-
     // To get the power on the report window, we just get the total energy and divide by the window duration
-    this->top->power.get_report_energy_from_childs(&childs_dynamic, &childs_leakage);
-    *dynamic = (childs_dynamic + this->get_report_dynamic_energy()) / (this->top->get_time() - this->report_start_timestamp);
-    *leakage = (childs_leakage + this->get_report_leakage_energy()) / (this->top->get_time() - this->report_start_timestamp);
+    *dynamic = (this->get_report_dynamic_energy()) / (this->top->get_time() - this->report_start_timestamp);
+    *leakage = (this->get_report_leakage_energy()) / (this->top->get_time() - this->report_start_timestamp);
 }
 
 
@@ -127,43 +127,25 @@ void vp::power::power_trace::dump_vcd_trace()
     if (this->top->get_path() == "")
         return;
 
-    double power = 0.0;
-
     // To dump the VCD trace, we need to compute the instant power, since this is what is reported.
     // This is easy for background and leakage power. For enery quantum, we get the amount of energy for the current
     // cycle and compute the instant power using the clock engine period.
 
-    // Some component do not have clocks. They cannot use energy quantum but they can still use background
-    // power and leakage
-    if (this->top->get_clock())
-    {
-        int64_t period = this->top->get_period();
-        if (period != 0)
-        {
-            power += this->get_dynamic_energy_for_cycle() / period;
-        }
-    }
+    double quantum_power = this->get_quantum_power_for_cycle();
     double power_background = this->current_dynamic_power + this->current_leakage_power;
 
     // Also account the power from childs since VCD traces are hierarchical
-    double childs_power = this->top->power.get_power_from_childs();
-    this->current_power = power + power_background + childs_power;
+    this->current_power = quantum_power + power_background;
 
     // Dump the instant power to trace
     this->trace.event_real(current_power);
 
     // If there was a contribution from energy quantum, schedule an event in the next cycle so that we dump again 
     // the trace since teh quantum implicitely disappears and overal power is modified
-    if (!this->trace_event->is_enqueued() && power > 0)
+    if (!this->trace_event->is_enqueued() && quantum_power > 0)
     {
         this->top->event_enqueue(this->trace_event, 1);
     }
-
-    // Notify the parent that this trace was dumped so that the upper traces can be dumped as well
-    if (this->parent)
-    {
-        this->parent->dump_vcd_trace();
-    }
 }
 
 
@@ -213,15 +195,26 @@ void vp::power::power_trace::account_leakage_power()
 
 void vp::power::power_trace::inc_dynamic_energy(double quantum)
 {
+    if (this->top->get_period() == 0)
+    {
+        return;
+    }
+
     // Since we need to account the energy for the current amount of the cycle, check if it needs to be flushed
-    this->flush_dynamic_energy_for_cycle();
+    this->flush_quantum_power_for_cycle();
 
     // Then account it to both the total amount and to the cycle amount
-    this->dynamic_energy_for_cycle += quantum;
+    double power = quantum / this->top->get_period();
+    this->quantum_power_for_cycle += power;
     this->report_dynamic_energy += quantum;
 
-    // Redump VCD trace since teh instant power is impacted
+    // Redump VCD trace since the instant power is impacted
     this->dump_vcd_trace();
+
+    if (this->parent)
+    {
+        this->parent->inc_dynamic_power(power);
+    }
 }
 
 
@@ -235,14 +228,24 @@ void vp::power::power_trace::inc_dynamic_power(double power_incr)
     this->account_dynamic_power();
     this->current_dynamic_power += power_incr;
 
-    // Redump VCD trace since teh instant power is impacted
+    // Redump VCD trace since the instant power is impacted
     this->dump_vcd_trace();
+
+    if (this->parent)
+    {
+        this->parent->inc_dynamic_power(power_incr);
+    }
 }
 
 
 
 void vp::power::power_trace::inc_leakage_power(double power_incr)
 {
+    // TODO this is wasting time and should be removed once fake components such as the time domain and the trace
+    // domain are not in the component hierarchy anymore
+    if (this->top->get_path() == "")
+        return;
+
     // Leakage and dynamic are handled differently since they are reported separately,
     // In both cases, first compute the power on current period, start a new one,
     // and change the power so that it is constant over the period, to properly
@@ -250,6 +253,11 @@ void vp::power::power_trace::inc_leakage_power(double power_incr)
     this->account_leakage_power();
     this->current_leakage_power += power_incr;
 
-    // Redump VCD trace since teh instant power is impacted
+    // Redump VCD trace since the instant power is impacted
     this->dump_vcd_trace();
+
+    if (this->parent)
+    {
+        this->parent->inc_leakage_power(power_incr);
+    }
 }
diff --git a/gvsoc/gvsoc/engine/src/trace/event.cpp b/gvsoc/gvsoc/engine/src/trace/event.cpp
index ae0f0475a..579880226 100644
--- a/gvsoc/gvsoc/engine/src/trace/event.cpp
+++ b/gvsoc/gvsoc/engine/src/trace/event.cpp
@@ -102,6 +102,11 @@ vp::Event_trace *vp::Event_dumper::get_trace(string trace_name, string file_name
 
     trace = new Event_trace(trace_name, event_file, width, is_real, is_string);
     event_traces[trace_name] = trace;
+
+    if (this->user_vcd)
+    {
+      trace->set_vcd_user(this->user_vcd);
+    }
   }
 
   return trace;
@@ -135,6 +140,8 @@ void vp::Event_dumper::close()
 
 void vp::Event_dumper::set_vcd_user(gv::Vcd_user *user)
 {
+    this->user_vcd = user;
+
     for (auto const& x : event_traces)
     {
       x.second->set_vcd_user(user);
diff --git a/gvsoc/gvsoc/engine/src/vp.cpp b/gvsoc/gvsoc/engine/src/vp.cpp
index 271e0078a..f733d8e3d 100644
--- a/gvsoc/gvsoc/engine/src/vp.cpp
+++ b/gvsoc/gvsoc/engine/src/vp.cpp
@@ -67,6 +67,7 @@ char vp_error[VP_ERROR_SIZE];
 static Gv_proxy *proxy = NULL;
 
 
+
 uint64_t vp::reg::get_field(int offset, int width)
 {
     uint64_t value = 0;
@@ -329,6 +330,7 @@ void vp::component_clock::clk_reg(component *_this, component *clock)
     }
 }
 
+
 void vp::component::reset_all(bool active, bool from_itf)
 {
     // Small hack to not propagate the reset from top level if the reset has
@@ -1686,12 +1688,15 @@ vp::component *vp::__gv_create(std::string config_path, struct gv_conf *gv_conf)
 
     vp::component *instance = constructor(js_config);
 
-    new vp::power::engine(instance);
+    vp::top *top = new vp::top();
+
+    top->top_instance = instance;
+    top->power_engine = new vp::power::engine(instance);
 
     instance->set_vp_config(gv_config);
     instance->set_gv_conf(gv_conf);
 
-    return instance;
+    return (vp::component *)top;
 }
 
 
@@ -1708,7 +1713,8 @@ extern "C" void gv_destroy(void *arg)
 
 extern "C" void gv_start(void *arg)
 {
-    vp::component *instance = (vp::component *)arg;
+    vp::top *top = (vp::top *)arg;
+    vp::component *instance = (vp::component *)top->top_instance;
 
     instance->pre_pre_build();
     instance->pre_build();
@@ -1736,7 +1742,8 @@ extern "C" void gv_start(void *arg)
 
 extern "C" void gv_step(void *arg, int64_t timestamp)
 {
-    vp::component *instance = (vp::component *)arg;
+    vp::top *top = (vp::top *)arg;
+    vp::component *instance = (vp::component *)top->top_instance;
 
     instance->step(timestamp);
 }
@@ -1744,7 +1751,8 @@ extern "C" void gv_step(void *arg, int64_t timestamp)
 
 extern "C" int64_t gv_time(void *arg)
 {
-    vp::component *instance = (vp::component *)arg;
+    vp::top *top = (vp::top *)arg;
+    vp::component *instance = (vp::component *)top->top_instance;
 
     return instance->get_time_engine()->get_next_event_time();
 }
@@ -1989,9 +1997,10 @@ vp::time_event *vp::time_scheduler::enqueue(time_event *event, int64_t time)
 
 
 
-extern "C" int gv_run(void *_instance)
+extern "C" int gv_run(void *arg)
 {
-    vp::component *instance = (vp::component *)_instance;
+    vp::top *top = (vp::top *)arg;
+    vp::component *instance = (vp::component *)top->top_instance;
 
     if (!proxy)
     {
@@ -2014,9 +2023,10 @@ extern "C" void gv_init(struct gv_conf *gv_conf)
 }
 
 
-extern "C" void gv_stop(void *_instance, int retval)
+extern "C" void gv_stop(void *arg, int retval)
 {
-    vp::component *instance = (vp::component *)_instance;
+    vp::top *top = (vp::top *)arg;
+    vp::component *instance = (vp::component *)top->top_instance;
 
     if (proxy)
     {
@@ -2024,6 +2034,8 @@ extern "C" void gv_stop(void *_instance, int retval)
     }
 
     instance->stop();
+
+    delete top->power_engine;
 }
 
 
@@ -2059,6 +2071,7 @@ void vp::fatal(const char *fmt, ...)
 
 extern "C" void *gv_chip_pad_bind(void *handle, char *name, int ext_handle)
 {
-    vp::component *instance = (vp::component *)handle;
+    vp::top *top = (vp::top *)handle;
+    vp::component *instance = (vp::component *)top->top_instance;
     return instance->external_bind(name, "", (void *)(long)ext_handle);
 }
diff --git a/gvsoc/gvsoc/engine/vp/trace_domain_impl.cpp b/gvsoc/gvsoc/engine/vp/trace_domain_impl.cpp
index 76f4757b3..9cd9aa576 100644
--- a/gvsoc/gvsoc/engine/vp/trace_domain_impl.cpp
+++ b/gvsoc/gvsoc/engine/vp/trace_domain_impl.cpp
@@ -474,7 +474,7 @@ void trace_domain::conf_trace(int event, std::string path_str, bool enabled)
     if (trace != NULL)
     {
         if (event)
-        {              
+        {
             if (enabled)
             {
                 vp::Event_trace *event_trace;
diff --git a/gvsoc/gvsoc/models/cpu/iss/vp/include/iss_wrapper.hpp b/gvsoc/gvsoc/models/cpu/iss/vp/include/iss_wrapper.hpp
index f7e234e1c..9ae3c3fe4 100644
--- a/gvsoc/gvsoc/models/cpu/iss/vp/include/iss_wrapper.hpp
+++ b/gvsoc/gvsoc/models/cpu/iss/vp/include/iss_wrapper.hpp
@@ -113,6 +113,7 @@ class iss_wrapper : public vp::component, vp::Gdbserver_core
 
   vp::wire_slave<int>      irq_req_itf;
   vp::wire_master<int>     irq_ack_itf;
+  vp::wire_master<bool>     busy_itf;
 
   vp::wire_master<bool>    flush_cache_req_itf;
   vp::wire_slave<bool>     flush_cache_ack_itf;
@@ -142,8 +143,7 @@ class iss_wrapper : public vp::component, vp::Gdbserver_core
   vp::reg_1     do_step;
 
   std::vector<vp::power::power_source> insn_groups_power;
-  vp::power::power_source clock_gated_power;
-  vp::power::power_source leakage_power;
+  vp::power::power_source background_power;
 
   vp::trace     state_event;
   vp::trace     pc_trace_event;
diff --git a/gvsoc/gvsoc/models/cpu/iss/vp/src/iss_wrapper.cpp b/gvsoc/gvsoc/models/cpu/iss/vp/src/iss_wrapper.cpp
index 69df9a4e0..9825fa0df 100644
--- a/gvsoc/gvsoc/models/cpu/iss/vp/src/iss_wrapper.cpp
+++ b/gvsoc/gvsoc/models/cpu/iss/vp/src/iss_wrapper.cpp
@@ -287,6 +287,11 @@ void iss_wrapper::clock_sync(void *__this, bool active)
 
   _this->clock_active = active;
 
+  if (_this->busy_itf.is_bound())
+  {
+    _this->busy_itf.sync(active);
+  }
+
   // TODO this could be better handler is the clock would be taken into
   // account in the core state machine
   uint8_t value = active && _this->is_active_reg.get();
@@ -1330,8 +1335,7 @@ int iss_wrapper::build()
     this->insn_groups_power.resize(1);
     power.new_power_source("power_insn", &this->insn_groups_power[0], this->get_js_config()->get("**/insn"));
   }
-  power.new_power_source("power_clock_gated", &clock_gated_power, this->get_js_config()->get("**/clock_gated"));
-  power.new_power_source("leakage", &leakage_power, this->get_js_config()->get("**/leakage"));
+  power.new_power_source("background", &background_power, this->get_js_config()->get("**/power_models/background"));
 
   data.set_resp_meth(&iss_wrapper::data_response);
   data.set_grant_meth(&iss_wrapper::data_grant);
@@ -1354,6 +1358,8 @@ int iss_wrapper::build()
   new_slave_port("irq_req", &irq_req_itf);
   new_master_port("irq_ack", &irq_ack_itf);
 
+  new_master_port("busy", &busy_itf);
+
   fetchen_itf.set_sync_meth(&iss_wrapper::fetchen_sync);
   new_slave_port("fetchen", &fetchen_itf);
 
@@ -1424,14 +1430,6 @@ void iss_wrapper::start()
     iss_register_debug_info(this, x->get_str().c_str());
   }
 
-  if (this->get_js_config()->get("**/binaries") != NULL)
-  {
-    for (auto x:this->get_js_config()->get("**/binaries")->get_elems())
-    {
-      this->binaries_trace_event.event_string("static enable " + x->get_str());
-    }
-  }
-
   trace.msg("ISS start (fetch: %d, is_active: %d, boot_addr: 0x%lx)\n", fetch_enable_reg.get(), is_active_reg.get(), get_config_int("boot_addr"));
 
 #ifdef USE_TRDB
@@ -1439,7 +1437,8 @@ void iss_wrapper::start()
   INIT_LIST_HEAD(&this->trdb_packet_list);
 #endif
 
-  this->leakage_power.leakage_power_start();
+  this->background_power.leakage_power_start();
+  this->background_power.dynamic_power_start();
 
   this->gdbserver = (vp::Gdbserver_engine *)this->get_service("gdbserver");
 
@@ -1495,6 +1494,14 @@ void iss_wrapper::reset(bool active)
       this->halted.set(true);
     }
 
+    if (this->get_js_config()->get("**/binaries") != NULL)
+    {
+      for (auto x:this->get_js_config()->get("**/binaries")->get_elems())
+      {
+        this->binaries_trace_event.event_string("static enable " + x->get_str());
+      }
+    }
+
     check_state();
   }
 }
diff --git a/gvsoc/gvsoc/models/devices/CMakeLists.txt b/gvsoc/gvsoc/models/devices/CMakeLists.txt
index d87ee0b38..cfb0dccc5 100644
--- a/gvsoc/gvsoc/models/devices/CMakeLists.txt
+++ b/gvsoc/gvsoc/models/devices/CMakeLists.txt
@@ -4,6 +4,7 @@ add_subdirectory(hyperbus)
 add_subdirectory(i2c)
 add_subdirectory(jtag)
 add_subdirectory(sound)
+add_subdirectory(gpio)
 add_subdirectory(spiflash)
 add_subdirectory(testbench)
 add_subdirectory(uart)
diff --git a/gvsoc/gvsoc/models/devices/gpio/CMakeLists.txt b/gvsoc/gvsoc/models/devices/gpio/CMakeLists.txt
new file mode 100644
index 000000000..f2aa7d662
--- /dev/null
+++ b/gvsoc/gvsoc/models/devices/gpio/CMakeLists.txt
@@ -0,0 +1,6 @@
+
+set(GPIO_PREFIX "devices/gpio")
+
+vp_model(NAME fxl6408
+         PREFIX ${GPIO_PREFIX}
+         SOURCES "fxl6408.cpp")
diff --git a/gvsoc/gvsoc/models/devices/gpio/fxl6408.cpp b/gvsoc/gvsoc/models/devices/gpio/fxl6408.cpp
new file mode 100644
index 000000000..f148aa47d
--- /dev/null
+++ b/gvsoc/gvsoc/models/devices/gpio/fxl6408.cpp
@@ -0,0 +1,471 @@
+/*
+ * Copyright (C) 2020 GreenWaves Technologies
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* 
+ * Authors: Germain Haugou, GreenWaves Technologies (germain.haugou@greenwaves-technologies.com)
+ */
+
+
+#include <vp/vp.hpp>
+#include <vp/itf/i2c.hpp>
+
+
+
+typedef enum
+{
+  I2C_STATE_WAIT_START,
+  I2C_STATE_WAIT_ADDRESS,
+  I2C_STATE_GET_DATA,
+  I2C_STATE_SAMPLE_DATA,
+  I2C_STATE_ACK,
+  I2C_STATE_READ_ACK
+} I2c_state_e;
+
+
+class Fxl6408 : public vp::component
+{
+public:
+    Fxl6408(js::config *config);
+
+    int build();
+
+protected:
+    static void i2c_sync(void *__this, int scl, int sda);
+    void i2c_start(unsigned int address, bool is_read);
+    void i2c_handle_byte(uint8_t byte);
+    void i2c_stop();
+    void i2c_get_data();
+    void i2c_send_byte(uint8_t byte);
+
+    void handle_reg_write(uint8_t address, uint8_t value);
+    uint8_t handle_reg_read(uint8_t address);
+
+    void start();
+
+    vp::trace trace;
+    vp::i2c_master i2c_itf;
+
+    unsigned int device_address;
+
+    bool i2c_being_addressed;
+    unsigned int i2c_address;
+    uint8_t i2c_pending_data;
+    bool i2c_is_read;
+    I2c_state_e i2c_state;
+    int i2c_pending_bits;
+    int i2c_prev_sda;
+    int i2c_prev_scl;
+    unsigned int i2c_pending_send_byte;
+    uint8_t reg_address;
+    bool waiting_reg_address;
+
+    uint8_t device_id;
+    uint8_t io_dir;
+    uint8_t output_state;
+    uint8_t output_high_z;
+    uint8_t input_default_state;
+    uint8_t pull_enable;
+    uint8_t pull_down_up;
+    uint8_t input_status;
+    uint8_t interrupt_mask;
+    uint8_t interrupt_status;
+};
+
+
+Fxl6408::Fxl6408(js::config *config)
+    : vp::component(config)
+{
+}
+
+
+void Fxl6408::start()
+{
+    this->i2c_itf.sync(1, 1);
+}
+
+
+void Fxl6408::i2c_sync(void *__this, int scl, int sda)
+{
+    Fxl6408 *_this = (Fxl6408 *)__this;
+
+    _this->trace.msg(vp::trace::LEVEL_TRACE, "I2C sync (scl: %d, sda: %d)\n", scl, sda);
+
+    int sdo = 1;
+
+    if (scl == 1 && _this->i2c_prev_sda != sda)
+    {
+        if (_this->i2c_prev_sda == 1)
+        {
+            _this->trace.msg(vp::trace::LEVEL_TRACE, "Detected start\n");
+
+            _this->i2c_state = I2C_STATE_WAIT_ADDRESS;
+            _this->i2c_address = 0;
+            _this->i2c_pending_bits = 8;
+        }
+        else
+        {
+            _this->i2c_state = I2C_STATE_WAIT_START;
+            _this->i2c_stop();
+        }
+        goto end;
+    }
+
+    if (!_this->i2c_prev_scl && scl)
+    {
+        switch (_this->i2c_state)
+        {
+            case I2C_STATE_WAIT_START:
+            {
+                sdo = 1;
+                break;
+            }
+
+            case I2C_STATE_WAIT_ADDRESS:
+            {
+                if (_this->i2c_pending_bits > 1)
+                {
+                    _this->i2c_address = (_this->i2c_address << 1) | sda;
+                    _this->trace.msg(vp::trace::LEVEL_TRACE, "Received address bit (bit: %d, address: 0x%x, pending_bits: %d)\n", sda, _this->i2c_address, _this->i2c_pending_bits);
+                }
+                else
+                {
+                    _this->i2c_is_read = sda;
+                }
+                _this->i2c_pending_bits--;
+                if (_this->i2c_pending_bits == 0)
+                {
+                    _this->i2c_start(_this->i2c_address, _this->i2c_is_read);
+                    _this->i2c_state = I2C_STATE_ACK;
+                    _this->i2c_pending_bits = 8;
+                }
+                break;
+            }
+
+            case I2C_STATE_SAMPLE_DATA:
+            {
+                _this->i2c_pending_data = (_this->i2c_pending_data << 1) | sda;
+                _this->trace.msg(vp::trace::LEVEL_TRACE, "Sampling data (bit: %d, pending_value: 0x%x, pending_bits: %d)\n", sda, _this->i2c_pending_data, _this->i2c_pending_bits);
+                _this->i2c_pending_bits--;
+                if (_this->i2c_pending_bits == 0)
+                {
+                    _this->i2c_pending_bits = 8;
+                    _this->i2c_handle_byte(_this->i2c_pending_data);
+                    _this->i2c_state = I2C_STATE_ACK;
+                }
+                break;
+            }
+
+            case I2C_STATE_ACK: {
+                _this->trace.msg(vp::trace::LEVEL_TRACE, "Ack (being_addressed: %d)\n", _this->i2c_being_addressed);
+                if (_this->i2c_being_addressed)
+                {
+                    if (_this->i2c_is_read)
+                    {
+                        _this->i2c_state = I2C_STATE_GET_DATA;
+                        _this->i2c_pending_bits = 8;
+                        _this->i2c_get_data();
+                    }
+                    else
+                    {
+                        _this->i2c_state = I2C_STATE_SAMPLE_DATA;
+                    }
+                }
+                else
+                {
+                    _this->i2c_state = I2C_STATE_WAIT_START;
+                }
+
+                break;
+            }
+
+            case I2C_STATE_READ_ACK: {
+                _this->i2c_state = I2C_STATE_WAIT_START;
+                break;
+            }
+        }
+    }
+
+    if (_this->i2c_prev_scl && !scl)
+    {
+        switch (_this->i2c_state)
+        {
+            case I2C_STATE_ACK:
+            {
+                _this->trace.msg(vp::trace::LEVEL_TRACE, "Ack (being_addressed: %d)\n", _this->i2c_being_addressed);
+                sdo = !_this->i2c_being_addressed;
+                break;
+            }
+
+            case I2C_STATE_READ_ACK:
+            {
+                _this->trace.msg(vp::trace::LEVEL_TRACE, "Read ack\n");
+                sdo = 0;
+                break;
+            }
+
+            case I2C_STATE_GET_DATA:
+            {
+                sdo = (_this->i2c_pending_send_byte >> 7) & 1;
+                _this->trace.msg(vp::trace::LEVEL_TRACE, "Sending bit (bit: %d, pending_value: 0x%x, pending_bits: %d)\n", sdo, _this->i2c_pending_send_byte, _this->i2c_pending_bits);
+                _this->i2c_pending_send_byte <<= 1;
+                _this->i2c_pending_bits--;
+                if (_this->i2c_pending_bits == 0)
+                {
+                    _this->i2c_state = I2C_STATE_READ_ACK;
+                }
+                break;
+            }
+        }
+    }
+
+end:
+    if (_this->i2c_prev_scl && !scl)
+    {
+        _this->trace.msg(vp::trace::LEVEL_TRACE, "Sync sda (value: %d)\n", sdo);
+        _this->i2c_itf.sync(1, sdo);
+    }
+    _this->i2c_prev_sda = sda;
+    _this->i2c_prev_scl = scl;
+}
+
+void Fxl6408::i2c_start(unsigned int address, bool is_read)
+{
+    this->trace.msg(vp::trace::LEVEL_TRACE, "Received header (address: 0x%x, is_read: %d)\n", address, is_read);
+
+    this->i2c_being_addressed = address == this->device_address;
+    if (this->i2c_being_addressed && is_read)
+    {
+        this->i2c_send_byte(this->handle_reg_read(this->reg_address));
+    }
+}
+
+void Fxl6408::i2c_handle_byte(uint8_t byte)
+{
+    this->trace.msg(vp::trace::LEVEL_TRACE, "Handle byte (value: 0x%x)\n", byte);
+
+    if (this->waiting_reg_address)
+    {
+        this->reg_address = byte;
+        this->waiting_reg_address = false;
+    }
+    else
+    {
+        this->handle_reg_write(this->reg_address, byte);
+        this->waiting_reg_address = true;
+    }
+}
+
+// Stop condition: close the transaction so that the next written byte is decoded as a register address.
+void Fxl6408::i2c_stop()
+{
+    this->trace.msg(vp::trace::LEVEL_TRACE, "Received stop bit\n");
+    this->waiting_reg_address = true; // otherwise an address-only write (register select before a read) leaves the flag cleared
+}
+void Fxl6408::i2c_get_data()
+{
+    this->trace.msg(vp::trace::LEVEL_TRACE, "Getting data\n");
+}
+
+void Fxl6408::i2c_send_byte(uint8_t byte)
+{
+  this->i2c_pending_send_byte = byte;
+}
+
+
+// Store a value written over I2C into the register selected by its address.
+// Unknown addresses only raise a warning, the value is then dropped.
+void Fxl6408::handle_reg_write(uint8_t address, uint8_t value)
+{
+    this->trace.msg(vp::trace::LEVEL_TRACE, "Register write (address: 0x%x, value: 0x%x)\n", address, value);
+
+    switch (address)
+    {
+        case 0x01:
+        {
+            this->trace.msg(vp::trace::LEVEL_INFO, "Writing register (name: %s, value: 0x%x)\n", "Device ID & Ctrl", value);
+            this->device_id = value;
+            break;
+        }
+        case 0x03:
+        {
+            this->trace.msg(vp::trace::LEVEL_INFO, "Writing register (name: %s, value: 0x%x)\n", "IO Direction", value);
+            this->io_dir = value;
+            break;
+        }
+        case 0x05:
+        {
+            this->trace.msg(vp::trace::LEVEL_INFO, "Writing register (name: %s, value: 0x%x)\n", "Output State", value);
+            this->output_state = value;
+            break;
+        }
+        case 0x07:
+        {
+            this->trace.msg(vp::trace::LEVEL_INFO, "Writing register (name: %s, value: 0x%x)\n", "Output High-Z", value);
+            this->output_high_z = value;
+            break;
+        }
+        case 0x09:
+        {
+            this->trace.msg(vp::trace::LEVEL_INFO, "Writing register (name: %s, value: 0x%x)\n", "Input Default State", value);
+            this->input_default_state = value;
+            break;
+        }
+        case 0x0B:
+        {
+            this->trace.msg(vp::trace::LEVEL_INFO, "Writing register (name: %s, value: 0x%x)\n", "Pull Enable", value);
+            this->pull_enable = value;
+            break;
+        }
+        case 0x0D:
+        {
+            this->trace.msg(vp::trace::LEVEL_INFO, "Writing register (name: %s, value: 0x%x)\n", "Pull-Down/Pull-Up", value);
+            this->pull_down_up = value;
+            break;
+        }
+        case 0x0F:
+        {
+            this->trace.msg(vp::trace::LEVEL_INFO, "Writing register (name: %s, value: 0x%x)\n", "Input Status", value);
+            this->input_status = value;
+            break;
+        }
+        case 0x11:
+        {
+            this->trace.msg(vp::trace::LEVEL_INFO, "Writing register (name: %s, value: 0x%x)\n", "Interrupt Mask", value);
+            this->interrupt_mask = value;
+            break;
+        }
+        case 0x13:
+        {
+            this->trace.msg(vp::trace::LEVEL_INFO, "Writing register (name: %s, value: 0x%x)\n", "Interrupt Status", value);
+            this->interrupt_status = value;
+            break;
+        }
+        default:
+            this->trace.force_warning("Writing invalid register (address: 0x%x)\n", address);
+            break;
+    }
+}
+
+uint8_t Fxl6408::handle_reg_read(uint8_t address)
+{
+    this->trace.msg(vp::trace::LEVEL_DEBUG, "Register read (address: 0x%x)\n", address);
+
+    uint8_t value = 0xFF;
+
+    switch (address)
+    {
+        case 0x01:
+        {
+            value = this->device_id;
+            this->trace.msg(vp::trace::LEVEL_INFO, "Reading register (name: %s, value: 0x%x)\n", "Device ID & Ctrl", value);
+            break;
+        }
+        case 0x03:
+        {
+            value = this->io_dir;
+            this->trace.msg(vp::trace::LEVEL_INFO, "Reading register (name: %s, value: 0x%x)\n", "IO Direction", value);
+            break;
+        }
+        case 0x05:
+        {
+            value = this->output_state;
+            this->trace.msg(vp::trace::LEVEL_INFO, "Reading register (name: %s, value: 0x%x)\n", "Output State", value);
+            break;
+        }
+        case 0x07:
+        {
+            value = this->output_high_z;
+            this->trace.msg(vp::trace::LEVEL_INFO, "Reading register (name: %s, value: 0x%x)\n", "Output High-Z", value);
+            break;
+        }
+        case 0x09:
+        {
+            value = this->input_default_state;
+            this->trace.msg(vp::trace::LEVEL_INFO, "Reading register (name: %s, value: 0x%x)\n", "Input Default State", value);
+            break;
+        }
+        case 0x0B:
+        {
+            value = this->pull_enable;
+            this->trace.msg(vp::trace::LEVEL_INFO, "Reading register (name: %s, value: 0x%x)\n", "Pull Enable", value);
+            break;
+        }
+        case 0x0D:
+        {
+            value = this->pull_down_up;
+            this->trace.msg(vp::trace::LEVEL_INFO, "Reading register (name: %s, value: 0x%x)\n", "Pull-Down/Pull-Up", value);
+            break;
+        }
+        case 0x0F:
+        {
+            value = this->input_status;
+            this->trace.msg(vp::trace::LEVEL_INFO, "Reading register (name: %s, value: 0x%x)\n", "Input Status", value);
+            break;
+        }
+        case 0x11:
+        {
+            value = this->interrupt_mask;
+            this->trace.msg(vp::trace::LEVEL_INFO, "Reading register (name: %s, value: 0x%x)\n", "Interrupt Mask", value);
+            break;
+        }
+        case 0x13:
+        {
+            value = this->interrupt_status;
+            this->trace.msg(vp::trace::LEVEL_INFO, "Reading register (name: %s, value: 0x%x)\n", "Interrupt Status", value);
+            break;
+        }
+        default:
+            this->trace.force_warning("Reading invalid register (address: 0x%x)\n", address);
+            break;
+    }
+
+    return value;
+}
+
+
+// Declare the trace and the I2C port, then set the bus decoder and the registers to their initial values.
+int Fxl6408::build()
+{
+    traces.new_trace("trace", &trace, vp::DEBUG);
+    this->i2c_itf.set_sync_meth(&Fxl6408::i2c_sync);
+    this->new_master_port("i2c", &this->i2c_itf);
+
+    this->i2c_state = I2C_STATE_WAIT_START;
+    this->i2c_prev_sda = 1;
+    this->i2c_prev_scl = 1;
+    this->i2c_being_addressed = false;
+    this->device_address = 0x43;
+    this->waiting_reg_address = true;
+    this->reg_address = 0; // defined value: a read can be issued before any register has been selected
+
+    this->device_id           = 0xC2;
+    this->io_dir              = 0x00;
+    this->output_state        = 0x00;
+    this->output_high_z       = 0xFF;
+    this->input_default_state = 0x00;
+    this->pull_enable         = 0xFF;
+    this->pull_down_up        = 0x00;
+    this->input_status        = 0xFF;
+    this->interrupt_mask      = 0x00;
+    this->interrupt_status    = 0xFF;
+
+    return 0;
+}
+
+extern "C" vp::component *vp_constructor(js::config *config)
+{
+    return new Fxl6408(config);
+}
diff --git a/gvsoc/gvsoc/models/devices/i2c/helper/i2c_helper.cpp b/gvsoc/gvsoc/models/devices/i2c/helper/i2c_helper.cpp
index 22ed95106..7f024bb02 100644
--- a/gvsoc/gvsoc/models/devices/i2c/helper/i2c_helper.cpp
+++ b/gvsoc/gvsoc/models/devices/i2c/helper/i2c_helper.cpp
@@ -15,27 +15,29 @@
  * along with this program.  If not, see <https://www.gnu.org/licenses/>.
  */
 
+// The same library is compiled with the same flags for all gvsoc modes (normal, debug and system verilog).
+// Force trace support on so that traces are available in all of them.
+#define VP_TRACE_ACTIVE 1
+
 #include "i2c_helper.hpp"
 
 #include <stdio.h>
 #include <cassert>
 
-//#define I2C_HELPER_DEBUG(...)    (fprintf(stderr, "[I2C-HLP] " __VA_ARGS__))
-#define I2C_HELPER_DEBUG(...)
-
 namespace {
     void null_callback(i2c_operation_e id, i2c_status_e status, int value)
     {
         (void) id;
         (void) status;
         (void) value;
-        I2C_HELPER_DEBUG("null callback: id=%d, status=%d, value=%d\n",
-                id, status, value);
+        //this->trace.msg(vp::trace::LEVEL_TRACE, "null callback: id=%d, status=%d, value=%d\n",
+        //        id, status, value);
     }
 }
 
 I2C_helper::I2C_helper(vp::component* parent, vp::i2c_master* itf,
-        i2c_enqueue_event_fn_t enqueue_event, i2c_cancel_event_fn_t cancel_event) :
+        i2c_enqueue_event_fn_t enqueue_event, i2c_cancel_event_fn_t cancel_event,
+        std::string trace_path) :
     parent(parent),
     itf(itf),
     enqueue_event(enqueue_event),
@@ -51,28 +53,24 @@ I2C_helper::I2C_helper(vp::component* parent, vp::i2c_master* itf,
     is_starting(false),
     is_stopping(false),
     is_clock_enabled(false),
-    is_clock_low(false),
+    clock_value(1),
     is_driving_scl(false),
     is_driving_sda(false),
     cb_master_operation(null_callback),
     clock_event(parent, this, I2C_helper::st_clock_event_handler),
-    data_event(parent, this, I2C_helper::st_data_event_handler)
+    fsm_event(parent, this, I2C_helper::fsm_event_handler)
 {
     assert(NULL != this->parent);
     assert(NULL != this->itf);
 
-    I2C_HELPER_DEBUG("Initializing helper interface\n");
-}
+    parent->traces.new_trace(trace_path + "/i2c_helper", &this->trace, vp::DEBUG);
 
-void I2C_helper::st_data_event_handler(void* __this, vp::clock_event* event)
-{
-    assert(NULL != __this);
-    assert(NULL != event);
+    this->trace.msg(vp::trace::LEVEL_TRACE, "Initializing helper interface\n");
 
-    I2C_HELPER_DEBUG("st_data_event_handler: none\n");
-    I2C_helper* _this = (I2C_helper*) __this;
-    _this->desired_sda = _this->expected_bit_value;
-    _this->sync_pins();
+    this->pending_data_bits = 0;
+    this->fsm_waiting = false;
+    this->input_scl = 1;
+    this->input_sda = 1;
 }
 
 void I2C_helper::st_clock_event_handler(void* __this, vp::clock_event* event)
@@ -80,29 +78,70 @@ void I2C_helper::st_clock_event_handler(void* __this, vp::clock_event* event)
     assert(NULL != __this);
     assert(NULL != event);
 
-    I2C_HELPER_DEBUG("st_clock_event_handler: none\n");
     I2C_helper* _this = (I2C_helper*) __this;
     _this->clock_event_handler(event);
 }
 
+
+void I2C_helper::clock_toggle(void)
+{
+    if (this->is_clock_enabled)
+    {
+        this->clock_value ^= 1;
+
+        this->enqueue_clock_toggle();
+    }
+}
+
+void I2C_helper::enqueue_clock_toggle(void)
+{
+    if (this->is_clock_enabled)
+    {
+        if (this->clock_event.is_enqueued())
+        {
+            this->cancel_event(&this->clock_event);
+        }
+
+        const uint64_t delay = this->clock_value ? this->delay_low_ps : this->delay_high_ps;
+        this->enqueue_event(&this->clock_event, delay);
+    }
+}
+
+
+void I2C_helper::fsm_enqueue_event(int64_t delay)
+{
+    if (!this->fsm_event.is_enqueued())
+    {
+        this->enqueue_event(&this->fsm_event, delay);
+    }
+}
+
+
+void I2C_helper::fsm_event_handler(void *__this, vp::clock_event* event)
+{
+    I2C_helper* _this = (I2C_helper *) __this;
+
+    _this->fsm_waiting = false;
+    _this->fsm_step();
+}
+
+
 void I2C_helper::clock_event_handler(vp::clock_event* event)
 {
     assert(NULL != event);
 
-    I2C_HELPER_DEBUG("clock_event_handler: none\n");
     /* clock toggling */
     if (this->is_clock_enabled)
     {
-        if (this->is_clock_low)
+        this->trace.msg(vp::trace::LEVEL_TRACE, "Toggling clock (value: %d)\n", this->clock_value);
+        if (this->clock_value)
         {
-            I2C_HELPER_DEBUG("clock_event_handler: LOW (switch to high)\n");
             /* switch to high */
             this->desired_scl = 1;
             this->sync_pins();
         }
         else
         {
-            I2C_HELPER_DEBUG("clock_event_handler: HIGH (switch to low)\n");
             /* switch to low */
             this->desired_scl = 0;
             this->sync_pins();
@@ -112,29 +151,23 @@ void I2C_helper::clock_event_handler(vp::clock_event* event)
 
 void I2C_helper::register_callback(i2c_callback_t callback)
 {
-    I2C_HELPER_DEBUG("register_callback: none\n");
+    this->trace.msg(vp::trace::LEVEL_TRACE, "register_callback: none\n");
     this->cb_master_operation = callback;
 }
 
-void I2C_helper::update_pins(int scl, int sda)
-{
-    this->fsm_step(scl, sda);
-}
-
-
 void I2C_helper::sync_pins(void)
 {
-    int res_scl = this->is_driving_scl ? this->desired_scl : 1;
-    int res_sda = this->is_driving_sda ? this->desired_sda : 1;
+    int res_scl = this->internal_state != I2C_INTERNAL_IDLE ? this->desired_scl : 1;
+    int res_sda = this->internal_state != I2C_INTERNAL_IDLE ? this->desired_sda : 1;
 
-    I2C_HELPER_DEBUG("sync_pins: scl=%d, sda=%d\n", res_scl, res_sda);
+    this->trace.msg(vp::trace::LEVEL_TRACE, "Synchronizing pins (scl:%d, sda:%d)\n", res_scl, res_sda);
     this->itf->sync(res_scl, res_sda);
 }
 
 
 void I2C_helper::set_timings(uint64_t delay_low_ps, uint64_t delay_high_ps)
 {
-    I2C_HELPER_DEBUG("set_timings: delay_low_ps=%ld, delay_high_ps=%ld\n",
+    this->trace.msg(vp::trace::LEVEL_TRACE, "set_timings: delay_low_ps=%ld, delay_high_ps=%ld\n",
             delay_low_ps,
             delay_high_ps);
     this->delay_low_ps = delay_low_ps;
@@ -143,35 +176,28 @@ void I2C_helper::set_timings(uint64_t delay_low_ps, uint64_t delay_high_ps)
 
 void I2C_helper::send_start(void)
 {
-    I2C_HELPER_DEBUG("send_start: none\n");
-    if (!this->is_busy())
-    {
-        I2C_HELPER_DEBUG("send_start: sda=%d, scl=%d\n", this->sda, this->scl);
-        I2C_HELPER_DEBUG("send_start: this=%p\n", (void*) this);
-        this->is_driving_scl = true;
-        this->is_driving_sda = true;
-        this->desired_scl = 1;
-        this->desired_sda = 0;
-        this->sync_pins();
-        this->start_clock();
-    }
-    else
-    {
-        this->is_starting = true;
-    }
+    this->trace.msg(vp::trace::LEVEL_TRACE, "Request to send start\n");
+
+    this->is_starting = true;
+    this->fsm_enqueue_event(1);
+}
+
+void I2C_helper::release_pins(void)
+{
+
 }
 
-bool I2C_helper::is_busy(void)
+void I2C_helper::update_pins(int scl, int sda)
 {
-    return (this->internal_state != I2C_INTERNAL_IDLE);
+    this->input_scl = scl;
+    this->input_sda = sda;
+    this->fsm_enqueue_event(1);
 }
 
 void I2C_helper::send_address(int addr, bool is_write, bool is_10bits)
 {
-    I2C_HELPER_DEBUG("send_address: addr=%d, is_write=%s, is_10bits=%s\n",
-            addr,
-            is_write ? "true" : "false",
-            is_10bits ? "true" : "false");
+    this->trace.msg(vp::trace::LEVEL_TRACE, "Request to send address (addr: 0x%x, is_write:%d, is_10bits:%d)\n",
+            addr, is_write, is_10bits);
 
     //TODO support 10 bits mode
     assert(!is_10bits);
@@ -180,337 +206,274 @@ void I2C_helper::send_address(int addr, bool is_write, bool is_10bits)
     this->send_data(addr_byte);
 }
 
+void I2C_helper::send_ack(bool ack)
+{
+    // I2C_HELPER_DEBUG("send_ack: ack=%s\n", ack ? "true" : "false");
+    // //TODO
+    // this->expected_bit_value = ack ? 0 : 1;
+    // this->is_driving_sda = 1;
+    // this->enqueue_data_change(this->expected_bit_value);
+}
+
 void I2C_helper::send_data(int byte)
 {
-    I2C_HELPER_DEBUG("send_data: byte=%d\n", byte);
-    // TODO verify that we are in data mode ?
+    this->trace.msg(vp::trace::LEVEL_TRACE, "Request to send data (value: 0x%x)\n", byte);
 
-    /* load byte in sending queue */
-    for (int i = 7; i >= 0; i--)
+    if (this->pending_data_bits)
     {
-        int bit = (byte >> i) & 1;
-        I2C_HELPER_DEBUG("push to send bit queue:%d\n", bit);
-        this->send_bit_queue.push(bit);
+        this->trace.force_warning("Trying to send data while there is already one pending\n");
     }
-
-    /* enqueue data change if clock is low,
-     * else will be done automatically at next falling scl */
-    if (this->is_clock_low && this->internal_state == I2C_INTERNAL_DATA)
+    else
     {
-        I2C_HELPER_DEBUG("Directly enqueueing!\n");
-        this->expected_bit_value = this->send_bit_queue.front();
-        this->send_bit_queue.pop();
-        this->is_driving_sda = true;
-        this->enqueue_data_change(this->expected_bit_value);
+        this->pending_data = byte;
+        this->pending_data_bits = 8;
+        this->fsm_enqueue_event(1);
     }
 }
 
-void I2C_helper::send_ack(bool ack)
-{
-    I2C_HELPER_DEBUG("send_ack: ack=%s\n", ack ? "true" : "false");
-    //TODO
-    this->expected_bit_value = ack ? 0 : 1;
-    this->is_driving_sda = 1;
-    this->enqueue_data_change(this->expected_bit_value);
-}
-
 void I2C_helper::send_stop(void)
 {
-    I2C_HELPER_DEBUG("send_stop: none\n");
-    if(this->is_busy())
-    {
-        this->is_stopping = true;
-        this->is_driving_sda = true;
-        this->expected_bit_value = 0;
-        this->enqueue_data_change(this->expected_bit_value);
-    }
-}
-
-void I2C_helper::release_pins(void)
-{
-    // release everything that could hold the bus
-    this->empty_queues();
-    this->stop_clock();
+    this->trace.msg(vp::trace::LEVEL_TRACE, "Request to stop\n");
+    this->is_stopping = true;
 }
 
 void I2C_helper::start_clock(void)
 {
-    I2C_HELPER_DEBUG("Starting clock\n");
+    this->trace.msg(vp::trace::LEVEL_TRACE, "Starting clock\n");
 
     //start high then loop(low -> high)
     this->is_clock_enabled = true;
-    this->is_clock_low = false;
+    this->clock_value = this->scl ^ 1;
     this->enqueue_clock_toggle();
 }
 
 void I2C_helper::stop_clock(void)
 {
-    I2C_HELPER_DEBUG("Stop clock\n");
+    this->trace.msg(vp::trace::LEVEL_TRACE, "Stopping clock\n");
     this->is_clock_enabled =false;
     this->cancel_event(&this->clock_event);
+}
+
+std::string I2C_helper::get_state_name(i2c_internal_state_e state)
+{
+    switch (state)
+    {
+        case I2C_INTERNAL_IDLE: return "idle";
+        case I2C_INTERNAL_WAIT_START: return "wait_start";
+        case I2C_INTERNAL_WAIT_STOP: return "wait_stop";
+        case I2C_INTERNAL_START: return "start";
+        case I2C_INTERNAL_WAIT_DATA: return "wait_data";
+        case I2C_INTERNAL_DATA: return "data";
+        case I2C_INTERNAL_DATA_READ: return "data_read";
+        case I2C_INTERNAL_ACK: return "ack";
+        case I2C_INTERNAL_STOP_CLOCK: return "stop_clock";
+        case I2C_INTERNAL_STOP_CLOCK_WAIT: return "stop_clock_wait";
+        case I2C_INTERNAL_RESTART: return "restart";
+        case I2C_INTERNAL_STOP_0: return "stop_0";
+        case I2C_INTERNAL_STOP_1: return "stop_1";
+        default: return "unknown";
+    }
+}
 
-    this->desired_scl = 1;
 
-    this->is_driving_scl = false;
-    this->is_driving_sda = false;
-    this->enqueue_data_change(1);
+void I2C_helper::send_data_bit()
+{
+    int bit = (this->pending_data >> 7) & 1;
+    this->trace.msg(vp::trace::LEVEL_TRACE, "Sending bit (bit: %d)\n", bit);
+    this->desired_sda = bit;
+    this->pending_data <<= 1;
+    this->pending_data_bits--;
 }
 
-void I2C_helper::fsm_step(int input_scl, int input_sda)
+
+void I2C_helper::fsm_step()
 {
-    bool scl_rising = (input_scl == 1 && this->scl == 0);
-    bool scl_falling = (input_scl == 0 && this->scl == 1);
-    bool scl_steady = (input_scl == this->scl);
+    if (this->fsm_waiting)
+    {
+        return;
+    }
+
+    this->trace.msg(vp::trace::LEVEL_TRACE, "FSM update (state: %s, prev_scl: %d, prev_sda: %d, scl: %d, sda: %d)\n",
+        this->get_state_name(this->internal_state).c_str(), this->scl, this->sda, this->input_scl, this->input_sda);
 
-    bool sda_rising = (input_sda == 1 && this->sda == 0);
-    bool sda_falling = (input_sda == 0 && this->sda == 1);
-    I2C_HELPER_DEBUG("\n\n\n");
-    I2C_HELPER_DEBUG("fsm_step: input_scl=%d, input_sda=%d\n", input_scl, input_sda);
-    I2C_HELPER_DEBUG("fsm_step: scl=%d, this->scl=%d\n", input_scl, this->scl);
-    I2C_HELPER_DEBUG("fsm_step: sda=%d, this->sda=%d\n", input_sda, this->sda);
-    I2C_HELPER_DEBUG("fsm_step: this=%p\n", (void*) this);
+    bool scl_rising = (this->input_scl == 1 && this->scl == 0);
+    bool scl_falling = (this->input_scl == 0 && this->scl == 1);
+    bool scl_steady = (this->input_scl == this->scl);
 
-    this->scl = input_scl;
-    this->sda = input_sda;
+    bool sda_rising = (this->input_sda == 1 && this->sda == 0);
+    bool sda_falling = (this->input_sda == 0 && this->sda == 1);
+
+    this->scl = this->input_scl;
+    this->sda =this-> input_sda;
 
     /* clock management */
     if (!scl_steady)
     {
-        /* manages clock synchronization and clock stretching automatically */
-        if (scl_rising)
-        {
-            this->is_clock_low = false;
-        }
-        else if (scl_falling)
-        {
-            this->is_clock_low = true;
-        }
-
-        if (this->is_clock_enabled)
-        {
-            this->enqueue_clock_toggle();
-        }
+        // Re-enqueue a clock toggle each time the clock line toggles
+        this->clock_toggle();
     }
 
-    /* I2C logic */
-    if (scl_steady)
+    switch (this->internal_state)
     {
-        /* START/STOP detection */
-        if (this->scl == 1)
-        {
-            if (sda_falling && !this->is_busy())
+        case I2C_INTERNAL_IDLE:
+            if (is_starting)
             {
+                this->trace.msg(vp::trace::LEVEL_TRACE, "Waiting start\n");
+                this->internal_state = I2C_INTERNAL_WAIT_START;
                 this->is_starting = false;
+                this->desired_sda = 0;
+                this->desired_scl = 1;
             }
-            else if (sda_rising && this->is_busy())
+            break;
+        
+        case I2C_INTERNAL_WAIT_START:
+            if (scl_steady && sda_falling)
             {
-                this->internal_state = I2C_INTERNAL_IDLE;
-                this->is_stopping = false;
-                /* stop clock */
-                this->stop_clock();
-                this->empty_queues();
+                this->trace.msg(vp::trace::LEVEL_TRACE, "Detected start, waiting for data\n");
+                this->internal_state = I2C_INTERNAL_WAIT_DATA;
+                this->cb_master_operation(I2C_OP_START, I2C_STATUS_OK, 0);
+            }
+            break;
 
-                I2C_HELPER_DEBUG("STOP DETECTED\n");
+        case I2C_INTERNAL_WAIT_STOP:
+            if (scl_steady && sda_rising)
+            {
+                this->trace.msg(vp::trace::LEVEL_TRACE, "Detected stop\n");
+                this->internal_state = I2C_INTERNAL_IDLE;
                 this->cb_master_operation(I2C_OP_STOP, I2C_STATUS_OK, 0);
             }
-        }
-    }
-    else if (!this->is_busy() && scl_falling && this->sda == 0)
-    {
-        /* propagate start condition */
-        this->internal_state = I2C_INTERNAL_START;
+            break;
 
-        this->sda_rise = this ->sda;
-        this->empty_queues();
-        I2C_HELPER_DEBUG("START DETECTED\n");
-        this->cb_master_operation(I2C_OP_START, I2C_STATUS_OK, 0);
-    }
-    else if (this->is_busy())
-    {
-        /* sampling bit*/
-        if (scl_rising)
+        case I2C_INTERNAL_WAIT_DATA:
+            if (this->pending_data_bits)
+            {
+                this->trace.msg(vp::trace::LEVEL_TRACE, "Detected data, starting clock\n");
+                this->internal_state = I2C_INTERNAL_DATA;
+            }
+            this->start_clock();
+            break;
+
+        case I2C_INTERNAL_DATA:
         {
-            I2C_HELPER_DEBUG("SCL rising\n");
-            I2C_HELPER_DEBUG("fsm_step: sampling rising bit\n");
-            this->sda_rise = this->sda;
-            //TODO add check expected_bit_value
-            if (is_stopping)
+            if (scl_falling)
             {
-                this->is_driving_sda = true;
-                this->expected_bit_value = 1;
-                this->enqueue_data_change(this->expected_bit_value);
+                this->send_data_bit();
             }
-            else if (is_starting)
+            else if (scl_rising)
             {
-                /* drive sda pin down */
-                this->is_driving_sda = true;
-                this->expected_bit_value = 0;
-                this->enqueue_data_change(this->expected_bit_value);
+                if (this->pending_data_bits == 0)
+                {
+                    this->internal_state = I2C_INTERNAL_ACK;
+                }
             }
+            break;
+        }
 
-            if (this->is_driving_sda && this->desired_sda != this->sda && this->desired_sda != 0)
+        case I2C_INTERNAL_DATA_READ:
+        {
+            if (scl_rising)
             {
-                // we lost arbitration
-                i2c_operation_e operation = I2C_OP_DATA;
+                this->pending_data = (this->pending_data << 1) | this->sda;
+                this->trace.msg(vp::trace::LEVEL_TRACE, "Sampled data (bit: %d, pending_value: 0x%x, pending_bits: %d)\n", this->sda, this->pending_data, this->pending_data_bits);
+                this->pending_data_bits--;
 
-                if (this->is_stopping)
-                {
-                    operation = I2C_OP_STOP;
-                }
-                else if (this->internal_state == I2C_INTERNAL_DATA)
+                if (this->pending_data_bits == 0)
                 {
-                    operation = I2C_OP_DATA;
-                }
-                else if (this->internal_state == I2C_INTERNAL_ACK)
-                {
-                    operation = I2C_OP_ACK;
+                    this->cb_master_operation(I2C_OP_DATA, I2C_STATUS_OK, this->pending_data);
+                    this->internal_state = I2C_INTERNAL_ACK;
                 }
-
-                this->cb_master_operation(
-                        operation,
-                        I2C_STATUS_ERROR_ARBITRATION,
-                        0);
             }
+            break;
         }
-        else if (scl_falling)
+
+        case I2C_INTERNAL_ACK:
         {
-            I2C_HELPER_DEBUG("SCL falling\n");
-            I2C_HELPER_DEBUG("INTERNAL_STATE = %d\n", this->internal_state);
-            if (this->internal_state != I2C_INTERNAL_START)
+            if (scl_rising)
             {
-                if (this->sda_rise == this->sda)
-                {
-                    this->recv_bit_queue.push(this->sda);
-                }
-                else
-                {
-                    //TODO framing error ?
-                    //TODO empty queue
-                    I2C_HELPER_DEBUG("FRAMING ERROR!, sda_rise=%d, sda=%d\n", this->sda_rise, this->sda);
-                    this->cb_master_operation(I2C_OP_STOP, I2C_STATUS_ERROR_FRAMING, 0);
-                }
+                int ack = this->sda;
+
+                this->trace.msg(vp::trace::LEVEL_TRACE, "Sampled ack (value: %d)\n", ack);
+                const i2c_status_e status = (ack == 1) ? I2C_STATUS_KO : I2C_STATUS_OK;
+                this->cb_master_operation(I2C_OP_ACK, status, 0);
+                this->internal_state = I2C_INTERNAL_STOP_CLOCK;
             }
-            else
+            break;
+        }
+
+        case I2C_INTERNAL_STOP_CLOCK:
+        {
+            if (scl_falling)
             {
-                this->internal_state = I2C_INTERNAL_DATA;
+                this->desired_sda = 1;
+                this->stop_clock();
+                this->internal_state = I2C_INTERNAL_STOP_CLOCK_WAIT;
+                this->fsm_waiting = true;
+                this->fsm_enqueue_event(this->delay_high_ps);
             }
+            break;
+        }
 
-            if (is_stopping)
+        case I2C_INTERNAL_STOP_CLOCK_WAIT:
+        {
+            if (this->pending_data_bits)
             {
-                this->is_driving_sda = true;
-                this->expected_bit_value = 0;
-                this->enqueue_data_change(this->expected_bit_value);
+                // Another byte of data is pending, so we must continue immediately.
+                // Send its first bit now, since there is no falling edge to trigger it,
+                // then let the usual data state handle the remaining bits.
+                this->send_data_bit();
+                this->start_clock();
+                this->internal_state = I2C_INTERNAL_DATA;
+
             }
-            else if (is_starting)
+            else if (this->is_starting)
             {
-                this->is_driving_sda = true;
-                this->expected_bit_value = 1;
-                this->enqueue_data_change(this->expected_bit_value);
+                this->internal_state = I2C_INTERNAL_RESTART;
+                this->is_starting = false;
+                this->desired_scl = 1;
+                this->fsm_waiting = true;
+                this->fsm_enqueue_event(this->delay_high_ps);
             }
-            else if (this->internal_state == I2C_INTERNAL_DATA)
+            else if (this->is_stopping)
             {
-                /* send data */
-                if (!this->send_bit_queue.empty())
-                {
-                    assert(this->send_bit_queue.size() <= 8);
-
-                    int bit = this->send_bit_queue.front();
-                    this->send_bit_queue.pop();
-                    this->expected_bit_value = bit;
-
-                    this->is_driving_sda = true;
-                    this->enqueue_data_change(this->expected_bit_value);
-                }
-                else
-                {
-                    /* release sda pin */
-                    this->is_driving_sda = false;
-                    this->enqueue_data_change(this->expected_bit_value);
-                }
-
-                /* receiving data */
-                if (this->recv_bit_queue.size() == 8)
-                {
-                    int byte = 0;
-                    /* full byte received */
-                    for (int i = 0; i < 8; i++)
-                    {
-                        int bit = this->recv_bit_queue.front();
-                        this->recv_bit_queue.pop();
-                        byte = byte << 1 | bit;
-                    }
-                    assert(this->recv_bit_queue.empty());
-
-                    I2C_HELPER_DEBUG("fsm_step: byte received=%d\n", byte);
-
-                    this->internal_state = I2C_INTERNAL_ACK;
-                    this->empty_queues();
-
-                    this->cb_master_operation(I2C_OP_DATA, I2C_STATUS_OK, byte);
-                }
+                this->internal_state = I2C_INTERNAL_STOP_0;
+                this->is_stopping = false;
+                this->desired_sda = 0;
+                this->fsm_waiting = true;
+                this->fsm_enqueue_event(this->delay_high_ps);
             }
-            else if (this->internal_state == I2C_INTERNAL_ACK)
+            else
             {
-                if (this->recv_bit_queue.size() == 1)
-                {
-                    const int bit = this->recv_bit_queue.front();
-                    this->recv_bit_queue.pop();
-
-                    I2C_HELPER_DEBUG("fsm_step: ACK received=%d\n", bit);
-
-                    const i2c_status_e status = (bit == 1) ? I2C_STATUS_KO : I2C_STATUS_OK;
-                    assert(this->recv_bit_queue.empty());
-
-                    this->internal_state = I2C_INTERNAL_DATA;
-                    this->empty_queues();
-
-                    /* release sda pin */
-                    this->is_driving_sda = false;
-                    this->expected_bit_value = 1;
-                    this->enqueue_data_change(this->expected_bit_value);
-
-                    this->cb_master_operation(I2C_OP_ACK, status, 0);
-
-                }
+                this->start_clock();
+                this->pending_data_bits = 8;
+                this->internal_state = I2C_INTERNAL_DATA_READ;
             }
+            break;
         }
-    }
-}
 
-void I2C_helper::enqueue_clock_toggle(void)
-{
-    I2C_HELPER_DEBUG("enqueue_clock_toggle: clock_low=%s\n",
-            this->is_clock_low ? "true" : "false");
-    if (this->is_clock_enabled)
-    {
-        if (this->clock_event.is_enqueued())
+        case I2C_INTERNAL_STOP_0:
         {
-            this->cancel_event(&this->clock_event);
+            this->internal_state = I2C_INTERNAL_STOP_1;
+            this->desired_scl = 1;
+            this->fsm_waiting = true;
+            this->fsm_enqueue_event(this->delay_high_ps);
+            break;
         }
 
-        const uint64_t delay = this->is_clock_low ? this->delay_low_ps : this->delay_high_ps;
-        this->enqueue_event(&this->clock_event, delay);
-    }
-}
-
-void I2C_helper::enqueue_data_change(int new_sda)
-{
-    I2C_HELPER_DEBUG("enqueue_data_change: %d\n", new_sda);
-    if (!this->data_event.is_enqueued())
-    {
-        this->enqueue_event(&this->data_event, 1);
-    }
-}
+        case I2C_INTERNAL_STOP_1:
+        {
+            this->internal_state = I2C_INTERNAL_WAIT_STOP;
+            this->desired_sda = 1;
+            break;
+        }
 
-void I2C_helper::empty_queues(void)
-{
-    while(!this->send_bit_queue.empty())
-    {
-        this->send_bit_queue.pop();
+        case I2C_INTERNAL_RESTART:
+        {
+            this->trace.msg(vp::trace::LEVEL_TRACE, "Waiting start\n");
+            this->internal_state = I2C_INTERNAL_WAIT_START;
+            this->desired_sda = 0;
+            break;
+        }
     }
 
-    while(!this->recv_bit_queue.empty())
-    {
-        this->recv_bit_queue.pop();
-    }
+    this->sync_pins();
 }
diff --git a/gvsoc/gvsoc/models/devices/i2c/helper/i2c_helper.hpp b/gvsoc/gvsoc/models/devices/i2c/helper/i2c_helper.hpp
index a9e863471..cb8289c98 100644
--- a/gvsoc/gvsoc/models/devices/i2c/helper/i2c_helper.hpp
+++ b/gvsoc/gvsoc/models/devices/i2c/helper/i2c_helper.hpp
@@ -41,9 +41,18 @@ typedef enum {
 
 typedef enum {
     I2C_INTERNAL_IDLE,
+    I2C_INTERNAL_WAIT_START,
+    I2C_INTERNAL_WAIT_STOP,
     I2C_INTERNAL_START,
+    I2C_INTERNAL_WAIT_DATA,
     I2C_INTERNAL_DATA,
+    I2C_INTERNAL_DATA_READ,
     I2C_INTERNAL_ACK,
+    I2C_INTERNAL_STOP_CLOCK,
+    I2C_INTERNAL_STOP_CLOCK_WAIT,
+    I2C_INTERNAL_RESTART,
+    I2C_INTERNAL_STOP_0,
+    I2C_INTERNAL_STOP_1,
 } i2c_internal_state_e;
 
 typedef std::function<void(i2c_operation_e id, i2c_status_e status, int value)> i2c_callback_t;
@@ -66,7 +75,7 @@ typedef std::function<void(vp::clock_event* event)> i2c_cancel_event_fn_t;
  */
 class I2C_helper {
     public:
-        I2C_helper(vp::component* parent, vp::i2c_master* itf, i2c_enqueue_event_fn_t event, i2c_cancel_event_fn_t cancel_event);
+        I2C_helper(vp::component* parent, vp::i2c_master* itf, i2c_enqueue_event_fn_t event, i2c_cancel_event_fn_t cancel_event, std::string trace_path="");
 
         // TO be called when pin values change
         void update_pins(int scl, int sda);
@@ -106,7 +115,7 @@ class I2C_helper {
         /******************/
         /* Static methods */
         /******************/
-        static void st_data_event_handler(void* __this, vp::clock_event* event);
+        static void fsm_event_handler(void* __this, vp::clock_event* event);
         static void st_clock_event_handler(void* __this, vp::clock_event* event);
         static void i2c_sync(void *__this, int scl, int sda);
 
@@ -117,13 +126,17 @@ class I2C_helper {
 
         void start_clock(void);
         void stop_clock(void);
+        void clock_toggle(void);
         void enqueue_clock_toggle(void);
         void enqueue_data_change(int new_sda);
+        void fsm_enqueue_event(int64_t delay);
+        void send_data_bit();
 
-        void fsm_step(int scl, int sda);
+        void fsm_step();
 
         void sync_pins(void);
-        void empty_queues(void);
+
+        std::string get_state_name(i2c_internal_state_e state);
 
         /*************/
         /* Externals */
@@ -149,7 +162,7 @@ class I2C_helper {
         /* Runtime data */
         /****************/
         vp::clock_event clock_event;
-        vp::clock_event data_event;
+        vp::clock_event fsm_event;
 
         i2c_internal_state_e internal_state;
 
@@ -164,8 +177,6 @@ class I2C_helper {
 
         int sda_rise; /* sda sampled on scl rising edge */
 
-        std::queue<int> send_bit_queue;
-        std::queue<int> recv_bit_queue;
         int expected_bit_value; /* checked when scl is rising */
 
         bool check_sent;
@@ -175,5 +186,16 @@ class I2C_helper {
         bool is_starting;
 
         bool is_clock_enabled;
-        bool is_clock_low; /* tell if clock is in low or high state */
+        int clock_value;
+
+        vp::trace trace;
+
+        int ack_value;
+
+        uint8_t pending_data;
+        int pending_data_bits;
+        bool fsm_waiting;
+
+        int input_scl;
+        int input_sda;
 };
diff --git a/gvsoc/gvsoc/models/devices/i2c/i2c_bus.cpp b/gvsoc/gvsoc/models/devices/i2c/i2c_bus.cpp
index b16036c39..81b0da6bd 100644
--- a/gvsoc/gvsoc/models/devices/i2c/i2c_bus.cpp
+++ b/gvsoc/gvsoc/models/devices/i2c/i2c_bus.cpp
@@ -46,6 +46,9 @@ class I2c_bus : public vp::component
 
     vp::reg_1 bus_scl;
     vp::reg_1 bus_sda;
+
+    bool pending_resolve;
+    bool do_resolve;
 };
 
 
@@ -68,6 +71,8 @@ int I2c_bus::build()
     this->bus_scl.set(1);
     this->bus_sda.set(1);
 
+    this->pending_resolve = false;
+
     return 0;
 }
 
@@ -76,42 +81,61 @@ void I2c_bus::sync(void *__this, int scl, int sda, int id)
 {
     I2c_bus *_this = (I2c_bus *)__this;
 
-    _this->trace.msg(vp::trace::LEVEL_TRACE, " => bus update [id=%d]: scl=%d, sda=%d\n",
+    _this->trace.msg(vp::trace::LEVEL_TRACE, " => bus sync [id=%d]: scl=%d, sda=%d\n",
             id, scl, sda);
     /* store incoming values in maps */
     _this->i2c_values[id].scl = scl;
     _this->i2c_values[id].sda = sda;
 
-    /* browse all values and compute resulting SCL and SDA */
-    int res_scl_value = 1;
-    int res_sda_value = 1;
+    _this->do_resolve = true;
+
+    if (_this->pending_resolve)
+    {
+        return;
+    }
+
+    _this->trace.msg(vp::trace::LEVEL_TRACE, " => bus update\n");
 
-    for (std::pair<int, i2c_pair_t> i2c_val : _this->i2c_values)
+    _this->pending_resolve = true;
+
+    while (_this->do_resolve)
     {
-        _this->trace.msg(vp::trace::LEVEL_TRACE, "bus values [id=%d]: scl=%d, sda=%d\n",
-                i2c_val.first,
-                i2c_val.second.scl,
-                i2c_val.second.sda);
-        if (i2c_val.second.scl == 0)
+        _this->do_resolve = false;
+
+        /* browse all values and compute resulting SCL and SDA */
+        int res_scl_value = 1;
+        int res_sda_value = 1;
+
+        for (std::pair<int, i2c_pair_t> i2c_val : _this->i2c_values)
         {
-            res_scl_value = 0;
+            _this->trace.msg(vp::trace::LEVEL_TRACE, "bus values [id=%d]: scl=%d, sda=%d\n",
+                    i2c_val.first,
+                    i2c_val.second.scl,
+                    i2c_val.second.sda);
+            if (i2c_val.second.scl == 0)
+            {
+                res_scl_value = 0;
+            }
+            if (i2c_val.second.sda == 0)
+            {
+                res_sda_value = 0;
+            }
         }
-        if (i2c_val.second.sda == 0)
+
+        /* broadcast the values to all peripherals if needed */
+        if (res_scl_value != _this->bus_scl.get() || res_sda_value != _this->bus_sda.get())
         {
-            res_sda_value = 0;
+            /* only propagate changes */
+            _this->bus_scl.set(res_scl_value);
+            _this->bus_sda.set(res_sda_value);
+            _this->trace.msg(vp::trace::LEVEL_TRACE, "I2C: scl=%d, sda=%d\n",
+                    _this->bus_scl.get(), _this->bus_sda.get());
+            _this->in.sync(res_scl_value, res_sda_value);
         }
     }
 
-    /* broadcast the values to all peripherals if needed */
-    if (res_scl_value != _this->bus_scl.get() || res_sda_value != _this->bus_sda.get())
-    {
-        /* only propagate changes */
-        _this->bus_scl.set(res_scl_value);
-        _this->bus_sda.set(res_sda_value);
-        _this->trace.msg(vp::trace::LEVEL_TRACE, "I2C: scl=%d, sda=%d\n",
-                _this->bus_scl.get(), _this->bus_sda.get());
-        _this->in.sync(res_scl_value, res_sda_value);
-    }
+    _this->pending_resolve = false;
+    _this->trace.msg(vp::trace::LEVEL_TRACE, " => bus update done[id=%d]\n", id);
 }
 
 
diff --git a/gvsoc/gvsoc/models/devices/sound/CMakeLists.txt b/gvsoc/gvsoc/models/devices/sound/CMakeLists.txt
index fead9b06a..47da2a0d4 100644
--- a/gvsoc/gvsoc/models/devices/sound/CMakeLists.txt
+++ b/gvsoc/gvsoc/models/devices/sound/CMakeLists.txt
@@ -1,3 +1,4 @@
+add_subdirectory(dac)
 
 set(SOUND_PREFIX "devices/sound")
 
diff --git a/gvsoc/gvsoc/models/devices/sound/dac/CMakeLists.txt b/gvsoc/gvsoc/models/devices/sound/dac/CMakeLists.txt
new file mode 100644
index 000000000..7eb9285ee
--- /dev/null
+++ b/gvsoc/gvsoc/models/devices/sound/dac/CMakeLists.txt
@@ -0,0 +1,6 @@
+
+set(DAC_PREFIX "devices/sound/dac")
+# Declare the ak4332 model through vp_model, with ak4332.cpp as its only source
+vp_model(NAME ak4332
+         PREFIX ${DAC_PREFIX}
+         SOURCES "ak4332.cpp")
diff --git a/gvsoc/gvsoc/models/devices/sound/dac/ak4332.cpp b/gvsoc/gvsoc/models/devices/sound/dac/ak4332.cpp
new file mode 100644
index 000000000..5cff4d467
--- /dev/null
+++ b/gvsoc/gvsoc/models/devices/sound/dac/ak4332.cpp
@@ -0,0 +1,638 @@
+/*
+ * Copyright (C) 2020 GreenWaves Technologies
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* 
+ * Authors: Germain Haugou, GreenWaves Technologies (germain.haugou@greenwaves-technologies.com)
+ */
+
+
+#include <vp/vp.hpp>
+#include <vp/itf/i2c.hpp>
+
+
+
+typedef enum // States of the I2C slave state machine run by Ak4332::i2c_sync
+{
+  I2C_STATE_WAIT_START,   // Idle: nothing is decoded until a start condition is detected
+  I2C_STATE_WAIT_ADDRESS, // Shifting in the 7 address bits followed by the R/W bit
+  I2C_STATE_GET_DATA,     // Read transfer: shifting a byte out on SDA, MSB first
+  I2C_STATE_SAMPLE_DATA,  // Write transfer: sampling a byte from SDA on SCL rising edges
+  I2C_STATE_ACK,          // Acknowledge slot after the header or after a received byte
+  I2C_STATE_READ_ACK      // Entered once the last bit of a read byte has been put on SDA
+} I2c_state_e;
+
+
+class Ak4332 : public vp::component // Model of the AK4332 DAC: a register bank accessed as an I2C slave
+{
+public:
+    Ak4332(js::config *config);
+
+    int build(); // Creates the trace and the "i2c" port, then sets the initial state
+
+protected:
+    static void i2c_sync(void *__this, int scl, int sda); // I2C port callback, decodes the bus from SCL/SDA values
+    void i2c_start(unsigned int address, bool is_read);   // Header (address bits + R/W bit) fully received
+    void i2c_handle_byte(uint8_t byte);                   // One byte received during a write transfer
+    void i2c_stop();                                      // Stop condition detected
+    void i2c_get_data();                                  // Data phase of a read transfer begins (only traces)
+    void i2c_send_byte(uint8_t byte);                     // Latches the byte to shift out during a read transfer
+
+    void handle_reg_write(uint8_t address, uint8_t value); // Stores value in the register found at address
+    uint8_t handle_reg_read(uint8_t address);              // Returns the register at address, 0xFF if unknown
+
+    void start(); // Sends scl=1, sda=1 on the I2C port
+
+    vp::trace trace;
+    vp::i2c_master i2c_itf; // Registered as port "i2c" in build()
+
+    unsigned int device_address; // Value the received address bits are compared against
+
+    bool i2c_being_addressed;           // True when the last header matched device_address
+    unsigned int i2c_address;           // Address bits accumulated from the current header
+    uint8_t i2c_pending_data;           // Byte being assembled during a write transfer
+    bool i2c_is_read;                   // R/W bit of the current header (true means read)
+    I2c_state_e i2c_state;              // Current state of the I2C state machine
+    int i2c_pending_bits;               // Bits left in the header or data byte in progress
+    int i2c_prev_sda;                   // SDA seen at the previous sync, for start/stop detection
+    int i2c_prev_scl;                   // SCL seen at the previous sync, for edge detection
+    unsigned int i2c_pending_send_byte; // Bits still to send, next one in bit 7
+    uint8_t reg_address;                // Register selected by the last address byte written
+    bool waiting_reg_address;           // True when the next written byte is a register address
+    uint8_t power_1;                    // Register values; see handle_reg_write for the address of each
+    uint8_t power_2;
+    uint8_t power_3;
+    uint8_t power_4;
+    uint8_t output_mode;
+    uint8_t clock_mode;
+    uint8_t digital_filter;
+    uint8_t dac_mono_mixing;
+    uint8_t pdm_control;
+    uint8_t dac_volume_control;
+    uint8_t hp_volume_control;
+    uint8_t pll_clk_selection;
+    uint8_t pll_ref_clk_div_1;
+    uint8_t pll_ref_clk_div_2;
+    uint8_t pll_fb_clk_div_1;
+    uint8_t pll_fb_clk_div_2;
+    uint8_t dac_clk_source;
+    uint8_t dac_clk_divider;
+    uint8_t audio_format;
+    uint8_t pdm_err;
+    uint8_t dac_adjustment_1;
+    uint8_t dac_adjustment_2;
+};
+
+
+Ak4332::Ak4332(js::config *config)
+    : vp::component(config) // Only forwards the config; members are given their values in build()
+{
+}
+
+
+void Ak4332::start() // NOTE(review): presumably invoked by the framework when simulation starts — confirm
+{
+    this->i2c_itf.sync(1, 1); // Send scl=1, sda=1 on the I2C port, i.e. pull neither line low
+}
+
+
+void Ak4332::i2c_sync(void *__this, int scl, int sda) // I2C port callback: decodes the bus from the SCL/SDA values
+{
+    Ak4332 *_this = (Ak4332 *)__this;
+
+    _this->trace.msg(vp::trace::LEVEL_TRACE, "I2C sync (scl: %d, sda: %d)\n", scl, sda);
+
+    int sdo = 1; // SDA value sent back on a falling edge; stays 1 unless a case below changes it
+    // SDA changing while SCL is 1 is a start (SDA was 1) or a stop (SDA was 0) condition
+    if (scl == 1 && _this->i2c_prev_sda != sda)
+    {
+        if (_this->i2c_prev_sda == 1)
+        {
+            _this->trace.msg(vp::trace::LEVEL_TRACE, "Detected start\n");
+
+            _this->i2c_state = I2C_STATE_WAIT_ADDRESS;
+            _this->i2c_address = 0;
+            _this->i2c_pending_bits = 8; // 7 address bits + R/W bit
+        }
+        else
+        {
+            _this->i2c_state = I2C_STATE_WAIT_START;
+            _this->i2c_stop();
+        }
+        goto end; // Skip bit handling, only the previous line values get updated
+    }
+    // SCL rising edge: sample SDA and advance the state machine
+    if (!_this->i2c_prev_scl && scl)
+    {
+        switch (_this->i2c_state)
+        {
+            case I2C_STATE_WAIT_START:
+            {
+                sdo = 1;
+                break;
+            }
+
+            case I2C_STATE_WAIT_ADDRESS:
+            {
+                if (_this->i2c_pending_bits > 1)
+                {
+                    _this->i2c_address = (_this->i2c_address << 1) | sda;
+                    _this->trace.msg(vp::trace::LEVEL_TRACE, "Received address bit (bit: %d, address: 0x%x, pending_bits: %d)\n", sda, _this->i2c_address, _this->i2c_pending_bits);
+                }
+                else
+                {
+                    _this->i2c_is_read = sda; // Last bit of the header is the R/W bit
+                }
+                _this->i2c_pending_bits--;
+                if (_this->i2c_pending_bits == 0)
+                {
+                    _this->i2c_start(_this->i2c_address, _this->i2c_is_read);
+                    _this->i2c_state = I2C_STATE_ACK;
+                    _this->i2c_pending_bits = 8;
+                }
+                break;
+            }
+
+            case I2C_STATE_SAMPLE_DATA:
+            {
+                _this->i2c_pending_data = (_this->i2c_pending_data << 1) | sda;
+                _this->trace.msg(vp::trace::LEVEL_TRACE, "Sampling data (bit: %d, pending_value: 0x%x, pending_bits: %d)\n", sda, _this->i2c_pending_data, _this->i2c_pending_bits);
+                _this->i2c_pending_bits--;
+                if (_this->i2c_pending_bits == 0)
+                {
+                    _this->i2c_pending_bits = 8;
+                    _this->i2c_handle_byte(_this->i2c_pending_data);
+                    _this->i2c_state = I2C_STATE_ACK;
+                }
+                break;
+            }
+
+            case I2C_STATE_ACK: {
+                _this->trace.msg(vp::trace::LEVEL_TRACE, "Ack (being_addressed: %d)\n", _this->i2c_being_addressed);
+                if (_this->i2c_being_addressed)
+                {
+                    if (_this->i2c_is_read)
+                    {
+                        _this->i2c_state = I2C_STATE_GET_DATA;
+                        _this->i2c_pending_bits = 8;
+                        _this->i2c_get_data();
+                    }
+                    else
+                    {
+                        _this->i2c_state = I2C_STATE_SAMPLE_DATA;
+                    }
+                }
+                else
+                {
+                    _this->i2c_state = I2C_STATE_WAIT_START; // Transfer is for another address: ignore it until the next start
+                }
+
+                break;
+            }
+
+            case I2C_STATE_READ_ACK: {
+                _this->i2c_state = I2C_STATE_WAIT_START; // A single byte is sent per read transfer
+                break;
+            }
+        }
+    }
+    // SCL falling edge: choose the SDA value to send for the next bit slot
+    if (_this->i2c_prev_scl && !scl)
+    {
+        switch (_this->i2c_state)
+        {
+            case I2C_STATE_ACK:
+            {
+                _this->trace.msg(vp::trace::LEVEL_TRACE, "Ack (being_addressed: %d)\n", _this->i2c_being_addressed);
+                sdo = !_this->i2c_being_addressed; // Send 0 (acknowledge) only when this device is addressed
+                break;
+            }
+
+            case I2C_STATE_READ_ACK: // NOTE(review): looks unreachable, the rising-edge handler leaves READ_ACK before the next falling edge — confirm
+            {
+                _this->trace.msg(vp::trace::LEVEL_TRACE, "Read ack\n");
+                sdo = 0;
+                break;
+            }
+
+            case I2C_STATE_GET_DATA:
+            {
+                sdo = (_this->i2c_pending_send_byte >> 7) & 1; // Bits go out MSB first
+                _this->trace.msg(vp::trace::LEVEL_TRACE, "Sending bit (bit: %d, pending_value: 0x%x, pending_bits: %d)\n", sdo, _this->i2c_pending_send_byte, _this->i2c_pending_bits);
+                _this->i2c_pending_send_byte <<= 1;
+                _this->i2c_pending_bits--;
+                if (_this->i2c_pending_bits == 0)
+                {
+                    _this->i2c_state = I2C_STATE_READ_ACK;
+                }
+                break;
+            }
+        }
+    }
+
+end:
+    if (_this->i2c_prev_scl && !scl) // SDA is only sent on SCL falling edges; SCL is always sent as 1
+    {
+        _this->trace.msg(vp::trace::LEVEL_TRACE, "Sync sda (value: %d)\n", sdo);
+        _this->i2c_itf.sync(1, sdo);
+    }
+    _this->i2c_prev_sda = sda; // Remember the line values for edge detection at the next call
+    _this->i2c_prev_scl = scl;
+}
+
+void Ak4332::i2c_start(unsigned int address, bool is_read) // Called once the address bits and the R/W bit are received
+{
+    this->trace.msg(vp::trace::LEVEL_TRACE, "Received header (address: 0x%x, is_read: %d)\n", address, is_read);
+    // Only headers carrying our address may touch the register bank
+    this->i2c_being_addressed = address == this->device_address;
+    if (is_read && this->i2c_being_addressed)
+    {
+        this->i2c_send_byte(this->handle_reg_read(this->reg_address)); // Preload the byte shifted out in I2C_STATE_GET_DATA
+    }
+}
+
+void Ak4332::i2c_handle_byte(uint8_t byte) // Consumes one byte received during a write transfer
+{
+    this->trace.msg(vp::trace::LEVEL_TRACE, "Handle byte (value: 0x%x)\n", byte);
+    // Bytes alternate between a register address and the value to store at that address
+    if (!this->waiting_reg_address)
+    {
+        this->handle_reg_write(this->reg_address, byte);
+        this->waiting_reg_address = true;
+    }
+    else
+    {
+        this->reg_address = byte;
+        this->waiting_reg_address = false;
+    }
+}
+
+void Ak4332::i2c_stop() // Called by i2c_sync when a stop condition is detected
+{
+    this->trace.msg(vp::trace::LEVEL_TRACE, "Received stop bit\n");
+    this->waiting_reg_address = true; // The next write transfer starts with a register address again
+}
+
+void Ak4332::i2c_get_data() // Called when the data phase of a read transfer begins; only emits a trace
+{
+    this->trace.msg(vp::trace::LEVEL_TRACE, "Getting data\n");
+}
+
+void Ak4332::i2c_send_byte(uint8_t byte) // Latches the byte that i2c_sync shifts out on SDA
+{
+    this->i2c_pending_send_byte = byte;
+}
+
+
+void Ak4332::handle_reg_write(uint8_t address, uint8_t value) // Stores value in the register mapped at address
+{
+    this->trace.msg(vp::trace::LEVEL_TRACE, "Register write (address: 0x%x, value: 0x%x)\n", address, value);
+    // Each known address traces the register name and stores the raw byte in its member
+    switch (address)
+    {
+        case 0x00:
+        {
+            this->trace.msg(vp::trace::LEVEL_INFO, "Writing register (name: %s, value: 0x%x)\n", "Power Management 1", value);
+            this->power_1 = value;
+            break;
+        }
+        case 0x01:
+        {
+            this->trace.msg(vp::trace::LEVEL_INFO, "Writing register (name: %s, value: 0x%x)\n", "Power Management 2", value);
+            this->power_2 = value;
+            break;
+        }
+        case 0x02:
+        {
+            this->trace.msg(vp::trace::LEVEL_INFO, "Writing register (name: %s, value: 0x%x)\n", "Power Management 3", value);
+            this->power_3 = value;
+            break;
+        }
+        case 0x03:
+        {
+            this->trace.msg(vp::trace::LEVEL_INFO, "Writing register (name: %s, value: 0x%x)\n", "Power Management 4", value);
+            this->power_4 = value;
+            break;
+        }
+        case 0x04:
+        {
+            this->trace.msg(vp::trace::LEVEL_INFO, "Writing register (name: %s, value: 0x%x)\n", "Output Mode Setting", value);
+            this->output_mode = value;
+            break;
+        }
+        case 0x05:
+        {
+            this->trace.msg(vp::trace::LEVEL_INFO, "Writing register (name: %s, value: 0x%x)\n", "Clock Mode Selection", value);
+            this->clock_mode = value;
+            break;
+        }
+        case 0x06:
+        {
+            this->trace.msg(vp::trace::LEVEL_INFO, "Writing register (name: %s, value: 0x%x)\n", "Digital Filter Selection", value);
+            this->digital_filter = value;
+            break;
+        }
+        case 0x07:
+        {
+            this->trace.msg(vp::trace::LEVEL_INFO, "Writing register (name: %s, value: 0x%x)\n", "DAC Mono Mixing", value);
+            this->dac_mono_mixing = value;
+            break;
+        }
+        case 0x08:
+        {
+            this->trace.msg(vp::trace::LEVEL_INFO, "Writing register (name: %s, value: 0x%x)\n", "PDM I/F Control", value);
+            this->pdm_control = value;
+            break;
+        }
+        case 0x0B:
+        {
+            this->trace.msg(vp::trace::LEVEL_INFO, "Writing register (name: %s, value: 0x%x)\n", "DAC Output Volume", value);
+            this->dac_volume_control = value;
+            break;
+        }
+        case 0x0D:
+        {
+            this->trace.msg(vp::trace::LEVEL_INFO, "Writing register (name: %s, value: 0x%x)\n", "HP Volume Control", value);
+            this->hp_volume_control = value;
+            break;
+        }
+        case 0x0E:
+        {
+            this->trace.msg(vp::trace::LEVEL_INFO, "Writing register (name: %s, value: 0x%x)\n", "PLL CLK Source Selection", value);
+            this->pll_clk_selection = value;
+            break;
+        }
+        case 0x0F:
+        {
+            this->trace.msg(vp::trace::LEVEL_INFO, "Writing register (name: %s, value: 0x%x)\n", "PLL Ref CLK Divider 1", value);
+            this->pll_ref_clk_div_1 = value;
+            break;
+        }
+        case 0x10:
+        {
+            this->trace.msg(vp::trace::LEVEL_INFO, "Writing register (name: %s, value: 0x%x)\n", "PLL Ref CLK Divider 2", value);
+            this->pll_ref_clk_div_2 = value;
+            break;
+        }
+        case 0x11:
+        {
+            this->trace.msg(vp::trace::LEVEL_INFO, "Writing register (name: %s, value: 0x%x)\n", "PLL FB CLK Divider 1", value);
+            this->pll_fb_clk_div_1 = value;
+            break;
+        }
+        case 0x12:
+        {
+            this->trace.msg(vp::trace::LEVEL_INFO, "Writing register (name: %s, value: 0x%x)\n", "PLL FB CLK Divider 2", value);
+            this->pll_fb_clk_div_2 = value;
+            break;
+        }
+        case 0x13:
+        {
+            this->trace.msg(vp::trace::LEVEL_INFO, "Writing register (name: %s, value: 0x%x)\n", "DAC CLK Source", value);
+            this->dac_clk_source = value;
+            break;
+        }
+        case 0x14:
+        {
+            this->trace.msg(vp::trace::LEVEL_INFO, "Writing register (name: %s, value: 0x%x)\n", "DAC CLK Divider", value);
+            this->dac_clk_divider = value;
+            break;
+        }
+        case 0x15:
+        {
+            this->trace.msg(vp::trace::LEVEL_INFO, "Writing register (name: %s, value: 0x%x)\n", "Audio I/F Format", value);
+            this->audio_format = value;
+            break;
+        }
+        case 0x17:
+        {
+            this->trace.msg(vp::trace::LEVEL_INFO, "Writing register (name: %s, value: 0x%x)\n", "PDMERR", value);
+            this->pdm_err = value;
+            break;
+        }
+        case 0x26:
+        {
+            this->trace.msg(vp::trace::LEVEL_INFO, "Writing register (name: %s, value: 0x%x)\n", "DAC Adjustment 1", value);
+            this->dac_adjustment_1 = value;
+            break;
+        }
+        case 0x27:
+        {
+            this->trace.msg(vp::trace::LEVEL_INFO, "Writing register (name: %s, value: 0x%x)\n", "DAC Adjustment 2", value);
+            this->dac_adjustment_2 = value;
+            break;
+        }
+        default: // Address matches no register: the value is dropped and a warning is raised
+            this->trace.force_warning("Writing invalid register (address: 0x%x)\n", address);
+            break;
+    }
+
+}
+
+
+uint8_t Ak4332::handle_reg_read(uint8_t address) // Returns the value of the register mapped at address
+{
+    this->trace.msg(vp::trace::LEVEL_DEBUG, "Register read (address: 0x%x)\n", address);
+
+    uint8_t value = 0xFF; // Returned unchanged when address matches no register
+
+    switch (address)
+    {
+        case 0x00:
+        {
+            value = this->power_1;
+            this->trace.msg(vp::trace::LEVEL_INFO, "Reading register (name: %s, value: 0x%x)\n", "Power Management 1", value);
+            break;
+        }
+        case 0x01:
+        {
+            value = this->power_2;
+            this->trace.msg(vp::trace::LEVEL_INFO, "Reading register (name: %s, value: 0x%x)\n", "Power Management 2", value);
+            break;
+        }
+        case 0x02:
+        {
+            value = this->power_3;
+            this->trace.msg(vp::trace::LEVEL_INFO, "Reading register (name: %s, value: 0x%x)\n", "Power Management 3", value);
+            break;
+        }
+        case 0x03:
+        {
+            value = this->power_4;
+            this->trace.msg(vp::trace::LEVEL_INFO, "Reading register (name: %s, value: 0x%x)\n", "Power Management 4", value);
+            break;
+        }
+        case 0x04:
+        {
+            value = this->output_mode;
+            this->trace.msg(vp::trace::LEVEL_INFO, "Reading register (name: %s, value: 0x%x)\n", "Output Mode Setting", value);
+            break;
+        }
+        case 0x05:
+        {
+            value = this->clock_mode;
+            this->trace.msg(vp::trace::LEVEL_INFO, "Reading register (name: %s, value: 0x%x)\n", "Clock Mode Selection", value);
+            break;
+        }
+        case 0x06:
+        {
+            value = this->digital_filter;
+            this->trace.msg(vp::trace::LEVEL_INFO, "Reading register (name: %s, value: 0x%x)\n", "Digital Filter Selection", value);
+            break;
+        }
+        case 0x07:
+        {
+            value = this->dac_mono_mixing;
+            this->trace.msg(vp::trace::LEVEL_INFO, "Reading register (name: %s, value: 0x%x)\n", "DAC Mono Mixing", value);
+            break;
+        }
+        case 0x08:
+        {
+            value = this->pdm_control;
+            this->trace.msg(vp::trace::LEVEL_INFO, "Reading register (name: %s, value: 0x%x)\n", "PDM I/F Control", value);
+            break;
+        }
+        case 0x0B:
+        {
+            value = this->dac_volume_control;
+            this->trace.msg(vp::trace::LEVEL_INFO, "Reading register (name: %s, value: 0x%x)\n", "DAC Output Volume", value);
+            break;
+        }
+        case 0x0D:
+        {
+            value = this->hp_volume_control;
+            this->trace.msg(vp::trace::LEVEL_INFO, "Reading register (name: %s, value: 0x%x)\n", "HP Volume Control", value);
+            break;
+        }
+        case 0x0E:
+        {
+            value = this->pll_clk_selection;
+            this->trace.msg(vp::trace::LEVEL_INFO, "Reading register (name: %s, value: 0x%x)\n", "PLL CLK Source Selection", value);
+            break;
+        }
+        case 0x0F:
+        {
+            value = this->pll_ref_clk_div_1;
+            this->trace.msg(vp::trace::LEVEL_INFO, "Reading register (name: %s, value: 0x%x)\n", "PLL Ref CLK Divider 1", value);
+            break;
+        }
+        case 0x10:
+        {
+            value = this->pll_ref_clk_div_2;
+            this->trace.msg(vp::trace::LEVEL_INFO, "Reading register (name: %s, value: 0x%x)\n", "PLL Ref CLK Divider 2", value);
+            break;
+        }
+        case 0x11:
+        {
+            value = this->pll_fb_clk_div_1;
+            this->trace.msg(vp::trace::LEVEL_INFO, "Reading register (name: %s, value: 0x%x)\n", "PLL FB CLK Divider 1", value);
+            break;
+        }
+        case 0x12:
+        {
+            value = this->pll_fb_clk_div_2;
+            this->trace.msg(vp::trace::LEVEL_INFO, "Reading register (name: %s, value: 0x%x)\n", "PLL FB CLK Divider 2", value);
+            break;
+        }
+        case 0x13:
+        {
+            value = this->dac_clk_source;
+            this->trace.msg(vp::trace::LEVEL_INFO, "Reading register (name: %s, value: 0x%x)\n", "DAC CLK Source", value);
+            break;
+        }
+        case 0x14:
+        {
+            value = this->dac_clk_divider;
+            this->trace.msg(vp::trace::LEVEL_INFO, "Reading register (name: %s, value: 0x%x)\n", "DAC CLK Divider", value);
+            break;
+        }
+        case 0x15:
+        {
+            value = this->audio_format;
+            this->trace.msg(vp::trace::LEVEL_INFO, "Reading register (name: %s, value: 0x%x)\n", "Audio I/F Format", value);
+            break;
+        }
+        case 0x17:
+        {
+            value = this->pdm_err;
+            this->trace.msg(vp::trace::LEVEL_INFO, "Reading register (name: %s, value: 0x%x)\n", "PDMERR", value);
+            break;
+        }
+        case 0x26:
+        {
+            value = this->dac_adjustment_1;
+            this->trace.msg(vp::trace::LEVEL_INFO, "Reading register (name: %s, value: 0x%x)\n", "DAC Adjustment 1", value);
+            break;
+        }
+        case 0x27:
+        {
+            value = this->dac_adjustment_2;
+            this->trace.msg(vp::trace::LEVEL_INFO, "Reading register (name: %s, value: 0x%x)\n", "DAC Adjustment 2", value);
+            break;
+        }
+        default: // Address matches no register: warn and return the 0xFF default
+            this->trace.force_warning("Reading invalid register (address: 0x%x)\n", address);
+            break;
+    }
+
+    return value;
+}
+
+
+int Ak4332::build() // Declares the trace and the I2C port, then gives every piece of state a defined value
+{
+    traces.new_trace("trace", &trace, vp::DEBUG);
+
+    this->i2c_itf.set_sync_meth(&Ak4332::i2c_sync);
+    this->new_master_port("i2c", &this->i2c_itf);
+    // I2C state machine starts idle, with both lines recorded as 1
+    this->i2c_state = I2C_STATE_WAIT_START;
+    this->i2c_prev_sda = 1;
+    this->i2c_prev_scl = 1;
+    this->i2c_being_addressed = false;
+    this->device_address = 0x10; // Compared against the address bits of every received header
+    this->waiting_reg_address = true;
+    this->reg_address = 0x00; // A read issued before any write must not use an indeterminate address
+    this->power_1              = 0x00; // NOTE(review): register values below are presumably the datasheet reset values — confirm
+    this->power_2              = 0x00;
+    this->power_3              = 0x00;
+    this->power_4              = 0x00;
+    this->output_mode          = 0x00;
+    this->clock_mode           = 0x00;
+    this->digital_filter       = 0x00;
+    this->dac_mono_mixing      = 0x00;
+    this->pdm_control          = 0x00;
+    this->dac_volume_control   = 0x19;
+    this->hp_volume_control    = 0x65;
+    this->pll_clk_selection    = 0x00;
+    this->pll_ref_clk_div_1    = 0x00;
+    this->pll_ref_clk_div_2    = 0x00;
+    this->pll_fb_clk_div_1     = 0x00;
+    this->pll_fb_clk_div_2     = 0x00;
+    this->dac_clk_source       = 0x00;
+    this->dac_clk_divider      = 0x00;
+    this->audio_format         = 0x00;
+    this->pdm_err              = 0x00;
+    this->dac_adjustment_1     = 0x6C;
+    this->dac_adjustment_2     = 0x40;
+
+    return 0;
+}
+
+
+extern "C" vp::component *vp_constructor(js::config *config) // C-linkage factory returning a new Ak4332 built from config
+{
+    return new Ak4332(config);
+}
diff --git a/gvsoc/gvsoc/models/devices/testbench/i2s_verif.cpp b/gvsoc/gvsoc/models/devices/testbench/i2s_verif.cpp
index 289acf5a6..8d696e332 100644
--- a/gvsoc/gvsoc/models/devices/testbench/i2s_verif.cpp
+++ b/gvsoc/gvsoc/models/devices/testbench/i2s_verif.cpp
@@ -74,7 +74,7 @@ class Rx_stream_libsnd_file : public Rx_stream
 class Rx_stream_raw_file : public Rx_stream
 {
 public:
-    Rx_stream_raw_file(Slot *slot, string filepath, int width, bool is_bin);
+    Rx_stream_raw_file(Slot *slot, string filepath, int width, bool is_bin, pi_testbench_i2s_verif_start_config_file_encoding_type_e encoding);
     uint32_t get_sample(int channel_id);
     Slot *slot;
 
@@ -82,13 +82,14 @@ class Rx_stream_raw_file : public Rx_stream
     FILE *infile;
     int width;
     bool is_bin;
+    pi_testbench_i2s_verif_start_config_file_encoding_type_e encoding;
 };
 
 
 class Tx_stream_raw_file : public Tx_stream
 {
 public:
-    Tx_stream_raw_file(Slot *slot, string filepath, int width, bool is_bin);
+    Tx_stream_raw_file(Slot *slot, string filepath, int width, bool is_bin, pi_testbench_i2s_verif_start_config_file_encoding_type_e encoding);
     void push_sample(uint32_t sample, int channel_id);
     Slot *slot;
 
@@ -96,6 +97,7 @@ class Tx_stream_raw_file : public Tx_stream
     FILE *outfile;
     int width;
     bool is_bin;
+    pi_testbench_i2s_verif_start_config_file_encoding_type_e encoding;
 };
 
 
@@ -565,10 +567,11 @@ void I2s_verif::start(pi_testbench_i2s_verif_start_config_t *config)
 }
 
 
-Tx_stream_raw_file::Tx_stream_raw_file(Slot *slot, std::string filepath, int width, bool is_bin)
+Tx_stream_raw_file::Tx_stream_raw_file(Slot *slot, std::string filepath, int width, bool is_bin, pi_testbench_i2s_verif_start_config_file_encoding_type_e encoding)
 {
     this->width = width;
     this->is_bin = is_bin;
+    this->encoding = encoding;
     this->slot = slot;
     this->outfile = fopen(filepath.c_str(), "w");
     this->slot->trace.msg(vp::trace::LEVEL_INFO, "Opening dumper (path: %s)\n", filepath.c_str());
@@ -583,6 +586,17 @@ void Tx_stream_raw_file::push_sample(uint32_t sample, int channel_id)
 {
     if (this->is_bin)
     {
+        if (this->encoding == PI_TESTBENCH_I2S_VERIF_FILE_ENCODING_TYPE_PLUSMINUS)
+        {
+            // Convert encoding from 0/1 to -1/+1
+            if (sample == 0)
+                sample = (uint32_t)-1;
+            else if (sample == 1)
+                sample = 1;
+            else
+                sample = 0; // Error
+        }
+
         int nb_bytes = (this->width + 7) / 8;
         if (fwrite((void *)&sample, nb_bytes, 1, this->outfile) != 1)
         {
@@ -666,10 +680,11 @@ void Tx_stream_libsnd_file::push_sample(uint32_t data, int channel)
 }
 
 
-Rx_stream_raw_file::Rx_stream_raw_file(Slot *slot, std::string filepath, int width, bool is_bin)
+Rx_stream_raw_file::Rx_stream_raw_file(Slot *slot, std::string filepath, int width, bool is_bin, pi_testbench_i2s_verif_start_config_file_encoding_type_e encoding)
 {
     this->width = width;
     this->is_bin = is_bin;
+    this->encoding = encoding;
     this->slot = slot;
     this->infile = fopen(filepath.c_str(), "r");
     if (this->infile == NULL)
@@ -685,10 +700,26 @@ uint32_t Rx_stream_raw_file::get_sample(int channel_id)
     {
         int nb_bytes = (this->width + 7) / 8;
         uint32_t result = 0;
-        if (fread((void *)&result, nb_bytes, 1, this->infile) != 1)
+        int freadres = fread((void *)&result, nb_bytes, 1, this->infile);
+
+        // this->slot->top->trace.msg(vp::trace::LEVEL_TRACE, "channel_id=%d, nb_bytes=%d, freadres=%d, result=%d\n", channel_id, nb_bytes, freadres, result);
+
+        if (freadres != 1)
         {
             return 0;
         }
+
+        if (this->encoding == PI_TESTBENCH_I2S_VERIF_FILE_ENCODING_TYPE_PLUSMINUS)
+        {
+            // Convert encoding from -1/+1 to 0/1
+            if ((int32_t)result == -1)
+                result = 0;
+            else if ((int32_t)result == 1)
+                result = 1;
+            else
+                result = 0;
+        }
+
         return result;
     }
     else
@@ -872,7 +903,12 @@ void Slot::start(pi_testbench_i2s_verif_slot_start_config_t *config, Slot *reuse
             {
                 if (config->tx_file_dumper.type == PI_TESTBENCH_I2S_VERIF_TX_FILE_DUMPER_TYPE_RAW || config->tx_file_dumper.type == PI_TESTBENCH_I2S_VERIF_RX_FILE_READER_TYPE_BIN)
                 {
-                    this->outstream = new Tx_stream_raw_file(this, filepath, config->tx_file_dumper.width, config->tx_file_dumper.type == PI_TESTBENCH_I2S_VERIF_RX_FILE_READER_TYPE_BIN);
+                    this->outstream = new Tx_stream_raw_file(
+                        this,
+                        filepath,
+                        config->tx_file_dumper.width,
+                        config->tx_file_dumper.type == PI_TESTBENCH_I2S_VERIF_RX_FILE_READER_TYPE_BIN,
+                        (pi_testbench_i2s_verif_start_config_file_encoding_type_e)config->tx_file_dumper.encoding);
                 }
                 else
                 {
@@ -904,7 +940,12 @@ void Slot::start(pi_testbench_i2s_verif_slot_start_config_t *config, Slot *reuse
         {
             if (config->rx_file_reader.type == PI_TESTBENCH_I2S_VERIF_RX_FILE_READER_TYPE_RAW || config->rx_file_reader.type == PI_TESTBENCH_I2S_VERIF_RX_FILE_READER_TYPE_BIN)
             {
-                this->instream = new Rx_stream_raw_file(this, filepath, config->rx_file_reader.width, config->rx_file_reader.type == PI_TESTBENCH_I2S_VERIF_RX_FILE_READER_TYPE_BIN);
+                this->instream = new Rx_stream_raw_file(
+                    this,
+                    filepath,
+                    config->rx_file_reader.width,
+                    config->rx_file_reader.type == PI_TESTBENCH_I2S_VERIF_RX_FILE_READER_TYPE_BIN,
+                    (pi_testbench_i2s_verif_start_config_file_encoding_type_e)config->rx_file_reader.encoding);
             }
             else
             {
diff --git a/gvsoc/gvsoc/models/devices/testbench/testbench.cpp b/gvsoc/gvsoc/models/devices/testbench/testbench.cpp
index 2975e770d..0d2823038 100644
--- a/gvsoc/gvsoc/models/devices/testbench/testbench.cpp
+++ b/gvsoc/gvsoc/models/devices/testbench/testbench.cpp
@@ -1353,6 +1353,21 @@ std::string Testbench::handle_command(Gv_proxy *proxy, FILE *req_file, FILE *rep
                             config->rx_file_reader.type = 0;
                         }
                     }
+                    else if (name == "encoding")
+                    {
+                        if (value_str == "asis")
+                        {
+                            config->rx_file_reader.encoding = PI_TESTBENCH_I2S_VERIF_FILE_ENCODING_TYPE_ASIS;
+                        }
+                        else if (value_str == "plusminus")
+                        {
+                            config->rx_file_reader.encoding = PI_TESTBENCH_I2S_VERIF_FILE_ENCODING_TYPE_PLUSMINUS;
+                        }
+                        else
+                        {
+                            config->rx_file_reader.encoding = PI_TESTBENCH_I2S_VERIF_FILE_ENCODING_TYPE_ASIS;
+                        }
+                    }
                 }
 
                 config->type = PI_TESTBENCH_I2S_VERIF_RX_FILE_READER;
@@ -1410,6 +1425,21 @@ std::string Testbench::handle_command(Gv_proxy *proxy, FILE *req_file, FILE *rep
                             config->tx_file_dumper.type = 0;
                         }
                     }
+                    else if (name == "encoding")
+                    {
+                        if (value_str == "asis")
+                        {
+                            config->tx_file_dumper.encoding = PI_TESTBENCH_I2S_VERIF_FILE_ENCODING_TYPE_ASIS;
+                        }
+                        else if (value_str == "plusminus")
+                        {
+                            config->tx_file_dumper.encoding = PI_TESTBENCH_I2S_VERIF_FILE_ENCODING_TYPE_PLUSMINUS;
+                        }
+                        else
+                        {
+                            config->tx_file_dumper.encoding = PI_TESTBENCH_I2S_VERIF_FILE_ENCODING_TYPE_ASIS;
+                        }
+                    }
                 }
 
                 config->type = PI_TESTBENCH_I2S_VERIF_TX_FILE_DUMPER;
diff --git a/gvsoc/gvsoc/models/devices/testbench/testbench.hpp b/gvsoc/gvsoc/models/devices/testbench/testbench.hpp
index d5830608f..079f25abd 100644
--- a/gvsoc/gvsoc/models/devices/testbench/testbench.hpp
+++ b/gvsoc/gvsoc/models/devices/testbench/testbench.hpp
@@ -240,6 +240,11 @@ typedef enum
     PI_TESTBENCH_I2S_VERIF_TX_FILE_DUMPER_TYPE_AU,
 } pi_testbench_i2s_verif_start_config_tx_file_dumper_type_e;
 
+typedef enum
+{
+    PI_TESTBENCH_I2S_VERIF_FILE_ENCODING_TYPE_ASIS = 0, // Samples are read and written unchanged (default)
+    PI_TESTBENCH_I2S_VERIF_FILE_ENCODING_TYPE_PLUSMINUS, // Binary file holds -1/+1 values, mapped from/to 0/1 samples (usable for PDM only)
+} pi_testbench_i2s_verif_start_config_file_encoding_type_e;
 
 // This structure can be used to describe what an I2S slot should do
 typedef struct
@@ -261,6 +266,7 @@ typedef struct
             uint32_t filepath_len;
             uint8_t type;
             uint8_t width;
+            uint8_t encoding;
         } tx_file_dumper;
         struct
         {
@@ -269,6 +275,7 @@ typedef struct
             uint32_t filepath_len;
             uint8_t type;
             uint8_t width;
+            uint8_t encoding;
         } rx_file_reader;
     };
 
diff --git a/gvsoc/gvsoc/models/memory/memory_impl.cpp b/gvsoc/gvsoc/models/memory/memory_impl.cpp
index 4f7b748d2..da3edb540 100644
--- a/gvsoc/gvsoc/models/memory/memory_impl.cpp
+++ b/gvsoc/gvsoc/models/memory/memory_impl.cpp
@@ -40,7 +40,6 @@ class memory : public vp::component
 
 private:
 
-  static void power_callback(void *__this, vp::clock_event *event);
   static void power_ctrl_sync(void *__this, bool value);
 
   vp::trace     trace;
@@ -60,14 +59,13 @@ class memory : public vp::component
   bool power_trigger;
   bool powered_up;
 
-  vp::power::power_source idle_power;
   vp::power::power_source read_8_power;
   vp::power::power_source read_16_power;
   vp::power::power_source read_32_power;
   vp::power::power_source write_8_power;
   vp::power::power_source write_16_power;
   vp::power::power_source write_32_power;
-  vp::power::power_source leakage_power;
+  vp::power::power_source background_power;
 
   vp::clock_event *power_event;
   int64_t last_access_timestamp;
@@ -79,15 +77,6 @@ memory::memory(js::config *config)
 
 }
 
-void memory::power_callback(void *__this, vp::clock_event *event)
-{
-  memory *_this = (memory *)__this;
-  if (_this->last_access_timestamp < _this->get_time())
-  {
-    _this->idle_power.dynamic_power_start();
-  }
-}
-
 vp::io_req_status_e memory::req(void *__this, vp::io_req *req)
 {
   memory *_this = (memory *)__this;
@@ -142,9 +131,6 @@ vp::io_req_status_e memory::req(void *__this, vp::io_req *req)
         else if (size == 4)
           _this->read_32_power.account_energy_quantum();
       }
-
-      if (!_this->power_event->is_enqueued())
-        _this->event_enqueue(_this->power_event, 1);
     }
 
   #ifdef VP_TRACE_ACTIVE
@@ -223,8 +209,7 @@ int memory::build()
   js::config *config = get_js_config()->get("power_trigger");
   this->power_trigger = config != NULL && config->get_bool();
 
-  power.new_power_source("leakage", &leakage_power, this->get_js_config()->get("**/leakage"));
-  power.new_power_source("idle", &idle_power, this->get_js_config()->get("**/idle"));
+  power.new_power_source("leakage", &background_power, this->get_js_config()->get("**/background"));
   power.new_power_source("read_8", &read_8_power, this->get_js_config()->get("**/read_8"));
   power.new_power_source("read_16", &read_16_power, this->get_js_config()->get("**/read_16"));
   power.new_power_source("read_32", &read_32_power, this->get_js_config()->get("**/read_32"));
@@ -232,8 +217,6 @@ int memory::build()
   power.new_power_source("write_16", &write_16_power, this->get_js_config()->get("**/write_16"));
   power.new_power_source("write_32", &write_32_power, this->get_js_config()->get("**/write_32"));
 
-  power_event = this->event_new(memory::power_callback);
-
   return 0;
 }
 
@@ -287,8 +270,8 @@ void memory::start()
     }
   }
 
-  this->leakage_power.leakage_power_start();
-  this->idle_power.dynamic_power_start();
+  this->background_power.leakage_power_start();
+  this->background_power.dynamic_power_start();
   this->last_access_timestamp = -1;
 }
 
diff --git a/gvsoc/gvsoc/models/utils/composite_impl.cpp b/gvsoc/gvsoc/models/utils/composite_impl.cpp
index 907075e82..d5cf33847 100644
--- a/gvsoc/gvsoc/models/utils/composite_impl.cpp
+++ b/gvsoc/gvsoc/models/utils/composite_impl.cpp
@@ -37,6 +37,7 @@ class composite : public vp::component
 
     int build();
     void start();
+    void power_supply_set(int state);
 
     void dump_traces(FILE *file);
 
@@ -83,6 +84,10 @@ void composite::add_port(std::string name, vp::port *port)
     this->ports[name] = port;
 }
 
+void composite::power_supply_set(int state)  // No-op: the requested supply state is currently ignored by this component
+{
+    //printf("%s power set %d\n", this->get_path().c_str(), state);
+}
 
 
 extern "C" vp::component *vp_constructor(js::config *config)
diff --git a/gvsoc/gvsoc_gap/models/gap9/CMakeLists.txt b/gvsoc/gvsoc_gap/models/gap9/CMakeLists.txt
index 9f4db84d7..693e03c7d 100644
--- a/gvsoc/gvsoc_gap/models/gap9/CMakeLists.txt
+++ b/gvsoc/gvsoc_gap/models/gap9/CMakeLists.txt
@@ -1 +1,8 @@
 add_subdirectory(cpu)
+
+set(CLUSTER_PREFIX "gap9")
+
+vp_model(NAME cluster
+    PREFIX ${CLUSTER_PREFIX}
+    SOURCES "cluster.cpp"
+    )
diff --git a/gvsoc/gvsoc_gap/models/gap9/cluster.cpp b/gvsoc/gvsoc_gap/models/gap9/cluster.cpp
new file mode 100644
index 000000000..a9da7bf70
--- /dev/null
+++ b/gvsoc/gvsoc_gap/models/gap9/cluster.cpp
@@ -0,0 +1,232 @@
+/*
+ * Copyright (C) 2020 GreenWaves Technologies, SAS, ETH Zurich and
+ *                    University of Bologna
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* 
+ * Authors: Germain Haugou, GreenWaves Technologies (germain.haugou@greenwaves-technologies.com)
+ */
+
+#include <vp/vp.hpp>
+
+
+
+class cluster : public vp::component
+{
+
+public:
+    cluster(js::config *config);
+
+    vp::port *get_slave_port(std::string name) { return this->ports[name]; }
+    vp::port *get_master_port(std::string name) { return this->ports[name]; }
+
+    void add_slave_port(std::string name, vp::slave_port *port) { this->add_port(name, port); }
+    void add_master_port(std::string name, vp::master_port *port) { this->add_port(name, port); }
+
+    int build();
+    void start();
+    void reset(bool active);
+    void power_supply_set(int state);
+
+    void dump_traces(FILE *file);
+
+    static void cluster_clock_gating_en_sync(void *__this, bool value);
+    static void timer_busy_sync(void *__this, bool value, int id);
+    static void ne16_busy_sync(void *__this, bool value);
+    static void ico_busy_sync(void *__this, bool value);
+    static void dma_busy_sync(void *__this, bool value);
+    static void cores_busy_sync(void *__this, bool value, int id);
+
+private:
+    void add_port(std::string name, vp::port *port);
+    std::map<std::string, vp::port *> ports;
+    void check_clock_gating();
+
+    vp::wire_slave<bool>          cluster_clock_gating_en_itf;
+    vp::wire_slave<bool>          timer_busy_itf[2];
+    vp::wire_slave<bool>          ne16_busy_itf;
+    vp::wire_slave<bool>          ico_busy_itf;
+    vp::wire_slave<bool>          dma_busy_itf;
+    vp::wire_slave<bool>          cores_busy_itf[9];
+
+    int timer_busy;
+    bool ne16_busy;
+    bool ico_busy;
+    bool dma_busy;
+    int cores_busy;
+
+    bool busy_sync;
+
+    bool clock_gating_en;
+
+    vp::trace     trace;
+
+    vp::power::power_source background_power;
+};
+
+
+
+cluster::cluster(js::config *config)
+    : vp::component(config)
+{
+}
+
+
+void cluster::dump_traces(FILE *file)
+{
+    this->power.get_power_trace()->dump(file);
+}
+
+
+void cluster::check_clock_gating()  // Re-evaluates the cluster supply state from the busy inputs and the clock-gating enable
+{
+    this->trace.msg(vp::trace::LEVEL_DEBUG, "Checking cluster clock gating (timer: 0x%x, ne16: %d, ico: %d, dma: %d, cores: 0x%x)\n",
+        this->timer_busy, this->ne16_busy, this->ico_busy, this->dma_busy, this->cores_busy);
+    // The cluster counts as busy as soon as any of the tracked sources reports activity.
+    bool busy = this->timer_busy || this->ne16_busy || this->ico_busy || this->dma_busy || this->cores_busy;
+    // NOTE(review): only a change of the aggregated busy level is acted upon; toggling clock_gating_en while busy is unchanged does not update the supply state - confirm this is intended.
+    if (busy != this->busy_sync)
+    {
+        if (this->clock_gating_en && !busy)
+        {
+            this->power.power_supply_set_all(2);  // presumably the clock-gated state - TODO confirm against the power engine
+        }
+        else
+        {
+            this->power.power_supply_set_all(3);  // presumably the fully-on (clocked) state - TODO confirm
+        }
+        this->busy_sync = busy;  // remember the busy level that was last acted upon
+    }
+}
+
+void cluster::cluster_clock_gating_en_sync(void *__this, bool value)  // Handler of the "cluster_clock_gating_en" input wire
+{
+    cluster *self = static_cast<cluster *>(__this);
+    self->clock_gating_en = value;
+    self->check_clock_gating();  // re-run the gating decision with the new enable
+}
+
+void cluster::timer_busy_sync(void *__this, bool value, int id)  // Handler of the "timer<id>_busy" input wires
+{
+    cluster *self = static_cast<cluster *>(__this);
+    self->timer_busy = value ? (self->timer_busy | (1 << id)) : (self->timer_busy & ~(1 << id));  // bit id mirrors timer id
+    self->check_clock_gating();
+}
+
+void cluster::ne16_busy_sync(void *__this, bool value)  // Handler of the "ne16_busy" input wire
+{
+    static_cast<cluster *>(__this)->ne16_busy = value;      // latch the level so check_clock_gating() accounts for it
+    static_cast<cluster *>(__this)->check_clock_gating();   // re-run the gating decision, as dma_busy_sync does
+}
+
+void cluster::ico_busy_sync(void *__this, bool value)  // Handler of the "ico_busy" input wire
+{
+    static_cast<cluster *>(__this)->ico_busy = value;       // latch the level so check_clock_gating() accounts for it
+    static_cast<cluster *>(__this)->check_clock_gating();   // re-run the gating decision, as dma_busy_sync does
+}
+
+void cluster::dma_busy_sync(void *__this, bool value)  // Handler of the "dma_busy" input wire
+{
+    cluster *self = static_cast<cluster *>(__this);
+    self->dma_busy = value;
+    self->check_clock_gating();  // re-run the gating decision with the new DMA level
+}
+
+void cluster::cores_busy_sync(void *__this, bool value, int id)  // Handler of the "core_busy_<id>" input wires
+{
+    cluster *self = static_cast<cluster *>(__this);
+    self->cores_busy = value ? (self->cores_busy | (1 << id)) : (self->cores_busy & ~(1 << id));  // bit id mirrors core id
+    self->check_clock_gating();
+}
+
+void cluster::reset(bool active)
+{
+    // State is only cleared when reset is asserted; nothing happens on release.
+    if (!active) return;
+    // Per-source busy state: bitmasks for timers and cores, plain flags for the rest.
+    timer_busy = 0;
+    cores_busy = 0;
+    ne16_busy = false;
+    ico_busy = false;
+    dma_busy = false;
+    busy_sync = false;        // busy level last acted upon by check_clock_gating()
+    clock_gating_en = false;
+}
+
+
+int cluster::build()  // Declares the trace, the clock-gating/busy input ports and the background power source; always returns 0
+{
+    traces.new_trace("trace", &trace, vp::DEBUG);
+
+    this->cluster_clock_gating_en_itf.set_sync_meth(&cluster::cluster_clock_gating_en_sync);
+    new_slave_port("cluster_clock_gating_en", &this->cluster_clock_gating_en_itf);
+
+    for (int i=0; i<2; i++)  // one busy input per timer, named "timer0_busy" and "timer1_busy"
+    {
+        this->timer_busy_itf[i].set_sync_meth_muxed(&cluster::timer_busy_sync, i);  // presumably i is handed back as the handler's id argument - confirm
+        new_slave_port("timer" + std::to_string(i) + "_busy", &this->timer_busy_itf[i]);
+    }
+
+    this->ne16_busy_itf.set_sync_meth(&cluster::ne16_busy_sync);
+    new_slave_port("ne16_busy", &this->ne16_busy_itf);
+
+    this->ico_busy_itf.set_sync_meth(&cluster::ico_busy_sync);
+    new_slave_port("ico_busy", &this->ico_busy_itf);
+
+    this->dma_busy_itf.set_sync_meth(&cluster::dma_busy_sync);
+    new_slave_port("dma_busy", &this->dma_busy_itf);
+
+    for (int i=0; i<9; i++)  // one busy input per core, named "core_busy_0" .. "core_busy_8" (naming differs from the timer ports)
+    {
+        this->cores_busy_itf[i].set_sync_meth_muxed(&cluster::cores_busy_sync, i);
+        new_slave_port("core_busy_" + std::to_string(i), &this->cores_busy_itf[i]);
+    }
+
+    this->power.new_power_source("background", &this->background_power, this->get_js_config()->get("power_models/background"));
+
+    this->create_comps();  // NOTE(review): presumably instantiates, exposes and binds the children described in the component config - confirm
+    this->create_ports();
+    this->create_bindings();
+
+    return 0;
+}
+
+
+void cluster::start()  // Starts both the leakage and the dynamic contribution of the background power source
+{
+    this->background_power.leakage_power_start();
+    this->background_power.dynamic_power_start();
+}
+
+
+
+void cluster::add_port(std::string name, vp::port *port)  // Registers port under name in the ports map; asserts that port is not NULL
+{
+    vp_assert_always(port != NULL, this->get_trace(), "Adding NULL port\n");
+    //vp_assert_always(this->ports[name] == NULL, this->get_trace(), "Adding already existing port\n");
+    this->ports[name] = port;  // the duplicate check above is disabled, so an existing entry with the same name is overwritten
+}
+
+void cluster::power_supply_set(int state)
+{
+    // No-op: the requested supply state is currently ignored here.
+    //printf("%s power set %d\n", this->get_path().c_str(), state);
+}
+
+
+extern "C" vp::component *vp_constructor(js::config *config)
+{
+    return new cluster(config);
+}
diff --git a/gvsoc/gvsoc_gap/models/pulp/cluster/cluster_ctrl_v2_impl.cpp b/gvsoc/gvsoc_gap/models/pulp/cluster/cluster_ctrl_v2_impl.cpp
index dc1c3bca0..8e44c267c 100644
--- a/gvsoc/gvsoc_gap/models/pulp/cluster/cluster_ctrl_v2_impl.cpp
+++ b/gvsoc/gvsoc_gap/models/pulp/cluster/cluster_ctrl_v2_impl.cpp
@@ -68,6 +68,8 @@ class cluster_ctrl : public vp::component
   uint32_t dbg_halt_mask;
   uint32_t dbg_halt_status;
   uint32_t dbg_halt_status_sync;
+
+  vp::wire_master<bool>  clock_gating_en_itf;
 };
 
 cluster_ctrl::cluster_ctrl(js::config *config)
@@ -111,6 +113,10 @@ vp::io_req_status_e cluster_ctrl::req(void *__this, vp::io_req *req)
   }
   else if (offset == ARCHI_CLUSTER_CTRL_CLUSTER_CLK_GATE)
   {
+    if (_this->clock_gating_en_itf.is_bound())
+    {
+      _this->clock_gating_en_itf.sync((*data) & 1);
+    }
     return vp::IO_REQ_OK;
   }
   else if (offset == ARCHI_CLUSTER_CTRL_DBG_STATUS)
@@ -258,6 +264,8 @@ int cluster_ctrl::build()
   in.set_req_meth(&cluster_ctrl::req);
   new_slave_port("input", &in);
 
+  this->new_master_port("clock_gating_en", &this->clock_gating_en_itf);
+
   for (int i = 0; i<nb_core; i++)
   {
     cores[i].bootaddr = 0x57575757;
diff --git a/gvsoc/gvsoc_gap/models/pulp/mchan/mchan_v7_impl.cpp b/gvsoc/gvsoc_gap/models/pulp/mchan/mchan_v7_impl.cpp
index 00fa69358..d2bc7db97 100644
--- a/gvsoc/gvsoc_gap/models/pulp/mchan/mchan_v7_impl.cpp
+++ b/gvsoc/gvsoc_gap/models/pulp/mchan/mchan_v7_impl.cpp
@@ -196,6 +196,7 @@ class mchan : public vp::component
   void account_transfered_bytes(Mchan_cmd *cmd, int bytes);
   void send_req_to_ext(Mchan_cmd *cmd, vp::io_req *req);
   void handle_ext_write_req_end(Mchan_cmd *cmd, vp::io_req *req);
+  void cmd_start(int cmd_id);
 
   vp::trace     trace;
 
@@ -239,8 +240,13 @@ class mchan : public vp::component
   int64_t *loc_port_ready_cycle;
 
   bool ext_is_stalled;
+  int nb_cmd_started;
 
   vp::trace     cmd_events[MCHAN_NB_COUNTERS];
+
+  vp::wire_master<bool> busy_itf;
+  vp::power::power_source background_power;
+  vp::power::power_source active_power;
 };
 
 void Mchan_channel::reset()
@@ -396,8 +402,7 @@ bool Mchan_channel::check_command(Mchan_cmd *cmd)
   top->trace.msg(vp::trace::LEVEL_TRACE, "Incrementing counter (id: %d, bytes: %d, remaining bytes: %d)\n", current_counter, cmd->size, top->pending_bytes[current_counter]);
 
   // Enqueue the command to the core queue
-  uint8_t one = 1;
-  this->top->cmd_events[cmd->counter_id].event(&one);
+  this->top->cmd_start(cmd->counter_id);
 
   pending_cmds->push(cmd);
 
@@ -526,6 +531,20 @@ void Mchan_channel::trigger_event(Mchan_cmd *cmd)
   }
 }
 
+void mchan::cmd_start(int cmd_id)  // Called from Mchan_channel::check_command with the command's counter id
+{
+    uint8_t one = 1;
+    this->cmd_events[cmd_id].event(&one);  // set the trace event of this counter to 1
+    this->nb_cmd_started++;
+    if (this->nb_cmd_started == 1)  // first command in flight: the DMA goes from idle to busy
+    {
+        if (this->busy_itf.is_bound())
+        {
+            this->active_power.dynamic_power_start();  // NOTE(review): active power is only accounted when the busy port is bound - confirm this coupling is intended
+            this->busy_itf.sync(1);
+        }
+    }
+}
 
 void mchan::ext_grant(void *__this, vp::io_req *req)
 {
@@ -876,6 +895,10 @@ void mchan::send_req()
   {
     ext_is_stalled = true;
   }
+  else
+  {
+    trace.force_warning("Got error during transfer (addr: 0x%lx, size: 0x%x)\n", cmd->source, size);
+  }
 }
 
 void mchan::check_ext_read_handler(void *__this, vp::clock_event *event)
@@ -920,8 +943,17 @@ void mchan::check_ext_write_handler(void *__this, vp::clock_event *event)
 
 void mchan::handle_cmd_termination(Mchan_cmd *cmd)
 {
-  this->cmd_events[cmd->counter_id].event(NULL);
-  free_command(cmd);
+    this->cmd_events[cmd->counter_id].event(NULL);
+    this->nb_cmd_started--;
+    if (this->nb_cmd_started == 0)
+    {
+        if (this->busy_itf.is_bound())
+        {
+            this->active_power.dynamic_power_stop();
+            this->busy_itf.sync(0);
+        }
+    }
+    free_command(cmd);
 }
 
 void mchan::account_transfered_bytes(Mchan_cmd *cmd, int bytes)
@@ -1148,6 +1180,7 @@ void mchan::check_queue()
 int mchan::build()
 {
   traces.new_trace("trace", &this->trace, vp::DEBUG);
+  new_master_port("busy", &this->busy_itf);
 
   for (int i=0; i<nb_channels; i++)
   {
@@ -1159,11 +1192,16 @@ int mchan::build()
     traces.new_trace_event("channel_" + std::to_string(i), &this->cmd_events[i], 8);
   }
 
+  this->power.new_power_source("background", &this->background_power, this->get_js_config()->get("**/power_models/background"));
+  this->power.new_power_source("active", &this->active_power, this->get_js_config()->get("**/power_models/active"));
+
   return 0;
 }
 
 void mchan::start()
 {
+    this->background_power.leakage_power_start();
+    this->background_power.dynamic_power_start();
 }
 
 void mchan::reset(bool active)
@@ -1187,6 +1225,7 @@ void mchan::reset(bool active)
       loc_port_ready_cycle[i] = 0;
     }
 
+    this->nb_cmd_started = 0;
     first_alloc_pending_req = NULL;
     last_alloc_pending_req = NULL;
     nb_core_read_cmd = 0;
diff --git a/gvsoc/gvsoc_gap/models/pulp/ne16/src/ne16_debug.cpp b/gvsoc/gvsoc_gap/models/pulp/ne16/src/ne16_debug.cpp
index 23131fa14..4bff46f71 100644
--- a/gvsoc/gvsoc_gap/models/pulp/ne16/src/ne16_debug.cpp
+++ b/gvsoc/gvsoc_gap/models/pulp/ne16/src/ne16_debug.cpp
@@ -23,13 +23,13 @@
 void Ne16::debug_x_buffer() {
   if(this->mode_linear) {
     std::ostringstream stringStream;
-    stringStream << "x_buffer[32,16] = \n" << (this->trace_format?std::hex:std::dec) << std::setw(2) << this->x_buffer_linear << std::dec << "\n";
+    stringStream << "x_buffer[32,16] = \n" << (this->trace_format?std::hex:std::dec) << this->x_buffer_linear << std::dec << "\n";
     std::string copyOfStr = stringStream.str();
     this->trace.msg(vp::trace::LEVEL_DEBUG, copyOfStr.c_str());
   }
   else {
     std::ostringstream stringStream;
-    stringStream << "x_buffer[5,5,16] = \n" << (this->trace_format?std::hex:std::dec) << std::setw(2) << this->x_buffer << std::dec << "\n";
+    stringStream << "x_buffer[5,5,16] = \n" << (this->trace_format?std::hex:std::dec) << this->x_buffer << std::dec << "\n";
     std::string copyOfStr = stringStream.str();
     this->trace.msg(vp::trace::LEVEL_DEBUG, copyOfStr.c_str());
   }
@@ -47,7 +47,7 @@ void Ne16::debug_x_array() {
   // }
   // else {
     std::ostringstream stringStream;
-    stringStream << "x_array[9,9,16] = \n" << xt::print_options::threshold(10000) << (this->trace_format?std::hex:std::dec) << std::setw(2) << this->x_array << std::dec << "\n";
+    stringStream << "x_array[9,9,16] = \n" << xt::print_options::threshold(10000) << (this->trace_format?std::hex:std::dec) << this->x_array << std::dec << "\n";
     std::string copyOfStr = stringStream.str();
     this->trace.msg(vp::trace::LEVEL_DEBUG, copyOfStr.c_str());
   // }
@@ -55,7 +55,7 @@ void Ne16::debug_x_array() {
 
 void Ne16::debug_accum(){
   std::ostringstream stringStream;
-  stringStream << "accum[9,32] = \n" << (this->trace_format?std::hex:std::dec) << std::setw(8) << xt::cast<int32_t>(this->accum) << std::dec << "\n";
+  stringStream << "accum[9,32] = \n" << (this->trace_format?std::hex:std::dec) << xt::cast<int32_t>(this->accum) << std::dec << "\n";
   std::string copyOfStr = stringStream.str();
   this->trace.msg(vp::trace::LEVEL_DEBUG, copyOfStr.c_str());
 }
@@ -69,7 +69,7 @@ void Ne16::debug_accum(){
 
 void Ne16::debug_psum_block(){
   std::ostringstream stringStream;
-  stringStream << "psum_block[9,9] = \n" << (this->trace_format?std::hex:std::dec) << std::setw(8) << xt::cast<int32_t>(this->psum_block) << std::dec << "\n";
+  stringStream << "psum_block[9,9] = \n" << (this->trace_format?std::hex:std::dec) << xt::cast<int32_t>(this->psum_block) << std::dec << "\n";
   std::string copyOfStr = stringStream.str();
   this->trace.msg(vp::trace::LEVEL_DEBUG, copyOfStr.c_str());
 }
diff --git a/gvsoc/gvsoc_gap/models/pulp/pmu/pmu_v4_impl.cpp b/gvsoc/gvsoc_gap/models/pulp/pmu/pmu_v4_impl.cpp
index ab9d559a2..a048ba4ef 100644
--- a/gvsoc/gvsoc_gap/models/pulp/pmu/pmu_v4_impl.cpp
+++ b/gvsoc/gvsoc_gap/models/pulp/pmu/pmu_v4_impl.cpp
@@ -236,6 +236,7 @@ class pmu_icu : public pmu_picl_slave
 private:
   pmu *top;
   vp::wire_master<bool>  reset_itf;
+  vp::wire_master<int>  power_itf;
   pmu_icu_state states[16];
   int index;
   int current_supply_state;
@@ -706,6 +707,11 @@ void pmu_icu::icu_ctrl_req(bool is_write, uint16_t pwdata)
     }
   }
 
+  if (this->power_itf.is_bound())
+  {
+    this->power_itf.sync(state->supply == MAESTRO_ICU_SUPPLY_ON);
+  }
+
   this->current_supply_state = state->supply;
 
   top->picl_reply();
@@ -797,6 +803,7 @@ pmu_icu::pmu_icu(pmu *top, int index)
 : pmu_picl_slave(top), top(top), index(index)
 {
   top->new_master_port("icu" + std::to_string(index) + "_reset", &this->reset_itf);
+  top->new_master_port("icu" + std::to_string(index) + "_power", &this->power_itf);
 
   for (int i=0; i<16; i++)
   {
diff --git a/gvsoc/gvsoc_gap/models/pulp/timer/timer_v2_impl.cpp b/gvsoc/gvsoc_gap/models/pulp/timer/timer_v2_impl.cpp
index a1c9c4db2..bcc09ba9c 100644
--- a/gvsoc/gvsoc_gap/models/pulp/timer/timer_v2_impl.cpp
+++ b/gvsoc/gvsoc_gap/models/pulp/timer/timer_v2_impl.cpp
@@ -61,8 +61,10 @@ class timer : public vp::component
   uint64_t get_compare_value(bool is_64, int counter);
   uint64_t get_value(bool is_64, int counter);
   void set_value(bool is_64, int counter, uint64_t new_value);
+  void set_enable(int counter, bool enabled);
 
   vp::wire_master<bool> irq_itf[2];
+  vp::wire_master<bool> busy_itf;
   vp::clock_slave ref_clock_itf;
 
   uint32_t value[2];
@@ -143,6 +145,16 @@ void timer::set_value(bool is_64, int counter, uint64_t new_value)
   else value[counter] = new_value;
 }
 
+void timer::set_enable(int counter, bool enabled)  // Enables/disables one counter and publishes the timer-wide busy level
+{
+  this->is_enabled[counter] = enabled;
+  // One busy wire serves both counters: the timer stays busy while either counter is still enabled.
+  if (this->busy_itf.is_bound())
+  {
+    this->busy_itf.sync(this->is_enabled[0] || this->is_enabled[1]);
+  }
+}
+
 void timer::check_state_counter(bool is_64, int counter)
 {
   if (is_enabled[counter] && get_compare_value(is_64, counter) == get_value(is_64, counter))
@@ -165,7 +177,7 @@ void timer::check_state_counter(bool is_64, int counter)
 
     if (one_shot[counter]) {
       this->trace.msg(vp::trace::LEVEL_DEBUG, "Reached one-shot end (timer: %d)\n", counter);
-      is_enabled[counter] = false;
+      this->set_enable(counter, false);
     }
 
   }
@@ -300,7 +312,7 @@ vp::io_req_status_e timer::handle_compare(int counter, uint32_t *data, unsigned
 
 void timer::depack_config(int counter, uint32_t configuration)
 {
-  is_enabled[counter] = (configuration >> TIMER_CFG_LO_ENABLE_BIT) & 1;
+  this->set_enable(counter, (configuration >> TIMER_CFG_LO_ENABLE_BIT) & 1);
   irq_enabled[counter] = (configuration >> TIMER_CFG_LO_IRQEN_BIT) & 1;
   iem[counter] = (configuration >> TIMER_CFG_LO_IEM_BIT) & 1;
   cmp_clr[counter] = (configuration >> TIMER_CFG_LO_MODE_BIT) & 1;
@@ -385,6 +397,8 @@ int timer::build()
   new_master_port("irq_itf_0", &irq_itf[0]);
   new_master_port("irq_itf_1", &irq_itf[1]);
 
+  new_master_port("busy", &busy_itf);
+
   ref_clock_itf.set_sync_meth(&timer::ref_clock_sync);
   new_slave_port("ref_clock", &ref_clock_itf);
 
diff --git a/gvsoc/gvsoc_gap/models/pulp/udma/ffc/udma_ffc_v1.cpp b/gvsoc/gvsoc_gap/models/pulp/udma/ffc/udma_ffc_v1.cpp
index f0b448356..c6dd9d543 100644
--- a/gvsoc/gvsoc_gap/models/pulp/udma/ffc/udma_ffc_v1.cpp
+++ b/gvsoc/gvsoc_gap/models/pulp/udma/ffc/udma_ffc_v1.cpp
@@ -22,9 +22,9 @@
 using namespace std::placeholders;
 
 /* delay needed to replicate real performances */
-/* this should be 1 because ffc is 1 data/cycle, but 14 is the closest value
+/* this should be 1 because ffc is 1 data/cycle, but 2 is the closest value
  * to replicate performance */
-#define FFC_DELAY_CYCLES (14)
+#define FFC_DELAY_CYCLES (2)
 
 Ffc_periph::Ffc_periph(udma *top, int id, int itf_id) : Udma_periph(top, id)
 {
@@ -47,6 +47,9 @@ Ffc_periph::Ffc_periph(udma *top, int id, int itf_id) : Udma_periph(top, id)
 
     /* setup event handlers */
     this->event_convert = top->event_new(this, Ffc_periph::handle_event);
+
+    /* Busy signal for VCD tracing */
+    this->top->new_reg(itf_name + "/busy", &this->busy, 8);
 }
 
 
@@ -56,6 +59,9 @@ void Ffc_periph::reset(bool active)
 
     this->rx_channel->reset(active);
     this->tx_channel->reset(active);
+
+    // Since the busy signal is displayed as a state, we need to release it when the FFC is not busy.
+    this->busy.release();
 }
 
 
@@ -96,6 +102,7 @@ vp::io_req_status_e Ffc_periph::custom_req(vp::io_req *req, uint64_t offset)
             this->trace.msg(vp::trace::LEVEL_TRACE, "Received START access\n");
             /* start converting data */
             this->enqueue_event();
+            this->busy.set(1);
             break;
         default:
             break;
@@ -207,6 +214,8 @@ void Ffc_periph::handle_event(void* __this, vp::clock_event* event)
             {
                 /* done with conversion */
                 _this->state = FFC_STATE_IDLE;
+                // Since the busy signal is displayed as a state, we need to release it when the FFC is not busy.
+                _this->busy.release();
             }
             else if (!_this->ffc_queue.empty())
             {
@@ -570,6 +579,7 @@ void Ffc_periph::convert_to_fixed(uint8_t* src,
                         printf("Invalid float type\n");
                         break;
                 }
+
                 this->push_data((uint8_t*) &dst, 4);
             }
             break;
@@ -583,7 +593,7 @@ void Ffc_periph::enqueue_event(void)
 {
     if (!(this->event_convert)->is_enqueued())
     {
-        this->top->get_periph_clock()->enqueue(this->event_convert, FFC_DELAY_CYCLES);
+        this->top->event_enqueue(this->event_convert, FFC_DELAY_CYCLES);
     }
 }
 
diff --git a/gvsoc/gvsoc_gap/models/pulp/udma/ffc/udma_ffc_v1.hpp b/gvsoc/gvsoc_gap/models/pulp/udma/ffc/udma_ffc_v1.hpp
index 8b142491f..345c65715 100644
--- a/gvsoc/gvsoc_gap/models/pulp/udma/ffc/udma_ffc_v1.hpp
+++ b/gvsoc/gvsoc_gap/models/pulp/udma/ffc/udma_ffc_v1.hpp
@@ -281,6 +281,9 @@ class Ffc_periph : public Udma_periph
 
         /** FFC TX channel, used to transmit data to the FFC */
         Ffc_tx_channel *tx_channel;
+
+        /** Busy signal for VCD tracing */
+        vp::reg_8 busy;
 };
 
 
diff --git a/gvsoc/gvsoc_gap/models/pulp/udma/i2c/udma_i2c_v2.cpp b/gvsoc/gvsoc_gap/models/pulp/udma/i2c/udma_i2c_v2.cpp
index 0a351268a..aad3c8d80 100644
--- a/gvsoc/gvsoc_gap/models/pulp/udma/i2c/udma_i2c_v2.cpp
+++ b/gvsoc/gvsoc_gap/models/pulp/udma/i2c/udma_i2c_v2.cpp
@@ -108,6 +108,7 @@ void I2c_tx_channel::handle_pending_word(void *__this, vp::clock_event *event)
   if (_this->periph->waiting_rx)
   {
     _this->periph->prev_scl ^= 1;
+    _this->periph->trace.msg("Sync (scl: %d, sda: %d)\n", _this->periph->prev_scl, 0);    
     _this->periph->i2c_itf.sync(_this->periph->prev_scl, 0);
 
     if (_this->periph->prev_scl)
diff --git a/gvsoc/gvsoc_gap/models/pulp/udma/i2c/v4/udma_i2c.cpp b/gvsoc/gvsoc_gap/models/pulp/udma/i2c/v4/udma_i2c.cpp
index 8a589314f..411559042 100644
--- a/gvsoc/gvsoc_gap/models/pulp/udma/i2c/v4/udma_i2c.cpp
+++ b/gvsoc/gvsoc_gap/models/pulp/udma/i2c/v4/udma_i2c.cpp
@@ -38,7 +38,8 @@ I2c_periph::I2c_periph(udma *top, int id, int itf_id) :
                 _2),
             std::bind(&I2c_periph::i2c_cancel_event,
                 this,
-                _1)
+                _1),
+                "i2c" + std::to_string(itf_id)
             ),
     is_waiting_i2c_start(false),
     is_waiting_i2c_data(false),
@@ -295,7 +296,7 @@ void I2c_periph::i2c_sync(void *__this, int scl, int sda)
 
 void I2c_periph::ucode_handler(ucode_data_t data)
 {
-    //I2C_PERIPH_FPRINTF("[I2C] ucode_handler: data.id=%d\n", data.id);
+    I2C_PERIPH_FPRINTF("[I2C] ucode_handler: data.id=0x%x\n", data.id);
     switch(data.id)
     {
         case CMD_MISC_NOP:
@@ -367,11 +368,10 @@ void I2c_periph::ucode_handler(ucode_data_t data)
             }
             break;
         case CMD_LEAD_RECV:
-            if (this->repeat_downcounter == 0)
+            if (this->repeat_downcounter > 0)
             {
-                this->repeat_downcounter = 1;
+                this->is_waiting_i2c_data = true;
             }
-            this->is_waiting_i2c_data = true;
             break;
         case CMD_LEAD_RECV_LAST:
             // TODO
@@ -576,7 +576,7 @@ void I2c_periph::i2c_helper_callback(i2c_operation_e id, i2c_status_e status, in
 
 void I2c_periph::i2c_start(void)
 {
-    if (!this->i2c_helper.is_busy())
+    if (1) //!this->i2c_helper.is_busy())
     {
         I2C_PERIPH_FPRINTF("Sending start directly\n");
         this->is_waiting_i2c_start = true;
diff --git a/gvsoc/gvsoc_gap/models/pulp/udma/udma_v4_addrgens.cpp b/gvsoc/gvsoc_gap/models/pulp/udma/udma_v4_addrgens.cpp
index 43d48b9cb..0dea6be56 100644
--- a/gvsoc/gvsoc_gap/models/pulp/udma/udma_v4_addrgens.cpp
+++ b/gvsoc/gvsoc_gap/models/pulp/udma/udma_v4_addrgens.cpp
@@ -134,6 +134,7 @@ void Udma_addrgen_linear::cfg_ctrl_req(uint64_t reg_offset, int size, uint8_t *v
 
 vp::io_req_status_e Udma_addrgen_linear::access(uint64_t offset, int size, uint8_t *value, bool is_write)
 {
+
     if (this->regmap.access(offset, size, value, is_write))
         return vp::IO_REQ_INVALID;
 
diff --git a/libs/gap_lib/jpeg/cluster.c b/libs/gap_lib/jpeg/cluster.c
index fd0644ae4..4f9fd4191 100644
--- a/libs/gap_lib/jpeg/cluster.c
+++ b/libs/gap_lib/jpeg/cluster.c
@@ -30,8 +30,14 @@
 #ifdef PMSIS_DRIVERS
     #define RT_USER_EVENT (CL_USER_EVENT)
     #define eu_evt_trig_from_id(x,y) (hal_eu_cluster_evt_trig_set(x,y))
+#if defined(__GAP9__)
+    #define eu_evt_maskWaitAndClr(x) (hal_cl_eu_evt_mask_wait_and_clear(x))
+#else
     #define eu_evt_maskWaitAndClr(x) (hal_cl_eu_evt_mask_wait_clear(x))
 #endif
+#else
+    #define RT_USER_EVENT 6
+#endif
 
 #define FLOAT2FIX(f)  ((int)((f) * (1 << 11)))
 #define FIXQ 11
diff --git a/libs/gap_lib/testbench/testbench.h b/libs/gap_lib/testbench/testbench.h
index 3eece6c29..5bbd70444 100644
--- a/libs/gap_lib/testbench/testbench.h
+++ b/libs/gap_lib/testbench/testbench.h
@@ -122,6 +122,12 @@ typedef enum
     PI_TESTBENCH_I2S_VERIF_RX_FILE_READER
 } pi_testbench_i2s_verif_start_config_type_e;
 
+typedef enum
+{
+    PI_TESTBENCH_I2S_VERIF_FILE_ENCODING_TYPE_ASIS = 0, // Keep as is (default)
+    PI_TESTBENCH_I2S_VERIF_FILE_ENCODING_TYPE_PLUSMINUS, // Assume file contains -1/+1 values (usable for PDM only)
+} pi_testbench_i2s_verif_start_config_file_encoding_type_e;
+
 // This structure can be used to describe what an I2S slot should do
 typedef struct
 {
@@ -142,6 +148,7 @@ typedef struct
             uint32_t filepath_len;
             uint8_t type;
             uint8_t width;
+            uint8_t encoding;
         } tx_file_dumper;
         struct
         {
@@ -150,6 +157,7 @@ typedef struct
             uint32_t filepath_len;
             uint8_t type;
             uint8_t width;
+            uint8_t encoding;
         } rx_file_reader;
     };
 
diff --git a/libs/gap_lib/testbench/testlib.c b/libs/gap_lib/testbench/testlib.c
index 721a38078..36cfc0209 100644
--- a/libs/gap_lib/testbench/testlib.c
+++ b/libs/gap_lib/testbench/testlib.c
@@ -10,7 +10,6 @@
 #include "pmsis.h"
 #include "testbench.h"
 #include "testlib.h"
-#include <bsp/ram/hyperram.h>
 #include <string.h>
 
 
@@ -1108,128 +1107,3 @@ int i2s_test_stop(i2s_test_t *test)
 
     return 0;
 }
-
-
-void testlib_hyperram_trafficgen_conf_init(testlib_hyperram_trafficgen_config_t *config)
-{
-    config->transfer_size = 8192;
-    config->itf = -1;
-    config->cs = -1;
-    config->frequency = -1;
-}
-
-
-int testlib_hyperram_trafficgen_init(testlib_hyperram_trafficgen_t *data, testlib_hyperram_trafficgen_config_t *config)
-{
-    struct pi_hyperram_conf conf;
-    pi_hyperram_conf_init(&conf);
-
-    if (config->itf != -1)
-    {
-        conf.hyper_itf = config->itf;
-    }
-    if (config->cs != -1)
-    {
-        conf.hyper_cs = config->cs;
-    }
-    if (config->frequency != -1)
-    {
-        conf.baudrate = config->frequency;
-    }
-
-    pi_open_from_conf(&data->dev, &conf);
-
-    if (pi_ram_open(&data->dev))
-        goto error0;
-
-    if (pi_ram_alloc(&data->dev, &data->hyper_addr, config->transfer_size))
-        goto error1;
-
-    data->transfer_size = config->transfer_size;
-
-    data->buffer = pi_l2_malloc(config->transfer_size);
-    if (data->buffer == NULL) goto error2;
-
-    for (int i=0; i<config->transfer_size/4; i++)
-    {
-        ((uint32_t *)data->buffer)[i] = i;
-    }
-
-    return 0;
-
-error2:
-    pi_ram_free(&data->dev, data->hyper_addr, config->transfer_size);
-error1:
-    pi_ram_close(&data->dev);
-error0:
-    return -1;
-}
-
-
-static void testlib_hyperram_callback(void *arg)
-{
-    testlib_hyperram_trafficgen_t *data = (testlib_hyperram_trafficgen_t *)arg;
-
-    if (data->end)
-    {
-        data->pending--;
-        if (data->pending == 0)
-        {
-            pi_task_push(&data->end_task);
-        }
-        return;
-    }
-
-    if (data->is_read)
-    {
-        data->is_read = 0;
-        pi_ram_read_async(&data->dev, data->hyper_addr, data->buffer, data->transfer_size, pi_task_callback(&data->read_task, testlib_hyperram_callback, (void *)data));
-    }
-    else
-    {
-        data->is_read = 1;
-        pi_ram_write_async(&data->dev, data->hyper_addr, data->buffer, data->transfer_size, pi_task_callback(&data->write_task, testlib_hyperram_callback, (void *)data));
-    }
-}
-
-
-int testlib_hyperram_trafficgen_start(testlib_hyperram_trafficgen_t *data)
-{
-    data->is_read = 0;
-    data->end = 0;
-    data->pending = 2;
-    pi_task_block(&data->end_task);
-    pi_ram_write_async(&data->dev, data->hyper_addr, data->buffer, data->transfer_size, pi_task_callback(&data->read_task, testlib_hyperram_callback, (void *)data));
-    pi_ram_read_async(&data->dev, data->hyper_addr, data->buffer, data->transfer_size, pi_task_callback(&data->write_task, testlib_hyperram_callback, (void *)data));
-    return 0;
-}
-
-
-int testlib_hyperram_trafficgen_stop(testlib_hyperram_trafficgen_t *data)
-{
-    int errors = 0;
-
-    data->end = 1;
-
-    pi_task_wait_on(&data->end_task);
-
-    for (int i=0; i<data->transfer_size/4; i++)
-    {
-        uint32_t expected = i;
-        if (expected != ((uint32_t *)data->buffer)[i])
-        {
-            errors++;
-        }
-    }
-
-    return errors;
-}
-
-
-int testlib_hyperram_trafficgen_deinit(testlib_hyperram_trafficgen_t *data)
-{
-    pi_ram_free(&data->dev, data->hyper_addr, data->transfer_size);
-    pi_ram_close(&data->dev);
-    return 0;
-}
-
diff --git a/libs/gap_lib/testbench/testlib.h b/libs/gap_lib/testbench/testlib.h
index 5d9470398..650af5d7e 100644
--- a/libs/gap_lib/testbench/testlib.h
+++ b/libs/gap_lib/testbench/testlib.h
@@ -13,6 +13,9 @@
 
 #include "pmsis.h"
 #include "testbench.h"
+#include "testlib_i2s.h"
+#include "testlib_uart.h"
+#include "testlib_i2c.h"
 
 #define I2S_SLOT_STATIC_INIT {0}
 
diff --git a/libs/gap_lib/testbench/testlib_hyper.c b/libs/gap_lib/testbench/testlib_hyper.c
new file mode 100644
index 000000000..6ac967bc9
--- /dev/null
+++ b/libs/gap_lib/testbench/testlib_hyper.c
@@ -0,0 +1,139 @@
+/* 
+ * Copyright (C) 2017 GreenWaves Technologies
+ * All rights reserved.
+ *
+ * This software may be modified and distributed under the terms
+ * of the BSD license.  See the LICENSE file for details.
+ *
+ */
+
+#include "pmsis.h"
+#include "testbench.h"
+#include "testlib.h"
+#include <bsp/ram/hyperram.h>
+#include <string.h>
+
+/* Fill a HyperRAM generator config with defaults; -1 leaves the value chosen by pi_hyperram_conf_init() untouched. */
+void testlib_hyperram_trafficgen_conf_init(testlib_hyperram_trafficgen_config_t *config)
+{
+    config->itf = -1;
+    config->cs = -1;
+    config->frequency = -1;
+    config->transfer_size = 8192;
+}
+
+/* Open the HyperRAM device and allocate the external-RAM area and the L2 buffer; returns 0, or -1 on failure. */
+int testlib_hyperram_trafficgen_init(testlib_hyperram_trafficgen_t *data, testlib_hyperram_trafficgen_config_t *config)
+{
+    struct pi_hyperram_conf conf;
+    pi_hyperram_conf_init(&conf);
+
+    if (config->itf != -1) /* -1 means: keep the value set by pi_hyperram_conf_init() */
+    {
+        conf.hyper_itf = config->itf;
+    }
+    if (config->cs != -1)
+    {
+        conf.hyper_cs = config->cs;
+    }
+    if (config->frequency != -1)
+    {
+        conf.baudrate = config->frequency;
+    }
+
+    pi_open_from_conf(&data->dev, &conf);
+
+    if (pi_ram_open(&data->dev))
+        goto error0;
+
+    if (pi_ram_alloc(&data->dev, &data->hyper_addr, config->transfer_size))
+        goto error1;
+
+    data->transfer_size = config->transfer_size;
+
+    data->buffer = pi_l2_malloc(config->transfer_size);
+    if (data->buffer == NULL) goto error2;
+
+    for (int i=0; i<config->transfer_size/4; i++) /* 32-bit index ramp, verified again by the stop function */
+    {
+        ((uint32_t *)data->buffer)[i] = i;
+    }
+
+    return 0;
+
+error2: /* the labels unwind in reverse order of acquisition */
+    pi_ram_free(&data->dev, data->hyper_addr, config->transfer_size);
+error1:
+    pi_ram_close(&data->dev);
+error0:
+    return -1;
+}
+
+/* Transfer-completion callback: re-arms the next transfer, or counts down towards end_task once stop was requested. */
+static void testlib_hyperram_callback(void *arg)
+{
+    testlib_hyperram_trafficgen_t *data = (testlib_hyperram_trafficgen_t *)arg;
+
+    if (data->end) /* set by testlib_hyperram_trafficgen_stop() */
+    {
+        data->pending--;
+        if (data->pending == 0) /* last in-flight transfer done: wake up the waiter */
+        {
+            pi_task_push(&data->end_task);
+        }
+        return;
+    }
+
+    if (data->is_read) /* is_read toggles on every call so that reads and writes alternate */
+    {
+        data->is_read = 0;
+        pi_ram_read_async(&data->dev, data->hyper_addr, data->buffer, data->transfer_size, pi_task_callback(&data->read_task, testlib_hyperram_callback, (void *)data));
+    }
+    else
+    {
+        data->is_read = 1;
+        pi_ram_write_async(&data->dev, data->hyper_addr, data->buffer, data->transfer_size, pi_task_callback(&data->write_task, testlib_hyperram_callback, (void *)data));
+    }
+}
+
+/* Start with one write (write_task) and one read (read_task) in flight, paired as the callback re-arms them so a task is only reused once completed. */
+int testlib_hyperram_trafficgen_start(testlib_hyperram_trafficgen_t *data)
+{
+    data->is_read = 0; /* the first completion (the write) re-arms a write on write_task */
+    data->end = 0;
+    data->pending = 2; /* number of transfers kept in flight, counted down after stop */
+    pi_task_block(&data->end_task);
+    pi_ram_write_async(&data->dev, data->hyper_addr, data->buffer, data->transfer_size, pi_task_callback(&data->write_task, testlib_hyperram_callback, (void *)data));
+    pi_ram_read_async(&data->dev, data->hyper_addr, data->buffer, data->transfer_size, pi_task_callback(&data->read_task, testlib_hyperram_callback, (void *)data));
+    return 0;
+}
+
+/* Stop the generator, wait for the in-flight transfers and return the number of corrupted 32-bit words. */
+int testlib_hyperram_trafficgen_stop(testlib_hyperram_trafficgen_t *data)
+{
+    uint32_t *words = (uint32_t *)data->buffer;
+    int mismatches = 0;
+
+    /* Once this flag is seen the callback only counts down instead of re-arming. */
+    data->end = 1;
+    pi_task_wait_on(&data->end_task);
+
+    /* The buffer was filled with its own word index at init time. */
+    for (int idx = 0; idx < data->transfer_size / 4; idx++)
+    {
+        if (words[idx] != (uint32_t)idx)
+        {
+            mismatches++;
+        }
+    }
+    return mismatches;
+}
+
+/* Release the L2 buffer and the HyperRAM area allocated by init, then close the device; always returns 0. */
+int testlib_hyperram_trafficgen_deinit(testlib_hyperram_trafficgen_t *data)
+{
+    pi_l2_free(data->buffer, data->transfer_size); /* allocated by pi_l2_malloc() in init, previously never freed */
+    pi_ram_free(&data->dev, data->hyper_addr, data->transfer_size);
+    pi_ram_close(&data->dev);
+    return 0;
+}
diff --git a/libs/gap_lib/testbench/testlib_i2c.c b/libs/gap_lib/testbench/testlib_i2c.c
new file mode 100644
index 000000000..69ce83bfd
--- /dev/null
+++ b/libs/gap_lib/testbench/testlib_i2c.c
@@ -0,0 +1,110 @@
+/* 
+ * Copyright (C) 2017 GreenWaves Technologies
+ * All rights reserved.
+ *
+ * This software may be modified and distributed under the terms
+ * of the BSD license.  See the LICENSE file for details.
+ *
+ */
+
+#include "pmsis.h"
+#include "testbench.h"
+#include "testlib.h"
+#include <string.h>
+
+/* Default I2C generator settings: interface 0, 32-byte transfers, baudrate 400000. */
+void testlib_i2c_trafficgen_conf_init(testlib_i2c_trafficgen_config_t *config)
+{
+    config->itf = 0;
+    config->baudrate = 400000;
+    config->transfer_size = 32;
+}
+
+/* Allocate and fill both TX buffers, then open the I2C device; returns 0, or -1 with the buffers released (do not call deinit then). */
+int testlib_i2c_trafficgen_init(testlib_i2c_trafficgen_t *data, testlib_i2c_trafficgen_config_t *config)
+{
+    struct pi_i2c_conf conf;
+
+    data->transfer_size = config->transfer_size;
+    data->tx_buffers[0] = pi_l2_malloc(config->transfer_size);
+    data->tx_buffers[1] = pi_l2_malloc(config->transfer_size);
+
+    if (data->tx_buffers[0] == NULL || data->tx_buffers[1] == NULL)
+        goto error;
+
+    for (int i=0; i<config->transfer_size; i++)
+    {
+        ((uint8_t *)data->tx_buffers[0])[i] = i;
+        ((uint8_t *)data->tx_buffers[1])[i] = i;
+    }
+
+    pi_i2c_conf_init(&conf);
+
+    conf.itf = config->itf;
+    //conf.max_baudrate = config->baudrate;
+    conf.cs = 0xA0;
+
+    pi_open_from_conf(&data->dev, &conf);
+    if (pi_i2c_open(&data->dev))
+        goto error;
+
+    return 0;
+
+error: /* free whatever was allocated so that a failed init does not leak L2 memory */
+    if (data->tx_buffers[0] != NULL) pi_l2_free(data->tx_buffers[0], data->transfer_size);
+    if (data->tx_buffers[1] != NULL) pi_l2_free(data->tx_buffers[1], data->transfer_size);
+    return -1;
+}
+
+/* TX completion callback: sends the current buffer again, or counts down towards tx_end_task once stop was requested. */
+static void testlib_i2c_tx_callback(void *arg)
+{
+    testlib_i2c_trafficgen_t *data = (testlib_i2c_trafficgen_t *)arg;
+
+    if (data->end) /* set by testlib_i2c_trafficgen_stop() */
+    {
+        data->tx_pending--;
+        if (data->tx_pending == 0) /* no write left in flight: wake up the waiter */
+        {
+            pi_task_push(&data->tx_end_task);
+        }
+        return;
+    }
+
+    pi_i2c_write_async(&data->dev, data->tx_buffers[data->tx_current_task], data->transfer_size, 0, pi_task_irq_callback(&data->tx_tasks[data->tx_current_task], testlib_i2c_tx_callback, (void *)data));
+    data->tx_current_task ^= 1; /* alternate between slot 0 and slot 1 */
+}
+
+/* Start the generator by queueing one write per TX buffer; the callback keeps re-arming them until stop. Always returns 0. */
+int testlib_i2c_trafficgen_start(testlib_i2c_trafficgen_t *data)
+{
+    data->end = 0;
+
+    data->tx_pending = 2; /* two writes are kept in flight */
+    data->tx_current_task = 0;
+    pi_task_block(&data->tx_end_task); /* armed here, pushed by the callback once tx_pending reaches 0 */
+    pi_i2c_write_async(&data->dev, data->tx_buffers[0], data->transfer_size, 0, pi_task_irq_callback(&data->tx_tasks[0], testlib_i2c_tx_callback, (void *)data));
+    pi_i2c_write_async(&data->dev, data->tx_buffers[1], data->transfer_size, 0, pi_task_irq_callback(&data->tx_tasks[1], testlib_i2c_tx_callback, (void *)data));
+
+    return 0;
+}
+int testlib_i2c_trafficgen_stop(testlib_i2c_trafficgen_t *data)
+{
+    /* Stop re-arming: each TX callback now only decrements tx_pending. */
+    data->end = 1;
+    /* Block until the callback pushes tx_end_task, i.e. no write is left in flight. */
+    pi_task_wait_on(&data->tx_end_task);
+
+    /* No data checking is done for I2C, so the error count is always 0. */
+    return 0;
+}
+
+/* Free both TX buffers and close the I2C device; always returns 0. */
+int testlib_i2c_trafficgen_deinit(testlib_i2c_trafficgen_t *data)
+{
+    for (int slot = 0; slot < 2; slot++)
+        pi_l2_free(data->tx_buffers[slot], data->transfer_size);
+    pi_i2c_close(&data->dev);
+    return 0;
+}
+
diff --git a/libs/gap_lib/testbench/testlib_i2c.h b/libs/gap_lib/testbench/testlib_i2c.h
new file mode 100644
index 000000000..c97cbfd84
--- /dev/null
+++ b/libs/gap_lib/testbench/testlib_i2c.h
@@ -0,0 +1,40 @@
+/* 
+ * Copyright (C) 2017 GreenWaves Technologies
+ * All rights reserved.
+ *
+ * This software may be modified and distributed under the terms
+ * of the BSD license.  See the LICENSE file for details.
+ *
+ */
+
+#pragma once
+
+#include "pmsis.h"
+
+typedef struct /* user settings; defaults come from testlib_i2c_trafficgen_conf_init() */
+{
+    int itf;           /* copied to pi_i2c_conf.itf by init */
+    int transfer_size; /* size in bytes of each TX buffer and of each write */
+    int baudrate;      /* NOTE(review): not applied yet, the assignment in init is commented out */
+} testlib_i2c_trafficgen_config_t;
+
+/* Run-time state of one I2C traffic generator. */
+typedef struct
+{
+    pi_device_t dev;
+    uint32_t frame;        /* NOTE(review): not referenced by testlib_i2c.c - confirm it is needed */
+    void *tx_buffers[2];   /* two L2 buffers, sent alternately */
+    pi_task_t tx_tasks[2]; /* one completion task per buffer slot */
+    int tx_pending;        /* writes still in flight, counted down once end is set */
+    int tx_current_task;   /* slot (0 or 1) used by the next re-armed write */
+    pi_task_t tx_end_task; /* pushed by the callback when tx_pending reaches 0 */
+    int transfer_size;     /* copy of the config transfer_size, also used as the free size */
+    int end;               /* set to 1 by stop, cleared by start */
+} testlib_i2c_trafficgen_t;
+
+
+void testlib_i2c_trafficgen_conf_init(testlib_i2c_trafficgen_config_t *config);
+int testlib_i2c_trafficgen_init(testlib_i2c_trafficgen_t *data, testlib_i2c_trafficgen_config_t *config);
+int testlib_i2c_trafficgen_start(testlib_i2c_trafficgen_t *data);
+int testlib_i2c_trafficgen_stop(testlib_i2c_trafficgen_t *data);
+int testlib_i2c_trafficgen_deinit(testlib_i2c_trafficgen_t *data);
diff --git a/libs/gap_lib/testbench/testlib_i2s.c b/libs/gap_lib/testbench/testlib_i2s.c
new file mode 100644
index 000000000..80d48128d
--- /dev/null
+++ b/libs/gap_lib/testbench/testlib_i2s.c
@@ -0,0 +1,202 @@
+/* 
+ * Copyright (C) 2017 GreenWaves Technologies
+ * All rights reserved.
+ *
+ * This software may be modified and distributed under the terms
+ * of the BSD license.  See the LICENSE file for details.
+ *
+ */
+
+#include "pmsis.h"
+#include "testbench.h"
+#include "testlib.h"
+#include <string.h>
+
+/* Default I2S generator settings: interface 0, sample rate 48000, 16 slots, 32-bit words, transfer size 128 per slot. */
+void testlib_i2s_trafficgen_conf_init(testlib_i2s_trafficgen_config_t *config)
+{
+    config->itf = 0;
+    config->sample_rate = 48000;
+    config->word_size = 32;
+    config->nb_slots = 16;
+    config->transfer_size = 128;
+}
+
+/* Open the I2S device (TDM, full duplex) and configure every slot for RX and TX; returns 0, -3 (open), -1 (alloc) or -4 (slot conf). */
+int testlib_i2s_trafficgen_init(testlib_i2s_trafficgen_t *data, testlib_i2s_trafficgen_config_t *config)
+{
+    struct pi_i2s_conf i2s_conf;
+    pi_i2s_conf_init(&i2s_conf);
+
+    i2s_conf.frame_clk_freq = config->sample_rate;
+    i2s_conf.itf = config->itf;
+    i2s_conf.word_size = config->word_size;
+    i2s_conf.channels = config->nb_slots;
+    i2s_conf.options = PI_I2S_OPT_TDM | PI_I2S_OPT_FULL_DUPLEX;
+
+    pi_open_from_conf(&data->dev, &i2s_conf);
+    if (pi_i2s_open(&data->dev))
+    {
+        printf("Error opening i2s\n");
+        return -3;
+    }
+
+    struct pi_i2s_channel_conf i2s_slot_conf;
+    pi_i2s_channel_conf_init(&i2s_slot_conf);
+    uint16_t frame = (1 << config->nb_slots) - 1; /* one bit per slot; uint16_t holds at most 16 slots */
+    data->frame = frame;
+    for (int i=0; i<config->nb_slots; i++) /* RX side of every slot */
+    {
+        i2s_slot_conf.options = PI_I2S_OPT_IS_RX | PI_I2S_OPT_PINGPONG | PI_I2S_OPT_ENABLED;
+        i2s_slot_conf.word_size = config->word_size;
+        i2s_slot_conf.block_size = config->transfer_size;
+
+        if (i == 0) /* buffers are allocated once, sized for all slots, and reused for the other slots */
+        {
+            data->transfer_size = config->transfer_size * config->nb_slots;
+            i2s_slot_conf.pingpong_buffers[0] = pi_l2_malloc(data->transfer_size);
+            i2s_slot_conf.pingpong_buffers[1] = pi_l2_malloc(data->transfer_size);
+            data->rx_buffers[0] = i2s_slot_conf.pingpong_buffers[0];
+            data->rx_buffers[1]= i2s_slot_conf.pingpong_buffers[1];
+
+            if (i2s_slot_conf.pingpong_buffers[0] == NULL || i2s_slot_conf.pingpong_buffers[1] == NULL)
+            {
+                printf("Error allocating memory\n");
+                return -1; /* NOTE(review): this and the later error returns leave the device open and keep any allocated buffer */
+            }
+        }
+
+        if (pi_i2s_frame_channel_conf_set(&data->dev, frame, i, &i2s_slot_conf))
+        {
+            printf("Error setting conf channel\n");
+            return -4;
+        }
+    }
+
+    pi_i2s_channel_conf_init(&i2s_slot_conf);
+    for (int i=0; i<config->nb_slots; i++) /* TX side of every slot */
+    {
+        i2s_slot_conf.options = PI_I2S_OPT_IS_TX | PI_I2S_OPT_PINGPONG | PI_I2S_OPT_ENABLED;
+        i2s_slot_conf.word_size = config->word_size;
+        i2s_slot_conf.block_size = config->transfer_size;
+
+        if (i == 0)
+        {
+            i2s_slot_conf.pingpong_buffers[0] = pi_l2_malloc(config->transfer_size * config->nb_slots);
+            i2s_slot_conf.pingpong_buffers[1] = pi_l2_malloc(config->transfer_size * config->nb_slots);
+            data->tx_buffers[0] = i2s_slot_conf.pingpong_buffers[0];
+            data->tx_buffers[1]= i2s_slot_conf.pingpong_buffers[1];
+
+            if (i2s_slot_conf.pingpong_buffers[0] == NULL || i2s_slot_conf.pingpong_buffers[1] == NULL)
+            {
+                printf("Error allocating memory\n");
+                return -1;
+            }
+            for (int i=0; i<config->transfer_size * config->nb_slots; i++) /* this i shadows the slot index; fills a byte ramp */
+            {
+                ((uint8_t *)i2s_slot_conf.pingpong_buffers[0])[i] = i;
+                ((uint8_t *)i2s_slot_conf.pingpong_buffers[1])[i] = i;
+            }
+
+        }
+
+        if (pi_i2s_frame_channel_conf_set(&data->dev, frame, i, &i2s_slot_conf))
+        {
+            printf("Error setting conf channel\n");
+            return -4;
+        }
+    }
+
+    return 0;
+}
+
+/* RX completion callback: queues another frame read, or counts down towards rx_end_task once stop was requested. */
+static void testlib_i2s_rx_callback(void *arg)
+{
+    testlib_i2s_trafficgen_t *data = (testlib_i2s_trafficgen_t *)arg;
+
+    if (data->end) /* set by testlib_i2s_trafficgen_stop() */
+    {
+        data->rx_pending--;
+        if (data->rx_pending == 0) /* no read left in flight: wake up the waiter */
+        {
+            pi_task_push(&data->rx_end_task);
+        }
+        return;
+    }
+
+    pi_i2s_frame_read_async(&data->dev, data->frame, pi_task_irq_callback(&data->rx_tasks[data->rx_current_task], testlib_i2s_rx_callback, (void *)data));
+    data->rx_current_task ^= 1; /* alternate between task 0 and task 1 */
+}
+
+static void testlib_i2s_tx_callback(void *arg) /* TX completion callback: queues another frame write until stop is requested */
+{
+    testlib_i2s_trafficgen_t *data = (testlib_i2s_trafficgen_t *)arg;
+
+    if (data->end) /* set by testlib_i2s_trafficgen_stop() */
+    {
+        data->tx_pending--;
+        if (data->tx_pending == 0) /* no write left in flight: wake up the waiter */
+        {
+            pi_task_push(&data->tx_end_task);
+        }
+        return;
+    }
+
+    pi_i2s_frame_write_async(&data->dev, data->frame, NULL, 0, pi_task_irq_callback(&data->tx_tasks[data->tx_current_task], testlib_i2s_tx_callback, (void *)data));
+    data->tx_current_task ^= 1; /* alternate between task 0 and task 1 */
+}
+
+/* Queue two frame writes and two frame reads, then start the I2S interface; returns 0, or -4 if the START ioctl fails. */
+int testlib_i2s_trafficgen_start(testlib_i2s_trafficgen_t *data)
+{
+    data->end = 0;
+
+    data->tx_pending = 2; /* two writes are kept in flight */
+    data->tx_current_task = 0;
+    pi_task_block(&data->tx_end_task); /* pushed by the TX callback once tx_pending reaches 0 */
+    pi_i2s_frame_write_async(&data->dev, data->frame, NULL, 0, pi_task_irq_callback(&data->tx_tasks[0], testlib_i2s_tx_callback, (void *)data));
+    pi_i2s_frame_write_async(&data->dev, data->frame, NULL, 0, pi_task_irq_callback(&data->tx_tasks[1], testlib_i2s_tx_callback, (void *)data));
+
+    data->rx_pending = 2; /* two reads are kept in flight */
+    data->rx_current_task = 0;
+    pi_task_block(&data->rx_end_task); /* pushed by the RX callback once rx_pending reaches 0 */
+    pi_i2s_frame_read_async(&data->dev, data->frame, pi_task_irq_callback(&data->rx_tasks[0], testlib_i2s_rx_callback, (void *)data));
+    pi_i2s_frame_read_async(&data->dev, data->frame, pi_task_irq_callback(&data->rx_tasks[1], testlib_i2s_rx_callback, (void *)data));
+
+    if (pi_i2s_ioctl(&data->dev, PI_I2S_IOCTL_START, NULL)) /* transfers are queued before the interface is started */
+    {
+        return -4;
+    }
+    return 0;
+}
+
+/* Stop the I2S generator: drain the pending TX/RX transfers, then halt the interface. */
+int testlib_i2s_trafficgen_stop(testlib_i2s_trafficgen_t *data)
+{
+    int status;
+
+    /* From now on the callbacks only count down instead of re-arming. */
+    data->end = 1;
+
+    /* Each end task is pushed once its direction has no transfer left in flight. */
+    pi_task_wait_on(&data->tx_end_task);
+    pi_task_wait_on(&data->rx_end_task);
+
+    /* No data checking is done here: only a failing STOP ioctl is reported (-4). */
+    status = pi_i2s_ioctl(&data->dev, PI_I2S_IOCTL_STOP, NULL);
+
+    return status ? -4 : 0;
+}
+
+/* Free the two TX and the two RX ping-pong buffers, then close the I2S device; always returns 0. */
+int testlib_i2s_trafficgen_deinit(testlib_i2s_trafficgen_t *data)
+{
+    for (int slot = 0; slot < 2; slot++)
+        pi_l2_free(data->tx_buffers[slot], data->transfer_size);
+    for (int slot = 0; slot < 2; slot++)
+        pi_l2_free(data->rx_buffers[slot], data->transfer_size);
+    pi_i2s_close(&data->dev);
+    return 0;
+}
+
diff --git a/libs/gap_lib/testbench/testlib_i2s.h b/libs/gap_lib/testbench/testlib_i2s.h
new file mode 100644
index 000000000..2c4a2b2ce
--- /dev/null
+++ b/libs/gap_lib/testbench/testlib_i2s.h
@@ -0,0 +1,47 @@
+/* 
+ * Copyright (C) 2017 GreenWaves Technologies
+ * All rights reserved.
+ *
+ * This software may be modified and distributed under the terms
+ * of the BSD license.  See the LICENSE file for details.
+ *
+ */
+
+#pragma once
+
+#include "pmsis.h"
+
+typedef struct /* user settings; defaults come from testlib_i2s_trafficgen_conf_init() */
+{
+    int itf;           /* copied to pi_i2s_conf.itf by init */
+    int transfer_size; /* per-slot block_size; buffers are sized transfer_size * nb_slots */
+    int sample_rate;   /* copied to pi_i2s_conf.frame_clk_freq */
+    int nb_slots;      /* copied to pi_i2s_conf.channels; also the number of bits set in the frame mask */
+    int word_size;     /* copied to the interface and to every slot configuration */
+} testlib_i2s_trafficgen_config_t;
+
+/* Run-time state of one I2S traffic generator. */
+typedef struct
+{
+    pi_device_t dev;
+    uint32_t frame;        /* slot bit mask passed to the frame read/write/conf calls */
+    void *tx_buffers[2];   /* TX ping-pong buffers */
+    void *rx_buffers[2];   /* RX ping-pong buffers */
+    pi_task_t rx_tasks[2]; /* completion tasks of the two reads kept in flight */
+    pi_task_t tx_tasks[2]; /* completion tasks of the two writes kept in flight */
+    int tx_pending;        /* writes still in flight, counted down once end is set */
+    int tx_current_task;   /* index (0 or 1) of the task used by the next re-armed write */
+    pi_task_t tx_end_task; /* pushed by the TX callback when tx_pending reaches 0 */
+    int rx_pending;        /* reads still in flight, counted down once end is set */
+    int rx_current_task;   /* index (0 or 1) of the task used by the next re-armed read */
+    pi_task_t rx_end_task; /* pushed by the RX callback when rx_pending reaches 0 */
+    int transfer_size;     /* size of each ping-pong buffer: config transfer_size * nb_slots */
+    int end;               /* set to 1 by stop, cleared by start */
+} testlib_i2s_trafficgen_t;
+
+
+void testlib_i2s_trafficgen_conf_init(testlib_i2s_trafficgen_config_t *config);
+int testlib_i2s_trafficgen_init(testlib_i2s_trafficgen_t *data, testlib_i2s_trafficgen_config_t *config);
+int testlib_i2s_trafficgen_start(testlib_i2s_trafficgen_t *data);
+int testlib_i2s_trafficgen_stop(testlib_i2s_trafficgen_t *data);
+int testlib_i2s_trafficgen_deinit(testlib_i2s_trafficgen_t *data);
diff --git a/libs/gap_lib/testbench/testlib_uart.c b/libs/gap_lib/testbench/testlib_uart.c
new file mode 100644
index 000000000..74b484b3e
--- /dev/null
+++ b/libs/gap_lib/testbench/testlib_uart.c
@@ -0,0 +1,144 @@
+/* 
+ * Copyright (C) 2017 GreenWaves Technologies
+ * All rights reserved.
+ *
+ * This software may be modified and distributed under the terms
+ * of the BSD license.  See the LICENSE file for details.
+ *
+ */
+
+#include "pmsis.h"
+#include "testbench.h"
+#include "testlib.h"
+#include <string.h>
+
+/* Default UART generator settings: interface 0, baudrate 1000000, flow control on, 128-byte transfers. */
+void testlib_uart_trafficgen_conf_init(testlib_uart_trafficgen_config_t *config)
+{
+    config->itf = 0;
+    config->baudrate = 1000000;
+    config->control_flow = 1;
+    config->transfer_size = 128;
+}
+
+/* Allocate the RX/TX buffers, fill the TX ramp and open the UART; returns 0, or -1 with the buffers released (do not call deinit then). */
+int testlib_uart_trafficgen_init(testlib_uart_trafficgen_t *data, testlib_uart_trafficgen_config_t *config)
+{
+    struct pi_uart_conf conf;
+
+    data->transfer_size = config->transfer_size;
+    data->rx_buffers[0] = pi_l2_malloc(config->transfer_size);
+    data->rx_buffers[1] = pi_l2_malloc(config->transfer_size);
+    data->tx_buffers[0] = pi_l2_malloc(config->transfer_size);
+    data->tx_buffers[1] = pi_l2_malloc(config->transfer_size);
+    if (data->rx_buffers[0] == NULL || data->rx_buffers[1] == NULL || data->tx_buffers[0] == NULL || data->tx_buffers[1] == NULL)
+        goto error;
+
+    for (int i=0; i<config->transfer_size; i++)
+    {
+        ((uint8_t *)data->tx_buffers[0])[i] = i;
+        ((uint8_t *)data->tx_buffers[1])[i] = i;
+    }
+
+    pi_uart_conf_init(&conf);
+    conf.use_ctrl_flow = config->control_flow;
+    conf.enable_tx = 1;
+    conf.enable_rx = 1;
+    conf.uart_id = config->itf;
+    conf.baudrate_bps = config->baudrate;
+    pi_open_from_conf(&data->dev, &conf);
+    if (pi_uart_open(&data->dev))
+        goto error;
+
+    return 0;
+
+error: /* free whatever was allocated so that a failed init does not leak L2 memory */
+    for (int i=0; i<2; i++)
+    {
+        if (data->rx_buffers[i] != NULL) pi_l2_free(data->rx_buffers[i], data->transfer_size);
+        if (data->tx_buffers[i] != NULL) pi_l2_free(data->tx_buffers[i], data->transfer_size);
+    }
+    return -1;
+}
+
+/* RX completion callback: queues another read into the current buffer, or counts down towards rx_end_task once stop was requested. */
+static void testlib_uart_rx_callback(void *arg)
+{
+    testlib_uart_trafficgen_t *data = (testlib_uart_trafficgen_t *)arg;
+
+    if (data->end) /* set by testlib_uart_trafficgen_stop() */
+    {
+        data->rx_pending--;
+        if (data->rx_pending == 0) /* no read left in flight: wake up the waiter */
+        {
+            pi_task_push(&data->rx_end_task);
+        }
+        return;
+    }
+
+    pi_uart_read_async(&data->dev, data->rx_buffers[data->rx_current_task], data->transfer_size, pi_task_irq_callback(&data->rx_tasks[data->rx_current_task], testlib_uart_rx_callback, (void *)data));
+    data->rx_current_task ^= 1; /* alternate between slot 0 and slot 1 */
+}
+
+static void testlib_uart_tx_callback(void *arg) /* TX completion callback: sends the current buffer again until stop is requested */
+{
+    testlib_uart_trafficgen_t *data = (testlib_uart_trafficgen_t *)arg;
+
+    if (data->end) /* set by testlib_uart_trafficgen_stop() */
+    {
+        data->tx_pending--;
+        if (data->tx_pending == 0) /* no write left in flight: wake up the waiter */
+        {
+            pi_task_push(&data->tx_end_task);
+        }
+        return;
+    }
+
+    pi_uart_write_async(&data->dev, data->tx_buffers[data->tx_current_task], data->transfer_size, pi_task_irq_callback(&data->tx_tasks[data->tx_current_task], testlib_uart_tx_callback, (void *)data));
+    data->tx_current_task ^= 1; /* alternate between slot 0 and slot 1 */
+}
+
+/* Start the generator by queueing one write and one read per buffer; the callbacks keep re-arming them until stop. Always returns 0. */
+int testlib_uart_trafficgen_start(testlib_uart_trafficgen_t *data)
+{
+    data->end = 0;
+
+    data->tx_pending = 2; /* two writes are kept in flight */
+    data->tx_current_task = 0;
+    pi_task_block(&data->tx_end_task); /* pushed by the TX callback once tx_pending reaches 0 */
+    pi_uart_write_async(&data->dev, data->tx_buffers[0], data->transfer_size, pi_task_irq_callback(&data->tx_tasks[0], testlib_uart_tx_callback, (void *)data));
+    pi_uart_write_async(&data->dev, data->tx_buffers[1], data->transfer_size, pi_task_irq_callback(&data->tx_tasks[1], testlib_uart_tx_callback, (void *)data));
+
+    data->rx_pending = 2; /* two reads are kept in flight */
+    data->rx_current_task = 0;
+    pi_task_block(&data->rx_end_task); /* pushed by the RX callback once rx_pending reaches 0 */
+    pi_uart_read_async(&data->dev, data->rx_buffers[0], data->transfer_size, pi_task_irq_callback(&data->rx_tasks[0], testlib_uart_rx_callback, (void *)data));
+    pi_uart_read_async(&data->dev, data->rx_buffers[1], data->transfer_size, pi_task_irq_callback(&data->rx_tasks[1], testlib_uart_rx_callback, (void *)data));
+
+    return 0;
+}
+
+/* Stop the UART generator and wait until no TX or RX transfer is left in flight; always returns 0. */
+int testlib_uart_trafficgen_stop(testlib_uart_trafficgen_t *data)
+{
+    /* From now on the callbacks only count down instead of re-arming. */
+    data->end = 1;
+
+    /* Each end task is pushed once its direction has drained. */
+    pi_task_wait_on(&data->tx_end_task);
+    pi_task_wait_on(&data->rx_end_task);
+
+    return 0; /* received data is not checked, so no error is ever counted */
+}
+
+/* Free the two TX and the two RX buffers, then close the UART device; always returns 0. */
+int testlib_uart_trafficgen_deinit(testlib_uart_trafficgen_t *data)
+{
+    for (int slot = 0; slot < 2; slot++)
+        pi_l2_free(data->tx_buffers[slot], data->transfer_size);
+    for (int slot = 0; slot < 2; slot++)
+        pi_l2_free(data->rx_buffers[slot], data->transfer_size);
+    pi_uart_close(&data->dev);
+    return 0;
+}
+
diff --git a/libs/gap_lib/testbench/testlib_uart.h b/libs/gap_lib/testbench/testlib_uart.h
new file mode 100644
index 000000000..f776509a1
--- /dev/null
+++ b/libs/gap_lib/testbench/testlib_uart.h
@@ -0,0 +1,46 @@
+/* 
+ * Copyright (C) 2017 GreenWaves Technologies
+ * All rights reserved.
+ *
+ * This software may be modified and distributed under the terms
+ * of the BSD license.  See the LICENSE file for details.
+ *
+ */
+
+#pragma once
+
+#include "pmsis.h"
+
+typedef struct /* user settings; defaults come from testlib_uart_trafficgen_conf_init() */
+{
+    int itf;           /* copied to pi_uart_conf.uart_id by init */
+    int transfer_size; /* size in bytes of each buffer and of each transfer */
+    int baudrate;      /* copied to pi_uart_conf.baudrate_bps */
+    int control_flow;  /* copied to pi_uart_conf.use_ctrl_flow */
+} testlib_uart_trafficgen_config_t;
+
+/* Run-time state of one UART traffic generator. */
+typedef struct
+{
+    pi_device_t dev;
+    uint32_t frame;        /* NOTE(review): not referenced by testlib_uart.c - confirm it is needed */
+    void *tx_buffers[2];   /* two L2 buffers, sent alternately */
+    void *rx_buffers[2];   /* two L2 buffers, received into alternately */
+    pi_task_t rx_tasks[2]; /* one completion task per RX buffer slot */
+    pi_task_t tx_tasks[2]; /* one completion task per TX buffer slot */
+    int tx_pending;        /* writes still in flight, counted down once end is set */
+    int tx_current_task;   /* slot (0 or 1) used by the next re-armed write */
+    pi_task_t tx_end_task; /* pushed by the TX callback when tx_pending reaches 0 */
+    int rx_pending;        /* reads still in flight, counted down once end is set */
+    int rx_current_task;   /* slot (0 or 1) used by the next re-armed read */
+    pi_task_t rx_end_task; /* pushed by the RX callback when rx_pending reaches 0 */
+    int transfer_size;     /* copy of the config transfer_size, also used as the free size */
+    int end;               /* set to 1 by stop, cleared by start */
+} testlib_uart_trafficgen_t;
+
+
+void testlib_uart_trafficgen_conf_init(testlib_uart_trafficgen_config_t *config);
+int testlib_uart_trafficgen_init(testlib_uart_trafficgen_t *data, testlib_uart_trafficgen_config_t *config);
+int testlib_uart_trafficgen_start(testlib_uart_trafficgen_t *data);
+int testlib_uart_trafficgen_stop(testlib_uart_trafficgen_t *data);
+int testlib_uart_trafficgen_deinit(testlib_uart_trafficgen_t *data);
diff --git a/libs/openmp/tests/benchmark/testset.cfg b/libs/openmp/tests/benchmark/testset.cfg
index daa17de0c..cf85c9156 100644
--- a/libs/openmp/tests/benchmark/testset.cfg
+++ b/libs/openmp/tests/benchmark/testset.cfg
@@ -1,4 +1,5 @@
 from plptest import *
+import os
 
 TestConfig = c = {}
 
diff --git a/rtos/freeRTOS/vendors/gwt/gap8/pmsis/include/pmsis/implem/drivers/drivers.h b/rtos/freeRTOS/vendors/gwt/gap8/pmsis/include/pmsis/implem/drivers/drivers.h
index f5a72ddec..85e653ab6 100644
--- a/rtos/freeRTOS/vendors/gwt/gap8/pmsis/include/pmsis/implem/drivers/drivers.h
+++ b/rtos/freeRTOS/vendors/gwt/gap8/pmsis/include/pmsis/implem/drivers/drivers.h
@@ -38,6 +38,7 @@
 #include "pmsis/chips/gap8/pad.h"
 #include "pmsis/chips/gap8/gpio.h"
 #include "pmsis/chips/gap8/pmu.h"
+#include "pmsis/chips/gap8/timer.h"
 
 /* Drivers. */
 #include "pmsis/drivers/cpi.h"
diff --git a/rtos/freeRTOS/vendors/gwt/gap8/pmsis/include/pmsis/implem/drivers/timer/timer.h b/rtos/freeRTOS/vendors/gwt/gap8/pmsis/include/pmsis/implem/drivers/timer/timer.h
index da1ea6a48..ae15ecdac 100644
--- a/rtos/freeRTOS/vendors/gwt/gap8/pmsis/include/pmsis/implem/drivers/timer/timer.h
+++ b/rtos/freeRTOS/vendors/gwt/gap8/pmsis/include/pmsis/implem/drivers/timer/timer.h
@@ -32,6 +32,7 @@
 #define __PI_TIMER_H__
 
 #include <stdint.h>
+#include "pmsis/chips/gap8/timer.h"
 #include "pmsis/targets/target.h"
 #include "pmsis/implem/hal/hal.h"
 
@@ -56,6 +57,7 @@
  *****************************************************************************/
 
 /* @brief Timers. */
+#if 0
 typedef enum
 {
     SYS_TIMER  = 0,             /*!< FC_TIMER_0 used as SysTick timer by preemptive RTOS. */
@@ -64,6 +66,8 @@ typedef enum
     CL_TIMER_0 = 2,             /*!< Cluster Timer_Low. */
     CL_TIMER_1 = 3              /*!< Cluster Timer_High. */
 } timer_e;
+#endif
+typedef pi_timer_e timer_e;
 
 /*******************************************************************************
  * Function declaration
diff --git a/rtos/freeRTOS/vendors/gwt/pmsis/backend/pmsis_backend_native_task_api.c b/rtos/freeRTOS/vendors/gwt/pmsis/backend/pmsis_backend_native_task_api.c
index b232c2aa3..15e67449a 100644
--- a/rtos/freeRTOS/vendors/gwt/pmsis/backend/pmsis_backend_native_task_api.c
+++ b/rtos/freeRTOS/vendors/gwt/pmsis/backend/pmsis_backend_native_task_api.c
@@ -78,18 +78,17 @@ void pi_time_wait_us(int time_us)
         /* Wait less than 1 ms. */
         if (time_us < 1000)
         {
-            uint32_t irq = pi_irq_disable();
-            uint32_t freq_fc = pi_freq_get(PI_FREQ_DOMAIN_FC);
-            //uint64_t counter = (uint64_t) (((uint64_t) time_us) * freq_fc) / 1000000;
-            //uint64_t counter = (uint64_t) (((uint64_t) time_us) * freq_fc);
-            uint64_t freq = (uint64_t) ((uint64_t) time_us * (uint64_t) freq_fc);
-            uint32_t counter = (uint32_t) ((freq) >> 20);
-            //counter >>= 20; /* Div 10^6 */
-            //uint32_t counter = (uint32_t) time_us;
-            //counter = (counter * freq_fc) / 1000000;
-            //printf("counter=%ld, freq=%ld, time_us=%d\n", counter, freq_fc, time_us);
+#ifdef __VEGA__
+            int irq = pi_irq_disable();
+            for (volatile int i = 0; i < time_us; i++){};
             pi_irq_restore(irq);
-            for (volatile uint32_t i=0; i<counter; i++);
+#else
+            pi_task_t task_block;
+            pi_task_block(&task_block);
+            extern struct pi_device sys_timer_hi_prec;
+            pi_timer_task_add(&sys_timer_hi_prec, time_us, &task_block);
+            pi_task_wait_on(&task_block);
+#endif
         }
         else
         {
@@ -104,6 +103,7 @@ void pi_time_wait_us(int time_us)
 
 unsigned long long int pi_time_get_us()
 {
+    #if defined(__GAP8__) || defined (__VEGA__)
     uint64_t time = 0;
     uint32_t irq = pi_irq_disable();
     uint32_t cur_tick = xTaskGetTickCount();
@@ -115,6 +115,18 @@ unsigned long long int pi_time_get_us()
     //time += 95; /* Around 95us between main() and kernel start. */
     pi_irq_restore(irq);
     return time;
+    #else
+    uint64_t time = 0;
+    uint32_t irq = pi_irq_disable();
+    uint32_t cur_timer_val = 0;
+    extern struct pi_device sys_timer_hi_prec;
+    int32_t status = pi_timer_current_value_read(&sys_timer_hi_prec, &cur_timer_val);
+    uint32_t freq_timer = pi_timer_clock_freq_get(&sys_timer_hi_prec);
+    time = ((uint64_t)cur_timer_val * 1000000) / freq_timer;
+    //time += cur_timer_val;
+    pi_irq_restore(irq);
+    return time;
+    #endif  /* __GAP8__ */
 }
 
 PI_FC_L1 struct pi_task_delayed_s delayed_task = {0};
@@ -137,7 +149,7 @@ void pi_task_delayed_fifo_enqueue(struct pi_task *task, uint32_t delay_us)
     delayed_task.fifo_tail = task;
 }
 
-
+#if defined(__GAP8__)
 PI_FC_L1 struct pi_task_delayed_s timer_task = {0};
 
 // TODO: use a proper define for ref clk (does not exist yet)
@@ -145,7 +157,6 @@ PI_FC_L1 struct pi_task_delayed_s timer_task = {0};
 
 void pi_task_timer_enqueue(struct pi_task *task, uint32_t delay_us)
 {
-#ifdef __GAP8__
     task->data[8] = ((delay_us)/ref_clk_us)
         + (((delay_us)%ref_clk_us) > 0);
     //printf("ticks: %i\n ref_clk_us: %i\n rem: %i\n",task->data[8]
@@ -165,7 +176,6 @@ void pi_task_timer_enqueue(struct pi_task *task, uint32_t delay_us)
         delayed_task.fifo_tail->next = task;
     }
     delayed_task.fifo_tail = task;
-#endif
 }
 
 
@@ -200,6 +210,7 @@ void __pi_task_timer_irq(void)
         NVIC_DisableIRQ(FC_IRQ_TIMER0_HI_EVT);
     }
 }
+#endif  /* __GAP8__ */
 
 // return value allows to skip some OS logic when a switch has already been triggered
 int pi_task_delayed_increment_push(void)
diff --git a/rtos/freeRTOS/vendors/gwt/pmsis/include/pmsis/backend/implementation_specific_defines.h b/rtos/freeRTOS/vendors/gwt/pmsis/include/pmsis/backend/implementation_specific_defines.h
index c18724f57..02fd95137 100644
--- a/rtos/freeRTOS/vendors/gwt/pmsis/include/pmsis/backend/implementation_specific_defines.h
+++ b/rtos/freeRTOS/vendors/gwt/pmsis/include/pmsis/backend/implementation_specific_defines.h
@@ -26,7 +26,8 @@
 #define pi_data_free(x,y)     pmsis_l2_malloc_free(x,y)
 
 #define PI_TASK_IMPLEM                          \
-    uint8_t destroy;
+    uint8_t destroy;                            \
+    uint32_t time;
 
 #define PI_TASK_IMPLEM_NB_DATA 9
 
diff --git a/rtos/freeRTOS/vendors/gwt/pmsis/rtos/include/pmsis/rtos/os/pmsis_task.h b/rtos/freeRTOS/vendors/gwt/pmsis/rtos/include/pmsis/rtos/os/pmsis_task.h
index 78e06acd8..38b08f218 100644
--- a/rtos/freeRTOS/vendors/gwt/pmsis/rtos/include/pmsis/rtos/os/pmsis_task.h
+++ b/rtos/freeRTOS/vendors/gwt/pmsis/rtos/include/pmsis/rtos/os/pmsis_task.h
@@ -55,21 +55,6 @@
  */
 pi_task_t *__pi_task_block(pi_task_t *task);
 
-/**
- * \brief Prepare an event task with callback.
- *
- * This function initializes an instance of event task.
- * This event task executes the callback given in argument.
- *
- * \param callback_task  Pointer to event task.
- * \param func           Callback function.
- * \param arg            Callback function argument.
- *
- * \return task This function returns the event task initialized.
- */
-pi_task_t *__pi_task_callback(pi_task_t *callback_task,
-                              void (*func)(void *), void *arg);
-
 /**
  * \brief Wait on an event task.
  *
@@ -110,6 +95,21 @@ void __pi_task_destroy(pi_task_t *task);
  */
 void pi_task_delayed_fifo_enqueue(struct pi_task *task, uint32_t delay_us);
 
+/* Push the task to the default PMSIS event scheduler; the body itself does not mask interrupts. */
+static inline void __pi_task_push_no_irq(pi_task_t *task)
+{
+    pmsis_event_push(pmsis_event_get_default_scheduler(), task);
+}
+
+static inline void __pi_task_push_exec_irq_safe(pi_task_t *task) /* run the task's callback right away, in the caller's context */
+{
+    pi_callback_func_t func = (pi_callback_func_t) task->arg[0]; /* arg[0]: callback, as stored by pi_task_irq_callback() */
+    void *arg = (void *) task->arg[1]; /* arg[1]: argument handed to the callback */
+    func(arg);
+}
+
+void __pi_task_push_locked(pi_task_t * task);
+
 /*******************************************************************************
  * API implementation
  ******************************************************************************/
@@ -122,15 +122,24 @@ static inline pi_task_t *pi_task_block(pi_task_t *task)
 static inline pi_task_t *pi_task_callback(pi_task_t *task,
                                           void (*callback)(void*), void *arg)
 {
-    return __pi_task_callback(task, callback, arg);
+    task->id = PI_TASK_CALLBACK_ID;
+    task->arg[0] = (uintptr_t) callback;
+    task->arg[1] = (uintptr_t) arg;
+    task->done = 0;
+    task->sync_obj = NULL;
+    //task->destroy = 0;
+    task->core_id = -1;
+    task->timeout = 0;
+    task->next = NULL;
+    return task;
 }
 
 static inline pi_task_t *pi_task_irq_callback(pi_task_t *task,
-                                          void (*callback)(void*), void *arg)
+                                              void (*callback)(void*), void *arg)
 {
     task->id = PI_TASK_IRQ_ID;
-    task->arg[0] = (uintptr_t)callback;
-    task->arg[1] = (uintptr_t)arg;
+    task->arg[0] = (uintptr_t) callback;
+    task->arg[1] = (uintptr_t) arg;
     return task;
 }
 
@@ -139,19 +148,16 @@ static inline void pi_task_wait_on(pi_task_t *task)
     __pi_task_wait_on(task);
 }
 
+static inline void pi_task_push_irq_safe(pi_task_t *task) /* like pi_task_push(), minus the IRQ disable/restore around the call */
+{
+    __pi_task_push_locked(task); /* presumably expects interrupts to be masked already - confirm against its definition */
+}
+
 static inline void pi_task_push(pi_task_t *task)
 {
-    switch (task->id)
-    {
-        case PI_TASK_NONE_ID :
-            pi_task_release(task);
-            break;
-        case PI_TASK_CALLBACK_ID :
-            __pi_task_push(task);
-            break;
-        default :
-            return;
-    }
+    uint32_t irq = pi_irq_disable();
+    __pi_task_push_locked(task);
+    pi_irq_restore(irq);
 }
 
 static inline void pi_task_destroy(pi_task_t *task)
@@ -169,6 +175,16 @@ static inline int32_t pi_task_transfer_end_result_get(pi_task_t *task)
     return task->arg[3];
 }
 
+static inline int32_t pi_task_status_get(pi_task_t *task)
+{
+    return task->arg[3];
+}
+
+static inline void pi_task_status_set(pi_task_t *task, int32_t status)
+{
+    task->arg[3] = status;
+}
+
 static inline void pi_task_timeout_callback_set(pi_task_t *task, pi_callback_func_t func,
                                                 void *arg)
 {
diff --git a/rtos/freeRTOS/vendors/gwt/pmsis/rtos/os/pmsis_task.c b/rtos/freeRTOS/vendors/gwt/pmsis/rtos/os/pmsis_task.c
index ee3ae0893..83bbc2956 100644
--- a/rtos/freeRTOS/vendors/gwt/pmsis/rtos/os/pmsis_task.c
+++ b/rtos/freeRTOS/vendors/gwt/pmsis/rtos/os/pmsis_task.c
@@ -43,22 +43,7 @@ pi_task_t *__pi_task_block(pi_task_t *callback_task)
     callback_task->id = PI_TASK_NONE_ID;
     callback_task->done = 0;
     pi_sync_obj_init((void *) &(callback_task->sync_obj));
-    callback_task->destroy = 1;
-    callback_task->core_id = -1;
-    callback_task->timeout = 0;
-    callback_task->next = NULL;
-    return callback_task;
-}
-
-pi_task_t *__pi_task_callback(pi_task_t *callback_task,
-                              pi_callback_func_t func, void *arg)
-{
-    callback_task->id = PI_TASK_CALLBACK_ID;
-    callback_task->arg[0] = (uintptr_t) func;
-    callback_task->arg[1] = (uintptr_t) arg;
-    callback_task->done = 0;
-    callback_task->sync_obj = NULL;
-    callback_task->destroy = 0;
+    //callback_task->destroy = 1;
     callback_task->core_id = -1;
     callback_task->timeout = 0;
     callback_task->next = NULL;
@@ -67,16 +52,16 @@ pi_task_t *__pi_task_callback(pi_task_t *callback_task,
 
 void __pi_task_destroy(pi_task_t *task)
 {
-    if (task->destroy)
+    //if (task->destroy)
     {
-        task->destroy = 0;
+        //task->destroy = 0;
         // if the mutex is only virtual (e.g. wait on soc event)
-        hal_compiler_barrier();
+        //hal_compiler_barrier();
         if (task->sync_obj != NULL)
         {
             pi_sync_obj_deinit((void *) &(task->sync_obj));
         }
-        hal_compiler_barrier();
+        //hal_compiler_barrier();
     }
 }
 
@@ -96,12 +81,14 @@ void __pi_task_wait_on(pi_task_t *task)
     __pi_task_destroy(task);
 }
 
+#if 0
 void __pi_task_push(pi_task_t *task)
 {
     uint32_t irq = disable_irq();
     pmsis_event_push(pmsis_event_get_default_scheduler(), task);
     restore_irq(irq);
 }
+#endif
 
 /*******************************************************************************
  * API implementation
@@ -118,13 +105,34 @@ pi_task_t *pi_task_block_no_mutex(pi_task_t *callback_task)
     callback_task->id = PI_TASK_NONE_ID;
     callback_task->done = 0;
     callback_task->sync_obj = NULL;
-    callback_task->destroy = 0;
+    //callback_task->destroy = 0;
     callback_task->core_id = -1;
     callback_task->timeout = 0;
     callback_task->next = NULL;
     return callback_task;
 }
 
+void __pi_task_push_locked(pi_task_t * task)
+{
+    switch (task->id)
+    {
+    case PI_TASK_NONE_ID :
+        pi_task_release(task);
+        break;
+
+    case PI_TASK_CALLBACK_ID :
+        __pi_task_push_no_irq(task);
+        break;
+
+    case PI_TASK_IRQ_ID :
+        __pi_task_push_exec_irq_safe(task);
+        break;
+
+    default :
+        return;
+    }
+}
+
 void pi_task_release(pi_task_t *task)
 {
     DEBUG_PRINTF("[%s] releasing task %p\n",__func__,task);
diff --git a/rtos/freeRTOS/vendors/gwt/rules/freeRTOS_rules.mk b/rtos/freeRTOS/vendors/gwt/rules/freeRTOS_rules.mk
index 6e0fc1289..c2225d9c6 100644
--- a/rtos/freeRTOS/vendors/gwt/rules/freeRTOS_rules.mk
+++ b/rtos/freeRTOS/vendors/gwt/rules/freeRTOS_rules.mk
@@ -408,13 +408,16 @@ $(BIN).size: $(BIN)
 
 
 flash: $(BIN)
-	gapy $(GAPY_TARGET_OPT) --platform=$(platform) --work-dir=$(BUILDDIR) $(config_args) $(gapy_args) run --flash --force --binary=$(BIN) $(runner_args)
+	gapy $(GAPY_TARGET_OPT) --platform=$(platform) --work-dir=$(BUILDDIR) $(config_args) $(gapy_args) run --flash --force --binary=$(BIN) $(runner_args) $(WSL_ENV)
 
-flash_fs: $(BIN)
+flash_noforce: $(BIN)
 	gapy $(GAPY_TARGET_OPT) --platform=$(platform) --work-dir=$(BUILDDIR) $(config_args) $(gapy_args) run --flash --binary=$(BIN) $(runner_args)
 
+flash_fs: $(BIN)
+	gapy $(GAPY_TARGET_OPT) --platform=$(platform) --work-dir=$(BUILDDIR) $(config_args) $(gapy_args) run --flash --binary=$(BIN) $(runner_args) $(WSL_ENV)
+
 image: $(BIN)
-	gapy $(GAPY_TARGET_OPT) --platform=$(platform) --work-dir=$(BUILDDIR) $(config_args) $(gapy_args) run --image --binary=$(BIN) $(runner_args)
+	gapy $(GAPY_TARGET_OPT) --platform=$(platform) --work-dir=$(BUILDDIR) $(config_args) $(gapy_args) run --image --binary=$(BIN) $(runner_args) $(WSL_ENV)
 
 run: $(BIN)
 	gapy $(GAPY_TARGET_OPT) --platform=$(platform) --work-dir=$(BUILDDIR) $(config_args) $(gapy_args) run --exec-prepare --exec --binary=$(BIN) $(runner_args) $(WSL_ENV)
diff --git a/rtos/pmsis/pmsis_api/docs/rtos.rst b/rtos/pmsis/pmsis_api/docs/rtos.rst
index 35500b4f4..3bb11eed1 100644
--- a/rtos/pmsis/pmsis_api/docs/rtos.rst
+++ b/rtos/pmsis/pmsis_api/docs/rtos.rst
@@ -9,6 +9,14 @@ Task
     :private-members:
     :protected-members:
 
+Event_Task
+..........
+
+.. doxygengroup:: Event_Task
+    :members:
+    :private-members:
+    :protected-members:
+
 Memory allocation
 .................
 
diff --git a/rtos/pmsis/pmsis_api/include/pmsis/chips/gap8/timer.h b/rtos/pmsis/pmsis_api/include/pmsis/chips/gap8/timer.h
new file mode 100644
index 000000000..7cb15c00d
--- /dev/null
+++ b/rtos/pmsis/pmsis_api/include/pmsis/chips/gap8/timer.h
@@ -0,0 +1,67 @@
+/*
+ * Copyright (C) 2021 GreenWaves Technologies
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * \ingroup groupChips
+ *
+ * \addtogroup GAP8
+ * \{
+ *
+ * \defgroup GAP8_Timers GAP8 Timers
+ *
+ * \brief GAP8 Timers
+ *
+ * This part enumerates available Timers on chip **GAP8**.
+ *
+ * There are a total of 4 timers :
+ * * 2 on Fabric Controller
+ * * 2 on Cluster
+ *
+ * \addtogroup GAP8_Timers
+ * \{
+ */
+
+/**
+ * \enum pi_timer_e
+ *
+ * \brief Timers.
+ *
+ * List of available timers.
+ */
+typedef enum
+{
+    SYS_TIMER  = 0, /*!< FC_TIMER_0 used as SysTick timer by preemptive RTOS. */
+    FC_TIMER_0 = 0, /*!< FC Timer_Low. */
+    FC_TIMER_1 = 1, /*!< FC Timer_High, can be used as perf counter. */
+    CL_TIMER_0 = 2, /*!< Cluster Timer_Low. */
+    CL_TIMER_1 = 3  /*!< Cluster Timer_High. */
+} pi_timer_e;
+
+/**
+ * \} end of GAP8_Timers
+ *
+ * \} end of GAP8
+ */
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/rtos/pmsis/pmsis_api/include/pmsis/chips/gap9/timer.h b/rtos/pmsis/pmsis_api/include/pmsis/chips/gap9/timer.h
new file mode 100644
index 000000000..dfa9b21c5
--- /dev/null
+++ b/rtos/pmsis/pmsis_api/include/pmsis/chips/gap9/timer.h
@@ -0,0 +1,71 @@
+/*
+ * Copyright (C) 2021 GreenWaves Technologies
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * \ingroup groupChips
+ *
+ * \addtogroup GAP9
+ * \{
+ *
+ * \defgroup GAP9_Timers GAP9 Timers
+ *
+ * \brief GAP9 Timers
+ *
+ * This part enumerates available Timers on chip **GAP9**.
+ *
+ * There are a total of 6 timers :
+ * * 4 on Fabric Controller
+ * * 2 on Cluster
+ *
+ * \addtogroup GAP9_Timers
+ * \{
+ */
+
+/**
+ * \enum pi_timer_e
+ *
+ * \brief Timers.
+ *
+ * List of available timers.
+ */
+typedef enum
+{
+    SYS_TIMER  = 0, /*!< FC TIMER_0 used as SysTick timer by preemptive RTOS. */
+    FC_TIMER_0 = 0, /*!< FC Timer_0_Low. FC_TIMER_0 and FC_TIMER_1 can be used
+                         together to form a 64 bits timer. */
+    FC_TIMER_1 = 1, /*!< FC Timer_0_High. */
+    FC_TIMER_2 = 2, /*!< FC Timer_1_Low. FC_TIMER_2 and FC_TIMER_3 can be used
+                         together to form a 64 bits timer. */
+    FC_TIMER_3 = 3, /*!< FC Timer_1_High. */
+    CL_TIMER_0 = 4, /*!< Cluster Timer_0_Low. */
+    CL_TIMER_1 = 5  /*!< Cluster Timer_0_High. */
+} pi_timer_e;
+
+/**
+ * \} end of GAP9_Timers
+ *
+ * \} end of GAP9
+ */
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/rtos/pmsis/pmsis_api/include/pmsis/drivers/timer.h b/rtos/pmsis/pmsis_api/include/pmsis/drivers/timer.h
new file mode 100644
index 000000000..b4bcdc7d2
--- /dev/null
+++ b/rtos/pmsis/pmsis_api/include/pmsis/drivers/timer.h
@@ -0,0 +1,158 @@
+/*
+ * Copyright (C) 2021 GreenWaves Technologies
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * \ingroup groupDrivers
+ *
+ * \defgroup Timer Timer
+ *
+ * \brief Timer
+ *
+ * The timer driver includes API to manage different timers available on
+ * chips, for both Fabric Controller side and Cluster side.
+ */
+
+/**
+ * \addtogroup Timer
+ * \{
+ */
+
+
+/**
+ * \struct pi_timer_conf_s
+ *
+ * \brief Timer configuration structure.
+ *
+ * This structure is used to pass the desired timer configuration to the
+ * runtime when opening the device.
+ */
+struct pi_timer_conf_s
+{
+    uint32_t time_us; /*!< Timer value to compare. */
+    uint8_t timer_id; /*!< Timer ID, refer to \ref pi_timer_e. */
+    uint8_t one_shot; /*!< One shot timer, after reaching time_us, timer is disabled. */
+    uint8_t irq_en;   /*!< Enable timer IRQ. */
+    uint8_t clk_src;  /*!< Timer clock source: FLL=0, REF_CLK=1. */
+    uint8_t timer_reset;        /*!< When value is reached: CONTINUE=0, RESET=1. */
+    //uint8_t timer_64; /*!< Enable a 64-bit timer, using two 32-bit timers. */
+};
+
+
+/**
+ * \brief Initialize a timer configuration with default values.
+ *
+ * This function can be called to get default values for all parameters before
+ * setting some of them.
+ * The structure containing the configuration must be kept alive until the timer
+ * device is opened.
+ *
+ * \param conf           Pointer to the timer configuration.
+ *
+ * \note Only FC_TIMER_2 and FC_TIMER_3 timers are available.
+ */
+void pi_timer_conf_init(struct pi_timer_conf_s *conf);
+
+/**
+ * \brief Open a timer.
+ *
+ * This function will do all the needed configuration to initialize a timer(on
+ * FC or Cluster) with given configuration.
+ *
+ * \param device         Pointer to device structure.
+ *
+ * \retval 0             If operation is successful.
+ * \retval ERRNO         An error code otherwise.
+ *
+ * \note This function must be called before the timer device can be used.
+ * \note For preemptive RTOS using time slicing, the FC_Timer_0 should be used as
+ *       SysTick timer. Thus a 64 bit timer can not be used on FC side.
+ */
+int pi_timer_open(struct pi_device *device);
+
+/**
+ * \brief Close an opened timer device.
+ *
+ * This function closes a timer device.
+ *
+ * \param device         Pointer to device structure.
+ */
+void pi_timer_close(struct pi_device *device);
+
+/**
+ * \brief Start a timer.
+ *
+ * This function starts a timer.
+ *
+ * \param device         Pointer to device structure.
+ *
+ * \retval 0             If operation is successful.
+ * \retval ERRNO         An error code otherwise.
+ */
+int pi_timer_start(struct pi_device *device);
+
+/**
+ * \brief Stop a timer.
+ *
+ * This function stops a timer.
+ *
+ * \param device         Pointer to device structure.
+ */
+void pi_timer_stop(struct pi_device *device);
+
+/**
+ * \brief Reset a timer counter.
+ *
+ * This function resets a timer's counter register.
+ *
+ * \param device         Pointer to device structure.
+ *
+ * \note This function does not reset a timer's configuration.
+ *       To fully reset a timer, first stop the timer, then reinitialize
+ *       the timer using the pi_timer_open() function.
+ */
+void pi_timer_reset(struct pi_device *device);
+
+/**
+ * \brief Get a timer's counter value.
+ *
+ * This function reads the current counter value of a timer and stores it in the
+ * given buffer.
+ *
+ * \param device         Pointer to device structure.
+ * \param value          Buffer to store counter value.
+ *
+ * \retval 0             If operation is successful.
+ * \retval ERRNO         An error code otherwise.
+ */
+int pi_timer_current_value_read(struct pi_device *device, uint32_t *value);
+
+int pi_timer_task_add(struct pi_device *device, uint32_t time_us, pi_task_t *task);
+
+/**
+ * \} end of Timer
+ */
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/rtos/pmsis/pmsis_api/include/pmsis/rtos/pi_log.h b/rtos/pmsis/pmsis_api/include/pmsis/rtos/pi_log.h
index 4770f588c..e70c60ca6 100644
--- a/rtos/pmsis/pmsis_api/include/pmsis/rtos/pi_log.h
+++ b/rtos/pmsis/pmsis_api/include/pmsis/rtos/pi_log.h
@@ -410,10 +410,10 @@ static inline int pi_log_default_vprintf(const char *format, va_list list)
 #define HYPER_DBG(fmt, ...) PI_LOG_DBG(HYPER_TAG, fmt, ##__VA_ARGS__)
 #define HYPER_TRC(fmt, ...) PI_LOG_TRC(HYPER_TAG, fmt, ##__VA_ARGS__)
 
-#define MRAM_TAG "mram"
+//#define MRAM_TAG "mram"
 #define MRAM_ERR(fmt, ...) PI_LOG_ERR(MRAM_TAG, fmt, ##__VA_ARGS__)
 #define MRAM_WNG(fmt, ...) PI_LOG_WNG(MRAM_TAG, fmt, ##__VA_ARGS__)
-#define MRAM_INF(fmt, ...) PI_LOG_INF(MRAM_TAG, fmt, ##__VA_ARGS__)
+#define MRAM_INF(fmt, ...)
 #define MRAM_DBG(fmt, ...) PI_LOG_DBG(MRAM_TAG, fmt, ##__VA_ARGS__)
 #define MRAM_TRC(fmt, ...) PI_LOG_TRC(MRAM_TAG, fmt, ##__VA_ARGS__)
 
diff --git a/rtos/pmsis/pmsis_api/include/pmsis/task.h b/rtos/pmsis/pmsis_api/include/pmsis/task.h
index 3d4189d8c..c6ae9ba3f 100644
--- a/rtos/pmsis/pmsis_api/include/pmsis/task.h
+++ b/rtos/pmsis/pmsis_api/include/pmsis/task.h
@@ -182,13 +182,22 @@ static inline pi_callback_t *pi_callback_init(pi_callback_t *callback,
 static inline void pi_task_timeout_set(pi_task_t *task, uint32_t timeout_us);
 
 /**
- * \brief Query result end of transfer.
+ * \brief Query task status.
  *
- * This function can be used to check the end result of a transfer.
+ * This function can be used to check if a task completed successfully.
  *
- * \return ERRNO         Value corresponding to end of transfer.
+ * \return ERRNO         Value corresponding to task status.
  */
-static inline int32_t pi_task_transfer_end_result_get(pi_task_t *task);
+static inline int32_t pi_task_status_get(pi_task_t *task);
+
+/**
+ * \brief Set task status.
+ *
+ * This function can be used to tell if a task completed successfully.
+ *
+ * \param status        Value corresponding to task status.
+ */
+static inline void pi_task_status_set(pi_task_t *task, int32_t status);
 
 /**
  * @}
diff --git a/rtos/pmsis/pmsis_bsp/adc/ads1014.c b/rtos/pmsis/pmsis_bsp/adc/ads1014.c
new file mode 100644
index 000000000..45b5835d4
--- /dev/null
+++ b/rtos/pmsis/pmsis_bsp/adc/ads1014.c
@@ -0,0 +1,349 @@
+#include "pmsis.h"
+#include "bsp/bsp.h"
+#include "bsp/adc/ads1014.h"
+
+/**************/
+/* Structures */
+/**************/
+
+enum ads1014_registers {
+    ADS1014_REGISTER_VALUE          = 0x0,
+    ADS1014_REGISTER_CONF           = 0x1,
+    ADS1014_REGISTER_THRESHOLD_LOW  = 0x2,
+    ADS1014_REGISTER_THRESHOLD_HIGH = 0x3,
+};
+
+typedef union __attribute__((__packed__)) {
+    struct __attribute__((packed)) {
+        uint8_t _unused :4;
+        int16_t value :12;
+    } reg;
+    uint16_t value;
+} ads1014_register_value_t;
+
+typedef union __attribute__((__packed__)) {
+    struct __attribute__((__packed__)) {
+        enum ads1014_comparator_status comparator_status :2;
+        enum ads1014_comparator_latch comparator_latch :1;
+        enum ads1014_comparator_polarity comparator_polarity :1;
+        enum ads1014_comparator_mode comparator_mode :1;
+        enum ads1014_data_rate data_rate :3;
+        enum ads1014_operating_mode operating_mode :1;
+        enum ads1014_pga pga :3;
+        uint8_t _unused: 3;
+        uint8_t converting :1;
+    } reg;
+    uint16_t value;
+} ads1014_register_conf_t;
+
+typedef struct {
+    pi_device_t i2c_device;
+    ads1014_register_conf_t adc_conf;
+} ads1014_data_t;
+
+/********************/
+/* Static functions */
+/********************/
+
+/* Send the cached configuration register to the chip; returns the pi_i2c_write() status. */
+static int
+__ads1014_update_configuration(pi_device_t *device) {
+    ads1014_data_t *data = (ads1014_data_t*) device->data;
+
+    /* set the adc to trigger a conversion (useful in single shot mode) */
+    data->adc_conf.reg.converting = 1;
+
+    uint16_t packed_conf = data->adc_conf.value;
+    uint8_t payload[3] = { ADS1014_REGISTER_CONF,
+        (packed_conf >> 8) & 0xFF,
+        packed_conf & 0xFF,
+    };
+
+    return pi_i2c_write(&data->i2c_device, payload, 3,
+            PI_I2C_XFER_START | PI_I2C_XFER_STOP);
+}
+
+/**
+ *
+ * Convert a float value (in mV) to fit in a comparator threshold register
+ *
+ * @param[in] pga ads1014 pga setting (to know the scale)
+ * @param[in] f float to convert
+ * @param[out] reg register that will contain the final 12 bit value
+ *
+ * @return PI_OK if operation was successful,
+ *         an error otherwise
+ */
+static inline int
+__convert_float_to_register_value(enum ads1014_pga pga, float f,
+        ads1014_register_value_t *reg)
+{
+    if (NULL == reg) {
+        return PI_ERR_INVALID_ARG;
+    }
+
+    /* Change the scale according to the ADC PGA */
+    switch(pga) {
+        case ADS1014_PGA_FSR_6V144:
+            f /= 3;
+            break;
+        case ADS1014_PGA_FSR_4V096:
+            f /= 2;
+            break;
+        case ADS1014_PGA_FSR_2V048:
+            /* nothing, f *= 1*/
+            break;
+        case ADS1014_PGA_FSR_1V024:
+            f *= 2;
+            break;
+        case ADS1014_PGA_FSR_0V512:
+            f *= 4;
+            break;
+        case ADS1014_PGA_FSR_0V256: /* fallthrough */
+        default:
+            f *= 8;
+            break;
+    }
+
+    /* fit into 12 bits */
+    int32_t i = f;
+    /* check if it fits in 12 bits */
+    if (__builtin_pulp_clb(i) < 20) {
+        return PI_ERR_INVALID_STATE;
+    }
+    reg->reg.value = f;
+
+    return PI_OK;
+}
+
+/**
+ * Helper to write a comparator register
+ *
+ * @param[in] device pointer to the ads1014 device (not checked)
+ * @param[in] reg register to write
+ * @param[in] value value to write in the register
+ *
+ * @return PI_OK if operation was successful, an error code otherwise
+ */
+static inline int
+__write_comparator_register(pi_device_t *device, enum ads1014_registers reg,
+        ads1014_register_value_t value)
+{
+    ads1014_data_t *data = (ads1014_data_t*) device->data;
+
+    uint16_t packed_conf = value.value;
+    uint8_t payload[3] = { reg,
+        (packed_conf >> 8) & 0xFF,
+        packed_conf & 0xFF,
+    };
+
+    int status = pi_i2c_write(&data->i2c_device, payload, 3,
+            PI_I2C_XFER_START | PI_I2C_XFER_STOP);
+    return status;
+}
+
+/*****************/
+/* API functions */
+/*****************/
+
+void pi_ads1014_conf_init(struct pi_ads1014_conf *conf) {
+    if (NULL == conf) {
+        return;
+    }
+
+    /* I2C related */
+    conf->i2c_itf = 1; //TODO retrieve from BSP
+    conf->i2c_addr = 0x90; //TODO retrieve from BSP
+
+    /* ADC General settings */
+    conf->operating_mode = ADS1014_OPERATING_MODE_SINGLE_SHOT;
+    conf->pga = ADS1014_PGA_FSR_2V048;
+    conf->data_rate = ADS1014_DATA_RATE_SPS_1600;
+
+    /* ADC comparator settings */
+    conf->comparator_status = ADS1014_COMPARATOR_STATUS_DISABLED;
+    conf->comparator_mode = ADS1014_COMPARATOR_MODE_TRADITIONAL;
+    conf->comparator_latch = ADS1014_COMPARATOR_LATCH_DISABLED;
+    conf->comparator_polarity = ADS1014_COMPARATOR_POLARITY_ACTIVE_LOW;
+}
+
+/* Open an ADS1014: allocate its runtime data, open the I2C link and write the
+ * initial configuration. Returns PI_OK, or an error code with everything released. */
+int pi_ads1014_open(pi_device_t *device) {
+    if (NULL == device || NULL == device->config) {
+        return PI_ERR_INVALID_ARG;
+    }
+
+    struct pi_ads1014_conf *conf = (struct pi_ads1014_conf *) device->config;
+
+    /* allocate memory for runtime data */
+    ads1014_data_t *ads1014 = (ads1014_data_t *) pi_l2_malloc(sizeof(ads1014_data_t));
+    if (NULL == ads1014) {
+        return PI_ERR_L2_NO_MEM;
+    }
+    device->data = (void*) ads1014;
+
+    /* open bsp: TODO */
+
+    /* initialize configuration register */
+    {
+        ads1014->adc_conf.value = 0; /* fresh allocation: clear the unused bits */
+        ads1014->adc_conf.reg.operating_mode = conf->operating_mode;
+        ads1014->adc_conf.reg.pga = conf->pga;
+        ads1014->adc_conf.reg.data_rate = conf->data_rate;
+        ads1014->adc_conf.reg.comparator_status = conf->comparator_status;
+        ads1014->adc_conf.reg.comparator_mode = conf->comparator_mode;
+        ads1014->adc_conf.reg.comparator_latch = conf->comparator_latch;
+        ads1014->adc_conf.reg.comparator_polarity = conf->comparator_polarity;
+        ads1014->adc_conf.reg.converting = 1; /* initial trigger */
+    }
+
+    /* initialize I2C device */
+    {
+        struct pi_i2c_conf i2c_conf;
+        pi_i2c_conf_init(&i2c_conf);
+        i2c_conf.itf = conf->i2c_itf;
+        i2c_conf.max_baudrate = 100000;
+        pi_i2c_conf_set_slave_addr(&i2c_conf, conf->i2c_addr, 0);
+        pi_open_from_conf(&ads1014->i2c_device, &i2c_conf);
+        if (PI_OK != pi_i2c_open(&ads1014->i2c_device)) {
+            pi_l2_free(ads1014, sizeof(ads1014_data_t));
+            device->data = NULL; /* pi_ads1014_close() must not see freed memory */
+            return PI_ERR_INVALID_STATE;
+        }
+    }
+
+    /* ADC configuration */
+    if (PI_OK != __ads1014_update_configuration(device)) {
+        pi_i2c_close(&ads1014->i2c_device);
+        pi_l2_free(ads1014, sizeof(ads1014_data_t));
+        device->data = NULL; /* pi_ads1014_close() must not see freed memory */
+        return PI_ERR_INVALID_STATE;
+    }
+
+    return PI_OK;
+}
+
+void pi_ads1014_close(pi_device_t *device) {
+    if (NULL == device || NULL == device->data) {
+        return;
+    }
+
+    ads1014_data_t *data = (ads1014_data_t*) device->data;
+
+    /* close devices */
+    pi_i2c_close(&data->i2c_device);
+
+    /* free memory */
+    pi_l2_free(data, sizeof(ads1014_data_t));
+    device->data = NULL;
+}
+
+int pi_ads1014_read(pi_device_t *device, float *value) {
+    if (NULL == device || NULL == device->data || NULL == value) {
+        return PI_ERR_INVALID_ARG;
+    }
+
+    ads1014_data_t *data = (ads1014_data_t*) device->data;
+
+    int status = PI_OK;
+
+    // if ADC is in continuous mode, no need to trigger a conversion.
+    // else we need to trigger for a single shot measurement
+    if (data->adc_conf.reg.operating_mode == ADS1014_OPERATING_MODE_SINGLE_SHOT) {
+        status = __ads1014_update_configuration(device);
+        if (PI_OK != status) {
+            return PI_ERR_INVALID_STATE;
+        }
+
+        //TODO wait for the end of the conversion ?
+    }
+
+    uint8_t write_payload = ADS1014_REGISTER_VALUE;
+    ads1014_register_value_t result;
+
+    status = pi_i2c_write(&data->i2c_device, &write_payload, 1,
+            PI_I2C_XFER_START | PI_I2C_XFER_STOP);
+
+    if (status != PI_OK)  {
+        return PI_ERR_INVALID_STATE;
+    }
+
+    status = pi_i2c_read(&data->i2c_device, (uint8_t*) &result, 2,
+            PI_I2C_XFER_START | PI_I2C_XFER_STOP);
+
+    if (status != PI_OK) {
+        return PI_ERR_INVALID_STATE;
+    }
+
+    result.value= (result.value << 8) | (result.value >> 8);
+
+    /* conversion depending on the pga */
+    switch(data->adc_conf.reg.pga) {
+        case ADS1014_PGA_FSR_6V144:
+            *value = result.reg.value * 3;
+            break;
+        case ADS1014_PGA_FSR_4V096:
+            *value = result.reg.value * 2;
+            break;
+        case ADS1014_PGA_FSR_2V048:
+            *value = result.reg.value * 1;
+            break;
+        case ADS1014_PGA_FSR_1V024:
+            *value = result.reg.value * 0.5;
+            break;
+        case ADS1014_PGA_FSR_0V512:
+            *value = result.reg.value * 0.25;
+            break;
+        case ADS1014_PGA_FSR_0V256: /* fallthrough */
+        default:
+            *value = result.reg.value * 0.125;
+            break;
+    }
+
+    return PI_OK;
+}
+
+int pi_ads1014_set_comparator_thresholds(pi_device_t *device,
+        float threshold_low, float threshold_high)
+{
+    if (NULL == device || NULL == device->data) {
+        return PI_ERR_INVALID_ARG;
+    }
+
+    ads1014_data_t *data = (ads1014_data_t*) device->data;
+
+    ads1014_register_value_t low_th_reg;
+    ads1014_register_value_t high_th_reg;
+
+    /* Convert threshold to 12bits */
+    {
+        /* return an error if they do not fit inside 12 bits */
+        enum ads1014_pga pga = data->adc_conf.reg.pga;
+
+        if (PI_OK != __convert_float_to_register_value(pga, threshold_low,
+                    &low_th_reg)) {
+            return PI_ERR_INVALID_STATE;
+        }
+
+        if (PI_OK != __convert_float_to_register_value(pga, threshold_high,
+                    &high_th_reg)) {
+            return PI_ERR_INVALID_STATE;
+        }
+    }
+
+    /* write values to registers */
+    int status = __write_comparator_register(device,
+            ADS1014_REGISTER_THRESHOLD_LOW, low_th_reg);
+    if (PI_OK != status) {
+        return status;
+    }
+
+    status = __write_comparator_register(device,
+            ADS1014_REGISTER_THRESHOLD_HIGH, high_th_reg);
+    if (PI_OK != status) {
+        return status;
+    }
+
+    return PI_OK;
+}
diff --git a/rtos/pmsis/pmsis_bsp/audio/adc/tlv320.c b/rtos/pmsis/pmsis_bsp/audio/adc/tlv320.c
new file mode 100644
index 000000000..1f471ece0
--- /dev/null
+++ b/rtos/pmsis/pmsis_bsp/audio/adc/tlv320.c
@@ -0,0 +1,180 @@
+/*
+ * Copyright (C) 2018 GreenWaves Technologies
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Authors: Germain Haugou, GreenWaves Technologies (germain.haugou@greenwaves-technologies.com)
+ */
+
+#include "pmsis.h"
+#include "bsp/bsp.h"
+
+
+typedef struct
+{
+    struct pi_device i2c;
+} tlv320_t;
+
+
+static int __pi_tlv320_reg_write(pi_device_t *dev, uint8_t addr, uint8_t value)
+{
+    uint8_t buffer[2] = { addr, value };
+    if (pi_i2c_write(dev, buffer, 2, PI_I2C_XFER_START | PI_I2C_XFER_STOP))
+    {
+        return -1;
+    }
+    return 0;
+}
+
+/* Read one TLV320 register over I2C and return its value. */
+static uint8_t __pi_tlv320_reg_read(pi_device_t *dev, uint8_t addr)
+{
+    uint8_t result = 0xFF; /* defined value (never an expected one) if the transfer fills nothing */
+    pi_i2c_write_read(dev, &addr, &result, 1, 1);
+    return result;
+}
+
+
+/* Open the TLV320 ADC: allocate its state, run the board-specific open and the
+ * I2C open, then program the registers. Returns 0, -1 if out of memory, else -2. */
+int pi_tlv320_open(struct pi_device *device)
+{
+    struct pi_tlv320_conf *conf = (struct pi_tlv320_conf *)device->config;
+
+    tlv320_t *tlv320 = (tlv320_t *)pmsis_l2_malloc(sizeof(tlv320_t));
+    if (tlv320 == NULL)
+    {
+        return -1;
+    }
+
+    device->data = (void *)tlv320;
+
+    if (bsp_tlv320_open(conf))
+    {
+        goto error;
+    }
+
+    struct pi_i2c_conf i2c_conf;
+    pi_i2c_conf_init(&i2c_conf);
+    i2c_conf.itf = conf->i2c_itf;
+    i2c_conf.max_baudrate = 100000;
+    pi_i2c_conf_set_slave_addr(&i2c_conf, 0x98, 0);
+
+    pi_open_from_conf(&tlv320->i2c, &i2c_conf);
+    if (pi_i2c_open(&tlv320->i2c)) goto error;
+
+    //tlv320 write i2c config (and read for debug)
+
+    uint8_t expected , read;
+
+    // Select page 0 (configuration register page)
+    expected = 0x00;
+    __pi_tlv320_reg_write(&tlv320->i2c, 0x00, expected);
+    read = __pi_tlv320_reg_read(&tlv320->i2c, 0x00);
+    printf("Read page 0x%x \n",read);
+    if (read == expected)
+      printf("Page ok \n");
+    else printf("Page failed (read 0x%x) \n",read);
+
+    // Wake-up device by I2C write into P0_R2 using internal AREG
+    __pi_tlv320_reg_write(&tlv320->i2c, 0x02, 0x81);
+    expected = 0x81;
+    __pi_tlv320_reg_write(&tlv320->i2c, 0x02, expected);
+    read = __pi_tlv320_reg_read(&tlv320->i2c, 0x02);
+    if (read == expected)
+      printf("Wake up ok \n");
+    else printf("Wake up failed (read 0x%x) \n",read);
+
+    //wait at least 1ms to complete init
+    pi_time_wait_us(2000);
+
+    // Enable Input Ch-1,2,3,4 by I2C write into P0_R115
+    //__pi_tlv320_reg_write(&tlv320->i2c, 0x73, 0xF0);
+    //only channel 2
+    expected = 0x40;
+    __pi_tlv320_reg_write(&tlv320->i2c, 0x73, expected);
+    read = __pi_tlv320_reg_read(&tlv320->i2c, 0x73);
+    if (read == expected)
+      printf("Enable channel 2 ok \n");
+    else printf("Enable channel 2 failed (read 0x%x) \n",read);
+
+    // Enable ASI Output Ch-1,2,3,4 slot by I2C write into P0_R116
+    //__pi_tlv320_reg_write(&tlv320->i2c, 0x74, 0xF0);
+    //only channel 2
+    expected = 0x40;
+    __pi_tlv320_reg_write(&tlv320->i2c, 0x74, expected);
+    read = __pi_tlv320_reg_read(&tlv320->i2c, 0x74);
+    if (read == expected)
+      printf("Enable ASI out channel 2 ok \n");
+    else printf("Enable ASI out channel 2 failed (read 0x%x) \n",read);
+
+    // Power-up ADC, MICBIAS and PLL by I2C write into P0_R117
+    //__pi_tlv320_reg_write(&tlv320->i2c, 0x75, 0xe0);
+    //power up only ADC and PLL
+    expected = 0x60;
+    __pi_tlv320_reg_write(&tlv320->i2c, 0x75, expected);
+    read = __pi_tlv320_reg_read(&tlv320->i2c, 0x75);
+    if (read == expected)
+      printf("Power up ADC and PLL  ok \n");
+    else printf("Power Up ADC and PLL failed (read 0x%x) \n",read);
+
+    // TDM 32bits mode
+    //__pi_tlv320_reg_write(&tlv320->i2c, 0x7, 0x30);
+    expected = 0x30;
+    __pi_tlv320_reg_write(&tlv320->i2c, 0x7, expected);
+    read = __pi_tlv320_reg_read(&tlv320->i2c, 0x7);
+    if (read == expected)
+      printf("TDM 32 bit ok \n");
+    else printf("TDM 32 bit failed (read 0x%x) \n",read);
+
+    // TX OFFSET. set to 1 to match our ws_delay of 1, but not sure it is the right value -- seems to work
+    expected = 0x1;
+    __pi_tlv320_reg_write(&tlv320->i2c, 0x8, expected);
+    read = __pi_tlv320_reg_read(&tlv320->i2c, 0x8);
+    if (read == expected)
+      printf("Tx offset 1  ok \n");
+    else printf("Tx offset 1  failed (read 0x%x) \n",read);
+
+    //configure AC single - ended
+    //channel1
+    //__pi_tlv320_reg_write(&tlv320->i2c, 0x3C, 0x20);
+    //channel2
+      __pi_tlv320_reg_write(&tlv320->i2c, 0x41, 0x20);
+    //channel3
+    //__pi_tlv320_reg_write(&tlv320->i2c, 0x46, 0x20);
+    //channel4
+    //__pi_tlv320_reg_write(&tlv320->i2c, 0x4B, 0x20);
+
+    return 0;
+
+error:
+    pmsis_l2_malloc_free(tlv320, sizeof(tlv320_t));
+    device->data = NULL; /* do not leave a pointer to the freed state behind */
+    return -2;
+}
+
+void pi_tlv320_close(struct pi_device *device)
+{
+  tlv320_t *tlv320 = (tlv320_t *)device->data;
+  pi_i2c_close(&tlv320->i2c); /* release the I2C link opened by pi_tlv320_open() */
+  pmsis_l2_malloc_free(tlv320, sizeof(tlv320_t));
+}
+
+
+void pi_tlv320_conf_init(struct pi_tlv320_conf *conf)
+{
+  bsp_tlv320_conf_init(conf);
+}
+
diff --git a/rtos/pmsis/pmsis_bsp/audio/dac/ak4332.c b/rtos/pmsis/pmsis_bsp/audio/dac/ak4332.c
new file mode 100644
index 000000000..120457112
--- /dev/null
+++ b/rtos/pmsis/pmsis_bsp/audio/dac/ak4332.c
@@ -0,0 +1,183 @@
+/*
+ * Copyright (C) 2018 GreenWaves Technologies
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Authors: Germain Haugou, GreenWaves Technologies (germain.haugou@greenwaves-technologies.com)
+ */
+
+#include "pmsis.h"
+#include "bsp/bsp.h"
+
+
+typedef struct
+{
+    struct pi_device i2c;
+} ak4332_t;
+
+
+static int __pi_ak4332_reg_write(pi_device_t *dev, uint8_t addr, uint8_t value)
+{
+    // One START..STOP transfer carrying the register address followed by its new value
+    uint8_t frame[2];
+    frame[0] = addr;
+    frame[1] = value;
+    int status = pi_i2c_write(dev, frame, 2, PI_I2C_XFER_START | PI_I2C_XFER_STOP);
+    return status ? -1 : 0;   // any non-zero driver status is reported as -1
+}
+
+
+// Read one register: send the 1-byte address, read 1 byte back. Returns the byte read.
+static uint8_t __pi_ak4332_reg_read(pi_device_t *dev, uint8_t addr)
+{
+    uint8_t result = 0;   // defined value (0) if the transfer leaves the buffer untouched, instead of an indeterminate stack byte
+    pi_i2c_write_read(dev, &addr, &result, 1, 1);
+    return result;
+}
+
+// Open the AK4332 DAC: allocate the state, open I2C, run the power-up writes. Returns 0, -1 (allocation failed) or -2 (board hook or I2C open failed).
+int pi_ak4332_open(struct pi_device *device)
+{
+    struct pi_ak4332_conf *conf = (struct pi_ak4332_conf *)device->config;
+
+    ak4332_t *ak4332 = (ak4332_t *)pmsis_l2_malloc(sizeof(ak4332_t));
+    if (ak4332 == NULL)
+    {
+        return -1;
+    }
+
+    device->data = (void *)ak4332;
+
+    if (bsp_ak4332_open(conf))
+    {
+        goto error;
+    }
+
+    struct pi_i2c_conf i2c_conf;
+    pi_i2c_conf_init(&i2c_conf);
+    i2c_conf.itf = conf->i2c_itf;
+    i2c_conf.max_baudrate = 100000;
+    pi_i2c_conf_set_slave_addr(&i2c_conf, 0x20, 0);
+
+    pi_open_from_conf(&ak4332->i2c, &i2c_conf);
+    if (pi_i2c_open(&ak4332->i2c)) goto error;
+
+    // DAC initial settings. NOTE(review): the status returned by every register write below is ignored.
+    __pi_ak4332_reg_write(&ak4332->i2c, 0x26, 0x02);
+
+    __pi_ak4332_reg_write(&ak4332->i2c, 0x27, 0xC0);
+
+    // Select left channel
+    __pi_ak4332_reg_write(&ak4332->i2c, 0x07, 0x1);
+
+    // Set 32bits samples
+    __pi_ak4332_reg_write(&ak4332->i2c, 0x15, 0b110);
+
+    // Set 48KHz sampling rate and CM to 512
+    __pi_ak4332_reg_write(&ak4332->i2c, 0x5, 0x2A);
+
+    // Set HP Gain to 0
+    __pi_ak4332_reg_write(&ak4332->i2c, 0x0d, 0b101);
+    // Set DAC volume to max
+    __pi_ak4332_reg_write(&ak4332->i2c, 0x0b, 0x1F);
+
+    // Configure PLL to take BCLK as input clock
+    __pi_ak4332_reg_write(&ak4332->i2c, 0x0E, 0x1);
+
+    // Configure DAC to take PLL as input clock
+    __pi_ak4332_reg_write(&ak4332->i2c, 0x13, 0x1);
+
+    // pld and plm are each written as a high byte then a low byte (presumably PLL divider settings - TODO confirm)
+    int pld = 3;
+    __pi_ak4332_reg_write(&ak4332->i2c, 0x0F, pld >> 8);
+    __pi_ak4332_reg_write(&ak4332->i2c, 0x10, pld & 0xff);
+    int plm = 31;
+    __pi_ak4332_reg_write(&ak4332->i2c, 0x11, plm >> 8);
+    __pi_ak4332_reg_write(&ak4332->i2c, 0x12, plm & 0xff);
+
+    // set volume to max
+    //__pi_ak4332_reg_write(&ak4332->i2c, 0x0b, 0x1f);
+    //__pi_ak4332_reg_write(&ak4332->i2c, 0x0d, 0x7);
+
+    // Power-up PLL
+    __pi_ak4332_reg_write(&ak4332->i2c, 0x00, 0x1);
+
+    pi_time_wait_us(20000); // 20 ms
+
+    // Power-up PMTIM
+    __pi_ak4332_reg_write(&ak4332->i2c, 0x00, 0x3);
+
+    // Power-up charge pump for both channels
+    __pi_ak4332_reg_write(&ak4332->i2c, 0x01, 0x1);
+
+    pi_time_wait_us(65000); // 65 ms
+
+    // Power-up LDO1
+    __pi_ak4332_reg_write(&ak4332->i2c, 0x01, 0x31);
+
+    pi_time_wait_us(5000); // 5 ms
+
+    // Power up charge pump 2
+    __pi_ak4332_reg_write(&ak4332->i2c, 0x01, 0x33);
+
+    // Power-up DAC
+    __pi_ak4332_reg_write(&ak4332->i2c, 0x02, 0x1);
+
+    // Power-up Amplifier
+    __pi_ak4332_reg_write(&ak4332->i2c, 0x03, 0x1);
+
+    return 0;
+
+error: // NOTE(review): device->data still points at the block freed below
+    pmsis_l2_malloc_free(ak4332, sizeof(ak4332_t));
+    return -2;
+}
+
+
+int pi_ak4332_set_dac_volume(pi_device_t *device, uint8_t volume)
+{
+    // Values above 0x1F are rejected with -1; otherwise the value is written to register 0x0b
+    if (volume > 0x1F)
+        return -1;
+
+    ak4332_t *state = (ak4332_t *)device->data;
+    int status = __pi_ak4332_reg_write(&state->i2c, 0x0b, volume);
+    return status;
+}
+
+int pi_ak4332_set_hp_volume(pi_device_t *device, uint8_t volume)
+{
+    // Values above 0x7 are rejected with -1; otherwise the value is written to register 0x0d
+    if (volume > 0x7)
+        return -1;
+
+    ak4332_t *state = (ak4332_t *)device->data;
+    int status = __pi_ak4332_reg_write(&state->i2c, 0x0d, volume);
+    return status;
+}
+
+
+// Undo pi_ak4332_open(): close the I2C handle it opened (previously leaked), then free the driver state.
+void pi_ak4332_close(struct pi_device *device)
+{
+  ak4332_t *ak4332 = (ak4332_t *)device->data;
+  pi_i2c_close(&ak4332->i2c);
+  pmsis_l2_malloc_free(ak4332, sizeof(ak4332_t));
+}
+// Fill conf by delegating to the board-level bsp_ak4332_conf_init().
+void pi_ak4332_conf_init(struct pi_ak4332_conf *conf)
+{
+  bsp_ak4332_conf_init(conf);
+}
diff --git a/rtos/pmsis/pmsis_bsp/bsp/gap9_evk.c b/rtos/pmsis/pmsis_bsp/bsp/gap9_evk.c
index 67df81b68..124ceff6f 100644
--- a/rtos/pmsis/pmsis_bsp/bsp/gap9_evk.c
+++ b/rtos/pmsis/pmsis_bsp/bsp/gap9_evk.c
@@ -17,7 +17,6 @@
 #include "pmsis.h"
 
 #include "bsp/bsp.h"
-#include "bsp/gap9_v2.h"
 #include "bsp/camera/himax.h"
 #include "bsp/flash/hyperflash.h"
 #include "bsp/ram/hyperram.h"
@@ -35,6 +34,19 @@ static void __bsp_init_pads()
   }
 }
 
+// Fill conf (RAM start, RAM size, SPI interface, SPI chip select) from the CONFIG_APS25XXXN_* constants.
+void bsp_aps25xxxn_conf_init(struct pi_aps25xxxn_conf *conf)
+{
+    conf->ram_start = CONFIG_APS25XXXN_START;
+    conf->ram_size = CONFIG_APS25XXXN_SIZE;
+    conf->spi_itf = CONFIG_APS25XXXN_SPI_ITF;
+    conf->spi_cs = CONFIG_APS25XXXN_SPI_CS;
+}
+// Board hook for opening the APS25XXXN: does nothing with conf and always returns 0.
+int bsp_aps25xxxn_open(struct pi_aps25xxxn_conf *conf)
+{
+    return 0;
+}
 
 int bsp_24xx1025_open(struct pi_24xx1025_conf *conf)
 {
@@ -64,93 +76,20 @@ void bsp_virtual_eeprom_conf_init(struct pi_virtual_eeprom_conf *conf)
   conf->i2c_itf = CONFIG_VIRTUAL_EEPROM_I2C_ITF;
 }
 
-void bsp_hyperram_conf_init(struct pi_hyperram_conf *conf)
-{
-  conf->ram_start = CONFIG_HYPERRAM_START;
-  conf->ram_size = CONFIG_HYPERRAM_SIZE;
-  conf->skip_pads_config = 0;
-  conf->hyper_itf = CONFIG_HYPERRAM_HYPER_ITF;
-  conf->hyper_cs = CONFIG_HYPERRAM_HYPER_CS;
-}
-
-
-int bsp_hyperram_open(struct pi_hyperram_conf *conf)
-{
-  __bsp_init_pads();
-  return 0;
-}
-
-
-void bsp_spiram_conf_init(struct pi_spiram_conf *conf)
-{
-  conf->ram_start = CONFIG_SPIRAM_START;
-  conf->ram_size = CONFIG_SPIRAM_SIZE;
-  conf->skip_pads_config = 0;
-  conf->spi_itf = CONFIG_SPIRAM_SPI_ITF;
-  conf->spi_cs = CONFIG_SPIRAM_SPI_CS;
-}
 
-int bsp_spiram_open(struct pi_spiram_conf *conf)
+void bsp_mx25u51245g_conf_init(struct pi_mx25u51245g_conf *conf)
 {
-  return 0;
-}
-
-
-void bsp_aps25xxxn_conf_init(struct pi_aps25xxxn_conf *conf)
-{
-    conf->ram_start = CONFIG_APS25XXXN_START;
-    conf->ram_size = CONFIG_APS25XXXN_SIZE;
-    conf->spi_itf = CONFIG_APS25XXXN_SPI_ITF;
-    conf->spi_cs = CONFIG_APS25XXXN_SPI_CS;
-}
-
-int bsp_aps25xxxn_open(struct pi_aps25xxxn_conf *conf)
-{
-    return 0;
-}
-
-
-void bsp_atxp032_conf_init(struct pi_atxp032_conf *conf)
-{
-    conf->spi_itf = CONFIG_ATXP032_SPI_ITF;
-    conf->spi_cs = CONFIG_ATXP032_SPI_CS;
+    conf->spi_itf = CONFIG_MX25U51245G_SPI_ITF;
+    conf->spi_cs = CONFIG_MX25U51245G_SPI_CS;
     conf->baudrate = 200000000;
 }
 
-int bsp_atxp032_open(struct pi_atxp032_conf *conf)
+int bsp_mx25u51245g_open(struct pi_mx25u51245g_conf *conf)
 {
     return 0;
 }
 
 
-void bsp_spiflash_conf_init(struct pi_spiflash_conf *conf)
-{
-  conf->size = CONFIG_SPIFLASH_SIZE;
-  // sector size is in number of KB
-  conf->sector_size = CONFIG_SPIFLASH_SECTOR_SIZE;
-  conf->spi_itf = CONFIG_SPIFLASH_SPI_ITF;
-  conf->spi_cs = CONFIG_SPIFLASH_SPI_CS;
-}
-
-int bsp_spiflash_open(struct pi_spiflash_conf *conf)
-{
-  return 0;
-}
-
-
-void bsp_hyperflash_conf_init(struct pi_hyperflash_conf *conf)
-{
-  conf->hyper_itf = CONFIG_HYPERFLASH_HYPER_ITF;
-  conf->hyper_cs = CONFIG_HYPERFLASH_HYPER_CS;
-}
-
-int bsp_hyperflash_open(struct pi_hyperflash_conf *conf)
-{
-  __bsp_init_pads();
-  return 0;
-}
-
-
 
 void bsp_himax_conf_init(struct pi_himax_conf *conf)
 {
@@ -164,22 +103,6 @@ int bsp_himax_open(struct pi_himax_conf *conf)
   return 0;
 }
 
-void bsp_nina_b112_conf_init(struct pi_nina_b112_conf *conf)
-{
-    conf->uart_itf = (uint8_t) CONFIG_NINA_B112_UART_ID;
-}
-
-int bsp_nina_b112_open(struct pi_nina_b112_conf *conf)
-{
-    return 0;
-}
-
-int bsp_nina_b112_open_old()
-{
-    __bsp_init_pads();
-    return 0;
-}
-
 void bsp_init()
 {
 }
@@ -191,8 +114,25 @@ void pi_bsp_init_profile(int profile)
 
 
 
+// This function is automatically called by the OS during init
 void pi_bsp_init()
 {
+    // Set the pads alternate so that we have by default flash/ram and uart
+    // working.
+    // Flash and ram are on hyperbus0, pads 0 to 12 included.
+    // Uart is on pad 44 and 45
+#ifdef __FREERTOS__
+    // TODO freertos is setting everything to 0 by default, keep it for now to not break everything
+    uint32_t pad_values[] = { 0, 0, 0, 0, 0, 0 };
+#else
+    uint32_t pad_values[] = { 0x54000000, 0x55555555, 0x50555555, 0x55555555, 0x55555555, 0x55555555 };
+#endif
+    pi_pad_init(pad_values);
+
+    // Since pad 44 and 45 are for i2c, we need to configure the pad mux
+    pi_pad_set_mux_group(PI_PAD_044, PI_PAD_MUX_GROUP_UART1_RX);
+    pi_pad_set_mux_group(PI_PAD_045, PI_PAD_MUX_GROUP_UART1_TX);
+
     pi_bsp_init_profile(PI_BSP_PROFILE_DEFAULT);
 
 #if defined(CONFIG_GAP9_EVK_AUDIO_ADDON)
diff --git a/rtos/pmsis/pmsis_bsp/bsp/gap9_evk_audio_addon.c b/rtos/pmsis/pmsis_bsp/bsp/gap9_evk_audio_addon.c
index c7220d13f..cffee3e17 100644
--- a/rtos/pmsis/pmsis_bsp/bsp/gap9_evk_audio_addon.c
+++ b/rtos/pmsis/pmsis_bsp/bsp/gap9_evk_audio_addon.c
@@ -17,119 +17,127 @@
 #include "pmsis.h"
 
 #include "bsp/bsp.h"
-#include "bsp/gap9_v2.h"
 #include "bsp/boards/gap9_evk/audio_addon.h"
 
-static pi_device_t __bsp_fxl6408_i2c;
-static PI_FC_TINY uint8_t __bsp_fxl6408_is_init;
-static PI_FC_TINY uint32_t __bsp_fxl6408_output_state;  // Keep as 32bits to lower code footprint
 
-
-void bsp_ak4332_conf_init(struct pi_ak4332_conf *conf)
-{
-    conf->i2c_itf = CONFIG_AK4332_I2C_ITF;
-    conf->i2s_itf = CONFIG_AK4332_I2S_ITF;
-}
-
-
-int bsp_ak4332_open(struct pi_ak4332_conf *conf)
-{
-    return 0;
-}
+#if defined(CONFIG_FXL6408)
+static pi_device_t __pi_bsp_fxl6408;
+#if !defined(__FREERTOS__)
+static PI_FC_TINY uint8_t __pi_bsp_fxl6408_is_init;
+#else
+static uint8_t __pi_bsp_fxl6408_is_init;
+#endif
 
 
-static void __bsp_fxl6408_write_reg(uint8_t addr, uint8_t value)
+void bsp_fxl6408_conf_init(struct pi_fxl6408_conf *conf)
 {
-    uint8_t buffer[2] = { addr, value };
-    pi_i2c_write(&__bsp_fxl6408_i2c, buffer, 2, PI_I2C_XFER_START | PI_I2C_XFER_STOP);
+    pi_fxl6408_conf_init(conf);
+    conf->i2c_itf = CONFIG_FXL6408UMX_I2C_ITF;
+    conf->interrupt_pin = PI_PAD_089;
 }
 
 
-static uint8_t __bsp_fxl6408_read_reg(uint8_t addr)
+static __attribute__((noinline)) int __pi_bsp_fxl6408_do_init()
 {
-    uint8_t result;
-    pi_i2c_write_read(&__bsp_fxl6408_i2c, &addr, &result, 1, 1);
-    return result;
-}
-
-
-static __attribute__((noinline)) int __bsp_fxl6408_do_init()
-{
-    struct pi_i2c_conf conf;
-    pi_i2c_conf_init(&conf);
-    conf.itf = CONFIG_FXL6408UMX_I2C_ITF;
-    pi_i2c_conf_set_slave_addr(&conf, CONFIG_FXL6408UMX_I2C_ADDR, 0);
-
-    pi_open_from_conf(&__bsp_fxl6408_i2c, &conf);
-    if (pi_i2c_open(&__bsp_fxl6408_i2c))
+    struct pi_fxl6408_conf conf;
+    pi_fxl6408_conf_init(&conf);
+    pi_open_from_conf(&__pi_bsp_fxl6408, &conf);
+    if (pi_fxl6408_open(&__pi_bsp_fxl6408))
     {
         return - 1;
     }
 
-    // Configure GPIO direction (output) for dac and adc
-    __bsp_fxl6408_write_reg(0x03, (1 << CONFIG_FXL6408UMX_AK4332_GPIO) | (1 << CONFIG_FXL6408UMX_TLV320_GPIO));
-
-    __bsp_fxl6408_output_state = 0;
-    __bsp_fxl6408_is_init = 1;
+    __pi_bsp_fxl6408_is_init = 1;
 
     return 0;
 }
 
-
-static inline __attribute__((always_inline)) int __bsp_fxl6408_check_init()
+static inline __attribute__((always_inline)) int __pi_bsp_fxl6408_check_init()
 {
-    if (!__bsp_fxl6408_is_init)
+    if (!__pi_bsp_fxl6408_is_init)
     {
-        return __bsp_fxl6408_do_init();
+        return __pi_bsp_fxl6408_do_init();
     }
 
     return 0;
 }
+#endif
 
 
-static void __bsp_fxl6408_gpio_output_state(unsigned int gpio, int state)
+#if defined(CONFIG_AK4332)
+void bsp_ak4332_conf_init(struct pi_ak4332_conf *conf)
 {
-    __bsp_fxl6408_output_state = __BITINSERT_R(__bsp_fxl6408_output_state, state, 1, gpio);
-
-    __bsp_fxl6408_write_reg(0x05, __bsp_fxl6408_output_state);
+    conf->i2c_itf = CONFIG_AK4332_I2C_ITF;
 }
 
-
-void __bsp_audio_addon_init()
+int bsp_ak4332_open(struct pi_ak4332_conf *conf)
 {
-    // Initialize this global state here to work well with retentive wake-up
-    __bsp_fxl6408_is_init = 0;
+    if (__pi_bsp_fxl6408_check_init())
+    {
+        return -1;
+    }
 
-    // Configure padframe for I2C1
-    // TODO should use real pad names when available
-    pi_pad_set_function(PI_PAD_042, 0);
-    pi_pad_set_function(PI_PAD_043, 0);
+    pi_fxl6408_gpio_conf_t gpio_conf;
+    pi_fxl6408_gpio_conf_init(&gpio_conf);
+    gpio_conf.id = CONFIG_FXL6408UMX_AK4332_GPIO;
+    gpio_conf.direction = FXL6408_GPIO_DIR_OUTPUT;
+    gpio_conf.output_state = FXL6408_GPIO_OUTPUT_STATE_HIGH;
+
+    if (PI_OK != pi_fxl6408_gpio_set(&__pi_bsp_fxl6408, &gpio_conf))
+    {
+        return -1;
+    }
+
+    // Wait at least 1ms after ak4332 power-up
+    // TODO took 2x as margin, check we can switch back to 1
+    pi_time_wait_us(2000);
+
+    return 0;
 }
+#endif
 
 
-uint8_t pi_bsp_fxl6408_read_id()
+#if defined(CONFIG_TLV320)
+void bsp_tlv320_conf_init(struct pi_tlv320_conf *conf)
 {
-    if (__bsp_fxl6408_check_init()) return 0;
-    
-    return __bsp_fxl6408_read_reg(0x01);
+    conf->i2c_itf = CONFIG_TLV320_I2C_ITF;
 }
 
-
-int pi_bsp_ak4332_power_ctrl(int power_enable)
+int bsp_tlv320_open(struct pi_tlv320_conf *conf)
 {
-    if (__bsp_fxl6408_check_init()) return -1;
+    if (__pi_bsp_fxl6408_check_init())
+    {
+        return -1;
+    }
 
-    __bsp_fxl6408_gpio_output_state(CONFIG_FXL6408UMX_AK4332_GPIO, power_enable);
+    pi_fxl6408_gpio_conf_t gpio_conf;
+    pi_fxl6408_gpio_conf_init(&gpio_conf);
+    gpio_conf.id = CONFIG_FXL6408UMX_TLV320_GPIO;
+    gpio_conf.direction = FXL6408_GPIO_DIR_OUTPUT;
+    gpio_conf.output_state = FXL6408_GPIO_OUTPUT_STATE_HIGH;
+
+    if (PI_OK != pi_fxl6408_gpio_set(&__pi_bsp_fxl6408, &gpio_conf))
+    {
+        return -1;
+    }
 
     return 0;
 }
+#endif
 
 
-int pi_bsp_tlv320_power_ctrl(int power_enable)
+void __bsp_audio_addon_init()
 {
-    if (__bsp_fxl6408_check_init()) return -1;
+#if defined(CONFIG_FXL6408)
+    // Initialize this global state here to work well with retentive wake-up
+    __pi_bsp_fxl6408_is_init = 0;
+#endif
 
-    __bsp_fxl6408_gpio_output_state(CONFIG_FXL6408UMX_TLV320_GPIO, power_enable);
+    // Configure padframe for I2C1
+    // TODO should use real pad names when available
+    pi_pad_set_function(PI_PAD_042, PI_PAD_FUNC0);
+    pi_pad_set_function(PI_PAD_043, PI_PAD_FUNC0);
 
-    return 0;
+    // configure gpio for gpio expander interrupt
+    pi_pad_set_function(PI_PAD_089, PI_PAD_FUNC1);
 }
diff --git a/rtos/pmsis/pmsis_bsp/docs/adc.rst b/rtos/pmsis/pmsis_bsp/docs/adc.rst
new file mode 100644
index 000000000..e033027ad
--- /dev/null
+++ b/rtos/pmsis/pmsis_bsp/docs/adc.rst
@@ -0,0 +1,10 @@
+ADC
+---
+
+ADS1014
+"""""""
+
+.. doxygengroup:: ADS1014
+   :members:
+   :private-members:
+   :protected-members:
diff --git a/rtos/pmsis/pmsis_bsp/docs/gpio.rst b/rtos/pmsis/pmsis_bsp/docs/gpio.rst
new file mode 100644
index 000000000..c56fd18e7
--- /dev/null
+++ b/rtos/pmsis/pmsis_bsp/docs/gpio.rst
@@ -0,0 +1,10 @@
+GPIO
+----
+
+FXL6408
+"""""""
+
+.. doxygengroup:: FXL6408
+   :members:
+   :private-members:
+   :protected-members:
diff --git a/rtos/pmsis/pmsis_bsp/docs/index.rst b/rtos/pmsis/pmsis_bsp/docs/index.rst
index 0db979642..2be0a4424 100644
--- a/rtos/pmsis/pmsis_bsp/docs/index.rst
+++ b/rtos/pmsis/pmsis_bsp/docs/index.rst
@@ -38,9 +38,11 @@ Drivers
 .. toctree::
    :maxdepth: 2
 
-   ram.rst
-   flash.rst
-   camera.rst
-   fs.rst
+   adc.rst
    ble.rst
+   camera.rst
    display.rst
+   flash.rst
+   fs.rst
+   gpio.rst
+   ram.rst
diff --git a/rtos/pmsis/pmsis_bsp/flash/mram/mram-v2.c b/rtos/pmsis/pmsis_bsp/flash/mram/mram-v2.c
index 2df49b820..6fb99e928 100644
--- a/rtos/pmsis/pmsis_bsp/flash/mram/mram-v2.c
+++ b/rtos/pmsis/pmsis_bsp/flash/mram/mram-v2.c
@@ -230,11 +230,7 @@ static void pos_mram_handle_pending_tasks(void *arg)
 }
 
 
-#ifndef PMSIS_DRIVERS
-PI_LOCAL_CODE static void pos_mram_handle_event(int event, void *arg)
-#else
-static void pos_mram_handle_event(void *arg)
-#endif
+PI_LOCAL_CODE static void pos_mram_handle_event(uint32_t event, void *arg)
 {
     pi_device_t *dev = (pi_device_t *)arg;
     pos_mram_t *mram = (pos_mram_t *)(pos_mram_t *)dev->data;
diff --git a/rtos/pmsis/pmsis_bsp/flash/spiflash/atxp032.c b/rtos/pmsis/pmsis_bsp/flash/spiflash/atxp032.c
index de792089f..5b3a14628 100644
--- a/rtos/pmsis/pmsis_bsp/flash/spiflash/atxp032.c
+++ b/rtos/pmsis/pmsis_bsp/flash/spiflash/atxp032.c
@@ -61,7 +61,7 @@
 #define ATXP032_PROGRAM_FLAGS (PI_OCTOSPI_FLAG_CMD_SIZE_1 | PI_OCTOSPI_FLAG_ADDR_SIZE_4 | PI_OCTOSPI_FLAG_LINE_OCTO | PI_OCTOSPI_FLAG_CMD_STR | PI_OCTOSPI_FLAG_ADDR_DTR | PI_OCTOSPI_FLAG_DATA_DTR)
 
 #define ATXP032_READ_CMD (0x0B | PI_OCTOSPI_CMD_ADDR_EVEN)
-#define ATXP032_READ_LATENCY 22
+#define ATXP032_READ_LATENCY 5
 #define ATXP032_READ_FLAGS (PI_OCTOSPI_FLAG_CMD_SIZE_1 | PI_OCTOSPI_FLAG_ADDR_SIZE_4 | PI_OCTOSPI_FLAG_LINE_OCTO | PI_OCTOSPI_FLAG_CMD_STR | PI_OCTOSPI_FLAG_ADDR_DTR | PI_OCTOSPI_FLAG_DATA_DTR)
 
 #define SECTOR_SIZE (1<<12)
@@ -248,7 +248,7 @@ static int atxp032_open(struct pi_device *device)
 
 
     // Activate octospi mode and DTR and unprotect all sectors
-    uint32_t data = 0x1b880200;
+    uint32_t data = 0x17880200;
 
     pi_octospi_op_conf_t op_ws = { .cmd=ATXP032_WRITE_STATUS_CMD, .latency=ATXP032_WRITE_STATUS_LATENCY_SPI, .flags=ATXP032_WRITE_STATUS_FLAGS_SPI };
     pi_octospi_write(&atxp032->octospi_device, 0, &data, 4, &op_ws);
@@ -262,7 +262,7 @@ static int atxp032_open(struct pi_device *device)
 
     // Activate octospi mode and DTR and unprotect all sectors
     // Since the UDMA does not support 1 byte address in DDR mode, we pack it into the data
-    char status_regs[5] = { 0x00, 0x00, 0x02, 0x88, 0x1b };
+    char status_regs[5] = { 0x00, 0x00, 0x02, 0x88, 0x17 };
 
     pi_octospi_op_conf_t op_ws = { .cmd=ATXP032_WRITE_STATUS_CMD, .latency=ATXP032_WRITE_STATUS_LATENCY_OCTO, .flags=ATXP032_WRITE_STATUS_FLAGS_OCTO };
     pi_octospi_write(&atxp032->octospi_device, 0, status_regs, 5, &op_ws);
@@ -271,6 +271,12 @@ static int atxp032_open(struct pi_device *device)
   // In the spec writing to volatile status register should take 200ns but RTL model take 10us to update it
   pi_time_wait_us(20);
 
+
+  if(conf->xip_en)
+  {
+      pi_octospi_ioctl(&atxp032->octospi_device, PI_OCTOSPI_IOCTL_SET_XIP_OP, (void *)&atxp032_read_op);
+  }
+
   return 0;
 
 error:
diff --git a/rtos/pmsis/pmsis_bsp/flash/spiflash/mx25u51245g.c b/rtos/pmsis/pmsis_bsp/flash/spiflash/mx25u51245g.c
new file mode 100644
index 000000000..44fc4e4ed
--- /dev/null
+++ b/rtos/pmsis/pmsis_bsp/flash/spiflash/mx25u51245g.c
@@ -0,0 +1,900 @@
+/*
+ * Copyright (C) 2018 GreenWaves Technologies
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Authors: Germain Haugou, GreenWaves Technologies (germain.haugou@greenwaves-technologies.com)
+ */
+
+// Driver for the octo spi flash Macronix MX25U
+
+#include "pmsis.h"
+#include "bsp/bsp.h"
+#include "pmsis/drivers/octospi.h"
+
+#ifndef PI_LOCAL_CODE
+#define PI_LOCAL_CODE
+#endif
+
+#if defined(CONFIG_XIP)
+// For now we always activate XIP locking since we only have flash which do not support concurrent read and write.
+// This has to be deactivated for flash which support it (e.g. ATXP064R)
+#define MX25U_LOCK_XIP
+#endif
+
+#define MX25U_READ_STATUS_CMD_SPI (0x05 | PI_OCTOSPI_CMD_ADDR_EVEN)
+#define MX25U_READ_STATUS_CMD_OCTO (0x05FA | PI_OCTOSPI_CMD_ADDR_EVEN)
+#define MX25U_READ_STATUS_LATENCY_SPI 0
+#define MX25U_READ_STATUS_LATENCY_OCTO 4
+#define MX25U_READ_STATUS_FLAGS_SPI (PI_OCTOSPI_FLAG_CMD_SIZE_1 | PI_OCTOSPI_FLAG_ADDR_SIZE_0 | PI_OCTOSPI_FLAG_LINE_SINGLE | PI_OCTOSPI_FLAG_CMD_STR | PI_OCTOSPI_FLAG_ADDR_STR | PI_OCTOSPI_FLAG_DATA_STR)
+#define MX25U_READ_STATUS_FLAGS_OCTO (PI_OCTOSPI_FLAG_CMD_SIZE_2 | PI_OCTOSPI_FLAG_ADDR_SIZE_4 | PI_OCTOSPI_FLAG_LINE_OCTO | PI_OCTOSPI_FLAG_CMD_DTR | PI_OCTOSPI_FLAG_ADDR_DTR | PI_OCTOSPI_FLAG_DATA_DTR)
+
+#define MX25U_WRITE_STATUS_CMD (0x71 | PI_OCTOSPI_CMD_ADDR_EVEN)
+#define MX25U_WRITE_STATUS_LATENCY_SPI 0
+#define MX25U_WRITE_STATUS_LATENCY_OCTO 0
+#define MX25U_WRITE_STATUS_FLAGS_SPI (PI_OCTOSPI_FLAG_CMD_SIZE_1 | PI_OCTOSPI_FLAG_ADDR_SIZE_4 | PI_OCTOSPI_FLAG_LINE_SINGLE | PI_OCTOSPI_FLAG_CMD_STR | PI_OCTOSPI_FLAG_ADDR_STR | PI_OCTOSPI_FLAG_DATA_STR)
+#define MX25U_WRITE_STATUS_FLAGS_OCTO (PI_OCTOSPI_FLAG_CMD_SIZE_1 | PI_OCTOSPI_FLAG_ADDR_SIZE_0 | PI_OCTOSPI_FLAG_LINE_OCTO | PI_OCTOSPI_FLAG_CMD_STR | PI_OCTOSPI_FLAG_ADDR_DTR | PI_OCTOSPI_FLAG_DATA_DTR)
+
+#define MX25U_READ_CONFREG_CMD_SPI (0x71 | PI_OCTOSPI_CMD_ADDR_EVEN)
+#define MX25U_READ_CONFREG_CMD_OCTO (0x718E | PI_OCTOSPI_CMD_ADDR_EVEN)
+#define MX25U_READ_CONFREG_LATENCY_SPI 0
+#define MX25U_READ_CONFREG_LATENCY_OCTO 4
+#define MX25U_READ_CONFREG_FLAGS_SPI (PI_OCTOSPI_FLAG_CMD_SIZE_1 | PI_OCTOSPI_FLAG_ADDR_SIZE_4 | PI_OCTOSPI_FLAG_LINE_SINGLE | PI_OCTOSPI_FLAG_CMD_STR | PI_OCTOSPI_FLAG_ADDR_STR | PI_OCTOSPI_FLAG_DATA_STR)
+#define MX25U_READ_CONFREG_FLAGS_OCTO (PI_OCTOSPI_FLAG_CMD_SIZE_2 | PI_OCTOSPI_FLAG_ADDR_SIZE_4 | PI_OCTOSPI_FLAG_LINE_OCTO | PI_OCTOSPI_FLAG_CMD_DTR | PI_OCTOSPI_FLAG_ADDR_DTR | PI_OCTOSPI_FLAG_DATA_DTR)
+
+#define MX25U_WRITE_CONFREG_CMD (0x72 | PI_OCTOSPI_CMD_ADDR_EVEN)
+#define MX25U_WRITE_CONFREG_LATENCY_SPI 0
+#define MX25U_WRITE_CONFREG_LATENCY_OCTO 0
+#define MX25U_WRITE_CONFREG_FLAGS_SPI (PI_OCTOSPI_FLAG_CMD_SIZE_1 | PI_OCTOSPI_FLAG_ADDR_SIZE_4 | PI_OCTOSPI_FLAG_LINE_SINGLE | PI_OCTOSPI_FLAG_CMD_STR | PI_OCTOSPI_FLAG_ADDR_STR | PI_OCTOSPI_FLAG_DATA_STR)
+#define MX25U_WRITE_CONFREG_FLAGS_OCTO (PI_OCTOSPI_FLAG_CMD_SIZE_1 | PI_OCTOSPI_FLAG_ADDR_SIZE_0 | PI_OCTOSPI_FLAG_LINE_OCTO | PI_OCTOSPI_FLAG_CMD_STR | PI_OCTOSPI_FLAG_ADDR_DTR | PI_OCTOSPI_FLAG_DATA_DTR)
+
+#define MX25U_WRITE_ENABLE_CMD_SPI (0x0006 | PI_OCTOSPI_CMD_ADDR_EVEN)
+#define MX25U_WRITE_ENABLE_CMD_OCTO (0x06F9 | PI_OCTOSPI_CMD_ADDR_EVEN)
+#define MX25U_WRITE_ENABLE_LATENCY_SPI 0
+#define MX25U_WRITE_ENABLE_LATENCY_OCTO 0
+#define MX25U_WRITE_ENABLE_FLAGS_SPI (PI_OCTOSPI_FLAG_CMD_SIZE_1 | PI_OCTOSPI_FLAG_LINE_SINGLE | PI_OCTOSPI_FLAG_CMD_STR | PI_OCTOSPI_FLAG_DATA_STR)
+#define MX25U_WRITE_ENABLE_FLAGS_OCTO (PI_OCTOSPI_FLAG_CMD_SIZE_2 | PI_OCTOSPI_FLAG_LINE_OCTO | PI_OCTOSPI_FLAG_CMD_DTR | PI_OCTOSPI_FLAG_DATA_DTR)
+
+#define MX25U_ERASE_CMD (0x21DE | PI_OCTOSPI_CMD_ADDR_EVEN)
+#define MX25U_ERASE_LATENCY 0
+#define MX25U_ERASE_FLAGS (PI_OCTOSPI_FLAG_CMD_SIZE_2 | PI_OCTOSPI_FLAG_ADDR_SIZE_4 | PI_OCTOSPI_FLAG_LINE_OCTO | PI_OCTOSPI_FLAG_CMD_DTR | PI_OCTOSPI_FLAG_ADDR_DTR | PI_OCTOSPI_FLAG_DATA_DTR)
+
+#define MX25U_PROGRAM_CMD (0x12ED | PI_OCTOSPI_CMD_ADDR_EVEN)
+#define MX25U_PROGRAM_LATENCY 0
+#define MX25U_PROGRAM_FLAGS (PI_OCTOSPI_FLAG_CMD_SIZE_2 | PI_OCTOSPI_FLAG_ADDR_SIZE_4 | PI_OCTOSPI_FLAG_LINE_OCTO | PI_OCTOSPI_FLAG_CMD_DTR | PI_OCTOSPI_FLAG_ADDR_DTR | PI_OCTOSPI_FLAG_DATA_DTR | PI_OCTOSPI_FLAG_DATA_DTR_MSB)
+
+#define MX25U_READ_CMD (0xEE11 | PI_OCTOSPI_CMD_ADDR_EVEN)
+#define MX25U_READ_LATENCY 20
+#define MX25U_READ_FLAGS (PI_OCTOSPI_FLAG_CMD_SIZE_2 | PI_OCTOSPI_FLAG_ADDR_SIZE_4 | PI_OCTOSPI_FLAG_LINE_OCTO | PI_OCTOSPI_FLAG_CMD_DTR | PI_OCTOSPI_FLAG_ADDR_DTR | PI_OCTOSPI_FLAG_DATA_DTR | PI_OCTOSPI_FLAG_DATA_DTR_MSB)
+
+#define SECTOR_SIZE (1<<12)
+
+#define STALL_TASK_PROGRAM      0
+#define STALL_TASK_ERASE_CHIP   1
+#define STALL_TASK_ERASE_SECTOR 2
+#define STALL_TASK_REG_SET      3
+#define STALL_TASK_REG_GET      4
+#define STALL_TASK_READ         5
+#define STALL_TASK_READ_2D      6
+
+// Per-device driver state, allocated in mx25u_open() and stored in device->data
+typedef struct {
+    struct pi_device octospi_device; // Underlying octospi device used for every transfer
+    // Used for communications with mx25u through udma
+    uint16_t udma_buffer[2];
+
+    // Waiting queue for common operations (only 1 is handled at the same time)
+    pi_task_t *waiting_first;
+    pi_task_t *waiting_last;
+
+    // Task to be enqueued when the on-going operation is done
+    pi_task_t *pending_task;
+
+    // Waiting queue for erase operation (only 1 is handled at the same time) as it needs a
+    // second level FSM
+    pi_task_t *erase_waiting_first;
+    pi_task_t *erase_waiting_last;
+
+    // Task to be enqueued when the on-going erase operation is done
+    pi_task_t *erase_task;
+
+    // Task used for internal FSM scheduling for common operations
+    pi_task_t task;
+
+    // Task used for internal second-level FSM scheduling (for erase operation)
+    pi_task_t task2;
+
+    // Description of on-going task for common operations. The FSM will keep executing
+    // until this operation is done
+    uint32_t pending_octospi_addr;
+    uint32_t pending_data;
+    uint32_t pending_size;
+
+    // Description of on-going task for erase operation. The FSM will keep executing
+    // until this operation is done
+    uint32_t pending_erase_octospi_addr;
+    uint32_t pending_erase_size;
+} mx25u_t;
+
+
+static pi_octospi_op_conf_t mx25u_erase_op = { .cmd=MX25U_ERASE_CMD, .latency=MX25U_ERASE_LATENCY, .flags=MX25U_ERASE_FLAGS }; // erase command descriptor
+// Program command descriptor (command, latency and flags from the MX25U_PROGRAM_* macros)
+static pi_octospi_op_conf_t mx25u_program_op = { .cmd=MX25U_PROGRAM_CMD, .latency=MX25U_PROGRAM_LATENCY, .flags=MX25U_PROGRAM_FLAGS };
+// Read command descriptor (command, latency and flags from the MX25U_READ_* macros), used by the read paths
+static pi_octospi_op_conf_t mx25u_read_op = { .cmd=MX25U_READ_CMD, .latency=MX25U_READ_LATENCY, .flags=MX25U_READ_FLAGS };
+
+
+
+static void mx25u_program_async(struct pi_device *device, uint32_t octospi_addr, const void *data, uint32_t size, pi_task_t *task);
+
+static void mx25u_check_program(void *arg);
+
+static void mx25u_erase_async(struct pi_device *device, uint32_t addr, int size, pi_task_t *task);
+
+static int mx25u_stall_task(mx25u_t *mx25u, pi_task_t *task, uint32_t id, uint32_t arg0, uint32_t arg1, uint32_t arg2, uint32_t arg3, uint32_t arg4);
+
+static void mx25u_handle_pending_task(void *arg);
+
+static void mx25u_erase_chip_async(struct pi_device *device, pi_task_t *task);
+
+static void mx25u_erase_sector_async(struct pi_device *device, uint32_t addr, pi_task_t *task);
+
+// Store value in udma_buffer[0]. The octospi write is commented out, so nothing is sent to the flash and addr is unused.
+static void mx25u_set_reg_exec(mx25u_t *mx25u, unsigned int addr, unsigned short value)
+{
+    mx25u->udma_buffer[0] = value;
+    //pi_octospi_write(&mx25u->octospi_device, addr, mx25u->udma_buffer, 2);
+}
+
+
+// Thin wrapper around pi_task_push(). TODO should be moved to pmsis api
+static void pi_task_enqueue(pi_task_t *task)
+{
+    pi_task_push(task);
+}
+
+
+
+// Return udma_buffer[0]. The octospi read is commented out, so this is whatever was last stored there; addr is unused.
+static unsigned short mx25u_get_reg_exec(mx25u_t *mx25u, unsigned int addr)
+{
+    //pi_octospi_read(&mx25u->octospi_device, addr, mx25u->udma_buffer, 4);
+    return mx25u->udma_buffer[0];
+}
+
+#ifdef CONFIG_XIP
+// XIP build: issue the 4-byte status read asynchronously and wait for it with pi_task_wait_on_xip()
+PI_LOCAL_CODE static uint32_t mx25u_get_status(mx25u_t *mx25u)
+{
+    pi_octospi_op_conf_t status_op = { .cmd=MX25U_READ_STATUS_CMD_OCTO, .latency=MX25U_READ_STATUS_LATENCY_OCTO, .flags=MX25U_READ_STATUS_FLAGS_OCTO };
+    uint32_t status;
+    struct pi_task done;
+    pi_octospi_read_async(&mx25u->octospi_device, 0, &status, 4, &status_op, pi_task_block(&done));
+    pi_task_wait_on_xip(&done);
+    return status;
+}
+
+#else
+// Non-XIP build: plain blocking 4-byte status read
+static uint32_t mx25u_get_status(mx25u_t *mx25u)
+{
+    pi_octospi_op_conf_t status_op = { .cmd=MX25U_READ_STATUS_CMD_OCTO, .latency=MX25U_READ_STATUS_LATENCY_OCTO, .flags=MX25U_READ_STATUS_FLAGS_OCTO };
+    uint32_t status;
+    pi_octospi_read(&mx25u->octospi_device, 0, &status, 4, &status_op);
+    return status;
+}
+
+#endif
+
+
+PI_LOCAL_CODE static int mx25u_is_busy(mx25u_t *mx25u)
+{
+    // Bit 0 of the status word is the flag reported as "busy"
+    return mx25u_get_status(mx25u) & 1;
+}
+
+
+// Send the octal-mode write-enable command; the transfer carries zero data bytes.
+static void mx25u_write_enable(mx25u_t *mx25u)
+{
+    int unused = 0;   // only provides a valid buffer pointer, the size passed below is 0
+    pi_octospi_op_conf_t we_op = { .cmd=MX25U_WRITE_ENABLE_CMD_OCTO, .latency=MX25U_WRITE_ENABLE_LATENCY_OCTO, .flags=MX25U_WRITE_ENABLE_FLAGS_OCTO };
+    pi_octospi_write(&mx25u->octospi_device, 0, &unused, 0, &we_op);
+}
+
+
+// Open the flash: allocate the state, open the octospi device, switch the flash mode. Returns 0, -1 (allocation failed) or -2 (board hook or octospi open failed).
+static int mx25u_open(struct pi_device *device)
+{
+    struct pi_mx25u51245g_conf *conf = (struct pi_mx25u51245g_conf *)device->config;
+
+    mx25u_t *mx25u = (mx25u_t *)pmsis_l2_malloc(sizeof(mx25u_t));
+    if (mx25u == NULL)
+    {
+        return -1;
+    }
+
+    device->data = (void *)mx25u;
+
+    if (bsp_mx25u51245g_open(conf))
+    {
+        goto error;
+    }
+
+    struct pi_octospi_conf octospi_conf;
+    pi_octospi_conf_init(&octospi_conf);
+    // Copy the flash configuration into the octospi driver configuration
+    octospi_conf.id = (unsigned char) conf->spi_itf;
+    octospi_conf.cs = conf->spi_cs;
+    octospi_conf.type = PI_OCTOSPI_TYPE_FLASH;
+    octospi_conf.xip_en = conf->xip_en;
+    octospi_conf.baudrate = conf->baudrate;
+
+    pi_open_from_conf(&mx25u->octospi_device, &octospi_conf);
+
+    int32_t error = pi_octospi_open(&mx25u->octospi_device);
+    if (error)
+    {
+        goto error;
+    }
+
+    // No operation in flight and nothing queued yet
+    mx25u->pending_task = NULL;
+    mx25u->waiting_first = NULL;
+    mx25u->erase_task = NULL;
+    mx25u->erase_waiting_first = NULL;
+
+    // Activate octospi DTR mode: write-enable, then write 1 << 1 to the configuration register, both using the single-line SPI settings
+    {
+        pi_octospi_op_conf_t op_we = { .cmd=MX25U_WRITE_ENABLE_CMD_SPI, .latency=MX25U_WRITE_ENABLE_LATENCY_SPI, .flags=MX25U_WRITE_ENABLE_FLAGS_SPI };
+        pi_octospi_write(&mx25u->octospi_device, 0, NULL, 0, &op_we);
+
+        pi_octospi_op_conf_t op_ws = { .cmd=MX25U_WRITE_CONFREG_CMD, .latency=MX25U_WRITE_CONFREG_LATENCY_SPI, .flags=MX25U_WRITE_CONFREG_FLAGS_SPI };
+        uint32_t data = 1 << 1;
+        pi_octospi_write(&mx25u->octospi_device, 0, &data, 1, &op_ws);
+
+        pi_time_wait_us(60); // presumably lets the mode change take effect - TODO confirm the required delay
+    }
+
+    return 0;
+
+error: // NOTE(review): device->data still points at the block freed below
+    pmsis_l2_malloc_free(mx25u, sizeof(mx25u_t));
+    return -2;
+}
+
+
+
+// Close the octospi device, then release the driver state held in device->data.
+static void mx25u_close(struct pi_device *device)
+{
+    mx25u_t *flash = (mx25u_t *)device->data;
+    pi_octospi_close(&flash->octospi_device);
+    pmsis_l2_malloc_free(flash, sizeof(mx25u_t));
+}
+
+
+static int32_t mx25u_ioctl(struct pi_device *device, uint32_t cmd, void *arg)
+{
+    // Only PI_FLASH_IOCTL_INFO is handled; every other command is ignored.
+    // The result is always 0.
+    if (cmd == PI_FLASH_IOCTL_INFO)
+    {
+        struct pi_flash_info *info = (struct pi_flash_info *)arg;
+
+        info->sector_size = 1<<18;
+        // TODO find a way to know what is on the flash, as there may be a boot binary
+        info->flash_start = info->sector_size;
+    }
+    return 0;
+}
+
+
+void pi_mx25u_deep_sleep_enter(pi_device_t *device)
+{
+    // Send command 0xB9 followed by a single zero byte
+    int payload = 0;
+    pi_octospi_op_conf_t sleep_op = {
+        .flags=PI_OCTOSPI_FLAG_CMD_SIZE_1 | PI_OCTOSPI_FLAG_LINE_OCTO | PI_OCTOSPI_FLAG_CMD_STR | PI_OCTOSPI_FLAG_DATA_DTR,
+        .latency=0, .cmd=0xB9
+    };
+    mx25u_t *flash = (mx25u_t *)device->data;
+    pi_octospi_write(&flash->octospi_device, 0, &payload, 1, &sleep_op);
+}
+
+
+void pi_mx25u_deep_sleep_exit(pi_device_t *device)
+{
+    // Send command 0xAB followed by a single zero byte
+    int payload = 0;
+    pi_octospi_op_conf_t wake_op = {
+        .flags=PI_OCTOSPI_FLAG_CMD_SIZE_1 | PI_OCTOSPI_FLAG_LINE_OCTO | PI_OCTOSPI_FLAG_CMD_STR | PI_OCTOSPI_FLAG_DATA_DTR,
+        .latency=0, .cmd=0xAB
+    };
+    mx25u_t *flash = (mx25u_t *)device->data;
+    pi_octospi_write(&flash->octospi_device, 0, &payload, 1, &wake_op);
+}
+
+
+static void mx25u_reg_set_async(struct pi_device *device, uint32_t addr, uint8_t *value, pi_task_t *task)
+{
+    mx25u_t *flash = (mx25u_t *)device->data;
+    // A non-zero result from mx25u_stall_task() means the request is not executed now
+    int stalled = mx25u_stall_task(flash, task, STALL_TASK_REG_SET, addr, (uint32_t)value, 0, 0, 0);
+    if (!stalled)
+    {
+        mx25u_set_reg_exec(flash, addr, *(uint16_t *)value);
+        mx25u_handle_pending_task(device);
+    }
+}
+
+/* Asynchronous register read into *value; queued via mx25u_stall_task when an
+ * operation is pending. */
+static void mx25u_reg_get_async(struct pi_device *device, uint32_t addr, uint8_t *value, pi_task_t *task)
+{
+    mx25u_t *flash = (mx25u_t *)device->data;
+
+    if (!mx25u_stall_task(flash, task, STALL_TASK_REG_GET, addr, (uint32_t)value, 0, 0, 0))
+    {
+        // The result is stored as 16 bits even though value is passed as uint8_t *
+        *(uint16_t *)value = mx25u_get_reg_exec(flash, addr);
+        mx25u_handle_pending_task(device);
+    }
+}
+
+/* Starts an asynchronous read, or queues it when an operation is pending.
+ * mx25u_handle_pending_task is the completion callback of the transfer. */
+static void mx25u_read_async(struct pi_device *device, uint32_t addr, void *data, uint32_t size, pi_task_t *task)
+{
+    mx25u_t *flash = (mx25u_t *)device->data;
+    if (mx25u_stall_task(flash, task, STALL_TASK_READ, addr, (uint32_t)data, size, 0, 0))
+        return;
+
+    pi_task_t *done = pi_task_callback(&flash->task, mx25u_handle_pending_task, device);
+    pi_octospi_read_async(&flash->octospi_device, addr, data, size, &mx25u_read_op, done);
+}
+
+/* Starts an asynchronous 2D read (size, stride, length are forwarded as-is to
+ * the octospi driver), or queues it when an operation is pending. */
+static void mx25u_read_2d_async(struct pi_device *device, uint32_t addr, void *data, uint32_t size, uint32_t stride, uint32_t length, pi_task_t *task)
+{
+    mx25u_t *flash = (mx25u_t *)device->data;
+
+    if (mx25u_stall_task(flash, task, STALL_TASK_READ_2D, addr, (uint32_t)data, size, stride, length))
+        return;
+
+    // mx25u_handle_pending_task is the completion callback of the transfer
+    pi_task_t *done = pi_task_callback(&flash->task, mx25u_handle_pending_task, device);
+    pi_octospi_read_2d_async(&flash->octospi_device, addr, data, size, stride, length, &mx25u_read_op, done);
+}
+
+/* Completion handler of the current operation: notifies its task, then pops
+ * the next queued request, if any, and replays it through its *_async entry. */
+static void mx25u_handle_pending_task(void *arg)
+{
+    struct pi_device *device = (struct pi_device *)arg;
+    mx25u_t *flash = (mx25u_t *)device->data;
+
+    uint32_t irq = disable_irq();
+
+    pi_task_enqueue(flash->pending_task);
+    flash->pending_task = NULL;
+
+    pi_task_t *next = flash->waiting_first;
+    if (next)
+    {
+        flash->waiting_first = next->next;
+    }
+
+    restore_irq(irq);
+
+    if (!next)
+    {
+        return;
+    }
+
+    // data[0] is the request kind and data[1..5] its arguments, as saved by mx25u_stall_task
+    switch (next->data[0])
+    {
+        case STALL_TASK_PROGRAM:
+            mx25u_program_async(device, next->data[1], (void *)next->data[2], next->data[3], next);
+            break;
+        case STALL_TASK_ERASE_CHIP:
+            mx25u_erase_chip_async(device, next);
+            break;
+        case STALL_TASK_ERASE_SECTOR:
+            mx25u_erase_sector_async(device, next->data[1], next);
+            break;
+        case STALL_TASK_REG_SET:
+            mx25u_reg_set_async(device, next->data[1], (uint8_t *)next->data[2], next);
+            break;
+        case STALL_TASK_REG_GET:
+            mx25u_reg_get_async(device, next->data[1], (uint8_t *)next->data[2], next);
+            break;
+        case STALL_TASK_READ:
+            mx25u_read_async(device, next->data[1], (void *)next->data[2], next->data[3], next);
+            break;
+        case STALL_TASK_READ_2D:
+            mx25u_read_2d_async(device, next->data[1], (void *)next->data[2], next->data[3], next->data[4], next->data[5], next);
+            break;
+        default: // any other kind is ignored
+            break;
+    }
+}
+
+/* Completion handler of a ranged erase: notifies the current erase task, then
+ * starts the next queued erase request, if any. */
+static void mx25u_handle_pending_erase_task(void *arg)
+{
+    struct pi_device *device = (struct pi_device *)arg;
+    mx25u_t *flash = (mx25u_t *)device->data;
+
+    // The erase slot and its waiting list change between disable_irq() and restore_irq()
+    uint32_t irq = disable_irq();
+
+    pi_task_enqueue(flash->erase_task);
+    flash->erase_task = NULL;
+
+    pi_task_t *next = flash->erase_waiting_first;
+    if (next)
+    {
+        flash->erase_waiting_first = next->next;
+    }
+
+    restore_irq(irq);
+
+    // data[1] and data[2] carry the address and size saved by mx25u_stall_erase_task
+    if (next)
+        mx25u_erase_async(device, next->data[1], next->data[2], next);
+}
+
+/* Claims the driver for `task`, or appends it to the waiting list when an
+ * operation is already pending. Returns 0 if claimed, 1 if queued. */
+static int mx25u_stall_task(mx25u_t *mx25u, pi_task_t *task, uint32_t id, uint32_t arg0, uint32_t arg1, uint32_t arg2, uint32_t arg3, uint32_t arg4)
+{
+    uint32_t irq = disable_irq();
+
+    if (mx25u->pending_task == NULL)
+    {
+        // Idle: the caller may start its operation right away
+        mx25u->pending_task = task;
+        restore_irq(irq);
+        return 0;
+    }
+
+    // Busy: save the request in the task itself so that it can be replayed later
+    task->data[0] = id;
+    task->data[1] = arg0;
+    task->data[2] = arg1;
+    task->data[3] = arg2;
+    task->data[4] = arg3;
+    task->data[5] = arg4;
+    task->next = NULL;
+    // Append at the tail of the waiting list
+    if (mx25u->waiting_first)
+    {
+        mx25u->waiting_last->next = task;
+    }
+    else
+    {
+        mx25u->waiting_first = task;
+    }
+    mx25u->waiting_last = task;
+
+    restore_irq(irq);
+    return 1;
+}
+
+/* Claims the erase slot for `task`, or appends it to the erase waiting list
+ * when a ranged erase is already running. Returns 0 if claimed, 1 if queued. */
+static int mx25u_stall_erase_task(mx25u_t *mx25u, pi_task_t *task, uint32_t id, uint32_t arg0, uint32_t arg1, uint32_t arg2)
+{
+    uint32_t irq = disable_irq();
+
+    if (mx25u->erase_task == NULL)
+    {
+        // No ranged erase in flight: the caller may start right away
+        mx25u->erase_task = task;
+        restore_irq(irq);
+        return 0;
+    }
+
+    // Busy: save the request in the task itself so that it can be replayed later
+    task->data[0] = id;
+    task->data[1] = arg0;
+    task->data[2] = arg1;
+    task->data[3] = arg2;
+    task->next = NULL;
+    // Append at the tail of the erase waiting list
+    if (mx25u->erase_waiting_first)
+    {
+        mx25u->erase_waiting_last->next = task;
+    }
+    else
+    {
+        mx25u->erase_waiting_first = task;
+    }
+    mx25u->erase_waiting_last = task;
+
+    restore_irq(irq);
+    return 1;
+}
+
+/* Programs the pending range in chunks that stop at 256-byte boundaries; completes the operation once nothing is left. */
+PI_LOCAL_CODE static void mx25u_program_resume(void *arg)
+{
+    struct pi_device *device = (struct pi_device *)arg;
+    mx25u_t *mx25u = (mx25u_t *)device->data;
+
+    if (mx25u->pending_size == 0)
+    {
+        mx25u_handle_pending_task(device);
+    }
+    else
+    {
+    #ifdef MX25U_LOCK_XIP
+        // When XIP is active and the flash does not support concurrent read and write, loop on the program operation
+        // until it is done, since XIP can not work at the same time.
+        // On multi-threaded systems, we should also put on hold any request to this driver and resume them after the program operation is done,
+        // since the octospi driver will let other requests execute between 2 operations, to let other devices be used.
+        while (mx25u->pending_size > 0)
+        {
+        mx25u_write_enable(mx25u);
+
+        unsigned int iter_size = 256 - (mx25u->pending_octospi_addr & 0xff);
+        if (iter_size > mx25u->pending_size)
+            iter_size = mx25u->pending_size;
+
+        uint32_t octospi_addr = mx25u->pending_octospi_addr;
+        uint32_t data = mx25u->pending_data;
+
+        mx25u->pending_octospi_addr += iter_size;
+        mx25u->pending_data += iter_size;
+        mx25u->pending_size -= iter_size;
+
+        // In XIP mode, we need to lock XIP refills to avoid having a read while the flash is doing the program operation.
+        pi_octospi_xip_lock(&mx25u->octospi_device);
+
+        // Even though the operation should be asynchronous, do everything synchronously to avoid XIP refills until the operation is done
+        struct pi_task task;
+        pi_octospi_write_async(&mx25u->octospi_device, octospi_addr, (void *)data, iter_size, &mx25u_program_op, pi_task_block(&task));
+        pi_task_wait_on_xip(&task);
+        while (mx25u_is_busy(mx25u))
+        {
+            for (int i=0; i<32768/1000; i++)
+            {
+                pos_wait_for_event(1<<ARCHI_FC_EVT_CLK_REF_RISE);
+            }
+        }
+        pi_octospi_xip_unlock(&mx25u->octospi_device);
+        }
+
+        mx25u_handle_pending_task(device);
+    #else
+        unsigned int iter_size = 256 - (mx25u->pending_octospi_addr & 0xff);
+        if (iter_size > mx25u->pending_size)
+            iter_size = mx25u->pending_size;
+
+        uint32_t octospi_addr = mx25u->pending_octospi_addr;
+        uint32_t data = mx25u->pending_data;
+
+        mx25u->pending_octospi_addr += iter_size;
+        mx25u->pending_data += iter_size;
+        mx25u->pending_size -= iter_size;
+
+        mx25u_write_enable(mx25u);
+        pi_octospi_write_async(&mx25u->octospi_device, octospi_addr, (void *)data, iter_size, &mx25u_program_op, pi_task_callback(&mx25u->task, mx25u_check_program, device));
+    #endif
+    }
+}
+
+/* Polls the flash after a program chunk: re-arms itself every 1000 us while the
+ * flash is busy, then continues with mx25u_program_resume. */
+static void mx25u_check_program(void *arg)
+{
+    struct pi_device *device = (struct pi_device *)arg;
+    mx25u_t *flash = (mx25u_t *)device->data;
+
+    if (!mx25u_is_busy(flash))
+    {
+        mx25u_program_resume(device);
+        return;
+    }
+
+    // Still busy: check again later through the same callback.
+    // Typical buffer programming time is 4ms. Note that this could be optimized by taking into account buffer size
+    pi_task_push_delayed_us(pi_task_callback(&flash->task, mx25u_check_program, device), 1000);
+}
+
+/* Starts programming `size` bytes from `data` at flash address octospi_addr,
+ * or queues the request when an operation is already pending. */
+PI_LOCAL_CODE static void mx25u_program_async(struct pi_device *device, uint32_t octospi_addr, const void *data, uint32_t size, pi_task_t *task)
+{
+    mx25u_t *flash = (mx25u_t *)device->data;
+    if (mx25u_stall_task(flash, task, STALL_TASK_PROGRAM, octospi_addr, (uint32_t)data, size, 0, 0))
+        return;
+
+    // Record the whole transfer; mx25u_program_resume consumes it chunk by chunk
+    flash->pending_octospi_addr = octospi_addr;
+    flash->pending_data = (uint32_t)data;
+    flash->pending_size = size;
+
+    mx25u_program_resume(device);
+}
+
+
+
+/* Polls the flash after an erase command: re-arms itself every 10000 us while the
+ * flash is busy, then completes the operation via mx25u_handle_pending_task. */
+static void mx25u_check_erase(void *arg)
+{
+    struct pi_device *device = (struct pi_device *)arg;
+    mx25u_t *mx25u = (mx25u_t *)device->data;
+
+    // Only the busy flag is needed to choose between polling again and completing
+    if (mx25u_is_busy(mx25u))
+    {
+        // Typical sector erase time is 25ms but keep the polling period short as the erase is shorter on some platforms
+        pi_task_push_delayed_us(pi_task_callback(&mx25u->task, mx25u_check_erase, device), 10000);
+    }
+    else
+    {
+        mx25u_handle_pending_task(device);
+    }
+}
+
+/* Flash API chip-erase hook: queued if an operation is pending, else polls via mx25u_check_erase after 100000 us. */
+static void mx25u_erase_chip_async(struct pi_device *device, pi_task_t *task)
+{
+    mx25u_t *mx25u = (mx25u_t *)device->data;
+
+    if (mx25u_stall_task(mx25u, task, STALL_TASK_ERASE_CHIP, 0, 0, 0, 0, 0))
+        return;
+
+    // NOTE(review): no erase command is sent to the flash in this function; it only
+    // waits and then polls the busy flag. Confirm whether the command is missing.
+    pi_task_push_delayed_us(pi_task_callback(&mx25u->task, mx25u_check_erase, device), 100000);
+}
+
+/* Sends mx25u_erase_op for addr (queued if an operation is pending), then polls completion with mx25u_check_erase. */
+static void mx25u_erase_sector_async(struct pi_device *device, uint32_t addr, pi_task_t *task)
+{
+    mx25u_t *mx25u = (mx25u_t *)device->data;
+
+    if (mx25u_stall_task(mx25u, task, STALL_TASK_ERASE_SECTOR, addr, 0, 0, 0, 0))
+        return;
+
+    mx25u_write_enable(mx25u);
+    // NOTE(review): the size passed below is 0 although the comment asks for 1 byte, and mx25u_erase_resume passes 1 — confirm.
+    // We don't need to send data but UDMA needs at least 1 byte, this will be ignored by the flash
+    pi_octospi_write_async(&mx25u->octospi_device, addr, mx25u->udma_buffer, 0, &mx25u_erase_op, pi_task_callback(&mx25u->task, mx25u_check_erase, device));
+}
+
+/* Erases the pending range one SECTOR_SIZE-bounded piece at a time; completes
+ * the ranged erase via mx25u_handle_pending_erase_task once nothing is left. */
+PI_LOCAL_CODE static void mx25u_erase_resume(void *arg)
+{
+    struct pi_device *device = (struct pi_device *)arg;
+    mx25u_t *mx25u = (mx25u_t *)device->data;
+
+    if (mx25u->pending_erase_size == 0)
+    {
+        mx25u_handle_pending_erase_task(device);
+    }
+    else
+    {
+    #ifdef MX25U_LOCK_XIP
+        // When XIP is active and the flash does not support concurrent read and write, loop on the erase operation
+        // until it is done, since XIP can not work at the same time.
+        // On multi-threaded systems, we should also put on hold any request to this driver and resume them after the erase operation is done,
+        // since the octospi driver will let other requests execute between 2 operations, to let other devices be used.
+        while (mx25u->pending_erase_size > 0)
+        {
+        mx25u_write_enable(mx25u);
+
+        // In XIP mode, we need to lock XIP refills to avoid having a read while the flash is doing the erase operation.
+        pi_octospi_xip_lock(&mx25u->octospi_device);
+
+        unsigned int iter_size = SECTOR_SIZE - (mx25u->pending_erase_octospi_addr & (SECTOR_SIZE - 1));
+        if (iter_size > mx25u->pending_erase_size)
+            iter_size = mx25u->pending_erase_size;
+
+        uint32_t octospi_addr = mx25u->pending_erase_octospi_addr;
+
+        mx25u->pending_erase_octospi_addr += iter_size;
+        mx25u->pending_erase_size -= iter_size;
+
+        struct pi_task task;
+
+        // We don't need to send data but UDMA needs at least 1 byte, this will be ignored by the flash
+        pi_octospi_write_async(&mx25u->octospi_device, octospi_addr, mx25u->udma_buffer, 1, &mx25u_erase_op, pi_task_block(&task));
+        pi_task_wait_on_xip(&task);
+
+        while (mx25u_is_busy(mx25u))
+        {
+            for (int i=0; i<32768/100; i++)
+            {
+                pos_wait_for_event(1<<ARCHI_FC_EVT_CLK_REF_RISE);
+            }
+        }
+        pi_octospi_xip_unlock(&mx25u->octospi_device);
+        }
+
+        mx25u_handle_pending_erase_task(device);
+    #else
+        unsigned int iter_size = SECTOR_SIZE - (mx25u->pending_erase_octospi_addr & (SECTOR_SIZE - 1));
+        if (iter_size > mx25u->pending_erase_size)
+            iter_size = mx25u->pending_erase_size;
+
+        uint32_t octospi_addr = mx25u->pending_erase_octospi_addr;
+        mx25u_erase_sector_async(device, octospi_addr, pi_task_callback(&mx25u->task2, mx25u_erase_resume, device));
+
+        mx25u->pending_erase_octospi_addr += iter_size;
+        mx25u->pending_erase_size -= iter_size;
+    #endif
+    }
+}
+
+/* Starts erasing `size` bytes from `addr`, or queues the request when another
+ * ranged erase is already running. */
+static void mx25u_erase_async(struct pi_device *device, uint32_t addr, int size, pi_task_t *task)
+{
+    mx25u_t *flash = (mx25u_t *)device->data;
+
+    // Ranged erases use their own slot and waiting list (mx25u_stall_erase_task)
+    if (mx25u_stall_erase_task(flash, task, 3, addr, size, 0))
+        return;
+
+    // Record the whole range; mx25u_erase_resume walks through it piece by piece
+    flash->pending_erase_octospi_addr = addr;
+    flash->pending_erase_size = size;
+
+    mx25u_erase_resume(device);
+}
+
+/* Copies between flash and memory: a non-zero ext2loc reads from flash into
+ * buffer, zero programs buffer into flash. Always returns 0. */
+static int mx25u_copy_async(struct pi_device *device, uint32_t flash_addr, void *buffer, uint32_t size, int ext2loc, pi_task_t *task)
+{
+    if (ext2loc)
+        mx25u_read_async(device, flash_addr, buffer, size, task);
+    else
+        mx25u_program_async(device, flash_addr, buffer, size, task);
+
+    return 0;
+}
+
+/* 2D copy from flash into buffer. Only the flash-to-memory direction is
+ * handled: returns -1 without doing anything when ext2loc is zero, else 0. */
+static int mx25u_copy_2d_async(struct pi_device *device, uint32_t flash_addr, void *buffer, uint32_t size, uint32_t stride, uint32_t length, int ext2loc, pi_task_t *task)
+{
+    if (ext2loc)
+    {
+        mx25u_read_2d_async(device, flash_addr, buffer, size, stride, length, task);
+        return 0;
+    }
+    return -1;
+}
+
+static int mx25u_read(struct pi_device *device, uint32_t pi_flash_addr, void *data, uint32_t size)
+{
+    pi_task_t done; // blocking task: mx25u_read_async is started, then waited for
+    mx25u_read_async(device, pi_flash_addr, data, size, pi_task_block(&done));
+    pi_task_wait_on(&done);
+    return 0;
+}
+
+/* Blocking variant of mx25u_program_async: waits for the task, then returns 0. */
+static int mx25u_program(struct pi_device *device, uint32_t pi_flash_addr, const void *data, uint32_t size)
+{
+    pi_task_t done;
+    mx25u_program_async(device, pi_flash_addr, data, size, pi_task_block(&done));
+    pi_task_wait_on(&done);
+    return 0;
+}
+
+static inline int mx25u_erase_chip(struct pi_device *device)
+{
+    pi_task_t done; // blocking task: mx25u_erase_chip_async is started, then waited for
+    mx25u_erase_chip_async(device, pi_task_block(&done));
+    pi_task_wait_on(&done);
+    return 0;
+}
+
+static inline int mx25u_erase_sector(struct pi_device *device, uint32_t pi_flash_addr)
+{
+    pi_task_t done; // blocking task: mx25u_erase_sector_async is started, then waited for
+    mx25u_erase_sector_async(device, pi_flash_addr, pi_task_block(&done));
+    pi_task_wait_on(&done);
+    return 0;
+}
+
+static inline int mx25u_erase(struct pi_device *device, uint32_t pi_flash_addr, int size)
+{
+    pi_task_t done;
+    // Blocking variant of mx25u_erase_async: the task is armed, handed over, then waited for
+    mx25u_erase_async(device, pi_flash_addr, size, pi_task_block(&done));
+    pi_task_wait_on(&done);
+    return 0;
+}
+
+static inline int mx25u_reg_set(struct pi_device *device, uint32_t pi_flash_addr, uint8_t *value)
+{
+    pi_task_t done; // blocking task: mx25u_reg_set_async is started, then waited for
+    mx25u_reg_set_async(device, pi_flash_addr, value, pi_task_block(&done));
+    pi_task_wait_on(&done);
+    return 0;
+}
+
+static inline int mx25u_reg_get(struct pi_device *device, uint32_t pi_flash_addr, uint8_t *value)
+{
+    pi_task_t done; // blocking task: mx25u_reg_get_async is started, then waited for
+    mx25u_reg_get_async(device, pi_flash_addr, value, pi_task_block(&done));
+    pi_task_wait_on(&done);
+    return 0;
+}
+
+static inline int mx25u_copy(struct pi_device *device, uint32_t pi_flash_addr, void *buffer, uint32_t size, int ext2loc)
+{
+    pi_task_t done;
+    int status = mx25u_copy_async(device, pi_flash_addr, buffer, size, ext2loc, pi_task_block(&done));
+    if (status != 0)
+        return -1; // nothing was started, so there is nothing to wait for
+    pi_task_wait_on(&done);
+    return 0;
+}
+
+static inline int mx25u_copy_2d(struct pi_device *device, uint32_t pi_flash_addr, void *buffer, uint32_t size, uint32_t stride, uint32_t length, int ext2loc)
+{
+    pi_task_t done;
+    int status = mx25u_copy_2d_async(device, pi_flash_addr, buffer, size, stride, length, ext2loc, pi_task_block(&done));
+    if (status != 0)
+        return -1; // nothing was started, so there is nothing to wait for
+    pi_task_wait_on(&done);
+    return 0;
+}
+
+static pi_flash_api_t mx25u_api = { /* flash API entry points implemented by this driver */
+    .open                 = &mx25u_open,
+    .close                = &mx25u_close,
+    .ioctl                = &mx25u_ioctl,
+    .read_async           = &mx25u_read_async,          /* asynchronous entry points */
+    .program_async        = &mx25u_program_async,
+    .erase_chip_async     = &mx25u_erase_chip_async,
+    .erase_sector_async   = &mx25u_erase_sector_async,
+    .erase_async          = &mx25u_erase_async,
+    .reg_set_async        = &mx25u_reg_set_async,
+    .reg_get_async        = &mx25u_reg_get_async,
+    .copy_async           = &mx25u_copy_async,
+    .copy_2d_async        = &mx25u_copy_2d_async,
+    .read                 = &mx25u_read,                /* blocking wrappers around the above */
+    .program              = &mx25u_program,
+    .erase_chip           = &mx25u_erase_chip,
+    .erase_sector         = &mx25u_erase_sector,
+    .erase                = &mx25u_erase,
+    .reg_set              = &mx25u_reg_set,
+    .reg_get              = &mx25u_reg_get,
+    .copy                 = &mx25u_copy,
+    .copy_2d              = &mx25u_copy_2d,
+};
+
+/* Fills conf with defaults: installs mx25u_api, calls the BSP and generic flash
+ * initialisers in that order, then sets xip_en to 0. */
+void pi_mx25u51245g_conf_init(struct pi_mx25u51245g_conf *conf)
+{
+    conf->flash.api = &mx25u_api;
+    bsp_mx25u51245g_conf_init(conf);
+    __flash_conf_init(&conf->flash);
+    conf->xip_en = 0;
+}
diff --git a/rtos/pmsis/pmsis_bsp/fs/fs.c b/rtos/pmsis/pmsis_bsp/fs/fs.c
index 1b79329df..f47fcb449 100644
--- a/rtos/pmsis/pmsis_bsp/fs/fs.c
+++ b/rtos/pmsis/pmsis_bsp/fs/fs.c
@@ -340,19 +340,21 @@ void __pi_cl_fs_copy_req_exec(void *_req);
 void __pi_cl_fs_copy_req_done(void *_req)
 {
     pi_cl_fs_req_t *req = (pi_cl_fs_req_t *)_req;
+    pi_cl_fs_req_t *next_req;
     pi_fs_file_t *file = req->file;
     pi_fs_data_t *fs = req->file->fs_data;
     pi_task_t *task = &fs->cl_req_task;
 
-    cl_notify_task_done(&(req->copy.done), req->copy.cid);
-
     uint32_t irq = disable_irq();
     fs->cluster_reqs_first = (void *)req->callback.next;
-    req = fs->cluster_reqs_first;
+    next_req = fs->cluster_reqs_first;
     restore_irq(irq);
-    if (req)
+
+    cl_notify_task_done(&(req->copy.done), req->copy.cid);
+
+    if (next_req)
     {
-        __pi_cl_fs_copy_req_exec(req);
+        __pi_cl_fs_copy_req_exec(next_req);
     }
 }
 
diff --git a/rtos/pmsis/pmsis_bsp/gpio/fxl6408.c b/rtos/pmsis/pmsis_bsp/gpio/fxl6408.c
new file mode 100644
index 000000000..bf26b3544
--- /dev/null
+++ b/rtos/pmsis/pmsis_bsp/gpio/fxl6408.c
@@ -0,0 +1,466 @@
+/*
+ * Copyright (C) 2022 GreenWaves Technologies
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "pmsis.h"
+#include "bsp/gpio/fxl6408.h"
+
+/**************/
+/* Structures */
+/**************/
+
+typedef enum { /* register indices, sent as the first byte of each I2C access */
+    FXL6408_REG_DEVICE_ID_CTRL      = 0x1,
+    FXL6408_REG_IO_DIRECTION        = 0x3,
+    FXL6408_REG_OUTPUT_STATE        = 0x5,
+    FXL6408_REG_OUTPUT_HIGHZ        = 0x7,
+    FXL6408_REG_INPUT_DEFAULT_STATE = 0x9,  /* written from the gpio_trigger cache */
+    FXL6408_REG_PULL_ENABLE         = 0xB,
+    FXL6408_REG_PULL_UP_DOWN        = 0xD,
+    FXL6408_REG_INPUT_STATUS        = 0xF,
+    FXL6408_REG_INTERRUPT_MASK      = 0x11,
+    FXL6408_REG_INTERRUPT_STATUS    = 0x13, /* read from the GPIO interrupt callback */
+} fxl6408_register_e;
+
+/* Per-pin bit values for the IO direction register */
+enum {
+    __GPIO_MODE_INPUT = 0x0,
+    __GPIO_MODE_OUTPUT = 0x1,
+};
+
+/* Per-pin bit values for the output high-Z register */
+enum {
+    __GPIO_HIGHZ_DISABLED = 0x0,
+    __GPIO_HIGHZ_ENABLED = 0x1,
+};
+
+/* Per-pin bit values for the output state register */
+enum {
+    __GPIO_OUTPUT_LOW = 0x0,
+    __GPIO_OUTPUT_HIGH = 0x1,
+};
+
+/* Per-pin bit values for the interrupt mask register (a set bit disables the pin's interrupt) */
+enum {
+    __GPIO_INTERRUPT_ENABLED = 0x0,
+    __GPIO_INTERRUPT_DISABLED = 0x1,
+};
+
+/* Per-pin bit values for the input default state register, used here to select the trigger edge */
+enum {
+    __GPIO_TRIGGER_RISING = 0x0,
+    __GPIO_TRIGGER_FALLING = 0x1,
+};
+
+/* Per-pin bit values for the pull enable register */
+enum {
+    __GPIO_PULL_DISABLED = 0x0,
+    __GPIO_PULL_ENABLED = 0x1,
+};
+
+/* Per-pin bit values for the pull up/down register */
+enum {
+    __GPIO_PULL_DOWN = 0x0,
+    __GPIO_PULL_UP = 0x1,
+};
+
+typedef struct { /* driver state, allocated in pi_fxl6408_open and stored in device->data */
+    pi_device_t i2c;       /* I2C link to the expander */
+    pi_device_t gpio_irq;  /* GPIO device carrying the expander interrupt line */
+
+    /* store device registers value locally to avoid reading and writing back */
+    uint8_t gpio_dir;            /* FXL6408_REG_IO_DIRECTION */
+    uint8_t gpio_value;          /* FXL6408_REG_OUTPUT_STATE */
+    uint8_t gpio_high_z;         /* FXL6408_REG_OUTPUT_HIGHZ */
+    uint8_t gpio_trigger;        /* FXL6408_REG_INPUT_DEFAULT_STATE */
+    uint8_t gpio_pull_enable;    /* FXL6408_REG_PULL_ENABLE */
+    uint8_t gpio_pull_updown;    /* FXL6408_REG_PULL_UP_DOWN */
+    uint8_t gpio_interrupt_mask; /* FXL6408_REG_INTERRUPT_MASK */
+
+    /* gpio irq status related */
+    pi_task_t gpio_irq_cb;              /* runs __gpio_irq_cb on the interrupt pin */
+    pi_task_t i2c_interrupt_status_cb;  /* runs __i2c_interrupt_status_cb after the status read */
+    /* register index sent by the asynchronous I2C write/read */
+    uint8_t i2c_interrupt_status_payload; //1-byte payload
+    uint8_t i2c_interrupt_status;       /* status byte received by that transfer */
+    pi_task_t *irq_tasks[8];            /* per-pin user task, NULL when none is registered */
+} fxl6408_t;
+
+/********************/
+/* Static functions */
+/********************/
+
+static int __pi_fxl6408_reg_write(pi_device_t *dev, fxl6408_register_e addr,
+        uint8_t value)
+{
+    /* One I2C transaction: register index followed by the new value */
+    uint8_t frame[2] = { addr, value };
+    int status = pi_i2c_write(dev, frame, 2, PI_I2C_XFER_START | PI_I2C_XFER_STOP);
+
+    /* Any non-zero status from the I2C driver is reported as -1 */
+    return status ? -1 : 0;
+}
+
+/* Reads one register over I2C; yields 0 when the transfer leaves the buffer untouched. */
+static uint8_t __pi_fxl6408_reg_read(pi_device_t *dev, fxl6408_register_e addr)
+{
+    uint8_t reg = (uint8_t) addr; /* send a real byte, not the first byte of the enum's storage */
+    uint8_t result = 0;           /* defined value even if the read fails */
+    pi_i2c_write_read(dev, &reg, &result, 1, 1);
+    return result;
+}
+
+/* Stops the expander from driving its pins. Returns -1 at the first failed
+ * register write (later registers are then left untouched), else 0. */
+static int __pi_fxl6408_reset(fxl6408_t *fxl6408)
+{
+    pi_device_t *i2c = &fxl6408->i2c;
+
+    // To reset the IO expander, just make sure it does not drive gpio outputs.
+    // Each cached register copy is updated first, then written to the device.
+
+    // Set all GPIO to input
+    fxl6408->gpio_dir = 0x00;
+    if (__pi_fxl6408_reg_write(i2c, FXL6408_REG_IO_DIRECTION, fxl6408->gpio_dir))
+        return -1;
+
+    // Set all GPIO output value to 0
+    fxl6408->gpio_value = 0x00;
+    if (__pi_fxl6408_reg_write(i2c, FXL6408_REG_OUTPUT_STATE, fxl6408->gpio_value))
+        return -1;
+
+    // Set all GPIO to high-Z
+    fxl6408->gpio_high_z = 0xFF;
+    if (__pi_fxl6408_reg_write(i2c, FXL6408_REG_OUTPUT_HIGHZ, fxl6408->gpio_high_z))
+        return -1;
+
+    return 0;
+}
+
+static void __attribute__((noinline))
+__i2c_interrupt_status_cb(void *arg)
+{
+    fxl6408_t *expander = (fxl6408_t *) arg;
+    // Status byte filled in by the I2C transfer started in __gpio_irq_cb
+    uint8_t pending = expander->i2c_interrupt_status;
+
+    // Schedule the registered task of every pin whose status bit is set;
+    // pins without a task are ignored.
+    for (int pin = 0; pin < 8; pin++) {
+        pi_task_t *handler = expander->irq_tasks[pin];
+        int triggered = (pending & (1 << pin)) != 0;
+
+        if (triggered && handler != NULL) {
+            pi_task_push(handler);
+        }
+    }
+}
+
+static void __attribute__((noinline))
+__gpio_irq_cb(void* arg)
+{
+    fxl6408_t *expander = (fxl6408_t *) arg;
+    pi_task_t *on_status = &expander->i2c_interrupt_status_cb;
+    uint8_t *tx = &expander->i2c_interrupt_status_payload;
+    uint8_t *rx = &expander->i2c_interrupt_status;
+
+    // Ask the expander for its interrupt status register; the answer is
+    // handled asynchronously by __i2c_interrupt_status_cb.
+    *tx = FXL6408_REG_INTERRUPT_STATUS;
+    pi_task_callback(on_status, __i2c_interrupt_status_cb, (void*) expander);
+    pi_i2c_write_read_async(&expander->i2c, tx, rx, 1, 1, on_status);
+}
+
+/*****************/
+/* API Functions */
+/*****************/
+
+/* Allocates the driver state and opens the I2C link and interrupt GPIO. Returns 0, -1 (no memory) or -2. */
+int pi_fxl6408_open(pi_device_t *device)
+{
+    struct pi_fxl6408_conf *conf = (struct pi_fxl6408_conf *) device->config;
+
+    fxl6408_t *fxl6408 = (fxl6408_t *) pmsis_l2_malloc(sizeof(fxl6408_t));
+    if (fxl6408 == NULL) return -1;
+
+    device->data = (void *) fxl6408;
+
+    struct pi_i2c_conf i2c_conf;
+    pi_i2c_conf_init(&i2c_conf);
+    i2c_conf.itf = conf->i2c_itf;
+    i2c_conf.max_baudrate = 100000;
+    pi_i2c_conf_set_slave_addr(&i2c_conf, 0x86, 0);
+
+    pi_open_from_conf(&fxl6408->i2c, &i2c_conf);
+    if (pi_i2c_open(&fxl6408->i2c)) goto error;
+
+    /* Reset the IO expander to a known state, in case it was kept on
+     * after chip reset */
+    if (__pi_fxl6408_reset(fxl6408)) goto error2;
+
+    /* The reset only refreshes three cached registers. Seed the others from the
+     * device: the state comes from malloc and gpio_set rewrites whole registers. */
+    fxl6408->gpio_trigger = __pi_fxl6408_reg_read(&fxl6408->i2c, FXL6408_REG_INPUT_DEFAULT_STATE);
+    fxl6408->gpio_pull_enable = __pi_fxl6408_reg_read(&fxl6408->i2c, FXL6408_REG_PULL_ENABLE);
+    fxl6408->gpio_pull_updown = __pi_fxl6408_reg_read(&fxl6408->i2c, FXL6408_REG_PULL_UP_DOWN);
+    fxl6408->gpio_interrupt_mask = __pi_fxl6408_reg_read(&fxl6408->i2c, FXL6408_REG_INTERRUPT_MASK);
+
+    /* initialize gpio input irq tasks */
+    for (int i = 0; i < 8; i++)
+        fxl6408->irq_tasks[i] = NULL;
+
+    /* initialize gpio irq callback */
+    {
+        pi_gpio_e gpio_pin = conf->interrupt_pin;
+        struct pi_gpio_conf gpio_conf;
+
+        pi_gpio_conf_init(&gpio_conf);
+        /* NOTE(review): the port is derived from PI_PAD_089, not from interrupt_pin — confirm */
+        gpio_conf.port = PI_PAD_089 / 32;
+
+        pi_open_from_conf(&fxl6408->gpio_irq, &gpio_conf);
+        if (PI_OK != pi_gpio_open(&fxl6408->gpio_irq)) goto error2;
+
+        pi_gpio_notif_e irq_type = PI_GPIO_NOTIF_FALL;
+        pi_gpio_flags_e cfg_flags = PI_GPIO_INPUT
+            | PI_GPIO_PULL_DISABLE
+            | PI_GPIO_DRIVE_STRENGTH_LOW;
+
+        pi_gpio_pin_configure(&fxl6408->gpio_irq, gpio_pin, cfg_flags);
+        pi_gpio_pin_notif_configure(&fxl6408->gpio_irq, gpio_pin, irq_type);
+
+        pi_task_callback(&fxl6408->gpio_irq_cb, __gpio_irq_cb, (void*) fxl6408);
+
+        if (PI_OK != pi_gpio_pin_task_add(&fxl6408->gpio_irq, gpio_pin, &fxl6408->gpio_irq_cb, irq_type))
+            goto error3;
+    }
+
+    /* clears the reset int else interrupt wont trigger the int pin
+     * (value is not important) */
+    __pi_fxl6408_reg_read(&fxl6408->i2c, FXL6408_REG_DEVICE_ID_CTRL);
+    /* clears the interrupt status (value is not important) */
+    __pi_fxl6408_reg_read(&fxl6408->i2c, FXL6408_REG_INTERRUPT_STATUS);
+
+    return 0;
+
+error3:
+    //pi_gpio_close(&fxl6408->gpio_irq); //not implemented on pulpos2 ?
+error2:
+    pi_i2c_close(&fxl6408->i2c);
+error:
+    pmsis_l2_malloc_free(fxl6408, sizeof(fxl6408_t));
+    device->data = NULL; /* do not leave a pointer to the freed state behind */
+    return -2;
+}
+
+/* Configures one expander pin from gpio_conf (direction, then output state or
+ * input trigger / pull / irq task): updates the cached registers and writes them
+ * over I2C. Returns PI_OK, PI_ERR_INVALID_ARG or PI_ERR_INVALID_STATE (I2C write). */
+int pi_fxl6408_gpio_set(pi_device_t *device, pi_fxl6408_gpio_conf_t *gpio_conf)
+{
+    if (NULL == device || NULL == device->data || NULL == gpio_conf
+            || (uint32_t) gpio_conf->id >= 8) /* id indexes irq_tasks[8] and 8-bit registers */
+    {
+        return PI_ERR_INVALID_ARG;
+    }
+
+    fxl6408_t *fxl6408 = (fxl6408_t *)device->data;
+
+    if (gpio_conf->direction == FXL6408_GPIO_DIR_OUTPUT)
+    {
+        /* pin is in output mode, only set registers that have an effect */
+        fxl6408->gpio_dir = __BITINSERT_R(fxl6408->gpio_dir,
+                __GPIO_MODE_OUTPUT, 1, gpio_conf->id);
+        if (__pi_fxl6408_reg_write(&fxl6408->i2c, FXL6408_REG_IO_DIRECTION, fxl6408->gpio_dir))
+        {
+            return PI_ERR_INVALID_STATE;
+        }
+
+        if (gpio_conf->output_state == FXL6408_GPIO_OUTPUT_STATE_DISABLED)
+        {
+            fxl6408->gpio_high_z = __BITINSERT_R(fxl6408->gpio_high_z,
+                    __GPIO_HIGHZ_ENABLED, 1, gpio_conf->id);
+        }
+        else {
+            fxl6408->gpio_high_z = __BITINSERT_R(fxl6408->gpio_high_z,
+                    __GPIO_HIGHZ_DISABLED, 1, gpio_conf->id);
+
+            if (gpio_conf->output_state == FXL6408_GPIO_OUTPUT_STATE_LOW)
+            {
+                fxl6408->gpio_value = __BITINSERT_R(fxl6408->gpio_value,
+                        __GPIO_OUTPUT_LOW, 1, gpio_conf->id);
+            }
+            else
+            {
+                fxl6408->gpio_value = __BITINSERT_R(fxl6408->gpio_value,
+                        __GPIO_OUTPUT_HIGH, 1, gpio_conf->id);
+            }
+
+            if (__pi_fxl6408_reg_write(&fxl6408->i2c, FXL6408_REG_OUTPUT_STATE, fxl6408->gpio_value))
+            {
+                return PI_ERR_INVALID_STATE;
+            }
+        }
+        if (__pi_fxl6408_reg_write(&fxl6408->i2c, FXL6408_REG_OUTPUT_HIGHZ, fxl6408->gpio_high_z))
+        {
+            return PI_ERR_INVALID_STATE;
+        }
+    }
+    else
+    {
+        /* pin is in input mode, only set registers that have an effect */
+        fxl6408->gpio_dir = __BITINSERT_R(fxl6408->gpio_dir,
+                __GPIO_MODE_INPUT, 1, gpio_conf->id);
+        if (__pi_fxl6408_reg_write(&fxl6408->i2c, FXL6408_REG_IO_DIRECTION, fxl6408->gpio_dir))
+        {
+            return PI_ERR_INVALID_STATE;
+        }
+
+        // input trigger => (disabled, rising, falling)
+        if (gpio_conf->input_trigger == FXL6408_GPIO_INPUT_TRIGGER_DISABLED)
+        {
+            fxl6408->gpio_interrupt_mask = __BITINSERT_R(fxl6408->gpio_interrupt_mask,
+                    __GPIO_INTERRUPT_DISABLED, 1, gpio_conf->id);
+        }
+        else {
+            fxl6408->gpio_interrupt_mask = __BITINSERT_R(fxl6408->gpio_interrupt_mask,
+                    __GPIO_INTERRUPT_ENABLED, 1, gpio_conf->id);
+
+            if (gpio_conf->input_trigger == FXL6408_GPIO_INPUT_TRIGGER_FALLING)
+            {
+                fxl6408->gpio_trigger = __BITINSERT_R(fxl6408->gpio_trigger,
+                        __GPIO_TRIGGER_FALLING, 1, gpio_conf->id);
+            }
+            else
+            {
+                fxl6408->gpio_trigger = __BITINSERT_R(fxl6408->gpio_trigger,
+                        __GPIO_TRIGGER_RISING, 1, gpio_conf->id);
+            }
+
+            if (__pi_fxl6408_reg_write(&fxl6408->i2c, FXL6408_REG_INPUT_DEFAULT_STATE, fxl6408->gpio_trigger))
+            {
+                return PI_ERR_INVALID_STATE;
+            }
+
+            /* set the irq task */
+            if (NULL != gpio_conf->irq_task)
+            {
+                fxl6408->irq_tasks[gpio_conf->id] = gpio_conf->irq_task;
+            }
+        }
+        if (__pi_fxl6408_reg_write(&fxl6408->i2c, FXL6408_REG_INTERRUPT_MASK, fxl6408->gpio_interrupt_mask))
+        {
+            return PI_ERR_INVALID_STATE;
+        }
+
+        // pull state => (disabled, pull-up, pull-down)
+        if (gpio_conf->pull_state == FXL6408_GPIO_PULL_STATE_DISABLED)
+        {
+            fxl6408->gpio_pull_enable = __BITINSERT_R(fxl6408->gpio_pull_enable,
+                    __GPIO_PULL_DISABLED, 1, gpio_conf->id);
+        }
+        else {
+            fxl6408->gpio_pull_enable = __BITINSERT_R(fxl6408->gpio_pull_enable,
+                    __GPIO_PULL_ENABLED, 1, gpio_conf->id);
+
+            if (gpio_conf->pull_state == FXL6408_GPIO_PULL_STATE_DOWN)
+            {
+                fxl6408->gpio_pull_updown = __BITINSERT_R(fxl6408->gpio_pull_updown,
+                        __GPIO_PULL_DOWN, 1, gpio_conf->id);
+            }
+            else
+            {
+                fxl6408->gpio_pull_updown = __BITINSERT_R(fxl6408->gpio_pull_updown,
+                        __GPIO_PULL_UP, 1, gpio_conf->id);
+            }
+
+            if (__pi_fxl6408_reg_write(&fxl6408->i2c, FXL6408_REG_PULL_UP_DOWN, fxl6408->gpio_pull_updown))
+            {
+                return PI_ERR_INVALID_STATE;
+            }
+        }
+        if (__pi_fxl6408_reg_write(&fxl6408->i2c, FXL6408_REG_PULL_ENABLE,
+                    fxl6408->gpio_pull_enable))
+        {
+            return PI_ERR_INVALID_STATE;
+        }
+    }
+
+    return PI_OK;
+}
+
+int pi_fxl6408_input_status_get(pi_device_t *device, uint8_t *input_status)
+{
+    /* Reject missing arguments and devices without driver state */
+    if (!device || !device->data || !input_status)
+    {
+        return PI_ERR_INVALID_ARG;
+    }
+
+    pi_device_t *i2c = &((fxl6408_t *)device->data)->i2c;
+    *input_status = __pi_fxl6408_reg_read(i2c, FXL6408_REG_INPUT_STATUS);
+    return PI_OK;
+}
+
+int pi_fxl6408_interrupt_status_get(pi_device_t *device, uint8_t *interrupt_status)
+{
+    /* Reject missing arguments and devices without driver state */
+    if (!device || !device->data || !interrupt_status)
+    {
+        return PI_ERR_INVALID_ARG;
+    }
+
+    pi_device_t *i2c = &((fxl6408_t *)device->data)->i2c;
+    *interrupt_status = __pi_fxl6408_reg_read(i2c, FXL6408_REG_INTERRUPT_STATUS);
+    return PI_OK;
+}
+
+void pi_fxl6408_close(pi_device_t *device)
+{
+    if (NULL == device || NULL == device->data) return; /* never opened, or already closed */
+    fxl6408_t *fxl6408 = (fxl6408_t *)device->data;
+    __pi_fxl6408_reset(fxl6408); // Make sure it is not driving anymore any gpio output
+    pi_i2c_close(&fxl6408->i2c);
+    pmsis_l2_malloc_free(fxl6408, sizeof(fxl6408_t)); /* NOTE(review): gpio irq task from open stays registered — confirm */
+    device->data = NULL; /* lets the NULL == device->data guards reject later calls */
+}
+
+/* Default expander configuration: I2C interface 0 and interrupt pin 0. */
+void pi_fxl6408_conf_init(struct pi_fxl6408_conf *conf)
+{
+    conf->i2c_itf = 0;
+    conf->interrupt_pin = 0;
+}
+
+void pi_fxl6408_gpio_conf_init(pi_fxl6408_gpio_conf_t *gpio_conf)
+{
+    /* A NULL configuration is left alone */
+    if (NULL != gpio_conf)
+    {
+        gpio_conf->id = 0;
+
+        /* set the pin as output high impedance */
+        gpio_conf->direction = FXL6408_GPIO_DIR_OUTPUT;
+        gpio_conf->output_state = FXL6408_GPIO_OUTPUT_STATE_DISABLED;
+
+        /* no effect */
+        gpio_conf->input_trigger = FXL6408_GPIO_INPUT_TRIGGER_DISABLED;
+        gpio_conf->pull_state = FXL6408_GPIO_PULL_STATE_DISABLED;
+
+        /* no interrupt task by default */
+        gpio_conf->irq_task = NULL;
+    }
+}
diff --git a/rtos/pmsis/pmsis_bsp/include/bsp/adc/ads1014.h b/rtos/pmsis/pmsis_bsp/include/bsp/adc/ads1014.h
new file mode 100644
index 000000000..df4eb24be
--- /dev/null
+++ b/rtos/pmsis/pmsis_bsp/include/bsp/adc/ads1014.h
@@ -0,0 +1,218 @@
+/*
+ * Copyright (C) 2021 GreenWaves Technologies
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "pmsis.h"
+
+#pragma once
+
+/**
+ * @addtogroup ADC
+ * @{
+ */
+
+/**
+ * @defgroup ADS1014 ADS1014
+ *
+ * TI ADS1014 Analog-To-Digital-Converter
+ *
+ * @warning Support for comparator IRQ handling is not implemented.
+ */
+
+/**
+ * @addtogroup ADS1014
+ * @{
+ */
+
+/**
+ * ADS1014 PGA (Programmable Gain Amplifier) values
+ *
+ * It is the range of the measured value.
+ * The absolute value of the measured voltage will never go above the
+ * power supply voltage value.
+ */
+enum ads1014_pga {
+    ADS1014_PGA_FSR_6V144 = 0x0,
+    ADS1014_PGA_FSR_4V096 = 0x1,
+    ADS1014_PGA_FSR_2V048 = 0x2,
+    ADS1014_PGA_FSR_1V024 = 0x3,
+    ADS1014_PGA_FSR_0V512 = 0x4,
+    ADS1014_PGA_FSR_0V256 = 0x5,
+};
+
+/**
+ * ADS1014 operating mode values
+ */
+enum ads1014_operating_mode {
+    /** The ADC measures continuously. */
+    ADS1014_OPERATING_MODE_CONTINUOUS  = 0x0,
+    /** The ADC only measures once, and goes back to power-saving mode */
+    ADS1014_OPERATING_MODE_SINGLE_SHOT = 0x1,
+};
+
+/**
+ * ADS1014 Sampling rate (samples per second)
+ */
+enum ads1014_data_rate {
+    ADS1014_DATA_RATE_SPS_128  = 0x0,
+    ADS1014_DATA_RATE_SPS_250  = 0x1,
+    ADS1014_DATA_RATE_SPS_490  = 0x2,
+    ADS1014_DATA_RATE_SPS_920  = 0x3,
+    ADS1014_DATA_RATE_SPS_1600 = 0x4,
+    ADS1014_DATA_RATE_SPS_2400 = 0x5,
+    ADS1014_DATA_RATE_SPS_3300 = 0x6,
+};
+
+/**
+ * ADS1014 Comparator mode
+ */
+enum ads1014_comparator_mode {
+    /**
+     * The comparator triggers when the measured value goes above the high
+     * threshold, and resets when the value goes below the low threshold.
+     */
+    ADS1014_COMPARATOR_MODE_TRADITIONAL = 0x0,
+    /**
+     * The comparator triggers if the measured value goes outside the window,
+     * i.e. above the high threshold or below the low threshold.
+     */
+    ADS1014_COMPARATOR_MODE_WINDOW      = 0x1,
+};
+
+/**
+ * ADS1014 Alert/Ready comparator pin polarity
+ *
+ * Controls the ADC Alert/Ready pin active polarity
+ */
+enum ads1014_comparator_polarity {
+    ADS1014_COMPARATOR_POLARITY_ACTIVE_LOW  = 0x0,
+    ADS1014_COMPARATOR_POLARITY_ACTIVE_HIGH = 0x1,
+};
+
+/**
+ * ADS1014 comparator latching mode
+ *
+ * Determines whether the comparator latches after triggering.
+ * When the comparator is set to latch, it will only be cleared by reading
+ * the ADC measured value.
+ */
+enum ads1014_comparator_latch {
+    ADS1014_COMPARATOR_LATCH_DISABLED = 0x0,
+    ADS1014_COMPARATOR_LATCH_ENABLED  = 0x1,
+};
+
+/**
+ * ADS1014 comparator status
+ *
+ * Determines after how many out-of-bounds conversions the comparator will
+ * trigger.
+ */
+enum ads1014_comparator_status {
+    /** triggers after 1 conversion */
+    ADS1014_COMPARATOR_STATUS_ASSERT_ONE   = 0x0,
+    /** triggers after 2 out-of-bounds conversions */
+    ADS1014_COMPARATOR_STATUS_ASSERT_TWO   = 0x1,
+    /** triggers after 3 out-of-bounds conversions */
+    ADS1014_COMPARATOR_STATUS_ASSERT_THREE = 0x2,
+    /** the comparator is disabled */
+    ADS1014_COMPARATOR_STATUS_DISABLED     = 0x3,
+};
+
+/** @brief Structure holding ADS1014 configuration */
+struct pi_ads1014_conf
+{
+    /** I2C interface which is connected to the ADC */
+    uint8_t i2c_itf;
+    /** I2C address of the ADC */
+    uint8_t i2c_addr;
+
+    /** ADC operating mode (single-shot or continuous) */
+    enum ads1014_operating_mode operating_mode;
+    /** range of the measured value */
+    enum ads1014_pga pga;
+    /** sampling rate */
+    enum ads1014_data_rate data_rate;
+
+    /** ADC comparator status (enabled and trigger condition, or disabled) */
+    enum ads1014_comparator_status comparator_status;
+    /** ADC comparator mode (traditional or window) */
+    enum ads1014_comparator_mode comparator_mode;
+    /** ADC comparator latch setting */
+    enum ads1014_comparator_latch comparator_latch;
+    /** Active polarity of the Alert/Ready comparator pin */
+    enum ads1014_comparator_polarity comparator_polarity;
+};
+
+/**
+ * @brief Initialize an ADS1014 configuration with default values.
+ *
+ * The structure containing the configuration must be kept alive until
+ * the device is opened.
+ * It can only be called from fabric-controller side.
+ *
+ * @param[inout] conf Pointer to the device configuration.
+ */
+void pi_ads1014_conf_init(struct pi_ads1014_conf *conf);
+
+/**
+ * Open an ADS1014 device
+ *
+ * @param[inout] device pointer to the ADS1014 device
+ *
+ * @return PI_OK if operation was successful,
+ *         an error code otherwise
+ */
+int pi_ads1014_open(pi_device_t *device);
+
+/**
+ * Close an ADS1014 device
+ *
+ * @param[inout] device pointer to the ADS1014 device
+ */
+void pi_ads1014_close(pi_device_t *device);
+
+
+/**
+ * Read the value measured by the ADC.
+ *
+ * @param[in] device pointer to the ads1014 device
+ * @param[out] value value in mV returned by the ADC
+ *
+ * @return PI_OK if operation was successful,
+ *         an error code otherwise
+ */
+int pi_ads1014_read(pi_device_t *device, float *value);
+
+/**
+ * Set the comparator thresholds (low and high)
+ *
+ * @param[in] device pointer to the ads1014 device
+ * @param[in] threshold_low new value for comparator low threshold (in mV)
+ * @param[in] threshold_high new value for comparator high threshold (in mV)
+ *
+ * @return PI_OK if operation was successful,
+ *         an error code otherwise
+ */
+int pi_ads1014_set_comparator_thresholds(pi_device_t *device,
+        float threshold_low, float threshold_high);
+
+
+/**
+ * @}
+ */
+
+/**
+ * @}
+ */
diff --git a/rtos/pmsis/pmsis_bsp/include/bsp/audio/adc/tlv320.h b/rtos/pmsis/pmsis_bsp/include/bsp/audio/adc/tlv320.h
new file mode 100644
index 000000000..c78aa4cd7
--- /dev/null
+++ b/rtos/pmsis/pmsis_bsp/include/bsp/audio/adc/tlv320.h
@@ -0,0 +1,64 @@
+/*
+ * Copyright (C) 2019 GreenWaves Technologies
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "pmsis.h"
+
+#pragma once
+
+/**
+ * @addtogroup ADC
+ * @{
+ */
+
+/**
+ * @defgroup tlv320 tlv320
+ *
+ * ADC tlv320
+ */
+
+/**
+ * @addtogroup tlv320
+ * @{
+ */
+
+/** @brief Struct holding tlv320 config. */
+struct pi_tlv320_conf
+{
+    int i2c_itf;    /*!< I2C interface number (presumably where the device is connected — TODO confirm). */
+};
+
+/**
+ * @brief Initialize a tlv320 configuration with default values.
+ *
+ * The structure containing the configuration must be kept alive until
+ * the device is opened.
+ * It can only be called from fabric-controller side.
+ *
+ * @param conf           Pointer to the device configuration.
+ */
+void pi_tlv320_conf_init(struct pi_tlv320_conf *conf);
+
+int pi_tlv320_open(pi_device_t *device);
+
+void pi_tlv320_close(struct pi_device *device);
+
+/**
+ * @}
+ */
+
+/**
+ * @}
+ */
diff --git a/rtos/pmsis/pmsis_bsp/include/bsp/audio/dac/ak4332.h b/rtos/pmsis/pmsis_bsp/include/bsp/audio/dac/ak4332.h
new file mode 100644
index 000000000..f2c3833e0
--- /dev/null
+++ b/rtos/pmsis/pmsis_bsp/include/bsp/audio/dac/ak4332.h
@@ -0,0 +1,104 @@
+/*
+ * Copyright (C) 2019 GreenWaves Technologies
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "pmsis.h"
+
+#pragma once
+
+/**
+ * @addtogroup Dac
+ * @{
+ */
+
+/**
+ * @defgroup Ak4332 Ak4332
+ *
+ * DAC AK4332
+ */
+
+/**
+ * @addtogroup Ak4332
+ * @{
+ */
+
+/** @brief Struct holding ak4332 config. */
+struct pi_ak4332_conf
+{
+    int i2c_itf;    /*!< I2C interface number where the device is connected. */
+};
+
+/**
+ * @brief Initialize an ak4332 configuration with default values.
+ *
+ * The structure containing the configuration must be kept alive until
+ * the device is opened.
+ * It can only be called from fabric-controller side.
+ *
+ * @param conf           Pointer to the device configuration.
+ */
+void pi_ak4332_conf_init(struct pi_ak4332_conf *conf);
+
+/** \brief Open an ak4332 device.
+ *
+ * This function must be called before the ak4332 device can be used.
+ * It will do all the needed configuration to make it usable and initialize
+ * the handle used to refer to this opened device when calling other functions.
+ *
+ * \param device    A pointer to the device structure of the device to open.
+ *   This structure is allocated by the caller and must be kept alive until the
+ *   device is closed.
+ * \return          0 if the operation is successful, -1 if there was an error.
+ */
+int pi_ak4332_open(pi_device_t *device);
+
+/**
+ * @brief Set DAC digital input volume
+ *
+ * The volume can be set to 0 to mute it or from 0x01 (-12dB) to 0x1F (+3.0dB).
+ * @param device Pointer to the device structure.
+ * @param volume Volume code, in the range described above.
+ * \return          0 if the operation is successful, -1 if there was an error.
+ */
+int pi_ak4332_set_dac_volume(pi_device_t *device, uint8_t volume);
+
+/**
+ * @brief Set headphone amplifier volume
+ *
+ * The volume can be set from 0x00 (-10dB) to 0x07 (+4dB).
+ * @param device Pointer to the device structure.
+ * @param volume Volume code, in the range described above.
+ * \return          0 if the operation is successful, -1 if there was an error.
+ */
+int pi_ak4332_set_hp_volume(pi_device_t *device, uint8_t volume);
+
+/** \brief Close an opened ak4332 device.
+ *
+ * This function can be called to close an opened ak4332 device once it is
+ * not needed anymore, in order to free all allocated resources. Once this
+ * function is called, the device is not accessible anymore and must be opened
+ * again before being used.
+ *
+ * \param device    The device structure of the device to close.
+ */
+void pi_ak4332_close(struct pi_device *device);
+
+/**
+ * @}
+ */
+
+/**
+ * @}
+ */
diff --git a/rtos/pmsis/pmsis_bsp/include/bsp/boards/gap9_evk/audio_addon.h b/rtos/pmsis/pmsis_bsp/include/bsp/boards/gap9_evk/audio_addon.h
index c8da1cb47..47ed8bfc5 100644
--- a/rtos/pmsis/pmsis_bsp/include/bsp/boards/gap9_evk/audio_addon.h
+++ b/rtos/pmsis/pmsis_bsp/include/bsp/boards/gap9_evk/audio_addon.h
@@ -18,11 +18,12 @@
 
 #include <stdint.h>
 
-#define CONFIG_AK4332
-
 #define CONFIG_AK4332_I2C_ITF 1
 #define CONFIG_AK4332_I2S_ITF 2
 
+#define CONFIG_TLV320_I2C_ITF 1
+#define CONFIG_TLV320_I2S_ITF 2
+
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -39,7 +40,6 @@ uint8_t pi_bsp_fxl6408_read_id();
 void __bsp_audio_addon_init();
 
 #define CONFIG_FXL6408UMX_I2C_ITF 1
-#define CONFIG_FXL6408UMX_I2C_ADDR 0x86
 #define CONFIG_FXL6408UMX_AK4332_GPIO 1
 #define CONFIG_FXL6408UMX_TLV320_GPIO 2
 /// @endcond
diff --git a/rtos/pmsis/pmsis_bsp/include/bsp/bsp.h b/rtos/pmsis/pmsis_bsp/include/bsp/bsp.h
index 2beabc8ba..bda0383c2 100644
--- a/rtos/pmsis/pmsis_bsp/include/bsp/bsp.h
+++ b/rtos/pmsis/pmsis_bsp/include/bsp/bsp.h
@@ -178,6 +178,12 @@ void bsp_atxp032_conf_init(struct pi_atxp032_conf *conf);
 int bsp_atxp032_open(struct pi_atxp032_conf *conf);
 #endif
 
+#if defined(CONFIG_MX25U51245G)
+#include "bsp/flash/mx25u51245g.h"
+void bsp_mx25u51245g_conf_init(struct pi_mx25u51245g_conf *conf);
+int bsp_mx25u51245g_open(struct pi_mx25u51245g_conf *conf);
+#endif
+
 #if defined(CONFIG_NINA_W10)
 #include "bsp/transport/nina_w10.h"
 void bsp_nina_w10_conf_init(struct pi_nina_w10_conf *conf);
@@ -204,6 +210,19 @@ void bsp_ak4332_conf_init(struct pi_ak4332_conf *conf);
 int bsp_ak4332_open(struct pi_ak4332_conf *conf);
 #endif  /* CONFIG_AK4332 */
 
+#if defined(CONFIG_TLV320)
+#include "audio/adc/tlv320.h"
+void bsp_tlv320_conf_init(struct pi_tlv320_conf *conf);
+int bsp_tlv320_open(struct pi_tlv320_conf *conf);
+#endif  /* CONFIG_TLV320 */
+
+#if defined(CONFIG_FXL6408)
+#include "gpio/fxl6408.h"
+void bsp_fxl6408_conf_init(struct pi_fxl6408_conf *conf);
+int bsp_fxl6408_open(struct pi_fxl6408_conf *conf);
+int bsp_fxl6408_close(struct pi_fxl6408_conf *conf);
+#endif  /* CONFIG_FXL6408 */
+
 void bsp_init();
 
 void pi_bsp_init();
diff --git a/rtos/pmsis/pmsis_bsp/include/bsp/flash/mx25u51245g.h b/rtos/pmsis/pmsis_bsp/include/bsp/flash/mx25u51245g.h
new file mode 100644
index 000000000..d2cb1fa88
--- /dev/null
+++ b/rtos/pmsis/pmsis_bsp/include/bsp/flash/mx25u51245g.h
@@ -0,0 +1,75 @@
+/*
+ * Copyright (C) 2019 GreenWaves Technologies
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __BSP__FLASH__MX25U51245G_H__
+#define __BSP__FLASH__MX25U51245G_H__
+
+#include "bsp/flash.h"
+
+/**
+ * @addtogroup Flash
+ * @{
+ */
+
+/**
+ * @defgroup Mx25u51245g Mx25u51245g
+ *
+ */
+
+/**
+ * @addtogroup Mx25u51245g
+ * @{
+ */
+
+/**@{*/
+
+/** \struct pi_mx25u51245g_conf
+ * \brief Mx25u51245g configuration structure.
+ *
+ * This structure is used to pass the desired Mx25u51245g configuration to the
+ * runtime when opening the device.
+ */
+struct pi_mx25u51245g_conf
+{
+  struct pi_flash_conf flash;  /*!< Generic flash configuration. */
+    int spi_itf;           /*!< SPI interface where the flash is
+      connected. */
+    int spi_cs;            /*!< Chip select where the flash is connected. */
+    int xip_en;            /*!< NOTE(review): presumably enables XIP (execute in place); confirm against the driver. */
+    uint32_t baudrate;     /*!< Baudrate (in bytes/second). */
+};
+
+/** \brief Initialize an Mx25u51245g configuration with default values.
+ *
+ * The structure containing the configuration must be kept alive until the
+ * mx25u51245g device is opened.
+ *
+ * \param conf A pointer to the mx25u51245g configuration.
+ */
+void pi_mx25u51245g_conf_init(struct pi_mx25u51245g_conf *conf);
+
+
+//!@}
+
+/**
+ * @} end of Mx25u51245g
+ */
+
+/**
+ * @} end of Flash
+ */
+
+#endif 
diff --git a/rtos/pmsis/pmsis_bsp/include/bsp/gap9_evk.h b/rtos/pmsis/pmsis_bsp/include/bsp/gap9_evk.h
index 735d7a0f6..9a0cebbc8 100644
--- a/rtos/pmsis/pmsis_bsp/include/bsp/gap9_evk.h
+++ b/rtos/pmsis/pmsis_bsp/include/bsp/gap9_evk.h
@@ -22,46 +22,23 @@
 #endif
 
 #define CONFIG_HIMAX
-#define CONFIG_HYPERFLASH
 #define CONFIG_MRAM
-#define CONFIG_HYPERRAM
-#define CONFIG_SPIRAM
-#define CONFIG_SPIFLASH
 #define CONFIG_24XX1025
 #define CONFIG_APS25XXXN
 #define CONFIG_VIRTUAL_EEPROM
-#define CONFIG_ATXP032
+#define CONFIG_MX25U51245G
 #define CONFIG_NINA_B112
 
 #define CONFIG_HIMAX_CPI_ITF 0
 #define CONFIG_HIMAX_I2C_ITF 0
 
-#define CONFIG_HYPERFLASH_HYPER_ITF 0
-#define CONFIG_HYPERFLASH_HYPER_CS  1
-
-#define CONFIG_HYPERRAM_HYPER_ITF 0
-#define CONFIG_HYPERRAM_HYPER_CS  0
-#define CONFIG_HYPERRAM_START     0
-#define CONFIG_HYPERRAM_SIZE     (8<<20)
-
-#define CONFIG_SPIRAM_SPI_ITF   0
-#define CONFIG_SPIRAM_SPI_CS    0
-#define CONFIG_SPIRAM_START     0
-#define CONFIG_SPIRAM_SIZE     (1<<20)
-
-#define CONFIG_APS25XXXN_SPI_ITF   1
-#define CONFIG_APS25XXXN_SPI_CS    0
+#define CONFIG_APS25XXXN_SPI_ITF   0
+#define CONFIG_APS25XXXN_SPI_CS    1
 #define CONFIG_APS25XXXN_START     0
 #define CONFIG_APS25XXXN_SIZE     (1<<25)
 
-#define CONFIG_ATXP032_SPI_ITF   1
-#define CONFIG_ATXP032_SPI_CS    1
-
-#define CONFIG_SPIFLASH_SPI_ITF     0
-#define CONFIG_SPIFLASH_SPI_CS      0
-#define CONFIG_SPIFLASH_START       0
-#define CONFIG_SPIFLASH_SIZE        (1<<24)
-#define CONFIG_SPIFLASH_SECTOR_SIZE (1<<12)
+#define CONFIG_MX25U51245G_SPI_ITF   0
+#define CONFIG_MX25U51245G_SPI_CS    0
 
 #define CONFIG_24XX1025_I2C_ADDR         0xA0
 #define CONFIG_24XX1025_I2C_ITF          0
@@ -80,4 +57,10 @@
 #define GPIO_NINA_PWRON                ( PI_PAD_042)
 #define GPIO_NINA17_DSR                ( PI_PAD_043)
 
+#define pi_default_flash_conf pi_mx25u51245g_conf
+#define pi_default_flash_conf_init pi_mx25u51245g_conf_init
+
+#define pi_default_ram_conf pi_aps25xxxn_conf
+#define pi_default_ram_conf_init pi_aps25xxxn_conf_init
+
 #endif
diff --git a/rtos/pmsis/pmsis_bsp/include/bsp/gap9_v2.h b/rtos/pmsis/pmsis_bsp/include/bsp/gap9_v2.h
index e935802f3..7eba5d772 100644
--- a/rtos/pmsis/pmsis_bsp/include/bsp/gap9_v2.h
+++ b/rtos/pmsis/pmsis_bsp/include/bsp/gap9_v2.h
@@ -45,13 +45,13 @@
 #define CONFIG_SPIRAM_START     0
 #define CONFIG_SPIRAM_SIZE     (1<<20)
 
-#define CONFIG_APS25XXXN_SPI_ITF   1
-#define CONFIG_APS25XXXN_SPI_CS    0
+#define CONFIG_APS25XXXN_SPI_ITF   0
+#define CONFIG_APS25XXXN_SPI_CS    1
 #define CONFIG_APS25XXXN_START     0
 #define CONFIG_APS25XXXN_SIZE     (1<<25)
 
-#define CONFIG_ATXP032_SPI_ITF   1
-#define CONFIG_ATXP032_SPI_CS    1
+#define CONFIG_ATXP032_SPI_ITF   0
+#define CONFIG_ATXP032_SPI_CS    0
 
 #define CONFIG_SPIFLASH_SPI_ITF     0
 #define CONFIG_SPIFLASH_SPI_CS      0
@@ -76,4 +76,22 @@
 #define GPIO_NINA_PWRON                ( PI_PAD_042)
 #define GPIO_NINA17_DSR                ( PI_PAD_043)
 
+#if defined(__PLATFORM_GVSOC__)
+
+#define pi_default_flash_conf pi_hyperflash_conf
+#define pi_default_flash_conf_init pi_hyperflash_conf_init
+
+#define pi_default_ram_conf pi_hyperram_conf
+#define pi_default_ram_conf_init pi_hyperram_conf_init
+
+#else
+
+#define pi_default_flash_conf pi_atxp032_conf
+#define pi_default_flash_conf_init pi_atxp032_conf_init
+
+#define pi_default_ram_conf pi_aps25xxxn_conf
+#define pi_default_ram_conf_init pi_aps25xxxn_conf_init
+
+#endif
+
 #endif
diff --git a/rtos/pmsis/pmsis_bsp/include/bsp/gpio/fxl6408.h b/rtos/pmsis/pmsis_bsp/include/bsp/gpio/fxl6408.h
new file mode 100644
index 000000000..268733dae
--- /dev/null
+++ b/rtos/pmsis/pmsis_bsp/include/bsp/gpio/fxl6408.h
@@ -0,0 +1,196 @@
+/*
+ * Copyright (C) 2022 GreenWaves Technologies
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "pmsis.h"
+
+#pragma once
+
+/**
+ * @addtogroup GPIO
+ * @{
+ */
+
+/**
+ * @defgroup FXL6408 FXL6408
+ *
+ * I2C Controlled GPIO Expander (8 configurable IOs)
+ */
+
+/**
+ * @addtogroup FXL6408
+ * @{
+ */
+
+/** @brief Struct holding FXL6408 device configuration. */
+struct pi_fxl6408_conf
+{
+    int i2c_itf;    /*!< I2C interface number where the device is connected. */
+    pi_gpio_e interrupt_pin; /*!< GPIO used for the FXL6408 interrupt (presumably the host pin wired to its INT output — TODO confirm). */
+};
+
+/**
+ * Direction of a GPIO (input or output)
+ */
+typedef enum {
+    FXL6408_GPIO_DIR_INPUT = 0x0,
+    FXL6408_GPIO_DIR_OUTPUT = 0x1,
+} fxl6408_gpio_dir_e;
+
+/**
+ * GPIO Output state
+ *
+ * This has no effect in input mode.
+ */
+typedef enum {
+    /** GPIO is in High-Z (impedance) mode */
+    FXL6408_GPIO_OUTPUT_STATE_DISABLED = 0x0,
+    /** GPIO is in low voltage level, or 0 */
+    FXL6408_GPIO_OUTPUT_STATE_LOW = 0x1,
+    /** GPIO is in high voltage level, or 1 */
+    FXL6408_GPIO_OUTPUT_STATE_HIGH = 0x2,
+} fxl6408_gpio_output_state_e;
+
+/**
+ * GPIO Input trigger conditions (disabled, rising or falling edge)
+ *
+ * This has no effect in output mode.
+ */
+typedef enum {
+    /** Input will trigger on a rising edge */
+    FXL6408_GPIO_INPUT_TRIGGER_RISING = 0x0,
+    /** Input will trigger on a falling edge */
+    FXL6408_GPIO_INPUT_TRIGGER_FALLING = 0x1,
+    /** Input will never trigger */
+    FXL6408_GPIO_INPUT_TRIGGER_DISABLED = 0x2,
+} fxl6408_gpio_input_trigger_e;
+
+/** GPIO Pull up/down state
+ *
+ * This has no effect in output mode.
+ */
+typedef enum {
+    /** GPIO is set to pull down */
+    FXL6408_GPIO_PULL_STATE_DOWN = 0x0,
+    /** GPIO is set to pull up */
+    FXL6408_GPIO_PULL_STATE_UP = 0x1,
+    /** GPIO pull is disabled */
+    FXL6408_GPIO_PULL_STATE_DISABLED = 0x2,
+} fxl6408_gpio_pull_state_e;
+
+/** Structure holding the configuration of a FXL6408 GPIO */
+typedef struct {
+    /** GPIO id(from 0 to 7) */
+    uint8_t id;
+    /** Direction (input or output) */
+    fxl6408_gpio_dir_e direction;
+    /** Output State (disabled/High-Z, 0 or 1) */
+    fxl6408_gpio_output_state_e output_state;
+    /** Input trigger (trigger on falling edge, rising edge or disabled) */
+    fxl6408_gpio_input_trigger_e input_trigger;
+    /** pull state (pull-up, pull-down, disabled) */
+    fxl6408_gpio_pull_state_e pull_state;
+    /** task executed when an irq is detected */
+    pi_task_t *irq_task;
+} pi_fxl6408_gpio_conf_t;
+
+/**
+ * @brief Initialize an FXL6408 configuration with default values.
+ *
+ * The structure containing the configuration must be kept alive until
+ * the device is opened.
+ * It can only be called from fabric-controller side.
+ * It is not thread-safe and cannot be called from a pmsis task callback or
+ * interrupt handler.
+ *
+ * @param[inout] conf           Pointer to the device configuration.
+ */
+void pi_fxl6408_conf_init(struct pi_fxl6408_conf *conf);
+
+/**
+ * @brief Open an FXL6408 device.
+ *
+ * It can only be called from fabric-controller side.
+ * It is not thread-safe and cannot be called from a pmsis task callback or
+ * interrupt handler.
+ *
+ * @param[inout] device           Pointer to the device.
+ *
+ * @return 0 if successful or any other value otherwise
+ */
+int pi_fxl6408_open(pi_device_t *device);
+
+/**
+ * Close an FXL6408 device.
+ *
+ * @param[inout] device device to be closed
+ */
+void pi_fxl6408_close(pi_device_t *device);
+
+/**
+ * Initialize the configuration of a GPIO (Output, High-Z)
+ *
+ * @param[inout] gpio_conf configuration of the gpio
+ */
+void pi_fxl6408_gpio_conf_init(pi_fxl6408_gpio_conf_t *gpio_conf);
+
+/**
+ * @brief Set a GPIO state.
+ *
+ * It can only be called from fabric-controller side.
+ * It is not thread-safe and cannot be called from a pmsis task callback or
+ * interrupt handler.
+ *
+ * @param[in] device           Pointer to the device.
+ * @param[in] gpio_conf        GPIO configuration
+ * @return PI_OK if successful or any other value otherwise
+ */
+int pi_fxl6408_gpio_set(pi_device_t *device, pi_fxl6408_gpio_conf_t *gpio_conf);
+
+/**
+ * Return the current status of inputs
+ *
+ * Each bit of the input status is the status of the corresponding
+ * gpio input.
+ *
+ * @param[in] device pointer to the device
+ * @param[out] input_status value of the input status register
+ *
+ * @return PI_OK if operation was successful, an error code otherwise.
+ */
+int pi_fxl6408_input_status_get(pi_device_t *device, uint8_t *input_status);
+
+/**
+ * Return the current status of interrupts
+ *
+ * This will clear the interrupt status register.
+ *
+ * Each bit of the interrupt status is the status of the corresponding
+ * gpio interrupt.
+ *
+ * @param[in] device pointer to the device
+ * @param[out] interrupt_status value of the interrupt status register
+ *
+ * @return PI_OK if operation was successful, an error code otherwise.
+ */
+int pi_fxl6408_interrupt_status_get(pi_device_t *device, uint8_t *interrupt_status);
+
+/**
+ * @}
+ */
+
+/**
+ * @}
+ */
diff --git a/rtos/pmsis/pmsis_bsp/ram/hyperram/hyperram.c b/rtos/pmsis/pmsis_bsp/ram/hyperram/hyperram.c
index d1ced37ad..853297d77 100644
--- a/rtos/pmsis/pmsis_bsp/ram/hyperram/hyperram.c
+++ b/rtos/pmsis/pmsis_bsp/ram/hyperram/hyperram.c
@@ -92,6 +92,18 @@ static int hyperram_open(struct pi_device *device)
       goto error2;
   }
 
+
+#if defined(WINBOND_HYPER)
+  hyper_crt0_set(REG_ACCESS);
+  hyper_crt1_set(REG_ACCESS);
+  hyperram->reg_value = 0x9f10;
+  pi_hyper_write(&hyperram->hyper_device, 0x1000, &hyperram->reg_value, 2);
+  pi_hyper_read(&hyperram->hyper_device, 0x1000, &hyperram->reg_value, 2);
+  //printf("Reg value of 0x80001000 = %lx\n", hyperram->reg_value);
+  hyper_crt0_set(MEM_ACCESS);
+  hyper_crt1_set(MEM_ACCESS);
+#endif
+
 #if defined(__GAP9__)
 
   pi_hyper_ioctl(&hyperram->hyper_device, PI_HYPER_IOCTL_ENABLE_AES, (void*) 0);
diff --git a/rtos/pmsis/pmsis_bsp/ram/spiram/aps25xxxn.c b/rtos/pmsis/pmsis_bsp/ram/spiram/aps25xxxn.c
index f19c892aa..529b89d71 100644
--- a/rtos/pmsis/pmsis_bsp/ram/spiram/aps25xxxn.c
+++ b/rtos/pmsis/pmsis_bsp/ram/spiram/aps25xxxn.c
@@ -215,6 +215,10 @@ void pi_aps25xxxn_conf_init(struct pi_aps25xxxn_conf *conf)
   conf->baudrate = 0;
   conf->xip_en = 0;
   conf->reserve_addr_0 = 1;
+  #if defined(__GAP9__)
+  conf->ram.aes_conf.enabled = 0;
+  conf->ram.aes_conf.qk_en = 0;
+  #endif
   bsp_aps25xxxn_conf_init(conf);
 }
 
diff --git a/rtos/pmsis/pmsis_bsp/rules/freertos_bsp_rules.mk b/rtos/pmsis/pmsis_bsp/rules/freertos_bsp_rules.mk
index a6e1acd7d..d5bf804f8 100644
--- a/rtos/pmsis/pmsis_bsp/rules/freertos_bsp_rules.mk
+++ b/rtos/pmsis/pmsis_bsp/rules/freertos_bsp_rules.mk
@@ -19,21 +19,23 @@
 include $(PMSIS_BSP_DIR)/src.mk
 
 ifeq ($(BOARD_NAME), gapuino)
-PMSIS_BSP_SRC = $(GAPUINO_SRC)
+PMSIS_BSP_SRC += $(GAPUINO_SRC)
 else ifeq ($(BOARD_NAME), gapoc_a)
-PMSIS_BSP_SRC = $(GAPOC_A_SRC)
+PMSIS_BSP_SRC += $(GAPOC_A_SRC)
 else ifeq ($(BOARD_NAME), gapoc_a_revb)
-PMSIS_BSP_SRC = $(GAPOC_A_SRC)
+PMSIS_BSP_SRC += $(GAPOC_A_SRC)
 else ifeq ($(BOARD_NAME), gapoc_b)
-PMSIS_BSP_SRC = $(GAPOC_B_SRC)
+PMSIS_BSP_SRC += $(GAPOC_B_SRC)
 else ifeq ($(BOARD_NAME), gapoc_b_revb)
-PMSIS_BSP_SRC = $(GAPOC_B_SRC)
+PMSIS_BSP_SRC += $(GAPOC_B_SRC)
 else ifeq ($(BOARD_NAME), vega)
-PMSIS_BSP_SRC = $(VEGA_SRC)
+PMSIS_BSP_SRC += $(VEGA_SRC)
 else ifeq ($(BOARD_NAME), gap9_v2)
-PMSIS_BSP_SRC = $(GAP9_SRC)
+PMSIS_BSP_SRC += $(GAP9_SRC)
 else ifeq ($(BOARD_NAME), ai_deck)
-PMSIS_BSP_SRC = $(AI_DECK_SRC)
+PMSIS_BSP_SRC += $(AI_DECK_SRC)
+else ifeq ($(BOARD_NAME), gap9_evk)
+PMSIS_BSP_SRC += $(GAP9_EVK_SRC)
 endif
 
 EXCLUDE_FROM_SRCS= transport/transport.c transport/nina_w10/nina_w10.c
diff --git a/rtos/pmsis/pmsis_bsp/rules/pulpos/src.mk b/rtos/pmsis/pmsis_bsp/rules/pulpos/src.mk
index d01b1dc05..96493e41f 100644
--- a/rtos/pmsis/pmsis_bsp/rules/pulpos/src.mk
+++ b/rtos/pmsis/pmsis_bsp/rules/pulpos/src.mk
@@ -18,11 +18,6 @@ endif
 BOARD_PROFILE_UPPER = $(shell echo $(PULPOS_BOARD_PROFILE) | tr 'a-z' 'A-Z')
 PULP_CFLAGS += -DCONFIG_PROFILE_$(BOARD_PROFILE_UPPER)
 
-ifneq (,$(findstring $(BOARD_FEATURES),audio_addon))
-	PULP_SRCS += $(BSP_GAP9_EVK_AUDIO_ADDON)
-	PULP_CFLAGS += -DCONFIG_GAP9_EVK_AUDIO_ADDON=1
-endif
-
 # BSP is needed if i2s is used to properly configure pads
 ifeq '$(CONFIG_I2S)' '1'
 CONFIG_BSP = 1
@@ -116,6 +111,5 @@ ifeq '$(CONFIG_BLE_NINA_B112)' '1'
 PULP_SRCS += $(BSP_BLE_NINA_B112_SRC)
 endif
 
-ifeq '$(CONFIG_AK4332)' '1'
-PULP_SRCS += $(BSP_AK4332_SRC)
-endif
+PULP_SRCS += $(PMSIS_BSP_SRC)
+PULP_CFLAGS += $(PMSIS_BSP_CFLAGS)
diff --git a/rtos/pmsis/pmsis_bsp/src.mk b/rtos/pmsis/pmsis_bsp/src.mk
index 58c747924..505fe287f 100644
--- a/rtos/pmsis/pmsis_bsp/src.mk
+++ b/rtos/pmsis/pmsis_bsp/src.mk
@@ -22,6 +22,9 @@ BSP_HIMAX_SRC = camera/himax/himax.c
 BSP_HM0360_SRC = camera/hm0360/hm0360.c
 BSP_BLE_NINA_B112_SRC= ble/ble.c ble/nina_b112/nina_b112.c ble/nina_b112/nina_b112_old.c
 BSP_AK4332_SRC = audio/dac/ak4332.c
+BSP_TLV320_SRC = audio/adc/tlv320.c
+BSP_FXL6408_SRC = gpio/fxl6408.c
+BSP_ADC_ADS1014_SRC = adc/ads1014.c
 
 COMMON_SRC = \
   $(BSP_FLASH_SRC) \
@@ -57,10 +60,8 @@ GAP9_SRC = \
   $(BSP_HIMAX_SRC) \
   $(BSP_HYPERFLASH_SRC) \
   $(BSP_HYPERRAM_SRC) \
-  $(BSP_RAM_SRC) \
   $(BSP_MRAM_SRC) \
   $(BSP_OSPI_FLASH_SRC) \
-  $(BSP_OSPI_RAM_SRC) \
   $(BSP_BLE_NINA_B112_SRC)
 
 WOLFE_SRC = \
@@ -105,6 +106,11 @@ AI_DECK_SRC = \
   $(BSP_SPIFLASH_SRC) \
   $(BSP_RAM_SRC)
 
+GAP9_EVK_SRC = \
+  $(COMMON_SRC) \
+  $(BSP_MRAM_SRC) \
+  bsp/gap9_evk.c
+
 GAPOC_A_SRC = \
   $(COMMON_SRC) \
   bsp/gapoc_a.c \
@@ -152,3 +158,66 @@ GAPOC_B_SRC = \
   camera/ov5640/ov5640.c
 endif				# TARGET_CHIP
 
+ifeq '$(BOARD_NAME)' 'gap9_evk'
+# Board devices: MX25U51245G flash, APS25XXXN RAM, and the UART I/O settings
+CONFIG_MX25U51245G=1
+CONFIG_APS25XXXN=1
+CONFIG_IO_UART_ITF=1
+CONFIG_IO_UART_BAUDRATE=115200
+endif
+
+ifeq '$(BOARD_NAME)' 'gap9_v2'
+# Board devices: ATXP032 flash, HyperFlash, HyperRAM and APS25XXXN RAM
+CONFIG_ATXP032=1
+CONFIG_HYPERFLASH=1
+CONFIG_HYPERRAM=1
+CONFIG_APS25XXXN=1
+endif
+
+ifneq (,$(findstring audio_addon,$(BOARD_FEATURES)))  # findstring takes (needle,haystack)
+  PMSIS_BSP_SRC += $(BSP_GAP9_EVK_AUDIO_ADDON)
+  PMSIS_BSP_CFLAGS += -DCONFIG_GAP9_EVK_AUDIO_ADDON=1
+endif  # audio_addon requested through BOARD_FEATURES
+
+CONFIG_OCTOSPI = 1
+ifeq '$(CONFIG_AK4332)' '1'
+PMSIS_BSP_SRC += $(BSP_AK4332_SRC)
+CONFIG_FXL6408 = 1
+CONFIG_I2C = 1
+PMSIS_BSP_CFLAGS += -DCONFIG_AK4332=1
+endif
+
+ifeq '$(CONFIG_TLV320)' '1'
+PMSIS_BSP_SRC += $(BSP_TLV320_SRC)
+CONFIG_FXL6408 = 1
+CONFIG_I2C = 1
+PMSIS_BSP_CFLAGS += -DCONFIG_TLV320=1
+endif
+
+ifeq '$(CONFIG_FXL6408)' '1'
+PMSIS_BSP_SRC += $(BSP_FXL6408_SRC)
+PMSIS_BSP_CFLAGS += -DCONFIG_FXL6408=1
+CONFIG_I2C = 1
+endif
+
+ifeq '$(CONFIG_MX25U51245G)' '1'
+PMSIS_BSP_SRC += flash/spiflash/mx25u51245g.c
+CONFIG_FLASH = 1
+CONFIG_OCTOSPI = 1
+endif
+
+ifeq '$(CONFIG_APS25XXXN)' '1'
+PMSIS_BSP_SRC += ram/spiram/aps25xxxn.c
+CONFIG_RAM = 1
+CONFIG_OCTOSPI = 1
+endif
+
+ifeq '$(CONFIG_RAM)' '1'
+PMSIS_BSP_SRC += $(BSP_RAM_SRC)
+CONFIG_BSP = 1
+endif
+
+ifeq '$(CONFIG_ADS1014)' '1'
+PMSIS_BSP_SRC += $(BSP_ADC_ADS1014_SRC)
+PMSIS_BSP_CFLAGS += -DCONFIG_ADS1014=1
+endif
diff --git a/rtos/pmsis/pmsis_implem/CMakeLists.txt b/rtos/pmsis/pmsis_implem/CMakeLists.txt
index c7876bb83..f4eb2da9d 100644
--- a/rtos/pmsis/pmsis_implem/CMakeLists.txt
+++ b/rtos/pmsis/pmsis_implem/CMakeLists.txt
@@ -1,7 +1,14 @@
 # Driver sources
 LIST(APPEND PMSIS_SRC
     chips/gap9/drivers/i2s/i2s.c
+    chips/gap9/drivers/spim/spim.c
     chips/gap9/drivers/udma/udma_core.c
+    chips/gap9/drivers/udma/udma_datamove.c
+    chips/gap9/drivers/udma/udma_ffc.c
+    chips/gap9/drivers/udma/udma_timeout.c
+    chips/gap9/drivers/udma/udma_timestamp.c
+    chips/gap9/drivers/i2c/i2c.c
+    chips/gap9/drivers/i2c/i2c_slave.c
     )
 
 add_library(pmsis_implem STATIC ${PMSIS_SRC})
diff --git a/rtos/pmsis/pmsis_implem/chips/gap9/drivers/i2c/i2c.c b/rtos/pmsis/pmsis_implem/chips/gap9/drivers/i2c/i2c.c
new file mode 100644
index 000000000..566c8c84a
--- /dev/null
+++ b/rtos/pmsis/pmsis_implem/chips/gap9/drivers/i2c/i2c.c
@@ -0,0 +1,893 @@
+/*
+ * Copyright (C) 2022 GreenWaves Technologies
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "pmsis.h"
+#include "pmsis/drivers/i2c.h"
+#include "i2c_internal.h"
+
+
+//#define USE_TIMEOUT 1
+
+
+// Per-interface driver state, indexed by I2C interface number. An entry is
+// filled in by pi_i2c_open() on first open of that interface and set back to
+// NULL by pi_i2c_close() when the last device on it is closed.
+i2c_itf_data_t *__pi_i2c_itf_data[UDMA_NB_I2C];
+
+
+/*
+ * Forward declarations of the command-buffer builders defined further down.
+ * Each one fills the given buffer with uDMA I2C commands and returns the
+ * number of 32-bit command words written.
+ */
+static int __pi_i2c_prepare_write_cmd_buf(i2c_slave_data_t *slave_data, uint32_t *write_buffer,
+    int size, pi_i2c_xfer_flags_e flags);
+
+static int __pi_i2c_prepare_read_cmd_buf(i2c_slave_data_t *slave_data, uint32_t *write_buffer,
+    int size, pi_i2c_xfer_flags_e flags);
+
+static int __pi_i2c_prepare_write_read_buf(i2c_slave_data_t *slave_data,
+    uint32_t *buffer, int size0, int size1);
+
+static int __pi_i2c_prepare_write_dual_buf(i2c_slave_data_t *slave_data,
+    uint32_t *buffer, int size0, int size1);
+
+
+/**
+ * \brief Start a write transfer on the hardware.
+ *
+ * Builds the command sequence in the interface command buffer, then enqueues
+ * the payload on the TX channel and the commands on the CMD channel.
+ *
+ * \param slave_data  Target device state (address, timing, interface).
+ * \param buffer      Payload to send.
+ * \param size        Payload size in bytes.
+ * \param flags       Start/stop generation flags.
+ * \param task        Task describing the request (only read when
+ *                    USE_TIMEOUT is defined).
+ */
+static void __pi_i2c_write_exec(i2c_slave_data_t *slave_data, uint8_t *buffer, int size,
+    pi_i2c_xfer_flags_e flags, pi_task_t *task)
+{
+    i2c_itf_data_t *itf_data = slave_data->itf_data;
+
+#if defined(USE_TIMEOUT)
+    // TODO this does not take into account errors. Timeout should be restarted with proper time
+    uint32_t timeout_us = task->timeout;
+    if (timeout_us)
+    {
+        __pi_i2c_timeout_config_set(task, slave_data->itf_data->tx_timeout_id,
+                                    slave_data->itf_data->tx_chan_id, timeout_us,
+                                    __pi_i2c_timeout_abort, slave_data->itf_data);
+    }
+#endif
+    int cmd_buf_size = __pi_i2c_prepare_write_cmd_buf(slave_data, itf_data->cmd_buf,
+        size, flags);
+    // data first, commands last
+    pi_udma_core_lin_enqueue(itf_data->tx_chan_addr, (uint32_t)buffer, size, 0);
+    pi_udma_core_lin_enqueue(itf_data->cmd_chan_addr, (uint32_t)itf_data->cmd_buf,
+            cmd_buf_size*sizeof(uint32_t), 0);
+}
+
+
+/**
+ * \brief Start a read transfer on the hardware.
+ *
+ * Builds the command sequence in the interface command buffer, then enqueues
+ * the destination buffer on the RX channel and the commands on the CMD
+ * channel.
+ *
+ * \param slave_data  Target device state (address, timing, interface).
+ * \param buffer      Destination buffer.
+ * \param size        Number of bytes to read.
+ * \param flags       Start/stop generation flags.
+ * \param task        Task describing the request (only read when
+ *                    USE_TIMEOUT is defined).
+ */
+static void __pi_i2c_read_exec(i2c_slave_data_t *slave_data, uint8_t *buffer, int size,
+    pi_i2c_xfer_flags_e flags, pi_task_t *task)
+{
+    i2c_itf_data_t *itf_data = slave_data->itf_data;
+
+#if defined(USE_TIMEOUT)
+    uint32_t timeout_us = task->timeout;
+    if (timeout_us)
+    {
+        __pi_i2c_timeout_config_set(task, slave_data->itf_data->rx_timeout_id,
+                                    slave_data->itf_data->rx_chan_id, timeout_us,
+                                    __pi_i2c_timeout_abort, slave_data->itf_data);
+    }
+#endif
+    int cmd_buf_size = __pi_i2c_prepare_read_cmd_buf(slave_data, itf_data->cmd_buf, size,
+        flags);
+    // destination buffer first, commands last
+    pi_udma_core_lin_enqueue(itf_data->rx_chan_addr, (uint32_t)buffer, size, 0);
+    pi_udma_core_lin_enqueue(itf_data->cmd_chan_addr, (uint32_t)itf_data->cmd_buf,
+            cmd_buf_size*sizeof(uint32_t), 0);
+}
+
+
+/**
+ * \brief Start a combined write-then-read transfer on the hardware.
+ *
+ * Enqueues the TX and RX data buffers, builds the command sequence and
+ * finally enqueues it on the CMD channel.
+ *
+ * \param slave_data  Target device state.
+ * \param tx_buffer   Bytes to write.
+ * \param rx_buffer   Destination for the bytes read back.
+ * \param tx_size     Number of bytes to write.
+ * \param rx_size     Number of bytes to read.
+ * \param task        Task describing the request (not used here).
+ */
+static void __pi_i2c_write_read_exec(i2c_slave_data_t *slave_data, void *tx_buffer,
+    void *rx_buffer, uint32_t tx_size, uint32_t rx_size, pi_task_t *task)
+{
+    i2c_itf_data_t *itf_data = slave_data->itf_data;
+
+    pi_udma_core_lin_enqueue(itf_data->tx_chan_addr, (uint32_t)tx_buffer, tx_size, 0);
+    pi_udma_core_lin_enqueue(itf_data->rx_chan_addr, (uint32_t)rx_buffer, rx_size, 0);
+
+    int cmd_buf_size = __pi_i2c_prepare_write_read_buf(slave_data, itf_data->cmd_buf,
+        tx_size, rx_size);
+    pi_udma_core_lin_enqueue(itf_data->cmd_chan_addr, (uint32_t)itf_data->cmd_buf,
+        sizeof(i2c_cmd_t)*cmd_buf_size, 0);
+}
+
+/**
+ * \brief Start a write of two buffers back to back in a single transfer.
+ *
+ * Both buffers are enqueued on the TX channel in order, then the command
+ * sequence is built and enqueued on the CMD channel.
+ *
+ * \param slave_data  Target device state.
+ * \param tx_buffer0  First buffer to send.
+ * \param tx_buffer1  Second buffer to send.
+ * \param tx_size0    Size of the first buffer in bytes.
+ * \param tx_size1    Size of the second buffer in bytes.
+ * \param task        Task describing the request (not used here).
+ */
+static void __pi_i2c_write_dual_exec(i2c_slave_data_t *slave_data, void *tx_buffer0,
+    void *tx_buffer1, uint32_t tx_size0, uint32_t tx_size1, pi_task_t *task)
+{
+    i2c_itf_data_t *itf_data = slave_data->itf_data;
+
+    pi_udma_core_lin_enqueue(itf_data->tx_chan_addr, (uint32_t)tx_buffer0, tx_size0, 0);
+    pi_udma_core_lin_enqueue(itf_data->tx_chan_addr, (uint32_t)tx_buffer1, tx_size1, 0);
+    int cmd_buf_size = __pi_i2c_prepare_write_dual_buf(slave_data, itf_data->cmd_buf,
+        tx_size0, tx_size1);
+    pi_udma_core_lin_enqueue(itf_data->cmd_chan_addr, (uint32_t)itf_data->cmd_buf,
+            sizeof(i2c_cmd_t)*cmd_buf_size, 0);
+}
+
+
+/**
+ * \brief Launch the transfer described by a task.
+ *
+ * The request was recorded in task->data[] by the pi_i2c_*_async() entry
+ * points: data[0] is the operation, data[3] the device, and the remaining
+ * slots hold buffers, sizes and flags depending on the operation.
+ *
+ * \param itf_data  Interface state (unused, kept for the call signature).
+ * \param task      Task holding the request parameters.
+ */
+static inline void __pi_i2c_send_request_from_irq(i2c_itf_data_t* itf_data, pi_task_t* task)
+{
+    pi_device_t *device = (pi_device_t *)task->data[3];
+    i2c_slave_data_t *slave_data = (i2c_slave_data_t *)device->data;
+
+    switch (task->data[0])
+    {
+    case I2C_WRITE:
+        // data[1]=buffer, data[2]=size, data[4]=flags
+        __pi_i2c_write_exec(slave_data, (void*) task->data[1], task->data[2], task->data[4], task);
+        break;
+
+    case I2C_READ:
+        // data[1]=buffer, data[2]=size, data[4]=flags
+        __pi_i2c_read_exec(slave_data, (void*) task->data[1], task->data[2], task->data[4], task);
+        break;
+
+    case I2C_WRITE_READ:
+        // data[1]=tx buffer, data[2]=rx buffer, data[4]=tx size, data[5]=rx size
+        __pi_i2c_write_read_exec(slave_data, (void*)task->data[1], (void *)task->data[2],
+            task->data[4], task->data[5], task);
+        break;
+
+    case I2C_WRITE_DUAL:
+        // data[1]/data[2]=tx buffers, data[4]/data[5]=their sizes
+        __pi_i2c_write_dual_exec(slave_data, (void*)task->data[1], (void *)task->data[2],
+            task->data[4], task->data[5], task);
+        break;
+
+    default:
+        // unknown operation: nothing is started
+        break;
+    }
+}
+
+/**
+ * \brief Handle a leader event that was not a normal end of command.
+ *
+ * Reads the NACK, arbitration-loss and framing error flags. If any is set,
+ * the three uDMA channels are stopped and the flags are cleared together
+ * with an unlock and purge request. A NACK completes the current task with
+ * PI_ERR_I2C_NACK and then starts the next queued request, if any; an
+ * arbitration loss or framing error restarts the current request.
+ *
+ * \param device_id  Interface number, used for traces only.
+ * \param itf_data   Interface state.
+ */
+static inline void __pi_i2c_handle_error(int device_id, i2c_itf_data_t* itf_data)
+{
+    I2C_TRACE("I2C(%d)->lead_error_handler\n", device_id);
+    uint32_t nack = __pi_i2c_get_event_status(itf_data->base, I2C_STATUS_ERROR_NACK_EVENT);
+    uint32_t arlo = __pi_i2c_get_event_status(itf_data->base, I2C_STATUS_ERROR_ARLO_EVENT);
+    uint32_t framing = __pi_i2c_get_event_status(itf_data->base, I2C_STATUS_ERROR_FRAMING_EVENT);
+    if (nack || arlo || framing)
+    {
+        // 1) stop cmd, rx and tx leader udma addr gen
+        pi_udma_core_lin_stop(itf_data->rx_chan_addr);
+        pi_udma_core_lin_stop(itf_data->tx_chan_addr);
+        pi_udma_core_lin_stop(itf_data->cmd_chan_addr);
+
+        // 2) clear event, unlock and purge
+        // NOTE(review): the shifts assume __pi_i2c_get_event_status returns 0 or 1 - confirm
+        udma_i2c_status_reg_idx_set(itf_data->base,
+                (nack << I2C_STATUS_ERROR_NACK_EVENT) |
+                (arlo << I2C_STATUS_ERROR_ARLO_EVENT) |
+                (framing << I2C_STATUS_ERROR_FRAMING_EVENT) |
+                (1 << I2C_FLAG_UNLOCK_EVENT_O) |
+                (1 << I2C_FLAG_PURGE_EVENT_O));
+    }
+
+    // 3) depends on error:
+    if (nack)
+    {
+        I2C_TRACE_ERR("I2C(%d)->lead_error_handler - nack error\n", device_id);
+        // NACK => report error
+        itf_data->end_task->data[0] = PI_ERR_I2C_NACK;
+        itf_data->end_task->arg[3] = 0;
+        pi_task_push_irq_safe(itf_data->end_task);
+        itf_data->end_task = NULL;
+
+        // The failed request is done: start the next queued one, otherwise
+        // requests already waiting in the FIFO would never be launched.
+        pi_task_t *next_task = __pi_i2c_drv_fifo_pop(itf_data);
+        if (next_task)
+        {
+            __pi_i2c_send_request_from_irq(itf_data, next_task);
+            itf_data->end_task = next_task;
+        }
+    }
+    else if (arlo || framing)
+    {
+        I2C_TRACE_ERR("I2C(%d)->lead_error_handler - arbitration loss or framing error\n", device_id);
+        // ARLO and FRAMING error => restart current CMD buffer
+        __pi_i2c_send_request_from_irq(itf_data, itf_data->end_task);
+    }
+}
+
+/**
+ * \brief Event handler registered for the I2C leader event of one interface.
+ *
+ * If the command-event flag is set, the current task is completed with
+ * PI_OK, the flag is cleared and the next queued request (if any) is
+ * started. Otherwise the event is passed to the error handler.
+ *
+ * \param event  Event number (unused).
+ * \param arg    Interface state (i2c_itf_data_t *) given at registration.
+ */
+__attribute__((section(".text"))) __noinline
+void __pi_i2c_lead_event_handler(uint32_t event, void *arg)
+{
+    i2c_itf_data_t *itf_data = arg;
+    int device_id = itf_data->id;
+    I2C_TRACE("I2C(%d)->lead_event_handler\n", device_id);
+
+    // The command event is emitted by the I2C_CMD_EVENT placed at the end of
+    // every command buffer, so it marks the end of the transfer.
+    if(__pi_i2c_get_event_status(itf_data->base,I2C_FLAG_CMD_EVENT_I))
+    {
+        // set the return status to OK
+        itf_data->end_task->data[0] = PI_OK;
+        itf_data->end_task->arg[3] = 0;
+        // notify whoever waits on the task
+        pi_task_push_irq_safe(itf_data->end_task);
+        itf_data->end_task = NULL;
+        // acknowledge the command event
+        udma_i2c_status_reg_idx_set(itf_data->base, 1<<I2C_FLAG_CMD_EVENT_I);
+    }
+    else
+    {
+        // There was an error
+        __pi_i2c_handle_error(device_id, itf_data);
+        return;
+    }
+    // bus is free again: launch the next pending request, if any
+    pi_task_t *next_task = __pi_i2c_drv_fifo_pop(itf_data);
+    if(next_task)
+    {
+        __pi_i2c_send_request_from_irq(itf_data, next_task);
+        itf_data->end_task = next_task;
+    }
+    return;
+}
+
+/**
+ * \brief internal helper function for preparing write command buffer
+ *
+ * Sequence: [timing cfg + START] address, RPT(size) SEND, [STOP],
+ * [RPT(wait_cycles) WAIT], EVENT.
+ *
+ * \param slave_data  Target device state; slave_addr is used exactly as
+ *                    stored by pi_i2c_open() (masked for 7 bits, already
+ *                    encoded for 10 bits).
+ * \param buffer      Command buffer to fill.
+ * \param size        Number of payload bytes to send.
+ * \param flags       PI_I2C_XFER_* flags controlling start/stop generation.
+ *
+ * \return Number of command words written to buffer.
+ */
+static int __pi_i2c_prepare_write_cmd_buf(i2c_slave_data_t *slave_data, uint32_t *buffer,
+    int size, pi_i2c_xfer_flags_e flags)
+{
+    int index = 0;
+
+    if(!(flags & PI_I2C_XFER_NO_START) || (flags & PI_I2C_XFER_RESTART))
+    {   // generate a start condition
+        buffer[index++]  = slave_data->cfg;
+        buffer[index++]  = I2C_CMD_LEAD_START(1);
+    }
+    // slave addr, no rnw bit since it's a write.
+    // The address must not be masked and encoded again here: in 10-bit mode
+    // pi_i2c_open() already stored the two-byte form, and running the
+    // encoding a second time on that value produces a different address.
+    buffer[index++] = I2C_CMD_LEAD_SEND_IMM_ADDR(slave_data->is_10_bits ? 1 : 0,
+            slave_data->slave_addr);
+
+    buffer[index++]  = I2C_CMD_RPT(size);
+    buffer[index++]  = I2C_CMD_MISC_SEND(1);
+    if(!(flags & PI_I2C_XFER_NO_STOP))
+    {
+        buffer[index++] = I2C_CMD_STOP(1);
+    }
+
+    if(slave_data->wait_cycles)
+    {
+        buffer[index++] = I2C_CMD_RPT(slave_data->wait_cycles);
+        buffer[index++] = I2C_CMD_MISC_WAIT(1);
+    }
+
+    // raise the command event that signals end of transfer to the handler
+    buffer[index++] = I2C_CMD_EVENT(1);
+
+    return index;
+}
+
+/**
+ * \brief internal helper function for preparing read command buffer
+ *
+ * Sequence: [timing cfg + START] address, RPT(size - 1) RECEIVE,
+ * RECEIVE_LAST, [STOP], [RPT(wait_cycles) WAIT], EVENT.
+ *
+ * \param slave_data  Target device state; slave_addr / slave_addrh are used
+ *                    exactly as stored by pi_i2c_open().
+ * \param buffer      Command buffer to fill.
+ * \param size        Number of bytes to read.
+ * \param flags       PI_I2C_XFER_* flags controlling start/stop generation.
+ *
+ * \return Number of command words written to buffer.
+ */
+static inline int __pi_i2c_prepare_read_cmd_buf(i2c_slave_data_t *slave_data, uint32_t *buffer,
+    int size, pi_i2c_xfer_flags_e flags)
+{
+    int index = 0;
+
+    if(!(flags & PI_I2C_XFER_NO_START) || (flags & PI_I2C_XFER_RESTART))
+    {
+        buffer[index++]  = slave_data->cfg;
+        buffer[index++]  = I2C_CMD_LEAD_START(1);
+    }
+
+    // slave addr + rnw bit
+    if(!slave_data->is_10_bits)
+    {
+        buffer[index++]  = I2C_CMD_LEAD_SEND_IMM_ADDR(0, slave_data->slave_addr|1);
+    }
+    else
+    {   // for 10 bits, need to use write mode first.
+        // Use the form pi_i2c_open() already encoded; encoding it a second
+        // time would yield a different address.
+        buffer[index++] = I2C_CMD_LEAD_SEND_IMM_ADDR(1, slave_data->slave_addr);
+        buffer[index++] = I2C_CMD_LEAD_START(1);
+        buffer[index++] = I2C_CMD_LEAD_SEND_IMM(slave_data->slave_addrh|1);
+    }
+    // receive -1 byte because there is a "last"
+    buffer[index++] = I2C_CMD_RPT(size - 1);
+    buffer[index++] = I2C_CMD_MISC_RECEIVE(1);
+    buffer[index++] = I2C_CMD_MISC_RECEIVE_LAST(1);
+
+    if(!(flags & PI_I2C_XFER_NO_STOP))
+    {
+        buffer[index++] = I2C_CMD_STOP(1);
+    }
+
+    if(slave_data->wait_cycles)
+    {
+        buffer[index++] = I2C_CMD_RPT(slave_data->wait_cycles);
+        buffer[index++] = I2C_CMD_MISC_WAIT(1);
+    }
+    // raise the command event that signals end of transfer to the handler
+    buffer[index++] = I2C_CMD_EVENT(1);
+
+    return index;
+}
+
+
+/**
+ * \brief internal helper function for preparing write&read command buffer
+ *
+ * Sequence: timing cfg, START, address (write), RPT(size0) SEND, repeated
+ * START, address (read), RPT(size1 - 1) RECEIVE, RECEIVE_LAST, STOP, EVENT.
+ *
+ * \param slave_data  Target device state.
+ * \param buffer      Command buffer to fill.
+ * \param size0       Number of bytes to write.
+ * \param size1       Number of bytes to read.
+ *
+ * \return Number of command words written to buffer.
+ */
+static int __pi_i2c_prepare_write_read_buf(i2c_slave_data_t *slave_data,
+    uint32_t *buffer, int size0, int size1)
+{
+    int index = 0;
+
+    buffer[index++]  = slave_data->cfg;
+    buffer[index++]  = I2C_CMD_LEAD_START(1);
+    // slave addr, no rnw bit since it's a write; the 10-bit flag must follow
+    // the device mode because slave_addr holds the two-byte form in that case
+    buffer[index++] = I2C_CMD_LEAD_SEND_IMM_ADDR(slave_data->is_10_bits ? 1 : 0,
+            slave_data->slave_addr);
+    buffer[index++] = I2C_CMD_RPT(size0);
+    buffer[index++] = I2C_CMD_MISC_SEND(1);
+    buffer[index++] = I2C_CMD_LEAD_START(1);
+    // slave addr + rnw bit
+    if(!slave_data->is_10_bits)
+    {
+        buffer[index++]  = I2C_CMD_LEAD_SEND_IMM_ADDR(0, slave_data->slave_addr|1);
+    }
+    else
+    {// for 10 bits, need to use write mode first
+        buffer[index++] = I2C_CMD_LEAD_SEND_IMM_ADDR(1, slave_data->slave_addr);
+        buffer[index++] = I2C_CMD_LEAD_START(1);
+        buffer[index++] = I2C_CMD_LEAD_SEND_IMM(slave_data->slave_addrh|1);
+    }
+    // receive -1 byte because there is a "last": RECEIVE_LAST below fetches
+    // the final byte, so only size1 - 1 plain receives are needed to fill the
+    // size1-byte RX buffer (same scheme as the plain read builder)
+    buffer[index++] = I2C_CMD_RPT(size1 - 1);
+    buffer[index++] = I2C_CMD_MISC_RECEIVE(1);
+    buffer[index++] = I2C_CMD_MISC_RECEIVE_LAST(1);
+    buffer[index++] = I2C_CMD_STOP(1);
+    // raise the command event that signals end of transfer to the handler
+    buffer[index++] = I2C_CMD_EVENT(1);
+
+    return index;
+}
+
+
+/**
+ * \brief internal helper function for preparing dual write command buffer
+ *
+ * Sequence: timing cfg, START, address (write), RPT(size0) SEND,
+ * RPT(size1) SEND, STOP, EVENT. Both payloads go out in one transfer.
+ *
+ * \param slave_data  Target device state.
+ * \param buffer      Command buffer to fill.
+ * \param size0       Size in bytes of the first payload.
+ * \param size1       Size in bytes of the second payload.
+ *
+ * \return Number of command words written to buffer.
+ */
+static int __pi_i2c_prepare_write_dual_buf(i2c_slave_data_t *slave_data,
+    uint32_t *buffer, int size0, int size1)
+{
+    int index = 0;
+
+    buffer[index++]  = slave_data->cfg;
+    buffer[index++]  = I2C_CMD_LEAD_START(1);
+    // slave addr, no rnw bit since it's a write; the 10-bit flag must follow
+    // the device mode because slave_addr holds the two-byte form in that case
+    buffer[index++] = I2C_CMD_LEAD_SEND_IMM_ADDR(slave_data->is_10_bits ? 1 : 0,
+            slave_data->slave_addr);
+    buffer[index++] = I2C_CMD_RPT(size0);
+    buffer[index++] = I2C_CMD_MISC_SEND(1);
+    buffer[index++] = I2C_CMD_RPT(size1);
+    buffer[index++] = I2C_CMD_MISC_SEND(1);
+    buffer[index++] = I2C_CMD_STOP(1);
+    // raise the command event that signals end of transfer to the handler
+    buffer[index++] = I2C_CMD_EVENT(1);
+
+    return index;
+}
+
+
+/**
+ * \brief Route the RX or TX channel event of an interface to a uDMA event slot.
+ *
+ * Only does something when built for FreeRTOS; otherwise it is a no-op.
+ *
+ * \param itf_data  Interface whose channel event is routed.
+ * \param conf      Configuration: ts_ch selects RX (non-zero) or TX (zero),
+ *                  ts_evt_id selects the 8-bit slot in the event register.
+ */
+static void __pi_i2c_timestamp_enable(i2c_itf_data_t *itf_data, struct pi_i2c_conf *conf)
+{
+#if defined(__FREERTOS__)
+    uint8_t is_rx = conf->ts_ch;
+
+    uint32_t base = UDMA_CTRL_ADDR;
+    uint8_t evt_id = conf->ts_evt_id;
+    uint8_t soc_evt = is_rx ? SOC_EVENT_UDMA_CHAN_LIN(itf_data->rx_chan_id) : SOC_EVENT_UDMA_CHAN_LIN(itf_data->tx_chan_id);
+
+    // Shift as unsigned 32-bit: with plain int operands, slot 3 (shift by 24)
+    // would shift into the sign bit, which is undefined behaviour.
+    // NOTE(review): evt_id is assumed to be 0..3 - confirm against callers.
+    uint32_t shift = (uint32_t)evt_id * 8;
+    uint32_t cfg_evt_val = (udma_ctrl_cfg_event_get(base) & ~((uint32_t)0xFF << shift))
+                         | ((uint32_t)soc_evt << shift);
+
+    pi_soc_eu_pr_mask_set(soc_evt);
+    udma_ctrl_cfg_event_set(base, cfg_evt_val);
+#endif
+}
+
+/*
+ * @brief Open an I2C device on an interface.
+ *
+ * Allocates the per-device state and, on the first open of an interface,
+ * the per-interface state: three uDMA linear channels (rx, tx, cmd), the
+ * peripheral reset/clock-gate setup, channel routing and the leader event
+ * handler. The whole function runs with interrupts disabled.
+ *
+ * @param device  Device whose config points to a struct pi_i2c_conf; on
+ *                success device->data is set to the new per-device state.
+ *
+ * @return 0 on success, -1 if an allocation failed (everything allocated so
+ *         far is released).
+ */
+int pi_i2c_open(pi_device_t *device)
+{
+    pi_assert((NULL != device) && (NULL != device->config));
+
+    i2c_slave_data_t *slave_data;
+    struct pi_i2c_conf *conf = device->config;
+    i2c_itf_data_t *itf_data = NULL;
+
+    // check interface first
+    I2C_TRACE("I2C(%d)->lead_open\n", conf->itf);
+    int irq = disable_irq();
+
+    slave_data = pi_fc_l1_malloc(sizeof(i2c_slave_data_t));
+    if (slave_data == NULL) goto error0;
+
+    // first open on this interface: create and initialise the interface state
+    // NOTE(review): conf->itf is used as an index without a range check
+    if(!(itf_data = __pi_i2c_itf_data[conf->itf]))
+    {
+        uint32_t i2c_base = UDMA_I2C_ADDR(conf->itf);
+
+        // prepare itf struct
+        itf_data = pi_fc_l1_malloc(sizeof(i2c_itf_data_t));
+        I2C_TRACE("I2C(%d)->itf_data=%x\n", conf->itf, itf_data);
+        if(itf_data == NULL) goto error1;
+
+        // one linear uDMA channel each for rx data, tx data and commands
+        itf_data->rx_chan_id = pi_udma_core_lin_alloc();
+        if (itf_data->rx_chan_id == -1) goto error2;
+        itf_data->tx_chan_id = pi_udma_core_lin_alloc();
+        if (itf_data->tx_chan_id == -1) goto error3;
+        itf_data->cmd_chan_id = pi_udma_core_lin_alloc();
+        if (itf_data->cmd_chan_id == -1) goto error4;
+
+        __pi_i2c_itf_data[conf->itf] = itf_data;
+        itf_data->id = conf->itf;
+        itf_data->fifo_head = NULL;
+        itf_data->end_task = NULL;
+        itf_data->base = i2c_base;
+        itf_data->open_nb = 0;
+
+        // disable udma reset before setting regs
+        udma_ctrl_cfg_rstn_set_set(UDMA_CTRL_ADDR, 1 << UDMA_I2C_ID(conf->itf));
+        udma_ctrl_cfg_cg_set_set(UDMA_CTRL_ADDR, 1 << UDMA_I2C_ID(conf->itf));
+
+        // tell the peripheral which channels carry rx, tx and commands
+        udma_i2c_lead_udma_rx_dest_reg_idx_set(i2c_base, itf_data->rx_chan_id);
+        udma_i2c_lead_udma_tx_dest_reg_idx_set(i2c_base, itf_data->tx_chan_id);
+        udma_i2c_udma_cmd_dest_reg_idx_set(i2c_base, itf_data->cmd_chan_id);
+
+        // cache the channel addresses used by every enqueue
+        itf_data->rx_chan_addr = pi_udma_core_lin_addr_get(itf_data->rx_chan_id);
+        itf_data->tx_chan_addr = pi_udma_core_lin_addr_get(itf_data->tx_chan_id);
+        itf_data->cmd_chan_addr = pi_udma_core_lin_addr_get(itf_data->cmd_chan_id);
+
+#if defined(USE_TIMEOUT)
+        // 0xFF = no timeout channel attached
+        itf_data->rx_timeout_id = 0xFF;
+        itf_data->tx_timeout_id = 0xFF;
+#endif
+
+        udma_i2c_status_reg_idx_set(i2c_base, 1<<I2C_FLAG_PRESC_DIV10_EVENT_O);
+        // route the leader event of this interface to the driver handler
+        pi_fc_event_handler_set(SOC_EVENT_UDMA_I2C_LEAD_EVT(conf->itf),
+                __pi_i2c_lead_event_handler, itf_data);
+        pi_soc_eu_fc_mask_set(SOC_EVENT_UDMA_I2C_LEAD_EVT(conf->itf));
+    }
+
+    // per-device state
+    slave_data->itf_data = itf_data;
+    slave_data->is_10_bits = conf->is_10_bits;
+    slave_data->wait_cycles = conf->wait_cycles;
+    slave_data->cfg = __i2c_prepare_timing(conf->max_baudrate, pi_freq_get(PI_FREQ_DOMAIN_PERIPH));
+
+    if (conf->is_10_bits)
+    {
+        // store the 10-bit address in its two-byte form: high byte is 0xF0
+        // plus the upper address bits, low byte is address bits 7..0
+        uint16_t slave_addr = conf->cs & 0x3FF;
+        uint16_t slave_addrh = (((slave_addr>>7)|0)&0x7) | 0xF0;
+        uint16_t slave_addrl = (slave_addr&0xFF);
+        slave_data->slave_addr = (slave_addrh << 8) | slave_addrl;
+        slave_data->slave_addrh = slave_addrh;
+    }
+    else
+    {
+        // 7-bit address with the read/write bit cleared
+        slave_data->slave_addr = conf->cs & 0xFE;
+    }
+
+    device->data = (void *)slave_data;
+
+    itf_data->open_nb++;
+    restore_irq(irq);
+    return 0;
+
+    // error unwinding: each label releases what was acquired before the failure
+error4:
+    pi_udma_core_lin_free(itf_data->tx_chan_id);
+error3:
+    pi_udma_core_lin_free(itf_data->rx_chan_id);
+error2:
+    pi_fc_l1_free(itf_data, sizeof(i2c_itf_data_t));
+error1:
+    pi_fc_l1_free(slave_data, sizeof(i2c_slave_data_t));
+error0:
+    restore_irq(irq);
+    return -1;
+}
+
+
+/**
+ * \brief Close an I2C device.
+ *
+ * Frees the per-device state. When the last device on the interface is
+ * closed, the uDMA channels are released and reset, the events are masked,
+ * the peripheral is clock-gated and put back in reset, and the interface
+ * state is freed. Runs with interrupts disabled.
+ *
+ * \param device  Device previously opened with pi_i2c_open().
+ */
+void pi_i2c_close(pi_device_t *device)
+{
+    i2c_slave_data_t *slave_data = (i2c_slave_data_t *)device->data;
+
+    I2C_TRACE("I2C->i2c_close\n");
+    int irq = disable_irq();
+    // keep the interface pointer before the device state is freed
+    i2c_itf_data_t *itf_data = slave_data->itf_data;
+    pi_fc_l1_free(slave_data,sizeof(*slave_data));
+
+    itf_data->open_nb--;
+    if(itf_data->open_nb == 0)
+    {
+        // flush channels
+        // NOTE(review): channels are freed before being reset below - confirm
+        // this order is intended
+        pi_udma_core_lin_free(itf_data->rx_chan_id);
+        pi_udma_core_lin_free(itf_data->tx_chan_id);
+        pi_udma_core_lin_free(itf_data->cmd_chan_id);
+
+        pi_soc_eu_fc_mask_clear(SOC_EVENT_UDMA_CHAN_LIN(itf_data->rx_chan_id));
+        pi_soc_eu_fc_mask_clear(SOC_EVENT_UDMA_CHAN_LIN(itf_data->tx_chan_id));
+        pi_soc_eu_fc_mask_clear(SOC_EVENT_UDMA_CHAN_LIN(itf_data->cmd_chan_id));
+
+        pi_udma_core_lin_reset(itf_data->rx_chan_addr);
+        pi_udma_core_lin_reset(itf_data->tx_chan_addr);
+        pi_udma_core_lin_reset(itf_data->cmd_chan_addr);
+
+        // stop receiving the leader event for this interface
+        pi_soc_eu_fc_mask_clear(SOC_EVENT_UDMA_I2C_LEAD_EVT(itf_data->id));
+
+        // mirror of the setup done in pi_i2c_open()
+        udma_ctrl_cfg_cg_clr_set(UDMA_CTRL_ADDR, 1 << UDMA_I2C_ID(itf_data->id));
+        udma_ctrl_cfg_rstn_clr_set(UDMA_CTRL_ADDR, 1 << UDMA_I2C_ID(itf_data->id));
+        // the next open on this interface will start from scratch
+        __pi_i2c_itf_data[itf_data->id] = NULL;
+        pi_fc_l1_free(itf_data,sizeof(*itf_data));
+    }
+    restore_irq(irq);
+}
+
+
+/**
+ * \brief Placeholder for changing the baudrate of an open interface.
+ *
+ * Currently does nothing: the body is empty, so the
+ * PI_I2C_CTRL_SET_MAX_BAUDRATE ioctl has no effect.
+ */
+static void __pi_i2c_baudrate_set(i2c_itf_data_t *driver_data)
+{
+}
+
+
+#if defined(USE_TIMEOUT)
+/**
+ * \brief Abort the transfer in flight on an interface.
+ *
+ * Resets the cmd, tx and rx uDMA channels, records the number of bytes left
+ * in arg[3] of the aborted task (tx channel for a write, rx channel for a
+ * read), asks the peripheral to unlock, purge and soft-reset, then starts
+ * the next queued request if there is one.
+ *
+ * \param arg  Interface state (i2c_itf_data_t *).
+ */
+void __pi_i2c_timeout_abort(void* arg)
+{
+    i2c_itf_data_t *driver_data = (i2c_itf_data_t*) arg;
+    // may be NULL when the abort is requested while the bus is idle
+    pi_task_t *aborted_task = driver_data->end_task;
+    int irq = pi_irq_disable();
+    /* Stop UDMA channels. */
+    if (driver_data->cmd_chan_id != 0xFF)
+    {
+        // reset the command channel itself (not the rx channel)
+        pi_udma_core_lin_reset(driver_data->cmd_chan_addr);
+    }
+    if (driver_data->tx_chan_id != 0xFF)
+    {
+        if (aborted_task && (aborted_task->data[0] == I2C_WRITE))
+        {
+            // bytes left are read from the channel, like every other
+            // pi_udma_core_lin_* call, not from the peripheral base
+            aborted_task->arg[3] = pi_udma_core_lin_bytes_left_get(driver_data->tx_chan_addr);
+        }
+        pi_udma_core_lin_reset(driver_data->tx_chan_addr);
+    }
+    if (driver_data->rx_chan_id != 0xFF)
+    {
+        if (aborted_task && (aborted_task->data[0] == I2C_READ))
+        {
+            aborted_task->arg[3] = pi_udma_core_lin_bytes_left_get(driver_data->rx_chan_addr);
+        }
+        pi_udma_core_lin_reset(driver_data->rx_chan_addr);
+    }
+
+    /* Status events clear. */
+    uint32_t status_mask = (1 << UDMA_I2C_STATUS_REG_IDX_STATUS_LEAD_UNLOCK_EVENT_O_IDX_BIT |
+                            1 << UDMA_I2C_STATUS_REG_IDX_STATUS_LEAD_PURGE_EVENT_O_IDX_BIT |
+                            1 << UDMA_I2C_STATUS_REG_IDX_STATUS_I2C_SOFT_RESET_EVENT_O_IDX_BIT);
+    udma_i2c_status_reg_idx_set(driver_data->base, status_mask);
+
+    /* Pop current aborted task. */
+    // NOTE(review): the aborted task is not pushed here - presumably the
+    // timeout machinery completes it; confirm.
+    driver_data->end_task = NULL;
+    pi_task_t *next_task = __pi_i2c_drv_fifo_pop(driver_data);
+    if (next_task)
+    {
+        driver_data->end_task = next_task;
+        __pi_i2c_send_request_from_irq(driver_data, next_task);
+    }
+    pi_irq_restore(irq);
+}
+
+/**
+ * \brief Record the uDMA timeout channel id used for reads on an interface.
+ *
+ * \param driver_data  Interface state.
+ * \param timeout_id   Timeout channel id; the ioctl detach path passes 0xFF.
+ */
+static void __pi_i2c_udma_timeout_rx_set(i2c_itf_data_t *driver_data,
+                                         uint8_t timeout_id)
+{
+    driver_data->rx_timeout_id = timeout_id;
+}
+
+/**
+ * \brief Record the uDMA timeout channel id used for writes on an interface.
+ *
+ * \param driver_data  Interface state.
+ * \param timeout_id   Timeout channel id; the ioctl detach path passes 0xFF.
+ */
+static void __pi_i2c_udma_timeout_tx_set(i2c_itf_data_t *driver_data,
+                                         uint8_t timeout_id)
+{
+    driver_data->tx_timeout_id = timeout_id;
+}
+#endif
+
+/**
+ * \brief Control requests on an open I2C device.
+ *
+ * Runs with interrupts disabled. The abort and timeout commands only do
+ * something when the driver is built with USE_TIMEOUT; otherwise they are
+ * accepted and ignored. Unknown commands are ignored.
+ *
+ * \param device  Open I2C device.
+ * \param cmd     One of the PI_I2C_CTRL_* / PI_I2C_IOCTL_* commands.
+ * \param arg     Command argument: a timeout channel id for the ATTACH
+ *                commands, a struct pi_i2c_conf * for EN_TIMESTAMP.
+ */
+void pi_i2c_ioctl(struct pi_device *device, uint32_t cmd, void *arg)
+{
+    i2c_slave_data_t *slave_data = (i2c_slave_data_t *)device->data;
+
+    I2C_TRACE("I2C(%d) : ioctl cmd=%lx, arg=%lx\n", slave_data->itf_data->id, cmd, arg);
+
+    uint32_t irq = disable_irq();
+    // 0xFF means "no timeout channel"; used as-is by the DETACH commands
+    uint8_t udma_timeout_id = 0xFF;
+    switch (cmd)
+    {
+    case PI_I2C_CTRL_SET_MAX_BAUDRATE :
+        __pi_i2c_baudrate_set(slave_data->itf_data);
+        break;
+
+    case PI_I2C_IOCTL_ABORT_RX :
+        // the abort routine only exists when timeout support is compiled in
+#if defined(USE_TIMEOUT)
+        __pi_i2c_timeout_abort(slave_data->itf_data);
+#endif
+        break;
+
+    case PI_I2C_IOCTL_ABORT_TX :
+#if defined(USE_TIMEOUT)
+        __pi_i2c_timeout_abort(slave_data->itf_data);
+#endif
+        break;
+
+    case PI_I2C_IOCTL_ATTACH_TIMEOUT_RX :
+#if defined(USE_TIMEOUT)
+        udma_timeout_id = (uint32_t) arg;
+        __pi_i2c_udma_timeout_rx_set(slave_data->itf_data, udma_timeout_id);
+#endif
+        break;
+
+    case PI_I2C_IOCTL_DETACH_TIMEOUT_RX :
+#if defined(USE_TIMEOUT)
+        __pi_i2c_udma_timeout_rx_set(slave_data->itf_data, udma_timeout_id);
+#endif
+        break;
+
+    case PI_I2C_IOCTL_ATTACH_TIMEOUT_TX :
+#if defined(USE_TIMEOUT)
+        udma_timeout_id = (uint32_t) arg;
+        __pi_i2c_udma_timeout_tx_set(slave_data->itf_data, udma_timeout_id);
+#endif
+        break;
+
+    case PI_I2C_IOCTL_DETACH_TIMEOUT_TX :
+#if defined(USE_TIMEOUT)
+        __pi_i2c_udma_timeout_tx_set(slave_data->itf_data, udma_timeout_id);
+#endif
+        break;
+
+    case PI_I2C_IOCTL_EN_TIMESTAMP :
+        __pi_i2c_timestamp_enable(slave_data->itf_data, (struct pi_i2c_conf *) arg);
+        break;
+
+    default :
+        break;
+    }
+    restore_irq(irq);
+}
+
+
+/**
+ * \brief Fill an I2C configuration with default values.
+ *
+ * Defaults: interface 0, address 0 in 7-bit mode, 400 kHz maximum baudrate
+ * and no wait cycles.
+ *
+ * \param conf  Configuration to initialise; must not be NULL.
+ */
+void pi_i2c_conf_init(pi_i2c_conf_t *conf)
+{
+    pi_assert(NULL != conf);
+
+    /* which interface and device */
+    conf->itf = 0;
+    conf->cs = 0;
+    conf->is_10_bits = 0;
+
+    /* bus timing */
+    conf->max_baudrate = 400000;
+    conf->wait_cycles = 0;
+}
+
+/**
+ * \brief Set the number of wait cycles stored in a configuration.
+ *
+ * \param conf         Configuration to update; must not be NULL.
+ * \param wait_cycles  Wait cycle count; a non-zero value makes the command
+ *                     builders append a wait after each read or write.
+ */
+void pi_i2c_conf_set_wait_cycles(struct pi_i2c_conf *conf, uint16_t wait_cycles)
+{
+    pi_assert(conf != NULL);
+    conf->wait_cycles = wait_cycles;
+}
+
+/** accessors **/
+
+/**
+ * \brief Set the target device address stored in a configuration.
+ *
+ * \param conf        Configuration to update; must not be NULL.
+ * \param slave_addr  Device address.
+ * \param is_10_bits  Non-zero if slave_addr is a 10-bit address.
+ */
+void pi_i2c_conf_set_slave_addr(struct pi_i2c_conf *conf, uint16_t slave_addr,
+                                int8_t is_10_bits)
+{
+    pi_assert(conf != NULL);
+
+    conf->is_10_bits = is_10_bits;
+    conf->cs = slave_addr;
+}
+
+/**
+ * \brief Blocking write: issue the asynchronous write and wait for it.
+ *
+ * \param device   Open I2C device.
+ * \param tx_data  Bytes to send.
+ * \param length   Number of bytes to send.
+ * \param flags    Start/stop generation flags.
+ *
+ * \return Request status as reported by pi_i2c_get_request_status().
+ */
+int pi_i2c_write(struct pi_device *device, uint8_t *tx_data, int length, pi_i2c_xfer_flags_e flags)
+{
+    pi_task_t done;
+    int status;
+
+    pi_task_block(&done);
+    pi_i2c_write_async(device, (void*)tx_data, (uint32_t)length, flags, &done);
+    pi_task_wait_on(&done);
+
+    status = pi_i2c_get_request_status(&done);
+    return status;
+}
+
+/**
+ * \brief Blocking read: issue the asynchronous read and wait for it.
+ *
+ * \param device   Open I2C device.
+ * \param rx_buff  Destination buffer.
+ * \param length   Number of bytes to read.
+ * \param flags    Start/stop generation flags.
+ *
+ * \return Request status as reported by pi_i2c_get_request_status().
+ */
+int pi_i2c_read (struct pi_device *device, uint8_t *rx_buff, int length,
+                 pi_i2c_xfer_flags_e flags)
+{
+    pi_task_t done;
+    int status;
+
+    pi_task_block(&done);
+    pi_i2c_read_async(device, (void*)rx_buff, (uint32_t)length, flags, &done);
+    pi_task_wait_on(&done);
+
+    status = pi_i2c_get_request_status(&done);
+    return status;
+}
+
+/**
+ * \brief Asynchronous read.
+ *
+ * Records the request in task->data[] and either starts it immediately, if
+ * no transfer is in flight on the interface, or appends it to the interface
+ * FIFO. The task is pushed when the transfer completes.
+ *
+ * \param device  Open I2C device.
+ * \param buffer  Destination buffer (asserted to be in L2).
+ * \param size    Number of bytes to read; must not be 0.
+ * \param flags   Start/stop generation flags.
+ * \param task    Task notified at the end of the transfer.
+ */
+void pi_i2c_read_async(struct pi_device *device, uint8_t *buffer, int size,
+                       pi_i2c_xfer_flags_e flags, pi_task_t *task)
+{
+    // the asserts must name this function's own parameters
+    pi_assert(NULL != device);
+    pi_assert((NULL != buffer) && (IS_BUFF_IN_L2(buffer)));
+    pi_assert(0 != size);
+    pi_assert(NULL != task);
+
+    i2c_slave_data_t *slave_data = (i2c_slave_data_t *)device->data;
+
+    i2c_itf_data_t *itf_data = slave_data->itf_data;
+
+    // request description, replayed by __pi_i2c_send_request_from_irq()
+    task->data[0] = I2C_READ;
+    task->data[1] = (uintptr_t)buffer;
+    task->data[2] = (uintptr_t)size;
+    task->data[3] = (uintptr_t)device;
+    task->data[4] = (uintptr_t)flags;
+
+    int irq = disable_irq();
+    if (!itf_data->end_task)
+    {  // exec transfer
+        __pi_i2c_read_exec(slave_data, buffer, size, flags, task);
+        itf_data->end_task = task;
+    }
+    else
+    {
+        // bus busy: the event handler will start it later
+        __pi_i2c_drv_fifo_enqueue(itf_data, task);
+    }
+    restore_irq(irq);
+}
+
+/**
+ * \brief Asynchronous write.
+ *
+ * Records the request in task->data[] and either starts it immediately, if
+ * no transfer is in flight on the interface, or appends it to the interface
+ * FIFO. The task is pushed when the transfer completes.
+ *
+ * \param device  Open I2C device.
+ * \param buffer  Bytes to send (asserted to be in L2).
+ * \param size    Number of bytes to send; must not be 0.
+ * \param flags   Start/stop generation flags.
+ * \param task    Task notified at the end of the transfer.
+ */
+void pi_i2c_write_async(struct pi_device *device, uint8_t *buffer, int size,
+                        pi_i2c_xfer_flags_e flags, pi_task_t *task)
+{
+    // the asserts must name this function's own parameters
+    pi_assert(NULL != device);
+    pi_assert((NULL != buffer) && (IS_BUFF_IN_L2(buffer)));
+    pi_assert(0 != size);
+    pi_assert(NULL != task);
+
+    i2c_slave_data_t *slave_data = (i2c_slave_data_t *)device->data;
+
+    int irq = disable_irq();
+    i2c_itf_data_t *itf_data = slave_data->itf_data;
+
+    // request description, replayed by __pi_i2c_send_request_from_irq()
+    task->data[0] = I2C_WRITE;
+    task->data[1] = (uintptr_t)buffer;
+    task->data[2] = (uintptr_t)size;
+    task->data[3] = (uintptr_t)device;
+    task->data[4] = (uintptr_t)flags;
+
+    if(!itf_data->end_task)
+    {   // exec transfer
+        __pi_i2c_write_exec(slave_data, buffer, size, flags, task);
+        itf_data->end_task = task;
+    }
+    else
+    {
+        // bus busy: the event handler will start it later
+        __pi_i2c_drv_fifo_enqueue(itf_data, task);
+    }
+    restore_irq(irq);
+}
+
+/**
+ * \brief Blocking read with a timeout.
+ *
+ * The timeout is only armed when the driver is built with USE_TIMEOUT;
+ * otherwise this behaves as a plain blocking read.
+ *
+ * \param device      Open I2C device.
+ * \param rx_buff     Destination buffer (asserted to be in L2).
+ * \param length      Number of bytes to read; must not be 0.
+ * \param flags       Start/stop generation flags.
+ * \param timeout_us  Timeout in microseconds.
+ *
+ * \return Task status as reported by pi_task_status_get().
+ */
+int pi_i2c_read_timeout(struct pi_device *device, uint8_t *rx_buff, int length,
+                        pi_i2c_xfer_flags_e flags, uint32_t timeout_us)
+{
+    pi_assert(NULL != device);
+    pi_assert((NULL != rx_buff) && (IS_BUFF_IN_L2(rx_buff)));
+    pi_assert(0 != length);
+
+    pi_task_t done = {0};
+
+    pi_task_block(&done);
+#if defined(USE_TIMEOUT)
+    pi_task_timeout_set(&done, timeout_us);
+#endif
+    pi_i2c_read_async(device, rx_buff, length, flags, &done);
+    pi_task_wait_on(&done);
+
+    return pi_task_status_get(&done);
+}
+
+
+/**
+ * \brief Blocking write with a timeout.
+ *
+ * The timeout is only armed when the driver is built with USE_TIMEOUT;
+ * otherwise this behaves as a plain blocking write.
+ *
+ * \param device      Open I2C device.
+ * \param tx_data     Bytes to send (asserted to be in L2).
+ * \param length      Number of bytes to send; must not be 0.
+ * \param flags       Start/stop generation flags.
+ * \param timeout_us  Timeout in microseconds.
+ *
+ * \return Task status as reported by pi_task_status_get().
+ */
+int pi_i2c_write_timeout(struct pi_device *device, uint8_t *tx_data, int length,
+                         pi_i2c_xfer_flags_e flags, uint32_t timeout_us)
+{
+    pi_assert(NULL != device);
+    pi_assert((NULL != tx_data) && (IS_BUFF_IN_L2(tx_data)));
+    pi_assert(0 != length);
+
+    pi_task_t done = {0};
+
+    pi_task_block(&done);
+#if defined(USE_TIMEOUT)
+    pi_task_timeout_set(&done, timeout_us);
+#endif
+    pi_i2c_write_async(device, tx_data, length, flags, &done);
+    pi_task_wait_on(&done);
+
+    return pi_task_status_get(&done);
+}
+
+/**
+ * \brief Return the completion status of an I2C request.
+ *
+ * The event and error handlers overwrite task->data[0] with the status
+ * (PI_OK or PI_ERR_I2C_NACK) before pushing the task, so the value is only
+ * meaningful once the request has completed.
+ *
+ * \param task  Task that was passed to an asynchronous I2C call.
+ *
+ * \return The value stored in task->data[0].
+ */
+int pi_i2c_get_request_status(pi_task_t* task)
+{
+    pi_assert(NULL != task);
+
+    return (int) (task->data[0]);
+}
+
+
+/**
+ * \brief Blocking write-then-read: issue the asynchronous call and wait.
+ *
+ * The completion status is not returned.
+ *
+ * \param device     Open I2C device.
+ * \param tx_buffer  Bytes to write.
+ * \param rx_buffer  Destination for the bytes read back.
+ * \param tx_size    Number of bytes to write.
+ * \param rx_size    Number of bytes to read.
+ */
+void pi_i2c_write_read(struct pi_device *device, void *tx_buffer,
+                       void *rx_buffer, uint32_t tx_size, uint32_t rx_size)
+{
+    pi_task_t done;
+
+    pi_task_block(&done);
+    pi_i2c_write_read_async(device, tx_buffer, rx_buffer, tx_size, rx_size, &done);
+    pi_task_wait_on(&done);
+}
+
+
+/**
+ * \brief Asynchronous write-then-read in a single transfer.
+ *
+ * Records the request in task->data[] and either starts it immediately, if
+ * no transfer is in flight on the interface, or appends it to the interface
+ * FIFO.
+ *
+ * \param device     Open I2C device.
+ * \param tx_buffer  Bytes to write (asserted to be in L2).
+ * \param rx_buffer  Destination for the bytes read (asserted to be in L2).
+ * \param tx_size    Number of bytes to write; must not be 0.
+ * \param rx_size    Number of bytes to read; must not be 0.
+ * \param task       Task notified at the end of the transfer.
+ */
+void pi_i2c_write_read_async(struct pi_device *device, void *tx_buffer,
+                             void *rx_buffer, uint32_t tx_size, uint32_t rx_size,
+                             pi_task_t *task)
+{
+    pi_assert(NULL != device);
+    pi_assert((NULL != tx_buffer) && (IS_BUFF_IN_L2(tx_buffer)));
+    pi_assert((NULL != rx_buffer) && (IS_BUFF_IN_L2(rx_buffer)));
+    pi_assert(0 != tx_size);
+    pi_assert(0 != rx_size);
+    pi_assert(NULL != task);
+
+    i2c_slave_data_t *slave_data = (i2c_slave_data_t *)device->data;
+
+    int irq = disable_irq();
+    i2c_itf_data_t *itf_data = slave_data->itf_data;
+
+    /* request description, replayed when the request is started later */
+    task->data[0] = I2C_WRITE_READ;
+    task->data[1] = (uintptr_t)tx_buffer;
+    task->data[2] = (uintptr_t)rx_buffer;
+    task->data[3] = (uintptr_t)device;
+    task->data[4] = (uintptr_t)tx_size;
+    task->data[5] = (uintptr_t)rx_size;
+
+    if(itf_data->end_task)
+    {
+        /* a transfer is in flight: queue behind it */
+        __pi_i2c_drv_fifo_enqueue(itf_data, task);
+    }
+    else
+    {
+        /* bus idle: start right away and mark the task as current */
+        __pi_i2c_write_read_exec(slave_data, tx_buffer, rx_buffer, tx_size, rx_size, task);
+        itf_data->end_task = task;
+    }
+    restore_irq(irq);
+}
+
+/**
+ * \brief Asynchronous write of two buffers in a single transfer.
+ *
+ * Records the request in task->data[] and either starts it immediately, if
+ * no transfer is in flight on the interface, or appends it to the interface
+ * FIFO.
+ *
+ * \param device      Open I2C device.
+ * \param tx_buffer0  First buffer to send (asserted to be in L2).
+ * \param tx_buffer1  Second buffer to send (asserted to be in L2).
+ * \param tx_size0    Size of the first buffer; must not be 0.
+ * \param tx_size1    Size of the second buffer; must not be 0.
+ * \param task        Task notified at the end of the transfer.
+ */
+void pi_i2c_write_dual_async(struct pi_device *device, void *tx_buffer0,
+                             void *tx_buffer1, uint32_t tx_size0, uint32_t tx_size1,
+                             pi_task_t *task)
+{
+    pi_assert(NULL != device);
+    pi_assert((NULL != tx_buffer0) && (IS_BUFF_IN_L2(tx_buffer0)));
+    pi_assert((NULL != tx_buffer1) && (IS_BUFF_IN_L2(tx_buffer1)));
+    pi_assert(0 != tx_size0);
+    pi_assert(0 != tx_size1);
+    pi_assert(NULL != task);
+
+    i2c_slave_data_t *slave_data = (i2c_slave_data_t *)device->data;
+
+    int irq = disable_irq();
+    i2c_itf_data_t *itf_data = slave_data->itf_data;
+
+    /* request description, replayed when the request is started later */
+    task->data[0] = I2C_WRITE_DUAL;
+    task->data[1] = (uintptr_t)tx_buffer0;
+    task->data[2] = (uintptr_t)tx_buffer1;
+    task->data[3] = (uintptr_t)device;
+    task->data[4] = (uintptr_t)tx_size0;
+    task->data[5] = (uintptr_t)tx_size1;
+
+    if(itf_data->end_task)
+    {
+        /* a transfer is in flight: queue behind it */
+        __pi_i2c_drv_fifo_enqueue(itf_data, task);
+    }
+    else
+    {
+        /* bus idle: start right away and mark the task as current */
+        __pi_i2c_write_dual_exec(slave_data, tx_buffer0, tx_buffer1, tx_size0, tx_size1, task);
+        itf_data->end_task = task;
+    }
+    restore_irq(irq);
+}
+
diff --git a/rtos/pmsis/pmsis_implem/chips/gap9/drivers/i2c/i2c_internal.h b/rtos/pmsis/pmsis_implem/chips/gap9/drivers/i2c/i2c_internal.h
new file mode 100644
index 000000000..ed833146c
--- /dev/null
+++ b/rtos/pmsis/pmsis_implem/chips/gap9/drivers/i2c/i2c_internal.h
@@ -0,0 +1,202 @@
+/*
+ * Copyright (C) 2022 GreenWaves Technologies
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "stdlib.h"
+#include "pmsis.h"
+#include "pmsis/drivers/i2c.h"
+#include "udma_i2c.h"
+#include <string.h>
+
+#if !defined(__FREERTOS__) /* the trace macros are only defined here for non-FreeRTOS builds */
+#if !defined(__TRACE_ALL__) && !defined(__TRACE_I2C__)
+#define I2C_TRACE(x...) /* tracing disabled: expands to nothing */
+#define I2C_TRACE_ERR(...)         ((void) 0)
+#else
+#define I2C_TRACE(level, x...) POS_TRACE(level, "[I2C] " x) /* NOTE(review): takes a leading level argument, unlike the disabled variant */
+#define I2C_TRACE_ERR(...)         PI_LOG_ERR(__func__, __VA_ARGS__)
+#endif
+#endif
+
+/*! @brief I2C request structure (alias of the uDMA request type). */
+#define  i2c_req_t udma_req_t
+
+#define I2C_READ 0 /* operation codes; the async entry points store one in task->data[0] */
+#define I2C_WRITE 1
+#define I2C_WRITE_READ 2
+#define I2C_WRITE_DUAL 3
+
+#define I2C_CMD_BUF_SIZE   14 /* number of i2c_cmd_t entries in i2c_itf_data.cmd_buf */
+
+#define I2C_BUF_START_POS   6 /* presumably indexes into cmd_buf -- confirm against the exec helpers */
+#define I2C_BUF_WRITE_RPT_POS 8
+
+#define I2C_W_R_BUF_WR_POS 2
+#define I2C_W_R_BUF_RESTART_POS 4
+#define RD_BUF_RPT(bits) ((bits) ? 10 : 8)
+#define W_R_BUF_RD(bits) ((bits) ? 8  : 6)
+
+#define RD_BUF_STOP_ID(bits) (RD_BUF_RPT((bits)) + 3)
+#define WR_BUF_STOP_ID 10
+/* Argument is parenthesised so any pointer expression (e.g. p + 1, &s) expands correctly. */
+#define I2C_SLAVE_GET_ITF(slave) ((slave)->itf_data)
+
+
+typedef uint32_t i2c_cmd_t;
+
+typedef struct i2c_slave_data /* Per-slave state; the async entry points cast device->data to this type. */
+{
+    struct i2c_itf_data *itf_data; /* state of the interface this slave uses */
+    uint32_t cfg;
+    // a slave might answer to up to two addresses, 7 or 10 bits
+    int16_t wait_cycles;
+    uint16_t slave_addr;
+    uint16_t slave_addrh; /* presumably the second address mentioned above -- confirm */
+    int8_t is_10_bits; /* presumably non-zero for 10-bit addressing -- confirm */
+} i2c_slave_data_t;
+
+typedef struct i2c_itf_data { /* Per-interface driver state, reached through i2c_slave_data.itf_data. */
+    uint32_t base;
+    pi_task_t *fifo_head; /* first waiting task; returned by __pi_i2c_drv_fifo_pop() */
+    pi_task_t *fifo_tail; /* last waiting task; only consulted while fifo_head is non-NULL */
+    pi_task_t *end_task; /* task of the transfer in flight; NULL lets an async call start at once */
+    i2c_cmd_t cmd_buf[I2C_CMD_BUF_SIZE];
+    uint32_t rx_chan_addr;
+    uint32_t tx_chan_addr;
+    uint32_t cmd_chan_addr;
+    // per itf
+    uint8_t open_nb;
+    uint8_t id;
+    // --- channel event ---
+    int8_t rx_chan_id;
+    int8_t tx_chan_id;
+    int8_t cmd_chan_id;
+    // --- timeout channel ---
+    uint8_t rx_timeout_id;
+    uint8_t tx_timeout_id;
+} i2c_itf_data_t;
+
+
+/* Append pi_task at the tail of the interface wait FIFO.
+ * Caller must hold IRQs disabled: the IRQ handler may pop concurrently. */
+static inline void __pi_i2c_drv_fifo_enqueue(struct i2c_itf_data *data,
+        pi_task_t *pi_task)
+{
+    if (NULL == data->fifo_head)
+    {
+        data->fifo_head = pi_task; /* empty FIFO: the task becomes the head */
+    }
+    else
+    {
+        data->fifo_tail->next = pi_task; /* link behind the current tail */
+    }
+    pi_task->next = NULL; /* the new element always terminates the list */
+    data->fifo_tail = pi_task;
+}
+
+static inline pi_task_t *__pi_i2c_drv_fifo_pop(struct i2c_itf_data *data)
+{   /* Detach and return the FIFO head, or NULL when the FIFO is empty. */
+    pi_task_t *head = data->fifo_head;
+    if (NULL != head)
+    {
+        data->fifo_head = head->next; /* fifo_tail is left untouched */
+    }
+    return head;
+}
+
+/*
+ * @brief Build the I2C_CMD_TIMING command for a requested bus speed.
+ * @param max_baudrate bus speed in Hz: <200k std, <750k fast, else fast+
+ * @param periph_clock peripheral clock in Hz (truncated to whole MHz)
+ * @return I2C_CMD_TIMING(input_div << 8 | divH << 4 | divL), fields masked
+ */
+static inline uint32_t __i2c_prepare_timing(uint32_t max_baudrate,
+        uint32_t periph_clock)
+{
+    pi_i2c_mode_e mode;
+    // TODO: HW, discuss plain divisor, this makes no sense
+    uint32_t input_div, targetL, targetH, divH, divL, cmd;
+
+    // determine mode; the plain final else leaves no path with mode unset
+    if(max_baudrate < 200000)
+    {
+        mode = PI_I2C_STD_MODE;
+    }
+    else if(max_baudrate < 750000)
+    {
+        mode = PI_I2C_FAST_MODE;
+    }
+    else
+    {
+        mode = PI_I2C_FAST_MODE_PLUS;
+    }
+
+    // choose target L and H (depend on std = 100KHz, fast mode=400KHz)
+    // we try to get the best frequency, without going faster than what standard
+    // allows
+    switch(mode)
+    {
+        case PI_I2C_STD_MODE: // 100KHz - up to 200
+            targetL = 4700; // 4.7 µs
+            targetH = 4000; // 4.0 µs
+            break;
+        case PI_I2C_FAST_MODE: // 400KHz
+            targetL = 1300; // 1.3 µs
+            targetH = 600;  // 0.6 µs
+            break;
+        case PI_I2C_FAST_MODE_PLUS: // 1MHz
+            targetL = 500; // 0.5 µs
+            targetH = 260; // 0.260 µs
+            break;
+        default:
+            return PI_FAIL;
+    }
+
+    /* timing method extracted from IP designer python script.
+     * Targets are in ns and the clock in MHz, so target * MHz / 1000 is a
+     * cycle count; dio cycles are then removed from each phase. */
+    uint32_t dio = 3;
+    uint32_t f_pclk_mega = periph_clock / 1000000;
+    input_div = targetL * f_pclk_mega / (1000 * (15 + 1 + dio));
+    divL = targetL * f_pclk_mega / (1000 * (input_div + 1)) - dio; // NOTE(review): wraps if quotient < dio
+    divH = targetH * f_pclk_mega / (1000 * (input_div + 1)) - dio;
+
+    if (divL > 15)
+    {
+        // does not fit 4 bits: next prescaler, same ns * MHz / 1000 scaling
+        input_div = input_div + 1;
+        divL = targetL * f_pclk_mega / (1000 * (input_div + 1)) - dio;
+        divH = targetH * f_pclk_mega / (1000 * (input_div + 1)) - dio;
+    }
+
+    cmd = I2C_CMD_TIMING(((input_div&0xFF) << 8) | ((divH&0xF) << 4)
+            | (divL&0xF));
+    return cmd;
+}
+
+void __pi_i2c_timeout_abort(void* arg);
+
+static inline void __pi_i2c_timeout_config_set(pi_task_t *task, uint8_t timeout_id,
+                                               uint8_t udma_chan_id, uint32_t timeout_us,
+                                               pi_callback_func_t abort_func,
+                                               void *arg)
+{   /* Configure a uDMA timeout for udma_chan_id on the task and register abort_func(arg) as its timeout callback. */
+#if defined(__FREERTOS__) /* every other build compiles this function to a no-op */
+    pi_udma_timeout_config_set(task, timeout_id, udma_chan_id, timeout_us);
+    pi_task_timeout_callback_set(task, abort_func, arg);
+#endif
+}
diff --git a/rtos/pmsis/pmsis_implem/chips/gap9/drivers/i2c/i2c_slave.c b/rtos/pmsis/pmsis_implem/chips/gap9/drivers/i2c/i2c_slave.c
new file mode 100644
index 000000000..25e378d9b
--- /dev/null
+++ b/rtos/pmsis/pmsis_implem/chips/gap9/drivers/i2c/i2c_slave.c
@@ -0,0 +1,387 @@
+/*
+ * Copyright (C) 2022 GreenWaves Technologies
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "pmsis.h"
+#include "i2c_internal.h"
+#include "pmsis/drivers/i2c_slave.h"
+#include "i2c_slave_internal.h"
+
+
+/** Internal defines **/
+#define I2C_SLAVE_ADDR0 0 /* match_id passed to CMD_FOLL_ADDR for conf->addr0 */
+#define I2C_SLAVE_ADDR1 1 /* match_id passed to CMD_FOLL_ADDR for conf->addr1 */
+#define I2C_ADDR_PUSH_DISABLE 0
+#define I2C_ADDR_PUSH_ENABLE  1 /* push_en value used for both addresses in pi_i2c_slave_open() */
+
+struct i2c_slave_itf_data *__global_i2c_slave_itf_data[UDMA_NB_I2C]; /* per-interface state indexed by conf->itf; NULL while the interface is closed */
+
+
+static void __pi_i2c_slave_handle_error(struct i2c_slave_itf_data *itf_data)
+{   /* On a follower arbitration-loss or framing error, write the error bits back with unlock and purge. */
+    int device_id = itf_data->id; // declared before the first trace that uses it
+    I2C_TRACE("I2C_slave(%d)->slave_error_handler\n", device_id);
+    // I2C_STATUS_FOLL_ERROR_ARLO_EVENT & I2C_STATUS_FOLL_ERROR_FRAMING_EVENT
+    // => clear status, unlock and purge
+    uint32_t arlo = __pi_i2c_get_event_status(itf_data->base, I2C_STATUS_FOLL_ERROR_ARLO_EVENT);
+    uint32_t framing = __pi_i2c_get_event_status(itf_data->base, I2C_STATUS_FOLL_ERROR_FRAMING_EVENT);
+    if (arlo || framing)
+    {
+        I2C_TRACE("I2C_slave(%d)->slave_error_handler - arbitration loss or framing error\n", device_id);
+        udma_i2c_status_reg_idx_set(itf_data->base,
+                (arlo << I2C_STATUS_FOLL_ERROR_ARLO_EVENT) |
+                (framing << I2C_STATUS_FOLL_ERROR_FRAMING_EVENT) |
+                (1 << I2C_FLAG_FOLL_UNLOCK_EVENT_O) |
+                (1 << I2C_FLAG_FOLL_PURGE_EVENT_O));
+    }
+}
+
+__attribute__((section(".text"))) __noinline
+void __pi_i2c_slave_event_handler(uint32_t event, void *arg)
+{   /* Slave event handler registered by pi_i2c_slave_open() (arg = interface state): forwards RX/TX end-of-frame to the user callbacks. */
+    struct i2c_slave_itf_data *itf_data = arg;
+    int device_id = itf_data->id;
+    I2C_TRACE("I2C_slave(%d)->slave_event_handler\n", device_id);
+
+    // even for tx, there is a pseudo rx for addr matcher byte
+    uint8_t *l2_buffer = itf_data->rx_buffer; // NOTE(review): dereferenced below even if no RX buffer was ever set
+
+    uint8_t addr_byte = *l2_buffer;
+    uint8_t match = addr_byte >> 6; // top two bits of the pushed byte index addr[] (4 entries)
+    l2_buffer++;
+
+    struct pi_i2c_slave_args slave_args = {
+        .slave_addr = itf_data->addr[match],
+        .handle = arg,
+        .ret = PI_FAIL, // if callee does not fill it, consider it a failure
+        .itf_id = device_id,
+    };
+
+    // First check whether we're here because of read or send
+    // then let the user supplied callback execute
+    // User will have the responsibility of unlocking the udma once callback
+    // is done
+    if(__pi_i2c_get_event_status(itf_data->base, I2C_STATUS_FOLL_ERROR_ARLO_EVENT)
+            ||__pi_i2c_get_event_status(itf_data->base, I2C_STATUS_FOLL_ERROR_FRAMING_EVENT))
+    {
+        // TODO should we execute callback if there is an error ?
+        __pi_i2c_slave_handle_error(itf_data);
+    }
+    if(__pi_i2c_get_event_status(itf_data->base, I2C_STATUS_FOLL_EOF_RCV_EVENT))
+    {
+        uint32_t bytes_left = pi_udma_core_lin_bytes_left_get(itf_data->rx_chan_addr);
+        uint32_t size = itf_data->rx_buffer_size - bytes_left; // bytes consumed from the enqueued RX buffer
+        slave_args.nb_bytes = size - 1; // remove push byte (NOTE(review): wraps if size is 0)
+        slave_args.l2_buffer = l2_buffer; // take buffer minus first byte
+        if(itf_data->rx_callback)
+        {
+            itf_data->rx_callback(&slave_args);
+        }
+    }
+    if(__pi_i2c_get_event_status(itf_data->base,
+            I2C_STATUS_FOLL_EOF_SND_EVENT))
+    {
+        uint32_t bytes_left = pi_udma_core_lin_bytes_left_get(itf_data->tx_chan_addr);
+        uint32_t size = itf_data->tx_buffer_size - bytes_left; // bytes consumed from the enqueued TX buffer
+        slave_args.nb_bytes = size;
+        slave_args.l2_buffer = itf_data->tx_buffer;
+        if(itf_data->tx_callback)
+        {
+            itf_data->tx_callback(&slave_args);
+        }
+    }
+    return;
+}
+
+/*
+ * @brief Open I2C slave interface conf->itf; only the first open allocates and programs it.
+ * @return 0 on success, -1 when the interface state cannot be allocated.
+ */
+int pi_i2c_slave_open(struct pi_device *device)
+{
+    pi_assert((NULL != device) && (NULL != device->config));
+    struct pi_i2c_slave_conf *conf = device->config;
+
+    struct i2c_slave_itf_data *itf_data = NULL;
+    // check interface first (NOTE(review): conf->itf indexes the table without a range check)
+    I2C_TRACE("I2C_slave(%d)->open_slave\n", conf->itf);
+    int irq = disable_irq();
+    if(!(itf_data = __global_i2c_slave_itf_data[conf->itf]))
+    {
+        uint32_t i2c_base = UDMA_I2C_ADDR(conf->itf);
+
+        // prepare itf struct
+        itf_data = pi_fc_l1_malloc(sizeof(struct i2c_slave_itf_data));
+        I2C_TRACE("I2C_slave(%d)->itf_data=%x\n", conf->itf, itf_data);
+        if(itf_data == NULL)
+        {
+            restore_irq(irq);
+            return -1;
+        }
+        __global_i2c_slave_itf_data[conf->itf] = itf_data;
+        memset(itf_data,0,sizeof(struct i2c_slave_itf_data));
+        itf_data->id = conf->itf;
+        itf_data->base = i2c_base;
+
+        // disable udma reset before setting regs
+        udma_ctrl_cfg_rstn_set_set(UDMA_CTRL_ADDR, 1 << UDMA_I2C_ID(conf->itf));
+        udma_ctrl_cfg_cg_set_set(UDMA_CTRL_ADDR, 1 << UDMA_I2C_ID(conf->itf));
+        // three linear uDMA channels: rx, tx, cmd (NOTE(review): alloc results are not checked)
+        itf_data->rx_chan_id = pi_udma_core_lin_alloc();
+        I2C_TRACE("I2C(%d)->rx chan id = %x\n", conf->itf, itf_data->rx_chan_id);
+        itf_data->tx_chan_id = pi_udma_core_lin_alloc();
+        itf_data->cmd_chan_id = pi_udma_core_lin_alloc();
+        udma_i2c_udma_cmd_dest_reg_idx_set(i2c_base, itf_data->cmd_chan_id);
+
+        itf_data->rx_chan_addr = pi_udma_core_lin_addr_get(itf_data->rx_chan_id);
+        itf_data->tx_chan_addr = pi_udma_core_lin_addr_get(itf_data->tx_chan_id);
+        itf_data->cmd_chan_addr = pi_udma_core_lin_addr_get(itf_data->cmd_chan_id);
+
+        // Init command sequence: timing, optional follower addresses, then an event command
+        int cmd_buf_id = 0;
+        uint32_t cmd_buf[8]; // at most 4 words are written below
+
+        udma_i2c_status_reg_idx_set(i2c_base, 1<<I2C_FLAG_PRESC_DIV10_EVENT_O);
+        cmd_buf[cmd_buf_id++] = __i2c_prepare_timing(conf->max_baudrate,
+                pi_freq_get(PI_FREQ_DOMAIN_PERIPH));
+        if(conf->addr0 != 0) // an address of 0 means "not configured" and is skipped
+        {
+            itf_data->addr[0] = conf->addr0;
+            cmd_buf[cmd_buf_id++] = CMD_FOLL_ADDR(I2C_SLAVE_ADDR0,
+                    I2C_ADDR_PUSH_ENABLE, conf->addr0_10_bit, conf->addr0,
+                    conf->mask0, conf->sof0, conf->eof0);
+        }
+        if(conf->addr1 != 0)
+        {
+            itf_data->addr[1] = conf->addr1;
+            cmd_buf[cmd_buf_id++] = CMD_FOLL_ADDR(I2C_SLAVE_ADDR1,
+                    I2C_ADDR_PUSH_ENABLE, conf->addr1_10_bit, conf->addr1,
+                    conf->mask1, conf->sof1, conf->eof1);
+        }
+        cmd_buf[cmd_buf_id++] = I2C_CMD_EVENT(1);
+        pi_udma_core_lin_enqueue(itf_data->cmd_chan_addr, (uint32_t)cmd_buf,
+                cmd_buf_id*sizeof(uint32_t), 0);
+        // busy-wait (IRQs still disabled) for the command event flag, then write the flag back
+        while(!__pi_i2c_get_event_status(i2c_base,I2C_FLAG_CMD_EVENT_I))
+        {
+            pi_time_wait_us(1);
+        }
+        udma_i2c_status_reg_idx_set(i2c_base, 1<<I2C_FLAG_CMD_EVENT_I);
+
+        udma_i2c_foll_udma_rx_dest_reg_idx_set(i2c_base, itf_data->rx_chan_id);
+        udma_i2c_foll_udma_tx_dest_reg_idx_set(i2c_base, itf_data->tx_chan_id);
+        pi_fc_event_handler_set(SOC_EVENT_UDMA_I2C_SLAVE_EVT(conf->itf),
+                __pi_i2c_slave_event_handler, itf_data);
+        pi_soc_eu_fc_mask_set(SOC_EVENT_UDMA_I2C_SLAVE_EVT(conf->itf));
+        // callbacks are only taken from the conf of the first open
+        itf_data->rx_callback = conf->rx_callback;
+        itf_data->tx_callback = conf->tx_callback;
+
+    }
+    device->data = itf_data; // the interface state is the handle used by the other slave API calls
+    itf_data->open_nb++;
+    restore_irq(irq);
+    return 0;
+}
+
+void pi_i2c_slave_close(struct pi_device *device)
+{   /* Drop one open reference; the last close releases the channels and the interface state. */
+    pi_assert(NULL != device);
+
+    struct i2c_slave_itf_data *itf_data = (struct i2c_slave_itf_data *)device->data;
+
+    int irq = disable_irq();
+    itf_data->open_nb--;
+    if(itf_data->open_nb == 0)
+    {
+        // flush channels
+        pi_udma_core_lin_free(itf_data->rx_chan_id);
+        pi_udma_core_lin_free(itf_data->tx_chan_id);
+        pi_udma_core_lin_free(itf_data->cmd_chan_id);
+
+        pi_soc_eu_fc_mask_clear(SOC_EVENT_UDMA_CHAN_LIN(itf_data->rx_chan_id));
+        pi_soc_eu_fc_mask_clear(SOC_EVENT_UDMA_CHAN_LIN(itf_data->tx_chan_id));
+        pi_soc_eu_fc_mask_clear(SOC_EVENT_UDMA_CHAN_LIN(itf_data->cmd_chan_id));
+
+        pi_udma_core_lin_reset(itf_data->rx_chan_addr);
+        pi_udma_core_lin_reset(itf_data->tx_chan_addr);
+        pi_udma_core_lin_reset(itf_data->cmd_chan_addr);
+
+        pi_soc_eu_fc_mask_clear(SOC_EVENT_UDMA_I2C_SLAVE_EVT(itf_data->id));
+
+        // undo the rstn/cg settings of open(), via the same UDMA_CTRL_ADDR block
+        udma_ctrl_cfg_cg_clr_set(UDMA_CTRL_ADDR, 1 << UDMA_I2C_ID(itf_data->id));
+        udma_ctrl_cfg_rstn_clr_set(UDMA_CTRL_ADDR, 1 << UDMA_I2C_ID(itf_data->id));
+        __global_i2c_slave_itf_data[itf_data->id] = NULL;
+        pi_fc_l1_free(itf_data, sizeof(struct i2c_slave_itf_data)); // size of the allocation, not of the pointer
+    }
+    restore_irq(irq);
+}
+
+
+void pi_i2c_slave_conf_init(pi_i2c_slave_conf_t *conf)
+{   /* Fill conf with defaults: itf 0, 400 kHz, both addresses unset, no callbacks. */
+    pi_assert(NULL != conf);
+
+    conf->max_baudrate = 400000;
+    conf->itf = 0;
+    // address 0: a value of 0 is skipped by pi_i2c_slave_open()
+    conf->addr0 = 0;
+    conf->addr0_10_bit = 0;
+    conf->mask0 = 0x1F;
+    conf->sof0 = 0;
+    conf->eof0 = 0;
+    // address 1: same defaults
+    conf->addr1 = 0;
+    conf->addr1_10_bit = 0;
+    conf->mask1 = 0x1F;
+    conf->sof1 = 0;
+    conf->eof1 = 0;
+    conf->rx_callback = NULL;
+    conf->tx_callback = NULL;
+}
+
+
+/** accessors **/
+
+void pi_i2c_slave_conf_set_addr0(struct pi_i2c_slave_conf *conf, uint16_t addr,
+                                 uint8_t mask, uint8_t is_10_bit, uint8_t eof, uint8_t sof)
+{
+    pi_assert(NULL != conf);
+    /* Store the match settings for follower address 0 (eof precedes sof in the argument list). */
+    conf->sof0 = sof;
+    conf->eof0 = eof;
+    conf->addr0_10_bit = is_10_bit;
+    conf->mask0 = mask;
+    conf->addr0 = addr;
+}
+
+void pi_i2c_slave_conf_set_addr1(struct pi_i2c_slave_conf *conf, uint16_t addr,
+                                 uint8_t mask, uint8_t is_10_bit, uint8_t eof, uint8_t sof)
+{
+    pi_assert(NULL != conf);
+    /* Store the match settings for follower address 1 (eof precedes sof in the argument list). */
+    conf->sof1 = sof;
+    conf->eof1 = eof;
+    conf->addr1_10_bit = is_10_bit;
+    conf->mask1 = mask;
+    conf->addr1 = addr;
+}
+
+void pi_i2c_slave_set_rx_channel(void *handle,
+                                 void *l2_addr, uint32_t size)
+{   /* Enqueue l2_addr/size on the RX uDMA channel and record them as the current RX buffer. */
+    struct i2c_slave_itf_data *itf_data = (struct i2c_slave_itf_data *)handle;
+
+    pi_assert(NULL != itf_data);
+    pi_assert((NULL != l2_addr) && (IS_BUFF_IN_L2(l2_addr)));
+    pi_assert(0 != size);
+    /* Same body as pi_i2c_slave_set_rx(); the recorded buffer is read by the event handler. */
+    int irq = disable_irq();
+    pi_udma_core_lin_enqueue(itf_data->rx_chan_addr, (uint32_t)l2_addr, size, 0);
+    itf_data->rx_buffer = l2_addr;
+    itf_data->rx_buffer_size = size;
+    restore_irq(irq);
+}
+
+void pi_i2c_slave_set_tx_channel(void *handle,
+                                 void *l2_addr, uint32_t size)
+{   /* Enqueue l2_addr/size on the TX uDMA channel and record them as the current TX buffer. */
+    struct i2c_slave_itf_data *itf_data = (struct i2c_slave_itf_data *)handle;
+
+    pi_assert(NULL != itf_data);
+    pi_assert((NULL != l2_addr) && (IS_BUFF_IN_L2(l2_addr)));
+    pi_assert(0 != size);
+    /* Same body as pi_i2c_slave_set_tx(); the recorded buffer is read by the event handler. */
+    int irq = disable_irq();
+    pi_udma_core_lin_enqueue(itf_data->tx_chan_addr, (uint32_t)l2_addr, size, 0);
+    itf_data->tx_buffer = l2_addr;
+    itf_data->tx_buffer_size = size;
+    restore_irq(irq);
+}
+
+void pi_i2c_slave_unlock(void *handle, int is_rd)
+{   /* Write the unlock and purge bits plus one EOF bit (RCV if is_rd is non-zero, SND otherwise) to the status register. */
+    struct i2c_slave_itf_data *itf_data = (struct i2c_slave_itf_data *)handle;
+
+    pi_assert(NULL != handle);
+
+    // a single status write per direction; no other per-call state is needed
+    if(is_rd)
+    {
+        udma_i2c_status_reg_idx_set(itf_data->base,(1<<I2C_FLAG_FOLL_UNLOCK_EVENT_O)
+                | (1 << I2C_FLAG_FOLL_PURGE_EVENT_O)
+                |(1 << I2C_STATUS_FOLL_EOF_RCV_EVENT));
+    }
+    else
+    {
+        udma_i2c_status_reg_idx_set(itf_data->base,(1<<I2C_FLAG_FOLL_UNLOCK_EVENT_O) |
+            (1 << I2C_FLAG_FOLL_PURGE_EVENT_O) | (1<<I2C_STATUS_FOLL_EOF_SND_EVENT));
+    }
+}
+
+void pi_i2c_slave_set_rx(void *handle, void *l2_addr,
+                         uint32_t size)
+{   /* Enqueue l2_addr/size on the RX uDMA channel and record them as the current RX buffer. */
+    struct i2c_slave_itf_data *itf_data = (struct i2c_slave_itf_data *)handle;
+
+    pi_assert(NULL != itf_data);
+    pi_assert((NULL != l2_addr) && (IS_BUFF_IN_L2(l2_addr)));
+    pi_assert(0 != size);
+    /* Same body as pi_i2c_slave_set_rx_channel(); IRQs are masked around the enqueue and bookkeeping. */
+    int irq = disable_irq();
+    pi_udma_core_lin_enqueue(itf_data->rx_chan_addr, (uint32_t)l2_addr, size, 0);
+    itf_data->rx_buffer = l2_addr;
+    itf_data->rx_buffer_size = size;
+    restore_irq(irq);
+}
+
+void pi_i2c_slave_set_tx(void *handle, void *l2_addr,
+                         uint32_t size)
+{   /* Enqueue l2_addr/size on the TX uDMA channel and record them as the current TX buffer. */
+    struct i2c_slave_itf_data *itf_data = (struct i2c_slave_itf_data *)handle;
+
+    pi_assert(NULL != itf_data);
+    pi_assert((NULL != l2_addr) && (IS_BUFF_IN_L2(l2_addr)));
+    pi_assert(0 != size);
+    /* Same body as pi_i2c_slave_set_tx_channel(); IRQs are masked around the enqueue and bookkeeping. */
+    int irq = disable_irq();
+    pi_udma_core_lin_enqueue(itf_data->tx_chan_addr, (uint32_t)l2_addr, size, 0);
+    itf_data->tx_buffer = l2_addr;
+    itf_data->tx_buffer_size = size;
+    restore_irq(irq);
+}
+
+void pi_i2c_slave_stop_rx(void *handle)
+{   /* Stop the RX uDMA channel of the interface, with IRQs disabled around the call. */
+    struct i2c_slave_itf_data *itf_data = (struct i2c_slave_itf_data *)handle;
+
+    pi_assert(NULL != itf_data);
+
+    int irq = disable_irq();
+    pi_udma_core_lin_stop(itf_data->rx_chan_addr);
+    restore_irq(irq);
+}
+
+void pi_i2c_slave_stop_tx(void *handle)
+{   /* Stop the TX uDMA channel of the interface, with IRQs disabled around the call. */
+    struct i2c_slave_itf_data *itf_data = (struct i2c_slave_itf_data *)handle;
+
+    pi_assert(NULL != itf_data);
+
+    int irq = disable_irq();
+    pi_udma_core_lin_stop(itf_data->tx_chan_addr);
+    restore_irq(irq);
+}
diff --git a/rtos/pmsis/pmsis_implem/chips/gap9/drivers/i2c/i2c_slave_internal.h b/rtos/pmsis/pmsis_implem/chips/gap9/drivers/i2c/i2c_slave_internal.h
new file mode 100644
index 000000000..70ec112cf
--- /dev/null
+++ b/rtos/pmsis/pmsis_implem/chips/gap9/drivers/i2c/i2c_slave_internal.h
@@ -0,0 +1,61 @@
+/*
+ * Copyright (C) 2022 GreenWaves Technologies
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "stdlib.h"
+#include "pmsis.h"
+#include "pmsis/drivers/i2c_slave.h"
+
+#ifndef I2C_DRIVER_DATA_IMPLEM_SPECIFC /* hook: may be pre-defined to append fields to struct i2c_slave_itf_data */
+    #define I2C_DRIVER_DATA_IMPLEM_SPECIFC
+#endif
+
+/*! @brief I2C request structure (alias of the uDMA request type). */
+#define  i2c_req_t udma_req_t 
+
+typedef uint32_t i2c_cmd_t;
+
+struct i2c_slave_itf_data { /* Per-interface slave state, allocated by pi_i2c_slave_open() and used as the API handle. */
+    uint32_t base; /* peripheral base address, UDMA_I2C_ADDR(itf) */
+
+    // per itf
+    uint32_t rx_chan_addr; /* addresses returned by pi_udma_core_lin_addr_get() for each channel */
+    uint32_t tx_chan_addr;
+    uint32_t cmd_chan_addr;
+    
+    uint8_t open_nb; /* open reference count */
+    uint8_t id; /* interface index (conf->itf) */
+    // --- channel event ---
+    uint8_t rx_chan_id; /* ids from pi_udma_core_lin_alloc(); NOTE(review): uint8_t cannot represent a -1 failure */
+    uint8_t tx_chan_id;
+    uint8_t cmd_chan_id;
+    // -- addresses to which we answer
+    uint8_t addr0_10_bit; /* these four fields are only zeroed (memset) by the open path in i2c_slave.c */
+    uint8_t addr1_10_bit;
+    uint8_t addr0_mask;
+    uint8_t addr1_mask;
+    uint16_t addr[4]; /* indexed by the top two bits of the pushed address byte; open fills [0] and [1] */
+    // in handler callbacks
+    pi_i2c_callback_t rx_callback;
+    pi_i2c_callback_t tx_callback;
+    // buffers
+    void *rx_buffer; /* last buffer/size enqueued on the RX channel */
+    uint32_t rx_buffer_size;
+    void *tx_buffer; /* last buffer/size enqueued on the TX channel */
+    uint32_t tx_buffer_size;
+    I2C_DRIVER_DATA_IMPLEM_SPECIFC
+};
diff --git a/rtos/pmsis/pmsis_implem/chips/gap9/drivers/i2c/udma_i2c.h b/rtos/pmsis/pmsis_implem/chips/gap9/drivers/i2c/udma_i2c.h
new file mode 100644
index 000000000..02244b182
--- /dev/null
+++ b/rtos/pmsis/pmsis_implem/chips/gap9/drivers/i2c/udma_i2c.h
@@ -0,0 +1,77 @@
+/*
+ * Copyright (C) 2022 GreenWaves Technologies
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+typedef enum i2c_mode { /* Speed classes selected from max_baudrate by __i2c_prepare_timing(). */
+    PI_I2C_STD_MODE,
+    PI_I2C_FAST_MODE,
+    PI_I2C_FAST_MODE_PLUS,
+    PI_I2C_NB_MODES
+} pi_i2c_mode_e;
+
+// TEMPORARY, until can be generated
+#define I2C_STATUS_FOLL_EOF_RCV_EVENT 2 /* bit indexes: written as 1 << N to the status register and read via __pi_i2c_get_event_status() */
+#define I2C_STATUS_FOLL_EOF_SND_EVENT 3
+#define I2C_STATUS_FOLL_ERROR_ARLO_EVENT 4
+#define I2C_STATUS_FOLL_ERROR_FRAMING_EVENT 5
+#define I2C_FLAG_CMD_EVENT_I 19
+#define I2C_FLAG_FOLL_UNLOCK_EVENT_O 14
+#define I2C_FLAG_FOLL_PURGE_EVENT_O 15
+#define I2C_STATUS_ERROR_NACK_EVENT 16
+#define I2C_STATUS_ERROR_ARLO_EVENT 17
+#define I2C_STATUS_ERROR_FRAMING_EVENT 18
+#define I2C_FLAG_UNLOCK_EVENT_O 22
+#define I2C_FLAG_PURGE_EVENT_O 23
+#define I2C_FLAG_SOFT_RESET_EVENT 24
+#define I2C_FLAG_PRESC_DIV10_EVENT_O 25
+
+#define I2C_CMD_TIMING(T) ((0x10 << 24) | (T)) /* opcode 0x10 in bits 31:24, T OR-ed in unmasked */
+#define I2C_CMD_EVENT(T) ((0x41 << 24)) /* NOTE(review): T is ignored -- confirm this is intended */
+
+
+#define CMD_FOLL_ADDR(match_id,push_en,addr_10_bit,slave_addr,mask,sof,eof) \
+    ((0x20 << 24) | ((match_id) << 22) | ((push_en) << 21) | ((eof) << 20) \
+    | ((sof) << 19) | ((eof) << 18) | ((sof) << 17) | ((addr_10_bit) << 16) \
+    | ((!(addr_10_bit)) << 15) | ((mask) << 10) /* every argument parenthesised */ \
+    | ((addr_10_bit) ? (slave_addr) : ((slave_addr) >> 1)))
+
+/* enable automatic sending of stop when receiving a nack error */
+#define __I2C_NACK_STOP ((1 << 23))
+
+/* Command words: opcode in bits 31:24; macros whose body omits T ignore their argument. */
+#define I2C_CMD_LEAD_START(T)  ((0x30 << 24) | __I2C_NACK_STOP)
+#define I2C_CMD_MISC_WAIT(T)   ((0x3<<24))
+#define I2C_CMD_NOP(T)         ((0x0<<24))
+// add -1, well, hw guys saving on bits...
+#define I2C_CMD_RPT(T) ((0x02 << 24) | (((T)&0xFFFF)))
+#define I2C_CMD_LEAD_SEND_IMM(T) ((0x32 << 24) | __I2C_NACK_STOP | ((T)&0xFF))
+#define I2C_CMD_LEAD_SEND_IMM_ADDR(IS_10BITS, T) ((0x37 << 24) | __I2C_NACK_STOP | ((IS_10BITS) << 15) | ((T)&0xFFFF))
+#define I2C_CMD_MISC_RECEIVE(T) ((0x33 << 24) | __I2C_NACK_STOP)
+#define I2C_CMD_MISC_RECEIVE_LAST(T) ((0x34 << 24) | __I2C_NACK_STOP)
+#define I2C_CMD_MISC_SEND(T) ((0x31 << 24) | __I2C_NACK_STOP)
+#define I2C_CMD_STOP(T) ((0x36 << 24) | __I2C_NACK_STOP)
+#define I2C_CMD_UDMA_TX_CHAN_CFG(T) (( 0x50 << 24 ) | (T))
+#define I2C_CMD_UDMA_RX_CHAN_CFG(T) (( 0x51 << 24 ) | (T))
+
+#define I2C_CHAN_ADDR_REG 0x0 /* presumably register selectors for I2C_CMD_UDMA_TX/RX_CHAN_CFG -- confirm */
+#define I2C_CHAN_SIZE_REG 0x2
+#define I2C_CHAN_CFG_REG 0x7
+
+
+
+/** Extract the 1-bit field at position event_id from the I2C status register. Defined as a macro to work with O0  **/
+#define __pi_i2c_get_event_status(base, event_id) __BITEXTRACT(udma_i2c_status_reg_idx_get(base), 1, event_id)
diff --git a/rtos/pmsis/pmsis_implem/chips/gap9/drivers/i2s/i2s.c b/rtos/pmsis/pmsis_implem/chips/gap9/drivers/i2s/i2s.c
index 8f0127712..44c869014 100644
--- a/rtos/pmsis/pmsis_implem/chips/gap9/drivers/i2s/i2s.c
+++ b/rtos/pmsis/pmsis_implem/chips/gap9/drivers/i2s/i2s.c
@@ -21,10 +21,6 @@
 #include <pmsis.h>
 #include <string.h>
 
-#if defined(__FREERTOS__)
-#define pi_task_push_irq_safe(pi_task)                   pi_task_push((pi_task))
-#endif  /* __FREERTOS__ */
-
 #define I2S_NB_SLOTS 16
 
 typedef struct
@@ -143,11 +139,7 @@ static void *__pi_i2s_ring_buffer_pop(__pi_i2s_slot_t *slot)
 }
 
 
-#if !defined(__FREERTOS__)
-static void __pi_i2s_handle_rx_frame(int event, void *arg)
-#else
-static void __pi_i2s_handle_rx_frame(void *arg)
-#endif  /* __FREERTOS__ */
+static void __pi_i2s_handle_rx_frame(uint32_t event, void *arg)
 {
     __pi_i2s_slot_t *frame_slot = (__pi_i2s_slot_t *)arg;
     __pi_i2s_t *i2s = frame_slot->i2s;
@@ -204,7 +196,7 @@ static void __pi_i2s_handle_rx_frame(void *arg)
 
             frame_slot->tx_buffer0 = frame_slot->tx_buffer1;
             frame_slot->tx_buffer1 = __pi_i2s_ring_buffer_pop(frame_slot);
-    
+
             if (frame_slot->tx_buffer1)
             {
                 uint32_t frame = frame_slot->frame;
@@ -253,11 +245,7 @@ static void __pi_i2s_handle_rx_frame(void *arg)
     }
 }
 
-#if !defined(__FREERTOS__)
-static void __pi_i2s_handle_tx_frame(int event, void *arg)
-#else
-static void __pi_i2s_handle_tx_frame(void *arg)
-#endif  /* __FREERTOS__ */
+static void __pi_i2s_handle_tx_frame(uint32_t event, void *arg)
 {
     __pi_i2s_slot_t *frame_slot = (__pi_i2s_slot_t *)arg;
     __pi_i2s_t *i2s = frame_slot->i2s;
@@ -321,11 +309,7 @@ static void __pi_i2s_handle_tx_frame(void *arg)
 }
 
 
-#if !defined(__FREERTOS__)
-static void __pi_i2s_handle_rx_channel(int event, void *arg)
-#else
-static void __pi_i2s_handle_rx_channel(void *arg)
-#endif  /* __FREERTOS__ */
+static void __pi_i2s_handle_rx_channel(uint32_t event, void *arg)
 {
     __pi_i2s_slot_t *slot = (__pi_i2s_slot_t *)arg;
     __pi_i2s_t *i2s = slot->i2s;
@@ -390,7 +374,7 @@ static void __pi_i2s_handle_rx_channel(void *arg)
 
             slot->tx_buffer0 = slot->tx_buffer1;
             slot->tx_buffer1 = __pi_i2s_ring_buffer_pop(slot);
-    
+
             if (slot->tx_buffer1)
             {
                 uint32_t base = slot->channel_base;
@@ -424,11 +408,7 @@ static void __pi_i2s_handle_rx_channel(void *arg)
     }
 }
 
-#if !defined(__FREERTOS__)
-static void __pi_i2s_handle_tx_channel(int event, void *arg)
-#else
-static void __pi_i2s_handle_tx_channel(void *arg)
-#endif  /* __FREERTOS__ */
+static void __pi_i2s_handle_tx_channel(uint32_t event, void *arg)
 {
     __pi_i2s_slot_t *slot = (__pi_i2s_slot_t *)arg;
     __pi_i2s_t *i2s = slot->i2s;
@@ -505,8 +485,7 @@ static inline void __pi_i2s_enqueue_buffer(__pi_i2s_slot_t *slot, void *buffer)
     slot->ring_buffer.buffer[slot->ring_buffer.head++] = buffer;
     if (slot->ring_buffer.head == slot->ring_buffer.nb_elem)
         slot->ring_buffer.head = 0;
-    
-    
+
     slot->ring_buffer.current_nb_elem++;
 }
 
@@ -637,7 +616,6 @@ int __pi_i2s_channel_conf_set(struct pi_device *device, uint32_t frame, int slot
         unsigned int loopback = __BITEXTRACTU(conf->options, 1, PI_I2S_OPT_LOOPBACK_ENA_SHIFT);
         int ring_buffer_nb_elem = conf->mem_slab ? conf->mem_slab->num_blocks : 2;
         int ring_buffer_size = sizeof(void *) * ring_buffer_nb_elem;
-        int use_buffers = ((conf->pingpong_buffers[0] != NULL || conf->mem_slab) && loopback == 0) || frame;
 
         slot = pi_fc_l1_malloc(sizeof(__pi_i2s_slot_t) + ring_buffer_size);
         if (slot == NULL)
@@ -671,7 +649,7 @@ int __pi_i2s_channel_conf_set(struct pi_device *device, uint32_t frame, int slot
         slot->tx_task1 = NULL;
         slot->ignore_first_error = 0;
 
-        if (use_buffers)
+        if (conf->asrc_channel == -1 && loopback == 0)
         {
             int channel = pi_udma_core_lin_alloc();
             if (channel == -1)
@@ -819,8 +797,8 @@ int pi_i2s_open(struct pi_device *device)
     if (i2s->open_count == 1)
     {
         int periph_id = UDMA_I2S_ID(itf_id);
-        udma_ctrl_cfg_rstn_set_set(ARCHI_UDMA_ADDR, 1 << periph_id);
-        udma_ctrl_cfg_cg_set_set(ARCHI_UDMA_ADDR, 1 << periph_id);
+        udma_ctrl_cfg_rstn_set_set(UDMA_CTRL_ADDR, 1 << periph_id);
+        udma_ctrl_cfg_cg_set_set(UDMA_CTRL_ADDR, 1 << periph_id);
 
         i2s->itf = itf_id;
         i2s->errors = 0;
@@ -959,8 +937,8 @@ void pi_i2s_close(struct pi_device *device)
 
         // And deactivated device
         int periph_id = UDMA_I2S_ID(i2s->itf);
-        udma_ctrl_cfg_rstn_clr_set(ARCHI_UDMA_ADDR, 1 << periph_id);
-        udma_ctrl_cfg_cg_clr_set(ARCHI_UDMA_ADDR, 1 << periph_id);
+        udma_ctrl_cfg_rstn_clr_set(UDMA_CTRL_ADDR, 1 << periph_id);
+        udma_ctrl_cfg_cg_clr_set(UDMA_CTRL_ADDR, 1 << periph_id);
     }
 }
 
@@ -1024,19 +1002,11 @@ static void __pi_i2s_slot_stop(__pi_i2s_slot_t *slot)
 
     if (slot->is_rx)
     {
-        #if !defined(__FREERTOS__)
         __pi_i2s_handle_rx_channel(0, slot);
-        #else
-        __pi_i2s_handle_rx_channel(slot);
-        #endif  /* __FREERTOS__ */
     }
     else
     {
-        #if !defined(__FREERTOS__)
         __pi_i2s_handle_tx_channel(0, slot);
-        #else
-        __pi_i2s_handle_tx_channel(slot);
-        #endif  /* __FREERTOS__ */
     }
 }
 
@@ -1271,7 +1241,7 @@ int pi_i2s_channel_write_async(struct pi_device *dev, int channel,
     // them if they are different.
     task->data[0] = 0;
 
-    if (!slot->mem_slab)
+    if (slot->pingpong_buffers[0])
     {
         int buffer_index = slot->current_buffer;
         mem_block = slot->pingpong_buffers[buffer_index];
diff --git a/rtos/pmsis/pmsis_implem/chips/gap9/drivers/spim/spim.c b/rtos/pmsis/pmsis_implem/chips/gap9/drivers/spim/spim.c
new file mode 100644
index 000000000..025d0afad
--- /dev/null
+++ b/rtos/pmsis/pmsis_implem/chips/gap9/drivers/spim/spim.c
@@ -0,0 +1,1204 @@
+/*
+ * Copyright (C) 2019 GreenWaves Technologies
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+/*
+ * Authors: Germain Haugou, GreenWaves Technologies (germain.haugou@greenwaves-technologies.com)
+ */
+
+#include <stdint.h>
+#include <string.h>
+#include "pmsis.h"
+
+#include "spim_v4.h"
+#include "chips/gap9/drivers/udma/udma_core.h"
+#include "pmsis/task.h"
+
+#if !defined(__FREERTOS__)
+#define SOC_EVENT_UDMA_SPIM_EOT(id)        (80 + (id) + 0)  /* end-of-transfer SoC event of SPIM interface id; argument parenthesized */
+#else
+#define likely(x)                          (__builtin_expect(!!(x), 1))  /* branch hint; !! normalizes x to 0 or 1 */
+#endif  /* __FREERTOS__ */
+
+#ifndef SPIM_TRACE  /* keep a definition provided before this point */
+#if !defined(__TRACE_ALL__) && !defined(__TRACE_SPIM__)
+#define SPIM_TRACE(x...)  /* tracing disabled: expands to nothing */
+#else
+#define SPIM_TRACE(level, x...) POS_TRACE(level, "[SPIM] " x)  /* forwards to POS_TRACE with a "[SPIM] " prefix */
+#endif
+#endif
+/* Number of 32-bit words in the udma_cmd buffers declared below. */
+#define PI_SPIM_UDMA_CMD_SIZE 4
+
+typedef struct  /* Element type of pi_spim_t.pending_transfers[]. */
+{
+    uint32_t *temp_buff;  /* NOTE(review): none of these fields is accessed in the visible */
+    uint32_t addr;        /* part of this file; their exact meaning cannot be told from    */
+    uint32_t size;        /* here -- confirm against the code that fills the array.        */
+    uint32_t end;
+} pi_spim_pending_transfert_t;
+
+/* NOTE(review): type not used in the visible part of this file -- confirm it is still needed. */
+typedef struct
+{
+    uint32_t udma_cmd[PI_SPIM_UDMA_CMD_SIZE];
+    uint32_t temp_buff[2];
+} pi_spim_l2_t;
+
+typedef struct  /* Driver state of one SPIM interface (element type of g_spim_data[]). */
+{
+    pi_task_t *pending_copy;   /* task of the copy in flight, NULL when the interface is idle */
+    pi_task_t *waiting_first;  /* head of the list of queued tasks, linked through task->next */
+    pi_task_t *waiting_last;   /* tail of that list; only read while waiting_first is set */
+    unsigned int pending_repeat_base;
+    unsigned int pending_repeat_callback;
+    unsigned int pending_repeat_asm_callback;  /* NOTE(review): the PI_SPIM_T_* offsets below appear to mirror the fields up to here -- confirm before reordering */
+    unsigned int pending_repeat_misaligned_size;      /* size still to program before the next 2D row (or the end) of a copy */
+    unsigned int pending_repeat_misaligned_addr;      /* next buffer address handed to the uDMA data channel */
+    unsigned int pending_repeat_misaligned_ram_addr;  /* current value of the copy's addr argument, copied into the address ucode */
+    unsigned int pending_repeat_misaligned_end;
+    unsigned int pending_repeat_misaligned_length;    /* 2D copies: length argument */
+    unsigned int pending_repeat_misaligned_stride;    /* 2D copies: stride argument */
+    unsigned int pending_repeat_misaligned_2d_size;   /* 2D copies: size left after the current row, 0 for 1D copies */
+    unsigned int pending_chunk_size;
+    unsigned int pending_send_cmd;
+    unsigned int pending_eot;
+    void (*pending_callback)(uint32_t event, void *arg);  /* handler the receive path installs on the end-of-transfer event */
+    uint32_t udma_cmd[PI_SPIM_UDMA_CMD_SIZE];  /* command words built by the send/receive/transfer functions */
+    int open_count;            /* number of chip-selects currently opened on this interface */
+    int id;                    /* interface number used to derive the end-of-transfer SoC event */
+    int is_slave;              /* copied from conf->is_slave by the first open */
+    pi_task_t task;
+    unsigned int pending_repeat_addr;
+    unsigned int pending_repeat_dup_addr;
+    unsigned int pending_repeat_send;
+    unsigned int pending_repeat_flags;
+    pi_spim_pending_transfert_t pending_transfers[2];
+    int pending_transfer_index;
+    int pending_transfer_read_index;
+    struct pi_device *pending_repeat_device;
+    uint32_t periph_base;
+    uint32_t rx_cmd;           /* RX data command template of the copy in flight */
+    uint32_t tx_cmd;           /* TX data command template of the copy in flight */
+    int channel;
+    int pending_is_auto;       /* set when the copy in flight uses PI_SPI_CS_AUTO */
+    uint32_t cmd_channel_base; /* the three *_channel_base values come from pi_udma_core_lin_addr_get() */
+    uint32_t tx_channel_base;
+    uint32_t rx_channel_base;
+    int cmd_channel;           /* the three *_channel values come from pi_udma_core_lin_alloc() */
+    int tx_channel;
+    int rx_channel;
+} pi_spim_t;
+
+#define PI_SPIM_T_PENDING_COPY          0    /* NOTE(review): these values match the byte offsets of the  */
+#define PI_SPIM_T_WAITING_FIRST         4    /* leading pi_spim_t fields with 4-byte pointers and ints;   */
+#define PI_SPIM_T_WAITING_LAST          8    /* presumably consumed outside this file -- confirm and keep */
+#define PI_SPIM_T_REPEAT_BASE           12   /* in sync with the structure layout.                        */
+//#define PI_SPIM_T_REPEAT_LEN            16
+#define PI_SPIM_T_REPEAT_CALLBACK       16
+#define PI_SPIM_T_REPEAT_ASM_CALLBACK   20
+
+typedef struct  /* Per chip-select descriptor, allocated by pi_spi_open() and stored in device->data. */
+{
+    pi_spim_t *spim;                     /* state of the interface this chip-select belongs to */
+    uint32_t rx_cmd;                     /* RX data command template, written by pi_spim_apply_conf() */
+    uint32_t tx_cmd;                     /* TX data command template, written by pi_spim_apply_conf() */
+    uint8_t *receive_addr_ucode;         /* destination of the address bytes copied before each receive chunk */
+    uint32_t receive_addr_ucode_size;    /* number of address bytes copied there */
+    uint8_t *send_addr_ucode;            /* destination of the address bytes copied before each send chunk */
+    uint32_t send_addr_ucode_size;       /* number of address bytes copied there */
+    uint32_t *udma_receive_cmd;          /* command sequence of receive copies, NULL after pi_spi_open() */
+    uint32_t *udma_send_cmd;             /* command sequence of send copies, NULL after pi_spi_open() */
+    uint32_t udma_receive_cmd_size;      /* words already present in udma_receive_cmd */
+    uint32_t udma_send_cmd_size;         /* words already present in udma_send_cmd */
+    int max_baudrate;                    /* from conf->max_baudrate or pi_spi_ioctl() */
+    unsigned int cfg;                    /* SPI_CMD_CFG(div, polarity, phase) */
+    unsigned int periph_base;            /* UDMA_SPIM_ADDR(conf->itf) */
+    char cs;
+    char wordsize;
+    char big_endian;
+    char channel;                        /* ARCHI_UDMA_SPIM_ID(conf->itf), used as bit index in the uDMA control registers */
+    char byte_align;                     /* result of pi_spim_get_byte_align() */
+    unsigned char div;                   /* result of pi_spim_get_div(), truncated to 8 bits */
+    char polarity;
+    char phase;
+    uint32_t max_rcv_size;               /* largest receive chunk, from conf->max_rcv_chunk_size */
+    uint32_t max_snd_size;               /* largest send chunk, from conf->max_snd_chunk_size */
+} pi_spim_cs_t;
+
+typedef struct {  /* NOTE(review): not used in the visible part of this file -- confirm it is still needed */
+    unsigned int cmd[4];
+} pi_spim_cmd_t;
+
+/* Driver state of every SPIM interface, indexed by the interface number (conf->itf). */
+static PI_L2 pi_spim_t g_spim_data[ARCHI_UDMA_NB_SPIM];
+/* Called by pi_spim_handle_eot() with the task taken from the waiting list; not defined in the visible part of this file. */
+void pi_spim_handle_waiting_copy(pi_task_t *task);
+
+void pi_spim_handle_eot(uint32_t event, void *arg)
+{
+    // End-of-transfer handler: completes the copy in flight, then starts the next queued one.
+    pi_spim_t *itf = (pi_spim_t *) arg;
+    pi_task_t *done = itf->pending_copy;
+    pi_task_t *next;
+
+    // Clear the in-flight slot (the async entry points test it to decide whether to queue).
+    itf->pending_copy = NULL;
+    pi_task_push_irq_safe(done);
+
+    // Take the oldest waiting request, if any, and hand it over for execution.
+    next = itf->waiting_first;
+    if (next == NULL) return;
+    itf->waiting_first = next->next;
+    pi_spim_handle_waiting_copy(next);
+}
+
+void pi_spim_handle_rx_copy(uint32_t event, void *arg)  // not referenced in the visible part of this file
+{
+    pi_soc_eu_fc_mask_clear(event);   // same call pi_spi_close() uses to deactivate event routing
+    pi_spim_handle_eot(event, arg);   // then the common end-of-transfer handling (arg: pi_spim_t *)
+}
+
+void pi_spim_handle_tx_copy(uint32_t event, void *arg)  // not referenced in the visible part of this file
+{
+    pi_soc_eu_fc_mask_clear(event);   // same call pi_spi_close() uses to deactivate event routing
+    pi_spim_handle_eot(event, arg);   // then the common end-of-transfer handling (arg: pi_spim_t *)
+}
+
+
+static int pi_spim_get_div(uint32_t spi_freq)
+{
+    // Returns the clock divider to use so that the SPI clock does not exceed spi_freq.
+    uint32_t source_freq = pi_freq_get(PI_FREQ_DOMAIN_PERIPH);
+    int divider;
+
+    // The requested rate is not below the peripheral clock: 0 is returned as divider.
+    if (spi_freq >= source_freq)
+    {
+        return 0;
+    }
+    // NOTE(review): spi_freq == 0 with a non-zero peripheral clock divides by zero below -- confirm callers never pass 0.
+    // Ceiling division, so that the resulting SPI frequency stays below the maximum.
+    divider = (source_freq + spi_freq - 1) / spi_freq;
+    // The SPIM always divides by 2 once the divider is active: an odd value is bumped to the next even one.
+    divider += (divider & 1);
+    return divider;
+}
+
+
+// Byte-alignment flag: 1 only when 32-bit words are combined with big endian, 0 otherwise.
+static inline int pi_spim_get_byte_align(int wordsize, int big_endian)
+{
+    return (big_endian != 0) && (wordsize == PI_SPI_WORDSIZE_32);
+}
+
+static void pi_spim_apply_conf(pi_spim_cs_t *spim_cs)
+{
+    // Rewrites the two leading words (configuration, start of transfer) of the receive and
+    // send command sequences that exist, then resets the RX/TX data command templates.
+    uint32_t *sequences[2] = { spim_cs->udma_receive_cmd, spim_cs->udma_send_cmd };
+    for (int i = 0; i < 2; i++)
+    {
+        if (sequences[i] != NULL)
+        {
+            sequences[i][0] = spim_cs->cfg;
+            sequences[i][1] = SPI_CMD_SOT(spim_cs->cs);
+        }
+    }
+    // Same argument order as in the receive/send functions: count, words per transfer, bits, qspi, endianness.
+    spim_cs->rx_cmd = SPI_CMD_RX_DATA(1, SPI_CMD_4_WORD_PER_TRANSF, 8, 0, 0);
+    spim_cs->tx_cmd = SPI_CMD_TX_DATA(1, SPI_CMD_4_WORD_PER_TRANSF, 8, 0, 0);
+}
+
+int pi_spi_open(struct pi_device *device)
+{
+    // Opens one chip-select of an SPIM interface. Returns 0 on success, or -1 when the
+    // chip-select descriptor or the uDMA linear channels cannot be allocated.
+    uint32_t irq = pi_irq_disable();
+    struct pi_spi_conf *conf = (struct pi_spi_conf *) device->config;
+    int periph_id = ARCHI_UDMA_SPIM_ID(conf->itf);
+
+    SPIM_TRACE(POS_LOG_INFO, "Opening SPIM device (device: %p, id: %d, cs: %d, max_baudrate: %d, wordsize: %d, big_endian: %d, polarity: %d, phase: %d)\n",
+               device, conf->itf, conf->cs, conf->max_baudrate, conf->wordsize, conf->big_endian, conf->polarity, conf->phase);
+
+    pi_spim_t *spim = &g_spim_data[conf->itf];
+    pi_spim_cs_t *spim_cs = pmsis_l2_malloc(sizeof(pi_spim_cs_t));
+    if (spim_cs == NULL)
+    {
+        SPIM_TRACE(POS_LOG_WARNING, "Failed to allocate spim structure\n");
+        goto error;
+    }
+
+    device->data = (void *) spim_cs;
+    spim_cs->channel = periph_id;
+    spim_cs->periph_base = (uint32_t) UDMA_SPIM_ADDR(conf->itf);
+    spim_cs->spim = spim;
+    spim_cs->wordsize = conf->wordsize;
+    spim_cs->big_endian = conf->big_endian;
+    spim_cs->polarity = conf->polarity;
+    spim_cs->phase = conf->phase;
+    spim_cs->max_baudrate = conf->max_baudrate;
+    spim_cs->cs = conf->cs;
+    spim_cs->byte_align = pi_spim_get_byte_align(conf->wordsize, conf->big_endian);
+    spim_cs->max_rcv_size = conf->max_rcv_chunk_size;
+    spim_cs->max_snd_size = conf->max_snd_chunk_size;
+    spim_cs->udma_send_cmd = NULL;
+    spim_cs->udma_receive_cmd = NULL;
+
+    int div = pi_spim_get_div(spim_cs->max_baudrate);
+    spim_cs->div = div;
+    spim_cs->cfg = SPI_CMD_CFG(div, conf->polarity, conf->phase);
+
+    // Only the first chip-select opened on an interface sets up the interface itself.
+    spim->open_count++;
+    if (spim->open_count == 1)
+    {
+        spim->is_slave = conf->is_slave;
+        // Record the interface number: pi_spi_close() and the copy helpers derive the EOT event from spim->id.
+        spim->id = conf->itf;
+
+        spim->rx_channel  = pi_udma_core_lin_alloc();
+        spim->tx_channel  = pi_udma_core_lin_alloc();
+        spim->cmd_channel = pi_udma_core_lin_alloc();
+
+        if (spim->rx_channel < 0 || spim->tx_channel < 0 || spim->cmd_channel < 0)
+        {
+            // Roll back: release only the channels actually obtained, then drop the open
+            // reference and the descriptor so that nothing leaks and a later open starts clean.
+            if (spim->rx_channel >= 0)  pi_udma_core_lin_free(spim->rx_channel);
+            if (spim->tx_channel >= 0)  pi_udma_core_lin_free(spim->tx_channel);
+            if (spim->cmd_channel >= 0) pi_udma_core_lin_free(spim->cmd_channel);
+            spim->open_count--;
+            device->data = NULL;
+            pi_l2_free(spim_cs, sizeof(pi_spim_cs_t));
+            SPIM_TRACE(POS_LOG_WARNING, "Failed to allocate channels\n");
+            goto error;
+        }
+
+        spim->rx_channel_base  = pi_udma_core_lin_addr_get(spim->rx_channel);
+        spim->tx_channel_base  = pi_udma_core_lin_addr_get(spim->tx_channel);
+        spim->cmd_channel_base = pi_udma_core_lin_addr_get(spim->cmd_channel);
+
+        // Set the reset and clock-gating control bits of the peripheral (pi_spi_close() clears them).
+        udma_ctrl_cfg_rstn_set_set(ARCHI_UDMA_ADDR, (1 << periph_id));
+        udma_ctrl_cfg_cg_set_set(ARCHI_UDMA_ADDR, (1 << periph_id));
+
+        uint32_t event = SOC_EVENT_UDMA_SPIM_EOT(conf->itf);
+        pi_fc_event_handler_set(event, pi_spim_handle_eot, (void *) spim);
+        pi_soc_eu_fc_mask_set(event);
+        udma_spim_rx_dest_set(spim_cs->periph_base, spim->rx_channel);
+        udma_spim_tx_dest_set(spim_cs->periph_base, spim->tx_channel);
+        udma_spim_cmd_dest_set(spim_cs->periph_base, spim->cmd_channel);
+        udma_spim_config_set(spim_cs->periph_base, UDMA_SPIM_CONFIG_SPI_SLAVE_MODE(conf->is_slave));
+    }
+
+    pi_irq_restore(irq);
+    return 0;
+
+error:
+    pi_irq_restore(irq);
+    return -1;
+}
+
+static void __pi_spi_timestamp_enable(pi_spim_cs_t *spim_cs, struct pi_spi_conf *conf)
+{
+    // Writes the interface's RX or TX channel number (selected by conf->ts_ch) into byte
+    // conf->ts_evt_id of the uDMA event configuration and sets its pi_soc_eu_pr mask.
+    pi_spim_t *spim = spim_cs->spim;
+    uint8_t is_rx = conf->ts_ch;
+    uint32_t base = ARCHI_UDMA_ADDR;
+    uint8_t evt_id = conf->ts_evt_id;
+    uint8_t soc_evt = is_rx ? spim->rx_channel : spim->tx_channel;
+    // Unsigned operands: shifting int 0xFF or a promoted uint8_t by 24 would overflow a signed int.
+    uint32_t cfg_evt_val = (udma_ctrl_cfg_event_get(base) & ~(0xFFu << (evt_id * 8))) | ((uint32_t) soc_evt << (evt_id * 8));
+    pi_soc_eu_pr_mask_set(soc_evt);
+    udma_ctrl_cfg_event_set(base, cfg_evt_val);
+}
+
+void pi_spi_ioctl(struct pi_device *device, uint32_t cmd, void *_arg)
+{
+    // Applies the settings encoded in cmd to the chip-select attached to device. When the
+    // timestamp bit is set, _arg is a struct pi_spi_conf * and nothing else is changed;
+    // otherwise _arg carries the new maximum baudrate when the set-baudrate bit is set.
+
+    uint32_t irq = pi_irq_disable();
+    pi_spim_cs_t *desc = (pi_spim_cs_t *) device->data;
+
+    if ((cmd >> __PI_SPI_CTRL_SET_TIMESTAMP) & 1)
+    {
+        __pi_spi_timestamp_enable(desc, (struct pi_spi_conf *) _arg);
+        pi_irq_restore(irq);
+        return;
+    }
+
+    if ((cmd >> __PI_SPI_CTRL_SET_MAX_BAUDRATE_BIT) & 1)
+    {
+        uint32_t baudrate = (uint32_t) _arg;
+
+        desc->max_baudrate = baudrate;
+        desc->div = pi_spim_get_div(baudrate);
+    }
+
+    // Each remaining setting is a 2-bit field: zero leaves the value untouched, any
+    // other value stores the field shifted right by one.
+    int field = (cmd >> __PI_SPI_CTRL_CPOL_BIT) & 3;
+    if (field != 0)
+        desc->polarity = field >> 1;
+
+    field = (cmd >> __PI_SPI_CTRL_CPHA_BIT) & 3;
+    if (field != 0)
+        desc->phase = field >> 1;
+
+    field = (cmd >> __PI_SPI_CTRL_WORDSIZE_BIT) & 3;
+    if (field != 0)
+        desc->wordsize = field >> 1;
+
+    field = (cmd >> __PI_SPI_CTRL_ENDIANNESS_BIT) & 3;
+    if (field != 0)
+        desc->big_endian = field >> 1;
+
+    // Recompute the values derived from the settings above.
+    desc->cfg = SPI_CMD_CFG(desc->div, desc->polarity, desc->phase);
+    desc->byte_align = pi_spim_get_byte_align(desc->wordsize, desc->big_endian);
+
+    pi_irq_restore(irq);
+}
+
+void pi_spi_close(struct pi_device *device)
+{
+    // Closes one chip-select; the interface is shut down when its last chip-select closes.
+    int irq = pi_irq_disable();
+    pi_spim_cs_t *desc = (pi_spim_cs_t *) device->data;
+    pi_spim_t *itf = desc->spim;
+
+    SPIM_TRACE(POS_LOG_INFO, "Closing SPIM device (device: %p)\n", device);
+
+    itf->open_count -= 1;
+    if (itf->open_count == 0)
+    {
+        // Deactivate the three SPIM channels by giving them the 0xFF destination ...
+        udma_spim_rx_dest_set(desc->periph_base, 0xFF);
+        udma_spim_tx_dest_set(desc->periph_base, 0xFF);
+        udma_spim_cmd_dest_set(desc->periph_base, 0xFF);
+
+        // ... and hand them back to the linear channel allocator.
+        pi_udma_core_lin_free(itf->rx_channel);
+        pi_udma_core_lin_free(itf->tx_channel);
+        pi_udma_core_lin_free(itf->cmd_channel);
+
+        // Stop routing the end-of-transfer event of this interface.
+        pi_soc_eu_fc_mask_clear(SOC_EVENT_UDMA_SPIM_EOT(itf->id));
+
+        // Finally reactivate clock-gating and reset for the peripheral
+        // (clock-gating bit first, reset bit second).
+        udma_ctrl_cfg_cg_clr_set(ARCHI_UDMA_ADDR, (1 << desc->channel));
+        udma_ctrl_cfg_rstn_clr_set(ARCHI_UDMA_ADDR, (1 << desc->channel));
+    }
+
+    // The chip-select descriptor is released on every close.
+    pi_l2_free(desc, sizeof(pi_spim_cs_t));
+    pi_irq_restore(irq);
+}
+
+
+
+// Records five request words in the task, then appends the task to the interface's waiting list.
+static void __attribute__((noinline)) pi_spim_enqueue_to_pending(pi_spim_t *spim, pi_task_t *task, uint32_t data0, uint32_t data1, uint32_t data2, uint32_t data3, uint32_t data4)
+{
+    const uint32_t words[5] = { data0, data1, data2, data3, data4 };
+    int idx;
+
+    for (idx = 0; idx < 5; idx++)
+    {
+        task->data[idx] = words[idx];
+    }
+
+    // Tail insertion: a NULL head means the list is empty and the task becomes the head.
+    if (spim->waiting_first == NULL)
+        spim->waiting_first = task;
+    else
+        spim->waiting_last->next = task;
+    spim->waiting_last = task;
+    task->next = NULL;
+}
+
+
+
+// Records eight request words (data0..data7, despite the "_7" suffix) in the task, then
+// appends the task to the interface's waiting list.
+static void __attribute__((noinline)) pi_spim_enqueue_to_pending_7(pi_spim_t *spim, pi_task_t *task,
+    uint32_t data0, uint32_t data1, uint32_t data2, uint32_t data3, uint32_t data4, uint32_t data5, uint32_t data6, uint32_t data7)
+{
+    const uint32_t words[8] = { data0, data1, data2, data3, data4, data5, data6, data7 };
+    int idx;
+
+    for (idx = 0; idx < 8; idx++)
+    {
+        task->data[idx] = words[idx];
+    }
+
+    // Tail insertion: a NULL head means the list is empty and the task becomes the head;
+    // in both cases the task ends up as the new tail.
+    if (spim->waiting_first == NULL)
+        spim->waiting_first = task;
+    else
+        spim->waiting_last->next = task;
+    spim->waiting_last = task;
+    task->next = NULL;
+}
+
+
+
+void pi_spi_send_async(struct pi_device *device, void *data, size_t len, pi_spi_flags_e flags, pi_task_t *task)
+{
+    // Asynchronous send of len bits taken from data.
+    //   device: opened SPI device (device->data is its chip-select descriptor)
+    //   data:   buffer to send, handed to the uDMA TX channel
+    //   len:    transfer length in bits
+    //   flags:  bits [1:0] give the chip-select mode, bits [3:2] equal to 1 select quad SPI
+    //   task:   becomes the pending copy, which pi_spim_handle_eot() pushes when it runs
+    // When another copy is in flight the request is only queued on the waiting list.
+    // Nothing is returned: completion is only reported through task.
+    // Interrupts are disabled for the whole duration of the call.
+
+    int irq = pi_irq_disable();
+
+    SPIM_TRACE(POS_LOG_TRACE, "Sending bitstream (device: %p, buffer: %p, len: 0x%x, flags: 0x%x, task: %p)\n", device, data, len, flags, task);
+
+    pi_spim_cs_t *spim_cs = (pi_spim_cs_t *) device->data;
+    pi_spim_t *spim = spim_cs->spim;
+    int qspi = ((flags >> 2) & 0x3) == 1;
+    int cs_mode = (flags >> 0) & 0x3;
+
+    if (spim->pending_copy)
+    {
+        // Busy: queue the request (kind 0 = send) with the shared helper rather than
+        // duplicating its list handling here; the five task words are unchanged.
+        pi_spim_enqueue_to_pending(spim, task, 0, (int) device, (int) data, len, flags);
+        goto end;
+    }
+
+    spim->pending_copy = task;
+
+    // Number of bytes needed to hold len bits.
+    int buffer_size = (len + 7) / 8;
+    int endianness = spim_cs->big_endian ? SPI_CMD_MSB_FIRST : SPI_CMD_LSB_FIRST;
+
+    // The data channel is enqueued first, then the command stream.
+    pi_udma_core_lin_enqueue(spim->tx_channel_base, (uint32_t) data, buffer_size, 0);
+
+    if (!spim->is_slave)
+    {
+        // Master mode: configuration, start of transfer on the chip-select, the TX data
+        // command counted in words of the configured size, then end of transfer.
+        spim->udma_cmd[0] = spim_cs->cfg;
+        spim->udma_cmd[1] = SPI_CMD_SOT(spim_cs->cs);
+
+        if (spim_cs->wordsize == PI_SPI_WORDSIZE_8)
+        {
+            spim->udma_cmd[2] = SPI_CMD_TX_DATA(len/8, SPI_CMD_4_WORD_PER_TRANSF, 8, qspi, endianness);
+        }
+        else if (spim_cs->wordsize == PI_SPI_WORDSIZE_16)
+        {
+            spim->udma_cmd[2] = SPI_CMD_TX_DATA(len/16, SPI_CMD_2_WORD_PER_TRANSF, 16, qspi, endianness);
+        }
+        else
+        {
+            // Any other word size is handled as 32 bits.
+            spim->udma_cmd[2] = SPI_CMD_TX_DATA(len/32, SPI_CMD_1_WORD_PER_TRANSF, 32, qspi, endianness);
+        }
+
+        // The second EOT argument is set only for PI_SPI_CS_KEEP.
+        spim->udma_cmd[3] = SPI_CMD_EOT(1, cs_mode == PI_SPI_CS_KEEP);
+
+        // Four 32-bit command words.
+        pi_udma_core_lin_enqueue(spim->cmd_channel_base, (uint32_t) spim->udma_cmd, 4*4, 0);
+    }
+    else
+    {
+        // Slave mode: a TX data command sized with len>>3, followed by the slave end of transfer.
+        spim->udma_cmd[0] = SPI_CMD_SLAVE_TX_DATA(len>>3, 0);
+        spim->udma_cmd[1] = SPI_CMD_SLAVE_EOT();
+
+        // Two 32-bit command words.
+        pi_udma_core_lin_enqueue(spim->cmd_channel_base, (uint32_t) spim->udma_cmd, 8, 0);
+    }
+
+end:
+    pi_irq_restore(irq);
+}
+
+// Blocking variant of pi_spi_send_async(): waits on a local task until the send has ended.
+void pi_spi_send(struct pi_device *device, void *data, size_t len, pi_spi_flags_e flags)
+{
+    pi_task_t task;
+    pi_spi_send_async(device, data, len, flags, pi_task_block(&task));
+    pi_task_wait_on(&task);
+}
+
+
+
+void pi_spi_receive_async(struct pi_device *device, void *data, size_t len, pi_spi_flags_e flags, pi_task_t *task)
+{
+    // Asynchronous receive of len bits into data.
+    //   device: opened SPI device (device->data is its chip-select descriptor)
+    //   data:   buffer filled through the uDMA RX channel
+    //   len:    transfer length in bits
+    //   flags:  bits [1:0] give the chip-select mode, bits [3:2] equal to 1 select quad SPI
+    //   task:   becomes the pending copy, which pi_spim_handle_eot() pushes when it runs
+    // When another copy is in flight the request is only queued on the waiting list.
+    SPIM_TRACE(POS_LOG_TRACE, "Receive bitstream (device: %p, buffer: %p, len: 0x%x, flags: 0x%x, task: %p)\n", device, data, len, flags, task);
+
+    int irq = pi_irq_disable();
+
+    pi_spim_cs_t *spim_cs = (pi_spim_cs_t *)device->data;
+    pi_spim_t *spim = spim_cs->spim;
+    int qspi = ((flags >> 2) & 0x3) == 1;
+    int cs_mode = (flags >> 0) & 0x3;
+
+    if (spim->pending_copy)
+    {
+        // Busy: queue the request (kind 1 = receive) with the shared helper rather than
+        // duplicating its list handling here; the five task words are unchanged.
+        pi_spim_enqueue_to_pending(spim, task, 1, (int) device, (int) data, len, flags);
+        goto end;
+    }
+
+    spim->pending_copy = task;
+
+    // Number of bytes needed to hold len bits.
+    int size = (len + 7) >> 3;
+    int endianness = spim_cs->big_endian ? SPI_CMD_MSB_FIRST : SPI_CMD_LSB_FIRST;
+
+    pi_udma_core_lin_enqueue(spim->rx_channel_base, (uint32_t) data, size, 0);
+
+    if (!spim->is_slave)
+    {
+        // Master mode: configuration, start of transfer on the chip-select, the RX data
+        // command counted in words of the configured size, then end of transfer.
+        spim->udma_cmd[0] = spim_cs->cfg;
+        spim->udma_cmd[1] = SPI_CMD_SOT(spim_cs->cs);
+
+        if (spim_cs->wordsize == PI_SPI_WORDSIZE_8)
+        {
+            spim->udma_cmd[2] = SPI_CMD_RX_DATA(len/8, SPI_CMD_4_WORD_PER_TRANSF, 8, qspi, endianness);
+        }
+        else if (spim_cs->wordsize == PI_SPI_WORDSIZE_16)
+        {
+            spim->udma_cmd[2] = SPI_CMD_RX_DATA(len/16, SPI_CMD_2_WORD_PER_TRANSF, 16, qspi, endianness);
+        }
+        else
+        {
+            // Any other word size is handled as 32 bits.
+            spim->udma_cmd[2] = SPI_CMD_RX_DATA(len/32, SPI_CMD_1_WORD_PER_TRANSF, 32, qspi, endianness);
+        }
+
+        // The second EOT argument is set only for PI_SPI_CS_KEEP.
+        spim->udma_cmd[3] = SPI_CMD_EOT(1, cs_mode == PI_SPI_CS_KEEP);
+
+        // Four 32-bit command words.
+        pi_udma_core_lin_enqueue(spim->cmd_channel_base, (uint32_t) spim->udma_cmd, 4*4, 0);
+    }
+    else
+    {
+        // Slave mode: an RX data command sized with len>>3, followed by the slave end of transfer.
+        spim->udma_cmd[0] = SPI_CMD_SLAVE_RX_DATA(len>>3, 0);
+        spim->udma_cmd[1] = SPI_CMD_SLAVE_EOT();
+
+        // Two 32-bit command words.
+        pi_udma_core_lin_enqueue(spim->cmd_channel_base, (uint32_t) spim->udma_cmd, 8, 0);
+    }
+
+end:
+    pi_irq_restore(irq);
+}
+
+void pi_spi_receive(struct pi_device *device, void *data, size_t len, pi_spi_flags_e flags)
+{
+    pi_task_t task;  // local task used to wait for the asynchronous receive started below
+    pi_spi_receive_async(device, data, len, flags, pi_task_block(&task));
+    pi_task_wait_on(&task);
+}
+
+
+void pi_spi_transfer_async(struct pi_device *device, void *tx_data, void *rx_data, size_t len, pi_spi_flags_e flags, pi_task_t *task)
+{
+    // Full-duplex asynchronous transfer: len bits are sent from tx_data while len bits are
+    // received into rx_data. Only bits [1:0] of flags (chip-select mode) are used here.
+    // If a copy is already in flight, the request is recorded in the task (kind 2) and
+    // appended to the waiting list; otherwise the uDMA channels are programmed at once
+    // and task becomes the pending copy.
+    SPIM_TRACE(POS_LOG_TRACE, "Transfering bitstream (device: %p, tx_buffer: %p, rx_buffer: %p, len: 0x%x, flags: 0x%x, task: %p)\n", device, tx_data, rx_data, len, flags, task);
+
+    int irq = pi_irq_disable();
+
+    pi_spim_cs_t *desc = (pi_spim_cs_t *)device->data;
+    pi_spim_t *itf = desc->spim;
+    int cs_mode = (flags >> 0) & 0x3;
+
+    if (itf->pending_copy != NULL)
+    {
+        // Note that the chip-select mode, not the raw flags, is stored in the last word.
+        task->data[0] = 2;
+        task->data[1] = (int)device;
+        task->data[2] = (int)tx_data;
+        task->data[3] = (int)rx_data;
+        task->data[4] = len;
+        task->data[5] = cs_mode;
+
+        if (itf->waiting_first == NULL)
+            itf->waiting_first = task;
+        else
+            itf->waiting_last->next = task;
+        itf->waiting_last = task;
+        task->next = NULL;
+    }
+    else
+    {
+        int bit_order = desc->big_endian ? SPI_CMD_MSB_FIRST : SPI_CMD_LSB_FIRST;
+        int nb_bytes = (len + 7) >> 3;
+
+        itf->pending_copy = task;
+
+        // Both data channels move the same number of bytes.
+        pi_udma_core_lin_enqueue(itf->rx_channel_base, (uint32_t) rx_data, nb_bytes, 0);
+        pi_udma_core_lin_enqueue(itf->tx_channel_base, (uint32_t) tx_data, nb_bytes, 0);
+
+        if (itf->is_slave)
+        {
+            // Slave mode: two command words.
+            itf->udma_cmd[0] = SPI_CMD_SLAVE_FUL(len>>3, 0);
+            itf->udma_cmd[1] = SPI_CMD_SLAVE_EOT();
+            pi_udma_core_lin_enqueue(itf->cmd_channel_base, (uint32_t) itf->udma_cmd, 8, 0);
+        }
+        else
+        {
+            // Master mode: configuration, start of transfer, full-duplex data command
+            // counted in words of the configured size, then end of transfer.
+            itf->udma_cmd[0] = desc->cfg;
+            itf->udma_cmd[1] = SPI_CMD_SOT(desc->cs);
+
+            switch (desc->wordsize)
+            {
+                case PI_SPI_WORDSIZE_8:
+                    itf->udma_cmd[2] = SPI_CMD_FUL(len/8, SPI_CMD_4_WORD_PER_TRANSF, 8, bit_order);
+                    break;
+                case PI_SPI_WORDSIZE_16:
+                    itf->udma_cmd[2] = SPI_CMD_FUL(len/16, SPI_CMD_2_WORD_PER_TRANSF, 16, bit_order);
+                    break;
+                default:
+                    itf->udma_cmd[2] = SPI_CMD_FUL(len/32, SPI_CMD_1_WORD_PER_TRANSF, 32, bit_order);
+                    break;
+            }
+
+            itf->udma_cmd[3] = SPI_CMD_EOT(1, cs_mode == PI_SPI_CS_KEEP);
+            pi_udma_core_lin_enqueue(itf->cmd_channel_base, (uint32_t) itf->udma_cmd, 4*4, 0);
+        }
+    }
+
+    pi_irq_restore(irq);
+}
+
+void pi_spi_transfer(struct pi_device *device, void *tx_data, void *rx_data,
+                     size_t len, pi_spi_flags_e flags)
+{
+    pi_task_t task;  // local task used to wait for the asynchronous transfer started below
+    pi_spi_transfer_async(device, tx_data, rx_data, len, flags, pi_task_block(&task));
+    pi_task_wait_on(&task);
+}
+
+void pi_spi_copy_2d(struct pi_device *device, uint32_t addr, void *data,
+                    uint32_t size, uint32_t stride, uint32_t length, pi_spi_flags_e flags)
+{
+    pi_task_t task;  // local task used to wait for the asynchronous 2D copy started below
+    pi_spi_copy_2d_async(device, addr, data, size, stride, length, flags, pi_task_block(&task));
+    pi_task_wait_on(&task);
+}
+
+// Programs the next chunk of a send copy. Also installed as handler of the end-of-transfer event: event is unused, arg is the pi_spim_cs_t of the copy.
+static __attribute__((noinline)) void pi_spim_send_enqueue_transfer(uint32_t event, void *arg)
+{
+    pi_spim_cs_t *spim_cs = (pi_spim_cs_t *) arg;
+    pi_spim_t *spim = (pi_spim_t *) spim_cs->spim;
+
+    uint32_t addr = spim->pending_repeat_misaligned_addr;
+    uint32_t size = spim->pending_repeat_misaligned_size;
+    uint32_t chunk_size = spim_cs->max_snd_size;
+
+    uint32_t cmd_size;
+    uint32_t *cmd;
+
+    SPIM_TRACE(POS_LOG_TRACE, "Enqueueing send transfer (addr: 0x%lx, ram_addr: 0x%x, pending size: 0x%lx)\n", addr, spim->pending_repeat_misaligned_ram_addr, size);
+    // Copy the current ram address into the address ucode (presumably located inside the udma_send_cmd sequence -- confirm).
+    memcpy(spim_cs->send_addr_ucode, &spim->pending_repeat_misaligned_ram_addr, spim_cs->send_addr_ucode_size);
+    // NOTE(review): pi_spi_open() leaves udma_send_cmd NULL and does not set send_addr_ucode; both must be
+    // initialised elsewhere (not visible here) before any copy is started -- confirm.
+
+    cmd_size = spim_cs->udma_send_cmd_size;
+    cmd = spim_cs->udma_send_cmd;
+    // By default this function handles the next end-of-transfer event again.
+    void *callback = pi_spim_send_enqueue_transfer;
+
+    if (size > chunk_size)
+    {
+        size = chunk_size;
+    }
+
+    SPIM_TRACE(POS_LOG_TRACE, "Enqueueing aligned send chunk (addr: 0x%x, size: 0x%lx)\n", spim->pending_repeat_misaligned_addr, size);
+    // Account for the chunk about to be transferred (addr above still holds its start).
+    spim->pending_repeat_misaligned_ram_addr += size;
+    spim->pending_repeat_misaligned_addr += size;
+    spim->pending_repeat_misaligned_size -= size;
+    // Append data + EOT commands after the existing words. NOTE(review): the size field is size-1 here but size*8-1 in the receive path -- confirm the unit.
+    cmd[cmd_size++] = __BITINSERT(spim->tx_cmd, size-1, SPI_CMD_TX_DATA_SIZE_WIDTH, SPI_CMD_TX_DATA_SIZE_OFFSET);
+    cmd[cmd_size++] = SPI_CMD_EOT(1, 0);
+    // Current row finished: move on to the next 2D row if any, otherwise let pi_spim_handle_eot() end the copy.
+    if (spim->pending_repeat_misaligned_size == 0)
+    {
+        if (spim->pending_repeat_misaligned_2d_size > 0)
+        {
+            uint32_t size = spim->pending_repeat_misaligned_length;
+
+            if (size > spim->pending_repeat_misaligned_2d_size)
+            {
+                size = spim->pending_repeat_misaligned_2d_size;
+            }
+
+            spim->pending_repeat_misaligned_2d_size -= size;
+
+            spim->pending_repeat_misaligned_size = size;
+            spim->pending_repeat_misaligned_ram_addr = spim->pending_repeat_misaligned_ram_addr - spim->pending_repeat_misaligned_length + spim->pending_repeat_misaligned_stride;
+        }
+        else
+        {
+            callback = pi_spim_handle_eot;
+        }
+    }
+    // Install the handler chosen above before enqueueing on the uDMA channels.
+    pi_fc_event_handler_set_func(SOC_EVENT_UDMA_SPIM_EOT(spim->id), callback);
+
+    pi_udma_core_lin_enqueue(spim->cmd_channel_base, (uint32_t) cmd, cmd_size*4, 0);
+    pi_udma_core_lin_enqueue(spim->tx_channel_base, (uint32_t) addr, size, 0);
+}
+
+
+
+static void __attribute__((noinline)) pi_spim_send_handle_misaligned(pi_spim_cs_t *spim_cs, uint32_t addr, uint32_t data, uint32_t size, pi_spim_t *spim)
+{
+    // Starts a 1D send copy: records the transfer state in the interface, makes the
+    // chip-select the argument of the end-of-transfer handler and programs the first chunk.
+    SPIM_TRACE(POS_LOG_TRACE, "Handling SPIM chunk (addr: 0x%lx, size: 0x%lx)\n", addr, size);
+    spim->pending_repeat_misaligned_2d_size = 0;   // 1D copy: no 2D remainder
+    spim->pending_repeat_misaligned_size = size;
+    spim->pending_repeat_misaligned_addr = data;
+    spim->pending_repeat_misaligned_ram_addr = addr;
+
+    pi_fc_event_handler_set_args(SOC_EVENT_UDMA_SPIM_EOT(spim->id), spim_cs);
+    pi_spim_send_enqueue_transfer(0, spim_cs);
+}
+
+
+static void __attribute__((noinline)) pi_spim_send_handle_misaligned_2d(pi_spim_cs_t *spim_cs, uint32_t addr, uint32_t data, uint32_t size, uint32_t stride, uint32_t length, pi_spim_t *spim)
+{
+    // Starts a 2D send copy: the first row is min(size, length), the rest of size is kept
+    // as 2D remainder; then the first chunk is programmed.
+    SPIM_TRACE(POS_LOG_TRACE, "Sending SPIM 2D chunk (addr: 0x%lx, data: 0x%lx, size: 0x%lx, stride: 0x%lx, length: 0x%lx)\n", addr, data, size, stride, length);
+    int first_row = length;
+    if (size <= length) first_row = size;
+
+    spim->pending_repeat_misaligned_stride = stride;
+    spim->pending_repeat_misaligned_length = length;
+    spim->pending_repeat_misaligned_2d_size = size - first_row;
+    spim->pending_repeat_misaligned_size = first_row;
+    spim->pending_repeat_misaligned_addr = data;
+    spim->pending_repeat_misaligned_ram_addr = addr;
+    pi_fc_event_handler_set_args(SOC_EVENT_UDMA_SPIM_EOT(spim->id), spim_cs);
+    pi_spim_send_enqueue_transfer(0, spim_cs);
+}
+
+static __attribute__((noinline)) void pi_spim_receive_enqueue_transfer(uint32_t event, void *arg)
+{
+    pi_spim_cs_t *spim_cs = (pi_spim_cs_t *) arg;
+    pi_spim_t *spim = (pi_spim_t *) spim_cs->spim;
+    // Programs the next chunk of a receive copy (event is unused). First install the handler chosen by the previous call.
+    pi_fc_event_handler_set_func(SOC_EVENT_UDMA_SPIM_EOT(spim->id), spim->pending_callback);
+
+    uint32_t addr = spim->pending_repeat_misaligned_addr;
+    uint32_t size = spim->pending_repeat_misaligned_size;
+    uint32_t chunk_size = spim_cs->max_rcv_size;
+    // Nothing left to program.
+    if (size == 0)
+    {
+        return;
+    }
+
+    uint32_t cmd_size;
+    uint32_t *cmd;
+
+    SPIM_TRACE(POS_LOG_TRACE, "Enqueueing receive transfer (addr: 0x%lx, ram_addr: 0x%x, pending size: 0x%lx)\n", addr, spim->pending_repeat_misaligned_ram_addr, size);
+    // Copy the current ram address into the address ucode (presumably located inside the udma_receive_cmd sequence -- confirm).
+    memcpy(spim_cs->receive_addr_ucode, &spim->pending_repeat_misaligned_ram_addr, spim_cs->receive_addr_ucode_size);
+
+    cmd_size = spim_cs->udma_receive_cmd_size;
+    cmd = spim_cs->udma_receive_cmd;
+    // By default this function is also the handler installed by the next call.
+    spim->pending_callback = pi_spim_receive_enqueue_transfer;
+
+    if (size > chunk_size)
+    {
+        size = chunk_size;
+    }
+    else
+    {
+        size &= ~0x3;
+    }
+    // NOTE(review): the rounding above leaves a 1..3 byte remainder pending (and yields size 0 when fewer than 4 bytes remain) -- presumably sizes are multiples of 4; confirm.
+    SPIM_TRACE(POS_LOG_TRACE, "Enqueueing aligned receive chunk (addr: 0x%x, size: 0x%lx)\n", spim->pending_repeat_misaligned_addr, size);
+
+    spim->pending_repeat_misaligned_ram_addr += size;
+    spim->pending_repeat_misaligned_addr += size;
+    spim->pending_repeat_misaligned_size -= size;
+    // Append data + EOT commands after the existing words. NOTE(review): the size field is size*8-1 here but size-1 in the send path -- confirm the unit.
+    cmd[cmd_size++] = __BITINSERT(spim->rx_cmd, size*8-1, SPI_CMD_RX_DATA_SIZE_WIDTH, SPI_CMD_RX_DATA_SIZE_OFFSET);
+    cmd[cmd_size++] = SPI_CMD_EOT(1, 0);
+    // Current row finished: move on to the next 2D row if any, otherwise pi_spim_handle_eot() becomes the pending handler.
+    if (spim->pending_repeat_misaligned_size == 0)
+    {
+        if (spim->pending_repeat_misaligned_2d_size > 0)
+        {
+            uint32_t size = spim->pending_repeat_misaligned_length;
+
+            if (size > spim->pending_repeat_misaligned_2d_size)
+            {
+                size = spim->pending_repeat_misaligned_2d_size;
+            }
+
+            spim->pending_repeat_misaligned_2d_size -= size;
+
+            spim->pending_repeat_misaligned_size = size;
+            spim->pending_repeat_misaligned_ram_addr = spim->pending_repeat_misaligned_ram_addr - spim->pending_repeat_misaligned_length + spim->pending_repeat_misaligned_stride;
+        }
+        else
+        {
+            spim->pending_callback = pi_spim_handle_eot;
+        }
+    }
+
+    // addr and size still describe the chunk accounted for above.
+    pi_udma_core_lin_enqueue(spim->rx_channel_base, (uint32_t) addr, size, 0);
+    pi_udma_core_lin_enqueue(spim->cmd_channel_base, (uint32_t) cmd, cmd_size*4, 0);
+}
+
+
+
+static void __attribute__((noinline)) pi_spim_receive_handle_misaligned(pi_spim_cs_t *spim_cs, uint32_t addr, uint32_t data, uint32_t size, pi_spim_t *spim)
+{
+    // Starts a 1D receive copy: records the transfer state, makes the chip-select the
+    // argument of the end-of-transfer handler and programs up to two chunks.
+    SPIM_TRACE(POS_LOG_TRACE, "Receiving SPIM chunk (addr: 0x%lx, size: 0x%lx)\n", addr, size);
+
+    spim->pending_repeat_misaligned_2d_size = 0;   // 1D copy: no 2D remainder
+    spim->pending_repeat_misaligned_size = size;
+    spim->pending_repeat_misaligned_addr = data;
+    spim->pending_repeat_misaligned_ram_addr = addr;
+    pi_fc_event_handler_set_args(SOC_EVENT_UDMA_SPIM_EOT(spim->id), spim_cs);
+    pi_spim_receive_enqueue_transfer(0, spim_cs);
+
+    // Nothing left after the first chunk: only install the pending handler; else program a second chunk.
+    if (spim->pending_repeat_misaligned_size == 0)
+    {
+        pi_fc_event_handler_set_func(SOC_EVENT_UDMA_SPIM_EOT(spim->id), spim->pending_callback);
+    }
+    else
+    {
+        pi_spim_receive_enqueue_transfer(0, spim_cs);
+    }
+}
+
+
+
+static void __attribute__((noinline)) pi_spim_receive_handle_misaligned_2d(
+    pi_spim_cs_t *spim_cs, uint32_t addr, uint32_t data, uint32_t size,
+    uint32_t stride, uint32_t length, pi_spim_t *spim)
+{
+    // Starts a 2D receive copy: the first row is min(size, length), the rest of size is
+    // kept as 2D remainder; then up to two chunks are programmed.
+    SPIM_TRACE(POS_LOG_TRACE, "Receiving SPIM 2D chunk (addr: 0x%lx, data: 0x%lx, size: 0x%lx, stride: 0x%lx, length: 0x%lx)\n", addr, data, size, stride, length);
+    int first_row = length;
+    if (size <= length) first_row = size;
+    spim->pending_repeat_misaligned_stride = stride;
+    spim->pending_repeat_misaligned_length = length;
+    spim->pending_repeat_misaligned_2d_size = size - first_row;
+    spim->pending_repeat_misaligned_size = first_row;
+    spim->pending_repeat_misaligned_addr = data;
+    spim->pending_repeat_misaligned_ram_addr = addr;
+    pi_fc_event_handler_set_args(SOC_EVENT_UDMA_SPIM_EOT(spim->id), spim_cs);
+    pi_spim_receive_enqueue_transfer(0, spim_cs);
+
+    // Nothing left after the first chunk: only install the pending handler; else program a second chunk.
+    if (spim->pending_repeat_misaligned_size == 0)
+    {
+        pi_fc_event_handler_set_func(SOC_EVENT_UDMA_SPIM_EOT(spim->id), spim->pending_callback);
+    }
+    else
+    {
+        pi_spim_receive_enqueue_transfer(0, spim_cs);
+    }
+}
+
+
+void pi_spi_copy_2d_async(struct pi_device *device, uint32_t addr, void *data,
+                          uint32_t size, uint32_t stride, uint32_t length,
+                          pi_spi_flags_e flags, pi_task_t *task)
+{
+    // Starts a 2D copy between addr and the buffer data, or queues it (kind 4) when a copy is
+    // already in flight. Flag bit 4 (ext2loc) selects the receive path; the trace prints that same bit.
+    SPIM_TRACE(POS_LOG_DEBUG, "Copy 2D bitstream (device: %p, ext2loc: %d, addr: 0x%lx, buffer: %p, size: 0x%lx, stride: 0x%lx, length: 0x%lx, flags: 0x%x, task: %p)\n", device, __BITEXTRACT(flags, 1, 4), addr, data, size, stride, length, flags, task);
+
+    pi_spim_cs_t *spim_cs = (pi_spim_cs_t *) device->data;
+    pi_spim_t *spim = spim_cs->spim;
+
+    uint32_t irq = pi_irq_disable();
+
+    if (likely(!spim->pending_copy))
+    {
+        int qspi = __BITEXTRACT(flags, 2, 2) == 1;
+        int cs_mode = __BITEXTRACT(flags, 2, 0);
+        int ext2loc = __BITEXTRACT(flags, 1, 4);
+
+        spim->pending_copy = task;
+        spim->pending_is_auto = cs_mode == PI_SPI_CS_AUTO;
+        // The data command template of the chip-select is completed with the quad-SPI bit.
+        if (ext2loc)
+        {
+            spim->rx_cmd = __BITINSERT(spim_cs->rx_cmd, qspi, SPI_CMD_RX_DATA_QPI_WIDTH, SPI_CMD_RX_DATA_QPI_OFFSET);
+            pi_spim_receive_handle_misaligned_2d(spim_cs, addr, (uint32_t) data, size, stride, length, spim);
+        }
+        else
+        {
+            spim->tx_cmd = __BITINSERT(spim_cs->tx_cmd, qspi, SPI_CMD_TX_DATA_QPI_WIDTH, SPI_CMD_TX_DATA_QPI_OFFSET);
+            pi_spim_send_handle_misaligned_2d(spim_cs, addr, (uint32_t) data, size, stride, length, spim);
+        }
+    }
+    else
+    {
+        pi_spim_enqueue_to_pending_7(spim, task, 4, (int) device, addr, (int) data, size, stride, length, flags);
+    }
+    pi_irq_restore(irq);
+}
+
+
+
+void pi_spi_copy(struct pi_device *device, uint32_t addr, void *data, uint32_t size, pi_spi_flags_e flags)
+{
+    /* Blocking wrapper: start the asynchronous copy on a stack-allocated task, then wait for it. */
+    pi_task_t done;
+    pi_spi_copy_async(device, addr, data, size, flags, pi_task_block(&done));
+    pi_task_wait_on(&done);
+}
+
+
+
+void pi_spi_copy_async(struct pi_device *device, uint32_t addr, void *data,
+                       uint32_t size, pi_spi_flags_e flags, pi_task_t *task)
+{
+    SPIM_TRACE(POS_LOG_DEBUG, "Copy bitstream (device: %p, ext2loc: %d, addr: 0x%lx, buffer: %p, size: 0x%lx, flags: 0x%x, task: %p)\n", device, __BITEXTRACT(flags, 1, 4), addr, data, size, flags, task);
+
+    pi_spim_cs_t *cs = (pi_spim_cs_t *) device->data;
+    pi_spim_t *itf = cs->spim;
+
+    /* The pending-copy slot is inspected and claimed with interrupts disabled. */
+    uint32_t irq_state = pi_irq_disable();
+    if (likely(!itf->pending_copy))
+    {
+        /* Interface idle: this request becomes the pending copy and is started right away. */
+        const int to_local = __BITEXTRACT(flags, 1, 4);
+        const int quad = (__BITEXTRACT(flags, 2, 2) == 1);
+
+        itf->pending_copy = task;
+        itf->pending_is_auto = (__BITEXTRACT(flags, 2, 0) == PI_SPI_CS_AUTO);
+        if (!to_local)
+        {
+            itf->tx_cmd = __BITINSERT(cs->tx_cmd, quad, SPI_CMD_TX_DATA_QPI_WIDTH, SPI_CMD_TX_DATA_QPI_OFFSET);
+            pi_spim_send_handle_misaligned(cs, addr, (uint32_t) data, size, itf);
+        }
+        else
+        {
+            itf->rx_cmd = __BITINSERT(cs->rx_cmd, quad, SPI_CMD_RX_DATA_QPI_WIDTH, SPI_CMD_RX_DATA_QPI_OFFSET);
+            pi_spim_receive_handle_misaligned(cs, addr, (uint32_t) data, size, itf);
+        }
+    }
+    else
+    {
+        /* Interface busy: keep the arguments in the task (request kind 3) for later. */
+        pi_spim_enqueue_to_pending_7(itf, task, 3, (int) device, addr, (int) data, size, flags, 0, 0);
+    }
+
+    pi_irq_restore(irq_state);
+}
+
+
+
+void *pi_spi_receive_ucode_set(struct pi_device *device, uint8_t *ucode,
+                               uint32_t ucode_size)
+{
+    pi_spim_cs_t *cs = (pi_spim_cs_t *) device->data;
+
+    /* Drop the command buffer installed by a previous call, if any. */
+    if (cs->udma_receive_cmd != NULL)
+    {
+        pi_l2_free(cs->udma_receive_cmd, (cs->udma_receive_cmd_size + 2)*4);
+    }
+
+    /* New buffer: the micro-code plus 16 extra bytes (4*4). */
+    cs->udma_receive_cmd = pi_l2_malloc(ucode_size + 4*4);
+    if (cs->udma_receive_cmd != NULL)
+    {
+        pi_spim_apply_conf(cs);
+        /* The caller's micro-code is copied after the first two entries of the buffer. */
+        memcpy(&cs->udma_receive_cmd[2], ucode, ucode_size);
+        cs->udma_receive_cmd_size = 2 + (ucode_size >> 2);
+        return (void *)&cs->udma_receive_cmd[2];
+    }
+    return NULL;
+}
+
+
+
+void pi_spi_receive_ucode_set_addr_info(struct pi_device *device, uint8_t *ucode,
+                                        uint32_t ucode_size)
+{
+    /* Only records the pointer and size in the chip-select descriptor; nothing is copied. */
+    pi_spim_cs_t *cs = (pi_spim_cs_t *) device->data;
+    cs->receive_addr_ucode_size = ucode_size;
+    cs->receive_addr_ucode = ucode;
+}
+
+
+
+void *pi_spi_send_ucode_set(struct pi_device *device, uint8_t *ucode, uint32_t ucode_size)
+{
+    pi_spim_cs_t *cs = (pi_spim_cs_t *)device->data;
+
+    /* Drop the command buffer installed by a previous call, if any. */
+    if (cs->udma_send_cmd != NULL)
+    {
+        pi_l2_free(cs->udma_send_cmd, (cs->udma_send_cmd_size + 2)*4);
+    }
+
+    /* New buffer: the micro-code plus 16 extra bytes (4*4). */
+    cs->udma_send_cmd = pi_l2_malloc(ucode_size + 4*4);
+    if (cs->udma_send_cmd != NULL)
+    {
+        pi_spim_apply_conf(cs);
+        /* The caller's micro-code is copied after the first two entries of the buffer. */
+        memcpy(&cs->udma_send_cmd[2], ucode, ucode_size);
+        cs->udma_send_cmd_size = 2 + (ucode_size >> 2);
+        return (void *)&cs->udma_send_cmd[2];
+    }
+    return NULL;
+}
+
+
+
+void pi_spi_send_ucode_set_addr_info(struct pi_device *device, uint8_t *ucode,
+                                     uint32_t ucode_size)
+{
+    /* Only records the pointer and size in the chip-select descriptor; nothing is copied. */
+    pi_spim_cs_t *cs = (pi_spim_cs_t *)device->data;
+    cs->send_addr_ucode_size = ucode_size;
+    cs->send_addr_ucode = ucode;
+}
+
+
+
+void pi_spim_handle_waiting_copy(pi_task_t *task)
+{
+    /*
+     * Replay a request that was queued while the interface was busy.
+     * data[0] is the request kind, data[1] the device and data[2..] the call arguments.
+     * NOTE(review): kinds 3 (copy) and 4 (2D copy) assume pi_spim_enqueue_to_pending_7()
+     * stores its arguments in order in task->data[0..7] - confirm against the helper.
+     */
+    struct pi_device *device = (struct pi_device *)task->data[1];
+    switch (task->data[0])
+    {
+        case 0: pi_spi_send_async(device, (void *)task->data[2], task->data[3], task->data[4], task); break;
+        case 1: pi_spi_receive_async(device, (void *)task->data[2], task->data[3], task->data[4], task); break;
+        case 3: pi_spi_copy_async(device, task->data[2], (void *)task->data[3], task->data[4], task->data[5], task); break;
+        case 4: pi_spi_copy_2d_async(device, task->data[2], (void *)task->data[3], task->data[4], task->data[5], task->data[6], task->data[7], task); break;
+        default: pi_spi_transfer_async(device, (void *)task->data[2], (void *)task->data[3], task->data[4], task->data[5], task); break;
+    }
+}
+
+void pi_spi_conf_init(struct pi_spi_conf *conf)
+{   /* Fill 'conf' with the driver defaults; the caller overrides what it needs afterwards. */
+    conf->itf = 0;
+    conf->cs = -1;
+    conf->is_slave = 0;
+    conf->max_baudrate = 10000000;
+    conf->wordsize = PI_SPI_WORDSIZE_8;
+    conf->big_endian = 0;
+    conf->polarity = 0;
+    conf->phase = 0;
+    conf->max_rcv_chunk_size = -1;
+    conf->max_snd_chunk_size = -1;
+}
+
+#if !defined(__FREERTOS__)
+static void __attribute__((constructor)) pi_spim_init()
+{
+    for (int itf = 0; itf < ARCHI_UDMA_NB_SPIM; ++itf)  /* every interface starts closed and idle */
+    {
+        g_spim_data[itf].id = itf;
+        g_spim_data[itf].open_count = 0;
+        g_spim_data[itf].waiting_first = NULL;
+        g_spim_data[itf].pending_copy = NULL;
+    }
+}
+#endif  /* !__FREERTOS__ */
+
+#ifdef __ZEPHYR__
+
+#include <zephyr.h>
+#include <device.h>
+#include <init.h>
+/* Zephyr init hook: resets the SPIM driver state through pi_spim_init() and always reports success. */
+static int spi_init(struct device *device)
+{
+    ARG_UNUSED(device);
+
+    pi_spim_init();
+
+    return 0;
+}
+/* Empty placeholder types and objects, passed to DEVICE_INIT() below. */
+struct spi_config {
+};
+
+struct spi_data {
+};
+
+static const struct spi_config spi_cfg = {
+};
+
+static struct spi_data spi_data = {
+};
+/* Register the "spi" device with spi_init as init function, at the PRE_KERNEL_2 level. */
+DEVICE_INIT(spi, "spi", &spi_init,
+        &spi_data, &spi_cfg,
+        PRE_KERNEL_2, CONFIG_KERNEL_INIT_PRIORITY_DEVICE);
+
+#endif
diff --git a/rtos/pmsis/pmsis_implem/chips/gap9/drivers/spim/spim_v4.h b/rtos/pmsis/pmsis_implem/chips/gap9/drivers/spim/spim_v4.h
new file mode 100644
index 000000000..8a6d56e7c
--- /dev/null
+++ b/rtos/pmsis/pmsis_implem/chips/gap9/drivers/spim/spim_v4.h
@@ -0,0 +1,206 @@
+/*
+ * Copyright (C) 2019 GreenWaves Technologies
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __ARCHI_UDMA_SPIM_SPIM_V4_H__
+#define __ARCHI_UDMA_SPIM_SPIM_V4_H__
+// SPI master (v4) command words: the command id sits at SPI_CMD_ID_OFFSET, the fields below it.
+// SPI commands IDS definition
+#define SPI_CMD_CFG_ID       0
+#define SPI_CMD_SOT_ID       1
+#define SPI_CMD_SEND_CMD_ID  2
+#define SPI_CMD_SEND_BITS_ID 2
+#define SPI_CMD_SEND_ADDR_ID 3
+#define SPI_CMD_DUMMY_ID     4
+#define SPI_CMD_WAIT_ID      5
+#define SPI_CMD_TX_DATA_ID   6
+#define SPI_CMD_RX_DATA_ID   7
+#define SPI_CMD_RPT_ID       8
+#define SPI_CMD_EOT_ID       9
+#define SPI_CMD_RPT_END_ID   10
+#define SPI_CMD_RX_CHECK_ID  11
+#define SPI_CMD_FUL_ID       12
+
+// SPI command fields offset, mask, value definition
+// SPI commands fields offsets
+#define SPI_CMD_ID_OFFSET       28
+
+// COMMON definitions
+#define SPI_CMD_QPI_ENA   1
+#define SPI_CMD_QPI_DIS   0
+#define SPI_CMD_LSB_FIRST  1
+#define SPI_CMD_MSB_FIRST  0
+#define SPI_CMD_4_WORD_PER_TRANSF 2
+#define SPI_CMD_2_WORD_PER_TRANSF 1
+#define SPI_CMD_1_WORD_PER_TRANSF 0
+#define SPI_CMD_DATA_WITDH(val) (val) // (sic) misspelled name kept for source compatibility
+#define SPI_CMD_CMD_SIZE(val) (val)
+
+// CFG
+#define SPI_CMD_CFG_CLK_DIV_OFFSET      0
+#define SPI_CMD_CFG_CLK_DIV_WIDTH   8
+#define SPI_CMD_CFG_CPHA_OFFSET         8
+#define SPI_CMD_CFG_CPOL_OFFSET       9
+
+#define SPI_CMD_CFG_CLKDIV(val) (val)
+#define SPI_CMD_CFG_CPOL_POS  1
+#define SPI_CMD_CFG_CPOL_NEG  0
+#define SPI_CMD_CFG_CPHA_STD  1
+#define SPI_CMD_CFG_CPHA_OPP  0
+
+// SOT
+#define SPI_CMD_SOT_CS_OFFSET    0
+#define SPI_CMD_SOT_CS_WIDTH     2
+
+#define SPI_CMD_SOT_CS0   0
+#define SPI_CMD_SOT_CS1   1
+#define SPI_CMD_SOT_CS2   2
+#define SPI_CMD_SOT_CS3   3
+
+// SEND_CMD
+#define SPI_CMD_SEND_CMD_CMD_OFFSET   0
+#define SPI_CMD_SEND_CMD_CMD_WIDTH    16
+#define SPI_CMD_SEND_CMD_SIZE_OFFSET  16
+#define SPI_CMD_SEND_CMD_SIZE_WIDTH   4
+#define SPI_CMD_SEND_CMD_QPI_OFFSET   27
+
+// SEND_BITS (same command id and field layout as SEND_CMD)
+#define SPI_CMD_SEND_BITS_BITS_OFFSET   0
+#define SPI_CMD_SEND_BITS_BITS_WIDTH    16
+#define SPI_CMD_SEND_BITS_SIZE_OFFSET  16
+#define SPI_CMD_SEND_BITS_SIZE_WIDTH   4
+#define SPI_CMD_SEND_BITS_QPI_OFFSET   27
+
+// SEND_ADDR
+#define SPI_CMD_SEND_ADDR_SIZE_OFFSET   16
+#define SPI_CMD_SEND_ADDR_SIZE_WIDTH     5
+#define SPI_CMD_SEND_ADDR_QPI_OFFSET    27
+
+//#define SPI_CMD_SEND_ADDR_VALUE(value)  ((((value) & 0xff000000) >> 24) | (((value) & 0xff0000) >> 8) | (((value) & 0xff00) << 8) | (((value) & 0xff) << 24))
+#define SPI_CMD_SEND_ADDR_VALUE(value)  (value)
+
+
+// SEND_DUMMY
+#define SPI_CMD_DUMMY_CYCLE_OFFSET      16
+#define SPI_CMD_DUMMY_CYCLE_WIDTH        5
+
+// TX_DATA
+#define SPI_CMD_TX_DATA_SIZE_OFFSET          0
+#define SPI_CMD_TX_DATA_SIZE_WIDTH          16
+#define SPI_CMD_TX_DATA_QPI_OFFSET          27
+#define SPI_CMD_TX_DATA_QPI_WIDTH           1
+#define SPI_CMD_TX_DATA_WORDTRANS_OFFSET 21
+#define SPI_CMD_TX_DATA_WORDTRANS_WIDTH  2
+#define SPI_CMD_TX_DATA_LSBFIRST_OFFSET 26
+#define SPI_CMD_TX_DATA_BITSWORD_OFFSET 16
+#define SPI_CMD_TX_DATA_BITSWORD_WIDTH  5
+
+// SLAVE_TX_DATA
+#define SPI_CMD_SLAVE_TX_DATA_SIZE_OFFSET          0
+#define SPI_CMD_SLAVE_TX_DATA_SIZE_WIDTH          16
+#define SPI_CMD_SLAVE_TX_DATA_IGNORE_CS_OFFSET 16
+#define SPI_CMD_SLAVE_TX_DATA_IGNORE_CS_WIDTH  1
+
+
+// RX_DATA
+#define SPI_CMD_RX_DATA_SIZE_OFFSET          0
+#define SPI_CMD_RX_DATA_SIZE_WIDTH          16
+#define SPI_CMD_RX_DATA_QPI_OFFSET          27
+#define SPI_CMD_RX_DATA_QPI_WIDTH           1
+#define SPI_CMD_RX_DATA_WORDTRANS_OFFSET 21
+#define SPI_CMD_RX_DATA_WORDTRANS_WIDTH  2
+#define SPI_CMD_RX_DATA_LSBFIRST_OFFSET 26
+#define SPI_CMD_RX_DATA_BITSWORD_OFFSET 16
+#define SPI_CMD_RX_DATA_BITSWORD_WIDTH  5
+
+
+// RPT
+#define SPI_CMD_RPT_NB_OFFSET                0
+#define SPI_CMD_RPT_NB_WIDTH                16
+
+// EOT
+#define SPI_CMD_EOT_GEN_EVT_OFFSET           0
+#define SPI_CMD_EOT_CS_KEEP_OFFSET           1
+
+#define SPI_CMD_EOT_EVENT_ENA                1
+#define SPI_CMD_EOT_EVENT_DIS                0
+
+// WAIT
+#define SPI_CMD_WAIT_EVENT_OFFSET            0
+#define SPI_CMD_WAIT_EVENT_WIDTH             2
+
+// RX_CHECK
+#define SPI_CMD_RX_CHECK_VALUE_OFFSET        0
+#define SPI_CMD_RX_CHECK_VALUE_WIDTH        16
+
+#define SPI_CMD_RX_CHECK_SIZE_OFFSET        16
+#define SPI_CMD_RX_CHECK_SIZE_WIDTH          4
+
+#define SPI_CMD_RX_CHECK_MODE_OFFSET        24
+#define SPI_CMD_RX_CHECK_MODE_WIDTH          2
+
+#define SPI_CMD_RX_CHECK_BYTE_ALIGN_OFFSET  26
+
+#define SPI_CMD_RX_CHECK_QPI_OFFSET         27
+
+#define SPI_CMD_RX_CHECK_MODE_MATCH          0
+#define SPI_CMD_RX_CHECK_MODE_ONES           1
+#define SPI_CMD_RX_CHECK_MODE_ZEROS          2
+#define SPI_CMD_RX_CHECK_MODE_MASK           3
+
+// FULL DUPLEX
+#define SPI_CMD_FUL_SIZE_OFFSET              0
+#define SPI_CMD_FUL_SIZE_WIDTH              16
+#define SPI_CMD_FUL_WORDTRANS_OFFSET 21
+#define SPI_CMD_FUL_WORDTRANS_WIDTH  2
+#define SPI_CMD_FUL_LSBFIRST_OFFSET 26
+#define SPI_CMD_FUL_BITSWORD_OFFSET 16
+#define SPI_CMD_FUL_BITSWORD_WIDTH  5
+
+#define SPI_CMD_SETUP_UC_TXRXEN_OFFSET 27
+#define SPI_CMD_SETUP_UC_DS_OFFSET     25
+
+// SPI CMD encoding (every macro argument is parenthesized so expressions such as `a | b` encode correctly). NOTE(review): SPI_CMD_SETUP_UCA_ID / SPI_CMD_SETUP_UCS_ID are not defined in this header, and SPI_CMD_SETUP_UCA ignores `ds`.
+#define SPI_CMD_CFG(clockDiv,cpol,cpha)                         ((SPI_CMD_CFG_ID<<SPI_CMD_ID_OFFSET)       | ((cpol)<<SPI_CMD_CFG_CPOL_OFFSET)          | ((cpha)<<SPI_CMD_CFG_CPHA_OFFSET)                | ((clockDiv)<<SPI_CMD_CFG_CLK_DIV_OFFSET))
+#define SPI_CMD_SOT(cs)                                         ((SPI_CMD_SOT_ID << SPI_CMD_ID_OFFSET)     | ((cs) << SPI_CMD_SOT_CS_OFFSET))
+#define SPI_CMD_SEND_CMD(cmd,bits,qpi)                          ((SPI_CMD_SEND_CMD_ID<<SPI_CMD_ID_OFFSET)  | ((qpi)<<SPI_CMD_SEND_CMD_QPI_OFFSET)       | (((bits)-1)<<SPI_CMD_SEND_CMD_SIZE_OFFSET)       | ((cmd)&0xFFFF) )
+#define SPI_CMD_SEND_BITS(data,bits,qpi)                        ((SPI_CMD_SEND_BITS_ID<<SPI_CMD_ID_OFFSET) | ((qpi)<<SPI_CMD_SEND_BITS_QPI_OFFSET)      | (((bits)-1)<<SPI_CMD_SEND_BITS_SIZE_OFFSET)      | ((data)&0xFFFF) )
+#define SPI_CMD_DUMMY(cycles)                                   ((SPI_CMD_DUMMY_ID<<SPI_CMD_ID_OFFSET)     | (((cycles)-1)<<SPI_CMD_DUMMY_CYCLE_OFFSET))
+#define SPI_CMD_SETUP_UCA(txrxen,ds,addr)                       ((SPI_CMD_SETUP_UCA_ID<<SPI_CMD_ID_OFFSET) | ((txrxen)<<SPI_CMD_SETUP_UC_TXRXEN_OFFSET) | ((int)(addr) & 0xFFFFF))
+#define SPI_CMD_SETUP_UCS(txrxen,ds,size)                       ((SPI_CMD_SETUP_UCS_ID<<SPI_CMD_ID_OFFSET) | ((txrxen)<<SPI_CMD_SETUP_UC_TXRXEN_OFFSET) | ((ds)<<SPI_CMD_SETUP_UC_DS_OFFSET)               | ((size) & 0xFFFF))
+#define SPI_CMD_TX_DATA(words,wordstrans,bitsword,qpi,lsbfirst) ((SPI_CMD_TX_DATA_ID<<SPI_CMD_ID_OFFSET)   | ((qpi)<<SPI_CMD_TX_DATA_QPI_OFFSET)        | ((wordstrans)<<SPI_CMD_TX_DATA_WORDTRANS_OFFSET) | (((bitsword)-1)<<SPI_CMD_TX_DATA_BITSWORD_OFFSET) | (((words)-1) << SPI_CMD_TX_DATA_SIZE_OFFSET) | ((lsbfirst)<<SPI_CMD_TX_DATA_LSBFIRST_OFFSET))
+#define SPI_CMD_RX_DATA(words,wordstrans,bitsword,qpi,lsbfirst) ((SPI_CMD_RX_DATA_ID<<SPI_CMD_ID_OFFSET)   | ((qpi)<<SPI_CMD_RX_DATA_QPI_OFFSET)        | ((wordstrans)<<SPI_CMD_RX_DATA_WORDTRANS_OFFSET) | (((bitsword)-1)<<SPI_CMD_RX_DATA_BITSWORD_OFFSET) | (((words)-1) << SPI_CMD_RX_DATA_SIZE_OFFSET) | ((lsbfirst)<<SPI_CMD_RX_DATA_LSBFIRST_OFFSET))
+#define SPI_CMD_RPT(iter)                                       ((SPI_CMD_RPT_ID<<SPI_CMD_ID_OFFSET)       | ((iter)<<SPI_CMD_RPT_NB_OFFSET))
+
+#define SPI_CMD_EOT(evt,cs_keep)    ((SPI_CMD_EOT_ID<<SPI_CMD_ID_OFFSET) | ((evt)<<SPI_CMD_EOT_GEN_EVT_OFFSET) | ((cs_keep)<<SPI_CMD_EOT_CS_KEEP_OFFSET))
+
+#define SPI_CMD_RX_CHECK(mode,bits,value,qpi,byte_align) \
+  ((SPI_CMD_RX_CHECK_ID<<SPI_CMD_ID_OFFSET) | \
+  ((value) << SPI_CMD_RX_CHECK_VALUE_OFFSET) | \
+  ((mode) << SPI_CMD_RX_CHECK_MODE_OFFSET) | \
+  (((bits)-1) << SPI_CMD_RX_CHECK_SIZE_OFFSET) | \
+  ((byte_align)<<SPI_CMD_RX_CHECK_BYTE_ALIGN_OFFSET) | \
+  ((qpi)<<SPI_CMD_RX_CHECK_QPI_OFFSET))
+
+#define SPI_CMD_WAIT(event)               ((SPI_CMD_WAIT_ID<<SPI_CMD_ID_OFFSET) | ((event) << SPI_CMD_WAIT_EVENT_OFFSET))
+#define SPI_CMD_RPT_END()                 ((SPI_CMD_RPT_END_ID<<SPI_CMD_ID_OFFSET))
+#define SPI_CMD_FUL(words,wordstrans,bitsword,lsbfirst) ((SPI_CMD_FUL_ID<<SPI_CMD_ID_OFFSET)   | ((wordstrans)<<SPI_CMD_FUL_WORDTRANS_OFFSET) | (((bitsword)-1)<<SPI_CMD_FUL_BITSWORD_OFFSET) | (((words)-1) << SPI_CMD_FUL_SIZE_OFFSET) | ((lsbfirst)<<SPI_CMD_FUL_LSBFIRST_OFFSET))
+
+#define SPI_CMD_SLAVE_TX_DATA(words,ignore_cs) ((SPI_CMD_TX_DATA_ID<<SPI_CMD_ID_OFFSET)   | ((ignore_cs)<<SPI_CMD_SLAVE_TX_DATA_IGNORE_CS_OFFSET) | (((words)-1) << SPI_CMD_TX_DATA_SIZE_OFFSET))
+#define SPI_CMD_SLAVE_RX_DATA(words,ignore_cs) ((SPI_CMD_RX_DATA_ID<<SPI_CMD_ID_OFFSET)   | ((ignore_cs)<<SPI_CMD_SLAVE_TX_DATA_IGNORE_CS_OFFSET) | (((words)-1) << SPI_CMD_TX_DATA_SIZE_OFFSET))
+#define SPI_CMD_SLAVE_FUL(words,ignore_cs)     ((SPI_CMD_FUL_ID<<SPI_CMD_ID_OFFSET)   | ((ignore_cs)<<SPI_CMD_SLAVE_TX_DATA_IGNORE_CS_OFFSET) | (((words)-1) << SPI_CMD_TX_DATA_SIZE_OFFSET))
+#define SPI_CMD_SLAVE_EOT()                    (SPI_CMD_EOT_ID << SPI_CMD_ID_OFFSET)
+
+#endif
diff --git a/rtos/pmsis/pmsis_implem/chips/gap9/drivers/udma/udma_datamove.c b/rtos/pmsis/pmsis_implem/chips/gap9/drivers/udma/udma_datamove.c
new file mode 100644
index 000000000..362620967
--- /dev/null
+++ b/rtos/pmsis/pmsis_implem/chips/gap9/drivers/udma/udma_datamove.c
@@ -0,0 +1,433 @@
+/*
+ * Copyright (c) 2020, GreenWaves Technologies, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ *
+ * o Redistributions of source code must retain the above copyright notice, this list
+ *   of conditions and the following disclaimer.
+ *
+ * o Redistributions in binary form must reproduce the above copyright notice, this
+ *   list of conditions and the following disclaimer in the documentation and/or
+ *   other materials provided with the distribution.
+ *
+ * o Neither the name of GreenWaves Technologies, Inc. nor the names of its
+ *   contributors may be used to endorse or promote products derived from this
+ *   software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "pmsis.h"
+
+/*******************************************************************************
+ * Definitions
+ ******************************************************************************/
+
+#if !defined(__FREERTOS__)
+#define SOC_EVENT_UDMA_CHAN_LIN(id) ( (id) )
+#define SOC_EVENT_UDMA_CHAN_2D(id)  ( ARCHI_UDMA_NB_LIN_ADDRGEN + (id) ) /* NOTE(review): offset by the linear count so the event equals the 2D channel id enabled in open - confirm against the SoC event map */
+#define UDMA_NB_CHAN_LIN            ( ARCHI_UDMA_NB_LIN_ADDRGEN )
+#endif  /* __FREERTOS__ */
+/* Number of datamove units; pi_udma_datamove_open() rejects any device_id at or above this value. */
+#define __UDMA_NB_DATAMOVE          ( 2 )
+
+typedef struct pi_udma_datamove_data_s /* per-device data, allocated by each pi_udma_datamove_open() */
+{
+    struct pi_udma_datamove_itf_data_s *itf_data; /*!< Interface data shared by every device opened on the same unit */
+    pi_udma_datamove_transf_cfg_t src_trf_cfg; /*!< Source data transfer configuration */
+    pi_udma_datamove_transf_cfg_t dst_trf_cfg; /*!< Destination data transfer configuration */
+} pi_udma_datamove_data_t;
+
+typedef struct pi_udma_datamove_itf_data_s /* state of one datamove unit, shared by its opened devices */
+{
+    uint8_t device_id;      /*!< index of this unit in g_udma_datamove_itf_data */
+    pi_task_t* fifo_head;   /*!< oldest queued request, NULL when the FIFO is empty */
+    pi_task_t* fifo_tail;   /*!< newest queued request, NULL when the FIFO is empty */
+    pi_task_t* end_task;    /*!< task of the transfer in progress, NULL when idle */
+    int32_t nb_open; /*!< number of devices opened */
+    int32_t rx_lin_chan_id; /*!< linear channel used on the destination side */
+    int32_t tx_lin_chan_id; /*!< linear channel used on the source side */
+    int32_t rx_2d_chan_id;  /*!< 2D channel used on the destination side */
+    int32_t tx_2d_chan_id;  /*!< 2D channel used on the source side */
+} pi_udma_datamove_itf_data_t;
+
+/*******************************************************************************
+ * Driver data
+ ******************************************************************************/
+
+static pi_udma_datamove_itf_data_t* g_udma_datamove_itf_data[__UDMA_NB_DATAMOVE]; /* one slot per unit; set on first open, reset to NULL on last close */
+
+/*******************************************************************************
+ * Internal functions
+ ******************************************************************************/
+
+static inline void __pi_udma_datamove_task_fifo_enqueue(pi_udma_datamove_itf_data_t *itf_data,
+                                                        pi_task_t *pi_task)
+{
+    /*
+     * Append pi_task at the tail of the singly linked FIFO of waiting requests.
+     * No interrupt masking is done here; the caller visible in this file
+     * (pi_udma_datamove_copy_async) already runs with interrupts disabled.
+     */
+    if (NULL == itf_data->fifo_tail)
+    {
+        /* empty list: the new task is also the head */
+        itf_data->fifo_head = pi_task;
+    }
+    else
+    {
+        itf_data->fifo_tail->next = pi_task;
+    }
+    /* in both cases the new task is the last element */
+    itf_data->fifo_tail = pi_task;
+    pi_task->next = NULL;
+}
+
+static inline pi_task_t* __pi_udma_datamove_task_fifo_pop(pi_udma_datamove_itf_data_t *itf_data)
+{
+    /* Detach and return the oldest waiting task, or NULL when the FIFO is empty. */
+    pi_task_t *popped = itf_data->fifo_head;
+    if (NULL == popped)
+    {
+        return NULL;
+    }
+    hal_compiler_barrier();
+    itf_data->fifo_head = itf_data->fifo_head->next;
+    if (NULL == itf_data->fifo_head)
+    {
+        /* last element removed: the list is empty again */
+        itf_data->fifo_tail = NULL;
+    }
+    return popped;
+}
+
+static inline void __pi_udma_datamove_copy_start(pi_udma_datamove_itf_data_t* itf_data)
+{
+    if (NULL == itf_data->end_task) /* no current request: nothing to start */
+    {
+        return;
+    }
+    /* Programs the datamove unit and both udma channels for the request held in end_task. */
+    uint32_t src = itf_data->end_task->data[0];
+    uint32_t dst = itf_data->end_task->data[1];
+    uint32_t len = itf_data->end_task->data[2];
+    /* data[0..3] are filled by pi_udma_datamove_copy_async(): source, destination, length, device data */
+    pi_udma_datamove_data_t* dev_data = (pi_udma_datamove_data_t*) itf_data->end_task->data[3];
+    uint32_t udma_ctrl_base = (uint32_t) ARCHI_UDMA_ADDR;
+
+    /* select the rx_channel according to the device configuration */
+    int32_t rx_chan = -1;
+    if (dev_data->dst_trf_cfg.type == PI_UDMA_DATAMOVE_TRF_LINEAR)
+    {
+        rx_chan = itf_data->rx_lin_chan_id;
+    }
+    else
+    {
+        rx_chan = itf_data->rx_2d_chan_id;
+    }
+    /* select the tx_channel according to the device configuration */
+    int32_t tx_chan = -1;
+    if (dev_data->src_trf_cfg.type == PI_UDMA_DATAMOVE_TRF_LINEAR)
+    {
+        tx_chan = itf_data->tx_lin_chan_id;
+    }
+    else
+    {
+        tx_chan = itf_data->tx_2d_chan_id;
+    }
+    /* the two units have separate control registers, hence the duplicated sequence below */
+    if (0 == itf_data->device_id)
+    {
+        /* set the channels to use */
+        //hal_udma_ctrl_datamove0_cfg_set_ids(tx_chan, rx_chan);
+        udma_ctrl_datamove_cfg_source_id_0_set(udma_ctrl_base, tx_chan);
+        udma_ctrl_datamove_cfg_dest_id_0_set(udma_ctrl_base, rx_chan);
+
+        /* launch the copy, also activate clock for udma channels */
+        //hal_udma_ctrl_datamove0_enable();
+        udma_ctrl_datamove0_size_en_set(udma_ctrl_base, 1);
+    }
+    else
+    {
+        /* set the channels to use */
+        //hal_udma_ctrl_datamove1_cfg_set_ids(tx_chan, rx_chan);
+        udma_ctrl_datamove_cfg_source_id_1_set(udma_ctrl_base, tx_chan);
+        udma_ctrl_datamove_cfg_dest_id_1_set(udma_ctrl_base, rx_chan);
+
+        /* launch the copy, also activate clock for udma channels */
+        //hal_udma_ctrl_datamove1_enable();
+        udma_ctrl_datamove1_size_en_set(udma_ctrl_base, 1);
+    }
+
+    /* setup and launch channels */
+    if (dev_data->dst_trf_cfg.type == PI_UDMA_DATAMOVE_TRF_LINEAR)
+    {
+        uint32_t udma_core = pi_udma_core_lin_addr_get(rx_chan);
+        uint32_t config = UDMA_CORE_LIN_ADDRGEN_CFG_CTRL_EN(1);
+        pi_udma_core_lin_enqueue(udma_core, dst, len, config);
+    }
+    else
+    {
+        uint32_t udma_core = pi_udma_core_2d_addr_get(rx_chan - UDMA_NB_CHAN_LIN); /* 2D channel ids are offset by the linear channel count */
+        uint32_t config = UDMA_CORE_2D_ADDRGEN_CFG_CTRL_EN(1);
+        pi_udma_core_2d_enqueue(udma_core, dst, 0, len, dev_data->dst_trf_cfg.stride,
+                                dev_data->dst_trf_cfg.row_len, config);
+    }
+    /* same programming for the source side, on the tx channel */
+    if (dev_data->src_trf_cfg.type == PI_UDMA_DATAMOVE_TRF_LINEAR)
+    {
+        uint32_t udma_core = pi_udma_core_lin_addr_get(tx_chan);
+        uint32_t config = UDMA_CORE_LIN_ADDRGEN_CFG_CTRL_EN(1);
+        pi_udma_core_lin_enqueue(udma_core, src, len, config);
+    }
+    else
+    {
+        uint32_t udma_core = pi_udma_core_2d_addr_get(tx_chan - UDMA_NB_CHAN_LIN);
+        uint32_t config = UDMA_CORE_2D_ADDRGEN_CFG_CTRL_EN(1);
+        pi_udma_core_2d_enqueue(udma_core, src, 0, len, dev_data->src_trf_cfg.stride,
+                                dev_data->src_trf_cfg.row_len, config);
+    }
+}
+
+
+static void __pi_udma_datamove_event_handler(uint32_t event, void* arg)
+{
+    pi_udma_datamove_itf_data_t* itf_data = (pi_udma_datamove_itf_data_t*) arg;
+    pi_task_t* task = itf_data->end_task;
+    uint32_t udma_ctrl_base = (uint32_t) ARCHI_UDMA_ADDR;
+    /* Registered in open on the rx channel events; 'event' itself is not used, 'arg' is the interface data. */
+    /* stop the DATAMOVE */
+    if (0 == itf_data->device_id)
+    {
+        //hal_udma_ctrl_datamove0_stop();
+        udma_ctrl_datamove0_size_stop_set(udma_ctrl_base, 1);
+    }
+    else
+    {
+        //hal_udma_ctrl_datamove1_stop();
+        udma_ctrl_datamove1_size_stop_set(udma_ctrl_base, 1);
+    }
+
+    /* handle current task end */
+    if (task != NULL)
+    {
+       pi_task_push_irq_safe(task);
+    }
+    /* the interface is idle until the next request is started */
+    itf_data->end_task = NULL;
+
+    /* start new task if needed */
+    pi_task_t *next_task = __pi_udma_datamove_task_fifo_pop(itf_data);
+    if (next_task)
+    {
+        itf_data->end_task = next_task;
+        __pi_udma_datamove_copy_start(itf_data);
+    }
+}
+
+/*******************************************************************************
+ * API implementation
+ ******************************************************************************/
+
+void pi_udma_datamove_conf_init(pi_udma_datamove_conf_t *conf)
+{
+    /* Defaults: datamove unit 0, linear transfers on both the source and the destination side. */
+    conf->device_id = 0;
+    conf->dst_trf_cfg.type = PI_UDMA_DATAMOVE_TRF_LINEAR;
+    conf->src_trf_cfg.type = PI_UDMA_DATAMOVE_TRF_LINEAR;
+    /* row_len and stride are only read when a side is configured as non-linear */
+    conf->src_trf_cfg.row_len = conf->src_trf_cfg.stride = 0;
+    conf->dst_trf_cfg.row_len = conf->dst_trf_cfg.stride = 0;
+}
+
+int pi_udma_datamove_open(pi_device_t *device)
+{
+    int status = PI_OK;
+    uint32_t irq = pi_irq_disable();
+    pi_udma_datamove_conf_t *conf = (pi_udma_datamove_conf_t*) device->config;
+    /* The first open of a unit allocates the shared interface; later opens only take a reference. */
+    if (conf->device_id >= __UDMA_NB_DATAMOVE)
+    {
+        pi_irq_restore(irq);
+        return PI_FAIL;
+    }
+
+    pi_udma_datamove_itf_data_t* itf_data = g_udma_datamove_itf_data[conf->device_id];
+
+    if (NULL == itf_data)
+    {
+        /* allocate itf data */
+        itf_data = pi_fc_l1_malloc(sizeof(pi_udma_datamove_itf_data_t));
+        if (NULL == itf_data)
+        {
+            pi_irq_restore(irq);
+            return PI_ERR_NO_MEM;
+        }
+        g_udma_datamove_itf_data[conf->device_id] = itf_data;
+
+        /* allocate lin channels */
+        itf_data->rx_lin_chan_id = pi_udma_core_lin_alloc();
+        itf_data->tx_lin_chan_id = pi_udma_core_lin_alloc();
+        if ((0 > itf_data->rx_lin_chan_id) || (0 > itf_data->tx_lin_chan_id))
+        {
+            pi_udma_core_lin_free(itf_data->rx_lin_chan_id);
+            pi_udma_core_lin_free(itf_data->tx_lin_chan_id);
+            pi_fc_l1_free(itf_data, sizeof(pi_udma_datamove_itf_data_t));
+            g_udma_datamove_itf_data[conf->device_id] = NULL;
+            pi_irq_restore(irq);
+            return PI_FAIL;
+        }
+
+        /* enable lin events */
+        pi_soc_eu_fc_mask_set(SOC_EVENT_UDMA_CHAN_LIN(itf_data->rx_lin_chan_id));
+        pi_fc_event_handler_set(SOC_EVENT_UDMA_CHAN_LIN(itf_data->rx_lin_chan_id),
+                                __pi_udma_datamove_event_handler, itf_data);
+        /* allocate 2d channels */
+        itf_data->rx_2d_chan_id = pi_udma_core_2d_alloc();
+        itf_data->tx_2d_chan_id = pi_udma_core_2d_alloc();
+        if ((itf_data->rx_2d_chan_id < 0) || (itf_data->tx_2d_chan_id < 0))
+        {
+            pi_udma_core_2d_free(itf_data->rx_2d_chan_id);
+            pi_udma_core_2d_free(itf_data->tx_2d_chan_id);
+            pi_udma_core_lin_free(itf_data->rx_lin_chan_id);
+            pi_soc_eu_fc_mask_clear(SOC_EVENT_UDMA_CHAN_LIN(itf_data->rx_lin_chan_id));
+            pi_udma_core_lin_free(itf_data->tx_lin_chan_id);
+            pi_fc_l1_free(itf_data, sizeof(pi_udma_datamove_itf_data_t));
+            g_udma_datamove_itf_data[conf->device_id] = NULL;
+            pi_irq_restore(irq);
+            return PI_FAIL;
+        }
+        /* enable 2D events */
+        pi_soc_eu_fc_mask_set(itf_data->rx_2d_chan_id);
+        pi_fc_event_handler_set(itf_data->rx_2d_chan_id,
+                                __pi_udma_datamove_event_handler, itf_data);
+
+        /* Initialize itf data */
+        itf_data->nb_open = 1;
+        itf_data->end_task = NULL;
+        itf_data->fifo_head = NULL;
+        itf_data->fifo_tail = NULL;
+        itf_data->device_id = conf->device_id;
+    }
+    else
+    {
+        itf_data->nb_open++;
+    }
+
+    /* allocate device data */
+    device->data = pi_fc_l1_malloc(sizeof(pi_udma_datamove_data_t));
+    if (NULL == device->data)
+    {
+        /* give back the reference taken above so a failed open never leaks an open count; */
+        /* if this open had initialized the interface (count back to 0), close it again */
+        if (--itf_data->nb_open == 0)
+        {
+            /* clear events, disable IRQs && free allocated udma channels */
+            pi_udma_core_2d_free(itf_data->rx_2d_chan_id);
+            pi_udma_core_2d_free(itf_data->tx_2d_chan_id);
+            pi_udma_core_lin_free(itf_data->rx_lin_chan_id);
+            pi_udma_core_lin_free(itf_data->tx_lin_chan_id);
+            pi_soc_eu_fc_mask_clear(SOC_EVENT_UDMA_CHAN_LIN(itf_data->rx_lin_chan_id));
+            pi_soc_eu_fc_mask_clear(SOC_EVENT_UDMA_CHAN_2D((itf_data->rx_2d_chan_id - UDMA_NB_CHAN_LIN)));
+            /* free itf data */
+            g_udma_datamove_itf_data[itf_data->device_id] = NULL;
+            pi_fc_l1_free(itf_data, sizeof(pi_udma_datamove_itf_data_t));
+        }
+        pi_irq_restore(irq);
+        return PI_ERR_NO_MEM;
+    }
+
+    /* initialize device data */
+    pi_udma_datamove_data_t* dev_data = (pi_udma_datamove_data_t*) device->data;
+    dev_data->itf_data = itf_data;
+    dev_data->src_trf_cfg.type = conf->src_trf_cfg.type;
+    dev_data->src_trf_cfg.row_len = conf->src_trf_cfg.row_len;
+    dev_data->src_trf_cfg.stride = conf->src_trf_cfg.stride;
+    dev_data->dst_trf_cfg.type = conf->dst_trf_cfg.type;
+    dev_data->dst_trf_cfg.row_len = conf->dst_trf_cfg.row_len;
+    dev_data->dst_trf_cfg.stride = conf->dst_trf_cfg.stride;
+
+    pi_irq_restore(irq);
+    return status;
+}
+
+void pi_udma_datamove_close(pi_device_t *device)
+{
+    /* The whole teardown runs with interrupts disabled. */
+    uint32_t irq_state = pi_irq_disable();
+    pi_udma_datamove_data_t *dev = (pi_udma_datamove_data_t*) device->data;
+    pi_udma_datamove_itf_data_t *itf = dev->itf_data;
+
+    /* Drop this device's reference; the last user also releases the interface. */
+    if (--itf->nb_open == 0)
+    {
+        /* udma channels first, then the event masks of the two rx channels */
+        pi_udma_core_2d_free(itf->rx_2d_chan_id);
+        pi_udma_core_2d_free(itf->tx_2d_chan_id);
+        pi_udma_core_lin_free(itf->rx_lin_chan_id);
+        pi_udma_core_lin_free(itf->tx_lin_chan_id);
+        pi_soc_eu_fc_mask_clear(SOC_EVENT_UDMA_CHAN_LIN(itf->rx_lin_chan_id));
+        pi_soc_eu_fc_mask_clear(SOC_EVENT_UDMA_CHAN_2D((itf->rx_2d_chan_id - UDMA_NB_CHAN_LIN)));
+        /* forget and free the interface descriptor */
+        g_udma_datamove_itf_data[itf->device_id] = NULL;
+        pi_fc_l1_free(itf, sizeof(pi_udma_datamove_itf_data_t));
+    }
+
+    /* the per-device data is always released */
+    pi_fc_l1_free(dev, sizeof(pi_udma_datamove_data_t));
+
+    pi_irq_restore(irq_state);
+}
+
+int32_t pi_udma_datamove_copy_async(pi_device_t *device, void* src, void* dst,
+                                    uint32_t len, pi_task_t* task)
+{
+    uint32_t irq_state = pi_irq_disable();
+
+    pi_udma_datamove_data_t *dev = (pi_udma_datamove_data_t*) device->data;
+    pi_udma_datamove_itf_data_t *itf = dev->itf_data;
+    /* The request travels inside the task: source, destination, length, device data. */
+    task->data[3] = (uint32_t) dev;
+    task->data[2] = (uint32_t) len;
+    task->data[1] = (uint32_t) dst;
+    task->data[0] = (uint32_t) src;
+
+    if (NULL != itf->end_task)
+    {
+        /* a transfer is already running: wait in the FIFO until the event handler pops this task */
+        __pi_udma_datamove_task_fifo_enqueue(itf, task);
+    }
+    else
+    {
+        /* interface idle: this task becomes the running one */
+        itf->end_task = task;
+        __pi_udma_datamove_copy_start(itf);
+    }
+
+    pi_irq_restore(irq_state);
+    return PI_OK;
+}
+
+int32_t pi_udma_datamove_copy(pi_device_t *device, void* src, void* dst, uint32_t len)
+{
+    /* Synchronous copy built on the async call and a blocking task on the stack. */
+    pi_task_t done;
+    int32_t rc = pi_udma_datamove_copy_async(device, src, dst, len, pi_task_block(&done));
+    if (PI_OK == rc)
+    {
+        pi_task_wait_on(&done);   /* only wait when the request was accepted */
+    }
+    pi_task_destroy(&done);
+    return rc;
+}
diff --git a/rtos/pmsis/pmsis_implem/chips/gap9/drivers/udma/udma_ffc.c b/rtos/pmsis/pmsis_implem/chips/gap9/drivers/udma/udma_ffc.c
new file mode 100644
index 000000000..794b0950d
--- /dev/null
+++ b/rtos/pmsis/pmsis_implem/chips/gap9/drivers/udma/udma_ffc.c
@@ -0,0 +1,517 @@
+/*
+ * Copyright (c) 2020, GreenWaves Technologies, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ *
+ * o Redistributions of source code must retain the above copyright notice, this list
+ *   of conditions and the following disclaimer.
+ *
+ * o Redistributions in binary form must reproduce the above copyright notice, this
+ *   list of conditions and the following disclaimer in the documentation and/or
+ *   other materials provided with the distribution.
+ *
+ * o Neither the name of GreenWaves Technologies, Inc. nor the names of its
+ *   contributors may be used to endorse or promote products derived from this
+ *   software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "pmsis.h"
+#include "chips/gap9/drivers/udma/udma_core.h"
+
+#if !defined(__FREERTOS__)
+#include <archi/chips/gap9_v2/udma_ffc/udma_ffc.h>
+#endif  /* !__FREERTOS__ */
+
+/*******************************************************************************
+ * Definitions
+ ******************************************************************************/
+
+/*******************************************************************************
+ * Driver data
+ ******************************************************************************/
+
+typedef struct pi_ffc_data_s
+{
+    struct pi_ffc_itf_data_s *itf_data; /*!< data of the FFC interface this device is attached to */
+    pi_ffc_fixed_type_e fixed_type;     /*!< fixed-point format, copied from the open configuration */
+    uint32_t fixed_scale;               /*!< fixed-point scale, copied from the open configuration */
+    uint32_t fixed_precision;           /*!< fixed-point precision, copied from the open configuration */
+    pi_ffc_float_type_e float_type;     /*!< floating-point format, copied from the open configuration */
+    pi_ffc_mode_e mode;                 /*!< conversion direction, copied from the open configuration */
+    pi_ffc_io_mode_e io_mode;           /*!< I/O mode; can be changed with PI_FFC_IOCTL_SET_IO_MODE */
+    uint8_t continuous_mode;            /*!< non-zero when continuous mode is enabled (0 after open) */
+} pi_ffc_data_t;
+
+typedef struct pi_ffc_itf_data_s
+{
+    pi_task_t *fifo_head; /*!< head of the FIFO of tasks waiting for the interface */
+    pi_task_t *fifo_tail; /*!< tail of the FIFO of tasks waiting for the interface */
+    pi_task_t *end_task;  /*!< task of the request currently in progress, NULL when idle */
+    void* latest_conf;    /*!< device data whose configuration was last applied; NULL forces a reconfiguration */
+    int32_t rx_chan_id;   /*!< RX udma channel */
+    int32_t tx_chan_id;   /*!< TX udma channel */
+    uint8_t device_id;    /*!< Device ID */
+    int32_t nb_open;      /*!< number of devices currently opened on this interface */
+} pi_ffc_itf_data_t;
+
+static pi_ffc_itf_data_t* g_ffc_itf_data[ARCHI_UDMA_NB_FFC]; /* per-interface data, NULL while the interface is closed */
+
+/********************
+ * Static functions
+ *******************/
+
+/*
+ * Compute the left shift applied to an element count to size a uDMA linear
+ * transfer.
+ *
+ * The shift is 0, 1 or 2 for the fixed-point side (PI_FFC_FIXED_8,
+ * PI_FFC_FIXED_16, anything else) and 1 or 2 for the floating-point side
+ * (PI_FFC_FLOAT_FP16 / PI_FFC_FLOAT_BFP16, anything else).
+ *
+ * The fixed-point shift is returned when:
+ *  - is_rx is non-zero and mode is PI_FFC_FIXED_TO_FLOAT, or
+ *  - is_rx is zero and mode is PI_FFC_FLOAT_TO_FIXED.
+ * In every other case the floating-point shift is returned.
+ */
+static inline uint32_t __ffc_compute_udma_lin_shift(uint8_t is_rx,
+                                                    pi_ffc_mode_e mode,
+                                                    pi_ffc_float_type_e fl_type,
+                                                    pi_ffc_fixed_type_e fp_type)
+{
+    uint32_t float_shift;
+    uint32_t fixed_shift;
+
+    /* floating-point side */
+    if ((PI_FFC_FLOAT_FP16 == fl_type) || (PI_FFC_FLOAT_BFP16 == fl_type))
+    {
+        float_shift = 1;
+    }
+    else
+    {
+        float_shift = 2;
+    }
+
+    /* fixed-point side */
+    if (PI_FFC_FIXED_8 == fp_type)
+    {
+        fixed_shift = 0;
+    }
+    else if (PI_FFC_FIXED_16 == fp_type)
+    {
+        fixed_shift = 1;
+    }
+    else
+    {
+        fixed_shift = 2;
+    }
+
+    /* mode for which the requested side carries fixed-point data */
+    pi_ffc_mode_e fixed_side_mode = (0 != is_rx) ? PI_FFC_FIXED_TO_FLOAT
+                                                 : PI_FFC_FLOAT_TO_FIXED;
+
+    return (fixed_side_mode == mode) ? fixed_shift : float_shift;
+}
+
+// Append pi_task at the tail of the interface task FIFO.
+// Must be called with interrupts disabled (done in convert_async), since the
+// irq handler might pop from the same FIFO at the same time.
+static inline void __ffc_drv_fifo_enqueue(pi_ffc_itf_data_t *itf_data,
+                                          pi_task_t *pi_task)
+{
+    if (NULL == itf_data->fifo_tail)
+    {
+        // empty FIFO: the new task is also the head
+        itf_data->fifo_head = pi_task;
+    }
+    else
+    {
+        // link behind the current tail
+        itf_data->fifo_tail->next = pi_task;
+    }
+
+    // the new task is always the last element
+    itf_data->fifo_tail = pi_task;
+    pi_task->next = NULL;
+}
+
+/*
+ * Detach and return the task at the head of the interface task FIFO.
+ * Returns NULL when the FIFO is empty; the tail is reset to NULL when the
+ * last task is removed.
+ */
+static inline pi_task_t *__ffc_drv_fifo_pop(pi_ffc_itf_data_t *itf_data)
+{
+    pi_task_t *first = itf_data->fifo_head;
+
+    if (NULL == first)
+    {
+        return NULL;
+    }
+
+    /* keep the head read above ordered before the head update below */
+    hal_compiler_barrier();
+
+    itf_data->fifo_head = first->next;
+    if (NULL == itf_data->fifo_head)
+    {
+        itf_data->fifo_tail = NULL;
+    }
+
+    return first;
+}
+
+/*
+ * Switch the continuous mode of a device.
+ *
+ * Nothing is done when the requested value equals the one recorded in the
+ * device data. Otherwise the FFC start register is written with 1 (non-zero
+ * request) or 0 (zero request) and the new value is recorded.
+ */
+static void __pi_ffc_change_continuous_mode(pi_ffc_data_t* dev_data,
+                                            uint8_t continuous_mode)
+{
+    uint32_t base = (uint32_t) UDMA_FFC_ADDR(dev_data->itf_data->device_id);
+
+    if (continuous_mode == dev_data->continuous_mode)
+    {
+        /* already in the requested state */
+        return;
+    }
+
+    udma_ffc_start_set(base, (0 != continuous_mode) ? 1 : 0);
+    dev_data->continuous_mode = continuous_mode;
+}
+
+/*
+ * Select which uDMA channel event is forwarded to the FC, according to the
+ * I/O mode of the device that owns the current task (end_task->data[3]).
+ *
+ *  - memory in / memory out, stream in / memory out: RX channel event only
+ *  - memory in / stream out: TX channel event only
+ *  - stream in / stream out: no event
+ *  - any other value: masks are left untouched
+ *
+ * itf_data->end_task must be non-NULL.
+ */
+static void __pi_ffc_change_event_source(pi_ffc_itf_data_t* itf_data)
+{
+    pi_ffc_data_t* dev_data = (pi_ffc_data_t*) itf_data->end_task->data[3];
+
+    switch(dev_data->io_mode)
+    {
+    case PI_FFC_MEMORY_IN_MEMORY_OUT:
+        //fallthrough
+    case PI_FFC_STREAM_IN_MEMORY_OUT:
+    {
+        /* use output channel as event source */
+        pi_soc_eu_fc_mask_clear(SOC_EVENT_UDMA_CHAN_LIN(itf_data->tx_chan_id));
+        pi_soc_eu_fc_mask_set(SOC_EVENT_UDMA_CHAN_LIN(itf_data->rx_chan_id));
+        break;
+    }
+
+    case PI_FFC_MEMORY_IN_STREAM_OUT:
+    {
+        /* use input channel as event source */
+        pi_soc_eu_fc_mask_clear(SOC_EVENT_UDMA_CHAN_LIN(itf_data->rx_chan_id));
+        pi_soc_eu_fc_mask_set(SOC_EVENT_UDMA_CHAN_LIN(itf_data->tx_chan_id));
+        break;
+    }
+
+    case PI_FFC_STREAM_IN_STREAM_OUT:
+    {
+        /* FFC has no control over data flow, no event source */
+        pi_soc_eu_fc_mask_clear(SOC_EVENT_UDMA_CHAN_LIN(itf_data->rx_chan_id));
+        pi_soc_eu_fc_mask_clear(SOC_EVENT_UDMA_CHAN_LIN(itf_data->tx_chan_id));
+        break;
+    }
+
+    default:
+        /* invalid io mode: keep the current event routing */
+        break;
+    }
+}
+
+/*
+ * Program the FFC registers for the device that owns the current task.
+ *
+ * The owning device data is read from end_task->data[3]. The format, scale,
+ * precision, direction and I/O mode registers (and the event source) are only
+ * rewritten when that device differs from the one recorded in latest_conf.
+ * The RX/TX channel destinations and the stream configuration are written on
+ * every call. itf_data->end_task must be non-NULL.
+ */
+static void __pi_ffc_conf_apply(pi_ffc_itf_data_t* itf_data)
+{
+    uint32_t base = (uint32_t) UDMA_FFC_ADDR(itf_data->device_id);
+
+    pi_ffc_data_t* dev_data = (pi_ffc_data_t*) itf_data->end_task->data[3];
+
+    /* reprogram the converter only when the configuration owner changed */
+    if (itf_data->latest_conf != dev_data)
+    {
+        itf_data->latest_conf = dev_data;
+
+        udma_ffc_fp_format_set(base, dev_data->fixed_type);
+        udma_ffc_fl_format_set(base, dev_data->float_type);
+        udma_ffc_fp_prec_set(base,  dev_data->fixed_precision);
+        udma_ffc_fp_scale_set(base, dev_data->fixed_scale);
+        udma_ffc_mode_direction_set(base, dev_data->mode);
+        udma_ffc_mode_io_mode_set(base, dev_data->io_mode);
+        __pi_ffc_change_event_source(itf_data);
+    }
+    /* channel routing is rewritten for every request */
+    udma_ffc_rx_dest_set(base, itf_data->rx_chan_id);
+    udma_ffc_tx_dest_set(base, itf_data->tx_chan_id);
+
+    /* set stream as blocking */
+    /* NOTE(review): bit (18 + device_id) presumably selects this FFC's stream
+     * in the stream configuration register -- confirm against the register map */
+    udma_ctrl_stream_cfg_set((uint32_t) ARCHI_UDMA_ADDR, 1 << (18 + itf_data->device_id));
+}
+
+/*
+ * Start the conversion described by the current task (itf_data->end_task).
+ *
+ * The request is read back from the task: data[0] = src, data[1] = dst,
+ * data[2] = number of elements, data[3] = owning device data. The function
+ * applies the device configuration, programs the conversion count (and the
+ * start bit when not in continuous mode), then enqueues the memory buffers on
+ * the uDMA linear channels selected by the I/O mode bits.
+ *
+ * Does nothing when there is no current task.
+ */
+static void __pi_ffc_conversion_start(pi_ffc_itf_data_t* itf_data)
+{
+    uint32_t chan_id;
+
+    if (NULL == itf_data->end_task)
+    {
+        return;
+    }
+
+    uint32_t src = itf_data->end_task->data[0];
+    uint32_t dst = itf_data->end_task->data[1];
+    uint32_t len = itf_data->end_task->data[2];
+    pi_ffc_data_t* dev_data = (pi_ffc_data_t*) itf_data->end_task->data[3];
+
+    __pi_ffc_conf_apply(itf_data);
+
+    uint32_t base = (uint32_t) UDMA_FFC_ADDR(itf_data->device_id);
+
+    /* launch the conversion */
+    if (dev_data->continuous_mode != 0)
+    {
+        /* continuous mode: count is written as 0 and the start bit is not
+         * touched here (it is driven by the continuous-mode ioctl) */
+        udma_ffc_conv_num_set(base, 0);
+    }
+    else
+    {
+        udma_ffc_conv_num_set(base, len);
+        udma_ffc_start_set(base, 1);
+    }
+
+    /* setup & launching channels */
+
+    /* bit 1 of io_mode clear: the destination buffer is enqueued on the
+     * channel read back from the RX destination register */
+    if (0 == (dev_data->io_mode & 2))
+    {
+        chan_id = udma_ffc_rx_dest_get(base);
+        uint32_t udma_core = pi_udma_core_lin_addr_get(chan_id);
+        /* NOTE(review): the helper's first parameter is named is_rx but 0 is
+         * passed for the RX channel; the resulting shift matches the
+         * destination element size, so only the naming is misleading */
+        uint32_t rx_shift = __ffc_compute_udma_lin_shift(0, dev_data->mode, dev_data->float_type, dev_data->fixed_type);
+        pi_udma_core_lin_enqueue(udma_core, (uint32_t) dst, len << rx_shift, 0);
+    }
+
+    /* bit 0 of io_mode clear: the source buffer is enqueued on the channel
+     * read back from the TX destination register */
+    if (0 == (dev_data->io_mode & 1))
+    {
+        chan_id = udma_ffc_tx_dest_get(base);
+        uint32_t udma_core = pi_udma_core_lin_addr_get(chan_id);
+        uint32_t tx_shift = __ffc_compute_udma_lin_shift(1, dev_data->mode, dev_data->float_type, dev_data->fixed_type);
+        pi_udma_core_lin_enqueue(udma_core, (uint32_t) src, len << tx_shift, 0);
+    }
+}
+
+/********************
+ * Callback
+ ********************/
+
+/*
+ * uDMA channel event handler of an FFC interface.
+ *
+ * arg is the interface data registered with the handler. The task of the
+ * request that just ended (if any) is pushed, then the next task waiting in
+ * the FIFO (if any) becomes the current one and its conversion is started.
+ * The event number itself is not used.
+ */
+static void __pi_ffc_event_handler(uint32_t event, void *arg)
+{
+    pi_ffc_itf_data_t* itf = (pi_ffc_itf_data_t*) arg;
+
+    /* complete the request that just finished, if any */
+    if (NULL != itf->end_task)
+    {
+        pi_task_push_irq_safe(itf->end_task);
+    }
+
+    /* promote the next queued request; NULL leaves the interface idle */
+    itf->end_task = __ffc_drv_fifo_pop(itf);
+    if (NULL != itf->end_task)
+    {
+        __pi_ffc_conversion_start(itf);
+    }
+}
+
+
+/*******************************************************************************
+ * API implementation
+ ******************************************************************************/
+
+/*
+ * Fill an FFC configuration with the driver defaults: interface 0,
+ * float-to-fixed conversion between memory buffers, FP32 on the float side,
+ * 32-bit fixed point with scale and precision both set to 0.
+ */
+void pi_ffc_conf_init(pi_ffc_conf_t *conf)
+{
+    /* interface and data path */
+    conf->itf = 0;
+    conf->io_mode = PI_FFC_MEMORY_IN_MEMORY_OUT;
+    conf->mode = PI_FFC_FLOAT_TO_FIXED;
+
+    /* floating-point side */
+    conf->float_type = PI_FFC_FLOAT_FP32;
+
+    /* fixed-point side */
+    conf->fixed_type = PI_FFC_FIXED_32;
+    conf->fixed_precision = 0;
+    conf->fixed_scale = 0;
+}
+
+/*
+ * Open an FFC device on the interface selected by the device configuration.
+ *
+ * The first open of an interface allocates its shared data and two uDMA
+ * linear channels, installs the event handlers and takes the peripheral out
+ * of reset / clock gating; later opens only increment the open count. Each
+ * open allocates its own device data, initialized from the configuration.
+ *
+ * Every failure path releases what was acquired so far, so a failed open
+ * leaves the driver state unchanged (the interface table is only updated
+ * once the interface is fully set up).
+ *
+ * Returns PI_OK on success, PI_ERR_L2_NO_MEM when the device data cannot be
+ * allocated, PI_ERR_NO_MEM when the interface data cannot be allocated, and
+ * PI_FAIL when the interface id is out of range or no uDMA channel is
+ * available.
+ */
+int pi_ffc_open(pi_device_t *device)
+{
+    uint32_t irq = pi_irq_disable();
+    pi_ffc_conf_t *conf = (pi_ffc_conf_t*) device->config;
+
+    /* the interface id indexes g_ffc_itf_data: reject out-of-range values */
+    if ((uint32_t) conf->itf >= (uint32_t) ARCHI_UDMA_NB_FFC)
+    {
+        pi_irq_restore(irq);
+        return PI_FAIL;
+    }
+
+    /* allocate device data first so that a failure needs no interface rollback */
+    pi_ffc_data_t* dev_data = pi_fc_l1_malloc(sizeof(pi_ffc_data_t));
+    if (NULL == dev_data)
+    {
+        pi_irq_restore(irq);
+        return PI_ERR_L2_NO_MEM;
+    }
+
+    pi_ffc_itf_data_t *itf_data = g_ffc_itf_data[conf->itf];
+
+    if (NULL == itf_data)
+    {
+        /* allocate itf data */
+        itf_data = pi_fc_l1_malloc(sizeof(pi_ffc_itf_data_t));
+        if (NULL == itf_data)
+        {
+            pi_fc_l1_free(dev_data, sizeof(pi_ffc_data_t));
+            pi_irq_restore(irq);
+            return PI_ERR_NO_MEM;
+        }
+
+        /* allocate 2 udma lin channels */
+        /* set both of them to trigger event handler, io_mode will decide which
+         * one to use */
+        int32_t tx_chan_id = pi_udma_core_lin_alloc();
+        int32_t rx_chan_id = pi_udma_core_lin_alloc();
+        if (rx_chan_id < 0 || tx_chan_id < 0)
+        {
+            /* give back whichever channel was obtained */
+            if (tx_chan_id >= 0)
+            {
+                pi_udma_core_lin_free(tx_chan_id);
+            }
+            if (rx_chan_id >= 0)
+            {
+                pi_udma_core_lin_free(rx_chan_id);
+            }
+            pi_fc_l1_free(itf_data, sizeof(pi_ffc_itf_data_t));
+            pi_fc_l1_free(dev_data, sizeof(pi_ffc_data_t));
+            pi_irq_restore(irq);
+            return PI_FAIL;
+        }
+        itf_data->rx_chan_id = rx_chan_id;
+        itf_data->tx_chan_id = tx_chan_id;
+
+        pi_fc_event_handler_set(SOC_EVENT_UDMA_CHAN_LIN(itf_data->rx_chan_id),
+                                __pi_ffc_event_handler, itf_data);
+        pi_fc_event_handler_set(SOC_EVENT_UDMA_CHAN_LIN(itf_data->tx_chan_id),
+                                __pi_ffc_event_handler, itf_data);
+        /* use rx as default */
+        pi_soc_eu_fc_mask_set(SOC_EVENT_UDMA_CHAN_LIN(itf_data->rx_chan_id));
+
+        itf_data->nb_open = 1;
+        itf_data->device_id = conf->itf;
+        itf_data->fifo_head = NULL;
+        itf_data->fifo_tail = NULL;
+        itf_data->end_task = NULL;
+        itf_data->latest_conf = NULL;
+
+        /* disable udma reset before setting regs */
+        uint32_t periph_id = ARCHI_UDMA_FFC_ID(itf_data->device_id);
+        udma_ctrl_cfg_rstn_set_set(ARCHI_UDMA_ADDR, (1 << periph_id));
+        udma_ctrl_cfg_cg_set_set(ARCHI_UDMA_ADDR, (1 << periph_id));
+
+        /* publish the interface only once it is fully set up */
+        g_ffc_itf_data[conf->itf] = itf_data;
+    }
+    else
+    {
+        itf_data->nb_open++;
+    }
+
+    /* initialize device data */
+    dev_data->itf_data = itf_data;
+    dev_data->fixed_type = conf->fixed_type;
+    dev_data->fixed_scale = conf->fixed_scale;
+    dev_data->fixed_precision = conf->fixed_precision;
+    dev_data->float_type = conf->float_type;
+    dev_data->mode = conf->mode;
+    dev_data->io_mode = conf->io_mode;
+    dev_data->continuous_mode = 0; /* continuous mode disabled by default */
+    device->data = dev_data;
+
+    pi_irq_restore(irq);
+    return PI_OK;
+}
+
+/*
+ * Close an FFC device and free its device data.
+ *
+ * When the last device of an interface is closed, the uDMA channels are
+ * reset and freed, their events are masked, the peripheral is put back in
+ * reset / clock gating and the interface data is freed.
+ *
+ * When other devices remain open, the interface's latest_conf is cleared if
+ * it designates the device being closed. Without this, a device data block
+ * allocated later at the same address would be mistaken for the already
+ * applied configuration and the registers would not be reprogrammed.
+ *
+ * NOTE(review): tasks still queued or in flight for this device are not
+ * flushed here; callers presumably close only once their requests completed.
+ */
+void pi_ffc_close(pi_device_t *device)
+{
+    uint32_t irq = pi_irq_disable();
+    pi_ffc_data_t *dev_data = (pi_ffc_data_t*) device->data;
+    pi_ffc_itf_data_t* itf_data = dev_data->itf_data;
+
+    /* decrement number of devices opened */
+    itf_data->nb_open--;
+
+    if (itf_data->nb_open == 0)
+    {
+        /* clear events, disable IRQs & free allocated udma channels */
+        pi_udma_core_lin_reset(pi_udma_core_lin_addr_get(itf_data->rx_chan_id));
+        pi_udma_core_lin_reset(pi_udma_core_lin_addr_get(itf_data->tx_chan_id));
+
+        pi_udma_core_lin_free(itf_data->rx_chan_id);
+        pi_udma_core_lin_free(itf_data->tx_chan_id);
+
+        pi_soc_eu_fc_mask_clear(SOC_EVENT_UDMA_CHAN_LIN(itf_data->rx_chan_id));
+        pi_soc_eu_fc_mask_clear(SOC_EVENT_UDMA_CHAN_LIN(itf_data->tx_chan_id));
+
+        uint32_t periph_id = ARCHI_UDMA_FFC_ID(itf_data->device_id);
+        udma_ctrl_cfg_rstn_clr_set(ARCHI_UDMA_ADDR, (1 << periph_id));
+        udma_ctrl_cfg_cg_clr_set(ARCHI_UDMA_ADDR, (1 << periph_id));
+
+        /* free itf data */
+        g_ffc_itf_data[itf_data->device_id] = NULL;
+        pi_fc_l1_free(itf_data, sizeof(pi_ffc_itf_data_t));
+    }
+    else if (itf_data->latest_conf == dev_data)
+    {
+        /* the cached configuration owner is about to be freed */
+        itf_data->latest_conf = NULL;
+    }
+
+    /* free device data and drop the now dangling reference */
+    pi_fc_l1_free(dev_data, sizeof(pi_ffc_data_t));
+    device->data = NULL;
+    pi_irq_restore(irq);
+}
+
+/*
+ * Runtime control of an FFC device.
+ *
+ *  - PI_FFC_IOCTL_SET_IO_MODE: arg carries the new I/O mode value.
+ *  - PI_FFC_IOCTL_CONTINUOUS_ENABLE: arg carries the continuous mode flag.
+ *
+ * Both commands clear the interface's latest_conf so that the configuration
+ * is applied again on the next request. Returns PI_OK for a known command and
+ * PI_FAIL otherwise. Runs with interrupts disabled.
+ */
+int32_t pi_ffc_ioctl(pi_device_t *device, uint32_t cmd, void *arg)
+{
+    int32_t status = PI_OK;
+    uint32_t irq = pi_irq_disable();
+
+    switch (cmd)
+    {
+    case PI_FFC_IOCTL_SET_IO_MODE:
+    {
+        pi_ffc_data_t *ffc = (pi_ffc_data_t*) device->data;
+        ffc->io_mode = (pi_ffc_io_mode_e)((uintptr_t) arg);
+        /* make last config invalid */
+        ffc->itf_data->latest_conf = NULL;
+        break;
+    }
+
+    case PI_FFC_IOCTL_CONTINUOUS_ENABLE:
+    {
+        pi_ffc_data_t *ffc = (pi_ffc_data_t*) device->data;
+        __pi_ffc_change_continuous_mode(ffc, (uint8_t)((uintptr_t) arg));
+        /* make last config invalid */
+        ffc->itf_data->latest_conf = NULL;
+        break;
+    }
+
+    default:
+        /* unknown command */
+        status = PI_FAIL;
+        break;
+    }
+
+    pi_irq_restore(irq);
+    return status;
+}
+
+/*
+ * Blocking conversion: submits the request through pi_ffc_convert_async()
+ * with a local blocking task and waits for that task before returning.
+ */
+void pi_ffc_convert(pi_device_t *device, void* src, void* dst, uint16_t size)
+{
+    pi_task_t done;
+
+    pi_task_block(&done);
+    pi_ffc_convert_async(device, src, dst, size, &done);
+    pi_task_wait_on(&done);
+    pi_task_destroy(&done);
+}
+
+/*
+ * Submit a conversion request without waiting for its completion.
+ *
+ * The request is stored in the task: data[0] = src, data[1] = dst,
+ * data[2] = size and data[3] = pointer to the device data. When the interface
+ * is idle (end_task is NULL) the conversion starts immediately, otherwise the
+ * task is appended to the interface FIFO. Runs with interrupts disabled.
+ */
+void pi_ffc_convert_async(pi_device_t* device, void* src, void* dst,
+                          uint16_t size, pi_task_t* task)
+{
+    uint32_t irq_state = pi_irq_disable();
+
+    pi_ffc_data_t *ffc = (pi_ffc_data_t*) device->data;
+    pi_ffc_itf_data_t *itf = ffc->itf_data;
+
+    /* the task carries its own request description */
+    task->data[0] = (uint32_t) src;
+    task->data[1] = (uint32_t) dst;
+    task->data[2] = (uint32_t) size;
+    task->data[3] = (uint32_t) ffc;
+
+    if (NULL != itf->end_task)
+    {
+        /* a request is in progress: queue this one behind it */
+        __ffc_drv_fifo_enqueue(itf, task);
+    }
+    else
+    {
+        /* interface is idle: run this request now */
+        itf->end_task = task;
+        __pi_ffc_conversion_start(itf);
+    }
+
+    pi_irq_restore(irq_state);
+}
diff --git a/rtos/pmsis/pmsis_implem/chips/gap9/drivers/udma/udma_timeout.c b/rtos/pmsis/pmsis_implem/chips/gap9/drivers/udma/udma_timeout.c
new file mode 100644
index 000000000..88f7e780c
--- /dev/null
+++ b/rtos/pmsis/pmsis_implem/chips/gap9/drivers/udma/udma_timeout.c
@@ -0,0 +1,349 @@
+/*
+ * Copyright (c) 2021, GreenWaves Technologies, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ *
+ * o Redistributions of source code must retain the above copyright notice, this list
+ *   of conditions and the following disclaimer.
+ *
+ * o Redistributions in binary form must reproduce the above copyright notice, this
+ *   list of conditions and the following disclaimer in the documentation and/or
+ *   other materials provided with the distribution.
+ *
+ * o Neither the name of GreenWaves Technologies, Inc. nor the names of its
+ *   contributors may be used to endorse or promote products derived from this
+ *   software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "pmsis.h"
+#include "chips/gap9/drivers/udma/udma_timeout.h"
+
+/*******************************************************************************
+ * Definitions
+ ******************************************************************************/
+
+#if !defined(__FREERTOS__)
+#define UDMA_NB_TIMEOUT       ( 8 )
+#endif  /* __FREERTOS__ */
+
+/*******************************************************************************
+ * Driver data
+ ******************************************************************************/
+
+static struct pi_udma_timeout_s *g_udma_timeout[UDMA_NB_TIMEOUT];
+
+/*******************************************************************************
+ * Function declaration
+ ******************************************************************************/
+
+/* Handler registered for UDMA timeout SoC events; arg carries the timeout id. */
+static void __pi_udma_timeout_event_handler(uint32_t event, void *arg);
+
+/* Enqueue task at the tail of the timeout's SW fifo. */
+static inline void __pi_udma_timeout_task_push(struct pi_udma_timeout_s *driver_data,
+                                               pi_task_t *task);
+
+/* Pop the task at the head of the timeout's SW fifo (NULL when none). */
+static inline pi_task_t *__pi_udma_timeout_task_pop(struct pi_udma_timeout_s *driver_data);
+
+/* Start a UDMA timer, when in SW trigger mode. */
+static void __pi_udma_timeout_start(uint8_t timeout_id);
+
+/* Stop a UDMA timer (prescaler and timeout channel are both disabled). */
+static void __pi_udma_timeout_stop(uint8_t timeout_id);
+
+/*******************************************************************************
+ * Internal functions
+ ******************************************************************************/
+
+/** TIMEOUT_PRE Register. */
+/* Return the raw value of the TIMEOUT_PRE register of the given timeout. */
+static inline uint32_t __pi_udma_ctrl_timeout_prescaler_conf_get(uint8_t timeout_id)
+{
+    /* registers of consecutive timeouts are 8 bytes apart */
+    uint32_t pre_offset = UDMA_CTRL_TIMEOUT_PRE0_OFFSET;
+    pre_offset += (timeout_id << 3);
+
+    uint32_t raw_conf = GAP_READ((uint32_t) ARCHI_UDMA_ADDR, pre_offset);
+    return raw_conf;
+}
+
+/* Return the EN field of the TIMEOUT_PRE register of the given timeout. */
+static inline uint32_t __pi_udma_ctrl_timeout_prescaler_enabled(uint8_t timeout_id)
+{
+    uint32_t raw_conf = __pi_udma_ctrl_timeout_prescaler_conf_get(timeout_id);
+    uint32_t en_field = raw_conf & UDMA_CTRL_TIMEOUT_PRE0_EN_MASK;
+
+    return (en_field >> UDMA_CTRL_TIMEOUT_PRE0_EN_BIT);
+}
+
+/*
+ * Overwrite the TIMEOUT_PRE register of the given timeout with a new counter
+ * value and enable flag (every other field is written as 0).
+ */
+static inline void __pi_udma_ctrl_timeout_prescaler_set(uint8_t timeout_id,
+                                                       uint16_t presc_value,
+                                                       uint8_t enable)
+{
+    uint32_t pre_offset = (UDMA_CTRL_TIMEOUT_PRE0_OFFSET + (timeout_id << 3));
+
+    uint32_t pre_conf = UDMA_CTRL_TIMEOUT_PRE0_EN(enable);
+    pre_conf |= UDMA_CTRL_TIMEOUT_PRE0_CNT(presc_value);
+
+    GAP_WRITE((uint32_t) ARCHI_UDMA_ADDR, pre_offset, pre_conf);
+}
+
+/* Read-modify-write of TIMEOUT_PRE: set the clr field, keep the other fields. */
+static inline void __pi_udma_ctrl_timeout_prescaler_reset(uint8_t timeout_id)
+{
+    uint32_t pre_offset = (UDMA_CTRL_TIMEOUT_PRE0_OFFSET + (timeout_id << 3));
+    udma_ctrl_timeout_pre0_t pre_reg = {0};
+
+    pre_reg.raw = GAP_READ((uint32_t) ARCHI_UDMA_ADDR, pre_offset);
+    pre_reg.clr = 1;
+    GAP_WRITE((uint32_t) ARCHI_UDMA_ADDR, pre_offset, pre_reg.raw);
+}
+
+/* Read-modify-write of TIMEOUT_PRE: set the en field, keep the other fields. */
+static inline void __pi_udma_ctrl_timeout_prescaler_start(uint8_t timeout_id)
+{
+    uint32_t pre_offset = (UDMA_CTRL_TIMEOUT_PRE0_OFFSET + (timeout_id << 3));
+    udma_ctrl_timeout_pre0_t pre_reg = {0};
+
+    pre_reg.raw = GAP_READ((uint32_t) ARCHI_UDMA_ADDR, pre_offset);
+    pre_reg.en = 1;
+    GAP_WRITE((uint32_t) ARCHI_UDMA_ADDR, pre_offset, pre_reg.raw);
+}
+
+/* Read-modify-write of TIMEOUT_PRE: clear the en field, keep the other fields. */
+static inline void __pi_udma_ctrl_timeout_prescaler_stop(uint8_t timeout_id)
+{
+    uint32_t pre_offset = (UDMA_CTRL_TIMEOUT_PRE0_OFFSET + (timeout_id << 3));
+    udma_ctrl_timeout_pre0_t pre_reg = {0};
+
+    pre_reg.raw = GAP_READ((uint32_t) ARCHI_UDMA_ADDR, pre_offset);
+    pre_reg.en = 0;
+    GAP_WRITE((uint32_t) ARCHI_UDMA_ADDR, pre_offset, pre_reg.raw);
+}
+
+
+/** TIMEOUT_CHX Register. */
+/* Return the raw value of the TIMEOUT_CH register of the given timeout. */
+static inline uint32_t __pi_udma_ctrl_timeout_timeout_get(uint8_t timeout_id)
+{
+    /* registers of consecutive timeouts are 8 bytes apart */
+    uint32_t ch_offset = UDMA_CTRL_TIMEOUT_CH0_OFFSET;
+    ch_offset += (timeout_id << 3);
+
+    uint32_t raw_conf = GAP_READ((uint32_t) ARCHI_UDMA_ADDR, ch_offset);
+    return raw_conf;
+}
+
+/*
+ * Overwrite the TIMEOUT_CH register of the given timeout with the monitored
+ * uDMA channel id, the mode, the enable flag and the counter value.
+ */
+static inline void __pi_udma_ctrl_timeout_timeout_set(uint8_t timeout_id,
+                                                     uint8_t udma_chan_id,
+                                                     uint8_t mode,
+                                                     uint16_t timeout_val,
+                                                     uint8_t enable)
+{
+    uint32_t ch_offset = (UDMA_CTRL_TIMEOUT_CH0_OFFSET + (timeout_id << 3));
+
+    /* assemble the register value field by field */
+    uint32_t ch_conf = UDMA_CTRL_TIMEOUT_CH0_CNT(timeout_val);
+    ch_conf |= UDMA_CTRL_TIMEOUT_CH0_EN(enable);
+    ch_conf |= UDMA_CTRL_TIMEOUT_CH0_MODE(mode);
+    ch_conf |= UDMA_CTRL_TIMEOUT_CH0_SOURCE_ID(udma_chan_id);
+
+    GAP_WRITE((uint32_t) ARCHI_UDMA_ADDR, ch_offset, ch_conf);
+}
+/* Read-modify-write of TIMEOUT_CH: update the mode field, keep the other fields. */
+static inline void __pi_udma_ctrl_timeout_mode_set(uint8_t timeout_id, uint8_t mode)
+{
+    uint32_t ch_offset = (UDMA_CTRL_TIMEOUT_CH0_OFFSET + (timeout_id << 3));
+    udma_ctrl_timeout_ch0_t ch_reg = {0};
+
+    ch_reg.raw = GAP_READ((uint32_t) ARCHI_UDMA_ADDR, ch_offset);
+    ch_reg.mode = mode;
+    GAP_WRITE((uint32_t) ARCHI_UDMA_ADDR, ch_offset, ch_reg.raw);
+}
+
+/* Read-modify-write of TIMEOUT_CH: set the en field, keep the other fields. */
+static inline void __pi_udma_ctrl_timeout_timeout_start(uint8_t timeout_id)
+{
+    uint32_t ch_offset = (UDMA_CTRL_TIMEOUT_CH0_OFFSET + (timeout_id << 3));
+    udma_ctrl_timeout_ch0_t ch_reg = {0};
+
+    ch_reg.raw = GAP_READ((uint32_t) ARCHI_UDMA_ADDR, ch_offset);
+    ch_reg.en = 1;
+    GAP_WRITE((uint32_t) ARCHI_UDMA_ADDR, ch_offset, ch_reg.raw);
+}
+
+/* Read-modify-write of TIMEOUT_CH: clear the en field, keep the other fields. */
+static inline void __pi_udma_ctrl_timeout_timeout_stop(uint8_t timeout_id)
+{
+    uint32_t ch_offset = (UDMA_CTRL_TIMEOUT_CH0_OFFSET + (timeout_id << 3));
+    udma_ctrl_timeout_ch0_t ch_reg = {0};
+
+    ch_reg.raw = GAP_READ((uint32_t) ARCHI_UDMA_ADDR, ch_offset);
+    ch_reg.en = 0;
+    GAP_WRITE((uint32_t) ARCHI_UDMA_ADDR, ch_offset, ch_reg.raw);
+}
+
+
+/*
+ * Handler of a UDMA timeout event.
+ *
+ * arg carries the timeout id. The first task of that timeout's FIFO is
+ * popped and the timer is stopped. When a task was pending, the callback
+ * stored in task->arg[2] is called with task->arg[3] (per the original
+ * comment, this aborts the transfer), task->arg[2] is then overwritten with
+ * -1 as transfer result and the task is pushed. The event number is unused.
+ */
+static void __pi_udma_timeout_event_handler(uint32_t event, void *arg)
+{
+    uint32_t timeout_id = (uint32_t) arg;
+    struct pi_task *expired = __pi_udma_timeout_task_pop(g_udma_timeout[timeout_id]);
+
+    __pi_udma_timeout_stop(timeout_id);
+
+    if (NULL == expired)
+    {
+        /* nothing was waiting on this timeout */
+        return;
+    }
+
+    /* Timeout reached, abort transfer. */
+    pi_callback_func_t abort_cb = (pi_callback_func_t) expired->arg[2];
+    abort_cb((void *) expired->arg[3]);
+
+    /* Set transfer end result. */
+    expired->arg[2] = -1;
+
+    /* Release event task. */
+    pi_task_push(expired);
+}
+
+/*
+ * Append a task at the tail of a timeout's SW fifo, with interrupts disabled.
+ *
+ * The fifo is considered empty when its head is NULL or the 0xFFFFFFFF
+ * marker written at allocation time. The appended task always becomes the
+ * tail and its next pointer is cleared, so the list stays NULL-terminated
+ * and a later pop never follows a stale next pointer left in the task.
+ */
+static inline void __pi_udma_timeout_task_push(struct pi_udma_timeout_s *driver_data,
+                                               pi_task_t *task)
+{
+    uint32_t irq = disable_irq();
+    if ((driver_data->fifo_head == NULL) || (driver_data->fifo_head == (void *) 0xFFFFFFFF))
+    {
+        driver_data->fifo_head = task;
+    }
+    else
+    {
+        driver_data->fifo_tail->next = task;
+    }
+    driver_data->fifo_tail = task;
+    /* terminate the list: the task may carry a next pointer from a previous use */
+    task->next = NULL;
+    restore_irq(irq);
+}
+
+/*
+ * Detach and return the task at the head of a timeout's SW fifo.
+ *
+ * Returns NULL when there is no driver data or when the fifo is empty. The
+ * fifo is empty when its head is NULL or the 0xFFFFFFFF marker written at
+ * allocation time (the marker is not a task and must not be dereferenced).
+ * When the popped task is also the tail, the fifo becomes empty (head set to
+ * NULL) without relying on the task's next pointer.
+ */
+static inline pi_task_t *__pi_udma_timeout_task_pop(struct pi_udma_timeout_s *driver_data)
+{
+    if (driver_data == NULL)
+    {
+        return NULL;
+    }
+
+    pi_task_t *task_return = driver_data->fifo_head;
+    if ((task_return == NULL) || (task_return == (void *) 0xFFFFFFFF))
+    {
+        /* empty fifo, possibly still holding the allocation marker */
+        return NULL;
+    }
+
+    if (task_return == driver_data->fifo_tail)
+    {
+        /* last element: the fifo is now drained */
+        driver_data->fifo_head = NULL;
+    }
+    else
+    {
+        driver_data->fifo_head = task_return->next;
+    }
+    return task_return;
+}
+
+/*
+ * Start a UDMA timer, when in SW trigger mode.
+ * Sequence: disable the prescaler, set its clr bit, enable the timeout
+ * channel, then enable the prescaler again.
+ */
+static void __pi_udma_timeout_start(uint8_t timeout_id)
+{
+    __pi_udma_ctrl_timeout_prescaler_stop(timeout_id);
+    __pi_udma_ctrl_timeout_prescaler_reset(timeout_id);
+    __pi_udma_ctrl_timeout_timeout_start(timeout_id);
+    __pi_udma_ctrl_timeout_prescaler_start(timeout_id);
+}
+
+/* Stop a UDMA timer: disable the prescaler first, then the timeout channel. */
+static void __pi_udma_timeout_stop(uint8_t timeout_id)
+{
+    __pi_udma_ctrl_timeout_prescaler_stop(timeout_id);
+    __pi_udma_ctrl_timeout_timeout_stop(timeout_id);
+}
+
+/*
+ * Arm a UDMA timeout for a transfer.
+ *
+ * timeout_us is converted to ticks of the reference clock
+ * (ARCHI_FLL_REF_CLOCK). While the tick count does not fit in 16 bits it is
+ * halved and the prescaler value doubled. The task is then queued on the
+ * timeout's SW fifo and the prescaler / timeout registers are programmed; the
+ * counters are enabled immediately unless the timeout was allocated in
+ * PI_UDMA_TIMEOUT_MODE_SW_TRIGGER mode.
+ *
+ * Returns 0 on success, -11 when the timeout is already in use (prescaler
+ * enabled), and -1 when timeout_id does not designate an allocated timeout
+ * or when timeout_us is too large for the 16-bit prescaler (the prescaler
+ * used to wrap silently to 0 in that case).
+ */
+int32_t pi_udma_timeout_config_set(pi_task_t *task, uint8_t timeout_id,
+                                   uint8_t udma_chan_id, uint32_t timeout_us)
+{
+    int32_t status = 0;
+
+    /* the id indexes g_udma_timeout and the entry is dereferenced below */
+    if ((timeout_id >= UDMA_NB_TIMEOUT) || (g_udma_timeout[timeout_id] == NULL))
+    {
+        TIMEOUT_TRACE_ERR("Timeout id=%d is not allocated\n", timeout_id);
+        return -1;
+    }
+
+    /* Fast clock used by timeout. */
+    float periph_freq = (float) ARCHI_FLL_REF_CLOCK; /* 24MHz. */
+    float nb_tick_us = periph_freq / 1000000.0;
+    float timeout_val = ((float) timeout_us) * nb_tick_us;
+    uint16_t prescaler = 0;
+    TIMEOUT_TRACE("Timeout_%d : configure udma_chan=%d, timeout_us=%ld, task=%lx\n",
+                  timeout_id, udma_chan_id, timeout_us, task);
+    TIMEOUT_TRACE("Periph_freq=%f, timeout_us=%ld, nb_tick_us=%f, timeout_val=%f\n",
+                  periph_freq, timeout_us, nb_tick_us, timeout_val);
+    prescaler = (timeout_val > 0xFFFF);
+    while (timeout_val > 0xFFFF)
+    {
+        if ((prescaler & 0x8000) != 0)
+        {
+            /* one more doubling would wrap the 16-bit prescaler to 0 */
+            TIMEOUT_TRACE_ERR("Timeout_%d : timeout_us=%ld is out of range\n",
+                              timeout_id, timeout_us);
+            return -1;
+        }
+        prescaler <<= 1;
+        timeout_val /= 2;
+    }
+    if (__pi_udma_ctrl_timeout_prescaler_enabled(timeout_id))
+    {
+        /* Timeout already in use. */
+        TIMEOUT_TRACE_ERR("Timeout id=%d already in use\n", timeout_id);
+        return -11;
+    }
+
+    __pi_udma_timeout_task_push(g_udma_timeout[timeout_id], task);
+
+    uint32_t mode = g_udma_timeout[timeout_id]->mode;
+    /* in SW trigger mode the counters are only enabled by an explicit start */
+    uint8_t enable = (mode != PI_UDMA_TIMEOUT_MODE_SW_TRIGGER);
+    TIMEOUT_TRACE("Timeout_%d : mode=%d, prescaler=%d, timeout=%f, enable=%d\n",
+                  timeout_id, mode, prescaler, timeout_val, enable);
+    __pi_udma_ctrl_timeout_prescaler_set(timeout_id, prescaler, enable);
+    __pi_udma_ctrl_timeout_timeout_set(timeout_id, udma_chan_id, mode,
+                                      (uint16_t) timeout_val, enable);
+    __pi_udma_ctrl_timeout_prescaler_reset(timeout_id);
+    return status;
+}
+
+/*
+ * Remove and return the task at the head of the given timeout's SW fifo
+ * (result of __pi_udma_timeout_task_pop on that timeout's driver data).
+ */
+pi_task_t *__pi_udma_timeout_task_remove(uint8_t timeout_id)
+{
+    struct pi_udma_timeout_s *timeout = g_udma_timeout[timeout_id];
+    pi_task_t *removed = __pi_udma_timeout_task_pop(timeout);
+
+    return removed;
+}
+
+/*******************************************************************************
+ * API implementation
+ ******************************************************************************/
+
+/*
+ * Allocate a UDMA timeout.
+ *
+ * The first slot whose entry is NULL, or whose fifo head is NULL, is taken.
+ * A driver structure is allocated for it, its fifo head is set to the
+ * 0xFFFFFFFF "allocated" marker, the mode is recorded, and the matching SoC
+ * event is routed to the timeout event handler.
+ *
+ * Returns the timeout id, -1 when every slot is in use, or -11 when the
+ * driver structure cannot be allocated.
+ *
+ * NOTE(review): a non-NULL entry with a NULL fifo head is treated as free
+ * and overwritten without being released -- confirm this can only be an
+ * entry already freed by pi_udma_timeout_free().
+ */
+int32_t pi_udma_timeout_alloc(pi_udma_timeout_mode_e mode)
+{
+    int32_t timeout_id = -1;
+    for (uint32_t tid = 0; tid < (uint32_t) UDMA_NB_TIMEOUT; tid++)
+    {
+        /* skip slots that are currently in use */
+        if ((g_udma_timeout[tid] != NULL) && (g_udma_timeout[tid]->fifo_head != NULL))
+        {
+            continue;
+        }
+
+        /* Alloc UDMA timeout struct. */
+        g_udma_timeout[tid] = pi_fc_l1_malloc(sizeof(struct pi_udma_timeout_s));
+        if (g_udma_timeout[tid] == NULL)
+        {
+            TIMEOUT_TRACE_ERR("Timeout struct alloc failed !\n");
+            timeout_id = -11;
+            break;
+        }
+
+        struct pi_udma_timeout_s *slot = g_udma_timeout[tid];
+        slot->fifo_head = (void *) 0xFFFFFFFF;
+        slot->mode = mode;
+
+        /* Set FC event handler. */
+        pi_fc_event_handler_set(SOC_EVENT_UDMA_TIMEOUT(tid),
+                                __pi_udma_timeout_event_handler,
+                                (void *) tid);
+        /* Enable SoC events propagation to FC. */
+        pi_soc_eu_fc_mask_set(SOC_EVENT_UDMA_TIMEOUT(tid));
+
+        timeout_id = tid;
+        TIMEOUT_TRACE("Timeout id=%ld allocated\n", timeout_id);
+        break;
+    }
+    return timeout_id;
+}
+
+/*
+ * Release a UDMA timeout previously returned by pi_udma_timeout_alloc().
+ *
+ * The SoC event is masked and its handler cleared before the driver
+ * structure is freed, so the handler cannot run on freed memory. The slot
+ * pointer is then reset to NULL, which marks the slot as free for the
+ * allocator without leaving a dangling pointer behind.
+ *
+ * Ids out of range and slots that are not allocated are ignored, which also
+ * makes a repeated free harmless.
+ */
+void pi_udma_timeout_free(int32_t timeout_id)
+{
+    TIMEOUT_TRACE("Timeout_%ld : free timeout.\n", timeout_id);
+    if ((timeout_id < 0) || (timeout_id >= (int32_t) UDMA_NB_TIMEOUT) ||
+        (g_udma_timeout[timeout_id] == NULL))
+    {
+        return;
+    }
+
+    /* Disable SoC events propagation. */
+    pi_soc_eu_fc_mask_clear(SOC_EVENT_UDMA_TIMEOUT(timeout_id));
+    /* Clear FC event handler. */
+    pi_fc_event_handler_clear(SOC_EVENT_UDMA_TIMEOUT(timeout_id));
+
+    /* Free UDMA timeout struct and forget it. */
+    pi_fc_l1_free(g_udma_timeout[timeout_id], sizeof(struct pi_udma_timeout_s));
+    g_udma_timeout[timeout_id] = NULL;
+}
+
+/*
+ * Runtime control of a UDMA timeout.
+ *
+ *  - PI_UDMA_TIMEOUT_IOCTL_START: start the timer.
+ *  - PI_UDMA_TIMEOUT_IOCTL_STOP: stop the timer.
+ *
+ * arg is only used by the trace. Returns 0 for a known command, -1 otherwise.
+ */
+int32_t pi_udma_timeout_ioctl(int32_t timeout_id, uint32_t cmd, void *arg)
+{
+    TIMEOUT_TRACE("Timeout_%ld : ioctl cmd=%ld, arg=%lx\n", timeout_id, cmd, arg);
+
+    if (cmd == PI_UDMA_TIMEOUT_IOCTL_START)
+    {
+        __pi_udma_timeout_start(timeout_id);
+        return 0;
+    }
+
+    if (cmd == PI_UDMA_TIMEOUT_IOCTL_STOP)
+    {
+        __pi_udma_timeout_stop(timeout_id);
+        return 0;
+    }
+
+    /* unknown command */
+    return -1;
+}
diff --git a/rtos/pmsis/pmsis_implem/chips/gap9/drivers/udma/udma_timestamp.c b/rtos/pmsis/pmsis_implem/chips/gap9/drivers/udma/udma_timestamp.c
new file mode 100644
index 000000000..dbe04185c
--- /dev/null
+++ b/rtos/pmsis/pmsis_implem/chips/gap9/drivers/udma/udma_timestamp.c
@@ -0,0 +1,399 @@
+/*
+ * Copyright (c) 2020, GreenWaves Technologies, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ *
+ * o Redistributions of source code must retain the above copyright notice, this list
+ *   of conditions and the following disclaimer.
+ *
+ * o Redistributions in binary form must reproduce the above copyright notice, this
+ *   list of conditions and the following disclaimer in the documentation and/or
+ *   other materials provided with the distribution.
+ *
+ * o Neither the name of GreenWaves Technologies, Inc. nor the names of its
+ *   contributors may be used to endorse or promote products derived from this
+ *   software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "pmsis.h"
+#include "udma_timestamp.h"
+
+/*******************************************************************************
+ * Definitions
+ ******************************************************************************/
+
+#if !defined(__FREERTOS__)  /* Names below are defined here only when __FREERTOS__ is not set. */
+#define UDMA_TIMESTAMP_ID(id)    ( ARCHI_UDMA_TS_ID((id)) )    /* used by open/close as the bit index in the UDMA reset and CG registers */
+#define UDMA_TIMESTAMP(id)       ( UDMA_TS_ADDR((id)) )        /* stored as ts->base by open */
+#define UDMA_NB_TIMESTAMP        ( ARCHI_UDMA_NB_TS )          /* sizes timestamp_cnt[] */
+#define TIMESTAMP_TRACE(...)     ( (void) 0 )                  /* traces compile to nothing */
+#define TIMESTAMP_TRACE_ERR(...) ( (void) 0 )                  /* error traces compile to nothing */
+#endif  /* __FREERTOS__ */
+
+/*******************************************************************************
+ * Driver data
+ ******************************************************************************/
+
+static struct pi_udma_timestamp_cnt_t timestamp_cnt[UDMA_NB_TIMESTAMP];  /* counter state, indexed by conf->itf in open */
+static struct pi_udma_ts_evt_t ts_evt[UDMA_NB_TIMESTAMP_EVT];  /* event bookkeeping, indexed by ts_evt_id */
+static struct pi_udma_ts_input_t ts_input[UDMA_NB_TIMESTAMP_INPUT];  /* input bookkeeping, indexed by ts_input_id */
+static uint8_t evt_mask = 0xF;  /* free event slots: a set bit is available; cleared on alloc, reset to 0xF on close */
+static uint8_t input_mask = 0xFF;  /* free input slots, same convention; reset to 0xFF on close */
+
+/*******************************************************************************
+ * Function declaration
+ ******************************************************************************/
+
+/*******************************************************************************
+ * Internal functions
+ ******************************************************************************/
+
+static void __pi_udma_timestamp_cnt_clr(uint32_t base)  /* backs the PI_UDMA_TIMESTAMP_IOCTL_CLR command */
+{
+    udma_timestamp_reg_cmd_cnt_clr_set(base, 1);  /* writes 1 to the cnt_clr command field; presumably clears the counter - confirm in the register spec */
+}
+
+static void __pi_udma_timestamp_cnt_stop(uint32_t base)  /* backs PI_UDMA_TIMESTAMP_IOCTL_STOP and the close path */
+{
+    udma_timestamp_reg_cmd_cnt_stop_set(base, 1);  /* writes 1 to the cnt_stop command field of the block at base */
+}
+
+static void __pi_udma_timestamp_cnt_close(struct pi_udma_timestamp_cnt_t *ts)
+{
+    /* Tear down counter ts: stop it, then zero its trigger and clock-config fields. */
+    __pi_udma_timestamp_cnt_stop(ts->base);
+    /* Flag the counter as disabled; pi_udma_timestamp_open() tests this flag. */
+    ts->cnt_en = 0;
+    /* Only undo the external trigger when one was configured (0xFF means none). */
+    if (ts->cnt_trig_gpio != 0xFF)
+    {
+        udma_timestamp_reg_setup_cnt_ext_sel_set(ts->base, 0);
+        udma_timestamp_reg_setup_cnt_ext_type_set(ts->base, 0);
+        udma_timestamp_reg_setup_cnt_ext_en_set(ts->base, 0);
+    }
+    /* Zero every clock-configuration field that open may have written. */
+    udma_timestamp_reg_clk_cfg_clk_mux_set(ts->base,   0);
+    udma_timestamp_reg_clk_cfg_gpio_sel_set(ts->base,  0);
+    udma_timestamp_reg_clk_cfg_pwm_sel_set(ts->base,   0);
+    udma_timestamp_reg_clk_cfg_prescaler_set(ts->base, 0);
+    udma_timestamp_reg_clk_cfg_clk_mux_en_set(ts->base, 0);
+
+    //TODO: clear the remaining fields of *ts (only cnt_en is reset above)
+    //memset(ts,0,sizeof(struct pi_udma_timestamp_cnt_t));
+}
+
+static void __pi_udma_evt_cfg_init(void)
+{
+    uint32_t base = ARCHI_UDMA_ADDR;  /* the UDMA control block, not a timestamp block */
+    // Set the four cfg_event compare fields (evt0..evt3) of the UDMA control block to 0xFF.
+    udma_ctrl_cfg_event_cmp_evt0_set(base, 0xFF);
+    udma_ctrl_cfg_event_cmp_evt1_set(base, 0xFF);
+    udma_ctrl_cfg_event_cmp_evt2_set(base, 0xFF);
+    udma_ctrl_cfg_event_cmp_evt3_set(base, 0xFF);
+}
+
+
+/* Reserve a free timestamp event slot; returns its id (also stored in evt->ts_evt_id), or -1. */
+static int __pi_udma_timestamp_evt_alloc(uint32_t ts_base, pi_timestamp_event_t * evt)
+{
+    uint8_t slot;
+    /* An empty mask means every event slot is already taken. */
+    if (!evt_mask)
+    {
+        TIMESTAMP_TRACE_ERR("All the events have been reserved\n");
+        return -1;
+    }
+    /* Take the slot picked by __builtin_pulp_fl1 and clear its bit in the free mask. */
+    slot = __builtin_pulp_fl1((evt_mask));
+    evt_mask &= ~(1<<slot);
+    evt->ts_evt_id = slot;
+    return slot;
+}
+
+/* Route timestamp event evt->ts_evt_id to FIFO evt->dest_id. Returns 0, or -1 for an id outside 0..3. */
+static int __pi_udma_timestamp_evt_cfg(uint32_t ts_base, pi_timestamp_event_t * evt)
+{
+    switch (evt->ts_evt_id)
+    {
+        case 0:
+            udma_timestamp_reg_event_dest_id_evt_0_set(ts_base, evt->dest_id);
+            break;
+        case 1:
+            udma_timestamp_reg_event_dest_id_evt_1_set(ts_base, evt->dest_id);
+            break;
+        case 2:
+            udma_timestamp_reg_event_dest_id_evt_2_set(ts_base, evt->dest_id);
+            break;
+        case 3:
+            udma_timestamp_reg_event_dest_id_evt_3_set(ts_base, evt->dest_id);
+            break;
+        default:
+            TIMESTAMP_TRACE_ERR("Unknown timestamp event numbe= %d\n", evt->ts_evt_id);
+            return -1;
+    }
+    /* Shadow the destination only now that the id is known to index ts_evt[] safely. */
+    ts_evt[evt->ts_evt_id].dest_id = evt->dest_id;
+    return 0;
+}
+
+/*
+ * Reserve a free timestamp input channel and program it from *input.
+ * On success input->ts_input_id holds the reserved channel and 0 is returned.
+ * Returns -1 when input_sel does not fit in 6 bits or when no channel is free;
+ * the selector is validated first so that a rejected request keeps no channel.
+ */
+static int __pi_udma_timestamp_input_set(uint32_t base, pi_timestamp_input_t * input)
+{
+    uint8_t src_id;
+
+    /* Validate before allocating: failing afterwards would leak the reserved slot. */
+    if (input->input_sel >> 6)
+    {
+        TIMESTAMP_TRACE_ERR("GPIO ID bigger than 63\n");
+        return -1;
+    }
+    if (!input_mask)
+    {
+        TIMESTAMP_TRACE_ERR("All the inputs have been reserved\n");
+        return -1;
+    }
+    /* Take the slot picked by __builtin_pulp_fl1 and mirror the request in ts_input[]. */
+    src_id = __builtin_pulp_fl1((input_mask));
+    ts_input[src_id].ts_input_id = src_id;
+    ts_input[src_id].dest_id = input->dest_id;
+    ts_input[src_id].input_sel = input->input_sel;
+    ts_input[src_id].input_type = input->input_type;
+    input->ts_input_id = src_id;
+    input_mask &= ~(1<<src_id);
+
+    /* Program the setup register fields of the reserved channel and enable it. */
+    switch (input->ts_input_id)
+    {
+        case 0:
+            udma_timestamp_reg_setup_ch0_input_sel_set(base, input->input_sel);
+            udma_timestamp_reg_setup_ch0_input_type_set(base, input->input_type);
+            udma_timestamp_reg_setup_ch0_input_en_set(base, 1);
+            udma_timestamp_reg_setup_ch0_dest_id_set(base, input->dest_id);
+            break;
+        case 1:
+            udma_timestamp_reg_setup_ch1_input_sel_set(base, input->input_sel);
+            udma_timestamp_reg_setup_ch1_input_type_set(base, input->input_type);
+            udma_timestamp_reg_setup_ch1_input_en_set(base, 1);
+            udma_timestamp_reg_setup_ch1_dest_id_set(base, input->dest_id);
+            break;
+        case 2:
+            udma_timestamp_reg_setup_ch2_input_sel_set(base, input->input_sel);
+            udma_timestamp_reg_setup_ch2_input_type_set(base, input->input_type);
+            udma_timestamp_reg_setup_ch2_input_en_set(base, 1);
+            udma_timestamp_reg_setup_ch2_dest_id_set(base, input->dest_id);
+            break;
+        case 3:
+            udma_timestamp_reg_setup_ch3_input_sel_set(base, input->input_sel);
+            udma_timestamp_reg_setup_ch3_input_type_set(base, input->input_type);
+            udma_timestamp_reg_setup_ch3_input_en_set(base, 1);
+            udma_timestamp_reg_setup_ch3_dest_id_set(base, input->dest_id);
+            break;
+        case 4:
+            udma_timestamp_reg_setup_ch4_input_sel_set(base, input->input_sel);
+            udma_timestamp_reg_setup_ch4_input_type_set(base, input->input_type);
+            udma_timestamp_reg_setup_ch4_input_en_set(base, 1);
+            udma_timestamp_reg_setup_ch4_dest_id_set(base, input->dest_id);
+            break;
+        case 5:
+            udma_timestamp_reg_setup_ch5_input_sel_set(base, input->input_sel);
+            udma_timestamp_reg_setup_ch5_input_type_set(base, input->input_type);
+            udma_timestamp_reg_setup_ch5_input_en_set(base, 1);
+            udma_timestamp_reg_setup_ch5_dest_id_set(base, input->dest_id);
+            break;
+        case 6:
+            udma_timestamp_reg_setup_ch6_input_sel_set(base, input->input_sel);
+            udma_timestamp_reg_setup_ch6_input_type_set(base, input->input_type);
+            udma_timestamp_reg_setup_ch6_input_en_set(base, 1);
+            udma_timestamp_reg_setup_ch6_dest_id_set(base, input->dest_id);
+            break;
+        case 7:
+            udma_timestamp_reg_setup_ch7_input_sel_set(base, input->input_sel);
+            udma_timestamp_reg_setup_ch7_input_type_set(base, input->input_type);
+            udma_timestamp_reg_setup_ch7_input_en_set(base, 1);
+            udma_timestamp_reg_setup_ch7_dest_id_set(base, input->dest_id);
+            break;
+        default:
+            break;
+    }
+    return 0;
+}
+
+/*******************************************************************************
+ * API implementation
+ ******************************************************************************/
+
+void pi_timestamp_conf_init(struct pi_timestamp_conf *conf)  /* Fill *conf with the driver defaults listed below. */
+{
+    conf->itf = 0;  /* interface index; open uses it to select timestamp_cnt[] */
+    conf->cnt_trig_gpio = 0xFF;  /* 0xFF: open skips the external trigger setup */
+    conf->cnt_trig_type = PI_TIMESTAMP_AUX_INPUT;
+    conf->cnt_src = PI_TIMESTAMP_CNT_REF_CLK_QUICK;  /* default counter clock source */
+    conf->cnt_src_id = 0xFF;  /* only read by open for the GPIO and PWM sources */
+    conf->prescaler = 0;  /* 0: open does not write the prescaler field */
+}
+
+void pi_udma_timestamp_open(struct pi_device * device)
+{
+    /* Open the timestamp counter described by device->config; the whole body runs with IRQs disabled. */
+    uint32_t irq = pi_irq_disable();
+    struct pi_timestamp_conf *conf = (struct pi_timestamp_conf *) device -> config;
+    /* NOTE(review): conf->itf indexes timestamp_cnt[] without a range check - confirm callers. */
+    struct pi_udma_timestamp_cnt_t *ts = &timestamp_cnt[conf->itf];
+    device->data = (void *)ts;
+    if (ts->cnt_en)
+    {
+        TIMESTAMP_TRACE("Timestamp counter already set, ignore this counter init");
+    }
+    else
+    {
+        ts->base = UDMA_TIMESTAMP(conf->itf);
+        /* Disable UDMA CG and reset periph. */
+        uint32_t periph_id = UDMA_TIMESTAMP_ID(conf->itf);
+        udma_ctrl_cfg_rstn_set_set(ARCHI_UDMA_ADDR, (1 << periph_id));
+        udma_ctrl_cfg_cg_set_set(ARCHI_UDMA_ADDR, (1 << periph_id));
+        /* Optional external trigger: 0xFF (the pi_timestamp_conf_init default) leaves it unconfigured. */
+        if (conf->cnt_trig_gpio != 0xFF)
+        {
+            udma_timestamp_reg_setup_cnt_ext_sel_set(ts->base, conf->cnt_trig_gpio);
+            udma_timestamp_reg_setup_cnt_ext_type_set(ts->base, conf->cnt_trig_type);
+            udma_timestamp_reg_setup_cnt_ext_en_set(ts->base, 1);
+        }
+        /* Every source except PI_TIMESTAMP_CNT_SOC_CLK is selected through the clock mux. */
+        if (conf->cnt_src != PI_TIMESTAMP_CNT_SOC_CLK)
+        {
+            udma_timestamp_reg_clk_cfg_clk_mux_set(ts->base, conf->cnt_src);
+            if (conf->cnt_src == PI_TIMESTAMP_CNT_GPIO)
+            {
+                udma_timestamp_reg_clk_cfg_gpio_sel_set(ts->base, conf->cnt_src_id);
+            }
+            else if(conf->cnt_src == PI_TIMESTAMP_CNT_PWM)
+            {
+                udma_timestamp_reg_clk_cfg_pwm_sel_set(ts->base, conf->cnt_src_id);
+            }
+            udma_timestamp_reg_clk_cfg_clk_mux_en_set(ts->base, 1);
+
+        }
+        /* A prescaler of 0 is not written to the register. */
+        if (conf->prescaler)
+        {
+            udma_timestamp_reg_clk_cfg_prescaler_set(ts->base, conf->prescaler);
+        }
+        /* Keep a copy of the configuration in the driver state and flag the counter as enabled. */
+        ts->device_id = conf->itf;
+        ts->cnt_trig_gpio = conf->cnt_trig_gpio;
+        ts->cnt_trig_type = conf->cnt_trig_type;
+        ts->cnt_src = conf->cnt_src;
+        ts->cnt_src_id = conf->cnt_src_id;
+        ts->prescaler = conf->prescaler;
+        ts->cnt_en = 1;
+
+        /* Set all the event to 0xFF */
+        __pi_udma_evt_cfg_init();
+    }
+    pi_irq_restore(irq);
+}
+
+void pi_udma_timestamp_close(struct pi_device *device)
+{
+    uint32_t irq = pi_irq_disable();
+    struct pi_udma_timestamp_cnt_t *ts = (struct pi_udma_timestamp_cnt_t *) device->data;
+    /* Close the counter that open stored in device->data; runs with IRQs disabled. */
+    __pi_udma_timestamp_cnt_close(ts);
+
+    /* Set all the event to 0xFF */
+    __pi_udma_evt_cfg_init();
+
+    /* Enable UDMA CG and reset periph. */
+    uint32_t periph_id = UDMA_TIMESTAMP_ID(ts->device_id);
+    udma_ctrl_cfg_rstn_clr_set(ARCHI_UDMA_ADDR, (1 << periph_id));
+    udma_ctrl_cfg_cg_clr_set(ARCHI_UDMA_ADDR, (1 << periph_id));
+
+    /* Free all the udma timestamp allocated. */
+    //TODO: clean all the event and input
+    /*
+    for (uint32_t tid = 0; tid < (uint32_t) UDMA_NB_TIMESTAMP_EVT; tid++)
+    {
+        if(ts_evt[tid].soc_evt)
+            soc_eu_prEventMask_setEvent(ts_evt[tid].soc_evt);
+    }
+    */
+    /* Mark every event and input slot as free again; ts_evt[]/ts_input[] contents are not cleared. */
+    evt_mask = 0xF;
+    input_mask = 0xFF;
+    pi_irq_restore(irq);
+}
+
+
+/*
+ * Execute command cmd on the timestamp counter opened on device.
+ * arg points to a pi_timestamp_event_t (EVT_ALLOC, SET_EVT) or a
+ * pi_timestamp_input_t (SET_INPUT); it is ignored by the other commands.
+ * Returns 0 on success, -1 when the helper fails or cmd is unknown.
+ */
+int32_t pi_udma_timestamp_ioctl(struct pi_device *device, uint32_t cmd, void *arg)
+{
+    uint32_t irq = pi_irq_disable();
+    struct pi_udma_timestamp_cnt_t *ts = (struct pi_udma_timestamp_cnt_t *) device->data;
+    int32_t status = 0;
+
+    switch (cmd)
+    {
+    case PI_UDMA_TIMESTAMP_IOCTL_CLR :
+        __pi_udma_timestamp_cnt_clr(ts->base);
+        break;
+    case PI_UDMA_TIMESTAMP_IOCTL_STOP :
+        __pi_udma_timestamp_cnt_stop(ts->base);
+        break;
+    case PI_UDMA_TIMESTAMP_IOCTL_EVT_ALLOC:
+        /* The helper returns the event id (>= 0) on success; only failure is reported here. */
+        status = (__pi_udma_timestamp_evt_alloc(ts->base, arg) < 0) ? -1 : 0;
+        break;
+    case PI_UDMA_TIMESTAMP_IOCTL_SET_EVT :
+        status = __pi_udma_timestamp_evt_cfg(ts->base, arg);
+        break;
+    case PI_UDMA_TIMESTAMP_IOCTL_SET_INPUT :
+        status = __pi_udma_timestamp_input_set(ts->base, arg);
+        break;
+        // TODO: complete these cases (both are currently no-ops that return 0)
+    case PI_UDMA_TIMESTAMP_IOCTL_FREE_EVT :
+        break;
+    case PI_UDMA_TIMESTAMP_IOCTL_FREE_INPUT :
+        break;
+    default :
+        TIMESTAMP_TRACE_ERR("Unknown timestamp command, cmd=%ld\n", cmd);
+        status = -1;
+    }
+    pi_irq_restore(irq);
+    return status;
+}
+
+/** Not in PMSIS_API. */
+#if 0
+void pi_udma_timestamp_read_async(unsigned char src_type, unsigned char ts_id,
+                                  void *buffer, int32_t size, pi_task_t *task)
+{
+    pos_udma_channel_t *channel = src_type? &ts_evt[ts_id].channel : &ts_input[ts_id].channel;
+
+    uint32_t irq = pi_irq_disable();
+    pos_udma_enqueue(channel, task, (int)buffer, size);
+    pi_irq_restore(irq);
+}
+#endif  /* 0 */
diff --git a/rtos/pmsis/pmsis_implem/chips/gap9/drivers/udma/udma_timestamp.h b/rtos/pmsis/pmsis_implem/chips/gap9/drivers/udma/udma_timestamp.h
new file mode 100644
index 000000000..395a9def4
--- /dev/null
+++ b/rtos/pmsis/pmsis_implem/chips/gap9/drivers/udma/udma_timestamp.h
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2020, GreenWaves Technologies, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ *
+ * o Redistributions of source code must retain the above copyright notice, this list
+ *   of conditions and the following disclaimer.
+ *
+ * o Redistributions in binary form must reproduce the above copyright notice, this
+ *   list of conditions and the following disclaimer in the documentation and/or
+ *   other materials provided with the distribution.
+ *
+ * o Neither the name of GreenWaves Technologies, Inc. nor the names of its
+ *   contributors may be used to endorse or promote products derived from this
+ *   software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#pragma once
+
+
+/*******************************************************************************
+ * Definitions
+ ******************************************************************************/
+
+#define UDMA_NB_TIMESTAMP_EVT               (4)
+#define UDMA_NB_TIMESTAMP_INPUT             (8)
+
+struct pi_udma_timestamp_cnt_t  /* Driver state of one timestamp counter, filled in by pi_udma_timestamp_open(). */
+{
+    uint32_t base;          /*!< base addr of TS */
+    uint8_t  device_id;     /*!< device ID for timestamp */
+    uint8_t  cnt_trig_gpio; /*!< GPIO number used to trigger the timestamp counter (0xFF: none) */
+    uint8_t  cnt_trig_type; /*!< how the GPIO triggers the timestamp counter */
+    uint8_t  cnt_src;       /*!< timestamp counter source */
+    uint8_t  cnt_src_id;    /*!< GPIO/PWM number, depending on the counter source */
+    uint8_t  prescaler;     /*!< Prescaler for timestamp counter */
+    uint8_t  cnt_en;        /*!< Non-zero once the counter has been opened and configured */
+};
+
+struct pi_udma_ts_evt_t  /* Bookkeeping for one timestamp event slot. */
+{
+    uint8_t  dest_id;       /*!< FIFO ID the timestamp event is routed to */
+    uint8_t  soc_evt;       /*!< SoC event ID which should be propagated to the peripheral */
+    uint8_t  ts_evt_id;     /*!< The udma cfg evt number */
+};
+
+struct pi_udma_ts_input_t  /* Bookkeeping for one timestamp input channel. */
+{
+    uint8_t dest_id;        /*!< Destination ID programmed into the channel's dest_id field */
+    uint8_t ts_input_id;    /*!< Timestamp input ID, max 8 input. Reg0-7 */
+    uint8_t input_sel;      /*!< Timestamp input selection:
+                              if input_type=3, then 0-7 are SFU, 8-10 are SAI.
+                              Else input sel are GPIO 0-63 */
+    uint8_t input_type;     /*!< Timestamp input GPIO trigger or input from AUX */
+};
+
+/*******************************************************************************
+ * Driver data
+ ******************************************************************************/
+
+/*******************************************************************************
+ * Function declaration
+ ******************************************************************************/
diff --git a/rtos/pmsis/pmsis_implem/include/chips/gap9/drivers/udma/udma_core.h b/rtos/pmsis/pmsis_implem/include/chips/gap9/drivers/udma/udma_core.h
index f66c4076e..bb11d8454 100644
--- a/rtos/pmsis/pmsis_implem/include/chips/gap9/drivers/udma/udma_core.h
+++ b/rtos/pmsis/pmsis_implem/include/chips/gap9/drivers/udma/udma_core.h
@@ -36,9 +36,16 @@
  ******************************************************************************/
 
 #if !defined(__FREERTOS__)
+#define UDMA_NB_CHAN_LIN               ( ARCHI_UDMA_NB_LIN_ADDRGEN )
+#define UDMA_NB_CHAN_2D                ( ARCHI_UDMA_NB_2D_ADDRGEN )
+#define UDMA_NB_CHAN_FIFO              ( ARCHI_UDMA_NB_FIFO_ADDRGEN )
 #define UDMA_CHAN_LIN(id)              ( UDMA_LIN_ADDRGEN_ADDR((id)) )
-#define UDMA_CHAN_2D(id)
-#define UDMA_CHAN_FIFO(id)
+#define UDMA_CHAN_2D(id)               ( 0x1A103800 + 0x20 * (id) )  /* (id) parenthesised so expression arguments scale correctly */
+#define UDMA_CHAN_FIFO(id)             ( 0x1A103900 + 0x20 * (id) )  /* same 0x20 stride, FIFO channel base */
+#define UDMA_CHAN_LIN_ID(id)           ( (id) )
+#define UDMA_CHAN_2D_ID(id)            ( ARCHI_UDMA_NB_LIN_ADDRGEN + (id) )
+#define UDMA_CHAN_FIFO_ID(id)          ( ARCHI_UDMA_NB_LIN_ADDRGEN + ARCHI_UDMA_NB_2D_ADDRGEN + (id) )
+#define SOC_EVENT_UDMA_CHAN_LIN(id)    ( (id) )
 #endif  /* __FREERTOS__ */
 
 
@@ -87,11 +94,6 @@ static inline void pi_udma_core_channels_init(void)
 /**
  * UDMA_CHANNEL_LINEAR
  */
-static inline uint32_t pi_udma_core_lin_addr_get(int32_t chan_id)
-{
-    return UDMA_CHAN_LIN(chan_id);
-}
-
 static inline int32_t pi_udma_core_lin_alloc(void)
 {
     int32_t chan_id = -1;
@@ -119,22 +121,58 @@ static inline void pi_udma_core_lin_free(int32_t chan_id)
     }
 }
 
+static inline uint32_t pi_udma_core_lin_addr_get(int32_t chan_id)
+{
+    return UDMA_CHAN_LIN(chan_id);  /* expands to UDMA_LIN_ADDRGEN_ADDR(chan_id) when __FREERTOS__ is not defined */
+}
+
+/* Program buffer address and size of a linear channel, then write ctrl with the EN bit forced on. */
+static inline void pi_udma_core_lin_enqueue(uint32_t udma_core_base,
+                                            uint32_t buf,
+                                            uint32_t size, uint32_t config)
+{
+    udma_core_lin_addrgen_cfg_sa_buf0_set(udma_core_base, buf);
+    udma_core_lin_addrgen_cfg_size_set(udma_core_base, size);
+    udma_core_lin_addrgen_cfg_ctrl_set(udma_core_base, config | UDMA_CORE_LIN_ADDRGEN_CFG_CTRL_EN(1));
+}
 
-/**
- * UDMA_CHANNEL_2D
- */
-static inline uint32_t pi_udma_core_2d_addr_get(int32_t chan_id)
+static inline void pi_udma_core_lin_stop(uint32_t udma_core_base)
 {
-    return UDMA_CHAN_2D(chan_id);
+    uint32_t config = UDMA_CORE_LIN_ADDRGEN_CFG_CTRL_STOP(1);
+    udma_core_lin_addrgen_cfg_ctrl_set(udma_core_base, config);
+}
+
+/* Write only the STOP command to the channel ctrl register; buffer and size registers are left as is. */
+static inline void pi_udma_core_lin_reset(uint32_t udma_core_base)
+{
+    /* udma_core_lin_addrgen_cfg_sa_buf0_set(udma_core_base, 0); */
+    /* udma_core_lin_addrgen_cfg_sa_buf1_set(udma_core_base, 0); */
+    /* udma_core_lin_addrgen_cfg_size_set(udma_core_base, 0); */
+    udma_core_lin_addrgen_cfg_ctrl_set(udma_core_base, UDMA_CORE_LIN_ADDRGEN_CFG_CTRL_STOP(1));
+}
+
+static inline uint32_t pi_udma_core_lin_curr_addr_get(uint32_t udma_core_base)
+{
+    return udma_core_lin_addrgen_cfg_curr_addr_get(udma_core_base);  /* forwards the curr_addr accessor of the linear address generator at udma_core_base */
 }
 
+static inline uint32_t pi_udma_core_lin_bytes_left_get(uint32_t udma_core_base)
+{
+    return udma_core_lin_addrgen_cfg_bytes_left_get(udma_core_base);  /* forwards the bytes_left accessor of the linear address generator at udma_core_base */
+}
+
+
+
+/**
+ * UDMA_CHANNEL_2D
+ */
 static inline int32_t pi_udma_core_2d_alloc(void)
 {
     int32_t chan_id = -1;
-    uint32_t reg_status = __pi_udma_chan_2d;
-    if (0x0 != reg_status)
+    if (0x0 != __pi_udma_chan_2d)
     {
-        chan_id = __FF1(reg_status);
+        chan_id = __FF1(__pi_udma_chan_2d);
+        __pi_udma_chan_2d = __BITCLR_R(__pi_udma_chan_2d, 1, chan_id);
         return (chan_id + UDMA_CHAN_2D_ID(0));
     }
     return chan_id;
@@ -148,22 +186,62 @@ static inline void pi_udma_core_2d_free(int32_t chan_id)
     }
 }
 
+static inline uint32_t pi_udma_core_2d_addr_get(int32_t chan_id)
+{
+    return UDMA_CHAN_2D(chan_id);  /* NOTE(review): the macro scales chan_id from the 2D base, while pi_udma_core_2d_alloc() returns an id offset by UDMA_CHAN_2D_ID(0) - confirm which one callers pass */
+}
 
-/**
- * UDMA_CHANNEL_FIFO
- */
-static inline uint32_t pi_udma_core_fifo_addr_get(int32_t chan_id)
+static inline void pi_udma_core_2d_enqueue(uint32_t udma_core_base,
+                                            uint32_t buf_0, uint32_t buf_1,
+                                            uint32_t size, uint32_t stride,
+                                            uint32_t length, uint32_t config)
 {
-    return UDMA_CHAN_FIFO(chan_id);
+    config |= UDMA_CORE_2D_ADDRGEN_CFG_CTRL_EN(1);
+    udma_core_2d_addrgen_cfg_sa_buf0_set(udma_core_base, buf_0);
+    udma_core_2d_addrgen_cfg_sa_buf1_set(udma_core_base, buf_1);
+    udma_core_2d_addrgen_cfg_size_set(udma_core_base, size);
+    udma_core_2d_addrgen_cfg_stride_set(udma_core_base, stride);
+    udma_core_2d_addrgen_cfg_row_len_set(udma_core_base, length);
+    udma_core_2d_addrgen_cfg_ctrl_set(udma_core_base, config);
+}
+
+/* Write the STOP command to the ctrl register of the 2D address generator at udma_core_base. */
+static inline void pi_udma_core_2d_stop(uint32_t udma_core_base)
+{
+    udma_core_2d_addrgen_cfg_ctrl_set(udma_core_base, UDMA_CORE_2D_ADDRGEN_CFG_CTRL_STOP(1));
+}
+
+/* Write only the STOP command to the 2D channel ctrl register; the other registers are left as is. */
+static inline void pi_udma_core_2d_reset(uint32_t udma_core_base)
+{
+    /* udma_core_2d_addrgen_cfg_sa_buf0_set(udma_core_base, 0); */
+    /* udma_core_2d_addrgen_cfg_sa_buf1_set(udma_core_base, 0); */
+    /* udma_core_2d_addrgen_cfg_size_set(udma_core_base, 0); */
+    /* udma_core_2d_addrgen_cfg_stride_set(udma_core_base, 0); */
+    /* udma_core_2d_addrgen_cfg_row_len_set(udma_core_base, 0); */
+    udma_core_2d_addrgen_cfg_ctrl_set(udma_core_base, UDMA_CORE_2D_ADDRGEN_CFG_CTRL_STOP(1));
 }
 
+static inline uint32_t pi_udma_core_2d_curr_addr_get(uint32_t udma_core_base)
+{
+    return udma_core_2d_addrgen_cfg_curr_addr_get(udma_core_base);  /* forwards the curr_addr accessor of the 2D address generator at udma_core_base */
+}
+
+static inline uint32_t pi_udma_core_2d_bytes_left_get(uint32_t udma_core_base)
+{
+    return udma_core_2d_addrgen_cfg_bytes_left_get(udma_core_base);  /* forwards the bytes_left accessor of the 2D address generator at udma_core_base */
+}
+
+/**
+ * UDMA_CHANNEL_FIFO
+ */
 static inline int32_t pi_udma_core_fifo_alloc(void)
 {
     int32_t chan_id = -1;
-    uint32_t reg_status = __pi_udma_chan_fifo;
-    if (0x0 != reg_status)
+    if (0x0 != __pi_udma_chan_fifo)
     {
-        chan_id = __FF1(reg_status);
+        chan_id = __FF1(__pi_udma_chan_fifo);
+        __pi_udma_chan_fifo = __BITCLR_R(__pi_udma_chan_fifo, 1, chan_id);
         return (chan_id + UDMA_CHAN_FIFO_ID(0));
     }
     return chan_id;
@@ -176,3 +254,37 @@ static inline void pi_udma_core_fifo_free(int32_t chan_id)
         __pi_udma_chan_fifo = __BITSET_R(__pi_udma_chan_fifo, 1, chan_id - UDMA_CHAN_FIFO_ID(0));
     }
 }
+
+static inline uint32_t pi_udma_core_fifo_addr_get(int32_t chan_id)
+{
+    return UDMA_CHAN_FIFO(chan_id);  /* NOTE(review): the macro scales chan_id from the FIFO base, while pi_udma_core_fifo_alloc() returns an id offset by UDMA_CHAN_FIFO_ID(0) - confirm which one callers pass */
+}
+
+/* Program buffer address and size of a FIFO channel, then write ctrl with the EN bit forced on. */
+static inline void pi_udma_core_fifo_enqueue(uint32_t udma_core_base,
+                                              uint32_t buf,
+                                              uint32_t size,
+                                              uint32_t config)
+{
+    udma_core_fifo_cfg_sa_buffer_set(udma_core_base, buf);
+    udma_core_fifo_cfg_size_set(udma_core_base, size);
+    udma_core_fifo_cfg_ctrl_set(udma_core_base, config | UDMA_CORE_FIFO_CFG_CTRL_EN(1));
+}
+
+/* Write the STOP command to the ctrl register of the FIFO channel at udma_core_base. */
+static inline void pi_udma_core_fifo_stop(uint32_t udma_core_base)
+{
+    udma_core_fifo_cfg_ctrl_set(udma_core_base, UDMA_CORE_FIFO_CFG_CTRL_STOP(1));
+}
+
+static inline void pi_udma_core_fifo_event_enable(uint32_t udma_core_base,
+                                                  uint8_t enable)
+{
+    udma_core_fifo_cfg_evt_en_set(udma_core_base, enable);  /* enable is written as-is to the evt_en field; presumably non-zero enables the FIFO event - confirm */
+}
+
+static inline void pi_udma_core_fifo_event_set_threshold(uint32_t udma_core_base,
+                                                         uint32_t threshold)
+{
+    udma_core_fifo_cfg_evt_num_bytes_set(udma_core_base, threshold);  /* threshold goes to the evt_num_bytes field, so it is presumably a byte count - confirm */
+}
diff --git a/rtos/pmsis/pmsis_implem/include/chips/gap9/drivers/udma/udma_timeout.h b/rtos/pmsis/pmsis_implem/include/chips/gap9/drivers/udma/udma_timeout.h
new file mode 100644
index 000000000..8085819c1
--- /dev/null
+++ b/rtos/pmsis/pmsis_implem/include/chips/gap9/drivers/udma/udma_timeout.h
@@ -0,0 +1,57 @@
+/*
+ * Copyright (c) 2020, GreenWaves Technologies, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ *
+ * o Redistributions of source code must retain the above copyright notice, this list
+ *   of conditions and the following disclaimer.
+ *
+ * o Redistributions in binary form must reproduce the above copyright notice, this
+ *   list of conditions and the following disclaimer in the documentation and/or
+ *   other materials provided with the distribution.
+ *
+ * o Neither the name of GreenWaves Technologies, Inc. nor the names of its
+ *   contributors may be used to endorse or promote products derived from this
+ *   software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#pragma once
+
+
+/*******************************************************************************
+ * Definitions
+ ******************************************************************************/
+
+struct pi_udma_timeout_s  /* Per-timeout driver state; allocated in FC L1 and released by pi_udma_timeout_free(). */
+{
+    struct pi_task *fifo_head;  /* presumably the head of the pending task list (per the name); set to NULL just before the struct is freed */
+    struct pi_task *fifo_tail;  /* presumably the tail of the same list - confirm against the enqueue code */
+    //uint8_t tid;
+    pi_udma_timeout_mode_e mode;  /* timeout mode recorded when the timeout is allocated */
+};
+
+/*******************************************************************************
+ * Driver data
+ ******************************************************************************/
+
+/*******************************************************************************
+ * Function declaration
+ ******************************************************************************/
+
+int32_t pi_udma_timeout_config_set(pi_task_t *task, uint8_t timeout_id,
+                                   uint8_t udma_chan_id, uint32_t timeout_us);
+
+pi_task_t *__pi_udma_timeout_task_remove(uint8_t timeout_id);
diff --git a/rtos/pulp/gap_archi/doc/ips/sfu.rst b/rtos/pulp/gap_archi/doc/ips/sfu.rst
new file mode 100644
index 000000000..f83121cb4
--- /dev/null
+++ b/rtos/pulp/gap_archi/doc/ips/sfu.rst
@@ -0,0 +1,609 @@
+.. 
+   Input file: fe/ips/udma/udma_anc/README.md
+
+Register map
+^^^^^^^^^^^^
+
+
+Overview
+""""""""
+
+
+Refer to :ref:`GAP9 address map<REF_MEMORY_MAP_DETAIL>` for the base address to be used.
+
+.. table:: 
+    :align: center
+    :widths: 40 12 12 90
+
+    +--------------------------------------------+------+-----+-----------------------------------+
+    |                    Name                    |Offset|Width|            Description            |
+    +============================================+======+=====+===================================+
+    |:ref:`GRAPH_PTR<sfu__GRAPH_PTR>`            |     0|   32|Pointer to graph configuration     |
+    +--------------------------------------------+------+-----+-----------------------------------+
+    |:ref:`GRAPH_CMD<sfu__GRAPH_CMD>`            |     4|   32|Graph command register             |
+    +--------------------------------------------+------+-----+-----------------------------------+
+    |:ref:`CLOCK_PTR<sfu__CLOCK_PTR>`            |     8|   32|Pointer to clock configuration     |
+    +--------------------------------------------+------+-----+-----------------------------------+
+    |:ref:`CLOCK_CMD<sfu__CLOCK_CMD>`            |    12|   32|Clock command register             |
+    +--------------------------------------------+------+-----+-----------------------------------+
+    |:ref:`SFU_STATUS<sfu__SFU_STATUS>`          |    16|   32|Status of graph and clock commands |
+    +--------------------------------------------+------+-----+-----------------------------------+
+    |:ref:`MEM_IN_STATUS<sfu__MEM_IN_STATUS>`    |    20|   32|Status of memory IN interfaces     |
+    +--------------------------------------------+------+-----+-----------------------------------+
+    |:ref:`LIMITER_MUTE<sfu__LIMITER_MUTE>`      |    24|   32|Limiter mute/unmute control        |
+    +--------------------------------------------+------+-----+-----------------------------------+
+    |:ref:`VOLUME_INDEX<sfu__VOLUME_INDEX>`      |    28|   32|Index of accessed volume setting   |
+    +--------------------------------------------+------+-----+-----------------------------------+
+    |:ref:`VOLUME_VALUE<sfu__VOLUME_VALUE>`      |    32|   32|Control of linear volume           |
+    +--------------------------------------------+------+-----+-----------------------------------+
+    |:ref:`CLK_MONITOR_0<sfu__CLK_MONITOR_0>`    |    36|   32|Control of clock monitors 0 to 3   |
+    +--------------------------------------------+------+-----+-----------------------------------+
+    |:ref:`CLK_MONITOR_1<sfu__CLK_MONITOR_1>`    |    40|   32|Control of clock monitors 4 to 7   |
+    +--------------------------------------------+------+-----+-----------------------------------+
+    |:ref:`OUT_MUTE<sfu__OUT_MUTE>`              |    44|   32|Control of output channel mute     |
+    +--------------------------------------------+------+-----+-----------------------------------+
+    |:ref:`AUDIO_CLK_CFG_0<sfu__AUDIO_CLK_CFG_0>`|    48|   32|Control audio clock generator 0    |
+    +--------------------------------------------+------+-----+-----------------------------------+
+    |:ref:`AUDIO_CLK_CFG_1<sfu__AUDIO_CLK_CFG_1>`|    52|   32|Control audio clock generator 1    |
+    +--------------------------------------------+------+-----+-----------------------------------+
+    |:ref:`AUDIO_CLK_CFG_2<sfu__AUDIO_CLK_CFG_2>`|    56|   32|Control audio clock generator 2    |
+    +--------------------------------------------+------+-----+-----------------------------------+
+    |:ref:`AUDIO_CLK_CFG_3<sfu__AUDIO_CLK_CFG_3>`|    60|   32|Control audio clock generator 3    |
+    +--------------------------------------------+------+-----+-----------------------------------+
+    |:ref:`ASRC_RATIO_0<sfu__ASRC_RATIO_0>`      |    64|   32|ASRC0 conversion ratio             |
+    +--------------------------------------------+------+-----+-----------------------------------+
+    |:ref:`ASRC_RATIO_1<sfu__ASRC_RATIO_1>`      |    68|   32|ASRC1 conversion ratio             |
+    +--------------------------------------------+------+-----+-----------------------------------+
+    |:ref:`ASRC_RATIO_2<sfu__ASRC_RATIO_2>`      |    72|   32|ASRC2 conversion ratio             |
+    +--------------------------------------------+------+-----+-----------------------------------+
+    |:ref:`MEM_IN_0_CNT<sfu__MEM_IN_0_CNT>`      |    88|   32|Memory input counter 0             |
+    +--------------------------------------------+------+-----+-----------------------------------+
+    |:ref:`MEM_IN_1_CNT<sfu__MEM_IN_1_CNT>`      |    92|   32|Memory input counter 1             |
+    +--------------------------------------------+------+-----+-----------------------------------+
+    |:ref:`MEM_IN_2_CNT<sfu__MEM_IN_2_CNT>`      |    96|   32|Memory input counter 2             |
+    +--------------------------------------------+------+-----+-----------------------------------+
+    |:ref:`MEM_IN_3_CNT<sfu__MEM_IN_3_CNT>`      |   100|   32|Memory input counter 3             |
+    +--------------------------------------------+------+-----+-----------------------------------+
+    |:ref:`MEM_IN_4_CNT<sfu__MEM_IN_4_CNT>`      |   104|   32|Memory input counter 4             |
+    +--------------------------------------------+------+-----+-----------------------------------+
+    |:ref:`MEM_IN_5_CNT<sfu__MEM_IN_5_CNT>`      |   108|   32|Memory input counter 5             |
+    +--------------------------------------------+------+-----+-----------------------------------+
+    |:ref:`MEM_IN_6_CNT<sfu__MEM_IN_6_CNT>`      |   112|   32|Memory input counter 6             |
+    +--------------------------------------------+------+-----+-----------------------------------+
+    |:ref:`MEM_IN_7_CNT<sfu__MEM_IN_7_CNT>`      |   116|   32|Memory input counter 7             |
+    +--------------------------------------------+------+-----+-----------------------------------+
+
+.. _sfu__GRAPH_PTR:
+
+GRAPH_PTR
+"""""""""
+
+Pointer to graph configuration
+
+.. table:: 
+    :align: center
+    :widths: 13 12 45 24 85
+
+    +-----+---+-------+-----+----------------------------------------+
+    |Bit #|R/W| Name  |Reset|              Description               |
+    +=====+===+=======+=====+========================================+
+    |31:0 |R/W|ADDRESS|0x0  |Address of graph configuration in memory|
+    +-----+---+-------+-----+----------------------------------------+
+
+.. _sfu__GRAPH_CMD:
+
+GRAPH_CMD
+"""""""""
+
+Graph command register
+
+.. table:: 
+    :align: center
+    :widths: 13 12 45 24 85
+
+    +-----+---+-----------+-----+---------------------------------------------+
+    |Bit #|R/W|   Name    |Reset|                 Description                 |
+    +=====+===+===========+=====+=============================================+
+    |    0|W  |LOAD       |0x0  |Write b1 to start graph load                 |
+    +-----+---+-----------+-----+---------------------------------------------+
+    |    1|W  |RECONF     |0x0  |Write b1 to start graph reconfiguration      |
+    +-----+---+-----------+-----+---------------------------------------------+
+    |    2|W  |UNLOAD     |0x0  |Write b1 to start graph unload               |
+    +-----+---+-----------+-----+---------------------------------------------+
+    |    3|W  |SAVE       |0x0  |Write b1 to start graph save                 |
+    +-----+---+-----------+-----+---------------------------------------------+
+    |    4|W  |SET_CURRENT|0x0  |Sets the current graph (used for status read)|
+    +-----+---+-----------+-----+---------------------------------------------+
+
+.. _sfu__CLOCK_PTR:
+
+CLOCK_PTR
+"""""""""
+
+Pointer to clock configuration
+
+.. table:: 
+    :align: center
+    :widths: 13 12 45 24 85
+
+    +-----+---+-------+-----+----------------------------------------+
+    |Bit #|R/W| Name  |Reset|              Description               |
+    +=====+===+=======+=====+========================================+
+    |31:0 |R/W|ADDRESS|0x0  |Address of clock configuration in memory|
+    +-----+---+-------+-----+----------------------------------------+
+
+.. _sfu__CLOCK_CMD:
+
+CLOCK_CMD
+"""""""""
+
+Clock command register
+
+.. table:: 
+    :align: center
+    :widths: 13 12 45 24 85
+
+    +-----+---+------+-----+------------------------------+
+    |Bit #|R/W| Name |Reset|         Description          |
+    +=====+===+======+=====+==============================+
+    |    0|W  |LOAD  |0x0  |Write b1 to start clock load  |
+    +-----+---+------+-----+------------------------------+
+    |    1|W  |UNLOAD|0x0  |Write b1 to start clock unload|
+    +-----+---+------+-----+------------------------------+
+
+.. _sfu__SFU_STATUS:
+
+SFU_STATUS
+""""""""""
+
+Status of graph and clock commands
+
+.. table:: 
+    :align: center
+    :widths: 13 12 45 24 85
+
+    +-----+---+-------------+-----+----------------------------------------------------------------------------------------------+
+    |Bit #|R/W|    Name     |Reset|                                         Description                                          |
+    +=====+===+=============+=====+==============================================================================================+
+    |    0|R  |CLOCK_LOAD   |0x0  |Bit is set to 1 when clock load is ongoing                                                    |
+    +-----+---+-------------+-----+----------------------------------------------------------------------------------------------+
+    |    1|R  |GRAPH_LOAD   |0x0  |Bit is set to 1 when graph load is ongoing                                                    |
+    +-----+---+-------------+-----+----------------------------------------------------------------------------------------------+
+    |    2|R  |GRAPH_UNLOAD |0x0  |Bit is set to 1 when graph unload is ongoing                                                  |
+    +-----+---+-------------+-----+----------------------------------------------------------------------------------------------+
+    |    3|R  |GRAPH_RECONF |0x0  |Bit is set to 1 when graph reconfiguration is ongoing                                         |
+    +-----+---+-------------+-----+----------------------------------------------------------------------------------------------+
+    |    4|R  |GRAPH_SAVE   |0x0  |Bit is set to 1 when graph save is ongoing                                                    |
+    +-----+---+-------------+-----+----------------------------------------------------------------------------------------------+
+    |    5|R  |GRAPH_SET_CUR|0x0  |Bit is set to 1 when current graph is being set                                               |
+    +-----+---+-------------+-----+----------------------------------------------------------------------------------------------+
+    |8:6  |R  |ASRC_LOCK    |0x0  |Lock status of the 3 ASRCs: bit *i* is set to 1 when frequency tracking of ASRC\ *i* is locked|
+    +-----+---+-------------+-----+----------------------------------------------------------------------------------------------+
+    |    9|R  |GRAPH_BUSY   |0x0  |Bit is set to 1 when current graph is busy                                                    |
+    +-----+---+-------------+-----+----------------------------------------------------------------------------------------------+
+
+.. _sfu__MEM_IN_STATUS:
+
+MEM_IN_STATUS
+"""""""""""""
+
+Status of memory IN interfaces
+
+.. table:: 
+    :align: center
+    :widths: 13 12 45 24 85
+
+    +-----+---+------+-----+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
+    |Bit #|R/W| Name |Reset|                                                                                      Description                                                                                       |
+    +=====+===+======+=====+========================================================================================================================================================================================+
+    |7:0  |R/W|STATUS|0x0  |When reading, bit *i* gives the status of MemIn interface *i*: b0: interface OK; b1: buffer has ended. Writing b1 to bit *i* restarts MemIn interface *i* (e.g. after buffer restart)   |
+    +-----+---+------+-----+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
+
+.. _sfu__LIMITER_MUTE:
+
+LIMITER_MUTE
+""""""""""""
+
+Limiter mute/unmute control
+
+.. table:: 
+    :align: center
+    :widths: 13 12 45 24 85
+
+    +-----+---+----------+-----+--------------------------------------------------+
+    |Bit #|R/W|   Name   |Reset|                   Description                    |
+    +=====+===+==========+=====+==================================================+
+    |    0|R/W|MUTE_LIM_0|0x0  |Enable mute 0: b0: mute disabled; b1: mute enabled|
+    +-----+---+----------+-----+--------------------------------------------------+
+    |    1|R/W|MUTE_LIM_1|0x0  |Enable mute 1: b0: mute disabled; b1: mute enabled|
+    +-----+---+----------+-----+--------------------------------------------------+
+    |    2|R/W|MUTE_LIM_2|0x0  |Enable mute 2: b0: mute disabled; b1: mute enabled|
+    +-----+---+----------+-----+--------------------------------------------------+
+    |    3|R/W|MUTE_LIM_3|0x0  |Enable mute 3: b0: mute disabled; b1: mute enabled|
+    +-----+---+----------+-----+--------------------------------------------------+
+    |    4|R/W|MUTE_LIM_4|0x0  |Enable mute 4: b0: mute disabled; b1: mute enabled|
+    +-----+---+----------+-----+--------------------------------------------------+
+    |    5|R/W|MUTE_LIM_5|0x0  |Enable mute 5: b0: mute disabled; b1: mute enabled|
+    +-----+---+----------+-----+--------------------------------------------------+
+
+.. _sfu__VOLUME_INDEX:
+
+VOLUME_INDEX
+""""""""""""
+
+Index of the volume setting accessed through VOLUME_VALUE
+
+.. table:: 
+    :align: center
+    :widths: 13 12 45 24 85
+
+    +-----+---+-----+-----+---------------------------------------------------------+
+    |Bit #|R/W|Name |Reset|                       Description                       |
+    +=====+===+=====+=====+=========================================================+
+    |4:0  |R/W|INDEX|0x0  |Index of volume setting accessed by VOLUME_VALUE register|
+    +-----+---+-----+-----+---------------------------------------------------------+
+
+.. _sfu__VOLUME_VALUE:
+
+VOLUME_VALUE
+""""""""""""
+
+Control of linear volume
+
+.. table:: 
+    :align: center
+    :widths: 13 12 45 24 85
+
+    +-----+---+---------+-----+------------------------------------------------------+
+    |Bit #|R/W|  Name   |Reset|                     Description                      |
+    +=====+===+=========+=====+======================================================+
+    |25:0 |R/W|VOLUME   |0x0  |Value of volume (linear)                              |
+    +-----+---+---------+-----+------------------------------------------------------+
+    |31:26|R/W|SCALING_V|0x0  |Value in bits for the scaling (bit 5 is the direction)|
+    +-----+---+---------+-----+------------------------------------------------------+
+
+.. _sfu__CLK_MONITOR_0:
+
+CLK_MONITOR_0
+"""""""""""""
+
+Control of clock monitors 0 to 3
+
+.. table:: 
+    :align: center
+    :widths: 13 12 45 24 85
+
+    +-----+---+----+-----+-------------------------------------------------------+
+    |Bit #|R/W|Name|Reset|                      Description                      |
+    +=====+===+====+=====+=======================================================+
+    |4:0  |R/W|SEL0|0x0  |Monitored clock selector (see Clock select table below)|
+    +-----+---+----+-----+-------------------------------------------------------+
+    |7    |R/W|EN0 |0x0  |Set to b1 to enable monitoring                         |
+    +-----+---+----+-----+-------------------------------------------------------+
+    |12:8 |R/W|SEL1|0x0  |Monitored clock selector (see Clock select table below)|
+    +-----+---+----+-----+-------------------------------------------------------+
+    |15   |R/W|EN1 |0x0  |Set to b1 to enable monitoring                         |
+    +-----+---+----+-----+-------------------------------------------------------+
+    |20:16|R/W|SEL2|0x0  |Monitored clock selector (see Clock select table below)|
+    +-----+---+----+-----+-------------------------------------------------------+
+    |23   |R/W|EN2 |0x0  |Set to b1 to enable monitoring                         |
+    +-----+---+----+-----+-------------------------------------------------------+
+    |28:24|R/W|SEL3|0x0  |Monitored clock selector (see Clock select table below)|
+    +-----+---+----+-----+-------------------------------------------------------+
+    |31   |R/W|EN3 |0x0  |Set to b1 to enable monitoring                         |
+    +-----+---+----+-----+-------------------------------------------------------+
+
+.. _sfu__CLK_MONITOR_1:
+
+CLK_MONITOR_1
+"""""""""""""
+
+Control of clock monitors 4 to 7
+
+.. table:: 
+    :align: center
+    :widths: 13 12 45 24 85
+
+    +-----+---+----+-----+-------------------------------------------------------+
+    |Bit #|R/W|Name|Reset|                      Description                      |
+    +=====+===+====+=====+=======================================================+
+    |4:0  |R/W|SEL0|0x0  |Monitored clock selector (see Clock select table below)|
+    +-----+---+----+-----+-------------------------------------------------------+
+    |7    |R/W|EN0 |0x0  |Set to b1 to enable monitoring                         |
+    +-----+---+----+-----+-------------------------------------------------------+
+    |12:8 |R/W|SEL1|0x0  |Monitored clock selector (see Clock select table below)|
+    +-----+---+----+-----+-------------------------------------------------------+
+    |15   |R/W|EN1 |0x0  |Set to b1 to enable monitoring                         |
+    +-----+---+----+-----+-------------------------------------------------------+
+    |20:16|R/W|SEL2|0x0  |Monitored clock selector (see Clock select table below)|
+    +-----+---+----+-----+-------------------------------------------------------+
+    |23   |R/W|EN2 |0x0  |Set to b1 to enable monitoring                         |
+    +-----+---+----+-----+-------------------------------------------------------+
+    |28:24|R/W|SEL3|0x0  |Monitored clock selector (see Clock select table below)|
+    +-----+---+----+-----+-------------------------------------------------------+
+    |31   |R/W|EN3 |0x0  |Set to b1 to enable monitoring                         |
+    +-----+---+----+-----+-------------------------------------------------------+
+
+.. _sfu__OUT_MUTE:
+
+OUT_MUTE
+""""""""
+
+Control of output channel mute
+
+.. table:: 
+    :align: center
+    :widths: 13 12 45 24 85
+
+    +-----+---+----------+-----+-------------------------------------+
+    |Bit #|R/W|   Name   |Reset|             Description             |
+    +=====+===+==========+=====+=====================================+
+    |7:0  |R/W|MEM_OUT   |0x0  |Mutes corresponding MemOut channel   |
+    +-----+---+----------+-----+-------------------------------------+
+    |15:8 |R/W|STREAM_OUT|0x0  |Mutes corresponding StreamOut channel|
+    +-----+---+----------+-----+-------------------------------------+
+    |18:16|R/W|PDM_OUT   |0x0  |Mutes corresponding PDMOut channel   |
+    +-----+---+----------+-----+-------------------------------------+
+
+.. _sfu__AUDIO_CLK_CFG_0:
+
+AUDIO_CLK_CFG_0
+"""""""""""""""
+
+Control audio clock generator 0
+
+.. table:: 
+    :align: center
+    :widths: 13 12 45 24 85
+
+    +-----+---+----+-----+---------------------------------------------------------+
+    |Bit #|R/W|Name|Reset|                       Description                       |
+    +=====+===+====+=====+=========================================================+
+    |15:0 |R/W|DIV |0x0  |Division factor for audio clock                          |
+    +-----+---+----+-----+---------------------------------------------------------+
+    |16   |R/W|EN  |0x0  |Enable: b0: audio clock disabled; b1: audio clock enabled|
+    +-----+---+----+-----+---------------------------------------------------------+
+
+.. _sfu__AUDIO_CLK_CFG_1:
+
+AUDIO_CLK_CFG_1
+"""""""""""""""
+
+Control audio clock generator 1
+
+.. table:: 
+    :align: center
+    :widths: 13 12 45 24 85
+
+    +-----+---+----+-----+---------------------------------------------------------+
+    |Bit #|R/W|Name|Reset|                       Description                       |
+    +=====+===+====+=====+=========================================================+
+    |15:0 |R/W|DIV |0x0  |Division factor for audio clock                          |
+    +-----+---+----+-----+---------------------------------------------------------+
+    |16   |R/W|EN  |0x0  |Enable: b0: audio clock disabled; b1: audio clock enabled|
+    +-----+---+----+-----+---------------------------------------------------------+
+
+.. _sfu__AUDIO_CLK_CFG_2:
+
+AUDIO_CLK_CFG_2
+"""""""""""""""
+
+Control audio clock generator 2
+
+.. table:: 
+    :align: center
+    :widths: 13 12 45 24 85
+
+    +-----+---+----+-----+---------------------------------------------------------+
+    |Bit #|R/W|Name|Reset|                       Description                       |
+    +=====+===+====+=====+=========================================================+
+    |15:0 |R/W|DIV |0x0  |Division factor for audio clock                          |
+    +-----+---+----+-----+---------------------------------------------------------+
+    |16   |R/W|EN  |0x0  |Enable: b0: audio clock disabled; b1: audio clock enabled|
+    +-----+---+----+-----+---------------------------------------------------------+
+
+.. _sfu__AUDIO_CLK_CFG_3:
+
+AUDIO_CLK_CFG_3
+"""""""""""""""
+
+Control audio clock generator 3
+
+.. table:: 
+    :align: center
+    :widths: 13 12 45 24 85
+
+    +-----+---+----+-----+---------------------------------------------------------+
+    |Bit #|R/W|Name|Reset|                       Description                       |
+    +=====+===+====+=====+=========================================================+
+    |15:0 |R/W|DIV |0x0  |Division factor for audio clock                          |
+    +-----+---+----+-----+---------------------------------------------------------+
+    |16   |R/W|EN  |0x0  |Enable: b0: audio clock disabled; b1: audio clock enabled|
+    +-----+---+----+-----+---------------------------------------------------------+
+
+.. _sfu__ASRC_RATIO_0:
+
+ASRC_RATIO_0
+""""""""""""
+
+ASRC0 conversion ratio
+
+.. table:: 
+    :align: center
+    :widths: 13 12 45 24 85
+
+    +-----+---+-----+-----+----------------+
+    |Bit #|R/W|Name |Reset|  Description   |
+    +=====+===+=====+=====+================+
+    |25:0 |R/W|RATIO|0x0  |Conversion ratio|
+    +-----+---+-----+-----+----------------+
+
+.. _sfu__ASRC_RATIO_1:
+
+ASRC_RATIO_1
+""""""""""""
+
+ASRC1 conversion ratio
+
+.. table:: 
+    :align: center
+    :widths: 13 12 45 24 85
+
+    +-----+---+-----+-----+----------------+
+    |Bit #|R/W|Name |Reset|  Description   |
+    +=====+===+=====+=====+================+
+    |25:0 |R/W|RATIO|0x0  |Conversion ratio|
+    +-----+---+-----+-----+----------------+
+
+.. _sfu__ASRC_RATIO_2:
+
+ASRC_RATIO_2
+""""""""""""
+
+ASRC2 conversion ratio
+
+.. table:: 
+    :align: center
+    :widths: 13 12 45 24 85
+
+    +-----+---+-----+-----+----------------+
+    |Bit #|R/W|Name |Reset|  Description   |
+    +=====+===+=====+=====+================+
+    |25:0 |R/W|RATIO|0x0  |Conversion ratio|
+    +-----+---+-----+-----+----------------+
+
+.. _sfu__MEM_IN_0_CNT:
+
+MEM_IN_0_CNT
+""""""""""""
+
+Memory input counter 0
+
+.. table:: 
+    :align: center
+    :widths: 13 12 45 24 85
+
+    +-----+---+----+-----+------------------------------------------------------------------------------+
+    |Bit #|R/W|Name|Reset|                                 Description                                  |
+    +=====+===+====+=====+==============================================================================+
+    |20:0 |R  |CNT |0x0  |Reports how many samples have been pushed to the SFU from this MemIn interface|
+    +-----+---+----+-----+------------------------------------------------------------------------------+
+
+.. _sfu__MEM_IN_1_CNT:
+
+MEM_IN_1_CNT
+""""""""""""
+
+Memory input counter 1
+
+.. table:: 
+    :align: center
+    :widths: 13 12 45 24 85
+
+    +-----+---+----+-----+------------------------------------------------------------------------------+
+    |Bit #|R/W|Name|Reset|                                 Description                                  |
+    +=====+===+====+=====+==============================================================================+
+    |20:0 |R  |CNT |0x0  |Reports how many samples have been pushed to the SFU from this MemIn interface|
+    +-----+---+----+-----+------------------------------------------------------------------------------+
+
+.. _sfu__MEM_IN_2_CNT:
+
+MEM_IN_2_CNT
+""""""""""""
+
+Memory input counter 2
+
+.. table:: 
+    :align: center
+    :widths: 13 12 45 24 85
+
+    +-----+---+----+-----+------------------------------------------------------------------------------+
+    |Bit #|R/W|Name|Reset|                                 Description                                  |
+    +=====+===+====+=====+==============================================================================+
+    |20:0 |R  |CNT |0x0  |Reports how many samples have been pushed to the SFU from this MemIn interface|
+    +-----+---+----+-----+------------------------------------------------------------------------------+
+
+.. _sfu__MEM_IN_3_CNT:
+
+MEM_IN_3_CNT
+""""""""""""
+
+Memory input counter 3
+
+.. table:: 
+    :align: center
+    :widths: 13 12 45 24 85
+
+    +-----+---+----+-----+------------------------------------------------------------------------------+
+    |Bit #|R/W|Name|Reset|                                 Description                                  |
+    +=====+===+====+=====+==============================================================================+
+    |20:0 |R  |CNT |0x0  |Reports how many samples have been pushed to the SFU from this MemIn interface|
+    +-----+---+----+-----+------------------------------------------------------------------------------+
+
+.. _sfu__MEM_IN_4_CNT:
+
+MEM_IN_4_CNT
+""""""""""""
+
+Memory input counter 4
+
+.. table:: 
+    :align: center
+    :widths: 13 12 45 24 85
+
+    +-----+---+----+-----+------------------------------------------------------------------------------+
+    |Bit #|R/W|Name|Reset|                                 Description                                  |
+    +=====+===+====+=====+==============================================================================+
+    |20:0 |R  |CNT |0x0  |Reports how many samples have been pushed to the SFU from this MemIn interface|
+    +-----+---+----+-----+------------------------------------------------------------------------------+
+
+.. _sfu__MEM_IN_5_CNT:
+
+MEM_IN_5_CNT
+""""""""""""
+
+Memory input counter 5
+
+.. table:: 
+    :align: center
+    :widths: 13 12 45 24 85
+
+    +-----+---+----+-----+------------------------------------------------------------------------------+
+    |Bit #|R/W|Name|Reset|                                 Description                                  |
+    +=====+===+====+=====+==============================================================================+
+    |20:0 |R  |CNT |0x0  |Reports how many samples have been pushed to the SFU from this MemIn interface|
+    +-----+---+----+-----+------------------------------------------------------------------------------+
+
+.. _sfu__MEM_IN_6_CNT:
+
+MEM_IN_6_CNT
+""""""""""""
+
+Memory input counter 6
+
+.. table:: 
+    :align: center
+    :widths: 13 12 45 24 85
+
+    +-----+---+----+-----+------------------------------------------------------------------------------+
+    |Bit #|R/W|Name|Reset|                                 Description                                  |
+    +=====+===+====+=====+==============================================================================+
+    |20:0 |R  |CNT |0x0  |Reports how many samples have been pushed to the SFU from this MemIn interface|
+    +-----+---+----+-----+------------------------------------------------------------------------------+
+
+.. _sfu__MEM_IN_7_CNT:
+
+MEM_IN_7_CNT
+""""""""""""
+
+Memory input counter 7
+
+.. table:: 
+    :align: center
+    :widths: 13 12 45 24 85
+
+    +-----+---+----+-----+------------------------------------------------------------------------------+
+    |Bit #|R/W|Name|Reset|                                 Description                                  |
+    +=====+===+====+=====+==============================================================================+
+    |20:0 |R  |CNT |0x0  |Reports how many samples have been pushed to the SFU from this MemIn interface|
+    +-----+---+----+-----+------------------------------------------------------------------------------+
diff --git a/rtos/pulp/gap_archi/doc/source/sfu.rst b/rtos/pulp/gap_archi/doc/source/sfu.rst
new file mode 100644
index 000000000..436fefcbf
--- /dev/null
+++ b/rtos/pulp/gap_archi/doc/source/sfu.rst
@@ -0,0 +1,4 @@
+SFU
+---
+
+.. include:: ../ips/sfu.rst
\ No newline at end of file
diff --git a/rtos/pulp/pulpos-2/include/pos/implem/soc_event.h b/rtos/pulp/pulpos-2/include/pos/implem/soc_event.h
index d5ac42495..9690c6fb8 100644
--- a/rtos/pulp/pulpos-2/include/pos/implem/soc_event.h
+++ b/rtos/pulp/pulpos-2/include/pos/implem/soc_event.h
@@ -27,25 +27,25 @@ extern volatile PI_FC_TINY unsigned int pos_soc_event_status[ARCHI_SOC_EVENT_NB_
 
 void pos_soc_event_init();
 
-static inline void pos_soc_event_register_callback_func(unsigned int channel_id, void (*callback)(int, void *))
+static inline void pos_soc_event_register_callback_func(unsigned int event, void (*callback)(uint32_t, void *)) /* Record the callback function for a SoC event; the argument is registered separately. */
 {
-    pos_soc_event_callback[channel_id] = callback;
+    pos_soc_event_callback[event] = callback; // Slot is indexed directly by the event number; no bounds check is done here.
 }
 
-static inline void pos_soc_event_register_callback_arg(unsigned int channel_id, void *arg)
+static inline void pos_soc_event_register_callback_arg(unsigned int event, void *arg) /* Record the opaque pointer associated with a SoC event. */
 {
-    pos_soc_event_callback_arg[channel_id] = arg;
+    pos_soc_event_callback_arg[event] = arg; // Kept in a table parallel to pos_soc_event_callback, same event index.
 }
 
-static inline void pos_soc_event_register_callback(unsigned int channel_id, void (*callback)(int, void *), void *arg)
+static inline void pos_soc_event_register_callback(unsigned int event, void (*callback)(uint32_t, void *), void *arg) /* Register both the callback function and its argument for a SoC event. */
 {
-    pos_soc_event_register_callback_func(channel_id, callback);
-    pos_soc_event_register_callback_arg(channel_id, arg);
+    pos_soc_event_register_callback_func(event, callback); // Function pointer is stored first ...
+    pos_soc_event_register_callback_arg(event, arg); // ... then the argument; the two stores are not done atomically.
 }
 
-static inline void pi_fc_event_handler_set(unsigned int channel_id, void (*callback)(int, void *), void *arg)
+static inline void pi_fc_event_handler_set(unsigned int event, void (*callback)(uint32_t, void *), void *arg) /* pi_-prefixed entry point with the same parameters as pos_soc_event_register_callback. */
 {
-    pos_soc_event_register_callback(channel_id, callback, arg);
+    pos_soc_event_register_callback(event, callback, arg); // Pure forwarding, no extra behavior.
 }
 
 static inline void pi_soc_eu_pr_mask_set(int evt)
@@ -70,4 +70,14 @@ static inline void pos_soc_event_wait(int event)
     hal_irq_restore(irq);
 }
 
-#endif
\ No newline at end of file
+static inline void pi_soc_eu_fc_mask_set(uint32_t event_num) /* Forwards event_num to the soc_eu_fc_mask_clr_set accessor at SOC_EU_ADDR. */
+{
+    soc_eu_fc_mask_clr_set(SOC_EU_ADDR, event_num); // NOTE(review): "mask_set" calls the *_clr_* accessor - presumably a cleared mask bit lets the event reach the FC; confirm against the SoC event unit spec.
+}
+
+static inline void pi_soc_eu_fc_mask_clear(uint32_t event_num) /* Forwards event_num to the soc_eu_fc_mask_set_set accessor at SOC_EU_ADDR. */
+{
+    soc_eu_fc_mask_set_set(SOC_EU_ADDR, event_num); // NOTE(review): "mask_clear" calls the *_set_* accessor - presumably a set mask bit blocks the event; confirm against the SoC event unit spec.
+}
+
+#endif
diff --git a/rtos/pulp/pulpos-2/include/pos/implem/task.h b/rtos/pulp/pulpos-2/include/pos/implem/task.h
index a3f705e78..1e34be980 100644
--- a/rtos/pulp/pulpos-2/include/pos/implem/task.h
+++ b/rtos/pulp/pulpos-2/include/pos/implem/task.h
@@ -40,6 +40,17 @@ static inline void pi_task_destroy(pi_task_t *task)
 }
 
 
+static inline int32_t pi_task_status_get(pi_task_t *task)
+{
+    return task->arg[3]; /* The status is kept in slot 3 of the task's arg array and returned as int32_t. */
+}
+
+static inline void pi_task_status_set(pi_task_t *task, int32_t status)
+{
+    task->arg[3] = status; /* Overwrites slot 3 of the task's arg array, the same slot pi_task_status_get() reads. */
+}
+
+
 static inline void pi_task_wait_on(struct pi_task *task)
 {
     int irq = hal_irq_disable();
diff --git a/rtos/pulp/pulpos-2/kernel/init.c b/rtos/pulp/pulpos-2/kernel/init.c
index 09c8e544b..310ede378 100644
--- a/rtos/pulp/pulpos-2/kernel/init.c
+++ b/rtos/pulp/pulpos-2/kernel/init.c
@@ -72,6 +72,11 @@ static void pos_init_bss()
 }
 
 
+void __attribute__((weak)) pi_bsp_init() /* Weak, empty default: a non-weak pi_bsp_init() linked in elsewhere replaces it. pos_init_start() calls it after hal_irq_enable() and just before main(). */
+{
+}
+
+
 void pos_init_start()
 {
     INIT_INF("Starting runtime initialization\n");
@@ -106,6 +111,8 @@ void pos_init_start()
     // Now now the minimal init are done, we can activate interruptions
     hal_irq_enable();
 
+    pi_bsp_init();
+
     int retval = main();
 
     exit(retval);
diff --git a/rtos/pulp/pulpos-2/rules/pulpos/default_rules.mk b/rtos/pulp/pulpos-2/rules/pulpos/default_rules.mk
index a5edf1b22..061e65c1e 100644
--- a/rtos/pulp/pulpos-2/rules/pulpos/default_rules.mk
+++ b/rtos/pulp/pulpos-2/rules/pulpos/default_rules.mk
@@ -309,6 +309,8 @@ build: $(TARGETS)
 image:
 	gapy $(GAPY_TARGET_OPT) --platform=$(platform) --work-dir=$(TARGET_BUILD_DIR) $(config_args) $(gapy_args) run --image --binary=$(TARGETS) $(runner_args)
 
+flash_noforce: flash  # Alias: no recipe of its own, so requesting it only runs the flash target
+
 flash:
 	gapy $(GAPY_TARGET_OPT) --platform=$(platform) --work-dir=$(TARGET_BUILD_DIR) $(config_args) $(gapy_args) run --flash --binary=$(TARGETS) $(runner_args)
 
@@ -354,4 +356,4 @@ install-lib: build-lib
 #	@echo "  CONFIG_TRACE_LEVEL=<level>    Activate traces for the specified level (0=none, 1=fatal, 2=error, 3=warning, 4=info, 5=debug, 6=trace)."
 #	@echo "  CONFIG_TRACE_ALL=1            Activate all traces. Other traces can be individually activated with CONFIG_TRACE_<NAME>."
 
-.PHONY: image flash exec run dis size help clean all conf build-lib install-lib
+.PHONY: image flash flash_noforce exec run dis size help clean all conf build-lib install-lib
diff --git a/rtos/pulp/pulpos-2/rules/pulpos/src.mk b/rtos/pulp/pulpos-2/rules/pulpos/src.mk
index cd77171cc..43feacbed 100644
--- a/rtos/pulp/pulpos-2/rules/pulpos/src.mk
+++ b/rtos/pulp/pulpos-2/rules/pulpos/src.mk
@@ -30,7 +30,7 @@ endif
 # HYPER
 
 ifeq '$(CONFIG_HYPER)' '1'
-ifneq '$(udma/version)' ''
+ifneq '$(udma/hyper/version)' ''
 ifeq '$(TARGET_CHIP_FAMILY)' 'GAP9'
 HYPER_HAS_ASM = 1
 HYPER_HAS_OCTOSPI = 1
diff --git a/tools/autotiler_v3/Autotiler/AutoTilerLibTypes.h b/tools/autotiler_v3/Autotiler/AutoTilerLibTypes.h
index b5bdac14e..e3d807233 100644
--- a/tools/autotiler_v3/Autotiler/AutoTilerLibTypes.h
+++ b/tools/autotiler_v3/Autotiler/AutoTilerLibTypes.h
@@ -916,6 +916,7 @@ typedef struct {
 	uint64_t PaddedSize;				/* Total size in bytes or bits of this kernel argument, byte aligned, forced padding taken into account */
 	uint64_t BitSize;				/* Total size in bits of this kernel argument (unaligned) */
 	uint64_t PaddedBitSize;				/* Total size in bits of this kernel argument (unaligned), forced padding taken into account */
+	uint64_t Overflow;				/* Amount of read overflow in bytes, it can happen on TILED arg when right or bottom padding is greater than the size of the last tile */
 	KernelArgOneDimDescrT **DimDescr;		/* A vector of dimension description outer to inner */
 	KernelArgOneDimDescrT **IterOrderDimDescr;	/* Reordered DimDescr according to Kernel Iteration Order */
 	int *KerIterDimDescr;				/* Indexed by kernel's IterOrder, if in DimDescr then position in IterOrderDimDescr otherwise -1 */
@@ -1429,6 +1430,7 @@ typedef struct AGraphNodeList_T {
 	ArgBindingDescr_T *Binding;		/* The bindings from which this edge is originating */
 	GraphEdgeWeb_T *Web;			/* Which symbol */
 	unsigned int Size;			/* Size of this symbol as seen in the related kernel argument */
+	unsigned int Guard;			/* Extra space above size in case related arg can read overflow */
 	int Offset;				/* Offset applied to the  base of this symbol in case binding Oper is + or - */
 	int Channel;				/* To which channel this symbol belongs to */
 	int ChannelDepth;			/* Channel depth */
@@ -1457,6 +1459,7 @@ typedef struct {
 	AT_MemLocation_T MemType;
 	unsigned int Address;
 	unsigned int Size;
+	unsigned int Guard;
 	int LiveFirst;
 	int LiveLast;
 	BoxType_T AllocType;
@@ -1484,6 +1487,7 @@ typedef struct AGraphEdgeWeb_T {
 	CKernel_Arg_T *Edge;		/* The symbol, CArgs or Locals in the current graph */
 	unsigned int Index;		/* Index of this Symbol */
 	unsigned int Size;		/* Size of this symbol */
+	unsigned int Guard;		/* Guard on top of Size for this symbol in case of read overflow */
 	int LiveFirst;			/* Graph node index of start life for this symbol */
 	int LiveLast;			/* Graph node index of start stop for this symbol */
 	Kernel_Arg_T *KerArg;		/* This symbol is bounded to this Kernel argument */
diff --git a/tools/autotiler_v3/Autotiler/TilingGenCode.h b/tools/autotiler_v3/Autotiler/TilingGenCode.h
index 53f9bc350..b036d5d5a 100644
--- a/tools/autotiler_v3/Autotiler/TilingGenCode.h
+++ b/tools/autotiler_v3/Autotiler/TilingGenCode.h
@@ -22,6 +22,7 @@ extern void LogicalTileNAddressAndSizeOrig(Kernel_T *Ker, Kernel_Arg_T *Arg, uin
 extern char *BindOpImage(ArgBindingOper Op);
 
 extern char *KernelArgImage(Kernel_T *Ker, Kernel_Arg_T *Arg, CKernel_Arg_T *ArgVal, KernelArgSelect_T ArgSel, KernelIteratorT ArgSpace, KernelIteratorT ItSpace, int *IsInvar);
+extern int EvalArgOverflow(Kernel_T *Ker, Kernel_Arg_T *Arg, Object_T *Obj);
 
 
 #endif
diff --git a/tools/autotiler_v3/CNN_Generators_NE16/CNN_Generators_NE16.c b/tools/autotiler_v3/CNN_Generators_NE16/CNN_Generators_NE16.c
index d78ce6039..ef93bf8b1 100644
--- a/tools/autotiler_v3/CNN_Generators_NE16/CNN_Generators_NE16.c
+++ b/tools/autotiler_v3/CNN_Generators_NE16/CNN_Generators_NE16.c
@@ -422,7 +422,7 @@ Kernel_T *CNN_MM_ConvolutionNE16(
         int WOffsetCfg      = 1;
         int QuantRightShift = 0;
         int QuantBits       = (NeedReduct)?2:(Abs(Out_DataSize)==2?1:0); // 00: 8bit, 01: 16bit, 10: 32bit --> If tiling the channel input dimension you need to streamin (need 32 bits output)
-        int QuantNoRect     = (Out_DataSize>0)?1:0;
+        int QuantNoRect     = (NeedReduct || (Out_DataSize>0))?1:0;
         int NormShift       = 1;
         int NormBias        = 1;
         unsigned int DEFAULT_NE16_JOB_CFG = NE16_DefaultConfig(Filter_DataSizeBits, Mode16, StreamoutMode, FilterMode, LinearMode, StridedMode, NormBits, Streamin, \
@@ -676,7 +676,7 @@ static Kernel_T *CNN_ConvolutionNE16_Internal(
         LayerBandwidth += (Fcx*Fcy*Filter_DataSizeBits*InFeat*(DWConv?1:OutFeat)+7)/8;
         LayerBandwidth += Bias_DataSize*OutFeat;
 
-        if (ConvOper == KOP_CONV && Height == 1 && Fcy == 1) ConvOper = KOP_CONV1D;
+        if (ConvOper == KOP_CONV && Height == 1 && Fcy == 1 && Fcx > 1) ConvOper = KOP_CONV1D;
         ConvKerName = CNN_FindMatchingKernelAttr(ConvOper, KOP_NONE, ParFeat, CALL_NE16_KER, Abs(In_DataSize), Abs(Out_DataSize), Bias_DataSize, 0, 4, Fcx, Fcy, Dcx, Dcy, Scx, Scy,
                                                  &NeedFcx, &NeedFcy, &NeedDcx, &NeedDcy, &NeedScx, &NeedScy, 0);
         if (ConvKerName==0) GenTilingError("CNN_ConvolutionPoolAct_NE16 Kernel: %s, Can't find a matching Convolution basic kernel", Name);
@@ -734,7 +734,7 @@ static Kernel_T *CNN_ConvolutionNE16_Internal(
         int WOffsetCfg      = 1;
         int QuantRightShift = 0;
         int QuantBits       = (NeedReduct)?2:(Abs(Out_DataSize)==2?1:0); // 00: 8bit, 01: 16bit, 10: 32bit --> If tiling the channel input dimension you need to streamin (need 32 bits output)
-        int QuantNoRect     = (Out_DataSize>0 || Mode16)?1:0;
+        int QuantNoRect     = (NeedReduct || (Out_DataSize>0))?1:0;
         int NormShift       = 1;
         int NormBias        = !Mode16;
         unsigned int DEFAULT_NE16_JOB_CFG = NE16_DefaultConfig(Filter_DataSizeBits, Mode16, StreamoutMode, FilterMode, LinearMode, StridedMode, NormBits, Streamin, \
@@ -966,7 +966,7 @@ int CNN_ConvolutionNE16(
         )
 
 {
-        if (Fcx==1 && Fcy==1 && Scx==1 && Scy==1 && Dcx==1 && Dcy==1 && Height==1 && Width==1) {
+        if (Fcx==1 && Fcy==1 && Height==1 && Width==1) {
                 printf("This is a pointwise on 1x1 input --> Mapping to CNN_Linear_NE16\n");
                 CNN_LinearAct_NE16(Name, Ctrl, In_DataSize, Out_DataSize, Bias_DataSize, Scale_DataSize, Filter_DataSizeBits, InFeat, OutFeat, KOP_LINEAR, ActOper);
                 return 1;
@@ -1063,7 +1063,7 @@ int CNN_ConvolutionNE16(
                 OutDim:         Number of outputs
 
                 LinearOper      KOP_LINEAR
-                ActOper         Optional activation function: KOP_NONE, KOP_RELU, KOP_RELUN, KOP_HSWISH, KOP_HSIGMOID, KOP_LEAKYRELU
+                ActOper         Optional activation function: KOP_NONE, KOP_RELU, KOP_RELUN, KOP_RELUM, KOP_HSWISH, KOP_HSIGMOID, KOP_LEAKYRELU, KOP_SIGMOID, KOP_TANH
 
                 Signature:      Name(In, Filter, Bias, Out, Scale, ScaleN, Infos)
 
@@ -1119,7 +1119,7 @@ static Kernel_T *CNN_LinearAct_NE16_Internal(
         LinearKerName = CNN_FindMatchingKernelAttr(LinearOper, KOP_NONE, 0, CALL_NE16_KER, Abs(In_DataSize), 0, Bias_DataSize, 0,0,  0,0,0,0,0,0, 0,0,0,0,0,0, 0);
         if (LinearKerName==0) GenTilingError("CNN_LinearAct_NE16 Kernel: %s, Can't find a matching Linear basic kernel: %d %d", Name, Abs(In_DataSize), Bias_DataSize);
         if (!(ActOper == KOP_NONE || ActOper == KOP_RELU || ActOper == KOP_RELUN || ActOper == KOP_RELUM || ActOper == KOP_HSIGMOID || ActOper == KOP_SIGMOID || ActOper == KOP_HSWISH || ActOper == KOP_LEAKYRELU || ActOper == KOP_TANH))
-                GenTilingError("CNN_ConvolutionPoolAct_NE16 Kernel: %s, ActOper, expecting KOP_NONE, KOP_RELU, KOP_RELUN, KOP_RELUM, KOP_HSIGMOID, KOP_HSWISH or KOP_LEAKYRELU", Name);
+                GenTilingError("CNN_LinearAct_NE16 Kernel: %s, ActOper, expecting KOP_NONE, KOP_RELU, KOP_RELUN, KOP_RELUM, KOP_HSIGMOID, KOP_HSWISH, KOP_LEAKYRELU, KOP_SIGMOID or KOP_TANH", Name);
 
         /* Also when in/out are 16bits you need to streamout 32bits but here the reduction step will be done in the cluster (KOP_DP_REDUCT) */
         int NeedReductScale = Mode16; //Abs(Out_DataSize) == 2;
@@ -1155,7 +1155,7 @@ static Kernel_T *CNN_LinearAct_NE16_Internal(
         int WOffsetCfg      = 1;
         int QuantRightShift = 0;
         int QuantBits       = NeedLinOut?2:((Abs(Out_DataSize)==1)?0:1); // 00: 8bit, 01: 16bit, 10: 32bit
-        int QuantNoRect     = (Out_DataSize>0 || Mode16)?1:0;
+        int QuantNoRect     = (NeedReduct || (Out_DataSize>0))?1:0;
         int NormBias        = !Mode16;
         int NormShift       = 1;
         unsigned int DEFAULT_NE16_JOB_CFG = NE16_DefaultConfig(Filter_DataSizeBits, Mode16, StreamoutMode, FilterMode, LinearMode, StridedMode, NormBits, Streamin, \
diff --git a/tools/autotiler_v3/CNN_Generators_NE16/RNN_Generators_NE16.c b/tools/autotiler_v3/CNN_Generators_NE16/RNN_Generators_NE16.c
index 61db23112..fb4b4693b 100644
--- a/tools/autotiler_v3/CNN_Generators_NE16/RNN_Generators_NE16.c
+++ b/tools/autotiler_v3/CNN_Generators_NE16/RNN_Generators_NE16.c
@@ -651,7 +651,7 @@ int RNN_Stack_NE16(
                                 if (Log) printf("Mapped sequence tile based with %d output size constraint\n", DoConstraint);
                         } else {
                                 if (Log) printf("Failed to map sequence tile based with %d output size constraint, relaxing constraint\n", DoConstraint);
-                                DoConstraint = (DoConstraint>16)?DoConstraint/2:0;
+                                DoConstraint = (DoConstraint>16)?DoConstraint/2:1;
                         }
                 } else {
                         if (Ok) {
@@ -1128,7 +1128,7 @@ int LSTM_Stack_NE16(
                                 if (Log) printf("Mapped sequence tile based with %d output size constraint\n", DoConstraint);
                         } else {
                                 if (Log) printf("Failed to map sequence tile based with %d output size constraint, relaxing constraint\n", DoConstraint);
-                                DoConstraint = (DoConstraint>16)?DoConstraint-8:0;
+                                DoConstraint = (DoConstraint>16)?DoConstraint-8:1;
                         }
                 } else {
                         if (Ok) {
@@ -1664,7 +1664,7 @@ int GRU_Stack_NE16(
                                 if (Log) printf("Mapped sequence tile based with %d output size constraint\n", DoConstraint);
                         } else {
                                 if (Log) printf("Failed to map sequence tile based with %d output size constraint, relaxing constraint\n", DoConstraint);
-                                DoConstraint = (DoConstraint>16)?DoConstraint-8:0;
+                                DoConstraint = (DoConstraint>16)?DoConstraint-8:1;
                         }
                 } else {
                         if (Ok) {
diff --git a/tools/autotiler_v3/CNN_Generators_SQ8/CNN_Generators_SQ8.c b/tools/autotiler_v3/CNN_Generators_SQ8/CNN_Generators_SQ8.c
index 0deeb9d8f..44b3c2891 100644
--- a/tools/autotiler_v3/CNN_Generators_SQ8/CNN_Generators_SQ8.c
+++ b/tools/autotiler_v3/CNN_Generators_SQ8/CNN_Generators_SQ8.c
@@ -448,17 +448,38 @@ void LoadCNN_SQ8_Library()
 	LibKernel("KerParLinearLayer_SQ8", CALL_PARALLEL, 0, "KerLinear_SQ8_T", 		CNN_Match(CNN_OperList(1, KOP_LINEAR), 0, 1, CNN_Type(1,1,0,0,4), 0,0,0,0,0,0));
 
 	/* Linear layer, 8b output with bias and scaling/activation (ReLU, ReLUN) done in a single shot */
-	LibKernel("KerParLinearLayerFullFeatB8_SQ8", CALL_PARALLEL, 0, "KerLinear_SQ8_T", 	CNN_Match(CNN_OperList(1, KOP_LINEAR), CNN_OperList(1, KOP_NONE), 1, CNN_Type(1,1,1,0,1), 0,0,0,0,0,0));
-	LibKernel("KerParLinearLayerFullFeatB8_ReLU_SQ8", CALL_PARALLEL, 0, "KerLinear_SQ8_T", 	CNN_Match(CNN_OperList(1, KOP_LINEAR), CNN_OperList(1, KOP_RELU), 1, CNN_Type(1,1,1,0,1), 0,0,0,0,0,0));
-	LibKernel("KerParLinearLayerFullFeatB8_ReLUN_SQ8", CALL_PARALLEL, 0, "KerLinear_SQ8_T",	CNN_Match(CNN_OperList(1, KOP_LINEAR), CNN_OperList(1, KOP_RELUN), 1, CNN_Type(1,1,1,0,1), 0,0,0,0,0,0));
-
-	LibKernel("KerParLinearLayerFullFeatB16_SQ8", CALL_PARALLEL, 0, "KerLinear_SQ8_T", 	CNN_Match(CNN_OperList(1, KOP_LINEAR), CNN_OperList(1, KOP_NONE), 1, CNN_Type(1,1,2,0,1), 0,0,0,0,0,0));
-	LibKernel("KerParLinearLayerFullFeatB16_ReLU_SQ8", CALL_PARALLEL, 0, "KerLinear_SQ8_T",	CNN_Match(CNN_OperList(1, KOP_LINEAR), CNN_OperList(1, KOP_RELU), 1, CNN_Type(1,1,2,0,1), 0,0,0,0,0,0));
-	LibKernel("KerParLinearLayerFullFeatB16_ReLUN_SQ8", CALL_PARALLEL, 0, "KerLinear_SQ8_T",CNN_Match(CNN_OperList(1, KOP_LINEAR), CNN_OperList(1, KOP_RELUN), 1, CNN_Type(1,1,2,0,1), 0,0,0,0,0,0));
-
-	LibKernel("KerParLinearLayerFullFeatB32_SQ8", CALL_PARALLEL, 0, "KerLinear_SQ8_T", 	CNN_Match(CNN_OperList(1, KOP_LINEAR), CNN_OperList(1, KOP_NONE), 1, CNN_Type(1,1,4,0,1), 0,0,0,0,0,0));
-	LibKernel("KerParLinearLayerFullFeatB32_ReLU_SQ8", CALL_PARALLEL, 0, "KerLinear_SQ8_T", CNN_Match(CNN_OperList(1, KOP_LINEAR), CNN_OperList(1, KOP_RELU), 1, CNN_Type(1,1,4,0,1), 0,0,0,0,0,0));
-	LibKernel("KerParLinearLayerFullFeatB32_ReLUN_SQ8", CALL_PARALLEL, 0, "KerLinear_SQ8_T",CNN_Match(CNN_OperList(1, KOP_LINEAR), CNN_OperList(1, KOP_RELUN), 1, CNN_Type(1,1,4,0,1), 0,0,0,0,0,0));
+	LibKernel("KerParLinearLayerFullFeatB8_SQ8",		 CALL_PARALLEL, 0, "KerLinear_SQ8_T", CNN_Match(CNN_OperList(1, KOP_LINEAR), CNN_OperList(1, KOP_NONE), 	1, CNN_Type(1,1,1,0,1), 0,0,0,0,0,0));
+	LibKernel("KerParLinearLayerFullFeatB8_ReLU_SQ8",	 CALL_PARALLEL, 0, "KerLinear_SQ8_T", CNN_Match(CNN_OperList(1, KOP_LINEAR), CNN_OperList(1, KOP_RELU), 	1, CNN_Type(1,1,1,0,1), 0,0,0,0,0,0));
+	LibKernel("KerParLinearLayerFullFeatB8_ReLUN_SQ8",	 CALL_PARALLEL, 0, "KerLinear_SQ8_T", CNN_Match(CNN_OperList(1, KOP_LINEAR), CNN_OperList(1, KOP_RELUN), 	1, CNN_Type(1,1,1,0,1), 0,0,0,0,0,0));
+	LibKernel("KerParLinearLayerFullFeatB8_ReLUM_SQ8",	 CALL_PARALLEL, 0, "KerLinear_SQ8_T", CNN_Match(CNN_OperList(1, KOP_LINEAR), CNN_OperList(1, KOP_RELUM), 	1, CNN_Type(1,1,1,0,1), 0,0,0,0,0,0));
+	LibKernel("KerParLinearLayerFullFeatB8_ReLUMN_SQ8",	 CALL_PARALLEL, 0, "KerLinear_SQ8_T", CNN_Match(CNN_OperList(1, KOP_LINEAR), CNN_OperList(1, KOP_RELUMN), 	1, CNN_Type(1,1,1,0,1), 0,0,0,0,0,0));
+	LibKernel("KerParLinearLayerFullFeatB8_LeakyReLU_SQ8",	 CALL_PARALLEL, 0, "KerLinear_SQ8_T", CNN_Match(CNN_OperList(1, KOP_LINEAR), CNN_OperList(1, KOP_LEAKYRELU), 	1, CNN_Type(1,1,1,0,1), 0,0,0,0,0,0));
+	LibKernel("KerParLinearLayerFullFeatB8_HSwish_SQ8",	 CALL_PARALLEL, 0, "KerLinear_SQ8_T", CNN_Match(CNN_OperList(1, KOP_LINEAR), CNN_OperList(1, KOP_HSWISH), 	1, CNN_Type(1,1,1,0,1), 0,0,0,0,0,0));
+	LibKernel("KerParLinearLayerFullFeatB8_HSigmoid_SQ8",	 CALL_PARALLEL, 0, "KerLinear_SQ8_T", CNN_Match(CNN_OperList(1, KOP_LINEAR), CNN_OperList(1, KOP_HSIGMOID), 	1, CNN_Type(1,1,1,0,1), 0,0,0,0,0,0));
+	LibKernel("KerParLinearLayerFullFeatB8_Sigmoid_SQ8",	 CALL_PARALLEL, 0, "KerLinear_SQ8_T", CNN_Match(CNN_OperList(1, KOP_LINEAR), CNN_OperList(1, KOP_SIGMOID), 	1, CNN_Type(1,1,1,0,1), 0,0,0,0,0,0));
+	LibKernel("KerParLinearLayerFullFeatB8_Tanh_SQ8",	 CALL_PARALLEL, 0, "KerLinear_SQ8_T", CNN_Match(CNN_OperList(1, KOP_LINEAR), CNN_OperList(1, KOP_TANH), 	1, CNN_Type(1,1,1,0,1), 0,0,0,0,0,0));
+
+	LibKernel("KerParLinearLayerFullFeatB16_SQ8",		 CALL_PARALLEL, 0, "KerLinear_SQ8_T", CNN_Match(CNN_OperList(1, KOP_LINEAR), CNN_OperList(1, KOP_NONE), 	1, CNN_Type(1,1,2,0,1), 0,0,0,0,0,0));
+	LibKernel("KerParLinearLayerFullFeatB16_ReLU_SQ8",	 CALL_PARALLEL, 0, "KerLinear_SQ8_T", CNN_Match(CNN_OperList(1, KOP_LINEAR), CNN_OperList(1, KOP_RELU), 	1, CNN_Type(1,1,2,0,1), 0,0,0,0,0,0));
+	LibKernel("KerParLinearLayerFullFeatB16_ReLUN_SQ8",	 CALL_PARALLEL, 0, "KerLinear_SQ8_T", CNN_Match(CNN_OperList(1, KOP_LINEAR), CNN_OperList(1, KOP_RELUN), 	1, CNN_Type(1,1,2,0,1), 0,0,0,0,0,0));
+	LibKernel("KerParLinearLayerFullFeatB16_ReLUM_SQ8",	 CALL_PARALLEL, 0, "KerLinear_SQ8_T", CNN_Match(CNN_OperList(1, KOP_LINEAR), CNN_OperList(1, KOP_RELUM), 	1, CNN_Type(1,1,2,0,1), 0,0,0,0,0,0));
+	LibKernel("KerParLinearLayerFullFeatB16_ReLUMN_SQ8",	 CALL_PARALLEL, 0, "KerLinear_SQ8_T", CNN_Match(CNN_OperList(1, KOP_LINEAR), CNN_OperList(1, KOP_RELUMN), 	1, CNN_Type(1,1,2,0,1), 0,0,0,0,0,0));
+	LibKernel("KerParLinearLayerFullFeatB16_LeakyReLU_SQ8",	 CALL_PARALLEL, 0, "KerLinear_SQ8_T", CNN_Match(CNN_OperList(1, KOP_LINEAR), CNN_OperList(1, KOP_LEAKYRELU), 	1, CNN_Type(1,1,2,0,1), 0,0,0,0,0,0));
+	LibKernel("KerParLinearLayerFullFeatB16_HSwish_SQ8",	 CALL_PARALLEL, 0, "KerLinear_SQ8_T", CNN_Match(CNN_OperList(1, KOP_LINEAR), CNN_OperList(1, KOP_HSWISH), 	1, CNN_Type(1,1,2,0,1), 0,0,0,0,0,0));
+	LibKernel("KerParLinearLayerFullFeatB16_HSigmoid_SQ8",	 CALL_PARALLEL, 0, "KerLinear_SQ8_T", CNN_Match(CNN_OperList(1, KOP_LINEAR), CNN_OperList(1, KOP_HSIGMOID), 	1, CNN_Type(1,1,2,0,1), 0,0,0,0,0,0));
+	LibKernel("KerParLinearLayerFullFeatB16_Sigmoid_SQ8",	 CALL_PARALLEL, 0, "KerLinear_SQ8_T", CNN_Match(CNN_OperList(1, KOP_LINEAR), CNN_OperList(1, KOP_SIGMOID), 	1, CNN_Type(1,1,2,0,1), 0,0,0,0,0,0));
+	LibKernel("KerParLinearLayerFullFeatB16_Tanh_SQ8",	 CALL_PARALLEL, 0, "KerLinear_SQ8_T", CNN_Match(CNN_OperList(1, KOP_LINEAR), CNN_OperList(1, KOP_TANH), 	1, CNN_Type(1,1,2,0,1), 0,0,0,0,0,0));
+
+	LibKernel("KerParLinearLayerFullFeatB32_SQ8",		 CALL_PARALLEL, 0, "KerLinear_SQ8_T", CNN_Match(CNN_OperList(1, KOP_LINEAR), CNN_OperList(1, KOP_NONE), 	1, CNN_Type(1,1,4,0,1), 0,0,0,0,0,0));
+	LibKernel("KerParLinearLayerFullFeatB32_ReLU_SQ8",	 CALL_PARALLEL, 0, "KerLinear_SQ8_T", CNN_Match(CNN_OperList(1, KOP_LINEAR), CNN_OperList(1, KOP_RELU), 	1, CNN_Type(1,1,4,0,1), 0,0,0,0,0,0));
+	LibKernel("KerParLinearLayerFullFeatB32_ReLUN_SQ8",	 CALL_PARALLEL, 0, "KerLinear_SQ8_T", CNN_Match(CNN_OperList(1, KOP_LINEAR), CNN_OperList(1, KOP_RELUN), 	1, CNN_Type(1,1,4,0,1), 0,0,0,0,0,0));
+	LibKernel("KerParLinearLayerFullFeatB32_ReLUM_SQ8",	 CALL_PARALLEL, 0, "KerLinear_SQ8_T", CNN_Match(CNN_OperList(1, KOP_LINEAR), CNN_OperList(1, KOP_RELUM), 	1, CNN_Type(1,1,4,0,1), 0,0,0,0,0,0));
+	LibKernel("KerParLinearLayerFullFeatB32_ReLUMN_SQ8",	 CALL_PARALLEL, 0, "KerLinear_SQ8_T", CNN_Match(CNN_OperList(1, KOP_LINEAR), CNN_OperList(1, KOP_RELUMN), 	1, CNN_Type(1,1,4,0,1), 0,0,0,0,0,0));
+	LibKernel("KerParLinearLayerFullFeatB32_LeakyReLU_SQ8",	 CALL_PARALLEL, 0, "KerLinear_SQ8_T", CNN_Match(CNN_OperList(1, KOP_LINEAR), CNN_OperList(1, KOP_LEAKYRELU), 	1, CNN_Type(1,1,4,0,1), 0,0,0,0,0,0));
+	LibKernel("KerParLinearLayerFullFeatB32_HSwish_SQ8",	 CALL_PARALLEL, 0, "KerLinear_SQ8_T", CNN_Match(CNN_OperList(1, KOP_LINEAR), CNN_OperList(1, KOP_HSWISH), 	1, CNN_Type(1,1,4,0,1), 0,0,0,0,0,0));
+	LibKernel("KerParLinearLayerFullFeatB32_HSigmoid_SQ8",	 CALL_PARALLEL, 0, "KerLinear_SQ8_T", CNN_Match(CNN_OperList(1, KOP_LINEAR), CNN_OperList(1, KOP_HSIGMOID), 	1, CNN_Type(1,1,4,0,1), 0,0,0,0,0,0));
+	LibKernel("KerParLinearLayerFullFeatB32_Sigmoid_SQ8",	 CALL_PARALLEL, 0, "KerLinear_SQ8_T", CNN_Match(CNN_OperList(1, KOP_LINEAR), CNN_OperList(1, KOP_SIGMOID), 	1, CNN_Type(1,1,4,0,1), 0,0,0,0,0,0));
+	LibKernel("KerParLinearLayerFullFeatB32_Tanh_SQ8",	 CALL_PARALLEL, 0, "KerLinear_SQ8_T", CNN_Match(CNN_OperList(1, KOP_LINEAR), CNN_OperList(1, KOP_TANH), 	1, CNN_Type(1,1,4,0,1), 0,0,0,0,0,0));
 
 	/* Convolution or Linear output reduction with per channel scaling and optional activation. Out != In and In Place (IO)  */
 	LibKernel("KerParReduct_CC_SQ8", CALL_PARALLEL, 0, "KerConvLinReduct_SQ8_T",		CNN_Match(CNN_OperList(1, KOP_DP_REDUCT), CNN_OperList(1, KOP_NONE), 1,
@@ -635,111 +656,281 @@ void LoadCNN_SQ8_Library()
 
 	/* Matrix Multiplication for 1x1 convolutions with channel scaling and optional ReLU or ReLUN activation */
 	/* 8b Bias */
-	LibKernel("KerParMatMulB8_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T",		CNN_Match(CNN_OperList(1, KOP_MATMUL), CNN_OperList(1, KOP_NONE), 1, CNN_Type(1,1,1,0,1), 0,0,0,0,1,1));
-	LibKernel("KerParMatMulB8_ReLU_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T",	CNN_Match(CNN_OperList(1, KOP_MATMUL), CNN_OperList(1, KOP_RELU), 1, CNN_Type(1,1,1,0,1), 0,0,0,0,1,1));
-	LibKernel("KerParMatMulB8_ReLUN_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T",	CNN_Match(CNN_OperList(1, KOP_MATMUL), CNN_OperList(1, KOP_RELUN), 1, CNN_Type(1,1,1,0,1), 0,0,0,0,1,1));
+	LibKernel("KerParMatMulB8_SQ8", 			CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL), CNN_OperList(1, KOP_NONE), 		1, CNN_Type(1,1,1,0,1), 0,0,0,0,1,1));
+	LibKernel("KerParMatMulB8_ReLU_SQ8", 			CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL), CNN_OperList(1, KOP_RELU), 		1, CNN_Type(1,1,1,0,1), 0,0,0,0,1,1));
+	LibKernel("KerParMatMulB8_ReLUN_SQ8", 			CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL), CNN_OperList(1, KOP_RELUN), 	1, CNN_Type(1,1,1,0,1), 0,0,0,0,1,1));
+	LibKernel("KerParMatMulB8_ReLUM_SQ8",	 		CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL), CNN_OperList(1, KOP_RELUM), 	1, CNN_Type(1,1,1,0,1), 0,0,0,0,1,1));
+	LibKernel("KerParMatMulB8_ReLUMN_SQ8", 			CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL), CNN_OperList(1, KOP_RELUMN),	1, CNN_Type(1,1,1,0,1), 0,0,0,0,1,1));
+	LibKernel("KerParMatMulB8_LeakyReLU_SQ8",		CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL), CNN_OperList(1, KOP_LEAKYRELU), 	1, CNN_Type(1,1,1,0,1), 0,0,0,0,1,1));
+	LibKernel("KerParMatMulB8_HSwish_SQ8", 			CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL), CNN_OperList(1, KOP_HSWISH), 	1, CNN_Type(1,1,1,0,1), 0,0,0,0,1,1));
+	LibKernel("KerParMatMulB8_HSigmoid_SQ8", 		CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL), CNN_OperList(1, KOP_HSIGMOID), 	1, CNN_Type(1,1,1,0,1), 0,0,0,0,1,1));
+	LibKernel("KerParMatMulB8_Sigmoid_SQ8", 		CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL), CNN_OperList(1, KOP_SIGMOID), 	1, CNN_Type(1,1,1,0,1), 0,0,0,0,1,1));
+	LibKernel("KerParMatMulB8_Tanh_SQ8", 			CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL), CNN_OperList(1, KOP_TANH), 		1, CNN_Type(1,1,1,0,1), 0,0,0,0,1,1));
+
+	LibKernel("KerParMatMulSxSyB8_SQ8", 			CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL), CNN_OperList(1, KOP_NONE), 		1, CNN_Type(1,1,1,0,1), 0,0,0,0,-1,-1));
+	LibKernel("KerParMatMulSxSyB8_ReLU_SQ8", 		CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL), CNN_OperList(1, KOP_RELU), 		1, CNN_Type(1,1,1,0,1), 0,0,0,0,-1,-1));
+	LibKernel("KerParMatMulSxSyB8_ReLUN_SQ8", 		CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL), CNN_OperList(1, KOP_RELUN),		1, CNN_Type(1,1,1,0,1), 0,0,0,0,-1,-1));
+	LibKernel("KerParMatMulSxSyB8_ReLUM_SQ8",	 	CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL), CNN_OperList(1, KOP_RELUM), 	1, CNN_Type(1,1,1,0,1), 0,0,0,0,-1,-1));
+	LibKernel("KerParMatMulSxSyB8_ReLUMN_SQ8", 		CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL), CNN_OperList(1, KOP_RELUMN),	1, CNN_Type(1,1,1,0,1), 0,0,0,0,-1,-1));
+	LibKernel("KerParMatMulSxSyB8_LeakyReLU_SQ8",		CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL), CNN_OperList(1, KOP_LEAKYRELU), 	1, CNN_Type(1,1,1,0,1), 0,0,0,0,-1,-1));
+	LibKernel("KerParMatMulSxSyB8_HSwish_SQ8", 		CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL), CNN_OperList(1, KOP_HSWISH), 	1, CNN_Type(1,1,1,0,1), 0,0,0,0,-1,-1));
+	LibKernel("KerParMatMulSxSyB8_HSigmoid_SQ8", 		CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL), CNN_OperList(1, KOP_HSIGMOID), 	1, CNN_Type(1,1,1,0,1), 0,0,0,0,-1,-1));
+	LibKernel("KerParMatMulSxSyB8_Sigmoid_SQ8", 		CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL), CNN_OperList(1, KOP_SIGMOID), 	1, CNN_Type(1,1,1,0,1), 0,0,0,0,-1,-1));
+	LibKernel("KerParMatMulSxSyB8_Tanh_SQ8", 		CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL), CNN_OperList(1, KOP_TANH), 		1, CNN_Type(1,1,1,0,1), 0,0,0,0,-1,-1));
 
-	LibKernel("KerParMatMulSxSyB8_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T",	CNN_Match(CNN_OperList(1, KOP_MATMUL), CNN_OperList(1, KOP_NONE), 1, CNN_Type(1,1,1,0,1), 0,0,0,0,-1,-1));
-	LibKernel("KerParMatMulSxSyB8_ReLU_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T",	CNN_Match(CNN_OperList(1, KOP_MATMUL), CNN_OperList(1, KOP_RELU), 1, CNN_Type(1,1,1,0,1), 0,0,0,0,-1,-1));
-	LibKernel("KerParMatMulSxSyB8_ReLUN_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T",	CNN_Match(CNN_OperList(1, KOP_MATMUL), CNN_OperList(1, KOP_RELUN), 1, CNN_Type(1,1,1,0,1), 0,0,0,0,-1,-1));
 
 	/* 16b Bias */
-	LibKernel("KerParMatMulB16_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T",		CNN_Match(CNN_OperList(1, KOP_MATMUL), CNN_OperList(1, KOP_NONE), 1, CNN_Type(1,1,2,0,1), 0,0,0,0,1,1));
-	LibKernel("KerParMatMulB16_ReLU_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T",	CNN_Match(CNN_OperList(1, KOP_MATMUL), CNN_OperList(1, KOP_RELU), 1, CNN_Type(1,1,2,0,1), 0,0,0,0,1,1));
-	LibKernel("KerParMatMulB16_ReLUN_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T",	CNN_Match(CNN_OperList(1, KOP_MATMUL), CNN_OperList(1, KOP_RELUN), 1, CNN_Type(1,1,2,0,1), 0,0,0,0,1,1));
-
-	LibKernel("KerParMatMulSxSyB16_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T",	CNN_Match(CNN_OperList(1, KOP_MATMUL), CNN_OperList(1, KOP_NONE), 1, CNN_Type(1,1,2,0,1), 0,0,0,0,-1,-1));
-	LibKernel("KerParMatMulSxSyB16_ReLU_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T",	CNN_Match(CNN_OperList(1, KOP_MATMUL), CNN_OperList(1, KOP_RELU), 1, CNN_Type(1,1,2,0,1), 0,0,0,0,-1,-1));
-	LibKernel("KerParMatMulSxSyB16_ReLUN_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T",	CNN_Match(CNN_OperList(1, KOP_MATMUL), CNN_OperList(1, KOP_RELUN), 1, CNN_Type(1,1,2,0,1), 0,0,0,0,-1,-1));
+	LibKernel("KerParMatMulB16_SQ8", 			CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL), CNN_OperList(1, KOP_NONE), 		1, CNN_Type(1,1,2,0,1), 0,0,0,0,1,1));
+	LibKernel("KerParMatMulB16_ReLU_SQ8", 			CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL), CNN_OperList(1, KOP_RELU), 		1, CNN_Type(1,1,2,0,1), 0,0,0,0,1,1));
+	LibKernel("KerParMatMulB16_ReLUN_SQ8", 			CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL), CNN_OperList(1, KOP_RELUN), 	1, CNN_Type(1,1,2,0,1), 0,0,0,0,1,1));
+	LibKernel("KerParMatMulB16_ReLUM_SQ8",	 		CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL), CNN_OperList(1, KOP_RELUM), 	1, CNN_Type(1,1,2,0,1), 0,0,0,0,1,1));
+	LibKernel("KerParMatMulB16_ReLUMN_SQ8", 		CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL), CNN_OperList(1, KOP_RELUMN),	1, CNN_Type(1,1,2,0,1), 0,0,0,0,1,1));
+	LibKernel("KerParMatMulB16_LeakyReLU_SQ8",		CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL), CNN_OperList(1, KOP_LEAKYRELU), 	1, CNN_Type(1,1,2,0,1), 0,0,0,0,1,1));
+	LibKernel("KerParMatMulB16_HSwish_SQ8", 		CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL), CNN_OperList(1, KOP_HSWISH), 	1, CNN_Type(1,1,2,0,1), 0,0,0,0,1,1));
+	LibKernel("KerParMatMulB16_HSigmoid_SQ8", 		CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL), CNN_OperList(1, KOP_HSIGMOID), 	1, CNN_Type(1,1,2,0,1), 0,0,0,0,1,1));
+	LibKernel("KerParMatMulB16_Sigmoid_SQ8", 		CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL), CNN_OperList(1, KOP_SIGMOID), 	1, CNN_Type(1,1,2,0,1), 0,0,0,0,1,1));
+	LibKernel("KerParMatMulB16_Tanh_SQ8", 			CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL), CNN_OperList(1, KOP_TANH), 		1, CNN_Type(1,1,2,0,1), 0,0,0,0,1,1));
+
+	LibKernel("KerParMatMulSxSyB16_SQ8", 			CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL), CNN_OperList(1, KOP_NONE), 		1, CNN_Type(1,1,2,0,1), 0,0,0,0,-1,-1));
+	LibKernel("KerParMatMulSxSyB16_ReLU_SQ8", 		CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL), CNN_OperList(1, KOP_RELU), 		1, CNN_Type(1,1,2,0,1), 0,0,0,0,-1,-1));
+	LibKernel("KerParMatMulSxSyB16_ReLUN_SQ8",		CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL), CNN_OperList(1, KOP_RELUN), 	1, CNN_Type(1,1,2,0,1), 0,0,0,0,-1,-1));
+	LibKernel("KerParMatMulSxSyB16_ReLUM_SQ8",	 	CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL), CNN_OperList(1, KOP_RELUM), 	1, CNN_Type(1,1,2,0,1), 0,0,0,0,-1,-1));
+	LibKernel("KerParMatMulSxSyB16_ReLUMN_SQ8", 		CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL), CNN_OperList(1, KOP_RELUMN),	1, CNN_Type(1,1,2,0,1), 0,0,0,0,-1,-1));
+	LibKernel("KerParMatMulSxSyB16_LeakyReLU_SQ8",		CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL), CNN_OperList(1, KOP_LEAKYRELU), 	1, CNN_Type(1,1,2,0,1), 0,0,0,0,-1,-1));
+	LibKernel("KerParMatMulSxSyB16_HSwish_SQ8", 		CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL), CNN_OperList(1, KOP_HSWISH), 	1, CNN_Type(1,1,2,0,1), 0,0,0,0,-1,-1));
+	LibKernel("KerParMatMulSxSyB16_HSigmoid_SQ8", 		CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL), CNN_OperList(1, KOP_HSIGMOID), 	1, CNN_Type(1,1,2,0,1), 0,0,0,0,-1,-1));
+	LibKernel("KerParMatMulSxSyB16_Sigmoid_SQ8", 		CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL), CNN_OperList(1, KOP_SIGMOID), 	1, CNN_Type(1,1,2,0,1), 0,0,0,0,-1,-1));
+	LibKernel("KerParMatMulSxSyB16_Tanh_SQ8", 		CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL), CNN_OperList(1, KOP_TANH), 		1, CNN_Type(1,1,2,0,1), 0,0,0,0,-1,-1));
 
 	/* 32b Bias or No Bias at all */
-	LibKernel("KerParMatMulB32_2x4_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T",	CNN_Match(CNN_OperList(2, KOP_MATMUL, KOP_MATMUL_NOBIAS), CNN_OperList(1, KOP_NONE), 1, CNN_Type(1,1,4,0,1), 0,0,0,0,1,1));
-	LibKernel("KerParMatMulB32_2x4_ReLU_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T",	CNN_Match(CNN_OperList(2, KOP_MATMUL, KOP_MATMUL_NOBIAS), CNN_OperList(1, KOP_RELU), 1, CNN_Type(1,1,4,0,1), 0,0,0,0,1,1));
-	LibKernel("KerParMatMulB32_ReLUN_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T",	CNN_Match(CNN_OperList(2, KOP_MATMUL, KOP_MATMUL_NOBIAS), CNN_OperList(1, KOP_RELUN), 1, CNN_Type(1,1,4,0,1), 0,0,0,0,1,1));
-
-	LibKernel("KerParMatMulTransposedB32_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T",		CNN_Match(CNN_OperList(2, KOP_MATMUL_TRANSPOSED, KOP_MATMUL_NOBIAS_TRANSPOSED), CNN_OperList(1, KOP_NONE), 1, CNN_Type(1,1,4,0,1), 0,0,0,0,1,1));
-	LibKernel("KerParMatMulTransposedB32_ReLU_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T",	CNN_Match(CNN_OperList(2, KOP_MATMUL_TRANSPOSED, KOP_MATMUL_NOBIAS_TRANSPOSED), CNN_OperList(1, KOP_RELU), 1, CNN_Type(1,1,4,0,1), 0,0,0,0,1,1));
-	LibKernel("KerParMatMulTransposedB32_ReLUN_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T",	CNN_Match(CNN_OperList(2, KOP_MATMUL_TRANSPOSED, KOP_MATMUL_NOBIAS_TRANSPOSED), CNN_OperList(1, KOP_RELUN), 1, CNN_Type(1,1,4,0,1), 0,0,0,0,1,1));
-
-	LibKernel("KerParMatMulB32_2x4_PL_SQ8", CALL_PARALLEL, 0, "KerMatMul_PL_SQ8_T",		CNN_Match(CNN_OperList(2, KOP_MATMUL_SCALE_SCALAR, KOP_MATMUL_NOBIAS_SCALE_SCALAR), CNN_OperList(1, KOP_NONE), 1, CNN_Type(1,1,4,0,1), 0,0,0,0,1,1));
-	LibKernel("KerParMatMulB32_2x4_ReLU_PL_SQ8", CALL_PARALLEL, 0, "KerMatMul_PL_SQ8_T",	CNN_Match(CNN_OperList(2, KOP_MATMUL_SCALE_SCALAR, KOP_MATMUL_NOBIAS_SCALE_SCALAR), CNN_OperList(1, KOP_NONE), 1, CNN_Type(1,1,4,0,1), 0,0,0,0,1,1));
-	LibKernel("KerParMatMulB32_2x4_ReLUN_PL_SQ8", CALL_PARALLEL, 0, "KerMatMul_PL_SQ8_T",	CNN_Match(CNN_OperList(2, KOP_MATMUL_SCALE_SCALAR, KOP_MATMUL_NOBIAS_SCALE_SCALAR), CNN_OperList(1, KOP_NONE), 1, CNN_Type(1,1,4,0,1), 0,0,0,0,1,1));
-
-	LibKernel("KerParMatMulTransposedB32_PL_SQ8", CALL_PARALLEL, 0, "KerMatMul_PL_SQ8_T",	  	CNN_Match(CNN_OperList(2, KOP_MATMUL_SCALE_SCALAR_TRANSPOSED, KOP_MATMUL_NOBIAS_SCALE_SCALAR_TRANSPOSED), CNN_OperList(1, KOP_NONE), 1, CNN_Type(1,1,4,0,1), 0,0,0,0,1,1));
-	LibKernel("KerParMatMulTransposedB32_ReLU_PL_SQ8", CALL_PARALLEL, 0, "KerMatMul_PL_SQ8_T",  	CNN_Match(CNN_OperList(2, KOP_MATMUL_SCALE_SCALAR_TRANSPOSED, KOP_MATMUL_NOBIAS_SCALE_SCALAR_TRANSPOSED), CNN_OperList(1, KOP_NONE), 1, CNN_Type(1,1,4,0,1), 0,0,0,0,1,1));
-	LibKernel("KerParMatMulTransposedB32_ReLUN_PL_SQ8", CALL_PARALLEL, 0, "KerMatMul_PL_SQ8_T",  	CNN_Match(CNN_OperList(2, KOP_MATMUL_SCALE_SCALAR_TRANSPOSED, KOP_MATMUL_NOBIAS_SCALE_SCALAR_TRANSPOSED), CNN_OperList(1, KOP_NONE), 1, CNN_Type(1,1,4,0,1), 0,0,0,0,1,1));
-
-	LibKernel("KerParMatMulSxSyB32_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T",	CNN_Match(CNN_OperList(2, KOP_MATMUL, KOP_MATMUL_NOBIAS), CNN_OperList(1, KOP_NONE), 1, CNN_Type(1,1,4,0,1), 0,0,0,0,-1,-1));
-	LibKernel("KerParMatMulSxSyB32_ReLU_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T",	CNN_Match(CNN_OperList(2, KOP_MATMUL, KOP_MATMUL_NOBIAS), CNN_OperList(1, KOP_RELU), 1, CNN_Type(1,1,4,0,1), 0,0,0,0,-1,-1));
-	LibKernel("KerParMatMulSxSyB32_ReLUN_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T",	CNN_Match(CNN_OperList(2, KOP_MATMUL, KOP_MATMUL_NOBIAS), CNN_OperList(1, KOP_RELUN), 1, CNN_Type(1,1,4,0,1), 0,0,0,0,-1,-1));
+	LibKernel("KerParMatMulB32_SQ8", 			CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(2, KOP_MATMUL, KOP_MATMUL_NOBIAS), CNN_OperList(1, KOP_NONE), 	1, CNN_Type(1,1,4,0,1), 0,0,0,0,1,1));
+	LibKernel("KerParMatMulB32_ReLU_SQ8", 			CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(2, KOP_MATMUL, KOP_MATMUL_NOBIAS), CNN_OperList(1, KOP_RELU), 	1, CNN_Type(1,1,4,0,1), 0,0,0,0,1,1));
+	LibKernel("KerParMatMulB32_ReLUN_SQ8", 			CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(2, KOP_MATMUL, KOP_MATMUL_NOBIAS), CNN_OperList(1, KOP_RELUN), 	1, CNN_Type(1,1,4,0,1), 0,0,0,0,1,1));
+	LibKernel("KerParMatMulB32_ReLUM_SQ8",	 		CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(2, KOP_MATMUL, KOP_MATMUL_NOBIAS), CNN_OperList(1, KOP_RELUM), 	1, CNN_Type(1,1,4,0,1), 0,0,0,0,1,1));
+	LibKernel("KerParMatMulB32_ReLUMN_SQ8", 		CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(2, KOP_MATMUL, KOP_MATMUL_NOBIAS), CNN_OperList(1, KOP_RELUMN),	1, CNN_Type(1,1,4,0,1), 0,0,0,0,1,1));
+	LibKernel("KerParMatMulB32_LeakyReLU_SQ8",		CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(2, KOP_MATMUL, KOP_MATMUL_NOBIAS), CNN_OperList(1, KOP_LEAKYRELU), 	1, CNN_Type(1,1,4,0,1), 0,0,0,0,1,1));
+	LibKernel("KerParMatMulB32_HSwish_SQ8", 		CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(2, KOP_MATMUL, KOP_MATMUL_NOBIAS), CNN_OperList(1, KOP_HSWISH), 	1, CNN_Type(1,1,4,0,1), 0,0,0,0,1,1));
+	LibKernel("KerParMatMulB32_HSigmoid_SQ8", 		CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(2, KOP_MATMUL, KOP_MATMUL_NOBIAS), CNN_OperList(1, KOP_HSIGMOID), 	1, CNN_Type(1,1,4,0,1), 0,0,0,0,1,1));
+	LibKernel("KerParMatMulB32_Sigmoid_SQ8", 		CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(2, KOP_MATMUL, KOP_MATMUL_NOBIAS), CNN_OperList(1, KOP_SIGMOID), 	1, CNN_Type(1,1,4,0,1), 0,0,0,0,1,1));
+	LibKernel("KerParMatMulB32_Tanh_SQ8", 			CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(2, KOP_MATMUL, KOP_MATMUL_NOBIAS), CNN_OperList(1, KOP_TANH), 	1, CNN_Type(1,1,4,0,1), 0,0,0,0,1,1));
+
+	LibKernel("KerParMatMulTransposedB32_SQ8", 		CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(2, KOP_MATMUL_TRANSPOSED, KOP_MATMUL_NOBIAS_TRANSPOSED), CNN_OperList(1, KOP_NONE),  	1, CNN_Type(1,1,4,0,1), 0,0,0,0,1,1));
+	LibKernel("KerParMatMulTransposedB32_ReLU_SQ8", 	CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(2, KOP_MATMUL_TRANSPOSED, KOP_MATMUL_NOBIAS_TRANSPOSED), CNN_OperList(1, KOP_RELU),  	1, CNN_Type(1,1,4,0,1), 0,0,0,0,1,1));
+	LibKernel("KerParMatMulTransposedB32_ReLUN_SQ8", 	CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(2, KOP_MATMUL_TRANSPOSED, KOP_MATMUL_NOBIAS_TRANSPOSED), CNN_OperList(1, KOP_RELUN), 	1, CNN_Type(1,1,4,0,1), 0,0,0,0,1,1));
+	LibKernel("KerParMatMulTransposedB32_ReLUM_SQ8",	CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(2, KOP_MATMUL_TRANSPOSED, KOP_MATMUL_NOBIAS_TRANSPOSED), CNN_OperList(1, KOP_RELUM), 	1, CNN_Type(1,1,4,0,1), 0,0,0,0,1,1));
+	LibKernel("KerParMatMulTransposedB32_ReLUMN_SQ8", 	CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(2, KOP_MATMUL_TRANSPOSED, KOP_MATMUL_NOBIAS_TRANSPOSED), CNN_OperList(1, KOP_RELUMN),	1, CNN_Type(1,1,4,0,1), 0,0,0,0,1,1));
+	LibKernel("KerParMatMulTransposedB32_LeakyReLU_SQ8",	CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(2, KOP_MATMUL_TRANSPOSED, KOP_MATMUL_NOBIAS_TRANSPOSED), CNN_OperList(1, KOP_LEAKYRELU), 	1, CNN_Type(1,1,4,0,1), 0,0,0,0,1,1));
+	LibKernel("KerParMatMulTransposedB32_HSwish_SQ8", 	CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(2, KOP_MATMUL_TRANSPOSED, KOP_MATMUL_NOBIAS_TRANSPOSED), CNN_OperList(1, KOP_HSWISH), 	1, CNN_Type(1,1,4,0,1), 0,0,0,0,1,1));
+	LibKernel("KerParMatMulTransposedB32_HSigmoid_SQ8", 	CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(2, KOP_MATMUL_TRANSPOSED, KOP_MATMUL_NOBIAS_TRANSPOSED), CNN_OperList(1, KOP_HSIGMOID), 	1, CNN_Type(1,1,4,0,1), 0,0,0,0,1,1));
+	LibKernel("KerParMatMulTransposedB32_Sigmoid_SQ8", 	CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(2, KOP_MATMUL_TRANSPOSED, KOP_MATMUL_NOBIAS_TRANSPOSED), CNN_OperList(1, KOP_SIGMOID), 	1, CNN_Type(1,1,4,0,1), 0,0,0,0,1,1));
+	LibKernel("KerParMatMulTransposedB32_Tanh_SQ8", 	CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(2, KOP_MATMUL_TRANSPOSED, KOP_MATMUL_NOBIAS_TRANSPOSED), CNN_OperList(1, KOP_TANH), 	1, CNN_Type(1,1,4,0,1), 0,0,0,0,1,1));
+
+	LibKernel("KerParMatMulB32_PL_SQ8", 			CALL_PARALLEL, 0, "KerMatMul_PL_SQ8_T",	CNN_Match(CNN_OperList(2, KOP_MATMUL_SCALE_SCALAR, KOP_MATMUL_NOBIAS_SCALE_SCALAR), CNN_OperList(1, KOP_NONE), 		1, CNN_Type(1,1,4,0,1), 0,0,0,0,1,1));
+	LibKernel("KerParMatMulB32_ReLU_PL_SQ8", 		CALL_PARALLEL, 0, "KerMatMul_PL_SQ8_T",	CNN_Match(CNN_OperList(2, KOP_MATMUL_SCALE_SCALAR, KOP_MATMUL_NOBIAS_SCALE_SCALAR), CNN_OperList(1, KOP_RELU), 		1, CNN_Type(1,1,4,0,1), 0,0,0,0,1,1));
+	LibKernel("KerParMatMulB32_ReLUN_PL_SQ8", 		CALL_PARALLEL, 0, "KerMatMul_PL_SQ8_T",	CNN_Match(CNN_OperList(2, KOP_MATMUL_SCALE_SCALAR, KOP_MATMUL_NOBIAS_SCALE_SCALAR), CNN_OperList(1, KOP_RELUN), 	1, CNN_Type(1,1,4,0,1), 0,0,0,0,1,1));
+	LibKernel("KerParMatMulB32_ReLUM_PL_SQ8",		CALL_PARALLEL, 0, "KerMatMul_PL_SQ8_T", CNN_Match(CNN_OperList(2, KOP_MATMUL_SCALE_SCALAR, KOP_MATMUL_NOBIAS_SCALE_SCALAR), CNN_OperList(1, KOP_RELUM), 	1, CNN_Type(1,1,4,0,1), 0,0,0,0,1,1));
+	LibKernel("KerParMatMulB32_ReLUMN_PL_SQ8", 		CALL_PARALLEL, 0, "KerMatMul_PL_SQ8_T", CNN_Match(CNN_OperList(2, KOP_MATMUL_SCALE_SCALAR, KOP_MATMUL_NOBIAS_SCALE_SCALAR), CNN_OperList(1, KOP_RELUMN),	1, CNN_Type(1,1,4,0,1), 0,0,0,0,1,1));
+	LibKernel("KerParMatMulB32_LeakyReLU_PL_SQ8",		CALL_PARALLEL, 0, "KerMatMul_PL_SQ8_T", CNN_Match(CNN_OperList(2, KOP_MATMUL_SCALE_SCALAR, KOP_MATMUL_NOBIAS_SCALE_SCALAR), CNN_OperList(1, KOP_LEAKYRELU), 	1, CNN_Type(1,1,4,0,1), 0,0,0,0,1,1));
+	LibKernel("KerParMatMulB32_HSwish_PL_SQ8", 		CALL_PARALLEL, 0, "KerMatMul_PL_SQ8_T", CNN_Match(CNN_OperList(2, KOP_MATMUL_SCALE_SCALAR, KOP_MATMUL_NOBIAS_SCALE_SCALAR), CNN_OperList(1, KOP_HSWISH), 	1, CNN_Type(1,1,4,0,1), 0,0,0,0,1,1));
+	LibKernel("KerParMatMulB32_HSigmoid_PL_SQ8", 		CALL_PARALLEL, 0, "KerMatMul_PL_SQ8_T", CNN_Match(CNN_OperList(2, KOP_MATMUL_SCALE_SCALAR, KOP_MATMUL_NOBIAS_SCALE_SCALAR), CNN_OperList(1, KOP_HSIGMOID), 	1, CNN_Type(1,1,4,0,1), 0,0,0,0,1,1));
+	LibKernel("KerParMatMulB32_Sigmoid_PL_SQ8", 		CALL_PARALLEL, 0, "KerMatMul_PL_SQ8_T", CNN_Match(CNN_OperList(2, KOP_MATMUL_SCALE_SCALAR, KOP_MATMUL_NOBIAS_SCALE_SCALAR), CNN_OperList(1, KOP_SIGMOID), 	1, CNN_Type(1,1,4,0,1), 0,0,0,0,1,1));
+	LibKernel("KerParMatMulB32_Tanh_PL_SQ8", 		CALL_PARALLEL, 0, "KerMatMul_PL_SQ8_T", CNN_Match(CNN_OperList(2, KOP_MATMUL_SCALE_SCALAR, KOP_MATMUL_NOBIAS_SCALE_SCALAR), CNN_OperList(1, KOP_TANH), 		1, CNN_Type(1,1,4,0,1), 0,0,0,0,1,1));
+
+	LibKernel("KerParMatMulTransposedB32_PL_SQ8", 		CALL_PARALLEL, 0, "KerMatMul_PL_SQ8_T",	CNN_Match(CNN_OperList(2, KOP_MATMUL_SCALE_SCALAR_TRANSPOSED, KOP_MATMUL_NOBIAS_SCALE_SCALAR_TRANSPOSED), CNN_OperList(1, KOP_NONE), 		1, CNN_Type(1,1,4,0,1), 0,0,0,0,1,1));
+	LibKernel("KerParMatMulTransposedB32_ReLU_PL_SQ8", 	CALL_PARALLEL, 0, "KerMatMul_PL_SQ8_T", CNN_Match(CNN_OperList(2, KOP_MATMUL_SCALE_SCALAR_TRANSPOSED, KOP_MATMUL_NOBIAS_SCALE_SCALAR_TRANSPOSED), CNN_OperList(1, KOP_RELU), 		1, CNN_Type(1,1,4,0,1), 0,0,0,0,1,1));
+	LibKernel("KerParMatMulTransposedB32_ReLUN_PL_SQ8", 	CALL_PARALLEL, 0, "KerMatMul_PL_SQ8_T", CNN_Match(CNN_OperList(2, KOP_MATMUL_SCALE_SCALAR_TRANSPOSED, KOP_MATMUL_NOBIAS_SCALE_SCALAR_TRANSPOSED), CNN_OperList(1, KOP_RELUN), 		1, CNN_Type(1,1,4,0,1), 0,0,0,0,1,1));
+	LibKernel("KerParMatMulTransposedB32_ReLUM_PL_SQ8",	CALL_PARALLEL, 0, "KerMatMul_PL_SQ8_T", CNN_Match(CNN_OperList(2, KOP_MATMUL_SCALE_SCALAR_TRANSPOSED, KOP_MATMUL_NOBIAS_SCALE_SCALAR_TRANSPOSED), CNN_OperList(1, KOP_RELUM), 		1, CNN_Type(1,1,4,0,1), 0,0,0,0,1,1));
+	LibKernel("KerParMatMulTransposedB32_ReLUMN_PL_SQ8", 	CALL_PARALLEL, 0, "KerMatMul_PL_SQ8_T", CNN_Match(CNN_OperList(2, KOP_MATMUL_SCALE_SCALAR_TRANSPOSED, KOP_MATMUL_NOBIAS_SCALE_SCALAR_TRANSPOSED), CNN_OperList(1, KOP_RELUMN),		1, CNN_Type(1,1,4,0,1), 0,0,0,0,1,1));
+	LibKernel("KerParMatMulTransposedB32_LeakyReLU_PL_SQ8",	CALL_PARALLEL, 0, "KerMatMul_PL_SQ8_T", CNN_Match(CNN_OperList(2, KOP_MATMUL_SCALE_SCALAR_TRANSPOSED, KOP_MATMUL_NOBIAS_SCALE_SCALAR_TRANSPOSED), CNN_OperList(1, KOP_LEAKYRELU), 	1, CNN_Type(1,1,4,0,1), 0,0,0,0,1,1));
+	LibKernel("KerParMatMulTransposedB32_HSwish_PL_SQ8", 	CALL_PARALLEL, 0, "KerMatMul_PL_SQ8_T", CNN_Match(CNN_OperList(2, KOP_MATMUL_SCALE_SCALAR_TRANSPOSED, KOP_MATMUL_NOBIAS_SCALE_SCALAR_TRANSPOSED), CNN_OperList(1, KOP_HSWISH), 		1, CNN_Type(1,1,4,0,1), 0,0,0,0,1,1));
+	LibKernel("KerParMatMulTransposedB32_HSigmoid_PL_SQ8", 	CALL_PARALLEL, 0, "KerMatMul_PL_SQ8_T", CNN_Match(CNN_OperList(2, KOP_MATMUL_SCALE_SCALAR_TRANSPOSED, KOP_MATMUL_NOBIAS_SCALE_SCALAR_TRANSPOSED), CNN_OperList(1, KOP_HSIGMOID), 	1, CNN_Type(1,1,4,0,1), 0,0,0,0,1,1));
+	LibKernel("KerParMatMulTransposedB32_Sigmoid_PL_SQ8", 	CALL_PARALLEL, 0, "KerMatMul_PL_SQ8_T", CNN_Match(CNN_OperList(2, KOP_MATMUL_SCALE_SCALAR_TRANSPOSED, KOP_MATMUL_NOBIAS_SCALE_SCALAR_TRANSPOSED), CNN_OperList(1, KOP_SIGMOID), 	1, CNN_Type(1,1,4,0,1), 0,0,0,0,1,1));
+	LibKernel("KerParMatMulTransposedB32_Tanh_PL_SQ8", 	CALL_PARALLEL, 0, "KerMatMul_PL_SQ8_T", CNN_Match(CNN_OperList(2, KOP_MATMUL_SCALE_SCALAR_TRANSPOSED, KOP_MATMUL_NOBIAS_SCALE_SCALAR_TRANSPOSED), CNN_OperList(1, KOP_TANH), 		1, CNN_Type(1,1,4,0,1), 0,0,0,0,1,1));
+
+	LibKernel("KerParMatMulSxSyB32_SQ8", 			CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(2, KOP_MATMUL, KOP_MATMUL_NOBIAS), CNN_OperList(1, KOP_NONE), 	1, CNN_Type(1,1,4,0,1), 0,0,0,0,-1,-1));
+	LibKernel("KerParMatMulSxSyB32_ReLU_SQ8", 		CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(2, KOP_MATMUL, KOP_MATMUL_NOBIAS), CNN_OperList(1, KOP_RELU), 	1, CNN_Type(1,1,4,0,1), 0,0,0,0,-1,-1));
+	LibKernel("KerParMatMulSxSyB32_ReLUN_SQ8",		CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(2, KOP_MATMUL, KOP_MATMUL_NOBIAS), CNN_OperList(1, KOP_RELUN), 	1, CNN_Type(1,1,4,0,1), 0,0,0,0,-1,-1));
+	LibKernel("KerParMatMulSxSyB32_ReLUM_SQ8",	 	CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(2, KOP_MATMUL, KOP_MATMUL_NOBIAS), CNN_OperList(1, KOP_RELUM), 	1, CNN_Type(1,1,4,0,1), 0,0,0,0,-1,-1));
+	LibKernel("KerParMatMulSxSyB32_ReLUMN_SQ8", 		CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(2, KOP_MATMUL, KOP_MATMUL_NOBIAS), CNN_OperList(1, KOP_RELUMN),	1, CNN_Type(1,1,4,0,1), 0,0,0,0,-1,-1));
+	LibKernel("KerParMatMulSxSyB32_LeakyReLU_SQ8",		CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(2, KOP_MATMUL, KOP_MATMUL_NOBIAS), CNN_OperList(1, KOP_LEAKYRELU), 	1, CNN_Type(1,1,4,0,1), 0,0,0,0,-1,-1));
+	LibKernel("KerParMatMulSxSyB32_HSwish_SQ8", 		CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(2, KOP_MATMUL, KOP_MATMUL_NOBIAS), CNN_OperList(1, KOP_HSWISH), 	1, CNN_Type(1,1,4,0,1), 0,0,0,0,-1,-1));
+	LibKernel("KerParMatMulSxSyB32_HSigmoid_SQ8", 		CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(2, KOP_MATMUL, KOP_MATMUL_NOBIAS), CNN_OperList(1, KOP_HSIGMOID), 	1, CNN_Type(1,1,4,0,1), 0,0,0,0,-1,-1));
+	LibKernel("KerParMatMulSxSyB32_Sigmoid_SQ8", 		CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(2, KOP_MATMUL, KOP_MATMUL_NOBIAS), CNN_OperList(1, KOP_SIGMOID), 	1, CNN_Type(1,1,4,0,1), 0,0,0,0,-1,-1));
+	LibKernel("KerParMatMulSxSyB32_Tanh_SQ8", 		CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(2, KOP_MATMUL, KOP_MATMUL_NOBIAS), CNN_OperList(1, KOP_TANH), 	1, CNN_Type(1,1,4,0,1), 0,0,0,0,-1,-1));
 
 	/* Mat Mul based convolutions */
 
 	/* CHW In and Out tensors, [OutFeat,InFeat,Fy,Fx] weights */
-	LibKernel("KerPar_MM_Conv1D_SQ8", CALL_PARALLEL, 0, "Ker_MM_Conv_SQ8_T",	CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_NONE), 1, CNN_Type(1,1,4,0,1), -1,1,1,1,-1,-1));
-	LibKernel("KerPar_MM_Conv1D_ReLU_SQ8", CALL_PARALLEL, 0, "Ker_MM_Conv_SQ8_T",	CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_RELU), 1, CNN_Type(1,1,4,0,1), -1,1,1,1,-1,-1));
-	LibKernel("KerPar_MM_Conv1D_ReLUN_SQ8", CALL_PARALLEL, 0, "Ker_MM_Conv_SQ8_T",	CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_RELUN), 1, CNN_Type(1,1,4,0,1), -1,1,1,1,-1,-1));
-	LibKernel("KerPar_MM_Conv1D_LeakyReLU_SQ8", CALL_PARALLEL, 0, "Ker_MM_Conv_SQ8_T",CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_LEAKYRELU), 1, CNN_Type(1,1,4,0,1), -1,1,1,1,-1,-1));
-
-	LibKernel("KerPar_MM_Conv1D_DxDy_SQ8", CALL_PARALLEL, 0, "Ker_MM_Conv_SQ8_T",	CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_NONE), 1, CNN_Type(1,1,4,0,1), -1,1,-1,-1,-1,-1));
-	LibKernel("KerPar_MM_Conv1D_DxDy_ReLU_SQ8", CALL_PARALLEL, 0, "Ker_MM_Conv_SQ8_T",CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_NONE), 1, CNN_Type(1,1,4,0,1), -1,1,-1,-1,-1,-1));
-
-	LibKernel("KerPar_MM_Conv2D_SQ8", CALL_PARALLEL, 0, "Ker_MM_Conv_SQ8_T",	CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_NONE), 1, CNN_Type(1,1,4,0,1), -1,-1,1,1,-1,-1));
-	LibKernel("KerPar_MM_Conv2D_DxDy_SQ8", CALL_PARALLEL, 0, "Ker_MM_Conv_SQ8_T",	CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_NONE), 1, CNN_Type(1,1,4,0,1), -1,-1,-1,-1,-1,-1));
-	LibKernel("KerPar_MM_Conv2D_DxDy_ReLU_SQ8", CALL_PARALLEL, 0, "Ker_MM_Conv_SQ8_T",CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_NONE), 1, CNN_Type(1,1,4,0,1), -1,-1,-1,-1,-1,-1));
-
+	LibKernel("KerPar_MM_Conv1D_SQ8", 			CALL_PARALLEL, 0, "Ker_MM_Conv_SQ8_T",	CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_NONE),	1, CNN_Type(1,1,4,0,1), -1,1,1,1,-1,-1));
+	LibKernel("KerPar_MM_Conv1D_ReLU_SQ8", 			CALL_PARALLEL, 0, "Ker_MM_Conv_SQ8_T", 	CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_RELU), 	1, CNN_Type(1,1,4,0,1), -1,1,1,1,-1,-1));
+	LibKernel("KerPar_MM_Conv1D_ReLUN_SQ8", 		CALL_PARALLEL, 0, "Ker_MM_Conv_SQ8_T", 	CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_RELUN), 	1, CNN_Type(1,1,4,0,1), -1,1,1,1,-1,-1));
+	LibKernel("KerPar_MM_Conv1D_ReLUM_SQ8", 		CALL_PARALLEL, 0, "Ker_MM_Conv_SQ8_T", 	CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_RELUM), 	1, CNN_Type(1,1,4,0,1), -1,1,1,1,-1,-1));
+	LibKernel("KerPar_MM_Conv1D_ReLUMN_SQ8", 		CALL_PARALLEL, 0, "Ker_MM_Conv_SQ8_T", 	CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_RELUMN), 	1, CNN_Type(1,1,4,0,1), -1,1,1,1,-1,-1));
+	LibKernel("KerPar_MM_Conv1D_LeakyReLU_SQ8",		CALL_PARALLEL, 0, "Ker_MM_Conv_SQ8_T", 	CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_LEAKYRELU), 1, CNN_Type(1,1,4,0,1), -1,1,1,1,-1,-1));
+	LibKernel("KerPar_MM_Conv1D_HSwish_SQ8", 		CALL_PARALLEL, 0, "Ker_MM_Conv_SQ8_T", 	CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_HSWISH), 	1, CNN_Type(1,1,4,0,1), -1,1,1,1,-1,-1));
+	LibKernel("KerPar_MM_Conv1D_HSigmoid_SQ8", 		CALL_PARALLEL, 0, "Ker_MM_Conv_SQ8_T", 	CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_HSIGMOID), 	1, CNN_Type(1,1,4,0,1), -1,1,1,1,-1,-1));
+	LibKernel("KerPar_MM_Conv1D_Sigmoid_SQ8", 		CALL_PARALLEL, 0, "Ker_MM_Conv_SQ8_T", 	CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_SIGMOID), 	1, CNN_Type(1,1,4,0,1), -1,1,1,1,-1,-1));
+	LibKernel("KerPar_MM_Conv1D_Tanh_SQ8", 			CALL_PARALLEL, 0, "Ker_MM_Conv_SQ8_T", 	CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_TANH), 	1, CNN_Type(1,1,4,0,1), -1,1,1,1,-1,-1));
+
+	LibKernel("KerPar_MM_Conv1D_DxDy_SQ8", 			CALL_PARALLEL, 0, "Ker_MM_Conv_SQ8_T",	CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_NONE), 	1, CNN_Type(1,1,4,0,1), -1,1,-1,-1,-1,-1));
+	LibKernel("KerPar_MM_Conv1D_DxDy_ReLU_SQ8", 		CALL_PARALLEL, 0, "Ker_MM_Conv_SQ8_T", 	CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_RELU), 	1, CNN_Type(1,1,4,0,1), -1,1,-1,-1,-1,-1));
+	LibKernel("KerPar_MM_Conv1D_DxDy_ReLUN_SQ8",		CALL_PARALLEL, 0, "Ker_MM_Conv_SQ8_T", 	CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_RELUN), 	1, CNN_Type(1,1,4,0,1), -1,1,-1,-1,-1,-1));
+	LibKernel("KerPar_MM_Conv1D_DxDy_ReLUM_SQ8",		CALL_PARALLEL, 0, "Ker_MM_Conv_SQ8_T", 	CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_RELUM), 	1, CNN_Type(1,1,4,0,1), -1,1,-1,-1,-1,-1));
+	LibKernel("KerPar_MM_Conv1D_DxDy_ReLUMN_SQ8",		CALL_PARALLEL, 0, "Ker_MM_Conv_SQ8_T", 	CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_RELUMN), 	1, CNN_Type(1,1,4,0,1), -1,1,-1,-1,-1,-1));
+	LibKernel("KerPar_MM_Conv1D_DxDy_LeakyReLU_SQ8",	CALL_PARALLEL, 0, "Ker_MM_Conv_SQ8_T", 	CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_LEAKYRELU),	1, CNN_Type(1,1,4,0,1), -1,1,-1,-1,-1,-1));
+	LibKernel("KerPar_MM_Conv1D_DxDy_HSwish_SQ8",		CALL_PARALLEL, 0, "Ker_MM_Conv_SQ8_T", 	CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_HSWISH), 	1, CNN_Type(1,1,4,0,1), -1,1,-1,-1,-1,-1));
+	LibKernel("KerPar_MM_Conv1D_DxDy_HSigmoid_SQ8",		CALL_PARALLEL, 0, "Ker_MM_Conv_SQ8_T", 	CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_HSIGMOID),	1, CNN_Type(1,1,4,0,1), -1,1,-1,-1,-1,-1));
+	LibKernel("KerPar_MM_Conv1D_DxDy_Sigmoid_SQ8",		CALL_PARALLEL, 0, "Ker_MM_Conv_SQ8_T", 	CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_SIGMOID), 	1, CNN_Type(1,1,4,0,1), -1,1,-1,-1,-1,-1));
+	LibKernel("KerPar_MM_Conv1D_DxDy_Tanh_SQ8",		CALL_PARALLEL, 0, "Ker_MM_Conv_SQ8_T", 	CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_TANH), 	1, CNN_Type(1,1,4,0,1), -1,1,-1,-1,-1,-1));
+
+	LibKernel("KerPar_MM_Conv2D_SQ8", 			CALL_PARALLEL, 0, "Ker_MM_Conv_SQ8_T",	CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_NONE), 	1, CNN_Type(1,1,4,0,1), -1,-1,1,1,-1,-1));
+	LibKernel("KerPar_MM_Conv2D_ReLU_SQ8", 			CALL_PARALLEL, 0, "Ker_MM_Conv_SQ8_T", 	CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_RELU), 	1, CNN_Type(1,1,4,0,1), -1,-1,1,1,-1,-1));
+	LibKernel("KerPar_MM_Conv2D_ReLUN_SQ8",			CALL_PARALLEL, 0, "Ker_MM_Conv_SQ8_T", 	CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_RELUN), 	1, CNN_Type(1,1,4,0,1), -1,-1,1,1,-1,-1));
+	LibKernel("KerPar_MM_Conv2D_ReLUM_SQ8",			CALL_PARALLEL, 0, "Ker_MM_Conv_SQ8_T", 	CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_RELUM), 	1, CNN_Type(1,1,4,0,1), -1,-1,1,1,-1,-1));
+	LibKernel("KerPar_MM_Conv2D_ReLUMN_SQ8",		CALL_PARALLEL, 0, "Ker_MM_Conv_SQ8_T", 	CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_RELUMN), 	1, CNN_Type(1,1,4,0,1), -1,-1,1,1,-1,-1));
+	LibKernel("KerPar_MM_Conv2D_LeakyReLU_SQ8",		CALL_PARALLEL, 0, "Ker_MM_Conv_SQ8_T", 	CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_LEAKYRELU),	1, CNN_Type(1,1,4,0,1), -1,-1,1,1,-1,-1));
+	LibKernel("KerPar_MM_Conv2D_HSwish_SQ8",		CALL_PARALLEL, 0, "Ker_MM_Conv_SQ8_T", 	CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_HSWISH), 	1, CNN_Type(1,1,4,0,1), -1,-1,1,1,-1,-1));
+	LibKernel("KerPar_MM_Conv2D_HSigmoid_SQ8",		CALL_PARALLEL, 0, "Ker_MM_Conv_SQ8_T", 	CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_HSIGMOID),	1, CNN_Type(1,1,4,0,1), -1,-1,1,1,-1,-1));
+	LibKernel("KerPar_MM_Conv2D_Sigmoid_SQ8",		CALL_PARALLEL, 0, "Ker_MM_Conv_SQ8_T", 	CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_SIGMOID), 	1, CNN_Type(1,1,4,0,1), -1,-1,1,1,-1,-1));
+	LibKernel("KerPar_MM_Conv2D_Tanh_SQ8",			CALL_PARALLEL, 0, "Ker_MM_Conv_SQ8_T", 	CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_TANH), 	1, CNN_Type(1,1,4,0,1), -1,-1,1,1,-1,-1));
+
+	LibKernel("KerPar_MM_Conv2D_DxDy_SQ8", 			CALL_PARALLEL, 0, "Ker_MM_Conv_SQ8_T",	CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_NONE), 	1, CNN_Type(1,1,4,0,1), -1,-1,-1,-1,-1,-1));
+	LibKernel("KerPar_MM_Conv2D_DxDy_ReLU_SQ8", 		CALL_PARALLEL, 0, "Ker_MM_Conv_SQ8_T", 	CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_RELU), 	1, CNN_Type(1,1,4,0,1), -1,-1,-1,-1,-1,-1));
+	LibKernel("KerPar_MM_Conv2D_DxDy_ReLUN_SQ8",		CALL_PARALLEL, 0, "Ker_MM_Conv_SQ8_T", 	CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_RELUN), 	1, CNN_Type(1,1,4,0,1), -1,-1,-1,-1,-1,-1));
+	LibKernel("KerPar_MM_Conv2D_DxDy_ReLUM_SQ8",		CALL_PARALLEL, 0, "Ker_MM_Conv_SQ8_T", 	CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_RELUM), 	1, CNN_Type(1,1,4,0,1), -1,-1,-1,-1,-1,-1));
+	LibKernel("KerPar_MM_Conv2D_DxDy_ReLUMN_SQ8",		CALL_PARALLEL, 0, "Ker_MM_Conv_SQ8_T", 	CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_RELUMN), 	1, CNN_Type(1,1,4,0,1), -1,-1,-1,-1,-1,-1));
+	LibKernel("KerPar_MM_Conv2D_DxDy_LeakyReLU_SQ8",	CALL_PARALLEL, 0, "Ker_MM_Conv_SQ8_T", 	CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_LEAKYRELU),	1, CNN_Type(1,1,4,0,1), -1,-1,-1,-1,-1,-1));
+	LibKernel("KerPar_MM_Conv2D_DxDy_HSwish_SQ8",		CALL_PARALLEL, 0, "Ker_MM_Conv_SQ8_T", 	CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_HSWISH), 	1, CNN_Type(1,1,4,0,1), -1,-1,-1,-1,-1,-1));
+	LibKernel("KerPar_MM_Conv2D_DxDy_HSigmoid_SQ8",		CALL_PARALLEL, 0, "Ker_MM_Conv_SQ8_T", 	CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_HSIGMOID),	1, CNN_Type(1,1,4,0,1), -1,-1,-1,-1,-1,-1));
+	LibKernel("KerPar_MM_Conv2D_DxDy_Sigmoid_SQ8",		CALL_PARALLEL, 0, "Ker_MM_Conv_SQ8_T", 	CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_SIGMOID), 	1, CNN_Type(1,1,4,0,1), -1,-1,-1,-1,-1,-1));
+	LibKernel("KerPar_MM_Conv2D_DxDy_Tanh_SQ8",		CALL_PARALLEL, 0, "Ker_MM_Conv_SQ8_T", 	CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_TANH), 	1, CNN_Type(1,1,4,0,1), -1,-1,-1,-1,-1,-1));
 
 	/* HWC In and Out tensors, [OutFeat,Fy,Fx,InFeat] weights */
-	LibKernel("KerPar_MM_Conv1x1_HWC_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Conv_SQ8_T",
-											CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_NONE), 1, CNN_Type(1,1,4,0,1), 1,1,1,1,-1,-1));
-	LibKernel("KerPar_MM_Conv1x1_ReLU_HWC_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Conv_SQ8_T",
-											CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_RELU), 1, CNN_Type(1,1,4,0,1), 1,1,1,1,-1,-1));
-	LibKernel("Ker_MM_Conv1x1_HWC_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Conv_SQ8_T",
-											CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_NONE), 0, CNN_Type(1,1,4,0,1), 1,1,1,1,-1,-1));
-	LibKernel("Ker_MM_Conv1x1_ReLU_HWC_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Conv_SQ8_T",
-											CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_RELU), 0, CNN_Type(1,1,4,0,1), 1,1,1,1,-1,-1));
-	LibKernel("KerPar_MM_Conv1D_HWC_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Conv_SQ8_T",
-											CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_NONE), 1, CNN_Type(1,1,4,0,1), -1,1,1,1,-1,-1));
-	LibKernel("KerPar_MM_Conv1D_DxDy_HWC_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Conv_SQ8_T",
-											CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_NONE), 1, CNN_Type(1,1,4,0,1), -1,1,-1,-1,-1,-1));
-	LibKernel("KerPar_MM_Conv2D_HWC_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Conv_SQ8_T",
-											CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_NONE), 1, CNN_Type(1,1,4,0,1), -1,-1,1,1,-1,-1));
-	LibKernel("KerPar_MM_Conv2D_ReLU_HWC_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Conv_SQ8_T",
-											CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_RELU), 1, CNN_Type(1,1,4,0,1), -1,-1,1,1,-1,-1));
-	LibKernel("KerPar_MM_Conv2D_DxDy_HWC_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Conv_SQ8_T",
-											CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_NONE), 1, CNN_Type(1,1,4,0,1), -1,-1,-1,-1,-1,-1));
-	LibKernel("KerPar_MM_ConvDW2D_HWC_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Conv_SQ8_T",
-											CNN_Match(CNN_OperList(1, KOP_MM_CONV_DW), CNN_OperList(1, KOP_NONE), 1, CNN_Type(1,1,4,0,1), -1,-1,1,1,-1,-1));
-	LibKernel("Ker_MM_Conv2D_HWC_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Conv_SQ8_T",
-											CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_NONE), 0, CNN_Type(1,1,4,0,1), -1,-1,1,1,-1,-1));
-	LibKernel("Ker_MM_Conv2D_ReLU_HWC_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Conv_SQ8_T",
-											CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_RELU), 0, CNN_Type(1,1,4,0,1), -1,-1,1,1,-1,-1));
+	LibKernel("KerPar_MM_Conv1x1_HWC_SQ8", 		CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Conv_SQ8_T", 	CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_NONE), 	1, CNN_Type(1,1,4,0,1), 1,1,1,1,-1,-1));
+	LibKernel("KerPar_MM_Conv1x1_ReLU_HWC_SQ8", 	CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Conv_SQ8_T", 	CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_RELU), 	1, CNN_Type(1,1,4,0,1), 1,1,1,1,-1,-1));
+	LibKernel("KerPar_MM_Conv1x1_ReLUN_HWC_SQ8", 	CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Conv_SQ8_T", 	CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_RELUN), 	1, CNN_Type(1,1,4,0,1), 1,1,1,1,-1,-1));
+	LibKernel("KerPar_MM_Conv1x1_ReLUM_HWC_SQ8", 	CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Conv_SQ8_T", 	CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_RELUM), 	1, CNN_Type(1,1,4,0,1), 1,1,1,1,-1,-1));
+	LibKernel("KerPar_MM_Conv1x1_ReLUMN_HWC_SQ8", 	CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Conv_SQ8_T", 	CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_RELUMN), 	1, CNN_Type(1,1,4,0,1), 1,1,1,1,-1,-1));
+	LibKernel("KerPar_MM_Conv1x1_LeakyReLU_HWC_SQ8",CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Conv_SQ8_T", 	CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_LEAKYRELU), 1, CNN_Type(1,1,4,0,1), 1,1,1,1,-1,-1));
+	LibKernel("KerPar_MM_Conv1x1_HSwish_HWC_SQ8", 	CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Conv_SQ8_T", 	CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_HSWISH), 	1, CNN_Type(1,1,4,0,1), 1,1,1,1,-1,-1));
+	LibKernel("KerPar_MM_Conv1x1_HSigmoid_HWC_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Conv_SQ8_T", 	CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_HSIGMOID), 	1, CNN_Type(1,1,4,0,1), 1,1,1,1,-1,-1));
+	LibKernel("KerPar_MM_Conv1x1_Sigmoid_HWC_SQ8", 	CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Conv_SQ8_T", 	CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_SIGMOID), 	1, CNN_Type(1,1,4,0,1), 1,1,1,1,-1,-1));
+	LibKernel("KerPar_MM_Conv1x1_Tanh_HWC_SQ8", 	CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Conv_SQ8_T", 	CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_TANH), 	1, CNN_Type(1,1,4,0,1), 1,1,1,1,-1,-1));
+
+	LibKernel("Ker_MM_Conv1x1_HWC_SQ8", 		CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Conv_SQ8_T", 	CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_NONE), 	0, CNN_Type(1,1,4,0,1), 1,1,1,1,-1,-1));
+	LibKernel("Ker_MM_Conv1x1_ReLU_HWC_SQ8", 	CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Conv_SQ8_T", 	CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_RELU), 	0, CNN_Type(1,1,4,0,1), 1,1,1,1,-1,-1));
+	LibKernel("Ker_MM_Conv1x1_ReLUN_HWC_SQ8", 	CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Conv_SQ8_T", 	CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_RELUN), 	0, CNN_Type(1,1,4,0,1), 1,1,1,1,-1,-1));
+	LibKernel("Ker_MM_Conv1x1_ReLUM_HWC_SQ8", 	CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Conv_SQ8_T", 	CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_RELUM), 	0, CNN_Type(1,1,4,0,1), 1,1,1,1,-1,-1));
+	LibKernel("Ker_MM_Conv1x1_ReLUMN_HWC_SQ8", 	CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Conv_SQ8_T", 	CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_RELUMN), 	0, CNN_Type(1,1,4,0,1), 1,1,1,1,-1,-1));
+	LibKernel("Ker_MM_Conv1x1_LeakyReLU_HWC_SQ8",	CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Conv_SQ8_T", 	CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_LEAKYRELU), 0, CNN_Type(1,1,4,0,1), 1,1,1,1,-1,-1));
+	LibKernel("Ker_MM_Conv1x1_HSwish_HWC_SQ8", 	CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Conv_SQ8_T", 	CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_HSWISH), 	0, CNN_Type(1,1,4,0,1), 1,1,1,1,-1,-1));
+	LibKernel("Ker_MM_Conv1x1_HSigmoid_HWC_SQ8", 	CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Conv_SQ8_T", 	CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_HSIGMOID), 	0, CNN_Type(1,1,4,0,1), 1,1,1,1,-1,-1));
+	LibKernel("Ker_MM_Conv1x1_Sigmoid_HWC_SQ8", 	CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Conv_SQ8_T", 	CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_SIGMOID),	0, CNN_Type(1,1,4,0,1), 1,1,1,1,-1,-1));
+	LibKernel("Ker_MM_Conv1x1_Tanh_HWC_SQ8", 	CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Conv_SQ8_T", 	CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_TANH), 	0, CNN_Type(1,1,4,0,1), 1,1,1,1,-1,-1));
+
+	LibKernel("KerPar_MM_Conv1D_HWC_SQ8", 		CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Conv_SQ8_T", 	CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_NONE), 	1, CNN_Type(1,1,4,0,1), -1,1,1,1,-1,-1));
+	LibKernel("KerPar_MM_Conv1D_ReLU_HWC_SQ8", 	CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Conv_SQ8_T", 	CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_RELU), 	1, CNN_Type(1,1,4,0,1), -1,1,1,1,-1,-1));
+	LibKernel("KerPar_MM_Conv1D_ReLUN_HWC_SQ8", 	CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Conv_SQ8_T", 	CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_RELUN), 	1, CNN_Type(1,1,4,0,1), -1,1,1,1,-1,-1));
+	LibKernel("KerPar_MM_Conv1D_ReLUM_HWC_SQ8", 	CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Conv_SQ8_T", 	CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_RELUM), 	1, CNN_Type(1,1,4,0,1), -1,1,1,1,-1,-1));
+	LibKernel("KerPar_MM_Conv1D_ReLUMN_HWC_SQ8", 	CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Conv_SQ8_T", 	CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_RELUMN), 	1, CNN_Type(1,1,4,0,1), -1,1,1,1,-1,-1));
+	LibKernel("KerPar_MM_Conv1D_LeakyReLU_HWC_SQ8",	CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Conv_SQ8_T", 	CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_LEAKYRELU), 1, CNN_Type(1,1,4,0,1), -1,1,1,1,-1,-1));
+	LibKernel("KerPar_MM_Conv1D_HSwish_HWC_SQ8", 	CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Conv_SQ8_T", 	CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_HSWISH), 	1, CNN_Type(1,1,4,0,1), -1,1,1,1,-1,-1));
+	LibKernel("KerPar_MM_Conv1D_HSigmoid_HWC_SQ8", 	CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Conv_SQ8_T", 	CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_HSIGMOID), 	1, CNN_Type(1,1,4,0,1), -1,1,1,1,-1,-1));
+	LibKernel("KerPar_MM_Conv1D_Sigmoid_HWC_SQ8", 	CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Conv_SQ8_T", 	CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_SIGMOID), 	1, CNN_Type(1,1,4,0,1), -1,1,1,1,-1,-1));
+	LibKernel("KerPar_MM_Conv1D_Tanh_HWC_SQ8", 	CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Conv_SQ8_T", 	CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_TANH), 	1, CNN_Type(1,1,4,0,1), -1,1,1,1,-1,-1));
+
+	LibKernel("KerPar_MM_Conv1D_DxDy_HWC_SQ8", 	CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Conv_SQ8_T", 	CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_NONE), 	1, CNN_Type(1,1,4,0,1), -1,1,-1,-1,-1,-1));
+	LibKernel("KerPar_MM_Conv1D_DxDy_ReLU_HWC_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Conv_SQ8_T", 	CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_RELU), 	1, CNN_Type(1,1,4,0,1), -1,1,-1,-1,-1,-1));
+	LibKernel("KerPar_MM_Conv1D_DxDy_ReLUN_HWC_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Conv_SQ8_T", 	CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_RELUN), 	1, CNN_Type(1,1,4,0,1), -1,1,-1,-1,-1,-1));
+	LibKernel("KerPar_MM_Conv1D_DxDy_ReLUM_HWC_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Conv_SQ8_T", 	CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_RELUM), 	1, CNN_Type(1,1,4,0,1), -1,1,-1,-1,-1,-1));
+	LibKernel("KerPar_MM_Conv1D_DxDy_ReLUMN_HWC_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Conv_SQ8_T", 	CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_RELUMN), 	1, CNN_Type(1,1,4,0,1), -1,1,-1,-1,-1,-1));
+	LibKernel("KerPar_MM_Conv1D_DxDy_LeakyReLU_HWC_SQ8",CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Conv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_LEAKYRELU), 1, CNN_Type(1,1,4,0,1), -1,1,-1,-1,-1,-1));
+	LibKernel("KerPar_MM_Conv1D_DxDy_HSwish_HWC_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Conv_SQ8_T", 	CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_HSWISH), 	1, CNN_Type(1,1,4,0,1), -1,1,-1,-1,-1,-1));
+	LibKernel("KerPar_MM_Conv1D_DxDy_HSigmoid_HWC_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Conv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_HSIGMOID), 	1, CNN_Type(1,1,4,0,1), -1,1,-1,-1,-1,-1));
+	LibKernel("KerPar_MM_Conv1D_DxDy_Sigmoid_HWC_SQ8", CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Conv_SQ8_T", 	CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_SIGMOID), 	1, CNN_Type(1,1,4,0,1), -1,1,-1,-1,-1,-1));
+	LibKernel("KerPar_MM_Conv1D_DxDy_Tanh_HWC_SQ8",	 CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Conv_SQ8_T", 	CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_TANH), 	1, CNN_Type(1,1,4,0,1), -1,1,-1,-1,-1,-1));
+
+	LibKernel("KerPar_MM_Conv2D_HWC_SQ8", 		CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Conv_SQ8_T", 	CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_NONE), 	1, CNN_Type(1,1,4,0,1), -1,-1,1,1,-1,-1));
+	LibKernel("KerPar_MM_Conv2D_ReLU_HWC_SQ8", 	CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Conv_SQ8_T", 	CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_RELU), 	1, CNN_Type(1,1,4,0,1), -1,-1,1,1,-1,-1));
+	LibKernel("KerPar_MM_Conv2D_ReLUN_HWC_SQ8", 	CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Conv_SQ8_T", 	CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_RELUN), 	1, CNN_Type(1,1,4,0,1), -1,-1,1,1,-1,-1));
+	LibKernel("KerPar_MM_Conv2D_ReLUM_HWC_SQ8", 	CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Conv_SQ8_T", 	CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_RELUM), 	1, CNN_Type(1,1,4,0,1), -1,-1,1,1,-1,-1));
+	LibKernel("KerPar_MM_Conv2D_ReLUMN_HWC_SQ8", 	CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Conv_SQ8_T", 	CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_RELUMN), 	1, CNN_Type(1,1,4,0,1), -1,-1,1,1,-1,-1));
+	LibKernel("KerPar_MM_Conv2D_LeakyReLU_HWC_SQ8",	CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Conv_SQ8_T", 	CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_LEAKYRELU), 1, CNN_Type(1,1,4,0,1), -1,-1,1,1,-1,-1));
+	LibKernel("KerPar_MM_Conv2D_HSwish_HWC_SQ8", 	CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Conv_SQ8_T", 	CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_HSWISH), 	1, CNN_Type(1,1,4,0,1), -1,-1,1,1,-1,-1));
+	LibKernel("KerPar_MM_Conv2D_HSigmoid_HWC_SQ8", 	CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Conv_SQ8_T", 	CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_HSIGMOID), 	1, CNN_Type(1,1,4,0,1), -1,-1,1,1,-1,-1));
+	LibKernel("KerPar_MM_Conv2D_Sigmoid_HWC_SQ8", 	CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Conv_SQ8_T", 	CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_SIGMOID), 	1, CNN_Type(1,1,4,0,1), -1,-1,1,1,-1,-1));
+	LibKernel("KerPar_MM_Conv2D_Tanh_HWC_SQ8", 	CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Conv_SQ8_T", 	CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_TANH), 	1, CNN_Type(1,1,4,0,1), -1,-1,1,1,-1,-1));
+
+	LibKernel("KerPar_MM_Conv2D_DxDy_HWC_SQ8", 	CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Conv_SQ8_T", 	CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_NONE), 	1, CNN_Type(1,1,4,0,1), -1,-1,-1,-1,-1,-1));
+	LibKernel("KerPar_MM_Conv2D_DxDy_ReLU_HWC_SQ8",	CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Conv_SQ8_T", 	CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_RELU), 	1, CNN_Type(1,1,4,0,1), -1,-1,-1,-1,-1,-1));
+	LibKernel("KerPar_MM_Conv2D_DxDy_ReLUN_HWC_SQ8",CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Conv_SQ8_T", 	CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_RELUN), 	1, CNN_Type(1,1,4,0,1), -1,-1,-1,-1,-1,-1));
+	LibKernel("KerPar_MM_Conv2D_DxDy_ReLUM_HWC_SQ8",CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Conv_SQ8_T", 	CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_RELUM), 	1, CNN_Type(1,1,4,0,1), -1,-1,-1,-1,-1,-1));
+	LibKernel("KerPar_MM_Conv2D_DxDy_ReLUMN_HWC_SQ8",CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Conv_SQ8_T", 	CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_RELUMN), 	1, CNN_Type(1,1,4,0,1), -1,-1,-1,-1,-1,-1));
+	LibKernel("KerPar_MM_Conv2D_DxDy_LeakyReLU_HWC_SQ8",CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Conv_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_LEAKYRELU), 1, CNN_Type(1,1,4,0,1), -1,-1,-1,-1,-1,-1));
+	LibKernel("KerPar_MM_Conv2D_DxDy_HSwish_HWC_SQ8",CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Conv_SQ8_T", 	CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_HSWISH), 	1, CNN_Type(1,1,4,0,1), -1,-1,-1,-1,-1,-1));
+	LibKernel("KerPar_MM_Conv2D_DxDy_HSigmoid_HWC_SQ8",CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Conv_SQ8_T", 	CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_HSIGMOID), 	1, CNN_Type(1,1,4,0,1), -1,-1,-1,-1,-1,-1));
+	LibKernel("KerPar_MM_Conv2D_DxDy_Sigmoid_HWC_SQ8",CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Conv_SQ8_T", 	CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_SIGMOID), 	1, CNN_Type(1,1,4,0,1), -1,-1,-1,-1,-1,-1));
+	LibKernel("KerPar_MM_Conv2D_DxDy_Tanh_HWC_SQ8",	CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Conv_SQ8_T", 	CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_TANH), 	1, CNN_Type(1,1,4,0,1), -1,-1,-1,-1,-1,-1));
+
+	LibKernel("Ker_MM_Conv2D_HWC_SQ8", 		CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Conv_SQ8_T", 	CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_NONE), 	0, CNN_Type(1,1,4,0,1), -1,-1,1,1,-1,-1));
+	LibKernel("Ker_MM_Conv2D_ReLU_HWC_SQ8", 	CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Conv_SQ8_T", 	CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_RELU), 	0, CNN_Type(1,1,4,0,1), -1,-1,1,1,-1,-1));
+	LibKernel("Ker_MM_Conv2D_ReLUN_HWC_SQ8", 	CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Conv_SQ8_T", 	CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_RELUN), 	0, CNN_Type(1,1,4,0,1), -1,-1,1,1,-1,-1));
+	LibKernel("Ker_MM_Conv2D_ReLUM_HWC_SQ8", 	CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Conv_SQ8_T", 	CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_RELUM), 	0, CNN_Type(1,1,4,0,1), -1,-1,1,1,-1,-1));
+	LibKernel("Ker_MM_Conv2D_ReLUMN_HWC_SQ8", 	CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Conv_SQ8_T", 	CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_RELUMN), 	0, CNN_Type(1,1,4,0,1), -1,-1,1,1,-1,-1));
+	LibKernel("Ker_MM_Conv2D_LeakyReLU_HWC_SQ8",	CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Conv_SQ8_T", 	CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_LEAKYRELU), 0, CNN_Type(1,1,4,0,1), -1,-1,1,1,-1,-1));
+	LibKernel("Ker_MM_Conv2D_HSwish_HWC_SQ8", 	CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Conv_SQ8_T", 	CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_HSWISH), 	0, CNN_Type(1,1,4,0,1), -1,-1,1,1,-1,-1));
+	LibKernel("Ker_MM_Conv2D_HSigmoid_HWC_SQ8", 	CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Conv_SQ8_T", 	CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_HSIGMOID), 	0, CNN_Type(1,1,4,0,1), -1,-1,1,1,-1,-1));
+	LibKernel("Ker_MM_Conv2D_Sigmoid_HWC_SQ8", 	CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Conv_SQ8_T", 	CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_SIGMOID), 	0, CNN_Type(1,1,4,0,1), -1,-1,1,1,-1,-1));
+	LibKernel("Ker_MM_Conv2D_Tanh_HWC_SQ8", 	CALL_PARALLEL|CALL_HWC_KER, 0, "Ker_MM_Conv_SQ8_T", 	CNN_Match(CNN_OperList(1, KOP_MM_CONV), CNN_OperList(1, KOP_TANH), 	0, CNN_Type(1,1,4,0,1), -1,-1,1,1,-1,-1));
 
 	/* Matrix Multiplication for 1x1 convolutions with channel scaling and optional ReLU or ReLUN activation, optimized form when In1 fits entirely into shared L1 */
 	/* 8b Bias */
-	LibKernel("KerParMatMulB8_SF_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T",		CNN_Match(CNN_OperList(1, KOP_MATMUL_SM1), CNN_OperList(1, KOP_NONE), 1, CNN_Type(1,1,1,0,1), 0,0,0,0,1,1));
-	LibKernel("KerParMatMulB8_ReLU_SF_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T",	CNN_Match(CNN_OperList(1, KOP_MATMUL_SM1), CNN_OperList(1, KOP_RELU), 1, CNN_Type(1,1,1,0,1), 0,0,0,0,1,1));
-	LibKernel("KerParMatMulB8_ReLUN_SF_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T",	CNN_Match(CNN_OperList(1, KOP_MATMUL_SM1), CNN_OperList(1, KOP_RELUN), 1, CNN_Type(1,1,1,0,1), 0,0,0,0,1,1));
+	LibKernel("KerParMatMulB8_SF_SQ8", 		CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL_SM1), CNN_OperList(1, KOP_NONE), 	1, CNN_Type(1,1,1,0,1), 0,0,0,0,1,1));
+	LibKernel("KerParMatMulB8_ReLU_SF_SQ8", 	CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL_SM1), CNN_OperList(1, KOP_RELU), 	1, CNN_Type(1,1,1,0,1), 0,0,0,0,1,1));
+	LibKernel("KerParMatMulB8_ReLUN_SF_SQ8",	CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL_SM1), CNN_OperList(1, KOP_RELUN),	1, CNN_Type(1,1,1,0,1), 0,0,0,0,1,1));
+	LibKernel("KerParMatMulB8_ReLUM_SF_SQ8", 	CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL_SM1), CNN_OperList(1, KOP_RELUM), 	1, CNN_Type(1,1,1,0,1), 0,0,0,0,1,1));
+	LibKernel("KerParMatMulB8_ReLUMN_SF_SQ8", 	CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL_SM1), CNN_OperList(1, KOP_RELUMN), 	1, CNN_Type(1,1,1,0,1), 0,0,0,0,1,1));
+	LibKernel("KerParMatMulB8_LeakyReLU_SF_SQ8",	CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL_SM1), CNN_OperList(1, KOP_LEAKYRELU), 1, CNN_Type(1,1,1,0,1), 0,0,0,0,1,1));
+	LibKernel("KerParMatMulB8_HSwish_SF_SQ8", 	CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL_SM1), CNN_OperList(1, KOP_HSWISH), 	1, CNN_Type(1,1,1,0,1), 0,0,0,0,1,1));
+	LibKernel("KerParMatMulB8_HSigmoid_SF_SQ8", 	CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL_SM1), CNN_OperList(1, KOP_HSIGMOID), 	1, CNN_Type(1,1,1,0,1), 0,0,0,0,1,1));
+	LibKernel("KerParMatMulB8_Sigmoid_SF_SQ8", 	CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL_SM1), CNN_OperList(1, KOP_SIGMOID), 	1, CNN_Type(1,1,1,0,1), 0,0,0,0,1,1));
+	LibKernel("KerParMatMulB8_Tanh_SF_SQ8", 	CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL_SM1), CNN_OperList(1, KOP_TANH), 	1, CNN_Type(1,1,1,0,1), 0,0,0,0,1,1));
 
 	/* 16b Bias */
-	LibKernel("KerParMatMulB16_SF_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T",	CNN_Match(CNN_OperList(1, KOP_MATMUL_SM1), CNN_OperList(1, KOP_NONE), 1, CNN_Type(1,1,2,0,1), 0,0,0,0,1,1));
-	LibKernel("KerParMatMulB16_ReLU_SF_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T",	CNN_Match(CNN_OperList(1, KOP_MATMUL_SM1), CNN_OperList(1, KOP_RELU), 1, CNN_Type(1,1,2,0,1), 0,0,0,0,1,1));
-	LibKernel("KerParMatMulB16_ReLUN_SF_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T",	CNN_Match(CNN_OperList(1, KOP_MATMUL_SM1), CNN_OperList(1, KOP_RELUN), 1, CNN_Type(1,1,2,0,1), 0,0,0,0,1,1));
+	LibKernel("KerParMatMulB16_SF_SQ8", 		CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL_SM1), CNN_OperList(1, KOP_NONE), 	1, CNN_Type(1,1,2,0,1), 0,0,0,0,1,1));
+	LibKernel("KerParMatMulB16_ReLU_SF_SQ8", 	CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL_SM1), CNN_OperList(1, KOP_RELU), 	1, CNN_Type(1,1,2,0,1), 0,0,0,0,1,1));
+	LibKernel("KerParMatMulB16_ReLUN_SF_SQ8", 	CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL_SM1), CNN_OperList(1, KOP_RELUN), 	1, CNN_Type(1,1,2,0,1), 0,0,0,0,1,1));
+	LibKernel("KerParMatMulB16_ReLUM_SF_SQ8", 	CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL_SM1), CNN_OperList(1, KOP_RELUM), 	1, CNN_Type(1,1,2,0,1), 0,0,0,0,1,1));
+	LibKernel("KerParMatMulB16_ReLUMN_SF_SQ8", 	CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL_SM1), CNN_OperList(1, KOP_RELUMN), 	1, CNN_Type(1,1,2,0,1), 0,0,0,0,1,1));
+	LibKernel("KerParMatMulB16_LeakyReLU_SF_SQ8",	CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL_SM1), CNN_OperList(1, KOP_LEAKYRELU), 1, CNN_Type(1,1,2,0,1), 0,0,0,0,1,1));
+	LibKernel("KerParMatMulB16_HSwish_SF_SQ8", 	CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL_SM1), CNN_OperList(1, KOP_HSWISH), 	1, CNN_Type(1,1,2,0,1), 0,0,0,0,1,1));
+	LibKernel("KerParMatMulB16_HSigmoid_SF_SQ8", 	CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL_SM1), CNN_OperList(1, KOP_HSIGMOID), 	1, CNN_Type(1,1,2,0,1), 0,0,0,0,1,1));
+	LibKernel("KerParMatMulB16_Sigmoid_SF_SQ8", 	CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL_SM1), CNN_OperList(1, KOP_SIGMOID), 	1, CNN_Type(1,1,2,0,1), 0,0,0,0,1,1));
+	LibKernel("KerParMatMulB16_Tanh_SF_SQ8", 	CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL_SM1), CNN_OperList(1, KOP_TANH), 	1, CNN_Type(1,1,2,0,1), 0,0,0,0,1,1));
 
 	/* 32b Bias */
-	LibKernel("KerParMatMulB32_SF_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T",	CNN_Match(CNN_OperList(1, KOP_MATMUL_SM1), CNN_OperList(1, KOP_NONE), 1, CNN_Type(1,1,4,0,1), 0,0,0,0,1,1));
-	//LibKernel("KerParMatMulB32_ReLU_SF_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T",	CNN_Match(CNN_OperList(1, KOP_MATMUL_SM1), CNN_OperList(1, KOP_RELU), 1, CNN_Type(1,1,4,0,1), 0,0,0,0,1,1));
-	LibKernel("KerParMatMulB32_2x4_ReLU_SF_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T",	CNN_Match(CNN_OperList(1, KOP_MATMUL_SM1), CNN_OperList(1, KOP_RELU), 1, CNN_Type(1,1,4,0,1), 0,0,0,0,1,1));
-	LibKernel("KerParMatMulB32_ReLUN_SF_SQ8", CALL_PARALLEL, 0, "KerMatMul_SQ8_T",	CNN_Match(CNN_OperList(1, KOP_MATMUL_SM1), CNN_OperList(1, KOP_RELUN), 1, CNN_Type(1,1,4,0,1), 0,0,0,0,1,1));
-
+	LibKernel("KerParMatMulB32_SF_SQ8", 		CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL_SM1), CNN_OperList(1, KOP_NONE), 	1, CNN_Type(1,1,4,0,1), 0,0,0,0,1,1));
+	LibKernel("KerParMatMulB32_ReLU_SF_SQ8", 	CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL_SM1), CNN_OperList(1, KOP_RELU), 	1, CNN_Type(1,1,4,0,1), 0,0,0,0,1,1));
+	LibKernel("KerParMatMulB32_ReLUN_SF_SQ8", 	CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL_SM1), CNN_OperList(1, KOP_RELUN), 	1, CNN_Type(1,1,4,0,1), 0,0,0,0,1,1));
+	LibKernel("KerParMatMulB32_ReLUM_SF_SQ8", 	CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL_SM1), CNN_OperList(1, KOP_RELUM), 	1, CNN_Type(1,1,4,0,1), 0,0,0,0,1,1));
+	LibKernel("KerParMatMulB32_ReLUMN_SF_SQ8", 	CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL_SM1), CNN_OperList(1, KOP_RELUMN), 	1, CNN_Type(1,1,4,0,1), 0,0,0,0,1,1));
+	LibKernel("KerParMatMulB32_LeakyReLU_SF_SQ8",	CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL_SM1), CNN_OperList(1, KOP_LEAKYRELU), 1, CNN_Type(1,1,4,0,1), 0,0,0,0,1,1));
+	LibKernel("KerParMatMulB32_HSwish_SF_SQ8", 	CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL_SM1), CNN_OperList(1, KOP_HSWISH), 	1, CNN_Type(1,1,4,0,1), 0,0,0,0,1,1));
+	LibKernel("KerParMatMulB32_HSigmoid_SF_SQ8", 	CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL_SM1), CNN_OperList(1, KOP_HSIGMOID), 	1, CNN_Type(1,1,4,0,1), 0,0,0,0,1,1));
+	LibKernel("KerParMatMulB32_Sigmoid_SF_SQ8", 	CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL_SM1), CNN_OperList(1, KOP_SIGMOID), 	1, CNN_Type(1,1,4,0,1), 0,0,0,0,1,1));
+	LibKernel("KerParMatMulB32_Tanh_SF_SQ8", 	CALL_PARALLEL, 0, "KerMatMul_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATMUL_SM1), CNN_OperList(1, KOP_TANH), 	1, CNN_Type(1,1,4,0,1), 0,0,0,0,1,1));
 
 	/* Matrix by vector multiplication with tensor centric scaling and optional activation */
-	LibKernel("KerParMatVectMul_SQ8", CALL_PARALLEL, 0, "KerMat3_SQ8_T",		CNN_Match(CNN_OperList(1, KOP_MATVECTMUL), CNN_OperList(1, KOP_NONE), 1, CNN_Type(1,1,1,0,1), 0,0,0,0,1,1));
-	LibKernel("KerParMatVectMul_ReLU_SQ8", CALL_PARALLEL, 0, "KerMat3_SQ8_T",	CNN_Match(CNN_OperList(1, KOP_MATVECTMUL), CNN_OperList(1, KOP_RELU), 1, CNN_Type(1,1,1,0,1), 0,0,0,0,1,1));
-	LibKernel("KerParMatVectMul_ReLUN_SQ8", CALL_PARALLEL, 0, "KerMat3_SQ8_T",	CNN_Match(CNN_OperList(1, KOP_MATVECTMUL), CNN_OperList(1, KOP_RELUN), 1, CNN_Type(1,1,1,0,1), 0,0,0,0,1,1));
-	LibKernel("KerParMatVectMul_HSigmoid_SQ8", CALL_PARALLEL, 0, "KerMat3_SQ8_T",	CNN_Match(CNN_OperList(1, KOP_MATVECTMUL), CNN_OperList(1, KOP_HSIGMOID), 1, CNN_Type(1,1,1,0,1), 0,0,0,0,1,1));
-	LibKernel("KerParMatVectMul_HSwish_SQ8", CALL_PARALLEL, 0, "KerMat3_SQ8_T",	CNN_Match(CNN_OperList(1, KOP_MATVECTMUL), CNN_OperList(1, KOP_HSWISH), 1, CNN_Type(1,1,1,0,1), 0,0,0,0,1,1));
-	LibKernel("KerParMatVectMul_LeakyReLU_SQ8", CALL_PARALLEL, 0, "KerMat3_SQ8_T",	CNN_Match(CNN_OperList(1, KOP_MATVECTMUL), CNN_OperList(1, KOP_LEAKYRELU), 1, CNN_Type(1,1,1,0,1), 0,0,0,0,1,1));
+	LibKernel("KerParMatVectMul_SQ8", 		CALL_PARALLEL, 0, "KerMat3_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATVECTMUL), CNN_OperList(1, KOP_NONE), 	1, CNN_Type(1,1,1,0,1), 0,0,0,0,1,1));
+	LibKernel("KerParMatVectMul_ReLU_SQ8", 		CALL_PARALLEL, 0, "KerMat3_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATVECTMUL), CNN_OperList(1, KOP_RELU), 	1, CNN_Type(1,1,1,0,1), 0,0,0,0,1,1));
+	LibKernel("KerParMatVectMul_ReLUN_SQ8", 	CALL_PARALLEL, 0, "KerMat3_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATVECTMUL), CNN_OperList(1, KOP_RELUN),	1, CNN_Type(1,1,1,0,1), 0,0,0,0,1,1));
+	LibKernel("KerParMatVectMul_ReLUM_SQ8", 	CALL_PARALLEL, 0, "KerMat3_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATVECTMUL), CNN_OperList(1, KOP_RELUM), 	1, CNN_Type(1,1,1,0,1), 0,0,0,0,1,1));
+	LibKernel("KerParMatVectMul_ReLUMN_SQ8", 	CALL_PARALLEL, 0, "KerMat3_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATVECTMUL), CNN_OperList(1, KOP_RELUMN), 	1, CNN_Type(1,1,1,0,1), 0,0,0,0,1,1));
+	LibKernel("KerParMatVectMul_LeakyReLU_SQ8",	CALL_PARALLEL, 0, "KerMat3_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATVECTMUL), CNN_OperList(1, KOP_LEAKYRELU), 	1, CNN_Type(1,1,1,0,1), 0,0,0,0,1,1));
+	LibKernel("KerParMatVectMul_HSwish_SQ8", 	CALL_PARALLEL, 0, "KerMat3_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATVECTMUL), CNN_OperList(1, KOP_HSWISH), 	1, CNN_Type(1,1,1,0,1), 0,0,0,0,1,1));
+	LibKernel("KerParMatVectMul_HSigmoid_SQ8", 	CALL_PARALLEL, 0, "KerMat3_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATVECTMUL), CNN_OperList(1, KOP_HSIGMOID), 	1, CNN_Type(1,1,1,0,1), 0,0,0,0,1,1));
+	LibKernel("KerParMatVectMul_Sigmoid_SQ8", 	CALL_PARALLEL, 0, "KerMat3_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATVECTMUL), CNN_OperList(1, KOP_SIGMOID), 	1, CNN_Type(1,1,1,0,1), 0,0,0,0,1,1));
+	LibKernel("KerParMatVectMul_Tanh_SQ8", 		CALL_PARALLEL, 0, "KerMat3_SQ8_T", CNN_Match(CNN_OperList(1, KOP_MATVECTMUL), CNN_OperList(1, KOP_TANH), 	1, CNN_Type(1,1,1,0,1), 0,0,0,0,1,1));
 
 	/* SoftMax, pre scaling */
 	LibKernel("KerParSoftMax_SQ8",      CALL_PARALLEL, 0, "KerSoftMax_SQ8_T",	CNN_Match(CNN_OperList(1, KOP_SOFTMAX), 0, -1, CNN_Type(1,0,0,0,2), 0,0,0,0,0,0));
@@ -950,7 +1141,18 @@ void LoadCNN_SQ8_Library()
         LibKernel("KerParReduct_CC_LeakyReLU_HWC_SQ8", 		CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT), CNN_OperList(1, KOP_LEAKYRELU), 1, CNN_Type(4,0,0,0,1), 0,0,0,0,0,0));
         LibKernel("KerParReduct_CC_Sigmoid_HWC_SQ8",  		CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT), CNN_OperList(1, KOP_SIGMOID),   1, CNN_Type(4,0,0,0,1), 0,0,0,0,0,0));
         LibKernel("KerParReduct_CC_Tanh_HWC_SQ8",  		CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT), CNN_OperList(1, KOP_TANH),  	   1, CNN_Type(4,0,0,0,1), 0,0,0,0,0,0));
- 
+
+	LibKernel("KerParReduct_CC_HWC_USQ8",  	       		CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT), CNN_OperList(1, KOP_NONE),      1, CNN_Type(4,0,0,0,-1), 0,0,0,0,0,0));
+        LibKernel("KerParReduct_CC_ReLU_HWC_USQ8",      	CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT), CNN_OperList(1, KOP_RELU),      1, CNN_Type(4,0,0,0,-1), 0,0,0,0,0,0));
+        LibKernel("KerParReduct_CC_ReLUN_HWC_USQ8",     	CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT), CNN_OperList(1, KOP_RELUN),     1, CNN_Type(4,0,0,0,-1), 0,0,0,0,0,0));
+        LibKernel("KerParReduct_CC_ReLUM_HWC_USQ8",     	CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT), CNN_OperList(1, KOP_RELUM),     1, CNN_Type(4,0,0,0,-1), 0,0,0,0,0,0));
+        LibKernel("KerParReduct_CC_ReLUMN_HWC_USQ8",    	CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT), CNN_OperList(1, KOP_RELUMN),    1, CNN_Type(4,0,0,0,-1), 0,0,0,0,0,0));
+        LibKernel("KerParReduct_CC_HSigmoid_HWC_USQ8",  	CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT), CNN_OperList(1, KOP_HSIGMOID),  1, CNN_Type(4,0,0,0,-1), 0,0,0,0,0,0));
+        LibKernel("KerParReduct_CC_HSwish_HWC_USQ8",    	CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT), CNN_OperList(1, KOP_HSWISH),    1, CNN_Type(4,0,0,0,-1), 0,0,0,0,0,0));
+        LibKernel("KerParReduct_CC_LeakyReLU_HWC_USQ8", 	CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT), CNN_OperList(1, KOP_LEAKYRELU), 1, CNN_Type(4,0,0,0,-1), 0,0,0,0,0,0));
+        LibKernel("KerParReduct_CC_Sigmoid_HWC_USQ8",  		CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT), CNN_OperList(1, KOP_SIGMOID),   1, CNN_Type(4,0,0,0,-1), 0,0,0,0,0,0));
+        LibKernel("KerParReduct_CC_Tanh_HWC_USQ8",  		CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT), CNN_OperList(1, KOP_TANH),  	   1, CNN_Type(4,0,0,0,-1), 0,0,0,0,0,0));
+  
 	LibKernel("KerParReduct_CC_HWC_SQ16",  	       		CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT), CNN_OperList(1, KOP_NONE),      1, CNN_Type(4,0,0,0,2), 0,0,0,0,0,0));
         LibKernel("KerParReduct_CC_ReLU_HWC_SQ16",      	CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT), CNN_OperList(1, KOP_RELU),      1, CNN_Type(4,0,0,0,2), 0,0,0,0,0,0));
         LibKernel("KerParReduct_CC_ReLUN_HWC_SQ16",     	CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT), CNN_OperList(1, KOP_RELUN),     1, CNN_Type(4,0,0,0,2), 0,0,0,0,0,0));
@@ -997,20 +1199,29 @@ void LoadCNN_SQ8_Library()
         LibKernel("KerReduct_CC_NoScale_Tanh_SQ16",	 	CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_NOSCALE), CNN_OperList(1, KOP_TANH),      1, CNN_Type(4,0,0,0,2), 0,0,0,0,0,0));
 
         /* Unsigned */
-        LibKernel("KerReduct_CC_NoScale_USQ8",  	   	CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_NOSCALE),CNN_OperList(1, KOP_NONE),  1, CNN_Type(4,0,0,0,-1), 0,0,0,0,0,0));
-        LibKernel("KerReduct_CC_NoScale_ReLU_USQ8", 		CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_NOSCALE),CNN_OperList(1, KOP_RELU),  1, CNN_Type(4,0,0,0,-1), 0,0,0,0,0,0));
-        LibKernel("KerReduct_CC_NoScale_ReLUN_USQ8",		CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_NOSCALE),CNN_OperList(1, KOP_RELUN), 1, CNN_Type(4,0,0,0,-1), 0,0,0,0,0,0));
-        LibKernel("KerReduct_CC_NoScale_ReLUM_USQ8",		CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_NOSCALE),CNN_OperList(1, KOP_RELUM), 1, CNN_Type(4,0,0,0,-1), 0,0,0,0,0,0));
-        LibKernel("KerReduct_CC_NoScale_ReLUMN_USQ8",		CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_NOSCALE),CNN_OperList(1, KOP_RELUMN),1, CNN_Type(4,0,0,0,-1), 0,0,0,0,0,0));
-
-        LibKernel("KerReduct_CC_NoScale_USQ16",  		CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_NOSCALE),CNN_OperList(1, KOP_NONE),  1, CNN_Type(4,0,0,0,-2), 0,0,0,0,0,0));
-        LibKernel("KerReduct_CC_NoScale_ReLU_USQ16", 		CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_NOSCALE),CNN_OperList(1, KOP_RELU),  1, CNN_Type(4,0,0,0,-2), 0,0,0,0,0,0));
-        LibKernel("KerReduct_CC_NoScale_ReLUN_USQ16",		CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_NOSCALE),CNN_OperList(1, KOP_RELUN), 1, CNN_Type(4,0,0,0,-2), 0,0,0,0,0,0));
-        LibKernel("KerReduct_CC_NoScale_ReLUM_USQ16",		CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_NOSCALE),CNN_OperList(1, KOP_RELUM), 1, CNN_Type(4,0,0,0,-2), 0,0,0,0,0,0));
-        LibKernel("KerReduct_CC_NoScale_ReLUMN_USQ16",		CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_NOSCALE),CNN_OperList(1, KOP_RELUMN),1, CNN_Type(4,0,0,0,-2), 0,0,0,0,0,0));
+        LibKernel("KerReduct_CC_NoScale_USQ8",  	   	CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_NOSCALE), CNN_OperList(1, KOP_NONE),  	   1, CNN_Type(4,0,0,0,-1), 0,0,0,0,0,0));
+        LibKernel("KerReduct_CC_NoScale_ReLU_USQ8", 		CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_NOSCALE), CNN_OperList(1, KOP_RELU),  	   1, CNN_Type(4,0,0,0,-1), 0,0,0,0,0,0));
+        LibKernel("KerReduct_CC_NoScale_ReLUN_USQ8",		CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_NOSCALE), CNN_OperList(1, KOP_RELUN), 	   1, CNN_Type(4,0,0,0,-1), 0,0,0,0,0,0));
+        LibKernel("KerReduct_CC_NoScale_ReLUM_USQ8",		CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_NOSCALE), CNN_OperList(1, KOP_RELUM), 	   1, CNN_Type(4,0,0,0,-1), 0,0,0,0,0,0));
+        LibKernel("KerReduct_CC_NoScale_ReLUMN_USQ8",		CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_NOSCALE), CNN_OperList(1, KOP_RELUMN),	   1, CNN_Type(4,0,0,0,-1), 0,0,0,0,0,0));
+        LibKernel("KerReduct_CC_NoScale_HSigmoid_USQ8",	 	CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_NOSCALE), CNN_OperList(1, KOP_HSIGMOID),  1, CNN_Type(4,0,0,0,-1), 0,0,0,0,0,0));
+        LibKernel("KerReduct_CC_NoScale_HSwish_USQ8",	 	CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_NOSCALE), CNN_OperList(1, KOP_HSWISH),    1, CNN_Type(4,0,0,0,-1), 0,0,0,0,0,0));
+        LibKernel("KerReduct_CC_NoScale_LeakyReLU_USQ8", 	CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_NOSCALE), CNN_OperList(1, KOP_LEAKYRELU), 1, CNN_Type(4,0,0,0,-1), 0,0,0,0,0,0));
+        LibKernel("KerReduct_CC_NoScale_Sigmoid_USQ8",	 	CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_NOSCALE), CNN_OperList(1, KOP_SIGMOID),   1, CNN_Type(4,0,0,0,-1), 0,0,0,0,0,0));
+        LibKernel("KerReduct_CC_NoScale_Tanh_USQ8", 		CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_NOSCALE), CNN_OperList(1, KOP_TANH),      1, CNN_Type(4,0,0,0,-1), 0,0,0,0,0,0));
+
+        LibKernel("KerReduct_CC_NoScale_USQ16",  		CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_NOSCALE), CNN_OperList(1, KOP_NONE),      1, CNN_Type(4,0,0,0,-2), 0,0,0,0,0,0));
+        LibKernel("KerReduct_CC_NoScale_ReLU_USQ16", 		CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_NOSCALE), CNN_OperList(1, KOP_RELU),      1, CNN_Type(4,0,0,0,-2), 0,0,0,0,0,0));
+        LibKernel("KerReduct_CC_NoScale_ReLUN_USQ16",		CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_NOSCALE), CNN_OperList(1, KOP_RELUN),     1, CNN_Type(4,0,0,0,-2), 0,0,0,0,0,0));
+        LibKernel("KerReduct_CC_NoScale_ReLUM_USQ16",		CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_NOSCALE), CNN_OperList(1, KOP_RELUM),     1, CNN_Type(4,0,0,0,-2), 0,0,0,0,0,0));
+        LibKernel("KerReduct_CC_NoScale_ReLUMN_USQ16",		CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_NOSCALE), CNN_OperList(1, KOP_RELUMN),    1, CNN_Type(4,0,0,0,-2), 0,0,0,0,0,0));
+        LibKernel("KerReduct_CC_NoScale_HSigmoid_USQ16",	CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_NOSCALE), CNN_OperList(1, KOP_HSIGMOID),  1, CNN_Type(4,0,0,0,-2), 0,0,0,0,0,0));
+        LibKernel("KerReduct_CC_NoScale_HSwish_USQ16",		CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_NOSCALE), CNN_OperList(1, KOP_HSWISH),    1, CNN_Type(4,0,0,0,-2), 0,0,0,0,0,0));
+        LibKernel("KerReduct_CC_NoScale_LeakyReLU_USQ16",	CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_NOSCALE), CNN_OperList(1, KOP_LEAKYRELU), 1, CNN_Type(4,0,0,0,-2), 0,0,0,0,0,0));
+        LibKernel("KerReduct_CC_NoScale_Sigmoid_USQ16",		CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_NOSCALE), CNN_OperList(1, KOP_SIGMOID),   1, CNN_Type(4,0,0,0,-2), 0,0,0,0,0,0));
+        LibKernel("KerReduct_CC_NoScale_Tanh_USQ16",		CALL_PARALLEL|CALL_HWC_KER, 0, "KerConvLinReduct_SQ8_T", CNN_Match(CNN_OperList(1, KOP_DP_REDUCT_NOSCALE), CNN_OperList(1, KOP_TANH),      1, CNN_Type(4,0,0,0,-2), 0,0,0,0,0,0));
 
 	/* Activations with tensor centric scaling */
-        LibKernel("Ker_Scale_SQ8", CALL_PARALLEL, 0, "KerActivation_SQ8_T",		CNN_Match(CNN_OperList(1, KOP_SCALE), 0, 0, CNN_Type(1,0,0,0,1), 0,0,0,0,0,0));
 	LibKernel("Ker_ActNone_SQ8", CALL_PARALLEL, 0, "KerActivation_SQ8_T",		CNN_Match(CNN_OperList(1, KOP_ACT_NONE), 0, 0, CNN_Type(1,0,0,0,1), 0,0,0,0,0,0));
         LibKernel("Ker_ReLU_SQ8", CALL_PARALLEL, 0, "KerActivation_SQ8_T",		CNN_Match(CNN_OperList(1, KOP_RELU),     0, 0, CNN_Type(1,0,0,0,1), 0,0,0,0,0,0));
         LibKernel("Ker_ReLUN_SQ8", CALL_PARALLEL, 0, "KerActivation_SQ8_T",		CNN_Match(CNN_OperList(1, KOP_RELUN),    0, 0, CNN_Type(1,0,0,0,1), 0,0,0,0,0,0));
@@ -1022,17 +1233,6 @@ void LoadCNN_SQ8_Library()
         LibKernel("Ker_Sigmoid_SQ8", CALL_PARALLEL, 0, "KerActivation_SQ8_T",		CNN_Match(CNN_OperList(1, KOP_SIGMOID),  0, 0, CNN_Type(1,0,0,0,1), 0,0,0,0,0,0));
         LibKernel("Ker_Tanh_SQ8", CALL_PARALLEL, 0, "KerActivation_SQ8_T",		CNN_Match(CNN_OperList(1, KOP_TANH),  0, 0, CNN_Type(1,0,0,0,1), 0,0,0,0,0,0));
 
-	LibKernel("Ker_ActNone_ScaleIn_SQ8", CALL_PARALLEL, 0, "KerActivation_SQ8_T",	CNN_Match(CNN_OperList(1, KOP_ACT_NONE_IN_SCALE), 0, 0, CNN_Type(1,0,0,0,1), 0,0,0,0,0,0));
-        LibKernel("Ker_ReLU_ScaleIn_SQ8", CALL_PARALLEL, 0, "KerActivation_SQ8_T",	CNN_Match(CNN_OperList(1, KOP_RELU_IN_SCALE),     0, 0, CNN_Type(1,0,0,0,1), 0,0,0,0,0,0));
-        LibKernel("Ker_ReLUN_ScaleIn_SQ8", CALL_PARALLEL, 0, "KerActivation_SQ8_T",	CNN_Match(CNN_OperList(1, KOP_RELUN_IN_SCALE),    0, 0, CNN_Type(1,0,0,0,1), 0,0,0,0,0,0));
-        LibKernel("Ker_ReLUM_ScaleIn_SQ8", CALL_PARALLEL, 0, "KerActivation_SQ8_T",	CNN_Match(CNN_OperList(1, KOP_RELUM_IN_SCALE),    0, 0, CNN_Type(1,0,0,0,1), 0,0,0,0,0,0));
-        LibKernel("Ker_ReLUMN_ScaleIn_SQ8", CALL_PARALLEL, 0, "KerActivation_SQ8_T",	CNN_Match(CNN_OperList(1, KOP_RELUMN_IN_SCALE),   0, 0, CNN_Type(1,0,0,0,1), 0,0,0,0,0,0));
-        LibKernel("Ker_HSigmoid_ScaleIn_SQ8", CALL_PARALLEL, 0, "KerActivation_SQ8_T",	CNN_Match(CNN_OperList(1, KOP_HSIGMOID_IN_SCALE), 0, 0, CNN_Type(1,0,0,0,1), 0,0,0,0,0,0));
-        LibKernel("Ker_HSwish_ScaleIn_SQ8", CALL_PARALLEL, 0, "KerActivation_SQ8_T",	CNN_Match(CNN_OperList(1, KOP_HSWISH_IN_SCALE),   0, 0, CNN_Type(1,0,0,0,1), 0,0,0,0,0,0));
-        LibKernel("Ker_LeakyReLU_ScaleIn_SQ8", CALL_PARALLEL, 0, "KerActivation_SQ8_T",	CNN_Match(CNN_OperList(1, KOP_LEAKYRELU_IN_SCALE),0, 0, CNN_Type(1,0,0,0,1), 0,0,0,0,0,0));
-        LibKernel("Ker_Sigmoid_ScaleIn_SQ8", CALL_PARALLEL, 0, "KerActivation_SQ8_T",	CNN_Match(CNN_OperList(1, KOP_SIGMOID_IN_SCALE),  0, 0, CNN_Type(1,0,0,0,1), 0,0,0,0,0,0));
-        LibKernel("Ker_Tanh_ScaleIn_SQ8", CALL_PARALLEL, 0, "KerActivation_SQ8_T",	CNN_Match(CNN_OperList(1, KOP_TANH_IN_SCALE),  0, 0, CNN_Type(1,0,0,0,1), 0,0,0,0,0,0));
-
 	/* Pooling (Max or Avg) with tensor centric scaling and optional ReLU or ReLUN activation */
         LibKernel("KerPool2x2Stride2_SQ8", CALL_PARALLEL, 0, "KerPool_SQ8_T",		CNN_Match(CNN_OperList(2, KOP_MAXPOOL, KOP_AVGPOOL), CNN_OperList(1, KOP_NONE), 0,
 												  CNN_Type(1,0,0,0,1), 2,2,1,1,2,2));
@@ -1100,7 +1300,7 @@ void LoadCNN_SQ8_Library()
 		Spy:		Pooling filter stride y dimension
 		PoolPad:	0: No padding, 1: Zero padding
 
-		ActOper:	Optional activation function: KOP_NONE, KOP_RELU, KOP_RELUN, KOP_HSWISH, KOP_HSIGMOID, KOP_LEAKYRELU
+		ActOper:	Optional activation function: KOP_NONE, KOP_RELU, KOP_RELUN, KOP_RELUM, KOP_RELUMN, KOP_HSWISH, KOP_HSIGMOID, KOP_LEAKYRELU, KOP_SIGMOID, KOP_TANH
 
 		Signature:	Name(In, Filter, Bias, Out, Scale, ScaleN, Infos)
 
@@ -1167,7 +1367,7 @@ static Kernel_T *CNN_MM_ConvolutionPoolAct_SQ8_Internal(
 	}
 
 	if (HWC && Fcy==1 && Fcx==1 && Scy==1 && Scx==1 && Dcy==1 && Dcx==1)
-		return CNN_MatMulAct_SQ8_Internal(Name, Ctrl, Bias_DataSize, Scale_DataSize, InFeat, Height*Width, OutFeat, InFeat, 0,0,0,0, KOP_MATMUL_TRANSPOSED, ActOper, 0);
+		return CNN_MatMulAct_SQ8_Internal(Name, Ctrl, Bias_DataSize, Scale_DataSize, 1, InFeat, Height*Width, OutFeat, InFeat, 0,0,0,0, KOP_MATMUL_TRANSPOSED, ActOper, 0);
 
 	if (ParFeatConv == 2 && HWC && Fcy>1 && (InFeat < 8))
 		ParFeatConv = 0;
@@ -1194,7 +1394,7 @@ static Kernel_T *CNN_MM_ConvolutionPoolAct_SQ8_Internal(
 		GenTilingError("CNN_MM_ConvolutionPoolAct_SQ8 Kernel: %s, ConvOper, expecting KOP_CONV, KOP_CONV_DW", Name);
 	if (!(PoolOper == KOP_NONE || PoolOper == KOP_MAXPOOL || PoolOper == KOP_AVGPOOL))
 		GenTilingError("CNN_MM_ConvolutionPoolAct_SQ8 Kernel: %s, PoolOper, expecting KOP_NONE, KOP_MAXPOOL or KOP_AVGPOOL", Name);
-	if (!(ActOper == KOP_NONE || ActOper == KOP_RELU || ActOper == KOP_RELUN || ActOper == KOP_RELUM || ActOper == KOP_RELUMN || ActOper == KOP_HSIGMOID || ActOper == KOP_HSWISH || ActOper == KOP_LEAKYRELU))
-		GenTilingError("CNN_MM_ConvolutionPoolAct_SQ8 Kernel: %s, ActOper, expecting KOP_NONE, KOP_RELU, KOP_RELUN, KOP_HSIGMOID, KOP_TANH, KOP_HSWISH or KOP_LEAKYRELU", Name);
+	if (!(ActOper == KOP_NONE || ActOper == KOP_RELU || ActOper == KOP_RELUN || ActOper == KOP_RELUM || ActOper == KOP_RELUMN || ActOper == KOP_HSIGMOID || ActOper == KOP_HSWISH || ActOper == KOP_LEAKYRELU || ActOper == KOP_SIGMOID || ActOper == KOP_TANH))
+		GenTilingError("CNN_MM_ConvolutionPoolAct_SQ8 Kernel: %s, ActOper, expecting KOP_NONE, KOP_RELU, KOP_RELUN, KOP_RELUM, KOP_RELUMN, KOP_HSIGMOID, KOP_HSWISH, KOP_LEAKYRELU, KOP_SIGMOID or KOP_TANH", Name);
         if (DWConv && (InFeat != OutFeat)) GenTilingError("CNN_ConvolutionPoolAct_NE16 Kernel: %s, Depth wise convolution requested with InFeat:%d != OutFeat:%d", Name, InFeat, OutFeat);
 
@@ -1255,7 +1455,7 @@ static Kernel_T *CNN_MM_ConvolutionPoolAct_SQ8_Internal(
 							     	 &NeedFpx, &NeedFpy, &NeedDpx, &NeedDpy, &NeedSpx, &NeedSpy, 0);
 		else if (ActOper) StandAloneAct = 0;
 		else if (PoolOper==KOP_AVGPOOL && ActOper==KOP_NONE && HWC) {
-			StandAloneAct = 1; ActOper = KOP_SCALE;
+			StandAloneAct = 1; ActOper = KOP_ACT_NONE;
 		}
 		if (PoolKerName==0) GenTilingError("CNN_MM_ConvolutionPoolAct_SQ8 Kernel: %s, Can't find a matching Pooling %s basic kernel", Name, ActOper?"with linear rectification":"");
 	}
@@ -1754,7 +1954,7 @@ static Kernel_T *CNN_HWC_DWConvolutionPoolAct_SQ8_Internal(
 		Spy:		Pooling filter stride y dimension
 		PoolPad:	0: No padding, 1: Zero padding
 
-		ActOper:	Optional activation function: KOP_NONE, KOP_RELU, KOP_RELUN, KOP_HSWISH, KOP_HSIGMOID, KOP_LEAKYRELU, KOP_SIGMOID
+		ActOper:	Optional activation function: KOP_NONE, KOP_RELU, KOP_RELUN, KOP_HSWISH, KOP_HSIGMOID, KOP_LEAKYRELU, KOP_SIGMOID, KOP_TANH
 
 		Signature:	Name(In, Filter, Bias, Out, Scale, ScaleN, Infos)
 
@@ -1844,10 +2044,6 @@ Kernel_T *CNN_ConvolutionPoolAct_SQ8_Internal(
 		if (Ok!=0) return Ok;
 		if (Log) printf("No solution found for im2col scheme, reverting to standard implementation\n");
 	}
-	if (Fcx==1 && Fcy==1 && Scx==1 && Scy==1 && Dcx==1 && Dcy==1 && Height==1 && Width==1) {
-		printf("This is a pointwise on 1x1 input --> Mapping to CNN_Linear_NE16\n");
-		return CNN_LinearAct_SQ8_Internal(Name, Ctrl, Bias_DataSize, Scale_DataSize, InFeat, OutFeat, KOP_LINEAR, ActOper);
-	}
 
 	if (PoolOper==KOP_NONE) {
 		Fpx=1; Dpx=1; Spx=1; Fpy=1; Dpy=1; Spy=1;
@@ -1974,7 +2170,7 @@ Kernel_T *CNN_ConvolutionPoolAct_SQ8_Internal(
 			if (Ok) return Ok;
 		}
 		if (Log) printf("Mapping this convolution to matrix multiplication\n");
-		Kernel_T *Ok = CNN_MatMulAct_SQ8_Internal(Name, 0, Bias_DataSize, Scale_DataSize, InFeat, OutFeat, Width*Height, InFeat, Width, Height, Scx, Scy, KOP_MATMUL, ActOper, 1);
+		Kernel_T *Ok = CNN_MatMulAct_SQ8_Internal(Name, 0, Bias_DataSize, Scale_DataSize, 1, InFeat, OutFeat, Width*Height, InFeat, Width, Height, Scx, Scy, KOP_MATMUL, ActOper, 1);
 		AT_SetKernelCtrl(AT_KERNEL_NOSOLUTION_ERROR, AT_OPT_ON);
 		if (Ok) return Ok;
 		if (Log) printf("Mapping this convolution to matrix multiplication FAILED, reverting to standard implementation\n");
@@ -2172,7 +2368,7 @@ Kernel_T *CNN_ConvolutionPoolAct_SQ8_Internal(
 		Spy:		Pooling filter stride y dimension
 		PoolPad:	0: No padding, 1: Zero padding
 
-		ActOper:	Optional activation function: KOP_NONE, KOP_RELU, KOP_RELUN, KOP_HSWISH, KOP_HSIGMOID, KOP_LEAKYRELU, KOP_SIGMOID
+		ActOper:	Optional activation function: KOP_NONE, KOP_RELU, KOP_RELUN, KOP_RELUM, KOP_RELUMN, KOP_HSWISH, KOP_HSIGMOID, KOP_LEAKYRELU, KOP_SIGMOID, KOP_TANH
 
 		Signature:	Name(In, Filter, Bias, Out, Scale, ScaleN, Infos)
 
@@ -2227,7 +2423,7 @@ int CNN_GroupedConvolutionPoolAct_SQ8(
 		GenTilingError("CNN_GroupedConvolutionPoolAct_SQ8: Kernel: %s, ConvOper, expecting KOP_NONE, KOP_CONV or KOP_CONV_DW", Name);
 	if (!(PoolOper == KOP_NONE || PoolOper == KOP_MAXPOOL || PoolOper == KOP_AVGPOOL))
 		GenTilingError("CNN_GroupedConvolutionPoolAct_SQ8: Kernel: %s, PoolOper, expecting KOP_NONE, KOP_MAXPOOL or KOP_AVGPOOL", Name);
-	if (!(ActOper == KOP_NONE || ActOper == KOP_RELU || ActOper == KOP_RELUN || ActOper == KOP_RELUM || ActOper == KOP_RELUMN || ActOper == KOP_HSIGMOID || ActOper == KOP_HSWISH || ActOper == KOP_LEAKYRELU))
-		GenTilingError("CNN_GroupedConvolutionPoolAct_SQ8: Kernel: %s, ActOper, expecting KOP_NONE, KOP_RELU, KOP_RELUN, KOP_HSIGMOID, KOP_TANH, KOP_HSWISH or KOP_LEAKYRELU", Name);
+	if (!(ActOper == KOP_NONE || ActOper == KOP_RELU || ActOper == KOP_RELUN || ActOper == KOP_RELUM || ActOper == KOP_RELUMN || ActOper == KOP_HSIGMOID || ActOper == KOP_HSWISH || ActOper == KOP_LEAKYRELU || ActOper == KOP_SIGMOID || ActOper == KOP_TANH))
+		GenTilingError("CNN_GroupedConvolutionPoolAct_SQ8: Kernel: %s, ActOper, expecting KOP_NONE, KOP_RELU, KOP_RELUN, KOP_RELUM, KOP_RELUMN, KOP_HSIGMOID, KOP_HSWISH, KOP_LEAKYRELU, KOP_SIGMOID or KOP_TANH", Name);
 
 	CNN_LayerOutputDim(Width, Height, ConvOper, Fcx, Fcy, Dcx, Dcy, Scx, Scy, ConvPad, PoolOper, Fpx, Fpy, Dpx, Dpy, Spx, Spy, PoolPad, &Wc, &Hc, &Wo, &Ho, 0, 0, 0, 0);
@@ -2309,7 +2505,7 @@ int CNN_GroupedConvolutionPoolAct_SQ8(
 		Spx:		Pooling stride, x dimension
 		Spy:		Pooling stride, y dimension
 
-		ActOper:	Optional activation function: KOP_NONE, KOP_RELU, KOP_RELUN, KOP_HSWISH, KOP_HSIGMOID, KOP_LEAKYRELU, KOP_SIGMOID
+		ActOper:	Optional activation function: KOP_NONE, KOP_RELU, KOP_RELUN, KOP_RELUM, KOP_RELUMN, KOP_HSWISH, KOP_HSIGMOID, KOP_LEAKYRELU, KOP_SIGMOID, KOP_TANH
 
 		Signature:	Name(In, Out, Infos)
 
@@ -2367,7 +2563,7 @@ Kernel_T * CNN_PoolAct_SQ8_Internal(
 
 	if (!(PoolOper == KOP_MAXPOOL || PoolOper == KOP_AVGPOOL))
 		GenTilingError("CNN_Pool_SQ8 Kernel: %s, PoolOper, expecting KOP_MAXPOOL or KOP_AVGPOOL", Name);
-	if (!(ActOper == KOP_NONE || ActOper == KOP_RELU || ActOper == KOP_RELUN || ActOper == KOP_RELUM || ActOper == KOP_RELUMN || ActOper == KOP_HSIGMOID || ActOper == KOP_HSWISH || ActOper == KOP_LEAKYRELU))
-		GenTilingError("CNN_Pool_SQ8 Kernel: %s, ActOper, expecting KOP_NONE, KOP_RELU, KOP_RELUN, KOP_HSIGMOID, KOP_TANH, KOP_HSWISH or KOP_LEAKYRELU", Name);
+	if (!(ActOper == KOP_NONE || ActOper == KOP_RELU || ActOper == KOP_RELUN || ActOper == KOP_RELUM || ActOper == KOP_RELUMN || ActOper == KOP_HSIGMOID || ActOper == KOP_HSWISH || ActOper == KOP_LEAKYRELU || ActOper == KOP_SIGMOID || ActOper == KOP_TANH))
+		GenTilingError("CNN_Pool_SQ8 Kernel: %s, ActOper, expecting KOP_NONE, KOP_RELU, KOP_RELUN, KOP_RELUM, KOP_RELUMN, KOP_HSIGMOID, KOP_HSWISH, KOP_LEAKYRELU, KOP_SIGMOID or KOP_TANH", Name);
 
 	/* Set Kernel characteristics */
@@ -2514,7 +2710,6 @@ Kernel_T * CNN_PoolAct_SQ8_Internal(
 		Height:		Number of lines of a given feature map
 
 		ActOper:	KOP_ACT_NONE, KOP_RELU, KOP_RELUN, KOP_HSWISH, KOP_HSIGMOID, KOP_LEAKYRELU, KOP_SIGMOID
-				KOP_ACT_NONE_IN_SCALE, KOP_RELU_IN_SCALE, KOP_RELUN_IN_SCALE, KOP_HSIGMOID_IN_SCALE, KOP_HSWISH_IN_SCALE, KOP_LEAKYRELU_IN_SCALE, KOP_SIGMOID_IN_SCALE
 
 		Signature:	Name(In, Out, Infos)
 
@@ -2544,9 +2739,8 @@ Kernel_T * CNN_Act_SQ8_Internal(
 	int StandAloneAct = (ActOper!=KOP_NONE);
 	int Log=1;
 
-	if (!(ActOper == KOP_ACT_NONE || ActOper == KOP_RELU || ActOper == KOP_RELUN || ActOper == KOP_RELUM || ActOper == KOP_RELUMN || ActOper == KOP_HSIGMOID || ActOper == KOP_HSWISH || ActOper == KOP_LEAKYRELU || ActOper == KOP_SIGMOID || ActOper == KOP_TANH ||
-	      ActOper == KOP_ACT_NONE_IN_SCALE || ActOper == KOP_RELU_IN_SCALE || ActOper == KOP_RELUN_IN_SCALE || ActOper == KOP_RELUM_IN_SCALE || ActOper == KOP_RELUMN_IN_SCALE || ActOper == KOP_HSIGMOID_IN_SCALE || ActOper == KOP_HSWISH_IN_SCALE || ActOper == KOP_LEAKYRELU_IN_SCALE || ActOper == KOP_SIGMOID_IN_SCALE || ActOper == KOP_TANH_IN_SCALE))
-		GenTilingError("CNN_Act_SQ8 Kernel: %s, ActOper, expecting KOP_ACT_NONE, KOP_RELU, KOP_RELUN, KOP_RELUM, KOP_RELUMN, KOP_HSIGMOID, KOP_TANH, KOP_HSWISH or KOP_LEAKYRELU", Name);
+	if (!(ActOper == KOP_ACT_NONE || ActOper == KOP_RELU || ActOper == KOP_RELUN || ActOper == KOP_RELUM || ActOper == KOP_RELUMN || ActOper == KOP_HSIGMOID || ActOper == KOP_HSWISH || ActOper == KOP_LEAKYRELU || ActOper == KOP_SIGMOID || ActOper == KOP_TANH))
+		GenTilingError("CNN_Act_SQ8 Kernel: %s, ActOper, expecting KOP_ACT_NONE, KOP_RELU, KOP_RELUN, KOP_RELUM, KOP_RELUMN, KOP_HSIGMOID, KOP_TANH, KOP_HSWISH, KOP_SIGMOID or KOP_LEAKYRELU", Name);
 
 	ActKerName = CNN_FindMatchingKernel(ActOper, KOP_NONE, 0, 1, 0, 0, 0, 1, 0,0,0,0,0,0, 0,0,0,0,0,0, 0);
 	if (ActKerName==0) GenTilingError("CNN_Act_SQ8 Kernel: %s, Can't find a matching Activation basic kernel", Name);
@@ -2617,7 +2811,7 @@ Kernel_T * CNN_Act_SQ8_Internal(
 
 		PoolOper:	KOP_GLOBAL_MAXPOOL or KOP_GLOBAL_AVGPOOL
 
-		ActOper:	Optional activation function: KOP_RELU, KOP_RELUN, KOP_HSWISH, KOP_HSIGMOID, KOP_LEAKYRELU, KOP_SIGMOID
+		ActOper:	Optional activation function: KOP_RELU, KOP_RELUN, KOP_RELUM, KOP_RELUMN, KOP_HSWISH, KOP_HSIGMOID, KOP_LEAKYRELU, KOP_SIGMOID, KOP_TANH
 
 		Signature:	Name(In, Out, Infos)
 
@@ -2656,7 +2850,7 @@ static Kernel_T *CNN_GlobalPoolAct_SQ8_Interal(
 
 	if (!(PoolOper == KOP_GLOBAL_MAXPOOL || PoolOper == KOP_GLOBAL_AVGPOOL || PoolOper == KOP_GLOBAL_SUMPOOL))
 		GenTilingError("CNN_GlobalPoolAct_SQ8 Kernel: %s, PoolOper should be KOP_GLOBAL_MAXPOOL or KOP_GLOBAL_AVGPOOL or KOP_GLOBAL_SUMPOOL", Name);
-	if (!(ActOper == KOP_NONE || ActOper == KOP_RELU || ActOper == KOP_RELUN || ActOper == KOP_RELUM || ActOper == KOP_RELUMN || ActOper == KOP_HSIGMOID || ActOper == KOP_HSWISH || ActOper == KOP_LEAKYRELU))
-		GenTilingError("CNN_GlobalPoolAct_SQ8 Kernel: %s, ActOper, expecting KOP_NONE, KOP_RELU, KOP_RELUN, KOP_HSIGMOID, KOP_TANH, KOP_HSWISH or KOP_LEAKYRELU", Name);
+	if (!(ActOper == KOP_NONE || ActOper == KOP_RELU || ActOper == KOP_RELUN || ActOper == KOP_RELUM || ActOper == KOP_RELUMN || ActOper == KOP_HSIGMOID || ActOper == KOP_HSWISH || ActOper == KOP_LEAKYRELU || ActOper == KOP_SIGMOID || ActOper == KOP_TANH))
+		GenTilingError("CNN_GlobalPoolAct_SQ8 Kernel: %s, ActOper, expecting KOP_NONE, KOP_RELU, KOP_RELUN, KOP_RELUM, KOP_RELUMN, KOP_HSIGMOID, KOP_HSWISH, KOP_LEAKYRELU, KOP_SIGMOID or KOP_TANH", Name);
 
 	PoolKerName = CNN_FindMatchingKernelAttr(PoolOper, ActOper, ParFeat, KerLayout, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
@@ -2835,7 +3029,7 @@ static Kernel_T *CNN_GlobalPoolAct_SQ8_Interal(
 		OutDim:		Number of outputs
 
 		LinearOper	KOP_LINEAR
-		ActOper		Optional activation function: KOP_NONE, KOP_RELU, KOP_RELUN, KOP_HSWISH, KOP_HSIGMOID, KOP_LEAKYRELU, KOP_SIGMOID
+		ActOper		Optional activation function: KOP_NONE, KOP_RELU, KOP_RELUN, KOP_HSWISH, KOP_HSIGMOID, KOP_LEAKYRELU, KOP_SIGMOID, KOP_TANH
 
 		Signature:	Name(In, Filter, Bias, Out, Scale, ScaleN, Infos)
 
@@ -3380,24 +3574,11 @@ int CNN_MatAddPaddedAct_SQ8(
 	char *TopName = NULL, *BotName = NULL, *BodyName = NULL;
 	Ok = Ok && CNN_MatAddAct_SQ8(BodyName = AppendNames(Name, "Body"), Ctrl, FeatBody, Width, Height, AddMatOper, ActOper);
 
-	KernelOper_T PadActOper;
-	switch (ActOper) {
-		case KOP_NONE:	    PadActOper = KOP_ACT_NONE_IN_SCALE; break;
-		case KOP_RELU:	    PadActOper = KOP_RELU_IN_SCALE; break;
-		case KOP_RELUN:	    PadActOper = KOP_RELUN_IN_SCALE; break;
-		case KOP_RELUM:	    PadActOper = KOP_RELUM_IN_SCALE; break;
-		case KOP_RELUMN:    PadActOper = KOP_RELUMN_IN_SCALE; break;
-		case KOP_HSIGMOID:  PadActOper = KOP_HSIGMOID_IN_SCALE; break;
-		case KOP_HSWISH:    PadActOper = KOP_HSWISH_IN_SCALE; break;
-		case KOP_LEAKYRELU: PadActOper = KOP_LEAKYRELU_IN_SCALE; break;
-		case KOP_SIGMOID:   PadActOper = KOP_SIGMOID_IN_SCALE; break;
-		case KOP_TANH:      PadActOper = KOP_TANH_IN_SCALE; break;
-	}
 	if (PadTop) {
-		Ok = Ok && CNN_Act_SQ8(TopName = AppendNames(Name, "PadTop"), Ctrl, PadTop, Width, Height, PadActOper);
+		Ok = Ok && CNN_Act_SQ8(TopName = AppendNames(Name, "PadTop"), Ctrl, PadTop, Width, Height, ActOper);
 	}
 	if (PadBot) {
-		Ok = Ok && CNN_Act_SQ8(BotName = AppendNames(Name, "PadBot"), Ctrl, PadBot, Width, Height, PadActOper);
+		Ok = Ok && CNN_Act_SQ8(BotName = AppendNames(Name, "PadBot"), Ctrl, PadBot, Width, Height, ActOper);
 	}
 	CloseKernelGroupNoMerge();
 	if (Ok==0) return 0;
@@ -3482,7 +3663,7 @@ int CNN_MatAddPaddedAct_SQ8(
 		Height:		Height of a In1
 
 		MatOper:	KOP_MATVECTMUL
-		ActOper:	Optional activation function: KOP_NONE, KOP_RELU, KOP_RELUN, KOP_HSWISH, KOP_HSIGMOID, KOP_LEAKYRELU, KOP_SIGMOID
+		ActOper:	Optional activation function: KOP_NONE, KOP_RELU, KOP_RELUN, KOP_HSWISH, KOP_HSIGMOID, KOP_LEAKYRELU, KOP_SIGMOID, KOP_TANH
 
 		Signature:	Name(In1, In2, Out, Infos)
 
@@ -3692,6 +3873,7 @@ Kernel_T *CNN_MatMulAct_SQ8_Internal(
 	int Bias_DataSize,
 	int Scale_DataSize,
 
+	int NBatches,
 	int ColM1,
 	int LineM1,
 	int ColM2,
@@ -3754,9 +3936,9 @@ Kernel_T *CNN_MatMulAct_SQ8_Internal(
 	}
 
 	ColO = ((Width+Scx-1)/Scx) * ((Height+Scy-1)/Scy);
-	LayerOp += (int64_t) ColM1*ColO*LineM1;
-	LayerBandwidth += (int64_t) LineM1*(ColM1*ColM2*(1+1));
-	LayerBandwidth += (int64_t) LineM1*ColM2*1;
+	LayerOp += (int64_t) NBatches*ColM1*ColO*LineM1;
+	LayerBandwidth += (int64_t) NBatches*LineM1*(ColM1*ColM2*(1+1));
+	LayerBandwidth += (int64_t) NBatches*LineM1*ColM2*1;
 	LayerBandwidth += (int64_t) LineM1*Bias_DataSize;
 
 	if (Scy!=1) ConsT0 = Width*Scy; else ConsT0 = 4;
@@ -3779,6 +3961,8 @@ Kernel_T *CNN_MatMulAct_SQ8_Internal(
 	}
 	/* First try buffering small objects */
 	Kernel = UserKernel(Name,
+		(NBatches>1)?
+		KernelIterSpace(3, IterFixedSpace(D0, NBatches), IterTiledSpace(T1), IterTiledSpace(T0)):
 		KernelIterSpace(2, IterTiledSpace(T1), IterTiledSpace(T0)),
                 TILE_HOR,
                 CArgs(7,
@@ -3823,24 +4007,44 @@ Kernel_T *CNN_MatMulAct_SQ8_Internal(
 		),
 		ColFirst?
 		KerArgs(8,
-	    !Transposed?KerArg("KerBuff",KerArgSpace(1, T1), O_BUFF|O_NTILED, Nbuff*ColM1,  1,      1,             0, 0,                                                0, 0):AT_NO_KER_ARG,
-			KerArg("In1",    KerArgSpace(1, T0), O_IN|O_DB|O_CONST,     ColM1,  LineM1, 1,             0, OBJ_CONSTRAINTS_PAD_REM,                          8, "In1"),
-			KerArg("In2",    KerArgSpace(1, T1), O_IN|O_DB,             ColM2,  LineM2, 1,             0, ObjCons|OBJ_CONSTRAINTS_PAD_REM, ConsT0, "In2"),
-	        !NoBias?KerArg("Bias",   KerArgSpace(1, TA), O_BUFF|O_IN|O_CONST,       1,  SAxis,  Bias_DataSize, 0, OBJ_CONSTRAINTS_PAD_REM,                          0, "Bias"):AT_NO_KER_ARG,
-			KerArg("Out",    KerArgSpace(1, T1), O_OUT|O_DB,             ColO,  LineO,  1,             0, OBJ_CONSTRAINTS_TILE_VER|OBJ_CONSTRAINTS_PAD_REM, 0, "Out"),
-	   !ScaleScalar?KerArg("Scale",  KerArgSpace(1, TA), O_BUFF|O_IN|O_CONST,        1, SAxis,  1,             0, 0,                                                0, "Scale"):AT_NO_KER_ARG,
-	   !ScaleScalar?KerArg("ScaleN", KerArgSpace(1, TA), O_BUFF|O_IN|O_CONST,        1, SAxis,  1,             0, 0,                                                0, "ScaleN"):AT_NO_KER_ARG,
-			KerArg("Infos",  KerArgSpace(1, T1), O_IN|O_BUFF|O_NTILED,       1,     1,  AT_INF_DIM*1,  0, 0,                                                0, "Infos")
+	    		!Transposed?
+	    		KerArg("KerBuff",KerArgSpace(1,   T1), O_BUFF|O_NTILED, Nbuff*ColM1,  1,      1,             0, 0,                                                0, 0):AT_NO_KER_ARG,
+	   		(NBatches>1)?
+	   		KerArg("In1",    KerArgSpace(2,D0,T0), O_IN|O_DB|O_CONST,     ColM1,  LineM1, 1,             0, OBJ_CONSTRAINTS_PAD_REM,                          8, "In1"):
+	   	        KerArg("In1",    KerArgSpace(1,   T0), O_IN|O_DB|O_CONST,     ColM1,  LineM1, 1,             0, OBJ_CONSTRAINTS_PAD_REM,                          8, "In1"),
+	   	        (NBatches>1)?
+			KerArg("In2",    KerArgSpace(2,D0,T1), O_IN|O_DB,             ColM2,  LineM2, 1,             0, ObjCons|OBJ_CONSTRAINTS_PAD_REM,	     ConsT0, "In2"):
+			KerArg("In2",    KerArgSpace(1,   T1), O_IN|O_DB,             ColM2,  LineM2, 1,             0, ObjCons|OBJ_CONSTRAINTS_PAD_REM,	     ConsT0, "In2"),
+	        	!NoBias?
+	        	KerArg("Bias",   KerArgSpace(1,   TA), O_BUFF|O_IN|O_CONST,       1,  SAxis,  Bias_DataSize, 0, OBJ_CONSTRAINTS_PAD_REM,                          0, "Bias"):AT_NO_KER_ARG,
+	   	        (NBatches>1)?
+			KerArg("Out",    KerArgSpace(2,D0,T1), O_OUT|O_DB,             ColO,  LineO,  1,             0, OBJ_CONSTRAINTS_TILE_VER|OBJ_CONSTRAINTS_PAD_REM, 0, "Out"):
+			KerArg("Out",    KerArgSpace(1,   T1), O_OUT|O_DB,             ColO,  LineO,  1,             0, OBJ_CONSTRAINTS_TILE_VER|OBJ_CONSTRAINTS_PAD_REM, 0, "Out"),
+	   		!ScaleScalar?
+	   		KerArg("Scale",  KerArgSpace(1,   TA), O_BUFF|O_IN|O_CONST,        1, SAxis,  1,             0, 0,                                                0, "Scale"):AT_NO_KER_ARG,
+	   		!ScaleScalar?
+	   		KerArg("ScaleN", KerArgSpace(1,   TA), O_BUFF|O_IN|O_CONST,        1, SAxis,  1,             0, 0,                                                0, "ScaleN"):AT_NO_KER_ARG,
+			KerArg("Infos",  KerArgSpace(1,   T1), O_IN|O_BUFF|O_NTILED,       1,     1,  AT_INF_DIM*1,  0, 0,                                                0, "Infos")
 		):
 		KerArgs(8,
-	    !Transposed?KerArg("KerBuff",KerArgSpace(1, T0), O_BUFF|O_NTILED, Nbuff*ColM1,  1,      1,             0, 0,                                                0, 0):AT_NO_KER_ARG,
-			KerArg("In1",    KerArgSpace(1, T1), O_IN|O_DB|O_CONST,     ColM1,  LineM1, 1,             0, OBJ_CONSTRAINTS_PAD_REM,                          8, "In1"),
-			KerArg("In2",    KerArgSpace(1, T0), O_IN|O_DB,             ColM2,  LineM2, 1,             0, ObjCons|OBJ_CONSTRAINTS_PAD_REM, ConsT0, "In2"),
-	        !NoBias?KerArg("Bias",   KerArgSpace(1, TB), O_BUFF|O_IN|O_CONST,       1,  SAxis,  Bias_DataSize, 0, OBJ_CONSTRAINTS_PAD_REM,                          0, "Bias"):AT_NO_KER_ARG,
-			KerArg("Out",    KerArgSpace(1, T1), O_OUT|O_DB,             ColO,  LineO,  1,             0, OBJ_CONSTRAINTS_PAD_REM,                          0, "Out"),
-	   !ScaleScalar?KerArg("Scale",  KerArgSpace(1, TB), O_BUFF|O_IN|O_CONST,        1, SAxis,  1,             0, 0,                                                0, "Scale"):AT_NO_KER_ARG,
-	   !ScaleScalar?KerArg("ScaleN", KerArgSpace(1, TB), O_BUFF|O_IN|O_CONST,        1, SAxis,  1,             0, 0,                                                0, "ScaleN"):AT_NO_KER_ARG,
-			KerArg("Infos",  KerArgSpace(1, T0), O_IN|O_BUFF|O_NTILED,       1,     1,  AT_INF_DIM*1,  0, 0,                                                0, "Infos")
+	    		!Transposed?
+	    		KerArg("KerBuff",KerArgSpace(1,   T0), O_BUFF|O_NTILED, Nbuff*ColM1,  1,      1,             0, 0,                                                0, 0):AT_NO_KER_ARG,
+	   		(NBatches>1)?
+			KerArg("In1",    KerArgSpace(2,D0,T1), O_IN|O_DB|O_CONST,     ColM1,  LineM1, 1,             0, OBJ_CONSTRAINTS_PAD_REM,                          8, "In1"):
+			KerArg("In1",    KerArgSpace(1,   T1), O_IN|O_DB|O_CONST,     ColM1,  LineM1, 1,             0, OBJ_CONSTRAINTS_PAD_REM,                          8, "In1"),
+	   		(NBatches>1)?
+			KerArg("In2",    KerArgSpace(2,D0,T0), O_IN|O_DB,             ColM2,  LineM2, 1,             0, ObjCons|OBJ_CONSTRAINTS_PAD_REM, 		   ConsT0, "In2"):
+			KerArg("In2",    KerArgSpace(1,   T0), O_IN|O_DB,             ColM2,  LineM2, 1,             0, ObjCons|OBJ_CONSTRAINTS_PAD_REM, 		   ConsT0, "In2"),
+	        	!NoBias?
+	        	KerArg("Bias",   KerArgSpace(1,   TB), O_BUFF|O_IN|O_CONST,       1,  SAxis,  Bias_DataSize, 0, OBJ_CONSTRAINTS_PAD_REM,                          0, "Bias"):AT_NO_KER_ARG,
+	   		(NBatches>1)?
+			KerArg("Out",    KerArgSpace(2,D0,T1), O_OUT|O_DB,             ColO,  LineO,  1,             0, OBJ_CONSTRAINTS_PAD_REM,                          0, "Out"):
+			KerArg("Out",    KerArgSpace(1,   T1), O_OUT|O_DB,             ColO,  LineO,  1,             0, OBJ_CONSTRAINTS_PAD_REM,                          0, "Out"),
+	   		!ScaleScalar?
+	   		KerArg("Scale",  KerArgSpace(1,   TB), O_BUFF|O_IN|O_CONST,        1, SAxis,  1,             0, 0,                                                0, "Scale"):AT_NO_KER_ARG,
+	   		!ScaleScalar?
+	   		KerArg("ScaleN", KerArgSpace(1,   TB), O_BUFF|O_IN|O_CONST,        1, SAxis,  1,             0, 0,                                                0, "ScaleN"):AT_NO_KER_ARG,
+			KerArg("Infos",  KerArgSpace(1,   T0), O_IN|O_BUFF|O_NTILED,       1,     1,  AT_INF_DIM*1,  0, 0,                                                0, "Infos")
 		)
 	);
 	if (Kernel) {
@@ -4263,6 +4467,10 @@ int CNN_ConvolutionPoolAct_SQ8(
 	KernelOper_T ActOper
 	)
 {
+	if (Fcx==1 && Fcy==1 && Height==1 && Width==1) {	/* 1x1 filter applied to a 1x1 input: delegate to the SQ8 linear generator (strides/dilations are not tested here) */
+		printf("This is a pointwise on 1x1 input --> Mapping to CNN_LinearAct_SQ8\n");	/* Message names the generator actually called below */
+		return CNN_LinearAct_SQ8(Name, Ctrl, Bias_DataSize, Scale_DataSize, InFeat, OutFeat, KOP_LINEAR, ActOper);
+	}
 	Kernel_T *Ker = 0, *Sol1 = 0, *Sol2 = 0;
         float K = 0.9;
         Tile_Orientation_T TileOrientation = TILE_HOR;
@@ -4340,7 +4548,11 @@ int CNN_MatAddAct_SQ8(char *Name, CNN_GenControl_T *Ctrl, int Feat, int Width, i
 }
 
 int CNN_MatMulAct_SQ8(char *Name, CNN_GenControl_T *Ctrl, int Bias_DataSize, int Scale_DataSize, int ColM1, int LineM1, int ColM2, int LineM2, int Width, int Height, int Scx, int Scy, KernelOper_T MatMulOper, KernelOper_T ActOper) {
-	return (CNN_MatMulAct_SQ8_Internal(Name, Ctrl, Bias_DataSize, Scale_DataSize, ColM1, LineM1, ColM2, LineM2, Width, Height, Scx, Scy, MatMulOper, ActOper, 1)!=0);
+	return (CNN_MatMulAct_SQ8_Internal(Name, Ctrl, Bias_DataSize, Scale_DataSize, 1, ColM1, LineM1, ColM2, LineM2, Width, Height, Scx, Scy, MatMulOper, ActOper, 1)!=0);
+}
+
+int CNN_BatchedMatMulAct_SQ8(char *Name, CNN_GenControl_T *Ctrl, int Bias_DataSize, int Scale_DataSize, int NBatches, int ColM1, int LineM1, int ColM2, int LineM2, int Width, int Height, int Scx, int Scy, KernelOper_T MatMulOper, KernelOper_T ActOper) {	/* Batched counterpart of CNN_MatMulAct_SQ8: same arguments plus NBatches, forwarded to the shared internal generator */
+	return (CNN_MatMulAct_SQ8_Internal(Name, Ctrl, Bias_DataSize, Scale_DataSize, NBatches, ColM1, LineM1, ColM2, LineM2, Width, Height, Scx, Scy, MatMulOper, ActOper, 1)!=0);	/* 1 when the internal generator returned a kernel, 0 when it returned NULL */
 }
 
 int CNN_MatMulSmallM1Act_SQ8(char *Name, CNN_GenControl_T *Ctrl, int Bias_DataSize, int Scale_DataSize, int ColM1, int LineM1, int ColM2, int LineM2, int Width, int Height, int Scx, int Scy, KernelOper_T MatMulOper, KernelOper_T ActOper) {
diff --git a/tools/autotiler_v3/CNN_Generators_SQ8/CNN_Generators_SQ8.h b/tools/autotiler_v3/CNN_Generators_SQ8/CNN_Generators_SQ8.h
index ba223b55d..ab919a417 100644
--- a/tools/autotiler_v3/CNN_Generators_SQ8/CNN_Generators_SQ8.h
+++ b/tools/autotiler_v3/CNN_Generators_SQ8/CNN_Generators_SQ8.h
@@ -698,6 +698,29 @@ int CNN_MatMulAct_SQ8(
         KernelOper_T ActOper
 	);
 
+int CNN_BatchedMatMulAct_SQ8(	/* Batched variant of CNN_MatMulAct_SQ8; returns non zero when a kernel was generated */
+	char *Name,
+
+	CNN_GenControl_T *Ctrl,
+
+	int Bias_DataSize,
+	int Scale_DataSize,
+
+	int NBatches,	/* Number of batches; when >1 the generator adds a fixed outer iteration space of this size and In1, In2 and Out get it as an extra leading dimension, while Bias, Scale and ScaleN do not */
+	int ColM1,
+	int LineM1,
+	int ColM2,
+	int LineM2,
+
+	int Width,
+	int Height,
+	int Scx,
+	int Scy,
+
+	KernelOper_T MatMulOper,
+	KernelOper_T ActOper
+	);
+
 Kernel_T *CNN_MatMulAct_SQ8_Internal(
 	char *Name,
 
@@ -706,6 +729,7 @@ Kernel_T *CNN_MatMulAct_SQ8_Internal(
 	int Bias_DataSize,
 	int Scale_DataSize,
 
+	int NBatches,
 	int ColM1,
 	int LineM1,
 	int ColM2,
diff --git a/tools/autotiler_v3/CNN_Generators_fp16/CNN_Generators_fp16.c b/tools/autotiler_v3/CNN_Generators_fp16/CNN_Generators_fp16.c
index 659e2e34e..9f912d010 100644
--- a/tools/autotiler_v3/CNN_Generators_fp16/CNN_Generators_fp16.c
+++ b/tools/autotiler_v3/CNN_Generators_fp16/CNN_Generators_fp16.c
@@ -1279,7 +1279,7 @@ Kernel_T *CNN_ConvolutionPoolAct_fp16_Internal(
                 if (Ok!=0) return Ok;
                 if (Log) printf("Mapping this convolution to im2col scheme failed, reverting to standard implementation\n");
         }
-	if (Fcx==1 && Fcy==1 && Scx==1 && Scy==1 && Dcx==1 && Dcy==1 && Height==1 && Width==1) {
+	if (Fcx==1 && Fcy==1 && Height==1 && Width==1) {
 		printf("This is a pointwise on 1x1 input --> Mapping to CNN_Linear_NE16\n");
 		return CNN_LinearAct_fp16_Internal(Name, Ctrl, InFeat, OutFeat, KOP_LINEAR, ActOper);
 	}
diff --git a/tools/autotiler_v3/CNN_Libraries/SSD_BasicKernels.c b/tools/autotiler_v3/CNN_Libraries/SSD_BasicKernels.c
index cdce41f3b..b1a395965 100644
--- a/tools/autotiler_v3/CNN_Libraries/SSD_BasicKernels.c
+++ b/tools/autotiler_v3/CNN_Libraries/SSD_BasicKernels.c
@@ -15,13 +15,6 @@
 #include "CNN_BasicKernels.h"
 #include "SSD_BasicKernels.h"
 
-#ifndef __EMUL__
-    #define CL_CRITICAL_ENTER() pi_cl_team_critical_enter()
-    #define CL_CRITICAL_EXIT()  pi_cl_team_critical_exit()
-#else
-    #define CL_CRITICAL_ENTER()
-    #define CL_CRITICAL_EXIT()
-#endif
 // optimize the division to find the chunk size
 // equivalent to ceil(KerArg0->W/rt_nb_pe())
 inline static unsigned int __attribute__((always_inline)) ChunkSize(unsigned int X)
@@ -137,14 +130,14 @@ void Ker_SSD_Decoder(Ker_SSD_Decoder_ArgT  *KerArg0 )
         boxes_idx = i*num_coords;
         for (unsigned int j=1; j<num_classes; j++){
             if((scores[(i*num_classes)+j]) > score_th){
-                CL_CRITICAL_ENTER();
+                gap_cl_critical_enter();
                 bbn = KerArg0->bbox_idx[0]++;
                 // printf("Core: %d\tbbox_idx:%d\n", CoreId, KerArg0->bbox_idx[0]);
                 if(bbn > n_max_bb){ // check if we reched n_max_bb
-                    CL_CRITICAL_EXIT();
+                    gap_cl_critical_exit();
                     goto exit_double_for;
                 }
-                CL_CRITICAL_EXIT();
+                gap_cl_critical_exit();
                 // Valid BBOX --> alive
                 bbox[bbn].alive = 1;
                 //Save score always as a Q7
diff --git a/tools/autotiler_v3/CNN_Libraries_SQ8/CNN_Activation_HWC_SQ8.c b/tools/autotiler_v3/CNN_Libraries_SQ8/CNN_Activation_HWC_SQ8.c
index df3b6affc..5c5a14412 100644
--- a/tools/autotiler_v3/CNN_Libraries_SQ8/CNN_Activation_HWC_SQ8.c
+++ b/tools/autotiler_v3/CNN_Libraries_SQ8/CNN_Activation_HWC_SQ8.c
@@ -13,6 +13,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+#if 0
 
 #pragma GCC diagnostic push
 #pragma GCC diagnostic ignored "-Wextra"
@@ -1060,261 +1061,6 @@ void KerParReduct_CC_Tanh_HWC_USQ16(KerConvLinReduct_SQ8_T *Arg)
 	gap_waitbarrier(0);
 }
 
-
-void KerParReduct_CC_HWC_USQ8(KerConvLinReduct_SQ8_T *Arg)
-
-{
-	int Feat = Arg->Feat;
-	int W = Arg->W;
-	int H = Arg->H;
-	unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(H*W), First = CoreId*ChunkCell, Last  = Min(First+ChunkCell, H*W);
-	int * __restrict__ In = (int *__restrict__) Arg->In;
-	unsigned char * __restrict__ Scale = (unsigned char *__restrict__) Arg->Scale;
-	unsigned char * __restrict__ ScaleN = (unsigned char *__restrict__) Arg->ScaleN;
-	unsigned char * __restrict__ Out = (unsigned char *__restrict__) Arg->Out;
-	signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos;
-	unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN];
-	int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0];
-	int Prenorm = Infos[AT_INF_PRENORM];
-
-	for (int i=First; i<Last; i++) {
-		for (int c=0; c<Feat; c++) {
-	                Out[i*Feat + c] = gap_clipu(AT_SCALE(AT_NORM(In[i*Feat + c], Prenorm), Scale[c], ScaleN[c]), 8);
-	        }
-	}
-	gap_waitbarrier(0);
-}
-
-void KerParReduct_CC_ReLU_HWC_USQ8(KerConvLinReduct_SQ8_T *Arg)
-
-{
-	int Feat = Arg->Feat;
-	int W = Arg->W;
-	int H = Arg->H;
-	unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(H*W), First = CoreId*ChunkCell, Last  = Min(First+ChunkCell, H*W);
-	int * __restrict__ In = (int *__restrict__) Arg->In;
-	unsigned char * __restrict__ Scale = (unsigned char *__restrict__) Arg->Scale;
-	unsigned char * __restrict__ ScaleN = (unsigned char *__restrict__) Arg->ScaleN;
-	unsigned char * __restrict__ Out = (unsigned char *__restrict__) Arg->Out;
-	signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos;
-	unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN];
-	int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0];
-	int Prenorm = Infos[AT_INF_PRENORM];
-
-	for (int i=First; i<Last; i++) {
-		for (int c=0; c<Feat; c++) {
-	                int Acc0 = gap_clipu(AT_SCALE(AT_NORM(In[i*Feat + c], Prenorm), Scale[c], ScaleN[c]), 8);
-	                Out[i*Feat + c] = gap_clipu(AT_SCALE(Max(0, Acc0), ActScale, ActScaleN), 8);
-	        }
-	}
-	gap_waitbarrier(0);
-}
-
-void KerParReduct_CC_ReLUN_HWC_USQ8(KerConvLinReduct_SQ8_T *Arg)
-
-{
-	int Feat = Arg->Feat;
-	int W = Arg->W;
-	int H = Arg->H;
-	unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(H*W), First = CoreId*ChunkCell, Last  = Min(First+ChunkCell, H*W);
-	int * __restrict__ In = (int *__restrict__) Arg->In;
-	unsigned char * __restrict__ Scale = (unsigned char *__restrict__) Arg->Scale;
-	unsigned char * __restrict__ ScaleN = (unsigned char *__restrict__) Arg->ScaleN;
-	unsigned char * __restrict__ Out = (unsigned char *__restrict__) Arg->Out;
-	signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos;
-	unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN];
-	int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0];
-	int Prenorm = Infos[AT_INF_PRENORM];
-
-	for (int i=First; i<Last; i++) {
-		for (int c=0; c<Feat; c++) {
-	                int Acc0 = gap_clipu(AT_SCALE(AT_NORM(In[i*Feat + c], Prenorm), Scale[c], ScaleN[c]), 8);
-	                Out[i*Feat + c] = gap_clipu(AT_SCALE(AT_CLIP_POS(Acc0, A0), ActScale, ActScaleN), 8);
-	        }
-	}
-	gap_waitbarrier(0);
-
-}
-
-void KerParReduct_CC_ReLUM_HWC_USQ8(KerConvLinReduct_SQ8_T *Arg)
-
-{
-	int Feat = Arg->Feat;
-	int W = Arg->W;
-	int H = Arg->H;
-	unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(H*W), First = CoreId*ChunkCell, Last  = Min(First+ChunkCell, H*W);
-	int * __restrict__ In = (int *__restrict__) Arg->In;
-	unsigned char * __restrict__ Scale = (unsigned char *__restrict__) Arg->Scale;
-	unsigned char * __restrict__ ScaleN = (unsigned char *__restrict__) Arg->ScaleN;
-	unsigned char * __restrict__ Out = (unsigned char *__restrict__) Arg->Out;
-	signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos;
-	unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN];
-	int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0];
-	int Prenorm = Infos[AT_INF_PRENORM];
-
-	for (int i=First; i<Last; i++) {
-		for (int c=0; c<Feat; c++) {
-	                int Acc0 = gap_clipu(AT_SCALE(AT_NORM(In[i*Feat + c], Prenorm), Scale[c], ScaleN[c]), 8);
-			Out[i*Feat + c] = gap_clipu(AT_SCALE(Max(A0, Acc0), ActScale, ActScaleN), 8);
-		}
-	}
-	gap_waitbarrier(0);
-
-}
-
-void KerParReduct_CC_ReLUMN_HWC_USQ8(KerConvLinReduct_SQ8_T *Arg)
-
-{
-	int Feat = Arg->Feat;
-	int W = Arg->W;
-	int H = Arg->H;
-	unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(H*W), First = CoreId*ChunkCell, Last  = Min(First+ChunkCell, H*W);
-	int * __restrict__ In = (int *__restrict__) Arg->In;
-	unsigned char * __restrict__ Scale = (unsigned char *__restrict__) Arg->Scale;
-	unsigned char * __restrict__ ScaleN = (unsigned char *__restrict__) Arg->ScaleN;
-	unsigned char * __restrict__ Out = (unsigned char *__restrict__) Arg->Out;
-	signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos;
-	unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN];
-	int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0];
-	int Prenorm = Infos[AT_INF_PRENORM];
-
-	for (int i=First; i<Last; i++) {
-		for (int c=0; c<Feat; c++) {
-	                int Acc0 = gap_clipu(AT_SCALE(AT_NORM(In[i*Feat + c], Prenorm), Scale[c], ScaleN[c]), 8);
-			Out[i*Feat + c] = gap_clipu(AT_SCALE(Min(B0, Max(A0, Acc0)), ActScale, ActScaleN), 8);
-	        }
-	}
-	gap_waitbarrier(0);
-
-}
-
-// void KerParReduct_CC_HSigmoid_HWC_USQ8(KerConvLinReduct_SQ8_T *Arg)
-
-// {
-// 	int Feat = Arg->Feat;
-// 	int W = Arg->W;
-// 	int H = Arg->H;
-// 	unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(H*W), First = CoreId*ChunkCell, Last  = Min(First+ChunkCell, H*W);
-// 	int * __restrict__ In = (int *__restrict__) Arg->In;
-// 	unsigned char * __restrict__ Scale = (unsigned char *__restrict__) Arg->Scale;
-// 	unsigned char * __restrict__ ScaleN = (unsigned char *__restrict__) Arg->ScaleN;
-// 	unsigned char * __restrict__ Out = (unsigned char *__restrict__) Arg->Out;
-// 	signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos;
-// 	unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN];
-// 	int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0];
-// 	int Prenorm = Infos[AT_INF_PRENORM];
-
-// 	for (int i=First; i<Last; i++) {
-// 		for (int c=0; c<Feat; c++) {
-// 	                int Acc0 = gap_clipu(AT_SCALE(AT_NORM(In[i*Feat + c], Prenorm), Scale[c],   ScaleN[c]), 8);
-// 			Out[i*Feat + c] = gap_clipu(AT_SCALE(AT_CLIP_POS(Acc0 + B0, A0) * C0, ActScale, ActScaleN), 8);
-// 	        }
-// 	}
-// 	gap_waitbarrier(0);
-// }
-
-// void KerParReduct_CC_HSwish_HWC_USQ8(KerConvLinReduct_SQ8_T *Arg)
-
-// {
-// 	int Feat = Arg->Feat;
-// 	int W = Arg->W;
-// 	int H = Arg->H;
-// 	unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(H*W), First = CoreId*ChunkCell, Last  = Min(First+ChunkCell, H*W);
-// 	int * __restrict__ In = (int *__restrict__) Arg->In;
-// 	unsigned char * __restrict__ Scale = (unsigned char *__restrict__) Arg->Scale;
-// 	unsigned char * __restrict__ ScaleN = (unsigned char *__restrict__) Arg->ScaleN;
-// 	unsigned char * __restrict__ Out = (unsigned char *__restrict__) Arg->Out;
-// 	signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos;
-// 	unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN];
-// 	int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0];
-// 	int Prenorm = Infos[AT_INF_PRENORM];
-
-// 	for (int i=First; i<Last; i++) {
-// 		for (int c=0; c<Feat; c++) {
-// 	                int Acc0 = gap_clipu(AT_SCALE(AT_NORM(In[i*Feat + c], Prenorm), Scale[c], ScaleN[c]), 8);
-// 			Out[i*Feat + c] = gap_clipu(AT_SCALE(AT_CLIP_POS(Acc0 + B0, A0) * C0 * Acc0, ActScale, ActScaleN), 8);
-// 	        }
-// 	}
-// 	gap_waitbarrier(0);
-// }
-
-// void KerParReduct_CC_LeakyReLU_HWC_USQ8(KerConvLinReduct_SQ8_T *Arg)
-
-// {
-// 	int Feat = Arg->Feat;
-// 	int W = Arg->W;
-// 	int H = Arg->H;
-// 	unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(H*W), First = CoreId*ChunkCell, Last  = Min(First+ChunkCell, H*W);
-// 	int * __restrict__ In = (int *__restrict__) Arg->In;
-// 	unsigned char * __restrict__ Scale = (unsigned char *__restrict__) Arg->Scale;
-// 	unsigned char * __restrict__ ScaleN = (unsigned char *__restrict__) Arg->ScaleN;
-// 	unsigned char * __restrict__ Out = (unsigned char *__restrict__) Arg->Out;
-// 	signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos;
-// 	unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN];
-// 	int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0];
-// 	int Prenorm = Infos[AT_INF_PRENORM];
-
-// 	for (int i=First; i<Last; i++) {
-// 		for (int c=0; c<Feat; c++) {
-// 	                int Acc0 = gap_clipu(AT_SCALE(AT_NORM(In[i*Feat + c], Prenorm), Scale[c], ScaleN[c]), 8);
-// 			int Neg0 = gap_bitextractu(Acc0, 1, 31), Pos0 = !Neg0;
-// 			int Acc0N = AT_NORM(Acc0 * A0, 7);
-// 			Out[i*Feat + 2*c  ] = gap_clipu(AT_SCALE((Neg0*Acc0N+Pos0*Acc0), ActScale, ActScaleN), 8);
-// 	        }
-// 	}
-// 	gap_waitbarrier(0);
-// }
-
-void KerParReduct_CC_Sigmoid_HWC_USQ8(KerConvLinReduct_SQ8_T *Arg)
-
-{
-	int Feat = Arg->Feat;
-	int W = Arg->W;
-	int H = Arg->H;
-	unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(H*W), First = CoreId*ChunkCell, Last  = Min(First+ChunkCell, H*W);
-	int * __restrict__ In = (int *__restrict__) Arg->In;
-	unsigned char * __restrict__ Scale = (unsigned char *__restrict__) Arg->Scale;
-	unsigned char * __restrict__ ScaleN = (unsigned char *__restrict__) Arg->ScaleN;
-	unsigned char * __restrict__ Out = (unsigned char *__restrict__) Arg->Out;
-	signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos;
-	unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN];
-	int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0];
-	int Prenorm = Infos[AT_INF_PRENORM];
-
-	for (int i=First; i<Last; i++) {
-		for (int c=0; c<Feat; c++) {
-	                int Acc0 = gap_clip(AT_SCALE(AT_NORM(In[i*Feat + c], Prenorm), Scale[c], ScaleN[c]), 15);
-			Out[i*Feat + c] = gap_clipu(AT_SCALE(SigmoidU(Acc0) >> 8, ActScale, ActScaleN), 8);
-	        }
-	}
-	gap_waitbarrier(0);
-}
-
-void KerParReduct_CC_Tanh_HWC_USQ8(KerConvLinReduct_SQ8_T *Arg)
-
-{
-	int Feat = Arg->Feat;
-	int W = Arg->W;
-	int H = Arg->H;
-	unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(H*W), First = CoreId*ChunkCell, Last  = Min(First+ChunkCell, H*W);
-	int * __restrict__ In = (int *__restrict__) Arg->In;
-	unsigned char * __restrict__ Scale = (unsigned char *__restrict__) Arg->Scale;
-	unsigned char * __restrict__ ScaleN = (unsigned char *__restrict__) Arg->ScaleN;
-	unsigned char * __restrict__ Out = (unsigned char *__restrict__) Arg->Out;
-	signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos;
-	unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN];
-	int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0];
-	int Prenorm = Infos[AT_INF_PRENORM];
-
-	for (int i=First; i<Last; i++) {
-		for (int c=0; c<Feat; c++) {
-	                int Acc0 = gap_clip(AT_SCALE(AT_NORM(In[i*Feat + c], Prenorm), Scale[c], ScaleN[c]), 15);
-			Out[i*Feat + c] = (unsigned char) (gap_clip(AT_SCALE(Tanh(Acc0) >> 8, ActScale, ActScaleN), 7) + 128);
-	        }
-	}
-	gap_waitbarrier(0);
-}
-
 // void parray(unsigned char * Out, int Feat, int H, int W) {
 // 	for (int c=0; c<Feat; c++){
 //         for (int h=0; h<H; h++){
@@ -1652,4 +1398,5 @@ void KerParPool_MaxPoolNxMStrideSxSy__HWC_USQ8(KerPool_HWC_USQ8_T *Arg)
 	gap_waitbarrier(0);
 }
 
+#endif
 #pragma GCC diagnostic pop
diff --git a/tools/autotiler_v3/CNN_Libraries_SQ8/CNN_Activation_SQ8.c b/tools/autotiler_v3/CNN_Libraries_SQ8/CNN_Activation_SQ8.c
index a72a13ee2..ef6918d19 100644
--- a/tools/autotiler_v3/CNN_Libraries_SQ8/CNN_Activation_SQ8.c
+++ b/tools/autotiler_v3/CNN_Libraries_SQ8/CNN_Activation_SQ8.c
@@ -53,9 +53,7 @@ unsigned short int SIGMOID_LUT_uint16[256] = {
     65533, 65533, 65533, 65534, 65534, 65534, 65534, 65534, 65534, 65534, 65534,
     65534, 65534, 65535};
 
-static inline unsigned int __attribute__((always_inline)) ChunkSize(unsigned int X)
-
-{
+static inline unsigned int __attribute__((always_inline)) ChunkSize(unsigned int X) {
         unsigned int NCore;
         unsigned int Log2Core;
         unsigned int Chunk;
@@ -137,1063 +135,224 @@ int TanhTable(int x, unsigned short * table){
 #endif
 }
 
-
-/*
- * Standalone activation
-*/
-static void Ker_Activation_SQ8(
-        signed char * __restrict__ In,
-        signed char * __restrict__ Out,
-	unsigned int N,
-        CNN_ActivationOper_T Activation,
-	unsigned int ActScale, unsigned int ActScaleN, int A0, int B0, int C0
-        )
-
-{
-        for (unsigned int i=0; i<N/2; i++) {
-                int Acc0 = In[2*i], Acc1 = In[2*i+1];
-		switch (Activation) {
-			case ACT_NONE:     Acc0 = AT_SCALE(Acc0, ActScale, ActScaleN); Acc1 = AT_SCALE(Acc1, ActScale, ActScaleN); break;
-			case ACT_RELU:     Acc0 = AT_SCALE(Max(0, Acc0), ActScale, ActScaleN); Acc1 = AT_SCALE(Max(0, Acc1), ActScale, ActScaleN); break;
-			case ACT_RELUM:    Acc0 = AT_SCALE(Max(A0, Acc0), ActScale, ActScaleN); Acc1 = AT_SCALE(Max(A0, Acc1), ActScale, ActScaleN); break;
-			case ACT_RELUMN:   Acc0 = AT_SCALE(Min(B0, Max(A0, Acc0)), ActScale, ActScaleN); Acc1 = AT_SCALE(Min(B0, Max(A0, Acc1)), ActScale, ActScaleN); break;
-			case ACT_RELUN:    Acc0 = AT_SCALE(AT_CLIP_POS(Acc0, A0), ActScale, ActScaleN); Acc1 = AT_SCALE(AT_CLIP_POS(Acc1, A0), ActScale, ActScaleN); break;
-			case ACT_HSIGMOID: Acc0 = AT_SCALE(AT_CLIP_POS(Acc0 + B0, A0) * C0, ActScale, ActScaleN); Acc1 = AT_SCALE(AT_CLIP_POS(Acc1 + B0, A0) * C0, ActScale, ActScaleN); break;
-			case ACT_HSWISH:   Acc0 = AT_SCALE(AT_CLIP_POS(Acc0 + B0, A0) * C0 * Acc0, ActScale, ActScaleN); Acc1 = AT_SCALE(AT_CLIP_POS(Acc1 + B0, A0) * C0 * Acc1, ActScale, ActScaleN); break;
-			case ACT_LEAKYRELU:
-				{
-					int Neg0 = gap_bitextractu(Acc0, 1, 31), Pos0 = !Neg0;
-					int Acc0N = AT_NORM(Acc0 * A0, 7);
-					Acc0 = AT_SCALE((Neg0*Acc0N+Pos0*Acc0), ActScale, ActScaleN);
-					int Neg1 = gap_bitextractu(Acc1, 1, 31), Pos1 = !Neg1;
-					int Acc1N = AT_NORM(Acc1 * A0, 7);
-					Acc1 = AT_SCALE((Neg1*Acc1N+Pos1*Acc1), ActScale, ActScaleN);
-				//	Acc0 = AT_SCALE(((Acc0<0) ? AT_NORM((Acc0 * A0), 7):Acc0), ActScale, ActScaleN);
-				//	Acc1 = AT_SCALE(((Acc1<0) ? AT_NORM((Acc1 * A0), 7):Acc1), ActScale, ActScaleN);
-				}
-				break;
-			case ACT_SIGMOID:
-				{
-					// Assumes input (Acc) in Sq[-8:8] = 16 / 256 = 2**(-4)
-					// y = Sigmoid(x) expects x in Q12 --> Sin/Sq12 = 2**(-4) / 2**(-12) = 2**(8) --> << 8
-					// y in Q15 is then shifted to fit int8 Q7 data --> >> 8 and scaled to the output scale with ActScale
-					int Acc0N = Acc0 << 8;
-					Acc0 = AT_SCALE((Sigmoid(Acc0N) >> 8), ActScale, ActScaleN);
-					int Acc1N = Acc1 << 8;
-					Acc1 = AT_SCALE((Sigmoid(Acc1N) >> 8), ActScale, ActScaleN);
-				}
-				break;
-			case ACT_TANH:
-				{
-					// Assumes input (Acc) in Sq[-8:8] = 16 / 256 = 2**(-4)
-					// y = Sigmoid(x) expects x in Q12 --> Sin/Sq12 = 2**(-4) / 2**(-12) = 2**(8) --> << 8
-					// y in Q15 is then shifted to fit int8 Q7 data --> >> 8 and scaled to the output scale with ActScale
-					int Acc0N = Acc0 << 8;
-					Acc0 = AT_SCALE((Tanh(Acc0N) >> 8), ActScale, ActScaleN);
-					int Acc1N = Acc1 << 8;
-					Acc1 = AT_SCALE((Tanh(Acc1N) >> 8), ActScale, ActScaleN);
-				}
-				break;
-		}
-                Out[2*i] = gap_clip(Acc0, 7), Out[2*i+1] = gap_clip(Acc1, 7);
-        }
-	if (N&0x1) {
-        	unsigned int i=N-1;
-                int Acc0 = In[i];
-		switch (Activation) {
-			case ACT_NONE:     Acc0 = AT_SCALE(Acc0, ActScale, ActScaleN); break;
-			case ACT_RELU:     Acc0 = AT_SCALE(Max(0, Acc0), ActScale, ActScaleN); break;
-			case ACT_RELUM:    Acc0 = AT_SCALE(Max(A0, Acc0), ActScale, ActScaleN); break;
-			case ACT_RELUMN:   Acc0 = AT_SCALE(Min(B0, Max(A0, Acc0)), ActScale, ActScaleN); break;
-			case ACT_RELUN:    Acc0 = AT_SCALE(AT_CLIP_POS(Acc0, A0), ActScale, ActScaleN); break;
-			case ACT_HSIGMOID: Acc0 = AT_SCALE(AT_CLIP_POS(Acc0 + B0, A0) * C0, ActScale, ActScaleN); break;
-			case ACT_HSWISH:   Acc0 = AT_SCALE(AT_CLIP_POS(Acc0 + B0, A0) * C0 * Acc0, ActScale, ActScaleN); break;
-			case ACT_LEAKYRELU:
-				{
-					int Neg0 = gap_bitextractu(Acc0, 1, 31), Pos0 = !Neg0;
-					int Acc0N = AT_NORM(Acc0 * A0, 7);
-					Acc0 = AT_SCALE((Neg0*Acc0N+Pos0*Acc0), ActScale, ActScaleN);
-				//	Acc0 = AT_SCALE(((Acc0<0) ? AT_NORM((Acc0 * A0), 7):Acc0), ActScale, ActScaleN);
-				}
-				break;
-			case ACT_SIGMOID:
-				{
-					int Acc0N = Acc0 << 8;
-					Acc0 = AT_SCALE((Sigmoid(Acc0N) >> 8), ActScale, ActScaleN);
-				}
-				break;
-			case ACT_TANH:
-				{
-					int Acc0N = Acc0 << 8;
-					Acc0 = AT_SCALE((Tanh(Acc0N) >> 8), ActScale, ActScaleN);
-				}
-				break;
-		}
-                Out[i] = gap_clip(Acc0, 7);
-	}
-}
-
-/*
- * Standalone activation variant with Scale = 1.0
-*/
-static void Ker_ActivationScale1_SQ8(
-        signed char * __restrict__ In,
-        signed char * __restrict__ Out,
-	unsigned int N,
-        CNN_ActivationOper_T Activation,
-	int A0,
-        int B0
-        )
-
-{
-        for (unsigned int i=0; i<N/2; i++) {
-                int Acc0 = In[2*i], Acc1 = In[2*i+1];
-		switch (Activation) {
-			case ACT_RELU: Acc0 = Max(0, Acc0); Acc1 = Max(0, Acc1); break;
-			case ACT_RELUN: Acc0 = AT_CLIP_POS(Acc0, A0); Acc1 = AT_CLIP_POS(Acc1, A0); break;
-			case ACT_RELUM: Acc0 = Max(A0, Acc0); Acc1 = Max(A0, Acc1); break;
-			case ACT_RELUMN: Acc0 = Min(B0, Max(A0, Acc0)); Acc1 = Min(B0, Max(A0, Acc1)); break;
-		}
-                Out[2*i] = Acc0; Out[2*i+1] = Acc1;
-        }
-	if (N&0x1) {
-        	unsigned int i=N-1;
-                int Acc0 = In[i];
-		switch (Activation) {
-			case ACT_RELU: Acc0 = Max(0, Acc0); break;
-			case ACT_RELUN: Acc0 = AT_CLIP_POS(Acc0, A0); break;
-			case ACT_RELUM: Acc0 = Max(A0, Acc0); break;
-			case ACT_RELUMN: Acc0 = Min(B0, Max(A0, Acc0)); break;
-		}
-                Out[i] = Acc0;
-	}
-}
-
-static void Ker_Activation_ScaleIn_SQ8(
-        signed char * __restrict__ In,
-        signed char * __restrict__ Out,
-        unsigned int Scale,
-        unsigned int ScaleN,
-	unsigned int N,
-        CNN_ActivationOper_T Activation,
-	unsigned int ActScale, unsigned int ActScaleN, int A0, int B0, int C0
-        )
-
-{
-        for (unsigned int i=0; i<N/2; i++) {
-        	int Acc0 = gap_clip(AT_SCALE(In[2*i], Scale, ScaleN), 7);
-        	int Acc1 = gap_clip(AT_SCALE(In[2*i+1], Scale, ScaleN), 7);
-		switch (Activation) {
-			case ACT_NONE:     Acc0 = AT_SCALE(Acc0, ActScale, ActScaleN); Acc1 = AT_SCALE(Acc1, ActScale, ActScaleN); break;
-			case ACT_RELU:     Acc0 = AT_SCALE(Max(0, Acc0), ActScale, ActScaleN); Acc1 = AT_SCALE(Max(0, Acc1), ActScale, ActScaleN); break;
-			case ACT_RELUN:    Acc0 = AT_SCALE(AT_CLIP_POS(Acc0, A0), ActScale, ActScaleN); Acc1 = AT_SCALE(AT_CLIP_POS(Acc1, A0), ActScale, ActScaleN); break;
-			case ACT_RELUM:    Acc0 = AT_SCALE(Max(A0, Acc0), ActScale, ActScaleN); Acc1 = AT_SCALE(Max(A0, Acc1), ActScale, ActScaleN); break;
-			case ACT_RELUMN:   Acc0 = AT_SCALE(Min(B0, Max(A0, Acc0)), ActScale, ActScaleN); Acc1 = AT_SCALE(Min(B0, Max(A0, Acc1)), ActScale, ActScaleN); break;
-			case ACT_HSIGMOID: Acc0 = AT_SCALE(AT_CLIP_POS(Acc0 + B0, A0) * C0, ActScale, ActScaleN); Acc1 = AT_SCALE(AT_CLIP_POS(Acc1 + B0, A0) * C0, ActScale, ActScaleN); break;
-			case ACT_HSWISH:   Acc0 = AT_SCALE(AT_CLIP_POS(Acc0 + B0, A0) * C0 * Acc0, ActScale, ActScaleN); Acc1 = AT_SCALE(AT_CLIP_POS(Acc1 + B0, A0) * C0 * Acc1, ActScale, ActScaleN); break;
-			case ACT_LEAKYRELU:
-				{
-					int Neg0 = gap_bitextractu(Acc0, 1, 31), Pos0 = !Neg0;
-					int Acc0N = AT_NORM(Acc0 * A0, 7);
-					Acc0 = AT_SCALE((Neg0*Acc0N+Pos0*Acc0), ActScale, ActScaleN);
-					int Neg1 = gap_bitextractu(Acc1, 1, 31), Pos1 = !Neg1;
-					int Acc1N = AT_NORM(Acc1 * A0, 7);
-					Acc1 = AT_SCALE((Neg1*Acc1N+Pos1*Acc1), ActScale, ActScaleN);
-				//	Acc0 = AT_SCALE(((Acc0<0) ? AT_NORM((Acc0 * A0), 7):Acc0), ActScale, ActScaleN);
-				//	Acc1 = AT_SCALE(((Acc1<0) ? AT_NORM((Acc1 * A0), 7):Acc1), ActScale, ActScaleN);
-				}
-				break;
-			case ACT_SIGMOID:
-				{
-					// Assumes input (Acc) in Sq[-8:8] = 16 / 256 = 2**(-4)
-					// y = Sigmoid(x) expects x in Q12 --> Sin/Sq12 = 2**(-4) / 2**(-12) = 2**(8) --> << 8
-					// y in Q15 is then shifted to fit int8 Q7 data --> >> 8 and scaled to the output scale with ActScale
-					int Acc0N = Acc0 << 8;
-					Acc0 = AT_SCALE((Sigmoid(Acc0N) >> 8), ActScale, ActScaleN);
-					int Acc1N = Acc1 << 8;
-					Acc1 = AT_SCALE((Sigmoid(Acc1N) >> 8), ActScale, ActScaleN);
-				}
-				break;
-			case ACT_TANH:
-				{
-					// Assumes input (Acc) in Sq[-8:8] = 16 / 256 = 2**(-4)
-					// y = Sigmoid(x) expects x in Q12 --> Sin/Sq12 = 2**(-4) / 2**(-12) = 2**(8) --> << 8
-					// y in Q15 is then shifted to fit int8 Q7 data --> >> 8 and scaled to the output scale with ActScale
-					int Acc0N = Acc0 << 8;
-					Acc0 = AT_SCALE((Tanh(Acc0N) >> 8), ActScale, ActScaleN);
-					int Acc1N = Acc1 << 8;
-					Acc1 = AT_SCALE((Tanh(Acc1N) >> 8), ActScale, ActScaleN);
-				}
-				break;
-		}
-                Out[2*i] = gap_clip(Acc0, 7), Out[2*i+1] = gap_clip(Acc1, 7);
-        }
-	if (N&0x1) {
-        	unsigned int i=N-1;
-        	int Acc0 = gap_clip(AT_SCALE(In[i], Scale, ScaleN), 7);
-		switch (Activation) {
-			case ACT_NONE:     Acc0 = AT_SCALE(Acc0, ActScale, ActScaleN); break;
-			case ACT_RELU:     Acc0 = AT_SCALE(Max(0, Acc0), ActScale, ActScaleN); break;
-			case ACT_RELUN:    Acc0 = AT_SCALE(AT_CLIP_POS(Acc0, A0), ActScale, ActScaleN); break;
-			case ACT_RELUM:    Acc0 = AT_SCALE(Max(A0, Acc0), ActScale, ActScaleN); break;
-			case ACT_RELUMN:   Acc0 = AT_SCALE(Min(B0, Max(A0, Acc0)), ActScale, ActScaleN); break;
-			case ACT_HSIGMOID: Acc0 = AT_SCALE(AT_CLIP_POS(Acc0 + B0, A0) * C0, ActScale, ActScaleN); break;
-			case ACT_HSWISH:   Acc0 = AT_SCALE(AT_CLIP_POS(Acc0 + B0, A0) * C0 * Acc0, ActScale, ActScaleN); break;
-			case ACT_LEAKYRELU:
-				{
-					int Neg0 = gap_bitextractu(Acc0, 1, 31), Pos0 = !Neg0;
-					int Acc0N = AT_NORM(Acc0 * A0, 7);
-					Acc0 = AT_SCALE((Neg0*Acc0N+Pos0*Acc0), ActScale, ActScaleN);
-				//	Acc0 = AT_SCALE(((Acc0<0) ? AT_NORM((Acc0 * A0), 7):Acc0), ActScale, ActScaleN);
-				}
-				break;
-			case ACT_SIGMOID:
-				{
-					int Acc0N = Acc0 << 8;
-					Acc0 = AT_SCALE((Sigmoid(Acc0N) >> 8), ActScale, ActScaleN);
-				}
-				break;
-			case ACT_TANH:
-				{
-					int Acc0N = Acc0 << 8;
-					Acc0 = AT_SCALE((Tanh(Acc0N) >> 8), ActScale, ActScaleN);
-				}
-				break;
-		}
-                Out[i] = gap_clip(Acc0, 7);
-	}
-}
-
-/*
- * Standalone activation variant with Scale = 1.0
-*/
-static void Ker_ActivationScale1_ScaleIn_SQ8(
-        signed char * __restrict__ In,
-        signed char * __restrict__ Out,
-        unsigned int Scale,
-        unsigned int ScaleN,
-	unsigned int N,
-        CNN_ActivationOper_T Activation,
-	int A0,
-        int B0
-        )
-
-{
-        for (unsigned int i=0; i<N/2; i++) {
-        	int Acc0 = gap_clip(AT_SCALE(In[2*i], Scale, ScaleN), 7);
-        	int Acc1 = gap_clip(AT_SCALE(In[2*i+1], Scale, ScaleN), 7);
-		switch (Activation) {
-			case ACT_RELU: Acc0 = Max(0, Acc0); Acc1 = Max(0, Acc1); break;
-			case ACT_RELUN: Acc0 = AT_CLIP_POS(Acc0, A0); Acc1 = AT_CLIP_POS(Acc1, A0); break;
-			case ACT_RELUM: Acc0 = Max(A0, Acc0); Acc1 = Max(A0, Acc1); break;
-			case ACT_RELUMN: Acc0 = Min(B0, Max(A0, Acc0)); Acc1 = Min(B0, Max(A0, Acc1)); break;
-		}
-                Out[2*i] = Acc0; Out[2*i+1] = Acc1;
-        }
-	if (N&0x1) {
-        	unsigned int i=N-1;
-        	int Acc0 = gap_clip(AT_SCALE(In[i], Scale, ScaleN), 7);
-		switch (Activation) {
-			case ACT_RELU: Acc0 = Max(0, Acc0); break;
-			case ACT_RELUN: Acc0 = AT_CLIP_POS(Acc0, A0); break;
-			case ACT_RELUM: Acc0 = Max(A0, Acc0); break;
-			case ACT_RELUMN: Acc0 = Min(B0, Max(A0, Acc0)); break;
-		}
-                Out[i] = Acc0;
-	}
-}
-
-/*
- * Conv/Linear DP scaling followed by an optional activation, Out buffer is different from In Buffer
-*/
-static void KerReduct_Activation_SQ8(
-        int * __restrict__ In,
-        signed char * __restrict__ Out,
-	unsigned int N,
-	unsigned int Scale,
-	unsigned int ScaleN,
-        CNN_ActivationOper_T Activation,
-	unsigned int ActScale, unsigned int ActScaleN, int A0, int B0, int C0
-        )
-
-{
-        for (unsigned int i=0; i<N; i++) {
-                int Acc0 = gap_clip(AT_SCALE(In[i], Scale, ScaleN), 7);
-		switch (Activation) {
-			case ACT_NONE:
-				break;
-			case ACT_RELU:
-				Acc0 = AT_SCALE(Max(0, Acc0), ActScale, ActScaleN);
-				break;
-			case ACT_RELUN:
-				Acc0 = AT_SCALE(AT_CLIP_POS(Acc0, A0), ActScale, ActScaleN);
-				break;
-			case ACT_HSIGMOID:
-				Acc0 = AT_SCALE(AT_CLIP_POS(Acc0 + B0, A0) * C0, ActScale, ActScaleN);
-				break;
-			case ACT_HSWISH:
-				Acc0 = AT_SCALE(AT_CLIP_POS(Acc0 + B0, A0) * C0 * Acc0, ActScale, ActScaleN);
-				break;
-			case ACT_LEAKYRELU:
-				{
-					int Neg0 = gap_bitextractu(Acc0, 1, 31), Pos0 = !Neg0;
-					int Acc0N = AT_NORM(Acc0 * A0, 7);
-					Acc0 = AT_SCALE((Neg0*Acc0N+Pos0*Acc0), ActScale, ActScaleN);
-				//	Acc0 = AT_SCALE(((Acc0<0) ? AT_NORM(Acc0 * A0, 7):Acc0), ActScale, ActScaleN);
-				}
-				break;
-			case ACT_SIGMOID:
-				{
-					int Acc0N = Acc0 << 8;
-					Acc0 = AT_SCALE((Sigmoid(Acc0N) >> 8), ActScale, ActScaleN);
-				}
-				break;
-			case ACT_TANH:
-				{
-					int Acc0N = Acc0 << 8;
-					Acc0 = AT_SCALE((Tanh(Acc0N) >> 8), ActScale, ActScaleN);
-				}
-				break;
-		}
-                Out[i] = gap_clip(Acc0, 7);
-        }
-}
-
-/*
- * Conv/Linear DP scaling followed by an optional activation, Out buffer is different from In Buffer
-*/
-/*static void KerReduct_Activation_SQ8(
-        int * __restrict__ In,
-        signed char * __restrict__ Out,
-	unsigned int N,
-	unsigned int Scale,
-	unsigned int ScaleN,
-        CNN_ActivationOper_T Activation,
-	unsigned int ActScale, unsigned int ActScaleN, int A0, int B0, int C0
-        )
-
-{
-        for (unsigned int i=0; i<N; i++) {
-                int Acc0 = gap_clip(AT_SCALE(In[i], Scale, ScaleN), 7);
-		switch (Activation) {
-			case ACT_NONE:
-				break;
-			case ACT_RELU:
-				Acc0 = AT_SCALE(Max(0, Acc0), ActScale, ActScaleN);
-				break;
-			case ACT_RELUN:
-				Acc0 = AT_SCALE(AT_CLIP_POS(Acc0, A0), ActScale, ActScaleN);
-				break;
-			case ACT_HSIGMOID:
-				Acc0 = AT_SCALE(AT_CLIP_POS(Acc0 + B0, A0) * C0, ActScale, ActScaleN);
-				break;
-			case ACT_HSWISH:
-				Acc0 = AT_SCALE(AT_CLIP_POS(Acc0 + B0, A0) * C0 * Acc0, ActScale, ActScaleN);
-				break;
-			case ACT_LEAKYRELU:
-				{
-					int Neg0 = gap_bitextractu(Acc0, 1, 31), Pos0 = !Neg0;
-					int Acc0N = AT_NORM(Acc0 * A0, 7);
-					Acc0 = AT_SCALE((Neg0*Acc0N+Pos0*Acc0), ActScale, ActScaleN);
-				//	Acc0 = AT_SCALE(((Acc0<0) ? AT_NORM(Acc0 * A0, 7):Acc0), ActScale, ActScaleN);
-				}
-				break;
-		}
-                Out[i] = gap_clip(Acc0, 7);
-        }
-}*/
-
-static void KerReduct_Activation_HWC_SQ8(
-        int * __restrict__ In,
-        signed char * __restrict__ Out,
-	unsigned int N,
-	unsigned int Feat,
-	unsigned int Scale,
-	unsigned int ScaleN,
-        CNN_ActivationOper_T Activation,
-	unsigned int ActScale, unsigned int ActScaleN, int A0, int B0, int C0
-        )
-
-{
-        for (unsigned int i=0; i<N; i++) {
-                int Acc0 = gap_clip(AT_SCALE(In[i], Scale, ScaleN), 7);
-		switch (Activation) {
-			case ACT_NONE:
-				break;
-			case ACT_RELU:
-				Acc0 = AT_SCALE(Max(0, Acc0), ActScale, ActScaleN);
-				break;
-			case ACT_RELUN:
-				Acc0 = AT_SCALE(AT_CLIP_POS(Acc0, A0), ActScale, ActScaleN);
-				break;
-			case ACT_RELUM:
-				Acc0 = AT_SCALE(Max(A0, Acc0), ActScale, ActScaleN);
-				break;
-			case ACT_RELUMN:
-				Acc0 = AT_SCALE(Min(B0, Max(Acc0, A0)), ActScale, ActScaleN);
-				break;
-			case ACT_HSIGMOID:
-				Acc0 = AT_SCALE(AT_CLIP_POS(Acc0 + B0, A0) * C0, ActScale, ActScaleN);
-				break;
-			case ACT_HSWISH:
-				Acc0 = AT_SCALE(AT_CLIP_POS(Acc0 + B0, A0) * C0 * Acc0, ActScale, ActScaleN);
-				break;
-			case ACT_LEAKYRELU:
-				{
-					int Neg0 = gap_bitextractu(Acc0, 1, 31), Pos0 = !Neg0;
-					int Acc0N = AT_NORM(Acc0 * A0, 7);
-					Acc0 = AT_SCALE((Neg0*Acc0N+Pos0*Acc0), ActScale, ActScaleN);
-				//	Acc0 = AT_SCALE(((Acc0<0) ? AT_NORM(Acc0 * A0, 7):Acc0), ActScale, ActScaleN);
-				}
-				break;
-			case ACT_SIGMOID:
-				{
-					int Acc0N = Acc0 << 8;
-					Acc0 = AT_SCALE((Sigmoid(Acc0N) >> 8), ActScale, ActScaleN);
-				}
-				break;
-			case ACT_TANH:
-				{
-					int Acc0N = Acc0 << 8;
-					Acc0 = AT_SCALE((Tanh(Acc0N) >> 8), ActScale, ActScaleN);
-				}
-				break;
-		}
-                Out[i*Feat] = gap_clip(Acc0, 7);
-        }
-}
-
-/*
- * Conv/Linear DP scaling followed by an optional activation, variant for ScaleAct=1.0, Out buffer is different from In Buffer
-*/
-static void KerReduct_ActivationScale1_SQ8(
-        int * __restrict__ In,
-        signed char * __restrict__ Out,
-	unsigned int N,
-	unsigned int Scale,
-	unsigned int ScaleN,
-        CNN_ActivationOper_T Activation,
-	int A0, int B0, int C0
-        )
-
-{
-        for (unsigned int i=0; i<N; i++) {
-                int Acc0 = gap_clip(AT_SCALE(Scale, In[i], ScaleN), 7);
-		switch (Activation) {
-			case ACT_NONE:
-				break;
-			case ACT_RELU:
-				Acc0 = Max(0, Acc0);
-				break;
-			case ACT_RELUN:
-				Acc0 = AT_CLIP_POS(Acc0, A0);
-				break;
-			case ACT_RELUM:
-				Acc0 = Max(Acc0, A0);
-				break;
-			case ACT_RELUMN:
-				Acc0 = Min(B0, Max(Acc0, A0));
-				break;
-		}
-                Out[i] = Acc0;
-        }
-}
-
-static void KerReduct_ActivationScale1_HWC_SQ8(
-        int * __restrict__ In,
-        signed char * __restrict__ Out,
-	unsigned int N,
-	unsigned int Feat,
-	unsigned int Scale,
-	unsigned int ScaleN,
-        CNN_ActivationOper_T Activation,
-	int A0, int B0, int C0
-        )
-
-{
-        for (unsigned int i=0; i<N/4; i++) {
-                int Acc0 = gap_clip(AT_SCALE(Scale, In[4*i+0], ScaleN), 7);
-                int Acc1 = gap_clip(AT_SCALE(Scale, In[4*i+1], ScaleN), 7);
-                int Acc2 = gap_clip(AT_SCALE(Scale, In[4*i+2], ScaleN), 7);
-                int Acc3 = gap_clip(AT_SCALE(Scale, In[4*i+3], ScaleN), 7);
-		switch (Activation) {
-			case ACT_NONE:
-				break;
-			case ACT_RELU:
-				Acc0 = Max(0, Acc0);
-				Acc1 = Max(0, Acc1);
-				Acc2 = Max(0, Acc2);
-				Acc3 = Max(0, Acc3);
-				break;
-			case ACT_RELUN:
-				Acc0 = AT_CLIP_POS(Acc0, A0);
-				Acc1 = AT_CLIP_POS(Acc1, A0);
-				Acc2 = AT_CLIP_POS(Acc2, A0);
-				Acc3 = AT_CLIP_POS(Acc3, A0);
-				break;
-			case ACT_RELUM:
-				Acc0 = Max(Acc0, A0);
-				Acc1 = Max(Acc1, A0);
-				Acc2 = Max(Acc2, A0);
-				Acc3 = Max(Acc3, A0);
-				break;
-			case ACT_RELUMN:
-				Acc0 = Min(B0, Max(Acc0, A0));
-				Acc1 = Min(B0, Max(Acc1, A0));
-				Acc2 = Min(B0, Max(Acc2, A0));
-				Acc3 = Min(B0, Max(Acc3, A0));
-				break;
-		}
-                Out[Feat*(4*i+0)] = Acc0;
-                Out[Feat*(4*i+1)] = Acc1;
-                Out[Feat*(4*i+2)] = Acc2;
-                Out[Feat*(4*i+3)] = Acc3;
-        }
-        for (unsigned int i=4*(N/4); i<N; i++) {
-                int Acc0 = gap_clip(AT_SCALE(Scale, In[i], ScaleN), 7);
-		switch (Activation) {
-			case ACT_NONE:
-				break;
-			case ACT_RELU:
-				Acc0 = Max(0, Acc0);
-				break;
-			case ACT_RELUN:
-				Acc0 = AT_CLIP_POS(Acc0, A0);
-				break;
-			case ACT_RELUM:
-				Acc0 = Max(Acc0, A0);
-				break;
-			case ACT_RELUMN:
-				Acc0 = Min(B0, Max(Acc0, A0));
-				break;
-		}
-                Out[Feat*i] = Acc0;
-	}
-}
-
-/*
- * Conv/Linear DP scaling followed by an optional activation, In place version
- * Input is 32b int output is 8b
-*/
-static void KerReductIO_Activation_SQ8(
-        signed char *__restrict__ Out,
-        int *__restrict__ In,
-	unsigned int N,
-	unsigned int Scale,
-	unsigned int ScaleN,
-        CNN_ActivationOper_T Activation,
-	unsigned int ActScale, unsigned int ActScaleN, int A0, int B0, int C0
-        )
-
-{
-        for (unsigned int i=0; i<N; i++) {
-                int Acc0 = gap_clip(AT_SCALE(In[i], Scale, ScaleN), 7);
-		switch (Activation) {
-			case ACT_NONE:
-				break;
-			case ACT_RELU:
-				Acc0 = AT_SCALE(Max(0, Acc0), ActScale, ActScaleN);
-				break;
-			case ACT_RELUN:
-				Acc0 = AT_SCALE(AT_CLIP_POS(Acc0, A0), ActScale, ActScaleN);
-				break;
-			case ACT_RELUM:
-				Acc0 = AT_SCALE(Max(A0, Acc0), ActScale, ActScaleN);
-				break;
-			case ACT_RELUMN:
-				Acc0 = AT_SCALE(Min(B0, Max(A0, Acc0)), ActScale, ActScaleN);
-				break;
-			case ACT_HSIGMOID:
-				Acc0 = AT_SCALE(AT_CLIP_POS(Acc0 + B0, A0) * C0, ActScale, ActScaleN);
-				break;
-			case ACT_HSWISH:
-				Acc0 = AT_SCALE(AT_CLIP_POS(Acc0 + B0, A0) * C0 * Acc0, ActScale, ActScaleN);
-				break;
-			case ACT_LEAKYRELU:
-				{
-					int Neg0 = gap_bitextractu(Acc0, 1, 31), Pos0 = !Neg0;
-					int Acc0N = AT_NORM(Acc0 * A0, 7);
-					Acc0 = AT_SCALE((Neg0*Acc0N+Pos0*Acc0), ActScale, ActScaleN);
-				//	Acc0 = AT_SCALE(((Acc0<0) ? AT_NORM(Acc0 * A0, 7):Acc0), ActScale, ActScaleN);
-				}
-				break;
-			case ACT_SIGMOID:
-				{
-					// Assumes input (Acc) in Sq[-8:8] = 16 / 256 = 2**(-4)
-					// y = Sigmoid(x) expects x in Q12 --> Sin/Sq12 = 2**(-4) / 2**(-12) = 2**(8) --> << 8
-					// y in Q15 is then shifted to fit int8 Q7 data --> >> 8 and scaled to the output scale with ActScale
-					int Acc0N = Acc0 << 8;
-					Acc0 = AT_SCALE((Sigmoid(Acc0N) >> 8), ActScale, ActScaleN);
-				}
-				break;
-			case ACT_TANH:
-				{
-					int Acc0N = Acc0 << 8;
-					Acc0 = AT_SCALE((Tanh(Acc0N) >> 8), ActScale, ActScaleN);
-				}
-				break;
-		}
-                Out[i] = gap_clip(Acc0, 7);
-        }
-}
-
-/*
- * Conv/Linear DP scaling followed by an optional activation, variant for ActScale=1.0, In place version
- * Input is 32b int output is 8b
-*/
-static void KerReductIO_ActivationScale1_SQ8(
-        signed char *__restrict__ Out,
-        int *__restrict__ In,
-	unsigned int N,
-	unsigned int Scale,
-	unsigned int ScaleN,
-        CNN_ActivationOper_T Activation,
-	int A0, int B0, int C0
-        )
-
-{
-        for (unsigned int i=0; i<N; i++) {
-                int Acc0 = gap_clip(AT_SCALE(In[i], Scale, ScaleN), 7);
-		switch (Activation) {
-			case ACT_NONE:
-				break;
-			case ACT_RELU:
-				Acc0 = Max(0, Acc0);
-				break;
-			case ACT_RELUN:
-				Acc0 = AT_CLIP_POS(Acc0, A0);
-				break;
-			case ACT_RELUM:
-				Acc0 = Max(B0, Acc0);
-				break;
-			case ACT_RELUMN:
-				Acc0 = Min(B0, Max(B0, Acc0));
-				break;
-		}
-                Out[i] = Acc0;
-        }
-}
-
-/*
- * Conv/Linear DP scaling followed by an optional activation, Out buffer is different from In Buffer
- * Partial unroll to avoid load use penalty
-*/
-static void _KerReduct_Activation_SQ8(
-        int * __restrict__ In,
-        signed char * __restrict__ Out,
-	unsigned int N,
-	unsigned int Scale,
-	unsigned int ScaleN,
-        CNN_ActivationOper_T Activation,
-	unsigned int ActScale, unsigned int ActScaleN, int A0, int B0, int C0
-        )
-
-{
-        for (unsigned int i=0; i<(N/2); i++) {
-                int Acc0 = gap_clip(AT_SCALE(In[2*i+0], Scale, ScaleN), 7);
-                int Acc1 = gap_clip(AT_SCALE(In[2*i+1], Scale, ScaleN), 7);
-		
-		switch (Activation) {
-			case ACT_NONE:
-				break;
-			case ACT_RELU:
-				Acc0 = AT_SCALE(Max(0, Acc0), ActScale, ActScaleN);
-				Acc1 = AT_SCALE(Max(0, Acc1), ActScale, ActScaleN);
-				break;
-			case ACT_RELUN:
-				Acc0 = AT_SCALE(AT_CLIP_POS(Acc0, A0), ActScale, ActScaleN);
-				Acc1 = AT_SCALE(AT_CLIP_POS(Acc1, A0), ActScale, ActScaleN);
-				break;
-			case ACT_RELUM:
-				Acc0 = AT_SCALE(Max(A0, Acc0), ActScale, ActScaleN);
-				Acc1 = AT_SCALE(Max(A0, Acc1), ActScale, ActScaleN);
-				break;
-			case ACT_RELUMN:
-				Acc0 = AT_SCALE(Min(B0, Max(A0, Acc0)), ActScale, ActScaleN);
-				Acc1 = AT_SCALE(Min(B0, Max(A0, Acc1)), ActScale, ActScaleN);
-				break;
-			case ACT_HSIGMOID:
-				Acc0 = AT_SCALE(AT_CLIP_POS(Acc0 + B0, A0) * C0, ActScale, ActScaleN);
-				Acc1 = AT_SCALE(AT_CLIP_POS(Acc1 + B0, A0) * C0, ActScale, ActScaleN);
-				break;
-			case ACT_HSWISH:
-				Acc0 = AT_SCALE(AT_CLIP_POS(Acc0 + B0, A0) * C0 * Acc0, ActScale, ActScaleN);
-				Acc1 = AT_SCALE(AT_CLIP_POS(Acc1 + B0, A0) * C0 * Acc1, ActScale, ActScaleN);
-				break;
-			case ACT_LEAKYRELU:
-				{
-					int Neg0 = gap_bitextractu(Acc0, 1, 31), Pos0 = !Neg0;
-					int Neg1 = gap_bitextractu(Acc1, 1, 31), Pos1 = !Neg1;
-					int Acc0N = AT_NORM(Acc0 * A0, 7);
-					int Acc1N = AT_NORM(Acc1 * A0, 7);
-					Acc0 = AT_SCALE((Neg0*Acc0N+Pos0*Acc0), ActScale, ActScaleN);
-					Acc1 = AT_SCALE((Neg1*Acc1N+Pos1*Acc1), ActScale, ActScaleN);
-
-				//	Acc0 = AT_SCALE(((Acc0<0) ? AT_NORM(Acc0 * A0, 7):Acc0), ActScale, ActScaleN);
-				//	Acc1 = AT_SCALE(((Acc1<0) ? AT_NORM(Acc1 * A0, 7):Acc1), ActScale, ActScaleN);
-				}
-				break;
-			case ACT_SIGMOID:
-				{
-					// Assumes input (Acc) in Sq[-8:8] = 16 / 256 = 2**(-4)
-					// y = Sigmoid(x) expects x in Q12 --> Sin/Sq12 = 2**(-4) / 2**(-12) = 2**(8) --> << 8
-					// y in Q15 is then shifted to fit int8 Q7 data --> >> 8 and scaled to the output scale with ActScale
-					int Acc0N = Acc0 << 8;
-					Acc0 = AT_SCALE((Sigmoid(Acc0N) >> 8), ActScale, ActScaleN);
-					int Acc1N = Acc1 << 8;
-					Acc1 = AT_SCALE((Sigmoid(Acc1N) >> 8), ActScale, ActScaleN);
-				}
-				break;
-			case ACT_TANH:
-				{
-					// Assumes input (Acc) in Sq[-8:8] = 16 / 256 = 2**(-4)
-					// y = Sigmoid(x) expects x in Q12 --> Sin/Sq12 = 2**(-4) / 2**(-12) = 2**(8) --> << 8
-					// y in Q15 is then shifted to fit int8 Q7 data --> >> 8 and scaled to the output scale with ActScale
-					int Acc0N = Acc0 << 8;
-					Acc0 = AT_SCALE((Tanh(Acc0N) >> 8), ActScale, ActScaleN);
-					int Acc1N = Acc1 << 8;
-					Acc1 = AT_SCALE((Tanh(Acc1N) >> 8), ActScale, ActScaleN);
-				}
-				break;
-		}
-                Out[2*i] = gap_clip(Acc0, 7); Out[2*i+1] = gap_clip(Acc1, 7);
-        }
-        if (N&0x1) {
-                int Acc0 = gap_clip(AT_SCALE(In[N-1], Scale, ScaleN), 7);
-		switch (Activation) {
-			case ACT_NONE:
-				break;
-			case ACT_RELU:
-				Acc0 = AT_SCALE(Max(0, Acc0), ActScale, ActScaleN);
-				break;
-			case ACT_RELUN:
-				Acc0 = AT_SCALE(AT_CLIP_POS(Acc0, A0), ActScale, ActScaleN);
-				break;
-			case ACT_RELUM:
-				Acc0 = AT_SCALE(Max(A0, Acc0), ActScale, ActScaleN);
-				break;
-			case ACT_RELUMN:
-				Acc0 = AT_SCALE(Min(B0, Max(A0, Acc0)), ActScale, ActScaleN);
-				break;
-			case ACT_HSIGMOID:
-				Acc0 = AT_SCALE(AT_CLIP_POS(Acc0 + B0, A0) * C0, ActScale, ActScaleN);
-				break;
-			case ACT_HSWISH:
-				Acc0 = AT_SCALE(AT_CLIP_POS(Acc0 + B0, A0) * C0 * Acc0, ActScale, ActScaleN);
-				break;
-			case ACT_LEAKYRELU:
-				{
-					int Neg0 = gap_bitextractu(Acc0, 1, 31), Pos0 = !Neg0;
-					int Acc0N = AT_NORM(Acc0 * A0, 7);
-					Acc0 = AT_SCALE((Neg0*Acc0N+Pos0*Acc0), ActScale, ActScaleN);
-
-					// Acc0 = AT_SCALE(((Acc0<0) ? AT_NORM(Acc0 * A0, 7):Acc0), ActScale, ActScaleN);
-				}
-				break;
-			case ACT_SIGMOID:
-				{
-					int Acc0N = Acc0 << 8;
-					Acc0 = AT_SCALE((Sigmoid(Acc0N) >> 8), ActScale, ActScaleN);
-				}
-				break;
-			case ACT_TANH:
-				{
-					int Acc0N = Acc0 << 8;
-					Acc0 = AT_SCALE((Tanh(Acc0N) >> 8), ActScale, ActScaleN);
-				}
-				break;
-		}
-                Out[N-1] = gap_clip(Acc0, 7);
-        }
-}
-
-/*
- * Conv/Linear DP scaling followed by an optional activation, variant for ActScale=1.0, Out buffer is different from In Buffer
- * Partial unroll to avoid load use penalty
-*/
-static void _KerReduct_ActivationScale1_SQ8(
-        int * __restrict__ In,
-        signed char * __restrict__ Out,
-	unsigned int N,
-	unsigned int Scale,
-	unsigned int ScaleN,
-        CNN_ActivationOper_T Activation,
-	int A0, int B0, int C0
-        )
-
-{
-        for (unsigned int i=0; i<(N/2); i++) {
-                int Acc0 = gap_clip(AT_SCALE(In[2*i+0], Scale, ScaleN), 7);
-                int Acc1 = gap_clip(AT_SCALE(In[2*i+1], Scale, ScaleN), 7);
-		
-		switch (Activation) {
-			case ACT_NONE:
-				break;
-			case ACT_RELU:
-				Acc0 = Max(0, Acc0);
-				Acc1 = Max(0, Acc1);
-				break;
-			case ACT_RELUN:
-				Acc0 = AT_CLIP_POS(Acc0, A0);
-				Acc1 = AT_CLIP_POS(Acc1, A0);
-				break;
-			case ACT_RELUM:
-				Acc0 = Max(A0, Acc0);
-				Acc1 = Max(A0, Acc1);
-				break;
-			case ACT_RELUMN:
-				Acc0 = Min(B0, Max(A0, Acc0));
-				Acc1 = Min(B0, Max(A0, Acc1));
-				break;
-		}
-                Out[2*i]   = Acc0; Out[2*i+1] = Acc1;
-        }
-        if (N&0x1) {
-                int Acc0 = gap_clip(AT_SCALE(In[N-1], Scale, ScaleN), 7);
-		switch (Activation) {
-			case ACT_NONE:
-				break;
-			case ACT_RELU:
-				Acc0 = Max(0, Acc0);
-				break;
-			case ACT_RELUN:
-				Acc0 = AT_CLIP_POS(Acc0, A0);
-				break;
-			case ACT_RELUM:
-				Acc0 = Max(A0, Acc0);
-				break;
-			case ACT_RELUMN:
-				Acc0 = Min(B0, Max(A0, Acc0));
-				break;
-		}
-                Out[N-1] = Acc0;
-        }
-}
-
-/*
- * Conv/Linear DP scaling followed by an optional activation, In place version
- * Input is 32b int output is 8b
- * Partially unrolled version to avoid load use penalty
-*/
-static void _KerReductIO_Activation_SQ8(
-        signed char * __restrict__ Out,
-        int *__restrict__ In,
-	unsigned int N,
-	unsigned int Scale,
-	unsigned int ScaleN,
-        CNN_ActivationOper_T Activation,
-	unsigned int ActScale, unsigned int ActScaleN, int A0, int B0, int C0
-        )
-
-{
-        for (unsigned int i=0; i<(N/2); i++) {
-                int Acc0 = gap_clip(AT_SCALE(In[2*i+0], Scale, ScaleN), 7);
-                int Acc1 = gap_clip(AT_SCALE(In[2*i+1], Scale, ScaleN), 7);
-		switch (Activation) {
-			case ACT_NONE:
-				break;
-			case ACT_RELU:
-				Acc0 = AT_SCALE(Max(0, Acc0), ActScale, ActScaleN);
-				Acc1 = AT_SCALE(Max(0, Acc1), ActScale, ActScaleN);
-				break;
-			case ACT_RELUN:
-				Acc0 = AT_SCALE(AT_CLIP_POS(Acc0, A0), ActScale, ActScaleN);
-				Acc1 = AT_SCALE(AT_CLIP_POS(Acc1, A0), ActScale, ActScaleN);
-				break;
-			case ACT_RELUM:
-				Acc0 = AT_SCALE(Max(A0, Acc0), ActScale, ActScaleN);
-				Acc1 = AT_SCALE(Max(A0, Acc1), ActScale, ActScaleN);
-				break;
-			case ACT_RELUMN:
-				Acc0 = AT_SCALE(Min(B0, Max(A0, Acc0)), ActScale, ActScaleN);
-				Acc1 = AT_SCALE(Min(B0, Max(A0, Acc1)), ActScale, ActScaleN);
-				break;
-			case ACT_HSIGMOID:
-				Acc0 = AT_SCALE(AT_CLIP_POS(Acc0 + B0, A0) * C0, ActScale, ActScaleN);
-				Acc1 = AT_SCALE(AT_CLIP_POS(Acc1 + B0, A0) * C0, ActScale, ActScaleN);
-				break;
-			case ACT_HSWISH:
-				Acc0 = AT_SCALE(AT_CLIP_POS(Acc0 + B0, A0) * C0 * Acc0, ActScale, ActScaleN);
-				Acc1 = AT_SCALE(AT_CLIP_POS(Acc1 + B0, A0) * C0 * Acc1, ActScale, ActScaleN);
-				break;
-			case ACT_LEAKYRELU:
-				{
-					int Neg0 = gap_bitextractu(Acc0, 1, 31), Pos0 = !Neg0;
-					int Neg1 = gap_bitextractu(Acc1, 1, 31), Pos1 = !Neg1;
-					int Acc0N = AT_NORM(Acc0 * A0, 7);
-					int Acc1N = AT_NORM(Acc1 * A0, 7);
-					Acc0 = AT_SCALE((Neg0*Acc0N+Pos0*Acc0), ActScale, ActScaleN);
-					Acc1 = AT_SCALE((Neg1*Acc1N+Pos1*Acc1), ActScale, ActScaleN);
-
-				//	Acc0 = AT_SCALE(((Acc0<0) ? AT_NORM(Acc0 * A0, 7):Acc0), ActScale, ActScaleN);
-				//	Acc1 = AT_SCALE(((Acc1<0) ? AT_NORM(Acc1 * A0, 7):Acc1), ActScale, ActScaleN);
-				}
-				break;
-			case ACT_SIGMOID:
-				{
-					// Assumes input (Acc) in Sq[-8:8] = 16 / 256 = 2**(-4)
-					// y = Sigmoid(x) expects x in Q12 --> Sin/Sq12 = 2**(-4) / 2**(-12) = 2**(8) --> << 8
-					// y in Q15 is then shifted to fit int8 Q7 data --> >> 8 and scaled to the output scale with ActScale
-					int Acc0N = Acc0 << 8;
-					Acc0 = AT_SCALE((Sigmoid(Acc0N) >> 8), ActScale, ActScaleN);
-					int Acc1N = Acc1 << 8;
-					Acc1 = AT_SCALE((Sigmoid(Acc1N) >> 8), ActScale, ActScaleN);
-				}
-				break;
-			case ACT_TANH:
-				{
-					// Assumes input (Acc) in Sq[-8:8] = 16 / 256 = 2**(-4)
-					// y = Sigmoid(x) expects x in Q12 --> Sin/Sq12 = 2**(-4) / 2**(-12) = 2**(8) --> << 8
-					// y in Q15 is then shifted to fit int8 Q7 data --> >> 8 and scaled to the output scale with ActScale
-					int Acc0N = Acc0 << 8;
-					Acc0 = AT_SCALE((Tanh(Acc0N) >> 8), ActScale, ActScaleN);
-					int Acc1N = Acc1 << 8;
-					Acc1 = AT_SCALE((Tanh(Acc1N) >> 8), ActScale, ActScaleN);
-				}
-				break;
-		}
-                Out[2*i]   = gap_clip(Acc0, 7); Out[2*i+1] = gap_clip(Acc1, 7);
-        }
-        if (N&0x1) {
-                int Acc0 = gap_clip(AT_SCALE(In[N-1], Scale, ScaleN), 7);
-		switch (Activation) {
-			case ACT_NONE:
-				break;
-			case ACT_RELU:
-				Acc0 = AT_SCALE(Max(0, Acc0), ActScale, ActScaleN);
-				break;
-			case ACT_RELUN:
-				Acc0 = AT_SCALE(AT_CLIP_POS(Acc0, A0), ActScale, ActScaleN);
-				break;
-			case ACT_RELUM:
-				Acc0 = AT_SCALE(Max(A0, Acc0), ActScale, ActScaleN);
-				break;
-			case ACT_RELUMN:
-				Acc0 = AT_SCALE(Min(B0, Max(A0, Acc0)), ActScale, ActScaleN);
-				break;
-			case ACT_HSIGMOID:
-				Acc0 = AT_SCALE(AT_CLIP_POS(Acc0 + B0, A0) * C0, ActScale, ActScaleN);
-				break;
-			case ACT_HSWISH:
-				Acc0 = AT_SCALE(AT_CLIP_POS(Acc0 + B0, A0) * C0 * Acc0, ActScale, ActScaleN);
-				break;
-			case ACT_LEAKYRELU:
-				{
-					int Neg0 = gap_bitextractu(Acc0, 1, 31), Pos0 = !Neg0;
-					int Acc0N = AT_NORM(Acc0 * A0, 7);
-					Acc0 = AT_SCALE((Neg0*Acc0N+Pos0*Acc0), ActScale, ActScaleN);
-
-					// Acc0 = AT_SCALE(((Acc0<0) ? AT_NORM(Acc0 * A0, 7):Acc0), ActScale, ActScaleN);
-				}
-				break;
-			case ACT_SIGMOID:
-				{
-					int Acc0N = Acc0 << 8;
-					Acc0 = AT_SCALE((Sigmoid(Acc0N) >> 8), ActScale, ActScaleN);
-				}
-				break;
-			case ACT_TANH:
-				{
-					int Acc0N = Acc0 << 8;
-					Acc0 = AT_SCALE((Tanh(Acc0N) >> 8), ActScale, ActScaleN);
-				}
-				break;
-		}
-                Out[N-1] = gap_clip(Acc0, 7);
-        }
-}
-
-/*
- * Conv/Linear DP scaling followed by an optional activation, Variant for ActScale=1.0, In place version
- * Input is 32b int output is 8b
- * Partially unrolled version to avoid load use penalty
-*/
-static void _KerReductIO_ActivationScale1_SQ8(
-        signed char *__restrict__ Out,
-        int *__restrict__ In,
-	unsigned int N,
-	unsigned int Scale,
-	unsigned int ScaleN,
-        CNN_ActivationOper_T Activation,
-	int A0, int B0, int C0
-        )
-
-{
-        for (unsigned int i=0; i<(N/2); i++) {
-                int Acc0 = gap_clip(AT_SCALE(In[2*i+0], Scale, ScaleN), 7);
-                int Acc1 = gap_clip(AT_SCALE(In[2*i+1], Scale, ScaleN), 7);
-		switch (Activation) {
-			case ACT_NONE:
-				break;
-			case ACT_RELU:
-				Acc0 = Max(0, Acc0);
-				Acc1 = Max(0, Acc1);
-				break;
-			case ACT_RELUN:
-				Acc0 = AT_CLIP_POS(Acc0, A0);
-				Acc1 = AT_CLIP_POS(Acc1, A0);
-				break;
-			case ACT_RELUM:
-				Acc0 = Max(A0, Acc0);
-				Acc1 = Max(A0, Acc1);
-				break;
-			case ACT_RELUMN:
-				Acc0 = Min(B0, Max(A0, Acc0));
-				Acc1 = Min(B0, Max(A0, Acc1));
-				break;
-		}
-                Out[2*i]   = Acc0; Out[2*i+1] = Acc1;
-        }
-        if (N&0x1) {
-                int Acc0 = gap_clip(AT_SCALE(In[N-1], Scale, ScaleN), 7);
-		switch (Activation) {
-			case ACT_NONE:
-				break;
-			case ACT_RELU:
-				Acc0 = Max(0, Acc0);
-				break;
-			case ACT_RELUN:
-				Acc0 = AT_CLIP_POS(Acc0, A0);
-				break;
-			case ACT_RELUM:
-				Acc0 = Max(A0, Acc0);
-				break;
-			case ACT_RELUMN:
-				Acc0 = Min(B0, Max(A0, Acc0));
-				break;
-		}
-                Out[N-1] = Acc0;
-        }
-}
-
-/*
- * Buffer compaction, scattered by chunk size groups of 8b moved to a contiguous representation through a parallel reduction tree
-*/
-static void __attribute__ ((noinline)) KerReductIO_Compact_SQ8(int *__restrict__ In, unsigned int Size, unsigned int CoreId, unsigned int ChunkCell)
-
-{
-	unsigned int U = gap_ncore()/2, Log2Core = gap_fl1(gap_ncore()), A = 2, B = 1;
-	for (int k=0; k<Log2Core; k++) {
-		if (CoreId<U) {
-			signed char *__restrict__ OOs = ((signed char *)In+(A*CoreId+B)*ChunkCell);
-			signed char *__restrict__ IIs = ((signed char *)In+((sizeof(int)/sizeof(signed char))*(A*CoreId+B))*ChunkCell);
-			int *__restrict__ II = (int *) IIs;
-			int *__restrict__ OO = (int *) OOs;
-			for (int i=0;i<Size/8;i++) {
-				int V0 = II[2*i], V1 = II[2*i+1];
-				OO[2*i] = V0; OO[2*i+1] = V1;
-			}
-			for (int i=((Size/8)*8); i<Size; i++) OOs[i] = IIs[i];
-		}
-		U = U/2; A = A*2; B = B*2;
-	}
-	gap_waitbarrier(0);
-}
+#define KER_ACT(Activation, in_d_type, out_d_type, p_type, n_bits, is_unsigned) \
+do { /* Element-wise activation kernel body: each core takes the [First, Last) slice of the W*H*Feat elements of Arg->In, runs ACT_SWITCH on every value and stores the OUT_CLIP'ed result at the same index of Arg->Out. Expects a pointer named Arg in the expansion scope. */ \
+	unsigned int S = Arg->W*Arg->H*Arg->Feat, CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last  = Min(First+ChunkCell, S); \
+	decl(in_d_type * __restrict__, In) = decl((in_d_type *__restrict__), Arg->In); \
+	decl(out_d_type * __restrict__, Out) = decl((out_d_type *__restrict__), Arg->Out); \
+	signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos; \
+	unsigned int Size = Max(0, Last-First); /* NOTE(review): Size does not appear to be referenced below - confirm and drop */ \
+	unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; \
+	int A0 = arr_at_as(Infos, AT_INF_A0, p_type); int B0 = arr_at_as(Infos, AT_INF_B0, p_type); int C0 = arr_at_as(Infos, AT_INF_C0, p_type); \
+\
+	for (unsigned int i=First; i<Last; i++) { \
+		int Acc0 = In[i]; \
+		ACT_SWITCH(Acc0, Activation, ActScale, ActScaleN, A0, B0, C0, n_bits, is_unsigned); \
+		Out[i] = OUT_CLIP(Acc0, is_unsigned, n_bits); \
+	} /* NOTE(review): unlike the KER_*REDUCT* macros, no gap_waitbarrier() is issued here - presumably the caller synchronizes; confirm */ \
+} while(0)
+
+#define KER_PAR_REDUCT_ACT_CHW(Activation, d_type, p_type, n_bits, is_unsigned) \
+do { /* Per-channel rescale + activation, CHW in / CHW out, channels split across cores: each core handles channels [First, Last) of Arg->Feat and walks the whole W*H plane of each. Expects a pointer named Arg in the expansion scope. */ \
+	int S = Arg->Feat; \
+	unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last  = Min(First+ChunkCell, S); \
+	int * __restrict__ In = (int *__restrict__) Arg->In; \
+	unsigned char * __restrict__ Scale = (unsigned char *__restrict__) Arg->Scale; \
+	unsigned char * __restrict__ ScaleN = (unsigned char *__restrict__) Arg->ScaleN; \
+	decl(d_type * __restrict__, Out) = decl((d_type *__restrict__), Arg->Out); \
+	signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos; \
+	unsigned int Size = Arg->W*Arg->H; /* plane size, used as the channel stride below */ \
+	unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; \
+	int A0 = arr_at_as(Infos, AT_INF_A0, p_type); int B0 = arr_at_as(Infos, AT_INF_B0, p_type); int C0 = arr_at_as(Infos, AT_INF_C0, p_type); \
+	int Prenorm = arr_at_as(Infos, AT_INF_PRENORM, p_type); \
+\
+	for (unsigned int c=First; c<Last; c++) { \
+		for (unsigned int i=0; i<Size; i++) { \
+			int Acc0 = AT_SCALE(AT_NORM(In[Size*c + i], Prenorm), Scale[c], ScaleN[c]); /* AT_NORM by Prenorm first, then the channel's own Scale/ScaleN */ \
+			ACT_SWITCH(Acc0, Activation, ActScale, ActScaleN, A0, B0, C0, n_bits, is_unsigned); \
+			Out[Size*c + i] = OUT_CLIP(Acc0, is_unsigned, n_bits); \
+		} \
+	} \
+	gap_waitbarrier(0); \
+} while(0)
+
+#define KER_REDUCT_ACT_CHW(Activation, d_type, p_type, n_bits, is_unsigned) \
+do { /* Per-channel rescale + activation, CHW in / CHW out, spatial positions split across cores: each core handles positions [First, Last) of every one of the Feat planes. Expects a pointer named Arg in the expansion scope. */ \
+	unsigned int Feat = Arg->Feat; \
+	unsigned S = Arg->W*Arg->H; \
+	unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last  = Min(First+ChunkCell, S); \
+	int * __restrict__ In = (int *__restrict__) Arg->In; \
+	unsigned char * __restrict__ Scale = (unsigned char *__restrict__) Arg->Scale; \
+	unsigned char * __restrict__ ScaleN = (unsigned char *__restrict__) Arg->ScaleN; \
+	decl(d_type * __restrict__, Out) = decl((d_type *__restrict__), Arg->Out); \
+	signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos; \
+	/* The channel stride is the full plane size S (= W*H). The per-core chunk length (Last-First) must NOT be used to step between channels, since i already runs over absolute positions [First, Last). */ \
+	unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; \
+	int A0 = arr_at_as(Infos, AT_INF_A0, p_type); int B0 = arr_at_as(Infos, AT_INF_B0, p_type); int C0 = arr_at_as(Infos, AT_INF_C0, p_type); \
+	int Prenorm = arr_at_as(Infos, AT_INF_PRENORM, p_type); \
+\
+	for (unsigned int c=0; c<Feat; c++) { \
+		for (unsigned int i=First; i<Last; i++) { \
+			int Acc0 = AT_SCALE(AT_NORM(In[S*c + i], Prenorm), Scale[c], ScaleN[c]); \
+			ACT_SWITCH(Acc0, Activation, ActScale, ActScaleN, A0, B0, C0, n_bits, is_unsigned); \
+			Out[S*c + i] = OUT_CLIP(Acc0, is_unsigned, n_bits); \
+		} \
+	} \
+	gap_waitbarrier(0); \
+} while(0)
+
+#define KER_PAR_REDUCT_IO_ACT_CHW(Activation, d_type, p_type, n_bits, is_unsigned) \
+do { /* In-place per-channel rescale + activation, CHW, channels split across cores: each core writes the reduced bytes of its channels [First, Last) over the start of its own int region, then KerReductIO_Compact_SQ8_1 is called on the whole buffer. Expects a pointer named Arg in the expansion scope. */ \
+	int S = Arg->Feat; \
+	unsigned int Size = Arg->W*Arg->H; \
+	unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last  = Min(First+ChunkCell, S); \
+	int * __restrict__ In = (int *__restrict__) Arg->In; \
+	unsigned char * __restrict__ Scale = (unsigned char *__restrict__) Arg->Scale; \
+	unsigned char * __restrict__ ScaleN = (unsigned char *__restrict__) Arg->ScaleN; \
+	signed char *__restrict__ Out = (signed char *__restrict__)(In+First*Size); /* output cursor starts at the beginning of this core's first channel (NOTE(review): d_type is not used for Out here - confirm intended) */ \
+	signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos; \
+	unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; \
+	int A0 = arr_at_as(Infos, AT_INF_A0, p_type); int B0 = arr_at_as(Infos, AT_INF_B0, p_type); int C0 = arr_at_as(Infos, AT_INF_C0, p_type); \
+	int Prenorm = arr_at_as(Infos, AT_INF_PRENORM, p_type); \
+\
+	for (unsigned int c=First; c<Last; c++) { \
+		for (unsigned int i=0; i<Size; i++) { \
+			int Acc0 = AT_SCALE(AT_NORM(In[Size*c + i], Prenorm), Scale[c], ScaleN[c]); \
+			ACT_SWITCH(Acc0, Activation, ActScale, ActScaleN, A0, B0, C0, n_bits, is_unsigned); \
+			Out[i] = OUT_CLIP(Acc0, is_unsigned, n_bits); \
+		} \
+		Out += Size; /* next channel's bytes are packed right after the previous one */ \
+	} \
+	gap_waitbarrier(0); \
+	KerReductIO_Compact_SQ8_1((signed char *__restrict__)In, (signed char *__restrict__)In, Size*ChunkCell, Size * Arg->Feat); \
+} while(0) /* no trailing ';' here: the caller's own ';' terminates the statement, keeping the macro safe in if/else */
+
+#define KER_REDUCT_IO_ACT_CHW(Activation, d_type, p_type, n_bits, is_unsigned) \
+do { /* In-place per-channel rescale + activation, CHW, spatial positions split across cores: for every channel each core reduces positions [First, Last) of the plane, writing d_type results over the start of its own int chunk, then the plane is compacted. Expects a pointer named Arg in the expansion scope. */ \
+	unsigned int Feat = Arg->Feat; \
+	unsigned int S = Arg->W*Arg->H; \
+	unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last  = Min(First+ChunkCell, S); \
+	int * __restrict__ InOut = (int *__restrict__) Arg->In; \
+	unsigned char * __restrict__ Scale = (unsigned char *__restrict__) Arg->Scale; \
+	unsigned char * __restrict__ ScaleN = (unsigned char *__restrict__) Arg->ScaleN; \
+	signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos; \
+	unsigned int Size = Max(0, Last-First); /* number of positions this core handles per plane */ \
+	unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; \
+	int A0 = arr_at_as(Infos, AT_INF_A0, p_type); int B0 = arr_at_as(Infos, AT_INF_B0, p_type); int C0 = arr_at_as(Infos, AT_INF_C0, p_type); \
+	int Prenorm = arr_at_as(Infos, AT_INF_PRENORM, p_type); \
+ \
+	for (unsigned int c=0; c<Feat; c++) { \
+		int *In = (int *) (InOut+S*c+First); /* In and Out alias the same address: results overwrite the chunk being read */ \
+		d_type *Out = (d_type *) (InOut+S*c+First); \
+		for (unsigned int i=0; i<Size; i++) { \
+			int Acc0 = AT_SCALE(AT_NORM(In[i], Prenorm), Scale[c], ScaleN[c]); \
+			ACT_SWITCH(Acc0, Activation, ActScale, ActScaleN, A0, B0, C0, n_bits, is_unsigned); \
+			Out[i] = OUT_CLIP(Acc0, is_unsigned, n_bits); \
+		} \
+		gap_waitbarrier(0); \
+		KerReductIO_Compact_SQ8_1((signed char *__restrict__)InOut+S*c, (signed char *__restrict__)(InOut+S*c), ChunkCell, S); /* NOTE(review): 1st arg is offset by S*c BYTES, 2nd by S*c INTS - appears deliberate (packed destination vs. int-strided source); confirm */ \
+	} \
+} while(0)
+
+#define KER_PAR_REDUCT_ACT_CHW2HWC(Activation, d_type, p_type, n_bits, is_unsigned) \
+do { /* Per-channel rescale + activation with layout change, channels split across cores: reads In as [c][i] (stride W*H per channel) and writes Out as [i][c] (stride Feat per position). Expects a pointer named Arg in the expansion scope. */ \
+	int Feat = Arg->Feat; \
+	unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(Feat), First = CoreId*ChunkCell, Last  = Min(First+ChunkCell, Feat); \
+	int * __restrict__ In = (int *__restrict__) Arg->In; \
+	unsigned char * __restrict__ Scale = (unsigned char *__restrict__) Arg->Scale; \
+	unsigned char * __restrict__ ScaleN = (unsigned char *__restrict__) Arg->ScaleN; \
+	decl(d_type * __restrict__, Out) = decl((d_type *__restrict__), Arg->Out); \
+	signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos; \
+	unsigned int Size = Arg->W*Arg->H; /* plane size, used as the input channel stride */ \
+	unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; \
+	int A0 = arr_at_as(Infos, AT_INF_A0, p_type); int B0 = arr_at_as(Infos, AT_INF_B0, p_type); int C0 = arr_at_as(Infos, AT_INF_C0, p_type); \
+	int Prenorm = arr_at_as(Infos, AT_INF_PRENORM, p_type); \
+\
+	for (unsigned int c=First; c<Last; c++) { \
+		for (unsigned int i=0; i<Size; i++) { \
+			int Acc0 = AT_SCALE(AT_NORM(In[Size*c + i], Prenorm), Scale[c], ScaleN[c]); \
+			ACT_SWITCH(Acc0, Activation, ActScale, ActScaleN, A0, B0, C0, n_bits, is_unsigned); \
+			Out[Feat*i + c] = OUT_CLIP(Acc0, is_unsigned, n_bits); /* transposed store: position-major, channel-minor */ \
+		} \
+	} \
+	gap_waitbarrier(0); \
+} while(0)
+
+#define KER_REDUCT_ACT_CHW2HWC(Activation, d_type, p_type, n_bits, is_unsigned) \
+do { /* Per-channel rescale + activation with layout change, spatial positions split across cores: each core handles positions [First, Last) of every plane, reading In as [c][i] (stride S = W*H) and writing Out as [i][c] (stride Feat). Expects a pointer named Arg in the expansion scope. */ \
+	unsigned int Feat = Arg->Feat; \
+	unsigned S = Arg->W*Arg->H; \
+	unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last  = Min(First+ChunkCell, S); \
+	int * __restrict__ In = (int *__restrict__) Arg->In; \
+	unsigned char * __restrict__ Scale = (unsigned char *__restrict__) Arg->Scale; \
+	unsigned char * __restrict__ ScaleN = (unsigned char *__restrict__) Arg->ScaleN; \
+	decl(d_type * __restrict__, Out) = decl((d_type *__restrict__), Arg->Out); \
+	signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos; \
+	/* The input channel stride is the full plane size S (= W*H). The per-core chunk length (Last-First) must NOT be used to step between channels, since i already runs over absolute positions [First, Last). */ \
+	unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; \
+	int A0 = arr_at_as(Infos, AT_INF_A0, p_type); int B0 = arr_at_as(Infos, AT_INF_B0, p_type); int C0 = arr_at_as(Infos, AT_INF_C0, p_type); \
+	int Prenorm = arr_at_as(Infos, AT_INF_PRENORM, p_type); \
+\
+	for (unsigned int c=0; c<Feat; c++) { \
+		for (unsigned int i=First; i<Last; i++) { \
+			int Acc0 = AT_SCALE(AT_NORM(In[S*c + i], Prenorm), Scale[c], ScaleN[c]); \
+			ACT_SWITCH(Acc0, Activation, ActScale, ActScaleN, A0, B0, C0, n_bits, is_unsigned); \
+			Out[Feat*i + c] = OUT_CLIP(Acc0, is_unsigned, n_bits); \
+		} \
+	} \
+	gap_waitbarrier(0); \
+} while(0)
+
+#define KER_PAR_REDUCT_ACT_HWC(Activation, d_type, p_type, n_bits, is_unsigned) \
+do { /* Per-channel rescale + activation, HWC in / HWC out: each core handles spatial positions [First, Last) of W*H and, for each, all Feat channels. Expects a pointer named Arg in the expansion scope. NOTE(review): despite the PAR name the work is split over positions, and the body matches KER_REDUCT_ACT_HWC - confirm intended. */ \
+	unsigned int Feat = Arg->Feat; \
+	unsigned S = Arg->W*Arg->H; \
+	unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last  = Min(First+ChunkCell, S); \
+	int * __restrict__ In = (int *__restrict__) Arg->In; \
+	unsigned char * __restrict__ Scale = (unsigned char *__restrict__) Arg->Scale; \
+	unsigned char * __restrict__ ScaleN = (unsigned char *__restrict__) Arg->ScaleN; \
+	decl(d_type * __restrict__, Out) = decl((d_type *__restrict__), Arg->Out); \
+	signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos; \
+	unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; \
+	int A0 = arr_at_as(Infos, AT_INF_A0, p_type); int B0 = arr_at_as(Infos, AT_INF_B0, p_type); int C0 = arr_at_as(Infos, AT_INF_C0, p_type); \
+	int Prenorm = arr_at_as(Infos, AT_INF_PRENORM, p_type); \
+\
+	for (unsigned int i=First; i<Last; i++) { \
+		for (unsigned int c=0; c<Feat; c++) { \
+			int Acc0 = AT_SCALE(AT_NORM(In[Feat*i + c], Prenorm), Scale[c], ScaleN[c]); /* channel is the innermost (contiguous) index */ \
+			ACT_SWITCH(Acc0, Activation, ActScale, ActScaleN, A0, B0, C0, n_bits, is_unsigned); \
+			Out[Feat*i + c] = OUT_CLIP(Acc0, is_unsigned, n_bits); \
+		} \
+	} \
+	gap_waitbarrier(0); \
+} while(0)
+
+#define KER_REDUCT_ACT_HWC(Activation, d_type, p_type, n_bits, is_unsigned) \
+do { /* Per-channel rescale + activation, HWC in / HWC out, spatial positions split across cores: each core handles positions [First, Last) of W*H and, for each, all Feat channels. Expects a pointer named Arg in the expansion scope. */ \
+	unsigned int Feat = Arg->Feat; \
+	unsigned S = Arg->W*Arg->H; \
+	unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last  = Min(First+ChunkCell, S); \
+	int * __restrict__ In = (int *__restrict__) Arg->In; \
+	unsigned char * __restrict__ Scale = (unsigned char *__restrict__) Arg->Scale; \
+	unsigned char * __restrict__ ScaleN = (unsigned char *__restrict__) Arg->ScaleN; \
+	decl(d_type * __restrict__, Out) = decl((d_type *__restrict__), Arg->Out); \
+	signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos; \
+	unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN]; \
+	int A0 = arr_at_as(Infos, AT_INF_A0, p_type); int B0 = arr_at_as(Infos, AT_INF_B0, p_type); int C0 = arr_at_as(Infos, AT_INF_C0, p_type); \
+	int Prenorm = arr_at_as(Infos, AT_INF_PRENORM, p_type); \
+\
+	for (unsigned int i=First; i<Last; i++) { \
+		for (unsigned int c=0; c<Feat; c++) { \
+			int Acc0 = AT_SCALE(AT_NORM(In[Feat*i + c], Prenorm), Scale[c], ScaleN[c]); /* channel is the innermost (contiguous) index */ \
+			ACT_SWITCH(Acc0, Activation, ActScale, ActScaleN, A0, B0, C0, n_bits, is_unsigned); \
+			Out[Feat*i + c] = OUT_CLIP(Acc0, is_unsigned, n_bits); \
+		} \
+	} \
+	gap_waitbarrier(0); \
+} while(0)
 
 #define B_CLR(x, bits)  ((x)&(~((1<<(bits))-1)))
-static void KerReductIO_Compact_SQ8_1(char *__restrict__ To, char *__restrict__ From, int Size, int TotalSize)
-
-{
+static void KerReductIO_Compact_SQ8_1(signed char *__restrict__ To, signed char *__restrict__ From, int Size, int TotalSize) {
         unsigned int CoreId = gap_coreid(), Chunk = ChunkSize(Size), First = Chunk*CoreId, Last = Min(First+Chunk, Size);
         unsigned int Iter = Max(0, Last-First);
 
@@ -1201,7 +360,7 @@ static void KerReductIO_Compact_SQ8_1(char *__restrict__ To, char *__restrict__
 		From += Size*4; To += Size;
 
         	int *pFrom = (int *) (From+First), *pTo = (int *) (To+First);
-        	for (int j=0; j<Iter/8; j++) {
+        	for (unsigned int j=0; j<Iter/8; j++) {
                 	int V0 = pFrom[2*j], V1 = pFrom[2*j+1];
                 	pTo[2*j] = V0; pTo[2*j+1] = V1;
         	}
@@ -1215,1991 +374,605 @@ static void KerReductIO_Compact_SQ8_1(char *__restrict__ To, char *__restrict__
 /*
  * Input Scaling and reduction to 8b then channel centric activation, Out location != In location. Features are evaluated in parallel
 */
-void KerParReduct_CC_SQ8(KerConvLinReduct_SQ8_T *Arg)
-
-{
-	int S = Arg->Feat;
-	unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last  = Min(First+ChunkCell, S);
-	int * __restrict__ In = (int *__restrict__) Arg->In;
-	unsigned char * __restrict__ Scale = (unsigned char *__restrict__) Arg->Scale;
-	unsigned char * __restrict__ ScaleN = (unsigned char *__restrict__) Arg->ScaleN;
-	signed char * __restrict__ Out = (signed char *__restrict__) Arg->Out;
-	signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos;
-	unsigned int Size = Arg->W*Arg->H;
-	unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN];
-	int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0];
-
-	for (int c=First; c<Last; c++) KerReduct_ActivationScale1_SQ8(In+Size*c, Out+Size*c, Size, Scale[c], ScaleN[c], ACT_NONE, A0, B0, C0);
-	gap_waitbarrier(0);
+void KerParReduct_CC_SQ8(KerConvLinReduct_SQ8_T *Arg) {
+	KER_PAR_REDUCT_ACT_CHW(ACT_NONE, signed char, unsigned char, 8, 0);
 }
 
 
-void KerParReduct_CC_ReLU_SQ8(KerConvLinReduct_SQ8_T *Arg)
-
-{
-	int S = Arg->Feat;
-	unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last  = Min(First+ChunkCell, S);
-	int * __restrict__ In = (int *__restrict__) Arg->In;
-	unsigned char * __restrict__ Scale = (unsigned char *__restrict__) Arg->Scale;
-	unsigned char * __restrict__ ScaleN = (unsigned char *__restrict__) Arg->ScaleN;
-	signed char * __restrict__ Out = (signed char *__restrict__) Arg->Out;
-	signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos;
-	unsigned int Size = Arg->W*Arg->H;
-	unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN];
-	int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0];
-
-	for (int c=First; c<Last; c++) KerReduct_ActivationScale1_SQ8(In+Size*c, Out+Size*c, Size, Scale[c], ScaleN[c], ACT_RELU, A0, B0, C0);
-	gap_waitbarrier(0);
-}
-
-void KerParReduct_CC_ReLUN_SQ8(KerConvLinReduct_SQ8_T *Arg)
-
-{
-	int S = Arg->Feat;
-	unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last  = Min(First+ChunkCell, S);
-	int * __restrict__ In = (int *__restrict__) Arg->In;
-	unsigned char * __restrict__ Scale = (unsigned char *__restrict__) Arg->Scale;
-	unsigned char * __restrict__ ScaleN = (unsigned char *__restrict__) Arg->ScaleN;
-	signed char * __restrict__ Out = (signed char *__restrict__) Arg->Out;
-	signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos;
-	unsigned int Size = Arg->W*Arg->H;
-	unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN];
-	int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0];
-
-	for (int c=First; c<Last; c++) KerReduct_ActivationScale1_SQ8(In+Size*c, Out+Size*c, Size, Scale[c], ScaleN[c], ACT_RELUN, A0, B0, C0);
-	gap_waitbarrier(0);
-
-}
-
-void KerParReduct_CC_ReLUM_SQ8(KerConvLinReduct_SQ8_T *Arg)
-
-{
-	int S = Arg->Feat;
-	unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last  = Min(First+ChunkCell, S);
-	int * __restrict__ In = (int *__restrict__) Arg->In;
-	unsigned char * __restrict__ Scale = (unsigned char *__restrict__) Arg->Scale;
-	unsigned char * __restrict__ ScaleN = (unsigned char *__restrict__) Arg->ScaleN;
-	signed char * __restrict__ Out = (signed char *__restrict__) Arg->Out;
-	signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos;
-	unsigned int Size = Arg->W*Arg->H;
-	unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN];
-	int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0];
-
-	for (int c=First; c<Last; c++) KerReduct_ActivationScale1_SQ8(In+Size*c, Out+Size*c, Size, Scale[c], ScaleN[c], ACT_RELUM, A0, B0, C0);
-	gap_waitbarrier(0);
-
-}
-
-void KerParReduct_CC_ReLUMN_SQ8(KerConvLinReduct_SQ8_T *Arg)
-
-{
-	int S = Arg->Feat;
-	unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last  = Min(First+ChunkCell, S);
-	int * __restrict__ In = (int *__restrict__) Arg->In;
-	unsigned char * __restrict__ Scale = (unsigned char *__restrict__) Arg->Scale;
-	unsigned char * __restrict__ ScaleN = (unsigned char *__restrict__) Arg->ScaleN;
-	signed char * __restrict__ Out = (signed char *__restrict__) Arg->Out;
-	signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos;
-	unsigned int Size = Arg->W*Arg->H;
-	unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN];
-	int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0];
-
-	for (int c=First; c<Last; c++) KerReduct_ActivationScale1_SQ8(In+Size*c, Out+Size*c, Size, Scale[c], ScaleN[c], ACT_RELUMN, A0, B0, C0);
-	gap_waitbarrier(0);
-
-}
-
-void KerParReduct_CC_HSigmoid_SQ8(KerConvLinReduct_SQ8_T *Arg)
-
-{
-	int S = Arg->Feat;
-	unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last  = Min(First+ChunkCell, S);
-	int * __restrict__ In = (int *__restrict__) Arg->In;
-	unsigned char * __restrict__ Scale = (unsigned char *__restrict__) Arg->Scale;
-	unsigned char * __restrict__ ScaleN = (unsigned char *__restrict__) Arg->ScaleN;
-	signed char * __restrict__ Out = (signed char *__restrict__) Arg->Out;
-	signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos;
-	unsigned int Size = Arg->W*Arg->H;
-	unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN];
-	int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0];
-
-	for (int c=First; c<Last; c++) KerReduct_Activation_SQ8(In+Size*c, Out+Size*c, Size, Scale[c], ScaleN[c], ACT_HSIGMOID, ActScale, ActScaleN, A0, B0, C0);
-	gap_waitbarrier(0);
+void KerParReduct_CC_ReLU_SQ8(KerConvLinReduct_SQ8_T *Arg) {
+	KER_PAR_REDUCT_ACT_CHW(ACT_RELU, signed char, unsigned char, 8, 0);
 }
 
-void KerParReduct_CC_HSwish_SQ8(KerConvLinReduct_SQ8_T *Arg)
+void KerParReduct_CC_ReLUN_SQ8(KerConvLinReduct_SQ8_T *Arg) {
+	KER_PAR_REDUCT_ACT_CHW(ACT_RELUN, signed char, unsigned char, 8, 0);
+}
 
-{
-	int S = Arg->Feat;
-	unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last  = Min(First+ChunkCell, S);
-	int * __restrict__ In = (int *__restrict__) Arg->In;
-	unsigned char * __restrict__ Scale = (unsigned char *__restrict__) Arg->Scale;
-	unsigned char * __restrict__ ScaleN = (unsigned char *__restrict__) Arg->ScaleN;
-	signed char * __restrict__ Out = (signed char *__restrict__) Arg->Out;
-	signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos;
-	unsigned int Size = Arg->W*Arg->H;
-	unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN];
-	int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0];
-
-	for (int c=First; c<Last; c++) KerReduct_Activation_SQ8(In+Size*c, Out+Size*c, Size, Scale[c], ScaleN[c], ACT_HSWISH, ActScale, ActScaleN, A0, B0, C0);
-	gap_waitbarrier(0);
-}
+void KerParReduct_CC_ReLUM_SQ8(KerConvLinReduct_SQ8_T *Arg) {
+	KER_PAR_REDUCT_ACT_CHW(ACT_RELUM, signed char, unsigned char, 8, 0);
+}
 
-void KerParReduct_CC_LeakyReLU_SQ8(KerConvLinReduct_SQ8_T *Arg)
+void KerParReduct_CC_ReLUMN_SQ8(KerConvLinReduct_SQ8_T *Arg) {
+	KER_PAR_REDUCT_ACT_CHW(ACT_RELUMN, signed char, unsigned char, 8, 0);
+}
 
-{
-	int S = Arg->Feat;
-	unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last  = Min(First+ChunkCell, S);
-	int * __restrict__ In = (int *__restrict__) Arg->In;
-	unsigned char * __restrict__ Scale = (unsigned char *__restrict__) Arg->Scale;
-	unsigned char * __restrict__ ScaleN = (unsigned char *__restrict__) Arg->ScaleN;
-	signed char * __restrict__ Out = (signed char *__restrict__) Arg->Out;
-	signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos;
-	unsigned int Size = Arg->W*Arg->H;
-	unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN];
-	int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0];
-
-	for (int c=First; c<Last; c++) KerReduct_Activation_SQ8(In+Size*c, Out+Size*c, Size, Scale[c], ScaleN[c], ACT_LEAKYRELU, ActScale, ActScaleN, A0, B0, C0);
-	gap_waitbarrier(0);
+void KerParReduct_CC_HSigmoid_SQ8(KerConvLinReduct_SQ8_T *Arg) {
+	KER_PAR_REDUCT_ACT_CHW(ACT_HSIGMOID, signed char, unsigned char, 8, 0);
 }
 
-void KerParReduct_CC_Sigmoid_SQ8(KerConvLinReduct_SQ8_T *Arg)
+void KerParReduct_CC_HSwish_SQ8(KerConvLinReduct_SQ8_T *Arg) {
+	KER_PAR_REDUCT_ACT_CHW(ACT_HSWISH, signed char, unsigned char, 8, 0);
+}
 
-{
-	int S = Arg->Feat;
-	unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last  = Min(First+ChunkCell, S);
-	int * __restrict__ In = (int *__restrict__) Arg->In;
-	unsigned char * __restrict__ Scale = (unsigned char *__restrict__) Arg->Scale;
-	unsigned char * __restrict__ ScaleN = (unsigned char *__restrict__) Arg->ScaleN;
-	signed char * __restrict__ Out = (signed char *__restrict__) Arg->Out;
-	signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos;
-	unsigned int Size = Arg->W*Arg->H;
-	unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN];
-	int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0];
-
-	for (int c=First; c<Last; c++) KerReduct_Activation_SQ8(In+Size*c, Out+Size*c, Size, Scale[c], ScaleN[c], ACT_SIGMOID, ActScale, ActScaleN, A0, B0, C0);
-	gap_waitbarrier(0);
-}
+void KerParReduct_CC_LeakyReLU_SQ8(KerConvLinReduct_SQ8_T *Arg) {
+	KER_PAR_REDUCT_ACT_CHW(ACT_LEAKYRELU, signed char, unsigned char, 8, 0);
+}
 
-void KerParReduct_CC_Tanh_SQ8(KerConvLinReduct_SQ8_T *Arg)
+void KerParReduct_CC_Sigmoid_SQ8(KerConvLinReduct_SQ8_T *Arg) {
+	KER_PAR_REDUCT_ACT_CHW(ACT_SIGMOID, signed char, unsigned char, 8, 0);
+}
 
-{
-	int S = Arg->Feat;
-	unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last  = Min(First+ChunkCell, S);
-	int * __restrict__ In = (int *__restrict__) Arg->In;
-	unsigned char * __restrict__ Scale = (unsigned char *__restrict__) Arg->Scale;
-	unsigned char * __restrict__ ScaleN = (unsigned char *__restrict__) Arg->ScaleN;
-	signed char * __restrict__ Out = (signed char *__restrict__) Arg->Out;
-	signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos;
-	unsigned int Size = Arg->W*Arg->H;
-	unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN];
-	int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0];
-
-	for (int c=First; c<Last; c++) KerReduct_Activation_SQ8(In+Size*c, Out+Size*c, Size, Scale[c], ScaleN[c], ACT_TANH, ActScale, ActScaleN, A0, B0, C0);
-	gap_waitbarrier(0);
+void KerParReduct_CC_Tanh_SQ8(KerConvLinReduct_SQ8_T *Arg) {
+	KER_PAR_REDUCT_ACT_CHW(ACT_TANH, signed char, unsigned char, 8, 0);
 }
 
 /*
  * Input Scaling and reduction to 8b then channel centric activation, Out location != In location. Features are evaluated in parallel. In: CHW Out: HWC
 */
-void KerParReduct_CC_CHW2HWC_SQ8(KerConvLinReduct_SQ8_T *Arg)
-
-{
-	int S = Arg->Feat;
-	unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last  = Min(First+ChunkCell, S);
-	int * __restrict__ In = (int *__restrict__) Arg->In;
-	unsigned char * __restrict__ Scale = (unsigned char *__restrict__) Arg->Scale;
-	unsigned char * __restrict__ ScaleN = (unsigned char *__restrict__) Arg->ScaleN;
-	signed char * __restrict__ Out = (signed char *__restrict__) Arg->Out;
-	signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos;
-	unsigned int W = Arg->W, H = Arg->H;
-	unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN];
-	int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0];
-
-	for (int c=First; c<Last; c++) KerReduct_ActivationScale1_HWC_SQ8(In+W*H*c, Out+c, W*H, S, Scale[c], ScaleN[c], ACT_NONE, A0, B0, C0);
-	gap_waitbarrier(0);
-}
-
-
-void KerParReduct_CC_CHW2HWC_ReLU_SQ8(KerConvLinReduct_SQ8_T *Arg)
-
-{
-	int S = Arg->Feat;
-	unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last  = Min(First+ChunkCell, S);
-	int * __restrict__ In = (int *__restrict__) Arg->In;
-	unsigned char * __restrict__ Scale = (unsigned char *__restrict__) Arg->Scale;
-	unsigned char * __restrict__ ScaleN = (unsigned char *__restrict__) Arg->ScaleN;
-	signed char * __restrict__ Out = (signed char *__restrict__) Arg->Out;
-	signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos;
-	unsigned int W = Arg->W, H = Arg->H;
-	unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN];
-	int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0];
-
-	for (int c=First; c<Last; c++) KerReduct_ActivationScale1_HWC_SQ8(In+W*H*c, Out+c, W*H, S, Scale[c], ScaleN[c], ACT_RELU, A0, B0, C0);
-	gap_waitbarrier(0);
-}
-
-void KerParReduct_CC_CHW2HWC_ReLUN_SQ8(KerConvLinReduct_SQ8_T *Arg)
-
-{
-	int S = Arg->Feat;
-	unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last  = Min(First+ChunkCell, S);
-	int * __restrict__ In = (int *__restrict__) Arg->In;
-	unsigned char * __restrict__ Scale = (unsigned char *__restrict__) Arg->Scale;
-	unsigned char * __restrict__ ScaleN = (unsigned char *__restrict__) Arg->ScaleN;
-	signed char * __restrict__ Out = (signed char *__restrict__) Arg->Out;
-	signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos;
-	unsigned int W = Arg->W, H = Arg->H;
-	unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN];
-	int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0];
-
-	for (int c=First; c<Last; c++) KerReduct_ActivationScale1_HWC_SQ8(In+W*H*c, Out+c, W*H, S, Scale[c], ScaleN[c], ACT_RELUN, A0, B0, C0);
-	gap_waitbarrier(0);
-
-}
-
-void KerParReduct_CC_CHW2HWC_ReLUM_SQ8(KerConvLinReduct_SQ8_T *Arg)
-
-{
-	int S = Arg->Feat;
-	unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last  = Min(First+ChunkCell, S);
-	int * __restrict__ In = (int *__restrict__) Arg->In;
-	unsigned char * __restrict__ Scale = (unsigned char *__restrict__) Arg->Scale;
-	unsigned char * __restrict__ ScaleN = (unsigned char *__restrict__) Arg->ScaleN;
-	signed char * __restrict__ Out = (signed char *__restrict__) Arg->Out;
-	signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos;
-	unsigned int W = Arg->W, H = Arg->H;
-	unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN];
-	int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0];
-
-	for (int c=First; c<Last; c++) KerReduct_ActivationScale1_HWC_SQ8(In+W*H*c, Out+c, W*H, S, Scale[c], ScaleN[c], ACT_RELU, A0, B0, C0);
-	gap_waitbarrier(0);
-}
-
-void KerParReduct_CC_CHW2HWC_ReLUMN_SQ8(KerConvLinReduct_SQ8_T *Arg)
-
-{
-	int S = Arg->Feat;
-	unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last  = Min(First+ChunkCell, S);
-	int * __restrict__ In = (int *__restrict__) Arg->In;
-	unsigned char * __restrict__ Scale = (unsigned char *__restrict__) Arg->Scale;
-	unsigned char * __restrict__ ScaleN = (unsigned char *__restrict__) Arg->ScaleN;
-	signed char * __restrict__ Out = (signed char *__restrict__) Arg->Out;
-	signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos;
-	unsigned int W = Arg->W, H = Arg->H;
-	unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN];
-	int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0];
-
-	for (int c=First; c<Last; c++) KerReduct_ActivationScale1_HWC_SQ8(In+W*H*c, Out+c, W*H, S, Scale[c], ScaleN[c], ACT_RELUN, A0, B0, C0);
-	gap_waitbarrier(0);
-
-}
-
-void KerParReduct_CC_CHW2HWC_HSigmoid_SQ8(KerConvLinReduct_SQ8_T *Arg)
-
-{
-	int S = Arg->Feat;
-	unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last  = Min(First+ChunkCell, S);
-	int * __restrict__ In = (int *__restrict__) Arg->In;
-	unsigned char * __restrict__ Scale = (unsigned char *__restrict__) Arg->Scale;
-	unsigned char * __restrict__ ScaleN = (unsigned char *__restrict__) Arg->ScaleN;
-	signed char * __restrict__ Out = (signed char *__restrict__) Arg->Out;
-	signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos;
-	unsigned int W = Arg->W, H = Arg->H;
-	unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN];
-	int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0];
-
-	for (int c=First; c<Last; c++) KerReduct_Activation_HWC_SQ8(In+W*H*c, Out+c, W*H, S, Scale[c], ScaleN[c], ACT_HSIGMOID, ActScale, ActScaleN, A0, B0, C0);
-	gap_waitbarrier(0);
-}
-
-void KerParReduct_CC_CHW2HWC_HSwish_SQ8(KerConvLinReduct_SQ8_T *Arg)
-
-{
-	int S = Arg->Feat;
-	unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last  = Min(First+ChunkCell, S);
-	int * __restrict__ In = (int *__restrict__) Arg->In;
-	unsigned char * __restrict__ Scale = (unsigned char *__restrict__) Arg->Scale;
-	unsigned char * __restrict__ ScaleN = (unsigned char *__restrict__) Arg->ScaleN;
-	signed char * __restrict__ Out = (signed char *__restrict__) Arg->Out;
-	signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos;
-	unsigned int W = Arg->W, H = Arg->H;
-	unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN];
-	int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0];
-
-	for (int c=First; c<Last; c++) KerReduct_Activation_HWC_SQ8(In+W*H*c, Out+c, W*H, S, Scale[c], ScaleN[c], ACT_HSWISH, ActScale, ActScaleN, A0, B0, C0);
-	gap_waitbarrier(0);
-}
-
-void KerParReduct_CC_CHW2HWC_LeakyReLU_SQ8(KerConvLinReduct_SQ8_T *Arg)
-
-{
-	int S = Arg->Feat;
-	unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last  = Min(First+ChunkCell, S);
-	int * __restrict__ In = (int *__restrict__) Arg->In;
-	unsigned char * __restrict__ Scale = (unsigned char *__restrict__) Arg->Scale;
-	unsigned char * __restrict__ ScaleN = (unsigned char *__restrict__) Arg->ScaleN;
-	signed char * __restrict__ Out = (signed char *__restrict__) Arg->Out;
-	signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos;
-	unsigned int W = Arg->W, H = Arg->H;
-	unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN];
-	int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0];
-
-	for (int c=First; c<Last; c++) KerReduct_Activation_HWC_SQ8(In+W*H*c, Out+c, W*H, S, Scale[c], ScaleN[c], ACT_LEAKYRELU, ActScale, ActScaleN, A0, B0, C0);
-	gap_waitbarrier(0);
-}
-
-void KerParReduct_CC_CHW2HWC_Sigmoid_SQ8(KerConvLinReduct_SQ8_T *Arg)
-
-{
-	int S = Arg->Feat;
-	unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last  = Min(First+ChunkCell, S);
-	int * __restrict__ In = (int *__restrict__) Arg->In;
-	unsigned char * __restrict__ Scale = (unsigned char *__restrict__) Arg->Scale;
-	unsigned char * __restrict__ ScaleN = (unsigned char *__restrict__) Arg->ScaleN;
-	signed char * __restrict__ Out = (signed char *__restrict__) Arg->Out;
-	signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos;
-	unsigned int W = Arg->W, H = Arg->H;
-	unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN];
-	int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0];
-
-	for (int c=First; c<Last; c++) KerReduct_Activation_HWC_SQ8(In+W*H*c, Out+c, W*H, S, Scale[c], ScaleN[c], ACT_SIGMOID, ActScale, ActScaleN, A0, B0, C0);
-	gap_waitbarrier(0);
-}
-
-void KerParReduct_CC_CHW2HWC_Tanh_SQ8(KerConvLinReduct_SQ8_T *Arg)
-
-{
-	int S = Arg->Feat;
-	unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last  = Min(First+ChunkCell, S);
-	int * __restrict__ In = (int *__restrict__) Arg->In;
-	unsigned char * __restrict__ Scale = (unsigned char *__restrict__) Arg->Scale;
-	unsigned char * __restrict__ ScaleN = (unsigned char *__restrict__) Arg->ScaleN;
-	signed char * __restrict__ Out = (signed char *__restrict__) Arg->Out;
-	signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos;
-	unsigned int W = Arg->W, H = Arg->H;
-	unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN];
-	int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0];
-
-	for (int c=First; c<Last; c++) KerReduct_Activation_HWC_SQ8(In+W*H*c, Out+c, W*H, S, Scale[c], ScaleN[c], ACT_TANH, ActScale, ActScaleN, A0, B0, C0);
-	gap_waitbarrier(0);
+void KerParReduct_CC_CHW2HWC_SQ8(KerConvLinReduct_SQ8_T *Arg) {
+	KER_PAR_REDUCT_ACT_CHW2HWC(ACT_NONE, signed char, unsigned char, 8, 0);
 }
 
-/*
- * Input Scaling and reduction to 8b then channel centric activation, Out location = In location. Features are evaluated in parallel
-*/
-extern void DumpFeaturePlanes(char *Mess, int DataSize, void *Plane, unsigned int NPlanes, unsigned int W, unsigned int Wmax, unsigned int H, unsigned int Hmax);
-
-void KerParReductIO_CC_SQ8(KerConvLinReduct_SQ8_T *Arg)
-
-{
-	unsigned int S = Arg->Feat;
-	unsigned int Size = Arg->W*Arg->H;
-	unsigned int W = Arg->W, H = Arg->H;
-	unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last  = Min(First+ChunkCell, S);
-	int * __restrict__ In = (int *__restrict__) Arg->In;
-	signed char *__restrict__ Out = (signed char *__restrict__)(In+First*Size);
-	unsigned char * __restrict__ Scale = (unsigned char *__restrict__) Arg->Scale;
-	unsigned char * __restrict__ ScaleN = (unsigned char *__restrict__) Arg->ScaleN;
-	signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos;
-	unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN];
-	int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0];
-
-	S = Size*Max(0, Last-First);
-	for (int c=First; c<Last; Out+=Size, c++) KerReductIO_ActivationScale1_SQ8(Out, In+Size*c, Size, Scale[c], ScaleN[c], ACT_NONE, A0, B0, C0);
-	gap_waitbarrier(0);
-	// KerReductIO_Compact_SQ8(In, S, CoreId, ChunkCell*Size);
-	KerReductIO_Compact_SQ8_1((signed char *__restrict__)In, (signed char *__restrict__)In, Size*ChunkCell, Size * Arg->Feat);
-}
-
-void KerParReductIO_CC_ReLU_SQ8(KerConvLinReduct_SQ8_T *Arg)
-
-{
-	unsigned int S = Arg->Feat;
-	unsigned int Size = Arg->W*Arg->H;
-	unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last  = Min(First+ChunkCell, S);
-	int * __restrict__ In = (int *__restrict__) Arg->In;
-	signed char *__restrict__ Out = (signed char *__restrict__)(In+First*Size);
-	unsigned char * __restrict__ Scale = (unsigned char *__restrict__) Arg->Scale;
-	unsigned char * __restrict__ ScaleN = (unsigned char *__restrict__) Arg->ScaleN;
-	signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos;
-	unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN];
-	int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0];
-
-	S = Size*Max(0, Last-First);
-	for (int c=First; c<Last; Out+=Size, c++) KerReductIO_ActivationScale1_SQ8(Out, In+Size*c, Size, Scale[c], ScaleN[c], ACT_RELU, A0, B0, C0);
-	gap_waitbarrier(0);
-	// KerReductIO_Compact_SQ8(In, S, CoreId, ChunkCell*Size);
-	KerReductIO_Compact_SQ8_1((signed char *__restrict__)In, (signed char *__restrict__)In, Size*ChunkCell, Size * Arg->Feat);
-}
-
-void KerParReductIO_CC_ReLUN_SQ8(KerConvLinReduct_SQ8_T *Arg)
-
-{
-	unsigned int S = Arg->Feat;
-	unsigned int Size = Arg->W*Arg->H;
-	unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last  = Min(First+ChunkCell, S);
-	int * __restrict__ In = (int *__restrict__) Arg->In;
-	signed char *__restrict__ Out = (signed char *__restrict__)(In+First*Size);
-	unsigned char * __restrict__ Scale = (unsigned char *__restrict__) Arg->Scale;
-	unsigned char * __restrict__ ScaleN = (unsigned char *__restrict__) Arg->ScaleN;
-	signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos;
-	unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN];
-	int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0];
-
-	S = Size*Max(0, Last-First);
-	for (int c=First; c<Last; Out+=Size, c++) KerReductIO_ActivationScale1_SQ8(Out, In+Size*c, Size, Scale[c], ScaleN[c], ACT_RELUN, A0, B0, C0);
-	gap_waitbarrier(0);
-	// KerReductIO_Compact_SQ8(In, S, CoreId, ChunkCell*Size);
-	KerReductIO_Compact_SQ8_1((signed char *__restrict__)In, (signed char *__restrict__)In, Size*ChunkCell, Size * Arg->Feat);
-}
-
-void KerParReductIO_CC_ReLUM_SQ8(KerConvLinReduct_SQ8_T *Arg)
-
-{
-	unsigned int S = Arg->Feat;
-	unsigned int Size = Arg->W*Arg->H;
-	unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last  = Min(First+ChunkCell, S);
-	int * __restrict__ In = (int *__restrict__) Arg->In;
-	signed char *__restrict__ Out = (signed char *__restrict__)(In+First*Size);
-	unsigned char * __restrict__ Scale = (unsigned char *__restrict__) Arg->Scale;
-	unsigned char * __restrict__ ScaleN = (unsigned char *__restrict__) Arg->ScaleN;
-	signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos;
-	unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN];
-	int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0];
-
-	S = Size*Max(0, Last-First);
-	for (int c=First; c<Last; Out+=Size, c++) KerReductIO_ActivationScale1_SQ8(Out, In+Size*c, Size, Scale[c], ScaleN[c], ACT_RELUM, A0, B0, C0);
-	gap_waitbarrier(0);
-	// KerReductIO_Compact_SQ8(In, S, CoreId, ChunkCell*Size);
-	KerReductIO_Compact_SQ8_1((signed char *__restrict__)In, (signed char *__restrict__)In, Size*ChunkCell, Size * Arg->Feat);
-}
-
-void KerParReductIO_CC_ReLUMN_SQ8(KerConvLinReduct_SQ8_T *Arg)
-
-{
-	unsigned int S = Arg->Feat;
-	unsigned int Size = Arg->W*Arg->H;
-	unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last  = Min(First+ChunkCell, S);
-	int * __restrict__ In = (int *__restrict__) Arg->In;
-	signed char *__restrict__ Out = (signed char *__restrict__)(In+First*Size);
-	unsigned char * __restrict__ Scale = (unsigned char *__restrict__) Arg->Scale;
-	unsigned char * __restrict__ ScaleN = (unsigned char *__restrict__) Arg->ScaleN;
-	signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos;
-	unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN];
-	int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0];
-
-	S = Size*Max(0, Last-First);
-	for (int c=First; c<Last; Out+=Size, c++) KerReductIO_ActivationScale1_SQ8(Out, In+Size*c, Size, Scale[c], ScaleN[c], ACT_RELUMN, A0, B0, C0);
-	gap_waitbarrier(0);
-	// KerReductIO_Compact_SQ8(In, S, CoreId, ChunkCell*Size);
-	KerReductIO_Compact_SQ8_1((signed char *__restrict__)In, (signed char *__restrict__)In, Size*ChunkCell, Size * Arg->Feat);
-}
-
-void KerParReductIO_CC_HSigmoid_SQ8(KerConvLinReduct_SQ8_T *Arg)
-
-{
-	unsigned int S = Arg->Feat;
-	unsigned int Size = Arg->W*Arg->H;
-	unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last  = Min(First+ChunkCell, S);
-	int * __restrict__ In = (int *__restrict__) Arg->In;
-	signed char *__restrict__ Out = (signed char *__restrict__)(In+First*Size);
-	unsigned char * __restrict__ Scale = (unsigned char *__restrict__) Arg->Scale;
-	unsigned char * __restrict__ ScaleN = (unsigned char *__restrict__) Arg->ScaleN;
-	signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos;
-	unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN];
-	int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0];
-
-	S = Size*Max(0, Last-First);
-	for (int c=First; c<Last; Out+=Size, c++) KerReductIO_Activation_SQ8(Out, In+Size*c, Size, Scale[c], ScaleN[c], ACT_HSIGMOID, ActScale, ActScaleN, A0, B0, C0);
-	gap_waitbarrier(0);
-	// KerReductIO_Compact_SQ8(In, S, CoreId, ChunkCell*Size);
-	KerReductIO_Compact_SQ8_1((signed char *__restrict__)In, (signed char *__restrict__)In, Size*ChunkCell, Size * Arg->Feat);
-}
-
-void KerParReductIO_CC_HSwish_SQ8(KerConvLinReduct_SQ8_T *Arg)
-
-{
-	unsigned int S = Arg->Feat;
-	unsigned int Size = Arg->W*Arg->H;
-	unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last  = Min(First+ChunkCell, S);
-	int * __restrict__ In = (int *__restrict__) Arg->In;
-	signed char *__restrict__ Out = (signed char *__restrict__)(In+First*Size);
-	unsigned char * __restrict__ Scale = (unsigned char *__restrict__) Arg->Scale;
-	unsigned char * __restrict__ ScaleN = (unsigned char *__restrict__) Arg->ScaleN;
-	signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos;
-	unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN];
-	int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0];
-
-	S = Size*Max(0, Last-First);
-	for (int c=First; c<Last; Out+=Size, c++) KerReductIO_Activation_SQ8(Out, In+Size*c, Size, Scale[c], ScaleN[c], ACT_HSWISH, ActScale, ActScaleN, A0, B0, C0);
-	gap_waitbarrier(0);
-	// KerReductIO_Compact_SQ8(In, S, CoreId, ChunkCell*Size);
-	KerReductIO_Compact_SQ8_1((signed char *__restrict__)In, (signed char *__restrict__)In, Size*ChunkCell, Size * Arg->Feat);
-}
-
-void KerParReductIO_CC_LeakyReLU_SQ8(KerConvLinReduct_SQ8_T *Arg)
-
-{
-	unsigned int S = Arg->Feat;
-	unsigned int Size = Arg->W*Arg->H;
-	unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last  = Min(First+ChunkCell, S);
-	int * __restrict__ In = (int *__restrict__) Arg->In;
-	signed char *__restrict__ Out = (signed char *__restrict__)(In+First*Size);
-	unsigned char * __restrict__ Scale = (unsigned char *__restrict__) Arg->Scale;
-	unsigned char * __restrict__ ScaleN = (unsigned char *__restrict__) Arg->ScaleN;
-	signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos;
-	unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN];
-	int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0];
-
-	S = Size*Max(0, Last-First);
-	for (int c=First; c<Last; Out+=Size, c++) KerReductIO_Activation_SQ8(Out, In+Size*c, Size, Scale[c], ScaleN[c], ACT_LEAKYRELU, ActScale, ActScaleN, A0, B0, C0);
-	gap_waitbarrier(0);
-	// KerReductIO_Compact_SQ8(In, S, CoreId, ChunkCell*Size);
-	KerReductIO_Compact_SQ8_1((signed char *__restrict__)In, (signed char *__restrict__)In, Size*ChunkCell, Size * Arg->Feat);
-}
-
-void KerParReductIO_CC_Sigmoid_SQ8(KerConvLinReduct_SQ8_T *Arg)
-
-{
-	unsigned int S = Arg->Feat;
-	unsigned int Size = Arg->W*Arg->H;
-	unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last  = Min(First+ChunkCell, S);
-	int * __restrict__ In = (int *__restrict__) Arg->In;
-	signed char *__restrict__ Out = (signed char *__restrict__)(In+First*Size);
-	unsigned char * __restrict__ Scale = (unsigned char *__restrict__) Arg->Scale;
-	unsigned char * __restrict__ ScaleN = (unsigned char *__restrict__) Arg->ScaleN;
-	signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos;
-	unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN];
-	int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0];
-
-	S = Size*Max(0, Last-First);
-	for (int c=First; c<Last; Out+=Size, c++) KerReductIO_Activation_SQ8(Out, In+Size*c, Size, Scale[c], ScaleN[c], ACT_SIGMOID, ActScale, ActScaleN, A0, B0, C0);
-	gap_waitbarrier(0);
-	// KerReductIO_Compact_SQ8(In, S, CoreId, ChunkCell*Size);
-	KerReductIO_Compact_SQ8_1((signed char *__restrict__)In, (signed char *__restrict__)In, Size*ChunkCell, Size * Arg->Feat);
-}
-
-void KerParReductIO_CC_Tanh_SQ8(KerConvLinReduct_SQ8_T *Arg)
-
-{
-	unsigned int S = Arg->Feat;
-	unsigned int Size = Arg->W*Arg->H;
-	unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last  = Min(First+ChunkCell, S);
-	int * __restrict__ In = (int *__restrict__) Arg->In;
-	signed char *__restrict__ Out = (signed char *__restrict__)(In+First*Size);
-	unsigned char * __restrict__ Scale = (unsigned char *__restrict__) Arg->Scale;
-	unsigned char * __restrict__ ScaleN = (unsigned char *__restrict__) Arg->ScaleN;
-	signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos;
-	unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN];
-	int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0];
-
-	S = Size*Max(0, Last-First);
-	for (int c=First; c<Last; Out+=Size, c++) KerReductIO_Activation_SQ8(Out, In+Size*c, Size, Scale[c], ScaleN[c], ACT_TANH, ActScale, ActScaleN, A0, B0, C0);
-	gap_waitbarrier(0);
-	// KerReductIO_Compact_SQ8(In, S, CoreId, ChunkCell*Size);
-	KerReductIO_Compact_SQ8_1((signed char *__restrict__)In, (signed char *__restrict__)In, Size*ChunkCell, Size * Arg->Feat);
-}
-
-/* Input Scaling and reduction to 8b then channel centric activation, Out location != In location. Features are evaluated one after the other in parallel */
-void KerReduct_CC_SQ8(KerConvLinReduct_SQ8_T *Arg)
-
-{
-	unsigned int Feat = Arg->Feat;
-	unsigned S = Arg->W*Arg->H;
-	unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last  = Min(First+ChunkCell, S);
-	int * __restrict__ In = (int *__restrict__) Arg->In;
-	unsigned char * __restrict__ Scale = (unsigned char *__restrict__) Arg->Scale;
-	unsigned char * __restrict__ ScaleN = (unsigned char *__restrict__) Arg->ScaleN;
-	signed char * __restrict__ Out = (signed char *__restrict__) Arg->Out;
-	signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos;
-	unsigned int Size = Max(0, Last-First);
-	unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN];
-	int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0];
-
-	for (int c=0; c<Feat; c++) KerReduct_ActivationScale1_SQ8(In+S*c+First, Out+S*c+First, Size, Scale[c], ScaleN[c], ACT_NONE, A0, B0, C0);
-	gap_waitbarrier(0);
-}
-
-void KerReduct_CC_ReLU_SQ8(KerConvLinReduct_SQ8_T *Arg)
-
-{
-	unsigned int Feat = Arg->Feat;
-	unsigned S = Arg->W*Arg->H;
-	unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last  = Min(First+ChunkCell, S);
-	int * __restrict__ In = (int *__restrict__) Arg->In;
-	unsigned char * __restrict__ Scale = (unsigned char *__restrict__) Arg->Scale;
-	unsigned char * __restrict__ ScaleN = (unsigned char *__restrict__) Arg->ScaleN;
-	signed char * __restrict__ Out = (signed char *__restrict__) Arg->Out;
-	signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos;
-	unsigned int Size = Max(0, Last-First);
-	unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN];
-	int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0];
-
-	for (int c=0; c<Feat; c++) KerReduct_ActivationScale1_SQ8(In+S*c+First, Out+S*c+First, Size, Scale[c], ScaleN[c], ACT_RELU, A0, B0, C0);
-	gap_waitbarrier(0);
-}
-
-void KerReduct_CC_ReLUN_SQ8(KerConvLinReduct_SQ8_T *Arg)
-
-{
-	unsigned int Feat = Arg->Feat;
-	unsigned S = Arg->W*Arg->H;
-	unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last  = Min(First+ChunkCell, S);
-	int * __restrict__ In = (int *__restrict__) Arg->In;
-	unsigned char * __restrict__ Scale = (unsigned char *__restrict__) Arg->Scale;
-	unsigned char * __restrict__ ScaleN = (unsigned char *__restrict__) Arg->ScaleN;
-	signed char * __restrict__ Out = (signed char *__restrict__) Arg->Out;
-	signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos;
-	unsigned int Size = Max(0, Last-First);
-	unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN];
-	int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0];
-
-	for (int c=0; c<Feat; c++) KerReduct_ActivationScale1_SQ8(In+S*c+First, Out+S*c+First, Size, Scale[c], ScaleN[c], ACT_RELUN, A0, B0, C0);
-	gap_waitbarrier(0);
-}
-
-void KerReduct_CC_ReLUM_SQ8(KerConvLinReduct_SQ8_T *Arg)
-
-{
-	unsigned int Feat = Arg->Feat;
-	unsigned S = Arg->W*Arg->H;
-	unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last  = Min(First+ChunkCell, S);
-	int * __restrict__ In = (int *__restrict__) Arg->In;
-	unsigned char * __restrict__ Scale = (unsigned char *__restrict__) Arg->Scale;
-	unsigned char * __restrict__ ScaleN = (unsigned char *__restrict__) Arg->ScaleN;
-	signed char * __restrict__ Out = (signed char *__restrict__) Arg->Out;
-	signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos;
-	unsigned int Size = Max(0, Last-First);
-	unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN];
-	int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0];
-
-	for (int c=0; c<Feat; c++) KerReduct_ActivationScale1_SQ8(In+S*c+First, Out+S*c+First, Size, Scale[c], ScaleN[c], ACT_RELUM, A0, B0, C0);
-	gap_waitbarrier(0);
-}
-
-void KerReduct_CC_ReLUMN_SQ8(KerConvLinReduct_SQ8_T *Arg)
-
-{
-	unsigned int Feat = Arg->Feat;
-	unsigned S = Arg->W*Arg->H;
-	unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last  = Min(First+ChunkCell, S);
-	int * __restrict__ In = (int *__restrict__) Arg->In;
-	unsigned char * __restrict__ Scale = (unsigned char *__restrict__) Arg->Scale;
-	unsigned char * __restrict__ ScaleN = (unsigned char *__restrict__) Arg->ScaleN;
-	signed char * __restrict__ Out = (signed char *__restrict__) Arg->Out;
-	signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos;
-	unsigned int Size = Max(0, Last-First);
-	unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN];
-	int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0];
-
-	for (int c=0; c<Feat; c++) KerReduct_ActivationScale1_SQ8(In+S*c+First, Out+S*c+First, Size, Scale[c], ScaleN[c], ACT_RELUMN, A0, B0, C0);
-	gap_waitbarrier(0);
-}
-
-void KerReduct_CC_HSigmoid_SQ8(KerConvLinReduct_SQ8_T *Arg)
-
-{
-	unsigned int Feat = Arg->Feat;
-	unsigned S = Arg->W*Arg->H;
-	unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last  = Min(First+ChunkCell, S);
-	int * __restrict__ In = (int *__restrict__) Arg->In;
-	unsigned char * __restrict__ Scale = (unsigned char *__restrict__) Arg->Scale;
-	unsigned char * __restrict__ ScaleN = (unsigned char *__restrict__) Arg->ScaleN;
-	signed char * __restrict__ Out = (signed char *__restrict__) Arg->Out;
-	signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos;
-	unsigned int Size = Max(0, Last-First);
-	unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN];
-	int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0];
-
-	for (int c=0; c<Feat; c++) KerReduct_Activation_SQ8(In+S*c+First, Out+S*c+First, Size, Scale[c], ScaleN[c], ACT_HSIGMOID, ActScale, ActScaleN, A0, B0, C0);
-	gap_waitbarrier(0);
-}
-
-void KerReduct_CC_HSwish_SQ8(KerConvLinReduct_SQ8_T *Arg)
-
-{
-	unsigned int Feat = Arg->Feat;
-	unsigned S = Arg->W*Arg->H;
-	unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last  = Min(First+ChunkCell, S);
-	int * __restrict__ In = (int *__restrict__) Arg->In;
-	unsigned char * __restrict__ Scale = (unsigned char *__restrict__) Arg->Scale;
-	unsigned char * __restrict__ ScaleN = (unsigned char *__restrict__) Arg->ScaleN;
-	signed char * __restrict__ Out = (signed char *__restrict__) Arg->Out;
-	signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos;
-	unsigned int Size = Max(0, Last-First);
-	unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN];
-	int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0];
-
-	for (int c=0; c<Feat; c++) KerReduct_Activation_SQ8(In+S*c+First, Out+S*c+First, Size, Scale[c], ScaleN[c], ACT_HSWISH, ActScale, ActScaleN, A0, B0, C0);
-	gap_waitbarrier(0);
-}
-
-void KerReduct_CC_LeakyReLU_SQ8(KerConvLinReduct_SQ8_T *Arg)
-
-{
-	unsigned int Feat = Arg->Feat;
-	unsigned S = Arg->W*Arg->H;
-	unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last  = Min(First+ChunkCell, S);
-	int * __restrict__ In = (int *__restrict__) Arg->In;
-	unsigned char * __restrict__ Scale = (unsigned char *__restrict__) Arg->Scale;
-	unsigned char * __restrict__ ScaleN = (unsigned char *__restrict__) Arg->ScaleN;
-	signed char * __restrict__ Out = (signed char *__restrict__) Arg->Out;
-	signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos;
-	unsigned int Size = Max(0, Last-First);
-	unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN];
-	int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0];
-
-	for (int c=0; c<Feat; c++) KerReduct_Activation_SQ8(In+S*c+First, Out+S*c+First, Size, Scale[c], ScaleN[c], ACT_LEAKYRELU, ActScale, ActScaleN, A0, B0, C0);
-	gap_waitbarrier(0);
-}
-
-void KerReduct_CC_Sigmoid_SQ8(KerConvLinReduct_SQ8_T *Arg)
-
-{
-	unsigned int Feat = Arg->Feat;
-	unsigned S = Arg->W*Arg->H;
-	unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last  = Min(First+ChunkCell, S);
-	int * __restrict__ In = (int *__restrict__) Arg->In;
-	unsigned char * __restrict__ Scale = (unsigned char *__restrict__) Arg->Scale;
-	unsigned char * __restrict__ ScaleN = (unsigned char *__restrict__) Arg->ScaleN;
-	signed char * __restrict__ Out = (signed char *__restrict__) Arg->Out;
-	signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos;
-	unsigned int Size = Max(0, Last-First);
-	unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN];
-	int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0];
-
-	for (int c=0; c<Feat; c++) KerReduct_Activation_SQ8(In+S*c+First, Out+S*c+First, Size, Scale[c], ScaleN[c], ACT_SIGMOID, ActScale, ActScaleN, A0, B0, C0);
-	gap_waitbarrier(0);
-}
-
-void KerReduct_CC_Tanh_SQ8(KerConvLinReduct_SQ8_T *Arg)
-
-{
-	unsigned int Feat = Arg->Feat;
-	unsigned S = Arg->W*Arg->H;
-	unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last  = Min(First+ChunkCell, S);
-	int * __restrict__ In = (int *__restrict__) Arg->In;
-	unsigned char * __restrict__ Scale = (unsigned char *__restrict__) Arg->Scale;
-	unsigned char * __restrict__ ScaleN = (unsigned char *__restrict__) Arg->ScaleN;
-	signed char * __restrict__ Out = (signed char *__restrict__) Arg->Out;
-	signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos;
-	unsigned int Size = Max(0, Last-First);
-	unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN];
-	int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0];
-
-	for (int c=0; c<Feat; c++) KerReduct_Activation_SQ8(In+S*c+First, Out+S*c+First, Size, Scale[c], ScaleN[c], ACT_TANH, ActScale, ActScaleN, A0, B0, C0);
-	gap_waitbarrier(0);
-}
-
-/* Input Scaling and reduction to 8b then channel centric activation, Out location = In location. Features are evaluated one after the other in parallel */
-void KerReductIO_CC_SQ8(KerConvLinReduct_SQ8_T *Arg)
-
-{
-	unsigned int Feat = Arg->Feat;
-	unsigned int S = Arg->W*Arg->H;
-	unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last  = Min(First+ChunkCell, S);
-	int * __restrict__ InOut = (int *__restrict__) Arg->In;
-	unsigned char * __restrict__ Scale = (unsigned char *__restrict__) Arg->Scale;
-	unsigned char * __restrict__ ScaleN = (unsigned char *__restrict__) Arg->ScaleN;
-	signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos;
-	unsigned int Size = Max(0, Last-First);
-	unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN];
-	int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0];
-
-	for (int c=0; c<Feat; c++) {
-		KerReductIO_ActivationScale1_SQ8((signed char *__restrict__)(InOut+S*c+First), InOut+S*c+First, Size, Scale[c], ScaleN[c], ACT_NONE, A0, B0, C0);
-		gap_waitbarrier(0);
-		// KerReductIO_Compact_SQ8(InOut+S*c, Size, CoreId, ChunkCell);
-		KerReductIO_Compact_SQ8_1((signed char *__restrict__)InOut+S*c, (signed char *__restrict__)(InOut+S*c), ChunkCell, S);
-	}
 
-	// ChunkCell = ChunkSize(Feat); First = CoreId*ChunkCell; Last  = Min(First+ChunkCell, Feat); Size = S*Max(0, Last-First);
-	// KerReductIO_Compact_SQ8(InOut, Size, CoreId, ChunkCell*Size);
+void KerParReduct_CC_CHW2HWC_ReLU_SQ8(KerConvLinReduct_SQ8_T *Arg) {
+	KER_PAR_REDUCT_ACT_CHW2HWC(ACT_RELU, signed char, unsigned char, 8, 0);
 }
 
-void KerReductIO_CC_ReLU_SQ8(KerConvLinReduct_SQ8_T *Arg)
-
-{
-	unsigned int Feat = Arg->Feat;
-	unsigned int S = Arg->W*Arg->H;
-	unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last  = Min(First+ChunkCell, S);
-	int * __restrict__ InOut = (int *__restrict__) Arg->In;
-	unsigned char * __restrict__ Scale = (unsigned char *__restrict__) Arg->Scale;
-	unsigned char * __restrict__ ScaleN = (unsigned char *__restrict__) Arg->ScaleN;
-	signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos;
-	unsigned int Size = Max(0, Last-First);
-	unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN];
-	int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0];
-
-	for (int c=0; c<Feat; c++) {
-		KerReductIO_ActivationScale1_SQ8((signed char *__restrict__)(InOut+S*c+First), InOut+S*c+First, Size, Scale[c], ScaleN[c], ACT_RELU, A0, B0, C0);
-		gap_waitbarrier(0);
-		// KerReductIO_Compact_SQ8(InOut+S*c, Size, CoreId, ChunkCell);
-		KerReductIO_Compact_SQ8_1((signed char *__restrict__)InOut+S*c, (signed char *__restrict__)(InOut+S*c), ChunkCell, S);
-	}
-	// ChunkCell = ChunkSize(Feat); First = CoreId*ChunkCell; Last  = Min(First+ChunkCell, Feat); Size = S*Max(0, Last-First);
-	// KerReductIO_Compact_SQ8(InOut, Size, CoreId, ChunkCell*Size);
-}
-
-void KerReductIO_CC_ReLUN_SQ8(KerConvLinReduct_SQ8_T *Arg)
-
-{
-	unsigned int Feat = Arg->Feat;
-	unsigned int S = Arg->W*Arg->H;
-	unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last  = Min(First+ChunkCell, S);
-	int * __restrict__ InOut = (int *__restrict__) Arg->In;
-	unsigned char * __restrict__ Scale = (unsigned char *__restrict__) Arg->Scale;
-	unsigned char * __restrict__ ScaleN = (unsigned char *__restrict__) Arg->ScaleN;
-	signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos;
-	unsigned int Size = Max(0, Last-First);
-	unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN];
-	int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0];
-
-	for (int c=0; c<Feat; c++) {
-		KerReductIO_ActivationScale1_SQ8((signed char *__restrict__)(InOut+S*c+First), InOut+S*c+First, Size, Scale[c], ScaleN[c], ACT_RELUN, A0, B0, C0);
-		gap_waitbarrier(0);
-		// KerReductIO_Compact_SQ8(InOut+S*c, Size, CoreId, ChunkCell);
-		KerReductIO_Compact_SQ8_1((signed char *__restrict__)InOut+S*c, (signed char *__restrict__)(InOut+S*c), ChunkCell, S);
-	}
-	// ChunkCell = ChunkSize(Feat); First = CoreId*ChunkCell; Last  = Min(First+ChunkCell, Feat); Size = S*Max(0, Last-First);
-	// KerReductIO_Compact_SQ8(InOut, Size, CoreId, ChunkCell*Size);
-}
-
-void KerReductIO_CC_ReLUM_SQ8(KerConvLinReduct_SQ8_T *Arg)
-
-{
-	unsigned int Feat = Arg->Feat;
-	unsigned int S = Arg->W*Arg->H;
-	unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last  = Min(First+ChunkCell, S);
-	int * __restrict__ InOut = (int *__restrict__) Arg->In;
-	unsigned char * __restrict__ Scale = (unsigned char *__restrict__) Arg->Scale;
-	unsigned char * __restrict__ ScaleN = (unsigned char *__restrict__) Arg->ScaleN;
-	signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos;
-	unsigned int Size = Max(0, Last-First);
-	unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN];
-	int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0];
-
-	for (int c=0; c<Feat; c++) {
-		KerReductIO_ActivationScale1_SQ8((signed char *__restrict__)(InOut+S*c+First), InOut+S*c+First, Size, Scale[c], ScaleN[c], ACT_RELUM, A0, B0, C0);
-		gap_waitbarrier(0);
-		// KerReductIO_Compact_SQ8(InOut+S*c, Size, CoreId, ChunkCell);
-		KerReductIO_Compact_SQ8_1((signed char *__restrict__)InOut+S*c, (signed char *__restrict__)(InOut+S*c), ChunkCell, S);
-	}
-	// ChunkCell = ChunkSize(Feat); First = CoreId*ChunkCell; Last  = Min(First+ChunkCell, Feat); Size = S*Max(0, Last-First);
-	// KerReductIO_Compact_SQ8(InOut, Size, CoreId, ChunkCell*Size);
-}
-
-void KerReductIO_CC_ReLUMN_SQ8(KerConvLinReduct_SQ8_T *Arg)
-
-{
-	unsigned int Feat = Arg->Feat;
-	unsigned int S = Arg->W*Arg->H;
-	unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last  = Min(First+ChunkCell, S);
-	int * __restrict__ InOut = (int *__restrict__) Arg->In;
-	unsigned char * __restrict__ Scale = (unsigned char *__restrict__) Arg->Scale;
-	unsigned char * __restrict__ ScaleN = (unsigned char *__restrict__) Arg->ScaleN;
-	signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos;
-	unsigned int Size = Max(0, Last-First);
-	unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN];
-	int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0];
-
-	for (int c=0; c<Feat; c++) {
-		KerReductIO_ActivationScale1_SQ8((signed char *__restrict__)(InOut+S*c+First), InOut+S*c+First, Size, Scale[c], ScaleN[c], ACT_RELUMN, A0, B0, C0);
-		gap_waitbarrier(0);
-		// KerReductIO_Compact_SQ8(InOut+S*c, Size, CoreId, ChunkCell);
-		KerReductIO_Compact_SQ8_1((signed char *__restrict__)InOut+S*c, (signed char *__restrict__)(InOut+S*c), ChunkCell, S);
-	}
-	// ChunkCell = ChunkSize(Feat); First = CoreId*ChunkCell; Last  = Min(First+ChunkCell, Feat); Size = S*Max(0, Last-First);
-	// KerReductIO_Compact_SQ8(InOut, Size, CoreId, ChunkCell*Size);
-}
-
-void KerReductIO_CC_HSigmoid_SQ8(KerConvLinReduct_SQ8_T *Arg)
-
-{
-	unsigned int Feat = Arg->Feat;
-	unsigned int S = Arg->W*Arg->H;
-	unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last  = Min(First+ChunkCell, S);
-	int * __restrict__ InOut = (int *__restrict__) Arg->In;
-	unsigned char * __restrict__ Scale = (unsigned char *__restrict__) Arg->Scale;
-	unsigned char * __restrict__ ScaleN = (unsigned char *__restrict__) Arg->ScaleN;
-	signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos;
-	unsigned int Size = Max(0, Last-First);
-	unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN];
-	int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0];
-
-	for (int c=0; c<Feat; c++) {
-		KerReductIO_Activation_SQ8((signed char *__restrict__)(InOut+S*c+First), InOut+S*c+First, Size, Scale[c], ScaleN[c], ACT_HSIGMOID, ActScale, ActScaleN, A0, B0, C0);
-		gap_waitbarrier(0);
-		// KerReductIO_Compact_SQ8(InOut+S*c, Size, CoreId, ChunkCell);
-		KerReductIO_Compact_SQ8_1((signed char *__restrict__)InOut+S*c, (signed char *__restrict__)(InOut+S*c), ChunkCell, S);
-	}
-	// ChunkCell = ChunkSize(Feat); First = CoreId*ChunkCell; Last  = Min(First+ChunkCell, Feat); Size = S*Max(0, Last-First);
-	// KerReductIO_Compact_SQ8(InOut, Size, CoreId, ChunkCell*Size);
-}
-
-void KerReductIO_CC_HSwish_SQ8(KerConvLinReduct_SQ8_T *Arg)
-
-{
-	unsigned int Feat = Arg->Feat;
-	unsigned int S = Arg->W*Arg->H;
-	unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last  = Min(First+ChunkCell, S);
-	int * __restrict__ InOut = (int *__restrict__) Arg->In;
-	unsigned char * __restrict__ Scale = (unsigned char *__restrict__) Arg->Scale;
-	unsigned char * __restrict__ ScaleN = (unsigned char *__restrict__) Arg->ScaleN;
-	signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos;
-	unsigned int Size = Max(0, Last-First);
-	unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN];
-	int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0];
-
-	for (int c=0; c<Feat; c++) {
-		KerReductIO_Activation_SQ8((signed char *__restrict__)(InOut+S*c+First), InOut+S*c+First, Size, Scale[c], ScaleN[c], ACT_HSWISH, ActScale, ActScaleN, A0, B0, C0);
-		gap_waitbarrier(0);
-		// KerReductIO_Compact_SQ8(InOut+S*c, Size, CoreId, ChunkCell);
-		KerReductIO_Compact_SQ8_1((signed char *__restrict__)InOut+S*c, (signed char *__restrict__)(InOut+S*c), ChunkCell, S);
-	}
-	// ChunkCell = ChunkSize(Feat); First = CoreId*ChunkCell; Last  = Min(First+ChunkCell, Feat); Size = S*Max(0, Last-First);
-	// KerReductIO_Compact_SQ8(InOut, Size, CoreId, ChunkCell*Size);
-}
-
-void KerReductIO_CC_LeakyReLU_SQ8(KerConvLinReduct_SQ8_T *Arg)
-
-{
-	unsigned int Feat = Arg->Feat;
-	unsigned int S = Arg->W*Arg->H;
-	unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last  = Min(First+ChunkCell, S);
-	int * __restrict__ InOut = (int *__restrict__) Arg->In;
-	unsigned char * __restrict__ Scale = (unsigned char *__restrict__) Arg->Scale;
-	unsigned char * __restrict__ ScaleN = (unsigned char *__restrict__) Arg->ScaleN;
-	signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos;
-	unsigned int Size = Max(0, Last-First);
-	unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN];
-	int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0];
-
-	for (int c=0; c<Feat; c++) {
-		KerReductIO_Activation_SQ8((signed char *__restrict__)(InOut+S*c+First), InOut+S*c+First, Size, Scale[c], ScaleN[c], ACT_LEAKYRELU, ActScale, ActScaleN, A0, B0, C0);
-		gap_waitbarrier(0);
-		// KerReductIO_Compact_SQ8(InOut+S*c, Size, CoreId, ChunkCell);
-		KerReductIO_Compact_SQ8_1((signed char *__restrict__)InOut+S*c, (signed char *__restrict__)(InOut+S*c), ChunkCell, S);
-	}
-	// ChunkCell = ChunkSize(Feat); First = CoreId*ChunkCell; Last  = Min(First+ChunkCell, Feat); Size = S*Max(0, Last-First);
-	// KerReductIO_Compact_SQ8(InOut, Size, CoreId, ChunkCell*Size);
-}
-
-void KerReductIO_CC_Sigmoid_SQ8(KerConvLinReduct_SQ8_T *Arg)
-
-{
-	unsigned int Feat = Arg->Feat;
-	unsigned int S = Arg->W*Arg->H;
-	unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last  = Min(First+ChunkCell, S);
-	int * __restrict__ InOut = (int *__restrict__) Arg->In;
-	unsigned char * __restrict__ Scale = (unsigned char *__restrict__) Arg->Scale;
-	unsigned char * __restrict__ ScaleN = (unsigned char *__restrict__) Arg->ScaleN;
-	signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos;
-	unsigned int Size = Max(0, Last-First);
-	unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN];
-	int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0];
-
-	for (int c=0; c<Feat; c++) {
-		KerReductIO_Activation_SQ8((signed char *__restrict__)(InOut+S*c+First), InOut+S*c+First, Size, Scale[c], ScaleN[c], ACT_SIGMOID, ActScale, ActScaleN, A0, B0, C0);
-		gap_waitbarrier(0);
-		// KerReductIO_Compact_SQ8(InOut+S*c, Size, CoreId, ChunkCell);
-		KerReductIO_Compact_SQ8_1((signed char *__restrict__)InOut+S*c, (signed char *__restrict__)(InOut+S*c), ChunkCell, S);
-	}
-	// ChunkCell = ChunkSize(Feat); First = CoreId*ChunkCell; Last  = Min(First+ChunkCell, Feat); Size = S*Max(0, Last-First);
-	// KerReductIO_Compact_SQ8(InOut, Size, CoreId, ChunkCell*Size);
-}
-
-void KerReductIO_CC_Tanh_SQ8(KerConvLinReduct_SQ8_T *Arg)
-
-{
-	unsigned int Feat = Arg->Feat;
-	unsigned int S = Arg->W*Arg->H;
-	unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last  = Min(First+ChunkCell, S);
-	int * __restrict__ InOut = (int *__restrict__) Arg->In;
-	unsigned char * __restrict__ Scale = (unsigned char *__restrict__) Arg->Scale;
-	unsigned char * __restrict__ ScaleN = (unsigned char *__restrict__) Arg->ScaleN;
-	signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos;
-	unsigned int Size = Max(0, Last-First);
-	unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN];
-	int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0];
-
-	for (int c=0; c<Feat; c++) {
-		KerReductIO_Activation_SQ8((signed char *__restrict__)(InOut+S*c+First), InOut+S*c+First, Size, Scale[c], ScaleN[c], ACT_TANH, ActScale, ActScaleN, A0, B0, C0);
-		gap_waitbarrier(0);
-		// KerReductIO_Compact_SQ8(InOut+S*c, Size, CoreId, ChunkCell);
-		KerReductIO_Compact_SQ8_1((signed char *__restrict__)InOut+S*c, (signed char *__restrict__)(InOut+S*c), ChunkCell, S);
-	}
-	// ChunkCell = ChunkSize(Feat); First = CoreId*ChunkCell; Last  = Min(First+ChunkCell, Feat); Size = S*Max(0, Last-First);
-	// KerReductIO_Compact_SQ8(InOut, Size, CoreId, ChunkCell*Size);
+void KerParReduct_CC_CHW2HWC_ReLUN_SQ8(KerConvLinReduct_SQ8_T *Arg) {
+	KER_PAR_REDUCT_ACT_CHW2HWC(ACT_RELUN, signed char, unsigned char, 8, 0);
 }
 
-/*
- * Standalone Scaled Activation, Features are evaluated in parallel
-*/
-
-void KerPar_ActNone_SQ8(KerActivation_SQ8_T *Arg)
-
-{
-	unsigned int S = Arg->W*Arg->H*Arg->Feat, CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last  = Min(First+ChunkCell, S);
-	signed char * __restrict__ In = (signed char *__restrict__) Arg->In;
-	signed char * __restrict__ Out = (signed char *__restrict__) Arg->Out;
-	signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos;
-	unsigned int Size = Max(0, Last-First);
-	unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN];
-
-
-	Ker_Activation_SQ8(In+First, Out+First, Size, ACT_NONE, ActScale, ActScaleN, 0, 0, 0);
-	gap_waitbarrier(0);
+void KerParReduct_CC_CHW2HWC_ReLUM_SQ8(KerConvLinReduct_SQ8_T *Arg) {
+	KER_PAR_REDUCT_ACT_CHW2HWC(ACT_RELUM, signed char, unsigned char, 8, 0);
 }
 
-void Ker_Scale_SQ8(KerActivation_SQ8_T *Arg)
-
-{
-	unsigned int S = Arg->W*Arg->H*Arg->Feat, CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last  = Min(First+ChunkCell, S);
-	signed char * __restrict__ In = (signed char *__restrict__) Arg->In;
-	signed char * __restrict__ Out = (signed char *__restrict__) Arg->Out;
-	signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos;
-	unsigned int Size = Max(0, Last-First);
-	unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN];
-
+void KerParReduct_CC_CHW2HWC_ReLUMN_SQ8(KerConvLinReduct_SQ8_T *Arg) {
+	KER_PAR_REDUCT_ACT_CHW2HWC(ACT_RELUMN, signed char, unsigned char, 8, 0);
+}
 
-	Ker_Activation_SQ8(In+First, Out+First, Size, ACT_NONE, ActScale, ActScaleN, 0, 0, 0);
-	gap_waitbarrier(0);
+void KerParReduct_CC_CHW2HWC_HSigmoid_SQ8(KerConvLinReduct_SQ8_T *Arg) {
+	KER_PAR_REDUCT_ACT_CHW2HWC(ACT_HSIGMOID, signed char, unsigned char, 8, 0);
 }
 
-void Ker_ReLU_SQ8(KerActivation_SQ8_T *Arg)
+void KerParReduct_CC_CHW2HWC_HSwish_SQ8(KerConvLinReduct_SQ8_T *Arg) {
+	KER_PAR_REDUCT_ACT_CHW2HWC(ACT_HSWISH, signed char, unsigned char, 8, 0);
+}
 
-{
-	unsigned int S = Arg->W*Arg->H*Arg->Feat, CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last  = Min(First+ChunkCell, S);
-	signed char * __restrict__ In = (signed char *__restrict__) Arg->In;
-	signed char * __restrict__ Out = (signed char *__restrict__) Arg->Out;
-	signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos;
-	unsigned int Size = Max(0, Last-First);
-	unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN];
-	int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0];
+void KerParReduct_CC_CHW2HWC_LeakyReLU_SQ8(KerConvLinReduct_SQ8_T *Arg) {
+	KER_PAR_REDUCT_ACT_CHW2HWC(ACT_LEAKYRELU, signed char, unsigned char, 8, 0);
+}
 
+void KerParReduct_CC_CHW2HWC_Sigmoid_SQ8(KerConvLinReduct_SQ8_T *Arg) {
+	KER_PAR_REDUCT_ACT_CHW2HWC(ACT_SIGMOID, signed char, unsigned char, 8, 0);
+}
 
-	if (ActScale) Ker_Activation_SQ8(In+First, Out+First, Size, ACT_RELU, ActScale, ActScaleN, A0, B0, C0);
-	else Ker_ActivationScale1_SQ8(In+First, Out+First, Size, ACT_RELU, A0, B0);
-	gap_waitbarrier(0);
+void KerParReduct_CC_CHW2HWC_Tanh_SQ8(KerConvLinReduct_SQ8_T *Arg) {
+	KER_PAR_REDUCT_ACT_CHW2HWC(ACT_TANH, signed char, unsigned char, 8, 0);
 }
 
-void Ker_ReLUN_SQ8(KerActivation_SQ8_T *Arg)
+/*
+ * Input scaling and reduction to 8b, then channel-centric activation, in place (Out location = In location). Features are evaluated in parallel.
+*/
+void KerParReductIO_CC_SQ8(KerConvLinReduct_SQ8_T *Arg) {
+	KER_PAR_REDUCT_IO_ACT_CHW(ACT_NONE, signed char, unsigned char, 8, 0);
+}
 
-{
-	unsigned int S = Arg->W*Arg->H*Arg->Feat, CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last  = Min(First+ChunkCell, S);
-	signed char * __restrict__ In = (signed char *__restrict__) Arg->In;
-	signed char * __restrict__ Out = (signed char *__restrict__) Arg->Out;
-	signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos;
-	unsigned int Size = Max(0, Last-First);
-	unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN];
-	int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0];
+void KerParReductIO_CC_ReLU_SQ8(KerConvLinReduct_SQ8_T *Arg) {
+	KER_PAR_REDUCT_IO_ACT_CHW(ACT_RELU, signed char, unsigned char, 8, 0);
+}
 
+void KerParReductIO_CC_ReLUN_SQ8(KerConvLinReduct_SQ8_T *Arg) {
+	KER_PAR_REDUCT_IO_ACT_CHW(ACT_RELUN, signed char, unsigned char, 8, 0);
+}
 
-	if (ActScale) Ker_Activation_SQ8(In+First, Out+First, Size, ACT_RELUN, ActScale, ActScaleN, A0, B0, C0);
-	else Ker_ActivationScale1_SQ8(In+First, Out+First, Size, ACT_RELUN, A0, B0);
-	gap_waitbarrier(0);
+void KerParReductIO_CC_ReLUM_SQ8(KerConvLinReduct_SQ8_T *Arg) {
+	KER_PAR_REDUCT_IO_ACT_CHW(ACT_RELUM, signed char, unsigned char, 8, 0);
 }
 
-void Ker_ReLUM_SQ8(KerActivation_SQ8_T *Arg)
+void KerParReductIO_CC_ReLUMN_SQ8(KerConvLinReduct_SQ8_T *Arg) {
+	KER_PAR_REDUCT_IO_ACT_CHW(ACT_RELUMN, signed char, unsigned char, 8, 0);
+}
 
-{
-	unsigned int S = Arg->W*Arg->H*Arg->Feat, CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last  = Min(First+ChunkCell, S);
-	signed char * __restrict__ In = (signed char *__restrict__) Arg->In;
-	signed char * __restrict__ Out = (signed char *__restrict__) Arg->Out;
-	signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos;
-	unsigned int Size = Max(0, Last-First);
-	unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN];
-	int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0];
+void KerParReductIO_CC_HSigmoid_SQ8(KerConvLinReduct_SQ8_T *Arg) {
+	KER_PAR_REDUCT_IO_ACT_CHW(ACT_HSIGMOID, signed char, unsigned char, 8, 0);
+}
 
+void KerParReductIO_CC_HSwish_SQ8(KerConvLinReduct_SQ8_T *Arg) {
+	KER_PAR_REDUCT_IO_ACT_CHW(ACT_HSWISH, signed char, unsigned char, 8, 0);
+}
 
-	if (ActScale) Ker_Activation_SQ8(In+First, Out+First, Size, ACT_RELUM, ActScale, ActScaleN, A0, B0, C0);
-	else Ker_ActivationScale1_SQ8(In+First, Out+First, Size, ACT_RELUM, A0, B0);
-	gap_waitbarrier(0);
+void KerParReductIO_CC_LeakyReLU_SQ8(KerConvLinReduct_SQ8_T *Arg) {
+	KER_PAR_REDUCT_IO_ACT_CHW(ACT_LEAKYRELU, signed char, unsigned char, 8, 0);
 }
 
-void Ker_ReLUMN_SQ8(KerActivation_SQ8_T *Arg)
+void KerParReductIO_CC_Sigmoid_SQ8(KerConvLinReduct_SQ8_T *Arg) {
+	KER_PAR_REDUCT_IO_ACT_CHW(ACT_SIGMOID, signed char, unsigned char, 8, 0);
+}
 
-{
-	unsigned int S = Arg->W*Arg->H*Arg->Feat, CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last  = Min(First+ChunkCell, S);
-	signed char * __restrict__ In = (signed char *__restrict__) Arg->In;
-	signed char * __restrict__ Out = (signed char *__restrict__) Arg->Out;
-	signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos;
-	unsigned int Size = Max(0, Last-First);
-	unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN];
-	int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0];
+void KerParReductIO_CC_Tanh_SQ8(KerConvLinReduct_SQ8_T *Arg) {
+	KER_PAR_REDUCT_IO_ACT_CHW(ACT_TANH, signed char, unsigned char, 8, 0);
+}
 
 
-	if (ActScale) Ker_Activation_SQ8(In+First, Out+First, Size, ACT_RELUMN, ActScale, ActScaleN, A0, B0, C0);
-	else Ker_ActivationScale1_SQ8(In+First, Out+First, Size, ACT_RELUMN, A0, B0);
-	gap_waitbarrier(0);
+/*
+ * Input scaling and reduction to 8b, then channel-centric activation, out of place (Out location != In location). Features are evaluated one after the other, each one in parallel.
+*/
+void KerReduct_CC_SQ8(KerConvLinReduct_SQ8_T *Arg) {
+	KER_REDUCT_ACT_CHW(ACT_NONE, signed char, unsigned char, 8, 0);
 }
 
-void Ker_HSigmoid_SQ8(KerActivation_SQ8_T *Arg)
-
-{
-	unsigned int S = Arg->W*Arg->H*Arg->Feat, CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last  = Min(First+ChunkCell, S);
-	signed char * __restrict__ In = (signed char *__restrict__) Arg->In;
-	signed char * __restrict__ Out = (signed char *__restrict__) Arg->Out;
-	signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos;
-	unsigned int Size = Max(0, Last-First);
-	unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN];
-	int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0];
+void KerReduct_CC_ReLU_SQ8(KerConvLinReduct_SQ8_T *Arg) {
+	KER_REDUCT_ACT_CHW(ACT_RELU, signed char, unsigned char, 8, 0);
+}
 
+void KerReduct_CC_ReLUN_SQ8(KerConvLinReduct_SQ8_T *Arg) {
+	KER_REDUCT_ACT_CHW(ACT_RELUN, signed char, unsigned char, 8, 0);
+}
 
-	Ker_Activation_SQ8(In+First, Out+First, Size, ACT_HSIGMOID, ActScale, ActScaleN, A0, B0, C0);
-	gap_waitbarrier(0);
+void KerReduct_CC_ReLUM_SQ8(KerConvLinReduct_SQ8_T *Arg) {
+	KER_REDUCT_ACT_CHW(ACT_RELUM, signed char, unsigned char, 8, 0);
 }
 
-void Ker_HSwish_SQ8(KerActivation_SQ8_T *Arg)
+void KerReduct_CC_ReLUMN_SQ8(KerConvLinReduct_SQ8_T *Arg) {
+	KER_REDUCT_ACT_CHW(ACT_RELUMN, signed char, unsigned char, 8, 0);
+}
 
-{
-	unsigned int S = Arg->W*Arg->H*Arg->Feat, CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last  = Min(First+ChunkCell, S);
-	signed char * __restrict__ In = (signed char *__restrict__) Arg->In;
-	signed char * __restrict__ Out = (signed char *__restrict__) Arg->Out;
-	signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos;
-	unsigned int Size = Max(0, Last-First);
-	unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN];
-	int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0];
+void KerReduct_CC_HSigmoid_SQ8(KerConvLinReduct_SQ8_T *Arg) {
+	KER_REDUCT_ACT_CHW(ACT_HSIGMOID, signed char, unsigned char, 8, 0);
+}
 
+void KerReduct_CC_HSwish_SQ8(KerConvLinReduct_SQ8_T *Arg) {
+	KER_REDUCT_ACT_CHW(ACT_HSWISH, signed char, unsigned char, 8, 0);
+}
 
-	Ker_Activation_SQ8(In+First, Out+First, Size, ACT_HSWISH, ActScale, ActScaleN, A0, B0, C0);
-	gap_waitbarrier(0);
+void KerReduct_CC_LeakyReLU_SQ8(KerConvLinReduct_SQ8_T *Arg) {
+	KER_REDUCT_ACT_CHW(ACT_LEAKYRELU, signed char, unsigned char, 8, 0);
 }
 
-void Ker_LeakyReLU_SQ8(KerActivation_SQ8_T *Arg)
+void KerReduct_CC_Sigmoid_SQ8(KerConvLinReduct_SQ8_T *Arg) {
+	KER_REDUCT_ACT_CHW(ACT_SIGMOID, signed char, unsigned char, 8, 0);
+}
 
-{
-	unsigned int S = Arg->W*Arg->H*Arg->Feat, CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last  = Min(First+ChunkCell, S);
-	signed char * __restrict__ In = (signed char *__restrict__) Arg->In;
-	signed char * __restrict__ Out = (signed char *__restrict__) Arg->Out;
-	signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos;
-	unsigned int Size = Max(0, Last-First);
-	unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN];
-	int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0];
+void KerReduct_CC_Tanh_SQ8(KerConvLinReduct_SQ8_T *Arg) {
+	KER_REDUCT_ACT_CHW(ACT_TANH, signed char, unsigned char, 8, 0);
+}
 
+/*
+ * Input scaling and reduction to 8b, then channel-centric activation, in place (Out location = In location). Features are evaluated one after the other, each one in parallel.
+*/
+void KerReductIO_CC_SQ8(KerConvLinReduct_SQ8_T *Arg) {
+	KER_REDUCT_IO_ACT_CHW(ACT_NONE, signed char, unsigned char, 8, 0);
+}
 
-	Ker_Activation_SQ8(In+First, Out+First, Size, ACT_LEAKYRELU, ActScale, ActScaleN, A0, B0, C0);
-	gap_waitbarrier(0);
+void KerReductIO_CC_ReLU_SQ8(KerConvLinReduct_SQ8_T *Arg) {
+	KER_REDUCT_IO_ACT_CHW(ACT_RELU, signed char, unsigned char, 8, 0);
 }
 
-void Ker_Sigmoid_SQ8(KerActivation_SQ8_T *Arg)
+void KerReductIO_CC_ReLUN_SQ8(KerConvLinReduct_SQ8_T *Arg) {
+	KER_REDUCT_IO_ACT_CHW(ACT_RELUN, signed char, unsigned char, 8, 0);
+}
 
-{
-	unsigned int S = Arg->W*Arg->H*Arg->Feat, CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last  = Min(First+ChunkCell, S);
-	signed char * __restrict__ In = (signed char *__restrict__) Arg->In;
-	signed char * __restrict__ Out = (signed char *__restrict__) Arg->Out;
-	signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos;
-	unsigned int Size = Max(0, Last-First);
-	unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN];
-	int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0];
+void KerReductIO_CC_ReLUM_SQ8(KerConvLinReduct_SQ8_T *Arg) {
+	KER_REDUCT_IO_ACT_CHW(ACT_RELUM, signed char, unsigned char, 8, 0);
+}
 
+void KerReductIO_CC_ReLUMN_SQ8(KerConvLinReduct_SQ8_T *Arg) {
+	KER_REDUCT_IO_ACT_CHW(ACT_RELUMN, signed char, unsigned char, 8, 0);
+}
 
-	Ker_Activation_SQ8(In+First, Out+First, Size, ACT_SIGMOID, ActScale, ActScaleN, A0, B0, C0);
-	gap_waitbarrier(0);
+void KerReductIO_CC_HSigmoid_SQ8(KerConvLinReduct_SQ8_T *Arg) {
+	KER_REDUCT_IO_ACT_CHW(ACT_HSIGMOID, signed char, unsigned char, 8, 0);
 }
 
-void Ker_Tanh_SQ8(KerActivation_SQ8_T *Arg)
+void KerReductIO_CC_HSwish_SQ8(KerConvLinReduct_SQ8_T *Arg) {
+	KER_REDUCT_IO_ACT_CHW(ACT_HSWISH, signed char, unsigned char, 8, 0);
+}
 
-{
-	unsigned int S = Arg->W*Arg->H*Arg->Feat, CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last  = Min(First+ChunkCell, S);
-	signed char * __restrict__ In = (signed char *__restrict__) Arg->In;
-	signed char * __restrict__ Out = (signed char *__restrict__) Arg->Out;
-	signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos;
-	unsigned int Size = Max(0, Last-First);
-	unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN];
-	int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0];
+void KerReductIO_CC_LeakyReLU_SQ8(KerConvLinReduct_SQ8_T *Arg) {
+	KER_REDUCT_IO_ACT_CHW(ACT_LEAKYRELU, signed char, unsigned char, 8, 0);
+}
 
+void KerReductIO_CC_Sigmoid_SQ8(KerConvLinReduct_SQ8_T *Arg) {
+	KER_REDUCT_IO_ACT_CHW(ACT_SIGMOID, signed char, unsigned char, 8, 0);
+}
 
-	Ker_Activation_SQ8(In+First, Out+First, Size, ACT_TANH, ActScale, ActScaleN, A0, B0, C0);
-	gap_waitbarrier(0);
+void KerReductIO_CC_Tanh_SQ8(KerConvLinReduct_SQ8_T *Arg) {
+	KER_REDUCT_IO_ACT_CHW(ACT_TANH, signed char, unsigned char, 8, 0);
 }
 
 /*
- * Standalone Scaled Activation with Extra Scale before activation, Features are evaluated one after the other in parallel
+ * Standalone scaled activation, without the per-channel Scale[c]/ScaleN[c] reduction. All the elements can be evaluated in parallel.
 */
 
-void Ker_ActNone_ScaleIn_SQ8(KerActivation_SQ8_T *Arg)
-
-{
-	unsigned int S = Arg->W*Arg->H*Arg->Feat, CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last  = Min(First+ChunkCell, S);
-	signed char * __restrict__ In = (signed char *__restrict__) Arg->In;
-	signed char * __restrict__ Out = (signed char *__restrict__) Arg->Out;
-	signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos;
-	unsigned int Size = Max(0, Last-First);
-	unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN];
-	unsigned int In1Scale = ((unsigned char *)Arg->Infos)[AT_INF_IN1SCALE], In1ScaleN = ((unsigned char *)Arg->Infos)[AT_INF_IN1SCALEN];
-	int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0];
-
-
-	if (ActScale) Ker_Activation_ScaleIn_SQ8(In+First, Out+First, In1Scale, In1ScaleN, Size, ACT_NONE, ActScale, ActScaleN, A0, B0, C0);
-	else Ker_ActivationScale1_ScaleIn_SQ8(In+First, Out+First, In1Scale, In1ScaleN, Size, ACT_NONE, A0, B0);
-	gap_waitbarrier(0);
+void Ker_ActNone_SQ8(KerActivation_SQ8_T *Arg) {
+	KER_ACT(ACT_NONE, signed char, signed char, unsigned char, 8, 0);
 }
 
-void Ker_ReLU_ScaleIn_SQ8(KerActivation_SQ8_T *Arg)
-
-{
-	unsigned int S = Arg->W*Arg->H*Arg->Feat, CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last  = Min(First+ChunkCell, S);
-	signed char * __restrict__ In = (signed char *__restrict__) Arg->In;
-	signed char * __restrict__ Out = (signed char *__restrict__) Arg->Out;
-	signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos;
-	unsigned int Size = Max(0, Last-First);
-	unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN];
-	unsigned int In1Scale = ((unsigned char *)Arg->Infos)[AT_INF_IN1SCALE], In1ScaleN = ((unsigned char *)Arg->Infos)[AT_INF_IN1SCALEN];
-	int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0];
-
-
-	if (ActScale) Ker_Activation_ScaleIn_SQ8(In+First, Out+First, In1Scale, In1ScaleN, Size, ACT_RELU, ActScale, ActScaleN, A0, B0, C0);
-	else Ker_ActivationScale1_ScaleIn_SQ8(In+First, Out+First, In1Scale, In1ScaleN, Size, ACT_RELU, A0, B0);
-	gap_waitbarrier(0);
+void Ker_ReLU_SQ8(KerActivation_SQ8_T *Arg) {
+	KER_ACT(ACT_RELU, signed char, signed char, unsigned char, 8, 0);
 }
 
-void Ker_ReLUN_ScaleIn_SQ8(KerActivation_SQ8_T *Arg)
-
-{
-	unsigned int S = Arg->W*Arg->H*Arg->Feat, CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last  = Min(First+ChunkCell, S);
-	signed char * __restrict__ In = (signed char *__restrict__) Arg->In;
-	signed char * __restrict__ Out = (signed char *__restrict__) Arg->Out;
-	signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos;
-	unsigned int Size = Max(0, Last-First);
-	unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN];
-	unsigned int In1Scale = ((unsigned char *)Arg->Infos)[AT_INF_IN1SCALE], In1ScaleN = ((unsigned char *)Arg->Infos)[AT_INF_IN1SCALEN];
-	int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0];
-
-
-	if (ActScale) Ker_Activation_ScaleIn_SQ8(In+First, Out+First, In1Scale, In1ScaleN, Size, ACT_RELUN, ActScale, ActScaleN, A0, B0, C0);
-	else Ker_ActivationScale1_ScaleIn_SQ8(In+First, Out+First, In1Scale, In1ScaleN, Size, ACT_RELUN, A0, B0);
-	gap_waitbarrier(0);
+void Ker_ReLUN_SQ8(KerActivation_SQ8_T *Arg) {
+	KER_ACT(ACT_RELUN, signed char, signed char, unsigned char, 8, 0);
 }
 
-void Ker_ReLUM_ScaleIn_SQ8(KerActivation_SQ8_T *Arg)
-
-{
-	unsigned int S = Arg->W*Arg->H*Arg->Feat, CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last  = Min(First+ChunkCell, S);
-	signed char * __restrict__ In = (signed char *__restrict__) Arg->In;
-	signed char * __restrict__ Out = (signed char *__restrict__) Arg->Out;
-	signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos;
-	unsigned int Size = Max(0, Last-First);
-	unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN];
-	unsigned int In1Scale = ((unsigned char *)Arg->Infos)[AT_INF_IN1SCALE], In1ScaleN = ((unsigned char *)Arg->Infos)[AT_INF_IN1SCALEN];
-	int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0];
-
-
-	if (ActScale) Ker_Activation_ScaleIn_SQ8(In+First, Out+First, In1Scale, In1ScaleN, Size, ACT_RELUM, ActScale, ActScaleN, A0, B0, C0);
-	else Ker_ActivationScale1_ScaleIn_SQ8(In+First, Out+First, In1Scale, In1ScaleN, Size, ACT_RELUM, A0, B0);
-	gap_waitbarrier(0);
+void Ker_ReLUM_SQ8(KerActivation_SQ8_T *Arg) {
+	KER_ACT(ACT_RELUM, signed char, signed char, unsigned char, 8, 0);
 }
 
-void Ker_ReLUMN_ScaleIn_SQ8(KerActivation_SQ8_T *Arg)
-
-{
-	unsigned int S = Arg->W*Arg->H*Arg->Feat, CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last  = Min(First+ChunkCell, S);
-	signed char * __restrict__ In = (signed char *__restrict__) Arg->In;
-	signed char * __restrict__ Out = (signed char *__restrict__) Arg->Out;
-	signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos;
-	unsigned int Size = Max(0, Last-First);
-	unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN];
-	unsigned int In1Scale = ((unsigned char *)Arg->Infos)[AT_INF_IN1SCALE], In1ScaleN = ((unsigned char *)Arg->Infos)[AT_INF_IN1SCALEN];
-	int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0];
-
-
-	if (ActScale) Ker_Activation_ScaleIn_SQ8(In+First, Out+First, In1Scale, In1ScaleN, Size, ACT_RELUMN, ActScale, ActScaleN, A0, B0, C0);
-	else Ker_ActivationScale1_ScaleIn_SQ8(In+First, Out+First, In1Scale, In1ScaleN, Size, ACT_RELUMN, A0, B0);
-	gap_waitbarrier(0);
+void Ker_ReLUMN_SQ8(KerActivation_SQ8_T *Arg) {
+	KER_ACT(ACT_RELUMN, signed char, signed char, unsigned char, 8, 0);
 }
 
-void Ker_HSigmoid_ScaleIn_SQ8(KerActivation_SQ8_T *Arg)
-
-{
-	unsigned int S = Arg->W*Arg->H*Arg->Feat, CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last  = Min(First+ChunkCell, S);
-	signed char * __restrict__ In = (signed char *__restrict__) Arg->In;
-	signed char * __restrict__ Out = (signed char *__restrict__) Arg->Out;
-	signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos;
-	unsigned int Size = Max(0, Last-First);
-	unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN];
-	unsigned int In1Scale = ((unsigned char *)Arg->Infos)[AT_INF_IN1SCALE], In1ScaleN = ((unsigned char *)Arg->Infos)[AT_INF_IN1SCALEN];
-	int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0];
-
-
-	Ker_Activation_ScaleIn_SQ8(In+First, Out+First, In1Scale, In1ScaleN, Size, ACT_HSIGMOID, ActScale, ActScaleN, A0, B0, C0);
-	gap_waitbarrier(0);
+void Ker_HSigmoid_SQ8(KerActivation_SQ8_T *Arg) {
+	KER_ACT(ACT_HSIGMOID, signed char, signed char, unsigned char, 8, 0);
 }
 
-void Ker_HSwish_ScaleIn_SQ8(KerActivation_SQ8_T *Arg)
-
-{
-	unsigned int S = Arg->W*Arg->H*Arg->Feat, CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last  = Min(First+ChunkCell, S);
-	signed char * __restrict__ In = (signed char *__restrict__) Arg->In;
-	signed char * __restrict__ Out = (signed char *__restrict__) Arg->Out;
-	signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos;
-	unsigned int Size = Max(0, Last-First);
-	unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN];
-	unsigned int In1Scale = ((unsigned char *)Arg->Infos)[AT_INF_IN1SCALE], In1ScaleN = ((unsigned char *)Arg->Infos)[AT_INF_IN1SCALEN];
-	int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0];
-
-
-	Ker_Activation_ScaleIn_SQ8(In+First, Out+First, In1Scale, In1ScaleN, Size, ACT_HSWISH, ActScale, ActScaleN, A0, B0, C0);
-	gap_waitbarrier(0);
+void Ker_HSwish_SQ8(KerActivation_SQ8_T *Arg) {
+	KER_ACT(ACT_HSWISH, signed char, signed char, unsigned char, 8, 0);
 }
 
-void Ker_LeakyReLU_ScaleIn_SQ8(KerActivation_SQ8_T *Arg)
-
-{
-	unsigned int S = Arg->W*Arg->H*Arg->Feat, CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last  = Min(First+ChunkCell, S);
-	signed char * __restrict__ In = (signed char *__restrict__) Arg->In;
-	signed char * __restrict__ Out = (signed char *__restrict__) Arg->Out;
-	signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos;
-	unsigned int Size = Max(0, Last-First);
-	unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN];
-	unsigned int In1Scale = ((unsigned char *)Arg->Infos)[AT_INF_IN1SCALE], In1ScaleN = ((unsigned char *)Arg->Infos)[AT_INF_IN1SCALEN];
-	int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0];
-
-
-	Ker_Activation_ScaleIn_SQ8(In+First, Out+First, In1Scale, In1ScaleN, Size, ACT_LEAKYRELU, ActScale, ActScaleN, A0, B0, C0);
-	gap_waitbarrier(0);
+void Ker_LeakyReLU_SQ8(KerActivation_SQ8_T *Arg) {
+	KER_ACT(ACT_LEAKYRELU, signed char, signed char, unsigned char, 8, 0);
 }
 
-void Ker_Sigmoid_ScaleIn_SQ8(KerActivation_SQ8_T *Arg)
-
-{
-	unsigned int S = Arg->W*Arg->H*Arg->Feat, CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last  = Min(First+ChunkCell, S);
-	signed char * __restrict__ In = (signed char *__restrict__) Arg->In;
-	signed char * __restrict__ Out = (signed char *__restrict__) Arg->Out;
-	signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos;
-	unsigned int Size = Max(0, Last-First);
-	unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN];
-	unsigned int In1Scale = ((unsigned char *)Arg->Infos)[AT_INF_IN1SCALE], In1ScaleN = ((unsigned char *)Arg->Infos)[AT_INF_IN1SCALEN];
-	int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0];
-
-
-	Ker_Activation_ScaleIn_SQ8(In+First, Out+First, In1Scale, In1ScaleN, Size, ACT_SIGMOID, ActScale, ActScaleN, A0, B0, C0);
-	gap_waitbarrier(0);
+void Ker_Sigmoid_SQ8(KerActivation_SQ8_T *Arg) {
+	KER_ACT(ACT_SIGMOID, signed char, signed char, unsigned char, 8, 0);
 }
 
-void Ker_Tanh_ScaleIn_SQ8(KerActivation_SQ8_T *Arg)
-
-{
-	unsigned int S = Arg->W*Arg->H*Arg->Feat, CoreId = gap_coreid(), ChunkCell = ChunkSize(S), First = CoreId*ChunkCell, Last  = Min(First+ChunkCell, S);
-	signed char * __restrict__ In = (signed char *__restrict__) Arg->In;
-	signed char * __restrict__ Out = (signed char *__restrict__) Arg->Out;
-	signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos;
-	unsigned int Size = Max(0, Last-First);
-	unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN];
-	unsigned int In1Scale = ((unsigned char *)Arg->Infos)[AT_INF_IN1SCALE], In1ScaleN = ((unsigned char *)Arg->Infos)[AT_INF_IN1SCALEN];
-	int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0];
+void Ker_Tanh_SQ8(KerActivation_SQ8_T *Arg) {
+	KER_ACT(ACT_TANH, signed char, signed char, unsigned char, 8, 0);
+}
 
+/*
+ * From int32 to 8/16 bits + optional activation - Reduction with Scale[c] ScaleN[c] - All the elements can be evaluated in parallel
+*/
 
-	Ker_Activation_ScaleIn_SQ8(In+First, Out+First, In1Scale, In1ScaleN, Size, ACT_TANH, ActScale, ActScaleN, A0, B0, C0);
-	gap_waitbarrier(0);
+/* ------------------------------------------------------ Signed 8 bits ------------------------------------------------------ */
+void KerReduct_CC_NoScale_SQ8(KerConvLinReduct_SQ8_T *Arg) {
+	KER_ACT(ACT_NONE, int, signed char, unsigned char, 8, 0);
 }
 
-
-/*
- * Input Scaling and reduction to 8b then channel cnetric activation, Out location != In location. Features are evaluated in parallel
-*/
-void KerReduct_CC_NoScale_SQ8(KerConvLinReduct_SQ8_T *Arg)
-
-{
-	int Feat = Arg->Feat;
-	int W = Arg->W;
-	int H = Arg->H;
-	unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(H*W*Feat), First = CoreId*ChunkCell, Last  = Min(First+ChunkCell, Feat*H*W);
-	int * __restrict__ In = (int *__restrict__) Arg->In;
-	signed char * __restrict__ Out = (signed char *__restrict__) Arg->Out;
-	signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos;
-	unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN];
-	int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0];
-
-	for (int i=First; i<Last; i++) {
-		Out[i] = gap_clip(In[i], 7);
-	}
-	gap_waitbarrier(0);
+void KerReduct_CC_NoScale_ReLU_SQ8(KerConvLinReduct_SQ8_T *Arg) {
+	KER_ACT(ACT_RELU, int, signed char, unsigned char, 8, 0);
 }
 
-void KerReduct_CC_NoScale_ReLU_SQ8(KerConvLinReduct_SQ8_T *Arg)
+void KerReduct_CC_NoScale_ReLUN_SQ8(KerConvLinReduct_SQ8_T *Arg) {
+	KER_ACT(ACT_RELUN, int, signed char, unsigned char, 8, 0);
+}
 
-{
-	int Feat = Arg->Feat;
-	int W = Arg->W;
-	int H = Arg->H;
-	unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(H*W*Feat), First = CoreId*ChunkCell, Last  = Min(First+ChunkCell, Feat*H*W);
-	int * __restrict__ In = (int *__restrict__) Arg->In;
-	signed char * __restrict__ Out = (signed char *__restrict__) Arg->Out;
-	signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos;
-	unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN];
-	int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0];
+void KerReduct_CC_NoScale_ReLUM_SQ8(KerConvLinReduct_SQ8_T *Arg) {
+	KER_ACT(ACT_RELUM, int, signed char, unsigned char, 8, 0);
+}
 
-	for (int i=First; i<Last; i++) {
-	        Out[i] = gap_clip(AT_SCALE(Max(0, In[i]), ActScale, ActScaleN), 7);
-	}
-	gap_waitbarrier(0);
+void KerReduct_CC_NoScale_ReLUMN_SQ8(KerConvLinReduct_SQ8_T *Arg) {
+	KER_ACT(ACT_RELUMN, int, signed char, unsigned char, 8, 0);
 }
 
-void KerReduct_CC_NoScale_ReLUN_SQ8(KerConvLinReduct_SQ8_T *Arg)
+void KerReduct_CC_NoScale_HSigmoid_SQ8(KerConvLinReduct_SQ8_T *Arg) {
+	KER_ACT(ACT_HSIGMOID, int, signed char, unsigned char, 8, 0);
+}
 
-{
-	int Feat = Arg->Feat;
-	int W = Arg->W;
-	int H = Arg->H;
-	unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(H*W*Feat), First = CoreId*ChunkCell, Last  = Min(First+ChunkCell, Feat*H*W);
-	int * __restrict__ In = (int *__restrict__) Arg->In;
-	signed char * __restrict__ Out = (signed char *__restrict__) Arg->Out;
-	signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos;
-	unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN];
-	int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0];
+void KerReduct_CC_NoScale_HSwish_SQ8(KerConvLinReduct_SQ8_T *Arg) {
+	KER_ACT(ACT_HSWISH, int, signed char, unsigned char, 8, 0);
+}
 
-	for (int i=First; i<Last; i++) {
-	        Out[i] = gap_clip(AT_SCALE(AT_CLIP_POS(In[i], A0), ActScale, ActScaleN), 7);
-	}
-	gap_waitbarrier(0);
+void KerReduct_CC_NoScale_LeakyReLU_SQ8(KerConvLinReduct_SQ8_T *Arg) {
+	KER_ACT(ACT_LEAKYRELU, int, signed char, unsigned char, 8, 0);
+}
 
+void KerReduct_CC_NoScale_Sigmoid_SQ8(KerConvLinReduct_SQ8_T *Arg) {
+	KER_ACT(ACT_SIGMOID, int, signed char, unsigned char, 8, 0);
 }
 
-void KerReduct_CC_NoScale_ReLUM_SQ8(KerConvLinReduct_SQ8_T *Arg)
+void KerReduct_CC_NoScale_Tanh_SQ8(KerConvLinReduct_SQ8_T *Arg) {
+	KER_ACT(ACT_TANH, int, signed char, unsigned char, 8, 0);
+}
 
-{
-	int Feat = Arg->Feat;
-	int W = Arg->W;
-	int H = Arg->H;
-	unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(H*W*Feat), First = CoreId*ChunkCell, Last  = Min(First+ChunkCell, Feat*H*W);
-	int * __restrict__ In = (int *__restrict__) Arg->In;
-	signed char * __restrict__ Out = (signed char *__restrict__) Arg->Out;
-	signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos;
-	unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN];
-	int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0];
 
-	for (int i=First; i<Last; i++) {
-		Out[i] = gap_clip(AT_SCALE(Max(A0, In[i]), ActScale, ActScaleN), 7);
-	}
-	gap_waitbarrier(0);
+/* ------------------------------------------------------ Signed 16 bits ------------------------------------------------------ */
+void KerReduct_CC_NoScale_SQ16(KerConvLinReduct_SQ8_T *Arg) {
+	KER_ACT(ACT_NONE, int, signed short, unsigned short, 16, 0);
+}
 
+void KerReduct_CC_NoScale_ReLU_SQ16(KerConvLinReduct_SQ8_T *Arg) {
+	KER_ACT(ACT_RELU, int, signed short, unsigned short, 16, 0);
 }
 
-void KerReduct_CC_NoScale_ReLUMN_SQ8(KerConvLinReduct_SQ8_T *Arg)
+void KerReduct_CC_NoScale_ReLUN_SQ16(KerConvLinReduct_SQ8_T *Arg) {
+	KER_ACT(ACT_RELUN, int, signed short, unsigned short, 16, 0);
+}
 
-{
-	int Feat = Arg->Feat;
-	int W = Arg->W;
-	int H = Arg->H;
-	unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(H*W*Feat), First = CoreId*ChunkCell, Last  = Min(First+ChunkCell, Feat*H*W);
-	int * __restrict__ In = (int *__restrict__) Arg->In;
-	unsigned char * __restrict__ Scale = (unsigned char *__restrict__) Arg->Scale;
-	unsigned char * __restrict__ ScaleN = (unsigned char *__restrict__) Arg->ScaleN;
-	signed char * __restrict__ Out = (signed char *__restrict__) Arg->Out;
-	signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos;
-	unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN];
-	int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0];
+void KerReduct_CC_NoScale_ReLUM_SQ16(KerConvLinReduct_SQ8_T *Arg) {
+	KER_ACT(ACT_RELUM, int, signed short, unsigned short, 16, 0);
+}
 
-	for (int i=First; i<Last; i++) {
-		Out[i] = gap_clip(AT_SCALE(Min(B0, Max(A0, In[i])), ActScale, ActScaleN), 7);
-	}
-	gap_waitbarrier(0);
+void KerReduct_CC_NoScale_ReLUMN_SQ16(KerConvLinReduct_SQ8_T *Arg) {
+	KER_ACT(ACT_RELUMN, int, signed short, unsigned short, 16, 0);
+}
 
+void KerReduct_CC_NoScale_HSigmoid_SQ16(KerConvLinReduct_SQ8_T *Arg) {
+	KER_ACT(ACT_HSIGMOID, int, signed short, unsigned short, 16, 0);
 }
 
-void KerReduct_CC_NoScale_HSigmoid_SQ8(KerConvLinReduct_SQ8_T *Arg)
+void KerReduct_CC_NoScale_HSwish_SQ16(KerConvLinReduct_SQ8_T *Arg) {
+	KER_ACT(ACT_HSWISH, int, signed short, unsigned short, 16, 0);
+}
 
-{
-	int Feat = Arg->Feat;
-	int W = Arg->W;
-	int H = Arg->H;
-	unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(H*W*Feat), First = CoreId*ChunkCell, Last  = Min(First+ChunkCell, Feat*H*W);
-	int * __restrict__ In = (int *__restrict__) Arg->In;
-	signed char * __restrict__ Out = (signed char *__restrict__) Arg->Out;
-	signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos;
-	unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN];
-	int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0];
+void KerReduct_CC_NoScale_LeakyReLU_SQ16(KerConvLinReduct_SQ8_T *Arg) {
+	KER_ACT(ACT_LEAKYRELU, int, signed short, unsigned short, 16, 0);
+}
 
-	for (int i=First; i<Last; i++) {
-		Out[i] = gap_clip(AT_SCALE(AT_CLIP_POS(In[i] + B0, A0) * C0, ActScale, ActScaleN), 7);
-	}
-	gap_waitbarrier(0);
+void KerReduct_CC_NoScale_Sigmoid_SQ16(KerConvLinReduct_SQ8_T *Arg) {
+	KER_ACT(ACT_SIGMOID, int, signed short, unsigned short, 16, 0);
 }
 
-void KerReduct_CC_NoScale_HSwish_SQ8(KerConvLinReduct_SQ8_T *Arg)
+void KerReduct_CC_NoScale_Tanh_SQ16(KerConvLinReduct_SQ8_T *Arg) {
+	KER_ACT(ACT_TANH, int, signed short, unsigned short, 16, 0);
+}
 
-{
-	int Feat = Arg->Feat;
-	int W = Arg->W;
-	int H = Arg->H;
-	unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(H*W*Feat), First = CoreId*ChunkCell, Last  = Min(First+ChunkCell, Feat*H*W);
-	int * __restrict__ In = (int *__restrict__) Arg->In;
-	signed char * __restrict__ Out = (signed char *__restrict__) Arg->Out;
-	signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos;
-	unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN];
-	int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0];
 
-	for (int i=First; i<Last; i++) {
-		Out[i] = gap_clip(AT_SCALE(AT_CLIP_POS(In[i] + B0, A0) * C0 * In[i], ActScale, ActScaleN), 7);
-	}
-	gap_waitbarrier(0);
-}
-
-void KerReduct_CC_NoScale_LeakyReLU_SQ8(KerConvLinReduct_SQ8_T *Arg)
-
-{
-	int Feat = Arg->Feat;
-	int W = Arg->W;
-	int H = Arg->H;
-	unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(H*W*Feat), First = CoreId*ChunkCell, Last  = Min(First+ChunkCell, Feat*H*W);
-	int * __restrict__ In = (int *__restrict__) Arg->In;
-	signed char * __restrict__ Out = (signed char *__restrict__) Arg->Out;
-	signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos;
-	unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN];
-	int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0];
-
-	for (int i=First; i<Last; i++) {
-		int Acc0 = In[i];
-		int Neg0 = gap_bitextractu(Acc0, 1, 31), Pos0 = !Neg0;
-		int Acc0N = AT_NORM(Acc0 * A0, 7);
-		Out[i] = gap_clip(AT_SCALE((Neg0*Acc0N+Pos0*Acc0), ActScale, ActScaleN), 7);
-	}
-	gap_waitbarrier(0);
+/* ---------------------------------------------------- Unsigned 8 bits ----------------------------------------------------- */
+void KerReduct_CC_NoScale_USQ8(KerConvLinReduct_SQ8_T *Arg) {
+	KER_ACT(ACT_NONE, int, unsigned char, unsigned char, 8, 1);
 }
 
-void KerReduct_CC_NoScale_Sigmoid_SQ8(KerConvLinReduct_SQ8_T *Arg)
+void KerReduct_CC_NoScale_ReLU_USQ8(KerConvLinReduct_SQ8_T *Arg) {
+	KER_ACT(ACT_RELU, int, unsigned char, unsigned char, 8, 1);
+}
 
-{
-	int Feat = Arg->Feat;
-	int W = Arg->W;
-	int H = Arg->H;
-	unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(H*W*Feat), First = CoreId*ChunkCell, Last  = Min(First+ChunkCell, Feat*H*W);
-	int * __restrict__ In = (int *__restrict__) Arg->In;
-	signed char * __restrict__ Out = (signed char *__restrict__) Arg->Out;
-	signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos;
-	unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN];
-	int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0];
+void KerReduct_CC_NoScale_ReLUN_USQ8(KerConvLinReduct_SQ8_T *Arg) {
+	KER_ACT(ACT_RELUN, int, unsigned char, unsigned char, 8, 1);
+}
 
-	for (int i=First; i<Last; i++) {
-		Out[i] = gap_clip(AT_SCALE((Sigmoid(In[i] << 8) >> 8), ActScale, ActScaleN), 7);
-	}
-	gap_waitbarrier(0);
+void KerReduct_CC_NoScale_ReLUM_USQ8(KerConvLinReduct_SQ8_T *Arg) {
+	KER_ACT(ACT_RELUM, int, unsigned char, unsigned char, 8, 1);
 }
 
-void KerReduct_CC_NoScale_Tanh_SQ8(KerConvLinReduct_SQ8_T *Arg)
+void KerReduct_CC_NoScale_ReLUMN_USQ8(KerConvLinReduct_SQ8_T *Arg) {
+	KER_ACT(ACT_RELUMN, int, unsigned char, unsigned char, 8, 1);
+}
 
-{
-	int Feat = Arg->Feat;
-	int W = Arg->W;
-	int H = Arg->H;
-	unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(H*W*Feat), First = CoreId*ChunkCell, Last  = Min(First+ChunkCell, Feat*H*W);
-	int * __restrict__ In = (int *__restrict__) Arg->In;
-	signed char * __restrict__ Out = (signed char *__restrict__) Arg->Out;
-	signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos;
-	unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN];
-	int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0];
+void KerReduct_CC_NoScale_HSigmoid_USQ8(KerConvLinReduct_SQ8_T *Arg) {
+	KER_ACT(ACT_HSIGMOID, int, unsigned char, unsigned char, 8, 1);
+}
 
-	for (int i=First; i<Last; i++) {
-		Out[i] = gap_clip(AT_SCALE((Tanh(In[i] << 8) >> 8), ActScale, ActScaleN), 7);
-	}
-	gap_waitbarrier(0);
+void KerReduct_CC_NoScale_HSwish_USQ8(KerConvLinReduct_SQ8_T *Arg) {
+	KER_ACT(ACT_HSWISH, int, unsigned char, unsigned char, 8, 1);
 }
 
+void KerReduct_CC_NoScale_LeakyReLU_USQ8(KerConvLinReduct_SQ8_T *Arg) {
+	KER_ACT(ACT_LEAKYRELU, int, unsigned char, unsigned char, 8, 1);
+}
 
-/*
- * Input Scaling and reduction to 8b then channel cnetric activation, Out location != In location. Features are evaluated in parallel
-*/
-void KerReduct_CC_NoScale_SQ16(KerConvLinReduct_SQ8_T *Arg)
-
-{
-	int Feat = Arg->Feat;
-	int W = Arg->W;
-	int H = Arg->H;
-	unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(H*W*Feat), First = CoreId*ChunkCell, Last  = Min(First+ChunkCell, Feat*H*W);
-	int * __restrict__ In = (int *__restrict__) Arg->In;
-	signed short * __restrict__ Out = (signed short *__restrict__) Arg->Out;
-	signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos;
-	unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN];
-	int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0];
-
-	for (int i=First; i<Last; i++) {
-		Out[i] = gap_clip(In[i], 7);
-	}
-	gap_waitbarrier(0);
+void KerReduct_CC_NoScale_Sigmoid_USQ8(KerConvLinReduct_SQ8_T *Arg) {
+	KER_ACT(ACT_SIGMOID, int, unsigned char, unsigned char, 8, 1);
 }
 
-void KerReduct_CC_NoScale_ReLU_SQ16(KerConvLinReduct_SQ8_T *Arg)
+void KerReduct_CC_NoScale_Tanh_USQ8(KerConvLinReduct_SQ8_T *Arg) {
+	KER_ACT(ACT_TANH, int, unsigned char, unsigned char, 8, 1);
+}
 
-{
-	int Feat = Arg->Feat;
-	int W = Arg->W;
-	int H = Arg->H;
-	unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(H*W*Feat), First = CoreId*ChunkCell, Last  = Min(First+ChunkCell, Feat*H*W);
-	int * __restrict__ In = (int *__restrict__) Arg->In;
-	signed short * __restrict__ Out = (signed short *__restrict__) Arg->Out;
-	signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos;
-	unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN];
-	int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0];
 
-	for (int i=First; i<Last; i++) {
-	        Out[i] = gap_clip(AT_SCALE(Max(0, In[i]), ActScale, ActScaleN), 15);
-	}
-	gap_waitbarrier(0);
+/* ---------------------------------------------------- Unsigned 16 bits ----------------------------------------------------- */
+void KerReduct_CC_NoScale_USQ16(KerConvLinReduct_SQ8_T *Arg) {
+	KER_ACT(ACT_NONE, int, unsigned short, unsigned short, 16, 1);
 }
 
-void KerReduct_CC_NoScale_ReLUN_SQ16(KerConvLinReduct_SQ8_T *Arg)
-
-{
-	int Feat = Arg->Feat;
-	int W = Arg->W;
-	int H = Arg->H;
-	unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(H*W*Feat), First = CoreId*ChunkCell, Last  = Min(First+ChunkCell, Feat*H*W);
-	int * __restrict__ In = (int *__restrict__) Arg->In;
-	signed short * __restrict__ Out = (signed short *__restrict__) Arg->Out;
-	signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos;
-	unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN];
-	int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0];
+void KerReduct_CC_NoScale_ReLU_USQ16(KerConvLinReduct_SQ8_T *Arg) {
+	KER_ACT(ACT_RELU, int, unsigned short, unsigned short, 16, 1);
+}
 
-	for (int i=First; i<Last; i++) {
-	        Out[i] = gap_clip(AT_SCALE(AT_CLIP_POS(In[i], A0), ActScale, ActScaleN), 15);
-	}
-	gap_waitbarrier(0);
+void KerReduct_CC_NoScale_ReLUN_USQ16(KerConvLinReduct_SQ8_T *Arg) {
+	KER_ACT(ACT_RELUN, int, unsigned short, unsigned short, 16, 1);
+}
 
+void KerReduct_CC_NoScale_ReLUM_USQ16(KerConvLinReduct_SQ8_T *Arg) {
+	KER_ACT(ACT_RELUM, int, unsigned short, unsigned short, 16, 1);
 }
 
-void KerReduct_CC_NoScale_ReLUM_SQ16(KerConvLinReduct_SQ8_T *Arg)
+void KerReduct_CC_NoScale_ReLUMN_USQ16(KerConvLinReduct_SQ8_T *Arg) {
+	KER_ACT(ACT_RELUMN, int, unsigned short, unsigned short, 16, 1);
+}
 
-{
-	int Feat = Arg->Feat;
-	int W = Arg->W;
-	int H = Arg->H;
-	unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(H*W*Feat), First = CoreId*ChunkCell, Last  = Min(First+ChunkCell, Feat*H*W);
-	int * __restrict__ In = (int *__restrict__) Arg->In;
-	signed short * __restrict__ Out = (signed short *__restrict__) Arg->Out;
-	signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos;
-	unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN];
-	int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0];
+void KerReduct_CC_NoScale_HSigmoid_USQ16(KerConvLinReduct_SQ8_T *Arg) {
+	KER_ACT(ACT_HSIGMOID, int, unsigned short, unsigned short, 16, 1);
+}
 
-	for (int i=First; i<Last; i++) {
-		Out[i] = gap_clip(AT_SCALE(Max(A0, In[i]), ActScale, ActScaleN), 15);
-	}
-	gap_waitbarrier(0);
+void KerReduct_CC_NoScale_HSwish_USQ16(KerConvLinReduct_SQ8_T *Arg) {
+	KER_ACT(ACT_HSWISH, int, unsigned short, unsigned short, 16, 1);
+}
 
+void KerReduct_CC_NoScale_LeakyReLU_USQ16(KerConvLinReduct_SQ8_T *Arg) {
+	KER_ACT(ACT_LEAKYRELU, int, unsigned short, unsigned short, 16, 1);
 }
 
-void KerReduct_CC_NoScale_ReLUMN_SQ16(KerConvLinReduct_SQ8_T *Arg)
+void KerReduct_CC_NoScale_Sigmoid_USQ16(KerConvLinReduct_SQ8_T *Arg) {
+	KER_ACT(ACT_SIGMOID, int, unsigned short, unsigned short, 16, 1);
+}
 
-{
-	int Feat = Arg->Feat;
-	int W = Arg->W;
-	int H = Arg->H;
-	unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(H*W*Feat), First = CoreId*ChunkCell, Last  = Min(First+ChunkCell, Feat*H*W);
-	int * __restrict__ In = (int *__restrict__) Arg->In;
-	unsigned char * __restrict__ Scale = (unsigned char *__restrict__) Arg->Scale;
-	unsigned char * __restrict__ ScaleN = (unsigned char *__restrict__) Arg->ScaleN;
-	signed short * __restrict__ Out = (signed short *__restrict__) Arg->Out;
-	signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos;
-	unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN];
-	int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0];
+void KerReduct_CC_NoScale_Tanh_USQ16(KerConvLinReduct_SQ8_T *Arg) {
+	KER_ACT(ACT_TANH, int, unsigned short, unsigned short, 16, 1);
+}
 
-	for (int i=First; i<Last; i++) {
-		Out[i] = gap_clip(AT_SCALE(Min(B0, Max(A0, In[i])), ActScale, ActScaleN), 15);
-	}
-	gap_waitbarrier(0);
+/*
+	HWC Activations
+*/
 
+/* ------------------------------------------------------ Signed 8 bits ------------------------------------------------------ */
+void KerParReduct_CC_HWC_SQ8(KerConvLinReduct_SQ8_T *Arg) {
+	KER_PAR_REDUCT_ACT_HWC(ACT_NONE, signed char, unsigned char, 8, 0);
 }
 
-void KerReduct_CC_NoScale_HSigmoid_SQ16(KerConvLinReduct_SQ8_T *Arg)
-
-{
-	int Feat = Arg->Feat;
-	int W = Arg->W;
-	int H = Arg->H;
-	unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(H*W*Feat), First = CoreId*ChunkCell, Last  = Min(First+ChunkCell, Feat*H*W);
-	int * __restrict__ In = (int *__restrict__) Arg->In;
-	signed short * __restrict__ Out = (signed short *__restrict__) Arg->Out;
-	signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos;
-	unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN];
-	int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0];
+void KerParReduct_CC_ReLU_HWC_SQ8(KerConvLinReduct_SQ8_T *Arg) {
+	KER_PAR_REDUCT_ACT_HWC(ACT_RELU, signed char, unsigned char, 8, 0);
+}
 
-	for (int i=First; i<Last; i++) {
-		Out[i] = gap_clip(AT_SCALE(AT_CLIP_POS(In[i] + B0, A0) * C0, ActScale, ActScaleN), 15);
-	}
-	gap_waitbarrier(0);
+void KerParReduct_CC_ReLUN_HWC_SQ8(KerConvLinReduct_SQ8_T *Arg) {
+	KER_PAR_REDUCT_ACT_HWC(ACT_RELUN, signed char, unsigned char, 8, 0);
 }
 
-void KerReduct_CC_NoScale_HSwish_SQ16(KerConvLinReduct_SQ8_T *Arg)
+void KerParReduct_CC_ReLUM_HWC_SQ8(KerConvLinReduct_SQ8_T *Arg) {
+	KER_PAR_REDUCT_ACT_HWC(ACT_RELUM, signed char, unsigned char, 8, 0);
+}
 
-{
-	int Feat = Arg->Feat;
-	int W = Arg->W;
-	int H = Arg->H;
-	unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(H*W*Feat), First = CoreId*ChunkCell, Last  = Min(First+ChunkCell, Feat*H*W);
-	int * __restrict__ In = (int *__restrict__) Arg->In;
-	signed short * __restrict__ Out = (signed short *__restrict__) Arg->Out;
-	signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos;
-	unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN];
-	int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0];
+void KerParReduct_CC_ReLUMN_HWC_SQ8(KerConvLinReduct_SQ8_T *Arg) {
+	KER_PAR_REDUCT_ACT_HWC(ACT_RELUMN, signed char, unsigned char, 8, 0);
+}
 
-	for (int i=First; i<Last; i++) {
-		Out[i] = gap_clip(AT_SCALE(AT_CLIP_POS(In[i] + B0, A0) * C0 * In[i], ActScale, ActScaleN), 15);
-	}
-	gap_waitbarrier(0);
-}
-
-void KerReduct_CC_NoScale_LeakyReLU_SQ16(KerConvLinReduct_SQ8_T *Arg)
-
-{
-	int Feat = Arg->Feat;
-	int W = Arg->W;
-	int H = Arg->H;
-	unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(H*W*Feat), First = CoreId*ChunkCell, Last  = Min(First+ChunkCell, Feat*H*W);
-	int * __restrict__ In = (int *__restrict__) Arg->In;
-	signed short * __restrict__ Out = (signed short *__restrict__) Arg->Out;
-	signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos;
-	unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN];
-	int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0];
-
-	for (int i=First; i<Last; i++) {
-		int Acc0 = In[i];
-		int Neg0 = gap_bitextractu(Acc0, 1, 31), Pos0 = !Neg0;
-		int Acc0N = AT_NORM(Acc0 * A0, 7);
-		Out[i] = gap_clip(AT_SCALE((Neg0*Acc0N+Pos0*Acc0), ActScale, ActScaleN), 15);
-	}
-	gap_waitbarrier(0);
+void KerParReduct_CC_HSigmoid_HWC_SQ8(KerConvLinReduct_SQ8_T *Arg) {
+	KER_PAR_REDUCT_ACT_HWC(ACT_HSIGMOID, signed char, unsigned char, 8, 0);
 }
 
-void KerReduct_CC_NoScale_Sigmoid_SQ16(KerConvLinReduct_SQ8_T *Arg)
+void KerParReduct_CC_HSwish_HWC_SQ8(KerConvLinReduct_SQ8_T *Arg) {
+	KER_PAR_REDUCT_ACT_HWC(ACT_HSWISH, signed char, unsigned char, 8, 0);
+}
 
-{
-	int Feat = Arg->Feat;
-	int W = Arg->W;
-	int H = Arg->H;
-	unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(H*W*Feat), First = CoreId*ChunkCell, Last  = Min(First+ChunkCell, Feat*H*W);
-	int * __restrict__ In = (int *__restrict__) Arg->In;
-	signed short * __restrict__ Out = (signed short *__restrict__) Arg->Out;
-	signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos;
-	unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN];
-	int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0];
+void KerParReduct_CC_LeakyReLU_HWC_SQ8(KerConvLinReduct_SQ8_T *Arg) {
+	KER_PAR_REDUCT_ACT_HWC(ACT_LEAKYRELU, signed char, unsigned char, 8, 0);
+}
 
-	for (int i=First; i<Last; i++) {
-		Out[i] = gap_clip(AT_SCALE(Sigmoid(In[i] << 8), ActScale, ActScaleN), 15);
-	}
-	gap_waitbarrier(0);
+void KerParReduct_CC_Sigmoid_HWC_SQ8(KerConvLinReduct_SQ8_T *Arg) {
+	KER_PAR_REDUCT_ACT_HWC(ACT_SIGMOID, signed char, unsigned char, 8, 0);
 }
 
-void KerReduct_CC_NoScale_Tanh_SQ16(KerConvLinReduct_SQ8_T *Arg)
+void KerParReduct_CC_Tanh_HWC_SQ8(KerConvLinReduct_SQ8_T *Arg) {
+	KER_PAR_REDUCT_ACT_HWC(ACT_TANH, signed char, unsigned char, 8, 0);
+}
 
-{
-	int Feat = Arg->Feat;
-	int W = Arg->W;
-	int H = Arg->H;
-	unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(H*W*Feat), First = CoreId*ChunkCell, Last  = Min(First+ChunkCell, Feat*H*W);
-	int * __restrict__ In = (int *__restrict__) Arg->In;
-	signed short * __restrict__ Out = (signed short *__restrict__) Arg->Out;
-	signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos;
-	unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN];
-	int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0];
+/* ----------------------------------------------------- UnSigned 8 bits ----------------------------------------------------- */
+void KerParReduct_CC_HWC_USQ8(KerConvLinReduct_SQ8_T *Arg) {
+	KER_PAR_REDUCT_ACT_HWC(ACT_NONE, unsigned char, unsigned char, 8, 1);
+}
 
-	for (int i=First; i<Last; i++) {
-		Out[i] = gap_clip(AT_SCALE(Tanh(In[i] << 8), ActScale, ActScaleN), 15);
-	}
-	gap_waitbarrier(0);
+void KerParReduct_CC_ReLU_HWC_USQ8(KerConvLinReduct_SQ8_T *Arg) {
+	KER_PAR_REDUCT_ACT_HWC(ACT_RELU, unsigned char, unsigned char, 8, 1);
 }
 
+void KerParReduct_CC_ReLUN_HWC_USQ8(KerConvLinReduct_SQ8_T *Arg) {
+	KER_PAR_REDUCT_ACT_HWC(ACT_RELUN, unsigned char, unsigned char, 8, 1);
+}
 
+void KerParReduct_CC_ReLUM_HWC_USQ8(KerConvLinReduct_SQ8_T *Arg) {
+	KER_PAR_REDUCT_ACT_HWC(ACT_RELUM, unsigned char, unsigned char, 8, 1);
+}
 
+void KerParReduct_CC_ReLUMN_HWC_USQ8(KerConvLinReduct_SQ8_T *Arg) {
+	KER_PAR_REDUCT_ACT_HWC(ACT_RELUMN, unsigned char, unsigned char, 8, 1);
+}
 
-/*
- * Input Scaling and reduction to 8b then channel cnetric activation, Out location != In location. Features are evaluated in parallel
-*/
-void KerReduct_CC_NoScale_USQ8(KerConvLinReduct_SQ8_T *Arg)
-
-{
-	int Feat = Arg->Feat;
-	int W = Arg->W;
-	int H = Arg->H;
-	unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(H*W*Feat), First = CoreId*ChunkCell, Last  = Min(First+ChunkCell, Feat*H*W);
-	int * __restrict__ In = (int *__restrict__) Arg->In;
-	unsigned char * __restrict__ Out = (unsigned char *__restrict__) Arg->Out;
-	signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos;
-	unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN];
-	int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0];
-
-	for (int i=First; i<Last; i++) {
-	        Out[i] = gap_clipu(In[i], 8);
-	}
-	gap_waitbarrier(0);
+void KerParReduct_CC_HSigmoid_HWC_USQ8(KerConvLinReduct_SQ8_T *Arg) {
+	KER_PAR_REDUCT_ACT_HWC(ACT_HSIGMOID, unsigned char, unsigned char, 8, 1);
 }
 
-void KerReduct_CC_NoScale_ReLU_USQ8(KerConvLinReduct_SQ8_T *Arg)
+void KerParReduct_CC_HSwish_HWC_USQ8(KerConvLinReduct_SQ8_T *Arg) {
+	KER_PAR_REDUCT_ACT_HWC(ACT_HSWISH, unsigned char, unsigned char, 8, 1);
+}
 
-{
-	int Feat = Arg->Feat;
-	int W = Arg->W;
-	int H = Arg->H;
-	unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(H*W*Feat), First = CoreId*ChunkCell, Last  = Min(First+ChunkCell, Feat*H*W);
-	int * __restrict__ In = (int *__restrict__) Arg->In;
-	unsigned char * __restrict__ Out = (unsigned char *__restrict__) Arg->Out;
-	signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos;
-	unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN];
-	int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0];
+void KerParReduct_CC_LeakyReLU_HWC_USQ8(KerConvLinReduct_SQ8_T *Arg) {
+	KER_PAR_REDUCT_ACT_HWC(ACT_LEAKYRELU, unsigned char, unsigned char, 8, 1);
+}
 
-	for (int i=First; i<Last; i++) {
-		int Acc0 = gap_clipu(In[i], 8);
-                Out[i] = gap_clipu(AT_SCALE(Acc0, ActScale, ActScaleN), 8);
-	}
-	gap_waitbarrier(0);
-}
-
-void KerReduct_CC_NoScale_ReLUN_USQ8(KerConvLinReduct_SQ8_T *Arg)
-
-{
-	int Feat = Arg->Feat;
-	int W = Arg->W;
-	int H = Arg->H;
-	unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(H*W*Feat), First = CoreId*ChunkCell, Last  = Min(First+ChunkCell, Feat*H*W);
-	int * __restrict__ In = (int *__restrict__) Arg->In;
-	unsigned char * __restrict__ Scale = (unsigned char *__restrict__) Arg->Scale;
-	unsigned char * __restrict__ ScaleN = (unsigned char *__restrict__) Arg->ScaleN;
-	unsigned char * __restrict__ Out = (unsigned char *__restrict__) Arg->Out;
-	signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos;
-	unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN];
-	int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0];
-
-	for (int i=First; i<Last; i++) {
-                Out[i] = gap_clipu(AT_SCALE(AT_CLIP_POS((int) In[i], A0), ActScale, ActScaleN), 8);
-	}
-	gap_waitbarrier(0);
+void KerParReduct_CC_Sigmoid_HWC_USQ8(KerConvLinReduct_SQ8_T *Arg) {
+	KER_PAR_REDUCT_ACT_HWC(ACT_SIGMOID, unsigned char, unsigned char, 8, 1);
 }
 
-void KerReduct_CC_NoScale_ReLUM_USQ8(KerConvLinReduct_SQ8_T *Arg)
+void KerParReduct_CC_Tanh_HWC_USQ8(KerConvLinReduct_SQ8_T *Arg) {
+	KER_PAR_REDUCT_ACT_HWC(ACT_TANH, unsigned char, unsigned char, 8, 1);
+}
 
-{
-	int Feat = Arg->Feat;
-	int W = Arg->W;
-	int H = Arg->H;
-	unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(H*W*Feat), First = CoreId*ChunkCell, Last  = Min(First+ChunkCell, Feat*H*W);
-	int * __restrict__ In = (int *__restrict__) Arg->In;
-	unsigned char * __restrict__ Out = (unsigned char *__restrict__) Arg->Out;
-	signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos;
-	unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN];
-	int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0];
+/* ----------------------------------------------------- Signed 16 bits ---------------------------------------------------- */
+void KerParReduct_CC_HWC_SQ16(KerConvLinReduct_SQ8_T *Arg) {
+	KER_PAR_REDUCT_ACT_HWC(ACT_NONE, signed short, unsigned short, 16, 0);
+}
 
-	for (int i=First; i<Last; i++) {
-		Out[i] = gap_clipu(AT_SCALE(Max(A0, (int) In[i]), ActScale, ActScaleN), 8);
-	}
-	gap_waitbarrier(0);
+void KerParReduct_CC_ReLU_HWC_SQ16(KerConvLinReduct_SQ8_T *Arg) {
+	KER_PAR_REDUCT_ACT_HWC(ACT_RELU, signed short, unsigned short, 16, 0);
 }
 
+void KerParReduct_CC_ReLUN_HWC_SQ16(KerConvLinReduct_SQ8_T *Arg) {
+	KER_PAR_REDUCT_ACT_HWC(ACT_RELUN, signed short, unsigned short, 16, 0);
+}
 
-void KerReduct_CC_NoScale_ReLUMN_USQ8(KerConvLinReduct_SQ8_T *Arg)
+void KerParReduct_CC_ReLUM_HWC_SQ16(KerConvLinReduct_SQ8_T *Arg) {
+	KER_PAR_REDUCT_ACT_HWC(ACT_RELUM, signed short, unsigned short, 16, 0);
+}
 
-{
-	int Feat = Arg->Feat;
-	int W = Arg->W;
-	int H = Arg->H;
-	unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(H*W*Feat), First = CoreId*ChunkCell, Last  = Min(First+ChunkCell, Feat*H*W);
-	int * __restrict__ In = (int *__restrict__) Arg->In;
-	unsigned char * __restrict__ Out = (unsigned char *__restrict__) Arg->Out;
-	signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos;
-	unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN];
-	int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0];
+void KerParReduct_CC_ReLUMN_HWC_SQ16(KerConvLinReduct_SQ8_T *Arg) {
+	KER_PAR_REDUCT_ACT_HWC(ACT_RELUMN, signed short, unsigned short, 16, 0);
+}
 
-	for (int i=First; i<Last; i++) {
-		Out[i] = gap_clipu(AT_SCALE(Min(B0, Max(A0, (int) In[i])), ActScale, ActScaleN), 8);
-	}
-	gap_waitbarrier(0);
+void KerParReduct_CC_HSigmoid_HWC_SQ16(KerConvLinReduct_SQ8_T *Arg) {
+	KER_PAR_REDUCT_ACT_HWC(ACT_HSIGMOID, signed short, unsigned short, 16, 0);
 }
 
+void KerParReduct_CC_HSwish_HWC_SQ16(KerConvLinReduct_SQ8_T *Arg) {
+	KER_PAR_REDUCT_ACT_HWC(ACT_HSWISH, signed short, unsigned short, 16, 0);
+}
 
-void KerReduct_CC_NoScale_USQ16(KerConvLinReduct_SQ8_T *Arg)
+void KerParReduct_CC_LeakyReLU_HWC_SQ16(KerConvLinReduct_SQ8_T *Arg) {
+	KER_PAR_REDUCT_ACT_HWC(ACT_LEAKYRELU, signed short, unsigned short, 16, 0);
+}
 
-{
-	int Feat = Arg->Feat;
-	int W = Arg->W;
-	int H = Arg->H;
-	unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(H*W*Feat), First = CoreId*ChunkCell, Last  = Min(First+ChunkCell, Feat*H*W);
-	int * __restrict__ In = (int *__restrict__) Arg->In;
-	unsigned short * __restrict__ Out = (unsigned short *__restrict__) Arg->Out;
-	signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos;
-	unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN];
-	int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0];
+void KerParReduct_CC_Sigmoid_HWC_SQ16(KerConvLinReduct_SQ8_T *Arg) {
+	KER_PAR_REDUCT_ACT_HWC(ACT_SIGMOID, signed short, unsigned short, 16, 0);
+}
 
-	for (int i=First; i<Last; i++) {
-		Out[i] = gap_clipu(In[i], 16);
-	}
-	gap_waitbarrier(0);
+void KerParReduct_CC_Tanh_HWC_SQ16(KerConvLinReduct_SQ8_T *Arg) {
+	KER_PAR_REDUCT_ACT_HWC(ACT_TANH, signed short, unsigned short, 16, 0);
 }
 
-void KerReduct_CC_NoScale_ReLU_USQ16(KerConvLinReduct_SQ8_T *Arg)
 
-{
-	int Feat = Arg->Feat;
-	int W = Arg->W;
-	int H = Arg->H;
-	unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(H*W*Feat), First = CoreId*ChunkCell, Last  = Min(First+ChunkCell, Feat*H*W);
-	int * __restrict__ In = (int *__restrict__) Arg->In;
-	unsigned short * __restrict__ Out = (unsigned short *__restrict__) Arg->Out;
-	signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos;
-	unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN];
-	int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0];
+/* ----------------------------------------------------- UnSigned 16 bits ---------------------------------------------------- */
+void KerParReduct_CC_HWC_USQ16(KerConvLinReduct_SQ8_T *Arg) {
+	KER_PAR_REDUCT_ACT_HWC(ACT_NONE, unsigned short, unsigned short, 16, 1);
+}
 
-	for (int i=First; i<Last; i++) {
-		int Acc0 = gap_clipu(In[i], 16);
-		Out[i] = gap_clipu(AT_SCALE(Acc0, ActScale, ActScaleN), 16);
-	}
-	gap_waitbarrier(0);
+void KerParReduct_CC_ReLU_HWC_USQ16(KerConvLinReduct_SQ8_T *Arg) {
+	KER_PAR_REDUCT_ACT_HWC(ACT_RELU, unsigned short, unsigned short, 16, 1);
 }
 
-void KerReduct_CC_NoScale_ReLUN_USQ16(KerConvLinReduct_SQ8_T *Arg)
+void KerParReduct_CC_ReLUN_HWC_USQ16(KerConvLinReduct_SQ8_T *Arg) {
+	KER_PAR_REDUCT_ACT_HWC(ACT_RELUN, unsigned short, unsigned short, 16, 1);
+}
 
-{
-	int Feat = Arg->Feat;
-	int W = Arg->W;
-	int H = Arg->H;
-	unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(H*W*Feat), First = CoreId*ChunkCell, Last  = Min(First+ChunkCell, Feat*H*W);
-	int * __restrict__ In = (int *__restrict__) Arg->In;
-	unsigned short * __restrict__ Out = (unsigned short *__restrict__) Arg->Out;
-	signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos;
-	unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN];
-	int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0];
+void KerParReduct_CC_ReLUM_HWC_USQ16(KerConvLinReduct_SQ8_T *Arg) {
+	KER_PAR_REDUCT_ACT_HWC(ACT_RELUM, unsigned short, unsigned short, 16, 1);
+}
 
-	for (int i=First; i<Last; i++) {
-		Out[i] = gap_clipu(AT_SCALE(AT_CLIP_POS((int) In[i], A0), ActScale, ActScaleN), 16);
-	}
-	gap_waitbarrier(0);
+void KerParReduct_CC_ReLUMN_HWC_USQ16(KerConvLinReduct_SQ8_T *Arg) {
+	KER_PAR_REDUCT_ACT_HWC(ACT_RELUMN, unsigned short, unsigned short, 16, 1);
 }
 
-void KerReduct_CC_NoScale_ReLUM_USQ16(KerConvLinReduct_SQ8_T *Arg)
+void KerParReduct_CC_HSigmoid_HWC_USQ16(KerConvLinReduct_SQ8_T *Arg) {
+	KER_PAR_REDUCT_ACT_HWC(ACT_HSIGMOID, unsigned short, unsigned short, 16, 1);
+}
 
-{
-	int Feat = Arg->Feat;
-	int W = Arg->W;
-	int H = Arg->H;
-	unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(H*W*Feat), First = CoreId*ChunkCell, Last  = Min(First+ChunkCell, Feat*H*W);
-	int * __restrict__ In = (int *__restrict__) Arg->In;
-	unsigned short * __restrict__ Out = (unsigned short *__restrict__) Arg->Out;
-	signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos;
-	unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN];
-	int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0];
+void KerParReduct_CC_HSwish_HWC_USQ16(KerConvLinReduct_SQ8_T *Arg) {
+	KER_PAR_REDUCT_ACT_HWC(ACT_HSWISH, unsigned short, unsigned short, 16, 1);
+}
 
-	for (int i=First; i<Last; i++) {
-		Out[i] = gap_clipu(AT_SCALE(Max(A0, (int) In[i]), ActScale, ActScaleN), 16);
-	}
-	gap_waitbarrier(0);
+void KerParReduct_CC_LeakyReLU_HWC_USQ16(KerConvLinReduct_SQ8_T *Arg) {
+	KER_PAR_REDUCT_ACT_HWC(ACT_LEAKYRELU, unsigned short, unsigned short, 16, 1);
 }
 
+void KerParReduct_CC_Sigmoid_HWC_USQ16(KerConvLinReduct_SQ8_T *Arg) {
+	KER_PAR_REDUCT_ACT_HWC(ACT_SIGMOID, unsigned short, unsigned short, 16, 1);
+}
 
-void KerReduct_CC_NoScale_ReLUMN_USQ16(KerConvLinReduct_SQ8_T *Arg)
+void KerParReduct_CC_Tanh_HWC_USQ16(KerConvLinReduct_SQ8_T *Arg) {
+	KER_PAR_REDUCT_ACT_HWC(ACT_TANH, unsigned short, unsigned short, 16, 1);
+}
 
-{
-	int Feat = Arg->Feat;
-	int W = Arg->W;
-	int H = Arg->H;
-	unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(H*W*Feat), First = CoreId*ChunkCell, Last  = Min(First+ChunkCell, Feat*H*W);
-	int * __restrict__ In = (int *__restrict__) Arg->In;
-	unsigned short * __restrict__ Out = (unsigned short *__restrict__) Arg->Out;
-	signed char * __restrict__ Infos = (signed char *__restrict__) Arg->Infos;
-	unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN];
-	int A0 = Infos[AT_INF_A0], B0 = Infos[AT_INF_B0], C0 = Infos[AT_INF_C0];
 
-	for (int i=First; i<Last; i++) {
-		Out[i] = gap_clipu(AT_SCALE(Min(B0, Max(A0, (int) In[i])), ActScale, ActScaleN), 16);
-	}
-	gap_waitbarrier(0);
-}
 
 #pragma GCC diagnostic pop
diff --git a/tools/autotiler_v3/CNN_Libraries_SQ8/CNN_BasicKernels_SQ8.h b/tools/autotiler_v3/CNN_Libraries_SQ8/CNN_BasicKernels_SQ8.h
index 326cb3f58..af3f1734d 100644
--- a/tools/autotiler_v3/CNN_Libraries_SQ8/CNN_BasicKernels_SQ8.h
+++ b/tools/autotiler_v3/CNN_Libraries_SQ8/CNN_BasicKernels_SQ8.h
@@ -39,12 +39,13 @@ typedef enum {
         ACT_RELU,
         ACT_RELUN,
         ACT_HSIGMOID,
+        ACT_HTANH,
         ACT_HSWISH,
         ACT_LEAKYRELU,
         ACT_SIGMOID,
         ACT_TANH,
-		ACT_RELUM,
-		ACT_RELUMN
+	ACT_RELUM,
+	ACT_RELUMN
 } CNN_ActivationOper_T;
 
 /******************************************************************************************************************
@@ -630,12 +631,6 @@ extern void KerParConv3x3D4x1Stride1x1_SQ8(KerConv_SQ8_T *Arg);
 extern void KerParConv3x3D8x1Stride1x1_SQ8(KerConv_SQ8_T *Arg);
 extern void KerParConv3x3DxD2Stride1x1_SQ8(KerConv_SQ8_T *Arg);
 
-extern void KerPar_MM_Conv2D_SQ8(Ker_MM_Conv_SQ8_T *Arg);
-extern void KerPar_MM_Conv1D_SQ8(Ker_MM_Conv_SQ8_T *Arg);
-extern void KerPar_MM_Conv1D_ReLU_SQ8(Ker_MM_Conv_SQ8_T *Arg);
-extern void KerPar_MM_Conv1D_ReLUN_SQ8(Ker_MM_Conv_SQ8_T *Arg);
-extern void KerPar_MM_Conv1D_LeakyReLU_SQ8(Ker_MM_Conv_SQ8_T *Arg);
-
 extern void KerConv1x1Stride1_SQ8(KerConv_SQ8_T *Arg);
 extern void KerConv1x1Stride2_SQ8(KerConv_SQ8_T *Arg);
 extern void KerConv1x1StrideS_SQ8(KerConv_SQ8_T *Arg);
@@ -823,6 +818,60 @@ int TanhTable(int x, unsigned short int * table);
 #define Sigmoid(__x) SigmoidTable((__x), SIGMOID_LUT_uint16)
 #define SigmoidU(__x) SigmoidTableUnsigned((__x), SIGMOID_LUT_uint16)
 
+/* Save the includer's diagnostic state first, so that the pop at the end of this section only undoes the -Wswitch override made here. */
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wswitch"
+#define decl(__t, __v) __t __v	/* declares a variable __v of type __t */
+/* arr_at_as: lvalue of type __t located at &__arr[__offset]. OUT_CLIP: gap_clipu to __n_bits when __is_unsigned, else gap_clip to __n_bits-1. Both are fully parenthesized. */
+#define arr_at_as(__arr, __offset, __t) (*((__t *) &(__arr)[(__offset)]))
+#define OUT_CLIP(__acc, __is_unsigned, __n_bits) ((__is_unsigned)?gap_clipu((__acc), (__n_bits)):gap_clip((__acc), ((__n_bits)-1)))
+/* ACT_SWITCH rewrites the lvalue __acc in place according to __act_type; __n_bits and __is_unsigned are only read by the Sigmoid and Tanh cases. */
+#define ACT_SWITCH(__acc, __act_type, __act_scale, __act_scalen, __a0, __b0, __c0, __n_bits, __is_unsigned) do { \
+	switch (__act_type) { \
+		case ACT_NONE: \
+		case ACT_HTANH: /* no hard-tanh code here: __acc passes through unchanged */ \
+		default: /* any other value also leaves __acc untouched; a default label keeps -Wswitch quiet where the macro is expanded */ \
+			break; \
+		case ACT_RELU: \
+			__acc = AT_SCALE(Max(0, __acc), __act_scale, __act_scalen); \
+			break; \
+		case ACT_RELUN: \
+			__acc = AT_SCALE(AT_CLIP_POS(__acc, __a0), __act_scale, __act_scalen); \
+			break; \
+		case ACT_RELUM: \
+			__acc = AT_SCALE(Max(__a0, __acc), __act_scale, __act_scalen); \
+			break; \
+		case ACT_RELUMN: \
+			__acc = AT_SCALE(Max(__a0, Min(__acc, __b0)), __act_scale, __act_scalen); \
+			break; \
+		case ACT_HSIGMOID: \
+			__acc = AT_SCALE(AT_CLIP_POS(__acc + __b0, __a0), __act_scale, __act_scalen) + __c0; \
+			break; \
+		case ACT_HSWISH: \
+			__acc = AT_SCALE(AT_CLIP_POS(__acc + __b0, __a0) * __acc, __act_scale, __act_scalen) + __c0; \
+			break; \
+		case ACT_LEAKYRELU: { \
+				int Neg0 = gap_bitextractu(__acc, 1, 31), Pos0 = !Neg0; /* bit 31 of __acc selects the negative branch */ \
+				int Acc0N = AT_NORM(__acc * __a0, 7); \
+				__acc = AT_SCALE((Neg0*Acc0N+Pos0*__acc), __act_scale, __act_scalen) + __b0; \
+			} \
+			break; \
+		case ACT_SIGMOID: { \
+				int Acc0N = __acc << (16 - (__n_bits)); /* shift left by (16 - __n_bits) before the table lookup */ \
+				if (!(__is_unsigned)) __acc = AT_SCALE(Sigmoid(Acc0N),  __act_scale, __act_scalen); \
+				else                  __acc = AT_SCALE(SigmoidU(Acc0N), __act_scale, __act_scalen); \
+			} \
+			break; \
+		case ACT_TANH: { \
+				int Acc0N = __acc << (16 - (__n_bits)); /* same pre-shift as the Sigmoid case */ \
+				if (!(__is_unsigned)) __acc = AT_SCALE(Tanh(Acc0N), __act_scale, __act_scalen); \
+				else                  __acc = AT_SCALE(Tanh(Acc0N), __act_scale, __act_scalen) + __a0; /* unsigned output adds __a0 */ \
+			} \
+			break; \
+	} \
+} while(0)
+#pragma GCC diagnostic pop
+
 /*
  * Input Scaling and reduction to 8b then channel centric activation, Out location != In location. Features are evaluated in parallel
 */
@@ -843,6 +892,7 @@ extern void KerParReduct_CC_CHW2HWC_ReLUN_SQ8(KerConvLinReduct_SQ8_T *Arg);
 extern void KerParReduct_CC_CHW2HWC_HSigmoid_SQ8(KerConvLinReduct_SQ8_T *Arg);
 extern void KerParReduct_CC_CHW2HWC_HSwish_SQ8(KerConvLinReduct_SQ8_T *Arg);
 extern void KerParReduct_CC_CHW2HWC_LeakyReLU_SQ8(KerConvLinReduct_SQ8_T *Arg);
+extern void KerParReduct_CC_CHW2HWC_Sigmoid_SQ8(KerConvLinReduct_SQ8_T *Arg);
 extern void KerParReduct_CC_CHW2HWC_Tanh_SQ8(KerConvLinReduct_SQ8_T *Arg);
 
 /*
@@ -899,6 +949,17 @@ extern void KerParReduct_CC_LeakyReLU_HWC_SQ8(KerConvLinReduct_SQ8_T *Arg);
 extern void KerParReduct_CC_Sigmoid_HWC_SQ8(KerConvLinReduct_SQ8_T *Arg);
 extern void KerParReduct_CC_Tanh_HWC_SQ8(KerConvLinReduct_SQ8_T *Arg);
 
+extern void KerParReduct_CC_HWC_USQ8(KerConvLinReduct_SQ8_T *Arg);
+extern void KerParReduct_CC_ReLU_HWC_USQ8(KerConvLinReduct_SQ8_T *Arg);
+extern void KerParReduct_CC_ReLUN_HWC_USQ8(KerConvLinReduct_SQ8_T *Arg);
+extern void KerParReduct_CC_ReLUM_HWC_USQ8(KerConvLinReduct_SQ8_T *Arg);
+extern void KerParReduct_CC_ReLUMN_HWC_USQ8(KerConvLinReduct_SQ8_T *Arg);
+extern void KerParReduct_CC_HSigmoid_HWC_USQ8(KerConvLinReduct_SQ8_T *Arg);
+extern void KerParReduct_CC_HSwish_HWC_USQ8(KerConvLinReduct_SQ8_T *Arg);
+extern void KerParReduct_CC_LeakyReLU_HWC_USQ8(KerConvLinReduct_SQ8_T *Arg);
+extern void KerParReduct_CC_Sigmoid_HWC_USQ8(KerConvLinReduct_SQ8_T *Arg);
+extern void KerParReduct_CC_Tanh_HWC_USQ8(KerConvLinReduct_SQ8_T *Arg);
+
 extern void KerParReduct_CC_HWC_SQ16(KerConvLinReduct_SQ8_T *Arg);
 extern void KerParReduct_CC_ReLU_HWC_SQ16(KerConvLinReduct_SQ8_T *Arg);
 extern void KerParReduct_CC_ReLUN_HWC_SQ16(KerConvLinReduct_SQ8_T *Arg);
@@ -939,6 +1000,11 @@ extern void KerReduct_CC_NoScale_ReLU_USQ8(KerConvLinReduct_SQ8_T *Arg);
 extern void KerReduct_CC_NoScale_ReLUN_USQ8(KerConvLinReduct_SQ8_T *Arg);
 extern void KerReduct_CC_NoScale_ReLUM_USQ8(KerConvLinReduct_SQ8_T *Arg);
 extern void KerReduct_CC_NoScale_ReLUMN_USQ8(KerConvLinReduct_SQ8_T *Arg);
+extern void KerReduct_CC_NoScale_HSigmoid_USQ8(KerConvLinReduct_SQ8_T *Arg);
+extern void KerReduct_CC_NoScale_HSwish_USQ8(KerConvLinReduct_SQ8_T *Arg);
+extern void KerReduct_CC_NoScale_LeakyReLU_USQ8(KerConvLinReduct_SQ8_T *Arg);
+extern void KerReduct_CC_NoScale_Sigmoid_USQ8(KerConvLinReduct_SQ8_T *Arg);
+extern void KerReduct_CC_NoScale_Tanh_USQ8(KerConvLinReduct_SQ8_T *Arg);
 
 extern void KerReduct_CC_NoScale_SQ16(KerConvLinReduct_SQ8_T *Arg);
 extern void KerReduct_CC_NoScale_ReLU_SQ16(KerConvLinReduct_SQ8_T *Arg);
@@ -956,6 +1022,12 @@ extern void KerReduct_CC_NoScale_ReLU_USQ16(KerConvLinReduct_SQ8_T *Arg);
 extern void KerReduct_CC_NoScale_ReLUN_USQ16(KerConvLinReduct_SQ8_T *Arg);
 extern void KerReduct_CC_NoScale_ReLUM_USQ16(KerConvLinReduct_SQ8_T *Arg);
 extern void KerReduct_CC_NoScale_ReLUMN_USQ16(KerConvLinReduct_SQ8_T *Arg);
+extern void KerReduct_CC_NoScale_HSigmoid_USQ16(KerConvLinReduct_SQ8_T *Arg);
+extern void KerReduct_CC_NoScale_HSwish_USQ16(KerConvLinReduct_SQ8_T *Arg);
+extern void KerReduct_CC_NoScale_LeakyReLU_USQ16(KerConvLinReduct_SQ8_T *Arg);
+extern void KerReduct_CC_NoScale_Sigmoid_USQ16(KerConvLinReduct_SQ8_T *Arg);
+extern void KerReduct_CC_NoScale_Tanh_USQ16(KerConvLinReduct_SQ8_T *Arg);
+
 
 /******************************************************************************************************************
           Stand alone activation. Parallel Feature, Feature Parallel
@@ -967,7 +1039,6 @@ extern void KerReduct_CC_NoScale_ReLUMN_USQ16(KerConvLinReduct_SQ8_T *Arg);
  * Standalone Scaled Activation, Features are evaluated one after the other in parallel
 */
 extern void Ker_ActNone_SQ8(KerActivation_SQ8_T *Arg);
-extern void Ker_Scale_SQ8(KerActivation_SQ8_T *Arg);
 extern void Ker_ReLU_SQ8(KerActivation_SQ8_T *Arg);
 extern void Ker_ReLUN_SQ8(KerActivation_SQ8_T *Arg);
 extern void Ker_ReLUM_SQ8(KerActivation_SQ8_T *Arg);
@@ -978,18 +1049,6 @@ extern void Ker_LeakyReLU_SQ8(KerActivation_SQ8_T *Arg);
 extern void Ker_Sigmoid_SQ8(KerActivation_SQ8_T *Arg);
 extern void Ker_Tanh_SQ8(KerActivation_SQ8_T *Arg);
 
-extern void Ker_ActNone_ScaleIn_SQ8(KerActivation_SQ8_T *Arg);
-extern void Ker_ReLU_ScaleIn_SQ8(KerActivation_SQ8_T *Arg);
-extern void Ker_ReLUN_ScaleIn_SQ8(KerActivation_SQ8_T *Arg);
-extern void Ker_ReLUM_ScaleIn_SQ8(KerActivation_SQ8_T *Arg);
-extern void Ker_ReLUMN_ScaleIn_SQ8(KerActivation_SQ8_T *Arg);
-extern void Ker_HSigmoid_ScaleIn_SQ8(KerActivation_SQ8_T *Arg);
-extern void Ker_HSwish_ScaleIn_SQ8(KerActivation_SQ8_T *Arg);
-extern void Ker_LeakyReLU_ScaleIn_SQ8(KerActivation_SQ8_T *Arg);
-extern void Ker_Sigmoid_ScaleIn_SQ8(KerActivation_SQ8_T *Arg);
-extern void Ker_Tanh_ScaleIn_SQ8(KerActivation_SQ8_T *Arg);
-
-
 /******************************************************************************************************************
 	Pooling group.
 	Performs Max, Average or Global average pooling followed by an optional Scaling or Scaling and Activation.
@@ -1113,51 +1172,101 @@ extern void KerMatAdd_ReLUMN_USQ8(KerMat3_SQ8_T *Arg);
 extern void KerParMatMulB8_SQ8(KerMatMul_SQ8_T *Arg);
 extern void KerParMatMulB8_ReLU_SQ8(KerMatMul_SQ8_T *Arg);
 extern void KerParMatMulB8_ReLUN_SQ8(KerMatMul_SQ8_T *Arg);
+extern void KerParMatMulB8_ReLUM_SQ8(KerMatMul_SQ8_T *Arg);
+extern void KerParMatMulB8_ReLUMN_SQ8(KerMatMul_SQ8_T *Arg);
+extern void KerParMatMulB8_LeakyReLU_SQ8(KerMatMul_SQ8_T *Arg);
+extern void KerParMatMulB8_HSwish_SQ8(KerMatMul_SQ8_T *Arg);
+extern void KerParMatMulB8_HSigmoid_SQ8(KerMatMul_SQ8_T *Arg);
+extern void KerParMatMulB8_Sigmoid_SQ8(KerMatMul_SQ8_T *Arg);
+extern void KerParMatMulB8_Tanh_SQ8(KerMatMul_SQ8_T *Arg);
+
 extern void KerParMatMulSxSyB8_SQ8(KerMatMul_SQ8_T *Arg);
 extern void KerParMatMulSxSyB8_ReLU_SQ8(KerMatMul_SQ8_T *Arg);
 extern void KerParMatMulSxSyB8_ReLUN_SQ8(KerMatMul_SQ8_T *Arg);
+extern void KerParMatMulSxSyB8_ReLUM_SQ8(KerMatMul_SQ8_T *Arg);
+extern void KerParMatMulSxSyB8_ReLUMN_SQ8(KerMatMul_SQ8_T *Arg);
+extern void KerParMatMulSxSyB8_LeakyReLU_SQ8(KerMatMul_SQ8_T *Arg);
+extern void KerParMatMulSxSyB8_HSwish_SQ8(KerMatMul_SQ8_T *Arg);
+extern void KerParMatMulSxSyB8_HSigmoid_SQ8(KerMatMul_SQ8_T *Arg);
+extern void KerParMatMulSxSyB8_Sigmoid_SQ8(KerMatMul_SQ8_T *Arg);
+extern void KerParMatMulSxSyB8_Tanh_SQ8(KerMatMul_SQ8_T *Arg);
 
 extern void KerParMatMulB16_SQ8(KerMatMul_SQ8_T *Arg);
 extern void KerParMatMulB16_ReLU_SQ8(KerMatMul_SQ8_T *Arg);
 extern void KerParMatMulB16_ReLUN_SQ8(KerMatMul_SQ8_T *Arg);
+extern void KerParMatMulB16_ReLUM_SQ8(KerMatMul_SQ8_T *Arg);
+extern void KerParMatMulB16_ReLUMN_SQ8(KerMatMul_SQ8_T *Arg);
+extern void KerParMatMulB16_LeakyReLU_SQ8(KerMatMul_SQ8_T *Arg);
+extern void KerParMatMulB16_HSwish_SQ8(KerMatMul_SQ8_T *Arg);
+extern void KerParMatMulB16_HSigmoid_SQ8(KerMatMul_SQ8_T *Arg);
+extern void KerParMatMulB16_Sigmoid_SQ8(KerMatMul_SQ8_T *Arg);
+extern void KerParMatMulB16_Tanh_SQ8(KerMatMul_SQ8_T *Arg);
+
 extern void KerParMatMulSxSyB16_SQ8(KerMatMul_SQ8_T *Arg);
 extern void KerParMatMulSxSyB16_ReLU_SQ8(KerMatMul_SQ8_T *Arg);
 extern void KerParMatMulSxSyB16_ReLUN_SQ8(KerMatMul_SQ8_T *Arg);
+extern void KerParMatMulSxSyB16_ReLUM_SQ8(KerMatMul_SQ8_T *Arg);
+extern void KerParMatMulSxSyB16_ReLUMN_SQ8(KerMatMul_SQ8_T *Arg);
+extern void KerParMatMulSxSyB16_LeakyReLU_SQ8(KerMatMul_SQ8_T *Arg);
+extern void KerParMatMulSxSyB16_HSwish_SQ8(KerMatMul_SQ8_T *Arg);
+extern void KerParMatMulSxSyB16_HSigmoid_SQ8(KerMatMul_SQ8_T *Arg);
+extern void KerParMatMulSxSyB16_Sigmoid_SQ8(KerMatMul_SQ8_T *Arg);
+extern void KerParMatMulSxSyB16_Tanh_SQ8(KerMatMul_SQ8_T *Arg);
 
 extern void KerParMatMulB32_SQ8(KerMatMul_SQ8_T *Arg);
 extern void KerParMatMulB32_ReLU_SQ8(KerMatMul_SQ8_T *Arg);
 extern void KerParMatMulB32_ReLUN_SQ8(KerMatMul_SQ8_T *Arg);
-
-extern void KerParMatMulB32_2x4_SQ8(KerMatMul_SQ8_T *Arg);
-extern void KerParMatMulB32_2x4_ReLU_SQ8(KerMatMul_SQ8_T *Arg);
-
-extern void KerParMatMulB32_2x4_PL_SQ8(KerMatMul_PL_SQ8_T *Arg);
-extern void KerParMatMulB32_2x4_ReLU_PL_SQ8(KerMatMul_PL_SQ8_T *Arg);
-extern void KerParMatMulB32_2x4_ReLUN_PL_SQ8(KerMatMul_PL_SQ8_T *Arg);
-
-extern void KerParMatMulNoBias_2x4_PL_SQ8(KerMatMul_PL_SQ8_T *Arg);
-extern void KerParMatMulNoBias_2x4_ReLU_PL_SQ8(KerMatMul_PL_SQ8_T *Arg);
-extern void KerParMatMulNoBias_2x4_ReLUN_PL_SQ8(KerMatMul_PL_SQ8_T *Arg);
+extern void KerParMatMulB32_ReLUM_SQ8(KerMatMul_SQ8_T *Arg);
+extern void KerParMatMulB32_ReLUMN_SQ8(KerMatMul_SQ8_T *Arg);
+extern void KerParMatMulB32_LeakyReLU_SQ8(KerMatMul_SQ8_T *Arg);
+extern void KerParMatMulB32_HSwish_SQ8(KerMatMul_SQ8_T *Arg);
+extern void KerParMatMulB32_HSigmoid_SQ8(KerMatMul_SQ8_T *Arg);
+extern void KerParMatMulB32_Sigmoid_SQ8(KerMatMul_SQ8_T *Arg);
+extern void KerParMatMulB32_Tanh_SQ8(KerMatMul_SQ8_T *Arg);
+
+extern void KerParMatMulB32_PL_SQ8(KerMatMul_PL_SQ8_T *Arg);
+extern void KerParMatMulB32_ReLU_PL_SQ8(KerMatMul_PL_SQ8_T *Arg);
+extern void KerParMatMulB32_ReLUN_PL_SQ8(KerMatMul_PL_SQ8_T *Arg);
+extern void KerParMatMulB32_ReLUM_PL_SQ8(KerMatMul_PL_SQ8_T *Arg);
+extern void KerParMatMulB32_ReLUMN_PL_SQ8(KerMatMul_PL_SQ8_T *Arg);
+extern void KerParMatMulB32_LeakyReLU_PL_SQ8(KerMatMul_PL_SQ8_T *Arg);
+extern void KerParMatMulB32_HSwish_PL_SQ8(KerMatMul_PL_SQ8_T *Arg);
+extern void KerParMatMulB32_HSigmoid_PL_SQ8(KerMatMul_PL_SQ8_T *Arg);
+extern void KerParMatMulB32_Sigmoid_PL_SQ8(KerMatMul_PL_SQ8_T *Arg);
+extern void KerParMatMulB32_Tanh_PL_SQ8(KerMatMul_PL_SQ8_T *Arg);
 
 extern void KerParMatMulTransposedB32_SQ8(KerMatMul_SQ8_T *Arg);
 extern void KerParMatMulTransposedB32_ReLU_SQ8(KerMatMul_SQ8_T *Arg);
 extern void KerParMatMulTransposedB32_ReLUN_SQ8(KerMatMul_SQ8_T *Arg);
-
-extern void KerParMatMulTransposedNoBias_SQ8(KerMatMul_SQ8_T *Arg);
-extern void KerParMatMulTransposedNoBias_ReLU_SQ8(KerMatMul_SQ8_T *Arg);
-extern void KerParMatMulTransposedNoBias_ReLUN_SQ8(KerMatMul_SQ8_T *Arg);
+extern void KerParMatMulTransposedB32_ReLUM_SQ8(KerMatMul_SQ8_T *Arg);
+extern void KerParMatMulTransposedB32_ReLUMN_SQ8(KerMatMul_SQ8_T *Arg);
+extern void KerParMatMulTransposedB32_LeakyReLU_SQ8(KerMatMul_SQ8_T *Arg);
+extern void KerParMatMulTransposedB32_HSwish_SQ8(KerMatMul_SQ8_T *Arg);
+extern void KerParMatMulTransposedB32_HSigmoid_SQ8(KerMatMul_SQ8_T *Arg);
+extern void KerParMatMulTransposedB32_Sigmoid_SQ8(KerMatMul_SQ8_T *Arg);
+extern void KerParMatMulTransposedB32_Tanh_SQ8(KerMatMul_SQ8_T *Arg);
 
 extern void KerParMatMulTransposedB32_PL_SQ8(KerMatMul_PL_SQ8_T *Arg);
 extern void KerParMatMulTransposedB32_ReLU_PL_SQ8(KerMatMul_PL_SQ8_T *Arg);
 extern void KerParMatMulTransposedB32_ReLUN_PL_SQ8(KerMatMul_PL_SQ8_T *Arg);
-
-extern void KerParMatMulTransposedNoBias_PL_SQ8(KerMatMul_PL_SQ8_T *Arg);
-extern void KerParMatMulTransposedNoBias_ReLU_PL_SQ8(KerMatMul_PL_SQ8_T *Arg);
-extern void KerParMatMulTransposedNoBias_ReLUN_PL_SQ8(KerMatMul_PL_SQ8_T *Arg);
+extern void KerParMatMulTransposedB32_ReLUM_PL_SQ8(KerMatMul_PL_SQ8_T *Arg);
+extern void KerParMatMulTransposedB32_ReLUMN_PL_SQ8(KerMatMul_PL_SQ8_T *Arg);
+extern void KerParMatMulTransposedB32_LeakyReLU_PL_SQ8(KerMatMul_PL_SQ8_T *Arg);
+extern void KerParMatMulTransposedB32_HSwish_PL_SQ8(KerMatMul_PL_SQ8_T *Arg);
+extern void KerParMatMulTransposedB32_HSigmoid_PL_SQ8(KerMatMul_PL_SQ8_T *Arg);
+extern void KerParMatMulTransposedB32_Sigmoid_PL_SQ8(KerMatMul_PL_SQ8_T *Arg);
+extern void KerParMatMulTransposedB32_Tanh_PL_SQ8(KerMatMul_PL_SQ8_T *Arg);
 
 extern void KerParMatMulSxSyB32_SQ8(KerMatMul_SQ8_T *Arg);
 extern void KerParMatMulSxSyB32_ReLU_SQ8(KerMatMul_SQ8_T *Arg);
 extern void KerParMatMulSxSyB32_ReLUN_SQ8(KerMatMul_SQ8_T *Arg);
+extern void KerParMatMulSxSyB32_ReLUM_SQ8(KerMatMul_SQ8_T *Arg);
+extern void KerParMatMulSxSyB32_ReLUMN_SQ8(KerMatMul_SQ8_T *Arg);
+extern void KerParMatMulSxSyB32_LeakyReLU_SQ8(KerMatMul_SQ8_T *Arg);
+extern void KerParMatMulSxSyB32_HSwish_SQ8(KerMatMul_SQ8_T *Arg);
+extern void KerParMatMulSxSyB32_HSigmoid_SQ8(KerMatMul_SQ8_T *Arg);
+extern void KerParMatMulSxSyB32_Sigmoid_SQ8(KerMatMul_SQ8_T *Arg);
+extern void KerParMatMulSxSyB32_Tanh_SQ8(KerMatMul_SQ8_T *Arg);
 
 /*************************************************************************************************************************************************
 	Matrix mult with channel centric scaling for small first matrix in the product, goal is to improve parallelism in this specific situation
@@ -1171,42 +1280,160 @@ extern void KerParMatMulSxSyB32_ReLUN_SQ8(KerMatMul_SQ8_T *Arg);
 extern void KerParMatMulB8_SF_SQ8(KerMatMul_SQ8_T *Arg);
 extern void KerParMatMulB8_ReLU_SF_SQ8(KerMatMul_SQ8_T *Arg);
 extern void KerParMatMulB8_ReLUN_SF_SQ8(KerMatMul_SQ8_T *Arg);
+extern void KerParMatMulB8_ReLUM_SF_SQ8(KerMatMul_SQ8_T *Arg);
+extern void KerParMatMulB8_ReLUMN_SF_SQ8(KerMatMul_SQ8_T *Arg);
+extern void KerParMatMulB8_LeakyReLU_SF_SQ8(KerMatMul_SQ8_T *Arg);
+extern void KerParMatMulB8_HSwish_SF_SQ8(KerMatMul_SQ8_T *Arg);
+extern void KerParMatMulB8_HSigmoid_SF_SQ8(KerMatMul_SQ8_T *Arg);
+extern void KerParMatMulB8_Sigmoid_SF_SQ8(KerMatMul_SQ8_T *Arg);
+extern void KerParMatMulB8_Tanh_SF_SQ8(KerMatMul_SQ8_T *Arg);
 
 extern void KerParMatMulB16_SF_SQ8(KerMatMul_SQ8_T *Arg);
 extern void KerParMatMulB16_ReLU_SF_SQ8(KerMatMul_SQ8_T *Arg);
 extern void KerParMatMulB16_ReLUN_SF_SQ8(KerMatMul_SQ8_T *Arg);
+extern void KerParMatMulB16_ReLUM_SF_SQ8(KerMatMul_SQ8_T *Arg);
+extern void KerParMatMulB16_ReLUMN_SF_SQ8(KerMatMul_SQ8_T *Arg);
+extern void KerParMatMulB16_LeakyReLU_SF_SQ8(KerMatMul_SQ8_T *Arg);
+extern void KerParMatMulB16_HSwish_SF_SQ8(KerMatMul_SQ8_T *Arg);
+extern void KerParMatMulB16_HSigmoid_SF_SQ8(KerMatMul_SQ8_T *Arg);
+extern void KerParMatMulB16_Sigmoid_SF_SQ8(KerMatMul_SQ8_T *Arg);
+extern void KerParMatMulB16_Tanh_SF_SQ8(KerMatMul_SQ8_T *Arg);
 
 extern void KerParMatMulB32_SF_SQ8(KerMatMul_SQ8_T *Arg);
 extern void KerParMatMulB32_ReLU_SF_SQ8(KerMatMul_SQ8_T *Arg);
-extern void KerParMatMulB32_2x4_ReLU_SF_SQ8(KerMatMul_SQ8_T *Arg);
 extern void KerParMatMulB32_ReLUN_SF_SQ8(KerMatMul_SQ8_T *Arg);
+extern void KerParMatMulB32_ReLUM_SF_SQ8(KerMatMul_SQ8_T *Arg);
+extern void KerParMatMulB32_ReLUMN_SF_SQ8(KerMatMul_SQ8_T *Arg);
+extern void KerParMatMulB32_LeakyReLU_SF_SQ8(KerMatMul_SQ8_T *Arg);
+extern void KerParMatMulB32_HSwish_SF_SQ8(KerMatMul_SQ8_T *Arg);
+extern void KerParMatMulB32_HSigmoid_SF_SQ8(KerMatMul_SQ8_T *Arg);
+extern void KerParMatMulB32_Sigmoid_SF_SQ8(KerMatMul_SQ8_T *Arg);
+extern void KerParMatMulB32_Tanh_SF_SQ8(KerMatMul_SQ8_T *Arg);
 
-extern void KerPar_MM_Conv1x1_HWC_SQ8(Ker_MM_Conv_SQ8_T *Arg);
-extern void KerPar_MM_Conv1x1_ReLU_HWC_SQ8(Ker_MM_Conv_SQ8_T *Arg);
-
-extern void Ker_MM_Conv1x1_HWC_SQ8(Ker_MM_Conv_SQ8_T *Arg);
-extern void Ker_MM_Conv1x1_ReLU_HWC_SQ8(Ker_MM_Conv_SQ8_T *Arg);
+/*************************************************************************************************************************************************
+	Convolutions based on Matrix Multiplication, i.e. im2col. CHW and HWC
+*************************************************************************************************************************************************/
 
 extern void KerPar_MM_Conv1D_SQ8(Ker_MM_Conv_SQ8_T *Arg);
-extern void KerPar_MM_Conv1D_HWC_SQ8(Ker_MM_Conv_SQ8_T *Arg);
 extern void KerPar_MM_Conv1D_ReLU_SQ8(Ker_MM_Conv_SQ8_T *Arg);
 extern void KerPar_MM_Conv1D_ReLUN_SQ8(Ker_MM_Conv_SQ8_T *Arg);
+extern void KerPar_MM_Conv1D_ReLUM_SQ8(Ker_MM_Conv_SQ8_T *Arg);
+extern void KerPar_MM_Conv1D_ReLUMN_SQ8(Ker_MM_Conv_SQ8_T *Arg);
 extern void KerPar_MM_Conv1D_LeakyReLU_SQ8(Ker_MM_Conv_SQ8_T *Arg);
+extern void KerPar_MM_Conv1D_HSwish_SQ8(Ker_MM_Conv_SQ8_T *Arg);
+extern void KerPar_MM_Conv1D_HSigmoid_SQ8(Ker_MM_Conv_SQ8_T *Arg);
+extern void KerPar_MM_Conv1D_Sigmoid_SQ8(Ker_MM_Conv_SQ8_T *Arg);
+extern void KerPar_MM_Conv1D_Tanh_SQ8(Ker_MM_Conv_SQ8_T *Arg);
+
 extern void KerPar_MM_Conv1D_DxDy_SQ8(Ker_MM_Conv_SQ8_T *Arg);
-extern void KerPar_MM_Conv1D_DxDy_HWC_SQ8(Ker_MM_Conv_SQ8_T *Arg);
 extern void KerPar_MM_Conv1D_DxDy_ReLU_SQ8(Ker_MM_Conv_SQ8_T *Arg);
+extern void KerPar_MM_Conv1D_DxDy_ReLUN_SQ8(Ker_MM_Conv_SQ8_T *Arg);
+extern void KerPar_MM_Conv1D_DxDy_ReLUM_SQ8(Ker_MM_Conv_SQ8_T *Arg);
+extern void KerPar_MM_Conv1D_DxDy_ReLUMN_SQ8(Ker_MM_Conv_SQ8_T *Arg);
+extern void KerPar_MM_Conv1D_DxDy_LeakyReLU_SQ8(Ker_MM_Conv_SQ8_T *Arg);
+extern void KerPar_MM_Conv1D_DxDy_HSwish_SQ8(Ker_MM_Conv_SQ8_T *Arg);
+extern void KerPar_MM_Conv1D_DxDy_HSigmoid_SQ8(Ker_MM_Conv_SQ8_T *Arg);
+extern void KerPar_MM_Conv1D_DxDy_Sigmoid_SQ8(Ker_MM_Conv_SQ8_T *Arg);
+extern void KerPar_MM_Conv1D_DxDy_Tanh_SQ8(Ker_MM_Conv_SQ8_T *Arg);
 
 extern void KerPar_MM_Conv2D_SQ8(Ker_MM_Conv_SQ8_T *Arg);
-extern void KerPar_MM_Conv2D_HWC_SQ8(Ker_MM_Conv_SQ8_T *Arg);
-extern void KerPar_MM_Conv2D_ReLU_HWC_SQ8(Ker_MM_Conv_SQ8_T *Arg);
+extern void KerPar_MM_Conv2D_ReLU_SQ8(Ker_MM_Conv_SQ8_T *Arg);
+extern void KerPar_MM_Conv2D_ReLUN_SQ8(Ker_MM_Conv_SQ8_T *Arg);
+extern void KerPar_MM_Conv2D_ReLUM_SQ8(Ker_MM_Conv_SQ8_T *Arg);
+extern void KerPar_MM_Conv2D_ReLUMN_SQ8(Ker_MM_Conv_SQ8_T *Arg);
+extern void KerPar_MM_Conv2D_LeakyReLU_SQ8(Ker_MM_Conv_SQ8_T *Arg);
+extern void KerPar_MM_Conv2D_HSwish_SQ8(Ker_MM_Conv_SQ8_T *Arg);
+extern void KerPar_MM_Conv2D_HSigmoid_SQ8(Ker_MM_Conv_SQ8_T *Arg);
+extern void KerPar_MM_Conv2D_Sigmoid_SQ8(Ker_MM_Conv_SQ8_T *Arg);
+extern void KerPar_MM_Conv2D_Tanh_SQ8(Ker_MM_Conv_SQ8_T *Arg);
+
 extern void KerPar_MM_Conv2D_DxDy_SQ8(Ker_MM_Conv_SQ8_T *Arg);
-extern void KerPar_MM_Conv2D_DxDy_HWC_SQ8(Ker_MM_Conv_SQ8_T *Arg);
 extern void KerPar_MM_Conv2D_DxDy_ReLU_SQ8(Ker_MM_Conv_SQ8_T *Arg);
+extern void KerPar_MM_Conv2D_DxDy_ReLUN_SQ8(Ker_MM_Conv_SQ8_T *Arg);
+extern void KerPar_MM_Conv2D_DxDy_ReLUM_SQ8(Ker_MM_Conv_SQ8_T *Arg);
+extern void KerPar_MM_Conv2D_DxDy_ReLUMN_SQ8(Ker_MM_Conv_SQ8_T *Arg);
+extern void KerPar_MM_Conv2D_DxDy_LeakyReLU_SQ8(Ker_MM_Conv_SQ8_T *Arg);
+extern void KerPar_MM_Conv2D_DxDy_HSwish_SQ8(Ker_MM_Conv_SQ8_T *Arg);
+extern void KerPar_MM_Conv2D_DxDy_HSigmoid_SQ8(Ker_MM_Conv_SQ8_T *Arg);
+extern void KerPar_MM_Conv2D_DxDy_Sigmoid_SQ8(Ker_MM_Conv_SQ8_T *Arg);
+extern void KerPar_MM_Conv2D_DxDy_Tanh_SQ8(Ker_MM_Conv_SQ8_T *Arg);
+
+extern void KerPar_MM_Conv1x1_HWC_SQ8(Ker_MM_Conv_SQ8_T *Arg);
+extern void KerPar_MM_Conv1x1_ReLU_HWC_SQ8(Ker_MM_Conv_SQ8_T *Arg);
+extern void KerPar_MM_Conv1x1_ReLUN_HWC_SQ8(Ker_MM_Conv_SQ8_T *Arg);
+extern void KerPar_MM_Conv1x1_ReLUM_HWC_SQ8(Ker_MM_Conv_SQ8_T *Arg);
+extern void KerPar_MM_Conv1x1_ReLUMN_HWC_SQ8(Ker_MM_Conv_SQ8_T *Arg);
+extern void KerPar_MM_Conv1x1_LeakyReLU_HWC_SQ8(Ker_MM_Conv_SQ8_T *Arg);
+extern void KerPar_MM_Conv1x1_HSwish_HWC_SQ8(Ker_MM_Conv_SQ8_T *Arg);
+extern void KerPar_MM_Conv1x1_HSigmoid_HWC_SQ8(Ker_MM_Conv_SQ8_T *Arg);
+extern void KerPar_MM_Conv1x1_Sigmoid_HWC_SQ8(Ker_MM_Conv_SQ8_T *Arg);
+extern void KerPar_MM_Conv1x1_Tanh_HWC_SQ8(Ker_MM_Conv_SQ8_T *Arg);
+
+extern void Ker_MM_Conv1x1_HWC_SQ8(Ker_MM_Conv_SQ8_T *Arg);
+extern void Ker_MM_Conv1x1_ReLU_HWC_SQ8(Ker_MM_Conv_SQ8_T *Arg);
+extern void Ker_MM_Conv1x1_ReLUN_HWC_SQ8(Ker_MM_Conv_SQ8_T *Arg);
+extern void Ker_MM_Conv1x1_ReLUM_HWC_SQ8(Ker_MM_Conv_SQ8_T *Arg);
+extern void Ker_MM_Conv1x1_ReLUMN_HWC_SQ8(Ker_MM_Conv_SQ8_T *Arg);
+extern void Ker_MM_Conv1x1_LeakyReLU_HWC_SQ8(Ker_MM_Conv_SQ8_T *Arg);
+extern void Ker_MM_Conv1x1_HSwish_HWC_SQ8(Ker_MM_Conv_SQ8_T *Arg);
+extern void Ker_MM_Conv1x1_HSigmoid_HWC_SQ8(Ker_MM_Conv_SQ8_T *Arg);
+extern void Ker_MM_Conv1x1_Sigmoid_HWC_SQ8(Ker_MM_Conv_SQ8_T *Arg);
+extern void Ker_MM_Conv1x1_Tanh_HWC_SQ8(Ker_MM_Conv_SQ8_T *Arg);
+
+extern void KerPar_MM_Conv1D_HWC_SQ8(Ker_MM_Conv_SQ8_T *Arg);
+extern void KerPar_MM_Conv1D_ReLU_HWC_SQ8(Ker_MM_Conv_SQ8_T *Arg);
+extern void KerPar_MM_Conv1D_ReLUN_HWC_SQ8(Ker_MM_Conv_SQ8_T *Arg);
+extern void KerPar_MM_Conv1D_ReLUM_HWC_SQ8(Ker_MM_Conv_SQ8_T *Arg);
+extern void KerPar_MM_Conv1D_ReLUMN_HWC_SQ8(Ker_MM_Conv_SQ8_T *Arg);
+extern void KerPar_MM_Conv1D_LeakyReLU_HWC_SQ8(Ker_MM_Conv_SQ8_T *Arg);
+extern void KerPar_MM_Conv1D_HSwish_HWC_SQ8(Ker_MM_Conv_SQ8_T *Arg);
+extern void KerPar_MM_Conv1D_HSigmoid_HWC_SQ8(Ker_MM_Conv_SQ8_T *Arg);
+extern void KerPar_MM_Conv1D_Sigmoid_HWC_SQ8(Ker_MM_Conv_SQ8_T *Arg);
+extern void KerPar_MM_Conv1D_Tanh_HWC_SQ8(Ker_MM_Conv_SQ8_T *Arg);
+
+extern void KerPar_MM_Conv1D_DxDy_HWC_SQ8(Ker_MM_Conv_SQ8_T *Arg);
+extern void KerPar_MM_Conv1D_DxDy_ReLU_HWC_SQ8(Ker_MM_Conv_SQ8_T *Arg);
+extern void KerPar_MM_Conv1D_DxDy_ReLUN_HWC_SQ8(Ker_MM_Conv_SQ8_T *Arg);
+extern void KerPar_MM_Conv1D_DxDy_ReLUM_HWC_SQ8(Ker_MM_Conv_SQ8_T *Arg);
+extern void KerPar_MM_Conv1D_DxDy_ReLUMN_HWC_SQ8(Ker_MM_Conv_SQ8_T *Arg);
+extern void KerPar_MM_Conv1D_DxDy_LeakyReLU_HWC_SQ8(Ker_MM_Conv_SQ8_T *Arg);
+extern void KerPar_MM_Conv1D_DxDy_HSwish_HWC_SQ8(Ker_MM_Conv_SQ8_T *Arg);
+extern void KerPar_MM_Conv1D_DxDy_HSigmoid_HWC_SQ8(Ker_MM_Conv_SQ8_T *Arg);
+extern void KerPar_MM_Conv1D_DxDy_Sigmoid_HWC_SQ8(Ker_MM_Conv_SQ8_T *Arg);
+extern void KerPar_MM_Conv1D_DxDy_Tanh_HWC_SQ8(Ker_MM_Conv_SQ8_T *Arg);
 
-extern void KerPar_MM_ConvDW2D_HWC_SQ8(Ker_MM_Conv_SQ8_T *Arg);
+extern void KerPar_MM_Conv2D_HWC_SQ8(Ker_MM_Conv_SQ8_T *Arg);
+extern void KerPar_MM_Conv2D_ReLU_HWC_SQ8(Ker_MM_Conv_SQ8_T *Arg);
+extern void KerPar_MM_Conv2D_ReLUN_HWC_SQ8(Ker_MM_Conv_SQ8_T *Arg);
+extern void KerPar_MM_Conv2D_ReLUM_HWC_SQ8(Ker_MM_Conv_SQ8_T *Arg);
+extern void KerPar_MM_Conv2D_ReLUMN_HWC_SQ8(Ker_MM_Conv_SQ8_T *Arg);
+extern void KerPar_MM_Conv2D_LeakyReLU_HWC_SQ8(Ker_MM_Conv_SQ8_T *Arg);
+extern void KerPar_MM_Conv2D_HSwish_HWC_SQ8(Ker_MM_Conv_SQ8_T *Arg);
+extern void KerPar_MM_Conv2D_HSigmoid_HWC_SQ8(Ker_MM_Conv_SQ8_T *Arg);
+extern void KerPar_MM_Conv2D_Sigmoid_HWC_SQ8(Ker_MM_Conv_SQ8_T *Arg);
+extern void KerPar_MM_Conv2D_Tanh_HWC_SQ8(Ker_MM_Conv_SQ8_T *Arg);
+
+extern void KerPar_MM_Conv2D_DxDy_HWC_SQ8(Ker_MM_Conv_SQ8_T *Arg);
+extern void KerPar_MM_Conv2D_DxDy_ReLU_HWC_SQ8(Ker_MM_Conv_SQ8_T *Arg);
+extern void KerPar_MM_Conv2D_DxDy_ReLUN_HWC_SQ8(Ker_MM_Conv_SQ8_T *Arg);
+extern void KerPar_MM_Conv2D_DxDy_ReLUM_HWC_SQ8(Ker_MM_Conv_SQ8_T *Arg);
+extern void KerPar_MM_Conv2D_DxDy_ReLUMN_HWC_SQ8(Ker_MM_Conv_SQ8_T *Arg);
+extern void KerPar_MM_Conv2D_DxDy_LeakyReLU_HWC_SQ8(Ker_MM_Conv_SQ8_T *Arg);
+extern void KerPar_MM_Conv2D_DxDy_HSwish_HWC_SQ8(Ker_MM_Conv_SQ8_T *Arg);
+extern void KerPar_MM_Conv2D_DxDy_HSigmoid_HWC_SQ8(Ker_MM_Conv_SQ8_T *Arg);
+extern void KerPar_MM_Conv2D_DxDy_Sigmoid_HWC_SQ8(Ker_MM_Conv_SQ8_T *Arg);
+extern void KerPar_MM_Conv2D_DxDy_Tanh_HWC_SQ8(Ker_MM_Conv_SQ8_T *Arg);
 
 extern void Ker_MM_Conv2D_HWC_SQ8(Ker_MM_Conv_SQ8_T *Arg);
 extern void Ker_MM_Conv2D_ReLU_HWC_SQ8(Ker_MM_Conv_SQ8_T *Arg);
+extern void Ker_MM_Conv2D_ReLUN_HWC_SQ8(Ker_MM_Conv_SQ8_T *Arg);
+extern void Ker_MM_Conv2D_ReLUM_HWC_SQ8(Ker_MM_Conv_SQ8_T *Arg);
+extern void Ker_MM_Conv2D_ReLUMN_HWC_SQ8(Ker_MM_Conv_SQ8_T *Arg);
+extern void Ker_MM_Conv2D_LeakyReLU_HWC_SQ8(Ker_MM_Conv_SQ8_T *Arg);
+extern void Ker_MM_Conv2D_HSwish_HWC_SQ8(Ker_MM_Conv_SQ8_T *Arg);
+extern void Ker_MM_Conv2D_HSigmoid_HWC_SQ8(Ker_MM_Conv_SQ8_T *Arg);
+extern void Ker_MM_Conv2D_Sigmoid_HWC_SQ8(Ker_MM_Conv_SQ8_T *Arg);
+extern void Ker_MM_Conv2D_Tanh_HWC_SQ8(Ker_MM_Conv_SQ8_T *Arg);
 
 /*************************************************************************************************************************************************
 	Matrix by Vector Multiplication followed by an optional Activation (all of them supported)
@@ -1215,10 +1442,13 @@ extern void Ker_MM_Conv2D_ReLU_HWC_SQ8(Ker_MM_Conv_SQ8_T *Arg);
 extern void KerParMatVectMul_SQ8(KerMat3_SQ8_T *Arg);
 extern void KerParMatVectMul_ReLU_SQ8(KerMat3_SQ8_T *Arg);
 extern void KerParMatVectMul_ReLUN_SQ8(KerMat3_SQ8_T *Arg);
-extern void KerParMatVectMul_HSigmoid_SQ8(KerMat3_SQ8_T *Arg);
-extern void KerParMatVectMul_HSwish_SQ8(KerMat3_SQ8_T *Arg);
-extern void KerParMatVectMul_LeakyReLU_SQ8(KerMat3_SQ8_T *Arg);
-
+extern void KerParMatVectMul_ReLUM_SF_SQ8(KerMat3_SQ8_T *Arg);	/* NOTE(review): this and the six declarations below carry an _SF_ infix that KerParMatVectMul_SQ8 / _ReLU_SQ8 / _ReLUN_SQ8 do not have - confirm the names match the kernel definitions */
+extern void KerParMatVectMul_ReLUMN_SF_SQ8(KerMat3_SQ8_T *Arg);
+extern void KerParMatVectMul_LeakyReLU_SF_SQ8(KerMat3_SQ8_T *Arg);
+extern void KerParMatVectMul_HSwish_SF_SQ8(KerMat3_SQ8_T *Arg);
+extern void KerParMatVectMul_HSigmoid_SF_SQ8(KerMat3_SQ8_T *Arg);
+extern void KerParMatVectMul_Sigmoid_SF_SQ8(KerMat3_SQ8_T *Arg);
+extern void KerParMatVectMul_Tanh_SF_SQ8(KerMat3_SQ8_T *Arg);
 
 /*************************************************************************************************************************************************
 	Linear Layer with 32b output.
@@ -1231,14 +1461,36 @@ extern void KerParLinearLayer_SQ8(KerLinear_SQ8_T *Arg);
 extern void KerParLinearLayerFullFeatB8_SQ8(KerLinear_SQ8_T *Arg);
 extern void KerParLinearLayerFullFeatB8_ReLU_SQ8(KerLinear_SQ8_T *Arg);
 extern void KerParLinearLayerFullFeatB8_ReLUN_SQ8(KerLinear_SQ8_T *Arg);
+extern void KerParLinearLayerFullFeatB8_ReLUM_SQ8(KerLinear_SQ8_T *Arg);
+extern void KerParLinearLayerFullFeatB8_ReLUMN_SQ8(KerLinear_SQ8_T *Arg);
+extern void KerParLinearLayerFullFeatB8_LeakyReLU_SQ8(KerLinear_SQ8_T *Arg);
+extern void KerParLinearLayerFullFeatB8_HSwish_SQ8(KerLinear_SQ8_T *Arg);
+extern void KerParLinearLayerFullFeatB8_HSigmoid_SQ8(KerLinear_SQ8_T *Arg);
+extern void KerParLinearLayerFullFeatB8_Sigmoid_SQ8(KerLinear_SQ8_T *Arg);
+extern void KerParLinearLayerFullFeatB8_Tanh_SQ8(KerLinear_SQ8_T *Arg);
 
 extern void KerParLinearLayerFullFeatB16_SQ8(KerLinear_SQ8_T *Arg);
 extern void KerParLinearLayerFullFeatB16_ReLU_SQ8(KerLinear_SQ8_T *Arg);
 extern void KerParLinearLayerFullFeatB16_ReLUN_SQ8(KerLinear_SQ8_T *Arg);
+extern void KerParLinearLayerFullFeatB16_ReLUM_SQ8(KerLinear_SQ8_T *Arg);
+extern void KerParLinearLayerFullFeatB16_ReLUMN_SQ8(KerLinear_SQ8_T *Arg);
+extern void KerParLinearLayerFullFeatB16_LeakyReLU_SQ8(KerLinear_SQ8_T *Arg);
+extern void KerParLinearLayerFullFeatB16_HSwish_SQ8(KerLinear_SQ8_T *Arg);
+extern void KerParLinearLayerFullFeatB16_HSigmoid_SQ8(KerLinear_SQ8_T *Arg);
+extern void KerParLinearLayerFullFeatB16_Sigmoid_SQ8(KerLinear_SQ8_T *Arg);
+extern void KerParLinearLayerFullFeatB16_Tanh_SQ8(KerLinear_SQ8_T *Arg);
 
 extern void KerParLinearLayerFullFeatB32_SQ8(KerLinear_SQ8_T *Arg);
 extern void KerParLinearLayerFullFeatB32_ReLU_SQ8(KerLinear_SQ8_T *Arg);
 extern void KerParLinearLayerFullFeatB32_ReLUN_SQ8(KerLinear_SQ8_T *Arg);
+extern void KerParLinearLayerFullFeatB32_ReLUM_SQ8(KerLinear_SQ8_T *Arg);
+extern void KerParLinearLayerFullFeatB32_ReLUMN_SQ8(KerLinear_SQ8_T *Arg);
+extern void KerParLinearLayerFullFeatB32_LeakyReLU_SQ8(KerLinear_SQ8_T *Arg);
+extern void KerParLinearLayerFullFeatB32_HSwish_SQ8(KerLinear_SQ8_T *Arg);
+extern void KerParLinearLayerFullFeatB32_HSigmoid_SQ8(KerLinear_SQ8_T *Arg);
+extern void KerParLinearLayerFullFeatB32_Sigmoid_SQ8(KerLinear_SQ8_T *Arg);
+extern void KerParLinearLayerFullFeatB32_Tanh_SQ8(KerLinear_SQ8_T *Arg);
+
 
 /*************************************************************************************************************************************************
 	SotMax, Q15 Output
diff --git a/tools/autotiler_v3/CNN_Libraries_SQ8/CNN_Bias_Linear_SQ8.c b/tools/autotiler_v3/CNN_Libraries_SQ8/CNN_Bias_Linear_SQ8.c
index 83c487861..4dc30ccc5 100644
--- a/tools/autotiler_v3/CNN_Libraries_SQ8/CNN_Bias_Linear_SQ8.c
+++ b/tools/autotiler_v3/CNN_Libraries_SQ8/CNN_Bias_Linear_SQ8.c
@@ -316,7 +316,10 @@ void KerParLinearLayer_SQ8(KerLinear_SQ8_T *Arg)
 
 /* Output can be evaluated completly */
 /* 8b Bias */
-void KerParLinearLayerFullFeatB8_SQ8(KerLinear_SQ8_T *Arg)
+static inline void __attribute__((always_inline)) KerParLinearLayerFullFeatB8_SQ8_act(
+	KerLinear_SQ8_T *Arg,
+	CNN_ActivationOper_T Activation
+	)
 
 {
 	signed char * __restrict__ In = Arg->In;
@@ -330,6 +333,9 @@ void KerParLinearLayerFullFeatB8_SQ8(KerLinear_SQ8_T *Arg)
 
 	unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(OutDim), First = CoreId*ChunkCell, Last  = Min(First+ChunkCell, OutDim);
 	v4s * __restrict__ VectIn = (v4s *) In;
+	unsigned char * Infos = Arg->Infos;
+	unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN];
+	int A0 = *((unsigned char *) &Infos[AT_INF_A0]); int B0 = *((unsigned char *) &Infos[AT_INF_B0]); int C0 = *((unsigned char *) &Infos[AT_INF_C0]);
 
 	for (int i=First; i<Last; i++) {
 		v4s * __restrict__ W = (v4s *) (&Weights[i*InDim]);
@@ -341,76 +347,59 @@ void KerParLinearLayerFullFeatB8_SQ8(KerLinear_SQ8_T *Arg)
 		}
 		if (InDim&0x4) Acc = gap_sumdotp4(VectIn[InDim/4-1], W[InDim/4-1], Acc);
 		for (int j=4*(InDim/4); j<InDim; j++) Acc += In[j]*Weights[i*InDim+j];
-		Out[i] = gap_clip(AT_SCALE(Acc, Scale[i], ScaleN[i]), 7);
+		Acc = AT_SCALE(Acc, Scale[i], ScaleN[i]); ACT_SWITCH(Acc, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+		Out[i] = gap_clip(Acc, 7);
 	}
 	gap_waitbarrier(0);
 }
 
 
-void KerParLinearLayerFullFeatB8_ReLU_SQ8(KerLinear_SQ8_T *Arg)
+void KerParLinearLayerFullFeatB8_SQ8(KerLinear_SQ8_T *Arg) {
+	KerParLinearLayerFullFeatB8_SQ8_act(Arg, ACT_NONE);
+}
 
-{
-	signed char * __restrict__ In = Arg->In;
-	unsigned int InDim = Arg->InDim, OutDim = Arg->OutDim;
-	const signed char * __restrict__ Weights = Arg->Weights;
-	const signed char * __restrict__ Bias = Arg->Bias;
-	unsigned int NormBias = ((unsigned char *)Arg->Infos)[AT_INF_BIASN];
-	unsigned char *Scale = Arg->Scale;
-	unsigned char *ScaleN = Arg->ScaleN;
-	signed char * __restrict__ Out = (signed char * __restrict__) Arg->Out;
+void KerParLinearLayerFullFeatB8_ReLU_SQ8(KerLinear_SQ8_T *Arg) {
+	KerParLinearLayerFullFeatB8_SQ8_act(Arg, ACT_RELU);
+}
 
-	unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(OutDim), First = CoreId*ChunkCell, Last  = Min(First+ChunkCell, OutDim);
-	v4s * __restrict__ VectIn = (v4s *) In;
+void KerParLinearLayerFullFeatB8_ReLUN_SQ8(KerLinear_SQ8_T *Arg) {
+	KerParLinearLayerFullFeatB8_SQ8_act(Arg, ACT_RELUN);
+}
 
-	for (int i=First; i<Last; i++) {
-		v4s * __restrict__ W = (v4s *) (&Weights[i*InDim]);
-		int Acc = AT_LSHIFT(Bias[i], NormBias);
-		for (int j=0; j<(InDim/(4*2)); j++) {
-			v4s V0=VectIn[2*j], V1=VectIn[2*j+1];
-			v4s C0=W[2*j], C1=W[2*j+1];
-			Acc = gap_sumdotp4(V0, C0, Acc); Acc = gap_sumdotp4(V1, C1, Acc);
-		}
-		if (InDim&0x4) Acc = gap_sumdotp4(VectIn[InDim/4-1], W[InDim/4-1], Acc);
-		for (int j=4*(InDim/4); j<InDim; j++) Acc += In[j]*Weights[i*InDim+j];
-		Out[i] = Max(0, gap_clip(AT_SCALE(Acc, Scale[i], ScaleN[i]), 7));
-	}
-	gap_waitbarrier(0);
+void KerParLinearLayerFullFeatB8_ReLUM_SQ8(KerLinear_SQ8_T *Arg) {
+	KerParLinearLayerFullFeatB8_SQ8_act(Arg, ACT_RELUM);
 }
 
-void KerParLinearLayerFullFeatB8_ReLUN_SQ8(KerLinear_SQ8_T *Arg)
+void KerParLinearLayerFullFeatB8_ReLUMN_SQ8(KerLinear_SQ8_T *Arg) {
+	KerParLinearLayerFullFeatB8_SQ8_act(Arg, ACT_RELUMN);
+}
 
-{
-	signed char * __restrict__ In = Arg->In;
-	unsigned int InDim = Arg->InDim, OutDim = Arg->OutDim;
-	const signed char * __restrict__ Weights = Arg->Weights;
-	const signed char * __restrict__ Bias = Arg->Bias;
-	unsigned int NormBias = ((unsigned char *)Arg->Infos)[AT_INF_BIASN];
-	unsigned char *Scale = Arg->Scale;
-	unsigned char *ScaleN = Arg->ScaleN;
-	int A0 = Arg->Infos[AT_INF_A0];
-	signed char * __restrict__ Out = (signed char * __restrict__) Arg->Out;
+void KerParLinearLayerFullFeatB8_LeakyReLU_SQ8(KerLinear_SQ8_T *Arg) {
+	KerParLinearLayerFullFeatB8_SQ8_act(Arg, ACT_LEAKYRELU);
+}
 
-	unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(OutDim), First = CoreId*ChunkCell, Last  = Min(First+ChunkCell, OutDim);
-	v4s * __restrict__ VectIn = (v4s *) In;
+void KerParLinearLayerFullFeatB8_HSigmoid_SQ8(KerLinear_SQ8_T *Arg) {
+	KerParLinearLayerFullFeatB8_SQ8_act(Arg, ACT_HSIGMOID);
+}
 
-	for (int i=First; i<Last; i++) {
-		v4s * __restrict__ W = (v4s *) (&Weights[i*InDim]);
-		int Acc = AT_LSHIFT(Bias[i], NormBias);
-		for (int j=0; j<(InDim/(4*2)); j++) {
-			v4s V0=VectIn[2*j], V1=VectIn[2*j+1];
-			v4s C0=W[2*j], C1=W[2*j+1];
-			Acc = gap_sumdotp4(V0, C0, Acc); Acc = gap_sumdotp4(V1, C1, Acc);
-		}
-		if (InDim&0x4) Acc = gap_sumdotp4(VectIn[InDim/4-1], W[InDim/4-1], Acc);
-		for (int j=4*(InDim/4); j<InDim; j++) Acc += In[j]*Weights[i*InDim+j];
-		Out[i] = AT_CLIP_POS(AT_SCALE(Acc, Scale[i], ScaleN[i]), A0);
-	}
-	gap_waitbarrier(0);
+void KerParLinearLayerFullFeatB8_HSwish_SQ8(KerLinear_SQ8_T *Arg) {
+	KerParLinearLayerFullFeatB8_SQ8_act(Arg, ACT_HSWISH);
 }
 
-/* 16b Bias */
-void KerParLinearLayerFullFeatB16_SQ8(KerLinear_SQ8_T *Arg)
+void KerParLinearLayerFullFeatB8_Sigmoid_SQ8(KerLinear_SQ8_T *Arg) {
+	KerParLinearLayerFullFeatB8_SQ8_act(Arg, ACT_SIGMOID);
+}
 
+void KerParLinearLayerFullFeatB8_Tanh_SQ8(KerLinear_SQ8_T *Arg) {
+	KerParLinearLayerFullFeatB8_SQ8_act(Arg, ACT_TANH);
+}
+
+
+/* 16b Bias */
+static inline void __attribute__((always_inline)) KerParLinearLayerFullFeatB16_SQ8_act(
+	KerLinear_SQ8_T *Arg,
+	CNN_ActivationOper_T Activation
+	)
 {
 	signed char * __restrict__ In = Arg->In;
 	unsigned int InDim = Arg->InDim, OutDim = Arg->OutDim;
@@ -423,6 +412,9 @@ void KerParLinearLayerFullFeatB16_SQ8(KerLinear_SQ8_T *Arg)
 
 	unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(OutDim), First = CoreId*ChunkCell, Last  = Min(First+ChunkCell, OutDim);
 	v4s * __restrict__ VectIn = (v4s *) In;
+	unsigned char * Infos = Arg->Infos;
+	unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN];
+	int A0 = *((unsigned char *) &Infos[AT_INF_A0]); int B0 = *((unsigned char *) &Infos[AT_INF_B0]); int C0 = *((unsigned char *) &Infos[AT_INF_C0]);
 
 	for (int i=First; i<Last; i++) {
 		v4s * __restrict__ W = (v4s *) (&Weights[i*InDim]);
@@ -434,76 +426,58 @@ void KerParLinearLayerFullFeatB16_SQ8(KerLinear_SQ8_T *Arg)
 		}
 		if (InDim&0x4) Acc = gap_sumdotp4(VectIn[InDim/4-1], W[InDim/4-1], Acc);
 		for (int j=4*(InDim/4); j<InDim; j++) Acc += In[j]*Weights[i*InDim+j];
-		Out[i] = gap_clip(AT_SCALE(Acc, Scale[i], ScaleN[i]), 7);
+		Acc = AT_SCALE(Acc, Scale[i], ScaleN[i]); ACT_SWITCH(Acc, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+		Out[i] = gap_clip(Acc, 7);
 	}
 	gap_waitbarrier(0);
 }
 
 
-void KerParLinearLayerFullFeatB16_ReLU_SQ8(KerLinear_SQ8_T *Arg)
+void KerParLinearLayerFullFeatB16_SQ8(KerLinear_SQ8_T *Arg) {
+	KerParLinearLayerFullFeatB16_SQ8_act(Arg, ACT_NONE);
+}
 
-{
-	signed char * __restrict__ In = Arg->In;
-	unsigned int InDim = Arg->InDim, OutDim = Arg->OutDim;
-	const signed char * __restrict__ Weights = Arg->Weights;
-	const short int * __restrict__ Bias = Arg->Bias;
-	unsigned int NormBias = ((unsigned char *)Arg->Infos)[AT_INF_BIASN];
-	unsigned char *Scale = Arg->Scale;
-	unsigned char *ScaleN = Arg->ScaleN;
-	signed char * __restrict__ Out = (signed char * __restrict__) Arg->Out;
+void KerParLinearLayerFullFeatB16_ReLU_SQ8(KerLinear_SQ8_T *Arg) {
+	KerParLinearLayerFullFeatB16_SQ8_act(Arg, ACT_RELU);
+}
 
-	unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(OutDim), First = CoreId*ChunkCell, Last  = Min(First+ChunkCell, OutDim);
-	v4s * __restrict__ VectIn = (v4s *) In;
+void KerParLinearLayerFullFeatB16_ReLUN_SQ8(KerLinear_SQ8_T *Arg) {
+	KerParLinearLayerFullFeatB16_SQ8_act(Arg, ACT_RELUN);
+}
 
-	for (int i=First; i<Last; i++) {
-		v4s * __restrict__ W = (v4s *) (&Weights[i*InDim]);
-		int Acc = AT_LSHIFT(Bias[i], NormBias);
-		for (int j=0; j<(InDim/(4*2)); j++) {
-			v4s V0=VectIn[2*j], V1=VectIn[2*j+1];
-			v4s C0=W[2*j], C1=W[2*j+1];
-			Acc = gap_sumdotp4(V0, C0, Acc); Acc = gap_sumdotp4(V1, C1, Acc);
-		}
-		if (InDim&0x4) Acc = gap_sumdotp4(VectIn[InDim/4-1], W[InDim/4-1], Acc);
-		for (int j=4*(InDim/4); j<InDim; j++) Acc += In[j]*Weights[i*InDim+j];
-		Out[i] = Max(0, gap_clip(AT_SCALE(Acc, Scale[i], ScaleN[i]), 7));
-	}
-	gap_waitbarrier(0);
+void KerParLinearLayerFullFeatB16_ReLUM_SQ8(KerLinear_SQ8_T *Arg) {
+	KerParLinearLayerFullFeatB16_SQ8_act(Arg, ACT_RELUM);
 }
 
-void KerParLinearLayerFullFeatB16_ReLUN_SQ8(KerLinear_SQ8_T *Arg)
+void KerParLinearLayerFullFeatB16_ReLUMN_SQ8(KerLinear_SQ8_T *Arg) {
+	KerParLinearLayerFullFeatB16_SQ8_act(Arg, ACT_RELUMN);
+}
 
-{
-	signed char * __restrict__ In = Arg->In;
-	unsigned int InDim = Arg->InDim, OutDim = Arg->OutDim;
-	const signed char * __restrict__ Weights = Arg->Weights;
-	const short int * __restrict__ Bias = Arg->Bias;
-	unsigned int NormBias = ((unsigned char *)Arg->Infos)[AT_INF_BIASN];
-	unsigned char *Scale = Arg->Scale;
-	unsigned char *ScaleN = Arg->ScaleN;
-	int A0 = Arg->Infos[AT_INF_A0];
-	signed char * __restrict__ Out = (signed char * __restrict__) Arg->Out;
+void KerParLinearLayerFullFeatB16_LeakyReLU_SQ8(KerLinear_SQ8_T *Arg) {
+	KerParLinearLayerFullFeatB16_SQ8_act(Arg, ACT_LEAKYRELU);
+}
 
-	unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(OutDim), First = CoreId*ChunkCell, Last  = Min(First+ChunkCell, OutDim);
-	v4s * __restrict__ VectIn = (v4s *) In;
+void KerParLinearLayerFullFeatB16_HSigmoid_SQ8(KerLinear_SQ8_T *Arg) {
+	KerParLinearLayerFullFeatB16_SQ8_act(Arg, ACT_HSIGMOID);
+}
 
-	for (int i=First; i<Last; i++) {
-		v4s * __restrict__ W = (v4s *) (&Weights[i*InDim]);
-		int Acc = AT_LSHIFT(Bias[i], NormBias);
-		for (int j=0; j<(InDim/(4*2)); j++) {
-			v4s V0=VectIn[2*j], V1=VectIn[2*j+1];
-			v4s C0=W[2*j], C1=W[2*j+1];
-			Acc = gap_sumdotp4(V0, C0, Acc); Acc = gap_sumdotp4(V1, C1, Acc);
-		}
-		if (InDim&0x4) Acc = gap_sumdotp4(VectIn[InDim/4-1], W[InDim/4-1], Acc);
-		for (int j=4*(InDim/4); j<InDim; j++) Acc += In[j]*Weights[i*InDim+j];
-		Out[i] = AT_CLIP_POS(AT_SCALE(Acc, Scale[i], ScaleN[i]), A0);
-	}
-	gap_waitbarrier(0);
+void KerParLinearLayerFullFeatB16_HSwish_SQ8(KerLinear_SQ8_T *Arg) {
+	KerParLinearLayerFullFeatB16_SQ8_act(Arg, ACT_HSWISH);
 }
 
-/* 32b Bias */
-void KerParLinearLayerFullFeatB32_SQ8(KerLinear_SQ8_T *Arg)
+void KerParLinearLayerFullFeatB16_Sigmoid_SQ8(KerLinear_SQ8_T *Arg) {
+	KerParLinearLayerFullFeatB16_SQ8_act(Arg, ACT_SIGMOID);
+}
+
+void KerParLinearLayerFullFeatB16_Tanh_SQ8(KerLinear_SQ8_T *Arg) {
+	KerParLinearLayerFullFeatB16_SQ8_act(Arg, ACT_TANH);
+}
 
+/* 32b Bias */
+static inline void __attribute__((always_inline)) KerParLinearLayerFullFeatB32_SQ8_act(
+	KerLinear_SQ8_T *Arg,
+	CNN_ActivationOper_T Activation
+	)
 {
 	signed char * __restrict__ In = Arg->In;
 	unsigned int InDim = Arg->InDim, OutDim = Arg->OutDim;
@@ -513,6 +487,9 @@ void KerParLinearLayerFullFeatB32_SQ8(KerLinear_SQ8_T *Arg)
 	unsigned char *Scale = Arg->Scale;
 	unsigned char *ScaleN = Arg->ScaleN;
 	signed char * __restrict__ Out = (signed char * __restrict__) Arg->Out;
+	unsigned char * Infos = Arg->Infos;
+	unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN];
+	int A0 = *((unsigned char *) &Infos[AT_INF_A0]); int B0 = *((unsigned char *) &Infos[AT_INF_B0]); int C0 = *((unsigned char *) &Infos[AT_INF_C0]);
 
 	unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(OutDim), First = CoreId*ChunkCell, Last  = Min(First+ChunkCell, OutDim);
 	v4s * __restrict__ VectIn = (v4s *) In;
@@ -527,70 +504,51 @@ void KerParLinearLayerFullFeatB32_SQ8(KerLinear_SQ8_T *Arg)
 		}
 		if (InDim&0x4) Acc = gap_sumdotp4(VectIn[InDim/4-1], W[InDim/4-1], Acc);
 		for (int j=4*(InDim/4); j<InDim; j++) Acc += In[j]*Weights[i*InDim+j];
-		Out[i] = gap_clip(AT_SCALE(Acc, Scale[i], ScaleN[i]), 7);
+		Acc = AT_SCALE(Acc, Scale[i], ScaleN[i]); ACT_SWITCH(Acc, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+		Out[i] = gap_clip(Acc, 7);
 	}
 	gap_waitbarrier(0);
 }
 
 
-void KerParLinearLayerFullFeatB32_ReLU_SQ8(KerLinear_SQ8_T *Arg)
+void KerParLinearLayerFullFeatB32_SQ8(KerLinear_SQ8_T *Arg) {
+	KerParLinearLayerFullFeatB32_SQ8_act(Arg, ACT_NONE);
+}
 
-{
-	signed char * __restrict__ In = Arg->In;
-	unsigned int InDim = Arg->InDim, OutDim = Arg->OutDim;
-	const signed char * __restrict__ Weights = Arg->Weights;
-	const int * __restrict__ Bias = Arg->Bias;
-	unsigned int NormBias = ((unsigned char *)Arg->Infos)[AT_INF_BIASN];
-	unsigned char *Scale = Arg->Scale;
-	unsigned char *ScaleN = Arg->ScaleN;
-	signed char * __restrict__ Out = (signed char * __restrict__) Arg->Out;
+void KerParLinearLayerFullFeatB32_ReLU_SQ8(KerLinear_SQ8_T *Arg) {
+	KerParLinearLayerFullFeatB32_SQ8_act(Arg, ACT_RELU);
+}
 
-	unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(OutDim), First = CoreId*ChunkCell, Last  = Min(First+ChunkCell, OutDim);
-	v4s * __restrict__ VectIn = (v4s *) In;
+void KerParLinearLayerFullFeatB32_ReLUN_SQ8(KerLinear_SQ8_T *Arg) {
+	KerParLinearLayerFullFeatB32_SQ8_act(Arg, ACT_RELUN);
+}
 
-	for (int i=First; i<Last; i++) {
-		v4s * __restrict__ W = (v4s *) (&Weights[i*InDim]);
-		int Acc = AT_LSHIFT(Bias[i], NormBias);
-		for (int j=0; j<(InDim/(4*2)); j++) {
-			v4s V0=VectIn[2*j], V1=VectIn[2*j+1];
-			v4s C0=W[2*j], C1=W[2*j+1];
-			Acc = gap_sumdotp4(V0, C0, Acc); Acc = gap_sumdotp4(V1, C1, Acc);
-		}
-		if (InDim&0x4) Acc = gap_sumdotp4(VectIn[InDim/4-1], W[InDim/4-1], Acc);
-		for (int j=4*(InDim/4); j<InDim; j++) Acc += In[j]*Weights[i*InDim+j];
-		Out[i] = Max(0, gap_clip(AT_SCALE(Acc, Scale[i], ScaleN[i]), 7));
-	}
-	gap_waitbarrier(0);
+void KerParLinearLayerFullFeatB32_ReLUM_SQ8(KerLinear_SQ8_T *Arg) {
+	KerParLinearLayerFullFeatB32_SQ8_act(Arg, ACT_RELUM);
 }
 
-void KerParLinearLayerFullFeatB32_ReLUN_SQ8(KerLinear_SQ8_T *Arg)
+void KerParLinearLayerFullFeatB32_ReLUMN_SQ8(KerLinear_SQ8_T *Arg) {
+	KerParLinearLayerFullFeatB32_SQ8_act(Arg, ACT_RELUMN);
+}
 
-{
-	signed char * __restrict__ In = Arg->In;
-	unsigned int InDim = Arg->InDim, OutDim = Arg->OutDim;
-	const signed char * __restrict__ Weights = Arg->Weights;
-	const int * __restrict__ Bias = Arg->Bias;
-	unsigned int NormBias = ((unsigned char *)Arg->Infos)[AT_INF_BIASN];
-	unsigned char *Scale = Arg->Scale;
-	unsigned char *ScaleN = Arg->ScaleN;
-	int A0 = Arg->Infos[AT_INF_A0];
-	signed char * __restrict__ Out = (signed char * __restrict__) Arg->Out;
+void KerParLinearLayerFullFeatB32_LeakyReLU_SQ8(KerLinear_SQ8_T *Arg) {
+	KerParLinearLayerFullFeatB32_SQ8_act(Arg, ACT_LEAKYRELU);
+}
 
-	unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(OutDim), First = CoreId*ChunkCell, Last  = Min(First+ChunkCell, OutDim);
-	v4s * __restrict__ VectIn = (v4s *) In;
+void KerParLinearLayerFullFeatB32_HSigmoid_SQ8(KerLinear_SQ8_T *Arg) {
+	KerParLinearLayerFullFeatB32_SQ8_act(Arg, ACT_HSIGMOID);
+}
 
-	for (int i=First; i<Last; i++) {
-		v4s * __restrict__ W = (v4s *) (&Weights[i*InDim]);
-		int Acc = AT_LSHIFT(Bias[i], NormBias);
-		for (int j=0; j<(InDim/(4*2)); j++) {
-			v4s V0=VectIn[2*j], V1=VectIn[2*j+1];
-			v4s C0=W[2*j], C1=W[2*j+1];
-			Acc = gap_sumdotp4(V0, C0, Acc); Acc = gap_sumdotp4(V1, C1, Acc);
-		}
-		if (InDim&0x4) Acc = gap_sumdotp4(VectIn[InDim/4-1], W[InDim/4-1], Acc);
-		for (int j=4*(InDim/4); j<InDim; j++) Acc += In[j]*Weights[i*InDim+j];
-		Out[i] = AT_CLIP_POS(AT_SCALE(Acc, Scale[i], ScaleN[i]), A0);
-	}
-	gap_waitbarrier(0);
+void KerParLinearLayerFullFeatB32_HSwish_SQ8(KerLinear_SQ8_T *Arg) {
+	KerParLinearLayerFullFeatB32_SQ8_act(Arg, ACT_HSWISH);
 }
+
+void KerParLinearLayerFullFeatB32_Sigmoid_SQ8(KerLinear_SQ8_T *Arg) {
+	KerParLinearLayerFullFeatB32_SQ8_act(Arg, ACT_SIGMOID);
+}
+
+void KerParLinearLayerFullFeatB32_Tanh_SQ8(KerLinear_SQ8_T *Arg) {
+	KerParLinearLayerFullFeatB32_SQ8_act(Arg, ACT_TANH);
+}
+
 #pragma GCC diagnostic pop
diff --git a/tools/autotiler_v3/CNN_Libraries_SQ8/CNN_Conv_SQ8.c b/tools/autotiler_v3/CNN_Libraries_SQ8/CNN_Conv_SQ8.c
index c4296cdaf..2177327fc 100644
--- a/tools/autotiler_v3/CNN_Libraries_SQ8/CNN_Conv_SQ8.c
+++ b/tools/autotiler_v3/CNN_Libraries_SQ8/CNN_Conv_SQ8.c
@@ -3640,11 +3640,7 @@ static void __attribute__ ((noinline)) KerConvNxMStrideSxSy_Body_SQ8(
 		for (unsigned int w=Wo_F; w<Wo_L; w++) {
 			int Acc = *PtO;
 			for (unsigned int i=0; i<Fh; i++) {
-				// for (unsigned int j=0; j<Fw/4; j++) Acc = gap_sumdotp4(*((v4s *) &In[(h*StrideY-PadT+i)*W + (w*StrideX-PadL+j)]), *((v4s *) &Filter[Fw*i+j]), Acc);
-				// for (unsigned int j=(Fw/4)*4; j<Fw; j++) Acc += In[(h*StrideY-PadT+i)*W + (w*StrideX-PadL+j)]*Filter[Fw*i+j];
-				for (unsigned int j=0; j<Fw; j++) {
-					Acc += In[(h*StrideY-PadT+i)*W + (w*StrideX-PadL+j)]*Filter[Fw*i+j];
-				}
+				for (unsigned int j=0; j<Fw; j++) Acc += In[(h*StrideY-PadT+i)*W + (w*StrideX-PadL+j)]*Filter[Fw*i+j];
 			}
 			*PtO = Acc; PtO++;
 		}
@@ -3717,9 +3713,9 @@ static void __attribute__ ((noinline)) KerConv3x3DxDyStrideSxSy_Body_SQ8(
         int *PtO = Out+Wo*Ho_F+Wo_F;
 
         V0 = (v4s){0}; V1 = (v4s){0}; V2 = (v4s){0};
-        for (unsigned int h=Ho_F; h<Ho_L; h++) {
+	for (unsigned int h=Ho_F; h<Ho_L; h++) {
                 signed char *PtI = In + (h*StrideY-PadT)*W + Wo_F*StrideX-PadL;
-                for (unsigned int w=Wo_F; w<Wo_L; w++) {
+		for (unsigned int w=Wo_F; w<Wo_L; w++) {
                         int Acc = *PtO;
 			signed char X0, X1, X2;
 			X0 = PtI[0]; X1 = PtI[Dw]; X2 = PtI[2*Dw]; V0 = gap_pack4(X0, X1, X2, 0); PtI += Dh*W;
@@ -3753,7 +3749,7 @@ static void __attribute__ ((noinline)) KerConv3x3DxDyStride1x1_Body_SQ8(
         )
 {
         int Fw = 3, Fh = 3;
-	unsigned StrideX = 1, StrideY = 1;
+	int StrideX = 1, StrideY = 1;
         unsigned short int PadL = Pad[0], PadT = Pad[2];
         v4s C0 = *((v4s *) &Filter[0]);
         v4s C1 = *((v4s *) &Filter[3]);
@@ -3796,7 +3792,7 @@ static void __attribute__ ((noinline)) KerConv3x3D2D2Stride1x1_Body_SQ8(
 {
         int Fw = 3, Fh = 3;
         int Dw = 2, Dh = 2;
-	unsigned StrideX = 1, StrideY = 1;
+	int StrideX = 1, StrideY = 1;
         unsigned short int PadL = Pad[0], PadT = Pad[2];
         v4s C0 = *((v4s *) &Filter[0]);
         v4s C1 = *((v4s *) &Filter[3]);
@@ -3860,7 +3856,7 @@ static void __attribute__ ((noinline)) KerConv3x3D4D4Stride1x1_Body_SQ8(
 {
         int Fw = 3, Fh = 3;
         int Dw = 4, Dh = 4;
-	unsigned StrideX = 1, StrideY = 1;
+	int StrideX = 1, StrideY = 1;
         unsigned short int PadL = Pad[0], PadT = Pad[2];
         v4s C0 = *((v4s *) &Filter[0]);
         v4s C1 = *((v4s *) &Filter[3]);
@@ -3906,7 +3902,7 @@ static void __attribute__ ((noinline)) KerConv3x3D8D8Stride1x1_Body_SQ8(
 {
         int Fw = 3, Fh = 3;
         int Dw = 8, Dh = 8;
-	unsigned StrideX = 1, StrideY = 1;
+	int StrideX = 1, StrideY = 1;
         unsigned short int PadL = Pad[0], PadT = Pad[2];
         v4s C0 = *((v4s *) &Filter[0]);
         v4s C1 = *((v4s *) &Filter[3]);
@@ -4044,7 +4040,7 @@ static void __attribute__ ((noinline)) KerConv3x3D8D1Stride1x1_Body_SQ8(
 {
         int Fw = 3, Fh = 3;
         int Dw = 8, Dh = 1;
-	unsigned StrideX = 1, StrideY = 1;
+	int StrideX = 1, StrideY = 1;
         unsigned short int PadL = Pad[0], PadT = Pad[2];
         v4s C0 = *((v4s *) &Filter[0]);
         v4s C1 = *((v4s *) &Filter[3]);
@@ -6019,19 +6015,8 @@ void KerConv3x3Stride1_SQ8(KerConv_SQ8_T *Arg)
 		Ho_F = Max(First, Ho_F); Ho_L = Min(Last, Ho_L);
 	}
 	if (First<Last) {
-#ifdef ALT
-		unsigned int TotalInFeatures = Arg->TotalInFeatures, InFeatures = Arg->InFeatures, OutFeatures = Arg->OutFeatures;
-		for (unsigned int of=0; of<OutFeatures; of++)
-			for (unsigned int If=0; If<InFeatures; If++) {
-				signed char *in = In+W*H*If, *filter = Filter+FS*FS*(TotalInFeatures*of  + If);
-				int *out = Out+Wo*Ho*(of);
-				KerConv3x3Stride1_Body_SQ8(in, out, filter, W, H, Wo, Wo_F, Wo_L, Ho, Ho_F, Ho_L, PadOrg);
-				if ((int)PadIn) KerConv3x3BorderStride1_SQ8(in, out, filter, W, H, Wo, Wo_F, Wo_L, Ho, Ho_F, Ho_L, PadIn, PadOrg);
-			}
-#else
 		KerConv3x3Stride1_Body_SQ8(In, Out, Filter, W, H, Wo, Wo_F, Wo_L, Ho, Ho_F, Ho_L, PadOrg);
 		if ((int)PadIn) KerConv3x3BorderStride1_SQ8(In, Out, Filter, W, H, Wo, Wo_F, Wo_L, Ho, Ho_F, Ho_L, PadIn, PadOrg);
-#endif
 	}
 	gap_waitbarrier(0);
 }
diff --git a/tools/autotiler_v3/CNN_Libraries_SQ8/CNN_Infos_SQ8.h b/tools/autotiler_v3/CNN_Libraries_SQ8/CNN_Infos_SQ8.h
index b4b629cde..413a0ccba 100644
--- a/tools/autotiler_v3/CNN_Libraries_SQ8/CNN_Infos_SQ8.h
+++ b/tools/autotiler_v3/CNN_Libraries_SQ8/CNN_Infos_SQ8.h
@@ -52,11 +52,10 @@
 #define AT_INF_OUTSCALEN	(AT_INF_OUTZEROPOINT+AT_INF_OUTZEROPOINT_LEN) // 11
 #define AT_INF_OUTSCALEN_LEN	1
 
-#define AT_INF_DIM		(AT_INF_OUTSCALEN+AT_INF_OUTSCALEN_LEN) // 12
-
 #define AT_INF_PRENORM          (AT_INF_OUTSCALEN+AT_INF_OUTSCALEN_LEN) // 12
 #define AT_INF_PRENORM_LEN      1
 
+#define AT_INF_DIM      (AT_INF_PRENORM+AT_INF_PRENORM_LEN) // 13
 #define AT_INF_SQ16_DIM         (AT_INF_PRENORM+AT_INF_PRENORM_LEN) // 13
 
 #define AT_INF_ADD_BIAS		AT_INF_PRENORM // 13
diff --git a/tools/autotiler_v3/CNN_Libraries_SQ8/CNN_MatAlgebra_SQ8.c b/tools/autotiler_v3/CNN_Libraries_SQ8/CNN_MatAlgebra_SQ8.c
index debd2ba4b..4c1c2218c 100644
--- a/tools/autotiler_v3/CNN_Libraries_SQ8/CNN_MatAlgebra_SQ8.c
+++ b/tools/autotiler_v3/CNN_Libraries_SQ8/CNN_MatAlgebra_SQ8.c
@@ -638,7 +638,9 @@ void KerMatAdd_ReLUMN_USQ8(KerMat3_SQ8_T *Arg)
 *************************************************************************************************************************************************/
 
 /* 	Byte Bias */
-void KerParMatMulB8_SQ8(KerMatMul_SQ8_T *Arg)
+static inline void __attribute__((always_inline)) KerParMatMulB8_SQ8_act(
+	KerMatMul_SQ8_T *Arg,
+	CNN_ActivationOper_T Activation)
 
 {
 	/*
@@ -658,6 +660,9 @@ void KerParMatMulB8_SQ8(KerMatMul_SQ8_T *Arg)
         unsigned int OutFirstCol = Arg->OutFirstCol;
         signed char * __restrict__ BufferColIn2 = Arg->BufferColIn2;
         int ColFirst = Arg->ColFirst;
+	unsigned char * Infos = (unsigned char *) Arg->Infos;
+	unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN];
+	int A0 = *((unsigned char *) &Infos[AT_INF_A0]); int B0 = *((unsigned char *) &Infos[AT_INF_B0]); int C0 = *((unsigned char *) &Infos[AT_INF_C0]);
 
         unsigned int H_In2 = W_In1;
         unsigned int H_Out = H_In1;
@@ -702,7 +707,11 @@ void KerParMatMulB8_SQ8(KerMatMul_SQ8_T *Arg)
 				S3 += V0 * BufferColIn2[i+3*H_In2];
 			}
 			unsigned int Sc = Scale[Line], ScN = ScaleN[Line];
-			v4s R = gap_pack4(gap_clip(AT_SCALE(S0, Sc, ScN), 7), gap_clip(AT_SCALE(S1, Sc, ScN), 7), gap_clip(AT_SCALE(S2, Sc, ScN), 7), gap_clip(AT_SCALE(S3, Sc, ScN), 7));
+			S0 = AT_SCALE(S0, Sc,  ScN);  ACT_SWITCH(S0, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+			S1 = AT_SCALE(S1, Sc,  ScN);  ACT_SWITCH(S1, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+			S2 = AT_SCALE(S2, Sc,  ScN);  ACT_SWITCH(S2, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+			S3 = AT_SCALE(S3, Sc,  ScN);  ACT_SWITCH(S3, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+			v4s R = gap_pack4(gap_clip(S0, 7), gap_clip(S1, 7), gap_clip(S2, 7), gap_clip(S3, 7));
 			*((v4s *) (Out+(Line+OffLine)*W_Out+4*Col+0+OffCol)) = R;
                 }
                 gap_waitbarrier(0);
@@ -730,8 +739,10 @@ void KerParMatMulB8_SQ8(KerMatMul_SQ8_T *Arg)
 				S1 += V0 * BufferColIn2[i+1*H_In2];
 			}
 			unsigned int Sc = Scale[Line], ScN = ScaleN[Line];
-			Out[(Line+OffLine)*W_Out+2*Col+0+OffCol] = gap_clip(AT_SCALE(S0, Sc, ScN), 7);
-			Out[(Line+OffLine)*W_Out+2*Col+1+OffCol] = gap_clip(AT_SCALE(S1, Sc, ScN), 7);
+			S0 = AT_SCALE(S0, Sc,  ScN);  ACT_SWITCH(S0, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+			S1 = AT_SCALE(S1, Sc,  ScN);  ACT_SWITCH(S1, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+			Out[(Line+OffLine)*W_Out+2*Col+0+OffCol] = gap_clip(S0, 7);
+			Out[(Line+OffLine)*W_Out+2*Col+1+OffCol] = gap_clip(S1, 7);
                 }
                 gap_waitbarrier(0);
 	}
@@ -754,136 +765,172 @@ void KerParMatMulB8_SQ8(KerMatMul_SQ8_T *Arg)
 				S0 += V0 * BufferColIn2[i];
 			}
 			unsigned int Sc = Scale[Line], ScN = ScaleN[Line];
-			Out[(Line+OffLine)*W_Out+1*Col+0+OffCol] = gap_clip(AT_SCALE(S0, Sc, ScN), 7);
+			S0 = AT_SCALE(S0, Sc,  ScN);  ACT_SWITCH(S0, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+			Out[(Line+OffLine)*W_Out+1*Col+0+OffCol] = gap_clip(S0, 7);
                 }
                 gap_waitbarrier(0);
 	}
 }
 
-void KerParMatMulB8_ReLU_SQ8(KerMatMul_SQ8_T *Arg)
+void KerParMatMulB8_SQ8(KerMatMul_SQ8_T *Arg) {	/* Byte-bias MatMul entry point: runs the shared always_inline body with Activation = ACT_NONE */
+	KerParMatMulB8_SQ8_act(Arg, ACT_NONE);
+}
+
+void KerParMatMulB8_ReLU_SQ8(KerMatMul_SQ8_T *Arg) {	/* Byte-bias MatMul entry point: runs the shared always_inline body with Activation = ACT_RELU */
+	KerParMatMulB8_SQ8_act(Arg, ACT_RELU);
+}
+
+void KerParMatMulB8_ReLUN_SQ8(KerMatMul_SQ8_T *Arg) {	/* Byte-bias MatMul entry point: runs the shared always_inline body with Activation = ACT_RELUN */
+	KerParMatMulB8_SQ8_act(Arg, ACT_RELUN);
+}
+
+void KerParMatMulB8_ReLUM_SQ8(KerMatMul_SQ8_T *Arg) {	/* Byte-bias MatMul entry point: runs the shared always_inline body with Activation = ACT_RELUM */
+	KerParMatMulB8_SQ8_act(Arg, ACT_RELUM);
+}
+
+void KerParMatMulB8_ReLUMN_SQ8(KerMatMul_SQ8_T *Arg) {	/* Byte-bias MatMul entry point: runs the shared always_inline body with Activation = ACT_RELUMN */
+	KerParMatMulB8_SQ8_act(Arg, ACT_RELUMN);
+}
+
+void KerParMatMulB8_LeakyReLU_SQ8(KerMatMul_SQ8_T *Arg) {	/* Byte-bias MatMul entry point: runs the shared always_inline body with Activation = ACT_LEAKYRELU */
+	KerParMatMulB8_SQ8_act(Arg, ACT_LEAKYRELU);
+}
+
+void KerParMatMulB8_HSwish_SQ8(KerMatMul_SQ8_T *Arg) {	/* Byte-bias MatMul entry point: runs the shared always_inline body with Activation = ACT_HSWISH */
+	KerParMatMulB8_SQ8_act(Arg, ACT_HSWISH);
+}
+
+void KerParMatMulB8_HSigmoid_SQ8(KerMatMul_SQ8_T *Arg) {	/* Byte-bias MatMul entry point: runs the shared always_inline body with Activation = ACT_HSIGMOID */
+	KerParMatMulB8_SQ8_act(Arg, ACT_HSIGMOID);
+}
+
+void KerParMatMulB8_Sigmoid_SQ8(KerMatMul_SQ8_T *Arg) {	/* Byte-bias MatMul entry point: runs the shared always_inline body with Activation = ACT_SIGMOID */
+	KerParMatMulB8_SQ8_act(Arg, ACT_SIGMOID);
+}
+
+void KerParMatMulB8_Tanh_SQ8(KerMatMul_SQ8_T *Arg) {	/* Byte-bias MatMul entry point: runs the shared always_inline body with Activation = ACT_TANH */
+	KerParMatMulB8_SQ8_act(Arg, ACT_TANH);
+}
+
+static inline void __attribute__((always_inline)) KerParMatMulSxSyB8_SQ8_act(	/* Shared body of the KerParMatMulSxSyB8_*_SQ8 entry points */
+	KerMatMul_SQ8_T *Arg,	/* Kernel arguments: In1/In2/Bias/Out pointers, dimensions, Sx/Sy, Scale/ScaleN, Infos */
+	CNN_ActivationOper_T Activation)	/* Activation selector handed to ACT_SWITCH for every output value */
 
 {
-	/*
-	 	Column buffer has to be sized in order to be able to accomodate up to 4 columns of size H_In2
-	*/
-        signed char * __restrict__ In1 = Arg->In1;
-        unsigned int W_In1 = Arg->W_In1;
-        unsigned int H_In1 = Arg->H_In1;
-        signed char * __restrict__ In2 = Arg->In2;
-        unsigned int W_In2 = Arg->W_In2;
-        signed char * __restrict__ Bias = Arg->Bias;
-        signed char * __restrict__ Out = Arg->Out;
-        unsigned int W_Out = Arg->W_Out;
+/*
+	In1 is usually the Conv1x1 filter set, e.g. In1 is [OutFeat][InFeat]
+	In2 is  [InFeat][Width*Height]
+
+	When we receive tiles In2 and if StrideY is != 1 tile is always [OutFeat][K*(Width*Scy)]
+*/
+	signed char * __restrict__ In1 = Arg->In1;
+	unsigned int W_In1 = Arg->W_In1;
+	unsigned int H_In1 = Arg->H_In1;
+	signed char * __restrict__ In2 = Arg->In2;
+	unsigned int W_In2 = Arg->W_In2;
+	signed char * __restrict__ Bias = Arg->Bias;	/* Bias is read as one signed byte per In1 line */
+	signed char * __restrict__ Out = Arg->Out;
+	unsigned int W_Out = Arg->W_Out;
+	int Pi = Arg->OutFirstCol;	/* Output offset, used as a line or a column offset depending on ColFirst */
+	signed char *BufferColIn2 = Arg->BufferColIn2;	/* Scratch buffer receiving one In2 column (H_In2 entries) per iteration */
+	unsigned int NormBias = Arg->NormBias;
+	int Wi = Arg->W, Hi = Arg->H;
+	int Sx = Arg->Sx, Sy = Arg->Sy;
+	int ColFirst = Arg->ColFirst;
 	unsigned char * __restrict__ Scale = Arg->Scale;
 	unsigned char * __restrict__ ScaleN = Arg->ScaleN;
-	unsigned int NormBias = Arg->NormBias;
-        unsigned int OutFirstCol = Arg->OutFirstCol;
-        signed char * __restrict__ BufferColIn2 = Arg->BufferColIn2;
-        int ColFirst = Arg->ColFirst;
+	unsigned char * Infos = (unsigned char *) Arg->Infos;	/* Infos is accessed as unsigned bytes below */
+	unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN];	/* Activation scale parameters passed to ACT_SWITCH */
+	int A0 = *((unsigned char *) &Infos[AT_INF_A0]); int B0 = *((unsigned char *) &Infos[AT_INF_B0]); int C0 = *((unsigned char *) &Infos[AT_INF_C0]);	/* A0/B0/C0 are each read as a single unsigned byte */
 
-        unsigned int H_In2 = W_In1;
-        unsigned int H_Out = H_In1;
-        unsigned int Line, Col, i;
-        v4s * __restrict__ VBuff0 = (v4s *) BufferColIn2;
-        v4s * __restrict__ VBuff1 = (v4s *) (BufferColIn2+H_In2);
-        v4s * __restrict__ VBuff2 = (v4s *) (BufferColIn2+2*H_In2);
-        v4s * __restrict__ VBuff3 = (v4s *) (BufferColIn2+3*H_In2);
+	unsigned int H_In2 = W_In1;
+	unsigned int H_Out = H_In1;	/* NOTE(review): H_Out is not referenced again in the visible body */
 
-        unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(H_In1), First = CoreId*ChunkCell, Last  = Min(H_In1, First+ChunkCell);
-        unsigned int C = ChunkSize(H_In2), F = CoreId*C, L  = Min(H_In2, F+C);
-        int OffLine = 0, OffCol = 0;
+	int Wo  = (Wi+Sx-1)/Sx, Ho = (Hi+Sy-1)/Sy;	/* NOTE(review): Wo and Ho are not referenced again in the visible body */
+	int Oo, OffLine;	/* Oo: output column index, OffLine: output line offset */
+	int At, F=0, L = W_In2;	/* At: current In2 column, F: x position in the current input row, L: In2 columns left */
 
-        if (ColFirst) OffLine = OutFirstCol; else OffCol = OutFirstCol;
-        for (Col=0; Col<W_In2/4; Col++) {
-                for (i=F;i<L; i++) {
-			int X0 = In2[i*W_In2+4*Col+0];
-			int X1 = In2[i*W_In2+4*Col+1];
-			int X2 = In2[i*W_In2+4*Col+2];
-			int X3 = In2[i*W_In2+4*Col+3];
-			BufferColIn2[i+0*H_In2] = X0; // In2[i*W_In2+4*Col+0];
-			BufferColIn2[i+1*H_In2] = X1; // In2[i*W_In2+4*Col+1];
-			BufferColIn2[i+2*H_In2] = X2; // In2[i*W_In2+4*Col+2];
-			BufferColIn2[i+3*H_In2] = X3; // In2[i*W_In2+4*Col+3];
-		}
-                gap_waitbarrier(0);
-                for (Line=First; Line<Last; Line++) {
-                        v4s *VIn1 = (v4s *) (&In1[Line*W_In1 + 0]);
-                        int S0 = (Bias[Line]<<NormBias), S1=S0, S2=S0, S3=S0;
-                        for (i=0; i<(W_In1/4); i++) {
-				v4s V0 = VIn1[i];
-                                S0 = gap_sumdotp4(V0, VBuff0[i], S0);
-                                S1 = gap_sumdotp4(V0, VBuff1[i], S1);
-                                S2 = gap_sumdotp4(V0, VBuff2[i], S2);
-                                S3 = gap_sumdotp4(V0, VBuff3[i], S3);
-                        }
-                        for (i=(W_In1/4)*4; i<W_In1; i++) {
-				int V0 = In1[Line*W_In1 + i];
-				S0 += V0 * BufferColIn2[i];
-				S1 += V0 * BufferColIn2[i+1*H_In2];
-				S2 += V0 * BufferColIn2[i+2*H_In2];
-				S3 += V0 * BufferColIn2[i+3*H_In2];
-			}
-			unsigned int Sc = Scale[Line], ScN = ScaleN[Line];
-			v4s R = gap_pack4(AT_CLIP_POS_IMM(AT_SCALE(S0, Sc, ScN), 7), AT_CLIP_POS_IMM(AT_SCALE(S1, Sc, ScN), 7),
-					  AT_CLIP_POS_IMM(AT_SCALE(S2, Sc, ScN), 7), AT_CLIP_POS_IMM(AT_SCALE(S3, Sc, ScN), 7));
-			*((v4s *) (Out+(Line+OffLine)*W_Out+4*Col+0+OffCol)) = R;
-                }
-                gap_waitbarrier(0);
-        }
-	if (W_In2&0x2) {
-		Col = W_In2/2 - 1;
-                for (i=F;i<L; i++) {
-			int X0 = In2[i*W_In2+2*Col+0];
-			int X1 = In2[i*W_In2+2*Col+1];
-			BufferColIn2[i+0*H_In2] = X0; // In2[i*W_In2+2*Col+0];
-			BufferColIn2[i+1*H_In2] = X1; // In2[i*W_In2+2*Col+1];
-		}
-                gap_waitbarrier(0);
-                for (Line=First; Line<Last; Line++) {
-                        v4s *VIn1 = (v4s *) (&In1[Line*W_In1 + 0]);
-                        int S0 = (Bias[Line]<<NormBias), S1=S0;
-                        for (i=0; i<(W_In1/4); i++) {
-				v4s V0 = VIn1[i];
-                                S0 = gap_sumdotp4(V0, VBuff0[i], S0);
-                                S1 = gap_sumdotp4(V0, VBuff1[i], S1);
-                        }
-                        for (i=(W_In1/4)*4; i<W_In1; i++) {
-				int V0 = In1[Line*W_In1 + i];
-				S0 += V0 * BufferColIn2[i];
-				S1 += V0 * BufferColIn2[i+1*H_In2];
+	unsigned int Line, Col, i;	/* NOTE(review): Col is not referenced again in the visible body */
+	v4s *VBuff = (v4s *) BufferColIn2;	/* Vector view of the column buffer for gap_sumdotp4 */
+
+	unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(H_In1), First = CoreId*ChunkCell, Last  = Min(H_In1, First+ChunkCell);	/* In1 lines [First, Last) are computed by this core */
+	unsigned int Ci = ChunkSize(H_In2), Fi = CoreId*Ci, Li  = Min(H_In2, Fi+Ci);	/* Column entries [Fi, Li) are copied by this core */
+
+	At = 0; OffLine = 0; Oo = 0;
+	if (ColFirst) OffLine = Pi; else Oo = Pi;	/* Pi offsets output lines when ColFirst is set, output columns otherwise */
+
+	while (L>0) {	/* One selected In2 column, i.e. one output column, per iteration */
+	       	for (i=Fi;i<Li; i++) BufferColIn2[i] = In2[i*W_In2+At];	/* Gather this core's slice of In2 column At */
+	       	gap_waitbarrier(0);	/* Barrier before the column buffer is read in full below */
+	       	for (Line=First; Line<Last; Line++) {
+		       	v4s *VIn1 = (v4s *) (&In1[Line*W_In1 + 0]);
+		       	int S = (Bias[Line]<<NormBias);	/* Accumulator starts from the bias shifted left by NormBias */
+		       	for (i=0; i<(W_In1/(4*2)); i++) {	/* Main dot product: two v4s pairs (8 elements) per iteration */
+				S = gap_sumdotp4(VIn1[2*i], VBuff[2*i], S);
+				S = gap_sumdotp4(VIn1[2*i+1], VBuff[2*i+1], S);
 			}
+			if (W_In1&0x4) S = gap_sumdotp4(VIn1[W_In1/4-1], VBuff[W_In1/4-1], S);	/* Leftover group of 4 when bit 2 of W_In1 is set */
+		       	for (i=(W_In1/4)*4; i<W_In1; i++) S += In1[Line*W_In1 + i] * BufferColIn2[i];	/* Scalar tail for the last W_In1%4 elements */
 			unsigned int Sc = Scale[Line], ScN = ScaleN[Line];
-			Out[(Line+OffLine)*W_Out+2*Col+0+OffCol] = AT_CLIP_POS_IMM(AT_SCALE(S0, Sc, ScN), 7);
-			Out[(Line+OffLine)*W_Out+2*Col+1+OffCol] = AT_CLIP_POS_IMM(AT_SCALE(S1, Sc, ScN), 7);
-                }
-                gap_waitbarrier(0);
-	}
-	if (W_In2&0x1) {
-		Col = W_In2-1;
-                for (i=F;i<L; i++) {
-			int X0 = In2[i*W_In2+1*Col+0];
-			BufferColIn2[i+0*H_In2] = X0; // In2[i*W_In2+4*Col+0];
+			S = AT_SCALE(S, Sc,  ScN);  ACT_SWITCH(S, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);	/* Per-line rescale, then the selected activation */
+		       	Out[(Line+OffLine)*W_Out+Oo] = gap_clip(S, 7);	/* gap_clip(S, 7): presumably saturates to signed 8 bits - TODO confirm */
+	       	}
+		int nF = F+Sx;	/* Candidate next x position in the current input row */
+		if (nF<Wi) {
+			F = nF; At += Sx; L -= Sx; Oo++;	/* Same row: step Sx columns */
+		} else {
+			int d = Wi-F+(Sy-1)*Wi;	/* Rest of the current row plus Sy-1 whole rows */
+			F = 0; L -= d; At += d; Oo++;	/* Restart at x = 0 of the next selected row */
 		}
-                gap_waitbarrier(0);
-                for (Line=First; Line<Last; Line++) {
-                        v4s *VIn1 = (v4s *) (&In1[Line*W_In1 + 0]);
-                        int S0 = (Bias[Line]<<NormBias);
-                        for (i=0; i<(W_In1/4); i++) {
-				v4s V0 = VIn1[i];
-                                S0 = gap_sumdotp4(V0, VBuff0[i], S0);
-                        }
-                        for (i=(W_In1/4)*4; i<W_In1; i++) {
-				int V0 = In1[Line*W_In1 + i];
-				S0 += V0 * BufferColIn2[i];
-			}
-			unsigned int Sc = Scale[Line], ScN = ScaleN[Line];
-			Out[(Line+OffLine)*W_Out+1*Col+0+OffCol] = AT_CLIP_POS_IMM(AT_SCALE(S0, Sc, ScN), 7);
-                }
-                gap_waitbarrier(0);
+	       	gap_waitbarrier(0);	/* Barrier before BufferColIn2 is overwritten by the next iteration */
 	}
 }
 
-void KerParMatMulB8_ReLUN_SQ8(KerMatMul_SQ8_T *Arg)
+void KerParMatMulSxSyB8_SQ8(KerMatMul_SQ8_T *Arg) {	/* SxSy byte-bias MatMul entry point: runs the shared always_inline body with Activation = ACT_NONE */
+	KerParMatMulSxSyB8_SQ8_act(Arg, ACT_NONE);
+}
+
+void KerParMatMulSxSyB8_ReLU_SQ8(KerMatMul_SQ8_T *Arg) {	/* SxSy byte-bias MatMul entry point: runs the shared always_inline body with Activation = ACT_RELU */
+	KerParMatMulSxSyB8_SQ8_act(Arg, ACT_RELU);
+}
+
+void KerParMatMulSxSyB8_ReLUN_SQ8(KerMatMul_SQ8_T *Arg) {	/* SxSy byte-bias MatMul entry point: runs the shared always_inline body with Activation = ACT_RELUN */
+	KerParMatMulSxSyB8_SQ8_act(Arg, ACT_RELUN);
+}
+
+void KerParMatMulSxSyB8_ReLUM_SQ8(KerMatMul_SQ8_T *Arg) {	/* SxSy byte-bias MatMul entry point: runs the shared always_inline body with Activation = ACT_RELUM */
+	KerParMatMulSxSyB8_SQ8_act(Arg, ACT_RELUM);
+}
+
+void KerParMatMulSxSyB8_ReLUMN_SQ8(KerMatMul_SQ8_T *Arg) {	/* SxSy byte-bias MatMul entry point: runs the shared always_inline body with Activation = ACT_RELUMN */
+	KerParMatMulSxSyB8_SQ8_act(Arg, ACT_RELUMN);
+}
+
+void KerParMatMulSxSyB8_LeakyReLU_SQ8(KerMatMul_SQ8_T *Arg) {	/* SxSy byte-bias MatMul entry point: runs the shared always_inline body with Activation = ACT_LEAKYRELU */
+	KerParMatMulSxSyB8_SQ8_act(Arg, ACT_LEAKYRELU);
+}
+
+void KerParMatMulSxSyB8_HSwish_SQ8(KerMatMul_SQ8_T *Arg) {	/* SxSy byte-bias MatMul entry point: runs the shared always_inline body with Activation = ACT_HSWISH */
+	KerParMatMulSxSyB8_SQ8_act(Arg, ACT_HSWISH);
+}
+
+void KerParMatMulSxSyB8_HSigmoid_SQ8(KerMatMul_SQ8_T *Arg) {	/* SxSy byte-bias MatMul entry point: runs the shared always_inline body with Activation = ACT_HSIGMOID */
+	KerParMatMulSxSyB8_SQ8_act(Arg, ACT_HSIGMOID);
+}
+
+void KerParMatMulSxSyB8_Sigmoid_SQ8(KerMatMul_SQ8_T *Arg) {	/* SxSy byte-bias MatMul entry point: runs the shared always_inline body with Activation = ACT_SIGMOID */
+	KerParMatMulSxSyB8_SQ8_act(Arg, ACT_SIGMOID);
+}
+
+void KerParMatMulSxSyB8_Tanh_SQ8(KerMatMul_SQ8_T *Arg) {	/* SxSy byte-bias MatMul entry point: runs the shared always_inline body with Activation = ACT_TANH */
+	KerParMatMulSxSyB8_SQ8_act(Arg, ACT_TANH);
+}
+
+
+/* 	Half Word Bias */
+static inline void __attribute__((always_inline)) KerParMatMulB16_SQ8_act(
+	KerMatMul_SQ8_T *Arg,
+	CNN_ActivationOper_T Activation)
 
 {
 	/*
@@ -894,7 +941,7 @@ void KerParMatMulB8_ReLUN_SQ8(KerMatMul_SQ8_T *Arg)
         unsigned int H_In1 = Arg->H_In1;
         signed char * __restrict__ In2 = Arg->In2;
         unsigned int W_In2 = Arg->W_In2;
-        signed char * __restrict__ Bias = Arg->Bias;
+        short int * __restrict__ Bias = Arg->Bias;
         signed char * __restrict__ Out = Arg->Out;
         unsigned int W_Out = Arg->W_Out;
 	unsigned char * __restrict__ Scale = Arg->Scale;
@@ -903,7 +950,9 @@ void KerParMatMulB8_ReLUN_SQ8(KerMatMul_SQ8_T *Arg)
         unsigned int OutFirstCol = Arg->OutFirstCol;
         signed char * __restrict__ BufferColIn2 = Arg->BufferColIn2;
         int ColFirst = Arg->ColFirst;
-	int A0 = Arg->Infos[AT_INF_A0];
+	unsigned char * Infos = (unsigned char *) Arg->Infos;
+	unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN];
+	int A0 = *((unsigned char *) &Infos[AT_INF_A0]); int B0 = *((unsigned char *) &Infos[AT_INF_B0]); int C0 = *((unsigned char *) &Infos[AT_INF_C0]);
 
         unsigned int H_In2 = W_In1;
         unsigned int H_Out = H_In1;
@@ -948,8 +997,11 @@ void KerParMatMulB8_ReLUN_SQ8(KerMatMul_SQ8_T *Arg)
 				S3 += V0 * BufferColIn2[i+3*H_In2];
 			}
 			unsigned int Sc = Scale[Line], ScN = ScaleN[Line];
-			v4s R = gap_pack4(gap_clip(AT_CLIP_POS(AT_SCALE(S0, Sc, ScN), A0), 7), gap_clip(AT_CLIP_POS(AT_SCALE(S1, Sc, ScN), A0), 7),
-					  gap_clip(AT_CLIP_POS(AT_SCALE(S2, Sc, ScN), A0), 7), gap_clip(AT_CLIP_POS(AT_SCALE(S3, Sc, ScN), A0), 7));
+			S0 = AT_SCALE(S0, Sc,  ScN);  ACT_SWITCH(S0, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+			S1 = AT_SCALE(S1, Sc,  ScN);  ACT_SWITCH(S1, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+			S2 = AT_SCALE(S2, Sc,  ScN);  ACT_SWITCH(S2, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+			S3 = AT_SCALE(S3, Sc,  ScN);  ACT_SWITCH(S3, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+			v4s R = gap_pack4(gap_clip(S0, 7), gap_clip(S1, 7), gap_clip(S2, 7), gap_clip(S3, 7));
 			*((v4s *) (Out+(Line+OffLine)*W_Out+4*Col+0+OffCol)) = R;
                 }
                 gap_waitbarrier(0);
@@ -977,8 +1029,10 @@ void KerParMatMulB8_ReLUN_SQ8(KerMatMul_SQ8_T *Arg)
 				S1 += V0 * BufferColIn2[i+1*H_In2];
 			}
 			unsigned int Sc = Scale[Line], ScN = ScaleN[Line];
-			Out[(Line+OffLine)*W_Out+2*Col+0+OffCol] = gap_clip(AT_CLIP_POS(AT_SCALE(S0, Sc, ScN), A0), 7);
-			Out[(Line+OffLine)*W_Out+2*Col+1+OffCol] = gap_clip(AT_CLIP_POS(AT_SCALE(S1, Sc, ScN), A0), 7);
+			S0 = AT_SCALE(S0, Sc,  ScN);  ACT_SWITCH(S0, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+			S1 = AT_SCALE(S1, Sc,  ScN);  ACT_SWITCH(S1, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+			Out[(Line+OffLine)*W_Out+2*Col+0+OffCol] = gap_clip(S0, 7);
+			Out[(Line+OffLine)*W_Out+2*Col+1+OffCol] = gap_clip(S1, 7);
                 }
                 gap_waitbarrier(0);
 	}
@@ -1001,13 +1055,57 @@ void KerParMatMulB8_ReLUN_SQ8(KerMatMul_SQ8_T *Arg)
 				S0 += V0 * BufferColIn2[i];
 			}
 			unsigned int Sc = Scale[Line], ScN = ScaleN[Line];
-			Out[(Line+OffLine)*W_Out+1*Col+0+OffCol] = gap_clip(AT_CLIP_POS(AT_SCALE(S0, Sc, ScN), A0), 7);
+			S0 = AT_SCALE(S0, Sc,  ScN);  ACT_SWITCH(S0, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+			Out[(Line+OffLine)*W_Out+1*Col+0+OffCol] = gap_clip(S0, 7);
                 }
                 gap_waitbarrier(0);
 	}
 }
 
-void KerParMatMulSxSyB8_SQ8(KerMatMul_SQ8_T *Arg)
+void KerParMatMulB16_SQ8(KerMatMul_SQ8_T *Arg) {	/* Half-word-bias MatMul entry point: runs the shared always_inline body with Activation = ACT_NONE */
+	KerParMatMulB16_SQ8_act(Arg, ACT_NONE);
+}
+
+void KerParMatMulB16_ReLU_SQ8(KerMatMul_SQ8_T *Arg) {	/* Half-word-bias MatMul entry point: runs the shared always_inline body with Activation = ACT_RELU */
+	KerParMatMulB16_SQ8_act(Arg, ACT_RELU);
+}
+
+void KerParMatMulB16_ReLUN_SQ8(KerMatMul_SQ8_T *Arg) {	/* Half-word-bias MatMul entry point: runs the shared always_inline body with Activation = ACT_RELUN */
+	KerParMatMulB16_SQ8_act(Arg, ACT_RELUN);
+}
+
+void KerParMatMulB16_ReLUM_SQ8(KerMatMul_SQ8_T *Arg) {	/* Half-word-bias MatMul entry point: runs the shared always_inline body with Activation = ACT_RELUM */
+	KerParMatMulB16_SQ8_act(Arg, ACT_RELUM);
+}
+
+void KerParMatMulB16_ReLUMN_SQ8(KerMatMul_SQ8_T *Arg) {	/* Half-word-bias MatMul entry point: runs the shared always_inline body with Activation = ACT_RELUMN */
+	KerParMatMulB16_SQ8_act(Arg, ACT_RELUMN);
+}
+
+void KerParMatMulB16_LeakyReLU_SQ8(KerMatMul_SQ8_T *Arg) {	/* Half-word-bias MatMul entry point: runs the shared always_inline body with Activation = ACT_LEAKYRELU */
+	KerParMatMulB16_SQ8_act(Arg, ACT_LEAKYRELU);
+}
+
+void KerParMatMulB16_HSwish_SQ8(KerMatMul_SQ8_T *Arg) {	/* Half-word-bias MatMul entry point: runs the shared always_inline body with Activation = ACT_HSWISH */
+	KerParMatMulB16_SQ8_act(Arg, ACT_HSWISH);
+}
+
+void KerParMatMulB16_HSigmoid_SQ8(KerMatMul_SQ8_T *Arg) {	/* Half-word-bias MatMul entry point: runs the shared always_inline body with Activation = ACT_HSIGMOID */
+	KerParMatMulB16_SQ8_act(Arg, ACT_HSIGMOID);
+}
+
+void KerParMatMulB16_Sigmoid_SQ8(KerMatMul_SQ8_T *Arg) {	/* Half-word-bias MatMul entry point: runs the shared always_inline body with Activation = ACT_SIGMOID */
+	KerParMatMulB16_SQ8_act(Arg, ACT_SIGMOID);
+}
+
+void KerParMatMulB16_Tanh_SQ8(KerMatMul_SQ8_T *Arg) {	/* Half-word-bias MatMul entry point: runs the shared always_inline body with Activation = ACT_TANH */
+	KerParMatMulB16_SQ8_act(Arg, ACT_TANH);
+}
+
+
+static inline void __attribute__((always_inline)) KerParMatMulSxSyB16_SQ8_act(
+	KerMatMul_SQ8_T *Arg,
+	CNN_ActivationOper_T Activation)
 
 {
 /*
@@ -1021,7 +1119,7 @@ void KerParMatMulSxSyB8_SQ8(KerMatMul_SQ8_T *Arg)
 	unsigned int H_In1 = Arg->H_In1;
 	signed char * __restrict__ In2 = Arg->In2;
 	unsigned int W_In2 = Arg->W_In2;
-	signed char * __restrict__ Bias = Arg->Bias;
+	short int * __restrict__ Bias = Arg->Bias;
 	signed char * __restrict__ Out = Arg->Out;
 	unsigned int W_Out = Arg->W_Out;
 	int Pi = Arg->OutFirstCol;
@@ -1032,6 +1130,9 @@ void KerParMatMulSxSyB8_SQ8(KerMatMul_SQ8_T *Arg)
 	int ColFirst = Arg->ColFirst;
 	unsigned char * __restrict__ Scale = Arg->Scale;
 	unsigned char * __restrict__ ScaleN = Arg->ScaleN;
+	unsigned char * Infos = (unsigned char *) Arg->Infos;
+	unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN];
+	int A0 = *((unsigned char *) &Infos[AT_INF_A0]); int B0 = *((unsigned char *) &Infos[AT_INF_B0]); int C0 = *((unsigned char *) &Infos[AT_INF_C0]);
 
 	unsigned int H_In2 = W_In1;
 	unsigned int H_Out = H_In1;
@@ -1062,7 +1163,8 @@ void KerParMatMulSxSyB8_SQ8(KerMatMul_SQ8_T *Arg)
 			if (W_In1&0x4) S = gap_sumdotp4(VIn1[W_In1/4-1], VBuff[W_In1/4-1], S);
 		       	for (i=(W_In1/4)*4; i<W_In1; i++) S += In1[Line*W_In1 + i] * BufferColIn2[i];
 			unsigned int Sc = Scale[Line], ScN = ScaleN[Line];
-		       	Out[(Line+OffLine)*W_Out+Oo] = gap_clip(AT_SCALE(S, Sc, ScN), 7);
+			S = AT_SCALE(S, Sc,  ScN);  ACT_SWITCH(S, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+		       	Out[(Line+OffLine)*W_Out+Oo] = gap_clip(S, 7);
 	       	}
 		int nF = F+Sx;
 		if (nF<Wi) {
@@ -1075,145 +1177,52 @@ void KerParMatMulSxSyB8_SQ8(KerMatMul_SQ8_T *Arg)
 	}
 }
 
-void KerParMatMulSxSyB8_ReLU_SQ8(KerMatMul_SQ8_T *Arg)
+void KerParMatMulSxSyB16_SQ8(KerMatMul_SQ8_T *Arg) {	/* SxSy half-word-bias MatMul entry point: runs the shared always_inline body with Activation = ACT_NONE */
+	KerParMatMulSxSyB16_SQ8_act(Arg, ACT_NONE);
+}
 
-{
-/*
-	In1 is usually the Conv1x1 filter set, e,g In1 is [OutFeat][InFeat]
-	In2 is  [InFeat][Width*Height]
+void KerParMatMulSxSyB16_ReLU_SQ8(KerMatMul_SQ8_T *Arg) {	/* SxSy half-word-bias MatMul entry point: runs the shared always_inline body with Activation = ACT_RELU */
+	KerParMatMulSxSyB16_SQ8_act(Arg, ACT_RELU);
+}
 
-	When we receive tiles In2 and if StrideY is != 1 tile is always [OutFeat][K*(Width*Scy)]
-*/
-	signed char * __restrict__ In1 = Arg->In1;
-	unsigned int W_In1 = Arg->W_In1;
-	unsigned int H_In1 = Arg->H_In1;
-	signed char * __restrict__ In2 = Arg->In2;
-	unsigned int W_In2 = Arg->W_In2;
-	signed char * __restrict__ Bias = Arg->Bias;
-	signed char * __restrict__ Out = Arg->Out;
-	unsigned int W_Out = Arg->W_Out;
-	int Pi = Arg->OutFirstCol;
-	signed char *BufferColIn2 = Arg->BufferColIn2;
-	unsigned int NormBias = Arg->NormBias;
-	int Wi = Arg->W, Hi = Arg->H;
-	int Sx = Arg->Sx, Sy = Arg->Sy;
-	int ColFirst = Arg->ColFirst;
-	unsigned char * __restrict__ Scale = Arg->Scale;
-	unsigned char * __restrict__ ScaleN = Arg->ScaleN;
+void KerParMatMulSxSyB16_ReLUN_SQ8(KerMatMul_SQ8_T *Arg) {	/* SxSy half-word-bias MatMul entry point: runs the shared always_inline body with Activation = ACT_RELUN */
+	KerParMatMulSxSyB16_SQ8_act(Arg, ACT_RELUN);
+}
 
-	unsigned int H_In2 = W_In1;
-	unsigned int H_Out = H_In1;
+void KerParMatMulSxSyB16_ReLUM_SQ8(KerMatMul_SQ8_T *Arg) {	/* SxSy half-word-bias MatMul entry point: runs the shared always_inline body with Activation = ACT_RELUM */
+	KerParMatMulSxSyB16_SQ8_act(Arg, ACT_RELUM);
+}
 
-	int Wo  = (Wi+Sx-1)/Sx, Ho = (Hi+Sy-1)/Sy;
-	int Oo, OffLine;
-	int At, F=0, L = W_In2;
-
-	unsigned int Line, Col, i;
-	v4s *VBuff = (v4s *) BufferColIn2;
-
-	unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(H_In1), First = CoreId*ChunkCell, Last  = Min(H_In1, First+ChunkCell);
-	unsigned int Ci = ChunkSize(H_In2), Fi = CoreId*Ci, Li  = Min(H_In2, Fi+Ci);
-
-	At = 0; OffLine = 0; Oo = 0;
-	if (ColFirst) OffLine = Pi; else Oo = Pi;
-
-	while (L>0) {
-	       	for (i=Fi;i<Li; i++) BufferColIn2[i] = In2[i*W_In2+At];
-	       	gap_waitbarrier(0);
-	       	for (Line=First; Line<Last; Line++) {
-		       	v4s *VIn1 = (v4s *) (&In1[Line*W_In1 + 0]);
-		       	int S = (Bias[Line]<<NormBias);
-		       	for (i=0; i<(W_In1/(4*2)); i++) {
-				S = gap_sumdotp4(VIn1[2*i], VBuff[2*i], S);
-				S = gap_sumdotp4(VIn1[2*i+1], VBuff[2*i+1], S);
-			}
-			if (W_In1&0x4) S = gap_sumdotp4(VIn1[W_In1/4-1], VBuff[W_In1/4-1], S);
-		       	for (i=(W_In1/4)*4; i<W_In1; i++) S += In1[Line*W_In1 + i] * BufferColIn2[i];
-			unsigned int Sc = Scale[Line], ScN = ScaleN[Line];
-		       	Out[(Line+OffLine)*W_Out+Oo] = AT_CLIP_POS_IMM(AT_SCALE(S, Sc, ScN), 7);
-	       	}
-		int nF = F+Sx;
-		if (nF<Wi) {
-			F = nF; At += Sx; L -= Sx; Oo++;
-		} else {
-			int d = Wi-F+(Sy-1)*Wi;
-			F = 0; L -= d; At += d; Oo++;
-		}
-	       	gap_waitbarrier(0);
-	}
+void KerParMatMulSxSyB16_ReLUMN_SQ8(KerMatMul_SQ8_T *Arg) {	/* SxSy half-word-bias MatMul entry point: runs the shared always_inline body with Activation = ACT_RELUMN */
+	KerParMatMulSxSyB16_SQ8_act(Arg, ACT_RELUMN);
 }
 
-void KerParMatMulSxSyB8_ReLUN_SQ8(KerMatMul_SQ8_T *Arg)
-
-{
-/*
-	In1 is usually the Conv1x1 filter set, e,g In1 is [OutFeat][InFeat]
-	In2 is  [InFeat][Width*Height]
-
-	When we receive tiles In2 and if StrideY is != 1 tile is always [OutFeat][K*(Width*Scy)]
-*/
-	signed char * __restrict__ In1 = Arg->In1;
-	unsigned int W_In1 = Arg->W_In1;
-	unsigned int H_In1 = Arg->H_In1;
-	signed char * __restrict__ In2 = Arg->In2;
-	unsigned int W_In2 = Arg->W_In2;
-	signed char * __restrict__ Bias = Arg->Bias;
-	signed char * __restrict__ Out = Arg->Out;
-	unsigned int W_Out = Arg->W_Out;
-	int Pi = Arg->OutFirstCol;
-	signed char *BufferColIn2 = Arg->BufferColIn2;
-	unsigned int NormBias = Arg->NormBias;
-	int Wi = Arg->W, Hi = Arg->H;
-	int Sx = Arg->Sx, Sy = Arg->Sy;
-	int ColFirst = Arg->ColFirst;
-	unsigned char * __restrict__ Scale = Arg->Scale;
-	unsigned char * __restrict__ ScaleN = Arg->ScaleN;
-	int A0 = Arg->Infos[AT_INF_A0];
-
-	unsigned int H_In2 = W_In1;
-	unsigned int H_Out = H_In1;
-
-	int Wo  = (Wi+Sx-1)/Sx, Ho = (Hi+Sy-1)/Sy;
-	int Oo, OffLine;
-	int At, F=0, L = W_In2;
+void KerParMatMulSxSyB16_LeakyReLU_SQ8(KerMatMul_SQ8_T *Arg) {	/* SxSy half-word-bias MatMul entry point: runs the shared always_inline body with Activation = ACT_LEAKYRELU */
+	KerParMatMulSxSyB16_SQ8_act(Arg, ACT_LEAKYRELU);
+}
 
-	unsigned int Line, Col, i;
-	v4s *VBuff = (v4s *) BufferColIn2;
+void KerParMatMulSxSyB16_HSwish_SQ8(KerMatMul_SQ8_T *Arg) {	/* SxSy half-word-bias MatMul entry point: runs the shared always_inline body with Activation = ACT_HSWISH */
+	KerParMatMulSxSyB16_SQ8_act(Arg, ACT_HSWISH);
+}
 
-	unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(H_In1), First = CoreId*ChunkCell, Last  = Min(H_In1, First+ChunkCell);
-	unsigned int Ci = ChunkSize(H_In2), Fi = CoreId*Ci, Li  = Min(H_In2, Fi+Ci);
+void KerParMatMulSxSyB16_HSigmoid_SQ8(KerMatMul_SQ8_T *Arg) {	/* SxSy half-word-bias MatMul entry point: runs the shared always_inline body with Activation = ACT_HSIGMOID */
+	KerParMatMulSxSyB16_SQ8_act(Arg, ACT_HSIGMOID);
+}
 
-	At = 0; OffLine = 0; Oo = 0;
-	if (ColFirst) OffLine = Pi; else Oo = Pi;
+void KerParMatMulSxSyB16_Sigmoid_SQ8(KerMatMul_SQ8_T *Arg) {	/* SxSy half-word-bias MatMul entry point: runs the shared always_inline body with Activation = ACT_SIGMOID */
+	KerParMatMulSxSyB16_SQ8_act(Arg, ACT_SIGMOID);
+}
 
-	while (L>0) {
-	       	for (i=Fi;i<Li; i++) BufferColIn2[i] = In2[i*W_In2+At];
-	       	gap_waitbarrier(0);
-	       	for (Line=First; Line<Last; Line++) {
-		       	v4s *VIn1 = (v4s *) (&In1[Line*W_In1 + 0]);
-		       	int S = (Bias[Line]<<NormBias);
-		       	for (i=0; i<(W_In1/(4*2)); i++) {
-				S = gap_sumdotp4(VIn1[2*i], VBuff[2*i], S);
-				S = gap_sumdotp4(VIn1[2*i+1], VBuff[2*i+1], S);
-			}
-			if (W_In1&0x4) S = gap_sumdotp4(VIn1[W_In1/4-1], VBuff[W_In1/4-1], S);
-		       	for (i=(W_In1/4)*4; i<W_In1; i++) S += In1[Line*W_In1 + i] * BufferColIn2[i];
-			unsigned int Sc = Scale[Line], ScN = ScaleN[Line];
-		       	Out[(Line+OffLine)*W_Out+Oo] = gap_clip(AT_CLIP_POS(AT_SCALE(S, Sc, ScN), A0), 7);
-	       	}
-		int nF = F+Sx;
-		if (nF<Wi) {
-			F = nF; At += Sx; L -= Sx; Oo++;
-		} else {
-			int d = Wi-F+(Sy-1)*Wi;
-			F = 0; L -= d; At += d; Oo++;
-		}
-	       	gap_waitbarrier(0);
-	}
+void KerParMatMulSxSyB16_Tanh_SQ8(KerMatMul_SQ8_T *Arg) {	/* SxSy half-word-bias MatMul entry point: runs the shared always_inline body with Activation = ACT_TANH */
+	KerParMatMulSxSyB16_SQ8_act(Arg, ACT_TANH);
 }
 
-/* 	Half Word Bias */
-void KerParMatMulB16_SQ8(KerMatMul_SQ8_T *Arg)
+
+/* 	Word Bias */
+static inline void __attribute__((always_inline)) KerParMatMulB32_SQ8_act(
+	KerMatMul_SQ8_T *Arg,
+	CNN_ActivationOper_T Activation
+)
 
 {
 	/*
@@ -1224,7 +1233,7 @@ void KerParMatMulB16_SQ8(KerMatMul_SQ8_T *Arg)
         unsigned int H_In1 = Arg->H_In1;
         signed char * __restrict__ In2 = Arg->In2;
         unsigned int W_In2 = Arg->W_In2;
-        short int * __restrict__ Bias = Arg->Bias;
+        int * __restrict__ Bias = Arg->Bias;
         signed char * __restrict__ Out = Arg->Out;
         unsigned int W_Out = Arg->W_Out;
 	unsigned char * __restrict__ Scale = Arg->Scale;
@@ -1233,6 +1242,9 @@ void KerParMatMulB16_SQ8(KerMatMul_SQ8_T *Arg)
         unsigned int OutFirstCol = Arg->OutFirstCol;
         signed char * __restrict__ BufferColIn2 = Arg->BufferColIn2;
         int ColFirst = Arg->ColFirst;
+	unsigned char * Infos = (unsigned char *) Arg->Infos;
+	unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN];
+	int A0 = *((unsigned char *) &Infos[AT_INF_A0]); int B0 = *((unsigned char *) &Infos[AT_INF_B0]); int C0 = *((unsigned char *) &Infos[AT_INF_C0]);
 
         unsigned int H_In2 = W_In1;
         unsigned int H_Out = H_In1;
@@ -1243,6 +1255,7 @@ void KerParMatMulB16_SQ8(KerMatMul_SQ8_T *Arg)
         v4s * __restrict__ VBuff3 = (v4s *) (BufferColIn2+3*H_In2);
 
         unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(H_In1), First = CoreId*ChunkCell, Last  = Min(H_In1, First+ChunkCell);
+        unsigned int Iter = (Last>First)?(Last-First):0;
         unsigned int C = ChunkSize(H_In2), F = CoreId*C, L  = Min(H_In2, F+C);
         int OffLine = 0, OffCol = 0;
 
@@ -1259,27 +1272,87 @@ void KerParMatMulB16_SQ8(KerMatMul_SQ8_T *Arg)
 			BufferColIn2[i+3*H_In2] = X3; // In2[i*W_In2+4*Col+3];
 		}
                 gap_waitbarrier(0);
-                for (Line=First; Line<Last; Line++) {
-                        v4s *VIn1 = (v4s *) (&In1[Line*W_In1 + 0]);
-                        int S0 = (Bias[Line]<<NormBias), S1=S0, S2=S0, S3=S0;
+                for (Line=0; Line<Iter/2; Line++) {
+                	int l1 = 2*Line + First;
+                        v4s *VIn1 = (v4s *) (&In1[(l1)*W_In1 + 0]);
+                        v4s *VIn2 = (v4s *) (&In1[(l1+1)*W_In1 + 0]);
+                        int S0=0, S1=0, S2=0, S3=0, S4=0, S5=0, S6=0, S7=0;
+                        if (Bias) {
+	                        S0 = (Bias[l1]<<NormBias), S1=S0, S2=S0, S3=S0;
+	                        S4 = (Bias[l1+1]<<NormBias), S5=S4, S6=S4, S7=S4;
+	                }
                         for (i=0; i<(W_In1/4); i++) {
 				v4s V0 = VIn1[i];
-                                S0 = gap_sumdotp4(V0, VBuff0[i], S0);
-                                S1 = gap_sumdotp4(V0, VBuff1[i], S1);
-                                S2 = gap_sumdotp4(V0, VBuff2[i], S2);
-                                S3 = gap_sumdotp4(V0, VBuff3[i], S3);
+				v4s A = VBuff0[i];
+				v4s B = VBuff1[i];
+				v4s C = VBuff2[i];
+				v4s D = VBuff3[i];
+                                S0 = gap_sumdotp4(V0, A, S0);
+                                S1 = gap_sumdotp4(V0, B, S1);
+                                S2 = gap_sumdotp4(V0, C, S2);
+                                S3 = gap_sumdotp4(V0, D, S3);
+				v4s V1 = VIn2[i];
+                                S4 = gap_sumdotp4(V1, A, S4);
+                                S5 = gap_sumdotp4(V1, B, S5);
+                                S6 = gap_sumdotp4(V1, C, S6);
+                                S7 = gap_sumdotp4(V1, D, S7);
                         }
                         for (i=(W_In1/4)*4; i<W_In1; i++) {
-				int V0 = In1[Line*W_In1 + i];
+				int V0 = In1[l1*W_In1 + i];
 				S0 += V0 * BufferColIn2[i];
 				S1 += V0 * BufferColIn2[i+1*H_In2];
 				S2 += V0 * BufferColIn2[i+2*H_In2];
 				S3 += V0 * BufferColIn2[i+3*H_In2];
+				int V1 = In1[(l1+1)*W_In1 + i];
+				S4 += V1 * BufferColIn2[i];
+				S5 += V1 * BufferColIn2[i+1*H_In2];
+				S6 += V1 * BufferColIn2[i+2*H_In2];
+				S7 += V1 * BufferColIn2[i+3*H_In2];
 			}
-			unsigned int Sc = Scale[Line], ScN = ScaleN[Line];
-			v4s R = gap_pack4(gap_clip(AT_SCALE(S0, Sc, ScN), 7), gap_clip(AT_SCALE(S1, Sc, ScN), 7), gap_clip(AT_SCALE(S2, Sc, ScN), 7), gap_clip(AT_SCALE(S3, Sc, ScN), 7));
-			*((v4s *) (Out+(Line+OffLine)*W_Out+4*Col+0+OffCol)) = R;
+			unsigned int Sc = Scale[l1], ScN = ScaleN[l1];
+			unsigned int Sc1 = Scale[l1+1], ScN1 = ScaleN[l1+1];
+			S0 = AT_SCALE(S0, Sc,  ScN);  ACT_SWITCH(S0, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+			S1 = AT_SCALE(S1, Sc,  ScN);  ACT_SWITCH(S1, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+			S2 = AT_SCALE(S2, Sc,  ScN);  ACT_SWITCH(S2, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+			S3 = AT_SCALE(S3, Sc,  ScN);  ACT_SWITCH(S3, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+			S4 = AT_SCALE(S4, Sc1, ScN1); ACT_SWITCH(S4, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+			S5 = AT_SCALE(S5, Sc1, ScN1); ACT_SWITCH(S5, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+			S6 = AT_SCALE(S6, Sc1, ScN1); ACT_SWITCH(S6, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+			S7 = AT_SCALE(S7, Sc1, ScN1); ACT_SWITCH(S7, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+			v4s R1 = gap_pack4(gap_clip(S0, 7), gap_clip(S1, 7), gap_clip(S2, 7), gap_clip(S3, 7));
+			v4s R2 = gap_pack4(gap_clip(S4, 7), gap_clip(S5, 7), gap_clip(S6, 7), gap_clip(S7, 7));
+			*((v4s *) (Out+(l1+OffLine)*W_Out+4*Col+0+OffCol)) = R1;
+			*((v4s *) (Out+(l1+OffLine+1)*W_Out+4*Col+0+OffCol)) = R2;
                 }
+		if (Iter&0x1) {
+			int l1 = Last - 1;
+			v4s *VIn1 = (v4s *) (&In1[l1*W_In1 + 0]);
+                        int S0=0, S1=0, S2=0, S3=0;
+                        if (Bias) {
+	                        S0 = (Bias[l1]<<NormBias), S1=S0, S2=S0, S3=S0;
+	                }
+			for (i=0; i<(W_In1/4); i++) {
+				v4s V0 = VIn1[i];
+				v4s A = VBuff0[i]; S0 = gap_sumdotp4(V0, A, S0);;
+				v4s B = VBuff1[i]; S1 = gap_sumdotp4(V0, B, S1);;
+				v4s C = VBuff2[i]; S2 = gap_sumdotp4(V0, C, S2);;
+				v4s D = VBuff3[i]; S3 = gap_sumdotp4(V0, D, S3);;
+			}
+                        for (i=(W_In1/4)*4; i<W_In1; i++) {
+				int V0 = In1[l1*W_In1 + i];
+				S0 += V0 * BufferColIn2[i];
+				S1 += V0 * BufferColIn2[i+1*H_In2];
+				S2 += V0 * BufferColIn2[i+2*H_In2];
+				S3 += V0 * BufferColIn2[i+3*H_In2];
+			}
+			unsigned int Sc = Scale[l1], ScN = ScaleN[l1];
+			S0 = AT_SCALE(S0, Sc,  ScN);  ACT_SWITCH(S0, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+			S1 = AT_SCALE(S1, Sc,  ScN);  ACT_SWITCH(S1, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+			S2 = AT_SCALE(S2, Sc,  ScN);  ACT_SWITCH(S2, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+			S3 = AT_SCALE(S3, Sc,  ScN);  ACT_SWITCH(S3, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+			v4s R1 = gap_pack4(gap_clip(S0, 7), gap_clip(S1, 7), gap_clip(S2, 7), gap_clip(S3, 7));
+			*((v4s *) (Out+(l1+OffLine)*W_Out+4*Col+0+OffCol)) = R1;
+		}
                 gap_waitbarrier(0);
         }
 	if (W_In2&0x2) {
@@ -1293,7 +1366,10 @@ void KerParMatMulB16_SQ8(KerMatMul_SQ8_T *Arg)
                 gap_waitbarrier(0);
                 for (Line=First; Line<Last; Line++) {
                         v4s *VIn1 = (v4s *) (&In1[Line*W_In1 + 0]);
-                        int S0 = (Bias[Line]<<NormBias), S1=S0;
+                        int S0=0, S1=0;
+                        if (Bias) {
+	                        S0 = (Bias[Line]<<NormBias), S1=S0;
+	                }
                         for (i=0; i<(W_In1/4); i++) {
 				v4s V0 = VIn1[i];
                                 S0 = gap_sumdotp4(V0, VBuff0[i], S0);
@@ -1305,8 +1381,10 @@ void KerParMatMulB16_SQ8(KerMatMul_SQ8_T *Arg)
 				S1 += V0 * BufferColIn2[i+1*H_In2];
 			}
 			unsigned int Sc = Scale[Line], ScN = ScaleN[Line];
-			Out[(Line+OffLine)*W_Out+2*Col+0+OffCol] = gap_clip(AT_SCALE(S0, Sc, ScN), 7);
-			Out[(Line+OffLine)*W_Out+2*Col+1+OffCol] = gap_clip(AT_SCALE(S1, Sc, ScN), 7);
+			S0 = AT_SCALE(S0, Sc,  ScN);  ACT_SWITCH(S0, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+			S1 = AT_SCALE(S1, Sc,  ScN);  ACT_SWITCH(S1, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+			Out[(Line+OffLine)*W_Out+2*Col+0+OffCol] = gap_clip(S0, 7);
+			Out[(Line+OffLine)*W_Out+2*Col+1+OffCol] = gap_clip(S1, 7);
                 }
                 gap_waitbarrier(0);
 	}
@@ -1319,7 +1397,10 @@ void KerParMatMulB16_SQ8(KerMatMul_SQ8_T *Arg)
                 gap_waitbarrier(0);
                 for (Line=First; Line<Last; Line++) {
                         v4s *VIn1 = (v4s *) (&In1[Line*W_In1 + 0]);
-                        int S0 = (Bias[Line]<<NormBias);
+                        int S0=0;
+                        if (Bias) {
+	                        S0 = (Bias[Line]<<NormBias);
+	                }
                         for (i=0; i<(W_In1/4); i++) {
 				v4s V0 = VIn1[i];
                                 S0 = gap_sumdotp4(V0, VBuff0[i], S0);
@@ -1329,291 +1410,90 @@ void KerParMatMulB16_SQ8(KerMatMul_SQ8_T *Arg)
 				S0 += V0 * BufferColIn2[i];
 			}
 			unsigned int Sc = Scale[Line], ScN = ScaleN[Line];
-			Out[(Line+OffLine)*W_Out+1*Col+0+OffCol] = gap_clip(AT_SCALE(S0, Sc, ScN), 7);
+			S0 = AT_SCALE(S0, Sc,  ScN);  ACT_SWITCH(S0, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+			Out[(Line+OffLine)*W_Out+1*Col+0+OffCol] = gap_clip(S0, 7);
                 }
                 gap_waitbarrier(0);
 	}
 }
 
-void KerParMatMulB16_ReLU_SQ8(KerMatMul_SQ8_T *Arg)
+void KerParMatMulB32_SQ8(KerMatMul_SQ8_T *Arg) {
+	KerParMatMulB32_SQ8_act(Arg, ACT_NONE);
+}
+
+void KerParMatMulB32_ReLU_SQ8(KerMatMul_SQ8_T *Arg) {
+	KerParMatMulB32_SQ8_act(Arg, ACT_RELU);
+}
+
+void KerParMatMulB32_ReLUN_SQ8(KerMatMul_SQ8_T *Arg) {
+	KerParMatMulB32_SQ8_act(Arg, ACT_RELUN);
+}
+
+void KerParMatMulB32_ReLUM_SQ8(KerMatMul_SQ8_T *Arg) {
+	KerParMatMulB32_SQ8_act(Arg, ACT_RELUM);
+}
+
+void KerParMatMulB32_ReLUMN_SQ8(KerMatMul_SQ8_T *Arg) {
+	KerParMatMulB32_SQ8_act(Arg, ACT_RELUMN);
+}
+
+void KerParMatMulB32_LeakyReLU_SQ8(KerMatMul_SQ8_T *Arg) {
+	KerParMatMulB32_SQ8_act(Arg, ACT_LEAKYRELU);
+}
+
+void KerParMatMulB32_HSwish_SQ8(KerMatMul_SQ8_T *Arg) {
+	KerParMatMulB32_SQ8_act(Arg, ACT_HSWISH);
+}
+
+void KerParMatMulB32_HSigmoid_SQ8(KerMatMul_SQ8_T *Arg) {
+	KerParMatMulB32_SQ8_act(Arg, ACT_HSIGMOID);
+}
+
+void KerParMatMulB32_Sigmoid_SQ8(KerMatMul_SQ8_T *Arg) {
+	KerParMatMulB32_SQ8_act(Arg, ACT_SIGMOID);
+}
+
+void KerParMatMulB32_Tanh_SQ8(KerMatMul_SQ8_T *Arg) {
+	KerParMatMulB32_SQ8_act(Arg, ACT_TANH);
+}
+
+static inline void __attribute__((always_inline)) KerParMatMulSxSyB32_SQ8_act(
+	KerMatMul_SQ8_T *Arg,
+	CNN_ActivationOper_T Activation)
 
 {
-	/*
-	 	Column buffer has to be sized in order to be able to accomodate up to 4 columns of size H_In2
-	*/
-        signed char * __restrict__ In1 = Arg->In1;
-        unsigned int W_In1 = Arg->W_In1;
-        unsigned int H_In1 = Arg->H_In1;
-        signed char * __restrict__ In2 = Arg->In2;
-        unsigned int W_In2 = Arg->W_In2;
-        short int * __restrict__ Bias = Arg->Bias;
-        signed char * __restrict__ Out = Arg->Out;
-        unsigned int W_Out = Arg->W_Out;
+/*
+	In1 is usually the Conv1x1 filter set, e.g. In1 is [OutFeat][InFeat]
+	In2 is  [InFeat][Width*Height]
+
+	When we receive tiles In2 and if StrideY is != 1 tile is always [OutFeat][K*(Width*Scy)]
+*/
+	signed char * __restrict__ In1 = Arg->In1;
+	unsigned int W_In1 = Arg->W_In1;
+	unsigned int H_In1 = Arg->H_In1;
+	signed char * __restrict__ In2 = Arg->In2;
+	unsigned int W_In2 = Arg->W_In2;
+	int * __restrict__ Bias = Arg->Bias;
+	signed char * __restrict__ Out = Arg->Out;
+	unsigned int W_Out = Arg->W_Out;
+	int Pi = Arg->OutFirstCol;
+	signed char *BufferColIn2 = Arg->BufferColIn2;
+	unsigned int NormBias = Arg->NormBias;
+	int Wi = Arg->W, Hi = Arg->H;
+	int Sx = Arg->Sx, Sy = Arg->Sy;
+	int ColFirst = Arg->ColFirst;
 	unsigned char * __restrict__ Scale = Arg->Scale;
 	unsigned char * __restrict__ ScaleN = Arg->ScaleN;
-	unsigned int NormBias = Arg->NormBias;
-        unsigned int OutFirstCol = Arg->OutFirstCol;
-        signed char * __restrict__ BufferColIn2 = Arg->BufferColIn2;
-        int ColFirst = Arg->ColFirst;
+	unsigned char * Infos = (unsigned char *) Arg->Infos;
+	unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN];
+	int A0 = *((unsigned char *) &Infos[AT_INF_A0]); int B0 = *((unsigned char *) &Infos[AT_INF_B0]); int C0 = *((unsigned char *) &Infos[AT_INF_C0]);
 
-        unsigned int H_In2 = W_In1;
-        unsigned int H_Out = H_In1;
-        unsigned int Line, Col, i;
-        v4s * __restrict__ VBuff0 = (v4s *) BufferColIn2;
-        v4s * __restrict__ VBuff1 = (v4s *) (BufferColIn2+H_In2);
-        v4s * __restrict__ VBuff2 = (v4s *) (BufferColIn2+2*H_In2);
-        v4s * __restrict__ VBuff3 = (v4s *) (BufferColIn2+3*H_In2);
+	unsigned int H_In2 = W_In1;
+	unsigned int H_Out = H_In1;
 
-        unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(H_In1), First = CoreId*ChunkCell, Last  = Min(H_In1, First+ChunkCell);
-        unsigned int C = ChunkSize(H_In2), F = CoreId*C, L  = Min(H_In2, F+C);
-        int OffLine = 0, OffCol = 0;
-
-        if (ColFirst) OffLine = OutFirstCol; else OffCol = OutFirstCol;
-        for (Col=0; Col<W_In2/4; Col++) {
-                for (i=F;i<L; i++) {
-			int X0 = In2[i*W_In2+4*Col+0];
-			int X1 = In2[i*W_In2+4*Col+1];
-			int X2 = In2[i*W_In2+4*Col+2];
-			int X3 = In2[i*W_In2+4*Col+3];
-			BufferColIn2[i+0*H_In2] = X0; // In2[i*W_In2+4*Col+0];
-			BufferColIn2[i+1*H_In2] = X1; // In2[i*W_In2+4*Col+1];
-			BufferColIn2[i+2*H_In2] = X2; // In2[i*W_In2+4*Col+2];
-			BufferColIn2[i+3*H_In2] = X3; // In2[i*W_In2+4*Col+3];
-		}
-                gap_waitbarrier(0);
-                for (Line=First; Line<Last; Line++) {
-                        v4s *VIn1 = (v4s *) (&In1[Line*W_In1 + 0]);
-                        int S0 = (Bias[Line]<<NormBias), S1=S0, S2=S0, S3=S0;
-                        for (i=0; i<(W_In1/4); i++) {
-				v4s V0 = VIn1[i];
-                                S0 = gap_sumdotp4(V0, VBuff0[i], S0);
-                                S1 = gap_sumdotp4(V0, VBuff1[i], S1);
-                                S2 = gap_sumdotp4(V0, VBuff2[i], S2);
-                                S3 = gap_sumdotp4(V0, VBuff3[i], S3);
-                        }
-                        for (i=(W_In1/4)*4; i<W_In1; i++) {
-				int V0 = In1[Line*W_In1 + i];
-				S0 += V0 * BufferColIn2[i];
-				S1 += V0 * BufferColIn2[i+1*H_In2];
-				S2 += V0 * BufferColIn2[i+2*H_In2];
-				S3 += V0 * BufferColIn2[i+3*H_In2];
-			}
-			unsigned int Sc = Scale[Line], ScN = ScaleN[Line];
-			v4s R = gap_pack4(AT_CLIP_POS_IMM(AT_SCALE(S0, Sc, ScN), 7), AT_CLIP_POS_IMM(AT_SCALE(S1, Sc, ScN), 7),
-					  AT_CLIP_POS_IMM(AT_SCALE(S2, Sc, ScN), 7), AT_CLIP_POS_IMM(AT_SCALE(S3, Sc, ScN), 7));
-			*((v4s *) (Out+(Line+OffLine)*W_Out+4*Col+0+OffCol)) = R;
-                }
-                gap_waitbarrier(0);
-        }
-	if (W_In2&0x2) {
-		Col = W_In2/2 - 1;
-                for (i=F;i<L; i++) {
-			int X0 = In2[i*W_In2+2*Col+0];
-			int X1 = In2[i*W_In2+2*Col+1];
-			BufferColIn2[i+0*H_In2] = X0; // In2[i*W_In2+2*Col+0];
-			BufferColIn2[i+1*H_In2] = X1; // In2[i*W_In2+2*Col+1];
-		}
-                gap_waitbarrier(0);
-                for (Line=First; Line<Last; Line++) {
-                        v4s *VIn1 = (v4s *) (&In1[Line*W_In1 + 0]);
-                        int S0 = (Bias[Line]<<NormBias), S1=S0;
-                        for (i=0; i<(W_In1/4); i++) {
-				v4s V0 = VIn1[i];
-                                S0 = gap_sumdotp4(V0, VBuff0[i], S0);
-                                S1 = gap_sumdotp4(V0, VBuff1[i], S1);
-                        }
-                        for (i=(W_In1/4)*4; i<W_In1; i++) {
-				int V0 = In1[Line*W_In1 + i];
-				S0 += V0 * BufferColIn2[i];
-				S1 += V0 * BufferColIn2[i+1*H_In2];
-			}
-			unsigned int Sc = Scale[Line], ScN = ScaleN[Line];
-			Out[(Line+OffLine)*W_Out+2*Col+0+OffCol] = AT_CLIP_POS_IMM(AT_SCALE(S0, Sc, ScN), 7);
-			Out[(Line+OffLine)*W_Out+2*Col+1+OffCol] = AT_CLIP_POS_IMM(AT_SCALE(S1, Sc, ScN), 7);
-                }
-                gap_waitbarrier(0);
-	}
-	if (W_In2&0x1) {
-		Col = W_In2-1;
-                for (i=F;i<L; i++) {
-			int X0 = In2[i*W_In2+1*Col+0];
-			BufferColIn2[i+0*H_In2] = X0; // In2[i*W_In2+4*Col+0];
-		}
-                gap_waitbarrier(0);
-                for (Line=First; Line<Last; Line++) {
-                        v4s *VIn1 = (v4s *) (&In1[Line*W_In1 + 0]);
-                        int S0 = (Bias[Line]<<NormBias);
-                        for (i=0; i<(W_In1/4); i++) {
-				v4s V0 = VIn1[i];
-                                S0 = gap_sumdotp4(V0, VBuff0[i], S0);
-                        }
-                        for (i=(W_In1/4)*4; i<W_In1; i++) {
-				int V0 = In1[Line*W_In1 + i];
-				S0 += V0 * BufferColIn2[i];
-			}
-			unsigned int Sc = Scale[Line], ScN = ScaleN[Line];
-			Out[(Line+OffLine)*W_Out+1*Col+0+OffCol] = AT_CLIP_POS_IMM(AT_SCALE(S0, Sc, ScN), 7);
-                }
-                gap_waitbarrier(0);
-	}
-}
-
-void KerParMatMulB16_ReLUN_SQ8(KerMatMul_SQ8_T *Arg)
-
-{
-	/*
-	 	Column buffer has to be sized in order to be able to accomodate up to 4 columns of size H_In2
-	*/
-        signed char * __restrict__ In1 = Arg->In1;
-        unsigned int W_In1 = Arg->W_In1;
-        unsigned int H_In1 = Arg->H_In1;
-        signed char * __restrict__ In2 = Arg->In2;
-        unsigned int W_In2 = Arg->W_In2;
-        short int * __restrict__ Bias = Arg->Bias;
-        signed char * __restrict__ Out = Arg->Out;
-        unsigned int W_Out = Arg->W_Out;
-	unsigned char * __restrict__ Scale = Arg->Scale;
-	unsigned char * __restrict__ ScaleN = Arg->ScaleN;
-	unsigned int NormBias = Arg->NormBias;
-        unsigned int OutFirstCol = Arg->OutFirstCol;
-        signed char * __restrict__ BufferColIn2 = Arg->BufferColIn2;
-        int ColFirst = Arg->ColFirst;
-	int A0 = Arg->Infos[AT_INF_A0];
-
-        unsigned int H_In2 = W_In1;
-        unsigned int H_Out = H_In1;
-        unsigned int Line, Col, i;
-        v4s * __restrict__ VBuff0 = (v4s *) BufferColIn2;
-        v4s * __restrict__ VBuff1 = (v4s *) (BufferColIn2+H_In2);
-        v4s * __restrict__ VBuff2 = (v4s *) (BufferColIn2+2*H_In2);
-        v4s * __restrict__ VBuff3 = (v4s *) (BufferColIn2+3*H_In2);
-
-        unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(H_In1), First = CoreId*ChunkCell, Last  = Min(H_In1, First+ChunkCell);
-        unsigned int C = ChunkSize(H_In2), F = CoreId*C, L  = Min(H_In2, F+C);
-        int OffLine = 0, OffCol = 0;
-
-        if (ColFirst) OffLine = OutFirstCol; else OffCol = OutFirstCol;
-        for (Col=0; Col<W_In2/4; Col++) {
-                for (i=F;i<L; i++) {
-			int X0 = In2[i*W_In2+4*Col+0];
-			int X1 = In2[i*W_In2+4*Col+1];
-			int X2 = In2[i*W_In2+4*Col+2];
-			int X3 = In2[i*W_In2+4*Col+3];
-			BufferColIn2[i+0*H_In2] = X0; // In2[i*W_In2+4*Col+0];
-			BufferColIn2[i+1*H_In2] = X1; // In2[i*W_In2+4*Col+1];
-			BufferColIn2[i+2*H_In2] = X2; // In2[i*W_In2+4*Col+2];
-			BufferColIn2[i+3*H_In2] = X3; // In2[i*W_In2+4*Col+3];
-		}
-                gap_waitbarrier(0);
-                for (Line=First; Line<Last; Line++) {
-                        v4s *VIn1 = (v4s *) (&In1[Line*W_In1 + 0]);
-                        int S0 = (Bias[Line]<<NormBias), S1=S0, S2=S0, S3=S0;
-                        for (i=0; i<(W_In1/4); i++) {
-				v4s V0 = VIn1[i];
-                                S0 = gap_sumdotp4(V0, VBuff0[i], S0);
-                                S1 = gap_sumdotp4(V0, VBuff1[i], S1);
-                                S2 = gap_sumdotp4(V0, VBuff2[i], S2);
-                                S3 = gap_sumdotp4(V0, VBuff3[i], S3);
-                        }
-                        for (i=(W_In1/4)*4; i<W_In1; i++) {
-				int V0 = In1[Line*W_In1 + i];
-				S0 += V0 * BufferColIn2[i];
-				S1 += V0 * BufferColIn2[i+1*H_In2];
-				S2 += V0 * BufferColIn2[i+2*H_In2];
-				S3 += V0 * BufferColIn2[i+3*H_In2];
-			}
-			unsigned int Sc = Scale[Line], ScN = ScaleN[Line];
-			v4s R = gap_pack4(gap_clip(AT_CLIP_POS(AT_SCALE(S0, Sc, ScN), A0), 7), gap_clip(AT_CLIP_POS(AT_SCALE(S1, Sc, ScN), A0), 7),
-					  gap_clip(AT_CLIP_POS(AT_SCALE(S2, Sc, ScN), A0), 7), gap_clip(AT_CLIP_POS(AT_SCALE(S3, Sc, ScN), A0), 7));
-			*((v4s *) (Out+(Line+OffLine)*W_Out+4*Col+0+OffCol)) = R;
-                }
-                gap_waitbarrier(0);
-        }
-	if (W_In2&0x2) {
-		Col = W_In2/2 - 1;
-                for (i=F;i<L; i++) {
-			int X0 = In2[i*W_In2+2*Col+0];
-			int X1 = In2[i*W_In2+2*Col+1];
-			BufferColIn2[i+0*H_In2] = X0; // In2[i*W_In2+2*Col+0];
-			BufferColIn2[i+1*H_In2] = X1; // In2[i*W_In2+2*Col+1];
-		}
-                gap_waitbarrier(0);
-                for (Line=First; Line<Last; Line++) {
-                        v4s *VIn1 = (v4s *) (&In1[Line*W_In1 + 0]);
-                        int S0 = (Bias[Line]<<NormBias), S1=S0;
-                        for (i=0; i<(W_In1/4); i++) {
-				v4s V0 = VIn1[i];
-                                S0 = gap_sumdotp4(V0, VBuff0[i], S0);
-                                S1 = gap_sumdotp4(V0, VBuff1[i], S1);
-                        }
-                        for (i=(W_In1/4)*4; i<W_In1; i++) {
-				int V0 = In1[Line*W_In1 + i];
-				S0 += V0 * BufferColIn2[i];
-				S1 += V0 * BufferColIn2[i+1*H_In2];
-			}
-			unsigned int Sc = Scale[Line], ScN = ScaleN[Line];
-			Out[(Line+OffLine)*W_Out+2*Col+0+OffCol] = gap_clip(AT_CLIP_POS(AT_SCALE(S0, Sc, ScN), A0), 7);
-			Out[(Line+OffLine)*W_Out+2*Col+1+OffCol] = gap_clip(AT_CLIP_POS(AT_SCALE(S1, Sc, ScN), A0), 7);
-                }
-                gap_waitbarrier(0);
-	}
-	if (W_In2&0x1) {
-		Col = W_In2-1;
-                for (i=F;i<L; i++) {
-			int X0 = In2[i*W_In2+1*Col+0];
-			BufferColIn2[i+0*H_In2] = X0; // In2[i*W_In2+4*Col+0];
-		}
-                gap_waitbarrier(0);
-                for (Line=First; Line<Last; Line++) {
-                        v4s *VIn1 = (v4s *) (&In1[Line*W_In1 + 0]);
-                        int S0 = (Bias[Line]<<NormBias);
-                        for (i=0; i<(W_In1/4); i++) {
-				v4s V0 = VIn1[i];
-                                S0 = gap_sumdotp4(V0, VBuff0[i], S0);
-                        }
-                        for (i=(W_In1/4)*4; i<W_In1; i++) {
-				int V0 = In1[Line*W_In1 + i];
-				S0 += V0 * BufferColIn2[i];
-			}
-			unsigned int Sc = Scale[Line], ScN = ScaleN[Line];
-			Out[(Line+OffLine)*W_Out+1*Col+0+OffCol] = gap_clip(AT_CLIP_POS(AT_SCALE(S0, Sc, ScN), A0), 7);
-                }
-                gap_waitbarrier(0);
-	}
-}
-
-void KerParMatMulSxSyB16_SQ8(KerMatMul_SQ8_T *Arg)
-
-{
-/*
-	In1 is usually the Conv1x1 filter set, e,g In1 is [OutFeat][InFeat]
-	In2 is  [InFeat][Width*Height]
-
-	When we receive tiles In2 and if StrideY is != 1 tile is always [OutFeat][K*(Width*Scy)]
-*/
-	signed char * __restrict__ In1 = Arg->In1;
-	unsigned int W_In1 = Arg->W_In1;
-	unsigned int H_In1 = Arg->H_In1;
-	signed char * __restrict__ In2 = Arg->In2;
-	unsigned int W_In2 = Arg->W_In2;
-	short int * __restrict__ Bias = Arg->Bias;
-	signed char * __restrict__ Out = Arg->Out;
-	unsigned int W_Out = Arg->W_Out;
-	int Pi = Arg->OutFirstCol;
-	signed char *BufferColIn2 = Arg->BufferColIn2;
-	unsigned int NormBias = Arg->NormBias;
-	int Wi = Arg->W, Hi = Arg->H;
-	int Sx = Arg->Sx, Sy = Arg->Sy;
-	int ColFirst = Arg->ColFirst;
-	unsigned char * __restrict__ Scale = Arg->Scale;
-	unsigned char * __restrict__ ScaleN = Arg->ScaleN;
-
-	unsigned int H_In2 = W_In1;
-	unsigned int H_Out = H_In1;
-
-	int Wo  = (Wi+Sx-1)/Sx, Ho = (Hi+Sy-1)/Sy;
-	int Oo, OffLine;
-	int At, F=0, L = W_In2;
+	int Wo  = (Wi+Sx-1)/Sx, Ho = (Hi+Sy-1)/Sy;
+	int Oo, OffLine;
+	int At, F=0, L = W_In2;
 
 	unsigned int Line, Col, i;
 	v4s *VBuff = (v4s *) BufferColIn2;
@@ -1629,7 +1509,7 @@ void KerParMatMulSxSyB16_SQ8(KerMatMul_SQ8_T *Arg)
 	       	gap_waitbarrier(0);
 	       	for (Line=First; Line<Last; Line++) {
 		       	v4s *VIn1 = (v4s *) (&In1[Line*W_In1 + 0]);
-		       	int S = (Bias[Line]<<NormBias);
+		       	int S = Bias?(Bias[Line]<<NormBias):0;
 		       	for (i=0; i<(W_In1/(4*2)); i++) {
 				S = gap_sumdotp4(VIn1[2*i], VBuff[2*i], S);
 				S = gap_sumdotp4(VIn1[2*i+1], VBuff[2*i+1], S);
@@ -1637,7 +1517,8 @@ void KerParMatMulSxSyB16_SQ8(KerMatMul_SQ8_T *Arg)
 			if (W_In1&0x4) S = gap_sumdotp4(VIn1[W_In1/4-1], VBuff[W_In1/4-1], S);
 		       	for (i=(W_In1/4)*4; i<W_In1; i++) S += In1[Line*W_In1 + i] * BufferColIn2[i];
 			unsigned int Sc = Scale[Line], ScN = ScaleN[Line];
-		       	Out[(Line+OffLine)*W_Out+Oo] = gap_clip(AT_SCALE(S, Sc, ScN), 7);
+			S = AT_SCALE(S, Sc, ScN); ACT_SWITCH(S, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+		       	Out[(Line+OffLine)*W_Out+Oo] = gap_clip(S, 7);
 	       	}
 		int nF = F+Sx;
 		if (nF<Wi) {
@@ -1650,3634 +1531,679 @@ void KerParMatMulSxSyB16_SQ8(KerMatMul_SQ8_T *Arg)
 	}
 }
 
-void KerParMatMulSxSyB16_ReLU_SQ8(KerMatMul_SQ8_T *Arg)
+void KerParMatMulSxSyB32_SQ8(KerMatMul_SQ8_T *Arg) {
+	KerParMatMulSxSyB32_SQ8_act(Arg, ACT_NONE);
+}
 
-{
-/*
-	In1 is usually the Conv1x1 filter set, e,g In1 is [OutFeat][InFeat]
-	In2 is  [InFeat][Width*Height]
+void KerParMatMulSxSyB32_ReLU_SQ8(KerMatMul_SQ8_T *Arg) {
+	KerParMatMulSxSyB32_SQ8_act(Arg, ACT_RELU);
+}
 
-	When we receive tiles In2 and if StrideY is != 1 tile is always [OutFeat][K*(Width*Scy)]
-*/
-	signed char * __restrict__ In1 = Arg->In1;
-	unsigned int W_In1 = Arg->W_In1;
-	unsigned int H_In1 = Arg->H_In1;
-	signed char * __restrict__ In2 = Arg->In2;
-	unsigned int W_In2 = Arg->W_In2;
-	short int * __restrict__ Bias = Arg->Bias;
-	signed char * __restrict__ Out = Arg->Out;
-	unsigned int W_Out = Arg->W_Out;
-	int Pi = Arg->OutFirstCol;
-	signed char *BufferColIn2 = Arg->BufferColIn2;
-	unsigned int NormBias = Arg->NormBias;
-	int Wi = Arg->W, Hi = Arg->H;
-	int Sx = Arg->Sx, Sy = Arg->Sy;
-	int ColFirst = Arg->ColFirst;
-	unsigned char * __restrict__ Scale = Arg->Scale;
-	unsigned char * __restrict__ ScaleN = Arg->ScaleN;
+void KerParMatMulSxSyB32_ReLUN_SQ8(KerMatMul_SQ8_T *Arg) {
+	KerParMatMulSxSyB32_SQ8_act(Arg, ACT_RELUN);
+}
 
-	unsigned int H_In2 = W_In1;
-	unsigned int H_Out = H_In1;
+void KerParMatMulSxSyB32_ReLUM_SQ8(KerMatMul_SQ8_T *Arg) {
+	KerParMatMulSxSyB32_SQ8_act(Arg, ACT_RELUM);
+}
 
-	int Wo  = (Wi+Sx-1)/Sx, Ho = (Hi+Sy-1)/Sy;
-	int Oo, OffLine;
-	int At, F=0, L = W_In2;
+void KerParMatMulSxSyB32_ReLUMN_SQ8(KerMatMul_SQ8_T *Arg) {
+	KerParMatMulSxSyB32_SQ8_act(Arg, ACT_RELUMN);
+}
 
-	unsigned int Line, Col, i;
-	v4s *VBuff = (v4s *) BufferColIn2;
+void KerParMatMulSxSyB32_LeakyReLU_SQ8(KerMatMul_SQ8_T *Arg) {
+	KerParMatMulSxSyB32_SQ8_act(Arg, ACT_LEAKYRELU);
+}
 
-	unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(H_In1), First = CoreId*ChunkCell, Last  = Min(H_In1, First+ChunkCell);
-	unsigned int Ci = ChunkSize(H_In2), Fi = CoreId*Ci, Li  = Min(H_In2, Fi+Ci);
+void KerParMatMulSxSyB32_HSwish_SQ8(KerMatMul_SQ8_T *Arg) {
+	KerParMatMulSxSyB32_SQ8_act(Arg, ACT_HSWISH);
+}
 
-	At = 0; OffLine = 0; Oo = 0;
-	if (ColFirst) OffLine = Pi; else Oo = Pi;
+void KerParMatMulSxSyB32_HSigmoid_SQ8(KerMatMul_SQ8_T *Arg) {
+	KerParMatMulSxSyB32_SQ8_act(Arg, ACT_HSIGMOID);
+}
 
-	while (L>0) {
-	       	for (i=Fi;i<Li; i++) BufferColIn2[i] = In2[i*W_In2+At];
-	       	gap_waitbarrier(0);
-	       	for (Line=First; Line<Last; Line++) {
-		       	v4s *VIn1 = (v4s *) (&In1[Line*W_In1 + 0]);
-		       	int S = (Bias[Line]<<NormBias);
-		       	for (i=0; i<(W_In1/(4*2)); i++) {
-				S = gap_sumdotp4(VIn1[2*i], VBuff[2*i], S);
-				S = gap_sumdotp4(VIn1[2*i+1], VBuff[2*i+1], S);
-			}
-			if (W_In1&0x4) S = gap_sumdotp4(VIn1[W_In1/4-1], VBuff[W_In1/4-1], S);
-		       	for (i=(W_In1/4)*4; i<W_In1; i++) S += In1[Line*W_In1 + i] * BufferColIn2[i];
-			unsigned int Sc = Scale[Line], ScN = ScaleN[Line];
-		       	Out[(Line+OffLine)*W_Out+Oo] = AT_CLIP_POS_IMM(AT_SCALE(S, Sc, ScN), 7);
-	       	}
-		int nF = F+Sx;
-		if (nF<Wi) {
-			F = nF; At += Sx; L -= Sx; Oo++;
-		} else {
-			int d = Wi-F+(Sy-1)*Wi;
-			F = 0; L -= d; At += d; Oo++;
-		}
-	       	gap_waitbarrier(0);
-	}
+void KerParMatMulSxSyB32_Sigmoid_SQ8(KerMatMul_SQ8_T *Arg) {
+	KerParMatMulSxSyB32_SQ8_act(Arg, ACT_SIGMOID);
 }
 
-void KerParMatMulSxSyB16_ReLUN_SQ8(KerMatMul_SQ8_T *Arg)
+void KerParMatMulSxSyB32_Tanh_SQ8(KerMatMul_SQ8_T *Arg) {
+	KerParMatMulSxSyB32_SQ8_act(Arg, ACT_TANH);
+}
 
-{
-/*
-	In1 is usually the Conv1x1 filter set, e,g In1 is [OutFeat][InFeat]
-	In2 is  [InFeat][Width*Height]
 
-	When we receive tiles In2 and if StrideY is != 1 tile is always [OutFeat][K*(Width*Scy)]
-*/
+/*************************************************************************************************************************************************
+	Matrix mult with channel centric scaling for small first matrix in the product, goal is to improve parallelism in this specific situation
+	Followed by optional activation; ReLU and ReLUN. Other activations are implemented using stand alone activation kernels.
+
+	Used to implement 1x1 convolution with unit stride
+   	In1 fits completely in shared L1, convolution weights
+	In2 has been transposed before being used, convolution Features
+	Parallelization scheme partition In2 along H_In2
+*************************************************************************************************************************************************/
+
+/* 	Byte Bias */
+void KerParMatMulB8_SF_SQ8_act(
+	KerMatMul_SQ8_T *Arg,
+	CNN_ActivationOper_T Activation)
+
+{
 	signed char * __restrict__ In1 = Arg->In1;
 	unsigned int W_In1 = Arg->W_In1;
 	unsigned int H_In1 = Arg->H_In1;
 	signed char * __restrict__ In2 = Arg->In2;
-	unsigned int W_In2 = Arg->W_In2;
-	short int * __restrict__ Bias = Arg->Bias;
+	unsigned int H_In2 = Arg->W_In2;
+	unsigned int W_In2 = W_In1;
+	signed char * __restrict__ Bias = Arg->Bias;
 	signed char * __restrict__ Out = Arg->Out;
-	unsigned int W_Out = Arg->W_Out;
-	int Pi = Arg->OutFirstCol;
-	signed char *BufferColIn2 = Arg->BufferColIn2;
-	unsigned int NormBias = Arg->NormBias;
-	int Wi = Arg->W, Hi = Arg->H;
-	int Sx = Arg->Sx, Sy = Arg->Sy;
-	int ColFirst = Arg->ColFirst;
 	unsigned char * __restrict__ Scale = Arg->Scale;
 	unsigned char * __restrict__ ScaleN = Arg->ScaleN;
-	int A0 = Arg->Infos[AT_INF_A0];
+	unsigned int NormBias = Arg->NormBias;
+        unsigned int CoreId = gap_coreid(), Chunk = ChunkSize(H_In2), First = Chunk*CoreId, Last = Min(First+Chunk, H_In2);
+        unsigned int Iter = (Last>First)?(Last-First):0;
+	unsigned char * Infos = (unsigned char *) Arg->Infos;
+	unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN];
+	int A0 = *((unsigned char *) &Infos[AT_INF_A0]); int B0 = *((unsigned char *) &Infos[AT_INF_B0]); int C0 = *((unsigned char *) &Infos[AT_INF_C0]);
 
-	unsigned int H_In2 = W_In1;
-	unsigned int H_Out = H_In1;
+	for (int i=0; i<Iter/4; i++) {
+		int l2 = 4*i+First;
+		v4s *pIn2_0 = (v4s *) (In2 + (l2+0)*W_In2), *pIn2_1 = (v4s *) (In2 + (l2+1)*W_In2), *pIn2_2 = (v4s *) (In2 + (l2+2)*W_In2), *pIn2_3 = (v4s *) (In2 + (l2+3)*W_In2);
+		for (int l1=0; l1<H_In1; l1++) {
+			v4s *pIn1 = (v4s *) (In1 + l1*W_In1);
+			int S0 = Bias[l1]<<NormBias, S1=S0, S2=S0, S3=S0;
+			for (int c=0; c<W_In1/4; c++) {
+				v4s C0 = pIn1[c], V0 = pIn2_0[c], V1 = pIn2_1[c], V2 = pIn2_2[c], V3 = pIn2_3[c];
+				S0 = gap_sumdotp4(C0, V0, S0); S1 = gap_sumdotp4(C0, V1, S1); S2 = gap_sumdotp4(C0, V2, S2); S3 = gap_sumdotp4(C0, V3, S3);
+			}
+			for (int c=(W_In1/4)*4; c<W_In1; c++) {
+				int C0 = In1[l1*W_In1+c];
+				S0 += C0 * In2[(l2+0)*W_In2+c]; S1 += C0 * In2[(l2+1)*W_In2+c]; S2 += C0 * In2[(l2+2)*W_In2+c]; S3 += C0 * In2[(l2+3)*W_In2+c];
+			}
+			unsigned int Sc = Scale[l1], ScN = ScaleN[l1];
+			S0 = AT_SCALE(S0, Sc,  ScN);  ACT_SWITCH(S0, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+			S1 = AT_SCALE(S1, Sc,  ScN);  ACT_SWITCH(S1, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+			S2 = AT_SCALE(S2, Sc,  ScN);  ACT_SWITCH(S2, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+			S3 = AT_SCALE(S3, Sc,  ScN);  ACT_SWITCH(S3, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+			v4s R = gap_pack4(gap_clip(S0, 7), gap_clip(S1, 7), gap_clip(S2, 7), gap_clip(S3, 7));
+			*((v4s *) (Out+l1*H_In2 + l2)) = R;
+		}
+	}
+	if (Iter&0x2) {
+		int l2 = (4*(Iter/4)) + First;
+		v4s *pIn2_0 = (v4s *) (In2 + (l2+0)*W_In2);
+		v4s *pIn2_1 = (v4s *) (In2 + (l2+1)*W_In2);
+		for (int l1=0; l1<H_In1; l1++) {
+			v4s *pIn1 = (v4s *) (In1 + l1*W_In1);
+			int S0 = Bias[l1]<<NormBias, S1=S0;
+			for (int c=0; c<W_In1/4; c++) {
+				v4s C0 = pIn1[c], V0 = pIn2_0[c], V1 = pIn2_1[c];
+				S0 = gap_sumdotp4(C0, V0, S0); S1 = gap_sumdotp4(C0, V1, S1);
+			}
+			for (int c=(W_In1/4)*4; c<W_In1; c++) {
+				int C0 = In1[l1*W_In1+c];
+				S0 += C0 * In2[(l2+0)*W_In2+c]; S1 += C0 * In2[(l2+1)*W_In2+c];
+			}
+			unsigned int Sc = Scale[l1], ScN = ScaleN[l1];
+			S0 = AT_SCALE(S0, Sc,  ScN);  ACT_SWITCH(S0, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+			S1 = AT_SCALE(S1, Sc,  ScN);  ACT_SWITCH(S1, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+			Out[l1*H_In2 + l2+0] = gap_clip(S0, 7);
+			Out[l1*H_In2 + l2+1] = gap_clip(S1, 7);
+		}
+	}
+	if (Iter&0x1) {
+		int l2 = Last-1;
+		v4s *pIn2 = (v4s *) (In2 + (l2+0)*W_In2);
+		for (int l1=0; l1<H_In1; l1++) {
+			v4s *pIn1 = (v4s *) (In1 + l1*W_In1);
+			int S0 = Bias[l1]<<NormBias;
+			for (int c=0; c<W_In1/(4*2); c++) {
+				v4s C0 = pIn1[2*c], C1 = pIn1[2*c+1], V0 = pIn2[2*c], V1 = pIn2[2*c+1];
+				S0 = gap_sumdotp4(C0, V0, S0); S0 = gap_sumdotp4(C1, V1, S0);
+			}
+			if (W_In1&0x4) S0 = gap_sumdotp4(pIn1[W_In1/4-1], pIn2[W_In1/4-1], S0);
+			for (int c=(W_In1/4)*4; c<W_In1; c++) {
+				int C0 = In1[l1*W_In1+c];
+				S0 += C0 * In2[(l2+0)*W_In2+c];
+			}
+			unsigned int Sc = Scale[l1], ScN = ScaleN[l1];
+			S0 = AT_SCALE(S0, Sc,  ScN);  ACT_SWITCH(S0, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+			Out[l1*H_In2 + l2+0] = gap_clip(S0, 7);
+		}
+	}
+	gap_waitbarrier(0);
+}
 
-	int Wo  = (Wi+Sx-1)/Sx, Ho = (Hi+Sy-1)/Sy;
-	int Oo, OffLine;
-	int At, F=0, L = W_In2;
+void KerParMatMulB8_SF_SQ8(KerMatMul_SQ8_T *Arg) {	/* Forwards Arg to KerParMatMulB8_SF_SQ8_act with Activation fixed to ACT_NONE */
+	KerParMatMulB8_SF_SQ8_act(Arg, ACT_NONE);
+}
 
-	unsigned int Line, Col, i;
-	v4s *VBuff = (v4s *) BufferColIn2;
+void KerParMatMulB8_ReLU_SF_SQ8(KerMatMul_SQ8_T *Arg) {
+	KerParMatMulB8_SF_SQ8_act(Arg, ACT_RELU);
+}
 
-	unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(H_In1), First = CoreId*ChunkCell, Last  = Min(H_In1, First+ChunkCell);
-	unsigned int Ci = ChunkSize(H_In2), Fi = CoreId*Ci, Li  = Min(H_In2, Fi+Ci);
+void KerParMatMulB8_ReLUN_SF_SQ8(KerMatMul_SQ8_T *Arg) {	/* Forwards Arg to KerParMatMulB8_SF_SQ8_act with Activation fixed to ACT_RELUN */
+	KerParMatMulB8_SF_SQ8_act(Arg, ACT_RELUN);
+}
 
-	At = 0; OffLine = 0; Oo = 0;
-	if (ColFirst) OffLine = Pi; else Oo = Pi;
+void KerParMatMulB8_ReLUM_SF_SQ8(KerMatMul_SQ8_T *Arg) {	/* Forwards Arg to KerParMatMulB8_SF_SQ8_act with Activation fixed to ACT_RELUM */
+	KerParMatMulB8_SF_SQ8_act(Arg, ACT_RELUM);
+}
 
-	while (L>0) {
-	       	for (i=Fi;i<Li; i++) BufferColIn2[i] = In2[i*W_In2+At];
-	       	gap_waitbarrier(0);
-	       	for (Line=First; Line<Last; Line++) {
-		       	v4s *VIn1 = (v4s *) (&In1[Line*W_In1 + 0]);
-		       	int S = (Bias[Line]<<NormBias);
-		       	for (i=0; i<(W_In1/(4*2)); i++) {
-				S = gap_sumdotp4(VIn1[2*i], VBuff[2*i], S);
-				S = gap_sumdotp4(VIn1[2*i+1], VBuff[2*i+1], S);
-			}
-			if (W_In1&0x4) S = gap_sumdotp4(VIn1[W_In1/4-1], VBuff[W_In1/4-1], S);
-		       	for (i=(W_In1/4)*4; i<W_In1; i++) S += In1[Line*W_In1 + i] * BufferColIn2[i];
-			unsigned int Sc = Scale[Line], ScN = ScaleN[Line];
-		       	Out[(Line+OffLine)*W_Out+Oo] = gap_clip(AT_CLIP_POS(AT_SCALE(S, Sc, ScN), A0), 7);
-	       	}
-		int nF = F+Sx;
-		if (nF<Wi) {
-			F = nF; At += Sx; L -= Sx; Oo++;
-		} else {
-			int d = Wi-F+(Sy-1)*Wi;
-			F = 0; L -= d; At += d; Oo++;
-		}
-	       	gap_waitbarrier(0);
-	}
+void KerParMatMulB8_ReLUMN_SF_SQ8(KerMatMul_SQ8_T *Arg) {	/* Forwards Arg to KerParMatMulB8_SF_SQ8_act with Activation fixed to ACT_RELUMN */
+	KerParMatMulB8_SF_SQ8_act(Arg, ACT_RELUMN);
 }
 
-/* 	Word Bias */
-void KerParMatMulB32_SQ8(KerMatMul_SQ8_T *Arg)
+void KerParMatMulB8_LeakyReLU_SF_SQ8(KerMatMul_SQ8_T *Arg) {	/* Forwards Arg to KerParMatMulB8_SF_SQ8_act with Activation fixed to ACT_LEAKYRELU */
+	KerParMatMulB8_SF_SQ8_act(Arg, ACT_LEAKYRELU);
+}
+
+void KerParMatMulB8_HSwish_SF_SQ8(KerMatMul_SQ8_T *Arg) {	/* Forwards Arg to KerParMatMulB8_SF_SQ8_act with Activation fixed to ACT_HSWISH */
+	KerParMatMulB8_SF_SQ8_act(Arg, ACT_HSWISH);
+}
+
+void KerParMatMulB8_HSigmoid_SF_SQ8(KerMatMul_SQ8_T *Arg) {	/* Forwards Arg to KerParMatMulB8_SF_SQ8_act with Activation fixed to ACT_HSIGMOID */
+	KerParMatMulB8_SF_SQ8_act(Arg, ACT_HSIGMOID);
+}
+
+void KerParMatMulB8_Sigmoid_SF_SQ8(KerMatMul_SQ8_T *Arg) {	/* Forwards Arg to KerParMatMulB8_SF_SQ8_act with Activation fixed to ACT_SIGMOID */
+	KerParMatMulB8_SF_SQ8_act(Arg, ACT_SIGMOID);
+}
+
+void KerParMatMulB8_Tanh_SF_SQ8(KerMatMul_SQ8_T *Arg) {	/* Forwards Arg to KerParMatMulB8_SF_SQ8_act with Activation fixed to ACT_TANH */
+	KerParMatMulB8_SF_SQ8_act(Arg, ACT_TANH);
+}
+
+
+
+
+/* 	Half Word Bias (Bias is read as short int). For each In2 row l2 owned by the calling core and each In1 row l1: accumulate (Bias[l1]<<NormBias) plus the sum over c of In1[l1*W_In1+c]*In2[l2*W_In2+c], apply AT_SCALE with Scale[l1]/ScaleN[l1], then ACT_SWITCH, then store gap_clip(.,7) into Out[l1*H_In2+l2] */
+static inline void __attribute__((always_inline)) KerParMatMulB16_SF_SQ8_act(
+	KerMatMul_SQ8_T *Arg,	/* argument block: In1, W_In1, H_In1, In2, W_In2, Bias, Out, Scale, ScaleN, NormBias and Infos are read from it */
+	CNN_ActivationOper_T Activation)	/* activation selector handed to ACT_SWITCH */
 
 {
-	/*
-	 	Column buffer has to be sized in order to be able to accomodate up to 4 columns of size H_In2
-	*/
-        signed char * __restrict__ In1 = Arg->In1;
-        unsigned int W_In1 = Arg->W_In1;
-        unsigned int H_In1 = Arg->H_In1;
-        signed char * __restrict__ In2 = Arg->In2;
-        unsigned int W_In2 = Arg->W_In2;
-        int * __restrict__ Bias = Arg->Bias;
-        signed char * __restrict__ Out = Arg->Out;
-        unsigned int W_Out = Arg->W_Out;
+	signed char * __restrict__ In1 = Arg->In1;
+	unsigned int W_In1 = Arg->W_In1;
+	unsigned int H_In1 = Arg->H_In1;
+	signed char * __restrict__ In2 = Arg->In2;
+	unsigned int H_In2 = Arg->W_In2;	/* the number of In2 rows walked below comes from Arg->W_In2 */
+	unsigned int W_In2 = W_In1;	/* In2 rows are addressed with the In1 row length; NOTE(review): presumably In2 is supplied already transposed - confirm with the generator */
+	short int * __restrict__ Bias = Arg->Bias;	/* indexed without a null check in this variant */
+	signed char * __restrict__ Out = Arg->Out;
 	unsigned char * __restrict__ Scale = Arg->Scale;
 	unsigned char * __restrict__ ScaleN = Arg->ScaleN;
 	unsigned int NormBias = Arg->NormBias;
-        unsigned int OutFirstCol = Arg->OutFirstCol;
-        signed char * __restrict__ BufferColIn2 = Arg->BufferColIn2;
-        int ColFirst = Arg->ColFirst;
-
-        unsigned int H_In2 = W_In1;
-        unsigned int H_Out = H_In1;
-        unsigned int Line, Col, i;
-        v4s * __restrict__ VBuff0 = (v4s *) BufferColIn2;
-        v4s * __restrict__ VBuff1 = (v4s *) (BufferColIn2+H_In2);
-        v4s * __restrict__ VBuff2 = (v4s *) (BufferColIn2+2*H_In2);
-        v4s * __restrict__ VBuff3 = (v4s *) (BufferColIn2+3*H_In2);
-
-        unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(H_In1), First = CoreId*ChunkCell, Last  = Min(H_In1, First+ChunkCell);
-        unsigned int C = ChunkSize(H_In2), F = CoreId*C, L  = Min(H_In2, F+C);
-        int OffLine = 0, OffCol = 0;
+        unsigned int CoreId = gap_coreid(), Chunk = ChunkSize(H_In2), First = Chunk*CoreId, Last = Min(First+Chunk, H_In2);	/* this core handles In2 rows First..Last-1 */
+        unsigned int Iter = (Last>First)?(Last-First):0;	/* number of In2 rows for this core, 0 when its chunk is empty */
+	unsigned char * Infos = (unsigned char *) Arg->Infos;
+	unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN];
+	int A0 = *((unsigned char *) &Infos[AT_INF_A0]); int B0 = *((unsigned char *) &Infos[AT_INF_B0]); int C0 = *((unsigned char *) &Infos[AT_INF_C0]);	/* activation parameters, each read as one unsigned byte */
 
-        if (ColFirst) OffLine = OutFirstCol; else OffCol = OutFirstCol;
-        for (Col=0; Col<W_In2/4; Col++) {
-                for (i=F;i<L; i++) {
-			int X0 = In2[i*W_In2+4*Col+0];
-			int X1 = In2[i*W_In2+4*Col+1];
-			int X2 = In2[i*W_In2+4*Col+2];
-			int X3 = In2[i*W_In2+4*Col+3];
-			BufferColIn2[i+0*H_In2] = X0; // In2[i*W_In2+4*Col+0];
-			BufferColIn2[i+1*H_In2] = X1; // In2[i*W_In2+4*Col+1];
-			BufferColIn2[i+2*H_In2] = X2; // In2[i*W_In2+4*Col+2];
-			BufferColIn2[i+3*H_In2] = X3; // In2[i*W_In2+4*Col+3];
+	for (int i=0; i<Iter/4; i++) {	/* main loop: 4 In2 rows, hence 4 outputs per In1 row, per iteration */
+		int l2 = 4*i+First;
+		v4s *pIn2_0 = (v4s *) (In2 + (l2+0)*W_In2), *pIn2_1 = (v4s *) (In2 + (l2+1)*W_In2), *pIn2_2 = (v4s *) (In2 + (l2+2)*W_In2), *pIn2_3 = (v4s *) (In2 + (l2+3)*W_In2);
+		for (int l1=0; l1<H_In1; l1++) {
+			v4s *pIn1 = (v4s *) (In1 + l1*W_In1);
+			int S0 = Bias[l1]<<NormBias, S1=S0, S2=S0, S3=S0;	/* the 4 accumulators start from the same shifted bias */
+			for (int c=0; c<W_In1/4; c++) {	/* vector part, one v4s word per step; gap_sumdotp4 presumably adds a 4 lane dot product to the accumulator - confirm */
+				v4s C0 = pIn1[c], V0 = pIn2_0[c], V1 = pIn2_1[c], V2 = pIn2_2[c], V3 = pIn2_3[c];	/* this C0 is local to the loop body and hides the activation parameter C0 only here */
+				S0 = gap_sumdotp4(C0, V0, S0); S1 = gap_sumdotp4(C0, V1, S1); S2 = gap_sumdotp4(C0, V2, S2); S3 = gap_sumdotp4(C0, V3, S3);
+			}
+			for (int c=(W_In1/4)*4; c<W_In1; c++) {	/* scalar tail for the W_In1%4 elements left over */
+				int C0 = In1[l1*W_In1+c];
+				S0 += C0 * In2[(l2+0)*W_In2+c]; S1 += C0 * In2[(l2+1)*W_In2+c]; S2 += C0 * In2[(l2+2)*W_In2+c]; S3 += C0 * In2[(l2+3)*W_In2+c];
+			}
+			unsigned int Sc = Scale[l1], ScN = ScaleN[l1];	/* scaling factors are indexed by In1 row */
+			S0 = AT_SCALE(S0, Sc,  ScN);  ACT_SWITCH(S0, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);	/* C0 here is again the int read from Infos */
+			S1 = AT_SCALE(S1, Sc,  ScN);  ACT_SWITCH(S1, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+			S2 = AT_SCALE(S2, Sc,  ScN);  ACT_SWITCH(S2, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+			S3 = AT_SCALE(S3, Sc,  ScN);  ACT_SWITCH(S3, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+			v4s R = gap_pack4(gap_clip(S0, 7), gap_clip(S1, 7), gap_clip(S2, 7), gap_clip(S3, 7));
+			*((v4s *) (Out+l1*H_In2 + l2)) = R;	/* results for In2 rows l2..l2+3 are written with a single v4s store */
 		}
-                gap_waitbarrier(0);
-                for (Line=First; Line<Last; Line++) {
-                        v4s *VIn1 = (v4s *) (&In1[Line*W_In1 + 0]);
-                        int S0 = Bias?(Bias[Line]<<NormBias):0, S1=S0, S2=S0, S3=S0;
-                        for (i=0; i<(W_In1/4); i++) {
-				v4s V0 = VIn1[i];
-                                S0 = gap_sumdotp4(V0, VBuff0[i], S0);
-                                S1 = gap_sumdotp4(V0, VBuff1[i], S1);
-                                S2 = gap_sumdotp4(V0, VBuff2[i], S2);
-                                S3 = gap_sumdotp4(V0, VBuff3[i], S3);
-                        }
-                        for (i=(W_In1/4)*4; i<W_In1; i++) {
-				int V0 = In1[Line*W_In1 + i];
-				S0 += V0 * BufferColIn2[i];
-				S1 += V0 * BufferColIn2[i+1*H_In2];
-				S2 += V0 * BufferColIn2[i+2*H_In2];
-				S3 += V0 * BufferColIn2[i+3*H_In2];
+	}
+	if (Iter&0x2) {	/* two In2 rows left after the groups of 4 */
+		int l2 = (4*(Iter/4)) + First;	/* first In2 row not covered by the 4 row loop */
+		v4s *pIn2_0 = (v4s *) (In2 + (l2+0)*W_In2);
+		v4s *pIn2_1 = (v4s *) (In2 + (l2+1)*W_In2);
+		for (int l1=0; l1<H_In1; l1++) {
+			v4s *pIn1 = (v4s *) (In1 + l1*W_In1);
+			int S0 = Bias[l1]<<NormBias, S1=S0;
+			for (int c=0; c<W_In1/4; c++) {
+				v4s C0 = pIn1[c], V0 = pIn2_0[c], V1 = pIn2_1[c];
+				S0 = gap_sumdotp4(C0, V0, S0); S1 = gap_sumdotp4(C0, V1, S1);
 			}
-			unsigned int Sc = Scale[Line], ScN = ScaleN[Line];
-			v4s R = gap_pack4(gap_clip(AT_SCALE(S0, Sc, ScN), 7), gap_clip(AT_SCALE(S1, Sc, ScN), 7), gap_clip(AT_SCALE(S2, Sc, ScN), 7), gap_clip(AT_SCALE(S3, Sc, ScN), 7));
-			*((v4s *) (Out+(Line+OffLine)*W_Out+4*Col+0+OffCol)) = R;
-                }
-                gap_waitbarrier(0);
-        }
-	if (W_In2&0x2) {
-		Col = W_In2/2 - 1;
-                for (i=F;i<L; i++) {
-			int X0 = In2[i*W_In2+2*Col+0];
-			int X1 = In2[i*W_In2+2*Col+1];
-			BufferColIn2[i+0*H_In2] = X0; // In2[i*W_In2+2*Col+0];
-			BufferColIn2[i+1*H_In2] = X1; // In2[i*W_In2+2*Col+1];
-		}
-                gap_waitbarrier(0);
-                for (Line=First; Line<Last; Line++) {
-                        v4s *VIn1 = (v4s *) (&In1[Line*W_In1 + 0]);
-                        int S0 = Bias?(Bias[Line]<<NormBias):0, S1=S0;
-                        for (i=0; i<(W_In1/4); i++) {
-				v4s V0 = VIn1[i];
-                                S0 = gap_sumdotp4(V0, VBuff0[i], S0);
-                                S1 = gap_sumdotp4(V0, VBuff1[i], S1);
-                        }
-                        for (i=(W_In1/4)*4; i<W_In1; i++) {
-				int V0 = In1[Line*W_In1 + i];
-				S0 += V0 * BufferColIn2[i];
-				S1 += V0 * BufferColIn2[i+1*H_In2];
+			for (int c=(W_In1/4)*4; c<W_In1; c++) {
+				int C0 = In1[l1*W_In1+c];
+				S0 += C0 * In2[(l2+0)*W_In2+c]; S1 += C0 * In2[(l2+1)*W_In2+c];
 			}
-			unsigned int Sc = Scale[Line], ScN = ScaleN[Line];
-			Out[(Line+OffLine)*W_Out+2*Col+0+OffCol] = gap_clip(AT_SCALE(S0, Sc, ScN), 7);
-			Out[(Line+OffLine)*W_Out+2*Col+1+OffCol] = gap_clip(AT_SCALE(S1, Sc, ScN), 7);
-                }
-                gap_waitbarrier(0);
-	}
-	if (W_In2&0x1) {
-		Col = W_In2-1;
-                for (i=F;i<L; i++) {
-			int X0 = In2[i*W_In2+1*Col+0];
-			BufferColIn2[i+0*H_In2] = X0; // In2[i*W_In2+4*Col+0];
+			unsigned int Sc = Scale[l1], ScN = ScaleN[l1];
+			S0 = AT_SCALE(S0, Sc,  ScN);  ACT_SWITCH(S0, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+			S1 = AT_SCALE(S1, Sc,  ScN);  ACT_SWITCH(S1, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+			Out[l1*H_In2 + l2+0] = gap_clip(S0, 7);
+			Out[l1*H_In2 + l2+1] = gap_clip(S1, 7);
 		}
-                gap_waitbarrier(0);
-                for (Line=First; Line<Last; Line++) {
-                        v4s *VIn1 = (v4s *) (&In1[Line*W_In1 + 0]);
-                        int S0 = Bias?(Bias[Line]<<NormBias):0;
-                        for (i=0; i<(W_In1/4); i++) {
-				v4s V0 = VIn1[i];
-                                S0 = gap_sumdotp4(V0, VBuff0[i], S0);
-                        }
-                        for (i=(W_In1/4)*4; i<W_In1; i++) {
-				int V0 = In1[Line*W_In1 + i];
-				S0 += V0 * BufferColIn2[i];
+	}
+	if (Iter&0x1) {	/* one last In2 row */
+		int l2 = Last-1;
+		v4s *pIn2 = (v4s *) (In2 + (l2+0)*W_In2);
+		for (int l1=0; l1<H_In1; l1++) {
+			v4s *pIn1 = (v4s *) (In1 + l1*W_In1);
+			int S0 = Bias[l1]<<NormBias;
+			for (int c=0; c<W_In1/(4*2); c++) {	/* two v4s words per step */
+				v4s C0 = pIn1[2*c], C1 = pIn1[2*c+1], V0 = pIn2[2*c], V1 = pIn2[2*c+1];
+				S0 = gap_sumdotp4(C0, V0, S0); S0 = gap_sumdotp4(C1, V1, S0);
 			}
-			unsigned int Sc = Scale[Line], ScN = ScaleN[Line];
-			Out[(Line+OffLine)*W_Out+1*Col+0+OffCol] = gap_clip(AT_SCALE(S0, Sc, ScN), 7);
-                }
-                gap_waitbarrier(0);
+			if (W_In1&0x4) S0 = gap_sumdotp4(pIn1[W_In1/4-1], pIn2[W_In1/4-1], S0);	/* odd count of v4s words: the pair loop left the last one out */
+			for (int c=(W_In1/4)*4; c<W_In1; c++) {
+				int C0 = In1[l1*W_In1+c];
+				S0 += C0 * In2[(l2+0)*W_In2+c];
+			}
+			unsigned int Sc = Scale[l1], ScN = ScaleN[l1];
+			S0 = AT_SCALE(S0, Sc,  ScN);  ACT_SWITCH(S0, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+			Out[l1*H_In2 + l2+0] = gap_clip(S0, 7);
+		}
 	}
+	gap_waitbarrier(0);	/* the only barrier of this kernel, reached after all rows of this core are written */
+}
+
+void KerParMatMulB16_SF_SQ8(KerMatMul_SQ8_T *Arg) {	/* Forwards Arg to KerParMatMulB16_SF_SQ8_act (short int bias) with Activation fixed to ACT_NONE */
+	KerParMatMulB16_SF_SQ8_act(Arg, ACT_NONE);
+}
+
+void KerParMatMulB16_ReLU_SF_SQ8(KerMatMul_SQ8_T *Arg) {	/* Forwards Arg to KerParMatMulB16_SF_SQ8_act (short int bias) with Activation fixed to ACT_RELU */
+	KerParMatMulB16_SF_SQ8_act(Arg, ACT_RELU);
+}
+
+void KerParMatMulB16_ReLUN_SF_SQ8(KerMatMul_SQ8_T *Arg) {	/* Forwards Arg to KerParMatMulB16_SF_SQ8_act (short int bias) with Activation fixed to ACT_RELUN */
+	KerParMatMulB16_SF_SQ8_act(Arg, ACT_RELUN);
+}
+
+void KerParMatMulB16_ReLUM_SF_SQ8(KerMatMul_SQ8_T *Arg) {	/* Forwards Arg to KerParMatMulB16_SF_SQ8_act (short int bias) with Activation fixed to ACT_RELUM */
+	KerParMatMulB16_SF_SQ8_act(Arg, ACT_RELUM);
+}
+
+void KerParMatMulB16_ReLUMN_SF_SQ8(KerMatMul_SQ8_T *Arg) {	/* Forwards Arg to KerParMatMulB16_SF_SQ8_act (short int bias) with Activation fixed to ACT_RELUMN */
+	KerParMatMulB16_SF_SQ8_act(Arg, ACT_RELUMN);
+}
+
+void KerParMatMulB16_LeakyReLU_SF_SQ8(KerMatMul_SQ8_T *Arg) {	/* Forwards Arg to KerParMatMulB16_SF_SQ8_act (short int bias) with Activation fixed to ACT_LEAKYRELU */
+	KerParMatMulB16_SF_SQ8_act(Arg, ACT_LEAKYRELU);
+}
+
+void KerParMatMulB16_HSwish_SF_SQ8(KerMatMul_SQ8_T *Arg) {	/* Forwards Arg to KerParMatMulB16_SF_SQ8_act (short int bias) with Activation fixed to ACT_HSWISH */
+	KerParMatMulB16_SF_SQ8_act(Arg, ACT_HSWISH);
+}
+
+void KerParMatMulB16_HSigmoid_SF_SQ8(KerMatMul_SQ8_T *Arg) {	/* Forwards Arg to KerParMatMulB16_SF_SQ8_act (short int bias) with Activation fixed to ACT_HSIGMOID */
+	KerParMatMulB16_SF_SQ8_act(Arg, ACT_HSIGMOID);
+}
+
+void KerParMatMulB16_Sigmoid_SF_SQ8(KerMatMul_SQ8_T *Arg) {	/* Forwards Arg to KerParMatMulB16_SF_SQ8_act (short int bias) with Activation fixed to ACT_SIGMOID */
+	KerParMatMulB16_SF_SQ8_act(Arg, ACT_SIGMOID);
+}
+
+void KerParMatMulB16_Tanh_SF_SQ8(KerMatMul_SQ8_T *Arg) {	/* Forwards Arg to KerParMatMulB16_SF_SQ8_act (short int bias) with Activation fixed to ACT_TANH */
+	KerParMatMulB16_SF_SQ8_act(Arg, ACT_TANH);
 }
 
-void KerParMatMulB32_2x4_SQ8(KerMatMul_SQ8_T *Arg)
+
+
+/* 	Word Bias (Bias is read as int and may be null, in which case accumulation starts at 0). For each In2 row l2 owned by the calling core and each In1 row l1: accumulate the shifted bias plus the sum over c of In1[l1*W_In1+c]*In2[l2*W_In2+c], apply AT_SCALE with Scale[l1]/ScaleN[l1], then ACT_SWITCH, then store gap_clip(.,7) into Out[l1*H_In2+l2] */
+static inline void __attribute__((always_inline)) KerParMatMulB32_SF_SQ8_act(
+	KerMatMul_SQ8_T *Arg,	/* argument block: In1, W_In1, H_In1, In2, W_In2, Bias, Out, Scale, ScaleN, NormBias and Infos are read from it */
+	CNN_ActivationOper_T Activation)	/* activation selector handed to ACT_SWITCH */
 
 {
-	/*
-	 	Column buffer has to be sized in order to be able to accomodate up to 4 columns of size H_In2
-	*/
-        signed char * __restrict__ In1 = Arg->In1;
-        unsigned int W_In1 = Arg->W_In1;
-        unsigned int H_In1 = Arg->H_In1;
-        signed char * __restrict__ In2 = Arg->In2;
-        unsigned int W_In2 = Arg->W_In2;
-        int * __restrict__ Bias = Arg->Bias;
-        signed char * __restrict__ Out = Arg->Out;
-        unsigned int W_Out = Arg->W_Out;
+	signed char * __restrict__ In1 = Arg->In1;
+	unsigned int W_In1 = Arg->W_In1;
+	unsigned int H_In1 = Arg->H_In1;
+	signed char * __restrict__ In2 = Arg->In2;
+	unsigned int H_In2 = Arg->W_In2;	/* the number of In2 rows walked below comes from Arg->W_In2 */
+	unsigned int W_In2 = W_In1;	/* In2 rows are addressed with the In1 row length; NOTE(review): presumably In2 is supplied already transposed - confirm with the generator */
+	int * __restrict__ Bias = Arg->Bias;	/* tested against null before every use below */
+	signed char * __restrict__ Out = Arg->Out;
 	unsigned char * __restrict__ Scale = Arg->Scale;
 	unsigned char * __restrict__ ScaleN = Arg->ScaleN;
 	unsigned int NormBias = Arg->NormBias;
-        unsigned int OutFirstCol = Arg->OutFirstCol;
-        signed char * __restrict__ BufferColIn2 = Arg->BufferColIn2;
-        int ColFirst = Arg->ColFirst;
-
-        unsigned int H_In2 = W_In1;
-        unsigned int H_Out = H_In1;
-        unsigned int Line, Col, i;
-        v4s * __restrict__ VBuff0 = (v4s *) BufferColIn2;
-        v4s * __restrict__ VBuff1 = (v4s *) (BufferColIn2+H_In2);
-        v4s * __restrict__ VBuff2 = (v4s *) (BufferColIn2+2*H_In2);
-        v4s * __restrict__ VBuff3 = (v4s *) (BufferColIn2+3*H_In2);
-
-        unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(H_In1), First = CoreId*ChunkCell, Last  = Min(H_In1, First+ChunkCell);
-        unsigned int Iter = (Last>First)?(Last-First):0;
-        unsigned int C = ChunkSize(H_In2), F = CoreId*C, L  = Min(H_In2, F+C);
-        int OffLine = 0, OffCol = 0;
+        unsigned int CoreId = gap_coreid(), Chunk = ChunkSize(H_In2), First = Chunk*CoreId, Last = Min(First+Chunk, H_In2);	/* this core handles In2 rows First..Last-1 */
+	unsigned int Iter = (Last>First)?(Last-First):0;	/* number of In2 rows for this core, 0 when its chunk is empty */
+	unsigned char * Infos = (unsigned char *) Arg->Infos;
+	unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN];
+	int A0 = *((unsigned char *) &Infos[AT_INF_A0]); int B0 = *((unsigned char *) &Infos[AT_INF_B0]); int C0 = *((unsigned char *) &Infos[AT_INF_C0]);	/* activation parameters, each read as one unsigned byte */
 
-        if (ColFirst) OffLine = OutFirstCol; else OffCol = OutFirstCol;
-        for (Col=0; Col<W_In2/4; Col++) {
-                for (i=F;i<L; i++) {
-			int X0 = In2[i*W_In2+4*Col+0];
-			int X1 = In2[i*W_In2+4*Col+1];
-			int X2 = In2[i*W_In2+4*Col+2];
-			int X3 = In2[i*W_In2+4*Col+3];
-			BufferColIn2[i+0*H_In2] = X0; // In2[i*W_In2+4*Col+0];
-			BufferColIn2[i+1*H_In2] = X1; // In2[i*W_In2+4*Col+1];
-			BufferColIn2[i+2*H_In2] = X2; // In2[i*W_In2+4*Col+2];
-			BufferColIn2[i+3*H_In2] = X3; // In2[i*W_In2+4*Col+3];
-		}
-                gap_waitbarrier(0);
-                for (Line=0; Line<Iter/2; Line++) {
-                	int l1 = 2*Line + First;
-                        v4s *VIn1 = (v4s *) (&In1[(l1)*W_In1 + 0]);
-                        v4s *VIn2 = (v4s *) (&In1[(l1+1)*W_In1 + 0]);
-                        int S0=0, S1=0, S2=0, S3=0, S4=0, S5=0, S6=0, S7=0;
-                        if (Bias) {
-	                        S0 = (Bias[l1]<<NormBias), S1=S0, S2=S0, S3=S0;
-	                        S4 = (Bias[l1+1]<<NormBias), S5=S4, S6=S4, S7=S4;
-	                }
-                        for (i=0; i<(W_In1/4); i++) {
-				v4s V0 = VIn1[i];
-				v4s A = VBuff0[i];
-				v4s B = VBuff1[i];
-				v4s C = VBuff2[i];
-				v4s D = VBuff3[i];
-                                S0 = gap_sumdotp4(V0, A, S0);
-                                S1 = gap_sumdotp4(V0, B, S1);
-                                S2 = gap_sumdotp4(V0, C, S2);
-                                S3 = gap_sumdotp4(V0, D, S3);
-				v4s V1 = VIn2[i];
-                                S4 = gap_sumdotp4(V1, A, S4);
-                                S5 = gap_sumdotp4(V1, B, S5);
-                                S6 = gap_sumdotp4(V1, C, S6);
-                                S7 = gap_sumdotp4(V1, D, S7);
-                        }
-                        for (i=(W_In1/4)*4; i<W_In1; i++) {
-				int V0 = In1[l1*W_In1 + i];
-				S0 += V0 * BufferColIn2[i];
-				S1 += V0 * BufferColIn2[i+1*H_In2];
-				S2 += V0 * BufferColIn2[i+2*H_In2];
-				S3 += V0 * BufferColIn2[i+3*H_In2];
-				int V1 = In1[(l1+1)*W_In1 + i];
-				S4 += V1 * BufferColIn2[i];
-				S5 += V1 * BufferColIn2[i+1*H_In2];
-				S6 += V1 * BufferColIn2[i+2*H_In2];
-				S7 += V1 * BufferColIn2[i+3*H_In2];
+	for (int i=0; i<Iter/4; i++) {	/* main loop: 4 In2 rows per iteration */
+		int l2 = 4*i+First;
+		v4s *pIn2_0 = (v4s *) (In2 + (l2+0)*W_In2), *pIn2_1 = (v4s *) (In2 + (l2+1)*W_In2), *pIn2_2 = (v4s *) (In2 + (l2+2)*W_In2), *pIn2_3 = (v4s *) (In2 + (l2+3)*W_In2);
+		for (int j=0; j<H_In1/2; j++) {	/* In1 rows are taken in pairs here: 8 accumulators, every In2 word loaded once serves both rows */
+			int l1 = 2*j;
+			v4s *pIn1_0 = (v4s *) (In1 + l1*W_In1);
+			v4s *pIn1_1 = (v4s *) (In1 + (l1+1)*W_In1);
+			int S0 = Bias?(Bias[l1]<<NormBias):0, S1=S0, S2=S0, S3=S0;	/* accumulators of In1 row l1 */
+			int S4 = Bias?(Bias[l1+1]<<NormBias):0, S5=S4, S6=S4, S7=S4;	/* accumulators of In1 row l1+1 */
+			for (int c=0; c<W_In1/4; c++) {	/* vector part, one v4s word per step; gap_sumdotp4 presumably adds a 4 lane dot product to the accumulator - confirm */
+				v4s C0 = pIn1_0[c], V0 = pIn2_0[c], V1 = pIn2_1[c], V2 = pIn2_2[c], V3 = pIn2_3[c];	/* this C0 is local to the loop body and hides the activation parameter C0 only here */
+				S0 = gap_sumdotp4(C0, V0, S0); S1 = gap_sumdotp4(C0, V1, S1); S2 = gap_sumdotp4(C0, V2, S2); S3 = gap_sumdotp4(C0, V3, S3);
+				v4s C1 = pIn1_1[c];
+				S4 = gap_sumdotp4(C1, V0, S4); S5 = gap_sumdotp4(C1, V1, S5); S6 = gap_sumdotp4(C1, V2, S6); S7 = gap_sumdotp4(C1, V3, S7);
+			}
+			for (int c=(W_In1/4)*4; c<W_In1; c++) {	/* scalar tail for the W_In1%4 elements left over */
+				int C0 = In1[l1*W_In1+c];
+				S0 += C0 * In2[(l2+0)*W_In2+c]; S1 += C0 * In2[(l2+1)*W_In2+c]; S2 += C0 * In2[(l2+2)*W_In2+c]; S3 += C0 * In2[(l2+3)*W_In2+c];
+				int C1 = In1[(l1+1)*W_In1+c];
+				S4 += C1 * In2[(l2+0)*W_In2+c]; S5 += C1 * In2[(l2+1)*W_In2+c]; S6 += C1 * In2[(l2+2)*W_In2+c]; S7 += C1 * In2[(l2+3)*W_In2+c];
 			}
 			unsigned int Sc = Scale[l1], ScN = ScaleN[l1];
+			S0 = AT_SCALE(S0, Sc,  ScN);  ACT_SWITCH(S0, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);	/* C0 here is again the int read from Infos */
+			S1 = AT_SCALE(S1, Sc,  ScN);  ACT_SWITCH(S1, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+			S2 = AT_SCALE(S2, Sc,  ScN);  ACT_SWITCH(S2, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+			S3 = AT_SCALE(S3, Sc,  ScN);  ACT_SWITCH(S3, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+			v4s R = gap_pack4(gap_clip(S0, 7), gap_clip(S1, 7), gap_clip(S2, 7), gap_clip(S3, 7));
+			*((v4s *) (Out+l1*H_In2 + l2)) = R;	/* output row l1, In2 rows l2..l2+3, one v4s store */
 			unsigned int Sc1 = Scale[l1+1], ScN1 = ScaleN[l1+1];
-			v4s R1 = gap_pack4(gap_clip(AT_SCALE(S0, Sc, ScN), 7), gap_clip(AT_SCALE(S1, Sc, ScN), 7),
-					   gap_clip(AT_SCALE(S2, Sc, ScN), 7), gap_clip(AT_SCALE(S3, Sc, ScN), 7));
-			v4s R2 = gap_pack4(gap_clip(AT_SCALE(S4, Sc1, ScN1), 7), gap_clip(AT_SCALE(S5, Sc1, ScN1), 7),
-					   gap_clip(AT_SCALE(S6, Sc1, ScN1), 7), gap_clip(AT_SCALE(S7, Sc1, ScN1), 7));
-			*((v4s *) (Out+(l1+OffLine)*W_Out+4*Col+0+OffCol)) = R1;
-			*((v4s *) (Out+(l1+OffLine+1)*W_Out+4*Col+0+OffCol)) = R2;
-                }
-		if (Iter&0x1) {
-			int l1 = Last - 1;
-			v4s *VIn1 = (v4s *) (&In1[l1*W_In1 + 0]);
-                        int S0=0, S1=0, S2=0, S3=0;
-                        if (Bias) {
-	                        S0 = (Bias[l1]<<NormBias), S1=S0, S2=S0, S3=S0;
-	                }
-			for (i=0; i<(W_In1/4); i++) {
-				v4s V0 = VIn1[i];
-				v4s A = VBuff0[i]; S0 = gap_sumdotp4(V0, A, S0);;
-				v4s B = VBuff1[i]; S1 = gap_sumdotp4(V0, B, S1);;
-				v4s C = VBuff2[i]; S2 = gap_sumdotp4(V0, C, S2);;
-				v4s D = VBuff3[i]; S3 = gap_sumdotp4(V0, D, S3);;
+			S4 = AT_SCALE(S4, Sc1, ScN1);  ACT_SWITCH(S4, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);	/* second row of the pair uses the scaling factors of row l1+1 */
+			S5 = AT_SCALE(S5, Sc1, ScN1);  ACT_SWITCH(S5, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+			S6 = AT_SCALE(S6, Sc1, ScN1);  ACT_SWITCH(S6, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+			S7 = AT_SCALE(S7, Sc1, ScN1);  ACT_SWITCH(S7, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+			v4s R1 = gap_pack4(gap_clip(S4, 7), gap_clip(S5, 7), gap_clip(S6, 7), gap_clip(S7, 7));
+			*((v4s *) (Out+(l1+1)*H_In2 + l2)) = R1;	/* output row l1+1 */
+		}
+		if (H_In1&0x1) {	/* odd H_In1: the last In1 row is handled alone, still against the same 4 In2 rows */
+			int l1 = H_In1 - 1;
+			v4s *pIn1_0 = (v4s *) (In1 + l1*W_In1);
+			int S0 = Bias?(Bias[l1]<<NormBias):0, S1=S0, S2=S0, S3=S0;
+			for (int c=0; c<W_In1/4; c++) {
+				v4s C0 = pIn1_0[c], V0 = pIn2_0[c], V1 = pIn2_1[c], V2 = pIn2_2[c], V3 = pIn2_3[c];
+				S0 = gap_sumdotp4(C0, V0, S0); S1 = gap_sumdotp4(C0, V1, S1); S2 = gap_sumdotp4(C0, V2, S2); S3 = gap_sumdotp4(C0, V3, S3);
 			}
-                        for (i=(W_In1/4)*4; i<W_In1; i++) {
-				int V0 = In1[l1*W_In1 + i];
-				S0 += V0 * BufferColIn2[i];
-				S1 += V0 * BufferColIn2[i+1*H_In2];
-				S2 += V0 * BufferColIn2[i+2*H_In2];
-				S3 += V0 * BufferColIn2[i+3*H_In2];
+			for (int c=(W_In1/4)*4; c<W_In1; c++) {
+				int C0 = In1[l1*W_In1+c];
+				S0 += C0 * In2[(l2+0)*W_In2+c]; S1 += C0 * In2[(l2+1)*W_In2+c]; S2 += C0 * In2[(l2+2)*W_In2+c]; S3 += C0 * In2[(l2+3)*W_In2+c];
 			}
 			unsigned int Sc = Scale[l1], ScN = ScaleN[l1];
-			v4s R1 = gap_pack4(gap_clip(AT_SCALE(S0, Sc, ScN), 7), gap_clip(AT_SCALE(S1, Sc, ScN), 7),
-					   gap_clip(AT_SCALE(S2, Sc, ScN), 7), gap_clip(AT_SCALE(S3, Sc, ScN), 7));
-			*((v4s *) (Out+(l1+OffLine)*W_Out+4*Col+0+OffCol)) = R1;
+			S0 = AT_SCALE(S0, Sc,  ScN);  ACT_SWITCH(S0, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+			S1 = AT_SCALE(S1, Sc,  ScN);  ACT_SWITCH(S1, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+			S2 = AT_SCALE(S2, Sc,  ScN);  ACT_SWITCH(S2, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+			S3 = AT_SCALE(S3, Sc,  ScN);  ACT_SWITCH(S3, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+			v4s R = gap_pack4(gap_clip(S0, 7), gap_clip(S1, 7), gap_clip(S2, 7), gap_clip(S3, 7));
+			*((v4s *) (Out+l1*H_In2 + l2)) = R;
 		}
-                gap_waitbarrier(0);
-        }
-	if (W_In2&0x2) {
-		Col = W_In2/2 - 1;
-                for (i=F;i<L; i++) {
-			int X0 = In2[i*W_In2+2*Col+0];
-			int X1 = In2[i*W_In2+2*Col+1];
-			BufferColIn2[i+0*H_In2] = X0; // In2[i*W_In2+2*Col+0];
-			BufferColIn2[i+1*H_In2] = X1; // In2[i*W_In2+2*Col+1];
-		}
-                gap_waitbarrier(0);
-                for (Line=First; Line<Last; Line++) {
-                        v4s *VIn1 = (v4s *) (&In1[Line*W_In1 + 0]);
-                        int S0=0, S1=0;
-                        if (Bias) {
-	                        S0 = (Bias[Line]<<NormBias), S1=S0;
-	                }
-                        for (i=0; i<(W_In1/4); i++) {
-				v4s V0 = VIn1[i];
-                                S0 = gap_sumdotp4(V0, VBuff0[i], S0);
-                                S1 = gap_sumdotp4(V0, VBuff1[i], S1);
-                        }
-                        for (i=(W_In1/4)*4; i<W_In1; i++) {
-				int V0 = In1[Line*W_In1 + i];
-				S0 += V0 * BufferColIn2[i];
-				S1 += V0 * BufferColIn2[i+1*H_In2];
-			}
-			unsigned int Sc = Scale[Line], ScN = ScaleN[Line];
-			Out[(Line+OffLine)*W_Out+2*Col+0+OffCol] = gap_clip(AT_SCALE(S0, Sc, ScN), 7);
-			Out[(Line+OffLine)*W_Out+2*Col+1+OffCol] = gap_clip(AT_SCALE(S1, Sc, ScN), 7);
-                }
-                gap_waitbarrier(0);
 	}
-	if (W_In2&0x1) {
-		Col = W_In2-1;
-                for (i=F;i<L; i++) {
-			int X0 = In2[i*W_In2+1*Col+0];
-			BufferColIn2[i+0*H_In2] = X0; // In2[i*W_In2+4*Col+0];
+	if (Iter&0x2) {	/* two In2 rows left after the groups of 4; In1 rows are taken one at a time from here on */
+		int l2 = (4*(Iter/4)) + First;	/* first In2 row not covered by the 4 row loop */
+		v4s *pIn2_0 = (v4s *) (In2 + (l2+0)*W_In2);
+		v4s *pIn2_1 = (v4s *) (In2 + (l2+1)*W_In2);
+		for (int l1=0; l1<H_In1; l1++) {
+			v4s *pIn1 = (v4s *) (In1 + l1*W_In1);
+			int S0 = Bias?(Bias[l1]<<NormBias):0, S1=S0;
+			for (int c=0; c<W_In1/4; c++) {
+				v4s C0 = pIn1[c], V0 = pIn2_0[c], V1 = pIn2_1[c];
+				S0 = gap_sumdotp4(C0, V0, S0); S1 = gap_sumdotp4(C0, V1, S1);
+			}
+			for (int c=(W_In1/4)*4; c<W_In1; c++) {
+				int C0 = In1[l1*W_In1+c];
+				S0 += C0 * In2[(l2+0)*W_In2+c]; S1 += C0 * In2[(l2+1)*W_In2+c];
+			}
+			unsigned int Sc = Scale[l1], ScN = ScaleN[l1];
+			S0 = AT_SCALE(S0, Sc,  ScN);  ACT_SWITCH(S0, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+			S1 = AT_SCALE(S1, Sc,  ScN);  ACT_SWITCH(S1, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+			Out[l1*H_In2 + l2+0] = gap_clip(S0, 7);
+			Out[l1*H_In2 + l2+1] = gap_clip(S1, 7);
 		}
-                gap_waitbarrier(0);
-                for (Line=First; Line<Last; Line++) {
-                        v4s *VIn1 = (v4s *) (&In1[Line*W_In1 + 0]);
-                        int S0=0;
-                        if (Bias) {
-	                        S0 = (Bias[Line]<<NormBias);
-	                }
-                        for (i=0; i<(W_In1/4); i++) {
-				v4s V0 = VIn1[i];
-                                S0 = gap_sumdotp4(V0, VBuff0[i], S0);
-                        }
-                        for (i=(W_In1/4)*4; i<W_In1; i++) {
-				int V0 = In1[Line*W_In1 + i];
-				S0 += V0 * BufferColIn2[i];
+	}
+	if (Iter&0x1) {	/* one last In2 row */
+		int l2 = Last-1;
+		v4s *pIn2 = (v4s *) (In2 + (l2+0)*W_In2);
+		for (int l1=0; l1<H_In1; l1++) {
+			v4s *pIn1 = (v4s *) (In1 + l1*W_In1);
+			int S0 = Bias?(Bias[l1]<<NormBias):0;
+			for (int c=0; c<W_In1/(4*2); c++) {	/* two v4s words per step */
+				v4s C0 = pIn1[2*c], C1 = pIn1[2*c+1], V0 = pIn2[2*c], V1 = pIn2[2*c+1];
+				S0 = gap_sumdotp4(C0, V0, S0); S0 = gap_sumdotp4(C1, V1, S0);
 			}
-			unsigned int Sc = Scale[Line], ScN = ScaleN[Line];
-			Out[(Line+OffLine)*W_Out+1*Col+0+OffCol] = gap_clip(AT_SCALE(S0, Sc, ScN), 7);
-                }
-                gap_waitbarrier(0);
+			if (W_In1&0x4) S0 = gap_sumdotp4(pIn1[W_In1/4-1], pIn2[W_In1/4-1], S0);	/* odd count of v4s words: the pair loop left the last one out */
+			for (int c=(W_In1/4)*4; c<W_In1; c++) {
+				int C0 = In1[l1*W_In1+c];
+				S0 += C0 * In2[(l2+0)*W_In2+c];
+			}
+			unsigned int Sc = Scale[l1], ScN = ScaleN[l1];
+			S0 = AT_SCALE(S0, Sc,  ScN);  ACT_SWITCH(S0, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+			Out[l1*H_In2 + l2+0] = gap_clip(S0, 7);
+		}
 	}
+	gap_waitbarrier(0);	/* the only barrier of this kernel, reached after all rows of this core are written */
 }
 
-void KerParMatMulB32_ReLU_SQ8(KerMatMul_SQ8_T *Arg)
+void KerParMatMulB32_SF_SQ8(KerMatMul_SQ8_T *Arg) {	/* Forwards Arg to KerParMatMulB32_SF_SQ8_act (int bias) with Activation fixed to ACT_NONE */
+	KerParMatMulB32_SF_SQ8_act(Arg, ACT_NONE);
+}
 
-{
-	/*
-	 	Column buffer has to be sized in order to be able to accomodate up to 4 columns of size H_In2
-	*/
-        signed char * __restrict__ In1 = Arg->In1;
-        unsigned int W_In1 = Arg->W_In1;
-        unsigned int H_In1 = Arg->H_In1;
-        signed char * __restrict__ In2 = Arg->In2;
-        unsigned int W_In2 = Arg->W_In2;
-        int * __restrict__ Bias = Arg->Bias;
-        signed char * __restrict__ Out = Arg->Out;
-        unsigned int W_Out = Arg->W_Out;
-	unsigned char * __restrict__ Scale = Arg->Scale;
-	unsigned char * __restrict__ ScaleN = Arg->ScaleN;
-	unsigned int NormBias = Arg->NormBias;
-        unsigned int OutFirstCol = Arg->OutFirstCol;
-        signed char * __restrict__ BufferColIn2 = Arg->BufferColIn2;
-        int ColFirst = Arg->ColFirst;
+void KerParMatMulB32_ReLU_SF_SQ8(KerMatMul_SQ8_T *Arg) {	/* Forwards Arg to KerParMatMulB32_SF_SQ8_act (int bias) with Activation fixed to ACT_RELU */
+	KerParMatMulB32_SF_SQ8_act(Arg, ACT_RELU);
+}
 
-        unsigned int H_In2 = W_In1;
-        unsigned int H_Out = H_In1;
-        unsigned int Line, Col, i;
-        v4s * __restrict__ VBuff0 = (v4s *) BufferColIn2;
-        v4s * __restrict__ VBuff1 = (v4s *) (BufferColIn2+H_In2);
-        v4s * __restrict__ VBuff2 = (v4s *) (BufferColIn2+2*H_In2);
-        v4s * __restrict__ VBuff3 = (v4s *) (BufferColIn2+3*H_In2);
+void KerParMatMulB32_ReLUN_SF_SQ8(KerMatMul_SQ8_T *Arg) {	/* Forwards Arg to KerParMatMulB32_SF_SQ8_act (int bias) with Activation fixed to ACT_RELUN */
+	KerParMatMulB32_SF_SQ8_act(Arg, ACT_RELUN);
+}
 
-        unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(H_In1), First = CoreId*ChunkCell, Last  = Min(H_In1, First+ChunkCell);
-        unsigned int C = ChunkSize(H_In2), F = CoreId*C, L  = Min(H_In2, F+C);
-        int OffLine = 0, OffCol = 0;
+void KerParMatMulB32_ReLUM_SF_SQ8(KerMatMul_SQ8_T *Arg) {	/* Forwards Arg to KerParMatMulB32_SF_SQ8_act (int bias) with Activation fixed to ACT_RELUM */
+	KerParMatMulB32_SF_SQ8_act(Arg, ACT_RELUM);
+}
 
-        if (ColFirst) OffLine = OutFirstCol; else OffCol = OutFirstCol;
-        for (Col=0; Col<W_In2/4; Col++) {
-                for (i=F;i<L; i++) {
-			int X0 = In2[i*W_In2+4*Col+0];
-			int X1 = In2[i*W_In2+4*Col+1];
-			int X2 = In2[i*W_In2+4*Col+2];
-			int X3 = In2[i*W_In2+4*Col+3];
-			BufferColIn2[i+0*H_In2] = X0; // In2[i*W_In2+4*Col+0];
-			BufferColIn2[i+1*H_In2] = X1; // In2[i*W_In2+4*Col+1];
-			BufferColIn2[i+2*H_In2] = X2; // In2[i*W_In2+4*Col+2];
-			BufferColIn2[i+3*H_In2] = X3; // In2[i*W_In2+4*Col+3];
-		}
-                gap_waitbarrier(0);
-                for (Line=First; Line<Last; Line++) {
-                        v4s *VIn1 = (v4s *) (&In1[Line*W_In1 + 0]);
-                        int S0=0, S1=0, S2=0, S3=0;
-                        if (Bias) {
-                        	S0 = (Bias[Line]<<NormBias), S1=S0, S2=S0, S3=S0;
-                        }
-                        for (i=0; i<(W_In1/4); i++) {
-				v4s V0 = VIn1[i];
-                                S0 = gap_sumdotp4(V0, VBuff0[i], S0);
-                                S1 = gap_sumdotp4(V0, VBuff1[i], S1);
-                                S2 = gap_sumdotp4(V0, VBuff2[i], S2);
-                                S3 = gap_sumdotp4(V0, VBuff3[i], S3);
-                        }
-                        for (i=(W_In1/4)*4; i<W_In1; i++) {
-				int V0 = In1[Line*W_In1 + i];
-				S0 += V0 * BufferColIn2[i];
-				S1 += V0 * BufferColIn2[i+1*H_In2];
-				S2 += V0 * BufferColIn2[i+2*H_In2];
-				S3 += V0 * BufferColIn2[i+3*H_In2];
+void KerParMatMulB32_ReLUMN_SF_SQ8(KerMatMul_SQ8_T *Arg) {	/* Forwards Arg to KerParMatMulB32_SF_SQ8_act (int bias) with Activation fixed to ACT_RELUMN */
+	KerParMatMulB32_SF_SQ8_act(Arg, ACT_RELUMN);
+}
+
+void KerParMatMulB32_LeakyReLU_SF_SQ8(KerMatMul_SQ8_T *Arg) {	/* Forwards Arg to KerParMatMulB32_SF_SQ8_act (int bias) with Activation fixed to ACT_LEAKYRELU */
+	KerParMatMulB32_SF_SQ8_act(Arg, ACT_LEAKYRELU);
+}
+
+void KerParMatMulB32_HSwish_SF_SQ8(KerMatMul_SQ8_T *Arg) {	/* Forwards Arg to KerParMatMulB32_SF_SQ8_act (int bias) with Activation fixed to ACT_HSWISH */
+	KerParMatMulB32_SF_SQ8_act(Arg, ACT_HSWISH);
+}
+
+void KerParMatMulB32_HSigmoid_SF_SQ8(KerMatMul_SQ8_T *Arg) {	/* Forwards Arg to KerParMatMulB32_SF_SQ8_act (int bias) with Activation fixed to ACT_HSIGMOID */
+	KerParMatMulB32_SF_SQ8_act(Arg, ACT_HSIGMOID);
+}
+
+void KerParMatMulB32_Sigmoid_SF_SQ8(KerMatMul_SQ8_T *Arg) {	/* Forwards Arg to KerParMatMulB32_SF_SQ8_act (int bias) with Activation fixed to ACT_SIGMOID */
+	KerParMatMulB32_SF_SQ8_act(Arg, ACT_SIGMOID);
+}
+
+void KerParMatMulB32_Tanh_SF_SQ8(KerMatMul_SQ8_T *Arg) {	/* Forwards Arg to KerParMatMulB32_SF_SQ8_act (int bias) with Activation fixed to ACT_TANH */
+	KerParMatMulB32_SF_SQ8_act(Arg, ACT_TANH);
+}
+
+/*************************************************************************************************************************************************
+	Matrix by Vector Multiplication with optional Activation (all of them are supported)
+*************************************************************************************************************************************************/
+
+static inline void __attribute__((always_inline)) KerParMatVectMul_SQ8_act(	/* Shared body: for each feature i, every element of In1's W*H plane is multiplied by the scalar In2[i], scaled, activated, clipped and stored to Out. */
+	KerMat3_SQ8_T *Arg,	/* Kernel arguments; the fields In1, In2, Out, W, H, Feat and Infos are read here. */
+	CNN_ActivationOper_T Activation)	/* Activation selector handed to ACT_SWITCH; the wrappers that call this body each pass a constant. */
+
+{
+	signed char * __restrict__ In1	= Arg->In1;
+	signed char * __restrict__ In2	= Arg->In2;	/* One multiplier per feature (read as In2[i] below). */
+	signed char * __restrict__ Out	= Arg->Out;
+	int W				= Arg->W;
+	int H				= Arg->H;
+	unsigned int Scale		= ((unsigned char *)(Arg->Infos))[AT_INF_SCALE];	/* Read as an unsigned byte; a value of 0 selects the branch that skips the extra multiply by Scale. */
+	unsigned int ScaleN		= ((unsigned char *)(Arg->Infos))[AT_INF_SCALEN];	/* Read as an unsigned byte; passed as the last argument of AT_SCALE. */
+	unsigned char * Infos = (unsigned char *) Arg->Infos;
+	unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN];	/* Activation scaling bytes, forwarded to ACT_SWITCH. */
+	int A0 = *((unsigned char *) &Infos[AT_INF_A0]); int B0 = *((unsigned char *) &Infos[AT_INF_B0]); int C0 = *((unsigned char *) &Infos[AT_INF_C0]);	/* Activation parameters, each read as a single unsigned byte. */
+
+	unsigned int CoreId = gap_coreid(), Chunk = ChunkSize(Arg->Feat), First = Chunk*CoreId, Last = Min(First+Chunk, Arg->Feat);	/* Work split by feature: this core handles features [First, Last). */
+
+	if (Scale)	/* Non-zero Scale: each input is multiplied by Scale before being passed to AT_SCALE. */
+		for (int i=First; i<Last; i++) {
+			signed char * __restrict__ I1 = In1 + i*W*H;	/* Feature i occupies W*H consecutive bytes of In1. */
+			int I2 = In2[i];
+			signed char * __restrict__ O  = Out + i*W*H;	/* Out uses the same per-feature offset as In1. */
+			for (int j=0; j<((W*H)/2); j++) {	/* Two elements per iteration; an odd last element is left for the tail statement after the loop. */
+				int I10 = I1[2*j], I11 = I1[2*j+1];
+				int P1 = AT_SCALE(I10*Scale, I2, ScaleN); ACT_SWITCH(P1, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);	/* ACT_SWITCH presumably rewrites P1 in place -- TODO confirm against the macro definition. */
+				int P2 = AT_SCALE(I11*Scale, I2, ScaleN); ACT_SWITCH(P2, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+				O[2*j  ] = gap_clip(P1, 7); O[2*j+1] = gap_clip(P2, 7);	/* gap_clip(x, 7) presumably saturates to the signed 8-bit range -- TODO confirm. */
 			}
-			unsigned int Sc = Scale[Line], ScN = ScaleN[Line];
-			v4s R = gap_pack4(AT_CLIP_POS_IMM(AT_SCALE(S0, Sc, ScN), 7), AT_CLIP_POS_IMM(AT_SCALE(S1, Sc, ScN), 7),
-					  AT_CLIP_POS_IMM(AT_SCALE(S2, Sc, ScN), 7), AT_CLIP_POS_IMM(AT_SCALE(S3, Sc, ScN), 7));
-			*((v4s *) (Out+(Line+OffLine)*W_Out+4*Col+0+OffCol)) = R;
-                }
-                gap_waitbarrier(0);
-        }
-	if (W_In2&0x2) {
-		Col = W_In2/2 - 1;
-                for (i=F;i<L; i++) {
-			int X0 = In2[i*W_In2+2*Col+0];
-			int X1 = In2[i*W_In2+2*Col+1];
-			BufferColIn2[i+0*H_In2] = X0; // In2[i*W_In2+2*Col+0];
-			BufferColIn2[i+1*H_In2] = X1; // In2[i*W_In2+2*Col+1];
+			int P1 = AT_SCALE(I1[W*H-1]*Scale, I2, ScaleN); ACT_SWITCH(P1, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);	/* Tail: handles the last element, which the pairwise loop misses when W*H is odd. NOTE(review): this runs unconditionally, so for even W*H the last element is computed twice, and W*H == 0 would index -1 -- confirm callers never pass an empty plane. */
+			O[W*H-1] = gap_clip(P1, 7);
 		}
-                gap_waitbarrier(0);
-                for (Line=First; Line<Last; Line++) {
-                        v4s *VIn1 = (v4s *) (&In1[Line*W_In1 + 0]);
-                        int S0=0, S1=0;
-                        if (Bias) {
-                        	S0 = (Bias[Line]<<NormBias), S1=S0;
-                        }
-                        for (i=0; i<(W_In1/4); i++) {
-				v4s V0 = VIn1[i];
-                                S0 = gap_sumdotp4(V0, VBuff0[i], S0);
-                                S1 = gap_sumdotp4(V0, VBuff1[i], S1);
-                        }
-                        for (i=(W_In1/4)*4; i<W_In1; i++) {
-				int V0 = In1[Line*W_In1 + i];
-				S0 += V0 * BufferColIn2[i];
-				S1 += V0 * BufferColIn2[i+1*H_In2];
+	else	/* Scale == 0: same traversal as the branch above, but the input is passed to AT_SCALE without the multiply by Scale. */
+		for (int i=First; i<Last; i++) {
+			signed char * __restrict__ I1 = In1 + i*W*H;	/* Feature i occupies W*H consecutive bytes of In1. */
+			int I2 = In2[i];
+			signed char * __restrict__ O  = Out + i*W*H;
+			for (int j=0; j<((W*H)/2); j++) {	/* Two elements per iteration. */
+				int I10 = I1[2*j], I11 = I1[2*j+1];
+				int P1 = AT_SCALE(I10, I2, ScaleN); ACT_SWITCH(P1, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+				int P2 = AT_SCALE(I11, I2, ScaleN); ACT_SWITCH(P2, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+				O[2*j  ] = gap_clip(P1, 7); O[2*j+1] = gap_clip(P2, 7);
 			}
-			unsigned int Sc = Scale[Line], ScN = ScaleN[Line];
-			Out[(Line+OffLine)*W_Out+2*Col+0+OffCol] = AT_CLIP_POS_IMM(AT_SCALE(S0, Sc, ScN), 7);
-			Out[(Line+OffLine)*W_Out+2*Col+1+OffCol] = AT_CLIP_POS_IMM(AT_SCALE(S1, Sc, ScN), 7);
-                }
-                gap_waitbarrier(0);
-	}
-	if (W_In2&0x1) {
-		Col = W_In2-1;
-                for (i=F;i<L; i++) {
-			int X0 = In2[i*W_In2+1*Col+0];
-			BufferColIn2[i+0*H_In2] = X0; // In2[i*W_In2+4*Col+0];
+			int P1 = AT_SCALE(I1[W*H-1], I2, ScaleN); ACT_SWITCH(P1, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);	/* Tail for odd W*H. NOTE(review): executed unconditionally, so it repeats the last element when W*H is even and indexes -1 when W*H == 0 -- confirm this is intended. */
+			O[W*H-1] = gap_clip(P1, 7);
 		}
-                gap_waitbarrier(0);
-                for (Line=First; Line<Last; Line++) {
-                        v4s *VIn1 = (v4s *) (&In1[Line*W_In1 + 0]);
-                        int S0 = Bias?(Bias[Line]<<NormBias):0;
-                        for (i=0; i<(W_In1/4); i++) {
-				v4s V0 = VIn1[i];
-                                S0 = gap_sumdotp4(V0, VBuff0[i], S0);
-                        }
-                        for (i=(W_In1/4)*4; i<W_In1; i++) {
-				int V0 = In1[Line*W_In1 + i];
-				S0 += V0 * BufferColIn2[i];
-			}
-			unsigned int Sc = Scale[Line], ScN = ScaleN[Line];
-			Out[(Line+OffLine)*W_Out+1*Col+0+OffCol] = AT_CLIP_POS_IMM(AT_SCALE(S0, Sc, ScN), 7);
-                }
-                gap_waitbarrier(0);
-	}
+	gap_waitbarrier(0);	/* Called once by every core after its feature range is done; presumably a barrier across the cores sharing this kernel -- TODO confirm. */
 }
 
-void KerParMatMulB32_2x4_ReLU_SQ8(KerMatMul_SQ8_T *Arg)
+void KerParMatVectMul_SQ8(KerMat3_SQ8_T *Arg) {
+	KerParMatVectMul_SQ8_act(Arg, ACT_NONE);	/* Entry point: runs the always-inline KerParMatVectMul_SQ8_act body with ACT_NONE selected. */
+}
 
-{
-	/*
-	 	Column buffer has to be sized in order to be able to accomodate up to 4 columns of size H_In2
-	*/
-        signed char * __restrict__ In1 = Arg->In1;
-        unsigned int W_In1 = Arg->W_In1;
-        unsigned int H_In1 = Arg->H_In1;
-        signed char * __restrict__ In2 = Arg->In2;
-        unsigned int W_In2 = Arg->W_In2;
-        int * __restrict__ Bias = Arg->Bias;
-        signed char * __restrict__ Out = Arg->Out;
-        unsigned int W_Out = Arg->W_Out;
-	unsigned char * __restrict__ Scale = Arg->Scale;
-	unsigned char * __restrict__ ScaleN = Arg->ScaleN;
-	unsigned int NormBias = Arg->NormBias;
-        unsigned int OutFirstCol = Arg->OutFirstCol;
-        signed char * __restrict__ BufferColIn2 = Arg->BufferColIn2;
-        int ColFirst = Arg->ColFirst;
+void KerParMatVectMul_ReLU_SQ8(KerMat3_SQ8_T *Arg) {
+	KerParMatVectMul_SQ8_act(Arg, ACT_RELU);	/* Entry point: runs the always-inline KerParMatVectMul_SQ8_act body with ACT_RELU selected. */
+}
 
-        unsigned int H_In2 = W_In1;
-        unsigned int H_Out = H_In1;
-        unsigned int Line, Col, i;
-        v4s * __restrict__ VBuff0 = (v4s *) BufferColIn2;
-        v4s * __restrict__ VBuff1 = (v4s *) (BufferColIn2+H_In2);
-        v4s * __restrict__ VBuff2 = (v4s *) (BufferColIn2+2*H_In2);
-        v4s * __restrict__ VBuff3 = (v4s *) (BufferColIn2+3*H_In2);
+void KerParMatVectMul_ReLUN_SQ8(KerMat3_SQ8_T *Arg) {
+	KerParMatVectMul_SQ8_act(Arg, ACT_RELUN);	/* Entry point: runs the always-inline KerParMatVectMul_SQ8_act body with ACT_RELUN selected. */
+}
 
-        unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(H_In1), First = CoreId*ChunkCell, Last  = Min(H_In1, First+ChunkCell);
-        unsigned int Iter = (Last>First)?(Last-First):0;
-        unsigned int C = ChunkSize(H_In2), F = CoreId*C, L  = Min(H_In2, F+C);
-        int OffLine = 0, OffCol = 0;
+void KerParMatVectMul_ReLUM_SQ8(KerMat3_SQ8_T *Arg) {
+	KerParMatVectMul_SQ8_act(Arg, ACT_RELUM);	/* Entry point: runs the always-inline KerParMatVectMul_SQ8_act body with ACT_RELUM selected. */
+}
 
-        if (ColFirst) OffLine = OutFirstCol; else OffCol = OutFirstCol;
-        for (Col=0; Col<W_In2/4; Col++) {
-                for (i=F;i<L; i++) {
-			int X0 = In2[i*W_In2+4*Col+0];
-			int X1 = In2[i*W_In2+4*Col+1];
-			int X2 = In2[i*W_In2+4*Col+2];
-			int X3 = In2[i*W_In2+4*Col+3];
-			BufferColIn2[i+0*H_In2] = X0; // In2[i*W_In2+4*Col+0];
-			BufferColIn2[i+1*H_In2] = X1; // In2[i*W_In2+4*Col+1];
-			BufferColIn2[i+2*H_In2] = X2; // In2[i*W_In2+4*Col+2];
-			BufferColIn2[i+3*H_In2] = X3; // In2[i*W_In2+4*Col+3];
-		}
-                gap_waitbarrier(0);
-                for (Line=0; Line<Iter/2; Line++) {
-                	int l1 = 2*Line + First;
-                        v4s *VIn1 = (v4s *) (&In1[(l1)*W_In1 + 0]);
-                        v4s *VIn2 = (v4s *) (&In1[(l1+1)*W_In1 + 0]);
-                        int S0=0, S1=0, S2=0, S3=0, S4=0, S5=0, S6=0, S7=0;
-                        if (Bias) {
-	                        S0 = (Bias[l1]<<NormBias), S1=S0, S2=S0, S3=S0;
-	                        S4 = (Bias[l1+1]<<NormBias), S5=S4, S6=S4, S7=S4;
-	                }
-                        for (i=0; i<(W_In1/4); i++) {
-				v4s V0 = VIn1[i];
-				v4s A = VBuff0[i];
-				v4s B = VBuff1[i];
-				v4s C = VBuff2[i];
-				v4s D = VBuff3[i];
-                                S0 = gap_sumdotp4(V0, A, S0);
-                                S1 = gap_sumdotp4(V0, B, S1);
-                                S2 = gap_sumdotp4(V0, C, S2);
-                                S3 = gap_sumdotp4(V0, D, S3);
-				v4s V1 = VIn2[i];
-                                S4 = gap_sumdotp4(V1, A, S4);
-                                S5 = gap_sumdotp4(V1, B, S5);
-                                S6 = gap_sumdotp4(V1, C, S6);
-                                S7 = gap_sumdotp4(V1, D, S7);
-                        }
-                        for (i=(W_In1/4)*4; i<W_In1; i++) {
-				int V0 = In1[l1*W_In1 + i];
-				S0 += V0 * BufferColIn2[i];
-				S1 += V0 * BufferColIn2[i+1*H_In2];
-				S2 += V0 * BufferColIn2[i+2*H_In2];
-				S3 += V0 * BufferColIn2[i+3*H_In2];
-				int V1 = In1[(l1+1)*W_In1 + i];
-				S4 += V1 * BufferColIn2[i];
-				S5 += V1 * BufferColIn2[i+1*H_In2];
-				S6 += V1 * BufferColIn2[i+2*H_In2];
-				S7 += V1 * BufferColIn2[i+3*H_In2];
-			}
-			unsigned int Sc = Scale[l1], ScN = ScaleN[l1];
-			unsigned int Sc1 = Scale[l1+1], ScN1 = ScaleN[l1+1];
-			v4s R1 = gap_pack4(AT_CLIP_POS_IMM(AT_SCALE(S0, Sc, ScN), 7), AT_CLIP_POS_IMM(AT_SCALE(S1, Sc, ScN), 7),
-					   AT_CLIP_POS_IMM(AT_SCALE(S2, Sc, ScN), 7), AT_CLIP_POS_IMM(AT_SCALE(S3, Sc, ScN), 7));
-			v4s R2 = gap_pack4(AT_CLIP_POS_IMM(AT_SCALE(S4, Sc1, ScN1), 7), AT_CLIP_POS_IMM(AT_SCALE(S5, Sc1, ScN1), 7),
-					   AT_CLIP_POS_IMM(AT_SCALE(S6, Sc1, ScN1), 7), AT_CLIP_POS_IMM(AT_SCALE(S7, Sc1, ScN1), 7));
-			*((v4s *) (Out+(l1+OffLine)*W_Out+4*Col+0+OffCol)) = R1;
-			*((v4s *) (Out+(l1+OffLine+1)*W_Out+4*Col+0+OffCol)) = R2;
-                }
-		if (Iter&0x1) {
-			int l1 = Last - 1;
-			v4s *VIn1 = (v4s *) (&In1[l1*W_In1 + 0]);
-			int S0 = Bias?(Bias[l1]<<NormBias):0, S1=S0, S2=S0, S3=S0;
-			for (i=0; i<(W_In1/4); i++) {
-				v4s V0 = VIn1[i];
-				v4s A = VBuff0[i]; S0 = gap_sumdotp4(V0, A, S0);;
-				v4s B = VBuff1[i]; S1 = gap_sumdotp4(V0, B, S1);;
-				v4s C = VBuff2[i]; S2 = gap_sumdotp4(V0, C, S2);;
-				v4s D = VBuff3[i]; S3 = gap_sumdotp4(V0, D, S3);;
-			}
-                        for (i=(W_In1/4)*4; i<W_In1; i++) {
-				int V0 = In1[l1*W_In1 + i];
-				S0 += V0 * BufferColIn2[i];
-				S1 += V0 * BufferColIn2[i+1*H_In2];
-				S2 += V0 * BufferColIn2[i+2*H_In2];
-				S3 += V0 * BufferColIn2[i+3*H_In2];
-			}
-			unsigned int Sc = Scale[l1], ScN = ScaleN[l1];
-			v4s R1 = gap_pack4(AT_CLIP_POS_IMM(AT_SCALE(S0, Sc, ScN), 7), AT_CLIP_POS_IMM(AT_SCALE(S1, Sc, ScN), 7),
-					   AT_CLIP_POS_IMM(AT_SCALE(S2, Sc, ScN), 7), AT_CLIP_POS_IMM(AT_SCALE(S3, Sc, ScN), 7));
-			*((v4s *) (Out+(l1+OffLine)*W_Out+4*Col+0+OffCol)) = R1;
-		}
-                gap_waitbarrier(0);
-        }
-	if (W_In2&0x2) {
-		Col = W_In2/2 - 1;
-                for (i=F;i<L; i++) {
-			int X0 = In2[i*W_In2+2*Col+0];
-			int X1 = In2[i*W_In2+2*Col+1];
-			BufferColIn2[i+0*H_In2] = X0; // In2[i*W_In2+2*Col+0];
-			BufferColIn2[i+1*H_In2] = X1; // In2[i*W_In2+2*Col+1];
-		}
-                gap_waitbarrier(0);
-                for (Line=First; Line<Last; Line++) {
-                        v4s *VIn1 = (v4s *) (&In1[Line*W_In1 + 0]);
-                        int S0 = Bias?(Bias[Line]<<NormBias):0, S1=S0;
-                        for (i=0; i<(W_In1/4); i++) {
-				v4s V0 = VIn1[i];
-                                S0 = gap_sumdotp4(V0, VBuff0[i], S0);
-                                S1 = gap_sumdotp4(V0, VBuff1[i], S1);
-                        }
-                        for (i=(W_In1/4)*4; i<W_In1; i++) {
-				int V0 = In1[Line*W_In1 + i];
-				S0 += V0 * BufferColIn2[i];
-				S1 += V0 * BufferColIn2[i+1*H_In2];
-			}
-			unsigned int Sc = Scale[Line], ScN = ScaleN[Line];
-			Out[(Line+OffLine)*W_Out+2*Col+0+OffCol] = AT_CLIP_POS_IMM(AT_SCALE(S0, Sc, ScN), 7);
-			Out[(Line+OffLine)*W_Out+2*Col+1+OffCol] = AT_CLIP_POS_IMM(AT_SCALE(S1, Sc, ScN), 7);
-                }
-                gap_waitbarrier(0);
-	}
-	if (W_In2&0x1) {
-		Col = W_In2-1;
-                for (i=F;i<L; i++) {
-			int X0 = In2[i*W_In2+1*Col+0];
-			BufferColIn2[i+0*H_In2] = X0; // In2[i*W_In2+4*Col+0];
-		}
-                gap_waitbarrier(0);
-                for (Line=First; Line<Last; Line++) {
-                        v4s *VIn1 = (v4s *) (&In1[Line*W_In1 + 0]);
-                        int S0 = Bias?(Bias[Line]<<NormBias):0;
-                        for (i=0; i<(W_In1/4); i++) {
-				v4s V0 = VIn1[i];
-                                S0 = gap_sumdotp4(V0, VBuff0[i], S0);
-                        }
-                        for (i=(W_In1/4)*4; i<W_In1; i++) {
-				int V0 = In1[Line*W_In1 + i];
-				S0 += V0 * BufferColIn2[i];
-			}
-			unsigned int Sc = Scale[Line], ScN = ScaleN[Line];
-			Out[(Line+OffLine)*W_Out+1*Col+0+OffCol] = AT_CLIP_POS_IMM(AT_SCALE(S0, Sc, ScN), 7);
-                }
-                gap_waitbarrier(0);
-	}
+void KerParMatVectMul_ReLUMN_SQ8(KerMat3_SQ8_T *Arg) {
+	KerParMatVectMul_SQ8_act(Arg, ACT_RELUMN);	/* Entry point: runs the always-inline KerParMatVectMul_SQ8_act body with ACT_RELUMN selected. */
 }
 
-void KerParMatMulB32_ReLUN_SQ8(KerMatMul_SQ8_T *Arg)
+void KerParMatVectMul_LeakyReLU_SQ8(KerMat3_SQ8_T *Arg) {
+	KerParMatVectMul_SQ8_act(Arg, ACT_LEAKYRELU);	/* Entry point: runs the always-inline KerParMatVectMul_SQ8_act body with ACT_LEAKYRELU selected. */
+}
 
-{
-	/*
-	 	Column buffer has to be sized in order to be able to accomodate up to 4 columns of size H_In2
-	*/
-        signed char * __restrict__ In1 = Arg->In1;
-        unsigned int W_In1 = Arg->W_In1;
-        unsigned int H_In1 = Arg->H_In1;
-        signed char * __restrict__ In2 = Arg->In2;
-        unsigned int W_In2 = Arg->W_In2;
-        int * __restrict__ Bias = Arg->Bias;
-        signed char * __restrict__ Out = Arg->Out;
-        unsigned int W_Out = Arg->W_Out;
-	unsigned char * __restrict__ Scale = Arg->Scale;
-	unsigned char * __restrict__ ScaleN = Arg->ScaleN;
-	unsigned int NormBias = Arg->NormBias;
-        unsigned int OutFirstCol = Arg->OutFirstCol;
-        signed char * __restrict__ BufferColIn2 = Arg->BufferColIn2;
-        int ColFirst = Arg->ColFirst;
-	int A0 = Arg->Infos[AT_INF_A0];
+void KerParMatVectMul_HSwish_SQ8(KerMat3_SQ8_T *Arg) {
+	KerParMatVectMul_SQ8_act(Arg, ACT_HSWISH);	/* Entry point: runs the always-inline KerParMatVectMul_SQ8_act body with ACT_HSWISH selected. */
+}
 
-        unsigned int H_In2 = W_In1;
-        unsigned int H_Out = H_In1;
-        unsigned int Line, Col, i;
-        v4s * __restrict__ VBuff0 = (v4s *) BufferColIn2;
-        v4s * __restrict__ VBuff1 = (v4s *) (BufferColIn2+H_In2);
-        v4s * __restrict__ VBuff2 = (v4s *) (BufferColIn2+2*H_In2);
-        v4s * __restrict__ VBuff3 = (v4s *) (BufferColIn2+3*H_In2);
+void KerParMatVectMul_HSigmoid_SQ8(KerMat3_SQ8_T *Arg) {
+	KerParMatVectMul_SQ8_act(Arg, ACT_HSIGMOID);	/* Entry point: runs the always-inline KerParMatVectMul_SQ8_act body with ACT_HSIGMOID selected. */
+}
 
-        unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(H_In1), First = CoreId*ChunkCell, Last  = Min(H_In1, First+ChunkCell);
-        unsigned int C = ChunkSize(H_In2), F = CoreId*C, L  = Min(H_In2, F+C);
-        int OffLine = 0, OffCol = 0;
+void KerParMatVectMul_Sigmoid_SQ8(KerMat3_SQ8_T *Arg) {
+	KerParMatVectMul_SQ8_act(Arg, ACT_SIGMOID);	/* Entry point: runs the always-inline KerParMatVectMul_SQ8_act body with ACT_SIGMOID selected. */
+}
 
-        if (ColFirst) OffLine = OutFirstCol; else OffCol = OutFirstCol;
-        for (Col=0; Col<W_In2/4; Col++) {
-                for (i=F;i<L; i++) {
-			int X0 = In2[i*W_In2+4*Col+0];
-			int X1 = In2[i*W_In2+4*Col+1];
-			int X2 = In2[i*W_In2+4*Col+2];
-			int X3 = In2[i*W_In2+4*Col+3];
-			BufferColIn2[i+0*H_In2] = X0; // In2[i*W_In2+4*Col+0];
-			BufferColIn2[i+1*H_In2] = X1; // In2[i*W_In2+4*Col+1];
-			BufferColIn2[i+2*H_In2] = X2; // In2[i*W_In2+4*Col+2];
-			BufferColIn2[i+3*H_In2] = X3; // In2[i*W_In2+4*Col+3];
-		}
-                gap_waitbarrier(0);
-                for (Line=First; Line<Last; Line++) {
-                        v4s *VIn1 = (v4s *) (&In1[Line*W_In1 + 0]);
-                        int S0 = Bias?(Bias[Line]<<NormBias):0, S1=S0, S2=S0, S3=S0;
-                        for (i=0; i<(W_In1/4); i++) {
-				v4s V0 = VIn1[i];
-                                S0 = gap_sumdotp4(V0, VBuff0[i], S0);
-                                S1 = gap_sumdotp4(V0, VBuff1[i], S1);
-                                S2 = gap_sumdotp4(V0, VBuff2[i], S2);
-                                S3 = gap_sumdotp4(V0, VBuff3[i], S3);
-                        }
-                        for (i=(W_In1/4)*4; i<W_In1; i++) {
-				int V0 = In1[Line*W_In1 + i];
-				S0 += V0 * BufferColIn2[i];
-				S1 += V0 * BufferColIn2[i+1*H_In2];
-				S2 += V0 * BufferColIn2[i+2*H_In2];
-				S3 += V0 * BufferColIn2[i+3*H_In2];
-			}
-			unsigned int Sc = Scale[Line], ScN = ScaleN[Line];
-			v4s R = gap_pack4(gap_clip(AT_CLIP_POS(AT_SCALE(S0, Sc, ScN), A0), 7), gap_clip(AT_CLIP_POS(AT_SCALE(S1, Sc, ScN), A0), 7),
-					  gap_clip(AT_CLIP_POS(AT_SCALE(S2, Sc, ScN), A0), 7), gap_clip(AT_CLIP_POS(AT_SCALE(S3, Sc, ScN), A0), 7));
-			*((v4s *) (Out+(Line+OffLine)*W_Out+4*Col+0+OffCol)) = R;
-                }
-                gap_waitbarrier(0);
-        }
-	if (W_In2&0x2) {
-		Col = W_In2/2 - 1;
-                for (i=F;i<L; i++) {
-			int X0 = In2[i*W_In2+2*Col+0];
-			int X1 = In2[i*W_In2+2*Col+1];
-			BufferColIn2[i+0*H_In2] = X0; // In2[i*W_In2+2*Col+0];
-			BufferColIn2[i+1*H_In2] = X1; // In2[i*W_In2+2*Col+1];
-		}
-                gap_waitbarrier(0);
-                for (Line=First; Line<Last; Line++) {
-                        v4s *VIn1 = (v4s *) (&In1[Line*W_In1 + 0]);
-                        int S0 = Bias?(Bias[Line]<<NormBias):0, S1=S0;
-                        for (i=0; i<(W_In1/4); i++) {
-				v4s V0 = VIn1[i];
-                                S0 = gap_sumdotp4(V0, VBuff0[i], S0);
-                                S1 = gap_sumdotp4(V0, VBuff1[i], S1);
-                        }
-                        for (i=(W_In1/4)*4; i<W_In1; i++) {
-				int V0 = In1[Line*W_In1 + i];
-				S0 += V0 * BufferColIn2[i];
-				S1 += V0 * BufferColIn2[i+1*H_In2];
-			}
-			unsigned int Sc = Scale[Line], ScN = ScaleN[Line];
-			Out[(Line+OffLine)*W_Out+2*Col+0+OffCol] = gap_clip(AT_CLIP_POS(AT_SCALE(S0, Sc, ScN), A0), 7);
-			Out[(Line+OffLine)*W_Out+2*Col+1+OffCol] = gap_clip(AT_CLIP_POS(AT_SCALE(S1, Sc, ScN), A0), 7);
-                }
-                gap_waitbarrier(0);
-	}
-	if (W_In2&0x1) {
-		Col = W_In2-1;
-                for (i=F;i<L; i++) {
-			int X0 = In2[i*W_In2+1*Col+0];
-			BufferColIn2[i+0*H_In2] = X0; // In2[i*W_In2+4*Col+0];
-		}
-                gap_waitbarrier(0);
-                for (Line=First; Line<Last; Line++) {
-                        v4s *VIn1 = (v4s *) (&In1[Line*W_In1 + 0]);
-                        int S0 = Bias?(Bias[Line]<<NormBias):0;
-                        for (i=0; i<(W_In1/4); i++) {
-				v4s V0 = VIn1[i];
-                                S0 = gap_sumdotp4(V0, VBuff0[i], S0);
-                        }
-                        for (i=(W_In1/4)*4; i<W_In1; i++) {
-				int V0 = In1[Line*W_In1 + i];
-				S0 += V0 * BufferColIn2[i];
-			}
-			unsigned int Sc = Scale[Line], ScN = ScaleN[Line];
-			Out[(Line+OffLine)*W_Out+1*Col+0+OffCol] = gap_clip(AT_CLIP_POS(AT_SCALE(S0, Sc, ScN), A0), 7);
-                }
-                gap_waitbarrier(0);
-	}
+void KerParMatVectMul_Tanh_SQ8(KerMat3_SQ8_T *Arg) {
+	KerParMatVectMul_SQ8_act(Arg, ACT_TANH);	/* Entry point: runs the always-inline KerParMatVectMul_SQ8_act body with ACT_TANH selected. */
 }
 
-void KerParMatMulSxSyB32_SQ8(KerMatMul_SQ8_T *Arg)
+static inline void __attribute__((always_inline)) KerParMatVectMul_HWC_SQ8_act(	/* Shared body for the feature-interleaved layout: element j of feature i is at index j*Feat + i in In1 and Out; each is multiplied by In2[i], scaled, activated and clipped. */
+	KerMat3_SQ8_T *Arg,	/* Kernel arguments; the fields In1, In2, Out, W, H, Feat and Infos are read here. */
+	CNN_ActivationOper_T Activation)	/* Activation selector handed to ACT_SWITCH; the wrappers that call this body each pass a constant. */
 
 {
-/*
-	In1 is usually the Conv1x1 filter set, e,g In1 is [OutFeat][InFeat]
-	In2 is  [InFeat][Width*Height]
-
-	When we receive tiles In2 and if StrideY is != 1 tile is always [OutFeat][K*(Width*Scy)]
-*/
-	signed char * __restrict__ In1 = Arg->In1;
-	unsigned int W_In1 = Arg->W_In1;
-	unsigned int H_In1 = Arg->H_In1;
-	signed char * __restrict__ In2 = Arg->In2;
-	unsigned int W_In2 = Arg->W_In2;
-	int * __restrict__ Bias = Arg->Bias;
-	signed char * __restrict__ Out = Arg->Out;
-	unsigned int W_Out = Arg->W_Out;
-	int Pi = Arg->OutFirstCol;
-	signed char *BufferColIn2 = Arg->BufferColIn2;
-	unsigned int NormBias = Arg->NormBias;
-	int Wi = Arg->W, Hi = Arg->H;
-	int Sx = Arg->Sx, Sy = Arg->Sy;
-	int ColFirst = Arg->ColFirst;
-	unsigned char * __restrict__ Scale = Arg->Scale;
-	unsigned char * __restrict__ ScaleN = Arg->ScaleN;
-
-	unsigned int H_In2 = W_In1;
-	unsigned int H_Out = H_In1;
-
-	int Wo  = (Wi+Sx-1)/Sx, Ho = (Hi+Sy-1)/Sy;
-	int Oo, OffLine;
-	int At, F=0, L = W_In2;
-
-	unsigned int Line, Col, i;
-	v4s *VBuff = (v4s *) BufferColIn2;
-
-	unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(H_In1), First = CoreId*ChunkCell, Last  = Min(H_In1, First+ChunkCell);
-	unsigned int Ci = ChunkSize(H_In2), Fi = CoreId*Ci, Li  = Min(H_In2, Fi+Ci);
+	signed char * __restrict__ In1	= Arg->In1;
+	signed char * __restrict__ In2	= Arg->In2;	/* One multiplier per feature (read as In2[i] below). */
+	signed char * __restrict__ Out	= Arg->Out;
+	int W				= Arg->W;
+	int H				= Arg->H;
+	int Feat			= Arg->Feat;	/* Feature count; also the element stride used when walking one feature. */
+	unsigned int Scale		= ((unsigned char *)(Arg->Infos))[AT_INF_SCALE];	/* Read as an unsigned byte; a value of 0 selects the branch that skips the extra multiply by Scale. */
+	unsigned int ScaleN		= ((unsigned char *)(Arg->Infos))[AT_INF_SCALEN];	/* Read as an unsigned byte; passed as the last argument of AT_SCALE. */
+	unsigned char * Infos = (unsigned char *) Arg->Infos;
+	unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN];	/* Activation scaling bytes, forwarded to ACT_SWITCH. */
+	int A0 = *((unsigned char *) &Infos[AT_INF_A0]); int B0 = *((unsigned char *) &Infos[AT_INF_B0]); int C0 = *((unsigned char *) &Infos[AT_INF_C0]);	/* Activation parameters, each read as a single unsigned byte. */
+	int S				= W*H;	/* Number of elements per feature. */
 
-	At = 0; OffLine = 0; Oo = 0;
-	if (ColFirst) OffLine = Pi; else Oo = Pi;
+	unsigned int CoreId = gap_coreid(), Chunk = ChunkSize(Feat), First = Chunk*CoreId, Last = Min(First+Chunk, Feat);	/* Work split by feature: this core handles features [First, Last). */
 
-	while (L>0) {
-	       	for (i=Fi;i<Li; i++) BufferColIn2[i] = In2[i*W_In2+At];
-	       	gap_waitbarrier(0);
-	       	for (Line=First; Line<Last; Line++) {
-		       	v4s *VIn1 = (v4s *) (&In1[Line*W_In1 + 0]);
-		       	int S = Bias?(Bias[Line]<<NormBias):0;
-		       	for (i=0; i<(W_In1/(4*2)); i++) {
-				S = gap_sumdotp4(VIn1[2*i], VBuff[2*i], S);
-				S = gap_sumdotp4(VIn1[2*i+1], VBuff[2*i+1], S);
+	if (Scale)	/* Non-zero Scale: each input is multiplied by Scale before being passed to AT_SCALE. */
+		for (int i=First; i<Last; i++) {
+			signed char * __restrict__ I1 = In1 + i;	/* Feature i starts at offset i; its successive elements are Feat bytes apart. */
+			int I2 = In2[i];
+			signed char * __restrict__ O  = Out + i;	/* Out is addressed with the same offset and stride as In1. */
+			for (int j=0; j<(S/2); j++) {	/* Two elements per iteration; an odd last element is left for the tail statement after the loop. */
+				int I10 = I1[(2*j)*Feat], I11 = I1[(2*j+1)*Feat];
+				int P1 = AT_SCALE(I10*Scale, I2, ScaleN); ACT_SWITCH(P1, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);	/* ACT_SWITCH presumably rewrites P1 in place -- TODO confirm against the macro definition. */
+				int P2 = AT_SCALE(I11*Scale, I2, ScaleN); ACT_SWITCH(P2, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+				O[(2*j)*Feat] = gap_clip(P1, 7); O[(2*j+1)*Feat] = gap_clip(P2, 7);	/* gap_clip(x, 7) presumably saturates to the signed 8-bit range -- TODO confirm. */
 			}
-			if (W_In1&0x4) S = gap_sumdotp4(VIn1[W_In1/4-1], VBuff[W_In1/4-1], S);
-		       	for (i=(W_In1/4)*4; i<W_In1; i++) S += In1[Line*W_In1 + i] * BufferColIn2[i];
-			unsigned int Sc = Scale[Line], ScN = ScaleN[Line];
-		       	Out[(Line+OffLine)*W_Out+Oo] = gap_clip(AT_SCALE(S, Sc, ScN), 7);
-	       	}
-		int nF = F+Sx;
-		if (nF<Wi) {
-			F = nF; At += Sx; L -= Sx; Oo++;
-		} else {
-			int d = Wi-F+(Sy-1)*Wi;
-			F = 0; L -= d; At += d; Oo++;
+			int P1 = AT_SCALE(I1[(S-1)*Feat]*Scale, I2, ScaleN); ACT_SWITCH(P1, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);	/* Tail: handles the last element, which the pairwise loop misses when S is odd. NOTE(review): this runs unconditionally, so for even S the last element is computed twice, and S == 0 would index -Feat -- confirm callers never pass an empty plane. */
+			O[(S-1)*Feat] = gap_clip(P1, 7);
 		}
-	       	gap_waitbarrier(0);
-	}
-}
-
-void KerParMatMulSxSyB32_ReLU_SQ8(KerMatMul_SQ8_T *Arg)
-
-{
-/*
-	In1 is usually the Conv1x1 filter set, e,g In1 is [OutFeat][InFeat]
-	In2 is  [InFeat][Width*Height]
-
-	When we receive tiles In2 and if StrideY is != 1 tile is always [OutFeat][K*(Width*Scy)]
-*/
-	signed char * __restrict__ In1 = Arg->In1;
-	unsigned int W_In1 = Arg->W_In1;
-	unsigned int H_In1 = Arg->H_In1;
-	signed char * __restrict__ In2 = Arg->In2;
-	unsigned int W_In2 = Arg->W_In2;
-	int * __restrict__ Bias = Arg->Bias;
-	signed char * __restrict__ Out = Arg->Out;
-	unsigned int W_Out = Arg->W_Out;
-	int Pi = Arg->OutFirstCol;
-	signed char *BufferColIn2 = Arg->BufferColIn2;
-	unsigned int NormBias = Arg->NormBias;
-	int Wi = Arg->W, Hi = Arg->H;
-	int Sx = Arg->Sx, Sy = Arg->Sy;
-	int ColFirst = Arg->ColFirst;
-	unsigned char * __restrict__ Scale = Arg->Scale;
-	unsigned char * __restrict__ ScaleN = Arg->ScaleN;
-
-	unsigned int H_In2 = W_In1;
-	unsigned int H_Out = H_In1;
-
-	int Wo  = (Wi+Sx-1)/Sx, Ho = (Hi+Sy-1)/Sy;
-	int Oo, OffLine;
-	int At, F=0, L = W_In2;
-
-	unsigned int Line, Col, i;
-	v4s *VBuff = (v4s *) BufferColIn2;
-
-	unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(H_In1), First = CoreId*ChunkCell, Last  = Min(H_In1, First+ChunkCell);
-	unsigned int Ci = ChunkSize(H_In2), Fi = CoreId*Ci, Li  = Min(H_In2, Fi+Ci);
-
-	At = 0; OffLine = 0; Oo = 0;
-	if (ColFirst) OffLine = Pi; else Oo = Pi;
-
-	while (L>0) {
-	       	for (i=Fi;i<Li; i++) BufferColIn2[i] = In2[i*W_In2+At];
-	       	gap_waitbarrier(0);
-	       	for (Line=First; Line<Last; Line++) {
-		       	v4s *VIn1 = (v4s *) (&In1[Line*W_In1 + 0]);
-		       	int S = Bias?(Bias[Line]<<NormBias):0;
-		       	for (i=0; i<(W_In1/(4*2)); i++) {
-				S = gap_sumdotp4(VIn1[2*i], VBuff[2*i], S);
-				S = gap_sumdotp4(VIn1[2*i+1], VBuff[2*i+1], S);
+	else	/* Scale == 0: same traversal as the branch above, but the input is passed to AT_SCALE without the multiply by Scale. */
+		for (int i=First; i<Last; i++) {
+			signed char * __restrict__ I1 = In1 + i;	/* Feature i starts at offset i; its successive elements are Feat bytes apart. */
+			int I2 = In2[i];
+			signed char * __restrict__ O  = Out + i;
+			for (int j=0; j<(S/2); j++) {	/* Two elements per iteration. */
+				int I10 = I1[(2*j)*Feat], I11 = I1[(2*j+1)*Feat];
+				int P1 = AT_SCALE(I10, I2, ScaleN); ACT_SWITCH(P1, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+				int P2 = AT_SCALE(I11, I2, ScaleN); ACT_SWITCH(P2, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+				O[(2*j)*Feat] = gap_clip(P1, 7); O[(2*j+1)*Feat] = gap_clip(P2, 7);
 			}
-			if (W_In1&0x4) S = gap_sumdotp4(VIn1[W_In1/4-1], VBuff[W_In1/4-1], S);
-		       	for (i=(W_In1/4)*4; i<W_In1; i++) S += In1[Line*W_In1 + i] * BufferColIn2[i];
-			unsigned int Sc = Scale[Line], ScN = ScaleN[Line];
-		       	Out[(Line+OffLine)*W_Out+Oo] = AT_CLIP_POS_IMM(AT_SCALE(S, Sc, ScN), 7);
-	       	}
-		int nF = F+Sx;
-		if (nF<Wi) {
-			F = nF; At += Sx; L -= Sx; Oo++;
-		} else {
-			int d = Wi-F+(Sy-1)*Wi;
-			F = 0; L -= d; At += d; Oo++;
+			int P1 = AT_SCALE(I1[(S-1)*Feat], I2, ScaleN); ACT_SWITCH(P1, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);	/* Tail for odd S. NOTE(review): executed unconditionally, so it repeats the last element when S is even and indexes -Feat when S == 0 -- confirm this is intended. */
+			O[(S-1)*Feat] = gap_clip(P1, 7);
 		}
-	       	gap_waitbarrier(0);
-	}
+	gap_waitbarrier(0);	/* Called once by every core after its feature range is done; presumably a barrier across the cores sharing this kernel -- TODO confirm. */
 }
 
-void KerParMatMulSxSyB32_ReLUN_SQ8(KerMatMul_SQ8_T *Arg)
-
-{
-/*
-	In1 is usually the Conv1x1 filter set, e,g In1 is [OutFeat][InFeat]
-	In2 is  [InFeat][Width*Height]
-
-	When we receive tiles In2 and if StrideY is != 1 tile is always [OutFeat][K*(Width*Scy)]
-*/
-	signed char * __restrict__ In1 = Arg->In1;
-	unsigned int W_In1 = Arg->W_In1;
-	unsigned int H_In1 = Arg->H_In1;
-	signed char * __restrict__ In2 = Arg->In2;
-	unsigned int W_In2 = Arg->W_In2;
-	int * __restrict__ Bias = Arg->Bias;
-	signed char * __restrict__ Out = Arg->Out;
-	unsigned int W_Out = Arg->W_Out;
-	int Pi = Arg->OutFirstCol;
-	signed char *BufferColIn2 = Arg->BufferColIn2;
-	unsigned int NormBias = Arg->NormBias;
-	int Wi = Arg->W, Hi = Arg->H;
-	int Sx = Arg->Sx, Sy = Arg->Sy;
-	int ColFirst = Arg->ColFirst;
-	unsigned char * __restrict__ Scale = Arg->Scale;
-	unsigned char * __restrict__ ScaleN = Arg->ScaleN;
-	int A0 = Arg->Infos[AT_INF_A0];
-
-	unsigned int H_In2 = W_In1;
-	unsigned int H_Out = H_In1;
-
-	int Wo  = (Wi+Sx-1)/Sx, Ho = (Hi+Sy-1)/Sy;
-	int Oo, OffLine;
-	int At, F=0, L = W_In2;
-
-	unsigned int Line, Col, i;
-	v4s *VBuff = (v4s *) BufferColIn2;
-
-	unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(H_In1), First = CoreId*ChunkCell, Last  = Min(H_In1, First+ChunkCell);
-	unsigned int Ci = ChunkSize(H_In2), Fi = CoreId*Ci, Li  = Min(H_In2, Fi+Ci);
-
-	At = 0; OffLine = 0; Oo = 0;
-	if (ColFirst) OffLine = Pi; else Oo = Pi;
-
-	while (L>0) {
-	       	for (i=Fi;i<Li; i++) BufferColIn2[i] = In2[i*W_In2+At];
-	       	gap_waitbarrier(0);
-	       	for (Line=First; Line<Last; Line++) {
-		       	v4s *VIn1 = (v4s *) (&In1[Line*W_In1 + 0]);
-		       	int S = Bias?(Bias[Line]<<NormBias):0;
-		       	for (i=0; i<(W_In1/(4*2)); i++) {
-				S = gap_sumdotp4(VIn1[2*i], VBuff[2*i], S);
-				S = gap_sumdotp4(VIn1[2*i+1], VBuff[2*i+1], S);
-			}
-			if (W_In1&0x4) S = gap_sumdotp4(VIn1[W_In1/4-1], VBuff[W_In1/4-1], S);
-		       	for (i=(W_In1/4)*4; i<W_In1; i++) S += In1[Line*W_In1 + i] * BufferColIn2[i];
-			unsigned int Sc = Scale[Line], ScN = ScaleN[Line];
-		       	Out[(Line+OffLine)*W_Out+Oo] = gap_clip(AT_CLIP_POS(AT_SCALE(S, Sc, ScN), A0), 7);
-	       	}
-		int nF = F+Sx;
-		if (nF<Wi) {
-			F = nF; At += Sx; L -= Sx; Oo++;
-		} else {
-			int d = Wi-F+(Sy-1)*Wi;
-			F = 0; L -= d; At += d; Oo++;
-		}
-	       	gap_waitbarrier(0);
-	}
+void KerParMatVectMul_HWC_SQ8(KerMat3_SQ8_T *Arg) {
+	KerParMatVectMul_HWC_SQ8_act(Arg, ACT_NONE);	/* Entry point: runs the always-inline KerParMatVectMul_HWC_SQ8_act body with ACT_NONE selected. */
 }
 
-/*************************************************************************************************************************************************
-	Matrix mult with channel centric scaling for small first matrix in the product, goal is to improve parallelism in this specific situation
-	Followed by optionnal activation; ReLU and ReLUN. Other activations are implemented using stand alone activation kernels.
-
-	Used to implement 1x1 convolution with unit stride
-   	In1 fits completly in shared L1, convolution weights
-	In2 has been transposed before being used, convolution Features
-	Parallelization scheme partition In2 along H_In2
-*************************************************************************************************************************************************/
+void KerParMatVectMul_ReLU_HWC_SQ8(KerMat3_SQ8_T *Arg) {
+	KerParMatVectMul_HWC_SQ8_act(Arg, ACT_RELU);	/* Entry point: runs the always-inline KerParMatVectMul_HWC_SQ8_act body with ACT_RELU selected. */
+}
 
-/* 	Byte Bias */
-void KerParMatMulB8_SF_SQ8(KerMatMul_SQ8_T *Arg)
+void KerParMatVectMul_ReLUN_HWC_SQ8(KerMat3_SQ8_T *Arg) {
+	KerParMatVectMul_HWC_SQ8_act(Arg, ACT_RELUN);	/* Entry point: runs the always-inline KerParMatVectMul_HWC_SQ8_act body with ACT_RELUN selected. */
+}
 
-{
-	signed char * __restrict__ In1 = Arg->In1;
-	unsigned int W_In1 = Arg->W_In1;
-	unsigned int H_In1 = Arg->H_In1;
-	signed char * __restrict__ In2 = Arg->In2;
-	unsigned int H_In2 = Arg->W_In2;
-	unsigned int W_In2 = W_In1;
-	signed char * __restrict__ Bias = Arg->Bias;
-	signed char * __restrict__ Out = Arg->Out;
-	unsigned char * __restrict__ Scale = Arg->Scale;
-	unsigned char * __restrict__ ScaleN = Arg->ScaleN;
-	unsigned int NormBias = Arg->NormBias;
-        unsigned int CoreId = gap_coreid(), Chunk = ChunkSize(H_In2), First = Chunk*CoreId, Last = Min(First+Chunk, H_In2);
-        unsigned int Iter = (Last>First)?(Last-First):0;
+void KerParMatVectMul_ReLUM_HWC_SQ8(KerMat3_SQ8_T *Arg) {
+	KerParMatVectMul_HWC_SQ8_act(Arg, ACT_RELUM);	/* Entry point: runs the always-inline KerParMatVectMul_HWC_SQ8_act body with ACT_RELUM selected. */
+}
 
-	for (int i=0; i<Iter/4; i++) {
-		int l2 = 4*i+First;
-		v4s *pIn2_0 = (v4s *) (In2 + (l2+0)*W_In2), *pIn2_1 = (v4s *) (In2 + (l2+1)*W_In2), *pIn2_2 = (v4s *) (In2 + (l2+2)*W_In2), *pIn2_3 = (v4s *) (In2 + (l2+3)*W_In2);
-		for (int l1=0; l1<H_In1; l1++) {
-			v4s *pIn1 = (v4s *) (In1 + l1*W_In1);
-			int S0 = Bias[l1]<<NormBias, S1=S0, S2=S0, S3=S0;
-			for (int c=0; c<W_In1/4; c++) {
-				v4s C0 = pIn1[c], V0 = pIn2_0[c], V1 = pIn2_1[c], V2 = pIn2_2[c], V3 = pIn2_3[c];
-				S0 = gap_sumdotp4(C0, V0, S0); S1 = gap_sumdotp4(C0, V1, S1); S2 = gap_sumdotp4(C0, V2, S2); S3 = gap_sumdotp4(C0, V3, S3);
-			}
-			for (int c=(W_In1/4)*4; c<W_In1; c++) {
-				int C0 = In1[l1*W_In1+c];
-				S0 += C0 * In2[(l2+0)*W_In2+c]; S1 += C0 * In2[(l2+1)*W_In2+c]; S2 += C0 * In2[(l2+2)*W_In2+c]; S3 += C0 * In2[(l2+3)*W_In2+c];
-			}
-			unsigned int Sc = Scale[l1], ScN = ScaleN[l1];
-			v4s R = gap_pack4(gap_clip(AT_SCALE(S0, Sc, ScN), 7), gap_clip(AT_SCALE(S1, Sc, ScN), 7), gap_clip(AT_SCALE(S2, Sc, ScN), 7), gap_clip(AT_SCALE(S3, Sc, ScN), 7));
-			*((v4s *) (Out+l1*H_In2 + l2)) = R;
-		}
-	}
-	if (Iter&0x2) {
-		int l2 = (4*(Iter/4)) + First;
-		v4s *pIn2_0 = (v4s *) (In2 + (l2+0)*W_In2);
-		v4s *pIn2_1 = (v4s *) (In2 + (l2+1)*W_In2);
-		for (int l1=0; l1<H_In1; l1++) {
-			v4s *pIn1 = (v4s *) (In1 + l1*W_In1);
-			int S0 = Bias[l1]<<NormBias, S1=S0;
-			for (int c=0; c<W_In1/4; c++) {
-				v4s C0 = pIn1[c], V0 = pIn2_0[c], V1 = pIn2_1[c];
-				S0 = gap_sumdotp4(C0, V0, S0); S1 = gap_sumdotp4(C0, V1, S1);
-			}
-			for (int c=(W_In1/4)*4; c<W_In1; c++) {
-				int C0 = In1[l1*W_In1+c];
-				S0 += C0 * In2[(l2+0)*W_In2+c]; S1 += C0 * In2[(l2+1)*W_In2+c];
-			}
-			unsigned int Sc = Scale[l1], ScN = ScaleN[l1];
-			Out[l1*H_In2 + l2+0] = gap_clip(AT_SCALE(S0, Sc, ScN), 7);
-			Out[l1*H_In2 + l2+1] = gap_clip(AT_SCALE(S1, Sc, ScN), 7);
-		}
-	}
-	if (Iter&0x1) {
-		int l2 = Last-1;
-		v4s *pIn2 = (v4s *) (In2 + (l2+0)*W_In2);
-		for (int l1=0; l1<H_In1; l1++) {
-			v4s *pIn1 = (v4s *) (In1 + l1*W_In1);
-			int S0 = Bias[l1]<<NormBias;
-			for (int c=0; c<W_In1/(4*2); c++) {
-				v4s C0 = pIn1[2*c], C1 = pIn1[2*c+1], V0 = pIn2[2*c], V1 = pIn2[2*c+1];
-				S0 = gap_sumdotp4(C0, V0, S0); S0 = gap_sumdotp4(C1, V1, S0);
-			}
-			if (W_In1&0x4) S0 = gap_sumdotp4(pIn1[W_In1/4-1], pIn2[W_In1/4-1], S0);
-			for (int c=(W_In1/4)*4; c<W_In1; c++) {
-				int C0 = In1[l1*W_In1+c];
-				S0 += C0 * In2[(l2+0)*W_In2+c];
-			}
-			unsigned int Sc = Scale[l1], ScN = ScaleN[l1];
-			Out[l1*H_In2 + l2+0] = gap_clip(AT_SCALE(S0, Sc, ScN), 7);
-		}
-	}
-	gap_waitbarrier(0);
-}
-
-void KerParMatMulB8_ReLU_SF_SQ8(KerMatMul_SQ8_T *Arg)
-
-{
-	signed char * __restrict__ In1 = Arg->In1;
-	unsigned int W_In1 = Arg->W_In1;
-	unsigned int H_In1 = Arg->H_In1;
-	signed char * __restrict__ In2 = Arg->In2;
-	unsigned int H_In2 = Arg->W_In2;
-	unsigned int W_In2 = W_In1;
-	signed char * __restrict__ Bias = Arg->Bias;
-	signed char * __restrict__ Out = Arg->Out;
-	unsigned char * __restrict__ Scale = Arg->Scale;
-	unsigned char * __restrict__ ScaleN = Arg->ScaleN;
-	unsigned int NormBias = Arg->NormBias;
-        unsigned int CoreId = gap_coreid(), Chunk = ChunkSize(H_In2), First = Chunk*CoreId, Last = Min(First+Chunk, H_In2);
-        unsigned int Iter = (Last>First)?(Last-First):0;
-
-	for (int i=0; i<Iter/4; i++) {
-		int l2 = 4*i+First;
-		v4s *pIn2_0 = (v4s *) (In2 + (l2+0)*W_In2), *pIn2_1 = (v4s *) (In2 + (l2+1)*W_In2), *pIn2_2 = (v4s *) (In2 + (l2+2)*W_In2), *pIn2_3 = (v4s *) (In2 + (l2+3)*W_In2);
-		for (int l1=0; l1<H_In1; l1++) {
-			v4s *pIn1 = (v4s *) (In1 + l1*W_In1);
-			int S0 = Bias[l1]<<NormBias, S1=S0, S2=S0, S3=S0;
-			for (int c=0; c<W_In1/4; c++) {
-				v4s C0 = pIn1[c], V0 = pIn2_0[c], V1 = pIn2_1[c], V2 = pIn2_2[c], V3 = pIn2_3[c];
-				S0 = gap_sumdotp4(C0, V0, S0); S1 = gap_sumdotp4(C0, V1, S1); S2 = gap_sumdotp4(C0, V2, S2); S3 = gap_sumdotp4(C0, V3, S3);
-			}
-			for (int c=(W_In1/4)*4; c<W_In1; c++) {
-				int C0 = In1[l1*W_In1+c];
-				S0 += C0 * In2[(l2+0)*W_In2+c]; S1 += C0 * In2[(l2+1)*W_In2+c]; S2 += C0 * In2[(l2+2)*W_In2+c]; S3 += C0 * In2[(l2+3)*W_In2+c];
-			}
-			unsigned int Sc = Scale[l1], ScN = ScaleN[l1];
-			v4s R = gap_pack4(AT_CLIP_POS_IMM(AT_SCALE(S0, Sc, ScN), 7), AT_CLIP_POS_IMM(AT_SCALE(S1, Sc, ScN), 7),
-					  AT_CLIP_POS_IMM(AT_SCALE(S2, Sc, ScN), 7), AT_CLIP_POS_IMM(AT_SCALE(S3, Sc, ScN), 7));
-			*((v4s *) (Out+l1*H_In2 + l2)) = R;
-		}
-	}
-	if (Iter&0x2) {
-		int l2 = (4*(Iter/4)) + First;
-		v4s *pIn2_0 = (v4s *) (In2 + (l2+0)*W_In2);
-		v4s *pIn2_1 = (v4s *) (In2 + (l2+1)*W_In2);
-		for (int l1=0; l1<H_In1; l1++) {
-			v4s *pIn1 = (v4s *) (In1 + l1*W_In1);
-			int S0 = Bias[l1]<<NormBias, S1=S0;
-			for (int c=0; c<W_In1/4; c++) {
-				v4s C0 = pIn1[c], V0 = pIn2_0[c], V1 = pIn2_1[c];
-				S0 = gap_sumdotp4(C0, V0, S0); S1 = gap_sumdotp4(C0, V1, S1);
-			}
-			for (int c=(W_In1/4)*4; c<W_In1; c++) {
-				int C0 = In1[l1*W_In1+c];
-				S0 += C0 * In2[(l2+0)*W_In2+c]; S1 += C0 * In2[(l2+1)*W_In2+c];
-			}
-			unsigned int Sc = Scale[l1], ScN = ScaleN[l1];
-			Out[l1*H_In2 + l2+0] = AT_CLIP_POS_IMM(AT_SCALE(S0, Sc, ScN), 7);
-			Out[l1*H_In2 + l2+1] = AT_CLIP_POS_IMM(AT_SCALE(S1, Sc, ScN), 7);
-		}
-	}
-	if (Iter&0x1) {
-		int l2 = Last-1;
-		v4s *pIn2 = (v4s *) (In2 + (l2+0)*W_In2);
-		for (int l1=0; l1<H_In1; l1++) {
-			v4s *pIn1 = (v4s *) (In1 + l1*W_In1);
-			int S0 = Bias[l1]<<NormBias;
-			for (int c=0; c<W_In1/(4*2); c++) {
-				v4s C0 = pIn1[2*c], C1 = pIn1[2*c+1], V0 = pIn2[2*c], V1 = pIn2[2*c+1];
-				S0 = gap_sumdotp4(C0, V0, S0); S0 = gap_sumdotp4(C1, V1, S0);
-			}
-			if (W_In1&0x4) S0 = gap_sumdotp4(pIn1[W_In1/4-1], pIn2[W_In1/4-1], S0);
-			for (int c=(W_In1/4)*4; c<W_In1; c++) {
-				int C0 = In1[l1*W_In1+c];
-				S0 += C0 * In2[(l2+0)*W_In2+c];
-			}
-			unsigned int Sc = Scale[l1], ScN = ScaleN[l1];
-			Out[l1*H_In2 + l2+0] = AT_CLIP_POS_IMM(AT_SCALE(S0, Sc, ScN), 7);
-		}
-	}
-	gap_waitbarrier(0);
-}
-
-void KerParMatMulB8_ReLUN_SF_SQ8(KerMatMul_SQ8_T *Arg)
-
-{
-	signed char * __restrict__ In1 = Arg->In1;
-	unsigned int W_In1 = Arg->W_In1;
-	unsigned int H_In1 = Arg->H_In1;
-	signed char * __restrict__ In2 = Arg->In2;
-	unsigned int H_In2 = Arg->W_In2;
-	unsigned int W_In2 = W_In1;
-	signed char * __restrict__ Bias = Arg->Bias;
-	signed char * __restrict__ Out = Arg->Out;
-	unsigned char * __restrict__ Scale = Arg->Scale;
-	unsigned char * __restrict__ ScaleN = Arg->ScaleN;
-	unsigned int NormBias = Arg->NormBias;
-        unsigned int CoreId = gap_coreid(), Chunk = ChunkSize(H_In2), First = Chunk*CoreId, Last = Min(First+Chunk, H_In2);
-        unsigned int Iter = (Last>First)?(Last-First):0;
-	int A0 = Arg->Infos[AT_INF_A0];
-
-	for (int i=0; i<Iter/4; i++) {
-		int l2 = 4*i+First;
-		v4s *pIn2_0 = (v4s *) (In2 + (l2+0)*W_In2), *pIn2_1 = (v4s *) (In2 + (l2+1)*W_In2), *pIn2_2 = (v4s *) (In2 + (l2+2)*W_In2), *pIn2_3 = (v4s *) (In2 + (l2+3)*W_In2);
-		for (int l1=0; l1<H_In1; l1++) {
-			v4s *pIn1 = (v4s *) (In1 + l1*W_In1);
-			int S0 = Bias[l1]<<NormBias, S1=S0, S2=S0, S3=S0;
-			for (int c=0; c<W_In1/4; c++) {
-				v4s C0 = pIn1[c], V0 = pIn2_0[c], V1 = pIn2_1[c], V2 = pIn2_2[c], V3 = pIn2_3[c];
-				S0 = gap_sumdotp4(C0, V0, S0); S1 = gap_sumdotp4(C0, V1, S1); S2 = gap_sumdotp4(C0, V2, S2); S3 = gap_sumdotp4(C0, V3, S3);
-			}
-			for (int c=(W_In1/4)*4; c<W_In1; c++) {
-				int C0 = In1[l1*W_In1+c];
-				S0 += C0 * In2[(l2+0)*W_In2+c]; S1 += C0 * In2[(l2+1)*W_In2+c]; S2 += C0 * In2[(l2+2)*W_In2+c]; S3 += C0 * In2[(l2+3)*W_In2+c];
-			}
-			unsigned int Sc = Scale[l1], ScN = ScaleN[l1];
-			v4s R = gap_pack4(gap_clip(AT_CLIP_POS(AT_SCALE(S0, Sc, ScN), A0), 7), gap_clip(AT_CLIP_POS(AT_SCALE(S1, Sc, ScN), A0), 7),
-					  gap_clip(AT_CLIP_POS(AT_SCALE(S2, Sc, ScN), A0), 7), gap_clip(AT_CLIP_POS(AT_SCALE(S3, Sc, ScN), A0), 7));
-			*((v4s *) (Out+l1*H_In2 + l2)) = R;
-		}
-	}
-	if (Iter&0x2) {
-		int l2 = (4*(Iter/4)) + First;
-		v4s *pIn2_0 = (v4s *) (In2 + (l2+0)*W_In2);
-		v4s *pIn2_1 = (v4s *) (In2 + (l2+1)*W_In2);
-		for (int l1=0; l1<H_In1; l1++) {
-			v4s *pIn1 = (v4s *) (In1 + l1*W_In1);
-			int S0 = Bias[l1]<<NormBias, S1=S0;
-			for (int c=0; c<W_In1/4; c++) {
-				v4s C0 = pIn1[c], V0 = pIn2_0[c], V1 = pIn2_1[c];
-				S0 = gap_sumdotp4(C0, V0, S0); S1 = gap_sumdotp4(C0, V1, S1);
-			}
-			for (int c=(W_In1/4)*4; c<W_In1; c++) {
-				int C0 = In1[l1*W_In1+c];
-				S0 += C0 * In2[(l2+0)*W_In2+c]; S1 += C0 * In2[(l2+1)*W_In2+c];
-			}
-			unsigned int Sc = Scale[l1], ScN = ScaleN[l1];
-			Out[l1*H_In2 + l2+0] = gap_clip(AT_CLIP_POS(AT_SCALE(S0, Sc, ScN), A0), 7);
-			Out[l1*H_In2 + l2+1] = gap_clip(AT_CLIP_POS(AT_SCALE(S1, Sc, ScN), A0), 7);
-		}
-	}
-	if (Iter&0x1) {
-		int l2 = Last-1;
-		v4s *pIn2 = (v4s *) (In2 + (l2+0)*W_In2);
-		for (int l1=0; l1<H_In1; l1++) {
-			v4s *pIn1 = (v4s *) (In1 + l1*W_In1);
-			int S0 = Bias[l1]<<NormBias;
-			for (int c=0; c<W_In1/(4*2); c++) {
-				v4s C0 = pIn1[2*c], C1 = pIn1[2*c+1], V0 = pIn2[2*c], V1 = pIn2[2*c+1];
-				S0 = gap_sumdotp4(C0, V0, S0); S0 = gap_sumdotp4(C1, V1, S0);
-			}
-			if (W_In1&0x4) S0 = gap_sumdotp4(pIn1[W_In1/4-1], pIn2[W_In1/4-1], S0);
-			for (int c=(W_In1/4)*4; c<W_In1; c++) {
-				int C0 = In1[l1*W_In1+c];
-				S0 += C0 * In2[(l2+0)*W_In2+c];
-			}
-			unsigned int Sc = Scale[l1], ScN = ScaleN[l1];
-			Out[l1*H_In2 + l2+0] = gap_clip(AT_CLIP_POS(AT_SCALE(S0, Sc, ScN), A0), 7);
-		}
-	}
-	gap_waitbarrier(0);
-}
-
-/* 	Half Word Bias */
-void KerParMatMulB16_SF_SQ8(KerMatMul_SQ8_T *Arg)
-
-{
-	signed char * __restrict__ In1 = Arg->In1;
-	unsigned int W_In1 = Arg->W_In1;
-	unsigned int H_In1 = Arg->H_In1;
-	signed char * __restrict__ In2 = Arg->In2;
-	unsigned int H_In2 = Arg->W_In2;
-	unsigned int W_In2 = W_In1;
-	short int * __restrict__ Bias = Arg->Bias;
-	signed char * __restrict__ Out = Arg->Out;
-	unsigned char * __restrict__ Scale = Arg->Scale;
-	unsigned char * __restrict__ ScaleN = Arg->ScaleN;
-	unsigned int NormBias = Arg->NormBias;
-        unsigned int CoreId = gap_coreid(), Chunk = ChunkSize(H_In2), First = Chunk*CoreId, Last = Min(First+Chunk, H_In2);
-        unsigned int Iter = (Last>First)?(Last-First):0;
-
-	for (int i=0; i<Iter/4; i++) {
-		int l2 = 4*i+First;
-		v4s *pIn2_0 = (v4s *) (In2 + (l2+0)*W_In2), *pIn2_1 = (v4s *) (In2 + (l2+1)*W_In2), *pIn2_2 = (v4s *) (In2 + (l2+2)*W_In2), *pIn2_3 = (v4s *) (In2 + (l2+3)*W_In2);
-		for (int l1=0; l1<H_In1; l1++) {
-			v4s *pIn1 = (v4s *) (In1 + l1*W_In1);
-			int S0 = Bias[l1]<<NormBias, S1=S0, S2=S0, S3=S0;
-			for (int c=0; c<W_In1/4; c++) {
-				v4s C0 = pIn1[c], V0 = pIn2_0[c], V1 = pIn2_1[c], V2 = pIn2_2[c], V3 = pIn2_3[c];
-				S0 = gap_sumdotp4(C0, V0, S0); S1 = gap_sumdotp4(C0, V1, S1); S2 = gap_sumdotp4(C0, V2, S2); S3 = gap_sumdotp4(C0, V3, S3);
-			}
-			for (int c=(W_In1/4)*4; c<W_In1; c++) {
-				int C0 = In1[l1*W_In1+c];
-				S0 += C0 * In2[(l2+0)*W_In2+c]; S1 += C0 * In2[(l2+1)*W_In2+c]; S2 += C0 * In2[(l2+2)*W_In2+c]; S3 += C0 * In2[(l2+3)*W_In2+c];
-			}
-			unsigned int Sc = Scale[l1], ScN = ScaleN[l1];
-			v4s R = gap_pack4(gap_clip(AT_SCALE(S0, Sc, ScN), 7), gap_clip(AT_SCALE(S1, Sc, ScN), 7), gap_clip(AT_SCALE(S2, Sc, ScN), 7), gap_clip(AT_SCALE(S3, Sc, ScN), 7));
-			*((v4s *) (Out+l1*H_In2 + l2)) = R;
-		}
-	}
-	if (Iter&0x2) {
-		int l2 = (4*(Iter/4)) + First;
-		v4s *pIn2_0 = (v4s *) (In2 + (l2+0)*W_In2);
-		v4s *pIn2_1 = (v4s *) (In2 + (l2+1)*W_In2);
-		for (int l1=0; l1<H_In1; l1++) {
-			v4s *pIn1 = (v4s *) (In1 + l1*W_In1);
-			int S0 = Bias[l1]<<NormBias, S1=S0;
-			for (int c=0; c<W_In1/4; c++) {
-				v4s C0 = pIn1[c], V0 = pIn2_0[c], V1 = pIn2_1[c];
-				S0 = gap_sumdotp4(C0, V0, S0); S1 = gap_sumdotp4(C0, V1, S1);
-			}
-			for (int c=(W_In1/4)*4; c<W_In1; c++) {
-				int C0 = In1[l1*W_In1+c];
-				S0 += C0 * In2[(l2+0)*W_In2+c]; S1 += C0 * In2[(l2+1)*W_In2+c];
-			}
-			unsigned int Sc = Scale[l1], ScN = ScaleN[l1];
-			Out[l1*H_In2 + l2+0] = gap_clip(AT_SCALE(S0, Sc, ScN), 7);
-			Out[l1*H_In2 + l2+1] = gap_clip(AT_SCALE(S1, Sc, ScN), 7);
-		}
-	}
-	if (Iter&0x1) {
-		int l2 = Last-1;
-		v4s *pIn2 = (v4s *) (In2 + (l2+0)*W_In2);
-		for (int l1=0; l1<H_In1; l1++) {
-			v4s *pIn1 = (v4s *) (In1 + l1*W_In1);
-			int S0 = Bias[l1]<<NormBias;
-			for (int c=0; c<W_In1/(4*2); c++) {
-				v4s C0 = pIn1[2*c], C1 = pIn1[2*c+1], V0 = pIn2[2*c], V1 = pIn2[2*c+1];
-				S0 = gap_sumdotp4(C0, V0, S0); S0 = gap_sumdotp4(C1, V1, S0);
-			}
-			if (W_In1&0x4) S0 = gap_sumdotp4(pIn1[W_In1/4-1], pIn2[W_In1/4-1], S0);
-			for (int c=(W_In1/4)*4; c<W_In1; c++) {
-				int C0 = In1[l1*W_In1+c];
-				S0 += C0 * In2[(l2+0)*W_In2+c];
-			}
-			unsigned int Sc = Scale[l1], ScN = ScaleN[l1];
-			Out[l1*H_In2 + l2+0] = gap_clip(AT_SCALE(S0, Sc, ScN), 7);
-		}
-	}
-	gap_waitbarrier(0);
-}
-
-void KerParMatMulB16_ReLU_SF_SQ8(KerMatMul_SQ8_T *Arg)
-
-{
-	signed char * __restrict__ In1 = Arg->In1;
-	unsigned int W_In1 = Arg->W_In1;
-	unsigned int H_In1 = Arg->H_In1;
-	signed char * __restrict__ In2 = Arg->In2;
-	unsigned int H_In2 = Arg->W_In2;
-	unsigned int W_In2 = W_In1;
-	short int * __restrict__ Bias = Arg->Bias;
-	signed char * __restrict__ Out = Arg->Out;
-	unsigned char * __restrict__ Scale = Arg->Scale;
-	unsigned char * __restrict__ ScaleN = Arg->ScaleN;
-	unsigned int NormBias = Arg->NormBias;
-        unsigned int CoreId = gap_coreid(), Chunk = ChunkSize(H_In2), First = Chunk*CoreId, Last = Min(First+Chunk, H_In2);
-        unsigned int Iter = (Last>First)?(Last-First):0;
-
-	for (int i=0; i<Iter/4; i++) {
-		int l2 = 4*i+First;
-		v4s *pIn2_0 = (v4s *) (In2 + (l2+0)*W_In2), *pIn2_1 = (v4s *) (In2 + (l2+1)*W_In2), *pIn2_2 = (v4s *) (In2 + (l2+2)*W_In2), *pIn2_3 = (v4s *) (In2 + (l2+3)*W_In2);
-		for (int l1=0; l1<H_In1; l1++) {
-			v4s *pIn1 = (v4s *) (In1 + l1*W_In1);
-			int S0 = Bias[l1]<<NormBias, S1=S0, S2=S0, S3=S0;
-			for (int c=0; c<W_In1/4; c++) {
-				v4s C0 = pIn1[c], V0 = pIn2_0[c], V1 = pIn2_1[c], V2 = pIn2_2[c], V3 = pIn2_3[c];
-				S0 = gap_sumdotp4(C0, V0, S0); S1 = gap_sumdotp4(C0, V1, S1); S2 = gap_sumdotp4(C0, V2, S2); S3 = gap_sumdotp4(C0, V3, S3);
-			}
-			for (int c=(W_In1/4)*4; c<W_In1; c++) {
-				int C0 = In1[l1*W_In1+c];
-				S0 += C0 * In2[(l2+0)*W_In2+c]; S1 += C0 * In2[(l2+1)*W_In2+c]; S2 += C0 * In2[(l2+2)*W_In2+c]; S3 += C0 * In2[(l2+3)*W_In2+c];
-			}
-			unsigned int Sc = Scale[l1], ScN = ScaleN[l1];
-			v4s R = gap_pack4(AT_CLIP_POS_IMM(AT_SCALE(S0, Sc, ScN), 7), AT_CLIP_POS_IMM(AT_SCALE(S1, Sc, ScN), 7),
-					  AT_CLIP_POS_IMM(AT_SCALE(S2, Sc, ScN), 7), AT_CLIP_POS_IMM(AT_SCALE(S3, Sc, ScN), 7));
-			*((v4s *) (Out+l1*H_In2 + l2)) = R;
-		}
-	}
-	if (Iter&0x2) {
-		int l2 = (4*(Iter/4)) + First;
-		v4s *pIn2_0 = (v4s *) (In2 + (l2+0)*W_In2);
-		v4s *pIn2_1 = (v4s *) (In2 + (l2+1)*W_In2);
-		for (int l1=0; l1<H_In1; l1++) {
-			v4s *pIn1 = (v4s *) (In1 + l1*W_In1);
-			int S0 = Bias[l1]<<NormBias, S1=S0;
-			for (int c=0; c<W_In1/4; c++) {
-				v4s C0 = pIn1[c], V0 = pIn2_0[c], V1 = pIn2_1[c];
-				S0 = gap_sumdotp4(C0, V0, S0); S1 = gap_sumdotp4(C0, V1, S1);
-			}
-			for (int c=(W_In1/4)*4; c<W_In1; c++) {
-				int C0 = In1[l1*W_In1+c];
-				S0 += C0 * In2[(l2+0)*W_In2+c]; S1 += C0 * In2[(l2+1)*W_In2+c];
-			}
-			unsigned int Sc = Scale[l1], ScN = ScaleN[l1];
-			Out[l1*H_In2 + l2+0] = AT_CLIP_POS_IMM(AT_SCALE(S0, Sc, ScN), 7);
-			Out[l1*H_In2 + l2+1] = AT_CLIP_POS_IMM(AT_SCALE(S1, Sc, ScN), 7);
-		}
-	}
-	if (Iter&0x1) {
-		int l2 = Last-1;
-		v4s *pIn2 = (v4s *) (In2 + (l2+0)*W_In2);
-		for (int l1=0; l1<H_In1; l1++) {
-			v4s *pIn1 = (v4s *) (In1 + l1*W_In1);
-			int S0 = Bias[l1]<<NormBias;
-			for (int c=0; c<W_In1/(4*2); c++) {
-				v4s C0 = pIn1[2*c], C1 = pIn1[2*c+1], V0 = pIn2[2*c], V1 = pIn2[2*c+1];
-				S0 = gap_sumdotp4(C0, V0, S0); S0 = gap_sumdotp4(C1, V1, S0);
-			}
-			if (W_In1&0x4) S0 = gap_sumdotp4(pIn1[W_In1/4-1], pIn2[W_In1/4-1], S0);
-			for (int c=(W_In1/4)*4; c<W_In1; c++) {
-				int C0 = In1[l1*W_In1+c];
-				S0 += C0 * In2[(l2+0)*W_In2+c];
-			}
-			unsigned int Sc = Scale[l1], ScN = ScaleN[l1];
-			Out[l1*H_In2 + l2+0] = AT_CLIP_POS_IMM(AT_SCALE(S0, Sc, ScN), 7);
-		}
-	}
-	gap_waitbarrier(0);
-}
-
-void KerParMatMulB16_ReLUN_SF_SQ8(KerMatMul_SQ8_T *Arg)
-
-{
-	signed char * __restrict__ In1 = Arg->In1;
-	unsigned int W_In1 = Arg->W_In1;
-	unsigned int H_In1 = Arg->H_In1;
-	signed char * __restrict__ In2 = Arg->In2;
-	unsigned int H_In2 = Arg->W_In2;
-	unsigned int W_In2 = W_In1;
-	short int * __restrict__ Bias = Arg->Bias;
-	signed char * __restrict__ Out = Arg->Out;
-	unsigned char * __restrict__ Scale = Arg->Scale;
-	unsigned char * __restrict__ ScaleN = Arg->ScaleN;
-	unsigned int NormBias = Arg->NormBias;
-        unsigned int CoreId = gap_coreid(), Chunk = ChunkSize(H_In2), First = Chunk*CoreId, Last = Min(First+Chunk, H_In2);
-	unsigned int Iter = (Last>First)?(Last-First):0;
-	int A0 = Arg->Infos[AT_INF_A0];
-
-	for (int i=0; i<Iter/4; i++) {
-		int l2 = 4*i+First;
-		v4s *pIn2_0 = (v4s *) (In2 + (l2+0)*W_In2), *pIn2_1 = (v4s *) (In2 + (l2+1)*W_In2), *pIn2_2 = (v4s *) (In2 + (l2+2)*W_In2), *pIn2_3 = (v4s *) (In2 + (l2+3)*W_In2);
-		for (int l1=0; l1<H_In1; l1++) {
-			v4s *pIn1 = (v4s *) (In1 + l1*W_In1);
-			int S0 = Bias[l1]<<NormBias, S1=S0, S2=S0, S3=S0;
-			for (int c=0; c<W_In1/4; c++) {
-				v4s C0 = pIn1[c], V0 = pIn2_0[c], V1 = pIn2_1[c], V2 = pIn2_2[c], V3 = pIn2_3[c];
-				S0 = gap_sumdotp4(C0, V0, S0); S1 = gap_sumdotp4(C0, V1, S1); S2 = gap_sumdotp4(C0, V2, S2); S3 = gap_sumdotp4(C0, V3, S3);
-			}
-			for (int c=(W_In1/4)*4; c<W_In1; c++) {
-				int C0 = In1[l1*W_In1+c];
-				S0 += C0 * In2[(l2+0)*W_In2+c]; S1 += C0 * In2[(l2+1)*W_In2+c]; S2 += C0 * In2[(l2+2)*W_In2+c]; S3 += C0 * In2[(l2+3)*W_In2+c];
-			}
-			unsigned int Sc = Scale[l1], ScN = ScaleN[l1];
-			v4s R = gap_pack4(gap_clip(AT_CLIP_POS(AT_SCALE(S0, Sc, ScN), A0), 7), gap_clip(AT_CLIP_POS(AT_SCALE(S1, Sc, ScN), A0), 7),
-					  gap_clip(AT_CLIP_POS(AT_SCALE(S2, Sc, ScN), A0), 7), gap_clip(AT_CLIP_POS(AT_SCALE(S3, Sc, ScN), A0), 7));
-			*((v4s *) (Out+l1*H_In2 + l2)) = R;
-		}
-	}
-	if (Iter&0x2) {
-		int l2 = (4*(Iter/4)) + First;
-		v4s *pIn2_0 = (v4s *) (In2 + (l2+0)*W_In2);
-		v4s *pIn2_1 = (v4s *) (In2 + (l2+1)*W_In2);
-		for (int l1=0; l1<H_In1; l1++) {
-			v4s *pIn1 = (v4s *) (In1 + l1*W_In1);
-			int S0 = Bias[l1]<<NormBias, S1=S0;
-			for (int c=0; c<W_In1/4; c++) {
-				v4s C0 = pIn1[c], V0 = pIn2_0[c], V1 = pIn2_1[c];
-				S0 = gap_sumdotp4(C0, V0, S0); S1 = gap_sumdotp4(C0, V1, S1);
-			}
-			for (int c=(W_In1/4)*4; c<W_In1; c++) {
-				int C0 = In1[l1*W_In1+c];
-				S0 += C0 * In2[(l2+0)*W_In2+c]; S1 += C0 * In2[(l2+1)*W_In2+c];
-			}
-			unsigned int Sc = Scale[l1], ScN = ScaleN[l1];
-			Out[l1*H_In2 + l2+0] = gap_clip(AT_CLIP_POS(AT_SCALE(S0, Sc, ScN), A0), 7);
-			Out[l1*H_In2 + l2+1] = gap_clip(AT_CLIP_POS(AT_SCALE(S1, Sc, ScN), A0), 7);
-		}
-	}
-	if (Iter&0x1) {
-		int l2 = Last-1;
-		v4s *pIn2 = (v4s *) (In2 + (l2+0)*W_In2);
-		for (int l1=0; l1<H_In1; l1++) {
-			v4s *pIn1 = (v4s *) (In1 + l1*W_In1);
-			int S0 = Bias[l1]<<NormBias;
-			for (int c=0; c<W_In1/(4*2); c++) {
-				v4s C0 = pIn1[2*c], C1 = pIn1[2*c+1], V0 = pIn2[2*c], V1 = pIn2[2*c+1];
-				S0 = gap_sumdotp4(C0, V0, S0); S0 = gap_sumdotp4(C1, V1, S0);
-			}
-			if (W_In1&0x4) S0 = gap_sumdotp4(pIn1[W_In1/4-1], pIn2[W_In1/4-1], S0);
-			for (int c=(W_In1/4)*4; c<W_In1; c++) {
-				int C0 = In1[l1*W_In1+c];
-				S0 += C0 * In2[(l2+0)*W_In2+c];
-			}
-			unsigned int Sc = Scale[l1], ScN = ScaleN[l1];
-			Out[l1*H_In2 + l2+0] = gap_clip(AT_CLIP_POS(AT_SCALE(S0, Sc, ScN), A0), 7);
-		}
-	}
-	gap_waitbarrier(0);
-}
-
-/* 	Word Bias */
-void KerParMatMulB32_SF_SQ8(KerMatMul_SQ8_T *Arg)
-
-{
-	signed char * __restrict__ In1 = Arg->In1;
-	unsigned int W_In1 = Arg->W_In1;
-	unsigned int H_In1 = Arg->H_In1;
-	signed char * __restrict__ In2 = Arg->In2;
-	unsigned int H_In2 = Arg->W_In2;
-	unsigned int W_In2 = W_In1;
-	int * __restrict__ Bias = Arg->Bias;
-	signed char * __restrict__ Out = Arg->Out;
-	unsigned char * __restrict__ Scale = Arg->Scale;
-	unsigned char * __restrict__ ScaleN = Arg->ScaleN;
-	unsigned int NormBias = Arg->NormBias;
-        unsigned int CoreId = gap_coreid(), Chunk = ChunkSize(H_In2), First = Chunk*CoreId, Last = Min(First+Chunk, H_In2);
-	unsigned int Iter = (Last>First)?(Last-First):0;
-
-	for (int i=0; i<Iter/4; i++) {
-		int l2 = 4*i+First;
-		v4s *pIn2_0 = (v4s *) (In2 + (l2+0)*W_In2), *pIn2_1 = (v4s *) (In2 + (l2+1)*W_In2), *pIn2_2 = (v4s *) (In2 + (l2+2)*W_In2), *pIn2_3 = (v4s *) (In2 + (l2+3)*W_In2);
-		for (int j=0; j<H_In1/2; j++) {
-			int l1 = 2*j;
-			v4s *pIn1_0 = (v4s *) (In1 + l1*W_In1);
-			v4s *pIn1_1 = (v4s *) (In1 + (l1+1)*W_In1);
-			int S0 = Bias?(Bias[l1]<<NormBias):0, S1=S0, S2=S0, S3=S0;
-			int S4 = Bias?(Bias[l1+1]<<NormBias):0, S5=S4, S6=S4, S7=S4;
-			for (int c=0; c<W_In1/4; c++) {
-				v4s C0 = pIn1_0[c], V0 = pIn2_0[c], V1 = pIn2_1[c], V2 = pIn2_2[c], V3 = pIn2_3[c];
-				S0 = gap_sumdotp4(C0, V0, S0); S1 = gap_sumdotp4(C0, V1, S1); S2 = gap_sumdotp4(C0, V2, S2); S3 = gap_sumdotp4(C0, V3, S3);
-				v4s C1 = pIn1_1[c];
-				S4 = gap_sumdotp4(C1, V0, S4); S5 = gap_sumdotp4(C1, V1, S5); S6 = gap_sumdotp4(C1, V2, S6); S7 = gap_sumdotp4(C1, V3, S7);
-			}
-			for (int c=(W_In1/4)*4; c<W_In1; c++) {
-				int C0 = In1[l1*W_In1+c];
-				S0 += C0 * In2[(l2+0)*W_In2+c]; S1 += C0 * In2[(l2+1)*W_In2+c]; S2 += C0 * In2[(l2+2)*W_In2+c]; S3 += C0 * In2[(l2+3)*W_In2+c];
-				int C1 = In1[(l1+1)*W_In1+c];
-				S4 += C1 * In2[(l2+0)*W_In2+c]; S5 += C1 * In2[(l2+1)*W_In2+c]; S6 += C1 * In2[(l2+2)*W_In2+c]; S7 += C1 * In2[(l2+3)*W_In2+c];
-			}
-			unsigned int Sc = Scale[l1], ScN = ScaleN[l1];
-			v4s R = gap_pack4(gap_clip(AT_SCALE(S0, Sc, ScN), 7), gap_clip(AT_SCALE(S1, Sc, ScN), 7),
-					  gap_clip(AT_SCALE(S2, Sc, ScN), 7), gap_clip(AT_SCALE(S3, Sc, ScN), 7));
-			*((v4s *) (Out+l1*H_In2 + l2)) = R;
-			unsigned int Sc1 = Scale[l1+1], ScN1 = ScaleN[l1+1];
-			v4s R1 = gap_pack4(gap_clip(AT_SCALE(S4, Sc1, ScN1), 7), gap_clip(AT_SCALE(S5, Sc1, ScN1), 7),
-					   gap_clip(AT_SCALE(S6, Sc1, ScN1), 7), gap_clip(AT_SCALE(S7, Sc1, ScN1), 7));
-			*((v4s *) (Out+(l1+1)*H_In2 + l2)) = R1;
-		}
-		if (H_In1&0x1) {
-			int l1 = H_In1 - 1;
-			v4s *pIn1_0 = (v4s *) (In1 + l1*W_In1);
-			int S0 = Bias?(Bias[l1]<<NormBias):0, S1=S0, S2=S0, S3=S0;
-			for (int c=0; c<W_In1/4; c++) {
-				v4s C0 = pIn1_0[c], V0 = pIn2_0[c], V1 = pIn2_1[c], V2 = pIn2_2[c], V3 = pIn2_3[c];
-				S0 = gap_sumdotp4(C0, V0, S0); S1 = gap_sumdotp4(C0, V1, S1); S2 = gap_sumdotp4(C0, V2, S2); S3 = gap_sumdotp4(C0, V3, S3);
-			}
-			for (int c=(W_In1/4)*4; c<W_In1; c++) {
-				int C0 = In1[l1*W_In1+c];
-				S0 += C0 * In2[(l2+0)*W_In2+c]; S1 += C0 * In2[(l2+1)*W_In2+c]; S2 += C0 * In2[(l2+2)*W_In2+c]; S3 += C0 * In2[(l2+3)*W_In2+c];
-			}
-			unsigned int Sc = Scale[l1], ScN = ScaleN[l1];
-			v4s R = gap_pack4(gap_clip(AT_SCALE(S0, Sc, ScN), 7), gap_clip(AT_SCALE(S1, Sc, ScN), 7), gap_clip(AT_SCALE(S2, Sc, ScN), 7), gap_clip(AT_SCALE(S3, Sc, ScN), 7));
-			*((v4s *) (Out+l1*H_In2 + l2)) = R;
-		}
-	}
-	if (Iter&0x2) {
-		int l2 = (4*(Iter/4)) + First;
-		v4s *pIn2_0 = (v4s *) (In2 + (l2+0)*W_In2);
-		v4s *pIn2_1 = (v4s *) (In2 + (l2+1)*W_In2);
-		for (int l1=0; l1<H_In1; l1++) {
-			v4s *pIn1 = (v4s *) (In1 + l1*W_In1);
-			int S0 = Bias?(Bias[l1]<<NormBias):0, S1=S0;
-			for (int c=0; c<W_In1/4; c++) {
-				v4s C0 = pIn1[c], V0 = pIn2_0[c], V1 = pIn2_1[c];
-				S0 = gap_sumdotp4(C0, V0, S0); S1 = gap_sumdotp4(C0, V1, S1);
-			}
-			for (int c=(W_In1/4)*4; c<W_In1; c++) {
-				int C0 = In1[l1*W_In1+c];
-				S0 += C0 * In2[(l2+0)*W_In2+c]; S1 += C0 * In2[(l2+1)*W_In2+c];
-			}
-			unsigned int Sc = Scale[l1], ScN = ScaleN[l1];
-			Out[l1*H_In2 + l2+0] = gap_clip(AT_SCALE(S0, Sc, ScN), 7);
-			Out[l1*H_In2 + l2+1] = gap_clip(AT_SCALE(S1, Sc, ScN), 7);
-		}
-	}
-	if (Iter&0x1) {
-		int l2 = Last-1;
-		v4s *pIn2 = (v4s *) (In2 + (l2+0)*W_In2);
-		for (int l1=0; l1<H_In1; l1++) {
-			v4s *pIn1 = (v4s *) (In1 + l1*W_In1);
-			int S0 = Bias?(Bias[l1]<<NormBias):0;
-			for (int c=0; c<W_In1/(4*2); c++) {
-				v4s C0 = pIn1[2*c], C1 = pIn1[2*c+1], V0 = pIn2[2*c], V1 = pIn2[2*c+1];
-				S0 = gap_sumdotp4(C0, V0, S0); S0 = gap_sumdotp4(C1, V1, S0);
-			}
-			if (W_In1&0x4) S0 = gap_sumdotp4(pIn1[W_In1/4-1], pIn2[W_In1/4-1], S0);
-			for (int c=(W_In1/4)*4; c<W_In1; c++) {
-				int C0 = In1[l1*W_In1+c];
-				S0 += C0 * In2[(l2+0)*W_In2+c];
-			}
-			unsigned int Sc = Scale[l1], ScN = ScaleN[l1];
-			Out[l1*H_In2 + l2+0] = gap_clip(AT_SCALE(S0, Sc, ScN), 7);
-		}
-	}
-	gap_waitbarrier(0);
-}
-
-void KerParMatMulB32_ReLU_SF_SQ8(KerMatMul_SQ8_T *Arg)
-
-{
-	signed char * __restrict__ In1 = Arg->In1;
-	unsigned int W_In1 = Arg->W_In1;
-	unsigned int H_In1 = Arg->H_In1;
-	signed char * __restrict__ In2 = Arg->In2;
-	unsigned int H_In2 = Arg->W_In2;
-	unsigned int W_In2 = W_In1;
-	int * __restrict__ Bias = Arg->Bias;
-	signed char * __restrict__ Out = Arg->Out;
-	unsigned char * __restrict__ Scale = Arg->Scale;
-	unsigned char * __restrict__ ScaleN = Arg->ScaleN;
-	unsigned int NormBias = Arg->NormBias;
-        unsigned int CoreId = gap_coreid(), Chunk = ChunkSize(H_In2), First = Chunk*CoreId, Last = Min(First+Chunk, H_In2);
-	unsigned int Iter = (Last>First)?(Last-First):0;
-
-	for (int i=0; i<Iter/4; i++) {
-		int l2 = 4*i+First;
-		v4s *pIn2_0 = (v4s *) (In2 + (l2+0)*W_In2), *pIn2_1 = (v4s *) (In2 + (l2+1)*W_In2), *pIn2_2 = (v4s *) (In2 + (l2+2)*W_In2), *pIn2_3 = (v4s *) (In2 + (l2+3)*W_In2);
-		for (int l1=0; l1<H_In1; l1++) {
-			v4s *pIn1 = (v4s *) (In1 + l1*W_In1);
-			int S0 = Bias?(Bias[l1]<<NormBias):0, S1=S0, S2=S0, S3=S0;
-			for (int c=0; c<W_In1/4; c++) {
-				v4s C0 = pIn1[c], V0 = pIn2_0[c], V1 = pIn2_1[c], V2 = pIn2_2[c], V3 = pIn2_3[c];
-				S0 = gap_sumdotp4(C0, V0, S0); S1 = gap_sumdotp4(C0, V1, S1); S2 = gap_sumdotp4(C0, V2, S2); S3 = gap_sumdotp4(C0, V3, S3);
-			}
-			for (int c=(W_In1/4)*4; c<W_In1; c++) {
-				int C0 = In1[l1*W_In1+c];
-				S0 += C0 * In2[(l2+0)*W_In2+c]; S1 += C0 * In2[(l2+1)*W_In2+c]; S2 += C0 * In2[(l2+2)*W_In2+c]; S3 += C0 * In2[(l2+3)*W_In2+c];
-			}
-			unsigned int Sc = Scale[l1], ScN = ScaleN[l1];
-			v4s R = gap_pack4(AT_CLIP_POS_IMM(AT_SCALE(S0, Sc, ScN), 7), AT_CLIP_POS_IMM(AT_SCALE(S1, Sc, ScN), 7),
-					  AT_CLIP_POS_IMM(AT_SCALE(S2, Sc, ScN), 7), AT_CLIP_POS_IMM(AT_SCALE(S3, Sc, ScN), 7));
-			*((v4s *) (Out+l1*H_In2 + l2)) = R;
-		}
-	}
-	if (Iter&0x2) {
-		int l2 = (4*(Iter/4)) + First;
-		v4s *pIn2_0 = (v4s *) (In2 + (l2+0)*W_In2);
-		v4s *pIn2_1 = (v4s *) (In2 + (l2+1)*W_In2);
-		for (int l1=0; l1<H_In1; l1++) {
-			v4s *pIn1 = (v4s *) (In1 + l1*W_In1);
-			int S0 = Bias?(Bias[l1]<<NormBias):0, S1=S0;
-			for (int c=0; c<W_In1/4; c++) {
-				v4s C0 = pIn1[c], V0 = pIn2_0[c], V1 = pIn2_1[c];
-				S0 = gap_sumdotp4(C0, V0, S0); S1 = gap_sumdotp4(C0, V1, S1);
-			}
-			for (int c=(W_In1/4)*4; c<W_In1; c++) {
-				int C0 = In1[l1*W_In1+c];
-				S0 += C0 * In2[(l2+0)*W_In2+c]; S1 += C0 * In2[(l2+1)*W_In2+c];
-			}
-			unsigned int Sc = Scale[l1], ScN = ScaleN[l1];
-			Out[l1*H_In2 + l2+0] = AT_CLIP_POS_IMM(AT_SCALE(S0, Sc, ScN), 7);
-			Out[l1*H_In2 + l2+1] = AT_CLIP_POS_IMM(AT_SCALE(S1, Sc, ScN), 7);
-		}
-	}
-	if (Iter&0x1) {
-		int l2 = Last-1;
-		v4s *pIn2 = (v4s *) (In2 + (l2+0)*W_In2);
-		for (int l1=0; l1<H_In1; l1++) {
-			v4s *pIn1 = (v4s *) (In1 + l1*W_In1);
-			int S0 = Bias?(Bias[l1]<<NormBias):0;
-			for (int c=0; c<W_In1/(4*2); c++) {
-				v4s C0 = pIn1[2*c], C1 = pIn1[2*c+1], V0 = pIn2[2*c], V1 = pIn2[2*c+1];
-				S0 = gap_sumdotp4(C0, V0, S0); S0 = gap_sumdotp4(C1, V1, S0);
-			}
-			if (W_In1&0x4) S0 = gap_sumdotp4(pIn1[W_In1/4-1], pIn2[W_In1/4-1], S0);
-			for (int c=(W_In1/4)*4; c<W_In1; c++) {
-				int C0 = In1[l1*W_In1+c];
-				S0 += C0 * In2[(l2+0)*W_In2+c];
-			}
-			unsigned int Sc = Scale[l1], ScN = ScaleN[l1];
-			Out[l1*H_In2 + l2+0] = AT_CLIP_POS_IMM(AT_SCALE(S0, Sc, ScN), 7);
-		}
-	}
-	gap_waitbarrier(0);
-}
-
-void KerParMatMulB32_2x4_ReLU_SF_SQ8(KerMatMul_SQ8_T *Arg)
-
-{
-	signed char * __restrict__ In1 = Arg->In1;
-	unsigned int W_In1 = Arg->W_In1;
-	unsigned int H_In1 = Arg->H_In1;
-	signed char * __restrict__ In2 = Arg->In2;
-	unsigned int H_In2 = Arg->W_In2;
-	unsigned int W_In2 = W_In1;
-	int * __restrict__ Bias = Arg->Bias;
-	signed char * __restrict__ Out = Arg->Out;
-	unsigned char * __restrict__ Scale = Arg->Scale;
-	unsigned char * __restrict__ ScaleN = Arg->ScaleN;
-	unsigned int NormBias = Arg->NormBias;
-        unsigned int CoreId = gap_coreid(), Chunk = ChunkSize(H_In2), First = Chunk*CoreId, Last = Min(First+Chunk, H_In2);
-	unsigned int Iter = (Last>First)?(Last-First):0;
-
-	for (int i=0; i<Iter/4; i++) {
-		int l2 = 4*i+First;
-		v4s *pIn2_0 = (v4s *) (In2 + (l2+0)*W_In2), *pIn2_1 = (v4s *) (In2 + (l2+1)*W_In2), *pIn2_2 = (v4s *) (In2 + (l2+2)*W_In2), *pIn2_3 = (v4s *) (In2 + (l2+3)*W_In2);
-		for (int j=0; j<H_In1/2; j++) {
-			int l1 = 2*j;
-			v4s *pIn1_0 = (v4s *) (In1 + l1*W_In1);
-			v4s *pIn1_1 = (v4s *) (In1 + (l1+1)*W_In1);
-			int S0 = Bias?(Bias[l1]<<NormBias):0, S1=S0, S2=S0, S3=S0;
-			int S4 = Bias?(Bias[l1+1]<<NormBias):0, S5=S4, S6=S4, S7=S4;
-			for (int c=0; c<W_In1/4; c++) {
-				v4s C0 = pIn1_0[c], V0 = pIn2_0[c], V1 = pIn2_1[c], V2 = pIn2_2[c], V3 = pIn2_3[c];
-				S0 = gap_sumdotp4(C0, V0, S0); S1 = gap_sumdotp4(C0, V1, S1); S2 = gap_sumdotp4(C0, V2, S2); S3 = gap_sumdotp4(C0, V3, S3);
-				v4s C1 = pIn1_1[c];
-				S4 = gap_sumdotp4(C1, V0, S4); S5 = gap_sumdotp4(C1, V1, S5); S6 = gap_sumdotp4(C1, V2, S6); S7 = gap_sumdotp4(C1, V3, S7);
-			}
-			for (int c=(W_In1/4)*4; c<W_In1; c++) {
-				int C0 = In1[l1*W_In1+c];
-				S0 += C0 * In2[(l2+0)*W_In2+c]; S1 += C0 * In2[(l2+1)*W_In2+c]; S2 += C0 * In2[(l2+2)*W_In2+c]; S3 += C0 * In2[(l2+3)*W_In2+c];
-				int C1 = In1[(l1+1)*W_In1+c];
-				S4 += C1 * In2[(l2+0)*W_In2+c]; S5 += C1 * In2[(l2+1)*W_In2+c]; S6 += C1 * In2[(l2+2)*W_In2+c]; S7 += C1 * In2[(l2+3)*W_In2+c];
-			}
-			unsigned int Sc = Scale[l1], ScN = ScaleN[l1];
-			v4s R = gap_pack4(AT_CLIP_POS_IMM(AT_SCALE(S0, Sc, ScN), 7), AT_CLIP_POS_IMM(AT_SCALE(S1, Sc, ScN), 7),
-					  AT_CLIP_POS_IMM(AT_SCALE(S2, Sc, ScN), 7), AT_CLIP_POS_IMM(AT_SCALE(S3, Sc, ScN), 7));
-			*((v4s *) (Out+l1*H_In2 + l2)) = R;
-			unsigned int Sc1 = Scale[l1+1], ScN1 = ScaleN[l1+1];
-			v4s R1 = gap_pack4(AT_CLIP_POS_IMM(AT_SCALE(S4, Sc1, ScN1), 7), AT_CLIP_POS_IMM(AT_SCALE(S5, Sc1, ScN1), 7),
-					   AT_CLIP_POS_IMM(AT_SCALE(S6, Sc1, ScN1), 7), AT_CLIP_POS_IMM(AT_SCALE(S7, Sc1, ScN1), 7));
-			*((v4s *) (Out+(l1+1)*H_In2 + l2)) = R1;
-		}
-		if (H_In1&0x1) {
-			int l1 = H_In1 - 1;
-			v4s *pIn1_0 = (v4s *) (In1 + l1*W_In1);
-			int S0 = Bias?(Bias[l1]<<NormBias):0, S1=S0, S2=S0, S3=S0;
-			for (int c=0; c<W_In1/4; c++) {
-				v4s C0 = pIn1_0[c], V0 = pIn2_0[c], V1 = pIn2_1[c], V2 = pIn2_2[c], V3 = pIn2_3[c];
-				S0 = gap_sumdotp4(C0, V0, S0); S1 = gap_sumdotp4(C0, V1, S1); S2 = gap_sumdotp4(C0, V2, S2); S3 = gap_sumdotp4(C0, V3, S3);
-			}
-			for (int c=(W_In1/4)*4; c<W_In1; c++) {
-				int C0 = In1[l1*W_In1+c];
-				S0 += C0 * In2[(l2+0)*W_In2+c]; S1 += C0 * In2[(l2+1)*W_In2+c]; S2 += C0 * In2[(l2+2)*W_In2+c]; S3 += C0 * In2[(l2+3)*W_In2+c];
-			}
-			unsigned int Sc = Scale[l1], ScN = ScaleN[l1];
-			v4s R = gap_pack4(AT_CLIP_POS_IMM(AT_SCALE(S0, Sc, ScN), 7), AT_CLIP_POS_IMM(AT_SCALE(S1, Sc, ScN), 7),
-					  AT_CLIP_POS_IMM(AT_SCALE(S2, Sc, ScN), 7), AT_CLIP_POS_IMM(AT_SCALE(S3, Sc, ScN), 7));
-			*((v4s *) (Out+l1*H_In2 + l2)) = R;
-		}
-	}
-	if (Iter&0x2) {
-		int l2 = (4*(Iter/4)) + First;
-		v4s *pIn2_0 = (v4s *) (In2 + (l2+0)*W_In2);
-		v4s *pIn2_1 = (v4s *) (In2 + (l2+1)*W_In2);
-		for (int l1=0; l1<H_In1; l1++) {
-			v4s *pIn1 = (v4s *) (In1 + l1*W_In1);
-			int S0 = Bias?(Bias[l1]<<NormBias):0, S1=S0;
-			for (int c=0; c<W_In1/4; c++) {
-				v4s C0 = pIn1[c], V0 = pIn2_0[c], V1 = pIn2_1[c];
-				S0 = gap_sumdotp4(C0, V0, S0); S1 = gap_sumdotp4(C0, V1, S1);
-			}
-			for (int c=(W_In1/4)*4; c<W_In1; c++) {
-				int C0 = In1[l1*W_In1+c];
-				S0 += C0 * In2[(l2+0)*W_In2+c]; S1 += C0 * In2[(l2+1)*W_In2+c];
-			}
-			unsigned int Sc = Scale[l1], ScN = ScaleN[l1];
-			Out[l1*H_In2 + l2+0] = AT_CLIP_POS_IMM(AT_SCALE(S0, Sc, ScN), 7);
-			Out[l1*H_In2 + l2+1] = AT_CLIP_POS_IMM(AT_SCALE(S1, Sc, ScN), 7);
-		}
-	}
-	if (Iter&0x1) {
-		int l2 = Last-1;
-		v4s *pIn2 = (v4s *) (In2 + (l2+0)*W_In2);
-		for (int l1=0; l1<H_In1; l1++) {
-			v4s *pIn1 = (v4s *) (In1 + l1*W_In1);
-			int S0 = Bias?(Bias[l1]<<NormBias):0;
-			for (int c=0; c<W_In1/(4*2); c++) {
-				v4s C0 = pIn1[2*c], C1 = pIn1[2*c+1], V0 = pIn2[2*c], V1 = pIn2[2*c+1];
-				S0 = gap_sumdotp4(C0, V0, S0); S0 = gap_sumdotp4(C1, V1, S0);
-			}
-			if (W_In1&0x4) S0 = gap_sumdotp4(pIn1[W_In1/4-1], pIn2[W_In1/4-1], S0);
-			for (int c=(W_In1/4)*4; c<W_In1; c++) {
-				int C0 = In1[l1*W_In1+c];
-				S0 += C0 * In2[(l2+0)*W_In2+c];
-			}
-			unsigned int Sc = Scale[l1], ScN = ScaleN[l1];
-			Out[l1*H_In2 + l2+0] = AT_CLIP_POS_IMM(AT_SCALE(S0, Sc, ScN), 7);
-		}
-	}
-	gap_waitbarrier(0);
-}
-
-void KerParMatMulB32_ReLUN_SF_SQ8(KerMatMul_SQ8_T *Arg)
-
-{
-	signed char * __restrict__ In1 = Arg->In1;
-	unsigned int W_In1 = Arg->W_In1;
-	unsigned int H_In1 = Arg->H_In1;
-	signed char * __restrict__ In2 = Arg->In2;
-	unsigned int H_In2 = Arg->W_In2;
-	unsigned int W_In2 = W_In1;
-	int * __restrict__ Bias = Arg->Bias;
-	signed char * __restrict__ Out = Arg->Out;
-	unsigned char * __restrict__ Scale = Arg->Scale;
-	unsigned char * __restrict__ ScaleN = Arg->ScaleN;
-	unsigned int NormBias = Arg->NormBias;
-        unsigned int CoreId = gap_coreid(), Chunk = ChunkSize(H_In2), First = Chunk*CoreId, Last = Min(First+Chunk, H_In2);
-	unsigned int Iter = (Last>First)?(Last-First):0;
-	int A0 = Arg->Infos[AT_INF_A0];
-
-	for (int i=0; i<Iter/4; i++) {
-		int l2 = 4*i+First;
-		v4s *pIn2_0 = (v4s *) (In2 + (l2+0)*W_In2), *pIn2_1 = (v4s *) (In2 + (l2+1)*W_In2), *pIn2_2 = (v4s *) (In2 + (l2+2)*W_In2), *pIn2_3 = (v4s *) (In2 + (l2+3)*W_In2);
-		for (int l1=0; l1<H_In1; l1++) {
-			v4s *pIn1 = (v4s *) (In1 + l1*W_In1);
-			int S0 = Bias?(Bias[l1]<<NormBias):0, S1=S0, S2=S0, S3=S0;
-			for (int c=0; c<W_In1/4; c++) {
-				v4s C0 = pIn1[c], V0 = pIn2_0[c], V1 = pIn2_1[c], V2 = pIn2_2[c], V3 = pIn2_3[c];
-				S0 = gap_sumdotp4(C0, V0, S0); S1 = gap_sumdotp4(C0, V1, S1); S2 = gap_sumdotp4(C0, V2, S2); S3 = gap_sumdotp4(C0, V3, S3);
-			}
-			for (int c=(W_In1/4)*4; c<W_In1; c++) {
-				int C0 = In1[l1*W_In1+c];
-				S0 += C0 * In2[(l2+0)*W_In2+c]; S1 += C0 * In2[(l2+1)*W_In2+c]; S2 += C0 * In2[(l2+2)*W_In2+c]; S3 += C0 * In2[(l2+3)*W_In2+c];
-			}
-			unsigned int Sc = Scale[l1], ScN = ScaleN[l1];
-			v4s R = gap_pack4(gap_clip(AT_CLIP_POS(AT_SCALE(S0, Sc, ScN), A0), 7), gap_clip(AT_CLIP_POS(AT_SCALE(S1, Sc, ScN), A0), 7),
-					  gap_clip(AT_CLIP_POS(AT_SCALE(S2, Sc, ScN), A0), 7), gap_clip(AT_CLIP_POS(AT_SCALE(S3, Sc, ScN), A0), 7));
-			*((v4s *) (Out+l1*H_In2 + l2)) = R;
-		}
-	}
-	if (Iter&0x2) {
-		int l2 = (4*(Iter/4)) + First;
-		v4s *pIn2_0 = (v4s *) (In2 + (l2+0)*W_In2);
-		v4s *pIn2_1 = (v4s *) (In2 + (l2+1)*W_In2);
-		for (int l1=0; l1<H_In1; l1++) {
-			v4s *pIn1 = (v4s *) (In1 + l1*W_In1);
-			int S0 = Bias?(Bias[l1]<<NormBias):0, S1=S0;
-			for (int c=0; c<W_In1/4; c++) {
-				v4s C0 = pIn1[c], V0 = pIn2_0[c], V1 = pIn2_1[c];
-				S0 = gap_sumdotp4(C0, V0, S0); S1 = gap_sumdotp4(C0, V1, S1);
-			}
-			for (int c=(W_In1/4)*4; c<W_In1; c++) {
-				int C0 = In1[l1*W_In1+c];
-				S0 += C0 * In2[(l2+0)*W_In2+c]; S1 += C0 * In2[(l2+1)*W_In2+c];
-			}
-			unsigned int Sc = Scale[l1], ScN = ScaleN[l1];
-			Out[l1*H_In2 + l2+0] = gap_clip(AT_CLIP_POS(AT_SCALE(S0, Sc, ScN), A0), 7);
-			Out[l1*H_In2 + l2+1] = gap_clip(AT_CLIP_POS(AT_SCALE(S1, Sc, ScN), A0), 7);
-		}
-	}
-	if (Iter&0x1) {
-		int l2 = Last-1;
-		v4s *pIn2 = (v4s *) (In2 + (l2+0)*W_In2);
-		for (int l1=0; l1<H_In1; l1++) {
-			v4s *pIn1 = (v4s *) (In1 + l1*W_In1);
-			int S0 = Bias?(Bias[l1]<<NormBias):0;
-			for (int c=0; c<W_In1/(4*2); c++) {
-				v4s C0 = pIn1[2*c], C1 = pIn1[2*c+1], V0 = pIn2[2*c], V1 = pIn2[2*c+1];
-				S0 = gap_sumdotp4(C0, V0, S0); S0 = gap_sumdotp4(C1, V1, S0);
-			}
-			if (W_In1&0x4) S0 = gap_sumdotp4(pIn1[W_In1/4-1], pIn2[W_In1/4-1], S0);
-			for (int c=(W_In1/4)*4; c<W_In1; c++) {
-				int C0 = In1[l1*W_In1+c];
-				S0 += C0 * In2[(l2+0)*W_In2+c];
-			}
-			unsigned int Sc = Scale[l1], ScN = ScaleN[l1];
-			Out[l1*H_In2 + l2+0] = gap_clip(AT_CLIP_POS(AT_SCALE(S0, Sc, ScN), A0), 7);
-		}
-	}
-	gap_waitbarrier(0);
-}
-
-/*************************************************************************************************************************************************
-	Matrix by Vector Multiplication with optional Activation (all of them are supported)
-*************************************************************************************************************************************************/
-
-void KerParMatVectMul_SQ8(KerMat3_SQ8_T *Arg)
-
-{
-	signed char * __restrict__ In1	= Arg->In1;
-	signed char * __restrict__ In2	= Arg->In2;
-	signed char * __restrict__ Out	= Arg->Out;
-	int W				= Arg->W;
-	int H				= Arg->H;
-	unsigned int Scale		= ((unsigned char *)(Arg->Infos))[AT_INF_SCALE];
-	unsigned int ScaleN		= ((unsigned char *)(Arg->Infos))[AT_INF_SCALEN];
-
-	unsigned int CoreId = gap_coreid(), Chunk = ChunkSize(Arg->Feat), First = Chunk*CoreId, Last = Min(First+Chunk, Arg->Feat);
-
-	if (Scale)
-		for (int i=First; i<Last; i++) {
-			signed char * __restrict__ I1 = In1 + i*W*H;
-			int I2 = In2[i];
-			signed char * __restrict__ O  = Out + i*W*H;
-			for (int j=0; j<((W*H)/2); j++) {
-				int I10 = I1[2*j], I11 = I1[2*j+1];
-				int P1 = gap_clip(AT_SCALE(I10*Scale, I2, ScaleN), 7);
-				int P2 = gap_clip(AT_SCALE(I11*Scale, I2, ScaleN), 7);
-				O[2*j  ] = P1; O[2*j+1] = P2;
-			}
-			O[W*H-1] = gap_clip(AT_SCALE(I1[W*H-1]*Scale, I2, ScaleN), 7);
-		}
-	else
-		for (int i=First; i<Last; i++) {
-			signed char * __restrict__ I1 = In1 + i*W*H;
-			int I2 = In2[i];
-			signed char * __restrict__ O  = Out + i*W*H;
-			for (int j=0; j<((W*H)/2); j++) {
-				int I10 = I1[2*j], I11 = I1[2*j+1];
-				int P1 = gap_clip(AT_SCALE(I10, I2, ScaleN), 7);
-				int P2 = gap_clip(AT_SCALE(I11, I2, ScaleN), 7);
-				O[2*j  ] = P1; O[2*j+1] = P2;
-			}
-			O[W*H-1] = gap_clip(AT_SCALE(I1[W*H-1], I2, ScaleN), 7);
-		}
-	gap_waitbarrier(0);
-}
-
-
-void KerParMatVectMul_HWC_SQ8(KerMat3_SQ8_T *Arg)
-
-{
-	signed char * __restrict__ In1	= Arg->In1;
-	signed char * __restrict__ In2	= Arg->In2;
-	signed char * __restrict__ Out	= Arg->Out;
-	int W				= Arg->W;
-	int H				= Arg->H;
-	int Feat			= Arg->Feat;
-	unsigned int Scale		= ((unsigned char *)(Arg->Infos))[AT_INF_SCALE];
-	unsigned int ScaleN		= ((unsigned char *)(Arg->Infos))[AT_INF_SCALEN];
-	int S				= W*H;
-
-	unsigned int CoreId = gap_coreid(), Chunk = ChunkSize(Feat), First = Chunk*CoreId, Last = Min(First+Chunk, Feat);
-
-	if (Scale)
-		for (int i=First; i<Last; i++) {
-			signed char * __restrict__ I1 = In1 + i;
-			int I2 = In2[i];
-			signed char * __restrict__ O  = Out + i;
-			for (int j=0; j<(S/2); j++) {
-				int I10 = I1[(2*j)*Feat], I11 = I1[(2*j+1)*Feat];
-				int P1 = gap_clip(AT_SCALE(I10*Scale, I2, ScaleN), 7);
-				int P2 = gap_clip(AT_SCALE(I11*Scale, I2, ScaleN), 7);
-				O[(2*j)*Feat] = P1; O[(2*j+1)*Feat] = P2;
-			}
-			O[(S-1)*Feat] = gap_clip(AT_SCALE(I1[(S-1)*Feat]*Scale, I2, ScaleN), 7);
-		}
-	else
-		for (int i=First; i<Last; i++) {
-			signed char * __restrict__ I1 = In1 + i;
-			int I2 = In2[i];
-			signed char * __restrict__ O  = Out + i;
-			for (int j=0; j<(S/2); j++) {
-				int I10 = I1[(2*j)*Feat], I11 = I1[(2*j+1)*Feat];
-				int P1 = gap_clip(AT_SCALE(I10, I2, ScaleN), 7);
-				int P2 = gap_clip(AT_SCALE(I11, I2, ScaleN), 7);
-				O[(2*j+0)*Feat] = P1; O[(2*j+1)*Feat] = P2;
-			}
-			O[(S-1)*Feat] = gap_clip(AT_SCALE(I1[(S-1)*Feat], I2, ScaleN), 7);
-		}
-	gap_waitbarrier(0);
-}
-
-void KerParMatVectMul_ReLU_SQ8(KerMat3_SQ8_T *Arg)
-
-{
-	signed char * __restrict__ In1	= Arg->In1;
-	signed char * __restrict__ In2	= Arg->In2;
-	signed char * __restrict__ Out	= Arg->Out;
-	int W				= Arg->W;
-	int H				= Arg->H;
-	unsigned int Scale		= ((unsigned char *)(Arg->Infos))[AT_INF_SCALE];
-	unsigned int ScaleN		= ((unsigned char *)(Arg->Infos))[AT_INF_SCALEN];
-
-	unsigned int CoreId = gap_coreid(), Chunk = ChunkSize(Arg->Feat), First = Chunk*CoreId, Last = Min(First+Chunk, Arg->Feat);
-
-	if (Scale)
-		for (int i=First; i<Last; i++) {
-			signed char * __restrict__ I1 = In1 + i*W*H;
-			int I2 = In2[i];
-			signed char * __restrict__ O  = Out + i*W*H;
-			for (int j=0; j<((W*H)/2); j++) {
-				int I10 = I1[2*j], I11 = I1[2*j+1];
-				int P1 = AT_CLIP_POS_IMM(AT_SCALE(I10*Scale, I2, ScaleN), 7);
-				int P2 = AT_CLIP_POS_IMM(AT_SCALE(I11*Scale, I2, ScaleN), 7);
-				O[2*j  ] = P1; O[2*j+1] = P2;
-			}
-			O[W*H-1] = AT_CLIP_POS_IMM(AT_SCALE(I1[W*H-1]*Scale, I2, ScaleN), 7);
-		}
-	else
-		for (int i=First; i<Last; i++) {
-			signed char * __restrict__ I1 = In1 + i*W*H;
-			int I2 = In2[i];
-			signed char * __restrict__ O  = Out + i*W*H;
-			for (int j=0; j<((W*H)/2); j++) {
-				int I10 = I1[2*j], I11 = I1[2*j+1];
-				int P1 = AT_CLIP_POS_IMM(AT_SCALE(I10, I2, ScaleN), 7);
-				int P2 = AT_CLIP_POS_IMM(AT_SCALE(I11, I2, ScaleN), 7);
-				O[2*j  ] = P1; O[2*j+1] = P2;
-			}
-			O[W*H-1] = AT_CLIP_POS_IMM(AT_SCALE(I1[W*H-1], I2, ScaleN), 7);
-		}
-	gap_waitbarrier(0);
-}
-
-void KerParMatVectMul_ReLUN_SQ8(KerMat3_SQ8_T *Arg)
-
-{
-	signed char * __restrict__ In1	= Arg->In1;
-	signed char * __restrict__ In2	= Arg->In2;
-	signed char * __restrict__ Out	= Arg->Out;
-	int W				= Arg->W;
-	int H				= Arg->H;
-	int A0				= Arg->Infos[AT_INF_A0];
-	unsigned int Scale		= ((unsigned char *)(Arg->Infos))[AT_INF_SCALE];
-	unsigned int ScaleN		= ((unsigned char *)(Arg->Infos))[AT_INF_SCALEN];
-
-	unsigned int CoreId = gap_coreid(), Chunk = ChunkSize(Arg->Feat), First = Chunk*CoreId, Last = Min(First+Chunk, Arg->Feat);
-
-	if (Scale)
-		for (int i=First; i<Last; i++) {
-			signed char * __restrict__ I1 = In1 + i*W*H;
-			int I2 = In2[i];
-			signed char * __restrict__ O  = Out + i*W*H;
-			for (int j=0; j<((W*H)/2); j++) {
-				int I10 = I1[2*j], I11 = I1[2*j+1];
-				int P1 = AT_CLIP_POS(AT_SCALE(I10*Scale, I2, ScaleN), A0);
-				int P2 = AT_CLIP_POS(AT_SCALE(I11*Scale, I2, ScaleN), A0);
-				O[2*j  ] = gap_clip(P1, 7); O[2*j+1] = gap_clip(P2, 7);
-			}
-			O[W*H-1] = gap_clip(AT_CLIP_POS(AT_SCALE(I1[W*H-1]*Scale, I2, ScaleN), A0), 7);
-		}
-	else
-		for (int i=First; i<Last; i++) {
-			signed char * __restrict__ I1 = In1 + i*W*H;
-			int I2 = In2[i];
-			signed char * __restrict__ O  = Out + i*W*H;
-			for (int j=0; j<((W*H)/2); j++) {
-				int I10 = I1[2*j], I11 = I1[2*j+1];
-				int P1 = AT_CLIP_POS(AT_SCALE(I10, I2, ScaleN), A0);
-				int P2 = AT_CLIP_POS(AT_SCALE(I11, I2, ScaleN), A0);
-				O[2*j  ] = gap_clip(P1, 7); O[2*j+1] = gap_clip(P2, 7);
-			}
-			O[W*H-1] = gap_clip(AT_CLIP_POS(AT_SCALE(I1[W*H-1], I2, ScaleN), A0), 7);
-		}
-	gap_waitbarrier(0);
-}
-
-void KerParMatVectMul_HSigmoid_SQ8(KerMat3_SQ8_T *Arg)
-
-{
-	signed char * __restrict__ In1	= Arg->In1;
-	signed char * __restrict__ In2	= Arg->In2;
-	signed char * __restrict__ Out	= Arg->Out;
-	int W				= Arg->W;
-	int H				= Arg->H;
-	unsigned int ActScale		= ((unsigned char *)(Arg->Infos))[AT_INF_ACTSCALE];
-	unsigned int ActScaleN		= ((unsigned char *)(Arg->Infos))[AT_INF_ACTSCALEN];
-	int A0				= Arg->Infos[AT_INF_A0];
-	int B0				= Arg->Infos[AT_INF_B0];
-	int C0				= Arg->Infos[AT_INF_C0];
-	unsigned int Scale		= ((unsigned char *)(Arg->Infos))[AT_INF_SCALE];
-	unsigned int ScaleN		= ((unsigned char *)(Arg->Infos))[AT_INF_SCALEN];
-
-	unsigned int CoreId = gap_coreid(), Chunk = ChunkSize(Arg->Feat), First = Chunk*CoreId, Last = Min(First+Chunk, Arg->Feat);
-
-	if (Scale)
-		for (int i=First; i<Last; i++) {
-			signed char * __restrict__ I1 = In1 + i*W*H;
-			int I2 = In2[i];
-			signed char * __restrict__ O  = Out + i*W*H;
-			for (int j=0; j<((W*H)/2); j++) {
-				int I10 = I1[2*j], I11 = I1[2*j+1];
-				int Acc0 = gap_clip(AT_SCALE(I10*Scale, I2, ScaleN), 7), Acc1 = gap_clip(AT_SCALE(I11*Scale, I2, ScaleN), 7);
-				Acc0 = AT_SCALE(ActScale, AT_CLIP_POS(Acc0 + B0, A0) * C0, ActScaleN);
-				Acc1 = AT_SCALE(ActScale, AT_CLIP_POS(Acc1 + B0, A0) * C0, ActScaleN);
-				O[2*j  ] = gap_clip(Acc0, 7); O[2*j+1] = gap_clip(Acc1, 7);
-			}
-			int Acc0 = gap_clip(AT_SCALE(I1[W*H-1]*Scale, I2, ScaleN), 7);
-			O[W*H-1] = gap_clip(AT_SCALE(ActScale, AT_CLIP_POS(Acc0 + B0, A0) * C0, ActScaleN), 7);
-		}
-	else
-		for (int i=First; i<Last; i++) {
-			signed char * __restrict__ I1 = In1 + i*W*H;
-			int I2 = In2[i];
-			signed char * __restrict__ O  = Out + i*W*H;
-			for (int j=0; j<((W*H)/2); j++) {
-				int I10 = I1[2*j], I11 = I1[2*j+1];
-				int Acc0 = gap_clip(AT_SCALE(I10, I2, ScaleN), 7), Acc1 = gap_clip(AT_SCALE(I11, I2, ScaleN), 7);
-				Acc0 = AT_SCALE(ActScale, AT_CLIP_POS(Acc0 + B0, A0) * C0, ActScaleN);
-				Acc1 = AT_SCALE(ActScale, AT_CLIP_POS(Acc1 + B0, A0) * C0, ActScaleN);
-				O[2*j  ] = gap_clip(Acc0, 7); O[2*j+1] = gap_clip(Acc1, 7);
-			}
-			int Acc0 = gap_clip(AT_SCALE(I1[W*H-1], I2, ScaleN), 7);
-			O[W*H-1] = gap_clip(AT_SCALE(ActScale, AT_CLIP_POS(Acc0 + B0, A0) * C0, ActScaleN), 7);
-		}
-	gap_waitbarrier(0);
-}
-
-void KerParMatVectMul_HSwish_SQ8(KerMat3_SQ8_T *Arg)
-
-{
-	signed char * __restrict__ In1	= Arg->In1;
-	signed char * __restrict__ In2	= Arg->In2;
-	signed char * __restrict__ Out	= Arg->Out;
-	int W				= Arg->W;
-	int H				= Arg->H;
-	unsigned int ActScale		= ((unsigned char *)(Arg->Infos))[AT_INF_ACTSCALE];
-	unsigned int ActScaleN		= ((unsigned char *)(Arg->Infos))[AT_INF_ACTSCALEN];
-	int A0				= Arg->Infos[AT_INF_A0];
-	int B0				= Arg->Infos[AT_INF_B0];
-	int C0				= Arg->Infos[AT_INF_C0];
-	unsigned int Scale		= ((unsigned char *)(Arg->Infos))[AT_INF_SCALE];
-	unsigned int ScaleN		= ((unsigned char *)(Arg->Infos))[AT_INF_SCALEN];
-
-	unsigned int CoreId = gap_coreid(), Chunk = ChunkSize(Arg->Feat), First = Chunk*CoreId, Last = Min(First+Chunk, Arg->Feat);
-
-	if (Scale)
-		for (int i=First; i<Last; i++) {
-			signed char * __restrict__ I1 = In1 + i*W*H;
-			int I2 = In2[i];
-			signed char * __restrict__ O  = Out + i*W*H;
-			for (int j=0; j<((W*H)/2); j++) {
-				int I10 = I1[2*j], I11 = I1[2*j+1];
-				int Acc0 = gap_clip(AT_SCALE(I10*Scale, I2, ScaleN), 7), Acc1 = gap_clip(AT_SCALE(I11*Scale, I2, ScaleN), 7);
-				Acc0 = AT_SCALE(ActScale, AT_CLIP_POS(Acc0 + B0, A0) * C0 * Acc0, ActScaleN);
-				Acc1 = AT_SCALE(ActScale, AT_CLIP_POS(Acc1 + B0, A0) * C0 * Acc1, ActScaleN);
-				O[2*j  ] = gap_clip(Acc0, 7); O[2*j+1] = gap_clip(Acc1, 7);
-			}
-			int Acc0 = gap_clip(AT_SCALE(I1[W*H-1]*Scale, I2, ScaleN), 7);
-			O[W*H-1] = gap_clip(AT_SCALE(ActScale, AT_CLIP_POS(Acc0 + B0, A0) * C0 * Acc0, ActScaleN), 7);
-		}
-	else
-		for (int i=First; i<Last; i++) {
-			signed char * __restrict__ I1 = In1 + i*W*H;
-			int I2 = In2[i];
-			signed char * __restrict__ O  = Out + i*W*H;
-			for (int j=0; j<((W*H)/2); j++) {
-				int I10 = I1[2*j], I11 = I1[2*j+1];
-				int Acc0 = gap_clip(AT_SCALE(I10, I2, ScaleN), 7), Acc1 = gap_clip(AT_SCALE(I11, I2, ScaleN), 7);
-				Acc0 = AT_SCALE(ActScale, AT_CLIP_POS(Acc0 + B0, A0) * C0 * Acc0, ActScaleN);
-				Acc1 = AT_SCALE(ActScale, AT_CLIP_POS(Acc1 + B0, A0) * C0 * Acc1, ActScaleN);
-				O[2*j  ] = gap_clip(Acc0, 7); O[2*j+1] = gap_clip(Acc1, 7);
-			}
-			int Acc0 = gap_clip(AT_SCALE(I1[W*H-1], I2, ScaleN), 7);
-			O[W*H-1] = gap_clip(AT_SCALE(ActScale, AT_CLIP_POS(Acc0 + B0, A0) * C0 * Acc0, ActScaleN), 7);
-		}
-	gap_waitbarrier(0);
-}
-
-void KerParMatVectMul_LeakyReLU_SQ8(KerMat3_SQ8_T *Arg)
-
-{
-	signed char * __restrict__ In1	= Arg->In1;
-	signed char * __restrict__ In2	= Arg->In2;
-	signed char * __restrict__ Out	= Arg->Out;
-	int W				= Arg->W;
-	int H				= Arg->H;
-	unsigned int ActScale		= ((unsigned char *)(Arg->Infos))[AT_INF_ACTSCALE];
-	unsigned int ActScaleN		= ((unsigned char *)(Arg->Infos))[AT_INF_ACTSCALEN];
-	int A0				= Arg->Infos[AT_INF_A0];
-	int B0				= Arg->Infos[AT_INF_B0];
-	int C0				= Arg->Infos[AT_INF_C0];
-	unsigned int Scale		= ((unsigned char *)(Arg->Infos))[AT_INF_SCALE];
-	unsigned int ScaleN		= ((unsigned char *)(Arg->Infos))[AT_INF_SCALEN];
-
-	unsigned int CoreId = gap_coreid(), Chunk = ChunkSize(Arg->Feat), First = Chunk*CoreId, Last = Min(First+Chunk, Arg->Feat);
-
-	if (Scale)
-		for (int i=First; i<Last; i++) {
-			signed char * __restrict__ I1 = In1 + i*W*H;
-			int I2 = In2[i];
-			signed char * __restrict__ O  = Out + i*W*H;
-			for (int j=0; j<((W*H)/2); j++) {
-				int I10 = I1[2*j], I11 = I1[2*j+1];
-				int Acc0 = gap_clip(AT_SCALE(I10*Scale, I2, ScaleN), 7), Acc1 = gap_clip(AT_SCALE(I11*Scale, I2, ScaleN), 7);
-				int Neg0 = gap_bitextractu(Acc0, 1, 31), Pos0 = !Neg0;
-				int Acc0N = AT_NORM(Acc0 * A0, 7);
-				Acc0 = AT_SCALE(ActScale, (Neg0*Acc0N+Pos0*Acc0), ActScaleN);
-				int Neg1 = gap_bitextractu(Acc1, 1, 31), Pos1 = !Neg1;
-				int Acc1N = AT_NORM(Acc1 * A0, 7);
-				Acc1 = AT_SCALE(ActScale, (Neg1*Acc1N+Pos1*Acc1), ActScaleN);
-				O[2*j  ] = gap_clip(Acc0, 7); O[2*j+1] = gap_clip(Acc1, 7);
-			}
-			int Acc0 = gap_clip(AT_SCALE(I1[W*H-1]*Scale, I2, ScaleN), 7);
-			int Neg0 = gap_bitextractu(Acc0, 1, 31), Pos0 = !Neg0;
-			int Acc0N = AT_NORM(Acc0 * A0, 7);
-			O[W*H-1] = gap_clip(AT_SCALE(ActScale, (Neg0*Acc0N+Pos0*Acc0), ActScaleN), 7);
-		}
-	else
-		for (int i=First; i<Last; i++) {
-			signed char * __restrict__ I1 = In1 + i*W*H;
-			int I2 = In2[i];
-			signed char * __restrict__ O  = Out + i*W*H;
-			for (int j=0; j<((W*H)/2); j++) {
-				int I10 = I1[2*j], I11 = I1[2*j+1];
-				int Acc0 = gap_clip(AT_SCALE(I10, I2, ScaleN), 7), Acc1 = gap_clip(AT_SCALE(I11, I2, ScaleN), 7);
-				int Neg0 = gap_bitextractu(Acc0, 1, 31), Pos0 = !Neg0;
-				int Acc0N = AT_NORM(Acc0 * A0, 7);
-				Acc0 = AT_SCALE(ActScale, (Neg0*Acc0N+Pos0*Acc0), ActScaleN);
-				int Neg1 = gap_bitextractu(Acc1, 1, 31), Pos1 = !Neg1;
-				int Acc1N = AT_NORM(Acc1 * A0, 7);
-				Acc1 = AT_SCALE(ActScale, (Neg1*Acc1N+Pos1*Acc1), ActScaleN);
-				O[2*j  ] = gap_clip(Acc0, 7); O[2*j+1] = gap_clip(Acc1, 7);
-			}
-			int Acc0 = gap_clip(AT_SCALE(I1[W*H-1], I2, ScaleN), 7);
-			int Neg0 = gap_bitextractu(Acc0, 1, 31), Pos0 = !Neg0;
-			int Acc0N = AT_NORM(Acc0 * A0, 7);
-			O[W*H-1] = gap_clip(AT_SCALE(ActScale, (Neg0*Acc0N+Pos0*Acc0), ActScaleN), 7);
-		}
-	gap_waitbarrier(0);
-}
-
-void KerParMatMulNoBias_2x4_PL_SQ8(KerMatMul_PL_SQ8_T *Arg)
-
-{
-	/*
-	 	Column buffer has to be sized in order to be able to accomodate up to 4 columns of size H_In2
-	*/
-        signed char * __restrict__ In1 = Arg->In1;
-        unsigned int W_In1 = Arg->W_In1;
-        unsigned int H_In1 = Arg->H_In1;
-        signed char * __restrict__ In2 = Arg->In2;
-        unsigned int W_In2 = Arg->W_In2;
-        signed char * __restrict__ Out = Arg->Out;
-        unsigned int W_Out = Arg->W_Out;
-	unsigned char Scale = (unsigned char) Arg->Infos[AT_INF_OUTSCALE];
-	unsigned char ScaleN = (unsigned char) Arg->Infos[AT_INF_OUTSCALEN];
-        unsigned int OutFirstCol = Arg->OutFirstCol;
-        signed char * __restrict__ BufferColIn2 = Arg->BufferColIn2;
-        int ColFirst = Arg->ColFirst;
-
-        unsigned int H_In2 = W_In1;
-        unsigned int H_Out = H_In1;
-        unsigned int Line, Col, i;
-        v4s * __restrict__ VBuff0 = (v4s *) BufferColIn2;
-        v4s * __restrict__ VBuff1 = (v4s *) (BufferColIn2+H_In2);
-        v4s * __restrict__ VBuff2 = (v4s *) (BufferColIn2+2*H_In2);
-        v4s * __restrict__ VBuff3 = (v4s *) (BufferColIn2+3*H_In2);
-
-        unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(H_In1), First = CoreId*ChunkCell, Last  = Min(H_In1, First+ChunkCell);
-        unsigned int Iter = (Last>First)?(Last-First):0;
-        unsigned int C = ChunkSize(H_In2), F = CoreId*C, L  = Min(H_In2, F+C);
-        int OffLine = 0, OffCol = 0;
-
-        if (ColFirst) OffLine = OutFirstCol; else OffCol = OutFirstCol;
-        for (Col=0; Col<W_In2/4; Col++) {
-                for (i=F;i<L; i++) {
-			int X0 = In2[i*W_In2+4*Col+0];
-			int X1 = In2[i*W_In2+4*Col+1];
-			int X2 = In2[i*W_In2+4*Col+2];
-			int X3 = In2[i*W_In2+4*Col+3];
-			BufferColIn2[i+0*H_In2] = X0; // In2[i*W_In2+4*Col+0];
-			BufferColIn2[i+1*H_In2] = X1; // In2[i*W_In2+4*Col+1];
-			BufferColIn2[i+2*H_In2] = X2; // In2[i*W_In2+4*Col+2];
-			BufferColIn2[i+3*H_In2] = X3; // In2[i*W_In2+4*Col+3];
-		}
-                gap_waitbarrier(0);
-                for (Line=0; Line<Iter/2; Line++) {
-                	int l1 = 2*Line + First;
-                        v4s *VIn1 = (v4s *) (&In1[(l1)*W_In1 + 0]);
-                        v4s *VIn2 = (v4s *) (&In1[(l1+1)*W_In1 + 0]);
-                        int S0 = 0, S1=S0, S2=S0, S3=S0;
-                        int S4 = 0, S5=S4, S6=S4, S7=S4;
-                        for (i=0; i<(W_In1/4); i++) {
-				v4s V0 = VIn1[i];
-				v4s A = VBuff0[i];
-				v4s B = VBuff1[i];
-				v4s C = VBuff2[i];
-				v4s D = VBuff3[i];
-                                S0 = gap_sumdotp4(V0, A, S0);
-                                S1 = gap_sumdotp4(V0, B, S1);
-                                S2 = gap_sumdotp4(V0, C, S2);
-                                S3 = gap_sumdotp4(V0, D, S3);
-				v4s V1 = VIn2[i];
-                                S4 = gap_sumdotp4(V1, A, S4);
-                                S5 = gap_sumdotp4(V1, B, S5);
-                                S6 = gap_sumdotp4(V1, C, S6);
-                                S7 = gap_sumdotp4(V1, D, S7);
-                        }
-                        for (i=(W_In1/4)*4; i<W_In1; i++) {
-				int V0 = In1[l1*W_In1 + i];
-				S0 += V0 * BufferColIn2[i];
-				S1 += V0 * BufferColIn2[i+1*H_In2];
-				S2 += V0 * BufferColIn2[i+2*H_In2];
-				S3 += V0 * BufferColIn2[i+3*H_In2];
-				int V1 = In1[(l1+1)*W_In1 + i];
-				S4 += V1 * BufferColIn2[i];
-				S5 += V1 * BufferColIn2[i+1*H_In2];
-				S6 += V1 * BufferColIn2[i+2*H_In2];
-				S7 += V1 * BufferColIn2[i+3*H_In2];
-			}
-			v4s R1 = gap_pack4(gap_clip(AT_SCALE(S0, Scale, ScaleN), 7), gap_clip(AT_SCALE(S1, Scale, ScaleN), 7),
-					   gap_clip(AT_SCALE(S2, Scale, ScaleN), 7), gap_clip(AT_SCALE(S3, Scale, ScaleN), 7));
-			v4s R2 = gap_pack4(gap_clip(AT_SCALE(S4, Scale, ScaleN), 7), gap_clip(AT_SCALE(S5, Scale, ScaleN), 7),
-					   gap_clip(AT_SCALE(S6, Scale, ScaleN), 7), gap_clip(AT_SCALE(S7, Scale, ScaleN), 7));
-			*((v4s *) (Out+(l1+OffLine)*W_Out+4*Col+0+OffCol)) = R1;
-			*((v4s *) (Out+(l1+OffLine+1)*W_Out+4*Col+0+OffCol)) = R2;
-                }
-		if (Iter&0x1) {
-			int l1 = Last - 1;
-			v4s *VIn1 = (v4s *) (&In1[l1*W_In1 + 0]);
-			int S0 = 0, S1=S0, S2=S0, S3=S0;
-			for (i=0; i<(W_In1/4); i++) {
-				v4s V0 = VIn1[i];
-				v4s A = VBuff0[i]; S0 = gap_sumdotp4(V0, A, S0);;
-				v4s B = VBuff1[i]; S1 = gap_sumdotp4(V0, B, S1);;
-				v4s C = VBuff2[i]; S2 = gap_sumdotp4(V0, C, S2);;
-				v4s D = VBuff3[i]; S3 = gap_sumdotp4(V0, D, S3);;
-			}
-                        for (i=(W_In1/4)*4; i<W_In1; i++) {
-				int V0 = In1[l1*W_In1 + i];
-				S0 += V0 * BufferColIn2[i];
-				S1 += V0 * BufferColIn2[i+1*H_In2];
-				S2 += V0 * BufferColIn2[i+2*H_In2];
-				S3 += V0 * BufferColIn2[i+3*H_In2];
-			}
-			v4s R1 = gap_pack4(gap_clip(AT_SCALE(S0, Scale, ScaleN), 7), gap_clip(AT_SCALE(S1, Scale, ScaleN), 7),
-					   gap_clip(AT_SCALE(S2, Scale, ScaleN), 7), gap_clip(AT_SCALE(S3, Scale, ScaleN), 7));
-			*((v4s *) (Out+(l1+OffLine)*W_Out+4*Col+0+OffCol)) = R1;
-		}
-                gap_waitbarrier(0);
-        }
-	if (W_In2&0x2) {
-		Col = W_In2/2 - 1;
-                for (i=F;i<L; i++) {
-			int X0 = In2[i*W_In2+2*Col+0];
-			int X1 = In2[i*W_In2+2*Col+1];
-			BufferColIn2[i+0*H_In2] = X0; // In2[i*W_In2+2*Col+0];
-			BufferColIn2[i+1*H_In2] = X1; // In2[i*W_In2+2*Col+1];
-		}
-                gap_waitbarrier(0);
-                for (Line=First; Line<Last; Line++) {
-                        v4s *VIn1 = (v4s *) (&In1[Line*W_In1 + 0]);
-                        int S0 = 0, S1=S0;
-                        for (i=0; i<(W_In1/4); i++) {
-				v4s V0 = VIn1[i];
-                                S0 = gap_sumdotp4(V0, VBuff0[i], S0);
-                                S1 = gap_sumdotp4(V0, VBuff1[i], S1);
-                        }
-                        for (i=(W_In1/4)*4; i<W_In1; i++) {
-				int V0 = In1[Line*W_In1 + i];
-				S0 += V0 * BufferColIn2[i];
-				S1 += V0 * BufferColIn2[i+1*H_In2];
-			}
-			Out[(Line+OffLine)*W_Out+2*Col+0+OffCol] = gap_clip(AT_SCALE(S0, Scale, ScaleN), 7);
-			Out[(Line+OffLine)*W_Out+2*Col+1+OffCol] = gap_clip(AT_SCALE(S1, Scale, ScaleN), 7);
-                }
-                gap_waitbarrier(0);
-	}
-	if (W_In2&0x1) {
-		Col = W_In2-1;
-                for (i=F;i<L; i++) {
-			int X0 = In2[i*W_In2+1*Col+0];
-			BufferColIn2[i+0*H_In2] = X0; // In2[i*W_In2+4*Col+0];
-		}
-                gap_waitbarrier(0);
-                for (Line=First; Line<Last; Line++) {
-                        v4s *VIn1 = (v4s *) (&In1[Line*W_In1 + 0]);
-                        int S0 = 0;
-                        for (i=0; i<(W_In1/4); i++) {
-				v4s V0 = VIn1[i];
-                                S0 = gap_sumdotp4(V0, VBuff0[i], S0);
-                        }
-                        for (i=(W_In1/4)*4; i<W_In1; i++) {
-				int V0 = In1[Line*W_In1 + i];
-				S0 += V0 * BufferColIn2[i];
-			}
-			Out[(Line+OffLine)*W_Out+1*Col+0+OffCol] = gap_clip(AT_SCALE(S0, Scale, ScaleN), 7);
-                }
-                gap_waitbarrier(0);
-	}
-}
-void KerParMatMulNoBias_2x4_ReLU_PL_SQ8(KerMatMul_PL_SQ8_T *Arg)
-
-{
-	/*
-	 	Column buffer has to be sized in order to be able to accomodate up to 4 columns of size H_In2
-	*/
-        signed char * __restrict__ In1 = Arg->In1;
-        unsigned int W_In1 = Arg->W_In1;
-        unsigned int H_In1 = Arg->H_In1;
-        signed char * __restrict__ In2 = Arg->In2;
-        unsigned int W_In2 = Arg->W_In2;
-        signed char * __restrict__ Out = Arg->Out;
-        unsigned int W_Out = Arg->W_Out;
-        unsigned int OutFirstCol = Arg->OutFirstCol;
-        signed char * __restrict__ BufferColIn2 = Arg->BufferColIn2;
-        int ColFirst = Arg->ColFirst;
-	unsigned char Scale = (unsigned char) Arg->Infos[AT_INF_OUTSCALE];
-	unsigned char ScaleN = (unsigned char) Arg->Infos[AT_INF_OUTSCALEN];
-
-        unsigned int H_In2 = W_In1;
-        unsigned int H_Out = H_In1;
-        unsigned int Line, Col, i;
-        v4s * __restrict__ VBuff0 = (v4s *) BufferColIn2;
-        v4s * __restrict__ VBuff1 = (v4s *) (BufferColIn2+H_In2);
-        v4s * __restrict__ VBuff2 = (v4s *) (BufferColIn2+2*H_In2);
-        v4s * __restrict__ VBuff3 = (v4s *) (BufferColIn2+3*H_In2);
-
-        unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(H_In1), First = CoreId*ChunkCell, Last  = Min(H_In1, First+ChunkCell);
-        unsigned int Iter = (Last>First)?(Last-First):0;
-        unsigned int C = ChunkSize(H_In2), F = CoreId*C, L  = Min(H_In2, F+C);
-        int OffLine = 0, OffCol = 0;
-
-        if (ColFirst) OffLine = OutFirstCol; else OffCol = OutFirstCol;
-        for (Col=0; Col<W_In2/4; Col++) {
-                for (i=F;i<L; i++) {
-			int X0 = In2[i*W_In2+4*Col+0];
-			int X1 = In2[i*W_In2+4*Col+1];
-			int X2 = In2[i*W_In2+4*Col+2];
-			int X3 = In2[i*W_In2+4*Col+3];
-			BufferColIn2[i+0*H_In2] = X0; // In2[i*W_In2+4*Col+0];
-			BufferColIn2[i+1*H_In2] = X1; // In2[i*W_In2+4*Col+1];
-			BufferColIn2[i+2*H_In2] = X2; // In2[i*W_In2+4*Col+2];
-			BufferColIn2[i+3*H_In2] = X3; // In2[i*W_In2+4*Col+3];
-		}
-                gap_waitbarrier(0);
-                for (Line=0; Line<Iter/2; Line++) {
-                	int l1 = 2*Line + First;
-                        v4s *VIn1 = (v4s *) (&In1[(l1)*W_In1 + 0]);
-                        v4s *VIn2 = (v4s *) (&In1[(l1+1)*W_In1 + 0]);
-                        int S0 = 0, S1=S0, S2=S0, S3=S0;
-                        int S4 = 0, S5=S4, S6=S4, S7=S4;
-                        for (i=0; i<(W_In1/4); i++) {
-				v4s V0 = VIn1[i];
-				v4s A = VBuff0[i];
-				v4s B = VBuff1[i];
-				v4s C = VBuff2[i];
-				v4s D = VBuff3[i];
-                                S0 = gap_sumdotp4(V0, A, S0);
-                                S1 = gap_sumdotp4(V0, B, S1);
-                                S2 = gap_sumdotp4(V0, C, S2);
-                                S3 = gap_sumdotp4(V0, D, S3);
-				v4s V1 = VIn2[i];
-                                S4 = gap_sumdotp4(V1, A, S4);
-                                S5 = gap_sumdotp4(V1, B, S5);
-                                S6 = gap_sumdotp4(V1, C, S6);
-                                S7 = gap_sumdotp4(V1, D, S7);
-                        }
-                        for (i=(W_In1/4)*4; i<W_In1; i++) {
-				int V0 = In1[l1*W_In1 + i];
-				S0 += V0 * BufferColIn2[i];
-				S1 += V0 * BufferColIn2[i+1*H_In2];
-				S2 += V0 * BufferColIn2[i+2*H_In2];
-				S3 += V0 * BufferColIn2[i+3*H_In2];
-				int V1 = In1[(l1+1)*W_In1 + i];
-				S4 += V1 * BufferColIn2[i];
-				S5 += V1 * BufferColIn2[i+1*H_In2];
-				S6 += V1 * BufferColIn2[i+2*H_In2];
-				S7 += V1 * BufferColIn2[i+3*H_In2];
-			}
-			v4s R1 = gap_pack4(AT_CLIP_POS_IMM(AT_SCALE(S0, Scale, ScaleN), 7), AT_CLIP_POS_IMM(AT_SCALE(S1, Scale, ScaleN), 7),
-					   AT_CLIP_POS_IMM(AT_SCALE(S2, Scale, ScaleN), 7), AT_CLIP_POS_IMM(AT_SCALE(S3, Scale, ScaleN), 7));
-			v4s R2 = gap_pack4(AT_CLIP_POS_IMM(AT_SCALE(S4, Scale, ScaleN), 7), AT_CLIP_POS_IMM(AT_SCALE(S5, Scale, ScaleN), 7),
-					   AT_CLIP_POS_IMM(AT_SCALE(S6, Scale, ScaleN), 7), AT_CLIP_POS_IMM(AT_SCALE(S7, Scale, ScaleN), 7));
-			*((v4s *) (Out+(l1+OffLine)*W_Out+4*Col+0+OffCol)) = R1;
-			*((v4s *) (Out+(l1+OffLine+1)*W_Out+4*Col+0+OffCol)) = R2;
-                }
-		if (Iter&0x1) {
-			int l1 = Last - 1;
-			v4s *VIn1 = (v4s *) (&In1[l1*W_In1 + 0]);
-			int S0 = 0, S1=S0, S2=S0, S3=S0;
-			for (i=0; i<(W_In1/4); i++) {
-				v4s V0 = VIn1[i];
-				v4s A = VBuff0[i]; S0 = gap_sumdotp4(V0, A, S0);;
-				v4s B = VBuff1[i]; S1 = gap_sumdotp4(V0, B, S1);;
-				v4s C = VBuff2[i]; S2 = gap_sumdotp4(V0, C, S2);;
-				v4s D = VBuff3[i]; S3 = gap_sumdotp4(V0, D, S3);;
-			}
-                        for (i=(W_In1/4)*4; i<W_In1; i++) {
-				int V0 = In1[l1*W_In1 + i];
-				S0 += V0 * BufferColIn2[i];
-				S1 += V0 * BufferColIn2[i+1*H_In2];
-				S2 += V0 * BufferColIn2[i+2*H_In2];
-				S3 += V0 * BufferColIn2[i+3*H_In2];
-			}
-			v4s R1 = gap_pack4(AT_CLIP_POS_IMM(AT_SCALE(S0, Scale, ScaleN), 7), AT_CLIP_POS_IMM(AT_SCALE(S1, Scale, ScaleN), 7),
-					   AT_CLIP_POS_IMM(AT_SCALE(S2, Scale, ScaleN), 7), AT_CLIP_POS_IMM(AT_SCALE(S3, Scale, ScaleN), 7));
-			*((v4s *) (Out+(l1+OffLine)*W_Out+4*Col+0+OffCol)) = R1;
-		}
-                gap_waitbarrier(0);
-        }
-	if (W_In2&0x2) {
-		Col = W_In2/2 - 1;
-                for (i=F;i<L; i++) {
-			int X0 = In2[i*W_In2+2*Col+0];
-			int X1 = In2[i*W_In2+2*Col+1];
-			BufferColIn2[i+0*H_In2] = X0; // In2[i*W_In2+2*Col+0];
-			BufferColIn2[i+1*H_In2] = X1; // In2[i*W_In2+2*Col+1];
-		}
-                gap_waitbarrier(0);
-                for (Line=First; Line<Last; Line++) {
-                        v4s *VIn1 = (v4s *) (&In1[Line*W_In1 + 0]);
-                        int S0 = 0, S1=S0;
-                        for (i=0; i<(W_In1/4); i++) {
-				v4s V0 = VIn1[i];
-                                S0 = gap_sumdotp4(V0, VBuff0[i], S0);
-                                S1 = gap_sumdotp4(V0, VBuff1[i], S1);
-                        }
-                        for (i=(W_In1/4)*4; i<W_In1; i++) {
-				int V0 = In1[Line*W_In1 + i];
-				S0 += V0 * BufferColIn2[i];
-				S1 += V0 * BufferColIn2[i+1*H_In2];
-			}
-			Out[(Line+OffLine)*W_Out+2*Col+0+OffCol] = AT_CLIP_POS_IMM(AT_SCALE(S0, Scale, ScaleN), 7);
-			Out[(Line+OffLine)*W_Out+2*Col+1+OffCol] = AT_CLIP_POS_IMM(AT_SCALE(S1, Scale, ScaleN), 7);
-                }
-                gap_waitbarrier(0);
-	}
-	if (W_In2&0x1) {
-		Col = W_In2-1;
-                for (i=F;i<L; i++) {
-			int X0 = In2[i*W_In2+1*Col+0];
-			BufferColIn2[i+0*H_In2] = X0; // In2[i*W_In2+4*Col+0];
-		}
-                gap_waitbarrier(0);
-                for (Line=First; Line<Last; Line++) {
-                        v4s *VIn1 = (v4s *) (&In1[Line*W_In1 + 0]);
-                        int S0 = 0;
-                        for (i=0; i<(W_In1/4); i++) {
-				v4s V0 = VIn1[i];
-                                S0 = gap_sumdotp4(V0, VBuff0[i], S0);
-                        }
-                        for (i=(W_In1/4)*4; i<W_In1; i++) {
-				int V0 = In1[Line*W_In1 + i];
-				S0 += V0 * BufferColIn2[i];
-			}
-			Out[(Line+OffLine)*W_Out+1*Col+0+OffCol] = AT_CLIP_POS_IMM(AT_SCALE(S0, Scale, ScaleN), 7);
-                }
-                gap_waitbarrier(0);
-	}
-}
-
-void KerParMatMulNoBias_2x4_ReLUN_PL_SQ8(KerMatMul_PL_SQ8_T *Arg)
-
-{
-	/*
-	 	Column buffer has to be sized in order to be able to accomodate up to 4 columns of size H_In2
-	*/
-        signed char * __restrict__ In1 = Arg->In1;
-        unsigned int W_In1 = Arg->W_In1;
-        unsigned int H_In1 = Arg->H_In1;
-        signed char * __restrict__ In2 = Arg->In2;
-        unsigned int W_In2 = Arg->W_In2;
-        signed char * __restrict__ Out = Arg->Out;
-        unsigned int W_Out = Arg->W_Out;
-        unsigned int OutFirstCol = Arg->OutFirstCol;
-        signed char * __restrict__ BufferColIn2 = Arg->BufferColIn2;
-        int ColFirst = Arg->ColFirst;
-        int A0 = Arg->Infos[AT_INF_A0];
-	unsigned char Scale = (unsigned char) Arg->Infos[AT_INF_OUTSCALE];
-	unsigned char ScaleN = (unsigned char) Arg->Infos[AT_INF_OUTSCALEN];
-
-        unsigned int H_In2 = W_In1;
-        unsigned int H_Out = H_In1;
-        unsigned int Line, Col, i;
-        v4s * __restrict__ VBuff0 = (v4s *) BufferColIn2;
-        v4s * __restrict__ VBuff1 = (v4s *) (BufferColIn2+H_In2);
-        v4s * __restrict__ VBuff2 = (v4s *) (BufferColIn2+2*H_In2);
-        v4s * __restrict__ VBuff3 = (v4s *) (BufferColIn2+3*H_In2);
-
-        unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(H_In1), First = CoreId*ChunkCell, Last  = Min(H_In1, First+ChunkCell);
-        unsigned int Iter = (Last>First)?(Last-First):0;
-        unsigned int C = ChunkSize(H_In2), F = CoreId*C, L  = Min(H_In2, F+C);
-        int OffLine = 0, OffCol = 0;
-
-        if (ColFirst) OffLine = OutFirstCol; else OffCol = OutFirstCol;
-        for (Col=0; Col<W_In2/4; Col++) {
-                for (i=F;i<L; i++) {
-			int X0 = In2[i*W_In2+4*Col+0];
-			int X1 = In2[i*W_In2+4*Col+1];
-			int X2 = In2[i*W_In2+4*Col+2];
-			int X3 = In2[i*W_In2+4*Col+3];
-			BufferColIn2[i+0*H_In2] = X0; // In2[i*W_In2+4*Col+0];
-			BufferColIn2[i+1*H_In2] = X1; // In2[i*W_In2+4*Col+1];
-			BufferColIn2[i+2*H_In2] = X2; // In2[i*W_In2+4*Col+2];
-			BufferColIn2[i+3*H_In2] = X3; // In2[i*W_In2+4*Col+3];
-		}
-                gap_waitbarrier(0);
-                for (Line=0; Line<Iter/2; Line++) {
-                	int l1 = 2*Line + First;
-                        v4s *VIn1 = (v4s *) (&In1[(l1)*W_In1 + 0]);
-                        v4s *VIn2 = (v4s *) (&In1[(l1+1)*W_In1 + 0]);
-                        int S0 = 0, S1=S0, S2=S0, S3=S0;
-                        int S4 = 0, S5=S4, S6=S4, S7=S4;
-                        for (i=0; i<(W_In1/4); i++) {
-				v4s V0 = VIn1[i];
-				v4s A = VBuff0[i];
-				v4s B = VBuff1[i];
-				v4s C = VBuff2[i];
-				v4s D = VBuff3[i];
-                                S0 = gap_sumdotp4(V0, A, S0);
-                                S1 = gap_sumdotp4(V0, B, S1);
-                                S2 = gap_sumdotp4(V0, C, S2);
-                                S3 = gap_sumdotp4(V0, D, S3);
-				v4s V1 = VIn2[i];
-                                S4 = gap_sumdotp4(V1, A, S4);
-                                S5 = gap_sumdotp4(V1, B, S5);
-                                S6 = gap_sumdotp4(V1, C, S6);
-                                S7 = gap_sumdotp4(V1, D, S7);
-                        }
-                        for (i=(W_In1/4)*4; i<W_In1; i++) {
-				int V0 = In1[l1*W_In1 + i];
-				S0 += V0 * BufferColIn2[i];
-				S1 += V0 * BufferColIn2[i+1*H_In2];
-				S2 += V0 * BufferColIn2[i+2*H_In2];
-				S3 += V0 * BufferColIn2[i+3*H_In2];
-				int V1 = In1[(l1+1)*W_In1 + i];
-				S4 += V1 * BufferColIn2[i];
-				S5 += V1 * BufferColIn2[i+1*H_In2];
-				S6 += V1 * BufferColIn2[i+2*H_In2];
-				S7 += V1 * BufferColIn2[i+3*H_In2];
-			}
-			v4s R1 = gap_pack4(gap_clip(AT_CLIP_POS(AT_SCALE(S0, Scale, ScaleN), A0), 7), gap_clip(AT_CLIP_POS(AT_SCALE(S1, Scale, ScaleN), A0), 7),
-					   gap_clip(AT_CLIP_POS(AT_SCALE(S2, Scale, ScaleN), A0), 7), gap_clip(AT_CLIP_POS(AT_SCALE(S3, Scale, ScaleN), A0), 7));
-			v4s R2 = gap_pack4(gap_clip(AT_CLIP_POS(AT_SCALE(S0, Scale, ScaleN), A0), 7), gap_clip(AT_CLIP_POS(AT_SCALE(S1, Scale, ScaleN), A0), 7),
-					   gap_clip(AT_CLIP_POS(AT_SCALE(S2, Scale, ScaleN), A0), 7), gap_clip(AT_CLIP_POS(AT_SCALE(S3, Scale, ScaleN), A0), 7));
-			*((v4s *) (Out+(l1+OffLine)*W_Out+4*Col+0+OffCol)) = R1;
-			*((v4s *) (Out+(l1+OffLine+1)*W_Out+4*Col+0+OffCol)) = R2;
-                }
-		if (Iter&0x1) {
-			int l1 = Last - 1;
-			v4s *VIn1 = (v4s *) (&In1[l1*W_In1 + 0]);
-			int S0 = 0, S1=S0, S2=S0, S3=S0;
-			for (i=0; i<(W_In1/4); i++) {
-				v4s V0 = VIn1[i];
-				v4s A = VBuff0[i]; S0 = gap_sumdotp4(V0, A, S0);;
-				v4s B = VBuff1[i]; S1 = gap_sumdotp4(V0, B, S1);;
-				v4s C = VBuff2[i]; S2 = gap_sumdotp4(V0, C, S2);;
-				v4s D = VBuff3[i]; S3 = gap_sumdotp4(V0, D, S3);;
-			}
-                        for (i=(W_In1/4)*4; i<W_In1; i++) {
-				int V0 = In1[l1*W_In1 + i];
-				S0 += V0 * BufferColIn2[i];
-				S1 += V0 * BufferColIn2[i+1*H_In2];
-				S2 += V0 * BufferColIn2[i+2*H_In2];
-				S3 += V0 * BufferColIn2[i+3*H_In2];
-			}
-			v4s R1 = gap_pack4(gap_clip(AT_CLIP_POS(AT_SCALE(S0, Scale, ScaleN), A0), 7), gap_clip(AT_CLIP_POS(AT_SCALE(S1, Scale, ScaleN), A0), 7),
-					   gap_clip(AT_CLIP_POS(AT_SCALE(S2, Scale, ScaleN), A0), 7), gap_clip(AT_CLIP_POS(AT_SCALE(S3, Scale, ScaleN), A0), 7));
-			*((v4s *) (Out+(l1+OffLine)*W_Out+4*Col+0+OffCol)) = R1;
-		}
-                gap_waitbarrier(0);
-        }
-	if (W_In2&0x2) {
-		Col = W_In2/2 - 1;
-                for (i=F;i<L; i++) {
-			int X0 = In2[i*W_In2+2*Col+0];
-			int X1 = In2[i*W_In2+2*Col+1];
-			BufferColIn2[i+0*H_In2] = X0; // In2[i*W_In2+2*Col+0];
-			BufferColIn2[i+1*H_In2] = X1; // In2[i*W_In2+2*Col+1];
-		}
-                gap_waitbarrier(0);
-                for (Line=First; Line<Last; Line++) {
-                        v4s *VIn1 = (v4s *) (&In1[Line*W_In1 + 0]);
-                        int S0 = 0, S1=S0;
-                        for (i=0; i<(W_In1/4); i++) {
-				v4s V0 = VIn1[i];
-                                S0 = gap_sumdotp4(V0, VBuff0[i], S0);
-                                S1 = gap_sumdotp4(V0, VBuff1[i], S1);
-                        }
-                        for (i=(W_In1/4)*4; i<W_In1; i++) {
-				int V0 = In1[Line*W_In1 + i];
-				S0 += V0 * BufferColIn2[i];
-				S1 += V0 * BufferColIn2[i+1*H_In2];
-			}
-			Out[(Line+OffLine)*W_Out+2*Col+0+OffCol] = gap_clip(AT_CLIP_POS(AT_SCALE(S0, Scale, ScaleN), A0), 7);
-			Out[(Line+OffLine)*W_Out+2*Col+1+OffCol] = gap_clip(AT_CLIP_POS(AT_SCALE(S1, Scale, ScaleN), A0), 7);
-                }
-                gap_waitbarrier(0);
-	}
-	if (W_In2&0x1) {
-		Col = W_In2-1;
-                for (i=F;i<L; i++) {
-			int X0 = In2[i*W_In2+1*Col+0];
-			BufferColIn2[i+0*H_In2] = X0; // In2[i*W_In2+4*Col+0];
-		}
-                gap_waitbarrier(0);
-                for (Line=First; Line<Last; Line++) {
-                        v4s *VIn1 = (v4s *) (&In1[Line*W_In1 + 0]);
-                        int S0 = 0;
-                        for (i=0; i<(W_In1/4); i++) {
-				v4s V0 = VIn1[i];
-                                S0 = gap_sumdotp4(V0, VBuff0[i], S0);
-                        }
-                        for (i=(W_In1/4)*4; i<W_In1; i++) {
-				int V0 = In1[Line*W_In1 + i];
-				S0 += V0 * BufferColIn2[i];
-			}
-			Out[(Line+OffLine)*W_Out+1*Col+0+OffCol] = gap_clip(AT_CLIP_POS(AT_SCALE(S0, Scale, ScaleN), A0), 7);
-                }
-                gap_waitbarrier(0);
-	}
-}
-
-
-void KerParMatMulB32_2x4_PL_SQ8(KerMatMul_PL_SQ8_T *Arg)
-
-{
-	/*
-	 	Column buffer has to be sized in order to be able to accomodate up to 4 columns of size H_In2
-	*/
-        signed char * __restrict__ In1 = Arg->In1;
-        unsigned int W_In1 = Arg->W_In1;
-        unsigned int H_In1 = Arg->H_In1;
-        signed char * __restrict__ In2 = Arg->In2;
-        unsigned int W_In2 = Arg->W_In2;
-        signed char * __restrict__ Out = Arg->Out;
-	int * __restrict__ Bias = Arg->Bias;
-	unsigned int NormBias = Arg->NormBias;
-        unsigned int W_Out = Arg->W_Out;
-	unsigned char Scale = (unsigned char) Arg->Infos[AT_INF_OUTSCALE];
-	unsigned char ScaleN = (unsigned char) Arg->Infos[AT_INF_OUTSCALEN];
-        unsigned int OutFirstCol = Arg->OutFirstCol;
-        signed char * __restrict__ BufferColIn2 = Arg->BufferColIn2;
-        int ColFirst = Arg->ColFirst;
-
-        unsigned int H_In2 = W_In1;
-        unsigned int H_Out = H_In1;
-        unsigned int Line, Col, i;
-        v4s * __restrict__ VBuff0 = (v4s *) BufferColIn2;
-        v4s * __restrict__ VBuff1 = (v4s *) (BufferColIn2+H_In2);
-        v4s * __restrict__ VBuff2 = (v4s *) (BufferColIn2+2*H_In2);
-        v4s * __restrict__ VBuff3 = (v4s *) (BufferColIn2+3*H_In2);
-
-        unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(H_In1), First = CoreId*ChunkCell, Last  = Min(H_In1, First+ChunkCell);
-        unsigned int Iter = (Last>First)?(Last-First):0;
-        unsigned int C = ChunkSize(H_In2), F = CoreId*C, L  = Min(H_In2, F+C);
-        int OffLine = 0, OffCol = 0;
-
-        if (ColFirst) OffLine = OutFirstCol; else OffCol = OutFirstCol;
-        for (Col=0; Col<W_In2/4; Col++) {
-                for (i=F;i<L; i++) {
-			int X0 = In2[i*W_In2+4*Col+0];
-			int X1 = In2[i*W_In2+4*Col+1];
-			int X2 = In2[i*W_In2+4*Col+2];
-			int X3 = In2[i*W_In2+4*Col+3];
-			BufferColIn2[i+0*H_In2] = X0; // In2[i*W_In2+4*Col+0];
-			BufferColIn2[i+1*H_In2] = X1; // In2[i*W_In2+4*Col+1];
-			BufferColIn2[i+2*H_In2] = X2; // In2[i*W_In2+4*Col+2];
-			BufferColIn2[i+3*H_In2] = X3; // In2[i*W_In2+4*Col+3];
-		}
-                gap_waitbarrier(0);
-                for (Line=0; Line<Iter/2; Line++) {
-                	int l1 = 2*Line + First;
-                        v4s *VIn1 = (v4s *) (&In1[(l1)*W_In1 + 0]);
-                        v4s *VIn2 = (v4s *) (&In1[(l1+1)*W_In1 + 0]);
-                        int S0=0, S1=0, S2=0, S3=0, S4=0, S5=0, S6=0, S7=0;
-                        if (Bias) {
-		                S0 = (Bias[4*Col]<<NormBias), S1 = (Bias[4*Col+1]<<NormBias), S2 = (Bias[4*Col+2]<<NormBias), S3 = (Bias[4*Col+3]<<NormBias);
-		                S4 = (Bias[4*Col]<<NormBias), S5 = (Bias[4*Col+1]<<NormBias), S6 = (Bias[4*Col+2]<<NormBias), S7 = (Bias[4*Col+3]<<NormBias);
-                	}
-                        for (i=0; i<(W_In1/4); i++) {
-				v4s V0 = VIn1[i];
-				v4s A = VBuff0[i];
-				v4s B = VBuff1[i];
-				v4s C = VBuff2[i];
-				v4s D = VBuff3[i];
-                                S0 = gap_sumdotp4(V0, A, S0);
-                                S1 = gap_sumdotp4(V0, B, S1);
-                                S2 = gap_sumdotp4(V0, C, S2);
-                                S3 = gap_sumdotp4(V0, D, S3);
-				v4s V1 = VIn2[i];
-                                S4 = gap_sumdotp4(V1, A, S4);
-                                S5 = gap_sumdotp4(V1, B, S5);
-                                S6 = gap_sumdotp4(V1, C, S6);
-                                S7 = gap_sumdotp4(V1, D, S7);
-                        }
-                        for (i=(W_In1/4)*4; i<W_In1; i++) {
-				int V0 = In1[l1*W_In1 + i];
-				S0 += V0 * BufferColIn2[i];
-				S1 += V0 * BufferColIn2[i+1*H_In2];
-				S2 += V0 * BufferColIn2[i+2*H_In2];
-				S3 += V0 * BufferColIn2[i+3*H_In2];
-				int V1 = In1[(l1+1)*W_In1 + i];
-				S4 += V1 * BufferColIn2[i];
-				S5 += V1 * BufferColIn2[i+1*H_In2];
-				S6 += V1 * BufferColIn2[i+2*H_In2];
-				S7 += V1 * BufferColIn2[i+3*H_In2];
-			}
-			v4s R1 = gap_pack4(gap_clip(AT_SCALE(S0, Scale, ScaleN), 7), gap_clip(AT_SCALE(S1, Scale, ScaleN), 7),
-					   gap_clip(AT_SCALE(S2, Scale, ScaleN), 7), gap_clip(AT_SCALE(S3, Scale, ScaleN), 7));
-			v4s R2 = gap_pack4(gap_clip(AT_SCALE(S4, Scale, ScaleN), 7), gap_clip(AT_SCALE(S5, Scale, ScaleN), 7),
-					   gap_clip(AT_SCALE(S6, Scale, ScaleN), 7), gap_clip(AT_SCALE(S7, Scale, ScaleN), 7));
-			*((v4s *) (Out+(l1+OffLine)*W_Out+4*Col+0+OffCol)) = R1;
-			*((v4s *) (Out+(l1+OffLine+1)*W_Out+4*Col+0+OffCol)) = R2;
-                }
-		if (Iter&0x1) {
-			int l1 = Last - 1;
-			v4s *VIn1 = (v4s *) (&In1[l1*W_In1 + 0]);
-			int S0=0, S1=0, S2=0, S3=0;
-                        if (Bias) {
-		                S0 = (Bias[4*Col]<<NormBias), S1 = (Bias[4*Col+1]<<NormBias), S2 = (Bias[4*Col+2]<<NormBias), S3 = (Bias[4*Col+3]<<NormBias);
-                	}
-			for (i=0; i<(W_In1/4); i++) {
-				v4s V0 = VIn1[i];
-				v4s A = VBuff0[i]; S0 = gap_sumdotp4(V0, A, S0);;
-				v4s B = VBuff1[i]; S1 = gap_sumdotp4(V0, B, S1);;
-				v4s C = VBuff2[i]; S2 = gap_sumdotp4(V0, C, S2);;
-				v4s D = VBuff3[i]; S3 = gap_sumdotp4(V0, D, S3);;
-			}
-                        for (i=(W_In1/4)*4; i<W_In1; i++) {
-				int V0 = In1[l1*W_In1 + i];
-				S0 += V0 * BufferColIn2[i];
-				S1 += V0 * BufferColIn2[i+1*H_In2];
-				S2 += V0 * BufferColIn2[i+2*H_In2];
-				S3 += V0 * BufferColIn2[i+3*H_In2];
-			}
-			v4s R1 = gap_pack4(gap_clip(AT_SCALE(S0, Scale, ScaleN), 7), gap_clip(AT_SCALE(S1, Scale, ScaleN), 7),
-					   gap_clip(AT_SCALE(S2, Scale, ScaleN), 7), gap_clip(AT_SCALE(S3, Scale, ScaleN), 7));
-			*((v4s *) (Out+(l1+OffLine)*W_Out+4*Col+0+OffCol)) = R1;
-		}
-                gap_waitbarrier(0);
-        }
-	if (W_In2&0x2) {
-		Col = W_In2/2 - 1;
-                for (i=F;i<L; i++) {
-			int X0 = In2[i*W_In2+2*Col+0];
-			int X1 = In2[i*W_In2+2*Col+1];
-			BufferColIn2[i+0*H_In2] = X0; // In2[i*W_In2+2*Col+0];
-			BufferColIn2[i+1*H_In2] = X1; // In2[i*W_In2+2*Col+1];
-		}
-                gap_waitbarrier(0);
-                for (Line=First; Line<Last; Line++) {
-                        v4s *VIn1 = (v4s *) (&In1[Line*W_In1 + 0]);
-                        int S0=0, S1=0;
-                        if (Bias) {
-		                S0 = (Bias[4*Col]<<NormBias), S1 = (Bias[4*Col+1]<<NormBias);
-                	}
-                        for (i=0; i<(W_In1/4); i++) {
-				v4s V0 = VIn1[i];
-                                S0 = gap_sumdotp4(V0, VBuff0[i], S0);
-                                S1 = gap_sumdotp4(V0, VBuff1[i], S1);
-                        }
-                        for (i=(W_In1/4)*4; i<W_In1; i++) {
-				int V0 = In1[Line*W_In1 + i];
-				S0 += V0 * BufferColIn2[i];
-				S1 += V0 * BufferColIn2[i+1*H_In2];
-			}
-			Out[(Line+OffLine)*W_Out+2*Col  +OffCol] = gap_clip(AT_SCALE(S0, Scale, ScaleN), 7);
-			Out[(Line+OffLine)*W_Out+2*Col+1+OffCol] = gap_clip(AT_SCALE(S1, Scale, ScaleN), 7);
-                }
-                gap_waitbarrier(0);
-	}
-	if (W_In2&0x1) {
-		Col = W_In2-1;
-                for (i=F;i<L; i++) {
-			int X0 = In2[i*W_In2+1*Col+0];
-			BufferColIn2[i+0*H_In2] = X0; // In2[i*W_In2+4*Col+0];
-		}
-                gap_waitbarrier(0);
-                for (Line=First; Line<Last; Line++) {
-                        v4s *VIn1 = (v4s *) (&In1[Line*W_In1 + 0]);
-                        int S0=Bias?(Bias[Col]<<NormBias):0;
-                        for (i=0; i<(W_In1/4); i++) {
-				v4s V0 = VIn1[i];
-                                S0 = gap_sumdotp4(V0, VBuff0[i], S0);
-                        }
-                        for (i=(W_In1/4)*4; i<W_In1; i++) {
-				int V0 = In1[Line*W_In1 + i];
-				S0 += V0 * BufferColIn2[i];
-			}
-			Out[(Line+OffLine)*W_Out+1*Col+0+OffCol] = gap_clip(AT_SCALE(S0, Scale, ScaleN), 7);
-                }
-                gap_waitbarrier(0);
-	}
-}
-
-void KerParMatMulB32_2x4_ReLU_PL_SQ8(KerMatMul_PL_SQ8_T *Arg)
-
-{
-	/*
-	 	Column buffer has to be sized in order to be able to accomodate up to 4 columns of size H_In2
-	*/
-        signed char * __restrict__ In1 = Arg->In1;
-        unsigned int W_In1 = Arg->W_In1;
-        unsigned int H_In1 = Arg->H_In1;
-        signed char * __restrict__ In2 = Arg->In2;
-        unsigned int W_In2 = Arg->W_In2;
-        signed char * __restrict__ Out = Arg->Out;
-	int * __restrict__ Bias = Arg->Bias;
-	unsigned int NormBias = Arg->NormBias;
-        unsigned int W_Out = Arg->W_Out;
-        unsigned int OutFirstCol = Arg->OutFirstCol;
-        signed char * __restrict__ BufferColIn2 = Arg->BufferColIn2;
-        int ColFirst = Arg->ColFirst;
-	unsigned char Scale = (unsigned char) Arg->Infos[AT_INF_OUTSCALE];
-	unsigned char ScaleN = (unsigned char) Arg->Infos[AT_INF_OUTSCALEN];
-
-        unsigned int H_In2 = W_In1;
-        unsigned int H_Out = H_In1;
-        unsigned int Line, Col, i;
-        v4s * __restrict__ VBuff0 = (v4s *) BufferColIn2;
-        v4s * __restrict__ VBuff1 = (v4s *) (BufferColIn2+H_In2);
-        v4s * __restrict__ VBuff2 = (v4s *) (BufferColIn2+2*H_In2);
-        v4s * __restrict__ VBuff3 = (v4s *) (BufferColIn2+3*H_In2);
-
-        unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(H_In1), First = CoreId*ChunkCell, Last  = Min(H_In1, First+ChunkCell);
-        unsigned int Iter = (Last>First)?(Last-First):0;
-        unsigned int C = ChunkSize(H_In2), F = CoreId*C, L  = Min(H_In2, F+C);
-        int OffLine = 0, OffCol = 0;
-
-        if (ColFirst) OffLine = OutFirstCol; else OffCol = OutFirstCol;
-        for (Col=0; Col<W_In2/4; Col++) {
-                for (i=F;i<L; i++) {
-			int X0 = In2[i*W_In2+4*Col+0];
-			int X1 = In2[i*W_In2+4*Col+1];
-			int X2 = In2[i*W_In2+4*Col+2];
-			int X3 = In2[i*W_In2+4*Col+3];
-			BufferColIn2[i+0*H_In2] = X0; // In2[i*W_In2+4*Col+0];
-			BufferColIn2[i+1*H_In2] = X1; // In2[i*W_In2+4*Col+1];
-			BufferColIn2[i+2*H_In2] = X2; // In2[i*W_In2+4*Col+2];
-			BufferColIn2[i+3*H_In2] = X3; // In2[i*W_In2+4*Col+3];
-		}
-                gap_waitbarrier(0);
-                for (Line=0; Line<Iter/2; Line++) {
-                	int l1 = 2*Line + First;
-                        v4s *VIn1 = (v4s *) (&In1[(l1)*W_In1 + 0]);
-                        v4s *VIn2 = (v4s *) (&In1[(l1+1)*W_In1 + 0]);
-			int S0=0, S1=0, S2=0, S3=0, S4=0, S5=0, S6=0, S7=0;
-                        if (Bias) {
-		                S0 = (Bias[4*Col]<<NormBias), S1 = (Bias[4*Col+1]<<NormBias), S2 = (Bias[4*Col+2]<<NormBias), S3 = (Bias[4*Col+3]<<NormBias);
-		                S4 = (Bias[4*Col]<<NormBias), S5 = (Bias[4*Col+1]<<NormBias), S6 = (Bias[4*Col+2]<<NormBias), S7 = (Bias[4*Col+3]<<NormBias);
-                	}
-                        for (i=0; i<(W_In1/4); i++) {
-				v4s V0 = VIn1[i];
-				v4s A = VBuff0[i];
-				v4s B = VBuff1[i];
-				v4s C = VBuff2[i];
-				v4s D = VBuff3[i];
-                                S0 = gap_sumdotp4(V0, A, S0);
-                                S1 = gap_sumdotp4(V0, B, S1);
-                                S2 = gap_sumdotp4(V0, C, S2);
-                                S3 = gap_sumdotp4(V0, D, S3);
-				v4s V1 = VIn2[i];
-                                S4 = gap_sumdotp4(V1, A, S4);
-                                S5 = gap_sumdotp4(V1, B, S5);
-                                S6 = gap_sumdotp4(V1, C, S6);
-                                S7 = gap_sumdotp4(V1, D, S7);
-                        }
-                        for (i=(W_In1/4)*4; i<W_In1; i++) {
-				int V0 = In1[l1*W_In1 + i];
-				S0 += V0 * BufferColIn2[i];
-				S1 += V0 * BufferColIn2[i+1*H_In2];
-				S2 += V0 * BufferColIn2[i+2*H_In2];
-				S3 += V0 * BufferColIn2[i+3*H_In2];
-				int V1 = In1[(l1+1)*W_In1 + i];
-				S4 += V1 * BufferColIn2[i];
-				S5 += V1 * BufferColIn2[i+1*H_In2];
-				S6 += V1 * BufferColIn2[i+2*H_In2];
-				S7 += V1 * BufferColIn2[i+3*H_In2];
-			}
-			v4s R1 = gap_pack4(AT_CLIP_POS_IMM(AT_SCALE(S0, Scale, ScaleN), 7), AT_CLIP_POS_IMM(AT_SCALE(S1, Scale, ScaleN), 7),
-					   AT_CLIP_POS_IMM(AT_SCALE(S2, Scale, ScaleN), 7), AT_CLIP_POS_IMM(AT_SCALE(S3, Scale, ScaleN), 7));
-			v4s R2 = gap_pack4(AT_CLIP_POS_IMM(AT_SCALE(S4, Scale, ScaleN), 7), AT_CLIP_POS_IMM(AT_SCALE(S5, Scale, ScaleN), 7),
-					   AT_CLIP_POS_IMM(AT_SCALE(S6, Scale, ScaleN), 7), AT_CLIP_POS_IMM(AT_SCALE(S7, Scale, ScaleN), 7));
-			*((v4s *) (Out+(l1+OffLine)*W_Out+4*Col+0+OffCol)) = R1;
-			*((v4s *) (Out+(l1+OffLine+1)*W_Out+4*Col+0+OffCol)) = R2;
-                }
-		if (Iter&0x1) {
-			int l1 = Last - 1;
-			v4s *VIn1 = (v4s *) (&In1[l1*W_In1 + 0]);
-                        int S0=0, S1=0, S2=0, S3=0;
-                        if (Bias) {
-		                S0 = (Bias[4*Col]<<NormBias), S1 = (Bias[4*Col+1]<<NormBias), S2 = (Bias[4*Col+2]<<NormBias), S3 = (Bias[4*Col+3]<<NormBias);
-                	}
-			for (i=0; i<(W_In1/4); i++) {
-				v4s V0 = VIn1[i];
-				v4s A = VBuff0[i]; S0 = gap_sumdotp4(V0, A, S0);;
-				v4s B = VBuff1[i]; S1 = gap_sumdotp4(V0, B, S1);;
-				v4s C = VBuff2[i]; S2 = gap_sumdotp4(V0, C, S2);;
-				v4s D = VBuff3[i]; S3 = gap_sumdotp4(V0, D, S3);;
-			}
-                        for (i=(W_In1/4)*4; i<W_In1; i++) {
-				int V0 = In1[l1*W_In1 + i];
-				S0 += V0 * BufferColIn2[i];
-				S1 += V0 * BufferColIn2[i+1*H_In2];
-				S2 += V0 * BufferColIn2[i+2*H_In2];
-				S3 += V0 * BufferColIn2[i+3*H_In2];
-			}
-			v4s R1 = gap_pack4(AT_CLIP_POS_IMM(AT_SCALE(S0, Scale, ScaleN), 7), AT_CLIP_POS_IMM(AT_SCALE(S1, Scale, ScaleN), 7),
-					   AT_CLIP_POS_IMM(AT_SCALE(S2, Scale, ScaleN), 7), AT_CLIP_POS_IMM(AT_SCALE(S3, Scale, ScaleN), 7));
-			*((v4s *) (Out+(l1+OffLine)*W_Out+4*Col+0+OffCol)) = R1;
-		}
-                gap_waitbarrier(0);
-        }
-	if (W_In2&0x2) {
-		Col = W_In2/2 - 1;
-                for (i=F;i<L; i++) {
-			int X0 = In2[i*W_In2+2*Col+0];
-			int X1 = In2[i*W_In2+2*Col+1];
-			BufferColIn2[i+0*H_In2] = X0; // In2[i*W_In2+2*Col+0];
-			BufferColIn2[i+1*H_In2] = X1; // In2[i*W_In2+2*Col+1];
-		}
-                gap_waitbarrier(0);
-                for (Line=First; Line<Last; Line++) {
-                        v4s *VIn1 = (v4s *) (&In1[Line*W_In1 + 0]);
-                        int S0=0, S1=0;
-                        if (Bias) {
-		                S0 = (Bias[4*Col]<<NormBias), S1 = (Bias[4*Col+1]<<NormBias);
-                	}
-                        for (i=0; i<(W_In1/4); i++) {
-				v4s V0 = VIn1[i];
-                                S0 = gap_sumdotp4(V0, VBuff0[i], S0);
-                                S1 = gap_sumdotp4(V0, VBuff1[i], S1);
-                        }
-                        for (i=(W_In1/4)*4; i<W_In1; i++) {
-				int V0 = In1[Line*W_In1 + i];
-				S0 += V0 * BufferColIn2[i];
-				S1 += V0 * BufferColIn2[i+1*H_In2];
-			}
-			Out[(Line+OffLine)*W_Out+2*Col+0+OffCol] = AT_CLIP_POS_IMM(AT_SCALE(S0, Scale, ScaleN), 7);
-			Out[(Line+OffLine)*W_Out+2*Col+1+OffCol] = AT_CLIP_POS_IMM(AT_SCALE(S1, Scale, ScaleN), 7);
-                }
-                gap_waitbarrier(0);
-	}
-	if (W_In2&0x1) {
-		Col = W_In2-1;
-                for (i=F;i<L; i++) {
-			int X0 = In2[i*W_In2+1*Col+0];
-			BufferColIn2[i+0*H_In2] = X0; // In2[i*W_In2+4*Col+0];
-		}
-                gap_waitbarrier(0);
-                for (Line=First; Line<Last; Line++) {
-                        v4s *VIn1 = (v4s *) (&In1[Line*W_In1 + 0]);
-                        int S0=Bias?(Bias[Col]<<NormBias):0;
-                        for (i=0; i<(W_In1/4); i++) {
-				v4s V0 = VIn1[i];
-                                S0 = gap_sumdotp4(V0, VBuff0[i], S0);
-                        }
-                        for (i=(W_In1/4)*4; i<W_In1; i++) {
-				int V0 = In1[Line*W_In1 + i];
-				S0 += V0 * BufferColIn2[i];
-			}
-			Out[(Line+OffLine)*W_Out+1*Col+0+OffCol] = AT_CLIP_POS_IMM(AT_SCALE(S0, Scale, ScaleN), 7);
-                }
-                gap_waitbarrier(0);
-	}
-}
-
-void KerParMatMulB32_2x4_ReLUN_PL_SQ8(KerMatMul_PL_SQ8_T *Arg)
-
-{
-	/*
-	 	Column buffer has to be sized in order to be able to accomodate up to 4 columns of size H_In2
-	*/
-        signed char * __restrict__ In1 = Arg->In1;
-        unsigned int W_In1 = Arg->W_In1;
-        unsigned int H_In1 = Arg->H_In1;
-        signed char * __restrict__ In2 = Arg->In2;
-        unsigned int W_In2 = Arg->W_In2;
-        signed char * __restrict__ Out = Arg->Out;
-	int * __restrict__ Bias = Arg->Bias;
-	unsigned int NormBias = Arg->NormBias;
-        unsigned int W_Out = Arg->W_Out;
-        unsigned int OutFirstCol = Arg->OutFirstCol;
-        signed char * __restrict__ BufferColIn2 = Arg->BufferColIn2;
-        int ColFirst = Arg->ColFirst;
-        int A0 = Arg->Infos[AT_INF_A0];
-	unsigned char Scale = (unsigned char) Arg->Infos[AT_INF_OUTSCALE];
-	unsigned char ScaleN = (unsigned char) Arg->Infos[AT_INF_OUTSCALEN];
-
-        unsigned int H_In2 = W_In1;
-        unsigned int H_Out = H_In1;
-        unsigned int Line, Col, i;
-        v4s * __restrict__ VBuff0 = (v4s *) BufferColIn2;
-        v4s * __restrict__ VBuff1 = (v4s *) (BufferColIn2+H_In2);
-        v4s * __restrict__ VBuff2 = (v4s *) (BufferColIn2+2*H_In2);
-        v4s * __restrict__ VBuff3 = (v4s *) (BufferColIn2+3*H_In2);
-
-        unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(H_In1), First = CoreId*ChunkCell, Last  = Min(H_In1, First+ChunkCell);
-        unsigned int Iter = (Last>First)?(Last-First):0;
-        unsigned int C = ChunkSize(H_In2), F = CoreId*C, L  = Min(H_In2, F+C);
-        int OffLine = 0, OffCol = 0;
-
-        if (ColFirst) OffLine = OutFirstCol; else OffCol = OutFirstCol;
-        for (Col=0; Col<W_In2/4; Col++) {
-                for (i=F;i<L; i++) {
-			int X0 = In2[i*W_In2+4*Col+0];
-			int X1 = In2[i*W_In2+4*Col+1];
-			int X2 = In2[i*W_In2+4*Col+2];
-			int X3 = In2[i*W_In2+4*Col+3];
-			BufferColIn2[i+0*H_In2] = X0; // In2[i*W_In2+4*Col+0];
-			BufferColIn2[i+1*H_In2] = X1; // In2[i*W_In2+4*Col+1];
-			BufferColIn2[i+2*H_In2] = X2; // In2[i*W_In2+4*Col+2];
-			BufferColIn2[i+3*H_In2] = X3; // In2[i*W_In2+4*Col+3];
-		}
-                gap_waitbarrier(0);
-                for (Line=0; Line<Iter/2; Line++) {
-                	int l1 = 2*Line + First;
-                        v4s *VIn1 = (v4s *) (&In1[(l1)*W_In1 + 0]);
-                        v4s *VIn2 = (v4s *) (&In1[(l1+1)*W_In1 + 0]);
-			int S0=0, S1=0, S2=0, S3=0, S4=0, S5=0, S6=0, S7=0;
-                        if (Bias) {
-		                S0 = (Bias[4*Col]<<NormBias), S1 = (Bias[4*Col+1]<<NormBias), S2 = (Bias[4*Col+2]<<NormBias), S3 = (Bias[4*Col+3]<<NormBias);
-		                S4 = (Bias[4*Col]<<NormBias), S5 = (Bias[4*Col+1]<<NormBias), S6 = (Bias[4*Col+2]<<NormBias), S7 = (Bias[4*Col+3]<<NormBias);
-                	}
-                        for (i=0; i<(W_In1/4); i++) {
-				v4s V0 = VIn1[i];
-				v4s A = VBuff0[i];
-				v4s B = VBuff1[i];
-				v4s C = VBuff2[i];
-				v4s D = VBuff3[i];
-                                S0 = gap_sumdotp4(V0, A, S0);
-                                S1 = gap_sumdotp4(V0, B, S1);
-                                S2 = gap_sumdotp4(V0, C, S2);
-                                S3 = gap_sumdotp4(V0, D, S3);
-				v4s V1 = VIn2[i];
-                                S4 = gap_sumdotp4(V1, A, S4);
-                                S5 = gap_sumdotp4(V1, B, S5);
-                                S6 = gap_sumdotp4(V1, C, S6);
-                                S7 = gap_sumdotp4(V1, D, S7);
-                        }
-                        for (i=(W_In1/4)*4; i<W_In1; i++) {
-				int V0 = In1[l1*W_In1 + i];
-				S0 += V0 * BufferColIn2[i];
-				S1 += V0 * BufferColIn2[i+1*H_In2];
-				S2 += V0 * BufferColIn2[i+2*H_In2];
-				S3 += V0 * BufferColIn2[i+3*H_In2];
-				int V1 = In1[(l1+1)*W_In1 + i];
-				S4 += V1 * BufferColIn2[i];
-				S5 += V1 * BufferColIn2[i+1*H_In2];
-				S6 += V1 * BufferColIn2[i+2*H_In2];
-				S7 += V1 * BufferColIn2[i+3*H_In2];
-			}
-			v4s R1 = gap_pack4(gap_clip(AT_CLIP_POS(AT_SCALE(S0, Scale, ScaleN), A0), 7), gap_clip(AT_CLIP_POS(AT_SCALE(S1, Scale, ScaleN), A0), 7),
-					   gap_clip(AT_CLIP_POS(AT_SCALE(S2, Scale, ScaleN), A0), 7), gap_clip(AT_CLIP_POS(AT_SCALE(S3, Scale, ScaleN), A0), 7));
-			v4s R2 = gap_pack4(gap_clip(AT_CLIP_POS(AT_SCALE(S0, Scale, ScaleN), A0), 7), gap_clip(AT_CLIP_POS(AT_SCALE(S1, Scale, ScaleN), A0), 7),
-					   gap_clip(AT_CLIP_POS(AT_SCALE(S2, Scale, ScaleN), A0), 7), gap_clip(AT_CLIP_POS(AT_SCALE(S3, Scale, ScaleN), A0), 7));
-			*((v4s *) (Out+(l1+OffLine)*W_Out+4*Col+0+OffCol)) = R1;
-			*((v4s *) (Out+(l1+OffLine+1)*W_Out+4*Col+0+OffCol)) = R2;
-                }
-		if (Iter&0x1) {
-			int l1 = Last - 1;
-			v4s *VIn1 = (v4s *) (&In1[l1*W_In1 + 0]);
-                        int S0=0, S1=0, S2=0, S3=0;
-                        if (Bias) {
-		                S0 = (Bias[4*Col]<<NormBias), S1 = (Bias[4*Col+1]<<NormBias), S2 = (Bias[4*Col+2]<<NormBias), S3 = (Bias[4*Col+3]<<NormBias);
-                	}
-			for (i=0; i<(W_In1/4); i++) {
-				v4s V0 = VIn1[i];
-				v4s A = VBuff0[i]; S0 = gap_sumdotp4(V0, A, S0);;
-				v4s B = VBuff1[i]; S1 = gap_sumdotp4(V0, B, S1);;
-				v4s C = VBuff2[i]; S2 = gap_sumdotp4(V0, C, S2);;
-				v4s D = VBuff3[i]; S3 = gap_sumdotp4(V0, D, S3);;
-			}
-                        for (i=(W_In1/4)*4; i<W_In1; i++) {
-				int V0 = In1[l1*W_In1 + i];
-				S0 += V0 * BufferColIn2[i];
-				S1 += V0 * BufferColIn2[i+1*H_In2];
-				S2 += V0 * BufferColIn2[i+2*H_In2];
-				S3 += V0 * BufferColIn2[i+3*H_In2];
-			}
-			v4s R1 = gap_pack4(gap_clip(AT_CLIP_POS(AT_SCALE(S0, Scale, ScaleN), A0), 7), gap_clip(AT_CLIP_POS(AT_SCALE(S1, Scale, ScaleN), A0), 7),
-					   gap_clip(AT_CLIP_POS(AT_SCALE(S2, Scale, ScaleN), A0), 7), gap_clip(AT_CLIP_POS(AT_SCALE(S3, Scale, ScaleN), A0), 7));
-			*((v4s *) (Out+(l1+OffLine)*W_Out+4*Col+0+OffCol)) = R1;
-		}
-                gap_waitbarrier(0);
-        }
-	if (W_In2&0x2) {
-		Col = W_In2/2 - 1;
-                for (i=F;i<L; i++) {
-			int X0 = In2[i*W_In2+2*Col+0];
-			int X1 = In2[i*W_In2+2*Col+1];
-			BufferColIn2[i+0*H_In2] = X0; // In2[i*W_In2+2*Col+0];
-			BufferColIn2[i+1*H_In2] = X1; // In2[i*W_In2+2*Col+1];
-		}
-                gap_waitbarrier(0);
-                for (Line=First; Line<Last; Line++) {
-                        v4s *VIn1 = (v4s *) (&In1[Line*W_In1 + 0]);
-                        int S0=0, S1=0;
-                        if (Bias) {
-		                S0 = (Bias[4*Col]<<NormBias), S1 = (Bias[4*Col+1]<<NormBias);
-                	}
-                        for (i=0; i<(W_In1/4); i++) {
-				v4s V0 = VIn1[i];
-                                S0 = gap_sumdotp4(V0, VBuff0[i], S0);
-                                S1 = gap_sumdotp4(V0, VBuff1[i], S1);
-                        }
-                        for (i=(W_In1/4)*4; i<W_In1; i++) {
-				int V0 = In1[Line*W_In1 + i];
-				S0 += V0 * BufferColIn2[i];
-				S1 += V0 * BufferColIn2[i+1*H_In2];
-			}
-			Out[(Line+OffLine)*W_Out+2*Col+0+OffCol] = gap_clip(AT_CLIP_POS(AT_SCALE(S0, Scale, ScaleN), A0), 7);
-			Out[(Line+OffLine)*W_Out+2*Col+1+OffCol] = gap_clip(AT_CLIP_POS(AT_SCALE(S1, Scale, ScaleN), A0), 7);
-                }
-                gap_waitbarrier(0);
-	}
-	if (W_In2&0x1) {
-		Col = W_In2-1;
-                for (i=F;i<L; i++) {
-			int X0 = In2[i*W_In2+1*Col+0];
-			BufferColIn2[i+0*H_In2] = X0; // In2[i*W_In2+4*Col+0];
-		}
-                gap_waitbarrier(0);
-                for (Line=First; Line<Last; Line++) {
-                        v4s *VIn1 = (v4s *) (&In1[Line*W_In1 + 0]);
-                        int S0=Bias?(Bias[Col]<<NormBias):0;
-                        for (i=0; i<(W_In1/4); i++) {
-				v4s V0 = VIn1[i];
-                                S0 = gap_sumdotp4(V0, VBuff0[i], S0);
-                        }
-                        for (i=(W_In1/4)*4; i<W_In1; i++) {
-				int V0 = In1[Line*W_In1 + i];
-				S0 += V0 * BufferColIn2[i];
-			}
-			Out[(Line+OffLine)*W_Out+1*Col+0+OffCol] = gap_clip(AT_CLIP_POS(AT_SCALE(S0, Scale, ScaleN), A0), 7);
-                }
-                gap_waitbarrier(0);
-	}
-}
-
-
-void KerParMatMulTransposedB32_SQ8(KerMatMul_SQ8_T *Arg)
-
-{
-	/*
-	 	Column buffer has to be sized in order to be able to accomodate up to 4 columns of size H_In2
-	*/
-        signed char * __restrict__ In1 = Arg->In1;
-        unsigned int W_In1 = Arg->W_In1;
-        unsigned int H_In1 = Arg->H_In1;
-        signed char * __restrict__ In2 = Arg->In2;
-        unsigned int W_In2 = Arg->W_In2;
-        int * __restrict__ Bias = Arg->Bias;
-        signed char * __restrict__ Out = Arg->Out;
-        unsigned int W_Out = Arg->W_Out;
-	unsigned char * __restrict__ Scale = Arg->Scale;
-	unsigned char * __restrict__ ScaleN = Arg->ScaleN;
-	unsigned int NormBias = Arg->NormBias;
-        unsigned int OutFirstCol = Arg->OutFirstCol;
-        int ColFirst = Arg->ColFirst;
-
-        unsigned int H_In2 = W_In1;
-        unsigned int H_Out = H_In1;
-        unsigned int Line, Col, i;
-
-        unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(H_In1), First = CoreId*ChunkCell, Last  = Min(H_In1, First+ChunkCell);
-        unsigned int Iter = (Last>First)?(Last-First):0;
-        int OffLine = 0, OffCol = 0;
-
-        if (ColFirst) OffLine = OutFirstCol; else OffCol = OutFirstCol;
-        signed char *pOut = Out + W_Out*OffLine + OffCol;
-        for (Line=0; Line<Iter/2; Line++) {
-        	signed char *pIn2 = In2;
-        	int l1 = 2*Line + First;
-                v4s *VIn1 = (v4s *) (&In1[(l1  )*W_In1 + 0]);
-                v4s *VIn2 = (v4s *) (&In1[(l1+1)*W_In1 + 0]);
-        	for (Col=0; Col<(W_In2/4); Col++) {
-		        v4s * __restrict__ VBuff0 = (v4s *) pIn2;
-		        v4s * __restrict__ VBuff1 = (v4s *) (pIn2+H_In2);
-		        v4s * __restrict__ VBuff2 = (v4s *) (pIn2+2*H_In2);
-		        v4s * __restrict__ VBuff3 = (v4s *) (pIn2+3*H_In2);
-		        int S0=0, S1=0, S2=0, S3=0, S4=0, S5=0, S6=0, S7=0;
-		        if (Bias) {
-	                        S0 = (Bias[4*Col]<<NormBias)  , S4=S0;
-	                        S1 = (Bias[4*Col+1]<<NormBias), S5=S1;
-	                        S2 = (Bias[4*Col+2]<<NormBias), S6=S2;
-	                        S3 = (Bias[4*Col+3]<<NormBias), S7=S3;
-	                }
-        		for (i=0; i<(W_In1/4); i++) {
-				v4s V0 = VIn1[i];
-				v4s A = VBuff0[i];
-				v4s B = VBuff1[i];
-				v4s C = VBuff2[i];
-				v4s D = VBuff3[i];
-                                S0 = gap_sumdotp4(V0, A, S0);
-                                S1 = gap_sumdotp4(V0, B, S1);
-                                S2 = gap_sumdotp4(V0, C, S2);
-                                S3 = gap_sumdotp4(V0, D, S3);
-				v4s V1 = VIn2[i];
-                                S4 = gap_sumdotp4(V1, A, S4);
-                                S5 = gap_sumdotp4(V1, B, S5);
-                                S6 = gap_sumdotp4(V1, C, S6);
-                                S7 = gap_sumdotp4(V1, D, S7);
-                        }
-                        for (i=(W_In1/4)*4; i<W_In1; i++) {
-				int V0 = In1[l1*W_In1 + i];
-				S0 += V0 * pIn2[i];
-				S1 += V0 * pIn2[i+1*H_In2];
-				S2 += V0 * pIn2[i+2*H_In2];
-				S3 += V0 * pIn2[i+3*H_In2];
-				int V1 = In1[(l1+1)*W_In1 + i];
-				S4 += V1 * pIn2[i];
-				S5 += V1 * pIn2[i+1*H_In2];
-				S6 += V1 * pIn2[i+2*H_In2];
-				S7 += V1 * pIn2[i+3*H_In2];
-			}
-                        int Sc0 = Scale[4*Col],   ScN0 = ScaleN[4*Col];
-                        int Sc1 = Scale[4*Col+1], ScN1 = ScaleN[4*Col+1];
-                        int Sc2 = Scale[4*Col+2], ScN2 = ScaleN[4*Col+2];
-                        int Sc3 = Scale[4*Col+3], ScN3 = ScaleN[4*Col+3];
-			v4s R1 = gap_pack4(gap_clip(AT_SCALE(S0, Sc0, ScN0), 7), gap_clip(AT_SCALE(S1, Sc1, ScN1), 7),
-					   gap_clip(AT_SCALE(S2, Sc2, ScN2), 7), gap_clip(AT_SCALE(S3, Sc3, ScN3), 7));
-			v4s R2 = gap_pack4(gap_clip(AT_SCALE(S4, Sc0, ScN0), 7), gap_clip(AT_SCALE(S5, Sc1, ScN1), 7),
-					   gap_clip(AT_SCALE(S6, Sc2, ScN2), 7), gap_clip(AT_SCALE(S7, Sc3, ScN3), 7));
-			*((v4s *) (pOut+(l1  )*W_Out+4*Col)) = R1;
-			*((v4s *) (pOut+(l1+1)*W_Out+4*Col)) = R2;
-			pIn2 += 4*H_In2;
-        	}
-        	for (Col=(W_In2/4)*4; Col<W_In2; Col++) {
-		        v4s * __restrict__ VBuff0 = (v4s *) pIn2;
-                        int S0=0, S1=0;
-		        if (Bias) {
-	                        S0 = (Bias[Col]<<NormBias), S1=S0;
-	                }
-                        for (i=0; i<(W_In1/4); i++) {
-				v4s V0 = VIn1[i];
-				v4s V1 = VIn2[i];
-				v4s A = VBuff0[i];
-                                S0 = gap_sumdotp4(V0, A, S0);
-                                S1 = gap_sumdotp4(V1, A, S1);
-                        }
-                        for (i=(W_In1/4)*4; i<W_In1; i++) {
-				int V0 = In1[(l1  )*W_In1 + i];
-				int V1 = In1[(l1+1)*W_In1 + i];
-				S0 += V0 * pIn2[i];
-				S1 += V1 * pIn2[i];
-			}
-                        int Sc0 = Scale[Col],   ScN0 = ScaleN[Col];
-                        pOut[(l1  )*W_Out + Col] = gap_clip(AT_SCALE(S0, Sc0, ScN0), 7);
-                        pOut[(l1+1)*W_Out + Col] = gap_clip(AT_SCALE(S1, Sc0, ScN0), 7);
-			pIn2 += H_In2;
-        	}
-        }
-        if (Iter&0x1) {
-        	int l1 = Last-1;
-        	signed char *pIn2 = In2;
-                v4s *VIn1 = (v4s *) (&In1[(l1  )*W_In1 + 0]);
-        	for (Col=0; Col<(W_In2/4); Col++) {
-		        v4s * __restrict__ VBuff0 = (v4s *) pIn2;
-		        v4s * __restrict__ VBuff1 = (v4s *) (pIn2+H_In2);
-		        v4s * __restrict__ VBuff2 = (v4s *) (pIn2+2*H_In2);
-		        v4s * __restrict__ VBuff3 = (v4s *) (pIn2+3*H_In2);
-                        int S0=0, S1=0, S2=0, S3=0, S4=0, S5=0, S6=0, S7=0;
-		        if (Bias) {
-	                        S0 = (Bias[4*Col]<<NormBias)  , S4=S0;
-	                        S1 = (Bias[4*Col+1]<<NormBias), S5=S1;
-	                        S2 = (Bias[4*Col+2]<<NormBias), S6=S2;
-	                        S3 = (Bias[4*Col+3]<<NormBias), S7=S3;
-	                }
-        		for (i=0; i<(W_In1/4); i++) {
-				v4s V0 = VIn1[i];
-				v4s A = VBuff0[i];
-				v4s B = VBuff1[i];
-				v4s C = VBuff2[i];
-				v4s D = VBuff3[i];
-                                S0 = gap_sumdotp4(V0, A, S0);
-                                S1 = gap_sumdotp4(V0, B, S1);
-                                S2 = gap_sumdotp4(V0, C, S2);
-                                S3 = gap_sumdotp4(V0, D, S3);
-                        }
-                        for (i=(W_In1/4)*4; i<W_In1; i++) {
-				int V0 = In1[l1*W_In1 + i];
-				S0 += V0 * pIn2[i];
-				S1 += V0 * pIn2[i+1*H_In2];
-				S2 += V0 * pIn2[i+2*H_In2];
-				S3 += V0 * pIn2[i+3*H_In2];
-			}
-                        int Sc0 = Scale[4*Col],   ScN0 = ScaleN[4*Col];
-                        int Sc1 = Scale[4*Col+1], ScN1 = ScaleN[4*Col+1];
-                        int Sc2 = Scale[4*Col+2], ScN2 = ScaleN[4*Col+2];
-                        int Sc3 = Scale[4*Col+3], ScN3 = ScaleN[4*Col+3];
-			v4s R1 = gap_pack4(gap_clip(AT_SCALE(S0, Sc0, ScN0), 7), gap_clip(AT_SCALE(S1, Sc1, ScN1), 7),
-					   gap_clip(AT_SCALE(S2, Sc2, ScN2), 7), gap_clip(AT_SCALE(S3, Sc3, ScN3), 7));
-			*((v4s *) (pOut+(l1  )*W_Out+4*Col)) = R1;
-			pIn2 += 4*H_In2;
-        	}
-        	for (Col=(W_In2/4)*4; Col<W_In2; Col++) {
-		        v4s * __restrict__ VBuff0 = (v4s *) pIn2;
-                        int S0=0;
-		        if (Bias) {
-	                        S0 = (Bias[Col]<<NormBias);
-	                }
-                        for (i=0; i<(W_In1/4); i++) {
-				v4s V0 = VIn1[i];
-				v4s A = VBuff0[i];
-                                S0 = gap_sumdotp4(V0, A, S0);
-                        }
-                        for (i=(W_In1/4)*4; i<W_In1; i++) {
-				int V0 = In1[(l1  )*W_In1 + i];
-				S0 += V0 * pIn2[i];
-			}
-                        int Sc0 = Scale[Col],   ScN0 = ScaleN[Col];
-                        pOut[(l1  )*W_Out + Col] = gap_clip(AT_SCALE(S0, Sc0, ScN0), 7);
-			pIn2 += H_In2;
-        	}
-        }
-        gap_waitbarrier(0);
-}
-
-void KerParMatMulTransposedB32_ReLU_SQ8(KerMatMul_SQ8_T *Arg)
-
-{
-	/*
-	 	Column buffer has to be sized in order to be able to accomodate up to 4 columns of size H_In2
-	*/
-        signed char * __restrict__ In1 = Arg->In1;
-        unsigned int W_In1 = Arg->W_In1;
-        unsigned int H_In1 = Arg->H_In1;
-        signed char * __restrict__ In2 = Arg->In2;
-        unsigned int W_In2 = Arg->W_In2;
-        int * __restrict__ Bias = Arg->Bias;
-        signed char * __restrict__ Out = Arg->Out;
-        unsigned int W_Out = Arg->W_Out;
-	unsigned char * __restrict__ Scale = Arg->Scale;
-	unsigned char * __restrict__ ScaleN = Arg->ScaleN;
-	unsigned int NormBias = Arg->NormBias;
-        unsigned int OutFirstCol = Arg->OutFirstCol;
-        int ColFirst = Arg->ColFirst;
-
-        unsigned int H_In2 = W_In1;
-        unsigned int H_Out = H_In1;
-        unsigned int Line, Col, i;
-
-        unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(H_In1), First = CoreId*ChunkCell, Last  = Min(H_In1, First+ChunkCell);
-        unsigned int Iter = (Last>First)?(Last-First):0;
-        int OffLine = 0, OffCol = 0;
-
-        if (ColFirst) OffLine = OutFirstCol; else OffCol = OutFirstCol;
-        signed char *pOut = Out + W_Out*OffLine + OffCol;
-        for (Line=0; Line<Iter/2; Line++) {
-        	signed char *pIn2 = In2;
-        	int l1 = 2*Line + First;
-                v4s *VIn1 = (v4s *) (&In1[(l1  )*W_In1 + 0]);
-                v4s *VIn2 = (v4s *) (&In1[(l1+1)*W_In1 + 0]);
-        	for (Col=0; Col<(W_In2/4); Col++) {
-		        v4s * __restrict__ VBuff0 = (v4s *) pIn2;
-		        v4s * __restrict__ VBuff1 = (v4s *) (pIn2+H_In2);
-		        v4s * __restrict__ VBuff2 = (v4s *) (pIn2+2*H_In2);
-		        v4s * __restrict__ VBuff3 = (v4s *) (pIn2+3*H_In2);
-		        int S0=0, S1=0, S2=0, S3=0, S4=0, S5=0, S6=0, S7=0;
-		        if (Bias) {
-	                        S0 = (Bias[4*Col]<<NormBias)  , S4=S0;
-	                        S1 = (Bias[4*Col+1]<<NormBias), S5=S1;
-	                        S2 = (Bias[4*Col+2]<<NormBias), S6=S2;
-	                        S3 = (Bias[4*Col+3]<<NormBias), S7=S3;
-	                }
-        		for (i=0; i<(W_In1/4); i++) {
-				v4s V0 = VIn1[i];
-				v4s A = VBuff0[i];
-				v4s B = VBuff1[i];
-				v4s C = VBuff2[i];
-				v4s D = VBuff3[i];
-                                S0 = gap_sumdotp4(V0, A, S0);
-                                S1 = gap_sumdotp4(V0, B, S1);
-                                S2 = gap_sumdotp4(V0, C, S2);
-                                S3 = gap_sumdotp4(V0, D, S3);
-				v4s V1 = VIn2[i];
-                                S4 = gap_sumdotp4(V1, A, S4);
-                                S5 = gap_sumdotp4(V1, B, S5);
-                                S6 = gap_sumdotp4(V1, C, S6);
-                                S7 = gap_sumdotp4(V1, D, S7);
-                        }
-                        for (i=(W_In1/4)*4; i<W_In1; i++) {
-				int V0 = In1[l1*W_In1 + i];
-				S0 += V0 * pIn2[i];
-				S1 += V0 * pIn2[i+1*H_In2];
-				S2 += V0 * pIn2[i+2*H_In2];
-				S3 += V0 * pIn2[i+3*H_In2];
-				int V1 = In1[(l1+1)*W_In1 + i];
-				S4 += V1 * pIn2[i];
-				S5 += V1 * pIn2[i+1*H_In2];
-				S6 += V1 * pIn2[i+2*H_In2];
-				S7 += V1 * pIn2[i+3*H_In2];
-			}
-                        int Sc0 = Scale[4*Col],   ScN0 = ScaleN[4*Col];
-                        int Sc1 = Scale[4*Col+1], ScN1 = ScaleN[4*Col+1];
-                        int Sc2 = Scale[4*Col+2], ScN2 = ScaleN[4*Col+2];
-                        int Sc3 = Scale[4*Col+3], ScN3 = ScaleN[4*Col+3];
-			v4s R1 = gap_pack4(AT_CLIP_POS_IMM(AT_SCALE(S0, Sc0, ScN0), 7), AT_CLIP_POS_IMM(AT_SCALE(S1, Sc1, ScN1), 7),
-					   AT_CLIP_POS_IMM(AT_SCALE(S2, Sc2, ScN2), 7), AT_CLIP_POS_IMM(AT_SCALE(S3, Sc3, ScN3), 7));
-			v4s R2 = gap_pack4(AT_CLIP_POS_IMM(AT_SCALE(S4, Sc0, ScN0), 7), AT_CLIP_POS_IMM(AT_SCALE(S5, Sc1, ScN1), 7),
-					   AT_CLIP_POS_IMM(AT_SCALE(S6, Sc2, ScN2), 7), AT_CLIP_POS_IMM(AT_SCALE(S7, Sc3, ScN3), 7));
-			*((v4s *) (pOut+(l1  )*W_Out+4*Col)) = R1;
-			*((v4s *) (pOut+(l1+1)*W_Out+4*Col)) = R2;
-			pIn2 += 4*H_In2;
-        	}
-        	for (Col=(W_In2/4)*4; Col<W_In2; Col++) {
-		        v4s * __restrict__ VBuff0 = (v4s *) pIn2;
-                        int S0=0, S1=0;
-		        if (Bias) {
-	                        S0 = (Bias[Col]<<NormBias), S1=S0;
-	                }
-                        for (i=0; i<(W_In1/4); i++) {
-				v4s V0 = VIn1[i];
-				v4s V1 = VIn2[i];
-				v4s A = VBuff0[i];
-                                S0 = gap_sumdotp4(V0, A, S0);
-                                S1 = gap_sumdotp4(V1, A, S1);
-                        }
-                        for (i=(W_In1/4)*4; i<W_In1; i++) {
-				int V0 = In1[(l1  )*W_In1 + i];
-				int V1 = In1[(l1+1)*W_In1 + i];
-				S0 += V0 * pIn2[i];
-				S1 += V1 * pIn2[i];
-			}
-                        int Sc0 = Scale[Col],   ScN0 = ScaleN[Col];
-                        pOut[(l1  )*W_Out + Col] = AT_CLIP_POS_IMM(AT_SCALE(S0, Sc0, ScN0), 7);
-                        pOut[(l1+1)*W_Out + Col] = AT_CLIP_POS_IMM(AT_SCALE(S1, Sc0, ScN0), 7);
-			pIn2 += H_In2;
-        	}
-        }
-        if (Iter&0x1) {
-        	int l1 = Last-1;
-        	signed char *pIn2 = In2;
-                v4s *VIn1 = (v4s *) (&In1[(l1  )*W_In1 + 0]);
-        	for (Col=0; Col<(W_In2/4); Col++) {
-		        v4s * __restrict__ VBuff0 = (v4s *) pIn2;
-		        v4s * __restrict__ VBuff1 = (v4s *) (pIn2+H_In2);
-		        v4s * __restrict__ VBuff2 = (v4s *) (pIn2+2*H_In2);
-		        v4s * __restrict__ VBuff3 = (v4s *) (pIn2+3*H_In2);
-		        int S0=0, S1=0, S2=0, S3=0;
-		        if (Bias) {
-	                        S0 = (Bias[4*Col]<<NormBias)  ;
-	                        S1 = (Bias[4*Col+1]<<NormBias);
-	                        S2 = (Bias[4*Col+2]<<NormBias);
-	                        S3 = (Bias[4*Col+3]<<NormBias);
-	                }
-        		for (i=0; i<(W_In1/4); i++) {
-				v4s V0 = VIn1[i];
-				v4s A = VBuff0[i];
-				v4s B = VBuff1[i];
-				v4s C = VBuff2[i];
-				v4s D = VBuff3[i];
-                                S0 = gap_sumdotp4(V0, A, S0);
-                                S1 = gap_sumdotp4(V0, B, S1);
-                                S2 = gap_sumdotp4(V0, C, S2);
-                                S3 = gap_sumdotp4(V0, D, S3);
-                        }
-                        for (i=(W_In1/4)*4; i<W_In1; i++) {
-				int V0 = In1[l1*W_In1 + i];
-				S0 += V0 * pIn2[i];
-				S1 += V0 * pIn2[i+1*H_In2];
-				S2 += V0 * pIn2[i+2*H_In2];
-				S3 += V0 * pIn2[i+3*H_In2];
-			}
-                        int Sc0 = Scale[4*Col],   ScN0 = ScaleN[4*Col];
-                        int Sc1 = Scale[4*Col+1], ScN1 = ScaleN[4*Col+1];
-                        int Sc2 = Scale[4*Col+2], ScN2 = ScaleN[4*Col+2];
-                        int Sc3 = Scale[4*Col+3], ScN3 = ScaleN[4*Col+3];
-			v4s R1 = gap_pack4(AT_CLIP_POS_IMM(AT_SCALE(S0, Sc0, ScN0), 7), AT_CLIP_POS_IMM(AT_SCALE(S1, Sc1, ScN1), 7),
-					   AT_CLIP_POS_IMM(AT_SCALE(S2, Sc2, ScN2), 7), AT_CLIP_POS_IMM(AT_SCALE(S3, Sc3, ScN3), 7));
-			*((v4s *) (pOut+(l1  )*W_Out+4*Col)) = R1;
-			pIn2 += 4*H_In2;
-        	}
-        	for (Col=(W_In2/4)*4; Col<W_In2; Col++) {
-		        v4s * __restrict__ VBuff0 = (v4s *) pIn2;
-                        int S0=0;
-		        if (Bias) {
-	                        S0 = (Bias[Col]<<NormBias);
-	                }
-                        for (i=0; i<(W_In1/4); i++) {
-				v4s V0 = VIn1[i];
-				v4s A = VBuff0[i];
-                                S0 = gap_sumdotp4(V0, A, S0);
-                        }
-                        for (i=(W_In1/4)*4; i<W_In1; i++) {
-				int V0 = In1[(l1  )*W_In1 + i];
-				S0 += V0 * pIn2[i];
-			}
-                        int Sc0 = Scale[Col],   ScN0 = ScaleN[Col];
-                        pOut[(l1  )*W_Out + Col] = AT_CLIP_POS_IMM(AT_SCALE(S0, Sc0, ScN0), 7);
-			pIn2 += H_In2;
-        	}
-        }
-        gap_waitbarrier(0);
+void KerParMatVectMul_ReLUMN_HWC_SQ8(KerMat3_SQ8_T *Arg) {
+	KerParMatVectMul_HWC_SQ8_act(Arg, ACT_RELUMN);
 }
 
-void KerParMatMulTransposedB32_ReLUN_SQ8(KerMatMul_SQ8_T *Arg)
+void KerParMatVectMul_LeakyReLU_HWC_SQ8(KerMat3_SQ8_T *Arg) {
+	KerParMatVectMul_HWC_SQ8_act(Arg, ACT_LEAKYRELU);
+}
+
+void KerParMatVectMul_HSwish_HWC_SQ8(KerMat3_SQ8_T *Arg) {
+	KerParMatVectMul_HWC_SQ8_act(Arg, ACT_HSWISH);
+}
+
+void KerParMatVectMul_HSigmoid_HWC_SQ8(KerMat3_SQ8_T *Arg) {
+	KerParMatVectMul_HWC_SQ8_act(Arg, ACT_HSIGMOID);
+}
+
+void KerParMatVectMul_Sigmoid_HWC_SQ8(KerMat3_SQ8_T *Arg) {
+	KerParMatVectMul_HWC_SQ8_act(Arg, ACT_SIGMOID);
+}
+
+void KerParMatVectMul_Tanh_HWC_SQ8(KerMat3_SQ8_T *Arg) {
+	KerParMatVectMul_HWC_SQ8_act(Arg, ACT_TANH);
+}
+
+
+/* ================================================================================================================== */
+
+static inline void __attribute__((always_inline)) KerParMatMulNoBias_PL_SQ8_act(
+	KerMatMul_PL_SQ8_T *Arg,
+	CNN_ActivationOper_T Activation)
 
 {
 	/*
@@ -5288,44 +2214,50 @@ void KerParMatMulTransposedB32_ReLUN_SQ8(KerMatMul_SQ8_T *Arg)
         unsigned int H_In1 = Arg->H_In1;
         signed char * __restrict__ In2 = Arg->In2;
         unsigned int W_In2 = Arg->W_In2;
-        int * __restrict__ Bias = Arg->Bias;
         signed char * __restrict__ Out = Arg->Out;
         unsigned int W_Out = Arg->W_Out;
-	unsigned char * __restrict__ Scale = Arg->Scale;
-	unsigned char * __restrict__ ScaleN = Arg->ScaleN;
-	unsigned int NormBias = Arg->NormBias;
+	unsigned char Scale = (unsigned char) Arg->Infos[AT_INF_OUTSCALE];
+	unsigned char ScaleN = (unsigned char) Arg->Infos[AT_INF_OUTSCALEN];
         unsigned int OutFirstCol = Arg->OutFirstCol;
+        signed char * __restrict__ BufferColIn2 = Arg->BufferColIn2;
         int ColFirst = Arg->ColFirst;
-        int A0 = Arg->Infos[AT_INF_A0];
+	unsigned char * Infos = (unsigned char *) Arg->Infos;
+	unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN];
+	int A0 = *((unsigned char *) &Infos[AT_INF_A0]); int B0 = *((unsigned char *) &Infos[AT_INF_B0]); int C0 = *((unsigned char *) &Infos[AT_INF_C0]);
 
         unsigned int H_In2 = W_In1;
         unsigned int H_Out = H_In1;
         unsigned int Line, Col, i;
+        v4s * __restrict__ VBuff0 = (v4s *) BufferColIn2;
+        v4s * __restrict__ VBuff1 = (v4s *) (BufferColIn2+H_In2);
+        v4s * __restrict__ VBuff2 = (v4s *) (BufferColIn2+2*H_In2);
+        v4s * __restrict__ VBuff3 = (v4s *) (BufferColIn2+3*H_In2);
 
         unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(H_In1), First = CoreId*ChunkCell, Last  = Min(H_In1, First+ChunkCell);
         unsigned int Iter = (Last>First)?(Last-First):0;
+        unsigned int C = ChunkSize(H_In2), F = CoreId*C, L  = Min(H_In2, F+C);
         int OffLine = 0, OffCol = 0;
 
         if (ColFirst) OffLine = OutFirstCol; else OffCol = OutFirstCol;
-        signed char *pOut = Out + W_Out*OffLine + OffCol;
-        for (Line=0; Line<Iter/2; Line++) {
-        	signed char *pIn2 = In2;
-        	int l1 = 2*Line + First;
-                v4s *VIn1 = (v4s *) (&In1[(l1  )*W_In1 + 0]);
-                v4s *VIn2 = (v4s *) (&In1[(l1+1)*W_In1 + 0]);
-        	for (Col=0; Col<(W_In2/4); Col++) {
-		        v4s * __restrict__ VBuff0 = (v4s *) pIn2;
-		        v4s * __restrict__ VBuff1 = (v4s *) (pIn2+H_In2);
-		        v4s * __restrict__ VBuff2 = (v4s *) (pIn2+2*H_In2);
-		        v4s * __restrict__ VBuff3 = (v4s *) (pIn2+3*H_In2);
-		        int S0=0, S1=0, S2=0, S3=0, S4=0, S5=0, S6=0, S7=0;
-		        if (Bias) {
-	                        S0 = (Bias[4*Col]<<NormBias)  , S4=S0;
-	                        S1 = (Bias[4*Col+1]<<NormBias), S5=S1;
-	                        S2 = (Bias[4*Col+2]<<NormBias), S6=S2;
-	                        S3 = (Bias[4*Col+3]<<NormBias), S7=S3;
-	                }
-        		for (i=0; i<(W_In1/4); i++) {
+        for (Col=0; Col<W_In2/4; Col++) {
+                for (i=F;i<L; i++) {
+			int X0 = In2[i*W_In2+4*Col+0];
+			int X1 = In2[i*W_In2+4*Col+1];
+			int X2 = In2[i*W_In2+4*Col+2];
+			int X3 = In2[i*W_In2+4*Col+3];
+			BufferColIn2[i+0*H_In2] = X0; // In2[i*W_In2+4*Col+0];
+			BufferColIn2[i+1*H_In2] = X1; // In2[i*W_In2+4*Col+1];
+			BufferColIn2[i+2*H_In2] = X2; // In2[i*W_In2+4*Col+2];
+			BufferColIn2[i+3*H_In2] = X3; // In2[i*W_In2+4*Col+3];
+		}
+                gap_waitbarrier(0);
+                for (Line=0; Line<Iter/2; Line++) {
+                	int l1 = 2*Line + First;
+                        v4s *VIn1 = (v4s *) (&In1[(l1)*W_In1 + 0]);
+                        v4s *VIn2 = (v4s *) (&In1[(l1+1)*W_In1 + 0]);
+                        int S0 = 0, S1=S0, S2=S0, S3=S0;
+                        int S4 = 0, S5=S4, S6=S4, S7=S4;
+                        for (i=0; i<(W_In1/4); i++) {
 				v4s V0 = VIn1[i];
 				v4s A = VBuff0[i];
 				v4s B = VBuff1[i];
@@ -5343,121 +2275,154 @@ void KerParMatMulTransposedB32_ReLUN_SQ8(KerMatMul_SQ8_T *Arg)
                         }
                         for (i=(W_In1/4)*4; i<W_In1; i++) {
 				int V0 = In1[l1*W_In1 + i];
-				S0 += V0 * pIn2[i];
-				S1 += V0 * pIn2[i+1*H_In2];
-				S2 += V0 * pIn2[i+2*H_In2];
-				S3 += V0 * pIn2[i+3*H_In2];
+				S0 += V0 * BufferColIn2[i];
+				S1 += V0 * BufferColIn2[i+1*H_In2];
+				S2 += V0 * BufferColIn2[i+2*H_In2];
+				S3 += V0 * BufferColIn2[i+3*H_In2];
 				int V1 = In1[(l1+1)*W_In1 + i];
-				S4 += V1 * pIn2[i];
-				S5 += V1 * pIn2[i+1*H_In2];
-				S6 += V1 * pIn2[i+2*H_In2];
-				S7 += V1 * pIn2[i+3*H_In2];
+				S4 += V1 * BufferColIn2[i];
+				S5 += V1 * BufferColIn2[i+1*H_In2];
+				S6 += V1 * BufferColIn2[i+2*H_In2];
+				S7 += V1 * BufferColIn2[i+3*H_In2];
 			}
-                        int Sc0 = Scale[4*Col],   ScN0 = ScaleN[4*Col];
-                        int Sc1 = Scale[4*Col+1], ScN1 = ScaleN[4*Col+1];
-                        int Sc2 = Scale[4*Col+2], ScN2 = ScaleN[4*Col+2];
-                        int Sc3 = Scale[4*Col+3], ScN3 = ScaleN[4*Col+3];
-			v4s R1 = gap_pack4(AT_CLIP_POS(AT_SCALE(S0, Sc0, ScN0), A0), AT_CLIP_POS(AT_SCALE(S1, Sc1, ScN1), A0),
-					   AT_CLIP_POS(AT_SCALE(S2, Sc2, ScN2), A0), AT_CLIP_POS(AT_SCALE(S3, Sc3, ScN3), A0));
-			v4s R2 = gap_pack4(AT_CLIP_POS(AT_SCALE(S4, Sc0, ScN0), A0), AT_CLIP_POS(AT_SCALE(S5, Sc1, ScN1), A0),
-					   AT_CLIP_POS(AT_SCALE(S6, Sc2, ScN2), A0), AT_CLIP_POS(AT_SCALE(S7, Sc3, ScN3), A0));
-			*((v4s *) (pOut+(l1  )*W_Out+4*Col)) = R1;
-			*((v4s *) (pOut+(l1+1)*W_Out+4*Col)) = R2;
-			pIn2 += 4*H_In2;
-        	}
-        	for (Col=(W_In2/4)*4; Col<W_In2; Col++) {
-		        v4s * __restrict__ VBuff0 = (v4s *) pIn2;
-                        int S0=0, S1=0;
-		        if (Bias) {
-	                        S0 = (Bias[Col]<<NormBias), S1=S0;
-	                }
-                        for (i=0; i<(W_In1/4); i++) {
+			S0 = AT_SCALE(S0, Scale, ScaleN); ACT_SWITCH(S0, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+			S1 = AT_SCALE(S1, Scale, ScaleN); ACT_SWITCH(S1, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+			S2 = AT_SCALE(S2, Scale, ScaleN); ACT_SWITCH(S2, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+			S3 = AT_SCALE(S3, Scale, ScaleN); ACT_SWITCH(S3, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+			S4 = AT_SCALE(S4, Scale, ScaleN); ACT_SWITCH(S4, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+			S5 = AT_SCALE(S5, Scale, ScaleN); ACT_SWITCH(S5, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+			S6 = AT_SCALE(S6, Scale, ScaleN); ACT_SWITCH(S6, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+			S7 = AT_SCALE(S7, Scale, ScaleN); ACT_SWITCH(S7, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+			v4s R1 = gap_pack4(gap_clip(S0, 7), gap_clip(S1, 7), gap_clip(S2, 7), gap_clip(S3, 7));
+			v4s R2 = gap_pack4(gap_clip(S4, 7), gap_clip(S5, 7), gap_clip(S6, 7), gap_clip(S7, 7));
+			*((v4s *) (Out+(l1+OffLine)*W_Out+4*Col+0+OffCol)) = R1;
+			*((v4s *) (Out+(l1+OffLine+1)*W_Out+4*Col+0+OffCol)) = R2;
+                }
+		if (Iter&0x1) {
+			int l1 = Last - 1;
+			v4s *VIn1 = (v4s *) (&In1[l1*W_In1 + 0]);
+			int S0 = 0, S1=S0, S2=S0, S3=S0;
+			for (i=0; i<(W_In1/4); i++) {
 				v4s V0 = VIn1[i];
-				v4s V1 = VIn2[i];
-				v4s A = VBuff0[i];
-                                S0 = gap_sumdotp4(V0, A, S0);
-                                S1 = gap_sumdotp4(V1, A, S1);
-                        }
+				v4s A = VBuff0[i]; S0 = gap_sumdotp4(V0, A, S0);;
+				v4s B = VBuff1[i]; S1 = gap_sumdotp4(V0, B, S1);;
+				v4s C = VBuff2[i]; S2 = gap_sumdotp4(V0, C, S2);;
+				v4s D = VBuff3[i]; S3 = gap_sumdotp4(V0, D, S3);;
+			}
                         for (i=(W_In1/4)*4; i<W_In1; i++) {
-				int V0 = In1[(l1  )*W_In1 + i];
-				int V1 = In1[(l1+1)*W_In1 + i];
-				S0 += V0 * pIn2[i];
-				S1 += V1 * pIn2[i];
+				int V0 = In1[l1*W_In1 + i];
+				S0 += V0 * BufferColIn2[i];
+				S1 += V0 * BufferColIn2[i+1*H_In2];
+				S2 += V0 * BufferColIn2[i+2*H_In2];
+				S3 += V0 * BufferColIn2[i+3*H_In2];
 			}
-                        int Sc0 = Scale[Col],   ScN0 = ScaleN[Col];
-                        pOut[(l1  )*W_Out + Col] = AT_CLIP_POS(AT_SCALE(S0, Sc0, ScN0), A0);
-                        pOut[(l1+1)*W_Out + Col] = AT_CLIP_POS(AT_SCALE(S0, Sc0, ScN0), A0);
-			pIn2 += H_In2;
-        	}
+			S0 = AT_SCALE(S0, Scale, ScaleN); ACT_SWITCH(S0, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+			S1 = AT_SCALE(S1, Scale, ScaleN); ACT_SWITCH(S1, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+			S2 = AT_SCALE(S2, Scale, ScaleN); ACT_SWITCH(S2, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+			S3 = AT_SCALE(S3, Scale, ScaleN); ACT_SWITCH(S3, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+			v4s R1 = gap_pack4(gap_clip(S0, 7), gap_clip(S1, 7), gap_clip(S2, 7), gap_clip(S3, 7));
+			*((v4s *) (Out+(l1+OffLine)*W_Out+4*Col+0+OffCol)) = R1;
+		}
+                gap_waitbarrier(0);
         }
-        if (Iter&0x1) {
-        	int l1 = Last-1;
-        	signed char *pIn2 = In2;
-                v4s *VIn1 = (v4s *) (&In1[(l1  )*W_In1 + 0]);
-        	for (Col=0; Col<(W_In2/4); Col++) {
-		        v4s * __restrict__ VBuff0 = (v4s *) pIn2;
-		        v4s * __restrict__ VBuff1 = (v4s *) (pIn2+H_In2);
-		        v4s * __restrict__ VBuff2 = (v4s *) (pIn2+2*H_In2);
-		        v4s * __restrict__ VBuff3 = (v4s *) (pIn2+3*H_In2);
-		        int S0=0, S1=0, S2=0, S3=0;
-		        if (Bias) {
-	                        S0 = (Bias[4*Col]<<NormBias)  ;
-	                        S1 = (Bias[4*Col+1]<<NormBias);
-	                        S2 = (Bias[4*Col+2]<<NormBias);
-	                        S3 = (Bias[4*Col+3]<<NormBias);
-	                }
-        		for (i=0; i<(W_In1/4); i++) {
+	if (W_In2&0x2) {
+		Col = W_In2/2 - 1;
+                for (i=F;i<L; i++) {
+			int X0 = In2[i*W_In2+2*Col+0];
+			int X1 = In2[i*W_In2+2*Col+1];
+			BufferColIn2[i+0*H_In2] = X0; // In2[i*W_In2+2*Col+0];
+			BufferColIn2[i+1*H_In2] = X1; // In2[i*W_In2+2*Col+1];
+		}
+                gap_waitbarrier(0);
+                for (Line=First; Line<Last; Line++) {
+                        v4s *VIn1 = (v4s *) (&In1[Line*W_In1 + 0]);
+                        int S0 = 0, S1=S0;
+                        for (i=0; i<(W_In1/4); i++) {
 				v4s V0 = VIn1[i];
-				v4s A = VBuff0[i];
-				v4s B = VBuff1[i];
-				v4s C = VBuff2[i];
-				v4s D = VBuff3[i];
-                                S0 = gap_sumdotp4(V0, A, S0);
-                                S1 = gap_sumdotp4(V0, B, S1);
-                                S2 = gap_sumdotp4(V0, C, S2);
-                                S3 = gap_sumdotp4(V0, D, S3);
+                                S0 = gap_sumdotp4(V0, VBuff0[i], S0);
+                                S1 = gap_sumdotp4(V0, VBuff1[i], S1);
                         }
                         for (i=(W_In1/4)*4; i<W_In1; i++) {
-				int V0 = In1[l1*W_In1 + i];
-				S0 += V0 * pIn2[i];
-				S1 += V0 * pIn2[i+1*H_In2];
-				S2 += V0 * pIn2[i+2*H_In2];
-				S3 += V0 * pIn2[i+3*H_In2];
+				int V0 = In1[Line*W_In1 + i];
+				S0 += V0 * BufferColIn2[i];
+				S1 += V0 * BufferColIn2[i+1*H_In2];
 			}
-                        int Sc0 = Scale[4*Col],   ScN0 = ScaleN[4*Col];
-                        int Sc1 = Scale[4*Col+1], ScN1 = ScaleN[4*Col+1];
-                        int Sc2 = Scale[4*Col+2], ScN2 = ScaleN[4*Col+2];
-                        int Sc3 = Scale[4*Col+3], ScN3 = ScaleN[4*Col+3];
-			v4s R1 = gap_pack4(AT_CLIP_POS(AT_SCALE(S0, Sc0, ScN0), A0), AT_CLIP_POS(AT_SCALE(S1, Sc1, ScN1), A0),
-					   AT_CLIP_POS(AT_SCALE(S2, Sc2, ScN2), A0), AT_CLIP_POS(AT_SCALE(S3, Sc3, ScN3), A0));
-			*((v4s *) (pOut+(l1  )*W_Out+4*Col)) = R1;
-			pIn2 += 4*H_In2;
-        	}
-        	for (Col=(W_In2/4)*4; Col<W_In2; Col++) {
-		        v4s * __restrict__ VBuff0 = (v4s *) pIn2;
-                        int S0=0;
-		        if (Bias) {
-	                        S0 = (Bias[Col]<<NormBias);
-	                }
+			S0 = AT_SCALE(S0, Scale, ScaleN); ACT_SWITCH(S0, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+			S1 = AT_SCALE(S1, Scale, ScaleN); ACT_SWITCH(S1, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+			Out[(Line+OffLine)*W_Out+2*Col+0+OffCol] = gap_clip(S0, 7);
+			Out[(Line+OffLine)*W_Out+2*Col+1+OffCol] = gap_clip(S1, 7);
+                }
+                gap_waitbarrier(0);
+	}
+	if (W_In2&0x1) {
+		Col = W_In2-1;
+                for (i=F;i<L; i++) {
+			int X0 = In2[i*W_In2+1*Col+0];
+			BufferColIn2[i+0*H_In2] = X0; // In2[i*W_In2+1*Col+0];
+		}
+                gap_waitbarrier(0);
+                for (Line=First; Line<Last; Line++) {
+                        v4s *VIn1 = (v4s *) (&In1[Line*W_In1 + 0]);
+                        int S0 = 0;
                         for (i=0; i<(W_In1/4); i++) {
 				v4s V0 = VIn1[i];
-				v4s A = VBuff0[i];
-                                S0 = gap_sumdotp4(V0, A, S0);
+                                S0 = gap_sumdotp4(V0, VBuff0[i], S0);
                         }
                         for (i=(W_In1/4)*4; i<W_In1; i++) {
-				int V0 = In1[(l1  )*W_In1 + i];
-				S0 += V0 * pIn2[i];
+				int V0 = In1[Line*W_In1 + i];
+				S0 += V0 * BufferColIn2[i];
 			}
-                        int Sc0 = Scale[Col],   ScN0 = ScaleN[Col];
-                        pOut[(l1  )*W_Out + Col] = AT_CLIP_POS(AT_SCALE(S0, Sc0, ScN0), A0);
-			pIn2 += H_In2;
-        	}
-        }
-        gap_waitbarrier(0);
+			S0 = AT_SCALE(S0, Scale, ScaleN); ACT_SWITCH(S0, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+			Out[(Line+OffLine)*W_Out+1*Col+0+OffCol] = gap_clip(S0, 7);
+                }
+                gap_waitbarrier(0);
+	}
+}
+
+void KerParMatMulNoBias_PL_SQ8(KerMatMul_PL_SQ8_T *Arg) {
+	KerParMatMulNoBias_PL_SQ8_act(Arg, ACT_NONE);
+}
+
+void KerParMatMulNoBias_PL_ReLU_SQ8(KerMatMul_PL_SQ8_T *Arg) {
+	KerParMatMulNoBias_PL_SQ8_act(Arg, ACT_RELU);
+}
+
+void KerParMatMulNoBias_PL_ReLUN_SQ8(KerMatMul_PL_SQ8_T *Arg) {
+	KerParMatMulNoBias_PL_SQ8_act(Arg, ACT_RELUN);
+}
+
+void KerParMatMulNoBias_PL_ReLUM_SQ8(KerMatMul_PL_SQ8_T *Arg) {
+	KerParMatMulNoBias_PL_SQ8_act(Arg, ACT_RELUM);
+}
+
+void KerParMatMulNoBias_PL_ReLUMN_SQ8(KerMatMul_PL_SQ8_T *Arg) {
+	KerParMatMulNoBias_PL_SQ8_act(Arg, ACT_RELUMN);
+}
+
+void KerParMatMulNoBias_PL_LeakyReLU_SQ8(KerMatMul_PL_SQ8_T *Arg) {
+	KerParMatMulNoBias_PL_SQ8_act(Arg, ACT_LEAKYRELU);
+}
+
+void KerParMatMulNoBias_PL_HSwish_SQ8(KerMatMul_PL_SQ8_T *Arg) {
+	KerParMatMulNoBias_PL_SQ8_act(Arg, ACT_HSWISH);
+}
+
+void KerParMatMulNoBias_PL_HSigmoid_SQ8(KerMatMul_PL_SQ8_T *Arg) {
+	KerParMatMulNoBias_PL_SQ8_act(Arg, ACT_HSIGMOID);
+}
+
+void KerParMatMulNoBias_PL_Sigmoid_SQ8(KerMatMul_PL_SQ8_T *Arg) {
+	KerParMatMulNoBias_PL_SQ8_act(Arg, ACT_SIGMOID);
+}
+
+void KerParMatMulNoBias_PL_Tanh_SQ8(KerMatMul_PL_SQ8_T *Arg) {
+	KerParMatMulNoBias_PL_SQ8_act(Arg, ACT_TANH);
 }
 
 
-void KerParMatMulTransposedB32_PL_SQ8(KerMatMul_PL_SQ8_T *Arg)
+static inline void __attribute__((always_inline)) KerParMatMulB32_PL_SQ8_act(
+	KerMatMul_PL_SQ8_T *Arg,
+	CNN_ActivationOper_T Activation)
 
 {
 	/*
@@ -5468,43 +2433,55 @@ void KerParMatMulTransposedB32_PL_SQ8(KerMatMul_PL_SQ8_T *Arg)
         unsigned int H_In1 = Arg->H_In1;
         signed char * __restrict__ In2 = Arg->In2;
         unsigned int W_In2 = Arg->W_In2;
-        int * __restrict__ Bias = Arg->Bias;
         signed char * __restrict__ Out = Arg->Out;
+	int * __restrict__ Bias = Arg->Bias;
+	unsigned int NormBias = Arg->NormBias;
         unsigned int W_Out = Arg->W_Out;
 	unsigned char Scale = (unsigned char) Arg->Infos[AT_INF_OUTSCALE];
 	unsigned char ScaleN = (unsigned char) Arg->Infos[AT_INF_OUTSCALEN];
-	unsigned int NormBias = Arg->NormBias;
         unsigned int OutFirstCol = Arg->OutFirstCol;
+        signed char * __restrict__ BufferColIn2 = Arg->BufferColIn2;
         int ColFirst = Arg->ColFirst;
+	unsigned char * Infos = (unsigned char *) Arg->Infos;
+	unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN];
+	int A0 = *((unsigned char *) &Infos[AT_INF_A0]); int B0 = *((unsigned char *) &Infos[AT_INF_B0]); int C0 = *((unsigned char *) &Infos[AT_INF_C0]);
 
         unsigned int H_In2 = W_In1;
         unsigned int H_Out = H_In1;
         unsigned int Line, Col, i;
-
-        unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(H_In1), First = CoreId*ChunkCell, Last  = Min(H_In1, First+ChunkCell);
-        unsigned int Iter = (Last>First)?(Last-First):0;
-        int OffLine = 0, OffCol = 0;
-
-        if (ColFirst) OffLine = OutFirstCol; else OffCol = OutFirstCol;
-        signed char *pOut = Out + W_Out*OffLine + OffCol;
-        for (Line=0; Line<Iter/2; Line++) {
-        	signed char *pIn2 = In2;
-        	int l1 = 2*Line + First;
-                v4s *VIn1 = (v4s *) (&In1[(l1  )*W_In1 + 0]);
-                v4s *VIn2 = (v4s *) (&In1[(l1+1)*W_In1 + 0]);
-        	for (Col=0; Col<(W_In2/4); Col++) {
-		        v4s * __restrict__ VBuff0 = (v4s *) pIn2;
-		        v4s * __restrict__ VBuff1 = (v4s *) (pIn2+H_In2);
-		        v4s * __restrict__ VBuff2 = (v4s *) (pIn2+2*H_In2);
-		        v4s * __restrict__ VBuff3 = (v4s *) (pIn2+3*H_In2);
-		        int S0=0, S1=0, S2=0, S3=0, S4=0, S5=0, S6=0, S7=0;
-		        if (Bias) {
-	                        S0 = (Bias[4*Col]<<NormBias)  , S4=S0;
-	                        S1 = (Bias[4*Col+1]<<NormBias), S5=S1;
-	                        S2 = (Bias[4*Col+2]<<NormBias), S6=S2;
-	                        S3 = (Bias[4*Col+3]<<NormBias), S7=S3;
-	                }
-        		for (i=0; i<(W_In1/4); i++) {
+        v4s * __restrict__ VBuff0 = (v4s *) BufferColIn2;
+        v4s * __restrict__ VBuff1 = (v4s *) (BufferColIn2+H_In2);
+        v4s * __restrict__ VBuff2 = (v4s *) (BufferColIn2+2*H_In2);
+        v4s * __restrict__ VBuff3 = (v4s *) (BufferColIn2+3*H_In2);
+
+        unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(H_In1), First = CoreId*ChunkCell, Last  = Min(H_In1, First+ChunkCell);
+        unsigned int Iter = (Last>First)?(Last-First):0;
+        unsigned int C = ChunkSize(H_In2), F = CoreId*C, L  = Min(H_In2, F+C);
+        int OffLine = 0, OffCol = 0;
+
+        if (ColFirst) OffLine = OutFirstCol; else OffCol = OutFirstCol;
+        for (Col=0; Col<W_In2/4; Col++) {
+                for (i=F;i<L; i++) {
+			int X0 = In2[i*W_In2+4*Col+0];
+			int X1 = In2[i*W_In2+4*Col+1];
+			int X2 = In2[i*W_In2+4*Col+2];
+			int X3 = In2[i*W_In2+4*Col+3];
+			BufferColIn2[i+0*H_In2] = X0; // In2[i*W_In2+4*Col+0];
+			BufferColIn2[i+1*H_In2] = X1; // In2[i*W_In2+4*Col+1];
+			BufferColIn2[i+2*H_In2] = X2; // In2[i*W_In2+4*Col+2];
+			BufferColIn2[i+3*H_In2] = X3; // In2[i*W_In2+4*Col+3];
+		}
+                gap_waitbarrier(0);
+                for (Line=0; Line<Iter/2; Line++) {
+                	int l1 = 2*Line + First;
+                        v4s *VIn1 = (v4s *) (&In1[(l1)*W_In1 + 0]);
+                        v4s *VIn2 = (v4s *) (&In1[(l1+1)*W_In1 + 0]);
+                        int S0=0, S1=0, S2=0, S3=0, S4=0, S5=0, S6=0, S7=0;
+                        if (Bias) {
+		                S0 = (Bias[4*Col]<<NormBias), S1 = (Bias[4*Col+1]<<NormBias), S2 = (Bias[4*Col+2]<<NormBias), S3 = (Bias[4*Col+3]<<NormBias);
+		                S4 = (Bias[4*Col]<<NormBias), S5 = (Bias[4*Col+1]<<NormBias), S6 = (Bias[4*Col+2]<<NormBias), S7 = (Bias[4*Col+3]<<NormBias);
+                	}
+                        for (i=0; i<(W_In1/4); i++) {
 				v4s V0 = VIn1[i];
 				v4s A = VBuff0[i];
 				v4s B = VBuff1[i];
@@ -5522,110 +2499,159 @@ void KerParMatMulTransposedB32_PL_SQ8(KerMatMul_PL_SQ8_T *Arg)
                         }
                         for (i=(W_In1/4)*4; i<W_In1; i++) {
 				int V0 = In1[l1*W_In1 + i];
-				S0 += V0 * pIn2[i];
-				S1 += V0 * pIn2[i+1*H_In2];
-				S2 += V0 * pIn2[i+2*H_In2];
-				S3 += V0 * pIn2[i+3*H_In2];
+				S0 += V0 * BufferColIn2[i];
+				S1 += V0 * BufferColIn2[i+1*H_In2];
+				S2 += V0 * BufferColIn2[i+2*H_In2];
+				S3 += V0 * BufferColIn2[i+3*H_In2];
 				int V1 = In1[(l1+1)*W_In1 + i];
-				S4 += V1 * pIn2[i];
-				S5 += V1 * pIn2[i+1*H_In2];
-				S6 += V1 * pIn2[i+2*H_In2];
-				S7 += V1 * pIn2[i+3*H_In2];
+				S4 += V1 * BufferColIn2[i];
+				S5 += V1 * BufferColIn2[i+1*H_In2];
+				S6 += V1 * BufferColIn2[i+2*H_In2];
+				S7 += V1 * BufferColIn2[i+3*H_In2];
 			}
-			v4s R1 = gap_pack4(gap_clip(AT_SCALE(S0, Scale, ScaleN), 7), gap_clip(AT_SCALE(S1, Scale, ScaleN), 7),
-					   gap_clip(AT_SCALE(S2, Scale, ScaleN), 7), gap_clip(AT_SCALE(S3, Scale, ScaleN), 7));
-			v4s R2 = gap_pack4(gap_clip(AT_SCALE(S4, Scale, ScaleN), 7), gap_clip(AT_SCALE(S5, Scale, ScaleN), 7),
-					   gap_clip(AT_SCALE(S6, Scale, ScaleN), 7), gap_clip(AT_SCALE(S7, Scale, ScaleN), 7));
-			*((v4s *) (pOut+(l1  )*W_Out+4*Col)) = R1;
-			*((v4s *) (pOut+(l1+1)*W_Out+4*Col)) = R2;
-			pIn2 += 4*H_In2;
-        	}
-        	for (Col=(W_In2/4)*4; Col<W_In2; Col++) {
-		        v4s * __restrict__ VBuff0 = (v4s *) pIn2;
-                        int S0=0, S1=0;
-		        if (Bias) {
-	                        S0 = (Bias[Col]<<NormBias), S1=S0;
-	                }
-                        for (i=0; i<(W_In1/4); i++) {
+			S0 = AT_SCALE(S0, Scale, ScaleN); ACT_SWITCH(S0, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+			S1 = AT_SCALE(S1, Scale, ScaleN); ACT_SWITCH(S1, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+			S2 = AT_SCALE(S2, Scale, ScaleN); ACT_SWITCH(S2, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+			S3 = AT_SCALE(S3, Scale, ScaleN); ACT_SWITCH(S3, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+			S4 = AT_SCALE(S4, Scale, ScaleN); ACT_SWITCH(S4, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+			S5 = AT_SCALE(S5, Scale, ScaleN); ACT_SWITCH(S5, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+			S6 = AT_SCALE(S6, Scale, ScaleN); ACT_SWITCH(S6, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+			S7 = AT_SCALE(S7, Scale, ScaleN); ACT_SWITCH(S7, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+			v4s R1 = gap_pack4(gap_clip(S0, 7), gap_clip(S1, 7), gap_clip(S2, 7), gap_clip(S3, 7));
+			v4s R2 = gap_pack4(gap_clip(S4, 7), gap_clip(S5, 7), gap_clip(S6, 7), gap_clip(S7, 7));
+			*((v4s *) (Out+(l1+OffLine)*W_Out+4*Col+0+OffCol)) = R1;
+			*((v4s *) (Out+(l1+OffLine+1)*W_Out+4*Col+0+OffCol)) = R2;
+                }
+		if (Iter&0x1) {
+			int l1 = Last - 1;
+			v4s *VIn1 = (v4s *) (&In1[l1*W_In1 + 0]);
+			int S0=0, S1=0, S2=0, S3=0;
+                        if (Bias) {
+		                S0 = (Bias[4*Col]<<NormBias), S1 = (Bias[4*Col+1]<<NormBias), S2 = (Bias[4*Col+2]<<NormBias), S3 = (Bias[4*Col+3]<<NormBias);
+                	}
+			for (i=0; i<(W_In1/4); i++) {
 				v4s V0 = VIn1[i];
-				v4s V1 = VIn2[i];
-				v4s A = VBuff0[i];
-                                S0 = gap_sumdotp4(V0, A, S0);
-                                S1 = gap_sumdotp4(V1, A, S1);
-                        }
+				v4s A = VBuff0[i]; S0 = gap_sumdotp4(V0, A, S0);;
+				v4s B = VBuff1[i]; S1 = gap_sumdotp4(V0, B, S1);;
+				v4s C = VBuff2[i]; S2 = gap_sumdotp4(V0, C, S2);;
+				v4s D = VBuff3[i]; S3 = gap_sumdotp4(V0, D, S3);;
+			}
                         for (i=(W_In1/4)*4; i<W_In1; i++) {
-				int V0 = In1[(l1  )*W_In1 + i];
-				int V1 = In1[(l1+1)*W_In1 + i];
-				S0 += V0 * pIn2[i];
-				S1 += V1 * pIn2[i];
+				int V0 = In1[l1*W_In1 + i];
+				S0 += V0 * BufferColIn2[i];
+				S1 += V0 * BufferColIn2[i+1*H_In2];
+				S2 += V0 * BufferColIn2[i+2*H_In2];
+				S3 += V0 * BufferColIn2[i+3*H_In2];
 			}
-                        pOut[(l1  )*W_Out + Col] = gap_clip(AT_SCALE(S0, Scale, ScaleN), 7);
-                        pOut[(l1+1)*W_Out + Col] = gap_clip(AT_SCALE(S1, Scale, ScaleN), 7);
-			pIn2 += H_In2;
-        	}
+			S0 = AT_SCALE(S0, Scale, ScaleN); ACT_SWITCH(S0, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+			S1 = AT_SCALE(S1, Scale, ScaleN); ACT_SWITCH(S1, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+			S2 = AT_SCALE(S2, Scale, ScaleN); ACT_SWITCH(S2, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+			S3 = AT_SCALE(S3, Scale, ScaleN); ACT_SWITCH(S3, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+			v4s R1 = gap_pack4(gap_clip(S0, 7), gap_clip(S1, 7), gap_clip(S2, 7), gap_clip(S3, 7));
+			*((v4s *) (Out+(l1+OffLine)*W_Out+4*Col+0+OffCol)) = R1;
+		}
+                gap_waitbarrier(0);
         }
-        if (Iter&0x1) {
-        	int l1 = Last-1;
-        	signed char *pIn2 = In2;
-                v4s *VIn1 = (v4s *) (&In1[(l1  )*W_In1 + 0]);
-        	for (Col=0; Col<(W_In2/4); Col++) {
-		        v4s * __restrict__ VBuff0 = (v4s *) pIn2;
-		        v4s * __restrict__ VBuff1 = (v4s *) (pIn2+H_In2);
-		        v4s * __restrict__ VBuff2 = (v4s *) (pIn2+2*H_In2);
-		        v4s * __restrict__ VBuff3 = (v4s *) (pIn2+3*H_In2);
-		        int S0=0, S1=0, S2=0, S3=0;
-		        if (Bias) {
-	                        S0 = (Bias[4*Col]<<NormBias)  ;
-	                        S1 = (Bias[4*Col+1]<<NormBias);
-	                        S2 = (Bias[4*Col+2]<<NormBias);
-	                        S3 = (Bias[4*Col+3]<<NormBias);
-	                }
-        		for (i=0; i<(W_In1/4); i++) {
+	if (W_In2&0x2) {
+		Col = W_In2/2 - 1;
+                for (i=F;i<L; i++) {
+			int X0 = In2[i*W_In2+2*Col+0];
+			int X1 = In2[i*W_In2+2*Col+1];
+			BufferColIn2[i+0*H_In2] = X0; // In2[i*W_In2+2*Col+0];
+			BufferColIn2[i+1*H_In2] = X1; // In2[i*W_In2+2*Col+1];
+		}
+                gap_waitbarrier(0);
+                for (Line=First; Line<Last; Line++) {
+                        v4s *VIn1 = (v4s *) (&In1[Line*W_In1 + 0]);
+                        int S0=0, S1=0; /* accumulators for output columns 2*Col and 2*Col+1 of this 2-column pass */
+                        if (Bias) {
+		                S0 = (Bias[2*Col]<<NormBias), S1 = (Bias[2*Col+1]<<NormBias); /* bias index follows the 2*Col column pair used for In2 and Out in this pass (was 4*Col) */
+                	}
+                        for (i=0; i<(W_In1/4); i++) {
 				v4s V0 = VIn1[i];
-				v4s A = VBuff0[i];
-				v4s B = VBuff1[i];
-				v4s C = VBuff2[i];
-				v4s D = VBuff3[i];
-                                S0 = gap_sumdotp4(V0, A, S0);
-                                S1 = gap_sumdotp4(V0, B, S1);
-                                S2 = gap_sumdotp4(V0, C, S2);
-                                S3 = gap_sumdotp4(V0, D, S3);
+                                S0 = gap_sumdotp4(V0, VBuff0[i], S0);
+                                S1 = gap_sumdotp4(V0, VBuff1[i], S1);
                         }
                         for (i=(W_In1/4)*4; i<W_In1; i++) {
-				int V0 = In1[l1*W_In1 + i];
-				S0 += V0 * pIn2[i];
-				S1 += V0 * pIn2[i+1*H_In2];
-				S2 += V0 * pIn2[i+2*H_In2];
-				S3 += V0 * pIn2[i+3*H_In2];
+				int V0 = In1[Line*W_In1 + i];
+				S0 += V0 * BufferColIn2[i];
+				S1 += V0 * BufferColIn2[i+1*H_In2];
 			}
-			v4s R1 = gap_pack4(gap_clip(AT_SCALE(S0, Scale, ScaleN), 7), gap_clip(AT_SCALE(S1, Scale, ScaleN), 7),
-					   gap_clip(AT_SCALE(S2, Scale, ScaleN), 7), gap_clip(AT_SCALE(S3, Scale, ScaleN), 7));
-			*((v4s *) (pOut+(l1  )*W_Out+4*Col)) = R1;
-			pIn2 += 4*H_In2;
-        	}
-        	for (Col=(W_In2/4)*4; Col<W_In2; Col++) {
-		        v4s * __restrict__ VBuff0 = (v4s *) pIn2;
-                        int S0=0;
-		        if (Bias) {
-	                        S0 = (Bias[Col]<<NormBias);
-	                }
+			S0 = AT_SCALE(S0, Scale, ScaleN); ACT_SWITCH(S0, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+			S1 = AT_SCALE(S1, Scale, ScaleN); ACT_SWITCH(S1, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+			Out[(Line+OffLine)*W_Out+2*Col  +OffCol] = gap_clip(S0, 7);
+			Out[(Line+OffLine)*W_Out+2*Col+1+OffCol] = gap_clip(S1, 7);
+                }
+                gap_waitbarrier(0);
+	}
+	if (W_In2&0x1) {
+		Col = W_In2-1;
+                for (i=F;i<L; i++) {
+			int X0 = In2[i*W_In2+1*Col+0];
+			BufferColIn2[i+0*H_In2] = X0; // In2[i*W_In2+4*Col+0];
+		}
+                gap_waitbarrier(0);
+                for (Line=First; Line<Last; Line++) {
+                        v4s *VIn1 = (v4s *) (&In1[Line*W_In1 + 0]);
+                        int S0=Bias?(Bias[Col]<<NormBias):0;
                         for (i=0; i<(W_In1/4); i++) {
 				v4s V0 = VIn1[i];
-				v4s A = VBuff0[i];
-                                S0 = gap_sumdotp4(V0, A, S0);
+                                S0 = gap_sumdotp4(V0, VBuff0[i], S0);
                         }
                         for (i=(W_In1/4)*4; i<W_In1; i++) {
-				int V0 = In1[(l1  )*W_In1 + i];
-				S0 += V0 * pIn2[i];
+				int V0 = In1[Line*W_In1 + i];
+				S0 += V0 * BufferColIn2[i];
 			}
-                        pOut[(l1  )*W_Out + Col] = gap_clip(AT_SCALE(S0, Scale, ScaleN), 7);
-			pIn2 += H_In2;
-        	}
-        }
-        gap_waitbarrier(0);
+			S0 = AT_SCALE(S0, Scale, ScaleN); ACT_SWITCH(S0, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+			Out[(Line+OffLine)*W_Out+1*Col+0+OffCol] = gap_clip(S0, 7);
+                }
+                gap_waitbarrier(0);
+	}
+}
+
+void KerParMatMulB32_PL_SQ8(KerMatMul_PL_SQ8_T *Arg) { /* Entry point: forwards Arg to the always_inline KerParMatMulB32_PL_SQ8_act with Activation = ACT_NONE */
+	KerParMatMulB32_PL_SQ8_act(Arg, ACT_NONE);
+}
+
+void KerParMatMulB32_PL_ReLU_SQ8(KerMatMul_PL_SQ8_T *Arg) { /* Same shared kernel, Activation = ACT_RELU */
+	KerParMatMulB32_PL_SQ8_act(Arg, ACT_RELU);
+}
+
+void KerParMatMulB32_PL_ReLUN_SQ8(KerMatMul_PL_SQ8_T *Arg) { /* Same shared kernel, Activation = ACT_RELUN */
+	KerParMatMulB32_PL_SQ8_act(Arg, ACT_RELUN);
+}
+
+void KerParMatMulB32_PL_ReLUM_SQ8(KerMatMul_PL_SQ8_T *Arg) { /* Same shared kernel, Activation = ACT_RELUM */
+	KerParMatMulB32_PL_SQ8_act(Arg, ACT_RELUM);
+}
+
+void KerParMatMulB32_PL_ReLUMN_SQ8(KerMatMul_PL_SQ8_T *Arg) { /* Same shared kernel, Activation = ACT_RELUMN */
+	KerParMatMulB32_PL_SQ8_act(Arg, ACT_RELUMN);
+}
+
+void KerParMatMulB32_PL_LeakyReLU_SQ8(KerMatMul_PL_SQ8_T *Arg) { /* Same shared kernel, Activation = ACT_LEAKYRELU */
+	KerParMatMulB32_PL_SQ8_act(Arg, ACT_LEAKYRELU);
+}
+
+void KerParMatMulB32_PL_HSwish_SQ8(KerMatMul_PL_SQ8_T *Arg) { /* Same shared kernel, Activation = ACT_HSWISH */
+	KerParMatMulB32_PL_SQ8_act(Arg, ACT_HSWISH);
+}
+
+void KerParMatMulB32_PL_HSigmoid_SQ8(KerMatMul_PL_SQ8_T *Arg) { /* Same shared kernel, Activation = ACT_HSIGMOID */
+	KerParMatMulB32_PL_SQ8_act(Arg, ACT_HSIGMOID);
+}
+
+void KerParMatMulB32_PL_Sigmoid_SQ8(KerMatMul_PL_SQ8_T *Arg) { /* Same shared kernel, Activation = ACT_SIGMOID */
+	KerParMatMulB32_PL_SQ8_act(Arg, ACT_SIGMOID);
+}
+
+void KerParMatMulB32_PL_Tanh_SQ8(KerMatMul_PL_SQ8_T *Arg) { /* Same shared kernel, Activation = ACT_TANH */
+	KerParMatMulB32_PL_SQ8_act(Arg, ACT_TANH);
 }
 
-void KerParMatMulTransposedB32_ReLU_PL_SQ8(KerMatMul_PL_SQ8_T *Arg)
+static inline void __attribute__((always_inline)) KerParMatMulTransposedB32_SQ8_act(
+	KerMatMul_SQ8_T *Arg,
+	CNN_ActivationOper_T Activation)
 
 {
 	/*
@@ -5639,11 +2665,14 @@ void KerParMatMulTransposedB32_ReLU_PL_SQ8(KerMatMul_PL_SQ8_T *Arg)
         int * __restrict__ Bias = Arg->Bias;
         signed char * __restrict__ Out = Arg->Out;
         unsigned int W_Out = Arg->W_Out;
-	unsigned char Scale = (unsigned char) Arg->Infos[AT_INF_OUTSCALE];
-	unsigned char ScaleN = (unsigned char) Arg->Infos[AT_INF_OUTSCALEN];
+	unsigned char * __restrict__ Scale = Arg->Scale;
+	unsigned char * __restrict__ ScaleN = Arg->ScaleN;
 	unsigned int NormBias = Arg->NormBias;
         unsigned int OutFirstCol = Arg->OutFirstCol;
         int ColFirst = Arg->ColFirst;
+	unsigned char * Infos = (unsigned char *) Arg->Infos;
+	unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN];
+	int A0 = *((unsigned char *) &Infos[AT_INF_A0]); int B0 = *((unsigned char *) &Infos[AT_INF_B0]); int C0 = *((unsigned char *) &Infos[AT_INF_C0]);
 
         unsigned int H_In2 = W_In1;
         unsigned int H_Out = H_In1;
@@ -5700,10 +2729,16 @@ void KerParMatMulTransposedB32_ReLU_PL_SQ8(KerMatMul_PL_SQ8_T *Arg)
 				S6 += V1 * pIn2[i+2*H_In2];
 				S7 += V1 * pIn2[i+3*H_In2];
 			}
-			v4s R1 = gap_pack4(AT_CLIP_POS_IMM(AT_SCALE(S0, Scale, ScaleN), 7), AT_CLIP_POS_IMM(AT_SCALE(S1, Scale, ScaleN), 7),
-					   AT_CLIP_POS_IMM(AT_SCALE(S2, Scale, ScaleN), 7), AT_CLIP_POS_IMM(AT_SCALE(S3, Scale, ScaleN), 7));
-			v4s R2 = gap_pack4(AT_CLIP_POS_IMM(AT_SCALE(S4, Scale, ScaleN), 7), AT_CLIP_POS_IMM(AT_SCALE(S5, Scale, ScaleN), 7),
-					   AT_CLIP_POS_IMM(AT_SCALE(S6, Scale, ScaleN), 7), AT_CLIP_POS_IMM(AT_SCALE(S7, Scale, ScaleN), 7));
+			S0 = AT_SCALE(S0, Scale[4*Col  ], ScaleN[4*Col  ]); ACT_SWITCH(S0, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+			S1 = AT_SCALE(S1, Scale[4*Col+1], ScaleN[4*Col+1]); ACT_SWITCH(S1, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+			S2 = AT_SCALE(S2, Scale[4*Col+2], ScaleN[4*Col+2]); ACT_SWITCH(S2, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+			S3 = AT_SCALE(S3, Scale[4*Col+3], ScaleN[4*Col+3]); ACT_SWITCH(S3, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+			S4 = AT_SCALE(S4, Scale[4*Col  ], ScaleN[4*Col  ]); ACT_SWITCH(S4, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+			S5 = AT_SCALE(S5, Scale[4*Col+1], ScaleN[4*Col+1]); ACT_SWITCH(S5, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+			S6 = AT_SCALE(S6, Scale[4*Col+2], ScaleN[4*Col+2]); ACT_SWITCH(S6, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+			S7 = AT_SCALE(S7, Scale[4*Col+3], ScaleN[4*Col+3]); ACT_SWITCH(S7, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+			v4s R1 = gap_pack4(gap_clip(S0, 7), gap_clip(S1, 7), gap_clip(S2, 7), gap_clip(S3, 7));
+			v4s R2 = gap_pack4(gap_clip(S4, 7), gap_clip(S5, 7), gap_clip(S6, 7), gap_clip(S7, 7));
 			*((v4s *) (pOut+(l1  )*W_Out+4*Col)) = R1;
 			*((v4s *) (pOut+(l1+1)*W_Out+4*Col)) = R2;
 			pIn2 += 4*H_In2;
@@ -5727,8 +2762,10 @@ void KerParMatMulTransposedB32_ReLU_PL_SQ8(KerMatMul_PL_SQ8_T *Arg)
 				S0 += V0 * pIn2[i];
 				S1 += V1 * pIn2[i];
 			}
-                        pOut[(l1  )*W_Out + Col] = AT_CLIP_POS_IMM(AT_SCALE(S0, Scale, ScaleN), 7);
-                        pOut[(l1+1)*W_Out + Col] = AT_CLIP_POS_IMM(AT_SCALE(S1, Scale, ScaleN), 7);
+			S0 = AT_SCALE(S0, Scale[Col], ScaleN[Col]); ACT_SWITCH(S0, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+			S1 = AT_SCALE(S1, Scale[Col], ScaleN[Col]); ACT_SWITCH(S1, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+                        pOut[(l1  )*W_Out + Col] = gap_clip(S0, 7);
+                        pOut[(l1+1)*W_Out + Col] = gap_clip(S1, 7);
 			pIn2 += H_In2;
         	}
         }
@@ -5741,12 +2778,12 @@ void KerParMatMulTransposedB32_ReLU_PL_SQ8(KerMatMul_PL_SQ8_T *Arg)
 		        v4s * __restrict__ VBuff1 = (v4s *) (pIn2+H_In2);
 		        v4s * __restrict__ VBuff2 = (v4s *) (pIn2+2*H_In2);
 		        v4s * __restrict__ VBuff3 = (v4s *) (pIn2+3*H_In2);
-		        int S0=0, S1=0, S2=0, S3=0;
+                        int S0=0, S1=0, S2=0, S3=0, S4=0, S5=0, S6=0, S7=0;
 		        if (Bias) {
-	                        S0 = (Bias[4*Col]<<NormBias)  ;
-	                        S1 = (Bias[4*Col+1]<<NormBias);
-	                        S2 = (Bias[4*Col+2]<<NormBias);
-	                        S3 = (Bias[4*Col+3]<<NormBias);
+	                        S0 = (Bias[4*Col]<<NormBias)  , S4=S0;
+	                        S1 = (Bias[4*Col+1]<<NormBias), S5=S1;
+	                        S2 = (Bias[4*Col+2]<<NormBias), S6=S2;
+	                        S3 = (Bias[4*Col+3]<<NormBias), S7=S3;
 	                }
         		for (i=0; i<(W_In1/4); i++) {
 				v4s V0 = VIn1[i];
@@ -5766,8 +2803,11 @@ void KerParMatMulTransposedB32_ReLU_PL_SQ8(KerMatMul_PL_SQ8_T *Arg)
 				S2 += V0 * pIn2[i+2*H_In2];
 				S3 += V0 * pIn2[i+3*H_In2];
 			}
-			v4s R1 = gap_pack4(AT_CLIP_POS_IMM(AT_SCALE(S0, Scale, ScaleN), 7), AT_CLIP_POS_IMM(AT_SCALE(S1, Scale, ScaleN), 7),
-					   AT_CLIP_POS_IMM(AT_SCALE(S2, Scale, ScaleN), 7), AT_CLIP_POS_IMM(AT_SCALE(S3, Scale, ScaleN), 7));
+			S0 = AT_SCALE(S0, Scale[4*Col  ], ScaleN[4*Col  ]); ACT_SWITCH(S0, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+			S1 = AT_SCALE(S1, Scale[4*Col+1], ScaleN[4*Col+1]); ACT_SWITCH(S1, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+			S2 = AT_SCALE(S2, Scale[4*Col+2], ScaleN[4*Col+2]); ACT_SWITCH(S2, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+			S3 = AT_SCALE(S3, Scale[4*Col+3], ScaleN[4*Col+3]); ACT_SWITCH(S3, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+			v4s R1 = gap_pack4(gap_clip(S0, 7), gap_clip(S1, 7), gap_clip(S2, 7), gap_clip(S3, 7));
 			*((v4s *) (pOut+(l1  )*W_Out+4*Col)) = R1;
 			pIn2 += 4*H_In2;
         	}
@@ -5786,14 +2826,58 @@ void KerParMatMulTransposedB32_ReLU_PL_SQ8(KerMatMul_PL_SQ8_T *Arg)
 				int V0 = In1[(l1  )*W_In1 + i];
 				S0 += V0 * pIn2[i];
 			}
-                        pOut[(l1  )*W_Out + Col] = AT_CLIP_POS_IMM(AT_SCALE(S0, Scale, ScaleN), 7);
+			S0 = AT_SCALE(S0, Scale[Col], ScaleN[Col]); ACT_SWITCH(S0, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+                        pOut[(l1  )*W_Out + Col] = gap_clip(S0, 7);
 			pIn2 += H_In2;
         	}
         }
         gap_waitbarrier(0);
 }
 
-void KerParMatMulTransposedB32_ReLUN_PL_SQ8(KerMatMul_PL_SQ8_T *Arg)
+void KerParMatMulTransposedB32_SQ8(KerMatMul_SQ8_T *Arg) { /* Entry point: forwards Arg to the always_inline KerParMatMulTransposedB32_SQ8_act with Activation = ACT_NONE */
+	KerParMatMulTransposedB32_SQ8_act(Arg, ACT_NONE);
+}
+
+void KerParMatMulTransposedB32_ReLU_SQ8(KerMatMul_SQ8_T *Arg) { /* Same shared kernel, Activation = ACT_RELU */
+	KerParMatMulTransposedB32_SQ8_act(Arg, ACT_RELU);
+}
+
+void KerParMatMulTransposedB32_ReLUN_SQ8(KerMatMul_SQ8_T *Arg) { /* Same shared kernel, Activation = ACT_RELUN */
+	KerParMatMulTransposedB32_SQ8_act(Arg, ACT_RELUN);
+}
+
+void KerParMatMulTransposedB32_ReLUM_SQ8(KerMatMul_SQ8_T *Arg) { /* Same shared kernel, Activation = ACT_RELUM */
+	KerParMatMulTransposedB32_SQ8_act(Arg, ACT_RELUM);
+}
+
+void KerParMatMulTransposedB32_ReLUMN_SQ8(KerMatMul_SQ8_T *Arg) { /* Same shared kernel, Activation = ACT_RELUMN */
+	KerParMatMulTransposedB32_SQ8_act(Arg, ACT_RELUMN);
+}
+
+void KerParMatMulTransposedB32_LeakyReLU_SQ8(KerMatMul_SQ8_T *Arg) { /* Same shared kernel, Activation = ACT_LEAKYRELU */
+	KerParMatMulTransposedB32_SQ8_act(Arg, ACT_LEAKYRELU);
+}
+
+void KerParMatMulTransposedB32_HSwish_SQ8(KerMatMul_SQ8_T *Arg) { /* Same shared kernel, Activation = ACT_HSWISH */
+	KerParMatMulTransposedB32_SQ8_act(Arg, ACT_HSWISH);
+}
+
+void KerParMatMulTransposedB32_HSigmoid_SQ8(KerMatMul_SQ8_T *Arg) { /* Same shared kernel, Activation = ACT_HSIGMOID */
+	KerParMatMulTransposedB32_SQ8_act(Arg, ACT_HSIGMOID);
+}
+
+void KerParMatMulTransposedB32_Sigmoid_SQ8(KerMatMul_SQ8_T *Arg) { /* Same shared kernel, Activation = ACT_SIGMOID */
+	KerParMatMulTransposedB32_SQ8_act(Arg, ACT_SIGMOID);
+}
+
+void KerParMatMulTransposedB32_Tanh_SQ8(KerMatMul_SQ8_T *Arg) { /* Same shared kernel, Activation = ACT_TANH */
+	KerParMatMulTransposedB32_SQ8_act(Arg, ACT_TANH);
+}
+
+
+static inline void __attribute__((always_inline)) KerParMatMulTransposedB32_PL_SQ8_act(
+	KerMatMul_PL_SQ8_T *Arg,
+	CNN_ActivationOper_T Activation)
 
 {
 	/*
@@ -5812,7 +2896,9 @@ void KerParMatMulTransposedB32_ReLUN_PL_SQ8(KerMatMul_PL_SQ8_T *Arg)
 	unsigned int NormBias = Arg->NormBias;
         unsigned int OutFirstCol = Arg->OutFirstCol;
         int ColFirst = Arg->ColFirst;
-        int A0 = Arg->Infos[AT_INF_A0];
+	unsigned char * Infos = (unsigned char *) Arg->Infos;
+	unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN];
+	int A0 = *((unsigned char *) &Infos[AT_INF_A0]); int B0 = *((unsigned char *) &Infos[AT_INF_B0]); int C0 = *((unsigned char *) &Infos[AT_INF_C0]);
 
         unsigned int H_In2 = W_In1;
         unsigned int H_Out = H_In1;
@@ -5869,10 +2955,16 @@ void KerParMatMulTransposedB32_ReLUN_PL_SQ8(KerMatMul_PL_SQ8_T *Arg)
 				S6 += V1 * pIn2[i+2*H_In2];
 				S7 += V1 * pIn2[i+3*H_In2];
 			}
-			v4s R1 = gap_pack4(AT_CLIP_POS(AT_SCALE(S0, Scale, ScaleN), A0), AT_CLIP_POS(AT_SCALE(S1, Scale, ScaleN), A0),
-					   AT_CLIP_POS(AT_SCALE(S2, Scale, ScaleN), A0), AT_CLIP_POS(AT_SCALE(S3, Scale, ScaleN), A0));
-			v4s R2 = gap_pack4(AT_CLIP_POS(AT_SCALE(S4, Scale, ScaleN), A0), AT_CLIP_POS(AT_SCALE(S5, Scale, ScaleN), A0),
-					   AT_CLIP_POS(AT_SCALE(S6, Scale, ScaleN), A0), AT_CLIP_POS(AT_SCALE(S7, Scale, ScaleN), A0));
+			S0 = AT_SCALE(S0, Scale, ScaleN); ACT_SWITCH(S0, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+			S1 = AT_SCALE(S1, Scale, ScaleN); ACT_SWITCH(S1, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+			S2 = AT_SCALE(S2, Scale, ScaleN); ACT_SWITCH(S2, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+			S3 = AT_SCALE(S3, Scale, ScaleN); ACT_SWITCH(S3, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+			S4 = AT_SCALE(S4, Scale, ScaleN); ACT_SWITCH(S4, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+			S5 = AT_SCALE(S5, Scale, ScaleN); ACT_SWITCH(S5, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+			S6 = AT_SCALE(S6, Scale, ScaleN); ACT_SWITCH(S6, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+			S7 = AT_SCALE(S7, Scale, ScaleN); ACT_SWITCH(S7, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+			v4s R1 = gap_pack4(gap_clip(S0, 7), gap_clip(S1, 7), gap_clip(S2, 7), gap_clip(S3, 7));
+			v4s R2 = gap_pack4(gap_clip(S4, 7), gap_clip(S5, 7), gap_clip(S6, 7), gap_clip(S7, 7));
 			*((v4s *) (pOut+(l1  )*W_Out+4*Col)) = R1;
 			*((v4s *) (pOut+(l1+1)*W_Out+4*Col)) = R2;
 			pIn2 += 4*H_In2;
@@ -5896,8 +2988,10 @@ void KerParMatMulTransposedB32_ReLUN_PL_SQ8(KerMatMul_PL_SQ8_T *Arg)
 				S0 += V0 * pIn2[i];
 				S1 += V1 * pIn2[i];
 			}
-                        pOut[(l1  )*W_Out + Col] = AT_CLIP_POS(AT_SCALE(S0, Scale, ScaleN), A0);
-                        pOut[(l1+1)*W_Out + Col] = AT_CLIP_POS(AT_SCALE(S1, Scale, ScaleN), A0);
+			S0 = AT_SCALE(S0, Scale, ScaleN); ACT_SWITCH(S0, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+			S1 = AT_SCALE(S1, Scale, ScaleN); ACT_SWITCH(S1, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+                        pOut[(l1  )*W_Out + Col] = gap_clip(S0, 7);
+                        pOut[(l1+1)*W_Out + Col] = gap_clip(S1, 7);
 			pIn2 += H_In2;
         	}
         }
@@ -5935,8 +3029,11 @@ void KerParMatMulTransposedB32_ReLUN_PL_SQ8(KerMatMul_PL_SQ8_T *Arg)
 				S2 += V0 * pIn2[i+2*H_In2];
 				S3 += V0 * pIn2[i+3*H_In2];
 			}
-			v4s R1 = gap_pack4(AT_CLIP_POS(AT_SCALE(S0, Scale, ScaleN), A0), AT_CLIP_POS(AT_SCALE(S1, Scale, ScaleN), A0),
-					   AT_CLIP_POS(AT_SCALE(S2, Scale, ScaleN), A0), AT_CLIP_POS(AT_SCALE(S3, Scale, ScaleN), A0));
+			S0 = AT_SCALE(S0, Scale, ScaleN); ACT_SWITCH(S0, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+			S1 = AT_SCALE(S1, Scale, ScaleN); ACT_SWITCH(S1, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+			S2 = AT_SCALE(S2, Scale, ScaleN); ACT_SWITCH(S2, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+			S3 = AT_SCALE(S3, Scale, ScaleN); ACT_SWITCH(S3, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+			v4s R1 = gap_pack4(gap_clip(S0, 7), gap_clip(S1, 7), gap_clip(S2, 7), gap_clip(S3, 7));
 			*((v4s *) (pOut+(l1  )*W_Out+4*Col)) = R1;
 			pIn2 += 4*H_In2;
         	}
@@ -5955,10 +3052,52 @@ void KerParMatMulTransposedB32_ReLUN_PL_SQ8(KerMatMul_PL_SQ8_T *Arg)
 				int V0 = In1[(l1  )*W_In1 + i];
 				S0 += V0 * pIn2[i];
 			}
-                        pOut[(l1  )*W_Out + Col] = AT_CLIP_POS(AT_SCALE(S0, Scale, ScaleN), A0);
+			S0 = AT_SCALE(S0, Scale, ScaleN); ACT_SWITCH(S0, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+                        pOut[(l1  )*W_Out + Col] = gap_clip(S0, 7);
 			pIn2 += H_In2;
         	}
         }
         gap_waitbarrier(0);
 }
 
+void KerParMatMulTransposedB32_PL_SQ8(KerMatMul_PL_SQ8_T *Arg) { /* Entry point: forwards Arg to the always_inline KerParMatMulTransposedB32_PL_SQ8_act with Activation = ACT_NONE */
+	KerParMatMulTransposedB32_PL_SQ8_act(Arg, ACT_NONE);
+}
+
+void KerParMatMulTransposedB32_ReLU_PL_SQ8(KerMatMul_PL_SQ8_T *Arg) { /* Same shared kernel, Activation = ACT_RELU */
+	KerParMatMulTransposedB32_PL_SQ8_act(Arg, ACT_RELU);
+}
+
+void KerParMatMulTransposedB32_ReLUN_PL_SQ8(KerMatMul_PL_SQ8_T *Arg) { /* Same shared kernel, Activation = ACT_RELUN */
+	KerParMatMulTransposedB32_PL_SQ8_act(Arg, ACT_RELUN);
+}
+
+void KerParMatMulTransposedB32_ReLUM_PL_SQ8(KerMatMul_PL_SQ8_T *Arg) { /* Same shared kernel, Activation = ACT_RELUM */
+	KerParMatMulTransposedB32_PL_SQ8_act(Arg, ACT_RELUM);
+}
+
+void KerParMatMulTransposedB32_ReLUMN_PL_SQ8(KerMatMul_PL_SQ8_T *Arg) { /* Same shared kernel, Activation = ACT_RELUMN */
+	KerParMatMulTransposedB32_PL_SQ8_act(Arg, ACT_RELUMN);
+}
+
+void KerParMatMulTransposedB32_LeakyReLU_PL_SQ8(KerMatMul_PL_SQ8_T *Arg) { /* Same shared kernel, Activation = ACT_LEAKYRELU */
+	KerParMatMulTransposedB32_PL_SQ8_act(Arg, ACT_LEAKYRELU);
+}
+
+void KerParMatMulTransposedB32_HSwish_PL_SQ8(KerMatMul_PL_SQ8_T *Arg) { /* Same shared kernel, Activation = ACT_HSWISH */
+	KerParMatMulTransposedB32_PL_SQ8_act(Arg, ACT_HSWISH);
+}
+
+void KerParMatMulTransposedB32_HSigmoid_PL_SQ8(KerMatMul_PL_SQ8_T *Arg) { /* Same shared kernel, Activation = ACT_HSIGMOID */
+	KerParMatMulTransposedB32_PL_SQ8_act(Arg, ACT_HSIGMOID);
+}
+
+void KerParMatMulTransposedB32_Sigmoid_PL_SQ8(KerMatMul_PL_SQ8_T *Arg) { /* Same shared kernel, Activation = ACT_SIGMOID */
+	KerParMatMulTransposedB32_PL_SQ8_act(Arg, ACT_SIGMOID);
+}
+
+void KerParMatMulTransposedB32_Tanh_PL_SQ8(KerMatMul_PL_SQ8_T *Arg) { /* Same shared kernel, Activation = ACT_TANH */
+	KerParMatMulTransposedB32_PL_SQ8_act(Arg, ACT_TANH);
+}
+
+#pragma GCC diagnostic pop
diff --git a/tools/autotiler_v3/CNN_Libraries_SQ8/CNN_MatMul_Conv_SQ8.c b/tools/autotiler_v3/CNN_Libraries_SQ8/CNN_MatMul_Conv_SQ8.c
index 81e4e8db1..e90a8d3b4 100644
--- a/tools/autotiler_v3/CNN_Libraries_SQ8/CNN_MatMul_Conv_SQ8.c
+++ b/tools/autotiler_v3/CNN_Libraries_SQ8/CNN_MatMul_Conv_SQ8.c
@@ -16,6 +16,7 @@
 
 #pragma GCC diagnostic push
 #pragma GCC diagnostic ignored "-Wsign-compare"
+#pragma GCC diagnostic ignored "-Wswitch"
 
 #include <stdio.h>
 #include "CNN_BasicKernels_SQ8.h"
@@ -53,8 +54,14 @@ static int LastDefinedOutput(int DimIn, int F, int PadL, int Stride, int D)
 }
 
 // #define OLD
-void KerPar_MM_Conv1D_SQ8(
-	Ker_MM_Conv_SQ8_T *Arg
+
+/*
+ * 1D convolutional kernels based on MatMul (im2col: ColBuff) with CHW in/out tensor order
+ * Optional fused activation applied to the 32-bit accumulator -> ACT_SWITCH defined in CNN_BasicKernels_SQ8.h
+ */
+static inline void __attribute__((always_inline)) KerPar_MM_Conv1D_SQ8_act(
+	Ker_MM_Conv_SQ8_T *Arg,
+	CNN_ActivationOper_T Activation
 	)
 
 {
@@ -73,6 +80,9 @@ void KerPar_MM_Conv1D_SQ8(
         signed char * __restrict__ ColBuff = Arg->ColBuff;
 
 	int Wo = Arg->Wo, Ho = Arg->Ho;
+	unsigned char * Infos = (unsigned char *) Arg->Infos; /* explicit cast, matching the other SQ8 MatMul kernels; NOTE(review): confirm the declared type of Arg->Infos */
+	unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN];
+	int A0 = *((unsigned char *) &Infos[AT_INF_A0]); int B0 = *((unsigned char *) &Infos[AT_INF_B0]); int C0 = *((unsigned char *) &Infos[AT_INF_C0]);
 
 	/* ColBuff must be large enough to accomodate Align(Fx*InFeat, 8) elements */
 	v4s * __restrict__ VBuff = (v4s *) ColBuff;
@@ -149,7 +159,8 @@ void KerPar_MM_Conv1D_SQ8(
 	                                S0 = gap_sumdotp4(V1, C1, S0);
 	                        }
 	                        unsigned int Sc = Scale[Line], ScN = ScaleN[Line];
-	                        Out[Line*Wo*Ho + l*Wo + c] = gap_clip(AT_SCALE(S0, Sc, ScN), 7);
+	                        S0 = AT_SCALE(S0, Sc, ScN); ACT_SWITCH(S0, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+	                        Out[Line*Wo*Ho + l*Wo + c] = gap_clip(S0, 7);
 	                }
 			gap_waitbarrier(0);
 		}
@@ -157,81 +168,54 @@ void KerPar_MM_Conv1D_SQ8(
 	}
 }
 
-static void __attribute__ ((noinline)) MatMul2Out(
-		signed char *__restrict__ pI,
-		signed char *__restrict__ pC,
-		int *__restrict__ pBias,
-		unsigned char *__restrict__ pSc,
-		unsigned char *__restrict__ pScN,
-		signed char *__restrict__ pOut0,
-		signed char *__restrict__ pOut1,
-		unsigned int InFeat,
-		unsigned int IterOut,
-		unsigned int NormBias
-		)
+void KerPar_MM_Conv1D_SQ8(Ker_MM_Conv_SQ8_T *Arg) {
+	KerPar_MM_Conv1D_SQ8_act(Arg, ACT_NONE);
+}
 
-{
-	for (int i=0; i<(IterOut/4); i++) {
-		signed char *pIn0 = pI, *pIn1 = pIn0 + InFeat,
-			    *pC0 = pC, *pC1 = pC0+InFeat, *pC2 = pC1+InFeat, *pC3 = pC2+InFeat;
-		pC+=4;
-		int S00 = (*pBias)<<NormBias, S01 = S00; pBias++;
-		int S10 = (*pBias)<<NormBias, S11 = S10; pBias++;
-		int S20 = (*pBias)<<NormBias, S21 = S20; pBias++;
-		int S30 = (*pBias)<<NormBias, S31 = S30; pBias++;
-		for (int f=0; f<(InFeat/4); f++) {
-			v4s V0 = *((v4s *)pIn0), V1 = *((v4s *)pIn1), C0 = *((v4s *)pC0), C1 = *((v4s *)pC1), C2 = *((v4s *)pC2), C3 = *((v4s *)pC3);
-	                               	S00 = gap_sumdotp4(V0, C0, S00); S01 = gap_sumdotp4(V1, C0, S01);
-	                               	S10 = gap_sumdotp4(V0, C1, S10); S11 = gap_sumdotp4(V1, C1, S11);
-	                               	S20 = gap_sumdotp4(V0, C2, S20); S21 = gap_sumdotp4(V1, C2, S21);
-	                               	S30 = gap_sumdotp4(V0, C3, S30); S31 = gap_sumdotp4(V1, C3, S31);
-			pIn0+=4; pIn1+=4; pC0+=4; pC1+=4; pC2+=4; pC3+=4;
-		}
-		for (int f=4*(InFeat/4); f<InFeat; f++) {
-			int V0 = *pIn0, V1 = *pIn1, C0 = *pC0, C1 = *pC1, C2 = *pC2, C3 = *pC3;
-			S00 += V0*C0; S01 += V1*C0;
-			S10 += V0*C1; S11 += V1*C1;
-			S20 += V0*C2; S21 += V1*C2;
-			S30 += V0*C3; S31 += V1*C3;
-			pIn0++; pIn1++; pC0++; pC1++; pC2++; pC3++;
-		}
-		unsigned int Sc, ScN;
-		Sc = *pSc; ScN = *pScN; pSc++; pScN++;
-		*pOut0 = gap_clip(AT_SCALE(S00, Sc, ScN), 7); pOut0++;
-		*pOut1 = gap_clip(AT_SCALE(S01, Sc, ScN), 7); pOut1++;
-		Sc = *pSc; ScN = *pScN; pSc++; pScN++;
-		*pOut0 = gap_clip(AT_SCALE(S10, Sc, ScN), 7); pOut0++;
-		*pOut1 = gap_clip(AT_SCALE(S11, Sc, ScN), 7); pOut1++;
-		Sc = *pSc; ScN = *pScN; pSc++; pScN++;
-		*pOut0 = gap_clip(AT_SCALE(S20, Sc, ScN), 7); pOut0++;
-		*pOut1 = gap_clip(AT_SCALE(S21, Sc, ScN), 7); pOut1++;
-		Sc = *pSc; ScN = *pScN; pSc++; pScN++;
-		*pOut0 = gap_clip(AT_SCALE(S30, Sc, ScN), 7); pOut0++;
-		*pOut1 = gap_clip(AT_SCALE(S31, Sc, ScN), 7); pOut1++;
-	}
-	for (int i=4*(IterOut/4); i<IterOut; i++) {
-		signed char *pIn0 = pI, *pIn1 = pIn0 + InFeat, *pC0 = pC; // &Filter[(First+i)*InFeat + 0];
-		pC++;
-		int S00 = (*pBias)<<NormBias, S01 = S00; pBias++;
-		for (int f=0; f<(InFeat/4); f++) {
-			v4s V0 = *((v4s *)pIn0), V1 = *((v4s *)pIn1), C0 = *((v4s *)pC0);
-	          	S00 = gap_sumdotp4(V0, C0, S00); S01 = gap_sumdotp4(V1, C0, S01);
-			pIn0+=4; pIn1+=4; pC0+=4;
-		}
-		for (int f=4*(InFeat/4); f<InFeat; f++) {
-			int V0 = *pIn0, V1 = *pIn1, C0 = *pC0;
-			S00 += V0*C0; S01 += V1*C0;
-			pIn0++; pIn1++; pC0++;
-		}
-		unsigned int Sc, ScN;
-		Sc = *pSc; ScN = *pScN; pSc++; pScN++;
-		*pOut0 = gap_clip(AT_SCALE(S00, Sc, ScN), 7); pOut0++;
-		*pOut1 = gap_clip(AT_SCALE(S01, Sc, ScN), 7); pOut1++;
-	}
+void KerPar_MM_Conv1D_ReLU_SQ8(Ker_MM_Conv_SQ8_T *Arg) {
+	KerPar_MM_Conv1D_SQ8_act(Arg, ACT_RELU);
+}
+
+void KerPar_MM_Conv1D_ReLUN_SQ8(Ker_MM_Conv_SQ8_T *Arg) {
+	KerPar_MM_Conv1D_SQ8_act(Arg, ACT_RELUN);
+}
+
+void KerPar_MM_Conv1D_ReLUM_SQ8(Ker_MM_Conv_SQ8_T *Arg) {
+	KerPar_MM_Conv1D_SQ8_act(Arg, ACT_RELUM);
+}
+
+void KerPar_MM_Conv1D_ReLUMN_SQ8(Ker_MM_Conv_SQ8_T *Arg) {
+	KerPar_MM_Conv1D_SQ8_act(Arg, ACT_RELUMN);
+}
+
+void KerPar_MM_Conv1D_LeakyReLU_SQ8(Ker_MM_Conv_SQ8_T *Arg) {
+	KerPar_MM_Conv1D_SQ8_act(Arg, ACT_LEAKYRELU);
+}
+
+void KerPar_MM_Conv1D_HSigmoid_SQ8(Ker_MM_Conv_SQ8_T *Arg) {
+	KerPar_MM_Conv1D_SQ8_act(Arg, ACT_HSIGMOID);
+}
+
+void KerPar_MM_Conv1D_HSwish_SQ8(Ker_MM_Conv_SQ8_T *Arg) {
+	KerPar_MM_Conv1D_SQ8_act(Arg, ACT_HSWISH);
+}
+
+void KerPar_MM_Conv1D_Sigmoid_SQ8(Ker_MM_Conv_SQ8_T *Arg) {
+	KerPar_MM_Conv1D_SQ8_act(Arg, ACT_SIGMOID);
+}
+
+void KerPar_MM_Conv1D_Tanh_SQ8(Ker_MM_Conv_SQ8_T *Arg) {
+	KerPar_MM_Conv1D_SQ8_act(Arg, ACT_TANH);
 }
 
-void KerPar_MM_Conv1x1_HWC_SQ8(
-	Ker_MM_Conv_SQ8_T *Arg
+/*
+ * 1x1 convolution kernels based on MatMul (im2col: ColBuff) with HWC in/out tensor order
+ * Parallelized on the output feature dimension
+ * Optional fused activation, applied to the scaled 32-bit accumulator through ACT_SWITCH (defined in CNN_BasicKernels_SQ8.h)
+ */
+static inline void __attribute__((always_inline)) KerPar_MM_Conv1x1_HWC_SQ8_act(
+	Ker_MM_Conv_SQ8_T *Arg,
+	CNN_ActivationOper_T Activation
 	)
 
 {
@@ -251,6 +235,9 @@ void KerPar_MM_Conv1x1_HWC_SQ8(
         unsigned char * __restrict__ ScaleN = Arg->ScaleN;
 
 	int Wo = Arg->Wo, Ho = Arg->Ho;
+	unsigned char * Infos = Arg->Infos;
+	unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN];
+	int A0 = *((unsigned char *) &Infos[AT_INF_A0]); int B0 = *((unsigned char *) &Infos[AT_INF_B0]); int C0 = *((unsigned char *) &Infos[AT_INF_C0]);
 
 	unsigned int CoreId = gap_coreid();
 	unsigned int ChunkCell = ChunkSize(OutFeat), First = CoreId*ChunkCell, Last  = Min(OutFeat, First+ChunkCell);
@@ -294,17 +281,25 @@ void KerPar_MM_Conv1x1_HWC_SQ8(
 				}
 	                        unsigned int Sc, ScN;
 				Sc = *pSc; ScN = *pScN; pSc++; pScN++;
-				*pOut0 = gap_clip(AT_SCALE(S00, Sc, ScN), 7); pOut0++;
-				*pOut1 = gap_clip(AT_SCALE(S01, Sc, ScN), 7); pOut1++;
-				Sc = *pSc; ScN = *pScN; pSc++; pScN++;
-				*pOut0 = gap_clip(AT_SCALE(S10, Sc, ScN), 7); pOut0++;
-				*pOut1 = gap_clip(AT_SCALE(S11, Sc, ScN), 7); pOut1++;
-				Sc = *pSc; ScN = *pScN; pSc++; pScN++;
-				*pOut0 = gap_clip(AT_SCALE(S20, Sc, ScN), 7); pOut0++;
-				*pOut1 = gap_clip(AT_SCALE(S21, Sc, ScN), 7); pOut1++;
-				Sc = *pSc; ScN = *pScN; pSc++; pScN++;
-				*pOut0 = gap_clip(AT_SCALE(S30, Sc, ScN), 7); pOut0++;
-				*pOut1 = gap_clip(AT_SCALE(S31, Sc, ScN), 7); pOut1++;
+				S00 = AT_SCALE(S00, Sc, ScN); ACT_SWITCH(S00, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+				S01 = AT_SCALE(S01, Sc, ScN); ACT_SWITCH(S01, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+				*pOut0 = gap_clip(S00, 7); pOut0++;
+				*pOut1 = gap_clip(S01, 7); pOut1++;
+				Sc = *pSc; ScN = *pScN; pSc++; pScN++;
+				S10 = AT_SCALE(S10, Sc, ScN); ACT_SWITCH(S10, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+				S11 = AT_SCALE(S11, Sc, ScN); ACT_SWITCH(S11, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+				*pOut0 = gap_clip(S10, 7); pOut0++;
+				*pOut1 = gap_clip(S11, 7); pOut1++;
+				Sc = *pSc; ScN = *pScN; pSc++; pScN++;
+				S20 = AT_SCALE(S20, Sc, ScN); ACT_SWITCH(S20, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+				S21 = AT_SCALE(S21, Sc, ScN); ACT_SWITCH(S21, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+				*pOut0 = gap_clip(S20, 7); pOut0++;
+				*pOut1 = gap_clip(S21, 7); pOut1++;
+				Sc = *pSc; ScN = *pScN; pSc++; pScN++;
+				S30 = AT_SCALE(S30, Sc, ScN); ACT_SWITCH(S30, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+				S31 = AT_SCALE(S31, Sc, ScN); ACT_SWITCH(S31, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+				*pOut0 = gap_clip(S30, 7); pOut0++;
+				*pOut1 = gap_clip(S31, 7); pOut1++;
 	                }
 			for (int i=4*(IterOut/4); i<IterOut; i++) {
 				signed char *pIn0 = pI, *pIn1 = pIn0 + InFeat;
@@ -321,8 +316,10 @@ void KerPar_MM_Conv1x1_HWC_SQ8(
 				}
 	                        unsigned int Sc, ScN;
 				Sc = *pSc; ScN = *pScN; pSc++; pScN++;
-				*pOut0 = gap_clip(AT_SCALE(S00, Sc, ScN), 7); pOut0++;
-				*pOut1 = gap_clip(AT_SCALE(S01, Sc, ScN), 7); pOut1++;
+				S00 = AT_SCALE(S00, Sc, ScN); ACT_SWITCH(S00, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+				S01 = AT_SCALE(S01, Sc, ScN); ACT_SWITCH(S01, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+				*pOut0 = gap_clip(S00, 7); pOut0++;
+				*pOut1 = gap_clip(S01, 7); pOut1++;
 			}
 			PosC += 2*Sx;
 		}
@@ -361,13 +358,17 @@ void KerPar_MM_Conv1x1_HWC_SQ8(
 				}
 	                        unsigned int Sc, ScN;
 				Sc = *pSc; ScN = *pScN; pSc++; pScN++;
-				*pOut0 = gap_clip(AT_SCALE(S00, Sc, ScN), 7); pOut0++;
+				S00 = AT_SCALE(S00, Sc, ScN); ACT_SWITCH(S00, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+				*pOut0 = gap_clip(S00, 7); pOut0++;
 				Sc = *pSc; ScN = *pScN; pSc++; pScN++;
-				*pOut0 = gap_clip(AT_SCALE(S10, Sc, ScN), 7); pOut0++;
+				S10 = AT_SCALE(S10, Sc, ScN); ACT_SWITCH(S10, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+				*pOut0 = gap_clip(S10, 7); pOut0++;
 				Sc = *pSc; ScN = *pScN; pSc++; pScN++;
-				*pOut0 = gap_clip(AT_SCALE(S20, Sc, ScN), 7); pOut0++;
+				S20 = AT_SCALE(S20, Sc, ScN); ACT_SWITCH(S20, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+				*pOut0 = gap_clip(S20, 7); pOut0++;
 				Sc = *pSc; ScN = *pScN; pSc++; pScN++;
-				*pOut0 = gap_clip(AT_SCALE(S30, Sc, ScN), 7); pOut0++;
+				S30 = AT_SCALE(S30, Sc, ScN); ACT_SWITCH(S30, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+				*pOut0 = gap_clip(S30, 7); pOut0++;
 			}
 			for (int i=4*(IterOut/4); i<IterOut; i++) {
 				signed char *pIn0 = pI;
@@ -384,7 +385,8 @@ void KerPar_MM_Conv1x1_HWC_SQ8(
 				}
 	                        unsigned int Sc, ScN;
 				Sc = *pSc; ScN = *pScN; pSc++; pScN++;
-				*pOut0 = gap_clip(AT_SCALE(S00, Sc, ScN), 7); pOut0++;
+				S00 = AT_SCALE(S00, Sc, ScN); ACT_SWITCH(S00, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+				*pOut0 = gap_clip(S00, 7); pOut0++;
 			}
 		}
 		PosL += Sy;
@@ -392,171 +394,55 @@ void KerPar_MM_Conv1x1_HWC_SQ8(
 	gap_waitbarrier(0);
 }
 
+void KerPar_MM_Conv1x1_HWC_SQ8(Ker_MM_Conv_SQ8_T *Arg) {
+	KerPar_MM_Conv1x1_HWC_SQ8_act(Arg, ACT_NONE);
+}
 
-void KerPar_MM_Conv1x1_ReLU_HWC_SQ8(
-	Ker_MM_Conv_SQ8_T *Arg
-	)
+void KerPar_MM_Conv1x1_ReLU_HWC_SQ8(Ker_MM_Conv_SQ8_T *Arg) {
+	KerPar_MM_Conv1x1_HWC_SQ8_act(Arg, ACT_RELU);
+}
 
-{
-	/*
-		For HWC weights (4D Tensor) are expected to be organized as [OutFeat x Fy x Fx x InFeat]
-	*/
-	signed char *__restrict__ In = Arg->In;
-	int W = Arg->W, H = Arg->H;
-	signed char *__restrict__ Filter = Arg->Filter;
-	int Sx = Arg->Sx, Sy = Arg->Sy;
-	unsigned int InFeat = Arg->InFeat, OutFeat = Arg->OutFeat;
+void KerPar_MM_Conv1x1_ReLUN_HWC_SQ8(Ker_MM_Conv_SQ8_T *Arg) {
+	KerPar_MM_Conv1x1_HWC_SQ8_act(Arg, ACT_RELUN);
+}
 
-        int * __restrict__ Bias = Arg->Bias;
-	int NormBias = Arg->Infos[AT_INF_BIASN];
-        signed char * __restrict__ Out = Arg->Out;
-        unsigned char * __restrict__ Scale = Arg->Scale;
-        unsigned char * __restrict__ ScaleN = Arg->ScaleN;
+void KerPar_MM_Conv1x1_ReLUM_HWC_SQ8(Ker_MM_Conv_SQ8_T *Arg) {
+	KerPar_MM_Conv1x1_HWC_SQ8_act(Arg, ACT_RELUM);
+}
 
-	int Wo = Arg->Wo, Ho = Arg->Ho;
+void KerPar_MM_Conv1x1_ReLUMN_HWC_SQ8(Ker_MM_Conv_SQ8_T *Arg) {
+	KerPar_MM_Conv1x1_HWC_SQ8_act(Arg, ACT_RELUMN);
+}
 
-	unsigned int CoreId = gap_coreid();
-	unsigned int ChunkCell = ChunkSize(OutFeat), First = CoreId*ChunkCell, Last  = Min(OutFeat, First+ChunkCell);
-	int IterOut = Last - First;
+void KerPar_MM_Conv1x1_LeakyReLU_HWC_SQ8(Ker_MM_Conv_SQ8_T *Arg) {
+	KerPar_MM_Conv1x1_HWC_SQ8_act(Arg, ACT_LEAKYRELU);
+}
 
-	int PosL = 0;
-	for (int l=0; l<Ho; l++) {
-		int PosC = 0;
-		for (int c=0; c<Wo/2; c++) {
-			int *pBias = Bias + First;
-			signed char *pOut0 = Out+l*Wo*OutFeat + (2*c+0)*OutFeat+First;
-			signed char *pOut1 = Out+l*Wo*OutFeat + (2*c+1)*OutFeat+First;
-			signed char *pC = Filter + First*InFeat;
-			signed char *pI = (In+PosL*W*InFeat + (PosC+0)*InFeat);
-			unsigned char *pSc = Scale + First;
-			unsigned char *pScN = ScaleN + First;
+void KerPar_MM_Conv1x1_HSwish_HWC_SQ8(Ker_MM_Conv_SQ8_T *Arg) {
+	KerPar_MM_Conv1x1_HWC_SQ8_act(Arg, ACT_HSWISH);
+}
 
-			for (int i=0; i<(IterOut/4); i++) {
-				signed char *pIn0 = pI, *pIn1 = pIn0 + Sx*InFeat,
-					    *pC0 = pC, *pC1 = pC0+InFeat, *pC2 = pC1+InFeat, *pC3 = pC2+InFeat;
-				pC=pC3+InFeat;
-	                        int S00 = (*pBias)<<NormBias, S01 = S00; pBias++;
-	                        int S10 = (*pBias)<<NormBias, S11 = S10; pBias++;
-	                        int S20 = (*pBias)<<NormBias, S21 = S20; pBias++;
-	                        int S30 = (*pBias)<<NormBias, S31 = S30; pBias++;
-				for (int f=0; f<(InFeat/4); f++) {
-					v4s V0 = *((v4s *)pIn0), V1 = *((v4s *)pIn1), C0 = *((v4s *)pC0), C1 = *((v4s *)pC1), C2 = *((v4s *)pC2), C3 = *((v4s *)pC3);
-	                               	S00 = gap_sumdotp4(V0, C0, S00); S01 = gap_sumdotp4(V1, C0, S01);
-	                               	S10 = gap_sumdotp4(V0, C1, S10); S11 = gap_sumdotp4(V1, C1, S11);
-	                               	S20 = gap_sumdotp4(V0, C2, S20); S21 = gap_sumdotp4(V1, C2, S21);
-	                               	S30 = gap_sumdotp4(V0, C3, S30); S31 = gap_sumdotp4(V1, C3, S31);
-					pIn0+=4; pIn1+=4; pC0+=4; pC1+=4; pC2+=4; pC3+=4;
-				}
-				for (int f=4*(InFeat/4); f<InFeat; f++) {
-					int V0 = *pIn0, V1 = *pIn1, C0 = *pC0, C1 = *pC1, C2 = *pC2, C3 = *pC3;
-					S00 += V0*C0; S01 += V1*C0;
-					S10 += V0*C1; S11 += V1*C1;
-					S20 += V0*C2; S21 += V1*C2;
-					S30 += V0*C3; S31 += V1*C3;
-					pIn0++; pIn1++; pC0++; pC1++; pC2++; pC3++;
-				}
-	                        unsigned int Sc, ScN;
-				Sc = *pSc; ScN = *pScN; pSc++; pScN++;
-				*pOut0 = AT_CLIP_POS_IMM(AT_SCALE(S00, Sc, ScN), 7); pOut0++;
-				*pOut1 = AT_CLIP_POS_IMM(AT_SCALE(S01, Sc, ScN), 7); pOut1++;
-				Sc = *pSc; ScN = *pScN; pSc++; pScN++;
-				*pOut0 = AT_CLIP_POS_IMM(AT_SCALE(S10, Sc, ScN), 7); pOut0++;
-				*pOut1 = AT_CLIP_POS_IMM(AT_SCALE(S11, Sc, ScN), 7); pOut1++;
-				Sc = *pSc; ScN = *pScN; pSc++; pScN++;
-				*pOut0 = AT_CLIP_POS_IMM(AT_SCALE(S20, Sc, ScN), 7); pOut0++;
-				*pOut1 = AT_CLIP_POS_IMM(AT_SCALE(S21, Sc, ScN), 7); pOut1++;
-				Sc = *pSc; ScN = *pScN; pSc++; pScN++;
-				*pOut0 = AT_CLIP_POS_IMM(AT_SCALE(S30, Sc, ScN), 7); pOut0++;
-				*pOut1 = AT_CLIP_POS_IMM(AT_SCALE(S31, Sc, ScN), 7); pOut1++;
-	                }
-			for (int i=4*(IterOut/4); i<IterOut; i++) {
-				signed char *pIn0 = pI, *pIn1 = pIn0 + InFeat;
-	                        int S00 = (*pBias)<<NormBias, S01 = S00; pBias++;
-				for (int f=0; f<(InFeat/4); f++) {
-					v4s V0 = *((v4s *)pIn0), V1 = *((v4s *)pIn1), C0 = *((v4s *)pC);
-	                               	S00 = gap_sumdotp4(V0, C0, S00); S01 = gap_sumdotp4(V1, C0, S01);
-					pIn0+=4; pIn1+=4; pC+=4;
-				}
-				for (int f=4*(InFeat/4); f<InFeat; f++) {
-					int V0 = *pIn0, V1 = *pIn1, C0 = *pC;
-					S00 += V0*C0; S01 += V1*C0;
-					pIn0++; pIn1++; pC++;
-				}
-	                        unsigned int Sc, ScN;
-				Sc = *pSc; ScN = *pScN; pSc++; pScN++;
-				*pOut0 = AT_CLIP_POS_IMM(AT_SCALE(S00, Sc, ScN), 7); pOut0++;
-				*pOut1 = AT_CLIP_POS_IMM(AT_SCALE(S01, Sc, ScN), 7); pOut1++;
-			}
-			PosC += 2*Sx;
-		}
-		if (Wo&0X1) {
-			PosC = (Wo/2)*2*Sx;
-			int *pBias = Bias + First;
-			signed char *pOut0 = Out+l*Wo*OutFeat + (Wo-1)*OutFeat+First;
-			signed char *pC = Filter + First*InFeat;
-			signed char *pI = (In+PosL*W*InFeat + (PosC+0)*InFeat);
-			unsigned char *pSc = Scale + First;
-			unsigned char *pScN = ScaleN + First;
+void KerPar_MM_Conv1x1_HSigmoid_HWC_SQ8(Ker_MM_Conv_SQ8_T *Arg) {
+	KerPar_MM_Conv1x1_HWC_SQ8_act(Arg, ACT_HSIGMOID);
+}
 
-			for (int i=0; i<(IterOut/4); i++) {
-				signed char *pIn0 = pI,
-					    *pC0 = pC, *pC1 = pC0+InFeat, *pC2 = pC1+InFeat, *pC3 = pC2+InFeat;
-				pC=pC3+InFeat;
-	                        int S00 = (*pBias)<<NormBias; pBias++;
-	                        int S10 = (*pBias)<<NormBias; pBias++;
-	                        int S20 = (*pBias)<<NormBias; pBias++;
-	                        int S30 = (*pBias)<<NormBias; pBias++;
-				for (int f=0; f<(InFeat/4); f++) {
-					v4s V0 = *((v4s *)pIn0), C0 = *((v4s *)pC0), C1 = *((v4s *)pC1), C2 = *((v4s *)pC2), C3 = *((v4s *)pC3);
-	                               	S00 = gap_sumdotp4(V0, C0, S00);
-	                               	S10 = gap_sumdotp4(V0, C1, S10);
-	                               	S20 = gap_sumdotp4(V0, C2, S20);
-	                               	S30 = gap_sumdotp4(V0, C3, S30);
-					pIn0+=4; pC0+=4; pC1+=4; pC2+=4; pC3+=4;
-				}
-				for (int f=4*(InFeat/4); f<InFeat; f++) {
-					int V0 = *pIn0, C0 = *pC0, C1 = *pC1, C2 = *pC2, C3 = *pC3;
-					S00 += V0*C0;
-					S10 += V0*C1;
-					S20 += V0*C2;
-					S30 += V0*C3;
-					pIn0++; pC0++; pC1++; pC2++; pC3++;
-				}
-	                        unsigned int Sc, ScN;
-				Sc = *pSc; ScN = *pScN; pSc++; pScN++;
-				*pOut0 = AT_CLIP_POS_IMM(AT_SCALE(S00, Sc, ScN), 7); pOut0++;
-				Sc = *pSc; ScN = *pScN; pSc++; pScN++;
-				*pOut0 = AT_CLIP_POS_IMM(AT_SCALE(S10, Sc, ScN), 7); pOut0++;
-				Sc = *pSc; ScN = *pScN; pSc++; pScN++;
-				*pOut0 = AT_CLIP_POS_IMM(AT_SCALE(S20, Sc, ScN), 7); pOut0++;
-				Sc = *pSc; ScN = *pScN; pSc++; pScN++;
-				*pOut0 = AT_CLIP_POS_IMM(AT_SCALE(S30, Sc, ScN), 7); pOut0++;
-			}
-			for (int i=4*(IterOut/4); i<IterOut; i++) {
-				signed char *pIn0 = pI;
-	                        int S00 = (*pBias)<<NormBias; pBias++;
-				for (int f=0; f<(InFeat/4); f++) {
-					v4s V0 = *((v4s *)pIn0), C0 = *((v4s *)pC);
-	                               	S00 = gap_sumdotp4(V0, C0, S00);
-					pIn0+=4; pC+=4;
-				}
-				for (int f=4*(InFeat/4); f<InFeat; f++) {
-					int V0 = *pIn0, C0 = *pC;
-					S00 += V0*C0;
-					pIn0++; pC++;
-				}
-	                        unsigned int Sc, ScN;
-				Sc = *pSc; ScN = *pScN; pSc++; pScN++;
-				*pOut0 = AT_CLIP_POS_IMM(AT_SCALE(S00, Sc, ScN), 7); pOut0++;
-			}
-		}
-		PosL += Sy;
-	}
-	gap_waitbarrier(0);
+void KerPar_MM_Conv1x1_Sigmoid_HWC_SQ8(Ker_MM_Conv_SQ8_T *Arg) {
+	KerPar_MM_Conv1x1_HWC_SQ8_act(Arg, ACT_SIGMOID);
 }
 
-void Ker_MM_Conv1x1_HWC_SQ8(
-	Ker_MM_Conv_SQ8_T *Arg
+void KerPar_MM_Conv1x1_Tanh_HWC_SQ8(Ker_MM_Conv_SQ8_T *Arg) {
+	KerPar_MM_Conv1x1_HWC_SQ8_act(Arg, ACT_TANH);
+}
+
+
+/*
+ * 1x1 convolution kernels based on MatMul (im2col: ColBuff) with HWC in/out tensor order
+ * Parallelized on the spatial height dimension
+ * Optional fused activation, applied to the scaled 32-bit accumulator through ACT_SWITCH (defined in CNN_BasicKernels_SQ8.h)
+ */
+static inline void __attribute__((always_inline)) Ker_MM_Conv1x1_HWC_SQ8_act(
+	Ker_MM_Conv_SQ8_T *Arg,
+	CNN_ActivationOper_T Activation
 	)
 
 {
@@ -576,6 +462,9 @@ void Ker_MM_Conv1x1_HWC_SQ8(
         unsigned char * __restrict__ ScaleN = Arg->ScaleN;
 
 	int Wo = Arg->Wo, Ho = Arg->Ho;
+	unsigned char * Infos = Arg->Infos;
+	unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN];
+	int A0 = *((unsigned char *) &Infos[AT_INF_A0]); int B0 = *((unsigned char *) &Infos[AT_INF_B0]); int C0 = *((unsigned char *) &Infos[AT_INF_C0]);
 
 	unsigned int CoreId = gap_coreid();
 	unsigned int ChunkCell = ChunkSize(Ho), First = Min(Ho, CoreId*ChunkCell), Last  = Min(Ho, First+ChunkCell);
@@ -618,17 +507,25 @@ void Ker_MM_Conv1x1_HWC_SQ8(
 				}
 	                        unsigned int Sc, ScN;
 				Sc = *pSc; ScN = *pScN; pSc++; pScN++;
-				*pOut0 = gap_clip(AT_SCALE(S00, Sc, ScN), 7); pOut0++;
-				*pOut1 = gap_clip(AT_SCALE(S01, Sc, ScN), 7); pOut1++;
-				Sc = *pSc; ScN = *pScN; pSc++; pScN++;
-				*pOut0 = gap_clip(AT_SCALE(S10, Sc, ScN), 7); pOut0++;
-				*pOut1 = gap_clip(AT_SCALE(S11, Sc, ScN), 7); pOut1++;
-				Sc = *pSc; ScN = *pScN; pSc++; pScN++;
-				*pOut0 = gap_clip(AT_SCALE(S20, Sc, ScN), 7); pOut0++;
-				*pOut1 = gap_clip(AT_SCALE(S21, Sc, ScN), 7); pOut1++;
-				Sc = *pSc; ScN = *pScN; pSc++; pScN++;
-				*pOut0 = gap_clip(AT_SCALE(S30, Sc, ScN), 7); pOut0++;
-				*pOut1 = gap_clip(AT_SCALE(S31, Sc, ScN), 7); pOut1++;
+				S00 = AT_SCALE(S00, Sc, ScN); ACT_SWITCH(S00, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+				S01 = AT_SCALE(S01, Sc, ScN); ACT_SWITCH(S01, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+				*pOut0 = gap_clip(S00, 7); pOut0++;
+				*pOut1 = gap_clip(S01, 7); pOut1++;
+				Sc = *pSc; ScN = *pScN; pSc++; pScN++;
+				S10 = AT_SCALE(S10, Sc, ScN); ACT_SWITCH(S10, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+				S11 = AT_SCALE(S11, Sc, ScN); ACT_SWITCH(S11, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+				*pOut0 = gap_clip(S10, 7); pOut0++;
+				*pOut1 = gap_clip(S11, 7); pOut1++;
+				Sc = *pSc; ScN = *pScN; pSc++; pScN++;
+				S20 = AT_SCALE(S20, Sc, ScN); ACT_SWITCH(S20, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+				S21 = AT_SCALE(S21, Sc, ScN); ACT_SWITCH(S21, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+				*pOut0 = gap_clip(S20, 7); pOut0++;
+				*pOut1 = gap_clip(S21, 7); pOut1++;
+				Sc = *pSc; ScN = *pScN; pSc++; pScN++;
+				S30 = AT_SCALE(S30, Sc, ScN); ACT_SWITCH(S30, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+				S31 = AT_SCALE(S31, Sc, ScN); ACT_SWITCH(S31, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+				*pOut0 = gap_clip(S30, 7); pOut0++;
+				*pOut1 = gap_clip(S31, 7); pOut1++;
 	                }
 			for (int i=4*(OutFeat/4); i<OutFeat; i++) {
 				signed char *pIn0 = pI, *pIn1 = pIn0 + InFeat;
@@ -645,8 +542,10 @@ void Ker_MM_Conv1x1_HWC_SQ8(
 				}
 	                        unsigned int Sc, ScN;
 				Sc = *pSc; ScN = *pScN; pSc++; pScN++;
-				*pOut0 = gap_clip(AT_SCALE(S00, Sc, ScN), 7); pOut0++;
-				*pOut1 = gap_clip(AT_SCALE(S01, Sc, ScN), 7); pOut1++;
+				S00 = AT_SCALE(S00, Sc, ScN); ACT_SWITCH(S00, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+				S01 = AT_SCALE(S01, Sc, ScN); ACT_SWITCH(S01, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+				*pOut0 = gap_clip(S00, 7); pOut0++;
+				*pOut1 = gap_clip(S01, 7); pOut1++;
 			}
 			PosC += 2*Sx;
 		}
@@ -685,13 +584,17 @@ void Ker_MM_Conv1x1_HWC_SQ8(
 				}
 	                        unsigned int Sc, ScN;
 				Sc = *pSc; ScN = *pScN; pSc++; pScN++;
-				*pOut0 = gap_clip(AT_SCALE(S00, Sc, ScN), 7); pOut0++;
+				S00 = AT_SCALE(S00, Sc, ScN); ACT_SWITCH(S00, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+				*pOut0 = gap_clip(S00, 7); pOut0++;
 				Sc = *pSc; ScN = *pScN; pSc++; pScN++;
-				*pOut0 = gap_clip(AT_SCALE(S10, Sc, ScN), 7); pOut0++;
+				S10 = AT_SCALE(S10, Sc, ScN); ACT_SWITCH(S10, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+				*pOut0 = gap_clip(S10, 7); pOut0++;
 				Sc = *pSc; ScN = *pScN; pSc++; pScN++;
-				*pOut0 = gap_clip(AT_SCALE(S20, Sc, ScN), 7); pOut0++;
+				S20 = AT_SCALE(S20, Sc, ScN); ACT_SWITCH(S20, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+				*pOut0 = gap_clip(S20, 7); pOut0++;
 				Sc = *pSc; ScN = *pScN; pSc++; pScN++;
-				*pOut0 = gap_clip(AT_SCALE(S30, Sc, ScN), 7); pOut0++;
+				S30 = AT_SCALE(S30, Sc, ScN); ACT_SWITCH(S30, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+				*pOut0 = gap_clip(S30, 7); pOut0++;
 			}
 			for (int i=4*(OutFeat/4); i<OutFeat; i++) {
 				signed char *pIn0 = pI;
@@ -708,15 +611,61 @@ void Ker_MM_Conv1x1_HWC_SQ8(
 				}
 	                        unsigned int Sc, ScN;
 				Sc = *pSc; ScN = *pScN; pSc++; pScN++;
-				*pOut0 = gap_clip(AT_SCALE(S00, Sc, ScN), 7); pOut0++;
+				S00 = AT_SCALE(S00, Sc, ScN); ACT_SWITCH(S00, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+				*pOut0 = gap_clip(S00, 7); pOut0++;
 			}
 		}
 	}
 	gap_waitbarrier(0);
 }
 
-void Ker_MM_Conv1x1_ReLU_HWC_SQ8(
-	Ker_MM_Conv_SQ8_T *Arg
+void Ker_MM_Conv1x1_HWC_SQ8(Ker_MM_Conv_SQ8_T *Arg) {
+	Ker_MM_Conv1x1_HWC_SQ8_act(Arg, ACT_NONE);
+}
+
+void Ker_MM_Conv1x1_ReLU_HWC_SQ8(Ker_MM_Conv_SQ8_T *Arg) {
+	Ker_MM_Conv1x1_HWC_SQ8_act(Arg, ACT_RELU);
+}
+
+void Ker_MM_Conv1x1_ReLUN_HWC_SQ8(Ker_MM_Conv_SQ8_T *Arg) {
+	Ker_MM_Conv1x1_HWC_SQ8_act(Arg, ACT_RELUN);
+}
+
+void Ker_MM_Conv1x1_ReLUM_HWC_SQ8(Ker_MM_Conv_SQ8_T *Arg) {
+	Ker_MM_Conv1x1_HWC_SQ8_act(Arg, ACT_RELUM);
+}
+
+void Ker_MM_Conv1x1_ReLUMN_HWC_SQ8(Ker_MM_Conv_SQ8_T *Arg) {
+	Ker_MM_Conv1x1_HWC_SQ8_act(Arg, ACT_RELUMN);
+}
+
+void Ker_MM_Conv1x1_LeakyReLU_HWC_SQ8(Ker_MM_Conv_SQ8_T *Arg) {
+	Ker_MM_Conv1x1_HWC_SQ8_act(Arg, ACT_LEAKYRELU);
+}
+
+void Ker_MM_Conv1x1_HSwish_HWC_SQ8(Ker_MM_Conv_SQ8_T *Arg) {
+	Ker_MM_Conv1x1_HWC_SQ8_act(Arg, ACT_HSWISH);
+}
+
+void Ker_MM_Conv1x1_HSigmoid_HWC_SQ8(Ker_MM_Conv_SQ8_T *Arg) {
+	Ker_MM_Conv1x1_HWC_SQ8_act(Arg, ACT_HSIGMOID);
+}
+
+void Ker_MM_Conv1x1_Sigmoid_HWC_SQ8(Ker_MM_Conv_SQ8_T *Arg) {
+	Ker_MM_Conv1x1_HWC_SQ8_act(Arg, ACT_SIGMOID);
+}
+
+void Ker_MM_Conv1x1_Tanh_HWC_SQ8(Ker_MM_Conv_SQ8_T *Arg) {
+	Ker_MM_Conv1x1_HWC_SQ8_act(Arg, ACT_TANH);
+}
+
+/*
+ * 1D convolution kernels based on MatMul (im2col: ColBuff) with HWC in/out tensor order
+ * Optional fused activation, applied to the scaled 32-bit accumulator through ACT_SWITCH (defined in CNN_BasicKernels_SQ8.h)
+ */
+static inline void __attribute__((always_inline)) KerPar_MM_Conv1D_HWC_SQ8_act(
+	Ker_MM_Conv_SQ8_T *Arg,
+	CNN_ActivationOper_T Activation
 	)
 
 {
@@ -726,184 +675,27 @@ void Ker_MM_Conv1x1_ReLU_HWC_SQ8(
 	signed char *__restrict__ In = Arg->In;
 	int W = Arg->W, H = Arg->H;
 	signed char *__restrict__ Filter = Arg->Filter;
-	int Sx = Arg->Sx, Sy = Arg->Sy;
-	unsigned int InFeat = Arg->InFeat, OutFeat = Arg->OutFeat;
+	int Fx = Arg->Fx, Sx = Arg->Sx, Sy = Arg->Sy;
+	int PadL = Arg->Pad[0];
+	int InFeat = Arg->InFeat, OutFeat = Arg->OutFeat;
 
         int * __restrict__ Bias = Arg->Bias;
 	int NormBias = Arg->Infos[AT_INF_BIASN];
         signed char * __restrict__ Out = Arg->Out;
         unsigned char * __restrict__ Scale = Arg->Scale;
         unsigned char * __restrict__ ScaleN = Arg->ScaleN;
+        signed char * __restrict__ ColBuff = Arg->ColBuff;
 
 	int Wo = Arg->Wo, Ho = Arg->Ho;
+	unsigned char * Infos = Arg->Infos;
+	unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN];
+	int A0 = *((unsigned char *) &Infos[AT_INF_A0]); int B0 = *((unsigned char *) &Infos[AT_INF_B0]); int C0 = *((unsigned char *) &Infos[AT_INF_C0]);
 
-	unsigned int CoreId = gap_coreid();
-	unsigned int ChunkCell = ChunkSize(Ho), First = Min(Ho, CoreId*ChunkCell), Last  = Min(Ho, First+ChunkCell);
-	int IterOut = Last - First;
-
-	for (int l=First; l<Last; l++) {
-		int PosC = 0;
-		for (int c=0; c<Wo/2; c++) {
-			int *pBias = Bias;
-			signed char *pOut0 = Out+l*Wo*OutFeat + (2*c+0)*OutFeat;
-			signed char *pOut1 = Out+l*Wo*OutFeat + (2*c+1)*OutFeat;
-			signed char *pC = Filter;
-			signed char *pI = (In+l*Sy*W*InFeat + (PosC+0)*InFeat);
-			unsigned char *pSc = Scale;
-			unsigned char *pScN = ScaleN;
-
-			for (int i=0; i<(OutFeat/4); i++) {
-				signed char *pIn0 = pI, *pIn1 = pIn0 + Sx*InFeat,
-					    *pC0 = pC, *pC1 = pC0+InFeat, *pC2 = pC1+InFeat, *pC3 = pC2+InFeat;
-				pC=pC3+InFeat;
-	                        int S00 = (*pBias)<<NormBias, S01 = S00; pBias++;
-	                        int S10 = (*pBias)<<NormBias, S11 = S10; pBias++;
-	                        int S20 = (*pBias)<<NormBias, S21 = S20; pBias++;
-	                        int S30 = (*pBias)<<NormBias, S31 = S30; pBias++;
-				for (int f=0; f<(InFeat/4); f++) {
-					v4s V0 = *((v4s *)pIn0), V1 = *((v4s *)pIn1), C0 = *((v4s *)pC0), C1 = *((v4s *)pC1), C2 = *((v4s *)pC2), C3 = *((v4s *)pC3);
-	                               	S00 = gap_sumdotp4(V0, C0, S00); S01 = gap_sumdotp4(V1, C0, S01);
-	                               	S10 = gap_sumdotp4(V0, C1, S10); S11 = gap_sumdotp4(V1, C1, S11);
-	                               	S20 = gap_sumdotp4(V0, C2, S20); S21 = gap_sumdotp4(V1, C2, S21);
-	                               	S30 = gap_sumdotp4(V0, C3, S30); S31 = gap_sumdotp4(V1, C3, S31);
-					pIn0+=4; pIn1+=4; pC0+=4; pC1+=4; pC2+=4; pC3+=4;
-				}
-				for (int f=4*(InFeat/4); f<InFeat; f++) {
-					int V0 = *pIn0, V1 = *pIn1, C0 = *pC0, C1 = *pC1, C2 = *pC2, C3 = *pC3;
-					S00 += V0*C0; S01 += V1*C0;
-					S10 += V0*C1; S11 += V1*C1;
-					S20 += V0*C2; S21 += V1*C2;
-					S30 += V0*C3; S31 += V1*C3;
-					pIn0++; pIn1++; pC0++; pC1++; pC2++; pC3++;
-				}
-	                        unsigned int Sc, ScN;
-				Sc = *pSc; ScN = *pScN; pSc++; pScN++;
-				*pOut0 = AT_CLIP_POS_IMM(AT_SCALE(S00, Sc, ScN), 7); pOut0++;
-				*pOut1 = AT_CLIP_POS_IMM(AT_SCALE(S01, Sc, ScN), 7); pOut1++;
-				Sc = *pSc; ScN = *pScN; pSc++; pScN++;
-				*pOut0 = AT_CLIP_POS_IMM(AT_SCALE(S10, Sc, ScN), 7); pOut0++;
-				*pOut1 = AT_CLIP_POS_IMM(AT_SCALE(S11, Sc, ScN), 7); pOut1++;
-				Sc = *pSc; ScN = *pScN; pSc++; pScN++;
-				*pOut0 = AT_CLIP_POS_IMM(AT_SCALE(S20, Sc, ScN), 7); pOut0++;
-				*pOut1 = AT_CLIP_POS_IMM(AT_SCALE(S21, Sc, ScN), 7); pOut1++;
-				Sc = *pSc; ScN = *pScN; pSc++; pScN++;
-				*pOut0 = AT_CLIP_POS_IMM(AT_SCALE(S30, Sc, ScN), 7); pOut0++;
-				*pOut1 = AT_CLIP_POS_IMM(AT_SCALE(S31, Sc, ScN), 7); pOut1++;
-	                }
-			for (int i=4*(OutFeat/4); i<OutFeat; i++) {
-				signed char *pIn0 = pI, *pIn1 = pIn0 + InFeat;
-	                        int S00 = (*pBias)<<NormBias, S01 = S00; pBias++;
-				for (int f=0; f<(InFeat/4); f++) {
-					v4s V0 = *((v4s *)pIn0), V1 = *((v4s *)pIn1), C0 = *((v4s *)pC);
-	                               	S00 = gap_sumdotp4(V0, C0, S00); S01 = gap_sumdotp4(V1, C0, S01);
-					pIn0+=4; pIn1+=4; pC+=4;
-				}
-				for (int f=4*(InFeat/4); f<InFeat; f++) {
-					int V0 = *pIn0, V1 = *pIn1, C0 = *pC;
-					S00 += V0*C0; S01 += V1*C0;
-					pIn0++; pIn1++; pC++;
-				}
-	                        unsigned int Sc, ScN;
-				Sc = *pSc; ScN = *pScN; pSc++; pScN++;
-				*pOut0 = AT_CLIP_POS_IMM(AT_SCALE(S00, Sc, ScN), 7); pOut0++;
-				*pOut1 = AT_CLIP_POS_IMM(AT_SCALE(S01, Sc, ScN), 7); pOut1++;
-			}
-			PosC += 2*Sx;
-		}
-		if (Wo&0X1) {
-			PosC = (Wo/2)*2*Sx;
-			int *pBias = Bias;
-			signed char *pOut0 = Out+l*Wo*OutFeat + (Wo-1)*OutFeat;
-			signed char *pC = Filter;
-			signed char *pI = (In+Sy*l*W*InFeat + (PosC+0)*InFeat);
-			unsigned char *pSc = Scale;
-			unsigned char *pScN = ScaleN;
-
-			for (int i=0; i<(OutFeat/4); i++) {
-				signed char *pIn0 = pI,
-					    *pC0 = pC, *pC1 = pC0+InFeat, *pC2 = pC1+InFeat, *pC3 = pC2+InFeat;
-				pC=pC3+InFeat;
-	                        int S00 = (*pBias)<<NormBias; pBias++;
-	                        int S10 = (*pBias)<<NormBias; pBias++;
-	                        int S20 = (*pBias)<<NormBias; pBias++;
-	                        int S30 = (*pBias)<<NormBias; pBias++;
-				for (int f=0; f<(InFeat/4); f++) {
-					v4s V0 = *((v4s *)pIn0), C0 = *((v4s *)pC0), C1 = *((v4s *)pC1), C2 = *((v4s *)pC2), C3 = *((v4s *)pC3);
-	                               	S00 = gap_sumdotp4(V0, C0, S00);
-	                               	S10 = gap_sumdotp4(V0, C1, S10);
-	                               	S20 = gap_sumdotp4(V0, C2, S20);
-	                               	S30 = gap_sumdotp4(V0, C3, S30);
-					pIn0+=4; pC0+=4; pC1+=4; pC2+=4; pC3+=4;
-				}
-				for (int f=4*(InFeat/4); f<InFeat; f++) {
-					int V0 = *pIn0, C0 = *pC0, C1 = *pC1, C2 = *pC2, C3 = *pC3;
-					S00 += V0*C0;
-					S10 += V0*C1;
-					S20 += V0*C2;
-					S30 += V0*C3;
-					pIn0++; pC0++; pC1++; pC2++; pC3++;
-				}
-	                        unsigned int Sc, ScN;
-				Sc = *pSc; ScN = *pScN; pSc++; pScN++;
-				*pOut0 = AT_CLIP_POS_IMM(AT_SCALE(S00, Sc, ScN), 7); pOut0++;
-				Sc = *pSc; ScN = *pScN; pSc++; pScN++;
-				*pOut0 = AT_CLIP_POS_IMM(AT_SCALE(S10, Sc, ScN), 7); pOut0++;
-				Sc = *pSc; ScN = *pScN; pSc++; pScN++;
-				*pOut0 = AT_CLIP_POS_IMM(AT_SCALE(S20, Sc, ScN), 7); pOut0++;
-				Sc = *pSc; ScN = *pScN; pSc++; pScN++;
-				*pOut0 = AT_CLIP_POS_IMM(AT_SCALE(S30, Sc, ScN), 7); pOut0++;
-			}
-			for (int i=4*(OutFeat/4); i<OutFeat; i++) {
-				signed char *pIn0 = pI;
-	                        int S00 = (*pBias)<<NormBias; pBias++;
-				for (int f=0; f<(InFeat/4); f++) {
-					v4s V0 = *((v4s *)pIn0), C0 = *((v4s *)pC);
-	                               	S00 = gap_sumdotp4(V0, C0, S00);
-					pIn0+=4; pC+=4;
-				}
-				for (int f=4*(InFeat/4); f<InFeat; f++) {
-					int V0 = *pIn0, C0 = *pC;
-					S00 += V0*C0;
-					pIn0++; pC++;
-				}
-	                        unsigned int Sc, ScN;
-				Sc = *pSc; ScN = *pScN; pSc++; pScN++;
-				*pOut0 = AT_CLIP_POS_IMM(AT_SCALE(S00, Sc, ScN), 7); pOut0++;
-			}
-		}
-	}
-	gap_waitbarrier(0);
-}
-
-void KerPar_MM_Conv1D_HWC_SQ8(
-	Ker_MM_Conv_SQ8_T *Arg
-	)
-
-{
-	/*
-		For HWC weights (4D Tensor) are expected to be organized as [OutFeat x Fy x Fx x InFeat]
-	*/
-	signed char *__restrict__ In = Arg->In;
-	int W = Arg->W, H = Arg->H;
-	signed char *__restrict__ Filter = Arg->Filter;
-	int Fx = Arg->Fx, Sx = Arg->Sx, Sy = Arg->Sy;
-	int PadL = Arg->Pad[0];
-	int InFeat = Arg->InFeat, OutFeat = Arg->OutFeat;
-
-        int * __restrict__ Bias = Arg->Bias;
-	int NormBias = Arg->Infos[AT_INF_BIASN];
-        signed char * __restrict__ Out = Arg->Out;
-        unsigned char * __restrict__ Scale = Arg->Scale;
-        unsigned char * __restrict__ ScaleN = Arg->ScaleN;
-        signed char * __restrict__ ColBuff = Arg->ColBuff;
-
-	int Wo = Arg->Wo, Ho = Arg->Ho;
-
-	/* ColBuff must be large enough to accomodate Align(Fx*InFeat, 8) elements */
-	v4s * __restrict__ VBuff = (v4s *) ColBuff;
-	unsigned int W_In1 = InFeat*Fx;
-	unsigned int CoreId = gap_coreid(), C = ChunkSize(InFeat), F = Min(CoreId*C, InFeat), L = Min(InFeat, F+C);
-	unsigned int ChunkCell = ChunkSize(OutFeat), First = CoreId*ChunkCell, Last  = Min(OutFeat, First+ChunkCell);
+	/* ColBuff must be large enough to accommodate Align(Fx*InFeat, 8) elements */
+	v4s * __restrict__ VBuff = (v4s *) ColBuff;
+	unsigned int W_In1 = InFeat*Fx;
+	unsigned int CoreId = gap_coreid(), C = ChunkSize(InFeat), F = Min(CoreId*C, InFeat), L = Min(InFeat, F+C);
+	unsigned int ChunkCell = ChunkSize(OutFeat), First = CoreId*ChunkCell, Last  = Min(OutFeat, First+ChunkCell);
 
 	int Tail = 2*((W_In1+7)/8);
 	signed char * __restrict__ ColBuff1 = ColBuff + 4*Tail;
@@ -992,10 +784,16 @@ void KerPar_MM_Conv1D_HWC_SQ8(
 					S3 += V0*C3; S7 += V1*C3;
 					pIn++; pIn1++; pC0++; pC1++; pC2++; pC3++;
 				}
-				v4s R1 = gap_pack4(gap_clip(AT_SCALE(S0, pSc[4*Line  ], pScN[4*Line  ]), 7), gap_clip(AT_SCALE(S1, pSc[4*Line+1], pScN[4*Line+1]), 7),
-						   gap_clip(AT_SCALE(S2, pSc[4*Line+2], pScN[4*Line+2]), 7), gap_clip(AT_SCALE(S3, pSc[4*Line+3], pScN[4*Line+3]), 7));
-				v4s R2 = gap_pack4(gap_clip(AT_SCALE(S4, pSc[4*Line  ], pScN[4*Line  ]), 7), gap_clip(AT_SCALE(S5, pSc[4*Line+1], pScN[4*Line+1]), 7),
-						   gap_clip(AT_SCALE(S6, pSc[4*Line+2], pScN[4*Line+2]), 7), gap_clip(AT_SCALE(S7, pSc[4*Line+3], pScN[4*Line+3]), 7));
+				S0 = AT_SCALE(S0, pSc[4*Line  ], pScN[4*Line  ]); ACT_SWITCH(S0, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+				S1 = AT_SCALE(S1, pSc[4*Line+1], pScN[4*Line+1]); ACT_SWITCH(S1, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+				S2 = AT_SCALE(S2, pSc[4*Line+2], pScN[4*Line+2]); ACT_SWITCH(S2, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+				S3 = AT_SCALE(S3, pSc[4*Line+3], pScN[4*Line+3]); ACT_SWITCH(S3, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+				S4 = AT_SCALE(S4, pSc[4*Line  ], pScN[4*Line  ]); ACT_SWITCH(S4, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+				S5 = AT_SCALE(S5, pSc[4*Line+1], pScN[4*Line+1]); ACT_SWITCH(S5, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+				S6 = AT_SCALE(S6, pSc[4*Line+2], pScN[4*Line+2]); ACT_SWITCH(S6, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+				S7 = AT_SCALE(S7, pSc[4*Line+3], pScN[4*Line+3]); ACT_SWITCH(S7, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+				v4s R1 = gap_pack4(gap_clip(S0, 7), gap_clip(S1, 7), gap_clip(S2, 7), gap_clip(S3, 7));
+				v4s R2 = gap_pack4(gap_clip(S4, 7), gap_clip(S5, 7), gap_clip(S6, 7), gap_clip(S7, 7));
 				*((v4s *) (pOut0+4*Line)) = R1;
 				*((v4s *) (pOut1+4*Line)) = R2;
 	                }
@@ -1013,8 +811,10 @@ void KerPar_MM_Conv1D_HWC_SQ8(
 					S0 += V0*C0; S4 += V1*C0;
 					pIn++; pIn1++; pC++;
 				}
-				*(pOut0+i) = gap_clip(AT_SCALE(S0, pSc[i], pScN[i]), 7);
-				*(pOut1+i) = gap_clip(AT_SCALE(S4, pSc[i], pScN[i]), 7);
+				S0 = AT_SCALE(S0, pSc[i], pScN[i]); ACT_SWITCH(S0, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+				S4 = AT_SCALE(S4, pSc[i], pScN[i]); ACT_SWITCH(S4, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+				*(pOut0+i) = gap_clip(S0, 7);
+				*(pOut1+i) = gap_clip(S4, 7);
 			}
 			gap_waitbarrier(0);
 		}
@@ -1068,8 +868,11 @@ void KerPar_MM_Conv1D_HWC_SQ8(
 					S3 += V0*C3;
 					pIn++; pC0++; pC1++; pC2++; pC3++;
 				}
-				v4s R1 = gap_pack4(gap_clip(AT_SCALE(S0, pSc[4*Line  ], pScN[4*Line  ]), 7), gap_clip(AT_SCALE(S1, pSc[4*Line+1], pScN[4*Line+1]), 7),
-						   gap_clip(AT_SCALE(S2, pSc[4*Line+2], pScN[4*Line+2]), 7), gap_clip(AT_SCALE(S3, pSc[4*Line+3], pScN[4*Line+3]), 7));
+				S0 = AT_SCALE(S0, pSc[4*Line  ], pScN[4*Line  ]); ACT_SWITCH(S0, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+				S1 = AT_SCALE(S1, pSc[4*Line+1], pScN[4*Line+1]); ACT_SWITCH(S1, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+				S2 = AT_SCALE(S2, pSc[4*Line+2], pScN[4*Line+2]); ACT_SWITCH(S2, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+				S3 = AT_SCALE(S3, pSc[4*Line+3], pScN[4*Line+3]); ACT_SWITCH(S3, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+				v4s R1 = gap_pack4(gap_clip(S0, 7), gap_clip(S1, 7), gap_clip(S2, 7), gap_clip(S3, 7));
 				*((v4s *) (pOut0+4*Line)) = R1;
 	                }
 			for (int i=4*(IterOut/4); i<IterOut; i++) {
@@ -1085,7 +888,8 @@ void KerPar_MM_Conv1D_HWC_SQ8(
 					S0 += V0*C0;
 					pIn++; pC++;
 				}
-				*(pOut0+i) = gap_clip(AT_SCALE(S0, pSc[i], pScN[i]), 7);
+				S0 = AT_SCALE(S0, pSc[i], pScN[i]); ACT_SWITCH(S0, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+				*(pOut0+i) = gap_clip(S0, 7);
 			}
 			gap_waitbarrier(0);
 		}
@@ -1093,8 +897,54 @@ void KerPar_MM_Conv1D_HWC_SQ8(
 	// }
 }
 
-void KerPar_MM_Conv1D_DxDy_SQ8(
-	Ker_MM_Conv_SQ8_T *Arg
+void KerPar_MM_Conv1D_HWC_SQ8(Ker_MM_Conv_SQ8_T *Arg) { /* Exported entry point: runs KerPar_MM_Conv1D_HWC_SQ8_act with Activation = ACT_NONE */
+	KerPar_MM_Conv1D_HWC_SQ8_act(Arg, ACT_NONE);
+}
+
+void KerPar_MM_Conv1D_ReLU_HWC_SQ8(Ker_MM_Conv_SQ8_T *Arg) { /* Exported entry point: runs KerPar_MM_Conv1D_HWC_SQ8_act with Activation = ACT_RELU */
+	KerPar_MM_Conv1D_HWC_SQ8_act(Arg, ACT_RELU);
+}
+
+void KerPar_MM_Conv1D_ReLUN_HWC_SQ8(Ker_MM_Conv_SQ8_T *Arg) { /* Exported entry point: runs KerPar_MM_Conv1D_HWC_SQ8_act with Activation = ACT_RELUN */
+	KerPar_MM_Conv1D_HWC_SQ8_act(Arg, ACT_RELUN);
+}
+
+void KerPar_MM_Conv1D_ReLUM_HWC_SQ8(Ker_MM_Conv_SQ8_T *Arg) { /* Exported entry point: runs KerPar_MM_Conv1D_HWC_SQ8_act with Activation = ACT_RELUM */
+	KerPar_MM_Conv1D_HWC_SQ8_act(Arg, ACT_RELUM);
+}
+
+void KerPar_MM_Conv1D_ReLUMN_HWC_SQ8(Ker_MM_Conv_SQ8_T *Arg) { /* Exported entry point: runs KerPar_MM_Conv1D_HWC_SQ8_act with Activation = ACT_RELUMN */
+	KerPar_MM_Conv1D_HWC_SQ8_act(Arg, ACT_RELUMN);
+}
+
+void KerPar_MM_Conv1D_LeakyReLU_HWC_SQ8(Ker_MM_Conv_SQ8_T *Arg) { /* Exported entry point: runs KerPar_MM_Conv1D_HWC_SQ8_act with Activation = ACT_LEAKYRELU */
+	KerPar_MM_Conv1D_HWC_SQ8_act(Arg, ACT_LEAKYRELU);
+}
+
+void KerPar_MM_Conv1D_HSwish_HWC_SQ8(Ker_MM_Conv_SQ8_T *Arg) { /* Exported entry point: runs KerPar_MM_Conv1D_HWC_SQ8_act with Activation = ACT_HSWISH */
+	KerPar_MM_Conv1D_HWC_SQ8_act(Arg, ACT_HSWISH);
+}
+
+void KerPar_MM_Conv1D_HSigmoid_HWC_SQ8(Ker_MM_Conv_SQ8_T *Arg) { /* Exported entry point: runs KerPar_MM_Conv1D_HWC_SQ8_act with Activation = ACT_HSIGMOID */
+	KerPar_MM_Conv1D_HWC_SQ8_act(Arg, ACT_HSIGMOID);
+}
+
+void KerPar_MM_Conv1D_Sigmoid_HWC_SQ8(Ker_MM_Conv_SQ8_T *Arg) { /* Exported entry point: runs KerPar_MM_Conv1D_HWC_SQ8_act with Activation = ACT_SIGMOID */
+	KerPar_MM_Conv1D_HWC_SQ8_act(Arg, ACT_SIGMOID);
+}
+
+void KerPar_MM_Conv1D_Tanh_HWC_SQ8(Ker_MM_Conv_SQ8_T *Arg) { /* Exported entry point: runs KerPar_MM_Conv1D_HWC_SQ8_act with Activation = ACT_TANH */
+	KerPar_MM_Conv1D_HWC_SQ8_act(Arg, ACT_TANH);
+}
+
+
+/*
+ * 1D convolution kernels with dilation, based on MatMul (im2col: ColBuff), with CHW in/out tensor order.
+ * An optional fused activation is applied to the 32-bit accumulator through ACT_SWITCH (defined in CNN_BasicKernels_SQ8.h).
+ */
+static inline void __attribute__((always_inline)) KerPar_MM_Conv1D_DxDy_SQ8_act(
+	Ker_MM_Conv_SQ8_T *Arg,
+	CNN_ActivationOper_T Activation
 	)
 
 {
@@ -1113,6 +963,9 @@ void KerPar_MM_Conv1D_DxDy_SQ8(
         signed char * __restrict__ ColBuff = Arg->ColBuff;
 
 	int Wo = Arg->Wo, Ho = Arg->Ho;
+	unsigned char * Infos = Arg->Infos;
+	unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN];
+	int A0 = *((unsigned char *) &Infos[AT_INF_A0]); int B0 = *((unsigned char *) &Infos[AT_INF_B0]); int C0 = *((unsigned char *) &Infos[AT_INF_C0]);
 
 	/* ColBuff must be large enough to accomodate Align(Fx*InFeat, 8) elements */
 	v4s * __restrict__ VBuff = (v4s *) ColBuff;
@@ -1157,7 +1010,8 @@ void KerPar_MM_Conv1D_DxDy_SQ8(
 	                                S0 = gap_sumdotp4(V1, C1, S0);
 	                        }
 	                        unsigned int Sc = Scale[Line], ScN = ScaleN[Line];
-	                        Out[Line*Wo*Ho + l*Wo + c] = gap_clip(AT_SCALE(S0, Sc, ScN), 7);
+	                        S0 = AT_SCALE(S0, Sc, ScN); ACT_SWITCH(S0, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+	                        Out[Line*Wo*Ho + l*Wo + c] = gap_clip(S0, 7);
 	                }
 			gap_waitbarrier(0);
 		}
@@ -1165,8 +1019,54 @@ void KerPar_MM_Conv1D_DxDy_SQ8(
 	}
 }
 
-void KerPar_MM_Conv1D_DxDy_HWC_SQ8(
-	Ker_MM_Conv_SQ8_T *Arg
+void KerPar_MM_Conv1D_DxDy_SQ8(Ker_MM_Conv_SQ8_T *Arg) { /* Exported entry point: passes the constant ACT_NONE as Activation to the always_inline KerPar_MM_Conv1D_DxDy_SQ8_act body */
+	KerPar_MM_Conv1D_DxDy_SQ8_act(Arg, ACT_NONE);
+}
+
+void KerPar_MM_Conv1D_DxDy_ReLU_SQ8(Ker_MM_Conv_SQ8_T *Arg) { /* Exported entry point: passes the constant ACT_RELU as Activation to the always_inline KerPar_MM_Conv1D_DxDy_SQ8_act body */
+	KerPar_MM_Conv1D_DxDy_SQ8_act(Arg, ACT_RELU);
+}
+
+void KerPar_MM_Conv1D_DxDy_ReLUN_SQ8(Ker_MM_Conv_SQ8_T *Arg) { /* Exported entry point: passes the constant ACT_RELUN as Activation to the always_inline KerPar_MM_Conv1D_DxDy_SQ8_act body */
+	KerPar_MM_Conv1D_DxDy_SQ8_act(Arg, ACT_RELUN);
+}
+
+void KerPar_MM_Conv1D_DxDy_ReLUM_SQ8(Ker_MM_Conv_SQ8_T *Arg) { /* Exported entry point: passes the constant ACT_RELUM as Activation to the always_inline KerPar_MM_Conv1D_DxDy_SQ8_act body */
+	KerPar_MM_Conv1D_DxDy_SQ8_act(Arg, ACT_RELUM);
+}
+
+void KerPar_MM_Conv1D_DxDy_ReLUMN_SQ8(Ker_MM_Conv_SQ8_T *Arg) { /* Exported entry point: passes the constant ACT_RELUMN as Activation to the always_inline KerPar_MM_Conv1D_DxDy_SQ8_act body */
+	KerPar_MM_Conv1D_DxDy_SQ8_act(Arg, ACT_RELUMN);
+}
+
+void KerPar_MM_Conv1D_DxDy_LeakyReLU_SQ8(Ker_MM_Conv_SQ8_T *Arg) { /* Exported entry point: passes the constant ACT_LEAKYRELU as Activation to the always_inline KerPar_MM_Conv1D_DxDy_SQ8_act body */
+	KerPar_MM_Conv1D_DxDy_SQ8_act(Arg, ACT_LEAKYRELU);
+}
+
+void KerPar_MM_Conv1D_DxDy_HSwish_SQ8(Ker_MM_Conv_SQ8_T *Arg) { /* Exported entry point: passes the constant ACT_HSWISH as Activation to the always_inline KerPar_MM_Conv1D_DxDy_SQ8_act body */
+	KerPar_MM_Conv1D_DxDy_SQ8_act(Arg, ACT_HSWISH);
+}
+
+void KerPar_MM_Conv1D_DxDy_HSigmoid_SQ8(Ker_MM_Conv_SQ8_T *Arg) { /* Exported entry point: passes the constant ACT_HSIGMOID as Activation to the always_inline KerPar_MM_Conv1D_DxDy_SQ8_act body */
+	KerPar_MM_Conv1D_DxDy_SQ8_act(Arg, ACT_HSIGMOID);
+}
+
+void KerPar_MM_Conv1D_DxDy_Sigmoid_SQ8(Ker_MM_Conv_SQ8_T *Arg) { /* Exported entry point: passes the constant ACT_SIGMOID as Activation to the always_inline KerPar_MM_Conv1D_DxDy_SQ8_act body */
+	KerPar_MM_Conv1D_DxDy_SQ8_act(Arg, ACT_SIGMOID);
+}
+
+void KerPar_MM_Conv1D_DxDy_Tanh_SQ8(Ker_MM_Conv_SQ8_T *Arg) { /* Exported entry point: passes the constant ACT_TANH as Activation to the always_inline KerPar_MM_Conv1D_DxDy_SQ8_act body */
+	KerPar_MM_Conv1D_DxDy_SQ8_act(Arg, ACT_TANH);
+}
+
+
+/*
+ * 1D convolution kernels with dilation, based on MatMul (im2col: ColBuff), with HWC in/out tensor order.
+ * An optional fused activation is applied to the 32-bit accumulator through ACT_SWITCH (defined in CNN_BasicKernels_SQ8.h).
+ */
+static inline void __attribute__((always_inline)) KerPar_MM_Conv1D_DxDy_HWC_SQ8_act(
+	Ker_MM_Conv_SQ8_T *Arg,
+	CNN_ActivationOper_T Activation
 	)
 
 {
@@ -1189,6 +1089,10 @@ void KerPar_MM_Conv1D_DxDy_HWC_SQ8(
 
 	int Wo = Arg->Wo, Ho = Arg->Ho;
 
+	unsigned char * Infos = Arg->Infos;
+	unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN];
+	int A0 = *((unsigned char *) &Infos[AT_INF_A0]); int B0 = *((unsigned char *) &Infos[AT_INF_B0]); int C0 = *((unsigned char *) &Infos[AT_INF_C0]);
+
 	/* ColBuff must be large enough to accomodate Align(Fx*InFeat, 8) elements */
 	v4s * __restrict__ VBuff = (v4s *) ColBuff;
 	unsigned int W_In1 = InFeat*Fx;
@@ -1306,10 +1210,16 @@ void KerPar_MM_Conv1D_DxDy_HWC_SQ8(
 				S3 += V0*C3; S7 += V1*C3;
 				pIn++; pIn1++; pC0++; pC1++; pC2++; pC3++;
 			}
-			v4s R1 = gap_pack4(gap_clip(AT_SCALE(S0, pSc[4*Line  ], pScN[4*Line  ]), 7), gap_clip(AT_SCALE(S1, pSc[4*Line+1], pScN[4*Line+1]), 7),
-					   gap_clip(AT_SCALE(S2, pSc[4*Line+2], pScN[4*Line+2]), 7), gap_clip(AT_SCALE(S3, pSc[4*Line+3], pScN[4*Line+3]), 7));
-			v4s R2 = gap_pack4(gap_clip(AT_SCALE(S4, pSc[4*Line  ], pScN[4*Line  ]), 7), gap_clip(AT_SCALE(S5, pSc[4*Line+1], pScN[4*Line+1]), 7),
-					   gap_clip(AT_SCALE(S6, pSc[4*Line+2], pScN[4*Line+2]), 7), gap_clip(AT_SCALE(S7, pSc[4*Line+3], pScN[4*Line+3]), 7));
+			S0 = AT_SCALE(S0, pSc[4*Line  ], pScN[4*Line  ]); ACT_SWITCH(S0, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+			S1 = AT_SCALE(S1, pSc[4*Line+1], pScN[4*Line+1]); ACT_SWITCH(S1, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+			S2 = AT_SCALE(S2, pSc[4*Line+2], pScN[4*Line+2]); ACT_SWITCH(S2, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+			S3 = AT_SCALE(S3, pSc[4*Line+3], pScN[4*Line+3]); ACT_SWITCH(S3, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+			S4 = AT_SCALE(S4, pSc[4*Line  ], pScN[4*Line  ]); ACT_SWITCH(S4, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+			S5 = AT_SCALE(S5, pSc[4*Line+1], pScN[4*Line+1]); ACT_SWITCH(S5, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+			S6 = AT_SCALE(S6, pSc[4*Line+2], pScN[4*Line+2]); ACT_SWITCH(S6, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+			S7 = AT_SCALE(S7, pSc[4*Line+3], pScN[4*Line+3]); ACT_SWITCH(S7, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+			v4s R1 = gap_pack4(gap_clip(S0, 7), gap_clip(S1, 7), gap_clip(S2, 7), gap_clip(S3, 7));
+			v4s R2 = gap_pack4(gap_clip(S4, 7), gap_clip(S5, 7), gap_clip(S6, 7), gap_clip(S7, 7));
 			*((v4s *) (pOut0+4*Line)) = R1;
 			*((v4s *) (pOut1+4*Line)) = R2;
                 }
@@ -1327,8 +1237,10 @@ void KerPar_MM_Conv1D_DxDy_HWC_SQ8(
 				S0 += V0*C0; S4 += V1*C0;
 				pIn++; pIn1++; pC++;
 			}
-			*(pOut0+i) = gap_clip(AT_SCALE(S0, pSc[i], pScN[i]), 7);
-			*(pOut1+i) = gap_clip(AT_SCALE(S4, pSc[i], pScN[i]), 7);
+			S0 = AT_SCALE(S0, pSc[i], pScN[i]); ACT_SWITCH(S0, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+			S4 = AT_SCALE(S4, pSc[i], pScN[i]); ACT_SWITCH(S4, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+			*(pOut0+i) = gap_clip(S0, 7);
+			*(pOut1+i) = gap_clip(S4, 7);
 		}
 		gap_waitbarrier(0);
 	}
@@ -1393,8 +1305,11 @@ void KerPar_MM_Conv1D_DxDy_HWC_SQ8(
 				S3 += V0*C3;
 				pIn++; pC0++; pC1++; pC2++; pC3++;
 			}
-			v4s R1 = gap_pack4(gap_clip(AT_SCALE(S0, pSc[4*Line  ], pScN[4*Line  ]), 7), gap_clip(AT_SCALE(S1, pSc[4*Line+1], pScN[4*Line+1]), 7),
-					   gap_clip(AT_SCALE(S2, pSc[4*Line+2], pScN[4*Line+2]), 7), gap_clip(AT_SCALE(S3, pSc[4*Line+3], pScN[4*Line+3]), 7));
+			S0 = AT_SCALE(S0, pSc[4*Line  ], pScN[4*Line  ]); ACT_SWITCH(S0, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+			S1 = AT_SCALE(S1, pSc[4*Line+1], pScN[4*Line+1]); ACT_SWITCH(S1, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+			S2 = AT_SCALE(S2, pSc[4*Line+2], pScN[4*Line+2]); ACT_SWITCH(S2, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+			S3 = AT_SCALE(S3, pSc[4*Line+3], pScN[4*Line+3]); ACT_SWITCH(S3, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+			v4s R1 = gap_pack4(gap_clip(S0, 7), gap_clip(S1, 7), gap_clip(S2, 7), gap_clip(S3, 7));
 			*((v4s *) (pOut0+4*Line)) = R1;
                 }
 		for (int i=4*(IterOut/4); i<IterOut; i++) {
@@ -1410,90 +1325,69 @@ void KerPar_MM_Conv1D_DxDy_HWC_SQ8(
 				S0 += V0*C0;
 				pIn++; pC++;
 			}
-			*(pOut0+i) = gap_clip(AT_SCALE(S0, pSc[i], pScN[i]), 7);
+			S0 = AT_SCALE(S0, pSc[i], pScN[i]); ACT_SWITCH(S0, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+			*(pOut0+i) = gap_clip(S0, 7);
 		}
 		gap_waitbarrier(0);
 	}
 }
 
-void KerPar_MM_Conv1D_DxDy_ReLU_SQ8(
-	Ker_MM_Conv_SQ8_T *Arg
-	)
+void KerPar_MM_Conv1D_DxDy_HWC_SQ8(Ker_MM_Conv_SQ8_T *Arg) { /* Exported entry point: passes the constant ACT_NONE as Activation to the always_inline KerPar_MM_Conv1D_DxDy_HWC_SQ8_act body */
+	KerPar_MM_Conv1D_DxDy_HWC_SQ8_act(Arg, ACT_NONE);
+}
 
-{
-	signed char *__restrict__ In = Arg->In;
-	int W = Arg->W, H = Arg->H;
-	signed char *__restrict__ Filter = Arg->Filter;
-	int Fx = Arg->Fx, Sx = Arg->Sx, Sy = Arg->Sy, Dx = Arg->Dx;
-	int PadL = Arg->Pad[0];
-	int InFeat = Arg->InFeat, OutFeat = Arg->OutFeat;
+void KerPar_MM_Conv1D_DxDy_ReLU_HWC_SQ8(Ker_MM_Conv_SQ8_T *Arg) { /* Exported entry point: passes the constant ACT_RELU as Activation to the always_inline KerPar_MM_Conv1D_DxDy_HWC_SQ8_act body */
+	KerPar_MM_Conv1D_DxDy_HWC_SQ8_act(Arg, ACT_RELU);
+}
 
-        int * __restrict__ Bias = Arg->Bias;
-	int NormBias = Arg->Infos[AT_INF_BIASN];
-        signed char * __restrict__ Out = Arg->Out;
-        unsigned char * __restrict__ Scale = Arg->Scale;
-        unsigned char * __restrict__ ScaleN = Arg->ScaleN;
-        signed char * __restrict__ ColBuff = Arg->ColBuff;
+void KerPar_MM_Conv1D_DxDy_ReLUN_HWC_SQ8(Ker_MM_Conv_SQ8_T *Arg) { /* Exported entry point: passes the constant ACT_RELUN as Activation to the always_inline KerPar_MM_Conv1D_DxDy_HWC_SQ8_act body */
+	KerPar_MM_Conv1D_DxDy_HWC_SQ8_act(Arg, ACT_RELUN);
+}
 
-	int Wo = Arg->Wo, Ho = Arg->Ho;
+void KerPar_MM_Conv1D_DxDy_ReLUM_HWC_SQ8(Ker_MM_Conv_SQ8_T *Arg) { /* Exported entry point: passes the constant ACT_RELUM as Activation to the always_inline KerPar_MM_Conv1D_DxDy_HWC_SQ8_act body */
+	KerPar_MM_Conv1D_DxDy_HWC_SQ8_act(Arg, ACT_RELUM);
+}
 
-	/* ColBuff must be large enough to accomodate Align(Fx*InFeat, 8) elements */
-	v4s * __restrict__ VBuff = (v4s *) ColBuff;
-	unsigned int W_In1 = InFeat*Fx;
-	unsigned int CoreId = gap_coreid(), C = ChunkSize(InFeat), F = Min(CoreId*C, InFeat), L = Min(InFeat, F+C);
-	unsigned int ChunkCell = ChunkSize(OutFeat), First = CoreId*ChunkCell, Last  = Min(OutFeat, First+ChunkCell);
+void KerPar_MM_Conv1D_DxDy_ReLUMN_HWC_SQ8(Ker_MM_Conv_SQ8_T *Arg) { /* Exported entry point: passes the constant ACT_RELUMN as Activation to the always_inline KerPar_MM_Conv1D_DxDy_HWC_SQ8_act body */
+	KerPar_MM_Conv1D_DxDy_HWC_SQ8_act(Arg, ACT_RELUMN);
+}
 
-	int Tail = 2*((W_In1+7)/8);
-	((int *)ColBuff)[Tail-1] = 0; ((int *)ColBuff)[Tail-2] = 0;
-	
-	int DFx = Dx*(Fx-1)+1;
-	// int Prec=10;
-	int InvDx = ((1<<Prec)+Dx-1)/Dx;
-	int PosL = 0;
-	int Iter = L-F;
-	int Iter1 = Iter*Fx;
-	for (int l=0; l<Ho; l++) {
-		int PosC = -PadL;
-		for (int c=0; c<Wo; c++) {
-			for (int i=0; i<(Iter1/4); i++) ((int *)(ColBuff+F*Fx))[i]=0;
-			if (Iter1&0x2) ((short int *)(ColBuff+F*Fx))[Iter1/2-1]=0;
-			if (Iter1&0x1) ((signed char *)(ColBuff+F*Fx))[Iter1-1]=0;
-			int Lb = Max(PosC, 0), Rb = Min(PosC+DFx, W);
-			int Off = -Lb - Min(PosC, 0);
-			int IterX = gap_mulsN(Rb-Lb-1, InvDx, Prec) + 1;
-			for (int f=F; f<L; f++) {
-				for (int i=0; i<IterX; i++) ColBuff[Fx*f+i+Off] = In[f*W*H+PosL*W+Lb+i*Dx];
-			}
-			PosC += Sx;
-			gap_waitbarrier(0);
-	                for (int Line=First; Line<Last; Line++) {
-	                        v4s *VIn1 = (v4s *) (&Filter[Line*W_In1 + 0]);
-	                        int S0 = (Bias[Line]<<NormBias);
-	                        for (int i=0; i<((W_In1+7)/8); i++) {
-	                                v4s V0 = VIn1[2*i], V1 = VIn1[2*i+1];
-					v4s C0 = VBuff[2*i], C1 = VBuff[2*i+1];
-	                                S0 = gap_sumdotp4(V0, C0, S0);
-	                                S0 = gap_sumdotp4(V1, C1, S0);
-	                        }
-	                        unsigned int Sc = Scale[Line], ScN = ScaleN[Line];
-	                        Out[Line*Wo*Ho + l*Wo + c] = AT_CLIP_POS_IMM(AT_SCALE(S0, Sc, ScN), 7);
-	                }
-			gap_waitbarrier(0);
-		}
-		PosL += Sy;
-	}
+void KerPar_MM_Conv1D_DxDy_LeakyReLU_HWC_SQ8(Ker_MM_Conv_SQ8_T *Arg) { /* Exported entry point: passes the constant ACT_LEAKYRELU as Activation to the always_inline KerPar_MM_Conv1D_DxDy_HWC_SQ8_act body */
+	KerPar_MM_Conv1D_DxDy_HWC_SQ8_act(Arg, ACT_LEAKYRELU);
 }
 
-void KerPar_MM_Conv1D_ReLU_SQ8(
-	Ker_MM_Conv_SQ8_T *Arg
+void KerPar_MM_Conv1D_DxDy_HSwish_HWC_SQ8(Ker_MM_Conv_SQ8_T *Arg) { /* Exported entry point: passes the constant ACT_HSWISH as Activation to the always_inline KerPar_MM_Conv1D_DxDy_HWC_SQ8_act body */
+	KerPar_MM_Conv1D_DxDy_HWC_SQ8_act(Arg, ACT_HSWISH);
+}
+
+void KerPar_MM_Conv1D_DxDy_HSigmoid_HWC_SQ8(Ker_MM_Conv_SQ8_T *Arg) { /* Exported entry point: passes the constant ACT_HSIGMOID as Activation to the always_inline KerPar_MM_Conv1D_DxDy_HWC_SQ8_act body */
+	KerPar_MM_Conv1D_DxDy_HWC_SQ8_act(Arg, ACT_HSIGMOID);
+}
+
+void KerPar_MM_Conv1D_DxDy_Sigmoid_HWC_SQ8(Ker_MM_Conv_SQ8_T *Arg) { /* Exported entry point: passes the constant ACT_SIGMOID as Activation to the always_inline KerPar_MM_Conv1D_DxDy_HWC_SQ8_act body */
+	KerPar_MM_Conv1D_DxDy_HWC_SQ8_act(Arg, ACT_SIGMOID);
+}
+
+void KerPar_MM_Conv1D_DxDy_Tanh_HWC_SQ8(Ker_MM_Conv_SQ8_T *Arg) { /* Exported entry point: passes the constant ACT_TANH as Activation to the always_inline KerPar_MM_Conv1D_DxDy_HWC_SQ8_act body */
+	KerPar_MM_Conv1D_DxDy_HWC_SQ8_act(Arg, ACT_TANH);
+}
+
+/*
+ * 2D convolution kernels (no dilation: windows span Fx x Fy contiguous input points), based on MatMul (im2col: ColBuff), with CHW in/out tensor order.
+ * An optional fused activation is applied to the 32-bit accumulator through ACT_SWITCH (defined in CNN_BasicKernels_SQ8.h).
+ */
+static inline void __attribute__((always_inline)) KerPar_MM_Conv2D_SQ8_act(
+	Ker_MM_Conv_SQ8_T *Arg,
+	CNN_ActivationOper_T Activation
 	)
 
 {
 	signed char *__restrict__ In = Arg->In;
 	int W = Arg->W, H = Arg->H;
 	signed char *__restrict__ Filter = Arg->Filter;
-	int Fx = Arg->Fx, Sx = Arg->Sx, Sy = Arg->Sy;
-	int PadL = Arg->Pad[0];
+	int Fx = Arg->Fx, Sx = Arg->Sx;
+	int Fy = Arg->Fy, Sy = Arg->Sy;
+	int PadL = Arg->Pad[0], PadT = Arg->Pad[2];
 	int InFeat = Arg->InFeat, OutFeat = Arg->OutFeat;
         int * __restrict__ Bias = Arg->Bias;
 	int NormBias = Arg->Infos[AT_INF_BIASN];
@@ -1503,67 +1397,105 @@ void KerPar_MM_Conv1D_ReLU_SQ8(
         signed char * __restrict__ ColBuff = Arg->ColBuff;
 	int Wo = Arg->Wo, Ho = Arg->Ho;
 
+	unsigned char * Infos = Arg->Infos;
+	unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN];
+	int A0 = *((unsigned char *) &Infos[AT_INF_A0]); int B0 = *((unsigned char *) &Infos[AT_INF_B0]); int C0 = *((unsigned char *) &Infos[AT_INF_C0]);
+
 	/* ColBuff must be large enough to accomodate Align(Fx*InFeat, 8) elements */
 	v4s * __restrict__ VBuff = (v4s *) ColBuff;
-	unsigned int W_In1 = InFeat*Fx;
+	unsigned int W_In1 = InFeat*Fx*Fy;
 	unsigned int CoreId = gap_coreid(), C = ChunkSize(InFeat), F = Min(CoreId*C, InFeat), L = Min(InFeat, F+C);
 	unsigned int ChunkCell = ChunkSize(OutFeat), First = CoreId*ChunkCell, Last  = Min(OutFeat, First+ChunkCell);
 
+	int FS = Fx*Fy;
 	int Tail = 2*((W_In1+7)/8);
 	((int *)ColBuff)[Tail-1] = 0; ((int *)ColBuff)[Tail-2] = 0;
-	int PosL = 0;
+	int PosL = Arg->FirstTile?(-PadT):0;
 	int Iter = L-F;
-	int Iter1 = Iter*Fx;
+	int Iter1 = Iter*FS;
+
+	// printf("If: %3d, Of: %3d, W: %3d, H: %3d, Wo: %3d, Ho: %3d, PosL: %d\n", InFeat, OutFeat, W, H, Wo, Ho, PosL);
 	for (int l=0; l<Ho; l++) {
 		int PosC = -PadL;
+		int Tb = Max(PosL, 0), Db = Min(PosL+Fy, H);
+		int OffL = -Tb - Min(PosL, 0);
 		for (int c=0; c<Wo; c++) {
-			for (int i=0; i<(Iter1/4); i++) ((int *)(ColBuff+F*Fx))[i]=0;
-			if (Iter1&0x2) ((short int *)(ColBuff+F*Fx))[Iter1/2-1]=0;
-			if (Iter1&0x1) ((signed char *)(ColBuff+F*Fx))[Iter1-1]=0;
-			int Lb = Max(PosC, 0), Rb = Min(PosC+Fx, W);
-			int Off = -Lb - Min(PosC, 0);
+			for (int i=0; i<(Iter1/4); i++) ((int *)(ColBuff+F*FS))[i]=0;
+			if (Iter1&0x2) ((short int *)(ColBuff+F*FS))[Iter1/2-1]=0;
+			if (Iter1&0x1) ((signed char *)(ColBuff+F*FS))[Iter1-1]=0;
+			int Lb = Max(PosC, 0), Rb = Min(PosC+Fx, W);
+			int OffC = -Lb - Min(PosC, 0);
 			int Size = Rb-Lb;
 			if (Size>4) {
 				if (Size&0x2) {
 					if (Size&0x1) {
 						for (int f=F; f<L; f++) {
-							for (int i=0; i<Size/4; i++) *((int *)(ColBuff+Fx*f+Off+4*i+Lb)) = *((int *)(In+f*W*H+PosL*W+4*i+Lb));
-							*((short int *)(ColBuff+Fx*f+Off+Rb-3)) = *((short int *)(In+f*W*H+PosL*W+Rb-3));
-							*((ColBuff+Fx*f+Off+Rb-1)) = *((In+f*W*H+PosL*W+Rb-1));
+							for (int j=Tb; j<Db; j++) {
+								for (int i=0; i<Size/4; i++) *((int *)(ColBuff+FS*f + Fx*(j+OffL)+OffC+4*i+Lb)) = *((int *)(In+f*W*H+j*W+4*i+Lb));
+								*((short int *)(ColBuff+FS*f + Fx*(j+OffL)+OffC+Rb-3)) = *((short int *)(In+f*W*H+j*W+Rb-3));
+								*((ColBuff+FS*f + Fx*(j+OffL)+OffC+Rb-1)) = *((In+f*W*H+j*W+Rb-1));
+							}
 						}
 					} else {
 						for (int f=F; f<L; f++) {
-							for (int i=0; i<Size/4; i++) *((int *)(ColBuff+Fx*f+Off+4*i+Lb)) = *((int *)(In+f*W*H+PosL*W+4*i+Lb));
-							*((short int *)(ColBuff+Fx*f+Off+Rb-2)) = *((short int *)(In+f*W*H+PosL*W+Rb-2));
+							for (int j=Tb; j<Db; j++) {
+								for (int i=0; i<Size/4; i++) *((int *)(ColBuff+FS*f + Fx*(j+OffL)+OffC+4*i+Lb)) = *((int *)(In+f*W*H+j*W+4*i+Lb));
+								*((short int *)(ColBuff+FS*f + Fx*(j+OffL)+OffC+Rb-2)) = *((short int *)(In+f*W*H+j*W+Rb-2));
+							}
 						}
 					}
 				} else if (Size&0x1) {
 					for (int f=F; f<L; f++) {
-						for (int i=0; i<Size/4; i++) *((int *)(ColBuff+Fx*f+Off+4*i+Lb)) = *((int *)(In+f*W*H+PosL*W+4*i+Lb));
-						*((ColBuff+Fx*f+Off+Rb-1)) = *((In+f*W*H+PosL*W+Rb-1));
+						for (int j=Tb; j<Db; j++) {
+							for (int i=0; i<Size/4; i++) *((int *)(ColBuff+FS*f + Fx*(j+OffL)+OffC+4*i+Lb)) = *((int *)(In+f*W*H+j*W+4*i+Lb));
+							*((ColBuff+FS*f + Fx*(j+OffL)+OffC+Rb-1)) = *((In+f*W*H+j*W+Rb-1));
+						}
 					}
-				} else for (int f=F; f<L; f++) for (int i=0; i<Size/4; i++) *((int *)(ColBuff+Fx*f+Off+4*i+Lb)) = *((int *)(In+f*W*H+PosL*W+4*i+Lb));
+				} else {
+					for (int f=F; f<L; f++) {
+						for (int j=Tb; j<Db; j++) {
+							for (int i=0; i<Size/4; i++) *((int *)(ColBuff+FS*f + Fx*(j+OffL)+OffC+4*i+Lb)) = *((int *)(In+f*W*H+j*W+4*i+Lb));
+						}
+					}
+				}
 			} else if (Size>=2) {
 				if (Size&0x4) {
 					for (int f=F; f<L; f++) {
-						*((int *)(ColBuff+Fx*f+Off+Lb)) = *((int *)(In+f*W*H+PosL*W+Lb));
+						for (int j=Tb; j<Db; j++) {
+							*((int *)(ColBuff+FS*f + Fx*(j+OffL)+OffC+Lb)) = *((int *)(In+f*W*H+j*W+Lb));
+						}
 					}
 				} else if (Size&0x1) {
 					for (int f=F; f<L; f++) {
-						*((short int *)(ColBuff+Fx*f+Off+Lb)) = *((short int *)(In+f*W*H+PosL*W+Lb));
-						ColBuff[Fx*f+Off+Lb+2] = In[f*W*H+PosL*W+Lb+2];
+						for (int j=Tb; j<Db; j++) {
+							*((short int *)(ColBuff+FS*f + Fx*(j+OffL)+OffC+Lb)) = *((short int *)(In+f*W*H+j*W+Lb));
+							ColBuff[FS*f + Fx*(j+OffL)+OffC+Lb+2] = In[f*W*H+j*W+Lb+2];
+						}
 					}
 				} else {
 					for (int f=F; f<L; f++) {
-						*((short int *)(ColBuff+Fx*f+Off+Lb)) = *((short int *)(In+f*W*H+PosL*W+Lb));
+						for (int j=Tb; j<Db; j++) {
+							*((short int *)(ColBuff+FS*f + Fx*(j+OffL)+OffC+Lb)) = *((short int *)(In+f*W*H+j*W+Lb));
+						}
 					}
 				}
-			} else if (Size) for (int f=F; f<L; f++) ColBuff[Fx*f+Off+Lb] = In[f*W*H+PosL*W+Lb];
+			} else if (Size) {
+				for (int f=F; f<L; f++) {
+					for (int j=Tb; j<Db; j++) {
+						ColBuff[FS*f + Fx*(j+OffL)+OffC+Lb] = In[f*W*H+j*W+Lb];
+					}
+				}
+			}
 			PosC += Sx;
 			gap_waitbarrier(0);
+			/*
+			printf("Line: %d, Col: %d, OutFeat: %d to %d\n", l, c, First, Last-1);
+			printf("Feat : "); for (int i=0; i<(((W_In1+7)/8)*8); i++) printf("%2d ", ColBuff[i]); printf("\n");
+			*/
 	                for (int Line=First; Line<Last; Line++) {
 	                        v4s *VIn1 = (v4s *) (&Filter[Line*W_In1 + 0]);
 	                        int S0 = (Bias[Line]<<NormBias);
+				// printf("S0   : %d\n", S0); printf("Filt%d: ", Line); for (int i=0; i<W_In1; i++) printf("%2d ", ((signed char *)VIn1)[i]); printf("\n");
 	                        for (int i=0; i<((W_In1+7)/8); i++) {
 	                                v4s V0 = VIn1[2*i], V1 = VIn1[2*i+1];
 					v4s C0 = VBuff[2*i], C1 = VBuff[2*i+1];
@@ -1571,215 +1503,69 @@ void KerPar_MM_Conv1D_ReLU_SQ8(
 	                                S0 = gap_sumdotp4(V1, C1, S0);
 	                        }
 	                        unsigned int Sc = Scale[Line], ScN = ScaleN[Line];
-	                        Out[Line*Wo*Ho + l*Wo + c] = AT_CLIP_POS_IMM(AT_SCALE(S0, Sc, ScN), 7);
+				// printf("Out[F:%d, H:%d, W:%d] = (%d * %d) >> %d = %d\n", Line, l, c, S0, Sc, ScN, gap_clip(AT_SCALE(S0, Sc, ScN), 7));
+				S0 = AT_SCALE(S0, Sc, ScN); ACT_SWITCH(S0, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+	                        Out[Line*Wo*Ho + l*Wo + c] = gap_clip(S0, 7);
 	                }
 			gap_waitbarrier(0);
 		}
 		PosL += Sy;
 	}
+	gap_waitbarrier(0);
 }
 
-void KerPar_MM_Conv1D_ReLUN_SQ8(
-	Ker_MM_Conv_SQ8_T *Arg
-	)
+void KerPar_MM_Conv2D_SQ8(Ker_MM_Conv_SQ8_T *Arg) { /* Exported entry point: passes the constant ACT_NONE as Activation to the always_inline KerPar_MM_Conv2D_SQ8_act body */
+	KerPar_MM_Conv2D_SQ8_act(Arg, ACT_NONE);
+}
 
-{
-	signed char *__restrict__ In = Arg->In;
-	int W = Arg->W, H = Arg->H;
-	signed char *__restrict__ Filter = Arg->Filter;
-	int Fx = Arg->Fx, Sx = Arg->Sx, Sy = Arg->Sy;
-	int PadL = Arg->Pad[0];
-	int InFeat = Arg->InFeat, OutFeat = Arg->OutFeat;
-        int * __restrict__ Bias = Arg->Bias;
-	int NormBias = Arg->Infos[AT_INF_BIASN];
-        signed char * __restrict__ Out = Arg->Out;
-        unsigned char * __restrict__ Scale = Arg->Scale;
-        unsigned char * __restrict__ ScaleN = Arg->ScaleN;
-        signed char * __restrict__ ColBuff = Arg->ColBuff;
-	int Wo = Arg->Wo, Ho = Arg->Ho;
-	int A0 = Arg->Infos[AT_INF_A0];
+void KerPar_MM_Conv2D_ReLU_SQ8(Ker_MM_Conv_SQ8_T *Arg) { /* Exported entry point: passes the constant ACT_RELU as Activation to the always_inline KerPar_MM_Conv2D_SQ8_act body */
+	KerPar_MM_Conv2D_SQ8_act(Arg, ACT_RELU);
+}
 
-	/* ColBuff must be large enough to accomodate Align(Fx*InFeat, 8) elements */
-	v4s * __restrict__ VBuff = (v4s *) ColBuff;
-	unsigned int W_In1 = InFeat*Fx;
-	unsigned int CoreId = gap_coreid(), C = ChunkSize(InFeat), F = Min(CoreId*C, InFeat), L = Min(InFeat, F+C);
-	unsigned int ChunkCell = ChunkSize(OutFeat), First = CoreId*ChunkCell, Last  = Min(OutFeat, First+ChunkCell);
+void KerPar_MM_Conv2D_ReLUN_SQ8(Ker_MM_Conv_SQ8_T *Arg) { /* Exported entry point: passes the constant ACT_RELUN as Activation to the always_inline KerPar_MM_Conv2D_SQ8_act body */
+	KerPar_MM_Conv2D_SQ8_act(Arg, ACT_RELUN);
+}
 
-	int Tail = 2*((W_In1+7)/8);
-	((int *)ColBuff)[Tail-1] = 0; ((int *)ColBuff)[Tail-2] = 0;
-	int PosL = 0;
-	int Iter = L-F;
-	int Iter1 = Iter*Fx;
-	for (int l=0; l<Ho; l++) {
-		int PosC = -PadL;
-		for (int c=0; c<Wo; c++) {
-			for (int i=0; i<(Iter1/4); i++) ((int *)(ColBuff+F*Fx))[i]=0;
-			if (Iter1&0x2) ((short int *)(ColBuff+F*Fx))[Iter1/2-1]=0;
-			if (Iter1&0x1) ((signed char *)(ColBuff+F*Fx))[Iter1-1]=0;
-			int Lb = Max(PosC, 0), Rb = Min(PosC+Fx, W);
-			int Off = -Lb - Min(PosC, 0);
-			int Size = Rb-Lb;
-			if (Size>4) {
-				if (Size&0x2) {
-					if (Size&0x1) {
-						for (int f=F; f<L; f++) {
-							for (int i=0; i<Size/4; i++) *((int *)(ColBuff+Fx*f+Off+4*i+Lb)) = *((int *)(In+f*W*H+PosL*W+4*i+Lb));
-							*((short int *)(ColBuff+Fx*f+Off+Rb-3)) = *((short int *)(In+f*W*H+PosL*W+Rb-3));
-							*((ColBuff+Fx*f+Off+Rb-1)) = *((In+f*W*H+PosL*W+Rb-1));
-						}
-					} else {
-						for (int f=F; f<L; f++) {
-							for (int i=0; i<Size/4; i++) *((int *)(ColBuff+Fx*f+Off+4*i+Lb)) = *((int *)(In+f*W*H+PosL*W+4*i+Lb));
-							*((short int *)(ColBuff+Fx*f+Off+Rb-2)) = *((short int *)(In+f*W*H+PosL*W+Rb-2));
-						}
-					}
-				} else if (Size&0x1) {
-					for (int f=F; f<L; f++) {
-						for (int i=0; i<Size/4; i++) *((int *)(ColBuff+Fx*f+Off+4*i+Lb)) = *((int *)(In+f*W*H+PosL*W+4*i+Lb));
-						*((ColBuff+Fx*f+Off+Rb-1)) = *((In+f*W*H+PosL*W+Rb-1));
-					}
-				} else for (int f=F; f<L; f++) for (int i=0; i<Size/4; i++) *((int *)(ColBuff+Fx*f+Off+4*i+Lb)) = *((int *)(In+f*W*H+PosL*W+4*i+Lb));
-			} else if (Size>=2) {
-				if (Size&0x4) {
-					for (int f=F; f<L; f++) {
-						*((int *)(ColBuff+Fx*f+Off+Lb)) = *((int *)(In+f*W*H+PosL*W+Lb));
-					}
-				} else if (Size&0x1) {
-					for (int f=F; f<L; f++) {
-						*((short int *)(ColBuff+Fx*f+Off+Lb)) = *((short int *)(In+f*W*H+PosL*W+Lb));
-						ColBuff[Fx*f+Off+Lb+2] = In[f*W*H+PosL*W+Lb+2];
-					}
-				} else {
-					for (int f=F; f<L; f++) {
-						*((short int *)(ColBuff+Fx*f+Off+Lb)) = *((short int *)(In+f*W*H+PosL*W+Lb));
-					}
-				}
-			} else if (Size) for (int f=F; f<L; f++) ColBuff[Fx*f+Off+Lb] = In[f*W*H+PosL*W+Lb];
-			PosC += Sx;
-			gap_waitbarrier(0);
-	                for (int Line=First; Line<Last; Line++) {
-	                        v4s *VIn1 = (v4s *) (&Filter[Line*W_In1 + 0]);
-	                        int S0 = (Bias[Line]<<NormBias);
-	                        for (int i=0; i<((W_In1+7)/8); i++) {
-	                                v4s V0 = VIn1[2*i], V1 = VIn1[2*i+1];
-					v4s C0 = VBuff[2*i], C1 = VBuff[2*i+1];
-	                                S0 = gap_sumdotp4(V0, C0, S0);
-	                                S0 = gap_sumdotp4(V1, C1, S0);
-	                        }
-	                        unsigned int Sc = Scale[Line], ScN = ScaleN[Line];
-	                        Out[Line*Wo*Ho + l*Wo + c] = gap_clip(AT_CLIP_POS(AT_SCALE(S0, Sc, ScN), A0), 7);
-	                }
-			gap_waitbarrier(0);
-		}
-		PosL += Sy;
-	}
+void KerPar_MM_Conv2D_ReLUM_SQ8(Ker_MM_Conv_SQ8_T *Arg) { /* Exported entry point: passes the constant ACT_RELUM as Activation to the always_inline KerPar_MM_Conv2D_SQ8_act body */
+	KerPar_MM_Conv2D_SQ8_act(Arg, ACT_RELUM);
 }
 
-void KerPar_MM_Conv1D_LeakyReLU_SQ8(
-	Ker_MM_Conv_SQ8_T *Arg
-	)
+void KerPar_MM_Conv2D_ReLUMN_SQ8(Ker_MM_Conv_SQ8_T *Arg) { /* Exported entry point: passes the constant ACT_RELUMN as Activation to the always_inline KerPar_MM_Conv2D_SQ8_act body */
+	KerPar_MM_Conv2D_SQ8_act(Arg, ACT_RELUMN);
+}
 
-{
-	signed char *__restrict__ In = Arg->In;
-	int W = Arg->W, H = Arg->H;
-	signed char *__restrict__ Filter = Arg->Filter;
-	int Fx = Arg->Fx, Sx = Arg->Sx, Sy = Arg->Sy;
-	int PadL = Arg->Pad[0];
-	int InFeat = Arg->InFeat, OutFeat = Arg->OutFeat;
-        int * __restrict__ Bias = Arg->Bias;
-	int NormBias = Arg->Infos[AT_INF_BIASN];
-        signed char * __restrict__ Out = Arg->Out;
-        unsigned char * __restrict__ Scale = Arg->Scale;
-        unsigned char * __restrict__ ScaleN = Arg->ScaleN;
-        signed char * __restrict__ ColBuff = Arg->ColBuff;
-	int Wo = Arg->Wo, Ho = Arg->Ho;
-	int A0 = Arg->Infos[AT_INF_A0];
-	unsigned int ActScale = ((unsigned char *)Arg->Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Arg->Infos)[AT_INF_ACTSCALEN];
+void KerPar_MM_Conv2D_LeakyReLU_SQ8(Ker_MM_Conv_SQ8_T *Arg) { /* Exported entry point: passes the constant ACT_LEAKYRELU as Activation to the always_inline KerPar_MM_Conv2D_SQ8_act body */
+	KerPar_MM_Conv2D_SQ8_act(Arg, ACT_LEAKYRELU);
+}
 
-	/* ColBuff must be large enough to accomodate Align(Fx*InFeat, 8) elements */
-	v4s * __restrict__ VBuff = (v4s *) ColBuff;
-	unsigned int W_In1 = InFeat*Fx;
-	unsigned int CoreId = gap_coreid(), C = ChunkSize(InFeat), F = Min(CoreId*C, InFeat), L = Min(InFeat, F+C);
-	unsigned int ChunkCell = ChunkSize(OutFeat), First = CoreId*ChunkCell, Last  = Min(OutFeat, First+ChunkCell);
+void KerPar_MM_Conv2D_HSwish_SQ8(Ker_MM_Conv_SQ8_T *Arg) { /* Exported entry point: passes the constant ACT_HSWISH as Activation to the always_inline KerPar_MM_Conv2D_SQ8_act body */
+	KerPar_MM_Conv2D_SQ8_act(Arg, ACT_HSWISH);
+}
 
-	int Tail = 2*((W_In1+7)/8);
-	((int *)ColBuff)[Tail-1] = 0; ((int *)ColBuff)[Tail-2] = 0;
-	int PosL = 0;
-	int Iter = L-F;
-	int Iter1 = Iter*Fx;
-	for (int l=0; l<Ho; l++) {
-		int PosC = -PadL;
-		for (int c=0; c<Wo; c++) {
-			for (int i=0; i<(Iter1/4); i++) ((int *)(ColBuff+F*Fx))[i]=0;
-			if (Iter1&0x2) ((short int *)(ColBuff+F*Fx))[Iter1/2-1]=0;
-			if (Iter1&0x1) ((signed char *)(ColBuff+F*Fx))[Iter1-1]=0;
-			int Lb = Max(PosC, 0), Rb = Min(PosC+Fx, W);
-			int Off = -Lb - Min(PosC, 0);
-			int Size = Rb-Lb;
-			if (Size>4) {
-				if (Size&0x2) {
-					if (Size&0x1) {
-						for (int f=F; f<L; f++) {
-							for (int i=0; i<Size/4; i++) *((int *)(ColBuff+Fx*f+Off+4*i+Lb)) = *((int *)(In+f*W*H+PosL*W+4*i+Lb));
-							*((short int *)(ColBuff+Fx*f+Off+Rb-3)) = *((short int *)(In+f*W*H+PosL*W+Rb-3));
-							*((ColBuff+Fx*f+Off+Rb-1)) = *((In+f*W*H+PosL*W+Rb-1));
-						}
-					} else {
-						for (int f=F; f<L; f++) {
-							for (int i=0; i<Size/4; i++) *((int *)(ColBuff+Fx*f+Off+4*i+Lb)) = *((int *)(In+f*W*H+PosL*W+4*i+Lb));
-							*((short int *)(ColBuff+Fx*f+Off+Rb-2)) = *((short int *)(In+f*W*H+PosL*W+Rb-2));
-						}
-					}
-				} else if (Size&0x1) {
-					for (int f=F; f<L; f++) {
-						for (int i=0; i<Size/4; i++) *((int *)(ColBuff+Fx*f+Off+4*i+Lb)) = *((int *)(In+f*W*H+PosL*W+4*i+Lb));
-						*((ColBuff+Fx*f+Off+Rb-1)) = *((In+f*W*H+PosL*W+Rb-1));
-					}
-				} else for (int f=F; f<L; f++) for (int i=0; i<Size/4; i++) *((int *)(ColBuff+Fx*f+Off+4*i+Lb)) = *((int *)(In+f*W*H+PosL*W+4*i+Lb));
-			} else if (Size>=2) {
-				if (Size&0x4) {
-					for (int f=F; f<L; f++) {
-						*((int *)(ColBuff+Fx*f+Off+Lb)) = *((int *)(In+f*W*H+PosL*W+Lb));
-					}
-				} else if (Size&0x1) {
-					for (int f=F; f<L; f++) {
-						*((short int *)(ColBuff+Fx*f+Off+Lb)) = *((short int *)(In+f*W*H+PosL*W+Lb));
-						ColBuff[Fx*f+Off+Lb+2] = In[f*W*H+PosL*W+Lb+2];
-					}
-				} else {
-					for (int f=F; f<L; f++) {
-						*((short int *)(ColBuff+Fx*f+Off+Lb)) = *((short int *)(In+f*W*H+PosL*W+Lb));
-					}
-				}
-			} else if (Size) for (int f=F; f<L; f++) ColBuff[Fx*f+Off+Lb] = In[f*W*H+PosL*W+Lb];
-			PosC += Sx;
-			gap_waitbarrier(0);
-	                for (int Line=First; Line<Last; Line++) {
-	                        v4s *VIn1 = (v4s *) (&Filter[Line*W_In1 + 0]);
-	                        int S0 = (Bias[Line]<<NormBias);
-	                        for (int i=0; i<((W_In1+7)/8); i++) {
-	                                v4s V0 = VIn1[2*i], V1 = VIn1[2*i+1];
-					v4s C0 = VBuff[2*i], C1 = VBuff[2*i+1];
-	                                S0 = gap_sumdotp4(V0, C0, S0);
-	                                S0 = gap_sumdotp4(V1, C1, S0);
-	                        }
-	                        unsigned int Sc = Scale[Line], ScN = ScaleN[Line];
-				S0 = gap_clip(AT_SCALE(S0, Sc, ScN), 7);
-				int Neg0 = gap_bitextractu(S0, 1, 31), Pos0 = !Neg0;
-				int S0N = AT_NORM(S0 * A0, 7);
-	                        Out[Line*Wo*Ho + l*Wo + c] = gap_clip(AT_SCALE(ActScale, (Neg0*S0N+Pos0*S0), ActScaleN), 7);
-	                }
-			gap_waitbarrier(0);
-		}
-		PosL += Sy;
-	}
+void KerPar_MM_Conv2D_HSigmoid_SQ8(Ker_MM_Conv_SQ8_T *Arg) { /* Exported entry point: passes the constant ACT_HSIGMOID as Activation to the always_inline KerPar_MM_Conv2D_SQ8_act body */
+	KerPar_MM_Conv2D_SQ8_act(Arg, ACT_HSIGMOID);
 }
 
-void KerPar_MM_Conv2D_SQ8(
-	Ker_MM_Conv_SQ8_T *Arg
-	)
+void KerPar_MM_Conv2D_Sigmoid_SQ8(Ker_MM_Conv_SQ8_T *Arg) { /* Exported entry point: passes the constant ACT_SIGMOID as Activation to the always_inline KerPar_MM_Conv2D_SQ8_act body */
+	KerPar_MM_Conv2D_SQ8_act(Arg, ACT_SIGMOID);
+}
+
+void KerPar_MM_Conv2D_Tanh_SQ8(Ker_MM_Conv_SQ8_T *Arg) { /* Exported entry point: passes the constant ACT_TANH as Activation to the always_inline KerPar_MM_Conv2D_SQ8_act body */
+	KerPar_MM_Conv2D_SQ8_act(Arg, ACT_TANH);
+}
+
+/*
+ * 2D Convolutional Kernels with Dilation kernels based on MatMul (im2col: ColBuff) with HWC inout tensor order
+ * An optional fused activation is applied to the 32-bit accumulator through ACT_SWITCH (defined in CNN_BasicKernels_SQ8.h).
+ */
+static inline void __attribute__((always_inline)) KerPar_MM_Conv2D_HWC_SQ8_act(
+	Ker_MM_Conv_SQ8_T *Arg,
+	CNN_ActivationOper_T Activation)
 
 {
+	/*
+		For HWC weights (4D Tensor) are expected to be organized as [OutFeat x Fy x Fx x InFeat]
+	*/
 	signed char *__restrict__ In = Arg->In;
 	int W = Arg->W, H = Arg->H;
 	signed char *__restrict__ Filter = Arg->Filter;
@@ -1794,922 +1580,307 @@ void KerPar_MM_Conv2D_SQ8(
         unsigned char * __restrict__ ScaleN = Arg->ScaleN;
         signed char * __restrict__ ColBuff = Arg->ColBuff;
 	int Wo = Arg->Wo, Ho = Arg->Ho;
-	int A0 = Arg->Infos[AT_INF_A0];
+	unsigned char * Infos = Arg->Infos;
+	unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN];
+	int A0 = *((unsigned char *) &Infos[AT_INF_A0]); int B0 = *((unsigned char *) &Infos[AT_INF_B0]); int C0 = *((unsigned char *) &Infos[AT_INF_C0]);
 
 	/* ColBuff must be large enough to accomodate Align(Fx*InFeat, 8) elements */
 	v4s * __restrict__ VBuff = (v4s *) ColBuff;
 	unsigned int W_In1 = InFeat*Fx*Fy;
-	unsigned int CoreId = gap_coreid(), C = ChunkSize(InFeat), F = Min(CoreId*C, InFeat), L = Min(InFeat, F+C);
-	unsigned int ChunkCell = ChunkSize(OutFeat), First = CoreId*ChunkCell, Last  = Min(OutFeat, First+ChunkCell);
-
-	int FS = Fx*Fy;
-	int Tail = 2*((W_In1+7)/8);
-	((int *)ColBuff)[Tail-1] = 0; ((int *)ColBuff)[Tail-2] = 0;
-	int PosL = Arg->FirstTile?(-PadT):0;
-	int Iter = L-F;
-	int Iter1 = Iter*FS;
-
-	// printf("If: %3d, Of: %3d, W: %3d, H: %3d, Wo: %3d, Ho: %3d, PosL: %d\n", InFeat, OutFeat, W, H, Wo, Ho, PosL);
-	for (int l=0; l<Ho; l++) {
-		int PosC = -PadL;
-		int Tb = Max(PosL, 0), Db = Min(PosL+Fy, H);
-		int OffL = -Tb - Min(PosL, 0);
-		for (int c=0; c<Wo; c++) {
-			for (int i=0; i<(Iter1/4); i++) ((int *)(ColBuff+F*FS))[i]=0;
-			if (Iter1&0x2) ((short int *)(ColBuff+F*FS))[Iter1/2-1]=0;
-			if (Iter1&0x1) ((signed char *)(ColBuff+F*FS))[Iter1-1]=0;
-			int Lb = Max(PosC, 0), Rb = Min(PosC+Fx, W);
-			int OffC = -Lb - Min(PosC, 0);
-			int Size = Rb-Lb;
-			if (Size>4) {
-				if (Size&0x2) {
-					if (Size&0x1) {
-						for (int f=F; f<L; f++) {
-							for (int j=Tb; j<Db; j++) {
-								for (int i=0; i<Size/4; i++) *((int *)(ColBuff+FS*f + Fx*(j+OffL)+OffC+4*i+Lb)) = *((int *)(In+f*W*H+j*W+4*i+Lb));
-								*((short int *)(ColBuff+FS*f + Fx*(j+OffL)+OffC+Rb-3)) = *((short int *)(In+f*W*H+j*W+Rb-3));
-								*((ColBuff+FS*f + Fx*(j+OffL)+OffC+Rb-1)) = *((In+f*W*H+j*W+Rb-1));
-							}
-						}
-					} else {
-						for (int f=F; f<L; f++) {
-							for (int j=Tb; j<Db; j++) {
-								for (int i=0; i<Size/4; i++) *((int *)(ColBuff+FS*f + Fx*(j+OffL)+OffC+4*i+Lb)) = *((int *)(In+f*W*H+j*W+4*i+Lb));
-								*((short int *)(ColBuff+FS*f + Fx*(j+OffL)+OffC+Rb-2)) = *((short int *)(In+f*W*H+j*W+Rb-2));
-							}
-						}
-					}
-				} else if (Size&0x1) {
-					for (int f=F; f<L; f++) {
-						for (int j=Tb; j<Db; j++) {
-							for (int i=0; i<Size/4; i++) *((int *)(ColBuff+FS*f + Fx*(j+OffL)+OffC+4*i+Lb)) = *((int *)(In+f*W*H+j*W+4*i+Lb));
-							*((ColBuff+FS*f + Fx*(j+OffL)+OffC+Rb-1)) = *((In+f*W*H+j*W+Rb-1));
-						}
-					}
-				} else {
-					for (int f=F; f<L; f++) {
-						for (int j=Tb; j<Db; j++) {
-							for (int i=0; i<Size/4; i++) *((int *)(ColBuff+FS*f + Fx*(j+OffL)+OffC+4*i+Lb)) = *((int *)(In+f*W*H+j*W+4*i+Lb));
-						}
-					}
-				}
-			} else if (Size>=2) {
-				if (Size&0x4) {
-					for (int f=F; f<L; f++) {
-						for (int j=Tb; j<Db; j++) {
-							*((int *)(ColBuff+FS*f + Fx*(j+OffL)+OffC+Lb)) = *((int *)(In+f*W*H+j*W+Lb));
-						}
-					}
-				} else if (Size&0x1) {
-					for (int f=F; f<L; f++) {
-						for (int j=Tb; j<Db; j++) {
-							*((short int *)(ColBuff+FS*f + Fx*(j+OffL)+OffC+Lb)) = *((short int *)(In+f*W*H+j*W+Lb));
-							ColBuff[FS*f + Fx*(j+OffL)+OffC+Lb+2] = In[f*W*H+j*W+Lb+2];
-						}
-					}
-				} else {
-					for (int f=F; f<L; f++) {
-						for (int j=Tb; j<Db; j++) {
-							*((short int *)(ColBuff+FS*f + Fx*(j+OffL)+OffC+Lb)) = *((short int *)(In+f*W*H+j*W+Lb));
-						}
-					}
-				}
-			} else if (Size) {
-				for (int f=F; f<L; f++) {
-					for (int j=Tb; j<Db; j++) {
-						ColBuff[FS*f + Fx*(j+OffL)+OffC+Lb] = In[f*W*H+j*W+Lb];
-					}
-				}
-			}
-			PosC += Sx;
-			gap_waitbarrier(0);
-			/*
-			printf("Line: %d, Col: %d, OutFeat: %d to %d\n", l, c, First, Last-1);
-			printf("Feat : "); for (int i=0; i<(((W_In1+7)/8)*8); i++) printf("%2d ", ColBuff[i]); printf("\n");
-			*/
-	                for (int Line=First; Line<Last; Line++) {
-	                        v4s *VIn1 = (v4s *) (&Filter[Line*W_In1 + 0]);
-	                        int S0 = (Bias[Line]<<NormBias);
-				// printf("S0   : %d\n", S0); printf("Filt%d: ", Line); for (int i=0; i<W_In1; i++) printf("%2d ", ((signed char *)VIn1)[i]); printf("\n");
-	                        for (int i=0; i<((W_In1+7)/8); i++) {
-	                                v4s V0 = VIn1[2*i], V1 = VIn1[2*i+1];
-					v4s C0 = VBuff[2*i], C1 = VBuff[2*i+1];
-	                                S0 = gap_sumdotp4(V0, C0, S0);
-	                                S0 = gap_sumdotp4(V1, C1, S0);
-	                        }
-	                        unsigned int Sc = Scale[Line], ScN = ScaleN[Line];
-				// printf("Out[F:%d, H:%d, W:%d] = (%d * %d) >> %d = %d\n", Line, l, c, S0, Sc, ScN, gap_clip(AT_SCALE(S0, Sc, ScN), 7));
-	                        Out[Line*Wo*Ho + l*Wo + c] = gap_clip(AT_SCALE(S0, Sc, ScN), 7);
-	                }
-			gap_waitbarrier(0);
-		}
-		PosL += Sy;
-	}
-	gap_waitbarrier(0);
-}
-
-void KerPar_MM_Conv2D_HWC_SQ8(
-	Ker_MM_Conv_SQ8_T *Arg
-	)
-
-{
-	/*
-		For HWC weights (4D Tensor) are expected to be organized as [OutFeat x Fy x Fx x InFeat]
-	*/
-	signed char *__restrict__ In = Arg->In;
-	int W = Arg->W, H = Arg->H;
-	signed char *__restrict__ Filter = Arg->Filter;
-	int Fx = Arg->Fx, Sx = Arg->Sx;
-	int Fy = Arg->Fy, Sy = Arg->Sy;
-	int PadL = Arg->Pad[0], PadT = Arg->Pad[2];
-	int InFeat = Arg->InFeat, OutFeat = Arg->OutFeat;
-        int * __restrict__ Bias = Arg->Bias;
-	int NormBias = Arg->Infos[AT_INF_BIASN];
-        signed char * __restrict__ Out = Arg->Out;
-        unsigned char * __restrict__ Scale = Arg->Scale;
-        unsigned char * __restrict__ ScaleN = Arg->ScaleN;
-        signed char * __restrict__ ColBuff = Arg->ColBuff;
-	int Wo = Arg->Wo, Ho = Arg->Ho;
-	int A0 = Arg->Infos[AT_INF_A0];
-
-	/* ColBuff must be large enough to accomodate Align(Fx*InFeat, 8) elements */
-	v4s * __restrict__ VBuff = (v4s *) ColBuff;
-	unsigned int W_In1 = InFeat*Fx*Fy;
-	unsigned int CoreId = gap_coreid(), C = ChunkSize(InFeat), F = Min(CoreId*C, InFeat), L = Min(InFeat, F+C);
-	unsigned int ChunkCell = ChunkSize(OutFeat), First = CoreId*ChunkCell, Last  = Min(OutFeat, First+ChunkCell);
-
-	int FS = Fx*Fy;
-	int Tail = 2*((W_In1+7)/8);
-
-	signed char * __restrict__ ColBuff1 = ColBuff + 4*Tail;
-	((int *)ColBuff)[Tail-1] = 0; ((int *)ColBuff)[Tail-2] = 0;
-	((int *)ColBuff1)[Tail-1] = 0; ((int *)ColBuff1)[Tail-2] = 0;
-	int PosL = Arg->FirstTile?(-PadT):0;
-
-	int Iter = L-F;
-	int Iter1 = Iter*FS;
-	int IterOut = Max(0, Last - First);
-	for (int l=0; l<Ho; l++) {
-		int PosC = -PadL;
-		int Tb = Max(PosL, 0), Db = Min(PosL+Fy, H);
-		int OffL = -Tb - Min(PosL, 0);
-		for (int c=0; c<(Wo/2); c++) {
-			for (int i=0; i<(Iter1/4); i++) {
-				((int *)(ColBuff+F*FS))[i]=0;
-				((int *)(ColBuff1+F*FS))[i]=0;
-			}
-			if (Iter1&0x2) {
-				((short int *)(ColBuff+F*FS))[Iter1/2-1]=0;
-				((short int *)(ColBuff1+F*FS))[Iter1/2-1]=0;
-			}
-			if (Iter1&0x1) {
-				((signed char *)(ColBuff+F*FS))[Iter1-1]=0;
-				((signed char *)(ColBuff1+F*FS))[Iter1-1]=0;
-			}
-			int Lb = Max(PosC, 0), Rb = Min(PosC+Fx, W);
-			int Lb1 = Max(PosC+Sx, 0), Rb1 = Min(PosC+Sx+Fx, W);
-			int OffC = -Lb - Min(PosC, 0);
-			int OffC1 = -Lb1 - Min(PosC+Sx, 0);
-                        if (Iter>=4) {
-                                for (int f=0; f<(Iter/4); f++)
-					for (int j=Tb; j<Db; j++) {
-                                        	for (int i=Lb; i<Rb; i++) {
-							((int *)(ColBuff+(j+OffL)*InFeat*Fx+(i+OffC)*InFeat+F))[f] = ((int *)(In+j*W*InFeat + i*InFeat+F))[f];
-                                        	}
-                                        	for (int i=Lb1; i<Rb1; i++) {
-							((int *)(ColBuff1+(j+OffL)*InFeat*Fx+(i+OffC1)*InFeat+F))[f] = ((int *)(In+j*W*InFeat + i*InFeat+F))[f];
-                                        	}
-					}
-                                if (Iter&0x2)
-					for (int j=Tb; j<Db; j++) {
-						for (int i=Lb; i<Rb; i++)
-							((short int *)(ColBuff+(j+OffL)*InFeat*Fx+(i+OffC)*InFeat+F))[Iter/2-1] = ((short int *)(In+j*W*InFeat + i*InFeat+F))[Iter/2-1];
-						for (int i=Lb1; i<Rb1; i++)
-							((short int *)(ColBuff1+(j+OffL)*InFeat*Fx+(i+OffC1)*InFeat+F))[Iter/2-1] = ((short int *)(In+j*W*InFeat + i*InFeat+F))[Iter/2-1];
-					}
-                                if (Iter&0x1)
-					for (int j=Tb; j<Db; j++) {
-						for (int i=Lb; i<Rb; i++)
-							((signed char *)(ColBuff+(j+OffL)*InFeat*Fx+(i+OffC)*InFeat+F))[Iter-1] = ((signed char *)(In+j*W*InFeat + i*InFeat+F))[Iter-1];
-						for (int i=Lb1; i<Rb1; i++)
-							((signed char *)(ColBuff1+(j+OffL)*InFeat*Fx+(i+OffC1)*InFeat+F))[Iter-1] = ((signed char *)(In+j*W*InFeat + i*InFeat+F))[Iter-1];
-					}
-                        } else if (Iter>=2) {
-                                if (Iter&0x2)
-					for (int j=Tb; j<Db; j++) {
-						for (int i=Lb; i<Rb; i++)
-							((short int *)(ColBuff+(j+OffL)*InFeat*Fx+(i+OffC)*InFeat+F))[Iter/2-1] = ((short int *)(In+j*W*InFeat + i*InFeat+F))[Iter/2-1];
-						for (int i=Lb1; i<Rb1; i++)
-							((short int *)(ColBuff1+(j+OffL)*InFeat*Fx+(i+OffC1)*InFeat+F))[Iter/2-1] = ((short int *)(In+j*W*InFeat + i*InFeat+F))[Iter/2-1];
-					}
-                                if (Iter&0x1)
-					for (int j=Tb; j<Db; j++) {
-						for (int i=Lb; i<Rb; i++)
-							((signed char *)(ColBuff+(j+OffL)*InFeat*Fx+(i+OffC)*InFeat+F))[Iter-1] = ((signed char *)(In+j*W*InFeat + i*InFeat+F))[Iter-1];
-						for (int i=Lb1; i<Rb1; i++)
-							((signed char *)(ColBuff1+(j+OffL)*InFeat*Fx+(i+OffC1)*InFeat+F))[Iter-1] = ((signed char *)(In+j*W*InFeat + i*InFeat+F))[Iter-1];
-					}
-                        } else if (Iter>0) {
-				for (int j=Tb; j<Db; j++) {
-					for (int i=Lb; i<Rb; i++)
-						ColBuff[(j+OffL)*InFeat*Fx+(i+OffC)*InFeat + F] = In[j*W*InFeat + i*InFeat + F];
-					for (int i=Lb1; i<Rb1; i++)
-						ColBuff1[(j+OffL)*InFeat*Fx+(i+OffC1)*InFeat + F] = In[j*W*InFeat + i*InFeat + F];
-				}
-			}
-			PosC += 2*Sx;
-			gap_waitbarrier(0);
-
-			int *pBias = Bias + First;
-			signed char *pC = Filter + W_In1*First;
-			signed char *pOut0 = Out+l*Wo*OutFeat + (2*c+0)*OutFeat+First;
-			signed char *pOut1 = Out+l*Wo*OutFeat + (2*c+1)*OutFeat+First;
-			unsigned char *pSc = Scale + First;
-			unsigned char *pScN = ScaleN + First;
-	                for (int Line=0; Line<IterOut/4; Line++) {
-				signed char *pC0 = pC, *pC1 = pC0+W_In1, *pC2 = pC1+W_In1, *pC3 = pC2+W_In1;
-				pC=pC3+W_In1;
-	                        int S0 = (pBias[4*Line  ])<<NormBias, S4=S0;
-	                        int S1 = (pBias[4*Line+1])<<NormBias, S5=S1;
-	                        int S2 = (pBias[4*Line+2])<<NormBias, S6=S2;
-	                        int S3 = (pBias[4*Line+3])<<NormBias, S7=S3;
-				signed char *pIn = ColBuff;
-				signed char *pIn1 = ColBuff1;
-	                        for (int i=0; i<(W_In1/4); i++) {
-					v4s V0 = *((v4s *)pIn), V1 = *((v4s *)pIn1), C0 = *((v4s *)pC0), C1 = *((v4s *)pC1), C2 = *((v4s *)pC2), C3 = *((v4s *)pC3);
-	                                S0 = gap_sumdotp4(V0, C0, S0); S4 = gap_sumdotp4(V1, C0, S4); 
-	                                S1 = gap_sumdotp4(V0, C1, S1); S5 = gap_sumdotp4(V1, C1, S5); 
-	                                S2 = gap_sumdotp4(V0, C2, S2); S6 = gap_sumdotp4(V1, C2, S6); 
-	                                S3 = gap_sumdotp4(V0, C3, S3); S7 = gap_sumdotp4(V1, C3, S7); 
-					pIn+=4; pIn1+=4; pC0+=4; pC1+=4; pC2+=4; pC3+=4;
-	                        }
-				for (int f=4*(W_In1/4); f<W_In1; f++) {
-					int V0 = *pIn, V1 = *pIn1, C0 = *pC0, C1 = *pC1, C2 = *pC2, C3 = *pC3;
-					S0 += V0*C0; S4 += V1*C0;
-					S1 += V0*C1; S5 += V1*C1;
-					S2 += V0*C2; S6 += V1*C2;
-					S3 += V0*C3; S7 += V1*C3;
-					pIn++; pIn1++; pC0++; pC1++; pC2++; pC3++;
-				}
-				v4s R1 = gap_pack4(gap_clip(AT_SCALE(S0, pSc[4*Line  ], pScN[4*Line  ]), 7), gap_clip(AT_SCALE(S1, pSc[4*Line+1], pScN[4*Line+1]), 7),
-						   gap_clip(AT_SCALE(S2, pSc[4*Line+2], pScN[4*Line+2]), 7), gap_clip(AT_SCALE(S3, pSc[4*Line+3], pScN[4*Line+3]), 7));
-				v4s R2 = gap_pack4(gap_clip(AT_SCALE(S4, pSc[4*Line  ], pScN[4*Line  ]), 7), gap_clip(AT_SCALE(S5, pSc[4*Line+1], pScN[4*Line+1]), 7),
-						   gap_clip(AT_SCALE(S6, pSc[4*Line+2], pScN[4*Line+2]), 7), gap_clip(AT_SCALE(S7, pSc[4*Line+3], pScN[4*Line+3]), 7));
-				*((v4s *) (pOut0+4*Line)) = R1;
-				*((v4s *) (pOut1+4*Line)) = R2;
-	                }
-			for (int i=4*(IterOut/4); i<IterOut; i++) {
-				signed char *pIn = ColBuff;
-				signed char *pIn1 = ColBuff1;
-	                        int S0 = (pBias[i])<<NormBias, S4=S0;
-	                        for (int i=0; i<(W_In1/4); i++) {
-					v4s V0 = *((v4s *)pIn), V1 = *((v4s *)pIn1), C0 = *((v4s *)pC);
-	                               	S0 = gap_sumdotp4(V0, C0, S0); S4 = gap_sumdotp4(V1, C0, S4);
-					pIn+=4; pIn1+=4; pC+=4;
-				}
-				for (int f=4*(W_In1/4); f<W_In1; f++) {
-					int V0 = *pIn, V1 = *pIn1, C0 = *pC;
-					S0 += V0*C0; S4 += V1*C0;
-					pIn++; pIn1++; pC++;
-				}
-				*(pOut0+i) = gap_clip(AT_SCALE(S0, pSc[i], pScN[i]), 7);
-				*(pOut1+i) = gap_clip(AT_SCALE(S4, pSc[i], pScN[i]), 7);
-			}
-			gap_waitbarrier(0);
-		}
-		if (Wo&0x1) {
-			int c = Wo-1;
-		// for (int c=0; c<Wo; c++) {
-			for (int i=0; i<(Iter1/4); i++) ((int *)(ColBuff+F*FS))[i]=0;
-			if (Iter1&0x2) ((short int *)(ColBuff+F*FS))[Iter1/2-1]=0;
-			if (Iter1&0x1) ((signed char *)(ColBuff+F*FS))[Iter1-1]=0;
-			int Lb = Max(PosC, 0), Rb = Min(PosC+Fx, W);
-			int OffC = -Lb - Min(PosC, 0);
-                        if (Iter>=4) {
-                                for (int f=0; f<(Iter/4); f++)
-					for (int j=Tb; j<Db; j++)
-                                        	for (int i=Lb; i<Rb; i++)
-							((int *)(ColBuff+(j+OffL)*InFeat*Fx+(i+OffC)*InFeat+F))[f] = ((int *)(In+j*W*InFeat + i*InFeat+F))[f];
-                                if (Iter&0x2)
-					for (int j=Tb; j<Db; j++)
-						for (int i=Lb; i<Rb; i++)
-							((short int *)(ColBuff+(j+OffL)*InFeat*Fx+(i+OffC)*InFeat+F))[Iter/2-1] = ((short int *)(In+j*W*InFeat + i*InFeat+F))[Iter/2-1];
-                                if (Iter&0x1)
-					for (int j=Tb; j<Db; j++)
-						for (int i=Lb; i<Rb; i++)
-							((signed char *)(ColBuff+(j+OffL)*InFeat*Fx+(i+OffC)*InFeat+F))[Iter-1] = ((signed char *)(In+j*W*InFeat + i*InFeat+F))[Iter-1];
-                        } else if (Iter>=2) {
-                                if (Iter&0x2)
-					for (int j=Tb; j<Db; j++)
-						for (int i=Lb; i<Rb; i++)
-							((short int *)(ColBuff+(j+OffL)*InFeat*Fx+(i+OffC)*InFeat+F))[Iter/2-1] = ((short int *)(In+j*W*InFeat + i*InFeat+F))[Iter/2-1];
-                                if (Iter&0x1)
-					for (int j=Tb; j<Db; j++)
-						for (int i=Lb; i<Rb; i++)
-							((signed char *)(ColBuff+(j+OffL)*InFeat*Fx+(i+OffC)*InFeat+F))[Iter-1] = ((signed char *)(In+j*W*InFeat + i*InFeat+F))[Iter-1];
-                        } else if (Iter>0)
-				for (int j=Tb; j<Db; j++)
-					for (int i=Lb; i<Rb; i++)
-						ColBuff[(j+OffL)*InFeat*Fx+(i+OffC)*InFeat + F] = In[j*W*InFeat + i*InFeat + F];
-
-			PosC += Sx;
-			gap_waitbarrier(0);
-
-			int *pBias = Bias + First;
-			signed char *pC = Filter + W_In1*First;
-			signed char *pOut0 = Out+l*Wo*OutFeat + (c)*OutFeat+First;
-			unsigned char *pSc = Scale + First;
-			unsigned char *pScN = ScaleN + First;
-	                for (int Line=0; Line<IterOut/4; Line++) {
-				signed char *pC0 = pC, *pC1 = pC0+W_In1, *pC2 = pC1+W_In1, *pC3 = pC2+W_In1;
-				pC=pC3+W_In1;
-	                        int S0 = (*pBias)<<NormBias; pBias++;
-	                        int S1 = (*pBias)<<NormBias; pBias++;
-	                        int S2 = (*pBias)<<NormBias; pBias++;
-	                        int S3 = (*pBias)<<NormBias; pBias++;
-				signed char *pIn = ColBuff;
-	                        for (int i=0; i<(W_In1/4); i++) {
-					v4s V0 = *((v4s *)pIn), C0 = *((v4s *)pC0), C1 = *((v4s *)pC1), C2 = *((v4s *)pC2), C3 = *((v4s *)pC3);
-	                                S0 = gap_sumdotp4(V0, C0, S0);
-	                                S1 = gap_sumdotp4(V0, C1, S1);
-	                                S2 = gap_sumdotp4(V0, C2, S2);
-	                                S3 = gap_sumdotp4(V0, C3, S3);
-					pIn+=4; pC0+=4; pC1+=4; pC2+=4; pC3+=4;
-	                        }
-				for (int f=4*(W_In1/4); f<W_In1; f++) {
-					int V0 = *pIn, C0 = *pC0, C1 = *pC1, C2 = *pC2, C3 = *pC3;
-					S0 += V0*C0;
-					S1 += V0*C1;
-					S2 += V0*C2;
-					S3 += V0*C3;
-					pIn++; pC0++; pC1++; pC2++; pC3++;
-				}
-				v4s R1 = gap_pack4(gap_clip(AT_SCALE(S0, pSc[4*Line  ], pScN[4*Line  ]), 7), gap_clip(AT_SCALE(S1, pSc[4*Line+1], pScN[4*Line+1]), 7),
-						   gap_clip(AT_SCALE(S2, pSc[4*Line+2], pScN[4*Line+2]), 7), gap_clip(AT_SCALE(S3, pSc[4*Line+3], pScN[4*Line+3]), 7));
-				*((v4s *) (pOut0+4*Line)) = R1;
-	                }
-			for (int i=4*(IterOut/4); i<IterOut; i++) {
-				signed char *pIn = ColBuff;
-	                        int S0 = (*pBias)<<NormBias; pBias++;
-	                        for (int i=0; i<(W_In1/4); i++) {
-					v4s V0 = *((v4s *)pIn), C0 = *((v4s *)pC);
-	                               	S0 = gap_sumdotp4(V0, C0, S0);
-					pIn+=4; pC+=4;
-				}
-				for (int f=4*(W_In1/4); f<W_In1; f++) {
-					int V0 = *pIn, C0 = *pC;
-					S0 += V0*C0;
-					pIn++; pC++;
-				}
-				*(pOut0+i) = gap_clip(AT_SCALE(S0, pSc[i], pScN[i]), 7);
-			}
-			gap_waitbarrier(0);
-		}
-		PosL += Sy;
-	}
-}
-
-void KerPar_MM_Conv2D_ReLU_HWC_SQ8(
-	Ker_MM_Conv_SQ8_T *Arg
-	)
-
-{
-	/*
-		For HWC weights (4D Tensor) are expected to be organized as [OutFeat x Fy x Fx x InFeat]
-	*/
-	signed char *__restrict__ In = Arg->In;
-	int W = Arg->W, H = Arg->H;
-	signed char *__restrict__ Filter = Arg->Filter;
-	int Fx = Arg->Fx, Sx = Arg->Sx;
-	int Fy = Arg->Fy, Sy = Arg->Sy;
-	int PadL = Arg->Pad[0], PadT = Arg->Pad[2];
-	int InFeat = Arg->InFeat, OutFeat = Arg->OutFeat;
-        int * __restrict__ Bias = Arg->Bias;
-	int NormBias = Arg->Infos[AT_INF_BIASN];
-        signed char * __restrict__ Out = Arg->Out;
-        unsigned char * __restrict__ Scale = Arg->Scale;
-        unsigned char * __restrict__ ScaleN = Arg->ScaleN;
-        signed char * __restrict__ ColBuff = Arg->ColBuff;
-	int Wo = Arg->Wo, Ho = Arg->Ho;
-	int A0 = Arg->Infos[AT_INF_A0];
-
-	/* ColBuff must be large enough to accomodate Align(Fx*InFeat, 8) elements */
-	v4s * __restrict__ VBuff = (v4s *) ColBuff;
-	unsigned int W_In1 = InFeat*Fx*Fy;
-	unsigned int CoreId = gap_coreid(), C = ChunkSize(InFeat), F = Min(CoreId*C, InFeat), L = Min(InFeat, F+C);
-	unsigned int ChunkCell = ChunkSize(OutFeat), First = CoreId*ChunkCell, Last  = Min(OutFeat, First+ChunkCell);
-
-	int FS = Fx*Fy;
-	int Tail = 2*((W_In1+7)/8);
-
-	signed char * __restrict__ ColBuff1 = ColBuff + 4*Tail;
-	((int *)ColBuff)[Tail-1] = 0; ((int *)ColBuff)[Tail-2] = 0;
-	((int *)ColBuff1)[Tail-1] = 0; ((int *)ColBuff1)[Tail-2] = 0;
-	int PosL = Arg->FirstTile?(-PadT):0;
-
-	int Iter = L-F;
-	int Iter1 = Iter*FS;
-	int IterOut = Max(0, Last - First);
-	for (int l=0; l<Ho; l++) {
-		int PosC = -PadL;
-		int Tb = Max(PosL, 0), Db = Min(PosL+Fy, H);
-		int OffL = -Tb - Min(PosL, 0);
-		for (int c=0; c<(Wo/2); c++) {
-			for (int i=0; i<(Iter1/4); i++) {
-				((int *)(ColBuff+F*FS))[i]=0;
-				((int *)(ColBuff1+F*FS))[i]=0;
-			}
-			if (Iter1&0x2) {
-				((short int *)(ColBuff+F*FS))[Iter1/2-1]=0;
-				((short int *)(ColBuff1+F*FS))[Iter1/2-1]=0;
-			}
-			if (Iter1&0x1) {
-				((signed char *)(ColBuff+F*FS))[Iter1-1]=0;
-				((signed char *)(ColBuff1+F*FS))[Iter1-1]=0;
-			}
-			int Lb = Max(PosC, 0), Rb = Min(PosC+Fx, W);
-			int Lb1 = Max(PosC+Sx, 0), Rb1 = Min(PosC+Sx+Fx, W);
-			int OffC = -Lb - Min(PosC, 0);
-			int OffC1 = -Lb1 - Min(PosC+Sx, 0);
-                        if (Iter>=4) {
-                                for (int f=0; f<(Iter/4); f++)
-					for (int j=Tb; j<Db; j++) {
-                                        	for (int i=Lb; i<Rb; i++) {
-							((int *)(ColBuff+(j+OffL)*InFeat*Fx+(i+OffC)*InFeat+F))[f] = ((int *)(In+j*W*InFeat + i*InFeat+F))[f];
-                                        	}
-                                        	for (int i=Lb1; i<Rb1; i++) {
-							((int *)(ColBuff1+(j+OffL)*InFeat*Fx+(i+OffC1)*InFeat+F))[f] = ((int *)(In+j*W*InFeat + i*InFeat+F))[f];
-                                        	}
-					}
-                                if (Iter&0x2)
-					for (int j=Tb; j<Db; j++) {
-						for (int i=Lb; i<Rb; i++)
-							((short int *)(ColBuff+(j+OffL)*InFeat*Fx+(i+OffC)*InFeat+F))[Iter/2-1] = ((short int *)(In+j*W*InFeat + i*InFeat+F))[Iter/2-1];
-						for (int i=Lb1; i<Rb1; i++)
-							((short int *)(ColBuff1+(j+OffL)*InFeat*Fx+(i+OffC1)*InFeat+F))[Iter/2-1] = ((short int *)(In+j*W*InFeat + i*InFeat+F))[Iter/2-1];
-					}
-                                if (Iter&0x1)
-					for (int j=Tb; j<Db; j++) {
-						for (int i=Lb; i<Rb; i++)
-							((signed char *)(ColBuff+(j+OffL)*InFeat*Fx+(i+OffC)*InFeat+F))[Iter-1] = ((signed char *)(In+j*W*InFeat + i*InFeat+F))[Iter-1];
-						for (int i=Lb1; i<Rb1; i++)
-							((signed char *)(ColBuff1+(j+OffL)*InFeat*Fx+(i+OffC1)*InFeat+F))[Iter-1] = ((signed char *)(In+j*W*InFeat + i*InFeat+F))[Iter-1];
-					}
-                        } else if (Iter>=2) {
-                                if (Iter&0x2)
-					for (int j=Tb; j<Db; j++) {
-						for (int i=Lb; i<Rb; i++)
-							((short int *)(ColBuff+(j+OffL)*InFeat*Fx+(i+OffC)*InFeat+F))[Iter/2-1] = ((short int *)(In+j*W*InFeat + i*InFeat+F))[Iter/2-1];
-						for (int i=Lb1; i<Rb1; i++)
-							((short int *)(ColBuff1+(j+OffL)*InFeat*Fx+(i+OffC1)*InFeat+F))[Iter/2-1] = ((short int *)(In+j*W*InFeat + i*InFeat+F))[Iter/2-1];
-					}
-                                if (Iter&0x1)
-					for (int j=Tb; j<Db; j++) {
-						for (int i=Lb; i<Rb; i++)
-							((signed char *)(ColBuff+(j+OffL)*InFeat*Fx+(i+OffC)*InFeat+F))[Iter-1] = ((signed char *)(In+j*W*InFeat + i*InFeat+F))[Iter-1];
-						for (int i=Lb1; i<Rb1; i++)
-							((signed char *)(ColBuff1+(j+OffL)*InFeat*Fx+(i+OffC1)*InFeat+F))[Iter-1] = ((signed char *)(In+j*W*InFeat + i*InFeat+F))[Iter-1];
-					}
-                        } else if (Iter>0) {
-				for (int j=Tb; j<Db; j++) {
-					for (int i=Lb; i<Rb; i++)
-						ColBuff[(j+OffL)*InFeat*Fx+(i+OffC)*InFeat + F] = In[j*W*InFeat + i*InFeat + F];
-					for (int i=Lb1; i<Rb1; i++)
-						ColBuff1[(j+OffL)*InFeat*Fx+(i+OffC1)*InFeat + F] = In[j*W*InFeat + i*InFeat + F];
-				}
-			}
-			PosC += 2*Sx;
-			gap_waitbarrier(0);
-
-			int *pBias = Bias + First;
-			signed char *pC = Filter + W_In1*First;
-			signed char *pOut0 = Out+l*Wo*OutFeat + (2*c+0)*OutFeat+First;
-			signed char *pOut1 = Out+l*Wo*OutFeat + (2*c+1)*OutFeat+First;
-			unsigned char *pSc = Scale + First;
-			unsigned char *pScN = ScaleN + First;
-	                for (int Line=0; Line<IterOut/4; Line++) {
-				signed char *pC0 = pC, *pC1 = pC0+W_In1, *pC2 = pC1+W_In1, *pC3 = pC2+W_In1;
-				pC=pC3+W_In1;
-	                        int S0 = (pBias[4*Line  ])<<NormBias, S4=S0;
-	                        int S1 = (pBias[4*Line+1])<<NormBias, S5=S1;
-	                        int S2 = (pBias[4*Line+2])<<NormBias, S6=S2;
-	                        int S3 = (pBias[4*Line+3])<<NormBias, S7=S3;
-				signed char *pIn = ColBuff;
-				signed char *pIn1 = ColBuff1;
-	                        for (int i=0; i<(W_In1/4); i++) {
-					v4s V0 = *((v4s *)pIn), V1 = *((v4s *)pIn1), C0 = *((v4s *)pC0), C1 = *((v4s *)pC1), C2 = *((v4s *)pC2), C3 = *((v4s *)pC3);
-	                                S0 = gap_sumdotp4(V0, C0, S0); S4 = gap_sumdotp4(V1, C0, S4); 
-	                                S1 = gap_sumdotp4(V0, C1, S1); S5 = gap_sumdotp4(V1, C1, S5); 
-	                                S2 = gap_sumdotp4(V0, C2, S2); S6 = gap_sumdotp4(V1, C2, S6); 
-	                                S3 = gap_sumdotp4(V0, C3, S3); S7 = gap_sumdotp4(V1, C3, S7); 
-					pIn+=4; pIn1+=4; pC0+=4; pC1+=4; pC2+=4; pC3+=4;
-	                        }
-				for (int f=4*(W_In1/4); f<W_In1; f++) {
-					int V0 = *pIn, V1 = *pIn1, C0 = *pC0, C1 = *pC1, C2 = *pC2, C3 = *pC3;
-					S0 += V0*C0; S4 += V1*C0;
-					S1 += V0*C1; S5 += V1*C1;
-					S2 += V0*C2; S6 += V1*C2;
-					S3 += V0*C3; S7 += V1*C3;
-					pIn++; pIn1++; pC0++; pC1++; pC2++; pC3++;
-				}
-				v4s R1 = gap_pack4(AT_CLIP_POS_IMM(AT_SCALE(S0, pSc[4*Line  ], pScN[4*Line  ]), 7), AT_CLIP_POS_IMM(AT_SCALE(S1, pSc[4*Line+1], pScN[4*Line+1]), 7),
-						   AT_CLIP_POS_IMM(AT_SCALE(S2, pSc[4*Line+2], pScN[4*Line+2]), 7), AT_CLIP_POS_IMM(AT_SCALE(S3, pSc[4*Line+3], pScN[4*Line+3]), 7));
-				v4s R2 = gap_pack4(AT_CLIP_POS_IMM(AT_SCALE(S4, pSc[4*Line  ], pScN[4*Line  ]), 7), AT_CLIP_POS_IMM(AT_SCALE(S5, pSc[4*Line+1], pScN[4*Line+1]), 7),
-						   AT_CLIP_POS_IMM(AT_SCALE(S6, pSc[4*Line+2], pScN[4*Line+2]), 7), AT_CLIP_POS_IMM(AT_SCALE(S7, pSc[4*Line+3], pScN[4*Line+3]), 7));
-				*((v4s *) (pOut0+4*Line)) = R1;
-				*((v4s *) (pOut1+4*Line)) = R2;
-	                }
-			for (int i=4*(IterOut/4); i<IterOut; i++) {
-				signed char *pIn = ColBuff;
-				signed char *pIn1 = ColBuff1;
-	                        int S0 = (pBias[i])<<NormBias, S4=S0;
-	                        for (int i=0; i<(W_In1/4); i++) {
-					v4s V0 = *((v4s *)pIn), V1 = *((v4s *)pIn1), C0 = *((v4s *)pC);
-	                               	S0 = gap_sumdotp4(V0, C0, S0); S4 = gap_sumdotp4(V1, C0, S4);
-					pIn+=4; pIn1+=4; pC+=4;
-				}
-				for (int f=4*(W_In1/4); f<W_In1; f++) {
-					int V0 = *pIn, V1 = *pIn1, C0 = *pC;
-					S0 += V0*C0; S4 += V1*C0;
-					pIn++; pIn1++; pC++;
-				}
-				*(pOut0+i) = AT_CLIP_POS_IMM(AT_SCALE(S0, pSc[i], pScN[i]), 7);
-				*(pOut1+i) = AT_CLIP_POS_IMM(AT_SCALE(S4, pSc[i], pScN[i]), 7);
-			}
-			gap_waitbarrier(0);
-		}
-		if (Wo&0x1) {
-			int c = Wo-1;
-		// for (int c=0; c<Wo; c++) {
-			for (int i=0; i<(Iter1/4); i++) ((int *)(ColBuff+F*FS))[i]=0;
-			if (Iter1&0x2) ((short int *)(ColBuff+F*FS))[Iter1/2-1]=0;
-			if (Iter1&0x1) ((signed char *)(ColBuff+F*FS))[Iter1-1]=0;
-			int Lb = Max(PosC, 0), Rb = Min(PosC+Fx, W);
-			int OffC = -Lb - Min(PosC, 0);
-                        if (Iter>=4) {
-                                for (int f=0; f<(Iter/4); f++)
-					for (int j=Tb; j<Db; j++)
-                                        	for (int i=Lb; i<Rb; i++)
-							((int *)(ColBuff+(j+OffL)*InFeat*Fx+(i+OffC)*InFeat+F))[f] = ((int *)(In+j*W*InFeat + i*InFeat+F))[f];
-                                if (Iter&0x2)
-					for (int j=Tb; j<Db; j++)
-						for (int i=Lb; i<Rb; i++)
-							((short int *)(ColBuff+(j+OffL)*InFeat*Fx+(i+OffC)*InFeat+F))[Iter/2-1] = ((short int *)(In+j*W*InFeat + i*InFeat+F))[Iter/2-1];
-                                if (Iter&0x1)
-					for (int j=Tb; j<Db; j++)
-						for (int i=Lb; i<Rb; i++)
-							((signed char *)(ColBuff+(j+OffL)*InFeat*Fx+(i+OffC)*InFeat+F))[Iter-1] = ((signed char *)(In+j*W*InFeat + i*InFeat+F))[Iter-1];
-                        } else if (Iter>=2) {
-                                if (Iter&0x2)
-					for (int j=Tb; j<Db; j++)
-						for (int i=Lb; i<Rb; i++)
-							((short int *)(ColBuff+(j+OffL)*InFeat*Fx+(i+OffC)*InFeat+F))[Iter/2-1] = ((short int *)(In+j*W*InFeat + i*InFeat+F))[Iter/2-1];
-                                if (Iter&0x1)
-					for (int j=Tb; j<Db; j++)
-						for (int i=Lb; i<Rb; i++)
-							((signed char *)(ColBuff+(j+OffL)*InFeat*Fx+(i+OffC)*InFeat+F))[Iter-1] = ((signed char *)(In+j*W*InFeat + i*InFeat+F))[Iter-1];
-                        } else if (Iter>0)
-				for (int j=Tb; j<Db; j++)
-					for (int i=Lb; i<Rb; i++)
-						ColBuff[(j+OffL)*InFeat*Fx+(i+OffC)*InFeat + F] = In[j*W*InFeat + i*InFeat + F];
-
-			PosC += Sx;
-			gap_waitbarrier(0);
-
-			int *pBias = Bias + First;
-			signed char *pC = Filter + W_In1*First;
-			signed char *pOut0 = Out+l*Wo*OutFeat + (c)*OutFeat+First;
-			unsigned char *pSc = Scale + First;
-			unsigned char *pScN = ScaleN + First;
-	                for (int Line=0; Line<IterOut/4; Line++) {
-				signed char *pC0 = pC, *pC1 = pC0+W_In1, *pC2 = pC1+W_In1, *pC3 = pC2+W_In1;
-				pC=pC3+W_In1;
-	                        int S0 = (*pBias)<<NormBias; pBias++;
-	                        int S1 = (*pBias)<<NormBias; pBias++;
-	                        int S2 = (*pBias)<<NormBias; pBias++;
-	                        int S3 = (*pBias)<<NormBias; pBias++;
-				signed char *pIn = ColBuff;
-	                        for (int i=0; i<(W_In1/4); i++) {
-					v4s V0 = *((v4s *)pIn), C0 = *((v4s *)pC0), C1 = *((v4s *)pC1), C2 = *((v4s *)pC2), C3 = *((v4s *)pC3);
-	                                S0 = gap_sumdotp4(V0, C0, S0);
-	                                S1 = gap_sumdotp4(V0, C1, S1);
-	                                S2 = gap_sumdotp4(V0, C2, S2);
-	                                S3 = gap_sumdotp4(V0, C3, S3);
-					pIn+=4; pC0+=4; pC1+=4; pC2+=4; pC3+=4;
-	                        }
-				for (int f=4*(W_In1/4); f<W_In1; f++) {
-					int V0 = *pIn, C0 = *pC0, C1 = *pC1, C2 = *pC2, C3 = *pC3;
-					S0 += V0*C0;
-					S1 += V0*C1;
-					S2 += V0*C2;
-					S3 += V0*C3;
-					pIn++; pC0++; pC1++; pC2++; pC3++;
-				}
-				v4s R1 = gap_pack4(AT_CLIP_POS_IMM(AT_SCALE(S0, pSc[4*Line  ], pScN[4*Line  ]), 7), AT_CLIP_POS_IMM(AT_SCALE(S1, pSc[4*Line+1], pScN[4*Line+1]), 7),
-						   AT_CLIP_POS_IMM(AT_SCALE(S2, pSc[4*Line+2], pScN[4*Line+2]), 7), AT_CLIP_POS_IMM(AT_SCALE(S3, pSc[4*Line+3], pScN[4*Line+3]), 7));
-				*((v4s *) (pOut0+4*Line)) = R1;
-	                }
-			for (int i=4*(IterOut/4); i<IterOut; i++) {
-				signed char *pIn = ColBuff;
-	                        int S0 = (*pBias)<<NormBias; pBias++;
-	                        for (int i=0; i<(W_In1/4); i++) {
-					v4s V0 = *((v4s *)pIn), C0 = *((v4s *)pC);
-	                               	S0 = gap_sumdotp4(V0, C0, S0);
-					pIn+=4; pC+=4;
-				}
-				for (int f=4*(W_In1/4); f<W_In1; f++) {
-					int V0 = *pIn, C0 = *pC;
-					S0 += V0*C0;
-					pIn++; pC++;
-				}
-				*(pOut0+i) = AT_CLIP_POS_IMM(AT_SCALE(S0, pSc[i], pScN[i]), 7);
-			}
-			gap_waitbarrier(0);
-		}
-		PosL += Sy;
-	}
-}
-
-void Ker_MM_Conv2D_HWC_SQ8(
-	Ker_MM_Conv_SQ8_T *Arg
-	)
-
-{
-	/*
-		For HWC weights (4D Tensor) are expected to be organized as [OutFeat x Fy x Fx x InFeat]
-	*/
-	signed char *__restrict__ In = Arg->In;
-	int W = Arg->W, H = Arg->H;
-	signed char *__restrict__ Filter = Arg->Filter;
-	int Fx = Arg->Fx, Sx = Arg->Sx;
-	int Fy = Arg->Fy, Sy = Arg->Sy;
-	int PadL = Arg->Pad[0], PadT = Arg->Pad[2];
-	int InFeat = Arg->InFeat, OutFeat = Arg->OutFeat;
-        int * __restrict__ Bias = Arg->Bias;
-	int NormBias = Arg->Infos[AT_INF_BIASN];
-        signed char * __restrict__ Out = Arg->Out;
-        unsigned char * __restrict__ Scale = Arg->Scale;
-        unsigned char * __restrict__ ScaleN = Arg->ScaleN;
-        signed char * __restrict__ ColBuff = Arg->ColBuff;
-        signed char * __restrict__ ColBuff1;
-	int Wo = Arg->Wo, Ho = Arg->Ho;
-	int A0 = Arg->Infos[AT_INF_A0];
-
-
-	unsigned int W_In1 = InFeat*Fx*Fy;
-	unsigned int CoreId = gap_coreid(), ChunkCell = ChunkSize(Wo), First = CoreId*ChunkCell, Last  = Min(Wo, First+ChunkCell);
+	unsigned int CoreId = gap_coreid(), C = ChunkSize(InFeat), F = Min(CoreId*C, InFeat), L = Min(InFeat, F+C);
+	unsigned int ChunkCell = ChunkSize(OutFeat), First = CoreId*ChunkCell, Last  = Min(OutFeat, First+ChunkCell);
 
 	int FS = Fx*Fy;
+	int Tail = 2*((W_In1+7)/8);
+
+	signed char * __restrict__ ColBuff1 = ColBuff + 4*Tail;
+	((int *)ColBuff)[Tail-1] = 0; ((int *)ColBuff)[Tail-2] = 0;
+	((int *)ColBuff1)[Tail-1] = 0; ((int *)ColBuff1)[Tail-2] = 0;
 	int PosL = Arg->FirstTile?(-PadT):0;
 
-	int Iter = InFeat;
+	int Iter = L-F;
 	int Iter1 = Iter*FS;
-	int IterOut = OutFeat;
-	int IterW = Max(0, Last-First);
-	ColBuff += 2*CoreId*InFeat*FS;
-	ColBuff1 = ColBuff + InFeat*FS;
+	int IterOut = Max(0, Last - First);
 	for (int l=0; l<Ho; l++) {
-		int PosC = -PadL + First*Sx;
+		int PosC = -PadL;
 		int Tb = Max(PosL, 0), Db = Min(PosL+Fy, H);
 		int OffL = -Tb - Min(PosL, 0);
-		// for (int c=0; c<Wo; c++) {
-		// for (int c=First; c<Last; c++) {
-		for (int c=0; c<IterW/2; c++) {
+		for (int c=0; c<(Wo/2); c++) {
 			for (int i=0; i<(Iter1/4); i++) {
-				((int *)(ColBuff))[i]=0;
-				((int *)(ColBuff1))[i]=0;
+				((int *)(ColBuff+F*FS))[i]=0;
+				((int *)(ColBuff1+F*FS))[i]=0;
 			}
 			if (Iter1&0x2) {
-				((short int *)(ColBuff))[Iter1/2-1]=0;
-				((short int *)(ColBuff1))[Iter1/2-1]=0;
+				((short int *)(ColBuff+F*FS))[Iter1/2-1]=0;
+				((short int *)(ColBuff1+F*FS))[Iter1/2-1]=0;
 			}
 			if (Iter1&0x1) {
-				((signed char *)(ColBuff))[Iter1-1]=0;
-				((signed char *)(ColBuff1))[Iter1-1]=0;
+				((signed char *)(ColBuff+F*FS))[Iter1-1]=0;
+				((signed char *)(ColBuff1+F*FS))[Iter1-1]=0;
 			}
 			int Lb = Max(PosC, 0), Rb = Min(PosC+Fx, W);
-			int Lb1 = Max(PosC+Sx, 0), Rb1 = Min(PosC+Fx+Sx, W);
+			int Lb1 = Max(PosC+Sx, 0), Rb1 = Min(PosC+Sx+Fx, W);
 			int OffC = -Lb - Min(PosC, 0);
 			int OffC1 = -Lb1 - Min(PosC+Sx, 0);
                         if (Iter>=4) {
                                 for (int f=0; f<(Iter/4); f++)
 					for (int j=Tb; j<Db; j++) {
-                                        	for (int i=Lb; i<Rb; i++)
-							((int *)(ColBuff+(j+OffL)*InFeat*Fx+(i+OffC)*InFeat))[f] = ((int *)(In+j*W*InFeat + i*InFeat))[f];
-                                        	for (int i=Lb1; i<Rb1; i++)
-							((int *)(ColBuff1+(j+OffL)*InFeat*Fx+(i+OffC1)*InFeat))[f] = ((int *)(In+j*W*InFeat + i*InFeat))[f];
+                                        	for (int i=Lb; i<Rb; i++) {
+							((int *)(ColBuff+(j+OffL)*InFeat*Fx+(i+OffC)*InFeat+F))[f] = ((int *)(In+j*W*InFeat + i*InFeat+F))[f];
+                                        	}
+                                        	for (int i=Lb1; i<Rb1; i++) {
+							((int *)(ColBuff1+(j+OffL)*InFeat*Fx+(i+OffC1)*InFeat+F))[f] = ((int *)(In+j*W*InFeat + i*InFeat+F))[f];
+                                        	}
 					}
                                 if (Iter&0x2)
 					for (int j=Tb; j<Db; j++) {
 						for (int i=Lb; i<Rb; i++)
-							((short int *)(ColBuff+(j+OffL)*InFeat*Fx+(i+OffC)*InFeat))[Iter/2-1] = ((short int *)(In+j*W*InFeat + i*InFeat))[Iter/2-1];
+							((short int *)(ColBuff+(j+OffL)*InFeat*Fx+(i+OffC)*InFeat+F))[Iter/2-1] = ((short int *)(In+j*W*InFeat + i*InFeat+F))[Iter/2-1];
 						for (int i=Lb1; i<Rb1; i++)
-							((short int *)(ColBuff1+(j+OffL)*InFeat*Fx+(i+OffC1)*InFeat))[Iter/2-1] = ((short int *)(In+j*W*InFeat + i*InFeat))[Iter/2-1];
+							((short int *)(ColBuff1+(j+OffL)*InFeat*Fx+(i+OffC1)*InFeat+F))[Iter/2-1] = ((short int *)(In+j*W*InFeat + i*InFeat+F))[Iter/2-1];
 					}
                                 if (Iter&0x1)
 					for (int j=Tb; j<Db; j++) {
 						for (int i=Lb; i<Rb; i++)
-							((signed char *)(ColBuff+(j+OffL)*InFeat*Fx+(i+OffC)*InFeat))[Iter-1] = ((signed char *)(In+j*W*InFeat + i*InFeat))[Iter-1];
+							((signed char *)(ColBuff+(j+OffL)*InFeat*Fx+(i+OffC)*InFeat+F))[Iter-1] = ((signed char *)(In+j*W*InFeat + i*InFeat+F))[Iter-1];
 						for (int i=Lb1; i<Rb1; i++)
-							((signed char *)(ColBuff1+(j+OffL)*InFeat*Fx+(i+OffC1)*InFeat))[Iter-1] = ((signed char *)(In+j*W*InFeat + i*InFeat))[Iter-1];
+							((signed char *)(ColBuff1+(j+OffL)*InFeat*Fx+(i+OffC1)*InFeat+F))[Iter-1] = ((signed char *)(In+j*W*InFeat + i*InFeat+F))[Iter-1];
 					}
                         } else if (Iter>=2) {
                                 if (Iter&0x2)
 					for (int j=Tb; j<Db; j++) {
 						for (int i=Lb; i<Rb; i++)
-							((short int *)(ColBuff+(j+OffL)*InFeat*Fx+(i+OffC)*InFeat))[Iter/2-1] = ((short int *)(In+j*W*InFeat + i*InFeat))[Iter/2-1];
+							((short int *)(ColBuff+(j+OffL)*InFeat*Fx+(i+OffC)*InFeat+F))[Iter/2-1] = ((short int *)(In+j*W*InFeat + i*InFeat+F))[Iter/2-1];
 						for (int i=Lb1; i<Rb1; i++)
-							((short int *)(ColBuff1+(j+OffL)*InFeat*Fx+(i+OffC1)*InFeat))[Iter/2-1] = ((short int *)(In+j*W*InFeat + i*InFeat))[Iter/2-1];
+							((short int *)(ColBuff1+(j+OffL)*InFeat*Fx+(i+OffC1)*InFeat+F))[Iter/2-1] = ((short int *)(In+j*W*InFeat + i*InFeat+F))[Iter/2-1];
 					}
                                 if (Iter&0x1)
 					for (int j=Tb; j<Db; j++) {
 						for (int i=Lb; i<Rb; i++)
-							((signed char *)(ColBuff+(j+OffL)*InFeat*Fx+(i+OffC)*InFeat))[Iter-1] = ((signed char *)(In+j*W*InFeat + i*InFeat))[Iter-1];
+							((signed char *)(ColBuff+(j+OffL)*InFeat*Fx+(i+OffC)*InFeat+F))[Iter-1] = ((signed char *)(In+j*W*InFeat + i*InFeat+F))[Iter-1];
 						for (int i=Lb1; i<Rb1; i++)
-							((signed char *)(ColBuff1+(j+OffL)*InFeat*Fx+(i+OffC1)*InFeat))[Iter-1] = ((signed char *)(In+j*W*InFeat + i*InFeat))[Iter-1];
+							((signed char *)(ColBuff1+(j+OffL)*InFeat*Fx+(i+OffC1)*InFeat+F))[Iter-1] = ((signed char *)(In+j*W*InFeat + i*InFeat+F))[Iter-1];
 					}
                         } else if (Iter>0) {
 				for (int j=Tb; j<Db; j++) {
 					for (int i=Lb; i<Rb; i++)
-						ColBuff[(j+OffL)*InFeat*Fx+(i+OffC)*InFeat] = In[j*W*InFeat + i*InFeat];
+						ColBuff[(j+OffL)*InFeat*Fx+(i+OffC)*InFeat + F] = In[j*W*InFeat + i*InFeat + F];
 					for (int i=Lb1; i<Rb1; i++)
-						ColBuff1[(j+OffL)*InFeat*Fx+(i+OffC1)*InFeat] = In[j*W*InFeat + i*InFeat];
+						ColBuff1[(j+OffL)*InFeat*Fx+(i+OffC1)*InFeat + F] = In[j*W*InFeat + i*InFeat + F];
 				}
 			}
-
 			PosC += 2*Sx;
-			int *pBias = Bias;
-			signed char *pC = Filter;
-			signed char *pOut0 = Out+l*Wo*OutFeat + (First + 2*c)*OutFeat;
-			signed char *pOut1 = Out+l*Wo*OutFeat + (First + 2*c+1)*OutFeat;
-			unsigned char *pSc = Scale;
-			unsigned char *pScN = ScaleN;
+			gap_waitbarrier(0);
+
+			int *pBias = Bias + First;
+			signed char *pC = Filter + W_In1*First;
+			signed char *pOut0 = Out+l*Wo*OutFeat + (2*c+0)*OutFeat+First;
+			signed char *pOut1 = Out+l*Wo*OutFeat + (2*c+1)*OutFeat+First;
+			unsigned char *pSc = Scale + First;
+			unsigned char *pScN = ScaleN + First;
 	                for (int Line=0; Line<IterOut/4; Line++) {
 				signed char *pC0 = pC, *pC1 = pC0+W_In1, *pC2 = pC1+W_In1, *pC3 = pC2+W_In1;
 				pC=pC3+W_In1;
-	                        int S00 = (*pBias)<<NormBias, S01 = S00; pBias++;
-	                        int S10 = (*pBias)<<NormBias, S11 = S10; pBias++;
-	                        int S20 = (*pBias)<<NormBias, S21 = S20; pBias++;
-	                        int S30 = (*pBias)<<NormBias, S31 = S30; pBias++;
-				signed char *pIn0 = ColBuff, *pIn1 = ColBuff1;
+	                        int S0 = (pBias[4*Line  ])<<NormBias, S4=S0;
+	                        int S1 = (pBias[4*Line+1])<<NormBias, S5=S1;
+	                        int S2 = (pBias[4*Line+2])<<NormBias, S6=S2;
+	                        int S3 = (pBias[4*Line+3])<<NormBias, S7=S3;
+				signed char *pIn = ColBuff;
+				signed char *pIn1 = ColBuff1;
 	                        for (int i=0; i<(W_In1/4); i++) {
-					v4s V0 = *((v4s *)pIn0), V1 = *((v4s *)pIn1), C0 = *((v4s *)pC0), C1 = *((v4s *)pC1), C2 = *((v4s *)pC2), C3 = *((v4s *)pC3);
-	                                S00 = gap_sumdotp4(V0, C0, S00); S01 = gap_sumdotp4(V1, C0, S01);
-	                                S10 = gap_sumdotp4(V0, C1, S10); S11 = gap_sumdotp4(V1, C1, S11);
-	                                S20 = gap_sumdotp4(V0, C2, S20); S21 = gap_sumdotp4(V1, C2, S21);
-	                                S30 = gap_sumdotp4(V0, C3, S30); S31 = gap_sumdotp4(V1, C3, S31);
-					pIn0+=4; pIn1+=4; pC0+=4; pC1+=4; pC2+=4; pC3+=4;
+					v4s V0 = *((v4s *)pIn), V1 = *((v4s *)pIn1), C0 = *((v4s *)pC0), C1 = *((v4s *)pC1), C2 = *((v4s *)pC2), C3 = *((v4s *)pC3);
+	                                S0 = gap_sumdotp4(V0, C0, S0); S4 = gap_sumdotp4(V1, C0, S4); 
+	                                S1 = gap_sumdotp4(V0, C1, S1); S5 = gap_sumdotp4(V1, C1, S5); 
+	                                S2 = gap_sumdotp4(V0, C2, S2); S6 = gap_sumdotp4(V1, C2, S6); 
+	                                S3 = gap_sumdotp4(V0, C3, S3); S7 = gap_sumdotp4(V1, C3, S7); 
+					pIn+=4; pIn1+=4; pC0+=4; pC1+=4; pC2+=4; pC3+=4;
 	                        }
 				for (int f=4*(W_In1/4); f<W_In1; f++) {
-					int V0 = *pIn0, V1 = *pIn1, C0 = *pC0, C1 = *pC1, C2 = *pC2, C3 = *pC3;
-					S00 += V0*C0; S01 += V1*C0;
-					S10 += V0*C1; S11 += V1*C1;
-					S20 += V0*C2; S21 += V1*C2;
-					S30 += V0*C3; S31 += V1*C3;
-					pIn0++; pIn1++; pC0++; pC1++; pC2++; pC3++;
+					int V0 = *pIn, V1 = *pIn1, C0 = *pC0, C1 = *pC1, C2 = *pC2, C3 = *pC3;
+					S0 += V0*C0; S4 += V1*C0;
+					S1 += V0*C1; S5 += V1*C1;
+					S2 += V0*C2; S6 += V1*C2;
+					S3 += V0*C3; S7 += V1*C3;
+					pIn++; pIn1++; pC0++; pC1++; pC2++; pC3++;
 				}
-	                        unsigned int Sc, ScN;
-				Sc = *pSc; ScN = *pScN; pSc++; pScN++;
-				*pOut0 = gap_clip(AT_SCALE(S00, Sc, ScN), 7); pOut0++;
-				*pOut1 = gap_clip(AT_SCALE(S01, Sc, ScN), 7); pOut1++;
-				Sc = *pSc; ScN = *pScN; pSc++; pScN++;
-				*pOut0 = gap_clip(AT_SCALE(S10, Sc, ScN), 7); pOut0++;
-				*pOut1 = gap_clip(AT_SCALE(S11, Sc, ScN), 7); pOut1++;
-				Sc = *pSc; ScN = *pScN; pSc++; pScN++;
-				*pOut0 = gap_clip(AT_SCALE(S20, Sc, ScN), 7); pOut0++;
-				*pOut1 = gap_clip(AT_SCALE(S21, Sc, ScN), 7); pOut1++;
-				Sc = *pSc; ScN = *pScN; pSc++; pScN++;
-				*pOut0 = gap_clip(AT_SCALE(S30, Sc, ScN), 7); pOut0++;
-				*pOut1 = gap_clip(AT_SCALE(S31, Sc, ScN), 7); pOut1++;
+				S0 = AT_SCALE(S0, pSc[4*Line  ], pScN[4*Line  ]); ACT_SWITCH(S0, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+				S1 = AT_SCALE(S1, pSc[4*Line+1], pScN[4*Line+1]); ACT_SWITCH(S1, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+				S2 = AT_SCALE(S2, pSc[4*Line+2], pScN[4*Line+2]); ACT_SWITCH(S2, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+				S3 = AT_SCALE(S3, pSc[4*Line+3], pScN[4*Line+3]); ACT_SWITCH(S3, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+				S4 = AT_SCALE(S4, pSc[4*Line  ], pScN[4*Line  ]); ACT_SWITCH(S4, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+				S5 = AT_SCALE(S5, pSc[4*Line+1], pScN[4*Line+1]); ACT_SWITCH(S5, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+				S6 = AT_SCALE(S6, pSc[4*Line+2], pScN[4*Line+2]); ACT_SWITCH(S6, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+				S7 = AT_SCALE(S7, pSc[4*Line+3], pScN[4*Line+3]); ACT_SWITCH(S7, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+				v4s R1 = gap_pack4(gap_clip(S0, 7), gap_clip(S1, 7), gap_clip(S2, 7), gap_clip(S3, 7));
+				v4s R2 = gap_pack4(gap_clip(S4, 7), gap_clip(S5, 7), gap_clip(S6, 7), gap_clip(S7, 7));
+				*((v4s *) (pOut0+4*Line)) = R1;
+				*((v4s *) (pOut1+4*Line)) = R2;
 	                }
-			for (int Line=4*(IterOut/4); Line<IterOut; Line++) {
-				signed char *pIn0 = ColBuff, *pIn1 = ColBuff1;
-	                        int S00 = (*pBias)<<NormBias, S01 = S00; pBias++;
+			for (int i=4*(IterOut/4); i<IterOut; i++) {
+				signed char *pIn = ColBuff;
+				signed char *pIn1 = ColBuff1;
+	                        int S0 = (pBias[i])<<NormBias, S4=S0;
 	                        for (int i=0; i<(W_In1/4); i++) {
-					v4s V0 = *((v4s *)pIn0), V1 = *((v4s *)pIn1), C0 = *((v4s *)pC);
-	                               	S00 = gap_sumdotp4(V0, C0, S00); S01 = gap_sumdotp4(V1, C0, S01);
-					pIn0+=4; pIn1+=4; pC+=4;
+					v4s V0 = *((v4s *)pIn), V1 = *((v4s *)pIn1), C0 = *((v4s *)pC);
+	                               	S0 = gap_sumdotp4(V0, C0, S0); S4 = gap_sumdotp4(V1, C0, S4);
+					pIn+=4; pIn1+=4; pC+=4;
 				}
 				for (int f=4*(W_In1/4); f<W_In1; f++) {
-					int V0 = *pIn0, V1 = *pIn1, C0 = *pC;
-					S00 += V0*C0; S01 += V1*C0;
-					pIn0++; pIn1++; pC++;
+					int V0 = *pIn, V1 = *pIn1, C0 = *pC;
+					S0 += V0*C0; S4 += V1*C0;
+					pIn++; pIn1++; pC++;
 				}
-	                        unsigned int Sc, ScN;
-				Sc = *pSc; ScN = *pScN; pSc++; pScN++;
-				*pOut0 = gap_clip(AT_SCALE(S00, Sc, ScN), 7); pOut0++;
-				*pOut1 = gap_clip(AT_SCALE(S01, Sc, ScN), 7); pOut1++;
+				S0 = AT_SCALE(S0, pSc[i], pScN[i]); ACT_SWITCH(S0, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+				S4 = AT_SCALE(S4, pSc[i], pScN[i]); ACT_SWITCH(S4, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+				*(pOut0+i) = gap_clip(S0, 7);
+				*(pOut1+i) = gap_clip(S4, 7);
 			}
+			gap_waitbarrier(0);
 		}
-		if (IterW&0x1) {
-			for (int i=0; i<(Iter1/4); i++) {
-				((int *)(ColBuff))[i]=0;
-			}
-			if (Iter1&0x2) {
-				((short int *)(ColBuff))[Iter1/2-1]=0;
-			}
-			if (Iter1&0x1) {
-				((signed char *)(ColBuff))[Iter1-1]=0;
-			}
+		if (Wo&0x1) {
+			int c = Wo-1;
+		// for (int c=0; c<Wo; c++) {
+			for (int i=0; i<(Iter1/4); i++) ((int *)(ColBuff+F*FS))[i]=0;
+			if (Iter1&0x2) ((short int *)(ColBuff+F*FS))[Iter1/2-1]=0;
+			if (Iter1&0x1) ((signed char *)(ColBuff+F*FS))[Iter1-1]=0;
 			int Lb = Max(PosC, 0), Rb = Min(PosC+Fx, W);
 			int OffC = -Lb - Min(PosC, 0);
                         if (Iter>=4) {
                                 for (int f=0; f<(Iter/4); f++)
-					for (int j=Tb; j<Db; j++) {
+					for (int j=Tb; j<Db; j++)
                                         	for (int i=Lb; i<Rb; i++)
-							((int *)(ColBuff+(j+OffL)*InFeat*Fx+(i+OffC)*InFeat))[f] = ((int *)(In+j*W*InFeat + i*InFeat))[f];
-					}
+							((int *)(ColBuff+(j+OffL)*InFeat*Fx+(i+OffC)*InFeat+F))[f] = ((int *)(In+j*W*InFeat + i*InFeat+F))[f];
                                 if (Iter&0x2)
-					for (int j=Tb; j<Db; j++) {
+					for (int j=Tb; j<Db; j++)
 						for (int i=Lb; i<Rb; i++)
-							((short int *)(ColBuff+(j+OffL)*InFeat*Fx+(i+OffC)*InFeat))[Iter/2-1] = ((short int *)(In+j*W*InFeat + i*InFeat))[Iter/2-1];
-					}
+							((short int *)(ColBuff+(j+OffL)*InFeat*Fx+(i+OffC)*InFeat+F))[Iter/2-1] = ((short int *)(In+j*W*InFeat + i*InFeat+F))[Iter/2-1];
                                 if (Iter&0x1)
-					for (int j=Tb; j<Db; j++) {
+					for (int j=Tb; j<Db; j++)
 						for (int i=Lb; i<Rb; i++)
-							((signed char *)(ColBuff+(j+OffL)*InFeat*Fx+(i+OffC)*InFeat))[Iter-1] = ((signed char *)(In+j*W*InFeat + i*InFeat))[Iter-1];
-					}
+							((signed char *)(ColBuff+(j+OffL)*InFeat*Fx+(i+OffC)*InFeat+F))[Iter-1] = ((signed char *)(In+j*W*InFeat + i*InFeat+F))[Iter-1];
                         } else if (Iter>=2) {
                                 if (Iter&0x2)
-					for (int j=Tb; j<Db; j++) {
+					for (int j=Tb; j<Db; j++)
 						for (int i=Lb; i<Rb; i++)
-							((short int *)(ColBuff+(j+OffL)*InFeat*Fx+(i+OffC)*InFeat))[Iter/2-1] = ((short int *)(In+j*W*InFeat + i*InFeat))[Iter/2-1];
-					}
+							((short int *)(ColBuff+(j+OffL)*InFeat*Fx+(i+OffC)*InFeat+F))[Iter/2-1] = ((short int *)(In+j*W*InFeat + i*InFeat+F))[Iter/2-1];
                                 if (Iter&0x1)
-					for (int j=Tb; j<Db; j++) {
+					for (int j=Tb; j<Db; j++)
 						for (int i=Lb; i<Rb; i++)
-							((signed char *)(ColBuff+(j+OffL)*InFeat*Fx+(i+OffC)*InFeat))[Iter-1] = ((signed char *)(In+j*W*InFeat + i*InFeat))[Iter-1];
-					}
-                        } else if (Iter>0) {
-				for (int j=Tb; j<Db; j++) {
+							((signed char *)(ColBuff+(j+OffL)*InFeat*Fx+(i+OffC)*InFeat+F))[Iter-1] = ((signed char *)(In+j*W*InFeat + i*InFeat+F))[Iter-1];
+                        } else if (Iter>0)
+				for (int j=Tb; j<Db; j++)
 					for (int i=Lb; i<Rb; i++)
-						ColBuff[(j+OffL)*InFeat*Fx+(i+OffC)*InFeat] = In[j*W*InFeat + i*InFeat];
-				}
-			}
-			int *pBias = Bias;
-			signed char *pC = Filter;
-			signed char *pOut0 = Out+l*Wo*OutFeat + (Last-1)*OutFeat;
-			unsigned char *pSc = Scale;
-			unsigned char *pScN = ScaleN;
+						ColBuff[(j+OffL)*InFeat*Fx+(i+OffC)*InFeat + F] = In[j*W*InFeat + i*InFeat + F];
+
+			PosC += Sx;
+			gap_waitbarrier(0);
+
+			int *pBias = Bias + First;
+			signed char *pC = Filter + W_In1*First;
+			signed char *pOut0 = Out+l*Wo*OutFeat + (c)*OutFeat+First;
+			unsigned char *pSc = Scale + First;
+			unsigned char *pScN = ScaleN + First;
 	                for (int Line=0; Line<IterOut/4; Line++) {
 				signed char *pC0 = pC, *pC1 = pC0+W_In1, *pC2 = pC1+W_In1, *pC3 = pC2+W_In1;
 				pC=pC3+W_In1;
-	                        int S00 = (*pBias)<<NormBias; pBias++;
-	                        int S10 = (*pBias)<<NormBias; pBias++;
-	                        int S20 = (*pBias)<<NormBias; pBias++;
-	                        int S30 = (*pBias)<<NormBias; pBias++;
-				signed char *pIn0 = ColBuff;
+	                        int S0 = (*pBias)<<NormBias; pBias++;
+	                        int S1 = (*pBias)<<NormBias; pBias++;
+	                        int S2 = (*pBias)<<NormBias; pBias++;
+	                        int S3 = (*pBias)<<NormBias; pBias++;
+				signed char *pIn = ColBuff;
 	                        for (int i=0; i<(W_In1/4); i++) {
-					v4s V0 = *((v4s *)pIn0), C0 = *((v4s *)pC0), C1 = *((v4s *)pC1), C2 = *((v4s *)pC2), C3 = *((v4s *)pC3);
-	                                S00 = gap_sumdotp4(V0, C0, S00);
-	                                S10 = gap_sumdotp4(V0, C1, S10);
-	                                S20 = gap_sumdotp4(V0, C2, S20);
-	                                S30 = gap_sumdotp4(V0, C3, S30);
-					pIn0+=4; pC0+=4; pC1+=4; pC2+=4; pC3+=4;
+					v4s V0 = *((v4s *)pIn), C0 = *((v4s *)pC0), C1 = *((v4s *)pC1), C2 = *((v4s *)pC2), C3 = *((v4s *)pC3);
+	                                S0 = gap_sumdotp4(V0, C0, S0);
+	                                S1 = gap_sumdotp4(V0, C1, S1);
+	                                S2 = gap_sumdotp4(V0, C2, S2);
+	                                S3 = gap_sumdotp4(V0, C3, S3);
+					pIn+=4; pC0+=4; pC1+=4; pC2+=4; pC3+=4;
 	                        }
 				for (int f=4*(W_In1/4); f<W_In1; f++) {
-					int V0 = *pIn0, C0 = *pC0, C1 = *pC1, C2 = *pC2, C3 = *pC3;
-					S00 += V0*C0;
-					S10 += V0*C1;
-					S20 += V0*C2;
-					S30 += V0*C3;
-					pIn0++; pC0++; pC1++; pC2++; pC3++;
+					int V0 = *pIn, C0 = *pC0, C1 = *pC1, C2 = *pC2, C3 = *pC3;
+					S0 += V0*C0;
+					S1 += V0*C1;
+					S2 += V0*C2;
+					S3 += V0*C3;
+					pIn++; pC0++; pC1++; pC2++; pC3++;
 				}
-	                        unsigned int Sc, ScN;
-				Sc = *pSc; ScN = *pScN; pSc++; pScN++;
-				*pOut0 = gap_clip(AT_SCALE(S00, Sc, ScN), 7); pOut0++;
-				Sc = *pSc; ScN = *pScN; pSc++; pScN++;
-				*pOut0 = gap_clip(AT_SCALE(S10, Sc, ScN), 7); pOut0++;
-				Sc = *pSc; ScN = *pScN; pSc++; pScN++;
-				*pOut0 = gap_clip(AT_SCALE(S20, Sc, ScN), 7); pOut0++;
-				Sc = *pSc; ScN = *pScN; pSc++; pScN++;
-				*pOut0 = gap_clip(AT_SCALE(S30, Sc, ScN), 7); pOut0++;
+				S0 = AT_SCALE(S0, pSc[4*Line  ], pScN[4*Line  ]); ACT_SWITCH(S0, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+				S1 = AT_SCALE(S1, pSc[4*Line+1], pScN[4*Line+1]); ACT_SWITCH(S1, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+				S2 = AT_SCALE(S2, pSc[4*Line+2], pScN[4*Line+2]); ACT_SWITCH(S2, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+				S3 = AT_SCALE(S3, pSc[4*Line+3], pScN[4*Line+3]); ACT_SWITCH(S3, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+				v4s R1 = gap_pack4(gap_clip(S0, 7), gap_clip(S1, 7), gap_clip(S2, 7), gap_clip(S3, 7));
+				*((v4s *) (pOut0+4*Line)) = R1;
 	                }
 			for (int i=4*(IterOut/4); i<IterOut; i++) {
-				signed char *pIn0 = ColBuff;
-	                        int S00 = (*pBias)<<NormBias; pBias++;
+				signed char *pIn = ColBuff;
+	                        int S0 = (*pBias)<<NormBias; pBias++;
 	                        for (int i=0; i<(W_In1/4); i++) {
-					v4s V0 = *((v4s *)pIn0), C0 = *((v4s *)pC);
-	                               	S00 = gap_sumdotp4(V0, C0, S00);
-					pIn0+=4; pC+=4;
+					v4s V0 = *((v4s *)pIn), C0 = *((v4s *)pC);
+	                               	S0 = gap_sumdotp4(V0, C0, S0);
+					pIn+=4; pC+=4;
 				}
 				for (int f=4*(W_In1/4); f<W_In1; f++) {
-					int V0 = *pIn0, C0 = *pC;
-					S00 += V0*C0;
-					pIn0++; pC++;
+					int V0 = *pIn, C0 = *pC;
+					S0 += V0*C0;
+					pIn++; pC++;
 				}
-	                        unsigned int Sc, ScN;
-				Sc = *pSc; ScN = *pScN; pSc++; pScN++;
-				*pOut0 = gap_clip(AT_SCALE(S00, Sc, ScN), 7); pOut0++;
+				S0 = AT_SCALE(S0, pSc[i], pScN[i]); ACT_SWITCH(S0, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+				*(pOut0+i) = gap_clip(S0, 7);
 			}
+			gap_waitbarrier(0);
 		}
 		PosL += Sy;
 	}
-	gap_waitbarrier(0);
 }
 
-void Ker_MM_Conv2D_ReLU_HWC_SQ8(
-	Ker_MM_Conv_SQ8_T *Arg
+void KerPar_MM_Conv2D_HWC_SQ8(Ker_MM_Conv_SQ8_T *Arg) {
+	KerPar_MM_Conv2D_HWC_SQ8_act(Arg, ACT_NONE);
+}
+
+void KerPar_MM_Conv2D_ReLU_HWC_SQ8(Ker_MM_Conv_SQ8_T *Arg) {
+	KerPar_MM_Conv2D_HWC_SQ8_act(Arg, ACT_RELU);
+}
+
+void KerPar_MM_Conv2D_ReLUN_HWC_SQ8(Ker_MM_Conv_SQ8_T *Arg) {
+	KerPar_MM_Conv2D_HWC_SQ8_act(Arg, ACT_RELUN);
+}
+
+void KerPar_MM_Conv2D_ReLUM_HWC_SQ8(Ker_MM_Conv_SQ8_T *Arg) {
+	KerPar_MM_Conv2D_HWC_SQ8_act(Arg, ACT_RELUM);
+}
+
+void KerPar_MM_Conv2D_ReLUMN_HWC_SQ8(Ker_MM_Conv_SQ8_T *Arg) {
+	KerPar_MM_Conv2D_HWC_SQ8_act(Arg, ACT_RELUMN);
+}
+
+void KerPar_MM_Conv2D_LeakyReLU_HWC_SQ8(Ker_MM_Conv_SQ8_T *Arg) {
+	KerPar_MM_Conv2D_HWC_SQ8_act(Arg, ACT_LEAKYRELU);
+}
+
+void KerPar_MM_Conv2D_HSwish_HWC_SQ8(Ker_MM_Conv_SQ8_T *Arg) {
+	KerPar_MM_Conv2D_HWC_SQ8_act(Arg, ACT_HSWISH);
+}
+
+void KerPar_MM_Conv2D_HSigmoid_HWC_SQ8(Ker_MM_Conv_SQ8_T *Arg) {
+	KerPar_MM_Conv2D_HWC_SQ8_act(Arg, ACT_HSIGMOID);
+}
+
+void KerPar_MM_Conv2D_Sigmoid_HWC_SQ8(Ker_MM_Conv_SQ8_T *Arg) {
+	KerPar_MM_Conv2D_HWC_SQ8_act(Arg, ACT_SIGMOID);
+}
+
+void KerPar_MM_Conv2D_Tanh_HWC_SQ8(Ker_MM_Conv_SQ8_T *Arg) {
+	KerPar_MM_Conv2D_HWC_SQ8_act(Arg, ACT_TANH);
+}
+
+/*
+ * 2D Convolutional Kernels with Dilation kernels based on MatMul (im2col: ColBuff) with HWC inout tensor order
+ * Parallelization on the output width spatial dimension -> each core has 2 im2col buffers
+ * Optional fused activation applied to the 32-bit accumulator -> ACT_SWITCH defined in CNN_BasicKernels_SQ8.h
+ */
+static inline void __attribute__((always_inline)) Ker_MM_Conv2D_HWC_SQ8_act(
+	Ker_MM_Conv_SQ8_T *Arg,
+	CNN_ActivationOper_T Activation
 	)
 
 {
@@ -2731,7 +1902,9 @@ void Ker_MM_Conv2D_ReLU_HWC_SQ8(
         signed char * __restrict__ ColBuff = Arg->ColBuff;
         signed char * __restrict__ ColBuff1;
 	int Wo = Arg->Wo, Ho = Arg->Ho;
-	int A0 = Arg->Infos[AT_INF_A0];
+	unsigned char * Infos = Arg->Infos;
+	unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN];
+	int A0 = *((unsigned char *) &Infos[AT_INF_A0]); int B0 = *((unsigned char *) &Infos[AT_INF_B0]); int C0 = *((unsigned char *) &Infos[AT_INF_C0]);
 
 
 	unsigned int W_In1 = InFeat*Fx*Fy;
@@ -2848,17 +2021,25 @@ void Ker_MM_Conv2D_ReLU_HWC_SQ8(
 				}
 	                        unsigned int Sc, ScN;
 				Sc = *pSc; ScN = *pScN; pSc++; pScN++;
-				*pOut0 = AT_CLIP_POS_IMM(AT_SCALE(S00, Sc, ScN), 7); pOut0++;
-				*pOut1 = AT_CLIP_POS_IMM(AT_SCALE(S01, Sc, ScN), 7); pOut1++;
-				Sc = *pSc; ScN = *pScN; pSc++; pScN++;
-				*pOut0 = AT_CLIP_POS_IMM(AT_SCALE(S10, Sc, ScN), 7); pOut0++;
-				*pOut1 = AT_CLIP_POS_IMM(AT_SCALE(S11, Sc, ScN), 7); pOut1++;
-				Sc = *pSc; ScN = *pScN; pSc++; pScN++;
-				*pOut0 = AT_CLIP_POS_IMM(AT_SCALE(S20, Sc, ScN), 7); pOut0++;
-				*pOut1 = AT_CLIP_POS_IMM(AT_SCALE(S21, Sc, ScN), 7); pOut1++;
-				Sc = *pSc; ScN = *pScN; pSc++; pScN++;
-				*pOut0 = AT_CLIP_POS_IMM(AT_SCALE(S30, Sc, ScN), 7); pOut0++;
-				*pOut1 = AT_CLIP_POS_IMM(AT_SCALE(S31, Sc, ScN), 7); pOut1++;
+				S00 = AT_SCALE(S00, Sc, ScN); ACT_SWITCH(S00, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+				S01 = AT_SCALE(S01, Sc, ScN); ACT_SWITCH(S01, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+				*pOut0 = gap_clip(S00, 7); pOut0++;
+				*pOut1 = gap_clip(S01, 7); pOut1++;
+				Sc = *pSc; ScN = *pScN; pSc++; pScN++;
+				S10 = AT_SCALE(S10, Sc, ScN); ACT_SWITCH(S10, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+				S11 = AT_SCALE(S11, Sc, ScN); ACT_SWITCH(S11, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+				*pOut0 = gap_clip(S10, 7); pOut0++;
+				*pOut1 = gap_clip(S11, 7); pOut1++;
+				Sc = *pSc; ScN = *pScN; pSc++; pScN++;
+				S20 = AT_SCALE(S20, Sc, ScN); ACT_SWITCH(S20, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+				S21 = AT_SCALE(S21, Sc, ScN); ACT_SWITCH(S21, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+				*pOut0 = gap_clip(S20, 7); pOut0++;
+				*pOut1 = gap_clip(S21, 7); pOut1++;
+				Sc = *pSc; ScN = *pScN; pSc++; pScN++;
+				S30 = AT_SCALE(S30, Sc, ScN); ACT_SWITCH(S30, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+				S31 = AT_SCALE(S31, Sc, ScN); ACT_SWITCH(S31, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+				*pOut0 = gap_clip(S30, 7); pOut0++;
+				*pOut1 = gap_clip(S31, 7); pOut1++;
 	                }
 			for (int Line=4*(IterOut/4); Line<IterOut; Line++) {
 				signed char *pIn0 = ColBuff, *pIn1 = ColBuff1;
@@ -2875,8 +2056,10 @@ void Ker_MM_Conv2D_ReLU_HWC_SQ8(
 				}
 	                        unsigned int Sc, ScN;
 				Sc = *pSc; ScN = *pScN; pSc++; pScN++;
-				*pOut0 = AT_CLIP_POS_IMM(AT_SCALE(S00, Sc, ScN), 7); pOut0++;
-				*pOut1 = AT_CLIP_POS_IMM(AT_SCALE(S01, Sc, ScN), 7); pOut1++;
+				S00 = AT_SCALE(S00, Sc, ScN); ACT_SWITCH(S00, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+				S01 = AT_SCALE(S01, Sc, ScN); ACT_SWITCH(S01, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+				*pOut0 = gap_clip(S00, 7); pOut0++;
+				*pOut1 = gap_clip(S01, 7); pOut1++;
 			}
 		}
 		if (IterW&0x1) {
@@ -2955,13 +2138,17 @@ void Ker_MM_Conv2D_ReLU_HWC_SQ8(
 				}
 	                        unsigned int Sc, ScN;
 				Sc = *pSc; ScN = *pScN; pSc++; pScN++;
-				*pOut0 = AT_CLIP_POS_IMM(AT_SCALE(S00, Sc, ScN), 7); pOut0++;
+				S00 = AT_SCALE(S00, Sc, ScN); ACT_SWITCH(S00, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+				*pOut0 = gap_clip(S00, 7); pOut0++;
 				Sc = *pSc; ScN = *pScN; pSc++; pScN++;
-				*pOut0 = AT_CLIP_POS_IMM(AT_SCALE(S10, Sc, ScN), 7); pOut0++;
+				S10 = AT_SCALE(S10, Sc, ScN); ACT_SWITCH(S10, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+				*pOut0 = gap_clip(S10, 7); pOut0++;
 				Sc = *pSc; ScN = *pScN; pSc++; pScN++;
-				*pOut0 = AT_CLIP_POS_IMM(AT_SCALE(S20, Sc, ScN), 7); pOut0++;
+				S20 = AT_SCALE(S20, Sc, ScN); ACT_SWITCH(S20, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+				*pOut0 = gap_clip(S20, 7); pOut0++;
 				Sc = *pSc; ScN = *pScN; pSc++; pScN++;
-				*pOut0 = AT_CLIP_POS_IMM(AT_SCALE(S30, Sc, ScN), 7); pOut0++;
+				S30 = AT_SCALE(S30, Sc, ScN); ACT_SWITCH(S30, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+				*pOut0 = gap_clip(S30, 7); pOut0++;
 	                }
 			for (int i=4*(IterOut/4); i<IterOut; i++) {
 				signed char *pIn0 = ColBuff;
@@ -2978,7 +2165,8 @@ void Ker_MM_Conv2D_ReLU_HWC_SQ8(
 				}
 	                        unsigned int Sc, ScN;
 				Sc = *pSc; ScN = *pScN; pSc++; pScN++;
-				*pOut0 = AT_CLIP_POS_IMM(AT_SCALE(S00, Sc, ScN), 7); pOut0++;
+				S00 = AT_SCALE(S00, Sc, ScN); ACT_SWITCH(S00, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+				*pOut0 = gap_clip(S00, 7); pOut0++;
 			}
 		}
 		PosL += Sy;
@@ -2986,8 +2174,53 @@ void Ker_MM_Conv2D_ReLU_HWC_SQ8(
 	gap_waitbarrier(0);
 }
 
-void KerPar_MM_Conv2D_DxDy_SQ8(
-	Ker_MM_Conv_SQ8_T *Arg
+void Ker_MM_Conv2D_HWC_SQ8(Ker_MM_Conv_SQ8_T *Arg) {
+	Ker_MM_Conv2D_HWC_SQ8_act(Arg, ACT_NONE);
+}
+
+void Ker_MM_Conv2D_ReLU_HWC_SQ8(Ker_MM_Conv_SQ8_T *Arg) {
+	Ker_MM_Conv2D_HWC_SQ8_act(Arg, ACT_RELU);
+}
+
+void Ker_MM_Conv2D_ReLUN_HWC_SQ8(Ker_MM_Conv_SQ8_T *Arg) {
+	Ker_MM_Conv2D_HWC_SQ8_act(Arg, ACT_RELUN);
+}
+
+void Ker_MM_Conv2D_ReLUM_HWC_SQ8(Ker_MM_Conv_SQ8_T *Arg) {
+	Ker_MM_Conv2D_HWC_SQ8_act(Arg, ACT_RELUM);
+}
+
+void Ker_MM_Conv2D_ReLUMN_HWC_SQ8(Ker_MM_Conv_SQ8_T *Arg) {
+	Ker_MM_Conv2D_HWC_SQ8_act(Arg, ACT_RELUMN);
+}
+
+void Ker_MM_Conv2D_LeakyReLU_HWC_SQ8(Ker_MM_Conv_SQ8_T *Arg) {
+	Ker_MM_Conv2D_HWC_SQ8_act(Arg, ACT_LEAKYRELU);
+}
+
+void Ker_MM_Conv2D_HSwish_HWC_SQ8(Ker_MM_Conv_SQ8_T *Arg) {
+	Ker_MM_Conv2D_HWC_SQ8_act(Arg, ACT_HSWISH);
+}
+
+void Ker_MM_Conv2D_HSigmoid_HWC_SQ8(Ker_MM_Conv_SQ8_T *Arg) {
+	Ker_MM_Conv2D_HWC_SQ8_act(Arg, ACT_HSIGMOID);
+}
+
+void Ker_MM_Conv2D_Sigmoid_HWC_SQ8(Ker_MM_Conv_SQ8_T *Arg) {
+	Ker_MM_Conv2D_HWC_SQ8_act(Arg, ACT_SIGMOID);
+}
+
+void Ker_MM_Conv2D_Tanh_HWC_SQ8(Ker_MM_Conv_SQ8_T *Arg) {
+	Ker_MM_Conv2D_HWC_SQ8_act(Arg, ACT_TANH);
+}
+
+/*
+ * 2D convolution kernels with dilation, based on MatMul (im2col: ColBuff), with CHW in/out tensor order
+ * Optional fused activation applied to the 32-bit accumulator -> ACT_SWITCH defined in CNN_BasicKernels_SQ8.h
+ */
+static inline void __attribute__((always_inline)) KerPar_MM_Conv2D_DxDy_SQ8_act(
+	Ker_MM_Conv_SQ8_T *Arg,
+	CNN_ActivationOper_T Activation
 	)
 
 {
@@ -3005,7 +2238,9 @@ void KerPar_MM_Conv2D_DxDy_SQ8(
         unsigned char * __restrict__ ScaleN = Arg->ScaleN;
         signed char * __restrict__ ColBuff = Arg->ColBuff;
 	int Wo = Arg->Wo, Ho = Arg->Ho;
-	int A0 = Arg->Infos[AT_INF_A0];
+	unsigned char * Infos = Arg->Infos;
+	unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN];
+	int A0 = *((unsigned char *) &Infos[AT_INF_A0]); int B0 = *((unsigned char *) &Infos[AT_INF_B0]); int C0 = *((unsigned char *) &Infos[AT_INF_C0]);
 
 	/* ColBuff must be large enough to accomodate Align(Fx*InFeat, 8) elements */
 	v4s * __restrict__ VBuff = (v4s *) ColBuff;
@@ -3057,7 +2292,8 @@ void KerPar_MM_Conv2D_DxDy_SQ8(
 	                                S0 = gap_sumdotp4(V1, C1, S0);
 	                        }
 	                        unsigned int Sc = Scale[Line], ScN = ScaleN[Line];
-	                        Out[Line*Wo*Ho + l*Wo + c] = gap_clip(AT_SCALE(S0, Sc, ScN), 7);
+	                        S0 = AT_SCALE(S0, Sc, ScN); ACT_SWITCH(S0, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+	                        Out[Line*Wo*Ho + l*Wo + c] = gap_clip(S0, 7);
 	                }
 			gap_waitbarrier(0);
 		}
@@ -3066,8 +2302,54 @@ void KerPar_MM_Conv2D_DxDy_SQ8(
 	gap_waitbarrier(0);
 }
 
-void KerPar_MM_Conv2D_DxDy_HWC_SQ8(
-	Ker_MM_Conv_SQ8_T *Arg
+void KerPar_MM_Conv2D_DxDy_SQ8(Ker_MM_Conv_SQ8_T *Arg) {
+	KerPar_MM_Conv2D_DxDy_SQ8_act(Arg, ACT_NONE);
+}
+
+void KerPar_MM_Conv2D_DxDy_ReLU_SQ8(Ker_MM_Conv_SQ8_T *Arg) {
+	KerPar_MM_Conv2D_DxDy_SQ8_act(Arg, ACT_RELU);
+}
+
+void KerPar_MM_Conv2D_DxDy_ReLUN_SQ8(Ker_MM_Conv_SQ8_T *Arg) {
+	KerPar_MM_Conv2D_DxDy_SQ8_act(Arg, ACT_RELUN);
+}
+
+void KerPar_MM_Conv2D_DxDy_ReLUM_SQ8(Ker_MM_Conv_SQ8_T *Arg) {
+	KerPar_MM_Conv2D_DxDy_SQ8_act(Arg, ACT_RELUM);
+}
+
+void KerPar_MM_Conv2D_DxDy_ReLUMN_SQ8(Ker_MM_Conv_SQ8_T *Arg) {
+	KerPar_MM_Conv2D_DxDy_SQ8_act(Arg, ACT_RELUMN);
+}
+
+void KerPar_MM_Conv2D_DxDy_LeakyReLU_SQ8(Ker_MM_Conv_SQ8_T *Arg) {
+	KerPar_MM_Conv2D_DxDy_SQ8_act(Arg, ACT_LEAKYRELU);
+}
+
+void KerPar_MM_Conv2D_DxDy_HSwish_SQ8(Ker_MM_Conv_SQ8_T *Arg) {
+	KerPar_MM_Conv2D_DxDy_SQ8_act(Arg, ACT_HSWISH);
+}
+
+void KerPar_MM_Conv2D_DxDy_HSigmoid_SQ8(Ker_MM_Conv_SQ8_T *Arg) {
+	KerPar_MM_Conv2D_DxDy_SQ8_act(Arg, ACT_HSIGMOID);
+}
+
+void KerPar_MM_Conv2D_DxDy_Sigmoid_SQ8(Ker_MM_Conv_SQ8_T *Arg) {
+	KerPar_MM_Conv2D_DxDy_SQ8_act(Arg, ACT_SIGMOID);
+}
+
+void KerPar_MM_Conv2D_DxDy_Tanh_SQ8(Ker_MM_Conv_SQ8_T *Arg) {
+	KerPar_MM_Conv2D_DxDy_SQ8_act(Arg, ACT_TANH);
+}
+
+
+/*
+ * 2D convolution kernels with dilation, based on MatMul (im2col: ColBuff), with HWC in/out tensor order
+ * Optional fused activation applied to the 32-bit accumulator -> ACT_SWITCH defined in CNN_BasicKernels_SQ8.h
+ */
+static inline void __attribute__((always_inline)) KerPar_MM_Conv2D_DxDy_HWC_SQ8_act(
+	Ker_MM_Conv_SQ8_T *Arg,
+	CNN_ActivationOper_T Activation
 	)
 
 {
@@ -3088,7 +2370,9 @@ void KerPar_MM_Conv2D_DxDy_HWC_SQ8(
         unsigned char * __restrict__ ScaleN = Arg->ScaleN;
         signed char * __restrict__ ColBuff = Arg->ColBuff;
 	int Wo = Arg->Wo, Ho = Arg->Ho;
-	int A0 = Arg->Infos[AT_INF_A0];
+	unsigned char * Infos = Arg->Infos;
+	unsigned int ActScale = ((unsigned char *)Infos)[AT_INF_ACTSCALE], ActScaleN = ((unsigned char *)Infos)[AT_INF_ACTSCALEN];
+	int A0 = *((unsigned char *) &Infos[AT_INF_A0]); int B0 = *((unsigned char *) &Infos[AT_INF_B0]); int C0 = *((unsigned char *) &Infos[AT_INF_C0]);
 
 	/* ColBuff must be large enough to accomodate Align(Fx*InFeat, 8) elements */
 	v4s * __restrict__ VBuff = (v4s *) ColBuff;
@@ -3206,10 +2490,16 @@ This part is more efficient but NOT WORKING ???? TOCHECK
 					S3 += V0*C3; S7 += V1*C3;
 					pIn++; pIn1++; pC0++; pC1++; pC2++; pC3++;
 				}
-				v4s R1 = gap_pack4(gap_clip(AT_SCALE(S0, pSc[4*Line  ], pScN[4*Line  ]), 7), gap_clip(AT_SCALE(S1, pSc[4*Line+1], pScN[4*Line+1]), 7),
-						   gap_clip(AT_SCALE(S2, pSc[4*Line+2], pScN[4*Line+2]), 7), gap_clip(AT_SCALE(S3, pSc[4*Line+3], pScN[4*Line+3]), 7));
-				v4s R2 = gap_pack4(gap_clip(AT_SCALE(S4, pSc[4*Line  ], pScN[4*Line  ]), 7), gap_clip(AT_SCALE(S5, pSc[4*Line+1], pScN[4*Line+1]), 7),
-						   gap_clip(AT_SCALE(S6, pSc[4*Line+2], pScN[4*Line+2]), 7), gap_clip(AT_SCALE(S7, pSc[4*Line+3], pScN[4*Line+3]), 7));
+				S0 = AT_SCALE(S0, pSc[4*Line  ], pScN[4*Line  ]); ACT_SWITCH(S0, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+				S1 = AT_SCALE(S1, pSc[4*Line+1], pScN[4*Line+1]); ACT_SWITCH(S1, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+				S2 = AT_SCALE(S2, pSc[4*Line+2], pScN[4*Line+2]); ACT_SWITCH(S2, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+				S3 = AT_SCALE(S3, pSc[4*Line+3], pScN[4*Line+3]); ACT_SWITCH(S3, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+				S4 = AT_SCALE(S4, pSc[4*Line  ], pScN[4*Line  ]); ACT_SWITCH(S4, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+				S5 = AT_SCALE(S5, pSc[4*Line+1], pScN[4*Line+1]); ACT_SWITCH(S5, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+				S6 = AT_SCALE(S6, pSc[4*Line+2], pScN[4*Line+2]); ACT_SWITCH(S6, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+				S7 = AT_SCALE(S7, pSc[4*Line+3], pScN[4*Line+3]); ACT_SWITCH(S7, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+				v4s R1 = gap_pack4(gap_clip(S0, 7), gap_clip(S1, 7), gap_clip(S2, 7), gap_clip(S3, 7));
+				v4s R2 = gap_pack4(gap_clip(S4, 7), gap_clip(S5, 7), gap_clip(S6, 7), gap_clip(S7, 7));
 				*((v4s *) (pOut0+4*Line)) = R1;
 				*((v4s *) (pOut1+4*Line)) = R2;
 	                }
@@ -3227,8 +2517,10 @@ This part is more efficient but NOT WORKING ???? TOCHECK
 					S0 += V0*C0; S4 += V1*C0;
 					pIn++; pIn1++; pC++;
 				}
-				*(pOut0+i) = gap_clip(AT_SCALE(S0, pSc[i], pScN[i]), 7);
-				*(pOut1+i) = gap_clip(AT_SCALE(S4, pSc[i], pScN[i]), 7);
+				S0 = AT_SCALE(S0, pSc[i], pScN[i]); ACT_SWITCH(S0, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+				S4 = AT_SCALE(S4, pSc[i], pScN[i]); ACT_SWITCH(S4, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+				*(pOut0+i) = gap_clip(S0, 7);
+				*(pOut1+i) = gap_clip(S4, 7);
 			}
 			gap_waitbarrier(0);
 		}
@@ -3278,13 +2570,17 @@ This part is more efficient but NOT WORKING ???? TOCHECK
 				}
 				unsigned int Sc, ScN;
 				Sc = *pSc; ScN = *pScN; pSc++; pScN++;
-				*pOut0 = gap_clip(AT_SCALE(S0, Sc, ScN), 7); pOut0++;
+				S0 = AT_SCALE(S0, Sc, ScN); ACT_SWITCH(S0, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+				*pOut0 = gap_clip(S0, 7); pOut0++;
 				Sc = *pSc; ScN = *pScN; pSc++; pScN++;
-				*pOut0 = gap_clip(AT_SCALE(S1, Sc, ScN), 7); pOut0++;
+				S1 = AT_SCALE(S1, Sc, ScN); ACT_SWITCH(S1, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+				*pOut0 = gap_clip(S1, 7); pOut0++;
 				Sc = *pSc; ScN = *pScN; pSc++; pScN++;
-				*pOut0 = gap_clip(AT_SCALE(S2, Sc, ScN), 7); pOut0++;
+				S2 = AT_SCALE(S2, Sc, ScN); ACT_SWITCH(S2, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+				*pOut0 = gap_clip(S2, 7); pOut0++;
 				Sc = *pSc; ScN = *pScN; pSc++; pScN++;
-				*pOut0 = gap_clip(AT_SCALE(S3, Sc, ScN), 7); pOut0++;
+				S3 = AT_SCALE(S3, Sc, ScN); ACT_SWITCH(S3, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+				*pOut0 = gap_clip(S3, 7); pOut0++;
 			}
 			for (int i=4*(IterOut/4); i<IterOut; i++) {
 				signed char *pIn = ColBuff;
@@ -3301,7 +2597,8 @@ This part is more efficient but NOT WORKING ???? TOCHECK
 				}
 				unsigned int Sc, ScN;
 				Sc = *pSc; ScN = *pScN; pSc++; pScN++;
-				*pOut0 = gap_clip(AT_SCALE(S0, Sc, ScN), 7); pOut0++;
+				S0 = AT_SCALE(S0, Sc, ScN); ACT_SWITCH(S0, Activation, ActScale, ActScaleN, A0, B0, C0, 8, 0);
+				*pOut0 = gap_clip(S0, 7); pOut0++;
 			}
 			gap_waitbarrier(0);
 		}
@@ -3310,231 +2607,44 @@ This part is more efficient but NOT WORKING ???? TOCHECK
 	gap_waitbarrier(0);
 }
 
-void KerPar_MM_Conv2D_DxDy_ReLU_SQ8(
-	Ker_MM_Conv_SQ8_T *Arg
-	)
-
-{
-	signed char *__restrict__ In = Arg->In;
-	int W = Arg->W, H = Arg->H;
-	signed char *__restrict__ Filter = Arg->Filter;
-	int Fx = Arg->Fx, Sx = Arg->Sx, Dx = Arg->Dx;
-	int Fy = Arg->Fy, Sy = Arg->Sy, Dy = Arg->Dy;
-	int PadL = Arg->Pad[0], PadT = Arg->Pad[2];
-	int InFeat = Arg->InFeat, OutFeat = Arg->OutFeat;
-        int * __restrict__ Bias = Arg->Bias;
-	int NormBias = Arg->Infos[AT_INF_BIASN];
-        signed char * __restrict__ Out = Arg->Out;
-        unsigned char * __restrict__ Scale = Arg->Scale;
-        unsigned char * __restrict__ ScaleN = Arg->ScaleN;
-        signed char * __restrict__ ColBuff = Arg->ColBuff;
-	int Wo = Arg->Wo, Ho = Arg->Ho;
-	int A0 = Arg->Infos[AT_INF_A0];
-
-	/* ColBuff must be large enough to accomodate Align(Fx*InFeat, 8) elements */
-	v4s * __restrict__ VBuff = (v4s *) ColBuff;
-	unsigned int W_In1 = InFeat*Fx*Fy;
-	unsigned int CoreId = gap_coreid(), C = ChunkSize(InFeat), F = Min(CoreId*C, InFeat), L = Min(InFeat, F+C);
-	unsigned int ChunkCell = ChunkSize(OutFeat), First = CoreId*ChunkCell, Last  = Min(OutFeat, First+ChunkCell);
-
-	int FS = Fx*Fy;
-	int Tail = 2*((W_In1+7)/8);
-	((int *)ColBuff)[Tail-1] = 0; ((int *)ColBuff)[Tail-2] = 0;
-	int PosL = Arg->FirstTile?(-PadT):0;
-	int DFx = Dx*(Fx-1)+1, DFy =  Dy*(Fy-1)+1;
-	// int Prec=10;
-	int InvDx = ((1<<Prec)+Dx-1)/Dx;
-	int InvDy = ((1<<Prec)+Dy-1)/Dy;
-	int Iter = L-F;
-	int Iter1 = Iter*FS;
-
-	for (int l=0; l<Ho; l++) {
-		int PosC = -PadL;
-		int Tb = Max(PosL, 0), Db = Min(PosL+DFy, H);
-		int OffLBuffY = Max(0, gap_mulsN(-PosL+Dy-1, InvDy, Prec));
-		int OffLInY = OffLBuffY?(Dy*OffLBuffY+PosL):0;
-		for (int c=0; c<Wo; c++) {
-			for (int i=0; i<(Iter1/4); i++) ((int *)(ColBuff+F*FS))[i]=0;
-			if (Iter1&0x2) ((short int *)(ColBuff+F*FS))[Iter1/2-1]=0;
-			if (Iter1&0x1) ((signed char *)(ColBuff+F*FS))[Iter1-1]=0;
-			int Lb = Max(PosC, 0), Rb = Min(PosC+DFx, W);
-			int OffCBuffX = (Lb==0)?Max(0, gap_mulsN(-PosC+Dx-1, InvDx, Prec)):0;
-			int OffCInX = OffCBuffX?(Dx*OffCBuffX+PosC):0;
-			int IterY = gap_mulsN(Db-Tb-1, InvDy, Prec) + 1;
-			int IterX = gap_mulsN(Rb-Lb-1, InvDx, Prec) + 1;
-			for (int f=F; f<L; f++) {
-				for (int j=0; j<IterY; j++) {
-					for (int i=0; i<IterX; i++) ColBuff[FS*f + Fx*(j+OffLBuffY) + i+OffCBuffX] = In[f*W*H + (Tb+j*Dy+OffLInY)*W + Lb+i*Dx+OffCInX];
-				}
-			}
-			PosC += Sx;
-			gap_waitbarrier(0);
-	                for (int Line=First; Line<Last; Line++) {
-	                        v4s *VIn1 = (v4s *) (&Filter[Line*W_In1 + 0]);
-	                        int S0 = (Bias[Line]<<NormBias);
-	                        for (int i=0; i<((W_In1+7)/8); i++) {
-	                                v4s V0 = VIn1[2*i], V1 = VIn1[2*i+1];
-					v4s C0 = VBuff[2*i], C1 = VBuff[2*i+1];
-	                                S0 = gap_sumdotp4(V0, C0, S0);
-	                                S0 = gap_sumdotp4(V1, C1, S0);
-	                        }
-	                        unsigned int Sc = Scale[Line], ScN = ScaleN[Line];
-	                        Out[Line*Wo*Ho + l*Wo + c] = AT_CLIP_POS_IMM(AT_SCALE(S0, Sc, ScN), 7);
-	                }
-			gap_waitbarrier(0);
-		}
-		PosL += Sy;
-	}
-	gap_waitbarrier(0);
+void KerPar_MM_Conv2D_DxDy_HWC_SQ8(Ker_MM_Conv_SQ8_T *Arg) { /* Wrapper: runs KerPar_MM_Conv2D_DxDy_HWC_SQ8_act on Arg with ACT_NONE as the activation selector; returns nothing */
+	KerPar_MM_Conv2D_DxDy_HWC_SQ8_act(Arg, ACT_NONE); /* callee is static always_inline; presumably specialized for this constant selector -- TODO confirm */
 }
 
-#if 0
-void KerPar_MM_ConvDW1D_HWC_SQ8(
-	Ker_MM_Conv_SQ8_T *Arg
-	)
-
-{
-	/*
-		For HWC weights (4D Tensor) are expected to be organized as [OutFeat x Fy x Fx x InFeat]
-	*/
-	signed char *__restrict__ In = Arg->In;
-	int W = Arg->W, H = Arg->H;
-	signed char *__restrict__ Filter = Arg->Filter;
-	int Fx = Arg->Fx, Sx = Arg->Sx, Sy = Arg->Sy;
-	int PadL = Arg->Pad[0];
-	int InFeat = Arg->InFeat, OutFeat = Arg->OutFeat;
-
-        int * __restrict__ Bias = Arg->Bias;
-	int NormBias = Arg->Infos[AT_INF_BIASN];
-        signed char * __restrict__ Out = Arg->Out;
-        unsigned char * __restrict__ Scale = Arg->Scale;
-        unsigned char * __restrict__ ScaleN = Arg->ScaleN;
-        signed char * __restrict__ ColBuff = Arg->ColBuff;
-
-	int Wo = Arg->Wo, Ho = Arg->Ho;
+void KerPar_MM_Conv2D_DxDy_ReLU_HWC_SQ8(Ker_MM_Conv_SQ8_T *Arg) { /* Wrapper: runs KerPar_MM_Conv2D_DxDy_HWC_SQ8_act on Arg with the activation fixed to ACT_RELU; returns nothing */
+	KerPar_MM_Conv2D_DxDy_HWC_SQ8_act(Arg, ACT_RELU); /* callee is static always_inline; presumably specialized for this constant selector -- TODO confirm */
+}
 
-	/* ColBuff must be large enough to accomodate ((((InFeat/NCores)+3)/4)*4)*8 elements */
-	v4s * __restrict__ VBuff = (v4s *) ColBuff;
-	unsigned int CoreId = gap_coreid(), C = ChunkSize(InFeat), F = Min(CoreId*C, InFeat), L = Min(InFeat, F+C);
-	v4s M0 = (v4s){-1,0,0,0}, M1 = (v4s){0,-1,0,0}, M2 = (v4s){0,0,-1,0}, M3 = (v4s){0,0,0,-1};
+void KerPar_MM_Conv2D_DxDy_ReLUN_HWC_SQ8(Ker_MM_Conv_SQ8_T *Arg) { /* Wrapper: runs KerPar_MM_Conv2D_DxDy_HWC_SQ8_act on Arg with the activation fixed to ACT_RELUN; returns nothing */
+	KerPar_MM_Conv2D_DxDy_HWC_SQ8_act(Arg, ACT_RELUN); /* callee is static always_inline; presumably specialized for this constant selector -- TODO confirm */
+}
 
-	int PosL = 0;
-	int Iter = L-F;
-	for (int l=0; l<Ho; l++) {
-		int PosC = -PadL;
-		for (int c=0; c<Wo; c++) {
-			for (int i=F*Fx; i<(L*Fx); i++) ColBuff[i] = 0;
-			int Lb = Max(PosC, 0), Rb = Min(PosC+Fx, W);
-			int Off = -Lb - Min(PosC, 0);
-			if (Iter) {
-				if (Iter>=4) {
-					for (int f=0; f<(Iter/4); f++)
-						for (int i=Lb; i<Rb; i++) ((int *)(ColBuff+(i+Off)*InFeat+F))[f] = ((int *)(In+PosL*W*InFeat + i*InFeat+F))[f];
-					if (Iter&0x2) for (int i=Lb; i<Rb; i++) ((short int *)(ColBuff+(i+Off)*InFeat+F))[Iter/2-1] = ((short int *)(In+PosL*W*InFeat + i*InFeat+F))[Iter/2-1];
-					if (Iter&0x1) for (int i=Lb; i<Rb; i++) ((signed char *)(ColBuff+(i+Off)*InFeat+F))[Iter-1] = ((signed char *)(In+PosL*W*InFeat + i*InFeat+F))[Iter-1];
-				} else if (Iter>=2) {
-					if (Iter&0x2) for (int i=Lb; i<Rb; i++) ((short int *)(ColBuff+(i+Off)*InFeat+F))[Iter/2-1] = ((short int *)(In+PosL*W*InFeat + i*InFeat+F))[Iter/2-1];
-					if (Iter&0x1) for (int i=Lb; i<Rb; i++) ((signed char *)(ColBuff+(i+Off)*InFeat+F))[Iter-1] = ((signed char *)(In+PosL*W*InFeat + i*InFeat+F))[Iter-1];
-				} else if (Iter>0)
-					for (int i=Lb; i<Rb; i++) ColBuff[(i+Off)*InFeat + F] = In[PosL*W*InFeat + i*InFeat + F];
-			}
-			PosC += Sx;
-			gap_waitbarrier(0);
-			v4s *VIn   = (v4s *)(ColBuff + F);
-			/* Filter: Fx x InFeat -> fx*Infeat + f */
-			for (int f=0; f<(Iter/4); f++) {
-				v4s B = ((v4s *)(Bias + F))[f];
-				int S0 = B[0], S1 = B[1], S2 = B[2], S3 = B[3];
-				for (int i=0; i<Fx; i++) {
-					v4s Coeff = ((v4s *)(Filter + i*InFeat + F))[f];
-					v4s In    = ((v4s *)(ColBuff + i*InFeat + F))[f];
-	                                S0 = gap_sumdotp4(In, Coeff & M0, S0);
-	                                S1 = gap_sumdotp4(In, Coeff & M1, S1);
-	                                S2 = gap_sumdotp4(In, Coeff & M2, S2);
-	                                S3 = gap_sumdotp4(In, Coeff & M3, S3);
-				}
-	                        v4s Sc = ((v4s *) Scale+F)[f], ScN = ((v4s *) ScaleN+F)[f];
-				S0 = gap_clip(AT_SCALE(S0, Sc[0], ScN[0]), 7);
-				S1 = gap_clip(AT_SCALE(S1, Sc[1], ScN[1]), 7);
-				S2 = gap_clip(AT_SCALE(S2, Sc[2], ScN[2]), 7);
-				S3 = gap_clip(AT_SCALE(S3, Sc[3], ScN[3]), 7);
-	                        ((v4s *) Out+l*Wo*OutFeat + c*OutFeat + F)[f] = gap_pack4(S0, S1, S2, S3);
-			}
-			gap_waitbarrier(0);
-		}
-		PosL += Sy;
-	}
+void KerPar_MM_Conv2D_DxDy_ReLUM_HWC_SQ8(Ker_MM_Conv_SQ8_T *Arg) { /* Wrapper: runs KerPar_MM_Conv2D_DxDy_HWC_SQ8_act on Arg with the activation fixed to ACT_RELUM; returns nothing */
+	KerPar_MM_Conv2D_DxDy_HWC_SQ8_act(Arg, ACT_RELUM); /* callee is static always_inline; presumably specialized for this constant selector -- TODO confirm */
 }
 
-void KerPar_MM_ConvDW2D_HWC_SQ8(
-	Ker_MM_Conv_SQ8_T *Arg
-	)
+void KerPar_MM_Conv2D_DxDy_ReLUMN_HWC_SQ8(Ker_MM_Conv_SQ8_T *Arg) { /* Wrapper: runs KerPar_MM_Conv2D_DxDy_HWC_SQ8_act on Arg with the activation fixed to ACT_RELUMN; returns nothing */
+	KerPar_MM_Conv2D_DxDy_HWC_SQ8_act(Arg, ACT_RELUMN); /* callee is static always_inline; presumably specialized for this constant selector -- TODO confirm */
+}
 
-{
-	/*
-		For HWC weights (4D Tensor) are expected to be organized as [OutFeat x Fy x Fx x InFeat]
-	*/
-	signed char *__restrict__ In = Arg->In;
-	int W = Arg->W, H = Arg->H;
-	signed char *__restrict__ Filter = Arg->Filter;
-	int Fx = Arg->Fx, Sx = Arg->Sx;
-	int Fy = Arg->Fy, Sy = Arg->Sy;
-	int PadL = Arg->Pad[0], PadT = Arg->Pad[2];
-	int InFeat = Arg->InFeat, OutFeat = Arg->OutFeat;
-        int * __restrict__ Bias = Arg->Bias;
-	int NormBias = Arg->Infos[AT_INF_BIASN];
-        signed char * __restrict__ Out = Arg->Out;
-        unsigned char * __restrict__ Scale = Arg->Scale;
-        unsigned char * __restrict__ ScaleN = Arg->ScaleN;
-        signed char * __restrict__ ColBuff = Arg->ColBuff;
-	int Wo = Arg->Wo, Ho = Arg->Ho;
-	int A0 = Arg->Infos[AT_INF_A0];
+void KerPar_MM_Conv2D_DxDy_LeakyReLU_HWC_SQ8(Ker_MM_Conv_SQ8_T *Arg) { /* Wrapper: runs KerPar_MM_Conv2D_DxDy_HWC_SQ8_act on Arg with the activation fixed to ACT_LEAKYRELU; returns nothing */
+	KerPar_MM_Conv2D_DxDy_HWC_SQ8_act(Arg, ACT_LEAKYRELU); /* callee is static always_inline; presumably specialized for this constant selector -- TODO confirm */
+}
 
-	/* ColBuff must be large enough to accomodate Align(Fx*InFeat, 8) elements */
-	v4s * __restrict__ VBuff = (v4s *) ColBuff;
-	unsigned int W_In1 = InFeat*Fx*Fy;
-	unsigned int CoreId = gap_coreid(), C = ChunkSize(InFeat), F = Min(CoreId*C, InFeat), L = Min(InFeat, F+C);
-	//unsigned int ChunkCell = ChunkSize(OutFeat), First = CoreId*ChunkCell, Last  = Min(OutFeat, First+ChunkCell);
+void KerPar_MM_Conv2D_DxDy_HSwish_HWC_SQ8(Ker_MM_Conv_SQ8_T *Arg) { /* Wrapper: runs KerPar_MM_Conv2D_DxDy_HWC_SQ8_act on Arg with the activation fixed to ACT_HSWISH; returns nothing */
+	KerPar_MM_Conv2D_DxDy_HWC_SQ8_act(Arg, ACT_HSWISH); /* callee is static always_inline; presumably specialized for this constant selector -- TODO confirm */
+}
 
-	int FS = Fx*Fy;
-	int PosL = Arg->FirstTile?(-PadT):0;
+void KerPar_MM_Conv2D_DxDy_HSigmoid_HWC_SQ8(Ker_MM_Conv_SQ8_T *Arg) { /* Wrapper: runs KerPar_MM_Conv2D_DxDy_HWC_SQ8_act on Arg with the activation fixed to ACT_HSIGMOID; returns nothing */
+	KerPar_MM_Conv2D_DxDy_HWC_SQ8_act(Arg, ACT_HSIGMOID); /* callee is static always_inline; presumably specialized for this constant selector -- TODO confirm */
+}
 
-	for (int l=0; l<Ho; l++) {
-		int PosC = -PadL;
-		int Tb = Max(PosL, 0), Db = Min(PosL+Fy, H);
-		int OffL = -Tb - Min(PosL, 0);
-		for (int c=0; c<Wo; c++) {
-			for (int i=F; i<L; i++) ColBuff[i] = 0;
-			int Lb = Max(PosC, 0), Rb = Min(PosC+Fx, W);
-			int OffC = -Lb - Min(PosC, 0);
-                        int Iter = L-F;
-                        // Transpose the input from HWxC -> CxHW: InFeatxFyxFx
-                        for (int c=F; c<L; c++)
-				for (int j=Tb; j<Db; j++) 
-					for (int i=Lb; i<Rb; i++)
-						ColBuff[c*FS + (j+OffL)*Fy + (i+OffC)] = In[j*W*InFeat + i*InFeat + c];
+void KerPar_MM_Conv2D_DxDy_Sigmoid_HWC_SQ8(Ker_MM_Conv_SQ8_T *Arg) { /* Wrapper: runs KerPar_MM_Conv2D_DxDy_HWC_SQ8_act on Arg with the activation fixed to ACT_SIGMOID; returns nothing */
+	KerPar_MM_Conv2D_DxDy_HWC_SQ8_act(Arg, ACT_SIGMOID); /* callee is static always_inline; presumably specialized for this constant selector -- TODO confirm */
+}
 
-			PosC += Sx;
-			gap_waitbarrier(0);
-	                for (int Line=F; Line<L; Line++) {
-	                        v4s *VIn1 = (v4s *) (&Filter[Line*FS]);
-	                        v4s *VBuff = (v4s *) (&ColBuff[Line*FS]);
-	                        int S0 = (Bias[Line]<<NormBias);
-	                        for (int i=0; i<(FS/8); i++){
-	                                v4s V0 = VIn1[2*i] , V1 = VIn1[2*i+1];
-					v4s C0 = VBuff[2*i], C1 = VBuff[2*i+1];
-	                                S0 = gap_sumdotp4(V0, C0, S0);
-	                                S0 = gap_sumdotp4(V1, C1, S0);
-	                        }
-	                        for (int i=(FS/8)*8; i<FS; i++) S0 += ColBuff[i]*Filter[Line*FS+i];
-	                        unsigned int Sc = Scale[Line], ScN = ScaleN[Line];
-	                        Out[l*OutFeat*Wo+c*OutFeat+Line] = gap_clip(AT_SCALE(S0, Sc, ScN), 7);
-	                }
-			gap_waitbarrier(0);
-		}
-		PosL += Sy;
-	}
-	gap_waitbarrier(0);
+void KerPar_MM_Conv2D_DxDy_Tanh_HWC_SQ8(Ker_MM_Conv_SQ8_T *Arg) { /* Wrapper: runs KerPar_MM_Conv2D_DxDy_HWC_SQ8_act on Arg with the activation fixed to ACT_TANH; returns nothing */
+	KerPar_MM_Conv2D_DxDy_HWC_SQ8_act(Arg, ACT_TANH); /* callee is static always_inline; presumably specialized for this constant selector -- TODO confirm */
 }
-#endif
+
 #pragma GCC diagnostic pop
diff --git a/tools/autotiler_v3/DSP_Generators/DSP_Generators.c b/tools/autotiler_v3/DSP_Generators/DSP_Generators.c
index 5d53db35e..06a9b3658 100644
--- a/tools/autotiler_v3/DSP_Generators/DSP_Generators.c
+++ b/tools/autotiler_v3/DSP_Generators/DSP_Generators.c
@@ -738,6 +738,11 @@ int MFCC_Generator(
 	printf("MfccKernel:	%25s\n", MfccKernel?MfccKernel:"");
 	printf("LogKernel:	%25s\n", LogKernel?LogKernel:"");
 	printf("DCTKernel:	%25s\n", DCTKernel?DCTKernel:"");
+	printf("\tN Frames: %d\n", NFrames);
+	printf("\tFrame Size: %d\n", FrameSize);
+	printf("\tFrame Stride: %d\n", FrameStride);
+	printf("\tIn Data Size: %d\n", InItemSize);
+	printf("\tTot Input Size: %d\n", FrameStride * (NFrames-1) + FrameSize);
 	printf("\tNb Oper: %d\n", LayerOp);
 	printf("\tBandwidth: %d\n", LayerBandwidth);
 	// printf("MFCC_COEF_DYN = %d\nFFT_BITS = %d\nUSE_DB = %d\nDATA_TYPE = %dLOG OFFSET %f\n", MFCC_Coeff_Dyn, Log2Nfft, UseDB, DataType, MfccLogOffset);
diff --git a/tools/autotiler_v3/DSP_Libraries/FFT_Library.c b/tools/autotiler_v3/DSP_Libraries/FFT_Library.c
index 99d5d95f3..9a6e96b2c 100644
--- a/tools/autotiler_v3/DSP_Libraries/FFT_Library.c
+++ b/tools/autotiler_v3/DSP_Libraries/FFT_Library.c
@@ -492,7 +492,7 @@ void Radix4FFT_DIF_Par_Fix16(FFT_Arg_T *Arg)
         int iCnt1, iCnt2, iCnt3,
             iL,    iM,    iQ,
             iA,    iB,    iC,     iD;
-        unsigned int iLog4N  = (gap_fl1(N_fft))>>1;
+        int iLog4N  = (gap_fl1(N_fft))>>1;
         v2s *DataV  = (v2s *) Data;
         v2s *CoeffV = (v2s *) Twiddles;
         unsigned int CoreId;
@@ -580,7 +580,7 @@ void Radix4FFT_DIF_Par_Fix32(FFT_Arg_T *Arg)
         int iCnt1, iCnt2, iCnt3,
             iL,    iM,    iQ,
             iA,    iB,    iC,     iD;
-        unsigned int iLog4N  = (gap_fl1(N_fft))>>1;
+        int iLog4N  = (gap_fl1(N_fft))>>1;
         unsigned int CoreId;
         int First, Last, Chunk;
 
@@ -656,7 +656,7 @@ void Radix4FFT_DIF_Par_f16(FFT_Arg_T *Arg)
         int iCnt1, iCnt2, iCnt3,
             iL,    iM,    iQ,
             iA,    iB,    iC,     iD;
-        unsigned int iLog4N  = (gap_fl1(N_fft))>>1;
+        int iLog4N  = (gap_fl1(N_fft))>>1;
         F16V_DSP *DataV  = (F16V_DSP *) Data;
         F16V_DSP *CoeffV = (F16V_DSP *) Twiddles;
         unsigned int CoreId;
@@ -746,7 +746,7 @@ void Radix4FFT_DIF_Par_f32(FFT_Arg_T *Arg)
         int iCnt1, iCnt2, iCnt3,
             iL,    iM,    iQ,
             iA,    iB,    iC,     iD;
-        unsigned int iLog4N  = (gap_fl1(N_fft))>>1;
+        int iLog4N  = (gap_fl1(N_fft))>>1;
         unsigned int CoreId;
         int First, Last, Chunk;
 
@@ -1428,7 +1428,7 @@ void Radix2FFT_DIF_Par_Fix32_Scal(FFT_scal_Arg_T *Arg)
         // reset the shift table
         Chunk = N_fft/nbcore;
         First =  CoreId*Chunk; Last = Min( First+Chunk,N_fft);
-        for (int i = First; i < Last; i++) shift_fft[i]=0; 
+        for (unsigned int i = First; i < Last; i++) shift_fft[i]=0; 
         gap_waitbarrier(0);
 
         // compute fft 
@@ -1656,14 +1656,17 @@ void RFFT_DIF_Par_Fix16(RFFT_Arg_T *Arg){
         if (CoreId == 0){
                 xBR = pB[0][0];
                 xBI = pB[0][1];
-                xAR = pA[0][0];
-                xAI = pA[0][1];
          
                 // real(tw * (xB - xA)) = twR * (xBR - xAR) - twI * (xBI - xAI);
                 // imag(tw * (xB - xA)) = twI * (xBR - xAR) + twR * (xBI - xAI);
-                RFFT_Out[0][0] = ( xBR + xAR + xBI + xAI ) >> 2;
-                RFFT_Out[0][1] = ( xAI - xBI + xBR - xAR ) >> 2;
                 // XA(1) = 1/2*( U1 - imag(U2) +  i*( U1 +imag(U2) ));
+                RFFT_Out[0][0] = ( xBR + xBI ) >> 1;
+                RFFT_Out[0][1] = 0;
+
+                // Gr(N) = Gr(0) - Gi(0)
+                // Gi(N) = 0
+                RFFT_Out[k+1][0] = ( xBR - xBI );
+                RFFT_Out[k+1][1] = 0;
         }
         gap_waitbarrier(0);
 
@@ -1698,16 +1701,6 @@ void RFFT_DIF_Par_Fix16(RFFT_Arg_T *Arg){
                 t2 = gap_add2div4( xA, gap_cplxconj(xB));
                 RFFT_Out[i] = gap_cplxmuls(tw, t1) + t2;
         }
-        if (CoreId == 0){
-                xBR = pB[-(k-1)][0];
-                xBI = pB[-(k-1)][1];
-                xAR = pA[ (k-1)][0];
-                xAI = pA[ (k-1)][1];
-                RFFT_Out[k][0] = ( xBR + xAR - xBI - xAI ) >> 2;
-                // TODO - CHECK
-                // RFFT_Out[k][1] = ( xAI - xBI - xBR + xAR ) >> 2;
-                RFFT_Out[k][1] = 0;
-        }
         gap_waitbarrier(0);
 #ifdef PRINTDEB
 if (CoreId==0){
@@ -1854,14 +1847,14 @@ void RFFT_DIF_Par_f16(RFFT_Arg_T *Arg){
         if (CoreId == 0){
                 xBR = pB[0][0];
                 xBI = pB[0][1];
-                xAR = pA[0][0];
-                xAI = pA[0][1];
          
                 // real(tw * (xB - xA)) = twR * (xBR - xAR) - twI * (xBI - xAI);
                 // imag(tw * (xB - xA)) = twI * (xBR - xAR) + twR * (xBI - xAI);
-                RFFT_Out[0][0] = 0.5f * ( xBR + xAR + xBI + xAI );
-                RFFT_Out[0][1] = 0.5f * ( xAI - xBI + xBR - xAR );
+                RFFT_Out[0][0] = xBR + xBI;
+                RFFT_Out[0][1] = 0.0f;
                 // XA(1) = 1/2*( U1 - imag(U2) +  i*( U1 +imag(U2) ));
+                RFFT_Out[k+1][0] = xBR - xBI;
+                RFFT_Out[k+1][1] = 0.0f;
         }
         gap_waitbarrier(0);
 
@@ -1897,15 +1890,6 @@ void RFFT_DIF_Par_f16(RFFT_Arg_T *Arg){
                 t2 = t2 + xA;
                 RFFT_Out[i] = (CplxMult_f16(tw, t1) + t2) * (F16V_DSP) {0.5f, 0.5f};
         }
-        if (CoreId == 0){
-                xBR = pB[-(k-1)][0];
-                xBI = pB[-(k-1)][1];
-                xAR = pA[(k-1)][0];
-                xAI = pA[(k-1)][1];
-                RFFT_Out[k][0] = 0.5f * ( xBR + xAR - xBI - xAI );
-                // RFFT_Out[k][1] = 0.5f * ( xAI - xBI - xBR + xAR );
-                RFFT_Out[k][1] = 0.0f;
-        }
         gap_waitbarrier(0);
 #ifdef PRINTDEB
 if (CoreId==0){
@@ -1947,10 +1931,10 @@ void RFFT_DIF_Par_f32(RFFT_Arg_T *Arg){
         if (CoreId == 0){
                 xBR = pB[0];
                 xBI = pB[1];
-                xAR = pA[0];
-                xAI = pA[1];
-                RFFT_Out[0] = 0.5f * ( xBR + xAR + xBI + xAI );
-                RFFT_Out[1] = 0.5f * ( xAI - xBI + xBR - xAR );
+                RFFT_Out[0] = xBR + xBI;
+                RFFT_Out[1] = 0.0f;
+                RFFT_Out[2*(k+1)]   = xBR - xBI;
+                RFFT_Out[2*(k+1)+1] = 0.0f;
         }
         gap_waitbarrier(0);
 
@@ -1998,16 +1982,6 @@ void RFFT_DIF_Par_f32(RFFT_Arg_T *Arg){
 
                 RFFT_Out[2*i]   = 0.5f * (xAR + xBR + p0 + p3 ); //xAR
                 RFFT_Out[2*i+1] = 0.5f * (xAI - xBI + p1 - p2 ); //xAI
-                // printf("%d %f %f\n", i, RFFT_Out[2*i] ,RFFT_Out[2*i+1] );
-        }
-        if (CoreId == 0){
-                xBR = pB[-2*(k-1)];
-                xBI = pB[-2*(k-1)+1];
-                xAR = pA[2*(k-1)];
-                xAI = pA[2*(k-1)+1];
-                RFFT_Out[2*k]   = 0.5f * ( xBR + xAR - xBI - xAI );
-                RFFT_Out[2*k+1] = 0.0f;
-                // RFFT_Out[2*k+1] = 0.5f * ( xAI - xBI - xBR + xAR );
         }
         gap_waitbarrier(0);
 #ifdef PRINTDEB
diff --git a/tools/autotiler_v3/DSP_Libraries/MfccBasicKernels.c b/tools/autotiler_v3/DSP_Libraries/MfccBasicKernels.c
index f1f31e2cf..e00ff23c0 100644
--- a/tools/autotiler_v3/DSP_Libraries/MfccBasicKernels.c
+++ b/tools/autotiler_v3/DSP_Libraries/MfccBasicKernels.c
@@ -35,7 +35,7 @@ void MelFilterBank_Fix32(MelFilterBank_T *Arg)
         unsigned short int    *__restrict__ Mel_Coeffs     = (unsigned short int *__restrict__)    Arg->Mel_Coeffs;
         signed char  *__restrict__ shift_buff     = (signed char *__restrict__)  Arg->shift_buff;
         fbank_type_t *__restrict__ Mel_FilterBank = (fbank_type_t *__restrict__) Arg->Mel_FilterBank;
-        short int                  Mel_NBanks     = Arg->Mel_NBanks;
+        unsigned int               Mel_NBanks     = (unsigned int) Arg->Mel_NBanks;
         short int                  Mel_Coeff_Dyn  = Arg->Mel_Coeff_dyn;
         unsigned int Chunk, First, Last, CoreId=gap_coreid();
 
@@ -86,7 +86,7 @@ void MelFilterBank_Fix32_Scal(MelFilterBank_T *Arg)
         signed char  *__restrict__ shift_fft      = (signed char *__restrict__)  Arg->shift_fft;
         short int    *__restrict__ Mel_Coeffs     = (short int *__restrict__)    Arg->Mel_Coeffs;
         fbank_type_t *__restrict__ Mel_FilterBank = (fbank_type_t *__restrict__) Arg->Mel_FilterBank;
-        short int                  Mel_NBanks     = Arg->Mel_NBanks;
+        unsigned int               Mel_NBanks     = (unsigned int) Arg->Mel_NBanks;
         short int                  Mel_Coeff_Dyn  = Arg->Mel_Coeff_dyn;
         signed char IsMagSquared = Arg->IsMagSquared;
         int MUL_EXP = IsMagSquared?2:1;
@@ -112,7 +112,7 @@ void MelFilterBank_Fix32_Scal(MelFilterBank_T *Arg)
                 }
                 // align the block scaling on the min , compute the max value in the block
                 for (k=0, j=Mel_FilterBank[i].Start; k<(unsigned int) NonZeroItems; j++, k++) {
-                        int TMP = FramePower[j] >> (MUL_EXP * (shift_fft[j] - min_shift));
+                        unsigned int TMP = FramePower[j] >> (MUL_EXP * (shift_fft[j] - min_shift));
                         if (TMP > (unsigned int) maxin) maxin = TMP;
                 }
 
@@ -149,7 +149,7 @@ void MelFilterBank_f16(MelFilterBank_T *Arg)
         F16_DSP *__restrict__ Mel_Spectr = (F16_DSP *__restrict__) Arg->MelSpectr;
         F16_DSP *__restrict__ Mel_Coeffs = (F16_DSP *__restrict__) Arg->Mel_Coeffs;
         fbank_type_t *__restrict__ Mel_FilterBank = (fbank_type_t *__restrict__) Arg->Mel_FilterBank;
-        short int                  Mel_NBanks     = Arg->Mel_NBanks;
+        unsigned int               Mel_NBanks     = (unsigned int) Arg->Mel_NBanks;
 
         unsigned int Chunk, First, Last, CoreId=gap_coreid();
         //Chunk = ChunkSize(Mel_NBanks);
@@ -183,7 +183,7 @@ void MelFilterBank_f32(MelFilterBank_T *Arg)
         float *__restrict__ Mel_Spectr = (float *__restrict__) Arg->MelSpectr;
         float *__restrict__ Mel_Coeffs = (float *__restrict__) Arg->Mel_Coeffs;
         fbank_type_t *__restrict__ Mel_FilterBank = (fbank_type_t *__restrict__) Arg->Mel_FilterBank;
-        short int                  Mel_NBanks     = Arg->Mel_NBanks;
+        unsigned int               Mel_NBanks     = (unsigned int) Arg->Mel_NBanks;
 
         unsigned int Chunk, First, Last, CoreId=gap_coreid();
         //Chunk = ChunkSize(Mel_NBanks);
@@ -211,7 +211,7 @@ void MelFilterBank_f32(MelFilterBank_T *Arg)
 
 void MFCC_ComputeLog_Fix32(MFCC_Log_T *Arg)
 {
-        int i;
+        unsigned int i;
         int         size      = Arg->FrameSize;
         unsigned int *frameIn = (unsigned int *) Arg->FrameIn;
         short int *frameOut = (short int *) Arg->FrameOut;
@@ -269,7 +269,7 @@ void MFCC_ComputeLog_Fix32(MFCC_Log_T *Arg)
 
 void MFCC_ComputeLog_Fix32_Scal(MFCC_Log_T *Arg)
 {
-        int i;
+        unsigned int i;
         int         size      = Arg->FrameSize;
         unsigned int *frameIn = (unsigned int *) Arg->FrameIn;
         short int   *frameOut = (short int *) Arg->FrameOut;
@@ -327,7 +327,7 @@ void MFCC_ComputeLog_Fix32_Scal(MFCC_Log_T *Arg)
 
 void MFCC_ComputeLog_f16( MFCC_LogF_T *Arg)
 {        
-        int i;
+        unsigned int i;
         int size      = Arg->FrameSize;
         F16_DSP *frameIn  = (F16_DSP *) Arg->FrameIn;
         F16_DSP *frameOut = (F16_DSP *) Arg->FrameOut;
@@ -360,7 +360,7 @@ void MFCC_ComputeLog_f16( MFCC_LogF_T *Arg)
 
 void MFCC_ComputeLog_f32(MFCC_LogF_T *Arg) 
 {
-        int i;
+        unsigned int i;
         int   size      = Arg->FrameSize;
         float *frameIn  = (float *) Arg->FrameIn;
         float *frameOut = (float *) Arg->FrameOut;
@@ -392,7 +392,7 @@ void MFCC_ComputeLog_f32(MFCC_LogF_T *Arg)
 
 void MFCC_ComputeDB_Fix32(MFCC_Log_T *Arg)
 {
-        int i;
+        unsigned int i;
         int         size      = Arg->FrameSize;
         unsigned int *frameIn = (unsigned int *) Arg->FrameIn;
         short int   *frameOut = (short int *) Arg->FrameOut;
@@ -450,7 +450,7 @@ void MFCC_ComputeDB_Fix32(MFCC_Log_T *Arg)
 
 void MFCC_ComputeDB_Fix32_Scal(MFCC_Log_T *Arg)
 {
-        int i;
+        unsigned int i;
         int         size      = Arg->FrameSize;
         unsigned int *frameIn = (unsigned int *) Arg->FrameIn;
         short int   *frameOut = (short int *) Arg->FrameOut;
@@ -508,7 +508,7 @@ void MFCC_ComputeDB_Fix32_Scal(MFCC_Log_T *Arg)
 
 void MFCC_ComputeDB_f16( MFCC_LogF_T *Arg)
 {        
-        int i;
+        unsigned int i;
         int size      = Arg->FrameSize;
         F16_DSP *frameIn  = (F16_DSP *) Arg->FrameIn;
         F16_DSP *frameOut = (F16_DSP *) Arg->FrameOut;
@@ -540,7 +540,7 @@ void MFCC_ComputeDB_f16( MFCC_LogF_T *Arg)
 
 void MFCC_ComputeDB_f32(MFCC_LogF_T *Arg) 
 {
-        int i;
+        unsigned int i;
         int    size     = Arg->FrameSize;
         float *frameIn  = (float *) Arg->FrameIn;
         float *frameOut = (float *) Arg->FrameOut;
@@ -577,7 +577,7 @@ void norm_clip_16(Norm_Clip_args_T *Args)
         short int Norm = Args->Norm;
         int N = Args->N;
   
-        int i;
+        unsigned int i;
         unsigned int Chunk, First, Last, CoreId=gap_coreid();
 
         if (CoreId==0) {
@@ -604,7 +604,7 @@ void norm_clip_32_melspect(MFCC_Clip_32_T *Args) {
         unsigned int Chunk, First, Last, CoreId=gap_coreid();
 
         if (CoreId==0){
-                for (i=0; i<(unsigned int)N; i++) {
+                for (i=0; i<N; i++) {
                         int Qformat = (30 - shift_buff[i]);
                         Norm = Qformat - 16; //POWER
                         Out[i] = (Norm<32)?(In[i] >> Norm):0;
@@ -635,12 +635,12 @@ void norm_clip_32_melspect_scal(MFCC_Clip_32_T *Args)
 
         if (CoreId==0){
                 if (IsMagSquared){
-                        for (i=0; i<(unsigned int)N; i++) {
+                        for (i=0; i<N; i++) {
                                 Norm = Mel_Coeff_Dyn-2-shift_buff[i]+2*ExtraQ - 16; //POWER HIGH_PREC
                                 Out[i] = (Norm<32)?(In[i] >> Norm):0;
                         }
                 } else {
-                        for (i=0; i<(unsigned int)N; i++) {
+                        for (i=0; i<N; i++) {
                                 Norm = Mel_Coeff_Dyn-1-shift_buff[i]+ExtraQ - 16; //Abs HIGH_PREC
                                 Out[i] = (Norm<32)?(In[i] >> Norm):0;
                         }                        
@@ -660,8 +660,8 @@ void MFCC_ComputeDCT_II_Fix16(DCT_II_Arg_T *Args)
         v2s * in_dct   = (v2s * __restrict__ ) Args->Data;
         short int * DCTCoeff = (short int * __restrict__) Args->DCTCoeff;
         short int * FeatList = (short int * __restrict__ ) Args->FeatList;
-        short int NDCT       = Args->n_dct;
-        short int NInputs    = Args->n_input;
+        unsigned int NDCT  = (unsigned int) Args->n_dct;
+        unsigned int NInputs = (unsigned int) Args->n_input;
 
         unsigned int Chunk, First, Last, CoreId=gap_coreid();
 
@@ -692,8 +692,8 @@ void MFCC_ComputeDCT_II_f16(DCT_II_Arg_T *Args)
         F16V_DSP * in_dct  = (F16V_DSP * __restrict__ ) Args->Data;
         F16_DSP * FeatList = (F16_DSP * __restrict__ ) Args->FeatList;
         F16_DSP * DCTCoeff = (F16_DSP * __restrict__) Args->DCTCoeff;
-        short int NDCT       = Args->n_dct;
-        short int NInputs    = Args->n_input;
+        unsigned int NDCT  = (unsigned int) Args->n_dct;
+        unsigned int NInputs = (unsigned int) Args->n_input;
 
         unsigned int Chunk, First, Last, CoreId=gap_coreid();
 
@@ -725,8 +725,8 @@ void MFCC_ComputeDCT_II_f32(DCT_II_Arg_T *Args)
         float * in_dct   = (float * __restrict__ ) Args->Data;
         float * FeatList = (float * __restrict__ ) Args->FeatList;
         float * DCTCoeff = (float * __restrict__) Args->DCTCoeff;
-        short int NDCT       = Args->n_dct;
-        short int NInputs    = Args->n_input;
+        unsigned int NDCT  = (unsigned int) Args->n_dct;
+        unsigned int NInputs = (unsigned int) Args->n_input;
 
         unsigned int Chunk, First, Last, CoreId=gap_coreid();
 
diff --git a/tools/autotiler_v3/DSP_Libraries/PreProcessing.c b/tools/autotiler_v3/DSP_Libraries/PreProcessing.c
index 8d39cca00..e4dae722e 100644
--- a/tools/autotiler_v3/DSP_Libraries/PreProcessing.c
+++ b/tools/autotiler_v3/DSP_Libraries/PreProcessing.c
@@ -27,7 +27,7 @@ void get_max(PreEmphasis_T *Arg)
         maxin[CoreId]=0;
         if (CoreId==0) maxin[0] = Abs(Arg->Prev);
 
-        for (int j=First;j<Last;j++) {
+        for (unsigned int j=First;j<Last;j++) {
                 if(Abs((int)Frame[j])>maxin[CoreId]) maxin[CoreId]=Abs((int)Frame[j]);
         }
         gap_waitbarrier(0);
diff --git a/tools/autotiler_v3/Emulation/GapSystem.h b/tools/autotiler_v3/Emulation/GapSystem.h
index 558bb6688..4729a268e 100644
--- a/tools/autotiler_v3/Emulation/GapSystem.h
+++ b/tools/autotiler_v3/Emulation/GapSystem.h
@@ -98,6 +98,8 @@ static int Private_call(void (*fn)(void *), void * arg, __event_cb * event)
 #define gap_setupbarrier(BarN, CoreM)
 #define gap_waitbarrier(BarN)
 #define gap_waitbarrier_cc(BarN)
+#define gap_cl_critical_enter()
+#define gap_cl_critical_exit()
 
 #define rt_event_sched_init(x)
 #define rt_event_alloc(x,y) 0
@@ -190,7 +192,8 @@ static inline void __cl_dma_memcpy_2d(uint32_t ext, uint32_t loc, uint16_t size,
 #define gap_waitbarrier_cc()            eu_bar_trig_wait_clr(eu_bar_addr(1))
 
 #endif
-
+#define gap_cl_critical_enter()         pi_cl_team_critical_enter()
+#define gap_cl_critical_exit()          pi_cl_team_critical_exit()
 #endif
 
 
diff --git a/tools/autotiler_v3/Makefile b/tools/autotiler_v3/Makefile
index dc12e7fd7..0abf4ba65 100644
--- a/tools/autotiler_v3/Makefile
+++ b/tools/autotiler_v3/Makefile
@@ -1,4 +1,4 @@
-TILER_VER=4.3.0
+TILER_VER=4.3.1
 export TILER_LIB=libtile.${TILER_VER}.a
 ifdef GAP_SDK_HOME
 export TILER_URL=$(GAP_SDK_HOME)/.tiler_url
diff --git a/tools/autotiler_v3/version.cfg b/tools/autotiler_v3/version.cfg
index 4a4782aee..332f897c0 100644
--- a/tools/autotiler_v3/version.cfg
+++ b/tools/autotiler_v3/version.cfg
@@ -3,7 +3,7 @@
         {
             "version": "autotiler-v3",
             "magicNum": 718930176,
-            "git-hash": "de73fc4e0db316fa61057a8e1c9cfde47a75b6c0"
+            "git-hash": "de88fbeb3017c0db55f1e86e49cce5a0160ccbe5"
         }
     ]
 }
\ No newline at end of file
diff --git a/tools/jenkins/gap_sdk_version.txt b/tools/jenkins/gap_sdk_version.txt
index f8268c0f2..59f52fae3 100644
--- a/tools/jenkins/gap_sdk_version.txt
+++ b/tools/jenkins/gap_sdk_version.txt
@@ -1 +1 @@
-9240e025d9f6a0efa51ad259adea0ae1287f6610
+9af2d93598d20541f4c18ba45e2124b767be2388
diff --git a/tools/nntool/execution/graph_executer.py b/tools/nntool/execution/graph_executer.py
index 9e15d7393..297ea50e3 100644
--- a/tools/nntool/execution/graph_executer.py
+++ b/tools/nntool/execution/graph_executer.py
@@ -73,7 +73,7 @@ def execute_qnoq_iterator(self,
             G = self._G
             saved_outputs = {}
 
-        for node in G.dfs():
+        for node in G.topological_sort():
             step_idx = node.step_idx
             if step_idx_limit is not None and step_idx > step_idx_limit:
                 break
@@ -166,7 +166,7 @@ def execute_iterator(self,
         if not silent:
             LOG.info("execute uncached: quantization mode %s", qmode)
             ExecutionProgress.start()
-        for node in G.dfs():
+        for node in G.topological_sort():
             step_idx = node.step_idx
             if step_idx_limit is not None and step_idx > step_idx_limit:
                 break
diff --git a/tools/nntool/execution/kernels/float/matrix_operations.py b/tools/nntool/execution/kernels/float/matrix_operations.py
index b2bbd97eb..fa84e469c 100644
--- a/tools/nntool/execution/kernels/float/matrix_operations.py
+++ b/tools/nntool/execution/kernels/float/matrix_operations.py
@@ -130,7 +130,7 @@ def execute(cls, params,
         in_tensors = qrec.prepare_inputs(params, in_tensors, ktype="float")
 
         if isinstance(params, MatMulTransposedParameters):
-            mat1, mat2 = in_tensors[0], np.transpose(in_tensors[1], (1, 0))
+            mat1, mat2 = in_tensors[0], np.swapaxes(in_tensors[1], -2, -1)
         else:
             mat1, mat2 = in_tensors[0], in_tensors[1]
 
diff --git a/tools/nntool/execution/kernels/float/ssd_postprocess.py b/tools/nntool/execution/kernels/float/ssd_postprocess.py
index 591ae806b..516abe6b2 100644
--- a/tools/nntool/execution/kernels/float/ssd_postprocess.py
+++ b/tools/nntool/execution/kernels/float/ssd_postprocess.py
@@ -106,7 +106,10 @@ def execute(cls, params,
         # out_boxes, out_scores, out_classes = cls.nms(
         #     params, qrec, decoded_bboxes, valid_scores)
         # out_count = np.array([sum(out_classes != 0)])
-        return qrec.get_outputs(params, [out_boxes, out_classes, out_scores], ktype="float")
+        outputs = [out_boxes, out_classes, out_scores]
+        if params.output_detection_count:
+            outputs.append(np.array([out_idx]))
+        return qrec.get_outputs(params, outputs, ktype="float")
 
 @params_type(NMSParameters)
 @qrec_type('float')
diff --git a/tools/nntool/execution/kernels/quant/activations.py b/tools/nntool/execution/kernels/quant/activations.py
index 2556d0f88..b146a8da9 100644
--- a/tools/nntool/execution/kernels/quant/activations.py
+++ b/tools/nntool/execution/kernels/quant/activations.py
@@ -71,13 +71,13 @@ def execute(cls, params,
             params, in_tensors, ktype="symmetric")[0]
         # compute_in_out_scale(qrec)
         in_tensor = in_tensor.astype(np.int32)
-        neg_in = at_norm(in_tensor * leak_mult_gen_factor_q7(params), 7)
+        neg_in = at_norm((in_tensor) * qrec.cache["leak_factor"], 7)
         in_tensor = in_tensor * (in_tensor > 0) + neg_in * (in_tensor < 0)
         scale_mul_biases_q = qrec.cache['scale_mul_biases_q']
-        in_tensor = scale_mul_biases_q.apply_scales(in_tensor)
-        if qrec.out_qs[0] != qrec.in_qs[0]:
-            return qrec.get_outputs(params, [qrec.out_qs[0].reduce_from(in_tensor, qrec.in_qs[0])], ktype="symmetric")
-        return qrec.get_outputs(params, [in_tensor], ktype="symmetric")
+        in_tensor = scale_mul_biases_q.apply_scales(in_tensor) + qrec.cache["zero_point"]
+        #if qrec.out_qs[0] != qrec.in_qs[0]:
+        #    return qrec.get_outputs(params, [qrec.out_qs[0].reduce_from(in_tensor, qrec.in_qs[0])], ktype="symmetric")
+        return qrec.get_outputs(params, [qrec.out_qs[0].clip(in_tensor)], ktype="symmetric")
 
 
 def sigmoid(params,
@@ -147,29 +147,7 @@ def hsigmoid_mult_gen_factors(params, qrec):
     return fac_1, upper_bound, lower_bound
 
 
-@params_type(HSigmoidActivationParameters)
-@qrec_type('scaled')
-class HSigmoidSymmetricMult(KernelBase):
-    @classmethod
-    def execute(cls, params,
-                in_tensors,
-                qrec: QRec,
-                **kwargs):
-        in_tensor = qrec.prepare_inputs(
-            params, in_tensors, ktype="symmetric")[0]
-        offset = qrec.cache['offset']
-        upper_bound = qrec.cache['upper_bound']
-        mult = qrec.cache['mult']
-        lower_bound = qrec.in_qs[0].zero_point
 
-        in_tensor = in_tensor.astype(np.int32)
-        in_tensor_relued = np.minimum(np.maximum(
-            in_tensor + offset, lower_bound), upper_bound) * mult
-        scale_mul_biases_q = qrec.cache['scale_mul_biases_q']
-        in_tensor = scale_mul_biases_q.apply_scales(in_tensor_relued)
-        return qrec.get_outputs(params,
-                                [in_tensor],
-                                ktype="symmetric")
 
 
 @params_type(HSigmoidActivationParameters)
@@ -230,14 +208,18 @@ def execute(cls, params,
             params, in_tensors, ktype="symmetric")[0]
         if in_tensor.dtype == np.int8:
             in_tensor = in_tensor.astype(np.int32) << 8
-
-        output = sigmoid_lut(in_tensor, q16_out=qrec.out_qs[0].dtype == np.uint16)
-        if qrec.out_qs[0].dtype == np.int8:
-            # compute_in_out_scale(qrec, extra_scale=QType.Pow2(
-            #     bits=32, q=7, signed=True).scale/qrec.in_qs[0].scale)
-            output >>= 8
+        elif in_tensor.dtype == np.uint8:
+            in_tensor = in_tensor.astype(np.int32) - qrec.in_qs[0].zero_point
+            in_tensor <<= 8
+        elif in_tensor.dtype == np.uint16:
+            in_tensor = in_tensor.astype(np.int32) - qrec.in_qs[0].zero_point
+        else:
+            in_tensor = in_tensor.astype(np.int32)
+
+        out_q15 = sigmoid_lut(in_tensor)
         scale_mul_biases_q = qrec.cache['scale_mul_biases_q']
-        output = scale_mul_biases_q.apply_scales(output)
+        outp = scale_mul_biases_q.apply_scales(out_q15) + qrec.cache['zero_point']
+        output = qrec.out_qs[0].clip(outp)
         return qrec.get_outputs(params,
                                 [output],
                                 ktype="symmetric")
@@ -279,12 +261,22 @@ def execute(cls, params,
                 **kwargs):
         in_tensor = qrec.prepare_inputs(
             params, in_tensors, ktype="symmetric")[0]
-        out_q15 = tanh_lut(in_tensor.astype(np.int32) << 8)
+        if in_tensor.dtype == np.int8:
+            in_tensor = in_tensor.astype(np.int32) << 8
+        elif in_tensor.dtype == np.uint8:
+            in_tensor = in_tensor.astype(np.int32) - qrec.in_qs[0].zero_point  # remove input zero point
+            in_tensor <<= 8
+        elif in_tensor.dtype == np.uint16:
+            in_tensor = in_tensor.astype(np.int32) - qrec.in_qs[0].zero_point  # remove input zero point
+        else:
+            in_tensor = in_tensor.astype(np.int32)
+
+        out_q15 = tanh_lut(in_tensor)
         # compute_in_out_scale(qrec, extra_scale=QType.Pow2(
         #     bits=32, q=7, signed=True).scale/qrec.in_qs[0].scale)
         scale_mul_biases_q = qrec.cache['scale_mul_biases_q']
-        output = scale_mul_biases_q.apply_scales(out_q15 >> 8)
-
+        outp = scale_mul_biases_q.apply_scales(out_q15) + qrec.out_qs[0].zero_point
+        output = qrec.out_qs[0].clip(outp)
         return qrec.get_outputs(params,
                                 [output],
                                 ktype="symmetric")
@@ -316,15 +308,6 @@ def execute(cls, params,
                                 ktype="symmetric")
 
 
-def hswish_mult_gen_factors(qrec):
-    in_q = qrec.in_qs[0]
-    fac_1 = in_q.quantize(np.array([3.]))
-    # The scale of the result is actually in in_scale * in_scale since it is multiplied by itself
-    compute_in_out_scale(qrec, extra_scale=qrec.in_qs[0].scale * 1/6)
-    upper_bound = in_q.quantize([6.])
-    lower_bound = in_q.quantize([0.])
-    return fac_1, upper_bound, lower_bound
-
 
 @params_type(HSwishActivationParameters)
 @qrec_type('scaled')
@@ -336,18 +319,50 @@ def execute(cls, params,
                 **kwargs):
         in_tensor = qrec.prepare_inputs(
             params, in_tensors, ktype="symmetric")[0]
-        fac_1, upper_bound, lower_bound = hswish_mult_gen_factors(qrec)
         in_tensor = in_tensor.astype(np.int32)
+
+        offset = qrec.cache['offset']
+        upper_bound = qrec.cache['upper_bound']
+        zero_point = qrec.cache['zero_point']
+
         in_tensor_relued = np.minimum(np.maximum(
-            in_tensor + fac_1, lower_bound), upper_bound)
+            in_tensor + offset, 0), upper_bound)
         scale_mul_biases_q = qrec.cache['scale_mul_biases_q']
         in_tensor = scale_mul_biases_q.apply_scales(
             in_tensor * in_tensor_relued)
+        in_tensor += zero_point
+        in_tensor = qrec.out_qs[0].clip(in_tensor)
         return qrec.get_outputs(params,
                                 [in_tensor],
                                 ktype="symmetric")
 
 
+@params_type(HSigmoidActivationParameters)
+@qrec_type('scaled')
+class HSigmoidSymmetricMult(KernelBase):
+    @classmethod
+    def execute(cls, params,
+                in_tensors,
+                qrec: QRec,
+                **kwargs):
+        in_tensor = qrec.prepare_inputs(
+            params, in_tensors, ktype="symmetric")[0]
+        in_tensor = in_tensor.astype(np.int32)
+
+        offset = qrec.cache['offset']
+        upper_bound = qrec.cache['upper_bound']
+        zero_point = qrec.cache['zero_point']
+
+        in_tensor_relued = np.minimum(np.maximum(
+            in_tensor + offset, 0), upper_bound)
+        scale_mul_biases_q = qrec.cache['scale_mul_biases_q']
+        in_tensor = scale_mul_biases_q.apply_scales(in_tensor_relued)
+        in_tensor += zero_point
+        in_tensor = qrec.out_qs[0].clip(in_tensor)
+        return qrec.get_outputs(params,
+                                [in_tensor],
+                                ktype="symmetric")
+
 @params_type(HSwishActivationParameters)
 @qrec_type('symmetric')
 class HSwishSymmetric(KernelBase):
diff --git a/tools/nntool/execution/kernels/quant/dsp_preprocessing.py b/tools/nntool/execution/kernels/quant/dsp_preprocessing.py
index c93a0bb24..ddf0263a1 100644
--- a/tools/nntool/execution/kernels/quant/dsp_preprocessing.py
+++ b/tools/nntool/execution/kernels/quant/dsp_preprocessing.py
@@ -16,13 +16,14 @@
 import logging
 
 import numpy as np
+from execution.kernels.kernel_base import KernelBase, params_type, qrec_type
 from graph.types import (MFCCPreprocessingParameters,
                          RFFT2DPreprocessingParameters)
-from execution.kernels.kernel_base import KernelBase, params_type, qrec_type
 from quantization.new_qrec import QRec
 from utils.at_norm import at_norm
 from utils.fft_quant import (Rad2_FFT_DIF_Fix16, Rad4_FFT_DIF_Fix16,
                              RFFT_Step_Fix16, SwapSamples)
+from utils.numpy_helpers import np_asscalar
 from utils.pow_sqrt import (LN_2_1F15, LN_10_INV_Q10, LOG10_2, gap_fl1,
                             logn_17_15, sqrt_17_15)
 
@@ -84,7 +85,7 @@ def melspectrogram_step(cls, params, in_data, filterbanks_sparsity, filterbank_c
             max_in = np.max(in_data[start:start+nonzero_items])
             logn_items = gap_fl1(nonzero_items)
             shift0 = gap_fl1(max_in) if max_in else 0
-            shift = np.asscalar(
+            shift = np_asscalar(
                 np.int32(shift0 + mel_coeff_q + logn_items - 31
                          if shift0 + mel_coeff_q + logn_items > 31 else 0))
             melbin = 0
@@ -130,9 +131,11 @@ def execute(cls, params,
                 qrec: QRec,
                 **kwargs):
         in_data = in_tensors[0]
-        fft_twiddles = np.stack([in_tensors[2][::2], in_tensors[2][1::2]], axis=0)
+        fft_twiddles = np.stack(
+            [in_tensors[2][::2], in_tensors[2][1::2]], axis=0)
         swap_table = in_tensors[3]
-        rfft_twiddles = np.stack([in_tensors[4][::2], in_tensors[4][1::2]], axis=0)
+        rfft_twiddles = np.stack(
+            [in_tensors[4][::2], in_tensors[4][1::2]], axis=0)
 
         spectrograms = []
         for frame_idx in range(params.n_frames):
@@ -164,9 +167,11 @@ def execute(cls, params,
                 in_tensors,
                 qrec: QRec,
                 **kwargs):
-        fft_twiddles = np.stack([in_tensors[2][::2], in_tensors[2][1::2]], axis=0)
+        fft_twiddles = np.stack(
+            [in_tensors[2][::2], in_tensors[2][1::2]], axis=0)
         swap_table = in_tensors[3]
-        rfft_twiddles = np.stack([in_tensors[4][::2], in_tensors[4][1::2]], axis=0)
+        rfft_twiddles = np.stack(
+            [in_tensors[4][::2], in_tensors[4][1::2]], axis=0)
 
         mel_filterbank_sparsity_mat = in_tensors[5]
         mel_filterbank_coeff = in_tensors[6]
diff --git a/tools/nntool/execution/kernels/quant/matrix_operations.py b/tools/nntool/execution/kernels/quant/matrix_operations.py
index 2116a2315..ba4c84ed6 100644
--- a/tools/nntool/execution/kernels/quant/matrix_operations.py
+++ b/tools/nntool/execution/kernels/quant/matrix_operations.py
@@ -193,7 +193,7 @@ def execute(cls, params,
         in_tensors = [in_tensor.astype(np.int32) for in_tensor in qrec.prepare_inputs(
             params, in_tensors, ktype="symmetric")]
         if isinstance(params, MatMulTransposedParameters):
-            mat1, mat2 = in_tensors[0], np.transpose(in_tensors[1], (1, 0))
+            mat1, mat2 = in_tensors[0], np.swapaxes(in_tensors[1], -2, -1)
         else:
             mat1, mat2 = in_tensors[0], in_tensors[1]
 
@@ -208,9 +208,10 @@ def execute(cls, params,
             biases = 0
 
         out_tensor = np.matmul(mat1, mat2) + biases
+        out_rank = len(out_tensor.shape)
         mul_biases_q = qrec.cache['mul_biases_q']
         scale_axis = None if len(mul_biases_q.scale) == 1 else \
-            (1 if isinstance(params, MatMulTransposedParameters) else 0)
+            (out_rank-1 if isinstance(params, MatMulTransposedParameters) else out_rank-2)
         out_tensor = mul_biases_q.apply_scales(out_tensor, scale_axis)
 
         return qrec.get_outputs(params, [out_tensor], ktype="symmetric")
@@ -228,7 +229,7 @@ def execute(cls, params,
             params, in_tensors, ktype="symmetric")]
 
         if isinstance(params, MatMulTransposedParameters):
-            mat1, mat2 = in_tensors[0], np.transpose(in_tensors[1], (1, 0))
+            mat1, mat2 = in_tensors[0], np.swapaxes(in_tensors[1], -2, -1)
         else:
             mat1, mat2 = in_tensors[0], in_tensors[1]
 
diff --git a/tools/nntool/execution/kernels/quant/ssd_postprocess.py b/tools/nntool/execution/kernels/quant/ssd_postprocess.py
index 911701401..3cb5a609f 100644
--- a/tools/nntool/execution/kernels/quant/ssd_postprocess.py
+++ b/tools/nntool/execution/kernels/quant/ssd_postprocess.py
@@ -131,7 +131,10 @@ def execute(cls, params,
         #     params, qrec, offsets, anchors, scores, anchors_type='centers')
         # out_boxes, out_scores, out_classes = cls.nms(params, qrec, decoded_bboxes, valid_scores)
         # out_count = np.array([sum(out_classes != 0)])
-        return qrec.get_outputs(params, [out_boxes, out_classes, out_scores], ktype="symmetric")
+        outputs = [out_boxes, out_classes, out_scores]
+        if params.output_detection_count:  # optional extra output requested by the node parameters
+            outputs.append(np.array([out_idx], dtype=np.int32))
+        return qrec.get_outputs(params, outputs, ktype="symmetric")
 
 @params_type(NMSParameters)
 @qrec_type('scaled')
diff --git a/tools/nntool/expressions/symbolic/basic.py b/tools/nntool/expressions/symbolic/basic.py
index c7917ff48..cdb9f9c2e 100644
--- a/tools/nntool/expressions/symbolic/basic.py
+++ b/tools/nntool/expressions/symbolic/basic.py
@@ -13,6 +13,8 @@
 # You should have received a copy of the GNU Affero General Public License
 # along with this program.  If not, see <https://www.gnu.org/licenses/>.
 
+import logging
+
 import numpy as np
 from bfloat16 import bfloat16
 from quantization.qtype import DTYPE_GAP_CTYPE
@@ -22,6 +24,8 @@
 from .symbol import (Constant, Rational, c_headers, copy_props, environment,
                      handles, handlesr, nargs)
 
+LOG = logging.getLogger('nntool.'+__name__)
+
 
 @nargs(2)
 @handles('__add__')
@@ -178,6 +182,7 @@ class GapAbs(Abs):
     def _c_expr(self, *args, **kwargs):
         return "gap_abs(%s)" % (args[0])
 
+
 @nargs(1)
 class Round(Function):
 
@@ -271,6 +276,7 @@ def _py_expr(self, *args, **kwargs):
     def _c_expr(self, *args, **kwargs):
         return "sqrtf(%s)" % (args[0],)
 
+
 @nargs(1)
 @c_headers('<math.h>')
 class RSqrt(Function):
@@ -284,6 +290,7 @@ def _py_expr(self, *args, **kwargs):
     def _c_expr(self, *args, **kwargs):
         return "1.0f/sqrtf(%s)" % (args[0],)
 
+
 @nargs(1)
 @c_headers('<math.h>')
 class Log(Function):
@@ -353,13 +360,14 @@ def _py_expr(self, *args, **kwargs):
     def _c_expr(self, *args, **kwargs):
         return f"square({args[0]}))"
 
+
 @nargs(2)
 @c_headers('<math.h>')
 class Pow(Function):
 
     def _impl(self, *args, **kwargs):
         if any(b < 0 and e < 1 for b, e in np.broadcast(*args)):
-            raise ValueError(
+            LOG.warning(
                 'fractional powers are being passed to a negative base for Pow operator')
         return np.power(args[0], args[1], dtype=self.dtype)
 
@@ -563,6 +571,7 @@ def _eval(self, *args, **kwargs):
             return self._eval_float_to_quant(*args, **kwargs)
         return self._eval_quant_to_float(*args, **kwargs)
 
+
 @nargs(2)
 class SquaredDifference(CompoundFunction):
     def _eval(self, *args, **kwargs):
diff --git a/tools/nntool/expressions/symbolic/function_collection.py b/tools/nntool/expressions/symbolic/function_collection.py
index d4c7e9774..ff6717ebd 100644
--- a/tools/nntool/expressions/symbolic/function_collection.py
+++ b/tools/nntool/expressions/symbolic/function_collection.py
@@ -123,6 +123,10 @@ def ops(self):
     def c_header_set(self):
         return set().union(*[func.c_header_set for func in self._functions.values()])
 
+    def set_var_shapes(self):
+        for var, func in self.functions.items():
+            var.shape = func.shape
+
     @staticmethod
     def split_indexes(unique_axis_groups):
         uaq = sorted(unique_axis_groups, key=len)
diff --git a/tools/nntool/expressions/symbolic/q15_quantization/clip_norm.py b/tools/nntool/expressions/symbolic/q15_quantization/clip_norm.py
index b3fd805e7..76fc4d5ed 100644
--- a/tools/nntool/expressions/symbolic/q15_quantization/clip_norm.py
+++ b/tools/nntool/expressions/symbolic/q15_quantization/clip_norm.py
@@ -63,6 +63,7 @@ class Norm(Function):
 
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
+        x=0
 
     def _impl(self, *args, **kwargs):
         dtype = self.dtype
diff --git a/tools/nntool/expressions/symbolic/q15_quantization/handlers.py b/tools/nntool/expressions/symbolic/q15_quantization/handlers.py
index 530f5f8b8..4cd591872 100644
--- a/tools/nntool/expressions/symbolic/q15_quantization/handlers.py
+++ b/tools/nntool/expressions/symbolic/q15_quantization/handlers.py
@@ -295,7 +295,8 @@ def _quantize(cls,
             prod_q, 15), max_val=prod_scale, min_val=-prod_scale)
         if prod_q > 15:
             qsym = Norm(sym_cls(*in_syms, dtype=np.int32),
-                        QuantizedConstant(prod_q - 15))
+                        QuantizedConstant(prod_q - 15),
+                        dtype=np.int32)
         else:
             qsym = sym_cls(*in_syms)
         return (qsym, out_qrec)
diff --git a/tools/nntool/expressions/symbolic/q15_quantization/scale_quantized.py b/tools/nntool/expressions/symbolic/q15_quantization/scale_quantized.py
index 76a4d77bd..30d812ee4 100644
--- a/tools/nntool/expressions/symbolic/q15_quantization/scale_quantized.py
+++ b/tools/nntool/expressions/symbolic/q15_quantization/scale_quantized.py
@@ -54,7 +54,7 @@ def _c_expr(self, *args, **kwargs):
 @c_headers('"Gap.h"')
 @copy_props('from_qrec', 'to_qrec', 'num_bits')
 class ScaleQuantized(CompoundFunction):
-    def __init__(self, *args, from_qrec=None, to_qrec=None, num_bits=15, **kwargs):
+    def __init__(self, *args, from_qrec=None, to_qrec=None, num_bits=8, **kwargs):
         self._from_qrec = from_qrec
         self._to_qrec = to_qrec
         self._qbias, self._qnorm = None, None
@@ -130,7 +130,8 @@ def _eval(self, *args, **kwargs):
                 ),
                 #pylint: disable=invalid-unary-operand-type
                 QuantizedConstant(-qnorm, dtype=np.int8),
-                name=self.name
+                name=self.name,
+                dtype=self._to_qrec.dtype
             )
         elif qnorm > 0:
             sym = Norm(
@@ -140,7 +141,8 @@ def _eval(self, *args, **kwargs):
                     dtype=self._to_qrec.dtype
                 ),
                 QuantizedConstant(qnorm, dtype=np.int8),
-                name=self.name
+                name=self.name,
+                dtype=self._to_qrec.dtype
             )
         else:
             sym = Mul(
diff --git a/tools/nntool/generation/at_generators/cnn_convolution_pool_relu.py b/tools/nntool/generation/at_generators/cnn_convolution_pool_relu.py
index 258fdc484..c5709b71c 100644
--- a/tools/nntool/generation/at_generators/cnn_convolution_pool_relu.py
+++ b/tools/nntool/generation/at_generators/cnn_convolution_pool_relu.py
@@ -139,6 +139,8 @@ def gen_activation_op(activation):
         aop = "KOP_LEAKYRELU"
     elif activation == "sigmoid":
         aop = "KOP_SIGMOID"
+    elif activation == "tanh":
+        aop = "KOP_TANH"
     else:
         raise NotImplementedError("activation type %s not implemented" % activation)
     return aop
diff --git a/tools/nntool/generation/at_types/at_params.py b/tools/nntool/generation/at_types/at_params.py
index a075d4c19..0ba4358cd 100644
--- a/tools/nntool/generation/at_types/at_params.py
+++ b/tools/nntool/generation/at_types/at_params.py
@@ -42,22 +42,22 @@ def gen_activation_op(activation, force_relu=False, asymmetric=False):
         else:
             aop = "KOP_RELUN" if not force_relu else "KOP_RELU"
     elif activation == "hsigmoid":
-        assert not asymmetric, 'asymmetric not supported'
+        #assert not asymmetric, 'asymmetric not supported'
         aop = "KOP_HSIGMOID"
     elif activation == "htanh":
-        assert not asymmetric, 'asymmetric not supported'
+        #assert not asymmetric, 'asymmetric not supported'
         aop = "KOP_HTANH"
     elif activation == "swish" or activation == "hswish":
-        assert not asymmetric, 'asymmetric not supported'
+        #assert not asymmetric, 'asymmetric not supported'
         aop = "KOP_HSWISH"
     elif activation == "leaky":
-        assert not asymmetric, 'asymmetric not supported'
+        #assert not asymmetric, 'asymmetric not supported'
         aop = "KOP_LEAKYRELU"
     elif activation == "sigmoid":
-        assert not asymmetric, 'asymmetric not supported'
+        #assert not asymmetric, 'asymmetric not supported'
         aop = "KOP_SIGMOID"
     elif activation == "tanh":
-        assert not asymmetric, 'asymmetric not supported'
+        #assert not asymmetric, 'asymmetric not supported'
         aop = "KOP_TANH"
     else:
         raise NotImplementedError("activation type %s not implemented" % activation)
diff --git a/tools/nntool/generation/code_generator.py b/tools/nntool/generation/code_generator.py
index 0490a2809..3a243be1c 100644
--- a/tools/nntool/generation/code_generator.py
+++ b/tools/nntool/generation/code_generator.py
@@ -18,15 +18,17 @@
 import numpy as np
 from bfloat16 import bfloat16
 from expressions.symbolic.kernel_codegen import BasicKernel
-from generation.new_generators import ne16
+from graph.manipulations.dimensions import add_dimensions
 from graph.types import (ConcatParameters, ConstantInputParameters,
                          InputParameters, OutputParameters, ReshapeParameters,
                          SplitParameters, TransposeParameters)
-from graph.types.lstm import LSTMParameters
-from graph.types.others import CopyParameters, QuantizeParameters
+from graph.types.base import NNEdge
+from graph.types.fusions import FusionBase
+from graph.types.others import CopyParameters, NoOPParameters, QuantizeParameters
 from graph.types.rnn import RNNBaseParameters
 from utils.node_id import NodeId
 
+from generation.gen_utils import ModelGenerationInternalError
 from generation.generator_decorators import RegisteredGeneratorsMixin
 # pylint: disable=wildcard-import,unused-wildcard-import
 from generation.generators import *
@@ -117,6 +119,17 @@ class CodeGenerator(NewGenerator, RegisteredGeneratorsMixin):
     def __init__(self, G, naming_convension, opts=None):
         super().__init__()
         self.G = G
+        # this generates a view of the graph with all nodes that are not generated removed
+        self.hidden_graph = G.with_hidden_nodes(
+            lambda node: node.no_model_code,
+            edge_class=NNEdge
+        )
+        self.sorted_nodes = sorted(
+            self.hidden_graph.nodes(), key=lambda node: node.step_idx)
+        naming_convension.G = self.hidden_graph
+        # the edge parameters are generated from the graph with the hidden nodes but the dimensions
+        # are not updated. They are read from the nodes
+        add_dimensions(self.hidden_graph, update_graph=False)
         self.naming_convension = naming_convension
         self.name_cache = NameCache()
         self.bindings = []
@@ -133,14 +146,22 @@ def __init__(self, G, naming_convension, opts=None):
             self.opts.update(opts)
         if self.opts['include_project_header']:
             self.include_files.append(self.project_name + '.h')
-        has_vcd = False
-        for step in G.graph_state.steps:
-            node = step['node']
-            if node.at_options.vcd_trace_on is not None:
-                has_vcd = True
-        if has_vcd:
+        if any(step and step['node'].at_options.vcd_trace_on is not None
+               for step in G.graph_state.steps):
             self.include_files.append('hal/gvsoc/gvsoc.h')
 
+    @property
+    def output_nodes(self):
+        for node in sorted(self.hidden_graph.outputs(), key=lambda node: node.step_idx):
+            if isinstance(node, OutputParameters):
+                yield node
+
+    @property
+    def input_nodes(self):
+        for node in sorted(self.hidden_graph.inputs(), key=lambda node: node.step_idx):
+            if isinstance(node, InputParameters):
+                yield node
+
     @property
     def project_name(self):
         return self.naming_convension.get_project_name()
@@ -160,8 +181,9 @@ def get_edge_name(self, eparams):
     def get_node_name(self, params, target):
         try:
             return self.name_cache[params][target]
-        except:
-            raise ValueError(f"Name Cache: {params.name} {target} not found")
+        except KeyError as ex:
+            raise ModelGenerationInternalError(
+                f"Name Cache: {params.name} {target} not found") from ex
 
     def memory_device_generator(self, indent=0):
         self.opts['memory_devices'].set_l2_ram_ext_managed(
@@ -200,66 +222,36 @@ def binding_generator(self, indent=0):
         return str(code_block)
 
     @staticmethod
-    def real_up_connection(G, eparams, set_real=False):
-        while isinstance(eparams.creating_node, ReshapeParameters) or \
-                (isinstance(eparams.creating_node, TransposeParameters) and eparams.creating_node.does_nothing()):
-            set_real = True
-            eparams = G.in_edges(eparams.creating_node.name)[0].params
-        return eparams, set_real
-
-    @staticmethod
-    def real_down_connection(G, eparams):
+    def get_output(G, eparams):
         oedges = G.indexed_out_edges(eparams.creating_node.name)[
             eparams.creating_node_idx]
-        while any(isinstance(oedge.to_node, ReshapeParameters) or \
-                (isinstance(oedge.to_node, TransposeParameters) and oedge.to_node.does_nothing()) for oedge in oedges):
-            if len(oedges) > 1:
-                raise NotImplementedError('multiple edges on ungenerated node')
-            oedges = G.out_edges(oedges[0].to_node.name)
+        if len(oedges) != 1 or not isinstance(oedges[0].to_node, OutputParameters):
+            return None
         return oedges[0]
 
     def local_generator(self, indent=0):
-        edges = set(edge.params for edge in self.G.edges())
+        edges = set(edge.params for edge in self.hidden_graph.edges())
         sorted_edges = list(edges)
         sorted_edges.sort(key=lambda eparams: eparams.creating_step)
         for eparams in sorted_edges:
-            # check if the following real node is an output
-            if isinstance(eparams.creating_node, ConcatParameters):
-                rout_edge = self.real_down_connection(self.G, eparams)
-                if isinstance(rout_edge.to_node, OutputParameters):
-                    rout_eparams = rout_edge.params
-                    cname = self.naming_convension.get_edge_name(rout_eparams.creating_node,
-                                                                 rout_eparams.creating_step,
-                                                                 rout_eparams.edge_type,
-                                                                 rout_eparams.edge_order)
-                    LOG.info("edge from step %s %s is not used and is replaced with edge to step %s:%s %s cname: %s",
-                             eparams.creating_node.step_idx, eparams.creating_node.name,
-                             rout_eparams.creating_node.name, rout_eparams.creating_node.step_idx,
-                             rout_eparams.creating_step, cname)
-                    self.name_cache.set(eparams, 'edge', cname)
-                    continue
-
-            rin_eparams, set_real = self.real_up_connection(self.G, eparams)
-            if rin_eparams.edge_type == "out":
-                # The edge was marked as an output so find the real edge down
-                rin_eparams = self.real_down_connection(
-                    self.G, rin_eparams).params
+            if eparams.edge_type == "out":
+                # The edge was marked as an output so find the real output edge.
+                # Keep the unfiltered list so the error below can report what the edge feeds.
+                all_oedges = self.hidden_graph.indexed_out_edges(eparams.creating_node.name)[
+                    eparams.creating_node_idx]
+                oedges = [edge for edge in all_oedges if isinstance(edge.to_node, OutputParameters)]
+                if not oedges:
+                    raise ModelGenerationInternalError(
+                        f'output edge created by {eparams.creating_node.name}:{eparams.creating_node_idx} '
+                        f'is not connected to an output - {" ".join(edge.to_node.name for edge in all_oedges)}')
+                if len(oedges) > 1:
+                    raise ModelGenerationInternalError(
+                        f'output edge created by {eparams.creating_node.name}:{eparams.creating_node_idx} '
+                        f'is connected to more than one output - {" ".join(edge.to_node.name for edge in oedges)}')
+
+                rin_eparams = oedges[0].params
                 self.name_cache.set(eparams, 'edge', rin_eparams.name)
                 continue
-            else:
-                if set_real:
-                    # Code will not be generated for reshape or empty transpose so the input to the
-                    # following node is the input to this node
-                    cname = self.naming_convension.get_edge_name(rin_eparams.creating_node,
-                                                                 rin_eparams.creating_step,
-                                                                 rin_eparams.edge_type,
-                                                                 rin_eparams.edge_order)
-                    LOG.info("edge from step %s %s is not used and is replaced with edge from step %s:%s %s cname: %s",
-                             eparams.creating_node.step_idx, eparams.creating_node.name,
-                             rin_eparams.creating_node.name, rin_eparams.creating_node.step_idx,
-                             rin_eparams.creating_step, cname)
-                    self.name_cache.set(eparams, 'edge', cname)
-                    continue
 
             cname = self.naming_convension.get_edge_name(eparams.creating_node,
                                                          eparams.creating_step,
@@ -304,7 +296,7 @@ def local_generator(self, indent=0):
         return str(code_block)
 
     def stack_generator(self, indent=0):
-        edges = set(edge.params for edge in self.G.edges())
+        edges = set(edge.params for edge in self.hidden_graph.edges())
         sorted_edges = list(edges)
         sorted_edges.sort(key=lambda eparams: eparams.creating_step)
         concat_edges = list([eparams for eparams in sorted_edges if isinstance(
@@ -313,15 +305,15 @@ def stack_generator(self, indent=0):
             node = eparams.creating_node
             cname_out = self.name_cache[eparams]['edge']
             in_edge_names = [self.name_cache[edge.params]['edge']
-                             for edge in self.G.indexed_in_edges(node.name)]
+                             for edge in self.hidden_graph.indexed_in_edges(node.name)]
             self.stacked_tensors.append(TensorStack(cname_out, in_edge_names))
 
-        split_nodes = [node for node in self.G.nodes(
+        split_nodes = [node for node in self.hidden_graph.nodes(
         ) if isinstance(node, SplitParameters)]
         for split_node in split_nodes:
-            eparams_in = self.G.in_edges(split_node.name)[0].params
+            eparams_in = self.hidden_graph.in_edges(split_node.name)[0].params
             eparams_out = [
-                edge_bundle[0].params for edge_bundle in self.G.indexed_out_edges(split_node.name)]
+                edge_bundle[0].params for edge_bundle in self.hidden_graph.indexed_out_edges(split_node.name)]
             cname_in = self.name_cache[eparams_in]['edge']
             cnames_out = [self.name_cache[eparams]['edge']
                           for eparams in eparams_out]
@@ -359,22 +351,27 @@ def global_generator(self, indent=0):
 
     def generate_outputs(self):
         outputs = set()
-        count_outputs = 0
-        for node in self.G.output_nodes():
+        for node in self.output_nodes:
             qrec = self.G.quantization[NodeId(node)]
-            for edge in self.G.in_edges(node.name):
-                if isinstance(edge.from_node, (LSTMParameters, )) and count_outputs:
+            for edge in self.hidden_graph.in_edges(node.name):
+                if isinstance(edge.from_node, (RNNBaseParameters, )) and edge.from_idx > 0:
                     continue
-                eparams, _ = self.real_up_connection(self.G, edge.params)
+                eparams = edge.params
                 if eparams in outputs:
                     continue
                 eparams.edge_type = "out"
                 outputs.add(eparams)
                 self.execute_phase("outputs", node, qrec, edge)
-                count_outputs += 1
+
+    def sorted_nodes_and_fusions(self):
+        for node in self.sorted_nodes:
+            if isinstance(node, FusionBase) and node.quantize_internals:
+                for fnode in node.contained_nodes():
+                    yield node, fnode
+            yield node, None
 
     def generate_constants(self):
-        for _, pnode, _, fnode in self.G.nodes_iterator():
+        for pnode, fnode in self.sorted_nodes_and_fusions():
             anode = pnode if not fnode else fnode
             qrec = self.G.quantization.get(NodeId(pnode, fnode))
             if not self.new_execute_phase("globals", anode, qrec, pnode, fnode):
@@ -382,9 +379,9 @@ def generate_constants(self):
 
     def generate_inputs(self):
         inputs = set()
-        for node in self.G.input_nodes():
+        for node in self.input_nodes:
             qrec = self.G.quantization[NodeId(node)]
-            for edge in self.G.out_edges(node.name):
+            for edge in self.hidden_graph.out_edges(node.name):
                 eparams = edge.params
                 if eparams in inputs:
                     continue
@@ -474,22 +471,22 @@ def get_node_cname(self, node):
 
     def kernel_generator(self, indent=0):
         code_block = CodeBlock(starting_indent=indent)
-        for _, node, _, _ in self.G.nodes_iterator(yield_fusions=False):
+        for node in self.sorted_nodes:
             name = node.name
             cname = self.get_node_cname(node)
             if node.at_options.vcd_trace_on is not None:
                 self.add_vcd_trace_binding(cname, node.at_options.vcd_trace_on)
             self.name_cache.set(node, 'node', cname)
-            in_eparams = self.G.get_in_params(name)
-            out_eparams = self.G.get_out_params(name)
+            in_eparams = [edge.params if edge else None
+                          for edge in self.hidden_graph.indexed_in_edges(name)]
+            out_eparams = [edge_bundle[0].params if edge_bundle else None
+                           for edge_bundle in self.hidden_graph.indexed_out_edges(name)]
             try:
                 qrec = self.G.quantization[NodeId(node)]
             except KeyError as err:
                 LOG.error("Quantization record not found for node %s", node.name)
                 raise err
-            if isinstance(node, ReshapeParameters):
-                continue
-            if isinstance(node, TransposeParameters) and node.does_nothing():
+            if node.no_model_code:
                 continue
             elif isinstance(node, (InputParameters, OutputParameters)):
                 continue
@@ -507,7 +504,8 @@ def kernel_generator(self, indent=0):
                                        in_eparams, out_eparams, cname)
                 if not (self.new_execute_phase("kernels", node, qrec, in_eparams, out_eparams, cname) or
                         self.execute_phase("kernels", node, qrec, in_eparams, out_eparams, cname)):
-                    raise NotImplementedError(f"Don't know how to generate kernel for parameter type {node.name} {node.CLS_OP_NAME}. "
+                    raise NotImplementedError("Don't know how to generate kernel for parameter type "
+                                              f"{node.name} {node.CLS_OP_NAME}. "
                                               "Perhaps you need to run fusions -a expression_matcher.")
 
             # if self.opts['generate_checksums']:
@@ -527,16 +525,16 @@ def add_vcd_trace_binding(self, cname, enable):
                                 before=True))
 
     def add_checksum_binding(self, cname, name, step_idx, eparams, before):
-        node = self.G[name]
+        node = self.hidden_graph[name]
         if before:
             size = node.in_dims[0].size()
         else:
             size = node.out_dims[0].size()
         self.bindings.append(
             FunctionBindingList(cname,
-                                checksum_func(self.G, name),
+                                checksum_func(self.hidden_graph, name),
                                 Imm(step_idx),
-                                Imm(calc_value_checksum(self.G, name)),
+                                Imm(calc_value_checksum(self.hidden_graph, name)),
                                 GArgEdge(eparams[0]),
                                 Imm(size),
                                 before=before)
@@ -578,9 +576,7 @@ def load_basic_kernel_library(self, indent=0):
 
     def header_generator(self, indent=0):
         code_block = CodeBlock(starting_indent=indent)
-        for _, node, _, fnode in self.G.nodes_iterator():
-            if fnode:
-                continue
+        for node in self.sorted_nodes:
             cname = self.name_cache[node]['node']
             qrec = self.G.quantization[NodeId(node)]
             code_block.comment(cname)
@@ -687,7 +683,7 @@ def expressions_user_kernel_source_generator(self, indent=0):
     def generate_main_appl_inout_def(self, test_inputs=None, test_outputs=None, indent=0):
         code_block = CodeBlock(starting_indent=indent)
         code_block.write("/* Inputs */")
-        for i, node in enumerate(self.G.input_nodes()):
+        for i, node in enumerate(self.input_nodes):
             if node.at_options.allocate or node.at_options.extern_input_pointer:
                 continue
             nodeq = self.G.quantization[NodeId(node, None)].out_qs[0]
@@ -703,7 +699,7 @@ def generate_main_appl_inout_def(self, test_inputs=None, test_outputs=None, inde
                 code_block.write(
                     f"L2_MEM {CTYPE[nodeq.ctype]} {node.name.capitalize()}[{node.out_dims[0].size()}];")
         code_block.write("/* Outputs */")
-        for node in self.G.output_nodes():
+        for node in self.output_nodes:
             if node.at_options.allocate:
                 continue
             nodeq = self.G.quantization[NodeId(node, None)].out_qs[0]
@@ -711,7 +707,7 @@ def generate_main_appl_inout_def(self, test_inputs=None, test_outputs=None, inde
                 f"L2_MEM {CTYPE[nodeq.ctype]} {node.name.capitalize()}[{node.out_dims[0].size()}];")
 
         if test_outputs:
-            for out_n, outp in zip(self.G.output_nodes(), test_outputs):
+            for out_n, outp in zip(self.output_nodes, test_outputs):
                 code_block.write(
                     'L2_MEM {} {}_gt[] = {{{}}};',
                     dtype2ctype(outp),
@@ -722,15 +718,13 @@ def generate_main_appl_inout_def(self, test_inputs=None, test_outputs=None, inde
 
     def gen_inout_list(self):
         inout_str = ""
-        for node in self.G.input_nodes():
+        for node in self.input_nodes:
             if node.at_options.allocate or node.at_options.extern_input_pointer:
                 continue
             inout_str += f"{node.name.capitalize()}, "
-        rnn_present = any([isinstance(node, RNNBaseParameters)
-                           for node in self.G.nodes()])
-        if rnn_present:
+        if self.hidden_graph.nodes(node_classes=RNNBaseParameters):
             inout_str += "1, "
-        for node in self.G.output_nodes():
+        for node in self.output_nodes:
             if node.at_options.allocate:
                 continue
             inout_str += f"{node.name.capitalize()}, "
@@ -739,7 +733,7 @@ def gen_inout_list(self):
     def generate_output_check(self, tol=0.0, indent=0):
         code = CodeBlock(starting_indent=indent)
         code.write('int errors;')
-        for idx, out_node in enumerate(self.G.output_nodes()):
+        for out_node in self.output_nodes:
             out_sz = out_node.out_dims[0].size()
             nodeq = self.G.quantization[NodeId(out_node, None)].out_qs[0]
             dtype = "%f" if nodeq.is_floating else "%d"
@@ -753,8 +747,9 @@ def generate_output_check(self, tol=0.0, indent=0):
                     f"{dtype2ctype(nodeq)} diff = {out_node.name.capitalize()}[j] - "
                     f"{out_node.name.capitalize()}_gt[j];")
                 code.write("diff = (diff>0)?diff:(-diff);")
-                code.write(f"if (diff > max_diff) max_diff = diff;")
-                code.write(f'if (diff > {nodeq.quantize(np.array(tol)).item()}) {{')
+                code.write("if (diff > max_diff) max_diff = diff;")
+                code.write(
+                    f'if (diff > {nodeq.quantize(np.array(tol)).item()}) {{')
             else:
                 code.write(
                     f'if ({out_node.name.capitalize()}[j] != {out_node.name.capitalize()}_gt[j]) {{')
diff --git a/tools/nntool/generation/default_appl_main_template.py b/tools/nntool/generation/default_appl_main_template.py
index c6493a947..f0a3a5e5f 100644
--- a/tools/nntool/generation/default_appl_main_template.py
+++ b/tools/nntool/generation/default_appl_main_template.py
@@ -99,7 +99,8 @@ def generate_main_appl_template(G, gen, test_inputs=None, test_outputs=None, tol
 
     printf("Call cluster\\n");
 #ifndef __EMUL__
-    struct pi_cluster_task task = {0};
+    struct pi_cluster_task task;
+    pi_cluster_task(&task,NULL,NULL);
     task.entry = cluster;
     task.arg = NULL;
     task.stack_size = (unsigned int) STACK_SIZE;
@@ -186,7 +187,7 @@ def generate_main_appl_make(G, gen, quantized, open_args=""):
 
 TRAINED_MODEL = ${os.path.split(G.graph_identity.filename)[1]}
 
-MODEL_EXPRESSIONS = ${"$(MODEL_BUILD)/" + gen.opts['basic_kernel_source_file'] if gen.G.has_expressions else ""}
+MODEL_EXPRESSIONS = ${"$(MODEL_BUILD)/" + gen.opts['basic_kernel_source_file']}
 
 NNTOOL_EXTRA_FLAGS += ${open_args}
 ${"MODEL_QUANTIZED=1" if quantized else ""}
@@ -225,7 +226,7 @@ def generate_main_appl_make_atproject(G, gen, quantized, model_path):
 
 AT_MODEL_PATH=${model_path}
 
-MODEL_EXPRESSIONS = ${gen.opts['basic_kernel_source_file'] if gen.G.has_expressions else ""}
+MODEL_EXPRESSIONS = ${gen.opts['basic_kernel_source_file']}
 
 ${"MODEL_QUANTIZED=1" if quantized else ""}
 
diff --git a/tools/nntool/generation/gen_utils.py b/tools/nntool/generation/gen_utils.py
index 0744c27ab..c2207782a 100644
--- a/tools/nntool/generation/gen_utils.py
+++ b/tools/nntool/generation/gen_utils.py
@@ -13,6 +13,11 @@
 # You should have received a copy of the GNU Affero General Public License
 # along with this program.  If not, see <https://www.gnu.org/licenses/>.
 
+import os
+
+from utils.exception import NNToolInternelError
+
+
 def at_bits(qtype):
     if qtype is None:
         return 0
@@ -32,3 +37,11 @@ def at_q(qtype):
 
 def at_bits_and_q(qtype):
     return "{}, {}".format(at_bits(qtype), qtype.q)
+
+def write_empty(model_directory, model_file, comment):
+    model_path = os.path.join(model_directory, model_file)
+    with open(model_path, "w") as output_fp:
+        output_fp.write(f"/** {comment}\n**/")
+
+class ModelGenerationInternalError(NNToolInternelError):
+    """Internal error raised by the model generation code (e.g. a missing name cache entry)."""
diff --git a/tools/nntool/generation/generators/globals/constant_input_generator.py b/tools/nntool/generation/generators/globals/constant_input_generator.py
index 526cd8755..7ac6691b4 100644
--- a/tools/nntool/generation/generators/globals/constant_input_generator.py
+++ b/tools/nntool/generation/generators/globals/constant_input_generator.py
@@ -18,11 +18,13 @@
 from generation.at_types.constant_info import ConstantInfo
 from generation.at_types.tc_arg_info import (GlobalArgInfo, GlobalResetArgInfo,
                                              InputArgInfo)
+from generation.gen_utils import ModelGenerationInternalError
 from generation.generator_decorators import (QREC_FLOAT, QREC_MULT8, QREC_POW2,
                                              generation_function)
 from graph.types import ConstantInputParameters
-from graph.types.fusions import ConvFusionParameters, LinearFusionParameters
-from graph.types.linear import FcParameters
+from graph.types.fusions import (ConvFusionParameters,
+                                 LinearFusionParameters,
+                                 MatMulOpFusionParameters)
 from utils.node_id import NodeId
 from utils.numpy_helpers import interleave, packbits
 
@@ -83,13 +85,15 @@ def constant_input_globals_generator(gen, node, qrec, pnode, fnode) -> bool:
 
         if qtype.attr.ne16_biases:
             to_node = gen.G.out_edges(pnode.name)[0].to_node
-            if isinstance(to_node, (ConvFusionParameters, LinearFusionParameters)):
+            if isinstance(to_node, (ConvFusionParameters, LinearFusionParameters, MatMulOpFusionParameters)):
                 cnodes = to_node.contained_nodes()
                 quants = [gen.G.quantization[NodeId(
                     to_node, fnode)] for fnode in cnodes]
                 filter_qrec = quants[0]
             else:
                 filter_qrec = gen.G.quantization[NodeId(to_node)]
+            if 'mul_biases_q' not in filter_qrec.cache:
+                raise ModelGenerationInternalError(f"mul_biases_q not found in qrec for {to_node.name}")
             mul_qbiases = filter_qrec.cache['mul_biases_q'].qbiases
             mul_qnorms = filter_qrec.cache['mul_biases_q'].qnorms
             value = np.where(mul_qnorms > 0,
diff --git a/tools/nntool/generation/naming_convension.py b/tools/nntool/generation/naming_convension.py
index cc05166c2..60d10abd8 100644
--- a/tools/nntool/generation/naming_convension.py
+++ b/tools/nntool/generation/naming_convension.py
@@ -28,9 +28,16 @@
 
 class NamingConvension(ABC):
 
-    def __init__(self, G):
-        self.G = G
-        self.multi_out_edges = {}
+    def __init__(self, G=None):
+        self._G = G
+
+    @property
+    def G(self):
+        return self._G
+
+    @G.setter
+    def G(self, val):
+        self._G = val
 
     @abstractmethod
     def get_node_name(self, node_name, step_idx, params):
diff --git a/tools/nntool/generation/new_generators/general/expressions.py b/tools/nntool/generation/new_generators/general/expressions.py
index 2f4c64573..3752f18b3 100644
--- a/tools/nntool/generation/new_generators/general/expressions.py
+++ b/tools/nntool/generation/new_generators/general/expressions.py
@@ -25,7 +25,7 @@
 
 
 @paramstype(ExpressionFusionParameters)
-class GenCopyParameters(GeneratorBase, InOutBindingsMixin):
+class GenExpressionParameters(GeneratorBase, InOutBindingsMixin):
     @classmethod
     def globals_generator(cls, gen, node, qrec, pnode, fnode) -> bool:
         return True
diff --git a/tools/nntool/generation/new_generators/helpers/act_infos.py b/tools/nntool/generation/new_generators/helpers/act_infos.py
index a1defd826..c47d75ffe 100644
--- a/tools/nntool/generation/new_generators/helpers/act_infos.py
+++ b/tools/nntool/generation/new_generators/helpers/act_infos.py
@@ -13,13 +13,14 @@
 # along with this program.  If not, see <https://www.gnu.org/licenses/>.
 
 import numpy as np
-
+from generation.gen_utils import ModelGenerationInternalError
 from graph.types import (HSigmoidActivationParameters,
                          HSwishActivationParameters, LeakyActivationParameters,
                          ReluActivationParameters, SigmoidActivationParameters,
                          SoftMaxParameters, TanHActivationParameters)
 from graph.types.activations import HTanHActivationParameters
 
+
 def gen_act_infos(act_params, act_q):
     comment = ""
     if isinstance(act_params, ReluActivationParameters):
@@ -27,42 +28,20 @@ def gen_act_infos(act_params, act_q):
             'actscale': act_q.cache['scale_mul_biases_q'].qbiases.astype(np.uint8),
             'actscalen': act_q.cache['scale_mul_biases_q'].qnorms.astype(np.uint8)
         }
-        if act_params.upper_bound is None:  # or fnode is not None:
-            if act_q.in_qs[0].zero_point == 0:
-                contents.update({
-                    'a0': np.uint8(0),
-                    'b0': np.uint8(0),
-                    'c0': np.uint8(0),
-                })
-            else:
-                contents.update({
-                    'a0': act_q.in_qs[0].zero_point.astype(act_q.in_qs[0].dtype),
-                    'b0': np.uint8(0),
-                    'c0': np.uint8(0),
-                })
-        else:
-            if act_q.in_qs[0].zero_point == 0:
-                contents.update({
-                    'a0': act_q.in_qs[0].quantize(act_params.upper_bound),
-                    'b0': np.uint8(0),
-                    'c0': np.uint8(0),
-                })
-            else:
-                contents.update({
-                    'a0': act_q.in_qs[0].zero_point.astype(act_q.in_qs[0].dtype),
-                    'b0': act_q.in_qs[0].quantize(act_params.upper_bound),
-                    'c0': np.uint8(0),
-                })
+        contents.update({
+            'a0': act_q.cache['lower_bound'] if "lower_bound" in act_q.cache else np.uint8(0),
+            'b0': act_q.cache['upper_bound'] if "upper_bound" in act_q.cache else np.uint8(0),
+            'c0': np.uint8(0),
+        })
 
     elif isinstance(act_params, (HSigmoidActivationParameters, HSwishActivationParameters)):
-        # currently combines all scaling factors into one scale and shift
-        assert act_q.in_qs[0].zero_point == 0 and act_q.out_qs[0].zero_point == 0, "asymmetric not supported"
+        # mult factor is combined into scale
         contents = {
             'actscale': act_q.cache['scale_mul_biases_q'].qbiases.astype(np.uint8),
             'actscalen': act_q.cache['scale_mul_biases_q'].qnorms.astype(np.uint8),
             'a0': act_q.cache['upper_bound'],
             'b0': act_q.cache['offset'],
-            'c0': act_q.cache['mult']
+            'c0': act_q.cache['zero_point']
         }
     elif isinstance(act_params, SoftMaxParameters):
         assert act_q.in_qs[0].zero_point == 0 and act_q.out_qs[0].zero_point == 0, "asymmetric not supported"
@@ -71,27 +50,25 @@ def gen_act_infos(act_params, act_q):
             'bias_sm': act_q.cache['bias_sm']
         }
     elif isinstance(act_params, LeakyActivationParameters):
-        assert act_q.in_qs[0].zero_point == 0 and act_q.out_qs[0].zero_point == 0, "asymmetric not supported"
+        #assert act_q.in_qs[0].zero_point == 0 and act_q.out_qs[0].zero_point == 0, "asymmetric not supported"
         contents = {
             'actscale': act_q.cache['scale_mul_biases_q'].qbiases.astype(np.uint8),
             'actscalen': act_q.cache['scale_mul_biases_q'].qnorms.astype(np.uint8),
             'a0': act_q.cache['leak_factor'],
-            'b0': np.uint8(0),
+            'b0': act_q.cache['zero_point'],
             'c0': np.uint8(0),
         }
-    elif isinstance(act_params, (SigmoidActivationParameters, TanHActivationParameters, HTanHActivationParameters)):
+    elif isinstance(act_params, (SigmoidActivationParameters, TanHActivationParameters)):
         contents = {
             'actscale': act_q.cache['scale_mul_biases_q'].qbiases.astype(np.uint8),
             'actscalen': act_q.cache['scale_mul_biases_q'].qnorms.astype(np.uint8),
-            'a0': np.uint8(0),
+            'a0': act_q.cache["zero_point"],
             'b0': np.uint8(0),
             'c0': np.uint8(0),
         }
     else:
-        raise NotImplementedError(
-            "activation type not implemented in model generator")
+        raise ModelGenerationInternalError(
+            f"activation type {act_params.__class__.__name__} not implemented in model generator")
     comment += f"in: {act_q.in_qs[0].scale[0]:.5f} out: {act_q.out_qs[0].scale[0]:.5f} "
-    comment += f"actscale: {contents['actscale']} actscalen: {contents['actscalen']} "
-    comment += f"A0: {contents['a0']} B0: {contents['b0']} C0: {contents['c0']}"
 
     return contents, comment
diff --git a/tools/nntool/generation/new_generators/mult8/conv_pool_mult8.py b/tools/nntool/generation/new_generators/mult8/conv_pool_mult8.py
index 492df0a58..6e197e35e 100644
--- a/tools/nntool/generation/new_generators/mult8/conv_pool_mult8.py
+++ b/tools/nntool/generation/new_generators/mult8/conv_pool_mult8.py
@@ -77,12 +77,11 @@ def globals_generator(cls, gen, node, qrec, pnode, fnode) -> bool:
             return False
 
 
-        comment = f"BiasQ: {0}" + infos_comment
         infos['BIASN'] = np.int8(0)  # BiasQ
+        conv_mul_bias = filt_q.cache.get('mul_biases_q')
+        infos['PRENORM'] = np.uint8(conv_mul_bias.pre_normalization if isinstance(conv_mul_bias, MultMulBiasScaleQType) else 0)
 
         if filt_q.cache.get('ne16'):
-            conv_mul_bias = filt_q.cache.get('mul_biases_q')
-            infos['PRENORM'] = np.uint8(conv_mul_bias.pre_normalization if isinstance(conv_mul_bias, MultMulBiasScaleQType) else 0)
             infos['NE16_PADVAL'] = np.atleast_1d(filt_q.in_qs[0].zero_point).astype(filt_q.in_qs[0].dtype)
             infos['NE16_WOFFSET'] =  -np.array(filt_q.in_qs[1].zero_point).astype(np.int32)
             infos_len = 'NE16_DIM'
@@ -91,7 +90,8 @@ def globals_generator(cls, gen, node, qrec, pnode, fnode) -> bool:
 
 
         infos_encoder = SQ8ActInfos()
-        contents = infos_encoder.gen_infos_array(infos_len, **infos)
+        contents, new_comment = infos_encoder.gen_infos_array(infos_len, **infos)
+        comment = infos_comment + new_comment
 
         cname, file_name = gen_constant(gen, pnode, fnode, INFOS)
         const_info = ConstantInfo(file_name, QType.Pow2(bits=8, q=0, signed=True), contents=contents)
diff --git a/tools/nntool/generation/new_generators/mult8/linear_mult8.py b/tools/nntool/generation/new_generators/mult8/linear_mult8.py
index 1a829c160..9c6364661 100644
--- a/tools/nntool/generation/new_generators/mult8/linear_mult8.py
+++ b/tools/nntool/generation/new_generators/mult8/linear_mult8.py
@@ -60,13 +60,12 @@ def globals_generator(cls, gen, node, qrec, pnode, fnode) -> bool:
         else:
             return False
 
-        comment = f"BiasQ: {0}" + infos_comment
         infos['BIASN'] = np.int8(0)  # BiasQ
+        conv_mul_bias = filt_q.cache.get('mul_biases_q')
+        infos['PRENORM'] = np.uint8(conv_mul_bias.pre_normalization if isinstance(
+            conv_mul_bias, MultMulBiasScaleQType) else 0)
 
         if filt_q.cache.get('ne16'):
-            conv_mul_bias = filt_q.cache.get('mul_biases_q')
-            infos['PRENORM'] = np.uint8(conv_mul_bias.pre_normalization if isinstance(
-                conv_mul_bias, MultMulBiasScaleQType) else 0)
             infos['NE16_PADVAL'] = np.atleast_1d(
                 filt_q.in_qs[0].zero_point).astype(np.uint16)
             infos['NE16_WOFFSET'] = - \
@@ -76,7 +75,8 @@ def globals_generator(cls, gen, node, qrec, pnode, fnode) -> bool:
             infos_len = 'DIM'
 
         infos_encoder = SQ8ActInfos()
-        contents = infos_encoder.gen_infos_array(infos_len, **infos)
+        contents, new_comment = infos_encoder.gen_infos_array(infos_len, **infos)
+        comment = infos_comment + new_comment
 
         cname, file_name = gen_constant(gen, pnode, fnode, INFOS)
         const_info = ConstantInfo(file_name, QType.Pow2(
diff --git a/tools/nntool/generation/new_generators/mult8/matadd_mult8.py b/tools/nntool/generation/new_generators/mult8/matadd_mult8.py
index 325e5cd9d..507a69e59 100644
--- a/tools/nntool/generation/new_generators/mult8/matadd_mult8.py
+++ b/tools/nntool/generation/new_generators/mult8/matadd_mult8.py
@@ -61,7 +61,7 @@ def globals_generator(cls, gen, node, qrec, pnode, fnode) -> bool:
             add_node = pnode
             add_quant = qrec
             infos = {}
-            acomments = ""
+            acomments = "no activation - "
 
         infos.update({
             'IN1SCALE': add_quant.cache['scale_in_mul_biases_q'].qbiases,
@@ -69,7 +69,6 @@ def globals_generator(cls, gen, node, qrec, pnode, fnode) -> bool:
             'OUTSCALE': add_quant.cache['scale_mul_biases_q'].qbiases,
             'OUTSCALEN': add_quant.cache['scale_mul_biases_q'].qnorms
         })
-        comments = " ".join(f'{k}: {infos[k]}' for k in ['IN1SCALE', 'IN1SCALEN', 'OUTSCALE', 'OUTSCALEN']) + acomments
         if not add_quant.in_qs[0].signed:
             infos['ADD_BIAS'] = add_quant.cache['add_bias_offset']
             infos_len = 'ASYM_ADD_DIM'
@@ -78,7 +77,8 @@ def globals_generator(cls, gen, node, qrec, pnode, fnode) -> bool:
 
 
         infos_encoder = SQ8ActInfos()
-        contents = infos_encoder.gen_infos_array(infos_len, **infos)
+        contents, new_comment = infos_encoder.gen_infos_array(infos_len, **infos)
+        comments = acomments + new_comment
 
         cname, file_name = gen_constant(gen, pnode, pnode, INFOS)
         const_info = ConstantInfo(file_name, QType.Pow2(
diff --git a/tools/nntool/generation/new_generators/mult8/matmul_mult8.py b/tools/nntool/generation/new_generators/mult8/matmul_mult8.py
index d8dedd71b..cdb59f43b 100644
--- a/tools/nntool/generation/new_generators/mult8/matmul_mult8.py
+++ b/tools/nntool/generation/new_generators/mult8/matmul_mult8.py
@@ -23,6 +23,7 @@
 from generation.at_types.tc_arg_info import GlobalArgInfo
 from generation.bindings import (CommentBindingList, GNodeArgEdge,
                                  GNodeArgNode, NodeBindingList)
+from generation.gen_utils import ModelGenerationInternalError
 from generation.generators.globals.global_names import (INFOS, MULSCALE,
                                                         MULSHIFT)
 from generation.generators.kernels.autotiler_kernel import NewAutoTilerKernel
@@ -75,10 +76,10 @@ def globals_generator(cls, gen, node, qrec, pnode, fnode) -> bool:
             infos['OUTSCALE'] = mul_qrec.cache['mul_biases_q'].qbiases[0]
             infos['OUTSCALEN'] = mul_qrec.cache['mul_biases_q'].qnorms[0]
 
+        conv_mul_bias = mul_qrec.cache.get('mul_biases_q')
+        infos['PRENORM'] = np.uint8(conv_mul_bias.pre_normalization if isinstance(conv_mul_bias, MultMulBiasScaleQType) else 0)
+
         if mul_qrec.cache.get('ne16'):
-            conv_mul_bias = mul_qrec.cache.get('mul_biases_q')
-            infos['PRENORM'] = np.uint8(conv_mul_bias.pre_normalization if isinstance(
-                conv_mul_bias, MultMulBiasScaleQType) else 0)
             infos['NE16_PADVAL'] = np.atleast_1d(
                 mul_qrec.in_qs[0].zero_point).astype(mul_qrec.in_qs[0].dtype)
             infos['NE16_WOFFSET'] = - \
@@ -88,7 +89,8 @@ def globals_generator(cls, gen, node, qrec, pnode, fnode) -> bool:
             infos_len = 'DIM'
 
         infos_encoder = SQ8ActInfos()
-        contents = infos_encoder.gen_infos_array(infos_len, **infos)
+        contents, new_comment = infos_encoder.gen_infos_array(infos_len, **infos)
+        comment += new_comment
 
         cname, file_name = gen_constant(gen, pnode, mul_node, INFOS)
         const_info = ConstantInfo(file_name, QType.Pow2(bits=8, q=0, signed=True), contents=contents)
@@ -184,12 +186,41 @@ def set_matmul_bindings(gen, in_eparams, out_eparams, cname, node, node_q, out_q
                     GNodeArgEdge(out_eparams[0], "GNA_OUT"),
                     GNodeArgNode(node, INFOS)))
 
+def calculate_dimensions(node_name, matmul_params):
+    in1_shape = tuple(matmul_params.in_dims[0].shape)
+    in2_shape = tuple(matmul_params.in_dims[1].shape)
+
+    rank1 = len(in1_shape)
+    rank2 = len(in2_shape)
+
+    if rank2 <= 2:
+        if rank2 == 1:
+            # TODO - is this correct if transposed?
+            in2_shape = (in2_shape[0], 1)
+        channels = 1
+        if rank1 > 2 and not all(dim == 1 for dim in in1_shape[:-2]):
+            in1_shape = (int(np.prod(in1_shape[:-1])), in1_shape[-1])
+        elif rank1 == 1:
+            in1_shape = (1, in1_shape[0])
+    elif rank1 == rank2 and in1_shape[:-2] == in2_shape[:-2]:
+        channels = np.prod(in1_shape[:-2])
+        LOG.warning(f'Matmul over batches is not yet properly generated - output will not be correct')
+    else:
+        raise ModelGenerationInternalError(
+            f'{node_name} Invalid dimensions for matmul kernel {in1_shape} {in2_shape}')
+
+    height_1 = in1_shape[-2]
+    width_1 = in1_shape[-1]
+    height_2 = in2_shape[-2]
+    width_2 = in2_shape[-1]
+    return height_1,width_1, height_2, width_2, channels
+
 
 class MatMulKernel(NewAutoTilerKernel):
     CALL_TEMPLATE = '''// generator for {node_name}
-CNN_MatMulAct_SQ8("{cname}", {gen_ctrl}, {bias_datasize}, 1, 
-                  {width_1}, {height_1}, {width_2}, {height_2},
-                  0, 0, 1, 1, {matmul_op}, {act_op});
+CNN_BatchedMatMulAct_SQ8("{cname}", {gen_ctrl}, {bias_datasize}, 1, 
+                         {batch_size}, {width_1}, {height_1}, {width_2}, {height_2},
+                         0, 0, 1, 1, {matmul_op}, {act_op});
 '''
 
     def __init__(self, node_name, cname, matmul_params, matmul_qrec, act_params, gen_ctrl=None, force_relu=True):
@@ -205,10 +236,7 @@ def __init__(self, node_name, cname, matmul_params, matmul_qrec, act_params, gen
         else:
             act_op = 'KOP_NONE'
 
-        height_1 = matmul_params.in_dims[0][0]
-        width_1 = matmul_params.in_dims[0][1]
-        height_2 = matmul_params.in_dims[1][0]
-        width_2 = matmul_params.in_dims[1][1]
+        height_1, width_1, height_2, width_2, batch_size = calculate_dimensions(node_name, matmul_params)
 
         if len(matmul_params.in_dims) == 3:
             bias_datasize = at_bits(matmul_qrec.in_qs[2])
@@ -222,8 +250,7 @@ def __init__(self, node_name, cname, matmul_params, matmul_qrec, act_params, gen
 
         if isinstance(matmul_params, MatMulTransposedParameters):
             matmul_op += '_TRANSPOSED'
-            height_2 = matmul_params.in_dims[1][1]
-            width_2 = matmul_params.in_dims[1][0]
+            height_2, width_2 = width_2, height_2
 
         # attributes affecting generation
         attrs = {
@@ -231,6 +258,7 @@ def __init__(self, node_name, cname, matmul_params, matmul_qrec, act_params, gen
             'width_1': width_1,
             'height_2': height_2,
             'width_2': width_2,
+            'batch_size': batch_size,
             'bias_datasize': bias_datasize,
             'matmul_op': matmul_op,
             'act_op': act_op
@@ -243,6 +271,7 @@ def __init__(self, node_name, cname, matmul_params, matmul_qrec, act_params, gen
         }
         super().__init__(attrs, extra_attrs, gen_ctrl=gen_ctrl)
 
+
 class MatMulKernelNE16(NewAutoTilerKernel):
     CALL_TEMPLATE = '''// generator for {node_name}
 CNN_MatMulAct_NE16("{cname}", {gen_ctrl}, {in1_datasize}, {out_datasize}, {bias_datasize}, {in2_datasize_bits}, 
@@ -263,10 +292,7 @@ def __init__(self, node_name, cname, matmul_params, matmul_qrec, act_params, gen
         else:
             act_op = 'KOP_NONE'
 
-        height_1 = matmul_params.in_dims[0][0]
-        width_1 = matmul_params.in_dims[0][1]
-        height_2 = matmul_params.in_dims[1][1]
-        width_2 = matmul_params.in_dims[1][0]
+        height_1, width_1, width_2, height_2, channels  = calculate_dimensions(node_name, matmul_params)
         bias_datasize = at_bits(matmul_qrec.in_qs[2])
         in1_datasize = at_bits(matmul_qrec.in_qs[0])
         in2_datasize_bits = matmul_qrec.in_qs[1].bits
diff --git a/tools/nntool/generation/new_generators/mult8/padded_matadd_mult8.py b/tools/nntool/generation/new_generators/mult8/padded_matadd_mult8.py
index 8a17f468a..c6de99bd2 100644
--- a/tools/nntool/generation/new_generators/mult8/padded_matadd_mult8.py
+++ b/tools/nntool/generation/new_generators/mult8/padded_matadd_mult8.py
@@ -14,6 +14,9 @@
 # along with this program.  If not, see <https://www.gnu.org/licenses/>.
 
 import logging
+from quantization.multiplicative.scaling_qtypes import MultMulBiasScaleQType
+
+import numpy as np
 
 from generation.at_types.at_params import NO_ACTIVATION, gen_activation_op
 from generation.at_types.constant_info import ConstantInfo
@@ -42,9 +45,6 @@ class PaddedMatAddSQ8Generator(GeneratorBase):
 
     @classmethod
     def globals_generator(cls, gen, node, qrec, pnode, fnode) -> bool:
-        cnodes = node.contained_nodes()
-        quants = [gen.G.quantization[NodeId(node, fnode)] for fnode in cnodes]
-
         cnodes = pnode.contained_nodes()
         quants = [gen.G.quantization[NodeId(pnode, cnode)] for cnode in cnodes]
 
@@ -63,10 +63,8 @@ def globals_generator(cls, gen, node, qrec, pnode, fnode) -> bool:
             'OUTSCALE': quants[1].cache['scale_mul_biases_q'].qbiases,
             'OUTSCALEN': quants[1].cache['scale_mul_biases_q'].qnorms
         })
-        comments = " ".join(f'{k}: {infos1[k]}' for k in ['IN1SCALE', 'IN1SCALEN', 'OUTSCALE', 'OUTSCALEN']) + f" {acomments}"
-
         infos_encoder = SQ8ActInfos()
-        contents = infos_encoder.gen_infos_array("DIM", **infos1)
+        contents, comments = infos_encoder.gen_infos_array('DIM', **infos1)
 
         cname, file_name = gen_constant(gen, pnode, pnode, INFOS)
         const_info = ConstantInfo(file_name, QType.Pow2(
@@ -77,15 +75,17 @@ def globals_generator(cls, gen, node, qrec, pnode, fnode) -> bool:
                                         const_info=const_info,
                                         comment=comments))
 
-
-        infos.update({
-            'IN1SCALE': quants[1].cache['scale_mul_biases_q'].qbiases,
-            'IN1SCALEN': quants[1].cache['scale_mul_biases_q'].qnorms
-        })
-        comments = " ".join(f'{k}: {infos[k]}' for k in ['IN1SCALE', 'IN1SCALEN']) + f" {acomments}"
-
+        # Padded part needs to apply out scale of the matadd + act scale
+        double_scale = MultMulBiasScaleQType(
+            dtype=np.uint8,
+            scale=quants[1].cache['scale_mul_biases_q'].scale * quants[2].cache['scale_mul_biases_q'].scale \
+                if len(cnodes) == 3 else \
+                  quants[1].cache['scale_mul_biases_q'].scale
+        )
+        infos['actscale'] = double_scale.qbiases
+        infos['actscalen'] = double_scale.qnorms
         infos_encoder = SQ8ActInfos()
-        contents = infos_encoder.gen_infos_array("DIM", **infos)
+        contents, comments = infos_encoder.gen_infos_array('DIM', **infos)
 
         cname, file_name = gen_constant(gen, pnode, cnodes[0], INFOS, extra_name='Pad')
         const_info = ConstantInfo(file_name, QType.Pow2(
diff --git a/tools/nntool/generation/new_generators/mult8/pool_mult8.py b/tools/nntool/generation/new_generators/mult8/pool_mult8.py
index 717aa5729..2294ea81c 100644
--- a/tools/nntool/generation/new_generators/mult8/pool_mult8.py
+++ b/tools/nntool/generation/new_generators/mult8/pool_mult8.py
@@ -82,7 +82,8 @@ def globals_generator(cls, gen, node, qrec, pnode, fnode) -> bool:
             infos['GLOBAL_SUM_SCALEN'] = pool_q.cache['scale_mul_biases_q'].qnorms
 
         infos_encoder = SQ8ActInfos()
-        contents = infos_encoder.gen_infos_array('DIM', **infos)
+        contents, new_comment = infos_encoder.gen_infos_array('DIM', **infos)
+        comment += new_comment
 
         cname, file_name = gen_constant(gen, pnode, pnode, INFOS)
         const_info = ConstantInfo(file_name, QType.Pow2(
@@ -103,7 +104,7 @@ def bindings_generator(cls, gen, node, qrec, in_eparams, out_eparams, cname) ->
                     gen, in_eparams, out_eparams, cname, node, qrec)
                 return True
             return False
-        elif isinstance(node, (GlobalPoolingParameters, PoolingParameters)):
+        elif isinstance(node, (GlobalPoolingParameters, PoolingParameters, ActivationParameters)):
             cls.set_in_out_infos_bindings(
                 gen, in_eparams, out_eparams, cname, node, qrec)
         else:
diff --git a/tools/nntool/generation/new_generators/mult8/softmax_mult.py b/tools/nntool/generation/new_generators/mult8/softmax_mult.py
index cb9d34ac3..98cf23325 100644
--- a/tools/nntool/generation/new_generators/mult8/softmax_mult.py
+++ b/tools/nntool/generation/new_generators/mult8/softmax_mult.py
@@ -45,10 +45,11 @@ def globals_generator(cls, gen, node, qrec, pnode, fnode) -> bool:
         infos = { 
             'BIASL_SM': np.uint8(15 + np.ceil(np.log2(qrec.in_qs[0].scale)))
         }
-        comment = f"in: {qrec.in_qs[0].scale[0]:.5f} out: {qrec.out_qs[0].scale[0]:.5f} NORM: {infos['BIASL_SM']}"
+        comment = f"in: {qrec.in_qs[0].scale[0]:.5f} out: {qrec.out_qs[0].scale[0]:.5f} "
 
         infos_encoder = SQ8ActInfos()
-        contents = infos_encoder.gen_infos_array('DIM', **infos)
+        contents, new_comment = infos_encoder.gen_infos_array('DIM', **infos)
+        comment += new_comment
 
         cname, file_name = gen_constant(gen, pnode, pnode, INFOS)
         const_info = ConstantInfo(file_name, QType.Pow2(bits=8, q=0, signed=True), contents=contents)
@@ -95,7 +96,7 @@ def __init__(self, node_name, cname, params, qrec, gen_ctrl=None):
         # attributes affecting generation
         attrs = {
             'size': in_dim.size(),
-            'width': in_dim.size()/in_dim.shape[axis],
+            'width': in_dim.size()//in_dim.shape[axis],
             'height': in_dim.shape[axis],
             'softmax_op': softmax_op
         }
diff --git a/tools/nntool/graph/manipulations/__init__.py b/tools/nntool/graph/manipulations/__init__.py
index f7ce2002b..e69de29bb 100644
--- a/tools/nntool/graph/manipulations/__init__.py
+++ b/tools/nntool/graph/manipulations/__init__.py
@@ -1,19 +0,0 @@
-# Copyright (C) 2020  GreenWaves Technologies, SAS
-
-# This program is free software: you can redistribute it and/or modify
-# it under the terms of the GNU Affero General Public License as
-# published by the Free Software Foundation, either version 3 of the
-# License, or (at your option) any later version.
-
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-# GNU Affero General Public License for more details.
-
-# You should have received a copy of the GNU Affero General Public License
-# along with this program.  If not, see <https://www.gnu.org/licenses/>.
-
-from .dimensions import add_dimensions
-from .adjust_order import adjust_order
-from .liveness import calculate_liveness
-from .balance_filter import balance_filter, balance_all_filters
diff --git a/tools/nntool/graph/manipulations/adjust_base.py b/tools/nntool/graph/manipulations/adjust_base.py
index 98ff7f199..6881a1444 100644
--- a/tools/nntool/graph/manipulations/adjust_base.py
+++ b/tools/nntool/graph/manipulations/adjust_base.py
@@ -66,7 +66,7 @@ def apply_input_trans(self, G, node, trans: list, index=None):
             if node.in_dims_hint:
                 node.in_dims_hint[idx] = apply_transpose(node.in_dims_hint[idx], trans)
             nid = NodeId(node)
-            if G.quantization:
+            if G.quantization and nid in G.quantization:
                 G.quantization.copy_qrec(node, 'in', idx, params)
 
     def apply_output_trans(self, G, node, trans: list, index=None):
@@ -86,7 +86,8 @@ def apply_output_trans(self, G, node, trans: list, index=None):
             )
             if node.out_dims_hint:
                 node.out_dims_hint[idx] = apply_transpose(node.out_dims_hint[idx], self.invert(trans))
-            if G.quantization:
+            nid = NodeId(node)
+            if G.quantization and nid in G.quantization:
                 G.quantization.copy_qrec(node, 'out', idx, params)
 
     @staticmethod
diff --git a/tools/nntool/graph/manipulations/dimensions.py b/tools/nntool/graph/manipulations/dimensions.py
index f883d4d02..a6c79b97d 100644
--- a/tools/nntool/graph/manipulations/dimensions.py
+++ b/tools/nntool/graph/manipulations/dimensions.py
@@ -13,12 +13,13 @@
 # You should have received a copy of the GNU Affero General Public License
 # along with this program.  If not, see <https://www.gnu.org/licenses/>.
 
-from graph.verify import verify_graph
 import logging
 from typing import Sequence
 
 from generation.naming_convension import (DefaultNamingConvension,
                                           NamingConvension)
+from utils.graph import GraphView
+# from graph.verify import verify_graph
 
 from ..dim import Dim, MissMatchedInputsError, MoreThanOneInputError
 from ..types import (ConcatParameters, ConstantInputParameters, EdgeParameters,
@@ -30,9 +31,10 @@
 
 
 def set_out_edges_multi(G, node: Parameters, dims: Sequence[Dim], step_idx: int,
-                        naming_convension: NamingConvension, edge_type: str = "in_out"):
+                        naming_convension: NamingConvension, update_graph, edge_type: str = "in_out"):
     # clone the dims first so that the edge dims are the same objects as the node output dims
-    dims = node.set_output_size(dims)
+    if update_graph:
+        dims = node.set_output_size(dims)
     out_edges = G.indexed_out_edges(node)
     is_multi_out = len(out_edges) > 1
     for edge_idx, edge_group in enumerate(out_edges):
@@ -49,17 +51,20 @@ def set_out_edges_multi(G, node: Parameters, dims: Sequence[Dim], step_idx: int,
 
 
 def set_out_edges_one(G, node: Parameters, dim: Dim, step_idx: int,
-                      naming_convension: NamingConvension, edge_type: str = "in_out"):
+                      naming_convension: NamingConvension, update_graph, edge_type: str = "in_out"):
     ename = naming_convension.get_edge_name(node, step_idx, edge_type)
     eparams = EdgeParameters(ename, dim, node, 0, step_idx, edge_type)
     for edge in G.out_edges(node.name):
         assert edge.from_idx == 0, "Only for use with nodes that have one output"
         edge.params = eparams
     LOG.debug("%s %s", node.name, ename)
-    eparams.dims = node.set_output_size([dim])[0]
+    if update_graph:
+        eparams.dims = node.set_output_size([dim])[0]
+    else:
+        eparams.dims = node.out_dims[0]
 
 
-def validate_one_in_edge(G, node: Parameters, expect_named: bool = True):
+def validate_one_in_edge(G, node: Parameters, update_graph, expect_named: bool = True):
     edges = G.in_edges(node.name)
     if len(edges) != 1:
         if len(edges) > 1:
@@ -70,11 +75,12 @@ def validate_one_in_edge(G, node: Parameters, expect_named: bool = True):
     assert eparams is not None, "edge parameters not yet set"
     assert not expect_named or eparams.dims.has_keys(
         ['c', 'h', 'w']), "dimensions not yet set"
-    eparams.dims = node.set_input_size([eparams.dims])[0]
+    if update_graph:
+        eparams.dims = node.set_input_size([eparams.dims])[0]
     return eparams
 
 
-def validate_multi_in_edge(G, node: Parameters, expect_named: bool = True):
+def validate_multi_in_edge(G, node: Parameters, update_graph, expect_named: bool = True):
     dims = []
     for edge in G.indexed_in_edges(node.name):
         if edge is None:
@@ -85,64 +91,77 @@ def validate_multi_in_edge(G, node: Parameters, expect_named: bool = True):
         assert not expect_named or eparams.dims.has_keys(
             ['c', 'h', 'w']), "dimensions not yet set"
         dims.append(eparams.dims)
-    try:
-        dims = node.set_input_size(dims)
-    except MissMatchedInputsError as exc:
-        raise ValueError(f'missmatched inputs on node {node.name}') from exc
+    if update_graph:
+        try:
+            dims = node.set_input_size(dims)
+        except MissMatchedInputsError as exc:
+            raise ValueError(f'missmatched inputs on node {node.name}') from exc
     return dims
 
 
 def add_dimensions_concat(G, node: Parameters, step_idx: int,
-                          naming_convension: NamingConvension, indexes):
+                          naming_convension: NamingConvension,
+                          indexes, update_graph):
     del indexes
-    in_dims = validate_multi_in_edge(G, node, expect_named=False)
-    out_dims = node.get_output_size(in_dims)
-    set_out_edges_one(G, node, out_dims[0], step_idx, naming_convension)
+    in_dims = validate_multi_in_edge(G, node, update_graph, expect_named=False)
+    if update_graph:
+        out_dims = node.get_output_size(in_dims)
+    else:
+        out_dims = node.out_dims
+    set_out_edges_one(G, node, out_dims[0], step_idx, naming_convension, update_graph )
 
 
 def add_dimensions_constant(G, node: Parameters, step_idx: int,
-                            naming_convension: NamingConvension, indexes):
+                            naming_convension: NamingConvension, indexes, update_graph):
     node.index = indexes['constant']
     indexes['constant'] += 1
     constant_dims = node.get_output_size(None)
     set_out_edges_one(G, node, constant_dims[0], step_idx,
-                      naming_convension, edge_type="in")
+                      naming_convension, update_graph, edge_type="in")
 
 
 def add_dimensions_input(G, node: Parameters, step_idx: int,
-                         naming_convension: NamingConvension, indexes):
+                         naming_convension: NamingConvension, indexes, update_graph):
     node.index = indexes['input']
     indexes['input'] += 1
     input_dims = node.get_output_size(None)
     node.set_input_size(input_dims)
     set_out_edges_one(G, node, input_dims[0], step_idx,
-                      naming_convension, edge_type="in")
+                      naming_convension, update_graph , edge_type="in")
 
 
 def add_dimensions_output(G, node: Parameters, step_idx: int,
-                          naming_convension: NamingConvension, indexes):
+                          naming_convension: NamingConvension, indexes, update_graph):
     node.index = indexes['output']
     indexes['output'] += 1
-    eparams = validate_one_in_edge(G, node, expect_named=False)
+    eparams = validate_one_in_edge(G, node, update_graph, expect_named=False)
     eparams.edge_type = "out"
     eparams.name = naming_convension.get_edge_name(node, step_idx, "out")
     # set the dimensions of the output node
-    node.set_output_size(node.get_output_size([eparams.dims]))
+    if update_graph:
+        node.set_output_size(node.get_output_size([eparams.dims]))
 
 
 def add_dimensions_unknown_single(G, node: Parameters, step_idx: int,
-                                  naming_convension: NamingConvension, indexes):
+                                  naming_convension: NamingConvension, indexes, update_graph):
     del indexes
-    eparams = validate_one_in_edge(G, node, expect_named=False)
-    out_dims = node.get_output_size([eparams.in_dims])
-    set_out_edges_one(G, node, out_dims[0], step_idx, naming_convension)
+    eparams = validate_one_in_edge(G, node, update_graph, expect_named=False)
+    if update_graph:
+        out_dims = node.get_output_size([eparams.in_dims])
+    else:
+        out_dims = node.out_dims
+    set_out_edges_one(G, node, out_dims[0], step_idx, naming_convension, update_graph)
 
 
 def add_dimensions_unknown(G, node: Parameters, step_idx: int,
-                           naming_convension: NamingConvension):
-    in_dims = validate_multi_in_edge(G, node, expect_named=False)
-    set_out_edges_multi(G, node, node.get_output_size(in_dims),
-                        step_idx, naming_convension)
+                           naming_convension: NamingConvension, update_graph):
+    in_dims = validate_multi_in_edge(G, node, update_graph, expect_named=False)
+    if update_graph:
+        out_dims = node.get_output_size(in_dims)
+    else:
+        out_dims = node.out_dims
+    set_out_edges_multi(G, node, out_dims,
+                        step_idx, naming_convension, update_graph)
 
 
 OP_ROUTINES = {
@@ -154,7 +173,7 @@ def add_dimensions_unknown(G, node: Parameters, step_idx: int,
 }
 
 
-def add_dimensions(G, naming_convension: NamingConvension = None) -> list:
+def add_dimensions(G: GraphView, naming_convension: NamingConvension = None, update_graph=True) -> list:
     """ Walks graph setting all edge names and dimensions
     """
     if naming_convension is None:
@@ -171,15 +190,21 @@ def add_dimensions(G, naming_convension: NamingConvension = None) -> list:
     #                       else "b" + (str(node.step_idx) if node.step_idx else node.name)))
     LOG.debug("inputs: %s", [node.name for node in inputs])
 
-    for node in G.dfs(inputs):
+    def add_step(step, idx):
+        if len(steps) <= idx:
+            steps.extend([None] * (idx + 1 - len(steps)))
+        steps[idx] = step
+
+    for node in G.topological_sort(inputs):
         LOG.debug("add dimensions to: %s", node.name)
-        node.step_idx = len(steps)
-        steps.append({'node': node})
+        if update_graph:
+            node.step_idx = len(steps)
+        add_step({'node': node}, node.step_idx)
         if node.__class__ in OP_ROUTINES:
             OP_ROUTINES[node.__class__](
-                G, node, node.step_idx, naming_convension, indexes)
+                G, node, node.step_idx, naming_convension, indexes, update_graph)
         else:
-            add_dimensions_unknown(G, node, node.step_idx, naming_convension)
+            add_dimensions_unknown(G, node, node.step_idx, naming_convension, update_graph)
     set_aliases(G)
     # verify_graph(G, throw_exception=True)
     return steps
diff --git a/tools/nntool/graph/manipulations/eliminate_transposes/eliminate_transposes.py b/tools/nntool/graph/manipulations/eliminate_transposes/eliminate_transposes.py
index e0276fd36..e1f13ee60 100644
--- a/tools/nntool/graph/manipulations/eliminate_transposes/eliminate_transposes.py
+++ b/tools/nntool/graph/manipulations/eliminate_transposes/eliminate_transposes.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2021  GreenWaves Technologies, SAS
+# Copyright (C) 2021, 2022  GreenWaves Technologies, SAS
 
 # This program is free software: you can redistribute it and/or modify
 # it under the terms of the GNU Affero General Public License as
@@ -13,29 +13,26 @@
 # You should have received a copy of the GNU Affero General Public License
 # along with this program.  If not, see <https://www.gnu.org/licenses/>.
 
-from functools import reduce
 import logging
 from collections.abc import MutableSet
 from copy import deepcopy
 from typing import Iterator, Sequence
 
-from graph.dim import Dim
-from graph.types import (BinaryOpParameters, ConcatParameters,
-                         ConstantInputParameters, FcParameters,
-                         InputParameters, LinearFusionParameters,
-                         OutputParameters, PadParameters, ReshapeParameters,
-                         ReverseParameters, StridedSliceParameters,
-                         TransposeParameters, ActivationParameters)
-from graph.types.base import NNEdge, SensitiveToOrder
-from graph.types.others import CopyParameters, UnaryOpParameters
-from graph.types.tensor_arithmetic import Broadcastable
-from utils.compatible_transposes import (find_all_compatible_transposes,
-                                         find_combination)
+from graph.types import (ActivationParameters, BinaryOpParameters,
+                         Broadcastable, ConcatParameters,
+                         ConstantInputParameters, CopyParameters, FcParameters,
+                         GlobalPoolingParameters, InputParameters,
+                         LinearFusionParameters, NNEdge, OutputParameters,
+                         PadParameters, PowOpParameters, ReshapeParameters,
+                         ReverseParameters, SensitiveToOrder,
+                         StridedSliceParameters, TransposeParameters,
+                         UnaryOpParameters)
+from utils.compatible_transposes import reverse_reshape
 from utils.graph import Node
-from utils.graph_utils.copy_expressions import do_transpose
 from utils.node_id import NodeId
 
-from .eliminate_transposes_actions import (Action, DeleteReshapeAction,
+from .eliminate_transposes_actions import (Action, CantContinueError,
+                                           DeleteReshapeAction,
                                            DeleteTransposeAction,
                                            EndActionDown, EndActionUp,
                                            InsertReshapeAction,
@@ -46,11 +43,11 @@
                                            SetReshapeAction,
                                            SetTransposeAction,
                                            SwitchBatchLinearAction,
-                                           TransposePad, TransposeReverse,
+                                           TransposePad,
+                                           TransposeReverse,
                                            TransposeSlidedSlice)
-from .transpose_helpers import (apply_transpose, get_reshape_transpose,
-                                identity_transpose, reshape_is_transpose,
-                                reverse_transpose, reverses_transpose,
+from .transpose_helpers import (apply_transpose, identity_transpose,
+                                reverse_transpose, reverses_transpose_up,
                                 transpose_does_nothing)
 
 LOG = logging.getLogger("nntool." + __name__)
@@ -67,7 +64,7 @@ def debug(msg):
 TRANSIENT_ACTIONS = {
     PadParameters: TransposePad,
     ReverseParameters: TransposeReverse,
-    StridedSliceParameters: TransposeSlidedSlice
+    StridedSliceParameters: TransposeSlidedSlice,
 }
 
 NODES_TO_EXPLORE_UP = {
@@ -77,10 +74,6 @@ def debug(msg):
 }
 
 
-class CantContinueError(Exception):
-    pass
-
-
 class TransposeHistory():
     def __init__(self, node, from_shape=None, transpose=None, to_shape=None) -> None:
         self.node = node
@@ -140,13 +133,13 @@ def visited_down(self, node, idx=None) -> bool:
 
     def visit_up(self, node, idx):
         val = self._nodes.setdefault(node, set())
-        val.add('up{idx}')
+        val.add(f'up{idx}')
 
     def visited_up(self, node, idx=None) -> set:
         visited = self._nodes.get(node, set())
         if idx is None:
             return any(k.startswith('up') for k in visited)
-        return 'up{idx}' in visited or 'up*' in visited
+        return f'up{idx}' in visited or 'up*' in visited
 
     def visited_direction(self, direction, idx, node) -> bool:
         return f'{direction}{idx}' in self._nodes.get(node, set())
@@ -180,55 +173,6 @@ def __repr__(self) -> str:
         return "{" + ",".join(f"{repr(node)}: {visited}" for node, visited in self._nodes.items()) + "}"
 
 
-def is_broadcasted(from_shape, to_shape):
-    from_len = len(from_shape)
-    to_len = len(to_shape)
-    if from_len >= to_len:
-        return False
-    return tuple(([1] * (to_len - from_len)) + list(from_shape)) == tuple(to_shape)
-
-
-def expand_to_len(trans, length):
-    extra = length-len(trans)
-    return tuple(list(range(extra)) + [dim + extra for dim in trans])
-
-
-def reverse_reshape(trans, from_shape, to_shape):
-    """reverses the effect of this reshape on the transpose"""
-    # if the from_shape -> to_shape is actually a broadcast reshape
-    # i.e. 4, 10, 1 -> 1, 4, 10, 1 we absolutely need to keep the order 4, 10, 1 in
-    # the transpose however the 2 1s in the result are ambiguous so handle this as a
-    # (simple) special case. Just expand the transpose with no transpose at the start
-    # and expand_len + original transpose dim at the end
-    if len(from_shape) == 0 or len(to_shape) == 0:
-        return None
-    if is_broadcasted(from_shape, to_shape):
-        return expand_to_len(trans, len(to_shape))
-
-    return next(iter([t for t in find_all_compatible_transposes(find_combination(from_shape, to_shape), trans)
-                      if len(t) == len(to_shape)]), None)
-
-
-def none_or_idx(trans, idx):
-    return None if trans[idx] is None else idx
-
-
-def reverse_broadcast(old_shape, new_shape, transpose):
-    old_shape_idx = new_shape_idx = 0
-    res_pos = {}
-    while old_shape_idx < len(old_shape) or new_shape_idx < len(new_shape):
-        if old_shape_idx < len(old_shape) and old_shape[old_shape_idx] == new_shape[new_shape_idx]:
-            res_pos[old_shape_idx] = new_shape_idx
-            old_shape_idx += 1
-            new_shape_idx += 1
-        elif new_shape_idx < len(new_shape) and new_shape[new_shape_idx] == 1:
-            new_shape_idx += 1
-        else:
-            raise ValueError(
-                f'reverse broadcast not possible between {old_shape} and {new_shape}')
-    return tuple([res_pos[idx] for idx in transpose] + [idx for idx, _ in enumerate(new_shape) if idx not in res_pos.values()])
-
-
 def requires_reshape(trans1, trans2, dim):
     """Checks if layout shape doesn't change but a reshape is necessary due to 1 position"""
     if (tuple(dim.shape) != tuple(dim.layout_shape) and
@@ -240,56 +184,38 @@ def requires_reshape(trans1, trans2, dim):
     return False
 
 
-def strip_nones(trans):
-    return [i for i in trans if i is not None]
+def check_for_null_transpose(node, transpose):  # raises CantContinueError when transpose is None
+    if transpose is None:
+        raise CantContinueError(f"can't continue at {node.name}")  # @IgnoreException
 
 
-def broadcast_reduce(out_shape, in_shape, transpose):
-    """Looking at a broadcasted input that has a lower rank than out_shape find
-    the equivalent transpose to the transpose on the broadcasted shape before
-    the broadcast
+def check_continue(visited_nodes: VisitedNodes, cur_visited_nodes: VisitedNodes, exclude_nodes, node, direction, idx):
+    """Checks to see if we should skip visiting node on edge
 
     Args:
-        out_shape (Sequence): The full shape of the output of the broadcasted operation
-        in_shape (Sequence): The shape of the unbroadcasted input
-        transpose (Sequence): The transpose on the output
+        visited_nodes (VisitedNodes): All nodes visited in previous eliminations
+        cur_visited_nodes (VisitedNodes): Nodes visited on this branch
+        exclude_nodes (Sequence[Parameters]): Don't visit these nodes
+        node (Parameters): Node on edge
+        direction (str): direction of visit 'down' or 'up'
+        idx (int): edge index
+
+    Raises:
+        CantContinueError: Fail this transpose test
 
     Returns:
-        Tuple: The in shape, the broadcasted in shape, the equivalent transpose
+        bool: True if the node should be skipped, False if it should be visited
     """
-    diff_shape = len(out_shape) - len(in_shape)
-    # broadcast shape with nones
-    exp_in_shape = ([None] * diff_shape) + list(range(len(in_shape)))
-    # apply the reverse of the transpose. now we have the broadcasted shape before the transpose
-    transpose_exp_in_shape = apply_transpose(
-        exp_in_shape, reverse_transpose(transpose))
-    # strip the nones and reverse the result. This gives the transpose of the unbroadcasted shape
-    new_transpose = reverse_transpose(strip_nones(transpose_exp_in_shape))
-    new_shape = ([1] * diff_shape) + in_shape
-    return in_shape, new_shape, new_transpose
-
-
-def broadcast_expand(out_shape, in_shape, transpose):
-    diff_shape = len(out_shape) - len(in_shape)
-    exp_in_shape = ([None] * diff_shape) + list(range(len(in_shape)))
-    transpose_exp_in_shape = apply_transpose(exp_in_shape, transpose)
-    new_transpose = list(range(diff_shape)) + \
-        [dim + diff_shape for dim in transpose]
-    new_shape = ([1] * diff_shape) + in_shape
-    return in_shape, new_shape, new_transpose
-
-
-def check_for_null_transpose(node, transpose):
-    if transpose is None:
-        raise CantContinueError(f"can't continue at {node.name}")  # @IgnoreException
-
-
-def check_continue(visited_nodes: VisitedNodes, cur_visited_nodes: VisitedNodes, exclude_nodes, node, direction, idx):
     all_visited = visited_nodes | cur_visited_nodes
-    if direction == 'up' and all_visited.visited_down(node):
-        return True
-    if direction == 'down' and all_visited.visited_up(node):
-        raise CantContinueError()  # @IgnoreException
+    # if the node is sensitive to order then even if we have already visited it down
+    # we must visit it up and vice versa so that we maybe insert a reshape/transpose after it
+    if not isinstance(node, SensitiveToOrder):
+        if direction == 'up' and all_visited.visited_down(node):
+            # trying to visit node that was already visited in the other direction.
+            return True
+        if direction == 'down' and all_visited.visited_up(node):
+            # trying to visit node that was already visited in the other direction.
+            return True
     if all_visited.visited_direction(direction, idx, node):
         raise CantContinueError()  # @IgnoreException
     if node in exclude_nodes:
@@ -297,16 +223,11 @@ def check_continue(visited_nodes: VisitedNodes, cur_visited_nodes: VisitedNodes,
     return False
 
 
-def strip_leading_ones(shape, in_len):
-    res = []
-    seen_dim = False
-    for dim in shape:
-        if seen_dim:
-            res.append(dim)
-        elif dim != 1:
-            res.append(dim)
-            seen_dim = True
-    return res
+def strip_leading_dim(shape, dim=1):  # drop leading entries equal to `dim`, always keeping at least one
+    res = list(shape)  # list() already makes a copy and, unlike .copy(), also accepts tuples
+    while len(res) > 1 and res[0] == dim:
+        res.pop(0)
+    return tuple(res)
 
 
 def compute_max_shape(dims):
@@ -343,9 +264,9 @@ def search_down(G, node, exclude_nodes, visited_nodes: VisitedNodes, in_edge,
         node : The node to look at
         visited_nodes : Nodes already traversed
         in_edge : The edge we are arriving on at this node
-        transpose_history : A history of the reshapes passed that did not allow us to determine the transpose
-        transpose : The current transpose being propagated. Can be None to indicate that we cannot translate
-                    the transpose via that reshape
+        transpose_history : A history of the reshapes passed that did not allow us
+                            to determine the transpose. Transposes
+                            are in the downwards direction.
 
     Returns:
         A tuple of a list of actions and a list of nodes traversed
@@ -385,7 +306,7 @@ def search_down(G, node, exclude_nodes, visited_nodes: VisitedNodes, in_edge,
 
     # if arriving on a broadcasted input the transpose needs to be expanded
     # since the transpose is only acting on the broadcasted dimensions no reshape is necessary
-    if isinstance(node, Broadcastable) and len(in_shape) != node.out_dims[0].rank:
+    if isinstance(node, (Broadcastable, PowOpParameters)) and len(in_shape) != node.out_dims[0].rank:
         check_for_null_transpose(node, transpose)
         # This could be an expression so need to broadcaset the output
         max_shape = compute_max_shape(node.out_dims)
@@ -414,15 +335,15 @@ def search_down(G, node, exclude_nodes, visited_nodes: VisitedNodes, in_edge,
             if len(edge_in_shape) != len(max_shape):
                 # strip the broadcasted axis from the transpose
                 b_axes = broadcasted_axes(edge_in_shape, max_shape)
-                transpose_without_broadcast = strip_axes_from_transpose(
-                    reverse_transpose(transpose), b_axes)
-                # from shape will be the old shape with the unbroadcasted transpose
+                # Transpose moving down through the broadcast - strip the broadcast off it
+                transpose_without_broadcast = strip_axes_from_transpose(transpose, b_axes)
+                # from shape will be the old shape with the reversed unbroadcasted transpose - i.e. going up
                 from_shape = apply_transpose(
-                    edge_in_shape, transpose_without_broadcast)
-                # to shape is the broadcasted input shape with the transpose with the leading ones removed
+                    edge_in_shape, reverse_transpose(transpose_without_broadcast))
+                # to shape is the broadcasted input shape with the reverse transpose with the leading ones removed
                 broadcasted_shape = ([1] * len(b_axes)) + list(edge_in_shape)
-                to_shape = strip_leading_ones(apply_transpose(
-                    broadcasted_shape, reverse_transpose(transpose)), len(from_shape))
+                to_shape = strip_leading_dim(apply_transpose(
+                    broadcasted_shape, reverse_transpose(transpose)))
                 # if they are not equal insert a reshape
                 if from_shape != to_shape:
                     info(
@@ -456,27 +377,36 @@ def search_down(G, node, exclude_nodes, visited_nodes: VisitedNodes, in_edge,
         if filter_node.batch_size > 1:
             info(
                 f"rejected {node.name} - multibatch linear layer - inserting transpose {transpose}")
-            return [InsertTransposeAction(node, direction='in', idx=in_edge.to_idx, transpose=transpose), EndActionDown(node)], cur_visited_nodes
+            return [
+                InsertTransposeAction(
+                    node, direction='in', idx=in_edge.to_idx, transpose=transpose),
+                EndActionDown(node)], cur_visited_nodes
         info(
             f"accepted {node.name} - linear layer reorder input - {transpose}")
         qrec = G.quantization and G.quantization[NodeId(node)]
-        return cur_actions + [ReorderLinearAction.in_from_history(node, transpose_history, qrec), EndActionDown(node)], cur_visited_nodes
+        return cur_actions + [
+            ReorderLinearAction.in_from_history(node, transpose_history, qrec),
+            EndActionDown(node)], cur_visited_nodes
 
     if isinstance(node, TransposeParameters):
-        # TODO - Might be able to get rid of this and check history
         check_for_null_transpose(node, transpose)
-        if reverses_transpose(transpose, node.transpose, node.out_dims[0]):
+        reverses_transpose, old_shape = reverses_transpose_up(transpose, node.transpose, node.out_dims[0])
+        if reverses_transpose:
             info(
                 f"accepted {node.name} - transpose {node.transpose} reversed in by {transpose} on {node.in_dims[0]}")
-            reshape = requires_reshape(
-                transpose, node.transpose, node.in_dims[0])
-            if reshape:
+            if old_shape:
+                reshape = (old_shape, node.out_dims[0].shape)
                 info(f"requires reshape {reshape[0]} -> {reshape[1]}")
+            else:
+                reshape = None
             return [DeleteTransposeAction(node, reshape=reshape), EndActionDown(node)], cur_visited_nodes
         new_transpose = apply_transpose(transpose, node.transpose)
         info(
-            f"rejected {node.name} - transpose - does not reverse - absorbing {transpose} into {node.transpose} -> {new_transpose}")
-        return [SetTransposeAction(node, new_transpose), EndActionDown(node)], cur_visited_nodes
+            f"rejected {node.name} - transpose - does not reverse - absorbing {transpose} "
+            f"into {node.transpose} -> {new_transpose}")
+        return [
+            SetTransposeAction(node, new_transpose),
+            EndActionDown(node)], cur_visited_nodes
 
     if isinstance(node, OutputParameters):
         # TODO - Might be able to get rid of this and check history
@@ -484,7 +414,10 @@ def search_down(G, node, exclude_nodes, visited_nodes: VisitedNodes, in_edge,
         if node.fixed_order:
             info(
                 f"rejected {node.name} - fixed order output - inserting transpose {transpose}")
-            return [InsertTransposeAction(node, direction='in', idx=in_edge.to_idx, transpose=transpose), EndActionDown(node)], cur_visited_nodes
+            return [
+                InsertTransposeAction(
+                    node, direction='in', idx=in_edge.to_idx, transpose=transpose),
+                EndActionDown(node)], cur_visited_nodes
         info(
             f"accepted {node.name} - output without fixed order - transpose output {transpose}")
         # No change here since the output dimensions will be computed by the shape inference
@@ -493,23 +426,20 @@ def search_down(G, node, exclude_nodes, visited_nodes: VisitedNodes, in_edge,
     if isinstance(node, StridedSliceParameters) and node.slice_shape != node.out_shape:
         # strided slice that is also reshaping
         check_for_null_transpose(node, transpose)
-        new_transpose = reverse_transpose(reverse_reshape(
-            reverse_transpose(transpose), node.slice_shape, node.out_shape))
+        new_transpose, from_shape, to_shape = reverse_reshape(
+            transpose, node.slice_shape, node.out_shape)
         if new_transpose is None:
             info(
-                f"rejected {node.name} - transpose out - does not reverse - inserting transpose {transpose}")
+                f"rejected {node.name} - cannot pass slice reshape - inserting transpose {transpose}")
             return [InsertTransposeAction(node, direction='in', idx=in_edge.to_idx, transpose=transpose),
                     EndActionDown(node)], cur_visited_nodes
 
         cur_actions.append(TransposeSlidedSlice(
-            node, reverse_transpose(transpose), transpose_out=reverse_transpose(new_transpose), dir="down"))
+            node, transpose, out_shape=to_shape, dir="down"))
 
         if identity_transpose(new_transpose):
             return cur_actions + [EndActionDown(node)], cur_visited_nodes
 
-        from_shape = do_transpose(reverse_transpose(
-            transpose), node.slice_shape) if transpose is not None else None
-
         transpose_history = transpose_history + \
             [TransposeHistory(node, node.slice_shape,
                               new_transpose, node.out_shape)]
@@ -521,49 +451,21 @@ def search_down(G, node, exclude_nodes, visited_nodes: VisitedNodes, in_edge,
     elif isinstance(node, ReshapeParameters):
         # TODO - Might be able to get rid of this and check history
         check_for_null_transpose(node, transpose)
-        if reshape_is_transpose(node.old_shape.shape, node.shape.shape):
-            # if the reshape looks like a transpose then treat it as one. THe reshape rewriter sometimes gets
-            # the order wrong in this case
-            old_transpose = get_reshape_transpose(
-                node.old_shape.shape, node.shape.shape)
-            if reverses_transpose(transpose, old_transpose):
-                cur_actions += [
-                    DeleteReshapeAction(
-                        node
-                    )
-                ]
-                return cur_actions + [
-                    DeleteReshapeAction(
-                        node
-                    ),
-                    EndActionDown(node)], cur_visited_nodes
-            new_transpose = apply_transpose(
-                transpose, old_transpose)
-            info(
-                f"pass reshape that is transpose {node.name} down trans: old {transpose} new {new_transpose} shape: old {node.old_shape} new {node.shape}")
-            # insert an action to rewrite the reshape shapes
-            from_shape = apply_transpose(
-                node.old_shape.shape, reverse_transpose(transpose))
-            to_shape = apply_transpose(
-                node.shape.shape, reverse_transpose(transpose))
-        else:
-            # the transpose that we are actually applying is the reverse of the transpose that we are propagating down
-            # So we reverse the transpose before evaluating the reshape and then reverse the result
-            new_transpose = reverse_transpose(reverse_reshape(
-                reverse_transpose(transpose), node.old_shape, node.shape))
+        new_transpose, from_shape, to_shape = reverse_reshape(
+            transpose, node.old_shape, node.shape)
+        info(
+            f"pass reshape {node.name} down trans: old {transpose} new {new_transpose} "
+            f"shape: old {node.old_shape} new {node.shape}")
+
+        if new_transpose is None and len(node.shape) > 1:
             info(
-                f"pass reshape {node.name} down trans: old {transpose} new {new_transpose} shape: old {node.old_shape} new {node.shape}")
+                f"rejected {node.name} - cannot pass reshape - inserting transpose {transpose}")
+            return [
+                InsertTransposeAction(
+                    node, direction='in', idx=in_edge.to_idx, transpose=transpose),
+                EndActionDown(node)], cur_visited_nodes
 
-            if new_transpose is None and len(node.shape) > 1:
-                info(
-                    f"rejected {node.name} - transpose out - does not reverse - inserting transpose {transpose}")
-                return [InsertTransposeAction(node, direction='in', idx=in_edge.to_idx, transpose=transpose), EndActionDown(node)], cur_visited_nodes
-
-            # insert an action to rewrite the reshape shapes
-            from_shape = apply_transpose(node.old_shape.shape,
-                                         reverse_transpose(transpose)) if transpose is not None else None
-            to_shape = apply_transpose(node.shape.shape, reverse_transpose(
-                new_transpose)) if new_transpose is not None else None
+        # insert an action to rewrite the reshape shapes
         info(f"rewrite reshape to {from_shape}->{to_shape}")
         if from_shape is None or to_shape is None or from_shape != to_shape:
             cur_actions += [
@@ -589,16 +491,21 @@ def search_down(G, node, exclude_nodes, visited_nodes: VisitedNodes, in_edge,
 
         if new_transpose is None:
             try:
-                return continue_down(G, node, exclude_nodes, visited_nodes, cur_visited_nodes.copy(), cur_actions.copy(), transpose_history, new_transpose)
+                return continue_down(G, node, exclude_nodes, visited_nodes, cur_visited_nodes.copy(),
+                                     cur_actions.copy(), transpose_history, new_transpose)
             except CantContinueError as ex:
                 if transpose is None:
                     raise ex
                 info(
-                    f"rejected {node.name} - transpose out - does not reverse - inserting transpose {transpose}")
-                return [InsertTransposeAction(node, direction='in', idx=in_edge.to_idx, transpose=transpose), EndActionDown(node)], cur_visited_nodes
+                    f"rejected {node.name} - cannot continue {ex} - inserting transpose {transpose}")
+                return [
+                    InsertTransposeAction(
+                        node, direction='in', idx=in_edge.to_idx, transpose=transpose),
+                    EndActionDown(node)], cur_visited_nodes
         transpose = new_transpose
 
-    return continue_down(G, node, exclude_nodes, visited_nodes, cur_visited_nodes, cur_actions, transpose_history, transpose)
+    return continue_down(G, node, exclude_nodes, visited_nodes, cur_visited_nodes,
+                         cur_actions, transpose_history, transpose)
 
 
 def continue_down(G, node, exclude_nodes, visited_nodes, cur_visited_nodes, cur_actions, transpose_history, transpose):
@@ -621,7 +528,8 @@ def search_up(G, node, exclude_nodes, visited_nodes, out_edge, transpose_history
         info(
             f'accepted {node.name} - single dimension transpose')
         return [EndActionUp(node)], cur_visited_nodes
-    if isinstance(node, SensitiveToOrder) and transpose_does_nothing(reverse_transpose(transpose), node.out_dims[out_edge.from_idx].shape):
+    if (isinstance(node, SensitiveToOrder) and
+            transpose_does_nothing(reverse_transpose(transpose), node.out_dims[out_edge.from_idx].shape)):
         new_shape = apply_transpose(
             node.out_dims[out_edge.from_idx].shape, reverse_transpose(transpose))
         # could be that the transpose does nothing to the data layout but still changes the positions of
@@ -643,7 +551,10 @@ def search_up(G, node, exclude_nodes, visited_nodes, out_edge, transpose_history
         check_for_null_transpose(node, transpose)
         info(
             f'rejected {node.name}  - sensitive to order - inserting transpose {transpose}')
-        return [InsertTransposeAction(node, direction='out', idx=out_edge.from_idx, out_edge=out_edge, transpose=reverse_transpose(transpose)), EndActionUp(node)], cur_visited_nodes
+        return [
+            InsertTransposeAction(node, direction='out', idx=out_edge.from_idx,
+                                  out_edge=out_edge, transpose=reverse_transpose(transpose)),
+            EndActionUp(node)], cur_visited_nodes
 
     cur_actions = []
 
@@ -664,7 +575,9 @@ def search_up(G, node, exclude_nodes, visited_nodes, out_edge, transpose_history
             exclude_nodes,
             visited_nodes | cur_visited_nodes,
             edge,
-            [TransposeHistory(node, node.out_dims[edge.from_idx], transpose, apply_transpose(node.out_dims[edge.from_idx], transpose))])
+            [
+                TransposeHistory(node, node.out_dims[edge.from_idx], transpose,
+                                 apply_transpose(node.out_dims[edge.from_idx], transpose))])
         cur_visited_nodes |= visited_down_nodes
         cur_actions += new_actions
 
@@ -680,15 +593,20 @@ def search_up(G, node, exclude_nodes, visited_nodes, out_edge, transpose_history
                     f"accepted {node.name} - linear layer switch batch dimension")
                 return cur_actions + [SwitchBatchLinearAction(node), EndActionUp(node)], cur_visited_nodes
             info(f"rejected {node.name} - batched linear")
-            return [InsertTransposeAction(node, direction='out', idx=out_edge.from_idx, out_edge=out_edge, transpose=reverse_transpose(transpose)), EndActionUp(node)], cur_visited_nodes
+            return [
+                InsertTransposeAction(node, direction='out', idx=out_edge.from_idx,
+                                      out_edge=out_edge, transpose=reverse_transpose(transpose)),
+                EndActionUp(node)], cur_visited_nodes
         info(f"accepted {node.name} - linear layer reorder output")
         qrec = G.quantization and G.quantization[NodeId(node)]
-        return cur_actions + [ReorderLinearAction.out_from_history(node, transpose_history, qrec), EndActionUp(node)], cur_visited_nodes
+        return cur_actions + [
+            ReorderLinearAction.out_from_history(
+                node, transpose_history, qrec),
+            EndActionUp(node)], cur_visited_nodes
 
     # Transpose may reverse the propagated transpose or be reordered
     if isinstance(node, TransposeParameters):
         check_for_null_transpose(node, transpose)
-        # TODO - in_dims or out_dims - 99% sure in_dims
         if tuple(node.transpose) == tuple(transpose):
             info(
                 f"accepted {node.name} - transpose {node.transpose} equals {transpose} on {node.in_dims[0]}")
@@ -696,11 +614,18 @@ def search_up(G, node, exclude_nodes, visited_nodes, out_edge, transpose_history
                 node.transpose, transpose, node.out_dims[0])
             if reshape:
                 info(f"requires reshape {reshape[0]} -> {reshape[1]}")
-            return cur_actions + [DeleteTransposeAction(node, reshape=reshape), EndActionUp(node)], cur_visited_nodes
-        # TODO - This should merge with the existing Transpose
-        new_transpose = apply_transpose(node.transpose, transpose)
+            return cur_actions + [
+                DeleteTransposeAction(node, reshape=reshape), EndActionUp(node)], cur_visited_nodes
+
+        # absorb transpose in a -> transpose T1 -> b -> existing trans node T2 -> c
+        # a -> TNew -> c
+        # Apply reversed T1 to T2
+
+        new_transpose = apply_transpose(
+            node.transpose, reverse_transpose(transpose))
         info(
-            f"rejected {node.name} - transpose - does not reverse - absorbing {transpose} into {node.transpose} -> {new_transpose}")
+            f"rejected {node.name} - transpose - does not reverse - absorbing "
+            f"{transpose} into {node.transpose} -> {new_transpose}")
         return [SetTransposeAction(node, new_transpose), EndActionDown(node)], cur_visited_nodes
 
     # Input can be reordered if not frozen
@@ -708,30 +633,48 @@ def search_up(G, node, exclude_nodes, visited_nodes, out_edge, transpose_history
         check_for_null_transpose(node, transpose)
         if node.fixed_order:
             info(f"rejected {node.name} - fixed order input")
-            return [InsertTransposeAction(node, direction='out', idx=out_edge.from_idx, out_edge=out_edge, transpose=transpose), EndActionUp(node)], cur_visited_nodes
+            return [
+                InsertTransposeAction(node, direction='out', idx=out_edge.from_idx,
+                                      out_edge=out_edge, transpose=transpose), EndActionUp(node)], cur_visited_nodes
 
         info(
             f"accepted {node.name} - input without fixed order - transpose input {reverse_transpose(transpose)}")
-        return cur_actions + [ReorderInputDims.from_history(node, transpose_history, transpose=reverse_transpose(transpose)), EndActionUp(node)], cur_visited_nodes
+        return cur_actions + [
+            ReorderInputDims.from_history(
+                node, transpose_history, transpose=reverse_transpose(transpose)),
+            EndActionUp(node)], cur_visited_nodes
 
     # Constant can be reordered
     if isinstance(node, ConstantInputParameters):
         check_for_null_transpose(node, transpose)
         info(
             f"accepted {node.name} - constant input - transpose constant {transpose}")
-        return cur_actions + [ReorderConstantInput.from_history(node, transpose_history, transpose=reverse_transpose(transpose)), EndActionUp(node)], cur_visited_nodes
+        return cur_actions + [
+            ReorderConstantInput.from_history(
+                node, transpose_history, transpose=reverse_transpose(transpose)),
+            EndActionUp(node)], cur_visited_nodes
 
     # Conditions that can pass through the Transpose
     if isinstance(node, StridedSliceParameters) and node.changes_shape:
-        reversed_below = reverse_transpose(transpose)
-        reversed_above = reverse_broadcast(
-            node.out_shape, node.post_slice_shape, reversed_below)
-        new_transpose = reverse_transpose(reversed_above)
+        # special case for a strided slice that also has a reshape
+        check_for_null_transpose(node, transpose)
+        new_transpose, from_shape, to_shape = reverse_reshape(
+            transpose, node.slice_shape, node.out_shape, going_up=True)
+        if new_transpose is None:
+            info(
+                f"rejected {node.name} - cannot pass slice reshape - inserting transpose {transpose}")
+            return [InsertTransposeAction(node, direction='out', idx=0, transpose=reverse_transpose(transpose)),
+                    EndActionDown(node)], cur_visited_nodes
+
+        cur_actions.append(TransposeSlidedSlice(
+            node, reverse_transpose(transpose), out_shape=to_shape))
+
+        if identity_transpose(new_transpose):
+            return cur_actions + [EndActionUp(node)], cur_visited_nodes
+
         transpose_history = transpose_history + \
             [TransposeHistory(node, node.out_shape,
-                              new_transpose, node.post_slice_shape)]
-        cur_actions.append(
-            TransposeSlidedSlice(node, reversed_above, "up", transpose))
+                              new_transpose, node.in_dims[0].shape)]
         transpose = new_transpose
     elif node.__class__ in TRANSIENT_ACTIONS:
         check_for_null_transpose(node, transpose)
@@ -741,26 +684,28 @@ def search_up(G, node, exclude_nodes, visited_nodes, out_edge, transpose_history
 
     elif isinstance(node, ReshapeParameters):
         check_for_null_transpose(node, transpose)  # TODO - may eliminate
-        new_transpose = reverse_reshape(reverse_transpose(
-            transpose), node.shape, node.old_shape)
+        # reversed transpose is being propagated up
+        new_transpose, from_shape, to_shape = reverse_reshape(
+            transpose, node.old_shape, node.shape, going_up=True)
         # if the upwards shape has one dimension we keep going since we want to find
         # nodes such as a linear layer that can reorder their output filters
         # This could be extended to recurrent layers for the inner dimension
         info(
-            f"pass reshape {node.name} up trans: old {transpose} new {new_transpose} shape: {node.old_shape} -> {node.shape}")
+            f"pass reshape {node.name} up trans: old {transpose} new {new_transpose} "
+            f"shape: {node.old_shape} -> {node.shape}")
         if new_transpose is None and len(node.old_shape) > 1:
-            info(f"rejected {node.name} - transpose in - does not reverse")
-            return [InsertTransposeAction(node, direction='out', idx=out_edge.from_idx, out_edge=out_edge, transpose=reverse_transpose(transpose)), EndActionUp(node)], cur_visited_nodes
+            info(f"rejected {node.name} - cannot pass reshape - inserting transpose {transpose}")
+            # since we are going up the transpose is in the up direction so needs to be reversed
+            return [
+                InsertTransposeAction(node, direction='out', idx=out_edge.from_idx,
+                                      out_edge=out_edge, transpose=reverse_transpose(transpose)),
+                EndActionUp(node)], cur_visited_nodes
 
         # insert an action to rewrite the reshape shapes
-        from_shape = node.old_shape.calc_transpose(
-            new_transpose) if new_transpose is not None else None
-        to_shape = node.shape.calc_transpose(
-            reverse_transpose(transpose)) if transpose is not None else None
         transpose_history = transpose_history + \
             [TransposeHistory(node, node.shape, new_transpose, node.old_shape)]
         info(f"rewrite reshape to {from_shape}->{to_shape}")
-        if from_shape is None or to_shape is None or from_shape.shape != to_shape.shape:
+        if from_shape is None or to_shape is None or from_shape != to_shape:
             cur_actions.extend([
                 SetReshapeAction(
                     node,
@@ -781,16 +726,21 @@ def search_up(G, node, exclude_nodes, visited_nodes, out_edge, transpose_history
         if new_transpose is None:
             try:
                 # @IgnoreException
-                return continue_up(G, node, exclude_nodes, visited_nodes, cur_visited_nodes.copy(), cur_actions.copy(), transpose_history, transpose)
+                return continue_up(G, node, exclude_nodes, visited_nodes, cur_visited_nodes.copy(),
+                                   cur_actions.copy(), transpose_history, transpose)
             except CantContinueError as ex:
                 if transpose is None:
                     raise ex
-                info(f"rejected {node.name} - transpose in - does not reverse")
-                return [InsertTransposeAction(node, direction='out', idx=out_edge.from_idx, out_edge=out_edge, transpose=reverse_transpose(transpose)), EndActionUp(node)], cur_visited_nodes
+                info(f"rejected {node.name} - cannot continue {ex} - inserting transpose {transpose}")
+                return [
+                    InsertTransposeAction(node, direction='out', idx=out_edge.from_idx,
+                                          out_edge=out_edge, transpose=reverse_transpose(transpose)),
+                    EndActionUp(node)], cur_visited_nodes
         transpose = new_transpose
 
     # Continue to visit upwards
-    return continue_up(G, node, exclude_nodes, visited_nodes, cur_visited_nodes, cur_actions, transpose_history, transpose)
+    return continue_up(G, node, exclude_nodes, visited_nodes, cur_visited_nodes,
+                       cur_actions, transpose_history, transpose)
 
 
 def continue_up(G, node, exclude_nodes, visited_nodes, cur_visited_nodes, cur_actions, transpose_history, transpose):
@@ -802,19 +752,18 @@ def continue_up(G, node, exclude_nodes, visited_nodes, cur_visited_nodes, cur_ac
         if check_continue(visited_nodes, cur_visited_nodes, exclude_nodes, edge.from_node, 'up', edge.from_idx):
             continue
         edge_in_shape = node.in_dims[edge.to_idx].shape
-        if isinstance(node, Broadcastable) and len(edge_in_shape) != node.out_dims[0].rank:
+        if isinstance(node, (Broadcastable, PowOpParameters)) and len(edge_in_shape) != node.out_dims[0].rank:
             max_shape = compute_max_shape(node.out_dims)
             b_axes = broadcasted_axes(edge_in_shape, max_shape)
 
-            transpose_without_broadcast = strip_axes_from_transpose(
-                reverse_transpose(transpose), b_axes)
+            transpose_without_broadcast = strip_axes_from_transpose(transpose, b_axes)
             # from shape will be the old shape with the unbroadcasted transpose
             from_shape = apply_transpose(
-                edge_in_shape, transpose_without_broadcast)
+                edge_in_shape, reverse_transpose(transpose_without_broadcast))
             # to shape is the broadcasted input shape with the transpose with the leading ones removed
             broadcasted_shape = ([1] * len(b_axes)) + list(edge_in_shape)
-            to_shape = strip_leading_ones(apply_transpose(
-                broadcasted_shape, reverse_transpose(transpose)), len(from_shape))
+            to_shape = strip_leading_dim(apply_transpose(
+                broadcasted_shape, reverse_transpose(transpose)))
             # if they are not equal insert a reshape
             if from_shape != to_shape:
                 info(
@@ -896,7 +845,8 @@ def combine_transposes(G):
     for tstart, tend in trans_pairs:
         new_transpose = apply_transpose(tstart.transpose, tend.transpose)
         info(
-            f'combine transposes {tstart.name} and {tend.name} {tstart.transpose} & {tend.transpose} -> {new_transpose}')
+            f'combine transposes {tstart.name} and {tend.name} {tstart.transpose} & '
+            f'{tend.transpose} -> {new_transpose}')
         tstart.transpose = new_transpose
         G.remove_and_reconnect(tend, edge_class=NNEdge)
 
@@ -948,10 +898,12 @@ def delete_step_idx(G, action: DeleteTransposeAction):
     return G.in_edges(action.node)[0].from_node.step_idx
 
 
-def eliminate_transposes(G, debug_function=None, steps=None, single_step=False, do_silly=True):
+def eliminate_transposes(G, debug_function=None, steps=None, single_step=False, do_silly=True, only_up=False):
     info("eliminating unnecessary transposes")
     found_results = True
     pass_count = 0
+    # keep trying to eliminate until we can't do more
+    # This should not loop since there is a bias in pushing transposes down
     while found_results:
         if steps is not None:
             if pass_count >= steps:
@@ -965,7 +917,8 @@ def eliminate_transposes(G, debug_function=None, steps=None, single_step=False,
         visited_nodes = set()
         actions = []
         info(f"search for transposes +++ STEP {pass_count}")
-        transposes = G.nodes(node_classes=TransposeParameters)
+        transposes = sorted(
+            G.nodes(node_classes=TransposeParameters), key=lambda node: node.name)
         while transposes:
             transpose_node = transposes.pop(0)
             if transpose_node in visited_nodes:
@@ -998,6 +951,8 @@ def eliminate_transposes(G, debug_function=None, steps=None, single_step=False,
                 cur_actions_up.insert(0, DeleteTransposeAction(transpose_node))
             # search down for elimination
             try:
+                if only_up:
+                    raise CantContinueError
                 cur_visited_down = VisitedNodes()
                 cur_visited_down.visit_down(transpose_node, 0)
                 cur_actions_down = []
@@ -1032,7 +987,7 @@ def eliminate_transposes(G, debug_function=None, steps=None, single_step=False,
             down_count = count_eliminated(cur_actions_down)
             # if the count is zero then the transpose has been eliminated however
             # 1 is better than 0 since another real transpose was deleted rather than a reorder etc
-            # always choose up before down since up is where we will transpose constants rather than reshaping them
+            # always favor up before down since up is where we will transpose constants
             if up_count > 0 and up_count >= down_count:
                 info(
                     f'found elimination for {transpose_node.name} upwards - {up_count} eliminated')
@@ -1042,7 +997,7 @@ def eliminate_transposes(G, debug_function=None, steps=None, single_step=False,
                 visited_nodes.add(transpose_node)
                 if single_step or steps is not None:
                     break
-            # if transpose cannot be removed upwards movement push the transpose down if it actually moved
+            # if transpose cannot be removed upwards push the transpose down if it actually moved
             elif down_count > 0 or (down_count == 0 and transpose_moved(G, cur_actions_down)):
                 info(
                     f'found elimination for {transpose_node.name} downwards - {down_count} eliminated')
@@ -1053,8 +1008,7 @@ def eliminate_transposes(G, debug_function=None, steps=None, single_step=False,
                 if single_step or steps is not None:
                     break
             else:
-                info(
-                    f'no elimination for {transpose_node.name} found')
+                info(f'no elimination for {transpose_node.name} found')
 
         if found_results:
             info("eliminate transposes")
diff --git a/tools/nntool/graph/manipulations/eliminate_transposes/eliminate_transposes_actions.py b/tools/nntool/graph/manipulations/eliminate_transposes/eliminate_transposes_actions.py
index 55927a224..6a0d8c462 100644
--- a/tools/nntool/graph/manipulations/eliminate_transposes/eliminate_transposes_actions.py
+++ b/tools/nntool/graph/manipulations/eliminate_transposes/eliminate_transposes_actions.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2020  GreenWaves Technologies, SAS
+# Copyright (C) 2020, 2022  GreenWaves Technologies, SAS
 
 # This program is free software: you can redistribute it and/or modify
 # it under the terms of the GNU Affero General Public License as
@@ -28,12 +28,19 @@
 
 LOG = logging.getLogger("nntool." + __name__)
 
+
+class CantContinueError(Exception):
+    pass
+
+
 def info(msg):
     LOG.info(msg)
 
+
 def debug(msg):
     LOG.debug(msg)
 
+
 class Action(ABC):
     def __init__(self, node) -> None:
         self.node = node
@@ -162,6 +169,7 @@ def _execute(self, node, G):
     def __str__(self) -> str:
         return f"insert reshape at {self.node.name}:{self.direction}_{self.idx} in {self.in_shape} out {self.out_shape}"
 
+
 def make_dim(shape):
     if shape is None:
         return shape
@@ -169,6 +177,7 @@ def make_dim(shape):
         return shape.clone()
     return Dim.unnamed(shape)
 
+
 class SetReshapeAction(Action):
     def __init__(self, node, in_shape=None, out_shape=None) -> None:
         super(SetReshapeAction, self).__init__(node)
@@ -196,21 +205,22 @@ def __str__(self) -> str:
 
 
 class TransposeSlidedSlice(Action):
-    def __init__(self, node, transpose_in, dir=None, transpose_out=None) -> None:
+    def __init__(self, node, transpose, dir=None, out_shape=None) -> None:
         super(TransposeSlidedSlice, self).__init__(node)
-        self.transpose_in = tuple(transpose_in)
-        if transpose_out is None:
-            self.transpose_out = self.transpose_in
-        else:
-            self.transpose_out = tuple(transpose_out)
+        self.transpose = tuple(transpose)
+        self.shape_out = out_shape
 
     def _execute(self, node, G):
         info(f"{self}")
-        node.act_slice = [node.act_slice[idx] for idx in self.transpose_in]
-        node.out_shape = [node.out_shape[idx] for idx in self.transpose_out]
+        node.act_slice = apply_transpose(node.act_slice, self.transpose)
+        if self.shape_out is not None:
+            node.out_shape = self.shape_out
+        else:
+            node.out_shape = apply_transpose(node.out_shape, self.transpose)
 
     def __str__(self) -> str:
-        return "%s transpose slided slice parameters with %s/%s" % (self.node.name, self.transpose_in, self.transpose_out)
+        out_shape = "unchanged" if self.shape_out is None else f"changed to {self.shape_out}"
+        return f"{self.node.name} transpose slided slice parameters with {self.transpose} out shape {out_shape}"
 
 
 class TransposePad(Action):
@@ -227,19 +237,26 @@ def __str__(self) -> str:
         return "%s transpose pad parameters with %s" % (self.node.name, self.transpose)
 
 
-class TransposeReverse(Action):
+class TransposeAxisBase(Action):
     def __init__(self, node, transpose, dir=None) -> None:
-        super(TransposeReverse, self).__init__(node)
+        super(TransposeAxisBase, self).__init__(node)
         self.transpose = tuple(transpose)
 
     def _execute(self, node, G):
         info(f"{self}")
         node.axis = self.transpose[node.axis]
 
+
+class TransposeReverse(TransposeAxisBase):
     def __str__(self) -> str:
         return "%s transpose reverse parameters with %s" % (self.node.name, self.transpose)
 
 
+class TransposeGlobalPool(TransposeAxisBase):
+    def __str__(self) -> str:
+        return "%s transpose global pool parameters with %s" % (self.node.name, self.transpose)
+
+
 class TransposeInputBase(Action):
     def __init__(self, node, transpose, dir=None) -> None:
         super(TransposeInputBase, self).__init__(node)
@@ -360,25 +377,35 @@ def __str__(self) -> str:
 
 
 class ReorderLinearAction(Action):
-    def __init__(self, node, direction, transpose, shape, qrec=None) -> None:
+    def __init__(self, node, direction, transpose, shape, set_reshape_shape=None, qrec=None) -> None:
         super(ReorderLinearAction, self).__init__(node)
         self.direction = direction
         self.shape = shape
         self.transpose = tuple(transpose)
         self.qrec = qrec
+        self.set_reshape_shape = set_reshape_shape
 
     @classmethod
-    def from_history(cls, node, history, qrec, dir):
+    def from_history(cls, node, history, qrec, direction):
         # Find the first entry in the transpose history that actually has a transpose
-        first_valid_entry = next(iter([rec
-                                       for rec in reversed(history)
-                                       if rec.transpose]))
+        entry_idx, first_valid_entry = next(iter([(idx, rec) for idx, rec in enumerate(reversed(history))
+                                                  if rec.transpose]))
         # arriving from the top the transpose is in the down direction and from the
         # bottom in the up direction so in both cases we need to reverse it
         transpose = tuple(reverse_transpose(first_valid_entry.transpose))
         # shape closest to the node
         shape = tuple(first_valid_entry.to_shape)
-        return cls(node, dir, transpose, shape, qrec=qrec)
+        set_reshape_shape = None
+        # if direction == "out":
+        #     first_reshape = next(iter([elem.node for elem
+        #                                in list(reversed(history))[:entry_idx] if isinstance(elem.node, ReshapeParameters)]), None)
+        #     if first_reshape:
+        #         if shape != tuple(first_reshape.shape.shape):
+        #             raise CantContinueError(f'reshape {first_reshape.name} after linear {node.name} has '
+        #                                     f'incorrect out shape {first_reshape.shape.shape} to apply transpose {transpose}')
+        #         set_reshape_shape = (first_reshape, apply_transpose(first_reshape.shape.shape, transpose))
+
+        return cls(node, direction, transpose, shape, set_reshape_shape=set_reshape_shape, qrec=qrec)
 
     @classmethod
     def out_from_history(cls, node, history, qrec):
@@ -390,7 +417,8 @@ def in_from_history(cls, node, history, qrec):
 
     def _execute(self, node, G):
         info(f"{self}")
-        filter_node = node.contained_filters()[0] if isinstance(node, LinearFusionParameters) else node
+        filter_node = node.contained_filters()[0] if isinstance(
+            node, LinearFusionParameters) else node
         in_edges = G.indexed_in_edges(node.name)
         weights_node = in_edges[1].from_node
         if self.direction == "in":
@@ -425,7 +453,11 @@ def _execute(self, node, G):
                         list(self.transpose)
                     ),
                     biases_node.value.shape)
-            nid = NodeId(node, filter_node) if isinstance(node, LinearFusionParameters) else NodeId(node)
+            nid = NodeId(node, filter_node) if isinstance(
+                node, LinearFusionParameters) else NodeId(node)
+            if self.set_reshape_shape:
+                self.set_reshape_shape[0].shape = Dim.unnamed(
+                    self.set_reshape_shape[1])
             # since the output channel order has changed we need to make channel scaled qrec match this
             if G.quantization and nid in G.quantization:
                 qrec = G.quantization[nid]
@@ -446,8 +478,6 @@ def _execute(self, node, G):
                         if len(qrec.in_qs) > 2:
                             fqrec.in_qs[2] = qrec.in_qs[2]
 
-
-
     def __str__(self) -> str:
         return "reorder linear layer %s %s with shape %s transposed %s" % (self.node.name, self.direction,
                                                                            self.shape, self.transpose)
diff --git a/tools/nntool/graph/manipulations/eliminate_transposes/transpose_helpers.py b/tools/nntool/graph/manipulations/eliminate_transposes/transpose_helpers.py
index 00b865e59..f043bd65a 100644
--- a/tools/nntool/graph/manipulations/eliminate_transposes/transpose_helpers.py
+++ b/tools/nntool/graph/manipulations/eliminate_transposes/transpose_helpers.py
@@ -22,20 +22,6 @@ def reverse_transpose(trans):
     return [trans.index(idx) for idx in range(len(trans))]
 
 
-def reverses_transpose(trans1, trans2, dim=None):
-    """Checks if one transpose reverses another. If a dim is provided then
-    look if the transpose sequence produces an equivalent dim to cope with 1s in
-    dimensions."""
-    if trans1 is None or trans2 is None:
-        return False
-    if dim and dim.layout_shape == dim.calc_transpose(trans1).calc_transpose(trans2).layout_shape:
-        return True
-    for idx, val in enumerate(trans1):
-        if trans2[val] != idx:
-            return False
-    return True
-
-
 def identity_transpose(trans):
     if trans is None:
         return False
@@ -46,6 +32,35 @@ def apply_transpose(elems, trans):
     return [elems[i] for i in trans]
 
 
+def strip_ones(shape):
+    return tuple(dim for dim in shape if dim != 1)
+
+
+def reverses_transpose_up(trans1, trans2, dim=None):
+    """trans1->trans2->dim. Returns a tuple (cancels, shape_before or None).
+    1) without dim: do the transposes cancel
+    2) with dim: do the transposes cancel considering layout shape (i.e. without 1s in shape)"""
+    if dim is not None and not isinstance(dim, tuple):
+        dim = tuple(dim.shape)
+    if trans1 is None or trans2 is None:
+        return False, None
+    if identity_transpose(apply_transpose(trans1, trans2)):
+        return True, None
+    if dim is not None:
+        # apply dim -> reverse t2 -> reverse t1
+        # strip 1s and see if it is the same
+        layout_shape_after = strip_ones(dim)
+        shape_before = apply_transpose(
+                apply_transpose(dim, reverse_transpose(trans2)),
+                reverse_transpose(trans1))
+        return strip_ones(shape_before) == layout_shape_after, shape_before
+    return False, None
+
+
+def indexes_of(trans1, trans2):
+    return [trans1.index(i) for i in trans2]
+
+
 def transpose_does_nothing(transpose, shape):
     if transpose is None:
         return False
@@ -57,10 +72,6 @@ def reduce_mask(mask):
     return reduce_mask(mask) == tmask
 
 
-def strip_ones(shape):
-    return tuple(dim for dim in shape if dim != 1)
-
-
 def reshape_is_transpose(old_shape, new_shape):
     # TODO - check the order of the non 1 dimensions
     if len(old_shape) != len(new_shape):
diff --git a/tools/nntool/graph/manipulations/extract.py b/tools/nntool/graph/manipulations/extract.py
index 24bdec766..ffb07dc93 100644
--- a/tools/nntool/graph/manipulations/extract.py
+++ b/tools/nntool/graph/manipulations/extract.py
@@ -51,7 +51,6 @@ def extract_node(G: NNGraph, keep_node: Parameters):
         if node not in keep_nodes and node.name in G:
             LOG.info(f'remove {node.name}')
             G.remove(node)
-    G.reset_inout_counts()
     for edge in in_edges:
         input_node = G.add_input(edge.from_node.out_dims[edge.from_idx])
         G.add_edge(NNEdge(input_node, keep_node, to_idx=edge.to_idx))
diff --git a/tools/nntool/graph/manipulations/liveness.py b/tools/nntool/graph/manipulations/liveness.py
index 66e8a1aae..0c3959d83 100644
--- a/tools/nntool/graph/manipulations/liveness.py
+++ b/tools/nntool/graph/manipulations/liveness.py
@@ -15,6 +15,8 @@
 
 from typing import Mapping, Sequence
 
+from graph.types.input_output import InputBaseParameters, InputParameters, OutputParameters
+
 def calculate_liveness(G, steps: Sequence[Mapping]) -> Mapping[str, Mapping]:
     liveness = {}
     for i, step in enumerate(steps):
@@ -23,7 +25,7 @@ def calculate_liveness(G, steps: Sequence[Mapping]) -> Mapping[str, Mapping]:
         step['start'] = []
         step['end'] = []
         # input nodes create tensors
-        if G.is_input(node):
+        if isinstance(node, InputBaseParameters):
             edges = G.out_edges(node.name)
             if edges:
                 assert all(edge.from_idx == 0 for edge in edges), "inputs should create a single tensor"
@@ -40,7 +42,7 @@ def calculate_liveness(G, steps: Sequence[Mapping]) -> Mapping[str, Mapping]:
                 assert live is not None, "Inputs to node must have already been created"
                 if live['end'] < i:
                     live['end'] = i
-                if G.is_output(node):
+                if isinstance(node, OutputParameters):
                     live['is_output'] = True
             # check what we create
             for edge in G.out_edges(node.name):
diff --git a/tools/nntool/graph/manipulations/set_aliases.py b/tools/nntool/graph/manipulations/set_aliases.py
index dd7ff1614..94bbf8192 100644
--- a/tools/nntool/graph/manipulations/set_aliases.py
+++ b/tools/nntool/graph/manipulations/set_aliases.py
@@ -34,13 +34,7 @@ def walk_up(G, edge, concat_node):
               edge.to_node.name, edge.to_idx)
     edge.params.is_alias = True
     node = edge.from_node
-    if isinstance(node, ReshapeParameters):
-        # since it is a reshape it can only have one input
-        return walk_up(G, G.in_edges(node.name)[0], concat_node)
-    if isinstance(node, TransposeParameters):
-        if not node.does_nothing():
-            return False
-        # since it is a reshape it can only have one input
+    if node.no_model_code:
         return walk_up(G, G.in_edges(node.name)[0], concat_node)
     if isinstance(node, SplitParameters):
         LOG.warning("split node %s is directly connected to concat node %s",
@@ -61,12 +55,7 @@ def walk_down(G, edge, split_node):
     edge.params.is_alias = True
     node = edge.to_node
     errors = False
-    if isinstance(node, ReshapeParameters):
-        for edge in G.out_edges(node.name):
-            errors = errors or walk_down(G, edge, split_node)
-    elif isinstance(node, TransposeParameters):
-        if not node.does_nothing():
-            return errors
+    if node.no_model_code:
         for edge in G.out_edges(node.name):
             errors = errors or walk_down(G, edge, split_node)
     elif isinstance(node, ConcatParameters):
diff --git a/tools/nntool/graph/matches/fusions.py b/tools/nntool/graph/matches/fusions.py
new file mode 100644
index 000000000..544a30ad1
--- /dev/null
+++ b/tools/nntool/graph/matches/fusions.py
@@ -0,0 +1,49 @@
+# Copyright (C) 2022  GreenWaves Technologies, SAS
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Affero General Public License for more details.
+
+# You should have received a copy of the GNU Affero General Public License
+# along with this program.  If not, see <https://www.gnu.org/licenses/>.
+
+from graph.matches.matches import get_matches
+from graph.types.constant_input import ConstantInputParameters
+from quantization.verify_quantization import verify_quantization
+from quantization.quantizer.new_quantizer import NewQuantizer
+
+
+def fusions(nngraph, *match_names, no_postprocess: bool = False):
+    state = ConstantInputParameters.save_compression_state(nngraph)  # restored in the finally block below
+    try:
+        match_group = get_matches(*match_names)
+        while match_group:
+            match_group.match(nngraph)
+            nngraph.add_dimensions()
+            if no_postprocess:
+                break  # skips the qtune / adjust / run-again handling for this pass
+            if match_group.run_qtune:
+                quantizer = NewQuantizer(nngraph)
+                quantizer.quantize()
+            if match_group.run_adjust:
+                nngraph.adjust_order()
+            if match_group.run_again:
+                match_group = get_matches(*match_group.run_again)  # loop again with the matches requested by this pass
+            else:
+                match_group = None
+
+        if nngraph.quantization and verify_quantization(nngraph):  # a truthy result is treated as a list of problems
+            quantizer = NewQuantizer(nngraph)
+            quantizer.quantize()
+            problems = verify_quantization(nngraph)  # re-check after requantizing
+            if problems:
+                problems = "\n".join(problems)
+                raise ValueError(f'quantization issue after fusions\n{problems}\n')
+    finally:
+        ConstantInputParameters.restore_compression_state(nngraph, state)
diff --git a/tools/nntool/graph/matches/match_utils.py b/tools/nntool/graph/matches/match_utils.py
index bdffdc756..6ebb02ee4 100644
--- a/tools/nntool/graph/matches/match_utils.py
+++ b/tools/nntool/graph/matches/match_utils.py
@@ -30,7 +30,7 @@ def search_down(G, edge, for_node_classes, can_pass=None, can_pass_fn=None, edge
         multi_on_target (bool, optional): Allow target to have multiple edges. Defaults to True.
 
     Returns:
-        Optional[Sequence[Edge]]: Edges found or None  
+        Optional[Sequence[Edge]]: Edges found or None
     """
     if edge_list is None:
         edge_list = []
@@ -71,18 +71,19 @@ def search_up(G, edge, for_node_classes, can_pass=None, can_pass_fn=None, edge_l
 
     Args:
         G (NNGraph): Graph
-        edge (Edge): Edge to look down
+        edge (Edge): Edge to look up
         for_node_classes (Union[Parameters, Tuple[Parameters]]): Node class or classes to look for
         can_pass (Union[Parameters, Tuple[Parameters]], optional): Will pass through this node class or classes.
                                                                    Defaults to None.
         can_pass_fn (Callable, optional): function with graph and node as parameters. Should return True if
                                           search can pass this node. Defaults to None.
-        follow_multi (str, optional): Follow multi edge outputs. Defaults to empty string which means don't follow can be same or any.
+        follow_multi (str, optional): Follow multi edge outputs. Defaults to empty string which means don't
+                follow can be same or any.
         follow_first (bool, optional): Only follow first input. Defaults to True.
         multi_on_target (bool, optional): Allow target to have multiple edges. Defaults to True.
 
     Returns:
-        Optional[Sequence[Edge]]: Edges found or None  
+        Optional[Sequence[Edge]]: Edges found or None
     """
     if edge_list is None:
         edge_list = []
diff --git a/tools/nntool/graph/matches/matcher.py b/tools/nntool/graph/matches/matcher.py
index a2914b99a..809d595a6 100644
--- a/tools/nntool/graph/matches/matcher.py
+++ b/tools/nntool/graph/matches/matcher.py
@@ -15,11 +15,10 @@
 
 import logging
 from abc import ABC, abstractmethod
-from typing import Generator, Sequence
+from typing import Sequence
+from utils.graph import GraphView, MatchNode
 from utils.node_id import NodeId
 
-from utils.graph import GraphView, MatchNode, Node
-
 LOG = logging.getLogger("nntool." + __name__)
 
 
@@ -55,6 +54,18 @@ def __init__(self, identity: str = None):
     def name(self):
         return self.NAME
 
+    @property
+    def run_again(self):
+        return self.RUN_AGAIN_ON_MATCH
+
+    @property
+    def run_qtune(self):
+        return self.RUN_QTUNE_ON_MATCH
+
+    @property
+    def run_adjust(self):
+        return self.RUN_ADJUST_ON_MATCH
+
     @staticmethod
     def remove_quantization(G, node):
         if G.quantization:
@@ -83,7 +94,7 @@ def description(val):
 
     @staticmethod
     def needs_valid_dimension(val):
-        return Matcher.property_register("DESCRIPTION", val)
+        return Matcher.property_register("NEEDS_VALID_DIMENSION", val)
 
     @staticmethod
     def modifies_dimensions(val):
@@ -102,12 +113,14 @@ def run_again_on_match(*args):
         return Matcher.property_register("RUN_AGAIN_ON_MATCH", args)
 
     @staticmethod
-    def run_qtune_on_match(val):
-        return Matcher.property_register("RUN_QTUNE_ON_MATCH", val)
+    def run_qtune_on_match(cls):
+        setattr(cls, 'RUN_QTUNE_ON_MATCH', True)
+        return cls
 
     @staticmethod
-    def run_adjust_on_match(val):
-        return Matcher.property_register("RUN_ADJUST_ON_MATCH", val)
+    def run_adjust_on_match(cls):
+        setattr(cls, 'RUN_ADJUST_ON_MATCH', True)
+        return cls
 
     @staticmethod
     def groups(*args):
@@ -134,59 +147,6 @@ def deco(cls):
 
 groups = Matcher.groups
 
-class DontReplaceError(Exception):
-    pass
-
-
-class DefaultMatcher(Matcher):
-    @abstractmethod
-    def match_function(self, G: GraphView) -> Generator[GraphView, None, None]:
-        pass
-
-    @abstractmethod
-    def replace_function(self, G: GraphView, subgraph: GraphView) -> Node:
-        pass
-
-    def _match(self, G: GraphView, set_identity: bool = True, **kwargs) -> bool:
-        replaced = True
-        has_modified_graph = False
-        while replaced:
-            replaced = False
-            for subgraph in self.match_function(G):
-                # TODO - Save in and out edges here since the replace function may modify the
-                # subgraph
-                in_edges = [in_edge for input_node in subgraph.inputs()
-                            for in_edge in G.in_edges(input_node.name)]
-                out_edges = [out_edge for output_node in subgraph.outputs()
-                             for out_edge in G.out_edges(output_node.name)]
-                try:
-                    replacement, edge_in_mapping, edge_out_mapping = self.replace_function(
-                        G, subgraph)
-                    if replacement is None:
-                        G.remove_fragment(subgraph)
-                        has_modified_graph = True
-                    elif isinstance(replacement, Node):
-                        # use saved  in and out edges
-                        G.replace_fragment(subgraph,
-                                           replacement,
-                                           frag_in_edges=in_edges,
-                                           frag_out_edges=out_edges,
-                                           edge_in_mapping=edge_in_mapping,
-                                           edge_out_mapping=edge_out_mapping)
-                        has_modified_graph = True
-                    else:
-                        raise TypeError(
-                            "unexcepted return value from replace_function")
-                    replaced = True
-                    break
-                except DontReplaceError:
-                    pass
-
-        if set_identity:
-            self.set_identity(G)
-
-        return has_modified_graph
-
 
 # This can be used to define groups of matches to be selected
 # from the command line
@@ -195,52 +155,56 @@ class MatchGroup(Matcher):
 
     def __init__(self, *args: Sequence[Matcher], identity: str = None):
         super().__init__(identity)
-        self.matches = list(args)
+        self._matches = {match.name: match for match in args}
+        self._matches_pending = []
+        self._adjust_pending = False
+        self._qtune_pending = False
+
+    @property
+    def run_again(self):
+        return self._matches_pending
+
+    @property
+    def run_qtune(self):
+        return self._qtune_pending
+
+    @property
+    def run_adjust(self):
+        return self._adjust_pending
 
     def add_match(self, match: Matcher):
-        self.matches.append(match)
+        self._matches[match.name] = match  # _matches is a dict keyed by match name, so append() would fail
 
     def _match(self, G: GraphView, set_identity: bool = True, **kwargs):
         # Note: assumption is that dimensions are valid when a match is called
         found_match = True
         dimensions_set = True
+        self._matches_pending = []
+        self._adjust_pending = False
+        self._qtune_pending = False
         while found_match:
             found_match = False
-            for match_instance in self.matches:
-                LOG.debug("fusions - start %s", match_instance.name)
-                if match_instance.NEEDS_VALID_DIMENSION and not dimensions_set:
+            matches = list(self._matches.values())
+            while matches:
+                match = matches.pop(0)
+                LOG.debug("fusions - start %s", match.name)
+                if match.NEEDS_VALID_DIMENSION and not dimensions_set:
                     G.add_dimensions(quiet=True)
                     dimensions_set = True
-                has_modified_graph = match_instance.match(
+                has_modified_graph = match.match(
                     G, set_identity=False, group_identity=self._identity)
                 if has_modified_graph:
-                    LOG.info("++ fusion %s modified graph", match_instance.name)
+                    LOG.info("++ fusion %s modified graph", match.name)
                     found_match = True
                     G.add_dimensions(quiet=True)
+                    for required_match in match.run_again:
+                        if required_match not in self._matches_pending:  # queue each follow-up only once
+                            self._matches_pending.append(required_match)
+                    self._adjust_pending = self._adjust_pending or match.run_adjust
+                    if G.quantization:
+                        self._qtune_pending = self._qtune_pending or match.run_qtune
+
                 if dimensions_set and has_modified_graph:
                     dimensions_set = False
         if set_identity:
             self.set_identity(G)
-
-
-def find_forward(G: GraphView, edge, find_node_classes, skip_node_classes=None, find_skip=None):
-    if find_skip is None:
-        find_skip = [find_node_classes, skip_node_classes]
-        for idx, elem in enumerate(find_skip):
-            if elem is not None and not isinstance(elem, tuple):
-                if isinstance(elem, list):
-                    find_skip[idx] = tuple(elem)
-                else:
-                    find_skip[idx] = tuple([elem])
-    if isinstance(edge.to_node, find_skip[0]):
-        return [[edge]]
-    if skip_node_classes and isinstance(edge.to_node, find_skip[0]):
-        res = []
-        for out_edge in G.out_edges(edge.to_node.name):
-            edge_lists = find_forward(G, out_edge, find_node_classes,
-                                      find_skip=find_skip)
-            if not edge_lists:
-                continue
-            res.extend([[edge] + edge_list for edge_list in edge_lists])
-        return res
-    return []
diff --git a/tools/nntool/graph/matches/matchers/concat_slice.py b/tools/nntool/graph/matches/matchers/concat_slice.py
new file mode 100644
index 000000000..ef6fb82b6
--- /dev/null
+++ b/tools/nntool/graph/matches/matchers/concat_slice.py
@@ -0,0 +1,186 @@
+# Copyright (C) 2020  GreenWaves Technologies, SAS
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Affero General Public License for more details.
+
+# You should have received a copy of the GNU Affero General Public License
+# along with this program.  If not, see <https://www.gnu.org/licenses/>.
+
+import logging
+from functools import reduce
+
+from graph.matches.match_utils import search_up
+from graph.types import ConcatParameters, NNEdge
+from graph.types.others import (NoOPParameters, ReshapeParameters,
+                                StridedSliceParameters)
+from utils.compatible_transposes import find_combination
+from utils.graph import GraphView
+
+from ..matcher import (Matcher, description, groups, match_name,
+                       needs_valid_dimension, run_qtune_on_match)
+
+LOG = logging.getLogger("nntool." + __name__)
+
+def info(msg):
+    """Log ``msg`` at INFO level on the module logger ``LOG``."""
+    LOG.info(msg)
+
+def validate_slice(node: StridedSliceParameters):
+    """Return ``(axis, (start, length))`` for a stride-1 slice of one axis, else ``[None, None]``."""
+    if not all(sl[2] == 1 for sl in node.act_slice):
+        return [None] * 2
+    if len(node.slices_axes) != 1:
+        return [None] * 2
+    axis = node.slices_axes[0]
+    start, stop = node.act_slice[axis][:2]
+    return axis, (start, stop - start)
+
+def up_from_slice(G, edge, axis, shape, remove_nodes=None, removing=True, reshape=None):
+    """Follow ``edge`` upwards to a concat whose input is exactly the sliced region.
+
+    ``shape`` is the (start, length) pair of the slice on ``axis``. NoOp nodes are stepped
+    over; a reshape is stepped over only if ``find_combination`` yields a mapping holding
+    ``(axis,)``, and the first reshape met is remembered. Nodes are collected for removal
+    until a crossed node does not have exactly one out edge.
+
+    Returns ``(remove_nodes, edge leaving the concat, concat input index, reshape)`` or a
+    list of four ``None`` when no matching concat input is found.
+    """
+    if remove_nodes is None:
+        remove_nodes = []
+    while True:
+        if removing:
+            remove_nodes.append(edge.to_node)
+        node = edge.from_node
+        if isinstance(node, ConcatParameters):
+            break
+        if isinstance(node, ReshapeParameters):
+            if reshape is None:
+                reshape = node
+            candidates = find_combination(node.shape, node.old_shape)
+            chosen = next((comb for comb in candidates if (axis,) in comb), None)
+            if chosen is None:
+                return [None] * 4
+            axis = chosen.index((axis,))
+        elif not isinstance(node, NoOPParameters):
+            return [None] * 4
+        # step to the producer; removal stops once a node with other consumers is crossed
+        edge, removing = G.in_edges(node)[0], removing and len(G.out_edges(node)) == 1
+    if axis != node.axis or shape[0] not in node.offsets:
+        return [None] * 4
+    index = node.offsets.index(shape[0])
+    if shape[1] != node.in_dims[index].shape[node.axis]:
+        return [None] * 4
+    return (remove_nodes, edge, index, reshape)
+
+@groups('*')
+@run_qtune_on_match
+@needs_valid_dimension(True)
+@match_name("concat_slice")
+@description("removes slices after concats that match an input of the concat")
+class ConcatSliceMatch(Matcher):
+    """Rewires slices that select exactly one input of an upstream concat to that input."""
+
+    def _match(self, G: GraphView, set_identity: bool = True, **kwargs) -> bool:
+        """Eliminate every matching slice in ``G``; returns True if the graph was modified."""
+        concat_slice_edges = {}
+        for slice_node in G.nodes(node_classes=StridedSliceParameters):
+            # validate that slice is positive unit stride on a single axis
+            axis, shape = validate_slice(slice_node)
+            if axis is None:
+                continue
+            # search up for a concat
+            remove_nodes, concat_edge, concat_in_idx, reshape = up_from_slice(
+                G, G.in_edges(slice_node)[0], axis, shape)
+            if not remove_nodes:
+                continue
+            concat_slice_edges.setdefault(concat_edge.from_node, []).append(
+                (slice_node, remove_nodes, concat_in_idx, reshape))
+        if not concat_slice_edges:
+            return False
+
+        for concat, slices in concat_slice_edges.items():
+            for (slice_node, remove_nodes, concat_in_idx, reshape) in slices:
+                self.eliminate_slice(G, concat, slice_node, remove_nodes, concat_in_idx, reshape)
+            if not G.out_edges(concat):
+                concat_in_edges = G.in_edges(concat)
+                info(f"removing concat {concat.name}")
+                if concat_in_edges:
+                    self.resolve_unused_concat_in_edges(G, concat, concat_in_edges)
+                if concat.name in G:
+                    G.remove(concat)
+
+        if set_identity:
+            self.set_identity(G)
+
+        # every recorded slice was rewired above, so G changed even if its concat survives
+        return True
+
+    def search_delete_nodes_up(self, G, edge):
+        """List producers above ``edge`` (first in edge only) while each has at most one out edge."""
+        node = edge.from_node
+        if len(G.out_edges(node)) > 1:
+            return []
+        in_edges = G.in_edges(node)
+        if not in_edges:
+            # nothing feeds this node, so the chain ends here (avoids indexing an empty list)
+            return [node]
+        return [node] + self.search_delete_nodes_up(G, in_edges[0])
+
+    @staticmethod
+    def edge_list_str(G, edges):
+        """Join the source of the last edge and the reversed to-node names of ``edges`` with '->'."""
+        edge_list = list(reversed([edge.to_node.name for edge in edges]))
+        if len(G.out_edges(edges[-1].from_node)) == 1:
+            edge_list.insert(0, f'{edges[-1].from_node.name} (removed)')
+        else:
+            edge_list.insert(0, edges[-1].from_node.name)
+        return "->".join(edge_list)
+
+    def resolve_unused_concat_in_edges(self, G, concat, concat_in_edges):
+        """Remove from ``G`` the producer chains found above each edge in ``concat_in_edges``."""
+        for edge in concat_in_edges:
+            nodes = self.search_delete_nodes_up(G, edge)
+            if nodes:
+                info(f"removing unused nodes {' '.join(node.name for node in nodes)}")
+                for node in nodes:
+                    if node.name in G:
+                        G.remove(node)
+
+    def eliminate_slice(self, G, concat, slice_node, remove_nodes, concat_in_idx, reshape_in):
+        """Remove ``slice_node`` and feed its consumers from the producer of concat input ``concat_in_idx``."""
+        concat_in_edge = G.indexed_in_edges(concat)[concat_in_idx]
+        node_idx = (concat_in_edge.from_node, concat_in_edge.from_idx)
+        info(f'removing slice {slice_node.name} and connecting {concat_in_edge.from_node.name}:{concat_in_edge.from_idx} to its edges')
+        if reshape_in:
+            reshape = ReshapeParameters(
+                G.unique_name(f'{slice_node.name}_reshape'),
+                old_shape=concat_in_edge.from_node.out_dims[concat_in_edge.from_idx].shape,
+                shape=slice_node.out_shape)
+        elif slice_node.changes_shape:
+            reshape = ReshapeParameters(
+                G.unique_name(f'{slice_node.name}_reshape'),
+                old_shape=slice_node.post_slice_shape,
+                shape=slice_node.out_shape)
+        else:
+            reshape = None
+        if reshape:
+            G.add_edge(NNEdge(from_node=node_idx[0], from_idx=node_idx[1], to_node=reshape))
+            node_idx = (reshape, 0)
+
+        slice_out_edges = G.out_edges(slice_node)
+        for rnode in remove_nodes:
+            if rnode.name in G:
+                G.remove(rnode)
+        if slice_node.name in G:
+            G.remove(slice_node)
+        for edge in slice_out_edges:
+            G.add_edge(NNEdge(from_node=node_idx[0], from_idx=node_idx[1],
+                              to_node=edge.to_node, to_idx=edge.to_idx))
diff --git a/tools/nntool/graph/matches/matchers/concat_split.py b/tools/nntool/graph/matches/matchers/concat_split.py
index 7adb7f978..e6a362b0c 100644
--- a/tools/nntool/graph/matches/matchers/concat_split.py
+++ b/tools/nntool/graph/matches/matchers/concat_split.py
@@ -15,15 +15,10 @@
 
 import logging
 
-from graph.dim import Dim
 from graph.types import ConcatParameters, NNEdge, SplitParameters
-from graph.types.others import (CopyParameters, NoOPParameters,
-                                ReshapeParameters, TransposeParameters)
 from utils.graph import GraphView
-from utils.node_id import NodeId
 
-from ..match_utils import search_down
-from ..matcher import Matcher, description, groups, match_name, run_before
+from ..matcher import Matcher, description, groups, match_name
 
 LOG = logging.getLogger("nntool." + __name__)
 
@@ -35,7 +30,7 @@ class ConcatSplitMatch(Matcher):
 
     def _match(self, G: GraphView, set_identity: bool = True, **kwargs) -> bool:
         has_modified_graph = False
-        for split_node in set([node for node in G.nodes() if isinstance(node, SplitParameters)]):
+        for split_node in G.nodes(node_classes=SplitParameters):
             in_edges = G.in_edges(split_node.name)
             if len(in_edges) > 1:
                 continue
@@ -70,120 +65,3 @@ def _match(self, G: GraphView, set_identity: bool = True, **kwargs) -> bool:
             self.set_identity(G)
 
         return has_modified_graph
-
-
-def reduce_slices(slices, shapes):
-    res_slice = []
-    res_shape = []
-    for slice_axis, shape_axis in zip(zip(*slices), zip(*shapes)):
-        if slice_axis[0] == slice_axis[1]:
-            res_slice.append(slice_axis[0])
-            res_shape.append(shape_axis[0])
-        else:
-            res_slice.append(
-                (slice_axis[0][0],
-                 slice_axis[-1][1],
-                 slice_axis[0][2]))
-            res_shape.append(sum(shape_axis))
-    return res_slice, res_shape
-
-
-def remove_edges(G, edges):
-    if not edges:
-        return
-    edges = edges.copy()
-    while len(edges) > 1:
-        edge = edges.pop(0)
-        G.remove(edge.to_node)
-        if G.quantization:
-            nid = NodeId(edge.to_node)
-            if nid in G.quantization:
-                del G.quantization[nid]
-    try:
-        G.remove_edge(edges[0])  # @IgnoreException
-    except KeyError:
-        pass
-
-
-@groups('*')
-@match_name("split_concat")
-@run_before('remove_noops', 'remove_copies')
-@description("removes splits that go to concats where all the out edges of the split are in sequence in the concat")
-class SplitConcatMatch(Matcher):
-    def _match(self, G: GraphView, set_identity: bool = True, **kwargs) -> bool:
-        edge_groups = []
-        for node in G.nodes(node_classes=SplitParameters):
-            cur_group = None
-            for out_edge_bundle in G.indexed_out_edges(node):
-                if len(out_edge_bundle) == 1:
-                    out_edge = out_edge_bundle[0]
-                    concat_node_edges = search_down(
-                        G, out_edge, ConcatParameters,
-                        can_pass=(CopyParameters, NoOPParameters,
-                                  ReshapeParameters),
-                        can_pass_fn=lambda _, node: isinstance(node, TransposeParameters) and node.does_nothing)
-                    if concat_node_edges:
-                        if cur_group:
-                            this_concat_edge = concat_node_edges[-1]
-                            last_concat_edge = cur_group[-1][-1]
-                            if this_concat_edge.to_node == last_concat_edge.to_node and this_concat_edge.to_idx == last_concat_edge.to_idx + 1:
-                                cur_group.append(concat_node_edges)
-                                continue
-                            if len(cur_group) > 1:
-                                edge_groups.append(cur_group)
-                        cur_group = [concat_node_edges]
-                        continue
-                if cur_group:
-                    if len(cur_group) > 1:
-                        edge_groups.append(cur_group)
-                    cur_group = None
-            if cur_group:
-                if len(cur_group) > 1:
-                    edge_groups.append(cur_group)
-                cur_group = None
-        # we leave the splits and concats after this since they will be cleared up by remove_noops
-        for edge_group in edge_groups:
-            split_node = edge_group[0][0].from_node
-            concat_node = edge_group[0][-1].to_node
-            from_idx = edge_group[0][0].from_idx
-            to_idx = edge_group[-1][0].from_idx
-            from_concat_idx = edge_group[0][-1].to_idx
-            to_concat_idx = edge_group[1][-1].to_idx
-            LOG.info(
-                f"combining outputs {from_idx}:{to_idx} on split node {split_node.name} followed by concat {concat_node.name}")
-            # combine slices and shapes on edges in group
-            new_slice, new_shape = reduce_slices(
-                split_node.act_slices[from_idx:to_idx+1],
-                split_node.out_shapes[from_idx:to_idx+1]
-            )
-            new_concat_shape = Dim.combine(
-                [concat_node.in_dims[idx]
-                    for idx in range(from_concat_idx, to_concat_idx+1)],
-                concat_node.axis)
-            split_node.act_slices = split_node.act_slices[:from_idx] + [
-                new_slice] + split_node.act_slices[to_idx+1:]
-            # the slice may need to reshape since we will remove everything in between
-            split_node.out_shapes = split_node.out_shapes[:from_idx] + [
-                new_concat_shape.shape] + split_node.out_shapes[to_idx+1:]
-
-            # remove all edges and intermediate nodes on all edge groups
-            for edge_list in edge_group:
-                remove_edges(G, edge_list)
-            # add back a direct edge to the first idx
-            G.add_edge(NNEdge(from_node=split_node, from_idx=edge_group[0][0].from_idx, to_node=concat_node, to_idx=edge_group[0][-1].to_idx))
-            out_edge_bundles = G.indexed_out_edges(split_node)
-            # move edges beyond the edge group after the first index
-            for offset, edge_list in enumerate(out_edge_bundles[to_idx+1:]):
-                assert len(edge_list) == 1
-                edge = edge_list[0]
-                G.remove_edge(edge)
-                G.add_edge(NNEdge.clone(edge, from_idx=from_idx+1+offset))
-            # reindex the in edges in the concat
-            from_idx = edge_group[0][-1].to_idx
-            to_idx = edge_group[-1][-1].to_idx
-            in_edges = G.indexed_in_edges(concat_node)
-            for offset, in_edge in enumerate(in_edges[to_idx+1:]):
-                G.remove_edge(in_edge)
-                G.add_edge(NNEdge.clone(in_edge, to_idx=from_idx+1+offset))
-
-        return bool(edge_groups)
diff --git a/tools/nntool/graph/matches/matchers/copy_on_outputs.py b/tools/nntool/graph/matches/matchers/copy_on_outputs.py
index cbb64195b..3a4462842 100644
--- a/tools/nntool/graph/matches/matchers/copy_on_outputs.py
+++ b/tools/nntool/graph/matches/matchers/copy_on_outputs.py
@@ -15,9 +15,7 @@
 
 import logging
 
-from graph.matches.matchers.insert_copies import find_real_in_edge
-from graph.types import (CopyParameters, NNEdge, OutputParameters,
-                         ReshapeParameters, TransposeParameters)
+from graph.types import CopyParameters, NNEdge, OutputParameters
 from utils.graph import GraphView
 from utils.node_id import NodeId
 
@@ -29,7 +27,7 @@
 
 def search_down(G, edge):
     node = edge.to_node
-    if isinstance(node, ReshapeParameters) or (isinstance(node, TransposeParameters) and node.does_nothing):
+    if node.no_model_code:
         res = []
         for out_edge in G.out_edges(node):
             res.extend(search_down(G, out_edge))
@@ -49,7 +47,7 @@ def search_up(G, edge):
             if out_edge == edge:
                 continue
             res.extend(search_down(G, out_edge))
-    if isinstance(node, (OutputParameters, ReshapeParameters)) or (isinstance(node, TransposeParameters) and node.does_nothing):
+    if node.no_model_code:
         edge = G.in_edges(node)[0]
         res.extend(search_up(G, edge))
     return res
diff --git a/tools/nntool/graph/matches/matchers/copy_on_split_inputs.py b/tools/nntool/graph/matches/matchers/copy_on_split_inputs.py
deleted file mode 100644
index 4b0be9606..000000000
--- a/tools/nntool/graph/matches/matchers/copy_on_split_inputs.py
+++ /dev/null
@@ -1,61 +0,0 @@
-# Copyright (C) 2020  GreenWaves Technologies, SAS
-
-# This program is free software: you can redistribute it and/or modify
-# it under the terms of the GNU Affero General Public License as
-# published by the Free Software Foundation, either version 3 of the
-# License, or (at your option) any later version.
-
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-# GNU Affero General Public License for more details.
-
-# You should have received a copy of the GNU Affero General Public License
-# along with this program.  If not, see <https://www.gnu.org/licenses/>.
-
-from graph.matches.matchers.insert_copies import find_real_in_edge
-import logging
-
-from graph.types import InputParameters, ReshapeParameters, ConstantInputParameters
-from graph.types.others import ConcatParameters, CopyParameters, SplitParameters
-from utils.graph import GraphView
-
-from ..matcher import Matcher, groups, match_name, description, modifies_dimensions, run_after
-
-LOG = logging.getLogger("nntool." + __name__)
-
-
-def search_up_for_input(G, node, going_up=None):
-    if going_up is None or isinstance(node, ReshapeParameters):
-        return search_up_for_input(G, G.in_edges(node.name)[0].from_node, going_up=True)
-    if isinstance(node, (InputParameters, ConstantInputParameters)):
-        return node
-    return None
-
-@match_name("copy_on_split_inputs")
-@description("Insert copy on inputs that could be in a tensor stack")
-@modifies_dimensions(True)
-@groups('*')
-class CopyOnSplitInputs(Matcher):
-
-    def _match(self, G: GraphView, set_identity: bool = True, **kwargs):
-
-        candidates = [node for node in G.nodes(node_classes=(SplitParameters, ConcatParameters))]
-        need_a_copy_edges = []
-        for node in candidates:
-            for idx, edge in enumerate(G.indexed_in_edges(node.name)):
-                real_from_node, _ = find_real_in_edge(G, edge)
-                if isinstance(real_from_node, (InputParameters, ConstantInputParameters)):
-                    need_a_copy_edges.append((edge, idx))
-        has_modified_graph = False
-        for edge in need_a_copy_edges:
-            LOG.info(
-                "Insert copy on split input %s", edge[0].to_node.name)
-            has_modified_graph = True
-            cnode = CopyParameters(G.unique_name(f'{edge[0].to_node.name}_copy'))
-            G.insert_node_at_edge(cnode, edge[0])
-            if G.quantization:
-                G.quantization.copy_qrec(edge[0].to_node, 'in', 0, cnode)
-        if set_identity:
-            self.set_identity(G)
-        return has_modified_graph
diff --git a/tools/nntool/graph/matches/matchers/duplicate_constants.py b/tools/nntool/graph/matches/matchers/duplicate_constants.py
index 6aebf13cb..bd4a103c3 100644
--- a/tools/nntool/graph/matches/matchers/duplicate_constants.py
+++ b/tools/nntool/graph/matches/matchers/duplicate_constants.py
@@ -22,7 +22,7 @@
 
 LOG = logging.getLogger("nntool." + __name__)
 
-@match_name("match_duplicate_constants")
+@match_name("duplicate_constants")
 @description("""Find constants that are linked to more than one node and duplicate them""")
 @run_before('*')
 @groups('symmetric', 'scaled')
diff --git a/tools/nntool/graph/matches/matchers/duplicate_operations.py b/tools/nntool/graph/matches/matchers/duplicate_operations.py
index 4ac00b591..6072c389f 100644
--- a/tools/nntool/graph/matches/matchers/duplicate_operations.py
+++ b/tools/nntool/graph/matches/matchers/duplicate_operations.py
@@ -12,32 +12,37 @@
 # You should have received a copy of the GNU Affero General Public License
 # along with this program.  If not, see <https://www.gnu.org/licenses/>.
 
-from graph.dim import Dim
 import logging
 from copy import deepcopy
 from functools import partial
 from itertools import groupby
 
 import numpy as np
+from graph.dim import Dim
 from graph.types import SplitParameters
 from graph.types.base import ComparableParameters, NNEdge
 from utils.graph import GraphView
 
-from ..matcher import Matcher, description, groups, match_name, run_before
+from ..matcher import Matcher, description, groups, match_name, run_before, run_qtune_on_match
 
 LOG = logging.getLogger("nntool." + __name__)
 
-@match_name("match_duplicate_operations")
+
+@match_name("duplicate_operations")
 @description("""Removes operations that are duplicates on the same edge""")
 @run_before("*")
 @groups('symmetric', 'scaled')
+@run_qtune_on_match
 class MatchDuplicateOperations(Matcher):
+    def __init__(self, identity: str = None, limit_to_dest_classes=None):
+        super().__init__(identity)
+        self._limit_to_dest_classes = limit_to_dest_classes  # when set, _match passes it to isinstance() to filter edge destinations
 
     def _match(self, G: GraphView, set_identity: bool = True, **kwargs):
-        if G.quantization:
-            LOG.warning(
-                'match_duplicate_operations does not handle quantized graphs')
-            return False
+        # if G.quantization:
+        #     LOG.warning(
+        #         'duplicate_operations does not handle quantized graphs')
+        #     return False
 
         def same_source_edge_fn(x):
             return f"{x.from_node.__hash__()}##{x.from_idx}"
@@ -53,6 +58,11 @@ def same_dest_edge(x):
             # all have the same origin
             same_source_edges = [elem for elem in same_source_edges
                                  if len(elem) > 1]
+            if self._limit_to_dest_classes:
+                same_source_edges = list(
+                    filter(
+                        lambda edges: all(isinstance(edge.to_node, self._limit_to_dest_classes) for edge in edges),
+                        same_source_edges))
             same_dest_edges = []
             same_dest_group_edges = []
 
@@ -63,7 +73,7 @@ def same_dest_edge(x):
                     first = same_source_edge.pop(0)
 
                     others = list(filter(partial(lambda x, y: x.to_node != y.to_node and y.to_node.is_same_operation_as(G,
-                        x.to_node), first), same_source_edge))
+                                                                                                                        x.to_node), first), same_source_edge))
                     if others:
                         same_dest_edges.append(tuple([first] + others))
                         for other in others:
@@ -83,10 +93,11 @@ def same_dest_edge(x):
             while same_dest_edges:
                 edge_set = same_dest_edges.pop(0)
                 keep_node = edge_set[0].to_node
-                other_edge_sets = [edges for edges in same_dest_edges if any(edge.to_node == keep_node for edge in edges)]
+                other_edge_sets = [edges for edges in same_dest_edges if any(
+                    edge.to_node == keep_node for edge in edges)]
                 for other_edge_set in other_edge_sets:
                     same_dest_edges.remove(other_edge_set)
-                
+
                 nodes_to_delete = set()
                 for edge_set in [edge_set] + other_edge_sets:
                     for edge in edge_set:
@@ -95,13 +106,13 @@ def same_dest_edge(x):
                             continue
                         nodes_to_delete.add(other_node)
                         for out_edge in G.out_edges(other_node):
-                            G.add_edge(NNEdge(from_node=keep_node, to_node=out_edge.to_node, to_idx=out_edge.to_idx))
+                            G.add_edge(
+                                NNEdge(from_node=keep_node, to_node=out_edge.to_node, to_idx=out_edge.to_idx))
                 LOG.info(
                     f'removed duplicates {",".join(node.name for node in nodes_to_delete)} to {keep_node.name}')
                 for node in nodes_to_delete:
                     G.remove(node)
-                        
-            
+
             # # all are multiple edges that go to something comparable
 
             # for edge_set in same_dest_edges:
@@ -145,7 +156,8 @@ def same_dest_edge(x):
                 out_edges = G.out_edges(first_node.name)
                 for edge in out_edges:
                     G.remove_edge(edge)
-                    G.add_edge(NNEdge(from_node=split1, from_idx=out_num, to_node=edge.to_node, to_idx=edge.to_idx))
+                    G.add_edge(NNEdge(from_node=split1, from_idx=out_num,
+                                      to_node=edge.to_node, to_idx=edge.to_idx))
                 G.add_edge(NNEdge(from_node=first_node, to_node=split1))
                 # first split output goes to original output
                 for other in edge_set[1::]:
@@ -169,7 +181,8 @@ def same_dest_edge(x):
                     G.remove(weights_other)
                     G.remove(biases_other)
                     for edge in out_edges:
-                        G.add_edge(NNEdge(from_node=split1, from_idx=out_num, to_node=edge.to_node, to_idx=edge.to_idx))
+                        G.add_edge(NNEdge(from_node=split1, from_idx=out_num,
+                                          to_node=edge.to_node, to_idx=edge.to_idx))
                 LOG.info(
                     f'merged convolutions {",".join(dup_nodes)} into {first_node.name}')
             if not found_more:
diff --git a/tools/nntool/graph/matches/matchers/duplicate_operations_out.py b/tools/nntool/graph/matches/matchers/duplicate_operations_out.py
index 1700adb7c..98078efe8 100644
--- a/tools/nntool/graph/matches/matchers/duplicate_operations_out.py
+++ b/tools/nntool/graph/matches/matchers/duplicate_operations_out.py
@@ -23,7 +23,7 @@
 LOG = logging.getLogger("nntool." + __name__)
 
 
-@match_name("match_duplicate_operations_out")
+@match_name("duplicate_operations_out")
 @description("""Removes operations that are duplicates on the same out edge""")
 @run_before("*")
 @groups('*')
@@ -37,7 +37,8 @@ def explore(self, G, nodes, result=None):
         out_edges_bundle = [G.indexed_out_edges(node.name) for node in nodes]
         if any(len(out_edges) != 1 or len(out_edges[0]) != 1 for out_edges in out_edges_bundle):
             return result
-        if any(not isinstance(node, ComparableParameters) or not node.is_same_operation_as(G, nodes[0])
+        # node == nodes[0] added since node could be a multi input expression
+        if any(not isinstance(node, ComparableParameters) or not node.is_same_operation_as(G, nodes[0]) or node == nodes[0]
                for node in nodes[1::]):
             return result
         if not result:
diff --git a/tools/nntool/graph/matches/matchers/equalize_sym_mult_concats.py b/tools/nntool/graph/matches/matchers/equalize_sym_mult_concats.py
deleted file mode 100644
index a3aa280ac..000000000
--- a/tools/nntool/graph/matches/matchers/equalize_sym_mult_concats.py
+++ /dev/null
@@ -1,91 +0,0 @@
-# Copyright (C) 2020  GreenWaves Technologies, SAS
-
-# This program is free software: you can redistribute it and/or modify
-# it under the terms of the GNU Affero General Public License as
-# published by the Free Software Foundation, either version 3 of the
-# License, or (at your option) any later version.
-
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-# GNU Affero General Public License for more details.
-# You should have received a copy of the GNU Affero General Public License
-# along with this program.  If not, see <https://www.gnu.org/licenses/>.
-
-import logging
-
-from graph.types import (ConcatParameters, ReshapeParameters, SplitParameters,
-                         TransposeParameters)
-from graph.types.base import FilterParameters
-from utils.graph import Edge, GraphView
-from utils.node_id import NodeId
-
-from ..matcher import Matcher, description, groups, match_name, modifies_dimensions
-
-LOG = logging.getLogger("nntool." + __name__)
-
-CAN_PASS = (
-    ReshapeParameters,
-    TransposeParameters,
-    SplitParameters
-)
-
-# TODO - This match should be rewritten to use the quantizer
-
-
-def set_in_scale(qrec, index, scale):
-    in_q = qrec.in_qs[index]
-    assert qrec.ktype.startswith(
-        'scaled'), "not supported on other quantization types"
-    in_q.scale = scale
-
-
-def set_out_scale(node, qrec, index, scale):
-    out_q = qrec.out_qs[index]
-    assert qrec.ktype.startswith(
-        'scaled'), "not supported on other quantization types"
-    if isinstance(node, FilterParameters):
-        assert index == 0, "trying to set strange index on filter quantization record"
-        out_q.scale = scale
-        qrec.cache['mul_biases_q'].scale = qrec.in_qs[0].scale * \
-            qrec.in_qs[1].scale / out_q.scale
-    else:
-        out_q.scale = scale
-
-
-def propagate_qtype_up(G, qtype, edge: Edge):
-    LOG.info("propagating scale up from node %s to node %s",
-             edge.to_node.name, edge.from_node.name)
-    qrec_out = G.quantization[NodeId(edge.from_node)]
-    set_out_scale(edge.from_node, qrec_out, edge.from_idx, qtype.scale)
-    qrec_in = G.quantization[NodeId(edge.to_node)]
-    set_in_scale(qrec_in, edge.to_idx, qtype.scale)
-    if isinstance(edge.from_node, CAN_PASS):
-        for edge in G.in_edges(edge.from_node.name):
-            propagate_qtype_up(G, qtype, edge)
-
-@match_name("equalize_sm_concats")
-@description("""Equalize input quantization of concats with symmetric multiplicative quantization""")
-@groups('symmetric')
-@modifies_dimensions(False)
-class EqualizeSymmetricMultiplicativeQuantivedConcats(Matcher):
-
-    def _match(self, G: GraphView, set_identity: bool = True, **kwargs):
-        if not G.quantization:
-            return
-        concats = [node for node in G.nodes() if isinstance(node,
-                                                            ConcatParameters)]
-        qrecs = [G.quantization[NodeId(node)] for node in concats]
-        if not all(qrec.ktype.startswith('scaled') for qrec in qrecs):
-            return
-        for concat, qrec in zip(concats, qrecs):
-            out_q = qrec.out_qs[0]
-            for edge in G.in_edges(concat.name):
-                in_q = qrec.in_qs[edge.to_idx]
-                if in_q != out_q:
-                    propagate_qtype_up(G, out_q, edge)
-
-        if set_identity:
-            self.set_identity(G)
-
-        return False
diff --git a/tools/nntool/graph/matches/matchers/expression_matcher.py b/tools/nntool/graph/matches/matchers/expression_matcher.py
index cd1f8c5d8..4f25528f1 100644
--- a/tools/nntool/graph/matches/matchers/expression_matcher.py
+++ b/tools/nntool/graph/matches/matchers/expression_matcher.py
@@ -250,7 +250,7 @@ def find_connected_groups(G):
 
 @match_name("expression_matcher")
 @description("Groups piecewise expressions for kernel generation")
-@run_after('expand_transposes')
+@run_after('*')
 @needs_valid_dimension(True)
 class ExpressionMatcher(Matcher):
 
diff --git a/tools/nntool/graph/matches/matchers/filt_bigger_than_in.py b/tools/nntool/graph/matches/matchers/filter_bigger_than_input.py
similarity index 100%
rename from tools/nntool/graph/matches/matchers/filt_bigger_than_in.py
rename to tools/nntool/graph/matches/matchers/filter_bigger_than_input.py
diff --git a/tools/nntool/graph/matches/matchers/find_asymmetric_quantization.py b/tools/nntool/graph/matches/matchers/find_asymmetric_quantization.py
deleted file mode 100644
index 411a35991..000000000
--- a/tools/nntool/graph/matches/matchers/find_asymmetric_quantization.py
+++ /dev/null
@@ -1,203 +0,0 @@
-# Copyright (C) 2020  GreenWaves Technologies, SAS
-
-# This program is free software: you can redistribute it and/or modify
-# it under the terms of the GNU Affero General Public License as
-# published by the Free Software Foundation, either version 3 of the
-# License, or (at your option) any later version.
-
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-# GNU Affero General Public License for more details.
-
-# You should have received a copy of the GNU Affero General Public License
-# along with this program.  If not, see <https://www.gnu.org/licenses/>.
-# import logging
-
-# from graph.matches.matcher import Matcher
-# from graph.types import (ActivationParameters, ConcatParameters,
-#                          ConstantInputParameters, Conv2DParameters,
-#                          ConvFusionParameters, FcParameters,
-#                          GlobalPoolingParameters, InputParameters,
-#                          MatrixAddParameters, OutputParameters,
-#                          PoolingParameters, ReshapeParameters)
-
-# from utils.graph import GraphView
-# from utils.node_id import NodeId
-
-# LOG = logging.getLogger("nntool." + __name__)
-
-# CAN_CHANGE_OUTPUT = (
-#     InputParameters, ConstantInputParameters, Conv2DParameters,
-#     ConvFusionParameters, FcParameters, MatrixAddParameters
-# )
-
-# CAN_CHANGE_INPUT = (
-#     OutputParameters, Conv2DParameters, ConvFusionParameters,
-#     FcParameters, MatrixAddParameters
-# )
-
-# CAN_PROPAGATE_INPUT = (
-#     GlobalPoolingParameters, ReshapeParameters, ConcatParameters, ActivationParameters, PoolingParameters
-# )
-
-# ARE_MULTI_INPUT = (
-#     ConcatParameters
-# )
-
-# class FindAsymmetricQuantization(Matcher):
-#     NAME = "find_asymmetric_quantization"
-#     DESCRIPTION = """Find nodes that can have asymmetric quantization. Must run after padding has been fused."""
-
-#     def can_change_input(self, G, node, exclude=None):
-#         """Returns None or a list of tuples of (node, multi_input_node) where node is an
-#         input of multi_input_node. An empty list is a confirmed string. A list that contains
-#         multi input nodes needs to be reconciled. An empty list means that this node
-#         cannot be changed."""
-
-#         if isinstance(node, CAN_PROPAGATE_INPUT):
-#             if exclude and node in exclude:
-#                 return None
-#             nodes = []
-#             for succ in [succ
-#                          for succs in G.successors(node.name)
-#                          for succ in succs]:
-#                 can_change = self.can_change_input(G, succ, exclude=exclude)
-#                 if can_change is None:
-#                     return None
-#                 nodes += can_change
-#                 if isinstance(succ, ARE_MULTI_INPUT):
-#                     nodes.append((node, succ))
-#             return nodes
-#         if not isinstance(node, CAN_CHANGE_INPUT):
-#             return None
-#         if isinstance(node, ConvFusionParameters):
-#             filters = node.contained_filters()
-#             if len(filters) == 1 and not filters[0].padding.has_padding:
-#                 return []
-#             else:
-#                 return None
-#         if isinstance(node, Conv2DParameters):
-#             return None if node.padding.has_padding else []
-#         return []
-
-#     def can_change_output(self, node):
-#         return isinstance(node, CAN_CHANGE_OUTPUT)
-
-#     def validate_multi_input(self, G, input_dict):
-#         # {start_node: [(pred, mi_node), ..]}
-#         mi_nodes = {}
-#         # index all of the predecessor nodes by mi node
-#         for pr_node, mi_node in [match for matches in input_dict.values() for match in matches]:
-#             pr_node_set = mi_nodes.get(mi_node)
-#             if pr_node_set is None:
-#                 pr_node_set = set()
-#                 mi_nodes[mi_node] = pr_node_set
-#             pr_node_set.add(pr_node)
-#         bad_mi_nodes = []
-#         # check that all the predecessors were OK
-#         for mi_node, pr_nodes in mi_nodes.items():
-#             if not all(node in pr_nodes for node in G.predecessors(mi_node)):
-#                 bad_mi_nodes.append(mi_node)
-#         start_nodes = []
-#         # find the records that have bad nodes in them
-#         if bad_mi_nodes:
-#             for start_node, matches in input_dict.items():
-#                 if any(mi_node in bad_mi_nodes for _, mi_node in matches):
-#                     start_nodes.append(start_nodes)
-#         for start_node in start_nodes:
-#             del input_dict[start_node]
-#             matches = self.can_change_input(G, start_node, exclude=bad_mi_nodes)
-#             if matches is not None:
-#                 assert len(matches) == 0
-#                 input_dict[start_node] = []
-#         return input_dict
-
-#     def change_output_to_async(self, G, node, idx):
-#         if isinstance(node, ConvFusionParameters):
-#             changing = False
-#             for fnode in node.contained_nodes():
-#                 if changing:
-#                     nid = NodeId(node, fnode)
-#                     qrec = G.quantization[nid]
-#                     if isinstance(qrec.in_qs[0], SymmetricMultQTypeWrapper):
-#                         qrec.in_qs[0] = qrec.in_qs[0].wrapped
-#                     if isinstance(qrec.out_qs[0], SymmetricMultQTypeWrapper):
-#                         qrec.out_qs[0] = qrec.out_qs[0].wrapped
-#                 elif isinstance(fnode, (Conv2DParameters, FcParameters)):
-#                     changing = True
-#                     nid = NodeId(node, fnode)
-#                     qrec = G.quantization[nid]
-#                     if isinstance(qrec.out_qs[0], SymmetricMultQTypeWrapper):
-#                         qrec.out_qs[0] = qrec.out_qs[0].wrapped
-
-#         nid = NodeId(node)
-#         qrec = G.quantization[nid]
-#         if isinstance(qrec.out_qs[idx], SymmetricMultQTypeWrapper):
-#             qrec.out_qs[idx] = qrec.out_qs[idx].wrapped
-
-#     def change_input_to_async(self, G, node, idx):
-#         if isinstance(node, ConvFusionParameters):
-#             for fnode in node.contained_nodes():
-#                 nid = NodeId(node, fnode)
-#                 qrec = G.quantization[nid]
-#                 if isinstance(fnode, (Conv2DParameters, FcParameters)):
-#                     if isinstance(qrec.in_qs[0], SymmetricMultQTypeWrapper):
-#                         qrec.in_qs[0] = qrec.in_qs[0].wrapped
-#                         qrec.in_qs[2].link(qrec.in_qs[1], qrec.in_qs[0])
-#                     return
-#                 if isinstance(qrec.in_qs[0], SymmetricMultQTypeWrapper):
-#                     qrec.in_qs[0] = qrec.in_qs[0].wrapped
-#                 if isinstance(qrec.out_qs[0], SymmetricMultQTypeWrapper):
-#                     qrec.out_qs[0] = qrec.out_qs[0].wrapped
-
-#         nid = NodeId(node)
-#         qrec = G.quantization[nid]
-#         if isinstance(qrec.in_qs[idx], SymmetricMultQTypeWrapper):
-#             qrec.in_qs[idx] = qrec.in_qs[idx].wrapped
-#             if isinstance(node, (Conv2DParameters, FcParameters)):
-#                 qrec.in_qs[2].link(qrec.in_qs[1], qrec.in_qs[idx])
-#         if isinstance(node, OutputParameters) and isinstance(qrec.out_qs[0], SymmetricMultQTypeWrapper):
-#             qrec.out_qs[0] = qrec.out_qs[0].wrapped
-
-#     def do_change(self, G, node, idx=0):
-#         self.change_output_to_async(G, node, idx)
-#         for edge in G.out_edges(node.name):
-#             if isinstance(edge.to_node, CAN_PROPAGATE_INPUT):
-#                 self.change_input_to_async(G, edge.to_node, edge.to_idx)
-#                 self.do_change(G, edge.to_node, edge.from_idx)
-#             else:
-#                 assert isinstance(edge.to_node, CAN_CHANGE_INPUT)
-#                 if isinstance(edge.to_node, ConvFusionParameters):
-#                     filters = edge.to_node.contained_filters()
-#                     assert len(filters) == 1 and not filters[0].padding.has_padding
-#                 if isinstance(edge.to_node, Conv2DParameters):
-#                     assert not edge.to_node.padding.has_padding
-#                 self.change_input_to_async(G, edge.to_node, edge.to_idx)
-
-#     def _match(self, G: GraphView, set_identity: bool = True, **kwargs):
-#         if not G.quantization:
-#             return
-#         input_dict = {}
-#         for node in G.nodes():
-#             if not self.can_change_output(node):
-#                 continue
-#             all_matches = []
-#             for succ in [succ for succs in G.successors(node.name) for succ in succs]:
-#                 matches = self.can_change_input(G, succ)
-#                 if matches is None:
-#                     all_matches = None
-#                     break
-#                 all_matches += matches
-#             if all_matches is None:
-#                 continue
-#             input_dict[node] = all_matches
-
-#         input_dict = self.validate_multi_input(G, input_dict)
-#         for node in input_dict:
-#             # all nodes that can currently change output have one output
-#             self.do_change(G, node)
-
-#         if set_identity:
-#             self.set_identity(G)
-#         return False
diff --git a/tools/nntool/graph/matches/matchers/match_external_bias.py b/tools/nntool/graph/matches/matchers/fuse_external_bias.py
similarity index 80%
rename from tools/nntool/graph/matches/matchers/match_external_bias.py
rename to tools/nntool/graph/matches/matchers/fuse_external_bias.py
index 213a99851..fe258b10f 100644
--- a/tools/nntool/graph/matches/matchers/match_external_bias.py
+++ b/tools/nntool/graph/matches/matchers/fuse_external_bias.py
@@ -20,9 +20,8 @@
                          MatrixAddParameters, MatrixMulParameters, NNEdge)
 from graph.types.others import ReshapeParameters
 from utils.graph import GraphView
-from utils.node_id import NodeId
 
-from ..matcher import Matcher, match_name, description, groups, run_before
+from ..matcher import Matcher, match_name, description, groups, run_before, run_qtune_on_match
 
 LOG = logging.getLogger("nntool." + __name__)
 
@@ -33,10 +32,13 @@
     MatrixMulParameters: (np.multiply, True)
 }
 
+
 @match_name('fuse_external_bias')
 @description('Fuse bias addition after filter with filter bias')
 @groups('scaled', 'symmetric')
-@run_before('match_op_activation', 'move_pooling_scale8', 'move_activations_pow2', 'move_activations_scale8')
+@run_before('fuse_op_activation_scale8', 'fuse_op_activation_pow2', 'move_pooling_scale8',
+            'move_activations_up')
+@run_qtune_on_match
 class MatchExternalBias(Matcher):
 
     def _match(self, G: GraphView, set_identity: bool = True, **kwargs):
@@ -77,37 +79,35 @@ def _match(self, G: GraphView, set_identity: bool = True, **kwargs):
                     LOG.warning('could not absorb %s into %s',
                                 const_node.name, filter_node.name)
                     break
-                # If there is quantization then essentially the output of the filter
-                # takes the quantization of the output of the operation.
-                # The biases will not change since their quantization depends on the weights
-                # and input
-                fnid = NodeId(filter_node)
-                opnid = NodeId(op_node)
-                if G.quantization and (fnid in G.quantization or opnid in G.quantization):
-                    if not (fnid in G.quantization and opnid in G.quantization):
-                        LOG.warning(
-                            'could not absorb %s into %s - graph is partially quantized', const_node.name, filter_node.name)
-                        break
-                    fqrec = G.quantization[fnid]
-                    opqrec = G.quantization[opnid]
-                    fqrec.out_qs[0] = opqrec.out_qs[0]
 
                 has_modified_graph = True
                 LOG.info("fusing bias in %s into %s",
                          const_node.name, filter_node.name)
                 self.fuse_bias(G, filter_node, other_idx, op, flat_value, 2)
                 if weights_and_biases:
-                    # TODO - need to adjust weights quantization here
                     LOG.info("fusing multiplicative bias in %s into %s",
                              const_node.name, filter_node.name)
                     self.fuse_bias(G, filter_node, other_idx,
                                    op, flat_value, 1)
 
-                out_edges = G.out_edges(op_node.name)
+                # save out edges and remove the mul or add
+                out_edges = G.out_edges(op_node)
                 G.remove(op_node)
                 if remove_constant:
                     G.remove(const_node)
+
+                # it's possible that there is a broadcast on the op from the constant
+                # if there is insert a reshape since it will no longer happen
+                in_shape = tuple(op_node.in_dims[out_edge.to_idx].shape)
+                out_shape = tuple(op_node.out_dims[0].shape)
                 from_node = seen_reshape[-1] if seen_reshape else filter_node
+                if in_shape != out_shape:
+                    reshape = ReshapeParameters(G.unique_name(f'{op_node.name}_reshape'),
+                                                old_shape=in_shape, shape=out_shape)
+                    G.add_edge(NNEdge(from_node=from_node, to_node=reshape))
+                    from_node = reshape
+
+                # connect up the output nodes
                 for edge in out_edges:
                     G.add_edge(NNEdge(from_node=from_node,
                                       to_node=edge.to_node, to_idx=edge.to_idx))
diff --git a/tools/nntool/graph/matches/matchers/match_external_bias_matmul.py b/tools/nntool/graph/matches/matchers/fuse_external_bias_matmul.py
similarity index 92%
rename from tools/nntool/graph/matches/matchers/match_external_bias_matmul.py
rename to tools/nntool/graph/matches/matchers/fuse_external_bias_matmul.py
index 04aa887c8..03c50aee0 100644
--- a/tools/nntool/graph/matches/matchers/match_external_bias_matmul.py
+++ b/tools/nntool/graph/matches/matchers/fuse_external_bias_matmul.py
@@ -21,7 +21,7 @@
     RemoveUnnecessaryQuantizeOperators
 from graph.types import (ConstantInputParameters, MatrixAddParameters,
                          MatrixMulParameters, NNEdge)
-from graph.types.tensor_arithmetic import MatMulOpParameters
+from graph.types.tensor_arithmetic import MatMulOpParameters, MatMulTransposedParameters
 from quantization.quantizer.new_quantizer import NewQuantizer
 from utils.graph import GraphView
 from utils.node_id import NodeId
@@ -65,7 +65,7 @@ def reverse_matmul(G: GraphView, params):
 @match_name('fuse_external_bias_matmul')
 @description('Fuse bias addition after matmul')
 @groups('scaled', 'symmetric')
-@run_before('match_op_activation', 'move_pooling_scale8', 'move_activations_pow2', 'move_activations_scale8', 'fuse_op_activation_scale8', 'fuse_op_activation_pow2')
+@run_before('fuse_op_activation_scale8', 'fuse_op_activation_pow2', 'move_pooling_scale8', 'move_activations_up')
 class MatchExternalBiasMatmul(Matcher):
 
     def _match(self, G: GraphView, set_identity: bool = True, **kwargs):
@@ -98,10 +98,7 @@ def _match(self, G: GraphView, set_identity: bool = True, **kwargs):
 
                 flat_value = const_node.dqvalue.flatten()
                 out_shape = matmul.out_dims[0].shape
-                if len(out_shape) != 2:
-                    raise ValueError(
-                        f'strange outputs shape of {out_shape} for matmul {params.name}')
-                if len(flat_value) != out_shape[0] and len(flat_value) != out_shape[1]:
+                if len(flat_value) != out_shape[-1] and len(flat_value) != out_shape[-2]:
                     LOG.info("can't fuse %s into %s - value shape is not correct for bias",
                              const_node.name, matmul.name)
                     break
@@ -110,7 +107,7 @@ def _match(self, G: GraphView, set_identity: bool = True, **kwargs):
                 out_node = seen_reshape[-1] if seen_reshape else matmul
                 if isinstance(op_node, MatrixAddParameters):
                     if has_bias:
-                        if len(flat_value.shape) != len(matmul.in_dims[2]):
+                        if len(flat_value) != matmul.in_dims[2].size():
                             LOG.info(
                                 "can't fuse %s into %s - bias shape is not the same", const_node.name, matmul.name)
                             break
@@ -120,7 +117,7 @@ def _match(self, G: GraphView, set_identity: bool = True, **kwargs):
                             "folding additive bias from %s into existing bias on %s", op_node.name, matmul.name)
                         bias_node.value = bias_node.dqvalue + flat_value
                     else:
-                        if len(flat_value) != out_shape[1]:
+                        if len(flat_value) != out_shape[-1]:
                             # matmul needs to be transposed to fuse this
                             in_nodes, trans_node = reverse_matmul(G, matmul)
                             if seen_reshape:
diff --git a/tools/nntool/graph/matches/matchers/fuse_gap_convs.py b/tools/nntool/graph/matches/matchers/fuse_gap_convs.py
new file mode 100644
index 000000000..1b4c368cd
--- /dev/null
+++ b/tools/nntool/graph/matches/matchers/fuse_gap_convs.py
@@ -0,0 +1,226 @@
+# Copyright (C) 2020  GreenWaves Technologies, SAS
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Affero General Public License for more details.
+
+# You should have received a copy of the GNU Affero General Public License
+# along with this program.  If not, see <https://www.gnu.org/licenses/>.
+
+import logging
+from copy import deepcopy
+
+from graph.types import (ActivationParameters, Conv2DParameters,
+                         ConvFusionParameters, HSigmoidActivationParameters,
+                         HSwishActivationParameters, LeakyActivationParameters,
+                         NNEdge, PoolingParameters, ReluActivationParameters,
+                         SigmoidActivationParameters)
+from graph.types.activations import (HTanHActivationParameters,
+                                     TanHActivationParameters)
+from graph.types.base import NNNodeRef
+from graph.types.fusions import FusionInputParameters, FusionOutputParameters
+from utils.graph import GraphView, NodeRef
+
+from ..matcher import (Matcher, description, groups, match_name,
+                       run_adjust_on_match, run_qtune_on_match)
+
+LOG = logging.getLogger("nntool." + __name__)
+
+VALID_ACTIVATIONS_SQ8 = (  # activation classes accepted unless group_identity is 'pow2_match_group'
+    ReluActivationParameters,
+    LeakyActivationParameters,
+    HSigmoidActivationParameters,
+    HSwishActivationParameters,
+    SigmoidActivationParameters,
+    TanHActivationParameters,
+    HTanHActivationParameters
+)
+
+VALID_FUSIONS_SQ8 = (  # compared with NewFusionMatch.calc_fusion_type(..., pool_type=True) strings
+    'conv_active',
+    'conv_max_active',
+    'conv_average_active',
+    'conv_active_max',
+)
+
+VALID_ACTIVATIONS_POW2 = (  # activation classes accepted when group_identity is 'pow2_match_group'
+    ReluActivationParameters,
+    LeakyActivationParameters,
+    HSigmoidActivationParameters,
+    HSwishActivationParameters,
+    SigmoidActivationParameters
+)
+
+VALID_FUSIONS_POW2 = (  # same entries as VALID_FUSIONS_SQ8; selected for 'pow2_match_group'
+    'conv_active',
+    'conv_max_active',
+    'conv_average_active',
+    'conv_active_max',
+)
+
+
+class MergeStopError(Exception):
+    """Raised by NewFusionMatch.add_node when a node cannot extend the chain being collected."""
+
+
+class MergeAbortError(Exception):
+    """Raised by NewFusionMatch.add_node when the contents of an existing fusion are rejected."""
+class NewFusionMatch():
+    """Collects a conv/pool/activation chain and fuses it into a ConvFusionParameters node."""
+
+    def __init__(self, valid_activations, valid_fusions) -> None:
+        self.fusion = None  # ConvFusionParameters the chain starts with, if any
+        self.conv = None
+        self.pool = None
+        self.active = None
+        self.valid_activations = valid_activations  # activation classes add_node accepts
+        self.valid_fusions = valid_fusions  # fusion type strings can_add/can_fuse test against
+        self.order = []  # chain nodes in the order they were added
+        self.nodes_in_fusion = 0  # count of nodes added from inside self.fusion
+
+    @classmethod
+    def from_node(cls, G, node, valid_activations, valid_fusions) -> 'NewFusionMatch':
+        """Collect the chain starting at node; None if an existing fusion's contents are rejected."""
+        matcher = cls(valid_activations, valid_fusions)
+        try:
+            matcher.add_node(node)
+            edges = G.out_edges(node)
+            while len(edges) == 1:  # stop on fan-out and also when nothing consumes the node
+                node = edges[0].to_node
+                matcher.add_node(node)
+                edges = G.out_edges(node)
+        except MergeStopError:
+            pass
+        except MergeAbortError:
+            return None
+        return matcher
+
+    @staticmethod
+    def calc_fusion_type(contents, pool_type=False):
+        """Join 'conv', 'active' and 'pool' (or params.pool_type if pool_type is set) with '_'."""
+        return '_'.join(['conv' if isinstance(params, Conv2DParameters)
+                         else 'active' if isinstance(params, ActivationParameters)
+                         else params.pool_type if pool_type else 'pool' for params in contents])
+
+    def can_add(self, node):
+        """True if the chain extended by node is still a prefix of some valid fusion type."""
+        fusion_type = self.calc_fusion_type(self.order + [node], pool_type=True)
+        return any(valid_fusion.startswith(fusion_type) for valid_fusion in self.valid_fusions)
+
+    def add_node(self, params, in_fusion=False):
+        """Append params to the chain; raises MergeStopError or MergeAbortError if it does not fit."""
+        if in_fusion:
+            self.nodes_in_fusion += 1
+        if isinstance(params, ConvFusionParameters):
+            if self.fusion:
+                raise MergeStopError() # @IgnoreException
+            self.fusion = params
+            try:
+                for cnode in params.contained_nodes():
+                    self.add_node(cnode, in_fusion=True)
+            except MergeStopError:  # @IgnoreException
+                raise MergeAbortError()
+        elif isinstance(params, Conv2DParameters):
+            if self.conv or not self.can_add(params):
+                raise MergeStopError() # @IgnoreException
+            self.order.append(params)
+            self.conv = params
+        elif isinstance(params, self.valid_activations):
+            if self.active or not self.can_add(params):
+                raise MergeStopError() # @IgnoreException
+            self.order.append(params)
+            self.active = params
+        elif isinstance(params, PoolingParameters):
+            if self.pool or not self.can_add(params):
+                raise MergeStopError() # @IgnoreException
+            self.order.append(params)
+            self.pool = params
+        else:
+            raise MergeStopError() # @IgnoreException
+
+    @property
+    def can_fuse(self):
+        """True when the chain is a valid fusion type and holds nodes not yet inside the fusion."""
+        return (self.calc_fusion_type(self.order, pool_type=True) in self.valid_fusions
+                and len(self.order) > self.nodes_in_fusion)
+
+    def fuse(self, G: GraphView):
+        """Remove the pending chain nodes from G, chain them in the fusion subgraph and rewire G."""
+        fusion_outputs = G.out_edges(self.order[-1])
+        if self.fusion is None:
+            fuse_node_name = G.unique_name(self.conv.name + '_fusion')
+            subg = GraphView()
+            # one fusion input per conv input, each declared with that input's own shape
+            inputs = [NNNodeRef(subg, FusionInputParameters(f'{fuse_node_name}_in_{idx}',
+                idx=idx, dims=self.conv.in_dims[idx].shape), 0) for idx in range(3)]
+            in_edges = G.indexed_in_edges(self.conv)
+        else:
+            fuse_node_name = self.fusion.name
+            subg = self.fusion.subgraph
+            subg_output = subg.outputs()
+            assert len(subg_output) == 1
+            inputs = [NNNodeRef(subg, subg.in_edges(subg_output[0])[0].from_node, 0)]
+            subg.remove_all(subg_output)
+            in_edges = None
+        nodes_to_fuse = self.order[self.nodes_in_fusion:]
+        LOG.info(f'fusing nodes {",".join(node.name for node in nodes_to_fuse)}'
+                 f' into {fuse_node_name}')
+        for node in nodes_to_fuse:
+            G.remove(node)
+            inputs = [node(*inputs)]
+        FusionOutputParameters(f'{fuse_node_name}_out_0', dims=node.out_dims[0].shape)(*inputs)
+        if not self.fusion:
+            fusion = ConvFusionParameters(
+                fuse_node_name, fusion_type=self.fusion_type, subgraph=subg,
+                in_dims_hint=self.conv.in_dims_hint, out_dims_hint=self.conv.out_dims_hint,
+                in_dims=deepcopy(self.conv.in_dims),
+                out_dims=deepcopy(self.order[-1].out_dims),
+                inout_set=True)
+            for edge in in_edges:
+                G.add_edge(edge.clone(to_node=fusion))
+        else: # in the fusion case the outputs will already be removed since the node after was removed
+            fusion = self.fusion
+        for edge in fusion_outputs:
+            G.add_edge(edge.clone(from_node=fusion))
+        if G.quantization:
+            for node in nodes_to_fuse:
+                G.quantization.move_to_fusion(node, fusion)
+
+    @property
+    def fusion_type(self):
+        """Fusion type string of the chain with every pooling node reported as 'pool'."""
+        return self.calc_fusion_type(self.order)
+
+
+@groups('*')
+@match_name("fuse_gap_convs")
+@run_qtune_on_match
+@description('Fuse convolutions, pools and activations to match GAP AutoTiler operations')
+class MatchAllGapConv(Matcher):
+    """Matcher that fuses conv, pool and activation chains using NewFusionMatch."""
+
+    def _match(self, G: GraphView, set_identity: bool = True, **kwargs):
+        """Fuse every fusable chain found in G; returns True if the graph was modified."""
+        # the pow2 match group selects its own activation and fusion tables
+        if kwargs.get('group_identity') == 'pow2_match_group':
+            valid_activations, valid_fusions = VALID_ACTIVATIONS_POW2, VALID_FUSIONS_POW2
+        else:
+            valid_activations, valid_fusions = VALID_ACTIVATIONS_SQ8, VALID_FUSIONS_SQ8
+
+        modified = False
+        for start_node in G.nodes(node_classes=(Conv2DParameters, ConvFusionParameters)):
+            candidate = NewFusionMatch.from_node(G, start_node, valid_activations, valid_fusions)
+            if candidate and candidate.can_fuse:
+                modified = True
+                candidate.fuse(G)
+
+        if set_identity:
+            self.set_identity(G)
+
+        return modified
diff --git a/tools/nntool/graph/matches/matchers/match_gap_linear.py b/tools/nntool/graph/matches/matchers/fuse_gap_linear.py
similarity index 89%
rename from tools/nntool/graph/matches/matchers/match_gap_linear.py
rename to tools/nntool/graph/matches/matchers/fuse_gap_linear.py
index 634bcb62b..4ba1624e6 100644
--- a/tools/nntool/graph/matches/matchers/match_gap_linear.py
+++ b/tools/nntool/graph/matches/matchers/fuse_gap_linear.py
@@ -19,11 +19,11 @@
                          HSwishActivationParameters, LeakyActivationParameters,
                          LinearFusionParameters, NNEdge,
                          ReluActivationParameters, SigmoidActivationParameters)
-from quantization.new_qrec import QRec
+from graph.types.activations import TanHActivationParameters
 from utils.graph import GraphView
-from utils.node_id import NodeId
 
-from ..matcher import Matcher, description, groups, match_name
+from ..matcher import (Matcher, description, groups, match_name,
+                       run_adjust_on_match, run_qtune_on_match)
 
 LOG = logging.getLogger("nntool." + __name__)
 
@@ -32,7 +32,8 @@
     LeakyActivationParameters,
     HSigmoidActivationParameters,
     HSwishActivationParameters,
-    SigmoidActivationParameters
+    SigmoidActivationParameters,
+    TanHActivationParameters,
 )
 
 VALID_ACTIVATIONS_POW2 = (
@@ -80,6 +81,8 @@ def move_stats_to_fusion(fusion, stats):
 
 
 @groups('*')
+@run_qtune_on_match
+@run_adjust_on_match
 @match_name("fuse_gap_linear")
 @description('Fuse linear layers and activations to match GAP AutoTiler operations')
 class MatchGapLinear(Matcher):
@@ -124,14 +127,8 @@ def _match(self, G: GraphView, set_identity: bool = True, **kwargs):
                 input_mapping=input_mapping,
                 output_mapping=output_mapping)
             if G.quantization:
-                # TODO - stats
-                qrecs = G.quantization.get_all(pnode.contained_nodes())
-                if qrecs:
-                    prec = QRec.copy_ktype(
-                        qrecs[0], in_qs=qrecs[0].in_qs, out_qs=qrecs[-1].out_qs)
-                    for node in pnode.contained_nodes():
-                        G.quantization.move_to_fusion(node, pnode)
-                    G.quantization[NodeId(pnode)] = prec
+                for node in pnode.contained_nodes():
+                    G.quantization.move_to_fusion(node, pnode)
             in_edges = G.in_edges(node_list.linear.name)
             out_edges = G.out_edges(last_node.name)
             for node in node_list.order:
diff --git a/tools/nntool/graph/matches/matchers/match_gap_pool.py b/tools/nntool/graph/matches/matchers/fuse_gap_pool.py
similarity index 84%
rename from tools/nntool/graph/matches/matchers/match_gap_pool.py
rename to tools/nntool/graph/matches/matchers/fuse_gap_pool.py
index 8b0314d5a..db7aa2997 100644
--- a/tools/nntool/graph/matches/matchers/match_gap_pool.py
+++ b/tools/nntool/graph/matches/matchers/fuse_gap_pool.py
@@ -14,20 +14,17 @@
 # along with this program.  If not, see <https://www.gnu.org/licenses/>.
 
 import logging
-from copy import deepcopy
 
-import numpy as np
 from graph.types import (HSigmoidActivationParameters,
                          HSwishActivationParameters, LeakyActivationParameters,
                          NNEdge, PoolingParameters, ReluActivationParameters,
                          SigmoidActivationParameters)
 from graph.types.fusions import ActivationFusion
 from graph.types.global_pooling import GlobalPoolingParameters
-from quantization.new_qrec import QRec
 from utils.graph import GraphView
-from utils.node_id import NodeId
 
-from ..matcher import Matcher, description, groups, match_name, run_after
+from ..matcher import (Matcher, description, groups, match_name,
+                       run_adjust_on_match, run_after, run_qtune_on_match)
 
 LOG = logging.getLogger("nntool." + __name__)
 
@@ -86,7 +83,10 @@ def fusion_type(self):
 
 
 @groups('*')
+
 @match_name("fuse_gap_pool")
+@run_qtune_on_match
+@run_adjust_on_match
 @description('Fuse pooling layers and activations to match GAP AutoTiler operations')
 @run_after('fuse_gap_convs')
 class MatchGapPool(Matcher):
@@ -135,19 +135,8 @@ def _match(self, G: GraphView, set_identity: bool = True, **kwargs):
                 output_mapping=output_mapping)
             if G.quantization:
                 # TODO - stats
-                qrecs = G.quantization.get_all(pnode.contained_nodes())
-                if qrecs:
-                    prec = QRec.copy_ktype(
-                        qrecs[0], in_qs=qrecs[0].in_qs, out_qs=qrecs[-1].out_qs)
-                    for node in pnode.contained_nodes():
-                        G.quantization.move_to_fusion(node, pnode)
-                        if isinstance(node, GlobalPoolingParameters):
-                            # Global pooling fused with activations need to have only the activation scale
-                            G.quantization[NodeId(pnode, node)].out_qs[0] = deepcopy(
-                                G.quantization[NodeId(pnode, node)].in_qs[0])
-                            G.quantization[NodeId(
-                                pnode, node)].out_qs[0].dtype = np.int32
-                    G.quantization[NodeId(pnode)] = prec
+                for node in pnode.contained_nodes():
+                    G.quantization.move_to_fusion(node, pnode)
             in_edges = G.in_edges(node_list.pool.name)
             out_edges = G.out_edges(last_node.name)
             for node in node_list.order:
diff --git a/tools/nntool/graph/matches/matchers/matscale.py b/tools/nntool/graph/matches/matchers/fuse_matscale.py
similarity index 100%
rename from tools/nntool/graph/matches/matchers/matscale.py
rename to tools/nntool/graph/matches/matchers/fuse_matscale.py
diff --git a/tools/nntool/graph/matches/matchers/match_op_activation.py b/tools/nntool/graph/matches/matchers/fuse_op_activation.py
similarity index 92%
rename from tools/nntool/graph/matches/matchers/match_op_activation.py
rename to tools/nntool/graph/matches/matchers/fuse_op_activation.py
index 406dbde5c..11dd896f3 100644
--- a/tools/nntool/graph/matches/matchers/match_op_activation.py
+++ b/tools/nntool/graph/matches/matchers/fuse_op_activation.py
@@ -13,7 +13,6 @@
 # You should have received a copy of the GNU Affero General Public License
 # along with this program.  If not, see <https://www.gnu.org/licenses/>.
 
-from graph.types.tensor_arithmetic import MatMulTransposedParameters
 import logging
 from abc import abstractproperty
 
@@ -22,14 +21,13 @@
                          GlobalPoolingParameters, HSigmoidActivationParameters,
                          HSwishActivationParameters, LeakyActivationParameters,
                          MatMulOpFusionParameters, MatMulOpParameters,
-                         MatrixAddParameters, NNEdge,
-                         PoolingParameters, ReluActivationParameters,
-                         SigmoidActivationParameters)
-from quantization.new_qrec import QRec
+                         MatrixAddParameters, NNEdge, PoolingParameters,
+                         ReluActivationParameters, SigmoidActivationParameters)
+from graph.types.tensor_arithmetic import MatMulTransposedParameters
 from utils.graph import GraphView
 from utils.node_id import NodeId
 
-from ..matcher import Matcher, description, groups, match_name, run_after
+from ..matcher import Matcher, description, groups, match_name, run_after, run_qtune_on_match, run_adjust_on_match
 
 LOG = logging.getLogger("nntool." + __name__)
 
@@ -134,6 +132,8 @@ def fusion_type(self):
 
 
 @run_after('fuse_gap_pool', 'fuse_external_bias_matmul')
+@run_qtune_on_match
+@run_adjust_on_match
 class MatchOpActivation(Matcher):
 
     @abstractproperty
@@ -179,13 +179,8 @@ def _match(self, G: GraphView, set_identity: bool = True, **kwargs):
                 output_mapping=output_mapping)
             if G.quantization:
                 # TODO - stats
-                qrecs = G.quantization.get_all(pnode.contained_nodes())
-                if qrecs:
-                    prec = QRec.copy_ktype(
-                        qrecs[0], in_qs=qrecs[0].in_qs, out_qs=qrecs[-1].out_qs)
-                    for fnode in pnode.contained_nodes():
-                        G.quantization.move_to_fusion(fnode, pnode)
-                    G.quantization[NodeId(pnode)] = prec
+                for fnode in pnode.contained_nodes():
+                    G.quantization.move_to_fusion(fnode, pnode)
             in_edges = G.in_edges(node_list.node.name)
             out_edges = G.out_edges(last_node.name)
             for snode in node_list.order:
diff --git a/tools/nntool/graph/matches/matchers/fuse_pad.py b/tools/nntool/graph/matches/matchers/fuse_pad.py
index d5826ce5b..ed00d72eb 100644
--- a/tools/nntool/graph/matches/matchers/fuse_pad.py
+++ b/tools/nntool/graph/matches/matchers/fuse_pad.py
@@ -53,7 +53,7 @@ def expand_padding(from_shape, to_shape, padding):
 @match_name('fuse_pad')
 @description('Fuse pad operation to subsequent Convolution or Pool')
 @groups('*')
-@run_before('match_gap_conv', 'match_gap_pool')
+@run_before('fuse_gap_convs', 'fuse_gap_pool')
 class MatchFusePad(Matcher):
     @staticmethod
     def remove_padding(shape, padding):
diff --git a/tools/nntool/graph/matches/matchers/match_channel_padded_add.py b/tools/nntool/graph/matches/matchers/fuse_padded_add.py
similarity index 91%
rename from tools/nntool/graph/matches/matchers/match_channel_padded_add.py
rename to tools/nntool/graph/matches/matchers/fuse_padded_add.py
index a650765a4..e4f8ba545 100644
--- a/tools/nntool/graph/matches/matchers/match_channel_padded_add.py
+++ b/tools/nntool/graph/matches/matchers/fuse_padded_add.py
@@ -22,7 +22,7 @@
 from utils.graph import GraphView
 from utils.node_id import NodeId
 
-from ..matcher import Matcher, match_name, description, groups
+from ..matcher import Matcher, match_name, description, groups, run_before, run_qtune_on_match
 
 LOG = logging.getLogger("nntool." + __name__)
 
@@ -64,6 +64,8 @@ def fusion_type(self):
 
 @match_name('fuse_padded_add')
 @description('Fuse convolutions, pools and activations to match GAP AutoTiler operations')
+@run_before('fuse_op_activation_scale8')
+@run_qtune_on_match
 @groups('scaled')
 class MatchPadAddAct(Matcher):
 
@@ -109,14 +111,8 @@ def _match(self, G: GraphView, set_identity: bool = True, **kwargs):
                 input_mapping=input_mapping,
                 output_mapping=output_mapping)
             if G.quantization:
-                qrecs = G.quantization.get_all(pnode.contained_nodes())
-                # TODO - stats
-                if qrecs:
-                    prec = QRec.copy_ktype(
-                        qrecs[1], in_qs=qrecs[1].in_qs, out_qs=qrecs[-1].out_qs)
-                    for node in pnode.contained_nodes():
-                        G.quantization.move_to_fusion(node, pnode)
-                    G.quantization[NodeId(pnode)] = prec
+                for node in pnode.contained_nodes():
+                    G.quantization.move_to_fusion(node, pnode)
             if padded_input_idx == 0:
                 in_edges = G.in_edges(node_list.pad.name) + \
                     G.indexed_in_edges(node_list.add.name)[1::]
diff --git a/tools/nntool/graph/matches/matchers/insert_copies.py b/tools/nntool/graph/matches/matchers/insert_copies.py
index 29b53c275..dce28ef08 100644
--- a/tools/nntool/graph/matches/matchers/insert_copies.py
+++ b/tools/nntool/graph/matches/matchers/insert_copies.py
@@ -16,8 +16,8 @@
 import logging
 from copy import deepcopy
 
-from graph.types import (ConcatParameters, CopyParameters, InputParameters,
-                         NNEdge, NoOPParameters, OutputParameters,
+from graph.types import (ConcatParameters, CopyParameters, InputParameters, RNNBaseParameters,
+                         NNEdge, NoOPParameters, OutputParameters, ConstantInputParameters,
                          ReshapeParameters, SplitParameters,
                          TransposeParameters)
 from quantization.new_qrec import QRec
@@ -30,34 +30,30 @@
 LOG = logging.getLogger("nntool." + __name__)
 
 
-def find_real_in_edge(G, edge):
-    from_node = edge.from_node
-    if isinstance(from_node, ReshapeParameters):
-        res = find_real_in_edge(G, G.in_edges(from_node.name)[0])
-        return res
-    if isinstance(from_node, NoOPParameters):
-        res = find_real_in_edge(G, G.in_edges(from_node.name)[0])
-        return res
-    if isinstance(from_node, TransposeParameters):
-        _, real_transpose = from_node.real_shape()
-        if len(real_transpose) <= 1:
-            res = find_real_in_edge(G, G.in_edges(from_node.name)[0])
-            return res
-    return (edge.from_node, edge.from_idx)
+class VisitEdge():
+    """An edge paired with a travel direction; equality and hashing use the edge only."""
+    def __init__(self, edge, direction) -> None:
+        self.edge, self.direction = edge, direction
+
+    def __eq__(self, other: object) -> bool:
+        return isinstance(other, VisitEdge) and self.edge == other.edge
+
+    def __hash__(self) -> int:
+        return self.edge.__hash__()
 
 
 @match_name('insert_copies')
-@description('insert copy nodes on edges that link splits to concats')
-@run_after('insert_transposes')
+@description('insert copy nodes on edges that link stacked tensors between themselves or to inputs or outputs')
 @groups('*')
 @needs_valid_dimension(True)
 class MatchInsertCopies(Matcher):
     @staticmethod
     def can_pass_node(node):
-        return (isinstance(node, (ReshapeParameters, NoOPParameters)) or
-                isinstance(node, TransposeParameters) and node.does_nothing)
+        # nodes that do not generate any kernel
+        return node.no_model_code
 
     def find_split_concat_down(self, G, edge):
+        # search for a split or concat on any downward edge
         if isinstance(edge.to_node, (SplitParameters, ConcatParameters)):
             return True
         elif self.can_pass_node(edge.to_node):
@@ -76,7 +72,7 @@ def search_up_for_duplicate(self, G, edge):
             for out_edge in out_edges:
                 if self.find_split_concat_down(G, out_edge):
                     return out_edge
-        elif self.can_pass_node(edge.to_node):
+        elif self.can_pass_node(edge.from_node):
             return self.search_up_for_duplicate(G, G.in_edges(edge.from_node)[0])
         return None
 
@@ -91,15 +87,22 @@ def insert_copy_at_edge(G, edge):
         if G.quantization and nid in G.quantization:
             qrec = G.quantization[nid]
             qtype = deepcopy(qrec.out_qs[edge.from_idx])
-            QRec.copy_ktype(qrec, in_qs=[qtype], out_qs=[qtype])
+            G.quantization[NodeId(copy_node)] = QRec.copy_ktype(qrec, in_qs=[qtype], out_qs=[qtype])
 
     def find_common_in_edges(self, G: GraphView):
-        # Look for splits and concats that share a common in edge where a copy is necessary
-        nodes = G.nodes(node_classes=(SplitParameters, ConcatParameters))
+        # Look for splits and concats that share a common in edge
+        # the split is a stacked tensor and the concat is an alias in a different stack
+        # a copy is always necessary
+        # RNNBaseParameters are also here since they create a UserKernelGroup where their
+        # input can already be in a stack so causes a tiler error
+        nodes = G.nodes(node_classes=(SplitParameters,
+                                      ConcatParameters, RNNBaseParameters))
         has_modified_graph = False
         while nodes:
             node = nodes.pop(0)
-            for in_edge in G.in_edges(node):
+            for in_edge in G.indexed_in_edges(node):
+                if isinstance(node, RNNBaseParameters) and in_edge.to_idx > 0:
+                    break
                 # find another edge that would be generated as the same edge
                 # with a concat/split on it. If found then insert a copy
                 # and search again on that node to find others
@@ -119,38 +122,87 @@ def search_up_for(self, G, edge, node_class):
             return self.search_up_for(G, G.in_edges(edge.from_node)[0], node_class)
         return None
 
-    def insert_copy_split_to_output_or_concat(self, G):
-        # insert copys between splits and outputs or concats
-        nodes = G.nodes(node_classes=(ConcatParameters, OutputParameters))
-        has_modified_graph = False
-        while nodes:
-            node = nodes.pop(0)
-            for edge in G.in_edges(node):
-                split_edge = self.search_up_for(G, edge, SplitParameters)
-                if split_edge is None:
-                    continue
-                has_modified_graph = True
-                self.insert_copy_at_edge(G, split_edge)
-        return has_modified_graph
+    def on_same_edge_as(self, G, node, node_class, visited=None, last_direction=None, start_edge=None):
+        """Walk the graph from the in edges of node looking for a node matching node_class.
+
+        The walk starts upwards on start_edge or, when it is not set, on every in edge of node and
+        continues through nodes accepted by can_pass_node. node_class is a tuple of classes or of
+        (class, direction) pairs. Returns the edge on which a match was reached or None if there
+        is none. visited may be supplied to exclude edges. last_direction is not used.
+        """
+        if visited is None:
+            visited = set()
+        # seed the work list even when the caller passes visited so that to_visit is always bound
+        if start_edge:
+            to_visit = {VisitEdge(start_edge, 'up')} - visited
+        else:
+            to_visit = set(VisitEdge(edge, 'up') for edge in G.in_edges(node)) - visited
+
+        while to_visit:
+            visited_edge = to_visit.pop()
+            visit_node = visited_edge.edge.from_node if visited_edge.direction == "up" else visited_edge.edge.to_node
+            visited.add(visited_edge)
+            # (class, direction) pairs match a node only when it is reached on the named side:
+            # travelling up arrives on a node's "down" side and vice versa. A stack alias cannot be
+            # an output. The aliases are the output side of a split and the input side of a concat
+            if isinstance(node_class[0], tuple):
+                wanted = "down" if visited_edge.direction == "up" else "up"
+                if any(pair[1] == wanted and isinstance(visit_node, pair[0]) for pair in node_class):
+                    return visited_edge.edge
+            elif isinstance(visit_node, node_class):
+                return visited_edge.edge
+            if visited_edge.direction == "up":
+                if self.can_pass_node(visit_node):
+                    # arrived upwards and can pass: visit all its edges that we have not visited
+                    to_visit |= set(VisitEdge(edge, 'up') for edge in G.in_edges(visit_node)) - visited
+                    to_visit |= set(VisitEdge(edge, 'down') for edge in G.out_edges(visit_node)) - visited
+                else:
+                    # can't pass but must still visit all the edges on the same out idx
+                    to_visit |= (set(VisitEdge(edge, 'down')
+                                     for edge in G.indexed_out_edges(visit_node)[visited_edge.edge.from_idx]) - visited)
+            elif self.can_pass_node(visit_node):
+                # the other up edges case is not here since if it is a concat it would already have
+                # triggered the return
+                to_visit |= set(VisitEdge(edge, 'down') for edge in G.out_edges(visit_node)) - visited
+        return None
 
-    def insert_copy_input_or_split_to_concat(self, G):
-        # insert copies between inputs or splits and concats
-        nodes = G.nodes(node_classes=(ConcatParameters))
+    COPIES_BETWEEN_CLSES = [
+        # any stack -> any stack (different memory - must always copy)
+        {'from': (SplitParameters, ConcatParameters), 'to': (
+            ConcatParameters, SplitParameters)},
+        # stacked memory -> stack in user kernel group
+        {'from': (SplitParameters, ConcatParameters), 'to': (RNNBaseParameters,)},
+        # stacked alias -> output (name is lost since it is an alias - could be fixed in AT)
+        {'from': ((SplitParameters, 'down'), (ConcatParameters, 'up')), 'to': (OutputParameters,)},
+        # # stacked tensor -> alias (different memory - must always copy)
+        # input -> stacked tensor (can be fixed in AT)
+        # input -> stacked alias (name is lost since it is an alias - could be fixed in AT)
+        {'from': (InputParameters,), 'to': (
+            ConcatParameters, SplitParameters)},
+        # constant -> stacked alias (I guess the constant could be loaded here in some cases by AT)
+        # constant -> stack (not sure if this works or not. Including for safety. If it doesn't work it could.)
+        {'from': (ConstantInputParameters,), 'to': (
+            ConcatParameters, SplitParameters)},
+    ]
+
+    def insert_copies_between(self, G, from_clses, to_clses):
+        # for each in edge of a to_clses node insert a copy at the edge where a from_clses node is found
+        nodes = G.nodes(node_classes=to_clses)
         has_modified_graph = False
         while nodes:
             node = nodes.pop(0)
-            for edge in G.in_edges(node):
-                input_edge = self.search_up_for(
-                    G, edge, (SplitParameters, InputParameters))
-                if input_edge is None:
-                    continue
-                has_modified_graph = True
-                self.insert_copy_at_edge(G, input_edge)
+            for edge in G.indexed_in_edges(node):
+                found_edge = self.on_same_edge_as(G, node, from_clses, start_edge=edge)
+                if found_edge:
+                    has_modified_graph = True
+                    self.insert_copy_at_edge(G, found_edge)
         return has_modified_graph
 
     def _match(self, G: GraphView, set_identity: bool = True, **kwargs):
-        has_modified_graph = self.insert_copy_input_or_split_to_concat(G)
-        has_modified_graph |= self.insert_copy_split_to_output_or_concat(G)
+        has_modified_graph = False
+        for clses in self.COPIES_BETWEEN_CLSES:
+            has_modified_graph |= self.insert_copies_between(
+                G, clses['from'], clses['to'])
         has_modified_graph |= self.find_common_in_edges(G)
 
         return has_modified_graph
diff --git a/tools/nntool/graph/matches/matchers/match_gap_conv.py b/tools/nntool/graph/matches/matchers/match_gap_conv.py
deleted file mode 100644
index cbb5ca865..000000000
--- a/tools/nntool/graph/matches/matchers/match_gap_conv.py
+++ /dev/null
@@ -1,171 +0,0 @@
-# Copyright (C) 2020  GreenWaves Technologies, SAS
-
-# This program is free software: you can redistribute it and/or modify
-# it under the terms of the GNU Affero General Public License as
-# published by the Free Software Foundation, either version 3 of the
-# License, or (at your option) any later version.
-
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-# GNU Affero General Public License for more details.
-
-# You should have received a copy of the GNU Affero General Public License
-# along with this program.  If not, see <https://www.gnu.org/licenses/>.
-
-from graph.types.activations import TanHActivationParameters
-import logging
-from copy import deepcopy
-
-from graph.types import (ActivationParameters, Conv2DParameters,
-                         ConvFusionParameters, HSigmoidActivationParameters,
-                         HSwishActivationParameters, LeakyActivationParameters,
-                         NNEdge, PoolingParameters, ReluActivationParameters,
-                         SigmoidActivationParameters)
-from quantization.new_qrec import QRec
-from utils.graph import GraphView
-from utils.node_id import NodeId
-
-from ..matcher import Matcher, description, groups, match_name
-
-LOG = logging.getLogger("nntool." + __name__)
-
-VALID_ACTIVATIONS_SQ8 = (
-    ReluActivationParameters,
-    LeakyActivationParameters,
-    HSigmoidActivationParameters,
-    HSwishActivationParameters,
-    SigmoidActivationParameters,
-    TanHActivationParameters
-)
-
-VALID_ACTIVATIONS_POW2 = (
-    ReluActivationParameters,
-    LeakyActivationParameters,
-    HSigmoidActivationParameters,
-    HSwishActivationParameters,
-    SigmoidActivationParameters
-)
-
-
-class FusionMatch():
-    def __init__(self, valid_activations) -> None:
-        self.conv = None
-        self.pool = None
-        self.active = None
-        self.tensor_order = None
-        self.valid_activations = valid_activations
-        self.order = []
-
-    def add_node(self, params):
-        if isinstance(params, Conv2DParameters):
-            if self.conv:
-                return None
-            self.tensor_order = params.ker_out_order[0]
-            self.order.append(params)
-            self.conv = params
-            return self
-        elif isinstance(params, self.valid_activations):
-            if self.active:
-                return None
-            self.order.append(params)
-            self.active = params
-            return self
-        elif isinstance(params, PoolingParameters):
-            if self.pool:
-                return None
-            if self.tensor_order != params.ker_in_order[0]:
-                return None
-            self.order.append(params)
-            self.pool = params
-            return self
-        else:
-            return None
-
-    @property
-    def fusion_type(self):
-        return '_'.join(['conv' if isinstance(params, Conv2DParameters)
-                         else 'active' if isinstance(params, ActivationParameters)
-                         else 'pool' for params in self.order])
-
-
-@groups('*')
-@match_name("fuse_gap_convs")
-@description('Fuse convolutions, pools and activations to match GAP AutoTiler operations')
-class MatchAllGapConv(Matcher):
-
-    def get_node_list(self, G, params, valid_activations, result=None):
-        if result is None:
-            result = FusionMatch(valid_activations)
-        if not result.add_node(params):
-            return result
-        out_edges = G.out_edges(params.name)
-        if len(out_edges) > 1:
-            return result
-        return self.get_node_list(G, out_edges[0].to_node, valid_activations, result=result)
-
-    def _match(self, G: GraphView, set_identity: bool = True, **kwargs):
-        has_modified_graph = False
-        group_identity = kwargs.get('group_identity')
-        if group_identity == 'pow2_match_group':
-            valid_activations = VALID_ACTIVATIONS_POW2
-        else:
-            valid_activations = VALID_ACTIVATIONS_SQ8
-        for conv_node in [params for params in G.nodes() if isinstance(params, Conv2DParameters)]:
-            node_list = self.get_node_list(G, conv_node, valid_activations)
-            if node_list is None or len(node_list.order) < 2:
-                continue
-            if node_list.fusion_type == 'conv_active_pool':
-                if node_list.pool.pool_type == "average":
-                    node_list.order = node_list.order[:2:]
-                    node_list.pool = None
-            elif node_list.fusion_type == 'conv_pool_active':
-                # NOTE: This is only for old POW2 kernels - SQ8 can handle this
-                if node_list.pool.pool_type == "average" and node_list.active.activation != "relu":
-                    continue
-            LOG.info("fusing nodes %s", ",".join(
-                (node.name for node in node_list.order)))
-            has_modified_graph = True
-            subgraph = GraphView()
-            last_node = None
-            for node in node_list.order:
-                if last_node is not None:
-                    subgraph.add_edge(
-                        NNEdge(from_node=last_node, to_node=node))
-                last_node = node
-            input_mapping = [[(node_list.conv, idx)] for idx in range(3)]
-            output_mapping = [(last_node, 0)]
-            pnode = ConvFusionParameters(
-                node_list.conv.name + '_fusion',
-                fusion_type=node_list.fusion_type,
-                subgraph=subgraph,
-                in_dims_hint=node_list.conv.in_dims_hint,
-                out_dims_hint=node_list.conv.out_dims_hint,
-                in_dims=deepcopy(node_list.conv.in_dims),
-                out_dims=deepcopy(node_list.order[-1].out_dims),
-                input_mapping=input_mapping,
-                output_mapping=output_mapping)
-            if G.quantization:
-                qrecs = G.quantization.get_all(pnode.contained_nodes())
-                if qrecs:
-                    # TODO - stats
-                    prec = QRec.copy_ktype(
-                        qrecs[0], in_qs=deepcopy(qrecs[0].in_qs), out_qs=deepcopy(qrecs[-1].out_qs))
-                    for node in pnode.contained_nodes():
-                        G.quantization.move_to_fusion(node, pnode)
-                    G.quantization[NodeId(pnode)] = prec
-            in_edges = G.in_edges(node_list.conv.name)
-            out_edges = G.out_edges(last_node.name)
-            for node in node_list.order:
-                G.remove(node)
-            for edge in in_edges:
-                G.add_edge(NNEdge(edge.from_node, pnode,
-                                  from_idx=edge.from_idx, to_idx=edge.to_idx))
-            for edge in out_edges:
-                G.add_edge(NNEdge(pnode, edge.to_node,
-                                  from_idx=edge.from_idx, to_idx=edge.to_idx))
-
-        if set_identity:
-            self.set_identity(G)
-
-        return has_modified_graph
diff --git a/tools/nntool/graph/matches/matchers/find_hsigmoid.py b/tools/nntool/graph/matches/matchers/match_hsigmoid.py
similarity index 100%
rename from tools/nntool/graph/matches/matchers/find_hsigmoid.py
rename to tools/nntool/graph/matches/matchers/match_hsigmoid.py
diff --git a/tools/nntool/graph/matches/matchers/match_matmul_add_bias.py b/tools/nntool/graph/matches/matchers/match_matmul_add_bias.py
deleted file mode 100644
index 16e4b6b2c..000000000
--- a/tools/nntool/graph/matches/matchers/match_matmul_add_bias.py
+++ /dev/null
@@ -1,140 +0,0 @@
-# Copyright (C) 2020  GreenWaves Technologies, SAS
-
-# This program is free software: you can redistribute it and/or modify
-# it under the terms of the GNU Affero General Public License as
-# published by the Free Software Foundation, either version 3 of the
-# License, or (at your option) any later version.
-
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-# GNU Affero General Public License for more details.
-
-# You should have received a copy of the GNU Affero General Public License
-# along with this program.  If not, see <https://www.gnu.org/licenses/>.
-
-# import logging
-
-# from graph.types import (ActivationParameters, ConstantInputParameters,
-#                          MatMulOpFusionParameters, MatMulOpParameters,
-#                          MatrixAddParameters, NNEdge)
-# from quantization.new_qrec import QRec
-# from utils.graph import GraphView
-# from utils.node_id import NodeId
-
-# from ..matcher import Matcher, groups, match_name, description
-
-# LOG = logging.getLogger("nntool." + __name__)
-
-
-# class FusionMatch():
-#     def __init__(self) -> None:
-#         self.matmul = None
-#         self.add = None
-#         self.active = None
-#         self.order = []
-
-#     def add_node(self, params, G):
-#         if isinstance(params, MatMulOpParameters):
-#             if self.matmul:
-#                 return None
-#             self.order.append(params)
-#             self.matmul = params
-#             return self
-#         elif isinstance(params, ActivationParameters):
-#             if self.active:
-#                 return None
-#             self.order.append(params)
-#             self.active = params
-#             return self
-#         elif isinstance(params, MatrixAddParameters):
-#             if self.add or self.active:
-#                 return None
-#             can_fuse = False
-#             for in_edge in G.in_edges(params.name):
-#                 can_fuse = can_fuse or isinstance(
-#                     in_edge.from_node, ConstantInputParameters)
-#             if not can_fuse:
-#                 return None
-#             self.order.append(params)
-#             self.add = params
-#             return self
-#         else:
-#             return None
-
-#     @property
-#     def fusion_type(self):
-#         return '_'.join(['matmul' if isinstance(params, MatMulOpParameters)
-#                          else 'with_bias' if isinstance(params, MatrixAddParameters)
-#                          else 'active' for params in self.order])
-
-# @groups('*')
-# @match_name("fuse_gap_matmul")
-# @description('Fuse matmul layers with optional add and/or activations to match GAP AutoTiler operations')
-# class MatchMatMulAddBias(Matcher):
-
-#     def get_node_list(self, G, params, result=None):
-#         if result is None:
-#             result = FusionMatch()
-#         if not result.add_node(params, G):
-#             return result
-#         out_edges = G.out_edges(params.name)
-#         if len(out_edges) > 1:
-#             return result
-#         return self.get_node_list(G, out_edges[0].to_node, result=result)
-
-#     def _match(self, G: GraphView, set_identity: bool = True, **kwargs):
-#         has_modified_graph = False
-#         for matmul_node in [params for params in G.nodes() if isinstance(params, MatMulOpParameters)]:
-#             node_list = self.get_node_list(G, matmul_node)
-#             if node_list is None or len(node_list.order) < 2:
-#                 continue
-#             LOG.info("fusing nodes %s", ",".join(
-#                 (node.name for node in node_list.order)))
-#             has_modified_graph = True
-#             subgraph = GraphView()
-#             if node_list.active is not None:
-#                 subgraph.add_edge(
-#                     NNEdge(from_node=node_list.matmul, to_node=node_list.active))
-#             input_mapping = [[(node_list.matmul, idx)] for idx in range(2)]
-#             if node_list.add:
-#                 input_mapping += [[(node_list.matmul, 2)]]
-#             output_mapping = [(node_list.active, 0)] if node_list.active else [
-#                 (node_list.matmul, 0)]
-#             pnode = MatMulOpFusionParameters(
-#                 node_list.matmul.name + '_fusion',
-#                 fusion_type=node_list.fusion_type,
-#                 subgraph=subgraph,
-#                 input_mapping=input_mapping,
-#                 output_mapping=output_mapping)
-#             if G.quantization:
-#                 # if there are quantization stats then clear them. They need to be created again
-#                 G.quantization.stats = None
-#                 qrecs = G.quantization.get_all(pnode.contained_nodes())
-#                 if qrecs:
-#                     prec = QRec.copy_ktype(
-#                         qrecs[0], in_qs=qrecs[0].in_qs, out_qs=qrecs[-1].out_qs)
-#                     for node in pnode.contained_nodes():
-#                         G.quantization.move_to_fusion(node, pnode)
-#                     G.quantization[NodeId(pnode)] = prec
-#             in_edges = G.in_edges(node_list.matmul.name)
-#             if node_list.add:
-#                 bias_edge = [add_edge for add_edge in G.in_edges(node_list.add.name) if isinstance(
-#                     add_edge.from_node, ConstantInputParameters)][0]
-#             out_edges = G.out_edges(node_list.order[-1].name)
-#             for node in node_list.order:
-#                 G.remove(node)
-#             for edge in in_edges:
-#                 G.add_edge(NNEdge(edge.from_node, pnode,
-#                                   from_idx=edge.from_idx, to_idx=edge.to_idx))
-#             if node_list.add:
-#                 G.add_edge(NNEdge(bias_edge.from_node, pnode,
-#                                   from_idx=bias_edge.from_idx, to_idx=2))
-#             for edge in out_edges:
-#                 G.add_edge(NNEdge(pnode, edge.to_node,
-#                                   from_idx=edge.from_idx, to_idx=edge.to_idx))
-
-#         if set_identity:
-#             self.set_identity(G)
-
-#         return has_modified_graph
diff --git a/tools/nntool/graph/matches/matchers/match_transpose_matmul.py b/tools/nntool/graph/matches/matchers/match_transpose_matmul.py
index b9ddb5c92..1b7cc5d53 100644
--- a/tools/nntool/graph/matches/matchers/match_transpose_matmul.py
+++ b/tools/nntool/graph/matches/matchers/match_transpose_matmul.py
@@ -13,30 +13,23 @@
 # You should have received a copy of the GNU Affero General Public License
 # along with this program.  If not, see <https://www.gnu.org/licenses/>.
 
-from graph.types.tensor_arithmetic import MatMulTransposedParameters
 import logging
-from abc import abstractproperty
 
-from graph.types import (TransposeParameters, ActivationParameters,
-                         BroadcastableActivationFusion,
-                         GlobalPoolingParameters, HSigmoidActivationParameters,
-                         HSwishActivationParameters, LeakyActivationParameters,
-                         MatMulOpFusionParameters, MatMulOpParameters,
-                         MatrixAddParameters, NNEdge,
-                         PoolingParameters, ReluActivationParameters,
-                         SigmoidActivationParameters)
-from quantization.new_qrec import QRec
+from graph.manipulations.eliminate_transposes.transpose_helpers import \
+    identity_transpose
+from graph.types import MatMulOpParameters, NNEdge, TransposeParameters
+from graph.types.tensor_arithmetic import MatMulTransposedParameters
 from utils.graph import GraphView
-from utils.node_id import NodeId
 
-from ..matcher import Matcher, description, groups, match_name, run_after, run_before
+from ..matcher import (Matcher, description, groups, match_name, run_after,
+                       run_before)
 
 LOG = logging.getLogger("nntool." + __name__)
 
 @run_after('fuse_external_bias_matmul')
 @run_before('fuse_op_activation_scale8', 'fuse_op_activation_pow2')
 @groups('*')
-@match_name("match_trans_matmul")
+@match_name("match_transpose_matmul")
 @description("spots Transpose followed by matmul and generates the proper matmul generator")
 class MatchTransMatMul(Matcher):
 
@@ -56,6 +49,11 @@ def _match(self, G: GraphView, set_identity: bool = True, **kwargs):
             trans_node = in_edges[1].from_node
             if not isinstance(trans_node, TransposeParameters):
                 continue
+            transpose = tuple(trans_node.transpose)
+            if not identity_transpose(transpose[:-2]):
+                continue
+            if transpose[-2:] != (len(transpose) - 1, len(transpose) - 2):
+                continue
             if isinstance(node, MatMulTransposedParameters):
                 new_node = MatMulOpParameters(node.name)
             else:
diff --git a/tools/nntool/graph/matches/matchers/move_node_up.py b/tools/nntool/graph/matches/matchers/move_node_up.py
index f3cd828fd..9522f7dcf 100644
--- a/tools/nntool/graph/matches/matchers/move_node_up.py
+++ b/tools/nntool/graph/matches/matchers/move_node_up.py
@@ -19,12 +19,12 @@
                          MatrixAddParameters, MatrixMulParameters, NNEdge,
                          PoolingParameters, ReluActivationParameters,
                          ReshapeParameters, TransposeParameters, MatMulTransposedParameters)
-from graph.types.others import ReverseParameters, StridedSliceParameters
+from graph.types.others import QuantizeParameters, ReverseParameters, StridedSliceParameters
 from graph.types.tensor_arithmetic import MatMulOpParameters
 from utils.graph import GraphView
 from utils.node_id import NodeId
 
-from ..matcher import Matcher, match_name, groups, run_before, description, needs_valid_dimension
+from ..matcher import Matcher, match_name, groups, run_before, description, needs_valid_dimension, run_qtune_on_match
 
 LOG = logging.getLogger("nntool." + __name__)
 
@@ -56,7 +56,8 @@ def find_home_for_node(self, G, node, first=True):
                 raise LocationNotFoundError()  # @IgnoreException
             # Concat can have multiple inputs that must all acccept moved node
             if isinstance(node, ConcatParameters):
-                for in_edge in G.in_edges(node):
+                # important to use indexed here so the order is always the same
+                for in_edge in G.indexed_in_edges(node):
                     yield from self.find_home_for_node(G,
                                                        in_edge.from_node,
                                                        first=False)
@@ -140,10 +141,11 @@ def _match(self, G: GraphView, set_identity: bool = True, **kwargs):
              "Should be run before match_gap_ * fusions.")
 @needs_valid_dimension(True)
 @run_before('fuse_gap_convs', 'fuse_gap_linear', 'fuse_gap_pool', 'fuse_op_activation_scale8', 'fuse_op_activation_pow2')
+@run_qtune_on_match
 class MoveActivationsMatcherScale8(MoveNodeUpMatcher):
 
     ValidNodesToPass = (ReshapeParameters, StridedSliceParameters, ReverseParameters,
-                        TransposeParameters, ConcatParameters)
+                        TransposeParameters, ConcatParameters, QuantizeParameters)
     ValidFusions = (Conv2DParameters, FcParameters, PoolingParameters,
                     GlobalPoolingParameters, MatrixAddParameters, MatrixMulParameters,
                     MatMulOpParameters, MatMulTransposedParameters)
@@ -159,8 +161,7 @@ class MoveActivationsMatcherScale8(MoveNodeUpMatcher):
 @run_before('fuse_gap_convs', 'fuse_gap_linear', 'fuse_gap_pool', 'fuse_op_activation_scale8')
 class MoveMaxPoolMatcherScale8(MoveNodeUpMatcher):
 
-    ValidNodesToPass = (ReshapeParameters, TransposeParameters,
-                        ReluActivationParameters, ConcatParameters)
+    ValidNodesToPass = (ReluActivationParameters,)
     ValidFusions = (Conv2DParameters, FcParameters)
     ValidNodes = (lambda node: isinstance(
         node, PoolingParameters) and node.pool_type == "max",)
diff --git a/tools/nntool/graph/matches/matchers/propagate_rnn_sym_mult_qrec.py b/tools/nntool/graph/matches/matchers/propagate_rnn_sym_mult_qrec.py
deleted file mode 100644
index 3303b64af..000000000
--- a/tools/nntool/graph/matches/matchers/propagate_rnn_sym_mult_qrec.py
+++ /dev/null
@@ -1,44 +0,0 @@
-# # This program is free software: you can redistribute it and/or modify
-# # it under the terms of the GNU Affero General Public License as
-# # published by the Free Software Foundation, either version 3 of the
-# # License, or (at your option) any later version.
-
-# # This program is distributed in the hope that it will be useful,
-# # but WITHOUT ANY WARRANTY; without even the implied warranty of
-# # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-# # GNU Affero General Public License for more details.
-
-# # You should have received a copy of the GNU Affero General Public License
-# # along with this program.  If not, see <https://www.gnu.org/licenses/>.
-
-# from graph.types import RNNBaseParameters
-# from utils.graph import GraphView
-# from utils.node_id import NodeId
-
-# from ..matcher import Matcher, groups, match_name, description
-# from .equalize_sym_mult_concats import propagate_qtype_up
-
-
-# @groups('scaled')
-# @match_name("propagate_up_rnn_in_qs")
-# @description("After quantization of rnn their in_q and out_q are the same "
-#              "so in_q may be changed and we need to propagate it up")
-# class PropagateUpRNNInputQ(Matcher):
-
-#     def _match(self, G: GraphView, set_identity: bool = True, **kwargs):
-#         if not G.quantization:
-#             return
-#         rnns = [node for node in G.nodes() if isinstance(
-#             node, RNNBaseParameters)]
-#         qrecs = [G.quantization[NodeId(node)] for node in rnns]
-#         for rnn, qrec in zip(rnns, qrecs):
-#             in_idx = rnn.INPUT_NAMES.index('input')
-#             in_edge = [edge for edge in G.in_edges(
-#                 rnn.name) if edge.to_idx == in_idx][0]
-#             in_q = qrec.in_qs[in_idx]
-#             propagate_qtype_up(G, in_q, in_edge)
-
-#         if set_identity:
-#             self.set_identity(G)
-
-#         return False
diff --git a/tools/nntool/graph/matches/matchers/reduce_max_to_pool.py b/tools/nntool/graph/matches/matchers/reduce_max_to_pool.py
deleted file mode 100644
index 55b51139c..000000000
--- a/tools/nntool/graph/matches/matchers/reduce_max_to_pool.py
+++ /dev/null
@@ -1,38 +0,0 @@
-# Copyright (C) 2020  GreenWaves Technologies, SAS
-
-# This program is free software: you can redistribute it and/or modify
-# it under the terms of the GNU Affero General Public License as
-# published by the Free Software Foundation, either version 3 of the
-# License, or (at your option) any later version.
-
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-# GNU Affero General Public License for more details.
-
-# You should have received a copy of the GNU Affero General Public License
-# along with this program.  If not, see <https://www.gnu.org/licenses/>.
-
-# from graph.types import ReduceMaxParameters, GlobalPoolingParameters
-# from utils.graph import GraphView
-# from .matcher import DefaultMatcher, MatchNode, DontReplaceError
-
-# class MatchReduceMax(DefaultMatcher):
-#     NAME = 'match_reduce_max_nodes'
-#     DESCRIPTION = 'Match reduce max nodes and replace them with GlobalMaxPooling'
-
-#     def match_function(self, G: GraphView):
-#         sub = GraphView()
-#         sub.add_node(MatchNode('0', matcher=lambda node:
-#                                isinstance(node, ReduceMaxParameters)))
-#         return G.match_fragment(sub)
-
-#     def replace_function(self, G: GraphView, subgraph: GraphView):
-#         reduce_max_node = list(subgraph.nodes())[0]
-
-#         for idx, in_dim in enumerate(reduce_max_node.in_dims[0].shape):
-#             if idx > 0 and idx not in reduce_max_node.axis and in_dim == 1:
-#                 raise DontReplaceError()
-#         return GlobalPoolingParameters(reduce_max_node.name + "_GLOBAL_MAXPOOL", pool_type='max',
-#                                     in_dims_hint=reduce_max_node.in_dims_hint,
-#                                     out_dims_hint=reduce_max_node.out_dims_hint), None, None
diff --git a/tools/nntool/graph/matches/matchers/remove_copies.py b/tools/nntool/graph/matches/matchers/remove_copies.py
index 302b9321a..69dba7e5c 100644
--- a/tools/nntool/graph/matches/matchers/remove_copies.py
+++ b/tools/nntool/graph/matches/matchers/remove_copies.py
@@ -33,7 +33,7 @@
 @description("Remove unnecessary copies")
 @modifies_dimensions(True)
 @groups('*')
-@run_after('expand_transposes', 'remove_noops')
+@run_after('remove_noops')
 class RemoveCopies(Matcher):
 
     def _match(self, G: GraphView, set_identity: bool = True, **kwargs):
@@ -47,15 +47,13 @@ def _match(self, G: GraphView, set_identity: bool = True, **kwargs):
                     G,
                     out_edges[0],
                     (OutputParameters, InputParameters, ConstantInputParameters, SplitParameters, ConcatParameters),
-                    can_pass=(ReshapeParameters, NoOPParameters),
-                    can_pass_fn=lambda G, node: isinstance(node, TransposeParameters) and node.does_nothing,
+                    can_pass_fn=lambda G, node: node.no_model_code,
                     follow_multi=True) and
                     search_up(
                         G,
                         G.in_edges(node)[0],
                         (InputParameters, OutputParameters, ConstantInputParameters, SplitParameters, ConcatParameters),
-                        can_pass=(ReshapeParameters, NoOPParameters),
-                        can_pass_fn=lambda G, node: isinstance(node, TransposeParameters) and node.does_nothing,
+                        can_pass_fn=lambda G, node: node.no_model_code,
                         follow_multi=True)):
                 continue
             nodes_to_remove.append(node)
diff --git a/tools/nntool/graph/matches/matchers/remove_noops.py b/tools/nntool/graph/matches/matchers/remove_noops.py
index d469cccf2..3f902fd85 100644
--- a/tools/nntool/graph/matches/matchers/remove_noops.py
+++ b/tools/nntool/graph/matches/matchers/remove_noops.py
@@ -15,9 +15,7 @@
 
 import logging
 
-from graph.types import NNEdge, NoOPParameters
-from graph.types.others import (ConcatParameters, ReshapeParameters,
-                                SplitParameters, TransposeParameters)
+from graph.types import NNEdge
 from utils.graph import GraphView
 
 from ..matcher import Matcher, description, groups, match_name, run_before
@@ -34,30 +32,9 @@
 @groups('symmetric', 'scaled')
 class RemoveNoOPs(Matcher):
 
-    @staticmethod
-    def one_inedge(G, node, idx=None):
-        in_edges = G.in_edges(node)
-        return len(in_edges) == 1 and (idx is None or in_edges[0].to_idx == idx)
-
-    @staticmethod
-    def one_outedge(G, node, idx=None):
-        out_edges = G.out_edges(node)
-        return len(out_edges) == 1 and (idx is None or out_edges[0].from_idx == idx)
-
-    @staticmethod
-    def one_in_and_outedge(G, node, idx=None):
-        return RemoveNoOPs.one_inedge(G, node, idx=idx) and RemoveNoOPs.one_outedge(G, node, idx=idx)
-
-    @staticmethod
-    def node_does_nothing(G, node):
-        return (isinstance(node, NoOPParameters) or
-                isinstance(node, TransposeParameters) and node.transpose is None or
-                isinstance(node, ReshapeParameters) and node.old_shape == node.shape or
-                (isinstance(node, (ConcatParameters, SplitParameters)) and RemoveNoOPs.one_in_and_outedge(G, node, idx=0)))
-
     def _match(self, G: GraphView, set_identity: bool = True, **kwargs) -> bool:
         has_modified_graph = False
-        for node in [node for node in G.nodes() if self.node_does_nothing(G, node)]:
+        for node in [node for node in G.nodes() if node.does_nothing]:
             has_modified_graph = True
             in_edge = G.in_edges(node.name)[0]
             G.remove_edge(in_edge)
diff --git a/tools/nntool/graph/matches/matchers/remove_reshapes.py b/tools/nntool/graph/matches/matchers/remove_reshapes.py
index 573d056da..1d9defb7b 100644
--- a/tools/nntool/graph/matches/matchers/remove_reshapes.py
+++ b/tools/nntool/graph/matches/matchers/remove_reshapes.py
@@ -43,7 +43,7 @@ def validate_reshape(G, reshape):
             return False
         candidate = edge.to_node
         if isinstance(candidate, TransposeParameters):
-            if not candidate.does_nothing():
+            if not candidate.no_model_code:
                 return False
             out_shape = tuple(candidate.out_dims[0].shape)
         else:
@@ -56,12 +56,13 @@ def validate_reshape(G, reshape):
         return (reshape, candidates, out_shape)
 
     def _match(self, G: GraphView, set_identity: bool = True, **kwargs):
-        modified_graph = True
-        while modified_graph:
-            modified_graph = False
+        modified_graph = False
+        found_reshapes = True
+        while found_reshapes:
+            found_reshapes = False
             for reshape in G.nodes(node_classes=(ReshapeParameters,)):
                 if reshape.shape.shape == reshape.old_shape.shape:
-                    modified_graph = True
+                    found_reshapes = modified_graph = True
                     LOG.info('removing reshape that does nothing %s', reshape.name)
                     G.remove_and_reconnect(reshape, edge_class=NNEdge)
                     nid = NodeId(reshape)
@@ -72,7 +73,7 @@ def _match(self, G: GraphView, set_identity: bool = True, **kwargs):
                 res = self.validate_reshape(G, reshape)
                 if res:
                     LOG.info('unnecessary reshape found after %s', reshape.name)
-                    modified_graph = True
+                    found_reshapes = modified_graph = True
                     (reshape, candidates, out_shape) = res
                     for candidate in candidates:
                         LOG.info(
diff --git a/tools/nntool/graph/matches/matchers/remove_ssd_output.py b/tools/nntool/graph/matches/matchers/remove_ssd_output.py
new file mode 100644
index 000000000..7e77b587e
--- /dev/null
+++ b/tools/nntool/graph/matches/matchers/remove_ssd_output.py
@@ -0,0 +1,49 @@
+# Copyright (C) 2020, 2022  GreenWaves Technologies, SAS
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Affero General Public License for more details.
+
+# You should have received a copy of the GNU Affero General Public License
+# along with this program.  If not, see <https://www.gnu.org/licenses/>.
+
+import logging
+from copy import deepcopy
+
+from graph.types import SSDDetectorParameters
+from utils.graph import GraphView
+
+from ..matcher import (Matcher, description, groups, match_name, run_qtune_on_match,
+                       needs_valid_dimension)
+
+LOG = logging.getLogger("nntool." + __name__)
+
+
+@match_name('remove_ssd_output')
+@description('remove the 4th output on the ssd detector - num detections. This is necessary for '
+             'operation with GAP kernels')
+@groups('*')
+@needs_valid_dimension(True)
+@run_qtune_on_match
+class RemoveSSDOutput(Matcher):
+    """Strips the consumers of output index 3 from SSD detectors whose count flag is set."""
+    def _match(self, G: GraphView, set_identity: bool = True, **kwargs):
+        """Returns True when at least one detector was changed, False otherwise."""
+        changed = False
+        for detector in G.nodes(node_classes=SSDDetectorParameters):
+            if detector.output_detection_count:
+                changed = True
+                LOG.info(f'removing detection count output on {detector.name}')
+                # drop each consumer of output 3 along with whatever remove_below clears
+                for count_edge in G.indexed_out_edges(detector)[3]:
+                    consumer = count_edge.to_node
+                    G.remove_below(consumer)
+                    G.remove(consumer)
+                detector.output_detection_count = False
+        return changed
diff --git a/tools/nntool/graph/matches/matchers/match_reversed_rnn.py b/tools/nntool/graph/matches/matchers/rnn_reverse.py
similarity index 100%
rename from tools/nntool/graph/matches/matchers/match_reversed_rnn.py
rename to tools/nntool/graph/matches/matchers/rnn_reverse.py
diff --git a/tools/nntool/graph/matches/matchers/match_rnn_unpack.py b/tools/nntool/graph/matches/matchers/rnn_unpack.py
similarity index 100%
rename from tools/nntool/graph/matches/matchers/match_rnn_unpack.py
rename to tools/nntool/graph/matches/matchers/rnn_unpack.py
diff --git a/tools/nntool/graph/matches/matchers/slice_to_split.py b/tools/nntool/graph/matches/matchers/slice_to_split.py
index 20f601e88..53ab42b5a 100644
--- a/tools/nntool/graph/matches/matchers/slice_to_split.py
+++ b/tools/nntool/graph/matches/matchers/slice_to_split.py
@@ -118,7 +118,7 @@ def slices_to_sizes(slices_and_shapes, shape_rest):
 
 @ match_name("slice_to_split")
 @ description("collects slices from a single node and converts to a single split")
-@ run_before('unused_concats')
+@ run_before('remove_noops', 'insert_copies')
 @ groups('*')
 class SliceToSplitMatch(Matcher):
     @ staticmethod
diff --git a/tools/nntool/graph/matches/matchers/split_concat.py b/tools/nntool/graph/matches/matchers/split_concat.py
new file mode 100644
index 000000000..ef2615cda
--- /dev/null
+++ b/tools/nntool/graph/matches/matchers/split_concat.py
@@ -0,0 +1,149 @@
+# Copyright (C) 2020  GreenWaves Technologies, SAS
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Affero General Public License for more details.
+
+# You should have received a copy of the GNU Affero General Public License
+# along with this program.  If not, see <https://www.gnu.org/licenses/>.
+
+import logging
+
+from graph.dim import Dim
+from graph.types import ConcatParameters, NNEdge, SplitParameters
+from graph.types.others import (CopyParameters, NoOPParameters,
+                                ReshapeParameters, TransposeParameters)
+from utils.graph import GraphView
+from utils.node_id import NodeId
+
+from ..match_utils import search_down
+from ..matcher import Matcher, description, groups, match_name, run_before
+
+LOG = logging.getLogger("nntool." + __name__)
+
+
+def reduce_slices(slices, shapes):
+    """Merge per-output slices/shapes axis by axis. An axis whose first two slices differ
+    becomes (first[0], last[1], first[2]) with summed sizes; otherwise the first is kept."""
+    merged_slices, merged_shapes = [], []
+    for axis_slices, axis_sizes in zip(zip(*slices), zip(*shapes)):
+        first, last = axis_slices[0], axis_slices[-1]
+        if first != axis_slices[1]:
+            # presumably (start, end, step): start/step of the first, end of the last
+            merged_slices.append((first[0], last[1], first[2]))
+            merged_shapes.append(sum(axis_sizes))
+            continue
+        merged_slices.append(first)
+        merged_shapes.append(axis_sizes[0])
+    return merged_slices, merged_shapes
+
+
+def remove_edges(G, edges):
+    """Remove from G the to_node of every edge but the last (and its quantization record
+    when one exists), then remove the last edge itself; a KeyError from that is ignored."""
+    if not edges:
+        return
+    *leading, final_edge = edges
+    for edge in leading:
+        G.remove(edge.to_node)
+        # only touch the quantization set when the graph has one holding this node
+        if G.quantization and NodeId(edge.to_node) in G.quantization:
+            del G.quantization[NodeId(edge.to_node)]
+    try:
+        G.remove_edge(final_edge)  # @IgnoreException
+    except KeyError:
+        pass
+
+
+@groups('*')
+@match_name("split_concat")
+@run_before('remove_noops', 'remove_copies')
+@description("removes splits that go to concats where all the out edges of the split are in sequence in the concat")
+class SplitConcatMatch(Matcher):
+    """Merges runs of consecutive split outputs that feed consecutive inputs of one concat."""
+    def _match(self, G: GraphView, set_identity: bool = True, **kwargs) -> bool:
+        """Rewires each such run into a single split output; returns True if any was found."""
+        edge_groups = []
+        for node in G.nodes(node_classes=SplitParameters):
+            cur_group = None
+            for out_edge_bundle in G.indexed_out_edges(node):
+                if len(out_edge_bundle) == 1:
+                    out_edge = out_edge_bundle[0]
+                    concat_node_edges = search_down(
+                        G, out_edge, ConcatParameters,
+                        can_pass=(CopyParameters,),
+                        can_pass_fn=lambda _, node: node.no_model_code)
+                    if concat_node_edges:
+                        if cur_group:
+                            this_concat_edge = concat_node_edges[-1]
+                            last_concat_edge = cur_group[-1][-1]
+                            if (this_concat_edge.to_node == last_concat_edge.to_node and
+                                    this_concat_edge.to_idx == last_concat_edge.to_idx + 1):
+                                cur_group.append(concat_node_edges)
+                                continue
+                            if len(cur_group) > 1:
+                                edge_groups.append(cur_group)
+                        cur_group = [concat_node_edges]
+                        continue
+                if cur_group and len(cur_group) > 1:
+                    edge_groups.append(cur_group)
+                cur_group = None
+            if cur_group and len(cur_group) > 1:
+                edge_groups.append(cur_group)
+        # we leave the splits and concats after this since they will be cleared up by remove_noops
+        for edge_group in edge_groups:
+            split_node = edge_group[0][0].from_node
+            concat_node = edge_group[0][-1].to_node
+            from_idx = edge_group[0][0].from_idx
+            to_idx = edge_group[-1][0].from_idx
+            from_concat_idx = edge_group[0][-1].to_idx
+            # last concat input of the group: must be the final edge list, not the second one
+            to_concat_idx = edge_group[-1][-1].to_idx
+            LOG.info(
+                f"combining outputs {from_idx}:{to_idx} on split node {split_node.name} "
+                f"followed by concat {concat_node.name}")
+            # combine slices and shapes on edges in group
+            new_slice, new_shape = reduce_slices(
+                split_node.act_slices[from_idx:to_idx+1],
+                split_node.out_shapes[from_idx:to_idx+1]
+            )
+            new_concat_shape = Dim.combine(
+                [concat_node.in_dims[idx]
+                    for idx in range(from_concat_idx, to_concat_idx+1)],
+                concat_node.axis)
+            split_node.act_slices = split_node.act_slices[:from_idx] + [
+                new_slice] + split_node.act_slices[to_idx+1:]
+            # the slice may need to reshape since we will remove everything in between
+            split_node.out_shapes = split_node.out_shapes[:from_idx] + [
+                new_concat_shape.shape] + split_node.out_shapes[to_idx+1:]
+
+            # remove all edges and intermediate nodes on all edge groups
+            for edge_list in edge_group:
+                remove_edges(G, edge_list)
+            # add back a direct edge to the first idx
+            G.add_edge(NNEdge(from_node=split_node,
+                              from_idx=edge_group[0][0].from_idx,
+                              to_node=concat_node,
+                              to_idx=edge_group[0][-1].to_idx))
+            out_edge_bundles = G.indexed_out_edges(split_node)
+            # move edges beyond the edge group after the first index
+            for offset, edge_list in enumerate(out_edge_bundles[to_idx+1:]):
+                assert len(edge_list) == 1
+                edge = edge_list[0]
+                G.remove_edge(edge)
+                G.add_edge(NNEdge.clone(edge, from_idx=from_idx+1+offset))
+            # reindex the in edges in the concat
+            from_idx = edge_group[0][-1].to_idx
+            to_idx = edge_group[-1][-1].to_idx
+            in_edges = G.indexed_in_edges(concat_node)
+            for offset, in_edge in enumerate(in_edges[to_idx+1:]):
+                G.remove_edge(in_edge)
+                G.add_edge(NNEdge.clone(in_edge, to_idx=from_idx+1+offset))
+
+        return bool(edge_groups)
diff --git a/tools/nntool/graph/matches/matches.py b/tools/nntool/graph/matches/matches.py
index 451dc92fe..fe1130680 100644
--- a/tools/nntool/graph/matches/matches.py
+++ b/tools/nntool/graph/matches/matches.py
@@ -17,32 +17,19 @@
 
 import logging
 
-from graph.matches.matcher import Matcher, MatchGroup
+from graph.matches.matcher import Matcher, MatchGroup, match_name, description
 from utils.subclasses import get_all_subclasses
 
 from .matchers import *
 
 LOG = logging.getLogger("nntool." + __name__)
 
-
-def general_validation(match: Matcher):
-    if match.DESCRIPTION is None:
-        LOG.warning('matcher %s has no description', match.NAME)
-    if match.NAME is None:
-        raise ValueError(f'match {match.NAME} has no name')
-    if '*' in match.RUN_BEFORE and '*' in match.RUN_AFTER:
-        raise ValueError(
-            f'match {match.NAME} has wildcard in run_before and run_after')
-    return match
-
-
-ALL_MATCHERS = [general_validation(match_class) for match_class in get_all_subclasses(Matcher)
-                if match_class.NAME is not None]
+ALL_MATCHERS = {}
 
 
 def select_matchers(group=None):
-    return [match_class for match_class in ALL_MATCHERS
-            if (group is None or '*' in match_class.GROUPS or group in match_class.GROUPS)]
+    return [match_class for match_class in ALL_MATCHERS.values()
+            if ('*' in match_class.GROUPS or group in match_class.GROUPS)]
 
 
 def order_matchers(matchers):
@@ -75,22 +62,39 @@ def select_sorted_matcher_instances(group=None):
 def get_fusions():
     return sorted(
         [(match_class.NAME, match_class.DESCRIPTION)
-         for match_class in ALL_MATCHERS],
+         for match_class in ALL_MATCHERS.values()],
         key=lambda x: x[0])
 
 
+@match_name("pow2_match_group")
+@description("a selection of matches that are relevant for POW2 quantized graphs")
+class POW2MatchGroup(MatchGroup):
+    def __init__(self):  # group of the sorted matcher instances selected for 'symmetric'
+        matchers = select_sorted_matcher_instances(group='symmetric')
+        super().__init__(*matchers, identity='pow2_match_group')
+
+
+@match_name("scaled_match_group")
+@description("a selection of matches that are relevant for scaled quantized graphs")
+class ScaledMatchGroup(MatchGroup):
+    def __init__(self):  # group of the sorted matcher instances selected for 'scaled'
+        matchers = select_sorted_matcher_instances(group='scaled')
+        super().__init__(*matchers, identity='scaled_match_group')
+
+
 def get_pow2_match_group():
-    return MatchGroup(
-        *select_sorted_matcher_instances(group='symmetric'),
-        identity="pow2_match_group"
-    )
+    return POW2MatchGroup()
 
 
 def get_scale8_match_group():
-    return MatchGroup(
-        *select_sorted_matcher_instances(group='scaled'),
-        identity="std_match_group"
-    )
+    return ScaledMatchGroup()
+
+
+def get_matches(*match_names, identity="custom"):
+    not_found = [n for n in dict.fromkeys(match_names) if n not in ALL_MATCHERS]  # call order, once each
+    if not_found:  # every requested name must be registered before anything is instantiated
+        raise ValueError(f'matches {" ".join(not_found)} not found')
+    return MatchGroup(*[ALL_MATCHERS[name]() for name in match_names], identity=identity)
 
 
 def get_fusion(name):
@@ -98,8 +102,10 @@ def get_fusion(name):
         return get_pow2_match_group()
     if name in ["std_match_group", "scale8_match_group"]:
         return get_scale8_match_group()
-    match_class = next((match_class for match_class in select_matchers()
-                        if match_class.NAME == name), None)
-    if match_class is not None:
-        return match_class()
+    if name in ALL_MATCHERS:
+        return ALL_MATCHERS[name]()
     return None
+
+
+ALL_MATCHERS.update({match_class.NAME: match_class for match_class in get_all_subclasses(Matcher)
+                     if match_class.NAME is not None})
diff --git a/tools/nntool/graph/nngraph.py b/tools/nntool/graph/nngraph.py
index 830f49739..4448d913b 100644
--- a/tools/nntool/graph/nngraph.py
+++ b/tools/nntool/graph/nngraph.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2020  GreenWaves Technologies, SAS
+# Copyright (C) 2020, 2022  GreenWaves Technologies, SAS
 
 # This program is free software: you can redistribute it and/or modify
 # it under the terms of the GNU Affero General Public License as
@@ -16,30 +16,33 @@
 import logging
 import os
 import re
-from typing import Generator, Sequence, Union
+from typing import Callable, Generator, Sequence, Tuple, Union
 
 import numpy as np
 from quantization.quantization_set import QuantizationSet
 from reports.graph_reporter import GraphReporter
-from reports.quantization_reporter import QuantizationReporter
 from utils.graph import Graph, Node
 from utils.node_id import NodeId
 from utils.tabular import TextTableRenderer
 
 from graph.dim import Dim
-from graph.dump_tensor import PrintDumper, dump_tensor
 from graph.graph_identity import GraphIdentity
-from graph.manipulations import (add_dimensions, adjust_order,
-                                 balance_all_filters, calculate_liveness)
-from graph.manipulations.balance_filter import balance_filter_with_constants
+from graph.manipulations.adjust_order import adjust_order
+from graph.manipulations.balance_filter import (balance_all_filters,
+                                                balance_filter_with_constants)
+from graph.manipulations.dimensions import add_dimensions
+from graph.manipulations.liveness import calculate_liveness
+from graph.matches.fusions import fusions
 from graph.types import (ConstantInputParameters, InputBaseParameters,
                          InputParameters, MultiplicativeBiasParameters,
                          OutputParameters, ResizerParameters,
                          RNNBaseParameters, SSDDetectorParameters)
-from graph.types.base import NNNodeRef
+from graph.types.base import NNEdge, NNNodeRef, Parameters
 from graph.types.dsp_preprocessing import DSPParameters
 from graph.types.expression_fusion import ExpressionFusionParameters
-from graph.types.fusions import ActivationFusionBase, FilterFusionBase, FusionBase, MatMulOpFusionParameters, PaddedAddFusionParameters
+from graph.types.fusions import (ActivationFusionBase, FilterFusionBase,
+                                 FusionBase, MatMulOpFusionParameters,
+                                 PaddedAddFusionParameters)
 
 LOG = logging.getLogger("nntool." + __name__)
 
@@ -80,78 +83,140 @@ def liveness(self):
     def liveness(self, val):
         self._state['liveness'] = val
 
-    @property
-    def has_quantization_info(self):
-        return self._state['quantization']
-
-    @has_quantization_info.setter
-    def has_quantization_info(self, val):
-        self._state['quantization'] = val
-
 
 class NNGraph(Graph):
     def __init__(self,
                  model=None,
                  name=None,
                  filename=None):
-        super().__init__()
-
-        self.model = model
 
-        self.num_inputs = 0
-        self.num_outputs = 0
-        self.num_constants = 0
-        self.node_options = {}
-        self.num_rinputs = 0
-        self.num_routputs = 0
+        attrs = {
+            'model': model,
+            'node_options': {},
 
-        self.graph_state = NNGraphState()
-
-        self.load_function = None
-        self.graphname = name
-        self.graph_identity = GraphIdentity(filename)
-        self._info = {
             'quantization': None,
+            'has_quantized_parameters': False,
+            'graphname': name,
+            'graph_state': NNGraphState(),
+            'graph_identity': GraphIdentity(filename)
         }
+        super().__init__(**attrs)
+
+    INVALID_CHARS = re.compile(r'[^A-Za-z0-9_]')
+
+    @staticmethod
+    def valid_c_identifier(val: str) -> str:
+        return NNGraph.INVALID_CHARS.sub('_', val)
+
+    @property
+    def _edge_class(self):
+        return NNEdge
+
+    @property
+    def name(self) -> str:
+        """Returns the name of the graph potentially modified to be a valid C identifier
+
+        Returns:
+            str: The graph name
+        """
+        if self._attr['graphname'] is None:
+            base, _ = os.path.splitext(
+                os.path.basename(self._attr['graph_identity'].filename))
+            return self.valid_c_identifier(base)
+        return self.valid_c_identifier(self._attr['graphname'])
+
+    @name.setter
+    def name(self, val: str):
+        """Sets the name of the graph
+
+        Args:
+            val (str): The name of the graph
+        """
+        self._attr['graphname'] = val
+
+    @property
+    def model(self):
+        """The original model that generated the NNTool graph
+
+        Returns:
+            Any: The model file (TFLite or ONNX graph descriptor)
+        """
+        return self._attr['model']
+
+    @property
+    def num_inputs(self) -> int:
+        """Current number of inputs
+
+        Returns:
+            int: Number of inputs
+        """
+        return len(self.nodes(node_classes=InputParameters))
+
+    @property
+    def num_outputs(self) -> int:
+        """Current number of outputs
+
+        Returns:
+            int: Number of outputs
+        """
+        return len(self.nodes(node_classes=OutputParameters))
 
     @property
-    def info(self):
-        return self._info
+    def num_constants(self) -> int:
+        """Current number of constant inputs
 
-    @info.setter
-    def info(self, val):
-        self._info = val
+        Returns:
+            int: Number of constant inputs
+        """
+        return len(self.nodes(node_classes=ConstantInputParameters))
+
+    @property
+    def node_options(self) -> dict:
+        return self._attr['node_options']
 
     @property
-    def quantization(self) -> QuantizationSet:
-        return self._info.get('quantization')
+    def quantization(self) -> Union[QuantizationSet, None]:
+        """Current graph Quantization
+
+        Returns:
+            Union[QuantizationSet, None]: quantization set
+        """
+        return self._attr['quantization']
 
     @quantization.setter
-    def quantization(self, val: QuantizationSet):
-        self._info['quantization'] = val
+    def quantization(self, val: Union[QuantizationSet, None]):
+        """Sets or clears the quantization
+
+        Args:
+            val (Union[QuantizationSet, None]): quantization set
+        """
+        self._attr['quantization'] = val
 
     @property
     def has_quantized_parameters(self) -> bool:
-        return self._info.get('has_quantized_parameters')
+        """Graph was imported with quantized parameters
+
+        Returns:
+            bool: quantized parameters or not
+        """
+        return self._attr['has_quantized_parameters']
 
     @has_quantized_parameters.setter
     def has_quantized_parameters(self, val: bool):
-        self._info['has_quantized_parameters'] = val
+        """Sets whether the graph was imported with quantized parameters
 
-    INVALID_CHARS = re.compile(r'[^A-Za-z0-9_]')
-
-    @staticmethod
-    def valid_c_identifier(val: str) -> str:
-        return NNGraph.INVALID_CHARS.sub('_', val)
+        Args:
+            val (bool): quantized parameters or not
+        """
+        self._attr['has_quantized_parameters'] = val
 
+    @property
+    def graph_state(self) -> NNGraphState:
+        return self._attr['graph_state']
 
     @property
-    def name(self) -> str:
-        if self.graphname is None:
-            base, _ = os.path.splitext(
-                os.path.basename(self.graph_identity.filename))
-            return self.valid_c_identifier(base)
-        return self.valid_c_identifier(self.graphname)
+    def graph_identity(self) -> GraphIdentity:
+        return self._attr['graph_identity']
 
     @property
     def inputs_dim(self) -> list:
@@ -161,165 +226,256 @@ def inputs_dim(self) -> list:
     def outputs_dim(self) -> list:
         return [out_node.out_dims[0].shape for out_node in self.output_nodes()]
 
-    @name.setter
-    def name(self, val):
-        self.graphname = val
-
     @property
-    def has_ssd_postprocess(self):
+    def has_ssd_postprocess(self) -> bool:
+        """Graph has SSD detector nodes
+
+        Returns:
+            bool: True if present
+        """
         return self.has_node_type(SSDDetectorParameters)
 
     @property
-    def has_resizer(self):
+    def has_resizer(self) -> bool:
+        """Graph has resizer nodes
+
+        Returns:
+            bool: True if present
+        """
         return self.has_node_type(ResizerParameters)
 
     @property
-    def has_expressions(self):
-        return self.has_node_type(ExpressionFusionParameters)
-
-    def has_rnn(self, ktype=None, ne16=False):
-        nodes = self.nodes(node_classes=RNNBaseParameters)
-        if not nodes:
-            return False
-        if ktype is not None and not any(self.quantization[NodeId(node)].ktype == ktype for node in nodes):
-            return False
-        if ne16 and not any(self.quantization[NodeId(node)].cache.get('ne16') for node in nodes):
-            return False
-        return True
+    def has_expressions(self) -> bool:
+        """Graph has compiled expressions
 
+        Returns:
+            bool: True if present
+        """
+        return self.has_node_type(ExpressionFusionParameters)
 
     @property
-    def has_dsp(self):
+    def has_dsp(self) -> bool:
+        """Graph has DSP nodes
+
+        Returns:
+            bool: True if present
+        """
         return self.has_node_type(DSPParameters)
 
     @property
-    def all_expressions(self):
-        return self.all_node_types(ExpressionFusionParameters)
+    def all_expressions(self) -> Sequence[ExpressionFusionParameters]:
+        """All the expression nodes in the graph
+
+        Returns:
+            Sequence[ExpressionFusionParameters]: List of nodes
+        """
+        return self.nodes(node_classes=ExpressionFusionParameters)
 
     @property
-    def nodes_by_step_idx(self):
+    def nodes_by_step_idx(self) -> Sequence[Parameters]:
+        """All the nodes in the graph ordered by execution order
+
+        Returns:
+            Sequence[Parameters]: List of nodes
+        """
         return [step['node'] for step in self.graph_state.steps]
 
     @property
-    def nodes_by_step_idx_with_fusions(self):
+    def nodes_by_step_idx_with_fusions(self) -> Sequence[Parameters]:
+        """Nodes ordered by execution order but also including internal nodes
+        for fusions
+
+        Returns:
+            Sequence[Parameters]: List of nodes
+        """
         nodes = []
         for step in self.graph_state.steps:
             node = step['node']
-            if isinstance(node, (FilterFusionBase, ActivationFusionBase, PaddedAddFusionParameters, MatMulOpFusionParameters)):
+            if isinstance(node, (FilterFusionBase, ActivationFusionBase,
+                                 PaddedAddFusionParameters, MatMulOpFusionParameters)):
                 nodes.extend(node.contained_nodes())
             nodes.append(node)
         return nodes
 
-    def has_node_type(self, node_type):
-        return any(isinstance(node, node_type) for node in self.nodes())
+    @property
+    def total_ops(self) -> int:
+        """Estimated total operations in the graph
 
-    def all_node_types(self, node_type):
-        return [node for node in self.nodes() if isinstance(node, node_type)]
-
-    def set_load_function(self, func):
-        self.load_function = func
-
-    def load_tensors(self, file=None):
-        assert self.load_function
-        self.load_function(self, file)
-
-    def get_in_params(self, name: str) -> set:
-        in_edges = self.in_edges(name)
-        if not in_edges:
-            return in_edges
-        in_edges.sort(key=lambda edge: edge.to_idx)
-        res = []
-        in_idx = 0
-        for real_idx in range(max(edge.to_idx for edge in in_edges) + 1):
-            if real_idx == in_edges[in_idx].to_idx:
-                res.append(in_edges[in_idx].params)
-                in_idx += 1
-            else:
-                res.append(None)
+        Returns:
+            int: Number of operations
+        """
+        tot_ops = 0
+        for node in self.nodes():
+            ops = node.compute_load()
+            tot_ops += ops if ops else 0
+        return tot_ops
 
-        return res
+    def has_rnn(self, ktype: str = None, ne16: bool = False) -> bool:
+        """Graph has RNN nodes
 
-    def get_out_params(self, name: str) -> set:
-        out_edges = self.indexed_out_edges(name)
-        return [edge_list[0].params for edge_list in out_edges]
+        Args:
+            ktype (str, optional): Kernel type to match, or None to match any kernel type. Defaults to None.
+            ne16 (bool, optional): match nodes that will map to ne16 kernels. Defaults to False.
 
-    def all_inputs(self) -> Generator[Node, None, None]:
-        return (node for node in self.nodes() if isinstance(node, (InputBaseParameters)))
+        Returns:
+            bool: True if present
+        """
+        nodes = self.nodes(node_classes=RNNBaseParameters)
+        if not nodes:
+            return False
+        if ktype is not None and not any(self.quantization[NodeId(node)].ktype == ktype for node in nodes):
+            return False
+        if ne16 and not any(self.quantization[NodeId(node)].cache.get('ne16') for node in nodes):
+            return False
+        return True
+
+    def has_node_type(self, node_type: Parameters) -> bool:
+        """Returns True if graph contains node type
+
+        Args:
+            node_type (Parameters): Node class
+
+        Returns:
+            bool: True if present
+        """
+        return any(isinstance(node, node_type) for node in self.nodes())
 
     def inputs_and_constants(self) -> Generator[Node, None, None]:
+        """Iterate over all inputs and constants
+
+        Returns:
+            Generator[Node]: a generator for all nodes
+        """
         return (node for node in self.nodes() if isinstance(node, InputBaseParameters))
 
     def input_nodes(self) -> Generator[Node, None, None]:
+        """Iterate over all inputs
+
+        Returns:
+            Generator[Node]: a generator for all nodes
+        """
         return (node for node in self.nodes() if isinstance(node, InputParameters))
 
     def output_nodes(self) -> Generator[Node, None, None]:
-        return (node for node in self.nodes() if isinstance(node, OutputParameters))
+        """Iterate over all outputs
 
-    def is_input(self, node_name: Union[str, Node]) -> bool:
-        if isinstance(node_name, str):
-            return isinstance(self[node_name], InputParameters)
-        return isinstance(node_name, InputParameters)
+        Returns:
+            Generator[Node]: a generator for all nodes
+        """
+        return (node for node in self.nodes() if isinstance(node, OutputParameters))
 
-    def is_output(self, node_name: Union[str, Node]) -> bool:
-        if isinstance(node_name, str):
-            return isinstance(self[node_name], OutputParameters)
-        return isinstance(node_name, OutputParameters)
+    def add_input(self, dim: Union[Dim, Tuple[int]], name: str = None, **kwargs) -> NNNodeRef:
+        """Create an input node. If a name is not supplied then one will be automatically chosen.
 
-    def reset_inout_counts(self):
-        self.num_inputs = 0
-        self.num_outputs = 0
-        self.num_constants = 0
+        Args:
+            dim (Union[Dim, Tuple[int]]): Input dimension
+            name (str, optional): Node name. Defaults to None.
 
-    def add_input(self, dim: Dim, name=None, **kwargs) -> InputParameters:
-        self.num_inputs += 1
-        node_name = "input_"+str(self.num_inputs) if not name else name
+        Returns:
+            NNNodeRef: Reference to created node in graph
+        """
+        node_name = self.unique_name(
+            f"input_{self.num_inputs + 1}") if not name else name
         node = InputParameters(node_name, dims=dim, **kwargs)
         self.add_node(node)
-        return NNNodeRef(node, 0, self)
-
-    def add_constant(self, dim: Dim, name: str = None,
-                     adjust_transpose=None, is_mutated=False,
-                     is_intermediate=False, short_name=None) -> ConstantInputParameters:
-        self.num_constants += 1
-        node_name = name if name else "constant_"+str(self.num_constants)
+        return NNNodeRef(self, node, 0)
+
+    def add_constant(self, dim: Union[Dim, Tuple[int]] = None,
+                     name: str = None,
+                     value: np.ndarray = None,
+                     adjust_transpose: Sequence[int]=None,
+                     is_mutated=False,
+                     is_intermediate=False,
+                     short_name: str=None) -> NNNodeRef:
+        """Creates a constant node
+
+        Args:
+            dim (Union[Dim, Tuple[int]], optional): Dimension of the constant. If not supplied then a value must be supplied. Defaults to None.
+            name (str, optional): Optional name. A unique one will be created if None. Defaults to None.
+            value (np.ndarray, optional): Numpy array with value. Defaults to None.
+            adjust_transpose (Sequence[int], optional): Adjust will transpose the value using this transpose. Defaults to None.
+            is_mutated (bool, optional): Constant is both an input and an output. Defaults to False.
+            is_intermediate (bool, optional): Constant is marked as intermediate at import. Defaults to False.
+            short_name (str, optional): Preferred short name for model generation. Defaults to None.
+
+        Returns:
+            NNNodeRef: A reference to the Node in the Graph
+        """
+        node_name = name if name else self.unique_name(f"constant_{self.num_constants}")
         node = ConstantInputParameters(node_name, dims=dim,
+                                       value=value,
                                        adjust_transpose=adjust_transpose,
                                        is_intermediate=is_intermediate,
                                        is_mutated=is_mutated,
                                        short_name=short_name)
         self.add_node(node)
-        return NNNodeRef(node, 0, self)
-
-    def variable_in_edges(self, node_name):
-        return list([edge for edge in self.in_edges(node_name)
-                     if not isinstance(edge.from_node, ConstantInputParameters)])
+        return NNNodeRef(self, node, 0)
 
     def add_output(self, name=None) -> OutputParameters:
-        self.num_outputs += 1
-        node_name = "output_"+str(self.num_outputs) if name is None else name
+        """Create an output node. If a name is not supplied then one will be automatically chosen.
+
+        Args:
+            name (str, optional): Node name. Defaults to None.
+
+        Returns:
+            OutputParameters: Created node
+        """
+        node_name = self.unique_name(
+            f"output_{self.num_outputs + 1}") if name is None else name
         node = OutputParameters(node_name)
         self.add_node(node)
         return node
 
     def nodes_iterator(self, yield_fusions=True):
+        """Yields a tuple of length 4 with the step idx and parameters of each node. Optionally
+        when in a fusion yields tuples containing the fusion internal step id and node for each internal
+        node.
+
+        Args:
+            yield_fusions (bool, optional): Whether to also yield the nodes contained in fusions. Defaults to True.
+
+        Yields:
+            [Tuple[int, Parameters, Optional[int], Optional[Parameters]]]: Tuple containing node_idx, node, fusion_idx, fusion_node
+        """
         for step_idx, step in enumerate(self.graph_state.steps):
             node = step['node']
-            if isinstance(node, (FusionBase)) and not isinstance(node, ExpressionFusionParameters):
-                if yield_fusions:
+            if yield_fusions:
+                if isinstance(node, (FusionBase)) and not isinstance(node, ExpressionFusionParameters):
                     for fusion_idx, fnode in enumerate(node.contained_nodes()):
                         yield (step_idx, node, fusion_idx, fnode)
-                yield (step_idx, node, None, None)
-            else:
-                yield (step_idx, node, None, None)
-
-    def adjust_order(self, reshape_weights=True, postprocess=True, debug_function=None, steps=None, single_step=False):
+            yield (step_idx, node, None, None)
+
+    def adjust_order(self, reshape_weights=True, no_postprocess=False, debug_function: Callable=None, steps: int=None, single_step=False):
+        """Adjusts tensor order to match selected kernels
+
+        Args:
+            reshape_weights (bool, optional): Whether weights should be modified to remove transposes. Defaults to True.
+            no_postprocess (bool, optional): Do not run post processing such as transpose elimination. Defaults to False.
+            debug_function (Callable, optional): Function to be called after each transpose elimination step. Defaults to None.
+            steps (int, optional): Number of elimination steps to run. Defaults to None.
+            single_step (bool, optional): Execute only one transpose elimination step in each cycle. Defaults to False.
+        """
         adjust_order(self, reshape_weights=reshape_weights,
-                     postprocess=postprocess, debug_function=debug_function,
+                     postprocess=not no_postprocess, debug_function=debug_function,
                      steps=steps, single_step=single_step)
         LOG.info("adjusted order")
         self.graph_identity.is_adjusted = True
 
+    def fusions(self, *match_names, no_postprocess: bool = False):
+        """Run matchers on the graph
+
+        Args:
+            match_names (str): Names of matches to apply
+            no_postprocess (bool, optional): Do not execute postprocessing such as transpose elimination. Defaults to False.
+        """
+        fusions(self, *match_names, no_postprocess=no_postprocess)
+
     def add_dimensions(self, quiet=False):
+        """Add dimensions to the graph and calculate execution order and liveness
+
+        Args:
+            quiet (bool, optional): Do not log progress. Defaults to False.
+        """
         if not quiet:
             LOG.info("update graph dimensions")
         self.graph_state.steps = add_dimensions(self)
@@ -329,7 +485,17 @@ def add_dimensions(self, quiet=False):
             self,
             self.graph_state.steps)
 
-    def balance_filters(self, step_idx=None, precision_threshold=0.20):
+    def balance_filters(self, step_idx: int=None, precision_threshold=0.20):
+        """Experimental filter balancing routines
+
+        Args:
+            step_idx (int, optional): Step to balance. Defaults to None.
+            precision_threshold (float, optional): Precision threshold. Defaults to 0.20.
+
+        Raises:
+            ValueError: Bad parameters
+            NotImplementedError: Bad graph structure
+        """
         if step_idx is not None:
             if step_idx > len(self.graph_state.steps) or step_idx < 0:
                 raise ValueError("step idx out of range")
@@ -352,62 +518,13 @@ def balance_filters(self, step_idx=None, precision_threshold=0.20):
         else:
             balance_all_filters(self, precision_threshold=precision_threshold)
 
-    def print_intermediates(self, outputs, limit=None, width=8,
-                            precision=4, channel=None, order=None,
-                            checksum=False, print_constants=False):
-        def print_step(step, outs, index):
-            node = step['node']
-            if checksum:
-                for out_idx, out in enumerate(outs):
-                    if isinstance(node, ConstantInputParameters):
-                        continue
-                    print(f"S{index} - {node.name}\n\tChecksum = {np.sum(out) if out.dtype != np.uint8 else np.sum(out.astype(np.int8))}")
-            else:
-                print(node.name)
-                for out_idx, out in enumerate(outs):
-                    dims = node.out_dims[out_idx]
-                    if order is not None and dims.is_named and order != dims.order and all(k in dims.order
-                                                                                           for k in order):
-                        transpose = dims.transpose_to_order(order)
-                        out = out.transpose(transpose)
-                    if channel is not None:
-                        out = out[channel:channel+1:1, ...]
-                    dump_tensor(out, PrintDumper(
-                        out, width=width, precision=precision))
-
-        if limit is not None:
-            print_step(self.graph_state.steps[limit], outputs[limit], limit)
-        else:
-            for idx, out in enumerate(outputs):
-                print_step(self.graph_state.steps[idx], out, idx)
-        print()
-
-    def qshow(self):
-        tab = QuantizationReporter().report(self, self.quantization)
-        renderer = TextTableRenderer(150)
-        tab.render(renderer)
-        return renderer.get_output()
-
-    def merge(self, other: 'NNGraph'):
-        if self != other:
-            for edge in other.edges:
-                self.add_edge(edge)
-        return self
-
-    def __getitem__(self, key):
+    def __getitem__(self, key) -> Parameters:
         if isinstance(key, int):
             return self.nodes_by_step_idx[key]
         return super().__getitem__(key)
 
-    def __repr__(self):
+    def __repr__(self) -> str:
         tab = GraphReporter().report(self)
         renderer = TextTableRenderer(150)
         tab.render(renderer)
         return renderer.get_output()
-
-    def total_ops(self):
-        tot_ops = 0
-        for node in self.nodes():
-            ops = node.compute_load()
-            tot_ops += ops if ops else 0
-        return tot_ops
diff --git a/tools/nntool/graph/types/activations.py b/tools/nntool/graph/types/activations.py
index 503a7a25c..59f75d7b9 100644
--- a/tools/nntool/graph/types/activations.py
+++ b/tools/nntool/graph/types/activations.py
@@ -226,10 +226,6 @@ class TanHActivationParameters(ActivationParameters, CanFuseToExpression):
     def can_equalize(self):
         return False
 
-    def should_fuse(self, node_set, qrec=None):
-        # TODO - TanH is only supported in an expression currently
-        return True
-
 
 @expression_op(Sigmoid)
 @cls_op_name('sigmoid')
diff --git a/tools/nntool/graph/types/base.py b/tools/nntool/graph/types/base.py
index 0dc86024d..e80eb5223 100644
--- a/tools/nntool/graph/types/base.py
+++ b/tools/nntool/graph/types/base.py
@@ -20,6 +20,7 @@
 from expressions.symbolic.symbol import Symbol
 from generation.at_types.gen_ctrl import CTRL_FEATURES, GenCtrl
 from graph.dim import Dim, PadDim, StrideDim
+from stats.ranges_utils import collect_stat
 
 from utils.graph import Edge, Node, NodeRef
 from utils.option_list import OptionList
@@ -64,25 +65,13 @@ def clone_dims(dims: Sequence[Dim], hints: Sequence[Dim]):
 
 
 class NNNodeRef(NodeRef):
-    def __init__(self, node, idx, G) -> None:
-        super(NNNodeRef, self).__init__(node)
-        self._G = G
-        self._idx = idx
-
-    @property
-    def G(self):
-        return self._G
-
-    @property
-    def ref(self):
-        return ((self._node, self._idx), self._G)
 
     def __getattr__(self, name):
         return getattr(self._node, name)
 
     def __setattr__(self, name, val):
         if name in ['_node', '_G', '_idx']:
-            super().__setattr__(name, val)
+            return super().__setattr__(name, val)
         return setattr(self._node, name, val)
 
     def __hasattr__(self, name):
@@ -94,26 +83,14 @@ def __str__(self) -> str:
     def __repr__(self) -> str:
         return self._node.__repr__()
 
-    def __eq__(self, o: object) -> bool:
-        if isinstance(o, NNNodeRef):
-            return super().__eq__(o)
-        return self._node.__eq__(o)
-
-    def __hash__(self) -> int:
-        return self._node.__hash__()
-
-    def __call__(self, *args, **kwargs):
-        raise ValueError("this is already a reference")
-
 
 class Parameters(Node):
     CLS_OP_NAME = None
     NARGS = {1}
     NOT_GENERATED = False
 
-    def __init__(self, name, *args, in_dims_hint=None, out_dims_hint=None, **kwargs):
-        super().__init__(name, *args, **kwargs)
-        del args, kwargs
+    def __init__(self, name, in_dims_hint=None, out_dims_hint=None, **kwargs):
+        super().__init__(name, **kwargs)
         self._in_dims = None
         self._out_dims = None
         self._in_dims_hint = in_dims_hint
@@ -134,17 +111,19 @@ def __str__(self):
     def __repr__(self):
         return f'{self.__class__.__name__}({self.name})'
 
-    def __call__(self, *args, **kwargs):
+    @property
+    def _edge_class(self):
+        return NNEdge
+
+    @property
+    def _noderef_class(self):
+        return NNNodeRef
+
+    def __call__(self, *args, num_outputs=1, **kwargs):
         # set of number of args
         if isinstance(self.NARGS, set):
             if '*' not in self.NARGS and len(args) not in self.NARGS:
                 raise ValueError("incorrect number of arguments")
-            inputs, fragments = [], []
-            for arg in args:
-                if arg is not None and not isinstance(arg, NNNodeRef):
-                    raise ValueError("expecting NNNodeRef")
-                inputs.append(arg.ref[0] if arg else None)
-                fragments.append(arg.ref[1] if arg else None)
 
         # list of possible inputs passed in kwargs. Things passed in args get
         # copied to kwargs with their index from the names in nargs
@@ -153,32 +132,37 @@ def __call__(self, *args, **kwargs):
                 if idx >= len(self.nargs):
                     raise ValueError('Too many inputs for this node type')
                 kwargs[self.nargs[idx]] = arg
-            inputs = []
-            fragments = []
+            args = []
             for name in self.nargs:
                 if name in kwargs:
                     ref = kwargs[name]
                     if not isinstance(ref, NNNodeRef):
                         raise ValueError("expecting NNNodeRef")
-                    inputs.append(ref[0])
-                    fragments.append(ref[1])
+                    args.append(ref)
                 else:
-                    inputs.append(None)
-                    fragments.append(None)
-            if inputs[0] is None:
+                    args.append(None)
+            if args[0] is None:
                 raise ValueError('Expecting at least an input')
 
-        fragment = [frag for frag in fragments if frag is not None][0]
-        if len(fragments) > 1:
-            for other in fragments[1::]:
-                if other is not None:
-                    fragment.merge(other)
-        for to_idx, from_tuple in enumerate(inputs):
-            if from_tuple is not None:
-                from_node, from_idx = from_tuple
-                fragment.add_edge(NNEdge(from_node=from_node,
-                                         from_idx=from_idx, to_node=self, to_idx=to_idx))
-        return NNNodeRef(self, 0, fragment)
+        return super().__call__(*args, num_outputs=num_outputs)
+
+    @property
+    def no_model_code(self) -> bool:
+        """Returns True if node results in no kernel, global or local generation in model
+
+        Returns:
+            bool: True if nothing generated
+        """
+        return False
+
+    @property
+    def does_nothing(self) -> bool:
+        """Returns True if the node does not modify its input in any way
+
+        Returns:
+            bool: True if node could be eliminated with no effect
+        """
+        return False
 
     @property
     def graph_label(self):
@@ -314,6 +298,9 @@ def can_equalize(self):
     def op_name(self):
         return self.CLS_OP_NAME
 
+    def details_collector(self, stats, stat, details):
+        pass
+
     def compute_load(self):
         return None
 
@@ -531,6 +518,8 @@ def __init__(self, *args, filt=None, has_bias=True, use_compressed=False, **kwar
         self.details = None
         self.at_options.update_valid_options(CTRL_FEATURES)
 
+    def details_collector(self, stats, stat, details):
+        collect_stat(stat, 'range_acc', details, details_name='acc')
 
 class MultiplicativeBiasParameters(FilterParameters):
     def __init__(self, *args, **kwargs):
@@ -538,6 +527,11 @@ def __init__(self, *args, **kwargs):
         self.has_mul_bias = False
         self._mul_biases = None
 
+    def details_collector(self, stats, stat, details):
+        super().details_collector(stats, stat, details)
+        if self.mul_biases:
+            collect_stat(stat, 'range_pre_mul_bias', details, details_name='pre_mul_bias')
+
     @property
     def mul_biases(self):
         return self._mul_biases
@@ -645,4 +639,3 @@ def __init__(self, from_node: Union[str, Node], to_node: Union[str, Node],
                  from_idx: int = 0, to_idx: int = 0):
         super().__init__(from_node, to_node, from_idx, to_idx)
         self.params = params
-
diff --git a/tools/nntool/graph/types/constant_input.py b/tools/nntool/graph/types/constant_input.py
index 940deecd1..a7a55fb02 100644
--- a/tools/nntool/graph/types/constant_input.py
+++ b/tools/nntool/graph/types/constant_input.py
@@ -53,7 +53,7 @@ def __init__(self, *args, adjust_transpose=None, is_mutated=False,
     def __call__(self, graph):
         if graph.__class__.__name__ != 'NNGraph':
             raise ValueError('expecting NNGraph as parameter')
-        return NNNodeRef(self, 0, graph)
+        return NNNodeRef(graph, self, 0)
 
     @classmethod
     def fake(cls, G, val):
diff --git a/tools/nntool/graph/types/expression_fusion.py b/tools/nntool/graph/types/expression_fusion.py
index 51d7ac65f..43afedf3b 100644
--- a/tools/nntool/graph/types/expression_fusion.py
+++ b/tools/nntool/graph/types/expression_fusion.py
@@ -13,6 +13,7 @@
 # You should have received a copy of the GNU Affero General Public License
 # along with this program.  If not, see <https://www.gnu.org/licenses/>.
 
+from copy import deepcopy
 import logging
 from collections import Counter
 
@@ -121,6 +122,19 @@ def set_min_max(qrecs, symbol, node):
         qtype = qrec.out_qs[0]
         symbol.control.add_min_max(symbol, qtype.min_val, qtype.max_val)
 
+    def details_collector(self, stats, stat, details):
+        if 'expression' in stat:
+            stat = stat['expression']
+            for sym_name, rec in details.items():
+                if sym_name == "results":
+                    continue
+                stat_rec = stat.setdefault(
+                    sym_name, {'min': float('inf'), 'max': float('-inf')})
+                stat_rec['min'] = min(stat_rec['min'], rec['min'])
+                stat_rec['max'] = max(stat_rec['max'], rec['max'])
+        else:
+            stat['expression'] = deepcopy(details)
+
     def is_same_operation_as(self, G, other):
         if not isinstance(other, ExpressionFusionParameters):
             return False
@@ -182,6 +196,8 @@ def get_output_size(self, in_dims):
             if tuple(in_vars[idx].shape) != shape:
                 in_vars[idx].shape = shape
                 dim_change = True
+        if dim_change:
+            self.func_col.set_var_shapes()
         out_dims = super().get_output_size(in_dims)
         if dim_change: # if the input shapes haven't changed then the output shapes have not changed
             out_vars = [self.func_col.variables[name] for name in self.output_symbols]
diff --git a/tools/nntool/graph/types/fusions.py b/tools/nntool/graph/types/fusions.py
index 01d3d33f0..8e894dbe6 100644
--- a/tools/nntool/graph/types/fusions.py
+++ b/tools/nntool/graph/types/fusions.py
@@ -19,6 +19,8 @@
 from graph.types.others import PadParameters
 from graph.types.pooling import PoolingParameters
 from graph.types.tensor_arithmetic import Broadcastable
+from utils.graph import GraphView
+from utils.node_id import NodeId
 
 from ..dim import Dim
 from .base import (FilterParameters, NNEdge, NodeOptions, Parameters,
@@ -69,7 +71,7 @@ class FusionBase(Parameters):
     fusion_op_name = "!!NOT SET!!"
     quantize_internals = True
 
-    def __init__(self, name, *args, fusion_type=None, subgraph=None,
+    def __init__(self, name, *args, fusion_type=None, subgraph: GraphView=None,
                  input_mapping=None,
                  output_mapping=None,
                  in_dims=None, out_dims=None,
@@ -157,11 +159,11 @@ def op_name(self):
         return self.fusion_op_name
 
     @property
-    def subgraph(self):
+    def subgraph(self) -> GraphView:
         return self._subgraph
 
     def contained_nodes(self):
-        return [node for node in self.subgraph.dfs()
+        return [node for node in self.subgraph.topological_sort()
                 if not isinstance(node, FusionInputOutputParameters)]
 
     def get_contained_node(self, name):
@@ -197,7 +199,7 @@ def get_parameter_size(self):
 
     def get_output_size(self, in_dims):
         node_out_dims = []
-        for node in self.subgraph.dfs():
+        for node in self.subgraph.topological_sort():
             if isinstance(node, FusionInputParameters):
                 node_in_dims = [self.clone_dim_with_hint(
                     in_dims[node.idx], node.idx)]
diff --git a/tools/nntool/graph/types/input_output.py b/tools/nntool/graph/types/input_output.py
index 7a67de6a4..9a7ef936f 100644
--- a/tools/nntool/graph/types/input_output.py
+++ b/tools/nntool/graph/types/input_output.py
@@ -131,7 +131,7 @@ def __init__(self, *args, **kwargs):
     def __call__(self, graph):
         if graph.__class__.__name__ != 'NNGraph':
             raise ValueError('expecting NNGraph as parameter')
-        return NNNodeRef(self, 0, graph)
+        return NNNodeRef(graph, self, 0)
 
     def verify(self, G):
         problems = []
diff --git a/tools/nntool/graph/types/others.py b/tools/nntool/graph/types/others.py
index 2d033305c..70bacc828 100644
--- a/tools/nntool/graph/types/others.py
+++ b/tools/nntool/graph/types/others.py
@@ -15,6 +15,7 @@
 
 import logging
 import math
+from functools import reduce
 
 import numpy as np
 from expressions.symbolic.basic import (Abs, Ceil, Cos, Exp, Log, Max, Min,
@@ -24,7 +25,7 @@
 from utils.real_transpose import real_transpose
 
 from .base import (CanFuseToExpression, ComparableParameters,
-                   InsensitiveToQuantization, NNNodeRef,
+                   InsensitiveToQuantization,
                    NoSizeChangeParameters, Parameters, SensitiveToOrder,
                    SingleInputAndOutput, cls_op_name, expression_op, nargs,
                    not_generated)
@@ -63,7 +64,8 @@ def get_parameter_size(self):
     def permute(self, val):
         return [val[i] for i in self.transpose]
 
-    def does_nothing(self):
+    @property
+    def no_model_code(self):
         if not self.transpose:
             return True
         if not self.in_dims or not self.in_dims[0]:
@@ -75,9 +77,13 @@ def does_nothing(self):
                        for idx in trans if shape_idx[idx] is not None]
         return shape_trans == sorted(shape_trans)
 
+    @property
+    def does_nothing(self) -> bool:
+        return self._transpose is None
+
     @property
     def is_not_generated(self):
-        return self.does_nothing()
+        return self.does_nothing
 
     def is_same_operation_as(self, G, other):
         if not isinstance(other, TransposeParameters):
@@ -174,7 +180,7 @@ def __str__(self):
 
 
 @cls_op_name('quantize')
-class QuantizeParameters(Parameters):
+class QuantizeParameters(Parameters, ComparableParameters):
 
     def __init__(self, *args, from_qtype=None, to_qtype=None,
                  inserted_by_quantizer=False, **kwargs):
@@ -187,6 +193,11 @@ def __init__(self, *args, from_qtype=None, to_qtype=None,
     def get_parameter_size(self):
         return 0
 
+    def is_same_operation_as(self, G, other):
+        return (isinstance(other, QuantizeParameters) and
+                self.from_qtype == other.from_qtype and
+                self.to_qtype == other.to_qtype)
+
     @property
     def can_equalize(self):
         return False
@@ -224,10 +235,11 @@ def __str__(self):
 @not_generated
 class ConcatParameters(Parameters, SensitiveToOrder):
 
-    def __init__(self, *args, axis=None, axis_hint=None, **kwargs):
+    def __init__(self, *args, axis=None, **kwargs):
         super(ConcatParameters, self).__init__(*args, **kwargs)
+        if axis is None:
+            raise ValueError("axis must be set")
         self._axis = axis
-        self._axis_hint = axis_hint
 
     @property
     def graph_label(self):
@@ -245,6 +257,10 @@ def axis(self):
     def axis(self, val):
         self._axis = val
 
+    @property
+    def does_nothing(self) -> bool:
+        return bool(self.in_dims) and len(self.in_dims) == 1  # bool() so None/empty in_dims yields False, matching the annotation
+
     def get_parameter_size(self):
         return 0
 
@@ -252,9 +268,16 @@ def get_parameter_size(self):
     def can_equalize(self):
         return False
 
+    @property
+    def offsets(self):
+        # running start position of each input along the concat axis
+        starts, position = [], 0
+        for in_dim in self.in_dims:
+            starts.append(position)
+            position = position + in_dim.shape[self.axis]
+        return starts
+
     def get_output_size(self, in_dims):
-        if in_dims[0].is_named and self._axis_hint:
-            self._axis = in_dims[0].get_order_idx(self._axis_hint)
         out_dim = Dim.combine([in_dim for in_dim in in_dims], self.axis)
         return [out_dim]
 
@@ -281,8 +304,11 @@ def __init__(self, *args,
         self.axis = axis
 
     def __call__(self, *args, **kwargs):
-        noderef = super(SplitParameters, self).__call__(*args, **kwargs)
-        return tuple(NNNodeRef(self, i, noderef.ref[1]) for i in range(len(self.act_slices)))
+        return super().__call__(*args, num_outputs=len(self.act_slices), **kwargs)
+
+    @property
+    def does_nothing(self) -> bool:
+        return bool(self.out_dims) and len(self.out_dims) == 1  # bool() so None/empty out_dims yields False, matching the annotation
 
     @property
     def graph_label(self):
@@ -308,7 +334,8 @@ def get_splits(in_shape, axis, splits=None, num_splits=None):
         if splits:
             if in_shape[axis] is not None and any(split == -1 for split in splits):
                 rest_sz = sum(split for split in splits if split > 0)
-                splits = (split if split > 0 else in_shape[axis] - rest_sz for split in splits)
+                splits = (split if split >
+                          0 else in_shape[axis] - rest_sz for split in splits)
             for sz in splits:
                 act_slices.append([(in_idx, in_idx + sz, 1) if idx == axis else (0, shape, 1)
                                    for idx, shape in enumerate(in_shape)
@@ -392,7 +419,6 @@ def can_equalize(self):
     def __str__(self):
         return "A %s I %s" % (self.axis, self.indices)
 
-
 @cls_op_name('strided_slice')
 class StridedSliceParameters(Parameters, SingleInputAndOutput, ComparableParameters, InsensitiveToQuantization):
 
@@ -403,7 +429,8 @@ def __init__(self, *args,
 
         super(StridedSliceParameters, self).__init__(*args, **kwargs)
         self.act_slice = act_slice
-        self.slice_shape = tuple(int(abs(math.ceil((sl[1] - sl[0])/sl[2]))) for sl in self.act_slice)
+        self.slice_shape = tuple(
+            int(abs(math.ceil((sl[1] - sl[0])/sl[2]))) for sl in self.act_slice)
         self.out_shape = tuple(out_shape)
 
     @property
@@ -414,6 +441,34 @@ def graph_label(self):
     def graph_anon_label(self):
         return ['Slice'] + ["(%s,%s,%s)" % elem for elem in self.act_slice]
 
+    @property
+    def slice_shape(self):
+        return self._slice_shape
+
+    @slice_shape.setter
+    def slice_shape(self, val):
+        self._slice_shape = tuple(val)
+
+    @property
+    def slices_axes(self):
+        in_shape = self.in_dims[0].shape
+        return tuple(idx for idx, shapes in enumerate(zip(self.post_slice_shape, in_shape)) if shapes[0] != shapes[1])
+
+    @property
+    def post_slice_shape(self):
+        # errstate puts the previous numpy error handling back even when the computation raises
+        with np.errstate(all='raise'):
+            res = tuple(abs(((sl[1] if sl[1] >= -1 else -1) - sl[0])//sl[2]) for sl in self.act_slice)
+        return res
+
+    @property
+    def changes_shape(self):
+        return self.post_slice_shape != self.out_shape
+
+    @property
+    def can_equalize(self):
+        return False
+
     def numpy_slice(self, arr: np.ndarray):
         slice_spec = [slice(elem[0], elem[1], elem[2])
                       for elem in self.act_slice if len(elem) == 3]
@@ -447,12 +502,14 @@ def only_slices(self, axis):
                    for idx, dim in enumerate(self.in_dims[0].shape) if axis != idx)
 
     @property
-    def post_slice_shape(self):
-        return [(sl[1] - sl[0])//sl[2] for sl in self.act_slice]
+    def does_nothing(self) -> bool:
+        return self.no_model_code and not self.changes_shape
 
     @property
-    def changes_shape(self):
-        return len(self.post_slice_shape) > len(self.out_shape)
+    def no_model_code(self) -> bool:
+        if not self.in_dims:
+            return False
+        return self.post_slice_shape == tuple(self.in_dims[0].shape)
 
     def get_parameter_size(self):
         return 0
@@ -460,10 +517,6 @@ def get_parameter_size(self):
     def get_output_size(self, in_dims):
         return [Dim.unnamed(self.out_shape)]
 
-    @property
-    def can_equalize(self):
-        return False
-
     def __str__(self):
         return ",".join("(%s,%s,%s)" % elem for elem in self.act_slice)
 
@@ -675,14 +728,19 @@ def __init__(self, *args, old_shape=None, shape=None, **kwargs):
 
     @property
     def graph_label(self):
-        return [self.name, f'{self.old_shape} to {self.shape}']
+        return [f'Reshape({self.name})', f'{self.old_shape} to {self.shape}']
 
     @property
     def graph_anon_label(self):
         return ['Reshape', f'{self.old_shape} to {self.shape}']
 
+    @property
     def does_nothing(self):
-        return self.shape.layout_shape == self.old_shape.layout_shape
+        return tuple(self.shape.shape) == tuple(self.old_shape.shape)
+
+    @property
+    def no_model_code(self) -> bool:
+        return True
 
     def get_parameter_size(self):
         return 0
@@ -691,7 +749,7 @@ def exp_red_pattern(self):
         """ If the reshape is an expand or reduce dim i.e. adds or removes 1 size axes then
         return a pattern with True indicating an added axis, False a removed axis and None
         an unchanged axis"""
-        if not self.does_nothing():
+        if not self.does_nothing:
             return None
         res = []
         s1 = self._old_shape.shape.copy()
@@ -781,6 +839,14 @@ def get_parameter_size(self):
     def can_equalize(self):
         return False
 
+    @property
+    def no_model_code(self) -> bool:
+        return True
+
+    @property
+    def does_nothing(self) -> bool:
+        return True
+
     def compute_load(self):
         return 0
 
diff --git a/tools/nntool/graph/types/rnn.py b/tools/nntool/graph/types/rnn.py
index 12f0912e0..aa3a01fdb 100644
--- a/tools/nntool/graph/types/rnn.py
+++ b/tools/nntool/graph/types/rnn.py
@@ -18,6 +18,7 @@
 from graph.dim import Dim
 from graph.types import (ConstantInputParameters, NNEdge, Parameters,
                          SensitiveToOrder, SingleInputAndOutput)
+from stats.ranges_utils import collect_stat
 
 from .base import cls_op_name, nargs
 
@@ -60,6 +61,10 @@ def graph_label(self):
     def graph_anon_label(self):
         return ["Filt"]
 
+    def details_collector(self, stats, stat, details):
+        for k in filter(lambda x: x.startswith('range_'), details):
+            collect_stat(stat, k, details[k])
+
     def get_parameter_size(self):
         return 0
 
diff --git a/tools/nntool/graph/types/ssd.py b/tools/nntool/graph/types/ssd.py
index 826b1b73c..a18fe59d4 100644
--- a/tools/nntool/graph/types/ssd.py
+++ b/tools/nntool/graph/types/ssd.py
@@ -17,7 +17,7 @@
 
 from graph.dim import Dim
 
-from .base import NNNodeRef, Parameters, SensitiveToOrder, cls_op_name, nargs
+from .base import Parameters, SensitiveToOrder, cls_op_name, nargs
 
 LOG = logging.getLogger("nntool." + __name__)
 
@@ -41,14 +41,26 @@ def __init__(self, *args, parameters=None, **kwargs):
         self.nms_config = {'using_json_config': {'INCLUDE': False, 'json_config_path': ''},
                            'using_pipeline_config': {'INCLUDE': False, 'pipeline_config_path': ''},
                            'using_params': {'INCLUDE': True, 'params': self._parameters}}
+        self.at_options.valid_options['NMS_SCORE_THRESHOLD'] = float
+        self.at_options.valid_options['NMS_IOU_THRESHOLD'] = float
+        self.nms_score_threshold = self._parameters['nms_score_threshold']
+        self.nms_iou_threshold = self._parameters['nms_iou_threshold']
+        self._output_detection_count = True
 
     def __call__(self, *args, **kwargs):
-        noderef = super(SSDDetectorParameters, self).__call__(*args, **kwargs)
-        return tuple(NNNodeRef(self, i, noderef.ref[1]) for i in range(3))
+        return super().__call__(*args, num_outputs=4 if self._output_detection_count else 3, **kwargs)
 
     def get_parameter_size(self):
         return 0
 
+    @property
+    def output_detection_count(self):
+        return self._output_detection_count
+
+    @output_detection_count.setter
+    def output_detection_count(self, val):
+        self._output_detection_count = val
+
     @property
     def can_equalize(self):
         return False
@@ -69,14 +81,6 @@ def w_scale(self):
     def h_scale(self):
         return self._parameters['h_scale']
 
-    @property
-    def nms_score_threshold(self):
-        return self._parameters['nms_score_threshold']
-
-    @nms_score_threshold.setter
-    def nms_score_threshold(self, val):
-        self._parameters['nms_score_threshold'] = val
-
     @property
     def max_bb_before_nms(self):
         return self._parameters['max_bb_before_nms']
@@ -95,7 +99,19 @@ def use_exp_for_wh_decode(self, val):
 
     @property
     def nms_iou_threshold(self):
-        return self._parameters['nms_iou_threshold']
+        return self.at_options.nms_iou_threshold
+
+    @nms_iou_threshold.setter
+    def nms_iou_threshold(self, val):
+        self.at_options.nms_iou_threshold = val
+
+    @property
+    def nms_score_threshold(self):
+        return self.at_options.nms_score_threshold
+
+    @nms_score_threshold.setter
+    def nms_score_threshold(self, val):
+        self.at_options.nms_score_threshold = val
 
     @property
     def max_detections(self):
@@ -108,18 +124,18 @@ def max_classes_per_detection(self):
     def get_output_size(self, in_dims):
         num_detected_boxes = self._parameters['max_detections'] * \
             self._parameters['max_classes_per_detection']
-        return [
-            Dim(shape=[num_detected_boxes, 4], is_ordered=True),
-            Dim(shape=[num_detected_boxes], is_ordered=True),
-            Dim(shape=[num_detected_boxes], is_ordered=True),
-            Dim(shape=[num_detected_boxes], is_ordered=True),
+        outputs = [
+            Dim.unnamed([num_detected_boxes, 4]),
+            Dim.unnamed([num_detected_boxes]),
+            Dim.unnamed([num_detected_boxes]),
         ]
+        if self.output_detection_count:
+            outputs.append(Dim.unnamed([1]))
+        return outputs
 
     def __str__(self):
-        return "{} SCORE_THR {:.2f} IOU_THR {:.2f}".format(
-            self.at_options,
-            self.nms_score_threshold,
-            self.nms_iou_threshold
+        return "{}".format(
+            self.at_options
         )
 
 
@@ -148,10 +164,13 @@ def __init__(self, *args, parameters=None, in_dims_hint=None, out_dims_hint=None
         self._ker_in_order = [['batch', 'spatial_dim', 'box'], [
             'batch', 'class', 'spatial_dim']]
         self._ker_out_order = [['spatial_dim', 'index']]
+        self.at_options.valid_options['NMS_SCORE_THRESHOLD'] = float
+        self.at_options.valid_options['NMS_IOU_THRESHOLD'] = float
+        self.nms_score_threshold = self._parameters['nms_score_threshold']
+        self.nms_iou_threshold = self._parameters['nms_iou_threshold']
 
     def __call__(self, *args, **kwargs):
-        noderef = super(NMSParameters, self).__call__(*args, **kwargs)
-        return tuple(NNNodeRef(self, i, noderef.ref[1]) for i in range(2))
+        return super().__call__(*args, num_outputs=2, **kwargs)
 
     def get_parameter_size(self):
         return 0
@@ -160,17 +179,21 @@ def get_parameter_size(self):
     def can_equalize(self):
         return False
 
+    @property
+    def nms_iou_threshold(self):
+        return self.at_options.nms_iou_threshold
+
+    @nms_iou_threshold.setter
+    def nms_iou_threshold(self, val):
+        self.at_options.nms_iou_threshold = val
+
     @property
     def nms_score_threshold(self):
-        return self._parameters['nms_score_threshold']
+        return self.at_options.nms_score_threshold
 
     @nms_score_threshold.setter
     def nms_score_threshold(self, val):
-        self._parameters['nms_score_threshold'] = val
-
-    @property
-    def nms_iou_threshold(self):
-        return self._parameters['nms_iou_threshold']
+        self.at_options.nms_score_threshold = val
 
     @property
     def max_output_boxes_per_class(self):
@@ -192,8 +215,6 @@ def get_output_size(self, in_dims):
         ]
 
     def __str__(self):
-        return "{} SCORE_THR {:.2f} IOU_THR {:.2f}".format(
-            self.at_options,
-            self.nms_score_threshold,
-            self.nms_iou_threshold
+        return "{}".format(
+            self.at_options
         )
diff --git a/tools/nntool/importer/common/broadcast_mixin.py b/tools/nntool/importer/common/broadcast_mixin.py
index e23c1bf8e..8a0627dd8 100644
--- a/tools/nntool/importer/common/broadcast_mixin.py
+++ b/tools/nntool/importer/common/broadcast_mixin.py
@@ -19,11 +19,16 @@
 
 from .provisional_dim import ProvisionalDim
 
+# reduces broadcasted constants on unknown dimensions.
+# Setting this to false can provoke conception errors in matchers
+FIX_CONSTANTS = True
 
 class BroadcastMixin(object):
 
     @classmethod
-    def get_broadcasted_shape(cls, x, y):
+    def get_broadcasted_shape(cls, x, y, is_constant=None):
+        if is_constant is None:
+            is_constant = (False, False)
         if len(x) < len(y):
             x = ([1] * (len(y) - len(x))) + x
         elif len(y) < len(x):
@@ -34,9 +39,20 @@ def get_broadcasted_shape(cls, x, y):
             "{} and {} cannot be broadcasted".format(x, y)
 
         def broad(elem_x, elem_y):
-            if elem_x is None or elem_y is None:
-                return None
-            return elem_x if elem_y == 1 else elem_y
+            # if one element is not None then take it since that dimension will be broadcasted
+            if elem_x is None:
+                if elem_y is None or (FIX_CONSTANTS and is_constant[1] and elem_y == 1):
+                    return None
+                else:
+                    return elem_y
+            else:
+                if elem_y is None:
+                    if FIX_CONSTANTS and is_constant[0] and elem_x == 1:
+                        return None
+                    else:
+                        return elem_x
+                else:
+                    return elem_x if elem_y == 1 else elem_y
         return [broad(elem_x, elem_y) for elem_x, elem_y in zip(x, y)]
 
     @classmethod
@@ -62,8 +78,9 @@ def _fix_constant_inputs(cls, inputs, shape):
 
     @classmethod
     def implied_broadcast(cls, inputs):
+        is_constant = [isinstance(inp[0], ConstantInputParameters) for inp in inputs]
         x = inputs[0][2].shape
         y = inputs[1][2].shape
-        shape = cls.get_broadcasted_shape(x, y)
+        shape = cls.get_broadcasted_shape(x, y, is_constant=is_constant)
         cls._fix_constant_inputs(inputs, shape)
         return [ProvisionalDim(shape)]
diff --git a/tools/nntool/importer/common/check_batchdim.py b/tools/nntool/importer/common/check_batchdim.py
new file mode 100644
index 000000000..07d2dd098
--- /dev/null
+++ b/tools/nntool/importer/common/check_batchdim.py
@@ -0,0 +1,34 @@
+# Copyright (C) 2022  GreenWaves Technologies, SAS
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Affero General Public License for more details.
+
+# You should have received a copy of the GNU Affero General Public License
+# along with this program.  If not, see <https://www.gnu.org/licenses/>.
+
+from graph.types.base import NNEdge
+from graph.types.others import ReshapeParameters
+from importer.common.provisional_dim import ProvisionalDim
+
+
+def check_batchdim(G, x, valid_name):
+    """Strip a known batch dimension of size 1 from x with a reshape; x is returned untouched if that dimension is unknown."""
+    x_shape = list(x[2].shape)  # work on a copy so the shape recorded for the producer of x is not modified
+    if x_shape[0] is None:
+        return x
+    if x_shape[0] != 1:
+        raise NotImplementedError(
+            f'{valid_name} pool is on more than one batch. This is not supported')
+    reshape = ReshapeParameters(G.unique_name(f'{valid_name}_reshape'),
+                                old_shape=tuple(x_shape), shape=tuple(x_shape[1:]))
+    G.add_edge(NNEdge(from_node=x[0], from_idx=x[1], to_node=reshape))
+    x_shape[0] = None
+    # forward the optional fourth element of the input record unchanged
+    return (reshape, 0, ProvisionalDim(x_shape)) + tuple(x[3:4])
diff --git a/tools/nntool/importer/importer.py b/tools/nntool/importer/importer.py
index 035c4c2f6..db57003a4 100644
--- a/tools/nntool/importer/importer.py
+++ b/tools/nntool/importer/importer.py
@@ -20,8 +20,8 @@
 from .tflite2.tflite import NNGraph, TFLiteImporter
 
 GRAPH_IMPORTERS = {
-    'onnx': {'matches':[r".*\.onnx$"], 'importer':OnnxImporter, 'loader': None},
-    'tflite': {'matches':[r".*\.tflite$"], 'importer':TFLiteImporter, 'loader': None},
+    'onnx': {'matches':[r".*\.onnx$"], 'importer':OnnxImporter},
+    'tflite': {'matches':[r".*\.tflite$"], 'importer':TFLiteImporter},
 }
 
 class ImportException(Exception):
@@ -51,7 +51,6 @@ def create_graph(filename: str, graph_format: str = None, opts: Mapping = None)
             if re.search(match, filename):
                 importer = v['importer']()
                 graph = importer.create_graph(filename, opts)
-                graph.set_load_function(v['loader'])
                 return graph
 
     raise ValueError("Graph importer not found")
diff --git a/tools/nntool/importer/onnx/common/handler_helper.py b/tools/nntool/importer/onnx/common/handler_helper.py
index 1a0102ea0..e6d32a8bf 100644
--- a/tools/nntool/importer/onnx/common/handler_helper.py
+++ b/tools/nntool/importer/onnx/common/handler_helper.py
@@ -14,12 +14,14 @@
 # along with this program.  If not, see <https://www.gnu.org/licenses/>.
 
 from onnx import defs
+from onnx.defs import SchemaError
 
 from .. import common
 # pylint: disable=wildcard-import,unused-wildcard-import
 from ..handlers.backend import *  # noqa
 from ..handlers.backend_handler import BackendHandler
 
+
 def get_opset_status():
     ops = []
     onnx_ops = {}
@@ -33,7 +35,8 @@ def get_opset_status():
     counts_by_domain = {}
     for handler in BackendHandler.__subclasses__():
         handler.check_cls()
-        counts_by_domain.setdefault(handler.DOMAIN, [0, onnx_ops.get(handler.DOMAIN, 0)])
+        counts_by_domain.setdefault(
+            handler.DOMAIN, [0, onnx_ops.get(handler.DOMAIN, 0)])
         counts_by_domain[handler.DOMAIN][0] += 1
         ops.append([
             handler.DOMAIN,
@@ -47,6 +50,7 @@ def get_opset_status():
     ])
     return ops, counts_by_domain
 
+
 def get_all_backend_handlers(opset_dict):
     """ Get a dict of all backend handler classes.
     e.g. {'domain': {'Abs': Abs handler class}, ...}, }.
@@ -65,14 +69,16 @@ def get_all_backend_handlers(opset_dict):
         since_version = 1
         if defs.has(handler.ONNX_OP, domain=handler.DOMAIN):
             try:
-                since_version = defs.get_schema(  # @IgnoreException
+                since_version = defs.get_schema( #@IgnoreException
                     handler.ONNX_OP,
                     domain=handler.DOMAIN,
                     max_inclusive_version=version).since_version
-            except RuntimeError:
-                common.logger.debug("Fail to get since_version of %s in domain `%s` "
-                                    "with max_inclusive_version=%s. Set to 1.",
-                                    handler.ONNX_OP, handler.DOMAIN, version)
+            except (SchemaError, RuntimeError):
+                # no schema could be loaded: fall back to the lowest version_N method found on the handler
+                versions = sorted([int(ver_func[len('version_'):]) for ver_func in dir(handler) if ver_func.startswith('version_')])
+                since_version = versions[0] if versions else 1
+                common.logger.debug(f"Fail to load schema of {handler.ONNX_OP} in domain `{handler.DOMAIN}` "
+                                    f"with max_inclusive_version={version}. Since version set to {since_version}.")
         else:
             common.logger.debug("Unknown op %s in domain `%s`.",
                                 handler.ONNX_OP, handler.DOMAIN or "ai.onnx")
diff --git a/tools/nntool/importer/onnx/handlers/backend/concat_from_sequence.py b/tools/nntool/importer/onnx/handlers/backend/concat_from_sequence.py
index 0b4654fdc..bfab732c8 100644
--- a/tools/nntool/importer/onnx/handlers/backend/concat_from_sequence.py
+++ b/tools/nntool/importer/onnx/handlers/backend/concat_from_sequence.py
@@ -71,11 +71,3 @@ def _common(cls, node, **kwargs):
     @classmethod
     def version_11(cls, node, **kwargs):
         return cls._common(node, **kwargs)
-
-    @classmethod
-    def version_9(cls, node, **kwargs):
-        return cls._common(node, **kwargs)
-
-    @classmethod
-    def version_1(cls, node, **kwargs):
-        return cls._common(node, **kwargs)
diff --git a/tools/nntool/importer/onnx/handlers/backend/equal.py b/tools/nntool/importer/onnx/handlers/backend/equal.py
new file mode 100644
index 000000000..fcbaabbeb
--- /dev/null
+++ b/tools/nntool/importer/onnx/handlers/backend/equal.py
@@ -0,0 +1,60 @@
+# Copyright (C) 2020  GreenWaves Technologies, SAS
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Affero General Public License for more details.
+
+# You should have received a copy of the GNU Affero General Public License
+# along with this program.  If not, see <https://www.gnu.org/licenses/>.
+
+
+from graph.dim import Dim
+from graph.types import ConstantInputParameters
+from importer.common.constant_mixin import ConstantMixin
+from importer.common.provisional_dim import ProvisionalDim
+
+from ..backend_handler import BackendHandler
+from ..handler import onnx_op
+
+
+@onnx_op("Equal")
+class Equal(ConstantMixin, BackendHandler):
+
+    @classmethod
+    def _common(cls, node, **kwargs):
+        all_nodes = kwargs['all_nodes']
+        G = kwargs['G']
+        valid_name = kwargs['valid_name']
+        inputs = [all_nodes[inp] for inp in node.input]
+        x = inputs[0]
+        x_shape = x[2].shape
+        if all(cls.is_constant(inp) for inp in inputs):
+            a = cls.get_constant(inputs[0])
+            b = cls.get_constant(inputs[1])
+            params = ConstantInputParameters(valid_name, dims=Dim.unnamed(a.shape), value=(a==b))
+        else:
+            raise ValueError("ONNX Equal operator is not implemented")
+        all_nodes[node.output[0]] = (params, 0, ProvisionalDim(x_shape), None)
+        return params
+
+    @classmethod
+    def version_1(cls, node, **kwargs):
+        return cls._common(node, **kwargs)
+
+    @classmethod
+    def version_7(cls, node, **kwargs):
+        return cls._common(node, **kwargs)
+
+    @classmethod
+    def version_11(cls, node, **kwargs):
+        return cls._common(node, **kwargs)
+
+    @classmethod
+    def version_13(cls, node, **kwargs):
+        return cls._common(node, **kwargs)
diff --git a/tools/nntool/importer/onnx/handlers/backend/gather.py b/tools/nntool/importer/onnx/handlers/backend/gather.py
index 1a2761e3d..911396330 100644
--- a/tools/nntool/importer/onnx/handlers/backend/gather.py
+++ b/tools/nntool/importer/onnx/handlers/backend/gather.py
@@ -13,13 +13,14 @@
 # You should have received a copy of the GNU Affero General Public License
 # along with this program.  If not, see <https://www.gnu.org/licenses/>.
 
-import copy
-from graph.types.others import StridedSliceParameters
+
 import numpy as np
 from graph.types import ConstantInputParameters, GatherParameters, NNEdge
+from graph.types.others import ReshapeParameters, StridedSliceParameters
 from importer.common.constant_mixin import ConstantMixin
 from importer.common.provisional_dim import ProvisionalDim
 from importer.onnx.common import logger
+from utils.numpy_helpers import np_asscalar
 
 from ..backend_handler import BackendHandler
 from ..handler import onnx_op
@@ -40,21 +41,31 @@ def _common(cls, node, **kwargs):
         indices = cls.get_constant(y)
         axis = node.attrs.get('axis', 0)
 
-        pshape = ProvisionalDim(x_shape[:axis:] + list(indices.shape) + x_shape[axis + 1:])
+        pshape = ProvisionalDim(
+            x_shape[:axis:] + list(indices.shape) + x_shape[axis + 1:])
         if cls.is_constant(x):
             x_val = cls.get_constant(x)
-            logger.info(f"reducing {valid_name} to a constant {cls.print_small(x_val)}")
-            params = ConstantInputParameters(valid_name, value=np.take(x_val, indices, axis=axis))
+            logger.info(
+                f"reducing {valid_name} to a constant {cls.print_small(x_val)}")
+            params = ConstantInputParameters(valid_name, value=np.take(
+                x_val, indices.astype(np.int64), axis=axis))
         else:
             if np.ndim(indices) <= 1:
-                idx = np.asscalar(indices)
-                act_slice = tuple([(0, dim, 1) if i != axis else (idx, idx+1, 1) for i, dim in enumerate(x_shape) if dim is not None])
+                idx = np_asscalar(indices)
+                act_slice = tuple([(0, dim, 1) if i != axis else (
+                    idx, idx+1, 1) for i, dim in enumerate(x_shape) if dim is not None])
                 out_shape = pshape.known_shape.copy()
-                params = StridedSliceParameters(valid_name, act_slice=act_slice, out_shape=out_shape)
+                params = StridedSliceParameters(
+                    valid_name, act_slice=act_slice, out_shape=out_shape)
+                if params.post_slice_shape == tuple(x[2].known_shape):
+                    params = ReshapeParameters(valid_name, old_shape=tuple(
+                        x[2].known_shape), shape=out_shape)
             else:
                 axis = cls._trim_axis(axis, x_shape)
-                params = GatherParameters(valid_name, axis=axis, indices=indices)
-            G.add_edge(NNEdge(from_node=x[0], to_node=params, from_idx=x[1], to_idx=0))
+                params = GatherParameters(
+                    valid_name, axis=axis, indices=indices)
+            G.add_edge(
+                NNEdge(from_node=x[0], to_node=params, from_idx=x[1], to_idx=0))
         all_nodes[node.output[0]] = (params, 0, pshape, x[3])
         return params
 
diff --git a/tools/nntool/importer/onnx/handlers/backend/mat_mul_mixin.py b/tools/nntool/importer/onnx/handlers/backend/mat_mul_mixin.py
index 72c18169e..3324a1da2 100644
--- a/tools/nntool/importer/onnx/handlers/backend/mat_mul_mixin.py
+++ b/tools/nntool/importer/onnx/handlers/backend/mat_mul_mixin.py
@@ -101,8 +101,8 @@ def _handle(cls, node, quantized=False, **kwargs):
                 NNEdge(from_node=y[0], to_node=trans2, from_idx=y[1], to_idx=0))
             G.add_edge(
                 NNEdge(from_node=trans2, to_node=params, from_idx=0, to_idx=1))
-            biases_params = ConstantInputParameters(f'{valid_name}_biases', dims=Dim.unnamed([out_dims[0].shape[1]]),
-                                                    value=np.zeros((out_dims[0].shape[1]), dtype=np.float32))
+            biases_params = ConstantInputParameters(f'{valid_name}_biases', dims=Dim.unnamed([out_dims[0].shape[-1]]),
+                                                    value=np.zeros((out_dims[0].shape[-1]), dtype=np.float32))
             G.add_edge(NNEdge(from_node=biases_params,
                               to_node=params, to_idx=2))
 
diff --git a/tools/nntool/importer/onnx/handlers/backend/pool_mixin.py b/tools/nntool/importer/onnx/handlers/backend/pool_mixin.py
index 93b977116..69544b786 100644
--- a/tools/nntool/importer/onnx/handlers/backend/pool_mixin.py
+++ b/tools/nntool/importer/onnx/handlers/backend/pool_mixin.py
@@ -17,8 +17,8 @@
 from graph.types import GlobalPoolingParameters, PoolingParameters
 from graph.types.base import NNEdge
 from importer.common.provisional_dim import ProvisionalDim
+from importer.common.check_batchdim import check_batchdim
 
-from ..handler import partial_support, ps_description
 from .pad_mixin import PadMixin
 
 
@@ -31,6 +31,7 @@ def pool(cls, node, pool_type=None, copy_qtype=False, **kwargs):
         valid_name = kwargs['valid_name']
         inputs = [all_nodes[inp] for inp in node.input]
         x = inputs[0]
+        x = check_batchdim(G, x, valid_name)
         x_shape = x[2].shape
         x_feature_shape = x_shape[2::]
         input_rank = len(x_feature_shape)
diff --git a/tools/nntool/importer/onnx/handlers/backend/where.py b/tools/nntool/importer/onnx/handlers/backend/where.py
new file mode 100644
index 000000000..e7abcc53b
--- /dev/null
+++ b/tools/nntool/importer/onnx/handlers/backend/where.py
@@ -0,0 +1,53 @@
+# Copyright (C) 2020  GreenWaves Technologies, SAS
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Affero General Public License for more details.
+
+# You should have received a copy of the GNU Affero General Public License
+# along with this program.  If not, see <https://www.gnu.org/licenses/>.
+
+import numpy as np
+from graph.dim import Dim
+from graph.types import ConstantInputParameters
+from importer.common.constant_mixin import ConstantMixin
+from importer.common.provisional_dim import ProvisionalDim
+
+from ..backend_handler import BackendHandler
+from ..handler import onnx_op
+
+
+@onnx_op("Where")
+class Where(ConstantMixin, BackendHandler):
+    """Importer for ONNX Where; only the all-constant case is supported."""
+
+    @classmethod
+    def _common(cls, node, **kwargs):
+        """Fold Where(condition, X, Y) into a constant, else raise ValueError."""
+        all_nodes = kwargs['all_nodes']
+        valid_name = kwargs['valid_name']
+        inputs = [all_nodes[inp] for inp in node.input]
+        if not all(cls.is_constant(inp) for inp in inputs):
+            raise ValueError("ONNX Where operator is not implemented")
+        condition, x, y = (cls.get_constant(inp) for inp in inputs[:3])
+        # np.where broadcasts its operands, so the shape is read from the result
+        # and not from condition or x, either of which may be smaller
+        value = np.where(condition, x, y)
+        params = ConstantInputParameters(
+            valid_name, dims=Dim.unnamed(value.shape), value=value)
+        all_nodes[node.output[0]] = (params, 0, ProvisionalDim(list(value.shape)), None)
+        return params
+
+    @classmethod
+    def version_9(cls, node, **kwargs):
+        return cls._common(node, **kwargs)
+
+    @classmethod
+    def version_16(cls, node, **kwargs):
+        return cls._common(node, **kwargs)
diff --git a/tools/nntool/importer/onnx/onnx.py b/tools/nntool/importer/onnx/onnx.py
index ad083709a..a89f4d71f 100644
--- a/tools/nntool/importer/onnx/onnx.py
+++ b/tools/nntool/importer/onnx/onnx.py
@@ -354,8 +354,9 @@ def _import_nodes(self, G, graph, handlers, all_nodes, outputs, **kwargs):
                                           from_idx=producer[1]))
                     banned_inputs.update(node.output)
                     continue
-
-            params = handler.handle(OnnxNode(node), all_nodes=all_nodes, vars_dict=vars_dict,
+            # wrap the node once; input lookup is left to the handler being called
+            onode = OnnxNode(node)
+            params = handler.handle(onode, all_nodes=all_nodes, vars_dict=vars_dict,
                                     G=G, valid_name=self._node_name(node),
                                     used_tensors=used_tensors, importer=self, **kwargs)
             if params is None:
diff --git a/tools/nntool/importer/tflite2/handlers/backend/concatenation.py b/tools/nntool/importer/tflite2/handlers/backend/concatenation.py
index 21cf0dd24..4a9924777 100644
--- a/tools/nntool/importer/tflite2/handlers/backend/concatenation.py
+++ b/tools/nntool/importer/tflite2/handlers/backend/concatenation.py
@@ -96,7 +96,7 @@ def red_func(x, y):
             params = ConstantInputParameters(node.name, value=value)
         else:
             axis -= sum(1 if dim is None else 0 for dim in pout_shape[:axis:])
-            params = ConcatParameters(node.name, axis=axis, axis_hint=None)
+            params = ConcatParameters(node.name, axis=axis)
 
             for idx, inp in enumerate(inputs):
                 inp_node, inp_idx = cls._maybe_insert_reshape(G, inp, inp_shapes[idx], pout_shape)
diff --git a/tools/nntool/importer/tflite2/handlers/backend/pack.py b/tools/nntool/importer/tflite2/handlers/backend/pack.py
index 57f16a58b..73371e5fb 100644
--- a/tools/nntool/importer/tflite2/handlers/backend/pack.py
+++ b/tools/nntool/importer/tflite2/handlers/backend/pack.py
@@ -84,7 +84,7 @@ def _common(cls, node: TFLiteNode, **kwargs):
             G.add_edge(NNEdge(from_node=inputs[0][0], to_node=params, from_idx=inputs[0][1]))
         else:
             axis -= sum(1 if dim is None else 0 for dim in pconcat_out_shape[:axis:])
-            params = ConcatParameters(node.name, axis=axis, axis_hint=None)
+            params = ConcatParameters(node.name, axis=axis)
 
             # insert reshapes on each input to add concat axis
             for idx, inp in enumerate(inputs):
diff --git a/tools/nntool/importer/tflite2/handlers/backend/pool_mixin.py b/tools/nntool/importer/tflite2/handlers/backend/pool_mixin.py
index 0c56a4034..791ce1bc9 100644
--- a/tools/nntool/importer/tflite2/handlers/backend/pool_mixin.py
+++ b/tools/nntool/importer/tflite2/handlers/backend/pool_mixin.py
@@ -19,6 +19,7 @@
 from importer.common.provisional_dim import ProvisionalDim
 from importer.tflite2.tflite_schema_head.Pool2DOptions import Pool2DOptions
 from utils.node_id import NodeId
+from importer.common.check_batchdim import check_batchdim
 
 from .filter_pad_mixin import FilterPadMixin
 
@@ -34,6 +35,8 @@ def pool2d(cls, node, pool_type=None, **kwargs):
 
         inputs = [all_nodes[inp] for inp in node.input]
         x = inputs[0]
+        x = check_batchdim(G, x, node.name)
+
         x = cls.remove_known_batch_dimension(G, x, node)
         x_shape = x[2].shape
         in_c = x_shape[1]
diff --git a/tools/nntool/importer/tflite2/handlers/backend/resize_mixin.py b/tools/nntool/importer/tflite2/handlers/backend/resize_mixin.py
index e61eb9707..b262697be 100644
--- a/tools/nntool/importer/tflite2/handlers/backend/resize_mixin.py
+++ b/tools/nntool/importer/tflite2/handlers/backend/resize_mixin.py
@@ -16,6 +16,7 @@
 
 from graph.dim import Dim
 from graph.types.base import NNEdge
+from importer.common.check_batchdim import check_batchdim
 from utils.node_id import NodeId
 
 
@@ -32,6 +33,7 @@ def _common(cls, node, **kwargs):
 
         inputs = [all_nodes[inp] for inp in node.input]
         x = inputs[0]
+        x = check_batchdim(G, x, node.name)
         new_shape = tuple(cls._verify_constant(inputs[1]))
         params = params_class(node.name,
                               new_shape=new_shape,
diff --git a/tools/nntool/importer/tflite2/handlers/backend/tflite_detection_postprocess.py b/tools/nntool/importer/tflite2/handlers/backend/tflite_detection_postprocess.py
index f8c23360e..8bbef202c 100644
--- a/tools/nntool/importer/tflite2/handlers/backend/tflite_detection_postprocess.py
+++ b/tools/nntool/importer/tflite2/handlers/backend/tflite_detection_postprocess.py
@@ -37,15 +37,10 @@ def _common(cls, node: TFLiteNode, **kwargs):
         opts = kwargs['opts']
         all_nodes = kwargs['all_nodes']
         importer = kwargs['importer']
-        graph_outputs = kwargs['outputs']
 
-        if len(node.output) > 3 and node.output[3] in graph_outputs:
-            G.remove(graph_outputs[node.output[3]][0])
-            del graph_outputs[node.output[3]]
         inputs = [all_nodes[t] for t in node.input]
         outputs = [all_nodes.get(node.output[idx]) if idx < len(node.output) else None
-                   for idx in range(3)]
-        # inp_shapes = [input[2].shape for input in inputs]
+                   for idx in range(4)]
 
         if 'max_bb_before_nms' not in custom_opts:
             custom_opts['max_bb_before_nms'] = 300
@@ -79,9 +74,10 @@ def _common(cls, node: TFLiteNode, **kwargs):
                                   dtype=np.int16, scale=2**(-14))
             o_scores_qtype = node.input[1].qtype
             o_class_qtype = QType(scale=1, dtype=np.int8)
+            o_num_detect = QType(scale=1, dtype=np.int8)
             qrec = QRec.scaled(in_qs=in_qtypes,
                                out_qs=[o_boxes_qtype, o_class_qtype,
-                                       o_scores_qtype])
+                                       o_scores_qtype, o_num_detect])
             G.quantization[NodeId(params)] = qrec
 
         return params
diff --git a/tools/nntool/importer/tflite2/handlers/backend/transpose_conv.py b/tools/nntool/importer/tflite2/handlers/backend/transpose_conv.py
index c60de0360..6de3241b5 100644
--- a/tools/nntool/importer/tflite2/handlers/backend/transpose_conv.py
+++ b/tools/nntool/importer/tflite2/handlers/backend/transpose_conv.py
@@ -22,6 +22,7 @@
 from importer.tflite2.tflite_schema_head.Padding import Padding
 from importer.tflite2.tflite_schema_head.TransposeConvOptions import \
     TransposeConvOptions
+from importer.common.check_batchdim import check_batchdim
 
 from ..backend_handler import BackendHandler
 from ..handler import tflite_op, partial_support, ps_description
@@ -43,12 +44,14 @@ def version_1(cls, node: TFLiteNode, **kwargs):
 
         inputs = [all_nodes[t] for t in node.input]
         x = inputs[2]
+        x = check_batchdim(G, x, node.name)
         x_shape = x[2].shape
         in_b, in_h, in_w, in_c = tuple(x_shape)
         pout_shape = [dim if x_shape[idx] is not None else None for idx,
                       dim in enumerate(cls.get_constant(inputs[0]))]
         out_b, out_h, out_w, out_c = tuple(pout_shape)
 
+
         filt = inputs[1]
         weights_node = filt[0]
         filt_shape = filt[2].shape
diff --git a/tools/nntool/importer/tflite2/remove_concats.py b/tools/nntool/importer/tflite2/remove_concats.py
deleted file mode 100644
index c3a3391ca..000000000
--- a/tools/nntool/importer/tflite2/remove_concats.py
+++ /dev/null
@@ -1,88 +0,0 @@
-# Copyright (C) 2020  GreenWaves Technologies, SAS
-
-# This program is free software: you can redistribute it and/or modify
-# it under the terms of the GNU Affero General Public License as
-# published by the Free Software Foundation, either version 3 of the
-# License, or (at your option) any later version.
-
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-# GNU Affero General Public License for more details.
-
-# You should have received a copy of the GNU Affero General Public License
-# along with this program.  If not, see <https://www.gnu.org/licenses/>.
-import logging
-
-from graph.matches.matcher import find_forward
-from graph.types import (ConcatParameters, NNEdge, NoOPParameters,
-                         StridedSliceParameters)
-from utils.node_id import NodeId
-
-LOG = logging.getLogger('nntool.' + __name__)
-
-def remove_concats(G):
-    concat_nodes = list([node for node in G.nodes() if isinstance(node, ConcatParameters)])
-    strided_slices_removed = []
-    concats_removed = []
-    for node in concat_nodes:
-        concat_out_edges = G.indexed_out_edges(node.name)[0]
-        concat_in_edges = G.indexed_in_edges(node.name)
-        axis_slices = []
-        start_idx = 0
-        # find the slice patterns that can match inputs
-        for in_idx, dim in enumerate(node.in_dims):
-            slice_patterns = [(start_idx, start_idx + dim.shape[node.axis], 1)]
-            if dim.shape[node.axis] == 1:
-                # can also match reversed
-                slice_patterns.append((start_idx, start_idx - 1, -1))
-            axis_slices.append(slice_patterns)
-            start_idx += dim.shape[node.axis]
-        for out_edge in concat_out_edges:
-            edge_lists = find_forward(G, out_edge,
-                                      StridedSliceParameters,
-                                      skip_node_classes=NoOPParameters)
-            # each list of edges goes to a strided slice
-            for edge_list in edge_lists:
-                edge = edge_list[-1]
-                assert isinstance(edge.to_node, StridedSliceParameters)
-                ssp = edge.to_node
-                LOG.info("found strided slice %s", ssp.name)
-                # must only slice axis of concat
-                if not ssp.only_slices_axis(node.axis):
-                    LOG.info("rejected: slices more than one axis")
-                    continue
-                # must match a slice pattern on the input
-                ssp_slice = ssp.act_slice[node.axis]
-
-                in_idx = None
-                for idx, slice_patterns in enumerate(axis_slices):
-                    if ssp_slice in slice_patterns:
-                        in_idx = idx
-                        break
-                if in_idx is None:
-                    LOG.info("rejected: slices pattern matching concat not found")
-                    continue
-                LOG.info("removing slice %s", ssp.name)
-                strided_slices_removed.append(ssp.name)
-                # save the out edges
-                ssp_out_edges = G.out_edges(ssp.name)
-                in_edge = concat_in_edges[in_idx]
-                # remove all the nodes including the ssp
-                for inter_edge in edge_list:
-                    if G.quantization:
-                        del G.quantization[NodeId(inter_edge.to_node)]
-                    G.remove(inter_edge.to_node)
-                # connect all the ssp out edges to the node on the concat input
-
-                for ssp_out_edge in ssp_out_edges:
-                    G.add_edge(NNEdge(in_edge.from_node, ssp_out_edge.to_node,
-                                      from_idx=in_edge.from_idx,
-                                      to_idx=ssp_out_edge.to_idx))
-        # if the concat now has no out edges remove it
-        if G.num_out_edges(node.name) == 0:
-            LOG.info("removing concat %s", node.name)
-            concats_removed.append(node.name)
-            G.remove(node)
-
-    return (strided_slices_removed, concats_removed)
diff --git a/tools/nntool/importer/tflite2/tflite.py b/tools/nntool/importer/tflite2/tflite.py
index d0e35779d..2566e5906 100644
--- a/tools/nntool/importer/tflite2/tflite.py
+++ b/tools/nntool/importer/tflite2/tflite.py
@@ -40,7 +40,6 @@
 from .common import LOG, check
 from .common.handler_helper import get_all_backend_handlers
 from .fix_split_in_edges import fix_split_in_edges
-from .remove_concats import remove_concats
 
 # pylint: disable=E1101
 
@@ -104,7 +103,6 @@ def create_graph(self, filename, opts):
         RemoveReshapesBeforeLinear().match(G)
         # DrawGraphReporter().report(G)
         G.add_dimensions()
-        remove_concats(G)
         if opts['remove_quantize_ops']:
             RemoveQuantizeOperators().match(G)
             G.add_dimensions()
diff --git a/tools/nntool/interpreter/commands/adjust.py b/tools/nntool/interpreter/commands/adjust.py
index f03975ac8..65c51d756 100644
--- a/tools/nntool/interpreter/commands/adjust.py
+++ b/tools/nntool/interpreter/commands/adjust.py
@@ -47,5 +47,5 @@ def do_adjust(self, args):
         else:
             steps = None
         self.G.adjust_order(
-            postprocess=not args.no_postprocess, steps=steps, single_step=args.individual_step)
+            no_postprocess=args.no_postprocess, steps=steps, single_step=args.individual_step)
         self.G.add_dimensions()
diff --git a/tools/nntool/interpreter/commands/aquant.py b/tools/nntool/interpreter/commands/aquant.py
index 56b2cf8c8..e66fc99e4 100644
--- a/tools/nntool/interpreter/commands/aquant.py
+++ b/tools/nntool/interpreter/commands/aquant.py
@@ -14,11 +14,14 @@
 # along with this program.  If not, see <https://www.gnu.org/licenses/>.
 
 import argparse
+import glob
 import logging
 import pickle
-import glob
+from pathlib import Path
+
 from cmd2 import Cmd2ArgumentParser, with_argparser
 from cmd2.cmd2 import Cmd
+from interpreter.commands.qtune import load_options
 from interpreter.nntool_shell_base import (NNToolShellBase,
                                            store_once_in_history)
 from interpreter.shell_utils import glob_input_files, input_options
@@ -50,6 +53,9 @@ class AquantCommand(NNToolShellBase):
     parser_aquant.add_argument('--stats',
                                completer_method=Cmd.path_complete,
                                help='pickle file containing statistics')
+    parser_aquant.add_argument('--json',  # saved options; do_aquant overlays the command line options on them
+                               completer_method=Cmd.path_complete,
+                               help='json file containing saved quantization options using qtunesave command')
     add_options_to_parser(parser_aquant)
     input_options(parser_aquant)
 
@@ -62,6 +68,16 @@ def do_aquant(self, args: argparse.Namespace):
         stats_collector = ActivationRangesCollector()
         # if replaying state file then load the activation stats if they are present
         opts = get_options_from_args(args)
+
+        if args.json:
+            json_path = Path(args.json)
+            if not json_path.exists() or not json_path.is_file():
+                self.perror(f'{json_path} does not exist or is not a file')
+                return
+            json_opts = load_options(json_path)
+            json_opts.update(opts)
+            opts = json_opts
+
         state = ConstantInputParameters.save_compression_state(self.G)
         try:
             if args.stats:
diff --git a/tools/nntool/interpreter/commands/compile_at_model.py b/tools/nntool/interpreter/commands/compile_at_model.py
index 75db2c9f8..db5e6cc9a 100644
--- a/tools/nntool/interpreter/commands/compile_at_model.py
+++ b/tools/nntool/interpreter/commands/compile_at_model.py
@@ -111,7 +111,8 @@ def do_compile(self, args):
             at_gen_srcs.append(os.path.join(TILER_DSP_GENERATOR_PATH, "DSP_Generators.c"))
 
         objs = cc.compile(
-            srcs + at_gen_srcs,
+            sources=at_gen_srcs + srcs,
+            output_dir=args.model_dir,
             debug=1,
             extra_preargs=["-g"]
         )
diff --git a/tools/nntool/interpreter/commands/dsp_preprocessing.py b/tools/nntool/interpreter/commands/dsp_preprocessing.py
index e8cf8e192..4d9c1e29c 100644
--- a/tools/nntool/interpreter/commands/dsp_preprocessing.py
+++ b/tools/nntool/interpreter/commands/dsp_preprocessing.py
@@ -13,6 +13,8 @@
 # You should have received a copy of the GNU Affero General Public License
 # along with this program.  If not, see <https://www.gnu.org/licenses/>.
 
+from graph.types.base import NNEdge
+from graph.types.others import ReshapeParameters
 import json
 import logging
 
@@ -26,9 +28,11 @@
 
 LOG = logging.getLogger("nntool")
 
+
 class DSPPreprocessingCommand(NNToolShellBase):
     # GEN COMMAND
     parser_compile = Cmd2ArgumentParser()
+
     def inputs_choices(self):
         if self.G is None:
             return []
@@ -37,8 +41,8 @@ def inputs_choices(self):
     def dsp_types(self):
         return [clas.__name__ for clas in DSPParameters.__subclasses__()]
 
-
-    parser_dsp = Cmd2ArgumentParser("inserts dsp preprocessing node into graphs")
+    parser_dsp = Cmd2ArgumentParser(
+        "inserts dsp preprocessing node into graphs")
     parser_dsp.add_argument('input_node',
                             choices_method=inputs_choices,
                             help='input node name to format')
@@ -49,6 +53,10 @@ def dsp_types(self):
                             help='path to the config file for mfcc')
     parser_dsp.add_argument('--n_fft', type=int,
                             help="n_fft bins")
+    parser_dsp.add_argument('--n_frames', type=int,
+                            help="number of frames")
+    parser_dsp.add_argument('--n_fbanks', type=int,
+                            help="number of filter banks")
     parser_dsp.add_argument('--frame_size', default=None, type=int,
                             help='frame size in samples')
     parser_dsp.add_argument('--frame_step', default=None, type=int,
@@ -68,6 +76,8 @@ def do_dsp_preprocessing(self, args):
         magsquared = args.magsquared
         win_fn = args.window_fn
         preemp_factor = args.preemp_factor
+        n_frames = args.n_frames
+        n_fbanks = args.n_fbanks
         config_dict = None
         if args.config_json:
             with open(args.config_json) as json_file:
@@ -78,38 +88,42 @@ def do_dsp_preprocessing(self, args):
             magsquared = config_dict.get("magsquared", magsquared)
             win_fn = config_dict.get("window_fn", win_fn)
             preemp_factor = config_dict.get("preemp_factor", preemp_factor)
+            n_frames = config_dict.get("n_frames", n_frames)
+            n_fbanks = config_dict.get("n_fbanks", n_fbanks)
 
         assert frame_step, "frame_step is required"
-        spect_shape = self.G[args.input_node].out_dims[0].shape
-        if len(spect_shape) > 2:
-            if 1 in spect_shape:
-                temp = spect_shape[::-1]
-                temp.remove(1)
-                spect_shape = temp[::-1]
-            LOG.info(f"spectrogram shape expected as {spect_shape}")
-        n_frames = spect_shape[-2]
-        n_fbanks = spect_shape[-1]
+        assert n_fbanks and n_frames, "n_frames and n_fbanks are required"
+        org_input_dim = self.G[args.input_node].out_dims[0]
+        if org_input_dim.size() != (n_frames * n_fbanks):
+            raise ValueError(
+                f"Next layer has dimension {org_input_dim} (size: {org_input_dim.size()}) while you are trying to insert a DSP params with output size of {n_fbanks*n_frames} ({n_frames}x{n_fbanks})")
         LOG.info(f"N FRAMES: {n_frames}")
         new_input_size = frame_step * (n_frames - 1) + frame_size
         if args.dsp_node_type == "MFCCPreprocessingParameters":
-            dsp_params = MFCCPreprocessingParameters("MfccPreprocessing", conf_dict=config_dict)
+            dsp_params = MFCCPreprocessingParameters(
+                "MfccPreprocessing", conf_dict=config_dict)
             win_lut, fft_twiddles, swaptable, rfft_twiddles = dsp_params.gen_fft_twiddles()
             melfilt_coeff_sparse_node, melfilt_sparsity_node = dsp_params.gen_melfilter()
-            dct_matrix_node = dsp_params.gen_dct_matrix()(self.G) if dsp_params.n_dct else None
-            dsp_params_ref = dsp_params(None, win_lut(self.G) if win_lut else win_lut, fft_twiddles(self.G), swaptable(self.G), rfft_twiddles(self.G), melfilt_sparsity_node(self.G), melfilt_coeff_sparse_node(self.G), dct_matrix_node)
+            dct_matrix_node = dsp_params.gen_dct_matrix()(
+                self.G) if dsp_params.n_dct else None
+            dsp_params(None, win_lut(self.G) if win_lut else win_lut, fft_twiddles(self.G), swaptable(self.G), rfft_twiddles(
+                self.G), melfilt_sparsity_node(self.G), melfilt_coeff_sparse_node(self.G), dct_matrix_node)
         elif args.dsp_node_type == "RFFT2DPreprocessingParameters":
-            dsp_params = RFFT2DPreprocessingParameters("RfftPreprocessing", conf_dict=config_dict)
+            dsp_params = RFFT2DPreprocessingParameters(
+                "RfftPreprocessing", conf_dict=config_dict)
             win_lut, fft_twiddles, swaptable, rfft_twiddles = dsp_params.gen_fft_twiddles()
-            dsp_params_ref = dsp_params(None, win_lut(self.G) if win_lut else win_lut, fft_twiddles(self.G), swaptable(self.G), rfft_twiddles(self.G))
+            dsp_params(None, win_lut(self.G) if win_lut else win_lut, fft_twiddles(
+                self.G), swaptable(self.G), rfft_twiddles(self.G))
 
-        new_input_node = InputParameters(args.input_node, dims=Dim.unnamed([new_input_size]))
+        new_input_node = InputParameters(
+            args.input_node, dims=Dim.unnamed([new_input_size]))
         input_node_edge = self.G.out_edges(args.input_node)[0]
         input_node_edge.from_node.in_dims[0] = Dim.unnamed([new_input_size])
         self.G.insert_node_at_edge(dsp_params, input_node_edge)
         self.G.replace_node(self.G[args.input_node], new_input_node)
+        dsp_out_dim = dsp_params.get_output_size([Dim.unnamed([new_input_size])])[0]
+        if dsp_out_dim != org_input_dim:
+            reshape = ReshapeParameters("reshape_dsp", old_shape=dsp_out_dim.shape, shape=org_input_dim.shape)
+            self.G.insert_node_after(
+                    dsp_params, reshape, from_idx=0, edge_class=NNEdge)
         self.G.add_dimensions()
-
-
-
-
-
diff --git a/tools/nntool/interpreter/commands/dump.py b/tools/nntool/interpreter/commands/dump.py
index b665bbfda..ec05f2acc 100644
--- a/tools/nntool/interpreter/commands/dump.py
+++ b/tools/nntool/interpreter/commands/dump.py
@@ -18,22 +18,56 @@
 import pickle
 
 import numpy as np
-from PIL import Image, ImageDraw
 from cmd2 import Cmd, Cmd2ArgumentParser, with_argparser
-
 from execution.graph_executer import GraphExecuter
 from execution.quantization_mode import QuantizationMode
-from graph.types import SSDDetectorParameters
 from interpreter.nntool_shell_base import NNToolShellBase, no_history
-from interpreter.shell_utils import (glob_input_files,
-                                     input_options)
+from interpreter.shell_utils import glob_input_files, input_options
+from PIL import Image, ImageDraw
+from utils.at_norm import get_do_rounding, set_do_rounding
 from utils.data_importer import import_data
 from utils.node_id import NodeId
-from utils.at_norm import set_do_rounding, get_do_rounding
+
+from graph.dump_tensor import PrintDumper, dump_tensor
+from graph.types import ConstantInputParameters, SSDDetectorParameters
 
 LOG = logging.getLogger('nntool.'+__name__)
 
 
+def print_intermediates(G, outputs, limit=None, width=8,
+                        precision=4, channel=None, order=None,
+                        checksum=False, print_constants=False):
+    """Print step outputs (or their checksums) for one step (limit) or every step.
+
+    NOTE: print_constants is accepted but never read by this function.
+    """
+    def show_step(idx, tensors):
+        node = G.graph_state.steps[idx]['node']
+        if not checksum:
+            print(node.name)
+        for pos, tensor in enumerate(tensors):
+            if checksum:
+                # constant inputs are left out of the checksum listing
+                if isinstance(node, ConstantInputParameters):
+                    continue
+                # uint8 tensors are cast to int8 before being summed
+                total = np.sum(tensor.astype(np.int8) if tensor.dtype == np.uint8 else tensor)
+                print(f"S{idx} - {node.name}\n\tChecksum = {total}")
+                continue
+            dims = node.out_dims[pos]
+            reorder = (order is not None and dims.is_named and order != dims.order
+                       and all(k in dims.order for k in order))
+            if reorder:
+                tensor = tensor.transpose(dims.transpose_to_order(order))
+            if channel is not None:
+                tensor = tensor[channel:channel + 1, ...]
+            dump_tensor(tensor, PrintDumper(tensor, width=width, precision=precision))
+
+    for step_idx in (range(len(outputs)) if limit is None else [limit]):
+        show_step(step_idx, outputs[step_idx])
+    print()
+
+
 class DumpCommand(NNToolShellBase):
     # DUMP COMMAND
     parser_dump = Cmd2ArgumentParser()
@@ -126,9 +160,9 @@ def do_dump(self, args: argparse.Namespace):
             if args.pickle or self._in_py or args.save:
                 pickles.append(outputs)
             else:
-                self.G.print_intermediates(outputs, limit=step, width=args.number_width,
-                                           precision=args.precision, channel=args.channel,
-                                           order=['c', 'h', 'w'], checksum=args.checksum)
+                print_intermediates(self.G, outputs, limit=step, width=args.number_width,
+                                    precision=args.precision, channel=args.channel,
+                                    order=['c', 'h', 'w'], checksum=args.checksum)
 
             if args.visualize_detection:
                 img_in = Image.open(file_per_input[0]).convert('RGBA')
diff --git a/tools/nntool/interpreter/commands/fquant.py b/tools/nntool/interpreter/commands/fquant.py
index 251d6aeeb..baa78b0d2 100644
--- a/tools/nntool/interpreter/commands/fquant.py
+++ b/tools/nntool/interpreter/commands/fquant.py
@@ -15,9 +15,11 @@
 
 import argparse
 import logging
+from pathlib import Path
 
 import numpy as np
-from cmd2 import Cmd2ArgumentParser, with_argparser
+from cmd2 import Cmd, Cmd2ArgumentParser, with_argparser
+from interpreter.commands.qtune import load_options
 from interpreter.nntool_shell_base import NNToolShellBase
 from quantization.handlers_helpers import (add_options_to_parser,
                                            get_options_from_args)
@@ -55,6 +57,9 @@ class FquantCommand(NNToolShellBase):
     parser_fquant.add_argument('--seed',
                                type=int, default=0,
                                help='numpy random seed, default not set and inputs change every time')
+    parser_fquant.add_argument('--json',
+                               completer_method=Cmd.path_complete,
+                               help='json file containing saved quantization options using qtunesave command')
     add_options_to_parser(parser_fquant)
 
     @with_argparser(parser_fquant)
@@ -65,6 +70,16 @@ def do_fquant(self, args: argparse.Namespace):
 weights and input data are avalaible."""
         self._check_graph()
         opts = get_options_from_args(args)
+        opts = get_options_from_args(args)
+        if args.json:
+            json_path = Path(args.json)
+            if not json_path.exists() or not json_path.is_file():
+                self.perror(f'{json_path} does not exist or is not a file')
+                return
+            json_opts = load_options(json_path)
+            json_opts.update(opts)
+            opts = json_opts
+
         state = ConstantInputParameters.save_compression_state(self.G)
         try:
             if self.replaying_history and self.history_stats:
diff --git a/tools/nntool/interpreter/commands/fusions.py b/tools/nntool/interpreter/commands/fusions.py
index 4fae2751d..bf2bd21ec 100644
--- a/tools/nntool/interpreter/commands/fusions.py
+++ b/tools/nntool/interpreter/commands/fusions.py
@@ -16,13 +16,8 @@
 import texttable
 from cmd2 import Cmd2ArgumentParser, with_argparser
 from interpreter.nntool_shell_base import NNToolShellBase
-from quantization.quantizer.new_quantizer import NewQuantizer
-from quantization.verify_quantization import verify_quantization
 
-from graph.matches.matches import (get_fusion, get_fusions,
-                                   get_pow2_match_group,
-                                   get_scale8_match_group)
-from graph.types import ConstantInputParameters
+from graph.matches.matches import get_fusions
 
 
 class FusionsCommand(NNToolShellBase):
@@ -31,6 +26,9 @@ def fusions_list(self):
         return [elem[0] for elem in get_fusions()]
 
     parser_fusions = Cmd2ArgumentParser("apply fusions to graph")
+    parser_fusions.add_argument('--no_postprocess',
+                                action='store_true',
+                                help="don't run adjust or qtune or rerun fusions (debugging option)")
     parser_fustions_exclusive = parser_fusions.add_mutually_exclusive_group()
     parser_fustions_exclusive.add_argument('-l', '--list',
                                            action='store_true',
@@ -59,31 +57,17 @@ def do_fusions(self, args):
             self.ppaged(table.draw())
             return
         self._check_graph()
-        state = ConstantInputParameters.save_compression_state(self.G)
         try:
             if args.apply:
-                fusions = [get_fusion(name) for name in args.apply]
-                invalid_names = [args.apply[idx] for idx, fusion in enumerate(fusions) if fusion is None]
-                if invalid_names:
-                    self.perror(f'fusion{"s" if len(invalid_names) > 1 else ""} {", ".join(invalid_names)} not found')
-                    return
+                fusions_names = args.apply
             elif args.pow2:
-                fusions = [get_pow2_match_group()]
+                fusions_names = ['pow2_match_group']
             elif args.scale8:
-                fusions = [get_scale8_match_group()]
+                fusions_names = ['scaled_match_group']
             else:
-                self.perror("No fusion set selected. Nothing to do. Select --pow2 or --scale8.")
+                self.perror(
+                    "No fusion set selected. Nothing to do. Select --pow2 or --scale8.")
                 return
-            for fusion in fusions:
-                fusion.match(self.G)
-            self.G.add_dimensions()
-            if self.G.quantization and verify_quantization(self.G):
-                quantizer = NewQuantizer(self.G)
-                quantizer.quantize()
-                problems = verify_quantization(self.G)
-                if problems:
-                    self.perror('quantization issue after fusions')
-                    for problem in problems:
-                        self.perror(problem)
-        finally:
-            ConstantInputParameters.restore_compression_state(self.G, state)
+            self.G.fusions(*fusions_names, no_postprocess=args.no_postprocess)
+        except ValueError as ex:
+            self.perror(f'{ex}')
diff --git a/tools/nntool/interpreter/commands/gen.py b/tools/nntool/interpreter/commands/gen.py
index 6e083a8c8..ccbd77b9e 100644
--- a/tools/nntool/interpreter/commands/gen.py
+++ b/tools/nntool/interpreter/commands/gen.py
@@ -16,14 +16,19 @@
 import argparse
 import logging
 import os
+
 from cmd2 import Cmd, Cmd2ArgumentParser, with_argparser
-from interpreter.nntool_shell_base import NNToolShellBase, no_history
-from utils.data_importer import import_data
 from execution.graph_executer import GraphExecuter
 from execution.quantization_mode import QuantizationMode
-from generation.default_template import basic_kernel_header_template, basic_kernel_source_template, default_template, dynamic_template, header_template
-from generation.naming_convension import DefaultNamingConvension
 from generation.code_generator import CodeGenerator
+from generation.default_template import (basic_kernel_header_template,
+                                         basic_kernel_source_template,
+                                         default_template, dynamic_template,
+                                         header_template)
+from generation.gen_utils import write_empty
+from generation.naming_convension import DefaultNamingConvension
+from interpreter.nntool_shell_base import NNToolShellBase, no_history
+from utils.data_importer import import_data
 
 LOG = logging.getLogger("nntool")
 
@@ -92,9 +97,11 @@ def do_gen(self, args):
         self.settings['basic_kernel_source_file'] = args.basic_kernel_source_file
         self.settings['basic_kernel_header_file'] = args.basic_kernel_header_file
         self.settings['anonymise'] = args.anonymise
-        os.makedirs(os.path.abspath(self.settings['model_directory']), mode=0o750, exist_ok=True)
-        os.makedirs(os.path.abspath(self.settings['tensor_directory']), mode=0o750, exist_ok=True)
-        code_gen = CodeGenerator(self.G, DefaultNamingConvension(self.G, anonymise=args.anonymise), self.settings)
+        os.makedirs(os.path.abspath(
+            self.settings['model_directory']), mode=0o750, exist_ok=True)
+        os.makedirs(os.path.abspath(
+            self.settings['tensor_directory']), mode=0o750, exist_ok=True)
+        code_gen = CodeGenerator(self.G, DefaultNamingConvension(anonymise=args.anonymise), self.settings)
 
         if self.settings['template_file']:
             code_template = dynamic_template(self.settings['template_file'])
@@ -108,18 +115,28 @@ def do_gen(self, args):
             if self.G.has_expressions:
                 with open(os.path.join(self.settings['model_directory'],
                                        args.basic_kernel_source_file), "w") as output_fp:
-                    output_fp.write(basic_kernel_source_template(self.G, code_generator=code_gen))
+                    output_fp.write(basic_kernel_source_template(
+                        self.G, code_generator=code_gen))
                 with open(os.path.join(self.settings['model_directory'],
                                        args.basic_kernel_header_file), "w") as output_fp:
-                    output_fp.write(basic_kernel_header_template(self.G, code_generator=code_gen))
+                    output_fp.write(basic_kernel_header_template(
+                        self.G, code_generator=code_gen))
+            else:
+                write_empty(self.settings['model_directory'],
+                            args.basic_kernel_source_file, "no expressions used")
+                write_empty(self.settings['model_directory'],
+                            args.basic_kernel_header_file, "no expressions used")
         else:
             self.ppaged(code_template(self.G, code_generator=code_gen))
             if self.G.has_expressions:
-                self.ppaged(basic_kernel_source_template(self.G, code_generator=code_gen))
-                self.ppaged(basic_kernel_header_template(self.G, code_generator=code_gen))
+                self.ppaged(basic_kernel_source_template(
+                    self.G, code_generator=code_gen))
+                self.ppaged(basic_kernel_header_template(
+                    self.G, code_generator=code_gen))
         if args.output_tensors:
             code_gen.write_constants()
 
         if args.header_file:
             with open(os.path.join(self.settings['model_directory'], args.header_file), "w") as output_fp:
-                output_fp.write(header_template(self.G, code_generator=code_gen))
+                output_fp.write(header_template(
+                    self.G, code_generator=code_gen))
diff --git a/tools/nntool/interpreter/commands/gen_project.py b/tools/nntool/interpreter/commands/gen_project.py
index b8202d8b3..83457d548 100644
--- a/tools/nntool/interpreter/commands/gen_project.py
+++ b/tools/nntool/interpreter/commands/gen_project.py
@@ -36,6 +36,7 @@
 from generation.default_template import (basic_kernel_header_template,
                                          basic_kernel_source_template,
                                          default_template)
+from generation.gen_utils import write_empty
 from generation.naming_convension import DefaultNamingConvension
 from interpreter.commands.aquant import AquantCommand
 from interpreter.commands.open import OpenCommand
@@ -134,6 +135,11 @@ def do_gen_project(self, args):
         self._check_quantized()
         self._check_adjusted()
 
+        if "GAP_SDK_HOME" not in os.environ or "NNTOOL_PATH" not in os.environ:
+            self.perror(
+                'you must run "source sourceme.sh" in the GAP SDK before using this command')
+            return
+
         if args.input_tensors:
             if args.input_tensors not in self.tensor_store:
                 self.perror(
@@ -217,7 +223,7 @@ def do_performance(self, args):
         self._check_graph()
         self._check_quantized()
         self._check_adjusted()
-        if "GAP_SDK_HOME" not in os.environ:
+        if "GAP_SDK_HOME" not in os.environ or "NNTOOL_PATH" not in os.environ:
             self.perror(
                 'you must run "source sourceme.sh" in the GAP SDK before using this command')
             return
@@ -281,7 +287,7 @@ def do_performance(self, args):
                     self.tensor_store[args.output_tensors] = at_map_tensors(
                         self.G, at_tensor_loader_int(fp))
 
-            match_perf = r" +((?:S\d+|Tota)[^:]+): *Cycles: +(\d+)[^:]+: +(\d+)[^:]+: +([\d<.]+)"
+            match_perf = r" +((?:S\d+|Tota)[^:]+): *Cycles: +(\d+)[^:]+: +(\d+)[^:]+: +(.+)"
             matcher = re.compile(match_perf)
             perf = matcher.findall(res.stdout)
             if not perf:
@@ -356,7 +362,9 @@ def process_script(script):
         if line.startswith('aquant'):
             # add abs path for input files and try to remake command
             args = aquant_parser.parse_args(line.rstrip().split(' ')[1:])
-            input_files = [os.path.abspath(f) for f in args.input_files if f != '']
+            input_files = [os.path.abspath(f)
+                           for f in args.input_files if f != '']
+#pylint: disable=singleton-comparison
             opts = [f"--{k} {v}" if v != True else f"--{k}" for k, v in vars(args).items()
                     if v and k != 'input_files']
             line = " ".join(['aquant'] + opts + input_files)
@@ -373,7 +381,7 @@ def gen_project(G, settings, project_folder, script_commands, overwrite=False, p
     settings['graph_produce_operinfos'] = True
 
     code_gen = CodeGenerator(
-        G, DefaultNamingConvension(G), settings)
+        G, DefaultNamingConvension(), settings)
 
     if not os.path.exists(project_folder):
         os.mkdir(project_folder)
@@ -447,7 +455,7 @@ def gen_project(G, settings, project_folder, script_commands, overwrite=False, p
             if script_commands[-1] != "save_state":
                 fp.write('save_state\n')
     if gen_atproject:
-        code_gen = CodeGenerator(G, DefaultNamingConvension(G), settings)
+        code_gen = CodeGenerator(G, DefaultNamingConvension(), settings)
         with open(os.path.join(project_folder, 'Model.c'), "w") as output_fp:
             output_fp.write(default_template(G, code_generator=code_gen))
         if G.has_expressions:
@@ -457,6 +465,12 @@ def gen_project(G, settings, project_folder, script_commands, overwrite=False, p
             with open(os.path.join(project_folder, "Expression_Kernels.h"), "w") as output_fp:
                 output_fp.write(basic_kernel_header_template(
                     G, code_generator=code_gen))
+        else:
+            write_empty(project_folder, "Expression_Kernels.c",
+                        "no expressions used")
+            write_empty(project_folder, "Expression_Kernels.h",
+                        "no expressions used")
+
         code_gen.write_constants(tensor_directory=project_folder)
     ignore_function = None if overwrite else skip_existing_files(
         project_folder)
diff --git a/tools/nntool/interpreter/commands/open.py b/tools/nntool/interpreter/commands/open.py
index 2db6d849f..1e317bb5f 100644
--- a/tools/nntool/interpreter/commands/open.py
+++ b/tools/nntool/interpreter/commands/open.py
@@ -179,7 +179,11 @@ def do_open(self, args: argparse.Namespace):
         else:
             # reset the current graph
             self._graphs[self._graph_idx] = NO_GRAPH.copy()
-        self.__open_graph(args)
+        try:
+            self.__open_graph(args)
+        except FileNotFoundError:
+            self.perror(f'{args.nnfile} not found')
+            return
         self._update_prompt()
         self.py_locals['G'] = self.G
 
diff --git a/tools/nntool/interpreter/commands/qtune.py b/tools/nntool/interpreter/commands/qtune.py
index 1eaa78c9f..e7b60a90f 100644
--- a/tools/nntool/interpreter/commands/qtune.py
+++ b/tools/nntool/interpreter/commands/qtune.py
@@ -75,7 +75,7 @@ def qtune_first_arg_mapper(self, nodestr):
     parser_tune.add_argument(
         '--json',
         completer_method=Cmd.path_complete,
-        help='json file to save quantization options')
+        help='json file to load quantization options from')
 
     @with_argparser(parser_tune, ns_provider=capture_shell)
     def do_qtune(self, args):
@@ -102,8 +102,7 @@ def reduction(state, x):
             if not json_path.exists() or not json_path.is_file():
                 self.perror(f'{json_path} does not exist or is not a file')
                 return
-            with json_path.open('r') as fp:
-                options = json.load(fp, cls=JsonSerializableStateDecoder)
+            options = load_options(json_path)
         else:
             options = {}
 
@@ -115,6 +114,17 @@ def reduction(state, x):
         quantizer.quantize()
         self.pfeedback('quantization options set')
 
+def load_options(file_path):
+    """Load saved quantization options: the 'global' mapping plus one NodeId keyed entry per 'nodes' element."""
+    with file_path.open('r') as fp:
+        saved = json.load(fp, cls=JsonSerializableStateDecoder)
+    options = saved['global']
+    if any('node_name' not in entry for entry in saved['nodes']):
+        raise ValueError('node option missing node id')
+    options.update((NodeId(entry['node_name']), {k: v for k, v in entry.items() if k != "node_name"}) for entry in saved['nodes'])
+    return options
+
+
 class QTuneSaveCommand(NNToolShellBase):
 
     # QTUNESAVE COMMAND
@@ -126,13 +136,35 @@ class QTuneSaveCommand(NNToolShellBase):
     @with_argparser(parser_qtune_save)
     def do_qtunesave(self, args):
         """
-Save set quantization options."""
+Save set quantization options.
+
+You can manually edit quantization options in the file.
+The global section contains options that will be applied to the whole graph
+The nodes array contains options for each node that override the global options
+Each nodes entry should be a JSON mapping with a key node_name
+node_name should contain a node name or an array with the fusion name and fusion internal
+node name.
+"""
         self._check_graph()
         self._check_quantized()
         save_path = Path(args.jsonfile).with_suffix('.json')
-        options = self.G.quantization.options.copy()
-        if 'scheme' not in options:
-            options['scheme'] = self.G.quantization.scheme_priority[0]
+        save_options = {
+            "global": {},
+            "nodes": []
+        }
+        for optid, opt in self.G.quantization.options.items():
+            if isinstance(optid, NodeId):
+                opt = opt.copy()
+                if 'qtype_ind' in opt:
+                    del opt['qtype_ind']
+                if opt:
+                    opt['node_name'] = optid.id[0] if not optid.id[1] else optid
+                    save_options['nodes'].append(opt)
+            else:
+                save_options['global'][optid] = opt
+        # the default scheme is a graph wide option so it belongs in "global", the section load_options reads
+        if 'scheme' not in save_options['global']:
+            save_options['global']['scheme'] = self.G.quantization.scheme_priority[0]
         with save_path.open('w') as fp:
-            json.dump(options, fp, cls=JsonSerializableStateEncoder, indent=2)
+            json.dump(save_options, fp, cls=JsonSerializableStateEncoder, indent=2)
         self.pfeedback(f'quantization options saved to {save_path}')
diff --git a/tools/nntool/interpreter/commands/remove.py b/tools/nntool/interpreter/commands/remove.py
index f5b716a7d..3283f59c8 100644
--- a/tools/nntool/interpreter/commands/remove.py
+++ b/tools/nntool/interpreter/commands/remove.py
@@ -14,6 +14,7 @@
 # along with this program.  If not, see <https://www.gnu.org/licenses/>.
 
 import argparse
+from functools import reduce
 
 from cmd2 import Cmd2ArgumentParser, with_argparser
 from interpreter.nntool_shell_base import NNToolShellBase
@@ -40,6 +41,9 @@ def nodes_choices(self):
     parser_remove.add_argument('-u', '--up',
                                action='store_true',
                                help='when one node is specified remove it and everything above it')
+    parser_remove.add_argument('--leave',
+                               action='store_true',
+                               help='when one node is specified only remove what is above or below and not the node itself')
 
     @with_argparser(parser_remove)
     def do_remove(self, args: argparse.Namespace):
@@ -51,36 +55,53 @@ def do_remove(self, args: argparse.Namespace):
         node_from = self.G[args.nodes[0]]
         if len(args.nodes) == 1:
             if args.up:
-                nodes_above = self.G.nodes_above(node_from)
-                out_edges = self.G.indexed_out_edges(node_from)
-                nodes_above.add(node_from)
+                nodes_above = set(self.G.nodes_above(node_from))
+                if args.leave:
+                    remove_nodes = nodes_above
+                    inputs_on = []
+                    dims = node_from.in_dims
+                    for in_edge in self.G.indexed_in_edges(node_from):
+                        if isinstance(in_edge.from_node, ConstantInputParameters):
+                            nodes_above.remove(in_edge.from_node)
+                        else:
+                            inputs_on.append([in_edge])
+                else:
+                    dims = node_from.out_dims
+                    remove_nodes = nodes_above | {node_from}
+                    inputs_on = self.G.indexed_out_edges(node_from)
+
                 input_names = sorted(
-                    [node.name for node in nodes_above if isinstance(node, InputParameters)])
-                self.G.remove_all(nodes_above | {node_from})
-                for idx, edge_group in enumerate(out_edges):
+                    [node.name for node in remove_nodes if isinstance(node, InputParameters)])
+                self.G.remove_all(remove_nodes)
+
+                for idx, edge_group in enumerate(inputs_on):
                     name = input_names.pop(0) if input_names else None
-                    in_node = self.G.add_input(
-                        node_from.out_dims[idx], name=name)
+                    in_node = self.G.add_input(dims[idx], name=name)
                     self.pfeedback(f'adding input {in_node.name}')
                     for edge in edge_group:
                         self.G.add_edge(NNEdge(from_node=in_node,
                                                to_idx=edge.to_idx,
                                                to_node=edge.to_node))
             else:
-                nodes_below = self.G.nodes_below(node_from)
-                for node in list(nodes_below):
-                    nodes_below.update(edge.from_node for edge in self.G.in_edges(node)
-                                       if isinstance(edge.from_node, ConstantInputParameters))
+                nodes_below = set(self.G.nodes_below(node_from))
                 if self.G.is_vertex_cut(nodes_below):
                     self.perror(
                         f'removing everything below {node_from.name} would split the graph which is not permitted')
                     return
-                nodes_below.add(node_from)
-                in_edges = self.G.in_edges(node_from.name)
+                if args.leave:
+                    remove_nodes = nodes_below
+                    outputs_on = [edge_bundle[0]
+                                  for edge_bundle in self.G.indexed_out_edges(node_from)]
+                else:
+                    input_nodes = {edge.from_node for edge in self.G.in_edges(node_from)
+                                   if isinstance(edge.from_node, (InputParameters, ConstantInputParameters))}
+                    remove_nodes = nodes_below | {node_from} | input_nodes
+                    outputs_on = self.G.indexed_in_edges(node_from)
                 output_names = sorted(
-                    [node.name for node in nodes_below if isinstance(node, OutputParameters)])
-                self.G.remove_all(nodes_below)
-                for edge in in_edges:
+                    [node.name for node in remove_nodes if isinstance(node, OutputParameters)])
+
+                self.G.remove_all(remove_nodes)
+                for edge in outputs_on:
                     name = output_names.pop(0) if output_names else None
                     out_node = self.G.add_output(name=name)
                     self.pfeedback(f'adding output {out_node.name}')
@@ -98,8 +119,11 @@ def do_remove(self, args: argparse.Namespace):
                     f'all paths from {node_from.name} must lead to {node_to.name}')
                 return
 
-            edges_from = self.G.indexed_out_edges(node_from)
-            edges_to = self.G.indexed_in_edges(node_to.name)
+            edges_from = set(self.G.out_edges(node_from))
+            edges_to = set(self.G.in_edges(node_to.name))
+            between_edges = reduce(lambda s, x: s|set(self.G.edges(x)), nodes_between, set())
+            edges_from = edges_from.intersection(between_edges)
+            edges_to = edges_to.intersection(between_edges)
             if len(edges_from) != len(edges_to):
                 self.perror(
                     f"{node_from.name} has a different number of outputs than {node_to.name}'s inputs")
diff --git a/tools/nntool/interpreter/generator.py b/tools/nntool/interpreter/generator.py
index 5a89cc584..c271c2c65 100644
--- a/tools/nntool/interpreter/generator.py
+++ b/tools/nntool/interpreter/generator.py
@@ -24,6 +24,7 @@
                                          basic_kernel_source_template,
                                          default_template, dynamic_template,
                                          header_template)
+from generation.gen_utils import write_empty
 from generation.naming_convension import DefaultNamingConvension
 from interpreter.nntool_shell import NNToolShell
 
@@ -51,7 +52,6 @@ def write_template(G, code_gen, model_directory, model_file, template, template_
     with open(model_path, "w") as output_fp:
         output_fp.write(model)
 
-
 def generate_code(args):
     LOG.propagate = False
 
@@ -85,7 +85,7 @@ def generate_code(args):
     os.makedirs(os.path.abspath(opts['model_directory']), mode=0o750, exist_ok=True)
     os.makedirs(os.path.abspath(opts['tensor_directory']), mode=0o750, exist_ok=True)
 
-    code_gen = CodeGenerator(G, DefaultNamingConvension(G, anonymise=opts.get('anonymise')), opts)
+    code_gen = CodeGenerator(G, DefaultNamingConvension(anonymise=opts.get('anonymise')), opts)
     if args.template_file:
         code_template = dynamic_template(args.template_file)
     else:
@@ -96,6 +96,9 @@ def generate_code(args):
                        opts['basic_kernel_header_file'], basic_kernel_header_template, "kernel headers")
         write_template(G, code_gen, opts['model_directory'],
                        opts['basic_kernel_source_file'], basic_kernel_source_template, "kernel source")
+    else:
+        write_empty(opts['model_directory'], opts['basic_kernel_header_file'], "no expressions used")
+        write_empty(opts['model_directory'], opts['basic_kernel_source_file'], "no expressions used")
 
     if args.header_file:
         with open(os.path.join(opts['model_directory'], args.header_file), "w") as output_fp:
diff --git a/tools/nntool/quantization/clipping.py b/tools/nntool/quantization/clipping.py
new file mode 100644
index 000000000..3cd8b8695
--- /dev/null
+++ b/tools/nntool/quantization/clipping.py
@@ -0,0 +1,99 @@
+# Copyright (C) 2020  GreenWaves Technologies, SAS
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Affero General Public License for more details.
+
+# You should have received a copy of the GNU Affero General Public License
+# along with this program.  If not, see <https://www.gnu.org/licenses/>.
+
+import math
+
+import numpy as np
+
+# Clip thresholds per bit width (2..16) for unit sigma / unit b. Presumably generated by minimising
+# the error estimates below over alpha, along these lines (argument order adapted - TODO confirm):
+# import scipy.optimize as opt
+# ALPHA_GAUS = {m: opt.minimize_scalar(lambda x: mse_gaussian(1.0, x, m)).x for m in range(2,17,1)}
+# ALPHA_LAPLACE = {m: opt.minimize_scalar(lambda x: mse_laplace(1.0, x, m)).x for m in range(2,17,1)}
+
+ALPHA_GAUS = {2: 1.7106351863419305, 3: 2.1515927416420935, 4: 2.559136455058456, 5: 2.9362006203824396,
+              6: 3.2869143669161147, 7: 3.615114210893466, 8: 3.924034014599462, 9: 4.216330919936089,
+              10: 4.494170448727792, 11: 4.759309171709873, 12: 5.013218066309031, 13: 5.2570849373594974,
+              14: 5.491968790304721, 15: 5.7186999893215, 16: 5.937970657819115}
+
+ALPHA_LAPLACE = {2: 2.830682989304011, 3: 3.89722946313961, 4: 5.028640140480669, 5: 6.204766334217521,
+                 6: 7.413126215019491, 7: 8.645619949475485, 8: 9.896759823828738, 9: 11.16268502214751,
+                 10: 12.440591336219248, 11: 13.728384769877623, 12: 15.024464757336403, 13: 16.32758309514459,
+                 14: 17.6367486184042, 15: 18.95116231019748, 16: 20.270171640301292}
+GAUSSIAN_CONST = (0.5 * 0.35) * (1 + (math.pi * math.log(4)) ** 0.5)
+
+
+def get_alpha_laplace(num_bits, stat):
+    """Return (alpha, b): ALPHA_LAPLACE[num_bits] scaled by stat['b'], together with stat['b'] itself."""
+    return ALPHA_LAPLACE[num_bits] * stat['b'], stat['b']
+
+
+def get_alpha_gaus(shape, num_bits, stat):
+    """Return (alpha, std): std is estimated from the stat range and element count, alpha scales it."""
+    # NOTE(review): a single element shape gives log(1) == 0 and so a zero divisor below
+    size = np.prod(shape)
+    std = ((stat['max'] - stat['min']) * GAUSSIAN_CONST) / ((2 * math.log(size)) ** 0.5)
+    return ALPHA_GAUS[num_bits] * std, std
+
+
+def mse_laplace(b, alpha, num_bits):  # presumably the Laplace counterpart of mse_gaussian for scale b - TODO confirm
+    return 2 * (b ** 2) * np.exp(-alpha / b) + ((alpha ** 2) / (3 * 2 ** (2 * num_bits)))
+
+
+def mse_gaussian(sigma, alpha, num_bits):
+    """Return the clipping error plus the quantization error for threshold alpha, std sigma and num_bits bits."""
+    tail = 1 - math.erf(alpha / (sigma * np.sqrt(2.0)))
+    density = np.e ** ((-1) * (0.5 * (alpha ** 2)) / sigma ** 2)
+    clipping_err = (sigma ** 2 + (alpha ** 2)) * tail - np.sqrt(2.0 / np.pi) * alpha * sigma * density
+    return clipping_err + (alpha ** 2) / (3 * (2 ** (2 * num_bits)))
+
+
+def alpha2DeltaOffset(self, alpha, max_value, min_value, mean):
+    """Return (delta, offset): the full range when alpha is unusable, else a 2 * alpha window near mean.
+
+    NOTE(review): self is never read; this is a module-level function."""
+    full_range = max_value - min_value
+    if alpha <= 0 or alpha >= full_range / 2:
+        return full_range, min_value
+    window = 2 * alpha
+    return window, max(min_value, mean - window / 2)
+
+
+def get_clip(shape, num_bits, stat, clip_type):
+    """Return the (min, max) range to use for clip_type 'laplace', 'gaus', 'mix' or 'none'.
+
+    'mix' keeps whichever of the laplace and gaussian thresholds has the lower estimated error.
+    The unclipped stat range is returned for 'none' or when the threshold is not inside it.
+    Raises ValueError for any other clip_type."""
+    if clip_type == "none":
+        return stat['min'], stat['max']
+    if clip_type == "laplace":
+        alpha = get_alpha_laplace(num_bits, stat)[0]
+    elif clip_type == "gaus":
+        alpha = get_alpha_gaus(shape, num_bits, stat)[0]
+    elif clip_type == "mix":
+        # evaluate both candidates and keep the one with the smaller error estimate
+        alpha_laplace, b = get_alpha_laplace(num_bits, stat)
+        alpha_gaus, std = get_alpha_gaus(shape, num_bits, stat)
+        laplace_wins = mse_laplace(b, alpha_laplace, num_bits) < mse_gaussian(std, alpha_gaus, num_bits)
+        alpha = alpha_laplace if laplace_wins else alpha_gaus
+    else:
+        raise ValueError('unknown clip type')
+
+    if alpha <= 0 or alpha >= (stat['max'] - stat['min']) / 2:
+        return stat['min'], stat['max']
+
+    lower = max(stat['min'], stat['mean'] - alpha)
+    return lower, lower + 2 * alpha
diff --git a/tools/nntool/quantization/multiplicative/quantizers/activation_mult.py b/tools/nntool/quantization/multiplicative/quantizers/activation_mult.py
index ddd6f2607..56c24ed63 100644
--- a/tools/nntool/quantization/multiplicative/quantizers/activation_mult.py
+++ b/tools/nntool/quantization/multiplicative/quantizers/activation_mult.py
@@ -30,35 +30,93 @@
 from quantization.new_qrec import QRec
 from quantization.qtype import QType
 from quantization.unified_quantization_handler import (in_qs_constraint,
-                                                       out_qs_constraint,
-                                                       params_type, priority)
+                                                       out_qs_constraint,option_constraint,
+                                                       params_type, options)
 
 from ..mult_quantization_handler import MultQuantizionHandler
+from quantization.quantizer_options import *
 
 LOG = logging.getLogger('nntool.' + __name__)
 
 
+@options(
+    FORCE_OUTPUT_SIZE_OPTION,
+)
 class ActivationMultSWBase(MultQuantizionHandler):
     @classmethod
-    def _quantize_sw(cls, params, in_qs, stats, in_dtype, out_dtype, **kwargs):
+    def _quantize_sw(cls, params, in_qs, stats, in_dtype, out_dtype, out_asym, **kwargs):
         force_out_qs, _ = cls.get_mult_opts(**kwargs)
         force_out_q = force_out_qs and force_out_qs[0]
         fusion = kwargs.get('fusion', None)
         in_q = in_qs[0]
-        if not fusion and in_dtype == np.int32:
-            return None
+        if fusion:
+            in_dtype = np.int32
+        bits = 8 if out_dtype == np.int8 or out_dtype == np.uint8 else 16
 
         if isinstance(params, (HSwishActivationParameters, HSigmoidActivationParameters)):
-            if in_q.max < params.upper_bound:
-                # TODO - could do something clever for asymmetric here
-                in_q = QType.from_min_max_sq(-params.upper_bound, params.upper_bound,
-                                             dtype=in_dtype, forced=True)
+            cls.check_valid_ranges(params, stats, idx=0, dirs='in')
+            # we need to be able to represent offset and upper_bound in output dtype
+            # input range should match stats since swish requires the full input range
+            if fusion:
+                # in a fusion the output container is smaller than the input container
+                # The input scale may be too small to represent offset and upper_bound
+                # in the output dtype
+                params_qtype = QType.from_min_max_sq(
+                    0,
+                    np.maximum(
+                        params.upper_bound,
+                        params.offset),
+                    bits=bits,
+                    dtype=out_dtype)
+                in_q = QType.from_min_max_sq(
+                    stats['range_in'][0]['min'],
+                    stats['range_in'][0]['max'],
+                    dtype=in_dtype)
+                # if params scale is larger then we must reduce precision
+                if np.all(params_qtype.scale > in_q.scale):
+                    in_q.scale = params_qtype.scale
+            else:
+                # outside a fusion our in and out dtype is the same
+                # so we just need to check that offset and upper_bound can be represented
+                if in_dtype == np.uint8:
+                    in_dtype = np.int8
+                elif in_dtype == np.uint16:
+                    in_dtype = np.int16
+                if isinstance(params, HSwishActivationParameters):
+                    lower = stats['range_in'][0]['min']
+                    upper = np.maximum(
+                        np.maximum(
+                            params.upper_bound,
+                            params.offset),
+                        stats['range_in'][0]['max'])
+                else:
+                    lower = -params.offset
+                    upper = params.upper_bound
+
+                in_q = QType.from_min_max_sq(
+                    lower,
+                    upper,
+                    dtype=in_dtype)
         elif isinstance(params, (TanHActivationParameters, SigmoidActivationParameters)):
-            in_q = QType.from_min_max_sq(-8, 8, dtype=in_dtype, forced=True)
+            if in_dtype == np.int8:
+                in_q = QType.from_min_max_sq(
+                    -8,
+                    8,
+                    dtype=in_dtype,
+                    forced=True)
+            else:
+                in_q = QType(
+                    dtype=in_dtype,
+                    scale=pow(2, -12))
+        elif isinstance(params, (HTanHActivationParameters, )):
+            scale = 2 / pow(2, bits)
+            in_q = QType(scale=scale, dtype=in_dtype, forced=True)
+        elif isinstance(params, (LeakyActivationParameters, )):
+            max_out = max(abs(stats['range_out'][0]['max']), abs(stats['range_out'][0]['min']))
+            scale = (2 * max_out) / pow(2, bits)
+            in_q = QType(scale=scale, dtype=in_dtype, forced=True)
 
         if force_out_q:
-            if force_out_q.signed != in_q.signed:
-                return None
             if fusion and fusion.fusion_type in ['conv_active_pool', 'conv_active']:
                 if not isinstance(params, (SigmoidActivationParameters, HTanHActivationParameters,
                                            HSwishActivationParameters, HSigmoidActivationParameters)):
@@ -70,35 +128,43 @@ def _quantize_sw(cls, params, in_qs, stats, in_dtype, out_dtype, **kwargs):
 
         else:
             cls.check_valid_ranges(params, stats, idx=0, dirs='out')
-            if (isinstance(params, ReluActivationParameters) and params.lower_bound == 0 and
-                    in_q.dtype == np.int8):
+            if isinstance(params, ReluActivationParameters):
                 max_val = params.upper_bound if params.upper_bound else stats['range_out'][0]['max']
                 o_q = QType.from_min_max_sq(0,
                                             max_val,
                                             dtype=out_dtype,
                                             asymmetric=(in_q.zero_point != 0))
                 in_q = deepcopy(o_q)
-            elif isinstance(params, (TanHActivationParameters, SigmoidActivationParameters)):
-                if out_dtype == np.int8:
-                    o_q = QType(q=7, dtype=np.int8)
-                elif out_dtype == np.int16:
-                    o_q = QType(q=15, dtype=np.int16)
-                else:
-                    raise NotImplementedError(
-                        'int8 and int16 are implemented as output only')
+            elif isinstance(params, TanHActivationParameters):
+                o_q = QType.from_min_max_sq(
+                    min_val=-1, max_val=1, dtype=out_dtype, asymmetric=out_asym)
+            elif isinstance(params, SigmoidActivationParameters):
+                o_q = QType.from_min_max_sq(
+                    min_val=0, max_val=1, dtype=out_dtype, asymmetric=out_asym)
             elif isinstance(params, LeakyActivationParameters):
                 o_q = QType.from_min_max_sq(stats['range_out'][0]['min'],
                                             stats['range_out'][0]['max'],
-                                            dtype=out_dtype)
-                # force the preceeding filter to clip the negative range
-                in_q = deepcopy(o_q)
+                                            dtype=out_dtype,
+                                            asymmetric=out_asym)
+                in_q.scale = o_q.scale
+            elif isinstance(params, HSigmoidActivationParameters):
+                # hsigmoid prefer to output zeropoint 0 to represent 0 - 1 range
+                o_q = QType.from_min_max_sq(
+                    min_val=0, max_val=1, dtype=out_dtype, asymmetric=out_asym)
+            elif isinstance(params, HSwishActivationParameters):
+                # hswish multiplies 0-upper bound range by input so take the upper bound from stats
+                o_q = QType.from_min_max_sq(stats['range_out'][0]['min'],
+                                            stats['range_out'][0]['max'],
+                                            dtype=out_dtype,
+                                            asymmetric=out_asym)
             else:
                 o_q = QType.from_min_max_sq(stats['range_out'][0]['min'],
                                             stats['range_out'][0]['max'],
-                                            dtype=out_dtype)
+                                            dtype=out_dtype,
+                                            asymmetric=out_asym)
 
         qrec = QRec.scaled(in_qs=[in_q], out_qs=[o_q])
-        qrec = cls.compute_cache(params, qrec)
+        qrec = cls.compute_cache(params, qrec, stats)
         return qrec
 
     @classmethod
@@ -106,32 +172,39 @@ def get_prefered_input_dtypes(cls, params, **kwargs):
         return [np.int8]
 
     @classmethod
-    def compute_cache(cls, params, qrec):
+    def compute_cache(cls, params, qrec, stats):
         scale_mul_biases_q = MultMulBiasScaleQType(dtype=np.uint8)
         qrec.cache['scale_mul_biases_q'] = scale_mul_biases_q
-
-        if isinstance(params, (SigmoidActivationParameters, TanHActivationParameters)):
-            scale_mul_biases_q.scale = (math.pow(
-                2, -7 if qrec.out_qs[0].dtype == np.int8 else -15)/qrec.out_qs[0].scale)
+        if isinstance(params, (ReluActivationParameters)):
+            if params.upper_bound:
+                qrec.cache['upper_bound'] = qrec.in_qs[0].quantize(
+                    params.upper_bound).astype(qrec.out_qs[0].dtype)
+            if params.lower_bound:
+                qrec.cache['lower_bound'] = qrec.in_qs[0].quantize(
+                    params.lower_bound).astype(qrec.out_qs[0].dtype)
+            scale_mul_biases_q.scale = (
+                qrec.in_qs[0].scale/qrec.out_qs[0].scale)
+        elif isinstance(params, (SigmoidActivationParameters, TanHActivationParameters)):
+            scale_mul_biases_q.scale = math.pow(2, -15) / qrec.out_qs[0].scale
+            qrec.cache["zero_point"] = qrec.out_qs[0].zero_point.astype(
+                qrec.out_qs[0].dtype)
         elif isinstance(params, (LeakyActivationParameters)):
             scale_mul_biases_q.scale = (
                 qrec.in_qs[0].scale/qrec.out_qs[0].scale)
             qrec.cache['leak_factor'] = np.int8(
                 params.leak_factor*math.pow(2, 7) + 0.5)
-        elif isinstance(params, HSwishActivationParameters):
-            scale_mul_biases_q.scale = (
-                (qrec.in_qs[0].scale*qrec.in_qs[0].scale * params.mult)/qrec.out_qs[0].scale)
-            qrec.cache['offset'] = qrec.in_qs[0].quantize(params.offset)
-            qrec.cache['mult'] = np.int8(1)
-            qrec.cache['upper_bound'] = qrec.in_qs[0].quantize(
-                params.upper_bound)
-        elif isinstance(params, HSigmoidActivationParameters):
-            scale_mul_biases_q.scale = (
-                (qrec.in_qs[0].scale*params.mult)/qrec.out_qs[0].scale)
-            qrec.cache['offset'] = qrec.in_qs[0].quantize(params.offset)
-            qrec.cache['mult'] = np.int8(1)
+            qrec.cache['zero_point'] = qrec.out_qs[0].zero_point.astype(qrec.out_qs[0].dtype)
+        elif isinstance(params, (HSwishActivationParameters, HSigmoidActivationParameters)):
+            scale = (qrec.in_qs[0].scale * params.mult)/qrec.out_qs[0].scale
+            if isinstance(params, HSwishActivationParameters):
+                # HSwish multiplies HSigmoid by input
+                scale *= qrec.in_qs[0].scale
+            scale_mul_biases_q.scale = scale
+            qrec.cache['offset'] = qrec.in_qs[0].quantize(
+                params.offset).astype(qrec.out_qs[0].dtype)
+            qrec.cache['zero_point'] = qrec.out_qs[0].zero_point
             qrec.cache['upper_bound'] = qrec.in_qs[0].quantize(
-                params.upper_bound)
+                params.upper_bound).astype(qrec.out_qs[0].dtype)
         else:
             scale_mul_biases_q.scale = (
                 qrec.in_qs[0].scale/qrec.out_qs[0].scale)
@@ -139,184 +212,86 @@ def compute_cache(cls, params, qrec):
 
 
 @params_type(ActivationParameters)
-@in_qs_constraint({'dtype': np.int8})
+@in_qs_constraint({'dtype': {np.int8, np.int16, np.int32}})
 @out_qs_constraint({'dtype': np.int8})
-class ActivationMultSW8x8(ActivationMultSWBase):
+@option_constraint(force_output_size={8, None})
+class ActivationMultSW_I_I8(ActivationMultSWBase):
     @classmethod
     def _quantize(cls, params, in_qs, stats, **kwargs):
-        return cls._quantize_sw(params, in_qs, stats, np.int8, np.int8, **kwargs)
+        return cls._quantize_sw(params, in_qs, stats, in_qs[0].dtype, np.int8, out_asym=False, **kwargs)
 
 
-@params_type(ActivationParameters)
-@in_qs_constraint({'dtype': np.int32})
-@out_qs_constraint({'dtype': np.int8})
-@priority(2)
-class ActivationMultSW32x8(ActivationMultSWBase):
+@params_type(HSwishActivationParameters, HSigmoidActivationParameters)
+@in_qs_constraint({'dtype': {np.int8, np.int16, np.int32}})
+@out_qs_constraint({'dtype': np.uint8})
+@option_constraint(force_output_size={8, None})
+class ActivationMultSW_HSwish_I_U8(ActivationMultSWBase):
     @classmethod
-    def _quantize(cls, params, in_qs, stats, **kwargs):
-        return cls._quantize_sw(params, in_qs, stats, np.int32, np.int8, **kwargs)
-
+    def _get_in_qs_from_stats(cls, params, stats, in_qs, **kwargs):
+        dtype = in_qs and in_qs[0] and in_qs[0].dtype
+        if dtype == np.uint16:
+            dtype = np.int16
+        else:
+            dtype = np.int8
+        return [QType.from_min_max_sq(
+            stats['range_in'][0]['min'],
+            stats['range_in'][0]['max'],
+            dtype=dtype)]
 
-@params_type(ActivationParameters)
-@in_qs_constraint({'dtype': np.int16})
-@out_qs_constraint({'dtype': np.int16})
-@priority(2)
-class ActivationMultSW16x16(ActivationMultSWBase):
     @classmethod
     def _quantize(cls, params, in_qs, stats, **kwargs):
-        return cls._quantize_sw(params, in_qs, stats, np.int16, np.int16, **kwargs)
+        return cls._quantize_sw(params, in_qs, stats, in_qs[0].dtype, np.uint8, out_asym=False, **kwargs)
 
 
-@params_type(ActivationParameters)
-@in_qs_constraint({'dtype': np.int32})
-@out_qs_constraint({'dtype': np.int16})
-@priority(2)
-class ActivationMultSW32x16(ActivationMultSWBase):
+@params_type(HSwishActivationParameters, HSigmoidActivationParameters)
+@in_qs_constraint({'dtype': {np.int8, np.int16, np.int32}})
+@out_qs_constraint({'dtype': np.uint16})
+@option_constraint(force_output_size=16)
+class ActivationMultSW_HSwish_I_U16(ActivationMultSWBase):
     @classmethod
-    def _quantize(cls, params, in_qs, stats, **kwargs):
-        return cls._quantize_sw(params, in_qs, stats, np.int32, np.int16, **kwargs)
-
+    def _get_in_qs_from_stats(cls, params, stats, in_qs, **kwargs):
+        dtype = in_qs and in_qs[0] and in_qs[0].dtype
+        if dtype == np.uint16:
+            dtype = np.int16
+        else:
+            dtype = np.int8
+        return [QType.from_min_max_sq(
+            stats['range_in'][0]['min'],
+            stats['range_in'][0]['max'],
+            dtype=dtype)]
 
-@params_type(ReluActivationParameters)
-@in_qs_constraint({'dtype': {np.uint8, np.uint16, np.int8, np.int16}, 'attr': {'ne16': True}})
-@out_qs_constraint({'dtype': {np.uint8, np.uint16, np.int8, np.int16}})
-@priority(3)
-class ActivationMultNe16(MultQuantizionHandler):
     @classmethod
     def _quantize(cls, params, in_qs, stats, **kwargs):
-        force_out_qs, out_dtype = cls.get_mult_opts(**kwargs)
-        force_out_q = force_out_qs and force_out_qs[0]
-        in_q = in_qs[0]
-
-        if force_out_q:
-            # TODO - should uint8 be accepted here if it does not scale to the relu?
-            o_q = deepcopy(force_out_q)
-            in_q = deepcopy(o_q)
-        else:
-            cls.check_valid_ranges(params, stats, idx=0, dirs='out')
-            upper = (stats['range_out'][0]['max'] if params.upper_bound is None
-                     else params.upper_bound)
-            in_q = QType.from_min_max_sq(
-                params.lower_bound, upper, dtype=in_q.dtype, asymmetric=True,
-                ne16=True, dont_copy_attr=['ne16'])
-            o_q = deepcopy(in_q)
-
-        scale_mul_biases_q = MultMulBiasScaleQType(dtype=np.uint8)
-        scale_mul_biases_q.scale = in_q.scale/o_q.scale
-        qrec = QRec.scaled(in_qs=[in_q], out_qs=[o_q],
-                           ne16=True, scale_mul_biases_q=scale_mul_biases_q)
-        return qrec
-
+        return cls._quantize_sw(params, in_qs, stats, in_qs[0].dtype, np.uint16, out_asym=False, **kwargs)
 
-@params_type(HSigmoidActivationParameters, HSwishActivationParameters)
-@in_qs_constraint({'dtype': {np.uint16, np.int32}}) #, 'attr': {'ne16': True}})
-@out_qs_constraint({'dtype': {np.uint16}})
-@priority(3)
-class HSigmoidSwishActivationMultNe16USQ16(MultQuantizionHandler):
-    DEFAULT_DTYPE = np.uint16
 
+@params_type(ActivationParameters)
+@in_qs_constraint({'dtype': {np.int8, np.int16, np.int32}})
+@out_qs_constraint({'dtype': np.int16})
+@option_constraint(force_output_size=16)
+class ActivationMultSW_I_I16(ActivationMultSWBase):
     @classmethod
     def _quantize(cls, params, in_qs, stats, **kwargs):
-        force_out_qs, out_dtype = cls.get_mult_opts(**kwargs)
-        force_out_q = force_out_qs and force_out_qs[0]
-
-        # input ranged to values that count and upper bound must be representable
-        # there is an assumption here that params.offset is always less than params.upper_bound
-        assert params.offset <= params.upper_bound
-        in_q = in_qs[0]
-        max_repr = np.maximum(in_q.max, params.upper_bound)
-        in_q = QType.from_min_max_sq(-max_repr, max_repr, dtype=np.int32, forced=True)
-
-        if force_out_q:
-            # sigmoid and hswish has to output asymmetric with zero point at zero
-            if not force_out_q.zero_point_asymmetric_zero:
-                return None
-            # if the output has been forced then propagate it
-            out_q = deepcopy(force_out_q)
-        elif isinstance(params, HSigmoidActivationParameters):
-            # hsigmoid prefer to output Q16 zeropoint 0 to represent 0 - 1 range
-            out_q = QType(dtype=np.uint16, scale=pow(2, -16), zero_point=0,
-                          ne16=True, dont_copy_attr=['ne16'])
-        else:
-            # hswish multiplies 0-upper bound range by input so take the upper
-            # bound from stats
-            upper = stats['range_out'][0]['max']
-            max_repr = np.maximum(in_q.max, upper)
-            in_q = QType.from_min_max_sq(0, upper, dtype=np.int32,
-                                         ne16=True, dont_copy_attr=['ne16'])
-
-        qrec = QRec.scaled(in_qs=[in_q], out_qs=[out_q], ne16=True)
-        scale_mul_biases_q = MultMulBiasScaleQType(dtype=np.uint8)
-
-        if isinstance(params, HSigmoidActivationParameters):
-            scale_mul_biases_q.scale = (
-                (qrec.in_qs[0].scale*params.mult)/qrec.out_qs[0].scale)
-        elif isinstance(params, HSwishActivationParameters):
-            scale_mul_biases_q.scale = (
-                (qrec.in_qs[0].scale*qrec.in_qs[0].scale*params.mult)/qrec.out_qs[0].scale)
-        else:
-            raise ValueError(f"Unexpacted params type {params}")
-
-        qrec.cache['offset'] = qrec.in_qs[0].quantize(params.offset)
-        qrec.cache['mult'] = np.int16(1)
-        qrec.cache['upper_bound'] = qrec.in_qs[0].quantize(
-            params.upper_bound)
+        return cls._quantize_sw(params, in_qs, stats, in_qs[0].dtype, np.int16, out_asym=False, **kwargs)
 
-        qrec.cache['scale_mul_biases_q'] = scale_mul_biases_q
-        return qrec
 
-
-@params_type(SigmoidActivationParameters)
-@in_qs_constraint({'dtype': {np.uint16, np.int32}, 'attr': {'ne16': True}})
-@out_qs_constraint({'dtype': {np.uint16}})
-@priority(3)
-class SigmoidActivationMultNe16USQ16(MultQuantizionHandler):
+@params_type(LeakyActivationParameters, TanHActivationParameters, SigmoidActivationParameters, ReluActivationParameters)
+@in_qs_constraint({'dtype': {np.uint8, np.int32}})
+@out_qs_constraint({'dtype': np.uint8})
+@option_constraint(force_output_size={8, None})
+class ActivationMultSW_U_U8(ActivationMultSWBase):
+    # This handler should be called only for NE16 for the moment --> out is asym
     @classmethod
     def _quantize(cls, params, in_qs, stats, **kwargs):
-        force_out_qs, out_dtype = cls.get_mult_opts(**kwargs)
-        force_out_q = force_out_qs and force_out_qs[0]
-        in_q = QType(dtype=np.int32, scale=pow(2, -12))
-        if force_out_q:
-            # sigmoid has to output asymmetric with zero point at zero
-            if not force_out_q.zero_point_asymmetric_zero:
-                return None
-            # if the output has been forced then propagate it
-            out_q = deepcopy(force_out_q)
-        else:
-            # 0 to 1 range so prefer Q16 zeropoint 0
-            out_q = QType(dtype=np.uint16, scale=pow(2, -16), zero_point=0,
-                          ne16=True, dont_copy_attr=['ne16'], min_val=0, max_val=1)
-
-        scale_mul_biases_q = MultMulBiasScaleQType(
-            dtype=np.uint8, scale=pow(2, -16)/out_q.scale)
-        qrec = QRec.scaled(in_qs=[in_q], out_qs=[out_q],
-                           ne16=True, scale_mul_biases_q=scale_mul_biases_q)
-        return qrec
+        return cls._quantize_sw(params, in_qs, stats, in_qs[0].dtype, np.uint8, out_asym=True, **kwargs)
 
 
-@params_type(TanHActivationParameters)
-@in_qs_constraint({'dtype': {np.uint16, np.int32}, 'attr': {'ne16': True}})
-@out_qs_constraint({'dtype': {np.uint16}})
-@priority(3)
-class TanHActivationMultNe16USQ16(MultQuantizionHandler):
+@params_type(LeakyActivationParameters, TanHActivationParameters, SigmoidActivationParameters, ReluActivationParameters)
+@in_qs_constraint({'dtype': {np.uint16, np.int32}})
+@out_qs_constraint({'dtype': np.uint16})
+@option_constraint(force_output_size=16)
+class ActivationMultSW_U_U16(ActivationMultSWBase):
+    # This handler should be called only for NE16 for the moment --> out is asym
     @classmethod
     def _quantize(cls, params, in_qs, stats, **kwargs):
-        force_out_qs, out_dtype = cls.get_mult_opts(**kwargs)
-        force_out_q = force_out_qs and force_out_qs[0]
-        in_q = QType(dtype=np.int32, scale=pow(2, -12))
-        if force_out_q:
-            # tanh has to output symmetric with zero point at 32768
-            if not np.all(np.atleast_1d(force_out_q.zero_point) == 32768):
-                return None
-            # if the output has been forced then propagate it
-            out_q = deepcopy(force_out_q)
-        else:
-            # -1 to 1 range so prefer Q15
-            out_q = QType(dtype=np.uint16, scale=pow(2, -15), zero_point=np.array([32768], dtype=np.uint16),
-                          ne16=True, dont_copy_attr=['ne16'], min_val=-1, max_val=1)
-
-        scale_mul_biases_q = MultMulBiasScaleQType(
-            dtype=np.uint8, scale=pow(2, -15)/out_q.scale)
-        qrec = QRec.scaled(in_qs=[in_q], out_qs=[out_q],
-                           ne16=True, scale_mul_biases_q=scale_mul_biases_q)
-        return qrec
+        return cls._quantize_sw(params, in_qs, stats, in_qs[0].dtype, np.uint16, out_asym=True, **kwargs)
diff --git a/tools/nntool/quantization/multiplicative/quantizers/filter_mult.py b/tools/nntool/quantization/multiplicative/quantizers/filter_mult.py
index 6382e89fd..57f1fb857 100644
--- a/tools/nntool/quantization/multiplicative/quantizers/filter_mult.py
+++ b/tools/nntool/quantization/multiplicative/quantizers/filter_mult.py
@@ -13,15 +13,16 @@
 # You should have received a copy of the GNU Affero General Public License
 # along with this program.  If not, see <https://www.gnu.org/licenses/>.
 
-from graph.types.constant_input import ConstantInputParameters
-from graph.types.tensor_arithmetic import MatMulOpParameters, MatMulTransposedParameters
 import logging
 from copy import deepcopy
 
 import numpy as np
 from graph.types import (Conv2DParameters, FcParameters, FusionInputParameters,
-                         HSigmoidActivationParameters, PoolingParameters,
+                         HSigmoidActivationParameters,
                          ReluActivationParameters, SigmoidActivationParameters)
+from graph.types.constant_input import ConstantInputParameters
+from graph.types.tensor_arithmetic import (MatMulTransposedParameters)
+from quantization.clipping import get_clip
 from quantization.multiplicative.quantizers.rnn_mult_ne16 import \
     limit_input_precision
 from quantization.multiplicative.scaling_qtypes import MultMulBiasScaleQType
@@ -97,7 +98,8 @@ def check_options(params, opts=None, **kwargs):
     FORCE_INPUT_SIZE_OPTION,
     FORCE_OUTPUT_SIZE_OPTION,
     HWC_OPTION,
-    MAX_PRECISION_LIMIT_OPTION
+    MAX_PRECISION_LIMIT_OPTION,
+    CLIP_TYPE_OPTION
 )
 # pylint: disable=abstract-method
 class FilterMultBase(MultQuantizionHandler):
@@ -210,14 +212,18 @@ def _quantize_sw(cls, params, in_qs, stats, in_out_dtype, **kwargs):
 
         if force_out_q:
             o_q = force_out_q
-            # can't be forced to something not in_out_dtype
-            if o_q.dtype != in_out_dtype:
+            # can't be forced to something not in_out_dtype or int32
+            if o_q.dtype != in_out_dtype and o_q.dtype != np.int32:
                 return None
             LOG.warning(f'node {params.name} output forced to range {o_q.min}/{o_q.max} '
                         f'{"asymmetric" if o_q.asymmetric else "symmetric"}')
         else:
             cls.check_valid_ranges(params, stats, idx=0, dirs='out')
-            min_val, max_val = stats['range_out'][0]['min'], stats['range_out'][0]['max']
+            min_val, max_val = get_clip(
+                params.out_dims[0].shape,
+                8 if in_out_dtype == np.int8 else 16,
+                stats['range_out'][0],
+                opts['clip_type'])
             o_q = QType.from_min_max_sq(min_val=min_val,
                                         max_val=max_val,
                                         dtype=in_out_dtype,
@@ -273,7 +279,7 @@ def can_handle_asymmetric_input(cls, params, **kwargs):
 
 @params_type(FcParameters, Conv2DParameters)
 @in_qs_constraint({'dtype': np.int8})
-@out_qs_constraint({'dtype': np.int8})
+@out_qs_constraint({'dtype': set([np.int8, np.int32])})
 @option_constraint(check_filter_options(False, input_size={8, None}, output_size={8, None}))
 class FilterSWMult8x8(FilterSWMultBase):
     @classmethod
@@ -283,7 +289,7 @@ def _quantize(cls, params, in_qs, stats, **kwargs):
 
 @params_type(FcParameters, Conv2DParameters)
 @in_qs_constraint({'dtype': np.int16})
-@out_qs_constraint({'dtype': np.int16})
+@out_qs_constraint({'dtype': set([np.int16, np.int32])})  # 16-bit output, or int32
 @option_constraint(check_filter_options(False, input_size={16, None}, output_size={16, None}))
 class FilterSWMult16x8(FilterSWMultBase):
     @classmethod
@@ -331,9 +337,13 @@ def _quantize_ne16(cls, params, in_qs, stats, input_dtype, **kwargs):
                 f'node {params.name} output forced to range {o_q.min}/{o_q.max}')
         else:
             cls.check_valid_ranges(params, stats, idx=0, dirs='out')
-            min_val, max_val = stats['range_out'][0]['min'], stats['range_out'][0]['max']
             force_output_size = opts.get('force_output_size', 8)
             output_dtype = np.uint8 if force_output_size == 8 else np.uint16
+            min_val, max_val = get_clip(
+                params.out_dims[0].shape,
+                force_output_size,
+                stats['range_out'][0],
+                opts['clip_type'])
             o_q = QType.from_min_max_sq(min_val=min_val,
                                         max_val=max_val,
                                         dtype=output_dtype,
diff --git a/tools/nntool/quantization/multiplicative/quantizers/generic_fusion_mult.py b/tools/nntool/quantization/multiplicative/quantizers/generic_fusion_mult.py
index b6bcf537e..c9767e147 100644
--- a/tools/nntool/quantization/multiplicative/quantizers/generic_fusion_mult.py
+++ b/tools/nntool/quantization/multiplicative/quantizers/generic_fusion_mult.py
@@ -44,21 +44,21 @@ def _quantize(cls, params, in_qs, stats, **kwargs):
 
 @params_type(ActivationFusionBase, MatMulOpFusionParameters, MatScaleFusionParameters, PaddedAddFusionParameters)
 @in_qs_constraint(MatchAll({'dtype': np.int8}))
-@out_qs_constraint(MatchAll({'dtype': np.int8}))
+#@out_qs_constraint(MatchAll({'dtype': np.int8}))
 @fusion_handler
 class GenericFusionMult(GenericFusionMultBase):
     pass
 
 @params_type(ActivationFusionBase, MatMulOpFusionParameters, MatScaleFusionParameters, PaddedAddFusionParameters)
 @in_qs_constraint(MatchAll({'dtype': np.uint8}))
-@out_qs_constraint(MatchAll({'dtype': np.uint8}))
+#@out_qs_constraint(MatchAll({'dtype': np.uint8}))
 @fusion_handler
 class GenericFusionMultU8(GenericFusionMultBase):
     pass
 
 @params_type(ActivationFusionBase, MatMulOpFusionParameters, MatScaleFusionParameters, PaddedAddFusionParameters)
 @in_qs_constraint(MatchAll({'dtype': np.uint16}))
-@out_qs_constraint(MatchAll({'dtype': np.uint16}))
+#@out_qs_constraint(MatchAll({'dtype': np.uint16}))
 @fusion_handler
 class GenericFusionMultU16(GenericFusionMultBase):
     pass
diff --git a/tools/nntool/quantization/multiplicative/quantizers/global_pooling_mult.py b/tools/nntool/quantization/multiplicative/quantizers/global_pooling_mult.py
index 3a2484562..d32c2009b 100644
--- a/tools/nntool/quantization/multiplicative/quantizers/global_pooling_mult.py
+++ b/tools/nntool/quantization/multiplicative/quantizers/global_pooling_mult.py
@@ -58,8 +58,8 @@ def _quantize(cls, params, in_qs, stats, **kwargs):
 
         if fusion:
             # Global pooling fused with activations need to have only the activation scale
+            #o_q = QType(scale=in_q.scale, dtype=np.int32)
             o_q = deepcopy(in_q)
-            o_q.dtype = np.int32
         elif force_out_q:
             if force_out_q.zero_point != in_q.zero_point:
                 return None
diff --git a/tools/nntool/quantization/multiplicative/quantizers/matmult_mult.py b/tools/nntool/quantization/multiplicative/quantizers/matmult_mult.py
index 5b76154df..a09de7233 100644
--- a/tools/nntool/quantization/multiplicative/quantizers/matmult_mult.py
+++ b/tools/nntool/quantization/multiplicative/quantizers/matmult_mult.py
@@ -128,7 +128,7 @@ def get_min_max(cls, fusion, stats, all_stats, params):
 
 @params_type(MatMulOpParameters)
 @in_qs_constraint({'dtype': set([np.int8])})
-@out_qs_constraint({'dtype': set([np.int8])})
+@out_qs_constraint({'dtype': set([np.int8, np.int32])})
 @option_constraint(check_filter_options(False, input_size={8, None}, output_size={8, None}))
 class MatMultMultSW8(MatMultMultBase):
     @classmethod
@@ -150,7 +150,7 @@ def _quantize(cls, params, in_qs, stats, **kwargs):
             kwargs['graph_update']['requires_adjust'] = True
             in_q2 = QType.from_array_sq(
                 arr=in2_node.dqvalue,
-                quantized_dimension=0,
+                quantized_dimension=len(in2_node.dqvalue.shape) - 2,
                 dtype=np.int8,
                 narrow_range=True,
                 bits=8)
@@ -165,7 +165,7 @@ def _quantize(cls, params, in_qs, stats, **kwargs):
         if force_out_q:
             o_q = force_out_q
             # can't be forced to something not np.int8
-            if o_q.dtype != np.int8 or o_q.asymmetric:
+            if (o_q.dtype != np.int8 and o_q.dtype != np.int32) or o_q.asymmetric:
                 return None
             LOG.warning(f'node {params.name} output forced to range {o_q.min}/{o_q.max} '
                         f'{"asymmetric" if o_q.asymmetric else "symmetric"}')
diff --git a/tools/nntool/quantization/multiplicative/quantizers/ssd_postprocess.py b/tools/nntool/quantization/multiplicative/quantizers/ssd_postprocess.py
index 6f2e413ad..667da7a53 100644
--- a/tools/nntool/quantization/multiplicative/quantizers/ssd_postprocess.py
+++ b/tools/nntool/quantization/multiplicative/quantizers/ssd_postprocess.py
@@ -26,7 +26,7 @@
 @in_qs_constraint({'dtype': np.int8})
 class SSDDetectorParametersMult(MultQuantizionHandler):
     @classmethod
-    def _quantize(cls, params, in_qs, stats, **kwargs):
+    def _quantize(cls, params: SSDDetectorParameters, in_qs, stats, **kwargs):
         force_out_qs, _ = cls.get_mult_opts(**kwargs)
         force_out_q = force_out_qs and force_out_qs[0]
         if force_out_q:
@@ -39,4 +39,7 @@ def _quantize(cls, params, in_qs, stats, **kwargs):
                               dtype=np.int16, scale=2**(-14))
         o_scores_qtype = in_qs[1]
         o_class_qtype = QType(scale=1, dtype=np.int8)
-        return QRec.scaled(in_qs=in_qs, out_qs=[o_boxes_qtype, o_class_qtype, o_scores_qtype, o_class_qtype])
+        outputs = [o_boxes_qtype, o_class_qtype, o_scores_qtype]
+        if params.output_detection_count:
+            outputs.append(QType(scale=1, dtype=np.int32))
+        return QRec.scaled(in_qs=in_qs, out_qs=outputs)
diff --git a/tools/nntool/quantization/qtype.py b/tools/nntool/quantization/qtype.py
index de8cac419..7d409ba36 100644
--- a/tools/nntool/quantization/qtype.py
+++ b/tools/nntool/quantization/qtype.py
@@ -15,7 +15,7 @@
 
 import math
 from copy import deepcopy
-from functools import reduce
+from functools import cmp_to_key, reduce
 
 import numpy as np
 from bfloat16 import bfloat16
@@ -132,7 +132,7 @@ def divide_ignore(a, b):
 IGNORE_KEYS = {'ne16', 'to_dict'}
 
 
-class AttrNamespace:
+class AttrNamespace():
     def __init__(self, **kwargs):
         self.__dict__.update(kwargs)
 
@@ -298,6 +298,26 @@ def __setstate__(self, state):
         setattr(self, '_dtype', STR_DTYPE[state['dtype']])
         setattr(self, '_EventEmitter__raw_listeners', {})
 
+    def _encapsulate(self):  # build the plain-dict form of this object from the fields named in EXPORT
+        res = {}
+        for k in self.EXPORT:
+            v = getattr(self, f'_{k}')  # exported name k is read from the private attribute _k
+            if v is None:
+                continue  # unset fields are left out of the result
+            if k == "attr":
+                res[k] = v.__getstate__()  # attr is stored as its __getstate__() result, not as the object itself
+            else:
+                res[k] = v
+        return res
+
+    @classmethod
+    def _dencapsulate(cls, val):  # rebuild a QType from a dict of constructor keyword arguments
+        if 'attr' in val:
+            attr = val['attr']
+            val['attr'] = AttrNamespace()  # NOTE: overwrites the entry in the caller's dict in place
+            val['attr'].__setstate__(attr)  # restore the namespace from the saved state
+        return QType(**val)  # always constructs QType itself; cls is not used
+
     @property
     def zero_point_asymmetric_zero(self):
         if self.dtype in [np.int8, np.int16, np.int32]:
@@ -335,13 +355,6 @@ def attr(self):
     def Pow2(cls, bits, q, signed, forced=False):
         return cls(bits=bits, q=q, signed=signed, forced=forced)
 
-    def _encapsulate(self):
-        return {k: getattr(self, f'_{k}') for k in self.EXPORT
-                if getattr(self, f'_{k}') is not None}
-
-    @classmethod
-    def _dencapsulate(cls, val):
-        return QType(**val)
 
     def _update_dtype(self):
         if self._signed is None or self._bits is None:
@@ -380,6 +393,27 @@ def forced_dtype(self):
     def forced_scale(self):
         return self._forced.get('scale')
 
+    @staticmethod
+    def precision_key():
+        """ Returns a sort key (built with cmp_to_key) under which a more precise QType compares greater
+        """
+        def cmp(a, b):  # three-way compare of the float values: 1, 0 or -1
+            a = float(a)
+            b = float(b)
+            return (a > b) - (a < b) 
+        def cmp_func(q1: QType, q2: QType):
+            if q1.is_floating:
+                if q2.is_floating:
+                    return cmp(q1.bits, q2.bits)  # both floating: the one with more bits ranks higher
+                else:
+                    return 1 # q1 > q2
+            elif q2.is_floating:
+                return -1  # a floating qtype always ranks above a non-floating one
+            # lower scale is more precise
+            return cmp(np.max(q2.scale), np.max(q1.scale))  # operands swapped so the smaller maximum scale ranks higher
+        return cmp_to_key(cmp_func)
+
+
     def set_forced(self, val=True, flags=None):
         if flags is None:
             flags = FORCED_FLAGS
@@ -528,7 +562,8 @@ def scale(self, val):
 
     @property
     def has_valid_range(self):
-        return (self._min_val is not None and self._max_val is not None) or self._scale is not None
+        return ((self._min_val is not None and self._max_val is not None) or
+                self._scale is not None or self._q is not None)
 
     @property
     def min_val(self):
diff --git a/tools/nntool/quantization/quantization_set.py b/tools/nntool/quantization/quantization_set.py
index 1e7fe2138..deae94a7e 100644
--- a/tools/nntool/quantization/quantization_set.py
+++ b/tools/nntool/quantization/quantization_set.py
@@ -185,6 +185,12 @@ def move_to_fusion(self, node: Parameters, new_pnode: Parameters):
             del self.qset[nid]
         if self.stats and nid in self.stats:
             self.stats[fnid] = self.stats[nid]
+        if self.options and nid in self.options:
+            pnid = NodeId(new_pnode)
+            options = self.options[nid]
+            del self.options[nid]
+            self.options[fnid] = options
+            self.options.setdefault(pnid, {}).update(options)
 
     def move_to_node(self, node: Parameters, new_pnode: Parameters):
         nid = NodeId(node)
diff --git a/tools/nntool/quantization/quantizer/new_quantizer.py b/tools/nntool/quantization/quantizer/new_quantizer.py
index 4bf9aaeeb..f34a497c0 100644
--- a/tools/nntool/quantization/quantizer/new_quantizer.py
+++ b/tools/nntool/quantization/quantizer/new_quantizer.py
@@ -15,10 +15,13 @@
 
 import logging
 from functools import reduce
-from operator import attrgetter
 
+from graph.matches.matchers.duplicate_operations import \
+    MatchDuplicateOperations
+from graph.matches.matchers.insert_copies import MatchInsertCopies
 from graph.matches.matchers.remove_copies import RemoveCopies
-from graph.matches.matchers.remove_unnecessary_quantize_operators import RemoveUnnecessaryQuantizeOperators
+from graph.matches.matchers.remove_unnecessary_quantize_operators import \
+    RemoveUnnecessaryQuantizeOperators
 from graph.types import (FusionBase, FusionInputParameters,
                          FusionOutputParameters, QuantizeParameters)
 from graph.types.base import NNEdge
@@ -287,14 +290,36 @@ def select_qtype_fusion(qtypes):
             return qtypes[0]
         raise CantContinueError()
 
-    def get_outqtypes_up(self, G, node):
+    @staticmethod
+    def most_precise(qtypes, stat):  # pick one qtype from qtypes; stat supplies the 'min'/'max' used to rank non-float candidates
+        # reduce to unique qtypes and sort most precise first
+        sorted_qtypes = sorted(
+            reduce(
+                lambda s, x: s if x in s else s + [x],  # order-preserving de-duplication using the `in` test
+                qtypes,
+                []),
+            key=QType.precision_key(), reverse=True)
+        if sorted_qtypes[0].is_floating:  # NOTE: raises IndexError when qtypes is empty
+            return sorted_qtypes[0]
+        assert stat  # a stat is required once no candidate is floating
+        # here none are float
+        # choose closest to range with max bits
+        max_bits = max(x.bits for x in sorted_qtypes)
+        sorted_qtypes = filter(lambda x: x.bits == max_bits, sorted_qtypes)
+        sorted_qtypes = sorted(
+            sorted_qtypes,
+            key=lambda x: abs(x.min - stat['min']) + abs(x.max - stat['max']))  # summed distance of the qtype's min/max from stat's min/max
+        return sorted_qtypes[0]
+        
+
+    def get_outqtypes_up(self, G, node, stat):
         # this function copes with the conflict on output edges which is the most complicated scenario since
         # there can be multiple competing forces. This only handles the cases that we have seen in real models
         # or been able to emulate in synthetic models.
         qtypes = []
-        for cur_qtypes, forced_qtypes in [zip(*[(self.get_qtype_forced_up(edge), self.get_conflict_up(edge))
-                                                for edge in edge_bundle])
-                                          for edge_bundle in G.indexed_out_edges(node)]:
+        for (cur_qtypes, forced_qtypes), edge_idx in [(zip(*[(self.get_qtype_forced_up(edge), self.get_conflict_up(edge))
+                                                             for edge in edge_bundle]), idx)
+                                                      for idx, edge_bundle in enumerate(G.indexed_out_edges(node))]:
             forced_qtypes = no_nones(forced_qtypes)
             if not forced_qtypes:
                 qtypes.append(None)
@@ -304,30 +329,10 @@ def get_outqtypes_up(self, G, node):
                     qtypes.append(forced_qtypes[0])
                     continue
             else:
-                # more than one output edge
-                if any(qtype.is_floating for qtype in forced_qtypes):
-                    return sorted(forced_qtypes, key=attrgetter('bits'))[-1]
-                uniq_cur_qtypes = reduce(
-                    lambda s, x: s if x in s else s + [x], cur_qtypes, [])
-                if len(uniq_cur_qtypes) == 1:
-                    uniq_cur_qtype = uniq_cur_qtypes[0]
-                    if len(cur_qtypes) > len(forced_qtypes):
-                        qtypes.append(uniq_cur_qtype)
-                        continue
-                    else:
-                        # all outputs are forced. we want to keep the one that best represents
-                        # the output so we calculate the maximum overlapping range
-                        # TODO - what about 16 bit versus 8 bit - if the range overlap is similar
-                        # then lower scale should be taken into account
-                        range_diffs = sorted([(qtype, min(qtype.max, uniq_cur_qtype.max_val) - max(
-                            qtype.min, uniq_cur_qtype.min_val)) for qtype in forced_qtypes], key=lambda x: x[1])
-                        qtypes.append(range_diffs[-1][0])
-                        continue
-            cur_qtypes = ",".join(str(qtype) for qtype in cur_qtypes)
-            forced_qtypes = ",".join(str(qtype) for qtype in forced_qtypes)
-            raise NotImplementedError(
-                f'unexpected quantization conflict seen cur {cur_qtypes} forced {forced_qtypes}'
-                ' - please contact GreenWaves')
+                edge_stat = stat and stat['range_out'][edge_idx]
+                qtypes.append(self.most_precise(cur_qtypes + forced_qtypes, edge_stat))
+                continue
+
         return qtypes
 
     def get_outqtypes_up_fusion(self, G, node):
@@ -626,7 +631,7 @@ def elimination_pass_down(self, cur_G, edge, qtype, visited, fusion=None):
         self.set_qtype_up(edge, qrec.in_qs[edge.to_idx])
         if self.is_conflict(edge):
             if fusion:
-                raise CantContinueError()
+                raise CantContinueError()  # @IgnoreException
             if not was_conflict:
                 self.report_conflict(edge)
         else:
@@ -744,13 +749,13 @@ def elimination_fusion_pass_up(self, parent_node, qrecs, in_qs, out_qs):
 
     def evaluate(self, cur_G, node, direction, qrecs, fusion=None):
         in_qs = self.get_inqtypes_down(cur_G, node)
-        if fusion:
-            out_qs = self.get_outqtypes_up_fusion(cur_G, node)
-        else:
-            out_qs = self.get_outqtypes_up(cur_G, node)
         nid = NodeId(node) if fusion is None else NodeId(fusion, fnode=node)
         pnid = NodeId(node) if fusion is None else NodeId(fusion)
         stat = self._stats.get(nid, None)
+        if fusion:
+            out_qs = self.get_outqtypes_up_fusion(cur_G, node)
+        else:
+            out_qs = self.get_outqtypes_up(cur_G, node, stat)
         opts = self.get_options(pnid)
         scheme_priority = self.get_scheme_priority(pnid)
         if isinstance(node, FusionBase) and node.quantize_internals:
@@ -794,7 +799,8 @@ def continue_down(self, cur_G, qrecs, visited, node, qrec, exclude_edge=None, fu
                 if not self.is_conflict(out_edge):
                     continue
                 qrecs.update(self.elimination_pass_down(cur_G,
-                                                        out_edge, self.get_qtype_down(out_edge),
+                                                        out_edge, self.get_qtype_down(
+                                                            out_edge),
                                                         visited + [node], fusion=fusion))
 
     def continue_up(self, cur_G, qrecs, visited, node, qrec, exclude_edge=None, fusion=None):
@@ -882,6 +888,9 @@ def insert_quantizers(self):
             for out_edge in self._graph.out_edges(qnode):
                 self._qtypes[out_edge] = to_qtype
         RemoveCopies().match(self._graph)
+        MatchDuplicateOperations(
+            limit_to_dest_classes=QuantizeParameters).match(self._graph)
+        MatchInsertCopies().match(self._graph)
 
     def remove_quantizers(self, only_inserted=False):
         for node in self._graph.nodes(node_classes=QuantizeParameters):
diff --git a/tools/nntool/quantization/quantizer/qrec_to_stats.py b/tools/nntool/quantization/quantizer/qrec_to_stats.py
index 5c46424ca..5e86998a3 100644
--- a/tools/nntool/quantization/quantizer/qrec_to_stats.py
+++ b/tools/nntool/quantization/quantizer/qrec_to_stats.py
@@ -30,17 +30,13 @@ def ranges_are_valid(ranges):
     return not any(rng['min'] is None or rng['max'] is None for rng in ranges if rng is not None)
 
 
-def build_stat_from_qrec(qrec, node=None):
-    if qrec is None:
-        return None
-    if qrec.in_qs is None or qrec.out_qs is None:
-        return None
+def build_stat_from_qtypes(in_qs, out_qs, node=None):
     range_in = [None if qtype is None else ({'min': qtype.min_val, 'max': qtype.max_val}
                                             if qtype.has_valid_range else {'min': None, 'max': None})
-                for qtype in qrec.in_qs]
+                for qtype in in_qs]
     range_out = [None if qtype is None else ({'min': qtype.min_val, 'max': qtype.max_val}
                                              if qtype and qtype.has_valid_range else {'min': None, 'max': None})
-                 for qtype in qrec.out_qs]
+                 for qtype in out_qs]
     range_in_valid = ranges_are_valid(range_in)
     range_out_valid = ranges_are_valid(range_out)
     if not range_in_valid or not range_out_valid:
@@ -62,6 +58,47 @@ def build_stat_from_qrec(qrec, node=None):
     }
 
 
+def build_stat_from_qrec(qrec, node=None):  # wrapper: build a stat from qrec.in_qs/out_qs via build_stat_from_qtypes
+    if qrec is None:
+        return None  # no record, no stat
+    if qrec.in_qs is None or qrec.out_qs is None:
+        return None  # both qtype lists must be present
+    return build_stat_from_qtypes(qrec.in_qs, qrec.out_qs, node=node)
+
+
+def build_fusion_stats(stats: dict, fusion: FusionBase):  # assemble {'range_in', 'range_out'} for a fusion from the stats of nodes in its subgraph
+    inputs = fusion.subgraph.inputs()
+    in_stats = [None] * len(inputs)
+    for sub_node in inputs:
+        edge = fusion.subgraph.out_edges(sub_node)[0]  # only the first edge leaving each subgraph input is consulted
+        stat = stats.get(NodeId(edge.to_node))  # NOTE(review): key is NodeId(node) without the fusion - confirm internal node stats are stored under this key
+        if stat is None:
+            in_stats = None  # one missing stat invalidates the whole input list
+            break
+        range_in = stat['range_in']
+        if len(range_in) <= edge.to_idx:
+            in_stats = None  # the consumer has no range recorded for this input index
+            break
+        in_stats[sub_node.idx] = range_in[edge.to_idx]  # slot is selected by the subgraph input node's idx
+    outputs = fusion.subgraph.outputs()
+    out_stats = [None] * len(outputs)
+    for sub_node in outputs:
+        edge = fusion.subgraph.in_edges(sub_node)[0]  # only the first edge entering each subgraph output is consulted
+        stat = stats.get(NodeId(edge.from_node))  # NOTE(review): same keying assumption as for the inputs above
+        if stat is None:
+            out_stats = None  # one missing stat invalidates the whole output list
+            break
+        range_out = stat['range_out']
+        if len(range_out) <= edge.from_idx:
+            out_stats = None  # the producer has no range recorded for this output index
+            break
+        out_stats[sub_node.idx] = range_out[edge.from_idx]  # slot is selected by the subgraph output node's idx
+    return {
+        'range_in': in_stats,  # either entry may be None when a lookup above failed
+        'range_out': out_stats
+    }
+
+
 def build_stat(G, nid, node=None):
     if not G.quantization:
         return None
@@ -87,6 +124,9 @@ def set_stats(G, current_stats=None, current_options=None):
                         qrec = G.quantization.get(
                             nid) if G.quantization else None
                     stats[nid] = build_stat_from_qrec(qrec)
+            nid = NodeId(node)
+            if G.quantization and nid not in G.quantization:
+                stats[nid] = build_fusion_stats(stats, node)
         elif isinstance(node, ExpressionFusionParameters):
             if stats[nid] is None or 'expression' not in stats[nid]:
                 if (G.quantization is None or nid not in G.quantization or G.quantization[nid].cache is None or
@@ -96,6 +136,7 @@ def set_stats(G, current_stats=None, current_options=None):
                 stats[nid]['expression'] = G.quantization[nid].cache['expression']
         elif isinstance(node, ConstantInputParameters):
             if G.quantization and nid in G.quantization:
-                current_options.setdefault(nid, {})['qtype_ind'] = G.quantization[nid].out_qs[0]
+                current_options.setdefault(
+                    nid, {})['qtype_ind'] = G.quantization[nid].out_qs[0]
 
     return stats, current_options
diff --git a/tools/nntool/quantization/quantizer_options.py b/tools/nntool/quantization/quantizer_options.py
index d76d85623..48f114c18 100644
--- a/tools/nntool/quantization/quantizer_options.py
+++ b/tools/nntool/quantization/quantizer_options.py
@@ -136,6 +136,14 @@
     'default': 'fastfloat'
 }
 
+CLIP_TYPE_OPTION = {  # descriptor dict for the 'clip_type' option; the default 'none' is one of the listed choices
+    'name': 'clip_type',
+    'type': str,
+    'choices': ['laplace', 'gaus', 'mix', 'none'],  # presumably 'gaus' is the gaussian method and 'mix' the MSE-based choice named in the help text - confirm
+    'help': 'Clipping method for filter output activations min max. laplace or gaussian distribution or choose based on MSE or no clipping',
+    'default': 'none'
+}
+
 BIAS_SIZE_OPTION = {
     'name': 'pow2_biases',
     'type': int,
diff --git a/tools/nntool/reports/draw_graph_reporter.py b/tools/nntool/reports/draw_graph_reporter.py
index ec23e8d3a..da7b40b5c 100644
--- a/tools/nntool/reports/draw_graph_reporter.py
+++ b/tools/nntool/reports/draw_graph_reporter.py
@@ -17,7 +17,7 @@
 
 from expressions.symbolic.symbol import Constant, Variable
 from graph.nngraph import NNGraph
-from graph.types import ExpressionFusionParameters, FusionBase
+from graph.types import ExpressionFusionParameters, FusionBase, Parameters
 from graph.types.fusions import FusionInputParameters, FusionOutputParameters
 from graphviz import Digraph, nohtml
 from quantization.qtype import QType
@@ -92,6 +92,12 @@ def insert_tag(idx, tag, names):
         names[idx] = [f'{tag} {name}'] + names[idx][1::]
         return
 
+    @staticmethod
+    def get_label(node, anon):  # label for a node box: graph_anon_label when anon is truthy, else graph_label
+        if hasattr(node, 'graph_label'):
+            return node.graph_anon_label if anon else node.graph_label
+        return [node.name]  # nodes without a graph_label attribute fall back to a one-element list holding their name
+
     @staticmethod
     def build_nodebox(node, ports, num_in, num_out, anon=False):
         trans_in = DrawGraphReporter.get_trans(node, 'in')
@@ -106,10 +112,10 @@ def build_nodebox(node, ports, num_in, num_out, anon=False):
             edges = [
                 f'<in{idx}> {idx if num_in > 1 else ""}{trans[idx] if idx < len(trans) else ""}' for idx in range(num_in)]
             names.append(edges)
-            names.extend(node.graph_anon_label if anon else node.graph_label)
+            names.extend(DrawGraphReporter.get_label(node, anon))
         else:
             ports[0] = [f'{node.name}:name']
-            names.extend(node.graph_anon_label if anon else node.graph_label)
+            names.extend(DrawGraphReporter.get_label(node, anon))
             DrawGraphReporter.insert_tag(0, f'<name>', names)
         if num_out > 1 or trans_out:
             if trans_out:
@@ -222,16 +228,18 @@ def in_label(self, G, edge, qrecs, parent=None, to_node=True, from_node=True):
             if not from_qtype.quantization_equal(qtype):
                 return f'{from_qtype}/{qtype}', True
             return str(qtype), False
-        else:
+        elif isinstance(node, Parameters):
             if node.in_dims:
                 return self.dim_or_error(node.in_dims, idx)
             return 'not set', True
+        return '', False
+
 
     def report_graph(self, G: NNGraph, dot, all_ports, fake_idx, nodes=None, all_dims=False,
                      anonymise=False, expressions=False, qrecs=None, fusions=False, parent=None):
         if nodes is None:
             nodes = set(G.nodes())
-        for node in G.dfs():
+        for node in G.topological_sort():
             if node not in nodes:
                 continue
             if isinstance(node, (FusionInputParameters)):
@@ -239,7 +247,7 @@ def report_graph(self, G: NNGraph, dot, all_ports, fake_idx, nodes=None, all_dim
             if expressions and isinstance(node, ExpressionFusionParameters):
                 all_ports[node] = self.report_expression(
                     dot, G, node, anonymise=anonymise, report_quantized=expressions == "quantized")
-            elif fusions and isinstance(node, FusionBase):
+            elif fusions and isinstance(node, FusionBase) and node.quantize_internals:
                 all_ports[node] = self.report_fusion(
                     dot, G, node, all_ports, fake_idx, all_dims=all_dims,
                     anonymise=anonymise, expressions=expressions, qrecs=qrecs)
@@ -251,8 +259,13 @@ def report_graph(self, G: NNGraph, dot, all_ports, fake_idx, nodes=None, all_dim
                 if not isinstance(node, FusionOutputParameters):
                     names = self.build_nodebox(
                         node, ports, num_in_edges, num_out_edges, anon=anonymise)
-                    dot.node(node.name, nohtml(names), shape='record',
-                             xlabel=f"{node.step_idx}" if parent is None else "", color="blue" if node.is_not_generated else "black")
+                    if not isinstance(node, Parameters):
+                        dot.node(node.name, nohtml(names),
+                                 shape='record', color='black')
+                    else:
+                        dot.node(node.name, nohtml(names), shape='record',
+                                 xlabel=f"{node.step_idx}" if parent is None else "",
+                                 color="blue" if node.is_not_generated else "black")
             for edge in G.in_edges(node.name):
                 if edge.from_node not in nodes:
                     if not all_dims:
@@ -318,7 +331,7 @@ def report(self, G: NNGraph, nodes=None, graph_format='PDF', all_dims=False,
             qrecs = None
         self.init_name_cache()
         all_ports = {}
-        graph_name = G.graphname if hasattr(G, 'graphname') else 'graph'
+        graph_name = G.name if hasattr(G, 'name') else 'graph'
         dot = Digraph(comment=graph_name, format=graph_format, node_attr={
                       'height': '.1'}, edge_attr={'fontsize': '10.0'})
         fake_idx = 0
@@ -376,7 +389,8 @@ def report_expression(self, dot: Digraph, G: NNGraph,
                 else:
                     dot.node(var.name, nohtml(var_name),
                              shape='plaintext', fontsize='10.0')
-                    sub.edge(node_id, var.name, xlabel=f'{str_shape(shape)}')
+                    sub.edge(
+                        node_id, var.name, xlabel=f'{str_shape(shape)}', color="red" if shape is None else "black")
 
         return [node.input_symbols, node.output_symbols]
 
@@ -424,12 +438,17 @@ def report_symbol(self, dot, symbol, intermediates, anonymise=False):
             const_name = self.get_next('Const')
             dot.node(const_name, 'Const' if anonymise else str(
                 symbol.value[0]), shape='oval', fontsize='10.0')
-            return const_name, None if len(symbol.shape) == 1 else symbol.shape
+            return const_name, symbol.shape
         ids_and_shapes = [self.report_symbol(dot, sym, intermediates, anonymise=anonymise)
                           for sym in symbol.contents]
         func_label = self.get_next(
             'Op') if anonymise else symbol.__class__.__name__
         dot.node(symbol.name, nohtml(func_label), shape='record')
         for child_id, shape in ids_and_shapes:
-            dot.edge(child_id, symbol.name, xlabel=f'{str_shape(shape)}')
-        return symbol.name, symbol.shape
+            dot.edge(child_id, symbol.name,
+                     xlabel=f'{str_shape(shape)}', color="red" if shape is None else "black")
+        try:
+            symbol_shape = symbol.shape
+        except ValueError:
+            symbol_shape = None
+        return symbol.name, symbol_shape
diff --git a/tools/nntool/requirements.txt b/tools/nntool/requirements.txt
index 3a9417a05..4694226d8 100644
--- a/tools/nntool/requirements.txt
+++ b/tools/nntool/requirements.txt
@@ -12,7 +12,7 @@ argcomplete==1.10.0
 Cython==0.29.21
 scikit-image==0.17.2
 scikit-learn==0.21.3
-onnx==1.8.0
+onnx==1.10.2
 prettytable==0.7.2
 iteration-utilities==0.11.0
 bfloat16==1.0
diff --git a/tools/nntool/stats/activation_ranges_collector.py b/tools/nntool/stats/activation_ranges_collector.py
index 42c8fd593..913187d3f 100644
--- a/tools/nntool/stats/activation_ranges_collector.py
+++ b/tools/nntool/stats/activation_ranges_collector.py
@@ -18,10 +18,9 @@
 
 import numpy as np
 from execution.graph_executer import GraphExecuter
-from graph.types import (FilterParameters, LSTMParameters,
-                         MultiplicativeBiasParameters, RNNBaseParameters)
-from graph.types.expression_fusion import ExpressionFusionParameters
 from graph.types.fusions import FusionBase, FusionInputParameters
+from stats.ranges_utils import collect_stat, update_ranges
+from utils.json_serializable import JsonSerializable
 from utils.node_id import NodeId
 
 from .stats_collector import GraphStatsCollector
@@ -41,10 +40,75 @@ def update_peraxis(var, arr: np.ndarray):
         per_axis_elem['max'] = np.maximum(
             per_axis_elem['max'], arr.max(axis=other_axis))
 
+class Rolling(JsonSerializable):  # collects samples and acts as their arithmetic mean wherever a float is needed
+    def __init__(self) -> None:
+        self._values = []  # every sample passed to add_val, in insertion order
 
-def update_ema(ema, value, decay):
-    ema = value * decay + (1 - decay) * ema
-    return ema
+    def __float__(self):  # mean of the samples so far; 0.0 when none have been added
+        if not self._values:
+            return 0.0  # must be a float: float() raises TypeError when __float__ returns an int
+        return float(np.sum(self._values)/len(self._values))
+
+    def add_val(self, val: float):  # record one more sample
+        self._values.append(val)
+
+    def _encapsulate(self):  # encapsulated form is the current mean only; the samples are not included
+        return float(self)
+
+    def __mul__(self, other):  # this and every arithmetic operator below delegates to the float mean
+        return float(self).__mul__(other)
+
+    def __add__(self, other):
+        return float(self).__add__(other)
+
+    def __truediv__(self, other):
+        return float(self).__truediv__(other)
+
+    def __floordiv__(self, other):
+        return float(self).__floordiv__(other)
+
+    def __mod__(self, other):
+        return float(self).__mod__(other)
+
+    def __divmod__(self, other):
+        return float(self).__divmod__(other)
+
+    def __pow__(self, other):
+        return float(self).__pow__(other)
+
+    def __sub__(self, other):
+        return float(self).__sub__(other)
+
+    def __radd__(self, other):  # reflected operators: other <op> mean
+        return float(self).__radd__(other)
+
+    def __rsub__(self, other):
+        return float(self).__rsub__(other)
+
+    def __rmul__(self, other):
+        return float(self).__rmul__(other)
+
+    def __rtruediv__(self, other):
+        return float(self).__rtruediv__(other)
+
+    def __rfloordiv__(self, other):
+        return float(self).__rfloordiv__(other)
+
+    def __rmod__(self, other):
+        return float(self).__rmod__(other)
+
+    def __rpow__(self, other):
+        return float(self).__rpow__(other)
+
+    @classmethod
+    def _dencapsulate(cls, val):  # returns val unchanged; no Rolling instance is rebuilt
+        return val
+
+    def __repr__(self) -> str:  # text form of the current mean
+        return f'{float(self)}'
+
+    def __str__(self) -> str:  # text form of the current mean
+        return f'{float(self)}'
 
 
 class ActivationRangesCollector(GraphStatsCollector):
@@ -55,40 +119,10 @@ def __init__(self, graph_execution=None, use_ema=False, ema_decay=0.999):
         self.use_ema = use_ema
         self.ema_decay = ema_decay
 
-    def update_expression_ranges(self, stat, details):
-        if 'expression' in stat:
-            stat = stat['expression']
-            for sym_name, rec in details.items():
-                if sym_name == "results":
-                    continue
-                stat_rec = stat.setdefault(
-                    sym_name, {'min': float('inf'), 'max': float('-inf')})
-                stat_rec['min'] = min(stat_rec['min'], rec['min'])
-                stat_rec['max'] = max(stat_rec['max'], rec['max'])
-        else:
-            stat['expression'] = deepcopy(details)
-
-    def collect_stat(self, stat, name, details, details_name=None):
-        range_stat = stat.get(name)
-        if not range_stat:
-            range_stat = {'min': float('inf'), 'max': float('-inf')}
-            stat[name] = range_stat
-        if details_name is None:
-            self.update_ranges(
-                range_stat, details[name]['min'], details[name]['max'])
-        else:
-            self.update_ranges(
-                range_stat, details['min_' + details_name], details['max_' + details_name])
-
-    def update_ranges(self, range_out, tensor_min, tensor_max):
-        if self.use_ema and all([range_out['min'] != float('inf'), range_out['max'] != float('-inf')]):
-            range_out['min'] = update_ema(
-                range_out['min'], tensor_min, self.ema_decay)
-            range_out['max'] = update_ema(
-                range_out['max'], tensor_max, self.ema_decay)
-        else:
-            range_out['min'] = min(range_out['min'], tensor_min)
-            range_out['max'] = max(range_out['max'], tensor_max)
+
+    def collect_stat(self, stat: dict, name, details, details_name=None):  # forwards to the collect_stat function imported from stats.ranges_utils
+        ema_decay = self.ema_decay if self.use_ema else None  # None is passed when use_ema is off
+        collect_stat(stat, name, details, details_name=details_name, ema_decay=ema_decay)  # bare name is the imported function, not this method
 
     def collect_stats(self, G, input_tensors, step_idx=None):
         if self._graph_execution is None:
@@ -113,7 +147,9 @@ def collect_stats(self, G, input_tensors, step_idx=None):
                     {
                         'min': float('inf'),
                         'max': float('-inf'),
-                        'std': 0.0
+                        'std': Rolling(),
+                        'mean': Rolling(),
+                        'b': Rolling()
                     } for _ in output_tensors]
                 stat = {
                     'range_in': range_in,
@@ -145,26 +181,18 @@ def collect_stats(self, G, input_tensors, step_idx=None):
 
             for idx, tensor in enumerate(output_tensors):
                 range_out = stat['range_out'][idx]
-                self.update_ranges(range_out, tensor.min(), tensor.max())
-                range_out['std'] = np.std(tensor)
+                ema_decay = self.ema_decay if self.use_ema else None
+                update_ranges(range_out, tensor.min(), tensor.max(), ema_decay=ema_decay)
+                range_out['std'].add_val(np.std(tensor))
+                mean = np.mean(tensor)
+                range_out['mean'].add_val(mean)
+                range_out['b'].add_val(np.mean(np.abs(tensor - mean)))
                 update_peraxis(range_out, tensor)
 
-            if isinstance(node, FilterParameters):
-                if details:
-                    self.collect_stat(stat, 'range_acc',
-                                      details, details_name='acc')
-                    if isinstance(node, MultiplicativeBiasParameters) and node.has_mul_bias:
-                        self.collect_stat(
-                            stat, 'range_pre_mul_bias', details, details_name='pre_mul_bias')
-            elif isinstance(node, RNNBaseParameters):
-                if details:
-                    for k in details:
-                        if k.startswith('range_'):
-                            self.collect_stat(stat, k, details)
-            elif isinstance(node, ExpressionFusionParameters):
-                if details:
-                    self.update_expression_ranges(stat, details)
-            elif isinstance(node, FusionBase) and pnode.quantize_internals:
+            if details:
+                node.details_collector(self.stats, stat, details)
+
+            if isinstance(node, FusionBase) and pnode.quantize_internals:
                 for inode in node.subgraph.nodes(node_classes=FusionInputParameters):
                     finput_in_stat = stat['range_in'][inode.idx]
                     for edge in node.subgraph.out_edges(inode.name):
diff --git a/tools/nntool/stats/ranges_utils.py b/tools/nntool/stats/ranges_utils.py
new file mode 100644
index 000000000..0c0bdad70
--- /dev/null
+++ b/tools/nntool/stats/ranges_utils.py
@@ -0,0 +1,37 @@
+# Copyright (C) 2022  GreenWaves Technologies, SAS
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Affero General Public License for more details.
+
+# You should have received a copy of the GNU Affero General Public License
+# along with this program.  If not, see <https://www.gnu.org/licenses/>.
+
+import numpy as np
+
+
+def update_ema(ema, value, decay):
+    """One exponential moving average step: value weighted by decay, ema by (1 - decay)"""
+    return value * decay + (1 - decay) * ema
+
+def update_ranges(range_out, tensor_min, tensor_max, ema_decay=None):
+    """Fold a newly observed min/max into range_out['min'] and range_out['max'] in place"""
+    if ema_decay is None or range_out['min'] == float('inf') or range_out['max'] == float('-inf'):
+        # no EMA requested, or a bound is still +/-infinity: keep a plain running min/max
+        range_out['min'] = min(range_out['min'], tensor_min)
+        range_out['max'] = max(range_out['max'], tensor_max)
+        return
+    range_out['min'] = update_ema(range_out['min'], tensor_min, ema_decay)
+    range_out['max'] = update_ema(range_out['max'], tensor_max, ema_decay)
+
+def collect_stat(stat: dict, name, details, details_name=None, ema_decay=None):
+    """Update stat[name] (created on first use) from the 'min'/'max' entries of details"""
+    entry = stat.setdefault(name, {'min': float('inf'), 'max': float('-inf')})
+    postfix = f'_{details_name}' if details_name is not None else ""
+    update_ranges(entry, *(details[f'{key}{postfix}'] for key in ('min', 'max')), ema_decay=ema_decay)
diff --git a/tools/nntool/utils/compatible_transposes.py b/tools/nntool/utils/compatible_transposes.py
index d5c869381..eb3251931 100644
--- a/tools/nntool/utils/compatible_transposes.py
+++ b/tools/nntool/utils/compatible_transposes.py
@@ -13,8 +13,11 @@
 # You should have received a copy of the GNU Affero General Public License
 # along with this program.  If not, see <https://www.gnu.org/licenses/>.
 
-from typing import Sequence
 from functools import reduce
+from typing import Sequence
+
+from graph.manipulations.eliminate_transposes.transpose_helpers import (
+    apply_transpose, indexes_of, reverse_transpose)
 
 
 def reduce_one(state, num):
@@ -85,7 +88,7 @@ def find_first(combination, idx):
             return fc_idx
         elif idx in fc_set:
             # item is not first so no match
-            raise IndexError() # @IgnoreException
+            raise IndexError()  # @IgnoreException
     return None
 
 
@@ -98,7 +101,7 @@ def len_at_start(l, elem):
 
 
 def compatible_transpose(combination, trans):
-    """Determines if the transpose can be expressed in the combination in fc
+    """Determines if the transpose can be expressed in the combination in descriptor found by find_combination
     """
     res = []
     trans = list(trans)
@@ -114,9 +117,9 @@ def compatible_transpose(combination, trans):
             continue
         # we have a segment who's first element matches the transpose
         # the rest of the elements in the segment must be in order
-        fc_idx = first_idx
-        while trans and fc_idx < len(combination):
-            segs = combination[fc_idx]
+        descriptor_idx = first_idx
+        while trans and descriptor_idx < len(combination):
+            segs = combination[descriptor_idx]
             if segs is not None:  # if segs is none then it matches anything
                 # the first seg must match since it was found by find_first
                 # this tests if another seg doesn't match in which case we are
@@ -131,16 +134,16 @@ def compatible_transpose(combination, trans):
                     # so no solution
                     if trans[0] != oidx:
                         return False
-            fc_idx += 1
+            descriptor_idx += 1
         # this trans element no longer matches the next segment or we
         # reached the end
         if trans:
             trans.pop(0)
         # add range from first to last on first time around
         # this will include idxes idxes of Nones after this segment
-        res += list(range(first_idx, fc_idx))
+        res += list(range(first_idx, descriptor_idx))
     at_start = len_at_start(combination, None)
-    # The only bit we won't have matched is a fc that starts with None's so
+    # The only bit we won't have matched is a descriptor that starts with None's so
     # add those indexes to the start
     return tuple(list(range(0, at_start)) + res)
 
@@ -154,3 +157,170 @@ def find_all_compatible_transposes(combinations, trans):
 
 def find_compatible_transpose(fcs, trans):
     return next(find_all_compatible_transposes(fcs, trans), None)
+
+
+def expand_to_len(trans, length):
+    pad = length - len(trans)  # number of new leading axes, which stay where they are
+    return tuple(range(pad)) + tuple(axis + pad for axis in trans)
+
+
+def reduce_to_len(trans, length):
+    drop = len(trans) - length  # axes numbered below drop are discarded
+    return tuple(axis - drop for axis in trans if axis >= drop)
+
+
+def no_ones(l):
+    return tuple(filter(lambda dim: dim != 1, l))  # same order, every 1 removed
+
+
+def ones_shuffled(from_shape, to_shape):
+    """True if the shapes have the same rank and the same dimensions once all 1s are removed"""
+    same_rank = len(from_shape) == len(to_shape)
+    return same_rank and no_ones(from_shape) == no_ones(to_shape)
+
+
+def reshape_shuffle_trans(from_shape, to_shape):
+    """For each axis of to_shape give the from_shape axis that supplies it.
+
+    1-sized axes are matched to 1-sized axes and the other axes to each other, both in order.
+    """
+    unit_axes_from = tuple(axis for axis, dim in enumerate(from_shape) if dim == 1)
+    next_unit = 0
+    next_other = 0
+    order = []
+    for dim_to in to_shape:
+        if dim_to == 1:
+            order.append(unit_axes_from[next_unit])
+            next_unit += 1
+            continue
+        while next_other in unit_axes_from:
+            next_other += 1
+        order.append(next_other)
+        next_other += 1
+    return order
+
+
+def is_broadcasted(from_shape, to_shape):
+    """True if to_shape is from_shape with one or more 1s added at the front"""
+    added = len(to_shape) - len(from_shape)
+    if added <= 0:
+        return False
+    return (1,) * added + tuple(from_shape) == tuple(to_shape)
+
+
+def broadcast_transpose(from_shape, to_shape, going_up):
+    """Axis map for a front-padded reshape: 1-tuples of to_shape indexes if going_up, else Nones then from indexes"""
+    rank_from, rank_to = len(from_shape), len(to_shape)
+    if not going_up:
+        return (None,) * (rank_to - rank_from) + tuple(range(rank_from))
+    return tuple((axis,) for axis in range(rank_to - rank_from, rank_to))
+
+
+def apply_combination(shape, comb):
+    """Compute the shape described by a combination.
+
+    An entry of None gives a dimension of 1; any other entry is a sequence of
+    indexes into shape and gives the product of those dimensions.
+    """
+    def dim_of(entry):
+        return 1 if entry is None else reduce(lambda x, y: x*y, [shape[i] for i in entry])
+
+    return tuple(dim_of(entry) for entry in comb)
+
+
+def transpose_combination(comb, trans):
+    """Renumber the indexes in a combination through a transpose.
+
+    None entries are kept; in every other entry each index i is replaced by
+    the position at which i appears in trans.
+    """
+    def renumber(entry):
+        return None if entry is None else tuple(trans.index(i) for i in entry)
+
+    return tuple(renumber(entry) for entry in comb)
+
+
+def calc_new_reshape(trans, new_trans, from_shape, to_shape, going_up):
+    """Compute the reshape's shapes after trans has been carried across it as new_trans.
+
+    The shape on the side the transpose comes from (to_shape when going up,
+    from_shape when going down) gets the reverse of trans, undoing its effect.
+    The shape on the other side gets the reverse of new_trans - this may result in
+    the reshape being eliminated since the shape change that it caused is already
+    in the transpose.
+
+    NOTE - Looking at the reshape as a transpose itself is not correct. It is a shuffle
+    not a transpose: the tensor physical order is not changed unlike a transpose.
+
+    Returns (new_trans, new_from_shape, new_to_shape), each converted to a tuple.
+    """
+    # near = side the transpose arrives from, far = side it is being moved to
+    near_shape, far_shape = (to_shape, from_shape) if going_up else (from_shape, to_shape)
+    new_near = apply_transpose(near_shape, reverse_transpose(trans))
+    new_far = apply_transpose(far_shape, reverse_transpose(new_trans))
+    if going_up:
+        # going up the near side is to_shape so the two results swap places
+        return tuple(new_trans), tuple(new_far), tuple(new_near)
+    return tuple(new_trans), tuple(new_near), tuple(new_far)
+
+
+def calc_failure_reshapes(trans, from_shape, to_shape, going_up):
+    """Result used when no new transpose could be found for the reshape.
+
+    Only the shape on the side the transpose comes from (to_shape when going up,
+    from_shape otherwise) is computed, by applying the reverse of trans to it.
+    Returns (None, new_from_shape, new_to_shape) with the other shape set to None.
+    """
+    near_shape = to_shape if going_up else from_shape
+    new_near = tuple(apply_transpose(near_shape, reverse_transpose(trans)))
+    if going_up:
+        # the computed shape is the new to_shape
+        return None, None, new_near
+    return None, new_near, None
+
+
+def reverse_reshape(trans, from_shape, to_shape, going_up=False):
+    """Reverses the effect of this reshape on the transpose. If going_up is set then
+    the transpose is in the direction to_shape -> from_shape. Returns a tuple of
+    (new_trans, new_from_shape, new_to_shape) where new_trans is None on failure"""
+
+    if len(from_shape) == 0 or len(to_shape) == 0:
+        return calc_failure_reshapes(trans, from_shape, to_shape, going_up)
+
+    # if the from_shape -> to_shape is actually a broadcast reshape
+    # i.e. 4, 10, 1 -> 1, 4, 10, 1 we absolutely need to keep the order 4, 10, 1 in
+    # the transpose however the 2 1s in the result are ambiguous so handle this as a
+    # (simple) special case. Just expand the transpose with no transpose at the start
+    # and expand_len + original transpose dim at the end
+    if is_broadcasted(from_shape, to_shape):
+        if going_up:
+            new_trans = reverse_transpose(reduce_to_len(
+                reverse_transpose(trans), len(from_shape)))
+        else:
+            new_trans = reverse_transpose(expand_to_len(
+                reverse_transpose(trans), len(to_shape)))
+        return calc_new_reshape(trans, new_trans, from_shape, to_shape, going_up)
+
+    # consider the shapes in the correct order
+    shape_order = (to_shape, from_shape) if going_up else (
+        from_shape, to_shape)
+
+    if ones_shuffled(shape_order[0], shape_order[1]):
+        shuffle_trans = reshape_shuffle_trans(shape_order[0], shape_order[1])
+        new_trans = apply_transpose(trans, shuffle_trans)
+        return calc_new_reshape(trans, new_trans, from_shape, to_shape, going_up)
+
+    for combination in find_combination(*shape_order):
+        if not combination:
+            continue
+        # going down we are looking at where we could transpose the reshape combination back up the
+        # graph in a valid way and then reverse that transpose
+        # going up we are propagating a reversed transpose so we still need to reverse
+        reversed_new_trans = compatible_transpose(
+            combination, reverse_transpose(trans))
+        if not reversed_new_trans or len(reversed_new_trans) != len(shape_order[1]):
+            continue
+        new_trans = reverse_transpose(reversed_new_trans)
+        return calc_new_reshape(trans, new_trans, from_shape, to_shape, going_up)
+
+    return calc_failure_reshapes(trans, from_shape, to_shape, going_up)
diff --git a/tools/nntool/utils/exception.py b/tools/nntool/utils/exception.py
new file mode 100644
index 000000000..a4771cd5e
--- /dev/null
+++ b/tools/nntool/utils/exception.py
@@ -0,0 +1,20 @@
+# Copyright (C) 2022  GreenWaves Technologies, SAS
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Affero General Public License for more details.
+
+# You should have received a copy of the GNU Affero General Public License
+# along with this program.  If not, see <https://www.gnu.org/licenses/>.
+
+class NNToolInternelError(Exception):
+    """Presumably an internal nntool error. NOTE(review): name is misspelled (Internel); kept for importers"""
+
+class NNToolNotImplementedError(NotImplementedError):
+    """NotImplementedError subclass so nntool specific cases can be caught separately"""
diff --git a/tools/nntool/utils/graph.py b/tools/nntool/utils/graph.py
index 31423e1eb..e3372f99e 100644
--- a/tools/nntool/utils/graph.py
+++ b/tools/nntool/utils/graph.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2020  GreenWaves Technologies, SAS
+# Copyright (C) 2020, 2022  GreenWaves Technologies, SAS
 
 # This program is free software: you can redistribute it and/or modify
 # it under the terms of the GNU Affero General Public License as
@@ -15,9 +15,16 @@
 
 from itertools import zip_longest
 
-from collections import OrderedDict, deque
 from collections.abc import Iterable, Mapping
-from typing import KeysView, Union, Sequence
+from typing import Optional, Set, Tuple, Union, Sequence
+
+
+def is_iterable(x):
+    """True when iter(x) succeeds, False when it raises TypeError"""
+    try:
+        return iter(x) is not None  # @IgnoreException
+    except TypeError:
+        return False
 
 
 class GraphError(Exception):
@@ -46,9 +53,13 @@ class Node():
     '''Node class to inherit for nodes'''
 
     def __init__(self, name: str, *args, **kwargs):
-        super(Node, self).__init__(*args, **kwargs)
+        super().__init__(*args, **kwargs)
         self._name = name
 
+    @property
+    def _noderef_class(self):  # reference type that __call__ accepts as input and returns
+        return NodeRef
+
     @property
     def name(self):
         '''Node name - must not be changed once node is in graph'''
@@ -59,18 +70,72 @@ def name(self, name):
         '''Node name - must not be changed once node is in graph'''
         self._name = name
 
+    def __call__(self, *args, num_outputs=1):
+        """Wire the outputs referenced by args to this node's inputs.
+
+        Each positional argument is a reference (or None for an unconnected input)
+        and its position is the input index. All fragments referenced are merged
+        into one and an edge is added to it for every non-None argument.
+
+        Returns one reference to this node, or a tuple of num_outputs references.
+        Raises ValueError on a wrong argument type, when every argument is None, or
+        when fragments must be merged and the fragment has no merge method.
+        """
+        for arg in args:
+            if arg is not None and not isinstance(arg, self._noderef_class):
+                raise ValueError(
+                    f"expecting {self._noderef_class.__name__} or None")
+        # compare with None explicitly rather than relying on the reference's truthiness
+        refs = [(None, None) if arg is None else arg.ref for arg in args]
+        fragment = next((frag for _, frag in refs if frag is not None), None)
+        if fragment is None:
+            raise ValueError("No inputs")
+        # None placeholders are not fragments, so they never require a merge method
+        for other in {frag for _, frag in refs if frag is not None} - {fragment}:
+            if not hasattr(fragment, 'merge'):
+                raise ValueError('graph has no merge method')
+            fragment.merge(other)
+        for to_idx, (from_tuple, _) in enumerate(refs):
+            if from_tuple is not None:
+                fragment.add_edge(fragment._edge_class(
+                    from_node=from_tuple[0], from_idx=from_tuple[1], to_node=self, to_idx=to_idx))
+        if num_outputs == 1:
+            return self._noderef_class(fragment, self, 0)
+        return tuple(self._noderef_class(fragment, self, idx) for idx in range(num_outputs))
+
     def __str__(self):
         return self._name
 
 
 class NodeRef():
-    def __init__(self, node) -> None:
+    def __init__(self, G: "GraphView", node: Node, idx: int) -> None:
+        self._G = G
         self._node = node
+        self._idx = idx
+
+    @property
+    def G(self) -> "GraphView":
+        return self._G
+
+    @property
+    def ref(self) -> Tuple[Tuple[Node, int], "GraphView"]:
+        return ((self._node, self._idx), self._G)
 
     @property
-    def node(self):
+    def node(self) -> Node:
         return self._node
 
+    def __eq__(self, o: object) -> bool:
+        if isinstance(o, NodeRef):
+            return super().__eq__(o)
+        return self._node.__eq__(o)
+
+    def __hash__(self) -> int:
+        return self._node.__hash__()
+
+    def __call__(self, *args, **kwargs):
+        raise ValueError("this is already a reference")
+
 
 class MatchNode(Node):
     '''Node class to inherit for node matchers'''
@@ -136,6 +201,12 @@ def __init__(self, from_node: Union[str, Node, NodeRef], to_node: Union[str, Nod
             raise ValueError('expecting int for to_idx')
         self._link = (from_node, from_idx, to_node, to_idx)
 
+    @classmethod
+    def from_src_to_dest(cls, from_edge, to_edge):
+        """Build an edge running from the source end of from_edge to the destination end of to_edge"""
+        source_end = dict(from_node=from_edge.from_node, from_idx=from_edge.from_idx)
+        return cls(**source_end, to_node=to_edge.to_node, to_idx=to_edge.to_idx)
+
     @property
     def from_node(self):
         '''Edge start node'''
@@ -194,18 +265,53 @@ def __hash__(self):
 
 class GraphView(Mapping):
 
-    def __init__(self):
-        self._out_edges = OrderedDict()
-        self._in_edges = OrderedDict()
-        self._nodes = OrderedDict()
-        self._hidden = False
-        self._hidden_nodes = []
+    def __init__(self, **attr):
+        self._out_edges = {}
+        self._in_edges = {}
+        self._nodes = {}
+        self._attr = attr
 
     @classmethod
     # pylint: disable=unused-argument
     def clone_factory(cls, G):
         return cls()
 
+    def with_hidden_nodes(self, hidden_fn, edge_class=None):
+        """Return a new graph of the same class whose edges skip the nodes for which hidden_fn is true"""
+        edge_class = Edge if edge_class is None else edge_class
+
+        def real_up_node(G, edge):
+            # climb through hidden producers (each must have exactly one in edge)
+            while hidden_fn(edge.from_node):
+                edges = G.in_edges(edge.from_node)
+                assert len(edges) == 1
+                edge = edges[0]
+            return edge.from_node, edge.from_idx
+
+        def copy_node(G, new_graph, node):
+            for edge in self.in_edges(node):
+                from_node, from_idx = real_up_node(G, edge)
+                new_edge = edge_class(from_node=from_node, from_idx=from_idx,
+                                      to_node=node, to_idx=edge.to_idx)
+                if not new_graph.has_edge(new_edge):
+                    new_graph.add_edge(new_edge)
+                    copy_node(G, new_graph, from_node)
+
+        new_graph = self.__class__()
+        new_graph._attr = self._attr
+        for node in self.outputs():
+            copy_node(self, new_graph, node)
+        return new_graph
+
+    def has_edge(self, edge):
+        """True if an edge equal to edge is already present in the graph"""
+        # in edges are indexed by destination node name and then by source node name
+        by_source = self._in_edges.get(edge.to_node.name)
+        if not by_source:
+            return False
+        candidates = by_source.get(edge.from_node.name)
+        return bool(candidates) and edge in candidates
+
     def clear(self):
         '''Clears the graph view'''
         if self._nodes:
@@ -221,6 +327,12 @@ def clone(self) -> 'GraphView':
         clone._nodes = self._nodes.copy()
         return clone
 
+    def merge(self, other: 'Graph'):
+        """Add every edge of other to this graph unless other compares equal to it; returns self"""
+        for edge in (other.edges if self != other else ()):
+            self.add_edge(edge)
+        return self
+
     def num_nodes(self):
         '''Number of nodes len(GraphView) also works'''
         return len(self)
@@ -232,14 +344,8 @@ def num_edges(self):
                    for edge in edge_list)
 
     def __add_in_edge(self, edge: Edge, update=False):
-        edges = self._in_edges.get(edge.to_node.name)
-        if not edges:
-            edges = {}
-            self._in_edges[edge.to_node.name] = edges
-        edge_list = edges.get(edge.from_node.name)
-        if edge_list is None:
-            edge_list = []
-            edges[edge.from_node.name] = edge_list
+        edges = self._in_edges.setdefault(edge.to_node.name, {})
+        edge_list = edges.setdefault(edge.from_node.name, [])
         edge_idx = next((i for i, x in enumerate(edge_list) if x == edge), -1)
         if edge_idx >= 0:
             if update:
@@ -255,14 +361,8 @@ def __add_in_edge(self, edge: Edge, update=False):
             edge_list.append(edge)
 
     def __add_out_edge(self, edge: Edge, update=False):
-        edges = self._out_edges.get(edge.from_node.name)
-        if not edges:
-            edges = {}
-            self._out_edges[edge.from_node.name] = edges
-        edge_list = edges.get(edge.to_node.name)
-        if edge_list is None:
-            edge_list = []
-            edges[edge.to_node.name] = edge_list
+        edges = self._out_edges.setdefault(edge.from_node.name, {})
+        edge_list = edges.setdefault(edge.to_node.name, [])
         edge_idx = next((i for i, x in enumerate(edge_list) if x == edge), -1)
         if edge_idx >= 0:
             if update:
@@ -287,11 +387,6 @@ def verify_edges(self, check_connected=True):
 
     def add_edge(self, edge: Edge):
         '''Adds an edge to the graph'''
-        hidden_state = self._hidden
-        if hidden_state:
-            self._hidden = False
-        if self._hidden_nodes:
-            raise ValueError('cannot modify graph when nodes are hidden')
         if isinstance(edge.from_node, str):
             edge = edge.clone(from_node=self._nodes[edge.from_node])
         elif edge.from_node.name not in self._nodes:
@@ -304,13 +399,9 @@ def add_edge(self, edge: Edge):
             self._nodes[edge.to_node.name] = edge.to_node
         self.__add_in_edge(edge)
         self.__add_out_edge(edge)
-        if hidden_state:
-            self._hidden = True
 
     def node(self, node_name):
         '''Find a node by name. GraphView[node_name] also works'''
-        if self._hidden and any(node_name == node.name for node in self._hidden_nodes):
-            raise IndexError(f'{node_name} is hidden')
         return self[node_name]
 
     def insert_node(self, node_to_insert, from_node_name,
@@ -318,9 +409,6 @@ def insert_node(self, node_to_insert, from_node_name,
                     node_input_idx=0, node_output_idx=0,
                     edge_class=None):
         '''Inserts a node between two existing nodes'''
-        hidden_state = self._hidden
-        if hidden_state:
-            self._hidden = False
         if edge_class is None:
             edge_class = Edge
         node_to_insert = resolve_node(node_to_insert)
@@ -333,8 +421,6 @@ def insert_node(self, node_to_insert, from_node_name,
                                  from_idx=from_idx, to_idx=node_input_idx))
         self.add_edge(edge_class(node_to_insert, to_node_name,
                                  from_idx=node_output_idx, to_idx=to_idx))
-        if hidden_state:
-            self._hidden = True
 
     def edge(self, from_node_name: str, to_node_name: str, from_idx: int = 0, to_idx: int = 0):
         '''Finds first edge between two nodes - WARNING - probably not good in weird situation
@@ -398,7 +484,8 @@ def predecessor_names(self, node_name: str) -> Iterable:
     def nodes(self, node_classes=None, sort=False):
         '''All the nodes in the graph. GraphView.values() also works.'''
         if node_classes is not None:
-            nodes = [node for node in self._nodes.values() if isinstance(node, node_classes)]
+            nodes = [node for node in self._nodes.values(
+            ) if isinstance(node, node_classes)]
         else:
             nodes = list(self._nodes.values())
         if sort:
@@ -451,34 +538,48 @@ def connected_nodes(self, node_or_node_name):
             edge.to_node for edge in self.out_edges(node_or_node_name))
         return list(connected_nodes)
 
-    def is_vertex_cut(self, node_set, node=None, visited=None):
-        if visited is None:
-            visited = set()
-        if node is None:
-            inputs = set(self.inputs())
-            # choose one input node (or successor) that is not in the node_set
-            start_node = None
-            while inputs:
-                node = inputs.pop()
-                # if the input node is actually in the set then move past it
-                # this ensures that if the node_set is at the start of the graph
-                # and does not divide the graph it is not reported as a cut
-                if node not in node_set:
-                    start_node = node
-                    break
-                inputs.update(edge.to_node for edge in self.out_edges(node))
-            self.is_vertex_cut(node_set, node=start_node, visited=visited)
-            return len(visited) < (len(self) - len(node_set))
-        # undirected dfs
+    def _old_undirected_dfs(self, node, stop_at, pass_at, visited):
+        if node in stop_at or node in visited:
+            return
+        if pass_at and node not in pass_at:
+            return
         visited.add(node)
-        for edge in self.out_edges(node):
-            if edge.to_node in visited | node_set:
-                continue
-            self.is_vertex_cut(node_set, node=edge.to_node, visited=visited)
+        yield node
         for edge in self.in_edges(node):
-            if edge.from_node in visited | node_set:
-                continue
-            self.is_vertex_cut(node_set, node=edge.from_node, visited=visited)
+            yield from self._old_undirected_dfs(edge.from_node, stop_at, pass_at, visited)
+        for edge in self.out_edges(node):
+            yield from self._old_undirected_dfs(edge.to_node, stop_at, pass_at, visited)
+
+    def old_undirected_dfs(self, stop_at=None, start_at=None, pass_at=None):
+        """Yield nodes found by an undirected depth first search of the graph.
+
+        Args:
+            stop_at: node or iterable of nodes that are neither yielded nor crossed.
+            start_at: node or iterable of nodes to start from. Defaults to the graph inputs.
+            pass_at: node or iterable of nodes; when not empty only these nodes are visited.
+
+        Yields:
+            Each node reached, once.
+        """
+        # a lone node is wrapped in a list, an iterable is copied into one
+        def as_list(nodes):
+            return list(nodes) if is_iterable(nodes) else [nodes]
+
+        start_at = list(self.inputs()) if start_at is None else as_list(start_at)
+        stop_at = set() if stop_at is None else set(as_list(stop_at))
+        # wrap a lone pass_at node itself - not stop_at, which is already a set here
+        pass_at = set() if pass_at is None else set(as_list(pass_at))
+
+        visited = set()
+        while start_at:
+            yield from self._old_undirected_dfs(start_at.pop(0), stop_at, pass_at, visited)
+
+    def is_vertex_cut(self, node_set):
+        """True if an undirected search from one node outside node_set, blocked by node_set, misses nodes"""
+        outside = [node for node in self.nodes() if node not in node_set]
+        # start from at most one node; with none left the search is empty instead of raising StopIteration
+        visited = list(self.old_undirected_dfs(start_at=outside[:1], stop_at=node_set))
+        return len(visited) < (len(self) - len(node_set))
 
     def nodes_between_in(self, node_from, node_to, node_set, start=True):
         """Check that the only nodes between from and to are in node set"""
@@ -512,25 +613,38 @@ def nodes_between(self, node_from, node_to, visited=None, path=None):
                                    visited=visited, path=path + [edge.to_node])
         return visited
 
-    def nodes_below(self, node, visited=None):
+    def paths_between(self, node_from, node_to, path=None, topo=None):
+        """Find the edge paths from node_from to node_to by searching up from node_to.
+
+        Returns a list of edges when there is one route, a (possibly nested) list of
+        such results when several in edges lead back to node_from, or None if none do.
+        """
+        if topo is None:
+            topo = {node: idx for idx, node in enumerate(self.topological_sort())}
+            path = []
+        if node_from == node_to:
+            return path
+        if topo[node_from] > topo[node_to]:
+            return None
+        candidates = (self.paths_between(node_from, edge.from_node, path=[edge] + path, topo=topo)
+                      for edge in self.in_edges(node_to))
+        found = [up_path for up_path in candidates if up_path is not None]
+        if not found:
+            return None
+        return found[0] if len(found) == 1 else found
+
+    def nodes_below(self, node):
         """Return nodes below node not including node"""
-        if visited is None:
-            node = resolve_node_or_str(node, G=self)
-            visited = set()
-        for edge in self.out_edges(node):
-            visited.add(edge.to_node)
-            self.nodes_below(edge.to_node, visited=visited)
-        return visited
+        nodes_below = set(self.directed_dfs(node))
+        nodes_above = set(self.undirected_dfs(
+            node, start_up=True, stop_down_at=nodes_below))
+        return tuple(self.undirected_dfs(node, stop_up_at=nodes_above))
 
     def nodes_above(self, node, visited=None):
         """Return nodes above node not including node"""
-        if visited is None:
-            node = resolve_node_or_str(node, G=self)
-            visited = set()
-        for edge in self.in_edges(node):
-            visited.add(edge.from_node)
-            self.nodes_above(edge.from_node, visited=visited)
-        return visited
+        nodes_above = set(self.directed_dfs(node, go_up=True))
+        nodes_below = set(self.undirected_dfs(node, stop_up_at=nodes_above))
+        return tuple(self.undirected_dfs(node, stop_down_at=nodes_below, start_up=True))
 
     def nodes_below_are_class(self, node, classes, visited=None):
         """Check all nodes below are in classes"""
@@ -546,6 +660,169 @@ def nodes_below_are_class(self, node, classes, visited=None):
                 return False
         return True
 
+    def directed_dfs(self,
+                     node_or_name: Union[str, Node],
+                     stop_at: Optional[Set[Node]] = None,
+                     go_up: bool = False,
+                     yield_start_node=False,
+                     visited=None):
+        """Yields all nodes above or below this node searched directed. This is almost a dfs
+        since it yields in order going down the graph rather than bottom up
+
+        Args:
+            node_or_name (Union[str, Node]): Node or node name to start at
+            stop_at (Optional[Set[Node]], optional): Stop at this set of nodes; they are never yielded. Defaults to None.
+            go_up (bool, optional): Go in an upward direction or downwards if False. Defaults to False.
+            yield_start_node (bool, optional): Also yield the start node itself. Defaults to False.
+            visited (optional): Internal recursion state - nodes already scheduled. Leave as None.
+
+        Yields:
+            Node: Nodes visited
+        """
+        node = resolve_node_or_str(node_or_name, G=self)
+        # visited is only supplied by the recursive calls below, so its presence
+        # tells the start node apart from every node reached later
+        started = visited is not None
+        if not started:
+            visited = {node}
+        # defaulted on every call so that a caller-supplied visited set cannot leave stop_at as None
+        if stop_at is None:
+            stop_at = set()
+        # stop nodes end this branch of the search and are not reported
+        if node in stop_at:
+            return
+        if started or yield_start_node:
+            yield node
+        # in edges lead upwards, out edges lead downwards
+        edges = self.in_edges(node.name) if go_up else self.out_edges(node.name)
+        for edge in edges:
+            next_node = edge.from_node if go_up else edge.to_node
+            if next_node in visited:
+                continue
+            visited.add(next_node)
+            yield from self.directed_dfs(next_node, stop_at=stop_at, go_up=go_up, visited=visited)
+
+    def connected_groups(self):
+        """Partition the graph's nodes into sets of connected nodes, ignoring edge direction."""
+        remaining, components = set(self.nodes()), []
+        while remaining:
+            seed = remaining.pop()
+            # each search leaves the seed in one direction only, so sweep below and above it
+            components.append({seed, *self.undirected_dfs(seed), *self.undirected_dfs(seed, start_up=True)})
+            remaining -= components[-1]
+        return components
+
+    def undirected_dfs(self,
+                       node_or_name: Union[str, Node],
+                       stop_at: Optional[Set[Node]] = None,
+                       stop_down_at: Optional[Set[Node]] = None,
+                       stop_up_at: Optional[Set[Node]] = None,
+                       start_up: bool = False,
+                       yield_start_node: bool = False,
+                       yield_stop_node: bool = False,
+                       visited=None):
+        """Yields all nodes above or below this node searched undirected. This is almost a dfs
+        since it yields in order going down the graph rather than bottom up. It also has a few modes
+        where it is edge direction sensitive for stopping
+
+        Args:
+            node_or_name (Union[str, Node]): Node or node name to start at
+            stop_at (Optional[Set[Node]], optional): Stop at this set of nodes. Defaults to None.
+            stop_down_at (Optional[Set[Node]], optional): Stop at this set of nodes going down. Defaults to None.
+            stop_up_at (Optional[Set[Node]], optional): Stop at this set of nodes going up. Defaults to None.
+            start_up (bool, optional): Start in an upward direction or downwards if False. Defaults to False.
+            yield_start_node (bool, optional): Also yield the start node itself. Defaults to False.
+            yield_stop_node (bool, optional): Yield the stop_at nodes that end the search. Defaults to False.
+            visited (optional): Internal recursion state - nodes already scheduled. Leave as None.
+
+        Yields:
+            Node: Nodes visited
+        """
+        node = resolve_node_or_str(node_or_name, G=self)
+        started = visited is not None  # only the recursive calls below pass visited
+        if not started:
+            visited = {node}
+        # defaulted on every call so that a caller-supplied visited set cannot leave a stop set as None
+        stop_at = set() if stop_at is None else stop_at
+        stop_down_at = set() if stop_down_at is None else stop_down_at
+        stop_up_at = set() if stop_up_at is None else stop_up_at
+        if node in stop_at:
+            if yield_stop_node:
+                yield node
+            return
+        if started or yield_start_node:
+            yield node
+        # the start node searches in one direction only; every node reached later searches both
+        if not start_up or started:
+            for edge in self.out_edges(node.name):
+                if edge.to_node in visited or edge.to_node in stop_down_at:
+                    continue
+                visited.add(edge.to_node)
+                yield from self.undirected_dfs(edge.to_node, stop_at=stop_at, stop_down_at=stop_down_at,
+                                               stop_up_at=stop_up_at, yield_stop_node=yield_stop_node, visited=visited)
+        if start_up or started:
+            for edge in self.in_edges(node.name):
+                if edge.from_node in visited or edge.from_node in stop_up_at:
+                    continue
+                visited.add(edge.from_node)
+                yield from self.undirected_dfs(edge.from_node, stop_at=stop_at, stop_down_at=stop_down_at,
+                                               stop_up_at=stop_up_at, yield_stop_node=yield_stop_node, visited=visited)
+
+    def _topological_sort(self, node: Node, visited_edges):
+        """Yield node, then each successor once all of that successor's in edges are in visited_edges."""
+        yield node
+        for out_edge in (edge for bundle in self.indexed_out_edges(node) for edge in bundle):
+            visited_edges.add(out_edge)
+            if visited_edges.issuperset(self.in_edges(out_edge.to_node)):
+                yield from self._topological_sort(out_edge.to_node, visited_edges)
+
+    def _topological_sort_reversed(self, node: Node, visited_edges):  # upward twin of _topological_sort
+        yield node
+        for in_edge in reversed(self.indexed_in_edges(node)):  # walk the in edges in reverse of their indexed order
+            visited_edges.add(in_edge)
+            if visited_edges.issuperset(self.out_edges(in_edge.from_node)):  # every out edge of the producer has been walked
+                yield from self._topological_sort_reversed(in_edge.from_node, visited_edges)
+
+    def topological_sort(self,
+                         start_node_or_nodes: Optional[Union[str,
+                                                             Node,
+                                                             Sequence[Union[str, Node]]]] = None,
+                         reverse: bool = False):
+        """Yield nodes in topological order, starting from the graph inputs or from the given nodes.
+
+        Args:
+            start_node_or_nodes (Optional[Union[str, Node, Sequence[Union[str, Node]]]], optional):
+                Optional start node or nodes. Can also be node names. Defaults to None.
+            reverse (bool, optional): Sort from bottom of the graph up. Tries to be a perfect reverse of order. Defaults to False.
+
+        Raises:
+            ValueError: Bad parameters given
+
+        Yields:
+            (Node): Yields nodes in desired sort order
+        """
+        if start_node_or_nodes is None:
+            if reverse:
+                nodes = list(reversed(self.outputs()))
+            else:
+                nodes = self.inputs()
+        elif isinstance(start_node_or_nodes, str):
+            nodes = [self._nodes[start_node_or_nodes]]
+        elif isinstance(start_node_or_nodes, Node):
+            # the signature accepts a single Node; without this branch it fell through to ValueError
+            nodes = [start_node_or_nodes]
+        elif isinstance(start_node_or_nodes, Iterable):
+            nodes = [node if isinstance(node, Node) else
+                     self[node] for node in start_node_or_nodes]
+        else:
+            raise ValueError('invalid argument')
+        # visited_edges is shared across all start nodes: a node is only emitted once all of
+        # its in edges (out edges when reversed) have been walked
+        visited_edges = set()
+        sort_from = self._topological_sort_reversed if reverse else self._topological_sort
+        for node in nodes:
+            yield from sort_from(node, visited_edges)
+
     def nodes_above_are_class(self, node, classes, visited=None):
         """Check all nodes above are in classes"""
         if visited is None:
@@ -617,64 +894,6 @@ def num_out_edges(self, node_or_name: Union[str, Node]) -> int:
         node_name = resolve_name(node_or_name)
         return len(self.out_edges(node_name))
 
-    def flood_above(self, node_or_name: Union[str, Node], res=None, in_edge=None):
-        """Return all nodes above this node including it and those connected to it
-
-        Args:
-            node (Node): Node to flood
-
-        Returns:
-            [Sequence[Node]]: Nodes found including node
-        """
-        node = resolve_node_or_str(node_or_name, G=self)
-        if res is None:
-            first = True
-            res = {node}
-        else:
-            first = False
-        for edge in self.in_edges(node.name):
-            if edge.from_node not in res:
-                res.add(edge.from_node)
-                self.flood_above(edge.from_node, res=res, in_edge=edge)
-        if not first:
-            for edge in self.out_edges(node.name):
-                if edge == in_edge:
-                    continue
-                if edge.to_node != node and edge.to_node not in res:
-                    res.add(edge.to_node)
-                    self.flood_below(edge.to_node, res=res)
-        return res
-
-    def flood_below(self, node_or_name: Union[str, Node], stop_at=None, res=None, out_edge=None):
-        """Return all nodes below this node including it and those connected to it
-
-        Args:
-            node (Node): Node to flood
-            stop_at (Node): Optional node to stop flooding at
-        Returns:
-            [Sequence[Node]]: Nodes found including node
-        """
-        node = resolve_node_or_str(node_or_name, G=self)
-        if stop_at:
-            stop_at = resolve_node_or_str(stop_at, G=self)
-        if res is None:
-            first = True
-            res = {node, stop_at} if stop_at else {node}
-        else:
-            first = False
-        for edge in self.out_edges(node.name):
-            if edge.to_node not in res:
-                res.add(edge.to_node)
-                self.flood_below(edge.to_node, res=res, out_edge=edge)
-        if not first:
-            for edge in self.in_edges(node.name):
-                if edge == out_edge:
-                    continue
-                if edge.from_node != node and edge.from_node not in res:
-                    res.add(edge.from_node)
-                    self.flood_above(edge.from_node, res=res)
-        return res
-
     def remove_all(self, nodes: Sequence[Node]):
         """Remove all nodes
 
@@ -691,26 +910,20 @@ def remove_all(self, nodes: Sequence[Node]):
             self.remove(del_node)
 
     def remove_below(self, node: Node):
-        """Remove the nodes below this node. Note: If there are links below this node
-        that go back above it this will do nothing since all nodes are flooded. Use
-        keep_between in this case.
+        """Remove the nodes below this node; the node itself is kept.
 
         Args:
             node (Node): Remove below this node
         """
-        keep_nodes = self.flood_above(node)
-        self.remove_all(set(self._nodes.values()) - keep_nodes)
+        self.remove_all(self.nodes_below(node))  # nodes_below also returns side branches joined to the nodes below
 
     def remove_above(self, node: Node):
-        """Remove the nodes above this node. Note: If there are links above this node
-        that go down below it this will do nothing since all nodes are flooded. Use
-        keep_between in this case.
+        """Remove the nodes above this node; the node itself is kept.
 
         Args:
             node (Node): Remove below this node
         """
-        keep_nodes = self.flood_below(node)
-        self.remove_all(set(self._nodes.values()) - keep_nodes)
+        self.remove_all(self.nodes_above(node))  # nodes_above also returns side branches joined to the nodes above
 
     def keep_between(self, from_node: Node, to_node: Node):
         """Remove all nodes that are not between from_node and to_node
@@ -719,8 +932,18 @@ def keep_between(self, from_node: Node, to_node: Node):
             from_node (Node): Remove above this node
             to_node (Node): Remove below this node
         """
-        keep_nodes = self.flood_below(from_node, stop_at=to_node)
-        self.remove_all(set(self._nodes.values()) - keep_nodes)
+        keep_nodes = set(  # downward search from from_node; to_node ends it and is not yielded by it
+            self.directed_dfs(
+                from_node,
+                stop_at={to_node},
+                yield_start_node=True)
+        ) | set(  # NOTE(review): union, so branches leaving the from_node -> to_node paths are kept too - confirm intended
+            self.directed_dfs(  # upward search from to_node; from_node ends it and is not yielded by it
+                to_node,
+                stop_at={from_node},
+                go_up=True,
+                yield_start_node=True))
+        self.remove_all(set(self._nodes.values()) - set(keep_nodes))  # keep_nodes is already a set; set() only copies it
 
     def remove(self, node_or_name: Union[str, Node]):
         '''Removes a node and all its connected edges'''
@@ -757,6 +980,10 @@ def edge_match(x):
         if not self._out_edges[edge.from_node.name][edge.to_node.name]:
             del self._out_edges[edge.from_node.name][edge.to_node.name]
 
+    def remove_edges(self, edges):  # remove every edge in edges from the graph, one remove_edge call each
+        for edge in list(edges):  # snapshot first: remove_edge mutates the graph's edge tables while we iterate
+            self.remove_edge(edge)
+
     def edge_in_graph(self, edge):
         if edge.to_node.name in self._in_edges:
             edges = self._in_edges[edge.to_node.name]
@@ -914,7 +1141,8 @@ def remove_fragment(self, frag: 'Graph'):
                 in_nodes.add(edge.from_node)
         del frag_in_edges, in_nodes
 
-        frag_out_nodes = set((edge.to_node, edge.to_idx) for frag_out_node in frag.outputs(ignore_names=nodes_not_in_graph)
+        frag_out_nodes = set((edge.to_node, edge.to_idx)
+                             for frag_out_node in frag.outputs(ignore_names=nodes_not_in_graph)
                              for edge in self.out_edges(frag_out_node.name))
         assert len(frag_out_nodes) == 1, "doesn't work if more than one output"
         frag_out_node = list(frag_out_nodes)[0]
@@ -946,96 +1174,6 @@ def outputs(self, ignore_names=None):
                 if node_name not in self._out_edges or all(output_name in ignore_names
                                                            for output_name in self._out_edges[node_name])]
 
-    def fast_dfs(self):
-        visited_edges = set()
-        nodes = deque(self.inputs())
-        while nodes:
-            node = nodes.pop()
-            node_name = node.name
-            if node_name in self._in_edges and not set(edge for edge_list in self._in_edges[node_name].values() for edge in edge_list).issubset(visited_edges):
-                continue
-            yield node
-            if node_name not in self._out_edges:
-                return
-            for edge_list in self._out_edges[node_name].values():
-                for out_edge in edge_list:
-                    visited_edges.add(out_edge)
-                    nodes.append(out_edge.to_node)
-
-    def __revdfs(self, node, condition, visited_nodes, visited_edges, from_node, from_edge):
-        if not node:
-            return
-        if isinstance(node, str):
-            node = self._nodes[node]
-        if node not in visited_nodes and\
-            (from_node is None or
-             all((out_edge in visited_edges) for out_edge in self.out_edges(node.name))) and\
-                (not condition or condition(self, from_node, node, from_edge)):
-
-            yield node
-            visited_nodes.add(node)
-            in_edges = self.in_edges(node.name)
-            # Edges are visited in a repeatable order
-            in_edges.sort(key=lambda x: str(x.from_idx) + x.from_node.name + str(x.to_idx),
-                          reverse=True)
-            for edge in in_edges:
-                visited_edges.add(edge)
-
-                yield from self.__revdfs(edge.from_node,
-                                         condition,
-                                         visited_nodes,
-                                         visited_edges,
-                                         node,
-                                         edge)
-
-    def __dfs(self, node, condition, visited_nodes, visited_edges, from_node, from_edge):
-        if not node:
-            return
-        if isinstance(node, str):
-            node = self._nodes[node]
-        if node not in visited_nodes and \
-            (from_node is None or all((in_edge in visited_edges) for in_edge in self.in_edges(node.name))) and \
-                (not condition or condition(self, from_node, node, from_edge)):
-            yield node
-            visited_nodes.add(node)
-            out_edges = self.out_edges(node.name)
-            # Edges are visited in a repeatable order
-            out_edges.sort(key=lambda x: str(x.from_idx) +
-                           x.to_node.name + str(x.to_idx))
-            for edge in out_edges:
-                visited_edges.add(edge)
-
-                yield from self.__dfs(edge.to_node,
-                                      condition,
-                                      visited_nodes,
-                                      visited_edges,
-                                      node,
-                                      edge)
-
-    def dfs(self, node_or_name=None, condition=None, reverse=False):
-        if node_or_name is None:
-            if reverse:
-                nodes = list(self.outputs())
-                # This isn't really necessary but helps with tests
-                nodes.reverse()
-            else:
-                nodes = self.inputs()
-        elif isinstance(node_or_name, str):
-            nodes = [self._nodes[node_or_name]]
-        elif isinstance(node_or_name, Iterable):
-            nodes = [node if isinstance(node, Node) else
-                     self[node] for node in node_or_name]
-        else:
-            raise TypeError()
-
-        visited_nodes = set()
-        visited_edges = set()
-        for node in nodes:
-            if reverse:
-                yield from self.__revdfs(node, condition, visited_nodes, visited_edges, None, None)
-            else:
-                yield from self.__dfs(node, condition, visited_nodes, visited_edges, None, None)
-
     @staticmethod
     def match_semantics(edges, match_edge):
         for edge in edges:
@@ -1174,49 +1312,20 @@ def match_down_edge(self, matched_graphview, fragment, graph_edge,
             return False
         return True
 
-    def match_fragment(self, fragment: 'GraphView', node_or_name: Node = None, allow_extra_edges=False):
-        """Matches a graph fragment against this graph"""
-        inputs = fragment.inputs()
-
-        start_points = {}
-
-        def match_start_points(G, from_node, to_node, unused1):
-            del unused1
-            nonlocal inputs, start_points
-            edge = None if from_node is None else G.edge(
-                from_node.name, to_node.name)
-            for fragment_input_node in inputs:
-                if isinstance(fragment_input_node, MatchNode) and fragment_input_node._match(G, to_node, edge):
-                    start_points[to_node] = fragment_input_node
-            return True
+    @property
+    def _edge_class(self):  # edge type associated with this graph class; NOTE(review): presumably a hook for subclasses to override - confirm
+        return Edge
 
-        _ = list(self.dfs(condition=match_start_points,
-                          node_or_name=node_or_name))
-        # start points will now be a list of pairs with the start_node name in the graph and the
-        # corresponding node in the fragment. The start points have all matched an input none in the
-        # fragment
-
-        matched_fragments = []
-        while True:
-            graph_node = next(start_points.__iter__(), None)
-            if graph_node is None:
-                return matched_fragments
-            match_node = start_points[graph_node]
-            del start_points[graph_node]
-
-            matched_graphview = GraphView()
-            matched_graphview.add_node(graph_node)
-            visited_nodes = set()
-            if self.match_down_node(matched_graphview, fragment,
-                                    graph_node, match_node,
-                                    visited_nodes, start_points,
-                                    allow_extra_edges=allow_extra_edges):
-                matched_fragments.append(matched_graphview)
-        return matched_fragments
+    @property
+    def _noderef_class(self):  # node reference type associated with this graph class; NOTE(review): presumably a hook for subclasses to override - confirm
+        return NodeRef
 
     def __eq__(self, other):
         return set(self.nodes()) == set(other.nodes()) and set(self.edges()) == set(other.edges())
 
+    def __hash__(self) -> int:  # order-independent so that graphs equal under the set comparison in __eq__ hash alike
+        return hash((frozenset(self.nodes()), frozenset(self.edges())))
+
     def __len__(self):
         return len(self._nodes)
 
diff --git a/tools/nntool/utils/json_serializable.py b/tools/nntool/utils/json_serializable.py
index b5bae0632..3a0f3e7f9 100644
--- a/tools/nntool/utils/json_serializable.py
+++ b/tools/nntool/utils/json_serializable.py
@@ -64,6 +64,10 @@ def default(self, o):
                 '__contents': o.tolist(),
                 '__dtype': o.dtype.name
             }
+        # NumPy booleans are not JSON serializable; np.bool_ is used because the np.bool alias was removed in NumPy 1.24
+        if hasattr(o, 'dtype') and np.issubdtype(o.dtype, np.bool_):
+            return bool(o)
+
         # Let the base class default method raise the 
         try:
             return json.JSONEncoder.default(self, o)
diff --git a/tools/nntool/utils/maximizer.py b/tools/nntool/utils/maximizer.py
index 061e9b6ea..49a51e9b8 100644
--- a/tools/nntool/utils/maximizer.py
+++ b/tools/nntool/utils/maximizer.py
@@ -25,7 +25,7 @@ def __init__(self, func, var_min, var_max, func_change=None, int_step=False):
         self._args = tuple()
         self._int_step = int_step
 
-    @lru_cache
+    @lru_cache(maxsize=128, typed=False)  # call form: bare @lru_cache needs Python 3.8+. NOTE(review): cache key is (self, var) only, so results go stale if self._args changes - confirm the cache is cleared
     def func(self, var):
         return self._func(var, *self._args)
 
diff --git a/tools/nntool/utils/numpy_helpers.py b/tools/nntool/utils/numpy_helpers.py
index a8a841af3..9641702d9 100644
--- a/tools/nntool/utils/numpy_helpers.py
+++ b/tools/nntool/utils/numpy_helpers.py
@@ -36,3 +36,8 @@ def packbits(value, bits):
         )[:, 0:bits:].flatten(),
         bitorder='little'
     )
+
+def np_asscalar(elem):
+    """Return the Python scalar held by a NumPy array; anything that is not an ndarray is returned unchanged."""
+    # ndarray.item() only converts arrays holding exactly one element
+    return elem.item() if isinstance(elem, np.ndarray) else elem
diff --git a/tools/nntool/utils/process_header.py b/tools/nntool/utils/process_header.py
index e6021f3f7..adcb5f261 100644
--- a/tools/nntool/utils/process_header.py
+++ b/tools/nntool/utils/process_header.py
@@ -168,6 +168,7 @@ def gen_infos_array(self, len_key, **vals):
         keys = sorted([(key, self.inf(key), self.inf_len(key))
                        for key in vals], key=lambda x: x[1])
         bvals = np.full((self.inf(len_key),), 0, dtype=np.uint8)
+        comment = ""
         for key, info, info_len in keys:
             val = np.atleast_1d(vals[key])
             val = val.newbyteorder('>')
@@ -177,4 +178,5 @@ def gen_infos_array(self, len_key, **vals):
                 raise ValueError(
                     f'value for {key} is too long {val_len}>{info_len}')
             bvals[info:info+len(val):1] = val
-        return bvals
+            comment += f" {key}: {vals[key]}"
+        return bvals, comment
diff --git a/tools/profiler/.gitignore b/tools/profiler/.gitignore
deleted file mode 100644
index a6f8ec3da..000000000
--- a/tools/profiler/.gitignore
+++ /dev/null
@@ -1,7 +0,0 @@
-*/build/*
-docs/
-gui/Makefile
-*.qmake.stash
-*.debug_info
-gui/uic_wrapper.sh
-function_statistics.txt
diff --git a/tools/profiler/Doxyfile b/tools/profiler/Doxyfile
deleted file mode 100644
index 90e2f5b37..000000000
--- a/tools/profiler/Doxyfile
+++ /dev/null
@@ -1,2427 +0,0 @@
-# Doxyfile 1.8.11
-
-# This file describes the settings to be used by the documentation system
-# doxygen (www.doxygen.org) for a project.
-#
-# All text after a double hash (##) is considered a comment and is placed in
-# front of the TAG it is preceding.
-#
-# All text after a single hash (#) is considered a comment and will be ignored.
-# The format is:
-# TAG = value [value, ...]
-# For lists, items can also be appended using:
-# TAG += value [value, ...]
-# Values that contain spaces should be placed between quotes (\" \").
-
-#---------------------------------------------------------------------------
-# Project related configuration options
-#---------------------------------------------------------------------------
-
-# This tag specifies the encoding used for all characters in the config file
-# that follow. The default is UTF-8 which is also the encoding used for all text
-# before the first occurrence of this tag. Doxygen uses libiconv (or the iconv
-# built into libc) for the transcoding. See http://www.gnu.org/software/libiconv
-# for the list of possible encodings.
-# The default value is: UTF-8.
-
-DOXYFILE_ENCODING      = UTF-8
-
-# The PROJECT_NAME tag is a single word (or a sequence of words surrounded by
-# double-quotes, unless you are using Doxywizard) that should identify the
-# project for which the documentation is generated. This name is used in the
-# title of most generated pages and in a few other places.
-# The default value is: My Project.
-
-PROJECT_NAME           = "GAP Profiler"
-
-# The PROJECT_NUMBER tag can be used to enter a project or revision number. This
-# could be handy for archiving the generated documentation or if some version
-# control system is used.
-
-PROJECT_NUMBER         =
-
-# Using the PROJECT_BRIEF tag one can provide an optional one line description
-# for a project that appears at the top of each page and should give viewer a
-# quick idea about the purpose of the project. Keep the description short.
-
-PROJECT_BRIEF          = "A tool to help to optimize GAP programs"
-
-# With the PROJECT_LOGO tag one can specify a logo or an icon that is included
-# in the documentation. The maximum height of the logo should not exceed 55
-# pixels and the maximum width should not exceed 200 pixels. Doxygen will copy
-# the logo to the output directory.
-
-PROJECT_LOGO           =
-
-# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) path
-# into which the generated documentation will be written. If a relative path is
-# entered, it will be relative to the location where doxygen was started. If
-# left blank the current directory will be used.
-
-OUTPUT_DIRECTORY       = docs/
-
-# If the CREATE_SUBDIRS tag is set to YES then doxygen will create 4096 sub-
-# directories (in 2 levels) under the output directory of each output format and
-# will distribute the generated files over these directories. Enabling this
-# option can be useful when feeding doxygen a huge amount of source files, where
-# putting all generated files in the same directory would otherwise causes
-# performance problems for the file system.
-# The default value is: NO.
-
-CREATE_SUBDIRS         = NO
-
-# If the ALLOW_UNICODE_NAMES tag is set to YES, doxygen will allow non-ASCII
-# characters to appear in the names of generated files. If set to NO, non-ASCII
-# characters will be escaped, for example _xE3_x81_x84 will be used for Unicode
-# U+3044.
-# The default value is: NO.
-
-ALLOW_UNICODE_NAMES    = NO
-
-# The OUTPUT_LANGUAGE tag is used to specify the language in which all
-# documentation generated by doxygen is written. Doxygen will use this
-# information to generate all constant output in the proper language.
-# Possible values are: Afrikaans, Arabic, Armenian, Brazilian, Catalan, Chinese,
-# Chinese-Traditional, Croatian, Czech, Danish, Dutch, English (United States),
-# Esperanto, Farsi (Persian), Finnish, French, German, Greek, Hungarian,
-# Indonesian, Italian, Japanese, Japanese-en (Japanese with English messages),
-# Korean, Korean-en (Korean with English messages), Latvian, Lithuanian,
-# Macedonian, Norwegian, Persian (Farsi), Polish, Portuguese, Romanian, Russian,
-# Serbian, Serbian-Cyrillic, Slovak, Slovene, Spanish, Swedish, Turkish,
-# Ukrainian and Vietnamese.
-# The default value is: English.
-
-OUTPUT_LANGUAGE        = English
-
-# If the BRIEF_MEMBER_DESC tag is set to YES, doxygen will include brief member
-# descriptions after the members that are listed in the file and class
-# documentation (similar to Javadoc). Set to NO to disable this.
-# The default value is: YES.
-
-BRIEF_MEMBER_DESC      = YES
-
-# If the REPEAT_BRIEF tag is set to YES, doxygen will prepend the brief
-# description of a member or function before the detailed description
-#
-# Note: If both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the
-# brief descriptions will be completely suppressed.
-# The default value is: YES.
-
-REPEAT_BRIEF           = YES
-
-# This tag implements a quasi-intelligent brief description abbreviator that is
-# used to form the text in various listings. Each string in this list, if found
-# as the leading text of the brief description, will be stripped from the text
-# and the result, after processing the whole list, is used as the annotated
-# text. Otherwise, the brief description is used as-is. If left blank, the
-# following values are used ($name is automatically replaced with the name of
-# the entity):The $name class, The $name widget, The $name file, is, provides,
-# specifies, contains, represents, a, an and the.
-
-ABBREVIATE_BRIEF       =
-
-# If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then
-# doxygen will generate a detailed section even if there is only a brief
-# description.
-# The default value is: NO.
-
-ALWAYS_DETAILED_SEC    = NO
-
-# If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all
-# inherited members of a class in the documentation of that class as if those
-# members were ordinary class members. Constructors, destructors and assignment
-# operators of the base classes will not be shown.
-# The default value is: NO.
-
-INLINE_INHERITED_MEMB  = NO
-
-# If the FULL_PATH_NAMES tag is set to YES, doxygen will prepend the full path
-# before files name in the file list and in the header files. If set to NO the
-# shortest path that makes the file name unique will be used
-# The default value is: YES.
-
-FULL_PATH_NAMES        = YES
-
-# The STRIP_FROM_PATH tag can be used to strip a user-defined part of the path.
-# Stripping is only done if one of the specified strings matches the left-hand
-# part of the path. The tag can be used to show relative paths in the file list.
-# If left blank the directory from which doxygen is run is used as the path to
-# strip.
-#
-# Note that you can specify absolute paths here, but also relative paths, which
-# will be relative from the directory where doxygen is started.
-# This tag requires that the tag FULL_PATH_NAMES is set to YES.
-
-STRIP_FROM_PATH        =
-
-# The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of the
-# path mentioned in the documentation of a class, which tells the reader which
-# header file to include in order to use a class. If left blank only the name of
-# the header file containing the class definition is used. Otherwise one should
-# specify the list of include paths that are normally passed to the compiler
-# using the -I flag.
-
-STRIP_FROM_INC_PATH    =
-
-# If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter (but
-# less readable) file names. This can be useful is your file systems doesn't
-# support long names like on DOS, Mac, or CD-ROM.
-# The default value is: NO.
-
-SHORT_NAMES            = NO
-
-# If the JAVADOC_AUTOBRIEF tag is set to YES then doxygen will interpret the
-# first line (until the first dot) of a Javadoc-style comment as the brief
-# description. If set to NO, the Javadoc-style will behave just like regular Qt-
-# style comments (thus requiring an explicit @brief command for a brief
-# description.)
-# The default value is: NO.
-
-JAVADOC_AUTOBRIEF      = NO
-
-# If the QT_AUTOBRIEF tag is set to YES then doxygen will interpret the first
-# line (until the first dot) of a Qt-style comment as the brief description. If
-# set to NO, the Qt-style will behave just like regular Qt-style comments (thus
-# requiring an explicit \brief command for a brief description.)
-# The default value is: NO.
-
-QT_AUTOBRIEF           = NO
-
-# The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make doxygen treat a
-# multi-line C++ special comment block (i.e. a block of //! or /// comments) as
-# a brief description. This used to be the default behavior. The new default is
-# to treat a multi-line C++ comment block as a detailed description. Set this
-# tag to YES if you prefer the old behavior instead.
-#
-# Note that setting this tag to YES also means that rational rose comments are
-# not recognized any more.
-# The default value is: NO.
-
-MULTILINE_CPP_IS_BRIEF = NO
-
-# If the INHERIT_DOCS tag is set to YES then an undocumented member inherits the
-# documentation from any documented member that it re-implements.
-# The default value is: YES.
-
-INHERIT_DOCS           = YES
-
-# If the SEPARATE_MEMBER_PAGES tag is set to YES then doxygen will produce a new
-# page for each member. If set to NO, the documentation of a member will be part
-# of the file/class/namespace that contains it.
-# The default value is: NO.
-
-SEPARATE_MEMBER_PAGES  = NO
-
-# The TAB_SIZE tag can be used to set the number of spaces in a tab. Doxygen
-# uses this value to replace tabs by spaces in code fragments.
-# Minimum value: 1, maximum value: 16, default value: 4.
-
-TAB_SIZE               = 4
-
-# This tag can be used to specify a number of aliases that act as commands in
-# the documentation. An alias has the form:
-# name=value
-# For example adding
-# "sideeffect=@par Side Effects:\n"
-# will allow you to put the command \sideeffect (or @sideeffect) in the
-# documentation, which will result in a user-defined paragraph with heading
-# "Side Effects:". You can put \n's in the value part of an alias to insert
-# newlines.
-
-ALIASES                =
-
-# This tag can be used to specify a number of word-keyword mappings (TCL only).
-# A mapping has the form "name=value". For example adding "class=itcl::class"
-# will allow you to use the command class in the itcl::class meaning.
-
-TCL_SUBST              =
-
-# Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C sources
-# only. Doxygen will then generate output that is more tailored for C. For
-# instance, some of the names that are used will be different. The list of all
-# members will be omitted, etc.
-# The default value is: NO.
-
-OPTIMIZE_OUTPUT_FOR_C  = NO
-
-# Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java or
-# Python sources only. Doxygen will then generate output that is more tailored
-# for that language. For instance, namespaces will be presented as packages,
-# qualified scopes will look different, etc.
-# The default value is: NO.
-
-OPTIMIZE_OUTPUT_JAVA   = NO
-
-# Set the OPTIMIZE_FOR_FORTRAN tag to YES if your project consists of Fortran
-# sources. Doxygen will then generate output that is tailored for Fortran.
-# The default value is: NO.
-
-OPTIMIZE_FOR_FORTRAN   = NO
-
-# Set the OPTIMIZE_OUTPUT_VHDL tag to YES if your project consists of VHDL
-# sources. Doxygen will then generate output that is tailored for VHDL.
-# The default value is: NO.
-
-OPTIMIZE_OUTPUT_VHDL   = NO
-
-# Doxygen selects the parser to use depending on the extension of the files it
-# parses. With this tag you can assign which parser to use for a given
-# extension. Doxygen has a built-in mapping, but you can override or extend it
-# using this tag. The format is ext=language, where ext is a file extension, and
-# language is one of the parsers supported by doxygen: IDL, Java, Javascript,
-# C#, C, C++, D, PHP, Objective-C, Python, Fortran (fixed format Fortran:
-# FortranFixed, free formatted Fortran: FortranFree, unknown formatted Fortran:
-# Fortran. In the latter case the parser tries to guess whether the code is fixed
-# or free formatted code, this is the default for Fortran type files), VHDL. For
-# instance to make doxygen treat .inc files as Fortran files (default is PHP),
-# and .f files as C (default is Fortran), use: inc=Fortran f=C.
-#
-# Note: For files without extension you can use no_extension as a placeholder.
-#
-# Note that for custom extensions you also need to set FILE_PATTERNS otherwise
-# the files are not read by doxygen.
-
-EXTENSION_MAPPING      =
-
-# If the MARKDOWN_SUPPORT tag is enabled then doxygen pre-processes all comments
-# according to the Markdown format, which allows for more readable
-# documentation. See http://daringfireball.net/projects/markdown/ for details.
-# The output of markdown processing is further processed by doxygen, so you can
-# mix doxygen, HTML, and XML commands with Markdown formatting. Disable only in
-# case of backward compatibility issues.
-# The default value is: YES.
-
-MARKDOWN_SUPPORT       = YES
-
-# When enabled doxygen tries to link words that correspond to documented
-# classes, or namespaces to their corresponding documentation. Such a link can
-# be prevented in individual cases by putting a % sign in front of the word or
-# globally by setting AUTOLINK_SUPPORT to NO.
-# The default value is: YES.
-
-AUTOLINK_SUPPORT       = YES
-
-# If you use STL classes (i.e. std::string, std::vector, etc.) but do not want
-# to include (a tag file for) the STL sources as input, then you should set this
-# tag to YES in order to let doxygen match functions declarations and
-# definitions whose arguments contain STL classes (e.g. func(std::string);
-# versus func(std::string) {}). This also makes the inheritance and collaboration
-# diagrams that involve STL classes more complete and accurate.
-# The default value is: NO.
-
-BUILTIN_STL_SUPPORT    = NO
-
-# If you use Microsoft's C++/CLI language, you should set this option to YES to
-# enable parsing support.
-# The default value is: NO.
-
-CPP_CLI_SUPPORT        = NO
-
-# Set the SIP_SUPPORT tag to YES if your project consists of sip (see:
-# http://www.riverbankcomputing.co.uk/software/sip/intro) sources only. Doxygen
-# will parse them like normal C++ but will assume all classes use public instead
-# of private inheritance when no explicit protection keyword is present.
-# The default value is: NO.
-
-SIP_SUPPORT            = NO
-
-# For Microsoft's IDL there are propget and propput attributes to indicate
-# getter and setter methods for a property. Setting this option to YES will make
-# doxygen replace the get and set methods by a property in the documentation.
-# This will only work if the methods are indeed getting or setting a simple
-# type. If this is not the case, or you want to show the methods anyway, you
-# should set this option to NO.
-# The default value is: YES.
-
-IDL_PROPERTY_SUPPORT   = YES
-
-# If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC
-# tag is set to YES then doxygen will reuse the documentation of the first
-# member in the group (if any) for the other members of the group. By default
-# all members of a group must be documented explicitly.
-# The default value is: NO.
-
-DISTRIBUTE_GROUP_DOC   = NO
-
-# If one adds a struct or class to a group and this option is enabled, then also
-# any nested class or struct is added to the same group. By default this option
-# is disabled and one has to add nested compounds explicitly via \ingroup.
-# The default value is: NO.
-
-GROUP_NESTED_COMPOUNDS = NO
-
-# Set the SUBGROUPING tag to YES to allow class member groups of the same type
-# (for instance a group of public functions) to be put as a subgroup of that
-# type (e.g. under the Public Functions section). Set it to NO to prevent
-# subgrouping. Alternatively, this can be done per class using the
-# \nosubgrouping command.
-# The default value is: YES.
-
-SUBGROUPING            = YES
-
-# When the INLINE_GROUPED_CLASSES tag is set to YES, classes, structs and unions
-# are shown inside the group in which they are included (e.g. using \ingroup)
-# instead of on a separate page (for HTML and Man pages) or section (for LaTeX
-# and RTF).
-#
-# Note that this feature does not work in combination with
-# SEPARATE_MEMBER_PAGES.
-# The default value is: NO.
-
-INLINE_GROUPED_CLASSES = NO
-
-# When the INLINE_SIMPLE_STRUCTS tag is set to YES, structs, classes, and unions
-# with only public data fields or simple typedef fields will be shown inline in
-# the documentation of the scope in which they are defined (i.e. file,
-# namespace, or group documentation), provided this scope is documented. If set
-# to NO, structs, classes, and unions are shown on a separate page (for HTML and
-# Man pages) or section (for LaTeX and RTF).
-# The default value is: NO.
-
-INLINE_SIMPLE_STRUCTS  = NO
-
-# When TYPEDEF_HIDES_STRUCT tag is enabled, a typedef of a struct, union, or
-# enum is documented as struct, union, or enum with the name of the typedef. So
-# typedef struct TypeS {} TypeT, will appear in the documentation as a struct
-# with name TypeT. When disabled the typedef will appear as a member of a file,
-# namespace, or class. And the struct will be named TypeS. This can typically be
-# useful for C code in case the coding convention dictates that all compound
-# types are typedef'ed and only the typedef is referenced, never the tag name.
-# The default value is: NO.
-
-TYPEDEF_HIDES_STRUCT   = NO
-
-# The size of the symbol lookup cache can be set using LOOKUP_CACHE_SIZE. This
-# cache is used to resolve symbols given their name and scope. Since this can be
-# an expensive process and often the same symbol appears multiple times in the
-# code, doxygen keeps a cache of pre-resolved symbols. If the cache is too small
-# doxygen will become slower. If the cache is too large, memory is wasted. The
-# cache size is given by this formula: 2^(16+LOOKUP_CACHE_SIZE). The valid range
-# is 0..9, the default is 0, corresponding to a cache size of 2^16=65536
-# symbols. At the end of a run doxygen will report the cache usage and suggest
-# the optimal cache size from a speed point of view.
-# Minimum value: 0, maximum value: 9, default value: 0.
-
-LOOKUP_CACHE_SIZE      = 0
-
-#---------------------------------------------------------------------------
-# Build related configuration options
-#---------------------------------------------------------------------------
-
-# If the EXTRACT_ALL tag is set to YES, doxygen will assume all entities in
-# documentation are documented, even if no documentation was available. Private
-# class members and static file members will be hidden unless the
-# EXTRACT_PRIVATE respectively EXTRACT_STATIC tags are set to YES.
-# Note: This will also disable the warnings about undocumented members that are
-# normally produced when WARNINGS is set to YES.
-# The default value is: NO.
-
-EXTRACT_ALL            = NO
-
-# If the EXTRACT_PRIVATE tag is set to YES, all private members of a class will
-# be included in the documentation.
-# The default value is: NO.
-
-EXTRACT_PRIVATE        = NO
-
-# If the EXTRACT_PACKAGE tag is set to YES, all members with package or internal
-# scope will be included in the documentation.
-# The default value is: NO.
-
-EXTRACT_PACKAGE        = NO
-
-# If the EXTRACT_STATIC tag is set to YES, all static members of a file will be
-# included in the documentation.
-# The default value is: NO.
-
-EXTRACT_STATIC         = NO
-
-# If the EXTRACT_LOCAL_CLASSES tag is set to YES, classes (and structs) defined
-# locally in source files will be included in the documentation. If set to NO,
-# only classes defined in header files are included. Does not have any effect
-# for Java sources.
-# The default value is: YES.
-
-EXTRACT_LOCAL_CLASSES  = YES
-
-# This flag is only useful for Objective-C code. If set to YES, local methods,
-# which are defined in the implementation section but not in the interface are
-# included in the documentation. If set to NO, only methods in the interface are
-# included.
-# The default value is: NO.
-
-EXTRACT_LOCAL_METHODS  = NO
-
-# If this flag is set to YES, the members of anonymous namespaces will be
-# extracted and appear in the documentation as a namespace called
-# 'anonymous_namespace{file}', where file will be replaced with the base name of
-# the file that contains the anonymous namespace. By default anonymous namespaces
-# are hidden.
-# The default value is: NO.
-
-EXTRACT_ANON_NSPACES   = NO
-
-# If the HIDE_UNDOC_MEMBERS tag is set to YES, doxygen will hide all
-# undocumented members inside documented classes or files. If set to NO these
-# members will be included in the various overviews, but no documentation
-# section is generated. This option has no effect if EXTRACT_ALL is enabled.
-# The default value is: NO.
-
-HIDE_UNDOC_MEMBERS     = NO
-
-# If the HIDE_UNDOC_CLASSES tag is set to YES, doxygen will hide all
-# undocumented classes that are normally visible in the class hierarchy. If set
-# to NO, these classes will be included in the various overviews. This option
-# has no effect if EXTRACT_ALL is enabled.
-# The default value is: NO.
-
-HIDE_UNDOC_CLASSES     = NO
-
-# If the HIDE_FRIEND_COMPOUNDS tag is set to YES, doxygen will hide all friend
-# (class|struct|union) declarations. If set to NO, these declarations will be
-# included in the documentation.
-# The default value is: NO.
-
-HIDE_FRIEND_COMPOUNDS  = NO
-
-# If the HIDE_IN_BODY_DOCS tag is set to YES, doxygen will hide any
-# documentation blocks found inside the body of a function. If set to NO, these
-# blocks will be appended to the function's detailed documentation block.
-# The default value is: NO.
-
-HIDE_IN_BODY_DOCS      = NO
-
-# The INTERNAL_DOCS tag determines if documentation that is typed after a
-# \internal command is included. If the tag is set to NO then the documentation
-# will be excluded. Set it to YES to include the internal documentation.
-# The default value is: NO.
-
-INTERNAL_DOCS          = NO
-
-# If the CASE_SENSE_NAMES tag is set to NO then doxygen will only generate file
-# names in lower-case letters. If set to YES, upper-case letters are also
-# allowed. This is useful if you have classes or files whose names only differ
-# in case and if your file system supports case sensitive file names. Windows
-# and Mac users are advised to set this option to NO.
-# The default value is: system dependent.
-
-CASE_SENSE_NAMES       = YES
-
-# If the HIDE_SCOPE_NAMES tag is set to NO then doxygen will show members with
-# their full class and namespace scopes in the documentation. If set to YES, the
-# scope will be hidden.
-# The default value is: NO.
-
-HIDE_SCOPE_NAMES       = NO
-
-# If the HIDE_COMPOUND_REFERENCE tag is set to NO (default) then doxygen will
-# append additional text to a page's title, such as Class Reference. If set to
-# YES the compound reference will be hidden.
-# The default value is: NO.
-
-HIDE_COMPOUND_REFERENCE= NO
-
-# If the SHOW_INCLUDE_FILES tag is set to YES then doxygen will put a list of
-# the files that are included by a file in the documentation of that file.
-# The default value is: YES.
-
-SHOW_INCLUDE_FILES     = YES
-
-# If the SHOW_GROUPED_MEMB_INC tag is set to YES then Doxygen will add for each
-# grouped member an include statement to the documentation, telling the reader
-# which file to include in order to use the member.
-# The default value is: NO.
-
-SHOW_GROUPED_MEMB_INC  = NO
-
-# If the FORCE_LOCAL_INCLUDES tag is set to YES then doxygen will list include
-# files with double quotes in the documentation rather than with sharp brackets.
-# The default value is: NO.
-
-FORCE_LOCAL_INCLUDES   = NO
-
-# If the INLINE_INFO tag is set to YES then a tag [inline] is inserted in the
-# documentation for inline members.
-# The default value is: YES.
-
-INLINE_INFO            = YES
-
-# If the SORT_MEMBER_DOCS tag is set to YES then doxygen will sort the
-# (detailed) documentation of file and class members alphabetically by member
-# name. If set to NO, the members will appear in declaration order.
-# The default value is: YES.
-
-SORT_MEMBER_DOCS       = YES
-
-# If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the brief
-# descriptions of file, namespace and class members alphabetically by member
-# name. If set to NO, the members will appear in declaration order. Note that
-# this will also influence the order of the classes in the class list.
-# The default value is: NO.
-
-SORT_BRIEF_DOCS        = NO
-
-# If the SORT_MEMBERS_CTORS_1ST tag is set to YES then doxygen will sort the
-# (brief and detailed) documentation of class members so that constructors and
-# destructors are listed first. If set to NO the constructors will appear in the
-# respective orders defined by SORT_BRIEF_DOCS and SORT_MEMBER_DOCS.
-# Note: If SORT_BRIEF_DOCS is set to NO this option is ignored for sorting brief
-# member documentation.
-# Note: If SORT_MEMBER_DOCS is set to NO this option is ignored for sorting
-# detailed member documentation.
-# The default value is: NO.
-
-SORT_MEMBERS_CTORS_1ST = NO
-
-# If the SORT_GROUP_NAMES tag is set to YES then doxygen will sort the hierarchy
-# of group names into alphabetical order. If set to NO the group names will
-# appear in their defined order.
-# The default value is: NO.
-
-SORT_GROUP_NAMES       = NO
-
-# If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be sorted by
-# fully-qualified names, including namespaces. If set to NO, the class list will
-# be sorted only by class name, not including the namespace part.
-# Note: This option is not very useful if HIDE_SCOPE_NAMES is set to YES.
-# Note: This option applies only to the class list, not to the alphabetical
-# list.
-# The default value is: NO.
-
-SORT_BY_SCOPE_NAME     = NO
-
-# If the STRICT_PROTO_MATCHING option is enabled and doxygen fails to do proper
-# type resolution of all parameters of a function it will reject a match between
-# the prototype and the implementation of a member function even if there is
-# only one candidate or it is obvious which candidate to choose by doing a
-# simple string match. By disabling STRICT_PROTO_MATCHING doxygen will still
-# accept a match between prototype and implementation in such cases.
-# The default value is: NO.
-
-STRICT_PROTO_MATCHING  = NO
-
-# The GENERATE_TODOLIST tag can be used to enable (YES) or disable (NO) the todo
-# list. This list is created by putting \todo commands in the documentation.
-# The default value is: YES.
-
-GENERATE_TODOLIST      = YES
-
-# The GENERATE_TESTLIST tag can be used to enable (YES) or disable (NO) the test
-# list. This list is created by putting \test commands in the documentation.
-# The default value is: YES.
-
-GENERATE_TESTLIST      = YES
-
-# The GENERATE_BUGLIST tag can be used to enable (YES) or disable (NO) the bug
-# list. This list is created by putting \bug commands in the documentation.
-# The default value is: YES.
-
-GENERATE_BUGLIST       = YES
-
-# The GENERATE_DEPRECATEDLIST tag can be used to enable (YES) or disable (NO)
-# the deprecated list. This list is created by putting \deprecated commands in
-# the documentation.
-# The default value is: YES.
-
-GENERATE_DEPRECATEDLIST= YES
-
-# The ENABLED_SECTIONS tag can be used to enable conditional documentation
-# sections, marked by \if <section_label> ... \endif and \cond <section_label>
-# ... \endcond blocks.
-
-ENABLED_SECTIONS       =
-
-# The MAX_INITIALIZER_LINES tag determines the maximum number of lines that the
-# initial value of a variable or macro / define can have for it to appear in the
-# documentation. If the initializer consists of more lines than specified here
-# it will be hidden. Use a value of 0 to hide initializers completely. The
-# appearance of the value of individual variables and macros / defines can be
-# controlled using \showinitializer or \hideinitializer command in the
-# documentation regardless of this setting.
-# Minimum value: 0, maximum value: 10000, default value: 30.
-
-MAX_INITIALIZER_LINES  = 30
-
-# Set the SHOW_USED_FILES tag to NO to disable the list of files generated at
-# the bottom of the documentation of classes and structs. If set to YES, the
-# list will mention the files that were used to generate the documentation.
-# The default value is: YES.
-
-SHOW_USED_FILES        = YES
-
-# Set the SHOW_FILES tag to NO to disable the generation of the Files page. This
-# will remove the Files entry from the Quick Index and from the Folder Tree View
-# (if specified).
-# The default value is: YES.
-
-SHOW_FILES             = YES
-
-# Set the SHOW_NAMESPACES tag to NO to disable the generation of the Namespaces
-# page. This will remove the Namespaces entry from the Quick Index and from the
-# Folder Tree View (if specified).
-# The default value is: YES.
-
-SHOW_NAMESPACES        = YES
-
-# The FILE_VERSION_FILTER tag can be used to specify a program or script that
-# doxygen should invoke to get the current version for each file (typically from
-# the version control system). Doxygen will invoke the program by executing (via
-# popen()) the command command input-file, where command is the value of the
-# FILE_VERSION_FILTER tag, and input-file is the name of an input file provided
-# by doxygen. Whatever the program writes to standard output is used as the file
-# version. For an example see the documentation.
-
-FILE_VERSION_FILTER    =
-
-# The LAYOUT_FILE tag can be used to specify a layout file which will be parsed
-# by doxygen. The layout file controls the global structure of the generated
-# output files in an output format independent way. To create the layout file
-# that represents doxygen's defaults, run doxygen with the -l option. You can
-# optionally specify a file name after the option, if omitted DoxygenLayout.xml
-# will be used as the name of the layout file.
-#
-# Note that if you run doxygen from a directory containing a file called
-# DoxygenLayout.xml, doxygen will parse it automatically even if the LAYOUT_FILE
-# tag is left empty.
-
-LAYOUT_FILE            =
-
-# The CITE_BIB_FILES tag can be used to specify one or more bib files containing
-# the reference definitions. This must be a list of .bib files. The .bib
-# extension is automatically appended if omitted. This requires the bibtex tool
-# to be installed. See also http://en.wikipedia.org/wiki/BibTeX for more info.
-# For LaTeX the style of the bibliography can be controlled using
-# LATEX_BIB_STYLE. To use this feature you need bibtex and perl available in the
-# search path. See also \cite for info how to create references.
-
-CITE_BIB_FILES         =
-
-#---------------------------------------------------------------------------
-# Configuration options related to warning and progress messages
-#---------------------------------------------------------------------------
-
-# The QUIET tag can be used to turn on/off the messages that are generated to
-# standard output by doxygen. If QUIET is set to YES this implies that the
-# messages are off.
-# The default value is: NO.
-
-QUIET                  = NO
-
-# The WARNINGS tag can be used to turn on/off the warning messages that are
-# generated to standard error (stderr) by doxygen. If WARNINGS is set to YES
-# this implies that the warnings are on.
-#
-# Tip: Turn warnings on while writing the documentation.
-# The default value is: YES.
-
-WARNINGS               = YES
-
-# If the WARN_IF_UNDOCUMENTED tag is set to YES then doxygen will generate
-# warnings for undocumented members. If EXTRACT_ALL is set to YES then this flag
-# will automatically be disabled.
-# The default value is: YES.
-
-WARN_IF_UNDOCUMENTED   = NO
-
-# If the WARN_IF_DOC_ERROR tag is set to YES, doxygen will generate warnings for
-# potential errors in the documentation, such as not documenting some parameters
-# in a documented function, or documenting parameters that don't exist or using
-# markup commands wrongly.
-# The default value is: YES.
-
-WARN_IF_DOC_ERROR      = YES
-
-# This WARN_NO_PARAMDOC option can be enabled to get warnings for functions that
-# are documented, but have no documentation for their parameters or return
-# value. If set to NO, doxygen will only warn about wrong or incomplete
-# parameter documentation, but not about the absence of documentation.
-# The default value is: NO.
-
-WARN_NO_PARAMDOC       = NO
-
-# If the WARN_AS_ERROR tag is set to YES then doxygen will immediately stop when
-# a warning is encountered.
-# The default value is: NO.
-
-WARN_AS_ERROR          = NO
-
-# The WARN_FORMAT tag determines the format of the warning messages that doxygen
-# can produce. The string should contain the $file, $line, and $text tags, which
-# will be replaced by the file and line number from which the warning originated
-# and the warning text. Optionally the format may contain $version, which will
-# be replaced by the version of the file (if it could be obtained via
-# FILE_VERSION_FILTER)
-# The default value is: $file:$line: $text.
-
-WARN_FORMAT            = "$file:$line: $text"
-
-# The WARN_LOGFILE tag can be used to specify a file to which warning and error
-# messages should be written. If left blank the output is written to standard
-# error (stderr).
-
-WARN_LOGFILE           =
-
-#---------------------------------------------------------------------------
-# Configuration options related to the input files
-#---------------------------------------------------------------------------
-
-# The INPUT tag is used to specify the files and/or directories that contain
-# documented source files. You may enter file names like myfile.cpp or
-# directories like /usr/src/myproject. Separate the files or directories with
-# spaces. See also FILE_PATTERNS and EXTENSION_MAPPING
-# Note: If this tag is empty the current directory is searched.
-
-INPUT                  =
-
-# This tag can be used to specify the character encoding of the source files
-# that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses
-# libiconv (or the iconv built into libc) for the transcoding. See the libiconv
-# documentation (see: http://www.gnu.org/software/libiconv) for the list of
-# possible encodings.
-# The default value is: UTF-8.
-
-INPUT_ENCODING         = UTF-8
-
-# If the value of the INPUT tag contains directories, you can use the
-# FILE_PATTERNS tag to specify one or more wildcard patterns (like *.cpp and
-# *.h) to filter out the source-files in the directories.
-#
-# Note that for custom extensions or not directly supported extensions you also
-# need to set EXTENSION_MAPPING for the extension otherwise the files are not
-# read by doxygen.
-#
-# If left blank the following patterns are tested:*.c, *.cc, *.cxx, *.cpp,
-# *.c++, *.java, *.ii, *.ixx, *.ipp, *.i++, *.inl, *.idl, *.ddl, *.odl, *.h,
-# *.hh, *.hxx, *.hpp, *.h++, *.cs, *.d, *.php, *.php4, *.php5, *.phtml, *.inc,
-# *.m, *.markdown, *.md, *.mm, *.dox, *.py, *.pyw, *.f90, *.f, *.for, *.tcl,
-# *.vhd, *.vhdl, *.ucf, *.qsf, *.as and *.js.
-
-FILE_PATTERNS          =
-
-# The RECURSIVE tag can be used to specify whether or not subdirectories should
-# be searched for input files as well.
-# The default value is: NO.
-
-RECURSIVE              = YES
-
-# The EXCLUDE tag can be used to specify files and/or directories that should be
-# excluded from the INPUT source files. This way you can easily exclude a
-# subdirectory from a directory tree whose root is specified with the INPUT tag.
-#
-# Note that relative paths are relative to the directory from which doxygen is
-# run.
-
-EXCLUDE                = gui/build backend/build docs
-
-# The EXCLUDE_SYMLINKS tag can be used to select whether or not files or
-# directories that are symbolic links (a Unix file system feature) are excluded
-# from the input.
-# The default value is: NO.
-
-EXCLUDE_SYMLINKS       = NO
-
-# If the value of the INPUT tag contains directories, you can use the
-# EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude
-# certain files from those directories.
-#
-# Note that the wildcards are matched against the file with absolute path, so to
-# exclude all test directories for example use the pattern */test/*
-
-EXCLUDE_PATTERNS       =
-
-# The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names
-# (namespaces, classes, functions, etc.) that should be excluded from the
-# output. The symbol name can be a fully qualified name, a word, or if the
-# wildcard * is used, a substring. Examples: ANamespace, AClass,
-# AClass::ANamespace, ANamespace::*Test
-#
-# Note that the wildcards are matched against the file with absolute path, so to
-# exclude all test directories use the pattern */test/*
-
-EXCLUDE_SYMBOLS        =
-
-# The EXAMPLE_PATH tag can be used to specify one or more files or directories
-# that contain example code fragments that are included (see the \include
-# command).
-
-EXAMPLE_PATH           =
-
-# If the value of the EXAMPLE_PATH tag contains directories, you can use the
-# EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp and
-# *.h) to filter out the source-files in the directories. If left blank all
-# files are included.
-
-EXAMPLE_PATTERNS       =
-
-# If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be
-# searched for input files to be used with the \include or \dontinclude commands
-# irrespective of the value of the RECURSIVE tag.
-# The default value is: NO.
-
-EXAMPLE_RECURSIVE      = NO
-
-# The IMAGE_PATH tag can be used to specify one or more files or directories
-# that contain images that are to be included in the documentation (see the
-# \image command).
-
-IMAGE_PATH             =
-
-# The INPUT_FILTER tag can be used to specify a program that doxygen should
-# invoke to filter for each input file. Doxygen will invoke the filter program
-# by executing (via popen()) the command:
-#
-# <filter> <input-file>
-#
-# where <filter> is the value of the INPUT_FILTER tag, and <input-file> is the
-# name of an input file. Doxygen will then use the output that the filter
-# program writes to standard output. If FILTER_PATTERNS is specified, this tag
-# will be ignored.
-#
-# Note that the filter must not add or remove lines; it is applied before the
-# code is scanned, but not when the output code is generated. If lines are added
-# or removed, the anchors will not be placed correctly.
-#
-# Note that for custom extensions or not directly supported extensions you also
-# need to set EXTENSION_MAPPING for the extension otherwise the files are not
-# properly processed by doxygen.
-
-INPUT_FILTER           =
-
-# The FILTER_PATTERNS tag can be used to specify filters on a per file pattern
-# basis. Doxygen will compare the file name with each pattern and apply the
-# filter if there is a match. The filters are a list of the form: pattern=filter
-# (like *.cpp=my_cpp_filter). See INPUT_FILTER for further information on how
-# filters are used. If the FILTER_PATTERNS tag is empty or if none of the
-# patterns match the file name, INPUT_FILTER is applied.
-#
-# Note that for custom extensions or not directly supported extensions you also
-# need to set EXTENSION_MAPPING for the extension otherwise the files are not
-# properly processed by doxygen.
-
-FILTER_PATTERNS        =
-
-# If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using
-# INPUT_FILTER) will also be used to filter the input files that are used for
-# producing the source files to browse (i.e. when SOURCE_BROWSER is set to YES).
-# The default value is: NO.
-
-FILTER_SOURCE_FILES    = NO
-
-# The FILTER_SOURCE_PATTERNS tag can be used to specify source filters per file
-# pattern. A pattern will override the setting for FILTER_PATTERNS (if any) and
-# it is also possible to disable source filtering for a specific pattern using
-# *.ext= (so without naming a filter).
-# This tag requires that the tag FILTER_SOURCE_FILES is set to YES.
-
-FILTER_SOURCE_PATTERNS =
-
-# If the USE_MDFILE_AS_MAINPAGE tag refers to the name of a markdown file that
-# is part of the input, its contents will be placed on the main page
-# (index.html). This can be useful if you have a project on for instance GitHub
-# and want to reuse the introduction page also for the doxygen output.
-
-USE_MDFILE_AS_MAINPAGE = README.md
-
-#---------------------------------------------------------------------------
-# Configuration options related to source browsing
-#---------------------------------------------------------------------------
-
-# If the SOURCE_BROWSER tag is set to YES then a list of source files will be
-# generated. Documented entities will be cross-referenced with these sources.
-#
-# Note: To get rid of all source code in the generated output, make sure that
-# also VERBATIM_HEADERS is set to NO.
-# The default value is: NO.
-
-SOURCE_BROWSER         = NO
-
-# Setting the INLINE_SOURCES tag to YES will include the body of functions,
-# classes and enums directly into the documentation.
-# The default value is: NO.
-
-INLINE_SOURCES         = NO
-
-# Setting the STRIP_CODE_COMMENTS tag to YES will instruct doxygen to hide any
-# special comment blocks from generated source code fragments. Normal C, C++ and
-# Fortran comments will always remain visible.
-# The default value is: YES.
-
-STRIP_CODE_COMMENTS    = YES
-
-# If the REFERENCED_BY_RELATION tag is set to YES then for each documented
-# function all documented functions referencing it will be listed.
-# The default value is: NO.
-
-REFERENCED_BY_RELATION = NO
-
-# If the REFERENCES_RELATION tag is set to YES then for each documented function
-# all documented entities called/used by that function will be listed.
-# The default value is: NO.
-
-REFERENCES_RELATION    = NO
-
-# If the REFERENCES_LINK_SOURCE tag is set to YES and SOURCE_BROWSER tag is set
-# to YES then the hyperlinks from functions in REFERENCES_RELATION and
-# REFERENCED_BY_RELATION lists will link to the source code. Otherwise they will
-# link to the documentation.
-# The default value is: YES.
-
-REFERENCES_LINK_SOURCE = YES
-
-# If SOURCE_TOOLTIPS is enabled (the default) then hovering a hyperlink in the
-# source code will show a tooltip with additional information such as prototype,
-# brief description and links to the definition and documentation. Since this
-# will make the HTML file larger and loading of large files a bit slower, you
-# can opt to disable this feature.
-# The default value is: YES.
-# This tag requires that the tag SOURCE_BROWSER is set to YES.
-
-SOURCE_TOOLTIPS        = YES
-
-# If the USE_HTAGS tag is set to YES then the references to source code will
-# point to the HTML generated by the htags(1) tool instead of doxygen built-in
-# source browser. The htags tool is part of GNU's global source tagging system
-# (see http://www.gnu.org/software/global/global.html). You will need version
-# 4.8.6 or higher.
-#
-# To use it do the following:
-# - Install the latest version of global
-# - Enable SOURCE_BROWSER and USE_HTAGS in the config file
-# - Make sure the INPUT points to the root of the source tree
-# - Run doxygen as normal
-#
-# Doxygen will invoke htags (and that will in turn invoke gtags), so these
-# tools must be available from the command line (i.e. in the search path).
-#
-# The result: instead of the source browser generated by doxygen, the links to
-# source code will now point to the output of htags.
-# The default value is: NO.
-# This tag requires that the tag SOURCE_BROWSER is set to YES.
-
-USE_HTAGS              = NO
-
-# If the VERBATIM_HEADERS tag is set to YES then doxygen will generate a
-# verbatim copy of the header file for each class for which an include is
-# specified. Set to NO to disable this.
-# See also: Section \class.
-# The default value is: YES.
-
-VERBATIM_HEADERS       = YES
-
-# If the CLANG_ASSISTED_PARSING tag is set to YES then doxygen will use the
-# clang parser (see: http://clang.llvm.org/) for more accurate parsing at the
-# cost of reduced performance. This can be particularly helpful with template
-# rich C++ code for which doxygen's built-in parser lacks the necessary type
-# information.
-# Note: The availability of this option depends on whether or not doxygen was
-# generated with the -Duse-libclang=ON option for CMake.
-# The default value is: NO.
-
-CLANG_ASSISTED_PARSING = NO
-
-# If clang assisted parsing is enabled you can provide the compiler with command
-# line options that you would normally use when invoking the compiler. Note that
-# the include paths will already be set by doxygen for the files and directories
-# specified with INPUT and INCLUDE_PATH.
-# This tag requires that the tag CLANG_ASSISTED_PARSING is set to YES.
-
-CLANG_OPTIONS          =
-
-#---------------------------------------------------------------------------
-# Configuration options related to the alphabetical class index
-#---------------------------------------------------------------------------
-
-# If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index of all
-# compounds will be generated. Enable this if the project contains a lot of
-# classes, structs, unions or interfaces.
-# The default value is: YES.
-
-ALPHABETICAL_INDEX     = YES
-
-# The COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns in
-# which the alphabetical index list will be split.
-# Minimum value: 1, maximum value: 20, default value: 5.
-# This tag requires that the tag ALPHABETICAL_INDEX is set to YES.
-
-COLS_IN_ALPHA_INDEX    = 5
-
-# In case all classes in a project start with a common prefix, all classes will
-# be put under the same header in the alphabetical index. The IGNORE_PREFIX tag
-# can be used to specify a prefix (or a list of prefixes) that should be ignored
-# while generating the index headers.
-# This tag requires that the tag ALPHABETICAL_INDEX is set to YES.
-
-IGNORE_PREFIX          =
-
-#---------------------------------------------------------------------------
-# Configuration options related to the HTML output
-#---------------------------------------------------------------------------
-
-# If the GENERATE_HTML tag is set to YES, doxygen will generate HTML output
-# The default value is: YES.
-
-GENERATE_HTML          = YES
-
-# The HTML_OUTPUT tag is used to specify where the HTML docs will be put. If a
-# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
-# it.
-# The default directory is: html.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-HTML_OUTPUT            = html
-
-# The HTML_FILE_EXTENSION tag can be used to specify the file extension for each
-# generated HTML page (for example: .htm, .php, .asp).
-# The default value is: .html.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-HTML_FILE_EXTENSION    = .html
-
-# The HTML_HEADER tag can be used to specify a user-defined HTML header file for
-# each generated HTML page. If the tag is left blank doxygen will generate a
-# standard header.
-#
-# To get valid HTML the header file must include any scripts and style sheets
-# that doxygen needs, which depend on the configuration options used (e.g.
-# the setting GENERATE_TREEVIEW). It is highly recommended to start with a
-# default header using
-# doxygen -w html new_header.html new_footer.html new_stylesheet.css
-# YourConfigFile
-# and then modify the file new_header.html. See also section "Doxygen usage"
-# for information on how to generate the default header that doxygen normally
-# uses.
-# Note: The header is subject to change so you typically have to regenerate the
-# default header when upgrading to a newer version of doxygen. For a description
-# of the possible markers and block names see the documentation.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-HTML_HEADER            =
-
-# The HTML_FOOTER tag can be used to specify a user-defined HTML footer for each
-# generated HTML page. If the tag is left blank doxygen will generate a standard
-# footer. See HTML_HEADER for more information on how to generate a default
-# footer and what special commands can be used inside the footer. See also
-# section "Doxygen usage" for information on how to generate the default footer
-# that doxygen normally uses.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-HTML_FOOTER            =
-
-# The HTML_STYLESHEET tag can be used to specify a user-defined cascading style
-# sheet that is used by each HTML page. It can be used to fine-tune the look of
-# the HTML output. If left blank doxygen will generate a default style sheet.
-# See also section "Doxygen usage" for information on how to generate the style
-# sheet that doxygen normally uses.
-# Note: It is recommended to use HTML_EXTRA_STYLESHEET instead of this tag, as
-# it is more robust and this tag (HTML_STYLESHEET) will in the future become
-# obsolete.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-HTML_STYLESHEET        =
-
-# The HTML_EXTRA_STYLESHEET tag can be used to specify additional user-defined
-# cascading style sheets that are included after the standard style sheets
-# created by doxygen. Using this option one can overrule certain style aspects.
-# This is preferred over using HTML_STYLESHEET since it does not replace the
-# standard style sheet and is therefore more robust against future updates.
-# Doxygen will copy the style sheet files to the output directory.
-# Note: The order of the extra style sheet files is of importance (e.g. the last
-# style sheet in the list overrules the setting of the previous ones in the
-# list). For an example see the documentation.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-HTML_EXTRA_STYLESHEET  =
-
-# The HTML_EXTRA_FILES tag can be used to specify one or more extra images or
-# other source files which should be copied to the HTML output directory. Note
-# that these files will be copied to the base HTML output directory. Use the
-# $relpath^ marker in the HTML_HEADER and/or HTML_FOOTER files to load these
-# files. In the HTML_STYLESHEET file, use the file name only. Also note that the
-# files will be copied as-is; there are no commands or markers available.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-HTML_EXTRA_FILES       =
-
-# The HTML_COLORSTYLE_HUE tag controls the color of the HTML output. Doxygen
-# will adjust the colors in the style sheet and background images according to
-# this color. Hue is specified as an angle on a colorwheel, see
-# http://en.wikipedia.org/wiki/Hue for more information. For instance the value
-# 0 represents red, 60 is yellow, 120 is green, 180 is cyan, 240 is blue, 300
-# purple, and 360 is red again.
-# Minimum value: 0, maximum value: 359, default value: 220.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-HTML_COLORSTYLE_HUE    = 220
-
-# The HTML_COLORSTYLE_SAT tag controls the purity (or saturation) of the colors
-# in the HTML output. For a value of 0 the output will use grayscales only. A
-# value of 255 will produce the most vivid colors.
-# Minimum value: 0, maximum value: 255, default value: 100.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-HTML_COLORSTYLE_SAT    = 100
-
-# The HTML_COLORSTYLE_GAMMA tag controls the gamma correction applied to the
-# luminance component of the colors in the HTML output. Values below 100
-# gradually make the output lighter, whereas values above 100 make the output
-# darker. The value divided by 100 is the actual gamma applied, so 80 represents
-# a gamma of 0.8, the value 220 represents a gamma of 2.2, and 100 does not
-# change the gamma.
-# Minimum value: 40, maximum value: 240, default value: 80.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-HTML_COLORSTYLE_GAMMA  = 80
-
-# If the HTML_TIMESTAMP tag is set to YES then the footer of each generated HTML
-# page will contain the date and time when the page was generated. Setting this
-# to YES can help to show when doxygen was last run and thus if the
-# documentation is up to date.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-HTML_TIMESTAMP         = NO
-
-# If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML
-# documentation will contain sections that can be hidden and shown after the
-# page has loaded.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-HTML_DYNAMIC_SECTIONS  = NO
-
-# With HTML_INDEX_NUM_ENTRIES one can control the preferred number of entries
-# shown in the various tree structured indices initially; the user can expand
-# and collapse entries dynamically later on. Doxygen will expand the tree to
-# such a level that at most the specified number of entries are visible (unless
-# a fully collapsed tree already exceeds this amount). So setting the number of
-# entries to 1 will produce a fully collapsed tree by default. 0 is a special
-# value representing an infinite number of entries and will result in a fully expanded
-# tree by default.
-# Minimum value: 0, maximum value: 9999, default value: 100.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-HTML_INDEX_NUM_ENTRIES = 100
-
-# If the GENERATE_DOCSET tag is set to YES, additional index files will be
-# generated that can be used as input for Apple's Xcode 3 integrated development
-# environment (see: http://developer.apple.com/tools/xcode/), introduced with
-# OSX 10.5 (Leopard). To create a documentation set, doxygen will generate a
-# Makefile in the HTML output directory. Running make will produce the docset in
-# that directory and running make install will install the docset in
-# ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find it at
-# startup. See http://developer.apple.com/tools/creatingdocsetswithdoxygen.html
-# for more information.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-GENERATE_DOCSET        = NO
-
-# This tag determines the name of the docset feed. A documentation feed provides
-# an umbrella under which multiple documentation sets from a single provider
-# (such as a company or product suite) can be grouped.
-# The default value is: Doxygen generated docs.
-# This tag requires that the tag GENERATE_DOCSET is set to YES.
-
-DOCSET_FEEDNAME        = "Doxygen generated docs"
-
-# This tag specifies a string that should uniquely identify the documentation
-# set bundle. This should be a reverse domain-name style string, e.g.
-# com.mycompany.MyDocSet. Doxygen will append .docset to the name.
-# The default value is: org.doxygen.Project.
-# This tag requires that the tag GENERATE_DOCSET is set to YES.
-
-DOCSET_BUNDLE_ID       = org.doxygen.Project
-
-# The DOCSET_PUBLISHER_ID tag specifies a string that should uniquely identify
-# the documentation publisher. This should be a reverse domain-name style
-# string, e.g. com.mycompany.MyDocSet.documentation.
-# The default value is: org.doxygen.Publisher.
-# This tag requires that the tag GENERATE_DOCSET is set to YES.
-
-DOCSET_PUBLISHER_ID    = org.doxygen.Publisher
-
-# The DOCSET_PUBLISHER_NAME tag identifies the documentation publisher.
-# The default value is: Publisher.
-# This tag requires that the tag GENERATE_DOCSET is set to YES.
-
-DOCSET_PUBLISHER_NAME  = Publisher
-
-# If the GENERATE_HTMLHELP tag is set to YES then doxygen generates three
-# additional HTML index files: index.hhp, index.hhc, and index.hhk. The
-# index.hhp is a project file that can be read by Microsoft's HTML Help Workshop
-# (see: http://www.microsoft.com/en-us/download/details.aspx?id=21138) on
-# Windows.
-#
-# The HTML Help Workshop contains a compiler that can convert all HTML output
-# generated by doxygen into a single compiled HTML file (.chm). Compiled HTML
-# files are now used as the Windows 98 help format, and will replace the old
-# Windows help format (.hlp) on all Windows platforms in the future. Compressed
-# HTML files also contain an index, a table of contents, and you can search for
-# words in the documentation. The HTML workshop also contains a viewer for
-# compressed HTML files.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-GENERATE_HTMLHELP      = NO
-
-# The CHM_FILE tag can be used to specify the file name of the resulting .chm
-# file. You can add a path in front of the file if the result should not be
-# written to the html output directory.
-# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
-
-CHM_FILE               =
-
-# The HHC_LOCATION tag can be used to specify the location (absolute path
-# including file name) of the HTML help compiler (hhc.exe). If non-empty,
-# doxygen will try to run the HTML help compiler on the generated index.hhp.
-# The file has to be specified with full path.
-# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
-
-HHC_LOCATION           =
-
-# The GENERATE_CHI flag controls if a separate .chi index file is generated
-# (YES) or that it should be included in the master .chm file (NO).
-# The default value is: NO.
-# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
-
-GENERATE_CHI           = NO
-
-# The CHM_INDEX_ENCODING is used to encode HtmlHelp index (hhk), content (hhc)
-# and project file content.
-# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
-
-CHM_INDEX_ENCODING     =
-
-# The BINARY_TOC flag controls whether a binary table of contents is generated
-# (YES) or a normal table of contents (NO) in the .chm file. Furthermore it
-# enables the Previous and Next buttons.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
-
-BINARY_TOC             = NO
-
-# The TOC_EXPAND flag can be set to YES to add extra items for group members to
-# the table of contents of the HTML help documentation and to the tree view.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
-
-TOC_EXPAND             = NO
-
-# If the GENERATE_QHP tag is set to YES and both QHP_NAMESPACE and
-# QHP_VIRTUAL_FOLDER are set, an additional index file will be generated that
-# can be used as input for Qt's qhelpgenerator to generate a Qt Compressed Help
-# (.qch) of the generated HTML documentation.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-GENERATE_QHP           = NO
-
-# If the QHG_LOCATION tag is specified, the QCH_FILE tag can be used to specify
-# the file name of the resulting .qch file. The path specified is relative to
-# the HTML output folder.
-# This tag requires that the tag GENERATE_QHP is set to YES.
-
-QCH_FILE               =
-
-# The QHP_NAMESPACE tag specifies the namespace to use when generating Qt Help
-# Project output. For more information please see Qt Help Project / Namespace
-# (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#namespace).
-# The default value is: org.doxygen.Project.
-# This tag requires that the tag GENERATE_QHP is set to YES.
-
-QHP_NAMESPACE          = org.doxygen.Project
-
-# The QHP_VIRTUAL_FOLDER tag specifies the namespace to use when generating Qt
-# Help Project output. For more information please see Qt Help Project / Virtual
-# Folders (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#virtual-
-# folders).
-# The default value is: doc.
-# This tag requires that the tag GENERATE_QHP is set to YES.
-
-QHP_VIRTUAL_FOLDER     = doc
-
-# If the QHP_CUST_FILTER_NAME tag is set, it specifies the name of a custom
-# filter to add. For more information please see Qt Help Project / Custom
-# Filters (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#custom-
-# filters).
-# This tag requires that the tag GENERATE_QHP is set to YES.
-
-QHP_CUST_FILTER_NAME   =
-
-# The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the
-# custom filter to add. For more information please see Qt Help Project / Custom
-# Filters (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#custom-
-# filters).
-# This tag requires that the tag GENERATE_QHP is set to YES.
-
-QHP_CUST_FILTER_ATTRS  =
-
-# The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this
-# project's filter section matches. Qt Help Project / Filter Attributes (see:
-# http://qt-project.org/doc/qt-4.8/qthelpproject.html#filter-attributes).
-# This tag requires that the tag GENERATE_QHP is set to YES.
-
-QHP_SECT_FILTER_ATTRS  =
-
-# The QHG_LOCATION tag can be used to specify the location of Qt's
-# qhelpgenerator. If non-empty doxygen will try to run qhelpgenerator on the
-# generated .qhp file.
-# This tag requires that the tag GENERATE_QHP is set to YES.
-
-QHG_LOCATION           =
-
-# If the GENERATE_ECLIPSEHELP tag is set to YES, additional index files will be
-# generated, together with the HTML files, they form an Eclipse help plugin. To
-# install this plugin and make it available under the help contents menu in
-# Eclipse, the contents of the directory containing the HTML and XML files needs
-# to be copied into the plugins directory of eclipse. The name of the directory
-# within the plugins directory should be the same as the ECLIPSE_DOC_ID value.
-# After copying Eclipse needs to be restarted before the help appears.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-GENERATE_ECLIPSEHELP   = NO
-
-# A unique identifier for the Eclipse help plugin. When installing the plugin
-# the directory name containing the HTML and XML files should also have this
-# name. Each documentation set should have its own identifier.
-# The default value is: org.doxygen.Project.
-# This tag requires that the tag GENERATE_ECLIPSEHELP is set to YES.
-
-ECLIPSE_DOC_ID         = org.doxygen.Project
-
-# If you want full control over the layout of the generated HTML pages it might
-# be necessary to disable the index and replace it with your own. The
-# DISABLE_INDEX tag can be used to turn on/off the condensed index (tabs) at top
-# of each HTML page. A value of NO enables the index and the value YES disables
-# it. Since the tabs in the index contain the same information as the navigation
-# tree, you can set this option to YES if you also set GENERATE_TREEVIEW to YES.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-DISABLE_INDEX          = NO
-
-# The GENERATE_TREEVIEW tag is used to specify whether a tree-like index
-# structure should be generated to display hierarchical information. If the tag
-# value is set to YES, a side panel will be generated containing a tree-like
-# index structure (just like the one that is generated for HTML Help). For this
-# to work a browser that supports JavaScript, DHTML, CSS and frames is required
-# (i.e. any modern browser). Windows users are probably better off using the
-# HTML help feature. Via custom style sheets (see HTML_EXTRA_STYLESHEET) one can
-# further fine-tune the look of the index. As an example, the default style
-# sheet generated by doxygen has an example that shows how to put an image at
-# the root of the tree instead of the PROJECT_NAME. Since the tree basically has
-# the same information as the tab index, you could consider setting
-# DISABLE_INDEX to YES when enabling this option.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-GENERATE_TREEVIEW      = NO
-
-# The ENUM_VALUES_PER_LINE tag can be used to set the number of enum values that
-# doxygen will group on one line in the generated HTML documentation.
-#
-# Note that a value of 0 will completely suppress the enum values from appearing
-# in the overview section.
-# Minimum value: 0, maximum value: 20, default value: 4.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-ENUM_VALUES_PER_LINE   = 4
-
-# If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be used
-# to set the initial width (in pixels) of the frame in which the tree is shown.
-# Minimum value: 0, maximum value: 1500, default value: 250.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-TREEVIEW_WIDTH         = 250
-
-# If the EXT_LINKS_IN_WINDOW option is set to YES, doxygen will open links to
-# external symbols imported via tag files in a separate window.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-EXT_LINKS_IN_WINDOW    = NO
-
-# Use this tag to change the font size of LaTeX formulas included as images in
-# the HTML documentation. When you change the font size after a successful
-# doxygen run you need to manually remove any form_*.png images from the HTML
-# output directory to force them to be regenerated.
-# Minimum value: 8, maximum value: 50, default value: 10.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-FORMULA_FONTSIZE       = 10
-
-# Use the FORMULA_TRANSPARENT tag to determine whether or not the images
-# generated for formulas are transparent PNGs. Transparent PNGs are not
-# supported properly for IE 6.0, but are supported on all modern browsers.
-#
-# Note that when changing this option you need to delete any form_*.png files in
-# the HTML output directory before the changes have effect.
-# The default value is: YES.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-FORMULA_TRANSPARENT    = YES
-
-# Enable the USE_MATHJAX option to render LaTeX formulas using MathJax (see
-# http://www.mathjax.org) which uses client side Javascript for the rendering
-# instead of using pre-rendered bitmaps. Use this if you do not have LaTeX
-# installed or if you want formulas to look prettier in the HTML output. When
-# enabled you may also need to install MathJax separately and configure the path
-# to it using the MATHJAX_RELPATH option.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-USE_MATHJAX            = NO
-
-# When MathJax is enabled you can set the default output format to be used for
-# the MathJax output. See the MathJax site (see:
-# http://docs.mathjax.org/en/latest/output.html) for more details.
-# Possible values are: HTML-CSS (which is slower, but has the best
-# compatibility), NativeMML (i.e. MathML) and SVG.
-# The default value is: HTML-CSS.
-# This tag requires that the tag USE_MATHJAX is set to YES.
-
-MATHJAX_FORMAT         = HTML-CSS
-
-# When MathJax is enabled you need to specify the location relative to the HTML
-# output directory using the MATHJAX_RELPATH option. The destination directory
-# should contain the MathJax.js script. For instance, if the mathjax directory
-# is located at the same level as the HTML output directory, then
-# MATHJAX_RELPATH should be ../mathjax. The default value points to the MathJax
-# Content Delivery Network so you can quickly see the result without installing
-# MathJax. However, it is strongly recommended to install a local copy of
-# MathJax from http://www.mathjax.org before deployment.
-# The default value is: http://cdn.mathjax.org/mathjax/latest.
-# This tag requires that the tag USE_MATHJAX is set to YES.
-
-MATHJAX_RELPATH        = http://cdn.mathjax.org/mathjax/latest
-
-# The MATHJAX_EXTENSIONS tag can be used to specify one or more MathJax
-# extension names that should be enabled during MathJax rendering. For example
-# MATHJAX_EXTENSIONS = TeX/AMSmath TeX/AMSsymbols
-# This tag requires that the tag USE_MATHJAX is set to YES.
-
-MATHJAX_EXTENSIONS     =
-
-# The MATHJAX_CODEFILE tag can be used to specify a file with javascript pieces
-# of code that will be used on startup of the MathJax code. See the MathJax site
-# (see: http://docs.mathjax.org/en/latest/output.html) for more details. For an
-# example see the documentation.
-# This tag requires that the tag USE_MATHJAX is set to YES.
-
-MATHJAX_CODEFILE       =
-
-# When the SEARCHENGINE tag is enabled doxygen will generate a search box for
-# the HTML output. The underlying search engine uses javascript and DHTML and
-# should work on any modern browser. Note that when using HTML help
-# (GENERATE_HTMLHELP), Qt help (GENERATE_QHP), or docsets (GENERATE_DOCSET)
-# there is already a search function so this one should typically be disabled.
-# For large projects the javascript based search engine can be slow, then
-# enabling SERVER_BASED_SEARCH may provide a better solution. It is possible to
-# search using the keyboard; to jump to the search box use <access key> + S
-# (what the <access key> is depends on the OS and browser, but it is typically
-# <CTRL>, <ALT>/<option>, or both). Inside the search box use the <cursor down
-# key> to jump into the search results window, the results can be navigated
-# using the <cursor keys>. Press <Enter> to select an item or <escape> to cancel
-# the search. The filter options can be selected when the cursor is inside the
-# search box by pressing <Shift>+<cursor down>. Also here use the <cursor keys>
-# to select a filter and <Enter> or <escape> to activate or cancel the filter
-# option.
-# The default value is: YES.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-SEARCHENGINE           = YES
-
-# When the SERVER_BASED_SEARCH tag is enabled the search engine will be
-# implemented using a web server instead of a web client using Javascript. There
-# are two flavors of web server based searching depending on the EXTERNAL_SEARCH
-# setting. When disabled, doxygen will generate a PHP script for searching and
-# an index file used by the script. When EXTERNAL_SEARCH is enabled the indexing
-# and searching needs to be provided by external tools. See the section
-# "External Indexing and Searching" for details.
-# The default value is: NO.
-# This tag requires that the tag SEARCHENGINE is set to YES.
-
-SERVER_BASED_SEARCH    = NO
-
-# When EXTERNAL_SEARCH tag is enabled doxygen will no longer generate the PHP
-# script for searching. Instead the search results are written to an XML file
-# which needs to be processed by an external indexer. Doxygen will invoke an
-# external search engine pointed to by the SEARCHENGINE_URL option to obtain the
-# search results.
-#
-# Doxygen ships with an example indexer (doxyindexer) and search engine
-# (doxysearch.cgi) which are based on the open source search engine library
-# Xapian (see: http://xapian.org/).
-#
-# See the section "External Indexing and Searching" for details.
-# The default value is: NO.
-# This tag requires that the tag SEARCHENGINE is set to YES.
-
-EXTERNAL_SEARCH        = NO
-
-# The SEARCHENGINE_URL should point to a search engine hosted by a web server
-# which will return the search results when EXTERNAL_SEARCH is enabled.
-#
-# Doxygen ships with an example indexer (doxyindexer) and search engine
-# (doxysearch.cgi) which are based on the open source search engine library
-# Xapian (see: http://xapian.org/). See the section "External Indexing and
-# Searching" for details.
-# This tag requires that the tag SEARCHENGINE is set to YES.
-
-SEARCHENGINE_URL       =
-
-# When SERVER_BASED_SEARCH and EXTERNAL_SEARCH are both enabled the unindexed
-# search data is written to a file for indexing by an external tool. With the
-# SEARCHDATA_FILE tag the name of this file can be specified.
-# The default file is: searchdata.xml.
-# This tag requires that the tag SEARCHENGINE is set to YES.
-
-SEARCHDATA_FILE        = searchdata.xml
-
-# When SERVER_BASED_SEARCH and EXTERNAL_SEARCH are both enabled the
-# EXTERNAL_SEARCH_ID tag can be used as an identifier for the project. This is
-# useful in combination with EXTRA_SEARCH_MAPPINGS to search through multiple
-# projects and redirect the results back to the right project.
-# This tag requires that the tag SEARCHENGINE is set to YES.
-
-EXTERNAL_SEARCH_ID     =
-
-# The EXTRA_SEARCH_MAPPINGS tag can be used to enable searching through doxygen
-# projects other than the one defined by this configuration file, but that are
-# all added to the same external search index. Each project needs to have a
-# unique id set via EXTERNAL_SEARCH_ID. The search mapping then maps the id
-# to a relative location where the documentation can be found. The format is:
-# EXTRA_SEARCH_MAPPINGS = tagname1=loc1 tagname2=loc2 ...
-# This tag requires that the tag SEARCHENGINE is set to YES.
-
-EXTRA_SEARCH_MAPPINGS  =
-
-#---------------------------------------------------------------------------
-# Configuration options related to the LaTeX output
-#---------------------------------------------------------------------------
-
-# If the GENERATE_LATEX tag is set to YES, doxygen will generate LaTeX output.
-# The default value is: YES.
-
-GENERATE_LATEX         = NO
-
-# The LATEX_OUTPUT tag is used to specify where the LaTeX docs will be put. If a
-# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
-# it.
-# The default directory is: latex.
-# This tag requires that the tag GENERATE_LATEX is set to YES.
-
-LATEX_OUTPUT           = latex
-
-# The LATEX_CMD_NAME tag can be used to specify the LaTeX command name to be
-# invoked.
-#
-# Note that when enabling USE_PDFLATEX this option is only used for generating
-# bitmaps for formulas in the HTML output, but not in the Makefile that is
-# written to the output directory.
-# The default file is: latex.
-# This tag requires that the tag GENERATE_LATEX is set to YES.
-
-LATEX_CMD_NAME         = latex
-
-# The MAKEINDEX_CMD_NAME tag can be used to specify the command name to generate
-# index for LaTeX.
-# The default file is: makeindex.
-# This tag requires that the tag GENERATE_LATEX is set to YES.
-
-MAKEINDEX_CMD_NAME     = makeindex
-
-# If the COMPACT_LATEX tag is set to YES, doxygen generates more compact LaTeX
-# documents. This may be useful for small projects and may help to save some
-# trees in general.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_LATEX is set to YES.
-
-COMPACT_LATEX          = NO
-
-# The PAPER_TYPE tag can be used to set the paper type that is used by the
-# printer.
-# Possible values are: a4 (210 x 297 mm), letter (8.5 x 11 inches), legal (8.5 x
-# 14 inches) and executive (7.25 x 10.5 inches).
-# The default value is: a4.
-# This tag requires that the tag GENERATE_LATEX is set to YES.
-
-PAPER_TYPE             = a4
-
-# The EXTRA_PACKAGES tag can be used to specify one or more LaTeX package names
-# that should be included in the LaTeX output. The package can be specified just
-# by its name or with the correct syntax as to be used with the LaTeX
-# \usepackage command. To get the times font for instance you can specify :
-# EXTRA_PACKAGES=times or EXTRA_PACKAGES={times}
-# To use the option intlimits with the amsmath package you can specify:
-# EXTRA_PACKAGES=[intlimits]{amsmath}
-# If left blank no extra packages will be included.
-# This tag requires that the tag GENERATE_LATEX is set to YES.
-
-EXTRA_PACKAGES         =
-
-# The LATEX_HEADER tag can be used to specify a personal LaTeX header for the
-# generated LaTeX document. The header should contain everything until the first
-# chapter. If it is left blank doxygen will generate a standard header. See
-# section "Doxygen usage" for information on how to let doxygen write the
-# default header to a separate file.
-#
-# Note: Only use a user-defined header if you know what you are doing! The
-# following commands have a special meaning inside the header: $title,
-# $datetime, $date, $doxygenversion, $projectname, $projectnumber,
-# $projectbrief, $projectlogo. Doxygen will replace $title with the empty
-# string, for the replacement values of the other commands the user is referred
-# to HTML_HEADER.
-# This tag requires that the tag GENERATE_LATEX is set to YES.
-
-LATEX_HEADER           =
-
-# The LATEX_FOOTER tag can be used to specify a personal LaTeX footer for the
-# generated LaTeX document. The footer should contain everything after the last
-# chapter. If it is left blank doxygen will generate a standard footer. See
-# LATEX_HEADER for more information on how to generate a default footer and what
-# special commands can be used inside the footer.
-#
-# Note: Only use a user-defined footer if you know what you are doing!
-# This tag requires that the tag GENERATE_LATEX is set to YES.
-
-LATEX_FOOTER           =
-
-# The LATEX_EXTRA_STYLESHEET tag can be used to specify additional user-defined
-# LaTeX style sheets that are included after the standard style sheets created
-# by doxygen. Using this option one can overrule certain style aspects. Doxygen
-# will copy the style sheet files to the output directory.
-# Note: The order of the extra style sheet files is of importance (e.g. the last
-# style sheet in the list overrules the setting of the previous ones in the
-# list).
-# This tag requires that the tag GENERATE_LATEX is set to YES.
-
-LATEX_EXTRA_STYLESHEET =
-
-# The LATEX_EXTRA_FILES tag can be used to specify one or more extra images or
-# other source files which should be copied to the LATEX_OUTPUT output
-# directory. Note that the files will be copied as-is; there are no commands or
-# markers available.
-# This tag requires that the tag GENERATE_LATEX is set to YES.
-
-LATEX_EXTRA_FILES      =
-
-# If the PDF_HYPERLINKS tag is set to YES, the LaTeX that is generated is
-# prepared for conversion to PDF (using ps2pdf or pdflatex). The PDF file will
-# contain links (just like the HTML output) instead of page references. This
-# makes the output suitable for online browsing using a PDF viewer.
-# The default value is: YES.
-# This tag requires that the tag GENERATE_LATEX is set to YES.
-
-PDF_HYPERLINKS         = YES
-
-# If the USE_PDFLATEX tag is set to YES, doxygen will use pdflatex to generate
-# the PDF file directly from the LaTeX files. Set this option to YES, to get a
-# higher quality PDF documentation.
-# The default value is: YES.
-# This tag requires that the tag GENERATE_LATEX is set to YES.
-
-USE_PDFLATEX           = YES
-
-# If the LATEX_BATCHMODE tag is set to YES, doxygen will add the \batchmode
-# command to the generated LaTeX files. This will instruct LaTeX to keep running
-# if errors occur, instead of asking the user for help. This option is also used
-# when generating formulas in HTML.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_LATEX is set to YES.
-
-LATEX_BATCHMODE        = NO
-
-# If the LATEX_HIDE_INDICES tag is set to YES then doxygen will not include the
-# index chapters (such as File Index, Compound Index, etc.) in the output.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_LATEX is set to YES.
-
-LATEX_HIDE_INDICES     = NO
-
-# If the LATEX_SOURCE_CODE tag is set to YES then doxygen will include source
-# code with syntax highlighting in the LaTeX output.
-#
-# Note that which sources are shown also depends on other settings such as
-# SOURCE_BROWSER.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_LATEX is set to YES.
-
-LATEX_SOURCE_CODE      = NO
-
-# The LATEX_BIB_STYLE tag can be used to specify the style to use for the
-# bibliography, e.g. plainnat, or ieeetr. See
-# http://en.wikipedia.org/wiki/BibTeX and \cite for more info.
-# The default value is: plain.
-# This tag requires that the tag GENERATE_LATEX is set to YES.
-
-LATEX_BIB_STYLE        = plain
-
-# If the LATEX_TIMESTAMP tag is set to YES then the footer of each generated
-# page will contain the date and time when the page was generated. Setting this
-# to NO can help when comparing the output of multiple runs.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_LATEX is set to YES.
-
-LATEX_TIMESTAMP        = NO
-
-#---------------------------------------------------------------------------
-# Configuration options related to the RTF output
-#---------------------------------------------------------------------------
-
-# If the GENERATE_RTF tag is set to YES, doxygen will generate RTF output. The
-# RTF output is optimized for Word 97 and may not look too pretty with other RTF
-# readers/editors.
-# The default value is: NO.
-
-GENERATE_RTF           = NO
-
-# The RTF_OUTPUT tag is used to specify where the RTF docs will be put. If a
-# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
-# it.
-# The default directory is: rtf.
-# This tag requires that the tag GENERATE_RTF is set to YES.
-
-RTF_OUTPUT             = rtf
-
-# If the COMPACT_RTF tag is set to YES, doxygen generates more compact RTF
-# documents. This may be useful for small projects and may help to save some
-# trees in general.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_RTF is set to YES.
-
-COMPACT_RTF            = NO
-
-# If the RTF_HYPERLINKS tag is set to YES, the RTF that is generated will
-# contain hyperlink fields. The RTF file will contain links (just like the HTML
-# output) instead of page references. This makes the output suitable for online
-# browsing using Word or some other Word compatible readers that support those
-# fields.
-#
-# Note: WordPad (write) and others do not support links.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_RTF is set to YES.
-
-RTF_HYPERLINKS         = NO
-
-# Load stylesheet definitions from file. Syntax is similar to doxygen's config
-# file, i.e. a series of assignments. You only have to provide replacements,
-# missing definitions are set to their default value.
-#
-# See also section "Doxygen usage" for information on how to generate the
-# default style sheet that doxygen normally uses.
-# This tag requires that the tag GENERATE_RTF is set to YES.
-
-RTF_STYLESHEET_FILE    =
-
-# Set optional variables used in the generation of an RTF document. Syntax is
-# similar to doxygen's config file. A template extensions file can be generated
-# using doxygen -e rtf extensionFile.
-# This tag requires that the tag GENERATE_RTF is set to YES.
-
-RTF_EXTENSIONS_FILE    =
-
-# If the RTF_SOURCE_CODE tag is set to YES then doxygen will include source code
-# with syntax highlighting in the RTF output.
-#
-# Note that which sources are shown also depends on other settings such as
-# SOURCE_BROWSER.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_RTF is set to YES.
-
-RTF_SOURCE_CODE        = NO
-
-#---------------------------------------------------------------------------
-# Configuration options related to the man page output
-#---------------------------------------------------------------------------
-
-# If the GENERATE_MAN tag is set to YES, doxygen will generate man pages for
-# classes and files.
-# The default value is: NO.
-
-GENERATE_MAN           = NO
-
-# The MAN_OUTPUT tag is used to specify where the man pages will be put. If a
-# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
-# it. A directory man3 will be created inside the directory specified by
-# MAN_OUTPUT.
-# The default directory is: man.
-# This tag requires that the tag GENERATE_MAN is set to YES.
-
-MAN_OUTPUT             = man
-
-# The MAN_EXTENSION tag determines the extension that is added to the generated
-# man pages. In case the manual section does not start with a number, the number
-# 3 is prepended. The dot (.) at the beginning of the MAN_EXTENSION tag is
-# optional.
-# The default value is: .3.
-# This tag requires that the tag GENERATE_MAN is set to YES.
-
-MAN_EXTENSION          = .3
-
-# The MAN_SUBDIR tag determines the name of the directory created within
-# MAN_OUTPUT in which the man pages are placed. It defaults to man followed by
-# MAN_EXTENSION with the initial . removed.
-# This tag requires that the tag GENERATE_MAN is set to YES.
-
-MAN_SUBDIR             =
-
-# If the MAN_LINKS tag is set to YES and doxygen generates man output, then it
-# will generate one additional man file for each entity documented in the real
-# man page(s). These additional files only source the real man page, but without
-# them the man command would be unable to find the correct page.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_MAN is set to YES.
-
-MAN_LINKS              = NO
-
-#---------------------------------------------------------------------------
-# Configuration options related to the XML output
-#---------------------------------------------------------------------------
-
-# If the GENERATE_XML tag is set to YES, doxygen will generate an XML file that
-# captures the structure of the code including all documentation.
-# The default value is: NO.
-
-GENERATE_XML           = NO
-
-# The XML_OUTPUT tag is used to specify where the XML pages will be put. If a
-# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
-# it.
-# The default directory is: xml.
-# This tag requires that the tag GENERATE_XML is set to YES.
-
-XML_OUTPUT             = xml
-
-# If the XML_PROGRAMLISTING tag is set to YES, doxygen will dump the program
-# listings (including syntax highlighting and cross-referencing information) to
-# the XML output. Note that enabling this will significantly increase the size
-# of the XML output.
-# The default value is: YES.
-# This tag requires that the tag GENERATE_XML is set to YES.
-
-XML_PROGRAMLISTING     = YES
-
-#---------------------------------------------------------------------------
-# Configuration options related to the DOCBOOK output
-#---------------------------------------------------------------------------
-
-# If the GENERATE_DOCBOOK tag is set to YES, doxygen will generate Docbook files
-# that can be used to generate PDF.
-# The default value is: NO.
-
-GENERATE_DOCBOOK       = NO
-
-# The DOCBOOK_OUTPUT tag is used to specify where the Docbook pages will be put.
-# If a relative path is entered the value of OUTPUT_DIRECTORY will be put in
-# front of it.
-# The default directory is: docbook.
-# This tag requires that the tag GENERATE_DOCBOOK is set to YES.
-
-DOCBOOK_OUTPUT         = docbook
-
-# If the DOCBOOK_PROGRAMLISTING tag is set to YES, doxygen will include the
-# program listings (including syntax highlighting and cross-referencing
-# information) to the DOCBOOK output. Note that enabling this will significantly
-# increase the size of the DOCBOOK output.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_DOCBOOK is set to YES.
-
-DOCBOOK_PROGRAMLISTING = NO
-
-#---------------------------------------------------------------------------
-# Configuration options for the AutoGen Definitions output
-#---------------------------------------------------------------------------
-
-# If the GENERATE_AUTOGEN_DEF tag is set to YES, doxygen will generate an
-# AutoGen Definitions (see http://autogen.sf.net) file that captures the
-# structure of the code including all documentation. Note that this feature is
-# still experimental and incomplete at the moment.
-# The default value is: NO.
-
-GENERATE_AUTOGEN_DEF   = NO
-
-#---------------------------------------------------------------------------
-# Configuration options related to the Perl module output
-#---------------------------------------------------------------------------
-
-# If the GENERATE_PERLMOD tag is set to YES, doxygen will generate a Perl module
-# file that captures the structure of the code including all documentation.
-#
-# Note that this feature is still experimental and incomplete at the moment.
-# The default value is: NO.
-
-GENERATE_PERLMOD       = NO
-
-# If the PERLMOD_LATEX tag is set to YES, doxygen will generate the necessary
-# Makefile rules, Perl scripts and LaTeX code to be able to generate PDF and DVI
-# output from the Perl module output.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_PERLMOD is set to YES.
-
-PERLMOD_LATEX          = NO
-
-# If the PERLMOD_PRETTY tag is set to YES, the Perl module output will be nicely
-# formatted so it can be parsed by a human reader. This is useful if you want to
-# understand what is going on. On the other hand, if this tag is set to NO, the
-# size of the Perl module output will be much smaller and Perl will parse it
-# just the same.
-# The default value is: YES.
-# This tag requires that the tag GENERATE_PERLMOD is set to YES.
-
-PERLMOD_PRETTY         = YES
-
-# The names of the make variables in the generated doxyrules.make file are
-# prefixed with the string contained in PERLMOD_MAKEVAR_PREFIX. This is useful
-# so different doxyrules.make files included by the same Makefile don't
-# overwrite each other's variables.
-# This tag requires that the tag GENERATE_PERLMOD is set to YES.
-
-PERLMOD_MAKEVAR_PREFIX =
-
-#---------------------------------------------------------------------------
-# Configuration options related to the preprocessor
-#---------------------------------------------------------------------------
-
-# If the ENABLE_PREPROCESSING tag is set to YES, doxygen will evaluate all
-# C-preprocessor directives found in the sources and include files.
-# The default value is: YES.
-
-ENABLE_PREPROCESSING   = YES
-
-# If the MACRO_EXPANSION tag is set to YES, doxygen will expand all macro names
-# in the source code. If set to NO, only conditional compilation will be
-# performed. Macro expansion can be done in a controlled way by setting
-# EXPAND_ONLY_PREDEF to YES.
-# The default value is: NO.
-# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
-
-MACRO_EXPANSION        = NO
-
-# If the EXPAND_ONLY_PREDEF and MACRO_EXPANSION tags are both set to YES then
-# the macro expansion is limited to the macros specified with the PREDEFINED and
-# EXPAND_AS_DEFINED tags.
-# The default value is: NO.
-# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
-
-EXPAND_ONLY_PREDEF     = NO
-
-# If the SEARCH_INCLUDES tag is set to YES, the include files in the
-# INCLUDE_PATH will be searched if a #include is found.
-# The default value is: YES.
-# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
-
-SEARCH_INCLUDES        = YES
-
-# The INCLUDE_PATH tag can be used to specify one or more directories that
-# contain include files that are not input files but should be processed by the
-# preprocessor.
-# This tag requires that the tag SEARCH_INCLUDES is set to YES.
-
-INCLUDE_PATH           =
-
-# You can use the INCLUDE_FILE_PATTERNS tag to specify one or more wildcard
-# patterns (like *.h and *.hpp) to filter out the header-files in the
-# directories. If left blank, the patterns specified with FILE_PATTERNS will be
-# used.
-# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
-
-INCLUDE_FILE_PATTERNS  =
-
-# The PREDEFINED tag can be used to specify one or more macro names that are
-# defined before the preprocessor is started (similar to the -D option of e.g.
-# gcc). The argument of the tag is a list of macros of the form: name or
-# name=definition (no spaces). If the definition and the "=" are omitted, "=1"
-# is assumed. To prevent a macro definition from being undefined via #undef or
-# recursively expanded use the := operator instead of the = operator.
-# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
-
-PREDEFINED             =
-
-# If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then this
-# tag can be used to specify a list of macro names that should be expanded. The
-# macro definition that is found in the sources will be used. Use the PREDEFINED
-# tag if you want to use a different macro definition that overrules the
-# definition found in the source code.
-# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
-
-EXPAND_AS_DEFINED      =
-
-# If the SKIP_FUNCTION_MACROS tag is set to YES then doxygen's preprocessor will
-# remove all references to function-like macros that are alone on a line, have
-# an all uppercase name, and do not end with a semicolon. Such function macros
-# are typically used for boiler-plate code, and will confuse the parser if not
-# removed.
-# The default value is: YES.
-# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
-
-SKIP_FUNCTION_MACROS   = YES
-
-#---------------------------------------------------------------------------
-# Configuration options related to external references
-#---------------------------------------------------------------------------
-
-# The TAGFILES tag can be used to specify one or more tag files. For each tag
-# file the location of the external documentation should be added. The format of
-# a tag file without this location is as follows:
-# TAGFILES = file1 file2 ...
-# Adding location for the tag files is done as follows:
-# TAGFILES = file1=loc1 "file2 = loc2" ...
-# where loc1 and loc2 can be relative or absolute paths or URLs. See the
-# section "Linking to external documentation" for more information about the use
-# of tag files.
-# Note: Each tag file must have a unique name (where the name does NOT include
-# the path). If a tag file is not located in the directory in which doxygen is
-# run, you must also specify the path to the tagfile here.
-
-TAGFILES               =
-
-# When a file name is specified after GENERATE_TAGFILE, doxygen will create a
-# tag file that is based on the input files it reads. See section "Linking to
-# external documentation" for more information about the usage of tag files.
-
-GENERATE_TAGFILE       =
-
-# If the ALLEXTERNALS tag is set to YES, all external classes will be listed in
-# the class index. If set to NO, only the inherited external classes will be
-# listed.
-# The default value is: NO.
-
-ALLEXTERNALS           = NO
-
-# If the EXTERNAL_GROUPS tag is set to YES, all external groups will be listed
-# in the modules index. If set to NO, only the current project's groups will be
-# listed.
-# The default value is: YES.
-
-EXTERNAL_GROUPS        = YES
-
-# If the EXTERNAL_PAGES tag is set to YES, all external pages will be listed in
-# the related pages index. If set to NO, only the current project's pages will
-# be listed.
-# The default value is: YES.
-
-EXTERNAL_PAGES         = YES
-
-# The PERL_PATH should be the absolute path and name of the perl script
-# interpreter (i.e. the result of 'which perl').
-# The default file (with absolute path) is: /usr/bin/perl.
-
-PERL_PATH              = /usr/bin/perl
-
-#---------------------------------------------------------------------------
-# Configuration options related to the dot tool
-#---------------------------------------------------------------------------
-
-# If the CLASS_DIAGRAMS tag is set to YES, doxygen will generate a class diagram
-# (in HTML and LaTeX) for classes with base or super classes. Setting the tag to
-# NO turns the diagrams off. Note that this option also works with HAVE_DOT
-# disabled, but it is recommended to install and use dot, since it yields more
-# powerful graphs.
-# The default value is: YES.
-
-CLASS_DIAGRAMS         = YES
-
-# You can define message sequence charts within doxygen comments using the \msc
-# command. Doxygen will then run the mscgen tool (see:
-# http://www.mcternan.me.uk/mscgen/) to produce the chart and insert it in the
-# documentation. The MSCGEN_PATH tag allows you to specify the directory where
-# the mscgen tool resides. If left empty the tool is assumed to be found in the
-# default search path.
-
-MSCGEN_PATH            =
-
-# You can include diagrams made with dia in doxygen documentation. Doxygen will
-# then run dia to produce the diagram and insert it in the documentation. The
-# DIA_PATH tag allows you to specify the directory where the dia binary resides.
-# If left empty dia is assumed to be found in the default search path.
-
-DIA_PATH               =
-
-# If set to YES the inheritance and collaboration graphs will hide inheritance
-# and usage relations if the target is undocumented or is not a class.
-# The default value is: YES.
-
-HIDE_UNDOC_RELATIONS   = YES
-
-# If you set the HAVE_DOT tag to YES then doxygen will assume the dot tool is
-# available from the path. This tool is part of Graphviz (see:
-# http://www.graphviz.org/), a graph visualization toolkit from AT&T and Lucent
-# Bell Labs. The other options in this section have no effect if this option is
-# set to NO
-# The default value is: YES.
-
-HAVE_DOT               = YES
-
-# The DOT_NUM_THREADS specifies the number of dot invocations doxygen is allowed
-# to run in parallel. When set to 0 doxygen will base this on the number of
-# processors available in the system. You can set it explicitly to a value
-# larger than 0 to get control over the balance between CPU load and processing
-# speed.
-# Minimum value: 0, maximum value: 32, default value: 0.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-DOT_NUM_THREADS        = 0
-
-# When you want a differently looking font in the dot files that doxygen
-# generates you can specify the font name using DOT_FONTNAME. You need to make
-# sure dot is able to find the font, which can be done by putting it in a
-# standard location or by setting the DOTFONTPATH environment variable or by
-# setting DOT_FONTPATH to the directory containing the font.
-# The default value is: Helvetica.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-DOT_FONTNAME           = Helvetica
-
-# The DOT_FONTSIZE tag can be used to set the size (in points) of the font of
-# dot graphs.
-# Minimum value: 4, maximum value: 24, default value: 10.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-DOT_FONTSIZE           = 10
-
-# By default doxygen will tell dot to use the default font as specified with
-# DOT_FONTNAME. If you specify a different font using DOT_FONTNAME you can set
-# the path where dot can find it using this tag.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-DOT_FONTPATH           =
-
-# If the CLASS_GRAPH tag is set to YES then doxygen will generate a graph for
-# each documented class showing the direct and indirect inheritance relations.
-# Setting this tag to YES will force the CLASS_DIAGRAMS tag to NO.
-# The default value is: YES.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-CLASS_GRAPH            = YES
-
-# If the COLLABORATION_GRAPH tag is set to YES then doxygen will generate a
-# graph for each documented class showing the direct and indirect implementation
-# dependencies (inheritance, containment, and class references variables) of the
-# class with other documented classes.
-# The default value is: YES.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-COLLABORATION_GRAPH    = YES
-
-# If the GROUP_GRAPHS tag is set to YES then doxygen will generate a graph for
-# groups, showing the direct groups dependencies.
-# The default value is: YES.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-GROUP_GRAPHS           = YES
-
-# If the UML_LOOK tag is set to YES, doxygen will generate inheritance and
-# collaboration diagrams in a style similar to the OMG's Unified Modeling
-# Language.
-# The default value is: NO.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-UML_LOOK               = NO
-
-# If the UML_LOOK tag is enabled, the fields and methods are shown inside the
-# class node. If there are many fields or methods and many nodes the graph may
-# become too big to be useful. The UML_LIMIT_NUM_FIELDS threshold limits the
-# number of items for each type to make the size more manageable. Set this to 0
-# for no limit. Note that the threshold may be exceeded by 50% before the limit
-# is enforced. So when you set the threshold to 10, up to 15 fields may appear,
-# but if the number exceeds 15, the total amount of fields shown is limited to
-# 10.
-# Minimum value: 0, maximum value: 100, default value: 10.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-UML_LIMIT_NUM_FIELDS   = 10
-
-# If the TEMPLATE_RELATIONS tag is set to YES then the inheritance and
-# collaboration graphs will show the relations between templates and their
-# instances.
-# The default value is: NO.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-TEMPLATE_RELATIONS     = NO
-
-# If the INCLUDE_GRAPH, ENABLE_PREPROCESSING and SEARCH_INCLUDES tags are set to
-# YES then doxygen will generate a graph for each documented file showing the
-# direct and indirect include dependencies of the file with other documented
-# files.
-# The default value is: YES.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-INCLUDE_GRAPH          = YES
-
-# If the INCLUDED_BY_GRAPH, ENABLE_PREPROCESSING and SEARCH_INCLUDES tags are
-# set to YES then doxygen will generate a graph for each documented file showing
-# the direct and indirect include dependencies of the file with other documented
-# files.
-# The default value is: YES.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-INCLUDED_BY_GRAPH      = YES
-
-# If the CALL_GRAPH tag is set to YES then doxygen will generate a call
-# dependency graph for every global function or class method.
-#
-# Note that enabling this option will significantly increase the time of a run.
-# So in most cases it will be better to enable call graphs for selected
-# functions only using the \callgraph command. Disabling a call graph can be
-# accomplished by means of the command \hidecallgraph.
-# The default value is: NO.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-CALL_GRAPH             = NO
-
-# If the CALLER_GRAPH tag is set to YES then doxygen will generate a caller
-# dependency graph for every global function or class method.
-#
-# Note that enabling this option will significantly increase the time of a run.
-# So in most cases it will be better to enable caller graphs for selected
-# functions only using the \callergraph command. Disabling a caller graph can be
-# accomplished by means of the command \hidecallergraph.
-# The default value is: NO.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-CALLER_GRAPH           = NO
-
-# If the GRAPHICAL_HIERARCHY tag is set to YES then doxygen will generate a
-# graphical hierarchy of all classes instead of a textual one.
-# The default value is: YES.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-GRAPHICAL_HIERARCHY    = YES
-
-# If the DIRECTORY_GRAPH tag is set to YES then doxygen will show the
-# dependencies a directory has on other directories in a graphical way. The
-# dependency relations are determined by the #include relations between the
-# files in the directories.
-# The default value is: YES.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-DIRECTORY_GRAPH        = YES
-
-# The DOT_IMAGE_FORMAT tag can be used to set the image format of the images
-# generated by dot. For an explanation of the image formats see the section
-# output formats in the documentation of the dot tool (Graphviz (see:
-# http://www.graphviz.org/)).
-# Note: If you choose svg you need to set HTML_FILE_EXTENSION to xhtml in order
-# to make the SVG files visible in IE 9+ (other browsers do not have this
-# requirement).
-# Possible values are: png, png:cairo, png:cairo:cairo, png:cairo:gd, png:gd,
-# png:gd:gd, jpg, jpg:cairo, jpg:cairo:gd, jpg:gd, jpg:gd:gd, gif, gif:cairo,
-# gif:cairo:gd, gif:gd, gif:gd:gd, svg, png:gd, png:gd:gd, png:cairo,
-# png:cairo:gd, png:cairo:cairo, png:cairo:gdiplus, png:gdiplus and
-# png:gdiplus:gdiplus.
-# The default value is: png.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-DOT_IMAGE_FORMAT       = png
-
-# If DOT_IMAGE_FORMAT is set to svg, then this option can be set to YES to
-# enable generation of interactive SVG images that allow zooming and panning.
-#
-# Note that this requires a modern browser other than Internet Explorer. Tested
-# and working are Firefox, Chrome, Safari, and Opera.
-# Note: For IE 9+ you need to set HTML_FILE_EXTENSION to xhtml in order to make
-# the SVG files visible. Older versions of IE do not have SVG support.
-# The default value is: NO.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-INTERACTIVE_SVG        = NO
-
-# The DOT_PATH tag can be used to specify the path where the dot tool can be
-# found. If left blank, it is assumed the dot tool can be found in the path.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-DOT_PATH               =
-
-# The DOTFILE_DIRS tag can be used to specify one or more directories that
-# contain dot files that are included in the documentation (see the \dotfile
-# command).
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-DOTFILE_DIRS           =
-
-# The MSCFILE_DIRS tag can be used to specify one or more directories that
-# contain msc files that are included in the documentation (see the \mscfile
-# command).
-
-MSCFILE_DIRS           =
-
-# The DIAFILE_DIRS tag can be used to specify one or more directories that
-# contain dia files that are included in the documentation (see the \diafile
-# command).
-
-DIAFILE_DIRS           =
-
-# When using plantuml, the PLANTUML_JAR_PATH tag should be used to specify the
-# path where java can find the plantuml.jar file. If left blank, it is assumed
-# PlantUML is not used or called during a preprocessing step. Doxygen will
-# generate a warning when it encounters a \startuml command in this case and
-# will not generate output for the diagram.
-
-PLANTUML_JAR_PATH      =
-
-# When using plantuml, the specified paths are searched for files specified by
-# the !include statement in a plantuml block.
-
-PLANTUML_INCLUDE_PATH  =
-
-# The DOT_GRAPH_MAX_NODES tag can be used to set the maximum number of nodes
-# that will be shown in the graph. If the number of nodes in a graph becomes
-# larger than this value, doxygen will truncate the graph, which is visualized
-# by representing a node as a red box. Note that if the number of direct
-# children of the root node in a graph is already larger than
-# DOT_GRAPH_MAX_NODES then the graph will not be shown at all. Also note that
-# the size of a graph can be further restricted by MAX_DOT_GRAPH_DEPTH.
-# Minimum value: 0, maximum value: 10000, default value: 50.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-DOT_GRAPH_MAX_NODES    = 50
-
-# The MAX_DOT_GRAPH_DEPTH tag can be used to set the maximum depth of the graphs
-# generated by dot. A depth value of 3 means that only nodes reachable from the
-# root by following a path via at most 3 edges will be shown. Nodes that lay
-# further from the root node will be omitted. Note that setting this option to 1
-# or 2 may greatly reduce the computation time needed for large code bases. Also
-# note that the size of a graph can be further restricted by
-# DOT_GRAPH_MAX_NODES. Using a depth of 0 means no depth restriction.
-# Minimum value: 0, maximum value: 1000, default value: 0.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-MAX_DOT_GRAPH_DEPTH    = 0
-
-# Set the DOT_TRANSPARENT tag to YES to generate images with a transparent
-# background. This is disabled by default, because dot on Windows does not seem
-# to support this out of the box.
-#
-# Warning: Depending on the platform used, enabling this option may lead to
-# badly anti-aliased labels on the edges of a graph (i.e. they become hard to
-# read).
-# The default value is: NO.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-DOT_TRANSPARENT        = NO
-
-# Set the DOT_MULTI_TARGETS tag to YES to allow dot to generate multiple output
-# files in one run (i.e. multiple -o and -T options on the command line). This
-# makes dot run faster, but since only newer versions of dot (>1.8.10) support
-# this, this feature is disabled by default.
-# The default value is: NO.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-DOT_MULTI_TARGETS      = NO
-
-# If the GENERATE_LEGEND tag is set to YES doxygen will generate a legend page
-# explaining the meaning of the various boxes and arrows in the dot generated
-# graphs.
-# The default value is: YES.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-GENERATE_LEGEND        = YES
-
-# If the DOT_CLEANUP tag is set to YES, doxygen will remove the intermediate dot
-# files that are used to generate the various graphs.
-# The default value is: YES.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-DOT_CLEANUP            = YES
diff --git a/tools/profiler/Makefile b/tools/profiler/Makefile
deleted file mode 100644
index e74f4e1a0..000000000
--- a/tools/profiler/Makefile
+++ /dev/null
@@ -1,23 +0,0 @@
-.PHONY: all clean gui backend run docs
-
-all: backend gui
-
-backend:
-	$(MAKE) -C backend
-
-gui: backend
-	cd gui && qmake && cd ..
-	$(MAKE) -C gui
-	rm -f gui/uic_wrapper.sh
-
-
-clean:
-	$(MAKE) -C backend clean
-	if [ -e gui/Makefile]; then $(MAKE) -C gui clean; fi
-	rm -rf docs
-
-run: all
-	bash ./run_profiler.sh $(folder)
-
-docs:
-	doxygen Doxyfile
diff --git a/tools/profiler/README.md b/tools/profiler/README.md
deleted file mode 100644
index a869e15a2..000000000
--- a/tools/profiler/README.md
+++ /dev/null
@@ -1,45 +0,0 @@
-## Requirements
- - GAP SDK : see https://github.com/GreenWaves-Technologies/gap_sdk
- - Qt >= 5.13: see https://doc.qt.io/qt-5/gettingstarted.html
-
-### Requirements on Ubuntu 18
-```
-sudo apt install qt5-default libqt5charts5-dev
-```
-
-## How to build
-Simply run `make`
-
-## How to run
-To  run the profiler, you first need to run the profiler/init.sh script. <br>
-Then, you need to go to your example directory ($HOME/gap-sdk/autotiler/Cifar10 for example) and run the following command:
-```
-$ make all profiler platform=gvsoc
-```
-The profiler window opens and you can first see a View Menu in the toolbar. This menu allows the user to select the different Dock Windows he/she wishes to display. Except the Timeline window, which is fixed, all the dock windows can be moved by clicking the mouse on their Title and placed around the timeline window. 
-
-There are  also four buttons in the tool bar to control the Gvsoc process: 
- - Open : starts the Gvsoc process 
- - Close : closes the Gvsoc Process
- - Run : runs the Gvsoc process on your example
- - Pause : pauses the Gvsoc process. It can be restarted by pushing on the "Run" button again
-  
-**Make sure:**
- - your Makefile compiles your code with the -g option (debugging symbols), in order for the profiler to be able to provide as much information as possible.
- - you sourced the appropriate files that are needed to build your project (as the gap_sdk/sourceme.sh)
-
-## Documentation
-You can access to HTML documentation with the following commands:
-```
-$ make docs
-$ <your favourite internet browser> docs/html/index.html &
-```
-
-## Known problems
-Have a look at the gitlab issue page to get an up-to-date list of known problems.
-Please report all the problems you find (or the improvements you'd like to see) there.
-
-## Implementation details
-Roughly speaking, this program collects traces from gvsoc, extracts performance relevant data from those, and displays it to the user.
- - The *backend* part is responsible for the communication with gvsoc (using a fifo) and for the extraction of relevant data. It is compiled as a static library.
- - The *frontend* part queries the backend, and is responsible for the display.
diff --git a/tools/profiler/backend/Makefile b/tools/profiler/backend/Makefile
deleted file mode 100644
index 7e42335d8..000000000
--- a/tools/profiler/backend/Makefile
+++ /dev/null
@@ -1,36 +0,0 @@
-BUILD_DIR =build
-TARGET_LIB = $(BUILD_DIR)/libprofiling.a
-SRC_DIR =src
-OBJ_DIR =$(BUILD_DIR)/obj
-DEP_DIR =$(BUILD_DIR)/dep
-INCLUDE_DIR = include $(DUMPER_DIR)
-
-CXX = g++
-#CXXFLAGS = -fPIC -Wall -Wextra -Werror -O2 -DDEBUG -g --std=c++14
-CXXFLAGS = -g -fPIC -Wall -Wextra -Werror --std=c++14
-CXXFLAGS += $(foreach dir, $(INCLUDE_DIR), -I $(dir))
-DEPFLAGS = -MT $@ -MMD -MP -MF $(DEP_DIR)/$*.d
-RM = rm -f
-
-OBJS = $(patsubst $(SRC_DIR)/%.cpp, $(OBJ_DIR)/%.o, $(wildcard $(SRC_DIR)/*.cpp))
-DEPS = $(OBJS:$(OBJ_DIR)/%.o=$(DEP_DIR)/%.d)
-
-.PHONY: all clean
-
-all: $(BUILD_DIR) $(OBJ_DIR) $(DEP_DIR) $(TARGET_LIB)
-
-$(BUILD_DIR) $(OBJ_DIR) $(DEP_DIR): ; @mkdir -p $@
-
-$(TARGET_LIB): $(OBJS)
-	ar rvs $@ $^
-
-$(OBJ_DIR)/%.o : $(SRC_DIR)/%.cpp $(DEP_DIR)/%.d | $(DEP_DIR)
-	$(CXX) $(DEPFLAGS) $(CXXFLAGS) -o $@ -c $<
-
-clean:
-	-${RM} ${TARGET_LIB} ${OBJS} $(DEPS)
-	-$(RM) -r $(BUILD_DIR)
-
-$(DEPS):
-
--include $(wildcard $(DEPS))
diff --git a/tools/profiler/backend/include/backend_interface.hpp b/tools/profiler/backend/include/backend_interface.hpp
deleted file mode 100644
index 4d926d744..000000000
--- a/tools/profiler/backend/include/backend_interface.hpp
+++ /dev/null
@@ -1,72 +0,0 @@
-/*
- * Copyright (C) 2020  GreenWaves Technologies, SAS
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU Affero General Public License as
- * published by the Free Software Foundation, either version 3 of the
- * License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU Affero General Public License for more details.
- *
- * You should have received a copy of the GNU Affero General Public License
- * along with this program.  If not, see <https://www.gnu.org/licenses/>.
- */
-
-#ifndef BACKEND_INTERFACE_HPP
-#define BACKEND_INTERFACE_HPP
-
-#include <string>
-#include <vector>
-#include <deque>
-#include <cstdint>
-
-#include "datamanager.hpp"
-
-/* when the GUI reads data from the backend, the same data might be written at
- * the same time by an other thread if new data comes from gvsoc. Therefore, you
- * must use the following mutex based functions when reading non thread safe
- * data. For example, it makes sure that a vector is not read while a new element
- * is pushed back (push_back might cause a realloc, so such a scenerio is
- * definitively not thread safe).
-*/
-void lock_data_mtx();
-void unlock_data_mtx();
-
-
-Function_table get_function_table();
-const char* function_at(int core_id, uint64_t timestamp);
-
-const std::vector<TLData<const char*>>& get_timeline_data();
-const std::vector<int> & get_timeline_id();
-uint get_nPe();
-
-std::string get_source_code(std::string function_name);
-std::string get_asm_code(std::string function_name);
-std::string file_of(std::string function_name);
-int function2line_number(std::string function_name);
-
-
-bool tldata_has_changed();
-void ack_tld_changes();
-
-uint64_t get_max_time();
-const Function_stat* get_function_stat(std::string function_name);
-const TLData<g_compressed_data_t>* event_timestamps(int id);
-std::string get_trace_txt_list();
-
-bool add_trace_to_timeline(std::string path);
-void remove_trace_from_timeline(int index);
-const std::vector<int>& id_to_display();
-
-uint64_t get_cluster_period();
-uint64_t get_fc_period();
-
-std::vector<uint64_t> get_cycles_per_pe();
-bool datamanager_done();
-
-int getSignalIdFromBackend(std::string path);
-
-#endif //BACKEND_INTERFACE_HPP
diff --git a/tools/profiler/backend/include/datamanager.hpp b/tools/profiler/backend/include/datamanager.hpp
deleted file mode 100644
index f5cd9ab49..000000000
--- a/tools/profiler/backend/include/datamanager.hpp
+++ /dev/null
@@ -1,176 +0,0 @@
-/*
- * Copyright (C) 2020  GreenWaves Technologies, SAS
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU Affero General Public License as
- * published by the Free Software Foundation, either version 3 of the
- * License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU Affero General Public License for more details.
- *
- * You should have received a copy of the GNU Affero General Public License
- * along with this program.  If not, see <https://www.gnu.org/licenses/>.
- */
-
- #ifndef DATAMANAGER_HPP
-#define DATAMANAGER_HPP
-
-#include <string>
-#include <thread>
-
-#include <unordered_map>
-#include <map>
-#include <vector>
-#include <deque>
-#include <mutex>
-
-#include "profiler_server.hpp"
-#include "pc2source.hpp"
-#include "gdb_interface.hpp"
-#include "tldata.hpp"
-
-/**
-  @brief basic structure to record one execution of a given function
-*/
-typedef struct {
-  const char* function_name;
-  uint64_t t_start; /**< when the function is called */
-  uint64_t t_end;   /**< when the function returns */
-} Function_exec;
-
-/**
-  @brief statistics on stall reasons
-*/
-typedef struct {
-  uint64_t lost_cycles[N_STALL_REASONS] = {0};
-} Stall_stat;
-
-/**
-  @brief execution statistics for a function
-*/
-class Function_stat {
-public:
-  std::string name;
-  uint n_calls;
-  uint64_t tot_time;
-  uint64_t tot_cycle;
-  std::string file;
-  Stall_stat stall_info;
-
-  bool operator< (const Function_stat& f){
-    return this->tot_time > f.tot_time;
-  }
-};
-
-
-typedef std::vector<Function_stat> Function_table;
-
-typedef std::map<std::string, Function_stat> Function_mapping;
-typedef std::unordered_map<std::string, std::string> File_mapping;
-
-/**
-  @brief class to manage profiling data collected by a profiler_server
-*/
-class Data_manager {
-public:
-  const static int TLHeight;
-  const static int fc_idx;
-  const static int dma_idx;
-
-  Data_manager();
-  ~Data_manager();
-  Data_manager(std::string path_to_fifo, std::string path_to_elf, uint64_t f_max);
-  bool start_listening();
-  std::string gdb_request(std::string request);
-  std::string gdb_read();
-  const TLData<g_compressed_data_t>* get_event_timestamps(int id) const;
-  std::string get_trace_txt_list() const;
-  bool trace_registration_done() const;
-  bool add_trace_to_timeline(std::string path);
-  void remove_trace_from_timeline(int absolute_index);
-  uint64_t get_cluster_period() const {return cluster_period;}
-  uint64_t get_fc_period() const {return fc_period;}
-  const uint64_t* get_cycles_per_core() const {return cycles_per_core; }
-  bool is_done() const {return done;}
-  int getSignalId(std::string path);
-
-  std::thread listening_thd;
-  bool stop = false;
-  bool tld_has_changed;
-  std::vector<TLData<const char*> > signal_list;
-  std::vector<int> signal_id;
-  Function_mapping fctm;
-  File_mapping file_m;
-  std::vector<bool> isStateTraceActiv;
-  std::vector<bool> isPcTraceActiv;
-  std::string elf_file;
-  Gdb_interface* gdb;
-  std::vector<int> displayed_id;
-
-  std::mutex data_mutex;
-
-  // For issuing statistics
-  // For storing signal rising edge detected
-  std::unordered_map<uint32_t, bool> risingEdgeDetected;
-  // for storing signals rising edge timestamp
-  std::unordered_map<uint32_t, int> risingEdgeTime;  
-  // for storing cumulative duty time of a signal
-  std::unordered_map<uint32_t, int> signalDutyTime;  
-
-private:
-
-  bool isRisingEdge(const trace_packet &packet);
-  bool isFallingEdge(const trace_packet &packet);
-  void fillUpStats(const trace_packet &packet);
-
-  // Following functions are used to merge state & pc traces on the state trace
-  // ! Only state traces record their trace_id
-  void manageStateTraceStarts(const trace_packet& packet, 
-                            int core_id,
-                            int trace_id);
-  void manageStateTraceStops( const trace_packet& packet, 
-                            int core_id,
-                            int trace_id);
-  void managePcTraceStops(const trace_packet& packet, 
-                        int core_id);
-  void managePcTraceStarts( const trace_packet& packet, 
-                          const Pc_info& pi,
-                          int core_id);      
-                                                                                         
-  void set_end_in_tl(int trace_id, uint64_t timestamp, int i);
-  void listen();
-  void update_data(const trace_packet& packet, const Pc_info& pi, int core_id, int trace_id);
-  void update_dma(const trace_packet& packet, bool active, int trace_id);
-  void update_stall_stats(const stall_event_t& stall);
-  void init_tldata();
-  void begin_stall(stall_trace_t trace, uint64_t timestamp);
-  void end_stall(stall_trace_t trace, uint64_t timestamp);
-  void begin_function_call(uint idx, uint64_t timestamp, const char* s, int trace_id);
-  void end_function_call(uint idx, uint64_t timestamp, int trace_id);
-
-  bool is_buffer_empty(uint idx);
-  void clear_buffer_slot(uint idx);
-  bool traceFound(int trace_id);
-  
-
-  Profiler_server* server = nullptr;
-  Pc_mapping pcm;
-  std::vector<uint64_t> stall_timestamp;
-  std::vector<uint32_t> current_pc;
-  std::vector<Function_exec> insertion_buffer;
-  uint64_t fc_period;
-  uint cluster_period;
-  uint64_t* cycles_per_core = nullptr;;
-  bool done = false;
-
-};
-
-void stop_server();
-bool init_backend(std::string path_to_fifo, std::string path_to_elf,
-                  uint64_t f_max);
-int getSignalId(std::string path);
-
-#endif //DATAMANAGER_HPP
diff --git a/tools/profiler/backend/include/gdb_interface.hpp b/tools/profiler/backend/include/gdb_interface.hpp
deleted file mode 100644
index f0b0cc006..000000000
--- a/tools/profiler/backend/include/gdb_interface.hpp
+++ /dev/null
@@ -1,38 +0,0 @@
-/*
- * Copyright (C) 2020  GreenWaves Technologies, SAS
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU Affero General Public License as
- * published by the Free Software Foundation, either version 3 of the
- * License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU Affero General Public License for more details.
- *
- * You should have received a copy of the GNU Affero General Public License
- * along with this program.  If not, see <https://www.gnu.org/licenses/>.
- */
-
-
-#ifndef GDB_INTERFACE_HPP
-#define GDB_INTERFACE_HPP
-
-class Gdb_interface{
-public:
-  Gdb_interface(std::string elf_file);
-  ~Gdb_interface();
-  std::string gdb_request(std::string request);
-
-private:
-  std::string gdb_read();
-  void setup_gdb_child(std::string elf_file);
-
-  int pipe_to_gdb;
-  int pipe_from_gdb;
-  pid_t gdb_process;
-
-};
-
-#endif  //GDB_INTERFACE_HPP
diff --git a/tools/profiler/backend/include/pc2source.hpp b/tools/profiler/backend/include/pc2source.hpp
deleted file mode 100644
index e451772db..000000000
--- a/tools/profiler/backend/include/pc2source.hpp
+++ /dev/null
@@ -1,49 +0,0 @@
-/*
- * Copyright (C) 2020  GreenWaves Technologies, SAS
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU Affero General Public License as
- * published by the Free Software Foundation, either version 3 of the
- * License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU Affero General Public License for more details.
- *
- * You should have received a copy of the GNU Affero General Public License
- * along with this program.  If not, see <https://www.gnu.org/licenses/>.
- */
-
-
-#ifndef PC2SOURCE_HPP
-#define PC2SOURCE_HPP
-
-#include <unordered_map>
-#include <string>
-#include <fstream>
-#include <sstream>
-
-/**
-  @brief struct to store information related to a given pc value (i.e. an
-         assembly instr)
-*/
-typedef struct {
-  int line;
-  const char* func;
-  const char* inline_func;
-  const char* file;
-  uint64_t tot_time;
-  uint64_t count;
-  /* add assembly ? */
-} Pc_info;
-
-
-/** @brief maps each pc value to the associated pieces of information */
-typedef std::unordered_map<uint32_t, Pc_info> Pc_mapping;
-
-
-void parse_elf(std::string elf_file, Pc_mapping& pcm);
-std::string read_file(std::string filename);
-
-#endif // PC2SOURCE_HPP
diff --git a/tools/profiler/backend/include/profiler_server.hpp b/tools/profiler/backend/include/profiler_server.hpp
deleted file mode 100644
index adc530be6..000000000
--- a/tools/profiler/backend/include/profiler_server.hpp
+++ /dev/null
@@ -1,211 +0,0 @@
-/*
- * Copyright (C) 2020  GreenWaves Technologies, SAS
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU Affero General Public License as
- * published by the Free Software Foundation, either version 3 of the
- * License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU Affero General Public License for more details.
- *
- * You should have received a copy of the GNU Affero General Public License
- * along with this program.  If not, see <https://www.gnu.org/licenses/>.
- */
-
-
-#ifndef PROFILER_SERVER_HPP
-#define PROFILER_SERVER_HPP
-
-
-#include "trace_dumper.hpp"
-#include "trace_dumper_profiling.hpp"
-#include "tldata.hpp"
-
-/* define to active runtime checks when receiving stall information */
-//#define STALL_WARNING
-
-#define MAX_CORE_ID 1000
-
-/**
-    @brief type to store an event of a trace of any type.
-
-    If sizeof(compressed_data_t) is large, it increases memory usage significantly
-    If sizeof(compressed_data_t) is small (as here), some trace cannot be stored
-*/
-typedef uint32_t generic_data_t;
-typedef compressed_data_t<generic_data_t> g_compressed_data_t;
-typedef decompressed_data_t<generic_data_t> g_decompressed_data_t;
-
-/**
-  @brief structure to mark the beginning of an event of any type
-*/
-typedef struct {
-  Data_with_time<g_compressed_data_t> compr_data;
-  uint32_t average_threshold;
-} Event_record;
-
-/**
-  @brief  an extension of trace_dumper_server with advanced packet analysis
-          methods.
- */
-class Profiler_server : public trace_dumper_server
-{
-public:
-  /**
-    creates a new Profiler_server
-    @param file: the file to read packets from
-    @param f_max: maximum number of packets stored per second per signal.
-            Beyond this threshold, there are 2 possibilities:
-              - a lossless compression method (e.g. RLE) can be used to
-                store all packets without storing more than the equivalent of
-                f_max packets per second
-              - otherwise, a lossy compression is used to maintain the storing
-                rate at f_max packets per second
-    @param tolerance: tolerated relative difference between measured frequency
-           of a trace and the ratio f_max / average_threshold. Don't change it
-           if you are not familiar with the average compression method we use
-  */
-  Profiler_server(std::string file, uint64_t f_max, double tolerance=1.3);
-  virtual ~Profiler_server();
-
-  int get_packet(trace_packet* packet) override;
-
-
-  /**
-    returns true and sets core_id and pc_value if packet contains a pc for core
-    core_id. Returns false and does nothing otherwise
-  */
-  bool contains_pc(const trace_packet& packet, int& core_id, uint32_t& pc_value);
-
-  /** same as contains_pc but for a state (active / inactive) */
-  bool contains_state(const trace_packet& packet, int& core_id, bool& active_value);
-
-  /** same as contains_pc but for dma activity */
-  bool contains_dma(const trace_packet& packet, bool& active_value);
-
-  /**
-      if packet contains a rising edge of a stall signal, sets trace accordingly
-      and returns true.
-      Otherwise, does nothing and returns false.
-      If STALL_WARNING is defined, some sanity checks are performed (eg, a core
-      cannot receive a stall rising edge if it is already stalled)
-    */
-  bool contains_begin_stall(const trace_packet& packet, stall_trace_t& trace);
-
-  /** same as contains_end_stall but for a negative edge */
-  bool contains_end_stall(const trace_packet& packet, stall_trace_t& trace);
-
-  /** if packet contains the fc period, sets period and returns true. */
-  bool contains_fc_period(const trace_packet& packet, uint64_t& period);
-
-  /** same as contains_fc_period, butfor the cluster */
-  bool contains_cluster_period(const trace_packet& packet, uint64_t& period);
-
-  /** returns true if the packet indicates that the cluster begins a new cycle */
-  bool contains_cluster_cycle(const trace_packet& packet);
-
-  /**
-      @return a text description of all registered traces.
-              traces whose prefixes match are grouped in a hierarchical manner.
-              (print one output of these function to understand)
-  */
-  std::string get_trace_txt_list();
-
-  /** @returns true iff all traces have been registered in the server (does not
-              mean that all data for all traces has arrived) */
-  bool trace_registration_done() const { return registration_done; };
-
-  // Not implemented
-  int id_from_line_number(int line_number);
-
-  // events Buffer
-  std::unordered_map<uint32_t, TLData<g_compressed_data_t>> events;
-
-  std::unordered_map<std::string, int> path_mapping;
-  std::unordered_map< int, std::string> id_mapping;
-
-  std::unordered_map<uint32_t, uint64_t> event_count;
-
-  int getSignalId(std::string path);
-
-private:
-  void path_analysis(const char* path, const trace_packet* packet);
-	bool contains_stall(const trace_packet& packet, stall_trace_t& trace);
-  void add_event(const trace_packet* packet);
-  uint64_t data_2_uint64(const trace_packet& packet);
-  double data_2_double(const trace_packet& packet);
-
-  /**
-    @return true iff it's possible to store e1 and e2 using a rle compression
-            In other words, returns true if e1 and e2 describe the same kind
-            of event at a different timestamp. The end of e1
-            must match the beginning of e2 for the compression to work.
-            Warning: match_rle(e1, e2) != math_rle(e2, e1)
-  */
-  bool match_rle(const Data_with_time<g_compressed_data_t>& e1,
-                 const Data_with_time<g_compressed_data_t>& e2);
-
-  /**
-    adds e to the rle compression buffer, in order for that event to be rle
-    compressed before being sent to its TLData structure.
-  */
-  int flush_to_rle_buffer(uint32_t trace_id, const Event_record& e);
-
-  /**
-    instead of compressing events to their TLData, we can use this function
-    just to add events to the TLData
-  */
-  void just_add(uint32_t trace_id, uint64_t begin,
-                uint64_t end, generic_data_t data);
-
-  /**
-    instead of adding events directly to their TLData, we can use this function
-    to compress events before being added
-  */
-  void compress_and_add(uint32_t trace_id, uint64_t begin, uint64_t end,
-                        generic_data_t data);
-
-
-  void update_avg_threshold(uint32_t trace_id);
-
-  /* is_pc[id] = i 	if trace id tracks pc of core i
-               = -1 	otherwise
-     is_state: idem but for the activity of a core */
- std::unordered_map<uint32_t, int> is_pc;
- std::unordered_map<uint32_t, int> is_state;
- std::unordered_map<uint32_t, uint32_t> is_dma;
- std::unordered_map<uint32_t, stall_trace_t> stall_mapping;
- uint32_t cluster_period_tr_id;
- uint32_t cluster_cycle_id;
- uint32_t fc_period_tr_id;
- uint32_t fc_cycle_id;
- regex_t dma_regex;
- regex_t stall_regex;
-
- /*
-    when an event comes from gvsoc, it follows the following path:
-    gvsoc => event_buffer => average_buffer => rle_buffer => events[trace_id]
-    average_buffer and rle_buffer are used to compress data before storing it
- */
- std::unordered_map<uint32_t, Data_with_time<generic_data_t> > event_buffer;
- std::unordered_map<uint32_t, Event_record> average_buffer;
- std::unordered_map<uint32_t, Event_record> rle_buffer;
- std::unordered_map<uint32_t, std::deque<uint64_t> > entry_timestamps;
- double f_max;
- double tolerance;
-
- volatile bool registration_done = false;
- bool core_stalled[MAX_CORE_ID] = {0};
- bool core_init[MAX_CORE_ID][N_STALL_REASONS] = {{0}};
-
- // specifies we are getting packets in slow or fast mode
- // In fast mode, events are not compressed anymore
- bool gvsocSlowMode = false; 
-
-};
-
-
-#endif  //PROFILER_SERVER_HPP
diff --git a/tools/profiler/backend/include/tldata.hpp b/tools/profiler/backend/include/tldata.hpp
deleted file mode 100644
index 1cc64f838..000000000
--- a/tools/profiler/backend/include/tldata.hpp
+++ /dev/null
@@ -1,225 +0,0 @@
-/*
- * Copyright (C) 2020  GreenWaves Technologies, SAS
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU Affero General Public License as
- * published by the Free Software Foundation, either version 3 of the
- * License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU Affero General Public License for more details.
- *
- * You should have received a copy of the GNU Affero General Public License
- * along with this program.  If not, see <https://www.gnu.org/licenses/>.
- */
-
-#ifndef TLDATA_HPP
-#define TLDATA_HPP
-
-#include <stdlib.h>
-#include <vector>
-#include <cstdint>
-#include <deque>
-
-#define ANSI_COLOR_RED     "\x1b[31m"
-#define ANSI_COLOR_GREEN   "\x1b[32m"
-#define ANSI_COLOR_YELLOW  "\x1b[33m"
-#define ANSI_COLOR_BLUE    "\x1b[34m"
-#define ANSI_COLOR_MAGENTA "\x1b[35m"
-#define ANSI_COLOR_CYAN    "\x1b[36m"
-#define ANSI_COLOR_RESET   "\x1b[0m"
-
-
-#define MEMBER_SIZE(type, member) sizeof(((type *)0)->member)
-
-/**
-  @brief to save memory, this structure stores several T values in a compressed
-         manner.
-  */
-template <typename T> class compressed_data_t {
-public:
-  T value;
-  uint8_t rle_coeff;  /**< data represents actually rle_coeff consecutive equivalent events */
-  uint16_t n_items;    /**< value is actually the average of n_items events */
-};
-
-
-/**
-  @brief stores ready-to-use decompressed data + compression method(s) that were
-         used for the compression (thus gives a hint of the loss of precision
-         due to compression)
-*/
-template <typename T> class decompressed_data_t {
-public:
-  T value;
-  uint16_t n_items_in_avg; /**< 'value' is the 'mean of n_items_in_avg' elements */
-};
-
-template <typename T>
-std::ostream &operator<<(std::ostream &os, compressed_data_t<T> const &m) {
-    return os << "(" << (int) m.rle_coeff << " x " << m.value << ")";
-}
-
-/**
-@brief a wrapper class used by the TLData class to store data of any type
-*/
-template <typename T> class Data_with_time{
-public:
-  T d;            /**< data stored by the structure */
-  uint64_t begin; /**< data is displayed from this timestamp in timeline */
-  uint64_t end;   /**< data is displayed to this timestamp in timeline */
-  bool mixed_up;   /**< if true, means that this data is a mix of different events*/
-};
-
-template <typename T> class TLData;
-
-/**
- * @brief iterator over a TLData to get all the visible elements in a given
- *        time window at a given scale.
- *
- * Items in an inverval [t0, t1] are not necessarily yielded in order
- */
-template <typename T> class TLData_iterator{
-public:
-  TLData_iterator(const TLData<T>* ref, uint64_t t0, uint64_t t1,
-      uint64_t timeunits_per_pixel);
-  bool done() const {return is_done;};
-  TLData_iterator<T> & operator++();
-  const Data_with_time<T>& operator*() const;
-  uint64_t get_t0() const {return t0;}
-  uint64_t get_t1() const {return t1;}
-  uint64_t get_tu_p_pxl() const {return timeunits_per_pixel;}
-
-private:
-  void binary_search_j();
-  void update_current();
-  const TLData<T>* ref;
-  bool is_done;
-  uint current_i;
-  uint current_j;
-  uint jmin, jmax;
-  uint64_t t0, t1;
-  uint64_t timeunits_per_pixel;
-  bool initial_line;
-};
-
-/**
- * @brief data structure to efficiently get the elements to display in a given
- *        time window.
- *
- * It relies on these two facts:
- *    - when zoom factor is high (i.e. window very narrow), only a few items are
- *      in the window, even if the timeline is very long
- *    - when zoom factor is low, a lot of items are mixed up, because resolution
- *      is limited. So the number of items to display is not that high in that
- *      case either.
- *
- * Moreover some assumptions are made to be more efficient:
- *    1. items do not overlap, i.e. end(item i) < begin(item i + 1). You can use
- *      several TLData instances to respect that (one instance per core).
- *    2. items are added in chronological order
- *
- * Complexity:
- *    - This implementation provide a O(1) insertion time (on average).
- *    - Iterate over all visible items in a given time window costs O(M + log(N))
- * where M is the number of visible items in the window and N is the total
- * number of items stored in the data structure.
- *
- * It is almost optimal, because the complexity cannot be lower than O(M)
- * and the term log(N) (multiplied by a hiden constant) is not high compared to
- * M in practice.
- *
- * Internal details:
- * The underlying data structure is a 2D array. We group into a vector
- * events with a similar duration. Events whose duration is in range
- * [2^i, 2^(i+1) ) are stored in the vector data[i]. Each data[i] is sorted by
- * chronological order (see assumption 2). Moreover we limit i to be lower than
- * the class member imax (with 64 bit integers, imax is at most 63). So we have
- * imax vectors.
- *
- * When a time window and a screen resolution are given, we first compute the
- * lowest i0 such that an event of duration 2^i0 is visible. Then for each
- * i0 <= i < imax, we binary search the events that intersect the time window.
- * You can easily verify the complexity mentionned above, but you can also
- * understand the idea as follows: at large scale (i.e. i0 large), we avoid
- * the iteration over numerous short events, so it is fast. At small scale
- * (i.e. i0 low), iterating is fast as well because the time window is narrow.
- *
- * The last difficulty comes from the fact that a cluster of non-visible events
- * can be visible. To avoid a costly iteration over short events at large scale,
- * we directly add clusters of shorter events into each data[i]. This is the
- * meaning of the mixed_up field in the Data_with_time_structure. If data[i][j] has
- * mixed_up set to true, it means that data[i][j] does not correspond to a event,
- * it is just a "marker" to say: "between data[i][j].begin and data[i][j].end
- * there is a bunch of short events, but at that scale, you cannot see precisely
- * what is inside". If you do the maths, you'll see that the memory overhead
- * with these markers is at most 100% (i.e. mem usage is multiplied by 2).
- *
- * Possible improvements:
- *    - what if there are so many events that they cannot fit in memory? The
- *      structure could use hard drive storage when the number of items inside
- *      becomes too large
- *
- */
-template <typename T> class TLData {
-public:
-  /**
-   * @param imin: merge all events whose duration is in range [1, 2^imin]
-   * @param resolution: width of the timeline, in pixels (can be the resolution
-            of the screen).
-   */
-  TLData(uint imin = 10, uint resolution = 1920);
-
-  /**
-   * @param begin: beginning of the event (timestamp)
-     @param end: end of the event
-     @param item: data that represents the event to store
-     @param level_coeff (optional): to force an item to be at a higher/lower
-            place in the 2D array. For example, if level_coeff=4, item would
-            be visible iff it is greater than 4 pixels (and not 1). Values
-            between 0 and 1 can also be used to force compressed items as mixed
-            up items.
-    */
-  void add_item(uint64_t begin, uint64_t end, T item);
-
-  /**
-      @param begin
-      @param end
-      @param zoom_factor number of pixels per time unit. So it defines the
-             minimal duration for an event to be visible
-      @return an iterator to get all visible items that are between begin and
-              end
-    */
-  TLData_iterator<T> between(uint64_t begin, uint64_t end, double zoom_factor) const;
-
-  /**
-      print into standard output a summary of data stored. Useful to debug
-    */
-  void overview() const;
-
-  /**
-    @return the timestamp of the end of the last added event
-    */
-  uint64_t get_end_of_last_event() const {return last_event; }
-
-private:
-  friend class TLData_iterator<T>;
-  /* data[i] contains items that are visible iff 1 pixel < 2**i time units */
-  std::vector< std::vector< Data_with_time<T> > > data;
-  uint imin;    /**< items of duration < 2^imin are merged */
-  const uint imax = 55; /**< max duration for an event is 2^imax */
-  uint resolution;      /**< resolution of the screen */
-  uint64_t last_event;   /**< last added event */
-};
-
-/**
-  function to print a Data_with_time structure
-  @param dwt structure to print
-  @param newline if true, print a newline after the data
- */
-template <typename T> void print_dWT(const Data_with_time<T>& dwt, bool newline=true);
-template <typename T> std::deque<Data_with_time<decompressed_data_t<T> > >
-                        decompress(TLData_iterator<compressed_data_t<T> > iter);
-#endif  //TLDATA_HPP
diff --git a/tools/profiler/backend/include/trace_dumper.hpp b/tools/profiler/backend/include/trace_dumper.hpp
deleted file mode 100644
index 8d5e445b7..000000000
--- a/tools/profiler/backend/include/trace_dumper.hpp
+++ /dev/null
@@ -1,118 +0,0 @@
-/*
- * Copyright (C) 2020  GreenWaves Technologies, SAS
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU Affero General Public License as
- * published by the Free Software Foundation, either version 3 of the
- * License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU Affero General Public License for more details.
- *
- * You should have received a copy of the GNU Affero General Public License
- * along with this program.  If not, see <https://www.gnu.org/licenses/>.
- */
-
-
-#ifndef __TRACE_DUMPER_HPP__
-#define __TRACE_DUMPER_HPP__
-
-#include "trace_dumper_types.h"
-#include "trace_dumper_utils.h"
-
-#include <string>
-#include <fstream>
-#include <unordered_map>
-
-#include <sys/types.h>
-#include <regex.h>
-
-class Trace
-{
-public:
-	Trace(std::string path, int id, uint32_t type, int width);
-
-	std::string path;
-	int id;
-	uint32_t type;
-	int width;
-
-	bool operator< (const Trace& t) const {return (this->path.compare(t.path) < 0);}
-};
-
-class trace_packet
-{
-public:
-	trace_packet();
-	virtual ~trace_packet();
-	void dump() const;
-
-	ed_header_t header;
-	ed_conf_t conf;
-	ed_reg_trace_t reg_trace;
-	Trace *trace;
-	uint64_t timestamp;
-	uint8_t *data;
-	int size;
-};
-
-class trace_dumper_client;
-
-class trace_dumper_trace
-{
-public:
-    trace_dumper_trace(trace_dumper_client *client, int id, ed_trace_type_e type, int width);
-
-    void dump(int64_t timestamp, uint8_t *value, int width);
-
-private:
-	int id;
-    ed_trace_type_e type;
-    int width;
-    trace_dumper_client *client;
-    int id_size;
-};
-
-class trace_dumper_server
-{
-public:
-		trace_dumper_server(std::string filepath);
-
-		int open();
-		virtual int get_packet(trace_packet *packet);
-		uint64_t get_timestamp() const {return timestamp;}
-
-private:
-
-	std::string filepath;
-	std::ifstream file;
-
-protected:
-	uint64_t timestamp;
-	std::unordered_map<int, Trace *> traces;
-};
-
-
-
-class trace_dumper_client
-{
-public:
-    trace_dumper_client(std::string filepath);
-
-    int open(ed_conf_timescale_e timescale=ED_CONF_TIMESCALE_PS);
-    void close();
-
-    trace_dumper_trace *reg_trace(std::string path, uint32_t id, ed_trace_type_e type, uint32_t width);
-
-    int dump_trace(int64_t timestamp, int id, int id_size, ed_trace_type_e type, uint8_t *value, int width);
-
-private:
-	std::string filepath;
-	std::ofstream file;
-	int64_t current_timestamp;
-};
-
-
-#endif
diff --git a/tools/profiler/backend/include/trace_dumper_profiling.hpp b/tools/profiler/backend/include/trace_dumper_profiling.hpp
deleted file mode 100644
index efeebc4a6..000000000
--- a/tools/profiler/backend/include/trace_dumper_profiling.hpp
+++ /dev/null
@@ -1,54 +0,0 @@
-/*
- * Copyright (C) 2020  GreenWaves Technologies, SAS
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU Affero General Public License as
- * published by the Free Software Foundation, either version 3 of the
- * License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU Affero General Public License for more details.
- *
- * You should have received a copy of the GNU Affero General Public License
- * along with this program.  If not, see <https://www.gnu.org/licenses/>.
- */
-
-#ifndef TRACE_DUMPER_PROFILING_HPP
-#define TRACE_DUMPER_PROFILING_HPP
-
-// used as indices, so don't play with the = 0 value
-// stall_strings must correspond line by line, so don't shuffle the lines either
-enum stall_e {
-	JMP_STALL = 0,
-	I_MISS,
-	MISALIGNED,
-	LD_STALL,
-  N_STALL_REASONS
-};
-const char* const stall_strings[] = {
-	"pcer_jmp_stall",
-	"pcer_imiss",
-	"misaligned",
-	"pcer_ld_stall"
-};
-// depends on the number of parenthesis in the stall regex
-#define STALL_NMATCH (N_STALL_REASONS + 6)
-
-const char* const dma_c_regex = "^.*/dma/channel_[0-9]+$";
-const char* const stall_cluster_regex = "^.*/cluster/pe([0-9]+)/";
-const char* const stall_fc_regex = "^.*/chip/soc/fc/";
-
-typedef struct {
-  stall_e type;
-  int core_id;
-} stall_trace_t;
-
-typedef struct {
-  stall_trace_t trace;
-  uint32_t pc;
-  int cycle_penalty;
-} stall_event_t;
-
-#endif // TRACE_DUMPER_PROFILING_HPP
diff --git a/tools/profiler/backend/include/trace_dumper_types.h b/tools/profiler/backend/include/trace_dumper_types.h
deleted file mode 100644
index 027fcbd69..000000000
--- a/tools/profiler/backend/include/trace_dumper_types.h
+++ /dev/null
@@ -1,77 +0,0 @@
-/*
- * Copyright (C) 2020  GreenWaves Technologies, SAS
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU Affero General Public License as
- * published by the Free Software Foundation, either version 3 of the
- * License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU Affero General Public License for more details.
- *
- * You should have received a copy of the GNU Affero General Public License
- * along with this program.  If not, see <https://www.gnu.org/licenses/>.
- */
-
-
-#ifndef __TRACE_DUMPER_TYPES_H__
-#define __TRACE_DUMPER_TYPES_H__
-
-#include <stdint.h>
-
-typedef enum
-{
-	ED_CONF_VERSION = 1,
-	ED_CONF_LAST_VERSION = ED_CONF_VERSION,
-} ed_conf_version_e;
-
-typedef enum
-{
-	ED_CONF_TIMESCALE_PS = 0,
-	ED_CONF_TIMESCALE_NS = 1,
-} ed_conf_timescale_e;
-
-typedef enum
-{
-    ED_TYPE_CONF          = 0x1,
-    ED_TYPE_REG_TRACE     = 0x2,
-    ED_TYPE_TIMESTAMP8    = 0x3,
-    ED_TYPE_TIMESTAMP16   = 0x4,
-    ED_TYPE_TIMESTAMP32   = 0x5,
-    ED_TYPE_TIMESTAMP64   = 0x6,
-    ED_TYPE_TRACE         = 0x7,
-    ED_TYPE_TRACE_SET_0   = 0x8,
-    ED_TYPE_TRACE_SET_1   = 0x9,
-} ed_header_type_e;
-
-typedef enum
-{
-	ED_TRACE_BITFIELD = 0x1,
-    ED_TRACE_REAL     = 0x2,
-    ED_TRACE_VARLEN   = 0x3,
-} ed_trace_type_e;
-
-typedef struct
-{
-    uint8_t type;
-} ed_header_t;
-
-typedef struct
-{
-	uint32_t version;
-	uint32_t timescale;
-} ed_conf_t;
-
-typedef struct
-{
-	uint32_t type;
-	uint32_t width;
-	uint32_t path_len;
-	uint32_t id;
-} ed_reg_trace_t;
-
-
-
-#endif
\ No newline at end of file
diff --git a/tools/profiler/backend/include/trace_dumper_utils.h b/tools/profiler/backend/include/trace_dumper_utils.h
deleted file mode 100644
index a918ebfdc..000000000
--- a/tools/profiler/backend/include/trace_dumper_utils.h
+++ /dev/null
@@ -1,110 +0,0 @@
-/*
- * Copyright (C) 2020  GreenWaves Technologies, SAS
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU Affero General Public License as
- * published by the Free Software Foundation, either version 3 of the
- * License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU Affero General Public License for more details.
- *
- * You should have received a copy of the GNU Affero General Public License
- * along with this program.  If not, see <https://www.gnu.org/licenses/>.
- */
-
-
-#ifndef __TRACE_DUMPER_UTILS_H__
-#define __TRACE_DUMPER_UTILS_H__
-
-
-#include <stdint.h>
-#include <fstream>
-
-static inline uint32_t encode_id(uint32_t id, int *_id_size)
-{
-    uint32_t encoded_id = 0;
-    int id_size = 1;
-
-    // ID is encoded with 7 bits every 8 bits
-    while (id)
-    {
-        encoded_id |= (id & 0x7f) << ((id_size - 1)*8);
-
-        // Stop once the whole ID is encoded
-        if (id <= 0x7f)
-            break;
-
-        // Otherwise, set 8th bit of the byte to 1, to specify we continue
-        // to next byte
-        encoded_id |= 1<<(id_size*8-1);
-
-        id_size++;
-        id >>= 7;
-    }
-
-    if (_id_size)
-        *_id_size = id_size;
-
-    return encoded_id;
-}
-
-
-static inline uint32_t decode_id(std::ifstream *file, int *id_size)
-{
-    uint32_t id = 0;
-    int offset = 0;
-
-    while(1)
-    {
-        uint8_t byte;
-        file->read((char *)&byte, 1);
-        if (file->fail())
-            return -1;
-
-        id = id | ((byte & 0x7f) << offset);
-        offset += 7;
-
-        if (byte < 128)
-            break;
-    }
-
-    if (id_size)
-        *id_size = (offset + 7) / 8;
-
-    return id;
-}
-
-
-static inline unsigned int td_get_timestamp(uint64_t diff, int *ts_size)
-{
-    unsigned int type;
-
-    if (diff < (1ULL << 8))
-    {
-        type = ED_TYPE_TIMESTAMP8;
-        *ts_size = 1;
-    }
-    else if (diff < (1ULL << 16))
-    {
-        type = ED_TYPE_TIMESTAMP16;
-        *ts_size = 2;
-    }
-    else if (diff < (1ULL << 32))
-    {
-        type = ED_TYPE_TIMESTAMP32;
-        *ts_size = 4;
-    }
-    else
-    {
-        type = ED_TYPE_TIMESTAMP64;
-        *ts_size = 8;
-    }
-
-    return type;
-}
-
-
-#endif
\ No newline at end of file
diff --git a/tools/profiler/backend/src/backend_interface.cpp b/tools/profiler/backend/src/backend_interface.cpp
deleted file mode 100644
index 180fce761..000000000
--- a/tools/profiler/backend/src/backend_interface.cpp
+++ /dev/null
@@ -1,206 +0,0 @@
-/*
- * Copyright (C) 2020  GreenWaves Technologies, SAS
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU Affero General Public License as
- * published by the Free Software Foundation, either version 3 of the
- * License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU Affero General Public License for more details.
- *
- * You should have received a copy of the GNU Affero General Public License
- * along with this program.  If not, see <https://www.gnu.org/licenses/>.
- */
-
-
-#include <iostream>
-#include <algorithm>
-#include <cstring>
-#include <unistd.h>
-
-#include "backend_interface.hpp"
-#include "datamanager.hpp"
-
-extern Data_manager* core;
-
-int count=0;
-
-Function_table get_function_table(){
-  //std::cout << "** get_function_table ** " << count << std::endl;
-  count +=1;
-  Function_table r;
-  for (auto it : core->fctm) r.push_back(it.second);
-  sort(r.begin(), r.end());
-  return r;
-}
-
-/* return the name of the function that was executed on a given core at a given
- * timestamp. nullptr is returned if no such function exists.
-*/
-const char* function_at(int core_id, uint64_t timestamp){
-  if (core_id >= (int) core->signal_list.size()) return nullptr;
-  auto x = core->signal_list[core_id].between(timestamp, timestamp + 1, 1);
-  if (! x.done()) {
-    return (*x).d;
-  }
-  return nullptr;
-}
-
-const std::vector<TLData<const char*>> & get_timeline_data(){
-  return core->signal_list;
-}
-
-const std::vector<int> & get_timeline_id() {
-  return core->signal_id;
-}
-
-/* useful to know whether the GUI must refresh the timeline or not */
-bool tldata_has_changed(){
-  return core->tld_has_changed;
-}
-
-/* the gui can acknowledge */
-void ack_tld_changes(){
-  core->tld_has_changed = false;
-}
-
-/* useful to know the total length of the displayed timeline */
-uint64_t get_max_time(){
-  //std::cout << "** get_max_time **" << std::endl;
-  uint64_t m = 1;
-  for (auto& c: core->signal_list){
-    uint64_t n = c.get_end_of_last_event();
-    if (n > m) m = n;
-  }
-  return m;
-}
-
-uint get_nPe(){
-  //std::cout << "** get_nPe **" << std::endl;
-  return Data_manager::TLHeight;
-}
-
-std::string get_source_code(std::string function_name){
-  //std::cout << "** get_source_code **" << std::endl;
-  std::string file = file_of(function_name);
-  if (file == ""){
-    std::cout << "[-] Warning: get_source_code of function  ";
-    std::cout << function_name << std::endl;
-    std::cout << "not available" << std::endl;
-    return std::string("");
-  }
-
-  return core->file_m[file];
-}
-
-int function2line_number(std::string function_name){
-  // ask the gdb child process where the given function is in the source code
-  std::string command = "info line " + function_name;
-
-  std::string answer = core->gdb->gdb_request(command);
-  char* line = strdup(answer.c_str());
-  char *token = strtok(line, " ");
-
-  if (token == nullptr || (token = strtok(nullptr, " ")) == nullptr){
-    std::cout << "[-] Error: cannot interpret gdb output: " << line << std::endl;
-    free(line);
-    return 0;
-  }
-
-  char* endptr;
-  int line_number = strtol(token, &endptr, 10);
-
-  if (endptr == token || ! line_number){
-    std::cout << "[-] Warning: no debugging information (please compile with -g option):"  << std::endl;
-    free(line);
-    return 0;
-  }
-  free(line);
-  return line_number;
-}
-
-std::string file_of(std::string function_name){
-  if (core->fctm.find(function_name) == core->fctm.end())
-    return std::string("");
-  return core->fctm[function_name].file;
-}
-
-std::string get_asm_code(std::string function_name){
-  //std::cout << "** get_asm_code **" << std::endl;
-  return core->gdb->gdb_request("disassemble " + function_name);
-}
-
-void lock_data_mtx(){
-  //std::cout << "** lock_data_mtx **" << std::endl;
-  core->data_mutex.lock();
-}
-
-void unlock_data_mtx(){
-  //std::cout << "** unlock_data_mtx **" << std::endl;
-  core->data_mutex.unlock();
-}
-
-const Function_stat* get_function_stat(std::string function_name){
-  //std::cout << "** get_function_stat **" << std::endl;
-  if (core->fctm.find(function_name) == core->fctm.end())
-    return nullptr;
-  return & core->fctm[function_name];
-}
-
-const TLData<g_compressed_data_t>* event_timestamps(int id){
-  //std::cout << "** get_event_timestamps **" << std::endl;
-  return core->get_event_timestamps(id);
-}
-
-const std::vector<int>& id_to_display(){
-  return core->displayed_id;
-}
-
-std::string get_trace_txt_list() {
-  //std::cout << "** get_trace_txt_list **" << std::endl;
-  // we wait until the backend gets the registration of all traces
-  while (!core->trace_registration_done()){
-    usleep(100000);
-  }
-  return core->get_trace_txt_list();
-}
-
-bool add_trace_to_timeline(std::string path){
-  //std::cout << "[+] adding " << path << std::endl;
-  return core->add_trace_to_timeline(path);
-}
-
-void remove_trace_from_timeline(int index){
-  core->remove_trace_from_timeline(index);
-}
-
-uint64_t get_cluster_period(){
-  //std::cout << "** get_cluster_period **" << std::endl;
-  return core->get_cluster_period();
-}
-
-uint64_t get_fc_period(){
-  //std::cout << "** get_fc_period **" << std::endl;
-  return core->get_fc_period();
-}
-
-std::vector<uint64_t> get_cycles_per_pe(){
-  //std::cout << "** get_cycles_per_pe **" << std::endl;
-  const uint64_t* x = core->get_cycles_per_core();
-  if (x == nullptr){
-    return std::vector<uint64_t>(std::max(0, Data_manager::dma_idx - 1), 0);
-  }
-  return std::vector<uint64_t>(x + 1, x + Data_manager::dma_idx);
-}
-
-bool datamanager_done(){
-  return core->is_done();
-}
-
-
-int getSignalIdFromBackend(std::string path) {
-  return core->getSignalId(path);
-}
diff --git a/tools/profiler/backend/src/datamanager.cpp b/tools/profiler/backend/src/datamanager.cpp
deleted file mode 100644
index 8a508991c..000000000
--- a/tools/profiler/backend/src/datamanager.cpp
+++ /dev/null
@@ -1,645 +0,0 @@
-/*
- * Copyright (C) 2020  GreenWaves Technologies, SAS
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU Affero General Public License as
- * published by the Free Software Foundation, either version 3 of the
- * License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU Affero General Public License for more details.
- *
- * You should have received a copy of the GNU Affero General Public License
- * along with this program.  If not, see <https://www.gnu.org/licenses/>.
- */
-
-
-#include <iostream>
-#include <unistd.h>
-#include <errno.h>
-#include <numeric>
-#include <algorithm>
-
-#include "string.h"
-#include "datamanager.hpp"
-
-// TODO: has to change for GAP9 
-const int Data_manager::TLHeight = 9 + 1 + 1;
-const int Data_manager::fc_idx = 0;
-const int Data_manager::dma_idx = 10;
-Data_manager* core;
-
-Data_manager::Data_manager(){
-  server = nullptr;
-}
-
-Data_manager::Data_manager(std::string path_to_fifo, std::string path_to_elf,
-                          uint64_t f_max){
-  elf_file = path_to_elf;
-  std::cout << "[.] Executable file is: " << path_to_elf << std::endl;
-  gdb = new Gdb_interface(path_to_elf);
-  server = new Profiler_server(path_to_fifo, f_max);
-  std::cout << "[.] gdb child set up" << std::endl;
-  if (server->open())
-  {
-    std::cout << "Failed to open file " << path_to_fifo;
-    std::cout << " with error: " << strerror(errno) << std::endl;
-    server = nullptr;
-    return;
-  }
-  parse_elf(path_to_elf, pcm);
-  init_tldata();
-}
-
-Data_manager::~Data_manager(){
-  delete gdb;
-  gdb = nullptr;
-  delete server;
-  server = nullptr;
-  delete[] cycles_per_core;
-  cycles_per_core = nullptr;
-}
-
-void Data_manager::init_tldata(){
-
-  // i just used to initialize all elements of the the signal_list 
-  // with TLData 
-  // This is just for containing core signals ...
-
-  for (int i = 0; i < Data_manager::TLHeight; i++){
-    signal_list.push_back(TLData<const char*>());
-    // initialize corresponding signal id
-    signal_id.push_back(0); 
-    isStateTraceActiv.push_back(false);
-    isPcTraceActiv.push_back(false);
-    stall_timestamp.push_back(0);
-    current_pc.push_back(0);
-    insertion_buffer.push_back((Function_exec)
-        { .function_name = "", .t_start = 0, .t_end = 0});
-  }
-  cycles_per_core = new uint64_t[dma_idx];
-  for (int i = 0; i< dma_idx; i++) cycles_per_core[i] = 0;
-}
-
-const TLData<g_compressed_data_t>* Data_manager::get_event_timestamps(int id) const {
-  if (server && server->events.find(id) != server->events.end()) {
-    return & server->events[id];
-  }
-  return nullptr;
-}
-
-std::string Data_manager::get_trace_txt_list() const {
-  if (server == nullptr) return "";
-  return server->get_trace_txt_list();
-}
-
-bool Data_manager::trace_registration_done() const {
-  return (server != nullptr && server->trace_registration_done());
-}
-
-bool Data_manager::traceFound(int trace_id) {
-  // searching for trace_id in the displayed_id
-  for (int id : displayed_id) {
-    if (trace_id == id)
-      return true;
-  }
-  return false;  
-}
-
-
-bool Data_manager::add_trace_to_timeline(std::string path) {
-  //std::cout << "add_trace_to_timeline " << path << std::endl;
-  if (server == nullptr) {
-    std::cout << "[-] Server not started " << path << std::endl;
-    return false;
-  }
-  /*
-  if (server->path_mapping.find(path) == server->path_mapping.end()){
-    std::cout << "[-] Error: impossible to find trace " << path << std::endl;
-    return false;
-  }
-  
-  if (server->events.find(server->path_mapping[path]) == server->events.end()){
-    std::cout << "[-] Error: events not found for trace " << path << std::endl;
-    return false;
-  }
-  */
-  //std::cout << "[-] Add trace " << path << " to display " << std::endl;
-  // Here, we need to add them just once! So we need to check that the trace 
-  // has not been entered already
-  if (!traceFound(server->path_mapping[path]))
-    displayed_id.push_back(server->path_mapping[path]);
-  return true;
-}
-
-void Data_manager::remove_trace_from_timeline(int index){
-  if (index < 0) return;
-  if (index >= (int) displayed_id.size()) return;
-  displayed_id.erase(displayed_id.begin() + index);
-}
-
-int Data_manager::getSignalId(std::string path) {
-  return server->getSignalId(path);
-}
-
-bool Data_manager::start_listening(){
-  if (server == nullptr){
-    printf("Error: impossible for the profiling server to listen\n");
-    return false;
-  }
-  listening_thd = std::thread(&Data_manager::listen, this);
-  stop = false;
-  return true;
-}
-
-bool Data_manager::is_buffer_empty(uint idx){
-  return (    insertion_buffer[idx].t_start == 0
-          &&  insertion_buffer[idx].t_end == 0);
-}
-
-void Data_manager::clear_buffer_slot(uint idx){
-  insertion_buffer[idx].t_start = 0;
-  insertion_buffer[idx].t_end = 0;
-}
-
-void Data_manager::begin_function_call(uint idx, uint64_t timestamp, const char* s, int trace_id){
-  //std::cout << " begin-function-call" << std::endl;
-  insertion_buffer[idx] = {
-    .function_name = s,
-    .t_start = timestamp,
-    .t_end = 0    // actually we don't know yet
-  };
-  signal_id[idx]=trace_id;
-}
-
-void Data_manager::end_function_call(uint idx, uint64_t timestamp, int trace_id){
-  Function_exec& fe = insertion_buffer[idx];
-  fe.t_end = timestamp;
-
-  signal_list[idx].add_item(fe.t_start, fe.t_end, fe.function_name);
-  signal_id[idx]= trace_id;
-  
-  fctm[std::string(fe.function_name)].tot_time += fe.t_end - fe.t_start;
-  // TODO : update tot_cycle
-
-#ifdef STALL_WARNING
-  auto& s = fctm[std::string(fe.function_name)];
-  uint64_t t = 0;
-  for (int i = 0; i< N_STALL_REASONS; i++){
-    t += s.stall_info.lost_cycles[i];
-  }
-  if (t > s.tot_time){
-    std::cout << "WARNING stat error for function " << fe.function_name << std::endl;
-    std::cout << t << " > " << s.tot_time << std::endl;
-    for (int i = 0; i< N_STALL_REASONS; i++){
-      std::cout << s.stall_info.lost_cycles[i] << "; ";
-    }
-    std::cout << std::endl;
-    std::cout << "Time range: " << fe.t_start << " -> " << fe.t_end << std::endl;
-    //exit(1);
-  }
-#endif
-
-  clear_buffer_slot(idx);
-}
-
-void Data_manager::set_end_in_tl(int trace_id, uint64_t timestamp, int i){
-  // i represents the core Nb
-  //std::cout << "set_end_in_tl" << std::endl;
-  int maxi = (i >= 0 ? i + 1 : signal_list.size());
-  if (i < 0) i = 0;
-  for (; i < maxi; i++) {
-    if (!is_buffer_empty(i)){
-      end_function_call(i,timestamp,trace_id);
-    }
-  }
-}
-
-void Data_manager::update_data(const trace_packet& packet, const Pc_info& pi,
-                               int core_id, int trace_id){
-  //std::cout << "update_data" << std::endl;
-  data_mutex.lock();
-  tld_has_changed = true;
-  std::string name = std::string(pi.func);
-  if (fctm.find(name) != fctm.end()){
-    fctm[name].n_calls++;
-  } else {
-    fctm[name] = {
-      .name = name,
-      .n_calls = 1,
-      .tot_time = 0,
-      .tot_cycle = 0,
-      .file = std::string(pi.file),
-      .stall_info = Stall_stat()
-    };
-  }
-  if (!is_buffer_empty(core_id)) {
-      end_function_call(core_id, packet.timestamp, trace_id);
-  }
-
-  signal_id[core_id]=trace_id;
-
-  begin_function_call(core_id, packet.timestamp, pi.func, trace_id);
-
-  // TODO is this really correct?
-  stall_timestamp[core_id] = packet.timestamp;
-
-  if (file_m.find(pi.file) == file_m.end()){
-    file_m[pi.file] = read_file(pi.file);
-  }
-  data_mutex.unlock();
-}
-
-void Data_manager::update_dma(const trace_packet& packet, bool active, int trace_id){
-  tld_has_changed = true;
-  if (active && is_buffer_empty(Data_manager::dma_idx)){
-    data_mutex.lock();
-    signal_id[Data_manager::dma_idx]=trace_id;
-    begin_function_call(Data_manager::dma_idx, packet.timestamp, "dma", trace_id);
-    data_mutex.unlock();
-    return;
-  }
-  if (!active && !is_buffer_empty(Data_manager::dma_idx)){
-    data_mutex.lock();
-    end_function_call( Data_manager::dma_idx, packet.timestamp, trace_id);
-    data_mutex.unlock();
-  }
-}
-
-void Data_manager::update_stall_stats(const stall_event_t& stall){
-  if (pcm.find(stall.pc) != pcm.end()
-      && fctm.find(pcm[stall.pc].func) != fctm.end()){
-    auto& s = fctm[pcm[stall.pc].func].stall_info;
-    s.lost_cycles[(uint) stall.trace.type] += stall.cycle_penalty;
-  }
-}
-
-void Data_manager::begin_stall(stall_trace_t trace, uint64_t timestamp){
-  stall_timestamp[trace.core_id] = timestamp;
-}
-
-void Data_manager::end_stall(stall_trace_t trace, uint64_t timestamp){
-  int penalty = timestamp - stall_timestamp[trace.core_id]; //TODO convert penalty in cycles
-  if (penalty <= 0) penalty = 1; // // should not happen
-  update_stall_stats({
-    .trace = trace,
-    .pc = current_pc[trace.core_id],
-    .cycle_penalty = penalty
-  });
-}
-
-void Data_manager::manageStateTraceStarts(const trace_packet& packet, 
-                            int core_id,
-                            int trace_id) {
-  // Dealing with state trace starting for different configurations
-  //std::cout << "[.] Listen: manageStateTraceStarts core_id " << core_id << std::endl;
-  //std::cout << "isStateTraceActiv[core_id]: " << isStateTraceActiv[core_id]<< std::endl;
-  //std::cout << "isPcTraceActiv[core_id]: " << isPcTraceActiv[core_id]<< std::endl;
-  tld_has_changed =true;
-  if(!isStateTraceActiv[core_id] && isPcTraceActiv[core_id]){
-      // starts a new insertion buffer for pc traces
-      // As pc trace is active, the insertion buffer exists 
-      // and we just have to update the timestamp
-      insertion_buffer[core_id].t_start = packet.timestamp;
-  } else if (!isStateTraceActiv[core_id] && !isPcTraceActiv[core_id]) {
-      // starts a new insertion buffer for pc traces
-      insertion_buffer[core_id] = {
-          .function_name = "",
-          .t_start = packet.timestamp,
-          .t_end = 0    // actually we don't know yet
-      };
-      //std::cout << "signal_id " << core_id << " " << trace_id << std::endl;
-     
-  }
-  
-  // Finally set state trace active
-  isStateTraceActiv[core_id] = true;
-  signal_id[core_id]=trace_id;
-
-}
-
-void Data_manager::manageStateTraceStops( const trace_packet& packet, 
-                            int core_id,
-                            int trace_id){
-  // Dealing with a state trace that stops in different 
-  // configurations
-  //std::cout << "[.] Listen: manageStateTraceStops " << core_id << std::endl;
-  //std::cout << "isStateTraceActiv[core_id]: " << isStateTraceActiv[core_id]<< std::endl;
-  //std::cout << "isPcTraceActiv[core_id]: " << isPcTraceActiv[core_id]<< std::endl;
-
-  if ((isStateTraceActiv[core_id] && isPcTraceActiv[core_id]) ||
-      (isStateTraceActiv[core_id] && !isPcTraceActiv[core_id])) {
-    // Close the current insertion buffer
-    Function_exec& fe = insertion_buffer[core_id];
-    fe.t_end = packet.timestamp;
-    signal_list[core_id].add_item(fe.t_start, fe.t_end, fe.function_name);
-    //std::cout << "Function " << fe.function_name << " " << fe.t_start << " " << fe.t_end  << std::endl;
-    //std::cout << "signal_id " << core_id << " " << trace_id << std::endl;
-    signal_id[core_id]= trace_id;
-    fctm[std::string(fe.function_name)].tot_time += fe.t_end - fe.t_start;
-
-    if (isPcTraceActiv[core_id])
-      // update the insertion buffer for the timestamp
-      insertion_buffer[core_id].t_start=packet.timestamp;
-    else 
-      clear_buffer_slot(core_id);
-    }
-  isStateTraceActiv[core_id] = false;
-}
-
-void Data_manager::managePcTraceStops(const trace_packet& packet, 
-                                      int core_id) {
-  //std::cout << "[.] Listen: managePcTraceStops " << core_id << std::endl;
-  //std::cout << "isStateTraceActiv[core_id]: " << isStateTraceActiv[core_id]<< std::endl;
-  //std::cout << "isPcTraceActiv[core_id]: " << isPcTraceActiv[core_id]<< std::endl;
-  if (isStateTraceActiv[core_id] && isPcTraceActiv[core_id]) {
-      // Close the insertion buffer
-      Function_exec& fe = insertion_buffer[core_id];
-      fe.t_end = packet.timestamp;
-      signal_list[core_id].add_item(fe.t_start, fe.t_end, fe.function_name);
-      //std::cout << "Function " << fe.function_name << " " << fe.t_start << " " << fe.t_end  << std::endl;
-      //std::cout << "signal_id " << core_id << " " << trace_id << std::endl;
-      //signal_id[core_id]= trace_id;
-      // Maybe not to do in the second case ..
-      fctm[std::string(fe.function_name)].tot_time += fe.t_end - fe.t_start;
-      clear_buffer_slot(core_id);
-      // Open a new insertion buffer with no function name
-      insertion_buffer[core_id] = {
-          .function_name = "",
-          .t_start = packet.timestamp,
-          .t_end = 0    // actually we don't know yet
-      };
-  }
-  if (!isStateTraceActiv[core_id] && isPcTraceActiv[core_id]) {
-    // just clearing the insertion_buffer
-    clear_buffer_slot(core_id);
-  }
-  // Finally set PC trace not active
-  isPcTraceActiv[core_id] = false;
-}
-
-void Data_manager::managePcTraceStarts( const trace_packet& packet, 
-                          const Pc_info& pi,
-                          int core_id) {
-    // In this function, we take care of the different configurations 
-    // in which a pc trace can start
-    //std::cout << "[.] Listen: managePcTraceStarts " << core_id << std::endl;
-    //std::cout << "isStateTraceActiv[core_id]: " << isStateTraceActiv[core_id]<< std::endl;
-    //std::cout << "isPcTraceActiv[core_id]: " << isPcTraceActiv[core_id]<< std::endl;
-    // Get the current debug info
-    data_mutex.lock();
-    tld_has_changed =true;
-    // get current function name
-    std::string name = std::string(pi.func);
-    //std::cout << "function name: " << name << std::endl;
-    // Checking if it's needed to register the function
-    if (fctm.find(name) != fctm.end()){
-      // new name has been found in fctm: count nb of times it has been called
-      fctm[name].n_calls++;
-    } else {
-      // First time this function has been called
-      // Create its data in the list
-      fctm[name] = {
-        .name = name,
-        .n_calls = 1,
-        .tot_time = 0,
-        .tot_cycle = 0,
-        .file = std::string(pi.file),
-        .stall_info = Stall_stat()
-      }; // finished getting debug info
-    }
-
-     if (!isStateTraceActiv[core_id] && isPcTraceActiv[core_id]) {
-       // In this case, just change the function name in the insertion buffer
-       // and t_start
-       insertion_buffer[core_id] = {
-            .function_name = pi.func,
-            .t_start = packet.timestamp,  
-            .t_end = 0    // actually we don't know yet
-        };
-     }
-
-     if (!isStateTraceActiv[core_id] && isPcTraceActiv[core_id]) {
-         insertion_buffer[core_id] = {
-            .function_name = pi.func,
-            .t_start = packet.timestamp,
-            .t_end = 0    // actually we don't know yet
-        };
-     }
-
-    if ((isStateTraceActiv[core_id] && !isPcTraceActiv[core_id]) ||
-        (isStateTraceActiv[core_id] && isPcTraceActiv[core_id])) {
-
-        // We need to add an item to the signal_list 
-        // and close the current insertion buffer 
-        Function_exec& fe = insertion_buffer[core_id];
-        fe.t_end = packet.timestamp;
-        signal_list[core_id].add_item(fe.t_start, fe.t_end, fe.function_name);
-        //std::cout << "signal_id " << core_id << " " << trace_id << std::endl;
-        //signal_id[core_id]= trace_id;
-        fctm[std::string(fe.function_name)].tot_time += fe.t_end - fe.t_start;
-        clear_buffer_slot(core_id);
-
-        // Open a new insertion buffer as the state trace remains activ
-        // pcTrace becomes activ
-        insertion_buffer[core_id] = {
-            .function_name = pi.func,
-            .t_start = packet.timestamp,
-            .t_end = 0    // actually we don't know yet
-        };
-
-        // TODO is this really correct?
-        stall_timestamp[core_id] = packet.timestamp;
-
-        // read function source file
-        if (file_m.find(pi.file) == file_m.end()){
-          file_m[pi.file] = read_file(pi.file);
-        }
-    }
-
-    if (!isStateTraceActiv[core_id] && !isPcTraceActiv[core_id]) {
-      // In this case, we just create an insertion buffer
-      // but its timestamp will be set later 
-      // Open a new insertion buffer -- begin_function_call
-      insertion_buffer[core_id] = {
-          .function_name = pi.func,
-          .t_start = 0, // not set yet
-          .t_end = 0   // actually we don't know yet
-      };
-      //std::cout << "signal_id " << core_id<< " " << trace_id << std::endl;
-      //signal_id[core_id]=trace_id;
-    }
-    
-  // Finally set PC trace active
-  isPcTraceActiv[core_id] = true;
-
-  // release mutex
-  data_mutex.unlock();
-}
-
-
-bool Data_manager::isRisingEdge(const trace_packet &packet) {
-  if (packet.header.type == ED_TYPE_TRACE_SET_1){
-      // store the rising edge info
-      risingEdgeDetected[packet.trace->id] = true;
-      return true;
-  }
-  else 
-    return false;
-}
-
-bool Data_manager::isFallingEdge(const trace_packet &packet)
-{
-  if (packet.header.type == ED_TYPE_TRACE_SET_0){ 
-    if (risingEdgeDetected.find(packet.trace->id) != risingEdgeDetected.end() ) {
-      risingEdgeDetected[packet.trace->id] = false;
-      return true;
-    }
-  }
-  return false;
-}
-
-
-
-void Data_manager::fillUpStats(const trace_packet &packet) {
-
-// Filling up signal duty statistics
-
-  // temporary cumulative time
-  int cumulTime = 0;
-
-  if (isRisingEdge(packet)){
-    //std::cout << "b - ";
-    //packet.dump();
-    risingEdgeTime[packet.trace->id] = server->get_timestamp();
-  }
-  else if (isFallingEdge(packet)){
-    //std::cout << "e - ";
-    //packet.dump();
-    if (signalDutyTime.find(packet.trace->id) != signalDutyTime.end())
-      cumulTime= signalDutyTime[packet.trace->id];
-    else 
-      cumulTime = 0;
-    signalDutyTime[packet.trace->id] = cumulTime + 
-      (server->get_timestamp() - risingEdgeTime[packet.trace->id]);
-  }
-}
-
-void Data_manager::listen(){
-  int core_id;
-  uint32_t pc;
-  uint64_t hit = 0, miss = 0;
-  uint64_t period;
-  bool active; // ???
-  //bool just_woke_up[isStateTraceActiv.size()]; // ???
-  stall_trace_t stall;
-  //bool isNotStateTrace=true;
- 
- std::cout << "[.] begin listen()" << std::endl;
-
- while(! stop)
-  {
-    
-    trace_packet packet;
-    if (server->get_packet(&packet)){
-      //std::cout << "[.] Got last packet" << std::endl;
-      break;
-    }
-
-    // Filling up signal duty statistics
-    fillUpStats(packet);
-
-    //std::cout << "[.] Listen: New Packet" << std::endl;
-    //packet.dump();
-    if (server->contains_state(packet, core_id, active)){
-      // Dealing with state traces
-        if ((int) isStateTraceActiv.size() <= core_id) continue;
-        // set it activ or not activ
-        if (active && !isStateTraceActiv[core_id]){
-            manageStateTraceStarts(packet, core_id, server->path_mapping[packet.trace->path]);
-        }
-        else if (!active && isStateTraceActiv[core_id])
-            manageStateTraceStops(packet, core_id, server->path_mapping[packet.trace->path]);
-    }// end treating state traces
-
-    if (server->contains_pc(packet, core_id, pc)) {
-      // Dealing with pc traces
-        if (TLHeight <= core_id) { // should not happen
-        std::cout << "[-] error with core " << core_id << " / " << TLHeight << std::endl;
-        continue;
-        }
-        // set it activ or not activ 
-        // fill in the program counter
-        current_pc[core_id] = pc;
-        if (core->pcm.find(pc) == core->pcm.end()){
-            // in this case, it holds no information
-            //isNotStateTrace=true;
-            miss++;
-            managePcTraceStops(packet, core_id);
-        } else {
-            // in this case, the programm counter (pc) holds some information
-            hit++;
-            core->pcm[pc].count++;
-            // if buffer is empty or insertion_buffer function name is different
-            // then PC trace starts
-            if (is_buffer_empty(core_id) ||
-               strcmp(core->pcm[pc].func, insertion_buffer[core_id].function_name)){
-                managePcTraceStarts(packet, core->pcm[pc], core_id);
-            }
-        }
-    }// End dealing with pc traces
-
-    if (server->contains_fc_period(packet, period)){
-      fc_period = period;
-    }
-    if (server->contains_cluster_period(packet, period)){
-      cluster_period = period;
-    }
-
-    if (server->contains_dma(packet, active)){
-      update_dma(packet, active,server->path_mapping[packet.trace->path] );
-    }
-
-    // managing stall signals
-    if (server->contains_begin_stall(packet, stall)){
-      //std::cout << "b - ";
-      //packet.dump();
-      begin_stall(stall, server->get_timestamp());
-    }
-    else if (server->contains_end_stall(packet, stall) && isStateTraceActiv[stall.core_id]){
-      //std::cout << "e - ";
-      //packet.dump();
-      end_stall(stall, server->get_timestamp());
-    }
-    
-    if (server->contains_cluster_cycle(packet)){
-      for (int i = 1; i < dma_idx; i++){
-        if (isStateTraceActiv[i]) cycles_per_core[i]++;
-      }
-    }
-  }
-  done = true;
-  std::cout << "[+] metrics collect done (" << hit << " hits ; " << miss;
-  std::cout << " misses" << std::endl;
-
-}
-
-
-bool init_backend(std::string path_to_fifo, std::string path_to_elf,
-                  uint64_t f_max){
-  core = new Data_manager(path_to_fifo, path_to_elf, f_max);
-  return core->start_listening();
-}
-
-void stop_server(){
-  core->stop = true;
-  delete core->gdb;
-  core->listening_thd.join();
-}
-
-int getSignalId(std::string path) {
-  // Returns the signal ID of the signal, depending on its path
-  return core->getSignalId(path);
-}
diff --git a/tools/profiler/backend/src/gdb_interface.cpp b/tools/profiler/backend/src/gdb_interface.cpp
deleted file mode 100644
index 7ee6bee4a..000000000
--- a/tools/profiler/backend/src/gdb_interface.cpp
+++ /dev/null
@@ -1,134 +0,0 @@
-/*
- * Copyright (C) 2020  GreenWaves Technologies, SAS
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU Affero General Public License as
- * published by the Free Software Foundation, either version 3 of the
- * License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU Affero General Public License for more details.
- *
- * You should have received a copy of the GNU Affero General Public License
- * along with this program.  If not, see <https://www.gnu.org/licenses/>.
- */
-
-
-#include <iostream>
-#include <sys/prctl.h>
-#include <sys/types.h>
-#include <signal.h>
-#include <sys/wait.h>
-#include <unistd.h>
-#include <errno.h>
-
-#include "string.h"
-
-#include "gdb_interface.hpp"
-
-Gdb_interface::Gdb_interface(std::string elf_file){
-  setup_gdb_child(elf_file);
-}
-
-Gdb_interface::~Gdb_interface(){
-  kill(gdb_process, SIGKILL);
-  waitpid(gdb_process, NULL, 0);
-}
-
-void Gdb_interface::setup_gdb_child(std::string elf_file){
-  /* create a gdb child process to answer requests in real time using pipes */
-  pid_t pid = 0;
-  int inpipefd[2];
-  int outpipefd[2];
-
-  if ((0 != pipe(inpipefd)) || (0 != pipe(outpipefd)))
-  {
-    printf("[-] Error: impossible to open pipes to communicate with gdb: %s\n",
-          strerror(errno));
-    exit(1);
-  }
-
-  pid = fork();
-  if (pid == 0)
-  {
-    dup2(outpipefd[0], STDIN_FILENO);
-    dup2(inpipefd[1], STDOUT_FILENO);
-    dup2(inpipefd[1], STDERR_FILENO);
-
-    //ask kernel to deliver SIGTERM in case the parent dies
-    prctl(PR_SET_PDEATHSIG, SIGTERM);
-    execlp("riscv32-unknown-elf-gdb", elf_file.c_str(),
-        "--interpreter=mi", elf_file.c_str(), (char*) NULL);
-    /* send error message through the pipe */
-    printf("[-] Error: failed to launch gdb child process %s\n", strerror(errno));
-    exit(1);
-  }
-
-  close(outpipefd[0]);
-  close(inpipefd[1]);
-  pipe_to_gdb = outpipefd[1];
-  pipe_from_gdb = inpipefd[0];
-  gdb_process = pid;
-  // read and drop gdb initialization text
-  gdb_read();
-}
-
-/* gdb mi output format -> classical format */
-static std::string unescape_string(const std::string& in){
-  int T = in.size();
-  std::string out;
-  for (int i = 0, j = 2; j && i < T; i++){
-    if (in[i] == '\\'){
-      if (i + 1 == T) break;
-      i++;
-      switch(in[i]){
-        case 't':
-          out.push_back('\t'); break;
-        case 'n':
-          out.push_back('\n'); break;
-        case 'r':
-          out.push_back('\r'); break;
-        default:
-          out.push_back(in[i]);
-      }
-    }
-    // drop all non escaped "
-    else if (in[i] != '\"') out.push_back(in[i]);
-    else j--;
-  }
-  if (out.size() > 0 && out.back() == '"') out.pop_back();
-  return out;
-}
-
-std::string Gdb_interface::gdb_read(){
-  std::string answer;
-  char buf[1024];
-  const char* end_marker = "(gdb)";
-  char data_prefix = '~';
-  FILE* in = fdopen(pipe_from_gdb, "r");
-  if (in == NULL){
-    printf("[-] reading from gdb failed: %s\n", strerror(errno));
-    return "";
-  }
-  while (fgets(buf, 1023, in)){
-    if (strncmp(buf, end_marker, strlen(end_marker)) == 0) break;
-    if (buf[0] != data_prefix) continue;
-    answer += unescape_string(buf + 1);
-  }
-  return answer;
-}
-
-std::string Gdb_interface::gdb_request(std::string request){
-  if (request.size() == 0) return "";
-  if (request.back() != '\n') {
-    request.push_back('\r');
-    request.push_back('\n');
-  }
-  if (write(pipe_to_gdb, request.c_str(), request.size()) == -1){
-    printf("[-] Sending command to gdb failed: %s\n", strerror(errno));
-    return "";
-  }
-  return gdb_read();
-}
diff --git a/tools/profiler/backend/src/pc2source.cpp b/tools/profiler/backend/src/pc2source.cpp
deleted file mode 100644
index 960e3ad3e..000000000
--- a/tools/profiler/backend/src/pc2source.cpp
+++ /dev/null
@@ -1,82 +0,0 @@
-/*
- * Copyright (C) 2020  GreenWaves Technologies, SAS
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU Affero General Public License as
- * published by the Free Software Foundation, either version 3 of the
- * License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU Affero General Public License for more details.
- *
- * You should have received a copy of the GNU Affero General Public License
- * along with this program.  If not, see <https://www.gnu.org/licenses/>.
- */
-
-
-#include "pc2source.hpp"
-#include <string.h>
-#include <algorithm>
-#include <vector>
-#include <iostream>
-#include <regex.h>
-
-
-void iss_register_debug_info(const char *binary, Pc_mapping& pcm)
-{
-  FILE *file = fopen(binary, "r");
-  if (file == nullptr){
-    printf("[-] Impossible to open %s\n", binary);
-    printf("%s\n", strerror(errno));
-    return;
-  }
-
-  char* line = NULL;
-  size_t len = 0;
-  while (getline(&line, &len, file) != -1)
-  {
-    char *token = strtok(line, " ");
-    char *tokens[5];
-    int index = 0;
-    while (token)
-    {
-      tokens[index++] = token;
-      token = strtok(NULL, " ");
-    }
-    if (index == 5) {
-      pcm[strtol(tokens[0], NULL, 16)] = (Pc_info) {
-        .line = (int) atoi(tokens[4]),
-        .func = strdup(tokens[1]),
-        .inline_func = strdup(tokens[2]),
-        .file = strdup(tokens[3]),
-        .tot_time = 0,
-        .count = 0
-      };
-    }
-  }
-  free(line);
-}
-
-void parse_elf(const std::string elf_file, Pc_mapping& pcm){
-  std::cout << "[.] parsing elf " << std::endl;
-  std::string info_file = std::string(basename(elf_file.c_str())) + ".debug_info";
-  std::string command = "pulp-pc-info --file " + elf_file;
-  command += " --all-file " + info_file;
-  std::cout << "[.] " << command << std::endl;
-  if (system(command.c_str())){
-    std::cout << "[-] Error: impossible to generate .Debug_info file" << std::endl;
-    return;
-  }
-  std::cout << "[+] parse_elf OK" << std::endl;
-  iss_register_debug_info(info_file.c_str(), pcm);
-}
-
-std::string read_file(std::string filename){
-  std::ifstream t(filename);
-  if (!t) return "";
-  std::stringstream buf;
-  buf << t.rdbuf();
-  return buf.str();
-}
diff --git a/tools/profiler/backend/src/profiler_server.cpp b/tools/profiler/backend/src/profiler_server.cpp
deleted file mode 100644
index f0f4b568d..000000000
--- a/tools/profiler/backend/src/profiler_server.cpp
+++ /dev/null
@@ -1,675 +0,0 @@
-/*
- * Copyright (C) 2020  GreenWaves Technologies, SAS
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU Affero General Public License as
- * published by the Free Software Foundation, either version 3 of the
- * License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU Affero General Public License for more details.
- *
- * You should have received a copy of the GNU Affero General Public License
- * along with this program.  If not, see <https://www.gnu.org/licenses/>.
- */
-
-#include <iostream>
-#include <algorithm>
-#include <sstream>
-
-#include "profiler_server.hpp"
-#include <string.h>
-
-Profiler_server::Profiler_server(std::string file, uint64_t f_max,
-                                 double tolerance)
-    : trace_dumper_server(file)
-{
-  this->f_max = (double)f_max / 1'000'000'000'000; //convertion seconds to ps
-  this->tolerance = tolerance;
-  /* compile regex used for path interpretation */
-  if (regcomp(&dma_regex, dma_c_regex, REG_EXTENDED))
-  {
-    std::cout << "[-] error: impossible to compile dma regex" << std::endl;
-  }
-  std::string stall_str_regex = "((" + std::string(stall_fc_regex) + ")|(";
-  stall_str_regex += std::string(stall_cluster_regex) + "))";
-  stall_str_regex += "(";
-  for (int i = 0; i < N_STALL_REASONS; i++)
-  {
-    if (i != 0)
-      stall_str_regex += "|";
-    stall_str_regex += "(";
-    stall_str_regex += stall_strings[i];
-    stall_str_regex += ")";
-  }
-  stall_str_regex += ")$";
-  std::cout << "[.] stall regex is " << stall_str_regex << std::endl;
-  if (regcomp(&stall_regex, stall_str_regex.c_str(), REG_EXTENDED))
-  {
-    std::cout << "[-] Impossible to compile stall regex : " << stall_str_regex;
-    std::cout << std::endl;
-  }
-}
-
-Profiler_server::~Profiler_server()
-{
-  regfree(&dma_regex);
-  regfree(&stall_regex);
-}
-
-static int parse_core_id(const char *path, const char *ref)
-{
-  int core_id = -1;
-  int L = strlen(ref);
-  int M = strlen(path);
-  if (M < L + 2)
-    return core_id;
-  path += M - L - 2;
-  if (path[0] == 'f' && path[1] == 'c')
-    core_id = 0;
-  else
-    core_id = (int)strtol(path + 1, NULL, 10) + 1;
-  return core_id;
-}
-
-static bool match_path(const char *path, int path_len, const char *ref)
-{
-  int T = path_len;
-  int L = strlen(ref);
-  return (T >= L + 2 && strcmp(ref, path + (T - L)) == 0 && ((path[T - L - 2] == 'e' && path[T - L - 1] >= '0' && path[T - L - 1] <= '9') || (path[T - L - 2] == 'f' && path[T - L - 1] == 'c')));
-}
-
-void print_regerror(int errcode, size_t length, regex_t *compiled)
-{
-  char buffer[length];
-  (void)regerror(errcode, compiled, buffer, length);
-  fprintf(stderr, "Regex match failed: %s\n", buffer);
-}
-
-int first_non_empty(regmatch_t pmatch[], int nmatch, int start = 1)
-{
-  for (int i = start; i < nmatch; i++)
-  {
-    if (pmatch[i].rm_so != pmatch[i].rm_eo)
-      return i;
-  }
-  return -1;
-}
-
-void Profiler_server::path_analysis(const char *path, const trace_packet *packet)
-{
-  /* store additional info in hash tables to speed up packet analysis */
-  if (match_path(path, packet->reg_trace.path_len, "/pc"))
-  {
-    is_pc[packet->reg_trace.id] = parse_core_id(path, "/pc");
-  }
-  else
-    is_pc[packet->reg_trace.id] = -1;
-
-  if (match_path(path, packet->reg_trace.path_len, "/state"))
-  {
-    is_state[packet->reg_trace.id] = parse_core_id(path, "/state");
-  }
-  else
-    is_state[packet->reg_trace.id] = -1;
-
-  if (path == std::string("/sys/board/chip/cluster_clock/period"))
-  {
-    cluster_period_tr_id = packet->reg_trace.id;
-  }
-  else if (path == std::string("/sys/board/chip/soc_clock/period"))
-  {
-    fc_period_tr_id = packet->reg_trace.id;
-  }
-  else if (path == std::string("/sys/board/chip/cluster_clock/cycles"))
-  {
-    cluster_cycle_id = packet->reg_trace.id;
-  }
-  else if (path == std::string("/sys/board/chip/soc_clock/cycles"))
-  {
-    fc_cycle_id = packet->reg_trace.id;
-  }
-
-  if (!regexec(&dma_regex, path, 0, NULL, 0))
-  {
-    is_dma[packet->reg_trace.id] = true;
-  }
-  else
-    is_dma[packet->reg_trace.id] = false;
-
-  regmatch_t pmatch[STALL_NMATCH];
-  if (!regexec(&stall_regex, path, STALL_NMATCH, pmatch, 0))
-  {
-    int core = first_non_empty(pmatch, 4, 2) - 2;
-    int type = first_non_empty(pmatch, STALL_NMATCH, 6) - 6;
-    if (type >= 0 && type < N_STALL_REASONS)
-    {
-      stall_mapping[packet->reg_trace.id] = (stall_trace_t){
-          .type = (stall_e)type,
-          .core_id = (core ? (int)strtol(path + pmatch[4].rm_so, NULL, 10) + 1
-                           : 0)};
-    }
-  }
-}
-
-/*
-  WARNING match_rle(e1, e2) != match_rle(e2, e1) !!
-  the conditions might seem very restrictive, but the rle compression is first
-  designed for clock signals (for which it returns true!)
-*/
-bool Profiler_server::match_rle(const Data_with_time<g_compressed_data_t> &e1,
-                                const Data_with_time<g_compressed_data_t> &e2)
-{
-  if (e1.end != e2.begin)
-    return false;
-  if (e1.d.value != e2.d.value)
-    return false;
-  if (e1.d.n_items == 0 || e2.d.n_items == 0)
-    return false;
-  if ((e1.end - e1.begin) / (e1.d.n_items * e1.d.rle_coeff) !=
-      (e2.end - e2.begin) / (e2.d.n_items * e2.d.rle_coeff))
-    return false;
-
-  return true;
-}
-
-/**
-    adds e to the rle compression buffer, in order for that event to be rle
-    compressed before being sent to its TLData structure.
-*/
-int Profiler_server::flush_to_rle_buffer(uint32_t trace_id, const Event_record &e)
-{
-  if (rle_buffer.find(trace_id) == rle_buffer.end())
-  {
-    rle_buffer[trace_id] = e;
-    return 1;
-  }
-
-  if (match_rle(rle_buffer[trace_id].compr_data, e.compr_data)
-      && rle_buffer[trace_id].compr_data.d.rle_coeff != 0xff){
-    // cool, we can RLE-compress the new item
-    //std::cout << "--- MATCH ----" << std::endl;
-    //print_dWT(rle_buffer[trace_id].compr_data);
-    //print_dWT(e.compr_data);
-    //std::cout << std::endl;
-    rle_buffer[trace_id].compr_data.d.rle_coeff++;
-    rle_buffer[trace_id].compr_data.end = e.compr_data.end;
-    return 0;
-  } else {
-  
-  /*std::cout << "--- FAILED ----" << std::endl;
-    print_dWT(rle_buffer[trace_id].compr_data);
-    print_dWT(e.compr_data);
-    std::cout << std::endl;*/
-  // too bad, cannot compress
-  // so we send the event to its TLData structure
-  //std::cout << "Add Event " << trace_id << " to events[] buffer" << std::endl;
-  events[trace_id].add_item(rle_buffer[trace_id].compr_data.begin,
-                            rle_buffer[trace_id].compr_data.end,
-                            rle_buffer[trace_id].compr_data.d);
-  // then write the new item into the buffer
-  rle_buffer[trace_id] = e;
-  return 1;
-
-  }
-}
-
-void Profiler_server::update_avg_threshold(uint32_t trace_id)
-{
-  auto &et = entry_timestamps[trace_id];
-  // first get the current threshold
-  uint32_t thr = average_buffer[trace_id].average_threshold;
-  if (thr == 0)
-  { // should not happen
-    average_buffer[trace_id].average_threshold = 1;
-    thr = 1;
-  }
-
-  while (et.size() > thr)
-    et.pop_front();
-  et.push_back(timestamp);
-  // hence, thr + 1 == et.size()
-  uint64_t delta_t = timestamp - et.front();
-  if (delta_t == 0)
-    return;
-
-  // we got thr items in delta_t ps, and only one representant of these thr items
-  // was stored. So the current storing frequency is:
-  double f = 1. / (double)delta_t;
-
-  if (trace_id == 62)
-  {
-    //for (auto x: et) std::cout << x << " ";
-    //std::cout << "f = " << f << "; f_max = " << f_max << std::endl;
-    //std::cout << "current thr = " << thr << std::endl;
-  }
-
-  if (thr < (1 << (MEMBER_SIZE(g_compressed_data_t, n_items) * 8)) / 2 && f > tolerance * f_max)
-  {
-    average_buffer[trace_id].average_threshold *= 2;
-  }
-  else if (thr > 1 && f < 1. / tolerance * f_max)
-  {
-    average_buffer[trace_id].average_threshold /= 2;
-  }
-}
-
-/**
-    instead of compressing events to their TLData, we can use this function
-    just to add events to the TLData
-*/
-void Profiler_server::just_add(uint32_t trace_id, uint64_t begin,
-                               uint64_t end, generic_data_t data)
-{
-
-  //std::cout << " Just add trace_id: " << trace_id << std::endl;
-
-  g_compressed_data_t cdata = {
-      .value = data,
-      .rle_coeff = 1,
-      .n_items = 1};
-  Data_with_time<g_compressed_data_t> dwt = {
-      .d = cdata,
-      .begin = begin,
-      .end = end,
-      .mixed_up = false};
-
-  average_buffer[trace_id] = {
-      .compr_data = dwt,
-      .average_threshold = 1};
-
-  Event_record &e = average_buffer[trace_id];
-  flush_to_rle_buffer(trace_id, e);
-}
-
-/**
-    instead of adding events directly to their TLData, we can use this function
-    to compress events before being added
-*/
-void Profiler_server::compress_and_add(uint32_t trace_id, uint64_t begin,
-                                       uint64_t end, generic_data_t data)
-{
-  g_compressed_data_t cdata = {
-      .value = data,
-      .rle_coeff = 1,
-      .n_items = 1};
-  Data_with_time<g_compressed_data_t> dwt = {
-      .d = cdata,
-      .begin = begin,
-      .end = end,
-      .mixed_up = false};
-
-  /* first we handle the compression using the average */
-  // initialisation case
-  if (average_buffer.find(trace_id) == average_buffer.end())
-  {
-    average_buffer[trace_id] = {
-        .compr_data = dwt,
-        .average_threshold = 1};
-    return;
-  }
-
-  Event_record &e = average_buffer[trace_id];
-
-  if (e.compr_data.d.n_items >= e.average_threshold)
-  {
-    /*
-       we have reached the desired compression level using the average
-       so now we send data to the rle buffer for the next layer of compression
-    */
-    e.compr_data.d.value /= e.compr_data.d.n_items;
-    if (flush_to_rle_buffer(trace_id, e))
-    {
-      // if the new item cannot be rle compressed, we might need to change the
-      // average threshold
-      update_avg_threshold(trace_id);
-    }
-    e.compr_data = dwt;
-  }
-  else
-  {
-    /* not enough items in the average buffer, we add one more */
-    e.compr_data.d.value += data;
-    e.compr_data.d.n_items++;
-    e.compr_data.end = timestamp;
-    update_avg_threshold(trace_id);
-  }
-}
-
-void Profiler_server::add_event(const trace_packet *packet)
-{
-
-  if (packet->trace->type == ED_TRACE_VARLEN)
-  {
-    //std::cout << "add_event -- varlen trace is not supported -- trace_id: " << packet->trace->id << std::endl;
-    return;
-  }
-
-  //packet->dump();
-
-  Data_with_time<generic_data_t> dwt;
-
-  if (event_buffer.find(packet->trace->id) != event_buffer.end())
-  {
-    dwt = event_buffer[packet->trace->id];
-
-    /* ----------   tricky point   ---------
-       in an ideal world, we would simply have added the item to events[id]
-       however, if we do that, memory explodes rapidly
-       so, we monitor in real time memory usage of each signal
-       and if this usage becomes too large, we apply a compression strategy:
-          - add the average instead of several samples
-          - RLE compression (perfect for clocks)
-    */
-
-    //compress_and_add(packet->trace->id, dwt.begin, timestamp, dwt.d);
-    //std::cout << "just_add trace " << packet->trace->id << " at " << timestamp << std::endl;
-    just_add(packet->trace->id, dwt.begin, timestamp, dwt.d);
-  }
-  //else
-  //  std::cout << "packet not added trace_id: " << packet->trace->id << std::endl;
-
-  dwt.begin = timestamp;
-  switch (packet->size)
-  {
-  case 1:
-    dwt.d = (generic_data_t) * (uint8_t *)packet->data;
-    break;
-  case 2:
-    dwt.d = (generic_data_t) * (uint16_t *)packet->data;
-    break;
-  case 4:
-    dwt.d = (generic_data_t) * (uint32_t *)packet->data;
-    break;
-  case 8:
-    dwt.d = (generic_data_t) * (uint64_t *)packet->data;
-    break;
-  default:
-    // if packet data cannot be casted to uint32_t = compressed_data_t,
-    // then ignore the packet
-    return;
-  }
-  event_buffer[packet->trace->id] = dwt;
-}
-
-int Profiler_server::getSignalId(std::string path)
-{
-  // Returns the signal ID of the signal, depending on its path
-  //std::cout << "path_mapping vector" << std::endl;
-  /*for (auto elem : path_mapping) {
-    if (elem.first.compare(std::string("/chip/soc/fc/state")) == 0 )
-      std::cout << "*" << elem.first << " " << elem.second << std::endl;
-  }
-  */
-  return path_mapping[path];
-}
-
-int Profiler_server::get_packet(trace_packet *packet)
-{
-  int x = trace_dumper_server::get_packet(packet);
-  if (x == -1)
-  {
-    //std::cout << "[-] Receiving no packet " << std::endl;
-    return -1; // read failed
-  }
-
-  //std::cout << "[-] Receiving packet " << std::endl;
-  //packet->dump();
-  // Packets are filtered depending on their type
-  if (packet->header.type == ED_TYPE_REG_TRACE)
-  {
-    //std::cout << "Packet Header Type: " << "ED_TYPE_REG_TRACE" << std::endl;
-    path_analysis(packet->trace->path.c_str(), packet);
-    //std::cout << "mapping[" << packet->trace->path << "]= " << packet->reg_trace.id << std::endl;
-    path_mapping[packet->trace->path] = packet->reg_trace.id;
-    id_mapping[packet->reg_trace.id] = packet->trace->path;
-    //packet->dump();
-    //std::cout << "[-] Receiving packet ED_TYPE_REG_TRACE" << std::endl;
-  }
-  else if (packet->header.type == ED_TYPE_TRACE ||
-           packet->header.type == ED_TYPE_TRACE_SET_0 ||
-           packet->header.type == ED_TYPE_TRACE_SET_1)
-  {
-    //std::cout << "Packet Header Type: " << "ED_TYPE_TRACE || ED_TYPE_TRACE_SET_0 || ED_TYPE_TRACE_SET_1" << std::endl;
-    // TODO add a filter to prevent memory from being filled to fast...
-    //if (packet->trace->id == 635){
-    //static uint64_t old_tmstp = 0;
-    //std::cout << "cycle @ " << timestamp << "; diff = " << (timestamp - old_tmstp) << std::endl;
-    //old_tmstp = timestamp;
-    // << "!!! add_event " << packet->trace->path << " id" << packet->trace->id << std::endl;
-    add_event(packet);
-    //std::cout << "[-] Receiving packet: add_event " << std::endl;
-    //}
-    event_count[packet->trace->id]++;
-    registration_done = true;
-  }
-  else if (packet->header.type == ED_TYPE_CONF)
-  {
-    //std::cout << "Packet Header Type: " << "ED_TYPE_CONF -- doing nothing" << std::endl;
-    //std::cout << "!!! add_event " << packet->trace->path << std::endl;
-    //add_event(packet);
-  }
-  else if (packet->header.type == ED_TYPE_TIMESTAMP8 ||
-           packet->header.type == ED_TYPE_TIMESTAMP16 ||
-           packet->header.type == ED_TYPE_TIMESTAMP32 ||
-           packet->header.type == ED_TYPE_TIMESTAMP64)
-  {
-    //std::cout << "Packet Header Type: " << "ED_TYPE_TIMESTAMPXX -- doing nothing" << std::endl;
-    //std::cout << "!!! add_event " << packet->trace->path  << std::endl;
-    //add_event(packet);
-  }
-  //else
-  //std::cout << "Packet Header Type: " << packet->header.type << std::endl;
-  return x;
-}
-
-std::string Profiler_server::get_trace_txt_list()
-{
-  std::string txt = "";
-  std::string token;
-  std::vector<std::string> prefix;
-  std::vector<Trace *> v;
-  char delim = '/';
-  for (auto t : traces)
-  {
-    v.push_back(t.second);
-  }
-  std::sort(v.begin(), v.end(), [](const Trace *t1, const Trace *t2) {
-    return t1->path.compare(t2->path) < 0;
-  });
-  for (Trace *t : v)
-  {
-    std::stringstream ss(t->path);
-    std::string cat;
-    uint i = 0;
-    //drop first '/'
-    std::getline(ss, token, delim);
-    while (std::getline(ss, token, delim) && i < prefix.size() && token == prefix[i])
-    {
-      i++;
-    }
-    prefix.resize(i);
-    do
-    {
-      for (uint j = 0; j < prefix.size(); j++)
-        txt += " ";
-      prefix.push_back(token);
-      txt += token + "\n";
-    } while (std::getline(ss, token, delim));
-  }
-  return txt;
-}
-
-uint64_t Profiler_server::data_2_uint64(const trace_packet &packet)
-{
-  if (packet.size == 1)
-    return *(uint8_t *)packet.data;
-  if (packet.size == 2)
-    return *(uint16_t *)packet.data;
-  if (packet.size == 4)
-    return *(uint32_t *)packet.data;
-  if (packet.size == 8)
-    return *(uint64_t *)packet.data;
-  std::cout << "[-] Warning: decoding packet data to integer failed" << std::endl;
-  return (uint64_t)-1; //error
-}
-
-double Profiler_server::data_2_double(const trace_packet &packet)
-{
-  if (packet.size == 4)
-    return *(float *)packet.data;
-  if (packet.size == 8)
-    return *(double *)packet.data;
-  return -1;
-}
-
-bool Profiler_server::contains_pc(const trace_packet &packet,
-                                  int &core_id, uint32_t &pc_value)
-{
-  if (packet.header.type != ED_TYPE_TRACE)
-    return false;
-  if (is_pc.find(packet.trace->id) == is_pc.end() || (core_id = is_pc[packet.trace->id]) == -1)
-    return false;
-  if (packet.size == 4)
-    pc_value = *((uint32_t *)packet.data);
-  else
-  {
-    printf("Impossible to read pc from packet: invalid size (%d != 4)\n",
-           (int)packet.size);
-    pc_value = 0;
-    return false;
-  }
-  return true;
-}
-
-bool Profiler_server::contains_state(const trace_packet &packet,
-                                     int &core_id, bool &active_value)
-{
-  if (packet.header.type != ED_TYPE_TRACE)
-    return false;
-  if (is_state.find(packet.trace->id) == is_state.end() || (core_id = is_state[packet.trace->id]) == -1)
-  {
-    return false;
-  }
-  if (packet.size != 1)
-  {
-    printf("Impossible to read state from packet: invalid size (%d != 1)\n"
-           "Packet is :",
-           (int)packet.size);
-    //packet.dump();
-
-    return false;
-  }
-  active_value = (*((uint8_t *)packet.data) != 0);
-  return true;
-}
-
-bool Profiler_server::contains_dma(const trace_packet &packet, bool &active_value)
-{
-  if (packet.header.type != ED_TYPE_TRACE)
-    return false;
-  if (is_dma[packet.trace->id])
-  {
-    active_value = (*((uint8_t *)packet.data) != 0);
-    return true;
-  }
-  return false;
-}
-
-
-bool Profiler_server::contains_begin_stall(const trace_packet &packet,
-                                           stall_trace_t &trace)
-{
-  if (packet.header.type == ED_TYPE_TRACE_SET_1)
-  {
-    if (contains_stall(packet, trace))
-    {
-#ifdef STALL_WARNING
-      /* a core should not be stalled twice */
-      if (trace.core_id < MAX_CORE_ID && core_stalled[trace.core_id])
-      {
-        std::cout << "[-] WARNING: core " << trace.core_id << " stalled twice";
-        std::cout << std::endl;
-        packet.dump();
-      }
-#endif
-      core_stalled[trace.core_id] = true;
-      return true;
-    }
-  }
-  return false;
-}
-
-bool Profiler_server::contains_end_stall(const trace_packet &packet,
-                                         stall_trace_t &trace)
-{
-  if (packet.header.type == ED_TYPE_TRACE_SET_0)
-  {
-    if (contains_stall(packet, trace))
-    {
-      if (trace.core_id < MAX_CORE_ID && !core_stalled[trace.core_id])
-      {
-#ifdef STALL_WARNING
-        /* first packet might be intialization */
-        if (!core_init[trace.core_id][trace.type])
-        {
-          core_init[trace.core_id][trace.type] = true;
-        }
-        else
-        {
-          std::cout << "[-] WARNING: core " << trace.core_id;
-          std::cout << " unstalled be was not stalled" << std::endl;
-          packet.dump();
-        }
-#endif // STALL_WARNING 
-    // do not register a stall with undefined beginning
-        return false;
-      }
-      core_stalled[trace.core_id] = false;
-      return true;
-    }
-  }
-  return false;
-}
-
-bool Profiler_server::contains_stall(const trace_packet &packet,
-                                     stall_trace_t &trace)
-{
-  if (stall_mapping.find(packet.trace->id) != stall_mapping.end())
-  {
-    trace = stall_mapping[packet.trace->id];
-    return true;
-  }
-  return false;
-}
-
-bool Profiler_server::contains_fc_period(const trace_packet &packet,
-                                         uint64_t &period)
-{
-  if (packet.header.type != ED_TYPE_TRACE)
-    return false;
-  if (packet.trace->id != (int)fc_period_tr_id)
-    return false;
-  period = data_2_double(packet);
-  return true;
-}
-
-bool Profiler_server::contains_cluster_period(const trace_packet &packet,
-                                              uint64_t &period)
-{
-  if (packet.header.type != ED_TYPE_TRACE)
-    return false;
-  if (packet.trace->id != (int)cluster_period_tr_id)
-    return false;
-  period = data_2_double(packet);
-  return true;
-}
-
-bool Profiler_server::contains_cluster_cycle(const trace_packet &packet)
-{
-  if (packet.header.type != ED_TYPE_TRACE)
-    return false;
-  return packet.trace->id == (int)cluster_cycle_id;
-}
diff --git a/tools/profiler/backend/src/scripts/pulp-pc-info b/tools/profiler/backend/src/scripts/pulp-pc-info
deleted file mode 100755
index b546e77d4..000000000
--- a/tools/profiler/backend/src/scripts/pulp-pc-info
+++ /dev/null
@@ -1,145 +0,0 @@
-#!/usr/bin/env python3
-
-import argparse
-import os
-from subprocess import Popen, PIPE
-import re
-
-
-parser = argparse.ArgumentParser(description='Generate PC debug info')
-
-parser.add_argument("--file", dest="file", default=None, help="Specify binary input file")
-parser.add_argument("--all-file", dest="allFile", default=None, help="Specify all-debug info output file")
-parser.add_argument("--pc-file", dest="pcFile", default=None, help="Specify PC output file")
-parser.add_argument("--debug-file", dest="debugFile", default=None, help="Specify debug output file")
-parser.add_argument("--inline-file", dest="inlineFile", default=None, help="Specify inline output file")
-
-args = parser.parse_args()
-
-if args.file == None: raise Exception('Binary input file must be specified through option --file')
-
-
-
-class FunctionPc(object):
-
-	def __init__(self, addr, name, info):
-		self.addr = addr
-		self.name = name
-		self.file, self.line = info.split(':')
-
-
-class Function(object):
-
-	def __init__(self, name, base, size):
-		self.name = name
-		self.base = base
-		self.size = size
-		self.pcs = {}
-
-	def addInfo(self, info):
-		infoList = info.split()
-		self.pcs[infoList[0]] = FunctionPc(int(infoList[0].replace(':', ''), 16), infoList[1], infoList[3])
-
-	def dumpPcs(self, file):
-		for pc in self.pcs.values():
-			file.write('%x %s\n' % (pc.addr, self.name))
-
-	def dumpDebug(self, file):
-		for pc in self.pcs.values():
-			file.write('%x %s:%s\n' % (pc.addr, pc.file, pc.line))
-
-	def dumpAll(self, file):
-		for pc in self.pcs.values():
-			file.write('%x %s %s %s %s\n' % (pc.addr, self.name, pc.name, pc.file, pc.line))
-
-	def dumpInline(self, file):
-		for pc in self.pcs.values():
-			file.write('%x %s\n' % (pc.addr, pc.name))
-
-functions = []
-
-# First use readelf to find out each function, its base and size
-toolchain = os.environ.get('PULP_RISCV_GCC_TOOLCHAIN_CI')
-if toolchain is None:
-	toolchain = os.environ.get('PULP_RISCV_GCC_TOOLCHAIN')
-
-if toolchain is not None:
-	objdump = toolchain + '/bin/riscv32-unknown-elf-objdump'
-	addr2line = toolchain + '/bin/riscv32-unknown-elf-addr2line'
-else:
-	objdump = 'riscv32-unknown-elf-objdump'
-	addr2line = 'riscv32-unknown-elf-addr2line'
-
-process = Popen((objdump + ' -d %s' % args.file).split(), stdin=PIPE, stdout=PIPE)
-
-# Now build the request we'll send to addr2line
-# We need one line per PC, go through all functions to cover all PCs
-req = ''
-for line in process.communicate()[0].decode('utf-8').split('\n'):
-
-	"""
-	try:
-		num, addr, size, typeName, bind, vis, ndx, name = line.split()
-	except:
-		continue
-	if typeName != 'FUNC': continue
-	if '.' in name: name = name.split('.')[0]
-
-	functions.append(Function(name, int(addr,16), int(size, 0)))
-	"""
-	m = re.match("^([0-9A-Fa-f]{8}) <([A-Za-z_0-9]+)>:", line)
-	if m != None:
-		# indexes start from 1, grrrr
-		addr = m.group(1)
-		name = m.group(2)
-		functions.append(Function(name, int(addr, 16), 0))
-
-	m = re.match("^([0-9A-Fa-f]{8}):", line)
-	if m != None:
-		req += '0x' + m.group(1) + '\n'
-		if functions:
-			functions[-1].size += 1
-
-# Get result debug info
-process = Popen((addr2line + ' -a -f -i -p -e %s' % args.file).split(), stdin=PIPE, stdout=PIPE)
-reply = (process.communicate(bytes(req, 'UTF-8'))[0])
-
-funcInfos = []
-# Drop the additional info for inlinining
-for info in reply.decode('utf-8').split('\n'):
-	if info != '' and info[0] != ' ': funcInfos.append(info)
-
-
-# Now assign each PC info in the reply to its funstion
-index = 0
-for f in functions:
-	for i in range(f.size):
-		f.addInfo(funcInfos[index])
-		index += 1
-
-
-# And finally generate the output files
-
-# PC oriented file
-if args.allFile != None:
-	with open(args.allFile, 'w') as file:
-		for f in functions:
-			f.dumpAll(file)
-
-# PC oriented file
-if args.pcFile != None:
-	with open(args.pcFile, 'w') as file:
-		for f in functions:
-			f.dumpPcs(file)
-
-# Debug info oriented file
-if args.debugFile != None:
-	with open(args.debugFile, 'w') as file:
-		for f in functions:
-			f.dumpDebug(file)
-
-# Inlined function oriented file
-if args.inlineFile != None:
-	with open(args.inlineFile, 'w') as file:
-		for f in functions:
-			f.dumpInline(file)
diff --git a/tools/profiler/backend/src/scripts/pulp-trace-extend b/tools/profiler/backend/src/scripts/pulp-trace-extend
deleted file mode 100755
index f84a7ddcb..000000000
--- a/tools/profiler/backend/src/scripts/pulp-trace-extend
+++ /dev/null
@@ -1,39 +0,0 @@
-#!/usr/bin/env python3
-
-import argparse
-import os
-from subprocess import Popen, PIPE
-
-
-parser = argparse.ArgumentParser(description='Generate PC debug info')
-
-parser.add_argument("--input", dest="input", default=None, help="Specify trace input file")
-parser.add_argument("--output", dest="output", default=None, help="Specify trace output file")
-parser.add_argument("--binary", dest="binaries", default=[], action="append", help="Specify binary file")
-
-args = parser.parse_args()
-
-debug_info = {}
-
-for binary in args.binaries:
-
-  if os.system('pulp-pc-info --file %s --all-file %s' % (binary, binary + '.debugInfo')) != 0:
-      raise Exception('Error while generating debug symbols information, make sure the toolchain and the binaries are accessible ')
-
-  with open(binary + '.debugInfo') as f:
-    for line in f.readlines():
-      line = line.split()
-      debug_info[line[0]] = line
-
-with open(args.output, 'w') as output_file:
-  with open(args.input) as f:
-      for line in f.readlines()[1:]:
-        line = line.strip('\n').split()
-        pc = line[3]
-        debug_str = '-'
-        debug = debug_info.get(pc)
-        if debug is not None:
-          debug_str = '%s:%s' % (debug[1], debug[4])
-        
-        line.insert(3, debug_str)
-        output_file.write('%15s %s %10s %-30s %10s %10s %10s %s\n' % (line[0], line[1], line[2], line[3], line[4], line[5], line[6], '\t'.join(line[7:])))
\ No newline at end of file
diff --git a/tools/profiler/backend/src/tldata.cpp b/tools/profiler/backend/src/tldata.cpp
deleted file mode 100644
index 75ed86a1a..000000000
--- a/tools/profiler/backend/src/tldata.cpp
+++ /dev/null
@@ -1,313 +0,0 @@
-/*
- * Copyright (C) 2020  GreenWaves Technologies, SAS
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU Affero General Public License as
- * published by the Free Software Foundation, either version 3 of the
- * License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU Affero General Public License for more details.
- *
- * You should have received a copy of the GNU Affero General Public License
- * along with this program.  If not, see <https://www.gnu.org/licenses/>.
- */
-
-
-#include <cmath>
-#include <iostream>
-#include <algorithm>
-#include "tldata.hpp"
-
-template <typename T>
-void print_dWT(const Data_with_time<T>& dwt, bool newline){
-  if (dwt.mixed_up) std::cout << ANSI_COLOR_RED;
-  else std::cout << ANSI_COLOR_GREEN;
-  std::cout << "[" << dwt.begin << "; " << dwt.end << "; ";
-  if (dwt.mixed_up)
-    std::cout << "mixed-up";
-  else std::cout << dwt.d ;
-  std::cout << "]" << ANSI_COLOR_RESET;
-  if (newline) std::cout << std::endl;
-}
-
-template <typename T>
-TLData<T>::TLData(uint imin, uint resolution){
-  this->imin = imin;
-  this->resolution = resolution;
-  last_event = 0;
-  data.resize(imax);
-}
-
-/* item must be added in chronological order! */
-template <typename T>
-void TLData<T>::add_item(uint64_t begin, uint64_t end, T item){
-  uint i;
-  /* first find index i to add item to data[i] */
-  if (end < begin){
-    std::cout << "[-] Warning: end = " << end;
-    std::cout << " < begin = " << begin << " ; item not added" << std::endl;
-    return;
-  }
-  if (begin == end){
-    i = imin;
-  }
-  else{
-    i = (uint) std::log2((double) (end - begin));
-    if (i < imin) i = imin;
-    if (i >= imax) i = imax - 1; //should not happen
-  }
-
-  //std::cout << "adding [" << begin << "; " << end << "; " << item << "] at level " << i << std::endl;
-  if (data[i].size() > 0 && data[i].back().end > begin && !data[i].back().mixed_up){
-    std::cout << "[-] Warning: TLData.add_item: order not respected" << std::endl;
-    std::cout << "    last item was: ";
-    print_dWT(data[i].back());
-    std::cout << "    currently trying to add ";
-    print_dWT(Data_with_time<T> {.d = item, .begin = begin, .end = end, .mixed_up = false});
-    return;
-  }
-  last_event = end;
-
-  /* then insert the new item */
-  Data_with_time<T> dwt = {.d = item, .begin = begin, .end = end, .mixed_up = false};
-  if (!data[i].empty() && data[i].back().mixed_up && data[i].back().end > begin){
-    if (data[i].back().begin - begin < (((uint64_t) 1) << i)){
-      data[i].back() = dwt;
-    } else {
-      data[i].back().end = begin;
-      data[i].push_back(dwt);
-    }
-  } else{
-    data[i].push_back(dwt);
-  }
-
-  /* add mixed up data above, if needed */
-  for (i++; i < imax; i++){
-    uint64_t step = (((uint64_t) 1) << i);
-    // assumes it does not overflow
-    end = begin + step;
-    if (data[i].empty() ){
-      data[i].push_back({.d = item, .begin = begin, .end = end, .mixed_up = true});
-    }
-    else if (! data[i].back().mixed_up){
-      // or data[i].back().end + step < begin ? to force a bit of space...
-      if (data[i].back().end < begin){
-        data[i].push_back({.d = item, .begin = begin, .end = end, .mixed_up = true});
-      }
-    }
-    // data is mixed up, do we need to merge?
-    else if (data[i].back().end < begin && begin - data[i].back().end > step){
-      // no, append
-      data[i].push_back({.d = item, .begin = begin, .end = end, .mixed_up = true});
-    }
-    else if (data[i].back().end > end){
-      // should not happen
-      std::cout << "[-] Warning: corrupted timestamp in TLData" << std::endl;
-      std::cout << "    at level " << i << ": ";
-      print_dWT(data[i].back());
-    }
-    else {
-      // yes, merge
-      data[i].back().end = end;
-      
-    }
-  }
-}
-
-template <typename T>
-void TLData<T>::overview() const {
-  std::cout << "------- TLData overview ---------" << std::endl;
-  uint s = 0;
-  for (uint i = imin; i < imax; i++){
-    std::cout << "level " << i << ": " << data[i].size() << " items";
-    s += data[i].size();
-    uint t = 0;
-    for (uint j = 0; j < data[i].size(); j++) t += data[i][j].mixed_up;
-    std::cout << " (" << t << " mixed_up, " << ( data[i].size() - t) << " real)";
-    for(uint j = 0; j < 50 && j < data[i].size(); j++){
-      std::cout << " " << j << ":";
-      print_dWT(data[i][j], false);
-    }
-    std::cout << std::endl;
-  }
-  std::cout << "Total = " << s << std::endl;
-  std::cout << "--------------------------------" << std::endl;
-}
-
-
-template <typename T>
-TLData_iterator<T> TLData<T>::between(uint64_t begin, uint64_t end,
-        double zoom_factor) const{
-  return TLData_iterator<T>(this, begin, end, 1. / zoom_factor);
-}
-
-
-template <typename T>
-TLData_iterator<T>::TLData_iterator(const TLData<T>* ref, uint64_t t0,
-          uint64_t t1, uint64_t timeunits_per_pixel){
-  this->ref = ref;
-  this->t0 = t0;
-  this->t1 = t1;
-  this->timeunits_per_pixel = timeunits_per_pixel;
-  current_i = std::min(std::max((uint) std::log2(timeunits_per_pixel), ref->imin),
-                      ref->imax - 1);
-  initial_line = true;
-  binary_search_j();
-  current_j = jmin;
-  update_current();
-  is_done = (current_i >= ref->imax);
-}
-
-template <typename T>
-void TLData_iterator<T>::update_current(){
-  //for (uint i  = current_i; i < ref->imax; i++)
-  //  std::cout << "level " <<  i << ": " << ref->data[i].size() << std::endl;
-
-  while (1){
-    for (; current_j < jmax && !initial_line
-            && ref->data[current_i][current_j].mixed_up; current_j++);
-    if (current_j < jmax) return;
-    initial_line = false;
-    current_i++;
-    if (current_i >= ref->imax) {
-      is_done = true;
-      return;
-    }
-    binary_search_j();
-    current_j = jmin;
-  }
-}
-
-template <typename T>
-TLData_iterator<T>& TLData_iterator<T>::operator++(){
-  current_j++;
-  update_current();
-  //std::cout << "iter level " << current_i << ": " << jmin << " " << jmax << std::endl;
-  return *this;
-}
-
-template <typename T>
-const Data_with_time<T>& TLData_iterator<T>::operator*() const{
-  //std::cout << "accessing i = " << current_i << " j = " << current_j << std::endl;
-  //std::cout << ref->data.size() << " " << ref->data[current_i].size();
-  //std::cout << std::endl;
-  return ref->data[current_i][current_j];
-}
-
-template <typename T>
-void TLData_iterator<T>::binary_search_j(){
-  //std::cout << "bin search i = " << current_i << "; " << ref->data[current_i].size() << " item on the line" << std::endl;
-  //std::cout << "query is " << t0 << " " << t1 << std::endl;
-  uint left = 0;
-  uint J = ref->data[current_i].size();
-  uint right = J;
-  if (J == 0 || ref->data[current_i][0].begin >= t1){
-    jmin = 0; jmax = 0;
-    return;
-  }
-  while (right - left > 1){
-    uint j = (right + left) / 2;
-    if (ref->data[current_i][j].begin > t0) right = j;
-    else left = j;
-  }
-  jmin = left;
-  // here, ref->data[current_i][jmin].begin < t1 always true
-  if (ref->data[current_i][jmin].end <= t0) jmin++;
-  if (jmin == J || ref->data[current_i][jmin].begin > t1){
-    jmax = jmin;
-    return;
-  }
-  // now ref->data[current_i][jmin] intersects [t0, t1] for sure
-  // so at that point, we are sure the interval is not empty
-  if (jmin == J - 1){
-    jmax = J;
-    return;
-  }
-  if (ref->data[current_i][jmin + 1].begin >= t1){
-    jmax = jmin + 1;
-    return;
-  }
-  left = jmin + 1;
-  right = ref->data[current_i].size();
-  while (right - left > 1){
-    uint j = (right + left) / 2;
-    if (ref->data[current_i][j].begin >= t1) right = j;
-    else left = j;
-  }
-  // end of loop: ref->data[current_i][left].begin < t1
-  jmax = left + 1;
-}
-
-
-/*
-  this is not optimal! everything is stored in a deque, to be able to iterate
-  over it later. With a custom iterator we could have only the current element
-  in memory when iterating
-*/
-template <typename T>
-std::deque<Data_with_time<decompressed_data_t<T> > > decompress(
-                            TLData_iterator<compressed_data_t<T> > iter){
-  std::deque<Data_with_time<decompressed_data_t<T> > > r;
-  for(; !iter.done(); ++iter){
-    Data_with_time<compressed_data_t<T> > dwt = *iter;
-    Data_with_time<decompressed_data_t<T> > to_be_pushed;
-
-    /* first point: do we need to decompress?
-       (no need to decompress something that is going to appear mixed up!)*/
-    uint rounds = (uint) dwt.d.rle_coeff * (uint) dwt.d.n_items;
-    if (dwt.end <= dwt.begin || rounds == 0) continue;
-    uint64_t tw = dwt.end - dwt.begin;
-
-    //if (dwt.mixed_up || tw / rounds < iter.get_tu_p_pxl()){
-    /*if ( tw / rounds < iter.get_tu_p_pxl()){
-      //easy case: data is mixed up, no need to decompress! :)
-      // corine: trying to decompress mixed_up
-      to_be_pushed.begin = dwt.begin;
-      to_be_pushed.end = dwt.end;
-      to_be_pushed.mixed_up = true;
-      to_be_pushed.d.value = dwt.d.value;
-      to_be_pushed.d.n_items_in_avg = 1; // does not matter
-      r.push_back(to_be_pushed);
-      continue;
-    }*/
-
-    // ok, go for the hard work, we have to decompress
-    uint64_t t0 = iter.get_t0();
-    uint64_t t1 = iter.get_t1();
-    /*
-      recall: dwt represents 'rounds' consecutive events, starting from begin
-      to end, which have the same duration
-      we are interested in events that are in range [t0, t1]
-
-      the following remark can be used to improve perfs:
-      the equation dwt.begin + i * tw / rounds >= t0 is equivalent to
-      i >= (t0 - dwt.begin) * rounds / tw
-
-      Similarly, we also want: i < (t1 - dwt.begin) * rounds / tw
-    */
-    uint i0 = (t0 > dwt.begin ? (t0 - dwt.begin) * rounds / tw : 0);
-    uint i1 = std::min(rounds,
-                  (uint) (t1 > dwt.begin ? (t1 - dwt.begin) * rounds / tw + 1: 0));
-    to_be_pushed.d.value = dwt.d.value;
-    to_be_pushed.d.n_items_in_avg = dwt.d.n_items;
-    to_be_pushed.mixed_up = dwt.mixed_up;
-    for (uint i = i0; i < i1; i++) {
-      to_be_pushed.begin = dwt.begin + (uint64_t) i * tw / rounds;
-      to_be_pushed.end = dwt.begin + (uint64_t) (i + 1) * tw / rounds;
-      r.push_back(to_be_pushed);
-    }
-  }
-  return r;
-}
-
-template class TLData_iterator<const char*>;
-template class TLData<const char*>;
-template class TLData_iterator<uint32_t>;
-template class TLData<uint32_t>;
-template class TLData_iterator< compressed_data_t<uint32_t> >;
-template class TLData< compressed_data_t<uint32_t> >;
-template std::deque<Data_with_time<decompressed_data_t<uint32_t> > >
-    decompress(TLData_iterator<compressed_data_t<uint32_t> >);
diff --git a/tools/profiler/backend/src/trace_dumper.cpp b/tools/profiler/backend/src/trace_dumper.cpp
deleted file mode 100644
index 4bbf4bc27..000000000
--- a/tools/profiler/backend/src/trace_dumper.cpp
+++ /dev/null
@@ -1,218 +0,0 @@
-/*
- * Copyright (C) 2020  GreenWaves Technologies, SAS
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU Affero General Public License as
- * published by the Free Software Foundation, either version 3 of the
- * License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU Affero General Public License for more details.
- *
- * You should have received a copy of the GNU Affero General Public License
- * along with this program.  If not, see <https://www.gnu.org/licenses/>.
- */
-
-
-#include "trace_dumper.hpp"
-#include <string>
-#include <cstring>
-#include <iostream>
-
-trace_dumper_trace::trace_dumper_trace(trace_dumper_client *client, int id, ed_trace_type_e type, int width)
-: type(type), width(width), client(client)
-{
-    this->id = encode_id(id, &this->id_size);
-}
-
-
-void trace_dumper_trace::dump(int64_t timestamp, uint8_t *value, int width)
-{
-    this->client->dump_trace(timestamp, this->id, this->id_size, this->type, value, width);
-}
-
-
-
-
-trace_dumper_server::trace_dumper_server(std::string file)
-: filepath(file), timestamp(0)
-{
-
-}
-
-
-int trace_dumper_server::open()
-{
-    this->file.open(this->filepath, std::ifstream::in | std::ifstream::out);
-    if (this->file.fail())
-        return -1;
-
-    return 0;
-}
-
-
-
-int trace_dumper_server::get_packet(trace_packet *packet)
-{
-    ed_header_t header;
-    this->file.read((char *)&packet->header, sizeof(header));
-    if (this->file.fail())
-        return -1;
-
-    if (packet->header.type == ED_TYPE_CONF)
-    {
-        this->file.read((char *)&packet->conf, sizeof(packet->conf));
-        if (this->file.fail())
-            return -1;
-    }
-    else if (packet->header.type == ED_TYPE_REG_TRACE)
-    {
-        /* register a new trace in the server */
-        this->file.read((char *)&packet->reg_trace, sizeof(packet->reg_trace));
-        if (this->file.fail())
-            return -1;
-
-        char path[packet->reg_trace.path_len+1];
-        this->file.read(path, packet->reg_trace.path_len);
-        if (this->file.fail())
-            return -1;
-        path[packet->reg_trace.path_len] = 0;
-
-        this->traces[packet->reg_trace.id] = new Trace(path, packet->reg_trace.id, packet->reg_trace.type, packet->reg_trace.width);
-        packet->trace = this->traces[packet->reg_trace.id];
-
-    }
-    else if (packet->header.type == ED_TYPE_TIMESTAMP8 ||
-             packet->header.type == ED_TYPE_TIMESTAMP16 ||
-             packet->header.type == ED_TYPE_TIMESTAMP32 ||
-             packet->header.type == ED_TYPE_TIMESTAMP64)
-    {
-        int width = packet->header.type == ED_TYPE_TIMESTAMP8 ? 1 : packet->header.type == ED_TYPE_TIMESTAMP16 ? 2 : packet->header.type == ED_TYPE_TIMESTAMP32 ? 4 : 8;
-
-        uint64_t diff = 0;
-        this->file.read((char *)&diff, width);
-        if (this->file.fail())
-            return -1;
-
-        this->timestamp += diff;
-        packet->timestamp = this->timestamp;
-    }
-    else if (packet->header.type == ED_TYPE_TRACE ||
-             packet->header.type == ED_TYPE_TRACE_SET_0 ||
-             packet->header.type == ED_TYPE_TRACE_SET_1)
-    {
-        uint32_t id = decode_id(&this->file, NULL);
-        uint32_t size;
-
-        packet->trace = this->traces[id];
-        packet->timestamp = this->timestamp;
-
-        if (packet->trace == NULL)
-        {
-            printf("Didn't find trace ID %d\n", id);
-            return -1;
-        }
-
-        if (packet->data)
-            delete packet->data;
-
-        if (packet->header.type == ED_TYPE_TRACE)
-        {
-            if (packet->trace->type == ED_TRACE_BITFIELD || packet->trace->type == ED_TRACE_REAL)
-            {
-                size = (packet->trace->width + 7) / 8;
-                packet->data = new uint8_t[size];
-                this->file.read((char *)packet->data, size);
-                if (this->file.fail())
-                    return -1;
-            }
-            else
-            {
-                this->file.read((char *)&size, 4);
-                if (this->file.fail())
-                    return -1;
-
-                packet->data = new uint8_t[size];
-
-                this->file.read((char *)packet->data, size);
-                if (this->file.fail())
-                    return -1;
-            }
-        }
-        else
-        {
-            packet->data = new uint8_t[1];
-            *(packet->data) = packet->header.type == ED_TYPE_TRACE_SET_1;
-            size = 1;
-        }
-
-        packet->size = size;
-    }
-
-    return 0;
-}
-
-trace_packet::trace_packet()
-: data(NULL)
-{
-
-}
-
-
-trace_packet::~trace_packet()
-{
-    if (this->data)
-        delete this->data;
-}
-
-
-void trace_packet::dump() const
-{
-    if (this->header.type == ED_TYPE_CONF)
-    {
-        printf("Configuration (version: %d, timescale: %s)\n", this->conf.version, this->conf.timescale == ED_CONF_TIMESCALE_PS ? "ps" : "ns");
-    }
-    else if (this->header.type == ED_TYPE_REG_TRACE)
-    {
-        printf("Trace registration (id: %d, type: %s, width: %d, path: %s)\n", this->trace->id, this->trace->type == ED_TRACE_BITFIELD ? "bitfield" : this->trace->type == ED_TRACE_REAL ? "real" : "varlen", this->trace->width, this->trace->path.c_str());
-    }
-    else if (this->header.type == ED_TYPE_TIMESTAMP8 ||
-             this->header.type == ED_TYPE_TIMESTAMP16 ||
-             this->header.type == ED_TYPE_TIMESTAMP32 ||
-             this->header.type == ED_TYPE_TIMESTAMP64)
-    {
-    }
-    else if (this->header.type == ED_TYPE_TRACE ||
-             this->header.type == ED_TYPE_TRACE_SET_0 ||
-             this->header.type == ED_TYPE_TRACE_SET_1)
-    {
-        if (this->trace->type == ED_TRACE_REAL)
-        {
-            double value = 0;
-            if (this->size == 4)
-                value = *(float *)this->data;
-            else
-                value = *(double *)this->data;
-
-
-            printf("[%ld] Trace (id: %d, path: %s, type: real, size: %d, value: %f)\n", this->timestamp, this->trace->id, this->trace->path.c_str(), this->size, value);
-        }
-        else
-        {
-            printf("[%ld] Trace (id: %d, path: %s, type: %s, width: %d, size:%d)\n",
-                this->timestamp, this->trace->id, this->trace->path.c_str(),
-                this->trace->type == ED_TRACE_BITFIELD ? (this->header.type == ED_TYPE_TRACE_SET_0 ? "BIT 0" : "BIT 1") : this->trace->type == ED_TRACE_REAL ? "real" : "varlen",
-                this->trace->width, this->size);
-
-            //if (this->trace->type == ED_TRACE_VARLEN) printf("func, content = %s\n", (char*) this->data);
-        }
-    }
-}
-
-
-Trace::Trace(std::string path, int id, uint32_t type, int width)
-: path(path), id(id), type(type), width(width)
-{
-}
diff --git a/tools/profiler/benchmark.sh b/tools/profiler/benchmark.sh
deleted file mode 100755
index cfcecc55a..000000000
--- a/tools/profiler/benchmark.sh
+++ /dev/null
@@ -1,23 +0,0 @@
-#!/usr/bin/env bash
-set -e
-
-output_file=benchmark_results.txt
-
-echo "" > $output_file
-if [ "$#" -ne 1 ]
-then
-  echo "Usage: ./benchmark.sh <directory of the makefile of your project>"
-  exit 1
-fi
-
-for ((freq = 40000 ; freq <= 1000000000 ; freq *= 2 ))
-do
-  echo "Current run @ " $freq
-  echo "Run @ " $freq >>$output_file
-  /usr/bin/time -v bash -c "$(pwd)/run_profiler.sh $1 $freq \
-    | grep ' Got last packet' -m 1" 1>/dev/null 2>>$output_file
-
-  echo -e "\n\n" >> $output_file
-done
-
-echo "Done! All results have been saved in " $output_file
diff --git a/tools/profiler/gui/images/Apps-Zoom-Fit-icon.png b/tools/profiler/gui/images/Apps-Zoom-Fit-icon.png
deleted file mode 100644
index e0a81c453..000000000
Binary files a/tools/profiler/gui/images/Apps-Zoom-Fit-icon.png and /dev/null differ
diff --git a/tools/profiler/gui/images/Apps-Zoom-In-icon.png b/tools/profiler/gui/images/Apps-Zoom-In-icon.png
deleted file mode 100644
index 36b92e0aa..000000000
Binary files a/tools/profiler/gui/images/Apps-Zoom-In-icon.png and /dev/null differ
diff --git a/tools/profiler/gui/images/Apps-Zoom-Out-icon.png b/tools/profiler/gui/images/Apps-Zoom-Out-icon.png
deleted file mode 100644
index 43bfb853a..000000000
Binary files a/tools/profiler/gui/images/Apps-Zoom-Out-icon.png and /dev/null differ
diff --git a/tools/profiler/gui/images/Start-icon.png b/tools/profiler/gui/images/Start-icon.png
deleted file mode 100644
index 3bf541219..000000000
Binary files a/tools/profiler/gui/images/Start-icon.png and /dev/null differ
diff --git a/tools/profiler/gui/images/Stop-red-icon.png b/tools/profiler/gui/images/Stop-red-icon.png
deleted file mode 100644
index 54fb01c3c..000000000
Binary files a/tools/profiler/gui/images/Stop-red-icon.png and /dev/null differ
diff --git a/tools/profiler/gui/images/launch-24px.png b/tools/profiler/gui/images/launch-24px.png
deleted file mode 100644
index dd9d2d639..000000000
Binary files a/tools/profiler/gui/images/launch-24px.png and /dev/null differ
diff --git a/tools/profiler/gui/images/pause-icon.png b/tools/profiler/gui/images/pause-icon.png
deleted file mode 100644
index 6acef5c6c..000000000
Binary files a/tools/profiler/gui/images/pause-icon.png and /dev/null differ
diff --git a/tools/profiler/gui/images/pause.png b/tools/profiler/gui/images/pause.png
deleted file mode 100644
index a362fcd29..000000000
Binary files a/tools/profiler/gui/images/pause.png and /dev/null differ
diff --git a/tools/profiler/gui/images/play_circle_filled-24px.png b/tools/profiler/gui/images/play_circle_filled-24px.png
deleted file mode 100644
index 7aa972671..000000000
Binary files a/tools/profiler/gui/images/play_circle_filled-24px.png and /dev/null differ
diff --git a/tools/profiler/gui/images/signalstree.txt b/tools/profiler/gui/images/signalstree.txt
deleted file mode 100644
index e1ecf6810..000000000
--- a/tools/profiler/gui/images/signalstree.txt
+++ /dev/null
@@ -1,205 +0,0 @@
-SOC;                                     /sys/board/chip/soc/state
-    FC;                                  /sys/board/chip/soc/fc/state
-        stalls;                          null 
-            pcer_cycles;                     /sys/board/chip/soc/fc/pcer_cycles
-            pcer_instr;                      /sys/board/chip/soc/fc/pcer_instr
-            pcer_ld_stall;                   /sys/board/chip/soc/fc/pcer_ld_stall
-            pcer_jmp_stall;                  /sys/board/chip/soc/fc/pcer_jmp_stall
-            pcer_imiss;                      /sys/board/chip/soc/fc/pcer_imiss
-            pcer_ld;                         /sys/board/chip/soc/fc/pcer_ld
-            pcer_st;                         /sys/board/chip/soc/fc/pcer_st
-            pcer_jump;                       /sys/board/chip/soc/fc/pcer_jump
-            pcer_branch;                     /sys/board/chip/soc/fc/pcer_branch
-            pcer_taken_branch;               /sys/board/chip/soc/fc/pcer_taken_branch
-            pcer_rvc;                        /sys/board/chip/soc/fc/pcer_rvc
-            pcer_ld_ext;                     /sys/board/chip/soc/fc/pcer_ld_ext
-            pcer_st_ext;                     /sys/board/chip/soc/fc/pcer_st_ext
-            pcer_ld_ext_cycles;              /sys/board/chip/soc/fc/pcer_ld_ext_cycles
-            pcer_st_ext_cycles;              /sys/board/chip/soc/fc/pcer_st_ext_cycles
-            pcer_tcdm_cont;                  /sys/board/chip/soc/fc/pcer_tcdm_cont
-            misaligned;                      /sys/board/chip/soc/fc/misaligned
-    udma;                                null
-        spim0_rx;                        /sys/board/chip/soc/udma/spim0_rx/state
-        spim0_tx;                        /sys/board/chip/soc/udma/spim0_tx/state
-        spim1_rx;                        /sys/board/chip/soc/udma/spim1_rx/state
-        spim1_tx;                        /sys/board/chip/soc/udma/spim1_tx/state
-        hyper0_rx;                       /sys/board/chip/soc/udma/hyper0_rx/state
-        hyper0_tx;                       /sys/board/chip/soc/udma/hyper0_tx/state
-        i2c0_rx;                         /sys/board/chip/soc/udma/i2c0_rx/state
-        i2c0_tx;                         /sys/board/chip/soc/udma/i2c0_tx/state
-        i2c1_rx;                         /sys/board/chip/soc/udma/i2c1_rx/state
-        i2c1_tx;                         /sys/board/chip/soc/udma/i2c1_tx/state
-        uart0_rx;                        /sys/board/chip/soc/udma/uart0_rx/state
-        uart0_tx;                        /sys/board/chip/soc/udma/uart0_tx/state
-        cpi0_rx;                         /sys/board/chip/soc/udma/cpi0_rx/state
-Cluster;                                 /sys/board/chip/cluster/state
-    PE0;                                 /sys/board/chip/cluster/pe0/state
-        stalls;                          /sys/board/chip/cluster/pe0/stalls
-            pcer_cycles;                 /sys/board/chip/cluster/pe0/pcer_cycles
-            pcer_instr;                  /sys/board/chip/cluster/pe0/pcer_instr
-            pcer_ld_stall;               /sys/board/chip/cluster/pe0/pcer_ld_stall
-            pcer_jmp_stall;              /sys/board/chip/cluster/pe0/pcer_jmp_stall
-            pcer_imiss;                  /sys/board/chip/cluster/pe0/pcer_imiss
-            pcer_ld;                     /sys/board/chip/cluster/pe0/pcer_ld
-            pcer_st;                     /sys/board/chip/cluster/pe0/pcer_st
-            pcer_jump;                   /sys/board/chip/cluster/pe0/pcer_jump
-            pcer_branch;                 /sys/board/chip/cluster/pe0/pcer_branch
-            pcer_taken_branch;           /sys/board/chip/cluster/pe0/pcer_taken_branch
-            pcer_rvc;                    /sys/board/chip/cluster/pe0/pcer_rvc
-            pcer_ld_ext;                 /sys/board/chip/cluster/pe0/pcer_ld_ext
-            pcer_st_ext;                 /sys/board/chip/cluster/pe0/pcer_st_ext
-            pcer_ld_ext_cycles;          /sys/board/chip/cluster/pe0/pcer_ld_ext_cycles
-            pcer_st_ext_cycles;          /sys/board/chip/cluster/pe0/pcer_st_ext_cycles
-            pcer_tcdm_cont;              /sys/board/chip/cluster/pe0/pcer_tcdm_cont
-            misaligned;                  /sys/board/chip/cluster/pe0/misaligned
-    PE1;                                 /sys/board/chip/cluster/pe1/state
-        stalls;                          /sys/board/chip/cluster/pe1/stalls
-            pcer_cycles;                 /sys/board/chip/cluster/pe1/pcer_cycles
-            pcer_instr;                  /sys/board/chip/cluster/pe1/pcer_instr
-            pcer_ld_stall;               /sys/board/chip/cluster/pe1/pcer_ld_stall
-            pcer_jmp_stall;              /sys/board/chip/cluster/pe1/pcer_jmp_stall
-            pcer_imiss;                  /sys/board/chip/cluster/pe1/pcer_imiss
-            pcer_ld;                     /sys/board/chip/cluster/pe1/pcer_ld
-            pcer_st;                     /sys/board/chip/cluster/pe1/pcer_st
-            pcer_jump;                   /sys/board/chip/cluster/pe1/pcer_jump
-            pcer_branch;                 /sys/board/chip/cluster/pe1/pcer_branch
-            pcer_taken_branch;           /sys/board/chip/cluster/pe1/pcer_taken_branch
-            pcer_rvc;                    /sys/board/chip/cluster/pe1/pcer_rvc
-            pcer_ld_ext;                 /sys/board/chip/cluster/pe1/pcer_ld_ext
-            pcer_st_ext;                 /sys/board/chip/cluster/pe1/pcer_st_ext
-            pcer_ld_ext_cycles;          /sys/board/chip/cluster/pe1/pcer_ld_ext_cycles
-            pcer_st_ext_cycles;          /sys/board/chip/cluster/pe1/pcer_st_ext_cycles
-            pcer_tcdm_cont;              /sys/board/chip/cluster/pe1/pcer_tcdm_cont
-            misaligned;                  /sys/board/chip/cluster/pe1/misaligned       
-    PE2;                                 /sys/board/chip/cluster/pe2/state
-        stalls;                          /sys/board/chip/cluster/pe2/stalls
-            pcer_cycles;                 /sys/board/chip/cluster/pe2/pcer_cycles
-            pcer_instr;                  /sys/board/chip/cluster/pe2/pcer_instr
-            pcer_ld_stall;               /sys/board/chip/cluster/pe2/pcer_ld_stall
-            pcer_jmp_stall;              /sys/board/chip/cluster/pe2/pcer_jmp_stall
-            pcer_imiss;                  /sys/board/chip/cluster/pe2/pcer_imiss
-            pcer_ld;                     /sys/board/chip/cluster/pe2/pcer_ld
-            pcer_st;                     /sys/board/chip/cluster/pe2/pcer_st
-            pcer_jump;                   /sys/board/chip/cluster/pe2/pcer_jump
-            pcer_branch;                 /sys/board/chip/cluster/pe2/pcer_branch
-            pcer_taken_branch;           /sys/board/chip/cluster/pe2/pcer_taken_branch
-            pcer_rvc;                    /sys/board/chip/cluster/pe2/pcer_rvc
-            pcer_ld_ext;                 /sys/board/chip/cluster/pe2/pcer_ld_ext
-            pcer_st_ext;                 /sys/board/chip/cluster/pe2/pcer_st_ext
-            pcer_ld_ext_cycles;          /sys/board/chip/cluster/pe2/pcer_ld_ext_cycles
-            pcer_st_ext_cycles;          /sys/board/chip/cluster/pe2/pcer_st_ext_cycles
-            pcer_tcdm_cont;              /sys/board/chip/cluster/pe2/pcer_tcdm_cont
-            misaligned;                  /sys/board/chip/cluster/pe2/misaligned
-    PE3;                                 /sys/board/chip/cluster/pe3/state
-        stalls;                          /sys/board/chip/cluster/pe3/stalls
-            pcer_cycles;                 /sys/board/chip/cluster/pe3/pcer_cycles
-            pcer_instr;                  /sys/board/chip/cluster/pe3/pcer_instr
-            pcer_ld_stall;               /sys/board/chip/cluster/pe3/pcer_ld_stall
-            pcer_jmp_stall;              /sys/board/chip/cluster/pe3/pcer_jmp_stall
-            pcer_imiss;                  /sys/board/chip/cluster/pe3/pcer_imiss
-            pcer_ld;                     /sys/board/chip/cluster/pe3/pcer_ld
-            pcer_st;                     /sys/board/chip/cluster/pe3/pcer_st
-            pcer_jump;                   /sys/board/chip/cluster/pe3/pcer_jump
-            pcer_branch;                 /sys/board/chip/cluster/pe3/pcer_branch
-            pcer_taken_branch;           /sys/board/chip/cluster/pe3/pcer_taken_branch
-            pcer_rvc;                    /sys/board/chip/cluster/pe3/pcer_rvc
-            pcer_ld_ext;                 /sys/board/chip/cluster/pe3/pcer_ld_ext
-            pcer_st_ext;                 /sys/board/chip/cluster/pe3/pcer_st_ext
-            pcer_ld_ext_cycles;          /sys/board/chip/cluster/pe3/pcer_ld_ext_cycles
-            pcer_st_ext_cycles;          /sys/board/chip/cluster/pe3/pcer_st_ext_cycles
-            pcer_tcdm_cont;              /sys/board/chip/cluster/pe3/pcer_tcdm_cont
-            misaligned;                  /sys/board/chip/cluster/pe3/misaligned
-    PE4;                                 /sys/board/chip/cluster/pe4/state
-        stalls;                          /sys/board/chip/cluster/pe4/stalls
-            pcer_cycles;                 /sys/board/chip/cluster/pe4/pcer_cycles
-            pcer_instr;                  /sys/board/chip/cluster/pe4/pcer_instr
-            pcer_ld_stall;               /sys/board/chip/cluster/pe4/pcer_ld_stall
-            pcer_jmp_stall;              /sys/board/chip/cluster/pe4/pcer_jmp_stall
-            pcer_imiss;                  /sys/board/chip/cluster/pe4/pcer_imiss
-            pcer_ld;                     /sys/board/chip/cluster/pe4/pcer_ld
-            pcer_st;                     /sys/board/chip/cluster/pe4/pcer_st
-            pcer_jump;                   /sys/board/chip/cluster/pe4/pcer_jump
-            pcer_branch;                 /sys/board/chip/cluster/pe4/pcer_branch
-            pcer_taken_branch;           /sys/board/chip/cluster/pe4/pcer_taken_branch
-            pcer_rvc;                    /sys/board/chip/cluster/pe4/pcer_rvc
-            pcer_ld_ext;                 /sys/board/chip/cluster/pe4/pcer_ld_ext
-            pcer_st_ext;                 /sys/board/chip/cluster/pe4/pcer_st_ext
-            pcer_ld_ext_cycles;          /sys/board/chip/cluster/pe4/pcer_ld_ext_cycles
-            pcer_st_ext_cycles;          /sys/board/chip/cluster/pe4/pcer_st_ext_cycles
-            pcer_tcdm_cont;              /sys/board/chip/cluster/pe4/pcer_tcdm_cont
-            misaligned;                  /sys/board/chip/cluster/pe4/misaligned
-    PE5;                                 /sys/board/chip/cluster/pe5/state
-        stalls;                          /sys/board/chip/cluster/pe5/stalls
-            pcer_cycles;                 /sys/board/chip/cluster/pe5/pcer_cycles
-            pcer_instr;                  /sys/board/chip/cluster/pe5/pcer_instr
-            pcer_ld_stall;               /sys/board/chip/cluster/pe5/pcer_ld_stall
-            pcer_jmp_stall;              /sys/board/chip/cluster/pe5/pcer_jmp_stall
-            pcer_imiss;                  /sys/board/chip/cluster/pe5/pcer_imiss
-            pcer_ld;                     /sys/board/chip/cluster/pe5/pcer_ld
-            pcer_st;                     /sys/board/chip/cluster/pe5/pcer_st
-            pcer_jump;                   /sys/board/chip/cluster/pe5/pcer_jump
-            pcer_branch;                 /sys/board/chip/cluster/pe5/pcer_branch
-            pcer_taken_branch;           /sys/board/chip/cluster/pe5/pcer_taken_branch
-            pcer_rvc;                    /sys/board/chip/cluster/pe5/pcer_rvc
-            pcer_ld_ext;                 /sys/board/chip/cluster/pe5/pcer_ld_ext
-            pcer_st_ext;                 /sys/board/chip/cluster/pe5/pcer_st_ext
-            pcer_ld_ext_cycles;          /sys/board/chip/cluster/pe5/pcer_ld_ext_cycles
-            pcer_st_ext_cycles;          /sys/board/chip/cluster/pe5/pcer_st_ext_cycles
-            pcer_tcdm_cont;              /sys/board/chip/cluster/pe5/pcer_tcdm_cont
-            misaligned;                  /sys/board/chip/cluster/pe5/misaligned
-    PE6;                                 /sys/board/chip/cluster/pe6/state
-        stalls;                          /sys/board/chip/cluster/pe6/stalls
-            pcer_cycles;                 /sys/board/chip/cluster/pe6/pcer_cycles
-            pcer_instr;                  /sys/board/chip/cluster/pe6/pcer_instr
-            pcer_ld_stall;               /sys/board/chip/cluster/pe6/pcer_ld_stall
-            pcer_jmp_stall;              /sys/board/chip/cluster/pe6/pcer_jmp_stall
-            pcer_imiss;                  /sys/board/chip/cluster/pe6/pcer_imiss
-            pcer_ld;                     /sys/board/chip/cluster/pe6/pcer_ld
-            pcer_st;                     /sys/board/chip/cluster/pe6/pcer_st
-            pcer_jump;                   /sys/board/chip/cluster/pe6/pcer_jump
-            pcer_branch;                 /sys/board/chip/cluster/pe6/pcer_branch
-            pcer_taken_branch;           /sys/board/chip/cluster/pe6/pcer_taken_branch
-            pcer_rvc;                    /sys/board/chip/cluster/pe6/pcer_rvc
-            pcer_ld_ext;                 /sys/board/chip/cluster/pe6/pcer_ld_ext
-            pcer_st_ext;                 /sys/board/chip/cluster/pe6/pcer_st_ext
-            pcer_ld_ext_cycles;          /sys/board/chip/cluster/pe6/pcer_ld_ext_cycles
-            pcer_st_ext_cycles;          /sys/board/chip/cluster/pe6/pcer_st_ext_cycles
-            pcer_tcdm_cont;              /sys/board/chip/cluster/pe6/pcer_tcdm_cont
-            misaligned;                  /sys/board/chip/cluster/pe6/misaligned
-    PE7;                                 /sys/board/chip/cluster/pe7/state
-        stalls;                          /sys/board/chip/cluster/pe7/stalls
-            pcer_cycles;                 /sys/board/chip/cluster/pe7/pcer_cycles
-            pcer_instr;                  /sys/board/chip/cluster/pe7/pcer_instr
-            pcer_ld_stall;               /sys/board/chip/cluster/pe7/pcer_ld_stall
-            pcer_jmp_stall;              /sys/board/chip/cluster/pe7/pcer_jmp_stall
-            pcer_imiss;                  /sys/board/chip/cluster/pe7/pcer_imiss
-            pcer_ld;                     /sys/board/chip/cluster/pe7/pcer_ld
-            pcer_st;                     /sys/board/chip/cluster/pe7/pcer_st
-            pcer_jump;                   /sys/board/chip/cluster/pe7/pcer_jump
-            pcer_branch;                 /sys/board/chip/cluster/pe7/pcer_branch
-            pcer_taken_branch;           /sys/board/chip/cluster/pe7/pcer_taken_branch
-            pcer_rvc;                    /sys/board/chip/cluster/pe7/pcer_rvc
-            pcer_ld_ext;                 /sys/board/chip/cluster/pe7/pcer_ld_ext
-            pcer_st_ext;                 /sys/board/chip/cluster/pe7/pcer_st_ext
-            pcer_ld_ext_cycles;          /sys/board/chip/cluster/pe7/pcer_ld_ext_cycles
-            pcer_st_ext_cycles;          /sys/board/chip/cluster/pe7/pcer_st_ext_cycles
-            pcer_tcdm_cont;              /sys/board/chip/cluster/pe7/pcer_tcdm_cont
-            misaligned;                  /sys/board/chip/cluster/pe7/misaligned
-    cycles;                              /sys/board/chip/cluster_clock/cycles
-    dma;                                 null
-        channel_0;                       /sys/board/chip/cluster/dma/channel_0
-        channel_1;                       /sys/board/chip/cluster/dma/channel_1
-        channel_2;                       /sys/board/chip/cluster/dma/channel_2
-        channel_3;                       /sys/board/chip/cluster/dma/channel_3
-        channel_4;                       /sys/board/chip/cluster/dma/channel_4
-        channel_5;                       /sys/board/chip/cluster/dma/channel_5
-        channel_6;                       /sys/board/chip/cluster/dma/channel_6
-        channel_7;                       /sys/board/chip/cluster/dma/channel_7
-        channel_8;                       /sys/board/chip/cluster/dma/channel_8
-        channel_9;                       /sys/board/chip/cluster/dma/channel_9
-        channel_10;                      /sys/board/chip/cluster/dma/channel_10
-        channel_11;                      /sys/board/chip/cluster/dma/channel_11
-        channel_12;                      /sys/board/chip/cluster/dma/channel_12
-        channel_13;                      /sys/board/chip/cluster/dma/channel_13
-        channel_14;                      /sys/board/chip/cluster/dma/channel_14
-        channel_15;                      /sys/board/chip/cluster/dma/channel_15
diff --git a/tools/profiler/gui/images/signalstree_GAP8.txt b/tools/profiler/gui/images/signalstree_GAP8.txt
deleted file mode 100644
index e1ecf6810..000000000
--- a/tools/profiler/gui/images/signalstree_GAP8.txt
+++ /dev/null
@@ -1,205 +0,0 @@
-SOC;                                     /sys/board/chip/soc/state
-    FC;                                  /sys/board/chip/soc/fc/state
-        stalls;                          null 
-            pcer_cycles;                     /sys/board/chip/soc/fc/pcer_cycles
-            pcer_instr;                      /sys/board/chip/soc/fc/pcer_instr
-            pcer_ld_stall;                   /sys/board/chip/soc/fc/pcer_ld_stall
-            pcer_jmp_stall;                  /sys/board/chip/soc/fc/pcer_jmp_stall
-            pcer_imiss;                      /sys/board/chip/soc/fc/pcer_imiss
-            pcer_ld;                         /sys/board/chip/soc/fc/pcer_ld
-            pcer_st;                         /sys/board/chip/soc/fc/pcer_st
-            pcer_jump;                       /sys/board/chip/soc/fc/pcer_jump
-            pcer_branch;                     /sys/board/chip/soc/fc/pcer_branch
-            pcer_taken_branch;               /sys/board/chip/soc/fc/pcer_taken_branch
-            pcer_rvc;                        /sys/board/chip/soc/fc/pcer_rvc
-            pcer_ld_ext;                     /sys/board/chip/soc/fc/pcer_ld_ext
-            pcer_st_ext;                     /sys/board/chip/soc/fc/pcer_st_ext
-            pcer_ld_ext_cycles;              /sys/board/chip/soc/fc/pcer_ld_ext_cycles
-            pcer_st_ext_cycles;              /sys/board/chip/soc/fc/pcer_st_ext_cycles
-            pcer_tcdm_cont;                  /sys/board/chip/soc/fc/pcer_tcdm_cont
-            misaligned;                      /sys/board/chip/soc/fc/misaligned
-    udma;                                null
-        spim0_rx;                        /sys/board/chip/soc/udma/spim0_rx/state
-        spim0_tx;                        /sys/board/chip/soc/udma/spim0_tx/state
-        spim1_rx;                        /sys/board/chip/soc/udma/spim1_rx/state
-        spim1_tx;                        /sys/board/chip/soc/udma/spim1_tx/state
-        hyper0_rx;                       /sys/board/chip/soc/udma/hyper0_rx/state
-        hyper0_tx;                       /sys/board/chip/soc/udma/hyper0_tx/state
-        i2c0_rx;                         /sys/board/chip/soc/udma/i2c0_rx/state
-        i2c0_tx;                         /sys/board/chip/soc/udma/i2c0_tx/state
-        i2c1_rx;                         /sys/board/chip/soc/udma/i2c1_rx/state
-        i2c1_tx;                         /sys/board/chip/soc/udma/i2c1_tx/state
-        uart0_rx;                        /sys/board/chip/soc/udma/uart0_rx/state
-        uart0_tx;                        /sys/board/chip/soc/udma/uart0_tx/state
-        cpi0_rx;                         /sys/board/chip/soc/udma/cpi0_rx/state
-Cluster;                                 /sys/board/chip/cluster/state
-    PE0;                                 /sys/board/chip/cluster/pe0/state
-        stalls;                          /sys/board/chip/cluster/pe0/stalls
-            pcer_cycles;                 /sys/board/chip/cluster/pe0/pcer_cycles
-            pcer_instr;                  /sys/board/chip/cluster/pe0/pcer_instr
-            pcer_ld_stall;               /sys/board/chip/cluster/pe0/pcer_ld_stall
-            pcer_jmp_stall;              /sys/board/chip/cluster/pe0/pcer_jmp_stall
-            pcer_imiss;                  /sys/board/chip/cluster/pe0/pcer_imiss
-            pcer_ld;                     /sys/board/chip/cluster/pe0/pcer_ld
-            pcer_st;                     /sys/board/chip/cluster/pe0/pcer_st
-            pcer_jump;                   /sys/board/chip/cluster/pe0/pcer_jump
-            pcer_branch;                 /sys/board/chip/cluster/pe0/pcer_branch
-            pcer_taken_branch;           /sys/board/chip/cluster/pe0/pcer_taken_branch
-            pcer_rvc;                    /sys/board/chip/cluster/pe0/pcer_rvc
-            pcer_ld_ext;                 /sys/board/chip/cluster/pe0/pcer_ld_ext
-            pcer_st_ext;                 /sys/board/chip/cluster/pe0/pcer_st_ext
-            pcer_ld_ext_cycles;          /sys/board/chip/cluster/pe0/pcer_ld_ext_cycles
-            pcer_st_ext_cycles;          /sys/board/chip/cluster/pe0/pcer_st_ext_cycles
-            pcer_tcdm_cont;              /sys/board/chip/cluster/pe0/pcer_tcdm_cont
-            misaligned;                  /sys/board/chip/cluster/pe0/misaligned
-    PE1;                                 /sys/board/chip/cluster/pe1/state
-        stalls;                          /sys/board/chip/cluster/pe1/stalls
-            pcer_cycles;                 /sys/board/chip/cluster/pe1/pcer_cycles
-            pcer_instr;                  /sys/board/chip/cluster/pe1/pcer_instr
-            pcer_ld_stall;               /sys/board/chip/cluster/pe1/pcer_ld_stall
-            pcer_jmp_stall;              /sys/board/chip/cluster/pe1/pcer_jmp_stall
-            pcer_imiss;                  /sys/board/chip/cluster/pe1/pcer_imiss
-            pcer_ld;                     /sys/board/chip/cluster/pe1/pcer_ld
-            pcer_st;                     /sys/board/chip/cluster/pe1/pcer_st
-            pcer_jump;                   /sys/board/chip/cluster/pe1/pcer_jump
-            pcer_branch;                 /sys/board/chip/cluster/pe1/pcer_branch
-            pcer_taken_branch;           /sys/board/chip/cluster/pe1/pcer_taken_branch
-            pcer_rvc;                    /sys/board/chip/cluster/pe1/pcer_rvc
-            pcer_ld_ext;                 /sys/board/chip/cluster/pe1/pcer_ld_ext
-            pcer_st_ext;                 /sys/board/chip/cluster/pe1/pcer_st_ext
-            pcer_ld_ext_cycles;          /sys/board/chip/cluster/pe1/pcer_ld_ext_cycles
-            pcer_st_ext_cycles;          /sys/board/chip/cluster/pe1/pcer_st_ext_cycles
-            pcer_tcdm_cont;              /sys/board/chip/cluster/pe1/pcer_tcdm_cont
-            misaligned;                  /sys/board/chip/cluster/pe1/misaligned       
-    PE2;                                 /sys/board/chip/cluster/pe2/state
-        stalls;                          /sys/board/chip/cluster/pe2/stalls
-            pcer_cycles;                 /sys/board/chip/cluster/pe2/pcer_cycles
-            pcer_instr;                  /sys/board/chip/cluster/pe2/pcer_instr
-            pcer_ld_stall;               /sys/board/chip/cluster/pe2/pcer_ld_stall
-            pcer_jmp_stall;              /sys/board/chip/cluster/pe2/pcer_jmp_stall
-            pcer_imiss;                  /sys/board/chip/cluster/pe2/pcer_imiss
-            pcer_ld;                     /sys/board/chip/cluster/pe2/pcer_ld
-            pcer_st;                     /sys/board/chip/cluster/pe2/pcer_st
-            pcer_jump;                   /sys/board/chip/cluster/pe2/pcer_jump
-            pcer_branch;                 /sys/board/chip/cluster/pe2/pcer_branch
-            pcer_taken_branch;           /sys/board/chip/cluster/pe2/pcer_taken_branch
-            pcer_rvc;                    /sys/board/chip/cluster/pe2/pcer_rvc
-            pcer_ld_ext;                 /sys/board/chip/cluster/pe2/pcer_ld_ext
-            pcer_st_ext;                 /sys/board/chip/cluster/pe2/pcer_st_ext
-            pcer_ld_ext_cycles;          /sys/board/chip/cluster/pe2/pcer_ld_ext_cycles
-            pcer_st_ext_cycles;          /sys/board/chip/cluster/pe2/pcer_st_ext_cycles
-            pcer_tcdm_cont;              /sys/board/chip/cluster/pe2/pcer_tcdm_cont
-            misaligned;                  /sys/board/chip/cluster/pe2/misaligned
-    PE3;                                 /sys/board/chip/cluster/pe3/state
-        stalls;                          /sys/board/chip/cluster/pe3/stalls
-            pcer_cycles;                 /sys/board/chip/cluster/pe3/pcer_cycles
-            pcer_instr;                  /sys/board/chip/cluster/pe3/pcer_instr
-            pcer_ld_stall;               /sys/board/chip/cluster/pe3/pcer_ld_stall
-            pcer_jmp_stall;              /sys/board/chip/cluster/pe3/pcer_jmp_stall
-            pcer_imiss;                  /sys/board/chip/cluster/pe3/pcer_imiss
-            pcer_ld;                     /sys/board/chip/cluster/pe3/pcer_ld
-            pcer_st;                     /sys/board/chip/cluster/pe3/pcer_st
-            pcer_jump;                   /sys/board/chip/cluster/pe3/pcer_jump
-            pcer_branch;                 /sys/board/chip/cluster/pe3/pcer_branch
-            pcer_taken_branch;           /sys/board/chip/cluster/pe3/pcer_taken_branch
-            pcer_rvc;                    /sys/board/chip/cluster/pe3/pcer_rvc
-            pcer_ld_ext;                 /sys/board/chip/cluster/pe3/pcer_ld_ext
-            pcer_st_ext;                 /sys/board/chip/cluster/pe3/pcer_st_ext
-            pcer_ld_ext_cycles;          /sys/board/chip/cluster/pe3/pcer_ld_ext_cycles
-            pcer_st_ext_cycles;          /sys/board/chip/cluster/pe3/pcer_st_ext_cycles
-            pcer_tcdm_cont;              /sys/board/chip/cluster/pe3/pcer_tcdm_cont
-            misaligned;                  /sys/board/chip/cluster/pe3/misaligned
-    PE4;                                 /sys/board/chip/cluster/pe4/state
-        stalls;                          /sys/board/chip/cluster/pe4/stalls
-            pcer_cycles;                 /sys/board/chip/cluster/pe4/pcer_cycles
-            pcer_instr;                  /sys/board/chip/cluster/pe4/pcer_instr
-            pcer_ld_stall;               /sys/board/chip/cluster/pe4/pcer_ld_stall
-            pcer_jmp_stall;              /sys/board/chip/cluster/pe4/pcer_jmp_stall
-            pcer_imiss;                  /sys/board/chip/cluster/pe4/pcer_imiss
-            pcer_ld;                     /sys/board/chip/cluster/pe4/pcer_ld
-            pcer_st;                     /sys/board/chip/cluster/pe4/pcer_st
-            pcer_jump;                   /sys/board/chip/cluster/pe4/pcer_jump
-            pcer_branch;                 /sys/board/chip/cluster/pe4/pcer_branch
-            pcer_taken_branch;           /sys/board/chip/cluster/pe4/pcer_taken_branch
-            pcer_rvc;                    /sys/board/chip/cluster/pe4/pcer_rvc
-            pcer_ld_ext;                 /sys/board/chip/cluster/pe4/pcer_ld_ext
-            pcer_st_ext;                 /sys/board/chip/cluster/pe4/pcer_st_ext
-            pcer_ld_ext_cycles;          /sys/board/chip/cluster/pe4/pcer_ld_ext_cycles
-            pcer_st_ext_cycles;          /sys/board/chip/cluster/pe4/pcer_st_ext_cycles
-            pcer_tcdm_cont;              /sys/board/chip/cluster/pe4/pcer_tcdm_cont
-            misaligned;                  /sys/board/chip/cluster/pe4/misaligned
-    PE5;                                 /sys/board/chip/cluster/pe5/state
-        stalls;                          /sys/board/chip/cluster/pe5/stalls
-            pcer_cycles;                 /sys/board/chip/cluster/pe5/pcer_cycles
-            pcer_instr;                  /sys/board/chip/cluster/pe5/pcer_instr
-            pcer_ld_stall;               /sys/board/chip/cluster/pe5/pcer_ld_stall
-            pcer_jmp_stall;              /sys/board/chip/cluster/pe5/pcer_jmp_stall
-            pcer_imiss;                  /sys/board/chip/cluster/pe5/pcer_imiss
-            pcer_ld;                     /sys/board/chip/cluster/pe5/pcer_ld
-            pcer_st;                     /sys/board/chip/cluster/pe5/pcer_st
-            pcer_jump;                   /sys/board/chip/cluster/pe5/pcer_jump
-            pcer_branch;                 /sys/board/chip/cluster/pe5/pcer_branch
-            pcer_taken_branch;           /sys/board/chip/cluster/pe5/pcer_taken_branch
-            pcer_rvc;                    /sys/board/chip/cluster/pe5/pcer_rvc
-            pcer_ld_ext;                 /sys/board/chip/cluster/pe5/pcer_ld_ext
-            pcer_st_ext;                 /sys/board/chip/cluster/pe5/pcer_st_ext
-            pcer_ld_ext_cycles;          /sys/board/chip/cluster/pe5/pcer_ld_ext_cycles
-            pcer_st_ext_cycles;          /sys/board/chip/cluster/pe5/pcer_st_ext_cycles
-            pcer_tcdm_cont;              /sys/board/chip/cluster/pe5/pcer_tcdm_cont
-            misaligned;                  /sys/board/chip/cluster/pe5/misaligned
-    PE6;                                 /sys/board/chip/cluster/pe6/state
-        stalls;                          /sys/board/chip/cluster/pe6/stalls
-            pcer_cycles;                 /sys/board/chip/cluster/pe6/pcer_cycles
-            pcer_instr;                  /sys/board/chip/cluster/pe6/pcer_instr
-            pcer_ld_stall;               /sys/board/chip/cluster/pe6/pcer_ld_stall
-            pcer_jmp_stall;              /sys/board/chip/cluster/pe6/pcer_jmp_stall
-            pcer_imiss;                  /sys/board/chip/cluster/pe6/pcer_imiss
-            pcer_ld;                     /sys/board/chip/cluster/pe6/pcer_ld
-            pcer_st;                     /sys/board/chip/cluster/pe6/pcer_st
-            pcer_jump;                   /sys/board/chip/cluster/pe6/pcer_jump
-            pcer_branch;                 /sys/board/chip/cluster/pe6/pcer_branch
-            pcer_taken_branch;           /sys/board/chip/cluster/pe6/pcer_taken_branch
-            pcer_rvc;                    /sys/board/chip/cluster/pe6/pcer_rvc
-            pcer_ld_ext;                 /sys/board/chip/cluster/pe6/pcer_ld_ext
-            pcer_st_ext;                 /sys/board/chip/cluster/pe6/pcer_st_ext
-            pcer_ld_ext_cycles;          /sys/board/chip/cluster/pe6/pcer_ld_ext_cycles
-            pcer_st_ext_cycles;          /sys/board/chip/cluster/pe6/pcer_st_ext_cycles
-            pcer_tcdm_cont;              /sys/board/chip/cluster/pe6/pcer_tcdm_cont
-            misaligned;                  /sys/board/chip/cluster/pe6/misaligned
-    PE7;                                 /sys/board/chip/cluster/pe7/state
-        stalls;                          /sys/board/chip/cluster/pe7/stalls
-            pcer_cycles;                 /sys/board/chip/cluster/pe7/pcer_cycles
-            pcer_instr;                  /sys/board/chip/cluster/pe7/pcer_instr
-            pcer_ld_stall;               /sys/board/chip/cluster/pe7/pcer_ld_stall
-            pcer_jmp_stall;              /sys/board/chip/cluster/pe7/pcer_jmp_stall
-            pcer_imiss;                  /sys/board/chip/cluster/pe7/pcer_imiss
-            pcer_ld;                     /sys/board/chip/cluster/pe7/pcer_ld
-            pcer_st;                     /sys/board/chip/cluster/pe7/pcer_st
-            pcer_jump;                   /sys/board/chip/cluster/pe7/pcer_jump
-            pcer_branch;                 /sys/board/chip/cluster/pe7/pcer_branch
-            pcer_taken_branch;           /sys/board/chip/cluster/pe7/pcer_taken_branch
-            pcer_rvc;                    /sys/board/chip/cluster/pe7/pcer_rvc
-            pcer_ld_ext;                 /sys/board/chip/cluster/pe7/pcer_ld_ext
-            pcer_st_ext;                 /sys/board/chip/cluster/pe7/pcer_st_ext
-            pcer_ld_ext_cycles;          /sys/board/chip/cluster/pe7/pcer_ld_ext_cycles
-            pcer_st_ext_cycles;          /sys/board/chip/cluster/pe7/pcer_st_ext_cycles
-            pcer_tcdm_cont;              /sys/board/chip/cluster/pe7/pcer_tcdm_cont
-            misaligned;                  /sys/board/chip/cluster/pe7/misaligned
-    cycles;                              /sys/board/chip/cluster_clock/cycles
-    dma;                                 null
-        channel_0;                       /sys/board/chip/cluster/dma/channel_0
-        channel_1;                       /sys/board/chip/cluster/dma/channel_1
-        channel_2;                       /sys/board/chip/cluster/dma/channel_2
-        channel_3;                       /sys/board/chip/cluster/dma/channel_3
-        channel_4;                       /sys/board/chip/cluster/dma/channel_4
-        channel_5;                       /sys/board/chip/cluster/dma/channel_5
-        channel_6;                       /sys/board/chip/cluster/dma/channel_6
-        channel_7;                       /sys/board/chip/cluster/dma/channel_7
-        channel_8;                       /sys/board/chip/cluster/dma/channel_8
-        channel_9;                       /sys/board/chip/cluster/dma/channel_9
-        channel_10;                      /sys/board/chip/cluster/dma/channel_10
-        channel_11;                      /sys/board/chip/cluster/dma/channel_11
-        channel_12;                      /sys/board/chip/cluster/dma/channel_12
-        channel_13;                      /sys/board/chip/cluster/dma/channel_13
-        channel_14;                      /sys/board/chip/cluster/dma/channel_14
-        channel_15;                      /sys/board/chip/cluster/dma/channel_15
diff --git a/tools/profiler/gui/images/signalstree_GAP9.txt b/tools/profiler/gui/images/signalstree_GAP9.txt
deleted file mode 100644
index 0cd7117f8..000000000
--- a/tools/profiler/gui/images/signalstree_GAP9.txt
+++ /dev/null
@@ -1,224 +0,0 @@
-SOC;                                     /chip/soc/state
-    FC;                                  /chip/soc/fc/state
-        stalls;                          null 
-            pcer_cycles;                     /chip/soc/fc/pcer_cycles
-            pcer_instr;                      /chip/soc/fc/pcer_instr
-            pcer_ld_stall;                   /chip/soc/fc/pcer_ld_stall
-            pcer_jmp_stall;                  /chip/soc/fc/pcer_jmp_stall
-            pcer_imiss;                      /chip/soc/fc/pcer_imiss
-            pcer_ld;                         /chip/soc/fc/pcer_ld
-            pcer_st;                         /chip/soc/fc/pcer_st
-            pcer_jump;                       /chip/soc/fc/pcer_jump
-            pcer_branch;                     /chip/soc/fc/pcer_branch
-            pcer_taken_branch;               /chip/soc/fc/pcer_taken_branch
-            pcer_rvc;                        /chip/soc/fc/pcer_rvc
-            pcer_ld_ext;                     /chip/soc/fc/pcer_ld_ext
-            pcer_st_ext;                     /chip/soc/fc/pcer_st_ext
-            pcer_ld_ext_cycles;              /chip/soc/fc/pcer_ld_ext_cycles
-            pcer_st_ext_cycles;              /chip/soc/fc/pcer_st_ext_cycles
-            pcer_tcdm_cont;                  /chip/soc/fc/pcer_tcdm_cont
-            misaligned;                      /chip/soc/fc/misaligned
-    udma;                                null
-        spim0_rx;                        /chip/soc/udma/spim0_rx/state
-        spim0_tx;                        /chip/soc/udma/spim0_tx/state
-        spim1_rx;                        /chip/soc/udma/spim1_rx/state
-        spim1_tx;                        /chip/soc/udma/spim1_tx/state
-        hyper0_rx;                       /chip/soc/udma/hyper0_rx/state
-        hyper0_tx;                       /chip/soc/udma/hyper0_tx/state
-        i2c0_rx;                         /chip/soc/udma/i2c0_rx/state
-        i2c0_tx;                         /chip/soc/udma/i2c0_tx/state
-        i2c1_rx;                         /chip/soc/udma/i2c1_rx/state
-        i2c1_tx;                         /chip/soc/udma/i2c1_tx/state
-        uart0_rx;                        /chip/soc/udma/uart0_rx/state
-        uart0_tx;                        /chip/soc/udma/uart0_tx/state
-        cpi0_rx;                         /chip/soc/udma/cpi0_rx/state
-Cluster;                                 /chip/cluster/state
-    PE0;                                 /chip/cluster/pe0/state
-        stalls;                          /chip/cluster/pe0/stalls
-            pcer_cycles;                 /chip/cluster/pe0/pcer_cycles
-            pcer_instr;                  /chip/cluster/pe0/pcer_instr
-            pcer_ld_stall;               /chip/cluster/pe0/pcer_ld_stall
-            pcer_jmp_stall;              /chip/cluster/pe0/pcer_jmp_stall
-            pcer_imiss;                  /chip/cluster/pe0/pcer_imiss
-            pcer_ld;                     /chip/cluster/pe0/pcer_ld
-            pcer_st;                     /chip/cluster/pe0/pcer_st
-            pcer_jump;                   /chip/cluster/pe0/pcer_jump
-            pcer_branch;                 /chip/cluster/pe0/pcer_branch
-            pcer_taken_branch;           /chip/cluster/pe0/pcer_taken_branch
-            pcer_rvc;                    /chip/cluster/pe0/pcer_rvc
-            pcer_ld_ext;                 /chip/cluster/pe0/pcer_ld_ext
-            pcer_st_ext;                 /chip/cluster/pe0/pcer_st_ext
-            pcer_ld_ext_cycles;          /chip/cluster/pe0/pcer_ld_ext_cycles
-            pcer_st_ext_cycles;          /chip/cluster/pe0/pcer_st_ext_cycles
-            pcer_tcdm_cont;              /chip/cluster/pe0/pcer_tcdm_cont
-            misaligned;                  /chip/cluster/pe0/misaligned
-    PE1;                                 /chip/cluster/pe1/state
-        stalls;                          /chip/cluster/pe1/stalls
-            pcer_cycles;                 /chip/cluster/pe1/pcer_cycles
-            pcer_instr;                  /chip/cluster/pe1/pcer_instr
-            pcer_ld_stall;               /chip/cluster/pe1/pcer_ld_stall
-            pcer_jmp_stall;              /chip/cluster/pe1/pcer_jmp_stall
-            pcer_imiss;                  /chip/cluster/pe1/pcer_imiss
-            pcer_ld;                     /chip/cluster/pe1/pcer_ld
-            pcer_st;                     /chip/cluster/pe1/pcer_st
-            pcer_jump;                   /chip/cluster/pe1/pcer_jump
-            pcer_branch;                 /chip/cluster/pe1/pcer_branch
-            pcer_taken_branch;           /chip/cluster/pe1/pcer_taken_branch
-            pcer_rvc;                    /chip/cluster/pe1/pcer_rvc
-            pcer_ld_ext;                 /chip/cluster/pe1/pcer_ld_ext
-            pcer_st_ext;                 /chip/cluster/pe1/pcer_st_ext
-            pcer_ld_ext_cycles;          /chip/cluster/pe1/pcer_ld_ext_cycles
-            pcer_st_ext_cycles;          /chip/cluster/pe1/pcer_st_ext_cycles
-            pcer_tcdm_cont;              /chip/cluster/pe1/pcer_tcdm_cont
-            misaligned;                  /chip/cluster/pe1/misaligned       
-    PE2;                                 /chip/cluster/pe2/state
-        stalls;                          /chip/cluster/pe2/stalls
-            pcer_cycles;                 /chip/cluster/pe2/pcer_cycles
-            pcer_instr;                  /chip/cluster/pe2/pcer_instr
-            pcer_ld_stall;               /chip/cluster/pe2/pcer_ld_stall
-            pcer_jmp_stall;              /chip/cluster/pe2/pcer_jmp_stall
-            pcer_imiss;                  /chip/cluster/pe2/pcer_imiss
-            pcer_ld;                     /chip/cluster/pe2/pcer_ld
-            pcer_st;                     /chip/cluster/pe2/pcer_st
-            pcer_jump;                   /chip/cluster/pe2/pcer_jump
-            pcer_branch;                 /chip/cluster/pe2/pcer_branch
-            pcer_taken_branch;           /chip/cluster/pe2/pcer_taken_branch
-            pcer_rvc;                    /chip/cluster/pe2/pcer_rvc
-            pcer_ld_ext;                 /chip/cluster/pe2/pcer_ld_ext
-            pcer_st_ext;                 /chip/cluster/pe2/pcer_st_ext
-            pcer_ld_ext_cycles;          /chip/cluster/pe2/pcer_ld_ext_cycles
-            pcer_st_ext_cycles;          /chip/cluster/pe2/pcer_st_ext_cycles
-            pcer_tcdm_cont;              /chip/cluster/pe2/pcer_tcdm_cont
-            misaligned;                  /chip/cluster/pe2/misaligned
-    PE3;                                 /chip/cluster/pe3/state
-        stalls;                          /chip/cluster/pe3/stalls
-            pcer_cycles;                 /chip/cluster/pe3/pcer_cycles
-            pcer_instr;                  /chip/cluster/pe3/pcer_instr
-            pcer_ld_stall;               /chip/cluster/pe3/pcer_ld_stall
-            pcer_jmp_stall;              /chip/cluster/pe3/pcer_jmp_stall
-            pcer_imiss;                  /chip/cluster/pe3/pcer_imiss
-            pcer_ld;                     /chip/cluster/pe3/pcer_ld
-            pcer_st;                     /chip/cluster/pe3/pcer_st
-            pcer_jump;                   /chip/cluster/pe3/pcer_jump
-            pcer_branch;                 /chip/cluster/pe3/pcer_branch
-            pcer_taken_branch;           /chip/cluster/pe3/pcer_taken_branch
-            pcer_rvc;                    /chip/cluster/pe3/pcer_rvc
-            pcer_ld_ext;                 /chip/cluster/pe3/pcer_ld_ext
-            pcer_st_ext;                 /chip/cluster/pe3/pcer_st_ext
-            pcer_ld_ext_cycles;          /chip/cluster/pe3/pcer_ld_ext_cycles
-            pcer_st_ext_cycles;          /chip/cluster/pe3/pcer_st_ext_cycles
-            pcer_tcdm_cont;              /chip/cluster/pe3/pcer_tcdm_cont
-            misaligned;                  /chip/cluster/pe3/misaligned
-    PE4;                                 /chip/cluster/pe4/state
-        stalls;                          /chip/cluster/pe4/stalls
-            pcer_cycles;                 /chip/cluster/pe4/pcer_cycles
-            pcer_instr;                  /chip/cluster/pe4/pcer_instr
-            pcer_ld_stall;               /chip/cluster/pe4/pcer_ld_stall
-            pcer_jmp_stall;              /chip/cluster/pe4/pcer_jmp_stall
-            pcer_imiss;                  /chip/cluster/pe4/pcer_imiss
-            pcer_ld;                     /chip/cluster/pe4/pcer_ld
-            pcer_st;                     /chip/cluster/pe4/pcer_st
-            pcer_jump;                   /chip/cluster/pe4/pcer_jump
-            pcer_branch;                 /chip/cluster/pe4/pcer_branch
-            pcer_taken_branch;           /chip/cluster/pe4/pcer_taken_branch
-            pcer_rvc;                    /chip/cluster/pe4/pcer_rvc
-            pcer_ld_ext;                 /chip/cluster/pe4/pcer_ld_ext
-            pcer_st_ext;                 /chip/cluster/pe4/pcer_st_ext
-            pcer_ld_ext_cycles;          /chip/cluster/pe4/pcer_ld_ext_cycles
-            pcer_st_ext_cycles;          /chip/cluster/pe4/pcer_st_ext_cycles
-            pcer_tcdm_cont;              /chip/cluster/pe4/pcer_tcdm_cont
-            misaligned;                  /chip/cluster/pe4/misaligned
-    PE5;                                 /chip/cluster/pe5/state
-        stalls;                          /chip/cluster/pe5/stalls
-            pcer_cycles;                 /chip/cluster/pe5/pcer_cycles
-            pcer_instr;                  /chip/cluster/pe5/pcer_instr
-            pcer_ld_stall;               /chip/cluster/pe5/pcer_ld_stall
-            pcer_jmp_stall;              /chip/cluster/pe5/pcer_jmp_stall
-            pcer_imiss;                  /chip/cluster/pe5/pcer_imiss
-            pcer_ld;                     /chip/cluster/pe5/pcer_ld
-            pcer_st;                     /chip/cluster/pe5/pcer_st
-            pcer_jump;                   /chip/cluster/pe5/pcer_jump
-            pcer_branch;                 /chip/cluster/pe5/pcer_branch
-            pcer_taken_branch;           /chip/cluster/pe5/pcer_taken_branch
-            pcer_rvc;                    /chip/cluster/pe5/pcer_rvc
-            pcer_ld_ext;                 /chip/cluster/pe5/pcer_ld_ext
-            pcer_st_ext;                 /chip/cluster/pe5/pcer_st_ext
-            pcer_ld_ext_cycles;          /chip/cluster/pe5/pcer_ld_ext_cycles
-            pcer_st_ext_cycles;          /chip/cluster/pe5/pcer_st_ext_cycles
-            pcer_tcdm_cont;              /chip/cluster/pe5/pcer_tcdm_cont
-            misaligned;                  /chip/cluster/pe5/misaligned
-    PE6;                                 /chip/cluster/pe6/state
-        stalls;                          /chip/cluster/pe6/stalls
-            pcer_cycles;                 /chip/cluster/pe6/pcer_cycles
-            pcer_instr;                  /chip/cluster/pe6/pcer_instr
-            pcer_ld_stall;               /chip/cluster/pe6/pcer_ld_stall
-            pcer_jmp_stall;              /chip/cluster/pe6/pcer_jmp_stall
-            pcer_imiss;                  /chip/cluster/pe6/pcer_imiss
-            pcer_ld;                     /chip/cluster/pe6/pcer_ld
-            pcer_st;                     /chip/cluster/pe6/pcer_st
-            pcer_jump;                   /chip/cluster/pe6/pcer_jump
-            pcer_branch;                 /chip/cluster/pe6/pcer_branch
-            pcer_taken_branch;           /chip/cluster/pe6/pcer_taken_branch
-            pcer_rvc;                    /chip/cluster/pe6/pcer_rvc
-            pcer_ld_ext;                 /chip/cluster/pe6/pcer_ld_ext
-            pcer_st_ext;                 /chip/cluster/pe6/pcer_st_ext
-            pcer_ld_ext_cycles;          /chip/cluster/pe6/pcer_ld_ext_cycles
-            pcer_st_ext_cycles;          /chip/cluster/pe6/pcer_st_ext_cycles
-            pcer_tcdm_cont;              /chip/cluster/pe6/pcer_tcdm_cont
-            misaligned;                  /chip/cluster/pe6/misaligned
-    PE7;                                 /chip/cluster/pe7/state
-        stalls;                          /chip/cluster/pe7/stalls
-            pcer_cycles;                 /chip/cluster/pe7/pcer_cycles
-            pcer_instr;                  /chip/cluster/pe7/pcer_instr
-            pcer_ld_stall;               /chip/cluster/pe7/pcer_ld_stall
-            pcer_jmp_stall;              /chip/cluster/pe7/pcer_jmp_stall
-            pcer_imiss;                  /chip/cluster/pe7/pcer_imiss
-            pcer_ld;                     /chip/cluster/pe7/pcer_ld
-            pcer_st;                     /chip/cluster/pe7/pcer_st
-            pcer_jump;                   /chip/cluster/pe7/pcer_jump
-            pcer_branch;                 /chip/cluster/pe7/pcer_branch
-            pcer_taken_branch;           /chip/cluster/pe7/pcer_taken_branch
-            pcer_rvc;                    /chip/cluster/pe7/pcer_rvc
-            pcer_ld_ext;                 /chip/cluster/pe7/pcer_ld_ext
-            pcer_st_ext;                 /chip/cluster/pe7/pcer_st_ext
-            pcer_ld_ext_cycles;          /chip/cluster/pe7/pcer_ld_ext_cycles
-            pcer_st_ext_cycles;          /chip/cluster/pe7/pcer_st_ext_cycles
-            pcer_tcdm_cont;              /chip/cluster/pe7/pcer_tcdm_cont
-            misaligned;                  /chip/cluster/pe7/misaligned
-    PE8;                                 /chip/cluster/pe7/state
-        stalls;                          /chip/cluster/pe7/stalls
-            pcer_cycles;                 /chip/cluster/pe7/pcer_cycles
-            pcer_instr;                  /chip/cluster/pe7/pcer_instr
-            pcer_ld_stall;               /chip/cluster/pe7/pcer_ld_stall
-            pcer_jmp_stall;              /chip/cluster/pe7/pcer_jmp_stall
-            pcer_imiss;                  /chip/cluster/pe7/pcer_imiss
-            pcer_ld;                     /chip/cluster/pe7/pcer_ld
-            pcer_st;                     /chip/cluster/pe7/pcer_st
-            pcer_jump;                   /chip/cluster/pe7/pcer_jump
-            pcer_branch;                 /chip/cluster/pe7/pcer_branch
-            pcer_taken_branch;           /chip/cluster/pe7/pcer_taken_branch
-            pcer_rvc;                    /chip/cluster/pe7/pcer_rvc
-            pcer_ld_ext;                 /chip/cluster/pe7/pcer_ld_ext
-            pcer_st_ext;                 /chip/cluster/pe7/pcer_st_ext
-            pcer_ld_ext_cycles;          /chip/cluster/pe7/pcer_ld_ext_cycles
-            pcer_st_ext_cycles;          /chip/cluster/pe7/pcer_st_ext_cycles
-            pcer_tcdm_cont;              /chip/cluster/pe7/pcer_tcdm_cont
-            misaligned;                  /chip/cluster/pe7/misaligned
-    cycles;                              /chip/cluster_clock/cycles
-    dma;                                 null
-        channel_0;                       /chip/cluster/dma/channel_0
-        channel_1;                       /chip/cluster/dma/channel_1
-        channel_2;                       /chip/cluster/dma/channel_2
-        channel_3;                       /chip/cluster/dma/channel_3
-        channel_4;                       /chip/cluster/dma/channel_4
-        channel_5;                       /chip/cluster/dma/channel_5
-        channel_6;                       /chip/cluster/dma/channel_6
-        channel_7;                       /chip/cluster/dma/channel_7
-        channel_8;                       /chip/cluster/dma/channel_8
-        channel_9;                       /chip/cluster/dma/channel_9
-        channel_10;                      /chip/cluster/dma/channel_10
-        channel_11;                      /chip/cluster/dma/channel_11
-        channel_12;                      /chip/cluster/dma/channel_12
-        channel_13;                      /chip/cluster/dma/channel_13
-        channel_14;                      /chip/cluster/dma/channel_14
-        channel_15;                      /chip/cluster/dma/channel_15
diff --git a/tools/profiler/gui/images/stop-24px.png b/tools/profiler/gui/images/stop-24px.png
deleted file mode 100644
index 8e69f9891..000000000
Binary files a/tools/profiler/gui/images/stop-24px.png and /dev/null differ
diff --git a/tools/profiler/gui/images/viewall.png b/tools/profiler/gui/images/viewall.png
deleted file mode 100644
index a9ac188ff..000000000
Binary files a/tools/profiler/gui/images/viewall.png and /dev/null differ
diff --git a/tools/profiler/gui/images/zoomin.png b/tools/profiler/gui/images/zoomin.png
deleted file mode 100644
index 6edd2c561..000000000
Binary files a/tools/profiler/gui/images/zoomin.png and /dev/null differ
diff --git a/tools/profiler/gui/images/zoomout.png b/tools/profiler/gui/images/zoomout.png
deleted file mode 100644
index 0fe6f1a99..000000000
Binary files a/tools/profiler/gui/images/zoomout.png and /dev/null differ
diff --git a/tools/profiler/gui/include/dialog.hpp b/tools/profiler/gui/include/dialog.hpp
deleted file mode 100644
index 44fcecf6f..000000000
--- a/tools/profiler/gui/include/dialog.hpp
+++ /dev/null
@@ -1,88 +0,0 @@
-/*
- * Copyright (C) 2020  GreenWaves Technologies, SAS
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU Affero General Public License as
- * published by the Free Software Foundation, either version 3 of the
- * License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU Affero General Public License for more details.
- *
- * You should have received a copy of the GNU Affero General Public License
- * along with this program.  If not, see <https://www.gnu.org/licenses/>.
- */
-
-#ifndef DIALOG_H
-#define DIALOG_H
-
-#include <QWidget>
-
-QT_BEGIN_NAMESPACE
-class QCheckBox;
-class QLabel;
-class QErrorMessage;
-QT_END_NAMESPACE
-
-class DialogOptionsWidget;
-
-class Dialog : public QWidget
-{
-    Q_OBJECT
-
-public:
-    Dialog(QWidget *parent = nullptr);
-
-private slots:
-    /*void setInteger();
-    void setDouble();
-    void setItem();
-    void setText();
-    void setMultiLineText();
-    void setColor();
-    void setFont();
-    */
-    void setExistingDirectory();
-    /*void setOpenFileName();
-    void setOpenFileNames();
-    */
-    void setExecFileName();
-    /*void criticalMessage();
-    void informationMessage();
-    void questionMessage();
-    void warningMessage();
-    void errorMessage();
-    */
-private:
-    /*QLabel *integerLabel;
-    QLabel *doubleLabel;
-    QLabel *itemLabel;
-    QLabel *textLabel;
-    QLabel *multiLineTextLabel;
-    QLabel *colorLabel;
-    QLabel *fontLabel;
-    */
-    QLabel *directoryLabel;
-    /*QLabel *openFileNameLabel;
-    QLabel *openFileNamesLabel;
-    */
-    QLabel *execFileNameLabel;
-
-    /*QLabel *criticalLabel;
-    QLabel *informationLabel;
-    QLabel *questionLabel;
-    QLabel *warningLabel;
-    QLabel *errorLabel;
-    QErrorMessage *errorMessageDialog;
-    */
-    DialogOptionsWidget *fileDialogOptionsWidget;
-    /*DialogOptionsWidget *colorDialogOptionsWidget;
-    DialogOptionsWidget *fontDialogOptionsWidget;
-    QString openFilesPath;
-    */
-};
-
-#endif
-
diff --git a/tools/profiler/gui/include/execoverview.hpp b/tools/profiler/gui/include/execoverview.hpp
deleted file mode 100644
index 4bfe81ff5..000000000
--- a/tools/profiler/gui/include/execoverview.hpp
+++ /dev/null
@@ -1,48 +0,0 @@
-/*
- * Copyright (C) 2020  GreenWaves Technologies, SAS
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU Affero General Public License as
- * published by the Free Software Foundation, either version 3 of the
- * License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU Affero General Public License for more details.
- *
- * You should have received a copy of the GNU Affero General Public License
- * along with this program.  If not, see <https://www.gnu.org/licenses/>.
- */
-
-#ifndef EXEC_OVERVIEW_HPP
-#define EXEC_OVERVIEW_HPP
-
-#include <QWidget>
-#include <QTimer>
-#include <QtCharts/QChartView>
-#include <QtCharts/QBarSeries>
-#include <QtCharts/QBarSet>
-#include <QHBoxLayout>
-#include <QValueAxis>
-
-class ExecOverview : public QWidget {
-  Q_OBJECT
-
-public:
-  ExecOverview(QWidget* parent);
-  ~ExecOverview();
-
-private:
-
-  QtCharts::QBarSeries *series = nullptr;
-  QtCharts::QChart *chart = nullptr;
-  QTimer* timer;
-
-private slots:
-  void refresh();
-};
-
-
-
-#endif    //EXEC_OVERVIEW_HPP
diff --git a/tools/profiler/gui/include/functiondetails.hpp b/tools/profiler/gui/include/functiondetails.hpp
deleted file mode 100644
index ab02be32c..000000000
--- a/tools/profiler/gui/include/functiondetails.hpp
+++ /dev/null
@@ -1,87 +0,0 @@
-/*
- * Copyright (C) 2020  GreenWaves Technologies, SAS
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU Affero General Public License as
- * published by the Free Software Foundation, either version 3 of the
- * License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU Affero General Public License for more details.
- *
- * You should have received a copy of the GNU Affero General Public License
- * along with this program.  If not, see <https://www.gnu.org/licenses/>.
- */
-
-
-#ifndef FUNCTIONDETAILS_HPP
-#define FUNCTIONDETAILS_HPP
-
-#include <QWidget>
-#include <QSplitter>
-#include <QTextEdit>
-#include <QTableWidget>
-#include <QString>
-#include <QHBoxLayout>
-#include <QFile>
-#include <QTimer>
-#include <QPlainTextEdit>
-
-#include "highlighter.hpp"
-#include "stallchart.hpp"
-#include "splitterdesign.hpp"
-#include "util.hpp"
-
-/**
-  @brief displays per-function metrics, such as total execution time per
-         function, stall reasons, ...
-  */
-class FunctionDetails : public QWidget
-{
-  Q_OBJECT
-
-  static const char* columnHeader[];
-  static const int nbColumn;
-
-  public :
-    FunctionDetails(QWidget* parent,
-                    QPlainTextEdit* sourceCode,
-                    QPlainTextEdit* asmCode,
-                    StallChart* stallChart);
-    /** in order for the app to be more accessible, table data can be exported to
-        a text file (that an other program can read out to a blind person for
-        example)*/
-    void exportTableToTextFile(const char* filename) const;
-    StallChart* stallChart;
-    void switchLegendMode(LegendMode newMode);
-
-  private :
-    void updateTimeStamps(void);
-    void setSourceCode(std::string fname);
-    QTableWidget* table = nullptr;
-    QPlainTextEdit* sourceCode;
-    QPlainTextEdit* asmCode;
-    QHBoxLayout* layout;
-    QTimer* timer;
-    std::string textVersion;
-    LegendMode currentMode=TIME_MODE;
-
-    Highlighter* cppHighlighter;
-    Highlighter* asmHighlighter;
-    std::string selectedFunction = "";
-
-  private slots:
-    void fillTable();
-    QString formatTimeStamp(QString ts) const;
-
-  public slots:
-    /** selects the function for which more details are displayed */
-    void selectFunction(QTableWidgetItem* it);
-    void selectFunction(std::string fname);
-    void selectFunction(int row);
-    void selectRow(const char* name);
-};
-
-#endif //FUNCTIONDETAILS_HPP
diff --git a/tools/profiler/gui/include/highlighter.hpp b/tools/profiler/gui/include/highlighter.hpp
deleted file mode 100644
index 5ba43738a..000000000
--- a/tools/profiler/gui/include/highlighter.hpp
+++ /dev/null
@@ -1,60 +0,0 @@
-/*
- * Copyright (C) 2020  GreenWaves Technologies, SAS
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU Affero General Public License as
- * published by the Free Software Foundation, either version 3 of the
- * License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU Affero General Public License for more details.
- *
- * You should have received a copy of the GNU Affero General Public License
- * along with this program.  If not, see <https://www.gnu.org/licenses/>.
- */
-
-#ifndef HIGHLIGHTER_HPP
-#define HIGHLIGHTER_HPP
-
-#include <QSyntaxHighlighter>
-#include <QRegularExpression>
-#include <QTextCharFormat>
-#include <QRegularExpressionMatch>
-#include <QRegularExpressionMatchIterator>
-#include <QtGlobal>
-
-class Highlighter : public QSyntaxHighlighter
-{
-    Q_OBJECT
-
-public:
-    Highlighter(QTextDocument *parent = 0);
-
-protected:
-    void highlightBlock(const QString &text) override;
-
-private:
-    struct HighlightingRule
-    {
-        QRegularExpression pattern;
-        QTextCharFormat format;
-    };
-    QVector<HighlightingRule> highlightingRules;
-
-    QRegularExpression commentStartExpression;
-    QRegularExpression commentEndExpression;
-
-    QTextCharFormat keywordFormat;
-    QTextCharFormat classFormat;
-    QTextCharFormat singleLineCommentFormat;
-    QTextCharFormat multiLineCommentFormat;
-    QTextCharFormat quotationFormat;
-    QTextCharFormat functionFormat;
-    QTextCharFormat operatorPattern;
-
-    QTextCharFormat pcFormat;
-};
-
-#endif //HIGHLIGHTER_HPP
diff --git a/tools/profiler/gui/include/mainwindow.hpp b/tools/profiler/gui/include/mainwindow.hpp
deleted file mode 100644
index 48ba9a470..000000000
--- a/tools/profiler/gui/include/mainwindow.hpp
+++ /dev/null
@@ -1,445 +0,0 @@
-/*
- * Copyright (C) 2020  GreenWaves Technologies, SAS
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU Affero General Public License as
- * published by the Free Software Foundation, either version 3 of the
- * License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU Affero General Public License for more details.
- *
- * You should have received a copy of the GNU Affero General Public License
- * along with this program.  If not, see <https://www.gnu.org/licenses/>.
- */
-
-#ifndef MAINWINDOW_HPP
-#define MAINWINDOW_HPP
-
-#include <QMainWindow>
-#include <QPushButton>
-#include <QHBoxLayout>
-#include <QWidget>
-#include <QSize>
-#include <QMdiArea>
-#include <QSplitter>
-#include <QTabWidget>
-#include <QPlainTextEdit>
-#include <QMenu>
-#include <QMenuBar>
-#include <QGroupBox>
-#include <gv/gvsoc_proxy.hpp>
-#include <QDialog>
-#include <QGroupBox>
-#include <QCheckBox>
-
-#include "timeline.hpp"
-#include "execoverview.hpp"
-#include "backend_interface.hpp"
-#include "dialog.hpp"
-
-class MainWindow : public QMainWindow
-{
-    Q_OBJECT
-
-public:
-    explicit MainWindow(std::string exampleDir,
-                        std::string path_to_elf,
-                        std::string configFileName, 
-                        QString signalsTreeFileName);
-    ~MainWindow();
-
-private:
-
-  Timeline* timeline = NULL;
-  FunctionDetails* fd;
-  ExecOverview* overview;
-  SignalTree* signalsView;
-  QPlainTextEdit* sourceCode;
-  QPlainTextEdit* asmCode;
-  StallChart* stallchart;
-  QGroupBox* commands;
-
-  // Dock Widgets
-  QDockWidget* functionsDock;
-  QDockWidget* stallChartDock;
-  QDockWidget* sourceCodeDock;
-  QDockWidget* asmCodeDock;
-  QDockWidget* overviewDock;
-  QDockWidget* signalsDock;
-  QDockWidget* commandsDock;
-
-  // Buttons
-  QPushButton* closeB;
-  QPushButton* runB;
-  QPushButton* pauseB;
-  QPushButton* parametersB;
-  QButtonGroup* gvsocModeGroup;
-
-  // Buttons for switching signals On or Off
-  QDialog* signalsDialog;
-  QGroupBox *signalsBox;
-  QCheckBox *coresBox;
-  QCheckBox *debugBox;
-  QCheckBox *dmaBox;
-  QCheckBox *stallsBox;
-  QCheckBox *statisticsBox;
-  QCheckBox *cachesBox;
-  QCheckBox *powerBox;
-  bool firstGvsocInit = true;
-
-  QDialog* createNonExclusiveGroup();
-
-  // changes color of a button
-  void changeColor(QPushButton* button, QColor color);
-
-  // Tool & Menu bars
-  QToolBar* toolBar;
-  void makeToolBar();
-  QMenu *mainMenu;
-  QMenu* gvsocSettingsMenu;
-  QAction* signalsStatAction;
-  void createDockWindows();
-  void createMenus();
-  void closeWindows();
-
-  // path to the example directory containing its Makefile
-  std::string exampleDir;
-  // path to the executable of the example
-  std::string path_to_elf;
-  // Gvsoc configuration file name
-  std::string configFileName;
-  // Name of the txt file containing the signals tree to be uploaded
-  QString signalsTreeFileName;
-
-  // Name of the FIFO file
-  char const* fifoName="all.bin";
-
-  bool dockWindowsCreated = false;
-  bool gvsocRun = false; // gvsoc run at least once
-  bool gvsocRunning = false;
-  bool gvsocOpened = false;
-  bool signalsAdded = false;
-
-  // Gvsoc Proxy
-  Gvsoc_proxy *gvsoc = NULL;
-
-  // Dialog Widget
-  Dialog dialog;
-
-// Gvsoc signals for fast Gvsoc Mode
-std::vector<std::string>  gvsocSignals = {
-  std::string("/sys/board/chip/soc/fc/state@all.bin"),
-  std::string("/sys/board/chip/soc/udma/spim0_rx/state@all.bin"),
-  std::string("/sys/board/chip/soc/udma/spim0_tx/state@all.bin"),
-  std::string("/sys/board/chip/soc/udma/spim1_rx/state@all.bin"),
-  std::string("/sys/board/chip/soc/udma/spim1_tx/state@all.bin"),
-  std::string("/sys/board/chip/soc/udma/hyper0_rx/state@all.bin"),
-  std::string("/sys/board/chip/soc/udma/hyper0_tx/state@all.bin"),
-  std::string("/sys/board/chip/soc/udma/i2c0_rx/state@all.bin"),
-  std::string("/sys/board/chip/soc/udma/i2c0_tx/state@all.bin"),
-  std::string("/sys/board/chip/soc/udma/i2c1_rx/state@all.bin"),
-  std::string("/sys/board/chip/soc/udma/i2c1_tx/state@all.bin"),
-  std::string("/sys/board/chip/soc/udma/i2s0_rx/state@all.bin"),
-  std::string("/sys/board/chip/soc/udma/i2s0_tdm_0/state@all.bin"),
-  std::string("/sys/board/chip/soc/udma/i2s0_tdm_1/state@all.bin"),
-  std::string("/sys/board/chip/soc/udma/i2s0_tdm_2/state@all.bin"),
-  std::string("/sys/board/chip/soc/udma/i2s0_tdm_3/state@all.bin"),
-  std::string("/sys/board/chip/soc/udma/i2s0_tdm_4/state@all.bin"),
-  std::string("/sys/board/chip/soc/udma/i2s0_tdm_5/state@all.bin"),
-  std::string("/sys/board/chip/soc/udma/i2s0_tdm_6/state@all.bin"),
-  std::string("/sys/board/chip/soc/udma/i2s0_tdm_7/state@all.bin"),
-  std::string("/sys/board/chip/soc/udma/uart0_rx/state@all.bin"),
-  std::string("/sys/board/chip/soc/udma/uart0_tx/state@all.bin"),
-  std::string("/sys/board/chip/soc/udma/cpi0_rx/state@all.bin"),
-  std::string("/sys/board/chip/soc_clock/period@all.bin"),
-  std::string("/sys/board/chip/cluster/pe0/state@all.bin"),
-  std::string("/sys/board/chip/cluster/pe1/state@all.bin"),
-  std::string("/sys/board/chip/cluster/pe2/state@all.bin"),
-  std::string("/sys/board/chip/cluster/pe3/state@all.bin"),
-  std::string("/sys/board/chip/cluster/pe4/state@all.bin"),
-  std::string("/sys/board/chip/cluster/pe5/state@all.bin"),
-  std::string("/sys/board/chip/cluster/pe6/state@all.bin"),
-  std::string("/sys/board/chip/cluster/pe7/state@all.bin"),
-  std::string("/sys/board/chip/cluster/dma/channel_0@all.bin"),
-  std::string("/sys/board/chip/cluster/dma/channel_1@all.bin"),
-  std::string("/sys/board/chip/cluster/dma/channel_2@all.bin"),
-  std::string("/sys/board/chip/cluster/dma/channel_3@all.bin"),
-  std::string("/sys/board/chip/cluster/dma/channel_4@all.bin"),
-  std::string("/sys/board/chip/cluster/dma/channel_5@all.bin"),
-  std::string("/sys/board/chip/cluster/dma/channel_6@all.bin"),
-  std::string("/sys/board/chip/cluster/dma/channel_7@all.bin"),
-  std::string("/sys/board/chip/cluster/dma/channel_8@all.bin"),
-  std::string("/sys/board/chip/cluster/dma/channel_9@all.bin"),
-  std::string("/sys/board/chip/cluster/dma/channel_10@all.bin"),
-  std::string("/sys/board/chip/cluster/dma/channel_11@all.bin"),
-  std::string("/sys/board/chip/cluster/dma/channel_12@all.bin"),
-  std::string("/sys/board/chip/cluster/dma/channel_13@all.bin"),
-  std::string("/sys/board/chip/cluster/dma/channel_14@all.bin"),
-  std::string("/sys/board/chip/cluster/dma/channel_15@all.bin"),
-  std::string("/sys/board/chip/cluster_clock/period@all.bin"),
-  std::string("/sys/board/chip/cluster_clock/cycles"),
-  std::string("/sys/board/chip/cluster/pe0/ipc_stat@all.bin"),
-  std::string("/sys/board/chip/cluster/pe1/ipc_stat@all.bin"),
-  std::string("/sys/board/chip/cluster/pe2/ipc_stat@all.bin"),
-  std::string("/sys/board/chip/cluster/pe3/ipc_stat@all.bin"),
-  std::string("/sys/board/chip/cluster/pe4/ipc_stat@all.bin"),
-  std::string("/sys/board/chip/cluster/pe5/ipc_stat@all.bin"),
-  std::string("/sys/board/chip/cluster/pe6/ipc_stat@all.bin"),
-  std::string("/sys/board/chip/cluster/pe7/ipc_stat@all.bin")
-};
-
-// Gvsoc mandatory signals : must always be received frol gvsoc
-std::vector<std::string>  mandatorySig = {
-  std::string("/sys/board/chip/soc/fc/state@all.bin"),
-  std::string("/sys/board/chip/cluster/pe1/state@all.bin"),
-  std::string("/sys/board/chip/cluster/pe2/state@all.bin"),
-  std::string("/sys/board/chip/cluster/pe3/state@all.bin"),
-  std::string("/sys/board/chip/cluster/pe4/state@all.bin"),
-  std::string("/sys/board/chip/cluster/pe5/state@all.bin"),
-  std::string("/sys/board/chip/cluster/pe6/state@all.bin"),
-  std::string("/sys/board/chip/cluster/pe7/state@all.bin")
-};
-
-// Gvsoc always active Core signals Table
-std::vector<std::string>  coresSig = {
-  std::string("/sys/board/chip/soc/state"),
-  std::string("/sys/board/chip/soc/fc/state"),
-  std::string("/sys/board/chip/cluster/state"),
-  std::string("/sys/board/chip/cluster/pe0/state"),
-  std::string("/sys/board/chip/cluster/pe1/state"),
-  std::string("/sys/board/chip/cluster/pe2/state"),
-  std::string("/sys/board/chip/cluster/pe3/state"),
-  std::string("/sys/board/chip/cluster/pe4/state"),
-  std::string("/sys/board/chip/cluster/pe5/state"),
-  std::string("/sys/board/chip/cluster/pe6/state"),
-  std::string("/sys/board/chip/cluster/pe7/state")
-};
-
-// Gvsoc debug symbols signals Table
-// signal integer 1-32 bits but different from the others
-std::vector<std::string>  debugSymbolsSig = {
-  std::string("/sys/board/chip/soc/fc/pc"),
-  std::string("/sys/board/chip/cluster/pe0/pc"),
-  std::string("/sys/board/chip/cluster/pe1/pc"),
-  std::string("/sys/board/chip/cluster/pe2/pc"),
-  std::string("/sys/board/chip/cluster/pe3/pc"),
-  std::string("/sys/board/chip/cluster/pe4/pc"),
-  std::string("/sys/board/chip/cluster/pe5/pc"),
-  std::string("/sys/board/chip/cluster/pe6/pc"),
-  std::string("/sys/board/chip/cluster/pe7/pc")
-};
-
-
-// Gvsoc Dma Symbols Signals Table
-std::vector<std::string>  dmaSig = {
-  std::string("/sys/board/chip/soc/udma/spim0_rx/state"),
-  std::string("/sys/board/chip/soc/udma/spim0_tx/state"),
-  std::string("/sys/board/chip/soc/udma/spim1_rx/state"),
-  std::string("/sys/board/chip/soc/udma/spim1_tx/state"),
-  std::string("/sys/board/chip/soc/udma/hyper0_rx/state"),
-  std::string("/sys/board/chip/soc/udma/hyper0_tx/state"),
-  std::string("/sys/board/chip/soc/udma/i2c0_rx/state"),
-  std::string("/sys/board/chip/soc/udma/i2c0_tx/state"),
-  std::string("/sys/board/chip/soc/udma/i2c1_rx/state"),
-  std::string("/sys/board/chip/soc/udma/i2c1_tx/state"),
-  std::string("/sys/board/chip/soc/udma/uart0_rx/state"),
-  std::string("/sys/board/chip/soc/udma/uart0_tx/state"),
-  std::string("/sys/board/chip/soc/udma/cpi0_rx/state"),
-  std::string("/sys/board/chip/cluster/dma/channel_0"),
-  std::string("/sys/board/chip/cluster/dma/channel_1"),
-  std::string("/sys/board/chip/cluster/dma/channel_2"),
-  std::string("/sys/board/chip/cluster/dma/channel_3"),
-  std::string("/sys/board/chip/cluster/dma/channel_4"),
-  std::string("/sys/board/chip/cluster/dma/channel_5"),
-  std::string("/sys/board/chip/cluster/dma/channel_6"),
-  std::string("/sys/board/chip/cluster/dma/channel_7"),
-  std::string("/sys/board/chip/cluster/dma/channel_8"),
-  std::string("/sys/board/chip/cluster/dma/channel_9"),
-  std::string("/sys/board/chip/cluster/dma/channel_10"),
-  std::string("/sys/board/chip/cluster/dma/channel_11"),
-  std::string("/sys/board/chip/cluster/dma/channel_12"),
-  std::string("/sys/board/chip/cluster/dma/channel_13"),
-  std::string("/sys/board/chip/cluster/dma/channel_14"),
-  std::string("/sys/board/chip/cluster/dma/channel_15")
-};
-
-// Gvsoc Stalls Signals Table
-std::vector<std::string>  stallsSig  = {
-  std::string("/sys/board/chip/soc/fc/pcer_cycles"),
-  std::string("/sys/board/chip/soc/fc/pcer_instr"),
-  std::string("/sys/board/chip/soc/fc/pcer_ld_stall"),
-  std::string("/sys/board/chip/soc/fc/pcer_jmp_stall"),
-  std::string("/sys/board/chip/soc/fc/pcer_imiss"),
-  std::string("/sys/board/chip/soc/fc/pcer_ld"),
-  std::string("/sys/board/chip/soc/fc/pcer_st"),
-  std::string("/sys/board/chip/soc/fc/pcer_jump"),
-  std::string("/sys/board/chip/soc/fc/pcer_branch"),
-  std::string("/sys/board/chip/soc/fc/pcer_taken_branch"),
-  std::string("/sys/board/chip/soc/fc/pcer_rvc"),
-  std::string("/sys/board/chip/soc/fc/pcer_ld_ext"),
-  std::string("/sys/board/chip/soc/fc/pcer_st_ext"),
-  std::string("/sys/board/chip/soc/fc/pcer_ld_ext_cycles"),
-  std::string("/sys/board/chip/soc/fc/pcer_st_ext_cycles"),
-  std::string("/sys/board/chip/soc/fc/pcer_tcdm_cont"),
-  std::string("/sys/board/chip/soc/fc/misaligned"),
-  std::string("/sys/board/chip/cluster/pe0/pcer_cycles"),
-  std::string("/sys/board/chip/cluster/pe0/pcer_instr"),
-  std::string("/sys/board/chip/cluster/pe0/pcer_ld_stall"),
-  std::string("/sys/board/chip/cluster/pe0/pcer_jmp_stall"),
-  std::string("/sys/board/chip/cluster/pe0/pcer_imiss"),
-  std::string("/sys/board/chip/cluster/pe0/pcer_ld"),
-  std::string("/sys/board/chip/cluster/pe0/pcer_st"),
-  std::string("/sys/board/chip/cluster/pe0/pcer_jump"),
-  std::string("/sys/board/chip/cluster/pe0/pcer_branch"),
-  std::string("/sys/board/chip/cluster/pe0/pcer_taken_branch"),
-  std::string("/sys/board/chip/cluster/pe0/pcer_rvc"),
-  std::string("/sys/board/chip/cluster/pe0/pcer_ld_ext"),
-  std::string("/sys/board/chip/cluster/pe0/pcer_st_ext"),
-  std::string("/sys/board/chip/cluster/pe0/pcer_ld_ext_cycles"),
-  std::string("/sys/board/chip/cluster/pe0/pcer_st_ext_cycles"),
-  std::string("/sys/board/chip/cluster/pe0/pcer_tcdm_cont"),
-  std::string("/sys/board/chip/cluster/pe0/misaligned"),
-  std::string("/sys/board/chip/cluster/pe1/pcer_cycles"),
-  std::string("/sys/board/chip/cluster/pe1/pcer_instr"),
-  std::string("/sys/board/chip/cluster/pe1/pcer_ld_stall"),
-  std::string("/sys/board/chip/cluster/pe1/pcer_jmp_stall"),
-  std::string("/sys/board/chip/cluster/pe1/pcer_imiss"),
-  std::string("/sys/board/chip/cluster/pe1/pcer_ld"),
-  std::string("/sys/board/chip/cluster/pe1/pcer_st"),
-  std::string("/sys/board/chip/cluster/pe1/pcer_jump"),
-  std::string("/sys/board/chip/cluster/pe1/pcer_branch"),
-  std::string("/sys/board/chip/cluster/pe1/pcer_taken_branch"),
-  std::string("/sys/board/chip/cluster/pe1/pcer_rvc"),
-  std::string("/sys/board/chip/cluster/pe1/pcer_ld_ext"),
-  std::string("/sys/board/chip/cluster/pe1/pcer_st_ext"),
-  std::string("/sys/board/chip/cluster/pe1/pcer_ld_ext_cycles"),
-  std::string("/sys/board/chip/cluster/pe1/pcer_st_ext_cycles"),
-  std::string("/sys/board/chip/cluster/pe1/pcer_tcdm_cont"),
-  std::string("/sys/board/chip/cluster/pe1/misaligned"),
-  std::string("/sys/board/chip/cluster/pe2/pcer_cycles"),
-  std::string("/sys/board/chip/cluster/pe2/pcer_instr"),
-  std::string("/sys/board/chip/cluster/pe2/pcer_ld_stall"),
-  std::string("/sys/board/chip/cluster/pe2/pcer_jmp_stall"),
-  std::string("/sys/board/chip/cluster/pe2/pcer_imiss"),
-  std::string("/sys/board/chip/cluster/pe2/pcer_ld"),
-  std::string("/sys/board/chip/cluster/pe2/pcer_st"),
-  std::string("/sys/board/chip/cluster/pe2/pcer_jump"),
-  std::string("/sys/board/chip/cluster/pe2/pcer_branch"),
-  std::string("/sys/board/chip/cluster/pe2/pcer_taken_branch"),
-  std::string("/sys/board/chip/cluster/pe2/pcer_rvc"),
-  std::string("/sys/board/chip/cluster/pe2/pcer_ld_ext"),
-  std::string("/sys/board/chip/cluster/pe2/pcer_st_ext"),
-  std::string("/sys/board/chip/cluster/pe2/pcer_ld_ext_cycles"),
-  std::string("/sys/board/chip/cluster/pe2/pcer_st_ext_cycles"),
-  std::string("/sys/board/chip/cluster/pe2/pcer_tcdm_cont"),
-  std::string("/sys/board/chip/cluster/pe2/misaligned"),
-  std::string("/sys/board/chip/cluster/pe3/pcer_cycles"),
-  std::string("/sys/board/chip/cluster/pe3/pcer_instr"),
-  std::string("/sys/board/chip/cluster/pe3/pcer_ld_stall"),
-  std::string("/sys/board/chip/cluster/pe3/pcer_jmp_stall"),
-  std::string("/sys/board/chip/cluster/pe3/pcer_imiss"),
-  std::string("/sys/board/chip/cluster/pe3/pcer_ld"),
-  std::string("/sys/board/chip/cluster/pe3/pcer_st"),
-  std::string("/sys/board/chip/cluster/pe3/pcer_jump"),
-  std::string("/sys/board/chip/cluster/pe3/pcer_branch"),
-  std::string("/sys/board/chip/cluster/pe3/pcer_taken_branch"),
-  std::string("/sys/board/chip/cluster/pe3/pcer_rvc"),
-  std::string("/sys/board/chip/cluster/pe3/pcer_ld_ext"),
-  std::string("/sys/board/chip/cluster/pe3/pcer_st_ext"),
-  std::string("/sys/board/chip/cluster/pe3/pcer_ld_ext_cycles"),
-  std::string("/sys/board/chip/cluster/pe3/pcer_st_ext_cycles"),
-  std::string("/sys/board/chip/cluster/pe3/pcer_tcdm_cont"),
-  std::string("/sys/board/chip/cluster/pe3/misaligned"),
-  std::string("/sys/board/chip/cluster/pe4/pcer_cycles"),
-  std::string("/sys/board/chip/cluster/pe4/pcer_instr"),
-  std::string("/sys/board/chip/cluster/pe4/pcer_ld_stall"),
-  std::string("/sys/board/chip/cluster/pe4/pcer_jmp_stall"),
-  std::string("/sys/board/chip/cluster/pe4/pcer_imiss"),
-  std::string("/sys/board/chip/cluster/pe4/pcer_ld"),
-  std::string("/sys/board/chip/cluster/pe4/pcer_st"),
-  std::string("/sys/board/chip/cluster/pe4/pcer_jump"),
-  std::string("/sys/board/chip/cluster/pe4/pcer_branch"),
-  std::string("/sys/board/chip/cluster/pe4/pcer_taken_branch"),
-  std::string("/sys/board/chip/cluster/pe4/pcer_rvc"),
-  std::string("/sys/board/chip/cluster/pe4/pcer_ld_ext"),
-  std::string("/sys/board/chip/cluster/pe4/pcer_st_ext"),
-  std::string("/sys/board/chip/cluster/pe4/pcer_ld_ext_cycles"),
-  std::string("/sys/board/chip/cluster/pe4/pcer_st_ext_cycles"),
-  std::string("/sys/board/chip/cluster/pe4/pcer_tcdm_cont"),
-  std::string("/sys/board/chip/cluster/pe4/misaligned"),
-  std::string("/sys/board/chip/cluster/pe5/pcer_cycles"),
-  std::string("/sys/board/chip/cluster/pe5/pcer_instr"),
-  std::string("/sys/board/chip/cluster/pe5/pcer_ld_stall"),
-  std::string("/sys/board/chip/cluster/pe5/pcer_jmp_stall"),
-  std::string("/sys/board/chip/cluster/pe5/pcer_imiss"),
-  std::string("/sys/board/chip/cluster/pe5/pcer_ld"),
-  std::string("/sys/board/chip/cluster/pe5/pcer_st"),
-  std::string("/sys/board/chip/cluster/pe5/pcer_jump"),
-  std::string("/sys/board/chip/cluster/pe5/pcer_branch"),
-  std::string("/sys/board/chip/cluster/pe5/pcer_taken_branch"),
-  std::string("/sys/board/chip/cluster/pe5/pcer_rvc"),
-  std::string("/sys/board/chip/cluster/pe5/pcer_ld_ext"),
-  std::string("/sys/board/chip/cluster/pe5/pcer_st_ext"),
-  std::string("/sys/board/chip/cluster/pe5/pcer_ld_ext_cycles"),
-  std::string("/sys/board/chip/cluster/pe5/pcer_st_ext_cycles"),
-  std::string("/sys/board/chip/cluster/pe5/pcer_tcdm_cont"),
-  std::string("/sys/board/chip/cluster/pe5/misaligned"),
-  std::string("/sys/board/chip/cluster/pe6/pcer_cycles"),
-  std::string("/sys/board/chip/cluster/pe6/pcer_instr"),
-  std::string("/sys/board/chip/cluster/pe6/pcer_ld_stall"),
-  std::string("/sys/board/chip/cluster/pe6/pcer_jmp_stall"),
-  std::string("/sys/board/chip/cluster/pe6/pcer_imiss"),
-  std::string("/sys/board/chip/cluster/pe6/pcer_ld"),
-  std::string("/sys/board/chip/cluster/pe6/pcer_st"),
-  std::string("/sys/board/chip/cluster/pe6/pcer_jump"),
-  std::string("/sys/board/chip/cluster/pe6/pcer_branch"),
-  std::string("/sys/board/chip/cluster/pe6/pcer_taken_branch"),
-  std::string("/sys/board/chip/cluster/pe6/pcer_rvc"),
-  std::string("/sys/board/chip/cluster/pe6/pcer_ld_ext"),
-  std::string("/sys/board/chip/cluster/pe6/pcer_st_ext"),
-  std::string("/sys/board/chip/cluster/pe6/pcer_ld_ext_cycles"),
-  std::string("/sys/board/chip/cluster/pe6/pcer_st_ext_cycles"),
-  std::string("/sys/board/chip/cluster/pe6/pcer_tcdm_cont"),
-  std::string("/sys/board/chip/cluster/pe6/misaligned"),
-  std::string("/sys/board/chip/cluster/pe7/pcer_cycles"),
-  std::string("/sys/board/chip/cluster/pe7/pcer_instr"),
-  std::string("/sys/board/chip/cluster/pe7/pcer_ld_stall"),
-  std::string("/sys/board/chip/cluster/pe7/pcer_jmp_stall"),
-  std::string("/sys/board/chip/cluster/pe7/pcer_imiss"),
-  std::string("/sys/board/chip/cluster/pe7/pcer_ld"),
-  std::string("/sys/board/chip/cluster/pe7/pcer_st"),
-  std::string("/sys/board/chip/cluster/pe7/pcer_jump"),
-  std::string("/sys/board/chip/cluster/pe7/pcer_branch"),
-  std::string("/sys/board/chip/cluster/pe7/pcer_taken_branch"),
-  std::string("/sys/board/chip/cluster/pe7/pcer_rvc"),
-  std::string("/sys/board/chip/cluster/pe7/pcer_ld_ext"),
-  std::string("/sys/board/chip/cluster/pe7/pcer_st_ext"),
-  std::string("/sys/board/chip/cluster/pe7/pcer_ld_ext_cycles"),
-  std::string("/sys/board/chip/cluster/pe7/pcer_st_ext_cycles"),
-  std::string("/sys/board/chip/cluster/pe7/pcer_tcdm_cont"),
-  std::string("/sys/board/chip/cluster/pe7/misaligned"),
-  std::string("/sys/board/chip/cluster_clock/cycles")
-};
-
-void switchSignalsGroup(std::vector<std::string> &signalsTable, bool state);
-void handleOpenB();
-
-private slots:
-  void foo();
-
-  void handleCloseB();
-  void handleRunB();
-  void handlePauseB();
-
-  void initGvsoc();
-  void switchCoresSig(bool state);
-  void switchDebugSig(bool state);
-  void switchDmaSig(bool state);
-  void switchStallsSig(bool state);
-  void signalsStatActionChecked();
-};
-
-#endif // MAINWINDOW_HPP
diff --git a/tools/profiler/gui/include/prolabel.hpp b/tools/profiler/gui/include/prolabel.hpp
deleted file mode 100644
index ffb1db162..000000000
--- a/tools/profiler/gui/include/prolabel.hpp
+++ /dev/null
@@ -1,38 +0,0 @@
-/*
- * Copyright (C) 2020  GreenWaves Technologies, SAS
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU Affero General Public License as
- * published by the Free Software Foundation, either version 3 of the
- * License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU Affero General Public License for more details.
- *
- * You should have received a copy of the GNU Affero General Public License
- * along with this program.  If not, see <https://www.gnu.org/licenses/>.
- */
-
-#ifndef PROLABEL_HPP
-#define PROLABEL_HPP
-
-#include <QLabel>
-
-// This class is made to create label that is clickable
-// All that's needed is to connect the clicked() signal of the label
-// to some slot. 
-class ProLabel : public QLabel 
-{
-   Q_OBJECT
-public:
-   ProLabel(const QString & text, QWidget * parent = 0, Qt::WindowFlags f = 0);
-   ~ProLabel();
-signals:
-   void clicked();
-protected:
-   void mousePressEvent(QMouseEvent*);
-};
-
-#endif /* PROLABEL_HPP */
diff --git a/tools/profiler/gui/include/splitterdesign.hpp b/tools/profiler/gui/include/splitterdesign.hpp
deleted file mode 100644
index aeeeaba0c..000000000
--- a/tools/profiler/gui/include/splitterdesign.hpp
+++ /dev/null
@@ -1,30 +0,0 @@
-/*
- * Copyright (C) 2020  GreenWaves Technologies, SAS
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU Affero General Public License as
- * published by the Free Software Foundation, either version 3 of the
- * License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU Affero General Public License for more details.
- *
- * You should have received a copy of the GNU Affero General Public License
- * along with this program.  If not, see <https://www.gnu.org/licenses/>.
- */
-
-#include <QSplitter>
-
-// This class contains functions to redesign the application
-// splitters
-class SplitterDesign
-{
-    public:
-    // Menber Functions()
-
-    // Makes splitter bigger (3 lines instead of 3 points)
-    // Adds a line that shows where the splitter is moved to
-    void decorateSplitter(QSplitter* splitter, int index);
-};
\ No newline at end of file
diff --git a/tools/profiler/gui/include/stallchart.hpp b/tools/profiler/gui/include/stallchart.hpp
deleted file mode 100644
index f63c1cd30..000000000
--- a/tools/profiler/gui/include/stallchart.hpp
+++ /dev/null
@@ -1,48 +0,0 @@
-/*
- * Copyright (C) 2020  GreenWaves Technologies, SAS
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU Affero General Public License as
- * published by the Free Software Foundation, either version 3 of the
- * License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU Affero General Public License for more details.
- *
- * You should have received a copy of the GNU Affero General Public License
- * along with this program.  If not, see <https://www.gnu.org/licenses/>.
- */
-
-
-#ifndef STALLCHART_HPP
-#define STALLCHART_HPP
-
-#include <QWidget>
-#include <QFrame>
-#include <QtWidgets/QApplication>
-#include <QtWidgets/QMainWindow>
-#include <QtCharts/QChartView>
-#include <QtCharts/QPieSeries>
-#include <QtCharts/QPieSlice>
-#include <QtCharts/QBarSeries>
-#include <QtCharts/QBarSet>
-#include <QHBoxLayout>
-
-// Class used fo the "Stall reasons" tab in the functions tab
-class StallChart : public QWidget
-{
-  Q_OBJECT
-
-public:
-  StallChart(QWidget* parent);
-  void construct(std::string functionName);
-
-private:
-  std::string name;
-  QtCharts::QBarSeries *series = nullptr;
-  QtCharts::QChart *chart = nullptr;
-};
-
-#endif  //STALLCHART_HPP
diff --git a/tools/profiler/gui/include/statmodel.hpp b/tools/profiler/gui/include/statmodel.hpp
deleted file mode 100644
index 6b8dd4345..000000000
--- a/tools/profiler/gui/include/statmodel.hpp
+++ /dev/null
@@ -1,90 +0,0 @@
-/*
- * Copyright (C) 2020  GreenWaves Technologies, SAS
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU Affero General Public License as
- * published by the Free Software Foundation, either version 3 of the
- * License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU Affero General Public License for more details.
- *
- * You should have received a copy of the GNU Affero General Public License
- * along with this program.  If not, see <https://www.gnu.org/licenses/>.
- */
-
-#ifndef STATMODEL_HPP
-#define STATMODEL_HPP
-#include <unistd.h>
-#include <QAbstractTableModel>
-#include "treemodel.hpp"
-#include <QTreeView>
-#include <QTimer>
-#include "backend_interface.hpp"
-#include "datamanager.hpp"
-#include "timeline.hpp"
-
-class Timeline;
-
-class StatModel : public QAbstractTableModel
-{
-    Q_OBJECT
-
-public:
-
-    StatModel(Timeline* timeLine, QObject *parent = 0);
-    void populateData(uint64_t t0 , uint64_t t1);
-
-    int rowCount(const QModelIndex &parent = QModelIndex()) const Q_DECL_OVERRIDE;
-    int columnCount(const QModelIndex &parent = QModelIndex()) const Q_DECL_OVERRIDE;
-
-    QVariant data(const QModelIndex &index, int role = Qt::DisplayRole) const Q_DECL_OVERRIDE;
-    QVariant headerData(int section, Qt::Orientation orientation, int role = Qt::DisplayRole) const Q_DECL_OVERRIDE;
-    Timeline* tl; // timeline widget
-
-public slots:
-
-    void updateTreeParameters(TreeModel* sigModel,
-                              QTreeView* signalsTreeView);
-private:
-    QList<QString> smDutyTime; // the data stored by the model
-
-    // usefull objects to construct the model
-    TreeModel* signalsModel = NULL;
-    QTreeView* signalsView = NULL;
-    QTimer* timer;
-    int rowNb = -1;
-
-    // utility functions to help construct the model
-    template <typename T>
-    uint64_t getTimeUp(const Data_with_time<T>& dwt,
-                                    uint64_t t0,
-                                    uint64_t t1
-                                    );
-    uint64_t calculateTimeUp(const QString signalPath,
-                        std::vector<TLData<const char*>> data,
-                        std::vector<int> signalIdList,
-                        uint64_t t0,
-                        uint64_t t1);
-    void insertRow( const QString signalPath,
-                    std::vector<TLData<const char*>> data,
-                    std::vector<int> signalIdList,
-                    int line,
-                    uint64_t t0,
-                    uint64_t t1);
-    void buildModel(QModelIndex parent,
-                    QModelIndex signalIdx,
-                    std::vector<TLData<const char*>> data,
-                    std::vector<int> signalIdList,
-                    int* line,
-                    uint64_t t0,
-                    uint64_t t1);
-    //void updateModel(QModelIndex parent,QModelIndex signalIdx);
-    //void updateRow(const QString signalPath);
-//private slots:
-    //void refreshData();
-};
-
-#endif // STATMODEL_HPP
diff --git a/tools/profiler/gui/include/timeline.hpp b/tools/profiler/gui/include/timeline.hpp
deleted file mode 100644
index 3795d7789..000000000
--- a/tools/profiler/gui/include/timeline.hpp
+++ /dev/null
@@ -1,517 +0,0 @@
-/*
- * Copyright (C) 2020  GreenWaves Technologies, SAS
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU Affero General Public License as
- * published by the Free Software Foundation, either version 3 of the
- * License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU Affero General Public License for more details.
- *
- * You should have received a copy of the GNU Affero General Public License
- * along with this program.  If not, see <https://www.gnu.org/licenses/>.
- */
-
-#ifndef TIMELINE_HPP
-#define TIMELINE_HPP
-
-#include <string>
-#include <unordered_map>
-
-#include <QWidget>
-#include <QGraphicsView>
-#include <QHBoxLayout>
-#include <QVBoxLayout>
-#include <QGridLayout>
-#include <QFrame>
-#include <QPaintEvent>
-#include <QMdiSubWindow>
-#include <QPushButton>
-#include <QButtonGroup>
-#include <QRadioButton>
-#include <QScrollArea>
-#include <QGraphicsScene>
-#include <QGraphicsRectItem>
-#include <QResizeEvent>
-#include <QLabel>
-#include <QGraphicsItem>
-#include <QValueAxis>
-#include <QToolBar>
-#include <QMainWindow>
-#include <QLineEdit>
-#include <QMutex>
-#include <QTreeView>
-#include <QPoint>
-#include <QDockWidget>
-#include <QScrollArea>
-#include <QMessageBox>
-#include <QFile>
-#include <QProgressBar>
-
-#include <thread>
-#include <mutex>
-#include <QMutex>
-#include <unistd.h>
-
-#include "functiondetails.hpp"
-#include "treemodel.hpp"
-#include "stallchart.hpp"
-#include "prolabel.hpp"
-
-#include "tldata.hpp"
-#include "treemodel.hpp"
-#include "statmodel.hpp"
-#include <QTableView>
-
-class Timeline;
-class TLGView;
-class StatModel;
-
-class AdjustingScrollArea : public QScrollArea {
-  Q_OBJECT
-  bool eventFilter(QObject * obj, QEvent * ev);
-public:
-  AdjustingScrollArea(QTreeView* signalsView,
-                      TLGView* gview,
-                       QWidget * parent = 0);
-  void setWidget(QWidget *w);
-  void updateWidgetPosition();
-  QTreeView* signalsTreeView;
-  TLGView* tlgView;
-public slots:
-  void syncWithTreeView();
-private:
-    QMutex syncMutex;
-};
-
-class HLegendWidget : public QWidget
-{
-  Q_OBJECT
-
-public:
-
-    HLegendWidget(TLGView* gview);
-
-  protected:
-    void paintEvent(QPaintEvent* event) override;
-
-  private:
-    TLGView* gview;
-    QPen pen;
-
-};
-
-/**
- @brief TLGView stands for TimeLine Graphic View. This class is responsible for the
- display in a QWidget
- */
-class TLGView : public QWidget
-{
-  Q_OBJECT
-
-public:
-  const static double rectRelativeSize;
-  const static double rectRelativeOffset;
-  const static int ticStep;        /**< time axis tic step, in pixels */
-  const static int legendWidth;   /**< width of labels of the time axis */
-  const static int legendTextSpace;/**< space between two labels */
-  const static int ticSize;     /**< tic length on the time axis */
-  const static int functionNameMinSize; /**< minimum width of a rect for the function name to be displayed */
-  const static int autoRedrawPeriod; /**< number of ms between 2 updates of the view */
-  const static int scrollBarMaxRange; /**< range of the scrollBar */
-  const static double scrollCoeff; /**< defines how fast user can scroll */
-
-  TLGView(FunctionDetails* fd,
-          QDockWidget* functionsDock,
-          Timeline* tl,
-          QScrollBar* sbar,
-          QVBoxLayout* legendLayout,
-          QPlainTextEdit* sourceCode,
-          QPlainTextEdit* asmCode,
-          StallChart* stallchart,
-          TreeModel* model,
-          QTreeView* signalsView,
-          StatModel* statModelRef,
-          QTableView*  statTableViewRef);
-
-public:
-  int getSignalIndex(std::vector<int> signalIdList , int signalId);
-  void addSignalToGview(QString signalPath);
-  void addSignalsToGview (QModelIndex parent,QModelIndex signalIdx );
-
-  // For the selection range being displayed
-  uint64_t t0Select=0;
-  uint64_t t1Select=0;
-  // For saving the selection range already displayed
-  uint64_t t0SelectDisplayed=0;
-  uint64_t t1SelectDisplayed=0;
-
-public slots:
-  //void show();
-  void zoomIn();
-  void zoomOut();
-  void viewAll();
-
-  void gotoTimestamp(int64_t t);
-  /**
-    changes time unit. The mode can be seconds, or cluster cycle, ...
-    The cycle mode (cluster or soc) it is based on the last registered cycle
-    period, so it might be wrong it cycle period has changed
-  */
-
-  //void toggleMode();
-  void radioB1Clicked();
-  void radioB2Clicked();
-  void radioB3Clicked();
-  void sliderMoved(int x);
-
-  uint getLineHeight() const { return peHeight; }
-
-  //void updateLeftLegendFast();
-  //void updateLeftLegendSlow();
-  void addTraceToLegend(std::string path, bool addMenuAction=true);
-  void deleteLegendItem(QLayoutItem* item, QLayoutItem* stretch);
-  void handleSignalNodeCollapsed();
-  void handleSignalNodeExpanded();
-
-signals:
-  void zoomOccured();
-
-protected:
-  void paintEvent(QPaintEvent* event) override;
-  void wheelEvent(QWheelEvent*) override;
-  QString findEltOnRow( QModelIndex parent,
-                              QModelIndex signalIdx,
-                              int rowNb,
-                              int* rowIdx);
-  void mousePressEvent(QMouseEvent *event) override;
-  void mouseReleaseEvent(QMouseEvent *event) override;
-  void mouseMoveEvent(QMouseEvent *event);
-  void highlightFunction(QPointF q);
-  void mouseDoubleClickEvent(QMouseEvent* event);
-  void drawBackground(QPainter *painter, const QRect &exposed, int nlines);
-  void scrollWithPixels(const QPoint &pixel);
-  void scrollWithDegrees(const QPoint &step);
-  int extractPower10Round(int x);
-
-  private:
-  // GvsocMode
-  bool gvsocSlowMode=false;
-
-  // for the ruler & selection area on gview-- added by Corine
-  QLine m_line; // line drawn by mouse move event
-  QRect m_rect; // rectangle drawn by mouse move event
-  bool selectionRect= false; // true if a selection rect is displayed on gview
-  bool movingLeftEdge = false; // true if user is currently adjusting left edge of select rect
-  bool movingRightEdge = false; // true if user is currently adjusting right edge of select rect
-  QPixmap m_nPTargetPixmap; // pixmap for displaying the time range selection area
-  bool m_nbMousePressed;
-  bool drawRect = false;
-  bool drawLine = false;
-  bool mouseMoving = false;
-  bool leftClick=false;
-  bool rightClick=false;
-  QPointF functionPoint;
-  QWidget* parentWidget;
-
-
-  void updateVerticalLine(QPointF q);
-  void updateVerticalLineText();
-  void updateTimeInterval(uint64_t position);
-  void resizeEvent(QResizeEvent *event) override;
-
-  /** gives the time range currently displayed to the user */
-  void getTimeWindow(uint64_t& t0, uint64_t& t1) const;
-  /** gives the time range that corresponds to the pixels between x0 and x1 */
-  void getTimeWindow(int x0, int x1, uint64_t& t0, uint64_t& t1) const;
-  uint64_t timeWidth() const;
-
-  void setMaxScrollTime();
-  void shiftAfterZoom(double coeff);
-  void setPageStep();
-
-
-  void paintSignalToGview(QPainter& painter, const QString signalPath,
-                                std::vector<TLData<const char*>> data, std::vector<int> signalIdList, int line,
-                                uint64_t t0, uint64_t t1, int x1);
-
-
-  void paintSubTree(QPainter& painter,
-                    QModelIndex parent,
-                    QModelIndex signalIdx,
-                    std::vector<TLData<const char*>> data,
-                    std::vector<int> signalIdList,
-                    int* line,
-                    uint64_t t0,
-                    uint64_t t1,
-                    int x1);
-
-  /** converts a timestamp to a position in pixel in the widget */
-  int time2position(uint64_t t) const;
-  /** converts a position in pixel in the widget to a timestamp */
-  uint64_t position2time(int x) const;
-  /** converts a width in pixel in the widget to a time interval */
-  uint64_t width2time(int x) const;
-
-
-  /** @return a string description of the given position
-              (e.g. 123 pixels -> "432 ns")
-  */
-  QString position2string(int x) const;
-  QString time2string(uint64_t t) const;
-  QString time2stringBis(uint64_t t) const;
-  QString formatTimeStamp(QString ts) const;
-
-  void centerOnMousePosition(uint64_t mouseTimestamp);
-  void placeMouseTimestampOnPoint(uint64_t mouseTimestamp, QPointF mousePosition);
-  void zoomThread();
-  void addToList(uint64_t mouseTimestamp);
-
-  std::list<uint64_t> mouseTimestamps; /** list to store the different mousetimestamps during a wheelEvent **/
-  std::mutex mouseMutex; /** mutex used to help manage the mouse centered zoom **/
-  uint16_t wheelEventCount =0; /** nb of wheelEvents during one mouse scroll zoom commande **/
-  std::thread th1;
-
-  QColor getColor(std::string fname);
-  void drawCustomTraces();
-  void clearWidgets(QLayout * layout);
-
-
-  int getCoreNb(QString sigShortName);
-  int getCoreNb2(QString sigLongName);
-
-
-  /**
-    draws a Data_with_time element
-    this function is not trivial to implement because all supported types T must
-    be handled correctly
-  */
-  template <typename T> int drawDwt(const Data_with_time<T>& dwt, int x1, int y,
-                                    QPainter& painter);
-  //QString getLegendModeText();
-
-  ProLabel* insertLegendItem(int index, const char* str, bool addMenuAction);
-
-
-  double zoomFactor;    /**< number of pixels per time unit */
-  uint64_t timeOffset;  /**< timestamp of the leftmost displayed pixel */
-  uint64_t maxTime;     /**< timestamp of the last event */
-  uint64_t maxScrollTime; /**< user cannot scroll beyond this timestamp */
-  uint64_t verticalLineTime;  /**< abscissa of the red vertical line */
-  uint64_t timeIntervalValue;// Value of the time interval displayed by the mouse move
-  uint maxY;
-  uint peHeight=30;  /**< height in pixels of the timeline for 1 core */
-  uint nCore;
-  Timeline* tl;
-  FunctionDetails* fd;
-
-
-  // Widgets we need to communicate with
-  QDockWidget* functionsDock;
-  QPlainTextEdit* sourceCode;
-  QPlainTextEdit* asmCode;
-  StallChart* stallchart;
-  QScrollArea* sArea;
-
-  TreeModel* signalsModel;
-  QTreeView* signalsView;
-  StatModel* statModel;
-  QTableView* statTableView;
-
-
-  QTabWidget* tabw;
-  QMessageBox msgBox;
-
-  //int functionsIdx;
-  QScrollBar* sbar;
-  QVBoxLayout* legendLayout;
-  QPen pen, greenPen, redPen;
-  QTimer timer;
-  QMutex drawMutex;
-  LegendMode currentMode=TIME_MODE;
-
-  std::unordered_map<std::string, QColor> functionColors;
-  int hueStep = 90;
-  int currentHue = 0;
-
-  friend class HLegendWidget;
-
-  private slots:
-  /** checks whether new data has arrived from gvsoc. If yes, view is updated */
-  void autoReconstruct();
-  void execMessage(const QString &text);
-
-};
-
-
-/**
-  @brief class to let the user browse and choose a trace to add to the timeline
-*/
-class SignalTree : public QTreeView
-{
-  Q_OBJECT
-public:
-  SignalTree(TLGView* gview, QWidget* parent);
-
-private:
-  QAction* addTraceAction;
-  TLGView* gview;
-
-private slots:
-  void updateScrollArea(const QModelIndex& i);
-  void contextMenuHandler();
-
-};
-
-
-/**
-  @brief class to let the user browse and choose a group of traces to add to the timeline
-*/
-/*class SignalsCommand : public
-{
-  Q_OBJECT
-public:
-  SignalTree(TLGView* gview, QWidget* parent);
-
-private:
-  QAction* addTraceAction;
-  TLGView* gview;
-
-private slots:
-  void updateScrollArea(const QModelIndex& i);
-  void contextMenuHandler();
-
-};
-*/
-
-/**
-  @brief class that regroups controls elements and timeline view
-SignalTree
-  class that regroups
-    - the view of the timeline (TLGView)
-    - the legend of the view
-    - the toolBar to control the view
-    - the scrollbar
-    - the signalTree to add traces to the view
-    */
-class Timeline : public QWidget
-{
-  Q_OBJECT
-public:
-  Timeline( QMainWindow* mw,
-            QToolBar* toolBar,
-            FunctionDetails* fd,
-            QDockWidget* functionsDock,
-            QPlainTextEdit* sourceCode,
-            QPlainTextEdit* asmCode,
-            StallChart* stallchart, 
-            QString signalsTreeFileName);
-  ~Timeline();
-  void setTimestamp(const QString& s) {timestamp->setText(s); }
-  void setTimeInterval(const QString& s) {timeInterval->setText(s); }
-  void resizeEvent(QResizeEvent *event) override;
-  void setModeButtonText(QString s);
-  void foo(int x);
-  void clearCurrentExecToolBar();
-  void addTraceToLegend(std::string path, bool addMenuAction=true);
-  TLGView* getTLGView(){return gview;};
-  //void setGviewVScrollValue();
-  void updateSignalsStatView( TreeModel* sigModel,
-                              QTreeView* signalsTreeView,
-                              uint64_t t0,
-                              uint64_t t1);
-  void updateSignalsStatView( TreeModel* sigModel,
-                              QTreeView* signalsTreeView);
-  void changeSignalsStatVisibility(bool visible);
-  void updateFlags(bool gvsocRun) {gvsocRunFlag = gvsocRun;};
-  // Flags
-  // gvsocRunFlag is set to True when gvsoc has started running
-  // gvsocRunFlag is set to False when a new gvsoc process is opened
-  bool gvsocRunFlag = false;
-  // Timeline Scroll Area
-  //QScrollArea* scrollArea;
-  AdjustingScrollArea* scrollArea;
-
-private:
-  void makeLegend();
-  void completeToolBar();
-  void gotoGivenTimestamp();
-  void createSignalsTree(QString signalsTreeFileName);
-
-  QMainWindow* mw;
-  QWidget *viewW; // widget for the signalsTreeView + verViewV
-  QVBoxLayout* mainLayout;
-  QToolBar* toolBar;
-  QVBoxLayout* legendLayout;
-  QHBoxLayout* viewLayout;
-  QGridLayout* tlLayout;
-  QTreeView* signalsTreeView;
-  QWidget* hLegendW ;
-  QWidget* bottomW; //bottom timeline view
-  QWidget* signalsW; // headerW + signalsTreeView
-  QVBoxLayout* bottomLayout;
-
-  // widget for the gview + scrollbar
-  QWidget* verViewW;
-  QVBoxLayout* verViewLayout;
-  TreeModel* model;
-  QLabel *treeLabel;
-  QLabel *statLabel;
-  QVBoxLayout* verTreeLayout;
-
-
-
-  // Timeline Graphic View
-  TLGView *gview = NULL;
-
-  // Statistics Table
-  StatModel *statModel = NULL;
-  QTableView *statTableView = NULL;
-  // Dock windows we need to communicate with
-  QDockWidget* functionsDock;
-  QPlainTextEdit* sourceCode;
-  QPlainTextEdit* asmCode;
-  StallChart* stallchart;
-
-  // For the tool Bar
-  QPushButton* zoomIn;
-  QPushButton* zoomOut;
-  QPushButton* viewAll;
-  QButtonGroup* buttonGroup;
-  //QRadioButton* radioB1;
-  //QRadioButton* radioB2;
-  QRadioButton* radioB3;
-  QLineEdit* timestamp;
-  QLineEdit* timeInterval;
-
-  QAction* zoomInAct;
-  QAction* zoomOutAct;
-  QAction* viewAllAct;
-
-  QLabel* timeLabel;
-  QLabel* intervalLabel;
-
-  QScrollBar* scrollbar;
-  QProgressBar* progressBar;
-
-  QList<QAction*> toBeRemoveAfterExec;
-
-signals:
-  void timeUpValueChanged();
-
-public  slots:
-
-  //void setGviewVScrollValue(int value1);
-  void setGviewVScrollValue();
-  void setGviewVScrollValue(int value1);
-  void refreshTimelineWidgets();
-
-
-private slots:
-
-};
-
-#endif // TIMELINE_HPP
diff --git a/tools/profiler/gui/include/treeitem.hpp b/tools/profiler/gui/include/treeitem.hpp
deleted file mode 100644
index e93467516..000000000
--- a/tools/profiler/gui/include/treeitem.hpp
+++ /dev/null
@@ -1,47 +0,0 @@
-/*
- * Copyright (C) 2020  GreenWaves Technologies, SAS
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU Affero General Public License as
- * published by the Free Software Foundation, either version 3 of the
- * License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU Affero General Public License for more details.
- *
- * You should have received a copy of the GNU Affero General Public License
- * along with this program.  If not, see <https://www.gnu.org/licenses/>.
- */
-
-#ifndef TREEITEM_H
-#define TREEITEM_H
-
-#include <QVariant>
-#include <QVector>
-
-//! [0]
-class TreeItem
-{
-public:
-    explicit TreeItem(const QVector<QVariant> &data, TreeItem *parentItem = nullptr);
-    ~TreeItem();
-
-    void appendChild(TreeItem *child);
-
-    TreeItem *child(int row);
-    int childCount() const;
-    int columnCount() const;
-    QVariant data(int column) const;
-    int row() const;
-    TreeItem *parentItem();
-
-private:
-    QVector<TreeItem*> m_childItems;
-    QVector<QVariant> m_itemData;
-    TreeItem *m_parentItem;
-};
-//! [0]
-
-#endif // TREEITEM_H
diff --git a/tools/profiler/gui/include/treemodel.hpp b/tools/profiler/gui/include/treemodel.hpp
deleted file mode 100644
index 1f8fee648..000000000
--- a/tools/profiler/gui/include/treemodel.hpp
+++ /dev/null
@@ -1,55 +0,0 @@
-/*
- * Copyright (C) 2020  GreenWaves Technologies, SAS
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU Affero General Public License as
- * published by the Free Software Foundation, either version 3 of the
- * License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU Affero General Public License for more details.
- *
- * You should have received a copy of the GNU Affero General Public License
- * along with this program.  If not, see <https://www.gnu.org/licenses/>.
- */
-
-
-#ifndef TREEMODEL_H
-#define TREEMODEL_H
-
-#include <QAbstractItemModel>
-#include <QModelIndex>
-#include <QVariant>
-
-class TreeItem;
-
-//! [0]
-class TreeModel : public QAbstractItemModel
-{
-    Q_OBJECT
-
-public:
-    explicit TreeModel(const QString &data, QObject *parent = nullptr);
-    TreeModel(const QString &data, const QString &path, QObject *parent);
-    ~TreeModel();
-
-    QVariant data(const QModelIndex &index, int role) const override;
-    Qt::ItemFlags flags(const QModelIndex &index) const override;
-    QVariant headerData(int section, Qt::Orientation orientation,
-                        int role = Qt::DisplayRole) const override;
-    QModelIndex index(int row, int column,
-                      const QModelIndex &parent = QModelIndex()) const override;
-    QModelIndex parent(const QModelIndex &index) const override;
-    int rowCount(const QModelIndex &parent = QModelIndex()) const override;
-    int columnCount(const QModelIndex &parent = QModelIndex()) const override;
-
-private:
-    void setupModelData(const QStringList &lines, TreeItem *parent);
-
-    TreeItem *rootItem;
-};
-//! [0]
-
-#endif // TREEMODEL_H
diff --git a/tools/profiler/gui/include/util.hpp b/tools/profiler/gui/include/util.hpp
deleted file mode 100644
index 2e6217392..000000000
--- a/tools/profiler/gui/include/util.hpp
+++ /dev/null
@@ -1,28 +0,0 @@
-/*
- * Copyright (C) 2020  GreenWaves Technologies, SAS
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU Affero General Public License as
- * published by the Free Software Foundation, either version 3 of the
- * License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU Affero General Public License for more details.
- *
- * You should have received a copy of the GNU Affero General Public License
- * along with this program.  If not, see <https://www.gnu.org/licenses/>.
- */
-
-#ifndef UTIL_HPP
-#define UTIL_HPP
-
-enum LegendMode {
-  TIME_MODE = 0,
-  FC_CYCLE_MODE,
-  CLUSTER_CYCLE_MODE,
-  N_LEGEND_MODE
-};
-
-#endif // UTIL_HPP
\ No newline at end of file
diff --git a/tools/profiler/gui/profiler.pro b/tools/profiler/gui/profiler.pro
deleted file mode 100644
index db60abd84..000000000
--- a/tools/profiler/gui/profiler.pro
+++ /dev/null
@@ -1,40 +0,0 @@
-BACKEND_BUILD_DIR = ../backend/build
-GAP_SDK_HOME = $$(GAP_SDK_HOME)
-
-QT       += core gui
-QT += charts
-
-greaterThan(QT_MAJOR_VERSION, 4): QT += widgets
-
-LIBS += -L$$BACKEND_BUILD_DIR -lprofiling
-LIBS += -L$$GAP_SDK_HOME/install/workstation/lib 
-LIBS += -lpulpvp-debug
-
-PRE_TARGETDEPS += $$BACKEND_BUILD_DIR/libprofiling.a
-
-TARGET = profiler
-TEMPLATE = app
-
-DESTDIR = build
-OBJECTS_DIR = build/obj
-MOC_DIR = build/moc
-UI_DIR = build/ui
-RCC_DIR = build/rcc
-
-SOURCES += $$files("src/*.cpp", true)
-HEADERS += $$files("include/*.hpp", true)
-INCLUDEPATH += "../backend/include/"
-FORMS += $$files("forms/*.ui", true)
-
-INCLUDEPATH += include
-INCLUDEPATH += $$GAP_SDK_HOME/install/workstation/include
-
-CONFIG += debug
-CONFIG += c++14 
-QMAKE_CXXFLAGS += -g -Wall -Wextra -Werror 
-
-
-RESOURCES = images/
-
-QT_DEBUG_PLUGINS = 1
-QML_IMPORT_TRACE = 1
diff --git a/tools/profiler/gui/src/dialog.cpp b/tools/profiler/gui/src/dialog.cpp
deleted file mode 100644
index 65063eb1e..000000000
--- a/tools/profiler/gui/src/dialog.cpp
+++ /dev/null
@@ -1,444 +0,0 @@
-/*
- * Copyright (C) 2020  GreenWaves Technologies, SAS
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU Affero General Public License as
- * published by the Free Software Foundation, either version 3 of the
- * License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU Affero General Public License for more details.
- *
- * You should have received a copy of the GNU Affero General Public License
- * along with this program.  If not, see <https://www.gnu.org/licenses/>.
- */
-
-#include <QtWidgets>
-#include <unistd.h>
-#include <iostream>
-
-#include "dialog.hpp"
-
-#define MESSAGE \
-    Dialog::tr("<p>Message boxes have a caption, a text, " \
-               "and any number of buttons, each with standard or custom texts." \
-               "<p>Click a button to close the message box. Pressing the Esc button " \
-               "will activate the detected escape button (if any).")
-#define MESSAGE_DETAILS \
-    Dialog::tr("If a message box has detailed text, the user can reveal it " \
-               "by pressing the Show Details... button.")
-
-
-class DialogOptionsWidget : public QGroupBox
-{
-public:
-    explicit DialogOptionsWidget(QWidget *parent = nullptr);
-
-    void addCheckBox(const QString &text, int value);
-    void addSpacer();
-    int value() const;
-
-private:
-    typedef QPair<QCheckBox *, int> CheckBoxEntry;
-    QVBoxLayout *layout;
-    QList<CheckBoxEntry> checkBoxEntries;
-};
-
-DialogOptionsWidget::DialogOptionsWidget(QWidget *parent) :
-    QGroupBox(parent) , layout(new QVBoxLayout)
-{
-    setTitle(Dialog::tr("Options"));
-    setLayout(layout);
-}
-
-void DialogOptionsWidget::addCheckBox(const QString &text, int value)
-{
-    QCheckBox *checkBox = new QCheckBox(text);
-    layout->addWidget(checkBox);
-    checkBoxEntries.append(CheckBoxEntry(checkBox, value));
-}
-
-void DialogOptionsWidget::addSpacer()
-{
-    layout->addItem(new QSpacerItem(0, 0, QSizePolicy::Ignored, QSizePolicy::MinimumExpanding));
-}
-
-int DialogOptionsWidget::value() const
-{
-    int result = 0;
-    for (const CheckBoxEntry &checkboxEntry : qAsConst(checkBoxEntries)) {
-        if (checkboxEntry.first->isChecked())
-            result |= checkboxEntry.second;
-    }
-    return result;
-}
-
-Dialog::Dialog(QWidget *parent)
-    : QWidget(parent)
-{
-    QVBoxLayout *verticalLayout;
-    if (QGuiApplication::styleHints()->showIsFullScreen() || QGuiApplication::styleHints()->showIsMaximized()) {
-        QHBoxLayout *horizontalLayout = new QHBoxLayout(this);
-        QGroupBox *groupBox = new QGroupBox(QGuiApplication::applicationDisplayName(), this);
-        horizontalLayout->addWidget(groupBox);
-        verticalLayout = new QVBoxLayout(groupBox);
-    } else {
-        verticalLayout = new QVBoxLayout(this);
-    }
-
-    QToolBox *toolbox = new QToolBox;
-    verticalLayout->addWidget(toolbox);
-
-    int frameStyle = QFrame::Sunken | QFrame::Panel;
-
-    /*integerLabel = new QLabel;
-    integerLabel->setFrameStyle(frameStyle);
-    QPushButton *integerButton =
-            new QPushButton(tr("QInputDialog::get&Int()"));
-
-    doubleLabel = new QLabel;
-    doubleLabel->setFrameStyle(frameStyle);
-    QPushButton *doubleButton =
-            new QPushButton(tr("QInputDialog::get&Double()"));
-
-    itemLabel = new QLabel;
-    itemLabel->setFrameStyle(frameStyle);
-    QPushButton *itemButton = new QPushButton(tr("QInputDialog::getIte&m()"));
-
-    textLabel = new QLabel;
-    textLabel->setFrameStyle(frameStyle);
-    QPushButton *textButton = new QPushButton(tr("QInputDialog::get&Text()"));
-
-    multiLineTextLabel = new QLabel;
-    multiLineTextLabel->setFrameStyle(frameStyle);
-    QPushButton *multiLineTextButton = new QPushButton(tr("QInputDialog::get&MultiLineText()"));
-
-    colorLabel = new QLabel;
-    colorLabel->setFrameStyle(frameStyle);
-    QPushButton *colorButton = new QPushButton(tr("QColorDialog::get&Color()"));
-
-    fontLabel = new QLabel;
-    fontLabel->setFrameStyle(frameStyle);
-    QPushButton *fontButton = new QPushButton(tr("QFontDialog::get&Font()"));
-*/
-    directoryLabel = new QLabel;
-    directoryLabel->setFrameStyle(frameStyle);
-    QPushButton *directoryButton =
-            new QPushButton(tr("QFileDialog::getE&xistingDirectory()"));
-
-    /*openFileNameLabel = new QLabel;
-    openFileNameLabel->setFrameStyle(frameStyle);
-    QPushButton *openFileNameButton =
-            new QPushButton(tr("QFileDialog::get&OpenFileName()"));
-
-    openFileNamesLabel = new QLabel;
-    openFileNamesLabel->setFrameStyle(frameStyle);
-    QPushButton *openFileNamesButton =
-            new QPushButton(tr("QFileDialog::&getOpenFileNames()"));
-    */
-
-    execFileNameLabel = new QLabel;
-    execFileNameLabel->setFrameStyle(frameStyle);
-    QPushButton *execFileNameButton =
-            new QPushButton(tr("QFileDialog::get&execFileName()"));
-
-    /*connect(integerButton, &QAbstractButton::clicked, this, &Dialog::setInteger);
-    connect(doubleButton, &QAbstractButton::clicked, this, &Dialog::setDouble);
-    connect(itemButton, &QAbstractButton::clicked, this, &Dialog::setItem);
-    connect(textButton, &QAbstractButton::clicked, this, &Dialog::setText);
-    connect(multiLineTextButton, &QAbstractButton::clicked, this, &Dialog::setMultiLineText);
-    connect(colorButton, &QAbstractButton::clicked, this, &Dialog::setColor);
-    connect(fontButton, &QAbstractButton::clicked, this, &Dialog::setFont);
-    */
-    connect(directoryButton, &QAbstractButton::clicked,
-            this, &Dialog::setExistingDirectory);
-    /*connect(openFileNameButton, &QAbstractButton::clicked,
-            this, &Dialog::setOpenFileName);
-    connect(openFileNamesButton, &QAbstractButton::clicked,
-            this, &Dialog::setOpenFileNames);
-            */
-    connect(execFileNameButton, &QAbstractButton::clicked,
-            this, &Dialog::setExecFileName);
-
-    /*layout->setColumnStretch(1, 1);
-    layout->setColumnMinimumWidth(1, 250);
-    layout->addWidget(integerButton, 0, 0);
-    layout->addWidget(integerLabel, 0, 1);
-    layout->addWidget(doubleButton, 1, 0);
-    layout->addWidget(doubleLabel, 1, 1);
-    layout->addWidget(itemButton, 2, 0);
-    layout->addWidget(itemLabel, 2, 1);
-    layout->addWidget(textButton, 3, 0);
-    layout->addWidget(textLabel, 3, 1);
-    layout->addWidget(multiLineTextButton, 4, 0);
-    layout->addWidget(multiLineTextLabel, 4, 1);
-    layout->addItem(new QSpacerItem(0, 0, QSizePolicy::Ignored, QSizePolicy::MinimumExpanding), 5, 0);
-    toolbox->addItem(page, tr("Input Dialogs"));
-    */
-    const QString doNotUseNativeDialog = tr("Do not use native dialog");
-    /*
-    page = new QWidget;
-    layout = new QGridLayout(page);
-    layout->setColumnStretch(1, 1);
-    layout->addWidget(colorButton, 0, 0);
-    layout->addWidget(colorLabel, 0, 1);
-    colorDialogOptionsWidget = new DialogOptionsWidget;
-    colorDialogOptionsWidget->addCheckBox(doNotUseNativeDialog, QColorDialog::DontUseNativeDialog);
-    colorDialogOptionsWidget->addCheckBox(tr("Show alpha channel") , QColorDialog::ShowAlphaChannel);
-    colorDialogOptionsWidget->addCheckBox(tr("No buttons") , QColorDialog::NoButtons);
-    layout->addItem(new QSpacerItem(0, 0, QSizePolicy::Ignored, QSizePolicy::MinimumExpanding), 1, 0);
-    layout->addWidget(colorDialogOptionsWidget, 2, 0, 1 ,2);
-
-    toolbox->addItem(page, tr("Color Dialog"));
-
-    page = new QWidget;
-    layout = new QGridLayout(page);
-    layout->setColumnStretch(1, 1);
-    layout->addWidget(fontButton, 0, 0);
-    layout->addWidget(fontLabel, 0, 1);
-    fontDialogOptionsWidget = new DialogOptionsWidget;
-    fontDialogOptionsWidget->addCheckBox(doNotUseNativeDialog, QFontDialog::DontUseNativeDialog);
-    fontDialogOptionsWidget->addCheckBox(tr("Show scalable fonts"), QFontDialog::ScalableFonts);
-    fontDialogOptionsWidget->addCheckBox(tr("Show non scalable fonts"), QFontDialog::NonScalableFonts);
-    fontDialogOptionsWidget->addCheckBox(tr("Show monospaced fonts"), QFontDialog::MonospacedFonts);
-    fontDialogOptionsWidget->addCheckBox(tr("Show proportional fonts"), QFontDialog::ProportionalFonts);
-    fontDialogOptionsWidget->addCheckBox(tr("No buttons") , QFontDialog::NoButtons);
-    layout->addItem(new QSpacerItem(0, 0, QSizePolicy::Ignored, QSizePolicy::MinimumExpanding), 1, 0);
-    layout->addWidget(fontDialogOptionsWidget, 2, 0, 1 ,2);
-    toolbox->addItem(page, tr("Font Dialog"));
-*/
-    QWidget* page = new QWidget;
-    QGridLayout* layout = new QGridLayout(page);
-    layout->setColumnStretch(1, 1);
-    layout->addWidget(directoryButton, 0, 0);
-    layout->addWidget(directoryLabel, 0, 1);
-    /*layout->addWidget(openFileNameButton, 1, 0);
-    layout->addWidget(openFileNameLabel, 1, 1);
-    layout->addWidget(openFileNamesButton, 2, 0);
-    layout->addWidget(openFileNamesLabel, 2, 1);
-    */
-    layout->addWidget(execFileNameButton, 1, 0);
-    layout->addWidget(execFileNameLabel, 1, 1);
-    layout->addItem(new QSpacerItem(0, 0, QSizePolicy::Ignored, QSizePolicy::MinimumExpanding), 4, 0);
-    toolbox->addItem(page, tr("File Dialogs"));
-
-    /*page = new QWidget;
-    layout = new QGridLayout(page);
-    layout->setColumnStretch(1, 1);
-    layout->addWidget(criticalButton, 0, 0);
-    layout->addWidget(criticalLabel, 0, 1);
-    layout->addWidget(informationButton, 1, 0);
-    layout->addWidget(informationLabel, 1, 1);
-    layout->addWidget(questionButton, 2, 0);
-    layout->addWidget(questionLabel, 2, 1);
-    layout->addWidget(warningButton, 3, 0);
-    layout->addWidget(warningLabel, 3, 1);
-    layout->addWidget(errorButton, 4, 0);
-    layout->addWidget(errorLabel, 4, 1);
-    layout->addItem(new QSpacerItem(0, 0, QSizePolicy::Ignored, QSizePolicy::MinimumExpanding), 5, 0);
-    toolbox->addItem(page, tr("Message Boxes"));
-    */
-    //setWindowTitle(QGuiApplication::applicationDisplayName());
-}
-
-/*void Dialog::setInteger()
-{
-//! [0]
-    bool ok;
-    int i = QInputDialog::getInt(this, tr("QInputDialog::getInt()"),
-                                 tr("Percentage:"), 25, 0, 100, 1, &ok);
-    if (ok)
-        integerLabel->setText(tr("%1%").arg(i));
-//! [0]
-}
-
-void Dialog::setDouble()
-{
-//! [1]
-    bool ok;
-    double d = QInputDialog::getDouble(this, tr("QInputDialog::getDouble()"),
-                                       tr("Amount:"), 37.56, -10000, 10000, 2, &ok);
-    if (ok)
-        doubleLabel->setText(QString("$%1").arg(d));
-//! [1]
-}
-
-void Dialog::setItem()
-{
-//! [2]
-    QStringList items;
-    items << tr("Spring") << tr("Summer") << tr("Fall") << tr("Winter");
-QFileDialog daliog(this);
-  dialog.setFileMode(QFileDialog::Directory);
-  dialog.setViewMode(QFileDialog::Detail);
-  QStringList fileNames;
-  if (dialog.exec())
-    fileNames = dialog.selectedFiles();("User name:"), QLineEdit::Normal,
-                                         QDir::home().dirName(), &ok);
-    if (ok && !text.isEmpty())
-        textLabel->setText(text);
-//! [3]
-}
-
-void Dialog::setMultiLineText()
-{
-//! [4]
-    bool ok;
-    QString text = QInputDialog::getMultiLineText(this, tr("QInputDialog::getMultiLineText()"),
-                                                  tr("Address:"), "John Doe\nFreedom Street", &ok);
-    if (ok && !text.isEmpty())
-        multiLineTextLabel->setText(text);
-//! [4]
-}
-
-void Dialog::setColor()
-{
-    const QColorDialog::ColorDialogOptions options = QFlag(colorDialogOptionsWidget->value());
-    const QColor color = QColorDialog::getColor(Qt::green, this, "Select Color", options);
-
-    if (color.isValid()) {
-        colorLabel->setText(color.name());
-        colorLabel->setPalette(QPalette(color));
-        colorLabel->setAutoFillBackground(true);
-    }
-}
-
-void Dialog::setFont()
-{
-    const QFontDialog::FontDialogOptions options = QFlag(fontDialogOptionsWidget->value());
-    bool ok;
-    QFont font = QFontDialog::getFont(&ok, QFont(fontLabel->text()), this, "Select Font", options);
-    if (ok) {
-        fontLabel->setText(font.key());
-        fontLabel->setFont(font);
-    }
-}
-*/
-void Dialog::setExistingDirectory()
-{
-    std::cout << "**** setExistingDirectory *****" << std::endl;
-    QFileDialog::Options options = QFlag(fileDialogOptionsWidget->value());
-    options |= QFileDialog::DontResolveSymlinks | QFileDialog::ShowDirsOnly;
-    std::cout << "**** setExistingDirectory 1 *****" << std::endl;
-    QString directory = QFileDialog::getExistingDirectory(this,
-                                tr("QFileDialog::getExistingDirectory()"),
-                                directoryLabel->text(),
-                                options);
-    std::cout << "**** setExistingDirectory 2 *****" << std::endl;
-    if (!directory.isEmpty())
-        directoryLabel->setText(directory);
-    std::cout << "**** setExistingDirectory 3 *****" << std::endl;
-}
-/*
-void Dialog::setOpenFileName()
-{
-    const QFileDialog::Options options = QFlag(fileDialogOptionsWidget->value());
-    QString selectedFilter;
-    QString fileName = QFileDialog::getOpenFileName(this,
-                                tr("QFileDialog::getOpenFileName()"),
-                                openFileNameLabel->text(),
-                                tr("All Files (*);;Text Files (*.txt)"),
-                                &selectedFilter,
-                                options);
-    if (!fileName.isEmpty())
-        openFileNameLabel->setText(fileName);
-}
-
-void Dialog::setOpenFileNames()
-{
-    const QFileDialog::Options options = QFlag(fileDialogOptionsWidget->value());
-    QString selectedFilter;
-    QStringList files = QFileDialog::getOpenFileNames(
-                                this, tr("QFileDialog::getOpenFileNames()"),
-                                openFilesPath,
-                                tr("All Files (*);;Text Files (*.txt)"),
-                                &selectedFilter,
-                                options);
-    if (files.count()) {
-        openFilesPath = files[0];
-        openFileNamesLabel->setText(QString("[%1]").arg(files.join(", ")));
-    }
-}
-*/
-
-void Dialog::setExecFileName()
-{
-    const QFileDialog::Options options = QFlag(fileDialogOptionsWidget->value());
-    QString selectedFilter;
-    QString fileName = QFileDialog::getOpenFileName(this,
-                                tr("QFileDialog::getExecFileName()"),
-                                execFileNameLabel->text(),
-                                tr("All Files (*)"),
-                                &selectedFilter,
-                                options);
-    if (!fileName.isEmpty())
-        execFileNameLabel->setText(fileName);
-}
-/*
-void Dialog::criticalMessage()
-{
-    QMessageBox::StandardButton reply;
-    reply = QMessageBox::critical(this, tr("QMessageBox::critical()"),
-                                    MESSAGE,
-                                    QMessageBox::Abort | QMessageBox::Retry | QMessageBox::Ignore);
-    if (reply == QMessageBox::Abort)
-        criticalLabel->setText(tr("Abort"));
-    else if (reply == QMessageBox::Retry)
-        criticalLabel->setText(tr("Retry"));
-    else
-        criticalLabel->setText(tr("Ignore"));
-}
-
-void Dialog::informationMessage()
-{
-    QMessageBox::StandardButton reply;
-    reply = QMessageBox::information(this, tr("QMessageBox::information()"), MESSAGE);
-    if (reply == QMessageBox::Ok)
-        informationLabel->setText(tr("OK"));
-    else
-        informationLabel->setText(tr("Escape"));
-}
-
-void Dialog::questionMessage()
-{
-    QMessageBox::StandardButton reply;
-    reply = QMessageBox::question(this, tr("QMessageBox::question()"),
-                                    MESSAGE,
-                                    QMessageBox::Yes | QMessageBox::No | QMessageBox::Cancel);
-    if (reply == QMessageBox::Yes)
-        questionLabel->setText(tr("Yes"));
-    else if (reply == QMessageBox::No)
-        questionLabel->setText(tr("No"));
-    else
-        questionLabel->setText(tr("Cancel"));
-}
-
-void Dialog::warningMessage()
-{
-    QMessageBox msgBox(QMessageBox::Warning, tr("QMessageBox::warning()"),
-                       MESSAGE, nullptr, this);
-    msgBox.setDetailedText(MESSAGE_DETAILS);
-    msgBox.addButton(tr("Save &Again"), QMessageBox::AcceptRole);
-    msgBox.addButton(tr("&Continue"), QMessageBox::RejectRole);
-    if (msgBox.exec() == QMessageBox::AcceptRole)
-        warningLabel->setText(tr("Save Again"));
-    else
-        warningLabel->setText(tr("Continue"));
-
-}
-
-void Dialog::errorMessage()
-{
-    errorMessageDialog->showMessage(
-            tr("This dialog shows and remembers error messages. "
-               "If the checkbox is checked (as it is by default), "
-               "the shown message will be shown again, "
-               "but if the user unchecks the box the message "
-               "will not appear again if QErrorMessage::showMessage() "
-               "is called with the same message."));
-    errorLabel->setText(tr("If the box is unchecked, the message "
-                           "won't appear again."));
-}
-*/
diff --git a/tools/profiler/gui/src/execoverview.cpp b/tools/profiler/gui/src/execoverview.cpp
deleted file mode 100644
index 9befa5f8c..000000000
--- a/tools/profiler/gui/src/execoverview.cpp
+++ /dev/null
@@ -1,107 +0,0 @@
-/*
- * Copyright (C) 2020  GreenWaves Technologies, SAS
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU Affero General Public License as
- * published by the Free Software Foundation, either version 3 of the
- * License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU Affero General Public License for more details.
- *
- * You should have received a copy of the GNU Affero General Public License
- * along with this program.  If not, see <https://www.gnu.org/licenses/>.
- */
-
-#include <iostream>
-#include "execoverview.hpp"
-
-#include "backend_interface.hpp"
-#include <QBarCategoryAxis>
-#include <QDebug>
-#include <QColor>
-#include <random>
-
-
-QT_CHARTS_USE_NAMESPACE
-
-ExecOverview::ExecOverview(QWidget* parent){
-  this->setParent(parent);
-  chart = new QChart();
-  chart->setTheme(QChart::ChartThemeBrownSand);
-  chart->setTitleFont(QFont("Arial", 22));
-  chart->setTitle("Total number of cycles per PE");
-  QChartView *chartView = new QChartView(chart);
-  chartView->setRenderHint(QPainter::Antialiasing);
-
-  QHBoxLayout* layout = new QHBoxLayout;
-  layout->addWidget(chartView);
-  setLayout(layout);
-
-  timer = new QTimer(this);
-  connect(timer, SIGNAL(timeout()), this, SLOT(refresh()));
-  timer->start(2000);
-  refresh();
-}
-
-ExecOverview::~ExecOverview(){
-  delete timer;
-}
-
-void ExecOverview::refresh(){
-  QColor colors[8] = {QColor(196,216,46), QColor(119,187,0),
-                  QColor(17,158,0), QColor(17,119,0), QColor(17,102,0),
-                  QColor(0,85,0),QColor(0,51,0), QColor(0,17,0)};
-
-  chart->removeAllSeries();
-  auto v = get_cycles_per_pe();
-  series = new QBarSeries();
-
-  float vmin=0.0;
-  float vmax=0.0;
-
-
-  for (uint i = 0; i < v.size(); i++){
-    if (i==8)
-      break;
-    QBarSet* set = new QBarSet("nothing");
-
-    set->setLabel(QString("PE")+ QString::number(i));
-
-    set->setColor(colors[i]);
-    *set << v[i];
-    if (i==0) {
-      vmin=v[i];
-      vmax=v[i];
-    } else {
-      if (v[i] < vmin){
-        vmin=v[i];
-      }
-      if (v[i] > vmax){
-        vmax=v[i];
-      }
-    }
-    series->append(set);
-  }
-
-
-  chart->addSeries(series);
-  //chart->setAnimationOptions(QChart::SeriesAnimations);
-
-  chart->legend()->setVisible(true);
-  chart->legend()->setAlignment(Qt::AlignBottom);
-
-  QList<QAbstractAxis*> axisy_list = chart->axes(Qt::Vertical);
-
-  if (axisy_list.isEmpty()) {
-    axisy_list.append(new QValueAxis);
-  }
-
-  QValueAxis* axisY= static_cast<QValueAxis*>(axisy_list.back());
-
-  axisY->setRange(vmin, vmax);
-  axisY->setTickCount(5);
-  axisY->setLabelFormat("%'\''d \n");
-}
diff --git a/tools/profiler/gui/src/functiondetails.cpp b/tools/profiler/gui/src/functiondetails.cpp
deleted file mode 100644
index 5f052aaaa..000000000
--- a/tools/profiler/gui/src/functiondetails.cpp
+++ /dev/null
@@ -1,204 +0,0 @@
-/*
- * Copyright (C) 2020  GreenWaves Technologies, SAS
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU Affero General Public License as
- * published by the Free Software Foundation, either version 3 of the
- * License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU Affero General Public License for more details.
- *
- * You should have received a copy of the GNU Affero General Public License
- * along with this program.  If not, see <https://www.gnu.org/licenses/>.
- */
-
-
-#include <iostream>
-#include <QTableWidgetItem>
-#include <QAbstractItemView>
-#include <QTimer>
-#include <QHeaderView>
-#include <QSizePolicy>
-#include <string>
-#include <fstream>
-
-#include "backend_interface.hpp"
-#include "functiondetails.hpp"
-
-
-const char* FunctionDetails::columnHeader[] = {
-  "Function name",
-  "Total time or cycles",
-  "calls",
-  "From file"
-};
-const int FunctionDetails::nbColumn = \
-    sizeof(FunctionDetails::columnHeader) / sizeof(FunctionDetails::columnHeader[0]);
-
-
-FunctionDetails::FunctionDetails(
-  QWidget* parent,
-  QPlainTextEdit* sourceCode,
-  QPlainTextEdit* asmCode,
-  StallChart* stallChart) :
-  stallChart(stallChart),
-  sourceCode(sourceCode),
-  asmCode(asmCode)
-{
-  this->setParent(parent);
-  fillTable();
-  sourceCode->setWordWrapMode(QTextOption::NoWrap);
-  asmCode->setWordWrapMode(QTextOption::NoWrap);
-  sourceCode->setReadOnly(true);
-  asmCode->setReadOnly(true);
-
-  cppHighlighter = new Highlighter(sourceCode->document());
-  asmHighlighter = new Highlighter(asmCode->document());
-
-  layout = new QHBoxLayout;
-  layout->addWidget(table);
-  setLayout(layout);
-  timer = new QTimer(this);
-  connect(timer, SIGNAL(timeout()), this, SLOT(fillTable()));
-  timer->start(2000);
-}
-
-void FunctionDetails::switchLegendMode(LegendMode newMode)
-{
-    this->currentMode = newMode;
-    this->updateTimeStamps();
-}
-
-void FunctionDetails::updateTimeStamps(void)
-{
-  // changes from former representation
-  auto data = get_function_table();
-  int T = 1;
-  switch(this->currentMode)
-  {
-      case FC_CYCLE_MODE:
-          T = get_fc_period();
-          break;
-      case CLUSTER_CYCLE_MODE:
-          T = get_cluster_period();
-          break;
-      default:
-          T = 1;
-          break;
-  }
-
-  assert(T > 0);
-
-  for (uint i = 0; i < data.size(); i++){
-    QString time_string = formatTimeStamp(QString::number(data[i].tot_time/T));
-    QTableWidgetItem* item = new QTableWidgetItem(tr("%1").arg(time_string));
-    item->setTextAlignment(Qt::AlignRight);
-    table->setItem(i, 1, item);
-  }
-  table->update();
-}
-
-QString FunctionDetails::formatTimeStamp(QString ts) const {
-
-  const int step = 3;
-  const char mychar = ',';
-  for (int i = ts.length()-step; i>0; i=i-step)
-      ts.insert(i, mychar);
-
-  return ts;
-}
-
-void FunctionDetails::fillTable(){
-  if (table == nullptr) table = new QTableWidget();
-  QHeaderView* header = table->horizontalHeader();
-
-  auto data = get_function_table();
-  table->sizePolicy().setHorizontalPolicy(QSizePolicy::Ignored);
-  table->setRowCount(data.size());
-  table->setEditTriggers(QAbstractItemView::NoEditTriggers);
-
-  table->setColumnCount(nbColumn);
-  for (int i = 0; i < nbColumn; i++){
-    table->setHorizontalHeaderItem(i, new QTableWidgetItem(columnHeader[i]));
-  }
-
-  connect(table, SIGNAL(itemClicked(QTableWidgetItem*)), this,
-                                      SLOT(selectFunction(QTableWidgetItem*)));
-  connect(table->verticalHeader(), SIGNAL(sectionClicked(int)), this,
-                                      SLOT(selectFunction(int)));
-  textVersion = "";
-  for (uint i = 0; i < data.size(); i++){
-    QTableWidgetItem* it =  new QTableWidgetItem(QString(data[i].name.c_str()));
-    table->setItem(i, 0, it);
-    QString time_string= formatTimeStamp(QString::number(data[i].tot_time));
-    //table->setItem(i, 1, new QTableWidgetItem(tr("%1").arg(time_string)));
-    QTableWidgetItem* item = new QTableWidgetItem(tr("%1").arg(data[i].n_calls));
-    item->setTextAlignment(Qt::AlignRight);
-    table->setItem(i, 2, item);
-    table->setItem(i, 3, new QTableWidgetItem(QString::fromUtf8(data[i].file.c_str())));
-    textVersion += data[i].name + "\t" + time_string.toStdString() + "\t"\
-                   + std::to_string(data[i].n_calls) + "\t" + data[i].file + "\n";
-  }
-
-  this->updateTimeStamps();
-  header->resizeSections(QHeaderView::Stretch);
-  header->setSectionResizeMode(1,QHeaderView::Stretch);
-  exportTableToTextFile("function_statistics.txt");
-}
-
-void FunctionDetails::exportTableToTextFile(const char* filename) const {
-  std::ofstream out(filename, std::ofstream::trunc);
-  if (out.is_open()){
-    out << textVersion;
-    out.close();
-  }
-  else {
-    std::cout << "[-] Error: Impossible to export function table to file " << filename;
-    std::cout << std::endl;
-  }
-}
-
-void FunctionDetails::selectFunction(int row){
-  QTableWidgetItem* firstColItem = table->item(row, 0);
-  if (firstColItem == nullptr){
-    std::cout << "[-] Error: impossible to get function name from selection in table" << std::endl;
-    return;
-  }
-  selectFunction(firstColItem->text().toStdString());
-}
-
-void FunctionDetails::selectFunction(QTableWidgetItem* it){
-  selectFunction(it->row());
-}
-
-void FunctionDetails::selectFunction(std::string fname){
-  if (fname == selectedFunction) return;
-  selectedFunction = fname;
-  setSourceCode(fname);
-  asmCode->setPlainText(QString::fromUtf8(get_asm_code(fname).c_str()));
-  stallChart->construct(fname);
-}
-
-void FunctionDetails::setSourceCode(std::string fname){
-  sourceCode->setPlainText(QString::fromUtf8(get_source_code(fname).c_str()));
-  int line = function2line_number(fname);
-  if (line > 0){
-    sourceCode->moveCursor(QTextCursor::End);
-    sourceCode->setTextCursor(QTextCursor(
-      sourceCode->document()->findBlockByLineNumber(
-        std::max(0, line - 5))));
-  }
-}
-
-void FunctionDetails::selectRow(const char* name){
-  if (table == nullptr) return;
-  for (int i = 0; i < table->rowCount(); i++){
-    if (table->item(i, 0)->text() == QString(name)){
-      table->selectRow(i);
-      return;
-    }
-  }
-}
diff --git a/tools/profiler/gui/src/highlighter.cpp b/tools/profiler/gui/src/highlighter.cpp
deleted file mode 100644
index 6425e7b27..000000000
--- a/tools/profiler/gui/src/highlighter.cpp
+++ /dev/null
@@ -1,117 +0,0 @@
-/*
- * Copyright (C) 2020  GreenWaves Technologies, SAS
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU Affero General Public License as
- * published by the Free Software Foundation, either version 3 of the
- * License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU Affero General Public License for more details.
- *
- * You should have received a copy of the GNU Affero General Public License
- * along with this program.  If not, see <https://www.gnu.org/licenses/>.
- */
-
-#include "highlighter.hpp"
-
-Highlighter::Highlighter(QTextDocument *parent)
-: QSyntaxHighlighter(parent)
-{
-  HighlightingRule rule;
-
-  keywordFormat.setForeground(Qt::darkBlue);
-  keywordFormat.setFontWeight(QFont::Bold);
-  const QString keywordPatterns[] = {
-    QStringLiteral("\\bchar\\b"), QStringLiteral("\\bclass\\b"), QStringLiteral("\\bconst\\b"),
-    QStringLiteral("\\bdouble\\b"), QStringLiteral("\\benum\\b"), QStringLiteral("\\bexplicit\\b"),
-    QStringLiteral("\\bfriend\\b"), QStringLiteral("\\binline\\b"), QStringLiteral("\\bint\\b"),
-    QStringLiteral("\\blong\\b"), QStringLiteral("\\bnamespace\\b"), QStringLiteral("\\boperator\\b"),
-    QStringLiteral("\\bprivate\\b"), QStringLiteral("\\bprotected\\b"), QStringLiteral("\\bpublic\\b"),
-    QStringLiteral("\\bshort\\b"), QStringLiteral("\\bsignals\\b"), QStringLiteral("\\bsigned\\b"),
-    QStringLiteral("\\bslots\\b"), QStringLiteral("\\bstatic\\b"), QStringLiteral("\\bstruct\\b"),
-    QStringLiteral("\\btemplate\\b"), QStringLiteral("\\btypedef\\b"), QStringLiteral("\\btypename\\b"),
-    QStringLiteral("\\bunion\\b"), QStringLiteral("\\bunsigned\\b"), QStringLiteral("\\bvirtual\\b"),
-    QStringLiteral("\\bvoid\\b"), QStringLiteral("\\bvolatile\\b"), QStringLiteral("\\bbool\\b"),
-    QStringLiteral("\\bif\\b"), QStringLiteral("\\belse\\b"),  QStringLiteral("\\bfor\\b"),
-    QStringLiteral("\\bwhile\\b"), QStringLiteral("\\bdo\\b"), QStringLiteral("\\bswitch\\b"),
-    QStringLiteral("\\bcase\\b")
-  };
-  for (const QString &pattern : keywordPatterns) {
-    rule.pattern = QRegularExpression(pattern);
-    rule.format = keywordFormat;
-    highlightingRules.append(rule);
-  }
-
-  operatorPattern.setForeground(Qt::blue);
-  rule.pattern = QRegularExpression(QStringLiteral("[&\\+\\-\\*/=]"));
-  rule.format = operatorPattern;
-  highlightingRules.append(rule);
-
-  classFormat.setFontWeight(QFont::Bold);
-  classFormat.setForeground(Qt::darkMagenta);
-  rule.pattern = QRegularExpression(QStringLiteral("\\bQ[A-Za-z]+\\b"));
-  rule.format = classFormat;
-  highlightingRules.append(rule);
-
-  quotationFormat.setForeground(Qt::darkGreen);
-  rule.pattern = QRegularExpression(QStringLiteral("\".*\""));
-  rule.format = quotationFormat;
-  highlightingRules.append(rule);
-
-  functionFormat.setFontItalic(true);
-  functionFormat.setForeground(Qt::blue);
-  rule.pattern = QRegularExpression(QStringLiteral("\\b[A-Za-z0-9_]+(?=\\()"));
-  rule.format = functionFormat;
-  highlightingRules.append(rule);
-
-  singleLineCommentFormat.setForeground(Qt::red);
-  rule.pattern = QRegularExpression(QStringLiteral("//[^\n]*"));
-  rule.format = singleLineCommentFormat;
-  highlightingRules.append(rule);
-
-  multiLineCommentFormat.setForeground(Qt::red);
-
-  commentStartExpression = QRegularExpression(QStringLiteral("/\\*"));
-  commentEndExpression = QRegularExpression(QStringLiteral("\\*/"));
-
-  /* rules for ASM */
-  pcFormat.setForeground(Qt::darkGray);
-  rule.pattern = QRegularExpression(QStringLiteral("0x[0-9a-f]{8} <[A-Za-z0-9_]*\\+[0-9]*>"));
-  rule.format = pcFormat;
-  highlightingRules.append(rule);
-}
-
-void Highlighter::highlightBlock(const QString &text)
-{
-  for (const HighlightingRule &rule : highlightingRules) {
-    QRegularExpressionMatchIterator matchIterator = rule.pattern.globalMatch(text);
-    while (matchIterator.hasNext()) {
-      QRegularExpressionMatch match = matchIterator.next();
-      setFormat(match.capturedStart(), match.capturedLength(), rule.format);
-    }
-  }
-
-  setCurrentBlockState(0);
-
-  int startIndex = 0;
-  if (previousBlockState() != 1)
-  startIndex = text.indexOf(commentStartExpression);
-
-  while (startIndex >= 0) {
-    QRegularExpressionMatch match = commentEndExpression.match(text, startIndex);
-    int endIndex = match.capturedStart();
-    int commentLength = 0;
-    if (endIndex == -1) {
-      setCurrentBlockState(1);
-      commentLength = text.length() - startIndex;
-    } else {
-      commentLength = endIndex - startIndex
-      + match.capturedLength();
-    }
-    setFormat(startIndex, commentLength, multiLineCommentFormat);
-    startIndex = text.indexOf(commentStartExpression, startIndex + commentLength);
-  }
-}
diff --git a/tools/profiler/gui/src/main.cpp b/tools/profiler/gui/src/main.cpp
deleted file mode 100644
index 711da5b09..000000000
--- a/tools/profiler/gui/src/main.cpp
+++ /dev/null
@@ -1,117 +0,0 @@
-/*
- * Copyright (C) 2020  GreenWaves Technologies, SAS
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU Affero General Public License as
- * published by the Free Software Foundation, either version 3 of the
- * License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU Affero General Public License for more details.
- *
- * You should have received a copy of the GNU Affero General Public License
- * along with this program.  If not, see <https://www.gnu.org/licenses/>.
- */
-
-
-#include <QApplication>
-#include <QCommandLineParser>
-#include <QtGlobal>
-
-#include <iostream>
-#include <fstream>
-#include <stdio.h>
-#include <execinfo.h>
-#include <signal.h>
-#include <stdlib.h>
-using namespace std;
-#include <unistd.h>
-#include <stdio.h>
-#include <QtDebug>
-
-#include "mainwindow.hpp"
-#include "backend_interface.hpp"
-
-void myMessageOutput(QtMsgType type, const QMessageLogContext &context, const QString &msg)
-{
-    QByteArray localMsg = msg.toLocal8Bit();
-    switch (type) {
-    case QtDebugMsg:
-        //fprintf(stderr, "Debug: %s (%s:%u, %s)\n", localMsg.constData(), context.file, context.line, context.function);
-        break;
-    case QtInfoMsg:
-        fprintf(stderr, "Info: %s (%s:%u, %s)\n", localMsg.constData(), context.file, context.line, context.function);
-        break;
-    case QtWarningMsg:
-        fprintf(stderr, "Warning: %s (%s:%u, %s)\n", localMsg.constData(), context.file, context.line, context.function);
-        break;
-    case QtCriticalMsg:
-        fprintf(stderr, "Critical: %s (%s:%u, %s)\n", localMsg.constData(), context.file, context.line, context.function);
-        break;
-    case QtFatalMsg:
-        fprintf(stderr, "Fatal: %s (%s:%u, %s)\n", localMsg.constData(), context.file, context.line, context.function);
-        abort();
-    }
-}
-
-int main(int argc, char *argv[])
-{
-    qInstallMessageHandler(myMessageOutput); // Install the handler
-
-    qDebug() << "[-] Entering Profiler Main Application";
-    QApplication app(argc, argv);
-
-    QCommandLineParser parser;
-    parser.setApplicationDescription(
-            "\nProfiler is a part of GWT GAP SDK and used with GVSOC, GWT Full "
-            "System SoC Simulator.\n"
-            "It gives a visual view of what is happening inside the chip "
-            "and allows to control the simulator through a graphic interface.\n"
-            "It is extremely useful for developing and debugging applications "
-            "on GAP processors.");
-    parser.addHelpOption();
-    parser.addPositionalArgument("directory", "Program directory");
-    parser.addPositionalArgument("executable", "Path of the program to profile");
-    parser.addPositionalArgument("config", "GVSoC configuration file");
-
-    QCommandLineOption signalTreeOption("signal-tree-file",
-            "Path to the signal tree configuration file, default is profiler/gui/src/images/signalstree.txt",
-            "file");
-    parser.addOption(signalTreeOption);
-
-    parser.process(app);
-
-    const QStringList args = parser.positionalArguments();
-    if (args.isEmpty() || args.size() < 3)
-    {
-        parser.showHelp();
-        return 0;
-    }
-
-    QString signalsTreeFileName(":/images/signalstree.txt");
-    if (parser.isSet(signalTreeOption))
-    {
-        signalsTreeFileName = parser.value(signalTreeOption);
-        qDebug() << "[-] Using custom signal tree " << signalsTreeFileName;
-    }
-
-    std::string exampleDirectory = args.at(0).toUtf8().constData();
-    std::string executablePath = args.at(1).toUtf8().constData();
-    std::string gvsocConfigurationFile = args.at(2).toUtf8().constData();
-
-    MainWindow win(
-            exampleDirectory,
-            executablePath,
-            gvsocConfigurationFile,
-            signalsTreeFileName
-            );
-
-    win.show();
-    int result = app.exec();
-    qDebug() << "[-] Ending Profiler Main Application";
-
-    return result;
-}
-
diff --git a/tools/profiler/gui/src/mainwindow.cpp b/tools/profiler/gui/src/mainwindow.cpp
deleted file mode 100644
index d73fa61fc..000000000
--- a/tools/profiler/gui/src/mainwindow.cpp
+++ /dev/null
@@ -1,497 +0,0 @@
-/*
- * Copyright (C) 2020  GreenWaves Technologies, SAS
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU Affero General Public License as
- * published by the Free Software Foundation, either version 3 of the
- * License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU Affero General Public License for more details.
- *
- * You should have received a copy of the GNU Affero General Public License
- * along with this program.  If not, see <https://www.gnu.org/licenses/>.
- */
-
-#include <fcntl.h>
-#include <sys/stat.h>
-#include <sys/types.h>
-#include <unistd.h>
-
-#include <iostream>
-#include <QDockWidget>
-#include <QPalette>
-#include <QColor>
-#include <QDebug>
-#include <QDesktopWidget>
-#include <QMessageBox>
-#include "mainwindow.hpp"
-
-#include "functiondetails.hpp"
-#include "backend_interface.hpp"
-#include "stallchart.hpp"
-#include "execoverview.hpp"
-#include <QAction>
-#include <QtDebug>
-#include <assert.h>
-
-
-void MainWindow::makeToolBar(){
-
-  toolBar = this->addToolBar(tr("File"));
-
-  // Add the GvSoc command buttons
-  QString closeIconFile = ":/images/Stop-red-icon.png";
-  QString runIconFile = ":/images/Start-icon.png";
-  QString pauseIconFile = ":/images/pause-icon.png";
-
-  runB = new QPushButton(QIcon(runIconFile),"");
-  pauseB = new QPushButton(QIcon(pauseIconFile),"");
-  closeB = new QPushButton(QIcon(closeIconFile),"");
-
-  runB->setToolTip(QString("Run Gvsoc"));
-  pauseB->setToolTip(QString("Pause Gvsoc"));
-  closeB->setToolTip(QString("Close Gvsoc"));
-
-
-  runB->update();
-  pauseB->update();
-  closeB->update();
-
-
-  connect(runB, SIGNAL (released()), this, SLOT (handleRunB()));
-  connect(pauseB, SIGNAL (released()), this, SLOT (handlePauseB()));
-  connect(closeB, SIGNAL (released()), this, SLOT (handleCloseB()));
-
-  //toolBar->addWidget(openB);
-  toolBar->addWidget(runB);
-  toolBar->addWidget(pauseB);
-  toolBar->addWidget(closeB);
-
-  update();
-}
-
-
-void MainWindow::switchCoresSig(bool state){
-  switchSignalsGroup(coresSig, state);
-}
-
-void MainWindow::switchDebugSig(bool state){
-  switchSignalsGroup(debugSymbolsSig,state);
-}
-
-void MainWindow::switchDmaSig(bool state){
-  switchSignalsGroup(dmaSig,state);
-}
-
-void MainWindow::switchStallsSig(bool state){
-  switchSignalsGroup(stallsSig, state);
-}
-
-
-void MainWindow::switchSignalsGroup(std::vector<std::string> &signalsTable, bool state){
-  qDebug() << "[-] Switching Signals";
-  // Switch ON  specific traces event in Gvsoc
-  for ( auto& signalName : signalsTable){
-    if (state){
-      qDebug() << " Signal " <<  signalName.c_str() << " ON";
-      gvsoc->add_event_regex(signalName);
-    }
-    else
-    {
-      qDebug() << " Signal " <<  signalName.c_str() << " OFF";
-      gvsoc->remove_event_regex(signalName);
-    }
-  }
-  qDebug() << "[-] Switching Signals Done";
-}
-
-void MainWindow::initGvsoc() {
-
-  gvsoc->add_event_regex("/sys/board/chip/cluster_clock/cycles");
-  qDebug() << "[-] Initialising Gvsoc Settings";
-  if (firstGvsocInit) {
-    // Set default settings with cores and debug signals ON
-    //qDebug() << "[-] Default Signals Init";
-    // Always switch on mandatory signals
-    //switchSignalsGroup(gvsocSignals,true);
-    switchSignalsGroup(coresSig,true);
-    switchSignalsGroup(debugSymbolsSig,true);
-    //switchSignalsGroup(dmaSig,true);
-    //switchSignalsGroup(stallsSig,true);
-    coresBox->setCheckState(Qt::Checked);
-    debugBox->setCheckState(Qt::Checked);
-    dmaBox->setCheckState(Qt::Checked);
-    stallsBox->setCheckState(Qt::Checked);
-    firstGvsocInit = false;
-    qDebug() << "[-] END Default Signals Init";
-  }
-  else {
-    // Keep previous user settings
-      qDebug() << "[-] Keep User Signals Init";
-      switchSignalsGroup(coresSig,coresBox->isChecked());
-      switchSignalsGroup(debugSymbolsSig,debugBox->isChecked());
-      switchSignalsGroup(dmaSig,dmaBox->isChecked());
-      switchSignalsGroup(stallsSig,stallsBox->isChecked());
-      qDebug() << "[-] End Keep User Signals Init";
-  }
-
-  qDebug() << "[-] Gvsoc initialised" ;
-}
-
-//void MainWindow::handleParametersB(){
-//  dialog.show();
-//}
-
-void MainWindow::closeWindows(){
-
-  // Delete Dock windows
-  assert(timeline);
-  assert(overviewDock);
-  assert(sourceCodeDock);
-  assert(asmCodeDock);
-  assert(stallChartDock);
-  assert(functionsDock);
-  delete timeline;
-  delete overviewDock;
-  delete sourceCodeDock;
-  delete asmCodeDock;
-  delete stallChartDock;
-  delete functionsDock;
-  timeline=NULL;
-  overviewDock= NULL;
-  sourceCodeDock = NULL;
-  asmCodeDock=NULL;
-  stallChartDock = NULL;
-  functionsDock = NULL;
-  dockWindowsCreated=false;
-}
-
-
-void MainWindow::changeColor(QPushButton* button, QColor color)
-{
-  QPalette pal = button->palette();
-  pal.setColor(QPalette::Button, QColor(color));
-  button->setAutoFillBackground(true);
-  button->setPalette(pal);
-  button->update();
-}
-
-
-QDialog* MainWindow::createNonExclusiveGroup(){
-
-  // Create non exclusive check boxes
-  coresBox = new QCheckBox(tr("&Cores"));
-  debugBox = new QCheckBox(tr("&Debug Symbols"));
-  dmaBox = new QCheckBox(tr("&DMAs"));
-  stallsBox = new QCheckBox(tr("&Stalls"));
-  statisticsBox = new QCheckBox(tr("&Statistics"));
-  cachesBox = new QCheckBox(tr("&Caches"));
-  powerBox = new QCheckBox(tr("&Power"));
-
-  // Initialize Cores box to state ON
-  coresBox->setChecked(true);
-  // Initialize all other boxes to state OFF
-  debugBox->setChecked(false);
-  dmaBox->setChecked(false);
-  stallsBox->setChecked(false);
-  statisticsBox->setChecked(false);
-  cachesBox->setChecked(false);
-  powerBox->setChecked(false);
-
-  // make 3 last checkboxes non usable
-  statisticsBox->setAttribute(Qt::WA_TransparentForMouseEvents);
-  statisticsBox->setFocusPolicy(Qt::NoFocus);
-  cachesBox->setAttribute(Qt::WA_TransparentForMouseEvents);
-  cachesBox->setFocusPolicy(Qt::NoFocus);
-  powerBox->setAttribute(Qt::WA_TransparentForMouseEvents);
-  powerBox->setFocusPolicy(Qt::NoFocus);
-
-  // Create QDialogButtonBox
-  signalsDialog = new QDialog();
-
-  // Add all button boxes in a vertical layout
-  QVBoxLayout* vLayout = new QVBoxLayout;
-  vLayout->addWidget(coresBox);
-  vLayout->addWidget(debugBox);
-  vLayout->addWidget(dmaBox);
-  vLayout->addWidget(stallsBox);
-  vLayout->addWidget(statisticsBox);
-  vLayout->addWidget(cachesBox);
-  vLayout->addWidget(powerBox);
-  signalsDialog->setLayout(vLayout);
-
-  // Create the connections between button triggers and slots
-  connect(coresBox,SIGNAL(clicked(bool)), this, SLOT(switchCoresSig(bool)));
-  connect(debugBox,SIGNAL(clicked(bool)), this, SLOT(switchDebugSig(bool)));
-  connect(dmaBox,SIGNAL(clicked(bool)), this, SLOT(switchDmaSig(bool)));
-  connect(stallsBox,SIGNAL(clicked(bool)), this, SLOT(switchStallsSig(bool)));
-
-  return signalsDialog;
-}
-
-
-void MainWindow::createMenus()
-{
-  mainMenu = menuBar()->addMenu(tr("&View"));
-  gvsocSettingsMenu = menuBar()->addMenu(tr("&Settings"));
-  // Create signals Dialog
-  signalsDialog = createNonExclusiveGroup();
-  gvsocSettingsMenu->addAction("Gvsoc Settings",signalsDialog, SLOT(exec()));
-}
-
-MainWindow::MainWindow( std::string exampleDir,
-                        std::string path_to_elf,
-                        std::string configFileName,
-                        QString signalsTreeFileName):
-      exampleDir(exampleDir),
-      path_to_elf(path_to_elf),
-      configFileName(configFileName),
-      signalsTreeFileName(signalsTreeFileName)
-{
-
-  qDebug() << "[-] " ;
-  qDebug() << "[-] Create MainWindow" ;
-  createMenus();
-  setWindowTitle(tr("PROFILER"));
-  makeToolBar();
-  resize(QDesktopWidget().availableGeometry(this).size() * 0.7);
-  // Create gvsox proxy server
-  gvsoc = new Gvsoc_proxy(configFileName);
-  // Opens gvsoc Proxy server
-  qDebug() << "[-] handleOpenB from MainWindow" ;
-  handleOpenB();
-  qDebug() << "[-] handleOpenB from MainWindow ENDED" ;
-  update();
-}
-
-MainWindow::~MainWindow()
-{
-  handleCloseB();
-  closeWindows();
-}
-
-void MainWindow::foo(){
-  get_function_table();
-}
-
-void MainWindow::createDockWindows()
-{
-// Create Overview Dock Window
-  qDebug() << "[-] Create overview dock window" ;
-  overviewDock = new QDockWidget(tr("Overview"), this);
-  overviewDock->setAllowedAreas(Qt::BottomDockWidgetArea | Qt::TopDockWidgetArea | Qt::LeftDockWidgetArea | Qt::RightDockWidgetArea );
-  overview= new ExecOverview(overviewDock);
-  overviewDock->setWidget(overview);
-
-  mainMenu->addAction(overviewDock->toggleViewAction());
-
-  // Create Source Code Dock Window
-  qDebug() << "[-] Create code window" ;
-  sourceCodeDock = new QDockWidget(tr("Source Code"), this);
-  sourceCodeDock->setAllowedAreas(Qt::BottomDockWidgetArea | Qt::TopDockWidgetArea);
-  sourceCode = new QPlainTextEdit("",sourceCodeDock);
-  sourceCode->setWordWrapMode(QTextOption::NoWrap);
-  sourceCode->setReadOnly(true);
-  sourceCodeDock->setWidget(sourceCode);
-  mainMenu->addAction(sourceCodeDock->toggleViewAction());
-
-  // create ASM Code Dock Window
-  qDebug() << "[-] Create ASM code window" ;
-  asmCodeDock = new QDockWidget(tr("ASM Code"));
-  asmCodeDock->setAllowedAreas(Qt::BottomDockWidgetArea | Qt::TopDockWidgetArea);
-  asmCode = new QPlainTextEdit("",asmCodeDock);
-  asmCode->setWordWrapMode(QTextOption::NoWrap);
-  asmCode->setReadOnly(true);
-  asmCodeDock->setWidget(asmCode);
-  addDockWidget(Qt::BottomDockWidgetArea,asmCodeDock);
-  mainMenu->addAction(asmCodeDock->toggleViewAction());
-
-  // Create StallChart Window
-  qDebug() << "[-] Create stallchart window" ;
-  stallChartDock = new QDockWidget(tr("Stall Chart"), this);
-  stallChartDock->setAllowedAreas(Qt::BottomDockWidgetArea | Qt::LeftDockWidgetArea | Qt::RightDockWidgetArea | Qt::TopDockWidgetArea);
-  stallchart = new StallChart(stallChartDock);
-  stallChartDock->setWidget(stallchart);
-  addDockWidget(Qt::RightDockWidgetArea,stallChartDock);
-  stallChartDock->hide();
-  mainMenu->addAction(stallChartDock->toggleViewAction());
-
-  // Create Function Details Dock Window
-  qDebug() << "[-] Create functions Window" ;
-  functionsDock = new QDockWidget(tr("Functions"), this);
-  functionsDock->setAllowedAreas(Qt::BottomDockWidgetArea | Qt::TopDockWidgetArea);
-  fd = new FunctionDetails(functionsDock, sourceCode, asmCode, stallchart);
-  functionsDock->setWidget(fd);
-
-  // Add all Bottom Widgets
-  //qDebug() << " **** Add Widgets ****" ;
-  addDockWidget(Qt::BottomDockWidgetArea,overviewDock);
-  overviewDock->hide();
-  addDockWidget(Qt::BottomDockWidgetArea, functionsDock);
-  addDockWidget(Qt::BottomDockWidgetArea,sourceCodeDock);
-  asmCodeDock->hide();
-
-  mainMenu->addAction(functionsDock->toggleViewAction());
-  dockWindowsCreated=true;
-
-  // add signalsStatAction to pilote visibility of SignalsTableView
-  signalsStatAction = new QAction(tr("&Signals Statistics"), this);
-  signalsStatAction->setCheckable(true);
-  signalsStatAction->setChecked(true);
-  mainMenu->addAction(signalsStatAction);
-  connect(signalsStatAction , SIGNAL(triggered()), this, SLOT(signalsStatActionChecked()));
-
-}
-
-void MainWindow::signalsStatActionChecked() {
-  // called when the "signals statistics" table action has been triggered
-
-  if(signalsStatAction->isChecked())
-    timeline->changeSignalsStatVisibility(true);
-  else
-    timeline->changeSignalsStatVisibility(false);
-}
-
-void MainWindow::handleCloseB()
-{
-  // Handles Closing Gvsoc
-  qDebug() << "[-] handleCloseB " ;
-
-  // Change button color
-  if (gvsocOpened && gvsocRunning)
-  {
-    qDebug() << "[-] Closing Gvsoc" ;
-    assert(gvsoc);
-    gvsoc->close(); // What does it do?
-    qDebug() << "[-] gvsoc Closed ";
-    changeColor(runB,Qt::white);
-    // closes all sub windows, including the timeline window.
-    // Is this really necessary?
-    // closeWindows();
-  }
-
-  // Reinitialise Booleans
-  signalsAdded =false;
-  gvsocRunning = false;
-  gvsocOpened=false;
-  qDebug() << "[-] END handleCloseB " ;
-}
-
-void MainWindow::handlePauseB()
-{
-  // Handles pausig gvsoc
-  qDebug() << "[-] handlePauseB" ;
-  if (gvsocOpened && gvsocRunning){
-    // Change button color
-    changeColor(runB,Qt::red);
-    gvsoc->pause();
-    qDebug() << "[-] gvsoc paused ";
-    gvsocRunning = false;
-  }
-
-  qDebug() << "[-] End handlePauseB" ;
-}
-
-void MainWindow::handleRunB()
-{
-  qDebug() << "[-] handleRunB" ;
-  // handles running Gvsoc for first time or after a Pause or after a stop
-
-
-  assert(NULL != timeline);
-  // Run GVSoc
-  if (gvsocRunning) {
-    std::cout << "[-] Gvsoc is already running" << std::endl;
-    return;
-  }
-  else
-  {
-    if (!gvsocOpened) {
-    handleOpenB();
-    }
-    // Change button color
-    changeColor(runB,Qt::green);
-    qDebug() << "[-] Start Running Gvsoc" ;
-    gvsoc->run();
-    qDebug() << "[-] gvsoc Running ";
-    gvsocRunning = true;
-    gvsocRun = true;
-    timeline->updateFlags(gvsocRun);
-  }
-  qDebug() << "[-] signalsAdded " << signalsAdded << " " << timeline->getTLGView() ;
-  if (!signalsAdded){
-    timeline->getTLGView()->handleSignalNodeExpanded();
-    signalsAdded=true;
-  }
-  qDebug() << "[-] end SignalsAdded" ;
-  timeline->update();
-  qDebug() << "[-] End handleRunB" ;
-}
-
-
-void MainWindow::handleOpenB()
-{
-  if (gvsocOpened)
-    return;
-  // Gvsoc supposed not to be opened when called
-  qDebug() << "[-] handleOpenB2 " << "gvsocOpened " << gvsocOpened << "gvsocRunning " << gvsocRunning;
-  qDebug() << "dockWindowsCreated " << dockWindowsCreated ;
-
-  if (gvsoc->open()){
-    qDebug() << "[-] Gvsoc process didn't open correctly" ;
-    QMessageBox msgBox;
-    msgBox.setText("Gvsoc didn't open correctly");
-    msgBox.exec();
-    qDebug() << "Gvsoc didn't open correctly";
-  }
-  qDebug() << "[-] Gvsoc process opened correctly2" ;
-
-  // re-initializing gvsocRun
-  gvsocRun = false;
-  if (timeline!=NULL)
-      timeline->updateFlags(gvsocRun);
-  // It's crashing here obviously ... ??
-  //gvsoc->remove_event_regex(".*@all.bin");
-
-   //if (!gvsocOpened) {
-    qDebug() << "[-] Initializing Backend Process" ;
-    printf("[.] Profiling @ %u KHz\n", (uint) 100000 / 1000);
-    if (! init_backend("all.bin", path_to_elf, 100000)){
-      QMessageBox msgBox;
-      msgBox.setText("[.] Fatal error: impossible to init backend. Program exits");
-      msgBox.exec();
-      printf("[.] Fatal error: impossible to init backend");
-      qDebug() << "Fatal error: impossible to init backend";
-      exit (1);
-    }
-    qDebug() << "[-] Backend Process initialized correctly" ;
-  //}
-
-  if (!dockWindowsCreated) {
-    qDebug() << "[-] Creating Dock Windows " ;
-    createDockWindows();
-    dockWindowsCreated=true;
-    qDebug() << "[-] Dock Windows created" ;
-  }
-
-  if (timeline == NULL)
-  {
-    qDebug() << " [-] Creating new Timeline Window " ;
-    timeline = new Timeline(this, toolBar, fd, functionsDock,
-                            sourceCode, asmCode, stallchart,signalsTreeFileName);
-    if (timeline!=NULL ) {
-      setCentralWidget(timeline);
-      timeline->show();
-    }
-    else
-       qDebug() << " [-] Timeline Window hasn't been created" ;
-  }
-
-  gvsocOpened = true;
-  initGvsoc();
-  update();
-  qDebug() << "[-] End handleOpenB " ;
-}
diff --git a/tools/profiler/gui/src/prolabel.cpp b/tools/profiler/gui/src/prolabel.cpp
deleted file mode 100644
index 4b957978d..000000000
--- a/tools/profiler/gui/src/prolabel.cpp
+++ /dev/null
@@ -1,32 +0,0 @@
-/*
- * Copyright (C) 2020  GreenWaves Technologies, SAS
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU Affero General Public License as
- * published by the Free Software Foundation, either version 3 of the
- * License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU Affero General Public License for more details.
- *
- * You should have received a copy of the GNU Affero General Public License
- * along with this program.  If not, see <https://www.gnu.org/licenses/>.
- */
-
-#include "prolabel.hpp"
-
-ProLabel::ProLabel(const QString & text, QWidget * parent, Qt::WindowFlags f)
-    :QLabel(text, parent, f)
-{
-}
-
-ProLabel::~ProLabel()
-{
-}
-
-void ProLabel::mousePressEvent(QMouseEvent*)
-{
-    emit clicked();
-}
\ No newline at end of file
diff --git a/tools/profiler/gui/src/splitterdesign.cpp b/tools/profiler/gui/src/splitterdesign.cpp
deleted file mode 100644
index 232704abd..000000000
--- a/tools/profiler/gui/src/splitterdesign.cpp
+++ /dev/null
@@ -1,71 +0,0 @@
-/*
- * Copyright (C) 2020  GreenWaves Technologies, SAS
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU Affero General Public License as
- * published by the Free Software Foundation, either version 3 of the
- * License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU Affero General Public License for more details.
- *
- * You should have received a copy of the GNU Affero General Public License
- * along with this program.  If not, see <https://www.gnu.org/licenses/>.
- */
-
-#include <QHBoxLayout>
-#include <QVBoxLayout>
-
-#include "splitterdesign.hpp"
-
-void SplitterDesign::decorateSplitter(QSplitter* splitter, int index)
-{
-    Q_ASSERT(splitter != NULL);
-
-    const int gripLength = 15; 
-    const int gripWidth = 1;
-    const int grips = 3;
-
-    splitter->setOpaqueResize(false);
-    splitter->setChildrenCollapsible(false);
-
-    splitter->setHandleWidth(7);
-    QSplitterHandle* handle = splitter->handle(index);
-    Qt::Orientation orientation = splitter->orientation();
-    QHBoxLayout* layout = new QHBoxLayout(handle);
-    layout->setSpacing(0);
-    layout->setMargin(0);
-
-    if (orientation == Qt::Horizontal)
-    {
-        for (int i=0;i<grips;++i)
-        {
-            QFrame* line = new QFrame(handle);
-            line->setMinimumSize(gripWidth, gripLength);
-            line->setMaximumSize(gripWidth, gripLength);
-            line->setFrameShape(QFrame::StyledPanel);
-            layout->addWidget(line);
-        }
-    }
-    else
-    {
-        //this will center the vertical grip
-        //add a horizontal spacer
-        layout->addStretch();
-        //create the vertical grip
-        QVBoxLayout* vbox = new QVBoxLayout;
-        for (int i=0;i<grips;++i)
-        {
-            QFrame* line = new QFrame(handle);
-            line->setMinimumSize(gripLength, gripWidth);
-            line->setMaximumSize(gripLength, gripWidth);
-            line->setFrameShape(QFrame::StyledPanel);
-            vbox->addWidget(line);
-        }
-        layout->addLayout(vbox);
-        //add another horizontal spacer
-        layout->addStretch();
-    }
-}
\ No newline at end of file
diff --git a/tools/profiler/gui/src/stallchart.cpp b/tools/profiler/gui/src/stallchart.cpp
deleted file mode 100644
index def867395..000000000
--- a/tools/profiler/gui/src/stallchart.cpp
+++ /dev/null
@@ -1,102 +0,0 @@
-/*
- * Copyright (C) 2020  GreenWaves Technologies, SAS
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU Affero General Public License as
- * published by the Free Software Foundation, either version 3 of the
- * License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU Affero General Public License for more details.
- *
- * You should have received a copy of the GNU Affero General Public License
- * along with this program.  If not, see <https://www.gnu.org/licenses/>.
- */
-
-
-#include "stallchart.hpp"
-#include "backend_interface.hpp"
-
-#include <iostream>
-#include <QBarCategoryAxis>
-#include <QValueAxis>
-
-QT_CHARTS_USE_NAMESPACE
-
-StallChart::StallChart(QWidget* parent){
-  this->setParent(parent);
-  series = new QBarSeries();
-
-  chart = new QChart();
-  chart->addSeries(series);
-  chart->setTheme(QChart::ChartThemeBrownSand);
-
-  QChartView *chartView = new QChartView(chart);
-  chartView->setRenderHint(QPainter::Antialiasing);
-
-  QHBoxLayout* layout = new QHBoxLayout;
-  layout->addWidget(chartView);
-  setLayout(layout);
-}
-
-void StallChart::construct(std::string functionName){
-  if (functionName == name) return;
-  const Function_stat* fs = get_function_stat(functionName);
-  if (fs == nullptr){
-    std::cout << "[-] error: impossible to get stall stat for function ";
-    std::cout << functionName << std::endl;
-    return;
-  }
-  name = functionName;
-
-  chart->removeAllSeries();
-  uint64_t sum = 0;
-  for (int i = 0; i < N_STALL_REASONS; i++){
-    sum += (uint64_t) fs->stall_info.lost_cycles[i];
-  }
-  if (sum == 0){
-      chart->setTitle("No stall reasons identified for that function");
-      return;
-  }
-  series = new QBarSeries();
-  QBarSet* set = new QBarSet("cycles");
-  QStringList categories;
-  *set << fs->tot_time;
-  categories << "Total time";
-  // TODO: handle case fs->tot_time = 0...
-  for (int i = 0; i < N_STALL_REASONS; i++){
-    /*series->append((std::string(stall_strings[i]) + " " +
-      std::to_string(100 * ss->lost_cycles[i] / sum) + "%").c_str(),
-      ss->lost_cycles[i]);*/
-    *set << fs->stall_info.lost_cycles[i];
-    categories << (std::string(stall_strings[i]) + " " +
-      std::to_string(100 * fs->stall_info.lost_cycles[i] / fs->tot_time) + "%").c_str();
-  }
-  series->append(set);
-  /*for (auto& c: series->slices()){
-    c->setLabelVisible();
-  }*/
-
-
-  /*QValueAxis *axisY = new QValueAxis();
-  axisY->setRange(0,15);
-  chart->addAxis(axisY, Qt::AlignLeft);
-  series->attachAxis(axisY);*/
-
-  chart->setTitle(functionName.c_str());
-  chart->setTitleFont(QFont("Arial", 22));
-  chart->addSeries(series);
-
-  chart->createDefaultAxes();
-  QBarCategoryAxis *axisX = new QBarCategoryAxis();
-  axisX->append(categories);
-  auto l = chart->axes(Qt::Horizontal);
-  if (! l.empty()) chart->removeAxis(l.front());
-  chart->addAxis(axisX, Qt::AlignBottom);
-  series->attachAxis(axisX);
-
-  chart->legend()->setVisible(true);
-  chart->legend()->setAlignment(Qt::AlignRight);
-}
diff --git a/tools/profiler/gui/src/statmodel.cpp b/tools/profiler/gui/src/statmodel.cpp
deleted file mode 100644
index 3220e8c4c..000000000
--- a/tools/profiler/gui/src/statmodel.cpp
+++ /dev/null
@@ -1,323 +0,0 @@
-/*
- * Copyright (C) 2020  GreenWaves Technologies, SAS
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU Affero General Public License as
- * published by the Free Software Foundation, either version 3 of the
- * License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU Affero General Public License for more details.
- *
- * You should have received a copy of the GNU Affero General Public License
- * along with this program.  If not, see <https://www.gnu.org/licenses/>.
- */
-
-#include "statmodel.hpp"
-#include <QDebug>
-#include <iostream>
-#include <cmath>
-
-extern Data_manager* core;
-
-StatModel::StatModel(Timeline* timeline, QObject *parent) : QAbstractTableModel(parent)
-{
-  tl=timeline;
-
-  // Refreshing Implementation
-  //timer = new QTimer(this);
-  //connect(timer, SIGNAL(timeout()), this, SLOT(refreshData()));
-  //timer->start(2000);
-}
-
-//updates the parameters allowing the right construction
-//of the tree model. Called each time the signalsTreeView
-//is expanded or collapsed under user action
-// Function called from two functions:
-// handleSignalNodeExpanded()
-// handleSignalNodeCollapsed()
-void StatModel::updateTreeParameters(TreeModel* sigModel,
-                                     QTreeView* signalsTreeView) {
-  this->signalsView = signalsTreeView;
-  this->signalsModel = sigModel;
-}
-
-template <typename T >
-uint64_t StatModel::getTimeUp(const Data_with_time<T>& dwt,
-                        uint64_t t0,
-                        uint64_t t1) {
-  //std::cout << "[-]  StatModel::getTimeUp t0: " << t0 << "t1: " << t1 <<std::endl;
-  // First calculate the interval
-  int avg_factor =0;
-  uint64_t timeUp=0;
-  uint64_t tBegin = dwt.begin;
-  uint64_t tEnd = dwt.end;
-  if (tBegin< t0)
-    tBegin=t0;
-  if (tEnd > t1)
-    tEnd=t1;
-
-  std::string txtValue;
-  txtValue = std::string("1");
-  if (! dwt.mixed_up){
-    if (std::is_same<T, const char*>::value){
-      // pointer trick, because the compiler does not understand that T is
-      // nothing but const char* in this "if"
-      const char* s;
-      memcpy(&s, &(dwt.d), sizeof(const char*));
-      //txtValue = std::to_string(signalId) + "-" + std::string(s);
-      txtValue =  std::string(s);
-    }
-    else if (std::is_same<T, generic_data_t>::value){
-      generic_data_t g;
-      memcpy(&g, &(dwt.d), sizeof(generic_data_t));
-      //txtValue = std::to_string(signalId) + "-" + std::to_string(g);
-      txtValue =  std::to_string(g);
-    }
-    else if (std::is_same<T, g_decompressed_data_t>::value){
-      g_decompressed_data_t ud;
-      memcpy(&ud, &(dwt.d), sizeof(g_decompressed_data_t));
-      txtValue = std::to_string(ud.value);
-      avg_factor = ud.n_items_in_avg;
-      if (ud.value!=0 && avg_factor!=1)
-        //txtValue = std::to_string(signalId) + "-" + (std::to_string(ud.value) + "(avg. "
-        txtValue =  (std::to_string(ud.value) + "(avg. "
-                 + std::to_string(avg_factor) + ")" );
-      else if (ud.value!=0)
-        //txtValue = std::to_string(signalId) + "-" + (std::to_string(ud.value));
-        txtValue =   (std::to_string(ud.value));
-    }
-    else {
-      std::cout << "[-] Warning data type not recognized by the GUI" << std::endl;
-      //std::cout << "[-] timeUp" << 0<< std::endl;
-      return 0;
-    }
-    //painter.setBrush(QBrush(getColor(txtValue)));
-  }
-  else {
-    // Trace is mixed up .. means it is composed of merged traces.
-    // painter.setBrush(QBrush(Qt::black));
-    // shouldn't happen
-    std::cout << "[-] Signal shouldn't be mixed up" << std::endl;
-    //std::cout << "[-] timeUp" << 0<< std::endl;
-    return 0;
-  }
-
-  if (txtValue.compare(std::string("0")) != 0) {
-    timeUp= tEnd - tBegin;
-    //std::cout << "[-] timeUp" << timeUp << std::endl;
-  }
-
-  return timeUp;
-}
-
-
-// Calculates the time the signal is Up within the [t0,T1] range
-uint64_t StatModel::calculateTimeUp(const QString signalPath,
-                    std::vector<TLData<const char*>> data,
-                    std::vector<int> signalIdList,
-                    uint64_t t0,
-                    uint64_t t1) {
-
-  //std::cout << "[-]  ******* StatModel::calculateTimeUp " << std::endl;
-  //std::cout << "[-]  ******* SignalPath " << signalPath.toStdString() << std::endl ;
-  QString pcSignalPath = signalPath;
-  uint64_t timeUp = 0;
-  int signalId = 0;
-
-  // First Look if it's a core or fc & state signal
-  // if it's a peX/state signal, or fc/state signal, we define the path to a new signal that should be taken
-  // for the calculation in pcSignalPath. In the other cases, pcSignalPath is set to "" .
-  if (pcSignalPath.contains("pe") & pcSignalPath.contains("state")) {
-    pcSignalPath.replace("state", "pcer_instr");
-  } else if (pcSignalPath.contains("fc") & pcSignalPath.contains("state")) {
-    pcSignalPath.replace("state", "pc");
-  }
-
-  // Second look for the signal ID
-  signalId = getSignalIdFromBackend(pcSignalPath.toStdString());
-  // Get the signal index in list
-  int signalIdx = tl->getTLGView()->getSignalIndex(signalIdList, signalId);
-  //std::cout << "[-]  ******* signalIdx" << signalIdx <<  std::endl;
-  //std::cout <<  "[-] ******* pcSignalPath " << pcSignalPath.toStdString() << std::endl;
-
-  if (signalIdx != -1) { // signal found in core signals
-    //qDebug() << "[-] Calculate timeUp of core Signal " << signalId << " " << pcSignalPath  ;
-    // Index exist ==> just calculate TimeUp value for the timeline interval
-    for(auto iter = data[signalIdx].between(t0, t1, 1); !iter.done(); ++iter){
-      // Gets the time the signal is up during the [t0,t1] range
-      timeUp+= getTimeUp<const char*>(*iter,t0,t1);
-    }
-    //qDebug() << "timeUp " << timeUp ;
-    return timeUp;
-  }
-  //else
-  //  std::cout << " Signal ID " <<  signalId << " not found in signalIdList or not to be displayed" << std::endl;
-
-  //Regular signal ==> just calculate timeUp value
-  auto l = event_timestamps(signalId);
-  if (l != nullptr){
-    //qDebug() << "[-] Calculate TimeUp for  Other Signal " << signalId << " " << pcSignalPath ;
-    for(auto item : decompress(l->between(t0, t1, 1))){
-      timeUp += getTimeUp<g_decompressed_data_t>(item,t0,t1);
-    }
-    //qDebug() << "timeUp " << timeUp ;
-    return timeUp;
-  }
-
-  return timeUp;
-}
-
-void StatModel::insertRow(const QString signalPath,
-                          std::vector<TLData<const char*>> data,
-                          std::vector<int> signalIdList,
-                          int line,
-                          uint64_t t0,
-                          uint64_t t1) {
-  //qDebug() << "[-]  StatModel::insertRow " ;
-  //std::cout <<  "[-] ******* timestamp " << t0 << "  " << t1 <<  std::endl;
-  //beginInsertRows();
-
-  // First get signal ID
-  //uint32_t signalId = getSignalId(signalPath.toStdString());
-  // Calculate Time Up
-
-  // Get signal Stat from id
-  //int dutyTime = core->signalDutyTime[signalId];
-  //auto maxTime = get_max_time();
-  uint64_t dutyTime = calculateTimeUp(signalPath, data, signalIdList,t0,t1);
-
-  uint64_t rangeTime = (t0 > t1) ? t0 - t1 : t1 - t0;
-  //qDebug() << "[-] dutyTime  " << dutyTime ;
-  //qDebug() << "[-] rangeTime  " << rangeTime ;
-  int percentage =0;
-  if (rangeTime>0)
-    percentage = (int)(100.0 * dutyTime / rangeTime);
-  else
-    percentage =0;
-
-  //std::cout << "[-] line " << line << "percentage  " << percentage << std::endl;
-  // Enter data in smDutyTime list at the row n° line .. or in order?? check it
-  if (signalPath.contains("pe") & signalPath.contains("state"))
-    smDutyTime.insert(line,QString::number(((float)percentage) / 100.00) + QString(" ipc"));
-  else 
-    smDutyTime.insert(line,QString::number(percentage) + QString("%"));
-
-  //endInsertRows()
-}
-
-void StatModel::buildModel( QModelIndex parent,
-                            QModelIndex signalIdx,
-                            std::vector<TLData<const char*>> data,
-                            std::vector<int> signalIdList,
-                            int* line,
-                            uint64_t t0,
-                            uint64_t t1) {
-  // This function walks recursively through the signals tree
-  // and fills up the signals statistics table accordingly
-  //qDebug() << "[-]  StatModel::buildModel " ;
-  //std::cout <<  "[-] ******* timestamp " << t0 << "  " << t1 <<  std::endl;
-  int i =0;
-  QModelIndex idx;
-  QModelIndex siblingIdx;
-  // dump parent node
-  if (parent != signalsView->rootIndex()){
-      (*line)++;
-      insertRow(signalIdx.data().toString(), data, signalIdList,*line,
-                          t0, t1);
-  }
-
-  if ((parent == signalsView->rootIndex()) || signalsView->isExpanded(parent)) {
-      for (i=0; i< signalsModel->rowCount(parent) ; i++) {
-          idx = signalsModel->index(i,0,parent);
-          siblingIdx = signalsModel->index(i,1,parent);
-          buildModel(idx,siblingIdx, data, signalIdList,line,
-                          t0, t1);
-      }
-  }
-
-}
-
-// Create a method to populate the model with data:
-void StatModel::populateData(uint64_t t0 , uint64_t t1)
-{
-    //qDebug() << "[-]  StatModel::populateData ";
-
-    // clear signals stats list
-    smDutyTime.clear();
-
-    auto data = get_timeline_data();
-
-    auto signalIdList = get_timeline_id();
-    int line = -1;
-     //std::cout <<  "[-] ******* timestamp " << t0 << "  " << t1 <<  std::endl;
-    if (signalsView != NULL) {
-      //std::cout << "[-]  signalsView: " << signalsView << std::endl;
-      auto index = signalsView->rootIndex();
-      //std::cout <<  "[-] ******* timestamp " << t0 << "  " << t1 <<  std::endl;
-      buildModel(index,
-              signalsView->rootIndex(),
-              data, signalIdList, &line,
-              t0, t1);
-    }
-    QModelIndex top =  createIndex(0,0);
-    QModelIndex bottom = createIndex(rowCount() - 1, 0);
-    emit dataChanged(top, bottom);
-    return;
-}
-
-
-
-int StatModel::rowCount(const QModelIndex &parent) const
-{
-    Q_UNUSED(parent);
-    return smDutyTime.length();
-}
-
-int StatModel::columnCount(const QModelIndex &parent) const
-{
-    Q_UNUSED(parent);
-    return 1;
-}
-
-QVariant StatModel::data(const QModelIndex &index, int role) const
-{
-    if (!index.isValid())  {
-        return QVariant();
-    }
-
-    if (role == Qt::SizeHintRole) {
-        return QSize(100, 30); // for all rows
-    } else if (index.column() == 0 && role == Qt::TextAlignmentRole) {
-        return Qt::AlignRight;
-    }
-    else if (role != Qt::DisplayRole)
-        return QVariant();
-
-    if (index.column() == 0) {
-        return smDutyTime[index.row()];
-    }
-    /*else if (index.column() == 1) {
-        return tm_contact_phone[index.row()];
-    }
-    */
-    return QVariant();
-}
-
-QVariant StatModel::headerData(int section, Qt::Orientation orientation, int role) const
-{
-    if (role == Qt::DisplayRole && orientation == Qt::Horizontal) {
-        if (section == 0) {
-            //return QString("Name");
-            return QVariant();
-        }
-        else if (section == 1) {
-            //return QString("Phone");
-            return QVariant();
-        }
-
-    }
-    return QVariant();
-}
diff --git a/tools/profiler/gui/src/timeline.cpp b/tools/profiler/gui/src/timeline.cpp
deleted file mode 100644
index e6dc9498b..000000000
--- a/tools/profiler/gui/src/timeline.cpp
+++ /dev/null
@@ -1,1785 +0,0 @@
-/*
- * Copyright (C) 2020  GreenWaves Technologies, SAS
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU Affero General Public License as
- * published by the Free Software Foundation, either version 3 of the
- * License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU Affero General Public License for more details.
- *
- * You should have received a copy of the GNU Affero General Public License
- * along with this program.  If not, see <https://www.gnu.org/licenses/>.
- */
-
-#include <iostream>
-#include <QPushButton>
-#include <QScrollBar>
-#include <QApplication>
-#include <QStyle>
-#include <QSpacerItem>
-#include <QTimer>
-#include <QIcon>
-#include <QToolBar>
-#include <QGraphicsSimpleTextItem>
-#include <QAbstractSlider>
-#include <QSizePolicy>
-#include <QMessageBox>
-#include <QColor>
-#include <QPalette>
-#include <QFile>
-#include <QHeaderView>
-#include <unistd.h>
-#include <QtDebug>
-#include <assert.h>
-
-#include "math.h"
-
-#include "timeline.hpp"
-
-#include "backend_interface.hpp"
-
-const double  TLGView::rectRelativeSize = 0.8;
-const double  TLGView::rectRelativeOffset = (1 - TLGView::rectRelativeSize) / 2;
-const int     TLGView::ticStep = 200; // pixels
-const int     TLGView::legendWidth = 100;
-const int     TLGView::legendTextSpace = 3;
-const int     TLGView::ticSize = 8;
-const int     TLGView::functionNameMinSize = 50;
-const int     TLGView::autoRedrawPeriod = 2000; // ms
-const int     TLGView::scrollBarMaxRange = 100000;
-const double  TLGView::scrollCoeff = 2. / 10'000;
-
-
-bool AdjustingScrollArea::eventFilter(QObject * obj, QEvent * ev) {
-  if (obj == widget() && ev->type() == QEvent::Resize) {
-    qDebug() << "[-] eventFilter";
-    // Before sync, we should repaint the graphic view unless it's not
-    // right
-    tlgView->addSignalsToGview(signalsTreeView->rootIndex(),signalsTreeView->rootIndex());
-    tlgView->update();
-    syncWithTreeView();
-    tlgView->update();
-    tlgView->show();
-    this->update();
-    this->show();
-  }
-
-  //std::cout << "[-] end eventFilter" << std::endl;
-  return QScrollArea::eventFilter(obj, ev);
-}
-
-void AdjustingScrollArea::syncWithTreeView() {
-     qDebug() << "[-] syncWithTreeView" ;
-    // Problem is when we enter this function, sometimes, the SignalsTreeView scrollbar
-    // range is not yet updated.
-    // So first, update signalsTreeView
-    signalsTreeView->update();
-    signalsTreeView->verticalScrollBar()->update();
-    verticalScrollBar()->update();
-    syncMutex.lock();
-    // First block the scrollArea vertical bar signals
-    this->verticalScrollBar()->blockSignals(true);
-
-    // First calculate the percentage of move on the signalsTreeView
-    double  min1 = (double) signalsTreeView->verticalScrollBar()->minimum();
-    double  max1 = (double) signalsTreeView->verticalScrollBar()->maximum();
-    if ((max1 - min1) <= 0) {
-      qDebug() << "min-max " << min1 << " " << max1 ;
-      qDebug() << "Error no sync possible: max1 -min1 <=0";
-      syncMutex.unlock();
-      return;
-    } else {
-      // Then get min and max of the   Vartical bar
-      double min2 = (double) verticalScrollBar()->minimum();
-      double max2 = (double) verticalScrollBar()->maximum();
-      double factor = (max2 - min2) / (max1-min1) ;
-      qDebug() << "min-max " << min1 << " " << max1 << " " << min2 << " " << max2;
-      qDebug() << "factor " << factor;
-      double value4= factor * (double) signalsTreeView->verticalScrollBar()->value();
-      double value3= round(value4);
-      int value2 = value3;
-      qDebug() << "signals slider value" << signalsTreeView->verticalScrollBar()->value();
-      qDebug() << "scrollarea slider value (double) " << value4;
-      qDebug() << "round'scrollarea slider value) (double) " << value3;
-      qDebug() << "scrollarea slider value (int)" << value2;
-      // set the scrollArea scroll bar value
-      this->verticalScrollBar()->setValue(value2);
-
-      // update widget position
-      QPoint topLeft = viewport()->rect().topLeft();
-      this->widget()->move(0, topLeft.y() - value2);
-
-      // Finally unblock scrollarea signals
-      this->verticalScrollBar()->blockSignals(false);
-      this->update();
-      this->show();
-    }
-    syncMutex.unlock();
-    qDebug() << "End syncWithTreeView";
-}
-
-void AdjustingScrollArea::updateWidgetPosition(){
-  qDebug() << "[-] updateWidgetPosition" ;
-  int vvalue = verticalScrollBar()->value();
-  QPoint topLeft = viewport()->rect().topLeft();
-
-  widget()->move(0, topLeft.y() - vvalue);
-}
-
-AdjustingScrollArea::AdjustingScrollArea(QTreeView* signalsView,
-                                         TLGView* gview,
-                                         QWidget * parent) : QScrollArea{parent} {
-  signalsTreeView = signalsView;
-  tlgView=gview;
-                                         }
-
-void AdjustingScrollArea::setWidget(QWidget *w) {
-    QScrollArea::setWidget(w);
-    // It happens that QScrollArea already filters widget events,
-    // but that's an implementation detail that we shouldn't rely on.
-    w->installEventFilter(this);
-}
-
-void HLegendWidget::paintEvent(QPaintEvent* event) {
-
-  //std::cout << "[-] Paint HLegendWidget 1" << std::endl;
-  const QRect r = event->rect();
-  int x0 = r.x();
-  int x1 = r.x() + r.width();
-  uint64_t t0, t1;
-
-  gview->getTimeWindow(x0, x1, t0, t1);
-
-  uint X = size().width();
-  uint Y = gview->peHeight;
-
-  QPainter painter(this);
-  pen = QPen(Qt::black);
-  pen.setWidthF(0);
-
-  painter.setPen(pen);
-  // Paint horizontal line of the legend
-  painter.drawLine(0, gview->peHeight, X,Y );
-
-  // calculating time interval for 5 ticsteps approximatly
-  float vi= (gview->position2time(X) - gview->position2time(0))/10;
-  int viRound = gview->extractPower10Round(vi);
-  uint64_t xtmin= (uint64_t) (gview->position2time(0)/viRound) * viRound;
-  uint64_t xtmax= (uint64_t) (gview->position2time(X)/viRound) * viRound;
-
-
-  QString textSave="";
-  if ((xtmax-xtmin)/viRound > 12)
-    viRound = viRound *2;
-  if ((xtmax-xtmin)/viRound < 5)
-    viRound = viRound/2;
-  int firstText=true;
-  for (uint64_t xt=xtmin; xt<=xtmax; xt+= viRound){
-    int x = gview->time2position(xt);
-    // paint vertical stick of the legend for the given timestamp
-    painter.drawLine(x, Y, x, Y - gview->ticSize);
-    QRect rect( x - gview->legendWidth / 2,
-                Y - gview->peHeight,
-                gview->legendWidth,
-                gview->peHeight - gview->ticSize - 1);
-    // Don't write first time number on legend as it would be half written
-    if (!firstText) {
-      painter.drawText(rect, Qt::AlignCenter, gview->time2stringBis(xt));
-    } else {
-      firstText=false;
-    }
-  }
-
-
-  //std::cout << "[-] Paint HLegendWidget End" << std::endl;
-}
-
-HLegendWidget::HLegendWidget(TLGView* gview): gview(gview) {
-
-  //std::cout << "[-] create HLegendWidget 0" << std::endl;
-
-}
-
-
-
-TLGView::TLGView( FunctionDetails* fd,
-                  QDockWidget* functionsDock,
-                  Timeline* tl,
-                  QScrollBar* sbar,
-                  QVBoxLayout* legendLayout,
-                  QPlainTextEdit* sourceCode,
-                  QPlainTextEdit* asmCode,
-                  StallChart* stallchart,
-                  TreeModel* signalsModel,
-                  QTreeView* signalsView,
-                  StatModel* statModelRef,
-                  QTableView*  statTableViewRef
-                  ):
-                  functionsDock(functionsDock),
-                  sourceCode(sourceCode),
-                  asmCode(asmCode),
-                  stallchart(stallchart),
-                  signalsModel(signalsModel),
-                  signalsView(signalsView),
-                  statModel(statModelRef),
-                  statTableView(statTableViewRef)
-{
-  this->fd = fd;
-  this->tabw=tabw;
-  this->tl = tl;
-  this->sbar =sbar;
-  this->legendLayout = legendLayout;
-  currentMode = TIME_MODE;
-  pen = QPen(Qt::black);
-  greenPen = QPen(Qt::darkGreen);
-  redPen = QPen(Qt::red);
-  pen.setWidthF(0);
-  greenPen.setWidthF(0);
-  redPen.setWidthF(0);
-  zoomFactor = 1. / 10'000'000;
-  timeOffset = 0;
-  maxTime = 1;
-  maxScrollTime = 1;
-  verticalLineTime = 0;
-  timeIntervalValue = 0;
-  //qDebug() << "this->sbar: " << this->sbar ;
-  this->sbar->setRange(0, TLGView::scrollBarMaxRange);
-  setPageStep();
-
-  m_nbMousePressed = false;
-
-  connect(&timer, SIGNAL(timeout()), this, SLOT(autoReconstruct()));
-  timer.start(TLGView::autoRedrawPeriod);
-  qDebug() << "End create Gview: " ;
-}
-
-QColor TLGView::getColor(std::string fname){
-  if (functionColors.find(fname) != functionColors.end())
-    return functionColors[fname];
-
-  functionColors[fname] = QColor::fromHsl(currentHue, 0xff, 0xc0);
-  currentHue += hueStep;
-  if (currentHue >= 360){
-    if( hueStep >= 2) hueStep /= 2;
-    currentHue = (currentHue % 360) + hueStep;
-  }
-  return functionColors[fname];
-}
-
-void TLGView::drawBackground(QPainter *painter, const QRect &exposed, int nlines)
-{
-    QColor c1 = Qt::white;
-    QColor c2 = QColor::fromRgb(0xf0, 0xf0, 0xf0);
-    painter->setPen(Qt::NoPen);
-    for (int i = 0; i < nlines; i++){
-      painter->setBrush(i & 1 ? c1 : c2);
-      painter->drawRect(exposed.x(), peHeight * i, exposed.width(), peHeight);
-    }
-}
-
-void TLGView::setPageStep(){
-  uint64_t t0, t1;
-  getTimeWindow(t0, t1);
-  assert(maxScrollTime!= 0);
-  assert(zoomFactor!=0);
-  this->sbar->setPageStep(TLGView::scrollBarMaxRange * (t1 - t0) / maxScrollTime);
-  this->sbar->setSingleStep(std::max(1, (int) (TLGView::scrollCoeff / zoomFactor)));
-}
-
-void TLGView::setMaxScrollTime(){
-  // << "[-]setMaxScrollTime " << std::endl;
-  uint64_t w = timeWidth();
-  // add a 30 pixel margin after last event (no margin looks weird)
-  //std::cout << "maxTime " << maxTime << std::endl;
-  //std::cout << "w " << w << std::endl;
-  //std::cout << "zoomFactor" << zoomFactor << std::endl;
-  assert(zoomFactor!=0);
-  if (maxTime > w)
-    maxScrollTime = maxTime - w + 30 / zoomFactor;
-  else
-    maxScrollTime = 1;
-
-  //std::cout << "maxScrollTime " << maxScrollTime << std::endl;
-  // readjust sbar->value to be consistent with timeOffset
-  assert(maxScrollTime!= 0);
-  sbar->setValue(timeOffset * TLGView::scrollBarMaxRange / maxScrollTime);
-}
-
-void TLGView::autoReconstruct(){
-  //std::cout << "autoREconstruct  " << std::endl;
-  if (tldata_has_changed()){
-    ack_tld_changes();
-    maxTime = get_max_time();
-    setMaxScrollTime();
-    setPageStep();
-    update();
-  }
-  if (datamanager_done()){
-    tl->clearCurrentExecToolBar();
-  }
-}
-
-void TLGView::getTimeWindow(uint64_t& t0, uint64_t& t1) const{
-  getTimeWindow(0, size().width(), t0, t1);
-}
-
-void TLGView::getTimeWindow(int x0, int x1, uint64_t& t0, uint64_t& t1) const{
-  t0 = timeOffset;
-  assert(zoomFactor!= 0);
-  t1 = timeOffset + (uint64_t) ((x1 - x0) / zoomFactor);
-}
-
-int TLGView::time2position(uint64_t t) const {
-  if (t < timeOffset) return -1;   // not in display range
-  return round((double) (t - timeOffset) * zoomFactor);
-}
-
-uint64_t TLGView::position2time(int x) const {
-  assert(zoomFactor!= 0);
-  return timeOffset + (uint64_t) ((double) x / zoomFactor);
-}
-
-uint64_t TLGView::width2time(int x) const {
-  assert(zoomFactor!= 0);
-  return (uint64_t) ((double) x / zoomFactor);
-}
-
-uint64_t TLGView::timeWidth() const{
-  assert(zoomFactor!= 0);
-  return (double) size().width() / zoomFactor;
-}
-
-QString TLGView::position2string(int x) const {
-  return time2string(position2time(x));
-}
-
-QString TLGView::formatTimeStamp(QString ts) const {
-
-  const int step = 3;
-  const char mychar = ',';
-  for (int i = ts.length()-step; i>0; i=i-step)
-      ts.insert(i, mychar);
-
-  return ts;
-}
-
-QString TLGView::time2string(uint64_t t) const {
-  int T;
-  switch(currentMode){
-    default:
-    case TIME_MODE:
-      return formatTimeStamp(QString::number(t / 1000)) + " ns";
-    case FC_CYCLE_MODE:
-      T = get_fc_period();
-      if (T == 0) {
-        std::cout << "[-] Error: cluster period = 0" << std::endl;
-        exit(1);
-      }
-      return formatTimeStamp(QString::number(t / T));
-    case CLUSTER_CYCLE_MODE:
-      T = get_cluster_period();
-      if (T == 0) {
-        std::cout << "[-] Error: cluster period = 0" << std::endl;
-        exit(1);
-      }
-      return formatTimeStamp(QString::number(t / T));
-  }
-}
-
-// Variant of time2string: in TIME_MODE the value is t / 1e6 printed with two
-// decimals and no unit suffix; the cycle modes divide t by the FC or cluster
-// period. The process exits with status 1 if the relevant period is 0.
-QString TLGView::time2stringBis(uint64_t t) const {
-  int T;
-  switch(currentMode){
-    default:
-    case TIME_MODE:
-      // format in us unit
-      return QString::number(double(t) / double(1000000),'f',2 );
-    case FC_CYCLE_MODE:
-      T = get_fc_period();
-      if (T == 0) {
-        // This message used to say "cluster period", which pointed at the wrong clock.
-        std::cout << "[-] Error: fc period = 0" << std::endl;
-        exit(1);
-      }
-      return formatTimeStamp(QString::number(t / T));
-    case CLUSTER_CYCLE_MODE:
-      T = get_cluster_period();
-      if (T == 0) {
-        std::cout << "[-] Error: cluster period = 0" << std::endl;
-        exit(1);
-      }
-      return formatTimeStamp(QString::number(t / T));
-  }
-}
-
-// Draws one timed data item as a rectangle (plus an optional centered label)
-// on the row starting at ordinate y.
-//   dwt     : item to draw; dwt.begin / dwt.end give its time span
-//   x1      : right abscissa used when dwt.end is not in the display range
-//   y       : top ordinate of the row
-//   painter : painter to draw with (its brush is modified)
-// Returns the number of primitives drawn: 0 (nothing), 1 (rectangle only)
-// or 2 (rectangle and text).
-template <typename T>
-int TLGView::drawDwt(const Data_with_time<T>& dwt, int x1, int y,
-                        QPainter& painter){
-
-  int displayedItems = 0; // nb of displayed items
-  int avg_factor = 1;
-  int xa = time2position(dwt.begin);
-  int xb = time2position(dwt.end);
-  // time2position returns -1 for timestamps before timeOffset
-  if (xa == -1) xa = 0;
-  if (xb == -1) xb = x1;
-  if (xa > xb){
-    std::cout << "[-] warning: invalid element to display (begin > end)" << x1 << std::endl;
-    return 0;
-  }
-  //std::cout <<  "[-] printing Signal #" << signalId << std::endl;
-
-  /*
-     depending on the type T of the template, we build the text description
-     of the item to be displayed
-  */
-  std::string txtValue;
-  txtValue = std::string("1");
-  if (! dwt.mixed_up){
-    if (std::is_same<T, const char*>::value){
-      // pointer trick, because the compiler does not understand that T is
-      // nothing but const char* in this "if"
-      // NOTE(review): the y==30 qDebug traces look like leftover debugging output
-      if (y==30 )
-        qDebug() << "case 1";
-      const char* s;
-      memcpy(&s, &(dwt.d), sizeof(const char*));
-      //txtValue = std::to_string(signalId) + "-" + std::string(s);
-      txtValue =  std::string(s);
-    }
-    else if (std::is_same<T, generic_data_t>::value){
-      generic_data_t g;
-       if (y==30 )
-        qDebug() << "case 2";
-      memcpy(&g, &(dwt.d), sizeof(generic_data_t));
-      //txtValue = std::to_string(signalId) + "-" + std::to_string(g);
-      txtValue =  std::to_string(g);
-    }
-    else if (std::is_same<T, g_decompressed_data_t>::value){
-      g_decompressed_data_t ud;
-
-      memcpy(&ud, &(dwt.d), sizeof(g_decompressed_data_t));
-      txtValue = std::to_string(ud.value);
-      avg_factor = ud.n_items_in_avg;
-      // when the value is non-zero and avg_factor is not 1, append the factor to the label
-      if (ud.value!=0 && avg_factor!=1) {
-       if (y==30 )
-        qDebug() << "case 3";
-        //txtValue = std::to_string(signalId) + "-" + (std::to_string(ud.value) + "(avg. "
-        txtValue =  (std::to_string(ud.value) + "(avg. "
-                 + std::to_string(avg_factor) + ")" );
-      }
-      else if (ud.value!=0) {
-        if (y==30 )
-          qDebug() << "case 4";
-        //txtValue = std::to_string(signalId) + "-" + (std::to_string(ud.value));
-        txtValue =   (std::to_string(ud.value));
-      }
-    }
-    else {
-      std::cout << "[-] Warning data type not recognized by the GUI" << std::endl;
-      return 0;
-    }
-    // the brush color is derived from the label text
-    painter.setBrush(QBrush(getColor(txtValue)));
-  }
-  else {
-    // mixed-up items are drawn in black, without a label
-    painter.setBrush(QBrush(Qt::black));
-  }
-  if (y==30)
-    qDebug() << "txtValue" << QString::fromUtf8(txtValue.c_str());
-
-  // items whose label is exactly "0" are not drawn at all
-  if (txtValue.compare(std::string("0")) != 0) {
-    //y += 0.1 *peHeight;
-    y += 3;
-    //qDebug() << "QRect: xa=" << xa << " y=" << y << " xb-xa=" << xb-xa ;
-    //qDebug() << "y0=" << y - rectRelativeOffset * peHeight;
-    //QRect curRect = QRect(xa, y, xb - xa, peHeight * rectRelativeSize);
-    // fixed 24-pixel-high rectangle, 3 pixels below the top of the row
-    QRect curRect = QRect(xa, y, xb - xa, 24);
-    painter.drawRect(curRect);
-    displayedItems++;
-    // draw text in the center of the rectangle
-    if ((xb - xa > functionNameMinSize) && (! dwt.mixed_up)) {
-      painter.drawText(curRect, Qt::AlignCenter, QString::fromUtf8(txtValue.c_str()));
-      displayedItems++;
-    }
-  }
-  return displayedItems;
-
-}
-
-// Returns the position of signalId inside signalIdList, or -1 when the list
-// does not contain it.
-int TLGView::getSignalIndex(std::vector<int> signalIdList , int signalId) {
-  const auto first = signalIdList.begin();
-  const auto last = signalIdList.end();
-  const auto hit = std::find(first, last, signalId);
-
-  if (hit == last)
-    return -1;   // signal Id not found in list
-  return std::distance(first, hit);
-}
-
-
-// Paints the items of one signal, for the time window [t0, t1], on row `line`.
-//
-// The signal id is resolved from signalPath. If that id is present in
-// signalIdList, the matching entry of `data` is drawn and the function
-// returns. Otherwise, unless the path designates a "pe"/"fc" "state" signal,
-// the id is looked up in id_to_display() and its decompressed event
-// timestamps are drawn. show() is called on every path except the early
-// return.
-void TLGView::paintSignalToGview(QPainter& painter, const QString signalPath,
-                                std::vector<TLData<const char*>> data,
-                                std::vector<int> signalIdList,
-                                int line,
-                                uint64_t t0,
-                                uint64_t t1,
-                                int x1) {
-
-  // Look for the signal ID
-  int signalId = getSignalIdFromBackend(signalPath.toStdString());
-
-  // Get the signal index in list
-  int signalIdx = getSignalIndex(signalIdList, signalId);
-
-  if (signalIdx != -1)  {
-    // Index exist ==> just paint the signal in the timeline view for the timeline interval
-    for(auto iter = data[signalIdx].between(t0, t1, zoomFactor); !iter.done(); ++iter){
-      drawDwt<const char*>(*iter, x1, line * peHeight, painter);
-    }
-    return;
-  }
-
-  // Signal was not found in the cores traces
-  // Then, search it in the basic signals if it is not a core trace
-  const bool isCoreStateSignal =
-      (signalPath.contains("pe") && signalPath.contains("state"))
-      || (signalPath.contains("fc") && signalPath.contains("state"));
-  if (!isCoreStateSignal) {
-    // Bind the container once: taking begin() and end() from two separate
-    // id_to_display() calls is only valid if it returns a reference.
-    const auto& ids = id_to_display();
-    if (std::find(ids.begin(), ids.end(), signalId) != ids.end()) {
-      // Signal found in the list ==> just paint it on the TLGView
-      auto l = event_timestamps(signalId);
-      if (l != nullptr){
-        for(auto item : decompress(l->between(t0, t1, zoomFactor))){
-            drawDwt<g_decompressed_data_t>(item, x1, line * peHeight, painter);
-        }
-      }
-    }
-  }
-
-  show();
-}
-
-// Depth-first walk of the signals tree model. Every visited node other than
-// the view's root increments *line and is painted on that row; children are
-// visited only for the root or for nodes expanded in signalsView.
-//   parent    : node being visited (column 0 index)
-//   signalIdx : column 1 index of the same row; its data is passed to
-//               paintSignalToGview as the signal path
-void TLGView::paintSubTree( QPainter& painter,
-                            QModelIndex parent,
-                            QModelIndex signalIdx ,
-                            std::vector<TLData<const char*>> data,
-                            std::vector<int> signalIdList,
-                            int* line,
-                            uint64_t t0,
-                            uint64_t t1,
-                            int x1){
-
-    const bool isRoot = (parent == signalsView->rootIndex());
-
-    if (!isRoot){
-        (*line)++;
-        paintSignalToGview(painter, signalIdx.data().toString(), data,
-                           signalIdList, *line, t0, t1, x1);
-    }
-
-    // collapsed nodes hide their children
-    if (!isRoot && !signalsView->isExpanded(parent))
-        return;
-
-    for (int row = 0; row < signalsModel->rowCount(parent); row++) {
-        const QModelIndex child = signalsModel->index(row, 0, parent);
-        const QModelIndex childPath = signalsModel->index(row, 1, parent);
-        paintSubTree(painter, child, childPath, data, signalIdList, line, t0, t1, x1);
-    }
-}
-
-// Repaints the timeline: all visible signals of the tree, then the optional
-// vertical marker line and the optional selection rectangle.
-// drawMutex is held for the whole function; the data mutex only while the
-// signals are painted.
-void TLGView::paintEvent(QPaintEvent* event){
-
-  const QRect r = event->rect();
-  // [x0, x1] corresponds to the TLGView X coordinates
-  int x0 = r.x(); // first abscissa of the event rectangle
-  int x1 = r.x() + r.width(); // second abscissa of the event rectangle
-  uint64_t t0, t1;
-
-  // calculate corresponding Time Range values
-  getTimeWindow(x0, x1, t0, t1);
-  QPainter painter(this);
-
-  drawMutex.lock();
-  lock_data_mtx();
-  auto data = get_timeline_data();
-  auto signalIdList = get_timeline_id();
-
-  //qDebug() << "signalsView rows nb :" << signalsView->height()/30;
-
-  // switch to a constant peHeight and see what happens
-  peHeight = 30;
-  // drawBackground(&painter, r, nlines);
-  painter.setPen(pen);
-
-  // line starts at -1 because paintSubTree increments it before painting each row
-  int line = -1;
-  paintSubTree(painter, signalsView->rootIndex(),signalsView->rootIndex(), data, signalIdList, &line,
-              t0,t1,x1);
-  // add extra line to match the signalsTreeView ?? what's this? might be wrong
-  line++;
-  // the widget's minimum height follows the number of painted rows
-  maxY = line * peHeight;
-  setMinimumSize(0,line * peHeight);
-  unlock_data_mtx();
-
-  /*********************** draw the vertical red line **********************/
-  if (drawLine){
-    painter.setPen(redPen);
-    painter.drawLine(time2position(verticalLineTime), 0,
-                     time2position(verticalLineTime), size().height());
-  }
-
-  /*************************** Draw Ruler if needed *****************************/
-  //static bool wasPressed = false;
-  QPen pen1(Qt::red);
-  QBrush brush1(Qt::red,Qt::Dense6Pattern);
-  painter.setPen(pen1);
-  painter.setBrush(brush1);
-
-  if(drawRect)
-  {
-    // update selected area rectangle and draw it
-    // (while the mouse is moving, m_rect is left as the mouse handlers set it)
-    if (!mouseMoving) {
-      m_rect.setBottomLeft(QPoint(time2position(t0Select),0));
-      m_rect.setWidth(time2position(t1Select) - time2position(t0Select));
-    }
-    m_rect.setHeight(this->height());
-    painter.drawRect(m_rect);
-  }
-
-  drawMutex.unlock();
-
-}
-
-
-// On resize, recompute the scroll range and the scrollbar page step.
-// The event itself is not inspected.
-void TLGView::resizeEvent(QResizeEvent* /* event */){
-  setMaxScrollTime();
-  setPageStep();
-}
-
-/* Creates a fixed-height (peHeight) legend label showing str, inserts it in
-   the legend layout at position index and returns it.
-   If index is negative, item is added to the end.
-   addMenuAction is currently not used. */
-ProLabel* TLGView::insertLegendItem(int index, const char* str, bool addMenuAction){
-  (void) addMenuAction;
-
-  ProLabel* entry = new ProLabel(str);
-  entry->setSizePolicy(QSizePolicy::Minimum, QSizePolicy::Fixed);
-  entry->setFixedHeight(peHeight);
-
-  // Inserting the label in the legend Layout
-  legendLayout->insertWidget(index, entry, 0, Qt::AlignTop);
-  legendLayout->update();
-  this->update();
-
-  return entry;
-}
-
-// Returns the index of item within layout, or -1 if the layout does not
-// contain it (reimplementation of indexOf for Qt < 5.12 compatibility).
-static int searchChildInLayout(const QLayoutItem* item, const QLayout* layout){
-  int pos = 0;
-  while (pos < layout->count()) {
-    if (layout->itemAt(pos) == item)
-      return pos;
-    ++pos;
-  }
-  return -1;
-}
-
-// Removes a legend entry (item) and its companion stretch from the legend
-// layout and, when the entry maps to a user-added trace, removes that trace
-// from the timeline.
-//
-// If item is not found in the layout, the function returns after the
-// removeItem calls without deleting anything. Otherwise both layout items
-// are deleted; previously they were leaked when the computed trace index
-// was negative, although they had already been removed from the layout.
-void TLGView::deleteLegendItem(QLayoutItem* item, QLayoutItem* stretch){
-  int index = searchChildInLayout(item, legendLayout); //legendLayout->indexOf(item)
-  legendLayout->removeItem(item);
-  legendLayout->removeItem(stretch);
-  if (index == -1) return;
-  // we have to compute the index in the user-added trace list from the index in
-  // the layout. There is a factor 2 because there are stretches between each
-  // legend label
-  int i = (index - 1 - 2 * Data_manager::TLHeight) / 2;
-  if (i >= 0) {
-    lock_data_mtx();
-    remove_trace_from_timeline(i);
-    unlock_data_mtx();
-    update();
-  }
-
-  // removeItem() hands ownership back to the caller: free both items on
-  // every path that reaches this point.
-  delete item;
-  delete stretch;
-}
-
-// Empties layout recursively: every item is taken out of the layout, its
-// widget (if any) is deleted, nested layouts are cleared, and the layout
-// item itself is deleted. A null layout is ignored.
-void TLGView::clearWidgets(QLayout * layout) {
-   if (! layout)
-      return;
-   while (QLayoutItem* item = layout->takeAt(0)) {
-      delete item->widget();
-      clearWidgets(item->layout());
-      // takeAt() transfers ownership of the item to the caller; it was
-      // previously leaked.
-      delete item;
-   }
-}
-
-// Shows text in the view's message box and runs it via exec().
-void TLGView::execMessage(const QString &text){
-  msgBox.setText(text);
-  msgBox.exec();
-}
-
-// Returns the power of ten 10^floor(log10(x) + 0.6).
-// Returns 0 for x <= 0: log10 is not defined there, and converting the
-// resulting NaN to int is undefined. x == 0 already evaluated to 0 before.
-int TLGView::extractPower10Round(int x){
-  if (x <= 0) return 0;
-  // lround protects against a pow() result sitting just below the exact
-  // power of ten being truncated.
-  return (int) lround(pow(10, floor(log10(x) + 0.6)));
-}
-
-// Adds a legend label for the trace `path`.
-// The insertion index starts after the Data_manager::TLHeight built-in rows
-// (two layout slots per row, plus one) and advances by two slots per entry
-// of id_to_display() beyond the first (slow mode) or beyond the first
-// eleven (otherwise).
-// A loop that scanned `path` for its last '/' was removed: its result was
-// never used.
-void TLGView::addTraceToLegend(std::string path,bool addMenuAction){
-
-  int insertIdx = Data_manager::TLHeight * 2 + 1;
-
-  const int displayed = (int) id_to_display().size();
-  if (gvsocSlowMode)
-    insertIdx += std::max(0, displayed - 1 ) * 2;
-  else
-    insertIdx += std::max(0, displayed - 1 -10) * 2;
-
-  insertLegendItem(insertIdx, path.c_str(), addMenuAction);
-}
-
-// Shifts timeOffset by coeff times the currently visible time span (never
-// below 0) and realigns the scrollbar with the new offset.
-void TLGView::shiftAfterZoom(double coeff){
-  uint64_t  t0, t1;
-  getTimeWindow(t0, t1);
-  int64_t delta = (int64_t) ((double)(t1 - t0) * coeff);
-  timeOffset = (uint64_t) std::max((int64_t) 0, ((int64_t) timeOffset + delta));
-  // Leave the scrollbar untouched when there is no scroll range yet, to
-  // avoid dividing by zero.
-  if (maxScrollTime != 0)
-    sbar->setValue(timeOffset * TLGView::scrollBarMaxRange / maxScrollTime);
-}
-
-// Doubles the zoom factor, unless it is already >= 1/10, then shifts the
-// view by half a window, refreshes the scroll settings and emits zoomOccured().
-void TLGView::zoomIn(){
-  const bool atMaxZoom = (zoomFactor >= 1. / 10);
-  if (atMaxZoom)
-    return;
-
-  zoomFactor *= 2;
-  setMaxScrollTime();
-  shiftAfterZoom(+1 / 2.);
-  setPageStep();
-  update();
-  emit zoomOccured();
-}
-
-// Halves the zoom factor, shifts the view back by a quarter window,
-// refreshes the scroll settings and emits zoomOccured().
-void TLGView::zoomOut(){
-  // refuse to zoom out once the window spans more than 1.5 * maxTime, so the
-  // user cannot crash the app by zooming out indefinitely
-  if (timeWidth() > 1.5 * maxTime)
-    return;
-
-  zoomFactor /= 2;
-  setMaxScrollTime();
-  shiftAfterZoom(-1 / 4.);
-  setPageStep();
-  update();
-  emit zoomOccured();
-}
-
-// Sets the zoom so that maxTime spans the widget width minus 30 pixels.
-// Does nothing when maxTime is 0 or when the widget is not wider than 30
-// pixels: both cases would produce an infinite or non-positive zoomFactor,
-// which the time/position conversions cannot handle.
-void TLGView::viewAll() {
-  const int usableWidth = size().width() - 30;
-  if (usableWidth <= 0 || maxTime == 0) return;
-
-  zoomFactor = (double) usableWidth / (double) maxTime;
-  setMaxScrollTime();
-  shiftAfterZoom(-1 / 4.);
-  setPageStep();
-  update();
-
-}
-
-
-
-// Moves the vertical marker to the time under abscissa q.x(), refreshes the
-// timestamp text and schedules a repaint.
-void TLGView::updateVerticalLine(QPointF q){
-  verticalLineTime = position2time(q.x());
-  updateVerticalLineText();
-  update();
-}
-
-// Pushes the marker time, formatted for the current mode, to the timeline.
-void TLGView::updateVerticalLineText(){
-  const QString label = time2string(verticalLineTime);
-  tl->setTimestamp(label);
-}
-
-// Converts a width in pixels into a duration, shows it in the timeline's
-// time-interval field and schedules a repaint.
-void TLGView::updateTimeInterval(uint64_t position){
-  const uint64_t duration = width2time(position);
-  tl->setTimeInterval(time2string(duration));
-  update();
-}
-
-// Extracts the core number from a signal short name containing
-// "PE<digit>state". Returns -1 when the name does not match.
-// The digit is taken from the match itself; it used to be the first digit
-// found anywhere in the string, which is wrong when another digit precedes
-// the pattern.
-int TLGView::getCoreNb(QString sigShortName){
-  const QRegularExpression re("PE(\\d)state");
-  const QRegularExpressionMatch match = re.match(sigShortName);
-  if (!match.hasMatch())
-    return -1;
-  return match.captured(1).at(0).digitValue();
-}
-
-// Extracts the core number from a signal long name (path) containing
-// "pe<digit>/state". Returns -1 when the name does not match.
-// The digit is taken from the match itself; it used to be the first digit
-// found anywhere in the path, which is wrong when an earlier path component
-// contains a digit.
-int TLGView::getCoreNb2(QString sigLongName){
-  const QRegularExpression re("pe(\\d)/state");
-  const QRegularExpressionMatch match = re.match(sigLongName);
-  if (!match.hasMatch())
-    return -1;
-  return match.captured(1).at(0).digitValue();
-}
-
-
-// Finds the visible element on row rowNb of the signals tree view.
-// The tree is walked depth-first; every visited node other than the view's
-// root increments *rowIdx, and children are visited only for the root or for
-// expanded nodes. Returns the data of signalIdx for the node reaching rowNb,
-// or an empty string if no node does.
-QString TLGView::findEltOnRow(  QModelIndex parent,
-                                QModelIndex signalIdx,
-                                int rowNb,
-                                int* rowIdx){
-  const bool isRoot = (parent == signalsView->rootIndex());
-
-  // treat parent node
-  if (!isRoot){
-    (*rowIdx)++;
-    if (*rowIdx==rowNb)
-      return signalIdx.data().toString();
-  }
-
-  // children of a collapsed node are not visible
-  if (!isRoot && !signalsView->isExpanded(parent))
-    return QString("");
-
-  for (int row = 0; row < signalsModel->rowCount(parent); row++) {
-    const QModelIndex child = signalsModel->index(row, 0, parent);
-    const QModelIndex childPath = signalsModel->index(row, 1, parent);
-    const QString found = findEltOnRow(child, childPath, rowNb, rowIdx);
-    if (QString::compare(found, QString("")) != 0)
-      return found;
-  }
-  return QString("");
-}
-
-
-
-// Records where a mouse press happened and which button was pressed; the
-// actual handling is done in the move/release handlers.
-void TLGView::mousePressEvent(QMouseEvent* event)
-{
-  qDebug() << "mousePressEvent";
-
-  // both ends of the drag line start at the press position
-  const QPoint pressPos = event->pos();
-  m_line.setP1(pressPos);
-  m_line.setP2(event->pos());
-
-  // If no selection rectangle is already displayed, start a zero-width one
-  if (!selectionRect) {
-    m_rect.setBottomLeft(event->pos());
-    m_rect.setWidth(0);
-    m_rect.setHeight(this->height());
-  }
-
-  // used to pinpoint functions
-  functionPoint.setX(event->x());
-  functionPoint.setY(event->y());
-
-  const Qt::MouseButton pressed = event->button();
-  if (pressed == Qt::LeftButton)
-    leftClick = true;
-  else if (pressed == Qt::RightButton)
-    rightClick=true;
-}
-
-
-// Highlights, in the functions table, the function under the point q of the
-// timeline. Does nothing when the clicked row is not a "pe<digit>/state"
-// signal or when no function is found at that time.
-void TLGView::highlightFunction(QPointF q) {
-    if (peHeight == 0) return;
-
-    // q.y() is the ordinate of the clicked point; q.y()/peHeight gives the
-    // index of the signal row in the signals tree view
-    int rowNb =  1 + q.y()/peHeight;
-    int rowIdx=0;
-    const QModelIndex root = signalsView->rootIndex();
-    QString longSigName = findEltOnRow(root, signalsView->rootIndex(), rowNb, &rowIdx);
-
-    if (QString::compare(longSigName, QString("")) == 0)
-      return;
-
-    const int coreNb = getCoreNb2(longSigName);
-    if (coreNb == -1)
-      return;
-
-    const char* name = function_at(coreNb +1, position2time(q.x()));
-    if (name == nullptr)
-      return;
-
-    // show functions Details Dock window if it was hidden
-    functionsDock->show();
-    fd->selectFunction(std::string(name));
-    fd->selectRow(name);
-    stallchart->construct(name);
-}
-
-// Finishes the interaction started in mousePressEvent. Three cases:
-//  - right button dragged: (re)compute the selected interval [t0Select, t1Select],
-//    update the statistics view and the selection rectangle;
-//  - left click: move the vertical marker and highlight the clicked function;
-//  - right click without drag: clear the marker and the selection.
-// All click/move flags are reset at the end.
-void TLGView::mouseReleaseEvent(QMouseEvent *event) {
-
-  if (mouseMoving & rightClick) {
-
-    // set variables values for the selected area
-    m_line.setP2(event->pos());
-
-    if (!selectionRect) {
-      // We need to draw the selection rectangle and calculate the time interval
-      t0Select = this->position2time(m_line.p1().x());
-      t1Select = this->position2time(m_line.p2().x());
-    } else if (movingLeftEdge || movingRightEdge) {
-    // A selection rectangle has already been displayed
-    // We need to just adjust the rectangle edges
-      if (movingLeftEdge) {
-        t0Select = this->position2time(m_line.p2().x());
-        t1Select = t1SelectDisplayed;
-      } else  {
-        t0Select = t0SelectDisplayed;
-        t1Select = this->position2time(m_line.p2().x());
-      }
-    }
-
-    if (t1Select < t0Select) {
-      // swap the values so that t0Select <= t1Select
-      uint64_t t0Save = t0Select;
-      t0Select = t1Select;
-      t1Select = t0Save;
-    }
-    tl->updateSignalsStatView(signalsModel, signalsView,t0Select,t1Select);
-    // update m_rect
-    m_rect.setBottomLeft(QPoint(time2position(t0Select),0));
-    m_rect.setWidth(time2position(t1Select) - time2position(t0Select));
-    // update the time interval display field
-    auto diff = m_rect.right() - m_rect.left();
-    uint64_t distance = abs(diff);
-    updateTimeInterval(distance);
-
-    drawRect = true;
-    selectionRect = true;
-    // save the abscissas of the displayed selection area
-    t0SelectDisplayed = t0Select;
-    t1SelectDisplayed = t1Select;
-    selectionRect=true;
-  }
-  else if (leftClick){
-    updateVerticalLine(functionPoint);// updates the timestamp value
-    highlightFunction(functionPoint); // highlights the function where the click occurred
-    drawLine = true; // used when repainting the gview to draw a vertical line
-  }
-  else if (rightClick) {
-    // Nothing to draw: neither the timestamp line nor the selected area
-    drawLine = false;
-    drawRect = false;
-    selectionRect = false;
-    t0Select=0;
-    t1Select=0;
-  }
-
-   // reset all flags
-   rightClick = false;
-   leftClick  = false;
-   mouseMoving = false;
-   movingRightEdge=false;
-   movingLeftEdge=false;
-
-   update();
-}
-
-// Tracks a drag with the right button pressed: either grows a new selection
-// rectangle or moves one edge of the existing one, and shows the width of
-// the rectangle as a time interval. Always schedules a repaint.
-void TLGView::mouseMoveEvent(QMouseEvent *event) {
-
-  if ((event->type() == QEvent::MouseMove) & rightClick) {   // & click inside TLGView
-    if (!selectionRect) {
-      // No Selection rectangle has been displayed yet
-      m_line.setP2(event->pos());  // ??
-      auto diff = m_line.p2().x() - m_line.p1().x();
-      uint64_t distance = abs(diff);
-      if (diff>=0) {
-        m_rect.setHeight(this->height());
-        m_rect.setWidth(distance);
-      } else {
-        // dragging to the left: the rectangle is re-anchored at the mouse position
-        m_rect.setBottomLeft(event->pos());
-        m_rect.setHeight(this->height());
-        m_rect.setWidth(distance);
-      }
-
-      // Display difference of Time between P1 and P2
-      updateTimeInterval(distance);
-
-      drawRect = true;
-      mouseMoving = true;
-    }
-    else {
-      // A selection rectangle has already been displayed
-      // check if right click close to one of the edges of the rectangle
-      // (within 5 pixels of the press position stored in m_line.p1())
-      m_line.setP2(event->pos());
-      if (abs(m_line.p1().x() - time2position(t1SelectDisplayed)) <= 5) {
-        m_rect.setRight(m_line.p2().x());
-        movingLeftEdge = false;
-        movingRightEdge=true;
-      } else if (abs(m_line.p1().x() - time2position(t0SelectDisplayed)) <= 5) {
-        m_rect.setLeft(m_line.p2().x());
-        movingLeftEdge = true;
-        movingRightEdge=false;
-      }
-      auto diff = m_rect.bottomRight().x() - m_rect.bottomLeft().x();
-      uint64_t distance = abs(diff);
-      // Display the time interval matching the rectangle width
-      updateTimeInterval(distance);
-      drawRect = true;
-      mouseMoving = true;
-    }
-  }
-  update();
-}
-
-
-// Centers the view (X wise) on mouseTimestamp by moving timeOffset by the
-// distance between that timestamp and the middle of the visible window,
-// then realigns the scrollbar. A shift to the left that would move
-// timeOffset below 0 is not applied.
-void TLGView::centerOnMousePosition(uint64_t mouseTimestamp) {
-
-  uint64_t t0, t1;
-  getTimeWindow(t0, t1);
-  uint64_t midTimestamp= timeOffset + (t1-t0)/2 ;
-
-  if (mouseTimestamp <= midTimestamp) {
-    if (timeOffset >= (midTimestamp - mouseTimestamp))
-      timeOffset = timeOffset - (midTimestamp - mouseTimestamp);
-  }
-  else {
-    timeOffset = timeOffset + (mouseTimestamp - midTimestamp);
-  }
-
-  // readjust sbar->value to be consistent with timeOffset; skipped when
-  // there is no scroll range yet, to avoid dividing by zero
-  if (maxScrollTime != 0)
-    sbar->setValue(timeOffset * TLGView::scrollBarMaxRange / maxScrollTime);
-  update();
-
-}
-
-// Shifts timeOffset so that mouseTimestamp ends up under the abscissa of
-// mousePosition. A shift to the left that would move timeOffset below 0 is
-// not applied.
-// The previous version also called getTimeWindow() into two locals that
-// were never read; that dead call was removed.
-void TLGView::placeMouseTimestampOnPoint(uint64_t mouseTimestamp, QPointF mousePosition) {
-
-  uint64_t targetTimestamp = position2time(mousePosition.rx());
-
-  if (mouseTimestamp <= targetTimestamp) {
-    if (timeOffset >= (targetTimestamp - mouseTimestamp))
-      timeOffset = timeOffset - (targetTimestamp - mouseTimestamp);
-  }
-  else {
-    timeOffset = timeOffset + (mouseTimestamp - targetTimestamp);
-  }
-
-}
-
-// Records the timestamp of a wheel event and counts the event.
-void TLGView::addToList(uint64_t mouseTimestamp){
-  mouseTimestamps.push_back(mouseTimestamp);
-  ++wheelEventCount;
-}
-
-// Body of the detached thread started by wheelEvent: waits half a second so
-// that a burst of wheel events is handled as one zoom step, then clears the
-// recorded timestamps and resets the wheel event counter.
-void TLGView::zoomThread(){
-
-  // the access to this function is mutually exclusive
-  mouseMutex.lock();
-  // Wait 0.5 s until the user is done with the mouse scroll. sleep() takes
-  // whole seconds, so the former sleep(0.5) was truncated to sleep(0) and
-  // did not wait at all.
-  usleep(500 * 1000);
-
-  // clear mouseTimestamp list & wheelEventCount
-  mouseTimestamps.clear();
-
-  mouseMutex.unlock();
-
-  wheelEventCount=0;
-}
-
-
-
-// Handles mouse wheel zooming. Every event records the timestamp under the
-// mouse; only the first event of a burst (wheelEventCount == 1) zooms,
-// re-anchors the view on the mouse position and starts zoomThread, which
-// later resets the counter.
-void TLGView::wheelEvent(QWheelEvent *event)
-{
-   //qDebug<< "[-] WheelEvent";
-  // Get Mouse Position and store in the list
-  QPointF mousePosition(event->x(), event->y());
-  uint64_t mouseTimestamp = position2time(mousePosition.rx());
-  addToList(mouseTimestamp);
-
-  if (wheelEventCount == 1) {
-
-    QPoint numPixels = event->pixelDelta();
-    QPoint numDegrees = event->angleDelta() / 8;
-
-    if (!numPixels.isNull()) {
-        scrollWithPixels(numPixels);
-    } else if (!numDegrees.isNull()) {
-        QPoint numSteps = numDegrees / 15;
-        scrollWithDegrees(numSteps);
-    }
-
-    // Get first mouseTimestamp
-    mouseTimestamp=mouseTimestamps.front();
-    // keep that timestamp under the mouse pointer if possible
-    //centerOnMousePosition(mouseTimestamp);
-    placeMouseTimestampOnPoint(mouseTimestamp, mousePosition);
-
-    // Launch thread
-    // NOTE(review): mouseTimestamps and wheelEventCount are also written by
-    // zoomThread; this function does not take mouseMutex — confirm whether
-    // that is safe.
-    th1=std::thread(&TLGView::zoomThread,this);
-    th1.detach();
-  }
-
-  event->accept();
-  update();
-
-  // std::cout << "[-] WheelEvent Ended" << std::endl;
-}
-
-// Double click: zoom in one step while keeping the clicked timestamp under
-// the mouse pointer.
-void TLGView::mouseDoubleClickEvent(QMouseEvent* event) {
-  QPointF clickPos(event->x(), event->y());
-  const uint64_t clickedTime = position2time(clickPos.rx());
-
-  zoomIn();
-  placeMouseTimestampOnPoint(clickedTime, clickPos);
-
-  update();
-}
-
-// Pixel-based wheel deltas are not supported yet: only prints a message
-// followed by the vertical delta.
-void TLGView::scrollWithPixels(const QPoint &pixel)
-{
-    const int dy = pixel.y();
-    std::cout << "Type of mouse wheel not yet implemented" << dy << std::endl;
-}
-
-// Zooms according to the vertical wheel step count: positive steps zoom in,
-// negative steps zoom out, zero does nothing.
-// Previously only a step of exactly 1 zoomed in; any other value —
-// including 0 and steps of 2 or more from a fast forward scroll — zoomed out.
-void TLGView::scrollWithDegrees(const QPoint &step)
-{
-    int s = step.y();
-    if (s > 0)
-      zoomIn();
-    else if (s < 0)
-      zoomOut();
-
-}
-
-// Moves the vertical marker to timestamp t.
-void TLGView::gotoTimestamp(int64_t t){
-  const QPointF target(time2position(t), 0);
-  updateVerticalLine(target);
-}
-
-// Scrollbar moved: convert the slider value x (out of scrollBarMaxRange)
-// into a time offset (out of maxScrollTime) and repaint.
-void TLGView::sliderMoved(int x){
-  timeOffset = x * maxScrollTime / scrollBarMaxRange;
-  update();
-}
-
-// Switches the display to TIME_MODE and refreshes the marker text and the
-// legend mode.
-void TLGView::radioB1Clicked(){
-  currentMode = TIME_MODE;
-  // the timestamp text depends on the mode, so rebuild it
-  updateVerticalLineText();
-  fd->switchLegendMode(currentMode);
-  update();
-}
-// Switches the display to FC_CYCLE_MODE and refreshes the marker text and
-// the legend mode.
-void TLGView::radioB2Clicked(){
-  currentMode = FC_CYCLE_MODE;
-  updateVerticalLineText();
-  fd->switchLegendMode(currentMode);
-  update();
-}
-
-// Switches the display to CLUSTER_CYCLE_MODE and refreshes the marker text
-// and the legend mode.
-void TLGView::radioB3Clicked(){
-  currentMode = CLUSTER_CYCLE_MODE;
-  updateVerticalLineText();
-  fd->switchLegendMode(currentMode);
-  update();
-}
-
-// Asks the backend to add the trace identified by signalPath to the timeline.
-void TLGView::addSignalToGview(const QString signalPath) {
-  const std::string path = signalPath.toStdString();
-  add_trace_to_timeline(path);
-}
-
-// Depth-first walk of the signals tree model: every visited node other than
-// the view's root is added to the timeline (using the data of signalIdx as
-// its path); children are visited only for the root or for expanded nodes.
-void TLGView::addSignalsToGview(QModelIndex parent,
-                                QModelIndex signalIdx){
-    const bool isRoot = (parent == signalsView->rootIndex());
-
-    if (!isRoot)
-        addSignalToGview(signalIdx.data().toString());
-
-    // collapsed nodes hide their children
-    if (!isRoot && !signalsView->isExpanded(parent))
-        return;
-
-    for (int row = 0; row < signalsModel->rowCount(parent); row++) {
-        const QModelIndex child = signalsModel->index(row, 0, parent);
-        const QModelIndex childPath = signalsModel->index(row, 1, parent);
-        addSignalsToGview(child, childPath);
-    }
-}
-
-
-// Collapsing a node is handled exactly like expanding one, followed by a
-// debug trace.
-void TLGView::handleSignalNodeCollapsed() {
-  this->handleSignalNodeExpanded();
-  qDebug() << "[-] handleSignalNodeCollapsed" ;
-}
-
-// Handles the sync with the timeline view whenever a node in the signals
-// tree view is expanded or collapsed: both operations need to browse the
-// signal tree and repaint the timeline window accordingly.
-void TLGView::handleSignalNodeExpanded(){
-
-    qDebug() << "[-] handleSignalNodeExpanded: Adding/removing  Signals to Gview" ;
-    // First add signals to the timeline
-    addSignalsToGview(signalsView->rootIndex(),signalsView->rootIndex());
-    // Update stats Models & view if already created
-    // We must check here whether gvsoc has already been run. If not, this
-    // call to updateSignalsStatView ends in a core dump (reason not documented).
-    // Per the original note, gvsocRunFlag is set before gvsoc is really
-    // started, so this check may not be sufficient.
-    // qDebug() << "[-] gvsocRunFlag: " << tl->gvsocRunFlag;
-    if (tl->gvsocRunFlag) {
-      tl->updateSignalsStatView(signalsModel, signalsView);
-    }
-
-    // Then, sync the two views so signals are displayed in front of their name
-    this->repaint();
-    tl->setGviewVScrollValue();
-    tl->update();
-    this->update();
-    this->show();
-    tl->show();
-}
-
-// Builds the signal tree widget: installs a TreeModel built from the trace
-// list text, adds an "Add to timeline" context-menu action and resizes the
-// column of the affected index whenever a node is expanded or collapsed.
-//   gview  : timeline view that receives the traces added from the menu
-//   parent : parent widget
-SignalTree::SignalTree( TLGView* gview, QWidget* parent) {
-  this->setParent(parent);
-  this->gview = gview;
-  // NOTE(review): txt is never used in this constructor
-  std::string txt;
-  setModel(new TreeModel(get_trace_txt_list().c_str()));
-  // actions added with addAction() are shown as the context menu
-  setContextMenuPolicy(Qt::ActionsContextMenu);
-  addTraceAction = new QAction(QString("Add to timeline"), this);
-  connect(addTraceAction, &QAction::triggered, this, &SignalTree::contextMenuHandler);
-  addAction(addTraceAction);
-  horizontalScrollBar()->setEnabled(true);
-  setHorizontalScrollBarPolicy(Qt::ScrollBarAsNeeded);
-  resizeColumnToContents(0);
-  setAutoScroll(false);
-  connect(this, SIGNAL(expanded(QModelIndex)), this, SLOT(updateScrollArea(QModelIndex)));
-  connect(this, SIGNAL(collapsed(QModelIndex)), this, SLOT(updateScrollArea(QModelIndex)));
-}
-
-// Resizes the column of index i to fit its contents.
-void SignalTree::updateScrollArea(const QModelIndex& i)
-{
-  const int column = i.column();
-  resizeColumnToContents(column);
-}
-
-
-// "Add to timeline" handler: builds the '/'-separated path of the first
-// selected row by walking up to the top of the tree, asks the backend to add
-// that trace and, on success, adds it to the legend. Shows a warning box if
-// the backend refuses the trace.
-void SignalTree::contextMenuHandler(){
-  const QModelIndexList selection = selectionModel()->selectedRows();
-  if (selection.size() <= 0)
-    return;
-
-  // prepend each ancestor name while walking from the selected node upwards
-  std::string path;
-  QModelIndex node = selection.at(0);
-  do {
-    path = "/" + node.data().toString().toStdString() + path;
-    node = node.parent();
-  } while (node != QModelIndex());
-
-  if (!add_trace_to_timeline(path)) {
-    QMessageBox::warning(this, "Error",
-      ("Impossible to add trace " + path + "\nFormat not supported yet").c_str());
-    return;
-  }
-
-  gview->addTraceToLegend(path);
-  gview->update();
-}
-
-// Parses the leading integer of the timestamp field, multiplies it by 1000
-// (the field holds ns, the view works in ps) and moves the marker there.
-// Does nothing if the field does not start with a number.
-void Timeline::gotoGivenTimestamp(){
-  const std::string text = timestamp->text().toStdString();
-  char* parseEnd;
-  const int64_t ns = strtoll(text.c_str(), &parseEnd, 10);
-
-  /* invalid string -> does nothing */
-  if (parseEnd == text.c_str())
-    return;
-
-  gview->gotoTimestamp(ns * 1000);
-}
-
-// Forwards the request to the timeline view, then schedules a repaint of it.
-void Timeline::addTraceToLegend(std::string path, bool addMenuAction){
-  gview->addTraceToLegend(path, addMenuAction);
-  gview->update();
-}
-
-// Populates the toolbar: zoom in / zoom out / zoom fit actions wired to the
-// timeline view, the timestamp and time-interval fields, and a temporary
-// "being profiled" label and progress bar that are recorded in
-// toBeRemoveAfterExec.
-void Timeline::completeToolBar(){
-
-  // theme icons, with bundled images as fallback
-  const QIcon zoomInIcon = QIcon::fromTheme("zoom-in", QIcon(":/images/Apps-Zoom-In-icon.png"));
-  const QIcon zoomOutIcon = QIcon::fromTheme("zoom-out", QIcon(":/images/Apps-Zoom-Out-icon.png"));
-  const QIcon viewAllIcon = QIcon::fromTheme("zoom-fit", QIcon(":/images/Apps-Zoom-Fit-icon.png"));
-
-  QAction *zoomInAct = new QAction(zoomInIcon, tr("Zoom In"), this);
-  QAction *zoomOutAct = new QAction(zoomOutIcon, tr("Zoom Out"), this);
-  QAction *viewAllAct = new QAction(viewAllIcon, tr("Zoom Fit"), this);
-
-  //std::cout << "connects mainwindow line 271" << std::endl;
-  connect(zoomInAct, &QAction::triggered, gview, &TLGView::zoomIn);
-  connect(zoomOutAct, &QAction::triggered, gview, &TLGView::zoomOut);
-  connect(viewAllAct, &QAction::triggered, gview, &TLGView::viewAll);
-
-  //connect(radioB1, &QRadioButton::toggled, gview, &TLGView::radioB1Clicked);
-  //connect(radioB2, &QRadioButton::toggled, gview, &TLGView::radioB2Clicked);
-  timestamp = new QLineEdit("0 µs");
-  timeInterval = new QLineEdit("0");
-  timestamp->setSizePolicy(QSizePolicy::Fixed, QSizePolicy::Fixed);
-  timeInterval->setSizePolicy(QSizePolicy::Fixed, QSizePolicy::Fixed);
-  // pressing Return in the timestamp field jumps to that timestamp
-  connect(timestamp, &QLineEdit::returnPressed, this, &Timeline::gotoGivenTimestamp);
-  progressBar = new QProgressBar();
-  // minimum == maximum == 0: Qt shows this as a busy indicator
-  progressBar->setMinimum(0);
-  progressBar->setMaximum(0);
-  progressBar->setFixedWidth(100);
-
-  toolBar->addAction(zoomInAct);
-  toolBar->addAction(zoomOutAct);
-  toolBar->addAction(viewAllAct);
-
-  //toolBar->addSeparator();
-  timeLabel = new QLabel("Timestamp: ");
-  toolBar->addWidget(timeLabel);
-  toolBar->addWidget(timestamp);
-  //toolBar->addSeparator();
-  intervalLabel = new QLabel("Time Interval: ");
-  toolBar->addWidget(intervalLabel);
-  toolBar->addWidget(timeInterval);
-  //toolBar->addSeparator();
-  //toolBar->addWidget(radioB1);
-  //toolBar->addWidget(radioB2);
-  //toolBar->addWidget(radioB3);
-
-  // keep the actions of the temporary widgets so they can be removed later
-  toBeRemoveAfterExec.append(
-    toolBar->addWidget(new QLabel("   your code is being profiled...   ")));
-  toBeRemoveAfterExec.append(toolBar->addWidget(progressBar));
-
-  update();
-}
-
-
-void Timeline::foo(int x){
-  std::cout << "foo called " << x << std::endl;
-}
-
-void Timeline::createSignalsTree(QString signalsTreeFileName){
-    std::cout << "[-] Opening Signals Tree File " << signalsTreeFileName.toStdString() << std::endl;
-    QFile file(signalsTreeFileName);
-    if (!file.open(QIODevice::ReadOnly | QIODevice::Text))
-      std::cout << "[-] Error: File " << signalsTreeFileName.toStdString()
-              << " didn't open " << std::endl;
-    // need to check if model exists
-    model = new TreeModel(file.readAll());
-    file.close();
-    signalsTreeView = new QTreeView();
-    signalsTreeView->setModel(model);
-    signalsTreeView->setWindowTitle(QObject::tr("Signals"));
-    signalsTreeView->setStyleSheet("alternate-background-color:yellow;");
-    // hide the column containing the full path of the signal
-    signalsTreeView->hideColumn(1);
-    signalsTreeView->setColumnWidth(200, 800);
-    signalsTreeView->expandToDepth(0);
-}
-
-void Timeline::setGviewVScrollValue() {
-
-  qDebug() << "[-] setGViewVScrollValue () " ;
-  int value = signalsTreeView->verticalScrollBar()->value();
-  signalsTreeView->verticalScrollBar()->setValue(signalsTreeView->verticalScrollBar()->minimum());
-  signalsTreeView->verticalScrollBar()->setValue(value);
-  //setGviewVScrollValue(signalsTreeView->verticalScrollBar()->value());
-  scrollArea->syncWithTreeView();
-  gview->update();
-  gview->show();
-  scrollArea->update();
-  scrollArea->show();
-}
-
-void Timeline::setGviewVScrollValue(int value1) {
-  // syncs the Scrollarea vertical scroll bar value according to the
-  // value of the signalsTreeView scrollbar
-  qDebug() << "[-] setGViewVScrollValue (int value1) " ;
-  scrollArea->syncWithTreeView();
-  return;
-  // First block the scrollArea vertical bar signals
-  scrollArea->verticalScrollBar()->blockSignals(true);
-
-  // First calculate the percentage of move on the signalsTreeView
-  double  min1 = (double) signalsTreeView->verticalScrollBar()->minimum();
-  double  max1 = (double) signalsTreeView->verticalScrollBar()->maximum();
-
-  if (max1-min1<=0)
-    return;
-  // Then get min and max of the   Vartical bar
-  double min2 = (double) scrollArea->verticalScrollBar()->minimum();
-  double max2 = (double) scrollArea->verticalScrollBar()->maximum();
-  double factor = (max2 - min2) / (max1 - min1) ;
-  double value3= round(factor * value1);
-  int value2 = value3;
-  // set the scrollArea scroll bar value
-  scrollArea->verticalScrollBar()->setValue(value2);
-
-  // update widget position
-  QPoint topLeft = scrollArea->viewport()->rect().topLeft();
-  scrollArea->widget()->move(0, topLeft.y() - value2);
-
-  // Finally unblock scrollarea signals
-  scrollArea->verticalScrollBar()->blockSignals(false);
-  gview->update();
-  gview->show();
-  scrollArea->update();
-  scrollArea->show();
-
-}
-
-
-void Timeline::updateSignalsStatView( TreeModel* sigModel,
-                                      QTreeView* signalsTreeView,
-                                      uint64_t t0,
-                                      uint64_t t1) {
-  // Display signals statistics between x1 and x2
-  //displaySignalsStats(auto m_line.p1().x(),m_line.p2().x());
-
-  if ((gview!= NULL) && (statModel != NULL) && (statTableView != NULL)){
-    delete statModel;
-    statModel= new StatModel(this);
-    statModel->updateTreeParameters(sigModel, signalsTreeView);
-    statModel->populateData(t0, t1);
-    statTableView->setModel(statModel);
-    statTableView->horizontalHeader()->setSectionResizeMode(0, QHeaderView::ResizeToContents);
-    //statTableView->setColumnWidth(0,statTableView->width());
-    statTableView->resizeColumnToContents(2);
-    statTableView->horizontalHeader()->setStretchLastSection(true);
-    //statTableView->setVisible(false);
-    mw->update();
-    mw->repaint();
-    mw->show();
-    emit timeUpValueChanged();
-
-  }
-}
-
-void Timeline::updateSignalsStatView( TreeModel* sigModel,
-                                      QTreeView* signalsTreeView) {
-  //std::cout << "[-] Timeline::updateSignalsStatView()  " << std::endl;
-  // Display signals statistics between previously set range [t0Select, t1Select]
-  if ((gview!= NULL) && (statModel != NULL) && (statTableView != NULL)) {
-    delete statModel;
-    statModel= new StatModel(this);
-    statModel->updateTreeParameters(sigModel, signalsTreeView);
-    statModel->populateData(gview->t0Select,gview->t1Select);
-    statTableView->setModel(statModel);
-    statTableView->horizontalHeader()->setSectionResizeMode(0, QHeaderView::ResizeToContents);
-    //statTableView->setColumnWidth(0,statTableView->width());
-    statTableView->resizeColumnToContents(2);
-    statTableView->horizontalHeader()->setStretchLastSection(true);
-    //statTableView->setVisible(false);
-    mw->update();
-    mw->repaint();
-    mw->show();
-    emit timeUpValueChanged();
-  }
-
-}
-
-void Timeline::changeSignalsStatVisibility(bool visible){
-   // sets the signals Table View visible or not
-  QLayoutItem *item = 0;
-  QWidget *widget = 0;
-   for(int row = 0; row < 2; ++row)
-    {
-      item = tlLayout->itemAtPosition(row,2);
-      widget=item?item->widget():0;
-      if(widget)
-        widget->setVisible(visible);
-    }
- }
-
-Timeline::Timeline( QMainWindow* mw,
-                    QToolBar* toolBar,
-                    FunctionDetails* fd,
-                    QDockWidget* functionsDock,
-                    QPlainTextEdit* sourceCode,
-                    QPlainTextEdit* asmCode,
-                    StallChart* stallchart,
-                    QString signalsTreeFileName):
-                  mw(mw),
-                  toolBar(toolBar),
-                  functionsDock(functionsDock),
-                  sourceCode(sourceCode),
-                  asmCode(asmCode),
-                  stallchart(stallchart)
-{
-
-  // Assertions
-  assert(mw != NULL);
-  assert(toolBar != NULL);
-  assert(fd != NULL);
-  assert(functionsDock != NULL);
-  assert(sourceCode != NULL);
-  assert(asmCode != NULL);
-  assert(stallchart != NULL);
-
-  // Add a group button to the toolBar
-  //qDebug() << "[-] Create Button Group";
-  //buttonGroup = new QButtonGroup(this);
-  //radioB1 = new QRadioButton("Time Mode");
-  //radioB2 = new QRadioButton("Cluster Cycle Mode");
-  //buttonGroup->addButton(radioB1);
-  //buttonGroup->addButton(radioB2);
-  //radioB1->setChecked(true);
-
-  // construct hierarchy
-
-  //Define Timeline layout as a Grid Layout
-  tlLayout = new QGridLayout();
-  this->setLayout(tlLayout);
-
-  // Create SignalsTree View
-  qDebug() << "[-] Create Signals Tree";
-  createSignalsTree(signalsTreeFileName);
-  signalsTreeView->setSizePolicy(QSizePolicy::Minimum, QSizePolicy::Expanding);
-  signalsTreeView->setHeaderHidden(true);
-  signalsTreeView->setHorizontalScrollBarPolicy(Qt::ScrollBarAlwaysOn);
-  signalsTreeView->verticalScrollBar()->setSingleStep(30);
-  signalsTreeView->verticalScrollBar()->setPageStep(120);
-  signalsTreeView->update();
-  treeLabel = new QLabel();
-  treeLabel->setText("Time (us)");
-  treeLabel->setFixedHeight(30);
-  statLabel = new QLabel();
-  statLabel->setText("Signal Time Up");
-  statLabel->setFixedHeight(30);
-  tlLayout->addWidget(signalsTreeView,1,0);
-
-  // Create the stats Model
-  statModel= new StatModel(this);
-  // Create the TableView for signals statistics
-  // Connect Model to the TableView
-  statTableView = new QTableView();
-  //statTableView->setModel(statModel);
-
-  // Make table header unvisible
-  statTableView->horizontalHeader()->setVisible(false);
-
-  //statTableView->setSizePolicy(QSizePolicy::Minimum, QSizePolicy::Minimum);
-  statTableView->setSizePolicy(QSizePolicy::Expanding, QSizePolicy::Expanding);
-  statTableView->setVerticalScrollBarPolicy(Qt::ScrollBarAlwaysOff);
-  statTableView->setHorizontalScrollBarPolicy(Qt::ScrollBarAlwaysOn);
-  statTableView->verticalScrollBar()->setPageStep(signalsTreeView->verticalScrollBar()->pageStep());
-  statTableView->verticalScrollBar()->setMinimum(signalsTreeView->verticalScrollBar()->minimum());
-  statTableView->verticalScrollBar()->setMaximum(signalsTreeView->verticalScrollBar()->maximum());
-  statTableView->verticalScrollBar()->setSingleStep(signalsTreeView->verticalScrollBar()->singleStep());
-  // statTableView->setColumnWidth(0,100); should be done after setting the model
-  statTableView->setAlternatingRowColors(true);
-  //statTableView->horizontalHeader()->setSectionResizeMode(QtWidgets.QHeaderView.Stretch);
-  //statTableView->setSizeAdjustPolicy(QtWidgets.QAbstractScrollArea.AdjustToContents);
-
-  // Create TL horizontal scroll bar
-  qDebug() << "[-] Create Horizontal ScrollBar";
-  scrollbar = new QScrollBar(Qt::Horizontal);
-  scrollbar->setSizePolicy(QSizePolicy::Expanding, QSizePolicy::Fixed);
-
-  // Create the Timeline Graphic View
-  qDebug() << "[-] Create gview" ;
-  gview = new TLGView(fd, functionsDock, this, scrollbar, legendLayout,
-                      sourceCode, asmCode, stallchart, model, signalsTreeView,
-                      statModel, statTableView);
-  completeToolBar();
-  qDebug() << "[-] Tool Bar Completed" ;
-
-  verViewW = new QWidget();
-  verViewLayout = new QVBoxLayout();
-
-  verViewW->setLayout(verViewLayout);
-
-  //create scroll area containing
-  qDebug() << "[-] Create ScrollArea";
-  scrollArea = new AdjustingScrollArea(signalsTreeView,gview,this);
-  scrollArea->setSizePolicy(QSizePolicy::Minimum, QSizePolicy::Expanding);
-  scrollArea->setVerticalScrollBarPolicy(Qt::ScrollBarAlwaysOff);
-  scrollArea->setHorizontalScrollBarPolicy(Qt::ScrollBarAlwaysOff);
-	scrollArea->setWidgetResizable(true);
-  scrollArea->setWidget(gview);
-  scrollArea->ensureWidgetVisible(gview);
-  scrollArea->verticalScrollBar()->setPageStep(signalsTreeView->verticalScrollBar()->pageStep());
-  scrollArea->verticalScrollBar()->setMinimum(signalsTreeView->verticalScrollBar()->minimum());
-  scrollArea->verticalScrollBar()->setMaximum(signalsTreeView->verticalScrollBar()->maximum());
-  scrollArea->verticalScrollBar()->setSingleStep(signalsTreeView->verticalScrollBar()->singleStep());
-  scrollArea->setWidgetResizable(true);
-  scrollArea->adjustSize();
-
-  // Create horizontal Legend for the gview
-  qDebug() << "[-] hLegendWidget";
-  hLegendW = new HLegendWidget(gview);
-  hLegendW->setStyleSheet("border: 2px solid red");
-  connect(gview, SIGNAL(zoomOccured()), hLegendW, SLOT(update()));
-  connect(scrollbar, SIGNAL(valueChanged(int)), hLegendW, SLOT(update()));
-
-  qDebug() << "[-] Create scrollArea + scrollBar container widget";
-  verViewLayout->addWidget(scrollArea);
-  verViewLayout->addWidget(scrollbar);
-  verViewLayout->setMargin(0);
-  verViewLayout->setSpacing(0);
-  verViewLayout->setAlignment(scrollbar,Qt::AlignBottom);
-  verViewW->adjustSize();
-
-
-  qDebug() << "[-] Fill up  Grid Layout";
-  tlLayout->addWidget(treeLabel,0,0);
-  tlLayout->addWidget(verViewW,1,1);
-  tlLayout->addWidget(hLegendW,0,1);
-  tlLayout->addWidget(statTableView,1,2, Qt::AlignRight);
-  tlLayout->addWidget(statLabel,0,2, Qt::AlignRight);
-  //statLabel->setVisible(false);
-  //statTableView->setVisible(false);
-  tlLayout->setColumnStretch(2,0);
-  tlLayout->setColumnStretch(0,0);
-  tlLayout->setColumnStretch(1,800);
-  tlLayout->setColumnMinimumWidth(2,50);
-
-  // Connect different signals to corresponding slots
-  // using the connection type Qt::QueuedConnection ensures that the slot code gets
-  // executed after whatever operation emits the signal is completed and control
-  // is given back to QT's event loop
-  QObject::connect(signalsTreeView, SIGNAL(expanded(QModelIndex)),
-                     gview, SLOT(handleSignalNodeExpanded()),Qt::QueuedConnection);
-  QObject::connect(signalsTreeView, SIGNAL(collapsed(QModelIndex)),
-                     gview, SLOT(handleSignalNodeCollapsed()),Qt::QueuedConnection);
-
-  QObject::connect(signalsTreeView->verticalScrollBar(),SIGNAL(rangeChanged(int,int)),
-                  this, SLOT(setGviewVScrollValue()),Qt::QueuedConnection);
-  QObject::connect(signalsTreeView->verticalScrollBar(), SIGNAL(valueChanged(int)),
-                 this, SLOT(setGviewVScrollValue(int)),Qt::QueuedConnection);
-  /*
-  QObject::connect(signalsTreeView->verticalScrollBar(), &QScrollBar::rangeChanged, [this](int value1, int value2){
-            scrollArea->verticalScrollBar()->blockSignals(true);
-            scrollArea->verticalScrollBar()->setRange(value1,value2);
-            scrollArea->verticalScrollBar()->blockSignals(false);
-        });
-  QObject::connect(signalsTreeView->verticalScrollBar(), &QScrollBar::valueChanged, [this](int value){
-            scrollArea->verticalScrollBar()->blockSignals(true);
-            scrollArea->verticalScrollBar()->setValue(value);
-            scrollArea->verticalScrollBar()->blockSignals(false);
-        });
-  */
-  /*QObject::connect(scrollArea->verticalScrollBar(), &QScrollBar::valueChanged, [this](int value){
-            signalsTreeView->verticalScrollBar()->blockSignals(true);
-            signalsTreeView->verticalScrollBar()->setValue(value);
-            signalsTreeView->verticalScrollBar()->blockSignals(false);
-        });
-  */
-  QObject::connect(signalsTreeView->verticalScrollBar(),SIGNAL(rangeChanged(int,int)),
-                  statTableView->verticalScrollBar(), SLOT(setRange(int,int)),Qt::QueuedConnection);
-  QObject::connect(signalsTreeView->verticalScrollBar(), SIGNAL(valueChanged(int)),
-                  statTableView->verticalScrollBar(), SLOT(setValue(int)),Qt::QueuedConnection);
-
-  QObject::connect(this, SIGNAL(timeUpValueChanged()),
-                   this, SLOT(refreshTimelineWidgets()),
-                   Qt::QueuedConnection);
-  connect(scrollbar, &QScrollBar::valueChanged, gview, &TLGView::sliderMoved);
-
-  // Update timeline window sub-windows
-  qDebug() << "[-] Timeline window  created";
-  scrollbar->update();
-  hLegendW->update();
-  signalsTreeView->update();
-  gview->update();
-  gview->raise();
-  verViewW->update();
-  scrollArea->update();
-  this->update();
-
-  // Show timeline window sub-windows
-  qDebug() << "[-] Timeline window  updates done";
-  scrollbar->show();
-  hLegendW->show();
-  gview->show();
-  verViewW->show();
-  scrollArea->show();
-  signalsTreeView->show();
-  this->show();
-
-  qDebug() << "[-] Timeline window  displayed";
-}
-
-Timeline::~Timeline(){
-  // need to delete the buttonGroup
-  delete buttonGroup;
-  delete timestamp;
-  delete timeInterval;
-  delete zoomInAct;
-  delete zoomOutAct;
-  delete viewAllAct;
-  delete progressBar;
-  delete timeLabel;
-  delete intervalLabel;
-  //delete radioB1;
-  //delete radioB2;
-  // delete radioB3;
-  clearCurrentExecToolBar();
-}
-
-void Timeline::resizeEvent(__attribute__((unused)) QResizeEvent *event){
-
-}
-
-void Timeline::clearCurrentExecToolBar(){
-  for (auto c: toBeRemoveAfterExec){
-    toolBar->removeAction(c);
-    }
-  toBeRemoveAfterExec.clear();
-}
-
-void Timeline::refreshTimelineWidgets() {
-    statTableView->repaint();
-    gview->repaint();
-    scrollArea->repaint();
-    this->repaint();
-    mw->repaint();
-    statTableView->show();
-    gview->show();
-    scrollArea->show();
-    this->show();
-    mw->show();
-}
diff --git a/tools/profiler/gui/src/treeitem.cpp b/tools/profiler/gui/src/treeitem.cpp
deleted file mode 100644
index 79f6a318a..000000000
--- a/tools/profiler/gui/src/treeitem.cpp
+++ /dev/null
@@ -1,87 +0,0 @@
-/*
- * Copyright (C) 2020  GreenWaves Technologies, SAS
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU Affero General Public License as
- * published by the Free Software Foundation, either version 3 of the
- * License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU Affero General Public License for more details.
- *
- * You should have received a copy of the GNU Affero General Public License
- * along with this program.  If not, see <https://www.gnu.org/licenses/>.
- */
-
-#include "treeitem.hpp"
-
-//! [0]
-TreeItem::TreeItem(const QVector<QVariant> &data, TreeItem *parent)
-    : m_itemData(data), m_parentItem(parent)
-{}
-//! [0]
-
-//! [1]
-TreeItem::~TreeItem()
-{
-    qDeleteAll(m_childItems);
-}
-//! [1]
-
-//! [2]
-void TreeItem::appendChild(TreeItem *item)
-{
-    m_childItems.append(item);
-}
-//! [2]
-
-//! [3]
-TreeItem *TreeItem::child(int row)
-{
-    if (row < 0 || row >= m_childItems.size())
-        return nullptr;
-    return m_childItems.at(row);
-}
-//! [3]
-
-//! [4]
-int TreeItem::childCount() const
-{
-    return m_childItems.count();
-}
-//! [4]
-
-//! [5]
-int TreeItem::columnCount() const
-{
-    return m_itemData.count();
-}
-//! [5]
-
-//! [6]
-QVariant TreeItem::data(int column) const
-{
-    if (column < 0 || column >= m_itemData.size())
-        return QVariant();
-    return m_itemData.at(column);
-}
-//! [6]
-
-//! [7]
-TreeItem *TreeItem::parentItem()
-{
-    return m_parentItem;
-}
-//! [7]
-
-//! [8]
-int TreeItem::row() const
-{
-    if (m_parentItem)
-        return m_parentItem->m_childItems.indexOf(const_cast<TreeItem*>(this));
-
-    return 0;
-}
-//! [8]
diff --git a/tools/profiler/gui/src/treemodel.cpp b/tools/profiler/gui/src/treemodel.cpp
deleted file mode 100644
index 9c5c2b8a5..000000000
--- a/tools/profiler/gui/src/treemodel.cpp
+++ /dev/null
@@ -1,236 +0,0 @@
-/*
- * Copyright (C) 2020  GreenWaves Technologies, SAS
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU Affero General Public License as
- * published by the Free Software Foundation, either version 3 of the
- * License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU Affero General Public License for more details.
- *
- * You should have received a copy of the GNU Affero General Public License
- * along with this program.  If not, see <https://www.gnu.org/licenses/>.
- */
-
-
-/*
-    treemodel.cpp
-
-    Provides a simple tree model to show how to create and use hierarchical
-    models.
-*/
-
-#include "treemodel.hpp"
-#include "treeitem.hpp"
-
-#include <QStringList>
-#include <QSize>
-#include <QColor>
-#include <iostream>
-
-
-//! [0]
-TreeModel::TreeModel(const QString &data, QObject *parent)
-    : QAbstractItemModel(parent)
-{
-    rootItem = new TreeItem({tr("Signal"), tr("Path")});
-    setupModelData(data.split('\n'), rootItem);
-}
-
-
-TreeModel::TreeModel(const QString &data, const QString &path, QObject *parent)
-    : QAbstractItemModel(parent)
-{
-    rootItem = new TreeItem({tr("Signal"), tr(path.toStdString().c_str())});
-    setupModelData(data.split('\n'), rootItem);
-}
-
-
-//! [0]
-
-//! [1]
-TreeModel::~TreeModel()
-{
-    delete rootItem;
-}
-//! [1]
-
-//! [2]
-int TreeModel::columnCount(const QModelIndex &parent) const
-{
-    if (parent.isValid())
-        return static_cast<TreeItem*>(parent.internalPointer())->columnCount();
-    return rootItem->columnCount();
-}
-//! [2]
-
-//! [3]
-QVariant TreeModel::data(const QModelIndex &index, int role) const
-{
-    if (!index.isValid())
-        return QVariant();
-
-    if (role == Qt::SizeHintRole)
-      {
-        // std::cout << "index.row() = " << index.row() << cout::endl;
-        // std::cout << "index.column() = " << index.column() << cout::endl;
-        // An example. Set the size of the first cell.
-        //if (index.row() == 0 && index.column() == 0)
-        //{
-          return QSize(100, 30); // for all rows
-        //}
-      }
-      else if (role == Qt::BackgroundRole) {
-        if (0 == index.row() % 2)
-            return QColor(247,245,243);
-            //return QColor(222,221,219);
-        else
-            return QColor(255,255,255);
-      }
-      else if (role != Qt::DisplayRole)
-        return QVariant();
-
-    TreeItem *item = static_cast<TreeItem*>(index.internalPointer());
-
-    return item->data(index.column());
-}
-
-//! [3]
-
-//! [4]
-Qt::ItemFlags TreeModel::flags(const QModelIndex &index) const
-{
-    if (!index.isValid())
-        return Qt::NoItemFlags;
-
-    return QAbstractItemModel::flags(index);
-}
-//! [4]
-
-//! [5]
-QVariant TreeModel::headerData(int section, Qt::Orientation orientation,
-                               int role) const
-{
-    if (orientation == Qt::Horizontal && role == Qt::DisplayRole)
-        return rootItem->data(section);
-
-    return QVariant();
-}
-//! [5]
-
-//! [6]
-QModelIndex TreeModel::index(int row, int column, const QModelIndex &parent) const
-{
-    if (!hasIndex(row, column, parent))
-        return QModelIndex();
-
-    TreeItem *parentItem;
-
-    if (!parent.isValid())
-        parentItem = rootItem;
-    else
-        parentItem = static_cast<TreeItem*>(parent.internalPointer());
-
-    TreeItem *childItem = parentItem->child(row);
-    if (childItem)
-        return createIndex(row, column, childItem);
-    return QModelIndex();
-}
-//! [6]
-
-//! [7]
-QModelIndex TreeModel::parent(const QModelIndex &index) const
-{
-    if (!index.isValid())
-        return QModelIndex();
-
-    TreeItem *childItem = static_cast<TreeItem*>(index.internalPointer());
-    TreeItem *parentItem = childItem->parentItem();
-
-    if (parentItem == rootItem)
-        return QModelIndex();
-
-    return createIndex(parentItem->row(), 0, parentItem);
-}
-//! [7]
-
-//! [8]
-int TreeModel::rowCount(const QModelIndex &parent) const
-{
-    TreeItem *parentItem;
-    if (parent.column() > 0)
-        return 0;
-
-    if (!parent.isValid())
-        parentItem = rootItem;
-    else
-        parentItem = static_cast<TreeItem*>(parent.internalPointer());
-
-    return parentItem->childCount();
-}
-//! [8]
-
-
-void TreeModel::setupModelData(const QStringList &lines, TreeItem *parent)
-{
-    QVector<TreeItem*> parents;
-    QVector<int> indentations;
-    parents << parent;
-    indentations << 0;
-
-    int number = 0;
-
-    while (number < lines.count()) {
-        int position = 0;
-        while (position < lines[number].length()) {
-            if (lines[number].at(position) != ' ')
-                break;
-            position++;
-        }
-
-        const QString lineData = lines[number].mid(position).trimmed();
-
-
-        if (!lineData.isEmpty()) {
-            // Read the column data from the rest of the line.
-            //const QStringList columnStrings = lineData.split('\t', QString::SkipEmptyParts);
-            QStringList columnStrings = lineData.split(';', QString::SkipEmptyParts);
-            QVector<QVariant> columnData;
-            columnData.reserve(columnStrings.count());
-            for (QString &columnString : columnStrings) {
-                // first eliminate blanks and tabs
-                //std::cout << "*" << columnString.toStdString() << "*" << std::endl;
-                columnString=columnString.simplified(); // changes all whitespace characters to single instance of ASCII 32
-                //std::cout << "*" << columnString.toStdString() << "*" << std::endl;
-                columnString.replace(" ", "" );
-                //std::cout << "*" << columnString.toStdString() << "*" << std::endl;
-
-                columnData << columnString;
-                //std::cout << "columnString[" << number << "] = " << columnString.toStdString() << std::endl;
-            }
-
-
-            if (position > indentations.last()) {
-                // The last child of the current parent is now the new parent
-                // unless the current parent has no children.
-
-                if (parents.last()->childCount() > 0) {
-                    parents << parents.last()->child(parents.last()->childCount()-1);
-                    indentations << position;
-                }
-            } else {
-                while (position < indentations.last() && parents.count() > 0) {
-                    parents.pop_back();
-                    indentations.pop_back();
-                }
-            }
-
-            // Append a new item to the current parent's list of children.
-            parents.last()->appendChild(new TreeItem(columnData, parents.last()));
-        }
-        ++number;
-    }
-}
diff --git a/tools/profiler/init.sh b/tools/profiler/init.sh
deleted file mode 100644
index 60d8c0ce2..000000000
--- a/tools/profiler/init.sh
+++ /dev/null
@@ -1,8 +0,0 @@
-if [  -n "${ZSH_VERSION:-}" ]; then
-	DIR="$(readlink -f -- "${(%):-%x}")"
-	script_path="$(dirname $DIR)"
-else
-	script_path="$(dirname "$(readlink -f "${BASH_SOURCE[0]}")")"
-fi
-
-export PATH=$script_path/gui/build:$script_path/backend/src/scripts/:$PATH
diff --git a/tools/profiler/init_profiler.sh b/tools/profiler/init_profiler.sh
deleted file mode 100755
index 49ab0a6ef..000000000
--- a/tools/profiler/init_profiler.sh
+++ /dev/null
@@ -1,75 +0,0 @@
-##!/bin/bash
-set -e
-
-fifo_name="all.bin"
-exe_name="${exe_name:-GCC_RISCV/test}"
-build_dir="${build_dir:-BUILD}"
-
-if [ "$#" -gt 2 ]
-then
-  echo "Error: you must provide at most : "\
-      " the directory containing the makefile of your project and the profiling"\
-      " frequency"
-  exit 1
-fi
-
-#default profiling frequency: 1 MHz
-frequency=1000000
-if [ "$#" -eq 2 ]
-then
-  frequency=$2
-fi
-
-if ! [ -d $1 ]
-then
-  if [ $(basename $1) == "Makefile" ]
-  then
-    set -- $(dirname $1)
-  else
-    echo "Error: directory '"$1"' does not exist"
-    exit 1
-  fi
-fi
-
-# WARNING DO NOT use -j option of make
-# for some very well written makefiles in sdk/examples it does not work
-
-make -C $1 all platform=gvsoc
-if ! [ -d $1/$build_dir ]
-then
- echo "Error: wrong build dir specified "
- exit 1
-fi
-
-full_exe_name=$(readlink --canonicalize \
-    $(find $1/$build_dir -name $(basename $exe_name) | grep -m 1 $exe_name))
-if [ $full_exe_name = "" ]
-then
-  echo "Error: cannot find executable file after build" \
-    "Make sure variable $exe_name is set correctly"
-  exit 1
-fi
-
-echo " *********************** exe name"
-echo $full_exe_name
-full_fifo_name=$(readlink --canonicalize $(dirname $full_exe_name)/$fifo_name)
-echo " *********************** fifo name"
-echo $full_fifo_name
-if ! [ -e $full_fifo_name ]
-then
-  mkfifo $full_fifo_name
-else#
-  rm $full_fifo_name
-  mkfifo $full_fifo_name
-fi
-
-
-
-make -C $1 run platform=gvsoc runner_args="--vcd --event=.*@$fifo_name \
-  --event-format=raw" \
- || (echo "Build or run failed, did you install and source the sdk correctly?" \
-      && exit 1) &
-
-#echo $full_fifo_name
-#echo $full_exe_name $frequency
-#(cd $(dirname $0) && gui/build/profiler $full_fifo_name $full_exe_name $frequency)
diff --git a/tools/profiler/killgvsoc.sh b/tools/profiler/killgvsoc.sh
deleted file mode 100644
index 96baffcfa..000000000
--- a/tools/profiler/killgvsoc.sh
+++ /dev/null
@@ -1 +0,0 @@
-for pid in $(ps -ef | grep "grep" | awk '{print $2}'); do kill -9 $pid; done
\ No newline at end of file
diff --git a/tools/profiler/profiler_benchmark.ods b/tools/profiler/profiler_benchmark.ods
deleted file mode 100644
index effb9591a..000000000
Binary files a/tools/profiler/profiler_benchmark.ods and /dev/null differ
diff --git a/tools/profiler/run_profiler.sh b/tools/profiler/run_profiler.sh
deleted file mode 100755
index 4776de1b3..000000000
--- a/tools/profiler/run_profiler.sh
+++ /dev/null
@@ -1,69 +0,0 @@
-##!/bin/bash
-set -e
-
-fifo_name="all.bin"
-exe_name="${exe_name:-GCC_RISCV/test}"
-build_dir="${build_dir:-BUILD}"
-
-if [ "$#" -gt 2 ]
-then
-  echo "Error: you must provide at most : "\
-      " the directory containing the makefile of your project and the profiling"\
-      " frequency"
-  exit 1
-fi
-
-#default profiling frequency: 1 MHz
-frequency=1000000
-if [ "$#" -eq 2 ]
-then
-  frequency=$2
-fi
-
-if ! [ -d $1 ]
-then
-  if [ $(basename $1) == "Makefile" ]
-  then
-    set -- $(dirname $1)
-  else
-    echo "Error: directory '"$1"' does not exist"
-    exit 1
-  fi
-fi
-
-# WARNING DO NOT use -j option of make
-# for some very well written makefiles in sdk/examples it does not work
-
-make -C $1 all platform=gvsoc
-if ! [ -d $1/$build_dir ]
-then
- echo "Error: wrong build dir specified "
- exit 1
-fi
-
-full_exe_name=$(readlink --canonicalize \
-    $(find $1/$build_dir -name $(basename $exe_name) | grep -m 1 $exe_name))
-if [[ $full_exe_name == "" ]]
-then
-  echo "Error: cannot find executable file after build" \
-    "Make sure variable $exe_name is set correctly"
-  exit 1
-fi
-
-full_fifo_name=$(readlink --canonicalize $(dirname $full_exe_name)/$fifo_name)
-echo " *********************** fifo name"
-echo $full_fifo_name
-if ! [[ -e $full_fifo_name ]]
-then
-  mkfifo $full_fifo_name
-else
-  rm $full_fifo_name
-  mkfifo $full_fifo_name
-fi
-
-make -C $1 run platform=gvsoc runner_args="--vcd --event=.*@$fifo_name \
-  --event-format=raw" \
- || (echo "Build or run failed, did you install and source the sdk correctly?" \
-      && exit 1) &
-
-(cd $(dirname $0) && gui/build/profiler $full_fifo_name $full_exe_name $frequency)
diff --git a/utils/bin/binary-size b/utils/bin/binary-size
index 8918afc8d..8bba53976 100755
--- a/utils/bin/binary-size
+++ b/utils/bin/binary-size
@@ -125,6 +125,7 @@ groups = {
             Group('PulpOS:irq', ['pos_irq.*']),
             Group('PulpOS:time', ['pos_time.*']),
             Group('PulpOS:alloc', ['pos_alloc.*', 'pos_free.*']),
+            Group('PulpOS:i2c', ['__pi_i2c.*', 'pi_i2c.*']),
             Group('PulpOS:udma', ['pos_udma.*']),
             Group('PulpOS:soc_event', ['pos_soc_event.*']),
             Group('PulpOS:task', ['pos_task_.*', 'pos_sched.*']),
@@ -157,6 +158,7 @@ groups = {
             Group('FreeRTOS:octospi', ['__pi_octospi.*', 'pi_octospi_.*']),
             Group('FreeRTOS:pmu', ['__pi_pmu.*', 'pi_pmu.*']),
             Group('FreeRTOS:rtc', ['__pi_rtc.*', 'pi_rtc.*']),
+            Group('FreeRTOS:i2c', ['__pi_i2c.*', 'pi_i2c.*']),
             Group('FreeRTOS:spim', ['pos_spim.*', 'pi_spi_.*']),
             Group('FreeRTOS:uart', ['__pi_uart.*', 'pi_uart_.*']),
             Group('FreeRTOS:malloc', ['pi_malloc_.*', 'pi_l2_*', 'pi_l1_*', 'pi_cl_l1_*']),
diff --git a/utils/gap_configs/configs/chips/gap9_v2/gap9_v2_rtl.json b/utils/gap_configs/configs/chips/gap9_v2/gap9_v2_rtl.json
index d0932c616..0f715db39 100644
--- a/utils/gap_configs/configs/chips/gap9_v2/gap9_v2_rtl.json
+++ b/utils/gap_configs/configs/chips/gap9_v2/gap9_v2_rtl.json
@@ -14,12 +14,25 @@
     "board": {
       "name": "gap9_v2",
       "devices": {
-        "spiflash": {
-          "@includes@": ["devices/spiflash_atxp032.json"]
-        },
-        "flash": {
-          "@include@" : "devices/hyperflash.json",
-          "cs": 1
+        "@cond@": {
+          "@os.environ.get('BOARD_NAME') == 'gap9_evk'@": {
+            "spiflash": {
+              "@includes@": ["devices/spiflash_mx25um51245g.json"]
+            },
+            "flash": {
+              "@include@" : "devices/hyperflash.json",
+              "cs": 1
+            }
+          },
+          "@os.environ.get('BOARD_NAME') != 'gap9_evk'@": {
+            "spiflash": {
+              "@includes@": ["devices/spiflash_atxp032.json"]
+            },
+            "flash": {
+              "@include@" : "devices/hyperflash.json",
+              "cs": 1
+            }
+          }
         }
       },
 
@@ -34,7 +47,7 @@
             },
             "nina_b112": {
               "@includes@": ["devices/nina_b112.json"]
-            },
+            },              
             "mic0": {
               "@includes@": ["devices/microphone.json"]
             },
diff --git a/utils/gap_configs/configs/devices/spiflash_mx25um51245g.json b/utils/gap_configs/configs/devices/spiflash_mx25um51245g.json
new file mode 100644
index 000000000..50a6b5dec
--- /dev/null
+++ b/utils/gap_configs/configs/devices/spiflash_mx25um51245g.json
@@ -0,0 +1,54 @@
+{
+    "name": "SPI_flash",
+  
+    "datasheet": {
+        "type": "spi",
+        "size": "4MB",
+        "block-size": "4KB"
+    },
+  
+    "models": {
+        "gvsoc": {
+  
+        },
+        "rtl": {
+            "stimuli": {
+                "format": "slm",
+                "file": "slm_files/mx25um51245g_flash_stim.slm"
+            }
+        }
+    },
+  
+    "content": {
+      "partitions": {
+          "readfs": {
+              "type": "readfs",
+              "files": []
+          },
+          "hostfs": {
+              "type": "hostfs",
+              "files": []
+          },
+          "lfs": {
+              "type": "lfs",
+              "root_dir": null
+          }
+      }
+    },
+  
+    "doc_rst": "devices/spiflash.rst",
+    "description": "SPI flash model",
+    "platforms": [ "gvsoc", "rtl" ],
+  
+      "vp_class": "devices/spiflash/spiflash",
+      "vp_component": "devices.spiflash.spiflash_impl",
+      "type": "spiflash",
+      "size": "0x00800000",
+  
+      "fs": {
+        "files": [],
+        "encrypt": false,
+        "aes_key": 0,
+        "aes_iv": 0
+      }
+  }
\ No newline at end of file
diff --git a/utils/gap_configs/python/devices/gpio/fxl6408.py b/utils/gap_configs/python/devices/gpio/fxl6408.py
new file mode 100644
index 000000000..e197a4687
--- /dev/null
+++ b/utils/gap_configs/python/devices/gpio/fxl6408.py
@@ -0,0 +1,24 @@
+# Copyright (C) 2020  GreenWaves Technologies, SAS
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Affero General Public License for more details.
+# You should have received a copy of the GNU Affero General Public License
+# along with this program.  If not, see <https://www.gnu.org/licenses/>.
+
+import gsystree as st
+
+
+class Fxl6408(st.Component):
+    """FXL6408 system-tree component; it only declares its 'vp_component'."""
+
+    def __init__(self, parent, name):
+        # parent and name are handed to st.Component untouched.
+        super().__init__(parent, name)
+        self.add_property('vp_component', 'devices.gpio.fxl6408')
diff --git a/utils/gap_configs/python/devices/sound/dac/ak4332.py b/utils/gap_configs/python/devices/sound/dac/ak4332.py
new file mode 100644
index 000000000..032abebda
--- /dev/null
+++ b/utils/gap_configs/python/devices/sound/dac/ak4332.py
@@ -0,0 +1,24 @@
+# Copyright (C) 2020  GreenWaves Technologies, SAS
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Affero General Public License for more details.
+# You should have received a copy of the GNU Affero General Public License
+# along with this program.  If not, see <https://www.gnu.org/licenses/>.
+
+import gsystree as st
+
+
+class Ak4332(st.Component):
+    """AK4332 system-tree component; it only declares its 'vp_component'."""
+
+    def __init__(self, parent, name):
+        # parent and name are handed to st.Component untouched.
+        super().__init__(parent, name)
+        self.add_property('vp_component', 'devices.sound.dac.ak4332')
diff --git a/utils/gap_configs/python/gap/gap9/cluster.json b/utils/gap_configs/python/gap/gap9/cluster.json
index be0a99df5..482b54a5d 100644
--- a/utils/gap_configs/python/gap/gap9/cluster.json
+++ b/utils/gap_configs/python/gap/gap9/cluster.json
@@ -1,5 +1,8 @@
 
 {
+    "vp_component": "gap9.cluster",
+    "power_models_file": "gap/gap9/power_models/cluster.json",
+
     "alias": "0x00000000",
     "mapping": {
         "base": "0x10000000",
diff --git a/utils/gap_configs/python/gap/gap9/cluster.py b/utils/gap_configs/python/gap/gap9/cluster.py
index 24de26182..b42c0799d 100644
--- a/utils/gap_configs/python/gap/gap9/cluster.py
+++ b/utils/gap_configs/python/gap/gap9/cluster.py
@@ -68,6 +68,7 @@ def __init__(self, parent, name, config_file, cid: int=0):
         #
 
         self.add_properties(self.load_property_file(config_file))
+        self.add_properties({ 'power_models': self.load_property_file(self.get_property("power_models_file"))})
 
         nb_pe               = self.get_property('nb_pe', int)
         cluster_size        = self.get_property('mapping/size', int)
@@ -111,7 +112,7 @@ def __init__(self, parent, name, config_file, cid: int=0):
         demux_periph_ico = Router(self, 'demux_periph_ico')
 
         # MCHAN
-        mchan = Mchan(self, 'dma', nb_channels=nb_pe+1)
+        mchan = Mchan(self, 'dma', nb_channels=nb_pe+1, power_models_file="gap/gap9/power_models/mchan.json")
 
         # Timer
         timer = Timer(self, 'timer')
@@ -141,6 +142,7 @@ def __init__(self, parent, name, config_file, cid: int=0):
 
         # Cores
         for i in range(0, nb_pe):
+            self.bind(pes[i], 'busy', self, 'core_busy_%d' % i)
             self.bind(pes[i], 'data', l1, 'data_pe_%d' % i)
             self.bind(pes[i], 'fetch', icache, 'input_%d' % i)
             self.bind(pes[i], 'irq_ack', event_unit, 'irq_ack_%d' % i)
@@ -211,6 +213,7 @@ def __init__(self, parent, name, config_file, cid: int=0):
         # MCHAN
         self.bind(mchan, 'ext_irq_itf', self, 'dma_irq')
         self.bind(mchan, 'ext_itf', cluster_ico, 'input')
+        self.bind(mchan, 'busy', self, 'dma_busy')
 
         for i in range(0, 4):
             self.bind(mchan, 'loc_itf_%d' % i, l1, 'dma_in_%d' % i)
@@ -222,11 +225,13 @@ def __init__(self, parent, name, config_file, cid: int=0):
     
         # Timer
         self.bind(self, 'ref_clock', timer, 'ref_clock')
+        self.bind(timer, 'busy', self, 'timer0_busy')
         for i in range(0, nb_pe):
             self.bind(timer, 'irq_itf_0', event_unit, 'in_event_%d_pe_%d' % (timer_irq_0, i))
             self.bind(timer, 'irq_itf_1', event_unit, 'in_event_%d_pe_%d' % (timer_irq_1, i))
 
         # Cluster control
+        self.bind(cluster_control, 'clock_gating_en', self, 'cluster_clock_gating_en')
         for i in range(0, nb_pe):
             self.bind(cluster_control, 'bootaddr_%d' % i, pes[i], 'bootaddr')
             self.bind(cluster_control, 'fetchen_%d' % i, pes[i], 'fetchen')
diff --git a/utils/gap_configs/python/gap/gap9/gap9.py b/utils/gap_configs/python/gap/gap9/gap9.py
index 81bedbf87..ae6840a92 100644
--- a/utils/gap_configs/python/gap/gap9/gap9.py
+++ b/utils/gap_configs/python/gap/gap9/gap9.py
@@ -161,6 +161,7 @@ def __init__(self, parent, name, soc_config_file='gap/gap9/soc.json', cluster_co
         self.bind(soc_clock, 'out', pmu, 'clock')
         self.bind(soc, 'pmu_input', pmu, 'input')
         self.bind(pmu, 'icu6_reset', clusters[0], 'reset')
+        self.bind(pmu, 'icu6_power', clusters[0], 'power_supply')
         self.bind(pmu, 'icu5_reset', soc, 'reset')
         self.bind(ref_clock, 'out', pmu, 'ref_clock')
         self.bind(pmu, 'event', soc, 'event')
diff --git a/utils/gap_configs/python/gap/gap9/gap9_evk.py b/utils/gap_configs/python/gap/gap9/gap9_evk.py
index 938970292..eba5839af 100644
--- a/utils/gap_configs/python/gap/gap9/gap9_evk.py
+++ b/utils/gap_configs/python/gap/gap9/gap9_evk.py
@@ -15,6 +15,9 @@
 from gap.gap9.gapmod import Gapmod
 from devices.testbench.testbench import Testbench
 from devices.uart.uart_checker import Uart_checker
+from devices.gpio.fxl6408 import Fxl6408
+from devices.sound.dac.ak4332 import Ak4332
+from devices.i2c.i2c_bus import I2c_bus
 
 
 class Gap9_evk(Gapmod):
@@ -26,15 +29,18 @@ class Gap9_evk(Gapmod):
     addon_testbench_enabled : bool, optional
         If True, this enables the testbench addon, which is a specific GVSOC addon for
         generating stimuli on the pads (default: False).
+    addon_audio : bool, optional
+        If True, this enables the audio addon (default: False).
     
     """
 
-    def __init__(self, parent, name, addon_testbench_enabled: bool=False, addon_uart_checker: bool=True):
+    def __init__(self, parent, name, addon_testbench_enabled: bool=False, addon_uart_checker: bool=True, addon_audio: bool=False):
         super(Gap9_evk, self).__init__(parent, name)
 
         # Register all parameters as properties so that they can be overwritten from the command-line
         self.add_property('addon_testbench_enabled', addon_testbench_enabled)
         self.add_property('addon_uart_checker', addon_uart_checker)
+        self.add_property('addon_audio', addon_audio)
 
         gap = self.get_component('chip')
 
@@ -55,6 +61,18 @@ def __init__(self, parent, name, addon_testbench_enabled: bool=False, addon_uart
             self.bind(testbench, 'i2s1', gap, 'i2s1')
             self.bind(testbench, 'i2s2', gap, 'i2s2')
 
+        # Addon audio
+        elif self.get_property('addon_audio'):
+
+            i2c1_bus = I2c_bus(self, 'i2c1_bus')
+            io_expander = Fxl6408(self, 'io_expander')
+            dac = Ak4332(self, 'dac')
+
+            self.bind(gap, 'i2c1', i2c1_bus, 'input')
+            self.bind(io_expander, 'i2c', i2c1_bus, 'input')
+            self.bind(dac, 'i2c', i2c1_bus, 'input')
+
+
         else:
             # Addon uart checker
             if self.get_property('addon_uart_checker'):
@@ -67,4 +85,10 @@ def __init__(self, parent, name, addon_testbench_enabled: bool=False, addon_uart
                 self.bind(gap, 'i2c3', self, 'i2c3')
 
         
+class Gap9_evk_audio(Gap9_evk):
+    """
+    GAP9 EVK board created with the audio addon switched on (addon_audio=True)
+    """
 
+    def __init__(self, parent, name):
+        super().__init__(parent, name, addon_audio=True)
diff --git a/utils/gap_configs/python/gap/gap9/padframe.json b/utils/gap_configs/python/gap/gap9/padframe.json
index 5f690662e..a2200efee 100644
--- a/utils/gap_configs/python/gap/gap9/padframe.json
+++ b/utils/gap_configs/python/gap/gap9/padframe.json
@@ -59,6 +59,18 @@
         "type": "i2c",
         "is_master": true
       },
+      "i2c1": {
+        "type": "i2c",
+        "is_master": true
+      },
+      "i2c2": {
+        "type": "i2c",
+        "is_master": true
+      },
+      "i2c3": {
+        "type": "i2c",
+        "is_master": true
+      },
       "i2s0": {
         "type": "i2s",
         "is_slave": true
diff --git a/utils/gap_configs/python/gap/gap9/power_models/cluster.json b/utils/gap_configs/python/gap/gap9/power_models/cluster.json
new file mode 100644
index 000000000..4e1ef9178
--- /dev/null
+++ b/utils/gap_configs/python/gap/gap9/power_models/cluster.json
@@ -0,0 +1,28 @@
+{
+    "background": {
+        "dynamic": {
+            "type": "linear",
+            "unit": "W",
+        
+            "values": {
+                "25": {
+                    "1.2": {
+                        "any": "0.00284"
+                    }
+                }
+            }
+        },
+        "leakage": {
+            "type": "linear",
+            "unit": "W",
+        
+            "values": {
+                "25": {
+                    "1.2": {
+                        "any": "0.0011045"
+                    }
+                }
+            }
+        }
+    }
+}
\ No newline at end of file
diff --git a/utils/gap_configs/python/gap/gap9/power_models/core/riscy.json b/utils/gap_configs/python/gap/gap9/power_models/core/riscy.json
index c48a5e38f..bb035c2c9 100644
--- a/utils/gap_configs/python/gap/gap9/power_models/core/riscy.json
+++ b/utils/gap_configs/python/gap/gap9/power_models/core/riscy.json
@@ -1,18 +1,4 @@
 {
-    "clock_gated": {
-        "dynamic": {
-            "type": "linear",
-            "unit": "W",
-        
-            "values": {
-                "25": {
-                    "1.2": {
-                        "any": "0.0000016"
-                    }
-                }
-            }
-        }
-    },
     "insn_groups": [
         {
             "dynamic": {
@@ -22,7 +8,7 @@
                 "values": {
                     "25": {
                         "1.2": {
-                            "any": "6.36"
+                            "any": "4.98"
                         }
                     }
                 }
@@ -36,14 +22,26 @@
                 "values": {
                     "25": {
                         "1.2": {
-                            "any": "6.36"
+                            "any": "4.98"
                         }
                     }
                 }
             }
         }
     ],
-    "leakage": {
+    "background": {
+        "dynamic": {
+            "type": "linear",
+            "unit": "W",
+        
+            "values": {
+                "25": {
+                    "1.2": {
+                        "any": "0.000"
+                    }
+                }
+            }
+        },
         "leakage": {
             "type": "linear",
             "unit": "W",
@@ -51,7 +49,7 @@
             "values": {
                 "25": {
                     "1.2": {
-                        "any": "0.0001714375"
+                        "any": "0.0000355"
                     }
                 }
             }
diff --git a/utils/gap_configs/python/gap/gap9/power_models/l1/l1.json b/utils/gap_configs/python/gap/gap9/power_models/l1/l1.json
index e41b57355..cbffa82f0 100644
--- a/utils/gap_configs/python/gap/gap9/power_models/l1/l1.json
+++ b/utils/gap_configs/python/gap/gap9/power_models/l1/l1.json
@@ -1,5 +1,5 @@
 {
-    "idle": {
+    "background": {
         "dynamic": {
             "type": "linear",
             "unit": "W",
@@ -7,13 +7,11 @@
             "values": {
                 "25": {
                     "1.2": {
-                        "any": "0.00000501264031"
+                        "any": "0.000"
                     }
                 }
             }
-        }
-    },
-    "leakage": {
+        },
         "leakage": {
             "type": "linear",
             "unit": "W",
@@ -21,7 +19,7 @@
             "values": {
                 "25": {
                     "1.2": {
-                        "any": "0.00001707210625"
+                        "any": "0.0000052"
                     }
                 }
             }
diff --git a/utils/gap_configs/python/gap/gap9/power_models/mchan.json b/utils/gap_configs/python/gap/gap9/power_models/mchan.json
new file mode 100644
index 000000000..823033343
--- /dev/null
+++ b/utils/gap_configs/python/gap/gap9/power_models/mchan.json
@@ -0,0 +1,42 @@
+{
+    "active": {
+        "dynamic": {
+            "type": "linear",
+            "unit": "W",
+        
+            "values": {
+                "25": {
+                    "1.2": {
+                        "any": "0.000187"
+                    }
+                }
+            }
+        }
+    },
+    "background": {
+        "dynamic": {
+            "type": "linear",
+            "unit": "W",
+        
+            "values": {
+                "25": {
+                    "1.2": {
+                        "any": "0.000100"
+                    }
+                }
+            }
+        },
+        "leakage": {
+            "type": "linear",
+            "unit": "W",
+        
+            "values": {
+                "25": {
+                    "1.2": {
+                        "any": "0.0000128"
+                    }
+                }
+            }
+        }
+    }
+}
\ No newline at end of file
diff --git a/utils/gap_configs/python/gap/gap9/soc.py b/utils/gap_configs/python/gap/gap9/soc.py
index 320738471..8c38dc0ce 100644
--- a/utils/gap_configs/python/gap/gap9/soc.py
+++ b/utils/gap_configs/python/gap/gap9/soc.py
@@ -33,7 +33,7 @@
 from ips.fll.fll_v2 import Fll
 from gap.gap9.cluster import get_cluster_name
 from ips.clock.clock_domain import Clock_domain
-from ips.udma.udma_v4 import Udma
+from gap.gap9.udma import Udma
 from ips.xip.xip_v1 import Xip
 from ips.interco.bus_watchpoint import Bus_watchpoint
 from ips.debug.pulp_tap import Pulp_tap
@@ -121,7 +121,7 @@ def __init__(self, parent, name, config_file, chip, cluster):
             bus_watchpoint = Bus_watchpoint(self, 'bus_watchpoint', fc_tohost)
 
         # L2
-        l2_priv0 = memory.Memory(self, 'l2_priv0', size=self.get_property('l2/priv0/mapping/size'))
+        l2_priv0 = memory.Memory(self, 'l2_priv0', size=self.get_property('l2/priv0/mapping/size'), power_trigger=True)
         l2_priv1 = memory.Memory(self, 'l2_priv1', size=self.get_property('l2/priv1/mapping/size'))
 
         l2_shared_size = self.get_property('l2/shared/mapping/size', int)
diff --git a/utils/gap_configs/python/gap/gap9/udma.py b/utils/gap_configs/python/gap/gap9/udma.py
new file mode 100644
index 000000000..d368f50f8
--- /dev/null
+++ b/utils/gap_configs/python/gap/gap9/udma.py
@@ -0,0 +1,97 @@
+#
+# Copyright (C) 2020 GreenWaves Technologies
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import gsystree as st
+
+class Udma(st.Component):
+    """System-tree component for the GAP9 uDMA.
+
+    Its properties come from a property file, and it declares the GTKWave
+    vectors (hyper, SFU and FFC activity) shown in the 'overview' view.
+    """
+
+    def __init__(self, parent, name, config_file):
+        """Create the component and load its properties.
+
+        Parameters
+        ----------
+        parent
+            Forwarded unchanged to st.Component.
+        name
+            Forwarded unchanged to st.Component.
+        config_file
+            Property file handed to load_property_file(); everything it
+            yields is registered with add_properties().
+        """
+        super().__init__(parent, name)
+
+        # Same call gen_gtkw_conf() issues for the 'overview' view.
+        self.vcd_group(self, skip=True)
+
+        self.add_properties(self.load_property_file(config_file))
+
+    def gen_gtkw(self, tree, traces):
+        """Add the uDMA vectors to the GTKWave tree for the 'overview' view.
+
+        Three vectors are added, all tagged 'overview' and sharing one map
+        file:
+
+        - one named after self.name, with the hyper0/hyper1 'active' traces;
+        - 'sfu', with the 'busy' trace of every SFU block instance;
+        - 'ffc', with the 'busy' trace of ffc0..ffc3.
+
+        Nothing is added for any other view. `traces` is not used here.
+        """
+        if tree.get_view() != 'overview':
+            return
+
+        # Single map file shared by all three vectors; 1 is registered as 'ACTIVE'.
+        state_map = tree.new_map_file(self, 'udma_state')
+        state_map.add_value(1, 'CadetBlue', 'ACTIVE')
+
+        # 'active' traces of the two hyper channels.
+        hyper_traces = [
+            ['hyper0', 'hyper0.active', '[7:0]'],
+            ['hyper1', 'hyper1.active', '[7:0]'],
+        ]
+        tree.add_vector(self, self.name, traces=hyper_traces, map_file=state_map, tag='overview')
+
+        # (block name, instance count), in the order the traces are listed;
+        # every instance contributes its 'sfu0.<block>_<n>.busy' trace.
+        sfu_blocks = (
+            ('mem_in', 8), ('stream_in', 8), ('pdm_in', 12),
+            ('mem_out', 8), ('stream_out', 8), ('pdm_out', 3),
+            ('limiter', 6), ('gfu', 4), ('mixer', 4), ('splitter', 4),
+            ('pipe', 4), ('resampler', 4), ('polyphase', 4), ('asrc', 3),
+        )
+        sfu_traces = [
+            ['%s_%d' % (block, idx), 'sfu0.%s_%d.busy' % (block, idx), '[7:0]']
+            for block, count in sfu_blocks
+            for idx in range(count)
+        ]
+        tree.add_vector(self, 'sfu', traces=sfu_traces, map_file=state_map, tag='overview')
+
+        # Four FFC instances: ffc0 .. ffc3.
+        ffc_traces = [['ffc_%d' % idx, 'ffc%d.busy' % idx, '[7:0]'] for idx in range(4)]
+        tree.add_vector(self, 'ffc', traces=ffc_traces, map_file=state_map, tag='overview')
+
+    def gen_gtkw_conf(self, tree, traces):
+        """Select the vcd_group() `skip` flag for the current view.
+
+        `skip` is True for the 'overview' view and False for every other
+        view. `traces` is not used here.
+        """
+        self.vcd_group(self, skip=(tree.get_view() == 'overview'))
diff --git a/utils/gap_configs/python/ips/mchan/mchan_v7.py b/utils/gap_configs/python/ips/mchan/mchan_v7.py
index 43b4f2e1f..f61b3c0ab 100644
--- a/utils/gap_configs/python/ips/mchan/mchan_v7.py
+++ b/utils/gap_configs/python/ips/mchan/mchan_v7.py
@@ -19,7 +19,7 @@
 class Mchan(st.Component):
 
     def __init__(self, parent, name, nb_channels=0, core_queue_depth=2, global_queue_depth=8, is_64=False, max_nb_ext_read_req=8,
-            max_nb_ext_write_req=8, max_burst_length=256, nb_loc_ports=4, tcdm_addr_width=20):
+            max_nb_ext_write_req=8, max_burst_length=256, nb_loc_ports=4, tcdm_addr_width=20, power_models_file=None):
         super(Mchan, self).__init__(parent, name)
 
         self.vcd_group(self, skip=True)
@@ -37,6 +37,9 @@ def __init__(self, parent, name, nb_channels=0, core_queue_depth=2, global_queue
             'tcdm_addr_width': tcdm_addr_width,
         })
 
+        if power_models_file is not None:
+            self.add_property('power_models', self.load_property_file(power_models_file))
+
 
     def gen_gtkw(self, tree, traces):
 
diff --git a/utils/gapy/configs/devices/flash/spiflash_mx25um51245g.json b/utils/gapy/configs/devices/flash/spiflash_mx25um51245g.json
new file mode 100644
index 000000000..50a6b5dec
--- /dev/null
+++ b/utils/gapy/configs/devices/flash/spiflash_mx25um51245g.json
@@ -0,0 +1,54 @@
+{
+    "name": "SPI_flash",
+  
+    "datasheet": {
+        "type": "spi",
+        "size": "4MB",
+        "block-size": "4KB"
+    },
+  
+    "models": {
+        "gvsoc": {
+  
+        },
+        "rtl": {
+            "stimuli": {
+                "format": "slm",
+                "file": "slm_files/mx25um51245g_flash_stim.slm"
+            }
+        }
+    },
+  
+    "content": {
+      "partitions": {
+          "readfs": {
+              "type": "readfs",
+              "files": []
+          },
+          "hostfs": {
+              "type": "hostfs",
+              "files": []
+          },
+          "lfs": {
+              "type": "lfs",
+              "root_dir": null
+          }
+      }
+    },
+  
+    "doc_rst": "devices/spiflash.rst",
+    "description": "SPI flash model",
+    "platforms": [ "gvsoc", "rtl" ],
+  
+      "vp_class": "devices/spiflash/spiflash",
+      "vp_component": "devices.spiflash.spiflash_impl",
+      "type": "spiflash",
+      "size": "0x00800000",
+  
+      "fs": {
+        "files": [],
+        "encrypt": false,
+        "aes_key": 0,
+        "aes_iv": 0
+      }
+  }
\ No newline at end of file
diff --git a/utils/gapy/gen-debug-info b/utils/gapy/gen-debug-info
new file mode 100755
index 000000000..2a707a24b
Binary files /dev/null and b/utils/gapy/gen-debug-info differ
diff --git a/utils/gapy/gen-debug-info-src/CMakeLists.txt b/utils/gapy/gen-debug-info-src/CMakeLists.txt
new file mode 100644
index 000000000..810dc0076
--- /dev/null
+++ b/utils/gapy/gen-debug-info-src/CMakeLists.txt
@@ -0,0 +1,19 @@
+cmake_minimum_required(VERSION 3.16.3)
+
+project(gen-debug-info)
+
+# Header search path: ext/ next to this file, plus this project's build dir.
+include_directories(
+    ${CMAKE_CURRENT_SOURCE_DIR}/ext
+    ${CMAKE_CURRENT_BINARY_DIR}
+    )
+
+# Library search path: the same ext/ directory. CMAKE_CURRENT_SOURCE_DIR keeps
+# the path correct when this project is pulled in through add_subdirectory().
+link_directories(${CMAKE_CURRENT_SOURCE_DIR}/ext)
+
+add_executable(gen-debug-info main.cpp)
+
+target_link_libraries(gen-debug-info bfd iberty dl z)
+
+install(TARGETS gen-debug-info DESTINATION bin)
diff --git a/utils/gapy/gen-debug-info-src/LICENSE b/utils/gapy/gen-debug-info-src/LICENSE
new file mode 100644
index 000000000..293b2efc5
--- /dev/null
+++ b/utils/gapy/gen-debug-info-src/LICENSE
@@ -0,0 +1,773 @@
+The software in the directory newlib and the files configure.ac, Makefile.in,
+and patches/newlib, is licensed as follows:
+
+Copyright (c) 2016, The Regents of the University of California (Regents).
+All Rights Reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+1. Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+3. Neither the name of the Regents nor the
+   names of its contributors may be used to endorse or promote products
+   derived from this software without specific prior written permission.
+
+IN NO EVENT SHALL REGENTS BE LIABLE TO ANY PARTY FOR DIRECT, INDIRECT,
+SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, INCLUDING LOST PROFITS, ARISING
+OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN IF REGENTS HAS
+BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+REGENTS SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE. THE SOFTWARE AND ACCOMPANYING DOCUMENTATION, IF ANY, PROVIDED
+HEREUNDER IS PROVIDED "AS IS". REGENTS HAS NO OBLIGATION TO PROVIDE
+MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS.
+
+The software in the directories binutils, gcc, and linux-headers, and the
+files patches/binutils and patches/gcc, is licensed as follows:
+
+		    GNU GENERAL PUBLIC LICENSE
+		       Version 2, June 1991
+
+ Copyright (C) 1989, 1991 Free Software Foundation, Inc.
+     51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+			    Preamble
+
+  The licenses for most software are designed to take away your
+freedom to share and change it.  By contrast, the GNU General Public
+License is intended to guarantee your freedom to share and change free
+software--to make sure the software is free for all its users.  This
+General Public License applies to most of the Free Software
+Foundation's software and to any other program whose authors commit to
+using it.  (Some other Free Software Foundation software is covered by
+the GNU Library General Public License instead.)  You can apply it to
+your programs, too.
+
+  When we speak of free software, we are referring to freedom, not
+price.  Our General Public Licenses are designed to make sure that you
+have the freedom to distribute copies of free software (and charge for
+this service if you wish), that you receive source code or can get it
+if you want it, that you can change the software or use pieces of it
+in new free programs; and that you know you can do these things.
+
+  To protect your rights, we need to make restrictions that forbid
+anyone to deny you these rights or to ask you to surrender the rights.
+These restrictions translate to certain responsibilities for you if you
+distribute copies of the software, or if you modify it.
+
+  For example, if you distribute copies of such a program, whether
+gratis or for a fee, you must give the recipients all the rights that
+you have.  You must make sure that they, too, receive or can get the
+source code.  And you must show them these terms so they know their
+rights.
+
+  We protect your rights with two steps: (1) copyright the software, and
+(2) offer you this license which gives you legal permission to copy,
+distribute and/or modify the software.
+
+  Also, for each author's protection and ours, we want to make certain
+that everyone understands that there is no warranty for this free
+software.  If the software is modified by someone else and passed on, we
+want its recipients to know that what they have is not the original, so
+that any problems introduced by others will not reflect on the original
+authors' reputations.
+
+  Finally, any free program is threatened constantly by software
+patents.  We wish to avoid the danger that redistributors of a free
+program will individually obtain patent licenses, in effect making the
+program proprietary.  To prevent this, we have made it clear that any
+patent must be licensed for everyone's free use or not licensed at all.
+
+  The precise terms and conditions for copying, distribution and
+modification follow.
+
+		    GNU GENERAL PUBLIC LICENSE
+   TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
+
+  0. This License applies to any program or other work which contains
+a notice placed by the copyright holder saying it may be distributed
+under the terms of this General Public License.  The "Program", below,
+refers to any such program or work, and a "work based on the Program"
+means either the Program or any derivative work under copyright law:
+that is to say, a work containing the Program or a portion of it,
+either verbatim or with modifications and/or translated into another
+language.  (Hereinafter, translation is included without limitation in
+the term "modification".)  Each licensee is addressed as "you".
+
+Activities other than copying, distribution and modification are not
+covered by this License; they are outside its scope.  The act of
+running the Program is not restricted, and the output from the Program
+is covered only if its contents constitute a work based on the
+Program (independent of having been made by running the Program).
+Whether that is true depends on what the Program does.
+
+  1. You may copy and distribute verbatim copies of the Program's
+source code as you receive it, in any medium, provided that you
+conspicuously and appropriately publish on each copy an appropriate
+copyright notice and disclaimer of warranty; keep intact all the
+notices that refer to this License and to the absence of any warranty;
+and give any other recipients of the Program a copy of this License
+along with the Program.
+
+You may charge a fee for the physical act of transferring a copy, and
+you may at your option offer warranty protection in exchange for a fee.
+
+  2. You may modify your copy or copies of the Program or any portion
+of it, thus forming a work based on the Program, and copy and
+distribute such modifications or work under the terms of Section 1
+above, provided that you also meet all of these conditions:
+
+    a) You must cause the modified files to carry prominent notices
+    stating that you changed the files and the date of any change.
+
+    b) You must cause any work that you distribute or publish, that in
+    whole or in part contains or is derived from the Program or any
+    part thereof, to be licensed as a whole at no charge to all third
+    parties under the terms of this License.
+
+    c) If the modified program normally reads commands interactively
+    when run, you must cause it, when started running for such
+    interactive use in the most ordinary way, to print or display an
+    announcement including an appropriate copyright notice and a
+    notice that there is no warranty (or else, saying that you provide
+    a warranty) and that users may redistribute the program under
+    these conditions, and telling the user how to view a copy of this
+    License.  (Exception: if the Program itself is interactive but
+    does not normally print such an announcement, your work based on
+    the Program is not required to print an announcement.)
+
+These requirements apply to the modified work as a whole.  If
+identifiable sections of that work are not derived from the Program,
+and can be reasonably considered independent and separate works in
+themselves, then this License, and its terms, do not apply to those
+sections when you distribute them as separate works.  But when you
+distribute the same sections as part of a whole which is a work based
+on the Program, the distribution of the whole must be on the terms of
+this License, whose permissions for other licensees extend to the
+entire whole, and thus to each and every part regardless of who wrote it.
+
+Thus, it is not the intent of this section to claim rights or contest
+your rights to work written entirely by you; rather, the intent is to
+exercise the right to control the distribution of derivative or
+collective works based on the Program.
+
+In addition, mere aggregation of another work not based on the Program
+with the Program (or with a work based on the Program) on a volume of
+a storage or distribution medium does not bring the other work under
+the scope of this License.
+
+  3. You may copy and distribute the Program (or a work based on it,
+under Section 2) in object code or executable form under the terms of
+Sections 1 and 2 above provided that you also do one of the following:
+
+    a) Accompany it with the complete corresponding machine-readable
+    source code, which must be distributed under the terms of Sections
+    1 and 2 above on a medium customarily used for software interchange; or,
+
+    b) Accompany it with a written offer, valid for at least three
+    years, to give any third party, for a charge no more than your
+    cost of physically performing source distribution, a complete
+    machine-readable copy of the corresponding source code, to be
+    distributed under the terms of Sections 1 and 2 above on a medium
+    customarily used for software interchange; or,
+
+    c) Accompany it with the information you received as to the offer
+    to distribute corresponding source code.  (This alternative is
+    allowed only for noncommercial distribution and only if you
+    received the program in object code or executable form with such
+    an offer, in accord with Subsection b above.)
+
+The source code for a work means the preferred form of the work for
+making modifications to it.  For an executable work, complete source
+code means all the source code for all modules it contains, plus any
+associated interface definition files, plus the scripts used to
+control compilation and installation of the executable.  However, as a
+special exception, the source code distributed need not include
+anything that is normally distributed (in either source or binary
+form) with the major components (compiler, kernel, and so on) of the
+operating system on which the executable runs, unless that component
+itself accompanies the executable.
+
+If distribution of executable or object code is made by offering
+access to copy from a designated place, then offering equivalent
+access to copy the source code from the same place counts as
+distribution of the source code, even though third parties are not
+compelled to copy the source along with the object code.
+
+  4. You may not copy, modify, sublicense, or distribute the Program
+except as expressly provided under this License.  Any attempt
+otherwise to copy, modify, sublicense or distribute the Program is
+void, and will automatically terminate your rights under this License.
+However, parties who have received copies, or rights, from you under
+this License will not have their licenses terminated so long as such
+parties remain in full compliance.
+
+  5. You are not required to accept this License, since you have not
+signed it.  However, nothing else grants you permission to modify or
+distribute the Program or its derivative works.  These actions are
+prohibited by law if you do not accept this License.  Therefore, by
+modifying or distributing the Program (or any work based on the
+Program), you indicate your acceptance of this License to do so, and
+all its terms and conditions for copying, distributing or modifying
+the Program or works based on it.
+
+  6. Each time you redistribute the Program (or any work based on the
+Program), the recipient automatically receives a license from the
+original licensor to copy, distribute or modify the Program subject to
+these terms and conditions.  You may not impose any further
+restrictions on the recipients' exercise of the rights granted herein.
+You are not responsible for enforcing compliance by third parties to
+this License.
+
+  7. If, as a consequence of a court judgment or allegation of patent
+infringement or for any other reason (not limited to patent issues),
+conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License.  If you cannot
+distribute so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you
+may not distribute the Program at all.  For example, if a patent
+license would not permit royalty-free redistribution of the Program by
+all those who receive copies directly or indirectly through you, then
+the only way you could satisfy both it and this License would be to
+refrain entirely from distribution of the Program.
+
+If any portion of this section is held invalid or unenforceable under
+any particular circumstance, the balance of the section is intended to
+apply and the section as a whole is intended to apply in other
+circumstances.
+
+It is not the purpose of this section to induce you to infringe any
+patents or other property right claims or to contest validity of any
+such claims; this section has the sole purpose of protecting the
+integrity of the free software distribution system, which is
+implemented by public license practices.  Many people have made
+generous contributions to the wide range of software distributed
+through that system in reliance on consistent application of that
+system; it is up to the author/donor to decide if he or she is willing
+to distribute software through any other system and a licensee cannot
+impose that choice.
+
+This section is intended to make thoroughly clear what is believed to
+be a consequence of the rest of this License.
+
+  8. If the distribution and/or use of the Program is restricted in
+certain countries either by patents or by copyrighted interfaces, the
+original copyright holder who places the Program under this License
+may add an explicit geographical distribution limitation excluding
+those countries, so that distribution is permitted only in or among
+countries not thus excluded.  In such case, this License incorporates
+the limitation as if written in the body of this License.
+
+  9. The Free Software Foundation may publish revised and/or new versions
+of the General Public License from time to time.  Such new versions will
+be similar in spirit to the present version, but may differ in detail to
+address new problems or concerns.
+
+Each version is given a distinguishing version number.  If the Program
+specifies a version number of this License which applies to it and "any
+later version", you have the option of following the terms and conditions
+either of that version or of any later version published by the Free
+Software Foundation.  If the Program does not specify a version number of
+this License, you may choose any version ever published by the Free Software
+Foundation.
+
+  10. If you wish to incorporate parts of the Program into other free
+programs whose distribution conditions are different, write to the author
+to ask for permission.  For software which is copyrighted by the Free
+Software Foundation, write to the Free Software Foundation; we sometimes
+make exceptions for this.  Our decision will be guided by the two goals
+of preserving the free status of all derivatives of our free software and
+of promoting the sharing and reuse of software generally.
+
+			    NO WARRANTY
+
+  11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
+FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW.  EXCEPT WHEN
+OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
+PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED
+OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.  THE ENTIRE RISK AS
+TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU.  SHOULD THE
+PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING,
+REPAIR OR CORRECTION.
+
+  12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
+WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
+REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,
+INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING
+OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED
+TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY
+YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER
+PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGES.
+
+		     END OF TERMS AND CONDITIONS
+
+The software in the directory glibc and the file patches/glibc is licensed as
+follows:
+
+                  GNU LESSER GENERAL PUBLIC LICENSE
+                       Version 2.1, February 1999
+
+ Copyright (C) 1991, 1999 Free Software Foundation, Inc.
+ 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+[This is the first released version of the Lesser GPL.  It also counts
+ as the successor of the GNU Library Public License, version 2, hence
+ the version number 2.1.]
+
+                            Preamble
+
+  The licenses for most software are designed to take away your
+freedom to share and change it.  By contrast, the GNU General Public
+Licenses are intended to guarantee your freedom to share and change
+free software--to make sure the software is free for all its users.
+
+  This license, the Lesser General Public License, applies to some
+specially designated software packages--typically libraries--of the
+Free Software Foundation and other authors who decide to use it.  You
+can use it too, but we suggest you first think carefully about whether
+this license or the ordinary General Public License is the better
+strategy to use in any particular case, based on the explanations below.
+
+  When we speak of free software, we are referring to freedom of use,
+not price.  Our General Public Licenses are designed to make sure that
+you have the freedom to distribute copies of free software (and charge
+for this service if you wish); that you receive source code or can get
+it if you want it; that you can change the software and use pieces of
+it in new free programs; and that you are informed that you can do
+these things.
+
+  To protect your rights, we need to make restrictions that forbid
+distributors to deny you these rights or to ask you to surrender these
+rights.  These restrictions translate to certain responsibilities for
+you if you distribute copies of the library or if you modify it.
+
+  For example, if you distribute copies of the library, whether gratis
+or for a fee, you must give the recipients all the rights that we gave
+you.  You must make sure that they, too, receive or can get the source
+code.  If you link other code with the library, you must provide
+complete object files to the recipients, so that they can relink them
+with the library after making changes to the library and recompiling
+it.  And you must show them these terms so they know their rights.
+
+  We protect your rights with a two-step method: (1) we copyright the
+library, and (2) we offer you this license, which gives you legal
+permission to copy, distribute and/or modify the library.
+
+  To protect each distributor, we want to make it very clear that
+there is no warranty for the free library.  Also, if the library is
+modified by someone else and passed on, the recipients should know
+that what they have is not the original version, so that the original
+author's reputation will not be affected by problems that might be
+introduced by others.
+
+  Finally, software patents pose a constant threat to the existence of
+any free program.  We wish to make sure that a company cannot
+effectively restrict the users of a free program by obtaining a
+restrictive license from a patent holder.  Therefore, we insist that
+any patent license obtained for a version of the library must be
+consistent with the full freedom of use specified in this license.
+
+  Most GNU software, including some libraries, is covered by the
+ordinary GNU General Public License.  This license, the GNU Lesser
+General Public License, applies to certain designated libraries, and
+is quite different from the ordinary General Public License.  We use
+this license for certain libraries in order to permit linking those
+libraries into non-free programs.
+
+  When a program is linked with a library, whether statically or using
+a shared library, the combination of the two is legally speaking a
+combined work, a derivative of the original library.  The ordinary
+General Public License therefore permits such linking only if the
+entire combination fits its criteria of freedom.  The Lesser General
+Public License permits more lax criteria for linking other code with
+the library.
+
+  We call this license the "Lesser" General Public License because it
+does Less to protect the user's freedom than the ordinary General
+Public License.  It also provides other free software developers Less
+of an advantage over competing non-free programs.  These disadvantages
+are the reason we use the ordinary General Public License for many
+libraries.  However, the Lesser license provides advantages in certain
+special circumstances.
+
+  For example, on rare occasions, there may be a special need to
+encourage the widest possible use of a certain library, so that it becomes
+a de-facto standard.  To achieve this, non-free programs must be
+allowed to use the library.  A more frequent case is that a free
+library does the same job as widely used non-free libraries.  In this
+case, there is little to gain by limiting the free library to free
+software only, so we use the Lesser General Public License.
+
+  In other cases, permission to use a particular library in non-free
+programs enables a greater number of people to use a large body of
+free software.  For example, permission to use the GNU C Library in
+non-free programs enables many more people to use the whole GNU
+operating system, as well as its variant, the GNU/Linux operating
+system.
+
+  Although the Lesser General Public License is Less protective of the
+users' freedom, it does ensure that the user of a program that is
+linked with the Library has the freedom and the wherewithal to run
+that program using a modified version of the Library.
+
+  The precise terms and conditions for copying, distribution and
+modification follow.  Pay close attention to the difference between a
+"work based on the library" and a "work that uses the library".  The
+former contains code derived from the library, whereas the latter must
+be combined with the library in order to run.
+
+                  GNU LESSER GENERAL PUBLIC LICENSE
+   TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
+
+  0. This License Agreement applies to any software library or other
+program which contains a notice placed by the copyright holder or
+other authorized party saying it may be distributed under the terms of
+this Lesser General Public License (also called "this License").
+Each licensee is addressed as "you".
+
+  A "library" means a collection of software functions and/or data
+prepared so as to be conveniently linked with application programs
+(which use some of those functions and data) to form executables.
+
+  The "Library", below, refers to any such software library or work
+which has been distributed under these terms.  A "work based on the
+Library" means either the Library or any derivative work under
+copyright law: that is to say, a work containing the Library or a
+portion of it, either verbatim or with modifications and/or translated
+straightforwardly into another language.  (Hereinafter, translation is
+included without limitation in the term "modification".)
+
+  "Source code" for a work means the preferred form of the work for
+making modifications to it.  For a library, complete source code means
+all the source code for all modules it contains, plus any associated
+interface definition files, plus the scripts used to control compilation
+and installation of the library.
+
+  Activities other than copying, distribution and modification are not
+covered by this License; they are outside its scope.  The act of
+running a program using the Library is not restricted, and output from
+such a program is covered only if its contents constitute a work based
+on the Library (independent of the use of the Library in a tool for
+writing it).  Whether that is true depends on what the Library does
+and what the program that uses the Library does.
+
+  1. You may copy and distribute verbatim copies of the Library's
+complete source code as you receive it, in any medium, provided that
+you conspicuously and appropriately publish on each copy an
+appropriate copyright notice and disclaimer of warranty; keep intact
+all the notices that refer to this License and to the absence of any
+warranty; and distribute a copy of this License along with the
+Library.
+
+  You may charge a fee for the physical act of transferring a copy,
+and you may at your option offer warranty protection in exchange for a
+fee.
+
+  2. You may modify your copy or copies of the Library or any portion
+of it, thus forming a work based on the Library, and copy and
+distribute such modifications or work under the terms of Section 1
+above, provided that you also meet all of these conditions:
+
+    a) The modified work must itself be a software library.
+
+    b) You must cause the files modified to carry prominent notices
+    stating that you changed the files and the date of any change.
+
+    c) You must cause the whole of the work to be licensed at no
+    charge to all third parties under the terms of this License.
+
+    d) If a facility in the modified Library refers to a function or a
+    table of data to be supplied by an application program that uses
+    the facility, other than as an argument passed when the facility
+    is invoked, then you must make a good faith effort to ensure that,
+    in the event an application does not supply such function or
+    table, the facility still operates, and performs whatever part of
+    its purpose remains meaningful.
+
+    (For example, a function in a library to compute square roots has
+    a purpose that is entirely well-defined independent of the
+    application.  Therefore, Subsection 2d requires that any
+    application-supplied function or table used by this function must
+    be optional: if the application does not supply it, the square
+    root function must still compute square roots.)
+
+These requirements apply to the modified work as a whole.  If
+identifiable sections of that work are not derived from the Library,
+and can be reasonably considered independent and separate works in
+themselves, then this License, and its terms, do not apply to those
+sections when you distribute them as separate works.  But when you
+distribute the same sections as part of a whole which is a work based
+on the Library, the distribution of the whole must be on the terms of
+this License, whose permissions for other licensees extend to the
+entire whole, and thus to each and every part regardless of who wrote
+it.
+
+Thus, it is not the intent of this section to claim rights or contest
+your rights to work written entirely by you; rather, the intent is to
+exercise the right to control the distribution of derivative or
+collective works based on the Library.
+
+In addition, mere aggregation of another work not based on the Library
+with the Library (or with a work based on the Library) on a volume of
+a storage or distribution medium does not bring the other work under
+the scope of this License.
+
+  3. You may opt to apply the terms of the ordinary GNU General Public
+License instead of this License to a given copy of the Library.  To do
+this, you must alter all the notices that refer to this License, so
+that they refer to the ordinary GNU General Public License, version 2,
+instead of to this License.  (If a newer version than version 2 of the
+ordinary GNU General Public License has appeared, then you can specify
+that version instead if you wish.)  Do not make any other change in
+these notices.
+
+  Once this change is made in a given copy, it is irreversible for
+that copy, so the ordinary GNU General Public License applies to all
+subsequent copies and derivative works made from that copy.
+
+  This option is useful when you wish to copy part of the code of
+the Library into a program that is not a library.
+
+  4. You may copy and distribute the Library (or a portion or
+derivative of it, under Section 2) in object code or executable form
+under the terms of Sections 1 and 2 above provided that you accompany
+it with the complete corresponding machine-readable source code, which
+must be distributed under the terms of Sections 1 and 2 above on a
+medium customarily used for software interchange.
+
+  If distribution of object code is made by offering access to copy
+from a designated place, then offering equivalent access to copy the
+source code from the same place satisfies the requirement to
+distribute the source code, even though third parties are not
+compelled to copy the source along with the object code.
+
+  5. A program that contains no derivative of any portion of the
+Library, but is designed to work with the Library by being compiled or
+linked with it, is called a "work that uses the Library".  Such a
+work, in isolation, is not a derivative work of the Library, and
+therefore falls outside the scope of this License.
+
+  However, linking a "work that uses the Library" with the Library
+creates an executable that is a derivative of the Library (because it
+contains portions of the Library), rather than a "work that uses the
+library".  The executable is therefore covered by this License.
+Section 6 states terms for distribution of such executables.
+
+  When a "work that uses the Library" uses material from a header file
+that is part of the Library, the object code for the work may be a
+derivative work of the Library even though the source code is not.
+Whether this is true is especially significant if the work can be
+linked without the Library, or if the work is itself a library.  The
+threshold for this to be true is not precisely defined by law.
+
+  If such an object file uses only numerical parameters, data
+structure layouts and accessors, and small macros and small inline
+functions (ten lines or less in length), then the use of the object
+file is unrestricted, regardless of whether it is legally a derivative
+work.  (Executables containing this object code plus portions of the
+Library will still fall under Section 6.)
+
+  Otherwise, if the work is a derivative of the Library, you may
+distribute the object code for the work under the terms of Section 6.
+Any executables containing that work also fall under Section 6,
+whether or not they are linked directly with the Library itself.
+
+  6. As an exception to the Sections above, you may also combine or
+link a "work that uses the Library" with the Library to produce a
+work containing portions of the Library, and distribute that work
+under terms of your choice, provided that the terms permit
+modification of the work for the customer's own use and reverse
+engineering for debugging such modifications.
+
+  You must give prominent notice with each copy of the work that the
+Library is used in it and that the Library and its use are covered by
+this License.  You must supply a copy of this License.  If the work
+during execution displays copyright notices, you must include the
+copyright notice for the Library among them, as well as a reference
+directing the user to the copy of this License.  Also, you must do one
+of these things:
+
+    a) Accompany the work with the complete corresponding
+    machine-readable source code for the Library including whatever
+    changes were used in the work (which must be distributed under
+    Sections 1 and 2 above); and, if the work is an executable linked
+    with the Library, with the complete machine-readable "work that
+    uses the Library", as object code and/or source code, so that the
+    user can modify the Library and then relink to produce a modified
+    executable containing the modified Library.  (It is understood
+    that the user who changes the contents of definitions files in the
+    Library will not necessarily be able to recompile the application
+    to use the modified definitions.)
+
+    b) Use a suitable shared library mechanism for linking with the
+    Library.  A suitable mechanism is one that (1) uses at run time a
+    copy of the library already present on the user's computer system,
+    rather than copying library functions into the executable, and (2)
+    will operate properly with a modified version of the library, if
+    the user installs one, as long as the modified version is
+    interface-compatible with the version that the work was made with.
+
+    c) Accompany the work with a written offer, valid for at
+    least three years, to give the same user the materials
+    specified in Subsection 6a, above, for a charge no more
+    than the cost of performing this distribution.
+
+    d) If distribution of the work is made by offering access to copy
+    from a designated place, offer equivalent access to copy the above
+    specified materials from the same place.
+
+    e) Verify that the user has already received a copy of these
+    materials or that you have already sent this user a copy.
+
+  For an executable, the required form of the "work that uses the
+Library" must include any data and utility programs needed for
+reproducing the executable from it.  However, as a special exception,
+the materials to be distributed need not include anything that is
+normally distributed (in either source or binary form) with the major
+components (compiler, kernel, and so on) of the operating system on
+which the executable runs, unless that component itself accompanies
+the executable.
+
+  It may happen that this requirement contradicts the license
+restrictions of other proprietary libraries that do not normally
+accompany the operating system.  Such a contradiction means you cannot
+use both them and the Library together in an executable that you
+distribute.
+
+  7. You may place library facilities that are a work based on the
+Library side-by-side in a single library together with other library
+facilities not covered by this License, and distribute such a combined
+library, provided that the separate distribution of the work based on
+the Library and of the other library facilities is otherwise
+permitted, and provided that you do these two things:
+
+    a) Accompany the combined library with a copy of the same work
+    based on the Library, uncombined with any other library
+    facilities.  This must be distributed under the terms of the
+    Sections above.
+
+    b) Give prominent notice with the combined library of the fact
+    that part of it is a work based on the Library, and explaining
+    where to find the accompanying uncombined form of the same work.
+
+  8. You may not copy, modify, sublicense, link with, or distribute
+the Library except as expressly provided under this License.  Any
+attempt otherwise to copy, modify, sublicense, link with, or
+distribute the Library is void, and will automatically terminate your
+rights under this License.  However, parties who have received copies,
+or rights, from you under this License will not have their licenses
+terminated so long as such parties remain in full compliance.
+
+  9. You are not required to accept this License, since you have not
+signed it.  However, nothing else grants you permission to modify or
+distribute the Library or its derivative works.  These actions are
+prohibited by law if you do not accept this License.  Therefore, by
+modifying or distributing the Library (or any work based on the
+Library), you indicate your acceptance of this License to do so, and
+all its terms and conditions for copying, distributing or modifying
+the Library or works based on it.
+
+  10. Each time you redistribute the Library (or any work based on the
+Library), the recipient automatically receives a license from the
+original licensor to copy, distribute, link with or modify the Library
+subject to these terms and conditions.  You may not impose any further
+restrictions on the recipients' exercise of the rights granted herein.
+You are not responsible for enforcing compliance by third parties with
+this License.
+
+  11. If, as a consequence of a court judgment or allegation of patent
+infringement or for any other reason (not limited to patent issues),
+conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License.  If you cannot
+distribute so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you
+may not distribute the Library at all.  For example, if a patent
+license would not permit royalty-free redistribution of the Library by
+all those who receive copies directly or indirectly through you, then
+the only way you could satisfy both it and this License would be to
+refrain entirely from distribution of the Library.
+
+If any portion of this section is held invalid or unenforceable under any
+particular circumstance, the balance of the section is intended to apply,
+and the section as a whole is intended to apply in other circumstances.
+
+It is not the purpose of this section to induce you to infringe any
+patents or other property right claims or to contest validity of any
+such claims; this section has the sole purpose of protecting the
+integrity of the free software distribution system which is
+implemented by public license practices.  Many people have made
+generous contributions to the wide range of software distributed
+through that system in reliance on consistent application of that
+system; it is up to the author/donor to decide if he or she is willing
+to distribute software through any other system and a licensee cannot
+impose that choice.
+
+This section is intended to make thoroughly clear what is believed to
+be a consequence of the rest of this License.
+
+  12. If the distribution and/or use of the Library is restricted in
+certain countries either by patents or by copyrighted interfaces, the
+original copyright holder who places the Library under this License may add
+an explicit geographical distribution limitation excluding those countries,
+so that distribution is permitted only in or among countries not thus
+excluded.  In such case, this License incorporates the limitation as if
+written in the body of this License.
+
+  13. The Free Software Foundation may publish revised and/or new
+versions of the Lesser General Public License from time to time.
+Such new versions will be similar in spirit to the present version,
+but may differ in detail to address new problems or concerns.
+
+Each version is given a distinguishing version number.  If the Library
+specifies a version number of this License which applies to it and
+"any later version", you have the option of following the terms and
+conditions either of that version or of any later version published by
+the Free Software Foundation.  If the Library does not specify a
+license version number, you may choose any version ever published by
+the Free Software Foundation.
+
+  14. If you wish to incorporate parts of the Library into other free
+programs whose distribution conditions are incompatible with these,
+write to the author to ask for permission.  For software which is
+copyrighted by the Free Software Foundation, write to the Free
+Software Foundation; we sometimes make exceptions for this.  Our
+decision will be guided by the two goals of preserving the free status
+of all derivatives of our free software and of promoting the sharing
+and reuse of software generally.
+
+                            NO WARRANTY
+
+  15. BECAUSE THE LIBRARY IS LICENSED FREE OF CHARGE, THERE IS NO
+WARRANTY FOR THE LIBRARY, TO THE EXTENT PERMITTED BY APPLICABLE LAW.
+EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR
+OTHER PARTIES PROVIDE THE LIBRARY "AS IS" WITHOUT WARRANTY OF ANY
+KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE.  THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE
+LIBRARY IS WITH YOU.  SHOULD THE LIBRARY PROVE DEFECTIVE, YOU ASSUME
+THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
+
+  16. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN
+WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY
+AND/OR REDISTRIBUTE THE LIBRARY AS PERMITTED ABOVE, BE LIABLE TO YOU
+FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR
+CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE
+LIBRARY (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING
+RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A
+FAILURE OF THE LIBRARY TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF
+SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH
+DAMAGES.
+
+                     END OF TERMS AND CONDITIONS
diff --git a/utils/gapy/gen-debug-info-src/ext/alloca-conf.h b/utils/gapy/gen-debug-info-src/ext/alloca-conf.h
new file mode 100644
index 000000000..0e9e2c378
--- /dev/null
+++ b/utils/gapy/gen-debug-info-src/ext/alloca-conf.h
@@ -0,0 +1,60 @@
+/* Copyright 2012 Free Software Foundation, Inc.
+
+   This program is free software: you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
+
+#include "config.h"
+
+/* This is a merge of code recommended in the autoconf-2.61 documentation
+   with that recommended in the autoconf-2.13 documentation, with added
+   tweaks to heed C_ALLOCA.  */
+
+#if defined HAVE_ALLOCA_H && !defined C_ALLOCA
+# include <alloca.h>
+#else
+# if defined __GNUC__ && !defined C_ALLOCA
+#  if !defined alloca
+#   define alloca __builtin_alloca
+#  endif
+# else
+#  if defined _AIX
+/* Indented so that pre-ansi C compilers will ignore it, rather than
+   choke on it.  Some versions of AIX require this to be the first
+   thing seen by the compiler except for comments and preprocessor
+   directives.  */
+    #pragma alloca
+#  else
+#   if defined _MSC_VER && !defined C_ALLOCA
+#    include <malloc.h>
+#    define alloca _alloca
+#   else
+#    if !defined alloca
+#     if defined __STDC__ || defined __hpux
+#      if defined HAVE_STDDEF_H
+#       include <stddef.h>
+#       if defined  __cplusplus
+extern "C" void *alloca (size_t);
+#       else
+extern void *alloca (size_t);
+#       endif
+#      else
+extern void *alloca ();
+#      endif
+#     else
+extern char *alloca ();
+#     endif
+#    endif
+#   endif
+#  endif
+# endif
+#endif
diff --git a/utils/gapy/gen-debug-info-src/ext/ansidecl.h b/utils/gapy/gen-debug-info-src/ext/ansidecl.h
new file mode 100644
index 000000000..6e4bfc21f
--- /dev/null
+++ b/utils/gapy/gen-debug-info-src/ext/ansidecl.h
@@ -0,0 +1,329 @@
+/* ANSI and traditional C compatibility macros
+   Copyright (C) 1991-2015 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA 02110-1301, USA.  */
+
+/* ANSI and traditional C compatibility macros
+
+   ANSI C is assumed if __STDC__ is #defined.
+
+   Macro		ANSI C definition	Traditional C definition
+   -----		---- - ----------	----------- - ----------
+   PTR			`void *'		`char *'
+   const		not defined		`'
+   volatile		not defined		`'
+   signed		not defined		`'
+
+   For ease of writing code which uses GCC extensions but needs to be
+   portable to other compilers, we provide the GCC_VERSION macro that
+   simplifies testing __GNUC__ and __GNUC_MINOR__ together, and various
+   wrappers around __attribute__.  Also, __extension__ will be #defined
+   to nothing if it doesn't work.  See below.  */
+
+#ifndef	_ANSIDECL_H
+#define _ANSIDECL_H	1
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Every source file includes this file,
+   so they will all get the switch for lint.  */
+/* LINTLIBRARY */
+
+/* Using MACRO(x,y) in cpp #if conditionals does not work with some
+   older preprocessors.  Thus we can't define something like this:
+
+#define HAVE_GCC_VERSION(MAJOR, MINOR) \
+  (__GNUC__ > (MAJOR) || (__GNUC__ == (MAJOR) && __GNUC_MINOR__ >= (MINOR)))
+
+and then test "#if HAVE_GCC_VERSION(2,7)".
+
+So instead we use the macro below and test it against specific values.  */
+
+/* This macro simplifies testing whether we are using gcc, and if it
+   is of a particular minimum version. (Both major & minor numbers are
+   significant.)  This macro will evaluate to 0 if we are not using
+   gcc at all.  */
+#ifndef GCC_VERSION
+#define GCC_VERSION (__GNUC__ * 1000 + __GNUC_MINOR__)
+#endif /* GCC_VERSION */
+
+#if defined (__STDC__) || defined(__cplusplus) || defined (_AIX) || (defined (__mips) && defined (_SYSTYPE_SVR4)) || defined(_WIN32)
+/* All known AIX compilers implement these things (but don't always
+   define __STDC__).  The RISC/OS MIPS compiler defines these things
+   in SVR4 mode, but does not define __STDC__.  */
+/* eraxxon@alumni.rice.edu: The Compaq C++ compiler, unlike many other
+   C++ compilers, does not define __STDC__, though it acts as if this
+   was so. (Verified versions: 5.7, 6.2, 6.3, 6.5) */
+
+#define PTR		void *
+
+#undef const
+#undef volatile
+#undef signed
+
+/* inline requires special treatment; it's in C99, and GCC >=2.7 supports
+   it too, but it's not in C89.  */
+#undef inline
+#if __STDC_VERSION__ >= 199901L || defined(__cplusplus) || (defined(__SUNPRO_C) && defined(__C99FEATURES__))
+/* it's a keyword */
+#else
+# if GCC_VERSION >= 2007
+#  define inline __inline__   /* __inline__ prevents -pedantic warnings */
+# else
+#  define inline  /* nothing */
+# endif
+#endif
+
+#else	/* Not ANSI C.  */
+
+#define PTR		char *
+
+/* some systems define these in header files for non-ansi mode */
+#undef const
+#undef volatile
+#undef signed
+#undef inline
+#define const
+#define volatile
+#define signed
+#define inline
+
+#endif	/* ANSI C.  */
+
+/* Define macros for some gcc attributes.  This permits us to use the
+   macros freely, and know that they will come into play for the
+   version of gcc in which they are supported.  */
+
+#if (GCC_VERSION < 2007)
+# define __attribute__(x)
+#endif
+
+/* Attribute __malloc__ on functions was valid as of gcc 2.96. */
+#ifndef ATTRIBUTE_MALLOC
+# if (GCC_VERSION >= 2096)
+#  define ATTRIBUTE_MALLOC __attribute__ ((__malloc__))
+# else
+#  define ATTRIBUTE_MALLOC
+# endif /* GNUC >= 2.96 */
+#endif /* ATTRIBUTE_MALLOC */
+
+/* Attributes on labels were valid as of gcc 2.93 and g++ 4.5.  For
+   g++ an attribute on a label must be followed by a semicolon.  */
+#ifndef ATTRIBUTE_UNUSED_LABEL
+# ifndef __cplusplus
+#  if GCC_VERSION >= 2093
+#   define ATTRIBUTE_UNUSED_LABEL ATTRIBUTE_UNUSED
+#  else
+#   define ATTRIBUTE_UNUSED_LABEL
+#  endif
+# else
+#  if GCC_VERSION >= 4005
+#   define ATTRIBUTE_UNUSED_LABEL ATTRIBUTE_UNUSED ;
+#  else
+#   define ATTRIBUTE_UNUSED_LABEL
+#  endif
+# endif
+#endif
+
+/* Similarly to ARG_UNUSED below.  Prior to GCC 3.4, the C++ frontend
+   couldn't parse attributes placed after the identifier name, and now
+   the entire compiler is built with C++.  */
+#ifndef ATTRIBUTE_UNUSED
+#if GCC_VERSION >= 3004
+#  define ATTRIBUTE_UNUSED __attribute__ ((__unused__))
+#else
+#define ATTRIBUTE_UNUSED
+#endif
+#endif /* ATTRIBUTE_UNUSED */
+
+/* Before GCC 3.4, the C++ frontend couldn't parse attributes placed after the
+   identifier name.  */
+#if ! defined(__cplusplus) || (GCC_VERSION >= 3004)
+# define ARG_UNUSED(NAME) NAME ATTRIBUTE_UNUSED
+#else /* !__cplusplus || GNUC >= 3.4 */
+# define ARG_UNUSED(NAME) NAME
+#endif /* !__cplusplus || GNUC >= 3.4 */
+
+#ifndef ATTRIBUTE_NORETURN
+#define ATTRIBUTE_NORETURN __attribute__ ((__noreturn__))
+#endif /* ATTRIBUTE_NORETURN */
+
+/* Attribute `nonnull' was valid as of gcc 3.3.  */
+#ifndef ATTRIBUTE_NONNULL
+# if (GCC_VERSION >= 3003)
+#  define ATTRIBUTE_NONNULL(m) __attribute__ ((__nonnull__ (m)))
+# else
+#  define ATTRIBUTE_NONNULL(m)
+# endif /* GNUC >= 3.3 */
+#endif /* ATTRIBUTE_NONNULL */
+
+/* Attribute `returns_nonnull' was valid as of gcc 4.9.  */
+#ifndef ATTRIBUTE_RETURNS_NONNULL
+# if (GCC_VERSION >= 4009)
+#  define ATTRIBUTE_RETURNS_NONNULL __attribute__ ((__returns_nonnull__))
+# else
+#  define ATTRIBUTE_RETURNS_NONNULL
+# endif /* GNUC >= 4.9 */
+#endif /* ATTRIBUTE_RETURNS_NONNULL */
+
+/* Attribute `pure' was valid as of gcc 3.0.  */
+#ifndef ATTRIBUTE_PURE
+# if (GCC_VERSION >= 3000)
+#  define ATTRIBUTE_PURE __attribute__ ((__pure__))
+# else
+#  define ATTRIBUTE_PURE
+# endif /* GNUC >= 3.0 */
+#endif /* ATTRIBUTE_PURE */
+
+/* Use ATTRIBUTE_PRINTF when the format specifier must not be NULL.
+   This was the case for the `printf' format attribute by itself
+   before GCC 3.3, but as of 3.3 we need to add the `nonnull'
+   attribute to retain this behavior.  */
+#ifndef ATTRIBUTE_PRINTF
+#define ATTRIBUTE_PRINTF(m, n) __attribute__ ((__format__ (__printf__, m, n))) ATTRIBUTE_NONNULL(m)
+#define ATTRIBUTE_PRINTF_1 ATTRIBUTE_PRINTF(1, 2)
+#define ATTRIBUTE_PRINTF_2 ATTRIBUTE_PRINTF(2, 3)
+#define ATTRIBUTE_PRINTF_3 ATTRIBUTE_PRINTF(3, 4)
+#define ATTRIBUTE_PRINTF_4 ATTRIBUTE_PRINTF(4, 5)
+#define ATTRIBUTE_PRINTF_5 ATTRIBUTE_PRINTF(5, 6)
+#endif /* ATTRIBUTE_PRINTF */
+
+/* Use ATTRIBUTE_FPTR_PRINTF when the format attribute is to be set on
+   a function pointer.  Format attributes were allowed on function
+   pointers as of gcc 3.1.  */
+#ifndef ATTRIBUTE_FPTR_PRINTF
+# if (GCC_VERSION >= 3001)
+#  define ATTRIBUTE_FPTR_PRINTF(m, n) ATTRIBUTE_PRINTF(m, n)
+# else
+#  define ATTRIBUTE_FPTR_PRINTF(m, n)
+# endif /* GNUC >= 3.1 */
+# define ATTRIBUTE_FPTR_PRINTF_1 ATTRIBUTE_FPTR_PRINTF(1, 2)
+# define ATTRIBUTE_FPTR_PRINTF_2 ATTRIBUTE_FPTR_PRINTF(2, 3)
+# define ATTRIBUTE_FPTR_PRINTF_3 ATTRIBUTE_FPTR_PRINTF(3, 4)
+# define ATTRIBUTE_FPTR_PRINTF_4 ATTRIBUTE_FPTR_PRINTF(4, 5)
+# define ATTRIBUTE_FPTR_PRINTF_5 ATTRIBUTE_FPTR_PRINTF(5, 6)
+#endif /* ATTRIBUTE_FPTR_PRINTF */
+
+/* Use ATTRIBUTE_NULL_PRINTF when the format specifier may be NULL.  A
+   NULL format specifier was allowed as of gcc 3.3.  */
+#ifndef ATTRIBUTE_NULL_PRINTF
+# if (GCC_VERSION >= 3003)
+#  define ATTRIBUTE_NULL_PRINTF(m, n) __attribute__ ((__format__ (__printf__, m, n)))
+# else
+#  define ATTRIBUTE_NULL_PRINTF(m, n)
+# endif /* GNUC >= 3.3 */
+# define ATTRIBUTE_NULL_PRINTF_1 ATTRIBUTE_NULL_PRINTF(1, 2)
+# define ATTRIBUTE_NULL_PRINTF_2 ATTRIBUTE_NULL_PRINTF(2, 3)
+# define ATTRIBUTE_NULL_PRINTF_3 ATTRIBUTE_NULL_PRINTF(3, 4)
+# define ATTRIBUTE_NULL_PRINTF_4 ATTRIBUTE_NULL_PRINTF(4, 5)
+# define ATTRIBUTE_NULL_PRINTF_5 ATTRIBUTE_NULL_PRINTF(5, 6)
+#endif /* ATTRIBUTE_NULL_PRINTF */
+
+/* Attribute `sentinel' was valid as of gcc 3.5.  */
+#ifndef ATTRIBUTE_SENTINEL
+# if (GCC_VERSION >= 3005)
+#  define ATTRIBUTE_SENTINEL __attribute__ ((__sentinel__))
+# else
+#  define ATTRIBUTE_SENTINEL
+# endif /* GNUC >= 3.5 */
+#endif /* ATTRIBUTE_SENTINEL */
+
+
+#ifndef ATTRIBUTE_ALIGNED_ALIGNOF
+# if (GCC_VERSION >= 3000)
+#  define ATTRIBUTE_ALIGNED_ALIGNOF(m) __attribute__ ((__aligned__ (__alignof__ (m))))
+# else
+#  define ATTRIBUTE_ALIGNED_ALIGNOF(m)
+# endif /* GNUC >= 3.0 */
+#endif /* ATTRIBUTE_ALIGNED_ALIGNOF */
+
+/* Useful for structures whose layout must match some binary specification
+   regardless of the alignment and padding qualities of the compiler.  */
+#ifndef ATTRIBUTE_PACKED
+# define ATTRIBUTE_PACKED __attribute__ ((packed))
+#endif
+
+/* Attributes `hot' and `cold' were valid as of gcc 4.3.  */
+#ifndef ATTRIBUTE_COLD
+# if (GCC_VERSION >= 4003)
+#  define ATTRIBUTE_COLD __attribute__ ((__cold__))
+# else
+#  define ATTRIBUTE_COLD
+# endif /* GNUC >= 4.3 */
+#endif /* ATTRIBUTE_COLD */
+#ifndef ATTRIBUTE_HOT
+# if (GCC_VERSION >= 4003)
+#  define ATTRIBUTE_HOT __attribute__ ((__hot__))
+# else
+#  define ATTRIBUTE_HOT
+# endif /* GNUC >= 4.3 */
+#endif /* ATTRIBUTE_HOT */
+
+/* Attribute 'no_sanitize_undefined' was valid as of gcc 4.9.  */
+#ifndef ATTRIBUTE_NO_SANITIZE_UNDEFINED
+# if (GCC_VERSION >= 4009)
+#  define ATTRIBUTE_NO_SANITIZE_UNDEFINED __attribute__ ((no_sanitize_undefined))
+# else
+#  define ATTRIBUTE_NO_SANITIZE_UNDEFINED
+# endif /* GNUC >= 4.9 */
+#endif /* ATTRIBUTE_NO_SANITIZE_UNDEFINED */
+
+/* We use __extension__ in some places to suppress -pedantic warnings
+   about GCC extensions.  This feature didn't work properly before
+   gcc 2.8.  */
+#if GCC_VERSION < 2008
+#define __extension__
+#endif
+
+/* This is used to declare a const variable which should be visible
+   outside of the current compilation unit.  Use it as
+     EXPORTED_CONST int i = 1;
+   This is because the semantics of const are different in C and C++.
+   "extern const" is permitted in C but it looks strange, and gcc
+   warns about it when -Wc++-compat is not used.  */
+#ifdef __cplusplus
+#define EXPORTED_CONST extern const
+#else
+#define EXPORTED_CONST const
+#endif
+
+/* Be conservative and only use enum bitfields with C++ or GCC.
+   FIXME: provide a complete autoconf test for buggy enum bitfields.  */
+
+#ifdef __cplusplus
+#define ENUM_BITFIELD(TYPE) enum TYPE
+#elif (GCC_VERSION > 2000)
+#define ENUM_BITFIELD(TYPE) __extension__ enum TYPE
+#else
+#define ENUM_BITFIELD(TYPE) unsigned int
+#endif
+
+    /* This is used to mark a class or virtual function as final.  */
+#if __cplusplus >= 201103L
+#define GCC_FINAL final
+#elif GCC_VERSION >= 4007
+#define GCC_FINAL __final
+#else
+#define GCC_FINAL
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif	/* ansidecl.h	*/
diff --git a/utils/gapy/gen-debug-info-src/ext/bfd/bfd-in3.h b/utils/gapy/gen-debug-info-src/ext/bfd/bfd-in3.h
new file mode 100644
index 000000000..1c0cf66cf
--- /dev/null
+++ b/utils/gapy/gen-debug-info-src/ext/bfd/bfd-in3.h
@@ -0,0 +1,7720 @@
+/* DO NOT EDIT!  -*- buffer-read-only: t -*-  This file is automatically 
+   generated from "bfd-in.h", "init.c", "opncls.c", "libbfd.c", 
+   "bfdio.c", "bfdwin.c", "section.c", "archures.c", "reloc.c", 
+   "syms.c", "bfd.c", "archive.c", "corefile.c", "targets.c", "format.c", 
+   "linker.c", "simple.c" and "compress.c".
+   Run "make headers" in your build bfd/ to regenerate.  */
+
+/* Main header file for the bfd library -- portable access to object files.
+
+   Copyright (C) 1990-2017 Free Software Foundation, Inc.
+
+   Contributed by Cygnus Support.
+
+   This file is part of BFD, the Binary File Descriptor library.
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA 02110-1301, USA.  */
+
+#ifndef __BFD_H_SEEN__
+#define __BFD_H_SEEN__
+
+/* PR 14072: Ensure that config.h is included first.  */
+#if !defined PACKAGE && !defined PACKAGE_VERSION
+#error config.h must be included before this header
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "ansidecl.h"
+#include "symcat.h"
+#include <stdarg.h>
+#include <sys/stat.h>
+
+#if defined (__STDC__) || defined (ALMOST_STDC) || defined (HAVE_STRINGIZE)
+#ifndef SABER
+/* This hack is to avoid a problem with some strict ANSI C preprocessors.
+   The problem is, "32_" is not a valid preprocessing token, and we don't
+   want extra underscores (e.g., "nlm_32_").  The XCONCAT2 macro will
+   cause the inner CONCAT2 macros to be evaluated first, producing
+   still-valid pp-tokens.  Then the final concatenation can be done.  */
+#undef CONCAT4
+#define CONCAT4(a,b,c,d) XCONCAT2(CONCAT2(a,b),CONCAT2(c,d))
+#endif
+#endif
+
+/* This is a utility macro to handle the situation where the code
+   wants to place a constant string into the code, followed by a
+   comma and then the length of the string.  Doing this by hand
+   is error prone, so using this macro is safer.  */
+#define STRING_COMMA_LEN(STR) (STR), (sizeof (STR) - 1)
+/* Unfortunately it is not possible to use the STRING_COMMA_LEN macro
+   to create the arguments to another macro, since the preprocessor
+   will mis-count the number of arguments to the outer macro (by not
+   evaluating STRING_COMMA_LEN and so missing the comma).  This is a
+   problem for example when trying to use STRING_COMMA_LEN to build
+   the arguments to the strncmp() macro.  Hence this alternative
+   definition of strncmp is provided here.
+
+   Note - these macros do NOT work if STR2 is not a constant string.  */
+#define CONST_STRNEQ(STR1,STR2) (strncmp ((STR1), (STR2), sizeof (STR2) - 1) == 0)
+  /* strcpy() can have a similar problem, but since we know we are
+     copying a constant string, we can use memcpy which will be faster
+     since there is no need to check for a NUL byte inside STR.  We
+     can also save time if we do not need to copy the terminating NUL.  */
+#define LITMEMCPY(DEST,STR2) memcpy ((DEST), (STR2), sizeof (STR2) - 1)
+#define LITSTRCPY(DEST,STR2) memcpy ((DEST), (STR2), sizeof (STR2))
+
+
+#define BFD_SUPPORTS_PLUGINS 1
+
+/* The word size used by BFD on the host.  This may be 64 with a 32
+   bit target if the host is 64 bit, or if other 64 bit targets have
+   been selected with --enable-targets, or if --enable-64-bit-bfd.  */
+#define BFD_ARCH_SIZE 64
+
+/* The word size of the default bfd target.  */
+#define BFD_DEFAULT_TARGET_SIZE 32
+
+#define BFD_HOST_64BIT_LONG 1
+#define BFD_HOST_64BIT_LONG_LONG 0
+#if 1
+#define BFD_HOST_64_BIT long
+#define BFD_HOST_U_64_BIT unsigned long
+typedef BFD_HOST_64_BIT bfd_int64_t;
+typedef BFD_HOST_U_64_BIT bfd_uint64_t;
+#endif
+
+#if BFD_ARCH_SIZE >= 64
+#define BFD64
+#endif
+
+#ifndef INLINE
+#if __GNUC__ >= 2
+#define INLINE __inline__
+#else
+#define INLINE
+#endif
+#endif
+
+/* Declaring a type wide enough to hold a host long and a host pointer.  */
+#define BFD_HOSTPTR_T	unsigned long
+typedef BFD_HOSTPTR_T bfd_hostptr_t;
+
+/* Forward declaration.  */
+typedef struct bfd bfd;
+
+/* Boolean type used in bfd.  Too many systems define their own
+   versions of "boolean" for us to safely typedef a "boolean" of
+   our own.  Using an enum for "bfd_boolean" has its own set of
+   problems, with strange looking casts required to avoid warnings
+   on some older compilers.  Thus we just use an int.
+
+   General rule: Functions which are bfd_boolean return TRUE on
+   success and FALSE on failure (unless they're a predicate).  */
+
+typedef int bfd_boolean;
+#undef FALSE
+#undef TRUE
+#define FALSE 0
+#define TRUE 1
+
+#ifdef BFD64
+
+#ifndef BFD_HOST_64_BIT
+ #error No 64 bit integer type available
+#endif /* ! defined (BFD_HOST_64_BIT) */
+
+typedef BFD_HOST_U_64_BIT bfd_vma;
+typedef BFD_HOST_64_BIT bfd_signed_vma;
+typedef BFD_HOST_U_64_BIT bfd_size_type;
+typedef BFD_HOST_U_64_BIT symvalue;
+
+#if BFD_HOST_64BIT_LONG
+#define BFD_VMA_FMT "l"
+#elif defined (__MSVCRT__)
+#define BFD_VMA_FMT "I64"
+#else
+#define BFD_VMA_FMT "ll"
+#endif
+
+#ifndef fprintf_vma
+#define sprintf_vma(s,x) sprintf (s, "%016" BFD_VMA_FMT "x", x)
+#define fprintf_vma(f,x) fprintf (f, "%016" BFD_VMA_FMT "x", x)
+#endif
+
+#else /* not BFD64  */
+
+/* Represent a target address.  Also used as a generic unsigned type
+   which is guaranteed to be big enough to hold any arithmetic types
+   we need to deal with.  */
+typedef unsigned long bfd_vma;
+
+/* A generic signed type which is guaranteed to be big enough to hold any
+   arithmetic types we need to deal with.  Can be assumed to be compatible
+   with bfd_vma in the same way that signed and unsigned ints are compatible
+   (as parameters, in assignment, etc).  */
+typedef long bfd_signed_vma;
+
+typedef unsigned long symvalue;
+typedef unsigned long bfd_size_type;
+
+/* Print a bfd_vma x on stream s.  */
+#define BFD_VMA_FMT "l"
+#define fprintf_vma(s,x) fprintf (s, "%08" BFD_VMA_FMT "x", x)
+#define sprintf_vma(s,x) sprintf (s, "%08" BFD_VMA_FMT "x", x)
+
+#endif /* not BFD64  */
+
+#define HALF_BFD_SIZE_TYPE \
+  (((bfd_size_type) 1) << (8 * sizeof (bfd_size_type) / 2))
+
+#ifndef BFD_HOST_64_BIT
+/* Fall back on a 32 bit type.  The idea is to make these types always
+   available for function return types, but in the case that
+   BFD_HOST_64_BIT is undefined such a function should abort or
+   otherwise signal an error.  */
+typedef bfd_signed_vma bfd_int64_t;
+typedef bfd_vma bfd_uint64_t;
+#endif
+
+/* An offset into a file.  BFD always uses the largest possible offset
+   based on the build time availability of fseek, fseeko, or fseeko64.  */
+typedef BFD_HOST_64_BIT file_ptr;
+typedef unsigned BFD_HOST_64_BIT ufile_ptr;
+
+extern void bfd_sprintf_vma (bfd *, char *, bfd_vma);
+extern void bfd_fprintf_vma (bfd *, void *, bfd_vma);
+
+#define printf_vma(x) fprintf_vma(stdout,x)
+#define bfd_printf_vma(abfd,x) bfd_fprintf_vma (abfd,stdout,x)
+
+typedef unsigned int flagword;	/* 32 bits of flags */
+typedef unsigned char bfd_byte;
+
+/* File formats.  */
+
+typedef enum bfd_format
+{
+  bfd_unknown = 0,	/* File format is unknown.  */
+  bfd_object,		/* Linker/assembler/compiler output.  */
+  bfd_archive,		/* Object archive file.  */
+  bfd_core,		/* Core dump.  */
+  bfd_type_end		/* Marks the end; don't use it!  */
+}
+bfd_format;
+
+/* Symbols and relocation.  */
+
+/* A count of carsyms (canonical archive symbols).  */
+typedef unsigned long symindex;
+
+/* How to perform a relocation.  */
+typedef const struct reloc_howto_struct reloc_howto_type;
+
+#define BFD_NO_MORE_SYMBOLS ((symindex) ~0)
+
+/* General purpose part of a symbol X;
+   target specific parts are in libcoff.h, libaout.h, etc.  */
+
+#define bfd_get_section(x) ((x)->section)
+#define bfd_get_output_section(x) ((x)->section->output_section)
+#define bfd_set_section(x,y) ((x)->section = (y))	/* Parenthesized so it expands to one expression.  */
+#define bfd_asymbol_base(x) ((x)->section->vma)
+#define bfd_asymbol_value(x) (bfd_asymbol_base(x) + (x)->value)	/* Section vma plus the symbol's value.  */
+#define bfd_asymbol_name(x) ((x)->name)
+/* Perhaps future: #define bfd_asymbol_bfd(x) ((x)->section->owner) */
+#define bfd_asymbol_bfd(x) ((x)->the_bfd)
+#define bfd_asymbol_flavour(x)			\
+  (((x)->flags & BSF_SYNTHETIC) != 0		\
+   ? bfd_target_unknown_flavour			\
+   : bfd_asymbol_bfd (x)->xvec->flavour)	/* BSF_SYNTHETIC symbols yield the unknown flavour.  */
+
+/* A canonical archive symbol.  */
+/* This is a type pun with struct ranlib on purpose!  */
+typedef struct carsym
+{
+  char *name;
+  file_ptr file_offset;	/* Look here to find the file.  */
+}
+carsym;			/* To make these you call a carsymogen.  */
+
+/* Used in generating armaps (archive tables of contents).
+   Perhaps just a forward definition would do?  */
+struct orl 			/* Output ranlib.  */
+{
+  char **name;		/* Symbol name.  */
+  union
+  {
+    file_ptr pos;
+    bfd *abfd;
+  } u;			/* bfd* or file position.  */
+  int namidx;		/* Index into string table.  */
+};
+
+/* Linenumber stuff.  */
+typedef struct lineno_cache_entry
+{
+  unsigned int line_number;	/* Linenumber from start of function.  */
+  union
+  {
+    struct bfd_symbol *sym;	/* Function name.  */
+    bfd_vma offset;	    		/* Offset into section.  */
+  } u;
+}
+alent;
+
+/* Object and core file sections.  */
+typedef struct bfd_section *sec_ptr;
+
+#define	align_power(addr, align)	\
+  (((addr) + ((bfd_vma) 1 << (align)) - 1) & (-((bfd_vma) 1 << (align))))	/* Round ADDR up to a multiple of 1 << ALIGN.  */
+
+/* Align THIS upward to BOUNDARY, a number of bytes; e.g. pass 8 to align
+   to an 8-byte boundary.  The mask arithmetic only aligns correctly when
+   BOUNDARY is a power of two.  If adding BOUNDARY - 1 would wrap around
+   the end of the address space, the result is ~(bfd_vma) 0 instead.  */
+#define BFD_ALIGN(this, boundary)					  \
+  ((((bfd_vma) (this) + (boundary) - 1) >= (bfd_vma) (this))		  \
+   ? (((bfd_vma) (this) + ((boundary) - 1)) & ~ (bfd_vma) ((boundary)-1)) \
+   : ~ (bfd_vma) 0)
+
+#define bfd_get_section_name(bfd, ptr) ((void) (bfd), (ptr)->name)	/* BFD is evaluated and discarded.  */
+#define bfd_get_section_vma(bfd, ptr) ((void) (bfd), (ptr)->vma)
+#define bfd_get_section_lma(bfd, ptr) ((void) (bfd), (ptr)->lma)
+#define bfd_get_section_alignment(bfd, ptr) ((void) (bfd), \
+					     (ptr)->alignment_power)
+#define bfd_section_name(bfd, ptr) ((ptr)->name)	/* BFD does not appear in the expansion.  */
+#define bfd_section_size(bfd, ptr) ((ptr)->size)
+#define bfd_get_section_size(ptr) ((ptr)->size)
+#define bfd_section_vma(bfd, ptr) ((ptr)->vma)
+#define bfd_section_lma(bfd, ptr) ((ptr)->lma)
+#define bfd_section_alignment(bfd, ptr) ((ptr)->alignment_power)
+#define bfd_get_section_flags(bfd, ptr) ((void) (bfd), (ptr)->flags)
+#define bfd_get_section_userdata(bfd, ptr) ((void) (bfd), (ptr)->userdata)
+
+#define bfd_is_com_section(ptr) (((ptr)->flags & SEC_IS_COMMON) != 0)
+
+#define bfd_get_section_limit_octets(bfd, sec)			\
+  ((bfd)->direction != write_direction && (sec)->rawsize != 0	\
+   ? (sec)->rawsize : (sec)->size)	/* rawsize when non-zero and BFD is not write_direction, else size.  */
+
+/* Find the address one past the end of SEC.  */
+#define bfd_get_section_limit(bfd, sec) \
+  (bfd_get_section_limit_octets(bfd, sec) / bfd_octets_per_byte (bfd))
+
+/* Return TRUE if input section SEC has been discarded.  */
+#define discarded_section(sec)				\
+  (!bfd_is_abs_section (sec)					\
+   && bfd_is_abs_section ((sec)->output_section)		\
+   && (sec)->sec_info_type != SEC_INFO_TYPE_MERGE		\
+   && (sec)->sec_info_type != SEC_INFO_TYPE_JUST_SYMS)
+
+typedef enum bfd_print_symbol
+{
+  bfd_print_symbol_name,
+  bfd_print_symbol_more,
+  bfd_print_symbol_all
+} bfd_print_symbol_type;
+
+/* Information about a symbol that nm needs.  */
+
+typedef struct _symbol_info
+{
+  symvalue value;
+  char type;
+  const char *name;            /* Symbol name.  */
+  unsigned char stab_type;     /* Stab type.  */
+  char stab_other;             /* Stab other.  */
+  short stab_desc;             /* Stab desc.  */
+  const char *stab_name;       /* String for stab type.  */
+} symbol_info;
+
+/* Get the name of a stabs type code.  */
+
+extern const char *bfd_get_stab_name (int);
+
+/* Hash table routines.  Use bfd_hash_table_free to free a hash table.  */
+
+/* An element in the hash table.  Most uses will actually use a larger
+   structure, and an instance of this will be the first field.  */
+
+struct bfd_hash_entry
+{
+  /* Next entry for this hash code.  */
+  struct bfd_hash_entry *next;
+  /* String being hashed.  */
+  const char *string;
+  /* Hash code.  This is the full hash code, not the index into the
+     table.  */
+  unsigned long hash;
+};
+
+/* A hash table.  */
+
+struct bfd_hash_table
+{
+  /* The hash array.  */
+  struct bfd_hash_entry **table;
+  /* A function used to create new elements in the hash table.  The
+     first argument is itself a pointer to an element.  When this
+     function is first invoked, this pointer will be NULL.  However,
+     having the pointer permits a hierarchy of method functions to be
+     built each of which calls the function in the superclass.  Thus
+     each function should be written to allocate a new block of memory
+     only if the argument is NULL.  */
+  struct bfd_hash_entry *(*newfunc)
+    (struct bfd_hash_entry *, struct bfd_hash_table *, const char *);
+  /* An objalloc for this hash table.  This is a struct objalloc *,
+     but we use void * to avoid requiring the inclusion of objalloc.h.  */
+  void *memory;
+  /* The number of slots in the hash table.  */
+  unsigned int size;
+  /* The number of entries in the hash table.  */
+  unsigned int count;
+  /* The size of elements.  */
+  unsigned int entsize;
+  /* If non-zero, don't grow the hash table.  */
+  unsigned int frozen:1;
+};
+
+/* Initialize a hash table.  */
+extern bfd_boolean bfd_hash_table_init
+  (struct bfd_hash_table *,
+   struct bfd_hash_entry *(*) (struct bfd_hash_entry *,
+			       struct bfd_hash_table *,
+			       const char *),
+   unsigned int);
+
+/* Initialize a hash table specifying a size.  */
+extern bfd_boolean bfd_hash_table_init_n
+  (struct bfd_hash_table *,
+   struct bfd_hash_entry *(*) (struct bfd_hash_entry *,
+			       struct bfd_hash_table *,
+			       const char *),
+   unsigned int, unsigned int);
+
+/* Free up a hash table.  */
+extern void bfd_hash_table_free
+  (struct bfd_hash_table *);
+
+/* Look up a string in a hash table.  If CREATE is TRUE, a new entry
+   will be created for this string if one does not already exist.  The
+   COPY argument must be TRUE if this routine should copy the string
+   into newly allocated memory when adding an entry.  */
+extern struct bfd_hash_entry *bfd_hash_lookup
+  (struct bfd_hash_table *, const char *, bfd_boolean create,
+   bfd_boolean copy);
+
+/* Insert an entry in a hash table.  */
+extern struct bfd_hash_entry *bfd_hash_insert
+  (struct bfd_hash_table *, const char *, unsigned long);
+
+/* Rename an entry in a hash table.  */
+extern void bfd_hash_rename
+  (struct bfd_hash_table *, const char *, struct bfd_hash_entry *);
+
+/* Replace an entry in a hash table.  */
+extern void bfd_hash_replace
+  (struct bfd_hash_table *, struct bfd_hash_entry *old,
+   struct bfd_hash_entry *nw);
+
+/* Base method for creating a hash table entry.  */
+extern struct bfd_hash_entry *bfd_hash_newfunc
+  (struct bfd_hash_entry *, struct bfd_hash_table *, const char *);
+
+/* Grab some space for a hash table entry.  */
+extern void *bfd_hash_allocate
+  (struct bfd_hash_table *, unsigned int);
+
+/* Traverse a hash table in a random order, calling a function on each
+   element.  If the function returns FALSE, the traversal stops.  The
+   INFO argument is passed to the function.  */
+extern void bfd_hash_traverse
+  (struct bfd_hash_table *,
+   bfd_boolean (*) (struct bfd_hash_entry *, void *),
+   void *info);
+
+/* Allows the default size of a hash table to be configured. New hash
+   tables allocated using bfd_hash_table_init will be created with
+   this size.  */
+extern unsigned long bfd_hash_set_default_size (unsigned long);
+
+/* Types of compressed DWARF debug sections.  We currently support
+   zlib.  */
+enum compressed_debug_section_type
+{
+  COMPRESS_DEBUG_NONE = 0,
+  COMPRESS_DEBUG = 1 << 0,	/* Bit 0; also set in both ZLIB values.  */
+  COMPRESS_DEBUG_GNU_ZLIB = COMPRESS_DEBUG | 1 << 1,	/* Bits 0 and 1.  */
+  COMPRESS_DEBUG_GABI_ZLIB = COMPRESS_DEBUG | 1 << 2	/* Bits 0 and 2.  */
+};
+
+/* This structure is used to keep track of stabs in sections
+   information while linking.  */
+
+struct stab_info
+{
+  /* A hash table used to hold stabs strings.  */
+  struct bfd_strtab_hash *strings;
+  /* The header file hash table.  */
+  struct bfd_hash_table includes;
+  /* The first .stabstr section.  */
+  struct bfd_section *stabstr;
+};
+
+#define COFF_SWAP_TABLE (void *) &bfd_coff_std_swap_table
+
+/* User program access to BFD facilities.  */
+
+/* Direct I/O routines, for programs which know more about the object
+   file than BFD does.  Use higher level routines if possible.  */
+
+extern bfd_size_type bfd_bread (void *, bfd_size_type, bfd *);
+extern bfd_size_type bfd_bwrite (const void *, bfd_size_type, bfd *);
+extern int bfd_seek (bfd *, file_ptr, int);
+extern file_ptr bfd_tell (bfd *);
+extern int bfd_flush (bfd *);
+extern int bfd_stat (bfd *, struct stat *);
+
+/* Deprecated old routines: call warn_deprecated, then bfd_bread/bfd_bwrite.  */
+#if __GNUC__
+#define bfd_read(BUF, ELTSIZE, NITEMS, ABFD)				\
+  (warn_deprecated ("bfd_read", __FILE__, __LINE__, __FUNCTION__),	\
+   bfd_bread ((BUF), (ELTSIZE) * (NITEMS), (ABFD)))
+#define bfd_write(BUF, ELTSIZE, NITEMS, ABFD)				\
+  (warn_deprecated ("bfd_write", __FILE__, __LINE__, __FUNCTION__),	\
+   bfd_bwrite ((BUF), (ELTSIZE) * (NITEMS), (ABFD)))
+#else /* not __GNUC__: null file/function and line 0 are passed.  */
+#define bfd_read(BUF, ELTSIZE, NITEMS, ABFD)				\
+  (warn_deprecated ("bfd_read", (const char *) 0, 0, (const char *) 0), \
+   bfd_bread ((BUF), (ELTSIZE) * (NITEMS), (ABFD)))
+#define bfd_write(BUF, ELTSIZE, NITEMS, ABFD)				\
+  (warn_deprecated ("bfd_write", (const char *) 0, 0, (const char *) 0),\
+   bfd_bwrite ((BUF), (ELTSIZE) * (NITEMS), (ABFD)))
+#endif /* __GNUC__ */
+extern void warn_deprecated (const char *, const char *, int, const char *);
+
+/* Cast from const char * to char * so that caller can assign to
+   a char * without a warning.  */
+#define bfd_get_filename(abfd) ((char *) (abfd)->filename)
+#define bfd_get_cacheable(abfd) ((abfd)->cacheable)
+#define bfd_get_format(abfd) ((abfd)->format)
+#define bfd_get_target(abfd) ((abfd)->xvec->name)
+#define bfd_get_flavour(abfd) ((abfd)->xvec->flavour)
+#define bfd_family_coff(abfd) \
+  (bfd_get_flavour (abfd) == bfd_target_coff_flavour || \
+   bfd_get_flavour (abfd) == bfd_target_xcoff_flavour)
+#define bfd_big_endian(abfd) ((abfd)->xvec->byteorder == BFD_ENDIAN_BIG)
+#define bfd_little_endian(abfd) ((abfd)->xvec->byteorder == BFD_ENDIAN_LITTLE)
+#define bfd_header_big_endian(abfd) \
+  ((abfd)->xvec->header_byteorder == BFD_ENDIAN_BIG)
+#define bfd_header_little_endian(abfd) \
+  ((abfd)->xvec->header_byteorder == BFD_ENDIAN_LITTLE)
+#define bfd_get_file_flags(abfd) ((abfd)->flags)
+#define bfd_applicable_file_flags(abfd) ((abfd)->xvec->object_flags)
+#define bfd_applicable_section_flags(abfd) ((abfd)->xvec->section_flags)
+#define bfd_has_map(abfd) ((abfd)->has_armap)
+#define bfd_is_thin_archive(abfd) ((abfd)->is_thin_archive)
+
+#define bfd_valid_reloc_types(abfd) ((abfd)->xvec->valid_reloc_types)
+#define bfd_usrdata(abfd) ((abfd)->usrdata)
+
+#define bfd_get_start_address(abfd) ((abfd)->start_address)
+#define bfd_get_symcount(abfd) ((abfd)->symcount)
+#define bfd_get_outsymbols(abfd) ((abfd)->outsymbols)
+#define bfd_count_sections(abfd) ((abfd)->section_count)
+
+#define bfd_get_dynamic_symcount(abfd) ((abfd)->dynsymcount)
+
+#define bfd_get_symbol_leading_char(abfd) ((abfd)->xvec->symbol_leading_char)
+
+extern bfd_boolean bfd_cache_close
+  (bfd *abfd);
+/* NB: This declaration should match the autogenerated one in libbfd.h.  */
+
+extern bfd_boolean bfd_cache_close_all (void);
+
+extern bfd_boolean bfd_record_phdr
+  (bfd *, unsigned long, bfd_boolean, flagword, bfd_boolean, bfd_vma,
+   bfd_boolean, bfd_boolean, unsigned int, struct bfd_section **);
+
+/* Byte swapping routines.  */
+
+bfd_uint64_t bfd_getb64 (const void *);
+bfd_uint64_t bfd_getl64 (const void *);
+bfd_int64_t bfd_getb_signed_64 (const void *);
+bfd_int64_t bfd_getl_signed_64 (const void *);
+bfd_vma bfd_getb32 (const void *);
+bfd_vma bfd_getl32 (const void *);
+bfd_signed_vma bfd_getb_signed_32 (const void *);
+bfd_signed_vma bfd_getl_signed_32 (const void *);
+bfd_vma bfd_getb16 (const void *);
+bfd_vma bfd_getl16 (const void *);
+bfd_signed_vma bfd_getb_signed_16 (const void *);
+bfd_signed_vma bfd_getl_signed_16 (const void *);
+void bfd_putb64 (bfd_uint64_t, void *);
+void bfd_putl64 (bfd_uint64_t, void *);
+void bfd_putb32 (bfd_vma, void *);
+void bfd_putl32 (bfd_vma, void *);
+void bfd_putb16 (bfd_vma, void *);
+void bfd_putl16 (bfd_vma, void *);
+
+/* Byte swapping routines which take size and endianness as arguments.  */
+
+bfd_uint64_t bfd_get_bits (const void *, int, bfd_boolean);
+void bfd_put_bits (bfd_uint64_t, void *, int, bfd_boolean);
+
+#if defined(__STDC__) || defined(ALMOST_STDC)
+struct ecoff_debug_info;
+struct ecoff_debug_swap;
+struct ecoff_extr;
+struct bfd_symbol;
+struct bfd_link_info;
+struct bfd_link_hash_entry;
+struct bfd_section_already_linked;
+struct bfd_elf_version_tree;
+#endif
+
+extern bfd_boolean bfd_section_already_linked_table_init (void);
+extern void bfd_section_already_linked_table_free (void);
+extern bfd_boolean _bfd_handle_already_linked
+  (struct bfd_section *, struct bfd_section_already_linked *,
+   struct bfd_link_info *);
+
+/* Externally visible ECOFF routines.  */
+
+extern bfd_vma bfd_ecoff_get_gp_value
+  (bfd * abfd);
+extern bfd_boolean bfd_ecoff_set_gp_value
+  (bfd *abfd, bfd_vma gp_value);
+extern bfd_boolean bfd_ecoff_set_regmasks
+  (bfd *abfd, unsigned long gprmask, unsigned long fprmask,
+   unsigned long *cprmask);
+extern void *bfd_ecoff_debug_init
+  (bfd *output_bfd, struct ecoff_debug_info *output_debug,
+   const struct ecoff_debug_swap *output_swap, struct bfd_link_info *);
+extern void bfd_ecoff_debug_free
+  (void *handle, bfd *output_bfd, struct ecoff_debug_info *output_debug,
+   const struct ecoff_debug_swap *output_swap, struct bfd_link_info *);
+extern bfd_boolean bfd_ecoff_debug_accumulate
+  (void *handle, bfd *output_bfd, struct ecoff_debug_info *output_debug,
+   const struct ecoff_debug_swap *output_swap, bfd *input_bfd,
+   struct ecoff_debug_info *input_debug,
+   const struct ecoff_debug_swap *input_swap, struct bfd_link_info *);
+extern bfd_boolean bfd_ecoff_debug_accumulate_other
+  (void *handle, bfd *output_bfd, struct ecoff_debug_info *output_debug,
+   const struct ecoff_debug_swap *output_swap, bfd *input_bfd,
+   struct bfd_link_info *);
+extern bfd_boolean bfd_ecoff_debug_externals
+  (bfd *abfd, struct ecoff_debug_info *debug,
+   const struct ecoff_debug_swap *swap, bfd_boolean relocatable,
+   bfd_boolean (*get_extr) (struct bfd_symbol *, struct ecoff_extr *),
+   void (*set_index) (struct bfd_symbol *, bfd_size_type));
+extern bfd_boolean bfd_ecoff_debug_one_external
+  (bfd *abfd, struct ecoff_debug_info *debug,
+   const struct ecoff_debug_swap *swap, const char *name,
+   struct ecoff_extr *esym);
+extern bfd_size_type bfd_ecoff_debug_size
+  (bfd *abfd, struct ecoff_debug_info *debug,
+   const struct ecoff_debug_swap *swap);
+extern bfd_boolean bfd_ecoff_write_debug
+  (bfd *abfd, struct ecoff_debug_info *debug,
+   const struct ecoff_debug_swap *swap, file_ptr where);
+extern bfd_boolean bfd_ecoff_write_accumulated_debug
+  (void *handle, bfd *abfd, struct ecoff_debug_info *debug,
+   const struct ecoff_debug_swap *swap,
+   struct bfd_link_info *info, file_ptr where);
+
+/* Externally visible ELF routines.  */
+
+struct bfd_link_needed_list
+{
+  struct bfd_link_needed_list *next;
+  bfd *by;
+  const char *name;
+};
+
+enum dynamic_lib_link_class {	/* Non-zero values are distinct single bits.  */
+  DYN_NORMAL = 0,
+  DYN_AS_NEEDED = 1,
+  DYN_DT_NEEDED = 2,
+  DYN_NO_ADD_NEEDED = 4,
+  DYN_NO_NEEDED = 8
+};
+
+enum notice_asneeded_action {
+  notice_as_needed,
+  notice_not_needed,
+  notice_needed
+};
+
+extern bfd_boolean bfd_elf_record_link_assignment
+  (bfd *, struct bfd_link_info *, const char *, bfd_boolean,
+   bfd_boolean);
+extern struct bfd_link_needed_list *bfd_elf_get_needed_list
+  (bfd *, struct bfd_link_info *);
+extern bfd_boolean bfd_elf_get_bfd_needed_list
+  (bfd *, struct bfd_link_needed_list **);
+extern bfd_boolean bfd_elf_stack_segment_size (bfd *, struct bfd_link_info *,
+					       const char *, bfd_vma);
+extern bfd_boolean bfd_elf_size_dynamic_sections
+  (bfd *, const char *, const char *, const char *, const char *, const char *,
+   const char * const *, struct bfd_link_info *, struct bfd_section **);
+extern bfd_boolean bfd_elf_size_dynsym_hash_dynstr
+  (bfd *, struct bfd_link_info *);
+extern void bfd_elf_set_dt_needed_name
+  (bfd *, const char *);
+extern const char *bfd_elf_get_dt_soname
+  (bfd *);
+extern void bfd_elf_set_dyn_lib_class
+  (bfd *, enum dynamic_lib_link_class);
+extern int bfd_elf_get_dyn_lib_class
+  (bfd *);
+extern struct bfd_link_needed_list *bfd_elf_get_runpath_list
+  (bfd *, struct bfd_link_info *);
+extern int bfd_elf_discard_info
+  (bfd *, struct bfd_link_info *);
+extern unsigned int _bfd_elf_default_action_discarded
+  (struct bfd_section *);
+
+/* Return an upper bound on the number of bytes required to store a
+   copy of ABFD's program header table entries.  Return -1 if an error
+   occurs; bfd_get_error will return an appropriate code.  */
+extern long bfd_get_elf_phdr_upper_bound
+  (bfd *abfd);
+
+/* Copy ABFD's program header table entries to *PHDRS.  The entries
+   will be stored as an array of Elf_Internal_Phdr structures, as
+   defined in include/elf/internal.h.  To find out how large the
+   buffer needs to be, call bfd_get_elf_phdr_upper_bound.
+
+   Return the number of program header table entries read, or -1 if an
+   error occurs; bfd_get_error will return an appropriate code.  */
+extern int bfd_get_elf_phdrs
+  (bfd *abfd, void *phdrs);
+
+/* Create a new BFD as if by bfd_openr.  Rather than opening a file,
+   reconstruct an ELF file by reading the segments out of remote
+   memory based on the ELF file header at EHDR_VMA and the ELF program
+   headers it points to.  If non-zero, SIZE is the known extent of the
+   object.  If not null, *LOADBASEP is filled in with the difference
+   between the VMAs from which the segments were read, and the VMAs
+   the file headers (and hence BFD's idea of each section's VMA) put
+   them at.
+
+   The function TARGET_READ_MEMORY is called to copy LEN bytes from
+   the remote memory at target address VMA into the local buffer at
+   MYADDR; it should return zero on success or an `errno' code on
+   failure.  TEMPL must be a BFD for a target with the word size and
+   byte order found in the remote memory.  */
+extern bfd *bfd_elf_bfd_from_remote_memory
+  (bfd *templ, bfd_vma ehdr_vma, bfd_size_type size, bfd_vma *loadbasep,
+   int (*target_read_memory) (bfd_vma vma, bfd_byte *myaddr,
+			      bfd_size_type len));
+
+extern struct bfd_section *_bfd_elf_tls_setup
+  (bfd *, struct bfd_link_info *);
+
+extern struct bfd_section *
+_bfd_nearby_section (bfd *, struct bfd_section *, bfd_vma);
+
+extern void _bfd_fix_excluded_sec_syms
+  (bfd *, struct bfd_link_info *);
+
+extern unsigned bfd_m68k_mach_to_features (int);
+
+extern int bfd_m68k_features_to_mach (unsigned);
+
+extern bfd_boolean bfd_m68k_elf32_create_embedded_relocs
+  (bfd *, struct bfd_link_info *, struct bfd_section *, struct bfd_section *,
+   char **);
+
+extern void bfd_elf_m68k_set_target_options (struct bfd_link_info *, int);
+
+extern bfd_boolean bfd_bfin_elf32_create_embedded_relocs
+  (bfd *, struct bfd_link_info *, struct bfd_section *, struct bfd_section *,
+   char **);
+
+extern bfd_boolean bfd_cr16_elf32_create_embedded_relocs
+  (bfd *, struct bfd_link_info *, struct bfd_section *, struct bfd_section *,
+   char **);
+
+/* SunOS shared library support routines for the linker.  */
+
+extern struct bfd_link_needed_list *bfd_sunos_get_needed_list
+  (bfd *, struct bfd_link_info *);
+extern bfd_boolean bfd_sunos_record_link_assignment
+  (bfd *, struct bfd_link_info *, const char *);
+extern bfd_boolean bfd_sunos_size_dynamic_sections
+  (bfd *, struct bfd_link_info *, struct bfd_section **,
+   struct bfd_section **, struct bfd_section **);
+
+/* Linux shared library support routines for the linker.  */
+
+extern bfd_boolean bfd_i386linux_size_dynamic_sections
+  (bfd *, struct bfd_link_info *);
+extern bfd_boolean bfd_m68klinux_size_dynamic_sections
+  (bfd *, struct bfd_link_info *);
+extern bfd_boolean bfd_sparclinux_size_dynamic_sections
+  (bfd *, struct bfd_link_info *);
+
+/* mmap hacks */
+
+struct _bfd_window_internal;
+typedef struct _bfd_window_internal bfd_window_internal;
+
+typedef struct _bfd_window
+{
+  /* What the user asked for.  */
+  void *data;
+  bfd_size_type size;
+  /* The actual window used by BFD.  Small user-requested read-only
+     regions sharing a page may share a single window into the object
+     file.  Read-write versions shouldn't until I've fixed things to
+     keep track of which portions have been claimed by the
+     application; don't want to give the same region back when the
+     application wants two writable copies!  */
+  struct _bfd_window_internal *i;
+}
+bfd_window;
+
+extern void bfd_init_window
+  (bfd_window *);
+extern void bfd_free_window
+  (bfd_window *);
+extern bfd_boolean bfd_get_file_window
+  (bfd *, file_ptr, bfd_size_type, bfd_window *, bfd_boolean);
+
+/* XCOFF support routines for the linker.  */
+
+extern bfd_boolean bfd_xcoff_split_import_path
+  (bfd *, const char *, const char **, const char **);
+extern bfd_boolean bfd_xcoff_set_archive_import_path
+  (struct bfd_link_info *, bfd *, const char *);
+extern bfd_boolean bfd_xcoff_link_record_set
+  (bfd *, struct bfd_link_info *, struct bfd_link_hash_entry *, bfd_size_type);
+extern bfd_boolean bfd_xcoff_import_symbol
+  (bfd *, struct bfd_link_info *, struct bfd_link_hash_entry *, bfd_vma,
+   const char *, const char *, const char *, unsigned int);
+extern bfd_boolean bfd_xcoff_export_symbol
+  (bfd *, struct bfd_link_info *, struct bfd_link_hash_entry *);
+extern bfd_boolean bfd_xcoff_link_count_reloc
+  (bfd *, struct bfd_link_info *, const char *);
+extern bfd_boolean bfd_xcoff_record_link_assignment
+  (bfd *, struct bfd_link_info *, const char *);
+extern bfd_boolean bfd_xcoff_size_dynamic_sections
+  (bfd *, struct bfd_link_info *, const char *, const char *,
+   unsigned long, unsigned long, unsigned long, bfd_boolean,
+   int, bfd_boolean, unsigned int, struct bfd_section **, bfd_boolean);
+extern bfd_boolean bfd_xcoff_link_generate_rtinit
+  (bfd *, const char *, const char *, bfd_boolean);
+
+/* XCOFF support routines for ar.  */
+extern bfd_boolean bfd_xcoff_ar_archive_set_magic
+  (bfd *, char *);
+
+/* Externally visible COFF routines.  */
+
+#if defined(__STDC__) || defined(ALMOST_STDC)
+struct internal_syment;
+union internal_auxent;
+#endif
+
+extern bfd_boolean bfd_coff_set_symbol_class
+  (bfd *, struct bfd_symbol *, unsigned int);
+
+extern bfd_boolean bfd_m68k_coff_create_embedded_relocs
+  (bfd *, struct bfd_link_info *, struct bfd_section *, struct bfd_section *, char **);
+
+/* ARM VFP11 erratum workaround support.  */
+typedef enum
+{
+  BFD_ARM_VFP11_FIX_DEFAULT,
+  BFD_ARM_VFP11_FIX_NONE,
+  BFD_ARM_VFP11_FIX_SCALAR,
+  BFD_ARM_VFP11_FIX_VECTOR
+} bfd_arm_vfp11_fix;
+
+extern void bfd_elf32_arm_init_maps
+  (bfd *);
+
+extern void bfd_elf32_arm_set_vfp11_fix
+  (bfd *, struct bfd_link_info *);
+
+extern void bfd_elf32_arm_set_cortex_a8_fix
+  (bfd *, struct bfd_link_info *);
+
+extern bfd_boolean bfd_elf32_arm_vfp11_erratum_scan
+  (bfd *, struct bfd_link_info *);
+
+extern void bfd_elf32_arm_vfp11_fix_veneer_locations
+  (bfd *, struct bfd_link_info *);
+
+/* ARM STM STM32L4XX erratum workaround support.  */
+typedef enum
+{
+  BFD_ARM_STM32L4XX_FIX_NONE,
+  BFD_ARM_STM32L4XX_FIX_DEFAULT,
+  BFD_ARM_STM32L4XX_FIX_ALL
+} bfd_arm_stm32l4xx_fix;
+
+extern void bfd_elf32_arm_set_stm32l4xx_fix
+  (bfd *, struct bfd_link_info *);
+
+extern bfd_boolean bfd_elf32_arm_stm32l4xx_erratum_scan
+  (bfd *, struct bfd_link_info *);
+
+extern void bfd_elf32_arm_stm32l4xx_fix_veneer_locations
+  (bfd *, struct bfd_link_info *);
+
+/* ARM Interworking support.  Called from linker.  */
+extern bfd_boolean bfd_arm_allocate_interworking_sections
+  (struct bfd_link_info *);
+
+extern bfd_boolean bfd_arm_process_before_allocation
+  (bfd *, struct bfd_link_info *, int);
+
+extern bfd_boolean bfd_arm_get_bfd_for_interworking
+  (bfd *, struct bfd_link_info *);
+
+/* PE ARM Interworking support.  Called from linker.  */
+extern bfd_boolean bfd_arm_pe_allocate_interworking_sections
+  (struct bfd_link_info *);
+
+extern bfd_boolean bfd_arm_pe_process_before_allocation
+  (bfd *, struct bfd_link_info *, int);
+
+extern bfd_boolean bfd_arm_pe_get_bfd_for_interworking
+  (bfd *, struct bfd_link_info *);
+
+/* ELF ARM Interworking support.  Called from linker.  */
+extern bfd_boolean bfd_elf32_arm_allocate_interworking_sections
+  (struct bfd_link_info *);
+
+extern bfd_boolean bfd_elf32_arm_process_before_allocation
+  (bfd *, struct bfd_link_info *);
+
+struct elf32_arm_params {
+  char *thumb_entry_symbol;
+  int byteswap_code;
+  int target1_is_rel;
+  char * target2_type;
+  int fix_v4bx;
+  int use_blx;
+  bfd_arm_vfp11_fix vfp11_denorm_fix;
+  bfd_arm_stm32l4xx_fix stm32l4xx_fix;
+  int no_enum_size_warning;
+  int no_wchar_size_warning;
+  int pic_veneer;
+  int fix_cortex_a8;
+  int fix_arm1176;
+  int merge_exidx_entries;
+  int cmse_implib;
+  bfd *in_implib_bfd;
+};
+
+void bfd_elf32_arm_set_target_params
+  (bfd *, struct bfd_link_info *, struct elf32_arm_params *);
+
+extern bfd_boolean bfd_elf32_arm_get_bfd_for_interworking
+  (bfd *, struct bfd_link_info *);
+
+extern bfd_boolean bfd_elf32_arm_add_glue_sections_to_bfd
+  (bfd *, struct bfd_link_info *);
+
+extern void bfd_elf32_arm_keep_private_stub_output_sections
+  (struct bfd_link_info *);
+
+/* ELF ARM mapping symbol support.  */
+#define BFD_ARM_SPECIAL_SYM_TYPE_MAP	(1 << 0)
+#define BFD_ARM_SPECIAL_SYM_TYPE_TAG	(1 << 1)
+#define BFD_ARM_SPECIAL_SYM_TYPE_OTHER  (1 << 2)
+#define BFD_ARM_SPECIAL_SYM_TYPE_ANY	(~0)
+
+extern bfd_boolean bfd_is_arm_special_symbol_name
+  (const char *, int);
+
+extern void bfd_elf32_arm_set_byteswap_code
+  (struct bfd_link_info *, int);
+
+extern void bfd_elf32_arm_use_long_plt (void);
+
+/* ARM Note section processing.  */
+extern bfd_boolean bfd_arm_merge_machines
+  (bfd *, bfd *);
+
+extern bfd_boolean bfd_arm_update_notes
+  (bfd *, const char *);
+
+extern unsigned int bfd_arm_get_mach_from_notes
+  (bfd *, const char *);
+
+/* ARM stub generation support.  Called from the linker.  */
+extern int elf32_arm_setup_section_lists
+  (bfd *, struct bfd_link_info *);
+extern void elf32_arm_next_input_section
+  (struct bfd_link_info *, struct bfd_section *);
+extern bfd_boolean elf32_arm_size_stubs
+  (bfd *, bfd *, struct bfd_link_info *, bfd_signed_vma,
+   struct bfd_section * (*) (const char *, struct bfd_section *,
+			     struct bfd_section *, unsigned int),
+   void (*) (void));
+extern bfd_boolean elf32_arm_build_stubs
+  (struct bfd_link_info *);
+
+/* ARM unwind section editing support.  */
+extern bfd_boolean elf32_arm_fix_exidx_coverage
+(struct bfd_section **, unsigned int, struct bfd_link_info *, bfd_boolean);
+
+/* C6x unwind section editing support.  */
+extern bfd_boolean elf32_tic6x_fix_exidx_coverage
+(struct bfd_section **, unsigned int, struct bfd_link_info *, bfd_boolean);
+
+extern void bfd_elf64_aarch64_init_maps
+  (bfd *);
+
+extern void bfd_elf32_aarch64_init_maps
+  (bfd *);
+
+extern void bfd_elf64_aarch64_set_options
+  (bfd *, struct bfd_link_info *, int, int, int, int, int, int);
+
+extern void bfd_elf32_aarch64_set_options
+  (bfd *, struct bfd_link_info *, int, int, int, int, int, int);
+
+/* ELF AArch64 mapping symbol support.  */
+#define BFD_AARCH64_SPECIAL_SYM_TYPE_MAP	(1 << 0)
+#define BFD_AARCH64_SPECIAL_SYM_TYPE_TAG	(1 << 1)
+#define BFD_AARCH64_SPECIAL_SYM_TYPE_OTHER	(1 << 2)
+#define BFD_AARCH64_SPECIAL_SYM_TYPE_ANY	(~0)
+extern bfd_boolean bfd_is_aarch64_special_symbol_name
+  (const char * name, int type);
+
+/* AArch64 stub generation support for ELF64.  Called from the linker.  */
+extern int elf64_aarch64_setup_section_lists
+  (bfd *, struct bfd_link_info *);
+extern void elf64_aarch64_next_input_section
+  (struct bfd_link_info *, struct bfd_section *);
+extern bfd_boolean elf64_aarch64_size_stubs
+  (bfd *, bfd *, struct bfd_link_info *, bfd_signed_vma,
+   struct bfd_section * (*) (const char *, struct bfd_section *),
+   void (*) (void));
+extern bfd_boolean elf64_aarch64_build_stubs
+  (struct bfd_link_info *);
+/* AArch64 stub generation support for ELF32.  Called from the linker.  */
+extern int elf32_aarch64_setup_section_lists
+  (bfd *, struct bfd_link_info *);
+extern void elf32_aarch64_next_input_section
+  (struct bfd_link_info *, struct bfd_section *);
+extern bfd_boolean elf32_aarch64_size_stubs
+  (bfd *, bfd *, struct bfd_link_info *, bfd_signed_vma,
+   struct bfd_section * (*) (const char *, struct bfd_section *),
+   void (*) (void));
+extern bfd_boolean elf32_aarch64_build_stubs
+  (struct bfd_link_info *);
+
+
+/* TI COFF load page support.  */
+extern void bfd_ticoff_set_section_load_page
+  (struct bfd_section *, int);
+
+extern int bfd_ticoff_get_section_load_page
+  (struct bfd_section *);
+
+/* H8/300 functions.  */
+extern bfd_vma bfd_h8300_pad_address
+  (bfd *, bfd_vma);
+
+/* IA64 Itanium code generation.  Called from linker.  */
+extern void bfd_elf32_ia64_after_parse
+  (int);
+
+extern void bfd_elf64_ia64_after_parse
+  (int);
+
+/* V850 Note manipulation routines.  */
+extern bfd_boolean v850_elf_create_sections
+  (struct bfd_link_info *);
+
+extern bfd_boolean v850_elf_set_note
+  (bfd *, unsigned int, unsigned int);
+
+/* MIPS ABI flags data access.  For the disassembler.  */
+struct elf_internal_abiflags_v0;
+extern struct elf_internal_abiflags_v0 *bfd_mips_elf_get_abiflags (bfd *);
+/* Extracted from init.c.  */
+void bfd_init (void);
+
+/* Extracted from opncls.c.  */
+/* Set to N to open the next N BFDs using an alternate id space.  */
+extern unsigned int bfd_use_reserved_id;
+bfd *bfd_fopen (const char *filename, const char *target,
+    const char *mode, int fd);
+
+bfd *bfd_openr (const char *filename, const char *target);
+
+bfd *bfd_fdopenr (const char *filename, const char *target, int fd);
+
+bfd *bfd_openstreamr (const char * filename, const char * target, void * stream);
+
+bfd *bfd_openr_iovec (const char *filename, const char *target,
+    void *(*open_func) (struct bfd *nbfd,
+    void *open_closure),
+    void *open_closure,
+    file_ptr (*pread_func) (struct bfd *nbfd,
+    void *stream,
+    void *buf,
+    file_ptr nbytes,
+    file_ptr offset),
+    int (*close_func) (struct bfd *nbfd,
+    void *stream),
+    int (*stat_func) (struct bfd *abfd,
+    void *stream,
+    struct stat *sb));
+
+bfd *bfd_openw (const char *filename, const char *target);
+
+bfd_boolean bfd_close (bfd *abfd);
+
+bfd_boolean bfd_close_all_done (bfd *);
+
+bfd *bfd_create (const char *filename, bfd *templ);
+
+bfd_boolean bfd_make_writable (bfd *abfd);
+
+bfd_boolean bfd_make_readable (bfd *abfd);
+
+void *bfd_alloc (bfd *abfd, bfd_size_type wanted);
+
+void *bfd_zalloc (bfd *abfd, bfd_size_type wanted);
+
+unsigned long bfd_calc_gnu_debuglink_crc32
+   (unsigned long crc, const unsigned char *buf, bfd_size_type len);
+
+char *bfd_get_debug_link_info (bfd *abfd, unsigned long *crc32_out);
+
+char *bfd_get_alt_debug_link_info (bfd * abfd,
+    bfd_size_type *buildid_len,
+    bfd_byte **buildid_out);
+
+char *bfd_follow_gnu_debuglink (bfd *abfd, const char *dir);
+
+char *bfd_follow_gnu_debugaltlink (bfd *abfd, const char *dir);
+
+struct bfd_section *bfd_create_gnu_debuglink_section
+   (bfd *abfd, const char *filename);
+
+bfd_boolean bfd_fill_in_gnu_debuglink_section
+   (bfd *abfd, struct bfd_section *sect, const char *filename);
+
+char *bfd_follow_build_id_debuglink (bfd *abfd, const char *dir);
+
+/* Extracted from libbfd.c.  */
+
+/* Byte swapping macros for user section data.  */
+
+#define bfd_put_8(abfd, val, ptr) \
+  ((void) (*((unsigned char *) (ptr)) = (val) & 0xff))
+#define bfd_put_signed_8 \
+  bfd_put_8
+#define bfd_get_8(abfd, ptr) \
+  (*(const unsigned char *) (ptr) & 0xff)
+#define bfd_get_signed_8(abfd, ptr) \
+  (((*(const unsigned char *) (ptr) & 0xff) ^ 0x80) - 0x80)
+
+#define bfd_put_16(abfd, val, ptr) \
+  BFD_SEND (abfd, bfd_putx16, ((val),(ptr)))
+#define bfd_put_signed_16 \
+  bfd_put_16
+#define bfd_get_16(abfd, ptr) \
+  BFD_SEND (abfd, bfd_getx16, (ptr))
+#define bfd_get_signed_16(abfd, ptr) \
+  BFD_SEND (abfd, bfd_getx_signed_16, (ptr))
+
+#define bfd_put_32(abfd, val, ptr) \
+  BFD_SEND (abfd, bfd_putx32, ((val),(ptr)))
+#define bfd_put_signed_32 \
+  bfd_put_32
+#define bfd_get_32(abfd, ptr) \
+  BFD_SEND (abfd, bfd_getx32, (ptr))
+#define bfd_get_signed_32(abfd, ptr) \
+  BFD_SEND (abfd, bfd_getx_signed_32, (ptr))
+
+#define bfd_put_64(abfd, val, ptr) \
+  BFD_SEND (abfd, bfd_putx64, ((val), (ptr)))
+#define bfd_put_signed_64 \
+  bfd_put_64
+#define bfd_get_64(abfd, ptr) \
+  BFD_SEND (abfd, bfd_getx64, (ptr))
+#define bfd_get_signed_64(abfd, ptr) \
+  BFD_SEND (abfd, bfd_getx_signed_64, (ptr))
+
+#define bfd_get(bits, abfd, ptr)                       \
+  ((bits) == 8 ? (bfd_vma) bfd_get_8 (abfd, ptr)       \
+   : (bits) == 16 ? bfd_get_16 (abfd, ptr)             \
+   : (bits) == 32 ? bfd_get_32 (abfd, ptr)             \
+   : (bits) == 64 ? bfd_get_64 (abfd, ptr)             \
+   : (abort (), (bfd_vma) - 1))
+
+#define bfd_put(bits, abfd, val, ptr)                  \
+  ((bits) == 8 ? bfd_put_8  (abfd, val, ptr)           \
+   : (bits) == 16 ? bfd_put_16 (abfd, val, ptr)                \
+   : (bits) == 32 ? bfd_put_32 (abfd, val, ptr)                \
+   : (bits) == 64 ? bfd_put_64 (abfd, val, ptr)                \
+   : (abort (), (void) 0))
+
+
+/* Byte swapping macros for file header data.  */
+
+#define bfd_h_put_8(abfd, val, ptr) \
+  bfd_put_8 (abfd, val, ptr)
+#define bfd_h_put_signed_8(abfd, val, ptr) \
+  bfd_put_8 (abfd, val, ptr)
+#define bfd_h_get_8(abfd, ptr) \
+  bfd_get_8 (abfd, ptr)
+#define bfd_h_get_signed_8(abfd, ptr) \
+  bfd_get_signed_8 (abfd, ptr)
+
+#define bfd_h_put_16(abfd, val, ptr) \
+  BFD_SEND (abfd, bfd_h_putx16, (val, ptr))
+#define bfd_h_put_signed_16 \
+  bfd_h_put_16
+#define bfd_h_get_16(abfd, ptr) \
+  BFD_SEND (abfd, bfd_h_getx16, (ptr))
+#define bfd_h_get_signed_16(abfd, ptr) \
+  BFD_SEND (abfd, bfd_h_getx_signed_16, (ptr))
+
+#define bfd_h_put_32(abfd, val, ptr) \
+  BFD_SEND (abfd, bfd_h_putx32, (val, ptr))
+#define bfd_h_put_signed_32 \
+  bfd_h_put_32
+#define bfd_h_get_32(abfd, ptr) \
+  BFD_SEND (abfd, bfd_h_getx32, (ptr))
+#define bfd_h_get_signed_32(abfd, ptr) \
+  BFD_SEND (abfd, bfd_h_getx_signed_32, (ptr))
+
+#define bfd_h_put_64(abfd, val, ptr) \
+  BFD_SEND (abfd, bfd_h_putx64, (val, ptr))
+#define bfd_h_put_signed_64 \
+  bfd_h_put_64
+#define bfd_h_get_64(abfd, ptr) \
+  BFD_SEND (abfd, bfd_h_getx64, (ptr))
+#define bfd_h_get_signed_64(abfd, ptr) \
+  BFD_SEND (abfd, bfd_h_getx_signed_64, (ptr))
+
+/* Aliases for the above, which should eventually go away.  */
+
+#define H_PUT_64  bfd_h_put_64
+#define H_PUT_32  bfd_h_put_32
+#define H_PUT_16  bfd_h_put_16
+#define H_PUT_8   bfd_h_put_8
+#define H_PUT_S64 bfd_h_put_signed_64
+#define H_PUT_S32 bfd_h_put_signed_32
+#define H_PUT_S16 bfd_h_put_signed_16
+#define H_PUT_S8  bfd_h_put_signed_8
+#define H_GET_64  bfd_h_get_64
+#define H_GET_32  bfd_h_get_32
+#define H_GET_16  bfd_h_get_16
+#define H_GET_8   bfd_h_get_8
+#define H_GET_S64 bfd_h_get_signed_64
+#define H_GET_S32 bfd_h_get_signed_32
+#define H_GET_S16 bfd_h_get_signed_16
+#define H_GET_S8  bfd_h_get_signed_8
+
+
+/* Extracted from bfdio.c.  */
+long bfd_get_mtime (bfd *abfd);
+
+file_ptr bfd_get_size (bfd *abfd);
+
+void *bfd_mmap (bfd *abfd, void *addr, bfd_size_type len,
+    int prot, int flags, file_ptr offset,
+    void **map_addr, bfd_size_type *map_len);
+
+/* Extracted from bfdwin.c.  */
+/* Extracted from section.c.  */
+
+typedef struct bfd_section
+{
+  /* The name of the section; the name isn't a copy, the pointer is
+     the same as that passed to bfd_make_section.  */
+  const char *name;
+
+  /* A unique sequence number.  */
+  unsigned int id;
+
+  /* Which section in the bfd; 0..n-1 as sections are created in a bfd.  */
+  unsigned int index;
+
+  /* The next section in the list belonging to the BFD, or NULL.  */
+  struct bfd_section *next;
+
+  /* The previous section in the list belonging to the BFD, or NULL.  */
+  struct bfd_section *prev;
+
+  /* The field flags contains attributes of the section. Some
+     flags are read in from the object file, and some are
+     synthesized from other information.  */
+  flagword flags;
+
+#define SEC_NO_FLAGS   0x000
+
+  /* Tells the OS to allocate space for this section when loading.
+     This is clear for a section containing debug information only.  */
+#define SEC_ALLOC      0x001
+
+  /* Tells the OS to load the section from the file when loading.
+     This is clear for a .bss section.  */
+#define SEC_LOAD       0x002
+
+  /* The section contains data still to be relocated, so there is
+     some relocation information too.  */
+#define SEC_RELOC      0x004
+
+  /* A signal to the OS that the section contains read only data.  */
+#define SEC_READONLY   0x008
+
+  /* The section contains code only.  */
+#define SEC_CODE       0x010
+
+  /* The section contains data only.  */
+#define SEC_DATA       0x020
+
+  /* The section will reside in ROM.  */
+#define SEC_ROM        0x040
+
+  /* The section contains constructor information. This section
+     type is used by the linker to create lists of constructors and
+     destructors used by <<g++>>. When a back end sees a symbol
+     which should be used in a constructor list, it creates a new
+     section for the type of name (e.g., <<__CTOR_LIST__>>), attaches
+     the symbol to it, and builds a relocation. To build the lists
+     of constructors, all the linker has to do is catenate all the
+     sections called <<__CTOR_LIST__>> and relocate the data
+     contained within - exactly the operations it would perform on
+     standard data.  */
+#define SEC_CONSTRUCTOR 0x080
+
+  /* The section has contents - a data section could be
+     <<SEC_ALLOC>> | <<SEC_HAS_CONTENTS>>; a debug section could be
+     <<SEC_HAS_CONTENTS>>  */
+#define SEC_HAS_CONTENTS 0x100
+
+  /* An instruction to the linker to not output the section
+     even if it has information which would normally be written.  */
+#define SEC_NEVER_LOAD 0x200
+
+  /* The section contains thread local data.  */
+#define SEC_THREAD_LOCAL 0x400
+
+  /* The section has GOT references.  This flag is only for the
+     linker, and is currently only used by the elf32-hppa back end.
+     It will be set if global offset table references were detected
+     in this section, which indicate to the linker that the section
+     contains PIC code, and must be handled specially when doing a
+     static link.  */
+#define SEC_HAS_GOT_REF 0x800
+
+  /* The section contains common symbols (symbols may be defined
+     multiple times, the value of a symbol is the amount of
+     space it requires, and the largest symbol value is the one
+     used).  Most targets have exactly one of these (which we
+     translate to bfd_com_section_ptr), but ECOFF has two.  */
+#define SEC_IS_COMMON 0x1000
+
+  /* The section contains only debugging information.  For
+     example, this is set for ELF .debug and .stab sections.
+     strip tests this flag to see if a section can be
+     discarded.  */
+#define SEC_DEBUGGING 0x2000
+
+  /* The contents of this section are held in memory pointed to
+     by the contents field.  This is checked by bfd_get_section_contents,
+     and the data is retrieved from memory if appropriate.  */
+#define SEC_IN_MEMORY 0x4000
+
+  /* The contents of this section are to be excluded by the
+     linker for executable and shared objects unless those
+     objects are to be further relocated.  */
+#define SEC_EXCLUDE 0x8000
+
+  /* The contents of this section are to be sorted based on the sum of
+     the symbol and addend values specified by the associated relocation
+     entries.  Entries without associated relocation entries will be
+     appended to the end of the section in an unspecified order.  */
+#define SEC_SORT_ENTRIES 0x10000
+
+  /* When linking, duplicate sections of the same name should be
+     discarded, rather than being combined into a single section as
+     is usually done.  This is similar to how common symbols are
+     handled.  See SEC_LINK_DUPLICATES below.  */
+#define SEC_LINK_ONCE 0x20000
+
+  /* If SEC_LINK_ONCE is set, this bitfield describes how the linker
+     should handle duplicate sections.  */
+#define SEC_LINK_DUPLICATES 0xc0000
+
+  /* This value for SEC_LINK_DUPLICATES means that duplicate
+     sections with the same name should simply be discarded.  */
+#define SEC_LINK_DUPLICATES_DISCARD 0x0
+
+  /* This value for SEC_LINK_DUPLICATES means that the linker
+     should warn if there are any duplicate sections, although
+     it should still only link one copy.  */
+#define SEC_LINK_DUPLICATES_ONE_ONLY 0x40000
+
+  /* This value for SEC_LINK_DUPLICATES means that the linker
+     should warn if any duplicate sections are a different size.  */
+#define SEC_LINK_DUPLICATES_SAME_SIZE 0x80000
+
+  /* This value for SEC_LINK_DUPLICATES means that the linker
+     should warn if any duplicate sections contain different
+     contents.  */
+#define SEC_LINK_DUPLICATES_SAME_CONTENTS \
+  (SEC_LINK_DUPLICATES_ONE_ONLY | SEC_LINK_DUPLICATES_SAME_SIZE)
+
+  /* This section was created by the linker as part of dynamic
+     relocation or other arcane processing.  It is skipped when
+     going through the first-pass output, trusting that someone
+     else up the line will take care of it later.  */
+#define SEC_LINKER_CREATED 0x100000
+
+  /* This section should not be subject to garbage collection.
+     Also set to inform the linker that this section should not be
+     listed in the link map as discarded.  */
+#define SEC_KEEP 0x200000
+
+  /* This section contains "short" data, and should be placed
+     "near" the GP.  */
+#define SEC_SMALL_DATA 0x400000
+
+  /* Attempt to merge identical entities in the section.
+     Entity size is given in the entsize field.  */
+#define SEC_MERGE 0x800000
+
+  /* If given with SEC_MERGE, entities to merge are zero terminated
+     strings where entsize specifies character size instead of fixed
+     size entries.  */
+#define SEC_STRINGS 0x1000000
+
+  /* This section contains data about section groups.  */
+#define SEC_GROUP 0x2000000
+
+  /* The section is a COFF shared library section.  This flag is
+     only for the linker.  If this type of section appears in
+     the input file, the linker must copy it to the output file
+     without changing the vma or size.  FIXME: Although this
+     was originally intended to be general, it really is COFF
+     specific (and the flag was renamed to indicate this).  It
+     might be cleaner to have some more general mechanism to
+     allow the back end to control what the linker does with
+     sections.  */
+#define SEC_COFF_SHARED_LIBRARY 0x4000000
+
+  /* This input section should be copied to output in reverse order
+     as an array of pointers.  This is for ELF linker internal use
+     only.  */
+#define SEC_ELF_REVERSE_COPY 0x4000000
+
+  /* This section contains data which may be shared with other
+     executables or shared objects. This is for COFF only.  */
+#define SEC_COFF_SHARED 0x8000000
+
+  /* This section should be compressed.  This is for ELF linker
+     internal use only.  */
+#define SEC_ELF_COMPRESS 0x8000000
+
+  /* When a section with this flag is being linked, then if the size of
+     the input section is less than a page, it should not cross a page
+     boundary.  If the size of the input section is one page or more,
+     it should be aligned on a page boundary.  This is for TI
+     TMS320C54X only.  */
+#define SEC_TIC54X_BLOCK 0x10000000
+
+  /* This section should be renamed.  This is for ELF linker
+     internal use only.  */
+#define SEC_ELF_RENAME 0x10000000
+
+  /* Conditionally link this section; do not link if there are no
+     references found to any symbol in the section.  This is for TI
+     TMS320C54X only.  */
+#define SEC_TIC54X_CLINK 0x20000000
+
+  /* This section contains vliw code.  This is for Toshiba MeP only.  */
+#define SEC_MEP_VLIW 0x20000000
+
+  /* Indicate that section has the no read flag set. This happens
+     when memory read flag isn't set. */
+#define SEC_COFF_NOREAD 0x40000000
+
+  /* Indicate that section has the purecode flag set.  */
+#define SEC_ELF_PURECODE 0x80000000
+
+  /*  End of section flags.  */
+
+  /* Some internal packed boolean fields.  */
+
+  /* See the vma field.  */
+  unsigned int user_set_vma : 1;
+
+  /* A mark flag used by some of the linker backends.  */
+  unsigned int linker_mark : 1;
+
+  /* Another mark flag used by some of the linker backends.  Set for
+     output sections that have an input section.  */
+  unsigned int linker_has_input : 1;
+
+  /* Mark flag used by some linker backends for garbage collection.  */
+  unsigned int gc_mark : 1;
+
+  /* Section compression status.  */
+  unsigned int compress_status : 2;
+#define COMPRESS_SECTION_NONE    0
+#define COMPRESS_SECTION_DONE    1
+#define DECOMPRESS_SECTION_SIZED 2
+
+  /* The following flags are used by the ELF linker. */
+
+  /* Mark sections which have been allocated to segments.  */
+  unsigned int segment_mark : 1;
+
+  /* Type of sec_info information.  */
+  unsigned int sec_info_type:3;
+#define SEC_INFO_TYPE_NONE      0
+#define SEC_INFO_TYPE_STABS     1
+#define SEC_INFO_TYPE_MERGE     2
+#define SEC_INFO_TYPE_EH_FRAME  3
+#define SEC_INFO_TYPE_JUST_SYMS 4
+#define SEC_INFO_TYPE_TARGET    5
+#define SEC_INFO_TYPE_EH_FRAME_ENTRY 6
+
+  /* Nonzero if this section uses RELA relocations, rather than REL.  */
+  unsigned int use_rela_p:1;
+
+  /* Bits used by various backends.  The generic code doesn't touch
+     these fields.  */
+
+  unsigned int sec_flg0:1;
+  unsigned int sec_flg1:1;
+  unsigned int sec_flg2:1;
+  unsigned int sec_flg3:1;
+  unsigned int sec_flg4:1;
+  unsigned int sec_flg5:1;
+
+  /* End of internal packed boolean fields.  */
+
+  /*  The virtual memory address of the section - where it will be
+      at run time.  The symbols are relocated against this.  The
+      user_set_vma flag is maintained by bfd; if it's not set, the
+      backend can assign addresses (for example, in <<a.out>>, where
+      the default address for <<.data>> is dependent on the specific
+      target and various flags).  */
+  bfd_vma vma;
+
+  /*  The load address of the section - where it would be in a
+      rom image; really only used for writing section header
+      information.  */
+  bfd_vma lma;
+
+  /* The size of the section in *octets*, as it will be output.
+     Contains a value even if the section has no contents (e.g., the
+     size of <<.bss>>).  */
+  bfd_size_type size;
+
+  /* For input sections, the original size on disk of the section, in
+     octets.  This field should be set for any section whose size is
+     changed by linker relaxation.  It is required for sections where
+     the linker relaxation scheme doesn't cache altered section and
+     reloc contents (stabs, eh_frame, SEC_MERGE, some coff relaxing
+     targets), and thus the original size needs to be kept to read the
+     section multiple times.  For output sections, rawsize holds the
+     section size calculated on a previous linker relaxation pass.  */
+  bfd_size_type rawsize;
+
+  /* The compressed size of the section in octets.  */
+  bfd_size_type compressed_size;
+
+  /* Relaxation table. */
+  struct relax_table *relax;
+
+  /* Count of used relaxation table entries. */
+  int relax_count;
+
+
+  /* If this section is going to be output, then this value is the
+     offset in *bytes* into the output section of the first byte in the
+     input section (byte ==> smallest addressable unit on the
+     target).  In most cases, if this was going to start at the
+     100th octet (8-bit quantity) in the output section, this value
+     would be 100.  However, if the target byte size is 16 bits
+     (bfd_octets_per_byte is "2"), this value would be 50.  */
+  bfd_vma output_offset;
+
+  /* The output section through which to map on output.  */
+  struct bfd_section *output_section;
+
+  /* The alignment requirement of the section, as an exponent of 2 -
+     e.g., 3 aligns to 2^3 (or 8).  */
+  unsigned int alignment_power;
+
+  /* If an input section, a pointer to a vector of relocation
+     records for the data in this section.  */
+  struct reloc_cache_entry *relocation;
+
+  /* If an output section, a pointer to a vector of pointers to
+     relocation records for the data in this section.  */
+  struct reloc_cache_entry **orelocation;
+
+  /* The number of relocation records in one of the above.  */
+  unsigned reloc_count;
+
+  /* Information below is back end specific - and not always used
+     or updated.  */
+
+  /* File position of section data.  */
+  file_ptr filepos;
+
+  /* File position of relocation info.  */
+  file_ptr rel_filepos;
+
+  /* File position of line data.  */
+  file_ptr line_filepos;
+
+  /* Pointer to data for applications.  */
+  void *userdata;
+
+  /* If the SEC_IN_MEMORY flag is set, this points to the actual
+     contents.  */
+  unsigned char *contents;
+
+  /* Attached line number information.  */
+  alent *lineno;
+
+  /* Number of line number records.  */
+  unsigned int lineno_count;
+
+  /* Entity size for merging purposes.  */
+  unsigned int entsize;
+
+  /* Points to the kept section if this section is a link-once section,
+     and is discarded.  */
+  struct bfd_section *kept_section;
+
+  /* When a section is being output, this value changes as more
+     linenumbers are written out.  */
+  file_ptr moving_line_filepos;
+
+  /* What the section number is in the target world.  */
+  int target_index;
+
+  void *used_by_bfd;
+
+  /* If this is a constructor section then here is a list of the
+     relocations created to relocate items within it.  */
+  struct relent_chain *constructor_chain;
+
+  /* The BFD which owns the section.  */
+  bfd *owner;
+
+  /* A symbol which points at this section only.  */
+  struct bfd_symbol *symbol;
+  struct bfd_symbol **symbol_ptr_ptr;
+
+  /* Early in the link process, map_head and map_tail are used to build
+     a list of input sections attached to an output section.  Later,
+     output sections use these fields for a list of bfd_link_order
+     structs.  */
+  union {
+    struct bfd_link_order *link_order;
+    struct bfd_section *s;
+  } map_head, map_tail;
+} asection;
+
+/* Relax table contains information about instructions which can
+   be removed by relaxation -- replacing a long address with a
+   short address.  */
+struct relax_table {
+  /* Address where bytes may be deleted. */
+  bfd_vma addr;
+
+  /* Number of bytes to be deleted.  */
+  int size;
+};
+
+/* Note: the following are provided as inline functions rather than macros
+   because not all callers use the return value.  A macro implementation
+   would use a comma expression, e.g. "((ptr)->foo = val, TRUE)", and some
+   compilers will complain about comma expressions that have no effect.  */
+static inline bfd_boolean
+bfd_set_section_userdata (bfd * abfd ATTRIBUTE_UNUSED, asection * ptr, void * val)
+{
+  ptr->userdata = val;  /* Store VAL in the section's application data pointer.  */
+  return TRUE;  /* No failure path; ABFD is not consulted.  */
+}
+
+static inline bfd_boolean
+bfd_set_section_vma (bfd * abfd ATTRIBUTE_UNUSED, asection * ptr, bfd_vma val)
+{
+  ptr->vma = ptr->lma = val;  /* VAL is written to both the VMA and the LMA.  */
+  ptr->user_set_vma = TRUE;  /* Flag the VMA as set; see the vma field.  */
+  return TRUE;  /* No failure path; ABFD is not consulted.  */
+}
+
+static inline bfd_boolean
+bfd_set_section_alignment (bfd * abfd ATTRIBUTE_UNUSED, asection * ptr, unsigned int val)
+{
+  ptr->alignment_power = val;  /* VAL is an exponent of 2 (3 => 8); stored unchecked.  */
+  return TRUE;  /* No failure path; ABFD is not consulted.  */
+}
+
+/* These sections are global, and are managed by BFD.  The application
+   and target back end are not permitted to change the values in
+   these sections.  */
+extern asection _bfd_std_section[4];
+
+#define BFD_ABS_SECTION_NAME "*ABS*"
+#define BFD_UND_SECTION_NAME "*UND*"
+#define BFD_COM_SECTION_NAME "*COM*"
+#define BFD_IND_SECTION_NAME "*IND*"
+
+/* Pointer to the common section.  */
+#define bfd_com_section_ptr (&_bfd_std_section[0])
+/* Pointer to the undefined section.  */
+#define bfd_und_section_ptr (&_bfd_std_section[1])
+/* Pointer to the absolute section.  */
+#define bfd_abs_section_ptr (&_bfd_std_section[2])
+/* Pointer to the indirect section.  */
+#define bfd_ind_section_ptr (&_bfd_std_section[3])
+
+#define bfd_is_und_section(sec) ((sec) == bfd_und_section_ptr)
+#define bfd_is_abs_section(sec) ((sec) == bfd_abs_section_ptr)
+#define bfd_is_ind_section(sec) ((sec) == bfd_ind_section_ptr)
+
+#define bfd_is_const_section(SEC)              \
+ (   ((SEC) == bfd_abs_section_ptr)            \
+  || ((SEC) == bfd_und_section_ptr)            \
+  || ((SEC) == bfd_com_section_ptr)            \
+  || ((SEC) == bfd_ind_section_ptr))
+
+/* Macros to handle insertion and deletion of a bfd's sections.  These
+   only handle the list pointers, i.e. they do not adjust section_count,
+   target_index, etc.  */
+#define bfd_section_list_remove(ABFD, S) \
+  do                                                   \
+    {                                                  \
+      asection *_s = S;                                \
+      asection *_next = _s->next;                      \
+      asection *_prev = _s->prev;                      \
+      if (_prev)                                       \
+        _prev->next = _next;                           \
+      else                                             \
+        (ABFD)->sections = _next;                      \
+      if (_next)                                       \
+        _next->prev = _prev;                           \
+      else                                             \
+        (ABFD)->section_last = _prev;                  \
+    }                                                  \
+  while (0)
+#define bfd_section_list_append(ABFD, S) \
+  do                                                   \
+    {                                                  \
+      asection *_s = S;                                \
+      bfd *_abfd = ABFD;                               \
+      _s->next = NULL;                                 \
+      if (_abfd->section_last)                         \
+        {                                              \
+          _s->prev = _abfd->section_last;              \
+          _abfd->section_last->next = _s;              \
+        }                                              \
+      else                                             \
+        {                                              \
+          _s->prev = NULL;                             \
+          _abfd->sections = _s;                        \
+        }                                              \
+      _abfd->section_last = _s;                        \
+    }                                                  \
+  while (0)
+#define bfd_section_list_prepend(ABFD, S) \
+  do                                                   \
+    {                                                  \
+      asection *_s = S;                                \
+      bfd *_abfd = ABFD;                               \
+      _s->prev = NULL;                                 \
+      if (_abfd->sections)                             \
+        {                                              \
+          _s->next = _abfd->sections;                  \
+          _abfd->sections->prev = _s;                  \
+        }                                              \
+      else                                             \
+        {                                              \
+          _s->next = NULL;                             \
+          _abfd->section_last = _s;                    \
+        }                                              \
+      _abfd->sections = _s;                            \
+    }                                                  \
+  while (0)
+#define bfd_section_list_insert_after(ABFD, A, S) \
+  do                                                   \
+    {                                                  \
+      asection *_a = A;                                \
+      asection *_s = S;                                \
+      asection *_next = _a->next;                      \
+      _s->next = _next;                                \
+      _s->prev = _a;                                   \
+      _a->next = _s;                                   \
+      if (_next)                                       \
+        _next->prev = _s;                              \
+      else                                             \
+        (ABFD)->section_last = _s;                     \
+    }                                                  \
+  while (0)
+#define bfd_section_list_insert_before(ABFD, B, S) \
+  do                                                   \
+    {                                                  \
+      asection *_b = B;                                \
+      asection *_s = S;                                \
+      asection *_prev = _b->prev;                      \
+      _s->prev = _prev;                                \
+      _s->next = _b;                                   \
+      _b->prev = _s;                                   \
+      if (_prev)                                       \
+        _prev->next = _s;                              \
+      else                                             \
+        (ABFD)->sections = _s;                         \
+    }                                                  \
+  while (0)
+#define bfd_section_removed_from_list(ABFD, S) \
+  ((S)->next == NULL ? (ABFD)->section_last != (S) : (S)->next->prev != (S))
+
+#define BFD_FAKE_SECTION(SEC, SYM, NAME, IDX, FLAGS)                   \
+  /* name, id,  index, next, prev, flags, user_set_vma,            */  \
+  {  NAME, IDX, 0,     NULL, NULL, FLAGS, 0,                           \
+                                                                       \
+  /* linker_mark, linker_has_input, gc_mark, compress_status,      */  \
+     0,           0,                1,       0,                        \
+                                                                       \
+  /* segment_mark, sec_info_type, use_rela_p,                      */  \
+     0,            0,             0,                                   \
+                                                                       \
+  /* sec_flg0, sec_flg1, sec_flg2, sec_flg3, sec_flg4, sec_flg5,   */  \
+     0,        0,        0,        0,        0,        0,              \
+                                                                       \
+  /* vma, lma, size, rawsize, compressed_size, relax, relax_count, */  \
+     0,   0,   0,    0,       0,               0,     0,               \
+                                                                       \
+  /* output_offset, output_section, alignment_power,               */  \
+     0,             &SEC,           0,                                 \
+                                                                       \
+  /* relocation, orelocation, reloc_count, filepos, rel_filepos,   */  \
+     NULL,       NULL,        0,           0,       0,                 \
+                                                                       \
+  /* line_filepos, userdata, contents, lineno, lineno_count,       */  \
+     0,            NULL,     NULL,     NULL,   0,                      \
+                                                                       \
+  /* entsize, kept_section, moving_line_filepos,                    */ \
+     0,       NULL,          0,                                        \
+                                                                       \
+  /* target_index, used_by_bfd, constructor_chain, owner,          */  \
+     0,            NULL,        NULL,              NULL,               \
+                                                                       \
+  /* symbol,                    symbol_ptr_ptr,                    */  \
+     (struct bfd_symbol *) SYM, &SEC.symbol,                           \
+                                                                       \
+  /* map_head, map_tail                                            */  \
+     { NULL }, { NULL }                                                \
+    }
+
+void bfd_section_list_clear (bfd *);
+
+asection *bfd_get_section_by_name (bfd *abfd, const char *name);
+
+asection *bfd_get_next_section_by_name (bfd *ibfd, asection *sec);
+
+asection *bfd_get_linker_section (bfd *abfd, const char *name);
+
+asection *bfd_get_section_by_name_if
+   (bfd *abfd,
+    const char *name,
+    bfd_boolean (*func) (bfd *abfd, asection *sect, void *obj),
+    void *obj);
+
+char *bfd_get_unique_section_name
+   (bfd *abfd, const char *templat, int *count);
+
+asection *bfd_make_section_old_way (bfd *abfd, const char *name);
+
+asection *bfd_make_section_anyway_with_flags
+   (bfd *abfd, const char *name, flagword flags);
+
+asection *bfd_make_section_anyway (bfd *abfd, const char *name);
+
+asection *bfd_make_section_with_flags
+   (bfd *, const char *name, flagword flags);
+
+asection *bfd_make_section (bfd *, const char *name);
+
+int bfd_get_next_section_id (void);
+
+bfd_boolean bfd_set_section_flags
+   (bfd *abfd, asection *sec, flagword flags);
+
+void bfd_rename_section
+   (bfd *abfd, asection *sec, const char *newname);
+
+void bfd_map_over_sections
+   (bfd *abfd,
+    void (*func) (bfd *abfd, asection *sect, void *obj),
+    void *obj);
+
+asection *bfd_sections_find_if
+   (bfd *abfd,
+    bfd_boolean (*operation) (bfd *abfd, asection *sect, void *obj),
+    void *obj);
+
+bfd_boolean bfd_set_section_size
+   (bfd *abfd, asection *sec, bfd_size_type val);
+
+bfd_boolean bfd_set_section_contents
+   (bfd *abfd, asection *section, const void *data,
+    file_ptr offset, bfd_size_type count);
+
+bfd_boolean bfd_get_section_contents
+   (bfd *abfd, asection *section, void *location, file_ptr offset,
+    bfd_size_type count);
+
+bfd_boolean bfd_malloc_and_get_section
+   (bfd *abfd, asection *section, bfd_byte **buf);
+
+bfd_boolean bfd_copy_private_section_data
+   (bfd *ibfd, asection *isec, bfd *obfd, asection *osec);
+
+#define bfd_copy_private_section_data(ibfd, isection, obfd, osection) \
+     BFD_SEND (obfd, _bfd_copy_private_section_data, \
+               (ibfd, isection, obfd, osection))
+bfd_boolean bfd_generic_is_group_section (bfd *, const asection *sec);
+
+bfd_boolean bfd_generic_discard_group (bfd *abfd, asection *group);
+
+/* Extracted from archures.c.  */
+enum bfd_architecture
+{  /* bfd_arch_* enumerators; most are followed by bfd_mach_* macros named for that architecture.  */
+  bfd_arch_unknown,   /* File arch not known.  */
+  bfd_arch_obscure,   /* Arch known, not one of these.  */
+  bfd_arch_m68k,      /* Motorola 68xxx */
+#define bfd_mach_m68000 1
+#define bfd_mach_m68008 2
+#define bfd_mach_m68010 3
+#define bfd_mach_m68020 4
+#define bfd_mach_m68030 5
+#define bfd_mach_m68040 6
+#define bfd_mach_m68060 7
+#define bfd_mach_cpu32  8
+#define bfd_mach_fido   9
+#define bfd_mach_mcf_isa_a_nodiv 10
+#define bfd_mach_mcf_isa_a 11
+#define bfd_mach_mcf_isa_a_mac 12
+#define bfd_mach_mcf_isa_a_emac 13
+#define bfd_mach_mcf_isa_aplus 14
+#define bfd_mach_mcf_isa_aplus_mac 15
+#define bfd_mach_mcf_isa_aplus_emac 16
+#define bfd_mach_mcf_isa_b_nousp 17
+#define bfd_mach_mcf_isa_b_nousp_mac 18
+#define bfd_mach_mcf_isa_b_nousp_emac 19
+#define bfd_mach_mcf_isa_b 20
+#define bfd_mach_mcf_isa_b_mac 21
+#define bfd_mach_mcf_isa_b_emac 22
+#define bfd_mach_mcf_isa_b_float 23
+#define bfd_mach_mcf_isa_b_float_mac 24
+#define bfd_mach_mcf_isa_b_float_emac 25
+#define bfd_mach_mcf_isa_c 26
+#define bfd_mach_mcf_isa_c_mac 27
+#define bfd_mach_mcf_isa_c_emac 28
+#define bfd_mach_mcf_isa_c_nodiv 29
+#define bfd_mach_mcf_isa_c_nodiv_mac 30
+#define bfd_mach_mcf_isa_c_nodiv_emac 31
+  bfd_arch_vax,       /* DEC Vax */
+  bfd_arch_i960,      /* Intel 960 */
+    /* The order of the following is important.
+       A lower number indicates a machine type that
+       only accepts a subset of the instructions
+       available to machines with higher numbers.
+       The exception is the "ca", which is
+       incompatible with all other machines except
+       "core".  */
+
+#define bfd_mach_i960_core      1
+#define bfd_mach_i960_ka_sa     2
+#define bfd_mach_i960_kb_sb     3
+#define bfd_mach_i960_mc        4
+#define bfd_mach_i960_xa        5
+#define bfd_mach_i960_ca        6
+#define bfd_mach_i960_jx        7
+#define bfd_mach_i960_hx        8
+
+  bfd_arch_or1k,      /* OpenRISC 1000 */
+#define bfd_mach_or1k           1
+#define bfd_mach_or1knd         2
+
+  bfd_arch_sparc,     /* SPARC */
+#define bfd_mach_sparc                 1
+/* The difference between v8plus and v9 is that v9 is a true 64 bit env.  */
+#define bfd_mach_sparc_sparclet        2
+#define bfd_mach_sparc_sparclite       3
+#define bfd_mach_sparc_v8plus          4
+#define bfd_mach_sparc_v8plusa         5 /* with ultrasparc add'ns.  */
+#define bfd_mach_sparc_sparclite_le    6
+#define bfd_mach_sparc_v9              7
+#define bfd_mach_sparc_v9a             8 /* with ultrasparc add'ns.  */
+#define bfd_mach_sparc_v8plusb         9 /* with cheetah add'ns.  */
+#define bfd_mach_sparc_v9b             10 /* with cheetah add'ns.  */
+#define bfd_mach_sparc_v8plusc         11 /* with UA2005 and T1 add'ns.  */
+#define bfd_mach_sparc_v9c             12 /* with UA2005 and T1 add'ns.  */
+#define bfd_mach_sparc_v8plusd         13 /* with UA2007 and T3 add'ns.  */
+#define bfd_mach_sparc_v9d             14 /* with UA2007 and T3 add'ns.  */
+#define bfd_mach_sparc_v8pluse         15 /* with OSA2001 and T4 add'ns (no IMA).  */
+#define bfd_mach_sparc_v9e             16 /* with OSA2001 and T4 add'ns (no IMA).  */
+#define bfd_mach_sparc_v8plusv         17 /* with OSA2011 and T4 and IMA and FJMAU add'ns.  */
+#define bfd_mach_sparc_v9v             18 /* with OSA2011 and T4 and IMA and FJMAU add'ns.  */
+#define bfd_mach_sparc_v8plusm         19 /* with OSA2015 and M7 add'ns.  */
+#define bfd_mach_sparc_v9m             20 /* with OSA2015 and M7 add'ns.  */
+/* Nonzero if MACH has the v9 instruction set.  */
+#define bfd_mach_sparc_v9_p(mach) \
+  ((mach) >= bfd_mach_sparc_v8plus && (mach) <= bfd_mach_sparc_v9m \
+   && (mach) != bfd_mach_sparc_sparclite_le)
+/* Nonzero if MACH is a 64 bit sparc architecture.  */
+#define bfd_mach_sparc_64bit_p(mach) \
+  ((mach) >= bfd_mach_sparc_v9 \
+   && (mach) != bfd_mach_sparc_v8plusb \
+   && (mach) != bfd_mach_sparc_v8plusc \
+   && (mach) != bfd_mach_sparc_v8plusd \
+   && (mach) != bfd_mach_sparc_v8pluse \
+   && (mach) != bfd_mach_sparc_v8plusv \
+   && (mach) != bfd_mach_sparc_v8plusm)
+  bfd_arch_spu,       /* PowerPC SPU */
+#define bfd_mach_spu           256
+  bfd_arch_mips,      /* MIPS Rxxxx */
+#define bfd_mach_mips3000              3000
+#define bfd_mach_mips3900              3900
+#define bfd_mach_mips4000              4000
+#define bfd_mach_mips4010              4010
+#define bfd_mach_mips4100              4100
+#define bfd_mach_mips4111              4111
+#define bfd_mach_mips4120              4120
+#define bfd_mach_mips4300              4300
+#define bfd_mach_mips4400              4400
+#define bfd_mach_mips4600              4600
+#define bfd_mach_mips4650              4650
+#define bfd_mach_mips5000              5000
+#define bfd_mach_mips5400              5400
+#define bfd_mach_mips5500              5500
+#define bfd_mach_mips5900              5900
+#define bfd_mach_mips6000              6000
+#define bfd_mach_mips7000              7000
+#define bfd_mach_mips8000              8000
+#define bfd_mach_mips9000              9000
+#define bfd_mach_mips10000             10000
+#define bfd_mach_mips12000             12000
+#define bfd_mach_mips14000             14000
+#define bfd_mach_mips16000             16000
+#define bfd_mach_mips16                16
+#define bfd_mach_mips5                 5
+#define bfd_mach_mips_loongson_2e      3001
+#define bfd_mach_mips_loongson_2f      3002
+#define bfd_mach_mips_loongson_3a      3003
+#define bfd_mach_mips_sb1              12310201 /* Decimal digits spell octal 'S' (0123), 'B' (0102), then 01.  */
+#define bfd_mach_mips_octeon           6501
+#define bfd_mach_mips_octeonp          6601
+#define bfd_mach_mips_octeon2          6502
+#define bfd_mach_mips_octeon3          6503
+#define bfd_mach_mips_xlr              887682   /* Decimal ASCII codes of 'X' (88), 'L' (76), 'R' (82).  */
+#define bfd_mach_mipsisa32             32
+#define bfd_mach_mipsisa32r2           33
+#define bfd_mach_mipsisa32r3           34
+#define bfd_mach_mipsisa32r5           36
+#define bfd_mach_mipsisa32r6           37
+#define bfd_mach_mipsisa64             64
+#define bfd_mach_mipsisa64r2           65
+#define bfd_mach_mipsisa64r3           66
+#define bfd_mach_mipsisa64r5           68
+#define bfd_mach_mipsisa64r6           69
+#define bfd_mach_mips_micromips        96
+  bfd_arch_i386,      /* Intel 386 */
+#define bfd_mach_i386_intel_syntax     (1 << 0)
+#define bfd_mach_i386_i8086            (1 << 1)
+#define bfd_mach_i386_i386             (1 << 2)
+#define bfd_mach_x86_64                (1 << 3)
+#define bfd_mach_x64_32                (1 << 4)
+#define bfd_mach_i386_i386_intel_syntax (bfd_mach_i386_i386 | bfd_mach_i386_intel_syntax)
+#define bfd_mach_x86_64_intel_syntax   (bfd_mach_x86_64 | bfd_mach_i386_intel_syntax)
+#define bfd_mach_x64_32_intel_syntax   (bfd_mach_x64_32 | bfd_mach_i386_intel_syntax)
+  bfd_arch_l1om,   /* Intel L1OM */
+#define bfd_mach_l1om                  (1 << 5)
+#define bfd_mach_l1om_intel_syntax     (bfd_mach_l1om | bfd_mach_i386_intel_syntax)
+  bfd_arch_k1om,   /* Intel K1OM */
+#define bfd_mach_k1om                  (1 << 6)
+#define bfd_mach_k1om_intel_syntax     (bfd_mach_k1om | bfd_mach_i386_intel_syntax)
+#define bfd_mach_i386_nacl             (1 << 7)
+#define bfd_mach_i386_i386_nacl        (bfd_mach_i386_i386 | bfd_mach_i386_nacl)
+#define bfd_mach_x86_64_nacl           (bfd_mach_x86_64 | bfd_mach_i386_nacl)
+#define bfd_mach_x64_32_nacl           (bfd_mach_x64_32 | bfd_mach_i386_nacl)
+  bfd_arch_iamcu,   /* Intel MCU */
+#define bfd_mach_iamcu                 (1 << 8)
+#define bfd_mach_i386_iamcu            (bfd_mach_i386_i386 | bfd_mach_iamcu)
+#define bfd_mach_i386_iamcu_intel_syntax (bfd_mach_i386_iamcu | bfd_mach_i386_intel_syntax)
+  bfd_arch_we32k,     /* AT&T WE32xxx */
+  bfd_arch_tahoe,     /* CCI/Harris Tahoe */
+  bfd_arch_i860,      /* Intel 860 */
+  bfd_arch_i370,      /* IBM 360/370 Mainframes */
+  bfd_arch_romp,      /* IBM ROMP PC/RT */
+  bfd_arch_convex,    /* Convex */
+  bfd_arch_m88k,      /* Motorola 88xxx */
+  bfd_arch_m98k,      /* Motorola 98xxx */
+  bfd_arch_pyramid,   /* Pyramid Technology */
+  bfd_arch_h8300,     /* Renesas H8/300 (formerly Hitachi H8/300) */
+#define bfd_mach_h8300    1
+#define bfd_mach_h8300h   2
+#define bfd_mach_h8300s   3
+#define bfd_mach_h8300hn  4
+#define bfd_mach_h8300sn  5
+#define bfd_mach_h8300sx  6
+#define bfd_mach_h8300sxn 7
+  bfd_arch_pdp11,     /* DEC PDP-11 */
+  bfd_arch_plugin,
+  bfd_arch_powerpc,   /* PowerPC */
+#define bfd_mach_ppc           32
+#define bfd_mach_ppc64         64
+#define bfd_mach_ppc_403       403
+#define bfd_mach_ppc_403gc     4030
+#define bfd_mach_ppc_405       405
+#define bfd_mach_ppc_505       505
+#define bfd_mach_ppc_601       601
+#define bfd_mach_ppc_602       602
+#define bfd_mach_ppc_603       603
+#define bfd_mach_ppc_ec603e    6031
+#define bfd_mach_ppc_604       604
+#define bfd_mach_ppc_620       620
+#define bfd_mach_ppc_630       630
+#define bfd_mach_ppc_750       750
+#define bfd_mach_ppc_860       860
+#define bfd_mach_ppc_a35       35
+#define bfd_mach_ppc_rs64ii    642
+#define bfd_mach_ppc_rs64iii   643
+#define bfd_mach_ppc_7400      7400
+#define bfd_mach_ppc_e500      500
+#define bfd_mach_ppc_e500mc    5001
+#define bfd_mach_ppc_e500mc64  5005
+#define bfd_mach_ppc_e5500     5006
+#define bfd_mach_ppc_e6500     5007
+#define bfd_mach_ppc_titan     83
+#define bfd_mach_ppc_vle       84
+  bfd_arch_rs6000,    /* IBM RS/6000 */
+#define bfd_mach_rs6k          6000
+#define bfd_mach_rs6k_rs1      6001
+#define bfd_mach_rs6k_rsc      6003
+#define bfd_mach_rs6k_rs2      6002
+  bfd_arch_hppa,      /* HP PA RISC */
+#define bfd_mach_hppa10        10
+#define bfd_mach_hppa11        11
+#define bfd_mach_hppa20        20
+#define bfd_mach_hppa20w       25
+  bfd_arch_d10v,      /* Mitsubishi D10V */
+#define bfd_mach_d10v          1
+#define bfd_mach_d10v_ts2      2
+#define bfd_mach_d10v_ts3      3
+  bfd_arch_d30v,      /* Mitsubishi D30V */
+  bfd_arch_dlx,       /* DLX */
+  bfd_arch_m68hc11,   /* Motorola 68HC11 */
+  bfd_arch_m68hc12,   /* Motorola 68HC12 */
+#define bfd_mach_m6812_default 0
+#define bfd_mach_m6812         1
+#define bfd_mach_m6812s        2
+  bfd_arch_m9s12x,   /* Freescale S12X */
+  bfd_arch_m9s12xg,  /* Freescale XGATE */
+  bfd_arch_z8k,       /* Zilog Z8000 */
+#define bfd_mach_z8001         1
+#define bfd_mach_z8002         2
+  bfd_arch_h8500,     /* Renesas H8/500 (formerly Hitachi H8/500) */
+  bfd_arch_sh,        /* Renesas / SuperH SH (formerly Hitachi SH) */
+#define bfd_mach_sh            1
+#define bfd_mach_sh2        0x20
+#define bfd_mach_sh_dsp     0x2d
+#define bfd_mach_sh2a       0x2a
+#define bfd_mach_sh2a_nofpu 0x2b
+#define bfd_mach_sh2a_nofpu_or_sh4_nommu_nofpu 0x2a1
+#define bfd_mach_sh2a_nofpu_or_sh3_nommu 0x2a2
+#define bfd_mach_sh2a_or_sh4  0x2a3
+#define bfd_mach_sh2a_or_sh3e 0x2a4
+#define bfd_mach_sh2e       0x2e
+#define bfd_mach_sh3        0x30
+#define bfd_mach_sh3_nommu  0x31
+#define bfd_mach_sh3_dsp    0x3d
+#define bfd_mach_sh3e       0x3e
+#define bfd_mach_sh4        0x40
+#define bfd_mach_sh4_nofpu  0x41
+#define bfd_mach_sh4_nommu_nofpu  0x42
+#define bfd_mach_sh4a       0x4a
+#define bfd_mach_sh4a_nofpu 0x4b
+#define bfd_mach_sh4al_dsp  0x4d
+#define bfd_mach_sh5        0x50
+  bfd_arch_alpha,     /* Dec Alpha */
+#define bfd_mach_alpha_ev4  0x10
+#define bfd_mach_alpha_ev5  0x20
+#define bfd_mach_alpha_ev6  0x30
+  bfd_arch_arm,       /* Advanced Risc Machines ARM.  */
+#define bfd_mach_arm_unknown   0
+#define bfd_mach_arm_2         1
+#define bfd_mach_arm_2a        2
+#define bfd_mach_arm_3         3
+#define bfd_mach_arm_3M        4
+#define bfd_mach_arm_4         5
+#define bfd_mach_arm_4T        6
+#define bfd_mach_arm_5         7
+#define bfd_mach_arm_5T        8
+#define bfd_mach_arm_5TE       9
+#define bfd_mach_arm_XScale    10
+#define bfd_mach_arm_ep9312    11
+#define bfd_mach_arm_iWMMXt    12
+#define bfd_mach_arm_iWMMXt2   13
+  bfd_arch_nds32,     /* Andes NDS32 */
+#define bfd_mach_n1            1
+#define bfd_mach_n1h           2
+#define bfd_mach_n1h_v2        3
+#define bfd_mach_n1h_v3        4
+#define bfd_mach_n1h_v3m       5
+  bfd_arch_ns32k,     /* National Semiconductors ns32000 */
+  bfd_arch_w65,       /* WDC 65816 */
+  bfd_arch_tic30,     /* Texas Instruments TMS320C30 */
+  bfd_arch_tic4x,     /* Texas Instruments TMS320C3X/4X */
+#define bfd_mach_tic3x         30
+#define bfd_mach_tic4x         40
+  bfd_arch_tic54x,    /* Texas Instruments TMS320C54X */
+  bfd_arch_tic6x,     /* Texas Instruments TMS320C6X */
+  bfd_arch_tic80,     /* TI TMS320c80 (MVP) */
+  bfd_arch_v850,      /* NEC V850 */
+  bfd_arch_v850_rh850,/* NEC V850 (using RH850 ABI) */
+#define bfd_mach_v850          1
+#define bfd_mach_v850e         'E'
+#define bfd_mach_v850e1        '1'
+#define bfd_mach_v850e2        0x4532 /* ASCII 'E','2' packed one byte each.  */
+#define bfd_mach_v850e2v3      0x45325633 /* ASCII 'E','2','V','3' packed one byte each.  */
+#define bfd_mach_v850e3v5      0x45335635 /* ASCII 'E','3','V','5' packed one byte each.  */
+  bfd_arch_arc,       /* ARC Cores */
+#define bfd_mach_arc_a4        0
+#define bfd_mach_arc_a5        1
+#define bfd_mach_arc_arc600    2
+#define bfd_mach_arc_arc601    4
+#define bfd_mach_arc_arc700    3
+#define bfd_mach_arc_arcv2     5
+ bfd_arch_m32c,     /* Renesas M16C/M32C.  */
+#define bfd_mach_m16c        0x75
+#define bfd_mach_m32c        0x78
+  bfd_arch_m32r,      /* Renesas M32R (formerly Mitsubishi M32R/D) */
+#define bfd_mach_m32r          1 /* For backwards compatibility.  */
+#define bfd_mach_m32rx         'x'
+#define bfd_mach_m32r2         '2'
+  bfd_arch_mn10200,   /* Matsushita MN10200 */
+  bfd_arch_mn10300,   /* Matsushita MN10300 */
+#define bfd_mach_mn10300               300
+#define bfd_mach_am33          330
+#define bfd_mach_am33_2        332
+  bfd_arch_fr30,
+#define bfd_mach_fr30          0x46523330 /* ASCII 'F','R','3','0' packed one byte each.  */
+  bfd_arch_frv,
+#define bfd_mach_frv           1
+#define bfd_mach_frvsimple     2
+#define bfd_mach_fr300         300
+#define bfd_mach_fr400         400
+#define bfd_mach_fr450         450
+#define bfd_mach_frvtomcat     499     /* fr500 prototype */
+#define bfd_mach_fr500         500
+#define bfd_mach_fr550         550
+  bfd_arch_moxie,       /* The moxie processor */
+#define bfd_mach_moxie         1
+  bfd_arch_ft32,       /* The ft32 processor */
+#define bfd_mach_ft32          1
+  bfd_arch_mcore,
+  bfd_arch_mep,
+#define bfd_mach_mep           1
+#define bfd_mach_mep_h1        0x6831 /* ASCII 'h','1' packed one byte each.  */
+#define bfd_mach_mep_c5        0x6335 /* ASCII 'c','5' packed one byte each.  */
+  bfd_arch_metag,
+#define bfd_mach_metag         1
+  bfd_arch_ia64,      /* HP/Intel ia64 */
+#define bfd_mach_ia64_elf64    64
+#define bfd_mach_ia64_elf32    32
+  bfd_arch_ip2k,      /* Ubicom IP2K microcontrollers. */
+#define bfd_mach_ip2022        1
+#define bfd_mach_ip2022ext     2
+ bfd_arch_iq2000,     /* Vitesse IQ2000.  */
+#define bfd_mach_iq2000        1
+#define bfd_mach_iq10          2
+  bfd_arch_epiphany,   /* Adapteva EPIPHANY */
+#define bfd_mach_epiphany16    1
+#define bfd_mach_epiphany32    2
+  bfd_arch_mt,
+#define bfd_mach_ms1           1
+#define bfd_mach_mrisc2        2
+#define bfd_mach_ms2           3
+  bfd_arch_pj,
+  bfd_arch_avr,       /* Atmel AVR microcontrollers.  */
+#define bfd_mach_avr1          1
+#define bfd_mach_avr2          2
+#define bfd_mach_avr25         25
+#define bfd_mach_avr3          3
+#define bfd_mach_avr31         31
+#define bfd_mach_avr35         35
+#define bfd_mach_avr4          4
+#define bfd_mach_avr5          5
+#define bfd_mach_avr51         51
+#define bfd_mach_avr6          6
+#define bfd_mach_avrtiny   100
+#define bfd_mach_avrxmega1 101
+#define bfd_mach_avrxmega2 102
+#define bfd_mach_avrxmega3 103
+#define bfd_mach_avrxmega4 104
+#define bfd_mach_avrxmega5 105
+#define bfd_mach_avrxmega6 106
+#define bfd_mach_avrxmega7 107
+  bfd_arch_bfin,        /* ADI Blackfin */
+#define bfd_mach_bfin          1
+  bfd_arch_cr16,       /* National Semiconductor CompactRISC (ie CR16). */
+#define bfd_mach_cr16          1
+  bfd_arch_cr16c,       /* National Semiconductor CompactRISC. */
+#define bfd_mach_cr16c         1
+  bfd_arch_crx,       /*  National Semiconductor CRX.  */
+#define bfd_mach_crx           1
+  bfd_arch_cris,      /* Axis CRIS */
+#define bfd_mach_cris_v0_v10   255
+#define bfd_mach_cris_v32      32
+#define bfd_mach_cris_v10_v32  1032
+  bfd_arch_riscv,
+#define bfd_mach_riscv32       132
+#define bfd_mach_riscv64       164
+  bfd_arch_rl78,
+#define bfd_mach_rl78  0x75
+  bfd_arch_rx,        /* Renesas RX.  */
+#define bfd_mach_rx            0x75
+  bfd_arch_s390,      /* IBM s390 */
+#define bfd_mach_s390_31       31
+#define bfd_mach_s390_64       64
+  bfd_arch_score,     /* Sunplus score */
+#define bfd_mach_score3         3
+#define bfd_mach_score7         7
+  bfd_arch_mmix,      /* Donald Knuth's educational processor.  */
+  bfd_arch_xstormy16,
+#define bfd_mach_xstormy16     1
+  bfd_arch_msp430,    /* Texas Instruments MSP430 architecture.  */
+#define bfd_mach_msp11          11
+#define bfd_mach_msp110         110
+#define bfd_mach_msp12          12
+#define bfd_mach_msp13          13
+#define bfd_mach_msp14          14
+#define bfd_mach_msp15          15
+#define bfd_mach_msp16          16
+#define bfd_mach_msp20          20
+#define bfd_mach_msp21          21
+#define bfd_mach_msp22          22
+#define bfd_mach_msp23          23
+#define bfd_mach_msp24          24
+#define bfd_mach_msp26          26
+#define bfd_mach_msp31          31
+#define bfd_mach_msp32          32
+#define bfd_mach_msp33          33
+#define bfd_mach_msp41          41
+#define bfd_mach_msp42          42
+#define bfd_mach_msp43          43
+#define bfd_mach_msp44          44
+#define bfd_mach_msp430x        45
+#define bfd_mach_msp46          46
+#define bfd_mach_msp47          47
+#define bfd_mach_msp54          54
+  bfd_arch_xc16x,     /* Infineon's XC16X Series.  */
+#define bfd_mach_xc16x         1
+#define bfd_mach_xc16xl        2
+#define bfd_mach_xc16xs        3
+  bfd_arch_xgate,   /* Freescale XGATE */
+#define bfd_mach_xgate         1
+  bfd_arch_xtensa,    /* Tensilica's Xtensa cores.  */
+#define bfd_mach_xtensa        1
+  bfd_arch_z80,
+#define bfd_mach_z80strict      1 /* No undocumented opcodes.  */
+#define bfd_mach_z80            3 /* With ixl, ixh, iyl, and iyh.  */
+#define bfd_mach_z80full        7 /* All undocumented instructions.  */
+#define bfd_mach_r800           11 /* R800: successor with multiplication.  */
+  bfd_arch_lm32,      /* Lattice Mico32 */
+#define bfd_mach_lm32      1
+  bfd_arch_microblaze,/* Xilinx MicroBlaze. */
+  bfd_arch_tilepro,   /* Tilera TILEPro */
+  bfd_arch_tilegx, /* Tilera TILE-Gx */
+#define bfd_mach_tilepro   1 /* Named for bfd_arch_tilepro, though listed after bfd_arch_tilegx.  */
+#define bfd_mach_tilegx    1
+#define bfd_mach_tilegx32  2
+  bfd_arch_aarch64,   /* AArch64  */
+#define bfd_mach_aarch64 0
+#define bfd_mach_aarch64_ilp32 32
+  bfd_arch_nios2,      /* Nios II */
+#define bfd_mach_nios2         0
+#define bfd_mach_nios2r1       1
+#define bfd_mach_nios2r2       2
+  bfd_arch_visium,     /* Visium */
+#define bfd_mach_visium        1
+  bfd_arch_last  /* Final enumerator; its value equals the number of enumerators above it.  */
+  };
+
+typedef struct bfd_arch_info
+{
+  int bits_per_word;
+  int bits_per_address;
+  int bits_per_byte;
+  enum bfd_architecture arch;  /* A bfd_arch_* enumerator.  */
+  unsigned long mach;  /* Presumably one of ARCH's bfd_mach_* values -- confirm in archures.c.  */
+  const char *arch_name;
+  const char *printable_name;
+  unsigned int section_align_power;
+  /* TRUE if this is the default machine for the architecture.
+     The default arch should be the first entry for an arch so that
+     all the entries for that arch can be accessed via <<next>>.  */
+  bfd_boolean the_default;
+  const struct bfd_arch_info * (*compatible)
+    (const struct bfd_arch_info *a, const struct bfd_arch_info *b);  /* NOTE(review): presumably NULL when A and B are incompatible -- confirm.  */
+
+  bfd_boolean (*scan) (const struct bfd_arch_info *, const char *);  /* NOTE(review): presumably tests whether the string names this entry -- confirm.  */
+
+  /* Allocate via bfd_malloc and return a fill buffer of size COUNT.  If
+     IS_BIGENDIAN is TRUE, the order of bytes is big endian.  If CODE is
+     TRUE, the buffer contains code.  */
+  void *(*fill) (bfd_size_type count, bfd_boolean is_bigendian,
+                 bfd_boolean code);
+
+  const struct bfd_arch_info *next;  /* Link to the following entry; see the_default above.  */
+}
+bfd_arch_info_type;
+
+const char *bfd_printable_name (bfd *abfd);
+
+const bfd_arch_info_type *bfd_scan_arch (const char *string);
+
+const char **bfd_arch_list (void);  /* NOTE(review): ownership of the returned array is not visible here -- check archures.c before freeing.  */
+
+const bfd_arch_info_type *bfd_arch_get_compatible
+   (const bfd *abfd, const bfd *bbfd, bfd_boolean accept_unknowns);
+
+void bfd_set_arch_info (bfd *abfd, const bfd_arch_info_type *arg);
+
+bfd_boolean bfd_default_set_arch_mach
+   (bfd *abfd, enum bfd_architecture arch, unsigned long mach);  /* ARCH and MACH have the same types as bfd_arch_info.arch and .mach.  */
+
+enum bfd_architecture bfd_get_arch (bfd *abfd);
+
+unsigned long bfd_get_mach (bfd *abfd);
+
+unsigned int bfd_arch_bits_per_byte (bfd *abfd);
+
+unsigned int bfd_arch_bits_per_address (bfd *abfd);
+
+const bfd_arch_info_type *bfd_get_arch_info (bfd *abfd);
+
+const bfd_arch_info_type *bfd_lookup_arch
+   (enum bfd_architecture arch, unsigned long machine);  /* MACHINE has the same type as bfd_arch_info.mach.  */
+
+const char *bfd_printable_arch_mach
+   (enum bfd_architecture arch, unsigned long machine);  /* MACHINE has the same type as bfd_arch_info.mach.  */
+
+unsigned int bfd_octets_per_byte (bfd *abfd);
+
+unsigned int bfd_arch_mach_octets_per_byte
+   (enum bfd_architecture arch, unsigned long machine);  /* Variant of bfd_octets_per_byte taking an arch/machine pair instead of a bfd.  */
+
+/* Extracted from reloc.c.  */
+
+typedef enum bfd_reloc_status
+{
+  /* No errors detected.  */
+  bfd_reloc_ok,  /* First enumerator, so its value is 0.  */
+
+  /* The relocation was performed, but there was an overflow.  */
+  bfd_reloc_overflow,
+
+  /* The address to relocate was not within the section supplied.  */
+  bfd_reloc_outofrange,
+
+  /* Used by special functions.  */
+  bfd_reloc_continue,
+
+  /* Unsupported relocation size requested.  */
+  bfd_reloc_notsupported,
+
+  /* Unused.  */
+  bfd_reloc_other,
+
+  /* The symbol to relocate against was undefined.  */
+  bfd_reloc_undefined,
+
+  /* The relocation was performed, but may not be ok - presently
+     generated only when linking i960 coff files with i960 b.out
+     symbols.  If this type is returned, the error_message argument
+     to bfd_perform_relocation will be set.  */
+  bfd_reloc_dangerous
+ }
+ bfd_reloc_status_type;  /* Returned by bfd_check_overflow, bfd_perform_relocation, bfd_install_relocation and howto special_function.  */
+
+
+typedef struct reloc_cache_entry
+{
+  /* A pointer into the canonical table of pointers.  */
+  struct bfd_symbol **sym_ptr_ptr;
+
+  /* Offset in section.  */
+  bfd_size_type address;
+
+  /* Addend for relocation value.  */
+  bfd_vma addend;
+
+  /* Pointer to how to perform the required relocation.  */
+  reloc_howto_type *howto;  /* reloc_howto_type is declared outside this part of the header.  */
+
+}
+arelent;  /* Embedded in arelent_chain; passed to bfd_perform_relocation and bfd_install_relocation.  */
+
+
+enum complain_overflow
+{
+  /* Do not complain on overflow.  */
+  complain_overflow_dont,  /* First enumerator (value 0); the one NEWHOWTO and EMPTY_HOWTO pass.  */
+
+  /* Complain if the value overflows when considered as a signed
+     number one bit larger than the field, i.e. a bitfield of N bits
+     is allowed to represent -2**N to 2**N-1.  */
+  complain_overflow_bitfield,
+
+  /* Complain if the value overflows when considered as a signed
+     number.  */
+  complain_overflow_signed,
+
+  /* Complain if the value overflows when considered as an
+     unsigned number.  */
+  complain_overflow_unsigned
+};
+struct bfd_symbol;             /* Forward declaration.  */
+
+struct reloc_howto_struct
+{
+  /*  The type field has mainly a documentary use - the back end can
+      do what it wants with it, though normally the back end's
+      external idea of what a reloc number is stored
+      in this field.  For example, a PC relative word relocation
+      in a coff environment has the type 023 - because that's
+      what the outside world calls a R_PCRWORD reloc.  */
+  unsigned int type;  /* HOWTO argument C (cast to unsigned).  */
+
+  /*  The value the final relocation is shifted right by.  This drops
+      unwanted data from the relocation.  */
+  unsigned int rightshift;  /* HOWTO argument R.  */
+
+  /*  The size of the item to be relocated.  This is *not* a
+      power-of-two measure.  To get the number of bytes operated
+      on by a type of relocation, use bfd_get_reloc_size.  */
+  int size;  /* HOWTO argument S.  */
+
+  /*  The number of bits in the item to be relocated.  This is used
+      when doing overflow checking.  */
+  unsigned int bitsize;  /* HOWTO argument B.  */
+
+  /*  The relocation is relative to the field being relocated.  */
+  bfd_boolean pc_relative;  /* HOWTO argument P.  */
+
+  /*  The bit position of the reloc value in the destination.
+      The relocated value is left shifted by this amount.  */
+  unsigned int bitpos;  /* HOWTO argument BI.  */
+
+  /* What type of overflow error should be checked for when
+     relocating.  */
+  enum complain_overflow complain_on_overflow;  /* HOWTO argument O.  */
+
+  /* If this field is non null, then the supplied function is
+     called rather than the normal function.  This allows really
+     strange relocation methods to be accommodated (e.g., i960 callj
+     instructions).  */
+  bfd_reloc_status_type (*special_function)
+    (bfd *, arelent *, struct bfd_symbol *, void *, asection *,
+     bfd *, char **);  /* HOWTO argument SF.  */
+
+  /* The textual name of the relocation type.  */
+  char *name;  /* HOWTO argument NAME.  */
+
+  /* Some formats record a relocation addend in the section contents
+     rather than with the relocation.  For ELF formats this is the
+     distinction between USE_REL and USE_RELA (though the code checks
+     for USE_REL == 1/0).  The value of this field is TRUE if the
+     addend is recorded with the section contents; when performing a
+     partial link (ld -r) the section contents (the data) will be
+     modified.  The value of this field is FALSE if addends are
+     recorded with the relocation (in arelent.addend); when performing
+     a partial link the relocation will be modified.
+     All relocations for all ELF USE_RELA targets should set this field
+     to FALSE (values of TRUE should be looked on with suspicion).
+     However, the converse is not true: not all relocations of all ELF
+     USE_REL targets set this field to TRUE.  Why this is so is peculiar
+     to each particular target.  For relocs that aren't used in partial
+     links (e.g. GOT stuff) it doesn't matter what this is set to.  */
+  bfd_boolean partial_inplace;  /* HOWTO argument INPLACE.  */
+
+  /* src_mask selects the part of the instruction (or data) to be used
+     in the relocation sum.  If the target relocations don't have an
+     addend in the reloc, eg. ELF USE_REL, src_mask will normally equal
+     dst_mask to extract the addend from the section contents.  If
+     relocations do have an addend in the reloc, eg. ELF USE_RELA, this
+     field should be zero.  Non-zero values for ELF USE_RELA targets are
+     bogus as in those cases the value in the dst_mask part of the
+     section contents should be treated as garbage.  */
+  bfd_vma src_mask;  /* HOWTO argument MASKSRC.  */
+
+  /* dst_mask selects which parts of the instruction (or data) are
+     replaced with a relocated value.  */
+  bfd_vma dst_mask;  /* HOWTO argument MASKDST.  */
+
+  /* When some formats create PC relative instructions, they leave
+     the value of the pc of the place being relocated in the offset
+     slot of the instruction, so that a PC relative relocation can
+     be made just by adding in an ordinary offset (e.g., sun3 a.out).
+     Some formats leave the displacement part of an instruction
+     empty (e.g., m88k bcs); this flag signals the fact.  */
+  bfd_boolean pcrel_offset;  /* HOWTO argument PC.  */
+};
+
+#define HOWTO(C, R, S, B, P, BI, O, SF, NAME, INPLACE, MASKSRC, MASKDST, PC) \
+  { (unsigned) C, R, S, B, P, BI, O, SF, NAME, INPLACE, MASKSRC, MASKDST, PC }  /* Positional initializer in reloc_howto_struct field order.  */
+#define NEWHOWTO(FUNCTION, NAME, SIZE, REL, IN) \
+  HOWTO (0, 0, SIZE, 0, REL, 0, complain_overflow_dont, FUNCTION, \
+         NAME, FALSE, 0, 0, IN)  /* FUNCTION -> special_function, REL -> pc_relative, IN -> pcrel_offset; type, shifts and masks are 0.  */
+
+#define EMPTY_HOWTO(C) \
+  HOWTO ((C), 0, 0, 0, FALSE, 0, complain_overflow_dont, NULL, \
+         NULL, FALSE, 0, 0, FALSE)  /* Only the type field is set from C; every other field is 0, FALSE or NULL.  */
+
+#define HOWTO_PREPARE(relocation, symbol)               \
+  {                                                     \
+    if ((symbol) != NULL)                               \
+      {                                                 \
+        if (bfd_is_com_section ((symbol)->section))     \
+          {                                             \
+            (relocation) = 0;                           \
+          }                                             \
+        else                                            \
+          {                                             \
+            (relocation) = (symbol)->value;             \
+          }                                             \
+      }                                                 \
+  }  /* Sets RELOCATION (an lvalue) to 0 for a common-section SYMBOL, else to SYMBOL->value; untouched when SYMBOL is NULL.  Arguments are parenthesized so expression arguments parse correctly; SYMBOL is evaluated more than once.  */
+
+unsigned int bfd_get_reloc_size (reloc_howto_type *);  /* Number of bytes a howto operates on; see the size field of reloc_howto_struct.  */
+
+typedef struct relent_chain
+{
+  arelent relent;  /* Stored by value, not by pointer.  */
+  struct relent_chain *next;  /* Link to another relent_chain node.  */
+}
+arelent_chain;
+
+bfd_reloc_status_type bfd_check_overflow
+   (enum complain_overflow how,
+    unsigned int bitsize,  /* Same name and type as the reloc_howto_struct field.  */
+    unsigned int rightshift,  /* Same name and type as the reloc_howto_struct field.  */
+    unsigned int addrsize,
+    bfd_vma relocation);
+
+bfd_reloc_status_type bfd_perform_relocation
+   (bfd *abfd,
+    arelent *reloc_entry,
+    void *data,
+    asection *input_section,
+    bfd *output_bfd,
+    char **error_message);  /* Set when bfd_reloc_dangerous is returned (per the bfd_reloc_status comments).  */
+
+bfd_reloc_status_type bfd_install_relocation
+   (bfd *abfd,
+    arelent *reloc_entry,
+    void *data, bfd_vma data_start,  /* Unlike bfd_perform_relocation: takes DATA_START and no OUTPUT_BFD.  */
+    asection *input_section,
+    char **error_message);
+
+enum bfd_reloc_code_real {
+  _dummy_first_bfd_reloc_code_real,
+
+
+/* Basic absolute relocations of N bits.  */
+  BFD_RELOC_64,
+  BFD_RELOC_32,
+  BFD_RELOC_26,
+  BFD_RELOC_24,
+  BFD_RELOC_16,
+  BFD_RELOC_14,
+  BFD_RELOC_8,
+
+/* PC-relative relocations.  Sometimes these are relative to the address
+of the relocation itself; sometimes they are relative to the start of
+the section containing the relocation.  It depends on the specific target.
+
+The 24-bit relocation is used in some Intel 960 configurations.  */
+  BFD_RELOC_64_PCREL,
+  BFD_RELOC_32_PCREL,
+  BFD_RELOC_24_PCREL,
+  BFD_RELOC_16_PCREL,
+  BFD_RELOC_12_PCREL,
+  BFD_RELOC_8_PCREL,
+
+/* Section relative relocations.  Some targets need this for DWARF2.  */
+  BFD_RELOC_32_SECREL,
+
+/* For ELF.  */
+  BFD_RELOC_32_GOT_PCREL,
+  BFD_RELOC_16_GOT_PCREL,
+  BFD_RELOC_8_GOT_PCREL,
+  BFD_RELOC_32_GOTOFF,
+  BFD_RELOC_16_GOTOFF,
+  BFD_RELOC_LO16_GOTOFF,
+  BFD_RELOC_HI16_GOTOFF,
+  BFD_RELOC_HI16_S_GOTOFF,
+  BFD_RELOC_8_GOTOFF,
+  BFD_RELOC_64_PLT_PCREL,
+  BFD_RELOC_32_PLT_PCREL,
+  BFD_RELOC_24_PLT_PCREL,
+  BFD_RELOC_16_PLT_PCREL,
+  BFD_RELOC_8_PLT_PCREL,
+  BFD_RELOC_64_PLTOFF,
+  BFD_RELOC_32_PLTOFF,
+  BFD_RELOC_16_PLTOFF,
+  BFD_RELOC_LO16_PLTOFF,
+  BFD_RELOC_HI16_PLTOFF,
+  BFD_RELOC_HI16_S_PLTOFF,
+  BFD_RELOC_8_PLTOFF,
+
+/* Size relocations.  */
+  BFD_RELOC_SIZE32,
+  BFD_RELOC_SIZE64,
+
+/* Relocations used by 68K ELF.  */
+  BFD_RELOC_68K_GLOB_DAT,
+  BFD_RELOC_68K_JMP_SLOT,
+  BFD_RELOC_68K_RELATIVE,
+  BFD_RELOC_68K_TLS_GD32,
+  BFD_RELOC_68K_TLS_GD16,
+  BFD_RELOC_68K_TLS_GD8,
+  BFD_RELOC_68K_TLS_LDM32,
+  BFD_RELOC_68K_TLS_LDM16,
+  BFD_RELOC_68K_TLS_LDM8,
+  BFD_RELOC_68K_TLS_LDO32,
+  BFD_RELOC_68K_TLS_LDO16,
+  BFD_RELOC_68K_TLS_LDO8,
+  BFD_RELOC_68K_TLS_IE32,
+  BFD_RELOC_68K_TLS_IE16,
+  BFD_RELOC_68K_TLS_IE8,
+  BFD_RELOC_68K_TLS_LE32,
+  BFD_RELOC_68K_TLS_LE16,
+  BFD_RELOC_68K_TLS_LE8,
+
+/* Linkage-table relative.  */
+  BFD_RELOC_32_BASEREL,
+  BFD_RELOC_16_BASEREL,
+  BFD_RELOC_LO16_BASEREL,
+  BFD_RELOC_HI16_BASEREL,
+  BFD_RELOC_HI16_S_BASEREL,
+  BFD_RELOC_8_BASEREL,
+  BFD_RELOC_RVA,
+
+/* Absolute 8-bit relocation, but used to form an address like 0xFFnn.  */
+  BFD_RELOC_8_FFnn,
+
+/* These PC-relative relocations are stored as word displacements --
+i.e., byte displacements shifted right two bits.  The 30-bit word
+displacement (<<32_PCREL_S2>> -- 32 bits, shifted 2) is used on the
+SPARC.  (SPARC tools generally refer to this as <<WDISP30>>.)  The
+signed 16-bit displacement is used on the MIPS, and the 23-bit
+displacement is used on the Alpha.  */
+  BFD_RELOC_32_PCREL_S2,
+  BFD_RELOC_16_PCREL_S2,
+  BFD_RELOC_23_PCREL_S2,
+
+/* High 22 bits and low 10 bits of 32-bit value, placed into lower bits of
+the target word.  These are used on the SPARC.  */
+  BFD_RELOC_HI22,
+  BFD_RELOC_LO10,
+
+/* For systems that allocate a Global Pointer register, these are
+displacements off that register.  These relocation types are
+handled specially, because the value the register will have is
+decided relatively late.  */
+  BFD_RELOC_GPREL16,
+  BFD_RELOC_GPREL32,
+
+/* Reloc types used for i960/b.out.  */
+  BFD_RELOC_I960_CALLJ,
+
+/* SPARC ELF relocations.  There is probably some overlap with other
+relocation types already defined.  */
+  BFD_RELOC_NONE,
+  BFD_RELOC_SPARC_WDISP22,
+  BFD_RELOC_SPARC22,
+  BFD_RELOC_SPARC13,
+  BFD_RELOC_SPARC_GOT10,
+  BFD_RELOC_SPARC_GOT13,
+  BFD_RELOC_SPARC_GOT22,
+  BFD_RELOC_SPARC_PC10,
+  BFD_RELOC_SPARC_PC22,
+  BFD_RELOC_SPARC_WPLT30,
+  BFD_RELOC_SPARC_COPY,
+  BFD_RELOC_SPARC_GLOB_DAT,
+  BFD_RELOC_SPARC_JMP_SLOT,
+  BFD_RELOC_SPARC_RELATIVE,
+  BFD_RELOC_SPARC_UA16,
+  BFD_RELOC_SPARC_UA32,
+  BFD_RELOC_SPARC_UA64,
+  BFD_RELOC_SPARC_GOTDATA_HIX22,
+  BFD_RELOC_SPARC_GOTDATA_LOX10,
+  BFD_RELOC_SPARC_GOTDATA_OP_HIX22,
+  BFD_RELOC_SPARC_GOTDATA_OP_LOX10,
+  BFD_RELOC_SPARC_GOTDATA_OP,
+  BFD_RELOC_SPARC_JMP_IREL,
+  BFD_RELOC_SPARC_IRELATIVE,
+
+/* I think these are specific to SPARC a.out (e.g., Sun 4).  */
+  BFD_RELOC_SPARC_BASE13,
+  BFD_RELOC_SPARC_BASE22,
+
+/* SPARC64 relocations  */
+#define BFD_RELOC_SPARC_64 BFD_RELOC_64
+  BFD_RELOC_SPARC_10,
+  BFD_RELOC_SPARC_11,
+  BFD_RELOC_SPARC_OLO10,
+  BFD_RELOC_SPARC_HH22,
+  BFD_RELOC_SPARC_HM10,
+  BFD_RELOC_SPARC_LM22,
+  BFD_RELOC_SPARC_PC_HH22,
+  BFD_RELOC_SPARC_PC_HM10,
+  BFD_RELOC_SPARC_PC_LM22,
+  BFD_RELOC_SPARC_WDISP16,
+  BFD_RELOC_SPARC_WDISP19,
+  BFD_RELOC_SPARC_7,
+  BFD_RELOC_SPARC_6,
+  BFD_RELOC_SPARC_5,
+#define BFD_RELOC_SPARC_DISP64 BFD_RELOC_64_PCREL
+  BFD_RELOC_SPARC_PLT32,
+  BFD_RELOC_SPARC_PLT64,
+  BFD_RELOC_SPARC_HIX22,
+  BFD_RELOC_SPARC_LOX10,
+  BFD_RELOC_SPARC_H44,
+  BFD_RELOC_SPARC_M44,
+  BFD_RELOC_SPARC_L44,
+  BFD_RELOC_SPARC_REGISTER,
+  BFD_RELOC_SPARC_H34,
+  BFD_RELOC_SPARC_SIZE32,
+  BFD_RELOC_SPARC_SIZE64,
+  BFD_RELOC_SPARC_WDISP10,
+
+/* SPARC little endian relocation  */
+  BFD_RELOC_SPARC_REV32,
+
+/* SPARC TLS relocations  */
+  BFD_RELOC_SPARC_TLS_GD_HI22,
+  BFD_RELOC_SPARC_TLS_GD_LO10,
+  BFD_RELOC_SPARC_TLS_GD_ADD,
+  BFD_RELOC_SPARC_TLS_GD_CALL,
+  BFD_RELOC_SPARC_TLS_LDM_HI22,
+  BFD_RELOC_SPARC_TLS_LDM_LO10,
+  BFD_RELOC_SPARC_TLS_LDM_ADD,
+  BFD_RELOC_SPARC_TLS_LDM_CALL,
+  BFD_RELOC_SPARC_TLS_LDO_HIX22,
+  BFD_RELOC_SPARC_TLS_LDO_LOX10,
+  BFD_RELOC_SPARC_TLS_LDO_ADD,
+  BFD_RELOC_SPARC_TLS_IE_HI22,
+  BFD_RELOC_SPARC_TLS_IE_LO10,
+  BFD_RELOC_SPARC_TLS_IE_LD,
+  BFD_RELOC_SPARC_TLS_IE_LDX,
+  BFD_RELOC_SPARC_TLS_IE_ADD,
+  BFD_RELOC_SPARC_TLS_LE_HIX22,
+  BFD_RELOC_SPARC_TLS_LE_LOX10,
+  BFD_RELOC_SPARC_TLS_DTPMOD32,
+  BFD_RELOC_SPARC_TLS_DTPMOD64,
+  BFD_RELOC_SPARC_TLS_DTPOFF32,
+  BFD_RELOC_SPARC_TLS_DTPOFF64,
+  BFD_RELOC_SPARC_TLS_TPOFF32,
+  BFD_RELOC_SPARC_TLS_TPOFF64,
+
+/* SPU Relocations.  */
+  BFD_RELOC_SPU_IMM7,
+  BFD_RELOC_SPU_IMM8,
+  BFD_RELOC_SPU_IMM10,
+  BFD_RELOC_SPU_IMM10W,
+  BFD_RELOC_SPU_IMM16,
+  BFD_RELOC_SPU_IMM16W,
+  BFD_RELOC_SPU_IMM18,
+  BFD_RELOC_SPU_PCREL9a,
+  BFD_RELOC_SPU_PCREL9b,
+  BFD_RELOC_SPU_PCREL16,
+  BFD_RELOC_SPU_LO16,
+  BFD_RELOC_SPU_HI16,
+  BFD_RELOC_SPU_PPU32,
+  BFD_RELOC_SPU_PPU64,
+  BFD_RELOC_SPU_ADD_PIC,
+
+/* Alpha ECOFF and ELF relocations.  Some of these treat the symbol or
+"addend" in some special way.
+For GPDISP_HI16 ("gpdisp") relocations, the symbol is ignored when
+writing; when reading, it will be the absolute section symbol.  The
+addend is the displacement in bytes of the "lda" instruction from
+the "ldah" instruction (which is at the address of this reloc).  */
+  BFD_RELOC_ALPHA_GPDISP_HI16,
+
+/* For GPDISP_LO16 ("ignore") relocations, the symbol is handled as
+with GPDISP_HI16 relocs.  The addend is ignored when writing the
+relocations out, and is filled in with the file's GP value on
+reading, for convenience.  */
+  BFD_RELOC_ALPHA_GPDISP_LO16,
+
+/* The ELF GPDISP relocation is exactly the same as the GPDISP_HI16
+relocation except that there is no accompanying GPDISP_LO16
+relocation.  */
+  BFD_RELOC_ALPHA_GPDISP,
+
+/* The Alpha LITERAL/LITUSE relocs are produced by a symbol reference;
+the assembler turns it into a LDQ instruction to load the address of
+the symbol, and then fills in a register in the real instruction.
+
+The LITERAL reloc, at the LDQ instruction, refers to the .lita
+section symbol.  The addend is ignored when writing, but is filled
+in with the file's GP value on reading, for convenience, as with the
+GPDISP_LO16 reloc.
+
+The ELF_LITERAL reloc is somewhere between 16_GOTOFF and GPDISP_LO16.
+It should refer to the symbol to be referenced, as with 16_GOTOFF,
+but it generates output not based on the position within the .got
+section, but relative to the GP value chosen for the file during the
+final link stage.
+
+The LITUSE reloc, on the instruction using the loaded address, gives
+information to the linker that it might be able to use to optimize
+away some literal section references.  The symbol is ignored (read
+as the absolute section symbol), and the "addend" indicates the type
+of instruction using the register:
+1 - "memory" fmt insn
+2 - byte-manipulation (byte offset reg)
+3 - jsr (target of branch)  */
+  BFD_RELOC_ALPHA_LITERAL,
+  BFD_RELOC_ALPHA_ELF_LITERAL,
+  BFD_RELOC_ALPHA_LITUSE,
+
+/* The HINT relocation indicates a value that should be filled into the
+"hint" field of a jmp/jsr/ret instruction, for possible branch-
+prediction logic which may be provided on some processors.  */
+  BFD_RELOC_ALPHA_HINT,
+
+/* The LINKAGE relocation outputs a linkage pair in the object file,
+which is filled by the linker.  */
+  BFD_RELOC_ALPHA_LINKAGE,
+
+/* The CODEADDR relocation outputs a STO_CA in the object file,
+which is filled by the linker.  */
+  BFD_RELOC_ALPHA_CODEADDR,
+
+/* The GPREL_HI/LO relocations together form a 32-bit offset from the
+GP register.  */
+  BFD_RELOC_ALPHA_GPREL_HI16,
+  BFD_RELOC_ALPHA_GPREL_LO16,
+
+/* Like BFD_RELOC_23_PCREL_S2, except that the source and target must
+share a common GP, and the target address is adjusted for
+STO_ALPHA_STD_GPLOAD.  */
+  BFD_RELOC_ALPHA_BRSGP,
+
+/* The NOP relocation outputs a NOP if the longword displacement
+between two procedure entry points is < 2^21.  */
+  BFD_RELOC_ALPHA_NOP,
+
+/* The BSR relocation outputs a BSR if the longword displacement
+between two procedure entry points is < 2^21.  */
+  BFD_RELOC_ALPHA_BSR,
+
+/* The LDA relocation outputs a LDA if the longword displacement
+between two procedure entry points is < 2^16.  */
+  BFD_RELOC_ALPHA_LDA,
+
+/* The BOH relocation outputs a BSR if the longword displacement
+between two procedure entry points is < 2^21, or else a hint.  */
+  BFD_RELOC_ALPHA_BOH,
+
+/* Alpha thread-local storage relocations.  */
+  BFD_RELOC_ALPHA_TLSGD,
+  BFD_RELOC_ALPHA_TLSLDM,
+  BFD_RELOC_ALPHA_DTPMOD64,
+  BFD_RELOC_ALPHA_GOTDTPREL16,
+  BFD_RELOC_ALPHA_DTPREL64,
+  BFD_RELOC_ALPHA_DTPREL_HI16,
+  BFD_RELOC_ALPHA_DTPREL_LO16,
+  BFD_RELOC_ALPHA_DTPREL16,
+  BFD_RELOC_ALPHA_GOTTPREL16,
+  BFD_RELOC_ALPHA_TPREL64,
+  BFD_RELOC_ALPHA_TPREL_HI16,
+  BFD_RELOC_ALPHA_TPREL_LO16,
+  BFD_RELOC_ALPHA_TPREL16,
+
+/* The MIPS jump instruction.  */
+  BFD_RELOC_MIPS_JMP,
+  BFD_RELOC_MICROMIPS_JMP,
+
+/* The MIPS16 jump instruction.  */
+  BFD_RELOC_MIPS16_JMP,
+
+/* MIPS16 GP relative reloc.  */
+  BFD_RELOC_MIPS16_GPREL,
+
+/* High 16 bits of 32-bit value; simple reloc.  */
+  BFD_RELOC_HI16,
+
+/* High 16 bits of 32-bit value but the low 16 bits will be sign
+extended and added to form the final result.  If the low 16
+bits form a negative number, we need to add one to the high value
+to compensate for the borrow when the low bits are added.  */
+  BFD_RELOC_HI16_S,
+
+/* Low 16 bits.  */
+  BFD_RELOC_LO16,
+
+/* High 16 bits of 32-bit pc-relative value  */
+  BFD_RELOC_HI16_PCREL,
+
+/* High 16 bits of 32-bit pc-relative value, adjusted  */
+  BFD_RELOC_HI16_S_PCREL,
+
+/* Low 16 bits of pc-relative value  */
+  BFD_RELOC_LO16_PCREL,
+
+/* Equivalent of BFD_RELOC_MIPS_*, but with the MIPS16 layout of
+16-bit immediate fields  */
+  BFD_RELOC_MIPS16_GOT16,
+  BFD_RELOC_MIPS16_CALL16,
+
+/* MIPS16 high 16 bits of 32-bit value.  */
+  BFD_RELOC_MIPS16_HI16,
+
+/* MIPS16 high 16 bits of 32-bit value but the low 16 bits will be sign
+extended and added to form the final result.  If the low 16
+bits form a negative number, we need to add one to the high value
+to compensate for the borrow when the low bits are added.  */
+  BFD_RELOC_MIPS16_HI16_S,
+
+/* MIPS16 low 16 bits.  */
+  BFD_RELOC_MIPS16_LO16,
+
+/* MIPS16 TLS relocations  */
+  BFD_RELOC_MIPS16_TLS_GD,
+  BFD_RELOC_MIPS16_TLS_LDM,
+  BFD_RELOC_MIPS16_TLS_DTPREL_HI16,
+  BFD_RELOC_MIPS16_TLS_DTPREL_LO16,
+  BFD_RELOC_MIPS16_TLS_GOTTPREL,
+  BFD_RELOC_MIPS16_TLS_TPREL_HI16,
+  BFD_RELOC_MIPS16_TLS_TPREL_LO16,
+
+/* Relocation against a MIPS literal section.  */
+  BFD_RELOC_MIPS_LITERAL,
+  BFD_RELOC_MICROMIPS_LITERAL,
+
+/* microMIPS PC-relative relocations.  */
+  BFD_RELOC_MICROMIPS_7_PCREL_S1,
+  BFD_RELOC_MICROMIPS_10_PCREL_S1,
+  BFD_RELOC_MICROMIPS_16_PCREL_S1,
+
+/* MIPS16 PC-relative relocation.  */
+  BFD_RELOC_MIPS16_16_PCREL_S1,
+
+/* MIPS PC-relative relocations.  */
+  BFD_RELOC_MIPS_21_PCREL_S2,
+  BFD_RELOC_MIPS_26_PCREL_S2,
+  BFD_RELOC_MIPS_18_PCREL_S3,
+  BFD_RELOC_MIPS_19_PCREL_S2,
+
+/* microMIPS versions of generic BFD relocs.  */
+  BFD_RELOC_MICROMIPS_GPREL16,
+  BFD_RELOC_MICROMIPS_HI16,
+  BFD_RELOC_MICROMIPS_HI16_S,
+  BFD_RELOC_MICROMIPS_LO16,
+
+/* MIPS ELF relocations.  */
+  BFD_RELOC_MIPS_GOT16,
+  BFD_RELOC_MICROMIPS_GOT16,
+  BFD_RELOC_MIPS_CALL16,
+  BFD_RELOC_MICROMIPS_CALL16,
+  BFD_RELOC_MIPS_GOT_HI16,
+  BFD_RELOC_MICROMIPS_GOT_HI16,
+  BFD_RELOC_MIPS_GOT_LO16,
+  BFD_RELOC_MICROMIPS_GOT_LO16,
+  BFD_RELOC_MIPS_CALL_HI16,
+  BFD_RELOC_MICROMIPS_CALL_HI16,
+  BFD_RELOC_MIPS_CALL_LO16,
+  BFD_RELOC_MICROMIPS_CALL_LO16,
+  BFD_RELOC_MIPS_SUB,
+  BFD_RELOC_MICROMIPS_SUB,
+  BFD_RELOC_MIPS_GOT_PAGE,
+  BFD_RELOC_MICROMIPS_GOT_PAGE,
+  BFD_RELOC_MIPS_GOT_OFST,
+  BFD_RELOC_MICROMIPS_GOT_OFST,
+  BFD_RELOC_MIPS_GOT_DISP,
+  BFD_RELOC_MICROMIPS_GOT_DISP,
+  BFD_RELOC_MIPS_SHIFT5,
+  BFD_RELOC_MIPS_SHIFT6,
+  BFD_RELOC_MIPS_INSERT_A,
+  BFD_RELOC_MIPS_INSERT_B,
+  BFD_RELOC_MIPS_DELETE,
+  BFD_RELOC_MIPS_HIGHEST,
+  BFD_RELOC_MICROMIPS_HIGHEST,
+  BFD_RELOC_MIPS_HIGHER,
+  BFD_RELOC_MICROMIPS_HIGHER,
+  BFD_RELOC_MIPS_SCN_DISP,
+  BFD_RELOC_MICROMIPS_SCN_DISP,
+  BFD_RELOC_MIPS_REL16,
+  BFD_RELOC_MIPS_RELGOT,
+  BFD_RELOC_MIPS_JALR,
+  BFD_RELOC_MICROMIPS_JALR,
+  BFD_RELOC_MIPS_TLS_DTPMOD32,
+  BFD_RELOC_MIPS_TLS_DTPREL32,
+  BFD_RELOC_MIPS_TLS_DTPMOD64,
+  BFD_RELOC_MIPS_TLS_DTPREL64,
+  BFD_RELOC_MIPS_TLS_GD,
+  BFD_RELOC_MICROMIPS_TLS_GD,
+  BFD_RELOC_MIPS_TLS_LDM,
+  BFD_RELOC_MICROMIPS_TLS_LDM,
+  BFD_RELOC_MIPS_TLS_DTPREL_HI16,
+  BFD_RELOC_MICROMIPS_TLS_DTPREL_HI16,
+  BFD_RELOC_MIPS_TLS_DTPREL_LO16,
+  BFD_RELOC_MICROMIPS_TLS_DTPREL_LO16,
+  BFD_RELOC_MIPS_TLS_GOTTPREL,
+  BFD_RELOC_MICROMIPS_TLS_GOTTPREL,
+  BFD_RELOC_MIPS_TLS_TPREL32,
+  BFD_RELOC_MIPS_TLS_TPREL64,
+  BFD_RELOC_MIPS_TLS_TPREL_HI16,
+  BFD_RELOC_MICROMIPS_TLS_TPREL_HI16,
+  BFD_RELOC_MIPS_TLS_TPREL_LO16,
+  BFD_RELOC_MICROMIPS_TLS_TPREL_LO16,
+  BFD_RELOC_MIPS_EH,
+
+
+/* MIPS ELF relocations (VxWorks and PLT extensions).  */
+  BFD_RELOC_MIPS_COPY,
+  BFD_RELOC_MIPS_JUMP_SLOT,
+
+
+/* Moxie ELF relocations.  */
+  BFD_RELOC_MOXIE_10_PCREL,
+
+
+/* FT32 ELF relocations.  */
+  BFD_RELOC_FT32_10,
+  BFD_RELOC_FT32_20,
+  BFD_RELOC_FT32_17,
+  BFD_RELOC_FT32_18,
+
+
+/* Fujitsu Frv Relocations.  */
+  BFD_RELOC_FRV_LABEL16,
+  BFD_RELOC_FRV_LABEL24,
+  BFD_RELOC_FRV_LO16,
+  BFD_RELOC_FRV_HI16,
+  BFD_RELOC_FRV_GPREL12,
+  BFD_RELOC_FRV_GPRELU12,
+  BFD_RELOC_FRV_GPREL32,
+  BFD_RELOC_FRV_GPRELHI,
+  BFD_RELOC_FRV_GPRELLO,
+  BFD_RELOC_FRV_GOT12,
+  BFD_RELOC_FRV_GOTHI,
+  BFD_RELOC_FRV_GOTLO,
+  BFD_RELOC_FRV_FUNCDESC,
+  BFD_RELOC_FRV_FUNCDESC_GOT12,
+  BFD_RELOC_FRV_FUNCDESC_GOTHI,
+  BFD_RELOC_FRV_FUNCDESC_GOTLO,
+  BFD_RELOC_FRV_FUNCDESC_VALUE,
+  BFD_RELOC_FRV_FUNCDESC_GOTOFF12,
+  BFD_RELOC_FRV_FUNCDESC_GOTOFFHI,
+  BFD_RELOC_FRV_FUNCDESC_GOTOFFLO,
+  BFD_RELOC_FRV_GOTOFF12,
+  BFD_RELOC_FRV_GOTOFFHI,
+  BFD_RELOC_FRV_GOTOFFLO,
+  BFD_RELOC_FRV_GETTLSOFF,
+  BFD_RELOC_FRV_TLSDESC_VALUE,
+  BFD_RELOC_FRV_GOTTLSDESC12,
+  BFD_RELOC_FRV_GOTTLSDESCHI,
+  BFD_RELOC_FRV_GOTTLSDESCLO,
+  BFD_RELOC_FRV_TLSMOFF12,
+  BFD_RELOC_FRV_TLSMOFFHI,
+  BFD_RELOC_FRV_TLSMOFFLO,
+  BFD_RELOC_FRV_GOTTLSOFF12,
+  BFD_RELOC_FRV_GOTTLSOFFHI,
+  BFD_RELOC_FRV_GOTTLSOFFLO,
+  BFD_RELOC_FRV_TLSOFF,
+  BFD_RELOC_FRV_TLSDESC_RELAX,
+  BFD_RELOC_FRV_GETTLSOFF_RELAX,
+  BFD_RELOC_FRV_TLSOFF_RELAX,
+  BFD_RELOC_FRV_TLSMOFF,
+
+
+/* This is a 24bit GOT-relative reloc for the mn10300.  */
+  BFD_RELOC_MN10300_GOTOFF24,
+
+/* This is a 32bit GOT-relative reloc for the mn10300, offset by two bytes
+in the instruction.  */
+  BFD_RELOC_MN10300_GOT32,
+
+/* This is a 24bit GOT-relative reloc for the mn10300, offset by two bytes
+in the instruction.  */
+  BFD_RELOC_MN10300_GOT24,
+
+/* This is a 16bit GOT-relative reloc for the mn10300, offset by two bytes
+in the instruction.  */
+  BFD_RELOC_MN10300_GOT16,
+
+/* Copy symbol at runtime.  */
+  BFD_RELOC_MN10300_COPY,
+
+/* Create GOT entry.  */
+  BFD_RELOC_MN10300_GLOB_DAT,
+
+/* Create PLT entry.  */
+  BFD_RELOC_MN10300_JMP_SLOT,
+
+/* Adjust by program base.  */
+  BFD_RELOC_MN10300_RELATIVE,
+
+/* Together with another reloc targeted at the same location,
+allows for a value that is the difference of two symbols
+in the same section.  */
+  BFD_RELOC_MN10300_SYM_DIFF,
+
+/* The addend of this reloc is an alignment power that must
+be honoured at the offset's location, regardless of linker
+relaxation.  */
+  BFD_RELOC_MN10300_ALIGN,
+
+/* Various TLS-related relocations.  */
+  BFD_RELOC_MN10300_TLS_GD,
+  BFD_RELOC_MN10300_TLS_LD,
+  BFD_RELOC_MN10300_TLS_LDO,
+  BFD_RELOC_MN10300_TLS_GOTIE,
+  BFD_RELOC_MN10300_TLS_IE,
+  BFD_RELOC_MN10300_TLS_LE,
+  BFD_RELOC_MN10300_TLS_DTPMOD,
+  BFD_RELOC_MN10300_TLS_DTPOFF,
+  BFD_RELOC_MN10300_TLS_TPOFF,
+
+/* This is a 32bit pcrel reloc for the mn10300, offset by two bytes in the
+instruction.  */
+  BFD_RELOC_MN10300_32_PCREL,
+
+/* This is a 16bit pcrel reloc for the mn10300, offset by two bytes in the
+instruction.  */
+  BFD_RELOC_MN10300_16_PCREL,
+
+
+/* i386/elf relocations  */
+  BFD_RELOC_386_GOT32,
+  BFD_RELOC_386_PLT32,
+  BFD_RELOC_386_COPY,
+  BFD_RELOC_386_GLOB_DAT,
+  BFD_RELOC_386_JUMP_SLOT,
+  BFD_RELOC_386_RELATIVE,
+  BFD_RELOC_386_GOTOFF,
+  BFD_RELOC_386_GOTPC,
+  BFD_RELOC_386_TLS_TPOFF,
+  BFD_RELOC_386_TLS_IE,
+  BFD_RELOC_386_TLS_GOTIE,
+  BFD_RELOC_386_TLS_LE,
+  BFD_RELOC_386_TLS_GD,
+  BFD_RELOC_386_TLS_LDM,
+  BFD_RELOC_386_TLS_LDO_32,
+  BFD_RELOC_386_TLS_IE_32,
+  BFD_RELOC_386_TLS_LE_32,
+  BFD_RELOC_386_TLS_DTPMOD32,
+  BFD_RELOC_386_TLS_DTPOFF32,
+  BFD_RELOC_386_TLS_TPOFF32,
+  BFD_RELOC_386_TLS_GOTDESC,
+  BFD_RELOC_386_TLS_DESC_CALL,
+  BFD_RELOC_386_TLS_DESC,
+  BFD_RELOC_386_IRELATIVE,
+  BFD_RELOC_386_GOT32X,
+
+/* x86-64/elf relocations  */
+  BFD_RELOC_X86_64_GOT32,
+  BFD_RELOC_X86_64_PLT32,
+  BFD_RELOC_X86_64_COPY,
+  BFD_RELOC_X86_64_GLOB_DAT,
+  BFD_RELOC_X86_64_JUMP_SLOT,
+  BFD_RELOC_X86_64_RELATIVE,
+  BFD_RELOC_X86_64_GOTPCREL,
+  BFD_RELOC_X86_64_32S,
+  BFD_RELOC_X86_64_DTPMOD64,
+  BFD_RELOC_X86_64_DTPOFF64,
+  BFD_RELOC_X86_64_TPOFF64,
+  BFD_RELOC_X86_64_TLSGD,
+  BFD_RELOC_X86_64_TLSLD,
+  BFD_RELOC_X86_64_DTPOFF32,
+  BFD_RELOC_X86_64_GOTTPOFF,
+  BFD_RELOC_X86_64_TPOFF32,
+  BFD_RELOC_X86_64_GOTOFF64,
+  BFD_RELOC_X86_64_GOTPC32,
+  BFD_RELOC_X86_64_GOT64,
+  BFD_RELOC_X86_64_GOTPCREL64,
+  BFD_RELOC_X86_64_GOTPC64,
+  BFD_RELOC_X86_64_GOTPLT64,
+  BFD_RELOC_X86_64_PLTOFF64,
+  BFD_RELOC_X86_64_GOTPC32_TLSDESC,
+  BFD_RELOC_X86_64_TLSDESC_CALL,
+  BFD_RELOC_X86_64_TLSDESC,
+  BFD_RELOC_X86_64_IRELATIVE,
+  BFD_RELOC_X86_64_PC32_BND,
+  BFD_RELOC_X86_64_PLT32_BND,
+  BFD_RELOC_X86_64_GOTPCRELX,
+  BFD_RELOC_X86_64_REX_GOTPCRELX,
+
+/* ns32k relocations  */
+  BFD_RELOC_NS32K_IMM_8,
+  BFD_RELOC_NS32K_IMM_16,
+  BFD_RELOC_NS32K_IMM_32,
+  BFD_RELOC_NS32K_IMM_8_PCREL,
+  BFD_RELOC_NS32K_IMM_16_PCREL,
+  BFD_RELOC_NS32K_IMM_32_PCREL,
+  BFD_RELOC_NS32K_DISP_8,
+  BFD_RELOC_NS32K_DISP_16,
+  BFD_RELOC_NS32K_DISP_32,
+  BFD_RELOC_NS32K_DISP_8_PCREL,
+  BFD_RELOC_NS32K_DISP_16_PCREL,
+  BFD_RELOC_NS32K_DISP_32_PCREL,
+
+/* PDP11 relocations  */
+  BFD_RELOC_PDP11_DISP_8_PCREL,
+  BFD_RELOC_PDP11_DISP_6_PCREL,
+
+/* Picojava relocs.  Not all of these appear in object files.  */
+  BFD_RELOC_PJ_CODE_HI16,
+  BFD_RELOC_PJ_CODE_LO16,
+  BFD_RELOC_PJ_CODE_DIR16,
+  BFD_RELOC_PJ_CODE_DIR32,
+  BFD_RELOC_PJ_CODE_REL16,
+  BFD_RELOC_PJ_CODE_REL32,
+
+/* Power(rs6000) and PowerPC relocations.  */
+  BFD_RELOC_PPC_B26,
+  BFD_RELOC_PPC_BA26,
+  BFD_RELOC_PPC_TOC16,
+  BFD_RELOC_PPC_B16,
+  BFD_RELOC_PPC_B16_BRTAKEN,
+  BFD_RELOC_PPC_B16_BRNTAKEN,
+  BFD_RELOC_PPC_BA16,
+  BFD_RELOC_PPC_BA16_BRTAKEN,
+  BFD_RELOC_PPC_BA16_BRNTAKEN,
+  BFD_RELOC_PPC_COPY,
+  BFD_RELOC_PPC_GLOB_DAT,
+  BFD_RELOC_PPC_JMP_SLOT,
+  BFD_RELOC_PPC_RELATIVE,
+  BFD_RELOC_PPC_LOCAL24PC,
+  BFD_RELOC_PPC_EMB_NADDR32,
+  BFD_RELOC_PPC_EMB_NADDR16,
+  BFD_RELOC_PPC_EMB_NADDR16_LO,
+  BFD_RELOC_PPC_EMB_NADDR16_HI,
+  BFD_RELOC_PPC_EMB_NADDR16_HA,
+  BFD_RELOC_PPC_EMB_SDAI16,
+  BFD_RELOC_PPC_EMB_SDA2I16,
+  BFD_RELOC_PPC_EMB_SDA2REL,
+  BFD_RELOC_PPC_EMB_SDA21,
+  BFD_RELOC_PPC_EMB_MRKREF,
+  BFD_RELOC_PPC_EMB_RELSEC16,
+  BFD_RELOC_PPC_EMB_RELST_LO,
+  BFD_RELOC_PPC_EMB_RELST_HI,
+  BFD_RELOC_PPC_EMB_RELST_HA,
+  BFD_RELOC_PPC_EMB_BIT_FLD,
+  BFD_RELOC_PPC_EMB_RELSDA,
+  BFD_RELOC_PPC_VLE_REL8,
+  BFD_RELOC_PPC_VLE_REL15,
+  BFD_RELOC_PPC_VLE_REL24,
+  BFD_RELOC_PPC_VLE_LO16A,
+  BFD_RELOC_PPC_VLE_LO16D,
+  BFD_RELOC_PPC_VLE_HI16A,
+  BFD_RELOC_PPC_VLE_HI16D,
+  BFD_RELOC_PPC_VLE_HA16A,
+  BFD_RELOC_PPC_VLE_HA16D,
+  BFD_RELOC_PPC_VLE_SDA21,
+  BFD_RELOC_PPC_VLE_SDA21_LO,
+  BFD_RELOC_PPC_VLE_SDAREL_LO16A,
+  BFD_RELOC_PPC_VLE_SDAREL_LO16D,
+  BFD_RELOC_PPC_VLE_SDAREL_HI16A,
+  BFD_RELOC_PPC_VLE_SDAREL_HI16D,
+  BFD_RELOC_PPC_VLE_SDAREL_HA16A,
+  BFD_RELOC_PPC_VLE_SDAREL_HA16D,
+  BFD_RELOC_PPC_16DX_HA,
+  BFD_RELOC_PPC_REL16DX_HA,
+  BFD_RELOC_PPC64_HIGHER,
+  BFD_RELOC_PPC64_HIGHER_S,
+  BFD_RELOC_PPC64_HIGHEST,
+  BFD_RELOC_PPC64_HIGHEST_S,
+  BFD_RELOC_PPC64_TOC16_LO,
+  BFD_RELOC_PPC64_TOC16_HI,
+  BFD_RELOC_PPC64_TOC16_HA,
+  BFD_RELOC_PPC64_TOC,
+  BFD_RELOC_PPC64_PLTGOT16,
+  BFD_RELOC_PPC64_PLTGOT16_LO,
+  BFD_RELOC_PPC64_PLTGOT16_HI,
+  BFD_RELOC_PPC64_PLTGOT16_HA,
+  BFD_RELOC_PPC64_ADDR16_DS,
+  BFD_RELOC_PPC64_ADDR16_LO_DS,
+  BFD_RELOC_PPC64_GOT16_DS,
+  BFD_RELOC_PPC64_GOT16_LO_DS,
+  BFD_RELOC_PPC64_PLT16_LO_DS,
+  BFD_RELOC_PPC64_SECTOFF_DS,
+  BFD_RELOC_PPC64_SECTOFF_LO_DS,
+  BFD_RELOC_PPC64_TOC16_DS,
+  BFD_RELOC_PPC64_TOC16_LO_DS,
+  BFD_RELOC_PPC64_PLTGOT16_DS,
+  BFD_RELOC_PPC64_PLTGOT16_LO_DS,
+  BFD_RELOC_PPC64_ADDR16_HIGH,
+  BFD_RELOC_PPC64_ADDR16_HIGHA,
+  BFD_RELOC_PPC64_ADDR64_LOCAL,
+  BFD_RELOC_PPC64_ENTRY,
+
+/* PowerPC and PowerPC64 thread-local storage relocations.  */
+  BFD_RELOC_PPC_TLS,
+  BFD_RELOC_PPC_TLSGD,
+  BFD_RELOC_PPC_TLSLD,
+  BFD_RELOC_PPC_DTPMOD,
+  BFD_RELOC_PPC_TPREL16,
+  BFD_RELOC_PPC_TPREL16_LO,
+  BFD_RELOC_PPC_TPREL16_HI,
+  BFD_RELOC_PPC_TPREL16_HA,
+  BFD_RELOC_PPC_TPREL,
+  BFD_RELOC_PPC_DTPREL16,
+  BFD_RELOC_PPC_DTPREL16_LO,
+  BFD_RELOC_PPC_DTPREL16_HI,
+  BFD_RELOC_PPC_DTPREL16_HA,
+  BFD_RELOC_PPC_DTPREL,
+  BFD_RELOC_PPC_GOT_TLSGD16,
+  BFD_RELOC_PPC_GOT_TLSGD16_LO,
+  BFD_RELOC_PPC_GOT_TLSGD16_HI,
+  BFD_RELOC_PPC_GOT_TLSGD16_HA,
+  BFD_RELOC_PPC_GOT_TLSLD16,
+  BFD_RELOC_PPC_GOT_TLSLD16_LO,
+  BFD_RELOC_PPC_GOT_TLSLD16_HI,
+  BFD_RELOC_PPC_GOT_TLSLD16_HA,
+  BFD_RELOC_PPC_GOT_TPREL16,
+  BFD_RELOC_PPC_GOT_TPREL16_LO,
+  BFD_RELOC_PPC_GOT_TPREL16_HI,
+  BFD_RELOC_PPC_GOT_TPREL16_HA,
+  BFD_RELOC_PPC_GOT_DTPREL16,
+  BFD_RELOC_PPC_GOT_DTPREL16_LO,
+  BFD_RELOC_PPC_GOT_DTPREL16_HI,
+  BFD_RELOC_PPC_GOT_DTPREL16_HA,
+  BFD_RELOC_PPC64_TPREL16_DS,
+  BFD_RELOC_PPC64_TPREL16_LO_DS,
+  BFD_RELOC_PPC64_TPREL16_HIGHER,
+  BFD_RELOC_PPC64_TPREL16_HIGHERA,
+  BFD_RELOC_PPC64_TPREL16_HIGHEST,
+  BFD_RELOC_PPC64_TPREL16_HIGHESTA,
+  BFD_RELOC_PPC64_DTPREL16_DS,
+  BFD_RELOC_PPC64_DTPREL16_LO_DS,
+  BFD_RELOC_PPC64_DTPREL16_HIGHER,
+  BFD_RELOC_PPC64_DTPREL16_HIGHERA,
+  BFD_RELOC_PPC64_DTPREL16_HIGHEST,
+  BFD_RELOC_PPC64_DTPREL16_HIGHESTA,
+  BFD_RELOC_PPC64_TPREL16_HIGH,
+  BFD_RELOC_PPC64_TPREL16_HIGHA,
+  BFD_RELOC_PPC64_DTPREL16_HIGH,
+  BFD_RELOC_PPC64_DTPREL16_HIGHA,
+
+/* IBM 370/390 relocations  */
+  BFD_RELOC_I370_D12,
+
+/* The type of reloc used to build a constructor table - at the moment
+probably a 32 bit wide absolute relocation, but the target can choose.
+It generally does map to one of the other relocation types.  */
+  BFD_RELOC_CTOR,
+
+/* ARM 26 bit pc-relative branch.  The lowest two bits must be zero and are
+not stored in the instruction.  */
+  BFD_RELOC_ARM_PCREL_BRANCH,
+
+/* ARM 26 bit pc-relative branch.  The lowest bit must be zero and is
+not stored in the instruction.  The 2nd lowest bit comes from a 1 bit
+field in the instruction.  */
+  BFD_RELOC_ARM_PCREL_BLX,
+
+/* Thumb 22 bit pc-relative branch.  The lowest bit must be zero and is
+not stored in the instruction.  The 2nd lowest bit comes from a 1 bit
+field in the instruction.  */
+  BFD_RELOC_THUMB_PCREL_BLX,
+
+/* ARM 26-bit pc-relative branch for an unconditional BL or BLX instruction.  */
+  BFD_RELOC_ARM_PCREL_CALL,
+
+/* ARM 26-bit pc-relative branch for B or conditional BL instruction.  */
+  BFD_RELOC_ARM_PCREL_JUMP,
+
+/* Thumb 7-, 9-, 12-, 20-, 23-, and 25-bit pc-relative branches.
+The lowest bit must be zero and is not stored in the instruction.
+Note that the corresponding ELF R_ARM_THM_JUMPnn constant has an
+"nn" one smaller in all cases.  Note further that BRANCH23
+corresponds to R_ARM_THM_CALL.  */
+  BFD_RELOC_THUMB_PCREL_BRANCH7,
+  BFD_RELOC_THUMB_PCREL_BRANCH9,
+  BFD_RELOC_THUMB_PCREL_BRANCH12,
+  BFD_RELOC_THUMB_PCREL_BRANCH20,
+  BFD_RELOC_THUMB_PCREL_BRANCH23,
+  BFD_RELOC_THUMB_PCREL_BRANCH25,
+
+/* 12-bit immediate offset, used in ARM-format ldr and str instructions.  */
+  BFD_RELOC_ARM_OFFSET_IMM,
+
+/* 5-bit immediate offset, used in Thumb-format ldr and str instructions.  */
+  BFD_RELOC_ARM_THUMB_OFFSET,
+
+/* Pc-relative or absolute relocation depending on target.  Used for
+entries in .init_array sections.  */
+  BFD_RELOC_ARM_TARGET1,
+
+/* Read-only segment base relative address.  */
+  BFD_RELOC_ARM_ROSEGREL32,
+
+/* Data segment base relative address.  */
+  BFD_RELOC_ARM_SBREL32,
+
+/* This reloc is used for references to RTTI data from exception handling
+tables.  The actual definition depends on the target.  It may be a
+pc-relative or some form of GOT-indirect relocation.  */
+  BFD_RELOC_ARM_TARGET2,
+
+/* 31-bit PC relative address.  */
+  BFD_RELOC_ARM_PREL31,
+
+/* Low and High halfword relocations for MOVW and MOVT instructions.  */
+  BFD_RELOC_ARM_MOVW,
+  BFD_RELOC_ARM_MOVT,
+  BFD_RELOC_ARM_MOVW_PCREL,
+  BFD_RELOC_ARM_MOVT_PCREL,
+  BFD_RELOC_ARM_THUMB_MOVW,
+  BFD_RELOC_ARM_THUMB_MOVT,
+  BFD_RELOC_ARM_THUMB_MOVW_PCREL,
+  BFD_RELOC_ARM_THUMB_MOVT_PCREL,
+
+/* Relocations for setting up GOTs and PLTs for shared libraries.  */
+  BFD_RELOC_ARM_JUMP_SLOT,
+  BFD_RELOC_ARM_GLOB_DAT,
+  BFD_RELOC_ARM_GOT32,
+  BFD_RELOC_ARM_PLT32,
+  BFD_RELOC_ARM_RELATIVE,
+  BFD_RELOC_ARM_GOTOFF,
+  BFD_RELOC_ARM_GOTPC,
+  BFD_RELOC_ARM_GOT_PREL,
+
+/* ARM thread-local storage relocations.  */
+  BFD_RELOC_ARM_TLS_GD32,
+  BFD_RELOC_ARM_TLS_LDO32,
+  BFD_RELOC_ARM_TLS_LDM32,
+  BFD_RELOC_ARM_TLS_DTPOFF32,
+  BFD_RELOC_ARM_TLS_DTPMOD32,
+  BFD_RELOC_ARM_TLS_TPOFF32,
+  BFD_RELOC_ARM_TLS_IE32,
+  BFD_RELOC_ARM_TLS_LE32,
+  BFD_RELOC_ARM_TLS_GOTDESC,
+  BFD_RELOC_ARM_TLS_CALL,
+  BFD_RELOC_ARM_THM_TLS_CALL,
+  BFD_RELOC_ARM_TLS_DESCSEQ,
+  BFD_RELOC_ARM_THM_TLS_DESCSEQ,
+  BFD_RELOC_ARM_TLS_DESC,
+
+/* ARM group relocations.  */
+  BFD_RELOC_ARM_ALU_PC_G0_NC,
+  BFD_RELOC_ARM_ALU_PC_G0,
+  BFD_RELOC_ARM_ALU_PC_G1_NC,
+  BFD_RELOC_ARM_ALU_PC_G1,
+  BFD_RELOC_ARM_ALU_PC_G2,
+  BFD_RELOC_ARM_LDR_PC_G0,
+  BFD_RELOC_ARM_LDR_PC_G1,
+  BFD_RELOC_ARM_LDR_PC_G2,
+  BFD_RELOC_ARM_LDRS_PC_G0,
+  BFD_RELOC_ARM_LDRS_PC_G1,
+  BFD_RELOC_ARM_LDRS_PC_G2,
+  BFD_RELOC_ARM_LDC_PC_G0,
+  BFD_RELOC_ARM_LDC_PC_G1,
+  BFD_RELOC_ARM_LDC_PC_G2,
+  BFD_RELOC_ARM_ALU_SB_G0_NC,
+  BFD_RELOC_ARM_ALU_SB_G0,
+  BFD_RELOC_ARM_ALU_SB_G1_NC,
+  BFD_RELOC_ARM_ALU_SB_G1,
+  BFD_RELOC_ARM_ALU_SB_G2,
+  BFD_RELOC_ARM_LDR_SB_G0,
+  BFD_RELOC_ARM_LDR_SB_G1,
+  BFD_RELOC_ARM_LDR_SB_G2,
+  BFD_RELOC_ARM_LDRS_SB_G0,
+  BFD_RELOC_ARM_LDRS_SB_G1,
+  BFD_RELOC_ARM_LDRS_SB_G2,
+  BFD_RELOC_ARM_LDC_SB_G0,
+  BFD_RELOC_ARM_LDC_SB_G1,
+  BFD_RELOC_ARM_LDC_SB_G2,
+
+/* Annotation of BX instructions.  */
+  BFD_RELOC_ARM_V4BX,
+
+/* ARM support for STT_GNU_IFUNC.  */
+  BFD_RELOC_ARM_IRELATIVE,
+
+/* Thumb1 relocations to support execute-only code.  */
+  BFD_RELOC_ARM_THUMB_ALU_ABS_G0_NC,
+  BFD_RELOC_ARM_THUMB_ALU_ABS_G1_NC,
+  BFD_RELOC_ARM_THUMB_ALU_ABS_G2_NC,
+  BFD_RELOC_ARM_THUMB_ALU_ABS_G3_NC,
+
+/* These relocs are only used within the ARM assembler.  They are not
+(at present) written to any object files.  */
+  BFD_RELOC_ARM_IMMEDIATE,
+  BFD_RELOC_ARM_ADRL_IMMEDIATE,
+  BFD_RELOC_ARM_T32_IMMEDIATE,
+  BFD_RELOC_ARM_T32_ADD_IMM,
+  BFD_RELOC_ARM_T32_IMM12,
+  BFD_RELOC_ARM_T32_ADD_PC12,
+  BFD_RELOC_ARM_SHIFT_IMM,
+  BFD_RELOC_ARM_SMC,
+  BFD_RELOC_ARM_HVC,
+  BFD_RELOC_ARM_SWI,
+  BFD_RELOC_ARM_MULTI,
+  BFD_RELOC_ARM_CP_OFF_IMM,
+  BFD_RELOC_ARM_CP_OFF_IMM_S2,
+  BFD_RELOC_ARM_T32_CP_OFF_IMM,
+  BFD_RELOC_ARM_T32_CP_OFF_IMM_S2,
+  BFD_RELOC_ARM_ADR_IMM,
+  BFD_RELOC_ARM_LDR_IMM,
+  BFD_RELOC_ARM_LITERAL,
+  BFD_RELOC_ARM_IN_POOL,
+  BFD_RELOC_ARM_OFFSET_IMM8,
+  BFD_RELOC_ARM_T32_OFFSET_U8,
+  BFD_RELOC_ARM_T32_OFFSET_IMM,
+  BFD_RELOC_ARM_HWLITERAL,
+  BFD_RELOC_ARM_THUMB_ADD,
+  BFD_RELOC_ARM_THUMB_IMM,
+  BFD_RELOC_ARM_THUMB_SHIFT,
+
+/* Renesas / SuperH SH relocs.  Not all of these appear in object files.  */
+  BFD_RELOC_SH_PCDISP8BY2,
+  BFD_RELOC_SH_PCDISP12BY2,
+  BFD_RELOC_SH_IMM3,
+  BFD_RELOC_SH_IMM3U,
+  BFD_RELOC_SH_DISP12,
+  BFD_RELOC_SH_DISP12BY2,
+  BFD_RELOC_SH_DISP12BY4,
+  BFD_RELOC_SH_DISP12BY8,
+  BFD_RELOC_SH_DISP20,
+  BFD_RELOC_SH_DISP20BY8,
+  BFD_RELOC_SH_IMM4,
+  BFD_RELOC_SH_IMM4BY2,
+  BFD_RELOC_SH_IMM4BY4,
+  BFD_RELOC_SH_IMM8,
+  BFD_RELOC_SH_IMM8BY2,
+  BFD_RELOC_SH_IMM8BY4,
+  BFD_RELOC_SH_PCRELIMM8BY2,
+  BFD_RELOC_SH_PCRELIMM8BY4,
+  BFD_RELOC_SH_SWITCH16,
+  BFD_RELOC_SH_SWITCH32,
+  BFD_RELOC_SH_USES,
+  BFD_RELOC_SH_COUNT,
+  BFD_RELOC_SH_ALIGN,
+  BFD_RELOC_SH_CODE,
+  BFD_RELOC_SH_DATA,
+  BFD_RELOC_SH_LABEL,
+  BFD_RELOC_SH_LOOP_START,
+  BFD_RELOC_SH_LOOP_END,
+  BFD_RELOC_SH_COPY,
+  BFD_RELOC_SH_GLOB_DAT,
+  BFD_RELOC_SH_JMP_SLOT,
+  BFD_RELOC_SH_RELATIVE,
+  BFD_RELOC_SH_GOTPC,
+  BFD_RELOC_SH_GOT_LOW16,
+  BFD_RELOC_SH_GOT_MEDLOW16,
+  BFD_RELOC_SH_GOT_MEDHI16,
+  BFD_RELOC_SH_GOT_HI16,
+  BFD_RELOC_SH_GOTPLT_LOW16,
+  BFD_RELOC_SH_GOTPLT_MEDLOW16,
+  BFD_RELOC_SH_GOTPLT_MEDHI16,
+  BFD_RELOC_SH_GOTPLT_HI16,
+  BFD_RELOC_SH_PLT_LOW16,
+  BFD_RELOC_SH_PLT_MEDLOW16,
+  BFD_RELOC_SH_PLT_MEDHI16,
+  BFD_RELOC_SH_PLT_HI16,
+  BFD_RELOC_SH_GOTOFF_LOW16,
+  BFD_RELOC_SH_GOTOFF_MEDLOW16,
+  BFD_RELOC_SH_GOTOFF_MEDHI16,
+  BFD_RELOC_SH_GOTOFF_HI16,
+  BFD_RELOC_SH_GOTPC_LOW16,
+  BFD_RELOC_SH_GOTPC_MEDLOW16,
+  BFD_RELOC_SH_GOTPC_MEDHI16,
+  BFD_RELOC_SH_GOTPC_HI16,
+  BFD_RELOC_SH_COPY64,
+  BFD_RELOC_SH_GLOB_DAT64,
+  BFD_RELOC_SH_JMP_SLOT64,
+  BFD_RELOC_SH_RELATIVE64,
+  BFD_RELOC_SH_GOT10BY4,
+  BFD_RELOC_SH_GOT10BY8,
+  BFD_RELOC_SH_GOTPLT10BY4,
+  BFD_RELOC_SH_GOTPLT10BY8,
+  BFD_RELOC_SH_GOTPLT32,
+  BFD_RELOC_SH_SHMEDIA_CODE,
+  BFD_RELOC_SH_IMMU5,
+  BFD_RELOC_SH_IMMS6,
+  BFD_RELOC_SH_IMMS6BY32,
+  BFD_RELOC_SH_IMMU6,
+  BFD_RELOC_SH_IMMS10,
+  BFD_RELOC_SH_IMMS10BY2,
+  BFD_RELOC_SH_IMMS10BY4,
+  BFD_RELOC_SH_IMMS10BY8,
+  BFD_RELOC_SH_IMMS16,
+  BFD_RELOC_SH_IMMU16,
+  BFD_RELOC_SH_IMM_LOW16,
+  BFD_RELOC_SH_IMM_LOW16_PCREL,
+  BFD_RELOC_SH_IMM_MEDLOW16,
+  BFD_RELOC_SH_IMM_MEDLOW16_PCREL,
+  BFD_RELOC_SH_IMM_MEDHI16,
+  BFD_RELOC_SH_IMM_MEDHI16_PCREL,
+  BFD_RELOC_SH_IMM_HI16,
+  BFD_RELOC_SH_IMM_HI16_PCREL,
+  BFD_RELOC_SH_PT_16,
+  BFD_RELOC_SH_TLS_GD_32,
+  BFD_RELOC_SH_TLS_LD_32,
+  BFD_RELOC_SH_TLS_LDO_32,
+  BFD_RELOC_SH_TLS_IE_32,
+  BFD_RELOC_SH_TLS_LE_32,
+  BFD_RELOC_SH_TLS_DTPMOD32,
+  BFD_RELOC_SH_TLS_DTPOFF32,
+  BFD_RELOC_SH_TLS_TPOFF32,
+  BFD_RELOC_SH_GOT20,
+  BFD_RELOC_SH_GOTOFF20,
+  BFD_RELOC_SH_GOTFUNCDESC,
+  BFD_RELOC_SH_GOTFUNCDESC20,
+  BFD_RELOC_SH_GOTOFFFUNCDESC,
+  BFD_RELOC_SH_GOTOFFFUNCDESC20,
+  BFD_RELOC_SH_FUNCDESC,
+
+/* ARC relocs.  */
+  BFD_RELOC_ARC_NONE,
+  BFD_RELOC_ARC_8,
+  BFD_RELOC_ARC_16,
+  BFD_RELOC_ARC_24,
+  BFD_RELOC_ARC_32,
+  BFD_RELOC_ARC_N8,
+  BFD_RELOC_ARC_N16,
+  BFD_RELOC_ARC_N24,
+  BFD_RELOC_ARC_N32,
+  BFD_RELOC_ARC_SDA,
+  BFD_RELOC_ARC_SECTOFF,
+  BFD_RELOC_ARC_S21H_PCREL,
+  BFD_RELOC_ARC_S21W_PCREL,
+  BFD_RELOC_ARC_S25H_PCREL,
+  BFD_RELOC_ARC_S25W_PCREL,
+  BFD_RELOC_ARC_SDA32,
+  BFD_RELOC_ARC_SDA_LDST,
+  BFD_RELOC_ARC_SDA_LDST1,
+  BFD_RELOC_ARC_SDA_LDST2,
+  BFD_RELOC_ARC_SDA16_LD,
+  BFD_RELOC_ARC_SDA16_LD1,
+  BFD_RELOC_ARC_SDA16_LD2,
+  BFD_RELOC_ARC_S13_PCREL,
+  BFD_RELOC_ARC_W,
+  BFD_RELOC_ARC_32_ME,
+  BFD_RELOC_ARC_32_ME_S,
+  BFD_RELOC_ARC_N32_ME,
+  BFD_RELOC_ARC_SECTOFF_ME,
+  BFD_RELOC_ARC_SDA32_ME,
+  BFD_RELOC_ARC_W_ME,
+  BFD_RELOC_AC_SECTOFF_U8,
+  BFD_RELOC_AC_SECTOFF_U8_1,
+  BFD_RELOC_AC_SECTOFF_U8_2,
+  BFD_RELOC_AC_SECTOFF_S9,
+  BFD_RELOC_AC_SECTOFF_S9_1,
+  BFD_RELOC_AC_SECTOFF_S9_2,
+  BFD_RELOC_ARC_SECTOFF_ME_1,
+  BFD_RELOC_ARC_SECTOFF_ME_2,
+  BFD_RELOC_ARC_SECTOFF_1,
+  BFD_RELOC_ARC_SECTOFF_2,
+  BFD_RELOC_ARC_SDA_12,
+  BFD_RELOC_ARC_SDA16_ST2,
+  BFD_RELOC_ARC_32_PCREL,
+  BFD_RELOC_ARC_PC32,
+  BFD_RELOC_ARC_GOT32,
+  BFD_RELOC_ARC_GOTPC32,
+  BFD_RELOC_ARC_PLT32,
+  BFD_RELOC_ARC_COPY,
+  BFD_RELOC_ARC_GLOB_DAT,
+  BFD_RELOC_ARC_JMP_SLOT,
+  BFD_RELOC_ARC_RELATIVE,
+  BFD_RELOC_ARC_GOTOFF,
+  BFD_RELOC_ARC_GOTPC,
+  BFD_RELOC_ARC_S21W_PCREL_PLT,
+  BFD_RELOC_ARC_S25H_PCREL_PLT,
+  BFD_RELOC_ARC_TLS_DTPMOD,
+  BFD_RELOC_ARC_TLS_TPOFF,
+  BFD_RELOC_ARC_TLS_GD_GOT,
+  BFD_RELOC_ARC_TLS_GD_LD,
+  BFD_RELOC_ARC_TLS_GD_CALL,
+  BFD_RELOC_ARC_TLS_IE_GOT,
+  BFD_RELOC_ARC_TLS_DTPOFF,
+  BFD_RELOC_ARC_TLS_DTPOFF_S9,
+  BFD_RELOC_ARC_TLS_LE_S9,
+  BFD_RELOC_ARC_TLS_LE_32,
+  BFD_RELOC_ARC_S25W_PCREL_PLT,
+  BFD_RELOC_ARC_S21H_PCREL_PLT,
+  BFD_RELOC_ARC_NPS_CMEM16,
+
+/* ADI Blackfin 16 bit immediate absolute reloc.  */
+  BFD_RELOC_BFIN_16_IMM,
+
+/* ADI Blackfin 16 bit immediate absolute reloc higher 16 bits.  */
+  BFD_RELOC_BFIN_16_HIGH,
+
+/* ADI Blackfin 'a' part of LSETUP.  */
+  BFD_RELOC_BFIN_4_PCREL,
+
+/* ADI Blackfin.  */
+  BFD_RELOC_BFIN_5_PCREL,
+
+/* ADI Blackfin 16 bit immediate absolute reloc lower 16 bits.  */
+  BFD_RELOC_BFIN_16_LOW,
+
+/* ADI Blackfin.  */
+  BFD_RELOC_BFIN_10_PCREL,
+
+/* ADI Blackfin 'b' part of LSETUP.  */
+  BFD_RELOC_BFIN_11_PCREL,
+
+/* ADI Blackfin.  */
+  BFD_RELOC_BFIN_12_PCREL_JUMP,
+
+/* ADI Blackfin Short jump, pcrel.  */
+  BFD_RELOC_BFIN_12_PCREL_JUMP_S,
+
+/* ADI Blackfin Call.x not implemented.  */
+  BFD_RELOC_BFIN_24_PCREL_CALL_X,
+
+/* ADI Blackfin Long Jump pcrel.  */
+  BFD_RELOC_BFIN_24_PCREL_JUMP_L,
+
+/* ADI Blackfin FD-PIC relocations.  */
+  BFD_RELOC_BFIN_GOT17M4,
+  BFD_RELOC_BFIN_GOTHI,
+  BFD_RELOC_BFIN_GOTLO,
+  BFD_RELOC_BFIN_FUNCDESC,
+  BFD_RELOC_BFIN_FUNCDESC_GOT17M4,
+  BFD_RELOC_BFIN_FUNCDESC_GOTHI,
+  BFD_RELOC_BFIN_FUNCDESC_GOTLO,
+  BFD_RELOC_BFIN_FUNCDESC_VALUE,
+  BFD_RELOC_BFIN_FUNCDESC_GOTOFF17M4,
+  BFD_RELOC_BFIN_FUNCDESC_GOTOFFHI,
+  BFD_RELOC_BFIN_FUNCDESC_GOTOFFLO,
+  BFD_RELOC_BFIN_GOTOFF17M4,
+  BFD_RELOC_BFIN_GOTOFFHI,
+  BFD_RELOC_BFIN_GOTOFFLO,
+
+/* ADI Blackfin GOT relocation.  */
+  BFD_RELOC_BFIN_GOT,
+
+/* ADI Blackfin PLTPC relocation.  */
+  BFD_RELOC_BFIN_PLTPC,
+
+/* ADI Blackfin arithmetic relocation.  */
+  BFD_ARELOC_BFIN_PUSH,
+
+/* ADI Blackfin arithmetic relocation.  */
+  BFD_ARELOC_BFIN_CONST,
+
+/* ADI Blackfin arithmetic relocation.  */
+  BFD_ARELOC_BFIN_ADD,
+
+/* ADI Blackfin arithmetic relocation.  */
+  BFD_ARELOC_BFIN_SUB,
+
+/* ADI Blackfin arithmetic relocation.  */
+  BFD_ARELOC_BFIN_MULT,
+
+/* ADI Blackfin arithmetic relocation.  */
+  BFD_ARELOC_BFIN_DIV,
+
+/* ADI Blackfin arithmetic relocation.  */
+  BFD_ARELOC_BFIN_MOD,
+
+/* ADI Blackfin arithmetic relocation.  */
+  BFD_ARELOC_BFIN_LSHIFT,
+
+/* ADI Blackfin arithmetic relocation.  */
+  BFD_ARELOC_BFIN_RSHIFT,
+
+/* ADI Blackfin arithmetic relocation.  */
+  BFD_ARELOC_BFIN_AND,
+
+/* ADI Blackfin arithmetic relocation.  */
+  BFD_ARELOC_BFIN_OR,
+
+/* ADI Blackfin arithmetic relocation.  */
+  BFD_ARELOC_BFIN_XOR,
+
+/* ADI Blackfin arithmetic relocation.  */
+  BFD_ARELOC_BFIN_LAND,
+
+/* ADI Blackfin arithmetic relocation.  */
+  BFD_ARELOC_BFIN_LOR,
+
+/* ADI Blackfin arithmetic relocation.  */
+  BFD_ARELOC_BFIN_LEN,
+
+/* ADI Blackfin arithmetic relocation.  */
+  BFD_ARELOC_BFIN_NEG,
+
+/* ADI Blackfin arithmetic relocation.  */
+  BFD_ARELOC_BFIN_COMP,
+
+/* ADI Blackfin arithmetic relocation.  */
+  BFD_ARELOC_BFIN_PAGE,
+
+/* ADI Blackfin arithmetic relocation.  */
+  BFD_ARELOC_BFIN_HWPAGE,
+
+/* ADI Blackfin arithmetic relocation.  */
+  BFD_ARELOC_BFIN_ADDR,
+
+/* Mitsubishi D10V relocs.
+This is a 10-bit reloc with the right 2 bits
+assumed to be 0.  */
+  BFD_RELOC_D10V_10_PCREL_R,
+
+/* Mitsubishi D10V relocs.
+This is a 10-bit reloc with the right 2 bits
+assumed to be 0.  This is the same as the previous reloc
+except it is in the left container, i.e.,
+shifted left 15 bits.  */
+  BFD_RELOC_D10V_10_PCREL_L,
+
+/* This is an 18-bit reloc with the right 2 bits
+assumed to be 0.  */
+  BFD_RELOC_D10V_18,
+
+/* This is an 18-bit reloc with the right 2 bits
+assumed to be 0.  */
+  BFD_RELOC_D10V_18_PCREL,
+
+/* Mitsubishi D30V relocs.
+This is a 6-bit absolute reloc.  */
+  BFD_RELOC_D30V_6,
+
+/* This is a 6-bit pc-relative reloc with
+the right 3 bits assumed to be 0.  */
+  BFD_RELOC_D30V_9_PCREL,
+
+/* This is a 6-bit pc-relative reloc with
+the right 3 bits assumed to be 0. Same
+as the previous reloc but on the right side
+of the container.  */
+  BFD_RELOC_D30V_9_PCREL_R,
+
+/* This is a 12-bit absolute reloc with the
+right 3 bits assumed to be 0.  */
+  BFD_RELOC_D30V_15,
+
+/* This is a 12-bit pc-relative reloc with
+the right 3 bits assumed to be 0.  */
+  BFD_RELOC_D30V_15_PCREL,
+
+/* This is a 12-bit pc-relative reloc with
+the right 3 bits assumed to be 0. Same
+as the previous reloc but on the right side
+of the container.  */
+  BFD_RELOC_D30V_15_PCREL_R,
+
+/* This is an 18-bit absolute reloc with
+the right 3 bits assumed to be 0.  */
+  BFD_RELOC_D30V_21,
+
+/* This is an 18-bit pc-relative reloc with
+the right 3 bits assumed to be 0.  */
+  BFD_RELOC_D30V_21_PCREL,
+
+/* This is an 18-bit pc-relative reloc with
+the right 3 bits assumed to be 0. Same
+as the previous reloc but on the right side
+of the container.  */
+  BFD_RELOC_D30V_21_PCREL_R,
+
+/* This is a 32-bit absolute reloc.  */
+  BFD_RELOC_D30V_32,
+
+/* This is a 32-bit pc-relative reloc.  */
+  BFD_RELOC_D30V_32_PCREL,
+
+/* DLX relocs  */
+  BFD_RELOC_DLX_HI16_S,
+
+/* DLX relocs  */
+  BFD_RELOC_DLX_LO16,
+
+/* DLX relocs  */
+  BFD_RELOC_DLX_JMP26,
+
+/* Renesas M16C/M32C Relocations.  */
+  BFD_RELOC_M32C_HI8,
+  BFD_RELOC_M32C_RL_JUMP,
+  BFD_RELOC_M32C_RL_1ADDR,
+  BFD_RELOC_M32C_RL_2ADDR,
+
+/* Renesas M32R (formerly Mitsubishi M32R) relocs.
+This is a 24 bit absolute address.  */
+  BFD_RELOC_M32R_24,
+
+/* This is a 10-bit pc-relative reloc with the right 2 bits assumed to be 0.  */
+  BFD_RELOC_M32R_10_PCREL,
+
+/* This is an 18-bit reloc with the right 2 bits assumed to be 0.  */
+  BFD_RELOC_M32R_18_PCREL,
+
+/* This is a 26-bit reloc with the right 2 bits assumed to be 0.  */
+  BFD_RELOC_M32R_26_PCREL,
+
+/* This is a 16-bit reloc containing the high 16 bits of an address
+used when the lower 16 bits are treated as unsigned.  */
+  BFD_RELOC_M32R_HI16_ULO,
+
+/* This is a 16-bit reloc containing the high 16 bits of an address
+used when the lower 16 bits are treated as signed.  */
+  BFD_RELOC_M32R_HI16_SLO,
+
+/* This is a 16-bit reloc containing the lower 16 bits of an address.  */
+  BFD_RELOC_M32R_LO16,
+
+/* This is a 16-bit reloc containing the small data area offset for use in
+add3, load, and store instructions.  */
+  BFD_RELOC_M32R_SDA16,
+
+/* For PIC.  */
+  BFD_RELOC_M32R_GOT24,
+  BFD_RELOC_M32R_26_PLTREL,
+  BFD_RELOC_M32R_COPY,
+  BFD_RELOC_M32R_GLOB_DAT,
+  BFD_RELOC_M32R_JMP_SLOT,
+  BFD_RELOC_M32R_RELATIVE,
+  BFD_RELOC_M32R_GOTOFF,
+  BFD_RELOC_M32R_GOTOFF_HI_ULO,
+  BFD_RELOC_M32R_GOTOFF_HI_SLO,
+  BFD_RELOC_M32R_GOTOFF_LO,
+  BFD_RELOC_M32R_GOTPC24,
+  BFD_RELOC_M32R_GOT16_HI_ULO,
+  BFD_RELOC_M32R_GOT16_HI_SLO,
+  BFD_RELOC_M32R_GOT16_LO,
+  BFD_RELOC_M32R_GOTPC_HI_ULO,
+  BFD_RELOC_M32R_GOTPC_HI_SLO,
+  BFD_RELOC_M32R_GOTPC_LO,
+
+/* NDS32 relocs.
+This is a 20 bit absolute address.  */
+  BFD_RELOC_NDS32_20,
+
+/* This is a 9-bit pc-relative reloc with the right 1 bit assumed to be 0.  */
+  BFD_RELOC_NDS32_9_PCREL,
+
+/* This is a 9-bit pc-relative reloc with the right 1 bit assumed to be 0.  */
+  BFD_RELOC_NDS32_WORD_9_PCREL,
+
+/* This is a 15-bit reloc with the right 1 bit assumed to be 0.  */
+  BFD_RELOC_NDS32_15_PCREL,
+
+/* This is a 17-bit reloc with the right 1 bit assumed to be 0.  */
+  BFD_RELOC_NDS32_17_PCREL,
+
+/* This is a 25-bit reloc with the right 1 bit assumed to be 0.  */
+  BFD_RELOC_NDS32_25_PCREL,
+
+/* This is a 20-bit reloc containing the high 20 bits of an address
+used with the lower 12 bits  */
+  BFD_RELOC_NDS32_HI20,
+
+/* This is a 12-bit reloc containing the lower 12 bits of an address
+then shift right by 3. This is used with ldi,sdi...  */
+  BFD_RELOC_NDS32_LO12S3,
+
+/* This is a 12-bit reloc containing the lower 12 bits of an address
+then shift left by 2. This is used with lwi,swi...  */
+  BFD_RELOC_NDS32_LO12S2,
+
+/* This is a 12-bit reloc containing the lower 12 bits of an address
+then shift left by 1. This is used with lhi,shi...  */
+  BFD_RELOC_NDS32_LO12S1,
+
+/* This is a 12-bit reloc containing the lower 12 bits of an address
+then shift left by 0. This is used with lbi,sbi...  */
+  BFD_RELOC_NDS32_LO12S0,
+
+/* This is a 12-bit reloc containing the lower 12 bits of an address
+then shift left by 0. This is only used with branch relaxations  */
+  BFD_RELOC_NDS32_LO12S0_ORI,
+
+/* This is a 15-bit reloc containing the small data area 18-bit signed offset
+and shift left by 3 for use in ldi, sdi...  */
+  BFD_RELOC_NDS32_SDA15S3,
+
+/* This is a 15-bit reloc containing the small data area 17-bit signed offset
+and shift left by 2 for use in lwi, swi...  */
+  BFD_RELOC_NDS32_SDA15S2,
+
+/* This is a 15-bit reloc containing the small data area 16-bit signed offset
+and shift left by 1 for use in lhi, shi...  */
+  BFD_RELOC_NDS32_SDA15S1,
+
+/* This is a 15-bit reloc containing the small data area 15-bit signed offset
+and shift left by 0 for use in lbi, sbi...  */
+  BFD_RELOC_NDS32_SDA15S0,
+
+/* This is a 16-bit reloc containing the small data area 16-bit signed offset
+and shift left by 3  */
+  BFD_RELOC_NDS32_SDA16S3,
+
+/* This is a 17-bit reloc containing the small data area 17-bit signed offset
+and shift left by 2 for use in lwi.gp, swi.gp...  */
+  BFD_RELOC_NDS32_SDA17S2,
+
+/* This is an 18-bit reloc containing the small data area 18-bit signed offset
+and shift left by 1 for use in lhi.gp, shi.gp...  */
+  BFD_RELOC_NDS32_SDA18S1,
+
+/* This is a 19-bit reloc containing the small data area 19-bit signed offset
+and shift left by 0 for use in lbi.gp, sbi.gp...  */
+  BFD_RELOC_NDS32_SDA19S0,
+
+/* for PIC  */
+  BFD_RELOC_NDS32_GOT20,
+  BFD_RELOC_NDS32_9_PLTREL,
+  BFD_RELOC_NDS32_25_PLTREL,
+  BFD_RELOC_NDS32_COPY,
+  BFD_RELOC_NDS32_GLOB_DAT,
+  BFD_RELOC_NDS32_JMP_SLOT,
+  BFD_RELOC_NDS32_RELATIVE,
+  BFD_RELOC_NDS32_GOTOFF,
+  BFD_RELOC_NDS32_GOTOFF_HI20,
+  BFD_RELOC_NDS32_GOTOFF_LO12,
+  BFD_RELOC_NDS32_GOTPC20,
+  BFD_RELOC_NDS32_GOT_HI20,
+  BFD_RELOC_NDS32_GOT_LO12,
+  BFD_RELOC_NDS32_GOTPC_HI20,
+  BFD_RELOC_NDS32_GOTPC_LO12,
+
+/* for relax  */
+  BFD_RELOC_NDS32_INSN16,
+  BFD_RELOC_NDS32_LABEL,
+  BFD_RELOC_NDS32_LONGCALL1,
+  BFD_RELOC_NDS32_LONGCALL2,
+  BFD_RELOC_NDS32_LONGCALL3,
+  BFD_RELOC_NDS32_LONGJUMP1,
+  BFD_RELOC_NDS32_LONGJUMP2,
+  BFD_RELOC_NDS32_LONGJUMP3,
+  BFD_RELOC_NDS32_LOADSTORE,
+  BFD_RELOC_NDS32_9_FIXED,
+  BFD_RELOC_NDS32_15_FIXED,
+  BFD_RELOC_NDS32_17_FIXED,
+  BFD_RELOC_NDS32_25_FIXED,
+  BFD_RELOC_NDS32_LONGCALL4,
+  BFD_RELOC_NDS32_LONGCALL5,
+  BFD_RELOC_NDS32_LONGCALL6,
+  BFD_RELOC_NDS32_LONGJUMP4,
+  BFD_RELOC_NDS32_LONGJUMP5,
+  BFD_RELOC_NDS32_LONGJUMP6,
+  BFD_RELOC_NDS32_LONGJUMP7,
+
+/* for PIC  */
+  BFD_RELOC_NDS32_PLTREL_HI20,
+  BFD_RELOC_NDS32_PLTREL_LO12,
+  BFD_RELOC_NDS32_PLT_GOTREL_HI20,
+  BFD_RELOC_NDS32_PLT_GOTREL_LO12,
+
+/* for floating point  */
+  BFD_RELOC_NDS32_SDA12S2_DP,
+  BFD_RELOC_NDS32_SDA12S2_SP,
+  BFD_RELOC_NDS32_LO12S2_DP,
+  BFD_RELOC_NDS32_LO12S2_SP,
+
+/* for dwarf2 debug_line.  */
+  BFD_RELOC_NDS32_DWARF2_OP1,
+  BFD_RELOC_NDS32_DWARF2_OP2,
+  BFD_RELOC_NDS32_DWARF2_LEB,
+
+/* for eliminate 16-bit instructions  */
+  BFD_RELOC_NDS32_UPDATE_TA,
+
+/* for PIC object relaxation  */
+  BFD_RELOC_NDS32_PLT_GOTREL_LO20,
+  BFD_RELOC_NDS32_PLT_GOTREL_LO15,
+  BFD_RELOC_NDS32_PLT_GOTREL_LO19,
+  BFD_RELOC_NDS32_GOT_LO15,
+  BFD_RELOC_NDS32_GOT_LO19,
+  BFD_RELOC_NDS32_GOTOFF_LO15,
+  BFD_RELOC_NDS32_GOTOFF_LO19,
+  BFD_RELOC_NDS32_GOT15S2,
+  BFD_RELOC_NDS32_GOT17S2,
+
+/* NDS32 relocs.
+This is a 5 bit absolute address.  */
+  BFD_RELOC_NDS32_5,
+
+/* This is a 10-bit unsigned pc-relative reloc with the right 1 bit assumed to be 0.  */
+  BFD_RELOC_NDS32_10_UPCREL,
+
+/* If fp were omitted, fp can be used as another gp.  */
+  BFD_RELOC_NDS32_SDA_FP7U2_RELA,
+
+/* relaxation relative relocation types  */
+  BFD_RELOC_NDS32_RELAX_ENTRY,
+  BFD_RELOC_NDS32_GOT_SUFF,
+  BFD_RELOC_NDS32_GOTOFF_SUFF,
+  BFD_RELOC_NDS32_PLT_GOT_SUFF,
+  BFD_RELOC_NDS32_MULCALL_SUFF,
+  BFD_RELOC_NDS32_PTR,
+  BFD_RELOC_NDS32_PTR_COUNT,
+  BFD_RELOC_NDS32_PTR_RESOLVED,
+  BFD_RELOC_NDS32_PLTBLOCK,
+  BFD_RELOC_NDS32_RELAX_REGION_BEGIN,
+  BFD_RELOC_NDS32_RELAX_REGION_END,
+  BFD_RELOC_NDS32_MINUEND,
+  BFD_RELOC_NDS32_SUBTRAHEND,
+  BFD_RELOC_NDS32_DIFF8,
+  BFD_RELOC_NDS32_DIFF16,
+  BFD_RELOC_NDS32_DIFF32,
+  BFD_RELOC_NDS32_DIFF_ULEB128,
+  BFD_RELOC_NDS32_EMPTY,
+
+/* This is a 25 bit absolute address.  */
+  BFD_RELOC_NDS32_25_ABS,
+
+/* For ex9 and ifc using.  */
+  BFD_RELOC_NDS32_DATA,
+  BFD_RELOC_NDS32_TRAN,
+  BFD_RELOC_NDS32_17IFC_PCREL,
+  BFD_RELOC_NDS32_10IFCU_PCREL,
+
+/* For TLS.  */
+  BFD_RELOC_NDS32_TPOFF,
+  BFD_RELOC_NDS32_TLS_LE_HI20,
+  BFD_RELOC_NDS32_TLS_LE_LO12,
+  BFD_RELOC_NDS32_TLS_LE_ADD,
+  BFD_RELOC_NDS32_TLS_LE_LS,
+  BFD_RELOC_NDS32_GOTTPOFF,
+  BFD_RELOC_NDS32_TLS_IE_HI20,
+  BFD_RELOC_NDS32_TLS_IE_LO12S2,
+  BFD_RELOC_NDS32_TLS_TPOFF,
+  BFD_RELOC_NDS32_TLS_LE_20,
+  BFD_RELOC_NDS32_TLS_LE_15S0,
+  BFD_RELOC_NDS32_TLS_LE_15S1,
+  BFD_RELOC_NDS32_TLS_LE_15S2,
+
+/* This is a 9-bit reloc  */
+  BFD_RELOC_V850_9_PCREL,
+
+/* This is a 22-bit reloc  */
+  BFD_RELOC_V850_22_PCREL,
+
+/* This is a 16 bit offset from the short data area pointer.  */
+  BFD_RELOC_V850_SDA_16_16_OFFSET,
+
+/* This is a 16 bit offset (of which only 15 bits are used) from the
+short data area pointer.  */
+  BFD_RELOC_V850_SDA_15_16_OFFSET,
+
+/* This is a 16 bit offset from the zero data area pointer.  */
+  BFD_RELOC_V850_ZDA_16_16_OFFSET,
+
+/* This is a 16 bit offset (of which only 15 bits are used) from the
+zero data area pointer.  */
+  BFD_RELOC_V850_ZDA_15_16_OFFSET,
+
+/* This is an 8 bit offset (of which only 6 bits are used) from the
+tiny data area pointer.  */
+  BFD_RELOC_V850_TDA_6_8_OFFSET,
+
+/* This is an 8bit offset (of which only 7 bits are used) from the tiny
+data area pointer.  */
+  BFD_RELOC_V850_TDA_7_8_OFFSET,
+
+/* This is a 7 bit offset from the tiny data area pointer.  */
+  BFD_RELOC_V850_TDA_7_7_OFFSET,
+
+/* This is a 16 bit offset from the tiny data area pointer.  */
+  BFD_RELOC_V850_TDA_16_16_OFFSET,
+
+/* This is a 5 bit offset (of which only 4 bits are used) from the tiny
+data area pointer.  */
+  BFD_RELOC_V850_TDA_4_5_OFFSET,
+
+/* This is a 4 bit offset from the tiny data area pointer.  */
+  BFD_RELOC_V850_TDA_4_4_OFFSET,
+
+/* This is a 16 bit offset from the short data area pointer, with the
+bits placed non-contiguously in the instruction.  */
+  BFD_RELOC_V850_SDA_16_16_SPLIT_OFFSET,
+
+/* This is a 16 bit offset from the zero data area pointer, with the
+bits placed non-contiguously in the instruction.  */
+  BFD_RELOC_V850_ZDA_16_16_SPLIT_OFFSET,
+
+/* This is a 6 bit offset from the call table base pointer.  */
+  BFD_RELOC_V850_CALLT_6_7_OFFSET,
+
+/* This is a 16 bit offset from the call table base pointer.  */
+  BFD_RELOC_V850_CALLT_16_16_OFFSET,
+
+/* Used for relaxing indirect function calls.  */
+  BFD_RELOC_V850_LONGCALL,
+
+/* Used for relaxing indirect jumps.  */
+  BFD_RELOC_V850_LONGJUMP,
+
+/* Used to maintain alignment whilst relaxing.  */
+  BFD_RELOC_V850_ALIGN,
+
+/* This is a variation of BFD_RELOC_LO16 that can be used in v850e ld.bu
+instructions.  */
+  BFD_RELOC_V850_LO16_SPLIT_OFFSET,
+
+/* This is a 16-bit reloc.  */
+  BFD_RELOC_V850_16_PCREL,
+
+/* This is a 17-bit reloc.  */
+  BFD_RELOC_V850_17_PCREL,
+
+/* This is a 23-bit reloc.  */
+  BFD_RELOC_V850_23,
+
+/* This is a 32-bit reloc.  */
+  BFD_RELOC_V850_32_PCREL,
+
+/* This is a 32-bit reloc.  */
+  BFD_RELOC_V850_32_ABS,
+
+/* This is a 16-bit reloc.  */
+  BFD_RELOC_V850_16_SPLIT_OFFSET,
+
+/* This is a 16-bit reloc.  */
+  BFD_RELOC_V850_16_S1,
+
+/* Low 16 bits. 16 bit shifted by 1.  */
+  BFD_RELOC_V850_LO16_S1,
+
+/* This is a 16 bit offset from the call table base pointer.  */
+  BFD_RELOC_V850_CALLT_15_16_OFFSET,
+
+/* DSO relocations.  */
+  BFD_RELOC_V850_32_GOTPCREL,
+
+/* DSO relocations.  */
+  BFD_RELOC_V850_16_GOT,
+
+/* DSO relocations.  */
+  BFD_RELOC_V850_32_GOT,
+
+/* DSO relocations.  */
+  BFD_RELOC_V850_22_PLT_PCREL,
+
+/* DSO relocations.  */
+  BFD_RELOC_V850_32_PLT_PCREL,
+
+/* DSO relocations.  */
+  BFD_RELOC_V850_COPY,
+
+/* DSO relocations.  */
+  BFD_RELOC_V850_GLOB_DAT,
+
+/* DSO relocations.  */
+  BFD_RELOC_V850_JMP_SLOT,
+
+/* DSO relocations.  */
+  BFD_RELOC_V850_RELATIVE,
+
+/* DSO relocations.  */
+  BFD_RELOC_V850_16_GOTOFF,
+
+/* DSO relocations.  */
+  BFD_RELOC_V850_32_GOTOFF,
+
+/* start code.  */
+  BFD_RELOC_V850_CODE,
+
+/* start data in text.  */
+  BFD_RELOC_V850_DATA,
+
+/* This is an 8bit DP reloc for the tms320c30, where the most
+significant 8 bits of a 24 bit word are placed into the least
+significant 8 bits of the opcode.  */
+  BFD_RELOC_TIC30_LDP,
+
+/* This is a 7bit reloc for the tms320c54x, where the least
+significant 7 bits of a 16 bit word are placed into the least
+significant 7 bits of the opcode.  */
+  BFD_RELOC_TIC54X_PARTLS7,
+
+/* This is a 9bit DP reloc for the tms320c54x, where the most
+significant 9 bits of a 16 bit word are placed into the least
+significant 9 bits of the opcode.  */
+  BFD_RELOC_TIC54X_PARTMS9,
+
+/* This is an extended address 23-bit reloc for the tms320c54x.  */
+  BFD_RELOC_TIC54X_23,
+
+/* This is a 16-bit reloc for the tms320c54x, where the least
+significant 16 bits of a 23-bit extended address are placed into
+the opcode.  */
+  BFD_RELOC_TIC54X_16_OF_23,
+
+/* This is a reloc for the tms320c54x, where the most
+significant 7 bits of a 23-bit extended address are placed into
+the opcode.  */
+  BFD_RELOC_TIC54X_MS7_OF_23,
+
+/* TMS320C6000 relocations.  */
+  BFD_RELOC_C6000_PCR_S21,
+  BFD_RELOC_C6000_PCR_S12,
+  BFD_RELOC_C6000_PCR_S10,
+  BFD_RELOC_C6000_PCR_S7,
+  BFD_RELOC_C6000_ABS_S16,
+  BFD_RELOC_C6000_ABS_L16,
+  BFD_RELOC_C6000_ABS_H16,
+  BFD_RELOC_C6000_SBR_U15_B,
+  BFD_RELOC_C6000_SBR_U15_H,
+  BFD_RELOC_C6000_SBR_U15_W,
+  BFD_RELOC_C6000_SBR_S16,
+  BFD_RELOC_C6000_SBR_L16_B,
+  BFD_RELOC_C6000_SBR_L16_H,
+  BFD_RELOC_C6000_SBR_L16_W,
+  BFD_RELOC_C6000_SBR_H16_B,
+  BFD_RELOC_C6000_SBR_H16_H,
+  BFD_RELOC_C6000_SBR_H16_W,
+  BFD_RELOC_C6000_SBR_GOT_U15_W,
+  BFD_RELOC_C6000_SBR_GOT_L16_W,
+  BFD_RELOC_C6000_SBR_GOT_H16_W,
+  BFD_RELOC_C6000_DSBT_INDEX,
+  BFD_RELOC_C6000_PREL31,
+  BFD_RELOC_C6000_COPY,
+  BFD_RELOC_C6000_JUMP_SLOT,
+  BFD_RELOC_C6000_EHTYPE,
+  BFD_RELOC_C6000_PCR_H16,
+  BFD_RELOC_C6000_PCR_L16,
+  BFD_RELOC_C6000_ALIGN,
+  BFD_RELOC_C6000_FPHEAD,
+  BFD_RELOC_C6000_NOCMP,
+
+/* This is a 48 bit reloc for the FR30 that stores 32 bits.  */
+  BFD_RELOC_FR30_48,
+
+/* This is a 32 bit reloc for the FR30 that stores 20 bits split up into
+two sections.  */
+  BFD_RELOC_FR30_20,
+
+/* This is a 16 bit reloc for the FR30 that stores a 6 bit word offset in
+4 bits.  */
+  BFD_RELOC_FR30_6_IN_4,
+
+/* This is a 16 bit reloc for the FR30 that stores an 8 bit byte offset
+into 8 bits.  */
+  BFD_RELOC_FR30_8_IN_8,
+
+/* This is a 16 bit reloc for the FR30 that stores a 9 bit short offset
+into 8 bits.  */
+  BFD_RELOC_FR30_9_IN_8,
+
+/* This is a 16 bit reloc for the FR30 that stores a 10 bit word offset
+into 8 bits.  */
+  BFD_RELOC_FR30_10_IN_8,
+
+/* This is a 16 bit reloc for the FR30 that stores a 9 bit pc relative
+short offset into 8 bits.  */
+  BFD_RELOC_FR30_9_PCREL,
+
+/* This is a 16 bit reloc for the FR30 that stores a 12 bit pc relative
+short offset into 11 bits.  */
+  BFD_RELOC_FR30_12_PCREL,
+
+/* Motorola Mcore relocations.  */
+  BFD_RELOC_MCORE_PCREL_IMM8BY4,
+  BFD_RELOC_MCORE_PCREL_IMM11BY2,
+  BFD_RELOC_MCORE_PCREL_IMM4BY2,
+  BFD_RELOC_MCORE_PCREL_32,
+  BFD_RELOC_MCORE_PCREL_JSR_IMM11BY2,
+  BFD_RELOC_MCORE_RVA,
+
+/* Toshiba Media Processor Relocations.  */
+  BFD_RELOC_MEP_8,
+  BFD_RELOC_MEP_16,
+  BFD_RELOC_MEP_32,
+  BFD_RELOC_MEP_PCREL8A2,
+  BFD_RELOC_MEP_PCREL12A2,
+  BFD_RELOC_MEP_PCREL17A2,
+  BFD_RELOC_MEP_PCREL24A2,
+  BFD_RELOC_MEP_PCABS24A2,
+  BFD_RELOC_MEP_LOW16,
+  BFD_RELOC_MEP_HI16U,
+  BFD_RELOC_MEP_HI16S,
+  BFD_RELOC_MEP_GPREL,
+  BFD_RELOC_MEP_TPREL,
+  BFD_RELOC_MEP_TPREL7,
+  BFD_RELOC_MEP_TPREL7A2,
+  BFD_RELOC_MEP_TPREL7A4,
+  BFD_RELOC_MEP_UIMM24,
+  BFD_RELOC_MEP_ADDR24A4,
+  BFD_RELOC_MEP_GNU_VTINHERIT,
+  BFD_RELOC_MEP_GNU_VTENTRY,
+
+
+/* Imagination Technologies Meta relocations.  */
+  BFD_RELOC_METAG_HIADDR16,
+  BFD_RELOC_METAG_LOADDR16,
+  BFD_RELOC_METAG_RELBRANCH,
+  BFD_RELOC_METAG_GETSETOFF,
+  BFD_RELOC_METAG_HIOG,
+  BFD_RELOC_METAG_LOOG,
+  BFD_RELOC_METAG_REL8,
+  BFD_RELOC_METAG_REL16,
+  BFD_RELOC_METAG_HI16_GOTOFF,
+  BFD_RELOC_METAG_LO16_GOTOFF,
+  BFD_RELOC_METAG_GETSET_GOTOFF,
+  BFD_RELOC_METAG_GETSET_GOT,
+  BFD_RELOC_METAG_HI16_GOTPC,
+  BFD_RELOC_METAG_LO16_GOTPC,
+  BFD_RELOC_METAG_HI16_PLT,
+  BFD_RELOC_METAG_LO16_PLT,
+  BFD_RELOC_METAG_RELBRANCH_PLT,
+  BFD_RELOC_METAG_GOTOFF,
+  BFD_RELOC_METAG_PLT,
+  BFD_RELOC_METAG_COPY,
+  BFD_RELOC_METAG_JMP_SLOT,
+  BFD_RELOC_METAG_RELATIVE,
+  BFD_RELOC_METAG_GLOB_DAT,
+  BFD_RELOC_METAG_TLS_GD,
+  BFD_RELOC_METAG_TLS_LDM,
+  BFD_RELOC_METAG_TLS_LDO_HI16,
+  BFD_RELOC_METAG_TLS_LDO_LO16,
+  BFD_RELOC_METAG_TLS_LDO,
+  BFD_RELOC_METAG_TLS_IE,
+  BFD_RELOC_METAG_TLS_IENONPIC,
+  BFD_RELOC_METAG_TLS_IENONPIC_HI16,
+  BFD_RELOC_METAG_TLS_IENONPIC_LO16,
+  BFD_RELOC_METAG_TLS_TPOFF,
+  BFD_RELOC_METAG_TLS_DTPMOD,
+  BFD_RELOC_METAG_TLS_DTPOFF,
+  BFD_RELOC_METAG_TLS_LE,
+  BFD_RELOC_METAG_TLS_LE_HI16,
+  BFD_RELOC_METAG_TLS_LE_LO16,
+
+/* These are relocations for the GETA instruction.  */
+  BFD_RELOC_MMIX_GETA,
+  BFD_RELOC_MMIX_GETA_1,
+  BFD_RELOC_MMIX_GETA_2,
+  BFD_RELOC_MMIX_GETA_3,
+
+/* These are relocations for a conditional branch instruction.  */
+  BFD_RELOC_MMIX_CBRANCH,
+  BFD_RELOC_MMIX_CBRANCH_J,
+  BFD_RELOC_MMIX_CBRANCH_1,
+  BFD_RELOC_MMIX_CBRANCH_2,
+  BFD_RELOC_MMIX_CBRANCH_3,
+
+/* These are relocations for the PUSHJ instruction.  */
+  BFD_RELOC_MMIX_PUSHJ,
+  BFD_RELOC_MMIX_PUSHJ_1,
+  BFD_RELOC_MMIX_PUSHJ_2,
+  BFD_RELOC_MMIX_PUSHJ_3,
+  BFD_RELOC_MMIX_PUSHJ_STUBBABLE,
+
+/* These are relocations for the JMP instruction.  */
+  BFD_RELOC_MMIX_JMP,
+  BFD_RELOC_MMIX_JMP_1,
+  BFD_RELOC_MMIX_JMP_2,
+  BFD_RELOC_MMIX_JMP_3,
+
+/* This is a relocation for a relative address as in a GETA instruction or
+a branch.  */
+  BFD_RELOC_MMIX_ADDR19,
+
+/* This is a relocation for a relative address as in a JMP instruction.  */
+  BFD_RELOC_MMIX_ADDR27,
+
+/* This is a relocation for an instruction field that may be a general
+register or a value 0..255.  */
+  BFD_RELOC_MMIX_REG_OR_BYTE,
+
+/* This is a relocation for an instruction field that may be a general
+register.  */
+  BFD_RELOC_MMIX_REG,
+
+/* This is a relocation for two instruction fields holding a register and
+an offset, the equivalent of the relocation.  */
+  BFD_RELOC_MMIX_BASE_PLUS_OFFSET,
+
+/* This relocation is an assertion that the expression is not allocated as
+a global register.  It does not modify contents.  */
+  BFD_RELOC_MMIX_LOCAL,
+
+/* This is a 16 bit reloc for the AVR that stores 8 bit pc relative
+short offset into 7 bits.  */
+  BFD_RELOC_AVR_7_PCREL,
+
+/* This is a 16 bit reloc for the AVR that stores 13 bit pc relative
+short offset into 12 bits.  */
+  BFD_RELOC_AVR_13_PCREL,
+
+/* This is a 16 bit reloc for the AVR that stores 17 bit value (usually
+program memory address) into 16 bits.  */
+  BFD_RELOC_AVR_16_PM,
+
+/* This is a 16 bit reloc for the AVR that stores 8 bit value (usually
+data memory address) into 8 bit immediate value of LDI insn.  */
+  BFD_RELOC_AVR_LO8_LDI,
+
+/* This is a 16 bit reloc for the AVR that stores 8 bit value (high 8 bit
+of data memory address) into 8 bit immediate value of LDI insn.  */
+  BFD_RELOC_AVR_HI8_LDI,
+
+/* This is a 16 bit reloc for the AVR that stores 8 bit value (most high 8 bit
+of program memory address) into 8 bit immediate value of LDI insn.  */
+  BFD_RELOC_AVR_HH8_LDI,
+
+/* This is a 16 bit reloc for the AVR that stores 8 bit value (most high 8 bit
+of 32 bit value) into 8 bit immediate value of LDI insn.  */
+  BFD_RELOC_AVR_MS8_LDI,
+
+/* This is a 16 bit reloc for the AVR that stores negated 8 bit value
+(usually data memory address) into 8 bit immediate value of SUBI insn.  */
+  BFD_RELOC_AVR_LO8_LDI_NEG,
+
+/* This is a 16 bit reloc for the AVR that stores negated 8 bit value
+(high 8 bit of data memory address) into 8 bit immediate value of
+SUBI insn.  */
+  BFD_RELOC_AVR_HI8_LDI_NEG,
+
+/* This is a 16 bit reloc for the AVR that stores negated 8 bit value
+(most high 8 bit of program memory address) into 8 bit immediate value
+of LDI or SUBI insn.  */
+  BFD_RELOC_AVR_HH8_LDI_NEG,
+
+/* This is a 16 bit reloc for the AVR that stores negated 8 bit value (msb
+of 32 bit value) into 8 bit immediate value of LDI insn.  */
+  BFD_RELOC_AVR_MS8_LDI_NEG,
+
+/* This is a 16 bit reloc for the AVR that stores 8 bit value (usually
+command address) into 8 bit immediate value of LDI insn.  */
+  BFD_RELOC_AVR_LO8_LDI_PM,
+
+/* This is a 16 bit reloc for the AVR that stores 8 bit value
+(command address) into 8 bit immediate value of LDI insn. If the address
+is beyond the 128k boundary, the linker inserts a jump stub for this reloc
+in the lower 128k.  */
+  BFD_RELOC_AVR_LO8_LDI_GS,
+
+/* This is a 16 bit reloc for the AVR that stores 8 bit value (high 8 bit
+of command address) into 8 bit immediate value of LDI insn.  */
+  BFD_RELOC_AVR_HI8_LDI_PM,
+
+/* This is a 16 bit reloc for the AVR that stores 8 bit value (high 8 bit
+of command address) into 8 bit immediate value of LDI insn.  If the address
+is beyond the 128k boundary, the linker inserts a jump stub for this reloc
+below 128k.  */
+  BFD_RELOC_AVR_HI8_LDI_GS,
+
+/* This is a 16 bit reloc for the AVR that stores 8 bit value (most high 8 bit
+of command address) into 8 bit immediate value of LDI insn.  */
+  BFD_RELOC_AVR_HH8_LDI_PM,
+
+/* This is a 16 bit reloc for the AVR that stores negated 8 bit value
+(usually command address) into 8 bit immediate value of SUBI insn.  */
+  BFD_RELOC_AVR_LO8_LDI_PM_NEG,
+
+/* This is a 16 bit reloc for the AVR that stores negated 8 bit value
+(high 8 bit of 16 bit command address) into 8 bit immediate value
+of SUBI insn.  */
+  BFD_RELOC_AVR_HI8_LDI_PM_NEG,
+
+/* This is a 16 bit reloc for the AVR that stores negated 8 bit value
+(high 6 bit of 22 bit command address) into 8 bit immediate
+value of SUBI insn.  */
+  BFD_RELOC_AVR_HH8_LDI_PM_NEG,
+
+/* This is a 32 bit reloc for the AVR that stores 23 bit value
+into 22 bits.  */
+  BFD_RELOC_AVR_CALL,
+
+/* This is a 16 bit reloc for the AVR that stores all needed bits
+for absolute addressing with ldi with overflow check to linktime  */
+  BFD_RELOC_AVR_LDI,
+
+/* This is a 6 bit reloc for the AVR that stores offset for ldd/std
+instructions  */
+  BFD_RELOC_AVR_6,
+
+/* This is a 6 bit reloc for the AVR that stores offset for adiw/sbiw
+instructions  */
+  BFD_RELOC_AVR_6_ADIW,
+
+/* This is an 8 bit reloc for the AVR that stores bits 0..7 of a symbol
+in .byte lo8(symbol)  */
+  BFD_RELOC_AVR_8_LO,
+
+/* This is an 8 bit reloc for the AVR that stores bits 8..15 of a symbol
+in .byte hi8(symbol)  */
+  BFD_RELOC_AVR_8_HI,
+
+/* This is an 8 bit reloc for the AVR that stores bits 16..23 of a symbol
+in .byte hlo8(symbol)  */
+  BFD_RELOC_AVR_8_HLO,
+
+/* AVR relocations to mark the difference of two local symbols.
+These are only needed to support linker relaxation and can be ignored
+when not relaxing.  The field is set to the value of the difference
+assuming no relaxation.  The relocation encodes the position of the
+second symbol so the linker can determine whether to adjust the field
+value.  */
+  BFD_RELOC_AVR_DIFF8,
+  BFD_RELOC_AVR_DIFF16,
+  BFD_RELOC_AVR_DIFF32,
+
+/* This is a 7 bit reloc for the AVR that stores SRAM address for 16bit
+lds and sts instructions supported only on the tiny core.  */
+  BFD_RELOC_AVR_LDS_STS_16,
+
+/* This is a 6 bit reloc for the AVR that stores an I/O register
+number for the IN and OUT instructions  */
+  BFD_RELOC_AVR_PORT6,
+
+/* This is a 5 bit reloc for the AVR that stores an I/O register
+number for the SBIC, SBIS, SBI and CBI instructions  */
+  BFD_RELOC_AVR_PORT5,
+
+/* RISC-V relocations.  */
+  BFD_RELOC_RISCV_HI20,
+  BFD_RELOC_RISCV_PCREL_HI20,
+  BFD_RELOC_RISCV_PCREL_LO12_I,
+  BFD_RELOC_RISCV_PCREL_LO12_S,
+  BFD_RELOC_RISCV_LO12_I,
+  BFD_RELOC_RISCV_LO12_S,
+  BFD_RELOC_RISCV_GPREL12_I,
+  BFD_RELOC_RISCV_GPREL12_S,
+  BFD_RELOC_RISCV_TPREL_HI20,
+  BFD_RELOC_RISCV_TPREL_LO12_I,
+  BFD_RELOC_RISCV_TPREL_LO12_S,
+  BFD_RELOC_RISCV_TPREL_ADD,
+  BFD_RELOC_RISCV_CALL,
+  BFD_RELOC_RISCV_CALL_PLT,
+  BFD_RELOC_RISCV_ADD8,
+  BFD_RELOC_RISCV_ADD16,
+  BFD_RELOC_RISCV_ADD32,
+  BFD_RELOC_RISCV_ADD64,
+  BFD_RELOC_RISCV_SUB8,
+  BFD_RELOC_RISCV_SUB16,
+  BFD_RELOC_RISCV_SUB32,
+  BFD_RELOC_RISCV_SUB64,
+  BFD_RELOC_RISCV_GOT_HI20,
+  BFD_RELOC_RISCV_TLS_GOT_HI20,
+  BFD_RELOC_RISCV_TLS_GD_HI20,
+  BFD_RELOC_RISCV_JMP,
+  BFD_RELOC_RISCV_TLS_DTPMOD32,
+  BFD_RELOC_RISCV_TLS_DTPREL32,
+  BFD_RELOC_RISCV_TLS_DTPMOD64,
+  BFD_RELOC_RISCV_TLS_DTPREL64,
+  BFD_RELOC_RISCV_TLS_TPREL32,
+  BFD_RELOC_RISCV_TLS_TPREL64,
+  BFD_RELOC_RISCV_ALIGN,
+  BFD_RELOC_RISCV_RVC_BRANCH,
+  BFD_RELOC_RISCV_RVC_JUMP,
+  BFD_RELOC_RISCV_RVC_LUI,
+  BFD_RELOC_RISCV_GPREL_I,
+  BFD_RELOC_RISCV_GPREL_S,
+  BFD_RELOC_RISCV_TPREL_I,
+  BFD_RELOC_RISCV_TPREL_S,
+  BFD_RELOC_RISCV_RELAX,
+  BFD_RELOC_RISCV_CFA,
+  BFD_RELOC_RISCV_SUB6,
+  BFD_RELOC_RISCV_SET6,
+  BFD_RELOC_RISCV_SET8,
+  BFD_RELOC_RISCV_SET16,
+  BFD_RELOC_RISCV_SET32,
+/* Riscv, Pulp Specific */
+  BFD_RELOC_RISCV_REL12,
+  BFD_RELOC_RISCV_RELU5,
+  BFD_RELOC_RISCV_12_I,
+  BFD_RELOC_RISCV_12_S,
+
+
+/* Renesas RL78 Relocations.  */
+  BFD_RELOC_RL78_NEG8,
+  BFD_RELOC_RL78_NEG16,
+  BFD_RELOC_RL78_NEG24,
+  BFD_RELOC_RL78_NEG32,
+  BFD_RELOC_RL78_16_OP,
+  BFD_RELOC_RL78_24_OP,
+  BFD_RELOC_RL78_32_OP,
+  BFD_RELOC_RL78_8U,
+  BFD_RELOC_RL78_16U,
+  BFD_RELOC_RL78_24U,
+  BFD_RELOC_RL78_DIR3U_PCREL,
+  BFD_RELOC_RL78_DIFF,
+  BFD_RELOC_RL78_GPRELB,
+  BFD_RELOC_RL78_GPRELW,
+  BFD_RELOC_RL78_GPRELL,
+  BFD_RELOC_RL78_SYM,
+  BFD_RELOC_RL78_OP_SUBTRACT,
+  BFD_RELOC_RL78_OP_NEG,
+  BFD_RELOC_RL78_OP_AND,
+  BFD_RELOC_RL78_OP_SHRA,
+  BFD_RELOC_RL78_ABS8,
+  BFD_RELOC_RL78_ABS16,
+  BFD_RELOC_RL78_ABS16_REV,
+  BFD_RELOC_RL78_ABS32,
+  BFD_RELOC_RL78_ABS32_REV,
+  BFD_RELOC_RL78_ABS16U,
+  BFD_RELOC_RL78_ABS16UW,
+  BFD_RELOC_RL78_ABS16UL,
+  BFD_RELOC_RL78_RELAX,
+  BFD_RELOC_RL78_HI16,
+  BFD_RELOC_RL78_HI8,
+  BFD_RELOC_RL78_LO16,
+  BFD_RELOC_RL78_CODE,
+  BFD_RELOC_RL78_SADDR,
+
+/* Renesas RX Relocations.  */
+  BFD_RELOC_RX_NEG8,
+  BFD_RELOC_RX_NEG16,
+  BFD_RELOC_RX_NEG24,
+  BFD_RELOC_RX_NEG32,
+  BFD_RELOC_RX_16_OP,
+  BFD_RELOC_RX_24_OP,
+  BFD_RELOC_RX_32_OP,
+  BFD_RELOC_RX_8U,
+  BFD_RELOC_RX_16U,
+  BFD_RELOC_RX_24U,
+  BFD_RELOC_RX_DIR3U_PCREL,
+  BFD_RELOC_RX_DIFF,
+  BFD_RELOC_RX_GPRELB,
+  BFD_RELOC_RX_GPRELW,
+  BFD_RELOC_RX_GPRELL,
+  BFD_RELOC_RX_SYM,
+  BFD_RELOC_RX_OP_SUBTRACT,
+  BFD_RELOC_RX_OP_NEG,
+  BFD_RELOC_RX_ABS8,
+  BFD_RELOC_RX_ABS16,
+  BFD_RELOC_RX_ABS16_REV,
+  BFD_RELOC_RX_ABS32,
+  BFD_RELOC_RX_ABS32_REV,
+  BFD_RELOC_RX_ABS16U,
+  BFD_RELOC_RX_ABS16UW,
+  BFD_RELOC_RX_ABS16UL,
+  BFD_RELOC_RX_RELAX,
+
+/* Direct 12 bit.  */
+  BFD_RELOC_390_12,
+
+/* 12 bit GOT offset.  */
+  BFD_RELOC_390_GOT12,
+
+/* 32 bit PC relative PLT address.  */
+  BFD_RELOC_390_PLT32,
+
+/* Copy symbol at runtime.  */
+  BFD_RELOC_390_COPY,
+
+/* Create GOT entry.  */
+  BFD_RELOC_390_GLOB_DAT,
+
+/* Create PLT entry.  */
+  BFD_RELOC_390_JMP_SLOT,
+
+/* Adjust by program base.  */
+  BFD_RELOC_390_RELATIVE,
+
+/* 32 bit PC relative offset to GOT.  */
+  BFD_RELOC_390_GOTPC,
+
+/* 16 bit GOT offset.  */
+  BFD_RELOC_390_GOT16,
+
+/* PC relative 12 bit shifted by 1.  */
+  BFD_RELOC_390_PC12DBL,
+
+/* 12 bit PC rel. PLT shifted by 1.  */
+  BFD_RELOC_390_PLT12DBL,
+
+/* PC relative 16 bit shifted by 1.  */
+  BFD_RELOC_390_PC16DBL,
+
+/* 16 bit PC rel. PLT shifted by 1.  */
+  BFD_RELOC_390_PLT16DBL,
+
+/* PC relative 24 bit shifted by 1.  */
+  BFD_RELOC_390_PC24DBL,
+
+/* 24 bit PC rel. PLT shifted by 1.  */
+  BFD_RELOC_390_PLT24DBL,
+
+/* PC relative 32 bit shifted by 1.  */
+  BFD_RELOC_390_PC32DBL,
+
+/* 32 bit PC rel. PLT shifted by 1.  */
+  BFD_RELOC_390_PLT32DBL,
+
+/* 32 bit PC rel. GOT shifted by 1.  */
+  BFD_RELOC_390_GOTPCDBL,
+
+/* 64 bit GOT offset.  */
+  BFD_RELOC_390_GOT64,
+
+/* 64 bit PC relative PLT address.  */
+  BFD_RELOC_390_PLT64,
+
+/* 32 bit rel. offset to GOT entry.  */
+  BFD_RELOC_390_GOTENT,
+
+/* 64 bit offset to GOT.  */
+  BFD_RELOC_390_GOTOFF64,
+
+/* 12-bit offset to symbol-entry within GOT, with PLT handling.  */
+  BFD_RELOC_390_GOTPLT12,
+
+/* 16-bit offset to symbol-entry within GOT, with PLT handling.  */
+  BFD_RELOC_390_GOTPLT16,
+
+/* 32-bit offset to symbol-entry within GOT, with PLT handling.  */
+  BFD_RELOC_390_GOTPLT32,
+
+/* 64-bit offset to symbol-entry within GOT, with PLT handling.  */
+  BFD_RELOC_390_GOTPLT64,
+
+/* 32-bit rel. offset to symbol-entry within GOT, with PLT handling.  */
+  BFD_RELOC_390_GOTPLTENT,
+
+/* 16-bit rel. offset from the GOT to a PLT entry.  */
+  BFD_RELOC_390_PLTOFF16,
+
+/* 32-bit rel. offset from the GOT to a PLT entry.  */
+  BFD_RELOC_390_PLTOFF32,
+
+/* 64-bit rel. offset from the GOT to a PLT entry.  */
+  BFD_RELOC_390_PLTOFF64,
+
+/* s390 tls relocations.  */
+  BFD_RELOC_390_TLS_LOAD,
+  BFD_RELOC_390_TLS_GDCALL,
+  BFD_RELOC_390_TLS_LDCALL,
+  BFD_RELOC_390_TLS_GD32,
+  BFD_RELOC_390_TLS_GD64,
+  BFD_RELOC_390_TLS_GOTIE12,
+  BFD_RELOC_390_TLS_GOTIE32,
+  BFD_RELOC_390_TLS_GOTIE64,
+  BFD_RELOC_390_TLS_LDM32,
+  BFD_RELOC_390_TLS_LDM64,
+  BFD_RELOC_390_TLS_IE32,
+  BFD_RELOC_390_TLS_IE64,
+  BFD_RELOC_390_TLS_IEENT,
+  BFD_RELOC_390_TLS_LE32,
+  BFD_RELOC_390_TLS_LE64,
+  BFD_RELOC_390_TLS_LDO32,
+  BFD_RELOC_390_TLS_LDO64,
+  BFD_RELOC_390_TLS_DTPMOD,
+  BFD_RELOC_390_TLS_DTPOFF,
+  BFD_RELOC_390_TLS_TPOFF,
+
+/* Long displacement extension.  */
+  BFD_RELOC_390_20,
+  BFD_RELOC_390_GOT20,
+  BFD_RELOC_390_GOTPLT20,
+  BFD_RELOC_390_TLS_GOTIE20,
+
+/* STT_GNU_IFUNC relocation.  */
+  BFD_RELOC_390_IRELATIVE,
+
+/* Score relocations
+Low 16 bit for load/store  */
+  BFD_RELOC_SCORE_GPREL15,
+
+/* This is a 24-bit reloc with the right 1 bit assumed to be 0  */
+  BFD_RELOC_SCORE_DUMMY2,
+  BFD_RELOC_SCORE_JMP,
+
+/* This is a 19-bit reloc with the right 1 bit assumed to be 0  */
+  BFD_RELOC_SCORE_BRANCH,
+
+/* This is a 32-bit reloc for 48-bit instructions.  */
+  BFD_RELOC_SCORE_IMM30,
+
+/* This is a 32-bit reloc for 48-bit instructions.  */
+  BFD_RELOC_SCORE_IMM32,
+
+/* This is an 11-bit reloc with the right 1 bit assumed to be 0  */
+  BFD_RELOC_SCORE16_JMP,
+
+/* This is an 8-bit reloc with the right 1 bit assumed to be 0  */
+  BFD_RELOC_SCORE16_BRANCH,
+
+/* This is a 9-bit reloc with the right 1 bit assumed to be 0  */
+  BFD_RELOC_SCORE_BCMP,
+
+/* Undocumented Score relocs  */
+  BFD_RELOC_SCORE_GOT15,
+  BFD_RELOC_SCORE_GOT_LO16,
+  BFD_RELOC_SCORE_CALL15,
+  BFD_RELOC_SCORE_DUMMY_HI16,
+
+/* Scenix IP2K - 9-bit register number / data address  */
+  BFD_RELOC_IP2K_FR9,
+
+/* Scenix IP2K - 4-bit register/data bank number  */
+  BFD_RELOC_IP2K_BANK,
+
+/* Scenix IP2K - low 13 bits of instruction word address  */
+  BFD_RELOC_IP2K_ADDR16CJP,
+
+/* Scenix IP2K - high 3 bits of instruction word address  */
+  BFD_RELOC_IP2K_PAGE3,
+
+/* Scenix IP2K - ext/low/high 8 bits of data address  */
+  BFD_RELOC_IP2K_LO8DATA,
+  BFD_RELOC_IP2K_HI8DATA,
+  BFD_RELOC_IP2K_EX8DATA,
+
+/* Scenix IP2K - low/high 8 bits of instruction word address  */
+  BFD_RELOC_IP2K_LO8INSN,
+  BFD_RELOC_IP2K_HI8INSN,
+
+/* Scenix IP2K - even/odd PC modifier to modify snb pcl.0  */
+  BFD_RELOC_IP2K_PC_SKIP,
+
+/* Scenix IP2K - 16 bit word address in text section.  */
+  BFD_RELOC_IP2K_TEXT,
+
+/* Scenix IP2K - 7-bit sp or dp offset  */
+  BFD_RELOC_IP2K_FR_OFFSET,
+
+/* Scenix VPE4K coprocessor - data/insn-space addressing  */
+  BFD_RELOC_VPE4KMATH_DATA,
+  BFD_RELOC_VPE4KMATH_INSN,
+
+/* These two relocations are used by the linker to determine which of
+the entries in a C++ virtual function table are actually used.  When
+the --gc-sections option is given, the linker will zero out the entries
+that are not used, so that the code for those functions need not be
+included in the output.
+
+VTABLE_INHERIT is a zero-space relocation used to describe to the
+linker the inheritance tree of a C++ virtual function table.  The
+relocation's symbol should be the parent class' vtable, and the
+relocation should be located at the child vtable.
+
+VTABLE_ENTRY is a zero-space relocation that describes the use of a
+virtual function table entry.  The reloc's symbol should refer to the
+table of the class mentioned in the code.  Off of that base, an offset
+describes the entry that is being used.  For Rela hosts, this offset
+is stored in the reloc's addend.  For Rel hosts, we are forced to put
+this offset in the reloc's section offset.  */
+  BFD_RELOC_VTABLE_INHERIT,
+  BFD_RELOC_VTABLE_ENTRY,
+
+/* Intel IA64 Relocations.  */
+  BFD_RELOC_IA64_IMM14,
+  BFD_RELOC_IA64_IMM22,
+  BFD_RELOC_IA64_IMM64,
+  BFD_RELOC_IA64_DIR32MSB,
+  BFD_RELOC_IA64_DIR32LSB,
+  BFD_RELOC_IA64_DIR64MSB,
+  BFD_RELOC_IA64_DIR64LSB,
+  BFD_RELOC_IA64_GPREL22,
+  BFD_RELOC_IA64_GPREL64I,
+  BFD_RELOC_IA64_GPREL32MSB,
+  BFD_RELOC_IA64_GPREL32LSB,
+  BFD_RELOC_IA64_GPREL64MSB,
+  BFD_RELOC_IA64_GPREL64LSB,
+  BFD_RELOC_IA64_LTOFF22,
+  BFD_RELOC_IA64_LTOFF64I,
+  BFD_RELOC_IA64_PLTOFF22,
+  BFD_RELOC_IA64_PLTOFF64I,
+  BFD_RELOC_IA64_PLTOFF64MSB,
+  BFD_RELOC_IA64_PLTOFF64LSB,
+  BFD_RELOC_IA64_FPTR64I,
+  BFD_RELOC_IA64_FPTR32MSB,
+  BFD_RELOC_IA64_FPTR32LSB,
+  BFD_RELOC_IA64_FPTR64MSB,
+  BFD_RELOC_IA64_FPTR64LSB,
+  BFD_RELOC_IA64_PCREL21B,
+  BFD_RELOC_IA64_PCREL21BI,
+  BFD_RELOC_IA64_PCREL21M,
+  BFD_RELOC_IA64_PCREL21F,
+  BFD_RELOC_IA64_PCREL22,
+  BFD_RELOC_IA64_PCREL60B,
+  BFD_RELOC_IA64_PCREL64I,
+  BFD_RELOC_IA64_PCREL32MSB,
+  BFD_RELOC_IA64_PCREL32LSB,
+  BFD_RELOC_IA64_PCREL64MSB,
+  BFD_RELOC_IA64_PCREL64LSB,
+  BFD_RELOC_IA64_LTOFF_FPTR22,
+  BFD_RELOC_IA64_LTOFF_FPTR64I,
+  BFD_RELOC_IA64_LTOFF_FPTR32MSB,
+  BFD_RELOC_IA64_LTOFF_FPTR32LSB,
+  BFD_RELOC_IA64_LTOFF_FPTR64MSB,
+  BFD_RELOC_IA64_LTOFF_FPTR64LSB,
+  BFD_RELOC_IA64_SEGREL32MSB,
+  BFD_RELOC_IA64_SEGREL32LSB,
+  BFD_RELOC_IA64_SEGREL64MSB,
+  BFD_RELOC_IA64_SEGREL64LSB,
+  BFD_RELOC_IA64_SECREL32MSB,
+  BFD_RELOC_IA64_SECREL32LSB,
+  BFD_RELOC_IA64_SECREL64MSB,
+  BFD_RELOC_IA64_SECREL64LSB,
+  BFD_RELOC_IA64_REL32MSB,
+  BFD_RELOC_IA64_REL32LSB,
+  BFD_RELOC_IA64_REL64MSB,
+  BFD_RELOC_IA64_REL64LSB,
+  BFD_RELOC_IA64_LTV32MSB,
+  BFD_RELOC_IA64_LTV32LSB,
+  BFD_RELOC_IA64_LTV64MSB,
+  BFD_RELOC_IA64_LTV64LSB,
+  BFD_RELOC_IA64_IPLTMSB,
+  BFD_RELOC_IA64_IPLTLSB,
+  BFD_RELOC_IA64_COPY,
+  BFD_RELOC_IA64_LTOFF22X,
+  BFD_RELOC_IA64_LDXMOV,
+  BFD_RELOC_IA64_TPREL14,
+  BFD_RELOC_IA64_TPREL22,
+  BFD_RELOC_IA64_TPREL64I,
+  BFD_RELOC_IA64_TPREL64MSB,
+  BFD_RELOC_IA64_TPREL64LSB,
+  BFD_RELOC_IA64_LTOFF_TPREL22,
+  BFD_RELOC_IA64_DTPMOD64MSB,
+  BFD_RELOC_IA64_DTPMOD64LSB,
+  BFD_RELOC_IA64_LTOFF_DTPMOD22,
+  BFD_RELOC_IA64_DTPREL14,
+  BFD_RELOC_IA64_DTPREL22,
+  BFD_RELOC_IA64_DTPREL64I,
+  BFD_RELOC_IA64_DTPREL32MSB,
+  BFD_RELOC_IA64_DTPREL32LSB,
+  BFD_RELOC_IA64_DTPREL64MSB,
+  BFD_RELOC_IA64_DTPREL64LSB,
+  BFD_RELOC_IA64_LTOFF_DTPREL22,
+
+/* Motorola 68HC11 reloc.
+This is the 8 bit high part of an absolute address.  */
+  BFD_RELOC_M68HC11_HI8,
+
+/* Motorola 68HC11 reloc.
+This is the 8 bit low part of an absolute address.  */
+  BFD_RELOC_M68HC11_LO8,
+
+/* Motorola 68HC11 reloc.
+This is the 3 bits of a value.  */
+  BFD_RELOC_M68HC11_3B,
+
+/* Motorola 68HC11 reloc.
+This reloc marks the beginning of a jump/call instruction.
+It is used for linker relaxation to correctly identify beginning
+of instruction and change some branches to use PC-relative
+addressing mode.  */
+  BFD_RELOC_M68HC11_RL_JUMP,
+
+/* Motorola 68HC11 reloc.
+This reloc marks a group of several instructions that gcc generates
+and for which the linker relaxation pass can modify and/or remove
+some of them.  */
+  BFD_RELOC_M68HC11_RL_GROUP,
+
+/* Motorola 68HC11 reloc.
+This is the 16-bit lower part of an address.  It is used for 'call'
+instruction to specify the symbol address without any special
+transformation (due to memory bank window).  */
+  BFD_RELOC_M68HC11_LO16,
+
+/* Motorola 68HC11 reloc.
+This is an 8-bit reloc that specifies the page number of an address.
+It is used by 'call' instruction to specify the page number of
+the symbol.  */
+  BFD_RELOC_M68HC11_PAGE,
+
+/* Motorola 68HC11 reloc.
+This is a 24-bit reloc that represents the address with a 16-bit
+value and an 8-bit page number.  The symbol address is transformed
+to follow the 16K memory bank of 68HC12 (seen as mapped in the window).  */
+  BFD_RELOC_M68HC11_24,
+
+/* Motorola 68HC12 reloc.
+This is the 5 bits of a value.  */
+  BFD_RELOC_M68HC12_5B,
+
+/* Freescale XGATE reloc.
+This reloc marks the beginning of a bra/jal instruction.  */
+  BFD_RELOC_XGATE_RL_JUMP,
+
+/* Freescale XGATE reloc.
+This reloc marks a group of several instructions that gcc generates
+and for which the linker relaxation pass can modify and/or remove
+some of them.  */
+  BFD_RELOC_XGATE_RL_GROUP,
+
+/* Freescale XGATE reloc.
+This is the 16-bit lower part of an address.  It is used for the '16-bit'
+instructions.  */
+  BFD_RELOC_XGATE_LO16,
+
+/* Freescale XGATE reloc.  */
+  BFD_RELOC_XGATE_GPAGE,
+
+/* Freescale XGATE reloc.  */
+  BFD_RELOC_XGATE_24,
+
+/* Freescale XGATE reloc.
+This is a 9-bit pc-relative reloc.  */
+  BFD_RELOC_XGATE_PCREL_9,
+
+/* Freescale XGATE reloc.
+This is a 10-bit pc-relative reloc.  */
+  BFD_RELOC_XGATE_PCREL_10,
+
+/* Freescale XGATE reloc.
+This is the 16-bit lower part of an address.  It is used for the '16-bit'
+instructions.  */
+  BFD_RELOC_XGATE_IMM8_LO,
+
+/* Freescale XGATE reloc.
+This is the 16-bit higher part of an address.  It is used for the '16-bit'
+instructions.  */
+  BFD_RELOC_XGATE_IMM8_HI,
+
+/* Freescale XGATE reloc.
+This is a 3-bit pc-relative reloc.  */
+  BFD_RELOC_XGATE_IMM3,
+
+/* Freescale XGATE reloc.
+This is a 4-bit pc-relative reloc.  */
+  BFD_RELOC_XGATE_IMM4,
+
+/* Freescale XGATE reloc.
+This is a 5-bit pc-relative reloc.  */
+  BFD_RELOC_XGATE_IMM5,
+
+/* Motorola 68HC12 reloc.
+This is the 9 bits of a value.  */
+  BFD_RELOC_M68HC12_9B,
+
+/* Motorola 68HC12 reloc.
+This is the 16 bits of a value.  */
+  BFD_RELOC_M68HC12_16B,
+
+/* Motorola 68HC12/XGATE reloc.
+This is a PCREL9 branch.  */
+  BFD_RELOC_M68HC12_9_PCREL,
+
+/* Motorola 68HC12/XGATE reloc.
+This is a PCREL10 branch.  */
+  BFD_RELOC_M68HC12_10_PCREL,
+
+/* Motorola 68HC12/XGATE reloc.
+This is the 8 bit low part of an absolute address and immediately precedes
+a matching HI8XG part.  */
+  BFD_RELOC_M68HC12_LO8XG,
+
+/* Motorola 68HC12/XGATE reloc.
+This is the 8 bit high part of an absolute address and immediately follows
+a matching LO8XG part.  */
+  BFD_RELOC_M68HC12_HI8XG,
+
+/* NS CR16C Relocations.  */
+  BFD_RELOC_16C_NUM08,
+  BFD_RELOC_16C_NUM08_C,
+  BFD_RELOC_16C_NUM16,
+  BFD_RELOC_16C_NUM16_C,
+  BFD_RELOC_16C_NUM32,
+  BFD_RELOC_16C_NUM32_C,
+  BFD_RELOC_16C_DISP04,
+  BFD_RELOC_16C_DISP04_C,
+  BFD_RELOC_16C_DISP08,
+  BFD_RELOC_16C_DISP08_C,
+  BFD_RELOC_16C_DISP16,
+  BFD_RELOC_16C_DISP16_C,
+  BFD_RELOC_16C_DISP24,
+  BFD_RELOC_16C_DISP24_C,
+  BFD_RELOC_16C_DISP24a,
+  BFD_RELOC_16C_DISP24a_C,
+  BFD_RELOC_16C_REG04,
+  BFD_RELOC_16C_REG04_C,
+  BFD_RELOC_16C_REG04a,
+  BFD_RELOC_16C_REG04a_C,
+  BFD_RELOC_16C_REG14,
+  BFD_RELOC_16C_REG14_C,
+  BFD_RELOC_16C_REG16,
+  BFD_RELOC_16C_REG16_C,
+  BFD_RELOC_16C_REG20,
+  BFD_RELOC_16C_REG20_C,
+  BFD_RELOC_16C_ABS20,
+  BFD_RELOC_16C_ABS20_C,
+  BFD_RELOC_16C_ABS24,
+  BFD_RELOC_16C_ABS24_C,
+  BFD_RELOC_16C_IMM04,
+  BFD_RELOC_16C_IMM04_C,
+  BFD_RELOC_16C_IMM16,
+  BFD_RELOC_16C_IMM16_C,
+  BFD_RELOC_16C_IMM20,
+  BFD_RELOC_16C_IMM20_C,
+  BFD_RELOC_16C_IMM24,
+  BFD_RELOC_16C_IMM24_C,
+  BFD_RELOC_16C_IMM32,
+  BFD_RELOC_16C_IMM32_C,
+
+/* NS CR16 Relocations.  */
+  BFD_RELOC_CR16_NUM8,
+  BFD_RELOC_CR16_NUM16,
+  BFD_RELOC_CR16_NUM32,
+  BFD_RELOC_CR16_NUM32a,
+  BFD_RELOC_CR16_REGREL0,
+  BFD_RELOC_CR16_REGREL4,
+  BFD_RELOC_CR16_REGREL4a,
+  BFD_RELOC_CR16_REGREL14,
+  BFD_RELOC_CR16_REGREL14a,
+  BFD_RELOC_CR16_REGREL16,
+  BFD_RELOC_CR16_REGREL20,
+  BFD_RELOC_CR16_REGREL20a,
+  BFD_RELOC_CR16_ABS20,
+  BFD_RELOC_CR16_ABS24,
+  BFD_RELOC_CR16_IMM4,
+  BFD_RELOC_CR16_IMM8,
+  BFD_RELOC_CR16_IMM16,
+  BFD_RELOC_CR16_IMM20,
+  BFD_RELOC_CR16_IMM24,
+  BFD_RELOC_CR16_IMM32,
+  BFD_RELOC_CR16_IMM32a,
+  BFD_RELOC_CR16_DISP4,
+  BFD_RELOC_CR16_DISP8,
+  BFD_RELOC_CR16_DISP16,
+  BFD_RELOC_CR16_DISP20,
+  BFD_RELOC_CR16_DISP24,
+  BFD_RELOC_CR16_DISP24a,
+  BFD_RELOC_CR16_SWITCH8,
+  BFD_RELOC_CR16_SWITCH16,
+  BFD_RELOC_CR16_SWITCH32,
+  BFD_RELOC_CR16_GOT_REGREL20,
+  BFD_RELOC_CR16_GOTC_REGREL20,
+  BFD_RELOC_CR16_GLOB_DAT,
+
+/* NS CRX Relocations.  */
+  BFD_RELOC_CRX_REL4,
+  BFD_RELOC_CRX_REL8,
+  BFD_RELOC_CRX_REL8_CMP,
+  BFD_RELOC_CRX_REL16,
+  BFD_RELOC_CRX_REL24,
+  BFD_RELOC_CRX_REL32,
+  BFD_RELOC_CRX_REGREL12,
+  BFD_RELOC_CRX_REGREL22,
+  BFD_RELOC_CRX_REGREL28,
+  BFD_RELOC_CRX_REGREL32,
+  BFD_RELOC_CRX_ABS16,
+  BFD_RELOC_CRX_ABS32,
+  BFD_RELOC_CRX_NUM8,
+  BFD_RELOC_CRX_NUM16,
+  BFD_RELOC_CRX_NUM32,
+  BFD_RELOC_CRX_IMM16,
+  BFD_RELOC_CRX_IMM32,
+  BFD_RELOC_CRX_SWITCH8,
+  BFD_RELOC_CRX_SWITCH16,
+  BFD_RELOC_CRX_SWITCH32,
+
+/* These relocs are only used within the CRIS assembler.  They are not
+(at present) written to any object files.  */
+  BFD_RELOC_CRIS_BDISP8,
+  BFD_RELOC_CRIS_UNSIGNED_5,
+  BFD_RELOC_CRIS_SIGNED_6,
+  BFD_RELOC_CRIS_UNSIGNED_6,
+  BFD_RELOC_CRIS_SIGNED_8,
+  BFD_RELOC_CRIS_UNSIGNED_8,
+  BFD_RELOC_CRIS_SIGNED_16,
+  BFD_RELOC_CRIS_UNSIGNED_16,
+  BFD_RELOC_CRIS_LAPCQ_OFFSET,
+  BFD_RELOC_CRIS_UNSIGNED_4,
+
+/* Relocs used in ELF shared libraries for CRIS.  */
+  BFD_RELOC_CRIS_COPY,
+  BFD_RELOC_CRIS_GLOB_DAT,
+  BFD_RELOC_CRIS_JUMP_SLOT,
+  BFD_RELOC_CRIS_RELATIVE,
+
+/* 32-bit offset to symbol-entry within GOT.  */
+  BFD_RELOC_CRIS_32_GOT,
+
+/* 16-bit offset to symbol-entry within GOT.  */
+  BFD_RELOC_CRIS_16_GOT,
+
+/* 32-bit offset to symbol-entry within GOT, with PLT handling.  */
+  BFD_RELOC_CRIS_32_GOTPLT,
+
+/* 16-bit offset to symbol-entry within GOT, with PLT handling.  */
+  BFD_RELOC_CRIS_16_GOTPLT,
+
+/* 32-bit offset to symbol, relative to GOT.  */
+  BFD_RELOC_CRIS_32_GOTREL,
+
+/* 32-bit offset to symbol with PLT entry, relative to GOT.  */
+  BFD_RELOC_CRIS_32_PLT_GOTREL,
+
+/* 32-bit offset to symbol with PLT entry, relative to this relocation.  */
+  BFD_RELOC_CRIS_32_PLT_PCREL,
+
+/* Relocs used in TLS code for CRIS.  */
+  BFD_RELOC_CRIS_32_GOT_GD,
+  BFD_RELOC_CRIS_16_GOT_GD,
+  BFD_RELOC_CRIS_32_GD,
+  BFD_RELOC_CRIS_DTP,
+  BFD_RELOC_CRIS_32_DTPREL,
+  BFD_RELOC_CRIS_16_DTPREL,
+  BFD_RELOC_CRIS_32_GOT_TPREL,
+  BFD_RELOC_CRIS_16_GOT_TPREL,
+  BFD_RELOC_CRIS_32_TPREL,
+  BFD_RELOC_CRIS_16_TPREL,
+  BFD_RELOC_CRIS_DTPMOD,
+  BFD_RELOC_CRIS_32_IE,
+
+/* Intel i860 Relocations.  */
+  BFD_RELOC_860_COPY,
+  BFD_RELOC_860_GLOB_DAT,
+  BFD_RELOC_860_JUMP_SLOT,
+  BFD_RELOC_860_RELATIVE,
+  BFD_RELOC_860_PC26,
+  BFD_RELOC_860_PLT26,
+  BFD_RELOC_860_PC16,
+  BFD_RELOC_860_LOW0,
+  BFD_RELOC_860_SPLIT0,
+  BFD_RELOC_860_LOW1,
+  BFD_RELOC_860_SPLIT1,
+  BFD_RELOC_860_LOW2,
+  BFD_RELOC_860_SPLIT2,
+  BFD_RELOC_860_LOW3,
+  BFD_RELOC_860_LOGOT0,
+  BFD_RELOC_860_SPGOT0,
+  BFD_RELOC_860_LOGOT1,
+  BFD_RELOC_860_SPGOT1,
+  BFD_RELOC_860_LOGOTOFF0,
+  BFD_RELOC_860_SPGOTOFF0,
+  BFD_RELOC_860_LOGOTOFF1,
+  BFD_RELOC_860_SPGOTOFF1,
+  BFD_RELOC_860_LOGOTOFF2,
+  BFD_RELOC_860_LOGOTOFF3,
+  BFD_RELOC_860_LOPC,
+  BFD_RELOC_860_HIGHADJ,
+  BFD_RELOC_860_HAGOT,
+  BFD_RELOC_860_HAGOTOFF,
+  BFD_RELOC_860_HAPC,
+  BFD_RELOC_860_HIGH,
+  BFD_RELOC_860_HIGOT,
+  BFD_RELOC_860_HIGOTOFF,
+
+/* OpenRISC 1000 Relocations.  */
+  BFD_RELOC_OR1K_REL_26,
+  BFD_RELOC_OR1K_GOTPC_HI16,
+  BFD_RELOC_OR1K_GOTPC_LO16,
+  BFD_RELOC_OR1K_GOT16,
+  BFD_RELOC_OR1K_PLT26,
+  BFD_RELOC_OR1K_GOTOFF_HI16,
+  BFD_RELOC_OR1K_GOTOFF_LO16,
+  BFD_RELOC_OR1K_COPY,
+  BFD_RELOC_OR1K_GLOB_DAT,
+  BFD_RELOC_OR1K_JMP_SLOT,
+  BFD_RELOC_OR1K_RELATIVE,
+  BFD_RELOC_OR1K_TLS_GD_HI16,
+  BFD_RELOC_OR1K_TLS_GD_LO16,
+  BFD_RELOC_OR1K_TLS_LDM_HI16,
+  BFD_RELOC_OR1K_TLS_LDM_LO16,
+  BFD_RELOC_OR1K_TLS_LDO_HI16,
+  BFD_RELOC_OR1K_TLS_LDO_LO16,
+  BFD_RELOC_OR1K_TLS_IE_HI16,
+  BFD_RELOC_OR1K_TLS_IE_LO16,
+  BFD_RELOC_OR1K_TLS_LE_HI16,
+  BFD_RELOC_OR1K_TLS_LE_LO16,
+  BFD_RELOC_OR1K_TLS_TPOFF,
+  BFD_RELOC_OR1K_TLS_DTPOFF,
+  BFD_RELOC_OR1K_TLS_DTPMOD,
+
+/* H8 elf Relocations.  */
+  BFD_RELOC_H8_DIR16A8,
+  BFD_RELOC_H8_DIR16R8,
+  BFD_RELOC_H8_DIR24A8,
+  BFD_RELOC_H8_DIR24R8,
+  BFD_RELOC_H8_DIR32A16,
+  BFD_RELOC_H8_DISP32A16,
+
+/* Sony Xstormy16 Relocations.  */
+  BFD_RELOC_XSTORMY16_REL_12,
+  BFD_RELOC_XSTORMY16_12,
+  BFD_RELOC_XSTORMY16_24,
+  BFD_RELOC_XSTORMY16_FPTR16,
+
+/* Self-describing complex relocations.  */
+  BFD_RELOC_RELC,
+
+
+/* Infineon Relocations.  */
+  BFD_RELOC_XC16X_PAG,
+  BFD_RELOC_XC16X_POF,
+  BFD_RELOC_XC16X_SEG,
+  BFD_RELOC_XC16X_SOF,
+
+/* Relocations used by VAX ELF.  */
+  BFD_RELOC_VAX_GLOB_DAT,
+  BFD_RELOC_VAX_JMP_SLOT,
+  BFD_RELOC_VAX_RELATIVE,
+
+/* Morpho MT - 16 bit immediate relocation.  */
+  BFD_RELOC_MT_PC16,
+
+/* Morpho MT - Hi 16 bits of an address.  */
+  BFD_RELOC_MT_HI16,
+
+/* Morpho MT - Low 16 bits of an address.  */
+  BFD_RELOC_MT_LO16,
+
+/* Morpho MT - Used to tell the linker which vtable entries are used.  */
+  BFD_RELOC_MT_GNU_VTINHERIT,
+
+/* Morpho MT - Used to tell the linker which vtable entries are used.  */
+  BFD_RELOC_MT_GNU_VTENTRY,
+
+/* Morpho MT - 8 bit immediate relocation.  */
+  BFD_RELOC_MT_PCINSN8,
+
+/* msp430 specific relocation codes  */
+  BFD_RELOC_MSP430_10_PCREL,
+  BFD_RELOC_MSP430_16_PCREL,
+  BFD_RELOC_MSP430_16,
+  BFD_RELOC_MSP430_16_PCREL_BYTE,
+  BFD_RELOC_MSP430_16_BYTE,
+  BFD_RELOC_MSP430_2X_PCREL,
+  BFD_RELOC_MSP430_RL_PCREL,
+  BFD_RELOC_MSP430_ABS8,
+  BFD_RELOC_MSP430X_PCR20_EXT_SRC,
+  BFD_RELOC_MSP430X_PCR20_EXT_DST,
+  BFD_RELOC_MSP430X_PCR20_EXT_ODST,
+  BFD_RELOC_MSP430X_ABS20_EXT_SRC,
+  BFD_RELOC_MSP430X_ABS20_EXT_DST,
+  BFD_RELOC_MSP430X_ABS20_EXT_ODST,
+  BFD_RELOC_MSP430X_ABS20_ADR_SRC,
+  BFD_RELOC_MSP430X_ABS20_ADR_DST,
+  BFD_RELOC_MSP430X_PCR16,
+  BFD_RELOC_MSP430X_PCR20_CALL,
+  BFD_RELOC_MSP430X_ABS16,
+  BFD_RELOC_MSP430_ABS_HI16,
+  BFD_RELOC_MSP430_PREL31,
+  BFD_RELOC_MSP430_SYM_DIFF,
+
+/* Relocations used by the Altera Nios II core.  */
+  BFD_RELOC_NIOS2_S16,
+  BFD_RELOC_NIOS2_U16,
+  BFD_RELOC_NIOS2_CALL26,
+  BFD_RELOC_NIOS2_IMM5,
+  BFD_RELOC_NIOS2_CACHE_OPX,
+  BFD_RELOC_NIOS2_IMM6,
+  BFD_RELOC_NIOS2_IMM8,
+  BFD_RELOC_NIOS2_HI16,
+  BFD_RELOC_NIOS2_LO16,
+  BFD_RELOC_NIOS2_HIADJ16,
+  BFD_RELOC_NIOS2_GPREL,
+  BFD_RELOC_NIOS2_UJMP,
+  BFD_RELOC_NIOS2_CJMP,
+  BFD_RELOC_NIOS2_CALLR,
+  BFD_RELOC_NIOS2_ALIGN,
+  BFD_RELOC_NIOS2_GOT16,
+  BFD_RELOC_NIOS2_CALL16,
+  BFD_RELOC_NIOS2_GOTOFF_LO,
+  BFD_RELOC_NIOS2_GOTOFF_HA,
+  BFD_RELOC_NIOS2_PCREL_LO,
+  BFD_RELOC_NIOS2_PCREL_HA,
+  BFD_RELOC_NIOS2_TLS_GD16,
+  BFD_RELOC_NIOS2_TLS_LDM16,
+  BFD_RELOC_NIOS2_TLS_LDO16,
+  BFD_RELOC_NIOS2_TLS_IE16,
+  BFD_RELOC_NIOS2_TLS_LE16,
+  BFD_RELOC_NIOS2_TLS_DTPMOD,
+  BFD_RELOC_NIOS2_TLS_DTPREL,
+  BFD_RELOC_NIOS2_TLS_TPREL,
+  BFD_RELOC_NIOS2_COPY,
+  BFD_RELOC_NIOS2_GLOB_DAT,
+  BFD_RELOC_NIOS2_JUMP_SLOT,
+  BFD_RELOC_NIOS2_RELATIVE,
+  BFD_RELOC_NIOS2_GOTOFF,
+  BFD_RELOC_NIOS2_CALL26_NOAT,
+  BFD_RELOC_NIOS2_GOT_LO,
+  BFD_RELOC_NIOS2_GOT_HA,
+  BFD_RELOC_NIOS2_CALL_LO,
+  BFD_RELOC_NIOS2_CALL_HA,
+  BFD_RELOC_NIOS2_R2_S12,
+  BFD_RELOC_NIOS2_R2_I10_1_PCREL,
+  BFD_RELOC_NIOS2_R2_T1I7_1_PCREL,
+  BFD_RELOC_NIOS2_R2_T1I7_2,
+  BFD_RELOC_NIOS2_R2_T2I4,
+  BFD_RELOC_NIOS2_R2_T2I4_1,
+  BFD_RELOC_NIOS2_R2_T2I4_2,
+  BFD_RELOC_NIOS2_R2_X1I7_2,
+  BFD_RELOC_NIOS2_R2_X2L5,
+  BFD_RELOC_NIOS2_R2_F1I5_2,
+  BFD_RELOC_NIOS2_R2_L5I4X1,
+  BFD_RELOC_NIOS2_R2_T1X1I6,
+  BFD_RELOC_NIOS2_R2_T1X1I6_2,
+
+/* IQ2000 Relocations.  */
+  BFD_RELOC_IQ2000_OFFSET_16,
+  BFD_RELOC_IQ2000_OFFSET_21,
+  BFD_RELOC_IQ2000_UHI16,
+
+/* Special Xtensa relocation used only by PLT entries in ELF shared
+objects to indicate that the runtime linker should set the value
+to one of its own internal functions or data structures.  */
+  BFD_RELOC_XTENSA_RTLD,
+
+/* Xtensa relocations for ELF shared objects.  */
+  BFD_RELOC_XTENSA_GLOB_DAT,
+  BFD_RELOC_XTENSA_JMP_SLOT,
+  BFD_RELOC_XTENSA_RELATIVE,
+
+/* Xtensa relocation used in ELF object files for symbols that may require
+PLT entries.  Otherwise, this is just a generic 32-bit relocation.  */
+  BFD_RELOC_XTENSA_PLT,
+
+/* Xtensa relocations to mark the difference of two local symbols.
+These are only needed to support linker relaxation and can be ignored
+when not relaxing.  The field is set to the value of the difference
+assuming no relaxation.  The relocation encodes the position of the
+first symbol so the linker can determine whether to adjust the field
+value.  */
+  BFD_RELOC_XTENSA_DIFF8,
+  BFD_RELOC_XTENSA_DIFF16,
+  BFD_RELOC_XTENSA_DIFF32,
+
+/* Generic Xtensa relocations for instruction operands.  Only the slot
+number is encoded in the relocation.  The relocation applies to the
+last PC-relative immediate operand, or if there are no PC-relative
+immediates, to the last immediate operand.  */
+  BFD_RELOC_XTENSA_SLOT0_OP,
+  BFD_RELOC_XTENSA_SLOT1_OP,
+  BFD_RELOC_XTENSA_SLOT2_OP,
+  BFD_RELOC_XTENSA_SLOT3_OP,
+  BFD_RELOC_XTENSA_SLOT4_OP,
+  BFD_RELOC_XTENSA_SLOT5_OP,
+  BFD_RELOC_XTENSA_SLOT6_OP,
+  BFD_RELOC_XTENSA_SLOT7_OP,
+  BFD_RELOC_XTENSA_SLOT8_OP,
+  BFD_RELOC_XTENSA_SLOT9_OP,
+  BFD_RELOC_XTENSA_SLOT10_OP,
+  BFD_RELOC_XTENSA_SLOT11_OP,
+  BFD_RELOC_XTENSA_SLOT12_OP,
+  BFD_RELOC_XTENSA_SLOT13_OP,
+  BFD_RELOC_XTENSA_SLOT14_OP,
+
+/* Alternate Xtensa relocations.  Only the slot is encoded in the
+relocation.  The meaning of these relocations is opcode-specific.  */
+  BFD_RELOC_XTENSA_SLOT0_ALT,
+  BFD_RELOC_XTENSA_SLOT1_ALT,
+  BFD_RELOC_XTENSA_SLOT2_ALT,
+  BFD_RELOC_XTENSA_SLOT3_ALT,
+  BFD_RELOC_XTENSA_SLOT4_ALT,
+  BFD_RELOC_XTENSA_SLOT5_ALT,
+  BFD_RELOC_XTENSA_SLOT6_ALT,
+  BFD_RELOC_XTENSA_SLOT7_ALT,
+  BFD_RELOC_XTENSA_SLOT8_ALT,
+  BFD_RELOC_XTENSA_SLOT9_ALT,
+  BFD_RELOC_XTENSA_SLOT10_ALT,
+  BFD_RELOC_XTENSA_SLOT11_ALT,
+  BFD_RELOC_XTENSA_SLOT12_ALT,
+  BFD_RELOC_XTENSA_SLOT13_ALT,
+  BFD_RELOC_XTENSA_SLOT14_ALT,
+
+/* Xtensa relocations for backward compatibility.  These have all been
+replaced by BFD_RELOC_XTENSA_SLOT0_OP.  */
+  BFD_RELOC_XTENSA_OP0,
+  BFD_RELOC_XTENSA_OP1,
+  BFD_RELOC_XTENSA_OP2,
+
+/* Xtensa relocation to mark that the assembler expanded the
+instructions from an original target.  The expansion size is
+encoded in the reloc size.  */
+  BFD_RELOC_XTENSA_ASM_EXPAND,
+
+/* Xtensa relocation to mark that the linker should simplify
+assembler-expanded instructions.  This is commonly used
+internally by the linker after analysis of a
+BFD_RELOC_XTENSA_ASM_EXPAND.  */
+  BFD_RELOC_XTENSA_ASM_SIMPLIFY,
+
+/* Xtensa TLS relocations.  */
+  BFD_RELOC_XTENSA_TLSDESC_FN,
+  BFD_RELOC_XTENSA_TLSDESC_ARG,
+  BFD_RELOC_XTENSA_TLS_DTPOFF,
+  BFD_RELOC_XTENSA_TLS_TPOFF,
+  BFD_RELOC_XTENSA_TLS_FUNC,
+  BFD_RELOC_XTENSA_TLS_ARG,
+  BFD_RELOC_XTENSA_TLS_CALL,
+
+/* 8 bit signed offset in (ix+d) or (iy+d).  */
+  BFD_RELOC_Z80_DISP8,
+
+/* DJNZ offset.  */
+  BFD_RELOC_Z8K_DISP7,
+
+/* CALR offset.  */
+  BFD_RELOC_Z8K_CALLR,
+
+/* 4 bit value.  */
+  BFD_RELOC_Z8K_IMM4L,
+
+/* Lattice Mico32 relocations.  */
+  BFD_RELOC_LM32_CALL,
+  BFD_RELOC_LM32_BRANCH,
+  BFD_RELOC_LM32_16_GOT,
+  BFD_RELOC_LM32_GOTOFF_HI16,
+  BFD_RELOC_LM32_GOTOFF_LO16,
+  BFD_RELOC_LM32_COPY,
+  BFD_RELOC_LM32_GLOB_DAT,
+  BFD_RELOC_LM32_JMP_SLOT,
+  BFD_RELOC_LM32_RELATIVE,
+
+/* Difference between two section addresses.  Must be followed by a
+BFD_RELOC_MACH_O_PAIR.  */
+  BFD_RELOC_MACH_O_SECTDIFF,
+
+/* Like BFD_RELOC_MACH_O_SECTDIFF but with a local symbol.  */
+  BFD_RELOC_MACH_O_LOCAL_SECTDIFF,
+
+/* Pair of relocation.  Contains the first symbol.  */
+  BFD_RELOC_MACH_O_PAIR,
+
+/* Symbol will be subtracted.  Must be followed by a BFD_RELOC_32.  */
+  BFD_RELOC_MACH_O_SUBTRACTOR32,
+
+/* Symbol will be subtracted.  Must be followed by a BFD_RELOC_64.  */
+  BFD_RELOC_MACH_O_SUBTRACTOR64,
+
+/* PCREL relocations.  They are marked as branch to create PLT entry if
+required.  */
+  BFD_RELOC_MACH_O_X86_64_BRANCH32,
+  BFD_RELOC_MACH_O_X86_64_BRANCH8,
+
+/* Used when referencing a GOT entry.  */
+  BFD_RELOC_MACH_O_X86_64_GOT,
+
+/* Used when loading a GOT entry with movq.  It is specially marked so that
+the linker could optimize the movq to a leaq if possible.  */
+  BFD_RELOC_MACH_O_X86_64_GOT_LOAD,
+
+/* Same as BFD_RELOC_32_PCREL but with an implicit -1 addend.  */
+  BFD_RELOC_MACH_O_X86_64_PCREL32_1,
+
+/* Same as BFD_RELOC_32_PCREL but with an implicit -2 addend.  */
+  BFD_RELOC_MACH_O_X86_64_PCREL32_2,
+
+/* Same as BFD_RELOC_32_PCREL but with an implicit -4 addend.  */
+  BFD_RELOC_MACH_O_X86_64_PCREL32_4,
+
+/* Addend for PAGE or PAGEOFF.  */
+  BFD_RELOC_MACH_O_ARM64_ADDEND,
+
+/* Relative offset to page of GOT slot.  */
+  BFD_RELOC_MACH_O_ARM64_GOT_LOAD_PAGE21,
+
+/* Relative offset within page of GOT slot.  */
+  BFD_RELOC_MACH_O_ARM64_GOT_LOAD_PAGEOFF12,
+
+/* Address of a GOT entry.  */
+  BFD_RELOC_MACH_O_ARM64_POINTER_TO_GOT,
+
+/* This is a 32 bit reloc for the microblaze that stores the
+low 16 bits of a value  */
+  BFD_RELOC_MICROBLAZE_32_LO,
+
+/* This is a 32 bit pc-relative reloc for the microblaze that
+stores the low 16 bits of a value  */
+  BFD_RELOC_MICROBLAZE_32_LO_PCREL,
+
+/* This is a 32 bit reloc for the microblaze that stores a
+value relative to the read-only small data area anchor  */
+  BFD_RELOC_MICROBLAZE_32_ROSDA,
+
+/* This is a 32 bit reloc for the microblaze that stores a
+value relative to the read-write small data area anchor  */
+  BFD_RELOC_MICROBLAZE_32_RWSDA,
+
+/* This is a 32 bit reloc for the microblaze to handle
+expressions of the form "Symbol Op Symbol"  */
+  BFD_RELOC_MICROBLAZE_32_SYM_OP_SYM,
+
+/* This is a 64 bit reloc that stores the 32 bit pc relative
+value in two words (with an imm instruction).  No relocation is
+done here - only used for relaxing  */
+  BFD_RELOC_MICROBLAZE_64_NONE,
+
+/* This is a 64 bit reloc that stores the 32 bit pc relative
+value in two words (with an imm instruction).  The relocation is
+PC-relative GOT offset  */
+  BFD_RELOC_MICROBLAZE_64_GOTPC,
+
+/* This is a 64 bit reloc that stores the 32 bit pc relative
+value in two words (with an imm instruction).  The relocation is
+GOT offset  */
+  BFD_RELOC_MICROBLAZE_64_GOT,
+
+/* This is a 64 bit reloc that stores the 32 bit pc relative
+value in two words (with an imm instruction).  The relocation is
+PC-relative offset into PLT  */
+  BFD_RELOC_MICROBLAZE_64_PLT,
+
+/* This is a 64 bit reloc that stores the 32 bit GOT relative
+value in two words (with an imm instruction).  The relocation is
+relative offset from _GLOBAL_OFFSET_TABLE_  */
+  BFD_RELOC_MICROBLAZE_64_GOTOFF,
+
+/* This is a 32 bit reloc that stores the 32 bit GOT relative
+value in a word.  The relocation is relative offset from
+_GLOBAL_OFFSET_TABLE_  */
+  BFD_RELOC_MICROBLAZE_32_GOTOFF,
+
+/* This is used to tell the dynamic linker to copy the value out of
+the dynamic object into the runtime process image.  */
+  BFD_RELOC_MICROBLAZE_COPY,
+
+/* Unused Reloc  */
+  BFD_RELOC_MICROBLAZE_64_TLS,
+
+/* This is a 64 bit reloc that stores the 32 bit GOT relative value
+of the GOT TLS GD info entry in two words (with an imm instruction). The
+relocation is GOT offset.  */
+  BFD_RELOC_MICROBLAZE_64_TLSGD,
+
+/* This is a 64 bit reloc that stores the 32 bit GOT relative value
+of the GOT TLS LD info entry in two words (with an imm instruction). The
+relocation is GOT offset.  */
+  BFD_RELOC_MICROBLAZE_64_TLSLD,
+
+/* This is a 32 bit reloc that stores the Module ID to GOT(n).  */
+  BFD_RELOC_MICROBLAZE_32_TLSDTPMOD,
+
+/* This is a 32 bit reloc that stores TLS offset to GOT(n+1).  */
+  BFD_RELOC_MICROBLAZE_32_TLSDTPREL,
+
+/* This is a 32 bit reloc for storing TLS offset to two words (uses imm
+instruction)  */
+  BFD_RELOC_MICROBLAZE_64_TLSDTPREL,
+
+/* This is a 64 bit reloc that stores 32-bit thread pointer relative offset
+to two words (uses imm instruction).  */
+  BFD_RELOC_MICROBLAZE_64_TLSGOTTPREL,
+
+/* This is a 64 bit reloc that stores 32-bit thread pointer relative offset
+to two words (uses imm instruction).  */
+  BFD_RELOC_MICROBLAZE_64_TLSTPREL,
+
+/* AArch64 pseudo relocation code to mark the start of the AArch64
+relocation enumerators.  N.B. the order of the enumerators is
+important as several tables in the AArch64 bfd backend are indexed
+by these enumerators; make sure they are all synced.  */
+  BFD_RELOC_AARCH64_RELOC_START,
+
+/* Deprecated AArch64 null relocation code.  */
+  BFD_RELOC_AARCH64_NULL,
+
+/* AArch64 null relocation code.  */
+  BFD_RELOC_AARCH64_NONE,
+
+/* Basic absolute relocations of N bits.  These are equivalent to
+BFD_RELOC_N and they were added to assist the indexing of the howto
+table.  */
+  BFD_RELOC_AARCH64_64,
+  BFD_RELOC_AARCH64_32,
+  BFD_RELOC_AARCH64_16,
+
+/* PC-relative relocations.  These are equivalent to BFD_RELOC_N_PCREL
+and they were added to assist the indexing of the howto table.  */
+  BFD_RELOC_AARCH64_64_PCREL,
+  BFD_RELOC_AARCH64_32_PCREL,
+  BFD_RELOC_AARCH64_16_PCREL,
+
+/* AArch64 MOV[NZK] instruction with most significant bits 0 to 15
+of an unsigned address/value.  */
+  BFD_RELOC_AARCH64_MOVW_G0,
+
+/* AArch64 MOV[NZK] instruction with less significant bits 0 to 15 of
+an address/value.  No overflow checking.  */
+  BFD_RELOC_AARCH64_MOVW_G0_NC,
+
+/* AArch64 MOV[NZK] instruction with most significant bits 16 to 31
+of an unsigned address/value.  */
+  BFD_RELOC_AARCH64_MOVW_G1,
+
+/* AArch64 MOV[NZK] instruction with less significant bits 16 to 31
+of an address/value.  No overflow checking.  */
+  BFD_RELOC_AARCH64_MOVW_G1_NC,
+
+/* AArch64 MOV[NZK] instruction with most significant bits 32 to 47
+of an unsigned address/value.  */
+  BFD_RELOC_AARCH64_MOVW_G2,
+
+/* AArch64 MOV[NZK] instruction with less significant bits 32 to 47
+of an address/value.  No overflow checking.  */
+  BFD_RELOC_AARCH64_MOVW_G2_NC,
+
+/* AArch64 MOV[NZK] instruction with most significant bits 48 to 63
+of a signed or unsigned address/value.  */
+  BFD_RELOC_AARCH64_MOVW_G3,
+
+/* AArch64 MOV[NZ] instruction with most significant bits 0 to 15
+of a signed value.  Changes instruction to MOVZ or MOVN depending on the
+value's sign.  */
+  BFD_RELOC_AARCH64_MOVW_G0_S,
+
+/* AArch64 MOV[NZ] instruction with most significant bits 16 to 31
+of a signed value.  Changes instruction to MOVZ or MOVN depending on the
+value's sign.  */
+  BFD_RELOC_AARCH64_MOVW_G1_S,
+
+/* AArch64 MOV[NZ] instruction with most significant bits 32 to 47
+of a signed value.  Changes instruction to MOVZ or MOVN depending on the
+value's sign.  */
+  BFD_RELOC_AARCH64_MOVW_G2_S,
+
+/* AArch64 Load Literal instruction, holding a 19 bit pc-relative word
+offset.  The lowest two bits must be zero and are not stored in the
+instruction, giving a 21 bit signed byte offset.  */
+  BFD_RELOC_AARCH64_LD_LO19_PCREL,
+
+/* AArch64 ADR instruction, holding a simple 21 bit pc-relative byte offset.  */
+  BFD_RELOC_AARCH64_ADR_LO21_PCREL,
+
+/* AArch64 ADRP instruction, with bits 12 to 32 of a pc-relative page
+offset, giving a 4KB aligned page base address.  */
+  BFD_RELOC_AARCH64_ADR_HI21_PCREL,
+
+/* AArch64 ADRP instruction, with bits 12 to 32 of a pc-relative page
+offset, giving a 4KB aligned page base address, but with no overflow
+checking.  */
+  BFD_RELOC_AARCH64_ADR_HI21_NC_PCREL,
+
+/* AArch64 ADD immediate instruction, holding bits 0 to 11 of the address.
+Used in conjunction with BFD_RELOC_AARCH64_ADR_HI21_PCREL.  */
+  BFD_RELOC_AARCH64_ADD_LO12,
+
+/* AArch64 8-bit load/store instruction, holding bits 0 to 11 of the
+address.  Used in conjunction with BFD_RELOC_AARCH64_ADR_HI21_PCREL.  */
+  BFD_RELOC_AARCH64_LDST8_LO12,
+
+/* AArch64 14 bit pc-relative test bit and branch.
+The lowest two bits must be zero and are not stored in the instruction,
+giving a 16 bit signed byte offset.  */
+  BFD_RELOC_AARCH64_TSTBR14,
+
+/* AArch64 19 bit pc-relative conditional branch and compare & branch.
+The lowest two bits must be zero and are not stored in the instruction,
+giving a 21 bit signed byte offset.  */
+  BFD_RELOC_AARCH64_BRANCH19,
+
+/* AArch64 26 bit pc-relative unconditional branch.
+The lowest two bits must be zero and are not stored in the instruction,
+giving a 28 bit signed byte offset.  */
+  BFD_RELOC_AARCH64_JUMP26,
+
+/* AArch64 26 bit pc-relative unconditional branch and link.
+The lowest two bits must be zero and are not stored in the instruction,
+giving a 28 bit signed byte offset.  */
+  BFD_RELOC_AARCH64_CALL26,
+
+/* AArch64 16-bit load/store instruction, holding bits 0 to 11 of the
+address.  Used in conjunction with BFD_RELOC_AARCH64_ADR_HI21_PCREL.  */
+  BFD_RELOC_AARCH64_LDST16_LO12,
+
+/* AArch64 32-bit load/store instruction, holding bits 0 to 11 of the
+address.  Used in conjunction with BFD_RELOC_AARCH64_ADR_HI21_PCREL.  */
+  BFD_RELOC_AARCH64_LDST32_LO12,
+
+/* AArch64 64-bit load/store instruction, holding bits 0 to 11 of the
+address.  Used in conjunction with BFD_RELOC_AARCH64_ADR_HI21_PCREL.  */
+  BFD_RELOC_AARCH64_LDST64_LO12,
+
+/* AArch64 128-bit load/store instruction, holding bits 0 to 11 of the
+address.  Used in conjunction with BFD_RELOC_AARCH64_ADR_HI21_PCREL.  */
+  BFD_RELOC_AARCH64_LDST128_LO12,
+
+/* AArch64 Load Literal instruction, holding a 19 bit PC relative word
+offset of the global offset table entry for a symbol.  The lowest two
+bits must be zero and are not stored in the instruction, giving a 21
+bit signed byte offset.  This relocation type requires signed overflow
+checking.  */
+  BFD_RELOC_AARCH64_GOT_LD_PREL19,
+
+/* Get to the page base of the global offset table entry for a symbol as
+part of an ADRP instruction using a 21 bit PC relative value.  Used in
+conjunction with BFD_RELOC_AARCH64_LD64_GOT_LO12_NC.  */
+  BFD_RELOC_AARCH64_ADR_GOT_PAGE,
+
+/* Unsigned 12 bit byte offset for 64 bit load/store from the page of
+the GOT entry for this symbol.  Used in conjunction with
+BFD_RELOC_AARCH64_ADR_GOT_PAGE.  Valid in LP64 ABI only.  */
+  BFD_RELOC_AARCH64_LD64_GOT_LO12_NC,
+
+/* Unsigned 12 bit byte offset for 32 bit load/store from the page of
+the GOT entry for this symbol.  Used in conjunction with
+BFD_RELOC_AARCH64_ADR_GOT_PAGE.  Valid in ILP32 ABI only.  */
+  BFD_RELOC_AARCH64_LD32_GOT_LO12_NC,
+
+/* Unsigned 16 bit byte offset for 64 bit load/store from the GOT entry
+for this symbol.  Valid in LP64 ABI only.  */
+  BFD_RELOC_AARCH64_MOVW_GOTOFF_G0_NC,
+
+/* Unsigned 16 bit byte higher offset for 64 bit load/store from the GOT entry
+for this symbol.  Valid in LP64 ABI only.  */
+  BFD_RELOC_AARCH64_MOVW_GOTOFF_G1,
+
+/* Unsigned 15 bit byte offset for 64 bit load/store from the page of
+the GOT entry for this symbol.  Valid in LP64 ABI only.  */
+  BFD_RELOC_AARCH64_LD64_GOTOFF_LO15,
+
+/* Scaled 14 bit byte offset to the page base of the global offset table.  */
+  BFD_RELOC_AARCH64_LD32_GOTPAGE_LO14,
+
+/* Scaled 15 bit byte offset to the page base of the global offset table.  */
+  BFD_RELOC_AARCH64_LD64_GOTPAGE_LO15,
+
+/* Get to the page base of the global offset table entry for a symbol's
+tls_index structure as part of an adrp instruction using a 21 bit PC
+relative value.  Used in conjunction with
+BFD_RELOC_AARCH64_TLSGD_ADD_LO12_NC.  */
+  BFD_RELOC_AARCH64_TLSGD_ADR_PAGE21,
+
+/* AArch64 TLS General Dynamic  */
+  BFD_RELOC_AARCH64_TLSGD_ADR_PREL21,
+
+/* Unsigned 12 bit byte offset to global offset table entry for a symbol's
+tls_index structure.  Used in conjunction with
+BFD_RELOC_AARCH64_TLSGD_ADR_PAGE21.  */
+  BFD_RELOC_AARCH64_TLSGD_ADD_LO12_NC,
+
+/* AArch64 TLS General Dynamic relocation.  */
+  BFD_RELOC_AARCH64_TLSGD_MOVW_G0_NC,
+
+/* AArch64 TLS General Dynamic relocation.  */
+  BFD_RELOC_AARCH64_TLSGD_MOVW_G1,
+
+/* AArch64 TLS INITIAL EXEC relocation.  */
+  BFD_RELOC_AARCH64_TLSIE_ADR_GOTTPREL_PAGE21,
+
+/* AArch64 TLS INITIAL EXEC relocation.  */
+  BFD_RELOC_AARCH64_TLSIE_LD64_GOTTPREL_LO12_NC,
+
+/* AArch64 TLS INITIAL EXEC relocation.  */
+  BFD_RELOC_AARCH64_TLSIE_LD32_GOTTPREL_LO12_NC,
+
+/* AArch64 TLS INITIAL EXEC relocation.  */
+  BFD_RELOC_AARCH64_TLSIE_LD_GOTTPREL_PREL19,
+
+/* AArch64 TLS INITIAL EXEC relocation.  */
+  BFD_RELOC_AARCH64_TLSIE_MOVW_GOTTPREL_G0_NC,
+
+/* AArch64 TLS INITIAL EXEC relocation.  */
+  BFD_RELOC_AARCH64_TLSIE_MOVW_GOTTPREL_G1,
+
+/* bit[23:12] of byte offset to module TLS base address.  */
+  BFD_RELOC_AARCH64_TLSLD_ADD_DTPREL_HI12,
+
+/* Unsigned 12 bit byte offset to module TLS base address.  */
+  BFD_RELOC_AARCH64_TLSLD_ADD_DTPREL_LO12,
+
+/* No overflow check version of BFD_RELOC_AARCH64_TLSLD_ADD_DTPREL_LO12.  */
+  BFD_RELOC_AARCH64_TLSLD_ADD_DTPREL_LO12_NC,
+
+/* Unsigned 12 bit byte offset to global offset table entry for a symbol's
+tls_index structure.  Used in conjunction with
+BFD_RELOC_AARCH64_TLSLD_ADR_PAGE21.  */
+  BFD_RELOC_AARCH64_TLSLD_ADD_LO12_NC,
+
+/* GOT entry page address for AArch64 TLS Local Dynamic, used with ADRP
+instruction.  */
+  BFD_RELOC_AARCH64_TLSLD_ADR_PAGE21,
+
+/* GOT entry address for AArch64 TLS Local Dynamic, used with ADR instruction.  */
+  BFD_RELOC_AARCH64_TLSLD_ADR_PREL21,
+
+/* bit[11:1] of byte offset to module TLS base address, encoded in ldst
+instructions.  */
+  BFD_RELOC_AARCH64_TLSLD_LDST16_DTPREL_LO12,
+
+/* Similar as BFD_RELOC_AARCH64_TLSLD_LDST16_DTPREL_LO12, but no overflow check.  */
+  BFD_RELOC_AARCH64_TLSLD_LDST16_DTPREL_LO12_NC,
+
+/* bit[11:2] of byte offset to module TLS base address, encoded in ldst
+instructions.  */
+  BFD_RELOC_AARCH64_TLSLD_LDST32_DTPREL_LO12,
+
+/* Similar as BFD_RELOC_AARCH64_TLSLD_LDST32_DTPREL_LO12, but no overflow check.  */
+  BFD_RELOC_AARCH64_TLSLD_LDST32_DTPREL_LO12_NC,
+
+/* bit[11:3] of byte offset to module TLS base address, encoded in ldst
+instructions.  */
+  BFD_RELOC_AARCH64_TLSLD_LDST64_DTPREL_LO12,
+
+/* Similar as BFD_RELOC_AARCH64_TLSLD_LDST64_DTPREL_LO12, but no overflow check.  */
+  BFD_RELOC_AARCH64_TLSLD_LDST64_DTPREL_LO12_NC,
+
+/* bit[11:0] of byte offset to module TLS base address, encoded in ldst
+instructions.  */
+  BFD_RELOC_AARCH64_TLSLD_LDST8_DTPREL_LO12,
+
+/* Similar as BFD_RELOC_AARCH64_TLSLD_LDST8_DTPREL_LO12, but no overflow check.  */
+  BFD_RELOC_AARCH64_TLSLD_LDST8_DTPREL_LO12_NC,
+
+/* bit[15:0] of byte offset to module TLS base address.  */
+  BFD_RELOC_AARCH64_TLSLD_MOVW_DTPREL_G0,
+
+/* No overflow check version of BFD_RELOC_AARCH64_TLSLD_MOVW_DTPREL_G0  */
+  BFD_RELOC_AARCH64_TLSLD_MOVW_DTPREL_G0_NC,
+
+/* bit[31:16] of byte offset to module TLS base address.  */
+  BFD_RELOC_AARCH64_TLSLD_MOVW_DTPREL_G1,
+
+/* No overflow check version of BFD_RELOC_AARCH64_TLSLD_MOVW_DTPREL_G1  */
+  BFD_RELOC_AARCH64_TLSLD_MOVW_DTPREL_G1_NC,
+
+/* bit[47:32] of byte offset to module TLS base address.  */
+  BFD_RELOC_AARCH64_TLSLD_MOVW_DTPREL_G2,
+
+/* AArch64 TLS LOCAL EXEC relocation.  */
+  BFD_RELOC_AARCH64_TLSLE_MOVW_TPREL_G2,
+
+/* AArch64 TLS LOCAL EXEC relocation.  */
+  BFD_RELOC_AARCH64_TLSLE_MOVW_TPREL_G1,
+
+/* AArch64 TLS LOCAL EXEC relocation.  */
+  BFD_RELOC_AARCH64_TLSLE_MOVW_TPREL_G1_NC,
+
+/* AArch64 TLS LOCAL EXEC relocation.  */
+  BFD_RELOC_AARCH64_TLSLE_MOVW_TPREL_G0,
+
+/* AArch64 TLS LOCAL EXEC relocation.  */
+  BFD_RELOC_AARCH64_TLSLE_MOVW_TPREL_G0_NC,
+
+/* AArch64 TLS LOCAL EXEC relocation.  */
+  BFD_RELOC_AARCH64_TLSLE_ADD_TPREL_HI12,
+
+/* AArch64 TLS LOCAL EXEC relocation.  */
+  BFD_RELOC_AARCH64_TLSLE_ADD_TPREL_LO12,
+
+/* AArch64 TLS LOCAL EXEC relocation.  */
+  BFD_RELOC_AARCH64_TLSLE_ADD_TPREL_LO12_NC,
+
+/* AArch64 TLS DESC relocation.  */
+  BFD_RELOC_AARCH64_TLSDESC_LD_PREL19,
+
+/* AArch64 TLS DESC relocation.  */
+  BFD_RELOC_AARCH64_TLSDESC_ADR_PREL21,
+
+/* AArch64 TLS DESC relocation.  */
+  BFD_RELOC_AARCH64_TLSDESC_ADR_PAGE21,
+
+/* AArch64 TLS DESC relocation.  */
+  BFD_RELOC_AARCH64_TLSDESC_LD64_LO12_NC,
+
+/* AArch64 TLS DESC relocation.  */
+  BFD_RELOC_AARCH64_TLSDESC_LD32_LO12_NC,
+
+/* AArch64 TLS DESC relocation.  */
+  BFD_RELOC_AARCH64_TLSDESC_ADD_LO12_NC,
+
+/* AArch64 TLS DESC relocation.  */
+  BFD_RELOC_AARCH64_TLSDESC_OFF_G1,
+
+/* AArch64 TLS DESC relocation.  */
+  BFD_RELOC_AARCH64_TLSDESC_OFF_G0_NC,
+
+/* AArch64 TLS DESC relocation.  */
+  BFD_RELOC_AARCH64_TLSDESC_LDR,
+
+/* AArch64 TLS DESC relocation.  */
+  BFD_RELOC_AARCH64_TLSDESC_ADD,
+
+/* AArch64 TLS DESC relocation.  */
+  BFD_RELOC_AARCH64_TLSDESC_CALL,
+
+/* AArch64 TLS relocation.  */
+  BFD_RELOC_AARCH64_COPY,
+
+/* AArch64 TLS relocation.  */
+  BFD_RELOC_AARCH64_GLOB_DAT,
+
+/* AArch64 TLS relocation.  */
+  BFD_RELOC_AARCH64_JUMP_SLOT,
+
+/* AArch64 TLS relocation.  */
+  BFD_RELOC_AARCH64_RELATIVE,
+
+/* AArch64 TLS relocation.  */
+  BFD_RELOC_AARCH64_TLS_DTPMOD,
+
+/* AArch64 TLS relocation.  */
+  BFD_RELOC_AARCH64_TLS_DTPREL,
+
+/* AArch64 TLS relocation.  */
+  BFD_RELOC_AARCH64_TLS_TPREL,
+
+/* AArch64 TLS relocation.  */
+  BFD_RELOC_AARCH64_TLSDESC,
+
+/* AArch64 support for STT_GNU_IFUNC.  */
+  BFD_RELOC_AARCH64_IRELATIVE,
+
+/* AArch64 pseudo relocation code to mark the end of the AArch64
+relocation enumerators that have direct mapping to ELF reloc codes.
+There are a few more enumerators after this one; those are mainly
+used by the AArch64 assembler for the internal fixup or to select
+one of the above enumerators.  */
+  BFD_RELOC_AARCH64_RELOC_END,
+
+/* AArch64 pseudo relocation code to be used internally by the AArch64
+assembler and not (currently) written to any object files.  */
+  BFD_RELOC_AARCH64_GAS_INTERNAL_FIXUP,
+
+/* AArch64 unspecified load/store instruction, holding bits 0 to 11 of the
+address.  Used in conjunction with BFD_RELOC_AARCH64_ADR_HI21_PCREL.  */
+  BFD_RELOC_AARCH64_LDST_LO12,
+
+/* AArch64 pseudo relocation code for TLS local dynamic mode.  It's to be
+used internally by the AArch64 assembler and not (currently) written to
+any object files.  */
+  BFD_RELOC_AARCH64_TLSLD_LDST_DTPREL_LO12,
+
+/* Similar as BFD_RELOC_AARCH64_TLSLD_LDST_DTPREL_LO12, but no overflow check.  */
+  BFD_RELOC_AARCH64_TLSLD_LDST_DTPREL_LO12_NC,
+
+/* AArch64 pseudo relocation code to be used internally by the AArch64
+assembler and not (currently) written to any object files.  */
+  BFD_RELOC_AARCH64_LD_GOT_LO12_NC,
+
+/* AArch64 pseudo relocation code to be used internally by the AArch64
+assembler and not (currently) written to any object files.  */
+  BFD_RELOC_AARCH64_TLSIE_LD_GOTTPREL_LO12_NC,
+
+/* AArch64 pseudo relocation code to be used internally by the AArch64
+assembler and not (currently) written to any object files.  */
+  BFD_RELOC_AARCH64_TLSDESC_LD_LO12_NC,
+
+/* Tilera TILEPro Relocations.  */
+  BFD_RELOC_TILEPRO_COPY,
+  BFD_RELOC_TILEPRO_GLOB_DAT,
+  BFD_RELOC_TILEPRO_JMP_SLOT,
+  BFD_RELOC_TILEPRO_RELATIVE,
+  BFD_RELOC_TILEPRO_BROFF_X1,
+  BFD_RELOC_TILEPRO_JOFFLONG_X1,
+  BFD_RELOC_TILEPRO_JOFFLONG_X1_PLT,
+  BFD_RELOC_TILEPRO_IMM8_X0,
+  BFD_RELOC_TILEPRO_IMM8_Y0,
+  BFD_RELOC_TILEPRO_IMM8_X1,
+  BFD_RELOC_TILEPRO_IMM8_Y1,
+  BFD_RELOC_TILEPRO_DEST_IMM8_X1,
+  BFD_RELOC_TILEPRO_MT_IMM15_X1,
+  BFD_RELOC_TILEPRO_MF_IMM15_X1,
+  BFD_RELOC_TILEPRO_IMM16_X0,
+  BFD_RELOC_TILEPRO_IMM16_X1,
+  BFD_RELOC_TILEPRO_IMM16_X0_LO,
+  BFD_RELOC_TILEPRO_IMM16_X1_LO,
+  BFD_RELOC_TILEPRO_IMM16_X0_HI,
+  BFD_RELOC_TILEPRO_IMM16_X1_HI,
+  BFD_RELOC_TILEPRO_IMM16_X0_HA,
+  BFD_RELOC_TILEPRO_IMM16_X1_HA,
+  BFD_RELOC_TILEPRO_IMM16_X0_PCREL,
+  BFD_RELOC_TILEPRO_IMM16_X1_PCREL,
+  BFD_RELOC_TILEPRO_IMM16_X0_LO_PCREL,
+  BFD_RELOC_TILEPRO_IMM16_X1_LO_PCREL,
+  BFD_RELOC_TILEPRO_IMM16_X0_HI_PCREL,
+  BFD_RELOC_TILEPRO_IMM16_X1_HI_PCREL,
+  BFD_RELOC_TILEPRO_IMM16_X0_HA_PCREL,
+  BFD_RELOC_TILEPRO_IMM16_X1_HA_PCREL,
+  BFD_RELOC_TILEPRO_IMM16_X0_GOT,
+  BFD_RELOC_TILEPRO_IMM16_X1_GOT,
+  BFD_RELOC_TILEPRO_IMM16_X0_GOT_LO,
+  BFD_RELOC_TILEPRO_IMM16_X1_GOT_LO,
+  BFD_RELOC_TILEPRO_IMM16_X0_GOT_HI,
+  BFD_RELOC_TILEPRO_IMM16_X1_GOT_HI,
+  BFD_RELOC_TILEPRO_IMM16_X0_GOT_HA,
+  BFD_RELOC_TILEPRO_IMM16_X1_GOT_HA,
+  BFD_RELOC_TILEPRO_MMSTART_X0,
+  BFD_RELOC_TILEPRO_MMEND_X0,
+  BFD_RELOC_TILEPRO_MMSTART_X1,
+  BFD_RELOC_TILEPRO_MMEND_X1,
+  BFD_RELOC_TILEPRO_SHAMT_X0,
+  BFD_RELOC_TILEPRO_SHAMT_X1,
+  BFD_RELOC_TILEPRO_SHAMT_Y0,
+  BFD_RELOC_TILEPRO_SHAMT_Y1,
+  BFD_RELOC_TILEPRO_TLS_GD_CALL,
+  BFD_RELOC_TILEPRO_IMM8_X0_TLS_GD_ADD,
+  BFD_RELOC_TILEPRO_IMM8_X1_TLS_GD_ADD,
+  BFD_RELOC_TILEPRO_IMM8_Y0_TLS_GD_ADD,
+  BFD_RELOC_TILEPRO_IMM8_Y1_TLS_GD_ADD,
+  BFD_RELOC_TILEPRO_TLS_IE_LOAD,
+  BFD_RELOC_TILEPRO_IMM16_X0_TLS_GD,
+  BFD_RELOC_TILEPRO_IMM16_X1_TLS_GD,
+  BFD_RELOC_TILEPRO_IMM16_X0_TLS_GD_LO,
+  BFD_RELOC_TILEPRO_IMM16_X1_TLS_GD_LO,
+  BFD_RELOC_TILEPRO_IMM16_X0_TLS_GD_HI,
+  BFD_RELOC_TILEPRO_IMM16_X1_TLS_GD_HI,
+  BFD_RELOC_TILEPRO_IMM16_X0_TLS_GD_HA,
+  BFD_RELOC_TILEPRO_IMM16_X1_TLS_GD_HA,
+  BFD_RELOC_TILEPRO_IMM16_X0_TLS_IE,
+  BFD_RELOC_TILEPRO_IMM16_X1_TLS_IE,
+  BFD_RELOC_TILEPRO_IMM16_X0_TLS_IE_LO,
+  BFD_RELOC_TILEPRO_IMM16_X1_TLS_IE_LO,
+  BFD_RELOC_TILEPRO_IMM16_X0_TLS_IE_HI,
+  BFD_RELOC_TILEPRO_IMM16_X1_TLS_IE_HI,
+  BFD_RELOC_TILEPRO_IMM16_X0_TLS_IE_HA,
+  BFD_RELOC_TILEPRO_IMM16_X1_TLS_IE_HA,
+  BFD_RELOC_TILEPRO_TLS_DTPMOD32,
+  BFD_RELOC_TILEPRO_TLS_DTPOFF32,
+  BFD_RELOC_TILEPRO_TLS_TPOFF32,
+  BFD_RELOC_TILEPRO_IMM16_X0_TLS_LE,
+  BFD_RELOC_TILEPRO_IMM16_X1_TLS_LE,
+  BFD_RELOC_TILEPRO_IMM16_X0_TLS_LE_LO,
+  BFD_RELOC_TILEPRO_IMM16_X1_TLS_LE_LO,
+  BFD_RELOC_TILEPRO_IMM16_X0_TLS_LE_HI,
+  BFD_RELOC_TILEPRO_IMM16_X1_TLS_LE_HI,
+  BFD_RELOC_TILEPRO_IMM16_X0_TLS_LE_HA,
+  BFD_RELOC_TILEPRO_IMM16_X1_TLS_LE_HA,
+
+/* Tilera TILE-Gx Relocations.  */
+  BFD_RELOC_TILEGX_HW0,
+  BFD_RELOC_TILEGX_HW1,
+  BFD_RELOC_TILEGX_HW2,
+  BFD_RELOC_TILEGX_HW3,
+  BFD_RELOC_TILEGX_HW0_LAST,
+  BFD_RELOC_TILEGX_HW1_LAST,
+  BFD_RELOC_TILEGX_HW2_LAST,
+  BFD_RELOC_TILEGX_COPY,
+  BFD_RELOC_TILEGX_GLOB_DAT,
+  BFD_RELOC_TILEGX_JMP_SLOT,
+  BFD_RELOC_TILEGX_RELATIVE,
+  BFD_RELOC_TILEGX_BROFF_X1,
+  BFD_RELOC_TILEGX_JUMPOFF_X1,
+  BFD_RELOC_TILEGX_JUMPOFF_X1_PLT,
+  BFD_RELOC_TILEGX_IMM8_X0,
+  BFD_RELOC_TILEGX_IMM8_Y0,
+  BFD_RELOC_TILEGX_IMM8_X1,
+  BFD_RELOC_TILEGX_IMM8_Y1,
+  BFD_RELOC_TILEGX_DEST_IMM8_X1,
+  BFD_RELOC_TILEGX_MT_IMM14_X1,
+  BFD_RELOC_TILEGX_MF_IMM14_X1,
+  BFD_RELOC_TILEGX_MMSTART_X0,
+  BFD_RELOC_TILEGX_MMEND_X0,
+  BFD_RELOC_TILEGX_SHAMT_X0,
+  BFD_RELOC_TILEGX_SHAMT_X1,
+  BFD_RELOC_TILEGX_SHAMT_Y0,
+  BFD_RELOC_TILEGX_SHAMT_Y1,
+  BFD_RELOC_TILEGX_IMM16_X0_HW0,
+  BFD_RELOC_TILEGX_IMM16_X1_HW0,
+  BFD_RELOC_TILEGX_IMM16_X0_HW1,
+  BFD_RELOC_TILEGX_IMM16_X1_HW1,
+  BFD_RELOC_TILEGX_IMM16_X0_HW2,
+  BFD_RELOC_TILEGX_IMM16_X1_HW2,
+  BFD_RELOC_TILEGX_IMM16_X0_HW3,
+  BFD_RELOC_TILEGX_IMM16_X1_HW3,
+  BFD_RELOC_TILEGX_IMM16_X0_HW0_LAST,
+  BFD_RELOC_TILEGX_IMM16_X1_HW0_LAST,
+  BFD_RELOC_TILEGX_IMM16_X0_HW1_LAST,
+  BFD_RELOC_TILEGX_IMM16_X1_HW1_LAST,
+  BFD_RELOC_TILEGX_IMM16_X0_HW2_LAST,
+  BFD_RELOC_TILEGX_IMM16_X1_HW2_LAST,
+  BFD_RELOC_TILEGX_IMM16_X0_HW0_PCREL,
+  BFD_RELOC_TILEGX_IMM16_X1_HW0_PCREL,
+  BFD_RELOC_TILEGX_IMM16_X0_HW1_PCREL,
+  BFD_RELOC_TILEGX_IMM16_X1_HW1_PCREL,
+  BFD_RELOC_TILEGX_IMM16_X0_HW2_PCREL,
+  BFD_RELOC_TILEGX_IMM16_X1_HW2_PCREL,
+  BFD_RELOC_TILEGX_IMM16_X0_HW3_PCREL,
+  BFD_RELOC_TILEGX_IMM16_X1_HW3_PCREL,
+  BFD_RELOC_TILEGX_IMM16_X0_HW0_LAST_PCREL,
+  BFD_RELOC_TILEGX_IMM16_X1_HW0_LAST_PCREL,
+  BFD_RELOC_TILEGX_IMM16_X0_HW1_LAST_PCREL,
+  BFD_RELOC_TILEGX_IMM16_X1_HW1_LAST_PCREL,
+  BFD_RELOC_TILEGX_IMM16_X0_HW2_LAST_PCREL,
+  BFD_RELOC_TILEGX_IMM16_X1_HW2_LAST_PCREL,
+  BFD_RELOC_TILEGX_IMM16_X0_HW0_GOT,
+  BFD_RELOC_TILEGX_IMM16_X1_HW0_GOT,
+  BFD_RELOC_TILEGX_IMM16_X0_HW0_PLT_PCREL,
+  BFD_RELOC_TILEGX_IMM16_X1_HW0_PLT_PCREL,
+  BFD_RELOC_TILEGX_IMM16_X0_HW1_PLT_PCREL,
+  BFD_RELOC_TILEGX_IMM16_X1_HW1_PLT_PCREL,
+  BFD_RELOC_TILEGX_IMM16_X0_HW2_PLT_PCREL,
+  BFD_RELOC_TILEGX_IMM16_X1_HW2_PLT_PCREL,
+  BFD_RELOC_TILEGX_IMM16_X0_HW0_LAST_GOT,
+  BFD_RELOC_TILEGX_IMM16_X1_HW0_LAST_GOT,
+  BFD_RELOC_TILEGX_IMM16_X0_HW1_LAST_GOT,
+  BFD_RELOC_TILEGX_IMM16_X1_HW1_LAST_GOT,
+  BFD_RELOC_TILEGX_IMM16_X0_HW3_PLT_PCREL,
+  BFD_RELOC_TILEGX_IMM16_X1_HW3_PLT_PCREL,
+  BFD_RELOC_TILEGX_IMM16_X0_HW0_TLS_GD,
+  BFD_RELOC_TILEGX_IMM16_X1_HW0_TLS_GD,
+  BFD_RELOC_TILEGX_IMM16_X0_HW0_TLS_LE,
+  BFD_RELOC_TILEGX_IMM16_X1_HW0_TLS_LE,
+  BFD_RELOC_TILEGX_IMM16_X0_HW0_LAST_TLS_LE,
+  BFD_RELOC_TILEGX_IMM16_X1_HW0_LAST_TLS_LE,
+  BFD_RELOC_TILEGX_IMM16_X0_HW1_LAST_TLS_LE,
+  BFD_RELOC_TILEGX_IMM16_X1_HW1_LAST_TLS_LE,
+  BFD_RELOC_TILEGX_IMM16_X0_HW0_LAST_TLS_GD,
+  BFD_RELOC_TILEGX_IMM16_X1_HW0_LAST_TLS_GD,
+  BFD_RELOC_TILEGX_IMM16_X0_HW1_LAST_TLS_GD,
+  BFD_RELOC_TILEGX_IMM16_X1_HW1_LAST_TLS_GD,
+  BFD_RELOC_TILEGX_IMM16_X0_HW0_TLS_IE,
+  BFD_RELOC_TILEGX_IMM16_X1_HW0_TLS_IE,
+  BFD_RELOC_TILEGX_IMM16_X0_HW0_LAST_PLT_PCREL,
+  BFD_RELOC_TILEGX_IMM16_X1_HW0_LAST_PLT_PCREL,
+  BFD_RELOC_TILEGX_IMM16_X0_HW1_LAST_PLT_PCREL,
+  BFD_RELOC_TILEGX_IMM16_X1_HW1_LAST_PLT_PCREL,
+  BFD_RELOC_TILEGX_IMM16_X0_HW2_LAST_PLT_PCREL,
+  BFD_RELOC_TILEGX_IMM16_X1_HW2_LAST_PLT_PCREL,
+  BFD_RELOC_TILEGX_IMM16_X0_HW0_LAST_TLS_IE,
+  BFD_RELOC_TILEGX_IMM16_X1_HW0_LAST_TLS_IE,
+  BFD_RELOC_TILEGX_IMM16_X0_HW1_LAST_TLS_IE,
+  BFD_RELOC_TILEGX_IMM16_X1_HW1_LAST_TLS_IE,
+  BFD_RELOC_TILEGX_TLS_DTPMOD64,
+  BFD_RELOC_TILEGX_TLS_DTPOFF64,
+  BFD_RELOC_TILEGX_TLS_TPOFF64,
+  BFD_RELOC_TILEGX_TLS_DTPMOD32,
+  BFD_RELOC_TILEGX_TLS_DTPOFF32,
+  BFD_RELOC_TILEGX_TLS_TPOFF32,
+  BFD_RELOC_TILEGX_TLS_GD_CALL,
+  BFD_RELOC_TILEGX_IMM8_X0_TLS_GD_ADD,
+  BFD_RELOC_TILEGX_IMM8_X1_TLS_GD_ADD,
+  BFD_RELOC_TILEGX_IMM8_Y0_TLS_GD_ADD,
+  BFD_RELOC_TILEGX_IMM8_Y1_TLS_GD_ADD,
+  BFD_RELOC_TILEGX_TLS_IE_LOAD,
+  BFD_RELOC_TILEGX_IMM8_X0_TLS_ADD,
+  BFD_RELOC_TILEGX_IMM8_X1_TLS_ADD,
+  BFD_RELOC_TILEGX_IMM8_Y0_TLS_ADD,
+  BFD_RELOC_TILEGX_IMM8_Y1_TLS_ADD,
+
+/* Adapteva EPIPHANY - 8 bit signed pc-relative displacement  */
+  BFD_RELOC_EPIPHANY_SIMM8,
+
+/* Adapteva EPIPHANY - 24 bit signed pc-relative displacement  */
+  BFD_RELOC_EPIPHANY_SIMM24,
+
+/* Adapteva EPIPHANY - 16 most-significant bits of absolute address  */
+  BFD_RELOC_EPIPHANY_HIGH,
+
+/* Adapteva EPIPHANY - 16 least-significant bits of absolute address  */
+  BFD_RELOC_EPIPHANY_LOW,
+
+/* Adapteva EPIPHANY - 11 bit signed number - add/sub immediate  */
+  BFD_RELOC_EPIPHANY_SIMM11,
+
+/* Adapteva EPIPHANY - 11 bit sign-magnitude number (ld/st displacement)  */
+  BFD_RELOC_EPIPHANY_IMM11,
+
+/* Adapteva EPIPHANY - 8 bit immediate for 16 bit mov instruction.  */
+  BFD_RELOC_EPIPHANY_IMM8,
+
+/* Visium Relocations.  */
+  BFD_RELOC_VISIUM_HI16,
+  BFD_RELOC_VISIUM_LO16,
+  BFD_RELOC_VISIUM_IM16,
+  BFD_RELOC_VISIUM_REL16,
+  BFD_RELOC_VISIUM_HI16_PCREL,
+  BFD_RELOC_VISIUM_LO16_PCREL,
+  BFD_RELOC_VISIUM_IM16_PCREL,
+  BFD_RELOC_UNUSED };
+
+/* Shorthand type name for enum bfd_reloc_code_real; it is the relocation
+   code type taken by the declarations below.  */
+typedef enum bfd_reloc_code_real bfd_reloc_code_real_type;
+/* Takes a BFD and a relocation code and yields a reloc_howto_type pointer.
+   Only the prototype is visible here; the failure return value is not
+   shown (presumably NULL -- confirm against the definition).  */
+reloc_howto_type *bfd_reloc_type_lookup
+   (bfd *abfd, bfd_reloc_code_real_type code);
+/* Same result type as bfd_reloc_type_lookup, but keyed by a relocation
+   name string instead of a relocation code.  */
+reloc_howto_type *bfd_reloc_name_lookup
+   (bfd *abfd, const char *reloc_name);
+
+/* Takes a relocation code and yields a constant string; per its name this
+   is the textual name of CODE (confirm against the definition).  */
+const char *bfd_get_reloc_code_name (bfd_reloc_code_real_type code);
+
+/* Extracted from syms.c.  */
+
+/* BFD symbol record, also available under the typedef name asymbol.
+   Members, in order: owning BFD, name, value, flags, section and a
+   back-end scratch union.  The BSF_* macros defined inside the struct
+   body are the bit values stored in the flags member; being preprocessor
+   definitions they are visible file-wide, not scoped to the struct.  */
+typedef struct bfd_symbol
+{
+  /* A pointer to the BFD which owns the symbol. This information
+     is necessary so that a back end can work out what additional
+     information (invisible to the application writer) is carried
+     with the symbol.
+
+     This field is *almost* redundant, since you can use section->owner
+     instead, except that some symbols point to the global sections
+     bfd_{abs,com,und}_section.  This could be fixed by making
+     these globals be per-bfd (or per-target-flavor).  FIXME.  */
+  struct bfd *the_bfd; /* Use bfd_asymbol_bfd(sym) to access this field.  */
+
+  /* The text of the symbol. The name is left alone, and not copied; the
+     application may not alter it.  */
+  const char *name;
+
+  /* The value of the symbol.  This really should be a union of a
+     numeric value with a pointer, since some flags indicate that
+     a pointer to another symbol is stored here.  */
+  symvalue value;
+
+  /* Attributes of a symbol.  */
+#define BSF_NO_FLAGS           0x00
+
+  /* The symbol has local scope; <<static>> in <<C>>. The value
+     is the offset into the section of the data.  */
+#define BSF_LOCAL              (1 << 0)
+
+  /* The symbol has global scope; initialized data in <<C>>. The
+     value is the offset into the section of the data.  */
+#define BSF_GLOBAL             (1 << 1)
+
+  /* The symbol has global scope and is exported. The value is
+     the offset into the section of the data.  */
+#define BSF_EXPORT     BSF_GLOBAL /* No real difference.  */
+
+  /* A normal C symbol would be one of:
+     <<BSF_LOCAL>>, <<BSF_UNDEFINED>> or <<BSF_GLOBAL>>.  */
+
+  /* The symbol is a debugging record. The value has an arbitrary
+     meaning, unless BSF_DEBUGGING_RELOC is also set.  */
+#define BSF_DEBUGGING          (1 << 2)
+
+  /* The symbol denotes a function entry point.  Used in ELF,
+     perhaps others someday.  */
+#define BSF_FUNCTION           (1 << 3)
+
+  /* Note: no flag in this list is assigned bit 4 (1 << 4); the
+     numbering continues at bit 5.  */
+
+  /* Used by the linker.  */
+#define BSF_KEEP               (1 << 5)
+
+  /* An ELF common symbol.  */
+#define BSF_ELF_COMMON         (1 << 6)
+
+  /* A weak global symbol, overridable without warnings by
+     a regular global symbol of the same name.  */
+#define BSF_WEAK               (1 << 7)
+
+  /* This symbol was created to point to a section, e.g. ELF's
+     STT_SECTION symbols.  */
+#define BSF_SECTION_SYM        (1 << 8)
+
+  /* The symbol used to be a common symbol, but now it is
+     allocated.  */
+#define BSF_OLD_COMMON         (1 << 9)
+
+  /* In some files the type of a symbol sometimes alters its
+     location in an output file - ie in coff a <<ISFCN>> symbol
+     which is also <<C_EXT>> symbol appears where it was
+     declared and not at the end of a section.  This bit is set
+     by the target BFD part to convey this information.  */
+#define BSF_NOT_AT_END         (1 << 10)
+
+  /* Signal that the symbol is the label of constructor section.  */
+#define BSF_CONSTRUCTOR        (1 << 11)
+
+  /* Signal that the symbol is a warning symbol.  The name is a
+     warning.  The name of the next symbol is the one to warn about;
+     if a reference is made to a symbol with the same name as the next
+     symbol, a warning is issued by the linker.  */
+#define BSF_WARNING            (1 << 12)
+
+  /* Signal that the symbol is indirect.  This symbol is an indirect
+     pointer to the symbol with the same name as the next symbol.  */
+#define BSF_INDIRECT           (1 << 13)
+
+  /* BSF_FILE marks symbols that contain a file name.  This is used
+     for ELF STT_FILE symbols.  */
+#define BSF_FILE               (1 << 14)
+
+  /* Symbol is from dynamic linking information.  */
+#define BSF_DYNAMIC            (1 << 15)
+
+  /* The symbol denotes a data object.  Used in ELF, and perhaps
+     others someday.  */
+#define BSF_OBJECT             (1 << 16)
+
+  /* This symbol is a debugging symbol.  The value is the offset
+     into the section of the data.  BSF_DEBUGGING should be set
+     as well.  */
+#define BSF_DEBUGGING_RELOC    (1 << 17)
+
+  /* This symbol is thread local.  Used in ELF.  */
+#define BSF_THREAD_LOCAL       (1 << 18)
+
+  /* This symbol represents a complex relocation expression,
+     with the expression tree serialized in the symbol name.  */
+#define BSF_RELC               (1 << 19)
+
+  /* This symbol represents a signed complex relocation expression,
+     with the expression tree serialized in the symbol name.  */
+#define BSF_SRELC              (1 << 20)
+
+  /* This symbol was created by bfd_get_synthetic_symtab.  */
+#define BSF_SYNTHETIC          (1 << 21)
+
+  /* This symbol is an indirect code object.  Unrelated to BSF_INDIRECT.
+     The dynamic linker will compute the value of this symbol by
+     calling the function that it points to.  BSF_FUNCTION must
+     also be set.  */
+#define BSF_GNU_INDIRECT_FUNCTION (1 << 22)
+  /* This symbol is a globally unique data object.  The dynamic linker
+     will make sure that in the entire process there is just one symbol
+     with this name and type in use.  BSF_OBJECT must also be set.  */
+#define BSF_GNU_UNIQUE         (1 << 23)
+
+  /* Bitwise OR of the BSF_* values defined above.  */
+  flagword flags;
+
+  /* A pointer to the section to which this symbol is
+     relative.  This will always be non NULL, there are special
+     sections for undefined and absolute symbols.  */
+  struct bfd_section *section;
+
+  /* Back end special data.  */
+  union
+    {
+      void *p;
+      bfd_vma i;
+    }
+  udata;
+}
+asymbol;
+
+/* Symbol-table entry points.  The function-like macros in this group all
+   expand to a BFD_SEND invocation naming a _bfd_* entry and forwarding
+   the macro arguments; BFD_SEND itself is defined elsewhere in this
+   header.  */
+
+/* Expands to BFD_SEND on ABFD's _bfd_get_symtab_upper_bound entry.  */
+#define bfd_get_symtab_upper_bound(abfd) \
+     BFD_SEND (abfd, _bfd_get_symtab_upper_bound, (abfd))
+
+/* Predicate on a symbol SYM of ABFD; only the prototype is visible here.  */
+bfd_boolean bfd_is_local_label (bfd *abfd, asymbol *sym);
+
+/* Predicate on a symbol name NAME for ABFD.  Note that the macro of the
+   same name defined immediately below takes over any function-like use
+   of this identifier after this point.  */
+bfd_boolean bfd_is_local_label_name (bfd *abfd, const char *name);
+
+/* Expands to BFD_SEND on ABFD's _bfd_is_local_label_name entry.  */
+#define bfd_is_local_label_name(abfd, name) \
+  BFD_SEND (abfd, _bfd_is_local_label_name, (abfd, name))
+
+/* Predicate on a symbol SYM of ABFD; shadowed for function-like use by
+   the macro of the same name defined immediately below.  */
+bfd_boolean bfd_is_target_special_symbol (bfd *abfd, asymbol *sym);
+
+/* Expands to BFD_SEND on ABFD's _bfd_is_target_special_symbol entry.  */
+#define bfd_is_target_special_symbol(abfd, sym) \
+  BFD_SEND (abfd, _bfd_is_target_special_symbol, (abfd, sym))
+
+/* Expands to BFD_SEND on ABFD's _bfd_canonicalize_symtab entry, passing
+   LOCATION through unchanged.  */
+#define bfd_canonicalize_symtab(abfd, location) \
+  BFD_SEND (abfd, _bfd_canonicalize_symtab, (abfd, location))
+
+/* Takes an array of COUNT symbol pointers at LOCATION for ABFD and
+   returns a bfd_boolean status; only the prototype is visible here.  */
+bfd_boolean bfd_set_symtab
+   (bfd *abfd, asymbol **location, unsigned int count);
+
+/* Takes a symbol and an untyped FILE argument (declared as void *;
+   presumably a stdio stream -- confirm against the definition).  */
+void bfd_print_symbol_vandf (bfd *abfd, void *file, asymbol *symbol);
+
+/* Expands to BFD_SEND on ABFD's _bfd_make_empty_symbol entry.  */
+#define bfd_make_empty_symbol(abfd) \
+  BFD_SEND (abfd, _bfd_make_empty_symbol, (abfd))
+
+/* Function returning an asymbol pointer for the given BFD; per its name,
+   the generic implementation behind bfd_make_empty_symbol (confirm).  */
+asymbol *_bfd_generic_make_empty_symbol (bfd *);
+
+/* Expands to BFD_SEND on ABFD's _bfd_make_debug_symbol entry, passing
+   PTR and SIZE through unchanged.  */
+#define bfd_make_debug_symbol(abfd,ptr,size) \
+  BFD_SEND (abfd, _bfd_make_debug_symbol, (abfd, ptr, size))
+
+/* Maps a symbol to an int class code; the encoding of the result is not
+   visible here.  */
+int bfd_decode_symclass (asymbol *symbol);
+
+/* Predicate on an int class code such as bfd_decode_symclass returns.  */
+bfd_boolean bfd_is_undefined_symclass (int symclass);
+
+/* Fills the caller-supplied symbol_info at RET from SYMBOL (RET is an
+   output pointer; the function itself returns nothing).  */
+void bfd_symbol_info (asymbol *symbol, symbol_info *ret);
+
+/* Takes an input BFD/symbol pair and an output BFD/symbol pair and
+   returns a bfd_boolean status.  Shadowed for function-like use by the
+   macro of the same name defined immediately below.  */
+bfd_boolean bfd_copy_private_symbol_data
+   (bfd *ibfd, asymbol *isym, bfd *obfd, asymbol *osym);
+
+/* Expands to BFD_SEND keyed on OBFD (the output BFD, not the input one),
+   naming the _bfd_copy_private_symbol_data entry.  */
+#define bfd_copy_private_symbol_data(ibfd, isymbol, obfd, osymbol) \
+  BFD_SEND (obfd, _bfd_copy_private_symbol_data, \
+            (ibfd, isymbol, obfd, osymbol))
+
+/* Extracted from bfd.c.  */
+
+/* Direction with which a BFD was opened.  struct bfd stores this in a
+   2-bit `direction' bitfield, so every value must fit in the range 0-3.
+   Numerically, both_direction (3) equals
+   read_direction (1) | write_direction (2).  */
+enum bfd_direction
+  {
+    no_direction = 0,
+    read_direction = 1,
+    write_direction = 2,
+    both_direction = 3
+  };
+
+/* Three-state answer to whether a BFD is an input for a compiler plug-in
+   library.  struct bfd stores this in a 2-bit `plugin_format' bitfield,
+   so every value must fit in the range 0-3.  */
+enum bfd_plugin_format
+  {
+    bfd_plugin_unknown = 0,
+    bfd_plugin_yes = 1,
+    bfd_plugin_no = 2
+  };
+
+/* A build-id record: a byte count followed by the bytes themselves.
+   NOTE(review): data is declared with one element; this looks like the
+   trailing variable-length-array idiom, with size giving the real number
+   of bytes -- confirm against the code that allocates this struct.  */
+struct bfd_build_id
+  {
+    bfd_size_type size;
+    bfd_byte data[1];
+  };
+
+struct bfd
+{
+  /* The filename the application opened the BFD with.  */
+  const char *filename;
+
+  /* A pointer to the target jump table.  */
+  const struct bfd_target *xvec;
+
+  /* The IOSTREAM, and corresponding IO vector that provide access
+     to the file backing the BFD.  */
+  void *iostream;
+  const struct bfd_iovec *iovec;
+
+  /* The caching routines use these to maintain a
+     least-recently-used list of BFDs.  */
+  struct bfd *lru_prev, *lru_next;
+
+  /* When a file is closed by the caching routines, BFD retains
+     state information on the file here...  */
+  ufile_ptr where;
+
+  /* File modified time, if mtime_set is TRUE.  */
+  long mtime;
+
+  /* A unique identifier of the BFD  */
+  unsigned int id;
+
+  /* The format which belongs to the BFD. (object, core, etc.)  */
+  ENUM_BITFIELD (bfd_format) format : 3;
+
+  /* The direction with which the BFD was opened.  */
+  ENUM_BITFIELD (bfd_direction) direction : 2;
+
+  /* Format_specific flags.  */
+  flagword flags : 20;
+
+  /* Values that may appear in the flags field of a BFD.  These also
+     appear in the object_flags field of the bfd_target structure, where
+     they indicate the set of flags used by that backend (not all flags
+     are meaningful for all object file formats) (FIXME: at the moment,
+     the object_flags values have mostly just been copied from backend
+     to another, and are not necessarily correct).  */
+
+#define BFD_NO_FLAGS   0x00
+
+  /* BFD contains relocation entries.  */
+#define HAS_RELOC      0x01
+
+  /* BFD is directly executable.  */
+#define EXEC_P         0x02
+
+  /* BFD has line number information (basically used for F_LNNO in a
+     COFF header).  */
+#define HAS_LINENO     0x04
+
+  /* BFD has debugging information.  */
+#define HAS_DEBUG      0x08
+
+  /* BFD has symbols.  */
+#define HAS_SYMS       0x10
+
+  /* BFD has local symbols (basically used for F_LSYMS in a COFF
+     header).  */
+#define HAS_LOCALS     0x20
+
+  /* BFD is a dynamic object.  */
+#define DYNAMIC        0x40
+
+  /* Text section is write protected (if D_PAGED is not set, this is
+     like an a.out NMAGIC file) (the linker sets this by default, but
+     clears it for -r or -N).  */
+#define WP_TEXT        0x80
+
+  /* BFD is dynamically paged (this is like an a.out ZMAGIC file) (the
+     linker sets this by default, but clears it for -r or -n or -N).  */
+#define D_PAGED        0x100
+
+  /* BFD is relaxable (this means that bfd_relax_section may be able to
+     do something) (sometimes bfd_relax_section can do something even if
+     this is not set).  */
+#define BFD_IS_RELAXABLE 0x200
+
+  /* This may be set before writing out a BFD to request using a
+     traditional format.  For example, this is used to request that when
+     writing out an a.out object the symbols not be hashed to eliminate
+     duplicates.  */
+#define BFD_TRADITIONAL_FORMAT 0x400
+
+  /* This flag indicates that the BFD contents are actually cached
+     in memory.  If this is set, iostream points to a bfd_in_memory
+     struct.  */
+#define BFD_IN_MEMORY 0x800
+
+  /* This BFD has been created by the linker and doesn't correspond
+     to any input file.  */
+#define BFD_LINKER_CREATED 0x1000
+
+  /* This may be set before writing out a BFD to request that it
+     be written using values for UIDs, GIDs, timestamps, etc. that
+     will be consistent from run to run.  */
+#define BFD_DETERMINISTIC_OUTPUT 0x2000
+
+  /* Compress sections in this BFD.  */
+#define BFD_COMPRESS 0x4000
+
+  /* Decompress sections in this BFD.  */
+#define BFD_DECOMPRESS 0x8000
+
+  /* BFD is a dummy, for plugins.  */
+#define BFD_PLUGIN 0x10000
+
+  /* Compress sections in this BFD with SHF_COMPRESSED from gABI.  */
+#define BFD_COMPRESS_GABI 0x20000
+
+  /* Convert ELF common symbol type to STT_COMMON or STT_OBJECT in this
+     BFD.  */
+#define BFD_CONVERT_ELF_COMMON 0x40000
+
+  /* Use the ELF STT_COMMON type in this BFD.  */
+#define BFD_USE_ELF_STT_COMMON 0x80000
+
+  /* Flags bits to be saved in bfd_preserve_save.  */
+#define BFD_FLAGS_SAVED \
+  (BFD_IN_MEMORY | BFD_COMPRESS | BFD_DECOMPRESS | BFD_PLUGIN \
+   | BFD_COMPRESS_GABI | BFD_CONVERT_ELF_COMMON | BFD_USE_ELF_STT_COMMON)
+
+  /* Flags bits which are for BFD use only.  */
+#define BFD_FLAGS_FOR_BFD_USE_MASK \
+  (BFD_IN_MEMORY | BFD_COMPRESS | BFD_DECOMPRESS | BFD_LINKER_CREATED \
+   | BFD_PLUGIN | BFD_TRADITIONAL_FORMAT | BFD_DETERMINISTIC_OUTPUT \
+   | BFD_COMPRESS_GABI | BFD_CONVERT_ELF_COMMON | BFD_USE_ELF_STT_COMMON)
+
+  /* Is the file descriptor being cached?  That is, can it be closed as
+     needed, and re-opened when accessed later?  */
+  unsigned int cacheable : 1;
+
+  /* Marks whether there was a default target specified when the
+     BFD was opened. This is used to select which matching algorithm
+     to use to choose the back end.  */
+  unsigned int target_defaulted : 1;
+
+  /* ... and here: (``once'' means at least once).  */
+  unsigned int opened_once : 1;
+
+  /* Set if we have a locally maintained mtime value, rather than
+     getting it from the file each time.  */
+  unsigned int mtime_set : 1;
+
+  /* Flag set if symbols from this BFD should not be exported.  */
+  unsigned int no_export : 1;
+
+  /* Remember when output has begun, to stop strange things
+     from happening.  */
+  unsigned int output_has_begun : 1;
+
+  /* Have archive map.  */
+  unsigned int has_armap : 1;
+
+  /* Set if this is a thin archive.  */
+  unsigned int is_thin_archive : 1;
+
+  /* Set if only required symbols should be added in the link hash table for
+     this object.  Used by VMS linkers.  */
+  unsigned int selective_search : 1;
+
+  /* Set if this is the linker output BFD.  */
+  unsigned int is_linker_output : 1;
+
+  /* Set if this is the linker input BFD.  */
+  unsigned int is_linker_input : 1;
+
+  /* If this is an input for a compiler plug-in library.  */
+  ENUM_BITFIELD (bfd_plugin_format) plugin_format : 2;
+
+  /* Set if this is a plugin output file.  */
+  unsigned int lto_output : 1;
+
+  /* Set to dummy BFD created when claimed by a compiler plug-in
+     library.  */
+  bfd *plugin_dummy_bfd;
+
+  /* Currently my_archive is tested before adding origin to
+     anything. I believe that this can become always an add of
+     origin, with origin set to 0 for non archive files.  */
+  ufile_ptr origin;
+
+  /* The origin in the archive of the proxy entry.  This will
+     normally be the same as origin, except for thin archives,
+     when it will contain the current offset of the proxy in the
+     thin archive rather than the offset of the bfd in its actual
+     container.  */
+  ufile_ptr proxy_origin;
+
+  /* A hash table for section names.  */
+  struct bfd_hash_table section_htab;
+
+  /* Pointer to linked list of sections.  */
+  struct bfd_section *sections;
+
+  /* The last section on the section list.  */
+  struct bfd_section *section_last;
+
+  /* The number of sections.  */
+  unsigned int section_count;
+
+  /* A field used by _bfd_generic_link_add_archive_symbols.  This will
+     be used only for archive elements.  */
+  int archive_pass;
+
+  /* Stuff only useful for object files:
+     The start address.  */
+  bfd_vma start_address;
+
+  /* Symbol table for output BFD (with symcount entries).
+     Also used by the linker to cache input BFD symbols.  */
+  struct bfd_symbol  **outsymbols;
+
+  /* Used for input and output.  */
+  unsigned int symcount;
+
+  /* Used for slurped dynamic symbol tables.  */
+  unsigned int dynsymcount;
+
+  /* Pointer to structure which contains architecture information.  */
+  const struct bfd_arch_info *arch_info;
+
+  /* Stuff only useful for archives.  */
+  void *arelt_data;
+  struct bfd *my_archive;      /* The containing archive BFD.  */
+  struct bfd *archive_next;    /* The next BFD in the archive.  */
+  struct bfd *archive_head;    /* The first BFD in the archive.  */
+  struct bfd *nested_archives; /* List of nested archive in a flattened
+                                  thin archive.  */
+
+  union {
+    /* For input BFDs, a chain of BFDs involved in a link.  */
+    struct bfd *next;
+    /* For output BFD, the linker hash table.  */
+    struct bfd_link_hash_table *hash;
+  } link;
+
+  /* Used by the back end to hold private data.  */
+  union
+    {
+      struct aout_data_struct *aout_data;
+      struct artdata *aout_ar_data;
+      struct _oasys_data *oasys_obj_data;
+      struct _oasys_ar_data *oasys_ar_data;
+      struct coff_tdata *coff_obj_data;
+      struct pe_tdata *pe_obj_data;
+      struct xcoff_tdata *xcoff_obj_data;
+      struct ecoff_tdata *ecoff_obj_data;
+      struct ieee_data_struct *ieee_data;
+      struct ieee_ar_data_struct *ieee_ar_data;
+      struct srec_data_struct *srec_data;
+      struct verilog_data_struct *verilog_data;
+      struct ihex_data_struct *ihex_data;
+      struct tekhex_data_struct *tekhex_data;
+      struct elf_obj_tdata *elf_obj_data;
+      struct nlm_obj_tdata *nlm_obj_data;
+      struct bout_data_struct *bout_data;
+      struct mmo_data_struct *mmo_data;
+      struct sun_core_struct *sun_core_data;
+      struct sco5_core_struct *sco5_core_data;
+      struct trad_core_struct *trad_core_data;
+      struct som_data_struct *som_data;
+      struct hpux_core_struct *hpux_core_data;
+      struct hppabsd_core_struct *hppabsd_core_data;
+      struct sgi_core_struct *sgi_core_data;
+      struct lynx_core_struct *lynx_core_data;
+      struct osf_core_struct *osf_core_data;
+      struct cisco_core_struct *cisco_core_data;
+      struct versados_data_struct *versados_data;
+      struct netbsd_core_struct *netbsd_core_data;
+      struct mach_o_data_struct *mach_o_data;
+      struct mach_o_fat_data_struct *mach_o_fat_data;
+      struct plugin_data_struct *plugin_data;
+      struct bfd_pef_data_struct *pef_data;
+      struct bfd_pef_xlib_data_struct *pef_xlib_data;
+      struct bfd_sym_data_struct *sym_data;
+      void *any;
+    }
+  tdata;
+
+  /* Used by the application to hold private data.  */
+  void *usrdata;
+
+  /* Where all the allocated stuff under this BFD goes.  This is a
+     struct objalloc *, but we use void * to avoid requiring the inclusion
+     of objalloc.h.  */
+  void *memory;
+
+  /* For input BFDs, the build ID, if the object has one. */
+  const struct bfd_build_id *build_id;
+};
+
+/* Set ABFD's cacheable flag to VAL; always returns TRUE.  See note beside bfd_set_section_userdata.  */
+static inline bfd_boolean
+bfd_set_cacheable (bfd * abfd, bfd_boolean val)
+{
+  abfd->cacheable = val;
+  return TRUE;
+}
+
+/* Error codes: returned by bfd_get_error, accepted by bfd_set_error and bfd_errmsg.  */
+typedef enum bfd_error
+{
+  bfd_error_no_error = 0,
+  bfd_error_system_call,
+  bfd_error_invalid_target,
+  bfd_error_wrong_format,
+  bfd_error_wrong_object_format,
+  bfd_error_invalid_operation,
+  bfd_error_no_memory,
+  bfd_error_no_symbols,
+  bfd_error_no_armap,
+  bfd_error_no_more_archived_files,
+  bfd_error_malformed_archive,
+  bfd_error_missing_dso,
+  bfd_error_file_not_recognized,
+  bfd_error_file_ambiguously_recognized,
+  bfd_error_no_contents,
+  bfd_error_nonrepresentable_section,
+  bfd_error_no_debug_section,
+  bfd_error_bad_value,
+  bfd_error_file_truncated,
+  bfd_error_file_too_big,
+  bfd_error_on_input,
+  bfd_error_invalid_error_code
+}
+bfd_error_type;
+
+bfd_error_type bfd_get_error (void);
+
+void bfd_set_error (bfd_error_type error_tag, ...);
+
+const char *bfd_errmsg (bfd_error_type error_tag);
+
+void bfd_perror (const char *message);
+
+
+typedef void (*bfd_error_handler_type) (const char *, va_list);
+
+bfd_error_handler_type bfd_set_error_handler (bfd_error_handler_type);
+
+void bfd_set_error_program_name (const char *);
+
+
+typedef void (*bfd_assert_handler_type) (const char *bfd_formatmsg,
+                                         const char *bfd_version,
+                                         const char *bfd_file,
+                                         int bfd_line);
+
+bfd_assert_handler_type bfd_set_assert_handler (bfd_assert_handler_type);
+
+long bfd_get_reloc_upper_bound (bfd *abfd, asection *sect);
+
+long bfd_canonicalize_reloc
+   (bfd *abfd, asection *sec, arelent **loc, asymbol **syms);
+
+void bfd_set_reloc
+   (bfd *abfd, asection *sec, arelent **rel, unsigned int count);
+
+bfd_boolean bfd_set_file_flags (bfd *abfd, flagword flags);
+
+int bfd_get_arch_size (bfd *abfd);
+
+int bfd_get_sign_extend_vma (bfd *abfd);
+
+bfd_boolean bfd_set_start_address (bfd *abfd, bfd_vma vma);
+
+unsigned int bfd_get_gp_size (bfd *abfd);
+
+void bfd_set_gp_size (bfd *abfd, unsigned int i);
+
+bfd_vma bfd_scan_vma (const char *string, const char **end, int base);
+
+bfd_boolean bfd_copy_private_header_data (bfd *ibfd, bfd *obfd);
+
+#define bfd_copy_private_header_data(ibfd, obfd) \
+     BFD_SEND (obfd, _bfd_copy_private_header_data, \
+               (ibfd, obfd))
+bfd_boolean bfd_copy_private_bfd_data (bfd *ibfd, bfd *obfd);
+
+#define bfd_copy_private_bfd_data(ibfd, obfd) \
+     BFD_SEND (obfd, _bfd_copy_private_bfd_data, \
+               (ibfd, obfd))
+bfd_boolean bfd_set_private_flags (bfd *abfd, flagword flags);
+
+#define bfd_set_private_flags(abfd, flags) \
+     BFD_SEND (abfd, _bfd_set_private_flags, (abfd, flags))
+#define bfd_sizeof_headers(abfd, info) \
+       BFD_SEND (abfd, _bfd_sizeof_headers, (abfd, info))
+
+#define bfd_find_nearest_line(abfd, sec, syms, off, file, func, line) \
+       BFD_SEND (abfd, _bfd_find_nearest_line, \
+                 (abfd, syms, sec, off, file, func, line, NULL))
+
+#define bfd_find_nearest_line_discriminator(abfd, sec, syms, off, file, func, \
+                                            line, disc) \
+       BFD_SEND (abfd, _bfd_find_nearest_line, \
+                 (abfd, syms, sec, off, file, func, line, disc))
+
+#define bfd_find_line(abfd, syms, sym, file, line) \
+       BFD_SEND (abfd, _bfd_find_line, \
+                 (abfd, syms, sym, file, line))
+
+#define bfd_find_inliner_info(abfd, file, func, line) \
+       BFD_SEND (abfd, _bfd_find_inliner_info, \
+                 (abfd, file, func, line))
+
+#define bfd_debug_info_start(abfd) \
+       BFD_SEND (abfd, _bfd_debug_info_start, (abfd))
+
+#define bfd_debug_info_end(abfd) \
+       BFD_SEND (abfd, _bfd_debug_info_end, (abfd))
+
+#define bfd_debug_info_accumulate(abfd, section) \
+       BFD_SEND (abfd, _bfd_debug_info_accumulate, (abfd, section))
+
+#define bfd_stat_arch_elt(abfd, stat) \
+       BFD_SEND (abfd, _bfd_stat_arch_elt,(abfd, stat))
+
+#define bfd_update_armap_timestamp(abfd) \
+       BFD_SEND (abfd, _bfd_update_armap_timestamp, (abfd))
+
+#define bfd_set_arch_mach(abfd, arch, mach)\
+       BFD_SEND ( abfd, _bfd_set_arch_mach, (abfd, arch, mach))
+
+#define bfd_relax_section(abfd, section, link_info, again) \
+       BFD_SEND (abfd, _bfd_relax_section, (abfd, section, link_info, again))
+
+#define bfd_gc_sections(abfd, link_info) \
+       BFD_SEND (abfd, _bfd_gc_sections, (abfd, link_info))
+/* NOTE(review): expands to `abfd', which is not a macro parameter; callers must have a bfd *abfd in scope.  */
+#define bfd_lookup_section_flags(link_info, flag_info, section) \
+       BFD_SEND (abfd, _bfd_lookup_section_flags, (link_info, flag_info, section))
+
+#define bfd_merge_sections(abfd, link_info) \
+       BFD_SEND (abfd, _bfd_merge_sections, (abfd, link_info))
+
+#define bfd_is_group_section(abfd, sec) \
+       BFD_SEND (abfd, _bfd_is_group_section, (abfd, sec))
+
+#define bfd_discard_group(abfd, sec) \
+       BFD_SEND (abfd, _bfd_discard_group, (abfd, sec))
+
+#define bfd_link_hash_table_create(abfd) \
+       BFD_SEND (abfd, _bfd_link_hash_table_create, (abfd))
+
+#define bfd_link_add_symbols(abfd, info) \
+       BFD_SEND (abfd, _bfd_link_add_symbols, (abfd, info))
+
+#define bfd_link_just_syms(abfd, sec, info) \
+       BFD_SEND (abfd, _bfd_link_just_syms, (sec, info))
+
+#define bfd_final_link(abfd, info) \
+       BFD_SEND (abfd, _bfd_final_link, (abfd, info))
+
+#define bfd_free_cached_info(abfd) \
+       BFD_SEND (abfd, _bfd_free_cached_info, (abfd))
+
+#define bfd_get_dynamic_symtab_upper_bound(abfd) \
+       BFD_SEND (abfd, _bfd_get_dynamic_symtab_upper_bound, (abfd))
+
+#define bfd_print_private_bfd_data(abfd, file)\
+       BFD_SEND (abfd, _bfd_print_private_bfd_data, (abfd, file))
+
+#define bfd_canonicalize_dynamic_symtab(abfd, asymbols) \
+       BFD_SEND (abfd, _bfd_canonicalize_dynamic_symtab, (abfd, asymbols))
+
+#define bfd_get_synthetic_symtab(abfd, count, syms, dyncount, dynsyms, ret) \
+       BFD_SEND (abfd, _bfd_get_synthetic_symtab, (abfd, count, syms, \
+                                                   dyncount, dynsyms, ret))
+
+#define bfd_get_dynamic_reloc_upper_bound(abfd) \
+       BFD_SEND (abfd, _bfd_get_dynamic_reloc_upper_bound, (abfd))
+
+#define bfd_canonicalize_dynamic_reloc(abfd, arels, asyms) \
+       BFD_SEND (abfd, _bfd_canonicalize_dynamic_reloc, (abfd, arels, asyms))
+
+extern bfd_byte *bfd_get_relocated_section_contents
+  (bfd *, struct bfd_link_info *, struct bfd_link_order *, bfd_byte *,
+   bfd_boolean, asymbol **);
+
+bfd_boolean bfd_alt_mach_code (bfd *abfd, int alternative);
+
+bfd_vma bfd_emul_get_maxpagesize (const char *);
+
+void bfd_emul_set_maxpagesize (const char *, bfd_vma);
+
+bfd_vma bfd_emul_get_commonpagesize (const char *);
+
+void bfd_emul_set_commonpagesize (const char *, bfd_vma);
+
+char *bfd_demangle (bfd *, const char *, int);
+
+void bfd_update_compression_header
+   (bfd *abfd, bfd_byte *contents, asection *sec);
+
+bfd_boolean bfd_check_compression_header
+   (bfd *abfd, bfd_byte *contents, asection *sec,
+    bfd_size_type *uncompressed_size);
+
+int bfd_get_compression_header_size (bfd *abfd, asection *sec);
+
+bfd_size_type bfd_convert_section_size
+   (bfd *ibfd, asection *isec, bfd *obfd, bfd_size_type size);
+
+bfd_boolean bfd_convert_section_contents
+   (bfd *ibfd, asection *isec, bfd *obfd,
+    bfd_byte **ptr, bfd_size_type *ptr_size);
+
+/* Extracted from archive.c.  */
+symindex bfd_get_next_mapent
+   (bfd *abfd, symindex previous, carsym **sym);
+
+bfd_boolean bfd_set_archive_head (bfd *output, bfd *new_head);
+
+bfd *bfd_openr_next_archived_file (bfd *archive, bfd *previous);
+
+/* Extracted from corefile.c.  */
+const char *bfd_core_file_failing_command (bfd *abfd);
+
+int bfd_core_file_failing_signal (bfd *abfd);
+
+int bfd_core_file_pid (bfd *abfd);
+
+bfd_boolean core_file_matches_executable_p
+   (bfd *core_bfd, bfd *exec_bfd);
+
+bfd_boolean generic_core_file_matches_executable_p
+   (bfd *core_bfd, bfd *exec_bfd);
+
+/* Extracted from targets.c.  BFD_SEND calls member MESSAGE of (bfd)->xvec with ARGLIST; BFD_SEND_FMT calls MESSAGE[(bfd)->format].  */
+#define BFD_SEND(bfd, message, arglist) \
+  ((*((bfd)->xvec->message)) arglist)
+/* With DEBUG_BFD_SEND, test bfd, xvec and the member for null first; otherwise evaluate bfd_assert and yield NULL.  */
+#ifdef DEBUG_BFD_SEND
+#undef BFD_SEND
+#define BFD_SEND(bfd, message, arglist) \
+  (((bfd) && (bfd)->xvec && (bfd)->xvec->message) ? \
+    ((*((bfd)->xvec->message)) arglist) : \
+    (bfd_assert (__FILE__,__LINE__), NULL))
+#endif
+#define BFD_SEND_FMT(bfd, message, arglist) \
+  (((bfd)->xvec->message[(int) ((bfd)->format)]) arglist)
+/* Debug variant of BFD_SEND_FMT.  NOTE(review): the check tests the table MESSAGE itself, not the indexed entry.  */
+#ifdef DEBUG_BFD_SEND
+#undef BFD_SEND_FMT
+#define BFD_SEND_FMT(bfd, message, arglist) \
+  (((bfd) && (bfd)->xvec && (bfd)->xvec->message) ? \
+   (((bfd)->xvec->message[(int) ((bfd)->format)]) arglist) : \
+   (bfd_assert (__FILE__,__LINE__), NULL))
+#endif
+/* General kind of file contents a back end handles; stored in the flavour member of bfd_target.  */
+enum bfd_flavour
+{
+  /* N.B. Update bfd_flavour_name if you change this.  */
+  bfd_target_unknown_flavour,
+  bfd_target_aout_flavour,
+  bfd_target_coff_flavour,
+  bfd_target_ecoff_flavour,
+  bfd_target_xcoff_flavour,
+  bfd_target_elf_flavour,
+  bfd_target_ieee_flavour,
+  bfd_target_nlm_flavour,
+  bfd_target_oasys_flavour,
+  bfd_target_tekhex_flavour,
+  bfd_target_srec_flavour,
+  bfd_target_verilog_flavour,
+  bfd_target_ihex_flavour,
+  bfd_target_som_flavour,
+  bfd_target_os9k_flavour,
+  bfd_target_versados_flavour,
+  bfd_target_msdos_flavour,
+  bfd_target_ovax_flavour,
+  bfd_target_evax_flavour,
+  bfd_target_mmo_flavour,
+  bfd_target_mach_o_flavour,
+  bfd_target_pef_flavour,
+  bfd_target_pef_xlib_flavour,
+  bfd_target_sym_flavour
+};
+
+enum bfd_endian { BFD_ENDIAN_BIG, BFD_ENDIAN_LITTLE, BFD_ENDIAN_UNKNOWN };
+
+/* Forward declaration.  */
+typedef struct bfd_link_info _bfd_link_info;
+
+/* Forward declaration.  */
+typedef struct flag_info flag_info;
+/* The target vector structure: format description plus the entry points that the BFD_SEND wrappers name.  */
+typedef struct bfd_target
+{
+  /* Identifies the kind of target, e.g., SunOS4, Ultrix, etc.  */
+  char *name;
+
+ /* The "flavour" of a back end is a general indication about
+    the contents of a file.  */
+  enum bfd_flavour flavour;
+
+  /* The order of bytes within the data area of a file.  */
+  enum bfd_endian byteorder;
+
+ /* The order of bytes within the header parts of a file.  */
+  enum bfd_endian header_byteorder;
+
+  /* A mask of all the flags which an executable may have set -
+     from the set <<BFD_NO_FLAGS>>, <<HAS_RELOC>>, ...<<D_PAGED>>.  */
+  flagword object_flags;
+
+ /* A mask of all the flags which a section may have set - from
+    the set <<SEC_NO_FLAGS>>, <<SEC_ALLOC>>, ...<<SEC_NEVER_LOAD>>.  */
+  flagword section_flags;
+
+ /* The character normally found at the front of a symbol
+    (if any), perhaps `_'.  */
+  char symbol_leading_char;
+
+ /* The pad character for file names within an archive header.  */
+  char ar_pad_char;
+
+  /* The maximum number of characters in an archive header.  */
+  unsigned char ar_max_namelen;
+
+  /* How well this target matches, used to select between various
+     possible targets when more than one target matches.  */
+  unsigned char match_priority;
+
+  /* Entries for byte swapping for data. These are different from the
+     other entry points, since they don't take a BFD as the first argument.
+     Certain other handlers could do the same.  */
+  bfd_uint64_t   (*bfd_getx64) (const void *);
+  bfd_int64_t    (*bfd_getx_signed_64) (const void *);
+  void           (*bfd_putx64) (bfd_uint64_t, void *);
+  bfd_vma        (*bfd_getx32) (const void *);
+  bfd_signed_vma (*bfd_getx_signed_32) (const void *);
+  void           (*bfd_putx32) (bfd_vma, void *);
+  bfd_vma        (*bfd_getx16) (const void *);
+  bfd_signed_vma (*bfd_getx_signed_16) (const void *);
+  void           (*bfd_putx16) (bfd_vma, void *);
+
+  /* Byte swapping for the headers.  */
+  bfd_uint64_t   (*bfd_h_getx64) (const void *);
+  bfd_int64_t    (*bfd_h_getx_signed_64) (const void *);
+  void           (*bfd_h_putx64) (bfd_uint64_t, void *);
+  bfd_vma        (*bfd_h_getx32) (const void *);
+  bfd_signed_vma (*bfd_h_getx_signed_32) (const void *);
+  void           (*bfd_h_putx32) (bfd_vma, void *);
+  bfd_vma        (*bfd_h_getx16) (const void *);
+  bfd_signed_vma (*bfd_h_getx_signed_16) (const void *);
+  void           (*bfd_h_putx16) (bfd_vma, void *);
+
+  /* Format dependent routines: these are vectors of entry points
+     within the target vector structure, one for each format to check.  */
+
+  /* Check the format of a file being read.  Return a <<bfd_target *>> or zero.  */
+  const struct bfd_target *(*_bfd_check_format[bfd_type_end]) (bfd *);
+
+  /* Set the format of a file being written.  */
+  bfd_boolean (*_bfd_set_format[bfd_type_end]) (bfd *);
+
+  /* Write cached information into a file being written, at <<bfd_close>>.  */
+  bfd_boolean (*_bfd_write_contents[bfd_type_end]) (bfd *);
+
+
+  /* Generic entry points.  */
+#define BFD_JUMP_TABLE_GENERIC(NAME) \
+  NAME##_close_and_cleanup, \
+  NAME##_bfd_free_cached_info, \
+  NAME##_new_section_hook, \
+  NAME##_get_section_contents, \
+  NAME##_get_section_contents_in_window
+
+  /* Called when the BFD is being closed to do any necessary cleanup.  */
+  bfd_boolean (*_close_and_cleanup) (bfd *);
+  /* Ask the BFD to free all cached information.  */
+  bfd_boolean (*_bfd_free_cached_info) (bfd *);
+  /* Called when a new section is created.  */
+  bfd_boolean (*_new_section_hook) (bfd *, sec_ptr);
+  /* Read the contents of a section.  */
+  bfd_boolean (*_bfd_get_section_contents)
+    (bfd *, sec_ptr, void *, file_ptr, bfd_size_type);
+  bfd_boolean (*_bfd_get_section_contents_in_window)
+    (bfd *, sec_ptr, bfd_window *, file_ptr, bfd_size_type);
+
+  /* Entry points to copy private data.  */
+#define BFD_JUMP_TABLE_COPY(NAME) \
+  NAME##_bfd_copy_private_bfd_data, \
+  NAME##_bfd_merge_private_bfd_data, \
+  _bfd_generic_init_private_section_data, \
+  NAME##_bfd_copy_private_section_data, \
+  NAME##_bfd_copy_private_symbol_data, \
+  NAME##_bfd_copy_private_header_data, \
+  NAME##_bfd_set_private_flags, \
+  NAME##_bfd_print_private_bfd_data
+
+  /* Called to copy BFD general private data from one object file
+     to another.  */
+  bfd_boolean (*_bfd_copy_private_bfd_data) (bfd *, bfd *);
+  /* Called to merge BFD general private data from one object file
+     to a common output file when linking.  */
+  bfd_boolean (*_bfd_merge_private_bfd_data) (bfd *, struct bfd_link_info *);
+  /* Called to initialize BFD private section data from one object file
+     to another.  */
+#define bfd_init_private_section_data(ibfd, isec, obfd, osec, link_info) \
+  BFD_SEND (obfd, _bfd_init_private_section_data, (ibfd, isec, obfd, osec, link_info))
+  bfd_boolean (*_bfd_init_private_section_data)
+    (bfd *, sec_ptr, bfd *, sec_ptr, struct bfd_link_info *);
+  /* Called to copy BFD private section data from one object file
+     to another.  */
+  bfd_boolean (*_bfd_copy_private_section_data)
+    (bfd *, sec_ptr, bfd *, sec_ptr);
+  /* Called to copy BFD private symbol data from one symbol
+     to another.  */
+  bfd_boolean (*_bfd_copy_private_symbol_data)
+    (bfd *, asymbol *, bfd *, asymbol *);
+  /* Called to copy BFD private header data from one object file
+     to another.  */
+  bfd_boolean (*_bfd_copy_private_header_data)
+    (bfd *, bfd *);
+  /* Called to set private backend flags.  */
+  bfd_boolean (*_bfd_set_private_flags) (bfd *, flagword);
+
+  /* Called to print private BFD data.  */
+  bfd_boolean (*_bfd_print_private_bfd_data) (bfd *, void *);
+
+  /* Core file entry points.  */
+#define BFD_JUMP_TABLE_CORE(NAME) \
+  NAME##_core_file_failing_command, \
+  NAME##_core_file_failing_signal, \
+  NAME##_core_file_matches_executable_p, \
+  NAME##_core_file_pid
+
+  char *      (*_core_file_failing_command) (bfd *);
+  int         (*_core_file_failing_signal) (bfd *);
+  bfd_boolean (*_core_file_matches_executable_p) (bfd *, bfd *);
+  int         (*_core_file_pid) (bfd *);
+
+  /* Archive entry points.  */
+#define BFD_JUMP_TABLE_ARCHIVE(NAME) \
+  NAME##_slurp_armap, \
+  NAME##_slurp_extended_name_table, \
+  NAME##_construct_extended_name_table, \
+  NAME##_truncate_arname, \
+  NAME##_write_armap, \
+  NAME##_read_ar_hdr, \
+  NAME##_write_ar_hdr, \
+  NAME##_openr_next_archived_file, \
+  NAME##_get_elt_at_index, \
+  NAME##_generic_stat_arch_elt, \
+  NAME##_update_armap_timestamp
+
+  bfd_boolean (*_bfd_slurp_armap) (bfd *);
+  bfd_boolean (*_bfd_slurp_extended_name_table) (bfd *);
+  bfd_boolean (*_bfd_construct_extended_name_table)
+    (bfd *, char **, bfd_size_type *, const char **);
+  void        (*_bfd_truncate_arname) (bfd *, const char *, char *);
+  bfd_boolean (*write_armap)
+    (bfd *, unsigned int, struct orl *, unsigned int, int);
+  void *      (*_bfd_read_ar_hdr_fn) (bfd *);
+  bfd_boolean (*_bfd_write_ar_hdr_fn) (bfd *, bfd *);
+  bfd *       (*openr_next_archived_file) (bfd *, bfd *);
+#define bfd_get_elt_at_index(b,i) BFD_SEND (b, _bfd_get_elt_at_index, (b,i))
+  bfd *       (*_bfd_get_elt_at_index) (bfd *, symindex);
+  int         (*_bfd_stat_arch_elt) (bfd *, struct stat *);
+  bfd_boolean (*_bfd_update_armap_timestamp) (bfd *);
+
+  /* Entry points used for symbols.  */
+#define BFD_JUMP_TABLE_SYMBOLS(NAME) \
+  NAME##_get_symtab_upper_bound, \
+  NAME##_canonicalize_symtab, \
+  NAME##_make_empty_symbol, \
+  NAME##_print_symbol, \
+  NAME##_get_symbol_info, \
+  NAME##_get_symbol_version_string, \
+  NAME##_bfd_is_local_label_name, \
+  NAME##_bfd_is_target_special_symbol, \
+  NAME##_get_lineno, \
+  NAME##_find_nearest_line, \
+  NAME##_find_line, \
+  NAME##_find_inliner_info, \
+  NAME##_bfd_make_debug_symbol, \
+  NAME##_read_minisymbols, \
+  NAME##_minisymbol_to_symbol
+
+  long        (*_bfd_get_symtab_upper_bound) (bfd *);
+  long        (*_bfd_canonicalize_symtab)
+    (bfd *, struct bfd_symbol **);
+  struct bfd_symbol *
+              (*_bfd_make_empty_symbol) (bfd *);
+  void        (*_bfd_print_symbol)
+    (bfd *, void *, struct bfd_symbol *, bfd_print_symbol_type);
+#define bfd_print_symbol(b,p,s,e) BFD_SEND (b, _bfd_print_symbol, (b,p,s,e))
+  void        (*_bfd_get_symbol_info)
+    (bfd *, struct bfd_symbol *, symbol_info *);
+#define bfd_get_symbol_info(b,p,e) BFD_SEND (b, _bfd_get_symbol_info, (b,p,e))
+  const char *(*_bfd_get_symbol_version_string)
+    (bfd *, struct bfd_symbol *, bfd_boolean *);
+#define bfd_get_symbol_version_string(b,s,h) BFD_SEND (b, _bfd_get_symbol_version_string, (b,s,h))
+  bfd_boolean (*_bfd_is_local_label_name) (bfd *, const char *);
+  bfd_boolean (*_bfd_is_target_special_symbol) (bfd *, asymbol *);
+  alent *     (*_get_lineno) (bfd *, struct bfd_symbol *);
+  bfd_boolean (*_bfd_find_nearest_line)
+    (bfd *, struct bfd_symbol **, struct bfd_section *, bfd_vma,
+     const char **, const char **, unsigned int *, unsigned int *);
+  bfd_boolean (*_bfd_find_line)
+    (bfd *, struct bfd_symbol **, struct bfd_symbol *,
+     const char **, unsigned int *);
+  bfd_boolean (*_bfd_find_inliner_info)
+    (bfd *, const char **, const char **, unsigned int *);
+ /* Back-door to allow format-aware applications to create debug symbols
+    while using BFD for everything else.  Currently used by the assembler
+    when creating COFF files.  */
+  asymbol *   (*_bfd_make_debug_symbol)
+    (bfd *, void *, unsigned long size);
+#define bfd_read_minisymbols(b, d, m, s) \
+  BFD_SEND (b, _read_minisymbols, (b, d, m, s))
+  long        (*_read_minisymbols)
+    (bfd *, bfd_boolean, void **, unsigned int *);
+#define bfd_minisymbol_to_symbol(b, d, m, f) \
+  BFD_SEND (b, _minisymbol_to_symbol, (b, d, m, f))
+  asymbol *   (*_minisymbol_to_symbol)
+    (bfd *, bfd_boolean, const void *, asymbol *);
+
+  /* Routines for relocs.  */
+#define BFD_JUMP_TABLE_RELOCS(NAME) \
+  NAME##_get_reloc_upper_bound, \
+  NAME##_canonicalize_reloc, \
+  NAME##_bfd_reloc_type_lookup, \
+  NAME##_bfd_reloc_name_lookup
+
+  long        (*_get_reloc_upper_bound) (bfd *, sec_ptr);
+  long        (*_bfd_canonicalize_reloc)
+    (bfd *, sec_ptr, arelent **, struct bfd_symbol **);
+  /* See documentation on reloc types.  */
+  reloc_howto_type *
+              (*reloc_type_lookup) (bfd *, bfd_reloc_code_real_type);
+  reloc_howto_type *
+              (*reloc_name_lookup) (bfd *, const char *);
+
+
+  /* Routines used when writing an object file.  */
+#define BFD_JUMP_TABLE_WRITE(NAME) \
+  NAME##_set_arch_mach, \
+  NAME##_set_section_contents
+
+  bfd_boolean (*_bfd_set_arch_mach)
+    (bfd *, enum bfd_architecture, unsigned long);
+  bfd_boolean (*_bfd_set_section_contents)
+    (bfd *, sec_ptr, const void *, file_ptr, bfd_size_type);
+
+  /* Routines used by the linker.  */
+#define BFD_JUMP_TABLE_LINK(NAME) \
+  NAME##_sizeof_headers, \
+  NAME##_bfd_get_relocated_section_contents, \
+  NAME##_bfd_relax_section, \
+  NAME##_bfd_link_hash_table_create, \
+  NAME##_bfd_link_add_symbols, \
+  NAME##_bfd_link_just_syms, \
+  NAME##_bfd_copy_link_hash_symbol_type, \
+  NAME##_bfd_final_link, \
+  NAME##_bfd_link_split_section, \
+  NAME##_bfd_link_check_relocs, \
+  NAME##_bfd_gc_sections, \
+  NAME##_bfd_lookup_section_flags, \
+  NAME##_bfd_merge_sections, \
+  NAME##_bfd_is_group_section, \
+  NAME##_bfd_discard_group, \
+  NAME##_section_already_linked, \
+  NAME##_bfd_define_common_symbol
+
+  int         (*_bfd_sizeof_headers) (bfd *, struct bfd_link_info *);
+  bfd_byte *  (*_bfd_get_relocated_section_contents)
+    (bfd *, struct bfd_link_info *, struct bfd_link_order *,
+     bfd_byte *, bfd_boolean, struct bfd_symbol **);
+
+  bfd_boolean (*_bfd_relax_section)
+    (bfd *, struct bfd_section *, struct bfd_link_info *, bfd_boolean *);
+
+  /* Create a hash table for the linker.  Different backends store
+     different information in this table.  */
+  struct bfd_link_hash_table *
+              (*_bfd_link_hash_table_create) (bfd *);
+
+  /* Add symbols from this object file into the hash table.  */
+  bfd_boolean (*_bfd_link_add_symbols) (bfd *, struct bfd_link_info *);
+
+  /* Indicate that we are only retrieving symbol values from this section.  */
+  void        (*_bfd_link_just_syms) (asection *, struct bfd_link_info *);
+
+  /* Copy the symbol type and other attributes for a linker script
+     assignment of one symbol to another.  */
+#define bfd_copy_link_hash_symbol_type(b, t, f) \
+  BFD_SEND (b, _bfd_copy_link_hash_symbol_type, (b, t, f))
+  void (*_bfd_copy_link_hash_symbol_type)
+    (bfd *, struct bfd_link_hash_entry *, struct bfd_link_hash_entry *);
+
+  /* Do a link based on the link_order structures attached to each
+     section of the BFD.  */
+  bfd_boolean (*_bfd_final_link) (bfd *, struct bfd_link_info *);
+
+  /* Should this section be split up into smaller pieces during linking.  */
+  bfd_boolean (*_bfd_link_split_section) (bfd *, struct bfd_section *);
+
+  /* Check the relocations in the bfd for validity.  */
+  bfd_boolean (* _bfd_link_check_relocs)(bfd *, struct bfd_link_info *);
+
+  /* Remove sections that are not referenced from the output.  */
+  bfd_boolean (*_bfd_gc_sections) (bfd *, struct bfd_link_info *);
+
+  /* Sets the bitmask of allowed and disallowed section flags.  */
+  bfd_boolean (*_bfd_lookup_section_flags) (struct bfd_link_info *,
+                                            struct flag_info *,
+                                            asection *);
+
+  /* Attempt to merge SEC_MERGE sections.  */
+  bfd_boolean (*_bfd_merge_sections) (bfd *, struct bfd_link_info *);
+
+  /* Is this section a member of a group?  */
+  bfd_boolean (*_bfd_is_group_section) (bfd *, const struct bfd_section *);
+
+  /* Discard members of a group.  */
+  bfd_boolean (*_bfd_discard_group) (bfd *, struct bfd_section *);
+
+  /* Check if SEC has been already linked during a relocatable or
+     final link.  */
+  bfd_boolean (*_section_already_linked) (bfd *, asection *,
+                                          struct bfd_link_info *);
+
+  /* Define a common symbol.  */
+  bfd_boolean (*_bfd_define_common_symbol) (bfd *, struct bfd_link_info *,
+                                            struct bfd_link_hash_entry *);
+
+  /* Routines to handle dynamic symbols and relocs.  */
+#define BFD_JUMP_TABLE_DYNAMIC(NAME) \
+  NAME##_get_dynamic_symtab_upper_bound, \
+  NAME##_canonicalize_dynamic_symtab, \
+  NAME##_get_synthetic_symtab, \
+  NAME##_get_dynamic_reloc_upper_bound, \
+  NAME##_canonicalize_dynamic_reloc
+
+  /* Get the amount of memory required to hold the dynamic symbols.  */
+  long        (*_bfd_get_dynamic_symtab_upper_bound) (bfd *);
+  /* Read in the dynamic symbols.  */
+  long        (*_bfd_canonicalize_dynamic_symtab)
+    (bfd *, struct bfd_symbol **);
+  /* Create synthesized symbols.  */
+  long        (*_bfd_get_synthetic_symtab)
+    (bfd *, long, struct bfd_symbol **, long, struct bfd_symbol **,
+     struct bfd_symbol **);
+  /* Get the amount of memory required to hold the dynamic relocs.  */
+  long        (*_bfd_get_dynamic_reloc_upper_bound) (bfd *);
+  /* Read in the dynamic relocs.  */
+  long        (*_bfd_canonicalize_dynamic_reloc)
+    (bfd *, arelent **, struct bfd_symbol **);
+
+  /* Opposite endian version of this target.  */
+  const struct bfd_target * alternative_target;
+
+  /* Data for use by back-end routines, which isn't
+     generic enough to belong in this structure.  */
+  const void *backend_data;
+
+} bfd_target;
+
+bfd_boolean bfd_set_default_target (const char *name);
+
+const bfd_target *bfd_find_target (const char *target_name, bfd *abfd);
+
+const bfd_target *bfd_get_target_info (const char *target_name,
+    bfd *abfd,
+    bfd_boolean *is_bigendian,
+    int *underscoring,
+    const char **def_target_arch);
+const char ** bfd_target_list (void);
+
+const bfd_target *bfd_iterate_over_targets
+   (int (*func) (const bfd_target *, void *),
+    void *data);
+
+const char *bfd_flavour_name (enum bfd_flavour flavour);
+
+/* Extracted from format.c.  */
+bfd_boolean bfd_check_format (bfd *abfd, bfd_format format);
+
+bfd_boolean bfd_check_format_matches
+   (bfd *abfd, bfd_format format, char ***matching);
+
+bfd_boolean bfd_set_format (bfd *abfd, bfd_format format);
+
+const char *bfd_format_string (bfd_format format);
+
+/* Extracted from linker.c.  */
+bfd_boolean bfd_link_split_section (bfd *abfd, asection *sec);
+
+#define bfd_link_split_section(abfd, sec) \
+       BFD_SEND (abfd, _bfd_link_split_section, (abfd, sec))
+
+bfd_boolean bfd_section_already_linked (bfd *abfd,
+    asection *sec,
+    struct bfd_link_info *info);
+
+#define bfd_section_already_linked(abfd, sec, info) \
+       BFD_SEND (abfd, _section_already_linked, (abfd, sec, info))
+
+bfd_boolean bfd_generic_define_common_symbol
+   (bfd *output_bfd, struct bfd_link_info *info,
+    struct bfd_link_hash_entry *h);
+
+#define bfd_define_common_symbol(output_bfd, info, h) \
+       BFD_SEND (output_bfd, _bfd_define_common_symbol, (output_bfd, info, h))
+
+struct bfd_elf_version_tree * bfd_find_version_for_sym
+   (struct bfd_elf_version_tree *verdefs,
+    const char *sym_name, bfd_boolean *hide);
+
+bfd_boolean bfd_hide_sym_by_version
+   (struct bfd_elf_version_tree *verdefs, const char *sym_name);
+
+bfd_boolean bfd_link_check_relocs
+   (bfd *abfd, struct bfd_link_info *info);
+
+bfd_boolean _bfd_generic_link_check_relocs
+   (bfd *abfd, struct bfd_link_info *info);
+
+bfd_boolean bfd_merge_private_bfd_data
+   (bfd *ibfd, struct bfd_link_info *info);
+
+#define bfd_merge_private_bfd_data(ibfd, info) \
+     BFD_SEND ((info)->output_bfd, _bfd_merge_private_bfd_data, \
+               (ibfd, info))
+/* Extracted from simple.c.  */
+bfd_byte *bfd_simple_get_relocated_section_contents
+   (bfd *abfd, asection *sec, bfd_byte *outbuf, asymbol **symbol_table);
+
+/* Extracted from compress.c.  */
+bfd_boolean bfd_get_full_section_contents
+   (bfd *abfd, asection *section, bfd_byte **ptr);
+
+void bfd_cache_section_contents
+   (asection *sec, void *contents);
+
+bfd_boolean bfd_is_section_compressed_with_header
+   (bfd *abfd, asection *section,
+    int *compression_header_size_p,
+    bfd_size_type *uncompressed_size_p);
+
+bfd_boolean bfd_is_section_compressed
+   (bfd *abfd, asection *section);
+
+bfd_boolean bfd_init_section_decompress_status
+   (bfd *abfd, asection *section);
+
+bfd_boolean bfd_init_section_compress_status
+   (bfd *abfd, asection *section);
+
+bfd_boolean bfd_compress_section
+   (bfd *abfd, asection *section, bfd_byte *uncompressed_buffer);
+
+#ifdef __cplusplus
+}
+#endif
+#endif
diff --git a/utils/gapy/gen-debug-info-src/ext/bfd/bfd.h b/utils/gapy/gen-debug-info-src/ext/bfd/bfd.h
new file mode 100644
index 000000000..1c0cf66cf
--- /dev/null
+++ b/utils/gapy/gen-debug-info-src/ext/bfd/bfd.h
@@ -0,0 +1,7720 @@
+/* DO NOT EDIT!  -*- buffer-read-only: t -*-  This file is automatically 
+   generated from "bfd-in.h", "init.c", "opncls.c", "libbfd.c", 
+   "bfdio.c", "bfdwin.c", "section.c", "archures.c", "reloc.c", 
+   "syms.c", "bfd.c", "archive.c", "corefile.c", "targets.c", "format.c", 
+   "linker.c", "simple.c" and "compress.c".
+   Run "make headers" in your build bfd/ to regenerate.  */
+
+/* Main header file for the bfd library -- portable access to object files.
+
+   Copyright (C) 1990-2017 Free Software Foundation, Inc.
+
+   Contributed by Cygnus Support.
+
+   This file is part of BFD, the Binary File Descriptor library.
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA 02110-1301, USA.  */
+
+#ifndef __BFD_H_SEEN__
+#define __BFD_H_SEEN__
+
+/* PR 14072: Ensure that config.h is included first.  */
+#if !defined PACKAGE && !defined PACKAGE_VERSION
+#error config.h must be included before this header
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "ansidecl.h"
+#include "symcat.h"
+#include <stdarg.h>
+#include <sys/stat.h>
+
+#if defined (__STDC__) || defined (ALMOST_STDC) || defined (HAVE_STRINGIZE)
+#ifndef SABER
+/* This hack is to avoid a problem with some strict ANSI C preprocessors.
+   The problem is, "32_" is not a valid preprocessing token, and we don't
+   want extra underscores (e.g., "nlm_32_").  The XCONCAT2 macro will
+   cause the inner CONCAT2 macros to be evaluated first, producing
+   still-valid pp-tokens.  Then the final concatenation can be done.  */
+#undef CONCAT4
+#define CONCAT4(a,b,c,d) XCONCAT2(CONCAT2(a,b),CONCAT2(c,d))
+#endif
+#endif
+
+/* This is a utility macro to handle the situation where the code
+   wants to place a constant string into the code, followed by a
+   comma and then the length of the string.  Doing this by hand
+   is error prone, so using this macro is safer.  */
+#define STRING_COMMA_LEN(STR) (STR), (sizeof (STR) - 1)
+/* Unfortunately it is not possible to use the STRING_COMMA_LEN macro
+   to create the arguments to another macro, since the preprocessor
+   will mis-count the number of arguments to the outer macro (by not
+   evaluating STRING_COMMA_LEN and so missing the comma).  This is a
+   problem for example when trying to use STRING_COMMA_LEN to build
+   the arguments to the strncmp() macro.  Hence this alternative
+   definition of strncmp is provided here.
+
+   Note - these macros do NOT work if STR2 is not a constant string.  */
+#define CONST_STRNEQ(STR1,STR2) (strncmp ((STR1), (STR2), sizeof (STR2) - 1) == 0)
+  /* strcpy() can have a similar problem, but since we know we are
+     copying a constant string, we can use memcpy which will be faster
+     since there is no need to check for a NUL byte inside STR.  We
+     can also save time if we do not need to copy the terminating NUL.  */
+#define LITMEMCPY(DEST,STR2) memcpy ((DEST), (STR2), sizeof (STR2) - 1)
+#define LITSTRCPY(DEST,STR2) memcpy ((DEST), (STR2), sizeof (STR2))
+
+
+#define BFD_SUPPORTS_PLUGINS 1
+
+/* The word size used by BFD on the host.  This may be 64 with a 32
+   bit target if the host is 64 bit, or if other 64 bit targets have
+   been selected with --enable-targets, or if --enable-64-bit-bfd.  */
+#define BFD_ARCH_SIZE 64
+
+/* The word size of the default bfd target.  */
+#define BFD_DEFAULT_TARGET_SIZE 32
+
+#define BFD_HOST_64BIT_LONG 1
+#define BFD_HOST_64BIT_LONG_LONG 0
+#if 1
+#define BFD_HOST_64_BIT long
+#define BFD_HOST_U_64_BIT unsigned long
+typedef BFD_HOST_64_BIT bfd_int64_t;
+typedef BFD_HOST_U_64_BIT bfd_uint64_t;
+#endif
+
+#if BFD_ARCH_SIZE >= 64
+#define BFD64
+#endif
+
+#ifndef INLINE
+#if __GNUC__ >= 2
+#define INLINE __inline__
+#else
+#define INLINE
+#endif
+#endif
+
+/* Declare a type wide enough to hold a host long and a host pointer.  */
+#define BFD_HOSTPTR_T	unsigned long
+typedef BFD_HOSTPTR_T bfd_hostptr_t;
+
+/* Forward declaration.  */
+typedef struct bfd bfd;
+
+/* Boolean type used in bfd.  Too many systems define their own
+   versions of "boolean" for us to safely typedef a "boolean" of
+   our own.  Using an enum for "bfd_boolean" has its own set of
+   problems, with strange looking casts required to avoid warnings
+   on some older compilers.  Thus we just use an int.
+
+   General rule: Functions which are bfd_boolean return TRUE on
+   success and FALSE on failure (unless they're a predicate).  */
+
+typedef int bfd_boolean;
+#undef FALSE
+#undef TRUE
+#define FALSE 0
+#define TRUE 1
+
+#ifdef BFD64
+
+#ifndef BFD_HOST_64_BIT
+ #error No 64 bit integer type available
+#endif /* ! defined (BFD_HOST_64_BIT) */
+
+typedef BFD_HOST_U_64_BIT bfd_vma;
+typedef BFD_HOST_64_BIT bfd_signed_vma;
+typedef BFD_HOST_U_64_BIT bfd_size_type;
+typedef BFD_HOST_U_64_BIT symvalue;
+
+#if BFD_HOST_64BIT_LONG
+#define BFD_VMA_FMT "l"
+#elif defined (__MSVCRT__)
+#define BFD_VMA_FMT "I64"
+#else
+#define BFD_VMA_FMT "ll"
+#endif
+
+#ifndef fprintf_vma
+#define sprintf_vma(s,x) sprintf (s, "%016" BFD_VMA_FMT "x", x)
+#define fprintf_vma(f,x) fprintf (f, "%016" BFD_VMA_FMT "x", x)
+#endif
+
+#else /* not BFD64  */
+
+/* Represent a target address.  Also used as a generic unsigned type
+   which is guaranteed to be big enough to hold any arithmetic types
+   we need to deal with.  */
+typedef unsigned long bfd_vma;
+
+/* A generic signed type which is guaranteed to be big enough to hold any
+   arithmetic types we need to deal with.  Can be assumed to be compatible
+   with bfd_vma in the same way that signed and unsigned ints are compatible
+   (as parameters, in assignment, etc).  */
+typedef long bfd_signed_vma;
+
+typedef unsigned long symvalue;
+typedef unsigned long bfd_size_type;
+
+/* Print a bfd_vma x on stream s.  */
+#define BFD_VMA_FMT "l"
+#define fprintf_vma(s,x) fprintf (s, "%08" BFD_VMA_FMT "x", x)
+#define sprintf_vma(s,x) sprintf (s, "%08" BFD_VMA_FMT "x", x)
+
+#endif /* not BFD64  */
+
+#define HALF_BFD_SIZE_TYPE \
+  (((bfd_size_type) 1) << (8 * sizeof (bfd_size_type) / 2))  /* 1 shifted left by half the bit width of bfd_size_type, counting 8 bits per byte.  */
+
+#ifndef BFD_HOST_64_BIT
+/* Fall back on a 32 bit type.  The idea is to make these types always
+   available for function return types, but in the case that
+   BFD_HOST_64_BIT is undefined such a function should abort or
+   otherwise signal an error.  */
+typedef bfd_signed_vma bfd_int64_t;
+typedef bfd_vma bfd_uint64_t;
+#endif
+
+/* An offset into a file.  BFD always uses the largest possible offset
+   based on the build time availability of fseek, fseeko, or fseeko64.  */
+typedef BFD_HOST_64_BIT file_ptr;
+typedef unsigned BFD_HOST_64_BIT ufile_ptr;
+
+extern void bfd_sprintf_vma (bfd *, char *, bfd_vma);
+extern void bfd_fprintf_vma (bfd *, void *, bfd_vma);
+
+#define printf_vma(x) fprintf_vma(stdout,x)
+#define bfd_printf_vma(abfd,x) bfd_fprintf_vma (abfd,stdout,x)
+
+typedef unsigned int flagword;	/* 32 bits of flags */
+typedef unsigned char bfd_byte;
+
+/* File formats.  */
+
+typedef enum bfd_format
+{
+  bfd_unknown = 0,	/* File format is unknown.  */
+  bfd_object,		/* Linker/assembler/compiler output.  */
+  bfd_archive,		/* Object archive file.  */
+  bfd_core,		/* Core dump.  */
+  bfd_type_end		/* Marks the end; don't use it!  */
+}
+bfd_format;
+
+/* Symbols and relocation.  */
+
+/* A count of carsyms (canonical archive symbols).  */
+typedef unsigned long symindex;
+
+/* How to perform a relocation.  */
+typedef const struct reloc_howto_struct reloc_howto_type;
+
+#define BFD_NO_MORE_SYMBOLS ((symindex) ~0)
+
+/* General purpose part of a symbol X;
+   target specific parts are in libcoff.h, libaout.h, etc.  */
+
+#define bfd_get_section(x) ((x)->section)
+#define bfd_get_output_section(x) ((x)->section->output_section)
+#define bfd_set_section(x,y) ((x)->section = (y))  /* Set X's section to Y; parenthesized so the expansion is one expression.  */
+#define bfd_asymbol_base(x) ((x)->section->vma)
+#define bfd_asymbol_value(x) (bfd_asymbol_base(x) + (x)->value)
+#define bfd_asymbol_name(x) ((x)->name)
+/*Perhaps future: #define bfd_asymbol_bfd(x) ((x)->section->owner)*/
+#define bfd_asymbol_bfd(x) ((x)->the_bfd)
+#define bfd_asymbol_flavour(x)			\
+  (((x)->flags & BSF_SYNTHETIC) != 0		\
+   ? bfd_target_unknown_flavour			\
+   : bfd_asymbol_bfd (x)->xvec->flavour)
+
+/* A canonical archive symbol.  */
+/* This is a type pun with struct ranlib on purpose!  */
+typedef struct carsym
+{
+  char *name;
+  file_ptr file_offset;	/* Look here to find the file.  */
+}
+carsym;			/* To make these you call a carsymogen.  */
+
+/* Used in generating armaps (archive tables of contents).
+   Perhaps just a forward definition would do?  */
+struct orl 			/* Output ranlib.  */
+{
+  char **name;		/* Symbol name.  */
+  union
+  {
+    file_ptr pos;
+    bfd *abfd;
+  } u;			/* bfd* or file position.  */
+  int namidx;		/* Index into string table.  */
+};
+
+/* Linenumber stuff.  */
+typedef struct lineno_cache_entry
+{
+  unsigned int line_number;	/* Linenumber from start of function.  */
+  union
+  {
+    struct bfd_symbol *sym;	/* Function name.  */
+    bfd_vma offset;	    		/* Offset into section.  */
+  } u;
+}
+alent;
+
+/* Object and core file sections.  */
+typedef struct bfd_section *sec_ptr;
+
+#define	align_power(addr, align)	\
+  (((addr) + ((bfd_vma) 1 << (align)) - 1) & (-((bfd_vma) 1 << (align))))  /* Round ADDR up to a multiple of 2**ALIGN.  */
+
+/* Align an address upward to a boundary, expressed as a number of bytes.
+   E.g. align to an 8-byte boundary with argument of 8.  Take care never
+   to wrap around if the address is within boundary-1 of the end of the
+   address space.  */
+#define BFD_ALIGN(this, boundary)					  \
+  ((((bfd_vma) (this) + (boundary) - 1) >= (bfd_vma) (this))		  \
+   ? (((bfd_vma) (this) + ((boundary) - 1)) & ~ (bfd_vma) ((boundary)-1)) \
+   : ~ (bfd_vma) 0)
+
+#define bfd_get_section_name(bfd, ptr) ((void) (bfd), (ptr)->name)  /* bfd_get_section_*: BFD is evaluated and discarded.  */
+#define bfd_get_section_vma(bfd, ptr) ((void) (bfd), (ptr)->vma)
+#define bfd_get_section_lma(bfd, ptr) ((void) (bfd), (ptr)->lma)
+#define bfd_get_section_alignment(bfd, ptr) ((void) (bfd), \
+					     (ptr)->alignment_power)
+#define bfd_section_name(bfd, ptr) ((ptr)->name)  /* bfd_section_*: BFD does not appear in the expansion.  */
+#define bfd_section_size(bfd, ptr) ((ptr)->size)
+#define bfd_get_section_size(ptr) ((ptr)->size)
+#define bfd_section_vma(bfd, ptr) ((ptr)->vma)
+#define bfd_section_lma(bfd, ptr) ((ptr)->lma)
+#define bfd_section_alignment(bfd, ptr) ((ptr)->alignment_power)
+#define bfd_get_section_flags(bfd, ptr) ((void) (bfd), (ptr)->flags)
+#define bfd_get_section_userdata(bfd, ptr) ((void) (bfd), (ptr)->userdata)
+
+#define bfd_is_com_section(ptr) (((ptr)->flags & SEC_IS_COMMON) != 0)
+
+#define bfd_get_section_limit_octets(bfd, sec)			\
+  ((bfd)->direction != write_direction && (sec)->rawsize != 0	\
+   ? (sec)->rawsize : (sec)->size)
+
+/* Find the address one past the end of SEC.  */
+#define bfd_get_section_limit(bfd, sec) \
+  (bfd_get_section_limit_octets(bfd, sec) / bfd_octets_per_byte (bfd))
+
+/* Return TRUE if input section SEC has been discarded.  */
+#define discarded_section(sec)				\
+  (!bfd_is_abs_section (sec)					\
+   && bfd_is_abs_section ((sec)->output_section)		\
+   && (sec)->sec_info_type != SEC_INFO_TYPE_MERGE		\
+   && (sec)->sec_info_type != SEC_INFO_TYPE_JUST_SYMS)
+
+typedef enum bfd_print_symbol
+{
+  bfd_print_symbol_name,
+  bfd_print_symbol_more,
+  bfd_print_symbol_all
+} bfd_print_symbol_type;
+
+/* Information about a symbol that nm needs.  */
+
+typedef struct _symbol_info
+{
+  symvalue value;
+  char type;
+  const char *name;            /* Symbol name.  */
+  unsigned char stab_type;     /* Stab type.  */
+  char stab_other;             /* Stab other.  */
+  short stab_desc;             /* Stab desc.  */
+  const char *stab_name;       /* String for stab type.  */
+} symbol_info;
+
+/* Get the name of a stabs type code.  */
+
+extern const char *bfd_get_stab_name (int);
+
+/* Hash table routines.  A table is released with bfd_hash_table_free.  */
+
+/* An element in the hash table.  Most uses will actually use a larger
+   structure, and an instance of this will be the first field.  */
+
+struct bfd_hash_entry
+{
+  /* Next entry for this hash code.  */
+  struct bfd_hash_entry *next;
+  /* String being hashed.  */
+  const char *string;
+  /* Hash code.  This is the full hash code, not the index into the
+     table.  */
+  unsigned long hash;
+};
+
+/* A hash table.  */
+
+struct bfd_hash_table
+{
+  /* The hash array.  */
+  struct bfd_hash_entry **table;
+  /* A function used to create new elements in the hash table.  The
+     first entry is itself a pointer to an element.  When this
+     function is first invoked, this pointer will be NULL.  However,
+     having the pointer permits a hierarchy of method functions to be
+     built each of which calls the function in the superclass.  Thus
+     each function should be written to allocate a new block of memory
+     only if the argument is NULL.  */
+  struct bfd_hash_entry *(*newfunc)
+    (struct bfd_hash_entry *, struct bfd_hash_table *, const char *);
+   /* An objalloc for this hash table.  This is a struct objalloc *,
+     but we use void * to avoid requiring the inclusion of objalloc.h.  */
+  void *memory;
+  /* The number of slots in the hash table.  */
+  unsigned int size;
+  /* The number of entries in the hash table.  */
+  unsigned int count;
+  /* The size of elements.  */
+  unsigned int entsize;
+  /* If non-zero, don't grow the hash table.  */
+  unsigned int frozen:1;
+};
+
+/* Initialize a hash table.  */
+extern bfd_boolean bfd_hash_table_init
+  (struct bfd_hash_table *,
+   struct bfd_hash_entry *(*) (struct bfd_hash_entry *,
+			       struct bfd_hash_table *,
+			       const char *),
+   unsigned int);
+
+/* Initialize a hash table specifying a size.  */
+extern bfd_boolean bfd_hash_table_init_n
+  (struct bfd_hash_table *,
+   struct bfd_hash_entry *(*) (struct bfd_hash_entry *,
+			       struct bfd_hash_table *,
+			       const char *),
+   unsigned int, unsigned int);
+
+/* Free up a hash table.  */
+extern void bfd_hash_table_free
+  (struct bfd_hash_table *);
+
+/* Look up a string in a hash table.  If CREATE is TRUE, a new entry
+   will be created for this string if one does not already exist.  The
+   COPY argument must be TRUE if this routine should copy the string
+   into newly allocated memory when adding an entry.  */
+extern struct bfd_hash_entry *bfd_hash_lookup
+  (struct bfd_hash_table *, const char *, bfd_boolean create,
+   bfd_boolean copy);
+
+/* Insert an entry in a hash table.  */
+extern struct bfd_hash_entry *bfd_hash_insert
+  (struct bfd_hash_table *, const char *, unsigned long);
+
+/* Rename an entry in a hash table.  */
+extern void bfd_hash_rename
+  (struct bfd_hash_table *, const char *, struct bfd_hash_entry *);
+
+/* Replace an entry in a hash table.  */
+extern void bfd_hash_replace
+  (struct bfd_hash_table *, struct bfd_hash_entry *old,
+   struct bfd_hash_entry *nw);
+
+/* Base method for creating a hash table entry.  */
+extern struct bfd_hash_entry *bfd_hash_newfunc
+  (struct bfd_hash_entry *, struct bfd_hash_table *, const char *);
+
+/* Grab some space for a hash table entry.  */
+extern void *bfd_hash_allocate
+  (struct bfd_hash_table *, unsigned int);
+
+/* Traverse a hash table in a random order, calling a function on each
+   element.  If the function returns FALSE, the traversal stops.  The
+   INFO argument is passed to the function.  */
+extern void bfd_hash_traverse
+  (struct bfd_hash_table *,
+   bfd_boolean (*) (struct bfd_hash_entry *, void *),
+   void *info);
+
+/* Allows the default size of a hash table to be configured. New hash
+   tables allocated using bfd_hash_table_init will be created with
+   this size.  */
+extern unsigned long bfd_hash_set_default_size (unsigned long);
+
+/* Types of compressed DWARF debug sections.  We currently support
+   zlib.  */
+enum compressed_debug_section_type
+{
+  COMPRESS_DEBUG_NONE = 0,	/* No bits set.  */
+  COMPRESS_DEBUG = 1 << 0,	/* Bit 0; also set in both ZLIB values below.  */
+  COMPRESS_DEBUG_GNU_ZLIB = COMPRESS_DEBUG | 1 << 1,	/* Bits 0 and 1 (value 3).  */
+  COMPRESS_DEBUG_GABI_ZLIB = COMPRESS_DEBUG | 1 << 2	/* Bits 0 and 2 (value 5).  */
+};
+
+/* This structure is used to keep track of stabs in sections
+   information while linking.  */
+
+struct stab_info
+{
+  /* A hash table used to hold stabs strings.  */
+  struct bfd_strtab_hash *strings;
+  /* The header file hash table.  */
+  struct bfd_hash_table includes;
+  /* The first .stabstr section.  */
+  struct bfd_section *stabstr;
+};
+
+#define COFF_SWAP_TABLE ((void *) &bfd_coff_std_swap_table)  /* Address of bfd_coff_std_swap_table as a void *; parenthesized for safe use in expressions.  */
+
+/* User program access to BFD facilities.  */
+
+/* Direct I/O routines, for programs which know more about the object
+   file than BFD does.  Use higher level routines if possible.  */
+
+extern bfd_size_type bfd_bread (void *, bfd_size_type, bfd *);
+extern bfd_size_type bfd_bwrite (const void *, bfd_size_type, bfd *);
+extern int bfd_seek (bfd *, file_ptr, int);
+extern file_ptr bfd_tell (bfd *);
+extern int bfd_flush (bfd *);
+extern int bfd_stat (bfd *, struct stat *);
+
+/* Deprecated old routines.  */
+#if __GNUC__  /* GNU C: pass the call site (__FILE__, __LINE__, __FUNCTION__) to warn_deprecated.  */
+#define bfd_read(BUF, ELTSIZE, NITEMS, ABFD)				\
+  (warn_deprecated ("bfd_read", __FILE__, __LINE__, __FUNCTION__),	\
+   bfd_bread ((BUF), (ELTSIZE) * (NITEMS), (ABFD)))
+#define bfd_write(BUF, ELTSIZE, NITEMS, ABFD)				\
+  (warn_deprecated ("bfd_write", __FILE__, __LINE__, __FUNCTION__),	\
+   bfd_bwrite ((BUF), (ELTSIZE) * (NITEMS), (ABFD)))
+#else  /* Otherwise: pass null pointers and 0 in place of the call-site details.  */
+#define bfd_read(BUF, ELTSIZE, NITEMS, ABFD)				\
+  (warn_deprecated ("bfd_read", (const char *) 0, 0, (const char *) 0), \
+   bfd_bread ((BUF), (ELTSIZE) * (NITEMS), (ABFD)))
+#define bfd_write(BUF, ELTSIZE, NITEMS, ABFD)				\
+  (warn_deprecated ("bfd_write", (const char *) 0, 0, (const char *) 0),\
+   bfd_bwrite ((BUF), (ELTSIZE) * (NITEMS), (ABFD)))
+#endif  /* __GNUC__ */
+extern void warn_deprecated (const char *, const char *, int, const char *);
+
+/* Cast from const char * to char * so that caller can assign to
+   a char * without a warning.  */
+#define bfd_get_filename(abfd) ((char *) (abfd)->filename)
+#define bfd_get_cacheable(abfd) ((abfd)->cacheable)
+#define bfd_get_format(abfd) ((abfd)->format)
+#define bfd_get_target(abfd) ((abfd)->xvec->name)
+#define bfd_get_flavour(abfd) ((abfd)->xvec->flavour)
+#define bfd_family_coff(abfd) \
+  (bfd_get_flavour (abfd) == bfd_target_coff_flavour || \
+   bfd_get_flavour (abfd) == bfd_target_xcoff_flavour)
+#define bfd_big_endian(abfd) ((abfd)->xvec->byteorder == BFD_ENDIAN_BIG)
+#define bfd_little_endian(abfd) ((abfd)->xvec->byteorder == BFD_ENDIAN_LITTLE)
+#define bfd_header_big_endian(abfd) \
+  ((abfd)->xvec->header_byteorder == BFD_ENDIAN_BIG)
+#define bfd_header_little_endian(abfd) \
+  ((abfd)->xvec->header_byteorder == BFD_ENDIAN_LITTLE)
+#define bfd_get_file_flags(abfd) ((abfd)->flags)
+#define bfd_applicable_file_flags(abfd) ((abfd)->xvec->object_flags)
+#define bfd_applicable_section_flags(abfd) ((abfd)->xvec->section_flags)
+#define bfd_has_map(abfd) ((abfd)->has_armap)
+#define bfd_is_thin_archive(abfd) ((abfd)->is_thin_archive)
+
+#define bfd_valid_reloc_types(abfd) ((abfd)->xvec->valid_reloc_types)
+#define bfd_usrdata(abfd) ((abfd)->usrdata)
+
+#define bfd_get_start_address(abfd) ((abfd)->start_address)
+#define bfd_get_symcount(abfd) ((abfd)->symcount)
+#define bfd_get_outsymbols(abfd) ((abfd)->outsymbols)
+#define bfd_count_sections(abfd) ((abfd)->section_count)
+
+#define bfd_get_dynamic_symcount(abfd) ((abfd)->dynsymcount)
+
+#define bfd_get_symbol_leading_char(abfd) ((abfd)->xvec->symbol_leading_char)
+
+extern bfd_boolean bfd_cache_close
+  (bfd *abfd);
+/* NB: This declaration should match the autogenerated one in libbfd.h.  */
+
+extern bfd_boolean bfd_cache_close_all (void);
+
+extern bfd_boolean bfd_record_phdr
+  (bfd *, unsigned long, bfd_boolean, flagword, bfd_boolean, bfd_vma,
+   bfd_boolean, bfd_boolean, unsigned int, struct bfd_section **);
+
+/* Byte swapping routines.  */
+
+bfd_uint64_t bfd_getb64 (const void *);
+bfd_uint64_t bfd_getl64 (const void *);
+bfd_int64_t bfd_getb_signed_64 (const void *);
+bfd_int64_t bfd_getl_signed_64 (const void *);
+bfd_vma bfd_getb32 (const void *);
+bfd_vma bfd_getl32 (const void *);
+bfd_signed_vma bfd_getb_signed_32 (const void *);
+bfd_signed_vma bfd_getl_signed_32 (const void *);
+bfd_vma bfd_getb16 (const void *);
+bfd_vma bfd_getl16 (const void *);
+bfd_signed_vma bfd_getb_signed_16 (const void *);
+bfd_signed_vma bfd_getl_signed_16 (const void *);
+void bfd_putb64 (bfd_uint64_t, void *);
+void bfd_putl64 (bfd_uint64_t, void *);
+void bfd_putb32 (bfd_vma, void *);
+void bfd_putl32 (bfd_vma, void *);
+void bfd_putb16 (bfd_vma, void *);
+void bfd_putl16 (bfd_vma, void *);
+
+/* Byte swapping routines which take size and endianness as arguments.  */
+
+bfd_uint64_t bfd_get_bits (const void *, int, bfd_boolean);
+void bfd_put_bits (bfd_uint64_t, void *, int, bfd_boolean);
+
+#if defined(__STDC__) || defined(ALMOST_STDC)
+struct ecoff_debug_info;
+struct ecoff_debug_swap;
+struct ecoff_extr;
+struct bfd_symbol;
+struct bfd_link_info;
+struct bfd_link_hash_entry;
+struct bfd_section_already_linked;
+struct bfd_elf_version_tree;
+#endif
+
+extern bfd_boolean bfd_section_already_linked_table_init (void);
+extern void bfd_section_already_linked_table_free (void);
+extern bfd_boolean _bfd_handle_already_linked
+  (struct bfd_section *, struct bfd_section_already_linked *,
+   struct bfd_link_info *);
+
+/* Externally visible ECOFF routines.  */
+
+extern bfd_vma bfd_ecoff_get_gp_value
+  (bfd * abfd);
+extern bfd_boolean bfd_ecoff_set_gp_value
+  (bfd *abfd, bfd_vma gp_value);
+extern bfd_boolean bfd_ecoff_set_regmasks
+  (bfd *abfd, unsigned long gprmask, unsigned long fprmask,
+   unsigned long *cprmask);
+extern void *bfd_ecoff_debug_init
+  (bfd *output_bfd, struct ecoff_debug_info *output_debug,
+   const struct ecoff_debug_swap *output_swap, struct bfd_link_info *);
+extern void bfd_ecoff_debug_free
+  (void *handle, bfd *output_bfd, struct ecoff_debug_info *output_debug,
+   const struct ecoff_debug_swap *output_swap, struct bfd_link_info *);
+extern bfd_boolean bfd_ecoff_debug_accumulate
+  (void *handle, bfd *output_bfd, struct ecoff_debug_info *output_debug,
+   const struct ecoff_debug_swap *output_swap, bfd *input_bfd,
+   struct ecoff_debug_info *input_debug,
+   const struct ecoff_debug_swap *input_swap, struct bfd_link_info *);
+extern bfd_boolean bfd_ecoff_debug_accumulate_other
+  (void *handle, bfd *output_bfd, struct ecoff_debug_info *output_debug,
+   const struct ecoff_debug_swap *output_swap, bfd *input_bfd,
+   struct bfd_link_info *);
+extern bfd_boolean bfd_ecoff_debug_externals
+  (bfd *abfd, struct ecoff_debug_info *debug,
+   const struct ecoff_debug_swap *swap, bfd_boolean relocatable,
+   bfd_boolean (*get_extr) (struct bfd_symbol *, struct ecoff_extr *),
+   void (*set_index) (struct bfd_symbol *, bfd_size_type));
+extern bfd_boolean bfd_ecoff_debug_one_external
+  (bfd *abfd, struct ecoff_debug_info *debug,
+   const struct ecoff_debug_swap *swap, const char *name,
+   struct ecoff_extr *esym);
+extern bfd_size_type bfd_ecoff_debug_size
+  (bfd *abfd, struct ecoff_debug_info *debug,
+   const struct ecoff_debug_swap *swap);
+extern bfd_boolean bfd_ecoff_write_debug
+  (bfd *abfd, struct ecoff_debug_info *debug,
+   const struct ecoff_debug_swap *swap, file_ptr where);
+extern bfd_boolean bfd_ecoff_write_accumulated_debug
+  (void *handle, bfd *abfd, struct ecoff_debug_info *debug,
+   const struct ecoff_debug_swap *swap,
+   struct bfd_link_info *info, file_ptr where);
+
+/* Externally visible ELF routines.  */
+
+struct bfd_link_needed_list
+{
+  struct bfd_link_needed_list *next;
+  bfd *by;
+  const char *name;
+};
+
+enum dynamic_lib_link_class {	/* Non-zero values are distinct single bits -- presumably OR-able flags; TODO confirm against callers.  */
+  DYN_NORMAL = 0,
+  DYN_AS_NEEDED = 1,
+  DYN_DT_NEEDED = 2,
+  DYN_NO_ADD_NEEDED = 4,
+  DYN_NO_NEEDED = 8
+};
+
+enum notice_asneeded_action {
+  notice_as_needed,
+  notice_not_needed,
+  notice_needed
+};
+
+extern bfd_boolean bfd_elf_record_link_assignment
+  (bfd *, struct bfd_link_info *, const char *, bfd_boolean,
+   bfd_boolean);
+extern struct bfd_link_needed_list *bfd_elf_get_needed_list
+  (bfd *, struct bfd_link_info *);
+extern bfd_boolean bfd_elf_get_bfd_needed_list
+  (bfd *, struct bfd_link_needed_list **);
+extern bfd_boolean bfd_elf_stack_segment_size (bfd *, struct bfd_link_info *,
+					       const char *, bfd_vma);
+extern bfd_boolean bfd_elf_size_dynamic_sections
+  (bfd *, const char *, const char *, const char *, const char *, const char *,
+   const char * const *, struct bfd_link_info *, struct bfd_section **);
+extern bfd_boolean bfd_elf_size_dynsym_hash_dynstr
+  (bfd *, struct bfd_link_info *);
+extern void bfd_elf_set_dt_needed_name
+  (bfd *, const char *);
+extern const char *bfd_elf_get_dt_soname
+  (bfd *);
+extern void bfd_elf_set_dyn_lib_class
+  (bfd *, enum dynamic_lib_link_class);
+extern int bfd_elf_get_dyn_lib_class
+  (bfd *);
+extern struct bfd_link_needed_list *bfd_elf_get_runpath_list
+  (bfd *, struct bfd_link_info *);
+extern int bfd_elf_discard_info
+  (bfd *, struct bfd_link_info *);
+extern unsigned int _bfd_elf_default_action_discarded
+  (struct bfd_section *);
+
+/* Return an upper bound on the number of bytes required to store a
+   copy of ABFD's program header table entries.  Return -1 if an error
+   occurs; bfd_get_error will return an appropriate code.  */
+extern long bfd_get_elf_phdr_upper_bound
+  (bfd *abfd);
+
+/* Copy ABFD's program header table entries to *PHDRS.  The entries
+   will be stored as an array of Elf_Internal_Phdr structures, as
+   defined in include/elf/internal.h.  To find out how large the
+   buffer needs to be, call bfd_get_elf_phdr_upper_bound.
+
+   Return the number of program header table entries read, or -1 if an
+   error occurs; bfd_get_error will return an appropriate code.  */
+extern int bfd_get_elf_phdrs
+  (bfd *abfd, void *phdrs);
+
+/* Create a new BFD as if by bfd_openr.  Rather than opening a file,
+   reconstruct an ELF file by reading the segments out of remote
+   memory based on the ELF file header at EHDR_VMA and the ELF program
+   headers it points to.  If non-zero, SIZE is the known extent of the
+   object.  If not null, *LOADBASEP is filled in with the difference
+   between the VMAs from which the segments were read, and the VMAs
+   the file headers (and hence BFD's idea of each section's VMA) put
+   them at.
+
+   The function TARGET_READ_MEMORY is called to copy LEN bytes from
+   the remote memory at target address VMA into the local buffer at
+   MYADDR; it should return zero on success or an `errno' code on
+   failure.  TEMPL must be a BFD for a target with the word size and
+   byte order found in the remote memory.  */
+extern bfd *bfd_elf_bfd_from_remote_memory
+  (bfd *templ, bfd_vma ehdr_vma, bfd_size_type size, bfd_vma *loadbasep,
+   int (*target_read_memory) (bfd_vma vma, bfd_byte *myaddr,
+			      bfd_size_type len));
+
+extern struct bfd_section *_bfd_elf_tls_setup
+  (bfd *, struct bfd_link_info *);
+
+extern struct bfd_section *
+_bfd_nearby_section (bfd *, struct bfd_section *, bfd_vma);
+
+extern void _bfd_fix_excluded_sec_syms
+  (bfd *, struct bfd_link_info *);
+
+extern unsigned bfd_m68k_mach_to_features (int);
+
+extern int bfd_m68k_features_to_mach (unsigned);
+
+extern bfd_boolean bfd_m68k_elf32_create_embedded_relocs
+  (bfd *, struct bfd_link_info *, struct bfd_section *, struct bfd_section *,
+   char **);
+
+extern void bfd_elf_m68k_set_target_options (struct bfd_link_info *, int);
+
+extern bfd_boolean bfd_bfin_elf32_create_embedded_relocs
+  (bfd *, struct bfd_link_info *, struct bfd_section *, struct bfd_section *,
+   char **);
+
+extern bfd_boolean bfd_cr16_elf32_create_embedded_relocs
+  (bfd *, struct bfd_link_info *, struct bfd_section *, struct bfd_section *,
+   char **);
+
+/* SunOS shared library support routines for the linker.  */
+
+extern struct bfd_link_needed_list *bfd_sunos_get_needed_list
+  (bfd *, struct bfd_link_info *);
+extern bfd_boolean bfd_sunos_record_link_assignment
+  (bfd *, struct bfd_link_info *, const char *);
+extern bfd_boolean bfd_sunos_size_dynamic_sections
+  (bfd *, struct bfd_link_info *, struct bfd_section **,
+   struct bfd_section **, struct bfd_section **);
+
+/* Linux shared library support routines for the linker.  */
+
+extern bfd_boolean bfd_i386linux_size_dynamic_sections
+  (bfd *, struct bfd_link_info *);
+extern bfd_boolean bfd_m68klinux_size_dynamic_sections
+  (bfd *, struct bfd_link_info *);
+extern bfd_boolean bfd_sparclinux_size_dynamic_sections
+  (bfd *, struct bfd_link_info *);
+
+/* mmap hacks */
+
+struct _bfd_window_internal;
+typedef struct _bfd_window_internal bfd_window_internal;
+
+typedef struct _bfd_window
+{
+  /* What the user asked for.  */
+  void *data;
+  bfd_size_type size;
+  /* The actual window used by BFD.  Small user-requested read-only
+     regions sharing a page may share a single window into the object
+     file.  Read-write versions shouldn't until I've fixed things to
+     keep track of which portions have been claimed by the
+     application; don't want to give the same region back when the
+     application wants two writable copies!  */
+  struct _bfd_window_internal *i;
+}
+bfd_window;
+
+extern void bfd_init_window
+  (bfd_window *);
+extern void bfd_free_window
+  (bfd_window *);
+extern bfd_boolean bfd_get_file_window
+  (bfd *, file_ptr, bfd_size_type, bfd_window *, bfd_boolean);
+
+/* XCOFF support routines for the linker.  */
+
+extern bfd_boolean bfd_xcoff_split_import_path
+  (bfd *, const char *, const char **, const char **);
+extern bfd_boolean bfd_xcoff_set_archive_import_path
+  (struct bfd_link_info *, bfd *, const char *);
+extern bfd_boolean bfd_xcoff_link_record_set
+  (bfd *, struct bfd_link_info *, struct bfd_link_hash_entry *, bfd_size_type);
+extern bfd_boolean bfd_xcoff_import_symbol
+  (bfd *, struct bfd_link_info *, struct bfd_link_hash_entry *, bfd_vma,
+   const char *, const char *, const char *, unsigned int);
+extern bfd_boolean bfd_xcoff_export_symbol
+  (bfd *, struct bfd_link_info *, struct bfd_link_hash_entry *);
+extern bfd_boolean bfd_xcoff_link_count_reloc
+  (bfd *, struct bfd_link_info *, const char *);
+extern bfd_boolean bfd_xcoff_record_link_assignment
+  (bfd *, struct bfd_link_info *, const char *);
+extern bfd_boolean bfd_xcoff_size_dynamic_sections
+  (bfd *, struct bfd_link_info *, const char *, const char *,
+   unsigned long, unsigned long, unsigned long, bfd_boolean,
+   int, bfd_boolean, unsigned int, struct bfd_section **, bfd_boolean);
+extern bfd_boolean bfd_xcoff_link_generate_rtinit
+  (bfd *, const char *, const char *, bfd_boolean);
+
+/* XCOFF support routines for ar.  */
+extern bfd_boolean bfd_xcoff_ar_archive_set_magic
+  (bfd *, char *);
+
+/* Externally visible COFF routines.  */
+
+#if defined(__STDC__) || defined(ALMOST_STDC)
+struct internal_syment;
+union internal_auxent;
+#endif
+
+extern bfd_boolean bfd_coff_set_symbol_class
+  (bfd *, struct bfd_symbol *, unsigned int);
+
+extern bfd_boolean bfd_m68k_coff_create_embedded_relocs
+  (bfd *, struct bfd_link_info *, struct bfd_section *, struct bfd_section *, char **);
+
+/* ARM VFP11 erratum workaround support.  */
+typedef enum
+{
+  BFD_ARM_VFP11_FIX_DEFAULT,
+  BFD_ARM_VFP11_FIX_NONE,
+  BFD_ARM_VFP11_FIX_SCALAR,
+  BFD_ARM_VFP11_FIX_VECTOR
+} bfd_arm_vfp11_fix;
+
+extern void bfd_elf32_arm_init_maps
+  (bfd *);
+
+extern void bfd_elf32_arm_set_vfp11_fix
+  (bfd *, struct bfd_link_info *);
+
+extern void bfd_elf32_arm_set_cortex_a8_fix
+  (bfd *, struct bfd_link_info *);
+
+extern bfd_boolean bfd_elf32_arm_vfp11_erratum_scan
+  (bfd *, struct bfd_link_info *);
+
+extern void bfd_elf32_arm_vfp11_fix_veneer_locations
+  (bfd *, struct bfd_link_info *);
+
+/* ARM STM32L4XX erratum workaround support.  Type of elf32_arm_params.stm32l4xx_fix.  */
+typedef enum
+{
+  BFD_ARM_STM32L4XX_FIX_NONE,     /* 0; here NONE, not DEFAULT, is the zero value.  */
+  BFD_ARM_STM32L4XX_FIX_DEFAULT,  /* 1 */
+  BFD_ARM_STM32L4XX_FIX_ALL       /* 2 */
+} bfd_arm_stm32l4xx_fix;
+
+extern void bfd_elf32_arm_set_stm32l4xx_fix
+  (bfd *, struct bfd_link_info *);
+
+extern bfd_boolean bfd_elf32_arm_stm32l4xx_erratum_scan
+  (bfd *, struct bfd_link_info *);
+
+extern void bfd_elf32_arm_stm32l4xx_fix_veneer_locations
+  (bfd *, struct bfd_link_info *);
+
+/* ARM Interworking support.  Called from linker.  */
+extern bfd_boolean bfd_arm_allocate_interworking_sections
+  (struct bfd_link_info *);
+
+extern bfd_boolean bfd_arm_process_before_allocation
+  (bfd *, struct bfd_link_info *, int);
+
+extern bfd_boolean bfd_arm_get_bfd_for_interworking
+  (bfd *, struct bfd_link_info *);
+
+/* PE ARM Interworking support.  Called from linker.  */
+extern bfd_boolean bfd_arm_pe_allocate_interworking_sections
+  (struct bfd_link_info *);
+
+extern bfd_boolean bfd_arm_pe_process_before_allocation
+  (bfd *, struct bfd_link_info *, int);
+
+extern bfd_boolean bfd_arm_pe_get_bfd_for_interworking
+  (bfd *, struct bfd_link_info *);
+
+/* ELF ARM Interworking support.  Called from linker.  */
+extern bfd_boolean bfd_elf32_arm_allocate_interworking_sections
+  (struct bfd_link_info *);
+
+extern bfd_boolean bfd_elf32_arm_process_before_allocation
+  (bfd *, struct bfd_link_info *);
+
+struct elf32_arm_params {  /* Argument block taken by bfd_elf32_arm_set_target_params.  */
+  char *thumb_entry_symbol;
+  int byteswap_code;  /* NOTE(review): this and the other int members look like on/off options -- confirm.  */
+  int target1_is_rel;
+  char * target2_type;
+  int fix_v4bx;
+  int use_blx;
+  bfd_arm_vfp11_fix vfp11_denorm_fix;    /* One of the BFD_ARM_VFP11_FIX_* enumerators.  */
+  bfd_arm_stm32l4xx_fix stm32l4xx_fix;   /* One of the BFD_ARM_STM32L4XX_FIX_* enumerators.  */
+  int no_enum_size_warning;
+  int no_wchar_size_warning;
+  int pic_veneer;
+  int fix_cortex_a8;
+  int fix_arm1176;
+  int merge_exidx_entries;
+  int cmse_implib;
+  bfd *in_implib_bfd;
+};
+
+void bfd_elf32_arm_set_target_params
+  (bfd *, struct bfd_link_info *, struct elf32_arm_params *);
+
+extern bfd_boolean bfd_elf32_arm_get_bfd_for_interworking
+  (bfd *, struct bfd_link_info *);
+
+extern bfd_boolean bfd_elf32_arm_add_glue_sections_to_bfd
+  (bfd *, struct bfd_link_info *);
+
+extern void bfd_elf32_arm_keep_private_stub_output_sections
+  (struct bfd_link_info *);
+
+/* ELF ARM mapping symbol support.  MAP, TAG and OTHER are single-bit flags; ANY (~0) has every bit set.  */
+#define BFD_ARM_SPECIAL_SYM_TYPE_MAP	(1 << 0)
+#define BFD_ARM_SPECIAL_SYM_TYPE_TAG	(1 << 1)
+#define BFD_ARM_SPECIAL_SYM_TYPE_OTHER  (1 << 2)
+#define BFD_ARM_SPECIAL_SYM_TYPE_ANY	(~0)
+
+extern bfd_boolean bfd_is_arm_special_symbol_name
+  (const char *, int);
+
+extern void bfd_elf32_arm_set_byteswap_code
+  (struct bfd_link_info *, int);
+
+extern void bfd_elf32_arm_use_long_plt (void);
+
+/* ARM Note section processing.  */
+extern bfd_boolean bfd_arm_merge_machines
+  (bfd *, bfd *);
+
+extern bfd_boolean bfd_arm_update_notes
+  (bfd *, const char *);
+
+extern unsigned int bfd_arm_get_mach_from_notes
+  (bfd *, const char *);
+
+/* ARM stub generation support.  Called from the linker.  */
+extern int elf32_arm_setup_section_lists
+  (bfd *, struct bfd_link_info *);
+extern void elf32_arm_next_input_section
+  (struct bfd_link_info *, struct bfd_section *);
+extern bfd_boolean elf32_arm_size_stubs
+  (bfd *, bfd *, struct bfd_link_info *, bfd_signed_vma,
+   struct bfd_section * (*) (const char *, struct bfd_section *,
+			     struct bfd_section *, unsigned int),
+   void (*) (void));
+extern bfd_boolean elf32_arm_build_stubs
+  (struct bfd_link_info *);
+
+/* ARM unwind section editing support.  */
+extern bfd_boolean elf32_arm_fix_exidx_coverage
+(struct bfd_section **, unsigned int, struct bfd_link_info *, bfd_boolean);
+
+/* C6x unwind section editing support.  */
+extern bfd_boolean elf32_tic6x_fix_exidx_coverage
+(struct bfd_section **, unsigned int, struct bfd_link_info *, bfd_boolean);
+
+extern void bfd_elf64_aarch64_init_maps
+  (bfd *);
+
+extern void bfd_elf32_aarch64_init_maps
+  (bfd *);
+
+extern void bfd_elf64_aarch64_set_options
+  (bfd *, struct bfd_link_info *, int, int, int, int, int, int);
+
+extern void bfd_elf32_aarch64_set_options
+  (bfd *, struct bfd_link_info *, int, int, int, int, int, int);
+
+/* ELF AArch64 mapping symbol support.  MAP, TAG and OTHER are single-bit flags; ANY (~0) has every bit set.  */
+#define BFD_AARCH64_SPECIAL_SYM_TYPE_MAP	(1 << 0)
+#define BFD_AARCH64_SPECIAL_SYM_TYPE_TAG	(1 << 1)
+#define BFD_AARCH64_SPECIAL_SYM_TYPE_OTHER	(1 << 2)
+#define BFD_AARCH64_SPECIAL_SYM_TYPE_ANY	(~0)
+extern bfd_boolean bfd_is_aarch64_special_symbol_name
+  (const char * name, int type);
+
+/* AArch64 stub generation support for ELF64.  Called from the linker.  */
+extern int elf64_aarch64_setup_section_lists
+  (bfd *, struct bfd_link_info *);
+extern void elf64_aarch64_next_input_section
+  (struct bfd_link_info *, struct bfd_section *);
+extern bfd_boolean elf64_aarch64_size_stubs
+  (bfd *, bfd *, struct bfd_link_info *, bfd_signed_vma,
+   struct bfd_section * (*) (const char *, struct bfd_section *),
+   void (*) (void));
+extern bfd_boolean elf64_aarch64_build_stubs
+  (struct bfd_link_info *);
+/* AArch64 stub generation support for ELF32.  Called from the linker.  */
+extern int elf32_aarch64_setup_section_lists
+  (bfd *, struct bfd_link_info *);
+extern void elf32_aarch64_next_input_section
+  (struct bfd_link_info *, struct bfd_section *);
+extern bfd_boolean elf32_aarch64_size_stubs
+  (bfd *, bfd *, struct bfd_link_info *, bfd_signed_vma,
+   struct bfd_section * (*) (const char *, struct bfd_section *),
+   void (*) (void));
+extern bfd_boolean elf32_aarch64_build_stubs
+  (struct bfd_link_info *);
+
+
+/* TI COFF load page support.  */
+extern void bfd_ticoff_set_section_load_page
+  (struct bfd_section *, int);
+
+extern int bfd_ticoff_get_section_load_page
+  (struct bfd_section *);
+
+/* H8/300 functions.  */
+extern bfd_vma bfd_h8300_pad_address
+  (bfd *, bfd_vma);
+
+/* IA64 Itanium code generation.  Called from linker.  */
+extern void bfd_elf32_ia64_after_parse
+  (int);
+
+extern void bfd_elf64_ia64_after_parse
+  (int);
+
+/* V850 Note manipulation routines.  */
+extern bfd_boolean v850_elf_create_sections
+  (struct bfd_link_info *);
+
+extern bfd_boolean v850_elf_set_note
+  (bfd *, unsigned int, unsigned int);
+
+/* MIPS ABI flags data access.  For the disassembler.  */
+struct elf_internal_abiflags_v0;
+extern struct elf_internal_abiflags_v0 *bfd_mips_elf_get_abiflags (bfd *);
+/* Extracted from init.c.  */
+void bfd_init (void);
+
+/* Extracted from opncls.c.  */
+/* Set to N to open the next N BFDs using an alternate id space.  */
+extern unsigned int bfd_use_reserved_id;
+bfd *bfd_fopen (const char *filename, const char *target,
+    const char *mode, int fd);
+
+bfd *bfd_openr (const char *filename, const char *target);
+
+bfd *bfd_fdopenr (const char *filename, const char *target, int fd);
+
+bfd *bfd_openstreamr (const char * filename, const char * target, void * stream);
+
+bfd *bfd_openr_iovec (const char *filename, const char *target,
+    void *(*open_func) (struct bfd *nbfd,
+    void *open_closure),
+    void *open_closure,
+    file_ptr (*pread_func) (struct bfd *nbfd,
+    void *stream,
+    void *buf,
+    file_ptr nbytes,
+    file_ptr offset),
+    int (*close_func) (struct bfd *nbfd,
+    void *stream),
+    int (*stat_func) (struct bfd *abfd,
+    void *stream,
+    struct stat *sb));
+
+bfd *bfd_openw (const char *filename, const char *target);
+
+bfd_boolean bfd_close (bfd *abfd);
+
+bfd_boolean bfd_close_all_done (bfd *);
+
+bfd *bfd_create (const char *filename, bfd *templ);
+
+bfd_boolean bfd_make_writable (bfd *abfd);
+
+bfd_boolean bfd_make_readable (bfd *abfd);
+
+void *bfd_alloc (bfd *abfd, bfd_size_type wanted);
+
+void *bfd_zalloc (bfd *abfd, bfd_size_type wanted);
+
+unsigned long bfd_calc_gnu_debuglink_crc32
+   (unsigned long crc, const unsigned char *buf, bfd_size_type len);
+
+char *bfd_get_debug_link_info (bfd *abfd, unsigned long *crc32_out);
+
+char *bfd_get_alt_debug_link_info (bfd * abfd,
+    bfd_size_type *buildid_len,
+    bfd_byte **buildid_out);
+
+char *bfd_follow_gnu_debuglink (bfd *abfd, const char *dir);
+
+char *bfd_follow_gnu_debugaltlink (bfd *abfd, const char *dir);
+
+struct bfd_section *bfd_create_gnu_debuglink_section
+   (bfd *abfd, const char *filename);
+
+bfd_boolean bfd_fill_in_gnu_debuglink_section
+   (bfd *abfd, struct bfd_section *sect, const char *filename);
+
+char *bfd_follow_build_id_debuglink (bfd *abfd, const char *dir);
+
+/* Extracted from libbfd.c.  */
+
+/* Byte swapping macros for user section data.  The 8-bit forms are plain
+   byte accesses; the wider forms expand to BFD_SEND, which is defined elsewhere.  */
+#define bfd_put_8(abfd, val, ptr) \
+  ((void) (*((unsigned char *) (ptr)) = (val) & 0xff)) /* Stores the low 8 bits of VAL; ABFD is not used.  */
+#define bfd_put_signed_8 \
+  bfd_put_8
+#define bfd_get_8(abfd, ptr) \
+  (*(const unsigned char *) (ptr) & 0xff) /* Yields 0..255; ABFD is not used.  */
+#define bfd_get_signed_8(abfd, ptr) \
+  (((*(const unsigned char *) (ptr) & 0xff) ^ 0x80) - 0x80) /* Sign-extends the byte: yields -128..127.  */
+
+#define bfd_put_16(abfd, val, ptr) \
+  BFD_SEND (abfd, bfd_putx16, ((val),(ptr)))
+#define bfd_put_signed_16 \
+  bfd_put_16
+#define bfd_get_16(abfd, ptr) \
+  BFD_SEND (abfd, bfd_getx16, (ptr))
+#define bfd_get_signed_16(abfd, ptr) \
+  BFD_SEND (abfd, bfd_getx_signed_16, (ptr))
+
+#define bfd_put_32(abfd, val, ptr) \
+  BFD_SEND (abfd, bfd_putx32, ((val),(ptr)))
+#define bfd_put_signed_32 \
+  bfd_put_32
+#define bfd_get_32(abfd, ptr) \
+  BFD_SEND (abfd, bfd_getx32, (ptr))
+#define bfd_get_signed_32(abfd, ptr) \
+  BFD_SEND (abfd, bfd_getx_signed_32, (ptr))
+
+#define bfd_put_64(abfd, val, ptr) \
+  BFD_SEND (abfd, bfd_putx64, ((val), (ptr)))
+#define bfd_put_signed_64 \
+  bfd_put_64
+#define bfd_get_64(abfd, ptr) \
+  BFD_SEND (abfd, bfd_getx64, (ptr))
+#define bfd_get_signed_64(abfd, ptr) \
+  BFD_SEND (abfd, bfd_getx_signed_64, (ptr))
+
+#define bfd_get(bits, abfd, ptr)                       \
+  ((bits) == 8 ? (bfd_vma) bfd_get_8 (abfd, ptr)       \
+   : (bits) == 16 ? bfd_get_16 (abfd, ptr)             \
+   : (bits) == 32 ? bfd_get_32 (abfd, ptr)             \
+   : (bits) == 64 ? bfd_get_64 (abfd, ptr)             \
+   : (abort (), (bfd_vma) - 1)) /* Widths other than 8, 16, 32 or 64 call abort ().  */
+
+#define bfd_put(bits, abfd, val, ptr)                  \
+  ((bits) == 8 ? bfd_put_8  (abfd, val, ptr)           \
+   : (bits) == 16 ? bfd_put_16 (abfd, val, ptr)                \
+   : (bits) == 32 ? bfd_put_32 (abfd, val, ptr)                \
+   : (bits) == 64 ? bfd_put_64 (abfd, val, ptr)                \
+   : (abort (), (void) 0)) /* Widths other than 8, 16, 32 or 64 call abort ().  */
+
+
+/* Byte swapping macros for file header data.  The 8-bit forms forward to
+   bfd_put_8 / bfd_get_8 / bfd_get_signed_8; wider forms expand to BFD_SEND.  */
+#define bfd_h_put_8(abfd, val, ptr) \
+  bfd_put_8 (abfd, val, ptr)
+#define bfd_h_put_signed_8(abfd, val, ptr) \
+  bfd_put_8 (abfd, val, ptr) /* Same expansion as the unsigned form.  */
+#define bfd_h_get_8(abfd, ptr) \
+  bfd_get_8 (abfd, ptr)
+#define bfd_h_get_signed_8(abfd, ptr) \
+  bfd_get_signed_8 (abfd, ptr)
+
+#define bfd_h_put_16(abfd, val, ptr) \
+  BFD_SEND (abfd, bfd_h_putx16, (val, ptr))
+#define bfd_h_put_signed_16 \
+  bfd_h_put_16
+#define bfd_h_get_16(abfd, ptr) \
+  BFD_SEND (abfd, bfd_h_getx16, (ptr))
+#define bfd_h_get_signed_16(abfd, ptr) \
+  BFD_SEND (abfd, bfd_h_getx_signed_16, (ptr))
+
+#define bfd_h_put_32(abfd, val, ptr) \
+  BFD_SEND (abfd, bfd_h_putx32, (val, ptr))
+#define bfd_h_put_signed_32 \
+  bfd_h_put_32
+#define bfd_h_get_32(abfd, ptr) \
+  BFD_SEND (abfd, bfd_h_getx32, (ptr))
+#define bfd_h_get_signed_32(abfd, ptr) \
+  BFD_SEND (abfd, bfd_h_getx_signed_32, (ptr))
+
+#define bfd_h_put_64(abfd, val, ptr) \
+  BFD_SEND (abfd, bfd_h_putx64, (val, ptr))
+#define bfd_h_put_signed_64 \
+  bfd_h_put_64
+#define bfd_h_get_64(abfd, ptr) \
+  BFD_SEND (abfd, bfd_h_getx64, (ptr))
+#define bfd_h_get_signed_64(abfd, ptr) \
+  BFD_SEND (abfd, bfd_h_getx_signed_64, (ptr))
+
+/* Aliases for the above, which should eventually go away.  H_PUT_n and
+   H_GET_n name bfd_h_put_n and bfd_h_get_n; the _Sn forms name the signed ones.  */
+#define H_PUT_64  bfd_h_put_64
+#define H_PUT_32  bfd_h_put_32
+#define H_PUT_16  bfd_h_put_16
+#define H_PUT_8   bfd_h_put_8
+#define H_PUT_S64 bfd_h_put_signed_64
+#define H_PUT_S32 bfd_h_put_signed_32
+#define H_PUT_S16 bfd_h_put_signed_16
+#define H_PUT_S8  bfd_h_put_signed_8
+#define H_GET_64  bfd_h_get_64
+#define H_GET_32  bfd_h_get_32
+#define H_GET_16  bfd_h_get_16
+#define H_GET_8   bfd_h_get_8
+#define H_GET_S64 bfd_h_get_signed_64
+#define H_GET_S32 bfd_h_get_signed_32
+#define H_GET_S16 bfd_h_get_signed_16
+#define H_GET_S8  bfd_h_get_signed_8
+
+
+/* Extracted from bfdio.c.  */
+long bfd_get_mtime (bfd *abfd);
+
+file_ptr bfd_get_size (bfd *abfd);
+
+void *bfd_mmap (bfd *abfd, void *addr, bfd_size_type len,
+    int prot, int flags, file_ptr offset,
+    void **map_addr, bfd_size_type *map_len);
+
+/* Extracted from bfdwin.c.  */
+/* Extracted from section.c.  */
+
+typedef struct bfd_section
+{
+  /* The name of the section; the name isn't a copy, the pointer is
+     the same as that passed to bfd_make_section.  */
+  const char *name;
+
+  /* A unique sequence number.  */
+  unsigned int id;
+
+  /* Which section in the bfd; 0..n-1 as sections are created in a bfd.  */
+  unsigned int index;
+
+  /* The next section in the list belonging to the BFD, or NULL.  */
+  struct bfd_section *next;
+
+  /* The previous section in the list belonging to the BFD, or NULL.  */
+  struct bfd_section *prev;
+
+  /* The field flags contains attributes of the section. Some
+     flags are read in from the object file, and some are
+     synthesized from other information.  */
+  flagword flags;
+
+#define SEC_NO_FLAGS   0x000
+
+  /* Tells the OS to allocate space for this section when loading.
+     This is clear for a section containing debug information only.  */
+#define SEC_ALLOC      0x001
+
+  /* Tells the OS to load the section from the file when loading.
+     This is clear for a .bss section.  */
+#define SEC_LOAD       0x002
+
+  /* The section contains data still to be relocated, so there is
+     some relocation information too.  */
+#define SEC_RELOC      0x004
+
+  /* A signal to the OS that the section contains read only data.  */
+#define SEC_READONLY   0x008
+
+  /* The section contains code only.  */
+#define SEC_CODE       0x010
+
+  /* The section contains data only.  */
+#define SEC_DATA       0x020
+
+  /* The section will reside in ROM.  */
+#define SEC_ROM        0x040
+
+  /* The section contains constructor information. This section
+     type is used by the linker to create lists of constructors and
+     destructors used by <<g++>>. When a back end sees a symbol
+     which should be used in a constructor list, it creates a new
+     section for the type of name (e.g., <<__CTOR_LIST__>>), attaches
+     the symbol to it, and builds a relocation. To build the lists
+     of constructors, all the linker has to do is catenate all the
+     sections called <<__CTOR_LIST__>> and relocate the data
+     contained within - exactly the operations it would perform on
+     standard data.  */
+#define SEC_CONSTRUCTOR 0x080
+
+  /* The section has contents - a data section could be
+     <<SEC_ALLOC>> | <<SEC_HAS_CONTENTS>>; a debug section could be
+     <<SEC_HAS_CONTENTS>>.  */
+#define SEC_HAS_CONTENTS 0x100
+
+  /* An instruction to the linker to not output the section
+     even if it has information which would normally be written.  */
+#define SEC_NEVER_LOAD 0x200
+
+  /* The section contains thread local data.  */
+#define SEC_THREAD_LOCAL 0x400
+
+  /* The section has GOT references.  This flag is only for the
+     linker, and is currently only used by the elf32-hppa back end.
+     It will be set if global offset table references were detected
+     in this section, which indicate to the linker that the section
+     contains PIC code, and must be handled specially when doing a
+     static link.  */
+#define SEC_HAS_GOT_REF 0x800
+
+  /* The section contains common symbols (symbols may be defined
+     multiple times, the value of a symbol is the amount of
+     space it requires, and the largest symbol value is the one
+     used).  Most targets have exactly one of these (which we
+     translate to bfd_com_section_ptr), but ECOFF has two.  */
+#define SEC_IS_COMMON 0x1000
+
+  /* The section contains only debugging information.  For
+     example, this is set for ELF .debug and .stab sections.
+     strip tests this flag to see if a section can be
+     discarded.  */
+#define SEC_DEBUGGING 0x2000
+
+  /* The contents of this section are held in memory pointed to
+     by the contents field.  This is checked by bfd_get_section_contents,
+     and the data is retrieved from memory if appropriate.  */
+#define SEC_IN_MEMORY 0x4000
+
+  /* The contents of this section are to be excluded by the
+     linker for executable and shared objects unless those
+     objects are to be further relocated.  */
+#define SEC_EXCLUDE 0x8000
+
+  /* The contents of this section are to be sorted based on the sum of
+     the symbol and addend values specified by the associated relocation
+     entries.  Entries without associated relocation entries will be
+     appended to the end of the section in an unspecified order.  */
+#define SEC_SORT_ENTRIES 0x10000
+
+  /* When linking, duplicate sections of the same name should be
+     discarded, rather than being combined into a single section as
+     is usually done.  This is similar to how common symbols are
+     handled.  See SEC_LINK_DUPLICATES below.  */
+#define SEC_LINK_ONCE 0x20000
+
+  /* If SEC_LINK_ONCE is set, this bitfield describes how the linker
+     should handle duplicate sections.  */
+#define SEC_LINK_DUPLICATES 0xc0000
+
+  /* This value for SEC_LINK_DUPLICATES means that duplicate
+     sections with the same name should simply be discarded.  */
+#define SEC_LINK_DUPLICATES_DISCARD 0x0
+
+  /* This value for SEC_LINK_DUPLICATES means that the linker
+     should warn if there are any duplicate sections, although
+     it should still only link one copy.  */
+#define SEC_LINK_DUPLICATES_ONE_ONLY 0x40000
+
+  /* This value for SEC_LINK_DUPLICATES means that the linker
+     should warn if any duplicate sections are a different size.  */
+#define SEC_LINK_DUPLICATES_SAME_SIZE 0x80000
+
+  /* This value for SEC_LINK_DUPLICATES means that the linker
+     should warn if any duplicate sections contain different
+     contents.  */
+#define SEC_LINK_DUPLICATES_SAME_CONTENTS \
+  (SEC_LINK_DUPLICATES_ONE_ONLY | SEC_LINK_DUPLICATES_SAME_SIZE)
+
+  /* This section was created by the linker as part of dynamic
+     relocation or other arcane processing.  It is skipped when
+     going through the first-pass output, trusting that someone
+     else up the line will take care of it later.  */
+#define SEC_LINKER_CREATED 0x100000
+
+  /* This section should not be subject to garbage collection.
+     Also set to inform the linker that this section should not be
+     listed in the link map as discarded.  */
+#define SEC_KEEP 0x200000
+
+  /* This section contains "short" data, and should be placed
+     "near" the GP.  */
+#define SEC_SMALL_DATA 0x400000
+
+  /* Attempt to merge identical entities in the section.
+     Entity size is given in the entsize field.  */
+#define SEC_MERGE 0x800000
+
+  /* If given with SEC_MERGE, entities to merge are zero terminated
+     strings where entsize specifies character size instead of fixed
+     size entries.  */
+#define SEC_STRINGS 0x1000000
+
+  /* This section contains data about section groups.  */
+#define SEC_GROUP 0x2000000
+
+  /* The section is a COFF shared library section.  This flag is
+     only for the linker.  If this type of section appears in
+     the input file, the linker must copy it to the output file
+     without changing the vma or size.  FIXME: Although this
+     was originally intended to be general, it really is COFF
+     specific (and the flag was renamed to indicate this).  It
+     might be cleaner to have some more general mechanism to
+     allow the back end to control what the linker does with
+     sections.  */
+#define SEC_COFF_SHARED_LIBRARY 0x4000000
+
+  /* This input section should be copied to output in reverse order
+     as an array of pointers.  This is for ELF linker internal use
+     only.  Same bit value as SEC_COFF_SHARED_LIBRARY.  */
+#define SEC_ELF_REVERSE_COPY 0x4000000
+
+  /* This section contains data which may be shared with other
+     executables or shared objects. This is for COFF only.  */
+#define SEC_COFF_SHARED 0x8000000
+
+  /* This section should be compressed.  This is for ELF linker
+     internal use only.  Same bit value as SEC_COFF_SHARED.  */
+#define SEC_ELF_COMPRESS 0x8000000
+
+  /* When a section with this flag is being linked, then if the size of
+     the input section is less than a page, it should not cross a page
+     boundary.  If the size of the input section is one page or more,
+     it should be aligned on a page boundary.  This is for TI
+     TMS320C54X only.  */
+#define SEC_TIC54X_BLOCK 0x10000000
+
+  /* This section should be renamed.  This is for ELF linker
+     internal use only.  Same bit value as SEC_TIC54X_BLOCK.  */
+#define SEC_ELF_RENAME 0x10000000
+
+  /* Conditionally link this section; do not link if there are no
+     references found to any symbol in the section.  This is for TI
+     TMS320C54X only.  */
+#define SEC_TIC54X_CLINK 0x20000000
+
+  /* This section contains vliw code.  This is for Toshiba MeP only.  Same bit value as SEC_TIC54X_CLINK.  */
+#define SEC_MEP_VLIW 0x20000000
+
+  /* Indicate that section has the no read flag set. This happens
+     when memory read flag isn't set.  */
+#define SEC_COFF_NOREAD 0x40000000
+
+  /* Indicate that section has the purecode flag set.  */
+#define SEC_ELF_PURECODE 0x80000000
+
+  /*  End of section flags.  */
+
+  /* Some internal packed boolean fields.  */
+
+  /* See the vma field.  */
+  unsigned int user_set_vma : 1;
+
+  /* A mark flag used by some of the linker backends.  */
+  unsigned int linker_mark : 1;
+
+  /* Another mark flag used by some of the linker backends.  Set for
+     output sections that have an input section.  */
+  unsigned int linker_has_input : 1;
+
+  /* Mark flag used by some linker backends for garbage collection.  */
+  unsigned int gc_mark : 1;
+
+  /* Section compression status.  One of the three values below.  */
+  unsigned int compress_status : 2;
+#define COMPRESS_SECTION_NONE    0
+#define COMPRESS_SECTION_DONE    1
+#define DECOMPRESS_SECTION_SIZED 2
+
+  /* The following flags are used by the ELF linker.  */
+
+  /* Mark sections which have been allocated to segments.  */
+  unsigned int segment_mark : 1;
+
+  /* Type of sec_info information.  One of the SEC_INFO_TYPE_* values below.  */
+  unsigned int sec_info_type:3;
+#define SEC_INFO_TYPE_NONE      0
+#define SEC_INFO_TYPE_STABS     1
+#define SEC_INFO_TYPE_MERGE     2
+#define SEC_INFO_TYPE_EH_FRAME  3
+#define SEC_INFO_TYPE_JUST_SYMS 4
+#define SEC_INFO_TYPE_TARGET    5
+#define SEC_INFO_TYPE_EH_FRAME_ENTRY 6
+
+  /* Nonzero if this section uses RELA relocations, rather than REL.  */
+  unsigned int use_rela_p:1;
+
+  /* Bits used by various backends.  The generic code doesn't touch
+     these fields.  */
+
+  unsigned int sec_flg0:1;
+  unsigned int sec_flg1:1;
+  unsigned int sec_flg2:1;
+  unsigned int sec_flg3:1;
+  unsigned int sec_flg4:1;
+  unsigned int sec_flg5:1;
+
+  /* End of internal packed boolean fields.  */
+
+  /*  The virtual memory address of the section - where it will be
+      at run time.  The symbols are relocated against this.  The
+      user_set_vma flag is maintained by bfd; if it's not set, the
+      backend can assign addresses (for example, in <<a.out>>, where
+      the default address for <<.data>> is dependent on the specific
+      target and various flags).  */
+  bfd_vma vma;
+
+  /*  The load address of the section - where it would be in a
+      rom image; really only used for writing section header
+      information.  */
+  bfd_vma lma;
+
+  /* The size of the section in *octets*, as it will be output.
+     Contains a value even if the section has no contents (e.g., the
+     size of <<.bss>>).  */
+  bfd_size_type size;
+
+  /* For input sections, the original size on disk of the section, in
+     octets.  This field should be set for any section whose size is
+     changed by linker relaxation.  It is required for sections where
+     the linker relaxation scheme doesn't cache altered section and
+     reloc contents (stabs, eh_frame, SEC_MERGE, some coff relaxing
+     targets), and thus the original size needs to be kept to read the
+     section multiple times.  For output sections, rawsize holds the
+     section size calculated on a previous linker relaxation pass.  */
+  bfd_size_type rawsize;
+
+  /* The compressed size of the section in octets.  */
+  bfd_size_type compressed_size;
+
+  /* Relaxation table.  */
+  struct relax_table *relax;
+
+  /* Count of used relaxation table entries.  */
+  int relax_count;
+
+
+  /* If this section is going to be output, then this value is the
+     offset in *bytes* into the output section of the first byte in the
+     input section (byte ==> smallest addressable unit on the
+     target).  In most cases, if this was going to start at the
+     100th octet (8-bit quantity) in the output section, this value
+     would be 100.  However, if the target byte size is 16 bits
+     (bfd_octets_per_byte is "2"), this value would be 50.  */
+  bfd_vma output_offset;
+
+  /* The output section through which to map on output.  */
+  struct bfd_section *output_section;
+
+  /* The alignment requirement of the section, as an exponent of 2 -
+     e.g., 3 aligns to 2^3 (or 8).  */
+  unsigned int alignment_power;
+
+  /* If an input section, a pointer to a vector of relocation
+     records for the data in this section.  */
+  struct reloc_cache_entry *relocation;
+
+  /* If an output section, a pointer to a vector of pointers to
+     relocation records for the data in this section.  */
+  struct reloc_cache_entry **orelocation;
+
+  /* The number of relocation records in one of the above.  */
+  unsigned reloc_count;
+
+  /* Information below is back end specific - and not always used
+     or updated.  */
+
+  /* File position of section data.  */
+  file_ptr filepos;
+
+  /* File position of relocation info.  */
+  file_ptr rel_filepos;
+
+  /* File position of line data.  */
+  file_ptr line_filepos;
+
+  /* Pointer to data for applications.  */
+  void *userdata;
+
+  /* If the SEC_IN_MEMORY flag is set, this points to the actual
+     contents.  */
+  unsigned char *contents;
+
+  /* Attached line number information.  */
+  alent *lineno;
+
+  /* Number of line number records.  */
+  unsigned int lineno_count;
+
+  /* Entity size for merging purposes.  */
+  unsigned int entsize;
+
+  /* Points to the kept section if this section is a link-once section,
+     and is discarded.  */
+  struct bfd_section *kept_section;
+
+  /* When a section is being output, this value changes as more
+     linenumbers are written out.  */
+  file_ptr moving_line_filepos;
+
+  /* What the section number is in the target world.  */
+  int target_index;
+
+  void *used_by_bfd;  /* NOTE(review): undocumented here; presumably back-end private data -- confirm.  */
+
+  /* If this is a constructor section then here is a list of the
+     relocations created to relocate items within it.  */
+  struct relent_chain *constructor_chain;
+
+  /* The BFD which owns the section.  */
+  bfd *owner;
+
+  /* A symbol which points at this section only.  */
+  struct bfd_symbol *symbol;
+  struct bfd_symbol **symbol_ptr_ptr;
+
+  /* Early in the link process, map_head and map_tail are used to build
+     a list of input sections attached to an output section.  Later,
+     output sections use these fields for a list of bfd_link_order
+     structs.  */
+  union {
+    struct bfd_link_order *link_order;
+    struct bfd_section *s;
+  } map_head, map_tail;
+} asection;
+
+/* Relax table contains information about instructions which can
+   be removed by relaxation -- replacing a long address with a short
+   address.  struct bfd_section points at these via its relax member.  */
+struct relax_table {
+  /* Address where bytes may be deleted.  */
+  bfd_vma addr;
+
+  /* Number of bytes to be deleted.  */
+  int size;
+};
+
+/* Note: the following are provided as inline functions rather than macros
+   because not all callers use the return value.  A macro implementation
+   would use a comma expression, e.g. "((ptr)->foo = val, TRUE)", and some
+   compilers will complain about comma expressions that have no effect.  */
+static inline bfd_boolean
+bfd_set_section_userdata (bfd * abfd ATTRIBUTE_UNUSED, asection * ptr, void * val)
+{
+  ptr->userdata = val;  /* Store VAL in the section's userdata field; ABFD is not used.  */
+  return TRUE;          /* Always returns TRUE.  */
+}
+
+static inline bfd_boolean
+bfd_set_section_vma (bfd *abfd ATTRIBUTE_UNUSED, asection *ptr, bfd_vma val)
+{
+  ptr->user_set_vma = TRUE;   /* Flag the address as explicitly set.  */
+  ptr->lma = ptr->vma = val;  /* VMA and LMA both receive VAL.  */
+  return TRUE;                /* Always returns TRUE; ABFD is not used.  */
+}
+
+static inline bfd_boolean
+bfd_set_section_alignment (bfd * abfd ATTRIBUTE_UNUSED, asection * ptr, unsigned int val)
+{
+  ptr->alignment_power = val;  /* Stored as given (a power-of-two exponent); no range check.  */
+  return TRUE;                 /* Always returns TRUE; ABFD is not used.  */
+}
+
+/* These sections are global, and are managed by BFD.  The application
+   and target back end are not permitted to change the values in
+   these sections.  */
+extern asection _bfd_std_section[4];  /* Index 0 = common, 1 = undefined, 2 = absolute, 3 = indirect.  */
+
+#define BFD_ABS_SECTION_NAME "*ABS*"
+#define BFD_UND_SECTION_NAME "*UND*"
+#define BFD_COM_SECTION_NAME "*COM*"
+#define BFD_IND_SECTION_NAME "*IND*"
+
+/* Pointer to the common section.  */
+#define bfd_com_section_ptr (&_bfd_std_section[0])
+/* Pointer to the undefined section.  */
+#define bfd_und_section_ptr (&_bfd_std_section[1])
+/* Pointer to the absolute section.  */
+#define bfd_abs_section_ptr (&_bfd_std_section[2])
+/* Pointer to the indirect section.  */
+#define bfd_ind_section_ptr (&_bfd_std_section[3])
+/* The predicates below compare SEC by pointer identity against the standard sections.  */
+#define bfd_is_und_section(sec) ((sec) == bfd_und_section_ptr)
+#define bfd_is_abs_section(sec) ((sec) == bfd_abs_section_ptr)
+#define bfd_is_ind_section(sec) ((sec) == bfd_ind_section_ptr)
+/* True when SEC is any of the four standard sections; SEC may be evaluated up to four times.  */
+#define bfd_is_const_section(SEC)              \
+ (   ((SEC) == bfd_abs_section_ptr)            \
+  || ((SEC) == bfd_und_section_ptr)            \
+  || ((SEC) == bfd_com_section_ptr)            \
+  || ((SEC) == bfd_ind_section_ptr))
+
+/* Macros to handle insertion and deletion of a bfd's sections.  These
+   only handle the list pointers, i.e. they do not adjust section_count,
+   target_index, etc.  */
+#define bfd_section_list_remove(ABFD, S) \
+  do /* Unlink S from ABFD's section list.  */         \
+    {                                                  \
+      asection *_s = S;                                \
+      asection *_next = _s->next;                      \
+      asection *_prev = _s->prev;                      \
+      if (!_prev) /* S was the head.  */               \
+        (ABFD)->sections = _next;                      \
+      else                                             \
+        _prev->next = _next;                           \
+      if (!_next) /* S was the tail.  */               \
+        (ABFD)->section_last = _prev;                  \
+      else                                             \
+        _next->prev = _prev;                           \
+    }                                                  \
+  while (0)
+#define bfd_section_list_append(ABFD, S) \
+  do /* Link S in as the new tail of ABFD's list.  */  \
+    {                                                  \
+      asection *_s = S;                                \
+      bfd *_abfd = ABFD;                               \
+      asection *_tail = _abfd->section_last;           \
+      _s->prev = _tail;                                \
+      _s->next = NULL;                                 \
+      if (!_tail) /* List was empty.  */               \
+        {                                              \
+          _abfd->sections = _s;                        \
+        }                                              \
+      else                                             \
+        {                                              \
+          _tail->next = _s;                            \
+        }                                              \
+      _abfd->section_last = _s;                        \
+    }                                                  \
+  while (0)
+#define bfd_section_list_prepend(ABFD, S) /* Link S in as the new head.  */ \
+  do                                                   \
+    {                                                  \
+      asection *_s = S;                                \
+      bfd *_abfd = ABFD;                               \
+      /* The old head (NULL for an empty list) becomes \
+         the successor of S.  */                       \
+      _s->next = _abfd->sections;                      \
+      _s->prev = NULL;                                 \
+      if (_s->next != NULL)                            \
+        /* Non-empty: hook the old head up to S.  */   \
+        _s->next->prev = _s;                           \
+      else                                             \
+        /* Empty list: S is the tail as well.  */      \
+        _abfd->section_last = _s;                      \
+      /* Either way S is now the first section.  */    \
+      _abfd->sections = _s;                            \
+    }                                                  \
+  while (0)
+#define bfd_section_list_insert_after(ABFD, A, S) /* Link S in right after A.  */ \
+  do                                                   \
+    {                                                  \
+      asection *_a = A, *_s = S;                       \
+      asection *_next = _a->next;                      \
+      bfd *_abfd = ABFD; /* Always evaluated, once.  */ \
+      _s->next = _next;                                \
+      _s->prev = _a;                                   \
+      _a->next = _s;                                   \
+      if (_next)                                       \
+        _next->prev = _s;                              \
+      else /* A was the tail, so S is now.  */         \
+        _abfd->section_last = _s;                      \
+    }                                                  \
+  while (0)
+#define bfd_section_list_insert_before(ABFD, B, S) /* Link S in right before B.  */ \
+  do                                                   \
+    {                                                  \
+      asection *_b = B, *_s = S;                       \
+      asection *_prev = _b->prev;                      \
+      bfd *_abfd = ABFD; /* Always evaluated, once.  */ \
+      _s->prev = _prev;                                \
+      _s->next = _b;                                   \
+      _b->prev = _s;                                   \
+      if (_prev)                                       \
+        _prev->next = _s;                              \
+      else /* B was the head, so S is now.  */         \
+        _abfd->sections = _s;                          \
+    }                                                  \
+  while (0)
+#define bfd_section_removed_from_list(ABFD, S) /* True if S's successor (or, lacking one, ABFD's tail) is not linked back to S.  */ \
+  ((S)->next != NULL ? (S)->next->prev != (S) : (ABFD)->section_last != (S))
+
+#define BFD_FAKE_SECTION(SEC, SYM, NAME, IDX, FLAGS)                   \
+  /* name, id,  index, next, prev, flags, user_set_vma,            */  \
+  {  NAME, IDX, 0,     NULL, NULL, FLAGS, 0,                           \
+  /* Positional initializer; each comment names what follows it.   */  \
+  /* linker_mark, linker_has_input, gc_mark, decompress_status,    */  \
+     0,           0,                1,       0,                        \
+  /* Note gc_mark above is preset to 1, not 0.                     */  \
+  /* segment_mark, sec_info_type, use_rela_p,                      */  \
+     0,            0,             0,                                   \
+                                                                       \
+  /* sec_flg0, sec_flg1, sec_flg2, sec_flg3, sec_flg4, sec_flg5,   */  \
+     0,        0,        0,        0,        0,        0,              \
+                                                                       \
+  /* vma, lma, size, rawsize, compressed_size, relax, relax_count, */  \
+     0,   0,   0,    0,       0,               0,     0,               \
+                                                                       \
+  /* output_offset, output_section, alignment_power,               */  \
+     0,             &SEC,           0,                                 \
+  /* output_section above refers back to SEC itself.               */  \
+  /* relocation, orelocation, reloc_count, filepos, rel_filepos,   */  \
+     NULL,       NULL,        0,           0,       0,                 \
+                                                                       \
+  /* line_filepos, userdata, contents, lineno, lineno_count,       */  \
+     0,            NULL,     NULL,     NULL,   0,                      \
+                                                                       \
+  /* entsize, kept_section, moving_line_filepos,                   */  \
+     0,       NULL,          0,                                        \
+                                                                       \
+  /* target_index, used_by_bfd, constructor_chain, owner,          */  \
+     0,            NULL,        NULL,              NULL,               \
+                                                                       \
+  /* symbol,                    symbol_ptr_ptr,                    */  \
+     (struct bfd_symbol *) SYM, &SEC.symbol,                           \
+  /* symbol_ptr_ptr is the address of SEC's symbol member.         */  \
+  /* map_head, map_tail                                            */  \
+     { NULL }, { NULL }                                                \
+    }
+
+void bfd_section_list_clear (bfd *);
+
+asection *bfd_get_section_by_name (bfd *abfd, const char *name);
+
+asection *bfd_get_next_section_by_name (bfd *ibfd, asection *sec);
+
+asection *bfd_get_linker_section (bfd *abfd, const char *name);
+
+asection *bfd_get_section_by_name_if
+   (bfd *abfd,
+    const char *name,
+    bfd_boolean (*func) (bfd *abfd, asection *sect, void *obj),
+    void *obj);
+
+char *bfd_get_unique_section_name
+   (bfd *abfd, const char *templat, int *count);
+
+asection *bfd_make_section_old_way (bfd *abfd, const char *name);
+
+asection *bfd_make_section_anyway_with_flags
+   (bfd *abfd, const char *name, flagword flags);
+
+asection *bfd_make_section_anyway (bfd *abfd, const char *name);
+
+asection *bfd_make_section_with_flags
+   (bfd *, const char *name, flagword flags);
+
+asection *bfd_make_section (bfd *, const char *name);
+
+int bfd_get_next_section_id (void);
+
+bfd_boolean bfd_set_section_flags
+   (bfd *abfd, asection *sec, flagword flags);
+
+void bfd_rename_section
+   (bfd *abfd, asection *sec, const char *newname);
+
+void bfd_map_over_sections
+   (bfd *abfd,
+    void (*func) (bfd *abfd, asection *sect, void *obj),
+    void *obj);
+
+asection *bfd_sections_find_if
+   (bfd *abfd,
+    bfd_boolean (*operation) (bfd *abfd, asection *sect, void *obj),
+    void *obj);
+
+bfd_boolean bfd_set_section_size
+   (bfd *abfd, asection *sec, bfd_size_type val);
+
+bfd_boolean bfd_set_section_contents
+   (bfd *abfd, asection *section, const void *data,
+    file_ptr offset, bfd_size_type count);
+
+bfd_boolean bfd_get_section_contents
+   (bfd *abfd, asection *section, void *location, file_ptr offset,
+    bfd_size_type count);
+
+bfd_boolean bfd_malloc_and_get_section
+   (bfd *abfd, asection *section, bfd_byte **buf);
+
+bfd_boolean bfd_copy_private_section_data
+   (bfd *ibfd, asection *isec, bfd *obfd, asection *osec);
+/* The same-named macro below takes over calls and dispatches via OBFD.  */
+#define bfd_copy_private_section_data(ibfd, isection, obfd, osection) \
+     BFD_SEND (obfd, _bfd_copy_private_section_data, \
+               (ibfd, isection, obfd, osection))
+bfd_boolean bfd_generic_is_group_section (bfd *, const asection *sec);
+
+bfd_boolean bfd_generic_discard_group (bfd *abfd, asection *group);
+
+/* Extracted from archures.c.  */
+enum bfd_architecture
+{ /* The #defines interleaved below give bfd_mach_* numbers; they do not alter the enumerator values.  */
+  bfd_arch_unknown,   /* File arch not known.  */
+  bfd_arch_obscure,   /* Arch known, not one of these.  */
+  bfd_arch_m68k,      /* Motorola 68xxx */
+#define bfd_mach_m68000 1
+#define bfd_mach_m68008 2
+#define bfd_mach_m68010 3
+#define bfd_mach_m68020 4
+#define bfd_mach_m68030 5
+#define bfd_mach_m68040 6
+#define bfd_mach_m68060 7
+#define bfd_mach_cpu32  8
+#define bfd_mach_fido   9
+#define bfd_mach_mcf_isa_a_nodiv 10
+#define bfd_mach_mcf_isa_a 11
+#define bfd_mach_mcf_isa_a_mac 12
+#define bfd_mach_mcf_isa_a_emac 13
+#define bfd_mach_mcf_isa_aplus 14
+#define bfd_mach_mcf_isa_aplus_mac 15
+#define bfd_mach_mcf_isa_aplus_emac 16
+#define bfd_mach_mcf_isa_b_nousp 17
+#define bfd_mach_mcf_isa_b_nousp_mac 18
+#define bfd_mach_mcf_isa_b_nousp_emac 19
+#define bfd_mach_mcf_isa_b 20
+#define bfd_mach_mcf_isa_b_mac 21
+#define bfd_mach_mcf_isa_b_emac 22
+#define bfd_mach_mcf_isa_b_float 23
+#define bfd_mach_mcf_isa_b_float_mac 24
+#define bfd_mach_mcf_isa_b_float_emac 25
+#define bfd_mach_mcf_isa_c 26
+#define bfd_mach_mcf_isa_c_mac 27
+#define bfd_mach_mcf_isa_c_emac 28
+#define bfd_mach_mcf_isa_c_nodiv 29
+#define bfd_mach_mcf_isa_c_nodiv_mac 30
+#define bfd_mach_mcf_isa_c_nodiv_emac 31
+  bfd_arch_vax,       /* DEC Vax */
+  bfd_arch_i960,      /* Intel 960 */
+    /* The order of the following is important.
+       A lower number indicates a machine type that
+       only accepts a subset of the instructions
+       available to machines with higher numbers.
+       The exception is the "ca", which is
+       incompatible with all other machines except
+       "core".  */
+
+#define bfd_mach_i960_core      1
+#define bfd_mach_i960_ka_sa     2
+#define bfd_mach_i960_kb_sb     3
+#define bfd_mach_i960_mc        4
+#define bfd_mach_i960_xa        5
+#define bfd_mach_i960_ca        6
+#define bfd_mach_i960_jx        7
+#define bfd_mach_i960_hx        8
+
+  bfd_arch_or1k,      /* OpenRISC 1000 */
+#define bfd_mach_or1k           1
+#define bfd_mach_or1knd         2
+
+  bfd_arch_sparc,     /* SPARC */
+#define bfd_mach_sparc                 1
+/* The difference between v8plus and v9 is that v9 is a true 64 bit env.  */
+#define bfd_mach_sparc_sparclet        2
+#define bfd_mach_sparc_sparclite       3
+#define bfd_mach_sparc_v8plus          4
+#define bfd_mach_sparc_v8plusa         5 /* with ultrasparc add'ns.  */
+#define bfd_mach_sparc_sparclite_le    6
+#define bfd_mach_sparc_v9              7
+#define bfd_mach_sparc_v9a             8 /* with ultrasparc add'ns.  */
+#define bfd_mach_sparc_v8plusb         9 /* with cheetah add'ns.  */
+#define bfd_mach_sparc_v9b             10 /* with cheetah add'ns.  */
+#define bfd_mach_sparc_v8plusc         11 /* with UA2005 and T1 add'ns.  */
+#define bfd_mach_sparc_v9c             12 /* with UA2005 and T1 add'ns.  */
+#define bfd_mach_sparc_v8plusd         13 /* with UA2007 and T3 add'ns.  */
+#define bfd_mach_sparc_v9d             14 /* with UA2007 and T3 add'ns.  */
+#define bfd_mach_sparc_v8pluse         15 /* with OSA2001 and T4 add'ns (no IMA).  */
+#define bfd_mach_sparc_v9e             16 /* with OSA2001 and T4 add'ns (no IMA).  */
+#define bfd_mach_sparc_v8plusv         17 /* with OSA2011 and T4 and IMA and FJMAU add'ns.  */
+#define bfd_mach_sparc_v9v             18 /* with OSA2011 and T4 and IMA and FJMAU add'ns.  */
+#define bfd_mach_sparc_v8plusm         19 /* with OSA2015 and M7 add'ns.  */
+#define bfd_mach_sparc_v9m             20 /* with OSA2015 and M7 add'ns.  */
+/* Nonzero if MACH has the v9 instruction set.  */
+#define bfd_mach_sparc_v9_p(mach) \
+  ((mach) >= bfd_mach_sparc_v8plus && (mach) <= bfd_mach_sparc_v9m \
+   && (mach) != bfd_mach_sparc_sparclite_le)
+/* Nonzero if MACH is a 64 bit sparc architecture.  */
+#define bfd_mach_sparc_64bit_p(mach) \
+  ((mach) >= bfd_mach_sparc_v9 \
+   && (mach) != bfd_mach_sparc_v8plusb \
+   && (mach) != bfd_mach_sparc_v8plusc \
+   && (mach) != bfd_mach_sparc_v8plusd \
+   && (mach) != bfd_mach_sparc_v8pluse \
+   && (mach) != bfd_mach_sparc_v8plusv \
+   && (mach) != bfd_mach_sparc_v8plusm)
+  bfd_arch_spu,       /* PowerPC SPU */
+#define bfd_mach_spu           256
+  bfd_arch_mips,      /* MIPS Rxxxx */
+#define bfd_mach_mips3000              3000
+#define bfd_mach_mips3900              3900
+#define bfd_mach_mips4000              4000
+#define bfd_mach_mips4010              4010
+#define bfd_mach_mips4100              4100
+#define bfd_mach_mips4111              4111
+#define bfd_mach_mips4120              4120
+#define bfd_mach_mips4300              4300
+#define bfd_mach_mips4400              4400
+#define bfd_mach_mips4600              4600
+#define bfd_mach_mips4650              4650
+#define bfd_mach_mips5000              5000
+#define bfd_mach_mips5400              5400
+#define bfd_mach_mips5500              5500
+#define bfd_mach_mips5900              5900
+#define bfd_mach_mips6000              6000
+#define bfd_mach_mips7000              7000
+#define bfd_mach_mips8000              8000
+#define bfd_mach_mips9000              9000
+#define bfd_mach_mips10000             10000
+#define bfd_mach_mips12000             12000
+#define bfd_mach_mips14000             14000
+#define bfd_mach_mips16000             16000
+#define bfd_mach_mips16                16
+#define bfd_mach_mips5                 5
+#define bfd_mach_mips_loongson_2e      3001
+#define bfd_mach_mips_loongson_2f      3002
+#define bfd_mach_mips_loongson_3a      3003
+#define bfd_mach_mips_sb1              12310201 /* octal digits of 'S' (123), 'B' (102), then 01; written as a decimal literal */
+#define bfd_mach_mips_octeon           6501
+#define bfd_mach_mips_octeonp          6601
+#define bfd_mach_mips_octeon2          6502
+#define bfd_mach_mips_octeon3          6503
+#define bfd_mach_mips_xlr              887682   /* decimal ASCII codes of 'X' (88), 'L' (76), 'R' (82) */
+#define bfd_mach_mipsisa32             32
+#define bfd_mach_mipsisa32r2           33
+#define bfd_mach_mipsisa32r3           34
+#define bfd_mach_mipsisa32r5           36
+#define bfd_mach_mipsisa32r6           37
+#define bfd_mach_mipsisa64             64
+#define bfd_mach_mipsisa64r2           65
+#define bfd_mach_mipsisa64r3           66
+#define bfd_mach_mipsisa64r5           68
+#define bfd_mach_mipsisa64r6           69
+#define bfd_mach_mips_micromips        96
+  bfd_arch_i386,      /* Intel 386 */
+#define bfd_mach_i386_intel_syntax     (1 << 0)
+#define bfd_mach_i386_i8086            (1 << 1)
+#define bfd_mach_i386_i386             (1 << 2)
+#define bfd_mach_x86_64                (1 << 3)
+#define bfd_mach_x64_32                (1 << 4)
+#define bfd_mach_i386_i386_intel_syntax (bfd_mach_i386_i386 | bfd_mach_i386_intel_syntax)
+#define bfd_mach_x86_64_intel_syntax   (bfd_mach_x86_64 | bfd_mach_i386_intel_syntax)
+#define bfd_mach_x64_32_intel_syntax   (bfd_mach_x64_32 | bfd_mach_i386_intel_syntax)
+  bfd_arch_l1om,   /* Intel L1OM */
+#define bfd_mach_l1om                  (1 << 5)
+#define bfd_mach_l1om_intel_syntax     (bfd_mach_l1om | bfd_mach_i386_intel_syntax)
+  bfd_arch_k1om,   /* Intel K1OM */
+#define bfd_mach_k1om                  (1 << 6)
+#define bfd_mach_k1om_intel_syntax     (bfd_mach_k1om | bfd_mach_i386_intel_syntax)
+#define bfd_mach_i386_nacl             (1 << 7)
+#define bfd_mach_i386_i386_nacl        (bfd_mach_i386_i386 | bfd_mach_i386_nacl)
+#define bfd_mach_x86_64_nacl           (bfd_mach_x86_64 | bfd_mach_i386_nacl)
+#define bfd_mach_x64_32_nacl           (bfd_mach_x64_32 | bfd_mach_i386_nacl)
+  bfd_arch_iamcu,   /* Intel MCU */
+#define bfd_mach_iamcu                 (1 << 8)
+#define bfd_mach_i386_iamcu            (bfd_mach_i386_i386 | bfd_mach_iamcu)
+#define bfd_mach_i386_iamcu_intel_syntax (bfd_mach_i386_iamcu | bfd_mach_i386_intel_syntax)
+  bfd_arch_we32k,     /* AT&T WE32xxx */
+  bfd_arch_tahoe,     /* CCI/Harris Tahoe */
+  bfd_arch_i860,      /* Intel 860 */
+  bfd_arch_i370,      /* IBM 360/370 Mainframes */
+  bfd_arch_romp,      /* IBM ROMP PC/RT */
+  bfd_arch_convex,    /* Convex */
+  bfd_arch_m88k,      /* Motorola 88xxx */
+  bfd_arch_m98k,      /* Motorola 98xxx */
+  bfd_arch_pyramid,   /* Pyramid Technology */
+  bfd_arch_h8300,     /* Renesas H8/300 (formerly Hitachi H8/300) */
+#define bfd_mach_h8300    1
+#define bfd_mach_h8300h   2
+#define bfd_mach_h8300s   3
+#define bfd_mach_h8300hn  4
+#define bfd_mach_h8300sn  5
+#define bfd_mach_h8300sx  6
+#define bfd_mach_h8300sxn 7
+  bfd_arch_pdp11,     /* DEC PDP-11 */
+  bfd_arch_plugin,
+  bfd_arch_powerpc,   /* PowerPC */
+#define bfd_mach_ppc           32
+#define bfd_mach_ppc64         64
+#define bfd_mach_ppc_403       403
+#define bfd_mach_ppc_403gc     4030
+#define bfd_mach_ppc_405       405
+#define bfd_mach_ppc_505       505
+#define bfd_mach_ppc_601       601
+#define bfd_mach_ppc_602       602
+#define bfd_mach_ppc_603       603
+#define bfd_mach_ppc_ec603e    6031
+#define bfd_mach_ppc_604       604
+#define bfd_mach_ppc_620       620
+#define bfd_mach_ppc_630       630
+#define bfd_mach_ppc_750       750
+#define bfd_mach_ppc_860       860
+#define bfd_mach_ppc_a35       35
+#define bfd_mach_ppc_rs64ii    642
+#define bfd_mach_ppc_rs64iii   643
+#define bfd_mach_ppc_7400      7400
+#define bfd_mach_ppc_e500      500
+#define bfd_mach_ppc_e500mc    5001
+#define bfd_mach_ppc_e500mc64  5005
+#define bfd_mach_ppc_e5500     5006
+#define bfd_mach_ppc_e6500     5007
+#define bfd_mach_ppc_titan     83
+#define bfd_mach_ppc_vle       84
+  bfd_arch_rs6000,    /* IBM RS/6000 */
+#define bfd_mach_rs6k          6000
+#define bfd_mach_rs6k_rs1      6001
+#define bfd_mach_rs6k_rsc      6003
+#define bfd_mach_rs6k_rs2      6002
+  bfd_arch_hppa,      /* HP PA RISC */
+#define bfd_mach_hppa10        10
+#define bfd_mach_hppa11        11
+#define bfd_mach_hppa20        20
+#define bfd_mach_hppa20w       25
+  bfd_arch_d10v,      /* Mitsubishi D10V */
+#define bfd_mach_d10v          1
+#define bfd_mach_d10v_ts2      2
+#define bfd_mach_d10v_ts3      3
+  bfd_arch_d30v,      /* Mitsubishi D30V */
+  bfd_arch_dlx,       /* DLX */
+  bfd_arch_m68hc11,   /* Motorola 68HC11 */
+  bfd_arch_m68hc12,   /* Motorola 68HC12 */
+#define bfd_mach_m6812_default 0
+#define bfd_mach_m6812         1
+#define bfd_mach_m6812s        2
+  bfd_arch_m9s12x,   /* Freescale S12X */
+  bfd_arch_m9s12xg,  /* Freescale XGATE */
+  bfd_arch_z8k,       /* Zilog Z8000 */
+#define bfd_mach_z8001         1
+#define bfd_mach_z8002         2
+  bfd_arch_h8500,     /* Renesas H8/500 (formerly Hitachi H8/500) */
+  bfd_arch_sh,        /* Renesas / SuperH SH (formerly Hitachi SH) */
+#define bfd_mach_sh            1
+#define bfd_mach_sh2        0x20
+#define bfd_mach_sh_dsp     0x2d
+#define bfd_mach_sh2a       0x2a
+#define bfd_mach_sh2a_nofpu 0x2b
+#define bfd_mach_sh2a_nofpu_or_sh4_nommu_nofpu 0x2a1
+#define bfd_mach_sh2a_nofpu_or_sh3_nommu 0x2a2
+#define bfd_mach_sh2a_or_sh4  0x2a3
+#define bfd_mach_sh2a_or_sh3e 0x2a4
+#define bfd_mach_sh2e       0x2e
+#define bfd_mach_sh3        0x30
+#define bfd_mach_sh3_nommu  0x31
+#define bfd_mach_sh3_dsp    0x3d
+#define bfd_mach_sh3e       0x3e
+#define bfd_mach_sh4        0x40
+#define bfd_mach_sh4_nofpu  0x41
+#define bfd_mach_sh4_nommu_nofpu  0x42
+#define bfd_mach_sh4a       0x4a
+#define bfd_mach_sh4a_nofpu 0x4b
+#define bfd_mach_sh4al_dsp  0x4d
+#define bfd_mach_sh5        0x50
+  bfd_arch_alpha,     /* Dec Alpha */
+#define bfd_mach_alpha_ev4  0x10
+#define bfd_mach_alpha_ev5  0x20
+#define bfd_mach_alpha_ev6  0x30
+  bfd_arch_arm,       /* Advanced Risc Machines ARM.  */
+#define bfd_mach_arm_unknown   0
+#define bfd_mach_arm_2         1
+#define bfd_mach_arm_2a        2
+#define bfd_mach_arm_3         3
+#define bfd_mach_arm_3M        4
+#define bfd_mach_arm_4         5
+#define bfd_mach_arm_4T        6
+#define bfd_mach_arm_5         7
+#define bfd_mach_arm_5T        8
+#define bfd_mach_arm_5TE       9
+#define bfd_mach_arm_XScale    10
+#define bfd_mach_arm_ep9312    11
+#define bfd_mach_arm_iWMMXt    12
+#define bfd_mach_arm_iWMMXt2   13
+  bfd_arch_nds32,     /* Andes NDS32 */
+#define bfd_mach_n1            1
+#define bfd_mach_n1h           2
+#define bfd_mach_n1h_v2        3
+#define bfd_mach_n1h_v3        4
+#define bfd_mach_n1h_v3m       5
+  bfd_arch_ns32k,     /* National Semiconductors ns32000 */
+  bfd_arch_w65,       /* WDC 65816 */
+  bfd_arch_tic30,     /* Texas Instruments TMS320C30 */
+  bfd_arch_tic4x,     /* Texas Instruments TMS320C3X/4X */
+#define bfd_mach_tic3x         30
+#define bfd_mach_tic4x         40
+  bfd_arch_tic54x,    /* Texas Instruments TMS320C54X */
+  bfd_arch_tic6x,     /* Texas Instruments TMS320C6X */
+  bfd_arch_tic80,     /* TI TMS320c80 (MVP) */
+  bfd_arch_v850,      /* NEC V850 */
+  bfd_arch_v850_rh850,/* NEC V850 (using RH850 ABI) */
+#define bfd_mach_v850          1
+#define bfd_mach_v850e         'E'
+#define bfd_mach_v850e1        '1'
+#define bfd_mach_v850e2        0x4532 /* ASCII bytes 'E', '2' */
+#define bfd_mach_v850e2v3      0x45325633 /* ASCII bytes 'E', '2', 'V', '3' */
+#define bfd_mach_v850e3v5      0x45335635 /* ASCII bytes 'E', '3', 'V', '5' */
+  bfd_arch_arc,       /* ARC Cores */
+#define bfd_mach_arc_a4        0
+#define bfd_mach_arc_a5        1
+#define bfd_mach_arc_arc600    2
+#define bfd_mach_arc_arc601    4
+#define bfd_mach_arc_arc700    3
+#define bfd_mach_arc_arcv2     5
+ bfd_arch_m32c,     /* Renesas M16C/M32C.  */
+#define bfd_mach_m16c        0x75
+#define bfd_mach_m32c        0x78
+  bfd_arch_m32r,      /* Renesas M32R (formerly Mitsubishi M32R/D) */
+#define bfd_mach_m32r          1 /* For backwards compatibility.  */
+#define bfd_mach_m32rx         'x'
+#define bfd_mach_m32r2         '2'
+  bfd_arch_mn10200,   /* Matsushita MN10200 */
+  bfd_arch_mn10300,   /* Matsushita MN10300 */
+#define bfd_mach_mn10300               300
+#define bfd_mach_am33          330
+#define bfd_mach_am33_2        332
+  bfd_arch_fr30,
+#define bfd_mach_fr30          0x46523330 /* ASCII bytes 'F', 'R', '3', '0' */
+  bfd_arch_frv,
+#define bfd_mach_frv           1
+#define bfd_mach_frvsimple     2
+#define bfd_mach_fr300         300
+#define bfd_mach_fr400         400
+#define bfd_mach_fr450         450
+#define bfd_mach_frvtomcat     499     /* fr500 prototype */
+#define bfd_mach_fr500         500
+#define bfd_mach_fr550         550
+  bfd_arch_moxie,       /* The moxie processor */
+#define bfd_mach_moxie         1
+  bfd_arch_ft32,       /* The ft32 processor */
+#define bfd_mach_ft32          1
+  bfd_arch_mcore,
+  bfd_arch_mep,
+#define bfd_mach_mep           1
+#define bfd_mach_mep_h1        0x6831 /* ASCII bytes 'h', '1' */
+#define bfd_mach_mep_c5        0x6335 /* ASCII bytes 'c', '5' */
+  bfd_arch_metag,
+#define bfd_mach_metag         1
+  bfd_arch_ia64,      /* HP/Intel ia64 */
+#define bfd_mach_ia64_elf64    64
+#define bfd_mach_ia64_elf32    32
+  bfd_arch_ip2k,      /* Ubicom IP2K microcontrollers. */
+#define bfd_mach_ip2022        1
+#define bfd_mach_ip2022ext     2
+ bfd_arch_iq2000,     /* Vitesse IQ2000.  */
+#define bfd_mach_iq2000        1
+#define bfd_mach_iq10          2
+  bfd_arch_epiphany,   /* Adapteva EPIPHANY */
+#define bfd_mach_epiphany16    1
+#define bfd_mach_epiphany32    2
+  bfd_arch_mt,
+#define bfd_mach_ms1           1
+#define bfd_mach_mrisc2        2
+#define bfd_mach_ms2           3
+  bfd_arch_pj,
+  bfd_arch_avr,       /* Atmel AVR microcontrollers.  */
+#define bfd_mach_avr1          1
+#define bfd_mach_avr2          2
+#define bfd_mach_avr25         25
+#define bfd_mach_avr3          3
+#define bfd_mach_avr31         31
+#define bfd_mach_avr35         35
+#define bfd_mach_avr4          4
+#define bfd_mach_avr5          5
+#define bfd_mach_avr51         51
+#define bfd_mach_avr6          6
+#define bfd_mach_avrtiny   100
+#define bfd_mach_avrxmega1 101
+#define bfd_mach_avrxmega2 102
+#define bfd_mach_avrxmega3 103
+#define bfd_mach_avrxmega4 104
+#define bfd_mach_avrxmega5 105
+#define bfd_mach_avrxmega6 106
+#define bfd_mach_avrxmega7 107
+  bfd_arch_bfin,        /* ADI Blackfin */
+#define bfd_mach_bfin          1
+  bfd_arch_cr16,       /* National Semiconductor CompactRISC (ie CR16). */
+#define bfd_mach_cr16          1
+  bfd_arch_cr16c,       /* National Semiconductor CompactRISC. */
+#define bfd_mach_cr16c         1
+  bfd_arch_crx,       /*  National Semiconductor CRX.  */
+#define bfd_mach_crx           1
+  bfd_arch_cris,      /* Axis CRIS */
+#define bfd_mach_cris_v0_v10   255
+#define bfd_mach_cris_v32      32
+#define bfd_mach_cris_v10_v32  1032
+  bfd_arch_riscv,
+#define bfd_mach_riscv32       132
+#define bfd_mach_riscv64       164
+  bfd_arch_rl78,
+#define bfd_mach_rl78  0x75
+  bfd_arch_rx,        /* Renesas RX.  */
+#define bfd_mach_rx            0x75
+  bfd_arch_s390,      /* IBM s390 */
+#define bfd_mach_s390_31       31
+#define bfd_mach_s390_64       64
+  bfd_arch_score,     /* Sunplus score */
+#define bfd_mach_score3         3
+#define bfd_mach_score7         7
+  bfd_arch_mmix,      /* Donald Knuth's educational processor.  */
+  bfd_arch_xstormy16,
+#define bfd_mach_xstormy16     1
+  bfd_arch_msp430,    /* Texas Instruments MSP430 architecture.  */
+#define bfd_mach_msp11          11
+#define bfd_mach_msp110         110
+#define bfd_mach_msp12          12
+#define bfd_mach_msp13          13
+#define bfd_mach_msp14          14
+#define bfd_mach_msp15          15
+#define bfd_mach_msp16          16
+#define bfd_mach_msp20          20
+#define bfd_mach_msp21          21
+#define bfd_mach_msp22          22
+#define bfd_mach_msp23          23
+#define bfd_mach_msp24          24
+#define bfd_mach_msp26          26
+#define bfd_mach_msp31          31
+#define bfd_mach_msp32          32
+#define bfd_mach_msp33          33
+#define bfd_mach_msp41          41
+#define bfd_mach_msp42          42
+#define bfd_mach_msp43          43
+#define bfd_mach_msp44          44
+#define bfd_mach_msp430x        45
+#define bfd_mach_msp46          46
+#define bfd_mach_msp47          47
+#define bfd_mach_msp54          54
+  bfd_arch_xc16x,     /* Infineon's XC16X Series.               */
+#define bfd_mach_xc16x         1
+#define bfd_mach_xc16xl        2
+#define bfd_mach_xc16xs        3
+  bfd_arch_xgate,   /* Freescale XGATE */
+#define bfd_mach_xgate         1
+  bfd_arch_xtensa,    /* Tensilica's Xtensa cores.  */
+#define bfd_mach_xtensa        1
+  bfd_arch_z80,
+#define bfd_mach_z80strict      1 /* No undocumented opcodes.  */
+#define bfd_mach_z80            3 /* With ixl, ixh, iyl, and iyh.  */
+#define bfd_mach_z80full        7 /* All undocumented instructions.  */
+#define bfd_mach_r800           11 /* R800: successor with multiplication.  */
+  bfd_arch_lm32,      /* Lattice Mico32 */
+#define bfd_mach_lm32      1
+  bfd_arch_microblaze,/* Xilinx MicroBlaze. */
+  bfd_arch_tilepro,   /* Tilera TILEPro */
+  bfd_arch_tilegx, /* Tilera TILE-Gx */
+#define bfd_mach_tilepro   1
+#define bfd_mach_tilegx    1
+#define bfd_mach_tilegx32  2
+  bfd_arch_aarch64,   /* AArch64  */
+#define bfd_mach_aarch64 0
+#define bfd_mach_aarch64_ilp32 32
+  bfd_arch_nios2,      /* Nios II */
+#define bfd_mach_nios2         0
+#define bfd_mach_nios2r1       1
+#define bfd_mach_nios2r2       2
+  bfd_arch_visium,     /* Visium */
+#define bfd_mach_visium        1
+  bfd_arch_last
+  };
+/* Describes one architecture/machine combination (members arch and mach).  */
+typedef struct bfd_arch_info
+{
+  int bits_per_word;
+  int bits_per_address;
+  int bits_per_byte;
+  enum bfd_architecture arch;
+  unsigned long mach; /* NOTE(review): presumably a bfd_mach_* value -- confirm.  */
+  const char *arch_name;
+  const char *printable_name;
+  unsigned int section_align_power;
+  /* TRUE if this is the default machine for the architecture.
+     The default arch should be the first entry for an arch so that
+     all the entries for that arch can be accessed via <<next>>.  */
+  bfd_boolean the_default;
+  const struct bfd_arch_info * (*compatible)
+    (const struct bfd_arch_info *a, const struct bfd_arch_info *b);
+  /* NOTE(review): scan presumably tests a name string against this entry -- confirm.  */
+  bfd_boolean (*scan) (const struct bfd_arch_info *, const char *);
+
+  /* Allocate via bfd_malloc and return a fill buffer of size COUNT.  If
+     IS_BIGENDIAN is TRUE, the order of bytes is big endian.  If CODE is
+     TRUE, the buffer contains code.  */
+  void *(*fill) (bfd_size_type count, bfd_boolean is_bigendian,
+                 bfd_boolean code);
+  /* Link to the next entry; see the note on the_default above.  */
+  const struct bfd_arch_info *next;
+}
+bfd_arch_info_type;
+
+const char *bfd_printable_name (bfd *abfd);
+
+const bfd_arch_info_type *bfd_scan_arch (const char *string);
+
+const char **bfd_arch_list (void);
+
+const bfd_arch_info_type *bfd_arch_get_compatible
+   (const bfd *abfd, const bfd *bbfd, bfd_boolean accept_unknowns);
+
+void bfd_set_arch_info (bfd *abfd, const bfd_arch_info_type *arg);
+
+bfd_boolean bfd_default_set_arch_mach
+   (bfd *abfd, enum bfd_architecture arch, unsigned long mach);
+
+enum bfd_architecture bfd_get_arch (bfd *abfd);
+
+unsigned long bfd_get_mach (bfd *abfd);
+
+unsigned int bfd_arch_bits_per_byte (bfd *abfd);
+
+unsigned int bfd_arch_bits_per_address (bfd *abfd);
+
+const bfd_arch_info_type *bfd_get_arch_info (bfd *abfd);
+
+const bfd_arch_info_type *bfd_lookup_arch
+   (enum bfd_architecture arch, unsigned long machine);
+
+const char *bfd_printable_arch_mach
+   (enum bfd_architecture arch, unsigned long machine);
+
+unsigned int bfd_octets_per_byte (bfd *abfd);
+
+unsigned int bfd_arch_mach_octets_per_byte
+   (enum bfd_architecture arch, unsigned long machine);
+
+/* Extracted from reloc.c.  */
+/* Outcome of a relocation; the return type of a howto's special_function.  */
+typedef enum bfd_reloc_status
+{
+  /* No errors detected.  */
+  bfd_reloc_ok,
+
+  /* The relocation was performed, but there was an overflow.  */
+  bfd_reloc_overflow,
+
+  /* The address to relocate was not within the section supplied.  */
+  bfd_reloc_outofrange,
+
+  /* Used by special functions.  */
+  bfd_reloc_continue,
+
+  /* Unsupported relocation size requested.  */
+  bfd_reloc_notsupported,
+
+  /* Unused.  */
+  bfd_reloc_other,
+
+  /* The symbol to relocate against was undefined.  */
+  bfd_reloc_undefined,
+
+  /* The relocation was performed, but may not be ok - presently
+     generated only when linking i960 coff files with i960 b.out
+     symbols.  If this type is returned, the error_message argument
+     to bfd_perform_relocation will be set.  */
+  bfd_reloc_dangerous
+ }
+ bfd_reloc_status_type;
+
+/* One relocation: its symbol, where it applies, its addend and its howto.  */
+typedef struct reloc_cache_entry
+{
+  /* A pointer into the canonical table of pointers.  */
+  struct bfd_symbol **sym_ptr_ptr;
+
+  /* Offset in section.  */
+  bfd_size_type address;
+
+  /* Addend for relocation value.  */
+  bfd_vma addend;
+
+  /* Pointer to how to perform the required relocation.  */
+  reloc_howto_type *howto;
+
+}
+arelent;
+
+/* Overflow checks selectable per howto (see its complain_on_overflow member).  */
+enum complain_overflow
+{
+  /* Do not complain on overflow.  */
+  complain_overflow_dont,
+
+  /* Complain if the value overflows when considered as a signed
+     number one bit larger than the field, i.e. a bitfield of N bits
+     is allowed to represent -2**N to 2**N-1.  */
+  complain_overflow_bitfield,
+
+  /* Complain if the value overflows when considered as a signed
+     number.  */
+  complain_overflow_signed,
+
+  /* Complain if the value overflows when considered as an
+     unsigned number.  */
+  complain_overflow_unsigned
+};
+struct bfd_symbol;             /* Forward declaration.  */
+/* Describes how a relocation of one particular type is to be performed.  */
+struct reloc_howto_struct
+{
+  /*  The type field has mainly a documentary use - the back end can
+      do what it wants with it, though normally the back end's idea
+      of an external reloc number is stored in this field.  For
+      example, a PC relative word relocation in a coff environment
+      has the type 023 - because that's what the outside world calls
+      a R_PCRWORD reloc.  */
+  unsigned int type;
+
+  /*  The value the final relocation is shifted right by.  This drops
+      unwanted data from the relocation.  */
+  unsigned int rightshift;
+
+  /*  The size of the item to be relocated.  This is *not* a
+      power-of-two measure.  To get the number of bytes operated
+      on by a type of relocation, use bfd_get_reloc_size.  */
+  int size;
+
+  /*  The number of bits in the item to be relocated.  This is used
+      when doing overflow checking.  */
+  unsigned int bitsize;
+
+  /*  The relocation is relative to the field being relocated.  */
+  bfd_boolean pc_relative;
+
+  /*  The bit position of the reloc value in the destination.
+      The relocated value is left shifted by this amount.  */
+  unsigned int bitpos;
+
+  /* What type of overflow error should be checked for when
+     relocating.  */
+  enum complain_overflow complain_on_overflow;
+
+  /* If this field is non-null, then the supplied function is
+     called rather than the normal function.  This allows really
+     strange relocation methods to be accommodated (e.g., i960 callj
+     instructions).  */
+  bfd_reloc_status_type (*special_function)
+    (bfd *, arelent *, struct bfd_symbol *, void *, asection *,
+     bfd *, char **);
+
+  /* The textual name of the relocation type.  */
+  char *name;
+
+  /* Some formats record a relocation addend in the section contents
+     rather than with the relocation.  For ELF formats this is the
+     distinction between USE_REL and USE_RELA (though the code checks
+     for USE_REL == 1/0).  The value of this field is TRUE if the
+     addend is recorded with the section contents; when performing a
+     partial link (ld -r) the section contents (the data) will be
+     modified.  The value of this field is FALSE if addends are
+     recorded with the relocation (in arelent.addend); when performing
+     a partial link the relocation will be modified.
+     All relocations for all ELF USE_RELA targets should set this field
+     to FALSE (values of TRUE should be looked on with suspicion).
+     However, the converse is not true: not all relocations of all ELF
+     USE_REL targets set this field to TRUE.  Why this is so is peculiar
+     to each particular target.  For relocs that aren't used in partial
+     links (e.g. GOT stuff) it doesn't matter what this is set to.  */
+  bfd_boolean partial_inplace;
+
+  /* src_mask selects the part of the instruction (or data) to be used
+     in the relocation sum.  If the target relocations don't have an
+     addend in the reloc, e.g. ELF USE_REL, src_mask will normally equal
+     dst_mask to extract the addend from the section contents.  If
+     relocations do have an addend in the reloc, e.g. ELF USE_RELA, this
+     field should be zero.  Non-zero values for ELF USE_RELA targets are
+     bogus as in those cases the value in the dst_mask part of the
+     section contents should be treated as garbage.  */
+  bfd_vma src_mask;
+
+  /* dst_mask selects which parts of the instruction (or data) are
+     replaced with a relocated value.  */
+  bfd_vma dst_mask;
+
+  /* When some formats create PC relative instructions, they leave
+     the value of the pc of the place being relocated in the offset
+     slot of the instruction, so that a PC relative relocation can
+     be made just by adding in an ordinary offset (e.g., sun3 a.out).
+     Some formats leave the displacement part of an instruction
+     empty (e.g., m88k bcs); this flag signals the fact.  */
+  bfd_boolean pcrel_offset;
+};
+/* Initializer for a struct reloc_howto_struct: the 13 arguments follow member order; C (the type) is cast to unsigned.  */
+#define HOWTO(C, R, S, B, P, BI, O, SF, NAME, INPLACE, MASKSRC, MASKDST, PC) \
+  { (unsigned) C, R, S, B, P, BI, O, SF, NAME, INPLACE, MASKSRC, MASKDST, PC }
+#define NEWHOWTO(FUNCTION, NAME, SIZE, REL, IN) \
+  HOWTO (0, 0, SIZE, 0, REL, 0, complain_overflow_dont, FUNCTION, \
+         NAME, FALSE, 0, 0, IN) /* HOWTO shorthand: members without a parameter are 0, FALSE or complain_overflow_dont.  */
+/* Placeholder HOWTO entry: only the type C is set; all other members are 0, FALSE, NULL or complain_overflow_dont.  */
+#define EMPTY_HOWTO(C) \
+  HOWTO ((C), 0, 0, 0, FALSE, 0, complain_overflow_dont, NULL, \
+         NULL, FALSE, 0, 0, FALSE)
+/* Set RELOCATION to 0 if SYMBOL is in a common section, else to SYMBOL's value; does nothing when SYMBOL is NULL.  */
+#define HOWTO_PREPARE(relocation, symbol)               \
+  {                                                     \
+    if (symbol != NULL)                                 \
+      {                                                 \
+        if (bfd_is_com_section (symbol->section))       \
+          {                                             \
+            relocation = 0;                             \
+          }                                             \
+        else                                            \
+          {                                             \
+            relocation = symbol->value;                 \
+          }                                             \
+      }                                                 \
+  } /* Expands to a bare block, not do { } while (0), and uses its arguments unparenthesized.  */
+/* Number of bytes operated on by a relocation of the given howto type (see the size member's comment).  */
+unsigned int bfd_get_reloc_size (reloc_howto_type *);
+/* List node holding one arelent by value plus a pointer to the next node.  */
+typedef struct relent_chain
+{
+  arelent relent;
+  struct relent_chain *next;
+}
+arelent_chain;
+/* Overflow-check RELOCATION as selected by HOW.  NOTE(review): BITSIZE/RIGHTSHIFT presumably mirror the howto members of the same name - confirm.  */
+bfd_reloc_status_type bfd_check_overflow
+   (enum complain_overflow how,
+    unsigned int bitsize,
+    unsigned int rightshift,
+    unsigned int addrsize,
+    bfd_vma relocation);
+/* Perform the relocation RELOC_ENTRY.  NOTE(review): the roles of OUTPUT_BFD and ERROR_MESSAGE are not visible here - see the BFD manual.  */
+bfd_reloc_status_type bfd_perform_relocation
+   (bfd *abfd,
+    arelent *reloc_entry,
+    void *data,
+    asection *input_section,
+    bfd *output_bfd,
+    char **error_message);
+/* Install the relocation RELOC_ENTRY.  NOTE(review): takes DATA_START and no OUTPUT_BFD, unlike bfd_perform_relocation; semantics not visible here.  */
+bfd_reloc_status_type bfd_install_relocation
+   (bfd *abfd,
+    arelent *reloc_entry,
+    void *data, bfd_vma data_start,
+    asection *input_section,
+    char **error_message);
+
+enum bfd_reloc_code_real {
+  _dummy_first_bfd_reloc_code_real,
+
+
+/* Basic absolute relocations of N bits.  */
+  BFD_RELOC_64,
+  BFD_RELOC_32,
+  BFD_RELOC_26,
+  BFD_RELOC_24,
+  BFD_RELOC_16,
+  BFD_RELOC_14,
+  BFD_RELOC_8,
+
+/* PC-relative relocations.  Sometimes these are relative to the address
+of the relocation itself; sometimes they are relative to the start of
+the section containing the relocation.  It depends on the specific target.
+
+The 24-bit relocation is used in some Intel 960 configurations.  */
+  BFD_RELOC_64_PCREL,
+  BFD_RELOC_32_PCREL,
+  BFD_RELOC_24_PCREL,
+  BFD_RELOC_16_PCREL,
+  BFD_RELOC_12_PCREL,
+  BFD_RELOC_8_PCREL,
+
+/* Section relative relocations.  Some targets need this for DWARF2.  */
+  BFD_RELOC_32_SECREL,
+
+/* For ELF.  */
+  BFD_RELOC_32_GOT_PCREL,
+  BFD_RELOC_16_GOT_PCREL,
+  BFD_RELOC_8_GOT_PCREL,
+  BFD_RELOC_32_GOTOFF,
+  BFD_RELOC_16_GOTOFF,
+  BFD_RELOC_LO16_GOTOFF,
+  BFD_RELOC_HI16_GOTOFF,
+  BFD_RELOC_HI16_S_GOTOFF,
+  BFD_RELOC_8_GOTOFF,
+  BFD_RELOC_64_PLT_PCREL,
+  BFD_RELOC_32_PLT_PCREL,
+  BFD_RELOC_24_PLT_PCREL,
+  BFD_RELOC_16_PLT_PCREL,
+  BFD_RELOC_8_PLT_PCREL,
+  BFD_RELOC_64_PLTOFF,
+  BFD_RELOC_32_PLTOFF,
+  BFD_RELOC_16_PLTOFF,
+  BFD_RELOC_LO16_PLTOFF,
+  BFD_RELOC_HI16_PLTOFF,
+  BFD_RELOC_HI16_S_PLTOFF,
+  BFD_RELOC_8_PLTOFF,
+
+/* Size relocations.  */
+  BFD_RELOC_SIZE32,
+  BFD_RELOC_SIZE64,
+
+/* Relocations used by 68K ELF.  */
+  BFD_RELOC_68K_GLOB_DAT,
+  BFD_RELOC_68K_JMP_SLOT,
+  BFD_RELOC_68K_RELATIVE,
+  BFD_RELOC_68K_TLS_GD32,
+  BFD_RELOC_68K_TLS_GD16,
+  BFD_RELOC_68K_TLS_GD8,
+  BFD_RELOC_68K_TLS_LDM32,
+  BFD_RELOC_68K_TLS_LDM16,
+  BFD_RELOC_68K_TLS_LDM8,
+  BFD_RELOC_68K_TLS_LDO32,
+  BFD_RELOC_68K_TLS_LDO16,
+  BFD_RELOC_68K_TLS_LDO8,
+  BFD_RELOC_68K_TLS_IE32,
+  BFD_RELOC_68K_TLS_IE16,
+  BFD_RELOC_68K_TLS_IE8,
+  BFD_RELOC_68K_TLS_LE32,
+  BFD_RELOC_68K_TLS_LE16,
+  BFD_RELOC_68K_TLS_LE8,
+
+/* Linkage-table relative.  */
+  BFD_RELOC_32_BASEREL,
+  BFD_RELOC_16_BASEREL,
+  BFD_RELOC_LO16_BASEREL,
+  BFD_RELOC_HI16_BASEREL,
+  BFD_RELOC_HI16_S_BASEREL,
+  BFD_RELOC_8_BASEREL,
+  BFD_RELOC_RVA,
+
+/* Absolute 8-bit relocation, but used to form an address like 0xFFnn.  */
+  BFD_RELOC_8_FFnn,
+
+/* These PC-relative relocations are stored as word displacements --
+i.e., byte displacements shifted right two bits.  The 30-bit word
+displacement (<<32_PCREL_S2>> -- 32 bits, shifted 2) is used on the
+SPARC.  (SPARC tools generally refer to this as <<WDISP30>>.)  The
+signed 16-bit displacement is used on the MIPS, and the 23-bit
+displacement is used on the Alpha.  */
+  BFD_RELOC_32_PCREL_S2,
+  BFD_RELOC_16_PCREL_S2,
+  BFD_RELOC_23_PCREL_S2,
+
+/* High 22 bits and low 10 bits of 32-bit value, placed into lower bits of
+the target word.  These are used on the SPARC.  */
+  BFD_RELOC_HI22,
+  BFD_RELOC_LO10,
+
+/* For systems that allocate a Global Pointer register, these are
+displacements off that register.  These relocation types are
+handled specially, because the value the register will have is
+decided relatively late.  */
+  BFD_RELOC_GPREL16,
+  BFD_RELOC_GPREL32,
+
+/* Reloc types used for i960/b.out.  */
+  BFD_RELOC_I960_CALLJ,
+
+/* SPARC ELF relocations.  There is probably some overlap with other
+relocation types already defined.  */
+  BFD_RELOC_NONE,
+  BFD_RELOC_SPARC_WDISP22,
+  BFD_RELOC_SPARC22,
+  BFD_RELOC_SPARC13,
+  BFD_RELOC_SPARC_GOT10,
+  BFD_RELOC_SPARC_GOT13,
+  BFD_RELOC_SPARC_GOT22,
+  BFD_RELOC_SPARC_PC10,
+  BFD_RELOC_SPARC_PC22,
+  BFD_RELOC_SPARC_WPLT30,
+  BFD_RELOC_SPARC_COPY,
+  BFD_RELOC_SPARC_GLOB_DAT,
+  BFD_RELOC_SPARC_JMP_SLOT,
+  BFD_RELOC_SPARC_RELATIVE,
+  BFD_RELOC_SPARC_UA16,
+  BFD_RELOC_SPARC_UA32,
+  BFD_RELOC_SPARC_UA64,
+  BFD_RELOC_SPARC_GOTDATA_HIX22,
+  BFD_RELOC_SPARC_GOTDATA_LOX10,
+  BFD_RELOC_SPARC_GOTDATA_OP_HIX22,
+  BFD_RELOC_SPARC_GOTDATA_OP_LOX10,
+  BFD_RELOC_SPARC_GOTDATA_OP,
+  BFD_RELOC_SPARC_JMP_IREL,
+  BFD_RELOC_SPARC_IRELATIVE,
+
+/* I think these are specific to SPARC a.out (e.g., Sun 4).  */
+  BFD_RELOC_SPARC_BASE13,
+  BFD_RELOC_SPARC_BASE22,
+
+/* SPARC64 relocations  */
+#define BFD_RELOC_SPARC_64 BFD_RELOC_64
+  BFD_RELOC_SPARC_10,
+  BFD_RELOC_SPARC_11,
+  BFD_RELOC_SPARC_OLO10,
+  BFD_RELOC_SPARC_HH22,
+  BFD_RELOC_SPARC_HM10,
+  BFD_RELOC_SPARC_LM22,
+  BFD_RELOC_SPARC_PC_HH22,
+  BFD_RELOC_SPARC_PC_HM10,
+  BFD_RELOC_SPARC_PC_LM22,
+  BFD_RELOC_SPARC_WDISP16,
+  BFD_RELOC_SPARC_WDISP19,
+  BFD_RELOC_SPARC_7,
+  BFD_RELOC_SPARC_6,
+  BFD_RELOC_SPARC_5,
+#define BFD_RELOC_SPARC_DISP64 BFD_RELOC_64_PCREL
+  BFD_RELOC_SPARC_PLT32,
+  BFD_RELOC_SPARC_PLT64,
+  BFD_RELOC_SPARC_HIX22,
+  BFD_RELOC_SPARC_LOX10,
+  BFD_RELOC_SPARC_H44,
+  BFD_RELOC_SPARC_M44,
+  BFD_RELOC_SPARC_L44,
+  BFD_RELOC_SPARC_REGISTER,
+  BFD_RELOC_SPARC_H34,
+  BFD_RELOC_SPARC_SIZE32,
+  BFD_RELOC_SPARC_SIZE64,
+  BFD_RELOC_SPARC_WDISP10,
+
+/* SPARC little endian relocation  */
+  BFD_RELOC_SPARC_REV32,
+
+/* SPARC TLS relocations  */
+  BFD_RELOC_SPARC_TLS_GD_HI22,
+  BFD_RELOC_SPARC_TLS_GD_LO10,
+  BFD_RELOC_SPARC_TLS_GD_ADD,
+  BFD_RELOC_SPARC_TLS_GD_CALL,
+  BFD_RELOC_SPARC_TLS_LDM_HI22,
+  BFD_RELOC_SPARC_TLS_LDM_LO10,
+  BFD_RELOC_SPARC_TLS_LDM_ADD,
+  BFD_RELOC_SPARC_TLS_LDM_CALL,
+  BFD_RELOC_SPARC_TLS_LDO_HIX22,
+  BFD_RELOC_SPARC_TLS_LDO_LOX10,
+  BFD_RELOC_SPARC_TLS_LDO_ADD,
+  BFD_RELOC_SPARC_TLS_IE_HI22,
+  BFD_RELOC_SPARC_TLS_IE_LO10,
+  BFD_RELOC_SPARC_TLS_IE_LD,
+  BFD_RELOC_SPARC_TLS_IE_LDX,
+  BFD_RELOC_SPARC_TLS_IE_ADD,
+  BFD_RELOC_SPARC_TLS_LE_HIX22,
+  BFD_RELOC_SPARC_TLS_LE_LOX10,
+  BFD_RELOC_SPARC_TLS_DTPMOD32,
+  BFD_RELOC_SPARC_TLS_DTPMOD64,
+  BFD_RELOC_SPARC_TLS_DTPOFF32,
+  BFD_RELOC_SPARC_TLS_DTPOFF64,
+  BFD_RELOC_SPARC_TLS_TPOFF32,
+  BFD_RELOC_SPARC_TLS_TPOFF64,
+
+/* SPU Relocations.  */
+  BFD_RELOC_SPU_IMM7,
+  BFD_RELOC_SPU_IMM8,
+  BFD_RELOC_SPU_IMM10,
+  BFD_RELOC_SPU_IMM10W,
+  BFD_RELOC_SPU_IMM16,
+  BFD_RELOC_SPU_IMM16W,
+  BFD_RELOC_SPU_IMM18,
+  BFD_RELOC_SPU_PCREL9a,
+  BFD_RELOC_SPU_PCREL9b,
+  BFD_RELOC_SPU_PCREL16,
+  BFD_RELOC_SPU_LO16,
+  BFD_RELOC_SPU_HI16,
+  BFD_RELOC_SPU_PPU32,
+  BFD_RELOC_SPU_PPU64,
+  BFD_RELOC_SPU_ADD_PIC,
+
+/* Alpha ECOFF and ELF relocations.  Some of these treat the symbol or
+"addend" in some special way.
+For GPDISP_HI16 ("gpdisp") relocations, the symbol is ignored when
+writing; when reading, it will be the absolute section symbol.  The
+addend is the displacement in bytes of the "lda" instruction from
+the "ldah" instruction (which is at the address of this reloc).  */
+  BFD_RELOC_ALPHA_GPDISP_HI16,
+
+/* For GPDISP_LO16 ("ignore") relocations, the symbol is handled as
+with GPDISP_HI16 relocs.  The addend is ignored when writing the
+relocations out, and is filled in with the file's GP value on
+reading, for convenience.  */
+  BFD_RELOC_ALPHA_GPDISP_LO16,
+
+/* The ELF GPDISP relocation is exactly the same as the GPDISP_HI16
+relocation except that there is no accompanying GPDISP_LO16
+relocation.  */
+  BFD_RELOC_ALPHA_GPDISP,
+
+/* The Alpha LITERAL/LITUSE relocs are produced by a symbol reference;
+the assembler turns it into a LDQ instruction to load the address of
+the symbol, and then fills in a register in the real instruction.
+
+The LITERAL reloc, at the LDQ instruction, refers to the .lita
+section symbol.  The addend is ignored when writing, but is filled
+in with the file's GP value on reading, for convenience, as with the
+GPDISP_LO16 reloc.
+
+The ELF_LITERAL reloc is somewhere between 16_GOTOFF and GPDISP_LO16.
+It should refer to the symbol to be referenced, as with 16_GOTOFF,
+but it generates output not based on the position within the .got
+section, but relative to the GP value chosen for the file during the
+final link stage.
+
+The LITUSE reloc, on the instruction using the loaded address, gives
+information to the linker that it might be able to use to optimize
+away some literal section references.  The symbol is ignored (read
+as the absolute section symbol), and the "addend" indicates the type
+of instruction using the register:
+1 - "memory" fmt insn
+2 - byte-manipulation (byte offset reg)
+3 - jsr (target of branch)  */
+  BFD_RELOC_ALPHA_LITERAL,
+  BFD_RELOC_ALPHA_ELF_LITERAL,
+  BFD_RELOC_ALPHA_LITUSE,
+
+/* The HINT relocation indicates a value that should be filled into the
+"hint" field of a jmp/jsr/ret instruction, for possible branch-
+prediction logic which may be provided on some processors.  */
+  BFD_RELOC_ALPHA_HINT,
+
+/* The LINKAGE relocation outputs a linkage pair in the object file,
+which is filled by the linker.  */
+  BFD_RELOC_ALPHA_LINKAGE,
+
+/* The CODEADDR relocation outputs a STO_CA in the object file,
+which is filled by the linker.  */
+  BFD_RELOC_ALPHA_CODEADDR,
+
+/* The GPREL_HI/LO relocations together form a 32-bit offset from the
+GP register.  */
+  BFD_RELOC_ALPHA_GPREL_HI16,
+  BFD_RELOC_ALPHA_GPREL_LO16,
+
+/* Like BFD_RELOC_23_PCREL_S2, except that the source and target must
+share a common GP, and the target address is adjusted for
+STO_ALPHA_STD_GPLOAD.  */
+  BFD_RELOC_ALPHA_BRSGP,
+
+/* The NOP relocation outputs a NOP if the longword displacement
+between two procedure entry points is < 2^21.  */
+  BFD_RELOC_ALPHA_NOP,
+
+/* The BSR relocation outputs a BSR if the longword displacement
+between two procedure entry points is < 2^21.  */
+  BFD_RELOC_ALPHA_BSR,
+
+/* The LDA relocation outputs a LDA if the longword displacement
+between two procedure entry points is < 2^16.  */
+  BFD_RELOC_ALPHA_LDA,
+
+/* The BOH relocation outputs a BSR if the longword displacement
+between two procedure entry points is < 2^21, or else a hint.  */
+  BFD_RELOC_ALPHA_BOH,
+
+/* Alpha thread-local storage relocations.  */
+  BFD_RELOC_ALPHA_TLSGD,
+  BFD_RELOC_ALPHA_TLSLDM,
+  BFD_RELOC_ALPHA_DTPMOD64,
+  BFD_RELOC_ALPHA_GOTDTPREL16,
+  BFD_RELOC_ALPHA_DTPREL64,
+  BFD_RELOC_ALPHA_DTPREL_HI16,
+  BFD_RELOC_ALPHA_DTPREL_LO16,
+  BFD_RELOC_ALPHA_DTPREL16,
+  BFD_RELOC_ALPHA_GOTTPREL16,
+  BFD_RELOC_ALPHA_TPREL64,
+  BFD_RELOC_ALPHA_TPREL_HI16,
+  BFD_RELOC_ALPHA_TPREL_LO16,
+  BFD_RELOC_ALPHA_TPREL16,
+
+/* The MIPS jump instruction.  */
+  BFD_RELOC_MIPS_JMP,
+  BFD_RELOC_MICROMIPS_JMP,
+
+/* The MIPS16 jump instruction.  */
+  BFD_RELOC_MIPS16_JMP,
+
+/* MIPS16 GP relative reloc.  */
+  BFD_RELOC_MIPS16_GPREL,
+
+/* High 16 bits of 32-bit value; simple reloc.  */
+  BFD_RELOC_HI16,
+
+/* High 16 bits of 32-bit value but the low 16 bits will be sign
+extended and added to form the final result.  If the low 16
+bits form a negative number, we need to add one to the high value
+to compensate for the borrow when the low bits are added.  */
+  BFD_RELOC_HI16_S,
+
+/* Low 16 bits.  */
+  BFD_RELOC_LO16,
+
+/* High 16 bits of 32-bit pc-relative value  */
+  BFD_RELOC_HI16_PCREL,
+
+/* High 16 bits of 32-bit pc-relative value, adjusted  */
+  BFD_RELOC_HI16_S_PCREL,
+
+/* Low 16 bits of pc-relative value  */
+  BFD_RELOC_LO16_PCREL,
+
+/* Equivalent of BFD_RELOC_MIPS_*, but with the MIPS16 layout of
+16-bit immediate fields  */
+  BFD_RELOC_MIPS16_GOT16,
+  BFD_RELOC_MIPS16_CALL16,
+
+/* MIPS16 high 16 bits of 32-bit value.  */
+  BFD_RELOC_MIPS16_HI16,
+
+/* MIPS16 high 16 bits of 32-bit value but the low 16 bits will be sign
+extended and added to form the final result.  If the low 16
+bits form a negative number, we need to add one to the high value
+to compensate for the borrow when the low bits are added.  */
+  BFD_RELOC_MIPS16_HI16_S,
+
+/* MIPS16 low 16 bits.  */
+  BFD_RELOC_MIPS16_LO16,
+
+/* MIPS16 TLS relocations  */
+  BFD_RELOC_MIPS16_TLS_GD,
+  BFD_RELOC_MIPS16_TLS_LDM,
+  BFD_RELOC_MIPS16_TLS_DTPREL_HI16,
+  BFD_RELOC_MIPS16_TLS_DTPREL_LO16,
+  BFD_RELOC_MIPS16_TLS_GOTTPREL,
+  BFD_RELOC_MIPS16_TLS_TPREL_HI16,
+  BFD_RELOC_MIPS16_TLS_TPREL_LO16,
+
+/* Relocation against a MIPS literal section.  */
+  BFD_RELOC_MIPS_LITERAL,
+  BFD_RELOC_MICROMIPS_LITERAL,
+
+/* microMIPS PC-relative relocations.  */
+  BFD_RELOC_MICROMIPS_7_PCREL_S1,
+  BFD_RELOC_MICROMIPS_10_PCREL_S1,
+  BFD_RELOC_MICROMIPS_16_PCREL_S1,
+
+/* MIPS16 PC-relative relocation.  */
+  BFD_RELOC_MIPS16_16_PCREL_S1,
+
+/* MIPS PC-relative relocations.  */
+  BFD_RELOC_MIPS_21_PCREL_S2,
+  BFD_RELOC_MIPS_26_PCREL_S2,
+  BFD_RELOC_MIPS_18_PCREL_S3,
+  BFD_RELOC_MIPS_19_PCREL_S2,
+
+/* microMIPS versions of generic BFD relocs.  */
+  BFD_RELOC_MICROMIPS_GPREL16,
+  BFD_RELOC_MICROMIPS_HI16,
+  BFD_RELOC_MICROMIPS_HI16_S,
+  BFD_RELOC_MICROMIPS_LO16,
+
+/* MIPS ELF relocations.  */
+  BFD_RELOC_MIPS_GOT16,
+  BFD_RELOC_MICROMIPS_GOT16,
+  BFD_RELOC_MIPS_CALL16,
+  BFD_RELOC_MICROMIPS_CALL16,
+  BFD_RELOC_MIPS_GOT_HI16,
+  BFD_RELOC_MICROMIPS_GOT_HI16,
+  BFD_RELOC_MIPS_GOT_LO16,
+  BFD_RELOC_MICROMIPS_GOT_LO16,
+  BFD_RELOC_MIPS_CALL_HI16,
+  BFD_RELOC_MICROMIPS_CALL_HI16,
+  BFD_RELOC_MIPS_CALL_LO16,
+  BFD_RELOC_MICROMIPS_CALL_LO16,
+  BFD_RELOC_MIPS_SUB,
+  BFD_RELOC_MICROMIPS_SUB,
+  BFD_RELOC_MIPS_GOT_PAGE,
+  BFD_RELOC_MICROMIPS_GOT_PAGE,
+  BFD_RELOC_MIPS_GOT_OFST,
+  BFD_RELOC_MICROMIPS_GOT_OFST,
+  BFD_RELOC_MIPS_GOT_DISP,
+  BFD_RELOC_MICROMIPS_GOT_DISP,
+  BFD_RELOC_MIPS_SHIFT5,
+  BFD_RELOC_MIPS_SHIFT6,
+  BFD_RELOC_MIPS_INSERT_A,
+  BFD_RELOC_MIPS_INSERT_B,
+  BFD_RELOC_MIPS_DELETE,
+  BFD_RELOC_MIPS_HIGHEST,
+  BFD_RELOC_MICROMIPS_HIGHEST,
+  BFD_RELOC_MIPS_HIGHER,
+  BFD_RELOC_MICROMIPS_HIGHER,
+  BFD_RELOC_MIPS_SCN_DISP,
+  BFD_RELOC_MICROMIPS_SCN_DISP,
+  BFD_RELOC_MIPS_REL16,
+  BFD_RELOC_MIPS_RELGOT,
+  BFD_RELOC_MIPS_JALR,
+  BFD_RELOC_MICROMIPS_JALR,
+  BFD_RELOC_MIPS_TLS_DTPMOD32,
+  BFD_RELOC_MIPS_TLS_DTPREL32,
+  BFD_RELOC_MIPS_TLS_DTPMOD64,
+  BFD_RELOC_MIPS_TLS_DTPREL64,
+  BFD_RELOC_MIPS_TLS_GD,
+  BFD_RELOC_MICROMIPS_TLS_GD,
+  BFD_RELOC_MIPS_TLS_LDM,
+  BFD_RELOC_MICROMIPS_TLS_LDM,
+  BFD_RELOC_MIPS_TLS_DTPREL_HI16,
+  BFD_RELOC_MICROMIPS_TLS_DTPREL_HI16,
+  BFD_RELOC_MIPS_TLS_DTPREL_LO16,
+  BFD_RELOC_MICROMIPS_TLS_DTPREL_LO16,
+  BFD_RELOC_MIPS_TLS_GOTTPREL,
+  BFD_RELOC_MICROMIPS_TLS_GOTTPREL,
+  BFD_RELOC_MIPS_TLS_TPREL32,
+  BFD_RELOC_MIPS_TLS_TPREL64,
+  BFD_RELOC_MIPS_TLS_TPREL_HI16,
+  BFD_RELOC_MICROMIPS_TLS_TPREL_HI16,
+  BFD_RELOC_MIPS_TLS_TPREL_LO16,
+  BFD_RELOC_MICROMIPS_TLS_TPREL_LO16,
+  BFD_RELOC_MIPS_EH,
+
+
+/* MIPS ELF relocations (VxWorks and PLT extensions).  */
+  BFD_RELOC_MIPS_COPY,
+  BFD_RELOC_MIPS_JUMP_SLOT,
+
+
+/* Moxie ELF relocations.  */
+  BFD_RELOC_MOXIE_10_PCREL,
+
+
+/* FT32 ELF relocations.  */
+  BFD_RELOC_FT32_10,
+  BFD_RELOC_FT32_20,
+  BFD_RELOC_FT32_17,
+  BFD_RELOC_FT32_18,
+
+
+/* Fujitsu Frv Relocations.  */
+  BFD_RELOC_FRV_LABEL16,
+  BFD_RELOC_FRV_LABEL24,
+  BFD_RELOC_FRV_LO16,
+  BFD_RELOC_FRV_HI16,
+  BFD_RELOC_FRV_GPREL12,
+  BFD_RELOC_FRV_GPRELU12,
+  BFD_RELOC_FRV_GPREL32,
+  BFD_RELOC_FRV_GPRELHI,
+  BFD_RELOC_FRV_GPRELLO,
+  BFD_RELOC_FRV_GOT12,
+  BFD_RELOC_FRV_GOTHI,
+  BFD_RELOC_FRV_GOTLO,
+  BFD_RELOC_FRV_FUNCDESC,
+  BFD_RELOC_FRV_FUNCDESC_GOT12,
+  BFD_RELOC_FRV_FUNCDESC_GOTHI,
+  BFD_RELOC_FRV_FUNCDESC_GOTLO,
+  BFD_RELOC_FRV_FUNCDESC_VALUE,
+  BFD_RELOC_FRV_FUNCDESC_GOTOFF12,
+  BFD_RELOC_FRV_FUNCDESC_GOTOFFHI,
+  BFD_RELOC_FRV_FUNCDESC_GOTOFFLO,
+  BFD_RELOC_FRV_GOTOFF12,
+  BFD_RELOC_FRV_GOTOFFHI,
+  BFD_RELOC_FRV_GOTOFFLO,
+  BFD_RELOC_FRV_GETTLSOFF,
+  BFD_RELOC_FRV_TLSDESC_VALUE,
+  BFD_RELOC_FRV_GOTTLSDESC12,
+  BFD_RELOC_FRV_GOTTLSDESCHI,
+  BFD_RELOC_FRV_GOTTLSDESCLO,
+  BFD_RELOC_FRV_TLSMOFF12,
+  BFD_RELOC_FRV_TLSMOFFHI,
+  BFD_RELOC_FRV_TLSMOFFLO,
+  BFD_RELOC_FRV_GOTTLSOFF12,
+  BFD_RELOC_FRV_GOTTLSOFFHI,
+  BFD_RELOC_FRV_GOTTLSOFFLO,
+  BFD_RELOC_FRV_TLSOFF,
+  BFD_RELOC_FRV_TLSDESC_RELAX,
+  BFD_RELOC_FRV_GETTLSOFF_RELAX,
+  BFD_RELOC_FRV_TLSOFF_RELAX,
+  BFD_RELOC_FRV_TLSMOFF,
+
+
+/* This is a 24bit GOT-relative reloc for the mn10300.  */
+  BFD_RELOC_MN10300_GOTOFF24,
+
+/* This is a 32bit GOT-relative reloc for the mn10300, offset by two bytes
+in the instruction.  */
+  BFD_RELOC_MN10300_GOT32,
+
+/* This is a 24bit GOT-relative reloc for the mn10300, offset by two bytes
+in the instruction.  */
+  BFD_RELOC_MN10300_GOT24,
+
+/* This is a 16bit GOT-relative reloc for the mn10300, offset by two bytes
+in the instruction.  */
+  BFD_RELOC_MN10300_GOT16,
+
+/* Copy symbol at runtime.  */
+  BFD_RELOC_MN10300_COPY,
+
+/* Create GOT entry.  */
+  BFD_RELOC_MN10300_GLOB_DAT,
+
+/* Create PLT entry.  */
+  BFD_RELOC_MN10300_JMP_SLOT,
+
+/* Adjust by program base.  */
+  BFD_RELOC_MN10300_RELATIVE,
+
+/* Together with another reloc targeted at the same location,
+allows for a value that is the difference of two symbols
+in the same section.  */
+  BFD_RELOC_MN10300_SYM_DIFF,
+
+/* The addend of this reloc is an alignment power that must
+be honoured at the offset's location, regardless of linker
+relaxation.  */
+  BFD_RELOC_MN10300_ALIGN,
+
+/* Various TLS-related relocations.  */
+  BFD_RELOC_MN10300_TLS_GD,
+  BFD_RELOC_MN10300_TLS_LD,
+  BFD_RELOC_MN10300_TLS_LDO,
+  BFD_RELOC_MN10300_TLS_GOTIE,
+  BFD_RELOC_MN10300_TLS_IE,
+  BFD_RELOC_MN10300_TLS_LE,
+  BFD_RELOC_MN10300_TLS_DTPMOD,
+  BFD_RELOC_MN10300_TLS_DTPOFF,
+  BFD_RELOC_MN10300_TLS_TPOFF,
+
+/* This is a 32bit pcrel reloc for the mn10300, offset by two bytes in the
+instruction.  */
+  BFD_RELOC_MN10300_32_PCREL,
+
+/* This is a 16bit pcrel reloc for the mn10300, offset by two bytes in the
+instruction.  */
+  BFD_RELOC_MN10300_16_PCREL,
+
+
+/* i386/elf relocations  */
+  BFD_RELOC_386_GOT32,
+  BFD_RELOC_386_PLT32,
+  BFD_RELOC_386_COPY,
+  BFD_RELOC_386_GLOB_DAT,
+  BFD_RELOC_386_JUMP_SLOT,
+  BFD_RELOC_386_RELATIVE,
+  BFD_RELOC_386_GOTOFF,
+  BFD_RELOC_386_GOTPC,
+  BFD_RELOC_386_TLS_TPOFF,
+  BFD_RELOC_386_TLS_IE,
+  BFD_RELOC_386_TLS_GOTIE,
+  BFD_RELOC_386_TLS_LE,
+  BFD_RELOC_386_TLS_GD,
+  BFD_RELOC_386_TLS_LDM,
+  BFD_RELOC_386_TLS_LDO_32,
+  BFD_RELOC_386_TLS_IE_32,
+  BFD_RELOC_386_TLS_LE_32,
+  BFD_RELOC_386_TLS_DTPMOD32,
+  BFD_RELOC_386_TLS_DTPOFF32,
+  BFD_RELOC_386_TLS_TPOFF32,
+  BFD_RELOC_386_TLS_GOTDESC,
+  BFD_RELOC_386_TLS_DESC_CALL,
+  BFD_RELOC_386_TLS_DESC,
+  BFD_RELOC_386_IRELATIVE,
+  BFD_RELOC_386_GOT32X,
+
+/* x86-64/elf relocations  */
+  BFD_RELOC_X86_64_GOT32,
+  BFD_RELOC_X86_64_PLT32,
+  BFD_RELOC_X86_64_COPY,
+  BFD_RELOC_X86_64_GLOB_DAT,
+  BFD_RELOC_X86_64_JUMP_SLOT,
+  BFD_RELOC_X86_64_RELATIVE,
+  BFD_RELOC_X86_64_GOTPCREL,
+  BFD_RELOC_X86_64_32S,
+  BFD_RELOC_X86_64_DTPMOD64,
+  BFD_RELOC_X86_64_DTPOFF64,
+  BFD_RELOC_X86_64_TPOFF64,
+  BFD_RELOC_X86_64_TLSGD,
+  BFD_RELOC_X86_64_TLSLD,
+  BFD_RELOC_X86_64_DTPOFF32,
+  BFD_RELOC_X86_64_GOTTPOFF,
+  BFD_RELOC_X86_64_TPOFF32,
+  BFD_RELOC_X86_64_GOTOFF64,
+  BFD_RELOC_X86_64_GOTPC32,
+  BFD_RELOC_X86_64_GOT64,
+  BFD_RELOC_X86_64_GOTPCREL64,
+  BFD_RELOC_X86_64_GOTPC64,
+  BFD_RELOC_X86_64_GOTPLT64,
+  BFD_RELOC_X86_64_PLTOFF64,
+  BFD_RELOC_X86_64_GOTPC32_TLSDESC,
+  BFD_RELOC_X86_64_TLSDESC_CALL,
+  BFD_RELOC_X86_64_TLSDESC,
+  BFD_RELOC_X86_64_IRELATIVE,
+  BFD_RELOC_X86_64_PC32_BND,
+  BFD_RELOC_X86_64_PLT32_BND,
+  BFD_RELOC_X86_64_GOTPCRELX,
+  BFD_RELOC_X86_64_REX_GOTPCRELX,
+
+/* ns32k relocations  */
+  BFD_RELOC_NS32K_IMM_8,
+  BFD_RELOC_NS32K_IMM_16,
+  BFD_RELOC_NS32K_IMM_32,
+  BFD_RELOC_NS32K_IMM_8_PCREL,
+  BFD_RELOC_NS32K_IMM_16_PCREL,
+  BFD_RELOC_NS32K_IMM_32_PCREL,
+  BFD_RELOC_NS32K_DISP_8,
+  BFD_RELOC_NS32K_DISP_16,
+  BFD_RELOC_NS32K_DISP_32,
+  BFD_RELOC_NS32K_DISP_8_PCREL,
+  BFD_RELOC_NS32K_DISP_16_PCREL,
+  BFD_RELOC_NS32K_DISP_32_PCREL,
+
+/* PDP11 relocations  */
+  BFD_RELOC_PDP11_DISP_8_PCREL,
+  BFD_RELOC_PDP11_DISP_6_PCREL,
+
+/* Picojava relocs.  Not all of these appear in object files.  */
+  BFD_RELOC_PJ_CODE_HI16,
+  BFD_RELOC_PJ_CODE_LO16,
+  BFD_RELOC_PJ_CODE_DIR16,
+  BFD_RELOC_PJ_CODE_DIR32,
+  BFD_RELOC_PJ_CODE_REL16,
+  BFD_RELOC_PJ_CODE_REL32,
+
+/* Power(rs6000) and PowerPC relocations.  */
+  BFD_RELOC_PPC_B26,
+  BFD_RELOC_PPC_BA26,
+  BFD_RELOC_PPC_TOC16,
+  BFD_RELOC_PPC_B16,
+  BFD_RELOC_PPC_B16_BRTAKEN,
+  BFD_RELOC_PPC_B16_BRNTAKEN,
+  BFD_RELOC_PPC_BA16,
+  BFD_RELOC_PPC_BA16_BRTAKEN,
+  BFD_RELOC_PPC_BA16_BRNTAKEN,
+  BFD_RELOC_PPC_COPY,
+  BFD_RELOC_PPC_GLOB_DAT,
+  BFD_RELOC_PPC_JMP_SLOT,
+  BFD_RELOC_PPC_RELATIVE,
+  BFD_RELOC_PPC_LOCAL24PC,
+  BFD_RELOC_PPC_EMB_NADDR32,
+  BFD_RELOC_PPC_EMB_NADDR16,
+  BFD_RELOC_PPC_EMB_NADDR16_LO,
+  BFD_RELOC_PPC_EMB_NADDR16_HI,
+  BFD_RELOC_PPC_EMB_NADDR16_HA,
+  BFD_RELOC_PPC_EMB_SDAI16,
+  BFD_RELOC_PPC_EMB_SDA2I16,
+  BFD_RELOC_PPC_EMB_SDA2REL,
+  BFD_RELOC_PPC_EMB_SDA21,
+  BFD_RELOC_PPC_EMB_MRKREF,
+  BFD_RELOC_PPC_EMB_RELSEC16,
+  BFD_RELOC_PPC_EMB_RELST_LO,
+  BFD_RELOC_PPC_EMB_RELST_HI,
+  BFD_RELOC_PPC_EMB_RELST_HA,
+  BFD_RELOC_PPC_EMB_BIT_FLD,
+  BFD_RELOC_PPC_EMB_RELSDA,
+  BFD_RELOC_PPC_VLE_REL8,
+  BFD_RELOC_PPC_VLE_REL15,
+  BFD_RELOC_PPC_VLE_REL24,
+  BFD_RELOC_PPC_VLE_LO16A,
+  BFD_RELOC_PPC_VLE_LO16D,
+  BFD_RELOC_PPC_VLE_HI16A,
+  BFD_RELOC_PPC_VLE_HI16D,
+  BFD_RELOC_PPC_VLE_HA16A,
+  BFD_RELOC_PPC_VLE_HA16D,
+  BFD_RELOC_PPC_VLE_SDA21,
+  BFD_RELOC_PPC_VLE_SDA21_LO,
+  BFD_RELOC_PPC_VLE_SDAREL_LO16A,
+  BFD_RELOC_PPC_VLE_SDAREL_LO16D,
+  BFD_RELOC_PPC_VLE_SDAREL_HI16A,
+  BFD_RELOC_PPC_VLE_SDAREL_HI16D,
+  BFD_RELOC_PPC_VLE_SDAREL_HA16A,
+  BFD_RELOC_PPC_VLE_SDAREL_HA16D,
+  BFD_RELOC_PPC_16DX_HA,
+  BFD_RELOC_PPC_REL16DX_HA,
+  BFD_RELOC_PPC64_HIGHER,
+  BFD_RELOC_PPC64_HIGHER_S,
+  BFD_RELOC_PPC64_HIGHEST,
+  BFD_RELOC_PPC64_HIGHEST_S,
+  BFD_RELOC_PPC64_TOC16_LO,
+  BFD_RELOC_PPC64_TOC16_HI,
+  BFD_RELOC_PPC64_TOC16_HA,
+  BFD_RELOC_PPC64_TOC,
+  BFD_RELOC_PPC64_PLTGOT16,
+  BFD_RELOC_PPC64_PLTGOT16_LO,
+  BFD_RELOC_PPC64_PLTGOT16_HI,
+  BFD_RELOC_PPC64_PLTGOT16_HA,
+  BFD_RELOC_PPC64_ADDR16_DS,
+  BFD_RELOC_PPC64_ADDR16_LO_DS,
+  BFD_RELOC_PPC64_GOT16_DS,
+  BFD_RELOC_PPC64_GOT16_LO_DS,
+  BFD_RELOC_PPC64_PLT16_LO_DS,
+  BFD_RELOC_PPC64_SECTOFF_DS,
+  BFD_RELOC_PPC64_SECTOFF_LO_DS,
+  BFD_RELOC_PPC64_TOC16_DS,
+  BFD_RELOC_PPC64_TOC16_LO_DS,
+  BFD_RELOC_PPC64_PLTGOT16_DS,
+  BFD_RELOC_PPC64_PLTGOT16_LO_DS,
+  BFD_RELOC_PPC64_ADDR16_HIGH,
+  BFD_RELOC_PPC64_ADDR16_HIGHA,
+  BFD_RELOC_PPC64_ADDR64_LOCAL,
+  BFD_RELOC_PPC64_ENTRY,
+
+/* PowerPC and PowerPC64 thread-local storage relocations.  */
+  BFD_RELOC_PPC_TLS,
+  BFD_RELOC_PPC_TLSGD,
+  BFD_RELOC_PPC_TLSLD,
+  BFD_RELOC_PPC_DTPMOD,
+  BFD_RELOC_PPC_TPREL16,
+  BFD_RELOC_PPC_TPREL16_LO,
+  BFD_RELOC_PPC_TPREL16_HI,
+  BFD_RELOC_PPC_TPREL16_HA,
+  BFD_RELOC_PPC_TPREL,
+  BFD_RELOC_PPC_DTPREL16,
+  BFD_RELOC_PPC_DTPREL16_LO,
+  BFD_RELOC_PPC_DTPREL16_HI,
+  BFD_RELOC_PPC_DTPREL16_HA,
+  BFD_RELOC_PPC_DTPREL,
+  BFD_RELOC_PPC_GOT_TLSGD16,
+  BFD_RELOC_PPC_GOT_TLSGD16_LO,
+  BFD_RELOC_PPC_GOT_TLSGD16_HI,
+  BFD_RELOC_PPC_GOT_TLSGD16_HA,
+  BFD_RELOC_PPC_GOT_TLSLD16,
+  BFD_RELOC_PPC_GOT_TLSLD16_LO,
+  BFD_RELOC_PPC_GOT_TLSLD16_HI,
+  BFD_RELOC_PPC_GOT_TLSLD16_HA,
+  BFD_RELOC_PPC_GOT_TPREL16,
+  BFD_RELOC_PPC_GOT_TPREL16_LO,
+  BFD_RELOC_PPC_GOT_TPREL16_HI,
+  BFD_RELOC_PPC_GOT_TPREL16_HA,
+  BFD_RELOC_PPC_GOT_DTPREL16,
+  BFD_RELOC_PPC_GOT_DTPREL16_LO,
+  BFD_RELOC_PPC_GOT_DTPREL16_HI,
+  BFD_RELOC_PPC_GOT_DTPREL16_HA,
+  BFD_RELOC_PPC64_TPREL16_DS,
+  BFD_RELOC_PPC64_TPREL16_LO_DS,
+  BFD_RELOC_PPC64_TPREL16_HIGHER,
+  BFD_RELOC_PPC64_TPREL16_HIGHERA,
+  BFD_RELOC_PPC64_TPREL16_HIGHEST,
+  BFD_RELOC_PPC64_TPREL16_HIGHESTA,
+  BFD_RELOC_PPC64_DTPREL16_DS,
+  BFD_RELOC_PPC64_DTPREL16_LO_DS,
+  BFD_RELOC_PPC64_DTPREL16_HIGHER,
+  BFD_RELOC_PPC64_DTPREL16_HIGHERA,
+  BFD_RELOC_PPC64_DTPREL16_HIGHEST,
+  BFD_RELOC_PPC64_DTPREL16_HIGHESTA,
+  BFD_RELOC_PPC64_TPREL16_HIGH,
+  BFD_RELOC_PPC64_TPREL16_HIGHA,
+  BFD_RELOC_PPC64_DTPREL16_HIGH,
+  BFD_RELOC_PPC64_DTPREL16_HIGHA,
+
+/* IBM 370/390 relocations  */
+  BFD_RELOC_I370_D12,
+
+/* The type of reloc used to build a constructor table - at the moment
+probably a 32 bit wide absolute relocation, but the target can choose.
+It generally does map to one of the other relocation types.  */
+  BFD_RELOC_CTOR,
+
+/* ARM 26 bit pc-relative branch.  The lowest two bits must be zero and are
+not stored in the instruction.  */
+  BFD_RELOC_ARM_PCREL_BRANCH,
+
+/* ARM 26 bit pc-relative branch.  The lowest bit must be zero and is
+not stored in the instruction.  The 2nd lowest bit comes from a 1 bit
+field in the instruction.  */
+  BFD_RELOC_ARM_PCREL_BLX,
+
+/* Thumb 22 bit pc-relative branch.  The lowest bit must be zero and is
+not stored in the instruction.  The 2nd lowest bit comes from a 1 bit
+field in the instruction.  */
+  BFD_RELOC_THUMB_PCREL_BLX,
+
+/* ARM 26-bit pc-relative branch for an unconditional BL or BLX instruction.  */
+  BFD_RELOC_ARM_PCREL_CALL,
+
+/* ARM 26-bit pc-relative branch for B or conditional BL instruction.  */
+  BFD_RELOC_ARM_PCREL_JUMP,
+
+/* Thumb 7-, 9-, 12-, 20-, 23-, and 25-bit pc-relative branches.
+The lowest bit must be zero and is not stored in the instruction.
+Note that the corresponding ELF R_ARM_THM_JUMPnn constant has an
+"nn" one smaller in all cases.  Note further that BRANCH23
+corresponds to R_ARM_THM_CALL.  */
+  BFD_RELOC_THUMB_PCREL_BRANCH7,
+  BFD_RELOC_THUMB_PCREL_BRANCH9,
+  BFD_RELOC_THUMB_PCREL_BRANCH12,
+  BFD_RELOC_THUMB_PCREL_BRANCH20,
+  BFD_RELOC_THUMB_PCREL_BRANCH23,
+  BFD_RELOC_THUMB_PCREL_BRANCH25,
+
+/* 12-bit immediate offset, used in ARM-format ldr and str instructions.  */
+  BFD_RELOC_ARM_OFFSET_IMM,
+
+/* 5-bit immediate offset, used in Thumb-format ldr and str instructions.  */
+  BFD_RELOC_ARM_THUMB_OFFSET,
+
+/* Pc-relative or absolute relocation depending on target.  Used for
+entries in .init_array sections.  */
+  BFD_RELOC_ARM_TARGET1,
+
+/* Read-only segment base relative address.  */
+  BFD_RELOC_ARM_ROSEGREL32,
+
+/* Data segment base relative address.  */
+  BFD_RELOC_ARM_SBREL32,
+
+/* This reloc is used for references to RTTI data from exception handling
+tables.  The actual definition depends on the target.  It may be a
+pc-relative or some form of GOT-indirect relocation.  */
+  BFD_RELOC_ARM_TARGET2,
+
+/* 31-bit PC relative address.  */
+  BFD_RELOC_ARM_PREL31,
+
+/* Low and High halfword relocations for MOVW and MOVT instructions.  */
+  BFD_RELOC_ARM_MOVW,
+  BFD_RELOC_ARM_MOVT,
+  BFD_RELOC_ARM_MOVW_PCREL,
+  BFD_RELOC_ARM_MOVT_PCREL,
+  BFD_RELOC_ARM_THUMB_MOVW,
+  BFD_RELOC_ARM_THUMB_MOVT,
+  BFD_RELOC_ARM_THUMB_MOVW_PCREL,
+  BFD_RELOC_ARM_THUMB_MOVT_PCREL,
+
+/* Relocations for setting up GOTs and PLTs for shared libraries.  */
+  BFD_RELOC_ARM_JUMP_SLOT,
+  BFD_RELOC_ARM_GLOB_DAT,
+  BFD_RELOC_ARM_GOT32,
+  BFD_RELOC_ARM_PLT32,
+  BFD_RELOC_ARM_RELATIVE,
+  BFD_RELOC_ARM_GOTOFF,
+  BFD_RELOC_ARM_GOTPC,
+  BFD_RELOC_ARM_GOT_PREL,
+
+/* ARM thread-local storage relocations.  */
+  BFD_RELOC_ARM_TLS_GD32,
+  BFD_RELOC_ARM_TLS_LDO32,
+  BFD_RELOC_ARM_TLS_LDM32,
+  BFD_RELOC_ARM_TLS_DTPOFF32,
+  BFD_RELOC_ARM_TLS_DTPMOD32,
+  BFD_RELOC_ARM_TLS_TPOFF32,
+  BFD_RELOC_ARM_TLS_IE32,
+  BFD_RELOC_ARM_TLS_LE32,
+  BFD_RELOC_ARM_TLS_GOTDESC,
+  BFD_RELOC_ARM_TLS_CALL,
+  BFD_RELOC_ARM_THM_TLS_CALL,
+  BFD_RELOC_ARM_TLS_DESCSEQ,
+  BFD_RELOC_ARM_THM_TLS_DESCSEQ,
+  BFD_RELOC_ARM_TLS_DESC,
+
+/* ARM group relocations.  */
+  BFD_RELOC_ARM_ALU_PC_G0_NC,
+  BFD_RELOC_ARM_ALU_PC_G0,
+  BFD_RELOC_ARM_ALU_PC_G1_NC,
+  BFD_RELOC_ARM_ALU_PC_G1,
+  BFD_RELOC_ARM_ALU_PC_G2,
+  BFD_RELOC_ARM_LDR_PC_G0,
+  BFD_RELOC_ARM_LDR_PC_G1,
+  BFD_RELOC_ARM_LDR_PC_G2,
+  BFD_RELOC_ARM_LDRS_PC_G0,
+  BFD_RELOC_ARM_LDRS_PC_G1,
+  BFD_RELOC_ARM_LDRS_PC_G2,
+  BFD_RELOC_ARM_LDC_PC_G0,
+  BFD_RELOC_ARM_LDC_PC_G1,
+  BFD_RELOC_ARM_LDC_PC_G2,
+  BFD_RELOC_ARM_ALU_SB_G0_NC,
+  BFD_RELOC_ARM_ALU_SB_G0,
+  BFD_RELOC_ARM_ALU_SB_G1_NC,
+  BFD_RELOC_ARM_ALU_SB_G1,
+  BFD_RELOC_ARM_ALU_SB_G2,
+  BFD_RELOC_ARM_LDR_SB_G0,
+  BFD_RELOC_ARM_LDR_SB_G1,
+  BFD_RELOC_ARM_LDR_SB_G2,
+  BFD_RELOC_ARM_LDRS_SB_G0,
+  BFD_RELOC_ARM_LDRS_SB_G1,
+  BFD_RELOC_ARM_LDRS_SB_G2,
+  BFD_RELOC_ARM_LDC_SB_G0,
+  BFD_RELOC_ARM_LDC_SB_G1,
+  BFD_RELOC_ARM_LDC_SB_G2,
+
+/* Annotation of BX instructions.  */
+  BFD_RELOC_ARM_V4BX,
+
+/* ARM support for STT_GNU_IFUNC.  */
+  BFD_RELOC_ARM_IRELATIVE,
+
+/* Thumb1 relocations to support execute-only code.  */
+  BFD_RELOC_ARM_THUMB_ALU_ABS_G0_NC,
+  BFD_RELOC_ARM_THUMB_ALU_ABS_G1_NC,
+  BFD_RELOC_ARM_THUMB_ALU_ABS_G2_NC,
+  BFD_RELOC_ARM_THUMB_ALU_ABS_G3_NC,
+
+/* These relocs are only used within the ARM assembler.  They are not
+(at present) written to any object files.  */
+  BFD_RELOC_ARM_IMMEDIATE,
+  BFD_RELOC_ARM_ADRL_IMMEDIATE,
+  BFD_RELOC_ARM_T32_IMMEDIATE,
+  BFD_RELOC_ARM_T32_ADD_IMM,
+  BFD_RELOC_ARM_T32_IMM12,
+  BFD_RELOC_ARM_T32_ADD_PC12,
+  BFD_RELOC_ARM_SHIFT_IMM,
+  BFD_RELOC_ARM_SMC,
+  BFD_RELOC_ARM_HVC,
+  BFD_RELOC_ARM_SWI,
+  BFD_RELOC_ARM_MULTI,
+  BFD_RELOC_ARM_CP_OFF_IMM,
+  BFD_RELOC_ARM_CP_OFF_IMM_S2,
+  BFD_RELOC_ARM_T32_CP_OFF_IMM,
+  BFD_RELOC_ARM_T32_CP_OFF_IMM_S2,
+  BFD_RELOC_ARM_ADR_IMM,
+  BFD_RELOC_ARM_LDR_IMM,
+  BFD_RELOC_ARM_LITERAL,
+  BFD_RELOC_ARM_IN_POOL,
+  BFD_RELOC_ARM_OFFSET_IMM8,
+  BFD_RELOC_ARM_T32_OFFSET_U8,
+  BFD_RELOC_ARM_T32_OFFSET_IMM,
+  BFD_RELOC_ARM_HWLITERAL,
+  BFD_RELOC_ARM_THUMB_ADD,
+  BFD_RELOC_ARM_THUMB_IMM,
+  BFD_RELOC_ARM_THUMB_SHIFT,
+
+/* Renesas / SuperH SH relocs.  Not all of these appear in object files.  */
+  BFD_RELOC_SH_PCDISP8BY2,
+  BFD_RELOC_SH_PCDISP12BY2,
+  BFD_RELOC_SH_IMM3,
+  BFD_RELOC_SH_IMM3U,
+  BFD_RELOC_SH_DISP12,
+  BFD_RELOC_SH_DISP12BY2,
+  BFD_RELOC_SH_DISP12BY4,
+  BFD_RELOC_SH_DISP12BY8,
+  BFD_RELOC_SH_DISP20,
+  BFD_RELOC_SH_DISP20BY8,
+  BFD_RELOC_SH_IMM4,
+  BFD_RELOC_SH_IMM4BY2,
+  BFD_RELOC_SH_IMM4BY4,
+  BFD_RELOC_SH_IMM8,
+  BFD_RELOC_SH_IMM8BY2,
+  BFD_RELOC_SH_IMM8BY4,
+  BFD_RELOC_SH_PCRELIMM8BY2,
+  BFD_RELOC_SH_PCRELIMM8BY4,
+  BFD_RELOC_SH_SWITCH16,
+  BFD_RELOC_SH_SWITCH32,
+  BFD_RELOC_SH_USES,
+  BFD_RELOC_SH_COUNT,
+  BFD_RELOC_SH_ALIGN,
+  BFD_RELOC_SH_CODE,
+  BFD_RELOC_SH_DATA,
+  BFD_RELOC_SH_LABEL,
+  BFD_RELOC_SH_LOOP_START,
+  BFD_RELOC_SH_LOOP_END,
+  BFD_RELOC_SH_COPY,
+  BFD_RELOC_SH_GLOB_DAT,
+  BFD_RELOC_SH_JMP_SLOT,
+  BFD_RELOC_SH_RELATIVE,
+  BFD_RELOC_SH_GOTPC,
+  BFD_RELOC_SH_GOT_LOW16,
+  BFD_RELOC_SH_GOT_MEDLOW16,
+  BFD_RELOC_SH_GOT_MEDHI16,
+  BFD_RELOC_SH_GOT_HI16,
+  BFD_RELOC_SH_GOTPLT_LOW16,
+  BFD_RELOC_SH_GOTPLT_MEDLOW16,
+  BFD_RELOC_SH_GOTPLT_MEDHI16,
+  BFD_RELOC_SH_GOTPLT_HI16,
+  BFD_RELOC_SH_PLT_LOW16,
+  BFD_RELOC_SH_PLT_MEDLOW16,
+  BFD_RELOC_SH_PLT_MEDHI16,
+  BFD_RELOC_SH_PLT_HI16,
+  BFD_RELOC_SH_GOTOFF_LOW16,
+  BFD_RELOC_SH_GOTOFF_MEDLOW16,
+  BFD_RELOC_SH_GOTOFF_MEDHI16,
+  BFD_RELOC_SH_GOTOFF_HI16,
+  BFD_RELOC_SH_GOTPC_LOW16,
+  BFD_RELOC_SH_GOTPC_MEDLOW16,
+  BFD_RELOC_SH_GOTPC_MEDHI16,
+  BFD_RELOC_SH_GOTPC_HI16,
+  BFD_RELOC_SH_COPY64,
+  BFD_RELOC_SH_GLOB_DAT64,
+  BFD_RELOC_SH_JMP_SLOT64,
+  BFD_RELOC_SH_RELATIVE64,
+  BFD_RELOC_SH_GOT10BY4,
+  BFD_RELOC_SH_GOT10BY8,
+  BFD_RELOC_SH_GOTPLT10BY4,
+  BFD_RELOC_SH_GOTPLT10BY8,
+  BFD_RELOC_SH_GOTPLT32,
+  BFD_RELOC_SH_SHMEDIA_CODE,
+  BFD_RELOC_SH_IMMU5,
+  BFD_RELOC_SH_IMMS6,
+  BFD_RELOC_SH_IMMS6BY32,
+  BFD_RELOC_SH_IMMU6,
+  BFD_RELOC_SH_IMMS10,
+  BFD_RELOC_SH_IMMS10BY2,
+  BFD_RELOC_SH_IMMS10BY4,
+  BFD_RELOC_SH_IMMS10BY8,
+  BFD_RELOC_SH_IMMS16,
+  BFD_RELOC_SH_IMMU16,
+  BFD_RELOC_SH_IMM_LOW16,
+  BFD_RELOC_SH_IMM_LOW16_PCREL,
+  BFD_RELOC_SH_IMM_MEDLOW16,
+  BFD_RELOC_SH_IMM_MEDLOW16_PCREL,
+  BFD_RELOC_SH_IMM_MEDHI16,
+  BFD_RELOC_SH_IMM_MEDHI16_PCREL,
+  BFD_RELOC_SH_IMM_HI16,
+  BFD_RELOC_SH_IMM_HI16_PCREL,
+  BFD_RELOC_SH_PT_16,
+  BFD_RELOC_SH_TLS_GD_32,
+  BFD_RELOC_SH_TLS_LD_32,
+  BFD_RELOC_SH_TLS_LDO_32,
+  BFD_RELOC_SH_TLS_IE_32,
+  BFD_RELOC_SH_TLS_LE_32,
+  BFD_RELOC_SH_TLS_DTPMOD32,
+  BFD_RELOC_SH_TLS_DTPOFF32,
+  BFD_RELOC_SH_TLS_TPOFF32,
+  BFD_RELOC_SH_GOT20,
+  BFD_RELOC_SH_GOTOFF20,
+  BFD_RELOC_SH_GOTFUNCDESC,
+  BFD_RELOC_SH_GOTFUNCDESC20,
+  BFD_RELOC_SH_GOTOFFFUNCDESC,
+  BFD_RELOC_SH_GOTOFFFUNCDESC20,
+  BFD_RELOC_SH_FUNCDESC,
+
+/* ARC relocs.  */
+  BFD_RELOC_ARC_NONE,
+  BFD_RELOC_ARC_8,
+  BFD_RELOC_ARC_16,
+  BFD_RELOC_ARC_24,
+  BFD_RELOC_ARC_32,
+  BFD_RELOC_ARC_N8,
+  BFD_RELOC_ARC_N16,
+  BFD_RELOC_ARC_N24,
+  BFD_RELOC_ARC_N32,
+  BFD_RELOC_ARC_SDA,
+  BFD_RELOC_ARC_SECTOFF,
+  BFD_RELOC_ARC_S21H_PCREL,
+  BFD_RELOC_ARC_S21W_PCREL,
+  BFD_RELOC_ARC_S25H_PCREL,
+  BFD_RELOC_ARC_S25W_PCREL,
+  BFD_RELOC_ARC_SDA32,
+  BFD_RELOC_ARC_SDA_LDST,
+  BFD_RELOC_ARC_SDA_LDST1,
+  BFD_RELOC_ARC_SDA_LDST2,
+  BFD_RELOC_ARC_SDA16_LD,
+  BFD_RELOC_ARC_SDA16_LD1,
+  BFD_RELOC_ARC_SDA16_LD2,
+  BFD_RELOC_ARC_S13_PCREL,
+  BFD_RELOC_ARC_W,
+  BFD_RELOC_ARC_32_ME,
+  BFD_RELOC_ARC_32_ME_S,
+  BFD_RELOC_ARC_N32_ME,
+  BFD_RELOC_ARC_SECTOFF_ME,
+  BFD_RELOC_ARC_SDA32_ME,
+  BFD_RELOC_ARC_W_ME,
+  BFD_RELOC_AC_SECTOFF_U8,
+  BFD_RELOC_AC_SECTOFF_U8_1,
+  BFD_RELOC_AC_SECTOFF_U8_2,
+  BFD_RELOC_AC_SECTOFF_S9,
+  BFD_RELOC_AC_SECTOFF_S9_1,
+  BFD_RELOC_AC_SECTOFF_S9_2,
+  BFD_RELOC_ARC_SECTOFF_ME_1,
+  BFD_RELOC_ARC_SECTOFF_ME_2,
+  BFD_RELOC_ARC_SECTOFF_1,
+  BFD_RELOC_ARC_SECTOFF_2,
+  BFD_RELOC_ARC_SDA_12,
+  BFD_RELOC_ARC_SDA16_ST2,
+  BFD_RELOC_ARC_32_PCREL,
+  BFD_RELOC_ARC_PC32,
+  BFD_RELOC_ARC_GOT32,
+  BFD_RELOC_ARC_GOTPC32,
+  BFD_RELOC_ARC_PLT32,
+  BFD_RELOC_ARC_COPY,
+  BFD_RELOC_ARC_GLOB_DAT,
+  BFD_RELOC_ARC_JMP_SLOT,
+  BFD_RELOC_ARC_RELATIVE,
+  BFD_RELOC_ARC_GOTOFF,
+  BFD_RELOC_ARC_GOTPC,
+  BFD_RELOC_ARC_S21W_PCREL_PLT,
+  BFD_RELOC_ARC_S25H_PCREL_PLT,
+  BFD_RELOC_ARC_TLS_DTPMOD,
+  BFD_RELOC_ARC_TLS_TPOFF,
+  BFD_RELOC_ARC_TLS_GD_GOT,
+  BFD_RELOC_ARC_TLS_GD_LD,
+  BFD_RELOC_ARC_TLS_GD_CALL,
+  BFD_RELOC_ARC_TLS_IE_GOT,
+  BFD_RELOC_ARC_TLS_DTPOFF,
+  BFD_RELOC_ARC_TLS_DTPOFF_S9,
+  BFD_RELOC_ARC_TLS_LE_S9,
+  BFD_RELOC_ARC_TLS_LE_32,
+  BFD_RELOC_ARC_S25W_PCREL_PLT,
+  BFD_RELOC_ARC_S21H_PCREL_PLT,
+  BFD_RELOC_ARC_NPS_CMEM16,
+
+/* ADI Blackfin 16 bit immediate absolute reloc.  */
+  BFD_RELOC_BFIN_16_IMM,
+
+/* ADI Blackfin 16 bit immediate absolute reloc higher 16 bits.  */
+  BFD_RELOC_BFIN_16_HIGH,
+
+/* ADI Blackfin 'a' part of LSETUP.  */
+  BFD_RELOC_BFIN_4_PCREL,
+
+/* ADI Blackfin.  */
+  BFD_RELOC_BFIN_5_PCREL,
+
+/* ADI Blackfin 16 bit immediate absolute reloc lower 16 bits.  */
+  BFD_RELOC_BFIN_16_LOW,
+
+/* ADI Blackfin.  */
+  BFD_RELOC_BFIN_10_PCREL,
+
+/* ADI Blackfin 'b' part of LSETUP.  */
+  BFD_RELOC_BFIN_11_PCREL,
+
+/* ADI Blackfin.  */
+  BFD_RELOC_BFIN_12_PCREL_JUMP,
+
+/* ADI Blackfin Short jump, pcrel.  */
+  BFD_RELOC_BFIN_12_PCREL_JUMP_S,
+
+/* ADI Blackfin Call.x not implemented.  */
+  BFD_RELOC_BFIN_24_PCREL_CALL_X,
+
+/* ADI Blackfin Long Jump pcrel.  */
+  BFD_RELOC_BFIN_24_PCREL_JUMP_L,
+
+/* ADI Blackfin FD-PIC relocations.  */
+  BFD_RELOC_BFIN_GOT17M4,
+  BFD_RELOC_BFIN_GOTHI,
+  BFD_RELOC_BFIN_GOTLO,
+  BFD_RELOC_BFIN_FUNCDESC,
+  BFD_RELOC_BFIN_FUNCDESC_GOT17M4,
+  BFD_RELOC_BFIN_FUNCDESC_GOTHI,
+  BFD_RELOC_BFIN_FUNCDESC_GOTLO,
+  BFD_RELOC_BFIN_FUNCDESC_VALUE,
+  BFD_RELOC_BFIN_FUNCDESC_GOTOFF17M4,
+  BFD_RELOC_BFIN_FUNCDESC_GOTOFFHI,
+  BFD_RELOC_BFIN_FUNCDESC_GOTOFFLO,
+  BFD_RELOC_BFIN_GOTOFF17M4,
+  BFD_RELOC_BFIN_GOTOFFHI,
+  BFD_RELOC_BFIN_GOTOFFLO,
+
+/* ADI Blackfin GOT relocation.  */
+  BFD_RELOC_BFIN_GOT,
+
+/* ADI Blackfin PLTPC relocation.  */
+  BFD_RELOC_BFIN_PLTPC,
+
+/* ADI Blackfin arithmetic relocation.  */
+  BFD_ARELOC_BFIN_PUSH,
+
+/* ADI Blackfin arithmetic relocation.  */
+  BFD_ARELOC_BFIN_CONST,
+
+/* ADI Blackfin arithmetic relocation.  */
+  BFD_ARELOC_BFIN_ADD,
+
+/* ADI Blackfin arithmetic relocation.  */
+  BFD_ARELOC_BFIN_SUB,
+
+/* ADI Blackfin arithmetic relocation.  */
+  BFD_ARELOC_BFIN_MULT,
+
+/* ADI Blackfin arithmetic relocation.  */
+  BFD_ARELOC_BFIN_DIV,
+
+/* ADI Blackfin arithmetic relocation.  */
+  BFD_ARELOC_BFIN_MOD,
+
+/* ADI Blackfin arithmetic relocation.  */
+  BFD_ARELOC_BFIN_LSHIFT,
+
+/* ADI Blackfin arithmetic relocation.  */
+  BFD_ARELOC_BFIN_RSHIFT,
+
+/* ADI Blackfin arithmetic relocation.  */
+  BFD_ARELOC_BFIN_AND,
+
+/* ADI Blackfin arithmetic relocation.  */
+  BFD_ARELOC_BFIN_OR,
+
+/* ADI Blackfin arithmetic relocation.  */
+  BFD_ARELOC_BFIN_XOR,
+
+/* ADI Blackfin arithmetic relocation.  */
+  BFD_ARELOC_BFIN_LAND,
+
+/* ADI Blackfin arithmetic relocation.  */
+  BFD_ARELOC_BFIN_LOR,
+
+/* ADI Blackfin arithmetic relocation.  */
+  BFD_ARELOC_BFIN_LEN,
+
+/* ADI Blackfin arithmetic relocation.  */
+  BFD_ARELOC_BFIN_NEG,
+
+/* ADI Blackfin arithmetic relocation.  */
+  BFD_ARELOC_BFIN_COMP,
+
+/* ADI Blackfin arithmetic relocation.  */
+  BFD_ARELOC_BFIN_PAGE,
+
+/* ADI Blackfin arithmetic relocation.  */
+  BFD_ARELOC_BFIN_HWPAGE,
+
+/* ADI Blackfin arithmetic relocation.  */
+  BFD_ARELOC_BFIN_ADDR,
+
+/* Mitsubishi D10V relocs.
+This is a 10-bit reloc with the right 2 bits
+assumed to be 0.  */
+  BFD_RELOC_D10V_10_PCREL_R,
+
+/* Mitsubishi D10V relocs.
+This is a 10-bit reloc with the right 2 bits
+assumed to be 0.  This is the same as the previous reloc
+except it is in the left container, i.e.,
+shifted left 15 bits.  */
+  BFD_RELOC_D10V_10_PCREL_L,
+
+/* This is an 18-bit reloc with the right 2 bits
+assumed to be 0.  */
+  BFD_RELOC_D10V_18,
+
+/* This is an 18-bit reloc with the right 2 bits
+assumed to be 0.  */
+  BFD_RELOC_D10V_18_PCREL,
+
+/* Mitsubishi D30V relocs.
+This is a 6-bit absolute reloc.  */
+  BFD_RELOC_D30V_6,
+
+/* This is a 6-bit pc-relative reloc with
+the right 3 bits assumed to be 0.  */
+  BFD_RELOC_D30V_9_PCREL,
+
+/* This is a 6-bit pc-relative reloc with
+the right 3 bits assumed to be 0. Same
+as the previous reloc but on the right side
+of the container.  */
+  BFD_RELOC_D30V_9_PCREL_R,
+
+/* This is a 12-bit absolute reloc with the
+right 3 bits assumed to be 0.  */
+  BFD_RELOC_D30V_15,
+
+/* This is a 12-bit pc-relative reloc with
+the right 3 bits assumed to be 0.  */
+  BFD_RELOC_D30V_15_PCREL,
+
+/* This is a 12-bit pc-relative reloc with
+the right 3 bits assumed to be 0. Same
+as the previous reloc but on the right side
+of the container.  */
+  BFD_RELOC_D30V_15_PCREL_R,
+
+/* This is an 18-bit absolute reloc with
+the right 3 bits assumed to be 0.  */
+  BFD_RELOC_D30V_21,
+
+/* This is an 18-bit pc-relative reloc with
+the right 3 bits assumed to be 0.  */
+  BFD_RELOC_D30V_21_PCREL,
+
+/* This is an 18-bit pc-relative reloc with
+the right 3 bits assumed to be 0. Same
+as the previous reloc but on the right side
+of the container.  */
+  BFD_RELOC_D30V_21_PCREL_R,
+
+/* This is a 32-bit absolute reloc.  */
+  BFD_RELOC_D30V_32,
+
+/* This is a 32-bit pc-relative reloc.  */
+  BFD_RELOC_D30V_32_PCREL,
+
+/* DLX relocs  */
+  BFD_RELOC_DLX_HI16_S,
+
+/* DLX relocs  */
+  BFD_RELOC_DLX_LO16,
+
+/* DLX relocs  */
+  BFD_RELOC_DLX_JMP26,
+
+/* Renesas M16C/M32C Relocations.  */
+  BFD_RELOC_M32C_HI8,
+  BFD_RELOC_M32C_RL_JUMP,
+  BFD_RELOC_M32C_RL_1ADDR,
+  BFD_RELOC_M32C_RL_2ADDR,
+
+/* Renesas M32R (formerly Mitsubishi M32R) relocs.
+This is a 24 bit absolute address.  */
+  BFD_RELOC_M32R_24,
+
+/* This is a 10-bit pc-relative reloc with the right 2 bits assumed to be 0.  */
+  BFD_RELOC_M32R_10_PCREL,
+
+/* This is an 18-bit reloc with the right 2 bits assumed to be 0.  */
+  BFD_RELOC_M32R_18_PCREL,
+
+/* This is a 26-bit reloc with the right 2 bits assumed to be 0.  */
+  BFD_RELOC_M32R_26_PCREL,
+
+/* This is a 16-bit reloc containing the high 16 bits of an address
+used when the lower 16 bits are treated as unsigned.  */
+  BFD_RELOC_M32R_HI16_ULO,
+
+/* This is a 16-bit reloc containing the high 16 bits of an address
+used when the lower 16 bits are treated as signed.  */
+  BFD_RELOC_M32R_HI16_SLO,
+
+/* This is a 16-bit reloc containing the lower 16 bits of an address.  */
+  BFD_RELOC_M32R_LO16,
+
+/* This is a 16-bit reloc containing the small data area offset for use in
+add3, load, and store instructions.  */
+  BFD_RELOC_M32R_SDA16,
+
+/* For PIC.  */
+  BFD_RELOC_M32R_GOT24,
+  BFD_RELOC_M32R_26_PLTREL,
+  BFD_RELOC_M32R_COPY,
+  BFD_RELOC_M32R_GLOB_DAT,
+  BFD_RELOC_M32R_JMP_SLOT,
+  BFD_RELOC_M32R_RELATIVE,
+  BFD_RELOC_M32R_GOTOFF,
+  BFD_RELOC_M32R_GOTOFF_HI_ULO,
+  BFD_RELOC_M32R_GOTOFF_HI_SLO,
+  BFD_RELOC_M32R_GOTOFF_LO,
+  BFD_RELOC_M32R_GOTPC24,
+  BFD_RELOC_M32R_GOT16_HI_ULO,
+  BFD_RELOC_M32R_GOT16_HI_SLO,
+  BFD_RELOC_M32R_GOT16_LO,
+  BFD_RELOC_M32R_GOTPC_HI_ULO,
+  BFD_RELOC_M32R_GOTPC_HI_SLO,
+  BFD_RELOC_M32R_GOTPC_LO,
+
+/* NDS32 relocs.
+This is a 20 bit absolute address.  */
+  BFD_RELOC_NDS32_20,
+
+/* This is a 9-bit pc-relative reloc with the right 1 bit assumed to be 0.  */
+  BFD_RELOC_NDS32_9_PCREL,
+
+/* This is a 9-bit pc-relative reloc with the right 1 bit assumed to be 0.  */
+  BFD_RELOC_NDS32_WORD_9_PCREL,
+
+/* This is a 15-bit reloc with the right 1 bit assumed to be 0.  */
+  BFD_RELOC_NDS32_15_PCREL,
+
+/* This is a 17-bit reloc with the right 1 bit assumed to be 0.  */
+  BFD_RELOC_NDS32_17_PCREL,
+
+/* This is a 25-bit reloc with the right 1 bit assumed to be 0.  */
+  BFD_RELOC_NDS32_25_PCREL,
+
+/* This is a 20-bit reloc containing the high 20 bits of an address
+used with the lower 12 bits  */
+  BFD_RELOC_NDS32_HI20,
+
+/* This is a 12-bit reloc containing the lower 12 bits of an address
+then shift right by 3. This is used with ldi,sdi...  */
+  BFD_RELOC_NDS32_LO12S3,
+
+/* This is a 12-bit reloc containing the lower 12 bits of an address
+then shift left by 2. This is used with lwi,swi...  */
+  BFD_RELOC_NDS32_LO12S2,
+
+/* This is a 12-bit reloc containing the lower 12 bits of an address
+then shift left by 1. This is used with lhi,shi...  */
+  BFD_RELOC_NDS32_LO12S1,
+
+/* This is a 12-bit reloc containing the lower 12 bits of an address
+then shift left by 0. This is used with lbi,sbi...  */
+  BFD_RELOC_NDS32_LO12S0,
+
+/* This is a 12-bit reloc containing the lower 12 bits of an address
+then shift left by 0. This is only used with branch relaxations  */
+  BFD_RELOC_NDS32_LO12S0_ORI,
+
+/* This is a 15-bit reloc containing the small data area 18-bit signed offset
+and shift left by 3 for use in ldi, sdi...  */
+  BFD_RELOC_NDS32_SDA15S3,
+
+/* This is a 15-bit reloc containing the small data area 17-bit signed offset
+and shift left by 2 for use in lwi, swi...  */
+  BFD_RELOC_NDS32_SDA15S2,
+
+/* This is a 15-bit reloc containing the small data area 16-bit signed offset
+and shift left by 1 for use in lhi, shi...  */
+  BFD_RELOC_NDS32_SDA15S1,
+
+/* This is a 15-bit reloc containing the small data area 15-bit signed offset
+and shift left by 0 for use in lbi, sbi...  */
+  BFD_RELOC_NDS32_SDA15S0,
+
+/* This is a 16-bit reloc containing the small data area 16-bit signed offset
+and shift left by 3  */
+  BFD_RELOC_NDS32_SDA16S3,
+
+/* This is a 17-bit reloc containing the small data area 17-bit signed offset
+and shift left by 2 for use in lwi.gp, swi.gp...  */
+  BFD_RELOC_NDS32_SDA17S2,
+
+/* This is an 18-bit reloc containing the small data area 18-bit signed offset
+and shift left by 1 for use in lhi.gp, shi.gp...  */
+  BFD_RELOC_NDS32_SDA18S1,
+
+/* This is a 19-bit reloc containing the small data area 19-bit signed offset
+and shift left by 0 for use in lbi.gp, sbi.gp...  */
+  BFD_RELOC_NDS32_SDA19S0,
+
+/* for PIC  */
+  BFD_RELOC_NDS32_GOT20,
+  BFD_RELOC_NDS32_9_PLTREL,
+  BFD_RELOC_NDS32_25_PLTREL,
+  BFD_RELOC_NDS32_COPY,
+  BFD_RELOC_NDS32_GLOB_DAT,
+  BFD_RELOC_NDS32_JMP_SLOT,
+  BFD_RELOC_NDS32_RELATIVE,
+  BFD_RELOC_NDS32_GOTOFF,
+  BFD_RELOC_NDS32_GOTOFF_HI20,
+  BFD_RELOC_NDS32_GOTOFF_LO12,
+  BFD_RELOC_NDS32_GOTPC20,
+  BFD_RELOC_NDS32_GOT_HI20,
+  BFD_RELOC_NDS32_GOT_LO12,
+  BFD_RELOC_NDS32_GOTPC_HI20,
+  BFD_RELOC_NDS32_GOTPC_LO12,
+
+/* for relax  */
+  BFD_RELOC_NDS32_INSN16,
+  BFD_RELOC_NDS32_LABEL,
+  BFD_RELOC_NDS32_LONGCALL1,
+  BFD_RELOC_NDS32_LONGCALL2,
+  BFD_RELOC_NDS32_LONGCALL3,
+  BFD_RELOC_NDS32_LONGJUMP1,
+  BFD_RELOC_NDS32_LONGJUMP2,
+  BFD_RELOC_NDS32_LONGJUMP3,
+  BFD_RELOC_NDS32_LOADSTORE,
+  BFD_RELOC_NDS32_9_FIXED,
+  BFD_RELOC_NDS32_15_FIXED,
+  BFD_RELOC_NDS32_17_FIXED,
+  BFD_RELOC_NDS32_25_FIXED,
+  BFD_RELOC_NDS32_LONGCALL4,
+  BFD_RELOC_NDS32_LONGCALL5,
+  BFD_RELOC_NDS32_LONGCALL6,
+  BFD_RELOC_NDS32_LONGJUMP4,
+  BFD_RELOC_NDS32_LONGJUMP5,
+  BFD_RELOC_NDS32_LONGJUMP6,
+  BFD_RELOC_NDS32_LONGJUMP7,
+
+/* for PIC  */
+  BFD_RELOC_NDS32_PLTREL_HI20,
+  BFD_RELOC_NDS32_PLTREL_LO12,
+  BFD_RELOC_NDS32_PLT_GOTREL_HI20,
+  BFD_RELOC_NDS32_PLT_GOTREL_LO12,
+
+/* for floating point  */
+  BFD_RELOC_NDS32_SDA12S2_DP,
+  BFD_RELOC_NDS32_SDA12S2_SP,
+  BFD_RELOC_NDS32_LO12S2_DP,
+  BFD_RELOC_NDS32_LO12S2_SP,
+
+/* for dwarf2 debug_line.  */
+  BFD_RELOC_NDS32_DWARF2_OP1,
+  BFD_RELOC_NDS32_DWARF2_OP2,
+  BFD_RELOC_NDS32_DWARF2_LEB,
+
+/* for eliminate 16-bit instructions  */
+  BFD_RELOC_NDS32_UPDATE_TA,
+
+/* for PIC object relaxation  */
+  BFD_RELOC_NDS32_PLT_GOTREL_LO20,
+  BFD_RELOC_NDS32_PLT_GOTREL_LO15,
+  BFD_RELOC_NDS32_PLT_GOTREL_LO19,
+  BFD_RELOC_NDS32_GOT_LO15,
+  BFD_RELOC_NDS32_GOT_LO19,
+  BFD_RELOC_NDS32_GOTOFF_LO15,
+  BFD_RELOC_NDS32_GOTOFF_LO19,
+  BFD_RELOC_NDS32_GOT15S2,
+  BFD_RELOC_NDS32_GOT17S2,
+
+/* NDS32 relocs.
+This is a 5 bit absolute address.  */
+  BFD_RELOC_NDS32_5,
+
+/* This is a 10-bit unsigned pc-relative reloc with the right 1 bit assumed to be 0.  */
+  BFD_RELOC_NDS32_10_UPCREL,
+
+/* If fp were omitted, fp can be used as another gp.  */
+  BFD_RELOC_NDS32_SDA_FP7U2_RELA,
+
+/* relaxation relative relocation types  */
+  BFD_RELOC_NDS32_RELAX_ENTRY,
+  BFD_RELOC_NDS32_GOT_SUFF,
+  BFD_RELOC_NDS32_GOTOFF_SUFF,
+  BFD_RELOC_NDS32_PLT_GOT_SUFF,
+  BFD_RELOC_NDS32_MULCALL_SUFF,
+  BFD_RELOC_NDS32_PTR,
+  BFD_RELOC_NDS32_PTR_COUNT,
+  BFD_RELOC_NDS32_PTR_RESOLVED,
+  BFD_RELOC_NDS32_PLTBLOCK,
+  BFD_RELOC_NDS32_RELAX_REGION_BEGIN,
+  BFD_RELOC_NDS32_RELAX_REGION_END,
+  BFD_RELOC_NDS32_MINUEND,
+  BFD_RELOC_NDS32_SUBTRAHEND,
+  BFD_RELOC_NDS32_DIFF8,
+  BFD_RELOC_NDS32_DIFF16,
+  BFD_RELOC_NDS32_DIFF32,
+  BFD_RELOC_NDS32_DIFF_ULEB128,
+  BFD_RELOC_NDS32_EMPTY,
+
+/* This is a 25 bit absolute address.  */
+  BFD_RELOC_NDS32_25_ABS,
+
+/* For ex9 and ifc using.  */
+  BFD_RELOC_NDS32_DATA,
+  BFD_RELOC_NDS32_TRAN,
+  BFD_RELOC_NDS32_17IFC_PCREL,
+  BFD_RELOC_NDS32_10IFCU_PCREL,
+
+/* For TLS.  */
+  BFD_RELOC_NDS32_TPOFF,
+  BFD_RELOC_NDS32_TLS_LE_HI20,
+  BFD_RELOC_NDS32_TLS_LE_LO12,
+  BFD_RELOC_NDS32_TLS_LE_ADD,
+  BFD_RELOC_NDS32_TLS_LE_LS,
+  BFD_RELOC_NDS32_GOTTPOFF,
+  BFD_RELOC_NDS32_TLS_IE_HI20,
+  BFD_RELOC_NDS32_TLS_IE_LO12S2,
+  BFD_RELOC_NDS32_TLS_TPOFF,
+  BFD_RELOC_NDS32_TLS_LE_20,
+  BFD_RELOC_NDS32_TLS_LE_15S0,
+  BFD_RELOC_NDS32_TLS_LE_15S1,
+  BFD_RELOC_NDS32_TLS_LE_15S2,
+
+/* This is a 9-bit reloc  */
+  BFD_RELOC_V850_9_PCREL,
+
+/* This is a 22-bit reloc  */
+  BFD_RELOC_V850_22_PCREL,
+
+/* This is a 16 bit offset from the short data area pointer.  */
+  BFD_RELOC_V850_SDA_16_16_OFFSET,
+
+/* This is a 16 bit offset (of which only 15 bits are used) from the
+short data area pointer.  */
+  BFD_RELOC_V850_SDA_15_16_OFFSET,
+
+/* This is a 16 bit offset from the zero data area pointer.  */
+  BFD_RELOC_V850_ZDA_16_16_OFFSET,
+
+/* This is a 16 bit offset (of which only 15 bits are used) from the
+zero data area pointer.  */
+  BFD_RELOC_V850_ZDA_15_16_OFFSET,
+
+/* This is an 8 bit offset (of which only 6 bits are used) from the
+tiny data area pointer.  */
+  BFD_RELOC_V850_TDA_6_8_OFFSET,
+
+/* This is an 8bit offset (of which only 7 bits are used) from the tiny
+data area pointer.  */
+  BFD_RELOC_V850_TDA_7_8_OFFSET,
+
+/* This is a 7 bit offset from the tiny data area pointer.  */
+  BFD_RELOC_V850_TDA_7_7_OFFSET,
+
+/* This is a 16 bit offset from the tiny data area pointer.  */
+  BFD_RELOC_V850_TDA_16_16_OFFSET,
+
+/* This is a 5 bit offset (of which only 4 bits are used) from the tiny
+data area pointer.  */
+  BFD_RELOC_V850_TDA_4_5_OFFSET,
+
+/* This is a 4 bit offset from the tiny data area pointer.  */
+  BFD_RELOC_V850_TDA_4_4_OFFSET,
+
+/* This is a 16 bit offset from the short data area pointer, with the
+bits placed non-contiguously in the instruction.  */
+  BFD_RELOC_V850_SDA_16_16_SPLIT_OFFSET,
+
+/* This is a 16 bit offset from the zero data area pointer, with the
+bits placed non-contiguously in the instruction.  */
+  BFD_RELOC_V850_ZDA_16_16_SPLIT_OFFSET,
+
+/* This is a 6 bit offset from the call table base pointer.  */
+  BFD_RELOC_V850_CALLT_6_7_OFFSET,
+
+/* This is a 16 bit offset from the call table base pointer.  */
+  BFD_RELOC_V850_CALLT_16_16_OFFSET,
+
+/* Used for relaxing indirect function calls.  */
+  BFD_RELOC_V850_LONGCALL,
+
+/* Used for relaxing indirect jumps.  */
+  BFD_RELOC_V850_LONGJUMP,
+
+/* Used to maintain alignment whilst relaxing.  */
+  BFD_RELOC_V850_ALIGN,
+
+/* This is a variation of BFD_RELOC_LO16 that can be used in v850e ld.bu
+instructions.  */
+  BFD_RELOC_V850_LO16_SPLIT_OFFSET,
+
+/* This is a 16-bit reloc.  */
+  BFD_RELOC_V850_16_PCREL,
+
+/* This is a 17-bit reloc.  */
+  BFD_RELOC_V850_17_PCREL,
+
+/* This is a 23-bit reloc.  */
+  BFD_RELOC_V850_23,
+
+/* This is a 32-bit reloc.  */
+  BFD_RELOC_V850_32_PCREL,
+
+/* This is a 32-bit reloc.  */
+  BFD_RELOC_V850_32_ABS,
+
+/* This is a 16-bit reloc.  */
+  BFD_RELOC_V850_16_SPLIT_OFFSET,
+
+/* This is a 16-bit reloc.  */
+  BFD_RELOC_V850_16_S1,
+
+/* Low 16 bits. 16 bit shifted by 1.  */
+  BFD_RELOC_V850_LO16_S1,
+
+/* This is a 16 bit offset from the call table base pointer.  */
+  BFD_RELOC_V850_CALLT_15_16_OFFSET,
+
+/* DSO relocations.  */
+  BFD_RELOC_V850_32_GOTPCREL,
+
+/* DSO relocations.  */
+  BFD_RELOC_V850_16_GOT,
+
+/* DSO relocations.  */
+  BFD_RELOC_V850_32_GOT,
+
+/* DSO relocations.  */
+  BFD_RELOC_V850_22_PLT_PCREL,
+
+/* DSO relocations.  */
+  BFD_RELOC_V850_32_PLT_PCREL,
+
+/* DSO relocations.  */
+  BFD_RELOC_V850_COPY,
+
+/* DSO relocations.  */
+  BFD_RELOC_V850_GLOB_DAT,
+
+/* DSO relocations.  */
+  BFD_RELOC_V850_JMP_SLOT,
+
+/* DSO relocations.  */
+  BFD_RELOC_V850_RELATIVE,
+
+/* DSO relocations.  */
+  BFD_RELOC_V850_16_GOTOFF,
+
+/* DSO relocations.  */
+  BFD_RELOC_V850_32_GOTOFF,
+
+/* start code.  */
+  BFD_RELOC_V850_CODE,
+
+/* start data in text.  */
+  BFD_RELOC_V850_DATA,
+
+/* This is an 8bit DP reloc for the tms320c30, where the most
+significant 8 bits of a 24 bit word are placed into the least
+significant 8 bits of the opcode.  */
+  BFD_RELOC_TIC30_LDP,
+
+/* This is a 7bit reloc for the tms320c54x, where the least
+significant 7 bits of a 16 bit word are placed into the least
+significant 7 bits of the opcode.  */
+  BFD_RELOC_TIC54X_PARTLS7,
+
+/* This is a 9bit DP reloc for the tms320c54x, where the most
+significant 9 bits of a 16 bit word are placed into the least
+significant 9 bits of the opcode.  */
+  BFD_RELOC_TIC54X_PARTMS9,
+
+/* This is an extended address 23-bit reloc for the tms320c54x.  */
+  BFD_RELOC_TIC54X_23,
+
+/* This is a 16-bit reloc for the tms320c54x, where the least
+significant 16 bits of a 23-bit extended address are placed into
+the opcode.  */
+  BFD_RELOC_TIC54X_16_OF_23,
+
+/* This is a reloc for the tms320c54x, where the most
+significant 7 bits of a 23-bit extended address are placed into
+the opcode.  */
+  BFD_RELOC_TIC54X_MS7_OF_23,
+
+/* TMS320C6000 relocations.  */
+  BFD_RELOC_C6000_PCR_S21,
+  BFD_RELOC_C6000_PCR_S12,
+  BFD_RELOC_C6000_PCR_S10,
+  BFD_RELOC_C6000_PCR_S7,
+  BFD_RELOC_C6000_ABS_S16,
+  BFD_RELOC_C6000_ABS_L16,
+  BFD_RELOC_C6000_ABS_H16,
+  BFD_RELOC_C6000_SBR_U15_B,
+  BFD_RELOC_C6000_SBR_U15_H,
+  BFD_RELOC_C6000_SBR_U15_W,
+  BFD_RELOC_C6000_SBR_S16,
+  BFD_RELOC_C6000_SBR_L16_B,
+  BFD_RELOC_C6000_SBR_L16_H,
+  BFD_RELOC_C6000_SBR_L16_W,
+  BFD_RELOC_C6000_SBR_H16_B,
+  BFD_RELOC_C6000_SBR_H16_H,
+  BFD_RELOC_C6000_SBR_H16_W,
+  BFD_RELOC_C6000_SBR_GOT_U15_W,
+  BFD_RELOC_C6000_SBR_GOT_L16_W,
+  BFD_RELOC_C6000_SBR_GOT_H16_W,
+  BFD_RELOC_C6000_DSBT_INDEX,
+  BFD_RELOC_C6000_PREL31,
+  BFD_RELOC_C6000_COPY,
+  BFD_RELOC_C6000_JUMP_SLOT,
+  BFD_RELOC_C6000_EHTYPE,
+  BFD_RELOC_C6000_PCR_H16,
+  BFD_RELOC_C6000_PCR_L16,
+  BFD_RELOC_C6000_ALIGN,
+  BFD_RELOC_C6000_FPHEAD,
+  BFD_RELOC_C6000_NOCMP,
+
+/* This is a 48 bit reloc for the FR30 that stores 32 bits.  */
+  BFD_RELOC_FR30_48,
+
+/* This is a 32 bit reloc for the FR30 that stores 20 bits split up into
+two sections.  */
+  BFD_RELOC_FR30_20,
+
+/* This is a 16 bit reloc for the FR30 that stores a 6 bit word offset in
+4 bits.  */
+  BFD_RELOC_FR30_6_IN_4,
+
+/* This is a 16 bit reloc for the FR30 that stores an 8 bit byte offset
+into 8 bits.  */
+  BFD_RELOC_FR30_8_IN_8,
+
+/* This is a 16 bit reloc for the FR30 that stores a 9 bit short offset
+into 8 bits.  */
+  BFD_RELOC_FR30_9_IN_8,
+
+/* This is a 16 bit reloc for the FR30 that stores a 10 bit word offset
+into 8 bits.  */
+  BFD_RELOC_FR30_10_IN_8,
+
+/* This is a 16 bit reloc for the FR30 that stores a 9 bit pc relative
+short offset into 8 bits.  */
+  BFD_RELOC_FR30_9_PCREL,
+
+/* This is a 16 bit reloc for the FR30 that stores a 12 bit pc relative
+short offset into 11 bits.  */
+  BFD_RELOC_FR30_12_PCREL,
+
+/* Motorola Mcore relocations.  */
+  BFD_RELOC_MCORE_PCREL_IMM8BY4,
+  BFD_RELOC_MCORE_PCREL_IMM11BY2,
+  BFD_RELOC_MCORE_PCREL_IMM4BY2,
+  BFD_RELOC_MCORE_PCREL_32,
+  BFD_RELOC_MCORE_PCREL_JSR_IMM11BY2,
+  BFD_RELOC_MCORE_RVA,
+
+/* Toshiba Media Processor Relocations.  */
+  BFD_RELOC_MEP_8,
+  BFD_RELOC_MEP_16,
+  BFD_RELOC_MEP_32,
+  BFD_RELOC_MEP_PCREL8A2,
+  BFD_RELOC_MEP_PCREL12A2,
+  BFD_RELOC_MEP_PCREL17A2,
+  BFD_RELOC_MEP_PCREL24A2,
+  BFD_RELOC_MEP_PCABS24A2,
+  BFD_RELOC_MEP_LOW16,
+  BFD_RELOC_MEP_HI16U,
+  BFD_RELOC_MEP_HI16S,
+  BFD_RELOC_MEP_GPREL,
+  BFD_RELOC_MEP_TPREL,
+  BFD_RELOC_MEP_TPREL7,
+  BFD_RELOC_MEP_TPREL7A2,
+  BFD_RELOC_MEP_TPREL7A4,
+  BFD_RELOC_MEP_UIMM24,
+  BFD_RELOC_MEP_ADDR24A4,
+  BFD_RELOC_MEP_GNU_VTINHERIT,
+  BFD_RELOC_MEP_GNU_VTENTRY,
+
+
+/* Imagination Technologies Meta relocations.  */
+  BFD_RELOC_METAG_HIADDR16,
+  BFD_RELOC_METAG_LOADDR16,
+  BFD_RELOC_METAG_RELBRANCH,
+  BFD_RELOC_METAG_GETSETOFF,
+  BFD_RELOC_METAG_HIOG,
+  BFD_RELOC_METAG_LOOG,
+  BFD_RELOC_METAG_REL8,
+  BFD_RELOC_METAG_REL16,
+  BFD_RELOC_METAG_HI16_GOTOFF,
+  BFD_RELOC_METAG_LO16_GOTOFF,
+  BFD_RELOC_METAG_GETSET_GOTOFF,
+  BFD_RELOC_METAG_GETSET_GOT,
+  BFD_RELOC_METAG_HI16_GOTPC,
+  BFD_RELOC_METAG_LO16_GOTPC,
+  BFD_RELOC_METAG_HI16_PLT,
+  BFD_RELOC_METAG_LO16_PLT,
+  BFD_RELOC_METAG_RELBRANCH_PLT,
+  BFD_RELOC_METAG_GOTOFF,
+  BFD_RELOC_METAG_PLT,
+  BFD_RELOC_METAG_COPY,
+  BFD_RELOC_METAG_JMP_SLOT,
+  BFD_RELOC_METAG_RELATIVE,
+  BFD_RELOC_METAG_GLOB_DAT,
+  BFD_RELOC_METAG_TLS_GD,
+  BFD_RELOC_METAG_TLS_LDM,
+  BFD_RELOC_METAG_TLS_LDO_HI16,
+  BFD_RELOC_METAG_TLS_LDO_LO16,
+  BFD_RELOC_METAG_TLS_LDO,
+  BFD_RELOC_METAG_TLS_IE,
+  BFD_RELOC_METAG_TLS_IENONPIC,
+  BFD_RELOC_METAG_TLS_IENONPIC_HI16,
+  BFD_RELOC_METAG_TLS_IENONPIC_LO16,
+  BFD_RELOC_METAG_TLS_TPOFF,
+  BFD_RELOC_METAG_TLS_DTPMOD,
+  BFD_RELOC_METAG_TLS_DTPOFF,
+  BFD_RELOC_METAG_TLS_LE,
+  BFD_RELOC_METAG_TLS_LE_HI16,
+  BFD_RELOC_METAG_TLS_LE_LO16,
+
+/* These are relocations for the GETA instruction.  */
+  BFD_RELOC_MMIX_GETA,
+  BFD_RELOC_MMIX_GETA_1,
+  BFD_RELOC_MMIX_GETA_2,
+  BFD_RELOC_MMIX_GETA_3,
+
+/* These are relocations for a conditional branch instruction.  */
+  BFD_RELOC_MMIX_CBRANCH,
+  BFD_RELOC_MMIX_CBRANCH_J,
+  BFD_RELOC_MMIX_CBRANCH_1,
+  BFD_RELOC_MMIX_CBRANCH_2,
+  BFD_RELOC_MMIX_CBRANCH_3,
+
+/* These are relocations for the PUSHJ instruction.  */
+  BFD_RELOC_MMIX_PUSHJ,
+  BFD_RELOC_MMIX_PUSHJ_1,
+  BFD_RELOC_MMIX_PUSHJ_2,
+  BFD_RELOC_MMIX_PUSHJ_3,
+  BFD_RELOC_MMIX_PUSHJ_STUBBABLE,
+
+/* These are relocations for the JMP instruction.  */
+  BFD_RELOC_MMIX_JMP,
+  BFD_RELOC_MMIX_JMP_1,
+  BFD_RELOC_MMIX_JMP_2,
+  BFD_RELOC_MMIX_JMP_3,
+
+/* This is a relocation for a relative address as in a GETA instruction or
+a branch.  */
+  BFD_RELOC_MMIX_ADDR19,
+
+/* This is a relocation for a relative address as in a JMP instruction.  */
+  BFD_RELOC_MMIX_ADDR27,
+
+/* This is a relocation for an instruction field that may be a general
+register or a value 0..255.  */
+  BFD_RELOC_MMIX_REG_OR_BYTE,
+
+/* This is a relocation for an instruction field that may be a general
+register.  */
+  BFD_RELOC_MMIX_REG,
+
+/* This is a relocation for two instruction fields holding a register and
+an offset, the equivalent of the relocation.  */
+  BFD_RELOC_MMIX_BASE_PLUS_OFFSET,
+
+/* This relocation is an assertion that the expression is not allocated as
+a global register.  It does not modify contents.  */
+  BFD_RELOC_MMIX_LOCAL,
+
+/* This is a 16 bit reloc for the AVR that stores 8 bit pc relative
+short offset into 7 bits.  */
+  BFD_RELOC_AVR_7_PCREL,
+
+/* This is a 16 bit reloc for the AVR that stores 13 bit pc relative
+short offset into 12 bits.  */
+  BFD_RELOC_AVR_13_PCREL,
+
+/* This is a 16 bit reloc for the AVR that stores 17 bit value (usually
+program memory address) into 16 bits.  */
+  BFD_RELOC_AVR_16_PM,
+
+/* This is a 16 bit reloc for the AVR that stores 8 bit value (usually
+data memory address) into 8 bit immediate value of LDI insn.  */
+  BFD_RELOC_AVR_LO8_LDI,
+
+/* This is a 16 bit reloc for the AVR that stores 8 bit value (high 8 bit
+of data memory address) into 8 bit immediate value of LDI insn.  */
+  BFD_RELOC_AVR_HI8_LDI,
+
+/* This is a 16 bit reloc for the AVR that stores 8 bit value (most high 8 bit
+of program memory address) into 8 bit immediate value of LDI insn.  */
+  BFD_RELOC_AVR_HH8_LDI,
+
+/* This is a 16 bit reloc for the AVR that stores 8 bit value (most high 8 bit
+of 32 bit value) into 8 bit immediate value of LDI insn.  */
+  BFD_RELOC_AVR_MS8_LDI,
+
+/* This is a 16 bit reloc for the AVR that stores negated 8 bit value
+(usually data memory address) into 8 bit immediate value of SUBI insn.  */
+  BFD_RELOC_AVR_LO8_LDI_NEG,
+
+/* This is a 16 bit reloc for the AVR that stores negated 8 bit value
+(high 8 bit of data memory address) into 8 bit immediate value of
+SUBI insn.  */
+  BFD_RELOC_AVR_HI8_LDI_NEG,
+
+/* This is a 16 bit reloc for the AVR that stores negated 8 bit value
+(most high 8 bit of program memory address) into 8 bit immediate value
+of LDI or SUBI insn.  */
+  BFD_RELOC_AVR_HH8_LDI_NEG,
+
+/* This is a 16 bit reloc for the AVR that stores negated 8 bit value (msb
+of 32 bit value) into 8 bit immediate value of LDI insn.  */
+  BFD_RELOC_AVR_MS8_LDI_NEG,
+
+/* This is a 16 bit reloc for the AVR that stores 8 bit value (usually
+command address) into 8 bit immediate value of LDI insn.  */
+  BFD_RELOC_AVR_LO8_LDI_PM,
+
+/* This is a 16 bit reloc for the AVR that stores 8 bit value
+(command address) into 8 bit immediate value of LDI insn. If the address
+is beyond the 128k boundary, the linker inserts a jump stub for this reloc
+in the lower 128k.  */
+  BFD_RELOC_AVR_LO8_LDI_GS,
+
+/* This is a 16 bit reloc for the AVR that stores 8 bit value (high 8 bit
+of command address) into 8 bit immediate value of LDI insn.  */
+  BFD_RELOC_AVR_HI8_LDI_PM,
+
+/* This is a 16 bit reloc for the AVR that stores 8 bit value (high 8 bit
+of command address) into 8 bit immediate value of LDI insn.  If the address
+is beyond the 128k boundary, the linker inserts a jump stub for this reloc
+below 128k.  */
+  BFD_RELOC_AVR_HI8_LDI_GS,
+
+/* This is a 16 bit reloc for the AVR that stores 8 bit value (most high 8 bit
+of command address) into 8 bit immediate value of LDI insn.  */
+  BFD_RELOC_AVR_HH8_LDI_PM,
+
+/* This is a 16 bit reloc for the AVR that stores negated 8 bit value
+(usually command address) into 8 bit immediate value of SUBI insn.  */
+  BFD_RELOC_AVR_LO8_LDI_PM_NEG,
+
+/* This is a 16 bit reloc for the AVR that stores negated 8 bit value
+(high 8 bit of 16 bit command address) into 8 bit immediate value
+of SUBI insn.  */
+  BFD_RELOC_AVR_HI8_LDI_PM_NEG,
+
+/* This is a 16 bit reloc for the AVR that stores negated 8 bit value
+(high 6 bit of 22 bit command address) into 8 bit immediate
+value of SUBI insn.  */
+  BFD_RELOC_AVR_HH8_LDI_PM_NEG,
+
+/* This is a 32 bit reloc for the AVR that stores 23 bit value
+into 22 bits.  */
+  BFD_RELOC_AVR_CALL,
+
+/* This is a 16 bit reloc for the AVR that stores all needed bits
+for absolute addressing with ldi, with the overflow check done at link time  */
+  BFD_RELOC_AVR_LDI,
+
+/* This is a 6 bit reloc for the AVR that stores offset for ldd/std
+instructions  */
+  BFD_RELOC_AVR_6,
+
+/* This is a 6 bit reloc for the AVR that stores offset for adiw/sbiw
+instructions  */
+  BFD_RELOC_AVR_6_ADIW,
+
+/* This is a 8 bit reloc for the AVR that stores bits 0..7 of a symbol
+in .byte lo8(symbol)  */
+  BFD_RELOC_AVR_8_LO,
+
+/* This is a 8 bit reloc for the AVR that stores bits 8..15 of a symbol
+in .byte hi8(symbol)  */
+  BFD_RELOC_AVR_8_HI,
+
+/* This is a 8 bit reloc for the AVR that stores bits 16..23 of a symbol
+in .byte hlo8(symbol)  */
+  BFD_RELOC_AVR_8_HLO,
+
+/* AVR relocations to mark the difference of two local symbols.
+These are only needed to support linker relaxation and can be ignored
+when not relaxing.  The field is set to the value of the difference
+assuming no relaxation.  The relocation encodes the position of the
+second symbol so the linker can determine whether to adjust the field
+value.  */
+  BFD_RELOC_AVR_DIFF8,
+  BFD_RELOC_AVR_DIFF16,
+  BFD_RELOC_AVR_DIFF32,
+
+/* This is a 7 bit reloc for the AVR that stores the SRAM address for the
+16 bit lds and sts instructions, which are supported only on the tiny core.  */
+  BFD_RELOC_AVR_LDS_STS_16,
+
+/* This is a 6 bit reloc for the AVR that stores an I/O register
+number for the IN and OUT instructions  */
+  BFD_RELOC_AVR_PORT6,
+
+/* This is a 5 bit reloc for the AVR that stores an I/O register
+number for the SBIC, SBIS, SBI and CBI instructions  */
+  BFD_RELOC_AVR_PORT5,
+
+/* RISC-V relocations.  */
+  BFD_RELOC_RISCV_HI20,
+  BFD_RELOC_RISCV_PCREL_HI20,
+  BFD_RELOC_RISCV_PCREL_LO12_I,
+  BFD_RELOC_RISCV_PCREL_LO12_S,
+  BFD_RELOC_RISCV_LO12_I,
+  BFD_RELOC_RISCV_LO12_S,
+  BFD_RELOC_RISCV_GPREL12_I,
+  BFD_RELOC_RISCV_GPREL12_S,
+  BFD_RELOC_RISCV_TPREL_HI20,
+  BFD_RELOC_RISCV_TPREL_LO12_I,
+  BFD_RELOC_RISCV_TPREL_LO12_S,
+  BFD_RELOC_RISCV_TPREL_ADD,
+  BFD_RELOC_RISCV_CALL,
+  BFD_RELOC_RISCV_CALL_PLT,
+  BFD_RELOC_RISCV_ADD8,
+  BFD_RELOC_RISCV_ADD16,
+  BFD_RELOC_RISCV_ADD32,
+  BFD_RELOC_RISCV_ADD64,
+  BFD_RELOC_RISCV_SUB8,
+  BFD_RELOC_RISCV_SUB16,
+  BFD_RELOC_RISCV_SUB32,
+  BFD_RELOC_RISCV_SUB64,
+  BFD_RELOC_RISCV_GOT_HI20,
+  BFD_RELOC_RISCV_TLS_GOT_HI20,
+  BFD_RELOC_RISCV_TLS_GD_HI20,
+  BFD_RELOC_RISCV_JMP,
+  BFD_RELOC_RISCV_TLS_DTPMOD32,
+  BFD_RELOC_RISCV_TLS_DTPREL32,
+  BFD_RELOC_RISCV_TLS_DTPMOD64,
+  BFD_RELOC_RISCV_TLS_DTPREL64,
+  BFD_RELOC_RISCV_TLS_TPREL32,
+  BFD_RELOC_RISCV_TLS_TPREL64,
+  BFD_RELOC_RISCV_ALIGN,
+  BFD_RELOC_RISCV_RVC_BRANCH,
+  BFD_RELOC_RISCV_RVC_JUMP,
+  BFD_RELOC_RISCV_RVC_LUI,
+  BFD_RELOC_RISCV_GPREL_I,
+  BFD_RELOC_RISCV_GPREL_S,
+  BFD_RELOC_RISCV_TPREL_I,
+  BFD_RELOC_RISCV_TPREL_S,
+  BFD_RELOC_RISCV_RELAX,
+  BFD_RELOC_RISCV_CFA,
+  BFD_RELOC_RISCV_SUB6,
+  BFD_RELOC_RISCV_SET6,
+  BFD_RELOC_RISCV_SET8,
+  BFD_RELOC_RISCV_SET16,
+  BFD_RELOC_RISCV_SET32,
+/* Riscv, Pulp Specific */
+  BFD_RELOC_RISCV_REL12,
+  BFD_RELOC_RISCV_RELU5,
+  BFD_RELOC_RISCV_12_I,
+  BFD_RELOC_RISCV_12_S,
+
+
+/* Renesas RL78 Relocations.  */
+  BFD_RELOC_RL78_NEG8,
+  BFD_RELOC_RL78_NEG16,
+  BFD_RELOC_RL78_NEG24,
+  BFD_RELOC_RL78_NEG32,
+  BFD_RELOC_RL78_16_OP,
+  BFD_RELOC_RL78_24_OP,
+  BFD_RELOC_RL78_32_OP,
+  BFD_RELOC_RL78_8U,
+  BFD_RELOC_RL78_16U,
+  BFD_RELOC_RL78_24U,
+  BFD_RELOC_RL78_DIR3U_PCREL,
+  BFD_RELOC_RL78_DIFF,
+  BFD_RELOC_RL78_GPRELB,
+  BFD_RELOC_RL78_GPRELW,
+  BFD_RELOC_RL78_GPRELL,
+  BFD_RELOC_RL78_SYM,
+  BFD_RELOC_RL78_OP_SUBTRACT,
+  BFD_RELOC_RL78_OP_NEG,
+  BFD_RELOC_RL78_OP_AND,
+  BFD_RELOC_RL78_OP_SHRA,
+  BFD_RELOC_RL78_ABS8,
+  BFD_RELOC_RL78_ABS16,
+  BFD_RELOC_RL78_ABS16_REV,
+  BFD_RELOC_RL78_ABS32,
+  BFD_RELOC_RL78_ABS32_REV,
+  BFD_RELOC_RL78_ABS16U,
+  BFD_RELOC_RL78_ABS16UW,
+  BFD_RELOC_RL78_ABS16UL,
+  BFD_RELOC_RL78_RELAX,
+  BFD_RELOC_RL78_HI16,
+  BFD_RELOC_RL78_HI8,
+  BFD_RELOC_RL78_LO16,
+  BFD_RELOC_RL78_CODE,
+  BFD_RELOC_RL78_SADDR,
+
+/* Renesas RX Relocations.  */
+  BFD_RELOC_RX_NEG8,
+  BFD_RELOC_RX_NEG16,
+  BFD_RELOC_RX_NEG24,
+  BFD_RELOC_RX_NEG32,
+  BFD_RELOC_RX_16_OP,
+  BFD_RELOC_RX_24_OP,
+  BFD_RELOC_RX_32_OP,
+  BFD_RELOC_RX_8U,
+  BFD_RELOC_RX_16U,
+  BFD_RELOC_RX_24U,
+  BFD_RELOC_RX_DIR3U_PCREL,
+  BFD_RELOC_RX_DIFF,
+  BFD_RELOC_RX_GPRELB,
+  BFD_RELOC_RX_GPRELW,
+  BFD_RELOC_RX_GPRELL,
+  BFD_RELOC_RX_SYM,
+  BFD_RELOC_RX_OP_SUBTRACT,
+  BFD_RELOC_RX_OP_NEG,
+  BFD_RELOC_RX_ABS8,
+  BFD_RELOC_RX_ABS16,
+  BFD_RELOC_RX_ABS16_REV,
+  BFD_RELOC_RX_ABS32,
+  BFD_RELOC_RX_ABS32_REV,
+  BFD_RELOC_RX_ABS16U,
+  BFD_RELOC_RX_ABS16UW,
+  BFD_RELOC_RX_ABS16UL,
+  BFD_RELOC_RX_RELAX,
+
+/* Direct 12 bit.  */
+  BFD_RELOC_390_12,
+
+/* 12 bit GOT offset.  */
+  BFD_RELOC_390_GOT12,
+
+/* 32 bit PC relative PLT address.  */
+  BFD_RELOC_390_PLT32,
+
+/* Copy symbol at runtime.  */
+  BFD_RELOC_390_COPY,
+
+/* Create GOT entry.  */
+  BFD_RELOC_390_GLOB_DAT,
+
+/* Create PLT entry.  */
+  BFD_RELOC_390_JMP_SLOT,
+
+/* Adjust by program base.  */
+  BFD_RELOC_390_RELATIVE,
+
+/* 32 bit PC relative offset to GOT.  */
+  BFD_RELOC_390_GOTPC,
+
+/* 16 bit GOT offset.  */
+  BFD_RELOC_390_GOT16,
+
+/* PC relative 12 bit shifted by 1.  */
+  BFD_RELOC_390_PC12DBL,
+
+/* 12 bit PC rel. PLT shifted by 1.  */
+  BFD_RELOC_390_PLT12DBL,
+
+/* PC relative 16 bit shifted by 1.  */
+  BFD_RELOC_390_PC16DBL,
+
+/* 16 bit PC rel. PLT shifted by 1.  */
+  BFD_RELOC_390_PLT16DBL,
+
+/* PC relative 24 bit shifted by 1.  */
+  BFD_RELOC_390_PC24DBL,
+
+/* 24 bit PC rel. PLT shifted by 1.  */
+  BFD_RELOC_390_PLT24DBL,
+
+/* PC relative 32 bit shifted by 1.  */
+  BFD_RELOC_390_PC32DBL,
+
+/* 32 bit PC rel. PLT shifted by 1.  */
+  BFD_RELOC_390_PLT32DBL,
+
+/* 32 bit PC rel. GOT shifted by 1.  */
+  BFD_RELOC_390_GOTPCDBL,
+
+/* 64 bit GOT offset.  */
+  BFD_RELOC_390_GOT64,
+
+/* 64 bit PC relative PLT address.  */
+  BFD_RELOC_390_PLT64,
+
+/* 32 bit rel. offset to GOT entry.  */
+  BFD_RELOC_390_GOTENT,
+
+/* 64 bit offset to GOT.  */
+  BFD_RELOC_390_GOTOFF64,
+
+/* 12-bit offset to symbol-entry within GOT, with PLT handling.  */
+  BFD_RELOC_390_GOTPLT12,
+
+/* 16-bit offset to symbol-entry within GOT, with PLT handling.  */
+  BFD_RELOC_390_GOTPLT16,
+
+/* 32-bit offset to symbol-entry within GOT, with PLT handling.  */
+  BFD_RELOC_390_GOTPLT32,
+
+/* 64-bit offset to symbol-entry within GOT, with PLT handling.  */
+  BFD_RELOC_390_GOTPLT64,
+
+/* 32-bit rel. offset to symbol-entry within GOT, with PLT handling.  */
+  BFD_RELOC_390_GOTPLTENT,
+
+/* 16-bit rel. offset from the GOT to a PLT entry.  */
+  BFD_RELOC_390_PLTOFF16,
+
+/* 32-bit rel. offset from the GOT to a PLT entry.  */
+  BFD_RELOC_390_PLTOFF32,
+
+/* 64-bit rel. offset from the GOT to a PLT entry.  */
+  BFD_RELOC_390_PLTOFF64,
+
+/* s390 tls relocations.  */
+  BFD_RELOC_390_TLS_LOAD,
+  BFD_RELOC_390_TLS_GDCALL,
+  BFD_RELOC_390_TLS_LDCALL,
+  BFD_RELOC_390_TLS_GD32,
+  BFD_RELOC_390_TLS_GD64,
+  BFD_RELOC_390_TLS_GOTIE12,
+  BFD_RELOC_390_TLS_GOTIE32,
+  BFD_RELOC_390_TLS_GOTIE64,
+  BFD_RELOC_390_TLS_LDM32,
+  BFD_RELOC_390_TLS_LDM64,
+  BFD_RELOC_390_TLS_IE32,
+  BFD_RELOC_390_TLS_IE64,
+  BFD_RELOC_390_TLS_IEENT,
+  BFD_RELOC_390_TLS_LE32,
+  BFD_RELOC_390_TLS_LE64,
+  BFD_RELOC_390_TLS_LDO32,
+  BFD_RELOC_390_TLS_LDO64,
+  BFD_RELOC_390_TLS_DTPMOD,
+  BFD_RELOC_390_TLS_DTPOFF,
+  BFD_RELOC_390_TLS_TPOFF,
+
+/* Long displacement extension.  */
+  BFD_RELOC_390_20,
+  BFD_RELOC_390_GOT20,
+  BFD_RELOC_390_GOTPLT20,
+  BFD_RELOC_390_TLS_GOTIE20,
+
+/* STT_GNU_IFUNC relocation.  */
+  BFD_RELOC_390_IRELATIVE,
+
+/* Score relocations
+Low 16 bit for load/store  */
+  BFD_RELOC_SCORE_GPREL15,
+
+/* This is a 24-bit reloc with the right 1 bit assumed to be 0  */
+  BFD_RELOC_SCORE_DUMMY2,
+  BFD_RELOC_SCORE_JMP,
+
+/* This is a 19-bit reloc with the right 1 bit assumed to be 0  */
+  BFD_RELOC_SCORE_BRANCH,
+
+/* This is a 32-bit reloc for 48-bit instructions.  */
+  BFD_RELOC_SCORE_IMM30,
+
+/* This is a 32-bit reloc for 48-bit instructions.  */
+  BFD_RELOC_SCORE_IMM32,
+
+/* This is a 11-bit reloc with the right 1 bit assumed to be 0  */
+  BFD_RELOC_SCORE16_JMP,
+
+/* This is a 8-bit reloc with the right 1 bit assumed to be 0  */
+  BFD_RELOC_SCORE16_BRANCH,
+
+/* This is a 9-bit reloc with the right 1 bit assumed to be 0  */
+  BFD_RELOC_SCORE_BCMP,
+
+/* Undocumented Score relocs  */
+  BFD_RELOC_SCORE_GOT15,
+  BFD_RELOC_SCORE_GOT_LO16,
+  BFD_RELOC_SCORE_CALL15,
+  BFD_RELOC_SCORE_DUMMY_HI16,
+
+/* Scenix IP2K - 9-bit register number / data address  */
+  BFD_RELOC_IP2K_FR9,
+
+/* Scenix IP2K - 4-bit register/data bank number  */
+  BFD_RELOC_IP2K_BANK,
+
+/* Scenix IP2K - low 13 bits of instruction word address  */
+  BFD_RELOC_IP2K_ADDR16CJP,
+
+/* Scenix IP2K - high 3 bits of instruction word address  */
+  BFD_RELOC_IP2K_PAGE3,
+
+/* Scenix IP2K - ext/low/high 8 bits of data address  */
+  BFD_RELOC_IP2K_LO8DATA,
+  BFD_RELOC_IP2K_HI8DATA,
+  BFD_RELOC_IP2K_EX8DATA,
+
+/* Scenix IP2K - low/high 8 bits of instruction word address  */
+  BFD_RELOC_IP2K_LO8INSN,
+  BFD_RELOC_IP2K_HI8INSN,
+
+/* Scenix IP2K - even/odd PC modifier to modify snb pcl.0  */
+  BFD_RELOC_IP2K_PC_SKIP,
+
+/* Scenix IP2K - 16 bit word address in text section.  */
+  BFD_RELOC_IP2K_TEXT,
+
+/* Scenix IP2K - 7-bit sp or dp offset  */
+  BFD_RELOC_IP2K_FR_OFFSET,
+
+/* Scenix VPE4K coprocessor - data/insn-space addressing  */
+  BFD_RELOC_VPE4KMATH_DATA,
+  BFD_RELOC_VPE4KMATH_INSN,
+
+/* These two relocations are used by the linker to determine which of
+the entries in a C++ virtual function table are actually used.  When
+the --gc-sections option is given, the linker will zero out the entries
+that are not used, so that the code for those functions need not be
+included in the output.
+
+VTABLE_INHERIT is a zero-space relocation used to describe to the
+linker the inheritance tree of a C++ virtual function table.  The
+relocation's symbol should be the parent class' vtable, and the
+relocation should be located at the child vtable.
+
+VTABLE_ENTRY is a zero-space relocation that describes the use of a
+virtual function table entry.  The reloc's symbol should refer to the
+table of the class mentioned in the code.  Off of that base, an offset
+describes the entry that is being used.  For Rela hosts, this offset
+is stored in the reloc's addend.  For Rel hosts, we are forced to put
+this offset in the reloc's section offset.  */
+  BFD_RELOC_VTABLE_INHERIT,
+  BFD_RELOC_VTABLE_ENTRY,
+
+/* Intel IA64 Relocations.  */
+  BFD_RELOC_IA64_IMM14,
+  BFD_RELOC_IA64_IMM22,
+  BFD_RELOC_IA64_IMM64,
+  BFD_RELOC_IA64_DIR32MSB,
+  BFD_RELOC_IA64_DIR32LSB,
+  BFD_RELOC_IA64_DIR64MSB,
+  BFD_RELOC_IA64_DIR64LSB,
+  BFD_RELOC_IA64_GPREL22,
+  BFD_RELOC_IA64_GPREL64I,
+  BFD_RELOC_IA64_GPREL32MSB,
+  BFD_RELOC_IA64_GPREL32LSB,
+  BFD_RELOC_IA64_GPREL64MSB,
+  BFD_RELOC_IA64_GPREL64LSB,
+  BFD_RELOC_IA64_LTOFF22,
+  BFD_RELOC_IA64_LTOFF64I,
+  BFD_RELOC_IA64_PLTOFF22,
+  BFD_RELOC_IA64_PLTOFF64I,
+  BFD_RELOC_IA64_PLTOFF64MSB,
+  BFD_RELOC_IA64_PLTOFF64LSB,
+  BFD_RELOC_IA64_FPTR64I,
+  BFD_RELOC_IA64_FPTR32MSB,
+  BFD_RELOC_IA64_FPTR32LSB,
+  BFD_RELOC_IA64_FPTR64MSB,
+  BFD_RELOC_IA64_FPTR64LSB,
+  BFD_RELOC_IA64_PCREL21B,
+  BFD_RELOC_IA64_PCREL21BI,
+  BFD_RELOC_IA64_PCREL21M,
+  BFD_RELOC_IA64_PCREL21F,
+  BFD_RELOC_IA64_PCREL22,
+  BFD_RELOC_IA64_PCREL60B,
+  BFD_RELOC_IA64_PCREL64I,
+  BFD_RELOC_IA64_PCREL32MSB,
+  BFD_RELOC_IA64_PCREL32LSB,
+  BFD_RELOC_IA64_PCREL64MSB,
+  BFD_RELOC_IA64_PCREL64LSB,
+  BFD_RELOC_IA64_LTOFF_FPTR22,
+  BFD_RELOC_IA64_LTOFF_FPTR64I,
+  BFD_RELOC_IA64_LTOFF_FPTR32MSB,
+  BFD_RELOC_IA64_LTOFF_FPTR32LSB,
+  BFD_RELOC_IA64_LTOFF_FPTR64MSB,
+  BFD_RELOC_IA64_LTOFF_FPTR64LSB,
+  BFD_RELOC_IA64_SEGREL32MSB,
+  BFD_RELOC_IA64_SEGREL32LSB,
+  BFD_RELOC_IA64_SEGREL64MSB,
+  BFD_RELOC_IA64_SEGREL64LSB,
+  BFD_RELOC_IA64_SECREL32MSB,
+  BFD_RELOC_IA64_SECREL32LSB,
+  BFD_RELOC_IA64_SECREL64MSB,
+  BFD_RELOC_IA64_SECREL64LSB,
+  BFD_RELOC_IA64_REL32MSB,
+  BFD_RELOC_IA64_REL32LSB,
+  BFD_RELOC_IA64_REL64MSB,
+  BFD_RELOC_IA64_REL64LSB,
+  BFD_RELOC_IA64_LTV32MSB,
+  BFD_RELOC_IA64_LTV32LSB,
+  BFD_RELOC_IA64_LTV64MSB,
+  BFD_RELOC_IA64_LTV64LSB,
+  BFD_RELOC_IA64_IPLTMSB,
+  BFD_RELOC_IA64_IPLTLSB,
+  BFD_RELOC_IA64_COPY,
+  BFD_RELOC_IA64_LTOFF22X,
+  BFD_RELOC_IA64_LDXMOV,
+  BFD_RELOC_IA64_TPREL14,
+  BFD_RELOC_IA64_TPREL22,
+  BFD_RELOC_IA64_TPREL64I,
+  BFD_RELOC_IA64_TPREL64MSB,
+  BFD_RELOC_IA64_TPREL64LSB,
+  BFD_RELOC_IA64_LTOFF_TPREL22,
+  BFD_RELOC_IA64_DTPMOD64MSB,
+  BFD_RELOC_IA64_DTPMOD64LSB,
+  BFD_RELOC_IA64_LTOFF_DTPMOD22,
+  BFD_RELOC_IA64_DTPREL14,
+  BFD_RELOC_IA64_DTPREL22,
+  BFD_RELOC_IA64_DTPREL64I,
+  BFD_RELOC_IA64_DTPREL32MSB,
+  BFD_RELOC_IA64_DTPREL32LSB,
+  BFD_RELOC_IA64_DTPREL64MSB,
+  BFD_RELOC_IA64_DTPREL64LSB,
+  BFD_RELOC_IA64_LTOFF_DTPREL22,
+
+/* Motorola 68HC11 reloc.
+This is the 8 bit high part of an absolute address.  */
+  BFD_RELOC_M68HC11_HI8,
+
+/* Motorola 68HC11 reloc.
+This is the 8 bit low part of an absolute address.  */
+  BFD_RELOC_M68HC11_LO8,
+
+/* Motorola 68HC11 reloc.
+This is the 3 bit of a value.  */
+  BFD_RELOC_M68HC11_3B,
+
+/* Motorola 68HC11 reloc.
+This reloc marks the beginning of a jump/call instruction.
+It is used for linker relaxation to correctly identify beginning
+of instruction and change some branches to use PC-relative
+addressing mode.  */
+  BFD_RELOC_M68HC11_RL_JUMP,
+
+/* Motorola 68HC11 reloc.
+This reloc marks a group of several instructions that gcc generates
+and for which the linker relaxation pass can modify and/or remove
+some of them.  */
+  BFD_RELOC_M68HC11_RL_GROUP,
+
+/* Motorola 68HC11 reloc.
+This is the 16-bit lower part of an address.  It is used for 'call'
+instruction to specify the symbol address without any special
+transformation (due to memory bank window).  */
+  BFD_RELOC_M68HC11_LO16,
+
+/* Motorola 68HC11 reloc.
+This is a 8-bit reloc that specifies the page number of an address.
+It is used by 'call' instruction to specify the page number of
+the symbol.  */
+  BFD_RELOC_M68HC11_PAGE,
+
+/* Motorola 68HC11 reloc.
+This is a 24-bit reloc that represents the address with a 16-bit
+value and a 8-bit page number.  The symbol address is transformed
+to follow the 16K memory bank of 68HC12 (seen as mapped in the window).  */
+  BFD_RELOC_M68HC11_24,
+
+/* Motorola 68HC12 reloc.
+This is the 5 bits of a value.  */
+  BFD_RELOC_M68HC12_5B,
+
+/* Freescale XGATE reloc.
+This reloc marks the beginning of a bra/jal instruction.  */
+  BFD_RELOC_XGATE_RL_JUMP,
+
+/* Freescale XGATE reloc.
+This reloc marks a group of several instructions that gcc generates
+and for which the linker relaxation pass can modify and/or remove
+some of them.  */
+  BFD_RELOC_XGATE_RL_GROUP,
+
+/* Freescale XGATE reloc.
+This is the 16-bit lower part of an address.  It is used for the '16-bit'
+instructions.  */
+  BFD_RELOC_XGATE_LO16,
+
+/* Freescale XGATE reloc.  */
+  BFD_RELOC_XGATE_GPAGE,
+
+/* Freescale XGATE reloc.  */
+  BFD_RELOC_XGATE_24,
+
+/* Freescale XGATE reloc.
+This is a 9-bit pc-relative reloc.  */
+  BFD_RELOC_XGATE_PCREL_9,
+
+/* Freescale XGATE reloc.
+This is a 10-bit pc-relative reloc.  */
+  BFD_RELOC_XGATE_PCREL_10,
+
+/* Freescale XGATE reloc.
+This is the 16-bit lower part of an address.  It is used for the '16-bit'
+instructions.  */
+  BFD_RELOC_XGATE_IMM8_LO,
+
+/* Freescale XGATE reloc.
+This is the 16-bit higher part of an address.  It is used for the '16-bit'
+instructions.  */
+  BFD_RELOC_XGATE_IMM8_HI,
+
+/* Freescale XGATE reloc.
+This is a 3-bit pc-relative reloc.  */
+  BFD_RELOC_XGATE_IMM3,
+
+/* Freescale XGATE reloc.
+This is a 4-bit pc-relative reloc.  */
+  BFD_RELOC_XGATE_IMM4,
+
+/* Freescale XGATE reloc.
+This is a 5-bit pc-relative reloc.  */
+  BFD_RELOC_XGATE_IMM5,
+
+/* Motorola 68HC12 reloc.
+This is the 9 bits of a value.  */
+  BFD_RELOC_M68HC12_9B,
+
+/* Motorola 68HC12 reloc.
+This is the 16 bits of a value.  */
+  BFD_RELOC_M68HC12_16B,
+
+/* Motorola 68HC12/XGATE reloc.
+This is a PCREL9 branch.  */
+  BFD_RELOC_M68HC12_9_PCREL,
+
+/* Motorola 68HC12/XGATE reloc.
+This is a PCREL10 branch.  */
+  BFD_RELOC_M68HC12_10_PCREL,
+
+/* Motorola 68HC12/XGATE reloc.
+This is the 8 bit low part of an absolute address and immediately precedes
+a matching HI8XG part.  */
+  BFD_RELOC_M68HC12_LO8XG,
+
+/* Motorola 68HC12/XGATE reloc.
+This is the 8 bit high part of an absolute address and immediately follows
+a matching LO8XG part.  */
+  BFD_RELOC_M68HC12_HI8XG,
+
+/* NS CR16C Relocations.  */
+  BFD_RELOC_16C_NUM08,
+  BFD_RELOC_16C_NUM08_C,
+  BFD_RELOC_16C_NUM16,
+  BFD_RELOC_16C_NUM16_C,
+  BFD_RELOC_16C_NUM32,
+  BFD_RELOC_16C_NUM32_C,
+  BFD_RELOC_16C_DISP04,
+  BFD_RELOC_16C_DISP04_C,
+  BFD_RELOC_16C_DISP08,
+  BFD_RELOC_16C_DISP08_C,
+  BFD_RELOC_16C_DISP16,
+  BFD_RELOC_16C_DISP16_C,
+  BFD_RELOC_16C_DISP24,
+  BFD_RELOC_16C_DISP24_C,
+  BFD_RELOC_16C_DISP24a,
+  BFD_RELOC_16C_DISP24a_C,
+  BFD_RELOC_16C_REG04,
+  BFD_RELOC_16C_REG04_C,
+  BFD_RELOC_16C_REG04a,
+  BFD_RELOC_16C_REG04a_C,
+  BFD_RELOC_16C_REG14,
+  BFD_RELOC_16C_REG14_C,
+  BFD_RELOC_16C_REG16,
+  BFD_RELOC_16C_REG16_C,
+  BFD_RELOC_16C_REG20,
+  BFD_RELOC_16C_REG20_C,
+  BFD_RELOC_16C_ABS20,
+  BFD_RELOC_16C_ABS20_C,
+  BFD_RELOC_16C_ABS24,
+  BFD_RELOC_16C_ABS24_C,
+  BFD_RELOC_16C_IMM04,
+  BFD_RELOC_16C_IMM04_C,
+  BFD_RELOC_16C_IMM16,
+  BFD_RELOC_16C_IMM16_C,
+  BFD_RELOC_16C_IMM20,
+  BFD_RELOC_16C_IMM20_C,
+  BFD_RELOC_16C_IMM24,
+  BFD_RELOC_16C_IMM24_C,
+  BFD_RELOC_16C_IMM32,
+  BFD_RELOC_16C_IMM32_C,
+
+/* NS CR16 Relocations.  */
+  BFD_RELOC_CR16_NUM8,
+  BFD_RELOC_CR16_NUM16,
+  BFD_RELOC_CR16_NUM32,
+  BFD_RELOC_CR16_NUM32a,
+  BFD_RELOC_CR16_REGREL0,
+  BFD_RELOC_CR16_REGREL4,
+  BFD_RELOC_CR16_REGREL4a,
+  BFD_RELOC_CR16_REGREL14,
+  BFD_RELOC_CR16_REGREL14a,
+  BFD_RELOC_CR16_REGREL16,
+  BFD_RELOC_CR16_REGREL20,
+  BFD_RELOC_CR16_REGREL20a,
+  BFD_RELOC_CR16_ABS20,
+  BFD_RELOC_CR16_ABS24,
+  BFD_RELOC_CR16_IMM4,
+  BFD_RELOC_CR16_IMM8,
+  BFD_RELOC_CR16_IMM16,
+  BFD_RELOC_CR16_IMM20,
+  BFD_RELOC_CR16_IMM24,
+  BFD_RELOC_CR16_IMM32,
+  BFD_RELOC_CR16_IMM32a,
+  BFD_RELOC_CR16_DISP4,
+  BFD_RELOC_CR16_DISP8,
+  BFD_RELOC_CR16_DISP16,
+  BFD_RELOC_CR16_DISP20,
+  BFD_RELOC_CR16_DISP24,
+  BFD_RELOC_CR16_DISP24a,
+  BFD_RELOC_CR16_SWITCH8,
+  BFD_RELOC_CR16_SWITCH16,
+  BFD_RELOC_CR16_SWITCH32,
+  BFD_RELOC_CR16_GOT_REGREL20,
+  BFD_RELOC_CR16_GOTC_REGREL20,
+  BFD_RELOC_CR16_GLOB_DAT,
+
+/* NS CRX Relocations.  */
+  BFD_RELOC_CRX_REL4,
+  BFD_RELOC_CRX_REL8,
+  BFD_RELOC_CRX_REL8_CMP,
+  BFD_RELOC_CRX_REL16,
+  BFD_RELOC_CRX_REL24,
+  BFD_RELOC_CRX_REL32,
+  BFD_RELOC_CRX_REGREL12,
+  BFD_RELOC_CRX_REGREL22,
+  BFD_RELOC_CRX_REGREL28,
+  BFD_RELOC_CRX_REGREL32,
+  BFD_RELOC_CRX_ABS16,
+  BFD_RELOC_CRX_ABS32,
+  BFD_RELOC_CRX_NUM8,
+  BFD_RELOC_CRX_NUM16,
+  BFD_RELOC_CRX_NUM32,
+  BFD_RELOC_CRX_IMM16,
+  BFD_RELOC_CRX_IMM32,
+  BFD_RELOC_CRX_SWITCH8,
+  BFD_RELOC_CRX_SWITCH16,
+  BFD_RELOC_CRX_SWITCH32,
+
+/* These relocs are only used within the CRIS assembler.  They are not
+(at present) written to any object files.  */
+  BFD_RELOC_CRIS_BDISP8,
+  BFD_RELOC_CRIS_UNSIGNED_5,
+  BFD_RELOC_CRIS_SIGNED_6,
+  BFD_RELOC_CRIS_UNSIGNED_6,
+  BFD_RELOC_CRIS_SIGNED_8,
+  BFD_RELOC_CRIS_UNSIGNED_8,
+  BFD_RELOC_CRIS_SIGNED_16,
+  BFD_RELOC_CRIS_UNSIGNED_16,
+  BFD_RELOC_CRIS_LAPCQ_OFFSET,
+  BFD_RELOC_CRIS_UNSIGNED_4,
+
+/* Relocs used in ELF shared libraries for CRIS.  */
+  BFD_RELOC_CRIS_COPY,
+  BFD_RELOC_CRIS_GLOB_DAT,
+  BFD_RELOC_CRIS_JUMP_SLOT,
+  BFD_RELOC_CRIS_RELATIVE,
+
+/* 32-bit offset to symbol-entry within GOT.  */
+  BFD_RELOC_CRIS_32_GOT,
+
+/* 16-bit offset to symbol-entry within GOT.  */
+  BFD_RELOC_CRIS_16_GOT,
+
+/* 32-bit offset to symbol-entry within GOT, with PLT handling.  */
+  BFD_RELOC_CRIS_32_GOTPLT,
+
+/* 16-bit offset to symbol-entry within GOT, with PLT handling.  */
+  BFD_RELOC_CRIS_16_GOTPLT,
+
+/* 32-bit offset to symbol, relative to GOT.  */
+  BFD_RELOC_CRIS_32_GOTREL,
+
+/* 32-bit offset to symbol with PLT entry, relative to GOT.  */
+  BFD_RELOC_CRIS_32_PLT_GOTREL,
+
+/* 32-bit offset to symbol with PLT entry, relative to this relocation.  */
+  BFD_RELOC_CRIS_32_PLT_PCREL,
+
+/* Relocs used in TLS code for CRIS.  */
+  BFD_RELOC_CRIS_32_GOT_GD,
+  BFD_RELOC_CRIS_16_GOT_GD,
+  BFD_RELOC_CRIS_32_GD,
+  BFD_RELOC_CRIS_DTP,
+  BFD_RELOC_CRIS_32_DTPREL,
+  BFD_RELOC_CRIS_16_DTPREL,
+  BFD_RELOC_CRIS_32_GOT_TPREL,
+  BFD_RELOC_CRIS_16_GOT_TPREL,
+  BFD_RELOC_CRIS_32_TPREL,
+  BFD_RELOC_CRIS_16_TPREL,
+  BFD_RELOC_CRIS_DTPMOD,
+  BFD_RELOC_CRIS_32_IE,
+
+/* Intel i860 Relocations.  */
+  BFD_RELOC_860_COPY,
+  BFD_RELOC_860_GLOB_DAT,
+  BFD_RELOC_860_JUMP_SLOT,
+  BFD_RELOC_860_RELATIVE,
+  BFD_RELOC_860_PC26,
+  BFD_RELOC_860_PLT26,
+  BFD_RELOC_860_PC16,
+  BFD_RELOC_860_LOW0,
+  BFD_RELOC_860_SPLIT0,
+  BFD_RELOC_860_LOW1,
+  BFD_RELOC_860_SPLIT1,
+  BFD_RELOC_860_LOW2,
+  BFD_RELOC_860_SPLIT2,
+  BFD_RELOC_860_LOW3,
+  BFD_RELOC_860_LOGOT0,
+  BFD_RELOC_860_SPGOT0,
+  BFD_RELOC_860_LOGOT1,
+  BFD_RELOC_860_SPGOT1,
+  BFD_RELOC_860_LOGOTOFF0,
+  BFD_RELOC_860_SPGOTOFF0,
+  BFD_RELOC_860_LOGOTOFF1,
+  BFD_RELOC_860_SPGOTOFF1,
+  BFD_RELOC_860_LOGOTOFF2,
+  BFD_RELOC_860_LOGOTOFF3,
+  BFD_RELOC_860_LOPC,
+  BFD_RELOC_860_HIGHADJ,
+  BFD_RELOC_860_HAGOT,
+  BFD_RELOC_860_HAGOTOFF,
+  BFD_RELOC_860_HAPC,
+  BFD_RELOC_860_HIGH,
+  BFD_RELOC_860_HIGOT,
+  BFD_RELOC_860_HIGOTOFF,
+
+/* OpenRISC 1000 Relocations.  */
+  BFD_RELOC_OR1K_REL_26,
+  BFD_RELOC_OR1K_GOTPC_HI16,
+  BFD_RELOC_OR1K_GOTPC_LO16,
+  BFD_RELOC_OR1K_GOT16,
+  BFD_RELOC_OR1K_PLT26,
+  BFD_RELOC_OR1K_GOTOFF_HI16,
+  BFD_RELOC_OR1K_GOTOFF_LO16,
+  BFD_RELOC_OR1K_COPY,
+  BFD_RELOC_OR1K_GLOB_DAT,
+  BFD_RELOC_OR1K_JMP_SLOT,
+  BFD_RELOC_OR1K_RELATIVE,
+  BFD_RELOC_OR1K_TLS_GD_HI16,
+  BFD_RELOC_OR1K_TLS_GD_LO16,
+  BFD_RELOC_OR1K_TLS_LDM_HI16,
+  BFD_RELOC_OR1K_TLS_LDM_LO16,
+  BFD_RELOC_OR1K_TLS_LDO_HI16,
+  BFD_RELOC_OR1K_TLS_LDO_LO16,
+  BFD_RELOC_OR1K_TLS_IE_HI16,
+  BFD_RELOC_OR1K_TLS_IE_LO16,
+  BFD_RELOC_OR1K_TLS_LE_HI16,
+  BFD_RELOC_OR1K_TLS_LE_LO16,
+  BFD_RELOC_OR1K_TLS_TPOFF,
+  BFD_RELOC_OR1K_TLS_DTPOFF,
+  BFD_RELOC_OR1K_TLS_DTPMOD,
+
+/* H8 elf Relocations.  */
+  BFD_RELOC_H8_DIR16A8,
+  BFD_RELOC_H8_DIR16R8,
+  BFD_RELOC_H8_DIR24A8,
+  BFD_RELOC_H8_DIR24R8,
+  BFD_RELOC_H8_DIR32A16,
+  BFD_RELOC_H8_DISP32A16,
+
+/* Sony Xstormy16 Relocations.  */
+  BFD_RELOC_XSTORMY16_REL_12,
+  BFD_RELOC_XSTORMY16_12,
+  BFD_RELOC_XSTORMY16_24,
+  BFD_RELOC_XSTORMY16_FPTR16,
+
+/* Self-describing complex relocations.  */
+  BFD_RELOC_RELC,
+
+
+/* Infineon Relocations.  */
+  BFD_RELOC_XC16X_PAG,
+  BFD_RELOC_XC16X_POF,
+  BFD_RELOC_XC16X_SEG,
+  BFD_RELOC_XC16X_SOF,
+
+/* Relocations used by VAX ELF.  */
+  BFD_RELOC_VAX_GLOB_DAT,
+  BFD_RELOC_VAX_JMP_SLOT,
+  BFD_RELOC_VAX_RELATIVE,
+
+/* Morpho MT - 16 bit immediate relocation.  */
+  BFD_RELOC_MT_PC16,
+
+/* Morpho MT - Hi 16 bits of an address.  */
+  BFD_RELOC_MT_HI16,
+
+/* Morpho MT - Low 16 bits of an address.  */
+  BFD_RELOC_MT_LO16,
+
+/* Morpho MT - Used to tell the linker which vtable entries are used.  */
+  BFD_RELOC_MT_GNU_VTINHERIT,
+
+/* Morpho MT - Used to tell the linker which vtable entries are used.  */
+  BFD_RELOC_MT_GNU_VTENTRY,
+
+/* Morpho MT - 8 bit immediate relocation.  */
+  BFD_RELOC_MT_PCINSN8,
+
+/* msp430 specific relocation codes  */
+  BFD_RELOC_MSP430_10_PCREL,
+  BFD_RELOC_MSP430_16_PCREL,
+  BFD_RELOC_MSP430_16,
+  BFD_RELOC_MSP430_16_PCREL_BYTE,
+  BFD_RELOC_MSP430_16_BYTE,
+  BFD_RELOC_MSP430_2X_PCREL,
+  BFD_RELOC_MSP430_RL_PCREL,
+  BFD_RELOC_MSP430_ABS8,
+  BFD_RELOC_MSP430X_PCR20_EXT_SRC,
+  BFD_RELOC_MSP430X_PCR20_EXT_DST,
+  BFD_RELOC_MSP430X_PCR20_EXT_ODST,
+  BFD_RELOC_MSP430X_ABS20_EXT_SRC,
+  BFD_RELOC_MSP430X_ABS20_EXT_DST,
+  BFD_RELOC_MSP430X_ABS20_EXT_ODST,
+  BFD_RELOC_MSP430X_ABS20_ADR_SRC,
+  BFD_RELOC_MSP430X_ABS20_ADR_DST,
+  BFD_RELOC_MSP430X_PCR16,
+  BFD_RELOC_MSP430X_PCR20_CALL,
+  BFD_RELOC_MSP430X_ABS16,
+  BFD_RELOC_MSP430_ABS_HI16,
+  BFD_RELOC_MSP430_PREL31,
+  BFD_RELOC_MSP430_SYM_DIFF,
+
+/* Relocations used by the Altera Nios II core.  */
+  BFD_RELOC_NIOS2_S16,
+  BFD_RELOC_NIOS2_U16,
+  BFD_RELOC_NIOS2_CALL26,
+  BFD_RELOC_NIOS2_IMM5,
+  BFD_RELOC_NIOS2_CACHE_OPX,
+  BFD_RELOC_NIOS2_IMM6,
+  BFD_RELOC_NIOS2_IMM8,
+  BFD_RELOC_NIOS2_HI16,
+  BFD_RELOC_NIOS2_LO16,
+  BFD_RELOC_NIOS2_HIADJ16,
+  BFD_RELOC_NIOS2_GPREL,
+  BFD_RELOC_NIOS2_UJMP,
+  BFD_RELOC_NIOS2_CJMP,
+  BFD_RELOC_NIOS2_CALLR,
+  BFD_RELOC_NIOS2_ALIGN,
+  BFD_RELOC_NIOS2_GOT16,
+  BFD_RELOC_NIOS2_CALL16,
+  BFD_RELOC_NIOS2_GOTOFF_LO,
+  BFD_RELOC_NIOS2_GOTOFF_HA,
+  BFD_RELOC_NIOS2_PCREL_LO,
+  BFD_RELOC_NIOS2_PCREL_HA,
+  BFD_RELOC_NIOS2_TLS_GD16,
+  BFD_RELOC_NIOS2_TLS_LDM16,
+  BFD_RELOC_NIOS2_TLS_LDO16,
+  BFD_RELOC_NIOS2_TLS_IE16,
+  BFD_RELOC_NIOS2_TLS_LE16,
+  BFD_RELOC_NIOS2_TLS_DTPMOD,
+  BFD_RELOC_NIOS2_TLS_DTPREL,
+  BFD_RELOC_NIOS2_TLS_TPREL,
+  BFD_RELOC_NIOS2_COPY,
+  BFD_RELOC_NIOS2_GLOB_DAT,
+  BFD_RELOC_NIOS2_JUMP_SLOT,
+  BFD_RELOC_NIOS2_RELATIVE,
+  BFD_RELOC_NIOS2_GOTOFF,
+  BFD_RELOC_NIOS2_CALL26_NOAT,
+  BFD_RELOC_NIOS2_GOT_LO,
+  BFD_RELOC_NIOS2_GOT_HA,
+  BFD_RELOC_NIOS2_CALL_LO,
+  BFD_RELOC_NIOS2_CALL_HA,
+  BFD_RELOC_NIOS2_R2_S12,
+  BFD_RELOC_NIOS2_R2_I10_1_PCREL,
+  BFD_RELOC_NIOS2_R2_T1I7_1_PCREL,
+  BFD_RELOC_NIOS2_R2_T1I7_2,
+  BFD_RELOC_NIOS2_R2_T2I4,
+  BFD_RELOC_NIOS2_R2_T2I4_1,
+  BFD_RELOC_NIOS2_R2_T2I4_2,
+  BFD_RELOC_NIOS2_R2_X1I7_2,
+  BFD_RELOC_NIOS2_R2_X2L5,
+  BFD_RELOC_NIOS2_R2_F1I5_2,
+  BFD_RELOC_NIOS2_R2_L5I4X1,
+  BFD_RELOC_NIOS2_R2_T1X1I6,
+  BFD_RELOC_NIOS2_R2_T1X1I6_2,
+
+/* IQ2000 Relocations.  */
+  BFD_RELOC_IQ2000_OFFSET_16,
+  BFD_RELOC_IQ2000_OFFSET_21,
+  BFD_RELOC_IQ2000_UHI16,
+
+/* Special Xtensa relocation used only by PLT entries in ELF shared
+objects to indicate that the runtime linker should set the value
+to one of its own internal functions or data structures.  */
+  BFD_RELOC_XTENSA_RTLD,
+
+/* Xtensa relocations for ELF shared objects.  */
+  BFD_RELOC_XTENSA_GLOB_DAT,
+  BFD_RELOC_XTENSA_JMP_SLOT,
+  BFD_RELOC_XTENSA_RELATIVE,
+
+/* Xtensa relocation used in ELF object files for symbols that may require
+PLT entries.  Otherwise, this is just a generic 32-bit relocation.  */
+  BFD_RELOC_XTENSA_PLT,
+
+/* Xtensa relocations to mark the difference of two local symbols.
+These are only needed to support linker relaxation and can be ignored
+when not relaxing.  The field is set to the value of the difference
+assuming no relaxation.  The relocation encodes the position of the
+first symbol so the linker can determine whether to adjust the field
+value.  */
+  BFD_RELOC_XTENSA_DIFF8,
+  BFD_RELOC_XTENSA_DIFF16,
+  BFD_RELOC_XTENSA_DIFF32,
+
+/* Generic Xtensa relocations for instruction operands.  Only the slot
+number is encoded in the relocation.  The relocation applies to the
+last PC-relative immediate operand, or if there are no PC-relative
+immediates, to the last immediate operand.  */
+  BFD_RELOC_XTENSA_SLOT0_OP,
+  BFD_RELOC_XTENSA_SLOT1_OP,
+  BFD_RELOC_XTENSA_SLOT2_OP,
+  BFD_RELOC_XTENSA_SLOT3_OP,
+  BFD_RELOC_XTENSA_SLOT4_OP,
+  BFD_RELOC_XTENSA_SLOT5_OP,
+  BFD_RELOC_XTENSA_SLOT6_OP,
+  BFD_RELOC_XTENSA_SLOT7_OP,
+  BFD_RELOC_XTENSA_SLOT8_OP,
+  BFD_RELOC_XTENSA_SLOT9_OP,
+  BFD_RELOC_XTENSA_SLOT10_OP,
+  BFD_RELOC_XTENSA_SLOT11_OP,
+  BFD_RELOC_XTENSA_SLOT12_OP,
+  BFD_RELOC_XTENSA_SLOT13_OP,
+  BFD_RELOC_XTENSA_SLOT14_OP,
+
+/* Alternate Xtensa relocations.  Only the slot is encoded in the
+relocation.  The meaning of these relocations is opcode-specific.  */
+  BFD_RELOC_XTENSA_SLOT0_ALT,
+  BFD_RELOC_XTENSA_SLOT1_ALT,
+  BFD_RELOC_XTENSA_SLOT2_ALT,
+  BFD_RELOC_XTENSA_SLOT3_ALT,
+  BFD_RELOC_XTENSA_SLOT4_ALT,
+  BFD_RELOC_XTENSA_SLOT5_ALT,
+  BFD_RELOC_XTENSA_SLOT6_ALT,
+  BFD_RELOC_XTENSA_SLOT7_ALT,
+  BFD_RELOC_XTENSA_SLOT8_ALT,
+  BFD_RELOC_XTENSA_SLOT9_ALT,
+  BFD_RELOC_XTENSA_SLOT10_ALT,
+  BFD_RELOC_XTENSA_SLOT11_ALT,
+  BFD_RELOC_XTENSA_SLOT12_ALT,
+  BFD_RELOC_XTENSA_SLOT13_ALT,
+  BFD_RELOC_XTENSA_SLOT14_ALT,
+
+/* Xtensa relocations for backward compatibility.  These have all been
+replaced by BFD_RELOC_XTENSA_SLOT0_OP.  */
+  BFD_RELOC_XTENSA_OP0,
+  BFD_RELOC_XTENSA_OP1,
+  BFD_RELOC_XTENSA_OP2,
+
+/* Xtensa relocation to mark that the assembler expanded the
+instructions from an original target.  The expansion size is
+encoded in the reloc size.  */
+  BFD_RELOC_XTENSA_ASM_EXPAND,
+
+/* Xtensa relocation to mark that the linker should simplify
+assembler-expanded instructions.  This is commonly used
+internally by the linker after analysis of a
+BFD_RELOC_XTENSA_ASM_EXPAND.  */
+  BFD_RELOC_XTENSA_ASM_SIMPLIFY,
+
+/* Xtensa TLS relocations.  */
+  BFD_RELOC_XTENSA_TLSDESC_FN,
+  BFD_RELOC_XTENSA_TLSDESC_ARG,
+  BFD_RELOC_XTENSA_TLS_DTPOFF,
+  BFD_RELOC_XTENSA_TLS_TPOFF,
+  BFD_RELOC_XTENSA_TLS_FUNC,
+  BFD_RELOC_XTENSA_TLS_ARG,
+  BFD_RELOC_XTENSA_TLS_CALL,
+
+/* 8 bit signed offset in (ix+d) or (iy+d).  */
+  BFD_RELOC_Z80_DISP8,
+
+/* DJNZ offset.  */
+  BFD_RELOC_Z8K_DISP7,
+
+/* CALR offset.  */
+  BFD_RELOC_Z8K_CALLR,
+
+/* 4 bit value.  */
+  BFD_RELOC_Z8K_IMM4L,
+
+/* Lattice Mico32 relocations.  */
+  BFD_RELOC_LM32_CALL,
+  BFD_RELOC_LM32_BRANCH,
+  BFD_RELOC_LM32_16_GOT,
+  BFD_RELOC_LM32_GOTOFF_HI16,
+  BFD_RELOC_LM32_GOTOFF_LO16,
+  BFD_RELOC_LM32_COPY,
+  BFD_RELOC_LM32_GLOB_DAT,
+  BFD_RELOC_LM32_JMP_SLOT,
+  BFD_RELOC_LM32_RELATIVE,
+
+/* Difference between two section addresses.  Must be followed by a
+BFD_RELOC_MACH_O_PAIR.  */
+  BFD_RELOC_MACH_O_SECTDIFF,
+
+/* Like BFD_RELOC_MACH_O_SECTDIFF but with a local symbol.  */
+  BFD_RELOC_MACH_O_LOCAL_SECTDIFF,
+
+/* Pair of relocation.  Contains the first symbol.  */
+  BFD_RELOC_MACH_O_PAIR,
+
+/* Symbol will be subtracted.  Must be followed by a BFD_RELOC_32.  */
+  BFD_RELOC_MACH_O_SUBTRACTOR32,
+
+/* Symbol will be subtracted.  Must be followed by a BFD_RELOC_64.  */
+  BFD_RELOC_MACH_O_SUBTRACTOR64,
+
+/* PCREL relocations.  They are marked as branch to create PLT entry if
+required.  */
+  BFD_RELOC_MACH_O_X86_64_BRANCH32,
+  BFD_RELOC_MACH_O_X86_64_BRANCH8,
+
+/* Used when referencing a GOT entry.  */
+  BFD_RELOC_MACH_O_X86_64_GOT,
+
+/* Used when loading a GOT entry with movq.  It is specially marked so that
+the linker could optimize the movq to a leaq if possible.  */
+  BFD_RELOC_MACH_O_X86_64_GOT_LOAD,
+
+/* Same as BFD_RELOC_32_PCREL but with an implicit -1 addend.  */
+  BFD_RELOC_MACH_O_X86_64_PCREL32_1,
+
+/* Same as BFD_RELOC_32_PCREL but with an implicit -2 addend.  */
+  BFD_RELOC_MACH_O_X86_64_PCREL32_2,
+
+/* Same as BFD_RELOC_32_PCREL but with an implicit -4 addend.  */
+  BFD_RELOC_MACH_O_X86_64_PCREL32_4,
+
+/* Addend for PAGE or PAGEOFF.  */
+  BFD_RELOC_MACH_O_ARM64_ADDEND,
+
+/* Relative offset to page of GOT slot.  */
+  BFD_RELOC_MACH_O_ARM64_GOT_LOAD_PAGE21,
+
+/* Relative offset within page of GOT slot.  */
+  BFD_RELOC_MACH_O_ARM64_GOT_LOAD_PAGEOFF12,
+
+/* Address of a GOT entry.  */
+  BFD_RELOC_MACH_O_ARM64_POINTER_TO_GOT,
+
+/* This is a 32 bit reloc for the microblaze that stores the
+low 16 bits of a value  */
+  BFD_RELOC_MICROBLAZE_32_LO,
+
+/* This is a 32 bit pc-relative reloc for the microblaze that
+stores the low 16 bits of a value  */
+  BFD_RELOC_MICROBLAZE_32_LO_PCREL,
+
+/* This is a 32 bit reloc for the microblaze that stores a
+value relative to the read-only small data area anchor  */
+  BFD_RELOC_MICROBLAZE_32_ROSDA,
+
+/* This is a 32 bit reloc for the microblaze that stores a
+value relative to the read-write small data area anchor  */
+  BFD_RELOC_MICROBLAZE_32_RWSDA,
+
+/* This is a 32 bit reloc for the microblaze to handle
+expressions of the form "Symbol Op Symbol"  */
+  BFD_RELOC_MICROBLAZE_32_SYM_OP_SYM,
+
+/* This is a 64 bit reloc that stores the 32 bit pc relative
+value in two words (with an imm instruction).  No relocation is
+done here - only used for relaxing  */
+  BFD_RELOC_MICROBLAZE_64_NONE,
+
+/* This is a 64 bit reloc that stores the 32 bit pc relative
+value in two words (with an imm instruction).  The relocation is
+PC-relative GOT offset  */
+  BFD_RELOC_MICROBLAZE_64_GOTPC,
+
+/* This is a 64 bit reloc that stores the 32 bit pc relative
+value in two words (with an imm instruction).  The relocation is
+GOT offset  */
+  BFD_RELOC_MICROBLAZE_64_GOT,
+
+/* This is a 64 bit reloc that stores the 32 bit pc relative
+value in two words (with an imm instruction).  The relocation is
+PC-relative offset into PLT  */
+  BFD_RELOC_MICROBLAZE_64_PLT,
+
+/* This is a 64 bit reloc that stores the 32 bit GOT relative
+value in two words (with an imm instruction).  The relocation is
+relative offset from _GLOBAL_OFFSET_TABLE_  */
+  BFD_RELOC_MICROBLAZE_64_GOTOFF,
+
+/* This is a 32 bit reloc that stores the 32 bit GOT relative
+value in a word.  The relocation is relative offset from _GLOBAL_OFFSET_TABLE_  */
+  BFD_RELOC_MICROBLAZE_32_GOTOFF,
+
+/* This is used to tell the dynamic linker to copy the value out of
+the dynamic object into the runtime process image.  */
+  BFD_RELOC_MICROBLAZE_COPY,
+
+/* Unused Reloc  */
+  BFD_RELOC_MICROBLAZE_64_TLS,
+
+/* This is a 64 bit reloc that stores the 32 bit GOT relative value
+of the GOT TLS GD info entry in two words (with an imm instruction). The
+relocation is GOT offset.  */
+  BFD_RELOC_MICROBLAZE_64_TLSGD,
+
+/* This is a 64 bit reloc that stores the 32 bit GOT relative value
+of the GOT TLS LD info entry in two words (with an imm instruction). The
+relocation is GOT offset.  */
+  BFD_RELOC_MICROBLAZE_64_TLSLD,
+
+/* This is a 32 bit reloc that stores the Module ID to GOT(n).  */
+  BFD_RELOC_MICROBLAZE_32_TLSDTPMOD,
+
+/* This is a 32 bit reloc that stores TLS offset to GOT(n+1).  */
+  BFD_RELOC_MICROBLAZE_32_TLSDTPREL,
+
+/* This is a 32 bit reloc for storing TLS offset to two words (uses imm
+instruction)  */
+  BFD_RELOC_MICROBLAZE_64_TLSDTPREL,
+
+/* This is a 64 bit reloc that stores 32-bit thread pointer relative offset
+to two words (uses imm instruction).  */
+  BFD_RELOC_MICROBLAZE_64_TLSGOTTPREL,
+
+/* This is a 64 bit reloc that stores 32-bit thread pointer relative offset
+to two words (uses imm instruction).  */
+  BFD_RELOC_MICROBLAZE_64_TLSTPREL,
+
+/* AArch64 pseudo relocation code to mark the start of the AArch64
+relocation enumerators.  N.B. the order of the enumerators is
+important as several tables in the AArch64 bfd backend are indexed
+by these enumerators; make sure they are all synced.  */
+  BFD_RELOC_AARCH64_RELOC_START,
+
+/* Deprecated AArch64 null relocation code.  */
+  BFD_RELOC_AARCH64_NULL,
+
+/* AArch64 null relocation code.  */
+  BFD_RELOC_AARCH64_NONE,
+
+/* Basic absolute relocations of N bits.  These are equivalent to
+BFD_RELOC_N and they were added to assist the indexing of the howto
+table.  */
+  BFD_RELOC_AARCH64_64,
+  BFD_RELOC_AARCH64_32,
+  BFD_RELOC_AARCH64_16,
+
+/* PC-relative relocations.  These are equivalent to BFD_RELOC_N_PCREL
+and they were added to assist the indexing of the howto table.  */
+  BFD_RELOC_AARCH64_64_PCREL,
+  BFD_RELOC_AARCH64_32_PCREL,
+  BFD_RELOC_AARCH64_16_PCREL,
+
+/* AArch64 MOV[NZK] instruction with most significant bits 0 to 15
+of an unsigned address/value.  */
+  BFD_RELOC_AARCH64_MOVW_G0,
+
+/* AArch64 MOV[NZK] instruction with less significant bits 0 to 15 of
+an address/value.  No overflow checking.  */
+  BFD_RELOC_AARCH64_MOVW_G0_NC,
+
+/* AArch64 MOV[NZK] instruction with most significant bits 16 to 31
+of an unsigned address/value.  */
+  BFD_RELOC_AARCH64_MOVW_G1,
+
+/* AArch64 MOV[NZK] instruction with less significant bits 16 to 31
+of an address/value.  No overflow checking.  */
+  BFD_RELOC_AARCH64_MOVW_G1_NC,
+
+/* AArch64 MOV[NZK] instruction with most significant bits 32 to 47
+of an unsigned address/value.  */
+  BFD_RELOC_AARCH64_MOVW_G2,
+
+/* AArch64 MOV[NZK] instruction with less significant bits 32 to 47
+of an address/value.  No overflow checking.  */
+  BFD_RELOC_AARCH64_MOVW_G2_NC,
+
+/* AArch64 MOV[NZK] instruction with most significant bits 48 to 63
+of a signed or unsigned address/value.  */
+  BFD_RELOC_AARCH64_MOVW_G3,
+
+/* AArch64 MOV[NZ] instruction with most significant bits 0 to 15
+of a signed value.  Changes instruction to MOVZ or MOVN depending on the
+value's sign.  */
+  BFD_RELOC_AARCH64_MOVW_G0_S,
+
+/* AArch64 MOV[NZ] instruction with most significant bits 16 to 31
+of a signed value.  Changes instruction to MOVZ or MOVN depending on the
+value's sign.  */
+  BFD_RELOC_AARCH64_MOVW_G1_S,
+
+/* AArch64 MOV[NZ] instruction with most significant bits 32 to 47
+of a signed value.  Changes instruction to MOVZ or MOVN depending on the
+value's sign.  */
+  BFD_RELOC_AARCH64_MOVW_G2_S,
+
+/* AArch64 Load Literal instruction, holding a 19 bit pc-relative word
+offset.  The lowest two bits must be zero and are not stored in the
+instruction, giving a 21 bit signed byte offset.  */
+  BFD_RELOC_AARCH64_LD_LO19_PCREL,
+
+/* AArch64 ADR instruction, holding a simple 21 bit pc-relative byte offset.  */
+  BFD_RELOC_AARCH64_ADR_LO21_PCREL,
+
+/* AArch64 ADRP instruction, with bits 12 to 32 of a pc-relative page
+offset, giving a 4KB aligned page base address.  */
+  BFD_RELOC_AARCH64_ADR_HI21_PCREL,
+
+/* AArch64 ADRP instruction, with bits 12 to 32 of a pc-relative page
+offset, giving a 4KB aligned page base address, but with no overflow
+checking.  */
+  BFD_RELOC_AARCH64_ADR_HI21_NC_PCREL,
+
+/* AArch64 ADD immediate instruction, holding bits 0 to 11 of the address.
+Used in conjunction with BFD_RELOC_AARCH64_ADR_HI21_PCREL.  */
+  BFD_RELOC_AARCH64_ADD_LO12,
+
+/* AArch64 8-bit load/store instruction, holding bits 0 to 11 of the
+address.  Used in conjunction with BFD_RELOC_AARCH64_ADR_HI21_PCREL.  */
+  BFD_RELOC_AARCH64_LDST8_LO12,
+
+/* AArch64 14 bit pc-relative test bit and branch.
+The lowest two bits must be zero and are not stored in the instruction,
+giving a 16 bit signed byte offset.  */
+  BFD_RELOC_AARCH64_TSTBR14,
+
+/* AArch64 19 bit pc-relative conditional branch and compare & branch.
+The lowest two bits must be zero and are not stored in the instruction,
+giving a 21 bit signed byte offset.  */
+  BFD_RELOC_AARCH64_BRANCH19,
+
+/* AArch64 26 bit pc-relative unconditional branch.
+The lowest two bits must be zero and are not stored in the instruction,
+giving a 28 bit signed byte offset.  */
+  BFD_RELOC_AARCH64_JUMP26,
+
+/* AArch64 26 bit pc-relative unconditional branch and link.
+The lowest two bits must be zero and are not stored in the instruction,
+giving a 28 bit signed byte offset.  */
+  BFD_RELOC_AARCH64_CALL26,
+
+/* AArch64 16-bit load/store instruction, holding bits 0 to 11 of the
+address.  Used in conjunction with BFD_RELOC_AARCH64_ADR_HI21_PCREL.  */
+  BFD_RELOC_AARCH64_LDST16_LO12,
+
+/* AArch64 32-bit load/store instruction, holding bits 0 to 11 of the
+address.  Used in conjunction with BFD_RELOC_AARCH64_ADR_HI21_PCREL.  */
+  BFD_RELOC_AARCH64_LDST32_LO12,
+
+/* AArch64 64-bit load/store instruction, holding bits 0 to 11 of the
+address.  Used in conjunction with BFD_RELOC_AARCH64_ADR_HI21_PCREL.  */
+  BFD_RELOC_AARCH64_LDST64_LO12,
+
+/* AArch64 128-bit load/store instruction, holding bits 0 to 11 of the
+address.  Used in conjunction with BFD_RELOC_AARCH64_ADR_HI21_PCREL.  */
+  BFD_RELOC_AARCH64_LDST128_LO12,
+
+/* AArch64 Load Literal instruction, holding a 19 bit PC relative word
+offset of the global offset table entry for a symbol.  The lowest two
+bits must be zero and are not stored in the instruction, giving a 21
+bit signed byte offset.  This relocation type requires signed overflow
+checking.  */
+  BFD_RELOC_AARCH64_GOT_LD_PREL19,
+
+/* Get to the page base of the global offset table entry for a symbol as
+part of an ADRP instruction using a 21 bit PC relative value.  Used in
+conjunction with BFD_RELOC_AARCH64_LD64_GOT_LO12_NC.  */
+  BFD_RELOC_AARCH64_ADR_GOT_PAGE,
+
+/* Unsigned 12 bit byte offset for 64 bit load/store from the page of
+the GOT entry for this symbol.  Used in conjunction with
+BFD_RELOC_AARCH64_ADR_GOT_PAGE.  Valid in LP64 ABI only.  */
+  BFD_RELOC_AARCH64_LD64_GOT_LO12_NC,
+
+/* Unsigned 12 bit byte offset for 32 bit load/store from the page of
+the GOT entry for this symbol.  Used in conjunction with
+BFD_RELOC_AARCH64_ADR_GOT_PAGE.  Valid in ILP32 ABI only.  */
+  BFD_RELOC_AARCH64_LD32_GOT_LO12_NC,
+
+/* Unsigned 16 bit byte offset for 64 bit load/store from the GOT entry
+for this symbol.  Valid in LP64 ABI only.  */
+  BFD_RELOC_AARCH64_MOVW_GOTOFF_G0_NC,
+
+/* Unsigned 16 bit byte higher offset for 64 bit load/store from the GOT entry
+for this symbol.  Valid in LP64 ABI only.  */
+  BFD_RELOC_AARCH64_MOVW_GOTOFF_G1,
+
+/* Unsigned 15 bit byte offset for 64 bit load/store from the page of
+the GOT entry for this symbol.  Valid in LP64 ABI only.  */
+  BFD_RELOC_AARCH64_LD64_GOTOFF_LO15,
+
+/* Scaled 14 bit byte offset to the page base of the global offset table.  */
+  BFD_RELOC_AARCH64_LD32_GOTPAGE_LO14,
+
+/* Scaled 15 bit byte offset to the page base of the global offset table.  */
+  BFD_RELOC_AARCH64_LD64_GOTPAGE_LO15,
+
+/* Get to the page base of the global offset table entry for a symbol's
+tls_index structure as part of an adrp instruction using a 21 bit PC
+relative value.  Used in conjunction with
+BFD_RELOC_AARCH64_TLSGD_ADD_LO12_NC.  */
+  BFD_RELOC_AARCH64_TLSGD_ADR_PAGE21,
+
+/* AArch64 TLS General Dynamic  */
+  BFD_RELOC_AARCH64_TLSGD_ADR_PREL21,
+
+/* Unsigned 12 bit byte offset to global offset table entry for a symbol's
+tls_index structure.  Used in conjunction with
+BFD_RELOC_AARCH64_TLSGD_ADR_PAGE21.  */
+  BFD_RELOC_AARCH64_TLSGD_ADD_LO12_NC,
+
+/* AArch64 TLS General Dynamic relocation.  */
+  BFD_RELOC_AARCH64_TLSGD_MOVW_G0_NC,
+
+/* AArch64 TLS General Dynamic relocation.  */
+  BFD_RELOC_AARCH64_TLSGD_MOVW_G1,
+
+/* AArch64 TLS INITIAL EXEC relocation.  */
+  BFD_RELOC_AARCH64_TLSIE_ADR_GOTTPREL_PAGE21,
+
+/* AArch64 TLS INITIAL EXEC relocation.  */
+  BFD_RELOC_AARCH64_TLSIE_LD64_GOTTPREL_LO12_NC,
+
+/* AArch64 TLS INITIAL EXEC relocation.  */
+  BFD_RELOC_AARCH64_TLSIE_LD32_GOTTPREL_LO12_NC,
+
+/* AArch64 TLS INITIAL EXEC relocation.  */
+  BFD_RELOC_AARCH64_TLSIE_LD_GOTTPREL_PREL19,
+
+/* AArch64 TLS INITIAL EXEC relocation.  */
+  BFD_RELOC_AARCH64_TLSIE_MOVW_GOTTPREL_G0_NC,
+
+/* AArch64 TLS INITIAL EXEC relocation.  */
+  BFD_RELOC_AARCH64_TLSIE_MOVW_GOTTPREL_G1,
+
+/* bit[23:12] of byte offset to module TLS base address.  */
+  BFD_RELOC_AARCH64_TLSLD_ADD_DTPREL_HI12,
+
+/* Unsigned 12 bit byte offset to module TLS base address.  */
+  BFD_RELOC_AARCH64_TLSLD_ADD_DTPREL_LO12,
+
+/* No overflow check version of BFD_RELOC_AARCH64_TLSLD_ADD_DTPREL_LO12.  */
+  BFD_RELOC_AARCH64_TLSLD_ADD_DTPREL_LO12_NC,
+
+/* Unsigned 12 bit byte offset to global offset table entry for a symbol's
+tls_index structure.  Used in conjunction with
+BFD_RELOC_AARCH64_TLSLD_ADR_PAGE21.  */
+  BFD_RELOC_AARCH64_TLSLD_ADD_LO12_NC,
+
+/* GOT entry page address for AArch64 TLS Local Dynamic, used with ADRP
+instruction.  */
+  BFD_RELOC_AARCH64_TLSLD_ADR_PAGE21,
+
+/* GOT entry address for AArch64 TLS Local Dynamic, used with ADR instruction.  */
+  BFD_RELOC_AARCH64_TLSLD_ADR_PREL21,
+
+/* bit[11:1] of byte offset to module TLS base address, encoded in ldst
+instructions.  */
+  BFD_RELOC_AARCH64_TLSLD_LDST16_DTPREL_LO12,
+
+/* Similar to BFD_RELOC_AARCH64_TLSLD_LDST16_DTPREL_LO12, but no overflow check.  */
+  BFD_RELOC_AARCH64_TLSLD_LDST16_DTPREL_LO12_NC,
+
+/* bit[11:2] of byte offset to module TLS base address, encoded in ldst
+instructions.  */
+  BFD_RELOC_AARCH64_TLSLD_LDST32_DTPREL_LO12,
+
+/* Similar to BFD_RELOC_AARCH64_TLSLD_LDST32_DTPREL_LO12, but no overflow check.  */
+  BFD_RELOC_AARCH64_TLSLD_LDST32_DTPREL_LO12_NC,
+
+/* bit[11:3] of byte offset to module TLS base address, encoded in ldst
+instructions.  */
+  BFD_RELOC_AARCH64_TLSLD_LDST64_DTPREL_LO12,
+
+/* Similar to BFD_RELOC_AARCH64_TLSLD_LDST64_DTPREL_LO12, but no overflow check.  */
+  BFD_RELOC_AARCH64_TLSLD_LDST64_DTPREL_LO12_NC,
+
+/* bit[11:0] of byte offset to module TLS base address, encoded in ldst
+instructions.  */
+  BFD_RELOC_AARCH64_TLSLD_LDST8_DTPREL_LO12,
+
+/* Similar to BFD_RELOC_AARCH64_TLSLD_LDST8_DTPREL_LO12, but no overflow check.  */
+  BFD_RELOC_AARCH64_TLSLD_LDST8_DTPREL_LO12_NC,
+
+/* bit[15:0] of byte offset to module TLS base address.  */
+  BFD_RELOC_AARCH64_TLSLD_MOVW_DTPREL_G0,
+
+/* No overflow check version of BFD_RELOC_AARCH64_TLSLD_MOVW_DTPREL_G0  */
+  BFD_RELOC_AARCH64_TLSLD_MOVW_DTPREL_G0_NC,
+
+/* bit[31:16] of byte offset to module TLS base address.  */
+  BFD_RELOC_AARCH64_TLSLD_MOVW_DTPREL_G1,
+
+/* No overflow check version of BFD_RELOC_AARCH64_TLSLD_MOVW_DTPREL_G1  */
+  BFD_RELOC_AARCH64_TLSLD_MOVW_DTPREL_G1_NC,
+
+/* bit[47:32] of byte offset to module TLS base address.  */
+  BFD_RELOC_AARCH64_TLSLD_MOVW_DTPREL_G2,
+
+/* AArch64 TLS LOCAL EXEC relocation.  */
+  BFD_RELOC_AARCH64_TLSLE_MOVW_TPREL_G2,
+
+/* AArch64 TLS LOCAL EXEC relocation.  */
+  BFD_RELOC_AARCH64_TLSLE_MOVW_TPREL_G1,
+
+/* AArch64 TLS LOCAL EXEC relocation.  */
+  BFD_RELOC_AARCH64_TLSLE_MOVW_TPREL_G1_NC,
+
+/* AArch64 TLS LOCAL EXEC relocation.  */
+  BFD_RELOC_AARCH64_TLSLE_MOVW_TPREL_G0,
+
+/* AArch64 TLS LOCAL EXEC relocation.  */
+  BFD_RELOC_AARCH64_TLSLE_MOVW_TPREL_G0_NC,
+
+/* AArch64 TLS LOCAL EXEC relocation.  */
+  BFD_RELOC_AARCH64_TLSLE_ADD_TPREL_HI12,
+
+/* AArch64 TLS LOCAL EXEC relocation.  */
+  BFD_RELOC_AARCH64_TLSLE_ADD_TPREL_LO12,
+
+/* AArch64 TLS LOCAL EXEC relocation.  */
+  BFD_RELOC_AARCH64_TLSLE_ADD_TPREL_LO12_NC,
+
+/* AArch64 TLS DESC relocation.  */
+  BFD_RELOC_AARCH64_TLSDESC_LD_PREL19,
+
+/* AArch64 TLS DESC relocation.  */
+  BFD_RELOC_AARCH64_TLSDESC_ADR_PREL21,
+
+/* AArch64 TLS DESC relocation.  */
+  BFD_RELOC_AARCH64_TLSDESC_ADR_PAGE21,
+
+/* AArch64 TLS DESC relocation.  */
+  BFD_RELOC_AARCH64_TLSDESC_LD64_LO12_NC,
+
+/* AArch64 TLS DESC relocation.  */
+  BFD_RELOC_AARCH64_TLSDESC_LD32_LO12_NC,
+
+/* AArch64 TLS DESC relocation.  */
+  BFD_RELOC_AARCH64_TLSDESC_ADD_LO12_NC,
+
+/* AArch64 TLS DESC relocation.  */
+  BFD_RELOC_AARCH64_TLSDESC_OFF_G1,
+
+/* AArch64 TLS DESC relocation.  */
+  BFD_RELOC_AARCH64_TLSDESC_OFF_G0_NC,
+
+/* AArch64 TLS DESC relocation.  */
+  BFD_RELOC_AARCH64_TLSDESC_LDR,
+
+/* AArch64 TLS DESC relocation.  */
+  BFD_RELOC_AARCH64_TLSDESC_ADD,
+
+/* AArch64 TLS DESC relocation.  */
+  BFD_RELOC_AARCH64_TLSDESC_CALL,
+
+/* AArch64 dynamic relocation (copy relocation; not TLS-specific).  */
+  BFD_RELOC_AARCH64_COPY,
+
+/* AArch64 dynamic relocation (GOT entry; not TLS-specific).  */
+  BFD_RELOC_AARCH64_GLOB_DAT,
+
+/* AArch64 dynamic relocation (PLT jump slot; not TLS-specific).  */
+  BFD_RELOC_AARCH64_JUMP_SLOT,
+
+/* AArch64 dynamic relocation (load-address relative; not TLS-specific).  */
+  BFD_RELOC_AARCH64_RELATIVE,
+
+/* AArch64 TLS relocation.  */
+  BFD_RELOC_AARCH64_TLS_DTPMOD,
+
+/* AArch64 TLS relocation.  */
+  BFD_RELOC_AARCH64_TLS_DTPREL,
+
+/* AArch64 TLS relocation.  */
+  BFD_RELOC_AARCH64_TLS_TPREL,
+
+/* AArch64 TLS relocation.  */
+  BFD_RELOC_AARCH64_TLSDESC,
+
+/* AArch64 support for STT_GNU_IFUNC.  */
+  BFD_RELOC_AARCH64_IRELATIVE,
+
+/* AArch64 pseudo relocation code to mark the end of the AArch64
+relocation enumerators that have direct mapping to ELF reloc codes.
+There are a few more enumerators after this one; those are mainly
+used by the AArch64 assembler for the internal fixup or to select
+one of the above enumerators.  */
+  BFD_RELOC_AARCH64_RELOC_END,
+
+/* AArch64 pseudo relocation code to be used internally by the AArch64
+assembler and not (currently) written to any object files.  */
+  BFD_RELOC_AARCH64_GAS_INTERNAL_FIXUP,
+
+/* AArch64 unspecified load/store instruction, holding bits 0 to 11 of the
+address.  Used in conjunction with BFD_RELOC_AARCH64_ADR_HI21_PCREL.  */
+  BFD_RELOC_AARCH64_LDST_LO12,
+
+/* AArch64 pseudo relocation code for TLS local dynamic mode.  It's to be
+used internally by the AArch64 assembler and not (currently) written to
+any object files.  */
+  BFD_RELOC_AARCH64_TLSLD_LDST_DTPREL_LO12,
+
+/* Similar to BFD_RELOC_AARCH64_TLSLD_LDST_DTPREL_LO12, but no overflow check.  */
+  BFD_RELOC_AARCH64_TLSLD_LDST_DTPREL_LO12_NC,
+
+/* AArch64 pseudo relocation code to be used internally by the AArch64
+assembler and not (currently) written to any object files.  */
+  BFD_RELOC_AARCH64_LD_GOT_LO12_NC,
+
+/* AArch64 pseudo relocation code to be used internally by the AArch64
+assembler and not (currently) written to any object files.  */
+  BFD_RELOC_AARCH64_TLSIE_LD_GOTTPREL_LO12_NC,
+
+/* AArch64 pseudo relocation code to be used internally by the AArch64
+assembler and not (currently) written to any object files.  */
+  BFD_RELOC_AARCH64_TLSDESC_LD_LO12_NC,
+
+/* Tilera TILEPro Relocations.  */
+  BFD_RELOC_TILEPRO_COPY,
+  BFD_RELOC_TILEPRO_GLOB_DAT,
+  BFD_RELOC_TILEPRO_JMP_SLOT,
+  BFD_RELOC_TILEPRO_RELATIVE,
+  BFD_RELOC_TILEPRO_BROFF_X1,
+  BFD_RELOC_TILEPRO_JOFFLONG_X1,
+  BFD_RELOC_TILEPRO_JOFFLONG_X1_PLT,
+  BFD_RELOC_TILEPRO_IMM8_X0,
+  BFD_RELOC_TILEPRO_IMM8_Y0,
+  BFD_RELOC_TILEPRO_IMM8_X1,
+  BFD_RELOC_TILEPRO_IMM8_Y1,
+  BFD_RELOC_TILEPRO_DEST_IMM8_X1,
+  BFD_RELOC_TILEPRO_MT_IMM15_X1,
+  BFD_RELOC_TILEPRO_MF_IMM15_X1,
+  BFD_RELOC_TILEPRO_IMM16_X0,
+  BFD_RELOC_TILEPRO_IMM16_X1,
+  BFD_RELOC_TILEPRO_IMM16_X0_LO,
+  BFD_RELOC_TILEPRO_IMM16_X1_LO,
+  BFD_RELOC_TILEPRO_IMM16_X0_HI,
+  BFD_RELOC_TILEPRO_IMM16_X1_HI,
+  BFD_RELOC_TILEPRO_IMM16_X0_HA,
+  BFD_RELOC_TILEPRO_IMM16_X1_HA,
+  BFD_RELOC_TILEPRO_IMM16_X0_PCREL,
+  BFD_RELOC_TILEPRO_IMM16_X1_PCREL,
+  BFD_RELOC_TILEPRO_IMM16_X0_LO_PCREL,
+  BFD_RELOC_TILEPRO_IMM16_X1_LO_PCREL,
+  BFD_RELOC_TILEPRO_IMM16_X0_HI_PCREL,
+  BFD_RELOC_TILEPRO_IMM16_X1_HI_PCREL,
+  BFD_RELOC_TILEPRO_IMM16_X0_HA_PCREL,
+  BFD_RELOC_TILEPRO_IMM16_X1_HA_PCREL,
+  BFD_RELOC_TILEPRO_IMM16_X0_GOT,
+  BFD_RELOC_TILEPRO_IMM16_X1_GOT,
+  BFD_RELOC_TILEPRO_IMM16_X0_GOT_LO,
+  BFD_RELOC_TILEPRO_IMM16_X1_GOT_LO,
+  BFD_RELOC_TILEPRO_IMM16_X0_GOT_HI,
+  BFD_RELOC_TILEPRO_IMM16_X1_GOT_HI,
+  BFD_RELOC_TILEPRO_IMM16_X0_GOT_HA,
+  BFD_RELOC_TILEPRO_IMM16_X1_GOT_HA,
+  BFD_RELOC_TILEPRO_MMSTART_X0,
+  BFD_RELOC_TILEPRO_MMEND_X0,
+  BFD_RELOC_TILEPRO_MMSTART_X1,
+  BFD_RELOC_TILEPRO_MMEND_X1,
+  BFD_RELOC_TILEPRO_SHAMT_X0,
+  BFD_RELOC_TILEPRO_SHAMT_X1,
+  BFD_RELOC_TILEPRO_SHAMT_Y0,
+  BFD_RELOC_TILEPRO_SHAMT_Y1,
+  BFD_RELOC_TILEPRO_TLS_GD_CALL,
+  BFD_RELOC_TILEPRO_IMM8_X0_TLS_GD_ADD,
+  BFD_RELOC_TILEPRO_IMM8_X1_TLS_GD_ADD,
+  BFD_RELOC_TILEPRO_IMM8_Y0_TLS_GD_ADD,
+  BFD_RELOC_TILEPRO_IMM8_Y1_TLS_GD_ADD,
+  BFD_RELOC_TILEPRO_TLS_IE_LOAD,
+  BFD_RELOC_TILEPRO_IMM16_X0_TLS_GD,
+  BFD_RELOC_TILEPRO_IMM16_X1_TLS_GD,
+  BFD_RELOC_TILEPRO_IMM16_X0_TLS_GD_LO,
+  BFD_RELOC_TILEPRO_IMM16_X1_TLS_GD_LO,
+  BFD_RELOC_TILEPRO_IMM16_X0_TLS_GD_HI,
+  BFD_RELOC_TILEPRO_IMM16_X1_TLS_GD_HI,
+  BFD_RELOC_TILEPRO_IMM16_X0_TLS_GD_HA,
+  BFD_RELOC_TILEPRO_IMM16_X1_TLS_GD_HA,
+  BFD_RELOC_TILEPRO_IMM16_X0_TLS_IE,
+  BFD_RELOC_TILEPRO_IMM16_X1_TLS_IE,
+  BFD_RELOC_TILEPRO_IMM16_X0_TLS_IE_LO,
+  BFD_RELOC_TILEPRO_IMM16_X1_TLS_IE_LO,
+  BFD_RELOC_TILEPRO_IMM16_X0_TLS_IE_HI,
+  BFD_RELOC_TILEPRO_IMM16_X1_TLS_IE_HI,
+  BFD_RELOC_TILEPRO_IMM16_X0_TLS_IE_HA,
+  BFD_RELOC_TILEPRO_IMM16_X1_TLS_IE_HA,
+  BFD_RELOC_TILEPRO_TLS_DTPMOD32,
+  BFD_RELOC_TILEPRO_TLS_DTPOFF32,
+  BFD_RELOC_TILEPRO_TLS_TPOFF32,
+  BFD_RELOC_TILEPRO_IMM16_X0_TLS_LE,
+  BFD_RELOC_TILEPRO_IMM16_X1_TLS_LE,
+  BFD_RELOC_TILEPRO_IMM16_X0_TLS_LE_LO,
+  BFD_RELOC_TILEPRO_IMM16_X1_TLS_LE_LO,
+  BFD_RELOC_TILEPRO_IMM16_X0_TLS_LE_HI,
+  BFD_RELOC_TILEPRO_IMM16_X1_TLS_LE_HI,
+  BFD_RELOC_TILEPRO_IMM16_X0_TLS_LE_HA,
+  BFD_RELOC_TILEPRO_IMM16_X1_TLS_LE_HA,
+
+/* Tilera TILE-Gx Relocations.  */
+  BFD_RELOC_TILEGX_HW0,
+  BFD_RELOC_TILEGX_HW1,
+  BFD_RELOC_TILEGX_HW2,
+  BFD_RELOC_TILEGX_HW3,
+  BFD_RELOC_TILEGX_HW0_LAST,
+  BFD_RELOC_TILEGX_HW1_LAST,
+  BFD_RELOC_TILEGX_HW2_LAST,
+  BFD_RELOC_TILEGX_COPY,
+  BFD_RELOC_TILEGX_GLOB_DAT,
+  BFD_RELOC_TILEGX_JMP_SLOT,
+  BFD_RELOC_TILEGX_RELATIVE,
+  BFD_RELOC_TILEGX_BROFF_X1,
+  BFD_RELOC_TILEGX_JUMPOFF_X1,
+  BFD_RELOC_TILEGX_JUMPOFF_X1_PLT,
+  BFD_RELOC_TILEGX_IMM8_X0,
+  BFD_RELOC_TILEGX_IMM8_Y0,
+  BFD_RELOC_TILEGX_IMM8_X1,
+  BFD_RELOC_TILEGX_IMM8_Y1,
+  BFD_RELOC_TILEGX_DEST_IMM8_X1,
+  BFD_RELOC_TILEGX_MT_IMM14_X1,
+  BFD_RELOC_TILEGX_MF_IMM14_X1,
+  BFD_RELOC_TILEGX_MMSTART_X0,
+  BFD_RELOC_TILEGX_MMEND_X0,
+  BFD_RELOC_TILEGX_SHAMT_X0,
+  BFD_RELOC_TILEGX_SHAMT_X1,
+  BFD_RELOC_TILEGX_SHAMT_Y0,
+  BFD_RELOC_TILEGX_SHAMT_Y1,
+  BFD_RELOC_TILEGX_IMM16_X0_HW0,
+  BFD_RELOC_TILEGX_IMM16_X1_HW0,
+  BFD_RELOC_TILEGX_IMM16_X0_HW1,
+  BFD_RELOC_TILEGX_IMM16_X1_HW1,
+  BFD_RELOC_TILEGX_IMM16_X0_HW2,
+  BFD_RELOC_TILEGX_IMM16_X1_HW2,
+  BFD_RELOC_TILEGX_IMM16_X0_HW3,
+  BFD_RELOC_TILEGX_IMM16_X1_HW3,
+  BFD_RELOC_TILEGX_IMM16_X0_HW0_LAST,
+  BFD_RELOC_TILEGX_IMM16_X1_HW0_LAST,
+  BFD_RELOC_TILEGX_IMM16_X0_HW1_LAST,
+  BFD_RELOC_TILEGX_IMM16_X1_HW1_LAST,
+  BFD_RELOC_TILEGX_IMM16_X0_HW2_LAST,
+  BFD_RELOC_TILEGX_IMM16_X1_HW2_LAST,
+  BFD_RELOC_TILEGX_IMM16_X0_HW0_PCREL,
+  BFD_RELOC_TILEGX_IMM16_X1_HW0_PCREL,
+  BFD_RELOC_TILEGX_IMM16_X0_HW1_PCREL,
+  BFD_RELOC_TILEGX_IMM16_X1_HW1_PCREL,
+  BFD_RELOC_TILEGX_IMM16_X0_HW2_PCREL,
+  BFD_RELOC_TILEGX_IMM16_X1_HW2_PCREL,
+  BFD_RELOC_TILEGX_IMM16_X0_HW3_PCREL,
+  BFD_RELOC_TILEGX_IMM16_X1_HW3_PCREL,
+  BFD_RELOC_TILEGX_IMM16_X0_HW0_LAST_PCREL,
+  BFD_RELOC_TILEGX_IMM16_X1_HW0_LAST_PCREL,
+  BFD_RELOC_TILEGX_IMM16_X0_HW1_LAST_PCREL,
+  BFD_RELOC_TILEGX_IMM16_X1_HW1_LAST_PCREL,
+  BFD_RELOC_TILEGX_IMM16_X0_HW2_LAST_PCREL,
+  BFD_RELOC_TILEGX_IMM16_X1_HW2_LAST_PCREL,
+  BFD_RELOC_TILEGX_IMM16_X0_HW0_GOT,
+  BFD_RELOC_TILEGX_IMM16_X1_HW0_GOT,
+  BFD_RELOC_TILEGX_IMM16_X0_HW0_PLT_PCREL,
+  BFD_RELOC_TILEGX_IMM16_X1_HW0_PLT_PCREL,
+  BFD_RELOC_TILEGX_IMM16_X0_HW1_PLT_PCREL,
+  BFD_RELOC_TILEGX_IMM16_X1_HW1_PLT_PCREL,
+  BFD_RELOC_TILEGX_IMM16_X0_HW2_PLT_PCREL,
+  BFD_RELOC_TILEGX_IMM16_X1_HW2_PLT_PCREL,
+  BFD_RELOC_TILEGX_IMM16_X0_HW0_LAST_GOT,
+  BFD_RELOC_TILEGX_IMM16_X1_HW0_LAST_GOT,
+  BFD_RELOC_TILEGX_IMM16_X0_HW1_LAST_GOT,
+  BFD_RELOC_TILEGX_IMM16_X1_HW1_LAST_GOT,
+  BFD_RELOC_TILEGX_IMM16_X0_HW3_PLT_PCREL,
+  BFD_RELOC_TILEGX_IMM16_X1_HW3_PLT_PCREL,
+  BFD_RELOC_TILEGX_IMM16_X0_HW0_TLS_GD,
+  BFD_RELOC_TILEGX_IMM16_X1_HW0_TLS_GD,
+  BFD_RELOC_TILEGX_IMM16_X0_HW0_TLS_LE,
+  BFD_RELOC_TILEGX_IMM16_X1_HW0_TLS_LE,
+  BFD_RELOC_TILEGX_IMM16_X0_HW0_LAST_TLS_LE,
+  BFD_RELOC_TILEGX_IMM16_X1_HW0_LAST_TLS_LE,
+  BFD_RELOC_TILEGX_IMM16_X0_HW1_LAST_TLS_LE,
+  BFD_RELOC_TILEGX_IMM16_X1_HW1_LAST_TLS_LE,
+  BFD_RELOC_TILEGX_IMM16_X0_HW0_LAST_TLS_GD,
+  BFD_RELOC_TILEGX_IMM16_X1_HW0_LAST_TLS_GD,
+  BFD_RELOC_TILEGX_IMM16_X0_HW1_LAST_TLS_GD,
+  BFD_RELOC_TILEGX_IMM16_X1_HW1_LAST_TLS_GD,
+  BFD_RELOC_TILEGX_IMM16_X0_HW0_TLS_IE,
+  BFD_RELOC_TILEGX_IMM16_X1_HW0_TLS_IE,
+  BFD_RELOC_TILEGX_IMM16_X0_HW0_LAST_PLT_PCREL,
+  BFD_RELOC_TILEGX_IMM16_X1_HW0_LAST_PLT_PCREL,
+  BFD_RELOC_TILEGX_IMM16_X0_HW1_LAST_PLT_PCREL,
+  BFD_RELOC_TILEGX_IMM16_X1_HW1_LAST_PLT_PCREL,
+  BFD_RELOC_TILEGX_IMM16_X0_HW2_LAST_PLT_PCREL,
+  BFD_RELOC_TILEGX_IMM16_X1_HW2_LAST_PLT_PCREL,
+  BFD_RELOC_TILEGX_IMM16_X0_HW0_LAST_TLS_IE,
+  BFD_RELOC_TILEGX_IMM16_X1_HW0_LAST_TLS_IE,
+  BFD_RELOC_TILEGX_IMM16_X0_HW1_LAST_TLS_IE,
+  BFD_RELOC_TILEGX_IMM16_X1_HW1_LAST_TLS_IE,
+  BFD_RELOC_TILEGX_TLS_DTPMOD64,
+  BFD_RELOC_TILEGX_TLS_DTPOFF64,
+  BFD_RELOC_TILEGX_TLS_TPOFF64,
+  BFD_RELOC_TILEGX_TLS_DTPMOD32,
+  BFD_RELOC_TILEGX_TLS_DTPOFF32,
+  BFD_RELOC_TILEGX_TLS_TPOFF32,
+  BFD_RELOC_TILEGX_TLS_GD_CALL,
+  BFD_RELOC_TILEGX_IMM8_X0_TLS_GD_ADD,
+  BFD_RELOC_TILEGX_IMM8_X1_TLS_GD_ADD,
+  BFD_RELOC_TILEGX_IMM8_Y0_TLS_GD_ADD,
+  BFD_RELOC_TILEGX_IMM8_Y1_TLS_GD_ADD,
+  BFD_RELOC_TILEGX_TLS_IE_LOAD,
+  BFD_RELOC_TILEGX_IMM8_X0_TLS_ADD,
+  BFD_RELOC_TILEGX_IMM8_X1_TLS_ADD,
+  BFD_RELOC_TILEGX_IMM8_Y0_TLS_ADD,
+  BFD_RELOC_TILEGX_IMM8_Y1_TLS_ADD,
+
+/* Adapteva EPIPHANY - 8 bit signed pc-relative displacement  */
+  BFD_RELOC_EPIPHANY_SIMM8,
+
+/* Adapteva EPIPHANY - 24 bit signed pc-relative displacement  */
+  BFD_RELOC_EPIPHANY_SIMM24,
+
+/* Adapteva EPIPHANY - 16 most-significant bits of absolute address  */
+  BFD_RELOC_EPIPHANY_HIGH,
+
+/* Adapteva EPIPHANY - 16 least-significant bits of absolute address  */
+  BFD_RELOC_EPIPHANY_LOW,
+
+/* Adapteva EPIPHANY - 11 bit signed number - add/sub immediate  */
+  BFD_RELOC_EPIPHANY_SIMM11,
+
+/* Adapteva EPIPHANY - 11 bit sign-magnitude number (ld/st displacement)  */
+  BFD_RELOC_EPIPHANY_IMM11,
+
+/* Adapteva EPIPHANY - 8 bit immediate for 16 bit mov instruction.  */
+  BFD_RELOC_EPIPHANY_IMM8,
+
+/* Visium Relocations.  */
+  BFD_RELOC_VISIUM_HI16,
+  BFD_RELOC_VISIUM_LO16,
+  BFD_RELOC_VISIUM_IM16,
+  BFD_RELOC_VISIUM_REL16,
+  BFD_RELOC_VISIUM_HI16_PCREL,
+  BFD_RELOC_VISIUM_LO16_PCREL,
+  BFD_RELOC_VISIUM_IM16_PCREL,
+  BFD_RELOC_UNUSED };
+
+typedef enum bfd_reloc_code_real bfd_reloc_code_real_type;  /* Short alias for the reloc code enum.  */
+reloc_howto_type *bfd_reloc_type_lookup   /* Howto lookup keyed by reloc code.  */
+   (bfd *abfd, bfd_reloc_code_real_type code);
+reloc_howto_type *bfd_reloc_name_lookup   /* Howto lookup keyed by reloc name string.  */
+   (bfd *abfd, const char *reloc_name);
+/* Presumably maps CODE to a printable name -- TODO confirm against the definition.  */
+const char *bfd_get_reloc_code_name (bfd_reloc_code_real_type code);
+
+/* Extracted from syms.c.  */
+
+typedef struct bfd_symbol  /* A single symbol record; typedef'd to `asymbol' below.  */
+{
+  /* A pointer to the BFD which owns the symbol. This information
+     is necessary so that a back end can work out what additional
+     information (invisible to the application writer) is carried
+     with the symbol.
+
+     This field is *almost* redundant, since you can use section->owner
+     instead, except that some symbols point to the global sections
+     bfd_{abs,com,und}_section.  This could be fixed by making
+     these globals be per-bfd (or per-target-flavor).  FIXME.  */
+  struct bfd *the_bfd; /* Use bfd_asymbol_bfd(sym) to access this field.  */
+
+  /* The text of the symbol. The name is left alone, and not copied; the
+     application may not alter it.  */
+  const char *name;
+
+  /* The value of the symbol.  This really should be a union of a
+     numeric value with a pointer, since some flags indicate that
+     a pointer to another symbol is stored here.  */
+  symvalue value;
+
+  /* Attributes of a symbol.  */
+#define BSF_NO_FLAGS           0x00
+
+  /* The symbol has local scope; <<static>> in <<C>>. The value
+     is the offset into the section of the data.  */
+#define BSF_LOCAL              (1 << 0)
+
+  /* The symbol has global scope; initialized data in <<C>>. The
+     value is the offset into the section of the data.  */
+#define BSF_GLOBAL             (1 << 1)
+
+  /* The symbol has global scope and is exported. The value is
+     the offset into the section of the data.  */
+#define BSF_EXPORT     BSF_GLOBAL /* No real difference.  */
+
+  /* A normal C symbol would be one of:
+     <<BSF_LOCAL>>, <<BSF_UNDEFINED>> or <<BSF_GLOBAL>>.  */
+
+  /* The symbol is a debugging record. The value has an arbitrary
+     meaning, unless BSF_DEBUGGING_RELOC is also set.  */
+#define BSF_DEBUGGING          (1 << 2)
+
+  /* The symbol denotes a function entry point.  Used in ELF,
+     perhaps others someday.  */
+#define BSF_FUNCTION           (1 << 3)
+
+  /* Used by the linker.  Note that bit 4 is not assigned in this list.  */
+#define BSF_KEEP               (1 << 5)
+
+  /* An ELF common symbol.  */
+#define BSF_ELF_COMMON         (1 << 6)
+
+  /* A weak global symbol, overridable without warnings by
+     a regular global symbol of the same name.  */
+#define BSF_WEAK               (1 << 7)
+
+  /* This symbol was created to point to a section, e.g. ELF's
+     STT_SECTION symbols.  */
+#define BSF_SECTION_SYM        (1 << 8)
+
+  /* The symbol used to be a common symbol, but now it is
+     allocated.  */
+#define BSF_OLD_COMMON         (1 << 9)
+
+  /* In some files the type of a symbol sometimes alters its
+     location in an output file - ie in coff a <<ISFCN>> symbol
+     which is also <<C_EXT>> symbol appears where it was
+     declared and not at the end of a section.  This bit is set
+     by the target BFD part to convey this information.  */
+#define BSF_NOT_AT_END         (1 << 10)
+
+  /* Signal that the symbol is the label of constructor section.  */
+#define BSF_CONSTRUCTOR        (1 << 11)
+
+  /* Signal that the symbol is a warning symbol.  The name is a
+     warning.  The name of the next symbol is the one to warn about;
+     if a reference is made to a symbol with the same name as the next
+     symbol, a warning is issued by the linker.  */
+#define BSF_WARNING            (1 << 12)
+
+  /* Signal that the symbol is indirect.  This symbol is an indirect
+     pointer to the symbol with the same name as the next symbol.  */
+#define BSF_INDIRECT           (1 << 13)
+
+  /* BSF_FILE marks symbols that contain a file name.  This is used
+     for ELF STT_FILE symbols.  */
+#define BSF_FILE               (1 << 14)
+
+  /* Symbol is from dynamic linking information.  */
+#define BSF_DYNAMIC            (1 << 15)
+
+  /* The symbol denotes a data object.  Used in ELF, and perhaps
+     others someday.  */
+#define BSF_OBJECT             (1 << 16)
+
+  /* This symbol is a debugging symbol.  The value is the offset
+     into the section of the data.  BSF_DEBUGGING should be set
+     as well.  */
+#define BSF_DEBUGGING_RELOC    (1 << 17)
+
+  /* This symbol is thread local.  Used in ELF.  */
+#define BSF_THREAD_LOCAL       (1 << 18)
+
+  /* This symbol represents a complex relocation expression,
+     with the expression tree serialized in the symbol name.  */
+#define BSF_RELC               (1 << 19)
+
+  /* This symbol represents a signed complex relocation expression,
+     with the expression tree serialized in the symbol name.  */
+#define BSF_SRELC              (1 << 20)
+
+  /* This symbol was created by bfd_get_synthetic_symtab.  */
+#define BSF_SYNTHETIC          (1 << 21)
+
+  /* This symbol is an indirect code object.  Unrelated to BSF_INDIRECT.
+     The dynamic linker will compute the value of this symbol by
+     calling the function that it points to.  BSF_FUNCTION must
+     also be set.  */
+#define BSF_GNU_INDIRECT_FUNCTION (1 << 22)
+  /* This symbol is a globally unique data object.  The dynamic linker
+     will make sure that in the entire process there is just one symbol
+     with this name and type in use.  BSF_OBJECT must also be set.  */
+#define BSF_GNU_UNIQUE         (1 << 23)
+
+  flagword flags;  /* Combination of the BSF_* bits defined above.  */
+
+  /* A pointer to the section to which this symbol is
+     relative.  This will always be non-NULL; there are special
+     sections for undefined and absolute symbols.  */
+  struct bfd_section *section;
+
+  /* Back end special data.  */
+  union
+    {
+      void *p;
+      bfd_vma i;
+    }
+  udata;
+}
+asymbol;
+/* Symbol-table entry points.  Every function-like macro here expands to BFD_SEND on a BFD argument.  */
+#define bfd_get_symtab_upper_bound(abfd) \
+     BFD_SEND (abfd, _bfd_get_symtab_upper_bound, (abfd))
+
+bfd_boolean bfd_is_local_label (bfd *abfd, asymbol *sym);
+
+bfd_boolean bfd_is_local_label_name (bfd *abfd, const char *name);
+/* The macro below shadows the prototype above and routes calls through BFD_SEND.  */
+#define bfd_is_local_label_name(abfd, name) \
+  BFD_SEND (abfd, _bfd_is_local_label_name, (abfd, name))
+
+bfd_boolean bfd_is_target_special_symbol (bfd *abfd, asymbol *sym);
+/* The macro below shadows the prototype above and routes calls through BFD_SEND.  */
+#define bfd_is_target_special_symbol(abfd, sym) \
+  BFD_SEND (abfd, _bfd_is_target_special_symbol, (abfd, sym))
+
+#define bfd_canonicalize_symtab(abfd, location) \
+  BFD_SEND (abfd, _bfd_canonicalize_symtab, (abfd, location))
+
+bfd_boolean bfd_set_symtab
+   (bfd *abfd, asymbol **location, unsigned int count);
+
+void bfd_print_symbol_vandf (bfd *abfd, void *file, asymbol *symbol);
+
+#define bfd_make_empty_symbol(abfd) \
+  BFD_SEND (abfd, _bfd_make_empty_symbol, (abfd))
+
+asymbol *_bfd_generic_make_empty_symbol (bfd *);
+
+#define bfd_make_debug_symbol(abfd,ptr,size) \
+  BFD_SEND (abfd, _bfd_make_debug_symbol, (abfd, ptr, size))
+
+int bfd_decode_symclass (asymbol *symbol);
+
+bfd_boolean bfd_is_undefined_symclass (int symclass);
+
+void bfd_symbol_info (asymbol *symbol, symbol_info *ret);
+
+bfd_boolean bfd_copy_private_symbol_data
+   (bfd *ibfd, asymbol *isym, bfd *obfd, asymbol *osym);
+/* Shadowing macro; note that the BFD_SEND dispatch is on OBFD, not IBFD.  */
+#define bfd_copy_private_symbol_data(ibfd, isymbol, obfd, osymbol) \
+  BFD_SEND (obfd, _bfd_copy_private_symbol_data, \
+            (ibfd, isymbol, obfd, osymbol))
+
+/* Extracted from bfd.c.  */
+
+enum bfd_direction  /* Stored in the 2-bit `direction' field of struct bfd.  */
+  {
+    no_direction = 0,
+    read_direction = 1,
+    write_direction = 2,
+    both_direction = 3  /* == read_direction | write_direction.  */
+  };
+
+enum bfd_plugin_format  /* Stored in the 2-bit `plugin_format' field of struct bfd.  */
+  {
+    bfd_plugin_unknown = 0,  /* Zero-valued, so it is what a zero-filled struct bfd holds.  */
+    bfd_plugin_yes = 1,  /* Presumably: BFD is input for a compiler plug-in -- confirm.  */
+    bfd_plugin_no = 2  /* Presumably: BFD is known not to be plug-in input -- confirm.  */
+  };
+
+struct bfd_build_id  /* Pointed to by the `build_id' member of struct bfd.  */
+  {
+    bfd_size_type size;  /* Presumably the number of bytes in DATA -- confirm.  */
+    bfd_byte data[1];  /* Presumably over-allocated to SIZE bytes (struct hack) -- confirm.  */
+  };
+
+struct bfd  /* The state kept for one file opened as a BFD.  */
+{
+  /* The filename the application opened the BFD with.  */
+  const char *filename;
+
+  /* A pointer to the target jump table.  */
+  const struct bfd_target *xvec;
+
+  /* The IOSTREAM, and corresponding IO vector that provide access
+     to the file backing the BFD.  */
+  void *iostream;
+  const struct bfd_iovec *iovec;
+
+  /* The caching routines use these to maintain a
+     least-recently-used list of BFDs.  */
+  struct bfd *lru_prev, *lru_next;
+
+  /* When a file is closed by the caching routines, BFD retains
+     state information on the file here...  */
+  ufile_ptr where;
+
+  /* File modified time, if mtime_set is TRUE.  */
+  long mtime;
+
+  /* A unique identifier of the BFD.  */
+  unsigned int id;
+
+  /* The format which belongs to the BFD. (object, core, etc.)  */
+  ENUM_BITFIELD (bfd_format) format : 3;
+
+  /* The direction with which the BFD was opened.  */
+  ENUM_BITFIELD (bfd_direction) direction : 2;
+
+  /* Format_specific flags.  20 bits: the largest flag below is 0x80000.  */
+  flagword flags : 20;
+
+  /* Values that may appear in the flags field of a BFD.  These also
+     appear in the object_flags field of the bfd_target structure, where
+     they indicate the set of flags used by that backend (not all flags
+     are meaningful for all object file formats) (FIXME: at the moment,
+     the object_flags values have mostly just been copied from backend
+     to another, and are not necessarily correct).  */
+
+#define BFD_NO_FLAGS   0x00
+
+  /* BFD contains relocation entries.  */
+#define HAS_RELOC      0x01
+
+  /* BFD is directly executable.  */
+#define EXEC_P         0x02
+
+  /* BFD has line number information (basically used for F_LNNO in a
+     COFF header).  */
+#define HAS_LINENO     0x04
+
+  /* BFD has debugging information.  */
+#define HAS_DEBUG      0x08
+
+  /* BFD has symbols.  */
+#define HAS_SYMS       0x10
+
+  /* BFD has local symbols (basically used for F_LSYMS in a COFF
+     header).  */
+#define HAS_LOCALS     0x20
+
+  /* BFD is a dynamic object.  */
+#define DYNAMIC        0x40
+
+  /* Text section is write protected (if D_PAGED is not set, this is
+     like an a.out NMAGIC file) (the linker sets this by default, but
+     clears it for -r or -N).  */
+#define WP_TEXT        0x80
+
+  /* BFD is dynamically paged (this is like an a.out ZMAGIC file) (the
+     linker sets this by default, but clears it for -r or -n or -N).  */
+#define D_PAGED        0x100
+
+  /* BFD is relaxable (this means that bfd_relax_section may be able to
+     do something) (sometimes bfd_relax_section can do something even if
+     this is not set).  */
+#define BFD_IS_RELAXABLE 0x200
+
+  /* This may be set before writing out a BFD to request using a
+     traditional format.  For example, this is used to request that when
+     writing out an a.out object the symbols not be hashed to eliminate
+     duplicates.  */
+#define BFD_TRADITIONAL_FORMAT 0x400
+
+  /* This flag indicates that the BFD contents are actually cached
+     in memory.  If this is set, iostream points to a bfd_in_memory
+     struct.  */
+#define BFD_IN_MEMORY 0x800
+
+  /* This BFD has been created by the linker and doesn't correspond
+     to any input file.  */
+#define BFD_LINKER_CREATED 0x1000
+
+  /* This may be set before writing out a BFD to request that it
+     be written using values for UIDs, GIDs, timestamps, etc. that
+     will be consistent from run to run.  */
+#define BFD_DETERMINISTIC_OUTPUT 0x2000
+
+  /* Compress sections in this BFD.  */
+#define BFD_COMPRESS 0x4000
+
+  /* Decompress sections in this BFD.  */
+#define BFD_DECOMPRESS 0x8000
+
+  /* BFD is a dummy, for plugins.  */
+#define BFD_PLUGIN 0x10000
+
+  /* Compress sections in this BFD with SHF_COMPRESSED from gABI.  */
+#define BFD_COMPRESS_GABI 0x20000
+
+  /* Convert ELF common symbol type to STT_COMMON or STT_OBJECT in this
+     BFD.  */
+#define BFD_CONVERT_ELF_COMMON 0x40000
+
+  /* Use the ELF STT_COMMON type in this BFD.  */
+#define BFD_USE_ELF_STT_COMMON 0x80000
+
+  /* Flags bits to be saved in bfd_preserve_save.  */
+#define BFD_FLAGS_SAVED \
+  (BFD_IN_MEMORY | BFD_COMPRESS | BFD_DECOMPRESS | BFD_PLUGIN \
+   | BFD_COMPRESS_GABI | BFD_CONVERT_ELF_COMMON | BFD_USE_ELF_STT_COMMON)
+
+  /* Flags bits which are for BFD use only.  */
+#define BFD_FLAGS_FOR_BFD_USE_MASK \
+  (BFD_IN_MEMORY | BFD_COMPRESS | BFD_DECOMPRESS | BFD_LINKER_CREATED \
+   | BFD_PLUGIN | BFD_TRADITIONAL_FORMAT | BFD_DETERMINISTIC_OUTPUT \
+   | BFD_COMPRESS_GABI | BFD_CONVERT_ELF_COMMON | BFD_USE_ELF_STT_COMMON)
+
+  /* Is the file descriptor being cached?  That is, can it be closed as
+     needed, and re-opened when accessed later?  */
+  unsigned int cacheable : 1;
+
+  /* Marks whether there was a default target specified when the
+     BFD was opened. This is used to select which matching algorithm
+     to use to choose the back end.  */
+  unsigned int target_defaulted : 1;
+
+  /* ... and here (see `where' above): ``once'' means at least once.  */
+  unsigned int opened_once : 1;
+
+  /* Set if we have a locally maintained mtime value, rather than
+     getting it from the file each time.  */
+  unsigned int mtime_set : 1;
+
+  /* Flag set if symbols from this BFD should not be exported.  */
+  unsigned int no_export : 1;
+
+  /* Remember when output has begun, to stop strange things
+     from happening.  */
+  unsigned int output_has_begun : 1;
+
+  /* Have archive map.  */
+  unsigned int has_armap : 1;
+
+  /* Set if this is a thin archive.  */
+  unsigned int is_thin_archive : 1;
+
+  /* Set if only required symbols should be added in the link hash table for
+     this object.  Used by VMS linkers.  */
+  unsigned int selective_search : 1;
+
+  /* Set if this is the linker output BFD.  */
+  unsigned int is_linker_output : 1;
+
+  /* Set if this is the linker input BFD.  */
+  unsigned int is_linker_input : 1;
+
+  /* If this is an input for a compiler plug-in library.  */
+  ENUM_BITFIELD (bfd_plugin_format) plugin_format : 2;
+
+  /* Set if this is a plugin output file.  */
+  unsigned int lto_output : 1;
+
+  /* Set to dummy BFD created when claimed by a compiler plug-in
+     library.  */
+  bfd *plugin_dummy_bfd;
+
+  /* Currently my_archive is tested before adding origin to
+     anything. I believe that this can become always an add of
+     origin, with origin set to 0 for non archive files.  */
+  ufile_ptr origin;
+
+  /* The origin in the archive of the proxy entry.  This will
+     normally be the same as origin, except for thin archives,
+     when it will contain the current offset of the proxy in the
+     thin archive rather than the offset of the bfd in its actual
+     container.  */
+  ufile_ptr proxy_origin;
+
+  /* A hash table for section names.  */
+  struct bfd_hash_table section_htab;
+
+  /* Pointer to linked list of sections.  */
+  struct bfd_section *sections;
+
+  /* The last section on the section list.  */
+  struct bfd_section *section_last;
+
+  /* The number of sections.  */
+  unsigned int section_count;
+
+  /* A field used by _bfd_generic_link_add_archive_symbols.  This will
+     be used only for archive elements.  */
+  int archive_pass;
+
+  /* Stuff only useful for object files:
+     The start address.  */
+  bfd_vma start_address;
+
+  /* Symbol table for output BFD (with symcount entries).
+     Also used by the linker to cache input BFD symbols.  */
+  struct bfd_symbol  **outsymbols;
+
+  /* Used for input and output.  */
+  unsigned int symcount;
+
+  /* Used for slurped dynamic symbol tables.  */
+  unsigned int dynsymcount;
+
+  /* Pointer to structure which contains architecture information.  */
+  const struct bfd_arch_info *arch_info;
+
+  /* Stuff only useful for archives.  */
+  void *arelt_data;
+  struct bfd *my_archive;      /* The containing archive BFD.  */
+  struct bfd *archive_next;    /* The next BFD in the archive.  */
+  struct bfd *archive_head;    /* The first BFD in the archive.  */
+  struct bfd *nested_archives; /* List of nested archives in a flattened
+                                  thin archive.  */
+
+  union {
+    /* For input BFDs, a chain of BFDs involved in a link.  */
+    struct bfd *next;
+    /* For output BFD, the linker hash table.  */
+    struct bfd_link_hash_table *hash;
+  } link;
+
+  /* Used by the back end to hold private data.  */
+  union
+    {
+      struct aout_data_struct *aout_data;
+      struct artdata *aout_ar_data;
+      struct _oasys_data *oasys_obj_data;
+      struct _oasys_ar_data *oasys_ar_data;
+      struct coff_tdata *coff_obj_data;
+      struct pe_tdata *pe_obj_data;
+      struct xcoff_tdata *xcoff_obj_data;
+      struct ecoff_tdata *ecoff_obj_data;
+      struct ieee_data_struct *ieee_data;
+      struct ieee_ar_data_struct *ieee_ar_data;
+      struct srec_data_struct *srec_data;
+      struct verilog_data_struct *verilog_data;
+      struct ihex_data_struct *ihex_data;
+      struct tekhex_data_struct *tekhex_data;
+      struct elf_obj_tdata *elf_obj_data;
+      struct nlm_obj_tdata *nlm_obj_data;
+      struct bout_data_struct *bout_data;
+      struct mmo_data_struct *mmo_data;
+      struct sun_core_struct *sun_core_data;
+      struct sco5_core_struct *sco5_core_data;
+      struct trad_core_struct *trad_core_data;
+      struct som_data_struct *som_data;
+      struct hpux_core_struct *hpux_core_data;
+      struct hppabsd_core_struct *hppabsd_core_data;
+      struct sgi_core_struct *sgi_core_data;
+      struct lynx_core_struct *lynx_core_data;
+      struct osf_core_struct *osf_core_data;
+      struct cisco_core_struct *cisco_core_data;
+      struct versados_data_struct *versados_data;
+      struct netbsd_core_struct *netbsd_core_data;
+      struct mach_o_data_struct *mach_o_data;
+      struct mach_o_fat_data_struct *mach_o_fat_data;
+      struct plugin_data_struct *plugin_data;
+      struct bfd_pef_data_struct *pef_data;
+      struct bfd_pef_xlib_data_struct *pef_xlib_data;
+      struct bfd_sym_data_struct *sym_data;
+      void *any;
+    }
+  tdata;
+
+  /* Used by the application to hold private data.  */
+  void *usrdata;
+
+  /* Where all the allocated stuff under this BFD goes.  This is a
+     struct objalloc *, but we use void * to avoid requiring the inclusion
+     of objalloc.h.  */
+  void *memory;
+
+  /* For input BFDs, the build ID, if the object has one.  */
+  const struct bfd_build_id *build_id;
+};
+
+/* Set the one-bit `cacheable' flag of ABFD from VAL and return TRUE.
+   See note beside bfd_set_section_userdata.  */
+static inline bfd_boolean
+bfd_set_cacheable (bfd * abfd, bfd_boolean val)
+{
+  abfd->cacheable = val != 0;  /* Normalize: a 1-bit field keeps only an int's low bit.  */
+  return TRUE;
+}
+
+typedef enum bfd_error  /* Returned by bfd_get_error; taken by bfd_set_error and bfd_errmsg.  */
+{
+  bfd_error_no_error = 0,  /* Explicitly zero; the rest count up from here.  */
+  bfd_error_system_call,
+  bfd_error_invalid_target,
+  bfd_error_wrong_format,
+  bfd_error_wrong_object_format,
+  bfd_error_invalid_operation,
+  bfd_error_no_memory,
+  bfd_error_no_symbols,
+  bfd_error_no_armap,
+  bfd_error_no_more_archived_files,
+  bfd_error_malformed_archive,
+  bfd_error_missing_dso,
+  bfd_error_file_not_recognized,
+  bfd_error_file_ambiguously_recognized,
+  bfd_error_no_contents,
+  bfd_error_nonrepresentable_section,
+  bfd_error_no_debug_section,
+  bfd_error_bad_value,
+  bfd_error_file_truncated,
+  bfd_error_file_too_big,
+  bfd_error_on_input,
+  bfd_error_invalid_error_code  /* Last enumerator; presumably an out-of-range sentinel -- confirm.  */
+}
+bfd_error_type;
+
+bfd_error_type bfd_get_error (void);
+
+void bfd_set_error (bfd_error_type error_tag, ...);
+
+const char *bfd_errmsg (bfd_error_type error_tag);
+
+void bfd_perror (const char *message);
+
+
+typedef void (*bfd_error_handler_type) (const char *, va_list);
+
+bfd_error_handler_type bfd_set_error_handler (bfd_error_handler_type);
+
+void bfd_set_error_program_name (const char *);
+
+
+typedef void (*bfd_assert_handler_type) (const char *bfd_formatmsg,
+                                         const char *bfd_version,
+                                         const char *bfd_file,
+                                         int bfd_line);
+
+bfd_assert_handler_type bfd_set_assert_handler (bfd_assert_handler_type);
+
+long bfd_get_reloc_upper_bound (bfd *abfd, asection *sect);
+
+long bfd_canonicalize_reloc
+   (bfd *abfd, asection *sec, arelent **loc, asymbol **syms);
+
+void bfd_set_reloc
+   (bfd *abfd, asection *sec, arelent **rel, unsigned int count);
+
+bfd_boolean bfd_set_file_flags (bfd *abfd, flagword flags);
+
+int bfd_get_arch_size (bfd *abfd);
+
+int bfd_get_sign_extend_vma (bfd *abfd);
+
+bfd_boolean bfd_set_start_address (bfd *abfd, bfd_vma vma);
+
+unsigned int bfd_get_gp_size (bfd *abfd);
+
+void bfd_set_gp_size (bfd *abfd, unsigned int i);
+
+bfd_vma bfd_scan_vma (const char *string, const char **end, int base);
+
+bfd_boolean bfd_copy_private_header_data (bfd *ibfd, bfd *obfd);
+
+#define bfd_copy_private_header_data(ibfd, obfd) \
+     BFD_SEND (obfd, _bfd_copy_private_header_data, \
+               (ibfd, obfd))
+bfd_boolean bfd_copy_private_bfd_data (bfd *ibfd, bfd *obfd);
+
+#define bfd_copy_private_bfd_data(ibfd, obfd) \
+     BFD_SEND (obfd, _bfd_copy_private_bfd_data, \
+               (ibfd, obfd))
+bfd_boolean bfd_set_private_flags (bfd *abfd, flagword flags);
+
+#define bfd_set_private_flags(abfd, flags) \
+     BFD_SEND (abfd, _bfd_set_private_flags, (abfd, flags))
+#define bfd_sizeof_headers(abfd, info) \
+       BFD_SEND (abfd, _bfd_sizeof_headers, (abfd, info))
+/* Note: SEC and SYMS reach the backend in the opposite order (syms, sec); last arg is NULL.  */
+#define bfd_find_nearest_line(abfd, sec, syms, off, file, func, line) \
+       BFD_SEND (abfd, _bfd_find_nearest_line, \
+                 (abfd, syms, sec, off, file, func, line, NULL))
+/* Same backend entry point, but forwards DISC as the final argument instead of NULL.  */
+#define bfd_find_nearest_line_discriminator(abfd, sec, syms, off, file, func, \
+                                            line, disc) \
+       BFD_SEND (abfd, _bfd_find_nearest_line, \
+                 (abfd, syms, sec, off, file, func, line, disc))
+
+#define bfd_find_line(abfd, syms, sym, file, line) \
+       BFD_SEND (abfd, _bfd_find_line, \
+                 (abfd, syms, sym, file, line))
+
+#define bfd_find_inliner_info(abfd, file, func, line) \
+       BFD_SEND (abfd, _bfd_find_inliner_info, \
+                 (abfd, file, func, line))
+
+#define bfd_debug_info_start(abfd) \
+       BFD_SEND (abfd, _bfd_debug_info_start, (abfd))
+
+#define bfd_debug_info_end(abfd) \
+       BFD_SEND (abfd, _bfd_debug_info_end, (abfd))
+
+#define bfd_debug_info_accumulate(abfd, section) \
+       BFD_SEND (abfd, _bfd_debug_info_accumulate, (abfd, section))
+
+#define bfd_stat_arch_elt(abfd, stat) \
+       BFD_SEND (abfd, _bfd_stat_arch_elt,(abfd, stat))
+
+#define bfd_update_armap_timestamp(abfd) \
+       BFD_SEND (abfd, _bfd_update_armap_timestamp, (abfd))
+
+#define bfd_set_arch_mach(abfd, arch, mach)\
+       BFD_SEND ( abfd, _bfd_set_arch_mach, (abfd, arch, mach))
+
+#define bfd_relax_section(abfd, section, link_info, again) \
+       BFD_SEND (abfd, _bfd_relax_section, (abfd, section, link_info, again))
+
+#define bfd_gc_sections(abfd, link_info) \
+       BFD_SEND (abfd, _bfd_gc_sections, (abfd, link_info))
+/* NOTE(review): `abfd' is not a macro parameter; it must be in scope at the call site.  */
+#define bfd_lookup_section_flags(link_info, flag_info, section) \
+       BFD_SEND (abfd, _bfd_lookup_section_flags, (link_info, flag_info, section))
+
+#define bfd_merge_sections(abfd, link_info) \
+       BFD_SEND (abfd, _bfd_merge_sections, (abfd, link_info))
+
+#define bfd_is_group_section(abfd, sec) \
+       BFD_SEND (abfd, _bfd_is_group_section, (abfd, sec))
+
+#define bfd_discard_group(abfd, sec) \
+       BFD_SEND (abfd, _bfd_discard_group, (abfd, sec))
+
+#define bfd_link_hash_table_create(abfd) \
+       BFD_SEND (abfd, _bfd_link_hash_table_create, (abfd))
+
+#define bfd_link_add_symbols(abfd, info) \
+       BFD_SEND (abfd, _bfd_link_add_symbols, (abfd, info))
+/* ABFD only selects the target vector; the backend is called with just (sec, info).  */
+#define bfd_link_just_syms(abfd, sec, info) \
+       BFD_SEND (abfd, _bfd_link_just_syms, (sec, info))
+
+#define bfd_final_link(abfd, info) \
+       BFD_SEND (abfd, _bfd_final_link, (abfd, info))
+
+#define bfd_free_cached_info(abfd) \
+       BFD_SEND (abfd, _bfd_free_cached_info, (abfd))
+
+#define bfd_get_dynamic_symtab_upper_bound(abfd) \
+       BFD_SEND (abfd, _bfd_get_dynamic_symtab_upper_bound, (abfd))
+
+#define bfd_print_private_bfd_data(abfd, file)\
+       BFD_SEND (abfd, _bfd_print_private_bfd_data, (abfd, file))
+
+#define bfd_canonicalize_dynamic_symtab(abfd, asymbols) \
+       BFD_SEND (abfd, _bfd_canonicalize_dynamic_symtab, (abfd, asymbols))
+
+#define bfd_get_synthetic_symtab(abfd, count, syms, dyncount, dynsyms, ret) \
+       BFD_SEND (abfd, _bfd_get_synthetic_symtab, (abfd, count, syms, \
+                                                   dyncount, dynsyms, ret))
+
+#define bfd_get_dynamic_reloc_upper_bound(abfd) \
+       BFD_SEND (abfd, _bfd_get_dynamic_reloc_upper_bound, (abfd))
+
+#define bfd_canonicalize_dynamic_reloc(abfd, arels, asyms) \
+       BFD_SEND (abfd, _bfd_canonicalize_dynamic_reloc, (abfd, arels, asyms))
+
+extern bfd_byte *bfd_get_relocated_section_contents
+  (bfd *, struct bfd_link_info *, struct bfd_link_order *, bfd_byte *,
+   bfd_boolean, asymbol **);
+
+bfd_boolean bfd_alt_mach_code (bfd *abfd, int alternative);
+
+bfd_vma bfd_emul_get_maxpagesize (const char *);
+
+void bfd_emul_set_maxpagesize (const char *, bfd_vma);
+
+bfd_vma bfd_emul_get_commonpagesize (const char *);
+
+void bfd_emul_set_commonpagesize (const char *, bfd_vma);
+
+char *bfd_demangle (bfd *, const char *, int);
+
+void bfd_update_compression_header
+   (bfd *abfd, bfd_byte *contents, asection *sec);
+
+bfd_boolean bfd_check_compression_header
+   (bfd *abfd, bfd_byte *contents, asection *sec,
+    bfd_size_type *uncompressed_size);
+
+int bfd_get_compression_header_size (bfd *abfd, asection *sec);
+
+bfd_size_type bfd_convert_section_size
+   (bfd *ibfd, asection *isec, bfd *obfd, bfd_size_type size);
+
+bfd_boolean bfd_convert_section_contents
+   (bfd *ibfd, asection *isec, bfd *obfd,
+    bfd_byte **ptr, bfd_size_type *ptr_size);
+
+/* Extracted from archive.c.  */
+symindex bfd_get_next_mapent
+   (bfd *abfd, symindex previous, carsym **sym);
+
+bfd_boolean bfd_set_archive_head (bfd *output, bfd *new_head);
+
+bfd *bfd_openr_next_archived_file (bfd *archive, bfd *previous);
+
+/* Extracted from corefile.c.  */
+const char *bfd_core_file_failing_command (bfd *abfd);
+
+int bfd_core_file_failing_signal (bfd *abfd);
+
+int bfd_core_file_pid (bfd *abfd);
+
+bfd_boolean core_file_matches_executable_p
+   (bfd *core_bfd, bfd *exec_bfd);
+
+bfd_boolean generic_core_file_matches_executable_p
+   (bfd *core_bfd, bfd *exec_bfd);
+
+/* Extracted from targets.c.  */
+#define BFD_SEND(bfd, message, arglist) \
+  ((*((bfd)->xvec->message)) arglist)
+/* BFD_SEND calls member MESSAGE of BFD's xvec; the debug variant checks the pointers first.  */
+#ifdef DEBUG_BFD_SEND
+#undef BFD_SEND
+#define BFD_SEND(bfd, message, arglist) \
+  (((bfd) && (bfd)->xvec && (bfd)->xvec->message) ? \
+    ((*((bfd)->xvec->message)) arglist) : \
+    (bfd_assert (__FILE__,__LINE__), NULL))
+#endif /* DEBUG_BFD_SEND */
+#define BFD_SEND_FMT(bfd, message, arglist) \
+  (((bfd)->xvec->message[(int) ((bfd)->format)]) arglist)
+/* BFD_SEND_FMT calls MESSAGE[format]; the debug variant checks that entry, not the array.  */
+#ifdef DEBUG_BFD_SEND
+#undef BFD_SEND_FMT
+#define BFD_SEND_FMT(bfd, message, arglist) \
+  (((bfd) && (bfd)->xvec && (bfd)->xvec->message[(int) ((bfd)->format)]) ? \
+   (((bfd)->xvec->message[(int) ((bfd)->format)]) arglist) : \
+   (bfd_assert (__FILE__,__LINE__), NULL))
+#endif /* DEBUG_BFD_SEND */
+
+enum bfd_flavour  /* Type of the `flavour' member of struct bfd_target.  */
+{
+  /* N.B. Update bfd_flavour_name if you change this.  */
+  bfd_target_unknown_flavour,  /* First enumerator, so its value is 0.  */
+  bfd_target_aout_flavour,
+  bfd_target_coff_flavour,
+  bfd_target_ecoff_flavour,
+  bfd_target_xcoff_flavour,
+  bfd_target_elf_flavour,
+  bfd_target_ieee_flavour,
+  bfd_target_nlm_flavour,
+  bfd_target_oasys_flavour,
+  bfd_target_tekhex_flavour,
+  bfd_target_srec_flavour,
+  bfd_target_verilog_flavour,
+  bfd_target_ihex_flavour,
+  bfd_target_som_flavour,
+  bfd_target_os9k_flavour,
+  bfd_target_versados_flavour,
+  bfd_target_msdos_flavour,
+  bfd_target_ovax_flavour,
+  bfd_target_evax_flavour,
+  bfd_target_mmo_flavour,
+  bfd_target_mach_o_flavour,
+  bfd_target_pef_flavour,
+  bfd_target_pef_xlib_flavour,
+  bfd_target_sym_flavour
+};
+/* Byte order; type of bfd_target's `byteorder' and `header_byteorder' members.  */
+enum bfd_endian { BFD_ENDIAN_BIG, BFD_ENDIAN_LITTLE, BFD_ENDIAN_UNKNOWN };
+
+/* Forward declaration.  */
+typedef struct bfd_link_info _bfd_link_info;
+
+/* Forward declaration.  */
+typedef struct flag_info flag_info;
+
+typedef struct bfd_target
+{
+  /* Identifies the kind of target, e.g., SunOS4, Ultrix, etc.  */
+  char *name;
+
+ /* The "flavour" of a back end is a general indication about
+    the contents of a file.  */
+  enum bfd_flavour flavour;
+
+  /* The order of bytes within the data area of a file.  */
+  enum bfd_endian byteorder;
+
+ /* The order of bytes within the header parts of a file.  */
+  enum bfd_endian header_byteorder;
+
+  /* A mask of all the flags which an executable may have set -
+     from the set <<BFD_NO_FLAGS>>, <<HAS_RELOC>>, ...<<D_PAGED>>.  */
+  flagword object_flags;
+
+ /* A mask of all the flags which a section may have set - from
+    the set <<SEC_NO_FLAGS>>, <<SEC_ALLOC>>, ...<<SEC_NEVER_LOAD>>.  */
+  flagword section_flags;
+
+ /* The character normally found at the front of a symbol.
+    (if any), perhaps `_'.  */
+  char symbol_leading_char;
+
+ /* The pad character for file names within an archive header.  */
+  char ar_pad_char;
+
+  /* The maximum number of characters in an archive header.  */
+  unsigned char ar_max_namelen;
+
+  /* How well this target matches, used to select between various
+     possible targets when more than one target matches.  */
+  unsigned char match_priority;
+
+  /* Entries for byte swapping for data. These are different from the
+     other entry points, since they don't take a BFD as the first argument.
+     Certain other handlers could do the same.  */
+  bfd_uint64_t   (*bfd_getx64) (const void *);
+  bfd_int64_t    (*bfd_getx_signed_64) (const void *);
+  void           (*bfd_putx64) (bfd_uint64_t, void *);
+  bfd_vma        (*bfd_getx32) (const void *);
+  bfd_signed_vma (*bfd_getx_signed_32) (const void *);
+  void           (*bfd_putx32) (bfd_vma, void *);
+  bfd_vma        (*bfd_getx16) (const void *);
+  bfd_signed_vma (*bfd_getx_signed_16) (const void *);
+  void           (*bfd_putx16) (bfd_vma, void *);
+
+  /* Byte swapping for the headers.  */
+  bfd_uint64_t   (*bfd_h_getx64) (const void *);
+  bfd_int64_t    (*bfd_h_getx_signed_64) (const void *);
+  void           (*bfd_h_putx64) (bfd_uint64_t, void *);
+  bfd_vma        (*bfd_h_getx32) (const void *);
+  bfd_signed_vma (*bfd_h_getx_signed_32) (const void *);
+  void           (*bfd_h_putx32) (bfd_vma, void *);
+  bfd_vma        (*bfd_h_getx16) (const void *);
+  bfd_signed_vma (*bfd_h_getx_signed_16) (const void *);
+  void           (*bfd_h_putx16) (bfd_vma, void *);
+
+  /* Format dependent routines: these are vectors of entry points
+     within the target vector structure, one for each format to check.  */
+
+  /* Check the format of a file being read.  Return a <<bfd_target *>> or zero.  */
+  const struct bfd_target *(*_bfd_check_format[bfd_type_end]) (bfd *);
+
+  /* Set the format of a file being written.  */
+  bfd_boolean (*_bfd_set_format[bfd_type_end]) (bfd *);
+
+  /* Write cached information into a file being written, at <<bfd_close>>.  */
+  bfd_boolean (*_bfd_write_contents[bfd_type_end]) (bfd *);
+
+
+  /* Generic entry points.  */
+#define BFD_JUMP_TABLE_GENERIC(NAME) \
+  NAME##_close_and_cleanup, \
+  NAME##_bfd_free_cached_info, \
+  NAME##_new_section_hook, \
+  NAME##_get_section_contents, \
+  NAME##_get_section_contents_in_window
+
+  /* Called when the BFD is being closed to do any necessary cleanup.  */
+  bfd_boolean (*_close_and_cleanup) (bfd *);
+  /* Ask the BFD to free all cached information.  */
+  bfd_boolean (*_bfd_free_cached_info) (bfd *);
+  /* Called when a new section is created.  */
+  bfd_boolean (*_new_section_hook) (bfd *, sec_ptr);
+  /* Read the contents of a section.  */
+  bfd_boolean (*_bfd_get_section_contents)
+    (bfd *, sec_ptr, void *, file_ptr, bfd_size_type);
+  bfd_boolean (*_bfd_get_section_contents_in_window)
+    (bfd *, sec_ptr, bfd_window *, file_ptr, bfd_size_type);
+
+  /* Entry points to copy private data.  */
+#define BFD_JUMP_TABLE_COPY(NAME) \
+  NAME##_bfd_copy_private_bfd_data, \
+  NAME##_bfd_merge_private_bfd_data, \
+  _bfd_generic_init_private_section_data, \
+  NAME##_bfd_copy_private_section_data, \
+  NAME##_bfd_copy_private_symbol_data, \
+  NAME##_bfd_copy_private_header_data, \
+  NAME##_bfd_set_private_flags, \
+  NAME##_bfd_print_private_bfd_data
+
+  /* Called to copy BFD general private data from one object file
+     to another.  */
+  bfd_boolean (*_bfd_copy_private_bfd_data) (bfd *, bfd *);
+  /* Called to merge BFD general private data from one object file
+     to a common output file when linking.  */
+  bfd_boolean (*_bfd_merge_private_bfd_data) (bfd *, struct bfd_link_info *);
+  /* Called to initialize BFD private section data from one object file
+     to another.  */
+#define bfd_init_private_section_data(ibfd, isec, obfd, osec, link_info) \
+  BFD_SEND (obfd, _bfd_init_private_section_data, (ibfd, isec, obfd, osec, link_info))
+  bfd_boolean (*_bfd_init_private_section_data)
+    (bfd *, sec_ptr, bfd *, sec_ptr, struct bfd_link_info *);
+  /* Called to copy BFD private section data from one object file
+     to another.  */
+  bfd_boolean (*_bfd_copy_private_section_data)
+    (bfd *, sec_ptr, bfd *, sec_ptr);
+  /* Called to copy BFD private symbol data from one symbol
+     to another.  */
+  bfd_boolean (*_bfd_copy_private_symbol_data)
+    (bfd *, asymbol *, bfd *, asymbol *);
+  /* Called to copy BFD private header data from one object file
+     to another.  */
+  bfd_boolean (*_bfd_copy_private_header_data)
+    (bfd *, bfd *);
+  /* Called to set private backend flags.  */
+  bfd_boolean (*_bfd_set_private_flags) (bfd *, flagword);
+
+  /* Called to print private BFD data.  */
+  bfd_boolean (*_bfd_print_private_bfd_data) (bfd *, void *);
+
+  /* Core file entry points.  */
+#define BFD_JUMP_TABLE_CORE(NAME) \
+  NAME##_core_file_failing_command, \
+  NAME##_core_file_failing_signal, \
+  NAME##_core_file_matches_executable_p, \
+  NAME##_core_file_pid
+
+  char *      (*_core_file_failing_command) (bfd *);
+  int         (*_core_file_failing_signal) (bfd *);
+  bfd_boolean (*_core_file_matches_executable_p) (bfd *, bfd *);
+  int         (*_core_file_pid) (bfd *);
+
+  /* Archive entry points.  */
+#define BFD_JUMP_TABLE_ARCHIVE(NAME) \
+  NAME##_slurp_armap, \
+  NAME##_slurp_extended_name_table, \
+  NAME##_construct_extended_name_table, \
+  NAME##_truncate_arname, \
+  NAME##_write_armap, \
+  NAME##_read_ar_hdr, \
+  NAME##_write_ar_hdr, \
+  NAME##_openr_next_archived_file, \
+  NAME##_get_elt_at_index, \
+  NAME##_generic_stat_arch_elt, \
+  NAME##_update_armap_timestamp
+
+  bfd_boolean (*_bfd_slurp_armap) (bfd *);
+  bfd_boolean (*_bfd_slurp_extended_name_table) (bfd *);
+  bfd_boolean (*_bfd_construct_extended_name_table)
+    (bfd *, char **, bfd_size_type *, const char **);
+  void        (*_bfd_truncate_arname) (bfd *, const char *, char *);
+  bfd_boolean (*write_armap)
+    (bfd *, unsigned int, struct orl *, unsigned int, int);
+  void *      (*_bfd_read_ar_hdr_fn) (bfd *);
+  bfd_boolean (*_bfd_write_ar_hdr_fn) (bfd *, bfd *);
+  bfd *       (*openr_next_archived_file) (bfd *, bfd *);
+#define bfd_get_elt_at_index(b,i) BFD_SEND (b, _bfd_get_elt_at_index, (b,i))
+  bfd *       (*_bfd_get_elt_at_index) (bfd *, symindex);
+  int         (*_bfd_stat_arch_elt) (bfd *, struct stat *);
+  bfd_boolean (*_bfd_update_armap_timestamp) (bfd *);
+
+  /* Entry points used for symbols.  */
+#define BFD_JUMP_TABLE_SYMBOLS(NAME) \
+  NAME##_get_symtab_upper_bound, \
+  NAME##_canonicalize_symtab, \
+  NAME##_make_empty_symbol, \
+  NAME##_print_symbol, \
+  NAME##_get_symbol_info, \
+  NAME##_get_symbol_version_string, \
+  NAME##_bfd_is_local_label_name, \
+  NAME##_bfd_is_target_special_symbol, \
+  NAME##_get_lineno, \
+  NAME##_find_nearest_line, \
+  NAME##_find_line, \
+  NAME##_find_inliner_info, \
+  NAME##_bfd_make_debug_symbol, \
+  NAME##_read_minisymbols, \
+  NAME##_minisymbol_to_symbol
+
+  long        (*_bfd_get_symtab_upper_bound) (bfd *);
+  long        (*_bfd_canonicalize_symtab)
+    (bfd *, struct bfd_symbol **);
+  struct bfd_symbol *
+              (*_bfd_make_empty_symbol) (bfd *);
+  void        (*_bfd_print_symbol)
+    (bfd *, void *, struct bfd_symbol *, bfd_print_symbol_type);
+#define bfd_print_symbol(b,p,s,e) BFD_SEND (b, _bfd_print_symbol, (b,p,s,e))
+  void        (*_bfd_get_symbol_info)
+    (bfd *, struct bfd_symbol *, symbol_info *);
+#define bfd_get_symbol_info(b,p,e) BFD_SEND (b, _bfd_get_symbol_info, (b,p,e))
+  const char *(*_bfd_get_symbol_version_string)
+    (bfd *, struct bfd_symbol *, bfd_boolean *);
+#define bfd_get_symbol_version_string(b,s,h) BFD_SEND (b, _bfd_get_symbol_version_string, (b,s,h))
+  bfd_boolean (*_bfd_is_local_label_name) (bfd *, const char *);
+  bfd_boolean (*_bfd_is_target_special_symbol) (bfd *, asymbol *);
+  alent *     (*_get_lineno) (bfd *, struct bfd_symbol *);
+  bfd_boolean (*_bfd_find_nearest_line)
+    (bfd *, struct bfd_symbol **, struct bfd_section *, bfd_vma,
+     const char **, const char **, unsigned int *, unsigned int *);
+  bfd_boolean (*_bfd_find_line)
+    (bfd *, struct bfd_symbol **, struct bfd_symbol *,
+     const char **, unsigned int *);
+  bfd_boolean (*_bfd_find_inliner_info)
+    (bfd *, const char **, const char **, unsigned int *);
+ /* Back-door to allow format-aware applications to create debug symbols
+    while using BFD for everything else.  Currently used by the assembler
+    when creating COFF files.  */
+  asymbol *   (*_bfd_make_debug_symbol)
+    (bfd *, void *, unsigned long size);
+#define bfd_read_minisymbols(b, d, m, s) \
+  BFD_SEND (b, _read_minisymbols, (b, d, m, s))
+  long        (*_read_minisymbols)
+    (bfd *, bfd_boolean, void **, unsigned int *);
+#define bfd_minisymbol_to_symbol(b, d, m, f) \
+  BFD_SEND (b, _minisymbol_to_symbol, (b, d, m, f))
+  asymbol *   (*_minisymbol_to_symbol)
+    (bfd *, bfd_boolean, const void *, asymbol *);
+
+  /* Routines for relocs.  */
+#define BFD_JUMP_TABLE_RELOCS(NAME) \
+  NAME##_get_reloc_upper_bound, \
+  NAME##_canonicalize_reloc, \
+  NAME##_bfd_reloc_type_lookup, \
+  NAME##_bfd_reloc_name_lookup
+
+  long        (*_get_reloc_upper_bound) (bfd *, sec_ptr);
+  long        (*_bfd_canonicalize_reloc)
+    (bfd *, sec_ptr, arelent **, struct bfd_symbol **);
+  /* See documentation on reloc types.  */
+  reloc_howto_type *
+              (*reloc_type_lookup) (bfd *, bfd_reloc_code_real_type);
+  reloc_howto_type *
+              (*reloc_name_lookup) (bfd *, const char *);
+
+
+  /* Routines used when writing an object file.  */
+#define BFD_JUMP_TABLE_WRITE(NAME) \
+  NAME##_set_arch_mach, \
+  NAME##_set_section_contents
+
+  bfd_boolean (*_bfd_set_arch_mach)
+    (bfd *, enum bfd_architecture, unsigned long);
+  bfd_boolean (*_bfd_set_section_contents)
+    (bfd *, sec_ptr, const void *, file_ptr, bfd_size_type);
+
+  /* Routines used by the linker.  */
+#define BFD_JUMP_TABLE_LINK(NAME) \
+  NAME##_sizeof_headers, \
+  NAME##_bfd_get_relocated_section_contents, \
+  NAME##_bfd_relax_section, \
+  NAME##_bfd_link_hash_table_create, \
+  NAME##_bfd_link_add_symbols, \
+  NAME##_bfd_link_just_syms, \
+  NAME##_bfd_copy_link_hash_symbol_type, \
+  NAME##_bfd_final_link, \
+  NAME##_bfd_link_split_section, \
+  NAME##_bfd_link_check_relocs, \
+  NAME##_bfd_gc_sections, \
+  NAME##_bfd_lookup_section_flags, \
+  NAME##_bfd_merge_sections, \
+  NAME##_bfd_is_group_section, \
+  NAME##_bfd_discard_group, \
+  NAME##_section_already_linked, \
+  NAME##_bfd_define_common_symbol
+
+  int         (*_bfd_sizeof_headers) (bfd *, struct bfd_link_info *);
+  bfd_byte *  (*_bfd_get_relocated_section_contents)
+    (bfd *, struct bfd_link_info *, struct bfd_link_order *,
+     bfd_byte *, bfd_boolean, struct bfd_symbol **);
+
+  bfd_boolean (*_bfd_relax_section)
+    (bfd *, struct bfd_section *, struct bfd_link_info *, bfd_boolean *);
+
+  /* Create a hash table for the linker.  Different backends store
+     different information in this table.  */
+  struct bfd_link_hash_table *
+              (*_bfd_link_hash_table_create) (bfd *);
+
+  /* Add symbols from this object file into the hash table.  */
+  bfd_boolean (*_bfd_link_add_symbols) (bfd *, struct bfd_link_info *);
+
+  /* Indicate that we are only retrieving symbol values from this section.  */
+  void        (*_bfd_link_just_syms) (asection *, struct bfd_link_info *);
+
+  /* Copy the symbol type and other attributes for a linker script
+     assignment of one symbol to another.  */
+#define bfd_copy_link_hash_symbol_type(b, t, f) \
+  BFD_SEND (b, _bfd_copy_link_hash_symbol_type, (b, t, f))
+  void (*_bfd_copy_link_hash_symbol_type)
+    (bfd *, struct bfd_link_hash_entry *, struct bfd_link_hash_entry *);
+
+  /* Do a link based on the link_order structures attached to each
+     section of the BFD.  */
+  bfd_boolean (*_bfd_final_link) (bfd *, struct bfd_link_info *);
+
+  /* Should this section be split up into smaller pieces during linking.  */
+  bfd_boolean (*_bfd_link_split_section) (bfd *, struct bfd_section *);
+
+  /* Check the relocations in the bfd for validity.  */
+  bfd_boolean (* _bfd_link_check_relocs)(bfd *, struct bfd_link_info *);
+
+  /* Remove sections that are not referenced from the output.  */
+  bfd_boolean (*_bfd_gc_sections) (bfd *, struct bfd_link_info *);
+
+  /* Sets the bitmask of allowed and disallowed section flags.  */
+  bfd_boolean (*_bfd_lookup_section_flags) (struct bfd_link_info *,
+                                            struct flag_info *,
+                                            asection *);
+
+  /* Attempt to merge SEC_MERGE sections.  */
+  bfd_boolean (*_bfd_merge_sections) (bfd *, struct bfd_link_info *);
+
+  /* Is this section a member of a group?  */
+  bfd_boolean (*_bfd_is_group_section) (bfd *, const struct bfd_section *);
+
+  /* Discard members of a group.  */
+  bfd_boolean (*_bfd_discard_group) (bfd *, struct bfd_section *);
+
+  /* Check if SEC has been already linked during a relocatable or
+     final link.  */
+  bfd_boolean (*_section_already_linked) (bfd *, asection *,
+                                          struct bfd_link_info *);
+
+  /* Define a common symbol.  */
+  bfd_boolean (*_bfd_define_common_symbol) (bfd *, struct bfd_link_info *,
+                                            struct bfd_link_hash_entry *);
+
+  /* Routines to handle dynamic symbols and relocs.  */
+#define BFD_JUMP_TABLE_DYNAMIC(NAME) \
+  NAME##_get_dynamic_symtab_upper_bound, \
+  NAME##_canonicalize_dynamic_symtab, \
+  NAME##_get_synthetic_symtab, \
+  NAME##_get_dynamic_reloc_upper_bound, \
+  NAME##_canonicalize_dynamic_reloc
+
+  /* Get the amount of memory required to hold the dynamic symbols.  */
+  long        (*_bfd_get_dynamic_symtab_upper_bound) (bfd *);
+  /* Read in the dynamic symbols.  */
+  long        (*_bfd_canonicalize_dynamic_symtab)
+    (bfd *, struct bfd_symbol **);
+  /* Create synthesized symbols.  */
+  long        (*_bfd_get_synthetic_symtab)
+    (bfd *, long, struct bfd_symbol **, long, struct bfd_symbol **,
+     struct bfd_symbol **);
+  /* Get the amount of memory required to hold the dynamic relocs.  */
+  long        (*_bfd_get_dynamic_reloc_upper_bound) (bfd *);
+  /* Read in the dynamic relocs.  */
+  long        (*_bfd_canonicalize_dynamic_reloc)
+    (bfd *, arelent **, struct bfd_symbol **);
+
+  /* Opposite endian version of this target.  */
+  const struct bfd_target * alternative_target;
+
+  /* Data for use by back-end routines, which isn't
+     generic enough to belong in this structure.  */
+  const void *backend_data;
+
+} bfd_target;
+
+bfd_boolean bfd_set_default_target (const char *name);
+
+const bfd_target *bfd_find_target (const char *target_name, bfd *abfd);
+
+const bfd_target *bfd_get_target_info (const char *target_name,
+    bfd *abfd,
+    bfd_boolean *is_bigendian,
+    int *underscoring,
+    const char **def_target_arch);
+const char ** bfd_target_list (void);
+
+const bfd_target *bfd_iterate_over_targets
+   (int (*func) (const bfd_target *, void *),
+    void *data);
+
+const char *bfd_flavour_name (enum bfd_flavour flavour);
+
+/* Extracted from format.c.  */
+bfd_boolean bfd_check_format (bfd *abfd, bfd_format format);
+
+bfd_boolean bfd_check_format_matches
+   (bfd *abfd, bfd_format format, char ***matching);
+
+bfd_boolean bfd_set_format (bfd *abfd, bfd_format format);
+
+const char *bfd_format_string (bfd_format format);
+
+/* Extracted from linker.c.  */
+bfd_boolean bfd_link_split_section (bfd *abfd, asection *sec);
+
+#define bfd_link_split_section(abfd, sec) \
+       BFD_SEND (abfd, _bfd_link_split_section, (abfd, sec))
+
+bfd_boolean bfd_section_already_linked (bfd *abfd,
+    asection *sec,
+    struct bfd_link_info *info);
+
+#define bfd_section_already_linked(abfd, sec, info) \
+       BFD_SEND (abfd, _section_already_linked, (abfd, sec, info))
+
+bfd_boolean bfd_generic_define_common_symbol
+   (bfd *output_bfd, struct bfd_link_info *info,
+    struct bfd_link_hash_entry *h);
+
+#define bfd_define_common_symbol(output_bfd, info, h) \
+       BFD_SEND (output_bfd, _bfd_define_common_symbol, (output_bfd, info, h))
+
+struct bfd_elf_version_tree * bfd_find_version_for_sym
+   (struct bfd_elf_version_tree *verdefs,
+    const char *sym_name, bfd_boolean *hide);
+
+bfd_boolean bfd_hide_sym_by_version
+   (struct bfd_elf_version_tree *verdefs, const char *sym_name);
+
+bfd_boolean bfd_link_check_relocs
+   (bfd *abfd, struct bfd_link_info *info);
+
+bfd_boolean _bfd_generic_link_check_relocs
+   (bfd *abfd, struct bfd_link_info *info);
+
+bfd_boolean bfd_merge_private_bfd_data
+   (bfd *ibfd, struct bfd_link_info *info);
+
+#define bfd_merge_private_bfd_data(ibfd, info) \
+     BFD_SEND ((info)->output_bfd, _bfd_merge_private_bfd_data, \
+               (ibfd, info))
+/* Extracted from simple.c.  */
+bfd_byte *bfd_simple_get_relocated_section_contents
+   (bfd *abfd, asection *sec, bfd_byte *outbuf, asymbol **symbol_table);
+
+/* Extracted from compress.c.  */
+bfd_boolean bfd_get_full_section_contents
+   (bfd *abfd, asection *section, bfd_byte **ptr);
+
+void bfd_cache_section_contents
+   (asection *sec, void *contents);
+
+bfd_boolean bfd_is_section_compressed_with_header
+   (bfd *abfd, asection *section,
+    int *compression_header_size_p,
+    bfd_size_type *uncompressed_size_p);
+
+bfd_boolean bfd_is_section_compressed
+   (bfd *abfd, asection *section);
+
+bfd_boolean bfd_init_section_decompress_status
+   (bfd *abfd, asection *section);
+
+bfd_boolean bfd_init_section_compress_status
+   (bfd *abfd, asection *section);
+
+bfd_boolean bfd_compress_section
+   (bfd *abfd, asection *section, bfd_byte *uncompressed_buffer);
+
+#ifdef __cplusplus
+}
+#endif
+#endif
diff --git a/utils/gapy/gen-debug-info-src/ext/bfd/bfd_stdint.h b/utils/gapy/gen-debug-info-src/ext/bfd/bfd_stdint.h
new file mode 100644
index 000000000..1aedfff86
--- /dev/null
+++ b/utils/gapy/gen-debug-info-src/ext/bfd/bfd_stdint.h
@@ -0,0 +1,47 @@
+/* generated for  gcc (Ubuntu 9.3.0-17ubuntu1~20.04) 9.3.0 */
+
+#ifndef GCC_GENERATED_STDINT_H
+#define GCC_GENERATED_STDINT_H 1
+
+#include <sys/types.h>
+#include <stdint.h>
+/* glibc uses these symbols as guards to prevent redefinitions.  */
+#ifdef __int8_t_defined
+#define _INT8_T
+#define _INT16_T
+#define _INT32_T
+#endif
+#ifdef __uint32_t_defined
+#define _UINT32_T
+#endif
+
+
+/* Some systems have guard macros to prevent redefinitions, define them.  */
+#ifndef _INT8_T
+#define _INT8_T
+#endif
+#ifndef _INT16_T
+#define _INT16_T
+#endif
+#ifndef _INT32_T
+#define _INT32_T
+#endif
+#ifndef _UINT8_T
+#define _UINT8_T
+#endif
+#ifndef _UINT16_T
+#define _UINT16_T
+#endif
+#ifndef _UINT32_T
+#define _UINT32_T
+#endif
+
+/* system headers have good uint64_t and int64_t */
+#ifndef _INT64_T
+#define _INT64_T
+#endif
+#ifndef _UINT64_T
+#define _UINT64_T
+#endif
+
+#endif /* GCC_GENERATED_STDINT_H */
diff --git a/utils/gapy/gen-debug-info-src/ext/bfd/bfdver.h b/utils/gapy/gen-debug-info-src/ext/bfd/bfdver.h
new file mode 100644
index 000000000..68ad80862
--- /dev/null
+++ b/utils/gapy/gen-debug-info-src/ext/bfd/bfdver.h
@@ -0,0 +1,4 @@
+#define BFD_VERSION_DATE 20170505
+#define BFD_VERSION 228000000
+#define BFD_VERSION_STRING  "(GNU Binutils) " "2.28.0.20170505"
+#define REPORT_BUGS_TO "<http://www.sourceware.org/bugzilla/>"
diff --git a/utils/gapy/gen-debug-info-src/ext/bfd/config.h b/utils/gapy/gen-debug-info-src/ext/bfd/config.h
new file mode 100644
index 000000000..fe24ed01e
--- /dev/null
+++ b/utils/gapy/gen-debug-info-src/ext/bfd/config.h
@@ -0,0 +1,395 @@
+/* config.h.  Generated from config.in by configure.  */
+/* config.in.  Generated from configure.ac by autoheader.  */
+
+/* Check that config.h is #included before system headers
+    (this works only for glibc, but that should be enough).  */
+#if defined(__GLIBC__) && !defined(__FreeBSD_kernel__) && !defined(__CONFIG_H__)
+#  error config.h must be #included before system headers
+#endif
+#define __CONFIG_H__ 1
+
+/* Name of host specific core header file to include in elf.c. */
+/* #undef CORE_HEADER */
+
+/* Define to 1 if translation of program messages to the user's native
+   language is requested. */
+#define ENABLE_NLS 1
+
+/* Define to 1 if you have the <alloca.h> header file. */
+#define HAVE_ALLOCA_H 1
+
+/* Define to 1 if you have the declaration of `asprintf', and to 0 if you
+   don't. */
+#define HAVE_DECL_ASPRINTF 1
+
+/* Define to 1 if you have the declaration of `basename', and to 0 if you
+   don't. */
+#define HAVE_DECL_BASENAME 1
+
+/* Define to 1 if you have the declaration of `ffs', and to 0 if you don't. */
+#define HAVE_DECL_FFS 1
+
+/* Define to 1 if you have the declaration of `free', and to 0 if you don't.
+   */
+#define HAVE_DECL_FREE 1
+
+/* Define to 1 if you have the declaration of `fseeko', and to 0 if you don't.
+   */
+#define HAVE_DECL_FSEEKO 1
+
+/* Define to 1 if you have the declaration of `fseeko64', and to 0 if you
+   don't. */
+#define HAVE_DECL_FSEEKO64 1
+
+/* Define to 1 if you have the declaration of `ftello', and to 0 if you don't.
+   */
+#define HAVE_DECL_FTELLO 1
+
+/* Define to 1 if you have the declaration of `ftello64', and to 0 if you
+   don't. */
+#define HAVE_DECL_FTELLO64 1
+
+/* Define to 1 if you have the declaration of `getenv', and to 0 if you don't.
+   */
+#define HAVE_DECL_GETENV 1
+
+/* Define to 1 if you have the declaration of `malloc', and to 0 if you don't.
+   */
+#define HAVE_DECL_MALLOC 1
+
+/* Define to 1 if you have the declaration of `realloc', and to 0 if you
+   don't. */
+#define HAVE_DECL_REALLOC 1
+
+/* Define to 1 if you have the declaration of `snprintf', and to 0 if you
+   don't. */
+#define HAVE_DECL_SNPRINTF 1
+
+/* Define to 1 if you have the declaration of `stpcpy', and to 0 if you don't.
+   */
+#define HAVE_DECL_STPCPY 1
+
+/* Define to 1 if you have the declaration of `strnlen', and to 0 if you
+   don't. */
+#define HAVE_DECL_STRNLEN 1
+
+/* Define to 1 if you have the declaration of `strstr', and to 0 if you don't.
+   */
+#define HAVE_DECL_STRSTR 1
+
+/* Define to 1 if you have the declaration of `vasprintf', and to 0 if you
+   don't. */
+#define HAVE_DECL_VASPRINTF 1
+
+/* Define to 1 if you have the declaration of `vsnprintf', and to 0 if you
+   don't. */
+#define HAVE_DECL_VSNPRINTF 1
+
+/* Define to 1 if you have the <dirent.h> header file, and it defines `DIR'.
+   */
+#define HAVE_DIRENT_H 1
+
+/* Define to 1 if you have the <dlfcn.h> header file. */
+#define HAVE_DLFCN_H 1
+
+/* Define to 1 if you have the `fcntl' function. */
+#define HAVE_FCNTL 1
+
+/* Define to 1 if you have the <fcntl.h> header file. */
+#define HAVE_FCNTL_H 1
+
+/* Define to 1 if you have the `fdopen' function. */
+#define HAVE_FDOPEN 1
+
+/* Define to 1 if you have the `fileno' function. */
+#define HAVE_FILENO 1
+
+/* Define to 1 if you have the `fopen64' function. */
+#define HAVE_FOPEN64 1
+
+/* Define to 1 if you have the `fseeko' function. */
+#define HAVE_FSEEKO 1
+
+/* Define to 1 if you have the `fseeko64' function. */
+#define HAVE_FSEEKO64 1
+
+/* Define to 1 if you have the `ftello' function. */
+#define HAVE_FTELLO 1
+
+/* Define to 1 if you have the `ftello64' function. */
+#define HAVE_FTELLO64 1
+
+/* Define to 1 if you have the `getgid' function. */
+#define HAVE_GETGID 1
+
+/* Define to 1 if you have the `getpagesize' function. */
+#define HAVE_GETPAGESIZE 1
+
+/* Define to 1 if you have the `getrlimit' function. */
+#define HAVE_GETRLIMIT 1
+
+/* Define to 1 if you have the `getuid' function. */
+#define HAVE_GETUID 1
+
+/* Define to 1 if you have the <inttypes.h> header file. */
+#define HAVE_INTTYPES_H 1
+
+/* Define if <sys/procfs.h> has lwpstatus_t. */
+/* #undef HAVE_LWPSTATUS_T */
+
+/* Define if <sys/procfs.h> has lwpstatus_t.pr_context. */
+/* #undef HAVE_LWPSTATUS_T_PR_CONTEXT */
+
+/* Define if <sys/procfs.h> has lwpstatus_t.pr_fpreg. */
+/* #undef HAVE_LWPSTATUS_T_PR_FPREG */
+
+/* Define if <sys/procfs.h> has lwpstatus_t.pr_reg. */
+/* #undef HAVE_LWPSTATUS_T_PR_REG */
+
+/* Define if <sys/procfs.h> has lwpxstatus_t. */
+/* #undef HAVE_LWPXSTATUS_T */
+
+/* Define to 1 if you have the `madvise' function. */
+#define HAVE_MADVISE 1
+
+/* Define to 1 if you have the <memory.h> header file. */
+#define HAVE_MEMORY_H 1
+
+/* Define to 1 if you have a working `mmap' system call. */
+#define HAVE_MMAP 1
+
+/* Define to 1 if you have the `mprotect' function. */
+#define HAVE_MPROTECT 1
+
+/* Define to 1 if you have the <ndir.h> header file, and it defines `DIR'. */
+/* #undef HAVE_NDIR_H */
+
+/* Define if <sys/procfs.h> has prpsinfo32_t. */
+/* #undef HAVE_PRPSINFO32_T */
+
+/* Define if <sys/procfs.h> has prpsinfo32_t.pr_pid. */
+/* #undef HAVE_PRPSINFO32_T_PR_PID */
+
+/* Define if <sys/procfs.h> has prpsinfo_t. */
+/* #undef HAVE_PRPSINFO_T */
+
+/* Define if <sys/procfs.h> has prpsinfo_t.pr_pid. */
+/* #undef HAVE_PRPSINFO_T_PR_PID */
+
+/* Define if <sys/procfs.h> has prstatus32_t. */
+/* #undef HAVE_PRSTATUS32_T */
+
+/* Define if <sys/procfs.h> has prstatus32_t.pr_who. */
+/* #undef HAVE_PRSTATUS32_T_PR_WHO */
+
+/* Define if <sys/procfs.h> has prstatus_t. */
+/* #undef HAVE_PRSTATUS_T */
+
+/* Define if <sys/procfs.h> has prstatus_t.pr_who. */
+/* #undef HAVE_PRSTATUS_T_PR_WHO */
+
+/* Define if <sys/procfs.h> has psinfo32_t. */
+/* #undef HAVE_PSINFO32_T */
+
+/* Define if <sys/procfs.h> has psinfo32_t.pr_pid. */
+/* #undef HAVE_PSINFO32_T_PR_PID */
+
+/* Define if <sys/procfs.h> has psinfo_t. */
+/* #undef HAVE_PSINFO_T */
+
+/* Define if <sys/procfs.h> has psinfo_t.pr_pid. */
+/* #undef HAVE_PSINFO_T_PR_PID */
+
+/* Define if <sys/procfs.h> has pstatus32_t. */
+/* #undef HAVE_PSTATUS32_T */
+
+/* Define if <sys/procfs.h> has pstatus_t. */
+/* #undef HAVE_PSTATUS_T */
+
+/* Define if <sys/procfs.h> has pxstatus_t. */
+/* #undef HAVE_PXSTATUS_T */
+
+/* Define to 1 if you have the `setitimer' function. */
+#define HAVE_SETITIMER 1
+
+/* Define to 1 if you have the <stddef.h> header file. */
+#define HAVE_STDDEF_H 1
+
+/* Define to 1 if you have the <stdint.h> header file. */
+#define HAVE_STDINT_H 1
+
+/* Define to 1 if you have the <stdlib.h> header file. */
+#define HAVE_STDLIB_H 1
+
+/* Define to 1 if you have the <strings.h> header file. */
+#define HAVE_STRINGS_H 1
+
+/* Define to 1 if you have the <string.h> header file. */
+#define HAVE_STRING_H 1
+
+/* Define to 1 if you have the `strtoull' function. */
+#define HAVE_STRTOULL 1
+
+/* Define if struct core_dumpx has member c_impl */
+/* #undef HAVE_ST_C_IMPL */
+
+/* Define to 1 if you have the `sysconf' function. */
+#define HAVE_SYSCONF 1
+
+/* Define to 1 if you have the <sys/dir.h> header file, and it defines `DIR'.
+   */
+/* #undef HAVE_SYS_DIR_H */
+
+/* Define to 1 if you have the <sys/file.h> header file. */
+#define HAVE_SYS_FILE_H 1
+
+/* Define to 1 if you have the <sys/ndir.h> header file, and it defines `DIR'.
+   */
+/* #undef HAVE_SYS_NDIR_H */
+
+/* Define to 1 if you have the <sys/procfs.h> header file. */
+/* #undef HAVE_SYS_PROCFS_H */
+
+/* Define to 1 if you have the <sys/resource.h> header file. */
+#define HAVE_SYS_RESOURCE_H 1
+
+/* Define to 1 if you have the <sys/stat.h> header file. */
+#define HAVE_SYS_STAT_H 1
+
+/* Define to 1 if you have the <sys/time.h> header file. */
+#define HAVE_SYS_TIME_H 1
+
+/* Define to 1 if you have the <sys/types.h> header file. */
+#define HAVE_SYS_TYPES_H 1
+
+/* Define to 1 if you have the <time.h> header file. */
+#define HAVE_TIME_H 1
+
+/* Define to 1 if you have the <unistd.h> header file. */
+#define HAVE_UNISTD_H 1
+
+/* Define to 1 if you have the <wchar.h> header file. */
+#define HAVE_WCHAR_H 1
+
+/* Define to 1 if you have the <wctype.h> header file. */
+#define HAVE_WCTYPE_H 1
+
+/* Define if <sys/procfs.h> has win32_pstatus_t. */
+/* #undef HAVE_WIN32_PSTATUS_T */
+
+/* Define to 1 if you have the <windows.h> header file. */
+/* #undef HAVE_WINDOWS_H */
+
+/* Define to the sub-directory in which libtool stores uninstalled libraries.
+   */
+#define LT_OBJDIR ".libs/"
+
+/* Name of package */
+#define PACKAGE "bfd"
+
+/* Define to the address where bug reports for this package should be sent. */
+#define PACKAGE_BUGREPORT ""
+
+/* Define to the full name of this package. */
+#define PACKAGE_NAME "bfd"
+
+/* Define to the full name and version of this package. */
+#define PACKAGE_STRING "bfd 2.28.0"
+
+/* Define to the one symbol short name of this package. */
+#define PACKAGE_TARNAME "bfd"
+
+/* Define to the home page for this package. */
+#define PACKAGE_URL ""
+
+/* Define to the version of this package. */
+#define PACKAGE_VERSION "2.28.0"
+
+/* The size of `char', as computed by sizeof. */
+/* #undef SIZEOF_CHAR */
+
+/* The size of `int', as computed by sizeof. */
+/* #undef SIZEOF_INT */
+
+/* The size of `long', as computed by sizeof. */
+#define SIZEOF_LONG 8
+
+/* The size of `long long', as computed by sizeof. */
+#define SIZEOF_LONG_LONG 8
+
+/* The size of `off_t', as computed by sizeof. */
+#define SIZEOF_OFF_T 8
+
+/* The size of `short', as computed by sizeof. */
+/* #undef SIZEOF_SHORT */
+
+/* The size of `void *', as computed by sizeof. */
+#define SIZEOF_VOID_P 8
+
+/* Define to 1 if you have the ANSI C header files. */
+#define STDC_HEADERS 1
+
+/* Define if you can safely include both <string.h> and <strings.h>. */
+#define STRING_WITH_STRINGS 1
+
+/* Define to 1 if you can safely include both <sys/time.h> and <time.h>. */
+#define TIME_WITH_SYS_TIME 1
+
+/* Name of host specific header file to include in trad-core.c. */
+/* #undef TRAD_HEADER */
+
+/* Define if 64-bit archives should always be used. */
+/* #undef USE_64_BIT_ARCHIVE */
+
+/* Use b modifier when opening binary files? */
+/* #undef USE_BINARY_FOPEN */
+
+/* Define if we should use leading underscore on 64 bit mingw targets */
+/* #undef USE_MINGW64_LEADING_UNDERSCORES */
+
+/* Use mmap if it's available? */
+/* #undef USE_MMAP */
+
+/* Define if we should default to creating read-only plt entries */
+#define USE_SECUREPLT 1
+
+/* Enable extensions on AIX 3, Interix.  */
+#ifndef _ALL_SOURCE
+# define _ALL_SOURCE 1
+#endif
+/* Enable GNU extensions on systems that have them.  */
+#ifndef _GNU_SOURCE
+# define _GNU_SOURCE 1
+#endif
+/* Enable threading extensions on Solaris.  */
+#ifndef _POSIX_PTHREAD_SEMANTICS
+# define _POSIX_PTHREAD_SEMANTICS 1
+#endif
+/* Enable extensions on HP NonStop.  */
+#ifndef _TANDEM_SOURCE
+# define _TANDEM_SOURCE 1
+#endif
+/* Enable general extensions on Solaris.  */
+#ifndef __EXTENSIONS__
+# define __EXTENSIONS__ 1
+#endif
+
+
+/* Version number of package */
+#define VERSION "2.28.0"
+
+/* Number of bits in a file offset, on hosts where this is settable. */
+/* #undef _FILE_OFFSET_BITS */
+
+/* Define for large files, on AIX-style hosts. */
+/* #undef _LARGE_FILES */
+
+/* Define to 1 if on MINIX. */
+/* #undef _MINIX */
+
+/* Define to 2 if the system does not provide POSIX.1 features except with
+   this defined. */
+/* #undef _POSIX_1_SOURCE */
+
+/* Define to 1 if you need to in order for `stat' and other things to work. */
+/* #undef _POSIX_SOURCE */
diff --git a/utils/gapy/gen-debug-info-src/ext/bfd/elf32-riscv.c b/utils/gapy/gen-debug-info-src/ext/bfd/elf32-riscv.c
new file mode 100644
index 000000000..58b805217
--- /dev/null
+++ b/utils/gapy/gen-debug-info-src/ext/bfd/elf32-riscv.c
@@ -0,0 +1,4322 @@
+#line 1 "elfnn-riscv.c" /* Line numbers below refer to the elfnn-riscv.c template; the name is kept relative so no build-host directory is embedded.  */
+/* RISC-V-specific support for 32-bit ELF.
+   Copyright (C) 2011-2017 Free Software Foundation, Inc.
+
+   Contributed by Andrew Waterman (andrew@sifive.com).
+   Based on TILE-Gx and MIPS targets.
+
+   PULP family support contributed by Eric Flamand (eflamand@iis.ee.ethz.ch) at ETH-Zurich
+   and Greenwaves Technologies (eric.flamand@greenwaves-technologies.com)
+
+   This file is part of BFD, the Binary File Descriptor library.
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; see the file COPYING3. If not,
+   see <http://www.gnu.org/licenses/>.  */
+
+/* This file handles RISC-V ELF targets.  */
+
+#include "sysdep.h"
+#include "bfd.h"
+#include "libbfd.h"
+#include "bfdlink.h"
+#include "genlink.h"
+#include "elf-bfd.h"
+#include "elfxx-riscv.h"
+#include "elf/riscv.h"
+#include "opcode/riscv.h"
+
+#define ARCH_SIZE 32 /* Bit width this file is instantiated for.  */
+
+#define MINUS_ONE ((bfd_vma)0 - 1) /* Zero minus one, computed in bfd_vma arithmetic.  */
+
+#define RISCV_ELF_LOG_WORD_BYTES (ARCH_SIZE == 32 ? 2 : 3) /* log2 of the word size in bytes: 2 when ARCH_SIZE is 32, otherwise 3.  */
+
+#define RISCV_ELF_WORD_BYTES (1 << RISCV_ELF_LOG_WORD_BYTES) /* Word size in bytes; 4 for the ARCH_SIZE of 32 above.  */
+
+/* The name of the dynamic interpreter.  This is put in the .interp
+   section.  Both the 64-bit and the 32-bit names are defined here.  */
+
+#define ELF64_DYNAMIC_INTERPRETER "/lib/ld.so.1"
+#define ELF32_DYNAMIC_INTERPRETER "/lib32/ld.so.1"
+
+#define ELF_ARCH			bfd_arch_riscv
+#define ELF_TARGET_ID			RISCV_ELF_DATA
+#define ELF_MACHINE_CODE		EM_RISCV
+#define ELF_MAXPAGESIZE			0x1000
+#define ELF_COMMONPAGESIZE		0x1000
+
+/* Linker argument -mComp.
+   Sets the linker to component mode: when the export section is generated, an exported symbol is recorded as an offset relative to the section head.
+   Otherwise (resident mode) the absolute address is used.  */
+
+#if 32==32 /* Constant-true condition in this 32-bit instantiation.  */
+bfd_boolean ComponentMode = FALSE; /* Starts out FALSE; presumably switched on by -mComp -- confirm against the option parser.  */
+
+/* Linker argument -mDIE: dump the import and export sections.  Starts out as 0.  */
+unsigned int DumpImportExportSections = 0;
+#endif
+
+/* The RISC-V linker needs to keep track of the number of relocs that it
+   decides to copy as dynamic relocs in check_relocs for each symbol.
+   This is so that it can later discard them if they are found to be
+   unnecessary.  We store the information in a field extending the
+   regular ELF linker hash table.  */
+
+struct riscv_elf_dyn_relocs /* One list node of per-section dynamic reloc counts for a symbol.  */
+{
+  struct riscv_elf_dyn_relocs *next; /* Following node in the singly linked list; NULL ends the list.  */
+
+  /* The input section of the reloc.  Nodes are matched on this field when two lists are merged.  */
+  asection *sec;
+
+  /* Total number of relocs copied for the input section.  */
+  bfd_size_type count;
+
+  /* Number of pc-relative relocs copied for the input section.  */
+  bfd_size_type pc_count;
+};
+
+/* RISC-V ELF linker hash entry: the generic ELF entry followed by RISC-V specific bookkeeping.  */
+
+struct riscv_elf_link_hash_entry
+{
+  struct elf_link_hash_entry elf; /* Generic part; entries are cast between this type and the RISC-V type, so it is the first member.  */
+
+  /* Track dynamic relocs copied for this symbol.  NULL in a freshly created entry.  */
+  struct riscv_elf_dyn_relocs *dyn_relocs;
+
+#define GOT_UNKNOWN     0 /* Value stored in a freshly created entry.  */
+#define GOT_NORMAL      1
+#define GOT_TLS_GD      2
+#define GOT_TLS_IE      4
+#define GOT_TLS_LE      8 /* The non-zero values are distinct bits; presumably they may be OR-ed together -- confirm at the use sites.  */
+  char tls_type; /* One of (or a combination of) the GOT_* values above.  */
+};
+/* View ENT as a RISC-V ELF linker hash entry (plain pointer cast, no checking).  */
+#define riscv_elf_hash_entry(ent) \
+  ((struct riscv_elf_link_hash_entry *)(ent))
+
+struct _bfd_riscv_elf_obj_tdata /* Per-BFD target data: the generic ELF tdata plus a RISC-V extension.  */
+{
+  struct elf_obj_tdata root; /* Generic ELF part; first member because tdata.any is cast to this type.  */
+
+  /* tls_type for each local got entry; indexed by local symbol index.  */
+  char *local_got_tls_type;
+};
+/* View ABFD's tdata.any pointer as the RISC-V object tdata.  */
+#define _bfd_riscv_elf_tdata(abfd) \
+  ((struct _bfd_riscv_elf_obj_tdata *) (abfd)->tdata.any)
+/* The local_got_tls_type array of ABFD.  */
+#define _bfd_riscv_elf_local_got_tls_type(abfd) \
+  (_bfd_riscv_elf_tdata (abfd)->local_got_tls_type)
+/* The tls_type slot (an lvalue) of hash entry H, or of local symbol SYMNDX of ABFD when H is NULL.  */
+#define _bfd_riscv_elf_tls_type(abfd, h, symndx)		\
+  (*((h) != NULL ? &riscv_elf_hash_entry (h)->tls_type		\
+     : &_bfd_riscv_elf_local_got_tls_type (abfd) [symndx]))
+/* True when BFD has the ELF flavour, has ELF tdata, and its object id is RISCV_ELF_DATA.  */
+#define is_riscv_elf(bfd)				\
+  (bfd_get_flavour (bfd) == bfd_target_elf_flavour	\
+   && elf_tdata (bfd) != NULL				\
+   && elf_object_id (bfd) == RISCV_ELF_DATA)
+
+#include "elf/common.h"
+#include "elf/internal.h"
+
+struct riscv_elf_link_hash_table /* The generic ELF linker hash table plus RISC-V additions.  */
+{
+  struct elf_link_hash_table elf; /* Generic part; the link_info hash pointer is cast to the RISC-V type, so it is the first member.  */
+
+  /* Short-cuts to get to dynamic linker sections.  This one is the ".tdata.dyn" section made for non-PIC links.  */
+  asection *sdyntdata;
+
+  /* Small local sym to section mapping cache.  */
+  struct sym_cache sym_cache;
+};
+
+
+/* Get the RISC-V ELF linker hash table from a link_info structure.  Yields NULL when the table's id is not RISCV_ELF_DATA.  */
+#define riscv_elf_hash_table(p) \
+  (elf_hash_table_id ((struct elf_link_hash_table *) ((p)->hash)) \
+  == RISCV_ELF_DATA ? ((struct riscv_elf_link_hash_table *) ((p)->hash)) : NULL)
+
+static void
+riscv_info_to_howto_rela (bfd *abfd ATTRIBUTE_UNUSED, arelent *cache_ptr,
+			  Elf_Internal_Rela *dst)
+{
+  bfd_vma r_type = ELF32_R_TYPE (dst->r_info); /* Relocation type field of DST's r_info.  */
+  cache_ptr->howto = riscv_elf_rtype_to_howto (r_type); /* Record the matching howto in CACHE_PTR.  */
+}
+
+bfd_boolean _bfd_riscv_elf_final_link (bfd *, struct bfd_link_info *);
+
+static void
+riscv_elf_append_rela (bfd *abfd, asection *s, Elf_Internal_Rela *rel)
+{
+  /* Swap REL out into the next free RELA slot of S and count it.  */
+  const struct elf_backend_data *backend = get_elf_backend_data (abfd);
+  bfd_byte *slot = s->contents + (s->reloc_count * backend->s->sizeof_rela);
+
+  s->reloc_count++;
+  backend->s->swap_reloca_out (abfd, rel, slot);
+}
+
+/* PLT/GOT stuff: instruction counts and byte sizes of the PLT templates and GOT slots.  */
+
+#define PLT_HEADER_INSNS 8 /* Instructions written by riscv_make_plt_header.  */
+#define PLT_ENTRY_INSNS 4 /* Instructions written by riscv_make_plt_entry.  */
+#define PLT_HEADER_SIZE (PLT_HEADER_INSNS * 4) /* In bytes; each instruction is one 32-bit word.  */
+#define PLT_ENTRY_SIZE (PLT_ENTRY_INSNS * 4) /* In bytes.  */
+
+#define GOT_ENTRY_SIZE RISCV_ELF_WORD_BYTES /* One word per GOT slot.  */
+
+#define GOTPLT_HEADER_SIZE (2 * GOT_ENTRY_SIZE) /* Two slots are reserved at the start of .got.plt.  */
+
+#define sec_addr(sec) ((sec)->output_section->vma + (sec)->output_offset) /* Output address of SEC; SEC is evaluated twice.  */
+
+static bfd_vma
+riscv_elf_got_plt_val (bfd_vma plt_index, struct bfd_link_info *info)
+{
+  asection *gotplt = riscv_elf_hash_table (info)->elf.sgotplt; /* The .got.plt section.  */
+  return sec_addr (gotplt) + GOTPLT_HEADER_SIZE + (plt_index * GOT_ENTRY_SIZE); /* Section address, plus reserved header, plus PLT_INDEX slots.  */
+}
+
+#if ARCH_SIZE == 32 /* Pick the register-width load for the "l[w|d]" slots of the PLT templates.  */
+# define MATCH_LREG MATCH_LW /* 32-bit: word load.  */
+#else
+# define MATCH_LREG MATCH_LD /* Otherwise: double-word load.  */
+#endif
+
+/* Generate a PLT header.  */
+
+static void
+riscv_make_plt_header (bfd_vma gotplt_addr, bfd_vma addr, uint32_t *entry)
+{
+  const bfd_vma hi = RISCV_PCREL_HIGH_PART (gotplt_addr, addr);
+  const bfd_vma lo = RISCV_PCREL_LOW_PART (gotplt_addr, addr);
+  uint32_t *insn = entry;
+  /* Emit the eight header instructions in order; HI and LO are the PC-relative parts of GOTPLT_ADDR as seen from ADDR.  */
+  /* auipc  t2, %hi(.got.plt) */
+  *insn++ = RISCV_UTYPE (AUIPC, X_T2, hi);
+  /* sub    t1, t1, t3               # shifted .got.plt offset + hdr size + 12 */
+  *insn++ = RISCV_RTYPE (SUB, X_T1, X_T1, X_T3);
+  /* l[w|d] t3, %lo(.got.plt)(t2)    # _dl_runtime_resolve */
+  *insn++ = RISCV_ITYPE (LREG, X_T3, X_T2, lo);
+  /* addi   t1, t1, -(hdr size + 12) # shifted .got.plt offset */
+  *insn++ = RISCV_ITYPE (ADDI, X_T1, X_T1, -(PLT_HEADER_SIZE + 12));
+  /* addi   t0, t2, %lo(.got.plt)    # &.got.plt */
+  *insn++ = RISCV_ITYPE (ADDI, X_T0, X_T2, lo);
+  /* srli   t1, t1, log2(16/PTRSIZE) # .got.plt offset */
+  *insn++ = RISCV_ITYPE (SRLI, X_T1, X_T1, 4 - RISCV_ELF_LOG_WORD_BYTES);
+  /* l[w|d] t0, PTRSIZE(t0)          # link map */
+  *insn++ = RISCV_ITYPE (LREG, X_T0, X_T0, RISCV_ELF_WORD_BYTES);
+  /* jr     t3 */
+  *insn++ = RISCV_ITYPE (JALR, 0, X_T3, 0);
+}
+
+/* Generate a PLT entry.  */
+
+static void
+riscv_make_plt_entry (bfd_vma got, bfd_vma addr, uint32_t *entry)
+{
+  bfd_vma got_hi = RISCV_PCREL_HIGH_PART (got, addr);
+  bfd_vma got_lo = RISCV_PCREL_LOW_PART (got, addr);
+  /* auipc t3, %hi(.got.plt entry); l[w|d] t3, %lo(.got.plt entry)(t3) */
+  entry[0] = RISCV_UTYPE (AUIPC, X_T3, got_hi);
+  entry[1] = RISCV_ITYPE (LREG,  X_T3, X_T3, got_lo);
+  /* jalr t1, t3 */
+  entry[2] = RISCV_ITYPE (JALR, X_T1, X_T3, 0);
+  /* nop */
+  entry[3] = RISCV_NOP;
+}
+
+/* Create an entry in an RISC-V ELF linker hash table.  */
+
+static struct bfd_hash_entry *
+link_hash_newfunc (struct bfd_hash_entry *entry,
+		   struct bfd_hash_table *table, const char *string)
+{
+  struct riscv_elf_link_hash_entry *rv_entry;
+
+  /* A subclass may already have allocated the storage; otherwise
+     allocate room for the RISC-V flavoured entry here.  */
+  if (entry == NULL)
+    {
+      entry = bfd_hash_allocate (table, sizeof (*rv_entry));
+      if (entry == NULL)
+	return NULL;
+    }
+
+  /* Let the generic ELF layer initialise its part of the entry.  */
+  entry = _bfd_elf_link_hash_newfunc (entry, table, string);
+  if (entry == NULL)
+    return NULL;
+
+  /* Reset the RISC-V specific fields: no dynamic relocs recorded yet and
+     the GOT/TLS type still unknown.  */
+  rv_entry = (struct riscv_elf_link_hash_entry *) entry;
+  rv_entry->dyn_relocs = NULL;
+  rv_entry->tls_type = GOT_UNKNOWN;
+
+  return entry;
+}
+
+/* Create a RISC-V ELF linker hash table.  */
+
+static struct bfd_link_hash_table *
+riscv_elf_link_hash_table_create (bfd *abfd)
+{
+  struct riscv_elf_link_hash_table *htab;
+
+  /* Allocate the RISC-V flavoured table.  */
+  htab = (struct riscv_elf_link_hash_table *) bfd_zmalloc (sizeof (*htab));
+  if (htab == NULL)
+    return NULL;
+
+  /* On success hand back the generic link hash table embedded in it.  */
+  if (_bfd_elf_link_hash_table_init (&htab->elf, abfd, link_hash_newfunc,
+				     sizeof (struct riscv_elf_link_hash_entry),
+				     RISCV_ELF_DATA))
+    return &htab->elf.root;
+
+  /* Initialisation failed: release the allocation.  */
+  free (htab);
+  return NULL;
+}
+
+/* Create the .got section.  */
+
+static bfd_boolean
+riscv_elf_create_got_section (bfd *abfd, struct bfd_link_info *info)
+{
+  const struct elf_backend_data *backend = get_elf_backend_data (abfd);
+  struct elf_link_hash_table *elf_htab = elf_hash_table (info);
+  const char *relgot_name;
+  flagword sec_flags;
+  asection *relgot, *got, *gotplt;
+
+  /* Nothing to do when an earlier call already created the sections.  */
+  if (elf_htab->sgot != NULL)
+    return TRUE;
+
+  sec_flags = backend->dynamic_sec_flags;
+
+  /* Read-only relocation section for the GOT, named by reloc flavour.  */
+  relgot_name = backend->rela_plts_and_copies_p ? ".rela.got" : ".rel.got";
+  relgot = bfd_make_section_anyway_with_flags (abfd, relgot_name,
+					       sec_flags | SEC_READONLY);
+  if (relgot == NULL)
+    return FALSE;
+  if (!bfd_set_section_alignment (abfd, relgot, backend->s->log_file_align))
+    return FALSE;
+  elf_htab->srelgot = relgot;
+
+  /* The GOT itself; its first bytes are the backend's header.  */
+  got = bfd_make_section_anyway_with_flags (abfd, ".got", sec_flags);
+  if (got == NULL)
+    return FALSE;
+  if (!bfd_set_section_alignment (abfd, got, backend->s->log_file_align))
+    return FALSE;
+  elf_htab->sgot = got;
+  got->size += backend->got_header_size;
+
+  if (backend->want_got_plt)
+    {
+      /* Separate .got.plt section with room reserved for its header.  */
+      gotplt = bfd_make_section_anyway_with_flags (abfd, ".got.plt",
+						   sec_flags);
+      if (gotplt == NULL)
+	return FALSE;
+      if (!bfd_set_section_alignment (abfd, gotplt,
+				      backend->s->log_file_align))
+	return FALSE;
+      elf_htab->sgotplt = gotplt;
+      gotplt->size += GOTPLT_HEADER_SIZE;
+    }
+
+  if (backend->want_got_sym)
+    {
+      /* _GLOBAL_OFFSET_TABLE_ marks the start of .got.  It is defined here,
+	 not in the linker script, so it only exists when a GOT is created.  */
+      struct elf_link_hash_entry *got_sym
+	= _bfd_elf_define_linkage_sym (abfd, info, got,
+				       "_GLOBAL_OFFSET_TABLE_");
+      elf_htab->hgot = got_sym;
+      if (got_sym == NULL)
+	return FALSE;
+    }
+
+  return TRUE;
+}
+
+/* Set up the dynamic sections in DYNOBJ: the GOT sections first, then
+   the generic ELF dynamic sections, and finally ".tdata.dyn" when the
+   link is not position independent.  The resulting section pointers are
+   cached in the RISC-V hash table of INFO.  Returns FALSE if either
+   creation helper fails; aborts if an expected section is missing
+   afterwards.  */
+
+static bfd_boolean
+riscv_elf_create_dynamic_sections (bfd *dynobj,
+				   struct bfd_link_info *info)
+{
+  struct riscv_elf_link_hash_table *htab = riscv_elf_hash_table (info);
+  bfd_boolean non_pic;
+
+  BFD_ASSERT (htab != NULL);
+
+  if (!riscv_elf_create_got_section (dynobj, info)
+      || !_bfd_elf_create_dynamic_sections (dynobj, info))
+    return FALSE;
+
+  non_pic = !bfd_link_pic (info);
+  if (non_pic)
+    htab->sdyntdata
+      = bfd_make_section_anyway_with_flags (dynobj, ".tdata.dyn",
+					    SEC_ALLOC | SEC_THREAD_LOCAL);
+
+  /* Sections every link must have by now.  */
+  if (htab->elf.splt == NULL
+      || htab->elf.srelplt == NULL
+      || htab->elf.sdynbss == NULL)
+    abort ();
+
+  /* Sections additionally required for non-PIC links.  */
+  if (non_pic && (htab->elf.srelbss == NULL || htab->sdyntdata == NULL))
+    abort ();
+
+  return TRUE;
+}
+
+/* Copy the extra info we tack onto an elf_link_hash_entry.
+
+   Moves the RISC-V specific state (dynamic reloc counts and TLS type)
+   from the indirect symbol IND onto the direct symbol DIR, then lets
+   the generic ELF code copy the rest.  After the call IND no longer
+   owns a dyn_relocs list.  */
+
+static void
+riscv_elf_copy_indirect_symbol (struct bfd_link_info *info,
+				struct elf_link_hash_entry *dir,
+				struct elf_link_hash_entry *ind)
+{
+  struct riscv_elf_link_hash_entry *edir, *eind;
+
+  edir = (struct riscv_elf_link_hash_entry *) dir;
+  eind = (struct riscv_elf_link_hash_entry *) ind;
+
+  if (eind->dyn_relocs != NULL)
+    {
+      if (edir->dyn_relocs != NULL)
+	{
+	  struct riscv_elf_dyn_relocs **pp;
+	  struct riscv_elf_dyn_relocs *p;
+
+	  /* Add reloc counts against the indirect sym to the direct sym
+	     list.  Merge any entries against the same section.  */
+	  for (pp = &eind->dyn_relocs; (p = *pp) != NULL; )
+	    {
+	      struct riscv_elf_dyn_relocs *q;
+
+	      /* Look for a direct-symbol record for the same section; if
+		 found, fold P's counts into it and unlink P from the
+		 indirect list.  */
+	      for (q = edir->dyn_relocs; q != NULL; q = q->next)
+		if (q->sec == p->sec)
+		  {
+		    q->pc_count += p->pc_count;
+		    q->count += p->count;
+		    *pp = p->next;
+		    break;
+		  }
+	      /* No match: keep P in the indirect list and advance.  */
+	      if (q == NULL)
+		pp = &p->next;
+	    }
+	  /* PP now addresses the tail link of what remains of the
+	     indirect list; chain the direct list after it.  */
+	  *pp = edir->dyn_relocs;
+	}
+
+      /* The (possibly merged) list now belongs to the direct symbol.  */
+      edir->dyn_relocs = eind->dyn_relocs;
+      eind->dyn_relocs = NULL;
+    }
+
+  /* Hand the TLS type over only for a true indirect symbol whose target
+     has no GOT references of its own.  */
+  if (ind->root.type == bfd_link_hash_indirect
+      && dir->got.refcount <= 0)
+    {
+      edir->tls_type = eind->tls_type;
+      eind->tls_type = GOT_UNKNOWN;
+    }
+  _bfd_elf_link_hash_copy_indirect (info, dir, ind);
+}
+
+/* Merge TLS_TYPE into the GOT/TLS access type recorded for global
+   symbol H or, when H is NULL, for local symbol SYMNDX of ABFD.
+   Returns FALSE after reporting an error if the symbol ends up being
+   accessed both as a normal and as a thread-local symbol, TRUE
+   otherwise.  */
+
+static bfd_boolean
+riscv_elf_record_tls_type (bfd *abfd, struct elf_link_hash_entry *h,
+			   unsigned long symndx, char tls_type)
+{
+  char *slot = &_bfd_riscv_elf_tls_type (abfd, h, symndx);
+  char merged = *slot | tls_type;
+
+  /* The merged value is stored even when it turns out to conflict.  */
+  *slot = merged;
+
+  if ((merged & GOT_NORMAL) == 0 || (merged & ~GOT_NORMAL) == 0)
+    return TRUE;
+
+  (*_bfd_error_handler)
+    (_("%B: `%s' accessed both as normal and thread local symbol"),
+     abfd, h ? h->root.root.string : "<local>");
+  return FALSE;
+}
+
+/* Record one more GOT reference, either against global symbol H or,
+   when H is NULL, against local symbol SYMNDX of ABFD.  The GOT
+   sections are created on first use.  The per-bfd local arrays
+   (refcounts followed by one TLS-type byte per local symbol) are
+   allocated lazily in a single zeroed block.  Returns FALSE if section
+   creation or allocation fails.  */
+
+static bfd_boolean
+riscv_elf_record_got_reference (bfd *abfd, struct bfd_link_info *info,
+				struct elf_link_hash_entry *h, long symndx)
+{
+  struct riscv_elf_link_hash_table *htab = riscv_elf_hash_table (info);
+  Elf_Internal_Shdr *symtab_hdr = &elf_tdata (abfd)->symtab_hdr;
+
+  if (htab->elf.sgot == NULL
+      && !riscv_elf_create_got_section (htab->elf.dynobj, info))
+    return FALSE;
+
+  if (h == NULL)
+    {
+      /* This is a global offset table entry for a local symbol.  */
+      if (elf_local_got_refcounts (abfd) == NULL)
+	{
+	  bfd_size_type bytes = symtab_hdr->sh_info * (sizeof (bfd_vma) + 1);
+	  void *block = bfd_zalloc (abfd, bytes);
+
+	  if (block == NULL)
+	    return FALSE;
+	  elf_local_got_refcounts (abfd) = block;
+
+	  /* The TLS-type bytes live right after the refcount array.  */
+	  _bfd_riscv_elf_local_got_tls_type (abfd)
+	    = (char *) (elf_local_got_refcounts (abfd) + symtab_hdr->sh_info);
+	}
+      elf_local_got_refcounts (abfd) [symndx] += 1;
+    }
+  else
+    h->got.refcount += 1;
+
+  return TRUE;
+}
+
+/* Report that relocation R_TYPE in ABFD, against H (or against a local
+   symbol when H is NULL), cannot be used in a shared object.  Sets the
+   BFD error to bfd_error_bad_value and always returns FALSE, so callers
+   can simply return the result.  */
+
+static bfd_boolean
+bad_static_reloc (bfd *abfd, unsigned r_type, struct elf_link_hash_entry *h)
+{
+  const char *reloc_name = riscv_elf_rtype_to_howto (r_type)->name;
+  const char *sym_name = (h == NULL) ? "a local symbol" : h->root.root.string;
+
+  (*_bfd_error_handler)
+    (_("%B: relocation %s against `%s' can not be used when making a shared "
+       "object; recompile with -fPIC"),
+      abfd, reloc_name, sym_name);
+  bfd_set_error (bfd_error_bad_value);
+  return FALSE;
+}
+
+/* Pulp add-on for proprietary dynamic relocation.  */
+
+/* One relocation recorded against an imported symbol; the references
+   of a symbol form a singly linked list through Next.  */
+typedef struct PulpImportRef {
+        Elf_Internal_Rela       Rel;
+        struct PulpImportRef    *Next;
+} PulpImportRef;
+
+/* One imported symbol: its name, the relocation count gathered during
+   the collect pass (RelocCount), the list of recorded relocations (Ref)
+   and the link to the next entry of the same hash bucket (Next).  */
+typedef struct PulpImportEntry {
+        char                    *Name;
+	int			RelocCount;
+        PulpImportRef           *Ref;
+        struct PulpImportEntry  *Next;
+} PulpImportEntry;
+
+/* One exported symbol: its name, an address field (initialised to 0
+   when the entry is created) and the link to the next entry of the
+   same hash bucket.  */
+typedef struct PulpExportEntry {
+        char                    *Name;
+	unsigned int		Address;
+        struct PulpExportEntry  *Next;
+} PulpExportEntry;
+
+/* Number of buckets in the import and export hash tables; hash_sdbm
+   reduces its result modulo this value.  */
+#define HASH_IMPORT_E 1024
+
+/* Bucket heads of the chained hash tables, indexed by hash_sdbm (name).  */
+static PulpImportEntry * ImportEntries[HASH_IMPORT_E];
+static PulpExportEntry * ExportEntries[HASH_IMPORT_E];
+
+/* Entry symbol registered through PulpRegisterSymbolEntry, and whether
+   it was given on the command line.  */
+static struct bfd_sym_chain ComponentEntry;
+static bfd_boolean ComponentEntryProvided;
+
+#if 32 == 32
+/* Remember the component entry symbol ENTRYSYMB (copied by value) and
+   whether it was supplied on the command line (ENTRYONCMDLINE).  */
+void PulpRegisterSymbolEntry(struct bfd_sym_chain EntrySymb, bfd_boolean EntryOnCmdLine)
+{
+	ComponentEntryProvided = EntryOnCmdLine;
+	ComponentEntry = EntrySymb;
+}
+#endif
+
+
+/* Hash the NUL-terminated string STR with the sdbm recurrence
+   (hash = c + (hash << 6) + (hash << 16) - hash) and reduce the result
+   to a bucket index in [0, HASH_IMPORT_E).  */
+static unsigned long hash_sdbm(const char *str)
+{
+	unsigned long acc = 0;
+	const char *p;
+
+	for (p = str; *p != 0; p++) {
+		int ch = *p;	/* plain char widened to int, as the recurrence expects */
+
+		acc = ch + (acc << 6) + (acc << 16) - acc;
+	}
+	return acc % HASH_IMPORT_E;
+}
+
+/* Return TRUE if NAME is present in the export table, FALSE otherwise.  */
+static bfd_boolean ExportLookup(const char *Name)
+{
+	const PulpExportEntry *Cur;
+
+	for (Cur = ExportEntries[hash_sdbm(Name)]; Cur != NULL; Cur = Cur->Next) {
+		if (strcmp(Cur->Name, Name) == 0) return TRUE;
+	}
+	return FALSE;
+}
+
+#if 32 == 32
+/* Add NAME to the export table unless it is already present.
+
+   A new entry is appended to the end of its hash bucket with a private
+   copy of NAME and an Address of 0.  Returns TRUE when NAME is in the
+   table on return (newly added or already there), FALSE if memory
+   could not be allocated; in that case the table is left unchanged and
+   nothing is leaked.  */
+bfd_boolean InsertExportEntry(const char *Name)
+
+{
+	unsigned int Index = hash_sdbm(Name);
+	PulpExportEntry *PtEntry = ExportEntries[Index];
+	PulpExportEntry *PtPrevEntry = NULL;
+
+	/* Walk the bucket, remembering the last node so a new entry can be appended. */
+	while (PtEntry && (strcmp(PtEntry->Name, Name) != 0)) {
+		PtPrevEntry = PtEntry; PtEntry = PtEntry->Next;
+	}
+	if (PtEntry == NULL) {
+		PtEntry = (PulpExportEntry *) bfd_malloc (sizeof (PulpExportEntry));
+		if (PtEntry == NULL) return FALSE;
+		PtEntry->Name = (char *) bfd_malloc (sizeof (char) * (strlen(Name)+1));
+		if (PtEntry->Name == NULL) {
+			/* The node is not linked into the table yet: release it instead of leaking it. */
+			free(PtEntry);
+			return FALSE;
+		}
+		strcpy(PtEntry->Name, Name);
+		PtEntry->Address = 0; PtEntry->Next = NULL;
+		if (PtPrevEntry) PtPrevEntry->Next = PtEntry; else ExportEntries[Index] = PtEntry;
+	}
+	return TRUE;
+}
+
+/* Compute the size in bytes of the .pulp.export section for the current
+   export table: a 4 byte header, then for every name a one byte section
+   prefix, the name and its NUL terminator, padding up to a multiple of
+   4, and finally one 4 byte value per name.  The number of exported
+   names is stored in *ENTRYCOUNT when ENTRYCOUNT is not NULL.  Returns 0
+   when nothing is exported.  */
+unsigned int ExportSectionSize(unsigned int *EntryCount)
+{
+	unsigned int Bucket;
+	unsigned int NameBytes = 4;	/* Room for Number of Exported Symbs */
+	unsigned int Count = 0;
+	const PulpExportEntry *Cur;
+
+	for (Bucket = 0; Bucket < HASH_IMPORT_E; Bucket++) {
+		for (Cur = ExportEntries[Bucket]; Cur != NULL; Cur = Cur->Next) {
+			Count++;
+			/* Null terminated and one byte prefix to pass section index */
+			NameBytes += strlen(Cur->Name) + 2;
+		}
+	}
+	if (EntryCount != NULL) *EntryCount = Count;
+	if (Count == 0) return 0;
+
+	/* Round the name area up to 4 bytes, then add the value area. */
+	return ((NameBytes + 3u) & ~3u) + Count*4;
+}
+
+#endif
+
+/* Free every entry of the export table (names included) and reset all
+   buckets to empty.  Always returns TRUE.  */
+static bfd_boolean ReleaseExportEntry(void)
+{
+	int Bucket;
+
+	for (Bucket = 0; Bucket < HASH_IMPORT_E; Bucket++) {
+		PulpExportEntry *Cur = ExportEntries[Bucket];
+
+		while (Cur != NULL) {
+			PulpExportEntry *Following = Cur->Next;
+
+			free(Cur->Name);
+			free(Cur);
+			Cur = Following;
+		}
+		ExportEntries[Bucket] = NULL;
+	}
+	return TRUE;
+}
+
+/* Record a relocation REL against the imported symbol NAME.
+
+   The import entry for NAME is created on first use.  Then:
+
+   - Collect != 0 (sizing pass): only the entry's RelocCount is updated.
+     A R_RISCV_LO12_I that immediately follows, 4 bytes further, a
+     R_RISCV_HI20 recorded for the same entry by the previous collect
+     call is treated as the second half of a HI20/LO12_I pair and is not
+     counted again.  OutOffset is ignored.
+
+   - Collect == 0: a copy of REL is appended to the entry's reference
+     list with r_info reduced to the relocation type and r_offset moved
+     by OutOffset.
+
+   Returns FALSE on allocation failure, TRUE otherwise.
+
+   Compared with the previous implementation: the "same entry" test is
+   now made before the last-entry bookkeeping is overwritten (it used
+   to be always true), the previous relocation is remembered by value
+   instead of through a pointer (which was NULL on the first call and
+   could outlive the reloc array it pointed into), and the entry node
+   is released when its name cannot be allocated.  */
+static bfd_boolean InsertImportEntry(const char *Name, Elf_Internal_Rela *Rel, bfd_vma OutOffset, bfd_boolean Collect)
+
+{
+	unsigned int Index = hash_sdbm(Name);
+	PulpImportEntry *PtEntry = ImportEntries[Index];
+	PulpImportEntry *PtPrevEntry = NULL;
+	PulpImportRef *Ref, *PtRef;
+	/* State of the previous collect call, used to detect HI20/LO12_I pairs. */
+	static PulpImportEntry *LastEntry = NULL;
+	static bfd_boolean LastRelValid = FALSE;
+	static unsigned int LastRelType = 0;
+	static bfd_vma LastRelOffset = 0;
+
+	while (PtEntry && (strcmp(PtEntry->Name, Name) != 0)) {
+		PtPrevEntry = PtEntry; PtEntry = PtEntry->Next;
+	}
+	if (PtEntry == NULL) {
+		PtEntry = (PulpImportEntry *) bfd_malloc (sizeof (PulpImportEntry));
+		if (PtEntry == NULL) return FALSE;
+		PtEntry->Name = (char *) bfd_malloc (sizeof (char) * (strlen(Name)+1));
+		if (PtEntry->Name == NULL) {
+			/* Not linked into the table yet: release the node instead of leaking it. */
+			free(PtEntry);
+			return FALSE;
+		}
+		strcpy(PtEntry->Name, Name);
+		PtEntry->Ref = NULL; PtEntry->Next = NULL;
+		PtEntry->RelocCount = 0;
+		if (PtPrevEntry) PtPrevEntry->Next = PtEntry; else ImportEntries[Index] = PtEntry;
+	}
+	if (Collect) {
+		unsigned int RelType = ELF32_R_TYPE(Rel->r_info);
+		bfd_boolean SecondHalfOfPair =
+			(RelType == R_RISCV_LO12_I) && LastRelValid && (LastEntry == PtEntry) &&
+			(LastRelType == R_RISCV_HI20) && ((Rel->r_offset - LastRelOffset) == 4);
+
+		if (!SecondHalfOfPair) PtEntry->RelocCount = PtEntry->RelocCount + 1;
+
+		/* Remember this relocation for the next call; values are copied so
+		   nothing here depends on the lifetime of the caller's reloc array. */
+		LastEntry = PtEntry;
+		LastRelType = RelType;
+		LastRelOffset = Rel->r_offset;
+		LastRelValid = TRUE;
+		return TRUE;
+	}
+
+	Ref = (PulpImportRef *) bfd_malloc (sizeof (PulpImportRef));
+	if (Ref == NULL) return FALSE;
+	Ref->Rel = *Rel; Ref->Next = NULL;
+	/* Keep only the relocation type; the symbol index is not needed any more. */
+	Ref->Rel.r_info = ELF32_R_TYPE(Rel->r_info);
+	Ref->Rel.r_offset = Rel->r_offset + OutOffset;
+
+	/* Append at the tail so references stay in insertion order. */
+	PtRef = PtEntry->Ref;
+	while (PtRef && PtRef->Next != NULL) PtRef = PtRef->Next;
+	if (PtRef) PtRef->Next = Ref; else PtEntry->Ref = Ref;
+	return TRUE;
+}
+
+/* Free every entry of the import table, including each entry's name
+   and its list of recorded relocations, and reset all buckets to empty.
+   Always returns TRUE.  */
+static bfd_boolean ReleaseImportEntry(void)
+{
+	int Bucket;
+
+	for (Bucket = 0; Bucket < HASH_IMPORT_E; Bucket++) {
+		PulpImportEntry *Cur = ImportEntries[Bucket];
+
+		while (Cur != NULL) {
+			PulpImportEntry *Following = Cur->Next;
+			PulpImportRef *R = Cur->Ref;
+
+			while (R != NULL) {
+				PulpImportRef *FollowingRef = R->Next;
+
+				free(R);
+				R = FollowingRef;
+			}
+			free(Cur->Name);
+			free(Cur);
+			Cur = Following;
+		}
+		ImportEntries[Bucket] = NULL;
+	}
+	return TRUE;
+}
+
+/*
+
+N imported symbols
+
+Structure of .pulp.import.names Section:
+----------------------------------------
+We use only Type 0
+	Len(Name) = Length(Name)+1 				Null terminated string
+	Size:	Pad4((N+1)*4 + Sum(j:1..N){Len(Namej)})		If Type=0
+	     	(N+1)*4						If Type=1
+
+		Base						Bit0: 		Section Type: 0 with names, 1 uses pre resolved indexes
+								Bit1:31:	(Section size) / 4 Always 4 byte aligned
+
+		Base						Bit  0:11 	NumberOfImports
+								Bit 12:31 	(Size of Names Section) / 4.
+
+	Type=0 (Names)
+		Base+4						Name1_Index = Base+4*(N+1)
+		Base+8						Name2_Index = Base+4*(N+1)+Len(Name1)
+		...
+		Base+4*(i)					Namei_Index = Base+4*(N+1)+Sum(j:1..(i-1)){Len(Namej)}
+		...
+		Base+4*(N)					NameN_Index = Base+4*(N+1)+Sum(j:1..(N-1)){Len(Namej)}
+		Base+4*(N+1)					Name1
+		Base+4*(N+1)+Len(Name1)				Name2
+		...
+		Base+4*(N+1)+Sum(j:1..(i-1)){Len(Namej)}	Namei	
+		...
+		Base+4*(N+1)+Sum(j:1..(N-1)){Len(Namej)}	NameN	
+		Pad till next address aligned on 4 bytes
+	Type=1 (Pre resolved indexes)
+		Base+4						Name1_Index (points to corresponding name in .export)
+		Base+8						Name2_Index (points to corresponding name in .export)
+		...
+		Base+4*(i)					Namei_Index (points to corresponding name in .export)
+		...
+		Base+4*(N)					NameN_Index (points to corresponding name in .export)
+
+Structure of .pulp.import.reloc Section:
+----------------------------------------
+
+	Size for Namei:	4*(N_Reloc(i) + 1)
+	Total Size:	4 + Sum(j:1..N){4*(N_Reloc(j) + 1)}
+
+	Entry(i+1) = Entry(i) + 4*(N_Reloc(i) + 1)
+
+		Base			Bit  0:11 		NumberOfImports
+					Bit 12:31 		(Size of Relocs Section) / 4. Contains only words thus multiple of 4 bytes
+
+		Base+4			Name_Index(1)		In .pulp.import.names
+		Base+6			N_Reloc(1)		Number of reloc for this name
+		Base+8			Reloc			One reloc
+		Base+12			Reloc			One reloc
+		...
+		Base+4+4*N_Reloc(1)	Reloc			Size for relocs: N_Reloc(1)*4, Total for Name1: 4*(N_Reloc(1) + 1)
+		....
+
+	Reloc:
+		Reloc Type: 4 Bits  	=> Bit31 : Bit28
+			0	R_RISCV_JAL					Offset = @Name-pc
+					pc: 	jal (pc+Offset[20..1])		InsnBits[31:12] =>  I[20],I[10:1],I[11],I[19:12]
+			1	Pair of R_RISCV_HI20, R_RISCV_LO12_I
+					pc:	lui Reg,Hi20(Name)		InsBits[31:12] => @Name[31:12]
+					pc+4:	addi Reg, Reg, Lo12(Name)	InsBits[31:20] => @Name[11:0]
+			2	R_RISCV_HI20
+					pc:	lui Reg,Hi20(Name)		InsBits[31:12] => @Name[31:12]
+			3	R_RISCV_LO12_I
+					pc:	addi Reg, Reg, Lo12(Name)	InsBits[31:20] => @Name[11:0]
+			4	R_RISCV_LO12_S
+					pc:					InsnBits[31:25] => @Name[11:5], InsnBits[11:7] => @Name[4:0]
+
+		Reloc Offset: 28 Bits	=> Bit27 : Bit0				Offset from section base / 2.
+										On RiscV we can assume that offset is always a multiple of 2
+
+
+Structure of .pulp.export Section:
+----------------------------------
+
+	Total Size:	Pad4(4 + Sum(j=1..N){Len(Namej)+1}) + 4*N. Is a multiple of 4
+
+	Base						Bit0		0: Resident, 1: Component
+							Bit15:Bit1 	N: Number of exported names
+							Bit31:Bit16 	Offset/4 to first Value in this section. /4 since all entities are words
+	Base+4						Section Name1	Section (byte) in which name is defined, Null terminated name
+	Base+4+Len(Name1)				Section Name2
+	...
+	Base+4+Sum(j=1..i-1){Len(Namej)+1}		Section Namei
+	...
+	Base+4+Sum(j=1..N-1){Len(Namej)+1}		Section NameN
+
+	Base+4+Sum(j=1..N){Len(Namej+1)}+Pad4		Value1	Link time Offset for Name1	Pad4: Alignment to 4
+	...
+	Base+4+Sum(j=1..N){Len(Namej+1)}+Pad4+4*N	ValueN	Link time Offset for NameN
+	
+
+*/
+
+/* Relocation kinds stored in the top 4 bits (bits 31:28) of each reloc
+   word of .pulp.import.reloc.  */
+#define	IMPORT_REL_JAL			0
+#define	IMPORT_REL_HI20_LO12_I		1
+#define	IMPORT_REL_HI20			2
+#define	IMPORT_REL_LO12_I		3
+#define	IMPORT_REL_LO12_S		4
+
+/* Sizes, in bytes, of the fields of .pulp.import.names, as used by
+   PulpImportSectionsSize.  */
+#define	IMPORT_SECN_NAME_SZ		4
+#define	IMPORT_SECN_TYPE_SZ		4
+#define	IMPORT_SECN_NAME_INDEX_SZ	4
+
+/* Sizes, in bytes, of the fields of .pulp.import.reloc: the import
+   count header, the per-name index and reloc count halves, and one
+   reloc expression.  */
+#define	IMPORT_SECR_IMPORT_CNT_SZ	4
+#define	IMPORT_SECR_NAME_INDEX_SZ	2
+#define	IMPORT_SECR_REL_CNT_SZ		2
+#define	IMPORT_SECR_REL_EXPR_SZ		4
+
+/* Return a printable name for the import relocation kind REL (one of
+   the IMPORT_REL_* values), or "Unknown Rel" for any other value.  */
+static char *RelImage(unsigned int Rel)
+{
+	if (Rel == IMPORT_REL_JAL) return "REL_JAL";
+	if (Rel == IMPORT_REL_HI20_LO12_I) return "REL_HI20_LO12_I";
+	if (Rel == IMPORT_REL_HI20) return "REL_HI20";
+	if (Rel == IMPORT_REL_LO12_I) return "REL_LO12_I";
+	if (Rel == IMPORT_REL_LO12_S) return "REL_LO12_S";
+	return "Unknown Rel";
+}
+/* Number of bytes NAME occupies in .pulp.import.names: its length plus
+   the terminating NUL.  */
+static unsigned int PulpImportNameSize(const char *Name)
+{
+	return (unsigned int) (strlen(Name) + 1);
+}
+
+#if 32 == 32
+/* Compute the sizes of the two import sections for the current import
+   table.
+
+   Mode      0: names are stored in the names section, otherwise only
+             the index words are.
+   SecName   receives the size in bytes of .pulp.import.names, rounded
+             up to a multiple of 4.
+   SecReloc  receives the size in bytes of .pulp.import.reloc.
+   N_Import  receives the number of imported symbols taken into account.
+   Collect   non zero: use the RelocCount gathered during the collect
+             pass and ignore entries whose count is 0; zero: walk each
+             entry's reference list (a HI20 directly followed, 4 bytes
+             further, by a LO12_I counts as a single reloc) and ignore
+             entries without references.  */
+void PulpImportSectionsSize(int Mode, unsigned int *SecName, unsigned int *SecReloc, unsigned int *N_Import, bfd_boolean Collect)
+{
+	unsigned int Bucket;
+	unsigned int NamesBytes = IMPORT_SECN_TYPE_SZ;		/* Name Section Type */
+	unsigned int RelocBytes = IMPORT_SECR_IMPORT_CNT_SZ;	/* N Import */
+	unsigned int Count = 0;
+	PulpImportEntry *Cur;
+
+	for (Bucket = 0; Bucket < HASH_IMPORT_E; Bucket++) {
+		for (Cur = ImportEntries[Bucket]; Cur != NULL; Cur = Cur->Next) {
+			bfd_boolean Used = Collect ? (Cur->RelocCount != 0) : (Cur->Ref != NULL);
+
+			if (!Used) continue;
+			Count++;
+			/* Name index and reloc count halves in the reloc section, index word in the names section. */
+			RelocBytes += IMPORT_SECR_NAME_INDEX_SZ + IMPORT_SECR_REL_CNT_SZ;
+			NamesBytes += IMPORT_SECN_NAME_INDEX_SZ;
+			if (Mode == 0) NamesBytes += PulpImportNameSize(Cur->Name);
+
+			if (Collect) {
+				RelocBytes += Cur->RelocCount*IMPORT_SECR_REL_EXPR_SZ;
+			} else {
+				PulpImportRef *R = Cur->Ref;
+
+				while (R != NULL) {
+					bfd_boolean Paired = (R->Rel.r_info == R_RISCV_HI20) && R->Next &&
+							     (R->Next->Rel.r_info == R_RISCV_LO12_I) &&
+							     ((R->Next->Rel.r_offset - R->Rel.r_offset)==4);
+
+					RelocBytes += IMPORT_SECR_REL_EXPR_SZ;	/* Reloc Expr */
+					/* A paired LO12_I is covered by its HI20: step over it. */
+					R = Paired ? R->Next->Next : R->Next;
+				}
+			}
+		}
+	}
+	/* Force Names section size to be 4 bytes aligned */
+	*SecName = (NamesBytes + 3u) & ~3u;
+	*SecReloc = RelocBytes;
+	*N_Import = Count;
+}
+#endif
+
+/* Build the image of the .pulp.export section from the export table.
+
+   Section      receives a bfd_malloc'ed buffer holding the section
+                image, or NULL when there is nothing to export or when
+                the function fails.
+   SizeSection  receives the size of that image in bytes (0 when
+                *Section is NULL).
+   info         link info whose hash table is used to resolve the
+                exported names.
+
+   Image layout: a header word (bit 0: ComponentMode, bits 15:1: number
+   of names, bits 31:16: offset/4 of the first value), then for each
+   name one section byte (currently always 0) followed by the NUL
+   terminated name, zero padding up to a 4 byte boundary, and finally
+   one 32 bit value per name.  In component mode the value is the symbol
+   offset inside its output section, otherwise it is the symbol address.
+
+   Returns TRUE on success.  Returns FALSE, after reporting an error,
+   if memory cannot be allocated or if an exported name does not resolve
+   to a defined symbol; every buffer allocated here is released on those
+   paths.  */
+static bfd_boolean PulpExportCreateSection(unsigned int **Section, unsigned int *SizeSection, struct bfd_link_info *info)
+
+{
+	PulpExportEntry *PtEntry;
+	char *Base;
+	unsigned int *BaseI;
+	unsigned int *Entries = NULL;
+	unsigned int Entry = 0, Addr, k;
+	unsigned int Size = ExportSectionSize(&Entry);
+	int i;
+
+	*SizeSection = Size;
+	*Section = NULL;
+	if (Size == 0) return TRUE;
+
+	*Section = (unsigned int *) bfd_malloc (Size);
+	Entries = (unsigned int *) bfd_malloc (Entry*sizeof(unsigned int));
+
+	if (*Section == NULL || Entries == NULL) {
+	  	(*_bfd_error_handler) (_("Export Create Section, Can't allocate memory"));
+		goto Fail;
+	}
+	/* Zero the image so that the padding after the names is deterministic. */
+	memset(*Section, 0, Size);
+
+	/* First pass: lay out the names after the header word and collect the values. */
+	Entry = 0;
+	Base = (char *) (&(*Section)[1]);
+	Addr = 4;
+	for (i=0; i<HASH_IMPORT_E; i++) {
+		for (PtEntry = ExportEntries[i]; PtEntry; PtEntry = PtEntry->Next) {
+			size_t Len = strlen(PtEntry->Name);
+			struct bfd_link_hash_entry *h;
+
+			h = bfd_link_hash_lookup (info->hash, PtEntry->Name, FALSE, FALSE, TRUE);
+			/* u.def is only meaningful for defined symbols: treat anything else as not found. */
+			if (h == NULL
+			    || (h->type != bfd_link_hash_defined && h->type != bfd_link_hash_defweak)) {
+	  			(*_bfd_error_handler) (_("Export Create Section, Can't find symbol: %s"), PtEntry->Name);
+				goto Fail;
+			}
+			if (ComponentMode)
+				/* We don't want lma or vma added here, just an offset relative to the
+				   beginning of the output section in which the symbol is.  */
+				Entries[Entry++] = h->u.def.value + h->u.def.section->output_offset;
+			else
+				Entries[Entry++] = h->u.def.value + sec_addr (h->u.def.section);
+			Base[0] = 0; /* Here should come the section in which the symbol is defined */
+			memcpy(Base+1, PtEntry->Name, Len+1);	/* Name and its null termination */
+			Base += (Len+2); Addr += (Len+2);
+		}
+	}
+
+	/* Values start at the next 4 byte boundary.  The buffer comes from
+	   bfd_malloc, so aligning the offset also aligns the address.  */
+	if (Addr % 4) Addr = ((Addr>>2)+1)<<2;
+	BaseI = *Section + (Addr>>2);
+	for (k=0; k<Entry; k++) BaseI[k] = Entries[k];
+
+	(*Section)[0] = (ComponentMode&0x01) | ((Entry<<1)&0x0FFFE) | ((Addr>>2) << 16);
+	free(Entries);
+	return TRUE;
+
+Fail:
+	free(Entries);
+	free(*Section);
+	*Section = NULL;
+	*SizeSection = 0;
+	return FALSE;
+}
+
+/* Build the images of the .pulp.import.names and .pulp.import.reloc
+   sections from the import table.
+
+   Mode         1: only the name index words are written; otherwise the
+                name strings are written after the index table.
+   S_Name       receives the bfd_malloc'ed names image.
+   S_NameSize   receives its size in bytes.
+   S_Reloc      receives the bfd_malloc'ed reloc image.
+   S_RelocSize  receives its size in bytes.
+   NImport      receives the number of imported symbols.
+
+   Only entries that have at least one recorded reference are emitted.
+   Returns FALSE if an allocation fails or if a reference carries a
+   relocation type other than JAL, HI20, LO12_I or LO12_S.
+
+   NOTE(review): on the FALSE paths the buffers already stored in
+   *S_Name / *S_Reloc are not freed here - confirm the caller releases
+   them.  */
+static bfd_boolean PulpImportCreateNameAndRelocSections(int Mode,
+						 unsigned int **S_Name, unsigned int *S_NameSize,
+						 unsigned int **S_Reloc, unsigned int *S_RelocSize,
+						 unsigned int *NImport)
+
+{
+	PulpImportEntry *PtEntry;
+	PulpImportRef *PtRef;
+	unsigned int SecNameSize;
+	unsigned int SecRelocSize;
+	unsigned int *SecName;
+	unsigned int *SecReloc;
+	unsigned int N_Import;
+	int Skip = 0;
+	unsigned N_Imp = 0;
+	unsigned int HeadName, HeadRel;
+	unsigned int i;
+	static int Trace = 0;
+
+	/* Sizes are computed from the recorded references (Collect = FALSE). */
+	PulpImportSectionsSize(Mode, &SecNameSize, &SecRelocSize, &N_Import, FALSE);
+
+	SecName = (unsigned int *) bfd_malloc (SecNameSize);
+	SecReloc = (unsigned int *) bfd_malloc (SecRelocSize);
+	*S_Name = SecName; *S_NameSize = SecNameSize;
+	*S_Reloc = SecReloc; *S_RelocSize = SecRelocSize;
+
+	if (SecName == NULL || SecReloc == NULL) return FALSE;
+	// SecName[0]  = (Mode&0x1) | ((SecNameSize)<<1);
+
+	/* Clear both images word by word. */
+	for (i=0; i< (SecNameSize>>2); i++) SecName[i] = 0;
+	for (i=0; i< (SecRelocSize>>2); i++) SecReloc[i] = 0;
+
+	/* Names header: bits 11:0 number of imports, bits 31:12 section size / 4. */
+	SecName[0] = (N_Import&0x0FFF) | ((SecNameSize>>2)<<12);
+	/* Byte offset of the first name string: just after the header and the index table. */
+	HeadName = (1 + N_Import)*4;
+
+	if (Trace) fprintf(stderr, "Names: Size: %d, Relocs: Size: %d, N Imports: %d, Head Strings: %X\n", SecNameSize, SecRelocSize, N_Import, HeadName*4);
+
+	/* Pass 1: fill the name index table and, unless Mode is 1, the name strings. */
+	for (i=0; i<HASH_IMPORT_E; i++) {
+		if (ImportEntries[i] == NULL) continue;
+		for (PtEntry = ImportEntries[i]; PtEntry; PtEntry = PtEntry->Next) {
+			unsigned int j;
+			unsigned int NameSize;
+			char *Pt;
+
+			if (PtEntry->Ref == NULL) continue;
+
+			if (Trace) fprintf(stderr, "At: %5X Creating Name  Entry: %5d, String: %5X (%s)\n", (1+N_Imp)*4, N_Imp, HeadName, PtEntry->Name);
+			SecName[1+N_Imp] = HeadName; N_Imp++;
+			if (Mode == 1) continue;
+			NameSize = PulpImportNameSize(PtEntry->Name);
+			Pt = ((char *) SecName) + HeadName;
+			for (j=0; j<strlen(PtEntry->Name); j++) Pt[j] = PtEntry->Name[j];
+			Pt[j] = 0;
+			if (Trace) fprintf(stderr, "At: %5X Creating Name String: %s\n", HeadName, PtEntry->Name);
+			HeadName += NameSize;
+		}
+	}
+	/* Reloc header uses the same packing as the names header; HeadRel is a word index. */
+	SecReloc[0] = (N_Import&0x0FFF) | ((SecRelocSize>>2)<<12); N_Imp = 0; HeadRel = 1;
+	/* Pass 2: for each imported name, one head word followed by its reloc words. */
+	for (i=0; i<HASH_IMPORT_E; i++) {
+		unsigned int Base;
+		if (ImportEntries[i] == NULL) continue;
+		for (PtEntry = ImportEntries[i]; PtEntry; PtEntry = PtEntry->Next) {
+			unsigned int RelCount = 0;
+			if (PtEntry->Ref == NULL) continue;
+			/* Reserve the head word; it is filled once the reloc count is known. */
+			Base = HeadRel++;
+			for (PtRef = PtEntry->Ref; PtRef; PtRef = PtRef->Next) {
+				unsigned int Rel;
+				/* Second half of a HI20/LO12_I pair: already covered by the previous word. */
+				if (Skip) {
+					Skip = 0; continue;
+				}
+				if ((PtRef->Rel.r_info == R_RISCV_HI20) && PtRef->Next &&
+				    (PtRef->Next->Rel.r_info == R_RISCV_LO12_I) && ((PtRef->Next->Rel.r_offset - PtRef->Rel.r_offset)==4)) {
+					Skip = 1;
+					Rel = IMPORT_REL_HI20_LO12_I;
+				} else {
+					switch (PtRef->Rel.r_info) {
+						case R_RISCV_JAL: Rel = IMPORT_REL_JAL; break;
+						case R_RISCV_HI20: Rel = IMPORT_REL_HI20; break;
+						case R_RISCV_LO12_I: Rel = IMPORT_REL_LO12_I; break;
+						case R_RISCV_LO12_S: Rel = IMPORT_REL_LO12_S; break;
+						default: {
+								reloc_howto_type *howto = riscv_elf_rtype_to_howto (PtRef->Rel.r_info);
+								Rel = -1; /* Error */
+	  							(*_bfd_error_handler) (_("Unknown Relocation: %X (%s)"),
+											(int) PtRef->Rel.r_info,
+											howto?howto->name:"Unknown");
+								return FALSE;
+							}
+					}
+				}
+				if (Trace) fprintf(stderr, "At: %5X Adding   Rel for Entry: %5d => %8X [Rel:%8X, Offset: %8X]\n",
+						   HeadRel*4, N_Imp+1, (unsigned int) ((Rel<<28) | ((PtRef->Rel.r_offset>>1) & 0x0FFFFFFF)),
+						   (unsigned int) Rel, (unsigned int) PtRef->Rel.r_offset);
+				/* Reloc word: kind in bits 31:28, offset / 2 in bits 27:0. */
+				SecReloc[HeadRel++] = (Rel<<28) | ((PtRef->Rel.r_offset>>1) & 0x0FFFFFFF);
+				RelCount++;
+			}
+			/* Head word: low half is the byte offset of this name's index word in the
+			   names section, high half is the number of reloc words that follow. */
+			SecReloc[Base] = ((4*(N_Imp+1))&0x0FFFF) | ((RelCount<<16)&0xFFFF0000);
+			if (Trace) fprintf(stderr, "At: %5X Creating Rel     Entry: %5d, Rel Count: %d\n", Base*4, 4*(N_Imp+1), RelCount); 
+			N_Imp++;
+		}
+	}
+	*NImport = N_Import;
+	return TRUE;
+}
+
+/* Rebase every relocation offset of a .pulp.import.reloc image.
+
+   ImportRelocs points to the section image (nothing is done when it is
+   NULL) and BaseText is the byte amount added to each offset.  Offsets
+   are stored divided by 2 in the low 28 bits of each reloc word; the
+   relocation kind in the top 4 bits is left untouched.  According to
+   the original note this is used when ComponentMode=0, i.e. resident
+   mode, where the lma of the text section has to be added.  */
+
+static void AdjustRelocsImport(unsigned int *ImportRelocs, unsigned int BaseText)
+{
+	unsigned int ImportsLeft;
+	unsigned int *Word;
+
+	if (ImportRelocs == NULL) return;
+
+	/* Word 0 is the header (import count in bits 11:0); entries start at word 1. */
+	Word = ImportRelocs + 1;
+	for (ImportsLeft = ImportRelocs[0] & 0x0FFF; ImportsLeft > 0; ImportsLeft--) {
+		/* Head word of an entry: the reloc count sits in the upper half. */
+		unsigned int RelocsLeft = (*Word++ >> 16) & 0x0FFFF;
+
+		for (; RelocsLeft > 0; RelocsLeft--, Word++) {
+			unsigned int Kind = *Word >> 28;
+			unsigned int ByteOffset = ((*Word & 0x0FFFFFFF) << 1) + BaseText;
+
+			*Word = (Kind << 28) | ((ByteOffset >> 1) & 0x0FFFFFFF);
+		}
+	}
+}
+
+/* Print SECTION on stderr as a C array declaration named DECLNAME.
+
+   Size is the size of the data in bytes and Elem the element width in
+   bytes: 1 prints an unsigned char array, 2 an unsigned short int array
+   and 4 an unsigned int array; any other width prints nothing.  Values
+   are written in hexadecimal, five per line.  */
+static void DumpCEquiv(unsigned int *Section, unsigned int Size, unsigned int Elem, char *DeclName)
+{
+	const unsigned char *AsBytes = (const unsigned char *) Section;
+	const unsigned short *AsHalves = (const unsigned short *) Section;
+	unsigned Count = Size/Elem;
+	unsigned int k;
+
+	/* Declaration header; unsupported widths produce no output at all. */
+	if (Elem == 1) fprintf(stderr, "unsigned char %s[%d] = {\n\t", DeclName, Count);
+	else if (Elem == 2) fprintf(stderr, "unsigned short int %s[%d] = {\n\t", DeclName, Count);
+	else if (Elem == 4) fprintf(stderr, "unsigned int %s[%d] = {\n\t", DeclName, Count);
+	else return;
+
+	for (k = 0; k < Count; k++) {
+		unsigned int Value = (Elem == 1) ? AsBytes[k] : (Elem == 2) ? AsHalves[k] : Section[k];
+
+		fprintf(stderr, "0X%X, ", Value);
+		if (((k+1)%5)==0) fprintf(stderr, "\n\t");
+	}
+	fprintf(stderr, "\n};\n\n");
+}
+
+/* Debug helper: print a readable dump of the two import section images
+   on stderr.
+
+   ImportNames / SecNamesSize    image and byte size of .pulp.import.names.
+   ImportRelocs / SecRelocsSize  image and byte size of .pulp.import.relocs.
+   BaseText                      only printed in the relocs banner.
+
+   Nothing is printed when either image is NULL.  The sizes are only
+   used by the raw dump, which is compiled in but disabled (RawDump is
+   a static initialised to 0).  */
+static void DiassembleImports(unsigned int *ImportNames, unsigned SecNamesSize, unsigned int *ImportRelocs, unsigned int SecRelocsSize, unsigned int BaseText)
+
+{
+	static int RawDump = 0;
+	unsigned int Addr, N_Import;
+	unsigned int i, j;
+	char *Name;
+
+	if (ImportNames == NULL || ImportRelocs == NULL) {
+		return;
+	}
+	/* The import count is taken from the relocs header (bits 11:0). */
+	N_Import = ImportRelocs[0] & 0x0FFF;
+
+	fprintf(stderr, "Section: .pulp.import.names\n");
+	Addr = 0;
+	fprintf(stderr, "%8s  %17s %20s\n", "Offset", "Content", "Comment");
+	fprintf(stderr, "%8x: 0x%15X (NImport = %d, Section Size: 0x%X)\n",
+		Addr, ImportNames[Addr/4], ImportNames[Addr/4]&0x0FFF, (ImportNames[Addr/4]>>12)*4);
+	Addr += 4;
+	Name = (char *) &ImportNames[1];
+	/* Index table: one word per import; Name is advanced past it in step. */
+	for (i=0; i<N_Import; i++) {
+		fprintf(stderr, "%8x: 0x%15X (Import Symbol %5d, Name @ in this section)\n", Addr, ImportNames[Addr/4], i);
+		Addr += 4; Name += 4;
+	}
+	/* Name strings, stored back to back right after the index table. */
+	for (i=0; i<N_Import; i++) {
+		fprintf(stderr, "%8x: %17s (Import Symbol %5d)\n", Addr, Name, i);
+		Addr = Addr + strlen(Name) + 1; Name = Name + strlen(Name) + 1;
+	}
+
+	fprintf(stderr, "Section: .pulp.import.relocs, Mode=%s, BaseText=%X\n", ComponentMode?"Component":"Resident", BaseText);
+	Addr = 0;
+	fprintf(stderr, "%8s  %17s %20s\n", "Offset", "Content", "Comment");
+	fprintf(stderr, "%8x: 0x%15X (Number of Imported Symbols: %d, Section Size: 0x%X)\n",
+		Addr, ImportRelocs[Addr/4], ImportRelocs[Addr/4]&0x0FFF, ((ImportRelocs[Addr/4]>>12)&0x000FFFFF)<<2);
+	Addr += 4;
+	for (i=0; i<N_Import; i++) {
+		/* Head word: low half is the byte offset of the name index word, high half the reloc count. */
+		unsigned int Entry = (ImportRelocs[Addr/4] & 0x0FFFF);
+		unsigned int RelCount = (ImportRelocs[Addr/4]>>16) & 0x0FFFF;
+
+		/* Follow the index word to the name string inside the names image. */
+		Name = ((char *) ImportNames) + ImportNames[Entry>>2];
+		fprintf(stderr, "%8x: 0x%15X (Name @: 0x%6X, Reloc: %3d) %s\n", Addr, ImportRelocs[Addr/4], Entry, RelCount, Name);
+		Addr += 4;
+		for (j=0; j<RelCount; j++) {
+			unsigned int Rel = ImportRelocs[Addr/4];
+			/* Offsets are stored divided by 2 in the low 28 bits; the kind is in the top 4 bits. */
+			unsigned int Offset = ((Rel & 0x0FFFFFFF)<<1);
+			unsigned int Type = (Rel>>28);
+
+			fprintf(stderr, "%8x: 0x%15X (Offset: 0x%6X, Reloc: %s)\n", Addr, ImportRelocs[Addr/4], Offset, RelImage(Type));
+			Addr += 4;
+		}
+	}
+	if (RawDump) {
+		/* Word counts, rounded up when a byte size is not a multiple of 4. */
+		unsigned int NameSize = SecNamesSize>>2, RelocSize = SecRelocsSize>>2;
+		if (SecNamesSize % 4) NameSize++;
+		if (SecRelocsSize % 4) RelocSize++;
+		fprintf(stderr, "unsigned int CompNames[%d] = {\n\t", NameSize);
+		for (i=0; i<NameSize; i++) {
+			fprintf(stderr, "0X%X, ", ImportNames[i]);
+			if (((i+1)%5)==0) fprintf(stderr, "\n\t");
+		}
+		fprintf(stderr, "\n};\n");
+		fprintf(stderr, "unsigned int CompRelocs[%d] = {\n\t", RelocSize);
+		for (i=0; i<RelocSize; i++) {
+			fprintf(stderr, "0X%X, ", ImportRelocs[i]);
+			if (((i+1)%5)==0) fprintf(stderr, "\n\t");
+		}
+		fprintf(stderr, "\n};\n");
+	}
+}
+
+/* Debug helper: print a readable dump of a .pulp.export section image
+   on stderr.
+
+   Section is the image (nothing is printed when it is NULL) and
+   SectionSize its size in bytes, which is only used by the raw dump;
+   the raw dump is compiled in but disabled (RawDump is a static
+   initialised to 0).  */
+static void DiassembleExports(unsigned int *Section, unsigned int SectionSize)
+
+{
+	static int RawDump = 0;
+	unsigned int Entry;
+	char *Base;
+	unsigned int *BaseI;
+	unsigned long int Addr;
+	unsigned int i;
+
+	if (Section == NULL) return;
+
+	fprintf(stderr, "Section: .pulp.export\n");
+	/* Header word: bit 0 is the mode, bits 15:1 the number of exported names. */
+	Entry = (Section[0]&0x0FFFF)>>1;
+	Base = (char *) (&Section[1]);
+
+	/* Addr is 0 here, so Section[Addr] below reads the header word. */
+	Addr = 0;
+	fprintf(stderr, "%8s  %17s %20s\n", "Offset", "Content", "Comment");
+	fprintf(stderr, "%8x: 0x%15X (Type: %s, Number of Exported Symbols: %d, Base Linker Values: 0x%X, Section Size: 0x%X)\n",
+		(unsigned int) Addr, Section[Addr], (Section[Addr]&0x01)?"Component":"Resident",
+		(Section[Addr]&0x0FFFF)>>1, ((Section[Addr]>>16)<<2)&0x0FFFF,
+		((Section[Addr]&0x0FFFF)>>1)*4 + (((Section[Addr]>>16)<<2)&0x0FFFF));
+	Addr += 4;
+	/* Names: one section byte, then the NUL terminated name. */
+	for (i=0; i<Entry; i++) {
+		unsigned int Off = strlen(Base+1) + 2;
+		fprintf(stderr, "%8x: %17s Section: %2d (Exported Symbol %5d)\n", (unsigned int) Addr, Base+1, Base[0], i);
+		Base = Base + Off; Addr = Addr + Off;
+	}
+	/* Step over the padding; Addr is reassigned from the header just below. */
+	while ((unsigned long int) Base % 4) {
+		Base++; Addr++;
+	}
+	// BaseI = (unsigned int *) Base;
+	/* The values start at the word offset stored in the upper half of the header. */
+	Addr = (Section[0]>>16)<<2;
+	BaseI = Section + (Section[0]>>16);
+	for (i=0; i<Entry; i++) {
+		fprintf(stderr, "%8x: 0x%15X (Exported Symbol %5d, Offset in Section)\n", (unsigned int) Addr, BaseI[i], i);
+		Addr+=4;
+	}
+	if (RawDump) {
+		/* Word count, rounded up when the byte size is not a multiple of 4. */
+		unsigned int Size = SectionSize>>2;
+		if (SectionSize % 4) Size++;
+		fprintf(stderr, "unsigned int ExportSymb[%d] = {\n\t", Size);
+		for (i=0; i<Size; i++) {
+			fprintf(stderr, "0X%X, ", Section[i]);
+			if (((i+1)%5)==0) fprintf(stderr, "\n\t");
+		}
+		fprintf(stderr, "\n};\n");
+	}
+}
+
+
+/* Look through the relocs for a section during the first phase, and
+   allocate space in the global offset table or procedure linkage
+   table.  */
+
+static bfd_boolean
+riscv_elf_check_relocs (bfd *abfd, struct bfd_link_info *info,
+			asection *sec, const Elf_Internal_Rela *relocs)
+{
+  struct riscv_elf_link_hash_table *htab;
+  Elf_Internal_Shdr *symtab_hdr;
+  struct elf_link_hash_entry **sym_hashes;
+  Elf_Internal_Rela *rel;
+  asection *sreloc = NULL;
+
+  if (bfd_link_relocatable (info))
+    return TRUE;
+
+  htab = riscv_elf_hash_table (info);
+  symtab_hdr = &elf_tdata (abfd)->symtab_hdr;
+  sym_hashes = elf_sym_hashes (abfd);
+
+  if (htab->elf.dynobj == NULL)
+    htab->elf.dynobj = abfd;
+
+  for (rel = relocs; rel < relocs + sec->reloc_count; rel++)
+    {
+      unsigned int r_type;
+      unsigned long r_symndx;
+      struct elf_link_hash_entry *h;
+
+      r_symndx = ELF32_R_SYM (rel->r_info);
+      r_type = ELF32_R_TYPE (rel->r_info);
+
+      if (r_symndx >= NUM_SHDR_ENTRIES (symtab_hdr))
+	{
+	  (*_bfd_error_handler) (_("%B: bad symbol index: %d"),
+				 abfd, r_symndx);
+	  return FALSE;
+	}
+
+      if (r_symndx < symtab_hdr->sh_info)
+	h = NULL;
+      else
+	{
+	  h = sym_hashes[r_symndx - symtab_hdr->sh_info];
+	  while (h->root.type == bfd_link_hash_indirect
+		 || h->root.type == bfd_link_hash_warning)
+	    h = (struct elf_link_hash_entry *) h->root.u.i.link;
+
+	  /* PR15323, ref flags aren't set for references in the same
+	     object.  */
+	  h->root.non_ir_ref = 1;
+	}
+
+      if (h && h->root.type == bfd_link_hash_defweak) {
+                static int Trace = 0;
+                asection *sec1;
+                reloc_howto_type *howto = riscv_elf_rtype_to_howto (ELF32_R_TYPE (rel->r_info));
+
+                sec1 = h->root.u.def.section;
+
+                if (sec1 != NULL && (strcmp(sec1->name, ".pulp.import")==0)) {
+                        if (Trace) printf("Pre Importing %15s in reloc: %4d -> %4d:%22s, at offset: (%8X + %8X) => %X\n",
+                                          h->root.root.string, (int) rel->r_info,
+                                          (int) ELF32_R_TYPE(rel->r_info), howto->name, (int) rel->r_offset, (int) sec1->output_offset,
+                                          (int) sec1->output_offset+(int)rel->r_offset);
+                        sec1->flags |= SEC_KEEP;
+                        InsertImportEntry(h->root.root.string, rel, sec1->output_offset, TRUE);
+                }
+
+      }
+      switch (r_type)
+	{
+	case R_RISCV_TLS_GD_HI20:
+	  if (!riscv_elf_record_got_reference (abfd, info, h, r_symndx)
+	      || !riscv_elf_record_tls_type (abfd, h, r_symndx, GOT_TLS_GD))
+	    return FALSE;
+	  break;
+
+	case R_RISCV_TLS_GOT_HI20:
+	  if (bfd_link_pic (info))
+	    info->flags |= DF_STATIC_TLS;
+	  if (!riscv_elf_record_got_reference (abfd, info, h, r_symndx)
+	      || !riscv_elf_record_tls_type (abfd, h, r_symndx, GOT_TLS_IE))
+	    return FALSE;
+	  break;
+
+	case R_RISCV_GOT_HI20:
+	  if (!riscv_elf_record_got_reference (abfd, info, h, r_symndx)
+	      || !riscv_elf_record_tls_type (abfd, h, r_symndx, GOT_NORMAL))
+	    return FALSE;
+	  break;
+
+	case R_RISCV_CALL_PLT:
+	  /* This symbol requires a procedure linkage table entry.  We
+	     actually build the entry in adjust_dynamic_symbol,
+	     because this might be a case of linking PIC code without
+	     linking in any dynamic objects, in which case we don't
+	     need to generate a procedure linkage table after all.  */
+
+	  if (h != NULL)
+	    {
+	      h->needs_plt = 1;
+	      h->plt.refcount += 1;
+	    }
+	  break;
+
+	case R_RISCV_CALL:
+	case R_RISCV_JAL:
+	case R_RISCV_BRANCH:
+	case R_RISCV_RVC_BRANCH:
+	case R_RISCV_RVC_JUMP:
+	case R_RISCV_PCREL_HI20:
+	  /* In shared libraries, these relocs are known to bind locally.  */
+	  if (bfd_link_pic (info))
+	    break;
+	  goto static_reloc;
+
+	case R_RISCV_TPREL_HI20:
+	  if (!bfd_link_executable (info))
+	    return bad_static_reloc (abfd, r_type, h);
+	  if (h != NULL)
+	    riscv_elf_record_tls_type (abfd, h, r_symndx, GOT_TLS_LE);
+	  goto static_reloc;
+
+	case R_RISCV_HI20:
+	  if (bfd_link_pic (info))
+	    return bad_static_reloc (abfd, r_type, h);
+	  /* Fall through.  */
+
+	case R_RISCV_COPY:
+	case R_RISCV_JUMP_SLOT:
+	case R_RISCV_RELATIVE:
+	case R_RISCV_64:
+	case R_RISCV_32:
+	  /* Fall through.  */
+
+	static_reloc:
+	  /* This reloc might not bind locally.  */
+	  if (h != NULL)
+	    h->non_got_ref = 1;
+
+	  if (h != NULL && !bfd_link_pic (info))
+	    {
+	      /* We may need a .plt entry if the function this reloc
+		 refers to is in a shared lib.  */
+	      h->plt.refcount += 1;
+	    }
+
+	  /* If we are creating a shared library, and this is a reloc
+	     against a global symbol, or a non PC relative reloc
+	     against a local symbol, then we need to copy the reloc
+	     into the shared library.  However, if we are linking with
+	     -Bsymbolic, we do not need to copy a reloc against a
+	     global symbol which is defined in an object we are
+	     including in the link (i.e., DEF_REGULAR is set).  At
+	     this point we have not seen all the input files, so it is
+	     possible that DEF_REGULAR is not set now but will be set
+	     later (it is never cleared).  In case of a weak definition,
+	     DEF_REGULAR may be cleared later by a strong definition in
+	     a shared library.  We account for that possibility below by
+	     storing information in the relocs_copied field of the hash
+	     table entry.  A similar situation occurs when creating
+	     shared libraries and symbol visibility changes render the
+	     symbol local.
+
+	     If on the other hand, we are creating an executable, we
+	     may need to keep relocations for symbols satisfied by a
+	     dynamic library if we manage to avoid copy relocs for the
+	     symbol.  */
+	  if ((bfd_link_pic (info)
+	       && (sec->flags & SEC_ALLOC) != 0
+	       && (! riscv_elf_rtype_to_howto (r_type)->pc_relative
+		   || (h != NULL
+		       && (! info->symbolic
+			   || h->root.type == bfd_link_hash_defweak
+			   || !h->def_regular))))
+	      || (!bfd_link_pic (info)
+		  && (sec->flags & SEC_ALLOC) != 0
+		  && h != NULL
+		  && (h->root.type == bfd_link_hash_defweak
+		      || !h->def_regular)))
+	    {
+	      struct riscv_elf_dyn_relocs *p;
+	      struct riscv_elf_dyn_relocs **head;
+
+	      /* When creating a shared object, we must copy these
+		 relocs into the output file.  We create a reloc
+		 section in dynobj and make room for the reloc.  */
+	      if (sreloc == NULL)
+		{
+		  sreloc = _bfd_elf_make_dynamic_reloc_section
+		    (sec, htab->elf.dynobj, RISCV_ELF_LOG_WORD_BYTES,
+		    abfd, /*rela?*/ TRUE);
+
+		  if (sreloc == NULL)
+		    return FALSE;
+		}
+
+	      /* If this is a global symbol, we count the number of
+		 relocations we need for this symbol.  */
+	      if (h != NULL)
+		head = &((struct riscv_elf_link_hash_entry *) h)->dyn_relocs;
+	      else
+		{
+		  /* Track dynamic relocs needed for local syms too.
+		     We really need local syms available to do this
+		     easily.  Oh well.  */
+
+		  asection *s;
+		  void *vpp;
+		  Elf_Internal_Sym *isym;
+
+		  isym = bfd_sym_from_r_symndx (&htab->sym_cache,
+						abfd, r_symndx);
+		  if (isym == NULL)
+		    return FALSE;
+
+		  s = bfd_section_from_elf_index (abfd, isym->st_shndx);
+		  if (s == NULL)
+		    s = sec;
+
+		  vpp = &elf_section_data (s)->local_dynrel;
+		  head = (struct riscv_elf_dyn_relocs **) vpp;
+		}
+
+	      p = *head;
+	      if (p == NULL || p->sec != sec)
+		{
+		  bfd_size_type amt = sizeof *p;
+		  p = ((struct riscv_elf_dyn_relocs *)
+		       bfd_alloc (htab->elf.dynobj, amt));
+		  if (p == NULL)
+		    return FALSE;
+		  p->next = *head;
+		  *head = p;
+		  p->sec = sec;
+		  p->count = 0;
+		  p->pc_count = 0;
+		}
+
+	      p->count += 1;
+	      p->pc_count += riscv_elf_rtype_to_howto (r_type)->pc_relative;
+	    }
+
+	  break;
+
+	case R_RISCV_GNU_VTINHERIT:
+	  if (!bfd_elf_gc_record_vtinherit (abfd, sec, h, rel->r_offset))
+	    return FALSE;
+	  break;
+
+	case R_RISCV_GNU_VTENTRY:
+	  if (!bfd_elf_gc_record_vtentry (abfd, sec, h, rel->r_addend))
+	    return FALSE;
+	  break;
+
+	default:
+	  break;
+	}
+    }
+
+  return TRUE;
+}
+
+static asection *
+riscv_elf_gc_mark_hook (asection *sec,
+			struct bfd_link_info *info,
+			Elf_Internal_Rela *rel,
+			struct elf_link_hash_entry *h,
+			Elf_Internal_Sym *sym)
+{
+  /* With a global symbol, the GNU vtable relocs (VTINHERIT/VTENTRY) yield
+     NULL; every other case is delegated to the generic ELF mark hook.  */
+  if (h != NULL)
+    {
+      unsigned int vt_type = ELF32_R_TYPE (rel->r_info);
+      if (vt_type == R_RISCV_GNU_VTINHERIT || vt_type == R_RISCV_GNU_VTENTRY)
+	return NULL;
+    }
+  return _bfd_elf_gc_mark_hook (sec, info, rel, h, sym);
+}
+
+/* GC sweep hook: for the section SEC being removed, unlink its dyn_relocs
+   records and decrement the GOT/PLT reference counts its relocs hold.  */
+static bfd_boolean
+riscv_elf_gc_sweep_hook (bfd *abfd,
+			 struct bfd_link_info *info,
+			 asection *sec,
+			 const Elf_Internal_Rela *relocs)
+{
+  const Elf_Internal_Rela *rel, *relend;
+  Elf_Internal_Shdr *symtab_hdr = &elf_symtab_hdr (abfd);
+  struct elf_link_hash_entry **sym_hashes = elf_sym_hashes (abfd);
+  bfd_signed_vma *local_got_refcounts = elf_local_got_refcounts (abfd);
+
+  if (bfd_link_relocatable (info))
+    return TRUE;
+  /* Clear the local_dynrel list kept in SEC's section data.  */
+  elf_section_data (sec)->local_dynrel = NULL;
+
+  for (rel = relocs, relend = relocs + sec->reloc_count; rel < relend; rel++)
+    {
+      unsigned long r_symndx;
+      struct elf_link_hash_entry *h = NULL;
+
+      r_symndx = ELF32_R_SYM (rel->r_info);
+      if (r_symndx >= symtab_hdr->sh_info)
+	{
+	  struct riscv_elf_link_hash_entry *eh;
+	  struct riscv_elf_dyn_relocs **pp;
+	  struct riscv_elf_dyn_relocs *p;
+	  /* Global symbol: follow indirect/warning links to the real entry.  */
+	  h = sym_hashes[r_symndx - symtab_hdr->sh_info];
+	  while (h->root.type == bfd_link_hash_indirect
+		 || h->root.type == bfd_link_hash_warning)
+	    h = (struct elf_link_hash_entry *) h->root.u.i.link;
+	  eh = (struct riscv_elf_link_hash_entry *) h;
+	  for (pp = &eh->dyn_relocs; (p = *pp) != NULL; pp = &p->next)
+	    if (p->sec == sec)
+	      {
+		/* Everything must go for SEC.  */
+		*pp = p->next;
+		break;
+	      }
+	}
+      /* Drop the reference this reloc type contributed, never below zero.  */
+      switch (ELF32_R_TYPE (rel->r_info))
+	{
+	case R_RISCV_GOT_HI20:
+	case R_RISCV_TLS_GOT_HI20:
+	case R_RISCV_TLS_GD_HI20:
+	  if (h != NULL)
+	    {
+	      if (h->got.refcount > 0)
+		h->got.refcount--;
+	    }
+	  else
+	    {
+	      if (local_got_refcounts &&
+		  local_got_refcounts[r_symndx] > 0)
+		local_got_refcounts[r_symndx]--;
+	    }
+	  break;
+
+	case R_RISCV_HI20:
+	case R_RISCV_PCREL_HI20:
+	case R_RISCV_COPY:
+	case R_RISCV_JUMP_SLOT:
+	case R_RISCV_RELATIVE:
+	case R_RISCV_64:
+	case R_RISCV_32:
+	case R_RISCV_BRANCH:
+	case R_RISCV_CALL:
+	case R_RISCV_JAL:
+	case R_RISCV_RVC_BRANCH:
+	case R_RISCV_RVC_JUMP:
+	  if (bfd_link_pic (info))
+	    break;
+	  /* Fall through.  */
+
+	case R_RISCV_CALL_PLT:
+	  if (h != NULL)
+	    {
+	      if (h->plt.refcount > 0)
+		h->plt.refcount--;
+	    }
+	  break;
+
+	default:
+	  break;
+	}
+    }
+
+  return TRUE;
+}
+
+/* Adjust a symbol defined by a dynamic object and referenced by a
+   regular object.  The current definition is in some section of the
+   dynamic object, but we're not including those sections.  We have to
+   change the definition to something the rest of the link can
+   understand.  */
+
+static bfd_boolean
+riscv_elf_adjust_dynamic_symbol (struct bfd_link_info *info,
+				 struct elf_link_hash_entry *h)
+{
+  struct riscv_elf_link_hash_table *htab;
+  struct riscv_elf_link_hash_entry * eh;
+  struct riscv_elf_dyn_relocs *p;
+  bfd *dynobj;
+  asection *s, *srel;
+
+  htab = riscv_elf_hash_table (info);
+  BFD_ASSERT (htab != NULL);
+
+  dynobj = htab->elf.dynobj;
+
+  /* Make sure we know what is going on here.  */
+  BFD_ASSERT (dynobj != NULL
+	      && (h->needs_plt
+		  || h->type == STT_GNU_IFUNC
+		  || h->u.weakdef != NULL
+		  || (h->def_dynamic
+		      && h->ref_regular
+		      && !h->def_regular)));
+
+  /* If this is a function, put it in the procedure linkage table.  We
+     will fill in the contents of the procedure linkage table later
+     (although we could actually do it here).  */
+  if (h->type == STT_FUNC || h->type == STT_GNU_IFUNC || h->needs_plt)
+    {
+      if (h->plt.refcount <= 0
+	  || SYMBOL_CALLS_LOCAL (info, h)
+	  || (ELF_ST_VISIBILITY (h->other) != STV_DEFAULT
+	      && h->root.type == bfd_link_hash_undefweak))
+	{
+	  /* This case can occur if we saw a R_RISCV_CALL_PLT reloc in an
+	     input file, but the symbol was never referred to by a dynamic
+	     object, or if all references were garbage collected.  In such
+	     a case, we don't actually need to build a PLT entry.  */
+	  h->plt.offset = (bfd_vma) -1;
+	  h->needs_plt = 0;
+	}
+
+      return TRUE;
+    }
+  else
+    h->plt.offset = (bfd_vma) -1;	/* Neither a function nor needs_plt.  */
+
+  /* If this is a weak symbol, and there is a real definition, the
+     processor independent code will have arranged for us to see the
+     real definition first, and we can just use the same value.  */
+  if (h->u.weakdef != NULL)
+    {
+      BFD_ASSERT (h->u.weakdef->root.type == bfd_link_hash_defined
+		  || h->u.weakdef->root.type == bfd_link_hash_defweak);
+      h->root.u.def.section = h->u.weakdef->root.u.def.section;
+      h->root.u.def.value = h->u.weakdef->root.u.def.value;
+      return TRUE;
+    }
+
+  /* This is a reference to a symbol defined by a dynamic object which
+     is not a function.  */
+
+  /* If we are creating a shared library, we must presume that the
+     only references to the symbol are via the global offset table.
+     For such cases we need not do anything here; the relocations will
+     be handled correctly by relocate_section.  */
+  if (bfd_link_pic (info))
+    return TRUE;
+
+  /* If there are no references to this symbol that do not use the
+     GOT, we don't need to generate a copy reloc.  */
+  if (!h->non_got_ref)
+    return TRUE;
+
+  /* If -z nocopyreloc was given, we won't generate them either.  */
+  if (info->nocopyreloc)
+    {
+      h->non_got_ref = 0;
+      return TRUE;
+    }
+  /* Stop at the first dyn_relocs record whose output section is read-only.  */
+  eh = (struct riscv_elf_link_hash_entry *) h;
+  for (p = eh->dyn_relocs; p != NULL; p = p->next)
+    {
+      s = p->sec->output_section;
+      if (s != NULL && (s->flags & SEC_READONLY) != 0)
+	break;
+    }
+
+  /* If we didn't find any dynamic relocs in read-only sections, then
+     we'll be keeping the dynamic relocs and avoiding the copy reloc.  */
+  if (p == NULL)
+    {
+      h->non_got_ref = 0;
+      return TRUE;
+    }
+
+  /* We must allocate the symbol in our .dynbss section, which will
+     become part of the .bss section of the executable.  There will be
+     an entry for this symbol in the .dynsym section.  The dynamic
+     object will contain position independent code, so all references
+     from the dynamic object to this symbol will go through the global
+     offset table.  The dynamic linker will use the .dynsym entry to
+     determine the address it must put in the global offset table, so
+     both the dynamic object and the regular object will refer to the
+     same memory location for the variable.  */
+
+  /* We must generate a R_RISCV_COPY reloc to tell the dynamic linker
+     to copy the initial value out of the dynamic object and into the
+     runtime process image.  We need to remember the offset into the
+     .rel.bss section we are going to use.  */
+  if ((h->root.u.def.section->flags & SEC_READONLY) != 0)
+    {
+      s = htab->elf.sdynrelro;
+      srel = htab->elf.sreldynrelro;
+    }
+  else
+    {
+      s = htab->elf.sdynbss;
+      srel = htab->elf.srelbss;
+    }
+  if ((h->root.u.def.section->flags & SEC_ALLOC) != 0 && h->size != 0)
+    {
+      srel->size += sizeof (Elf32_External_Rela);
+      h->needs_copy = 1;
+    }
+  /* Any tls_type bit other than GOT_NORMAL sends the copy to sdyntdata.  */
+  if (eh->tls_type & ~GOT_NORMAL)
+    return _bfd_elf_adjust_dynamic_copy (info, h, htab->sdyntdata);
+
+  return _bfd_elf_adjust_dynamic_copy (info, h, s);
+}
+
+/* Allocate space in .plt, .got and associated reloc sections for
+   dynamic relocs.  Hash-traversal callback: H is the symbol being
+   visited and INF the struct bfd_link_info; FALSE reports failure.  */
+static bfd_boolean
+allocate_dynrelocs (struct elf_link_hash_entry *h, void *inf)
+{
+  struct bfd_link_info *info;
+  struct riscv_elf_link_hash_table *htab;
+  struct riscv_elf_link_hash_entry *eh;
+  struct riscv_elf_dyn_relocs *p;
+
+  if (h->root.type == bfd_link_hash_indirect)
+    return TRUE;
+
+  info = (struct bfd_link_info *) inf;
+  htab = riscv_elf_hash_table (info);
+  BFD_ASSERT (htab != NULL);
+
+  if (htab->elf.dynamic_sections_created
+      && h->plt.refcount > 0)
+    {
+      /* Make sure this symbol is output as a dynamic symbol.
+	 Undefined weak syms won't yet be marked as dynamic.  */
+      if (h->dynindx == -1
+	  && !h->forced_local)
+	{
+	  if (! bfd_elf_link_record_dynamic_symbol (info, h))
+	    return FALSE;
+	}
+
+      if (WILL_CALL_FINISH_DYNAMIC_SYMBOL (1, bfd_link_pic (info), h))
+	{
+	  asection *s = htab->elf.splt;
+
+	  if (s->size == 0)
+	    s->size = PLT_HEADER_SIZE;
+
+	  h->plt.offset = s->size;
+
+	  /* Make room for this entry.  */
+	  s->size += PLT_ENTRY_SIZE;
+
+	  /* We also need to make an entry in the .got.plt section.  */
+	  htab->elf.sgotplt->size += GOT_ENTRY_SIZE;
+
+	  /* We also need to make an entry in the .rela.plt section.  */
+	  htab->elf.srelplt->size += sizeof (Elf32_External_Rela);
+
+	  /* If this symbol is not defined in a regular file, and we are
+	     not generating a shared library, then set the symbol to this
+	     location in the .plt.  This is required to make function
+	     pointers compare as equal between the normal executable and
+	     the shared library.  */
+	  if (! bfd_link_pic (info)
+	      && !h->def_regular)
+	    {
+	      h->root.u.def.section = s;
+	      h->root.u.def.value = h->plt.offset;
+	    }
+	}
+      else
+	{
+	  h->plt.offset = (bfd_vma) -1;
+	  h->needs_plt = 0;
+	}
+    }
+  else
+    {
+      h->plt.offset = (bfd_vma) -1;
+      h->needs_plt = 0;
+    }
+  /* Next, reserve GOT space if the symbol still has GOT references.  */
+  if (h->got.refcount > 0)
+    {
+      asection *s;
+      bfd_boolean dyn;
+      int tls_type = riscv_elf_hash_entry (h)->tls_type;
+
+      /* Make sure this symbol is output as a dynamic symbol.
+	 Undefined weak syms won't yet be marked as dynamic.  */
+      if (h->dynindx == -1
+	  && !h->forced_local)
+	{
+	  if (! bfd_elf_link_record_dynamic_symbol (info, h))
+	    return FALSE;
+	}
+
+      s = htab->elf.sgot;
+      h->got.offset = s->size;
+      dyn = htab->elf.dynamic_sections_created;
+      if (tls_type & (GOT_TLS_GD | GOT_TLS_IE))
+	{
+	  /* TLS_GD needs two dynamic relocs and two GOT slots.  */
+	  if (tls_type & GOT_TLS_GD)
+	    {
+	      s->size += 2 * RISCV_ELF_WORD_BYTES;
+	      htab->elf.srelgot->size += 2 * sizeof (Elf32_External_Rela);
+	    }
+
+	  /* TLS_IE needs one dynamic reloc and one GOT slot.  */
+	  if (tls_type & GOT_TLS_IE)
+	    {
+	      s->size += RISCV_ELF_WORD_BYTES;
+	      htab->elf.srelgot->size += sizeof (Elf32_External_Rela);
+	    }
+	}
+      else
+	{
+	  s->size += RISCV_ELF_WORD_BYTES;
+	  if (WILL_CALL_FINISH_DYNAMIC_SYMBOL (dyn, bfd_link_pic (info), h))
+	    htab->elf.srelgot->size += sizeof (Elf32_External_Rela);
+	}
+    }
+  else
+    h->got.offset = (bfd_vma) -1;
+  /* Without recorded dynamic relocs there is nothing more to size.  */
+  eh = (struct riscv_elf_link_hash_entry *) h;
+  if (eh->dyn_relocs == NULL)
+    return TRUE;
+
+  /* In the shared -Bsymbolic case, discard space allocated for
+     dynamic pc-relative relocs against symbols which turn out to be
+     defined in regular objects.  For the normal shared case, discard
+     space for pc-relative relocs that have become local due to symbol
+     visibility changes.  */
+
+  if (bfd_link_pic (info))
+    {
+      if (SYMBOL_CALLS_LOCAL (info, h))
+	{
+	  struct riscv_elf_dyn_relocs **pp;
+
+	  for (pp = &eh->dyn_relocs; (p = *pp) != NULL; )
+	    {
+	      p->count -= p->pc_count;
+	      p->pc_count = 0;
+	      if (p->count == 0)
+		*pp = p->next;
+	      else
+		pp = &p->next;
+	    }
+	}
+
+      /* Also discard relocs on undefined weak syms with non-default
+	 visibility.  */
+      if (eh->dyn_relocs != NULL
+	  && h->root.type == bfd_link_hash_undefweak)
+	{
+	  if (ELF_ST_VISIBILITY (h->other) != STV_DEFAULT)
+	    eh->dyn_relocs = NULL;
+
+	  /* Make sure undefined weak symbols are output as a dynamic
+	     symbol in PIEs.  */
+	  else if (h->dynindx == -1
+		   && !h->forced_local)
+	    {
+	      if (! bfd_elf_link_record_dynamic_symbol (info, h))
+		return FALSE;
+	    }
+	}
+    }
+  else
+    {
+      /* For the non-shared case, discard space for relocs against
+	 symbols which turn out to need copy relocs or are not
+	 dynamic.  */
+
+      if (!h->non_got_ref
+	  && ((h->def_dynamic
+	       && !h->def_regular)
+	      || (htab->elf.dynamic_sections_created
+		  && (h->root.type == bfd_link_hash_undefweak
+		      || h->root.type == bfd_link_hash_undefined))))
+	{
+	  /* Make sure this symbol is output as a dynamic symbol.
+	     Undefined weak syms won't yet be marked as dynamic.  */
+	  if (h->dynindx == -1
+	      && !h->forced_local)
+	    {
+	      if (! bfd_elf_link_record_dynamic_symbol (info, h))
+		return FALSE;
+	    }
+
+	  /* If that succeeded, we know we'll be keeping all the
+	     relocs.  */
+	  if (h->dynindx != -1)
+	    goto keep;
+	}
+
+      eh->dyn_relocs = NULL;
+
+    keep: ;
+    }
+
+  /* Finally, allocate space.  */
+  for (p = eh->dyn_relocs; p != NULL; p = p->next)
+    {
+      asection *sreloc = elf_section_data (p->sec)->sreloc;
+      sreloc->size += p->count * sizeof (Elf32_External_Rela);
+    }
+
+  return TRUE;
+}
+
+/* Hash-traversal callback.  If any dynamic reloc recorded for H sits in a
+   read-only output section, set DF_TEXTREL in INF's flags and return FALSE.  */
+static bfd_boolean
+readonly_dynrelocs (struct elf_link_hash_entry *h, void *inf)
+{
+  struct bfd_link_info *link_info = (struct bfd_link_info *) inf;
+  struct riscv_elf_dyn_relocs *dr
+    = ((struct riscv_elf_link_hash_entry *) h)->dyn_relocs;
+
+  for (; dr != NULL; dr = dr->next)
+    {
+      asection *out = dr->sec->output_section;
+
+      /* Records with no output section, or a writable one, don't count.  */
+      if (out == NULL || (out->flags & SEC_READONLY) == 0)
+	continue;
+      link_info->flags |= DF_TEXTREL;
+      return FALSE;
+    }
+  return TRUE;
+}
+/* Compute dynamic section sizes, allocate contents, add .dynamic tags.  */
+static bfd_boolean
+riscv_elf_size_dynamic_sections (bfd *output_bfd, struct bfd_link_info *info)
+{
+  struct riscv_elf_link_hash_table *htab;
+  bfd *dynobj;
+  asection *s;
+  bfd *ibfd;
+
+  htab = riscv_elf_hash_table (info);
+  BFD_ASSERT (htab != NULL);
+  dynobj = htab->elf.dynobj;
+  BFD_ASSERT (dynobj != NULL);
+
+  if (elf_hash_table (info)->dynamic_sections_created)
+    {
+      /* Set the contents of the .interp section to the interpreter.  */
+      if (bfd_link_executable (info) && !info->nointerp)
+	{
+	  s = bfd_get_linker_section (dynobj, ".interp");
+	  BFD_ASSERT (s != NULL);
+	  s->size = strlen (ELF32_DYNAMIC_INTERPRETER) + 1;
+	  s->contents = (unsigned char *) ELF32_DYNAMIC_INTERPRETER;
+	}
+    }
+
+  /* Set up .got offsets for local syms, and space for local dynamic
+     relocs.  */
+  for (ibfd = info->input_bfds; ibfd != NULL; ibfd = ibfd->link.next)
+    {
+      bfd_signed_vma *local_got;
+      bfd_signed_vma *end_local_got;
+      char *local_tls_type;
+      bfd_size_type locsymcount;
+      Elf_Internal_Shdr *symtab_hdr;
+      asection *srel;
+
+      if (! is_riscv_elf (ibfd))
+	continue;
+
+      for (s = ibfd->sections; s != NULL; s = s->next)
+	{
+	  struct riscv_elf_dyn_relocs *p;
+
+	  for (p = elf_section_data (s)->local_dynrel; p != NULL; p = p->next)
+	    {
+	      if (!bfd_is_abs_section (p->sec)
+		  && bfd_is_abs_section (p->sec->output_section))
+		{
+		  /* Input section has been discarded, either because
+		     it is a copy of a linkonce section or due to
+		     linker script /DISCARD/, so we'll be discarding
+		     the relocs too.  */
+		}
+	      else if (p->count != 0)
+		{
+		  srel = elf_section_data (p->sec)->sreloc;
+		  srel->size += p->count * sizeof (Elf32_External_Rela);
+		  if ((p->sec->output_section->flags & SEC_READONLY) != 0)
+		    info->flags |= DF_TEXTREL;
+		}
+	    }
+	}
+      /* Input files without local GOT refcounts need no local GOT slots.  */
+      local_got = elf_local_got_refcounts (ibfd);
+      if (!local_got)
+	continue;
+
+      symtab_hdr = &elf_symtab_hdr (ibfd);
+      locsymcount = symtab_hdr->sh_info;
+      end_local_got = local_got + locsymcount;
+      local_tls_type = _bfd_riscv_elf_local_got_tls_type (ibfd);
+      s = htab->elf.sgot;
+      srel = htab->elf.srelgot;
+      for (; local_got < end_local_got; ++local_got, ++local_tls_type)
+	{
+	  if (*local_got > 0)
+	    {
+	      *local_got = s->size;	/* Refcount slot now holds the offset.  */
+	      s->size += RISCV_ELF_WORD_BYTES;
+	      if (*local_tls_type & GOT_TLS_GD)
+		s->size += RISCV_ELF_WORD_BYTES;
+	      if (bfd_link_pic (info)
+		  || (*local_tls_type & (GOT_TLS_GD | GOT_TLS_IE)))
+		srel->size += sizeof (Elf32_External_Rela);
+	    }
+	  else
+	    *local_got = (bfd_vma) -1;
+	}
+    }
+
+  /* Allocate global sym .plt and .got entries, and space for global
+     sym dynamic relocs.  */
+  elf_link_hash_traverse (&htab->elf, allocate_dynrelocs, info);
+
+  if (htab->elf.sgotplt)
+    {
+      struct elf_link_hash_entry *got;
+      got = elf_link_hash_lookup (elf_hash_table (info),
+				  "_GLOBAL_OFFSET_TABLE_",
+				  FALSE, FALSE, FALSE);
+
+      /* Don't allocate .got.plt section if there are no GOT nor PLT
+	 entries and there is no reference to _GLOBAL_OFFSET_TABLE_.  */
+      if ((got == NULL
+	   || !got->ref_regular_nonweak)
+	  && (htab->elf.sgotplt->size == GOTPLT_HEADER_SIZE)
+	  && (htab->elf.splt == NULL
+	      || htab->elf.splt->size == 0)
+	  && (htab->elf.sgot == NULL
+	      || (htab->elf.sgot->size
+		  == get_elf_backend_data (output_bfd)->got_header_size)))
+	htab->elf.sgotplt->size = 0;
+    }
+
+  /* The check_relocs and adjust_dynamic_symbol entry points have
+     determined the sizes of the various dynamic sections.  Allocate
+     memory for them.  */
+  for (s = dynobj->sections; s != NULL; s = s->next)
+    {
+      if ((s->flags & SEC_LINKER_CREATED) == 0)
+	continue;
+
+      if (s == htab->elf.splt
+	  || s == htab->elf.sgot
+	  || s == htab->elf.sgotplt
+	  || s == htab->elf.sdynbss
+	  || s == htab->elf.sdynrelro)
+	{
+	  /* Strip this section if we don't need it; see the
+	     comment below.  */
+	}
+      else if (strncmp (s->name, ".rela", 5) == 0)
+	{
+	  if (s->size != 0)
+	    {
+	      /* We use the reloc_count field as a counter if we need
+		 to copy relocs into the output file.  */
+	      s->reloc_count = 0;
+	    }
+	}
+      else
+	{
+	  /* It's not one of our sections.  */
+	  continue;
+	}
+
+      if (s->size == 0)
+	{
+	  /* If we don't need this section, strip it from the
+	     output file.  This is mostly to handle .rela.bss and
+	     .rela.plt.  We must create both sections in
+	     create_dynamic_sections, because they must be created
+	     before the linker maps input sections to output
+	     sections.  The linker does that before
+	     adjust_dynamic_symbol is called, and it is that
+	     function which decides whether anything needs to go
+	     into these sections.  */
+	  s->flags |= SEC_EXCLUDE;
+	  continue;
+	}
+
+      if ((s->flags & SEC_HAS_CONTENTS) == 0)
+	continue;
+
+      /* Allocate memory for the section contents.  Zero the memory
+	 for the benefit of .rela.plt, which has 4 unused entries
+	 at the beginning, and we don't want garbage.  */
+      s->contents = (bfd_byte *) bfd_zalloc (dynobj, s->size);
+      if (s->contents == NULL)
+	return FALSE;
+    }
+
+  if (elf_hash_table (info)->dynamic_sections_created)
+    {
+      /* Add some entries to the .dynamic section.  We fill in the
+	 values later, in riscv_elf_finish_dynamic_sections, but we
+	 must add the entries now so that we get the correct size for
+	 the .dynamic section.  The DT_DEBUG entry is filled in by the
+	 dynamic linker and used by the debugger.  */
+#define add_dynamic_entry(TAG, VAL) \
+  _bfd_elf_add_dynamic_entry (info, TAG, VAL)
+
+      if (bfd_link_executable (info))
+	{
+	  if (!add_dynamic_entry (DT_DEBUG, 0))
+	    return FALSE;
+	}
+
+      if (htab->elf.srelplt->size != 0)
+	{
+	  if (!add_dynamic_entry (DT_PLTGOT, 0)
+	      || !add_dynamic_entry (DT_PLTRELSZ, 0)
+	      || !add_dynamic_entry (DT_PLTREL, DT_RELA)
+	      || !add_dynamic_entry (DT_JMPREL, 0))
+	    return FALSE;
+	}
+
+      if (!add_dynamic_entry (DT_RELA, 0)
+	  || !add_dynamic_entry (DT_RELASZ, 0)
+	  || !add_dynamic_entry (DT_RELAENT, sizeof (Elf32_External_Rela)))
+	return FALSE;
+
+      /* If any dynamic relocs apply to a read-only section,
+	 then we need a DT_TEXTREL entry.  */
+      if ((info->flags & DF_TEXTREL) == 0)
+	elf_link_hash_traverse (&htab->elf, readonly_dynrelocs, info);
+
+      if (info->flags & DF_TEXTREL)
+	{
+	  if (!add_dynamic_entry (DT_TEXTREL, 0))
+	    return FALSE;
+	}
+    }
+#undef add_dynamic_entry
+
+  return TRUE;
+}
+/* Biases that tpoff and dtpoff subtract along with the TLS section's vma.  */
+#define TP_OFFSET 0
+#define DTP_OFFSET 0x800
+
+/* Compute the TLS dtp-relative value of ADDRESS: ADDRESS minus the vma of
+   the hash table's tls_sec, minus DTP_OFFSET; 0 when tls_sec is NULL.  */
+static bfd_vma
+dtpoff (struct bfd_link_info *info, bfd_vma address)
+{
+  asection *tls = elf_hash_table (info)->tls_sec;
+
+  /* A missing TLS section should already have been signalled as an error.  */
+  return tls != NULL ? address - tls->vma - DTP_OFFSET : 0;
+}
+
+/* Compute the static TLS tp-relative value of ADDRESS: ADDRESS minus the
+   vma of the hash table's tls_sec, minus TP_OFFSET; 0 when it is NULL.  */
+static bfd_vma
+tpoff (struct bfd_link_info *info, bfd_vma address)
+{
+  asection *tls = elf_hash_table (info)->tls_sec;
+
+  /* With no TLS section, an error should have been signalled already.  */
+  return tls == NULL ? 0 : address - tls->vma - TP_OFFSET;
+}
+
+/* Look up RISCV_GP_SYMBOL in the link hash table and return its address
+   (symbol value plus sec_addr of its section), or 0 unless it is defined.  */
+static bfd_vma
+riscv_global_pointer_value (struct bfd_link_info *info)
+{
+  struct bfd_link_hash_entry *gp
+    = bfd_link_hash_lookup (info->hash, RISCV_GP_SYMBOL, FALSE, FALSE, TRUE);
+
+  if (gp != NULL && gp->type == bfd_link_hash_defined)
+    return gp->u.def.value + sec_addr (gp->u.def.section);
+  /* Not found, or its type is something other than bfd_link_hash_defined.  */
+  return 0;
+}
+
+/* Emplace a static relocation: encode VALUE per REL's type and merge it
+   into CONTENTS under HOWTO's dst_mask.  IsImport skips the JAL range check.  */
+static bfd_reloc_status_type
+perform_relocation (const reloc_howto_type *howto,
+		    const Elf_Internal_Rela *rel,
+		    bfd_vma value,
+		    asection *input_section,
+		    bfd *input_bfd,
+		    bfd_byte *contents,
+                    bfd_boolean IsImport)
+{
+  if (howto->pc_relative)
+    value -= sec_addr (input_section) + rel->r_offset;
+  value += rel->r_addend;
+  /* Range-check where required, then put VALUE in instruction-field layout.  */
+  switch (ELF32_R_TYPE (rel->r_info))
+    {
+    case R_RISCV_HI20:
+    case R_RISCV_TPREL_HI20:
+    case R_RISCV_PCREL_HI20:
+    case R_RISCV_GOT_HI20:
+    case R_RISCV_TLS_GOT_HI20:
+    case R_RISCV_TLS_GD_HI20:
+      if (ARCH_SIZE > 32 && !VALID_UTYPE_IMM (RISCV_CONST_HIGH_PART (value)))
+	return bfd_reloc_overflow;
+      value = ENCODE_UTYPE_IMM (RISCV_CONST_HIGH_PART (value));
+      break;
+    /* Pulp specific relocs */
+    case R_RISCV_12_I:
+      if (!VALID_ITYPE_IMM (value)) return bfd_reloc_overflow;
+      value = ENCODE_ITYPE_IMM (value);
+      break;
+    case R_RISCV_12_S:
+      if (!VALID_STYPE_IMM (value)) return bfd_reloc_overflow;
+      value = ENCODE_STYPE_IMM (value);
+      break;
+    /* The next two shift by the howto's rightshift; no range check here.  */
+    case R_RISCV_REL12:
+      value = ENCODE_ITYPE_IMM (value>>howto->rightshift);
+      break;
+    case R_RISCV_RELU5:
+      value = ENCODE_I1TYPE_UIMM (value>>howto->rightshift);
+      break;
+    /* End of Pulp specific relocs */
+    case R_RISCV_LO12_I:
+    case R_RISCV_GPREL_I:
+    case R_RISCV_TPREL_LO12_I:
+    case R_RISCV_TPREL_I:
+    case R_RISCV_PCREL_LO12_I:
+      value = ENCODE_ITYPE_IMM (value);
+      break;
+
+    case R_RISCV_LO12_S:
+    case R_RISCV_GPREL_S:
+    case R_RISCV_TPREL_LO12_S:
+    case R_RISCV_TPREL_S:
+    case R_RISCV_PCREL_LO12_S:
+      value = ENCODE_STYPE_IMM (value);
+      break;
+    /* Two encodings in one value: U-type in the low bits, I-type from bit 32.  */
+    case R_RISCV_CALL:
+    case R_RISCV_CALL_PLT:
+      if (ARCH_SIZE > 32 && !VALID_UTYPE_IMM (RISCV_CONST_HIGH_PART (value)))
+	return bfd_reloc_overflow;
+      value = ENCODE_UTYPE_IMM (RISCV_CONST_HIGH_PART (value))
+	      | (ENCODE_ITYPE_IMM (value) << 32);
+      break;
+
+    case R_RISCV_JAL:
+      if (!IsImport && !VALID_UJTYPE_IMM (value)) return bfd_reloc_overflow;
+      value = ENCODE_UJTYPE_IMM (value);
+      break;
+
+    case R_RISCV_BRANCH:
+      if (!VALID_SBTYPE_IMM (value))
+	return bfd_reloc_overflow;
+      value = ENCODE_SBTYPE_IMM (value);
+      break;
+
+    case R_RISCV_RVC_BRANCH:
+      if (!VALID_RVC_B_IMM (value))
+	return bfd_reloc_overflow;
+      value = ENCODE_RVC_B_IMM (value);
+      break;
+
+    case R_RISCV_RVC_JUMP:
+      if (!VALID_RVC_J_IMM (value))
+	return bfd_reloc_overflow;
+      value = ENCODE_RVC_J_IMM (value);
+      break;
+
+    case R_RISCV_RVC_LUI:
+      if (!VALID_RVC_LUI_IMM (RISCV_CONST_HIGH_PART (value)))
+	return bfd_reloc_overflow;
+      value = ENCODE_RVC_LUI_IMM (RISCV_CONST_HIGH_PART (value));
+      break;
+    /* Data relocs: VALUE is stored as is, subject only to dst_mask.  */
+    case R_RISCV_32:
+    case R_RISCV_64:
+    case R_RISCV_ADD8:
+    case R_RISCV_ADD16:
+    case R_RISCV_ADD32:
+    case R_RISCV_ADD64:
+    case R_RISCV_SUB6:
+    case R_RISCV_SUB8:
+    case R_RISCV_SUB16:
+    case R_RISCV_SUB32:
+    case R_RISCV_SUB64:
+    case R_RISCV_SET6:
+    case R_RISCV_SET8:
+    case R_RISCV_SET16:
+    case R_RISCV_SET32:
+    case R_RISCV_TLS_DTPREL32:
+    case R_RISCV_TLS_DTPREL64:
+      break;
+
+    default:
+      return bfd_reloc_notsupported;
+    }
+  /* Read-modify-write: only the bits selected by dst_mask are replaced.  */
+  bfd_vma word = bfd_get (howto->bitsize, input_bfd, contents + rel->r_offset);
+  word = (word & ~howto->dst_mask) | (value & howto->dst_mask);
+  bfd_put (howto->bitsize, input_bfd, word, contents + rel->r_offset);
+
+  return bfd_reloc_ok;
+}
+
+/* Remember all PC-relative high-part relocs we've encountered to help us
+   later resolve the corresponding low-part relocs.  */
+
+typedef struct
+{
+  bfd_vma address;	/* Hash key; the only field compared for equality.  */
+  bfd_vma value;	/* Stored as VALUE - ADDR when the hi reloc is recorded.  */
+} riscv_pcrel_hi_reloc;
+/* One deferred low-part reloc; ADDR is the lookup key into hi_relocs.  */
+typedef struct riscv_pcrel_lo_reloc
+{
+  asection *                     input_section;
+  struct bfd_link_info *         info;
+  reloc_howto_type *             howto;
+  const Elf_Internal_Rela *      reloc;
+  bfd_vma                        addr;
+  const char *                   name;
+  bfd_byte *                     contents;
+  struct riscv_pcrel_lo_reloc *  next;
+} riscv_pcrel_lo_reloc;
+/* Both collections together: hi-part hash table and lo-part linked list.  */
+typedef struct
+{
+  htab_t hi_relocs;
+  riscv_pcrel_lo_reloc *lo_relocs;
+} riscv_pcrel_relocs;
+
+static hashval_t
+riscv_pcrel_reloc_hash (const void *entry)
+{
+  /* Hash on the entry's address with its two low bits shifted out.  */
+  return (hashval_t) (((const riscv_pcrel_hi_reloc *) entry)->address >> 2);
+}
+
+static bfd_boolean
+riscv_pcrel_reloc_eq (const void *entry1, const void *entry2)
+{
+  return (((const riscv_pcrel_hi_reloc *) entry1)->address	/* Key only.  */
+	  == ((const riscv_pcrel_hi_reloc *) entry2)->address);
+}
+
+static bfd_boolean
+riscv_init_pcrel_relocs (riscv_pcrel_relocs *p)
+{
+  /* Fresh hi table (free is its element deleter) and an empty lo list.  */
+  p->hi_relocs = htab_create (1024, riscv_pcrel_reloc_hash,
+			      riscv_pcrel_reloc_eq, free);
+  p->lo_relocs = NULL;
+  return p->hi_relocs != NULL ? TRUE : FALSE;
+}
+
+static void
+riscv_free_pcrel_relocs (riscv_pcrel_relocs *p)
+{
+  riscv_pcrel_lo_reloc *node, *following;
+  /* Release every queued low-part record; the successor is fetched
+     before the free because the link lives inside the node.  */
+  for (node = p->lo_relocs; node != NULL; node = following)
+    {
+      following = node->next;
+      free (node);
+    }
+  /* Then delete the high-part hash table.  */
+  htab_delete (p->hi_relocs);
+}
+
+static bfd_boolean
+riscv_record_pcrel_hi_reloc (riscv_pcrel_relocs *p, bfd_vma addr, bfd_vma value)
+{
+  riscv_pcrel_hi_reloc entry = {addr, value - addr};	/* Keyed on ADDR.  */
+  riscv_pcrel_hi_reloc **slot =
+    (riscv_pcrel_hi_reloc **) htab_find_slot (p->hi_relocs, &entry, INSERT);
+  if (slot == NULL)		/* Table could not provide a slot.  */
+    return FALSE;
+  BFD_ASSERT (*slot == NULL);	/* Duplicate ADDR: reuse it, don't leak it.  */
+  if (*slot == NULL && (*slot = bfd_malloc (sizeof entry)) == NULL)
+    return FALSE;
+  **slot = entry;
+  return TRUE;
+}
+
+/* Queue a low-part reloc for later resolution.  All arguments are saved
+   in a newly allocated node that is pushed at the head of P's list.
+   Returns FALSE if the node could not be allocated.  */
+
+static bfd_boolean
+riscv_record_pcrel_lo_reloc (riscv_pcrel_relocs *p,
+			     asection *input_section,
+			     struct bfd_link_info *info,
+			     reloc_howto_type *howto,
+			     const Elf_Internal_Rela *reloc,
+			     bfd_vma addr,
+			     const char *name,
+			     bfd_byte *contents)
+{
+  riscv_pcrel_lo_reloc *node
+    = (riscv_pcrel_lo_reloc *) bfd_malloc (sizeof (riscv_pcrel_lo_reloc));
+
+  if (node == NULL)
+    return FALSE;
+
+  node->input_section = input_section;
+  node->info = info;
+  node->howto = howto;
+  node->reloc = reloc;
+  node->addr = addr;
+  node->name = name;
+  node->contents = contents;
+
+  /* Link in front of the entries queued so far.  */
+  node->next = p->lo_relocs;
+  p->lo_relocs = node;
+
+  return TRUE;
+}
+
+/* Apply every queued low-part reloc in P.  Each one is matched, by
+   address, with a recorded high-part reloc whose stored value is then
+   handed to perform_relocation.  If no high part is found, the
+   reloc_overflow callback is invoked and the walk stops.  Returns TRUE
+   in both cases.  */
+
+static bfd_boolean
+riscv_resolve_pcrel_lo_relocs (riscv_pcrel_relocs *p)
+{
+  riscv_pcrel_lo_reloc *lo = p->lo_relocs;
+
+  while (lo != NULL)
+    {
+      bfd *owner = lo->input_section->owner;
+      riscv_pcrel_hi_reloc key;
+      riscv_pcrel_hi_reloc *hi;
+
+      /* Only the address takes part in the lookup.  */
+      key.address = lo->addr;
+      key.value = 0;
+      hi = htab_find (p->hi_relocs, &key);
+
+      if (hi == NULL)
+	{
+	  ((*lo->info->callbacks->reloc_overflow)
+	   (lo->info, NULL, lo->name, lo->howto->name, (bfd_vma) 0,
+	    owner, lo->input_section, lo->reloc->r_offset));
+	  return TRUE;
+	}
+
+      perform_relocation (lo->howto, lo->reloc, hi->value, lo->input_section,
+			  owner, lo->contents, FALSE);
+
+      lo = lo->next;
+    }
+
+  return TRUE;
+}
+
+/* Pulp specific.  Decide whether relocation REL of INPUT_SECTION refers
+   to an imported symbol, i.e. a weak definition that lives in a section
+   named ".pulp.import" which has an output section.  If so, mark that
+   section SEC_KEEP, register the reloc with InsertImportEntry and return
+   TRUE; otherwise return FALSE.  R_SYMNDX must index a global symbol
+   (it is rebased by SYMTAB_HDR->sh_info before indexing SYM_HASHES).  */
+static bfd_boolean RegisterImportReloc(struct bfd_link_info *info,
+                                bfd *input_bfd,
+                                asection *input_section,
+                                Elf_Internal_Rela *rel,
+                                unsigned long r_symndx,
+                                Elf_Internal_Shdr *symtab_hdr,
+                                struct elf_link_hash_entry **sym_hashes,
+                                reloc_howto_type *howto)
+
+{
+        /* Debug switch for the trace line below; it is never set here.  */
+        static int Trace = 0;
+        struct elf_link_hash_entry *h;
+        asection *sec;
+
+        /* It seems this can happen with erroneous or unsupported input (mixing a.out and elf in an archive, for example.)  */
+        if (sym_hashes == NULL)
+                return FALSE;
+
+        h = sym_hashes[r_symndx - symtab_hdr->sh_info];
+
+        if (info->wrap_hash != NULL && (input_section->flags & SEC_DEBUGGING) != 0)
+                h = ((struct elf_link_hash_entry *) unwrap_hash_lookup (info, input_bfd, &h->root));
+
+        /* Follow indirect and warning links to the real entry.  */
+        for (;;) {
+                if (h->root.type != bfd_link_hash_indirect && h->root.type != bfd_link_hash_warning)
+                        break;
+                h = (struct elf_link_hash_entry *) h->root.u.i.link;
+        }
+
+        if (h->root.type != bfd_link_hash_defweak)
+                return FALSE;
+
+        sec = h->root.u.def.section;
+        if (sec == NULL || sec->output_section == NULL || strcmp(sec->name, ".pulp.import") != 0)
+                return FALSE;
+
+        sec->flags |= SEC_KEEP;
+        if (Trace)
+                printf("    Importing %15s in reloc: %4d -> %4d:%22s, at offset: (%8X + %8X) => %X\n",
+                       h->root.root.string, (int) rel->r_info,
+                       (int) ELF32_R_TYPE(rel->r_info), howto->name, (int) rel->r_offset, (int) input_section->output_offset,
+                       (int) ((int) input_section->output_offset+(int)rel->r_offset));
+        InsertImportEntry(h->root.root.string, rel, input_section->output_offset, FALSE);
+        return TRUE;
+}
+
+
+/* Relocate a RISC-V ELF section.
+
+   The RELOCATE_SECTION function is called by the new ELF backend linker
+   to handle the relocations for a section.
+
+   The relocs are always passed as Rela structures.
+
+   This function is responsible for adjusting the section contents as
+   necessary, and (if generating a relocatable output file) adjusting
+   the reloc addend as necessary.
+
+   This function does not have to worry about setting the reloc
+   address or the reloc symbol index.
+
+   LOCAL_SYMS is a pointer to the swapped in local symbols.
+
+   LOCAL_SECTIONS is an array giving the section in the input file
+   corresponding to the st_shndx field of each local symbol.
+
+   The global hash table entry for the global symbols can be found
+   via elf_sym_hashes (input_bfd).
+
+   When generating relocatable output, this function must handle
+   STB_LOCAL/STT_SECTION symbols specially.  The output symbol is
+   going to be the section symbol corresponding to the output
+   section, which means that the addend must be adjusted
+   accordingly.  */
+
+static bfd_boolean
+riscv_elf_relocate_section (bfd *output_bfd,
+			    struct bfd_link_info *info,
+			    bfd *input_bfd,
+			    asection *input_section,
+			    bfd_byte *contents,
+			    Elf_Internal_Rela *relocs,
+			    Elf_Internal_Sym *local_syms,
+			    asection **local_sections)
+{
+  Elf_Internal_Rela *rel;
+  Elf_Internal_Rela *relend;
+  riscv_pcrel_relocs pcrel_relocs;
+  bfd_boolean ret = FALSE;
+  asection *sreloc = elf_section_data (input_section)->sreloc;
+  struct riscv_elf_link_hash_table *htab = riscv_elf_hash_table (info);
+  Elf_Internal_Shdr *symtab_hdr = &elf_symtab_hdr (input_bfd);
+  struct elf_link_hash_entry **sym_hashes = elf_sym_hashes (input_bfd);
+  bfd_vma *local_got_offsets = elf_local_got_offsets (input_bfd);
+
+  /* Set up the bookkeeping that pairs PC-relative low-part relocs with
+     their high parts.  It is released at the `out' label below.  */
+  if (!riscv_init_pcrel_relocs (&pcrel_relocs))
+    return FALSE;
+
+  relend = relocs + input_section->reloc_count;
+  for (rel = relocs; rel < relend; rel++)
+    {
+      unsigned long r_symndx;
+      struct elf_link_hash_entry *h;
+      Elf_Internal_Sym *sym;
+      asection *sec;
+      bfd_vma relocation;
+      bfd_reloc_status_type r = bfd_reloc_ok;
+      const char *name;
+      bfd_vma off, ie_off;
+      bfd_boolean unresolved_reloc, is_ie = FALSE, IsImport = FALSE;
+      bfd_vma pc = sec_addr (input_section) + rel->r_offset;
+      int r_type = ELF32_R_TYPE (rel->r_info), tls_type;
+      reloc_howto_type *howto = riscv_elf_rtype_to_howto (r_type);
+      const char *msg = NULL;
+
+      if (r_type == R_RISCV_GNU_VTINHERIT || r_type == R_RISCV_GNU_VTENTRY)
+	continue;
+
+      /* This is a final link.  */
+      r_symndx = ELF32_R_SYM (rel->r_info);
+      h = NULL;
+      sym = NULL;
+      sec = NULL;
+      unresolved_reloc = FALSE;
+      if (r_symndx < symtab_hdr->sh_info)
+	{
+	  sym = local_syms + r_symndx;
+	  sec = local_sections[r_symndx];
+	  relocation = _bfd_elf_rela_local_sym (output_bfd, sym, &sec, rel);
+	}
+      else
+	{
+	  bfd_boolean warned, ignored;
+
+	  /* Pulp specific: RegisterImportReloc reports whether this reloc
+	     is against a weak symbol defined in a ".pulp.import" section.
+	     The result is forwarded to perform_relocation below.  */
+	  IsImport = RegisterImportReloc(info, input_bfd, input_section, rel, r_symndx, symtab_hdr, sym_hashes, howto);
+
+	  RELOC_FOR_GLOBAL_SYMBOL (info, input_bfd, input_section, rel,
+				   r_symndx, symtab_hdr, sym_hashes,
+				   h, sec, relocation,
+				   unresolved_reloc, warned, ignored);
+	  if (warned)
+	    {
+	      /* To avoid generating warning messages about truncated
+		 relocations, set the relocation's address to be the same as
+		 the start of this section.  */
+	      if (input_section->output_section != NULL)
+		relocation = input_section->output_section->vma;
+	      else
+		relocation = 0;
+	    }
+	}
+
+      if (sec != NULL && discarded_section (sec))
+	RELOC_AGAINST_DISCARDED_SECTION (info, input_bfd, input_section,
+					 rel, 1, relend, howto, 0, contents);
+
+      if (bfd_link_relocatable (info))
+	continue;
+
+      /* Pick the symbol name used in diagnostics: the hash entry's name
+	 for globals, otherwise the local symbol's name, falling back to
+	 the section name when that is missing or empty.  */
+      if (h != NULL)
+	name = h->root.root.string;
+      else
+	{
+	  name = (bfd_elf_string_from_elf_section
+		  (input_bfd, symtab_hdr->sh_link, sym->st_name));
+	  if (name == NULL || *name == '\0')
+	    name = bfd_section_name (input_bfd, sec);
+	}
+
+      switch (r_type)
+	{
+	case R_RISCV_NONE:
+	case R_RISCV_RELAX:
+	case R_RISCV_TPREL_ADD:
+	case R_RISCV_COPY:
+	case R_RISCV_JUMP_SLOT:
+	case R_RISCV_RELATIVE:
+	  /* These require nothing of us at all.  */
+	  continue;
+
+	case R_RISCV_HI20:
+	case R_RISCV_BRANCH:
+	case R_RISCV_RVC_BRANCH:
+	case R_RISCV_RVC_LUI:
+	case R_RISCV_LO12_I:
+	case R_RISCV_LO12_S:
+
+	/* Pulp specific */
+        case R_RISCV_RELU5:
+        case R_RISCV_REL12:
+        case R_RISCV_12_I:
+        case R_RISCV_12_S:
+	/* End of Pulp specific */
+
+	case R_RISCV_SET6:
+	case R_RISCV_SET8:
+	case R_RISCV_SET16:
+	case R_RISCV_SET32:
+	  /* These require no special handling beyond perform_relocation.  */
+	  break;
+
+	case R_RISCV_GOT_HI20:
+	  if (h != NULL)
+	    {
+	      bfd_boolean dyn, pic;
+
+	      off = h->got.offset;
+	      BFD_ASSERT (off != (bfd_vma) -1);
+	      dyn = elf_hash_table (info)->dynamic_sections_created;
+	      pic = bfd_link_pic (info);
+
+	      if (! WILL_CALL_FINISH_DYNAMIC_SYMBOL (dyn, pic, h)
+		  || (pic && SYMBOL_REFERENCES_LOCAL (info, h)))
+		{
+		  /* This is actually a static link, or it is a
+		     -Bsymbolic link and the symbol is defined
+		     locally, or the symbol was forced to be local
+		     because of a version file.  We must initialize
+		     this entry in the global offset table.  Since the
+		     offset must always be a multiple of the word size,
+		     we use the least significant bit to record whether
+		     we have initialized it already.
+
+		     When doing a dynamic link, we create a .rela.got
+		     relocation entry to initialize the value.  This
+		     is done in the finish_dynamic_symbol routine.  */
+		  if ((off & 1) != 0)
+		    off &= ~1;
+		  else
+		    {
+		      bfd_put_32 (output_bfd, relocation,
+				  htab->elf.sgot->contents + off);
+		      h->got.offset |= 1;
+		    }
+		}
+	      else
+		unresolved_reloc = FALSE;
+	    }
+	  else
+	    {
+	      BFD_ASSERT (local_got_offsets != NULL
+			  && local_got_offsets[r_symndx] != (bfd_vma) -1);
+
+	      off = local_got_offsets[r_symndx];
+
+	      /* The offset must always be a multiple of the word size.
+		 So, we can use the least significant bit to record
+		 whether we have already processed this entry.  */
+	      if ((off & 1) != 0)
+		off &= ~1;
+	      else
+		{
+		  if (bfd_link_pic (info))
+		    {
+		      asection *s;
+		      Elf_Internal_Rela outrel;
+
+		      /* We need to generate a R_RISCV_RELATIVE reloc
+			 for the dynamic linker.  */
+		      s = htab->elf.srelgot;
+		      BFD_ASSERT (s != NULL);
+
+		      outrel.r_offset = sec_addr (htab->elf.sgot) + off;
+		      outrel.r_info =
+			ELF32_R_INFO (0, R_RISCV_RELATIVE);
+		      outrel.r_addend = relocation;
+		      relocation = 0;
+		      riscv_elf_append_rela (output_bfd, s, &outrel);
+		    }
+
+		  bfd_put_32 (output_bfd, relocation,
+			      htab->elf.sgot->contents + off);
+		  local_got_offsets[r_symndx] |= 1;
+		}
+	    }
+	  /* The reloc now targets the GOT slot.  Record it against this PC
+	     for the matching low-part reloc; a failed record is reported
+	     as an overflow.  */
+	  relocation = sec_addr (htab->elf.sgot) + off;
+	  if (!riscv_record_pcrel_hi_reloc (&pcrel_relocs, pc, relocation))
+	    r = bfd_reloc_overflow;
+	  break;
+
+	case R_RISCV_ADD8:
+	case R_RISCV_ADD16:
+	case R_RISCV_ADD32:
+	case R_RISCV_ADD64:
+	  {
+	    bfd_vma old_value = bfd_get (howto->bitsize, input_bfd,
+					 contents + rel->r_offset);
+	    relocation = old_value + relocation;
+	  }
+	  break;
+
+	case R_RISCV_SUB6:
+	case R_RISCV_SUB8:
+	case R_RISCV_SUB16:
+	case R_RISCV_SUB32:
+	case R_RISCV_SUB64:
+	  {
+	    bfd_vma old_value = bfd_get (howto->bitsize, input_bfd,
+					 contents + rel->r_offset);
+	    relocation = old_value - relocation;
+	  }
+	  break;
+
+	case R_RISCV_CALL_PLT:
+	case R_RISCV_CALL:
+	case R_RISCV_JAL:
+	case R_RISCV_RVC_JUMP:
+	  if (bfd_link_pic (info) && h != NULL && h->plt.offset != MINUS_ONE)
+	    {
+	      /* Refer to the PLT entry.  */
+	      relocation = sec_addr (htab->elf.splt) + h->plt.offset;
+	      unresolved_reloc = FALSE;
+	    }
+	  break;
+
+	case R_RISCV_TPREL_HI20:
+	  relocation = tpoff (info, relocation);
+	  break;
+
+	case R_RISCV_TPREL_LO12_I:
+	case R_RISCV_TPREL_LO12_S:
+	  relocation = tpoff (info, relocation);
+	  break;
+
+	case R_RISCV_TPREL_I:
+	case R_RISCV_TPREL_S:
+	  relocation = tpoff (info, relocation);
+	  if (VALID_ITYPE_IMM (relocation + rel->r_addend))
+	    {
+	      /* We can use tp as the base register.  */
+	      bfd_vma insn = bfd_get_32 (input_bfd, contents + rel->r_offset);
+	      insn &= ~(OP_MASK_RS1 << OP_SH_RS1);
+	      insn |= X_TP << OP_SH_RS1;
+	      bfd_put_32 (input_bfd, insn, contents + rel->r_offset);
+	    }
+	  else
+	    r = bfd_reloc_overflow;
+	  break;
+
+	case R_RISCV_GPREL_I:
+	case R_RISCV_GPREL_S:
+	  {
+	    bfd_vma gp = riscv_global_pointer_value (info);
+	    bfd_boolean x0_base = VALID_ITYPE_IMM (relocation + rel->r_addend);
+	    if (x0_base || VALID_ITYPE_IMM (relocation + rel->r_addend - gp))
+	      {
+		/* We can use x0 or gp as the base register.  */
+		bfd_vma insn = bfd_get_32 (input_bfd, contents + rel->r_offset);
+		insn &= ~(OP_MASK_RS1 << OP_SH_RS1);
+		if (!x0_base)
+		  {
+		    rel->r_addend -= gp;
+		    insn |= X_GP << OP_SH_RS1;
+		  }
+		bfd_put_32 (input_bfd, insn, contents + rel->r_offset);
+	      }
+	    else
+	      r = bfd_reloc_overflow;
+	    break;
+	  }
+
+	case R_RISCV_PCREL_HI20:
+	  /* Remember the target against this PC so the matching low-part
+	     reloc can find it; a failed record is reported as an
+	     overflow.  */
+	  if (!riscv_record_pcrel_hi_reloc (&pcrel_relocs, pc,
+					    relocation + rel->r_addend))
+	    r = bfd_reloc_overflow;
+	  break;
+
+	case R_RISCV_PCREL_LO12_I:
+	case R_RISCV_PCREL_LO12_S:
+	  /* Low parts are only queued here and are applied after the loop
+	     by riscv_resolve_pcrel_lo_relocs.  RELOCATION is stored as the
+	     address used to look up the recorded high part.  */
+	  if (riscv_record_pcrel_lo_reloc (&pcrel_relocs, input_section, info,
+					   howto, rel, relocation, name,
+					   contents))
+	    continue;
+	  r = bfd_reloc_overflow;
+	  break;
+
+	case R_RISCV_TLS_DTPREL32:
+	case R_RISCV_TLS_DTPREL64:
+	  relocation = dtpoff (info, relocation);
+	  break;
+
+	case R_RISCV_32:
+	case R_RISCV_64:
+	  if ((input_section->flags & SEC_ALLOC) == 0)
+	    break;
+
+	  if ((bfd_link_pic (info)
+	       && (h == NULL
+		   || ELF_ST_VISIBILITY (h->other) == STV_DEFAULT
+		   || h->root.type != bfd_link_hash_undefweak)
+	       && (! howto->pc_relative
+		   || !SYMBOL_CALLS_LOCAL (info, h)))
+	      || (!bfd_link_pic (info)
+		  && h != NULL
+		  && h->dynindx != -1
+		  && !h->non_got_ref
+		  && ((h->def_dynamic
+		       && !h->def_regular)
+		      || h->root.type == bfd_link_hash_undefweak
+		      || h->root.type == bfd_link_hash_undefined)))
+	    {
+	      Elf_Internal_Rela outrel;
+	      bfd_boolean skip_static_relocation, skip_dynamic_relocation;
+
+	      /* When generating a shared object, these relocations
+		 are copied into the output file to be resolved at run
+		 time.  */
+
+	      outrel.r_offset =
+		_bfd_elf_section_offset (output_bfd, info, input_section,
+					 rel->r_offset);
+	      skip_static_relocation = outrel.r_offset != (bfd_vma) -2;
+	      skip_dynamic_relocation = outrel.r_offset >= (bfd_vma) -2;
+	      outrel.r_offset += sec_addr (input_section);
+
+	      if (skip_dynamic_relocation)
+		memset (&outrel, 0, sizeof outrel);
+	      else if (h != NULL && h->dynindx != -1
+		       && !(bfd_link_pic (info)
+			    && SYMBOLIC_BIND (info, h)
+			    && h->def_regular))
+		{
+		  outrel.r_info = ELF32_R_INFO (h->dynindx, r_type);
+		  outrel.r_addend = rel->r_addend;
+		}
+	      else
+		{
+		  outrel.r_info = ELF32_R_INFO (0, R_RISCV_RELATIVE);
+		  outrel.r_addend = relocation + rel->r_addend;
+		}
+
+	      riscv_elf_append_rela (output_bfd, sreloc, &outrel);
+	      if (skip_static_relocation)
+		continue;
+	    }
+	  break;
+
+	case R_RISCV_TLS_GOT_HI20:
+	  is_ie = TRUE;
+	  /* Fall through.  */
+
+	case R_RISCV_TLS_GD_HI20:
+	  /* Fetch the GOT offset and set its low bit, which marks the
+	     entry as initialized for later relocs against the same
+	     symbol.  */
+	  if (h != NULL)
+	    {
+	      off = h->got.offset;
+	      h->got.offset |= 1;
+	    }
+	  else
+	    {
+	      off = local_got_offsets[r_symndx];
+	      local_got_offsets[r_symndx] |= 1;
+	    }
+
+	  tls_type = _bfd_riscv_elf_tls_type (input_bfd, h, r_symndx);
+	  BFD_ASSERT (tls_type & (GOT_TLS_IE | GOT_TLS_GD));
+	  /* If this symbol is referenced by both GD and IE TLS, the IE
+	     reference's GOT slot follows the GD reference's slots.  */
+	  ie_off = 0;
+	  if ((tls_type & GOT_TLS_GD) && (tls_type & GOT_TLS_IE))
+	    ie_off = 2 * GOT_ENTRY_SIZE;
+
+	  if ((off & 1) != 0)
+	    off &= ~1;
+	  else
+	    {
+	      Elf_Internal_Rela outrel;
+	      int indx = 0;
+	      bfd_boolean need_relocs = FALSE;
+
+	      if (htab->elf.srelgot == NULL)
+		abort ();
+
+	      if (h != NULL)
+		{
+		  bfd_boolean dyn, pic;
+		  dyn = htab->elf.dynamic_sections_created;
+		  pic = bfd_link_pic (info);
+
+		  if (WILL_CALL_FINISH_DYNAMIC_SYMBOL (dyn, pic, h)
+		      && (!pic || !SYMBOL_REFERENCES_LOCAL (info, h)))
+		    indx = h->dynindx;
+		}
+
+	      /* The GOT entries have not been initialized yet.  Do it
+	         now, and emit any relocations.  */
+	      if ((bfd_link_pic (info) || indx != 0)
+		  && (h == NULL
+		      || ELF_ST_VISIBILITY (h->other) == STV_DEFAULT
+		      || h->root.type != bfd_link_hash_undefweak))
+		    need_relocs = TRUE;
+
+	      if (tls_type & GOT_TLS_GD)
+		{
+		  if (need_relocs)
+		    {
+		      outrel.r_offset = sec_addr (htab->elf.sgot) + off;
+		      outrel.r_addend = 0;
+		      outrel.r_info = ELF32_R_INFO (indx, R_RISCV_TLS_DTPMOD32);
+		      bfd_put_32 (output_bfd, 0,
+				  htab->elf.sgot->contents + off);
+		      riscv_elf_append_rela (output_bfd, htab->elf.srelgot, &outrel);
+		      if (indx == 0)
+			{
+			  BFD_ASSERT (! unresolved_reloc);
+			  bfd_put_32 (output_bfd,
+				      dtpoff (info, relocation),
+				      (htab->elf.sgot->contents + off +
+				       RISCV_ELF_WORD_BYTES));
+			}
+		      else
+			{
+			  bfd_put_32 (output_bfd, 0,
+				      (htab->elf.sgot->contents + off +
+				       RISCV_ELF_WORD_BYTES));
+			  outrel.r_info = ELF32_R_INFO (indx, R_RISCV_TLS_DTPREL32);
+			  outrel.r_offset += RISCV_ELF_WORD_BYTES;
+			  riscv_elf_append_rela (output_bfd, htab->elf.srelgot, &outrel);
+			}
+		    }
+		  else
+		    {
+		      /* If we are not emitting relocations for a
+			 general dynamic reference, then we must be in a
+			 static link or an executable link with the
+			 symbol binding locally.  Mark it as belonging
+			 to module 1, the executable.  */
+		      bfd_put_32 (output_bfd, 1,
+				  htab->elf.sgot->contents + off);
+		      bfd_put_32 (output_bfd,
+				  dtpoff (info, relocation),
+				  (htab->elf.sgot->contents + off +
+				   RISCV_ELF_WORD_BYTES));
+		   }
+		}
+
+	      if (tls_type & GOT_TLS_IE)
+		{
+		  if (need_relocs)
+		    {
+		      bfd_put_32 (output_bfd, 0,
+				  htab->elf.sgot->contents + off + ie_off);
+		      outrel.r_offset = sec_addr (htab->elf.sgot)
+				       + off + ie_off;
+		      outrel.r_addend = 0;
+		      if (indx == 0)
+			outrel.r_addend = tpoff (info, relocation);
+		      outrel.r_info = ELF32_R_INFO (indx, R_RISCV_TLS_TPREL32);
+		      riscv_elf_append_rela (output_bfd, htab->elf.srelgot, &outrel);
+		    }
+		  else
+		    {
+		      bfd_put_32 (output_bfd, tpoff (info, relocation),
+				  htab->elf.sgot->contents + off + ie_off);
+		    }
+		}
+	    }
+
+	  BFD_ASSERT (off < (bfd_vma) -2);
+	  relocation = sec_addr (htab->elf.sgot) + off + (is_ie ? ie_off : 0);
+	  if (!riscv_record_pcrel_hi_reloc (&pcrel_relocs, pc, relocation))
+	    r = bfd_reloc_overflow;
+	  unresolved_reloc = FALSE;
+	  break;
+
+	default:
+	  r = bfd_reloc_notsupported;
+	}
+
+      /* Dynamic relocs are not propagated for SEC_DEBUGGING sections
+	 because such sections are not SEC_ALLOC and thus ld.so will
+	 not process them.  */
+      if (unresolved_reloc
+	  && !((input_section->flags & SEC_DEBUGGING) != 0
+	       && h->def_dynamic)
+	  && _bfd_elf_section_offset (output_bfd, info, input_section,
+				      rel->r_offset) != (bfd_vma) -1)
+	{
+	  (*_bfd_error_handler)
+	    (_("%B(%A+0x%lx): unresolvable %s relocation against symbol `%s'"),
+	     input_bfd,
+	     input_section,
+	     (long) rel->r_offset,
+	     howto->name,
+	     h->root.root.string);
+	  continue;
+	}
+
+      /* Apply the relocation unless a case above already flagged an
+	 error in R.  */
+      if (r == bfd_reloc_ok)
+	r = perform_relocation (howto, rel, relocation, input_section,
+				input_bfd, contents, IsImport);
+
+      switch (r)
+	{
+	case bfd_reloc_ok:
+	  continue;
+
+	case bfd_reloc_overflow:
+	  info->callbacks->reloc_overflow
+	    (info, (h ? &h->root : NULL), name, howto->name,
+	     (bfd_vma) 0, input_bfd, input_section, rel->r_offset);
+	  break;
+
+	case bfd_reloc_undefined:
+	  info->callbacks->undefined_symbol
+	    (info, name, input_bfd, input_section, rel->r_offset,
+	     TRUE);
+	  break;
+
+	case bfd_reloc_outofrange:
+	  msg = _("internal error: out of range error");
+	  break;
+
+	case bfd_reloc_notsupported:
+	  msg = _("internal error: unsupported relocation error");
+	  break;
+
+	case bfd_reloc_dangerous:
+	  msg = _("internal error: dangerous relocation");
+	  break;
+
+	default:
+	  msg = _("internal error: unknown error");
+	  break;
+	}
+
+      if (msg)
+	info->callbacks->warning
+	  (info, msg, name, input_bfd, input_section, rel->r_offset);
+      /* Every status other than bfd_reloc_ok stops processing here, and
+	 the function returns with RET still FALSE.  */
+      goto out;
+    }
+
+  /* The loop ran to completion; now apply the queued PC-relative
+     low-part relocs.  */
+  ret = riscv_resolve_pcrel_lo_relocs (&pcrel_relocs);
+out:
+  riscv_free_pcrel_relocs (&pcrel_relocs);
+  return ret;
+}
+
+/* Finish up dynamic symbol handling for H.  Depending on what was
+   allocated for the symbol this fills in its PLT entry (together with
+   the .got.plt slot and the .rela.plt reloc), emits the dynamic reloc
+   for its GOT entry, and emits its copy reloc.  SYM is the output
+   symbol, which may be adjusted.  Always returns TRUE.  */
+
+static bfd_boolean
+riscv_elf_finish_dynamic_symbol (bfd *output_bfd,
+				 struct bfd_link_info *info,
+				 struct elf_link_hash_entry *h,
+				 Elf_Internal_Sym *sym)
+{
+  struct riscv_elf_link_hash_table *htab = riscv_elf_hash_table (info);
+  const struct elf_backend_data *bed = get_elf_backend_data (output_bfd);
+
+  if (h->plt.offset != (bfd_vma) -1)
+    {
+      /* A PLT entry was allocated for this symbol.  */
+      uint32_t insns[PLT_ENTRY_INSNS];
+      Elf_Internal_Rela jump_slot;
+      bfd_vma plt_start, slot_index, gotplt_entry, n;
+      bfd_byte *dest;
+
+      BFD_ASSERT (h->dynindx != -1);
+
+      /* Address of the PLT header, index of this entry, and address of
+	 the matching .got.plt entry.  */
+      plt_start = sec_addr (htab->elf.splt);
+      slot_index = (h->plt.offset - PLT_HEADER_SIZE) / PLT_ENTRY_SIZE;
+      gotplt_entry = riscv_elf_got_plt_val (slot_index, info);
+
+      /* Build the PLT entry and store it at its place in .plt.  */
+      riscv_make_plt_entry (gotplt_entry, plt_start + h->plt.offset, insns);
+      dest = htab->elf.splt->contents + h->plt.offset;
+      for (n = 0; n < PLT_ENTRY_INSNS; n++)
+	bfd_put_32 (output_bfd, insns[n], dest + 4*n);
+
+      /* The .got.plt entry starts out holding the address of .plt.  */
+      dest = htab->elf.sgotplt->contents
+	     + (gotplt_entry - sec_addr (htab->elf.sgotplt));
+      bfd_put_32 (output_bfd, sec_addr (htab->elf.splt), dest);
+
+      /* Write the corresponding reloc into .rela.plt.  */
+      jump_slot.r_info = ELF32_R_INFO (h->dynindx, R_RISCV_JUMP_SLOT);
+      jump_slot.r_offset = gotplt_entry;
+      jump_slot.r_addend = 0;
+      dest = htab->elf.srelplt->contents
+	     + slot_index * sizeof (Elf32_External_Rela);
+      bed->s->swap_reloca_out (output_bfd, &jump_slot, dest);
+
+      if (!h->def_regular)
+	{
+	  /* Present the symbol as undefined instead of as defined in
+	     .plt, keeping its value...  */
+	  sym->st_shndx = SHN_UNDEF;
+	  /* ...unless it is weak: a non-zero value would make the PLT
+	     entry act as a definition, so a symbol defined nowhere
+	     would never compare equal to NULL.  */
+	  if (!h->ref_regular_nonweak)
+	    sym->st_value = 0;
+	}
+    }
+
+  if (h->got.offset != (bfd_vma) -1
+      && !(riscv_elf_hash_entry (h)->tls_type & (GOT_TLS_GD | GOT_TLS_IE)))
+    {
+      /* The symbol owns a (non-TLS) GOT entry.  */
+      asection *got = htab->elf.sgot;
+      asection *relgot = htab->elf.srelgot;
+      Elf_Internal_Rela got_rela;
+      bfd_vma slot_off;
+
+      BFD_ASSERT (got != NULL && relgot != NULL);
+
+      /* Bit 0 of the offset is a flag, not part of the offset.  */
+      slot_off = h->got.offset & ~(bfd_vma) 1;
+      got_rela.r_offset = sec_addr (got) + slot_off;
+
+      if (bfd_link_pic (info)
+	  && (info->symbolic || h->dynindx == -1)
+	  && h->def_regular)
+	{
+	  /* -Bsymbolic link with a locally defined symbol, or a symbol
+	     forced local by a version file: a RELATIVE reloc is all that
+	     is needed.  relocate_section has already initialized the GOT
+	     entry.  */
+	  asection *def_sec = h->root.u.def.section;
+
+	  got_rela.r_info = ELF32_R_INFO (0, R_RISCV_RELATIVE);
+	  got_rela.r_addend = (h->root.u.def.value
+			       + def_sec->output_section->vma
+			       + def_sec->output_offset);
+	}
+      else
+	{
+	  BFD_ASSERT (h->dynindx != -1);
+	  got_rela.r_info = ELF32_R_INFO (h->dynindx, R_RISCV_32);
+	  got_rela.r_addend = 0;
+	}
+
+      bfd_put_32 (output_bfd, 0, got->contents + slot_off);
+      riscv_elf_append_rela (output_bfd, relgot, &got_rela);
+    }
+
+  if (h->needs_copy)
+    {
+      /* The symbol needs a copy reloc.  */
+      asection *def_sec = h->root.u.def.section;
+      asection *rel_sec;
+      Elf_Internal_Rela copy_rela;
+
+      BFD_ASSERT (h->dynindx != -1);
+
+      rel_sec = (def_sec == htab->elf.sdynrelro
+		 ? htab->elf.sreldynrelro
+		 : htab->elf.srelbss);
+
+      copy_rela.r_offset = sec_addr (def_sec) + h->root.u.def.value;
+      copy_rela.r_info = ELF32_R_INFO (h->dynindx, R_RISCV_COPY);
+      copy_rela.r_addend = 0;
+      riscv_elf_append_rela (output_bfd, rel_sec, &copy_rela);
+    }
+
+  /* A few specially defined symbols are made absolute.  */
+  if (h == htab->elf.hdynamic || h == htab->elf.hgot || h == htab->elf.hplt)
+    sym->st_shndx = SHN_ABS;
+
+  return TRUE;
+}
+
+/* Finish up the dynamic sections: walk the entries of SDYN and rewrite
+   the DT_PLTGOT, DT_JMPREL and DT_PLTRELSZ ones with the final address
+   of .got.plt, the final address of .rela.plt and the size of .rela.plt
+   respectively.  Other entries are left untouched.  Always returns
+   TRUE.  */
+
+static bfd_boolean
+riscv_finish_dyn (bfd *output_bfd, struct bfd_link_info *info,
+		  bfd *dynobj, asection *sdyn)
+{
+  struct riscv_elf_link_hash_table *htab = riscv_elf_hash_table (info);
+  const struct elf_backend_data *bed = get_elf_backend_data (output_bfd);
+  size_t entry_size = bed->s->sizeof_dyn;
+  bfd_byte *cursor = sdyn->contents;
+  bfd_byte *limit = sdyn->contents + sdyn->size;
+
+  for (; cursor < limit; cursor += entry_size)
+    {
+      Elf_Internal_Dyn dyn;
+      asection *s;
+
+      bed->s->swap_dyn_in (dynobj, cursor, &dyn);
+
+      if (dyn.d_tag == DT_PLTGOT)
+	{
+	  s = htab->elf.sgotplt;
+	  dyn.d_un.d_ptr = s->output_section->vma + s->output_offset;
+	}
+      else if (dyn.d_tag == DT_JMPREL)
+	{
+	  s = htab->elf.srelplt;
+	  dyn.d_un.d_ptr = s->output_section->vma + s->output_offset;
+	}
+      else if (dyn.d_tag == DT_PLTRELSZ)
+	{
+	  s = htab->elf.srelplt;
+	  dyn.d_un.d_val = s->size;
+	}
+      else
+	/* Not a tag we patch; leave the entry as it is.  */
+	continue;
+
+      bed->s->swap_dyn_out (output_bfd, &dyn, cursor);
+    }
+
+  return TRUE;
+}
+
+/* Fill in the fixed parts of the dynamic sections: patch .dynamic and
+   write the PLT header when dynamic sections were created, then
+   initialize the reserved entries of .got.plt and .got and set the
+   entry sizes of their output sections.  Returns FALSE if .got.plt was
+   assigned to the absolute section (i.e. discarded) or if patching
+   .dynamic failed.  */
+
+static bfd_boolean
+riscv_elf_finish_dynamic_sections (bfd *output_bfd,
+				   struct bfd_link_info *info)
+{
+  struct riscv_elf_link_hash_table *htab = riscv_elf_hash_table (info);
+  bfd *dynobj;
+  asection *sdyn;
+
+  BFD_ASSERT (htab != NULL);
+  dynobj = htab->elf.dynobj;
+  sdyn = bfd_get_linker_section (dynobj, ".dynamic");
+
+  if (elf_hash_table (info)->dynamic_sections_created)
+    {
+      asection *plt = htab->elf.splt;
+      bfd_boolean status;
+
+      BFD_ASSERT (plt != NULL && sdyn != NULL);
+
+      status = riscv_finish_dyn (output_bfd, info, dynobj, sdyn);
+      if (status != TRUE)
+	return status;
+
+      /* Write the PLT header at the start of a non-empty .plt.  */
+      if (plt->size > 0)
+	{
+	  uint32_t header[PLT_HEADER_INSNS];
+	  int n = 0;
+
+	  riscv_make_plt_header (sec_addr (htab->elf.sgotplt),
+				 sec_addr (plt), header);
+
+	  while (n < PLT_HEADER_INSNS)
+	    {
+	      bfd_put_32 (output_bfd, header[n], plt->contents + 4*n);
+	      n++;
+	    }
+	}
+
+      elf_section_data (plt->output_section)->this_hdr.sh_entsize
+	= PLT_ENTRY_SIZE;
+    }
+
+  if (htab->elf.sgotplt)
+    {
+      asection *gotplt = htab->elf.sgotplt;
+      asection *out = gotplt->output_section;
+
+      if (bfd_is_abs_section (out))
+	{
+	  (*_bfd_error_handler)
+	    (_("discarded output section: `%A'"), gotplt);
+	  return FALSE;
+	}
+
+      if (gotplt->size > 0)
+	{
+	  /* The first two .got.plt entries are reserved for the dynamic
+	     linker.  */
+	  bfd_put_32 (output_bfd, (bfd_vma) -1, gotplt->contents);
+	  bfd_put_32 (output_bfd, (bfd_vma) 0,
+		      gotplt->contents + GOT_ENTRY_SIZE);
+	}
+
+      elf_section_data (out)->this_hdr.sh_entsize = GOT_ENTRY_SIZE;
+    }
+
+  if (htab->elf.sgot)
+    {
+      asection *got = htab->elf.sgot;
+      asection *out = got->output_section;
+
+      if (got->size > 0)
+	{
+	  /* The first .got entry holds the address of .dynamic, or zero
+	     when there is no such section.  */
+	  bfd_vma dynamic_addr = 0;
+
+	  if (sdyn)
+	    dynamic_addr = sec_addr (sdyn);
+	  bfd_put_32 (output_bfd, dynamic_addr, got->contents);
+	}
+
+      elf_section_data (out)->this_hdr.sh_entsize = GOT_ENTRY_SIZE;
+    }
+
+  return TRUE;
+}
+
+/* Return the address of the Ith PLT stub in section PLT: the stubs
+   follow the PLT header and are PLT_ENTRY_SIZE bytes each.  REL is
+   not used.  */
+
+static bfd_vma
+riscv_elf_plt_sym_val (bfd_vma i, const asection *plt,
+		       const arelent *rel ATTRIBUTE_UNUSED)
+{
+  bfd_vma first_stub = plt->vma + PLT_HEADER_SIZE;
+
+  return first_stub + i * PLT_ENTRY_SIZE;
+}
+
+/* Classify the dynamic reloc RELA by its type: RELATIVE, JUMP_SLOT and
+   COPY relocs get their dedicated classes, everything else is
+   reloc_class_normal.  INFO and REL_SEC are not used.  */
+
+static enum elf_reloc_type_class
+riscv_reloc_type_class (const struct bfd_link_info *info ATTRIBUTE_UNUSED,
+			const asection *rel_sec ATTRIBUTE_UNUSED,
+			const Elf_Internal_Rela *rela)
+{
+  int r_type = ELF32_R_TYPE (rela->r_info);
+
+  if (r_type == R_RISCV_RELATIVE)
+    return reloc_class_relative;
+  if (r_type == R_RISCV_JUMP_SLOT)
+    return reloc_class_plt;
+  if (r_type == R_RISCV_COPY)
+    return reloc_class_copy;
+
+  return reloc_class_normal;
+}
+
+/* Merge backend specific data from an object file to the output
+   object file when linking.
+
+   IBFD is the input file being merged and INFO->output_bfd the output.
+   Returns TRUE when the input is compatible with the output, or when
+   either BFD is not a RISC-V ELF file (in which case nothing is
+   checked).  Returns FALSE after reporting an error when the targets
+   differ, when the object attributes cannot be merged, or when the
+   float ABIs differ.  */
+
+static bfd_boolean
+_bfd_riscv_elf_merge_private_bfd_data (bfd *ibfd, struct bfd_link_info *info)
+{
+  bfd *obfd = info->output_bfd;
+  flagword new_flags, old_flags;
+
+  if (!is_riscv_elf (ibfd) || !is_riscv_elf (obfd))
+    return TRUE;
+
+  /* Only look at the ELF headers once both BFDs are known to be RISC-V
+     ELF files; for any other kind of BFD the ELF header accessor does
+     not refer to valid data.  */
+  new_flags = elf_elfheader (ibfd)->e_flags;
+  old_flags = elf_elfheader (obfd)->e_flags;
+
+  if (strcmp (bfd_get_target (ibfd), bfd_get_target (obfd)) != 0)
+    {
+      (*_bfd_error_handler)
+	(_("%B: ABI is incompatible with that of the selected emulation:\n"
+	   "  target emulation `%s' does not match `%s'"),
+	 ibfd, bfd_get_target (ibfd), bfd_get_target (obfd));
+      return FALSE;
+    }
+
+  if (!_bfd_elf_merge_object_attributes (ibfd, info))
+    return FALSE;
+
+  /* The first input simply donates its flags to the output.  */
+  if (! elf_flags_init (obfd))
+    {
+      elf_flags_init (obfd) = TRUE;
+      elf_elfheader (obfd)->e_flags = new_flags;
+      return TRUE;
+    }
+
+  /* Disallow linking different float ABIs.  */
+  if ((old_flags ^ new_flags) & EF_RISCV_FLOAT_ABI)
+    {
+      (*_bfd_error_handler)
+	(_("%B: can't link hard-float modules with soft-float modules"), ibfd);
+      goto fail;
+    }
+
+  /* Allow linking RVC and non-RVC, and keep the RVC flag.  */
+  elf_elfheader (obfd)->e_flags |= new_flags & EF_RISCV_RVC;
+
+  return TRUE;
+
+fail:
+  bfd_set_error (bfd_error_bad_value);
+  return FALSE;
+}
+
+/* Delete some bytes from a section while relaxing.
+
+   Removes COUNT bytes at offset ADDR of section SEC in ABFD: the tail of
+   the section contents is moved down, SEC->size is reduced, and the
+   offsets of relocs and the values of local and global symbols located
+   after ADDR are shifted down by COUNT.  A symbol that starts at or
+   before ADDR and ends inside the moved bytes has its size reduced
+   instead.  Always returns TRUE.  */
+
+static bfd_boolean
+riscv_relax_delete_bytes (bfd *abfd, asection *sec, bfd_vma addr, size_t count)
+{
+  unsigned int i, symcount;
+  bfd_vma toaddr = sec->size;
+  struct elf_link_hash_entry **sym_hashes = elf_sym_hashes (abfd);
+  Elf_Internal_Shdr *symtab_hdr = &elf_tdata (abfd)->symtab_hdr;
+  unsigned int sec_shndx = _bfd_elf_section_from_bfd_section (abfd, sec);
+  struct bfd_elf_section_data *data = elf_section_data (sec);
+  bfd_byte *contents = data->this_hdr.contents;
+
+  /* Actually delete the bytes.  */
+  sec->size -= count;
+  memmove (contents + addr, contents + addr + count, toaddr - addr - count);
+
+  /* Adjust the location of all of the relocs.  Note that we need not
+     adjust the addends, since all PC-relative references must be against
+     symbols, which we will adjust below.  */
+  for (i = 0; i < sec->reloc_count; i++)
+    if (data->relocs[i].r_offset > addr && data->relocs[i].r_offset < toaddr)
+      data->relocs[i].r_offset -= count;
+
+  /* Adjust the local symbols defined in this section.  */
+  for (i = 0; i < symtab_hdr->sh_info; i++)
+    {
+      Elf_Internal_Sym *sym = (Elf_Internal_Sym *) symtab_hdr->contents + i;
+      if (sym->st_shndx == sec_shndx)
+	{
+	  /* If the symbol is in the range of memory we just moved, we
+	     have to adjust its value.  */
+	  if (sym->st_value > addr && sym->st_value <= toaddr)
+	    sym->st_value -= count;
+
+	  /* If the symbol *spans* the bytes we just deleted (i.e. its
+	     *end* is in the moved bytes but its *start* isn't), then we
+	     must adjust its size.
+
+	     This test must see the original st_value: a symbol that
+	     started right after the deleted bytes has just been moved
+	     down to ADDR and would otherwise be shrunk as well.  A symbol
+	     never needs both adjustments, hence the else.  */
+	  else if (sym->st_value <= addr
+		   && sym->st_value + sym->st_size > addr
+		   && sym->st_value + sym->st_size <= toaddr)
+	    sym->st_size -= count;
+	}
+    }
+
+  /* Now adjust the global symbols defined in this section.  */
+  symcount = ((symtab_hdr->sh_size / sizeof (Elf32_External_Sym))
+	      - symtab_hdr->sh_info);
+
+  for (i = 0; i < symcount; i++)
+    {
+      struct elf_link_hash_entry *sym_hash = sym_hashes[i];
+
+      if ((sym_hash->root.type == bfd_link_hash_defined
+	   || sym_hash->root.type == bfd_link_hash_defweak)
+	  && sym_hash->root.u.def.section == sec)
+	{
+	  /* As above, adjust the value if needed.  */
+	  if (sym_hash->root.u.def.value > addr
+	      && sym_hash->root.u.def.value <= toaddr)
+	    sym_hash->root.u.def.value -= count;
+
+	  /* As above, adjust the size if needed, but only when the value
+	     was not moved.  */
+	  else if (sym_hash->root.u.def.value <= addr
+		   && sym_hash->root.u.def.value + sym_hash->size > addr
+		   && sym_hash->root.u.def.value + sym_hash->size <= toaddr)
+	    sym_hash->size -= count;
+	}
+    }
+
+  return TRUE;
+}
+
+/* Signature shared by the per-reloc relaxation routines dispatched from
+   _bfd_riscv_relax_section.  Arguments, in order: the input BFD, the
+   section being relaxed, the section of the referenced symbol, the link
+   info, the reloc, the symbol value (addend included), the maximum
+   alignment, the reserve size, the import flag and the "relax again"
+   out-flag.  A FALSE return makes the caller abandon the section.  */
+typedef bfd_boolean (*relax_func_t) (bfd *, asection *, asection *,
+				     struct bfd_link_info *,
+				     Elf_Internal_Rela *,
+				     bfd_vma, bfd_vma, bfd_vma, bfd_boolean, bfd_boolean *);
+
+/* Relax AUIPC + JALR into JAL, C.J/C.JAL or a JALR relative to x0.
+
+   ABFD and SEC hold the call site, REL is its call reloc and SYMVAL the
+   address being called (addend included).  SYM_SEC is the section that
+   defines the target; when it lands in a different output section than
+   SEC, MAX_ALIGNMENT is added to the distance as slack.  LINK_INFO tells
+   whether the link is PIC, IS_IMPORT marks targets the caller classified
+   as imports (these are never shrunk to a compressed jump), and
+   RESERVE_SIZE is unused.
+
+   On success the reloc type and the first instruction are rewritten, the
+   now redundant bytes are deleted and *AGAIN is set so that another
+   relaxation pass runs.  Returns TRUE when the call is left untouched,
+   otherwise the result of riscv_relax_delete_bytes.  */
+
+static bfd_boolean
+_bfd_riscv_relax_call (bfd *abfd, asection *sec, asection *sym_sec,
+		       struct bfd_link_info *link_info,
+		       Elf_Internal_Rela *rel,
+		       bfd_vma symval,
+		       bfd_vma max_alignment,
+		       bfd_vma reserve_size ATTRIBUTE_UNUSED,
+		       bfd_boolean is_import,
+		       bfd_boolean *again)
+{
+  bfd_byte *contents = elf_section_data (sec)->this_hdr.contents;
+  bfd_signed_vma foff = symval - (sec_addr (sec) + rel->r_offset);
+  bfd_boolean near_zero = (symval + RISCV_IMM_REACH/2) < RISCV_IMM_REACH;
+  bfd_vma auipc, jalr;
+  int rd, r_type, len = 4, rvc = elf_elfheader (abfd)->e_flags & EF_RISCV_RVC;
+  /* Compile-time switch: while TRUE, imports are relaxed like any other
+     call target.  */
+  static bfd_boolean Mem20Range = TRUE;
+
+  /* If the call crosses section boundaries, an alignment directive could
+     cause the PC-relative offset to later increase.  */
+  if (VALID_UJTYPE_IMM (foff) && sym_sec->output_section != sec->output_section)
+    foff += (foff < 0 ? -max_alignment : max_alignment);
+
+  /* See if this function call can be shortened.  A stale copy of the
+     pre-import test used to sit in front of this one; the accidental
+     nesting made the import clause unreachable, so only this single
+     test is kept.  */
+  if ((is_import && !Mem20Range)
+      || (!VALID_UJTYPE_IMM (foff)
+	  && !(!bfd_link_pic (link_info) && near_zero)))
+    return TRUE;
+
+  /* Shorten the function call.  */
+  BFD_ASSERT (rel->r_offset + 8 <= sec->size);
+
+  auipc = bfd_get_32 (abfd, contents + rel->r_offset);
+  jalr = bfd_get_32 (abfd, contents + rel->r_offset + 4);
+  rd = (jalr >> OP_SH_RD) & OP_MASK_RD;
+  /* A compressed jump needs RVC, a short enough offset, a 32-bit target
+     and a non-import destination.  */
+  rvc = rvc && VALID_RVC_J_IMM (foff) && ARCH_SIZE == 32;
+  rvc = rvc && !is_import;
+
+  if (rvc && (rd == 0 || rd == X_RA))
+    {
+      /* Relax to C.J[AL] rd, addr.  */
+      r_type = R_RISCV_RVC_JUMP;
+      auipc = rd == 0 ? MATCH_C_J : MATCH_C_JAL;
+      len = 2;
+    }
+  else if (VALID_UJTYPE_IMM (foff) || is_import)
+    {
+      /* Relax to JAL rd, addr.  */
+      r_type = R_RISCV_JAL;
+      auipc = MATCH_JAL | (rd << OP_SH_RD);
+    }
+  else /* near_zero */
+    {
+      /* Relax to JALR rd, x0, addr.  */
+      r_type = R_RISCV_LO12_I;
+      auipc = MATCH_JALR | (rd << OP_SH_RD);
+    }
+
+  /* Replace the R_RISCV_CALL reloc.  */
+  rel->r_info = ELF32_R_INFO (ELF32_R_SYM (rel->r_info), r_type);
+  /* Replace the AUIPC.  */
+  bfd_put (8 * len, abfd, auipc, contents + rel->r_offset);
+
+  /* Delete unnecessary JALR.  */
+  *again = TRUE;
+  return riscv_relax_delete_bytes (abfd, sec, rel->r_offset + len, 8 - len);
+}
+
+/* Return the largest alignment, in bytes, requested by any section of the
+   BFD that owns SEC's output section.  */
+
+static bfd_vma
+_bfd_riscv_get_max_alignment (asection *sec)
+{
+  asection *cur = sec->output_section->owner->sections;
+  unsigned int widest_power = 0;
+
+  /* Walk the section list, remembering the biggest alignment power.  */
+  while (cur != NULL)
+    {
+      widest_power = (cur->alignment_power > widest_power
+		      ? cur->alignment_power
+		      : widest_power);
+      cur = cur->next;
+    }
+
+  return ((bfd_vma) 1) << widest_power;
+}
+
+/* Relax non-PIC global variable references.
+
+   REL is a reloc in SEC of ABFD and SYMVAL the address it refers to; the
+   in-range path below handles R_RISCV_HI20, R_RISCV_LO12_I and
+   R_RISCV_LO12_S and aborts on any other type.  Nothing is done for
+   imports (IS_IMPORT) or for symbols whose section SYM_SEC is mergeable
+   or contains code.  When SYMVAL is reachable with an I-type immediate
+   from x0 or, allowing MAX_ALIGNMENT plus RESERVE_SIZE of slack, from
+   gp, the LO12 relocs become GPREL relocs and the LUI is deleted.
+   Otherwise, with RVC enabled, an eligible LUI is shrunk to C.LUI.
+   *AGAIN is set whenever bytes are deleted.  Returns the result of
+   riscv_relax_delete_bytes when bytes are removed, TRUE otherwise.  */
+
+static bfd_boolean
+_bfd_riscv_relax_lui (bfd *abfd,
+		      asection *sec,
+		      asection *sym_sec,
+		      struct bfd_link_info *link_info,
+		      Elf_Internal_Rela *rel,
+		      bfd_vma symval,
+		      bfd_vma max_alignment,
+		      bfd_vma reserve_size,
+		      bfd_boolean is_import,
+		      bfd_boolean *again)
+{
+  bfd_byte *contents = elf_section_data (sec)->this_hdr.contents;
+  bfd_vma gp = riscv_global_pointer_value (link_info);
+  int use_rvc = elf_elfheader (abfd)->e_flags & EF_RISCV_RVC;
+
+  /* Mergeable symbols and code might later move out of range.  */
+  if (is_import || (sym_sec->flags & (SEC_MERGE | SEC_CODE)))
+    return TRUE;
+
+  BFD_ASSERT (rel->r_offset + 4 <= sec->size);
+
+  if (gp)
+    {
+      /* If gp and the symbol are in the same output section, then
+	 consider only that section's alignment.
+	 NOTE(review): H is dereferenced without a NULL check; presumably
+	 a non-zero GP implies this lookup succeeds -- confirm against
+	 riscv_global_pointer_value.  */
+      struct bfd_link_hash_entry *h =
+	bfd_link_hash_lookup (link_info->hash, RISCV_GP_SYMBOL, FALSE, FALSE,
+			      TRUE);
+      if (h->u.def.section->output_section == sym_sec->output_section)
+	max_alignment = (bfd_vma) 1 << sym_sec->output_section->alignment_power;
+    }
+
+  /* Is the reference in range of x0 or gp?
+     Valid gp range conservatively because of alignment issue.  */
+  if (VALID_ITYPE_IMM (symval)
+      || (symval >= gp
+	  && VALID_ITYPE_IMM (symval - gp + max_alignment + reserve_size))
+      || (symval < gp
+	  && VALID_ITYPE_IMM (symval - gp - max_alignment - reserve_size)))
+    {
+      unsigned sym = ELF32_R_SYM (rel->r_info);
+      switch (ELF32_R_TYPE (rel->r_info))
+	{
+	case R_RISCV_LO12_I:
+	  rel->r_info = ELF32_R_INFO (sym, R_RISCV_GPREL_I);
+	  return TRUE;
+
+	case R_RISCV_LO12_S:
+	  rel->r_info = ELF32_R_INFO (sym, R_RISCV_GPREL_S);
+	  return TRUE;
+
+	case R_RISCV_HI20:
+	  /* We can delete the unnecessary LUI and reloc.  */
+	  rel->r_info = ELF32_R_INFO (0, R_RISCV_NONE);
+	  *again = TRUE;
+	  return riscv_relax_delete_bytes (abfd, sec, rel->r_offset, 4);
+
+	default:
+	  abort ();
+	}
+    }
+
+  /* Can we relax LUI to C.LUI?  Alignment might move the section forward;
+     account for this assuming page alignment at worst.  */
+  if (use_rvc
+      && ELF32_R_TYPE (rel->r_info) == R_RISCV_HI20
+      && VALID_RVC_LUI_IMM (RISCV_CONST_HIGH_PART (symval))
+      && VALID_RVC_LUI_IMM (RISCV_CONST_HIGH_PART (symval + ELF_MAXPAGESIZE)))
+    {
+      /* Replace LUI with C.LUI if legal (i.e., rd != x2/sp).  */
+      bfd_vma lui = bfd_get_32 (abfd, contents + rel->r_offset);
+      if (((lui >> OP_SH_RD) & OP_MASK_RD) == X_SP)
+	return TRUE;
+
+      /* Keep only the rd field and install the C.LUI opcode; the full
+	 32-bit word is rewritten and its upper two bytes are then
+	 deleted below.  */
+      lui = (lui & (OP_MASK_RD << OP_SH_RD)) | MATCH_C_LUI;
+      bfd_put_32 (abfd, lui, contents + rel->r_offset);
+
+      /* Replace the R_RISCV_HI20 reloc.  */
+      rel->r_info = ELF32_R_INFO (ELF32_R_SYM (rel->r_info), R_RISCV_RVC_LUI);
+
+      *again = TRUE;
+      return riscv_relax_delete_bytes (abfd, sec, rel->r_offset + 2, 2);
+    }
+
+  return TRUE;
+}
+
+/* Relax non-PIC TLS references.
+
+   When the thread-pointer offset of SYMVAL has a zero high part and the
+   symbol is not an import, the TPREL_LO12 relocs are retyped to their
+   short TPREL_I/TPREL_S forms and the instructions carrying TPREL_HI20
+   or TPREL_ADD are deleted (setting *AGAIN).  Any other reloc type
+   aborts.  Returns the result of riscv_relax_delete_bytes when bytes are
+   removed, TRUE otherwise.  */
+
+static bfd_boolean
+_bfd_riscv_relax_tls_le (bfd *abfd,
+			 asection *sec,
+			 asection *sym_sec ATTRIBUTE_UNUSED,
+			 struct bfd_link_info *link_info,
+			 Elf_Internal_Rela *rel,
+			 bfd_vma symval,
+			 bfd_vma max_alignment ATTRIBUTE_UNUSED,
+			 bfd_vma reserve_size ATTRIBUTE_UNUSED,
+			 bfd_boolean is_import,
+			 bfd_boolean *again)
+{
+  int short_type;
+
+  /* Out of reach of tp: leave the sequence alone.  */
+  if (RISCV_CONST_HIGH_PART (tpoff (link_info, symval)) != 0)
+    return TRUE;
+
+  /* Imports are never relaxed here.  */
+  if (is_import)
+    return TRUE;
+
+  BFD_ASSERT (rel->r_offset + 4 <= sec->size);
+
+  switch (ELF32_R_TYPE (rel->r_info))
+    {
+    case R_RISCV_TPREL_HI20:
+    case R_RISCV_TPREL_ADD:
+      /* The instruction and its reloc are no longer needed.  */
+      rel->r_info = ELF32_R_INFO (0, R_RISCV_NONE);
+      *again = TRUE;
+      return riscv_relax_delete_bytes (abfd, sec, rel->r_offset, 4);
+
+    case R_RISCV_TPREL_LO12_I:
+      short_type = R_RISCV_TPREL_I;
+      break;
+
+    case R_RISCV_TPREL_LO12_S:
+      short_type = R_RISCV_TPREL_S;
+      break;
+
+    default:
+      abort ();
+    }
+
+  /* Keep the symbol, switch to the tp-relative short form.  */
+  rel->r_info = ELF32_R_INFO (ELF32_R_SYM (rel->r_info), short_type);
+  return TRUE;
+}
+
+/* Implement R_RISCV_ALIGN by deleting excess alignment NOPs.
+
+   REL's addend is the number of padding bytes present at REL's offset in
+   SEC, and SYMVAL is the reloc address plus that addend.  The padding is
+   trimmed to what is needed to reach the next boundary, rewritten as
+   NOPs, and the reloc is turned into R_RISCV_NONE.  SEC is flagged via
+   sec_flg0 so that it is not relaxed again.  Returns FALSE when the
+   padding present is too small to reach the boundary.  */
+
+static bfd_boolean
+_bfd_riscv_relax_align (bfd *abfd, asection *sec,
+			asection *sym_sec ATTRIBUTE_UNUSED,
+			struct bfd_link_info *link_info ATTRIBUTE_UNUSED,
+			Elf_Internal_Rela *rel,
+			bfd_vma symval,
+			bfd_vma max_alignment ATTRIBUTE_UNUSED,
+			bfd_vma reserve_size ATTRIBUTE_UNUSED,
+			bfd_boolean is_import ATTRIBUTE_UNUSED,
+			bfd_boolean *again ATTRIBUTE_UNUSED)
+{
+  bfd_byte *contents = elf_section_data (sec)->this_hdr.contents;
+  const bfd_vma padding = rel->r_addend;
+  bfd_vma boundary, start, needed, whole_words, off;
+
+  /* Smallest power of two strictly greater than the padding size.  */
+  for (boundary = 1; boundary <= padding; boundary <<= 1)
+    ;
+
+  /* Address where the padding begins, and the bytes required from there
+     to reach the next multiple of BOUNDARY.  */
+  start = symval - padding;
+  needed = (((start - 1) & ~(boundary - 1)) + boundary) - start;
+
+  /* Once we've handled an R_RISCV_ALIGN, we can't relax anything else.  */
+  sec->sec_flg0 = TRUE;
+
+  /* Not enough NOPs were emitted to achieve the alignment.  */
+  if (needed > padding)
+    return FALSE;
+
+  /* The reloc has served its purpose.  */
+  rel->r_info = ELF32_R_INFO (0, R_RISCV_NONE);
+
+  /* Exactly the right amount of padding already.  */
+  if (needed == padding)
+    return TRUE;
+
+  /* Fill with full-size NOPs first...  */
+  whole_words = needed & ~(bfd_vma) 3;
+  for (off = 0; off < whole_words; off += 4)
+    bfd_put_32 (abfd, RISCV_NOP, contents + rel->r_offset + off);
+
+  /* ...then one compressed NOP if a half word remains.  */
+  if ((needed & 3) != 0)
+    bfd_put_16 (abfd, RVC_NOP, contents + rel->r_offset + off);
+
+  /* Drop the padding that is no longer required.  */
+  return riscv_relax_delete_bytes (abfd, sec, rel->r_offset + needed,
+				   padding - needed);
+}
+
+
+/* Convert PC-relative references to imported symbols into absolute ones.
+
+   Nothing happens unless IS_IMPORT is set.  For an import,
+   R_RISCV_PCREL_LO12_I is retyped to R_RISCV_LO12_I, and
+   R_RISCV_PCREL_HI20 has its instruction rewritten as a LUI (keeping rd)
+   and is retyped to R_RISCV_HI20.  No bytes are deleted, so *AGAIN is
+   left exactly as the caller and earlier relaxations set it; it used to
+   be cleared here, which cancelled a pass requested for an earlier reloc
+   of the same section.  Always returns TRUE.  */
+
+static bfd_boolean
+_bfd_riscv_relax_import_pcrel (bfd *abfd, asection *sec,
+                        asection *sym_sec ATTRIBUTE_UNUSED,
+                        struct bfd_link_info *link_info ATTRIBUTE_UNUSED,
+                        Elf_Internal_Rela *rel,
+                        bfd_vma symval ATTRIBUTE_UNUSED,
+			bfd_vma max_alignment ATTRIBUTE_UNUSED,
+			bfd_vma reserve_size ATTRIBUTE_UNUSED,
+                        bfd_boolean is_import,
+                        bfd_boolean *again ATTRIBUTE_UNUSED)
+{
+  bfd_byte *contents = elf_section_data (sec)->this_hdr.contents;
+  unsigned sym;
+
+  /* Only references to imports are rewritten.  */
+  if (!is_import)
+    return TRUE;
+
+  sym = ELF32_R_SYM (rel->r_info);
+  switch (ELF32_R_TYPE (rel->r_info))
+    {
+    case R_RISCV_PCREL_LO12_I:
+      rel->r_info = ELF32_R_INFO (sym, R_RISCV_LO12_I);
+      break;
+
+    case R_RISCV_PCREL_HI20:
+      {
+	/* Turn the instruction into a LUI with the same destination.  */
+	bfd_vma lui = bfd_get_32 (abfd, contents + rel->r_offset);
+	lui = (lui & (OP_MASK_RD << OP_SH_RD)) | MATCH_LUI;
+	bfd_put_32 (abfd, lui, contents + rel->r_offset);
+	rel->r_info = ELF32_R_INFO (sym, R_RISCV_HI20);
+
+	/* PCREL_HI20 is assumed to be always followed by the reloc on
+	   the low part of the symbol; that reloc is forced to the
+	   pseudo-absolute form against the same symbol.
+	   NOTE(review): _bfd_riscv_relax_section only dispatches here
+	   when REL + 1 is an R_RISCV_RELAX at the same offset, so this
+	   appears to overwrite that marker instead of the low-part
+	   reloc -- confirm which entry is meant.  */
+	(rel + 1)->r_info = ELF32_R_INFO (sym, R_RISCV_LO12_I);
+      }
+      break;
+    }
+
+  return TRUE;
+}
+
+/* Turn a GOT-indirect address load into a direct PC-relative one.
+
+   REL is retyped to R_RISCV_PCREL_HI20.  The instruction addressed by
+   the reloc entry that follows REL is rewritten as an ADDI keeping its
+   rd and rs1 fields, and that entry is retyped to R_RISCV_PCREL_LO12_I
+   against its own symbol.  Always returns TRUE.
+
+   NOTE(review): the entry after REL is used without checking its type;
+   confirm this agrees with what the caller guarantees about REL + 1.  */
+
+static bfd_boolean
+_bfd_riscv_relax_got_ref (bfd *abfd, asection *sec,
+                        asection *sym_sec ATTRIBUTE_UNUSED,
+                        struct bfd_link_info *link_info ATTRIBUTE_UNUSED,
+                        Elf_Internal_Rela *rel,
+                        bfd_vma symval ATTRIBUTE_UNUSED,
+                        bfd_vma max_alignment ATTRIBUTE_UNUSED,
+                        bfd_vma reserve_size ATTRIBUTE_UNUSED,
+                        bfd_boolean is_import ATTRIBUTE_UNUSED,
+                        bfd_boolean *again ATTRIBUTE_UNUSED)
+{
+  bfd_byte *const base = elf_section_data (sec)->this_hdr.contents;
+  Elf_Internal_Rela *const lo_rel = rel + 1;
+  unsigned hi_sym = ELF32_R_SYM (rel->r_info);
+  unsigned lo_sym;
+  bfd_vma insn;
+
+  /* High part: plain PC-relative instead of GOT-relative.  */
+  rel->r_info = ELF32_R_INFO (hi_sym, R_RISCV_PCREL_HI20);
+
+  /* Low part: replace the usual GOT load with an ADDI that keeps the
+     original destination and source registers.  */
+  insn = bfd_get_32 (abfd, base + lo_rel->r_offset);
+  insn &= (OP_MASK_RD << OP_SH_RD) | (OP_MASK_RS1 << OP_SH_RS1);
+  insn |= MATCH_ADDI;
+  bfd_put_32 (abfd, insn, base + lo_rel->r_offset);
+
+  lo_sym = ELF32_R_SYM (lo_rel->r_info);
+  lo_rel->r_info = ELF32_R_INFO (lo_sym, R_RISCV_PCREL_LO12_I);
+
+  return TRUE;
+}
+
+/* Relax a section.  Pass 0 shortens code sequences unless disabled.
+   Pass 1, which cannot be disabled, handles code alignment directives.
+
+   Walks every reloc of SEC in ABFD, selects the relaxation routine that
+   matches the reloc type and INFO->relax_pass, resolves the referenced
+   symbol to an address and hands both to that routine.  *AGAIN is
+   cleared on entry; the routines set it when they delete bytes.
+   Returns FALSE if the relocs, contents or symbols cannot be read or a
+   relaxation routine fails, TRUE otherwise.  */
+
+static bfd_boolean
+_bfd_riscv_relax_section (bfd *abfd, asection *sec,
+			  struct bfd_link_info *info,
+			  bfd_boolean *again)
+{
+  Elf_Internal_Shdr *symtab_hdr = &elf_symtab_hdr (abfd);
+  struct riscv_elf_link_hash_table *htab = riscv_elf_hash_table (info);
+  struct bfd_elf_section_data *data = elf_section_data (sec);
+  Elf_Internal_Rela *relocs;
+  bfd_boolean ret = FALSE;
+  unsigned int i;
+  bfd_vma max_alignment, reserve_size = 0;
+
+  *again = FALSE;
+
+  /* Nothing to do for relocatable links, sections already finalized by
+     an alignment pass (sec_flg0), sections without relocs, or pass 0
+     when target-specific optimizations are disabled.  */
+  if (bfd_link_relocatable (info)
+      || sec->sec_flg0
+      || (sec->flags & SEC_RELOC) == 0
+      || sec->reloc_count == 0
+      || (info->disable_target_specific_optimizations
+	  && info->relax_pass == 0))
+    return TRUE;
+
+  /* Read this BFD's relocs if we haven't done so already.  */
+  if (data->relocs)
+    relocs = data->relocs;
+  else if (!(relocs = _bfd_elf_link_read_relocs (abfd, sec, NULL, NULL,
+						 info->keep_memory)))
+    goto fail;
+
+  max_alignment = _bfd_riscv_get_max_alignment (sec);
+
+  /* Examine and consider relaxing each reloc.  */
+  for (i = 0; i < sec->reloc_count; i++)
+    {
+      asection *sym_sec;
+      Elf_Internal_Rela *rel = relocs + i;
+      relax_func_t relax_func;
+      int type = ELF32_R_TYPE (rel->r_info);
+      bfd_vma symval;
+      bfd_boolean Is_Import = FALSE;
+
+      if (info->relax_pass == 0)
+	{
+	  if (type == R_RISCV_CALL || type == R_RISCV_CALL_PLT)
+	    relax_func = _bfd_riscv_relax_call;
+	  else if (type == R_RISCV_HI20
+		   || type == R_RISCV_LO12_I
+		   || type == R_RISCV_LO12_S)
+	    relax_func = _bfd_riscv_relax_lui;
+	  else if (type == R_RISCV_TPREL_HI20
+		   || type == R_RISCV_TPREL_ADD
+		   || type == R_RISCV_TPREL_LO12_I
+		   || type == R_RISCV_TPREL_LO12_S)
+	    relax_func = _bfd_riscv_relax_tls_le;
+	  else if (type == R_RISCV_PCREL_HI20 || type == R_RISCV_PCREL_LO12_I)
+	    relax_func = _bfd_riscv_relax_import_pcrel;
+	  else if (ComponentMode && (type == R_RISCV_GOT_HI20))
+	    relax_func = _bfd_riscv_relax_got_ref;
+	  else
+	    continue;
+
+	  /* Only relax this reloc if it is paired with R_RISCV_RELAX.  */
+	  if (i == sec->reloc_count - 1
+	      || ELF32_R_TYPE ((rel + 1)->r_info) != R_RISCV_RELAX
+	      || rel->r_offset != (rel + 1)->r_offset)
+	    continue;
+
+	  /* Skip over the R_RISCV_RELAX.  */
+	  i++;
+	}
+      else if (type == R_RISCV_ALIGN)
+	relax_func = _bfd_riscv_relax_align;
+      else
+	continue;
+
+      /* Cache the relocs on the section: the relaxation routines and
+	 riscv_relax_delete_bytes work on data->relocs.  */
+      data->relocs = relocs;
+
+      /* Read this BFD's contents if we haven't done so already.  */
+      if (!data->this_hdr.contents
+	  && !bfd_malloc_and_get_section (abfd, sec, &data->this_hdr.contents))
+	goto fail;
+
+      /* Read this BFD's symbols if we haven't done so already.  */
+      if (symtab_hdr->sh_info != 0
+	  && !symtab_hdr->contents
+	  && !(symtab_hdr->contents =
+	       (unsigned char *) bfd_elf_get_elf_syms (abfd, symtab_hdr,
+						       symtab_hdr->sh_info,
+						       0, NULL, NULL, NULL)))
+	goto fail;
+
+      /* Get the value of the symbol referred to by the reloc.  */
+      if (ELF32_R_SYM (rel->r_info) < symtab_hdr->sh_info)
+	{
+	  /* A local symbol.  */
+	  Elf_Internal_Sym *isym = ((Elf_Internal_Sym *) symtab_hdr->contents
+				    + ELF32_R_SYM (rel->r_info));
+	  /* Bytes of the object lying past the referenced offset; zero
+	     when the addend exceeds the symbol size (the unsigned
+	     subtraction would wrap).  */
+	  reserve_size = (isym->st_size - rel->r_addend) > isym->st_size
+	    ? 0 : isym->st_size - rel->r_addend;
+
+	  if (isym->st_shndx == SHN_UNDEF)
+	    sym_sec = sec, symval = sec_addr (sec) + rel->r_offset;
+	  else
+	    {
+	      BFD_ASSERT (isym->st_shndx < elf_numsections (abfd));
+	      sym_sec = elf_elfsections (abfd)[isym->st_shndx]->bfd_section;
+	      if (sec_addr (sym_sec) == 0)
+		continue;
+	      symval = sec_addr (sym_sec) + isym->st_value;
+	    }
+	}
+      else
+	{
+	  unsigned long indx;
+	  struct elf_link_hash_entry *h;
+
+	  indx = ELF32_R_SYM (rel->r_info) - symtab_hdr->sh_info;
+	  h = elf_sym_hashes (abfd)[indx];
+
+	  while (h->root.type == bfd_link_hash_indirect
+		 || h->root.type == bfd_link_hash_warning)
+	    h = (struct elf_link_hash_entry *) h->root.u.i.link;
+
+	  /* The hash type is tested BEFORE u.def.section is touched: for
+	     symbols that are neither defined nor defweak the `def' union
+	     member is not the active one and must not be dereferenced.  */
+	  if (h->plt.offset != MINUS_ONE)
+	    symval = sec_addr (htab->elf.splt) + h->plt.offset;
+	  else if ((h->root.type != bfd_link_hash_defined
+		    && h->root.type != bfd_link_hash_defweak)
+		   || h->root.u.def.section->output_section == NULL)
+	    continue;
+	  else
+	    symval = sec_addr (h->root.u.def.section) + h->root.u.def.value;
+
+	  if (h->type != STT_FUNC)
+	    reserve_size =
+	      (h->size - rel->r_addend) > h->size ? 0 : h->size - rel->r_addend;
+	  /* NOTE(review): on the PLT path above the symbol is not known
+	     to be defined, yet u.def.section is still read here --
+	     confirm whether htab->elf.splt should be used instead.  */
+	  sym_sec = h->root.u.def.section;
+	  /* A weak definition counts as an import unless SEC is named
+	     "pulp.import" (strcmp is non-zero when the names differ).
+	     NOTE(review): confirm the section name and the polarity of
+	     this test are what is intended.  */
+	  if (h->root.type == bfd_link_hash_defweak
+	      && strcmp (sec->name, "pulp.import"))
+	    Is_Import = TRUE;
+	}
+
+      symval += rel->r_addend;
+
+      if (!relax_func (abfd, sec, sym_sec, info, rel, symval,
+		       max_alignment, reserve_size, Is_Import, again))
+	goto fail;
+    }
+
+  ret = TRUE;
+
+fail:
+  /* Free the relocs only if they were read here and never cached on the
+     section.  */
+  if (relocs != data->relocs)
+    free (relocs);
+
+  return ret;
+}
+
+/* Sizes and field offsets of the prstatus and prpsinfo core-note
+   payloads, used by riscv_elf_grok_prstatus and riscv_elf_grok_psinfo.
+   The values depend on ARCH_SIZE.  For ARCH_SIZE == 32 PRSTATUS_SIZE is
+   still a placeholder (see FIXME), so only prstatus notes whose descsz
+   is 0 are accepted there.  */
+#if ARCH_SIZE == 32
+# define PRSTATUS_SIZE			0 /* FIXME */
+# define PRSTATUS_OFFSET_PR_CURSIG	12
+# define PRSTATUS_OFFSET_PR_PID		24
+# define PRSTATUS_OFFSET_PR_REG		72
+# define ELF_GREGSET_T_SIZE		128
+# define PRPSINFO_SIZE			128
+# define PRPSINFO_OFFSET_PR_PID		16
+# define PRPSINFO_OFFSET_PR_FNAME	32
+# define PRPSINFO_OFFSET_PR_PSARGS	48
+#else
+# define PRSTATUS_SIZE			376
+# define PRSTATUS_OFFSET_PR_CURSIG	12
+# define PRSTATUS_OFFSET_PR_PID		32
+# define PRSTATUS_OFFSET_PR_REG		112
+# define ELF_GREGSET_T_SIZE		256
+# define PRPSINFO_SIZE			136
+# define PRPSINFO_OFFSET_PR_PID		24
+# define PRPSINFO_OFFSET_PR_FNAME	40
+# define PRPSINFO_OFFSET_PR_PSARGS	56
+#endif
+
+/* Support for core dump NOTE sections.  */
+
+/* Decode a prstatus NOTE of ABFD: record the current signal and the LWP
+   id in the core data and expose the register block as a ".reg"
+   pseudo-section.  Returns FALSE when the note size is not
+   PRSTATUS_SIZE, otherwise the result of creating the pseudo-section.  */
+
+static bfd_boolean
+riscv_elf_grok_prstatus (bfd *abfd, Elf_Internal_Note *note)
+{
+  /* sizeof(struct elf_prstatus) on Linux/RISC-V is the only layout
+     understood.  */
+  if (note->descsz != PRSTATUS_SIZE)
+    return FALSE;
+
+  /* pr_cursig */
+  elf_tdata (abfd)->core->signal
+    = bfd_get_16 (abfd, note->descdata + PRSTATUS_OFFSET_PR_CURSIG);
+
+  /* pr_pid */
+  elf_tdata (abfd)->core->lwpid
+    = bfd_get_32 (abfd, note->descdata + PRSTATUS_OFFSET_PR_PID);
+
+  /* Make a ".reg/999" section.  */
+  return _bfd_elfcore_make_pseudosection (abfd, ".reg", ELF_GREGSET_T_SIZE,
+					  note->descpos + PRSTATUS_OFFSET_PR_REG);
+}
+
+/* Decode a prpsinfo NOTE of ABFD: record the process id, the program
+   name (at most 16 bytes) and the command line (at most 80 bytes) in the
+   core data, stripping one trailing space from the command line.
+   Returns FALSE when the note size is not PRPSINFO_SIZE, TRUE
+   otherwise.  */
+
+static bfd_boolean
+riscv_elf_grok_psinfo (bfd *abfd, Elf_Internal_Note *note)
+{
+  switch (note->descsz)
+    {
+      default:
+	return FALSE;
+
+      case PRPSINFO_SIZE: /* sizeof(struct elf_prpsinfo) on Linux/RISC-V.  */
+	/* pr_pid */
+	elf_tdata (abfd)->core->pid
+	  = bfd_get_32 (abfd, note->descdata + PRPSINFO_OFFSET_PR_PID);
+
+	/* pr_fname */
+	elf_tdata (abfd)->core->program = _bfd_elfcore_strndup
+	  (abfd, note->descdata + PRPSINFO_OFFSET_PR_FNAME, 16);
+
+	/* pr_psargs */
+	elf_tdata (abfd)->core->command = _bfd_elfcore_strndup
+	  (abfd, note->descdata + PRPSINFO_OFFSET_PR_PSARGS, 80);
+	break;
+    }
+
+  /* Note that for some reason, a spurious space is tacked
+     onto the end of the args in some (at least one anyway)
+     implementations, so strip it off if it exists.  The NULL test
+     guards against a failed duplication above, presumably an allocation
+     failure, which previously crashed in strlen.  */
+
+  {
+    char *command = elf_tdata (abfd)->core->command;
+
+    if (command != NULL)
+      {
+	size_t n = strlen (command);
+
+	if (0 < n && command[n - 1] == ' ')
+	  command[n - 1] = '\0';
+      }
+  }
+
+  return TRUE;
+}
+
+/* Set the right mach type: riscv32 when ABFD's target vector is named
+   "elf32-littleriscv", riscv64 for anything else.  Always returns TRUE.  */
+static bfd_boolean
+riscv_elf_object_p (bfd *abfd)
+{
+  /* There are only two mach types in RISCV currently.  */
+  const int is_rv32 = strcmp (abfd->xvec->name, "elf32-littleriscv") == 0;
+
+  bfd_default_set_arch_mach (abfd, bfd_arch_riscv,
+			     is_rv32 ? bfd_mach_riscv32 : bfd_mach_riscv64);
+  return TRUE;
+}
+
+/* Final-link hook: run the generic ELF final link on ABFD, then fill the
+   PULP import and export sections of the output.
+
+   The import name and reloc tables are built first.  When there are
+   imports they are stored in .pulp.import.names and .pulp.import.relocs
+   (both sections must exist); outside component mode the relocs are
+   first adjusted by the load address of .text.  Without imports each of
+   the two sections, if present, receives a single zero word.  The
+   export table, when non-empty, is stored in .pulp.export.
+
+   DumpImportExportSections selects debug output consistently for both
+   tables: 1 disassembles them, 2 dumps them as C arrays, 3 does both.
+
+   Returns FALSE if the generic link fails, if a table cannot be built,
+   or if a required section is missing or cannot be written.  Component
+   mode consistency problems and a missing .text section are reported
+   through _bfd_error_handler without changing the return value.  */
+bfd_boolean
+_bfd_riscv_elf32_final_link (bfd *abfd, struct bfd_link_info *info)
+{
+	struct bfd_section *s;
+	unsigned int SecNameSize, SecRelocSize, NImport=0, ExportSize;
+	unsigned int *NameSection, *RelocSection, *ExportSection;
+	/* Compile-time switch for progress messages.  */
+	static int Trace = 0;
+
+	if (!bfd_elf_final_link (abfd, info)) return FALSE;
+
+	if (PulpImportCreateNameAndRelocSections(0,
+						 &NameSection,  &SecNameSize,
+						 &RelocSection, &SecRelocSize, &NImport) == FALSE) {
+		(*_bfd_error_handler)(_("Failed to create Import sections"));
+		return FALSE;
+	}
+	if (Trace) printf("NImport: %d, SecNameSize: %d, SecRelocSize: %d\n", NImport, SecNameSize, SecRelocSize);
+
+	if (NImport) {
+		struct bfd_section *TextSec = NULL;
+		unsigned int BaseText = 0;
+
+		if (ComponentMode == 0) {
+			/* Outside component mode the import relocs are adjusted
+			   by the load address of .text.  */
+			TextSec = bfd_get_section_by_name (info->output_bfd, ".text");
+			if (TextSec) {
+				BaseText = (unsigned int) TextSec->lma;
+				AdjustRelocsImport(RelocSection, BaseText);
+			} else {
+				(*_bfd_error_handler)(_("Failed to find .text section in output_bfd"));
+			}
+		}
+
+		if (DumpImportExportSections==1 || DumpImportExportSections==3)
+			DiassembleImports(NameSection, SecNameSize, RelocSection, SecRelocSize, BaseText);
+		if (DumpImportExportSections==2 || DumpImportExportSections==3) {
+			DumpCEquiv((unsigned int *) NameSection, SecNameSize, 4, ComponentMode?"CompImportNames":"ResiImportNames");
+			DumpCEquiv(RelocSection, SecRelocSize, 4, ComponentMode?"CompImportRelocs":"ResiImportRelocs");
+		}
+
+		s = bfd_get_section_by_name (abfd, ".pulp.import.names");
+		if (s) {
+			s->contents = xmalloc(SecNameSize);
+			s->size = SecNameSize;
+			if (! bfd_set_section_contents (abfd, s, (char *) NameSection, 0, SecNameSize)) {
+				(*_bfd_error_handler)(_(".pulp.import.names: Failed to set content"));
+				return FALSE;
+			} else if (Trace) {
+				fprintf(stderr, ".pulp.import.names: Set content OK\n");
+			}
+		} else {
+			(*_bfd_error_handler)(_("Can't find .pulp.import.names"));
+			return FALSE;
+		}
+		s = bfd_get_section_by_name (abfd, ".pulp.import.relocs");
+		if (s) {
+			s->contents = xmalloc(SecRelocSize);
+			s->size = SecRelocSize;
+			if (! bfd_set_section_contents (abfd, s, (char *) RelocSection, 0, SecRelocSize)) {
+				(*_bfd_error_handler)(_(".pulp.import.relocs: Failed to set content"));
+				return FALSE;
+			} else if (Trace) {
+				fprintf(stderr, ".pulp.import.relocs: Set content OK\n");
+			}
+		} else {
+			(*_bfd_error_handler)(_("Can't find .pulp.import.relocs"));
+			return FALSE;
+		}
+		(void) ReleaseImportEntry();
+	} else {
+		/* No imports: each section holds only its 4-byte descriptor,
+		   and that descriptor is 0.  A missing section is not an
+		   error in this case.  */
+		s = bfd_get_section_by_name (abfd, ".pulp.import.names");
+		if (s) {
+			unsigned int Empty = 0;
+			SecNameSize = 4;
+			s->contents = xmalloc(SecNameSize);
+			s->size = SecNameSize;
+			if (! bfd_set_section_contents (abfd, s, (char *) &Empty, 0, SecNameSize)) {
+				(*_bfd_error_handler)(_(".pulp.import.names: Failed to set content"));
+				return FALSE;
+			} else if (Trace) {
+				fprintf(stderr, ".pulp.import.names: Set content OK\n");
+			}
+		}
+		s = bfd_get_section_by_name (abfd, ".pulp.import.relocs");
+		if (s) {
+			unsigned int Empty = 0;
+			SecRelocSize = 4;
+			s->contents = xmalloc(SecRelocSize);
+			s->size = SecRelocSize;
+			if (! bfd_set_section_contents (abfd, s, (char *) &Empty, 0, SecRelocSize)) {
+				(*_bfd_error_handler)(_(".pulp.import.relocs: Failed to set content"));
+				return FALSE;
+			} else if (Trace) {
+				fprintf(stderr, ".pulp.import.relocs: Set content OK\n");
+			}
+		}
+	}
+
+	if (ComponentMode) {
+		/* Reported only; the link still goes on.  */
+		if (ComponentEntryProvided == FALSE)
+			(*_bfd_error_handler)(_("No Entry provided for Component"));
+		else if (ExportLookup(ComponentEntry.name) == FALSE)
+			 (*_bfd_error_handler)(_("Component provided entry: %s not found in component export list"), ComponentEntry.name);
+	}
+	if (PulpExportCreateSection(&ExportSection, &ExportSize, info) == FALSE) {
+		(*_bfd_error_handler)(_("Failed to create Export Section"));
+		return FALSE;
+	} else if (ExportSection) {
+		if (DumpImportExportSections==1 || DumpImportExportSections==3)
+			DiassembleExports(ExportSection, ExportSize);
+		/* Same selector values as for the import dump above (2 or 3);
+		   this test used to read 1 or 2, so mode 3 skipped the C dump
+		   of the exports while mode 1 produced it.  */
+		if (DumpImportExportSections==2 || DumpImportExportSections==3) {
+			DumpCEquiv(ExportSection, ExportSize, 4, ComponentMode?"CompExports":"ResiExports");
+			if (ComponentMode) {
+				struct bfd_section *CompSec = NULL;
+				CompSec = bfd_get_section_by_name (info->output_bfd, ".component.body");
+				if (CompSec) {
+					long Size = CompSec->size;
+					char *Buffer = xmalloc (Size);
+					/* Dump the body only if it could actually be
+					   read; otherwise Buffer is uninitialized.  */
+					if (bfd_get_section_contents (info->output_bfd, CompSec, Buffer, 0, Size))
+						DumpCEquiv((unsigned int *) Buffer, Size, 1, "ComponentBody");
+					free(Buffer);
+				}
+			}
+		}
+		s = bfd_get_section_by_name (abfd, ".pulp.export");
+		if (s) {
+			s->contents = xmalloc(ExportSize);
+			s->size = ExportSize;
+			if (! bfd_set_section_contents (abfd, s, (char *) ExportSection, 0, ExportSize)) {
+				(*_bfd_error_handler)(_(".pulp.export: Failed to set content"));
+				return FALSE;
+			} else if (Trace) {
+				fprintf(stderr, ".pulp.export: Set content OK\n");
+			}
+		} else {
+			(*_bfd_error_handler)(_("Can't find .pulp.export"));
+			return FALSE;
+		}
+		(void) ReleaseExportEntry();
+	} else if (ComponentMode) {
+		/* We should have at least one export to be able to enter the component.  */
+		(*_bfd_error_handler)(_("Component has empty export section"));
+	}
+
+	return TRUE;
+}
+/* Target vector configuration for the 32-bit little-endian RISC-V ELF
+   target.  The macros below name the target and bind this file's hook
+   implementations and backend properties; they must be defined before
+   the elf32-target.h include at the end.  */
+#define TARGET_LITTLE_SYM		riscv_elf32_vec
+#define TARGET_LITTLE_NAME		"elf32-littleriscv"
+
+#define elf_backend_reloc_type_class	     riscv_reloc_type_class
+
+#define bfd_elf32_bfd_reloc_name_lookup	     riscv_reloc_name_lookup
+#define bfd_elf32_bfd_link_hash_table_create riscv_elf_link_hash_table_create
+#define bfd_elf32_bfd_reloc_type_lookup	     riscv_reloc_type_lookup
+#define bfd_elf32_bfd_merge_private_bfd_data \
+  _bfd_riscv_elf_merge_private_bfd_data
+
+#define elf_backend_copy_indirect_symbol     riscv_elf_copy_indirect_symbol
+#define elf_backend_create_dynamic_sections  riscv_elf_create_dynamic_sections
+#define elf_backend_check_relocs	     riscv_elf_check_relocs
+#define elf_backend_adjust_dynamic_symbol    riscv_elf_adjust_dynamic_symbol
+#define elf_backend_size_dynamic_sections    riscv_elf_size_dynamic_sections
+#define elf_backend_relocate_section	     riscv_elf_relocate_section
+#define elf_backend_finish_dynamic_symbol    riscv_elf_finish_dynamic_symbol
+#define elf_backend_finish_dynamic_sections  riscv_elf_finish_dynamic_sections
+#define elf_backend_gc_mark_hook	     riscv_elf_gc_mark_hook
+#define elf_backend_gc_sweep_hook	     riscv_elf_gc_sweep_hook
+#define elf_backend_plt_sym_val		     riscv_elf_plt_sym_val
+#define elf_backend_grok_prstatus            riscv_elf_grok_prstatus
+#define elf_backend_grok_psinfo              riscv_elf_grok_psinfo
+#define elf_backend_object_p                 riscv_elf_object_p
+#define elf_info_to_howto_rel		     NULL
+#define elf_info_to_howto		     riscv_info_to_howto_rela
+#define bfd_elf32_bfd_relax_section	     _bfd_riscv_relax_section
+
+#define elf_backend_init_index_section	     _bfd_elf_init_1_index_section
+/* PULP-specific final link that also emits the import/export sections.  */
+#define bfd_elf32_bfd_final_link             _bfd_riscv_elf32_final_link
+
+#define elf_backend_can_gc_sections	1
+#define elf_backend_can_refcount	1
+#define elf_backend_want_got_plt	1
+#define elf_backend_plt_readonly	1
+#define elf_backend_plt_alignment	4
+#define elf_backend_want_plt_sym	1
+#define elf_backend_got_header_size	(ARCH_SIZE / 8)
+#define elf_backend_want_dynrelro	1
+#define elf_backend_rela_normal		1
+#define elf_backend_default_execstack	0
+
+#include "elf32-target.h"
diff --git a/utils/gapy/gen-debug-info-src/ext/bfd/elf32-target.h b/utils/gapy/gen-debug-info-src/ext/bfd/elf32-target.h
new file mode 100644
index 000000000..4a30cfe43
--- /dev/null
+++ b/utils/gapy/gen-debug-info-src/ext/bfd/elf32-target.h
@@ -0,0 +1,1081 @@
+/* Target definitions for 32-bit ELF
+   Copyright (C) 1993-2017 Free Software Foundation, Inc.
+
+   This file is part of BFD, the Binary File Descriptor library.
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston,
+   MA 02110-1301, USA.  */
+
+
+/* This structure contains everything that BFD knows about a target.
+   It includes things like its byte order, name, what routines to call
+   to do various operations, etc.  Every BFD points to a target structure
+   with its "xvec" member.
+
+   There are two such structures here:  one for big-endian machines and
+   one for little-endian machines.   */
+
+#ifndef bfd_elf32_close_and_cleanup
+#define	bfd_elf32_close_and_cleanup _bfd_elf_close_and_cleanup
+#endif
+#ifndef bfd_elf32_bfd_free_cached_info
+#define bfd_elf32_bfd_free_cached_info _bfd_free_cached_info
+#endif
+#ifndef bfd_elf32_get_section_contents
+#define bfd_elf32_get_section_contents _bfd_generic_get_section_contents
+#endif
+/* Names wrapped in #ifndef may be pre-defined by the including file; the others are set unconditionally.  */
+#define bfd_elf32_canonicalize_dynamic_symtab \
+  _bfd_elf_canonicalize_dynamic_symtab
+#ifndef bfd_elf32_get_synthetic_symtab
+#define bfd_elf32_get_synthetic_symtab \
+  _bfd_elf_get_synthetic_symtab
+#endif
+#ifndef bfd_elf32_canonicalize_reloc
+#define bfd_elf32_canonicalize_reloc	_bfd_elf_canonicalize_reloc
+#endif
+#ifndef bfd_elf32_find_nearest_line
+#define bfd_elf32_find_nearest_line	_bfd_elf_find_nearest_line
+#endif
+#ifndef bfd_elf32_find_line
+#define bfd_elf32_find_line		_bfd_elf_find_line
+#endif
+#ifndef bfd_elf32_find_inliner_info
+#define bfd_elf32_find_inliner_info	_bfd_elf_find_inliner_info
+#endif
+#define bfd_elf32_read_minisymbols	_bfd_elf_read_minisymbols
+#define bfd_elf32_minisymbol_to_symbol	_bfd_elf_minisymbol_to_symbol
+#define bfd_elf32_get_dynamic_symtab_upper_bound \
+  _bfd_elf_get_dynamic_symtab_upper_bound
+#define bfd_elf32_get_lineno		_bfd_elf_get_lineno
+#ifndef bfd_elf32_get_reloc_upper_bound
+#define bfd_elf32_get_reloc_upper_bound _bfd_elf_get_reloc_upper_bound
+#endif
+#ifndef bfd_elf32_get_symbol_info
+#define bfd_elf32_get_symbol_info	_bfd_elf_get_symbol_info
+#endif
+#ifndef bfd_elf32_get_symbol_version_string
+#define bfd_elf32_get_symbol_version_string \
+  _bfd_elf_get_symbol_version_string
+#endif
+#define bfd_elf32_canonicalize_symtab	_bfd_elf_canonicalize_symtab
+#define bfd_elf32_get_symtab_upper_bound _bfd_elf_get_symtab_upper_bound
+#define bfd_elf32_make_empty_symbol	_bfd_elf_make_empty_symbol
+#ifndef bfd_elf32_new_section_hook
+#define bfd_elf32_new_section_hook	_bfd_elf_new_section_hook
+#endif
+#define bfd_elf32_set_arch_mach		_bfd_elf_set_arch_mach
+#ifndef bfd_elf32_set_section_contents
+#define bfd_elf32_set_section_contents	_bfd_elf_set_section_contents
+#endif
+#define bfd_elf32_sizeof_headers	_bfd_elf_sizeof_headers
+#define bfd_elf32_write_object_contents _bfd_elf_write_object_contents
+#define bfd_elf32_write_corefile_contents _bfd_elf_write_corefile_contents
+/* Windowed section reads always use the generic routine (no #ifndef guard).  */
+#define bfd_elf32_get_section_contents_in_window \
+  _bfd_generic_get_section_contents_in_window
+
+#ifndef elf_backend_can_refcount /* Flag and size defaults used when the including file leaves a name undefined.  */
+#define elf_backend_can_refcount 0
+#endif /* The same guard is repeated further down, where it can no longer fire.  */
+#ifndef elf_backend_want_got_plt
+#define elf_backend_want_got_plt 0
+#endif
+#ifndef elf_backend_plt_readonly
+#define elf_backend_plt_readonly 0
+#endif
+#ifndef elf_backend_want_plt_sym
+#define elf_backend_want_plt_sym 0
+#endif
+#ifndef elf_backend_plt_not_loaded
+#define elf_backend_plt_not_loaded 0
+#endif
+#ifndef elf_backend_plt_alignment
+#define elf_backend_plt_alignment 2
+#endif
+#ifndef elf_backend_want_dynbss
+#define elf_backend_want_dynbss 1 /* Unlike most flags in this group, defaults to 1.  */
+#endif
+#ifndef elf_backend_want_dynrelro
+#define elf_backend_want_dynrelro 0
+#endif
+#ifndef elf_backend_want_p_paddr_set_to_zero
+#define elf_backend_want_p_paddr_set_to_zero 0
+#endif
+#ifndef elf_backend_no_page_alias
+#define elf_backend_no_page_alias 0
+#endif
+#ifndef elf_backend_default_execstack
+#define elf_backend_default_execstack 1 /* Also defaults to 1; a backend must define it as 0 to change that.  */
+#endif
+#ifndef elf_backend_caches_rawsize
+#define elf_backend_caches_rawsize 0
+#endif
+#ifndef elf_backend_extern_protected_data
+#define elf_backend_extern_protected_data 0
+#endif
+#ifndef elf_backend_always_renumber_dynsyms
+#define elf_backend_always_renumber_dynsyms FALSE
+#endif
+#ifndef elf_backend_stack_align
+#define elf_backend_stack_align 16
+#endif
+#ifndef elf_backend_strtab_flags
+#define elf_backend_strtab_flags 0
+#endif
+
+#define bfd_elf32_bfd_debug_info_start	bfd_void
+#define bfd_elf32_bfd_debug_info_end	bfd_void
+#define bfd_elf32_bfd_debug_info_accumulate \
+  ((void (*) (bfd*, struct bfd_section *)) bfd_void)
+/* All three debug_info entries above map to bfd_void; the last is cast to the two-argument signature.  */
+#ifndef bfd_elf32_bfd_get_relocated_section_contents
+#define bfd_elf32_bfd_get_relocated_section_contents \
+  bfd_generic_get_relocated_section_contents
+#endif
+
+#ifndef bfd_elf32_bfd_relax_section
+#define bfd_elf32_bfd_relax_section bfd_generic_relax_section
+#endif
+/* Section garbage-collection flags and hooks.  */
+#ifndef elf_backend_can_gc_sections
+#define elf_backend_can_gc_sections 0
+#endif
+#ifndef elf_backend_can_refcount /* Already given a default earlier in this header, so this guard never fires.  */
+#define elf_backend_can_refcount 0
+#endif
+#ifndef elf_backend_want_got_sym
+#define elf_backend_want_got_sym 1
+#endif
+#ifndef elf_backend_gc_keep
+#define elf_backend_gc_keep		_bfd_elf_gc_keep
+#endif
+#ifndef elf_backend_gc_mark_dynamic_ref
+#define elf_backend_gc_mark_dynamic_ref	bfd_elf_gc_mark_dynamic_ref_symbol
+#endif
+#ifndef elf_backend_gc_mark_hook
+#define elf_backend_gc_mark_hook	_bfd_elf_gc_mark_hook
+#endif
+#ifndef elf_backend_gc_mark_extra_sections
+#define elf_backend_gc_mark_extra_sections _bfd_elf_gc_mark_extra_sections
+#endif
+#ifndef elf_backend_gc_sweep_hook
+#define elf_backend_gc_sweep_hook	NULL
+#endif
+#ifndef bfd_elf32_bfd_gc_sections
+#define bfd_elf32_bfd_gc_sections bfd_elf_gc_sections
+#endif
+
+#ifndef bfd_elf32_bfd_merge_sections
+#define bfd_elf32_bfd_merge_sections \
+  _bfd_elf_merge_sections
+#endif
+
+#ifndef bfd_elf32_bfd_is_group_section
+#define bfd_elf32_bfd_is_group_section bfd_elf_is_group_section
+#endif
+
+#ifndef bfd_elf32_bfd_discard_group
+#define bfd_elf32_bfd_discard_group bfd_generic_discard_group
+#endif
+
+#ifndef bfd_elf32_section_already_linked
+#define bfd_elf32_section_already_linked \
+  _bfd_elf_section_already_linked
+#endif
+
+#ifndef bfd_elf32_bfd_define_common_symbol
+#define bfd_elf32_bfd_define_common_symbol bfd_generic_define_common_symbol
+#endif
+
+#ifndef bfd_elf32_bfd_lookup_section_flags
+#define bfd_elf32_bfd_lookup_section_flags bfd_elf_lookup_section_flags
+#endif
+
+#ifndef bfd_elf32_bfd_make_debug_symbol
+#define bfd_elf32_bfd_make_debug_symbol \
+  ((asymbol * (*) (bfd *, void *, unsigned long)) bfd_nullvoidptr)
+#endif
+/* Private-data copy, print, merge and flag hooks, then label/symbol predicates.  */
+#ifndef bfd_elf32_bfd_copy_private_symbol_data
+#define bfd_elf32_bfd_copy_private_symbol_data \
+  _bfd_elf_copy_private_symbol_data
+#endif
+
+#ifndef bfd_elf32_bfd_copy_private_section_data
+#define bfd_elf32_bfd_copy_private_section_data \
+  _bfd_elf_copy_private_section_data
+#endif
+#ifndef bfd_elf32_bfd_copy_private_header_data
+#define bfd_elf32_bfd_copy_private_header_data \
+  _bfd_elf_copy_private_header_data
+#endif
+#ifndef bfd_elf32_bfd_copy_private_bfd_data
+#define bfd_elf32_bfd_copy_private_bfd_data \
+  _bfd_elf_copy_private_bfd_data
+#endif
+#ifndef bfd_elf32_bfd_print_private_bfd_data
+#define bfd_elf32_bfd_print_private_bfd_data \
+  _bfd_elf_print_private_bfd_data
+#endif
+#ifndef bfd_elf32_bfd_merge_private_bfd_data
+#define bfd_elf32_bfd_merge_private_bfd_data \
+  ((bfd_boolean (*) (bfd *, struct bfd_link_info *)) bfd_true)
+#endif
+#ifndef bfd_elf32_bfd_set_private_flags
+#define bfd_elf32_bfd_set_private_flags \
+  ((bfd_boolean (*) (bfd *, flagword)) bfd_true)
+#endif
+#ifndef bfd_elf32_bfd_is_local_label_name
+#define bfd_elf32_bfd_is_local_label_name _bfd_elf_is_local_label_name
+#endif
+#ifndef bfd_elf32_bfd_is_target_special_symbol
+#define bfd_elf32_bfd_is_target_special_symbol \
+  ((bfd_boolean (*) (bfd *, asymbol *)) bfd_false)
+#endif
+/* Dynamic relocation accessors.  */
+#ifndef bfd_elf32_get_dynamic_reloc_upper_bound
+#define bfd_elf32_get_dynamic_reloc_upper_bound \
+  _bfd_elf_get_dynamic_reloc_upper_bound
+#endif
+#ifndef bfd_elf32_canonicalize_dynamic_reloc
+#define bfd_elf32_canonicalize_dynamic_reloc \
+  _bfd_elf_canonicalize_dynamic_reloc
+#endif
+
+#ifdef elf_backend_relocate_section /* Backend supplies relocate_section: default to the ELF linker routines.  */
+#ifndef bfd_elf32_bfd_link_hash_table_create
+#define bfd_elf32_bfd_link_hash_table_create _bfd_elf_link_hash_table_create
+#endif
+#ifndef bfd_elf32_bfd_copy_link_hash_symbol_type
+#define bfd_elf32_bfd_copy_link_hash_symbol_type \
+  _bfd_elf_copy_link_hash_symbol_type
+#endif
+#ifndef bfd_elf32_bfd_link_add_symbols
+#define bfd_elf32_bfd_link_add_symbols	bfd_elf_link_add_symbols
+#endif
+#ifndef bfd_elf32_bfd_final_link
+#define bfd_elf32_bfd_final_link	bfd_elf_final_link
+#endif
+#else /* ! defined (elf_backend_relocate_section) */
+/* If no backend relocate_section routine, use the generic linker.
+   Note - this will prevent the port from being able to use some of
+   the other features of the ELF linker, because the generic hash structure
+   does not have the fields needed by the ELF linker.  In particular it
+   means that linking directly to S-records will not work.  */
+#ifndef bfd_elf32_bfd_link_hash_table_create
+#define bfd_elf32_bfd_link_hash_table_create \
+  _bfd_generic_link_hash_table_create
+#endif
+#ifndef bfd_elf32_bfd_copy_link_hash_symbol_type
+#define bfd_elf32_bfd_copy_link_hash_symbol_type \
+  _bfd_generic_copy_link_hash_symbol_type
+#endif
+#ifndef bfd_elf32_bfd_link_add_symbols
+#define bfd_elf32_bfd_link_add_symbols	_bfd_generic_link_add_symbols
+#endif
+#ifndef bfd_elf32_bfd_final_link
+#define bfd_elf32_bfd_final_link	_bfd_generic_final_link
+#endif
+#endif /* ! defined (elf_backend_relocate_section) */
+
+#ifndef bfd_elf32_bfd_link_just_syms
+#define bfd_elf32_bfd_link_just_syms	_bfd_elf_link_just_syms
+#endif
+
+#ifndef bfd_elf32_bfd_link_split_section
+#define bfd_elf32_bfd_link_split_section _bfd_generic_link_split_section
+#endif
+
+#ifndef bfd_elf32_bfd_link_check_relocs
+#define bfd_elf32_bfd_link_check_relocs  _bfd_elf_link_check_relocs
+#endif
+/* Archive handling and the object/core/archive creation routines.  */
+#ifndef bfd_elf32_archive_p
+#define bfd_elf32_archive_p bfd_generic_archive_p
+#endif
+
+#ifndef bfd_elf32_write_archive_contents
+#define bfd_elf32_write_archive_contents _bfd_write_archive_contents
+#endif
+
+#ifndef bfd_elf32_mkobject
+#define bfd_elf32_mkobject bfd_elf_make_object
+#endif
+
+#ifndef bfd_elf32_mkcorefile
+#define bfd_elf32_mkcorefile bfd_elf_mkcorefile
+#endif
+
+#ifndef bfd_elf32_mkarchive
+#define bfd_elf32_mkarchive _bfd_generic_mkarchive
+#endif
+
+#ifndef bfd_elf32_print_symbol
+#define bfd_elf32_print_symbol bfd_elf_print_symbol
+#endif
+
+#ifndef elf_symbol_leading_char
+#define elf_symbol_leading_char 0
+#endif
+
+#ifndef elf_info_to_howto
+#define elf_info_to_howto 0
+#endif
+
+#ifndef elf_info_to_howto_rel
+#define elf_info_to_howto_rel 0
+#endif
+
+#ifndef elf_backend_arch_data
+#define elf_backend_arch_data NULL
+#endif
+
+#ifndef ELF_TARGET_ID
+#define ELF_TARGET_ID	GENERIC_ELF_DATA
+#endif
+
+#ifndef ELF_OSABI
+#define ELF_OSABI ELFOSABI_NONE
+#endif
+/* ELF_MAXPAGESIZE has no default: the including file must define it.  */
+#ifndef ELF_MAXPAGESIZE
+# error ELF_MAXPAGESIZE is not defined
+#define ELF_MAXPAGESIZE 1 /* NOTE(review): reached only after the #error above; presumably a placeholder so later uses still expand.  */
+#endif
+/* Unset page sizes cascade: COMMON defaults to MAX, MIN defaults to COMMON.  */
+#ifndef ELF_COMMONPAGESIZE
+#define ELF_COMMONPAGESIZE ELF_MAXPAGESIZE
+#endif
+
+#ifndef ELF_MINPAGESIZE
+#define ELF_MINPAGESIZE ELF_COMMONPAGESIZE
+#endif
+/* Enforce ELF_MINPAGESIZE <= ELF_COMMONPAGESIZE <= ELF_MAXPAGESIZE at compile time.  */
+#if ELF_COMMONPAGESIZE > ELF_MAXPAGESIZE
+# error ELF_COMMONPAGESIZE > ELF_MAXPAGESIZE
+#endif
+#if ELF_MINPAGESIZE > ELF_COMMONPAGESIZE
+# error ELF_MINPAGESIZE > ELF_COMMONPAGESIZE
+#endif
+
+#ifndef ELF_DYNAMIC_SEC_FLAGS
+/* Note that we set the SEC_IN_MEMORY flag for these sections.  */
+#define ELF_DYNAMIC_SEC_FLAGS			\
+  (SEC_ALLOC | SEC_LOAD | SEC_HAS_CONTENTS	\
+   | SEC_IN_MEMORY | SEC_LINKER_CREATED)
+#endif
+
+#ifndef elf_backend_collect
+#define elf_backend_collect FALSE
+#endif
+#ifndef elf_backend_type_change_ok
+#define elf_backend_type_change_ok FALSE
+#endif
+
+#ifndef elf_backend_sym_is_global /* Defaults for entries of the elf32_bed initialiser; define a name before inclusion to override it.  */
+#define elf_backend_sym_is_global	0
+#endif
+#ifndef elf_backend_object_p
+#define elf_backend_object_p		0
+#endif
+#ifndef elf_backend_symbol_processing
+#define elf_backend_symbol_processing	0
+#endif
+#ifndef elf_backend_symbol_table_processing
+#define elf_backend_symbol_table_processing	0
+#endif
+#ifndef elf_backend_get_symbol_type
+#define elf_backend_get_symbol_type 0
+#endif
+#ifndef elf_backend_archive_symbol_lookup
+#define elf_backend_archive_symbol_lookup _bfd_elf_archive_symbol_lookup
+#endif
+#ifndef elf_backend_name_local_section_symbols
+#define elf_backend_name_local_section_symbols	0
+#endif
+#ifndef elf_backend_section_processing
+#define elf_backend_section_processing	0
+#endif
+#ifndef elf_backend_section_from_shdr
+#define elf_backend_section_from_shdr	_bfd_elf_make_section_from_shdr
+#endif
+#ifndef elf_backend_section_flags
+#define elf_backend_section_flags	0
+#endif
+#ifndef elf_backend_get_sec_type_attr
+#define elf_backend_get_sec_type_attr	_bfd_elf_get_sec_type_attr
+#endif
+#ifndef elf_backend_section_from_phdr
+#define elf_backend_section_from_phdr	_bfd_elf_make_section_from_phdr
+#endif
+#ifndef elf_backend_fake_sections
+#define elf_backend_fake_sections	0
+#endif
+#ifndef elf_backend_section_from_bfd_section
+#define elf_backend_section_from_bfd_section	0
+#endif
+#ifndef elf_backend_add_symbol_hook
+#define elf_backend_add_symbol_hook	0
+#endif
+#ifndef elf_backend_link_output_symbol_hook
+#define elf_backend_link_output_symbol_hook 0
+#endif
+#ifndef elf_backend_create_dynamic_sections
+#define elf_backend_create_dynamic_sections 0
+#endif
+#ifndef elf_backend_omit_section_dynsym
+#define elf_backend_omit_section_dynsym _bfd_elf_link_omit_section_dynsym
+#endif
+#ifndef elf_backend_relocs_compatible
+#define elf_backend_relocs_compatible _bfd_elf_default_relocs_compatible
+#endif
+#ifndef elf_backend_check_relocs
+#define elf_backend_check_relocs	0
+#endif
+#ifndef elf_backend_check_directives
+#define elf_backend_check_directives	0
+#endif
+#ifndef elf_backend_notice_as_needed
+#define elf_backend_notice_as_needed	_bfd_elf_notice_as_needed
+#endif
+#ifndef elf_backend_adjust_dynamic_symbol
+#define elf_backend_adjust_dynamic_symbol 0
+#endif
+#ifndef elf_backend_always_size_sections
+#define elf_backend_always_size_sections 0
+#endif
+#ifndef elf_backend_size_dynamic_sections
+#define elf_backend_size_dynamic_sections 0
+#endif
+#ifndef elf_backend_init_index_section
+#define elf_backend_init_index_section \
+ ((void (*) (bfd *, struct bfd_link_info *)) bfd_void)
+#endif
+#ifndef elf_backend_relocate_section /* Whether this was defined also selected the linker routines earlier in this header.  */
+#define elf_backend_relocate_section	0
+#endif
+#ifndef elf_backend_finish_dynamic_symbol
+#define elf_backend_finish_dynamic_symbol	0
+#endif
+#ifndef elf_backend_finish_dynamic_sections
+#define elf_backend_finish_dynamic_sections	0
+#endif
+#ifndef elf_backend_begin_write_processing
+#define elf_backend_begin_write_processing	0
+#endif
+#ifndef elf_backend_final_write_processing
+#define elf_backend_final_write_processing	0
+#endif
+#ifndef elf_backend_additional_program_headers
+#define elf_backend_additional_program_headers	0
+#endif
+#ifndef elf_backend_modify_segment_map
+#define elf_backend_modify_segment_map	0
+#endif
+#ifndef elf_backend_modify_program_headers
+#define elf_backend_modify_program_headers	0
+#endif
+#ifndef elf_backend_allow_non_load_phdr
+#define elf_backend_allow_non_load_phdr	\
+  ((bfd_boolean (*) (bfd *, const Elf_Internal_Phdr *, unsigned)) bfd_false)
+#endif
+#ifndef elf_backend_ecoff_debug_swap
+#define elf_backend_ecoff_debug_swap	0
+#endif
+#ifndef elf_backend_bfd_from_remote_memory
+#define elf_backend_bfd_from_remote_memory _bfd_elf32_bfd_from_remote_memory
+#endif
+#ifndef elf_backend_got_header_size
+#define elf_backend_got_header_size	0
+#endif
+#ifndef elf_backend_got_elt_size
+#define elf_backend_got_elt_size _bfd_elf_default_got_elt_size
+#endif
+#ifndef elf_backend_obj_attrs_vendor
+#define elf_backend_obj_attrs_vendor		NULL
+#endif
+#ifndef elf_backend_obj_attrs_section
+#define elf_backend_obj_attrs_section		NULL
+#endif
+#ifndef elf_backend_obj_attrs_arg_type
+#define elf_backend_obj_attrs_arg_type		NULL
+#endif
+#ifndef elf_backend_obj_attrs_section_type
+#define elf_backend_obj_attrs_section_type		SHT_GNU_ATTRIBUTES
+#endif
+#ifndef elf_backend_obj_attrs_order
+#define elf_backend_obj_attrs_order		NULL
+#endif
+#ifndef elf_backend_obj_attrs_handle_unknown
+#define elf_backend_obj_attrs_handle_unknown	NULL
+#endif
+#ifndef elf_backend_static_tls_alignment
+#define elf_backend_static_tls_alignment	1
+#endif
+#ifndef elf_backend_post_process_headers
+#define elf_backend_post_process_headers	_bfd_elf_post_process_headers
+#endif
+#ifndef elf_backend_print_symbol_all
+#define elf_backend_print_symbol_all		NULL
+#endif
+#ifndef elf_backend_output_arch_local_syms
+#define elf_backend_output_arch_local_syms	NULL
+#endif
+#ifndef elf_backend_output_arch_syms
+#define elf_backend_output_arch_syms		NULL
+#endif
+#ifndef elf_backend_filter_implib_symbols
+#define elf_backend_filter_implib_symbols	NULL
+#endif
+#ifndef elf_backend_copy_indirect_symbol
+#define elf_backend_copy_indirect_symbol	_bfd_elf_link_hash_copy_indirect
+#endif
+#ifndef elf_backend_hide_symbol
+#define elf_backend_hide_symbol			_bfd_elf_link_hash_hide_symbol
+#endif
+#ifndef elf_backend_fixup_symbol
+#define elf_backend_fixup_symbol		NULL
+#endif
+#ifndef elf_backend_merge_symbol_attribute
+#define elf_backend_merge_symbol_attribute	NULL
+#endif
+#ifndef elf_backend_get_target_dtag
+#define elf_backend_get_target_dtag		NULL
+#endif
+#ifndef elf_backend_ignore_undef_symbol
+#define elf_backend_ignore_undef_symbol		NULL
+#endif
+#ifndef elf_backend_emit_relocs
+#define elf_backend_emit_relocs			_bfd_elf_link_output_relocs
+#endif
+#ifndef elf_backend_update_relocs
+#define elf_backend_update_relocs		NULL
+#endif
+#ifndef elf_backend_count_relocs
+#define elf_backend_count_relocs		NULL
+#endif
+#ifndef elf_backend_count_additional_relocs
+#define elf_backend_count_additional_relocs	NULL
+#endif
+#ifndef elf_backend_sort_relocs_p
+#define elf_backend_sort_relocs_p		NULL
+#endif
+#ifndef elf_backend_grok_prstatus
+#define elf_backend_grok_prstatus		NULL
+#endif
+#ifndef elf_backend_grok_psinfo
+#define elf_backend_grok_psinfo			NULL
+#endif
+#ifndef elf_backend_write_core_note
+#define elf_backend_write_core_note		NULL
+#endif
+#ifndef elf_backend_lookup_section_flags_hook
+#define elf_backend_lookup_section_flags_hook	NULL
+#endif
+#ifndef elf_backend_reloc_type_class
+#define elf_backend_reloc_type_class		_bfd_elf_reloc_type_class
+#endif
+#ifndef elf_backend_discard_info
+#define elf_backend_discard_info		NULL
+#endif
+#ifndef elf_backend_ignore_discarded_relocs
+#define elf_backend_ignore_discarded_relocs	NULL
+#endif
+#ifndef elf_backend_action_discarded
+#define elf_backend_action_discarded _bfd_elf_default_action_discarded
+#endif
+#ifndef elf_backend_eh_frame_address_size
+#define elf_backend_eh_frame_address_size _bfd_elf_eh_frame_address_size
+#endif
+#ifndef elf_backend_can_make_relative_eh_frame
+#define elf_backend_can_make_relative_eh_frame	_bfd_elf_can_make_relative
+#endif
+#ifndef elf_backend_can_make_lsda_relative_eh_frame
+#define elf_backend_can_make_lsda_relative_eh_frame	_bfd_elf_can_make_relative
+#endif
+#ifndef elf_backend_encode_eh_address
+#define elf_backend_encode_eh_address		_bfd_elf_encode_eh_address
+#endif
+#ifndef elf_backend_write_section
+#define elf_backend_write_section		NULL
+#endif
+#ifndef elf_backend_mips_irix_compat
+#define elf_backend_mips_irix_compat		NULL
+#endif
+#ifndef elf_backend_mips_rtype_to_howto
+#define elf_backend_mips_rtype_to_howto		NULL
+#endif
+
+/* Previously, backends could only use SHT_REL or SHT_RELA relocation
+   sections, but not both.  They defined USE_REL to indicate SHT_REL
+   sections, and left it undefined to indicate SHT_RELA sections.
+   For backwards compatibility, we still support this usage.  */
+#ifndef USE_REL
+#define USE_REL 0
+#endif
+
+/* Use these in new code.  */
+#ifndef elf_backend_may_use_rel_p
+#define elf_backend_may_use_rel_p USE_REL
+#endif
+#ifndef elf_backend_may_use_rela_p
+#define elf_backend_may_use_rela_p !USE_REL
+#endif
+#ifndef elf_backend_default_use_rela_p
+#define elf_backend_default_use_rela_p !USE_REL
+#endif
+#ifndef elf_backend_rela_plts_and_copies_p
+#define elf_backend_rela_plts_and_copies_p elf_backend_default_use_rela_p
+#endif
+/* With USE_REL left at 0 the three *_rela_p defaults above evaluate to 1 and may_use_rel_p to 0.  */
+#ifndef elf_backend_rela_normal
+#define elf_backend_rela_normal 0
+#endif
+
+#ifndef elf_backend_dtrel_excludes_plt
+#define elf_backend_dtrel_excludes_plt 0
+#endif
+
+#ifndef elf_backend_plt_sym_val
+#define elf_backend_plt_sym_val NULL
+#endif
+#ifndef elf_backend_relplt_name
+#define elf_backend_relplt_name NULL
+#endif
+
+#ifndef ELF_MACHINE_ALT1
+#define ELF_MACHINE_ALT1 0
+#endif
+
+#ifndef ELF_MACHINE_ALT2
+#define ELF_MACHINE_ALT2 0
+#endif
+
+#ifndef elf_backend_size_info
+#define elf_backend_size_info _bfd_elf32_size_info
+#endif
+
+#ifndef elf_backend_special_sections
+#define elf_backend_special_sections NULL
+#endif
+
+#ifndef elf_backend_sign_extend_vma
+#define elf_backend_sign_extend_vma 0
+#endif
+
+#ifndef elf_backend_link_order_error_handler
+#define elf_backend_link_order_error_handler _bfd_error_handler
+#endif
+
+#ifndef elf_backend_common_definition
+#define elf_backend_common_definition _bfd_elf_common_definition
+#endif
+
+#ifndef elf_backend_common_section_index
+#define elf_backend_common_section_index _bfd_elf_common_section_index
+#endif
+
+#ifndef elf_backend_common_section
+#define elf_backend_common_section _bfd_elf_common_section
+#endif
+
+#ifndef elf_backend_merge_symbol
+#define elf_backend_merge_symbol NULL
+#endif
+
+#ifndef elf_backend_hash_symbol
+#define elf_backend_hash_symbol _bfd_elf_hash_symbol
+#endif
+
+#ifndef elf_backend_is_function_type
+#define elf_backend_is_function_type _bfd_elf_is_function_type
+#endif
+
+#ifndef elf_backend_maybe_function_sym
+#define elf_backend_maybe_function_sym _bfd_elf_maybe_function_sym
+#endif
+
+#ifndef elf_backend_get_reloc_section
+#define elf_backend_get_reloc_section _bfd_elf_get_reloc_section
+#endif
+
+#ifndef elf_backend_copy_special_section_fields
+#define elf_backend_copy_special_section_fields NULL
+#endif
+
+#ifndef elf_backend_compact_eh_encoding
+#define elf_backend_compact_eh_encoding NULL
+#endif
+
+#ifndef elf_backend_cant_unwind_opcode
+#define elf_backend_cant_unwind_opcode 0
+#endif
+/* Match priority: 2 when ELF_ARCH is bfd_arch_unknown, else 1 when ELF_OSABI is ELFOSABI_NONE, else 0.  */
+#ifndef elf_match_priority
+#define elf_match_priority \
+  (ELF_ARCH == bfd_arch_unknown ? 2 : ELF_OSABI == ELFOSABI_NONE ? 1 : 0)
+#endif
+
+extern const struct elf_size_info _bfd_elf32_size_info;
+/* Backend data referenced by both the big- and little-endian target vectors defined after it.  */
+static struct elf_backend_data elf32_bed = /* Positional initialiser: entry order must follow the struct's member order.  */
+{
+  ELF_ARCH,			/* arch */
+  ELF_TARGET_ID,		/* target_id */
+  ELF_MACHINE_CODE,		/* elf_machine_code */
+  ELF_OSABI,			/* elf_osabi  */
+  ELF_MAXPAGESIZE,		/* maxpagesize */
+  ELF_MINPAGESIZE,		/* minpagesize */
+  ELF_COMMONPAGESIZE,		/* commonpagesize */
+  ELF_DYNAMIC_SEC_FLAGS,	/* dynamic_sec_flags */
+  elf_backend_arch_data,	/* From here on each entry is a macro: the backend's definition or this header's default.  */
+  elf_info_to_howto,
+  elf_info_to_howto_rel,
+  elf_backend_sym_is_global,
+  elf_backend_object_p,
+  elf_backend_symbol_processing,
+  elf_backend_symbol_table_processing,
+  elf_backend_get_symbol_type,
+  elf_backend_archive_symbol_lookup,
+  elf_backend_name_local_section_symbols,
+  elf_backend_section_processing,
+  elf_backend_section_from_shdr,
+  elf_backend_section_flags,
+  elf_backend_get_sec_type_attr,
+  elf_backend_section_from_phdr,
+  elf_backend_fake_sections,
+  elf_backend_section_from_bfd_section,
+  elf_backend_add_symbol_hook,
+  elf_backend_link_output_symbol_hook,
+  elf_backend_create_dynamic_sections,
+  elf_backend_omit_section_dynsym,
+  elf_backend_relocs_compatible,
+  elf_backend_check_relocs,
+  elf_backend_check_directives,
+  elf_backend_notice_as_needed,
+  elf_backend_adjust_dynamic_symbol,
+  elf_backend_always_size_sections,
+  elf_backend_size_dynamic_sections,
+  elf_backend_init_index_section,
+  elf_backend_relocate_section,
+  elf_backend_finish_dynamic_symbol,
+  elf_backend_finish_dynamic_sections,
+  elf_backend_begin_write_processing,
+  elf_backend_final_write_processing,
+  elf_backend_additional_program_headers,
+  elf_backend_modify_segment_map,
+  elf_backend_modify_program_headers,
+  elf_backend_allow_non_load_phdr,
+  elf_backend_gc_keep,
+  elf_backend_gc_mark_dynamic_ref,
+  elf_backend_gc_mark_hook,
+  elf_backend_gc_mark_extra_sections,
+  elf_backend_gc_sweep_hook,
+  elf_backend_post_process_headers,
+  elf_backend_print_symbol_all,
+  elf_backend_output_arch_local_syms,
+  elf_backend_output_arch_syms,
+  elf_backend_filter_implib_symbols,
+  elf_backend_copy_indirect_symbol,
+  elf_backend_hide_symbol,
+  elf_backend_fixup_symbol,
+  elf_backend_merge_symbol_attribute,
+  elf_backend_get_target_dtag,
+  elf_backend_ignore_undef_symbol,
+  elf_backend_emit_relocs,
+  elf_backend_update_relocs,
+  elf_backend_count_relocs,
+  elf_backend_count_additional_relocs,
+  elf_backend_sort_relocs_p,
+  elf_backend_grok_prstatus,
+  elf_backend_grok_psinfo,
+  elf_backend_write_core_note,
+  elf_backend_lookup_section_flags_hook,
+  elf_backend_reloc_type_class,
+  elf_backend_discard_info,
+  elf_backend_ignore_discarded_relocs,
+  elf_backend_action_discarded,
+  elf_backend_eh_frame_address_size,
+  elf_backend_can_make_relative_eh_frame,
+  elf_backend_can_make_lsda_relative_eh_frame,
+  elf_backend_encode_eh_address,
+  elf_backend_write_section,
+  elf_backend_mips_irix_compat,
+  elf_backend_mips_rtype_to_howto,
+  elf_backend_ecoff_debug_swap,
+  elf_backend_bfd_from_remote_memory,
+  elf_backend_plt_sym_val,
+  elf_backend_common_definition,
+  elf_backend_common_section_index,
+  elf_backend_common_section,
+  elf_backend_merge_symbol,
+  elf_backend_hash_symbol,
+  elf_backend_is_function_type,
+  elf_backend_maybe_function_sym,
+  elf_backend_get_reloc_section,
+  elf_backend_copy_special_section_fields,
+  elf_backend_link_order_error_handler,
+  elf_backend_relplt_name,
+  ELF_MACHINE_ALT1,
+  ELF_MACHINE_ALT2,
+  &elf_backend_size_info,	/* Address of the size-info object; defaults to _bfd_elf32_size_info.  */
+  elf_backend_special_sections,
+  elf_backend_got_header_size,
+  elf_backend_got_elt_size,
+  elf_backend_obj_attrs_vendor,
+  elf_backend_obj_attrs_section,
+  elf_backend_obj_attrs_arg_type,
+  elf_backend_obj_attrs_section_type,
+  elf_backend_obj_attrs_order,
+  elf_backend_obj_attrs_handle_unknown,
+  elf_backend_compact_eh_encoding,
+  elf_backend_cant_unwind_opcode,
+  elf_backend_static_tls_alignment,
+  elf_backend_stack_align,
+  elf_backend_strtab_flags,
+  elf_backend_collect,
+  elf_backend_type_change_ok,
+  elf_backend_may_use_rel_p,
+  elf_backend_may_use_rela_p,
+  elf_backend_default_use_rela_p,
+  elf_backend_rela_plts_and_copies_p,
+  elf_backend_rela_normal,
+  elf_backend_dtrel_excludes_plt,
+  elf_backend_sign_extend_vma,
+  elf_backend_want_got_plt,
+  elf_backend_plt_readonly,
+  elf_backend_want_plt_sym,
+  elf_backend_plt_not_loaded,
+  elf_backend_plt_alignment,
+  elf_backend_can_gc_sections,
+  elf_backend_can_refcount,
+  elf_backend_want_got_sym,
+  elf_backend_want_dynbss,
+  elf_backend_want_dynrelro,
+  elf_backend_want_p_paddr_set_to_zero,
+  elf_backend_no_page_alias,
+  elf_backend_default_execstack,
+  elf_backend_caches_rawsize,
+  elf_backend_extern_protected_data,
+  elf_backend_always_renumber_dynsyms
+};
+
+/* Forward declaration for use when initialising alternative_target field.  */
+#ifdef TARGET_LITTLE_SYM
+extern const bfd_target TARGET_LITTLE_SYM;
+#endif
+/* Big-endian target vector; emitted only when the including file defines TARGET_BIG_SYM.  */
+#ifdef TARGET_BIG_SYM
+const bfd_target TARGET_BIG_SYM =
+{
+  /* name: identify kind of target */
+  TARGET_BIG_NAME,
+
+  /* flavour: general indication about file */
+  bfd_target_elf_flavour,
+
+  /* byteorder: data is big endian */
+  BFD_ENDIAN_BIG,
+
+  /* header_byteorder: header is also big endian */
+  BFD_ENDIAN_BIG,
+
+  /* object_flags: mask of all file flags */
+  (HAS_RELOC | EXEC_P | HAS_LINENO | HAS_DEBUG | HAS_SYMS | HAS_LOCALS
+   | DYNAMIC | WP_TEXT | D_PAGED | BFD_COMPRESS | BFD_DECOMPRESS
+   | BFD_COMPRESS_GABI | BFD_CONVERT_ELF_COMMON | BFD_USE_ELF_STT_COMMON),
+
+  /* section_flags: mask of all section flags */
+  (SEC_HAS_CONTENTS | SEC_ALLOC | SEC_LOAD | SEC_RELOC | SEC_READONLY
+   | SEC_CODE | SEC_DATA | SEC_DEBUGGING | SEC_EXCLUDE | SEC_SORT_ENTRIES
+   | SEC_SMALL_DATA | SEC_MERGE | SEC_STRINGS | SEC_GROUP),
+
+   /* leading_symbol_char: is the first char of a user symbol
+      predictable, and if so what is it */
+  elf_symbol_leading_char,
+
+  /* ar_pad_char: pad character for filenames within an archive header
+     FIXME:  this really has nothing to do with ELF, this is a characteristic
+     of the archiver and/or os and should be independently tunable */
+  '/',
+
+  /* ar_max_namelen: maximum number of characters in an archive header
+     FIXME:  this really has nothing to do with ELF, this is a characteristic
+     of the archiver and should be independently tunable.  The System V ABI,
+     Chapter 7 (Formats & Protocols), Archive section sets this as 15.  */
+  15,
+
+  elf_match_priority,		/* Match priority, taken from the elf_match_priority macro.  */
+
+  /* Routines to byte-swap various sized integers from the data sections */
+  bfd_getb64, bfd_getb_signed_64, bfd_putb64,
+    bfd_getb32, bfd_getb_signed_32, bfd_putb32,
+    bfd_getb16, bfd_getb_signed_16, bfd_putb16,
+
+  /* Routines to byte-swap various sized integers from the file headers */
+  bfd_getb64, bfd_getb_signed_64, bfd_putb64,
+    bfd_getb32, bfd_getb_signed_32, bfd_putb32,
+    bfd_getb16, bfd_getb_signed_16, bfd_putb16,
+
+  /* bfd_check_format: check the format of a file being read */
+  { _bfd_dummy_target,		/* unknown format */
+    bfd_elf32_object_p,		/* assembler/linker output (object file) */
+    bfd_elf32_archive_p,	/* an archive */
+    bfd_elf32_core_file_p	/* a core file */
+  },
+
+  /* bfd_set_format: set the format of a file being written */
+  { bfd_false,
+    bfd_elf32_mkobject,
+    bfd_elf32_mkarchive,
+    bfd_elf32_mkcorefile
+  },
+
+  /* bfd_write_contents: write cached information into a file being written */
+  { bfd_false,
+    bfd_elf32_write_object_contents,
+    bfd_elf32_write_archive_contents,
+    bfd_elf32_write_corefile_contents,
+  },
+  /* Jump tables keyed on the bfd_elf32 prefix; only the archive table varies (see the #if).  */
+  BFD_JUMP_TABLE_GENERIC (bfd_elf32),
+  BFD_JUMP_TABLE_COPY (bfd_elf32),
+  BFD_JUMP_TABLE_CORE (bfd_elf32),
+#ifdef bfd_elf32_archive_functions
+  BFD_JUMP_TABLE_ARCHIVE (bfd_elf32_archive),
+#elif defined USE_64_BIT_ARCHIVE
+  BFD_JUMP_TABLE_ARCHIVE (_bfd_archive_64_bit),
+#else
+  BFD_JUMP_TABLE_ARCHIVE (_bfd_archive_coff),
+#endif
+  BFD_JUMP_TABLE_SYMBOLS (bfd_elf32),
+  BFD_JUMP_TABLE_RELOCS (bfd_elf32),
+  BFD_JUMP_TABLE_WRITE (bfd_elf32),
+  BFD_JUMP_TABLE_LINK (bfd_elf32),
+  BFD_JUMP_TABLE_DYNAMIC (bfd_elf32),
+
+  /* Alternative endian target.  */
+#ifdef TARGET_LITTLE_SYM
+  & TARGET_LITTLE_SYM,
+#else
+  NULL,
+#endif
+
+  /* backend_data: */
+  &elf32_bed
+};
+#endif
+
+#ifdef TARGET_LITTLE_SYM /* Little-endian target vector; emitted only when the including file defines this name.  */
+const bfd_target TARGET_LITTLE_SYM =
+{
+  /* name: identify kind of target */
+  TARGET_LITTLE_NAME,
+
+  /* flavour: general indication about file */
+  bfd_target_elf_flavour,
+
+  /* byteorder: data is little endian */
+  BFD_ENDIAN_LITTLE,
+
+  /* header_byteorder: header is also little endian */
+  BFD_ENDIAN_LITTLE,
+
+  /* object_flags: mask of all file flags */
+  (HAS_RELOC | EXEC_P | HAS_LINENO | HAS_DEBUG | HAS_SYMS | HAS_LOCALS
+   | DYNAMIC | WP_TEXT | D_PAGED | BFD_COMPRESS | BFD_DECOMPRESS
+   | BFD_COMPRESS_GABI | BFD_CONVERT_ELF_COMMON | BFD_USE_ELF_STT_COMMON),
+
+  /* section_flags: mask of all section flags */
+  (SEC_HAS_CONTENTS | SEC_ALLOC | SEC_LOAD | SEC_RELOC | SEC_READONLY
+   | SEC_CODE | SEC_DATA | SEC_DEBUGGING | SEC_EXCLUDE | SEC_SORT_ENTRIES
+   | SEC_SMALL_DATA | SEC_MERGE | SEC_STRINGS | SEC_GROUP),
+
+   /* leading_symbol_char: is the first char of a user symbol
+      predictable, and if so what is it */
+  elf_symbol_leading_char,
+
+  /* ar_pad_char: pad character for filenames within an archive header
+     FIXME:  this really has nothing to do with ELF, this is a characteristic
+     of the archiver and/or os and should be independently tunable */
+  '/',
+
+  /* ar_max_namelen: maximum number of characters in an archive header
+     FIXME:  this really has nothing to do with ELF, this is a characteristic
+     of the archiver and should be independently tunable.  The System V ABI,
+     Chapter 7 (Formats & Protocols), Archive section sets this as 15.  */
+  15,
+
+  elf_match_priority,		/* Match priority, taken from the elf_match_priority macro.  */
+
+  /* Routines to byte-swap various sized integers from the data sections */
+  bfd_getl64, bfd_getl_signed_64, bfd_putl64,
+    bfd_getl32, bfd_getl_signed_32, bfd_putl32,
+    bfd_getl16, bfd_getl_signed_16, bfd_putl16,
+
+  /* Routines to byte-swap various sized integers from the file headers */
+  bfd_getl64, bfd_getl_signed_64, bfd_putl64,
+    bfd_getl32, bfd_getl_signed_32, bfd_putl32,
+    bfd_getl16, bfd_getl_signed_16, bfd_putl16,
+
+  /* bfd_check_format: check the format of a file being read */
+  { _bfd_dummy_target,		/* unknown format */
+    bfd_elf32_object_p,		/* assembler/linker output (object file) */
+    bfd_elf32_archive_p,	/* an archive */
+    bfd_elf32_core_file_p	/* a core file */
+  },
+
+  /* bfd_set_format: set the format of a file being written */
+  { bfd_false,
+    bfd_elf32_mkobject,
+    bfd_elf32_mkarchive,
+    bfd_elf32_mkcorefile
+  },
+
+  /* bfd_write_contents: write cached information into a file being written */
+  { bfd_false,
+    bfd_elf32_write_object_contents,
+    bfd_elf32_write_archive_contents,
+    bfd_elf32_write_corefile_contents,
+  },
+  /* Jump tables keyed on the bfd_elf32 prefix; only the archive table varies (see the #if).  */
+  BFD_JUMP_TABLE_GENERIC (bfd_elf32),
+  BFD_JUMP_TABLE_COPY (bfd_elf32),
+  BFD_JUMP_TABLE_CORE (bfd_elf32),
+#ifdef bfd_elf32_archive_functions
+  BFD_JUMP_TABLE_ARCHIVE (bfd_elf32_archive),
+#elif defined USE_64_BIT_ARCHIVE
+  BFD_JUMP_TABLE_ARCHIVE (_bfd_archive_64_bit),
+#else
+  BFD_JUMP_TABLE_ARCHIVE (_bfd_archive_coff),
+#endif
+  BFD_JUMP_TABLE_SYMBOLS (bfd_elf32),
+  BFD_JUMP_TABLE_RELOCS (bfd_elf32),
+  BFD_JUMP_TABLE_WRITE (bfd_elf32),
+  BFD_JUMP_TABLE_LINK (bfd_elf32),
+  BFD_JUMP_TABLE_DYNAMIC (bfd_elf32),
+
+  /* Alternative endian target.  */
+#ifdef TARGET_BIG_SYM
+  & TARGET_BIG_SYM,
+#else
+  NULL,
+#endif
+
+  /* backend_data: */
+  &elf32_bed
+};
+#endif
diff --git a/utils/gapy/gen-debug-info-src/ext/bfd/elf64-riscv.c b/utils/gapy/gen-debug-info-src/ext/bfd/elf64-riscv.c
new file mode 100644
index 000000000..2f3bd5615
--- /dev/null
+++ b/utils/gapy/gen-debug-info-src/ext/bfd/elf64-riscv.c
@@ -0,0 +1,4322 @@
+#line 1 "/home/haugoug/src/gap_sdk/riscv-gnu-toolchain/riscv-binutils-gdb/bfd/elfnn-riscv.c"
+/* RISC-V-specific support for 64-bit ELF.
+   Copyright (C) 2011-2017 Free Software Foundation, Inc.
+
+   Contributed by Andrew Waterman (andrew@sifive.com).
+   Based on TILE-Gx and MIPS targets.
+
+   PULP family support contributed by Eric Flamand (eflamand@iis.ee.ethz.ch) at ETH-Zurich
+   and Greenwaves Technologies (eric.flamand@greenwaves-technologies.com)
+
+   This file is part of BFD, the Binary File Descriptor library.
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; see the file COPYING3. If not,
+   see <http://www.gnu.org/licenses/>.  */
+
+/* This file handles RISC-V ELF targets.  */
+
+#include "sysdep.h"
+#include "bfd.h"
+#include "libbfd.h"
+#include "bfdlink.h"
+#include "genlink.h"
+#include "elf-bfd.h"
+#include "elfxx-riscv.h"
+#include "elf/riscv.h"
+#include "opcode/riscv.h"
+
+#define ARCH_SIZE 64
+
+#define MINUS_ONE ((bfd_vma)0 - 1)
+
+#define RISCV_ELF_LOG_WORD_BYTES (ARCH_SIZE == 32 ? 2 : 3)
+
+#define RISCV_ELF_WORD_BYTES (1 << RISCV_ELF_LOG_WORD_BYTES)
+
+/* The name of the dynamic interpreter.  This is put in the .interp
+   section.  */
+
+#define ELF64_DYNAMIC_INTERPRETER "/lib/ld.so.1"
+#define ELF32_DYNAMIC_INTERPRETER "/lib32/ld.so.1"
+
+#define ELF_ARCH			bfd_arch_riscv
+#define ELF_TARGET_ID			RISCV_ELF_DATA
+#define ELF_MACHINE_CODE		EM_RISCV
+#define ELF_MAXPAGESIZE			0x1000
+#define ELF_COMMONPAGESIZE		0x1000
+
+/* Linker argument -mComp
+   Set linker to component mode, in this case when export section is generated we use an offset relative to the section head
+   for exported symbol. If not, resident mode, we use the absolute address */
+
+#if 64==32
+bfd_boolean ComponentMode = FALSE;
+
+/*  Linker argument -mDIE, to dump import and export sections */
+unsigned int DumpImportExportSections = 0;
+#endif
+
+/* The RISC-V linker needs to keep track of the number of relocs that it
+   decides to copy as dynamic relocs in check_relocs for each symbol.
+   This is so that it can later discard them if they are found to be
+   unnecessary.  We store the information in a field extending the
+   regular ELF linker hash table.  */
+
+struct riscv_elf_dyn_relocs	/* List node: reloc counts for one input section.  */
+{
+  struct riscv_elf_dyn_relocs *next;	/* Next node, or NULL at the end.  */
+
+  /* The input section of the reloc.  */
+  asection *sec;
+
+  /* Total number of relocs copied for the input section.  */
+  bfd_size_type count;
+
+  /* Number of pc-relative relocs copied for the input section.  */
+  bfd_size_type pc_count;
+};
+
+/* RISC-V ELF linker hash entry.  */
+
+struct riscv_elf_link_hash_entry
+{
+  struct elf_link_hash_entry elf;	/* Generic ELF entry; first member, the pointer casts in this file rely on it.  */
+
+  /* Track dynamic relocs copied for this symbol.  */
+  struct riscv_elf_dyn_relocs *dyn_relocs;
+  /* GOT access kinds, OR-ed into tls_type; GOT_NORMAL may not be mixed with a TLS kind.  */
+#define GOT_UNKNOWN     0
+#define GOT_NORMAL      1
+#define GOT_TLS_GD      2
+#define GOT_TLS_IE      4
+#define GOT_TLS_LE      8
+  char tls_type;	/* Starts as GOT_UNKNOWN (see link_hash_newfunc).  */
+};
+/* View a generic hash entry pointer as the RISC-V entry that contains it.  */
+#define riscv_elf_hash_entry(ent) \
+  ((struct riscv_elf_link_hash_entry *)(ent))
+
+struct _bfd_riscv_elf_obj_tdata
+{
+  struct elf_obj_tdata root;	/* Generic ELF per-object data.  */
+
+  /* tls_type for each local got entry.  */
+  char *local_got_tls_type;
+};
+/* View ABFD's tdata.any pointer as the RISC-V object data.  */
+#define _bfd_riscv_elf_tdata(abfd) \
+  ((struct _bfd_riscv_elf_obj_tdata *) (abfd)->tdata.any)
+/* The local_got_tls_type array of ABFD (an lvalue).  */
+#define _bfd_riscv_elf_local_got_tls_type(abfd) \
+  (_bfd_riscv_elf_tdata (abfd)->local_got_tls_type)
+/* Lvalue of the tls_type of global symbol H or, when H is NULL, of local symbol SYMNDX.  */
+#define _bfd_riscv_elf_tls_type(abfd, h, symndx)		\
+  (*((h) != NULL ? &riscv_elf_hash_entry (h)->tls_type		\
+     : &_bfd_riscv_elf_local_got_tls_type (abfd) [symndx]))
+/* Nonzero when BFD is an ELF file with tdata whose object id is RISCV_ELF_DATA.  */
+#define is_riscv_elf(bfd)				\
+  (bfd_get_flavour (bfd) == bfd_target_elf_flavour	\
+   && elf_tdata (bfd) != NULL				\
+   && elf_object_id (bfd) == RISCV_ELF_DATA)
+
+#include "elf/common.h"
+#include "elf/internal.h"
+
+struct riscv_elf_link_hash_table
+{
+  struct elf_link_hash_table elf;	/* Generic ELF table; first member.  */
+
+  /* Short-cuts to get to dynamic linker sections.  */
+  asection *sdyntdata;	/* .tdata.dyn; only created for non-PIC links.  */
+
+  /* Small local sym to section mapping cache.  */
+  struct sym_cache sym_cache;
+};
+
+/* Get the RISC-V ELF linker hash table from a link_info structure.
+   Evaluates to NULL when the table's id is not RISCV_ELF_DATA.  */
+#define riscv_elf_hash_table(p) \
+  (elf_hash_table_id ((struct elf_link_hash_table *) ((p)->hash)) \
+  == RISCV_ELF_DATA ? ((struct riscv_elf_link_hash_table *) ((p)->hash)) : NULL)
+
+static void	/* Point CACHE_PTR->howto at the howto entry for DST's reloc type.  */
+riscv_info_to_howto_rela (bfd *abfd ATTRIBUTE_UNUSED, arelent *cache_ptr,
+			  Elf_Internal_Rela *dst)
+{
+  unsigned int r_type = ELF64_R_TYPE (dst->r_info);
+  cache_ptr->howto = riscv_elf_rtype_to_howto (r_type);
+}
+
+bfd_boolean _bfd_riscv_elf_final_link (bfd *, struct bfd_link_info *);
+
+static void
+riscv_elf_append_rela (bfd *abfd, asection *s, Elf_Internal_Rela *rel)
+{
+  /* Swap REL out into the next unused slot of S, then count it.  */
+  const struct elf_backend_data *backend = get_elf_backend_data (abfd);
+  bfd_byte *slot = s->contents + (s->reloc_count * backend->s->sizeof_rela);
+
+  s->reloc_count += 1;
+  backend->s->swap_reloca_out (abfd, rel, slot);
+}
+
+/* PLT/GOT stuff.  */
+
+#define PLT_HEADER_INSNS 8	/* Instruction words written by riscv_make_plt_header.  */
+#define PLT_ENTRY_INSNS 4	/* Instruction words written by riscv_make_plt_entry.  */
+#define PLT_HEADER_SIZE (PLT_HEADER_INSNS * 4)	/* In bytes; 4 bytes per instruction word.  */
+#define PLT_ENTRY_SIZE (PLT_ENTRY_INSNS * 4)	/* In bytes.  */
+
+#define GOT_ENTRY_SIZE RISCV_ELF_WORD_BYTES	/* One target word per GOT slot.  */
+
+#define GOTPLT_HEADER_SIZE (2 * GOT_ENTRY_SIZE)	/* Two slots precede the per-PLT slots in .got.plt.  */
+/* Address of input section SEC inside the output image.  */
+#define sec_addr(sec) ((sec)->output_section->vma + (sec)->output_offset)
+
+static bfd_vma	/* Address of the .got.plt slot that belongs to PLT_INDEX.  */
+riscv_elf_got_plt_val (bfd_vma plt_index, struct bfd_link_info *info)
+{
+  asection *gotplt = riscv_elf_hash_table (info)->elf.sgotplt;
+  return sec_addr (gotplt) + GOTPLT_HEADER_SIZE + (plt_index * GOT_ENTRY_SIZE);
+}
+
+#if ARCH_SIZE == 32
+# define MATCH_LREG MATCH_LW
+#else
+# define MATCH_LREG MATCH_LD
+#endif
+
+/* Generate a PLT header.  */
+
+static void
+riscv_make_plt_header (bfd_vma gotplt_addr, bfd_vma addr, uint32_t *entry)
+{
+  bfd_vma gotplt_offset_high = RISCV_PCREL_HIGH_PART (gotplt_addr, addr);
+  bfd_vma gotplt_offset_low = RISCV_PCREL_LOW_PART (gotplt_addr, addr);
+  /* Fills ENTRY[0..7]; GOTPLT_ADDR is addressed pc-relatively from ADDR.  */
+  /* auipc  t2, %hi(.got.plt)
+     sub    t1, t1, t3               # shifted .got.plt offset + hdr size + 12
+     l[w|d] t3, %lo(.got.plt)(t2)    # _dl_runtime_resolve
+     addi   t1, t1, -(hdr size + 12) # shifted .got.plt offset
+     addi   t0, t2, %lo(.got.plt)    # &.got.plt
+     srli   t1, t1, log2(16/PTRSIZE) # .got.plt offset
+     l[w|d] t0, PTRSIZE(t0)          # link map
+     jr     t3 */
+
+  entry[0] = RISCV_UTYPE (AUIPC, X_T2, gotplt_offset_high);
+  entry[1] = RISCV_RTYPE (SUB, X_T1, X_T1, X_T3);
+  entry[2] = RISCV_ITYPE (LREG, X_T3, X_T2, gotplt_offset_low);
+  entry[3] = RISCV_ITYPE (ADDI, X_T1, X_T1, -(PLT_HEADER_SIZE + 12));
+  entry[4] = RISCV_ITYPE (ADDI, X_T0, X_T2, gotplt_offset_low);
+  entry[5] = RISCV_ITYPE (SRLI, X_T1, X_T1, 4 - RISCV_ELF_LOG_WORD_BYTES);
+  entry[6] = RISCV_ITYPE (LREG, X_T0, X_T0, RISCV_ELF_WORD_BYTES);
+  entry[7] = RISCV_ITYPE (JALR, 0, X_T3, 0);	/* rd = x0: plain jump, no link.  */
+}
+
+/* Generate a PLT entry.  */
+
+static void
+riscv_make_plt_entry (bfd_vma got, bfd_vma addr, uint32_t *entry)
+{
+  /* Distance from this PLT entry (ADDR) to its .got.plt slot (GOT),
+     split into the part auipc adds and the part the load adds.  */
+  bfd_vma got_offset_high = RISCV_PCREL_HIGH_PART (got, addr);
+  bfd_vma got_offset_low = RISCV_PCREL_LOW_PART (got, addr);
+
+  entry[0] = RISCV_UTYPE (AUIPC, X_T3, got_offset_high);	/* auipc  t3, %hi(.got.plt entry) */
+  entry[1] = RISCV_ITYPE (LREG, X_T3, X_T3, got_offset_low);	/* l[w|d] t3, %lo(.got.plt entry)(t3) */
+  entry[2] = RISCV_ITYPE (JALR, X_T1, X_T3, 0);			/* jalr   t1, t3 */
+  entry[3] = RISCV_NOP;						/* nop */
+}
+
+/* Create an entry in an RISC-V ELF linker hash table.  */
+
+static struct bfd_hash_entry *
+link_hash_newfunc (struct bfd_hash_entry *entry,
+		   struct bfd_hash_table *table, const char *string)
+{
+  struct riscv_elf_link_hash_entry *eh;
+
+  /* A subclass may hand in storage it allocated itself; allocate an
+     entry of our own size only when it did not.  Running out of
+     memory is reported to the caller as a NULL entry.  */
+  if (entry == NULL)
+    entry = bfd_hash_allocate (table,
+			       sizeof (struct riscv_elf_link_hash_entry));
+  if (entry == NULL)
+    return NULL;
+
+  /* Have the generic ELF layer initialize its share of the entry.  */
+  entry = _bfd_elf_link_hash_newfunc (entry, table, string);
+  if (entry == NULL)
+    return NULL;
+
+  /* Finally reset the fields this backend adds: no dynamic relocs
+     recorded yet and no GOT access kind known.  */
+  eh = (struct riscv_elf_link_hash_entry *) entry;
+  eh->dyn_relocs = NULL;
+  eh->tls_type = GOT_UNKNOWN;
+
+  return entry;
+}
+
+/* Create a RISC-V ELF linker hash table.  */
+
+static struct bfd_link_hash_table *
+riscv_elf_link_hash_table_create (bfd *abfd)
+{
+  struct riscv_elf_link_hash_table *htab;
+
+  /* Zero-filled, so the backend-specific members start out cleared.  */
+  htab = (struct riscv_elf_link_hash_table *) bfd_zmalloc (sizeof (*htab));
+  if (htab == NULL)
+    return NULL;
+
+  /* On success hand back the generic root of the new table.  */
+  if (_bfd_elf_link_hash_table_init (&htab->elf, abfd, link_hash_newfunc,
+				     sizeof (struct riscv_elf_link_hash_entry),
+				     RISCV_ELF_DATA))
+    return &htab->elf.root;
+
+  /* Initialization failed: the table is of no use, release it.  */
+  free (htab);
+  return NULL;
+}
+
+/* Create the .got section.  */
+
+static bfd_boolean
+riscv_elf_create_got_section (bfd *abfd, struct bfd_link_info *info)
+{
+  flagword flags;
+  asection *s, *s_got;
+  struct elf_link_hash_entry *h;
+  const struct elf_backend_data *bed = get_elf_backend_data (abfd);
+  struct elf_link_hash_table *htab = elf_hash_table (info);
+
+  /* This function may be called more than once.  */
+  if (htab->sgot != NULL)
+    return TRUE;
+
+  flags = bed->dynamic_sec_flags;
+  /* Relocation section for the GOT, named after the backend's reloc flavour.  */
+  s = bfd_make_section_anyway_with_flags (abfd,
+					  (bed->rela_plts_and_copies_p
+					   ? ".rela.got" : ".rel.got"),
+					  (bed->dynamic_sec_flags
+					   | SEC_READONLY));
+  if (s == NULL
+      || ! bfd_set_section_alignment (abfd, s, bed->s->log_file_align))
+    return FALSE;
+  htab->srelgot = s;
+  /* The .got section itself; S_GOT keeps it for the symbol definition below.  */
+  s = s_got = bfd_make_section_anyway_with_flags (abfd, ".got", flags);
+  if (s == NULL
+      || !bfd_set_section_alignment (abfd, s, bed->s->log_file_align))
+    return FALSE;
+  htab->sgot = s;
+
+  /* The first bit of the global offset table is the header.  */
+  s->size += bed->got_header_size;
+
+  if (bed->want_got_plt)
+    {
+      s = bfd_make_section_anyway_with_flags (abfd, ".got.plt", flags);
+      if (s == NULL
+	  || !bfd_set_section_alignment (abfd, s,
+					 bed->s->log_file_align))
+	return FALSE;
+      htab->sgotplt = s;
+
+      /* Reserve room for the header.  */
+      s->size += GOTPLT_HEADER_SIZE;
+    }
+
+  if (bed->want_got_sym)
+    {
+      /* Define the symbol _GLOBAL_OFFSET_TABLE_ at the start of the .got
+	 section.  We don't do this in the linker script because we don't want
+	 to define the symbol if we are not creating a global offset
+	 table.  */
+      h = _bfd_elf_define_linkage_sym (abfd, info, s_got,
+				       "_GLOBAL_OFFSET_TABLE_");
+      elf_hash_table (info)->hgot = h;	/* Stored even when NULL; the failure is reported just below.  */
+      if (h == NULL)
+	return FALSE;
+    }
+
+  return TRUE;
+}
+
+/* Create .plt, .rela.plt, .got, .got.plt, .rela.got, .dynbss, and
+   .rela.bss sections in DYNOBJ, and set up shortcuts to them in our
+   hash table.  */
+
+static bfd_boolean
+riscv_elf_create_dynamic_sections (bfd *dynobj,
+				   struct bfd_link_info *info)
+{
+  struct riscv_elf_link_hash_table *htab = riscv_elf_hash_table (info);
+
+  BFD_ASSERT (htab != NULL);
+  /* The GOT sections come first, then the generic dynamic sections.  */
+  if (!riscv_elf_create_got_section (dynobj, info)
+      || !_bfd_elf_create_dynamic_sections (dynobj, info))
+    return FALSE;
+
+  /* Non-PIC links additionally get a .tdata.dyn section.  */
+  if (!bfd_link_pic (info))
+    htab->sdyntdata
+      = bfd_make_section_anyway_with_flags (dynobj, ".tdata.dyn",
+					    SEC_ALLOC | SEC_THREAD_LOCAL);
+
+  /* A shortcut still missing at this point is an internal error.  */
+  if (htab->elf.splt == NULL || htab->elf.srelplt == NULL
+      || htab->elf.sdynbss == NULL)
+    abort ();
+  if (!bfd_link_pic (info)
+      && (htab->elf.srelbss == NULL || htab->sdyntdata == NULL))
+    abort ();
+
+  return TRUE;
+}
+
+/* Copy the extra info we tack onto an elf_link_hash_entry.  */
+
+static void
+riscv_elf_copy_indirect_symbol (struct bfd_link_info *info,
+				struct elf_link_hash_entry *dir,
+				struct elf_link_hash_entry *ind)
+{
+  struct riscv_elf_link_hash_entry *edir, *eind;
+  /* Move the RISC-V extras (dyn_relocs list, tls_type) from IND to DIR.  */
+  edir = (struct riscv_elf_link_hash_entry *) dir;
+  eind = (struct riscv_elf_link_hash_entry *) ind;
+
+  if (eind->dyn_relocs != NULL)
+    {
+      if (edir->dyn_relocs != NULL)
+	{
+	  struct riscv_elf_dyn_relocs **pp;
+	  struct riscv_elf_dyn_relocs *p;
+
+	  /* Add reloc counts against the indirect sym to the direct sym
+	     list.  Merge any entries against the same section.  */
+	  for (pp = &eind->dyn_relocs; (p = *pp) != NULL; )
+	    {
+	      struct riscv_elf_dyn_relocs *q;
+	      /* Q stays non-NULL only when DIR already has a node for P's section.  */
+	      for (q = edir->dyn_relocs; q != NULL; q = q->next)
+		if (q->sec == p->sec)
+		  {
+		    q->pc_count += p->pc_count;
+		    q->count += p->count;
+		    *pp = p->next;	/* Unlink P; its counts now live in Q.  */
+		    break;
+		  }
+	      if (q == NULL)
+		pp = &p->next;	/* No match: P stays, step to the next link.  */
+	    }
+	  *pp = edir->dyn_relocs;	/* Append DIR's old list to what is left of IND's.  */
+	}
+      /* The (possibly merged) list now belongs to DIR alone.  */
+      edir->dyn_relocs = eind->dyn_relocs;
+      eind->dyn_relocs = NULL;
+    }
+  /* The TLS kind is only taken over while DIR has no GOT reference of its own.  */
+  if (ind->root.type == bfd_link_hash_indirect
+      && dir->got.refcount <= 0)
+    {
+      edir->tls_type = eind->tls_type;
+      eind->tls_type = GOT_UNKNOWN;
+    }
+  _bfd_elf_link_hash_copy_indirect (info, dir, ind);	/* Generic ELF part of the copy.  */
+}
+
+static bfd_boolean
+riscv_elf_record_tls_type (bfd *abfd, struct elf_link_hash_entry *h,
+			   unsigned long symndx, char tls_type)
+{
+  /* Accumulate TLS_TYPE into the GOT kind mask of H, or of local SYMNDX.  */
+  char *slot = &_bfd_riscv_elf_tls_type (abfd, h, symndx);
+  const char *sym_name = h != NULL ? h->root.root.string : "<local>";
+
+  *slot |= tls_type;
+  /* Fine unless a normal GOT access is now mixed with a TLS one.  */
+  if ((*slot & GOT_NORMAL) == 0 || (*slot & ~GOT_NORMAL) == 0)
+    return TRUE;
+  (*_bfd_error_handler)
+    (_("%B: `%s' accessed both as normal and thread local symbol"), abfd, sym_name);
+  return FALSE;
+}
+
+static bfd_boolean
+riscv_elf_record_got_reference (bfd *abfd, struct bfd_link_info *info,
+				struct elf_link_hash_entry *h, long symndx)
+{
+  struct riscv_elf_link_hash_table *htab = riscv_elf_hash_table (info);
+  Elf_Internal_Shdr *symtab_hdr = &elf_tdata (abfd)->symtab_hdr;
+  /* NOTE(review): HTAB is used unchecked although the macro can yield NULL.  */
+  if (htab->elf.sgot == NULL)
+    {
+      if (!riscv_elf_create_got_section (htab->elf.dynobj, info))
+	return FALSE;
+    }
+  /* A global symbol keeps its GOT reference count in its own hash entry.  */
+  if (h != NULL)
+    {
+      h->got.refcount += 1;
+      return TRUE;
+    }
+
+  /* This is a global offset table entry for a local symbol.  */
+  if (elf_local_got_refcounts (abfd) == NULL)
+    {
+      bfd_size_type size = symtab_hdr->sh_info * (sizeof (bfd_vma) + 1);	/* sh_info counters plus sh_info type bytes.  */
+      if (!(elf_local_got_refcounts (abfd) = bfd_zalloc (abfd, size)))
+	return FALSE;
+      _bfd_riscv_elf_local_got_tls_type (abfd)	/* The type bytes start right after the counters.  */
+	= (char *) (elf_local_got_refcounts (abfd) + symtab_hdr->sh_info);
+    }
+  elf_local_got_refcounts (abfd) [symndx] += 1;
+
+  return TRUE;
+}
+
+static bfd_boolean
+bad_static_reloc (bfd *abfd, unsigned r_type, struct elf_link_hash_entry *h)
+{
+  const char *reloc_name = riscv_elf_rtype_to_howto (r_type)->name;
+  const char *sym_name = h == NULL ? "a local symbol" : h->root.root.string;
+  (*_bfd_error_handler)
+    (_("%B: relocation %s against `%s' can not be used when making a shared "
+       "object; recompile with -fPIC"), abfd, reloc_name, sym_name);
+  bfd_set_error (bfd_error_bad_value);
+  return FALSE;	/* Always fails, so callers can simply return the result.  */
+}
+
+/* Pulp add on for proprietary dynamic relocation */
+typedef struct PulpImportRef {	/* One recorded relocation against an imported symbol */
+        Elf_Internal_Rela       Rel;	/* Copy of the reloc; r_info holds only the type, r_offset is rebased (see InsertImportEntry) */
+        struct PulpImportRef    *Next;	/* Next reloc of the same symbol */
+} PulpImportRef;
+
+typedef struct PulpImportEntry {	/* One imported symbol */
+        char                    *Name;	/* bfd_malloc'ed copy of the symbol name */
+	int			RelocCount;	/* Relocs counted in Collect mode */
+        PulpImportRef           *Ref;	/* Relocs recorded in non-Collect mode */
+        struct PulpImportEntry  *Next;	/* Next entry in the same hash bucket */
+} PulpImportEntry;
+
+typedef struct PulpExportEntry {	/* One exported symbol */
+        char                    *Name;	/* bfd_malloc'ed copy of the symbol name */
+	unsigned int		Address;	/* Initialised to 0 by InsertExportEntry */
+        struct PulpExportEntry  *Next;	/* Next entry in the same hash bucket */
+} PulpExportEntry;
+
+#define HASH_IMPORT_E 1024	/* Bucket count of both tables below; also the modulus used by hash_sdbm */
+
+static PulpImportEntry * ImportEntries[HASH_IMPORT_E];	/* Imported symbols, chained per bucket */
+static PulpExportEntry * ExportEntries[HASH_IMPORT_E];	/* Exported symbols, chained per bucket */
+
+static struct bfd_sym_chain ComponentEntry;	/* Set by PulpRegisterSymbolEntry */
+static bfd_boolean ComponentEntryProvided;	/* Set by PulpRegisterSymbolEntry */
+
+#if 64 == 32
+void PulpRegisterSymbolEntry(struct bfd_sym_chain EntrySymb, bfd_boolean EntryOnCmdLine)
+/* Store the entry symbol and the EntryOnCmdLine flag in the file-level ComponentEntry variables. */
+{
+	ComponentEntry = EntrySymb;
+	ComponentEntryProvided = EntryOnCmdLine;
+}
+#endif
+
+
+static unsigned long hash_sdbm(const char *str)
+/* Hash the NUL terminated string str into a bucket number below HASH_IMPORT_E. */
+{
+        unsigned long acc = 0;
+        const char *cursor;
+        for (cursor = str; *cursor != '\0'; cursor++)
+                acc = (int) *cursor + (acc << 6) + (acc << 16) - acc;
+        return acc % HASH_IMPORT_E;
+}
+
+static bfd_boolean ExportLookup(const char *Name)
+/* Tell whether Name has been registered in the export table. */
+{
+	PulpExportEntry *Cur;
+
+	/* Only the bucket Name hashes to can hold it */
+	for (Cur = ExportEntries[hash_sdbm(Name)]; Cur != NULL; Cur = Cur->Next)
+		if (strcmp(Cur->Name, Name) == 0) return TRUE;
+	return FALSE;
+}
+
+#if 64 == 32
+bfd_boolean InsertExportEntry(const char *Name)
+/* Register Name for export (no-op if already present); FALSE only when out of memory. */
+{
+	PulpExportEntry **Link = &ExportEntries[hash_sdbm(Name)];
+	PulpExportEntry *PtEntry;
+
+	/* Stop on the matching entry or on the NULL link that ends the bucket chain */
+	while (*Link && (strcmp((*Link)->Name, Name) != 0)) Link = &(*Link)->Next;
+	if (*Link != NULL) return TRUE;
+	PtEntry = (PulpExportEntry *) bfd_malloc (sizeof (PulpExportEntry));
+	if (PtEntry == NULL) return FALSE;
+	PtEntry->Name = (char *) bfd_malloc (sizeof (char) * (strlen(Name)+1));
+	if (PtEntry->Name == NULL) {
+		free(PtEntry);	/* Do not leak the half-built entry */
+		return FALSE;
+	}
+	strcpy(PtEntry->Name, Name);
+	PtEntry->Address = 0; PtEntry->Next = NULL;
+	*Link = PtEntry;	/* Append at the tail of the chain */
+	return TRUE;
+}
+
+unsigned int ExportSectionSize(unsigned int *EntryCount)
+/* Size in bytes of the export section image, or 0 when nothing is exported.
+   When EntryCount is not NULL it receives the number of exported names. */
+{
+	unsigned int Bucket;
+	unsigned int Count = 0;
+	unsigned int Bytes = 4;	/* Leading word: number of exported symbols */
+	const PulpExportEntry *Cur;
+
+	for (Bucket = 0; Bucket < HASH_IMPORT_E; Bucket++) {
+		for (Cur = ExportEntries[Bucket]; Cur != NULL; Cur = Cur->Next) {
+			/* Section index byte + name + terminating NUL */
+			Bytes += (strlen(Cur->Name)+1+1);
+			Count++;
+		}
+	}
+
+	/* The names area is padded to a multiple of 4 bytes ... */
+	if ((Bytes & 3) != 0) Bytes = (Bytes | 3) + 1;
+	/* ... and is followed by one 4-byte value per name */
+	Bytes += Count*4;
+
+	if (EntryCount != NULL) *EntryCount = Count;
+	return (Count != 0) ? Bytes : 0;
+}
+
+#endif
+
+static bfd_boolean ReleaseExportEntry(void)
+/* Free every export entry and leave all buckets empty; always returns TRUE. */
+{
+	unsigned int Bucket;
+	for (Bucket = 0; Bucket < HASH_IMPORT_E; Bucket++) {
+		PulpExportEntry *Cur = ExportEntries[Bucket];
+		while (Cur != NULL) {
+			PulpExportEntry *Doomed = Cur;
+			Cur = Cur->Next;	/* Step on before the node is freed */
+			free(Doomed->Name);
+			free(Doomed);
+		}
+		ExportEntries[Bucket] = NULL;
+	}
+	return TRUE;
+}
+
+static bfd_boolean InsertImportEntry(const char *Name, Elf_Internal_Rela *Rel, bfd_vma OutOffset, bfd_boolean Collect)
+/* Record relocation Rel against imported symbol Name, creating its table entry on first use.
+   Collect set: only RelocCount is updated; a LO12_I lying 4 bytes after a HI20 of the same
+   symbol is folded into that HI20.  Otherwise a copy of Rel, reduced to its type and moved
+   by OutOffset, is appended to the entry.  Returns FALSE when memory runs out. */
+{
+	unsigned int Index = hash_sdbm(Name);
+	PulpImportEntry *PtEntry = ImportEntries[Index];
+	PulpImportEntry *PtPrevEntry = NULL;
+	PulpImportRef *Ref, *PtRef;
+	/* Previous Collect call, kept by value: no pointer into the caller's relocs outlives it */
+	static PulpImportEntry *LastEntry = NULL;
+	static bfd_vma LastType = 0, LastOffset = 0;
+
+	while (PtEntry && (strcmp(PtEntry->Name, Name) != 0)) {
+		PtPrevEntry = PtEntry; PtEntry = PtEntry->Next;
+	}
+	if (PtEntry == NULL) {
+		PtEntry = (PulpImportEntry *) bfd_malloc (sizeof (PulpImportEntry));
+		if (PtEntry == NULL) return FALSE;
+		PtEntry->Name = (char *) bfd_malloc (sizeof (char) * (strlen(Name)+1));
+		if (PtEntry->Name == NULL) {
+			free(PtEntry); return FALSE;	/* Do not leak the half-built entry */
+		}
+		strcpy(PtEntry->Name, Name);
+		PtEntry->Ref = NULL; PtEntry->Next = NULL;
+		PtEntry->RelocCount = 0;
+		if (PtPrevEntry) PtPrevEntry->Next = PtEntry; else ImportEntries[Index] = PtEntry;
+	}
+	if (Collect) {
+		/* LastEntry is compared before it is updated and is NULL before the first call */
+		if (!((ELF64_R_TYPE(Rel->r_info) == R_RISCV_LO12_I) && (LastEntry == PtEntry) &&
+		      (LastType == R_RISCV_HI20) && ((Rel->r_offset - LastOffset) == 4)))
+			PtEntry->RelocCount = PtEntry->RelocCount + 1;
+		LastEntry = PtEntry; LastType = ELF64_R_TYPE(Rel->r_info); LastOffset = Rel->r_offset;
+		return TRUE;
+	}
+	Ref = (PulpImportRef *) bfd_malloc (sizeof (PulpImportRef));
+	if (Ref == NULL) return FALSE;
+	Ref->Rel = *Rel; Ref->Next = NULL;
+	Ref->Rel.r_info = ELF64_R_TYPE(Rel->r_info);
+	Ref->Rel.r_offset = Rel->r_offset + OutOffset;
+	PtRef = PtEntry->Ref;	/* Append at the tail to keep the relocs in arrival order */
+	while (PtRef && PtRef->Next != NULL) PtRef = PtRef->Next;
+	if (PtRef) PtRef->Next = Ref; else PtEntry->Ref = Ref;
+	return TRUE;
+}
+
+static bfd_boolean ReleaseImportEntry(void)
+/* Free every import entry together with its relocation list; always returns TRUE. */
+{
+	unsigned int Bucket;
+	for (Bucket = 0; Bucket < HASH_IMPORT_E; Bucket++) {
+		PulpImportEntry *Cur = ImportEntries[Bucket];
+		while (Cur != NULL) {
+			PulpImportEntry *Doomed = Cur;
+			PulpImportRef *Ref = Cur->Ref;
+			Cur = Cur->Next;	/* Step on before the node is freed */
+			while (Ref != NULL) {
+				PulpImportRef *DoomedRef = Ref;
+				Ref = Ref->Next;
+				free(DoomedRef);
+			}
+			free(Doomed->Name); free(Doomed);
+		}
+		ImportEntries[Bucket] = NULL;
+	}
+	return TRUE;
+}
+
+/*
+
+N imported symbols
+
+Structure of .pulp.import.names Section:
+----------------------------------------
+We use only Type 0
+	Len(Name) = Length(Name)+1 				Null terminated string
+	Size:	Pad4((N+1)*4 + Sum(j:1..N){Len(Namej)})		If Type=0
+	     	(N+1)*4						If Type=1
+
+		Base						Bit0: 		Section Type: 0 with names, 1 uses pre resolved indexes
+								Bit1:31:	(Section size) / 4 Always 4 byte aligned
+
+		Base						Bit  0:11 	NumberOfImports
+								Bit 12:31 	(Size of Names Section) / 4.
+
+	Type=0 (Names)
+		Base+4						Name1_Index = Base+4*(N+1)
+		Base+8						Name2_Index = Base+4*(N+1)+Len(Name1)
+		...
+		Base+4*(i)					Namei_Index = Base+4*(N+1)+Sum(j:1..(i-1)){Len(Namej)}
+		...
+		Base+4*(N)					NameN_Index = Base+4*(N+1)+Sum(j:1..(N-1)){Len(Namej)}
+		Base+4*(N+1)					Name1
+		Base+4*(N+1)+Len(Name1)				Name2
+		...
+		Base+4*(N+1)+Sum(j:1..(i-1)){Len(Namej)}	Namei	
+		...
+		Base+4*(N+1)+Sum(j:1..(N-1)){Len(Namej)}	NameN	
+		Pad till next address aligned on 4 bytes
+	Type=1 (Pre resolved indexes)
+		Base+4						Name1_Index (points to corresponding name in .export)
+		Base+8						Name2_Index (points to corresponding name in .export)
+		...
+		Base+4*(i)					Namei_Index (points to corresponding name in .export)
+		...
+		Base+4*(N)					NameN_Index (points to corresponding name in .export)
+
+Structure of .pulp.import.reloc Section:
+----------------------------------------
+
+	Size for Namei:	4*(N_Reloc(i) + 1)
+	Total Size:	4 + Sum(j:1..N){4*(N_Reloc(j) + 1)}
+
+	Entry(i+1) = Entry(i) + 4*(N_Reloc(i) + 1)
+
+		Base			Bit  0:11 		NumberOfImports
+					Bit 12:31 		(Size of Relocs Section) / 4. Contains only words thus multiple of 4 bytes
+
+		Base+4			Name_Index(1)		In .pulp.import.names
+		Base+6			N_Reloc(1)		Number of reloc for this name
+		Base+8			Reloc			One reloc
+		Base+12			Reloc			One reloc
+		...
+		Base+4+4*N_Reloc(1)	Reloc			Size for relocs: N_Reloc(1)*4, Total for Name1: 4*(N_Reloc(1) + 1)
+		....
+
+	Reloc:
+		Reloc Type: 4 Bits  	=> Bit31 : Bit28
+			0	R_RISCV_JAL					Offset = @Name-pc
+					pc: 	jal (pc+Offset[20..1])		InsnBits[31:12] =>  I[20],I[10:1],I[11],I[19:12]
+			1	Pair of R_RISCV_HI20, R_RISCV_LO12_I
+					pc:	lui Reg,Hi20(Name)		InsBits[31:12] => @Name[31:12]
+					pc+4:	addi Reg, Reg, Lo12(Name)	InsBits[31:20] => @Name[11:0]
+			2	R_RISCV_HI20
+					pc:	lui Reg,Hi20(Name)		InsBits[31:12] => @Name[31:12]
+			3	R_RISCV_LO12_I
+					pc:	addi Reg, Reg, Lo12(Name)	InsBits[31:20] => @Name[11:0]
+			4	R_RISCV_LO12_S
+					pc:					InsnBits[31:25] => @Name[11:5], InsnBits[11:7] => @Name[4:0]
+
+		Reloc Offset: 28 Bits	=> Bit27 : Bit0				Offset from section base / 2.
+										On RiscV we can assume that offset is always a multiple of 2
+
+
+Structure of .pulp.export Section:
+----------------------------------
+
+	Total Size:	Pad4(4 + Sum(j=1..N){Len(Namej)+1}) + 4*N. Is a multiple of 4
+
+	Base						Bit0		0: Resident, 1: Component
+							Bit15:Bit1 	N: Number of exported names
+							Bit31:Bit16 	Offset/4 to first Value in this section. /4 since all entities are words
+	Base+4						Section Name1	Section (byte) in which name is defined, Null terminated name
+	Base+4+Len(Name1)				Section Name2
+	...
+	Base+4+Sum(j=1..i-1){Len(Namej)+1}		Section Namei
+	...
+	Base+4+Sum(j=1..N-1){Len(Namej)+1}		Section NameN
+
+	Base+4+Sum(j=1..N){Len(Namej+1)}+Pad4		Value1	Link time Offset for Name1	Pad4: Alignment to 4
+	...
+	Base+4+Sum(j=1..N){Len(Namej+1)}+Pad4+4*N	ValueN	Link time Offset for NameN
+	
+
+*/
+
+#define	IMPORT_REL_JAL			0	/* Reloc type codes, stored in bits 31:28 of a reloc word */
+#define	IMPORT_REL_HI20_LO12_I		1	/* HI20 directly followed (offset + 4) by its LO12_I */
+#define	IMPORT_REL_HI20			2
+#define	IMPORT_REL_LO12_I		3
+#define	IMPORT_REL_LO12_S		4
+/* Sizes below are in bytes.  Names section: */
+#define	IMPORT_SECN_NAME_SZ		4
+#define	IMPORT_SECN_TYPE_SZ		4	/* Leading header word */
+#define	IMPORT_SECN_NAME_INDEX_SZ	4	/* One index word per imported name */
+/* Reloc section: */
+#define	IMPORT_SECR_IMPORT_CNT_SZ	4	/* Leading header word */
+#define	IMPORT_SECR_NAME_INDEX_SZ	2	/* Per import: name index half word */
+#define	IMPORT_SECR_REL_CNT_SZ		2	/* Per import: reloc count half word */
+#define	IMPORT_SECR_REL_EXPR_SZ		4	/* One word per reloc */
+
+static char *RelImage(unsigned int Rel)
+/* Printable name of an IMPORT_REL_* code. */
+{
+	if (Rel == IMPORT_REL_JAL) return "REL_JAL";
+	if (Rel == IMPORT_REL_HI20_LO12_I) return "REL_HI20_LO12_I";
+	if (Rel == IMPORT_REL_HI20) return "REL_HI20";
+	if (Rel == IMPORT_REL_LO12_I) return "REL_LO12_I";
+	if (Rel == IMPORT_REL_LO12_S) return "REL_LO12_S";
+
+	/* Not one of the IMPORT_REL_* codes */
+	return "Unknown Rel";
+}
+static unsigned int PulpImportNameSize(const char *Name)
+/* Bytes Name takes in the import names section: its characters plus the NUL. */
+{
+	/* The size_t sum is narrowed to the unsigned int return type */
+	return (unsigned int) (strlen(Name)+1);
+}
+
+#if 64 == 32
+void PulpImportSectionsSize(int Mode, unsigned int *SecName, unsigned int *SecReloc, unsigned int *N_Import, bfd_boolean Collect)
+/* Compute the byte sizes of the import names (*SecName) and reloc (*SecReloc) images and the number of imports (*N_Import). */
+{
+	int i;
+	PulpImportEntry *PtEntry;
+	PulpImportRef *PtRef;
+	unsigned int NameSize = IMPORT_SECN_TYPE_SZ;		/* Name Section Type */
+	unsigned int RefSize = IMPORT_SECR_IMPORT_CNT_SZ;	/* N Import */
+	unsigned int N_Imp = 0;
+	int Skip = 0;	/* Set when the next Ref is the LO12_I half of a pair already counted */
+	/* Collect: sizes come from RelocCount; otherwise from walking each entry's Ref list */
+	for (i=0; i<HASH_IMPORT_E; i++) {
+		if (ImportEntries[i] == NULL) continue;
+		for (PtEntry = ImportEntries[i]; PtEntry; PtEntry = PtEntry->Next) {
+			if (((PtEntry->RelocCount == 0)&&Collect) || ((PtEntry->Ref == NULL)&&(!Collect))) continue;	/* Entry without relocs: not emitted */
+			RefSize += IMPORT_SECR_NAME_INDEX_SZ;		/* Name Index */
+			RefSize += IMPORT_SECR_REL_CNT_SZ;		/* Number of Relocs for Name Index */
+			NameSize += IMPORT_SECN_NAME_INDEX_SZ;		/* Name Index */
+			N_Imp ++;
+			if (Mode == 0) NameSize += PulpImportNameSize(PtEntry->Name);	/* Name strings are only stored in Mode 0 */
+			if (Collect) RefSize += PtEntry->RelocCount*IMPORT_SECR_REL_EXPR_SZ;
+			else {
+				for (PtRef = PtEntry->Ref; PtRef; PtRef = PtRef->Next) {
+					if (Skip) {
+						Skip = 0; continue;
+					}
+					if ((PtRef->Rel.r_info == R_RISCV_HI20) && PtRef->Next &&
+					    (PtRef->Next->Rel.r_info == R_RISCV_LO12_I) && ((PtRef->Next->Rel.r_offset - PtRef->Rel.r_offset)==4)) Skip = 1;	/* The pair takes a single reloc word */
+					RefSize += IMPORT_SECR_REL_EXPR_SZ;	/* Reloc Expr */
+				}
+			}
+		}
+	}
+	/* Force Names section size to be 4 bytes aligned */
+	if (NameSize%4) NameSize = ((NameSize>>2)+1)<<2;
+	*SecName = NameSize;
+	*SecReloc = RefSize;
+	*N_Import = N_Imp;
+}
+#endif
+
+static bfd_boolean PulpExportCreateSection(unsigned int **Section, unsigned int *SizeSection, struct bfd_link_info *info)
+/* Build the in-memory image of the export section from ExportEntries.
+   Section      receives the bfd_malloc'ed image, or NULL when nothing is
+                exported or when the function fails.
+   SizeSection  receives the image size in bytes, as computed by
+                ExportSectionSize (0 when nothing is exported).
+   info         link information; exported names are looked up in info->hash.
+   Image layout: one header word, then for each name a section byte (always
+   0 for now) followed by the NUL terminated name, zero padding up to a
+   multiple of 4 bytes, and finally one value word per name.
+   Returns TRUE on success.  On failure the error is reported through
+   _bfd_error_handler, everything allocated here is freed and FALSE is
+   returned. */
+{
+	PulpExportEntry *PtEntry;
+	char *Base;
+	unsigned int *Entries;
+	unsigned int Entry = 0, BaseLinkedVal, Addr;
+	unsigned int Size = ExportSectionSize(&Entry);
+	int i;
+
+	*SizeSection = Size;
+	*Section = NULL;
+	if (Size == 0) return TRUE;
+
+	*Section = (unsigned int *) bfd_malloc (Size);
+	Entries = (unsigned int *) bfd_malloc (Entry*sizeof(unsigned int));
+	if (*Section == NULL || Entries == NULL) {
+		(*_bfd_error_handler) (_("Export Create Section, Can't allocate memory"));
+		goto Fail;
+	}
+
+	Entry = 0;
+	Base = (char *) (&(*Section)[1]);
+	Addr = 4;	/* Byte offset of Base inside the image */
+	for (i=0; i<HASH_IMPORT_E; i++) {
+		for (PtEntry = ExportEntries[i]; PtEntry; PtEntry = PtEntry->Next) {
+			size_t Len = strlen(PtEntry->Name);
+			struct bfd_link_hash_entry *h;
+			h = bfd_link_hash_lookup (info->hash, PtEntry->Name, FALSE, FALSE, TRUE);
+			/* u.def, read below, is only valid for a defined symbol */
+			if (h == NULL || (h->type != bfd_link_hash_defined && h->type != bfd_link_hash_defweak)) {
+				(*_bfd_error_handler) (_("Export Create Section, Can't find symbol: %s"), PtEntry->Name);
+				goto Fail;
+			}
+			if (ComponentMode)
+				/* We don't want lma or vma added here, just an offset relative to the beginning of the output section in which it is */
+				Entries[Entry++] = h->u.def.value + h->u.def.section->output_offset;
+			else
+				Entries[Entry++] = h->u.def.value + sec_addr (h->u.def.section);
+			Base[0] = 0; /* Here should come the section in which the symbol is defined */
+			memcpy(Base+1, PtEntry->Name, Len+1);	/* Name together with its NUL */
+			Base += (Len+2); Addr += (Len+2);
+		}
+	}
+	/* Zero the padding so that no uninitialised byte reaches the output */
+	while (Addr % 4) { *Base++ = 0; Addr++; }
+	BaseLinkedVal = Addr;
+	memcpy(Base, Entries, Entry*sizeof(unsigned int));	/* Value words, in name order */
+	(*Section)[0] = (ComponentMode&0x01) | ((Entry<<1)&0x0FFFE) | ((BaseLinkedVal>>2) << 16);
+	free(Entries);
+	return TRUE;
+Fail:
+	free(Entries); free(*Section); *Section = NULL;	/* free(NULL) is harmless */
+	return FALSE;
+}
+
+static bfd_boolean PulpImportCreateNameAndRelocSections(int Mode,
+						 unsigned int **S_Name, unsigned int *S_NameSize,
+						 unsigned int **S_Reloc, unsigned int *S_RelocSize,
+						 unsigned int *NImport)
+/* Build the import names and import reloc images from ImportEntries; FALSE on allocation failure or unsupported reloc type. */
+{
+	PulpImportEntry *PtEntry;
+	PulpImportRef *PtRef;
+	unsigned int SecNameSize;
+	unsigned int SecRelocSize;
+	unsigned int *SecName;
+	unsigned int *SecReloc;
+	unsigned int N_Import;
+	int Skip = 0;
+	unsigned N_Imp = 0;
+	unsigned int HeadName, HeadRel;
+	unsigned int i;
+	static int Trace = 0;
+	/* Sizes are taken from the recorded Ref lists (Collect == FALSE) */
+	PulpImportSectionsSize(Mode, &SecNameSize, &SecRelocSize, &N_Import, FALSE);
+
+	SecName = (unsigned int *) bfd_malloc (SecNameSize);
+	SecReloc = (unsigned int *) bfd_malloc (SecRelocSize);
+	*S_Name = SecName; *S_NameSize = SecNameSize;
+	*S_Reloc = SecReloc; *S_RelocSize = SecRelocSize;
+	/* Both buffers are already published above, whatever the outcome below */
+	if (SecName == NULL || SecReloc == NULL) return FALSE;
+	// SecName[0]  = (Mode&0x1) | ((SecNameSize)<<1);
+	/* Clear both images word by word */
+	for (i=0; i< (SecNameSize>>2); i++) SecName[i] = 0;
+	for (i=0; i< (SecRelocSize>>2); i++) SecReloc[i] = 0;
+	/* Names header: import count in bits 11:0, size in words from bit 12; NOTE(review): counts above 4095 are silently truncated */
+	SecName[0] = (N_Import&0x0FFF) | ((SecNameSize>>2)<<12);
+	HeadName = (1 + N_Import)*4;
+	/* NOTE(review): HeadName is a byte offset, yet the trace prints HeadName*4 - confirm intent */
+	if (Trace) fprintf(stderr, "Names: Size: %d, Relocs: Size: %d, N Imports: %d, Head Strings: %X\n", SecNameSize, SecRelocSize, N_Import, HeadName*4);
+	/* Pass 1: one index word per import and, unless Mode is 1, the name strings */
+	for (i=0; i<HASH_IMPORT_E; i++) {
+		if (ImportEntries[i] == NULL) continue;
+		for (PtEntry = ImportEntries[i]; PtEntry; PtEntry = PtEntry->Next) {
+			unsigned int j;
+			unsigned int NameSize;
+			char *Pt;
+
+			if (PtEntry->Ref == NULL) continue;
+
+			if (Trace) fprintf(stderr, "At: %5X Creating Name  Entry: %5d, String: %5X (%s)\n", (1+N_Imp)*4, N_Imp, HeadName, PtEntry->Name);
+			SecName[1+N_Imp] = HeadName; N_Imp++;
+			if (Mode == 1) continue;
+			NameSize = PulpImportNameSize(PtEntry->Name);
+			Pt = ((char *) SecName) + HeadName;
+			for (j=0; j<strlen(PtEntry->Name); j++) Pt[j] = PtEntry->Name[j];
+			Pt[j] = 0;
+			if (Trace) fprintf(stderr, "At: %5X Creating Name String: %s\n", HeadName, PtEntry->Name);
+			HeadName += NameSize;
+		}
+	}
+	SecReloc[0] = (N_Import&0x0FFF) | ((SecRelocSize>>2)<<12); N_Imp = 0; HeadRel = 1;	/* Pass 2: reloc image; HeadRel is a word index */
+	for (i=0; i<HASH_IMPORT_E; i++) {
+		unsigned int Base;
+		if (ImportEntries[i] == NULL) continue;
+		for (PtEntry = ImportEntries[i]; PtEntry; PtEntry = PtEntry->Next) {
+			unsigned int RelCount = 0;
+			if (PtEntry->Ref == NULL) continue;
+			Base = HeadRel++;	/* Reserve the head word; it is filled once RelCount is known */
+			for (PtRef = PtEntry->Ref; PtRef; PtRef = PtRef->Next) {
+				unsigned int Rel;
+				if (Skip) {
+					Skip = 0; continue;
+				}
+				if ((PtRef->Rel.r_info == R_RISCV_HI20) && PtRef->Next &&
+				    (PtRef->Next->Rel.r_info == R_RISCV_LO12_I) && ((PtRef->Next->Rel.r_offset - PtRef->Rel.r_offset)==4)) {
+					Skip = 1;	/* The LO12_I half is covered by the combined reloc */
+					Rel = IMPORT_REL_HI20_LO12_I;
+				} else {
+					switch (PtRef->Rel.r_info) {
+						case R_RISCV_JAL: Rel = IMPORT_REL_JAL; break;
+						case R_RISCV_HI20: Rel = IMPORT_REL_HI20; break;
+						case R_RISCV_LO12_I: Rel = IMPORT_REL_LO12_I; break;
+						case R_RISCV_LO12_S: Rel = IMPORT_REL_LO12_S; break;
+						default: {
+								reloc_howto_type *howto = riscv_elf_rtype_to_howto (PtRef->Rel.r_info);
+								Rel = -1; /* Error */
+	  							(*_bfd_error_handler) (_("Unknown Relocation: %X (%s)"),
+											(int) PtRef->Rel.r_info,
+											howto?howto->name:"Unknown");
+								return FALSE;
+							}
+					}
+				}
+				if (Trace) fprintf(stderr, "At: %5X Adding   Rel for Entry: %5d => %8X [Rel:%8X, Offset: %8X]\n",
+						   HeadRel*4, N_Imp+1, (unsigned int) ((Rel<<28) | ((PtRef->Rel.r_offset>>1) & 0x0FFFFFFF)),
+						   (unsigned int) Rel, (unsigned int) PtRef->Rel.r_offset);
+				SecReloc[HeadRel++] = (Rel<<28) | ((PtRef->Rel.r_offset>>1) & 0x0FFFFFFF);	/* Type in bits 31:28, offset/2 below */
+				RelCount++;
+			}
+			SecReloc[Base] = ((4*(N_Imp+1))&0x0FFFF) | ((RelCount<<16)&0xFFFF0000);	/* Name index in the low half, reloc count in the high half */
+			if (Trace) fprintf(stderr, "At: %5X Creating Rel     Entry: %5d, Rel Count: %d\n", Base*4, 4*(N_Imp+1), RelCount); 
+			N_Imp++;
+		}
+	}
+	*NImport = N_Import;
+	return TRUE;
+}
+
+/* Rebase the import reloc offsets to absolute addresses. Used when ComponentMode=0, i.e. resident mode, where the lma of the text section (BaseText) has to be added to each offset. */
+/* Words walked: [0] header, low 12 bits = number of imports; then, per import, one descriptor (bits 31..16 = reloc count) followed by its reloc words (bits 31..28 = type, bits 27..0 = offset>>1). */
+static void AdjustRelocsImport(unsigned int *ImportRelocs, unsigned int BaseText)
+
+{
+	unsigned int *Word;
+	unsigned int ImportsLeft;
+
+	if (!ImportRelocs) return;
+
+	ImportsLeft = ImportRelocs[0] & 0x0FFF;
+	Word = &ImportRelocs[1];	/* the first descriptor sits right after the header word */
+	for (; ImportsLeft; ImportsLeft--) {
+		/* Read the reloc count, then step over the descriptor itself. */
+		unsigned int RelocsLeft = (*Word >> 16) & 0x0FFFF;
+
+		Word++;
+		for (; RelocsLeft; RelocsLeft--, Word++) {
+			unsigned int Kind = *Word >> 28;
+			/* Offsets are stored halved: back to bytes, add the base, halve again. */
+			unsigned int Halved = (((*Word & 0x0FFFFFFF) << 1) + BaseText) >> 1;
+
+			/* The rebased offset is truncated to its 28-bit field; the type is kept as is. */
+			*Word = (Kind << 28) | (Halved & 0x0FFFFFFF);
+		}
+	}
+}
+
+/* Print on stderr the C declaration of an array named DeclName, initialised with the Size bytes
+   found at Section seen as elements of Elem bytes. Elem must be 1, 2 or 4; any other value prints nothing. */
+static void DumpCEquiv(unsigned int *Section, unsigned int Size, unsigned int Elem, char *DeclName)
+{
+	unsigned int i;
+	unsigned int DeclSize;	/* number of array elements */
+	const char *TypeName;
+	unsigned short *Half = (unsigned short *) Section;
+	unsigned char *Byte = (unsigned char *) Section;
+
+	/* Select the element type before dividing: any width other than 1, 2 or 4 is
+	   silently ignored, and that includes Elem == 0, which therefore never
+	   reaches the Size/Elem division below. */
+	switch (Elem) {
+		case 1: TypeName = "unsigned char"; break;
+		case 2: TypeName = "unsigned short int"; break;
+		case 4: TypeName = "unsigned int"; break;
+		default: return;
+	}
+
+	/* Like the other dumpers of this file, do nothing when there is no section. */
+	if (Section == NULL) return;
+
+	DeclSize = Size/Elem;	/* a trailing partial element is dropped */
+
+	/* DeclSize is unsigned, hence %u. */
+	fprintf(stderr, "%s %s[%u] = {\n\t", TypeName, DeclName, DeclSize);
+	for (i=0; i<DeclSize; i++) {
+		unsigned int Value = (Elem == 1) ? Byte[i] : (Elem == 2) ? Half[i] : Section[i];
+
+		fprintf(stderr, "0X%X, ", Value);
+		/* Five values per output line. */
+		if (((i+1)%5)==0) fprintf(stderr, "\n\t");
+	}
+	fprintf(stderr, "\n};\n\n");
+}
+
+/* Print on stderr a readable listing of the .pulp.import.names and .pulp.import.relocs sections. SecNamesSize and SecRelocsSize are byte sizes, only used by the raw dump; BaseText is only echoed in the relocs banner. */
+static void DiassembleImports(unsigned int *ImportNames, unsigned SecNamesSize, unsigned int *ImportRelocs, unsigned int SecRelocsSize, unsigned int BaseText)
+{
+	static int RawDump = 0;	/* when non zero both sections are also emitted as C arrays */
+	unsigned int Addr, N_Import;
+	unsigned int i, j;
+	char *Name;
+
+	if (ImportNames == NULL || ImportRelocs == NULL) return;
+	N_Import = ImportRelocs[0] & 0x0FFF;
+
+	fprintf(stderr, "Section: .pulp.import.names\n");
+	Addr = 0;
+	fprintf(stderr, "%8s  %17s %20s\n", "Offset", "Content", "Comment");
+	fprintf(stderr, "%8x: 0x%15X (NImport = %d, Section Size: 0x%X)\n",
+		Addr, ImportNames[Addr/4], ImportNames[Addr/4]&0x0FFF, (ImportNames[Addr/4]>>12)*4);
+	Addr += 4;
+	/* One table word per import: the byte offset, from the start of this section, of the symbol name. */
+	for (i=0; i<N_Import; i++) {
+		fprintf(stderr, "%8x: 0x%15X (Import Symbol %5d, Name @ in this section)\n", Addr, ImportNames[Addr/4], i);
+		Addr += 4;
+	}
+	/* Fetch each name through its table entry, as the relocs listing below does, instead of assuming the strings are packed back to back: the code filling this section advances by PulpImportNameSize(), not by strlen()+1. */
+	for (i=0; i<N_Import; i++) {
+		Addr = ImportNames[1+i];
+		Name = ((char *) ImportNames) + Addr;
+		fprintf(stderr, "%8x: %17s (Import Symbol %5d)\n", Addr, Name, i);
+	}
+
+	fprintf(stderr, "Section: .pulp.import.relocs, Mode=%s, BaseText=%X\n", ComponentMode?"Component":"Resident", BaseText);
+	Addr = 0;
+	fprintf(stderr, "%8s  %17s %20s\n", "Offset", "Content", "Comment");
+	fprintf(stderr, "%8x: 0x%15X (Number of Imported Symbols: %d, Section Size: 0x%X)\n",
+		Addr, ImportRelocs[Addr/4], ImportRelocs[Addr/4]&0x0FFF, ((ImportRelocs[Addr/4]>>12)&0x000FFFFF)<<2);
+	Addr += 4;
+	for (i=0; i<N_Import; i++) {
+		unsigned int Entry = (ImportRelocs[Addr/4] & 0x0FFFF);
+		unsigned int RelCount = (ImportRelocs[Addr/4]>>16) & 0x0FFFF;
+
+		Name = ((char *) ImportNames) + ImportNames[Entry>>2];
+		fprintf(stderr, "%8x: 0x%15X (Name @: 0x%6X, Reloc: %3d) %s\n", Addr, ImportRelocs[Addr/4], Entry, RelCount, Name);
+		Addr += 4;
+		for (j=0; j<RelCount; j++) {
+			unsigned int Rel = ImportRelocs[Addr/4];
+			unsigned int Offset = ((Rel & 0x0FFFFFFF)<<1);
+			unsigned int Type = (Rel>>28);
+
+			fprintf(stderr, "%8x: 0x%15X (Offset: 0x%6X, Reloc: %s)\n", Addr, ImportRelocs[Addr/4], Offset, RelImage(Type));
+			Addr += 4;
+		}
+	}
+	if (RawDump) {
+		unsigned int NameSize = SecNamesSize>>2, RelocSize = SecRelocsSize>>2;
+		if (SecNamesSize % 4) NameSize++;
+		if (SecRelocsSize % 4) RelocSize++;
+		fprintf(stderr, "unsigned int CompNames[%d] = {\n\t", NameSize);
+		for (i=0; i<NameSize; i++) {
+			fprintf(stderr, "0X%X, ", ImportNames[i]);
+			if (((i+1)%5)==0) fprintf(stderr, "\n\t");
+		}
+		fprintf(stderr, "\n};\n");
+		fprintf(stderr, "unsigned int CompRelocs[%d] = {\n\t", RelocSize);
+		for (i=0; i<RelocSize; i++) {
+			fprintf(stderr, "0X%X, ", ImportRelocs[i]);
+			if (((i+1)%5)==0) fprintf(stderr, "\n\t");
+		}
+		fprintf(stderr, "\n};\n");
+	}
+}
+
+/* Print on stderr a readable listing of the .pulp.export section held in Section; SectionSize (bytes) only sizes the optional raw dump. */
+static void DiassembleExports(unsigned int *Section, unsigned int SectionSize)
+{
+	static int RawDump = 0;	/* when non zero the section is also emitted as a C array */
+	unsigned int Header, Count, TableWord;
+	unsigned int k;
+	char *Rec;
+	unsigned long int Pos;
+
+	if (!Section) return;
+
+	fprintf(stderr, "Section: .pulp.export\n");
+	/* Header word: bit 0 = type flag, bits 15..1 = number of exported symbols, bits 31..16 = word index of the value table. */
+	Header = Section[0];
+	Count = (Header&0x0FFFF)>>1;
+	TableWord = Header>>16;
+
+	fprintf(stderr, "%8s  %17s %20s\n", "Offset", "Content", "Comment");
+	fprintf(stderr, "%8x: 0x%15X (Type: %s, Number of Exported Symbols: %d, Base Linker Values: 0x%X, Section Size: 0x%X)\n",
+		0u, Header, (Header&0x01)?"Component":"Resident",
+		Count, (TableWord<<2)&0x0FFFF,
+		Count*4 + ((TableWord<<2)&0x0FFFF));
+	/* Name records start right after the header: one byte printed as the section number, then the NUL terminated name. */
+	Rec = (char *) (Section + 1);
+	Pos = 4;
+	for (k=0; k<Count; k++) {
+		unsigned int Step = strlen(Rec+1) + 2;
+
+		fprintf(stderr, "%8x: %17s Section: %2d (Exported Symbol %5d)\n", (unsigned int) Pos, Rec+1, Rec[0], k);
+		Rec += Step;
+		Pos += Step;
+	}
+
+	/* The value table is located from the header alone, so the position reached above needs no alignment fix-up. */
+	Pos = TableWord<<2;
+	for (k=0; k<Count; k++, Pos+=4) {
+		fprintf(stderr, "%8x: 0x%15X (Exported Symbol %5d, Offset in Section)\n", (unsigned int) Pos, Section[TableWord+k], k);
+	}
+	if (RawDump) {
+		unsigned int Words = (SectionSize>>2) + ((SectionSize % 4) ? 1 : 0);
+
+		fprintf(stderr, "unsigned int ExportSymb[%d] = {\n\t", Words);
+		for (k=0; k<Words; k++) {
+			fprintf(stderr, "0X%X, ", Section[k]);
+			if (((k+1)%5)==0) fprintf(stderr, "\n\t");
+		}
+		fprintf(stderr, "\n};\n");
+	}
+}
+
+
+/* Look through the relocs for a section during the first phase, and
+   allocate space in the global offset table or procedure linkage
+   table.  Relocs against weak symbols defined in a .pulp.import section are also recorded as imports.  Returns FALSE on error.  */
+
+static bfd_boolean
+riscv_elf_check_relocs (bfd *abfd, struct bfd_link_info *info,
+			asection *sec, const Elf_Internal_Rela *relocs)
+{
+  struct riscv_elf_link_hash_table *htab;
+  Elf_Internal_Shdr *symtab_hdr;
+  struct elf_link_hash_entry **sym_hashes;
+  Elf_Internal_Rela *rel;
+  asection *sreloc = NULL;
+
+  if (bfd_link_relocatable (info))
+    return TRUE;
+
+  htab = riscv_elf_hash_table (info);
+  symtab_hdr = &elf_tdata (abfd)->symtab_hdr;
+  sym_hashes = elf_sym_hashes (abfd);
+
+  if (htab->elf.dynobj == NULL)
+    htab->elf.dynobj = abfd;
+
+  for (rel = relocs; rel < relocs + sec->reloc_count; rel++)
+    {
+      unsigned int r_type;
+      unsigned long r_symndx;
+      struct elf_link_hash_entry *h;
+
+      r_symndx = ELF64_R_SYM (rel->r_info);
+      r_type = ELF64_R_TYPE (rel->r_info);
+      /* Reject a reloc whose symbol index does not fit in the symbol table.  */
+      if (r_symndx >= NUM_SHDR_ENTRIES (symtab_hdr))
+	{
+	  (*_bfd_error_handler) (_("%B: bad symbol index: %d"),
+				 abfd, r_symndx);
+	  return FALSE;
+	}
+      /* Indices below sh_info have no hash entry (H stays NULL); the others are looked up, following indirect and warning links.  */
+      if (r_symndx < symtab_hdr->sh_info)
+	h = NULL;
+      else
+	{
+	  h = sym_hashes[r_symndx - symtab_hdr->sh_info];
+	  while (h->root.type == bfd_link_hash_indirect
+		 || h->root.type == bfd_link_hash_warning)
+	    h = (struct elf_link_hash_entry *) h->root.u.i.link;
+
+	  /* PR15323, ref flags aren't set for references in the same
+	     object.  */
+	  h->root.non_ir_ref = 1;
+	}
+      /* PULP: a reloc against a weak definition placed in a section named .pulp.import is an import; flag that section SEC_KEEP and record the reloc with InsertImportEntry.  */
+      if (h && h->root.type == bfd_link_hash_defweak) {
+                static int Trace = 0;
+                asection *sec1;
+                reloc_howto_type *howto = riscv_elf_rtype_to_howto (ELF64_R_TYPE (rel->r_info));
+
+                sec1 = h->root.u.def.section;
+
+                if (sec1 != NULL && (strcmp(sec1->name, ".pulp.import")==0)) {
+                        if (Trace) printf("Pre Importing %15s in reloc: %4d -> %4d:%22s, at offset: (%8X + %8X) => %X\n",
+                                          h->root.root.string, (int) rel->r_info,
+                                          (int) ELF64_R_TYPE(rel->r_info), howto->name, (int) rel->r_offset, (int) sec1->output_offset,
+                                          (int) sec1->output_offset+(int)rel->r_offset);
+                        sec1->flags |= SEC_KEEP;
+                        InsertImportEntry(h->root.root.string, rel, sec1->output_offset, TRUE);
+                }
+
+      }
+      switch (r_type)
+	{
+	case R_RISCV_TLS_GD_HI20:
+	  if (!riscv_elf_record_got_reference (abfd, info, h, r_symndx)
+	      || !riscv_elf_record_tls_type (abfd, h, r_symndx, GOT_TLS_GD))
+	    return FALSE;
+	  break;
+
+	case R_RISCV_TLS_GOT_HI20:
+	  if (bfd_link_pic (info))
+	    info->flags |= DF_STATIC_TLS;
+	  if (!riscv_elf_record_got_reference (abfd, info, h, r_symndx)
+	      || !riscv_elf_record_tls_type (abfd, h, r_symndx, GOT_TLS_IE))
+	    return FALSE;
+	  break;
+
+	case R_RISCV_GOT_HI20:
+	  if (!riscv_elf_record_got_reference (abfd, info, h, r_symndx)
+	      || !riscv_elf_record_tls_type (abfd, h, r_symndx, GOT_NORMAL))
+	    return FALSE;
+	  break;
+
+	case R_RISCV_CALL_PLT:
+	  /* This symbol requires a procedure linkage table entry.  We
+	     actually build the entry in adjust_dynamic_symbol,
+	     because this might be a case of linking PIC code without
+	     linking in any dynamic objects, in which case we don't
+	     need to generate a procedure linkage table after all.  */
+
+	  if (h != NULL)
+	    {
+	      h->needs_plt = 1;
+	      h->plt.refcount += 1;
+	    }
+	  break;
+
+	case R_RISCV_CALL:
+	case R_RISCV_JAL:
+	case R_RISCV_BRANCH:
+	case R_RISCV_RVC_BRANCH:
+	case R_RISCV_RVC_JUMP:
+	case R_RISCV_PCREL_HI20:
+	  /* In shared libraries, these relocs are known to bind locally.  */
+	  if (bfd_link_pic (info))
+	    break;
+	  goto static_reloc;
+
+	case R_RISCV_TPREL_HI20:
+	  if (!bfd_link_executable (info))
+	    return bad_static_reloc (abfd, r_type, h);
+	  if (h != NULL)
+	    riscv_elf_record_tls_type (abfd, h, r_symndx, GOT_TLS_LE);
+	  goto static_reloc;
+
+	case R_RISCV_HI20:
+	  if (bfd_link_pic (info))
+	    return bad_static_reloc (abfd, r_type, h);
+	  /* Fall through.  */
+
+	case R_RISCV_COPY:
+	case R_RISCV_JUMP_SLOT:
+	case R_RISCV_RELATIVE:
+	case R_RISCV_64:
+	case R_RISCV_32:
+	  /* Fall through.  */
+
+	static_reloc:
+	  /* This reloc might not bind locally.  */
+	  if (h != NULL)
+	    h->non_got_ref = 1;
+
+	  if (h != NULL && !bfd_link_pic (info))
+	    {
+	      /* We may need a .plt entry if the function this reloc
+		 refers to is in a shared lib.  */
+	      h->plt.refcount += 1;
+	    }
+
+	  /* If we are creating a shared library, and this is a reloc
+	     against a global symbol, or a non PC relative reloc
+	     against a local symbol, then we need to copy the reloc
+	     into the shared library.  However, if we are linking with
+	     -Bsymbolic, we do not need to copy a reloc against a
+	     global symbol which is defined in an object we are
+	     including in the link (i.e., DEF_REGULAR is set).  At
+	     this point we have not seen all the input files, so it is
+	     possible that DEF_REGULAR is not set now but will be set
+	     later (it is never cleared).  In case of a weak definition,
+	     DEF_REGULAR may be cleared later by a strong definition in
+	     a shared library.  We account for that possibility below by
+	     storing information in the relocs_copied field of the hash
+	     table entry.  A similar situation occurs when creating
+	     shared libraries and symbol visibility changes render the
+	     symbol local.
+
+	     If on the other hand, we are creating an executable, we
+	     may need to keep relocations for symbols satisfied by a
+	     dynamic library if we manage to avoid copy relocs for the
+	     symbol.  */
+	  if ((bfd_link_pic (info)
+	       && (sec->flags & SEC_ALLOC) != 0
+	       && (! riscv_elf_rtype_to_howto (r_type)->pc_relative
+		   || (h != NULL
+		       && (! info->symbolic
+			   || h->root.type == bfd_link_hash_defweak
+			   || !h->def_regular))))
+	      || (!bfd_link_pic (info)
+		  && (sec->flags & SEC_ALLOC) != 0
+		  && h != NULL
+		  && (h->root.type == bfd_link_hash_defweak
+		      || !h->def_regular)))
+	    {
+	      struct riscv_elf_dyn_relocs *p;
+	      struct riscv_elf_dyn_relocs **head;
+
+	      /* When creating a shared object, we must copy these
+		 relocs into the output file.  We create a reloc
+		 section in dynobj and make room for the reloc.  */
+	      if (sreloc == NULL)
+		{
+		  sreloc = _bfd_elf_make_dynamic_reloc_section
+		    (sec, htab->elf.dynobj, RISCV_ELF_LOG_WORD_BYTES,
+		    abfd, /*rela?*/ TRUE);
+
+		  if (sreloc == NULL)
+		    return FALSE;
+		}
+
+	      /* If this is a global symbol, we count the number of
+		 relocations we need for this symbol.  */
+	      if (h != NULL)
+		head = &((struct riscv_elf_link_hash_entry *) h)->dyn_relocs;
+	      else
+		{
+		  /* Track dynamic relocs needed for local syms too.
+		     We really need local syms available to do this
+		     easily.  Oh well.  */
+
+		  asection *s;
+		  void *vpp;
+		  Elf_Internal_Sym *isym;
+
+		  isym = bfd_sym_from_r_symndx (&htab->sym_cache,
+						abfd, r_symndx);
+		  if (isym == NULL)
+		    return FALSE;
+
+		  s = bfd_section_from_elf_index (abfd, isym->st_shndx);
+		  if (s == NULL)
+		    s = sec;
+
+		  vpp = &elf_section_data (s)->local_dynrel;
+		  head = (struct riscv_elf_dyn_relocs **) vpp;
+		}
+
+	      p = *head;
+	      if (p == NULL || p->sec != sec)
+		{
+		  bfd_size_type amt = sizeof *p;
+		  p = ((struct riscv_elf_dyn_relocs *)
+		       bfd_alloc (htab->elf.dynobj, amt));
+		  if (p == NULL)
+		    return FALSE;
+		  p->next = *head;
+		  *head = p;
+		  p->sec = sec;
+		  p->count = 0;
+		  p->pc_count = 0;
+		}
+
+	      p->count += 1;
+	      p->pc_count += riscv_elf_rtype_to_howto (r_type)->pc_relative;
+	    }
+
+	  break;
+
+	case R_RISCV_GNU_VTINHERIT:
+	  if (!bfd_elf_gc_record_vtinherit (abfd, sec, h, rel->r_offset))
+	    return FALSE;
+	  break;
+
+	case R_RISCV_GNU_VTENTRY:
+	  if (!bfd_elf_gc_record_vtentry (abfd, sec, h, rel->r_addend))
+	    return FALSE;
+	  break;
+
+	default:
+	  break;
+	}
+    }
+
+  return TRUE;
+}
+
+static asection *
+riscv_elf_gc_mark_hook (asection *sec,
+			struct bfd_link_info *info,
+			Elf_Internal_Rela *rel,
+			struct elf_link_hash_entry *h,
+			Elf_Internal_Sym *sym)
+{
+  /* GNU_VTINHERIT / GNU_VTENTRY relocs against a hashed symbol mark no section; all else goes to the generic hook.  */
+  if (h != NULL)
+    {
+      unsigned int r_type = ELF64_R_TYPE (rel->r_info);
+
+      if (r_type == R_RISCV_GNU_VTINHERIT || r_type == R_RISCV_GNU_VTENTRY)
+	return NULL;
+    }
+  return _bfd_elf_gc_mark_hook (sec, info, rel, h, sym);
+}
+
+/* Update the got and plt entry reference counts for the section being removed, and drop the dynamic relocs recorded against SEC.  Always returns TRUE.  */
+
+static bfd_boolean
+riscv_elf_gc_sweep_hook (bfd *abfd,
+			 struct bfd_link_info *info,
+			 asection *sec,
+			 const Elf_Internal_Rela *relocs)
+{
+  const Elf_Internal_Rela *rel, *relend;
+  Elf_Internal_Shdr *symtab_hdr = &elf_symtab_hdr (abfd);
+  struct elf_link_hash_entry **sym_hashes = elf_sym_hashes (abfd);
+  bfd_signed_vma *local_got_refcounts = elf_local_got_refcounts (abfd);
+
+  if (bfd_link_relocatable (info))
+    return TRUE;
+  /* Forget the dynamic relocs that were tracked for local symbols on SEC.  */
+  elf_section_data (sec)->local_dynrel = NULL;
+
+  for (rel = relocs, relend = relocs + sec->reloc_count; rel < relend; rel++)
+    {
+      unsigned long r_symndx;
+      struct elf_link_hash_entry *h = NULL;
+
+      r_symndx = ELF64_R_SYM (rel->r_info);
+      if (r_symndx >= symtab_hdr->sh_info)
+	{
+	  struct riscv_elf_link_hash_entry *eh;
+	  struct riscv_elf_dyn_relocs **pp;
+	  struct riscv_elf_dyn_relocs *p;
+
+	  h = sym_hashes[r_symndx - symtab_hdr->sh_info];
+	  while (h->root.type == bfd_link_hash_indirect
+		 || h->root.type == bfd_link_hash_warning)
+	    h = (struct elf_link_hash_entry *) h->root.u.i.link;
+	  eh = (struct riscv_elf_link_hash_entry *) h;
+	  for (pp = &eh->dyn_relocs; (p = *pp) != NULL; pp = &p->next)
+	    if (p->sec == sec)
+	      {
+		/* Everything must go for SEC.  */
+		*pp = p->next;
+		break;
+	      }
+	}
+      /* Then give back the GOT or PLT reference counted for this kind of reloc; counts are never taken below zero.  */
+      switch (ELF64_R_TYPE (rel->r_info))
+	{
+	case R_RISCV_GOT_HI20:
+	case R_RISCV_TLS_GOT_HI20:
+	case R_RISCV_TLS_GD_HI20:
+	  if (h != NULL)
+	    {
+	      if (h->got.refcount > 0)
+		h->got.refcount--;
+	    }
+	  else
+	    {
+	      if (local_got_refcounts &&
+		  local_got_refcounts[r_symndx] > 0)
+		local_got_refcounts[r_symndx]--;
+	    }
+	  break;
+
+	case R_RISCV_HI20:
+	case R_RISCV_PCREL_HI20:
+	case R_RISCV_COPY:
+	case R_RISCV_JUMP_SLOT:
+	case R_RISCV_RELATIVE:
+	case R_RISCV_64:
+	case R_RISCV_32:
+	case R_RISCV_BRANCH:
+	case R_RISCV_CALL:
+	case R_RISCV_JAL:
+	case R_RISCV_RVC_BRANCH:
+	case R_RISCV_RVC_JUMP:
+	  if (bfd_link_pic (info))
+	    break;
+	  /* Fall through.  */
+
+	case R_RISCV_CALL_PLT:
+	  if (h != NULL)
+	    {
+	      if (h->plt.refcount > 0)
+		h->plt.refcount--;
+	    }
+	  break;
+
+	default:
+	  break;
+	}
+    }
+
+  return TRUE;
+}
+
+/* Adjust a symbol defined by a dynamic object and referenced by a
+   regular object.  The current definition is in some section of the
+   dynamic object, but we're not including those sections.  We have to
+   change the definition to something the rest of the link can
+   understand.  */
+
+static bfd_boolean
+riscv_elf_adjust_dynamic_symbol (struct bfd_link_info *info,
+				 struct elf_link_hash_entry *h)
+{
+  struct riscv_elf_link_hash_table *htab;
+  struct riscv_elf_link_hash_entry * eh;
+  struct riscv_elf_dyn_relocs *p;
+  bfd *dynobj;
+  asection *s, *srel;
+
+  htab = riscv_elf_hash_table (info);
+  BFD_ASSERT (htab != NULL);
+
+  dynobj = htab->elf.dynobj;
+
+  /* Make sure we know what is going on here.  */
+  BFD_ASSERT (dynobj != NULL
+	      && (h->needs_plt
+		  || h->type == STT_GNU_IFUNC
+		  || h->u.weakdef != NULL
+		  || (h->def_dynamic
+		      && h->ref_regular
+		      && !h->def_regular)));
+
+  /* If this is a function, put it in the procedure linkage table.  We
+     will fill in the contents of the procedure linkage table later
+     (although we could actually do it here).  */
+  if (h->type == STT_FUNC || h->type == STT_GNU_IFUNC || h->needs_plt)
+    {
+      if (h->plt.refcount <= 0
+	  || SYMBOL_CALLS_LOCAL (info, h)
+	  || (ELF_ST_VISIBILITY (h->other) != STV_DEFAULT
+	      && h->root.type == bfd_link_hash_undefweak))
+	{
+	  /* This case can occur if we saw a R_RISCV_CALL_PLT reloc in an
+	     input file, but the symbol was never referred to by a dynamic
+	     object, or if all references were garbage collected.  In such
+	     a case, we don't actually need to build a PLT entry.  */
+	  h->plt.offset = (bfd_vma) -1;
+	  h->needs_plt = 0;
+	}
+
+      return TRUE;
+    }
+  else
+    h->plt.offset = (bfd_vma) -1;
+
+  /* If this is a weak symbol, and there is a real definition, the
+     processor independent code will have arranged for us to see the
+     real definition first, and we can just use the same value.  */
+  if (h->u.weakdef != NULL)
+    {
+      BFD_ASSERT (h->u.weakdef->root.type == bfd_link_hash_defined
+		  || h->u.weakdef->root.type == bfd_link_hash_defweak);
+      h->root.u.def.section = h->u.weakdef->root.u.def.section;
+      h->root.u.def.value = h->u.weakdef->root.u.def.value;
+      return TRUE;
+    }
+
+  /* This is a reference to a symbol defined by a dynamic object which
+     is not a function.  */
+
+  /* If we are creating a shared library, we must presume that the
+     only references to the symbol are via the global offset table.
+     For such cases we need not do anything here; the relocations will
+     be handled correctly by relocate_section.  */
+  if (bfd_link_pic (info))
+    return TRUE;
+
+  /* If there are no references to this symbol that do not use the
+     GOT, we don't need to generate a copy reloc.  */
+  if (!h->non_got_ref)
+    return TRUE;
+
+  /* If -z nocopyreloc was given, we won't generate them either.  */
+  if (info->nocopyreloc)
+    {
+      h->non_got_ref = 0;
+      return TRUE;
+    }
+  /* Look for a dynamic reloc of this symbol that applies to a section whose output section is read-only; P is left NULL if there is none.  */
+  eh = (struct riscv_elf_link_hash_entry *) h;
+  for (p = eh->dyn_relocs; p != NULL; p = p->next)
+    {
+      s = p->sec->output_section;
+      if (s != NULL && (s->flags & SEC_READONLY) != 0)
+	break;
+    }
+
+  /* If we didn't find any dynamic relocs in read-only sections, then
+     we'll be keeping the dynamic relocs and avoiding the copy reloc.  */
+  if (p == NULL)
+    {
+      h->non_got_ref = 0;
+      return TRUE;
+    }
+
+  /* We must allocate the symbol in our .dynbss section, which will
+     become part of the .bss section of the executable.  There will be
+     an entry for this symbol in the .dynsym section.  The dynamic
+     object will contain position independent code, so all references
+     from the dynamic object to this symbol will go through the global
+     offset table.  The dynamic linker will use the .dynsym entry to
+     determine the address it must put in the global offset table, so
+     both the dynamic object and the regular object will refer to the
+     same memory location for the variable.  */
+
+  /* We must generate a R_RISCV_COPY reloc to tell the dynamic linker
+     to copy the initial value out of the dynamic object and into the
+     runtime process image.  We need to remember the offset into the
+     .rel.bss section we are going to use.  */
+  if ((h->root.u.def.section->flags & SEC_READONLY) != 0)
+    {
+      s = htab->elf.sdynrelro;
+      srel = htab->elf.sreldynrelro;
+    }
+  else
+    {
+      s = htab->elf.sdynbss;
+      srel = htab->elf.srelbss;
+    }
+  if ((h->root.u.def.section->flags & SEC_ALLOC) != 0 && h->size != 0)
+    {
+      srel->size += sizeof (Elf64_External_Rela);
+      h->needs_copy = 1;
+    }
+  /* A symbol whose tls_type has bits other than GOT_NORMAL is copied into htab->sdyntdata rather than into the section chosen above.  */
+  if (eh->tls_type & ~GOT_NORMAL)
+    return _bfd_elf_adjust_dynamic_copy (info, h, htab->sdyntdata);
+
+  return _bfd_elf_adjust_dynamic_copy (info, h, s);
+}
+
+/* Allocate space in .plt, .got and associated reloc sections for
+   dynamic relocs.  Run on each hash entry H through elf_link_hash_traverse, INF being the struct bfd_link_info; returns FALSE when a symbol cannot be recorded as dynamic.  */
+
+static bfd_boolean
+allocate_dynrelocs (struct elf_link_hash_entry *h, void *inf)
+{
+  struct bfd_link_info *info;
+  struct riscv_elf_link_hash_table *htab;
+  struct riscv_elf_link_hash_entry *eh;
+  struct riscv_elf_dyn_relocs *p;
+
+  if (h->root.type == bfd_link_hash_indirect)
+    return TRUE;
+
+  info = (struct bfd_link_info *) inf;
+  htab = riscv_elf_hash_table (info);
+  BFD_ASSERT (htab != NULL);
+
+  if (htab->elf.dynamic_sections_created
+      && h->plt.refcount > 0)
+    {
+      /* Make sure this symbol is output as a dynamic symbol.
+	 Undefined weak syms won't yet be marked as dynamic.  */
+      if (h->dynindx == -1
+	  && !h->forced_local)
+	{
+	  if (! bfd_elf_link_record_dynamic_symbol (info, h))
+	    return FALSE;
+	}
+
+      if (WILL_CALL_FINISH_DYNAMIC_SYMBOL (1, bfd_link_pic (info), h))
+	{
+	  asection *s = htab->elf.splt;
+
+	  if (s->size == 0)
+	    s->size = PLT_HEADER_SIZE;
+
+	  h->plt.offset = s->size;
+
+	  /* Make room for this entry.  */
+	  s->size += PLT_ENTRY_SIZE;
+
+	  /* We also need to make an entry in the .got.plt section.  */
+	  htab->elf.sgotplt->size += GOT_ENTRY_SIZE;
+
+	  /* We also need to make an entry in the .rela.plt section.  */
+	  htab->elf.srelplt->size += sizeof (Elf64_External_Rela);
+
+	  /* If this symbol is not defined in a regular file, and we are
+	     not generating a shared library, then set the symbol to this
+	     location in the .plt.  This is required to make function
+	     pointers compare as equal between the normal executable and
+	     the shared library.  */
+	  if (! bfd_link_pic (info)
+	      && !h->def_regular)
+	    {
+	      h->root.u.def.section = s;
+	      h->root.u.def.value = h->plt.offset;
+	    }
+	}
+      else
+	{
+	  h->plt.offset = (bfd_vma) -1;
+	  h->needs_plt = 0;
+	}
+    }
+  else
+    {
+      h->plt.offset = (bfd_vma) -1;
+      h->needs_plt = 0;
+    }
+  /* Next the GOT: a symbol still referenced through it gets its slot(s) in .got plus the matching dynamic relocs; otherwise its got offset is set to -1.  */
+  if (h->got.refcount > 0)
+    {
+      asection *s;
+      bfd_boolean dyn;
+      int tls_type = riscv_elf_hash_entry (h)->tls_type;
+
+      /* Make sure this symbol is output as a dynamic symbol.
+	 Undefined weak syms won't yet be marked as dynamic.  */
+      if (h->dynindx == -1
+	  && !h->forced_local)
+	{
+	  if (! bfd_elf_link_record_dynamic_symbol (info, h))
+	    return FALSE;
+	}
+
+      s = htab->elf.sgot;
+      h->got.offset = s->size;
+      dyn = htab->elf.dynamic_sections_created;
+      if (tls_type & (GOT_TLS_GD | GOT_TLS_IE))
+	{
+	  /* TLS_GD needs two dynamic relocs and two GOT slots.  */
+	  if (tls_type & GOT_TLS_GD)
+	    {
+	      s->size += 2 * RISCV_ELF_WORD_BYTES;
+	      htab->elf.srelgot->size += 2 * sizeof (Elf64_External_Rela);
+	    }
+
+	  /* TLS_IE needs one dynamic reloc and one GOT slot.  */
+	  if (tls_type & GOT_TLS_IE)
+	    {
+	      s->size += RISCV_ELF_WORD_BYTES;
+	      htab->elf.srelgot->size += sizeof (Elf64_External_Rela);
+	    }
+	}
+      else
+	{
+	  s->size += RISCV_ELF_WORD_BYTES;
+	  if (WILL_CALL_FINISH_DYNAMIC_SYMBOL (dyn, bfd_link_pic (info), h))
+	    htab->elf.srelgot->size += sizeof (Elf64_External_Rela);
+	}
+    }
+  else
+    h->got.offset = (bfd_vma) -1;
+  /* Last, the dynamic relocs recorded for this symbol; nothing more to do when there are none.  */
+  eh = (struct riscv_elf_link_hash_entry *) h;
+  if (eh->dyn_relocs == NULL)
+    return TRUE;
+
+  /* In the shared -Bsymbolic case, discard space allocated for
+     dynamic pc-relative relocs against symbols which turn out to be
+     defined in regular objects.  For the normal shared case, discard
+     space for pc-relative relocs that have become local due to symbol
+     visibility changes.  */
+
+  if (bfd_link_pic (info))
+    {
+      if (SYMBOL_CALLS_LOCAL (info, h))
+	{
+	  struct riscv_elf_dyn_relocs **pp;
+
+	  for (pp = &eh->dyn_relocs; (p = *pp) != NULL; )
+	    {
+	      p->count -= p->pc_count;
+	      p->pc_count = 0;
+	      if (p->count == 0)
+		*pp = p->next;
+	      else
+		pp = &p->next;
+	    }
+	}
+
+      /* Also discard relocs on undefined weak syms with non-default
+	 visibility.  */
+      if (eh->dyn_relocs != NULL
+	  && h->root.type == bfd_link_hash_undefweak)
+	{
+	  if (ELF_ST_VISIBILITY (h->other) != STV_DEFAULT)
+	    eh->dyn_relocs = NULL;
+
+	  /* Make sure undefined weak symbols are output as a dynamic
+	     symbol in PIEs.  */
+	  else if (h->dynindx == -1
+		   && !h->forced_local)
+	    {
+	      if (! bfd_elf_link_record_dynamic_symbol (info, h))
+		return FALSE;
+	    }
+	}
+    }
+  else
+    {
+      /* For the non-shared case, discard space for relocs against
+	 symbols which turn out to need copy relocs or are not
+	 dynamic.  */
+
+      if (!h->non_got_ref
+	  && ((h->def_dynamic
+	       && !h->def_regular)
+	      || (htab->elf.dynamic_sections_created
+		  && (h->root.type == bfd_link_hash_undefweak
+		      || h->root.type == bfd_link_hash_undefined))))
+	{
+	  /* Make sure this symbol is output as a dynamic symbol.
+	     Undefined weak syms won't yet be marked as dynamic.  */
+	  if (h->dynindx == -1
+	      && !h->forced_local)
+	    {
+	      if (! bfd_elf_link_record_dynamic_symbol (info, h))
+		return FALSE;
+	    }
+
+	  /* If that succeeded, we know we'll be keeping all the
+	     relocs.  */
+	  if (h->dynindx != -1)
+	    goto keep;
+	}
+
+      eh->dyn_relocs = NULL;
+
+    keep: ;
+    }
+
+  /* Finally, allocate space.  */
+  for (p = eh->dyn_relocs; p != NULL; p = p->next)
+    {
+      asection *sreloc = elf_section_data (p->sec)->sreloc;
+      sreloc->size += p->count * sizeof (Elf64_External_Rela);
+    }
+
+  return TRUE;
+}
+
+/* Find any dynamic relocs that apply to read-only sections: when H has one, DF_TEXTREL
+   is set in the struct bfd_link_info passed as INF and FALSE is returned, else TRUE.  */
+static bfd_boolean
+readonly_dynrelocs (struct elf_link_hash_entry *h, void *inf)
+{
+  struct bfd_link_info *link_info = (struct bfd_link_info *) inf;
+  struct riscv_elf_dyn_relocs *dr;
+
+  for (dr = ((struct riscv_elf_link_hash_entry *) h)->dyn_relocs;
+       dr != NULL; dr = dr->next)
+    {
+      asection *out = dr->sec->output_section;
+
+      /* A reloc with no output section, or a writable one, is of no interest.  */
+      if (out == NULL || (out->flags & SEC_READONLY) == 0)
+	continue;
+      link_info->flags |= DF_TEXTREL;
+      return FALSE;
+    }
+  return TRUE;
+}
+
+static bfd_boolean
+riscv_elf_size_dynamic_sections (bfd *output_bfd, struct bfd_link_info *info)
+{
+  struct riscv_elf_link_hash_table *htab;
+  bfd *dynobj;
+  asection *s;
+  bfd *ibfd;
+
+  htab = riscv_elf_hash_table (info);
+  BFD_ASSERT (htab != NULL);
+  dynobj = htab->elf.dynobj;
+  BFD_ASSERT (dynobj != NULL);
+
+  if (elf_hash_table (info)->dynamic_sections_created)
+    {
+      /* Set the contents of the .interp section to the interpreter.  */
+      if (bfd_link_executable (info) && !info->nointerp)
+	{
+	  s = bfd_get_linker_section (dynobj, ".interp");
+	  BFD_ASSERT (s != NULL);
+	  s->size = strlen (ELF64_DYNAMIC_INTERPRETER) + 1;
+	  s->contents = (unsigned char *) ELF64_DYNAMIC_INTERPRETER;
+	}
+    }
+
+  /* Set up .got offsets for local syms, and space for local dynamic
+     relocs.  */
+  for (ibfd = info->input_bfds; ibfd != NULL; ibfd = ibfd->link.next)
+    {
+      bfd_signed_vma *local_got;
+      bfd_signed_vma *end_local_got;
+      char *local_tls_type;
+      bfd_size_type locsymcount;
+      Elf_Internal_Shdr *symtab_hdr;
+      asection *srel;
+
+      if (! is_riscv_elf (ibfd))
+	continue;
+
+      for (s = ibfd->sections; s != NULL; s = s->next)
+	{
+	  struct riscv_elf_dyn_relocs *p;
+
+	  for (p = elf_section_data (s)->local_dynrel; p != NULL; p = p->next)
+	    {
+	      if (!bfd_is_abs_section (p->sec)
+		  && bfd_is_abs_section (p->sec->output_section))
+		{
+		  /* Input section has been discarded, either because
+		     it is a copy of a linkonce section or due to
+		     linker script /DISCARD/, so we'll be discarding
+		     the relocs too.  */
+		}
+	      else if (p->count != 0)
+		{
+		  srel = elf_section_data (p->sec)->sreloc;
+		  srel->size += p->count * sizeof (Elf64_External_Rela);
+		  if ((p->sec->output_section->flags & SEC_READONLY) != 0)
+		    info->flags |= DF_TEXTREL;
+		}
+	    }
+	}
+
+      local_got = elf_local_got_refcounts (ibfd);
+      if (!local_got)
+	continue;
+
+      symtab_hdr = &elf_symtab_hdr (ibfd);
+      locsymcount = symtab_hdr->sh_info;
+      end_local_got = local_got + locsymcount;
+      local_tls_type = _bfd_riscv_elf_local_got_tls_type (ibfd);
+      s = htab->elf.sgot;
+      srel = htab->elf.srelgot;
+      for (; local_got < end_local_got; ++local_got, ++local_tls_type)
+	{
+	  if (*local_got > 0)
+	    {
+	      *local_got = s->size;
+	      s->size += RISCV_ELF_WORD_BYTES;
+	      if (*local_tls_type & GOT_TLS_GD)
+		s->size += RISCV_ELF_WORD_BYTES;
+	      if (bfd_link_pic (info)
+		  || (*local_tls_type & (GOT_TLS_GD | GOT_TLS_IE)))
+		srel->size += sizeof (Elf64_External_Rela);
+	    }
+	  else
+	    *local_got = (bfd_vma) -1;
+	}
+    }
+
+  /* Allocate global sym .plt and .got entries, and space for global
+     sym dynamic relocs.  */
+  elf_link_hash_traverse (&htab->elf, allocate_dynrelocs, info);
+
+  if (htab->elf.sgotplt)
+    {
+      struct elf_link_hash_entry *got;
+      got = elf_link_hash_lookup (elf_hash_table (info),
+				  "_GLOBAL_OFFSET_TABLE_",
+				  FALSE, FALSE, FALSE);
+
+      /* Don't allocate .got.plt section if there are no GOT nor PLT
+	 entries and there is no reference to _GLOBAL_OFFSET_TABLE_.  */
+      if ((got == NULL
+	   || !got->ref_regular_nonweak)
+	  && (htab->elf.sgotplt->size == GOTPLT_HEADER_SIZE)
+	  && (htab->elf.splt == NULL
+	      || htab->elf.splt->size == 0)
+	  && (htab->elf.sgot == NULL
+	      || (htab->elf.sgot->size
+		  == get_elf_backend_data (output_bfd)->got_header_size)))
+	htab->elf.sgotplt->size = 0;
+    }
+
+  /* The check_relocs and adjust_dynamic_symbol entry points have
+     determined the sizes of the various dynamic sections.  Allocate
+     memory for them.  */
+  for (s = dynobj->sections; s != NULL; s = s->next)
+    {
+      if ((s->flags & SEC_LINKER_CREATED) == 0)
+	continue;
+
+      if (s == htab->elf.splt
+	  || s == htab->elf.sgot
+	  || s == htab->elf.sgotplt
+	  || s == htab->elf.sdynbss
+	  || s == htab->elf.sdynrelro)
+	{
+	  /* Strip this section if we don't need it; see the
+	     comment below.  */
+	}
+      else if (strncmp (s->name, ".rela", 5) == 0)
+	{
+	  if (s->size != 0)
+	    {
+	      /* We use the reloc_count field as a counter if we need
+		 to copy relocs into the output file.  */
+	      s->reloc_count = 0;
+	    }
+	}
+      else
+	{
+	  /* It's not one of our sections.  */
+	  continue;
+	}
+
+      if (s->size == 0)
+	{
+	  /* If we don't need this section, strip it from the
+	     output file.  This is mostly to handle .rela.bss and
+	     .rela.plt.  We must create both sections in
+	     create_dynamic_sections, because they must be created
+	     before the linker maps input sections to output
+	     sections.  The linker does that before
+	     adjust_dynamic_symbol is called, and it is that
+	     function which decides whether anything needs to go
+	     into these sections.  */
+	  s->flags |= SEC_EXCLUDE;
+	  continue;
+	}
+
+      if ((s->flags & SEC_HAS_CONTENTS) == 0)
+	continue;
+
+      /* Allocate memory for the section contents.  Zero the memory
+	 for the benefit of .rela.plt, which has 4 unused entries
+	 at the beginning, and we don't want garbage.  */
+      s->contents = (bfd_byte *) bfd_zalloc (dynobj, s->size);
+      if (s->contents == NULL)
+	return FALSE;
+    }
+
+  if (elf_hash_table (info)->dynamic_sections_created)
+    {
+      /* Add some entries to the .dynamic section.  We fill in the
+	 values later, in riscv_elf_finish_dynamic_sections, but we
+	 must add the entries now so that we get the correct size for
+	 the .dynamic section.  The DT_DEBUG entry is filled in by the
+	 dynamic linker and used by the debugger.  */
+#define add_dynamic_entry(TAG, VAL) \
+  _bfd_elf_add_dynamic_entry (info, TAG, VAL)
+
+      if (bfd_link_executable (info))
+	{
+	  if (!add_dynamic_entry (DT_DEBUG, 0))
+	    return FALSE;
+	}
+
+      if (htab->elf.srelplt->size != 0)
+	{
+	  if (!add_dynamic_entry (DT_PLTGOT, 0)
+	      || !add_dynamic_entry (DT_PLTRELSZ, 0)
+	      || !add_dynamic_entry (DT_PLTREL, DT_RELA)
+	      || !add_dynamic_entry (DT_JMPREL, 0))
+	    return FALSE;
+	}
+
+      if (!add_dynamic_entry (DT_RELA, 0)
+	  || !add_dynamic_entry (DT_RELASZ, 0)
+	  || !add_dynamic_entry (DT_RELAENT, sizeof (Elf64_External_Rela)))
+	return FALSE;
+
+      /* If any dynamic relocs apply to a read-only section,
+	 then we need a DT_TEXTREL entry.  */
+      if ((info->flags & DF_TEXTREL) == 0)
+	elf_link_hash_traverse (&htab->elf, readonly_dynrelocs, info);
+
+      if (info->flags & DF_TEXTREL)
+	{
+	  if (!add_dynamic_entry (DT_TEXTREL, 0))
+	    return FALSE;
+	}
+    }
+#undef add_dynamic_entry
+
+  return TRUE;
+}
+
+#define TP_OFFSET 0	/* Bias that tpoff subtracts from a TLS-section-relative address.  */
+#define DTP_OFFSET 0x800	/* Bias that dtpoff subtracts from a TLS-section-relative address.  */
+
+/* Translate ADDRESS into the value used for a TLS dtp-relative
+   relocation: its distance from the vma of the link's TLS section,
+   less the DTP_OFFSET bias.  Yields 0 when the link has no TLS
+   section.  */
+
+static bfd_vma
+dtpoff (struct bfd_link_info *info, bfd_vma address)
+{
+  asection *tls = elf_hash_table (info)->tls_sec;
+
+  /* A missing TLS section should already have been reported as an
+     error, so simply hand back a harmless value.  */
+  return tls == NULL ? 0 : address - tls->vma - DTP_OFFSET;
+}
+
+/* Translate ADDRESS into the value used for a static TLS tp-relative
+   relocation: its distance from the vma of the link's TLS section,
+   less the TP_OFFSET bias.  Yields 0 when the link has no TLS
+   section.  */
+
+static bfd_vma
+tpoff (struct bfd_link_info *info, bfd_vma address)
+{
+  asection *tls = elf_hash_table (info)->tls_sec;
+
+  if (tls != NULL)
+    return address - tls->vma - TP_OFFSET;
+
+  /* No TLS section: an error should have been signalled already.  */
+  return 0;
+}
+
+/* Look up the RISCV_GP_SYMBOL symbol in the link hash table and return
+   its address (symbol value plus the address of its section).  The
+   result is 0 when the symbol is absent or is not a plain
+   definition, i.e. when the global pointer is not in use.  */
+
+static bfd_vma
+riscv_global_pointer_value (struct bfd_link_info *info)
+{
+  struct bfd_link_hash_entry *gp_sym
+    = bfd_link_hash_lookup (info->hash, RISCV_GP_SYMBOL, FALSE, FALSE, TRUE);
+
+  if (gp_sym != NULL && gp_sym->type == bfd_link_hash_defined)
+    return gp_sym->u.def.value + sec_addr (gp_sym->u.def.section);
+
+  return 0;
+}
+
+/* Apply one static relocation to the section image.
+
+   VALUE is the resolved symbol value.  It is made PC-relative when
+   HOWTO requests it, REL's addend is added, and the result is encoded
+   for the field layout implied by REL's relocation type.  The encoded
+   bits are then merged into CONTENTS at REL's offset, touching only
+   the bits selected by HOWTO's dst_mask.
+
+   IsImport suppresses the range check for R_RISCV_JAL only.
+
+   Returns bfd_reloc_ok on success, bfd_reloc_overflow when the value
+   does not fit the field, and bfd_reloc_notsupported for relocation
+   types this function does not know how to encode.  Nothing is
+   written to CONTENTS unless bfd_reloc_ok is returned.  */
+
+static bfd_reloc_status_type
+perform_relocation (const reloc_howto_type *howto,
+		    const Elf_Internal_Rela *rel,
+		    bfd_vma value,
+		    asection *input_section,
+		    bfd *input_bfd,
+		    bfd_byte *contents,
+		    bfd_boolean IsImport)
+{
+  bfd_byte *where = contents + rel->r_offset;
+  bfd_vma field;
+
+  if (howto->pc_relative)
+    value -= sec_addr (input_section) + rel->r_offset;
+  value += rel->r_addend;
+
+  switch (ELF64_R_TYPE (rel->r_info))
+    {
+    /* Data relocations: the value is stored as is.  */
+    case R_RISCV_32:
+    case R_RISCV_64:
+    case R_RISCV_ADD8:
+    case R_RISCV_ADD16:
+    case R_RISCV_ADD32:
+    case R_RISCV_ADD64:
+    case R_RISCV_SUB6:
+    case R_RISCV_SUB8:
+    case R_RISCV_SUB16:
+    case R_RISCV_SUB32:
+    case R_RISCV_SUB64:
+    case R_RISCV_SET6:
+    case R_RISCV_SET8:
+    case R_RISCV_SET16:
+    case R_RISCV_SET32:
+    case R_RISCV_TLS_DTPREL32:
+    case R_RISCV_TLS_DTPREL64:
+      break;
+
+    /* High 20 bits in a U-type immediate.  */
+    case R_RISCV_HI20:
+    case R_RISCV_TPREL_HI20:
+    case R_RISCV_PCREL_HI20:
+    case R_RISCV_GOT_HI20:
+    case R_RISCV_TLS_GOT_HI20:
+    case R_RISCV_TLS_GD_HI20:
+      if (ARCH_SIZE > 32 && !VALID_UTYPE_IMM (RISCV_CONST_HIGH_PART (value)))
+	return bfd_reloc_overflow;
+      value = ENCODE_UTYPE_IMM (RISCV_CONST_HIGH_PART (value));
+      break;
+
+    /* Low 12 bits in an I-type immediate, no range check.  */
+    case R_RISCV_LO12_I:
+    case R_RISCV_GPREL_I:
+    case R_RISCV_TPREL_LO12_I:
+    case R_RISCV_TPREL_I:
+    case R_RISCV_PCREL_LO12_I:
+      value = ENCODE_ITYPE_IMM (value);
+      break;
+
+    /* Low 12 bits in an S-type immediate, no range check.  */
+    case R_RISCV_LO12_S:
+    case R_RISCV_GPREL_S:
+    case R_RISCV_TPREL_LO12_S:
+    case R_RISCV_TPREL_S:
+    case R_RISCV_PCREL_LO12_S:
+      value = ENCODE_STYPE_IMM (value);
+      break;
+
+    /* Two-instruction call: U-type part in the low word, I-type part
+       in the high word of the 64-bit field.  */
+    case R_RISCV_CALL:
+    case R_RISCV_CALL_PLT:
+      if (ARCH_SIZE > 32 && !VALID_UTYPE_IMM (RISCV_CONST_HIGH_PART (value)))
+	return bfd_reloc_overflow;
+      value = ENCODE_UTYPE_IMM (RISCV_CONST_HIGH_PART (value))
+	      | (ENCODE_ITYPE_IMM (value) << 32);
+      break;
+
+    case R_RISCV_JAL:
+      /* The range check is skipped for import relocations.  */
+      if (!IsImport && !VALID_UJTYPE_IMM (value))
+	return bfd_reloc_overflow;
+      value = ENCODE_UJTYPE_IMM (value);
+      break;
+
+    case R_RISCV_BRANCH:
+      if (!VALID_SBTYPE_IMM (value))
+	return bfd_reloc_overflow;
+      value = ENCODE_SBTYPE_IMM (value);
+      break;
+
+    /* Compressed instruction forms.  */
+    case R_RISCV_RVC_BRANCH:
+      if (!VALID_RVC_B_IMM (value))
+	return bfd_reloc_overflow;
+      value = ENCODE_RVC_B_IMM (value);
+      break;
+
+    case R_RISCV_RVC_JUMP:
+      if (!VALID_RVC_J_IMM (value))
+	return bfd_reloc_overflow;
+      value = ENCODE_RVC_J_IMM (value);
+      break;
+
+    case R_RISCV_RVC_LUI:
+      if (!VALID_RVC_LUI_IMM (RISCV_CONST_HIGH_PART (value)))
+	return bfd_reloc_overflow;
+      value = ENCODE_RVC_LUI_IMM (RISCV_CONST_HIGH_PART (value));
+      break;
+
+    /* Pulp specific relocs.  */
+    case R_RISCV_12_I:
+      if (!VALID_ITYPE_IMM (value))
+	return bfd_reloc_overflow;
+      value = ENCODE_ITYPE_IMM (value);
+      break;
+
+    case R_RISCV_12_S:
+      if (!VALID_STYPE_IMM (value))
+	return bfd_reloc_overflow;
+      value = ENCODE_STYPE_IMM (value);
+      break;
+
+    case R_RISCV_REL12:
+      value = ENCODE_ITYPE_IMM (value >> howto->rightshift);
+      break;
+
+    case R_RISCV_RELU5:
+      value = ENCODE_I1TYPE_UIMM (value >> howto->rightshift);
+      break;
+    /* End of Pulp specific relocs.  */
+
+    default:
+      return bfd_reloc_notsupported;
+    }
+
+  /* Read-modify-write the field so bits outside dst_mask survive.  */
+  field = bfd_get (howto->bitsize, input_bfd, where);
+  field = (field & ~howto->dst_mask) | (value & howto->dst_mask);
+  bfd_put (howto->bitsize, input_bfd, field, where);
+
+  return bfd_reloc_ok;
+}
+
+/* Remember all PC-relative high-part relocs we've encountered to help us
+   later resolve the corresponding low-part relocs.  One entry is kept
+   per high-part reloc in a hash table keyed on ADDRESS.  */
+
+typedef struct
+{
+  bfd_vma address;	/* Address of the high-part reloc; the hash/equality key.  */
+  bfd_vma value;	/* Recorded target value minus ADDRESS (see riscv_record_pcrel_hi_reloc).  */
+} riscv_pcrel_hi_reloc;
+
+typedef struct riscv_pcrel_lo_reloc	/* One queued low-part reloc; riscv_record_pcrel_lo_reloc fills it positionally, so field order matters.  */
+{
+  asection *                     input_section;	/* Section containing the reloc.  */
+  struct bfd_link_info *         info;	/* Link info, used to reach the error callbacks.  */
+  reloc_howto_type *             howto;	/* Howto describing the low-part reloc.  */
+  const Elf_Internal_Rela *      reloc;	/* The low-part reloc itself.  */
+  bfd_vma                        addr;	/* Key used to find the matching high-part entry.  */
+  const char *                   name;	/* Symbol name passed to the overflow callback.  */
+  bfd_byte *                     contents;	/* Section contents to patch.  */
+  struct riscv_pcrel_lo_reloc *  next;	/* Next queued entry, or NULL.  */
+} riscv_pcrel_lo_reloc;
+
+typedef struct
+{
+  htab_t hi_relocs;	/* Hash table of riscv_pcrel_hi_reloc entries; created with free as its delete function.  */
+  riscv_pcrel_lo_reloc *lo_relocs;	/* Head of the singly linked list of queued low-part relocs.  */
+} riscv_pcrel_relocs;
+
+/* Hash callback for the high-part reloc table: ENTRY points to a
+   riscv_pcrel_hi_reloc and the hash is its address with the two low
+   bits shifted out.  */
+
+static hashval_t
+riscv_pcrel_reloc_hash (const void *entry)
+{
+  bfd_vma key = ((const riscv_pcrel_hi_reloc *) entry)->address;
+
+  return (hashval_t) (key >> 2);
+}
+
+/* Equality callback for the high-part reloc table: two entries match
+   when they describe the same address.  The recorded value is not
+   compared.  */
+
+static bfd_boolean
+riscv_pcrel_reloc_eq (const void *entry1, const void *entry2)
+{
+  const riscv_pcrel_hi_reloc *lhs = (const riscv_pcrel_hi_reloc *) entry1;
+  const riscv_pcrel_hi_reloc *rhs = (const riscv_pcrel_hi_reloc *) entry2;
+
+  return lhs->address == rhs->address;
+}
+
+/* Prepare P for use: start with an empty low-part list and a fresh
+   high-part hash table whose entries are released with free.
+   Returns FALSE if the table could not be created.  */
+
+static bfd_boolean
+riscv_init_pcrel_relocs (riscv_pcrel_relocs *p)
+{
+  htab_t table = htab_create (1024, riscv_pcrel_reloc_hash,
+			      riscv_pcrel_reloc_eq, free);
+
+  p->hi_relocs = table;
+  p->lo_relocs = NULL;
+
+  return table != NULL;
+}
+
+/* Release everything owned by P: each node of the low-part list and
+   then the high-part hash table (whose delete function frees its
+   entries).  P itself is not freed and its fields are left as they
+   were.  */
+
+static void
+riscv_free_pcrel_relocs (riscv_pcrel_relocs *p)
+{
+  riscv_pcrel_lo_reloc *node;
+  riscv_pcrel_lo_reloc *following;
+
+  for (node = p->lo_relocs; node != NULL; node = following)
+    {
+      /* Fetch the successor before the node goes away.  */
+      following = node->next;
+      free (node);
+    }
+
+  htab_delete (p->hi_relocs);
+}
+
+/* Record a PC-relative high-part reloc located at ADDR whose target is
+   VALUE, so that a later low-part reloc referring to ADDR can be
+   resolved.  The stored value is VALUE - ADDR.
+
+   Returns TRUE on success and FALSE if memory could not be obtained;
+   on failure the hash table is left exactly as it was.  */
+
+static bfd_boolean
+riscv_record_pcrel_hi_reloc (riscv_pcrel_relocs *p, bfd_vma addr, bfd_vma value)
+{
+  riscv_pcrel_hi_reloc entry = {addr, value - addr};
+  riscv_pcrel_hi_reloc *copy;
+  riscv_pcrel_hi_reloc **slot;
+
+  /* Allocate first so that a failure cannot leave a claimed but empty
+     slot behind in the table.  */
+  copy = (riscv_pcrel_hi_reloc *) bfd_malloc (sizeof (riscv_pcrel_hi_reloc));
+  if (copy == NULL)
+    return FALSE;
+  *copy = entry;
+
+  /* htab_find_slot may return NULL when inserting if it cannot grow
+     the table.  */
+  slot = (riscv_pcrel_hi_reloc **) htab_find_slot (p->hi_relocs, &entry,
+						   INSERT);
+  if (slot == NULL)
+    {
+      free (copy);
+      return FALSE;
+    }
+
+  /* Each address is expected to be recorded only once.  If that is
+     ever violated the new entry still wins, as before, but the entry
+     it displaces is released instead of being leaked.  */
+  BFD_ASSERT (*slot == NULL);
+  if (*slot != NULL)
+    free (*slot);
+
+  *slot = copy;
+  return TRUE;
+}
+
+/* Queue a PC-relative low-part reloc on P so that it can be applied
+   once the high-part relocs are known.  All arguments are stored as
+   given; ADDR is the key later used to look up the matching high-part
+   entry.  The new node is pushed on the front of the list.
+
+   Returns FALSE if the node could not be allocated, TRUE otherwise.  */
+
+static bfd_boolean
+riscv_record_pcrel_lo_reloc (riscv_pcrel_relocs *p,
+			     asection *input_section,
+			     struct bfd_link_info *info,
+			     reloc_howto_type *howto,
+			     const Elf_Internal_Rela *reloc,
+			     bfd_vma addr,
+			     const char *name,
+			     bfd_byte *contents)
+{
+  riscv_pcrel_lo_reloc *node
+    = (riscv_pcrel_lo_reloc *) bfd_malloc (sizeof (riscv_pcrel_lo_reloc));
+
+  if (node == NULL)
+    return FALSE;
+
+  node->input_section = input_section;
+  node->info = info;
+  node->howto = howto;
+  node->reloc = reloc;
+  node->addr = addr;
+  node->name = name;
+  node->contents = contents;
+
+  /* Link in at the head of the list.  */
+  node->next = p->lo_relocs;
+  p->lo_relocs = node;
+
+  return TRUE;
+}
+
+/* Apply every queued low-part reloc in P using the value recorded for
+   the high-part reloc at the same address.
+
+   If a low-part reloc has no matching high-part entry, the
+   reloc_overflow callback is invoked for it and processing stops
+   there; the remaining queued relocs are not applied.  The status
+   returned by perform_relocation is not examined.  The function
+   returns TRUE in every case.  */
+
+static bfd_boolean
+riscv_resolve_pcrel_lo_relocs (riscv_pcrel_relocs *p)
+{
+  riscv_pcrel_lo_reloc *lo = p->lo_relocs;
+
+  while (lo != NULL)
+    {
+      bfd *owner = lo->input_section->owner;
+      riscv_pcrel_hi_reloc key = {lo->addr, 0};
+      riscv_pcrel_hi_reloc *hi = htab_find (p->hi_relocs, &key);
+
+      if (hi == NULL)
+	{
+	  ((*lo->info->callbacks->reloc_overflow)
+	   (lo->info, NULL, lo->name, lo->howto->name, (bfd_vma) 0,
+	    owner, lo->input_section, lo->reloc->r_offset));
+	  return TRUE;
+	}
+
+      perform_relocation (lo->howto, lo->reloc, hi->value,
+			  lo->input_section, owner, lo->contents, FALSE);
+
+      lo = lo->next;
+    }
+
+  return TRUE;
+}
+
+/* Pulp extension: detect a relocation against an imported symbol.
+
+   The hash entry for global symbol R_SYMNDX of INPUT_BFD is fetched
+   (going through unwrap_hash_lookup for debugging sections when a
+   wrap hash is in use) and indirect and warning links are followed.
+   If the symbol turns out to be a weak definition whose section is
+   named ".pulp.import" and has an output section, that section is
+   flagged SEC_KEEP, REL is handed to InsertImportEntry together with
+   INPUT_SECTION's output offset, and TRUE is returned.  In every
+   other case FALSE is returned and nothing is modified.  */
+
+static bfd_boolean
+RegisterImportReloc (struct bfd_link_info *info,
+		     bfd *input_bfd,
+		     asection *input_section,
+		     Elf_Internal_Rela *rel,
+		     unsigned long r_symndx,
+		     Elf_Internal_Shdr *symtab_hdr,
+		     struct elf_link_hash_entry **sym_hashes,
+		     reloc_howto_type *howto)
+{
+  /* Debug switch for the trace message below; it is never set here.  */
+  static int trace_imports = 0;
+  struct elf_link_hash_entry *h;
+  asection *def_sec;
+
+  /* It seems this can happen with erroneous or unsupported input
+     (mixing a.out and elf in an archive, for example.)  */
+  if (sym_hashes == NULL)
+    return FALSE;
+
+  h = sym_hashes[r_symndx - symtab_hdr->sh_info];
+
+  if (info->wrap_hash != NULL
+      && (input_section->flags & SEC_DEBUGGING) != 0)
+    h = ((struct elf_link_hash_entry *)
+	 unwrap_hash_lookup (info, input_bfd, &h->root));
+
+  while (h->root.type == bfd_link_hash_indirect
+	 || h->root.type == bfd_link_hash_warning)
+    h = (struct elf_link_hash_entry *) h->root.u.i.link;
+
+  /* Only weak definitions can be imports.  */
+  if (h->root.type != bfd_link_hash_defweak)
+    return FALSE;
+
+  def_sec = h->root.u.def.section;
+  if (def_sec == NULL
+      || def_sec->output_section == NULL
+      || strcmp (def_sec->name, ".pulp.import") != 0)
+    return FALSE;
+
+  def_sec->flags |= SEC_KEEP;
+
+  if (trace_imports)
+    printf ("    Importing %15s in reloc: %4d -> %4d:%22s, at offset: (%8X + %8X) => %X\n",
+	    h->root.root.string,
+	    (int) rel->r_info,
+	    (int) ELF64_R_TYPE (rel->r_info),
+	    howto->name,
+	    (int) rel->r_offset,
+	    (int) input_section->output_offset,
+	    (int) ((int) input_section->output_offset + (int) rel->r_offset));
+
+  InsertImportEntry (h->root.root.string, rel, input_section->output_offset,
+		     FALSE);
+  return TRUE;
+}
+
+
+/* Relocate a RISC-V ELF section.
+
+   The RELOCATE_SECTION function is called by the new ELF backend linker
+   to handle the relocations for a section.
+
+   The relocs are always passed as Rela structures.
+
+   This function is responsible for adjusting the section contents as
+   necessary, and (if generating a relocatable output file) adjusting
+   the reloc addend as necessary.
+
+   This function does not have to worry about setting the reloc
+   address or the reloc symbol index.
+
+   LOCAL_SYMS is a pointer to the swapped in local symbols.
+
+   LOCAL_SECTIONS is an array giving the section in the input file
+   corresponding to the st_shndx field of each local symbol.
+
+   The global hash table entry for the global symbols can be found
+   via elf_sym_hashes (input_bfd).
+
+   When generating relocatable output, this function must handle
+   STB_LOCAL/STT_SECTION symbols specially.  The output symbol is
+   going to be the section symbol corresponding to the output
+   section, which means that the addend must be adjusted
+   accordingly.
+
+   PC-relative high-part relocs (including the GOT and TLS GOT forms)
+   are recorded as they are seen; PCREL_LO12 relocs are only queued in
+   the loop and are applied after it, by
+   riscv_resolve_pcrel_lo_relocs.
+
+   Returns FALSE if the pcrel bookkeeping cannot be set up, or as soon
+   as a reloc reaches the final status switch with anything other
+   than bfd_reloc_ok; otherwise returns the result of
+   riscv_resolve_pcrel_lo_relocs.  */
+
+static bfd_boolean
+riscv_elf_relocate_section (bfd *output_bfd,
+			    struct bfd_link_info *info,
+			    bfd *input_bfd,
+			    asection *input_section,
+			    bfd_byte *contents,
+			    Elf_Internal_Rela *relocs,
+			    Elf_Internal_Sym *local_syms,
+			    asection **local_sections)
+{
+  Elf_Internal_Rela *rel;
+  Elf_Internal_Rela *relend;
+  riscv_pcrel_relocs pcrel_relocs;
+  bfd_boolean ret = FALSE;	/* Stays FALSE on every exit taken through `out' from inside the loop.  */
+  asection *sreloc = elf_section_data (input_section)->sreloc;
+  struct riscv_elf_link_hash_table *htab = riscv_elf_hash_table (info);
+  Elf_Internal_Shdr *symtab_hdr = &elf_symtab_hdr (input_bfd);
+  struct elf_link_hash_entry **sym_hashes = elf_sym_hashes (input_bfd);
+  bfd_vma *local_got_offsets = elf_local_got_offsets (input_bfd);
+
+  if (!riscv_init_pcrel_relocs (&pcrel_relocs))
+    return FALSE;
+
+  relend = relocs + input_section->reloc_count;
+  for (rel = relocs; rel < relend; rel++)
+    {
+      unsigned long r_symndx;
+      struct elf_link_hash_entry *h;
+      Elf_Internal_Sym *sym;
+      asection *sec;
+      bfd_vma relocation;
+      bfd_reloc_status_type r = bfd_reloc_ok;
+      const char *name;
+      bfd_vma off, ie_off;
+      bfd_boolean unresolved_reloc, is_ie = FALSE, IsImport = FALSE;
+      bfd_vma pc = sec_addr (input_section) + rel->r_offset;
+      int r_type = ELF64_R_TYPE (rel->r_info), tls_type;
+      reloc_howto_type *howto = riscv_elf_rtype_to_howto (r_type);
+      const char *msg = NULL;
+
+      if (r_type == R_RISCV_GNU_VTINHERIT || r_type == R_RISCV_GNU_VTENTRY)
+	continue;
+
+      /* This is a final link.  */
+      r_symndx = ELF64_R_SYM (rel->r_info);
+      h = NULL;
+      sym = NULL;
+      sec = NULL;
+      unresolved_reloc = FALSE;
+      if (r_symndx < symtab_hdr->sh_info)
+	{
+	  sym = local_syms + r_symndx;
+	  sec = local_sections[r_symndx];
+	  relocation = _bfd_elf_rela_local_sym (output_bfd, sym, &sec, rel);
+	}
+      else
+	{
+	  bfd_boolean warned, ignored;
+
+	  IsImport = RegisterImportReloc(info, input_bfd, input_section, rel, r_symndx, symtab_hdr, sym_hashes, howto);	/* Pulp: TRUE for a weak symbol defined in .pulp.import.  */
+
+	  RELOC_FOR_GLOBAL_SYMBOL (info, input_bfd, input_section, rel,
+				   r_symndx, symtab_hdr, sym_hashes,
+				   h, sec, relocation,
+				   unresolved_reloc, warned, ignored);
+	  if (warned)
+	    {
+	      /* To avoid generating warning messages about truncated
+		 relocations, set the relocation's address to be the same as
+		 the start of this section.  */
+	      if (input_section->output_section != NULL)
+		relocation = input_section->output_section->vma;
+	      else
+		relocation = 0;
+	    }
+	}
+
+      if (sec != NULL && discarded_section (sec))
+	RELOC_AGAINST_DISCARDED_SECTION (info, input_bfd, input_section,
+					 rel, 1, relend, howto, 0, contents);
+
+      if (bfd_link_relocatable (info))
+	continue;
+
+      if (h != NULL)
+	name = h->root.root.string;
+      else
+	{
+	  name = (bfd_elf_string_from_elf_section
+		  (input_bfd, symtab_hdr->sh_link, sym->st_name));
+	  if (name == NULL || *name == '\0')
+	    name = bfd_section_name (input_bfd, sec);
+	}
+
+      switch (r_type)
+	{
+	case R_RISCV_NONE:
+	case R_RISCV_RELAX:
+	case R_RISCV_TPREL_ADD:
+	case R_RISCV_COPY:
+	case R_RISCV_JUMP_SLOT:
+	case R_RISCV_RELATIVE:
+	  /* These require nothing of us at all.  */
+	  continue;
+
+	case R_RISCV_HI20:
+	case R_RISCV_BRANCH:
+	case R_RISCV_RVC_BRANCH:
+	case R_RISCV_RVC_LUI:
+	case R_RISCV_LO12_I:
+	case R_RISCV_LO12_S:
+
+	/* Pulp specific */
+        case R_RISCV_RELU5:
+        case R_RISCV_REL12:
+        case R_RISCV_12_I:
+        case R_RISCV_12_S:
+	/* End of Pulp specific */
+
+	case R_RISCV_SET6:
+	case R_RISCV_SET8:
+	case R_RISCV_SET16:
+	case R_RISCV_SET32:
+	  /* These require no special handling beyond perform_relocation.  */
+	  break;
+
+	case R_RISCV_GOT_HI20:
+	  if (h != NULL)
+	    {
+	      bfd_boolean dyn, pic;
+
+	      off = h->got.offset;
+	      BFD_ASSERT (off != (bfd_vma) -1);
+	      dyn = elf_hash_table (info)->dynamic_sections_created;
+	      pic = bfd_link_pic (info);
+
+	      if (! WILL_CALL_FINISH_DYNAMIC_SYMBOL (dyn, pic, h)
+		  || (pic && SYMBOL_REFERENCES_LOCAL (info, h)))
+		{
+		  /* This is actually a static link, or it is a
+		     -Bsymbolic link and the symbol is defined
+		     locally, or the symbol was forced to be local
+		     because of a version file.  We must initialize
+		     this entry in the global offset table.  Since the
+		     offset must always be a multiple of the word size,
+		     we use the least significant bit to record whether
+		     we have initialized it already.
+
+		     When doing a dynamic link, we create a .rela.got
+		     relocation entry to initialize the value.  This
+		     is done in the finish_dynamic_symbol routine.  */
+		  if ((off & 1) != 0)
+		    off &= ~1;
+		  else
+		    {
+		      bfd_put_64 (output_bfd, relocation,
+				  htab->elf.sgot->contents + off);
+		      h->got.offset |= 1;
+		    }
+		}
+	      else
+		unresolved_reloc = FALSE;
+	    }
+	  else
+	    {
+	      BFD_ASSERT (local_got_offsets != NULL
+			  && local_got_offsets[r_symndx] != (bfd_vma) -1);
+
+	      off = local_got_offsets[r_symndx];
+
+	      /* The offset must always be a multiple of the word size.
+		 So, we can use the least significant bit to record
+		 whether we have already processed this entry.  */
+	      if ((off & 1) != 0)
+		off &= ~1;
+	      else
+		{
+		  if (bfd_link_pic (info))
+		    {
+		      asection *s;
+		      Elf_Internal_Rela outrel;
+
+		      /* We need to generate a R_RISCV_RELATIVE reloc
+			 for the dynamic linker.  */
+		      s = htab->elf.srelgot;
+		      BFD_ASSERT (s != NULL);
+
+		      outrel.r_offset = sec_addr (htab->elf.sgot) + off;
+		      outrel.r_info =
+			ELF64_R_INFO (0, R_RISCV_RELATIVE);
+		      outrel.r_addend = relocation;
+		      relocation = 0;
+		      riscv_elf_append_rela (output_bfd, s, &outrel);
+		    }
+
+		  bfd_put_64 (output_bfd, relocation,
+			      htab->elf.sgot->contents + off);
+		  local_got_offsets[r_symndx] |= 1;
+		}
+	    }
+	  relocation = sec_addr (htab->elf.sgot) + off;
+	  if (!riscv_record_pcrel_hi_reloc (&pcrel_relocs, pc, relocation))
+	    r = bfd_reloc_overflow;	/* Recording fails only when memory runs out; it is reported as an overflow.  */
+	  break;
+
+	case R_RISCV_ADD8:
+	case R_RISCV_ADD16:
+	case R_RISCV_ADD32:
+	case R_RISCV_ADD64:
+	  {
+	    bfd_vma old_value = bfd_get (howto->bitsize, input_bfd,
+					 contents + rel->r_offset);
+	    relocation = old_value + relocation;
+	  }
+	  break;
+
+	case R_RISCV_SUB6:
+	case R_RISCV_SUB8:
+	case R_RISCV_SUB16:
+	case R_RISCV_SUB32:
+	case R_RISCV_SUB64:
+	  {
+	    bfd_vma old_value = bfd_get (howto->bitsize, input_bfd,
+					 contents + rel->r_offset);
+	    relocation = old_value - relocation;
+	  }
+	  break;
+
+	case R_RISCV_CALL_PLT:
+	case R_RISCV_CALL:
+	case R_RISCV_JAL:
+	case R_RISCV_RVC_JUMP:
+	  if (bfd_link_pic (info) && h != NULL && h->plt.offset != MINUS_ONE)
+	    {
+	      /* Refer to the PLT entry.  */
+	      relocation = sec_addr (htab->elf.splt) + h->plt.offset;
+	      unresolved_reloc = FALSE;
+	    }
+	  break;
+
+	case R_RISCV_TPREL_HI20:
+	  relocation = tpoff (info, relocation);
+	  break;
+
+	case R_RISCV_TPREL_LO12_I:
+	case R_RISCV_TPREL_LO12_S:
+	  relocation = tpoff (info, relocation);
+	  break;
+
+	case R_RISCV_TPREL_I:
+	case R_RISCV_TPREL_S:
+	  relocation = tpoff (info, relocation);
+	  if (VALID_ITYPE_IMM (relocation + rel->r_addend))
+	    {
+	      /* We can use tp as the base register.  */
+	      bfd_vma insn = bfd_get_32 (input_bfd, contents + rel->r_offset);
+	      insn &= ~(OP_MASK_RS1 << OP_SH_RS1);
+	      insn |= X_TP << OP_SH_RS1;
+	      bfd_put_32 (input_bfd, insn, contents + rel->r_offset);
+	    }
+	  else
+	    r = bfd_reloc_overflow;
+	  break;
+
+	case R_RISCV_GPREL_I:
+	case R_RISCV_GPREL_S:
+	  {
+	    bfd_vma gp = riscv_global_pointer_value (info);
+	    bfd_boolean x0_base = VALID_ITYPE_IMM (relocation + rel->r_addend);
+	    if (x0_base || VALID_ITYPE_IMM (relocation + rel->r_addend - gp))
+	      {
+		/* We can use x0 or gp as the base register.  */
+		bfd_vma insn = bfd_get_32 (input_bfd, contents + rel->r_offset);
+		insn &= ~(OP_MASK_RS1 << OP_SH_RS1);
+		if (!x0_base)
+		  {
+		    rel->r_addend -= gp;
+		    insn |= X_GP << OP_SH_RS1;
+		  }
+		bfd_put_32 (input_bfd, insn, contents + rel->r_offset);
+	      }
+	    else
+	      r = bfd_reloc_overflow;
+	    break;
+	  }
+
+	case R_RISCV_PCREL_HI20:
+	  if (!riscv_record_pcrel_hi_reloc (&pcrel_relocs, pc,
+					    relocation + rel->r_addend))
+	    r = bfd_reloc_overflow;
+	  break;
+
+	case R_RISCV_PCREL_LO12_I:
+	case R_RISCV_PCREL_LO12_S:
+	  if (riscv_record_pcrel_lo_reloc (&pcrel_relocs, input_section, info,
+					   howto, rel, relocation, name,
+					   contents))
+	    continue;	/* Queued; applied after the loop by riscv_resolve_pcrel_lo_relocs.  */
+	  r = bfd_reloc_overflow;
+	  break;
+
+	case R_RISCV_TLS_DTPREL32:
+	case R_RISCV_TLS_DTPREL64:
+	  relocation = dtpoff (info, relocation);
+	  break;
+
+	case R_RISCV_32:
+	case R_RISCV_64:
+	  if ((input_section->flags & SEC_ALLOC) == 0)
+	    break;
+
+	  if ((bfd_link_pic (info)
+	       && (h == NULL
+		   || ELF_ST_VISIBILITY (h->other) == STV_DEFAULT
+		   || h->root.type != bfd_link_hash_undefweak)
+	       && (! howto->pc_relative
+		   || !SYMBOL_CALLS_LOCAL (info, h)))
+	      || (!bfd_link_pic (info)
+		  && h != NULL
+		  && h->dynindx != -1
+		  && !h->non_got_ref
+		  && ((h->def_dynamic
+		       && !h->def_regular)
+		      || h->root.type == bfd_link_hash_undefweak
+		      || h->root.type == bfd_link_hash_undefined)))
+	    {
+	      Elf_Internal_Rela outrel;
+	      bfd_boolean skip_static_relocation, skip_dynamic_relocation;
+
+	      /* When generating a shared object, these relocations
+		 are copied into the output file to be resolved at run
+		 time.  */
+
+	      outrel.r_offset =
+		_bfd_elf_section_offset (output_bfd, info, input_section,
+					 rel->r_offset);
+	      skip_static_relocation = outrel.r_offset != (bfd_vma) -2;
+	      skip_dynamic_relocation = outrel.r_offset >= (bfd_vma) -2;
+	      outrel.r_offset += sec_addr (input_section);
+
+	      if (skip_dynamic_relocation)
+		memset (&outrel, 0, sizeof outrel);
+	      else if (h != NULL && h->dynindx != -1
+		       && !(bfd_link_pic (info)
+			    && SYMBOLIC_BIND (info, h)
+			    && h->def_regular))
+		{
+		  outrel.r_info = ELF64_R_INFO (h->dynindx, r_type);
+		  outrel.r_addend = rel->r_addend;
+		}
+	      else
+		{
+		  outrel.r_info = ELF64_R_INFO (0, R_RISCV_RELATIVE);
+		  outrel.r_addend = relocation + rel->r_addend;
+		}
+
+	      riscv_elf_append_rela (output_bfd, sreloc, &outrel);
+	      if (skip_static_relocation)
+		continue;
+	    }
+	  break;
+
+	case R_RISCV_TLS_GOT_HI20:
+	  is_ie = TRUE;
+	  /* Fall through.  */
+
+	case R_RISCV_TLS_GD_HI20:
+	  if (h != NULL)
+	    {
+	      off = h->got.offset;
+	      h->got.offset |= 1;
+	    }
+	  else
+	    {
+	      off = local_got_offsets[r_symndx];
+	      local_got_offsets[r_symndx] |= 1;
+	    }
+
+	  tls_type = _bfd_riscv_elf_tls_type (input_bfd, h, r_symndx);
+	  BFD_ASSERT (tls_type & (GOT_TLS_IE | GOT_TLS_GD));
+	  /* If this symbol is referenced by both GD and IE TLS, the IE
+	     reference's GOT slot follows the GD reference's slots.  */
+	  ie_off = 0;
+	  if ((tls_type & GOT_TLS_GD) && (tls_type & GOT_TLS_IE))
+	    ie_off = 2 * GOT_ENTRY_SIZE;
+
+	  if ((off & 1) != 0)
+	    off &= ~1;
+	  else
+	    {
+	      Elf_Internal_Rela outrel;
+	      int indx = 0;
+	      bfd_boolean need_relocs = FALSE;
+
+	      if (htab->elf.srelgot == NULL)
+		abort ();
+
+	      if (h != NULL)
+		{
+		  bfd_boolean dyn, pic;
+		  dyn = htab->elf.dynamic_sections_created;
+		  pic = bfd_link_pic (info);
+
+		  if (WILL_CALL_FINISH_DYNAMIC_SYMBOL (dyn, pic, h)
+		      && (!pic || !SYMBOL_REFERENCES_LOCAL (info, h)))
+		    indx = h->dynindx;
+		}
+
+	      /* The GOT entries have not been initialized yet.  Do it
+	         now, and emit any relocations.  */
+	      if ((bfd_link_pic (info) || indx != 0)
+		  && (h == NULL
+		      || ELF_ST_VISIBILITY (h->other) == STV_DEFAULT
+		      || h->root.type != bfd_link_hash_undefweak))
+		    need_relocs = TRUE;
+
+	      if (tls_type & GOT_TLS_GD)
+		{
+		  if (need_relocs)
+		    {
+		      outrel.r_offset = sec_addr (htab->elf.sgot) + off;
+		      outrel.r_addend = 0;
+		      outrel.r_info = ELF64_R_INFO (indx, R_RISCV_TLS_DTPMOD64);
+		      bfd_put_64 (output_bfd, 0,
+				  htab->elf.sgot->contents + off);
+		      riscv_elf_append_rela (output_bfd, htab->elf.srelgot, &outrel);
+		      if (indx == 0)
+			{
+			  BFD_ASSERT (! unresolved_reloc);
+			  bfd_put_64 (output_bfd,
+				      dtpoff (info, relocation),
+				      (htab->elf.sgot->contents + off +
+				       RISCV_ELF_WORD_BYTES));
+			}
+		      else
+			{
+			  bfd_put_64 (output_bfd, 0,
+				      (htab->elf.sgot->contents + off +
+				       RISCV_ELF_WORD_BYTES));
+			  outrel.r_info = ELF64_R_INFO (indx, R_RISCV_TLS_DTPREL64);
+			  outrel.r_offset += RISCV_ELF_WORD_BYTES;
+			  riscv_elf_append_rela (output_bfd, htab->elf.srelgot, &outrel);
+			}
+		    }
+		  else
+		    {
+		      /* If we are not emitting relocations for a
+			 general dynamic reference, then we must be in a
+			 static link or an executable link with the
+			 symbol binding locally.  Mark it as belonging
+			 to module 1, the executable.  */
+		      bfd_put_64 (output_bfd, 1,
+				  htab->elf.sgot->contents + off);
+		      bfd_put_64 (output_bfd,
+				  dtpoff (info, relocation),
+				  (htab->elf.sgot->contents + off +
+				   RISCV_ELF_WORD_BYTES));
+		   }
+		}
+
+	      if (tls_type & GOT_TLS_IE)
+		{
+		  if (need_relocs)
+		    {
+		      bfd_put_64 (output_bfd, 0,
+				  htab->elf.sgot->contents + off + ie_off);
+		      outrel.r_offset = sec_addr (htab->elf.sgot)
+				       + off + ie_off;
+		      outrel.r_addend = 0;
+		      if (indx == 0)
+			outrel.r_addend = tpoff (info, relocation);
+		      outrel.r_info = ELF64_R_INFO (indx, R_RISCV_TLS_TPREL64);
+		      riscv_elf_append_rela (output_bfd, htab->elf.srelgot, &outrel);
+		    }
+		  else
+		    {
+		      bfd_put_64 (output_bfd, tpoff (info, relocation),
+				  htab->elf.sgot->contents + off + ie_off);
+		    }
+		}
+	    }
+
+	  BFD_ASSERT (off < (bfd_vma) -2);
+	  relocation = sec_addr (htab->elf.sgot) + off + (is_ie ? ie_off : 0);
+	  if (!riscv_record_pcrel_hi_reloc (&pcrel_relocs, pc, relocation))
+	    r = bfd_reloc_overflow;
+	  unresolved_reloc = FALSE;
+	  break;
+
+	default:
+	  r = bfd_reloc_notsupported;
+	}
+
+      /* Dynamic relocs are not propagated for SEC_DEBUGGING sections
+	 because such sections are not SEC_ALLOC and thus ld.so will
+	 not process them.  */
+      if (unresolved_reloc
+	  && !((input_section->flags & SEC_DEBUGGING) != 0
+	       && h->def_dynamic)
+	  && _bfd_elf_section_offset (output_bfd, info, input_section,
+				      rel->r_offset) != (bfd_vma) -1)
+	{
+	  (*_bfd_error_handler)
+	    (_("%B(%A+0x%lx): unresolvable %s relocation against symbol `%s'"),
+	     input_bfd,
+	     input_section,
+	     (long) rel->r_offset,
+	     howto->name,
+	     h->root.root.string);
+	  continue;
+	}
+
+      if (r == bfd_reloc_ok)
+	r = perform_relocation (howto, rel, relocation, input_section,
+				input_bfd, contents, IsImport);
+
+      switch (r)
+	{
+	case bfd_reloc_ok:
+	  continue;
+
+	case bfd_reloc_overflow:
+	  info->callbacks->reloc_overflow
+	    (info, (h ? &h->root : NULL), name, howto->name,
+	     (bfd_vma) 0, input_bfd, input_section, rel->r_offset);
+	  break;
+
+	case bfd_reloc_undefined:
+	  info->callbacks->undefined_symbol
+	    (info, name, input_bfd, input_section, rel->r_offset,
+	     TRUE);
+	  break;
+
+	case bfd_reloc_outofrange:
+	  msg = _("internal error: out of range error");
+	  break;
+
+	case bfd_reloc_notsupported:
+	  msg = _("internal error: unsupported relocation error");
+	  break;
+
+	case bfd_reloc_dangerous:
+	  msg = _("internal error: dangerous relocation");
+	  break;
+
+	default:
+	  msg = _("internal error: unknown error");
+	  break;
+	}
+
+      if (msg)
+	info->callbacks->warning
+	  (info, msg, name, input_bfd, input_section, rel->r_offset);
+      goto out;	/* Reached only for a status other than bfd_reloc_ok; ret is still FALSE.  */
+    }
+
+  ret = riscv_resolve_pcrel_lo_relocs (&pcrel_relocs);
+out:
+  riscv_free_pcrel_relocs (&pcrel_relocs);
+  return ret;
+}
+
+/* Emit the per-symbol dynamic data for H: its PLT stub and .got.plt slot,
+   its GOT entry and its copy relocation, patching SYM where required.  */
+
+static bfd_boolean
+riscv_elf_finish_dynamic_symbol (bfd *output_bfd,
+				 struct bfd_link_info *info,
+				 struct elf_link_hash_entry *h,
+				 Elf_Internal_Sym *sym)
+{
+  const struct elf_backend_data *backend = get_elf_backend_data (output_bfd);
+  struct riscv_elf_link_hash_table *table = riscv_elf_hash_table (info);
+
+  if (h->plt.offset != (bfd_vma) -1)
+    {
+      /* A PLT entry was allocated for this symbol.  */
+      bfd_byte *dest;
+      bfd_vma insn, plt_base, slot, gotplt_addr;
+      uint32_t stub[PLT_ENTRY_INSNS];
+      Elf_Internal_Rela reloc;
+
+      BFD_ASSERT (h->dynindx != -1);
+
+      /* Address at which the PLT header lives.  */
+      plt_base = sec_addr (table->elf.splt);
+
+      /* Ordinal of this stub among the PLT entries.  */
+      slot = (h->plt.offset - PLT_HEADER_SIZE) / PLT_ENTRY_SIZE;
+
+      /* Address of the matching .got.plt word.  */
+      gotplt_addr = riscv_elf_got_plt_val (slot, info);
+
+      /* Where the stub is stored inside the .plt contents.  */
+      dest = table->elf.splt->contents + h->plt.offset;
+
+      /* Generate the stub and copy it out one instruction at a time.  */
+      riscv_make_plt_entry (gotplt_addr, plt_base + h->plt.offset,
+			    stub);
+      for (insn = 0; insn < PLT_ENTRY_INSNS; insn++)
+	bfd_put_32 (output_bfd, stub[insn], dest + 4*insn);
+
+      /* The .got.plt word initially holds the address of the .plt section.  */
+      dest = table->elf.sgotplt->contents
+	     + (gotplt_addr - sec_addr (table->elf.sgotplt));
+      bfd_put_64 (output_bfd, sec_addr (table->elf.splt), dest);
+
+      /* Emit the JUMP_SLOT relocation into .rela.plt.  */
+      reloc.r_info = ELF64_R_INFO (h->dynindx, R_RISCV_JUMP_SLOT);
+      reloc.r_offset = gotplt_addr;
+      reloc.r_addend = 0;
+
+      dest = table->elf.srelplt->contents + slot * sizeof (Elf64_External_Rela);
+      backend->s->swap_reloca_out (output_bfd, &reloc, dest);
+
+      if (!h->def_regular)
+	{
+	  /* No regular definition: report the symbol as undefined instead
+	     of as defined in .plt, keeping its value.  */
+	  sym->st_shndx = SHN_UNDEF;
+	  /* A weak reference must additionally have its value cleared;
+	     otherwise the PLT entry would act as a definition even when
+	     nothing defines the symbol, and it could never compare equal
+	     to NULL.  */
+	  if (!h->ref_regular_nonweak)
+	    sym->st_value = 0;
+	}
+    }
+
+  if (h->got.offset != (bfd_vma) -1
+      && (riscv_elf_hash_entry (h)->tls_type & (GOT_TLS_GD | GOT_TLS_IE)) == 0)
+    {
+      asection *got_sec = table->elf.sgot;
+      asection *relgot_sec = table->elf.srelgot;
+      Elf_Internal_Rela got_reloc;
+      bfd_vma got_off;
+
+      /* Non-TLS symbol with a GOT slot: emit its dynamic relocation.  */
+
+      BFD_ASSERT (got_sec != NULL && relgot_sec != NULL);
+      /* Bit 0 of the stored offset is masked off before use.  */
+      got_off = h->got.offset & ~(bfd_vma) 1;
+
+      got_reloc.r_offset = sec_addr (got_sec) + got_off;
+
+      /* For a position-independent link where the symbol is defined by a
+	 regular object and is bound locally (-Bsymbolic, or made local so
+	 that it has no dynamic index), a RELATIVE relocation is enough;
+	 per the original note the GOT word itself was already initialized
+	 by relocate_section.  Otherwise relocate against the symbol.  */
+      if (!bfd_link_pic (info)
+	  || !(info->symbolic || h->dynindx == -1)
+	  || !h->def_regular)
+	{
+	  BFD_ASSERT (h->dynindx != -1);
+	  got_reloc.r_info = ELF64_R_INFO (h->dynindx, R_RISCV_64);
+	  got_reloc.r_addend = 0;
+	}
+      else
+	{
+	  asection *def_sec = h->root.u.def.section;
+	  got_reloc.r_info = ELF64_R_INFO (0, R_RISCV_RELATIVE);
+	  got_reloc.r_addend = (h->root.u.def.value
+				+ def_sec->output_section->vma
+				+ def_sec->output_offset);
+	}
+
+      bfd_put_64 (output_bfd, 0, got_sec->contents + got_off);
+      /* Append the relocation to the GOT relocation section.  */
+      riscv_elf_append_rela (output_bfd, relgot_sec, &got_reloc);
+    }
+
+  if (h->needs_copy)
+    {
+      asection *def_sec = h->root.u.def.section;
+      Elf_Internal_Rela copy_reloc;
+      asection *rel_sec;
+
+      /* The symbol requires a COPY relocation at its definition.  */
+      BFD_ASSERT (h->dynindx != -1);
+
+      /* Pick the relocation section matching where the copy lives.  */
+      rel_sec = (def_sec == table->elf.sdynrelro
+		 ? table->elf.sreldynrelro : table->elf.srelbss);
+      copy_reloc.r_offset = sec_addr (def_sec) + h->root.u.def.value;
+      copy_reloc.r_info = ELF64_R_INFO (h->dynindx, R_RISCV_COPY);
+      copy_reloc.r_addend = 0;
+      riscv_elf_append_rela (output_bfd, rel_sec, &copy_reloc);
+    }
+
+  /* The special dynamic, GOT and PLT hash entries are emitted as absolute.  */
+  if (h == table->elf.hdynamic
+      || h == table->elf.hgot || h == table->elf.hplt)
+    sym->st_shndx = SHN_ABS;
+
+  return TRUE;
+}
+
+/* Patch the .dynamic entries whose values depend on the final layout
+   (DT_PLTGOT, DT_JMPREL and DT_PLTRELSZ); other tags are left as is.  */
+static bfd_boolean
+riscv_finish_dyn (bfd *output_bfd, struct bfd_link_info *info,
+		  bfd *dynobj, asection *sdyn)
+{
+  const struct elf_backend_data *backend = get_elf_backend_data (output_bfd);
+  struct riscv_elf_link_hash_table *table = riscv_elf_hash_table (info);
+  const size_t entry_size = backend->s->sizeof_dyn;
+  bfd_byte *const limit = sdyn->contents + sdyn->size;
+  bfd_byte *cursor;
+
+  for (cursor = sdyn->contents; cursor < limit; cursor += entry_size)
+    {
+      Elf_Internal_Dyn entry;
+      asection *target;
+
+      backend->s->swap_dyn_in (dynobj, cursor, &entry);
+
+      if (entry.d_tag == DT_PLTGOT)
+	{
+	  /* Output address of the .got.plt section.  */
+	  target = table->elf.sgotplt;
+	  entry.d_un.d_ptr = target->output_section->vma + target->output_offset;
+	}
+      else if (entry.d_tag == DT_JMPREL)
+	{
+	  /* Output address of the PLT relocation section.  */
+	  target = table->elf.srelplt;
+	  entry.d_un.d_ptr = target->output_section->vma + target->output_offset;
+	}
+      else if (entry.d_tag == DT_PLTRELSZ)
+	/* Size in bytes of the PLT relocation section.  */
+	entry.d_un.d_val = table->elf.srelplt->size;
+      else
+	continue;
+
+      backend->s->swap_dyn_out (output_bfd, &entry, cursor);
+    }
+  return TRUE;
+}
+
+static bfd_boolean
+riscv_elf_finish_dynamic_sections (bfd *output_bfd,
+				   struct bfd_link_info *info)
+{
+  struct riscv_elf_link_hash_table *table = riscv_elf_hash_table (info);
+  asection *dynamic_sec;
+  bfd *dynobj;
+
+  /* Final fix-ups of .dynamic, the PLT header and the GOT headers.  */
+  BFD_ASSERT (table != NULL);
+  dynobj = table->elf.dynobj;
+
+  dynamic_sec = bfd_get_linker_section (dynobj, ".dynamic");
+
+  if (elf_hash_table (info)->dynamic_sections_created)
+    {
+      asection *plt_sec = table->elf.splt;
+      bfd_boolean ok;
+
+      /* Both sections are expected once dynamic sections were created.  */
+      BFD_ASSERT (plt_sec != NULL && dynamic_sec != NULL);
+
+      ok = riscv_finish_dyn (output_bfd, info, dynobj, dynamic_sec);
+
+      if (ok != TRUE)
+	return ok;
+
+      /* A non-empty PLT gets its header instructions written first.  */
+      if (plt_sec->size > 0)
+	{
+	  uint32_t header[PLT_HEADER_INSNS];
+	  int word;
+	  riscv_make_plt_header (sec_addr (table->elf.sgotplt),
+				 sec_addr (plt_sec), header);
+
+	  for (word = 0; word < PLT_HEADER_INSNS; word++)
+	    bfd_put_32 (output_bfd, header[word], plt_sec->contents + 4*word);
+	}
+
+      elf_section_data (plt_sec->output_section)->this_hdr.sh_entsize
+	= PLT_ENTRY_SIZE;
+    }
+
+  if (table->elf.sgotplt != NULL)
+    {
+      asection *gotplt_out = table->elf.sgotplt->output_section;
+
+      if (bfd_is_abs_section (gotplt_out))
+	{
+	  (*_bfd_error_handler)
+	    (_("discarded output section: `%A'"), table->elf.sgotplt);
+	  return FALSE;
+	}
+
+      if (table->elf.sgotplt->size > 0)
+	{
+	  /* The two leading .got.plt words are needed by the dynamic
+	     linker; seed them with -1 and 0.  */
+	  bfd_byte *words = table->elf.sgotplt->contents;
+	  bfd_put_64 (output_bfd, (bfd_vma) -1, words);
+	  bfd_put_64 (output_bfd, (bfd_vma) 0, words + GOT_ENTRY_SIZE);
+	}
+
+      elf_section_data (gotplt_out)->this_hdr.sh_entsize = GOT_ENTRY_SIZE;
+    }
+
+  if (table->elf.sgot != NULL)
+    {
+      asection *got_out = table->elf.sgot->output_section;
+
+      if (table->elf.sgot->size > 0)
+	{
+	  /* GOT word 0 holds the address of .dynamic, or 0 when there is
+	     no such section.  */
+	  bfd_vma dyn_addr = dynamic_sec != NULL ? sec_addr (dynamic_sec) : 0;
+	  bfd_put_64 (output_bfd, dyn_addr, table->elf.sgot->contents);
+	}
+
+      elf_section_data (got_out)->this_hdr.sh_entsize = GOT_ENTRY_SIZE;
+    }
+
+  return TRUE;
+}
+
+/* Compute the address of PLT stub number I within section PLT: stubs
+   follow the PLT header and are PLT_ENTRY_SIZE bytes each.  REL is unused.  */
+
+static bfd_vma
+riscv_elf_plt_sym_val (bfd_vma i, const asection *plt,
+		       const arelent *rel ATTRIBUTE_UNUSED)
+{
+  return (plt->vma + PLT_HEADER_SIZE) + (i * PLT_ENTRY_SIZE);
+}
+
+static enum elf_reloc_type_class
+riscv_reloc_type_class (const struct bfd_link_info *info ATTRIBUTE_UNUSED,
+			const asection *rel_sec ATTRIBUTE_UNUSED,
+			const Elf_Internal_Rela *rela)
+{
+  /* Classify a dynamic relocation by its type; anything other than
+     RELATIVE, JUMP_SLOT and COPY is reported as a normal relocation.  */
+  const unsigned long r_type = ELF64_R_TYPE (rela->r_info);
+
+  if (r_type == R_RISCV_RELATIVE)
+    return reloc_class_relative;
+  if (r_type == R_RISCV_JUMP_SLOT)
+    return reloc_class_plt;
+  if (r_type == R_RISCV_COPY)
+    return reloc_class_copy;
+  return reloc_class_normal;
+}
+
+/* Fold the ELF header flags of input IBFD into the output BFD, refusing
+   inputs whose target or floating-point ABI differs from the output.  */
+
+static bfd_boolean
+_bfd_riscv_elf_merge_private_bfd_data (bfd *ibfd, struct bfd_link_info *info)
+{
+  bfd *obfd = info->output_bfd;
+  const flagword in_flags = elf_elfheader (ibfd)->e_flags;
+  const flagword out_flags = elf_elfheader (obfd)->e_flags;
+
+  /* Nothing to merge unless both files are RISC-V ELF.  */
+  if (!is_riscv_elf (ibfd) || !is_riscv_elf (obfd))
+    return TRUE;
+
+  /* Input and output must use the very same BFD target.  */
+  if (strcmp (bfd_get_target (ibfd), bfd_get_target (obfd)) != 0)
+    {
+      (*_bfd_error_handler)
+	(_("%B: ABI is incompatible with that of the selected emulation:\n"
+	   "  target emulation `%s' does not match `%s'"),
+	 ibfd, bfd_get_target (ibfd), bfd_get_target (obfd));
+      return FALSE;
+    }
+
+  if (!_bfd_elf_merge_object_attributes (ibfd, info))
+    return FALSE;
+
+  /* The first input to arrive simply donates its flags to the output.  */
+  if (!elf_flags_init (obfd))
+    {
+      elf_elfheader (obfd)->e_flags = in_flags;
+      elf_flags_init (obfd) = TRUE;
+      return TRUE;
+    }
+
+  /* A difference in the float ABI bits is a hard error.  */
+  if (((out_flags ^ in_flags) & EF_RISCV_FLOAT_ABI) != 0)
+    {
+      (*_bfd_error_handler)
+	(_("%B: can't link hard-float modules with soft-float modules"), ibfd);
+      bfd_set_error (bfd_error_bad_value);
+      return FALSE;
+    }
+
+  /* RVC and non-RVC objects may be mixed; the RVC flag is kept.  */
+  elf_elfheader (obfd)->e_flags |= in_flags & EF_RISCV_RVC;
+
+  return TRUE;
+}
+
+/* Remove COUNT bytes at offset ADDR of section SEC during relaxation,
+   sliding relocations and symbols that lie beyond them down to match.  */
+static bfd_boolean
+riscv_relax_delete_bytes (bfd *abfd, asection *sec, bfd_vma addr, size_t count)
+{
+  struct bfd_elf_section_data *sec_data = elf_section_data (sec);
+  Elf_Internal_Shdr *symtab = &elf_tdata (abfd)->symtab_hdr;
+  struct elf_link_hash_entry **hashes = elf_sym_hashes (abfd);
+  unsigned int shndx = _bfd_elf_section_from_bfd_section (abfd, sec);
+  bfd_byte *bytes = sec_data->this_hdr.contents;
+  const bfd_vma old_end = sec->size;
+  unsigned int n, nglobals;
+
+  /* Close the gap in the contents and shrink the section.  */
+  memmove (bytes + addr, bytes + addr + count, old_end - addr - count);
+  sec->size -= count;
+
+  /* Relocations located after ADDR follow the moved bytes.  Addends are
+     left alone: per the original note every PC-relative reference is made
+     against a symbol, and symbols are adjusted below.  */
+  for (n = 0; n < sec->reloc_count; n++)
+    if (addr < sec_data->relocs[n].r_offset && sec_data->relocs[n].r_offset < old_end)
+      sec_data->relocs[n].r_offset -= count;
+
+  /* Local symbols that live in this section.  */
+  for (n = 0; n < symtab->sh_info; n++)
+    {
+      Elf_Internal_Sym *lsym = (Elf_Internal_Sym *) symtab->contents + n;
+      if (lsym->st_shndx != shndx)
+	continue;
+
+      /* A symbol positioned inside the range that was moved slides
+	 down by COUNT.  */
+      if (lsym->st_value > addr && lsym->st_value <= old_end)
+	lsym->st_value -= count;
+
+      /* A symbol that starts at or before ADDR but whose end falls in
+	 the moved range (it spans the deleted bytes) shrinks by COUNT.
+	 This test sees the value as possibly adjusted just above.  */
+      if (lsym->st_value <= addr
+	  && lsym->st_value + lsym->st_size > addr
+	  && lsym->st_value + lsym->st_size <= old_end)
+	lsym->st_size -= count;
+    }
+
+  /* Global symbols: one hash entry is visited for every symbol table
+     slot beyond the first sh_info ones.  */
+  nglobals = symtab->sh_size / sizeof (Elf64_External_Sym) - symtab->sh_info;
+
+  for (n = 0; n < nglobals; n++)
+    {
+      struct elf_link_hash_entry *gsym = hashes[n];
+
+      if ((gsym->root.type != bfd_link_hash_defined
+	   && gsym->root.type != bfd_link_hash_defweak)
+	  || gsym->root.u.def.section != sec)
+	continue;
+
+      /* Same value adjustment as for local symbols.  */
+      if (gsym->root.u.def.value > addr
+	  && gsym->root.u.def.value <= old_end)
+	gsym->root.u.def.value -= count;
+
+      /* Same size adjustment as for local symbols.  */
+      if (gsym->root.u.def.value <= addr
+	  && gsym->root.u.def.value + gsym->size > addr
+	  && gsym->root.u.def.value + gsym->size <= old_end)
+	gsym->size -= count;
+    }
+
+  return TRUE;
+}
+
+typedef bfd_boolean (*relax_func_t) (bfd *abfd, asection *sec, asection *sym_sec,
+				     struct bfd_link_info *link_info, Elf_Internal_Rela *rel,
+				     bfd_vma symval, bfd_vma max_alignment, bfd_vma reserve_size,
+				     bfd_boolean is_import, bfd_boolean *again);
+
+/* Relax an AUIPC + JALR call sequence into a single C.J/C.JAL, JAL or
+   JALR-from-x0, deleting the bytes freed and setting *AGAIN.  */
+static bfd_boolean
+_bfd_riscv_relax_call (bfd *abfd, asection *sec, asection *sym_sec,
+		       struct bfd_link_info *link_info,
+		       Elf_Internal_Rela *rel,
+		       bfd_vma symval,
+		       bfd_vma max_alignment,
+		       bfd_vma reserve_size ATTRIBUTE_UNUSED,
+		       bfd_boolean is_import,
+		       bfd_boolean *again)
+{
+  bfd_byte *contents = elf_section_data (sec)->this_hdr.contents;
+  bfd_signed_vma foff = symval - (sec_addr (sec) + rel->r_offset);
+  bfd_boolean near_zero = (symval + RISCV_IMM_REACH/2) < RISCV_IMM_REACH;
+  bfd_vma auipc, jalr;
+  int rd, r_type, len = 4, rvc = elf_elfheader (abfd)->e_flags & EF_RISCV_RVC;
+  static bfd_boolean Mem20Range = TRUE; /* Never cleared in this function.  */
+
+  /* If the call crosses section boundaries, an alignment directive could
+     cause the PC-relative offset to later increase.  */
+  if (VALID_UJTYPE_IMM (foff) && sym_sec->output_section != sec->output_section)
+    foff += (foff < 0 ? -max_alignment : max_alignment);
+
+  /* Bail out when this function call cannot be shortened.  */
+  if ((is_import && !Mem20Range)
+      || (!VALID_UJTYPE_IMM (foff) && !(!bfd_link_pic (link_info) && near_zero)))
+    return TRUE;
+
+  /* Shorten the function call: fetch both instructions of the pair.  */
+  BFD_ASSERT (rel->r_offset + 8 <= sec->size);
+
+  auipc = bfd_get_32 (abfd, contents + rel->r_offset);
+  jalr = bfd_get_32 (abfd, contents + rel->r_offset + 4);
+  rd = (jalr >> OP_SH_RD) & OP_MASK_RD;
+  rvc = rvc && VALID_RVC_J_IMM (foff) && ARCH_SIZE == 32;
+  rvc = rvc && !is_import;	/* Imports never use the compressed form.  */
+
+  if (rvc && (rd == 0 || rd == X_RA))
+    {
+      /* Relax to C.J[AL] rd, addr (2 bytes instead of 8).  */
+      r_type = R_RISCV_RVC_JUMP;
+      auipc = rd == 0 ? MATCH_C_J : MATCH_C_JAL;
+      len = 2;
+    }
+  else if (VALID_UJTYPE_IMM (foff) || is_import)
+    {
+      /* Relax to JAL rd, addr; imports always take this form.  */
+      r_type = R_RISCV_JAL;
+      auipc = MATCH_JAL | (rd << OP_SH_RD);
+    }
+  else /* near_zero */
+    {
+      /* Relax to JALR rd, x0, addr.  */
+      r_type = R_RISCV_LO12_I;
+      auipc = MATCH_JALR | (rd << OP_SH_RD);
+    }
+
+  /* Replace the R_RISCV_CALL reloc, keeping its symbol.  */
+  rel->r_info = ELF64_R_INFO (ELF64_R_SYM (rel->r_info), r_type);
+  /* Overwrite the AUIPC with the LEN-byte replacement instruction.  */
+  bfd_put (8 * len, abfd, auipc, contents + rel->r_offset);
+
+  /* Delete the now unnecessary JALR (and half of the AUIPC for RVC).  */
+  *again = TRUE;
+  return riscv_relax_delete_bytes (abfd, sec, rel->r_offset + len, 8 - len);
+}
+
+/* Return, in bytes, the largest alignment among the sections of the BFD
+   that owns SEC's output section.  */
+static bfd_vma
+_bfd_riscv_get_max_alignment (asection *sec)
+{
+  asection *cur = sec->output_section->owner->sections;
+  unsigned int widest = 0;
+
+  while (cur != NULL)
+    {
+      if (widest < cur->alignment_power)
+	widest = cur->alignment_power;
+      cur = cur->next;
+    }
+  return (bfd_vma) 1 << widest;
+}
+
+/* Relax absolute (non-PIC) data references: rewrite LO12 relocations as
+   gp-relative and drop the LUI when in reach, else try LUI -> C.LUI.  */
+static bfd_boolean
+_bfd_riscv_relax_lui (bfd *abfd,
+		      asection *sec,
+		      asection *sym_sec,
+		      struct bfd_link_info *link_info,
+		      Elf_Internal_Rela *rel,
+		      bfd_vma symval,
+		      bfd_vma max_alignment,
+		      bfd_vma reserve_size,
+		      bfd_boolean is_import,
+		      bfd_boolean *again)
+{
+  bfd_byte *bytes = elf_section_data (sec)->this_hdr.contents;
+  bfd_vma gp_value = riscv_global_pointer_value (link_info);
+  int rvc_enabled = elf_elfheader (abfd)->e_flags & EF_RISCV_RVC;
+
+  /* Skip imports, and mergeable data or code that may still move.  */
+  if (is_import || (sym_sec->flags & (SEC_MERGE | SEC_CODE)) != 0)
+    return TRUE;
+
+  BFD_ASSERT (rel->r_offset + 4 <= sec->size);
+
+  if (gp_value != 0)
+    {
+      /* When the gp symbol and the referenced symbol share an output
+	 section, only that section's alignment needs to be considered.  */
+      struct bfd_link_hash_entry *gp_sym
+	= bfd_link_hash_lookup (link_info->hash, RISCV_GP_SYMBOL, FALSE, FALSE,
+				TRUE);
+      if (gp_sym->u.def.section->output_section == sym_sec->output_section)
+	max_alignment = (bfd_vma) 1 << sym_sec->output_section->alignment_power;
+    }
+
+  /* Reachable from x0, or from gp with slack for alignment padding and
+     the reserved size?  */
+  if (VALID_ITYPE_IMM (symval)
+      || (symval >= gp_value
+	  && VALID_ITYPE_IMM (symval - gp_value + max_alignment + reserve_size))
+      || (symval < gp_value
+	  && VALID_ITYPE_IMM (symval - gp_value - max_alignment - reserve_size)))
+    {
+      unsigned symndx = ELF64_R_SYM (rel->r_info);
+      switch (ELF64_R_TYPE (rel->r_info))
+	{
+	case R_RISCV_LO12_I:
+	  rel->r_info = ELF64_R_INFO (symndx, R_RISCV_GPREL_I);
+	  return TRUE;
+
+	case R_RISCV_LO12_S:
+	  rel->r_info = ELF64_R_INFO (symndx, R_RISCV_GPREL_S);
+	  return TRUE;
+
+	case R_RISCV_HI20:
+	  /* The LUI is no longer needed: drop it and its relocation.  */
+	  *again = TRUE;
+	  rel->r_info = ELF64_R_INFO (0, R_RISCV_NONE);
+	  return riscv_relax_delete_bytes (abfd, sec, rel->r_offset, 4);
+
+	default:
+	  abort ();
+	}
+    }
+
+  /* Otherwise try LUI -> C.LUI.  Later alignment may push the section
+     forward, so also require the value one maximum page further to fit.  */
+  if (rvc_enabled
+      && ELF64_R_TYPE (rel->r_info) == R_RISCV_HI20
+      && VALID_RVC_LUI_IMM (RISCV_CONST_HIGH_PART (symval))
+      && VALID_RVC_LUI_IMM (RISCV_CONST_HIGH_PART (symval + ELF_MAXPAGESIZE)))
+    {
+      /* C.LUI is not legal with rd == x2/sp; leave such a LUI as is.  */
+      bfd_vma insn = bfd_get_32 (abfd, bytes + rel->r_offset);
+      if (((insn >> OP_SH_RD) & OP_MASK_RD) == X_SP)
+	return TRUE;
+
+      insn = MATCH_C_LUI | (insn & (OP_MASK_RD << OP_SH_RD));
+      bfd_put_32 (abfd, insn, bytes + rel->r_offset);
+
+      /* The relocation becomes its compressed counterpart.  */
+      rel->r_info = ELF64_R_INFO (ELF64_R_SYM (rel->r_info), R_RISCV_RVC_LUI);
+
+      *again = TRUE;
+      return riscv_relax_delete_bytes (abfd, sec, rel->r_offset + 2, 2);
+    }
+
+  return TRUE;
+}
+
+/* Relax non-PIC TLS accesses whose tp-relative offset needs no high
+   part: low-part relocations become tp-relative, the rest is deleted.  */
+static bfd_boolean
+_bfd_riscv_relax_tls_le (bfd *abfd,
+			 asection *sec,
+			 asection *sym_sec ATTRIBUTE_UNUSED,
+			 struct bfd_link_info *link_info,
+			 Elf_Internal_Rela *rel,
+			 bfd_vma symval,
+			 bfd_vma max_alignment ATTRIBUTE_UNUSED,
+			 bfd_vma reserve_size ATTRIBUTE_UNUSED,
+			 bfd_boolean is_import,
+			 bfd_boolean *again)
+{
+  unsigned long r_type = ELF64_R_TYPE (rel->r_info);
+
+  /* Only symbols whose tp offset has a zero high part qualify, and
+     imports are never relaxed.  */
+  if (RISCV_CONST_HIGH_PART (tpoff (link_info, symval)) != 0 || is_import)
+    return TRUE;
+
+  BFD_ASSERT (rel->r_offset + 4 <= sec->size);
+
+  if (r_type == R_RISCV_TPREL_LO12_I)
+    rel->r_info = ELF64_R_INFO (ELF64_R_SYM (rel->r_info), R_RISCV_TPREL_I);
+  else if (r_type == R_RISCV_TPREL_LO12_S)
+    rel->r_info = ELF64_R_INFO (ELF64_R_SYM (rel->r_info), R_RISCV_TPREL_S);
+  else if (r_type == R_RISCV_TPREL_HI20 || r_type == R_RISCV_TPREL_ADD)
+    {
+      /* The instruction and its relocation are no longer needed.  */
+      rel->r_info = ELF64_R_INFO (0, R_RISCV_NONE);
+      *again = TRUE;
+      return riscv_relax_delete_bytes (abfd, sec, rel->r_offset, 4);
+    }
+  else
+    /* Any other relocation type is a caller error.  */
+    abort ();
+
+  return TRUE;
+}
+
+/* Handle R_RISCV_ALIGN: keep just enough of the NOP padding recorded in
+   the addend to reach the requested alignment, and delete the rest.  */
+static bfd_boolean
+_bfd_riscv_relax_align (bfd *abfd, asection *sec,
+			asection *sym_sec ATTRIBUTE_UNUSED,
+			struct bfd_link_info *link_info ATTRIBUTE_UNUSED,
+			Elf_Internal_Rela *rel,
+			bfd_vma symval,
+			bfd_vma max_alignment ATTRIBUTE_UNUSED,
+			bfd_vma reserve_size ATTRIBUTE_UNUSED,
+			bfd_boolean is_import ATTRIBUTE_UNUSED,
+			bfd_boolean *again ATTRIBUTE_UNUSED)
+{
+  bfd_byte *bytes = elf_section_data (sec)->this_hdr.contents;
+  bfd_vma boundary = 1, off;
+  while (boundary <= rel->r_addend)
+    boundary <<= 1;
+
+  symval -= rel->r_addend;
+  bfd_vma target = ((symval - 1) & ~(boundary - 1)) + boundary;
+  bfd_vma pad = target - symval;
+
+  /* No further relaxation of this section once an align was handled.  */
+  sec->sec_flg0 = TRUE;
+
+  /* Fail when the recorded NOPs cannot reach the alignment at all.  */
+  if (rel->r_addend < pad)
+    return FALSE;
+
+  /* The relocation itself is consumed.  */
+  rel->r_info = ELF64_R_INFO (0, R_RISCV_NONE);
+
+  /* Exactly the right amount of padding already: nothing to rewrite.  */
+  if (pad == rel->r_addend)
+    return TRUE;
+
+  /* Lay down full-size NOPs over the padding that is kept.  */
+  for (off = 0; off < (pad & -4); off += 4)
+    bfd_put_32 (abfd, RISCV_NOP, bytes + rel->r_offset + off);
+
+  /* A remainder that is not a multiple of 4 takes one RVC NOP.  */
+  if (pad % 4 != 0)
+    bfd_put_16 (abfd, RVC_NOP, bytes + rel->r_offset + off);
+
+  /* Everything past the kept padding is removed.  */
+  return riscv_relax_delete_bytes (abfd, sec, rel->r_offset + pad,
+				   rel->r_addend - pad);
+}
+/* Turn a PC-relative reference to an imported symbol into an absolute
+   one: the HI20 instruction becomes a LUI and LO12 uses R_RISCV_LO12_I.  */
+static bfd_boolean
+_bfd_riscv_relax_import_pcrel (bfd *abfd, asection *sec,
+			       asection *sym_sec ATTRIBUTE_UNUSED,
+			       struct bfd_link_info *link_info ATTRIBUTE_UNUSED,
+			       Elf_Internal_Rela *rel,
+			       bfd_vma symval ATTRIBUTE_UNUSED,
+			       bfd_vma max_alignment ATTRIBUTE_UNUSED,
+			       bfd_vma reserve_size ATTRIBUTE_UNUSED,
+			       bfd_boolean is_import,
+			       bfd_boolean *again ATTRIBUTE_UNUSED)
+{
+  bfd_byte *contents = elf_section_data (sec)->this_hdr.contents;
+  unsigned sym = ELF64_R_SYM (rel->r_info);
+  /* Only imports are rewritten.  *AGAIN is left alone: no bytes are
+     deleted here, and clearing it would discard an earlier request.  */
+  if (!is_import)
+    return TRUE;
+  switch (ELF64_R_TYPE (rel->r_info))
+    {
+    case R_RISCV_PCREL_LO12_I:
+      rel->r_info = ELF64_R_INFO (sym, R_RISCV_LO12_I);
+      break;
+    case R_RISCV_PCREL_HI20:
+      {
+	bfd_vma lui = bfd_get_32 (abfd, contents + rel->r_offset);
+	/* Become a LUI with the same rd, relocated as absolute HI20.  */
+	lui = (lui & (OP_MASK_RD << OP_SH_RD)) | MATCH_LUI;
+	bfd_put_32 (abfd, lui, contents + rel->r_offset);
+	rel->r_info = ELF64_R_INFO (sym, R_RISCV_HI20);
+	/* Assumes the next reloc covers the symbol's low part and forces
+	   it pseudo-absolute.  NOTE(review): the relax_section caller only
+	   gets here when REL + 1 is an R_RISCV_RELAX -- confirm intent.  */
+	(rel + 1)->r_info = ELF64_R_INFO (sym, R_RISCV_LO12_I);
+      }
+      break;
+    }
+  return TRUE;
+}
+
+static bfd_boolean
+_bfd_riscv_relax_got_ref (bfd *abfd, asection *sec,
+			  asection *sym_sec ATTRIBUTE_UNUSED,
+			  struct bfd_link_info *link_info ATTRIBUTE_UNUSED,
+			  Elf_Internal_Rela *rel,
+			  bfd_vma symval ATTRIBUTE_UNUSED,
+			  bfd_vma max_alignment ATTRIBUTE_UNUSED,
+			  bfd_vma reserve_size ATTRIBUTE_UNUSED,
+			  bfd_boolean is_import ATTRIBUTE_UNUSED,
+			  bfd_boolean *again ATTRIBUTE_UNUSED)
+{
+  /* Downgrade a GOT_HI20 reference to a plain PC-relative HI20/LO12 pair.  */
+  bfd_byte *bytes = elf_section_data (sec)->this_hdr.contents;
+  Elf_Internal_Rela *lo_rel = rel + 1;
+  const bfd_vma keep_regs = ((OP_MASK_RD << OP_SH_RD)
+			     | (OP_MASK_RS1 << OP_SH_RS1));
+  bfd_vma lo_insn;
+
+  rel->r_info = ELF64_R_INFO (ELF64_R_SYM (rel->r_info), R_RISCV_PCREL_HI20);
+
+  /* The instruction of the following relocation becomes an ADDI (same
+     rd/rs1) instead of the usual load from the GOT.  */
+  lo_insn = bfd_get_32 (abfd, bytes + lo_rel->r_offset);
+  bfd_put_32 (abfd, (lo_insn & keep_regs) | MATCH_ADDI, bytes + lo_rel->r_offset);
+  lo_rel->r_info = ELF64_R_INFO (ELF64_R_SYM (lo_rel->r_info), R_RISCV_PCREL_LO12_I);
+
+  return TRUE;
+}
+
+/* Relax section SEC of ABFD.  Pass 0 shortens code sequences unless that
+   is disabled; pass 1, which cannot be disabled, handles code alignment
+   directives.  *AGAIN starts FALSE; the handlers may set it to TRUE.  */
+static bfd_boolean
+_bfd_riscv_relax_section (bfd *abfd, asection *sec,
+			  struct bfd_link_info *info,
+			  bfd_boolean *again)
+{
+  Elf_Internal_Shdr *symtab_hdr = &elf_symtab_hdr (abfd);
+  struct riscv_elf_link_hash_table *htab = riscv_elf_hash_table (info);
+  struct bfd_elf_section_data *data = elf_section_data (sec);
+  Elf_Internal_Rela *relocs;
+  bfd_boolean ret = FALSE;
+  unsigned int i;
+  bfd_vma max_alignment, reserve_size = 0;
+
+  *again = FALSE;
+
+  if (bfd_link_relocatable (info)
+      || sec->sec_flg0
+      || (sec->flags & SEC_RELOC) == 0
+      || sec->reloc_count == 0
+      || (info->disable_target_specific_optimizations
+	  && info->relax_pass == 0))
+    return TRUE;
+
+  /* Read this BFD's relocs if we haven't done so already.  */
+  if (data->relocs)
+    relocs = data->relocs;
+  else if (!(relocs = _bfd_elf_link_read_relocs (abfd, sec, NULL, NULL,
+						 info->keep_memory)))
+    goto fail;
+
+  max_alignment = _bfd_riscv_get_max_alignment (sec);
+
+  /* Examine and consider relaxing each reloc.  */
+  for (i = 0; i < sec->reloc_count; i++)
+    {
+      asection *sym_sec;
+      Elf_Internal_Rela *rel = relocs + i;
+      relax_func_t relax_func;
+      int type = ELF64_R_TYPE (rel->r_info);
+      bfd_vma symval;
+      bfd_boolean Is_Import = FALSE;
+
+      if (info->relax_pass == 0)
+	{
+	  if (type == R_RISCV_CALL || type == R_RISCV_CALL_PLT)
+	    relax_func = _bfd_riscv_relax_call;
+	  else if (type == R_RISCV_HI20
+		   || type == R_RISCV_LO12_I
+		   || type == R_RISCV_LO12_S)
+	    relax_func = _bfd_riscv_relax_lui;
+	  else if (type == R_RISCV_TPREL_HI20
+		   || type == R_RISCV_TPREL_ADD
+		   || type == R_RISCV_TPREL_LO12_I
+		   || type == R_RISCV_TPREL_LO12_S)
+	    relax_func = _bfd_riscv_relax_tls_le;
+	  else if (type == R_RISCV_PCREL_HI20 || type == R_RISCV_PCREL_LO12_I)
+	    relax_func = _bfd_riscv_relax_import_pcrel;
+	  else if (ComponentMode && (type == R_RISCV_GOT_HI20))
+	    relax_func = _bfd_riscv_relax_got_ref;
+	  else
+	    continue;
+
+	  /* Only relax this reloc if it is paired with R_RISCV_RELAX.  */
+	  if (i == sec->reloc_count - 1
+	      || ELF64_R_TYPE ((rel + 1)->r_info) != R_RISCV_RELAX
+	      || rel->r_offset != (rel + 1)->r_offset)
+	    continue;
+
+	  /* Skip over the R_RISCV_RELAX.  */
+	  i++;
+	}
+      else if (type == R_RISCV_ALIGN)
+	relax_func = _bfd_riscv_relax_align;
+      else
+	continue;
+
+      data->relocs = relocs;
+
+      /* Read this BFD's contents if we haven't done so already.  */
+      if (!data->this_hdr.contents
+	  && !bfd_malloc_and_get_section (abfd, sec, &data->this_hdr.contents))
+	goto fail;
+
+      /* Read this BFD's symbols if we haven't done so already.  */
+      if (symtab_hdr->sh_info != 0
+	  && !symtab_hdr->contents
+	  && !(symtab_hdr->contents =
+	       (unsigned char *) bfd_elf_get_elf_syms (abfd, symtab_hdr,
+						       symtab_hdr->sh_info,
+						       0, NULL, NULL, NULL)))
+	goto fail;
+
+      /* Get the value of the symbol referred to by the reloc.  */
+      if (ELF64_R_SYM (rel->r_info) < symtab_hdr->sh_info)
+	{
+	  /* A local symbol.  */
+	  Elf_Internal_Sym *isym = ((Elf_Internal_Sym *) symtab_hdr->contents
+				    + ELF64_R_SYM (rel->r_info));
+	  reserve_size = (isym->st_size - rel->r_addend) > isym->st_size
+	    ? 0 : isym->st_size - rel->r_addend;
+
+	  if (isym->st_shndx == SHN_UNDEF)
+	    sym_sec = sec, symval = sec_addr (sec) + rel->r_offset;
+	  else
+	    {
+	      BFD_ASSERT (isym->st_shndx < elf_numsections (abfd));
+	      sym_sec = elf_elfsections (abfd)[isym->st_shndx]->bfd_section;
+	      if (sec_addr (sym_sec) == 0)
+		continue;
+	      symval = sec_addr (sym_sec) + isym->st_value;
+	    }
+	}
+      else
+	{
+	  unsigned long indx;
+	  struct elf_link_hash_entry *h;
+
+	  indx = ELF64_R_SYM (rel->r_info) - symtab_hdr->sh_info;
+	  h = elf_sym_hashes (abfd)[indx];
+
+	  while (h->root.type == bfd_link_hash_indirect
+		 || h->root.type == bfd_link_hash_warning)
+	    h = (struct elf_link_hash_entry *) h->root.u.i.link;
+	  /* Check the hash type before reading u.def (a union member).  */
+	  if (h->plt.offset != MINUS_ONE)
+	    symval = sec_addr (htab->elf.splt) + h->plt.offset;
+	  else if ((h->root.type != bfd_link_hash_defined
+		    && h->root.type != bfd_link_hash_defweak)
+		   || h->root.u.def.section->output_section == NULL)
+	    continue;
+	  else
+	    symval = sec_addr (h->root.u.def.section) + h->root.u.def.value;
+
+	  if (h->type != STT_FUNC)
+	    reserve_size =
+	      (h->size - rel->r_addend) > h->size ? 0 : h->size - rel->r_addend;
+	  sym_sec = h->root.u.def.section;
+	  if (h->root.type == bfd_link_hash_defweak && strcmp (sec->name, "pulp.import") != 0)
+	    Is_Import = TRUE;	/* Weak definition, SEC is not "pulp.import".  */
+	}
+
+      symval += rel->r_addend;
+
+      if (!relax_func (abfd, sec, sym_sec, info, rel, symval,
+		       max_alignment, reserve_size, Is_Import, again))
+	goto fail;
+    }
+
+  ret = TRUE;
+
+fail:
+  if (relocs != data->relocs)
+    free (relocs);
+
+  return ret;
+}
+
+#if ARCH_SIZE == 32
+# define PRSTATUS_SIZE			204 /* 72 (pr_reg) + 128 (gregset) + 4.  */
+# define PRSTATUS_OFFSET_PR_CURSIG	12
+# define PRSTATUS_OFFSET_PR_PID		24
+# define PRSTATUS_OFFSET_PR_REG		72
+# define ELF_GREGSET_T_SIZE		128
+# define PRPSINFO_SIZE			128
+# define PRPSINFO_OFFSET_PR_PID		16
+# define PRPSINFO_OFFSET_PR_FNAME	32
+# define PRPSINFO_OFFSET_PR_PSARGS	48
+#else
+# define PRSTATUS_SIZE			376
+# define PRSTATUS_OFFSET_PR_CURSIG	12
+# define PRSTATUS_OFFSET_PR_PID		32
+# define PRSTATUS_OFFSET_PR_REG		112
+# define ELF_GREGSET_T_SIZE		256
+# define PRPSINFO_SIZE			136
+# define PRPSINFO_OFFSET_PR_PID		24
+# define PRPSINFO_OFFSET_PR_FNAME	40
+# define PRPSINFO_OFFSET_PR_PSARGS	56
+#endif
+
+/* Core dump support: decode a prstatus note.  Only descriptors of
+   PRSTATUS_SIZE bytes are recognized.  */
+
+static bfd_boolean
+riscv_elf_grok_prstatus (bfd *abfd, Elf_Internal_Note *note)
+{
+  /* PRSTATUS_SIZE is sizeof (struct elf_prstatus) on Linux/RISC-V; a
+     note of any other size is rejected.  */
+  if (note->descsz != PRSTATUS_SIZE)
+    return FALSE;
+
+  /* pr_cursig is read as a 16-bit field and stored as the signal.  */
+  elf_tdata (abfd)->core->signal
+    = bfd_get_16 (abfd, note->descdata + PRSTATUS_OFFSET_PR_CURSIG);
+
+  /* pr_pid is read as a 32-bit field and stored as the LWP id.  */
+  elf_tdata (abfd)->core->lwpid
+    = bfd_get_32 (abfd, note->descdata + PRSTATUS_OFFSET_PR_PID);
+
+  /* The register block that starts at the pr_reg offset is exposed as
+     a ".reg" pseudo-section (the original comment calls it ".reg/999")
+     of ELF_GREGSET_T_SIZE bytes.  */
+  return _bfd_elfcore_make_pseudosection (abfd, ".reg", ELF_GREGSET_T_SIZE,
+					  note->descpos + PRSTATUS_OFFSET_PR_REG);
+}
+
+static bfd_boolean
+riscv_elf_grok_psinfo (bfd *abfd, Elf_Internal_Note *note)
+{
+  char *args;
+  int len;
+
+  /* Decode a prpsinfo note; PRPSINFO_SIZE is sizeof (struct elf_prpsinfo)
+     on Linux/RISC-V and any other descriptor size is rejected.  */
+  if (note->descsz != PRPSINFO_SIZE)
+    return FALSE;
+
+  /* pr_pid: read as a 32-bit field and stored as the process id of
+     the core.  */
+  elf_tdata (abfd)->core->pid
+    = bfd_get_32 (abfd, note->descdata + PRPSINFO_OFFSET_PR_PID);
+
+  /* pr_fname: program name, duplicated from a field of at most 16
+     bytes.  */
+  elf_tdata (abfd)->core->program = _bfd_elfcore_strndup
+    (abfd, note->descdata + PRPSINFO_OFFSET_PR_FNAME, 16);
+
+  /* pr_psargs: command line, duplicated from a field of at most 80
+     bytes.  */
+  elf_tdata (abfd)->core->command = _bfd_elfcore_strndup
+    (abfd, note->descdata + PRPSINFO_OFFSET_PR_PSARGS, 80);
+
+  /* Some implementations (at least one) tack a spurious space onto
+     the end of the arguments for some reason; strip it when it is
+     there.  */
+  args = elf_tdata (abfd)->core->command;
+  len = strlen (args);
+
+  if (len > 0 && args[len - 1] == ' ')
+    args[len - 1] = '\0';
+
+  return TRUE;
+}
+
+/* Set the machine type of ABFD according to its target vector name.  */
+static bfd_boolean
+riscv_elf_object_p (bfd *abfd)
+{
+  /* RISCV currently has only two mach types: the 32-bit one belongs to
+     the "elf32-littleriscv" target and every other target gets 64-bit.  */
+  bfd_default_set_arch_mach
+    (abfd, bfd_arch_riscv,
+     strcmp (abfd->xvec->name, "elf32-littleriscv") == 0
+     ? bfd_mach_riscv32 : bfd_mach_riscv64);
+  return TRUE;
+}
+
+bfd_boolean
+_bfd_riscv_elf64_final_link (bfd *abfd, struct bfd_link_info *info)
+{
+	/* Run the generic ELF final link on ABFD, then store the PULP import
+	   (.pulp.import.names, .pulp.import.relocs) and export (.pulp.export)
+	   tables.  Returns FALSE if the link or a table/section step fails.  */
+	struct bfd_section *s;
+	unsigned int SecNameSize, SecRelocSize, NImport=0, ExportSize;
+	unsigned int *NameSection, *RelocSection, *ExportSection;
+	static int Trace = 0;	/* Debug switch for the progress messages.  */
+
+	if (!bfd_elf_final_link (abfd, info)) return FALSE;
+
+	if (PulpImportCreateNameAndRelocSections(0,
+						 &NameSection,  &SecNameSize,
+						 &RelocSection, &SecRelocSize, &NImport) == FALSE) {
+		(*_bfd_error_handler)(_("Failed to create Import sections"));
+		return FALSE;
+	}
+	if (Trace) printf("NImport: %u, SecNameSize: %u, SecRelocSize: %u\n", NImport, SecNameSize, SecRelocSize);
+
+	if (NImport) {
+		unsigned int BaseText = 0;
+
+		if (ComponentMode == 0) {
+			struct bfd_section *TextSec = bfd_get_section_by_name (info->output_bfd, ".text");
+			if (TextSec) {
+				BaseText = (unsigned int) TextSec->lma;
+				AdjustRelocsImport(RelocSection, BaseText);
+			} else {
+				(*_bfd_error_handler)(_("Failed to find .text section in output_bfd"));
+			}
+		}
+
+		/* Dump mode: 1 = disassembly, 2 = C tables, 3 = both.  */
+		if (DumpImportExportSections==1 || DumpImportExportSections==3)
+			DiassembleImports(NameSection, SecNameSize, RelocSection, SecRelocSize, BaseText);
+		if (DumpImportExportSections==2 || DumpImportExportSections==3) {
+			DumpCEquiv((unsigned int *) NameSection, SecNameSize, 4, ComponentMode?"CompImportNames":"ResiImportNames");
+			DumpCEquiv(RelocSection, SecRelocSize, 4, ComponentMode?"CompImportRelocs":"ResiImportRelocs");
+		}
+
+		s = bfd_get_section_by_name (abfd, ".pulp.import.names");
+		if (s) {
+			s->contents = xmalloc(SecNameSize);
+			s->size = SecNameSize;
+			if (! bfd_set_section_contents (abfd, s, (char *) NameSection, 0, SecNameSize)) {
+				(*_bfd_error_handler)(_(".pulp.import.names: Failed to set content"));
+				return FALSE;
+			} else if (Trace) {
+				fprintf(stderr, ".pulp.import.names: Set content OK\n");
+			}
+		} else {
+			(*_bfd_error_handler)(_("Can't find .pulp.import.names"));
+			return FALSE;
+		}
+		s = bfd_get_section_by_name (abfd, ".pulp.import.relocs");
+		if (s) {
+			s->contents = xmalloc(SecRelocSize);
+			s->size = SecRelocSize;
+			if (! bfd_set_section_contents (abfd, s, (char *) RelocSection, 0, SecRelocSize)) {
+				(*_bfd_error_handler)(_(".pulp.import.relocs: Failed to set content"));
+				return FALSE;
+			} else if (Trace) {
+				fprintf(stderr, ".pulp.import.relocs: Set content OK\n");
+			}
+		} else {
+			(*_bfd_error_handler)(_("Can't find .pulp.import.relocs"));
+			return FALSE;
+		}
+		(void) ReleaseImportEntry();
+	} else {
+		/* No imports: each section found gets one 4-byte descriptor of 0.  */
+		unsigned int Empty = 0;
+		s = bfd_get_section_by_name (abfd, ".pulp.import.names");
+		if (s) {
+			SecNameSize = 4;
+			s->contents = xmalloc(SecNameSize);
+			s->size = SecNameSize;
+			if (! bfd_set_section_contents (abfd, s, (char *) &Empty, 0, SecNameSize)) {
+				(*_bfd_error_handler)(_(".pulp.import.names: Failed to set content"));
+				return FALSE;
+			} else if (Trace) {
+				fprintf(stderr, ".pulp.import.names: Set content OK\n");
+			}
+		}
+		s = bfd_get_section_by_name (abfd, ".pulp.import.relocs");
+		if (s) {
+			SecRelocSize = 4;
+			s->contents = xmalloc(SecRelocSize);
+			s->size = SecRelocSize;
+			if (! bfd_set_section_contents (abfd, s, (char *) &Empty, 0, SecRelocSize)) {
+				(*_bfd_error_handler)(_(".pulp.import.relocs: Failed to set content"));
+				return FALSE;
+			} else if (Trace) {
+				fprintf(stderr, ".pulp.import.relocs: Set content OK\n");
+			}
+		}
+	}
+
+	if (ComponentMode) {
+		if (ComponentEntryProvided == FALSE)
+			(*_bfd_error_handler)(_("No Entry provided for Component"));
+		else if (ExportLookup(ComponentEntry.name) == FALSE)
+			(*_bfd_error_handler)(_("Component provided entry: %s not found in component export list"), ComponentEntry.name);
+	}
+	if (PulpExportCreateSection(&ExportSection, &ExportSize, info) == FALSE) {
+		(*_bfd_error_handler)(_("Failed to create Export Section"));
+		return FALSE;
+	} else if (ExportSection) {
+		if (DumpImportExportSections==1 || DumpImportExportSections==3)
+			DiassembleExports(ExportSection, ExportSize);
+		/* C tables go with modes 2 and 3, exactly as for the imports.  */
+		if (DumpImportExportSections==2 || DumpImportExportSections==3) {
+			DumpCEquiv(ExportSection, ExportSize, 4, ComponentMode?"CompExports":"ResiExports");
+			if (ComponentMode) {
+				struct bfd_section *CompSec = bfd_get_section_by_name (info->output_bfd, ".component.body");
+				if (CompSec) {
+					long Size = CompSec->size;
+					char *Buffer = xmalloc (Size);
+					/* Dump the body only when it was actually read.  */
+					if (bfd_get_section_contents (info->output_bfd, CompSec, Buffer, 0, Size))
+						DumpCEquiv((unsigned int *) Buffer, Size, 1, "ComponentBody");
+					free(Buffer);
+				}
+			}
+		}
+		s = bfd_get_section_by_name (abfd, ".pulp.export");
+		if (s) {
+			s->contents = xmalloc(ExportSize);
+			s->size = ExportSize;
+			if (! bfd_set_section_contents (abfd, s, (char *) ExportSection, 0, ExportSize)) {
+				(*_bfd_error_handler)(_(".pulp.export: Failed to set content"));
+				return FALSE;
+			} else if (Trace) {
+				fprintf(stderr, ".pulp.export: Set content OK\n");
+			}
+		} else {
+			(*_bfd_error_handler)(_("Can't find .pulp.export"));
+			return FALSE;
+		}
+		(void) ReleaseExportEntry();
+	} else if (ComponentMode) {
+		/* We should have at least one export to be able to enter the component */
+		(*_bfd_error_handler)(_("Component has empty export section"));
+	}
+
+	return TRUE;
+}
+#define TARGET_LITTLE_SYM		riscv_elf64_vec
+#define TARGET_LITTLE_NAME		"elf64-littleriscv"
+/* Backend hooks and BFD entry points provided for this target.  */
+#define elf_backend_reloc_type_class	     riscv_reloc_type_class
+
+#define bfd_elf64_bfd_reloc_name_lookup	     riscv_reloc_name_lookup
+#define bfd_elf64_bfd_link_hash_table_create riscv_elf_link_hash_table_create
+#define bfd_elf64_bfd_reloc_type_lookup	     riscv_reloc_type_lookup
+#define bfd_elf64_bfd_merge_private_bfd_data \
+  _bfd_riscv_elf_merge_private_bfd_data
+
+#define elf_backend_copy_indirect_symbol     riscv_elf_copy_indirect_symbol
+#define elf_backend_create_dynamic_sections  riscv_elf_create_dynamic_sections
+#define elf_backend_check_relocs	     riscv_elf_check_relocs
+#define elf_backend_adjust_dynamic_symbol    riscv_elf_adjust_dynamic_symbol
+#define elf_backend_size_dynamic_sections    riscv_elf_size_dynamic_sections
+#define elf_backend_relocate_section	     riscv_elf_relocate_section
+#define elf_backend_finish_dynamic_symbol    riscv_elf_finish_dynamic_symbol
+#define elf_backend_finish_dynamic_sections  riscv_elf_finish_dynamic_sections
+#define elf_backend_gc_mark_hook	     riscv_elf_gc_mark_hook
+#define elf_backend_gc_sweep_hook	     riscv_elf_gc_sweep_hook
+#define elf_backend_plt_sym_val		     riscv_elf_plt_sym_val
+#define elf_backend_grok_prstatus            riscv_elf_grok_prstatus
+#define elf_backend_grok_psinfo              riscv_elf_grok_psinfo
+#define elf_backend_object_p                 riscv_elf_object_p
+#define elf_info_to_howto_rel		     NULL
+#define elf_info_to_howto		     riscv_info_to_howto_rela
+#define bfd_elf64_bfd_relax_section	     _bfd_riscv_relax_section
+
+#define elf_backend_init_index_section	     _bfd_elf_init_1_index_section
+#define bfd_elf64_bfd_final_link             _bfd_riscv_elf64_final_link
+/* Backend flags and sizes for this target.  */
+#define elf_backend_can_gc_sections	1
+#define elf_backend_can_refcount	1
+#define elf_backend_want_got_plt	1
+#define elf_backend_plt_readonly	1
+#define elf_backend_plt_alignment	4
+#define elf_backend_want_plt_sym	1
+#define elf_backend_got_header_size	(ARCH_SIZE / 8)
+#define elf_backend_want_dynrelro	1
+#define elf_backend_rela_normal		1
+#define elf_backend_default_execstack	0
+/* Anything not defined above takes its default in elf64-target.h.  */
+#include "elf64-target.h"
diff --git a/utils/gapy/gen-debug-info-src/ext/bfd/elf64-target.h b/utils/gapy/gen-debug-info-src/ext/bfd/elf64-target.h
new file mode 100644
index 000000000..f94308d50
--- /dev/null
+++ b/utils/gapy/gen-debug-info-src/ext/bfd/elf64-target.h
@@ -0,0 +1,1081 @@
+/* Target definitions for 64-bit ELF
+   Copyright (C) 1993-2017 Free Software Foundation, Inc.
+
+   This file is part of BFD, the Binary File Descriptor library.
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston,
+   MA 02110-1301, USA.  */
+
+
+/* This structure contains everything that BFD knows about a target.
+   It includes things like its byte order, name, what routines to call
+   to do various operations, etc.  Every BFD points to a target structure
+   with its "xvec" member.
+
+   There are two such structures here:  one for big-endian machines and
+   one for little-endian machines.   */
+/* Default bfd_elf64_* entry points; #ifndef-guarded ones can be overridden.  */
+#ifndef bfd_elf64_close_and_cleanup
+#define	bfd_elf64_close_and_cleanup _bfd_elf_close_and_cleanup
+#endif
+#ifndef bfd_elf64_bfd_free_cached_info
+#define bfd_elf64_bfd_free_cached_info _bfd_free_cached_info
+#endif
+#ifndef bfd_elf64_get_section_contents
+#define bfd_elf64_get_section_contents _bfd_generic_get_section_contents
+#endif
+
+#define bfd_elf64_canonicalize_dynamic_symtab \
+  _bfd_elf_canonicalize_dynamic_symtab
+#ifndef bfd_elf64_get_synthetic_symtab
+#define bfd_elf64_get_synthetic_symtab \
+  _bfd_elf_get_synthetic_symtab
+#endif
+#ifndef bfd_elf64_canonicalize_reloc
+#define bfd_elf64_canonicalize_reloc	_bfd_elf_canonicalize_reloc
+#endif
+#ifndef bfd_elf64_find_nearest_line
+#define bfd_elf64_find_nearest_line	_bfd_elf_find_nearest_line
+#endif
+#ifndef bfd_elf64_find_line
+#define bfd_elf64_find_line		_bfd_elf_find_line
+#endif
+#ifndef bfd_elf64_find_inliner_info
+#define bfd_elf64_find_inliner_info	_bfd_elf_find_inliner_info
+#endif
+#define bfd_elf64_read_minisymbols	_bfd_elf_read_minisymbols
+#define bfd_elf64_minisymbol_to_symbol	_bfd_elf_minisymbol_to_symbol
+#define bfd_elf64_get_dynamic_symtab_upper_bound \
+  _bfd_elf_get_dynamic_symtab_upper_bound
+#define bfd_elf64_get_lineno		_bfd_elf_get_lineno
+#ifndef bfd_elf64_get_reloc_upper_bound
+#define bfd_elf64_get_reloc_upper_bound _bfd_elf_get_reloc_upper_bound
+#endif
+#ifndef bfd_elf64_get_symbol_info
+#define bfd_elf64_get_symbol_info	_bfd_elf_get_symbol_info
+#endif
+#ifndef bfd_elf64_get_symbol_version_string
+#define bfd_elf64_get_symbol_version_string \
+  _bfd_elf_get_symbol_version_string
+#endif
+#define bfd_elf64_canonicalize_symtab	_bfd_elf_canonicalize_symtab
+#define bfd_elf64_get_symtab_upper_bound _bfd_elf_get_symtab_upper_bound
+#define bfd_elf64_make_empty_symbol	_bfd_elf_make_empty_symbol
+#ifndef bfd_elf64_new_section_hook
+#define bfd_elf64_new_section_hook	_bfd_elf_new_section_hook
+#endif
+#define bfd_elf64_set_arch_mach		_bfd_elf_set_arch_mach
+#ifndef bfd_elf64_set_section_contents
+#define bfd_elf64_set_section_contents	_bfd_elf_set_section_contents
+#endif
+#define bfd_elf64_sizeof_headers	_bfd_elf_sizeof_headers
+#define bfd_elf64_write_object_contents _bfd_elf_write_object_contents
+#define bfd_elf64_write_corefile_contents _bfd_elf_write_corefile_contents
+
+#define bfd_elf64_get_section_contents_in_window \
+  _bfd_generic_get_section_contents_in_window
+/* Default values for backend flags the including file left undefined.  */
+#ifndef elf_backend_can_refcount
+#define elf_backend_can_refcount 0
+#endif
+#ifndef elf_backend_want_got_plt
+#define elf_backend_want_got_plt 0
+#endif
+#ifndef elf_backend_plt_readonly
+#define elf_backend_plt_readonly 0
+#endif
+#ifndef elf_backend_want_plt_sym
+#define elf_backend_want_plt_sym 0
+#endif
+#ifndef elf_backend_plt_not_loaded
+#define elf_backend_plt_not_loaded 0
+#endif
+#ifndef elf_backend_plt_alignment
+#define elf_backend_plt_alignment 2
+#endif
+#ifndef elf_backend_want_dynbss
+#define elf_backend_want_dynbss 1
+#endif
+#ifndef elf_backend_want_dynrelro
+#define elf_backend_want_dynrelro 0
+#endif
+#ifndef elf_backend_want_p_paddr_set_to_zero
+#define elf_backend_want_p_paddr_set_to_zero 0
+#endif
+#ifndef elf_backend_no_page_alias
+#define elf_backend_no_page_alias 0
+#endif
+#ifndef elf_backend_default_execstack
+#define elf_backend_default_execstack 1
+#endif
+#ifndef elf_backend_caches_rawsize
+#define elf_backend_caches_rawsize 0
+#endif
+#ifndef elf_backend_extern_protected_data
+#define elf_backend_extern_protected_data 0
+#endif
+#ifndef elf_backend_always_renumber_dynsyms
+#define elf_backend_always_renumber_dynsyms FALSE
+#endif
+#ifndef elf_backend_stack_align
+#define elf_backend_stack_align 16
+#endif
+#ifndef elf_backend_strtab_flags
+#define elf_backend_strtab_flags 0
+#endif
+/* The debug-info entry points are all mapped onto bfd_void.  */
+#define bfd_elf64_bfd_debug_info_start	bfd_void
+#define bfd_elf64_bfd_debug_info_end	bfd_void
+#define bfd_elf64_bfd_debug_info_accumulate \
+  ((void (*) (bfd*, struct bfd_section *)) bfd_void)
+
+#ifndef bfd_elf64_bfd_get_relocated_section_contents
+#define bfd_elf64_bfd_get_relocated_section_contents \
+  bfd_generic_get_relocated_section_contents
+#endif
+
+#ifndef bfd_elf64_bfd_relax_section
+#define bfd_elf64_bfd_relax_section bfd_generic_relax_section
+#endif
+/* GC defaults; elf_backend_can_refcount was already defaulted earlier.  */
+#ifndef elf_backend_can_gc_sections
+#define elf_backend_can_gc_sections 0
+#endif
+#ifndef elf_backend_can_refcount
+#define elf_backend_can_refcount 0
+#endif
+#ifndef elf_backend_want_got_sym
+#define elf_backend_want_got_sym 1
+#endif
+#ifndef elf_backend_gc_keep
+#define elf_backend_gc_keep		_bfd_elf_gc_keep
+#endif
+#ifndef elf_backend_gc_mark_dynamic_ref
+#define elf_backend_gc_mark_dynamic_ref	bfd_elf_gc_mark_dynamic_ref_symbol
+#endif
+#ifndef elf_backend_gc_mark_hook
+#define elf_backend_gc_mark_hook	_bfd_elf_gc_mark_hook
+#endif
+#ifndef elf_backend_gc_mark_extra_sections
+#define elf_backend_gc_mark_extra_sections _bfd_elf_gc_mark_extra_sections
+#endif
+#ifndef elf_backend_gc_sweep_hook
+#define elf_backend_gc_sweep_hook	NULL
+#endif
+#ifndef bfd_elf64_bfd_gc_sections
+#define bfd_elf64_bfd_gc_sections bfd_elf_gc_sections
+#endif
+
+#ifndef bfd_elf64_bfd_merge_sections
+#define bfd_elf64_bfd_merge_sections \
+  _bfd_elf_merge_sections
+#endif
+
+#ifndef bfd_elf64_bfd_is_group_section
+#define bfd_elf64_bfd_is_group_section bfd_elf_is_group_section
+#endif
+
+#ifndef bfd_elf64_bfd_discard_group
+#define bfd_elf64_bfd_discard_group bfd_generic_discard_group
+#endif
+
+#ifndef bfd_elf64_section_already_linked
+#define bfd_elf64_section_already_linked \
+  _bfd_elf_section_already_linked
+#endif
+
+#ifndef bfd_elf64_bfd_define_common_symbol
+#define bfd_elf64_bfd_define_common_symbol bfd_generic_define_common_symbol
+#endif
+
+#ifndef bfd_elf64_bfd_lookup_section_flags
+#define bfd_elf64_bfd_lookup_section_flags bfd_elf_lookup_section_flags
+#endif
+
+#ifndef bfd_elf64_bfd_make_debug_symbol
+#define bfd_elf64_bfd_make_debug_symbol \
+  ((asymbol * (*) (bfd *, void *, unsigned long)) bfd_nullvoidptr)
+#endif
+/* Private-data, label-name and dynamic-reloc entry point defaults.  */
+#ifndef bfd_elf64_bfd_copy_private_symbol_data
+#define bfd_elf64_bfd_copy_private_symbol_data \
+  _bfd_elf_copy_private_symbol_data
+#endif
+
+#ifndef bfd_elf64_bfd_copy_private_section_data
+#define bfd_elf64_bfd_copy_private_section_data \
+  _bfd_elf_copy_private_section_data
+#endif
+#ifndef bfd_elf64_bfd_copy_private_header_data
+#define bfd_elf64_bfd_copy_private_header_data \
+  _bfd_elf_copy_private_header_data
+#endif
+#ifndef bfd_elf64_bfd_copy_private_bfd_data
+#define bfd_elf64_bfd_copy_private_bfd_data \
+  _bfd_elf_copy_private_bfd_data
+#endif
+#ifndef bfd_elf64_bfd_print_private_bfd_data
+#define bfd_elf64_bfd_print_private_bfd_data \
+  _bfd_elf_print_private_bfd_data
+#endif
+#ifndef bfd_elf64_bfd_merge_private_bfd_data
+#define bfd_elf64_bfd_merge_private_bfd_data \
+  ((bfd_boolean (*) (bfd *, struct bfd_link_info *)) bfd_true)
+#endif
+#ifndef bfd_elf64_bfd_set_private_flags
+#define bfd_elf64_bfd_set_private_flags \
+  ((bfd_boolean (*) (bfd *, flagword)) bfd_true)
+#endif
+#ifndef bfd_elf64_bfd_is_local_label_name
+#define bfd_elf64_bfd_is_local_label_name _bfd_elf_is_local_label_name
+#endif
+#ifndef bfd_elf64_bfd_is_target_special_symbol
+#define bfd_elf64_bfd_is_target_special_symbol \
+  ((bfd_boolean (*) (bfd *, asymbol *)) bfd_false)
+#endif
+
+#ifndef bfd_elf64_get_dynamic_reloc_upper_bound
+#define bfd_elf64_get_dynamic_reloc_upper_bound \
+  _bfd_elf_get_dynamic_reloc_upper_bound
+#endif
+#ifndef bfd_elf64_canonicalize_dynamic_reloc
+#define bfd_elf64_canonicalize_dynamic_reloc \
+  _bfd_elf_canonicalize_dynamic_reloc
+#endif
+/* The ELF linker is used only if the backend can relocate sections.  */
+#ifdef elf_backend_relocate_section
+#ifndef bfd_elf64_bfd_link_hash_table_create
+#define bfd_elf64_bfd_link_hash_table_create _bfd_elf_link_hash_table_create
+#endif
+#ifndef bfd_elf64_bfd_copy_link_hash_symbol_type
+#define bfd_elf64_bfd_copy_link_hash_symbol_type \
+  _bfd_elf_copy_link_hash_symbol_type
+#endif
+#ifndef bfd_elf64_bfd_link_add_symbols
+#define bfd_elf64_bfd_link_add_symbols	bfd_elf_link_add_symbols
+#endif
+#ifndef bfd_elf64_bfd_final_link
+#define bfd_elf64_bfd_final_link	bfd_elf_final_link
+#endif
+#else /* ! defined (elf_backend_relocate_section) */
+/* If no backend relocate_section routine, use the generic linker.
+   Note - this will prevent the port from being able to use some of
+   the other features of the ELF linker, because the generic hash structure
+   does not have the fields needed by the ELF linker.  In particular it
+   means that linking directly to S-records will not work.  */
+#ifndef bfd_elf64_bfd_link_hash_table_create
+#define bfd_elf64_bfd_link_hash_table_create \
+  _bfd_generic_link_hash_table_create
+#endif
+#ifndef bfd_elf64_bfd_copy_link_hash_symbol_type
+#define bfd_elf64_bfd_copy_link_hash_symbol_type \
+  _bfd_generic_copy_link_hash_symbol_type
+#endif
+#ifndef bfd_elf64_bfd_link_add_symbols
+#define bfd_elf64_bfd_link_add_symbols	_bfd_generic_link_add_symbols
+#endif
+#ifndef bfd_elf64_bfd_final_link
+#define bfd_elf64_bfd_final_link	_bfd_generic_final_link
+#endif
+#endif /* ! defined (elf_backend_relocate_section) */
+/* Defaults for the remaining link, archive and object entry points.  */
+#ifndef bfd_elf64_bfd_link_just_syms
+#define bfd_elf64_bfd_link_just_syms	_bfd_elf_link_just_syms
+#endif
+
+#ifndef bfd_elf64_bfd_link_split_section
+#define bfd_elf64_bfd_link_split_section _bfd_generic_link_split_section
+#endif
+
+#ifndef bfd_elf64_bfd_link_check_relocs
+#define bfd_elf64_bfd_link_check_relocs  _bfd_elf_link_check_relocs
+#endif
+
+#ifndef bfd_elf64_archive_p
+#define bfd_elf64_archive_p bfd_generic_archive_p
+#endif
+
+#ifndef bfd_elf64_write_archive_contents
+#define bfd_elf64_write_archive_contents _bfd_write_archive_contents
+#endif
+
+#ifndef bfd_elf64_mkobject
+#define bfd_elf64_mkobject bfd_elf_make_object
+#endif
+
+#ifndef bfd_elf64_mkcorefile
+#define bfd_elf64_mkcorefile bfd_elf_mkcorefile
+#endif
+
+#ifndef bfd_elf64_mkarchive
+#define bfd_elf64_mkarchive _bfd_generic_mkarchive
+#endif
+
+#ifndef bfd_elf64_print_symbol
+#define bfd_elf64_print_symbol bfd_elf_print_symbol
+#endif
+/* Per-target constants; ELF_MAXPAGESIZE must come from the backend.  */
+#ifndef elf_symbol_leading_char
+#define elf_symbol_leading_char 0
+#endif
+
+#ifndef elf_info_to_howto
+#define elf_info_to_howto 0
+#endif
+
+#ifndef elf_info_to_howto_rel
+#define elf_info_to_howto_rel 0
+#endif
+
+#ifndef elf_backend_arch_data
+#define elf_backend_arch_data NULL
+#endif
+
+#ifndef ELF_TARGET_ID
+#define ELF_TARGET_ID	GENERIC_ELF_DATA
+#endif
+
+#ifndef ELF_OSABI
+#define ELF_OSABI ELFOSABI_NONE
+#endif
+/* The value set after #error presumably only limits follow-on errors.  */
+#ifndef ELF_MAXPAGESIZE
+# error ELF_MAXPAGESIZE is not defined
+#define ELF_MAXPAGESIZE 1
+#endif
+
+#ifndef ELF_COMMONPAGESIZE
+#define ELF_COMMONPAGESIZE ELF_MAXPAGESIZE
+#endif
+
+#ifndef ELF_MINPAGESIZE
+#define ELF_MINPAGESIZE ELF_COMMONPAGESIZE
+#endif
+
+#if ELF_COMMONPAGESIZE > ELF_MAXPAGESIZE
+# error ELF_COMMONPAGESIZE > ELF_MAXPAGESIZE
+#endif
+#if ELF_MINPAGESIZE > ELF_COMMONPAGESIZE
+# error ELF_MINPAGESIZE > ELF_COMMONPAGESIZE
+#endif
+
+#ifndef ELF_DYNAMIC_SEC_FLAGS
+/* Note that we set the SEC_IN_MEMORY flag for these sections.  */
+#define ELF_DYNAMIC_SEC_FLAGS			\
+  (SEC_ALLOC | SEC_LOAD | SEC_HAS_CONTENTS	\
+   | SEC_IN_MEMORY | SEC_LINKER_CREATED)
+#endif
+/* Defaults for the backend hooks that the including file did not define.  */
+#ifndef elf_backend_collect
+#define elf_backend_collect FALSE
+#endif
+#ifndef elf_backend_type_change_ok
+#define elf_backend_type_change_ok FALSE
+#endif
+
+#ifndef elf_backend_sym_is_global
+#define elf_backend_sym_is_global	0
+#endif
+#ifndef elf_backend_object_p
+#define elf_backend_object_p		0
+#endif
+#ifndef elf_backend_symbol_processing
+#define elf_backend_symbol_processing	0
+#endif
+#ifndef elf_backend_symbol_table_processing
+#define elf_backend_symbol_table_processing	0
+#endif
+#ifndef elf_backend_get_symbol_type
+#define elf_backend_get_symbol_type 0
+#endif
+#ifndef elf_backend_archive_symbol_lookup
+#define elf_backend_archive_symbol_lookup _bfd_elf_archive_symbol_lookup
+#endif
+#ifndef elf_backend_name_local_section_symbols
+#define elf_backend_name_local_section_symbols	0
+#endif
+#ifndef elf_backend_section_processing
+#define elf_backend_section_processing	0
+#endif
+#ifndef elf_backend_section_from_shdr
+#define elf_backend_section_from_shdr	_bfd_elf_make_section_from_shdr
+#endif
+#ifndef elf_backend_section_flags
+#define elf_backend_section_flags	0
+#endif
+#ifndef elf_backend_get_sec_type_attr
+#define elf_backend_get_sec_type_attr	_bfd_elf_get_sec_type_attr
+#endif
+#ifndef elf_backend_section_from_phdr
+#define elf_backend_section_from_phdr	_bfd_elf_make_section_from_phdr
+#endif
+#ifndef elf_backend_fake_sections
+#define elf_backend_fake_sections	0
+#endif
+#ifndef elf_backend_section_from_bfd_section
+#define elf_backend_section_from_bfd_section	0
+#endif
+#ifndef elf_backend_add_symbol_hook
+#define elf_backend_add_symbol_hook	0
+#endif
+#ifndef elf_backend_link_output_symbol_hook
+#define elf_backend_link_output_symbol_hook 0
+#endif
+#ifndef elf_backend_create_dynamic_sections
+#define elf_backend_create_dynamic_sections 0
+#endif
+#ifndef elf_backend_omit_section_dynsym
+#define elf_backend_omit_section_dynsym _bfd_elf_link_omit_section_dynsym
+#endif
+#ifndef elf_backend_relocs_compatible
+#define elf_backend_relocs_compatible _bfd_elf_default_relocs_compatible
+#endif
+#ifndef elf_backend_check_relocs
+#define elf_backend_check_relocs	0
+#endif
+#ifndef elf_backend_check_directives
+#define elf_backend_check_directives	0
+#endif
+#ifndef elf_backend_notice_as_needed
+#define elf_backend_notice_as_needed	_bfd_elf_notice_as_needed
+#endif
+#ifndef elf_backend_adjust_dynamic_symbol
+#define elf_backend_adjust_dynamic_symbol 0
+#endif
+#ifndef elf_backend_always_size_sections
+#define elf_backend_always_size_sections 0
+#endif
+#ifndef elf_backend_size_dynamic_sections
+#define elf_backend_size_dynamic_sections 0
+#endif
+#ifndef elf_backend_init_index_section
+#define elf_backend_init_index_section \
+ ((void (*) (bfd *, struct bfd_link_info *)) bfd_void)
+#endif
+#ifndef elf_backend_relocate_section
+#define elf_backend_relocate_section	0
+#endif
+#ifndef elf_backend_finish_dynamic_symbol
+#define elf_backend_finish_dynamic_symbol	0
+#endif
+#ifndef elf_backend_finish_dynamic_sections
+#define elf_backend_finish_dynamic_sections	0
+#endif
+#ifndef elf_backend_begin_write_processing
+#define elf_backend_begin_write_processing	0
+#endif
+#ifndef elf_backend_final_write_processing
+#define elf_backend_final_write_processing	0
+#endif
+#ifndef elf_backend_additional_program_headers
+#define elf_backend_additional_program_headers	0
+#endif
+#ifndef elf_backend_modify_segment_map
+#define elf_backend_modify_segment_map	0
+#endif
+#ifndef elf_backend_modify_program_headers
+#define elf_backend_modify_program_headers	0
+#endif
+#ifndef elf_backend_allow_non_load_phdr
+#define elf_backend_allow_non_load_phdr	\
+  ((bfd_boolean (*) (bfd *, const Elf_Internal_Phdr *, unsigned)) bfd_false)
+#endif
+#ifndef elf_backend_ecoff_debug_swap
+#define elf_backend_ecoff_debug_swap	0
+#endif
+#ifndef elf_backend_bfd_from_remote_memory
+#define elf_backend_bfd_from_remote_memory _bfd_elf64_bfd_from_remote_memory
+#endif
+#ifndef elf_backend_got_header_size
+#define elf_backend_got_header_size	0
+#endif
+#ifndef elf_backend_got_elt_size
+#define elf_backend_got_elt_size _bfd_elf_default_got_elt_size
+#endif
+#ifndef elf_backend_obj_attrs_vendor
+#define elf_backend_obj_attrs_vendor		NULL
+#endif
+#ifndef elf_backend_obj_attrs_section
+#define elf_backend_obj_attrs_section		NULL
+#endif
+#ifndef elf_backend_obj_attrs_arg_type
+#define elf_backend_obj_attrs_arg_type		NULL
+#endif
+#ifndef elf_backend_obj_attrs_section_type
+#define elf_backend_obj_attrs_section_type		SHT_GNU_ATTRIBUTES
+#endif
+#ifndef elf_backend_obj_attrs_order
+#define elf_backend_obj_attrs_order		NULL
+#endif
+#ifndef elf_backend_obj_attrs_handle_unknown
+#define elf_backend_obj_attrs_handle_unknown	NULL
+#endif
+#ifndef elf_backend_static_tls_alignment
+#define elf_backend_static_tls_alignment	1
+#endif
+#ifndef elf_backend_post_process_headers
+#define elf_backend_post_process_headers	_bfd_elf_post_process_headers
+#endif
+#ifndef elf_backend_print_symbol_all
+#define elf_backend_print_symbol_all		NULL
+#endif
+#ifndef elf_backend_output_arch_local_syms
+#define elf_backend_output_arch_local_syms	NULL
+#endif
+#ifndef elf_backend_output_arch_syms
+#define elf_backend_output_arch_syms		NULL
+#endif
+#ifndef elf_backend_filter_implib_symbols
+#define elf_backend_filter_implib_symbols	NULL
+#endif
+#ifndef elf_backend_copy_indirect_symbol
+#define elf_backend_copy_indirect_symbol	_bfd_elf_link_hash_copy_indirect
+#endif
+#ifndef elf_backend_hide_symbol
+#define elf_backend_hide_symbol			_bfd_elf_link_hash_hide_symbol
+#endif
+#ifndef elf_backend_fixup_symbol
+#define elf_backend_fixup_symbol		NULL
+#endif
+#ifndef elf_backend_merge_symbol_attribute
+#define elf_backend_merge_symbol_attribute	NULL
+#endif
+#ifndef elf_backend_get_target_dtag
+#define elf_backend_get_target_dtag		NULL
+#endif
+#ifndef elf_backend_ignore_undef_symbol
+#define elf_backend_ignore_undef_symbol		NULL
+#endif
+#ifndef elf_backend_emit_relocs
+#define elf_backend_emit_relocs			_bfd_elf_link_output_relocs
+#endif
+#ifndef elf_backend_update_relocs
+#define elf_backend_update_relocs		NULL
+#endif
+#ifndef elf_backend_count_relocs
+#define elf_backend_count_relocs		NULL
+#endif
+#ifndef elf_backend_count_additional_relocs
+#define elf_backend_count_additional_relocs	NULL
+#endif
+#ifndef elf_backend_sort_relocs_p
+#define elf_backend_sort_relocs_p		NULL
+#endif
+#ifndef elf_backend_grok_prstatus
+#define elf_backend_grok_prstatus		NULL
+#endif
+#ifndef elf_backend_grok_psinfo
+#define elf_backend_grok_psinfo			NULL
+#endif
+#ifndef elf_backend_write_core_note
+#define elf_backend_write_core_note		NULL
+#endif
+#ifndef elf_backend_lookup_section_flags_hook
+#define elf_backend_lookup_section_flags_hook	NULL
+#endif
+#ifndef elf_backend_reloc_type_class
+#define elf_backend_reloc_type_class		_bfd_elf_reloc_type_class
+#endif
+#ifndef elf_backend_discard_info
+#define elf_backend_discard_info		NULL
+#endif
+#ifndef elf_backend_ignore_discarded_relocs
+#define elf_backend_ignore_discarded_relocs	NULL
+#endif
+#ifndef elf_backend_action_discarded
+#define elf_backend_action_discarded _bfd_elf_default_action_discarded
+#endif
+#ifndef elf_backend_eh_frame_address_size
+#define elf_backend_eh_frame_address_size _bfd_elf_eh_frame_address_size
+#endif
+#ifndef elf_backend_can_make_relative_eh_frame
+#define elf_backend_can_make_relative_eh_frame	_bfd_elf_can_make_relative
+#endif
+#ifndef elf_backend_can_make_lsda_relative_eh_frame
+#define elf_backend_can_make_lsda_relative_eh_frame	_bfd_elf_can_make_relative
+#endif
+#ifndef elf_backend_encode_eh_address
+#define elf_backend_encode_eh_address		_bfd_elf_encode_eh_address
+#endif
+#ifndef elf_backend_write_section
+#define elf_backend_write_section		NULL
+#endif
+#ifndef elf_backend_mips_irix_compat
+#define elf_backend_mips_irix_compat		NULL
+#endif
+#ifndef elf_backend_mips_rtype_to_howto
+#define elf_backend_mips_rtype_to_howto		NULL
+#endif
+
+/* Previously, backends could only use SHT_REL or SHT_RELA relocation
+   sections, but not both.  They defined USE_REL to indicate SHT_REL
+   sections, and left it undefined to indicate SHT_RELA sections.
+   For backwards compatibility, we still support this usage.  */
+#ifndef USE_REL
+#define USE_REL 0
+#endif
+
+/* Use these in new code.  They all default from USE_REL.  */
+#ifndef elf_backend_may_use_rel_p
+#define elf_backend_may_use_rel_p USE_REL
+#endif
+#ifndef elf_backend_may_use_rela_p
+#define elf_backend_may_use_rela_p !USE_REL
+#endif
+#ifndef elf_backend_default_use_rela_p
+#define elf_backend_default_use_rela_p !USE_REL
+#endif
+#ifndef elf_backend_rela_plts_and_copies_p
+#define elf_backend_rela_plts_and_copies_p elf_backend_default_use_rela_p
+#endif
+
+#ifndef elf_backend_rela_normal
+#define elf_backend_rela_normal 0
+#endif
+
+#ifndef elf_backend_dtrel_excludes_plt
+#define elf_backend_dtrel_excludes_plt 0
+#endif
+
+#ifndef elf_backend_plt_sym_val
+#define elf_backend_plt_sym_val NULL
+#endif
+#ifndef elf_backend_relplt_name
+#define elf_backend_relplt_name NULL
+#endif
+
+#ifndef ELF_MACHINE_ALT1
+#define ELF_MACHINE_ALT1 0
+#endif
+
+#ifndef ELF_MACHINE_ALT2
+#define ELF_MACHINE_ALT2 0
+#endif
+
+#ifndef elf_backend_size_info
+#define elf_backend_size_info _bfd_elf64_size_info
+#endif
+
+#ifndef elf_backend_special_sections
+#define elf_backend_special_sections NULL
+#endif
+
+#ifndef elf_backend_sign_extend_vma
+#define elf_backend_sign_extend_vma 0
+#endif
+
+#ifndef elf_backend_link_order_error_handler
+#define elf_backend_link_order_error_handler _bfd_error_handler
+#endif
+
+#ifndef elf_backend_common_definition
+#define elf_backend_common_definition _bfd_elf_common_definition
+#endif
+
+#ifndef elf_backend_common_section_index
+#define elf_backend_common_section_index _bfd_elf_common_section_index
+#endif
+
+#ifndef elf_backend_common_section
+#define elf_backend_common_section _bfd_elf_common_section
+#endif
+
+#ifndef elf_backend_merge_symbol
+#define elf_backend_merge_symbol NULL
+#endif
+
+#ifndef elf_backend_hash_symbol
+#define elf_backend_hash_symbol _bfd_elf_hash_symbol
+#endif
+
+#ifndef elf_backend_is_function_type
+#define elf_backend_is_function_type _bfd_elf_is_function_type
+#endif
+
+#ifndef elf_backend_maybe_function_sym
+#define elf_backend_maybe_function_sym _bfd_elf_maybe_function_sym
+#endif
+
+#ifndef elf_backend_get_reloc_section
+#define elf_backend_get_reloc_section _bfd_elf_get_reloc_section
+#endif
+
+#ifndef elf_backend_copy_special_section_fields
+#define elf_backend_copy_special_section_fields NULL
+#endif
+
+#ifndef elf_backend_compact_eh_encoding
+#define elf_backend_compact_eh_encoding NULL
+#endif
+
+#ifndef elf_backend_cant_unwind_opcode
+#define elf_backend_cant_unwind_opcode 0
+#endif
+/* Default priority: 2 if arch unknown, 1 if ELF_OSABI is ELFOSABI_NONE, else 0.  */
+#ifndef elf_match_priority
+#define elf_match_priority \
+  (ELF_ARCH == bfd_arch_unknown ? 2 : ELF_OSABI == ELFOSABI_NONE ? 1 : 0)
+#endif
+
+extern const struct elf_size_info _bfd_elf64_size_info;
+
+static struct elf_backend_data elf64_bed =
+{
+  ELF_ARCH,			/* arch */
+  ELF_TARGET_ID,		/* target_id */
+  ELF_MACHINE_CODE,		/* elf_machine_code */
+  ELF_OSABI,			/* elf_osabi  */
+  ELF_MAXPAGESIZE,		/* maxpagesize */
+  ELF_MINPAGESIZE,		/* minpagesize */
+  ELF_COMMONPAGESIZE,		/* commonpagesize */
+  ELF_DYNAMIC_SEC_FLAGS,	/* dynamic_sec_flags */
+  elf_backend_arch_data,
+  elf_info_to_howto,
+  elf_info_to_howto_rel,
+  elf_backend_sym_is_global,
+  elf_backend_object_p,
+  elf_backend_symbol_processing,
+  elf_backend_symbol_table_processing,
+  elf_backend_get_symbol_type,
+  elf_backend_archive_symbol_lookup,
+  elf_backend_name_local_section_symbols,
+  elf_backend_section_processing,
+  elf_backend_section_from_shdr,
+  elf_backend_section_flags,
+  elf_backend_get_sec_type_attr,
+  elf_backend_section_from_phdr,
+  elf_backend_fake_sections,
+  elf_backend_section_from_bfd_section,
+  elf_backend_add_symbol_hook,
+  elf_backend_link_output_symbol_hook,
+  elf_backend_create_dynamic_sections,
+  elf_backend_omit_section_dynsym,
+  elf_backend_relocs_compatible,
+  elf_backend_check_relocs,
+  elf_backend_check_directives,
+  elf_backend_notice_as_needed,
+  elf_backend_adjust_dynamic_symbol,
+  elf_backend_always_size_sections,
+  elf_backend_size_dynamic_sections,
+  elf_backend_init_index_section,
+  elf_backend_relocate_section,
+  elf_backend_finish_dynamic_symbol,
+  elf_backend_finish_dynamic_sections,
+  elf_backend_begin_write_processing,
+  elf_backend_final_write_processing,
+  elf_backend_additional_program_headers,
+  elf_backend_modify_segment_map,
+  elf_backend_modify_program_headers,
+  elf_backend_allow_non_load_phdr,
+  elf_backend_gc_keep,
+  elf_backend_gc_mark_dynamic_ref,
+  elf_backend_gc_mark_hook,
+  elf_backend_gc_mark_extra_sections,
+  elf_backend_gc_sweep_hook,
+  elf_backend_post_process_headers,
+  elf_backend_print_symbol_all,
+  elf_backend_output_arch_local_syms,
+  elf_backend_output_arch_syms,
+  elf_backend_filter_implib_symbols,
+  elf_backend_copy_indirect_symbol,
+  elf_backend_hide_symbol,
+  elf_backend_fixup_symbol,
+  elf_backend_merge_symbol_attribute,
+  elf_backend_get_target_dtag,
+  elf_backend_ignore_undef_symbol,
+  elf_backend_emit_relocs,
+  elf_backend_update_relocs,
+  elf_backend_count_relocs,
+  elf_backend_count_additional_relocs,
+  elf_backend_sort_relocs_p,
+  elf_backend_grok_prstatus,
+  elf_backend_grok_psinfo,
+  elf_backend_write_core_note,
+  elf_backend_lookup_section_flags_hook,
+  elf_backend_reloc_type_class,
+  elf_backend_discard_info,
+  elf_backend_ignore_discarded_relocs,
+  elf_backend_action_discarded,
+  elf_backend_eh_frame_address_size,
+  elf_backend_can_make_relative_eh_frame,
+  elf_backend_can_make_lsda_relative_eh_frame,
+  elf_backend_encode_eh_address,
+  elf_backend_write_section,
+  elf_backend_mips_irix_compat,
+  elf_backend_mips_rtype_to_howto,
+  elf_backend_ecoff_debug_swap,
+  elf_backend_bfd_from_remote_memory,
+  elf_backend_plt_sym_val,
+  elf_backend_common_definition,
+  elf_backend_common_section_index,
+  elf_backend_common_section,
+  elf_backend_merge_symbol,
+  elf_backend_hash_symbol,
+  elf_backend_is_function_type,
+  elf_backend_maybe_function_sym,
+  elf_backend_get_reloc_section,
+  elf_backend_copy_special_section_fields,
+  elf_backend_link_order_error_handler,
+  elf_backend_relplt_name,
+  ELF_MACHINE_ALT1,
+  ELF_MACHINE_ALT2,
+  &elf_backend_size_info,
+  elf_backend_special_sections,
+  elf_backend_got_header_size,
+  elf_backend_got_elt_size,
+  elf_backend_obj_attrs_vendor,
+  elf_backend_obj_attrs_section,
+  elf_backend_obj_attrs_arg_type,
+  elf_backend_obj_attrs_section_type,
+  elf_backend_obj_attrs_order,
+  elf_backend_obj_attrs_handle_unknown,
+  elf_backend_compact_eh_encoding,
+  elf_backend_cant_unwind_opcode,
+  elf_backend_static_tls_alignment,
+  elf_backend_stack_align,
+  elf_backend_strtab_flags,
+  elf_backend_collect,
+  elf_backend_type_change_ok,
+  elf_backend_may_use_rel_p,
+  elf_backend_may_use_rela_p,
+  elf_backend_default_use_rela_p,
+  elf_backend_rela_plts_and_copies_p,
+  elf_backend_rela_normal,
+  elf_backend_dtrel_excludes_plt,
+  elf_backend_sign_extend_vma,
+  elf_backend_want_got_plt,
+  elf_backend_plt_readonly,
+  elf_backend_want_plt_sym,
+  elf_backend_plt_not_loaded,
+  elf_backend_plt_alignment,
+  elf_backend_can_gc_sections,
+  elf_backend_can_refcount,
+  elf_backend_want_got_sym,
+  elf_backend_want_dynbss,
+  elf_backend_want_dynrelro,
+  elf_backend_want_p_paddr_set_to_zero,
+  elf_backend_no_page_alias,
+  elf_backend_default_execstack,
+  elf_backend_caches_rawsize,
+  elf_backend_extern_protected_data,
+  elf_backend_always_renumber_dynsyms
+};
+
+/* Forward declaration: the TARGET_BIG_SYM table takes &TARGET_LITTLE_SYM for its alternative_target field before TARGET_LITTLE_SYM itself is defined.  */
+#ifdef TARGET_LITTLE_SYM
+extern const bfd_target TARGET_LITTLE_SYM;
+#endif
+
+#ifdef TARGET_BIG_SYM /* Big-endian target table; emitted only when TARGET_BIG_SYM is defined.  */
+const bfd_target TARGET_BIG_SYM =
+{
+  /* name: identify kind of target */
+  TARGET_BIG_NAME,
+
+  /* flavour: general indication about file */
+  bfd_target_elf_flavour,
+
+  /* byteorder: data is big endian */
+  BFD_ENDIAN_BIG,
+
+  /* header_byteorder: header is also big endian */
+  BFD_ENDIAN_BIG,
+
+  /* object_flags: mask of all file flags */
+  (HAS_RELOC | EXEC_P | HAS_LINENO | HAS_DEBUG | HAS_SYMS | HAS_LOCALS
+   | DYNAMIC | WP_TEXT | D_PAGED | BFD_COMPRESS | BFD_DECOMPRESS
+   | BFD_COMPRESS_GABI | BFD_CONVERT_ELF_COMMON | BFD_USE_ELF_STT_COMMON),
+
+  /* section_flags: mask of all section flags */
+  (SEC_HAS_CONTENTS | SEC_ALLOC | SEC_LOAD | SEC_RELOC | SEC_READONLY
+   | SEC_CODE | SEC_DATA | SEC_DEBUGGING | SEC_EXCLUDE | SEC_SORT_ENTRIES
+   | SEC_SMALL_DATA | SEC_MERGE | SEC_STRINGS | SEC_GROUP),
+
+  /* leading_symbol_char: is the first char of a user symbol
+     predictable, and if so what is it?  */
+  elf_symbol_leading_char,
+
+  /* ar_pad_char: pad character for filenames within an archive header.
+     FIXME:  this really has nothing to do with ELF, this is a characteristic
+     of the archiver and/or os and should be independently tunable.  */
+  '/',
+
+  /* ar_max_namelen: maximum number of characters in an archive header
+     FIXME:  this really has nothing to do with ELF, this is a characteristic
+     of the archiver and should be independently tunable.  The System V ABI,
+     Chapter 7 (Formats & Protocols), Archive section sets this as 15.  */
+  15,
+
+  elf_match_priority,	/* value of the elf_match_priority macro (default defined earlier in this file) */
+
+  /* Routines to byte-swap various sized integers from the data sections */
+  bfd_getb64, bfd_getb_signed_64, bfd_putb64,
+    bfd_getb32, bfd_getb_signed_32, bfd_putb32,
+    bfd_getb16, bfd_getb_signed_16, bfd_putb16,
+
+  /* Routines to byte-swap various sized integers from the file headers */
+  bfd_getb64, bfd_getb_signed_64, bfd_putb64,
+    bfd_getb32, bfd_getb_signed_32, bfd_putb32,
+    bfd_getb16, bfd_getb_signed_16, bfd_putb16,
+
+  /* bfd_check_format: check the format of a file being read */
+  { _bfd_dummy_target,		/* unknown format */
+    bfd_elf64_object_p,		/* assembler/linker output (object file) */
+    bfd_elf64_archive_p,	/* an archive */
+    bfd_elf64_core_file_p	/* a core file */
+  },
+
+  /* bfd_set_format: set the format of a file being written */
+  { bfd_false,
+    bfd_elf64_mkobject,
+    bfd_elf64_mkarchive,
+    bfd_elf64_mkcorefile
+  },
+
+  /* bfd_write_contents: write cached information into a file being written */
+  { bfd_false,
+    bfd_elf64_write_object_contents,
+    bfd_elf64_write_archive_contents,
+    bfd_elf64_write_corefile_contents,
+  },
+
+  BFD_JUMP_TABLE_GENERIC (bfd_elf64),
+  BFD_JUMP_TABLE_COPY (bfd_elf64),
+  BFD_JUMP_TABLE_CORE (bfd_elf64),
+#ifdef bfd_elf64_archive_functions
+  BFD_JUMP_TABLE_ARCHIVE (bfd_elf64_archive),
+#elif defined USE_64_BIT_ARCHIVE
+  BFD_JUMP_TABLE_ARCHIVE (_bfd_archive_64_bit),
+#else /* neither macro defined: default archive jump table */
+  BFD_JUMP_TABLE_ARCHIVE (_bfd_archive_coff),
+#endif
+  BFD_JUMP_TABLE_SYMBOLS (bfd_elf64),
+  BFD_JUMP_TABLE_RELOCS (bfd_elf64),
+  BFD_JUMP_TABLE_WRITE (bfd_elf64),
+  BFD_JUMP_TABLE_LINK (bfd_elf64),
+  BFD_JUMP_TABLE_DYNAMIC (bfd_elf64),
+
+  /* Alternative endian target.  */
+#ifdef TARGET_LITTLE_SYM
+  & TARGET_LITTLE_SYM,
+#else
+  NULL,	/* TARGET_LITTLE_SYM is not defined */
+#endif
+
+  /* backend_data: the elf64_bed table defined above */
+  &elf64_bed
+};
+#endif /* TARGET_BIG_SYM */
+
+#ifdef TARGET_LITTLE_SYM /* Little-endian target table; emitted only when TARGET_LITTLE_SYM is defined.  */
+const bfd_target TARGET_LITTLE_SYM =
+{
+  /* name: identify kind of target */
+  TARGET_LITTLE_NAME,
+
+  /* flavour: general indication about file */
+  bfd_target_elf_flavour,
+
+  /* byteorder: data is little endian */
+  BFD_ENDIAN_LITTLE,
+
+  /* header_byteorder: header is also little endian */
+  BFD_ENDIAN_LITTLE,
+
+  /* object_flags: mask of all file flags */
+  (HAS_RELOC | EXEC_P | HAS_LINENO | HAS_DEBUG | HAS_SYMS | HAS_LOCALS
+   | DYNAMIC | WP_TEXT | D_PAGED | BFD_COMPRESS | BFD_DECOMPRESS
+   | BFD_COMPRESS_GABI | BFD_CONVERT_ELF_COMMON | BFD_USE_ELF_STT_COMMON),
+
+  /* section_flags: mask of all section flags */
+  (SEC_HAS_CONTENTS | SEC_ALLOC | SEC_LOAD | SEC_RELOC | SEC_READONLY
+   | SEC_CODE | SEC_DATA | SEC_DEBUGGING | SEC_EXCLUDE | SEC_SORT_ENTRIES
+   | SEC_SMALL_DATA | SEC_MERGE | SEC_STRINGS | SEC_GROUP),
+
+  /* leading_symbol_char: is the first char of a user symbol
+     predictable, and if so what is it?  */
+  elf_symbol_leading_char,
+
+  /* ar_pad_char: pad character for filenames within an archive header.
+     FIXME:  this really has nothing to do with ELF, this is a characteristic
+     of the archiver and/or os and should be independently tunable.  */
+  '/',
+
+  /* ar_max_namelen: maximum number of characters in an archive header
+     FIXME:  this really has nothing to do with ELF, this is a characteristic
+     of the archiver and should be independently tunable.  The System V ABI,
+     Chapter 7 (Formats & Protocols), Archive section sets this as 15.  */
+  15,
+
+  elf_match_priority,	/* value of the elf_match_priority macro (default defined earlier in this file) */
+
+  /* Routines to byte-swap various sized integers from the data sections */
+  bfd_getl64, bfd_getl_signed_64, bfd_putl64,
+    bfd_getl32, bfd_getl_signed_32, bfd_putl32,
+    bfd_getl16, bfd_getl_signed_16, bfd_putl16,
+
+  /* Routines to byte-swap various sized integers from the file headers */
+  bfd_getl64, bfd_getl_signed_64, bfd_putl64,
+    bfd_getl32, bfd_getl_signed_32, bfd_putl32,
+    bfd_getl16, bfd_getl_signed_16, bfd_putl16,
+
+  /* bfd_check_format: check the format of a file being read */
+  { _bfd_dummy_target,		/* unknown format */
+    bfd_elf64_object_p,		/* assembler/linker output (object file) */
+    bfd_elf64_archive_p,	/* an archive */
+    bfd_elf64_core_file_p	/* a core file */
+  },
+
+  /* bfd_set_format: set the format of a file being written */
+  { bfd_false,
+    bfd_elf64_mkobject,
+    bfd_elf64_mkarchive,
+    bfd_elf64_mkcorefile
+  },
+
+  /* bfd_write_contents: write cached information into a file being written */
+  { bfd_false,
+    bfd_elf64_write_object_contents,
+    bfd_elf64_write_archive_contents,
+    bfd_elf64_write_corefile_contents,
+  },
+
+  BFD_JUMP_TABLE_GENERIC (bfd_elf64),
+  BFD_JUMP_TABLE_COPY (bfd_elf64),
+  BFD_JUMP_TABLE_CORE (bfd_elf64),
+#ifdef bfd_elf64_archive_functions
+  BFD_JUMP_TABLE_ARCHIVE (bfd_elf64_archive),
+#elif defined USE_64_BIT_ARCHIVE
+  BFD_JUMP_TABLE_ARCHIVE (_bfd_archive_64_bit),
+#else /* neither macro defined: default archive jump table */
+  BFD_JUMP_TABLE_ARCHIVE (_bfd_archive_coff),
+#endif
+  BFD_JUMP_TABLE_SYMBOLS (bfd_elf64),
+  BFD_JUMP_TABLE_RELOCS (bfd_elf64),
+  BFD_JUMP_TABLE_WRITE (bfd_elf64),
+  BFD_JUMP_TABLE_LINK (bfd_elf64),
+  BFD_JUMP_TABLE_DYNAMIC (bfd_elf64),
+
+  /* Alternative endian target.  */
+#ifdef TARGET_BIG_SYM
+  & TARGET_BIG_SYM,
+#else
+  NULL,	/* TARGET_BIG_SYM is not defined */
+#endif
+
+  /* backend_data: the elf64_bed table defined above */
+  &elf64_bed
+};
+#endif /* TARGET_LITTLE_SYM */
diff --git a/utils/gapy/gen-debug-info-src/ext/bfd/libbfd.a b/utils/gapy/gen-debug-info-src/ext/bfd/libbfd.a
new file mode 100644
index 000000000..5b848f563
Binary files /dev/null and b/utils/gapy/gen-debug-info-src/ext/bfd/libbfd.a differ
diff --git a/utils/gapy/gen-debug-info-src/ext/bfd/stmp-bfd-h b/utils/gapy/gen-debug-info-src/ext/bfd/stmp-bfd-h
new file mode 100644
index 000000000..e69de29bb
diff --git a/utils/gapy/gen-debug-info-src/ext/bfd/targmatch.h b/utils/gapy/gen-debug-info-src/ext/bfd/targmatch.h
new file mode 100644
index 000000000..88fb6bcf9
--- /dev/null
+++ b/utils/gapy/gen-debug-info-src/ext/bfd/targmatch.h
@@ -0,0 +1,2923 @@
+#ifdef BFD64
+#if !defined (SELECT_VECS) || defined (HAVE_aarch64_mach_o_vec)
+
+{ "aarch64-*-darwin*",
+&aarch64_mach_o_vec },
+#endif
+
+
+
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_aarch64_elf64_le_vec)
+
+{ "aarch64-*-elf", NULL },{ "aarch64-*-rtems*",
+&aarch64_elf64_le_vec },
+#endif
+
+
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_aarch64_elf64_be_vec)
+
+{ "aarch64_be-*-elf",
+&aarch64_elf64_be_vec },
+#endif
+
+
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_aarch64_elf64_le_vec)
+
+{ "aarch64-*-freebsd*",
+&aarch64_elf64_le_vec },
+#endif
+
+
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_aarch64_elf64_le_vec)
+
+{ "aarch64-*-fuchsia*",
+&aarch64_elf64_le_vec },
+#endif
+
+
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_aarch64_elf64_le_cloudabi_vec)
+
+{ "aarch64-*-cloudabi*",
+&aarch64_elf64_le_cloudabi_vec },
+#endif
+
+
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_aarch64_elf64_le_vec)
+
+{ "aarch64-*-linux*",
+&aarch64_elf64_le_vec },
+#endif
+
+
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_aarch64_elf64_be_vec)
+
+{ "aarch64_be-*-linux*",
+&aarch64_elf64_be_vec },
+#endif
+
+
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_alpha_elf64_fbsd_vec)
+
+{ "alpha*-*-freebsd*", NULL },{ "alpha*-*-kfreebsd*-gnu",
+&alpha_elf64_fbsd_vec },
+#endif
+
+
+
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_alpha_elf64_vec)
+
+{ "alpha*-*-netbsd*", NULL },{ "alpha*-*-openbsd*",
+&alpha_elf64_vec },
+#endif
+
+
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_alpha_ecoff_le_vec)
+
+{ "alpha*-*-netware*",
+&alpha_ecoff_le_vec },
+#endif
+
+
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_alpha_ecoff_le_vec)
+
+{ "alpha*-*-linux*ecoff*",
+&alpha_ecoff_le_vec },
+#endif
+
+
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_alpha_elf64_vec)
+
+{ "alpha*-*-linux-*", NULL },{ "alpha*-*-elf*",
+&alpha_elf64_vec },
+#endif
+
+
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_alpha_vms_vec)
+
+{ "alpha*-*-*vms*",
+&alpha_vms_vec },
+#endif
+
+
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_alpha_ecoff_le_vec)
+
+{ "alpha*-*-*",
+&alpha_ecoff_le_vec },
+#endif
+
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_ia64_elf64_le_vec)
+
+{ "ia64*-*-freebsd*", NULL },{ "ia64*-*-netbsd*", NULL },{ "ia64*-*-linux-*", NULL },{ "ia64*-*-elf*", NULL },{ "ia64*-*-kfreebsd*-gnu",
+&ia64_elf64_le_vec },
+#endif
+
+
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_ia64_elf32_hpux_be_vec)
+
+{ "ia64*-*-hpux*",
+&ia64_elf32_hpux_be_vec },
+#endif
+
+
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_ia64_elf64_vms_vec)
+
+{ "ia64*-*-*vms*",
+&ia64_elf64_vms_vec },
+#endif
+
+
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_sparc_elf64_fbsd_vec)
+
+{ "sparc64-*-freebsd*", NULL },{ "sparc64-*-kfreebsd*-gnu",
+&sparc_elf64_fbsd_vec },
+#endif
+
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_sparc_elf64_vec)
+
+{ "sparc64-*-netbsd*", NULL },{ "sparc64-*-openbsd*",
+&sparc_elf64_vec },
+#endif
+
+
+
+    
+#endif /* BFD64 */
+
+#if !defined (SELECT_VECS) || defined (HAVE_am33_elf32_linux_vec)
+
+{ "am33_2.0-*-linux*",
+&am33_elf32_linux_vec },
+#endif
+
+    
+
+#if !defined (SELECT_VECS) || defined (HAVE_arc_elf32_be_vec)
+
+{ "arc*eb-*-elf*", NULL },{ "arc*eb-*-linux*",
+&arc_elf32_be_vec },
+#endif
+
+
+    
+
+#if !defined (SELECT_VECS) || defined (HAVE_arc_elf32_le_vec)
+
+{ "arc*-*-elf*", NULL },{ "arc*-*-linux*",
+&arc_elf32_le_vec },
+#endif
+
+
+    
+
+#if !defined (SELECT_VECS) || defined (HAVE_arm_mach_o_vec)
+
+{ "arm-*-darwin*",
+&arm_mach_o_vec },
+#endif
+
+
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_arm_elf32_le_vec)
+
+{ "arm-*-fuchsia*",
+&arm_elf32_le_vec },
+#endif
+
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_arm_elf32_nacl_le_vec)
+
+{ "arm-*-nacl*",
+&arm_elf32_nacl_le_vec },
+#endif
+
+
+
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_arm_elf32_nacl_be_vec)
+
+{ "armeb-*-nacl*",
+&arm_elf32_nacl_be_vec },
+#endif
+
+
+
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_arm_elf32_be_vec)
+
+{ "armeb-*-netbsdelf*",
+&arm_elf32_be_vec },
+#endif
+
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_arm_elf32_le_vec)
+
+{ "arm-*-netbsdelf*",
+&arm_elf32_le_vec },
+#endif
+
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_arm_aout_nbsd_vec)
+
+{ "arm-*-netbsd*", NULL },{ "arm-*-openbsd*",
+&arm_aout_nbsd_vec },
+#endif
+
+
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_arm_elf32_le_vec)
+
+{ "arm-*-nto*", NULL },{ "nto*arm*",
+&arm_elf32_le_vec },
+#endif
+
+
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_arm_aout_riscix_vec)
+
+{ "arm-*-riscix*",
+&arm_aout_riscix_vec },
+#endif
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_arm_pe_epoc_le_vec)
+
+{ "arm-epoc-pe*",
+&arm_pe_epoc_le_vec },
+#endif
+
+
+
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_arm_pe_wince_le_vec)
+
+{ "arm-wince-pe", NULL },{ "arm-*-wince", NULL },{ "arm*-*-mingw32ce*", NULL },{ "arm*-*-cegcc*",
+&arm_pe_wince_le_vec },
+#endif
+
+
+
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_arm_pe_le_vec)
+
+{ "arm-*-pe*",
+&arm_pe_le_vec },
+#endif
+
+
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_arm_aout_le_vec)
+
+{ "arm-*-aout", NULL },{ "armel-*-aout",
+&arm_aout_le_vec },
+#endif
+
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_arm_aout_be_vec)
+
+{ "armeb-*-aout",
+&arm_aout_be_vec },
+#endif
+
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_arm_coff_le_vec)
+
+{ "arm-*-coff",
+&arm_coff_le_vec },
+#endif
+
+
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_arm_elf32_le_vec)
+
+{ "arm-*-phoenix*",
+&arm_elf32_le_vec },
+#endif
+
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_arm_elf32_le_vec)
+
+{ "arm-*-rtems*",
+&arm_elf32_le_vec },
+#endif
+
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_arm_elf32_be_vec)
+
+{ "armeb-*-elf", NULL },{ "arm*b-*-freebsd*", NULL },{ "arm*b-*-linux-*", NULL },{ "armeb-*-eabi*",
+&arm_elf32_be_vec },
+#endif
+
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_arm_elf32_le_vec)
+
+{ "arm-*-kaos*",
+&arm_elf32_le_vec },
+#endif
+
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_arm_elf32_le_vec)
+
+{ "arm-*-elf", NULL },{ "arm*-*-freebsd*", NULL },{ "arm*-*-linux-*", NULL },{ "arm*-*-conix*", NULL },
+{ "arm*-*-uclinux*", NULL },{ "arm-*-kfreebsd*-gnu", NULL },
+{ "arm*-*-eabi*",
+&arm_elf32_le_vec },
+#endif
+
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_arm_elf32_vxworks_le_vec)
+
+{ "arm*-*-vxworks", NULL },{ "arm*-*-windiss",
+&arm_elf32_vxworks_le_vec },
+#endif
+
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_arm_elf32_symbian_le_vec)
+
+{ "arm*-*-symbianelf*",
+&arm_elf32_symbian_le_vec },
+#endif
+
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_arm_elf32_le_vec)
+
+{ "arm9e-*-elf",
+&arm_elf32_le_vec },
+#endif
+
+
+    
+
+#if !defined (SELECT_VECS) || defined (HAVE_avr_elf32_vec)
+
+{ "avr-*-*",
+&avr_elf32_vec },
+#endif
+
+    
+
+#if !defined (SELECT_VECS) || defined (HAVE_bfin_elf32_vec)
+
+{ "bfin-*-*",
+&bfin_elf32_vec },
+#endif
+
+
+
+    
+
+#if !defined (SELECT_VECS) || defined (HAVE_tic30_aout_vec)
+
+{ "c30-*-*aout*", NULL },{ "tic30-*-*aout*",
+&tic30_aout_vec },
+#endif
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_tic30_coff_vec)
+
+{ "c30-*-*coff*", NULL },{ "tic30-*-*coff*",
+&tic30_coff_vec },
+#endif
+
+    
+
+#if !defined (SELECT_VECS) || defined (HAVE_tic4x_coff1_vec)
+
+{ "c4x-*-*coff*", NULL },{ "tic4x-*-*coff*", NULL },{ "tic4x-*-rtems*",
+&tic4x_coff1_vec },
+#endif
+
+
+
+    
+
+#if !defined (SELECT_VECS) || defined (HAVE_tic54x_coff1_vec)
+
+{ "c54x*-*-*coff*", NULL },{ "tic54x-*-*coff*",
+&tic54x_coff1_vec },
+#endif
+
+
+
+    
+
+#if !defined (SELECT_VECS) || defined (HAVE_cr16_elf32_vec)
+
+{ "cr16-*-elf*", NULL },{ "cr16*-*-uclinux*",
+&cr16_elf32_vec },
+#endif
+
+
+    
+
+#if !defined (SELECT_VECS) || defined (HAVE_cr16c_elf32_vec)
+
+{ "cr16c-*-elf*",
+&cr16c_elf32_vec },
+#endif
+
+
+    
+
+#if !defined (SELECT_VECS) || defined (HAVE_cris_aout_vec)
+
+{ "cris-*-*", NULL },{ "crisv32-*-*",
+&cris_aout_vec },
+#endif
+
+
+
+    
+
+#if !defined (SELECT_VECS) || defined (HAVE_crx_elf32_vec)
+
+{ "crx-*-elf*",
+&crx_elf32_vec },
+#endif
+
+
+    
+
+#if !defined (SELECT_VECS) || defined (HAVE_d10v_elf32_vec)
+
+{ "d10v-*-*",
+&d10v_elf32_vec },
+#endif
+
+    
+
+#if !defined (SELECT_VECS) || defined (HAVE_dlx_elf32_be_vec)
+
+{ "dlx-*-elf*",
+&dlx_elf32_be_vec },
+#endif
+
+
+    
+
+#if !defined (SELECT_VECS) || defined (HAVE_d30v_elf32_vec)
+
+{ "d30v-*-*",
+&d30v_elf32_vec },
+#endif
+
+    
+
+#if !defined (SELECT_VECS) || defined (HAVE_epiphany_elf32_vec)
+
+{ "epiphany-*-elf",
+&epiphany_elf32_vec },
+#endif
+
+    
+
+#if !defined (SELECT_VECS) || defined (HAVE_m68k_elf32_vec)
+
+{ "fido-*-elf*",
+&m68k_elf32_vec },
+#endif
+
+
+    
+
+#if !defined (SELECT_VECS) || defined (HAVE_fr30_elf32_vec)
+
+{ "fr30-*-elf",
+&fr30_elf32_vec },
+#endif
+
+    
+
+#if !defined (SELECT_VECS) || defined (HAVE_frv_elf32_vec)
+
+{ "frv-*-elf",
+&frv_elf32_vec },
+#endif
+
+
+    
+
+#if !defined (SELECT_VECS) || defined (HAVE_frv_elf32_fdpic_vec)
+
+{ "frv-*-*linux*",
+&frv_elf32_fdpic_vec },
+#endif
+
+
+    
+
+#if !defined (SELECT_VECS) || defined (HAVE_moxie_elf32_be_vec)
+
+{ "moxie-*-elf", NULL },{ "moxie-*-rtems*", NULL },{ "moxie-*-uclinux",
+&moxie_elf32_be_vec },
+#endif
+
+
+    
+
+#if !defined (SELECT_VECS) || defined (HAVE_moxie_elf32_le_vec)
+
+{ "moxie-*-moxiebox*",
+&moxie_elf32_le_vec },
+#endif
+
+    
+
+#if !defined (SELECT_VECS) || defined (HAVE_h8300_coff_vec)
+
+{ "h8300*-*-rtemscoff*",
+&h8300_coff_vec },
+#endif
+
+
+    
+
+#if !defined (SELECT_VECS) || defined (HAVE_h8300_elf32_vec)
+
+{ "h8300*-*-elf", NULL },{ "h8300*-*-rtems*",
+&h8300_elf32_vec },
+#endif
+
+
+    
+
+#if !defined (SELECT_VECS) || defined (HAVE_h8300_elf32_linux_vec)
+
+{ "h8300*-*-linux*",
+&h8300_elf32_linux_vec },
+#endif
+
+    
+
+#if !defined (SELECT_VECS) || defined (HAVE_h8300_coff_vec)
+
+{ "h8300*-*-*",
+&h8300_coff_vec },
+#endif
+
+
+    
+
+#if !defined (SELECT_VECS) || defined (HAVE_h8500_coff_vec)
+
+{ "h8500-*-*",
+&h8500_coff_vec },
+#endif
+
+
+    
+
+#ifdef BFD64
+#if !defined (SELECT_VECS) || defined (HAVE_hppa_elf64_linux_vec)
+
+{ "hppa*64*-*-linux-*",
+&hppa_elf64_linux_vec },
+#endif
+
+
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_hppa_elf64_vec)
+
+{ "hppa*64*-*-hpux11*",
+&hppa_elf64_vec },
+#endif
+
+
+
+
+    
+#endif
+
+#if !defined (SELECT_VECS) || defined (HAVE_hppa_elf32_linux_vec)
+
+{ "hppa*-*-linux-*",
+&hppa_elf32_linux_vec },
+#endif
+
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_hppa_elf32_nbsd_vec)
+
+{ "hppa*-*-netbsd*",
+&hppa_elf32_nbsd_vec },
+#endif
+
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_hppa_elf32_vec)
+
+{ "hppa*-*-*elf*", NULL },{ "hppa*-*-lites*", NULL },{ "hppa*-*-sysv4*", NULL },{ "hppa*-*-openbsd*",
+&hppa_elf32_vec },
+#endif
+
+
+    
+
+#if !defined (SELECT_VECS) || defined (HAVE_hppa_som_vec)
+
+{ "hppa*-*-bsd*",
+&hppa_som_vec },
+#endif
+
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_hppa_som_vec)
+
+{ "hppa*-*-hpux*", NULL },{ "hppa*-*-hiux*", NULL },{ "hppa*-*-mpeix*",
+&hppa_som_vec },
+#endif
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_hppa_som_vec)
+
+{ "hppa*-*-osf*",
+&hppa_som_vec },
+#endif
+
+
+    
+
+#if !defined (SELECT_VECS) || defined (HAVE_i370_elf32_vec)
+
+{ "i370-*-*",
+&i370_elf32_vec },
+#endif
+
+
+    
+
+#if !defined (SELECT_VECS) || defined (HAVE_i386_coff_vec)
+
+{ "i[3-7]86-*-sco3.2v5*coff",
+&i386_coff_vec },
+#endif
+
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_i386_elf32_vec)
+
+{ "i[3-7]86-*-sysv4*", NULL },{ "i[3-7]86-*-unixware*", NULL },
+{ "i[3-7]86-*-elf*", NULL },{ "i[3-7]86-*-sco3.2v5*", NULL },
+{ "i[3-7]86-*-dgux*", NULL },{ "i[3-7]86-*-sysv5*",
+&i386_elf32_vec },
+#endif
+
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_i386_elf32_sol2_vec)
+
+{ "i[3-7]86-*-solaris2*",
+&i386_elf32_sol2_vec },
+#endif
+
+
+
+
+    
+#ifdef BFD64
+#if !defined (SELECT_VECS) || defined (HAVE_i386_elf32_sol2_vec)
+
+{ "x86_64-*-solaris2*",
+&i386_elf32_sol2_vec },
+#endif
+
+
+
+    
+#endif
+#if !defined (SELECT_VECS) || defined (HAVE_i386_elf32_vec)
+
+{ "i[3-7]86-*-kaos*",
+&i386_elf32_vec },
+#endif
+
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_i386_elf32_vec)
+
+{ "i[3-7]86-*-nto*",
+&i386_elf32_vec },
+#endif
+
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_i386_elf32_vec)
+
+{ "i[3-7]86-*-aros*",
+&i386_elf32_vec },
+#endif
+
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_i386_elf32_vec)
+
+{ "i[3-7]86-*-chorus*",
+&i386_elf32_vec },
+#endif
+
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_i386_elf32_vec)
+
+{ "i[3-7]86-*-dicos*",
+&i386_elf32_vec },
+#endif
+
+
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_i386_coff_go32_vec)
+
+{ "*-*-msdosdjgpp*", NULL },{ "*-*-go32*",
+&i386_coff_go32_vec },
+#endif
+
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_i386_coff_vec)
+
+{ "i[3-7]86-*-sysv*", NULL },{ "i[3-7]86-*-isc*", NULL },{ "i[3-7]86-*-sco*", NULL },{ "i[3-7]86-*-coff", NULL },
+{ "i[3-7]86-*-aix*",
+&i386_coff_vec },
+#endif
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_i386_elf32_vec)
+
+{ "i[3-7]86-*-rtems*",
+&i386_elf32_vec },
+#endif
+
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_i386_mach_o_vec)
+
+{ "i[3-7]86-*-darwin*", NULL },{ "i[3-7]86-*-macos10*", NULL },{ "i[3-7]86-*-rhapsody*",
+&i386_mach_o_vec },
+#endif
+
+
+
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_i386_aout_dynix_vec)
+
+{ "i[3-7]86-sequent-bsd*",
+&i386_aout_dynix_vec },
+#endif
+
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_i386_aout_bsd_vec)
+
+{ "i[3-7]86-*-bsd*",
+&i386_aout_bsd_vec },
+#endif
+
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_i386_elf32_vec)
+
+{ "i[3-7]86-*-dragonfly*",
+&i386_elf32_vec },
+#endif
+
+
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_i386_aout_fbsd_vec)
+
+{ "i[3-7]86-*-freebsdaout*", NULL },{ "i[3-7]86-*-freebsd[12].*", NULL },
+{ "i[3-7]86-*-freebsd[12]",
+&i386_aout_fbsd_vec },
+#endif
+
+
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_i386_elf32_fbsd_vec)
+
+{ "i[3-7]86-*-freebsd*", NULL },{ "i[3-7]86-*-kfreebsd*-gnu",
+&i386_elf32_fbsd_vec },
+#endif
+
+
+
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_i386_elf32_vec)
+
+{ "i[3-7]86-*-netbsdelf*", NULL },{ "i[3-7]86-*-netbsd*-gnu*", NULL },{ "i[3-7]86-*-knetbsd*-gnu",
+&i386_elf32_vec },
+#endif
+
+
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_i386_pe_vec)
+
+{ "i[3-7]86-*-netbsdpe*",
+&i386_pe_vec },
+#endif
+
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_i386_aout_nbsd_vec)
+
+{ "i[3-7]86-*-netbsdaout*", NULL },{ "i[3-7]86-*-netbsd*", NULL },
+{ "i[3-7]86-*-openbsd[0-2].*", NULL },{ "i[3-7]86-*-openbsd3.[0-3]",
+&i386_aout_nbsd_vec },
+#endif
+
+
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_i386_elf32_vec)
+
+{ "i[3-7]86-*-openbsd*",
+&i386_elf32_vec },
+#endif
+
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_i386_elf32_vec)
+
+{ "i[3-7]86-*-netware*",
+&i386_elf32_vec },
+#endif
+
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_i386_aout_linux_vec)
+
+{ "i[3-7]86-*-linux*aout*",
+&i386_aout_linux_vec },
+#endif
+
+
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_i386_elf32_vec)
+
+{ "i[3-7]86-*-linux-*",
+&i386_elf32_vec },
+#endif
+
+
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_i386_elf32_nacl_vec)
+
+{ "i[3-7]86-*-nacl*",
+&i386_elf32_nacl_vec },
+#endif
+
+
+
+
+    
+#ifdef BFD64
+#if !defined (SELECT_VECS) || defined (HAVE_x86_64_elf64_cloudabi_vec)
+
+{ "x86_64-*-cloudabi*",
+&x86_64_elf64_cloudabi_vec },
+#endif
+
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_x86_64_mach_o_vec)
+
+{ "x86_64-*-darwin*",
+&x86_64_mach_o_vec },
+#endif
+
+
+
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_x86_64_elf64_vec)
+
+{ "x86_64-*-dicos*",
+&x86_64_elf64_vec },
+#endif
+
+
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_x86_64_elf64_vec)
+
+{ "x86_64-*-elf*", NULL },{ "x86_64-*-rtems*", NULL },{ "x86_64-*-fuchsia",
+&x86_64_elf64_vec },
+#endif
+
+
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_x86_64_elf64_vec)
+
+{ "x86_64-*-dragonfly*",
+&x86_64_elf64_vec },
+#endif
+
+
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_x86_64_elf64_fbsd_vec)
+
+{ "x86_64-*-freebsd*", NULL },{ "x86_64-*-kfreebsd*-gnu",
+&x86_64_elf64_fbsd_vec },
+#endif
+
+
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_x86_64_elf64_vec)
+
+{ "x86_64-*-netbsd*", NULL },{ "x86_64-*-openbsd*",
+&x86_64_elf64_vec },
+#endif
+
+
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_x86_64_elf64_vec)
+
+{ "x86_64-*-linux-*",
+&x86_64_elf64_vec },
+#endif
+
+
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_x86_64_elf32_nacl_vec)
+
+{ "x86_64-*-nacl*",
+&x86_64_elf32_nacl_vec },
+#endif
+
+
+
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_x86_64_pe_vec)
+
+{ "x86_64-*-mingw*", NULL },{ "x86_64-*-pe", NULL },{ "x86_64-*-pep", NULL },{ "x86_64-*-cygwin",
+&x86_64_pe_vec },
+#endif
+
+
+
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_x86_64_elf64_vec)
+
+{ "x86_64-*-rdos*",
+&x86_64_elf64_vec },
+#endif
+
+
+    
+#endif
+#if !defined (SELECT_VECS) || defined (HAVE_i386_elf32_vec)
+
+{ "i[3-7]86-*-lynxos*",
+&i386_elf32_vec },
+#endif
+
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_i386_elf32_vec)
+
+{ "i[3-7]86-*-gnu*",
+&i386_elf32_vec },
+#endif
+
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_i386_aout_mach3_vec)
+
+{ "i[3-7]86-*-mach*", NULL },{ "i[3-7]86-*-osf1mk*",
+&i386_aout_mach3_vec },
+#endif
+
+
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_i386_aout_os9k_vec)
+
+{ "i[3-7]86-*-os9k",
+&i386_aout_os9k_vec },
+#endif
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_i386_aout_vec)
+
+{ "i[3-7]86-*-msdos*",
+&i386_aout_vec },
+#endif
+
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_i386_elf32_vec)
+
+{ "i[3-7]86-*-moss*",
+&i386_elf32_vec },
+#endif
+
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_i386_pe_vec)
+
+{ "i[3-7]86-*-beospe*",
+&i386_pe_vec },
+#endif
+
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_i386_elf32_vec)
+
+{ "i[3-7]86-*-beoself*", NULL },{ "i[3-7]86-*-beos*",
+&i386_elf32_vec },
+#endif
+
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_i386_pei_vec)
+
+{ "i[3-7]86-*-interix*",
+&i386_pei_vec },
+#endif
+
+
+
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_i386_elf32_vec)
+
+{ "i[3-7]86-*-rdos*",
+&i386_elf32_vec },
+#endif
+
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_i386_pe_vec)
+
+{ "i[3-7]86-*-mingw32*", NULL },{ "i[3-7]86-*-cygwin*", NULL },{ "i[3-7]86-*-winnt", NULL },{ "i[3-7]86-*-pe",
+&i386_pe_vec },
+#endif
+
+
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_i386_coff_vec)
+
+{ "i[3-7]86-none-*",
+&i386_coff_vec },
+#endif
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_i386_aout_vec)
+
+{ "i[3-7]86-*-aout*", NULL },{ "i[3-7]86*-*-vsta*",
+&i386_aout_vec },
+#endif
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_i386_elf32_vxworks_vec)
+
+{ "i[3-7]86-*-vxworks*",
+&i386_elf32_vxworks_vec },
+#endif
+
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_i386_elf32_vec)
+
+{ "i[3-7]86-*-chaos",
+&i386_elf32_vec },
+#endif
+
+
+    
+
+#if !defined (SELECT_VECS) || defined (HAVE_i860_coff_vec)
+
+{ "i860-*-mach3*", NULL },{ "i860-*-osf1*", NULL },{ "i860-*-coff*",
+&i860_coff_vec },
+#endif
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_i860_elf32_le_vec)
+
+{ "i860-stardent-sysv4*", NULL },{ "i860-stardent-elf*",
+&i860_elf32_le_vec },
+#endif
+
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_i860_elf32_vec)
+
+{ "i860-*-sysv4*", NULL },{ "i860-*-elf*",
+&i860_elf32_vec },
+#endif
+
+    
+
+#if !defined (SELECT_VECS) || defined (HAVE_bout_le_vec)
+
+{ "i960-*-vxworks4*", NULL },{ "i960-*-vxworks5.0",
+&bout_le_vec },
+#endif
+
+
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_icoff_le_vec)
+
+{ "i960-*-vxworks5.*", NULL },{ "i960-*-coff*", NULL },{ "i960-*-sysv*",
+&icoff_le_vec },
+#endif
+
+
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_bout_le_vec)
+
+{ "i960-*-vxworks*", NULL },{ "i960-*-aout*", NULL },{ "i960-*-bout*", NULL },{ "i960-*-nindy*",
+&bout_le_vec },
+#endif
+
+
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_i960_elf32_vec)
+
+{ "i960-*-elf*",
+&i960_elf32_vec },
+#endif
+
+
+    
+
+#if !defined (SELECT_VECS) || defined (HAVE_ip2k_elf32_vec)
+
+{ "ip2k-*-elf",
+&ip2k_elf32_vec },
+#endif
+
+    
+
+#if !defined (SELECT_VECS) || defined (HAVE_iq2000_elf32_vec)
+
+{ "iq2000-*-elf",
+&iq2000_elf32_vec },
+#endif
+
+    
+
+#if !defined (SELECT_VECS) || defined (HAVE_lm32_elf32_vec)
+
+{ "lm32-*-elf", NULL },{ "lm32-*-rtems*",
+&lm32_elf32_vec },
+#endif
+
+
+    
+
+#if !defined (SELECT_VECS) || defined (HAVE_lm32_elf32_fdpic_vec)
+
+{ "lm32-*-*linux*",
+&lm32_elf32_fdpic_vec },
+#endif
+
+
+    
+
+#if !defined (SELECT_VECS) || defined (HAVE_m32c_elf32_vec)
+
+{ "m32c-*-elf", NULL },{ "m32c-*-rtems*",
+&m32c_elf32_vec },
+#endif
+
+    
+
+#if !defined (SELECT_VECS) || defined (HAVE_m32r_elf32_linux_le_vec)
+
+{ "m32r*le-*-linux*",
+&m32r_elf32_linux_le_vec },
+#endif
+
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_m32r_elf32_linux_vec)
+
+{ "m32r*-*-linux*",
+&m32r_elf32_linux_vec },
+#endif
+
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_m32r_elf32_le_vec)
+
+{ "m32r*le-*-*",
+&m32r_elf32_le_vec },
+#endif
+
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_m32r_elf32_vec)
+
+{ "m32r-*-*",
+&m32r_elf32_vec },
+#endif
+
+    
+
+#if !defined (SELECT_VECS) || defined (HAVE_m68hc11_elf32_vec)
+
+{ "m68hc11-*-*", NULL },{ "m6811-*-*",
+&m68hc11_elf32_vec },
+#endif
+
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_m68hc12_elf32_vec)
+
+{ "m68hc12-*-*", NULL },{ "m6812-*-*",
+&m68hc12_elf32_vec },
+#endif
+
+
+    
+
+#if !defined (SELECT_VECS) || defined (HAVE_m68k_coff_sysv_vec)
+
+{ "m68*-motorola-sysv*",
+&m68k_coff_sysv_vec },
+#endif
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_m68k_aout_hp300bsd_vec)
+
+{ "m68*-hp-bsd*",
+&m68k_aout_hp300bsd_vec },
+#endif
+
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_aout0_be_vec)
+
+{ "m68*-*-aout*",
+&aout0_be_vec },
+#endif
+
+
+
+
+
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_m68k_elf32_vec)
+
+{ "m68*-*-elf*", NULL },{ "m68*-*-sysv4*", NULL },{ "m68*-*-uclinux*",
+&m68k_elf32_vec },
+#endif
+
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_m68k_elf32_vec)
+
+{ "m68*-*-rtems*",
+&m68k_elf32_vec },
+#endif
+
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_m68k_coff_vec)
+
+{ "m68*-*-coff*", NULL },{ "m68*-*-sysv*",
+&m68k_coff_vec },
+#endif
+
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_m68k_aout_hp300hpux_vec)
+
+{ "m68*-*-hpux*",
+&m68k_aout_hp300hpux_vec },
+#endif
+
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_m68k_aout_linux_vec)
+
+{ "m68*-*-linux*aout*",
+&m68k_aout_linux_vec },
+#endif
+
+
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_m68k_elf32_vec)
+
+{ "m68*-*-linux-*",
+&m68k_elf32_vec },
+#endif
+
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_m68k_elf32_vec)
+
+{ "m68*-*-gnu*",
+&m68k_elf32_vec },
+#endif
+
+
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_m68k_aout_4knbsd_vec)
+
+{ "m68*-hp*-netbsd*",
+&m68k_aout_4knbsd_vec },
+#endif
+
+
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_m68k_elf32_vec)
+
+{ "m68*-*-netbsdelf*",
+&m68k_elf32_vec },
+#endif
+
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_m68k_aout_nbsd_vec)
+
+{ "m68*-*-netbsdaout*", NULL },{ "m68*-*-netbsd*",
+&m68k_aout_nbsd_vec },
+#endif
+
+
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_m68k_aout_nbsd_vec)
+
+{ "m68*-*-openbsd*",
+&m68k_aout_nbsd_vec },
+#endif
+
+
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_sparc_aout_sunos_be_vec)
+
+{ "m68*-*-sunos*", NULL },{ "m68*-*-os68k*", NULL },{ "m68*-*-vxworks*", NULL },{ "m68*-netx-*", NULL },
+{ "m68*-*-bsd*", NULL },{ "m68*-*-vsta*",
+&sparc_aout_sunos_be_vec },
+#endif
+
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_sparc_aout_sunos_be_vec)
+
+{ "m68*-ericsson-*",
+&sparc_aout_sunos_be_vec },
+#endif
+
+
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_m68k_elf32_vec)
+
+{ "m68*-cbm-*",
+&m68k_elf32_vec },
+#endif
+
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_m68k_elf32_vec)
+
+{ "m68*-*-psos*",
+&m68k_elf32_vec },
+#endif
+
+
+
+    
+
+#if !defined (SELECT_VECS) || defined (HAVE_m88k_elf32_vec)
+
+{ "m88*-harris-cxux*", NULL },{ "m88*-*-dgux*", NULL },{ "m88*-*-sysv4*",
+&m88k_elf32_vec },
+#endif
+
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_m88k_aout_mach3_vec)
+
+{ "m88*-*-mach3*",
+&m88k_aout_mach3_vec },
+#endif
+
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_m88k_aout_obsd_vec)
+
+{ "m88*-*-openbsd*",
+&m88k_aout_obsd_vec },
+#endif
+
+
+   
+#if !defined (SELECT_VECS) || defined (HAVE_m88k_coff_bcs_vec)
+
+{ "m88*-*-*",
+&m88k_coff_bcs_vec },
+#endif
+
+
+    
+
+#if !defined (SELECT_VECS) || defined (HAVE_mcore_elf32_be_vec)
+
+{ "mcore-*-elf",
+&mcore_elf32_be_vec },
+#endif
+
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_mcore_pe_be_vec)
+
+{ "mcore-*-pe",
+&mcore_pe_be_vec },
+#endif
+
+
+    
+
+#if !defined (SELECT_VECS) || defined (HAVE_mep_elf32_vec)
+
+{ "mep-*-elf",
+&mep_elf32_vec },
+#endif
+
+
+    
+
+#if !defined (SELECT_VECS) || defined (HAVE_metag_elf32_vec)
+
+{ "metag-*-*",
+&metag_elf32_vec },
+#endif
+
+
+    
+
+#if !defined (SELECT_VECS) || defined (HAVE_microblaze_elf32_le_vec)
+
+{ "microblazeel*-*",
+&microblaze_elf32_le_vec },
+#endif
+
+
+    
+
+#if !defined (SELECT_VECS) || defined (HAVE_microblaze_elf32_vec)
+
+{ "microblaze*-*",
+&microblaze_elf32_vec },
+#endif
+
+
+    
+
+#if !defined (SELECT_VECS) || defined (HAVE_mips_ecoff_be_vec)
+
+{ "mips*-big-*",
+&mips_ecoff_be_vec },
+#endif
+
+
+    
+#ifdef BFD64
+#if !defined (SELECT_VECS) || defined (HAVE_mips_elf32_trad_le_vec)
+
+{ "mips*el-*-netbsd*",
+&mips_elf32_trad_le_vec },
+#endif
+
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_mips_elf32_trad_be_vec)
+
+{ "mips*-*-netbsd*",
+&mips_elf32_trad_be_vec },
+#endif
+
+
+    
+#endif
+#if !defined (SELECT_VECS) || defined (HAVE_mips_ecoff_le_vec)
+
+{ "mips*-dec-*", NULL },{ "mips*el-*-ecoff*",
+&mips_ecoff_le_vec },
+#endif
+
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_mips_ecoff_be_vec)
+
+{ "mips*-*-ecoff*",
+&mips_ecoff_be_vec },
+#endif
+
+
+    
+#ifdef BFD64
+#if !defined (SELECT_VECS) || defined (HAVE_mips_elf32_n_be_vec)
+
+{ "mips*-*-irix6*",
+&mips_elf32_n_be_vec },
+#endif
+
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_mips_elf32_n_le_vec)
+
+{ "mips64*-ps2-elf*",
+&mips_elf32_n_le_vec },
+#endif
+
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_mips_elf32_le_vec)
+
+{ "mips*-ps2-elf*",
+&mips_elf32_le_vec },
+#endif
+
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_mips_elf32_be_vec)
+
+{ "mips*-*-irix5*",
+&mips_elf32_be_vec },
+#endif
+
+
+    
+#endif
+#if !defined (SELECT_VECS) || defined (HAVE_mips_ecoff_be_vec)
+
+{ "mips*-sgi-*", NULL },{ "mips*-*-bsd*",
+&mips_ecoff_be_vec },
+#endif
+
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_mips_ecoff_bele_vec)
+
+{ "mips*-*-lnews*",
+&mips_ecoff_bele_vec },
+#endif
+
+
+    
+#ifdef BFD64
+#if !defined (SELECT_VECS) || defined (HAVE_mips_elf32_trad_be_vec)
+
+{ "mips*-*-sysv4*",
+&mips_elf32_trad_be_vec },
+#endif
+
+
+    
+#endif
+#if !defined (SELECT_VECS) || defined (HAVE_mips_ecoff_be_vec)
+
+{ "mips*-*-sysv*", NULL },{ "mips*-*-riscos*",
+&mips_ecoff_be_vec },
+#endif
+
+
+    
+#ifdef BFD64
+#if !defined (SELECT_VECS) || defined (HAVE_mips_elf32_vxworks_le_vec)
+
+{ "mips*el-*-vxworks*",
+&mips_elf32_vxworks_le_vec },
+#endif
+
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_mips_elf32_vxworks_be_vec)
+
+{ "mips*-*-vxworks*",
+&mips_elf32_vxworks_be_vec },
+#endif
+
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_mips_elf32_trad_le_vec)
+
+{ "mips*el-sde-elf*",
+&mips_elf32_trad_le_vec },
+#endif
+
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_mips_elf32_trad_be_vec)
+
+{ "mips*-sde-elf*", NULL },{ "mips*-mti-elf*", NULL },{ "mips*-img-elf*",
+&mips_elf32_trad_be_vec },
+#endif
+
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_mips_elf32_le_vec)
+
+{ "mips*el-*-elf*", NULL },{ "mips*-*-chorus*",
+&mips_elf32_le_vec },
+#endif
+
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_mips_elf32_be_vec)
+
+{ "mips*-*-elf*", NULL },{ "mips*-*-rtems*", NULL },{ "mips*-*-windiss", NULL },{ "mips*-*-none",
+&mips_elf32_be_vec },
+#endif
+
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_mips_elf64_trad_be_vec)
+
+{ "mips64*-*-openbsd*",
+&mips_elf64_trad_be_vec },
+#endif
+
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_mips_elf32_le_vec)
+
+{ "mips*el-*-openbsd*",
+&mips_elf32_le_vec },
+#endif
+
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_mips_elf32_be_vec)
+
+{ "mips*-*-openbsd*",
+&mips_elf32_be_vec },
+#endif
+
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_mips_elf32_ntrad_le_vec)
+
+{ "mips64*el-*-linux*",
+&mips_elf32_ntrad_le_vec },
+#endif
+
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_mips_elf32_ntrad_be_vec)
+
+{ "mips64*-*-linux*",
+&mips_elf32_ntrad_be_vec },
+#endif
+
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_mips_elf32_trad_le_vec)
+
+{ "mips*el-*-linux*",
+&mips_elf32_trad_le_vec },
+#endif
+
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_mips_elf32_trad_be_vec)
+
+{ "mips*-*-linux*",
+&mips_elf32_trad_be_vec },
+#endif
+
+
+    
+
+#if !defined (SELECT_VECS) || defined (HAVE_mips_elf32_ntradfbsd_le_vec)
+
+{ "mips64*el-*-freebsd*", NULL },{ "mips64*el-*-kfreebsd*-gnu",
+&mips_elf32_ntradfbsd_le_vec },
+#endif
+
+
+
+
+    
+
+#if !defined (SELECT_VECS) || defined (HAVE_mips_elf32_ntradfbsd_be_vec)
+
+{ "mips64*-*-freebsd*", NULL },{ "mips64*-*-kfreebsd*-gnu",
+&mips_elf32_ntradfbsd_be_vec },
+#endif
+
+
+
+
+    
+
+#if !defined (SELECT_VECS) || defined (HAVE_mips_elf32_tradfbsd_le_vec)
+
+{ "mips*el-*-freebsd*", NULL },{ "mips*el-*-kfreebsd*-gnu",
+&mips_elf32_tradfbsd_le_vec },
+#endif
+
+
+
+
+    
+
+#if !defined (SELECT_VECS) || defined (HAVE_mips_elf32_tradfbsd_be_vec)
+
+{ "mips*-*-freebsd*", NULL },{ "mips*-*-kfreebsd*-gnu",
+&mips_elf32_tradfbsd_be_vec },
+#endif
+
+
+
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_mmix_elf64_vec)
+
+{ "mmix-*-*",
+&mmix_elf64_vec },
+#endif
+
+
+
+    
+#endif
+#if !defined (SELECT_VECS) || defined (HAVE_mn10200_elf32_vec)
+
+{ "mn10200-*-*",
+&mn10200_elf32_vec },
+#endif
+
+    
+
+#if !defined (SELECT_VECS) || defined (HAVE_mn10300_elf32_vec)
+
+{ "mn10300-*-*",
+&mn10300_elf32_vec },
+#endif
+
+
+    
+
+#if !defined (SELECT_VECS) || defined (HAVE_mt_elf32_vec)
+
+{ "mt-*-elf",
+&mt_elf32_vec },
+#endif
+
+    
+
+#if !defined (SELECT_VECS) || defined (HAVE_msp430_elf32_vec)
+
+{ "msp430-*-*",
+&msp430_elf32_vec },
+#endif
+
+
+    
+
+#if !defined (SELECT_VECS) || defined (HAVE_nds32_elf32_linux_le_vec)
+
+{ "nds32*le-*-linux*",
+&nds32_elf32_linux_le_vec },
+#endif
+
+
+    
+
+#if !defined (SELECT_VECS) || defined (HAVE_nds32_elf32_linux_be_vec)
+
+{ "nds32*be-*-linux*",
+&nds32_elf32_linux_be_vec },
+#endif
+
+
+    
+
+#if !defined (SELECT_VECS) || defined (HAVE_nds32_elf32_le_vec)
+
+{ "nds32*le-*-*",
+&nds32_elf32_le_vec },
+#endif
+
+
+    
+
+#if !defined (SELECT_VECS) || defined (HAVE_nds32_elf32_be_vec)
+
+{ "nds32*be-*-*",
+&nds32_elf32_be_vec },
+#endif
+
+
+    
+
+#if !defined (SELECT_VECS) || defined (HAVE_ns32k_aout_pc532mach_vec)
+
+{ "ns32k-pc532-mach*", NULL },{ "ns32k-pc532-ux*",
+&ns32k_aout_pc532mach_vec },
+#endif
+
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_ns32k_aout_pc532nbsd_vec)
+
+{ "ns32k-*-netbsd*", NULL },{ "ns32k-*-lites*", NULL },{ "ns32k-*-openbsd*",
+&ns32k_aout_pc532nbsd_vec },
+#endif
+
+
+    
+
+#if !defined (SELECT_VECS) || defined (HAVE_nios2_elf32_be_vec)
+
+{ "nios2eb-*-*",
+&nios2_elf32_be_vec },
+#endif
+
+
+    
+
+#if !defined (SELECT_VECS) || defined (HAVE_nios2_elf32_le_vec)
+
+{ "nios2el-*-*",
+&nios2_elf32_le_vec },
+#endif
+
+
+    
+
+#if !defined (SELECT_VECS) || defined (HAVE_nios2_elf32_le_vec)
+
+{ "nios2-*-*",
+&nios2_elf32_le_vec },
+#endif
+
+
+    
+
+#if !defined (SELECT_VECS) || defined (HAVE_or1k_elf32_vec)
+
+{ "or1k-*-elf", NULL },{ "or1k-*-linux*", NULL },{ "or1k-*-rtems*",
+&or1k_elf32_vec },
+#endif
+
+    
+
+#if !defined (SELECT_VECS) || defined (HAVE_or1k_elf32_vec)
+
+{ "or1knd-*-elf", NULL },{ "or1knd-*-linux*", NULL },{ "or1knd-*-rtems*",
+&or1k_elf32_vec },
+#endif
+
+    
+
+#if !defined (SELECT_VECS) || defined (HAVE_pdp11_aout_vec)
+
+{ "pdp11-*-*",
+&pdp11_aout_vec },
+#endif
+
+
+    
+
+#if !defined (SELECT_VECS) || defined (HAVE_pj_elf32_vec)
+
+{ "pj-*-*",
+&pj_elf32_vec },
+#endif
+
+
+    
+
+#if !defined (SELECT_VECS) || defined (HAVE_pj_elf32_le_vec)
+
+{ "pjl-*-*",
+&pj_elf32_le_vec },
+#endif
+
+
+    
+
+#if !defined (SELECT_VECS) || defined (HAVE_rs6000_xcoff_vec)
+
+{ "powerpc-*-aix5.[01]", NULL },{ "rs6000-*-aix5.[01]",
+&rs6000_xcoff_vec },
+#endif
+
+
+
+    
+#ifdef BFD64
+#if !defined (SELECT_VECS) || defined (HAVE_rs6000_xcoff64_aix_vec)
+
+{ "powerpc64-*-aix5.[01]", NULL },{ "rs6000-*-aix5.[01]",
+&rs6000_xcoff64_aix_vec },
+#endif
+
+
+
+    
+#endif
+
+#if !defined (SELECT_VECS) || defined (HAVE_rs6000_xcoff_vec)
+
+{ "powerpc-*-aix[5-9]*", NULL },{ "rs6000-*-aix[5-9]*",
+&rs6000_xcoff_vec },
+#endif
+
+
+
+    
+#ifdef BFD64
+
+#if !defined (SELECT_VECS) || defined (HAVE_rs6000_xcoff64_aix_vec)
+
+{ "powerpc64-*-aix[5-9]*", NULL },{ "rs6000-*-aix[5-9]*",
+&rs6000_xcoff64_aix_vec },
+#endif
+
+
+
+    
+#endif
+
+#if !defined (SELECT_VECS) || defined (HAVE_rs6000_xcoff_vec)
+
+{ "powerpc-*-aix*", NULL },{ "powerpc-*-beos*", NULL },{ "rs6000-*-*",
+&rs6000_xcoff_vec },
+#endif
+
+
+    
+#ifdef BFD64
+#if !defined (SELECT_VECS) || defined (HAVE_rs6000_xcoff64_vec)
+
+{ "powerpc64-*-aix*",
+&rs6000_xcoff64_vec },
+#endif
+
+
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_powerpc_elf64_fbsd_vec)
+
+{ "powerpc64-*-freebsd*",
+&powerpc_elf64_fbsd_vec },
+#endif
+
+
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_powerpc_elf64_vec)
+
+{ "powerpc64-*-elf*", NULL },{ "powerpc-*-elf64*", NULL },{ "powerpc64-*-linux*", NULL },
+{ "powerpc64-*-*bsd*",
+&powerpc_elf64_vec },
+#endif
+
+
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_powerpc_elf64_le_vec)
+
+{ "powerpc64le-*-elf*", NULL },{ "powerpcle-*-elf64*", NULL },{ "powerpc64le-*-linux*", NULL },
+{ "powerpc64le-*-*bsd*",
+&powerpc_elf64_le_vec },
+#endif
+
+
+
+    
+#endif
+#if !defined (SELECT_VECS) || defined (HAVE_powerpc_elf32_fbsd_vec)
+
+{ "powerpc-*-*freebsd*",
+&powerpc_elf32_fbsd_vec },
+#endif
+
+
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_powerpc_elf32_vec)
+
+{ "powerpc-*-*bsd*", NULL },{ "powerpc-*-elf*", NULL },{ "powerpc-*-sysv4*", NULL },{ "powerpc-*-eabi*", NULL },
+{ "powerpc-*-solaris2*", NULL },{ "powerpc-*-linux-*", NULL },{ "powerpc-*-rtems*", NULL },
+{ "powerpc-*-chorus*",
+&powerpc_elf32_vec },
+#endif
+
+
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_powerpc_elf32_vec)
+
+{ "powerpc-*-kaos*",
+&powerpc_elf32_vec },
+#endif
+
+
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_mach_o_be_vec)
+
+{ "powerpc-*-darwin*", NULL },{ "powerpc-*-macos10*", NULL },{ "powerpc-*-rhapsody*",
+&mach_o_be_vec },
+#endif
+
+
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_powerpc_xcoff_vec)
+
+{ "powerpc-*-macos*",
+&powerpc_xcoff_vec },
+#endif
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_powerpc_elf32_vec)
+
+{ "powerpc-*-lynxos*",
+&powerpc_elf32_vec },
+#endif
+
+
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_powerpc_elf32_vec)
+
+{ "powerpc-*-netware*",
+&powerpc_elf32_vec },
+#endif
+
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_powerpc_elf32_vec)
+
+{ "powerpc-*-nto*",
+&powerpc_elf32_vec },
+#endif
+
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_powerpc_elf32_vxworks_vec)
+
+{ "powerpc-*-vxworks*", NULL },{ "powerpc-*-windiss*",
+&powerpc_elf32_vxworks_vec },
+#endif
+
+
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_powerpc_elf32_le_vec)
+
+{ "powerpcle-*-nto*",
+&powerpc_elf32_le_vec },
+#endif
+
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_powerpc_elf32_le_vec)
+
+{ "powerpcle-*-elf*", NULL },{ "powerpcle-*-sysv4*", NULL },{ "powerpcle-*-eabi*", NULL },
+{ "powerpcle-*-solaris2*", NULL },{ "powerpcle-*-linux-*", NULL },{ "powerpcle-*-vxworks*",
+&powerpc_elf32_le_vec },
+#endif
+
+
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_powerpc_pe_le_vec)
+
+{ "powerpcle-*-pe", NULL },{ "powerpcle-*-winnt*", NULL },{ "powerpcle-*-cygwin*",
+&powerpc_pe_le_vec },
+#endif
+
+
+    
+
+#ifdef BFD64
+#if !defined (SELECT_VECS) || defined (HAVE_riscv_elf32_vec)
+
+{ "riscv32-*-*",
+&riscv_elf32_vec },
+#endif
+
+
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_riscv_elf64_vec)
+
+{ "riscv64-*-*",
+&riscv_elf64_vec },
+#endif
+
+
+
+    
+#endif
+
+#if !defined (SELECT_VECS) || defined (HAVE_rl78_elf32_vec)
+
+{ "rl78-*-elf",
+&rl78_elf32_vec },
+#endif
+
+    
+
+#if !defined (SELECT_VECS) || defined (HAVE_rx_elf32_le_vec)
+
+{ "rx-*-elf",
+&rx_elf32_le_vec },
+#endif
+
+
+    
+
+#if !defined (SELECT_VECS) || defined (HAVE_s390_elf32_vec)
+
+{ "s390-*-linux*",
+&s390_elf32_vec },
+#endif
+
+
+
+    
+#ifdef BFD64
+#if !defined (SELECT_VECS) || defined (HAVE_s390_elf64_vec)
+
+{ "s390x-*-linux*",
+&s390_elf64_vec },
+#endif
+
+
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_s390_elf64_vec)
+
+{ "s390x-*-tpf*",
+&s390_elf64_vec },
+#endif
+
+
+    
+
+#if !defined (SELECT_VECS) || defined (HAVE_score_elf32_be_vec)
+
+{ "score*-*-elf*",
+&score_elf32_be_vec },
+#endif
+
+
+    
+
+#if !defined (SELECT_VECS) || defined (HAVE_sh64_elf32_le_vec)
+
+{ "sh64l*-*-elf*",
+&sh64_elf32_le_vec },
+#endif
+
+
+
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_sh64_elf32_vec)
+
+{ "sh64-*-elf*",
+&sh64_elf32_vec },
+#endif
+
+
+
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_sh64_elf32_linux_be_vec)
+
+{ "sh64eb-*-linux*",
+&sh64_elf32_linux_be_vec },
+#endif
+
+
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_sh64_elf32_linux_vec)
+
+{ "sh64-*-linux*",
+&sh64_elf32_linux_vec },
+#endif
+
+
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_sh_elf32_linux_be_vec)
+
+{ "sh-*-linux*",
+&sh_elf32_linux_be_vec },
+#endif
+
+
+
+
+    
+#endif /* BFD64 */
+
+#if !defined (SELECT_VECS) || defined (HAVE_sh_elf32_linux_be_vec)
+
+{ "sh*eb-*-linux*",
+&sh_elf32_linux_be_vec },
+#endif
+
+
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_sh_elf32_linux_vec)
+
+{ "sh*-*-linux*",
+&sh_elf32_linux_vec },
+#endif
+
+
+
+    
+
+#if !defined (SELECT_VECS) || defined (HAVE_sh_elf32_vec)
+
+{ "sh-*-uclinux*", NULL },{ "sh[12]-*-uclinux*",
+&sh_elf32_vec },
+#endif
+
+
+#ifdef BFD64
+
+#endif
+    
+
+#ifdef BFD64
+#if !defined (SELECT_VECS) || defined (HAVE_sh64_elf32_nbsd_le_vec)
+
+{ "sh5le-*-netbsd*",
+&sh64_elf32_nbsd_le_vec },
+#endif
+
+
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_sh64_elf32_nbsd_vec)
+
+{ "sh5-*-netbsd*",
+&sh64_elf32_nbsd_vec },
+#endif
+
+
+
+    
+
+#if !defined (SELECT_VECS) || defined (HAVE_sh64_elf64_nbsd_le_vec)
+
+{ "sh64le-*-netbsd*",
+&sh64_elf64_nbsd_le_vec },
+#endif
+
+
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_sh64_elf64_nbsd_vec)
+
+{ "sh64-*-netbsd*",
+&sh64_elf64_nbsd_vec },
+#endif
+
+
+
+    
+
+#if !defined (SELECT_VECS) || defined (HAVE_sh_elf32_nbsd_le_vec)
+
+{ "sh*l*-*-netbsdelf*",
+&sh_elf32_nbsd_le_vec },
+#endif
+
+
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_sh_elf32_nbsd_vec)
+
+{ "sh-*-netbsdelf*",
+&sh_elf32_nbsd_vec },
+#endif
+
+
+
+    
+#endif
+
+#if !defined (SELECT_VECS) || defined (HAVE_sh_elf32_nbsd_vec)
+
+{ "sh*-*-netbsdelf*",
+&sh_elf32_nbsd_vec },
+#endif
+
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_sh_elf32_symbian_le_vec)
+
+{ "sh*-*-symbianelf*",
+&sh_elf32_symbian_le_vec },
+#endif
+
+
+
+    
+
+#ifdef BFD64
+#if !defined (SELECT_VECS) || defined (HAVE_sh_elf32_le_vec)
+
+{ "shl*-*-elf*", NULL },{ "sh[1234]l*-*-elf*", NULL },{ "sh3el*-*-elf*", NULL },{ "shl*-*-kaos*",
+&sh_elf32_le_vec },
+#endif
+
+
+
+
+    
+#endif
+
+#if !defined (SELECT_VECS) || defined (HAVE_sh_coff_vec)
+
+{ "sh-*-rtemscoff*",
+&sh_coff_vec },
+#endif
+
+
+
+    
+
+#ifdef BFD64
+#if !defined (SELECT_VECS) || defined (HAVE_sh_elf32_vec)
+
+{ "sh-*-elf*", NULL },{ "sh[1234]*-elf*", NULL },{ "sh-*-rtems*", NULL },{ "sh-*-kaos*",
+&sh_elf32_vec },
+#endif
+
+
+
+
+    
+#endif
+
+#if !defined (SELECT_VECS) || defined (HAVE_sh_elf32_vec)
+
+{ "sh-*-nto*",
+&sh_elf32_vec },
+#endif
+
+
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_sh_elf32_nbsd_le_vec)
+
+{ "sh*-*-openbsd*",
+&sh_elf32_nbsd_le_vec },
+#endif
+
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_sh_pe_le_vec)
+
+{ "sh-*-pe",
+&sh_pe_le_vec },
+#endif
+
+
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_sh_elf32_vxworks_vec)
+
+{ "sh-*-vxworks",
+&sh_elf32_vxworks_vec },
+#endif
+
+
+
+
+
+
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_sh_coff_vec)
+
+{ "sh-*-*",
+&sh_coff_vec },
+#endif
+
+
+
+    
+
+#if !defined (SELECT_VECS) || defined (HAVE_sparc_aout_sunos_be_vec)
+
+{ "sparclet-*-aout*",
+&sparc_aout_sunos_be_vec },
+#endif
+
+
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_sparc_aout_sunos_be_vec)
+
+{ "sparc86x-*-aout*",
+&sparc_aout_sunos_be_vec },
+#endif
+
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_sparc_elf32_vec)
+
+{ "sparclite-*-elf*", NULL },{ "sparc86x-*-elf*",
+&sparc_elf32_vec },
+#endif
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_sparc_elf32_vec)
+
+{ "sparc*-*-chorus*",
+&sparc_elf32_vec },
+#endif
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_sparc_aout_linux_vec)
+
+{ "sparc-*-linux*aout*",
+&sparc_aout_linux_vec },
+#endif
+
+
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_sparc_elf32_vec)
+
+{ "sparc-*-linux-*", NULL },{ "sparcv*-*-linux-*",
+&sparc_elf32_vec },
+#endif
+
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_sparc_elf32_vec)
+
+{ "sparc-*-netbsdelf*",
+&sparc_elf32_vec },
+#endif
+
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_sparc_aout_nbsd_vec)
+
+{ "sparc-*-netbsdaout*", NULL },{ "sparc-*-netbsd*",
+&sparc_aout_nbsd_vec },
+#endif
+
+
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_sparc_aout_nbsd_vec)
+
+{ "sparc-*-openbsd[0-2].*", NULL },{ "sparc-*-openbsd3.[0-1]",
+&sparc_aout_nbsd_vec },
+#endif
+
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_sparc_elf32_vec)
+
+{ "sparc-*-openbsd*",
+&sparc_elf32_vec },
+#endif
+
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_sparc_elf32_vec)
+
+{ "sparc-*-elf*",
+&sparc_elf32_vec },
+#endif
+
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_sparc_elf32_sol2_vec)
+
+{ "sparc-*-solaris2.[0-6]", NULL },{ "sparc-*-solaris2.[0-6].*",
+&sparc_elf32_sol2_vec },
+#endif
+
+
+    
+#ifdef BFD64
+#if !defined (SELECT_VECS) || defined (HAVE_sparc_elf32_sol2_vec)
+
+{ "sparc-*-solaris2*", NULL },{ "sparcv9-*-solaris2*", NULL },{ "sparc64-*-solaris2*",
+&sparc_elf32_sol2_vec },
+#endif
+
+
+
+    
+#endif
+#if !defined (SELECT_VECS) || defined (HAVE_sparc_elf32_vec)
+
+{ "sparc-*-sysv4*",
+&sparc_elf32_vec },
+#endif
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_sparc_elf32_vxworks_vec)
+
+{ "sparc-*-vxworks*",
+&sparc_elf32_vxworks_vec },
+#endif
+
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_sparc_elf32_vec)
+
+{ "sparc-*-netware*",
+&sparc_elf32_vec },
+#endif
+
+
+    
+#ifdef BFD64
+#if !defined (SELECT_VECS) || defined (HAVE_sparc_aout_sunos_be_vec)
+
+{ "sparc64-*-aout*",
+&sparc_aout_sunos_be_vec },
+#endif
+
+
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_sparc_elf64_vec)
+
+{ "sparc64*-*-linux-*",
+&sparc_elf64_vec },
+#endif
+
+
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_sparc_elf64_vec)
+
+{ "sparc64-*-elf*", NULL },{ "sparc64-*-rtems*",
+&sparc_elf64_vec },
+#endif
+
+
+
+    
+#endif /* BFD64 */
+#if !defined (SELECT_VECS) || defined (HAVE_sparc_coff_vec)
+
+{ "sparc*-*-coff*",
+&sparc_coff_vec },
+#endif
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_sparc_elf32_vec)
+
+{ "sparc-*-rtems*",
+&sparc_elf32_vec },
+#endif
+
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_sparc_aout_sunos_be_vec)
+
+{ "sparc*-*-*",
+&sparc_aout_sunos_be_vec },
+#endif
+
+
+    
+
+#if !defined (SELECT_VECS) || defined (HAVE_spu_elf32_vec)
+
+{ "spu-*-elf",
+&spu_elf32_vec },
+#endif
+
+
+    
+
+#if HAVE_aout_vec
+#if !defined (SELECT_VECS) || defined (HAVE_aout_vec)
+
+{ "tahoe-*-*",
+&aout_vec },
+#endif
+
+
+    
+#endif
+
+#if !defined (SELECT_VECS) || defined (HAVE_tic6x_elf32_c6000_le_vec)
+
+{ "tic6x-*-elf",
+&tic6x_elf32_c6000_le_vec },
+#endif
+
+
+    
+
+#if !defined (SELECT_VECS) || defined (HAVE_tic6x_elf32_linux_le_vec)
+
+{ "tic6x-*-uclinux",
+&tic6x_elf32_linux_le_vec },
+#endif
+
+
+    
+
+#if !defined (SELECT_VECS) || defined (HAVE_tic80_coff_vec)
+
+{ "tic80*-*-*",
+&tic80_coff_vec },
+#endif
+
+
+    
+
+#ifdef BFD64
+#if !defined (SELECT_VECS) || defined (HAVE_tilegx_elf64_le_vec)
+
+{ "tilegx-*-*",
+&tilegx_elf64_le_vec },
+#endif
+
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_tilegx_elf64_be_vec)
+
+{ "tilegxbe-*-*",
+&tilegx_elf64_be_vec },
+#endif
+
+
+    
+#endif
+
+#if !defined (SELECT_VECS) || defined (HAVE_tilepro_elf32_vec)
+
+{ "tilepro-*-*",
+&tilepro_elf32_vec },
+#endif
+
+    
+
+#if !defined (SELECT_VECS) || defined (HAVE_ft32_elf32_vec)
+
+{ "ft32*-*-*",
+&ft32_elf32_vec },
+#endif
+
+    
+
+#if !defined (SELECT_VECS) || defined (HAVE_v850_elf32_vec)
+
+{ "v850*-*-*",
+&v850_elf32_vec },
+#endif
+
+
+    
+
+#if !defined (SELECT_VECS) || defined (HAVE_vax_elf32_vec)
+
+{ "vax-*-netbsdelf*",
+&vax_elf32_vec },
+#endif
+
+
+    
+
+#if !defined (SELECT_VECS) || defined (HAVE_vax_aout_nbsd_vec)
+
+{ "vax-*-netbsdaout*", NULL },{ "vax-*-netbsd*",
+&vax_aout_nbsd_vec },
+#endif
+
+
+
+    
+
+#if !defined (SELECT_VECS) || defined (HAVE_vax_aout_bsd_vec)
+
+{ "vax-*-bsd*", NULL },{ "vax-*-ultrix*",
+&vax_aout_bsd_vec },
+#endif
+
+
+    
+
+#if !defined (SELECT_VECS) || defined (HAVE_vax_aout_nbsd_vec)
+
+{ "vax-*-openbsd*",
+&vax_aout_nbsd_vec },
+#endif
+
+
+    
+
+#if !defined (SELECT_VECS) || defined (HAVE_vax_elf32_vec)
+
+{ "vax-*-linux-*",
+&vax_elf32_vec },
+#endif
+
+    
+
+#if !defined (SELECT_VECS) || defined (HAVE_visium_elf32_vec)
+
+{ "visium-*-elf",
+&visium_elf32_vec },
+#endif
+
+    
+
+#if !defined (SELECT_VECS) || defined (HAVE_we32k_coff_vec)
+
+{ "we32k-*-*",
+&we32k_coff_vec },
+#endif
+
+    
+
+#if !defined (SELECT_VECS) || defined (HAVE_w65_coff_vec)
+
+{ "w65-*-*",
+&w65_coff_vec },
+#endif
+
+    
+
+#if !defined (SELECT_VECS) || defined (HAVE_xgate_elf32_vec)
+
+{ "xgate-*-*",
+&xgate_elf32_vec },
+#endif
+
+
+    
+
+#if !defined (SELECT_VECS) || defined (HAVE_xstormy16_elf32_vec)
+
+{ "xstormy16-*-elf",
+&xstormy16_elf32_vec },
+#endif
+
+    
+
+#if !defined (SELECT_VECS) || defined (HAVE_xtensa_elf32_le_vec)
+
+{ "xtensa*-*-*",
+&xtensa_elf32_le_vec },
+#endif
+
+
+    
+#if !defined (SELECT_VECS) || defined (HAVE_xc16x_elf32_vec)
+
+{ "xc16x-*-elf",
+&xc16x_elf32_vec },
+#endif
+
+    
+
+#if !defined (SELECT_VECS) || defined (HAVE_z80_coff_vec)
+
+{ "z80-*-*",
+&z80_coff_vec },
+#endif
+
+
+    
+
+#if !defined (SELECT_VECS) || defined (HAVE_z8k_coff_vec)
+
+{ "z8k*-*-*",
+&z8k_coff_vec },
+#endif
+
+
+    
+
+#if !defined (SELECT_VECS) || defined (HAVE_ieee_vec)
+
+{ "*-*-ieee*",
+&ieee_vec },
+#endif
+
+    
+
+#if !defined (SELECT_VECS) || defined (HAVE_aout_adobe_vec)
+
+{ "*-adobe-*",
+&aout_adobe_vec },
+#endif
+
+
+    
+
+#if !defined (SELECT_VECS) || defined (HAVE_m68k_aout_newsos3_vec)
+
+{ "*-sony-*",
+&m68k_aout_newsos3_vec },
+#endif
+
+
+    
+
+#if !defined (SELECT_VECS) || defined (HAVE_m68k_coff_vec)
+
+{ "*-tandem-*",
+&m68k_coff_vec },
+#endif
+
+
+    
diff --git a/utils/gapy/gen-debug-info-src/ext/bfdlink.h b/utils/gapy/gen-debug-info-src/ext/bfdlink.h
new file mode 100644
index 000000000..1ac073823
--- /dev/null
+++ b/utils/gapy/gen-debug-info-src/ext/bfdlink.h
@@ -0,0 +1,842 @@
+/* bfdlink.h -- header file for BFD link routines
+   Copyright 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002,
+   2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011
+   Free Software Foundation, Inc.
+   Written by Steve Chamberlain and Ian Lance Taylor, Cygnus Support.
+
+   This file is part of BFD, the Binary File Descriptor library.
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston,
+   MA 02110-1301, USA.  */
+
+#ifndef BFDLINK_H
+#define BFDLINK_H
+
+/* Which symbols to strip during a link.  */
+enum bfd_link_strip
+{
+  strip_none,		/* Don't strip any symbols.  */
+  strip_debugger,	/* Strip debugging symbols.  */
+  strip_some,		/* keep_hash is the list of symbols to keep.  */
+  strip_all		/* Strip all symbols.  */
+};
+
+/* Which local symbols to discard during a link.  This is irrelevant
+   if strip_all is used.  */
+enum bfd_link_discard
+{
+  discard_sec_merge,	/* Discard local temporary symbols in SEC_MERGE
+			   sections.  */
+  discard_none,		/* Don't discard any locals.  */
+  discard_l,		/* Discard local temporary symbols.  */
+  discard_all		/* Discard all locals.  */
+};
+
+/* Describes the type of hash table entry structure being used.
+   Different hash table structures have different fields and so
+   support different linking features.  */
+enum bfd_link_hash_table_type
+  {
+    bfd_link_generic_hash_table,
+    bfd_link_elf_hash_table
+  };
+
+/* These are the possible types of an entry in the BFD link hash
+   table.  */
+
+enum bfd_link_hash_type
+{
+  bfd_link_hash_new,		/* Symbol is new.  */
+  bfd_link_hash_undefined,	/* Symbol seen before, but undefined.  */
+  bfd_link_hash_undefweak,	/* Symbol is weak and undefined.  */
+  bfd_link_hash_defined,	/* Symbol is defined.  */
+  bfd_link_hash_defweak,	/* Symbol is weak and defined.  */
+  bfd_link_hash_common,		/* Symbol is common.  */
+  bfd_link_hash_indirect,	/* Symbol is an indirect link.  */
+  bfd_link_hash_warning		/* Like indirect, but warn if referenced.  */
+};
+
+enum bfd_link_common_skip_ar_symbols
+{
+  bfd_link_common_skip_none,
+  bfd_link_common_skip_text,
+  bfd_link_common_skip_data,
+  bfd_link_common_skip_all
+};
+
+struct bfd_link_hash_common_entry
+  {
+    unsigned int alignment_power;	/* Alignment.  */
+    asection *section;		/* Symbol section.  */
+  };
+
+/* The linking routines use a hash table which uses this structure for
+   its elements.  */
+
+struct bfd_link_hash_entry
+{
+  /* Base hash table entry structure.  */
+  struct bfd_hash_entry root;
+
+  /* Type of this entry.  */
+  ENUM_BITFIELD (bfd_link_hash_type) type : 8;
+
+  unsigned int non_ir_ref : 1;	/* NOTE(review): undocumented here; presumably "referenced from a non-IR (non-LTO) object" -- confirm against upstream BFD.  */
+
+  /* A union of information depending upon the type.  */
+  union
+    {
+      /* Nothing is kept for bfd_link_hash_new.  */
+      /* bfd_link_hash_undefined, bfd_link_hash_undefweak.  */
+      struct
+	{
+	  /* Undefined and common symbols are kept in a linked list through
+	     this field.  This field is present in all of the union elements
+	     so that we don't need to remove entries from the list when we
+	     change their type.  Removing entries would either require the
+	     list to be doubly linked, which would waste more memory, or
+	     require a traversal.  When an undefined or common symbol is
+	     created, it should be added to this list, the head of which is in
+	     the link hash table itself.  As symbols are defined, they need
+	     not be removed from the list; anything which reads the list must
+	     doublecheck the symbol type.
+
+	     Weak symbols are not kept on this list.
+
+	     Defined and defweak symbols use this field as a reference marker.
+	     If the field is not NULL, or this structure is the tail of the
+	     undefined symbol list, the symbol has been referenced.  If the
+	     symbol is undefined and becomes defined, this field will
+	     automatically be non-NULL since the symbol will have been on the
+	     undefined symbol list.  */
+	  struct bfd_link_hash_entry *next;
+	  bfd *abfd;		/* BFD symbol was found in.  */
+	} undef;
+      /* bfd_link_hash_defined, bfd_link_hash_defweak.  */
+      struct
+	{
+	  struct bfd_link_hash_entry *next;
+	  asection *section;	/* Symbol section.  */
+	  bfd_vma value;	/* Symbol value.  */
+	} def;
+      /* bfd_link_hash_indirect, bfd_link_hash_warning.  */
+      struct
+	{
+	  struct bfd_link_hash_entry *next;
+	  struct bfd_link_hash_entry *link;	/* Real symbol.  */
+	  const char *warning;	/* Warning (bfd_link_hash_warning only).  */
+	} i;
+      /* bfd_link_hash_common.  */
+      struct
+	{
+	  struct bfd_link_hash_entry *next;
+	  /* The linker needs to know three things about common
+	     symbols: the size, the alignment, and the section in
+	     which the symbol should be placed.  We store the size
+	     here, and we allocate a small structure to hold the
+	     section and the alignment.  The alignment is stored as a
+	     power of two.  We don't store all the information
+	     directly because we don't want to increase the size of
+	     the union; this structure is a major space user in the
+	     linker.  */
+	  struct bfd_link_hash_common_entry *p;
+	  bfd_size_type size;	/* Common symbol size.  */
+	} c;
+    } u;
+};
+
+/* This is the link hash table.  It is a derived class of
+   bfd_hash_table.  */
+
+struct bfd_link_hash_table
+{
+  /* The hash table itself.  */
+  struct bfd_hash_table table;
+  /* A linked list of undefined and common symbols, linked through the
+     next field in the bfd_link_hash_entry structure.  */
+  struct bfd_link_hash_entry *undefs;
+  /* Entries are added to the tail of the undefs list.  */
+  struct bfd_link_hash_entry *undefs_tail;
+  /* The type of the link hash table.  */
+  enum bfd_link_hash_table_type type;
+};
+
+/* Look up an entry in a link hash table.  If FOLLOW is TRUE, this
+   follows bfd_link_hash_indirect and bfd_link_hash_warning links to
+   the real symbol.  */
+extern struct bfd_link_hash_entry *bfd_link_hash_lookup
+  (struct bfd_link_hash_table *, const char *, bfd_boolean create,
+   bfd_boolean copy, bfd_boolean follow);
+
+/* Look up an entry in the main linker hash table if the symbol might
+   be wrapped.  This should only be used for references to an
+   undefined symbol, not for definitions of a symbol.  */
+
+extern struct bfd_link_hash_entry *bfd_wrapped_link_hash_lookup
+  (bfd *, struct bfd_link_info *, const char *, bfd_boolean,
+   bfd_boolean, bfd_boolean);
+
+/* Traverse a link hash table.  */
+extern void bfd_link_hash_traverse
+  (struct bfd_link_hash_table *,
+    bfd_boolean (*) (struct bfd_link_hash_entry *, void *),
+    void *);
+
+/* Add an entry to the undefs list.  */
+extern void bfd_link_add_undef
+  (struct bfd_link_hash_table *, struct bfd_link_hash_entry *);
+
+/* Remove symbols from the undefs list that don't belong there.  */
+extern void bfd_link_repair_undef_list
+  (struct bfd_link_hash_table *table);
+
+/* Read symbols and cache symbol pointer array in outsymbols.  */
+extern bfd_boolean bfd_generic_link_read_symbols (bfd *);
+
+struct bfd_sym_chain
+{
+  struct bfd_sym_chain *next;	/* Next entry in the chain.  */
+  const char *name;		/* Symbol name.  */
+};
+
+/* How to handle unresolved symbols.
+   There are four possibilities which are enumerated below:  */
+enum report_method
+{
+  /* This is the initial value when the link_info structure is created.
+     It allows the various stages of the linker to determine whether they
+     are allowed to set the value.  */
+  RM_NOT_YET_SET = 0,
+  RM_IGNORE,		/* Ignore unresolved symbols.  */
+  RM_GENERATE_WARNING,	/* Report unresolved symbols as warnings.  */
+  RM_GENERATE_ERROR	/* Report unresolved symbols as errors.  */
+};
+
+typedef enum {with_flags, without_flags} flag_type;
+
+/* A section flag list.  */
+struct flag_info_list
+{
+  flag_type with;
+  const char *name;
+  bfd_boolean valid;
+  struct flag_info_list *next;
+};
+
+/* Section flag info.  */
+struct flag_info
+{
+  flagword only_with_flags;
+  flagword not_with_flags;
+  struct flag_info_list *flag_list;
+  bfd_boolean flags_initialized;
+};
+
+struct bfd_elf_dynamic_list;
+struct bfd_elf_version_tree;
+
+/* This structure holds all the information needed to communicate
+   between BFD and the linker when doing a link.  */
+
+struct bfd_link_info
+{
+  /* TRUE if BFD should generate a shared object (or a pie).  */
+  unsigned int shared: 1;
+
+  /* TRUE if generating an executable, position independent or not.  */
+  unsigned int executable : 1;
+
+  /* TRUE if generating a position independent executable.  */
+  unsigned int pie: 1;
+
+  /* TRUE if BFD should generate a relocatable object file.  */
+  unsigned int relocatable: 1;
+
+  /* TRUE if BFD should pre-bind symbols in a shared object.  */
+  unsigned int symbolic: 1;
+
+  /* TRUE if executable should not contain copy relocs.
+     Setting this true may result in a non-sharable text segment.  */
+  unsigned int nocopyreloc: 1;
+
+  /* TRUE if BFD should export all symbols in the dynamic symbol table
+     of an executable, rather than only those used.  */
+  unsigned int export_dynamic: 1;
+
+  /* TRUE if a default symbol version should be created and used for
+     exported symbols.  */
+  unsigned int create_default_symver: 1;
+
+  /* TRUE if unreferenced sections should be removed.  */
+  unsigned int gc_sections: 1;
+
+  /* TRUE if every symbol should be reported back via the notice
+     callback.  */
+  unsigned int notice_all: 1;
+
+  /* TRUE if we are loading LTO outputs.  */
+  unsigned int loading_lto_outputs: 1;
+
+  /* TRUE if global symbols in discarded sections should be stripped.  */
+  unsigned int strip_discarded: 1;
+
+  /* TRUE if all data symbols should be dynamic.  */
+  unsigned int dynamic_data: 1;
+
+  /* Which symbols to strip.  */
+  ENUM_BITFIELD (bfd_link_strip) strip : 2;
+
+  /* Which local symbols to discard.  */
+  ENUM_BITFIELD (bfd_link_discard) discard : 2;
+
+  /* Criteria for skipping symbols when determining
+     whether to include an object from an archive. */
+  ENUM_BITFIELD (bfd_link_common_skip_ar_symbols) common_skip_ar_symbols : 2;
+
+  /* What to do with unresolved symbols in an object file.
+     When producing executables the default is GENERATE_ERROR.
+     When producing shared libraries the default is IGNORE.  The
+     assumption with shared libraries is that the reference will be
+     resolved at load/execution time.  */
+  ENUM_BITFIELD (report_method) unresolved_syms_in_objects : 2;
+
+  /* What to do with unresolved symbols in a shared library.
+     The same defaults apply.  */
+  ENUM_BITFIELD (report_method) unresolved_syms_in_shared_libs : 2;
+
+  /* TRUE if shared objects should be linked directly, not shared.  */
+  unsigned int static_link: 1;
+
+  /* TRUE if symbols should be retained in memory, FALSE if they
+     should be freed and reread.  */
+  unsigned int keep_memory: 1;
+
+  /* TRUE if BFD should generate relocation information in the final
+     executable.  */
+  unsigned int emitrelocations: 1;
+
+  /* TRUE if PT_GNU_RELRO segment should be created.  */
+  unsigned int relro: 1;
+
+  /* TRUE if .eh_frame_hdr section and PT_GNU_EH_FRAME ELF segment
+     should be created.  */
+  unsigned int eh_frame_hdr: 1;
+
+  /* TRUE if we should warn when adding a DT_TEXTREL to a shared object.  */
+  unsigned int warn_shared_textrel: 1;
+
+  /* TRUE if we should error when adding a DT_TEXTREL.  */
+  unsigned int error_textrel: 1;
+
+  /* TRUE if .hash section should be created.  */
+  unsigned int emit_hash: 1;
+
+  /* TRUE if .gnu.hash section should be created.  */
+  unsigned int emit_gnu_hash: 1;
+
+  /* If TRUE reduce memory overheads, at the expense of speed. This will
+     cause map file generation to use an O(N^2) algorithm and disable
+     caching ELF symbol buffer.  */
+  unsigned int reduce_memory_overheads: 1;
+
+  /* TRUE if the output file should be in a traditional format.  This
+     is equivalent to the setting of the BFD_TRADITIONAL_FORMAT flag
+     on the output file, but may be checked when reading the input
+     files.  */
+  unsigned int traditional_format: 1;
+
+  /* TRUE if non-PLT relocs should be merged into one reloc section
+     and sorted so that relocs against the same symbol come together.  */
+  unsigned int combreloc: 1;
+
+  /* TRUE if a default symbol version should be created and used for
+     imported symbols.  */
+  unsigned int default_imported_symver: 1;
+
+  /* TRUE if the new ELF dynamic tags are enabled. */
+  unsigned int new_dtags: 1;
+
+  /* FALSE if .eh_frame unwind info should be generated for PLT and other
+     linker created sections, TRUE if it should be omitted.  */
+  unsigned int no_ld_generated_unwind_info: 1;
+
+  /* TRUE if BFD should generate a "task linked" object file,
+     similar to relocatable but also with globals converted to
+     statics.  */
+  unsigned int task_link: 1;
+
+  /* TRUE if ok to have multiple definition.  */
+  unsigned int allow_multiple_definition: 1;
+
+  /* TRUE if ok to have version with no definition.  */
+  unsigned int allow_undefined_version: 1;
+
+  /* TRUE if some symbols have to be dynamic, controlled by
+     --dynamic-list command line options.  */
+  unsigned int dynamic: 1;
+
+  /* TRUE if PT_GNU_STACK segment should be created with PF_R|PF_W|PF_X
+     flags.  */
+  unsigned int execstack: 1;
+
+  /* TRUE if PT_GNU_STACK segment should be created with PF_R|PF_W
+     flags.  */
+  unsigned int noexecstack: 1;
+
+  /* TRUE if we want to produce optimized output files.  This might
+     need much more time and therefore must be explicitly selected.  */
+  unsigned int optimize: 1;
+
+  /* TRUE if user should be informed of removed unreferenced sections.  */
+  unsigned int print_gc_sections: 1;
+
+  /* TRUE if we should warn about alternate ELF machine code.  */
+  unsigned int warn_alternate_em: 1;
+
+  /* TRUE if the linker script contained an explicit PHDRS command.  */
+  unsigned int user_phdrs: 1;
+
+  /* Char that may appear as the first char of a symbol, but should be
+     skipped (like symbol_leading_char) when looking up symbols in
+     wrap_hash.  Used by PowerPC Linux for 'dot' symbols.  */
+  char wrap_char;
+
+  /* Separator between archive and filename in linker script filespecs.  */
+  char path_separator;
+
+  /* Default stack size.  Zero means default (often zero itself), -1
+     means explicitly zero-sized.  */
+  bfd_signed_vma stacksize;
+
+  /* Enable or disable target specific optimizations.
+
+     Not all targets have optimizations to enable.
+
+     Normally these optimizations are disabled by default but some targets
+     prefer to enable them by default.  So this field is a tri-state variable.
+     The values are:
+
+     zero: Enable the optimizations (either from --relax being specified on
+       the command line or the backend's before_allocation emulation function).
+
+     positive: The user has requested that these optimizations be disabled.
+       (Via the --no-relax command line option).
+
+     negative: The optimizations are disabled.  (Set when initializing the
+       args_type structure in ldmain.c:main.)  */
+  signed int disable_target_specific_optimizations;
+
+  /* Function callbacks.  */
+  const struct bfd_link_callbacks *callbacks;
+
+  /* Hash table handled by BFD.  */
+  struct bfd_link_hash_table *hash;
+
+  /* Hash table of symbols to keep.  This is NULL unless strip is
+     strip_some.  */
+  struct bfd_hash_table *keep_hash;
+
+  /* Hash table of symbols to report back via the notice callback.  If
+     this is NULL, and notice_all is FALSE, then no symbols are
+     reported back.  */
+  struct bfd_hash_table *notice_hash;
+
+  /* Hash table of symbols which are being wrapped (the --wrap linker
+     option).  If this is NULL, no symbols are being wrapped.  */
+  struct bfd_hash_table *wrap_hash;
+
+  /* Hash table of symbols which may be left unresolved during
+     a link.  If this is NULL, no symbols can be left unresolved.  */
+  struct bfd_hash_table *ignore_hash;
+
+  /* The output BFD.  */
+  bfd *output_bfd;
+
+  /* The list of input BFD's involved in the link.  These are chained
+     together via the link_next field.  */
+  bfd *input_bfds;
+  bfd **input_bfds_tail;
+
+  /* If a symbol should be created for each input BFD, this is the section
+     where those symbols should be placed.  It must be a section in
+     the output BFD.  It may be NULL, in which case no such symbols
+     will be created.  This is to support CREATE_OBJECT_SYMBOLS in the
+     linker command language.  */
+  asection *create_object_symbols_section;
+
+  /* List of global symbol names that are starting points for marking
+     sections against garbage collection.  */
+  struct bfd_sym_chain *gc_sym_list;
+
+  /* If a base output file is wanted, then this points to it.  */
+  void *base_file;
+
+  /* The function to call when the executable or shared object is
+     loaded.  */
+  const char *init_function;
+
+  /* The function to call when the executable or shared object is
+     unloaded.  */
+  const char *fini_function;
+
+  /* Number of relaxation passes.  Usually only one relaxation pass
+     is needed.  But a backend can have as many relaxation passes as
+     necessary.  During bfd_relax_section call, it is set to the
+     current pass, starting from 0.  */
+  int relax_pass;
+
+  /* Number of relaxation trips.  This number is incremented every
+     time the relaxation pass is restarted due to a previous
+     relaxation returning true in *AGAIN.  */
+  int relax_trip;
+
+  /* Non-zero if auto-import thunks for DATA items in pei386 DLLs
+     should be generated/linked against.  Set to 1 if this feature
+     is explicitly requested by the user, -1 if enabled by default.  */
+  int pei386_auto_import;
+
+  /* Non-zero if runtime relocs for DATA items with non-zero addends
+     in pei386 DLLs should be generated.  Set to 1 if this feature
+     is explicitly requested by the user, -1 if enabled by default.  */
+  int pei386_runtime_pseudo_reloc;
+
+  /* How many spare .dynamic DT_NULL entries should be added?  */
+  unsigned int spare_dynamic_tags;
+
+  /* May be used to set DT_FLAGS for ELF. */
+  bfd_vma flags;
+
+  /* May be used to set DT_FLAGS_1 for ELF. */
+  bfd_vma flags_1;
+
+  /* Start and end of RELRO region.  */
+  bfd_vma relro_start, relro_end;
+
+  /* List of symbols that should be dynamic.  */
+  struct bfd_elf_dynamic_list *dynamic_list;
+
+  /* The version information.  */
+  struct bfd_elf_version_tree *version_info;
+};
+
+/* This structure holds a set of callback functions.  These are called
+   by the BFD linker routines.  Except for the info functions, the first
+   argument to each callback function is the bfd_link_info structure
+   being used and each function returns a boolean value.  If the
+   function returns FALSE, then the BFD function which called it should
+   return with a failure indication.  */
+
+struct bfd_link_callbacks
+{
+  /* A function which is called when an object is added from an
+     archive.  ABFD is the archive element being added.  NAME is the
+     name of the symbol which caused the archive element to be pulled
+     in.  This function may set *SUBSBFD to point to an alternative
+     BFD from which symbols should in fact be added in place of the
+     original BFD's symbols.  */
+  bfd_boolean (*add_archive_element)
+    (struct bfd_link_info *, bfd *abfd, const char *name, bfd **subsbfd);
+  /* A function which is called when a symbol is found with multiple
+     definitions.  H is the symbol which is defined multiple times.
+     NBFD is the new BFD, NSEC is the new section, and NVAL is the new
+     value.  NSEC may be bfd_com_section or bfd_ind_section.  */
+  bfd_boolean (*multiple_definition)
+    (struct bfd_link_info *, struct bfd_link_hash_entry *h,
+     bfd *nbfd, asection *nsec, bfd_vma nval);
+  /* A function which is called when a common symbol is defined
+     multiple times.  H is the symbol appearing multiple times.
+     NBFD is the BFD of the new symbol.  NTYPE is the type of the new
+     symbol, one of bfd_link_hash_defined, bfd_link_hash_common, or
+     bfd_link_hash_indirect.  If NTYPE is bfd_link_hash_common, NSIZE
+     is the size of the new symbol.  */
+  bfd_boolean (*multiple_common)
+    (struct bfd_link_info *, struct bfd_link_hash_entry *h,
+     bfd *nbfd, enum bfd_link_hash_type ntype, bfd_vma nsize);
+  /* A function which is called to add a symbol to a set.  ENTRY is
+     the link hash table entry for the set itself (e.g.,
+     __CTOR_LIST__).  RELOC is the relocation to use for an entry in
+     the set when generating a relocatable file, and is also used to
+     get the size of the entry when generating an executable file.
+     ABFD, SEC and VALUE identify the value to add to the set.  */
+  bfd_boolean (*add_to_set)
+    (struct bfd_link_info *, struct bfd_link_hash_entry *entry,
+     bfd_reloc_code_real_type reloc, bfd *abfd, asection *sec, bfd_vma value);
+  /* A function which is called when the name of a g++ constructor or
+     destructor is found.  This is only called by some object file
+     formats.  CONSTRUCTOR is TRUE for a constructor, FALSE for a
+     destructor.  This will use BFD_RELOC_CTOR when generating a
+     relocatable file.  NAME is the name of the symbol found.  ABFD,
+     SECTION and VALUE are the value of the symbol.  */
+  bfd_boolean (*constructor)
+    (struct bfd_link_info *, bfd_boolean constructor, const char *name,
+     bfd *abfd, asection *sec, bfd_vma value);
+  /* A function which is called to issue a linker warning.  For
+     example, this is called when there is a reference to a warning
+     symbol.  WARNING is the warning to be issued.  SYMBOL is the name
+     of the symbol which triggered the warning; it may be NULL if
+     there is none.  ABFD, SECTION and ADDRESS identify the location
+     which triggered the warning; either ABFD or SECTION or both may
+     be NULL if the location is not known.  */
+  bfd_boolean (*warning)
+    (struct bfd_link_info *, const char *warning, const char *symbol,
+     bfd *abfd, asection *section, bfd_vma address);
+  /* A function which is called when a relocation is attempted against
+     an undefined symbol.  NAME is the symbol which is undefined.
+     ABFD, SECTION and ADDRESS identify the location from which the
+     reference is made. IS_FATAL indicates whether an undefined symbol is
+     a fatal error or not. In some cases SECTION may be NULL.  */
+  bfd_boolean (*undefined_symbol)
+    (struct bfd_link_info *, const char *name, bfd *abfd,
+     asection *section, bfd_vma address, bfd_boolean is_fatal);
+  /* A function which is called when a reloc overflow occurs. ENTRY is
+     the link hash table entry for the symbol the reloc is against.
+     NAME is the name of the local symbol or section the reloc is
+     against, RELOC_NAME is the name of the relocation, and ADDEND is
+     any addend that is used.  ABFD, SECTION and ADDRESS identify the
+     location at which the overflow occurs; if this is the result of a
+     bfd_section_reloc_link_order or bfd_symbol_reloc_link_order, then
+     ABFD will be NULL.  */
+  bfd_boolean (*reloc_overflow)
+    (struct bfd_link_info *, struct bfd_link_hash_entry *entry,
+     const char *name, const char *reloc_name, bfd_vma addend,
+     bfd *abfd, asection *section, bfd_vma address);
+  /* A function which is called when a dangerous reloc is performed.
+     MESSAGE is an appropriate message.
+     ABFD, SECTION and ADDRESS identify the location at which the
+     problem occurred; if this is the result of a
+     bfd_section_reloc_link_order or bfd_symbol_reloc_link_order, then
+     ABFD will be NULL.  */
+  bfd_boolean (*reloc_dangerous)
+    (struct bfd_link_info *, const char *message,
+     bfd *abfd, asection *section, bfd_vma address);
+  /* A function which is called when a reloc is found to be attached
+     to a symbol which is not being written out.  NAME is the name of
+     the symbol.  ABFD, SECTION and ADDRESS identify the location of
+     the reloc; if this is the result of a
+     bfd_section_reloc_link_order or bfd_symbol_reloc_link_order, then
+     ABFD will be NULL.  */
+  bfd_boolean (*unattached_reloc)
+    (struct bfd_link_info *, const char *name,
+     bfd *abfd, asection *section, bfd_vma address);
+  /* A function which is called when a symbol in notice_hash is
+     defined or referenced.  H is the symbol.  ABFD, SECTION and
+     ADDRESS are the (new) value of the symbol.  If SECTION is
+     bfd_und_section, this is a reference.  FLAGS are the symbol
+     BSF_* flags.  STRING is the name of the symbol to indirect to if
+     the sym is indirect, or the warning string if a warning sym.  */
+  bfd_boolean (*notice)
+    (struct bfd_link_info *, struct bfd_link_hash_entry *h,
+     bfd *abfd, asection *section, bfd_vma address, flagword flags,
+     const char *string);
+  /* Error or warning link info message.  */
+  void (*einfo)
+    (const char *fmt, ...);
+  /* General link info message.  */
+  void (*info)
+    (const char *fmt, ...);
+  /* Message to be printed in linker map file.  */
+  void (*minfo)
+    (const char *fmt, ...);
+  /* This callback provides a chance for users of the BFD library to
+     override its decision about whether to place two adjacent sections
+     into the same segment.  */
+  bfd_boolean (*override_segment_assignment)
+    (struct bfd_link_info *, bfd * abfd,
+     asection * current_section, asection * previous_section,
+     bfd_boolean new_segment);
+};
+
+/* The linker builds link_order structures which tell the code how to
+   include input data in the output file.  */
+
+/* These are the types of link_order structures.  */
+
+enum bfd_link_order_type
+{
+  bfd_undefined_link_order,	/* Undefined.  */
+  bfd_indirect_link_order,	/* Built from a section.  */
+  bfd_data_link_order,		/* Set to explicit data.  */
+  bfd_section_reloc_link_order,	/* Relocate against a section.  */
+  bfd_symbol_reloc_link_order	/* Relocate against a symbol.  */
+};
+
+/* This is the link_order structure itself.  These form a chain
+   attached to the output section whose contents they are describing.  */
+
+struct bfd_link_order
+{
+  /* Next link_order in chain.  */
+  struct bfd_link_order *next;
+  /* Type of link_order.  */
+  enum bfd_link_order_type type;
+  /* Offset within output section.  */
+  bfd_vma offset;
+  /* Size within output section.  */
+  bfd_size_type size;
+  /* Type specific information.  */
+  union
+    {
+      struct
+	{
+	  /* Section to include.  If this is used, then
+	     section->output_section must be the section the
+	     link_order is attached to, section->output_offset must
+	     equal the link_order offset field, and section->size
+	     must equal the link_order size field.  Maybe these
+	     restrictions should be relaxed someday.  */
+	  asection *section;
+	} indirect;
+      struct
+	{
+	  /* Size of contents, or zero when contents should be filled by
+	     the architecture-dependent fill function.
+	     A non-zero value allows filling of the output section
+	     with an arbitrary repeated pattern.  */
+	  unsigned int size;
+	  /* Data to put into file.  */
+	  bfd_byte *contents;
+	} data;
+      struct
+	{
+	  /* Description of reloc to generate.  Used for
+	     bfd_section_reloc_link_order and
+	     bfd_symbol_reloc_link_order.  */
+	  struct bfd_link_order_reloc *p;
+	} reloc;
+    } u;
+};
+
+/* A linker order of type bfd_section_reloc_link_order or
+   bfd_symbol_reloc_link_order means to create a reloc against a
+   section or symbol, respectively.  This is used to implement -Ur to
+   generate relocs for the constructor tables.  The
+   bfd_link_order_reloc structure describes the reloc that BFD should
+   create.  It is similar to an arelent, but I didn't use arelent
+   because the linker does not know anything about most symbols, and
+   any asymbol structure it creates will be partially meaningless.
+   This information could logically be in the bfd_link_order struct,
+   but I didn't want to waste the space since these types of relocs
+   are relatively rare.  */
+
+struct bfd_link_order_reloc
+{
+  /* Reloc type.  */
+  bfd_reloc_code_real_type reloc;
+
+  union
+    {
+      /* For type bfd_section_reloc_link_order, this is the section
+	 the reloc should be against.  This must be a section in the
+	 output BFD, not any of the input BFDs.  */
+      asection *section;
+      /* For type bfd_symbol_reloc_link_order, this is the name of the
+	 symbol the reloc should be against.  */
+      const char *name;
+    } u;
+
+  /* Addend to use.  The object file should contain zero.  The BFD
+     backend is responsible for filling in the contents of the object
+     file correctly.  For some object file formats (e.g., COFF) the
+     addend must be stored in the object file, and for some
+     (e.g., SPARC a.out) it is kept in the reloc.  */
+  bfd_vma addend;
+};
+
+/* Allocate a new link_order for a section.  */
+extern struct bfd_link_order *bfd_new_link_order (bfd *, asection *);
+
+/* These structures are used to describe version information for the
+   ELF linker.  These structures could be manipulated entirely inside
+   BFD, but it would be a pain.  Instead, the regular linker sets up
+   these structures, and then passes them into BFD.  */
+
+/* Glob pattern for a version.  */
+
+struct bfd_elf_version_expr
+{
+  /* Next glob pattern for this version.  */
+  struct bfd_elf_version_expr *next;
+  /* Glob pattern.  */
+  const char *pattern;
+  /* Set if pattern is not a glob.  */
+  unsigned int literal : 1;
+  /* Defined by ".symver".  */
+  unsigned int symver : 1;
+  /* Defined by version script.  */
+  unsigned int script : 1;
+  /* Pattern type.  */
+#define BFD_ELF_VERSION_C_TYPE		1
+#define BFD_ELF_VERSION_CXX_TYPE	2
+#define BFD_ELF_VERSION_JAVA_TYPE	4
+  unsigned int mask : 3;
+};
+
+struct bfd_elf_version_expr_head
+{
+  /* List of all patterns, both wildcards and non-wildcards.  */
+  struct bfd_elf_version_expr *list;
+  /* Hash table for non-wildcards.  */
+  void *htab;
+  /* Remaining patterns.  */
+  struct bfd_elf_version_expr *remaining;
+  /* What kind of pattern types are present in list (bitmask).  */
+  unsigned int mask;
+};
+
+/* Version dependencies.  */
+
+struct bfd_elf_version_deps
+{
+  /* Next dependency for this version.  */
+  struct bfd_elf_version_deps *next;
+  /* The version which this version depends upon.  */
+  struct bfd_elf_version_tree *version_needed;
+};
+
+/* A node in the version tree.  */
+
+struct bfd_elf_version_tree
+{
+  /* Next version.  */
+  struct bfd_elf_version_tree *next;
+  /* Name of this version.  */
+  const char *name;
+  /* Version number.  */
+  unsigned int vernum;
+  /* Regular expressions for global symbols in this version.  */
+  struct bfd_elf_version_expr_head globals;
+  /* Regular expressions for local symbols in this version.  */
+  struct bfd_elf_version_expr_head locals;
+  /* List of versions which this version depends upon.  */
+  struct bfd_elf_version_deps *deps;
+  /* Index of the version name.  This is used within BFD.  */
+  unsigned int name_indx;
+  /* Whether this version tree was used.  This is used within BFD.  */
+  int used;
+  /* Matching hook.  */
+  struct bfd_elf_version_expr *(*match)
+    (struct bfd_elf_version_expr_head *head,
+     struct bfd_elf_version_expr *prev, const char *sym);
+};
+
+struct bfd_elf_dynamic_list
+{
+  struct bfd_elf_version_expr_head head;	/* Patterns making up the list.  */
+  struct bfd_elf_version_expr *(*match)		/* Matching hook.  */
+    (struct bfd_elf_version_expr_head *head,
+     struct bfd_elf_version_expr *prev, const char *sym);
+};
+
+#endif
diff --git a/utils/gapy/gen-debug-info-src/ext/binary-io.h b/utils/gapy/gen-debug-info-src/ext/binary-io.h
new file mode 100644
index 000000000..2984271f7
--- /dev/null
+++ b/utils/gapy/gen-debug-info-src/ext/binary-io.h
@@ -0,0 +1,62 @@
+/* Binary mode I/O.
+   Copyright (C) 2001, 2003, 2005, 2008 Free Software Foundation, Inc.
+
+   This program is free software: you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
+
+#ifndef _BINARY_H
+#define _BINARY_H
+
+/* Include this header after <fcntl.h> and <stdio.h>, because
+   systems that distinguish between text and binary I/O usually
+   define O_BINARY in <fcntl.h>, and the MSVC7 <stdio.h> doesn't
+   like to be included after '#define fileno ...'
+
+   We don't include <fcntl.h> here because not all systems have
+   that header.  */
+
+#if !defined O_BINARY && defined _O_BINARY
+  /* For MSC-compatible compilers.  */
+# define O_BINARY _O_BINARY
+# define O_TEXT _O_TEXT
+#endif
+#ifdef __BEOS__
+  /* BeOS 5 has O_BINARY and O_TEXT, but they have no effect.  */
+# undef O_BINARY
+# undef O_TEXT
+#endif
+#if O_BINARY
+# if defined __EMX__ || defined __DJGPP__ || defined __CYGWIN__
+#  include <io.h> /* declares setmode() */
+# else
+#  define setmode _setmode
+#  undef fileno
+#  define fileno _fileno
+# endif
+# ifdef __DJGPP__
+#  include <unistd.h> /* declares isatty() */
+#  /* Avoid putting stdin/stdout in binary mode if it is connected to the
+#     console, because that would make it impossible for the user to
+#     interrupt the program through Ctrl-C or Ctrl-Break.  */
+#  define SET_BINARY(fd) (!isatty (fd) ? (setmode (fd, O_BINARY), 0) : 0)
+# else
+#  define SET_BINARY(fd) setmode (fd, O_BINARY)
+# endif
+#else
+  /* On reasonable systems, binary I/O is the default.  */
+# undef O_BINARY
+# define O_BINARY 0
+# define SET_BINARY(fd) /* nothing */
+#endif
+
+#endif /* _BINARY_H */
diff --git a/utils/gapy/gen-debug-info-src/ext/bout.h b/utils/gapy/gen-debug-info-src/ext/bout.h
new file mode 100644
index 000000000..4a302283c
--- /dev/null
+++ b/utils/gapy/gen-debug-info-src/ext/bout.h
@@ -0,0 +1,192 @@
+/* This file is a modified version of 'a.out.h'.  It is to be used in all
+   GNU tools modified to support the i80960 (or tools that operate on
+   object files created by such tools).
+
+   Copyright 2001, 2010 Free Software Foundation, Inc.
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston,
+   MA 02110-1301, USA.  */
+ 
+/* All i80960 development is done in a CROSS-DEVELOPMENT environment.  I.e.,
+   object code is generated on, and executed under the direction of a symbolic
+   debugger running on, a host system.  We do not want to be subject to the
+   vagaries of which host it is or whether it supports COFF or a.out format,
+   or anything else.  We DO want to:
+  
+  	o always generate the same format object files, regardless of host.
+ 
+ 	o have an 'a.out' header that we can modify for our own purposes
+ 	  (the 80960 is typically an embedded processor and may require
+ 	  enhanced linker support that the normal a.out.h header can't
+ 	  accommodate).
+ 
+  As for byte-ordering, the following rules apply:
+ 
+ 	o Text and data that is actually downloaded to the target is always
+ 	  in i80960 (little-endian) order.
+ 
+ 	o All other numbers (in the header, symbols, relocation directives)
+ 	  are in host byte-order:  object files CANNOT be lifted from a
+ 	  little-end host and used on a big-endian (or vice versa) without
+ 	  modification.
+  ==> THIS IS NO LONGER TRUE USING BFD.  WE CAN GENERATE ANY BYTE ORDER
+      FOR THE HEADER, AND READ ANY BYTE ORDER.  PREFERENCE WOULD BE TO
+      USE LITTLE-ENDIAN BYTE ORDER THROUGHOUT, REGARDLESS OF HOST.  <==
+ 
+ 	o The downloader ('comm960') takes care to generate a pseudo-header
+ 	  with correct (i80960) byte-ordering before shipping text and data
+ 	  off to the NINDY monitor in the target systems.  Symbols and
+ 	  relocation info are never sent to the target.  */
+
+#define BMAGIC	0415
+/* We don't accept the following (see N_BADMAG macro).
+   They're just here so GNU code will compile.  */
+#define	OMAGIC	0407		/* old impure format */
+#define	NMAGIC	0410		/* read-only text */
+#define	ZMAGIC	0413		/* demand load format */
+
+/* FILE HEADER
+  	All 'lengths' are given as a number of bytes.
+  	All 'alignments' are for relinkable files only;  an alignment of
+  		'n' indicates the corresponding segment must begin at an
+  		address that is a multiple of (2**n).  */
+struct external_exec
+  {
+    /* Standard stuff */
+    unsigned char e_info[4];	/* Identifies this as a b.out file */
+    unsigned char e_text[4];	/* Length of text */
+    unsigned char e_data[4];	/* Length of data */
+    unsigned char e_bss[4];	/* Length of uninitialized data area */
+    unsigned char e_syms[4];	/* Length of symbol table */
+    unsigned char e_entry[4];	/* Runtime start address */
+    unsigned char e_trsize[4];	/* Length of text relocation info */
+    unsigned char e_drsize[4];	/* Length of data relocation info */
+
+    /* Added for i960 */
+    unsigned char e_tload[4];	/* Text runtime load address */
+    unsigned char e_dload[4];	/* Data runtime load address */
+    unsigned char e_talign[1];	/* Alignment of text segment */
+    unsigned char e_dalign[1];	/* Alignment of data segment */
+    unsigned char e_balign[1];	/* Alignment of bss segment */
+    unsigned char e_relaxable[1];/* Assembled with enough info to allow linker to relax */
+  };
+
+#define	EXEC_BYTES_SIZE	(sizeof (struct external_exec))
+
+/* These macros use the a_xxx field names, since they operate on the exec
+   structure after it's been byte-swapped and realigned on the host machine.  */
+#define N_BADMAG(x)	(((x).a_info)!=BMAGIC)
+#define N_TXTOFF(x)	EXEC_BYTES_SIZE
+#define N_DATOFF(x)	( N_TXTOFF(x) + (x).a_text )
+#define N_TROFF(x)	( N_DATOFF(x) + (x).a_data )
+#define N_TRELOFF	N_TROFF
+#define N_DROFF(x)	( N_TROFF(x) + (x).a_trsize )
+#define N_DRELOFF	N_DROFF
+#define N_SYMOFF(x)	( N_DROFF(x) + (x).a_drsize )
+#define N_STROFF(x)	( N_SYMOFF(x) + (x).a_syms )
+#define N_DATADDR(x)	( (x).a_dload )    
+
+/* Address of text segment in memory after it is loaded.  */
+#if !defined (N_TXTADDR)
+#define N_TXTADDR(x) 0
+#endif
+
+/* A single entry in the symbol table.  */
+struct nlist
+  {
+    union
+      {
+	char*          n_name;
+	struct nlist * n_next;
+	long	       n_strx;	/* Index into string table	*/
+      } n_un;
+
+    unsigned char n_type;	/* See below				*/
+    char	  n_other;	/* Used in i80960 support -- see below	*/
+    short	  n_desc;
+    unsigned long n_value;
+  };
+
+
+/* Legal values of n_type.  */
+#define N_UNDF	0	/* Undefined symbol	*/
+#define N_ABS	2	/* Absolute symbol	*/
+#define N_TEXT	4	/* Text symbol		*/
+#define N_DATA	6	/* Data symbol		*/
+#define N_BSS	8	/* BSS symbol		*/
+#define N_FN	31	/* Filename symbol	*/
+
+#define N_EXT	1	/* External symbol (OR'd in with one of above)	*/
+#define N_TYPE	036	/* Mask for all the type bits			*/
+#define N_STAB	0340	/* Mask for all bits used for SDB entries 	*/
+
+/* MEANING OF 'n_other'
+ 
+  If non-zero, the 'n_other' field indicates either a leaf procedure or
+  a system procedure, as follows:
+ 
+ 	1 <= n_other <= 32 :
+ 		The symbol is the entry point to a system procedure.
+ 		'n_value' is the address of the entry, as for any other
+ 		procedure.  The system procedure number (which can be used in
+ 		a 'calls' instruction) is (n_other-1).  These entries come from
+ 		'.sysproc' directives.
+ 
+ 	n_other == N_CALLNAME
+ 		the symbol is the 'call' entry point to a leaf procedure.
+ 		The *next* symbol in the symbol table must be the corresponding
+ 		'bal' entry point to the procedure (see following).  These
+ 		entries come from '.leafproc' directives in which two different
+ 		symbols are specified (the first one is represented here).
+ 	
+ 
+ 	n_other == N_BALNAME
+ 		the symbol is the 'bal' entry point to a leaf procedure.
+ 		These entries result from '.leafproc' directives in which only
+ 		one symbol is specified, or in which the same symbol is
+ 		specified twice.
+ 
+  Note that an N_CALLNAME entry *must* have a corresponding N_BALNAME entry,
+  but not every N_BALNAME entry must have an N_CALLNAME entry.  */
+#define N_CALLNAME	((char)-1)
+#define N_BALNAME	((char)-2)
+#define IS_CALLNAME(x)	(N_CALLNAME == (x))
+#define IS_BALNAME(x)	(N_BALNAME == (x))
+#define IS_OTHER(x)	((x)>0 && (x) <=32)
+
+#define b_out_relocation_info relocation_info
+struct relocation_info
+  {
+    int	 r_address;	/* File address of item to be relocated.  */
+    unsigned
+#define r_index r_symbolnum
+    r_symbolnum:24,	/* Index of symbol on which relocation is based,
+			   if r_extern is set.  Otherwise set to
+			   either N_TEXT, N_DATA, or N_BSS to
+			   indicate section on which relocation is
+			   based.  */
+      r_pcrel:1,	/* 1 => relocate PC-relative; else absolute
+			   On i960, pc-relative implies 24-bit
+			   address, absolute implies 32-bit.  */
+      r_length:2,	/* Number of bytes to relocate:
+			   0 => 1 byte
+			   1 => 2 bytes -- used for 13 bit pcrel
+			   2 => 4 bytes.  */
+      r_extern:1,
+      r_bsr:1,		/* Something for the GNU NS32K assembler.  */
+      r_disp:1,		/* Something for the GNU NS32K assembler.  */
+      r_callj:1,	/* 1 if relocation target is an i960 'callj'.  */
+      r_relaxable:1;	/* 1 if enough info is left to relax the data.  */
+};
diff --git a/utils/gapy/gen-debug-info-src/ext/demangle.h b/utils/gapy/gen-debug-info-src/ext/demangle.h
new file mode 100644
index 000000000..1d7cadf4b
--- /dev/null
+++ b/utils/gapy/gen-debug-info-src/ext/demangle.h
@@ -0,0 +1,691 @@
+/* Defs for interface to demanglers.
+   Copyright (C) 1992-2015 Free Software Foundation, Inc.
+
+   This program is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Library General Public License
+   as published by the Free Software Foundation; either version 2, or
+   (at your option) any later version.
+
+   In addition to the permissions in the GNU Library General Public
+   License, the Free Software Foundation gives you unlimited
+   permission to link the compiled version of this file into
+   combinations with other programs, and to distribute those
+   combinations without any restriction coming from the use of this
+   file.  (The Library Public License restrictions do apply in other
+   respects; for example, they cover modification of the file, and
+   distribution when not linked into a combined executable.)
+
+   This program is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Library General Public License for more details.
+
+   You should have received a copy of the GNU Library General Public
+   License along with this program; if not, write to the Free Software
+   Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA
+   02110-1301, USA.  */
+
+
+#if !defined (DEMANGLE_H)
+#define DEMANGLE_H
+
+#include "libiberty.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif /* __cplusplus */
+
+/* Options passed to cplus_demangle (in 2nd parameter). */
+
+#define DMGL_NO_OPTS	 0		/* For readability... */
+#define DMGL_PARAMS	 (1 << 0)	/* Include function args */
+#define DMGL_ANSI	 (1 << 1)	/* Include const, volatile, etc */
+#define DMGL_JAVA	 (1 << 2)	/* Demangle as Java rather than C++. */
+#define DMGL_VERBOSE	 (1 << 3)	/* Include implementation details.  */
+#define DMGL_TYPES	 (1 << 4)	/* Also try to demangle type encodings.  */
+#define DMGL_RET_POSTFIX (1 << 5)       /* Print function return types (when
+					   present) after function signature.
+					   It applies only to the toplevel
+					   function type.  */
+#define DMGL_RET_DROP	 (1 << 6)       /* Suppress printing function return
+					   types, even if present.  It applies
+					   only to the toplevel function type.
+					   */
+
+#define DMGL_AUTO	 (1 << 8)
+#define DMGL_GNU	 (1 << 9)
+#define DMGL_LUCID	 (1 << 10)
+#define DMGL_ARM	 (1 << 11)
+#define DMGL_HP 	 (1 << 12)       /* For the HP aCC compiler;
+                                            same as ARM except for
+                                            template arguments, etc. */
+#define DMGL_EDG	 (1 << 13)
+#define DMGL_GNU_V3	 (1 << 14)
+#define DMGL_GNAT	 (1 << 15)
+#define DMGL_DLANG	 (1 << 16)
+
+/* If none of these are set, use 'current_demangling_style' as the default. */
+#define DMGL_STYLE_MASK (DMGL_AUTO|DMGL_GNU|DMGL_LUCID|DMGL_ARM|DMGL_HP|DMGL_EDG|DMGL_GNU_V3|DMGL_JAVA|DMGL_GNAT|DMGL_DLANG)
+
+/* Enumeration of possible demangling styles.
+
+   Lucid and ARM styles are still kept logically distinct, even though
+   they now both behave identically.  The resulting style is actually the
+   union of both.  I.E. either style recognizes both "__pt__" and "__rf__"
+   for operator "->", even though the first is lucid style and the second
+   is ARM style. (FIXME?) */
+
+extern enum demangling_styles
+{
+  no_demangling = -1,
+  unknown_demangling = 0,
+  auto_demangling = DMGL_AUTO,
+  gnu_demangling = DMGL_GNU,
+  lucid_demangling = DMGL_LUCID,
+  arm_demangling = DMGL_ARM,
+  hp_demangling = DMGL_HP,
+  edg_demangling = DMGL_EDG,
+  gnu_v3_demangling = DMGL_GNU_V3,
+  java_demangling = DMGL_JAVA,
+  gnat_demangling = DMGL_GNAT,
+  dlang_demangling = DMGL_DLANG
+} current_demangling_style;
+
+/* Define string names for the various demangling styles. */
+
+#define NO_DEMANGLING_STYLE_STRING            "none"
+#define AUTO_DEMANGLING_STYLE_STRING	      "auto"
+#define GNU_DEMANGLING_STYLE_STRING    	      "gnu"
+#define LUCID_DEMANGLING_STYLE_STRING	      "lucid"
+#define ARM_DEMANGLING_STYLE_STRING	      "arm"
+#define HP_DEMANGLING_STYLE_STRING	      "hp"
+#define EDG_DEMANGLING_STYLE_STRING	      "edg"
+#define GNU_V3_DEMANGLING_STYLE_STRING        "gnu-v3"
+#define JAVA_DEMANGLING_STYLE_STRING          "java"
+#define GNAT_DEMANGLING_STYLE_STRING          "gnat"
+#define DLANG_DEMANGLING_STYLE_STRING         "dlang"
+
+/* Some macros to test what demangling style is active. */
+
+#define CURRENT_DEMANGLING_STYLE current_demangling_style
+#define AUTO_DEMANGLING (((int) CURRENT_DEMANGLING_STYLE) & DMGL_AUTO)
+#define GNU_DEMANGLING (((int) CURRENT_DEMANGLING_STYLE) & DMGL_GNU)
+#define LUCID_DEMANGLING (((int) CURRENT_DEMANGLING_STYLE) & DMGL_LUCID)
+#define ARM_DEMANGLING (((int) CURRENT_DEMANGLING_STYLE) & DMGL_ARM)
+#define HP_DEMANGLING (((int) CURRENT_DEMANGLING_STYLE) & DMGL_HP)
+#define EDG_DEMANGLING (((int) CURRENT_DEMANGLING_STYLE) & DMGL_EDG)
+#define GNU_V3_DEMANGLING (((int) CURRENT_DEMANGLING_STYLE) & DMGL_GNU_V3)
+#define JAVA_DEMANGLING (((int) CURRENT_DEMANGLING_STYLE) & DMGL_JAVA)
+#define GNAT_DEMANGLING (((int) CURRENT_DEMANGLING_STYLE) & DMGL_GNAT)
+#define DLANG_DEMANGLING (((int) CURRENT_DEMANGLING_STYLE) & DMGL_DLANG)
+
+/* Provide information about the available demangle styles. This code is
+   pulled from gdb into libiberty because it is useful to binutils also.  */
+
+extern const struct demangler_engine
+{
+  const char *const demangling_style_name;
+  const enum demangling_styles demangling_style;
+  const char *const demangling_style_doc;
+} libiberty_demanglers[];
+
+extern char *
+cplus_demangle (const char *mangled, int options);
+
+extern int
+cplus_demangle_opname (const char *opname, char *result, int options);
+
+extern const char *
+cplus_mangle_opname (const char *opname, int options);
+
+/* Note: This sets global state.  FIXME if you care about multi-threading. */
+
+extern void
+set_cplus_marker_for_demangling (int ch);
+
+extern enum demangling_styles
+cplus_demangle_set_style (enum demangling_styles style);
+
+extern enum demangling_styles
+cplus_demangle_name_to_style (const char *name);
+
+/* Callback typedef for allocation-less demangler interfaces. */
+typedef void (*demangle_callbackref) (const char *, size_t, void *);
+
+/* V3 ABI demangling entry points, defined in cp-demangle.c.  Callback
+   variants return non-zero on success, zero on error.  char* variants
+   return a string allocated by malloc on success, NULL on error.  */
+extern int
+cplus_demangle_v3_callback (const char *mangled, int options,
+                            demangle_callbackref callback, void *opaque);
+
+extern char*
+cplus_demangle_v3 (const char *mangled, int options);
+
+extern int
+java_demangle_v3_callback (const char *mangled,
+                           demangle_callbackref callback, void *opaque);
+
+extern char*
+java_demangle_v3 (const char *mangled);
+
+char *
+ada_demangle (const char *mangled, int options);
+
+extern char *
+dlang_demangle (const char *mangled, int options);
+
+enum gnu_v3_ctor_kinds {
+  gnu_v3_complete_object_ctor = 1,
+  gnu_v3_base_object_ctor,
+  gnu_v3_complete_object_allocating_ctor,
+  /* These are not part of the V3 ABI.  Unified constructors are generated
+     as a speed-for-space optimization when the -fdeclone-ctor-dtor option
+     is used, and are always internal symbols.  */
+  gnu_v3_unified_ctor,
+  gnu_v3_object_ctor_group
+};
+
+/* Return non-zero iff NAME is the mangled form of a constructor name
+   in the G++ V3 ABI demangling style.  Specifically, return an `enum
+   gnu_v3_ctor_kinds' value indicating what kind of constructor
+   it is.  */
+extern enum gnu_v3_ctor_kinds
+	is_gnu_v3_mangled_ctor (const char *name);
+
+
+enum gnu_v3_dtor_kinds {
+  gnu_v3_deleting_dtor = 1,
+  gnu_v3_complete_object_dtor,
+  gnu_v3_base_object_dtor,
+  /* These are not part of the V3 ABI.  Unified destructors are generated
+     as a speed-for-space optimization when the -fdeclone-ctor-dtor option
+     is used, and are always internal symbols.  */
+  gnu_v3_unified_dtor,
+  gnu_v3_object_dtor_group
+};
+
+/* Return non-zero iff NAME is the mangled form of a destructor name
+   in the G++ V3 ABI demangling style.  Specifically, return an `enum
+   gnu_v3_dtor_kinds' value, indicating what kind of destructor
+   it is.  */
+extern enum gnu_v3_dtor_kinds
+	is_gnu_v3_mangled_dtor (const char *name);
+
+/* The V3 demangler works in two passes.  The first pass builds a tree
+   representation of the mangled name, and the second pass turns the
+   tree representation into a demangled string.  Here we define an
+   interface to permit a caller to build their own tree
+   representation, which they can pass to the demangler to get a
+   demangled string.  This can be used to canonicalize user input into
+   something which the demangler might output.  It could also be used
+   by other demanglers in the future.  */
+
+/* These are the component types which may be found in the tree.  Many
+   component types have one or two subtrees, referred to as left and
+   right (a component type with only one subtree puts it in the left
+   subtree).  */
+
+enum demangle_component_type
+{
+  /* A name, with a length and a pointer to a string.  */
+  DEMANGLE_COMPONENT_NAME,
+  /* A qualified name.  The left subtree is a class or namespace or
+     some such thing, and the right subtree is a name qualified by
+     that class.  */
+  DEMANGLE_COMPONENT_QUAL_NAME,
+  /* A local name.  The left subtree describes a function, and the
+     right subtree is a name which is local to that function.  */
+  DEMANGLE_COMPONENT_LOCAL_NAME,
+  /* A typed name.  The left subtree is a name, and the right subtree
+     describes that name as a function.  */
+  DEMANGLE_COMPONENT_TYPED_NAME,
+  /* A template.  The left subtree is a template name, and the right
+     subtree is a template argument list.  */
+  DEMANGLE_COMPONENT_TEMPLATE,
+  /* A template parameter.  This holds a number, which is the template
+     parameter index.  */
+  DEMANGLE_COMPONENT_TEMPLATE_PARAM,
+  /* A function parameter.  This holds a number, which is the index.  */
+  DEMANGLE_COMPONENT_FUNCTION_PARAM,
+  /* A constructor.  This holds a name and the kind of
+     constructor.  */
+  DEMANGLE_COMPONENT_CTOR,
+  /* A destructor.  This holds a name and the kind of destructor.  */
+  DEMANGLE_COMPONENT_DTOR,
+  /* A vtable.  This has one subtree, the type for which this is a
+     vtable.  */
+  DEMANGLE_COMPONENT_VTABLE,
+  /* A VTT structure.  This has one subtree, the type for which this
+     is a VTT.  */
+  DEMANGLE_COMPONENT_VTT,
+  /* A construction vtable.  The left subtree is the type for which
+     this is a vtable, and the right subtree is the derived type for
+     which this vtable is built.  */
+  DEMANGLE_COMPONENT_CONSTRUCTION_VTABLE,
+  /* A typeinfo structure.  This has one subtree, the type for which
+     this is the typeinfo structure.  */
+  DEMANGLE_COMPONENT_TYPEINFO,
+  /* A typeinfo name.  This has one subtree, the type for which this
+     is the typeinfo name.  */
+  DEMANGLE_COMPONENT_TYPEINFO_NAME,
+  /* A typeinfo function.  This has one subtree, the type for which
+     this is the typeinfo function.  */
+  DEMANGLE_COMPONENT_TYPEINFO_FN,
+  /* A thunk.  This has one subtree, the name for which this is a
+     thunk.  */
+  DEMANGLE_COMPONENT_THUNK,
+  /* A virtual thunk.  This has one subtree, the name for which this
+     is a virtual thunk.  */
+  DEMANGLE_COMPONENT_VIRTUAL_THUNK,
+  /* A covariant thunk.  This has one subtree, the name for which this
+     is a covariant thunk.  */
+  DEMANGLE_COMPONENT_COVARIANT_THUNK,
+  /* A Java class.  This has one subtree, the type.  */
+  DEMANGLE_COMPONENT_JAVA_CLASS,
+  /* A guard variable.  This has one subtree, the name for which this
+     is a guard variable.  */
+  DEMANGLE_COMPONENT_GUARD,
+  /* The init and wrapper functions for C++11 thread_local variables.  */
+  DEMANGLE_COMPONENT_TLS_INIT,
+  DEMANGLE_COMPONENT_TLS_WRAPPER,
+  /* A reference temporary.  This has one subtree, the name for which
+     this is a temporary.  */
+  DEMANGLE_COMPONENT_REFTEMP,
+  /* A hidden alias.  This has one subtree, the encoding for which it
+     is providing alternative linkage.  */
+  DEMANGLE_COMPONENT_HIDDEN_ALIAS,
+  /* A standard substitution.  This holds the name of the
+     substitution.  */
+  DEMANGLE_COMPONENT_SUB_STD,
+  /* The restrict qualifier.  The one subtree is the type which is
+     being qualified.  */
+  DEMANGLE_COMPONENT_RESTRICT,
+  /* The volatile qualifier.  The one subtree is the type which is
+     being qualified.  */
+  DEMANGLE_COMPONENT_VOLATILE,
+  /* The const qualifier.  The one subtree is the type which is being
+     qualified.  */
+  DEMANGLE_COMPONENT_CONST,
+  /* The restrict qualifier modifying a member function.  The one
+     subtree is the type which is being qualified.  */
+  DEMANGLE_COMPONENT_RESTRICT_THIS,
+  /* The volatile qualifier modifying a member function.  The one
+     subtree is the type which is being qualified.  */
+  DEMANGLE_COMPONENT_VOLATILE_THIS,
+  /* The const qualifier modifying a member function.  The one subtree
+     is the type which is being qualified.  */
+  DEMANGLE_COMPONENT_CONST_THIS,
+  /* C++11: A reference modifying a member function.  The one subtree is the
+     type which is being referenced.  */
+  DEMANGLE_COMPONENT_REFERENCE_THIS,
+  /* C++11: An rvalue reference modifying a member function.  The one
+     subtree is the type which is being referenced.  */
+  DEMANGLE_COMPONENT_RVALUE_REFERENCE_THIS,
+  /* A vendor qualifier.  The left subtree is the type which is being
+     qualified, and the right subtree is the name of the
+     qualifier.  */
+  DEMANGLE_COMPONENT_VENDOR_TYPE_QUAL,
+  /* A pointer.  The one subtree is the type which is being pointed
+     to.  */
+  DEMANGLE_COMPONENT_POINTER,
+  /* A reference.  The one subtree is the type which is being
+     referenced.  */
+  DEMANGLE_COMPONENT_REFERENCE,
+  /* C++0x: An rvalue reference.  The one subtree is the type which is
+     being referenced.  */
+  DEMANGLE_COMPONENT_RVALUE_REFERENCE,
+  /* A complex type.  The one subtree is the base type.  */
+  DEMANGLE_COMPONENT_COMPLEX,
+  /* An imaginary type.  The one subtree is the base type.  */
+  DEMANGLE_COMPONENT_IMAGINARY,
+  /* A builtin type.  This holds the builtin type information.  */
+  DEMANGLE_COMPONENT_BUILTIN_TYPE,
+  /* A vendor's builtin type.  This holds the name of the type.  */
+  DEMANGLE_COMPONENT_VENDOR_TYPE,
+  /* A function type.  The left subtree is the return type.  The right
+     subtree is a list of ARGLIST nodes.  Either or both may be
+     NULL.  */
+  DEMANGLE_COMPONENT_FUNCTION_TYPE,
+  /* An array type.  The left subtree is the dimension, which may be
+     NULL, or a string (represented as DEMANGLE_COMPONENT_NAME), or an
+     expression.  The right subtree is the element type.  */
+  DEMANGLE_COMPONENT_ARRAY_TYPE,
+  /* A pointer to member type.  The left subtree is the class type,
+     and the right subtree is the member type.  CV-qualifiers appear
+     on the latter.  */
+  DEMANGLE_COMPONENT_PTRMEM_TYPE,
+  /* A fixed-point type.  */
+  DEMANGLE_COMPONENT_FIXED_TYPE,
+  /* A vector type.  The left subtree is the number of elements,
+     the right subtree is the element type.  */
+  DEMANGLE_COMPONENT_VECTOR_TYPE,
+  /* An argument list.  The left subtree is the current argument, and
+     the right subtree is either NULL or another ARGLIST node.  */
+  DEMANGLE_COMPONENT_ARGLIST,
+  /* A template argument list.  The left subtree is the current
+     template argument, and the right subtree is either NULL or
+     another TEMPLATE_ARGLIST node.  */
+  DEMANGLE_COMPONENT_TEMPLATE_ARGLIST,
+  /* An initializer list.  The left subtree is either an explicit type or
+     NULL, and the right subtree is a DEMANGLE_COMPONENT_ARGLIST.  */
+  DEMANGLE_COMPONENT_INITIALIZER_LIST,
+  /* An operator.  This holds information about a standard
+     operator.  */
+  DEMANGLE_COMPONENT_OPERATOR,
+  /* An extended operator.  This holds the number of arguments, and
+     the name of the extended operator.  */
+  DEMANGLE_COMPONENT_EXTENDED_OPERATOR,
+  /* A typecast, represented as a unary operator.  The one subtree is
+     the type to which the argument should be cast.  */
+  DEMANGLE_COMPONENT_CAST,
+  /* A conversion operator, represented as a unary operator.  The one
+     subtree is the type to which the argument should be
+     converted.  */
+  DEMANGLE_COMPONENT_CONVERSION,
+  /* A nullary expression.  The left subtree is the operator.  */
+  DEMANGLE_COMPONENT_NULLARY,
+  /* A unary expression.  The left subtree is the operator, and the
+     right subtree is the single argument.  */
+  DEMANGLE_COMPONENT_UNARY,
+  /* A binary expression.  The left subtree is the operator, and the
+     right subtree is a BINARY_ARGS.  */
+  DEMANGLE_COMPONENT_BINARY,
+  /* Arguments to a binary expression.  The left subtree is the first
+     argument, and the right subtree is the second argument.  */
+  DEMANGLE_COMPONENT_BINARY_ARGS,
+  /* A trinary expression.  The left subtree is the operator, and the
+     right subtree is a TRINARY_ARG1.  */
+  DEMANGLE_COMPONENT_TRINARY,
+  /* Arguments to a trinary expression.  The left subtree is the first
+     argument, and the right subtree is a TRINARY_ARG2.  */
+  DEMANGLE_COMPONENT_TRINARY_ARG1,
+  /* More arguments to a trinary expression.  The left subtree is the
+     second argument, and the right subtree is the third argument.  */
+  DEMANGLE_COMPONENT_TRINARY_ARG2,
+  /* A literal.  The left subtree is the type, and the right subtree
+     is the value, represented as a DEMANGLE_COMPONENT_NAME.  */
+  DEMANGLE_COMPONENT_LITERAL,
+  /* A negative literal.  Like LITERAL, but the value is negated.
+     This is a minor hack: the NAME used for LITERAL points directly
+     to the mangled string, but since negative numbers are mangled
+     using 'n' instead of '-', we want a way to indicate a negative
+     number which involves neither modifying the mangled string nor
+     allocating a new copy of the literal in memory.  */
+  DEMANGLE_COMPONENT_LITERAL_NEG,
+  /* A libgcj compiled resource.  The left subtree is the name of the
+     resource.  */
+  DEMANGLE_COMPONENT_JAVA_RESOURCE,
+  /* A name formed by the concatenation of two parts.  The left
+     subtree is the first part and the right subtree the second.  */
+  DEMANGLE_COMPONENT_COMPOUND_NAME,
+  /* A name formed by a single character.  */
+  DEMANGLE_COMPONENT_CHARACTER,
+  /* A number.  */
+  DEMANGLE_COMPONENT_NUMBER,
+  /* A decltype type.  */
+  DEMANGLE_COMPONENT_DECLTYPE,
+  /* Global constructors keyed to name.  */
+  DEMANGLE_COMPONENT_GLOBAL_CONSTRUCTORS,
+  /* Global destructors keyed to name.  */
+  DEMANGLE_COMPONENT_GLOBAL_DESTRUCTORS,
+  /* A lambda closure type.  */
+  DEMANGLE_COMPONENT_LAMBDA,
+  /* A default argument scope.  */
+  DEMANGLE_COMPONENT_DEFAULT_ARG,
+  /* An unnamed type.  */
+  DEMANGLE_COMPONENT_UNNAMED_TYPE,
+  /* A transactional clone.  This has one subtree, the encoding for
+     which it is providing alternative linkage.  */
+  DEMANGLE_COMPONENT_TRANSACTION_CLONE,
+  /* A non-transactional clone entry point.  In the i386/x86_64 abi,
+     the unmangled symbol of a tm_callable becomes a thunk and the
+     non-transactional function version is mangled thus.  */
+  DEMANGLE_COMPONENT_NONTRANSACTION_CLONE,
+  /* A pack expansion.  */
+  DEMANGLE_COMPONENT_PACK_EXPANSION,
+  /* A name with an ABI tag.  */
+  DEMANGLE_COMPONENT_TAGGED_NAME,
+  /* A transaction-safe function type.  */
+  DEMANGLE_COMPONENT_TRANSACTION_SAFE,
+  /* A cloned function.  */
+  DEMANGLE_COMPONENT_CLONE
+};
+
+/* Types which are only used internally.  */
+
+struct demangle_operator_info;
+struct demangle_builtin_type_info;
+
+/* A node in the tree representation is an instance of a struct
+   demangle_component.  Note that the field names of the struct are
+   not well protected against macros defined by the file including
+   this one.  We can fix this if it ever becomes a problem.  */
+
+struct demangle_component
+{
+  /* The type of this component.  */
+  enum demangle_component_type type;
+
+  union
+  {
+    /* For DEMANGLE_COMPONENT_NAME.  */
+    struct
+    {
+      /* A pointer to the name (which need not be NUL-terminated) and
+	 its length.  */
+      const char *s;
+      int len;
+    } s_name;
+
+    /* For DEMANGLE_COMPONENT_OPERATOR.  */
+    struct
+    {
+      /* Operator.  */
+      const struct demangle_operator_info *op;
+    } s_operator;
+
+    /* For DEMANGLE_COMPONENT_EXTENDED_OPERATOR.  */
+    struct
+    {
+      /* Number of arguments.  */
+      int args;
+      /* Name.  */
+      struct demangle_component *name;
+    } s_extended_operator;
+
+    /* For DEMANGLE_COMPONENT_FIXED_TYPE.  */
+    struct
+    {
+      /* The length, indicated by a C integer type name.  */
+      struct demangle_component *length;
+      /* _Accum or _Fract?  */
+      short accum;
+      /* Saturating or not?  */
+      short sat;
+    } s_fixed;
+
+    /* For DEMANGLE_COMPONENT_CTOR.  */
+    struct
+    {
+      /* Kind of constructor.  */
+      enum gnu_v3_ctor_kinds kind;
+      /* Name.  */
+      struct demangle_component *name;
+    } s_ctor;
+
+    /* For DEMANGLE_COMPONENT_DTOR.  */
+    struct
+    {
+      /* Kind of destructor.  */
+      enum gnu_v3_dtor_kinds kind;
+      /* Name.  */
+      struct demangle_component *name;
+    } s_dtor;
+
+    /* For DEMANGLE_COMPONENT_BUILTIN_TYPE.  */
+    struct
+    {
+      /* Builtin type.  */
+      const struct demangle_builtin_type_info *type;
+    } s_builtin;
+
+    /* For DEMANGLE_COMPONENT_SUB_STD.  */
+    struct
+    {
+      /* Standard substitution string.  */
+      const char* string;
+      /* Length of string.  */
+      int len;
+    } s_string;
+
+    /* For DEMANGLE_COMPONENT_*_PARAM.  */
+    struct
+    {
+      /* Parameter index.  */
+      long number;
+    } s_number;
+
+    /* For DEMANGLE_COMPONENT_CHARACTER.  */
+    struct
+    {
+      int character;
+    } s_character;
+
+    /* For other types.  */
+    struct
+    {
+      /* Left (or only) subtree.  */
+      struct demangle_component *left;
+      /* Right subtree.  */
+      struct demangle_component *right;
+    } s_binary;
+
+    struct
+    {
+      /* subtree, same place as d_left.  */
+      struct demangle_component *sub;
+      /* integer.  */
+      int num;
+    } s_unary_num;
+
+  } u;
+};
+
+/* People building mangled trees are expected to allocate instances of
+   struct demangle_component themselves.  They can then call one of
+   the following functions to fill them in.  */
+
+/* Fill in most component types with a left subtree and a right
+   subtree.  Returns non-zero on success, zero on failure, such as an
+   unrecognized or inappropriate component type.  */
+
+extern int
+cplus_demangle_fill_component (struct demangle_component *fill,
+                               enum demangle_component_type,
+                               struct demangle_component *left,
+                               struct demangle_component *right);
+
+/* Fill in a DEMANGLE_COMPONENT_NAME.  Returns non-zero on success,
+   zero for bad arguments.  */
+
+extern int
+cplus_demangle_fill_name (struct demangle_component *fill,
+                          const char *, int);
+
+/* Fill in a DEMANGLE_COMPONENT_BUILTIN_TYPE, using the name of the
+   builtin type (e.g., "int", etc.).  Returns non-zero on success,
+   zero if the type is not recognized.  */
+
+extern int
+cplus_demangle_fill_builtin_type (struct demangle_component *fill,
+                                  const char *type_name);
+
+/* Fill in a DEMANGLE_COMPONENT_OPERATOR, using the name of the
+   operator and the number of arguments which it takes (the latter is
+   used to disambiguate operators which can be both binary and unary,
+   such as '-').  Returns non-zero on success, zero if the operator is
+   not recognized.  */
+
+extern int
+cplus_demangle_fill_operator (struct demangle_component *fill,
+                              const char *opname, int args);
+
+/* Fill in a DEMANGLE_COMPONENT_EXTENDED_OPERATOR, providing the
+   number of arguments and the name.  Returns non-zero on success,
+   zero for bad arguments.  */
+
+extern int
+cplus_demangle_fill_extended_operator (struct demangle_component *fill,
+                                       int numargs,
+                                       struct demangle_component *nm);
+
+/* Fill in a DEMANGLE_COMPONENT_CTOR.  Returns non-zero on success,
+   zero for bad arguments.  */
+
+extern int
+cplus_demangle_fill_ctor (struct demangle_component *fill,
+                          enum gnu_v3_ctor_kinds kind,
+                          struct demangle_component *name);
+
+/* Fill in a DEMANGLE_COMPONENT_DTOR.  Returns non-zero on success,
+   zero for bad arguments.  */
+
+extern int
+cplus_demangle_fill_dtor (struct demangle_component *fill,
+                          enum gnu_v3_dtor_kinds kind,
+                          struct demangle_component *name);
+
+/* This function translates a mangled name into a struct
+   demangle_component tree.  The first argument is the mangled name.
+   The second argument is DMGL_* options.  This returns a pointer to a
+   tree on success, or NULL on failure.  On success, the third
+   argument is set to a block of memory allocated by malloc.  This
+   block should be passed to free when the tree is no longer
+   needed.  */
+
+extern struct demangle_component *
+cplus_demangle_v3_components (const char *mangled, int options, void **mem);
+
+/* This function takes a struct demangle_component tree and returns
+   the corresponding demangled string.  The first argument is DMGL_*
+   options.  The second is the tree to demangle.  The third is a guess
+   at the length of the demangled string, used to initially allocate
+   the return buffer.  The fourth is a pointer to a size_t.  On
+   success, this function returns a buffer allocated by malloc(), and
+   sets the size_t pointed to by the fourth argument to the size of
+   the allocated buffer (not the length of the returned string).  On
+   failure, this function returns NULL, and sets the size_t pointed to
+   by the fourth argument to 0 for an invalid tree, or to 1 for a
+   memory allocation error.  */
+
+extern char *
+cplus_demangle_print (int options,
+                      const struct demangle_component *tree,
+                      int estimated_length,
+                      size_t *p_allocated_size);
+
+/* This function takes a struct demangle_component tree and passes back
+   a demangled string in one or more calls to a callback function.
+   The first argument is DMGL_* options.  The second is the tree to
+   demangle.  The third is a pointer to a callback function; on each call
+   this receives an element of the demangled string, its length, and an
+   opaque value.  The fourth is the opaque value passed to the callback.
+   The callback is called once or more to return the full demangled
+   string.  The demangled element string is always nul-terminated, though
+   its length is also provided for convenience.  In contrast to
+   cplus_demangle_print(), this function does not allocate heap memory
+   to grow output strings (except perhaps where alloca() is implemented
+   by malloc()), and so is normally safe for use where the heap has been
+   corrupted.  On success, this function returns 1; on failure, 0.  */
+
+extern int
+cplus_demangle_print_callback (int options,
+                               const struct demangle_component *tree,
+                               demangle_callbackref callback, void *opaque);
+
+#ifdef __cplusplus
+}
+#endif /* __cplusplus */
+
+#endif	/* DEMANGLE_H */
diff --git a/utils/gapy/gen-debug-info-src/ext/dis-asm.h b/utils/gapy/gen-debug-info-src/ext/dis-asm.h
new file mode 100644
index 000000000..78e9fc01a
--- /dev/null
+++ b/utils/gapy/gen-debug-info-src/ext/dis-asm.h
@@ -0,0 +1,382 @@
+/* Interface between the opcode library and its callers.
+
+   Copyright 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2009, 2010,
+   2011, 2012 Free Software Foundation, Inc.
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 51 Franklin Street - Fifth Floor,
+   Boston, MA 02110-1301, USA.
+
+   Written by Cygnus Support, 1993.
+
+   The opcode library (libopcodes.a) provides instruction decoders for
+   a large variety of instruction sets, callable with an identical
+   interface, for making instruction-processing programs more independent
+   of the instruction set being processed.  */
+
+#ifndef DIS_ASM_H
+#define DIS_ASM_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stdio.h>
+#include "bfd.h"
+
+  typedef int (*fprintf_ftype) (void *, const char*, ...) ATTRIBUTE_FPTR_PRINTF_2;
+
+enum dis_insn_type
+{
+  dis_noninsn,			/* Not a valid instruction.  */
+  dis_nonbranch,		/* Not a branch instruction.  */
+  dis_branch,			/* Unconditional branch.  */
+  dis_condbranch,		/* Conditional branch.  */
+  dis_jsr,			/* Jump to subroutine.  */
+  dis_condjsr,			/* Conditional jump to subroutine.  */
+  dis_dref,			/* Data reference instruction.  */
+  dis_dref2			/* Two data references in instruction.  */
+};
+
+/* This struct is passed into the instruction decoding routine,
+   and is passed back out into each callback.  The various fields are used
+   for conveying information from your main routine into your callbacks,
+   for passing information into the instruction decoders (such as the
+   addresses of the callback functions), or for passing information
+   back from the instruction decoders to their callers.
+
+   It must be initialized before it is first passed; this can be done
+   by hand, or using one of the initialization macros below.  */
+
+typedef struct disassemble_info
+{
+  fprintf_ftype fprintf_func;	/* Printf-style output callback.  */
+  void *stream;			/* Stream used together with fprintf_func.  */
+  void *application_data;
+
+  /* Target description.  We could replace this with a pointer to the bfd,
+     but that would require one.  There currently isn't any such requirement
+     so to avoid introducing one we record these explicitly.  */
+  /* The bfd_flavour.  This can be bfd_target_unknown_flavour.  */
+  enum bfd_flavour flavour;
+  /* The bfd_arch value.  */
+  enum bfd_architecture arch;
+  /* The bfd_mach value.  */
+  unsigned long mach;
+  /* Endianness (for bi-endian cpus).  Mono-endian cpus can ignore this.  */
+  enum bfd_endian endian;
+  /* Endianness of code, for mixed-endian situations such as ARM BE8.  */
+  enum bfd_endian endian_code;
+  /* An arch/mach-specific bitmask of selected instruction subsets, mainly
+     for processors with run-time-switchable instruction sets.  The default,
+     zero, means that there is no constraint.  CGEN-based opcodes ports
+     may use ISA_foo masks.  */
+  void *insn_sets;
+
+  /* Some targets need information about the current section to accurately
+     display insns.  If this is NULL, the target disassembler function
+     will have to make its best guess.  */
+  asection *section;
+
+  /* An array of pointers to symbols either at the location being disassembled
+     or at the start of the function being disassembled.  The array is sorted
+     so that the first symbol is intended to be the one used.  The others are
+     present for any misc. purposes.  This is not set reliably, but if it is
+     not NULL, it is correct.  */
+  asymbol **symbols;
+  /* Number of symbols in array.  */
+  int num_symbols;
+
+  /* Symbol table provided for targets that want to look at it.  This is
+     used on Arm to find mapping symbols and determine Arm/Thumb code.  */
+  asymbol **symtab;
+  int symtab_pos;
+  int symtab_size;
+
+  /* For use by the disassembler.
+     The top 16 bits are reserved for public use (and are documented here).
+     The bottom 16 bits are for the internal use of the disassembler.  */
+  unsigned long flags;
+  /* Set if the disassembler has found one or more relocations for the insn
+     being disassembled.  Unsigned: 1 << 31 would overflow a 32-bit int.  */
+#define INSN_HAS_RELOC	 (1u << 31)
+  /* Set if the user has requested the disassembly of data as well as code.  */
+#define DISASSEMBLE_DATA (1 << 30)
+  /* Set if the user has specifically set the machine type encoded in the
+     mach field of this structure.  */
+#define USER_SPECIFIED_MACHINE_TYPE (1 << 29)
+
+  /* Used internally by the target specific disassembly code.  */
+  void *private_data;
+
+  /* Function used to get bytes to disassemble.  MEMADDR is the
+     address of the stuff to be disassembled, MYADDR is the address to
+     put the bytes in, and LENGTH is the number of bytes to read.
+     INFO is a pointer to this struct.
+     Returns an errno value or 0 for success.  */
+  int (*read_memory_func)
+    (bfd_vma memaddr, bfd_byte *myaddr, unsigned int length,
+     struct disassemble_info *dinfo);
+
+  /* Function which should be called if we get an error that we can't
+     recover from.  STATUS is the errno value from read_memory_func and
+     MEMADDR is the address that we were trying to read.  INFO is a
+     pointer to this struct.  */
+  void (*memory_error_func)
+    (int status, bfd_vma memaddr, struct disassemble_info *dinfo);
+
+  /* Function called to print ADDR.  */
+  void (*print_address_func)
+    (bfd_vma addr, struct disassemble_info *dinfo);
+
+  /* Function called to determine if there is a symbol at the given ADDR.
+     If there is, the function returns 1, otherwise it returns 0.
+     This is used by ports which support an overlay manager where
+     the overlay number is held in the top part of an address.  In
+     some circumstances we want to include the overlay number in the
+     address, (normally because there is a symbol associated with
+     that address), but sometimes we want to mask out the overlay bits.  */
+  int (* symbol_at_address_func)
+    (bfd_vma addr, struct disassemble_info *dinfo);
+
+  /* Function called to check if a SYMBOL can be displayed to the user.
+     This is used by some ports that want to hide special symbols when
+     displaying debugging output.  */
+  bfd_boolean (* symbol_is_valid)
+    (asymbol *, struct disassemble_info *dinfo);
+
+  /* These are for buffer_read_memory.  */
+  bfd_byte *buffer;
+  bfd_vma buffer_vma;
+  unsigned int buffer_length;
+
+  /* This variable may be set by the instruction decoder.  It suggests
+     the number of bytes objdump should display on a single line.  If
+     the instruction decoder sets this, it should always set it to
+     the same value in order to get reasonable looking output.  */
+  int bytes_per_line;
+
+  /* The next two variables control the way objdump displays the raw data.
+     For example, if bytes_per_line is 8 and bytes_per_chunk is 4, the
+     output will look like this:
+     00:   00000000 00000000
+     with the chunks displayed according to "display_endian".  */
+  int bytes_per_chunk;
+  enum bfd_endian display_endian;
+
+  /* Number of octets per incremented target address.
+     Normally one, but some DSPs have byte sizes of 16 or 32 bits.  */
+  unsigned int octets_per_byte;
+
+  /* The number of zeroes we want to see at the end of a section before we
+     start skipping them.  */
+  unsigned int skip_zeroes;
+
+  /* The number of zeroes to skip at the end of a section.  If the number
+     of zeroes at the end is between SKIP_ZEROES_AT_END and SKIP_ZEROES,
+     they will be disassembled.  If there are fewer than
+     SKIP_ZEROES_AT_END, they will be skipped.  This is a heuristic
+     attempt to avoid disassembling zeroes inserted by section
+     alignment.  */
+  unsigned int skip_zeroes_at_end;
+
+  /* Whether the disassembler always needs the relocations.  */
+  bfd_boolean disassembler_needs_relocs;
+
+  /* Results from instruction decoders.  Not all decoders yet support
+     this information.  This info is set each time an instruction is
+     decoded, and is only valid for the last such instruction.
+
+     To determine whether this decoder supports this information, set
+     insn_info_valid to 0, decode an instruction, then check it.  */
+
+  char insn_info_valid;		/* Branch info has been set. */
+  char branch_delay_insns;	/* How many sequential insn's will run before
+				   a branch takes effect.  (0 = normal) */
+  char data_size;		/* Size of data reference in insn, in bytes */
+  enum dis_insn_type insn_type;	/* Type of instruction */
+  bfd_vma target;		/* Target address of branch or dref, if known;
+				   zero if unknown.  */
+  bfd_vma target2;		/* Second target address for dref2 */
+
+  /* Command line options specific to the target disassembler.  */
+  char * disassembler_options;
+
+} disassemble_info;
+
+
+/* Standard disassemblers.  Disassemble one instruction at the given
+   target address.  Return number of octets processed.  */
+typedef int (*disassembler_ftype) (bfd_vma, disassemble_info *);
+
+extern int print_insn_aarch64		(bfd_vma, disassemble_info *);
+extern int print_insn_alpha		(bfd_vma, disassemble_info *);
+extern int print_insn_avr		(bfd_vma, disassemble_info *);
+extern int print_insn_bfin		(bfd_vma, disassemble_info *);
+extern int print_insn_big_arm		(bfd_vma, disassemble_info *);
+extern int print_insn_big_mips		(bfd_vma, disassemble_info *);
+extern int print_insn_big_nios2		(bfd_vma, disassemble_info *);
+extern int print_insn_big_or32		(bfd_vma, disassemble_info *);
+extern int print_insn_big_powerpc	(bfd_vma, disassemble_info *);
+extern int print_insn_big_score         (bfd_vma, disassemble_info *);
+extern int print_insn_cr16              (bfd_vma, disassemble_info *);
+extern int print_insn_crx               (bfd_vma, disassemble_info *);
+extern int print_insn_d10v		(bfd_vma, disassemble_info *);
+extern int print_insn_d30v		(bfd_vma, disassemble_info *);
+extern int print_insn_dlx 		(bfd_vma, disassemble_info *);
+extern int print_insn_epiphany		(bfd_vma, disassemble_info *);
+extern int print_insn_fr30		(bfd_vma, disassemble_info *);
+extern int print_insn_frv		(bfd_vma, disassemble_info *);
+extern int print_insn_h8300		(bfd_vma, disassemble_info *);
+extern int print_insn_h8300h		(bfd_vma, disassemble_info *);
+extern int print_insn_h8300s		(bfd_vma, disassemble_info *);
+extern int print_insn_h8500		(bfd_vma, disassemble_info *);
+extern int print_insn_hppa		(bfd_vma, disassemble_info *);
+extern int print_insn_i370		(bfd_vma, disassemble_info *);
+extern int print_insn_i386		(bfd_vma, disassemble_info *);
+extern int print_insn_i386_att		(bfd_vma, disassemble_info *);
+extern int print_insn_i386_intel	(bfd_vma, disassemble_info *);
+extern int print_insn_i860		(bfd_vma, disassemble_info *);
+extern int print_insn_i960		(bfd_vma, disassemble_info *);
+extern int print_insn_ia64		(bfd_vma, disassemble_info *);
+extern int print_insn_ip2k		(bfd_vma, disassemble_info *);
+extern int print_insn_iq2000		(bfd_vma, disassemble_info *);
+extern int print_insn_little_arm	(bfd_vma, disassemble_info *);
+extern int print_insn_little_mips	(bfd_vma, disassemble_info *);
+extern int print_insn_little_nios2	(bfd_vma, disassemble_info *);
+extern int print_insn_little_or32	(bfd_vma, disassemble_info *);
+extern int print_insn_little_powerpc	(bfd_vma, disassemble_info *);
+extern int print_insn_little_score      (bfd_vma, disassemble_info *); 
+extern int print_insn_lm32		(bfd_vma, disassemble_info *);
+extern int print_insn_m32c	        (bfd_vma, disassemble_info *);
+extern int print_insn_m32r		(bfd_vma, disassemble_info *);
+extern int print_insn_m68hc11		(bfd_vma, disassemble_info *);
+extern int print_insn_m68hc12		(bfd_vma, disassemble_info *);
+extern int print_insn_m9s12x		(bfd_vma, disassemble_info *);
+extern int print_insn_m9s12xg		(bfd_vma, disassemble_info *);
+extern int print_insn_m68k		(bfd_vma, disassemble_info *);
+extern int print_insn_m88k		(bfd_vma, disassemble_info *);
+extern int print_insn_mcore		(bfd_vma, disassemble_info *);
+extern int print_insn_mep		(bfd_vma, disassemble_info *);
+extern int print_insn_metag		(bfd_vma, disassemble_info *);
+extern int print_insn_microblaze	(bfd_vma, disassemble_info *);
+extern int print_insn_mmix		(bfd_vma, disassemble_info *);
+extern int print_insn_mn10200		(bfd_vma, disassemble_info *);
+extern int print_insn_mn10300		(bfd_vma, disassemble_info *);
+extern int print_insn_moxie		(bfd_vma, disassemble_info *);
+extern int print_insn_msp430		(bfd_vma, disassemble_info *);
+extern int print_insn_mt                (bfd_vma, disassemble_info *);
+extern int print_insn_ns32k		(bfd_vma, disassemble_info *);
+extern int print_insn_openrisc		(bfd_vma, disassemble_info *);
+extern int print_insn_pdp11		(bfd_vma, disassemble_info *);
+extern int print_insn_pj		(bfd_vma, disassemble_info *);
+extern int print_insn_rs6000		(bfd_vma, disassemble_info *);
+extern int print_insn_s390		(bfd_vma, disassemble_info *);
+extern int print_insn_sh		(bfd_vma, disassemble_info *);
+extern int print_insn_sh64		(bfd_vma, disassemble_info *);
+extern int print_insn_sh64x_media	(bfd_vma, disassemble_info *);
+extern int print_insn_sparc		(bfd_vma, disassemble_info *);
+extern int print_insn_spu		(bfd_vma, disassemble_info *);
+extern int print_insn_tic30		(bfd_vma, disassemble_info *);
+extern int print_insn_tic4x		(bfd_vma, disassemble_info *);
+extern int print_insn_tic54x		(bfd_vma, disassemble_info *);
+extern int print_insn_tic6x		(bfd_vma, disassemble_info *);
+extern int print_insn_tic80		(bfd_vma, disassemble_info *);
+extern int print_insn_tilegx		(bfd_vma, disassemble_info *);
+extern int print_insn_tilepro		(bfd_vma, disassemble_info *);
+extern int print_insn_v850		(bfd_vma, disassemble_info *);
+extern int print_insn_vax		(bfd_vma, disassemble_info *);
+extern int print_insn_w65		(bfd_vma, disassemble_info *);
+extern int print_insn_xc16x		(bfd_vma, disassemble_info *);
+extern int print_insn_xgate             (bfd_vma, disassemble_info *);
+extern int print_insn_xstormy16		(bfd_vma, disassemble_info *);
+extern int print_insn_xtensa		(bfd_vma, disassemble_info *);
+extern int print_insn_z80		(bfd_vma, disassemble_info *);
+extern int print_insn_z8001		(bfd_vma, disassemble_info *);
+extern int print_insn_z8002		(bfd_vma, disassemble_info *);
+extern int print_insn_rx		(bfd_vma, disassemble_info *);
+extern int print_insn_rl78		(bfd_vma, disassemble_info *);
+
+extern disassembler_ftype arc_get_disassembler (void *);
+extern disassembler_ftype cris_get_disassembler (bfd *);
+
+extern void print_aarch64_disassembler_options (FILE *);
+extern void print_i386_disassembler_options (FILE *);
+extern void print_mips_disassembler_options (FILE *);
+extern void print_ppc_disassembler_options (FILE *);
+extern void print_arm_disassembler_options (FILE *);
+extern void parse_arm_disassembler_option (char *);
+extern void print_s390_disassembler_options (FILE *);
+extern int  get_arm_regname_num_options (void);
+extern int  set_arm_regname_option (int);
+extern int  get_arm_regnames (int, const char **, const char **, const char *const **);
+extern bfd_boolean aarch64_symbol_is_valid (asymbol *, struct disassemble_info *);
+extern bfd_boolean arm_symbol_is_valid (asymbol *, struct disassemble_info *);
+extern void disassemble_init_powerpc (struct disassemble_info *);
+
+/* Fetch the disassembler for a given BFD, if that support is available.  */
+extern disassembler_ftype disassembler (bfd *);
+
+/* Amend the disassemble_info structure as necessary for the target architecture.
+   Should only be called after initialising the info->arch field.  */
+extern void disassemble_init_for_target (struct disassemble_info * dinfo);
+
+/* Document any target specific options available from the disassembler.  */
+extern void disassembler_usage (FILE *);
+
+
+/* This block of definitions is for particular callers who read instructions
+   into a buffer before calling the instruction decoder.  */
+
+/* Here is a function which callers may wish to use for read_memory_func.
+   It gets bytes from a buffer.  */
+extern int buffer_read_memory
+  (bfd_vma, bfd_byte *, unsigned int, struct disassemble_info *);
+
+/* This function goes with buffer_read_memory.
+   It prints a message using info->fprintf_func and info->stream.  */
+extern void perror_memory (int, bfd_vma, struct disassemble_info *);
+
+
+/* Just print the address in hex.  This is included for completeness even
+   though both GDB and objdump provide their own (to print symbolic
+   addresses).  */
+extern void generic_print_address
+  (bfd_vma, struct disassemble_info *);
+
+/* Always true.  */
+extern int generic_symbol_at_address
+  (bfd_vma, struct disassemble_info *);
+
+/* Also always true.  */
+extern bfd_boolean generic_symbol_is_valid
+  (asymbol *, struct disassemble_info *);
+
+/* Method to initialize a disassemble_info struct.  This should be
+   called by all applications creating such a struct.  */
+extern void init_disassemble_info (struct disassemble_info *dinfo, void *stream,
+				   fprintf_ftype fprintf_func);
+
+/* For compatibility with existing code.  */
+#define INIT_DISASSEMBLE_INFO(INFO, STREAM, FPRINTF_FUNC) \
+  init_disassemble_info (&(INFO), (STREAM), (fprintf_ftype) (FPRINTF_FUNC))
+#define INIT_DISASSEMBLE_INFO_NO_ARCH(INFO, STREAM, FPRINTF_FUNC) \
+  init_disassemble_info (&(INFO), (STREAM), (fprintf_ftype) (FPRINTF_FUNC))
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ! defined (DIS_ASM_H) */
diff --git a/utils/gapy/gen-debug-info-src/ext/dwarf2.h b/utils/gapy/gen-debug-info-src/ext/dwarf2.h
new file mode 100644
index 000000000..1a145aa48
--- /dev/null
+++ b/utils/gapy/gen-debug-info-src/ext/dwarf2.h
@@ -0,0 +1,434 @@
+/* Declarations and definitions of codes relating to the DWARF2 and
+   DWARF3 symbolic debugging information formats.
+   Copyright (C) 1992-2016 Free Software Foundation, Inc.
+
+   Written by Gary Funck (gary@intrepid.com) The Ada Joint Program
+   Office (AJPO), Florida State University and Silicon Graphics Inc.
+   provided support for this effort -- June 21, 1995.
+
+   Derived from the DWARF 1 implementation written by Ron Guilmette
+   (rfg@netcom.com), November 1990.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify it under
+   the terms of the GNU General Public License as published by the Free
+   Software Foundation; either version 3, or (at your option) any later
+   version.
+
+   GCC is distributed in the hope that it will be useful, but WITHOUT
+   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
+   License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* This file is derived from the DWARF specification (a public document)
+   Revision 2.0.0 (July 27, 1993) developed by the UNIX International
+   Programming Languages Special Interest Group (UI/PLSIG) and distributed
+   by UNIX International.  Copies of this specification are available from
+   UNIX International, 20 Waterview Boulevard, Parsippany, NJ, 07054.
+
+   This file also now contains definitions from the DWARF 3 specification
+   published Dec 20, 2005, available from: http://dwarf.freestandards.org.  */
+
+#ifndef _DWARF2_H
+#define _DWARF2_H
+
+#define DW_TAG(name, value) , name = value
+#define DW_TAG_DUP(name, value) , name = value
+#define DW_FORM(name, value) , name = value
+#define DW_AT(name, value) , name = value
+#define DW_AT_DUP(name, value) , name = value
+#define DW_OP(name, value) , name = value
+#define DW_OP_DUP(name, value) , name = value
+#define DW_ATE(name, value) , name = value
+#define DW_ATE_DUP(name, value) , name = value
+#define DW_CFA(name, value) , name = value
+
+#define DW_FIRST_TAG(name, value) enum dwarf_tag { \
+  name = value
+#define DW_END_TAG };
+#define DW_FIRST_FORM(name, value) enum dwarf_form { \
+  name = value
+#define DW_END_FORM };
+#define DW_FIRST_AT(name, value) enum dwarf_attribute { \
+  name = value
+#define DW_END_AT };
+#define DW_FIRST_OP(name, value) enum dwarf_location_atom { \
+  name = value
+#define DW_END_OP };
+#define DW_FIRST_ATE(name, value) enum dwarf_type { \
+  name = value
+#define DW_END_ATE };
+#define DW_FIRST_CFA(name, value) enum dwarf_call_frame_info { \
+  name = value
+#define DW_END_CFA };
+
+#include "dwarf2.def"
+
+#undef DW_FIRST_TAG
+#undef DW_END_TAG
+#undef DW_FIRST_FORM
+#undef DW_END_FORM
+#undef DW_FIRST_AT
+#undef DW_END_AT
+#undef DW_FIRST_OP
+#undef DW_END_OP
+#undef DW_FIRST_ATE
+#undef DW_END_ATE
+#undef DW_FIRST_CFA
+#undef DW_END_CFA
+
+#undef DW_TAG
+#undef DW_TAG_DUP
+#undef DW_FORM
+#undef DW_AT
+#undef DW_AT_DUP
+#undef DW_OP
+#undef DW_OP_DUP
+#undef DW_ATE
+#undef DW_ATE_DUP
+#undef DW_CFA
+
+/* Flag that tells whether entry has a child or not.  */
+#define DW_children_no   0
+#define	DW_children_yes  1
+
+#define DW_AT_stride_size   DW_AT_bit_stride  /* Note: The use of DW_AT_stride_size is deprecated.  */
+#define DW_AT_stride   DW_AT_byte_stride  /* Note: The use of DW_AT_stride is deprecated.  */
+
+/* Decimal sign encodings.  */
+enum dwarf_decimal_sign_encoding
+  {
+    /* DWARF 3.  */
+    DW_DS_unsigned = 0x01,
+    DW_DS_leading_overpunch = 0x02,
+    DW_DS_trailing_overpunch = 0x03,
+    DW_DS_leading_separate = 0x04,
+    DW_DS_trailing_separate = 0x05
+  };
+
+/* Endianity encodings.  */
+enum dwarf_endianity_encoding
+  {
+    /* DWARF 3.  */
+    DW_END_default = 0x00,
+    DW_END_big = 0x01,
+    DW_END_little = 0x02,
+
+    DW_END_lo_user = 0x40,
+    DW_END_hi_user = 0xff
+  };
+
+/* Array ordering names and codes.  */
+enum dwarf_array_dim_ordering
+  {
+    DW_ORD_row_major = 0,
+    DW_ORD_col_major = 1
+  };
+
+/* Access attribute.  */
+enum dwarf_access_attribute
+  {
+    DW_ACCESS_public = 1,
+    DW_ACCESS_protected = 2,
+    DW_ACCESS_private = 3
+  };
+
+/* Visibility.  */
+enum dwarf_visibility_attribute
+  {
+    DW_VIS_local = 1,
+    DW_VIS_exported = 2,
+    DW_VIS_qualified = 3
+  };
+
+/* Virtuality.  */
+enum dwarf_virtuality_attribute
+  {
+    DW_VIRTUALITY_none = 0,
+    DW_VIRTUALITY_virtual = 1,
+    DW_VIRTUALITY_pure_virtual = 2
+  };
+
+/* Case sensitivity.  */
+enum dwarf_id_case
+  {
+    DW_ID_case_sensitive = 0,
+    DW_ID_up_case = 1,
+    DW_ID_down_case = 2,
+    DW_ID_case_insensitive = 3
+  };
+
+/* Calling convention.  */
+enum dwarf_calling_convention
+  {
+    DW_CC_normal = 0x1,
+    DW_CC_program = 0x2,
+    DW_CC_nocall = 0x3,
+
+    DW_CC_lo_user = 0x40,
+    DW_CC_hi_user = 0xff,
+
+    DW_CC_GNU_renesas_sh = 0x40,
+    DW_CC_GNU_borland_fastcall_i386 = 0x41,
+
+    /* This DW_CC_ value is not currently generated by any toolchain.  It is
+       used internally to GDB to indicate OpenCL C functions that have been
+       compiled with the IBM XL C for OpenCL compiler and use a non-platform
+       calling convention for passing OpenCL C vector types.  This value may
+       be changed freely as long as it does not conflict with any other DW_CC_
+       value defined here.  */
+    DW_CC_GDB_IBM_OpenCL = 0xff
+  };
+
+/* Inline attribute.  */
+enum dwarf_inline_attribute
+  {
+    DW_INL_not_inlined = 0,
+    DW_INL_inlined = 1,
+    DW_INL_declared_not_inlined = 2,
+    DW_INL_declared_inlined = 3
+  };
+
+/* Discriminant lists.  */
+enum dwarf_discrim_list
+  {
+    DW_DSC_label = 0,
+    DW_DSC_range = 1
+  };
+
+/* Line number opcodes.  */
+enum dwarf_line_number_ops
+  {
+    DW_LNS_extended_op = 0,
+    DW_LNS_copy = 1,
+    DW_LNS_advance_pc = 2,
+    DW_LNS_advance_line = 3,
+    DW_LNS_set_file = 4,
+    DW_LNS_set_column = 5,
+    DW_LNS_negate_stmt = 6,
+    DW_LNS_set_basic_block = 7,
+    DW_LNS_const_add_pc = 8,
+    DW_LNS_fixed_advance_pc = 9,
+    /* DWARF 3.  */
+    DW_LNS_set_prologue_end = 10,
+    DW_LNS_set_epilogue_begin = 11,
+    DW_LNS_set_isa = 12
+  };
+
+/* Line number extended opcodes.  */
+enum dwarf_line_number_x_ops
+  {
+    DW_LNE_end_sequence = 1,
+    DW_LNE_set_address = 2,
+    DW_LNE_define_file = 3,
+    DW_LNE_set_discriminator = 4,
+    /* HP extensions.  */
+    DW_LNE_HP_negate_is_UV_update      = 0x11,
+    DW_LNE_HP_push_context             = 0x12,
+    DW_LNE_HP_pop_context              = 0x13,
+    DW_LNE_HP_set_file_line_column     = 0x14,
+    DW_LNE_HP_set_routine_name         = 0x15,
+    DW_LNE_HP_set_sequence             = 0x16,
+    DW_LNE_HP_negate_post_semantics    = 0x17,
+    DW_LNE_HP_negate_function_exit     = 0x18,
+    DW_LNE_HP_negate_front_end_logical = 0x19,
+    DW_LNE_HP_define_proc              = 0x20,
+    DW_LNE_HP_source_file_correlation  = 0x80,
+
+    DW_LNE_lo_user = 0x80,
+    DW_LNE_hi_user = 0xff
+  };
+
+/* Sub-opcodes for DW_LNE_HP_source_file_correlation.  */
+enum dwarf_line_number_hp_sfc_ops
+  {
+    DW_LNE_HP_SFC_formfeed = 1,
+    DW_LNE_HP_SFC_set_listing_line = 2,
+    DW_LNE_HP_SFC_associate = 3
+  };
+
+/* Type codes for location list entries.
+   Extension for Fission.  See http://gcc.gnu.org/wiki/DebugFission.  */
+
+enum dwarf_location_list_entry_type
+  {
+    DW_LLE_GNU_end_of_list_entry = 0,
+    DW_LLE_GNU_base_address_selection_entry = 1,
+    DW_LLE_GNU_start_end_entry = 2,
+    DW_LLE_GNU_start_length_entry = 3
+  };
+
+#define DW_CIE_ID	  0xffffffff
+#define DW64_CIE_ID	  0xffffffffffffffffULL
+#define DW_CIE_VERSION	  1
+
+#define DW_CFA_extended   0
+
+#define DW_CHILDREN_no		     0x00
+#define DW_CHILDREN_yes		     0x01
+
+#define DW_ADDR_none		0
+
+/* Source language names and codes.  */
+enum dwarf_source_language
+  {
+    DW_LANG_C89 = 0x0001,
+    DW_LANG_C = 0x0002,
+    DW_LANG_Ada83 = 0x0003,
+    DW_LANG_C_plus_plus = 0x0004,
+    DW_LANG_Cobol74 = 0x0005,
+    DW_LANG_Cobol85 = 0x0006,
+    DW_LANG_Fortran77 = 0x0007,
+    DW_LANG_Fortran90 = 0x0008,
+    DW_LANG_Pascal83 = 0x0009,
+    DW_LANG_Modula2 = 0x000a,
+    /* DWARF 3.  */
+    DW_LANG_Java = 0x000b,
+    DW_LANG_C99 = 0x000c,
+    DW_LANG_Ada95 = 0x000d,
+    DW_LANG_Fortran95 = 0x000e,
+    DW_LANG_PLI = 0x000f,
+    DW_LANG_ObjC = 0x0010,
+    DW_LANG_ObjC_plus_plus = 0x0011,
+    DW_LANG_UPC = 0x0012,
+    DW_LANG_D = 0x0013,
+    /* DWARF 4.  */
+    DW_LANG_Python = 0x0014,
+    /* DWARF 5.  */
+    DW_LANG_Go = 0x0016,
+
+    DW_LANG_C_plus_plus_11 = 0x001a, /* dwarf5.20141029.pdf DRAFT */
+    DW_LANG_Rust = 0x001c,
+    DW_LANG_C11 = 0x001d,
+    DW_LANG_C_plus_plus_14 = 0x0021,
+    DW_LANG_Fortran03 = 0x0022,
+    DW_LANG_Fortran08 = 0x0023,
+
+    DW_LANG_lo_user = 0x8000,	/* Implementation-defined range start.  */
+    DW_LANG_hi_user = 0xffff,	/* Implementation-defined range end.  */
+
+    /* MIPS.  */
+    DW_LANG_Mips_Assembler = 0x8001,
+    /* UPC.  */
+    DW_LANG_Upc = 0x8765,
+    /* HP extensions.  */
+    DW_LANG_HP_Bliss     = 0x8003,
+    DW_LANG_HP_Basic91   = 0x8004,
+    DW_LANG_HP_Pascal91  = 0x8005,
+    DW_LANG_HP_IMacro    = 0x8006,
+    DW_LANG_HP_Assembler = 0x8007,
+
+    /* Rust extension, but replaced in DWARF 5.  */
+    DW_LANG_Rust_old = 0x9000
+  };
+
+/* Names and codes for macro information.  */
+enum dwarf_macinfo_record_type
+  {
+    DW_MACINFO_define = 1,
+    DW_MACINFO_undef = 2,
+    DW_MACINFO_start_file = 3,
+    DW_MACINFO_end_file = 4,
+    DW_MACINFO_vendor_ext = 255
+  };
+
+/* Names and codes for new style macro information.  */
+enum dwarf_macro_record_type
+  {
+    DW_MACRO_GNU_define = 1,
+    DW_MACRO_GNU_undef = 2,
+    DW_MACRO_GNU_start_file = 3,
+    DW_MACRO_GNU_end_file = 4,
+    DW_MACRO_GNU_define_indirect = 5,
+    DW_MACRO_GNU_undef_indirect = 6,
+    DW_MACRO_GNU_transparent_include = 7,
+    /* Extensions for DWZ multifile.
+       See http://www.dwarfstd.org/ShowIssue.php?issue=120604.1&type=open .  */
+    DW_MACRO_GNU_define_indirect_alt = 8,
+    DW_MACRO_GNU_undef_indirect_alt = 9,
+    DW_MACRO_GNU_transparent_include_alt = 10,
+    DW_MACRO_GNU_lo_user = 0xe0,
+    DW_MACRO_GNU_hi_user = 0xff
+  };
+
+/* @@@ For use with GNU frame unwind information.  */
+
+#define DW_EH_PE_absptr		0x00
+#define DW_EH_PE_omit		0xff
+
+#define DW_EH_PE_uleb128	0x01
+#define DW_EH_PE_udata2		0x02
+#define DW_EH_PE_udata4		0x03
+#define DW_EH_PE_udata8		0x04
+#define DW_EH_PE_sleb128	0x09
+#define DW_EH_PE_sdata2		0x0A
+#define DW_EH_PE_sdata4		0x0B
+#define DW_EH_PE_sdata8		0x0C
+#define DW_EH_PE_signed		0x08
+
+#define DW_EH_PE_pcrel		0x10
+#define DW_EH_PE_textrel	0x20
+#define DW_EH_PE_datarel	0x30
+#define DW_EH_PE_funcrel	0x40
+#define DW_EH_PE_aligned	0x50
+
+#define DW_EH_PE_indirect	0x80
+
+/* Codes for the debug sections in a dwarf package (.dwp) file.
+   Extensions for Fission.  See http://gcc.gnu.org/wiki/DebugFissionDWP.  */
+enum dwarf_sect
+  {
+    DW_SECT_INFO = 1,
+    DW_SECT_TYPES = 2,
+    DW_SECT_ABBREV = 3,
+    DW_SECT_LINE = 4,
+    DW_SECT_LOC = 5,
+    DW_SECT_STR_OFFSETS = 6,
+    DW_SECT_MACINFO = 7,
+    DW_SECT_MACRO = 8,
+    DW_SECT_MAX = 8
+  };
+
+#ifdef __cplusplus
+extern "C" {
+#endif /* __cplusplus */
+
+/* Return the name of a DW_TAG_ constant, or NULL if the value is not
+   recognized.  */
+extern const char *get_DW_TAG_name (unsigned int tag);
+
+/* Return the name of a DW_AT_ constant, or NULL if the value is not
+   recognized.  */
+extern const char *get_DW_AT_name (unsigned int attr);
+
+/* Return the name of a DW_FORM_ constant, or NULL if the value is not
+   recognized.  */
+extern const char *get_DW_FORM_name (unsigned int form);
+
+/* Return the name of a DW_OP_ constant, or NULL if the value is not
+   recognized.  */
+extern const char *get_DW_OP_name (unsigned int op);
+
+/* Return the name of a DW_ATE_ constant, or NULL if the value is not
+   recognized.  */
+extern const char *get_DW_ATE_name (unsigned int enc);
+
+/* Return the name of a DW_CFA_ constant, or NULL if the value is not
+   recognized.  */
+extern const char *get_DW_CFA_name (unsigned int opc);
+
+#ifdef __cplusplus
+}
+#endif /* __cplusplus */
+
+#endif /* _DWARF2_H */
diff --git a/utils/gapy/gen-debug-info-src/ext/dyn-string.h b/utils/gapy/gen-debug-info-src/ext/dyn-string.h
new file mode 100644
index 000000000..7c3684b7c
--- /dev/null
+++ b/utils/gapy/gen-debug-info-src/ext/dyn-string.h
@@ -0,0 +1,72 @@
+/* An abstract string datatype.
+   Copyright (C) 1998-2015 Free Software Foundation, Inc.
+   Contributed by Mark Mitchell (mark@markmitchell.com).
+
+This file is part of GCC.
+   
+GCC is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2, or (at your option)
+any later version.
+
+GCC is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with GCC; see the file COPYING.  If not, write to
+the Free Software Foundation, 51 Franklin Street - Fifth Floor,
+Boston, MA 02110-1301, USA.  */
+
+#ifndef DYN_STRING_H
+#define DYN_STRING_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct dyn_string
+{
+  int allocated;	/* The amount of space allocated for the string.  */
+  int length;		/* The actual length of the string.  */
+  char *s;		/* The string itself, NUL-terminated.  */
+}* dyn_string_t;
+
+/* The length of STR, in bytes, not including the terminating NUL.  */
+#define dyn_string_length(STR)                                          \
+  ((STR)->length)
+
+/* The NTBS in which the contents of STR are stored.  */
+#define dyn_string_buf(STR)                                             \
+  ((STR)->s)
+
+/* Compare DS1 to DS2 with strcmp.  */
+#define dyn_string_compare(DS1, DS2)                                    \
+  (strcmp ((DS1)->s, (DS2)->s))
+
+
+extern int dyn_string_init (struct dyn_string *, int);
+extern dyn_string_t dyn_string_new (int);
+extern void dyn_string_delete (dyn_string_t);
+extern char *dyn_string_release (dyn_string_t);
+extern dyn_string_t dyn_string_resize (dyn_string_t, int);
+extern void dyn_string_clear (dyn_string_t);
+extern int dyn_string_copy (dyn_string_t, dyn_string_t);
+extern int dyn_string_copy_cstr (dyn_string_t, const char *);
+extern int dyn_string_prepend (dyn_string_t, dyn_string_t);
+extern int dyn_string_prepend_cstr (dyn_string_t, const char *);
+extern int dyn_string_insert (dyn_string_t, int, dyn_string_t);
+extern int dyn_string_insert_cstr (dyn_string_t, int, const char *);
+extern int dyn_string_insert_char (dyn_string_t, int, int);
+extern int dyn_string_append (dyn_string_t, dyn_string_t);
+extern int dyn_string_append_cstr (dyn_string_t, const char *);
+extern int dyn_string_append_char (dyn_string_t, int);
+extern int dyn_string_substring (dyn_string_t,  dyn_string_t, int, int);
+extern int dyn_string_eq (dyn_string_t, dyn_string_t);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* !defined (DYN_STRING_H) */
diff --git a/utils/gapy/gen-debug-info-src/ext/environ.h b/utils/gapy/gen-debug-info-src/ext/environ.h
new file mode 100644
index 000000000..c18902ba5
--- /dev/null
+++ b/utils/gapy/gen-debug-info-src/ext/environ.h
@@ -0,0 +1,33 @@
+/* Declare the environ system variable.
+   Copyright (C) 2015 Free Software Foundation, Inc.
+
+This file is part of the libiberty library.
+Libiberty is free software; you can redistribute it and/or
+modify it under the terms of the GNU Library General Public
+License as published by the Free Software Foundation; either
+version 2 of the License, or (at your option) any later version.
+
+Libiberty is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+Library General Public License for more details.
+
+You should have received a copy of the GNU Library General Public
+License along with libiberty; see the file COPYING.LIB.  If not,
+write to the Free Software Foundation, Inc., 51 Franklin Street - Fifth Floor,
+Boston, MA 02110-1301, USA.  */
+
+/* On OSX, the environ variable can be used directly in the code of an
+   executable, but cannot be used in the code of a shared library (such as
+   GCC's liblto_plugin, which links in libiberty code).  Instead, the
+   function _NSGetEnviron can be called to get the address of environ.  */
+
+#ifndef HAVE_ENVIRON_DECL
+#  ifdef __APPLE__
+#     include <crt_externs.h>
+#     define environ (*_NSGetEnviron ())
+#  else
+extern char **environ;
+#  endif
+#  define HAVE_ENVIRON_DECL
+#endif
diff --git a/utils/gapy/gen-debug-info-src/ext/fibheap.h b/utils/gapy/gen-debug-info-src/ext/fibheap.h
new file mode 100644
index 000000000..85b10c58b
--- /dev/null
+++ b/utils/gapy/gen-debug-info-src/ext/fibheap.h
@@ -0,0 +1,94 @@
+/* A Fibonacci heap datatype.
+   Copyright (C) 1998-2015 Free Software Foundation, Inc.
+   Contributed by Daniel Berlin (dan@cgsoftware.com).
+
+This file is part of GCC.
+   
+GCC is free software; you can redistribute it and/or modify it
+under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2, or (at your option)
+any later version.
+
+GCC is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with GCC; see the file COPYING.  If not, write to
+the Free Software Foundation, 51 Franklin Street - Fifth Floor,
+Boston, MA 02110-1301, USA.  */
+
+/* Fibonacci heaps are somewhat complex, but, there's an article in
+   DDJ that explains them pretty well:
+
+   http://www.ddj.com/articles/1997/9701/9701o/9701o.htm?topic=algoritms
+
+   Introduction to Algorithms by Cormen et al. also goes over them.
+
+   The original paper that introduced them is "Fibonacci heaps and their
+   uses in improved network optimization algorithms" by Fredman and
+   Tarjan (JACM 34(3), July 1987).
+
+   Amortized and real worst case time for operations:
+
+   ExtractMin: O(lg n) amortized. O(n) worst case.
+   DecreaseKey: O(1) amortized.  O(lg n) worst case. 
+   Insert: O(2) amortized. O(1) actual.  
+   Union: O(1) amortized. O(1) actual.  */
+
+#ifndef _FIBHEAP_H_
+#define _FIBHEAP_H_
+
+#include "ansidecl.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef long fibheapkey_t;
+
+typedef struct fibheap
+{
+  size_t nodes;
+  struct fibnode *min;
+  struct fibnode *root;
+} *fibheap_t;
+
+typedef struct fibnode
+{
+  struct fibnode *parent;
+  struct fibnode *child;
+  struct fibnode *left;
+  struct fibnode *right;
+  fibheapkey_t key;
+  void *data;
+#if defined (__GNUC__) && (!defined (SIZEOF_INT) || SIZEOF_INT < 4)
+  __extension__ unsigned long int degree : 31;
+  __extension__ unsigned long int mark : 1;
+#else
+  unsigned int degree : 31;
+  unsigned int mark : 1;
+#endif
+} *fibnode_t;
+
+extern fibheap_t fibheap_new (void);
+extern fibnode_t fibheap_insert (fibheap_t, fibheapkey_t, void *);
+extern int fibheap_empty (fibheap_t);
+extern fibheapkey_t fibheap_min_key (fibheap_t);
+extern fibheapkey_t fibheap_replace_key (fibheap_t, fibnode_t,
+                                         fibheapkey_t);
+extern void *fibheap_replace_key_data (fibheap_t, fibnode_t,
+                                       fibheapkey_t, void *);
+extern void *fibheap_extract_min (fibheap_t);
+extern void *fibheap_min (fibheap_t);
+extern void *fibheap_replace_data (fibheap_t, fibnode_t, void *);
+extern void *fibheap_delete_node (fibheap_t, fibnode_t);
+extern void fibheap_delete (fibheap_t);
+extern fibheap_t fibheap_union (fibheap_t, fibheap_t);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _FIBHEAP_H_ */
diff --git a/utils/gapy/gen-debug-info-src/ext/filenames.h b/utils/gapy/gen-debug-info-src/ext/filenames.h
new file mode 100644
index 000000000..1161daaa4
--- /dev/null
+++ b/utils/gapy/gen-debug-info-src/ext/filenames.h
@@ -0,0 +1,99 @@
+/* Macros for taking apart, interpreting and processing file names.
+
+   These are here because some non-Posix (a.k.a. DOSish) systems have
+   drive letter brain-damage at the beginning of an absolute file name,
+   use forward- and back-slash in path names interchangeably, and
+   some of them have case-insensitive file names.
+
+   Copyright (C) 2000-2015 Free Software Foundation, Inc.
+
+This file is part of BFD, the Binary File Descriptor library.
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA 02110-1301, USA.  */
+
+#ifndef FILENAMES_H
+#define FILENAMES_H
+
+#include "hashtab.h" /* for hashval_t */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if defined(__MSDOS__) || defined(_WIN32) || defined(__OS2__) || defined (__CYGWIN__)
+#  ifndef HAVE_DOS_BASED_FILE_SYSTEM
+#    define HAVE_DOS_BASED_FILE_SYSTEM 1
+#  endif
+#  ifndef HAVE_CASE_INSENSITIVE_FILE_SYSTEM
+#    define HAVE_CASE_INSENSITIVE_FILE_SYSTEM 1
+#  endif
+#  define HAS_DRIVE_SPEC(f) HAS_DOS_DRIVE_SPEC (f)
+#  define IS_DIR_SEPARATOR(c) IS_DOS_DIR_SEPARATOR (c)
+#  define IS_ABSOLUTE_PATH(f) IS_DOS_ABSOLUTE_PATH (f)
+#else /* not DOSish */
+#  if defined(__APPLE__)
+#    ifndef HAVE_CASE_INSENSITIVE_FILE_SYSTEM
+#      define HAVE_CASE_INSENSITIVE_FILE_SYSTEM 1
+#    endif
+#  endif /* __APPLE__ */
+#  define HAS_DRIVE_SPEC(f) (0)
+#  define IS_DIR_SEPARATOR(c) IS_UNIX_DIR_SEPARATOR (c)
+#  define IS_ABSOLUTE_PATH(f) IS_UNIX_ABSOLUTE_PATH (f)
+#endif
+
+#define IS_DIR_SEPARATOR_1(dos_based, c)				\
+  (((c) == '/')								\
+   || (((c) == '\\') && (dos_based)))
+
+#define HAS_DRIVE_SPEC_1(dos_based, f)			\
+  ((f)[0] && ((f)[1] == ':') && (dos_based))
+
+/* Remove the drive spec from F, assuming HAS_DRIVE_SPEC (f).
+   The result is a pointer to the remainder of F.  */
+#define STRIP_DRIVE_SPEC(f)	((f) + 2)
+
+#define IS_DOS_DIR_SEPARATOR(c) IS_DIR_SEPARATOR_1 (1, c)
+#define IS_DOS_ABSOLUTE_PATH(f) IS_ABSOLUTE_PATH_1 (1, f)
+#define HAS_DOS_DRIVE_SPEC(f) HAS_DRIVE_SPEC_1 (1, f)
+
+#define IS_UNIX_DIR_SEPARATOR(c) IS_DIR_SEPARATOR_1 (0, c)
+#define IS_UNIX_ABSOLUTE_PATH(f) IS_ABSOLUTE_PATH_1 (0, f)
+
+/* Note that when DOS_BASED is true, IS_ABSOLUTE_PATH accepts d:foo as
+   well, although it is only semi-absolute.  This is because the users
+   of IS_ABSOLUTE_PATH want to know whether to prepend the current
+   working directory to a file name, which should not be done with a
+   name like d:foo.  */
+#define IS_ABSOLUTE_PATH_1(dos_based, f)		 \
+  (IS_DIR_SEPARATOR_1 (dos_based, (f)[0])		 \
+   || HAS_DRIVE_SPEC_1 (dos_based, f))
+
+extern int filename_cmp (const char *s1, const char *s2);
+#define FILENAME_CMP(s1, s2)	filename_cmp(s1, s2)
+
+extern int filename_ncmp (const char *s1, const char *s2,
+			  size_t n);
+
+extern hashval_t filename_hash (const void *s);
+
+extern int filename_eq (const void *s1, const void *s2);
+
+extern int canonical_filename_eq (const char *a, const char *b);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* FILENAMES_H */
diff --git a/utils/gapy/gen-debug-info-src/ext/floatformat.h b/utils/gapy/gen-debug-info-src/ext/floatformat.h
new file mode 100644
index 000000000..af4d09cb8
--- /dev/null
+++ b/utils/gapy/gen-debug-info-src/ext/floatformat.h
@@ -0,0 +1,159 @@
+/* IEEE floating point support declarations, for GDB, the GNU Debugger.
+   Copyright (C) 1991-2015 Free Software Foundation, Inc.
+
+This file is part of GDB.
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA 02110-1301, USA.  */
+
+#if !defined (FLOATFORMAT_H)
+#define FLOATFORMAT_H 1
+
+#include "ansidecl.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* A floatformat consists of a sign bit, an exponent and a mantissa.  Once the
+   bytes are concatenated according to the byteorder flag, then each of those
+   fields is contiguous.  We number the bits with 0 being the most significant
+   (i.e. BITS_BIG_ENDIAN type numbering), and specify which bits each field
+   contains with the *_start and *_len fields.  */
+
+/* What is the order of the bytes?  */
+
+enum floatformat_byteorders {
+  /* Standard little endian byte order.
+     EX: 1.2345678e10 => 00 00 80 c5 e0 fe 06 42 */
+  floatformat_little,
+
+  /* Standard big endian byte order.
+     EX: 1.2345678e10 => 42 06 fe e0 c5 80 00 00 */
+  floatformat_big,
+
+  /* Little endian byte order but big endian word order.
+     EX: 1.2345678e10 => e0 fe 06 42 00 00 80 c5 */
+  floatformat_littlebyte_bigword,
+
+  /* VAX byte order.  Little endian byte order with 16-bit words.  The
+     following example is an illustration of the byte order only; VAX
+     doesn't have a fully IEEE compliant floating-point format.
+     EX: 1.2345678e10 => 80 c5 00 00 06 42 e0 fe */
+  floatformat_vax
+};
+
+enum floatformat_intbit { floatformat_intbit_yes, floatformat_intbit_no };
+
+struct floatformat
+{
+  enum floatformat_byteorders byteorder;
+  unsigned int totalsize;	/* Total size of number in bits */
+
+  /* Sign bit is always one bit long.  1 means negative, 0 means positive.  */
+  unsigned int sign_start;
+
+  unsigned int exp_start;
+  unsigned int exp_len;
+  /* Bias added to a "true" exponent to form the biased exponent.  It
+     is intentionally signed as, otherwise, -exp_bias can turn into a
+     very large number (e.g., given the exp_bias of 0x3fff and a 64
+     bit long, the equation (long)(1 - exp_bias) evaluates to
+     4294950914 instead of -16382).  */
+  int exp_bias;
+  /* Exponent value which indicates NaN.  This is the actual value stored in
+     the float, not adjusted by the exp_bias.  This usually consists of all
+     one bits.  */
+  unsigned int exp_nan;
+
+  unsigned int man_start;
+  unsigned int man_len;
+
+  /* Is the integer bit explicit or implicit?  */
+  enum floatformat_intbit intbit;
+
+  /* Internal name for debugging. */
+  const char *name;
+
+  /* Validator method.  */
+  int (*is_valid) (const struct floatformat *fmt, const void *from);
+
+  /* Is the format actually the sum of two smaller floating point
+     formats (IBM long double, as described in
+     gcc/config/rs6000/darwin-ldouble-format)?  If so, this is the
+     smaller format in question, and the fields sign_start through
+     intbit describe the first half.  If not, this is NULL.  */
+  const struct floatformat *split_half;
+};
+
+/* floatformats for IEEE half, single and double, big and little endian.  */
+
+extern const struct floatformat floatformat_ieee_half_big;
+extern const struct floatformat floatformat_ieee_half_little;
+extern const struct floatformat floatformat_ieee_single_big;
+extern const struct floatformat floatformat_ieee_single_little;
+extern const struct floatformat floatformat_ieee_double_big;
+extern const struct floatformat floatformat_ieee_double_little;
+
+/* floatformat for ARM IEEE double, little endian bytes and big endian words */
+
+extern const struct floatformat floatformat_ieee_double_littlebyte_bigword;
+
+/* floatformats for VAX.  */
+
+extern const struct floatformat floatformat_vax_f;
+extern const struct floatformat floatformat_vax_d;
+extern const struct floatformat floatformat_vax_g;
+
+/* floatformats for various extendeds.  */
+
+extern const struct floatformat floatformat_i387_ext;
+extern const struct floatformat floatformat_m68881_ext;
+extern const struct floatformat floatformat_i960_ext;
+extern const struct floatformat floatformat_m88110_ext;
+extern const struct floatformat floatformat_m88110_harris_ext;
+extern const struct floatformat floatformat_arm_ext_big;
+extern const struct floatformat floatformat_arm_ext_littlebyte_bigword;
+/* IA-64 Floating Point register spilt into memory.  */
+extern const struct floatformat floatformat_ia64_spill_big;
+extern const struct floatformat floatformat_ia64_spill_little;
+extern const struct floatformat floatformat_ia64_quad_big;
+extern const struct floatformat floatformat_ia64_quad_little;
+/* IBM long double (double+double).  */
+extern const struct floatformat floatformat_ibm_long_double_big;
+extern const struct floatformat floatformat_ibm_long_double_little;
+
+/* Convert from FMT to a double.
+   FROM is the address of the extended float.
+   Store the double in *TO.  */
+
+extern void
+floatformat_to_double (const struct floatformat *, const void *, double *);
+
+/* The converse: convert the double *FROM to FMT
+   and store where TO points.  */
+
+extern void
+floatformat_from_double (const struct floatformat *, const double *, void *);
+
+/* Return non-zero iff the data at FROM is a valid number in format FMT.  */
+
+extern int
+floatformat_is_valid (const struct floatformat *fmt, const void *from);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif	/* defined (FLOATFORMAT_H) */
diff --git a/utils/gapy/gen-debug-info-src/ext/fnmatch.h b/utils/gapy/gen-debug-info-src/ext/fnmatch.h
new file mode 100644
index 000000000..0789fc146
--- /dev/null
+++ b/utils/gapy/gen-debug-info-src/ext/fnmatch.h
@@ -0,0 +1,70 @@
+/* Copyright (C) 1991-2015 Free Software Foundation, Inc.
+
+NOTE: The canonical source of this file is maintained with the GNU C Library.
+Bugs can be reported to bug-glibc@prep.ai.mit.edu.
+
+This program is free software; you can redistribute it and/or modify it
+under the terms of the GNU General Public License as published by the
+Free Software Foundation; either version 2, or (at your option) any
+later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, 51 Franklin Street - Fifth Floor,
+Boston, MA 02110-1301, USA.  */
+
+#ifndef	_FNMATCH_H
+
+#define	_FNMATCH_H	1
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+#if defined (__cplusplus) || (defined (__STDC__) && __STDC__)
+#undef	__P
+#define	__P(args)	args
+#else /* Not C++ or ANSI C.  */
+#undef	__P
+#define	__P(args)	()
+/* We can get away without defining `const' here only because in this file
+   it is used only inside the prototype for `fnmatch', which is elided in
+   non-ANSI C where `const' is problematical.  */
+#endif /* C++ or ANSI C.  */
+
+
+/* We #undef these before defining them because some losing systems
+   (HP-UX A.08.07 for example) define these in <unistd.h>.  */
+#undef	FNM_PATHNAME
+#undef	FNM_NOESCAPE
+#undef	FNM_PERIOD
+
+/* Bits set in the FLAGS argument to `fnmatch'.  */
+#define	FNM_PATHNAME	(1 << 0) /* No wildcard can ever match `/'.  */
+#define	FNM_NOESCAPE	(1 << 1) /* Backslashes don't quote special chars.  */
+#define	FNM_PERIOD	(1 << 2) /* Leading `.' is matched only explicitly.  */
+
+#if !defined (_POSIX_C_SOURCE) || _POSIX_C_SOURCE < 2 || defined (_GNU_SOURCE)
+#define	FNM_FILE_NAME	FNM_PATHNAME /* Preferred GNU name.  */
+#define	FNM_LEADING_DIR	(1 << 3) /* Ignore `/...' after a match.  */
+#define	FNM_CASEFOLD	(1 << 4) /* Compare without regard to case.  */
+#endif
+
+/* Value returned by `fnmatch' if STRING does not match PATTERN.  */
+#define	FNM_NOMATCH	1
+
+/* Match STRING against the filename pattern PATTERN,
+   returning zero if it matches, FNM_NOMATCH if not.  */
+extern int fnmatch __P ((const char *__pattern, const char *__string,
+			 int __flags));
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif /* fnmatch.h */
diff --git a/utils/gapy/gen-debug-info-src/ext/fopen-bin.h b/utils/gapy/gen-debug-info-src/ext/fopen-bin.h
new file mode 100644
index 000000000..8f5f77100
--- /dev/null
+++ b/utils/gapy/gen-debug-info-src/ext/fopen-bin.h
@@ -0,0 +1,44 @@
+/* Macros for the 'type' part of an fopen, freopen or fdopen. 
+
+	<Read|Write>[Update]<Binary file|text file>
+
+   This version is for "binary" systems, where text and binary files are
+   different.  An example is Mess-Dose.  Many Unix systems could also
+   cope with a "b" in the string, indicating binary files, but some reject this
+   (and thereby don't conform to ANSI C, but what else is new?).
+
+   Copyright 1996-2012 Free Software Foundation, Inc.
+   
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 2, or (at your option)
+   any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 51 Franklin Street - Fifth Floor,
+   Boston, MA 02110-1301, USA.  */
+
+/* This file is designed for inclusion by host-dependent .h files.  No
+   user application should include it directly, since that would make
+   the application unable to be configured for both "same" and "binary"
+   variant systems.  */
+
+#define FOPEN_RB	"rb"
+#define FOPEN_WB 	"wb"
+#define FOPEN_AB 	"ab"
+#define FOPEN_RUB 	"r+b"
+#define FOPEN_WUB 	"w+b"
+#define FOPEN_AUB 	"a+b"
+
+#define FOPEN_RT	"r"
+#define FOPEN_WT 	"w"
+#define FOPEN_AT 	"a"
+#define FOPEN_RUT 	"r+"
+#define FOPEN_WUT 	"w+"
+#define FOPEN_AUT 	"a+"
diff --git a/utils/gapy/gen-debug-info-src/ext/fopen-same.h b/utils/gapy/gen-debug-info-src/ext/fopen-same.h
new file mode 100644
index 000000000..d139e02ec
--- /dev/null
+++ b/utils/gapy/gen-debug-info-src/ext/fopen-same.h
@@ -0,0 +1,44 @@
+/* Macros for the 'type' part of an fopen, freopen or fdopen. 
+
+	<Read|Write>[Update]<Binary file|text file>
+
+   This version is for "same" systems, where text and binary files are
+   the same.  An example is Unix.  Many Unix systems could also add a
+   "b" to the string, indicating binary files, but some reject this
+   (and thereby don't conform to ANSI C, but what else is new?).
+
+   Copyright 1996-2012 Free Software Foundation, Inc.
+   
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 2, or (at your option)
+   any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 51 Franklin Street - Fifth Floor,
+   Boston, MA 02110-1301, USA.  */
+
+/* This file is designed for inclusion by host-dependent .h files.  No
+   user application should include it directly, since that would make
+   the application unable to be configured for both "same" and "binary"
+   variant systems.  */
+
+#define FOPEN_RB	"r"
+#define FOPEN_WB 	"w"
+#define FOPEN_AB 	"a"
+#define FOPEN_RUB 	"r+"
+#define FOPEN_WUB 	"w+"
+#define FOPEN_AUB 	"a+"
+
+#define FOPEN_RT	"r"
+#define FOPEN_WT 	"w"
+#define FOPEN_AT 	"a"
+#define FOPEN_RUT 	"r+"
+#define FOPEN_WUT 	"w+"
+#define FOPEN_AUT 	"a+"
diff --git a/utils/gapy/gen-debug-info-src/ext/fopen-vms.h b/utils/gapy/gen-debug-info-src/ext/fopen-vms.h
new file mode 100644
index 000000000..e2b2dc7ad
--- /dev/null
+++ b/utils/gapy/gen-debug-info-src/ext/fopen-vms.h
@@ -0,0 +1,42 @@
+/* Macros for the 'type' part of an fopen, freopen or fdopen. 
+
+	<Read|Write>[Update]<Binary file|text file>
+
+   This version is for VMS systems, where text and binary files are
+   different.
+   
+   Copyright 1996-2012 Free Software Foundation, Inc.
+   
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 2, or (at your option)
+   any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 51 Franklin Street - Fifth Floor,
+   Boston, MA 02110-1301, USA.  */
+
+/* This file is designed for inclusion by host-dependent .h files.  No
+   user application should include it directly, since that would make
+   the application unable to be configured for both "same" and "binary"
+   variant systems.  */
+
+#define FOPEN_RB	"rb,rfm=udf,rat=none"
+#define FOPEN_WB 	"wb,rfm=udf,rat=none"
+#define FOPEN_AB 	"ab,rfm=udf,rat=none"
+#define FOPEN_RUB 	"r+b,rfm=udf,rat=none"
+#define FOPEN_WUB 	"w+b,rfm=udf,rat=none"
+#define FOPEN_AUB 	"a+b,rfm=udf,rat=none"
+
+#define FOPEN_RT	"r"
+#define FOPEN_WT 	"w"
+#define FOPEN_AT 	"a"
+#define FOPEN_RUT 	"r+"
+#define FOPEN_WUT 	"w+"
+#define FOPEN_AUT 	"a+"
diff --git a/utils/gapy/gen-debug-info-src/ext/gcc-c-interface.h b/utils/gapy/gen-debug-info-src/ext/gcc-c-interface.h
new file mode 100644
index 000000000..95d0fc94e
--- /dev/null
+++ b/utils/gapy/gen-debug-info-src/ext/gcc-c-interface.h
@@ -0,0 +1,220 @@
+/* Interface between GCC C FE and GDB
+
+   Copyright (C) 2014-2015 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
+
+#ifndef GCC_C_INTERFACE_H
+#define GCC_C_INTERFACE_H
+
+#include "gcc-interface.h"
+
+/* This header defines the interface to the GCC API.  It must be both
+   valid C and valid C++, because it is included by both programs.  */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Forward declaration.  */
+
+struct gcc_c_context;
+
+/*
+ * Definitions and declarations for the C front end.
+ */
+
+/* Defined versions of the C front-end API.  */
+
+enum gcc_c_api_version
+{
+  GCC_C_FE_VERSION_0 = 0
+};
+
+/* Qualifiers.  */
+
+enum gcc_qualifiers
+{
+  GCC_QUALIFIER_CONST = 1,
+  GCC_QUALIFIER_VOLATILE = 2,
+  GCC_QUALIFIER_RESTRICT = 4
+};
+
+/* This enumerates the kinds of decls that GDB can create.  */
+
+enum gcc_c_symbol_kind
+{
+  /* A function.  */
+
+  GCC_C_SYMBOL_FUNCTION,
+
+  /* A variable.  */
+
+  GCC_C_SYMBOL_VARIABLE,
+
+  /* A typedef.  */
+
+  GCC_C_SYMBOL_TYPEDEF,
+
+  /* A label.  */
+
+  GCC_C_SYMBOL_LABEL
+};
+
+/* This enumerates the types of symbols that GCC might request from
+   GDB.  */
+
+enum gcc_c_oracle_request
+{
+  /* An ordinary symbol -- a variable, function, typedef, or enum
+     constant.  */
+
+  GCC_C_ORACLE_SYMBOL,
+
+  /* A struct, union, or enum tag.  */
+
+  GCC_C_ORACLE_TAG,
+
+  /* A label.  */
+
+  GCC_C_ORACLE_LABEL
+};
+
+/* The type of the function called by GCC to ask GDB for a symbol's
+   definition.  DATUM is an arbitrary value supplied when the oracle
+   function is registered.  CONTEXT is the GCC context in which the
+   request is being made.  REQUEST specifies what sort of symbol is
+   being requested, and IDENTIFIER is the name of the symbol.  */
+
+typedef void gcc_c_oracle_function (void *datum,
+				    struct gcc_c_context *context,
+				    enum gcc_c_oracle_request request,
+				    const char *identifier);
+
+/* The type of the function called by GCC to ask GDB for a symbol's
+   address.  This should return 0 if the address is not known.  */
+
+typedef gcc_address gcc_c_symbol_address_function (void *datum,
+						   struct gcc_c_context *ctxt,
+						   const char *identifier);
+
+/* An array of types used for creating a function type.  */
+
+struct gcc_type_array
+{
+  /* Number of elements.  */
+
+  int n_elements;
+
+  /* The elements.  */
+
+  gcc_type *elements;
+};
+
+/* The vtable used by the C front end.  */
+
+struct gcc_c_fe_vtable
+{
+  /* The version of the C interface.  The value is one of the
+     gcc_c_api_version constants.  */
+
+  unsigned int c_version;
+
+  /* Set the callbacks for this context.
+
+     The binding oracle is called whenever the C parser needs to look
+     up a symbol.  This gives the caller a chance to lazily
+     instantiate symbols using other parts of the gcc_c_fe_interface
+     API.
+
+     The address oracle is called whenever the C parser needs to look
+     up a symbol's address.  This is only called for symbols not
+     provided by the symbol oracle -- that is, just built-in functions
+     where GCC provides the declaration.
+
+     DATUM is an arbitrary piece of data that is passed back verbatim
+     to the callbacks in requests.  */
+
+  void (*set_callbacks) (struct gcc_c_context *self,
+			 gcc_c_oracle_function *binding_oracle,
+			 gcc_c_symbol_address_function *address_oracle,
+			 void *datum);
+
+#define GCC_METHOD0(R, N) \
+  R (*N) (struct gcc_c_context *);
+#define GCC_METHOD1(R, N, A) \
+  R (*N) (struct gcc_c_context *, A);
+#define GCC_METHOD2(R, N, A, B) \
+  R (*N) (struct gcc_c_context *, A, B);
+#define GCC_METHOD3(R, N, A, B, C) \
+  R (*N) (struct gcc_c_context *, A, B, C);
+#define GCC_METHOD4(R, N, A, B, C, D) \
+  R (*N) (struct gcc_c_context *, A, B, C, D);
+#define GCC_METHOD5(R, N, A, B, C, D, E) \
+  R (*N) (struct gcc_c_context *, A, B, C, D, E);
+#define GCC_METHOD7(R, N, A, B, C, D, E, F, G) \
+  R (*N) (struct gcc_c_context *, A, B, C, D, E, F, G);
+
+#include "gcc-c-fe.def"
+
+#undef GCC_METHOD0
+#undef GCC_METHOD1
+#undef GCC_METHOD2
+#undef GCC_METHOD3
+#undef GCC_METHOD4
+#undef GCC_METHOD5
+#undef GCC_METHOD7
+
+};
+
+/* The C front end object.  */
+
+struct gcc_c_context
+{
+  /* Base class.  */
+
+  struct gcc_base_context base;
+
+  /* Our vtable.  This is a separate field because this is simpler
+     than implementing a vtable inheritance scheme in C.  */
+
+  const struct gcc_c_fe_vtable *c_ops;
+};
+
+/* The name of the .so that the compiler builds.  We dlopen this
+   later.  */
+
+#define GCC_C_FE_LIBCC libcc1.so
+
+/* The compiler exports a single initialization function.  This macro
+   holds its name as a symbol.  */
+
+#define GCC_C_FE_CONTEXT gcc_c_fe_context
+
+/* The type of the initialization function.  The caller passes in the
+   desired base version and desired C-specific version.  If the
+   request can be satisfied, a compatible gcc_context object will be
+   returned.  Otherwise, the function returns NULL.  */
+
+typedef struct gcc_c_context *gcc_c_fe_context_function
+    (enum gcc_base_api_version,
+     enum gcc_c_api_version);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* GCC_C_INTERFACE_H */
diff --git a/utils/gapy/gen-debug-info-src/ext/gcc-interface.h b/utils/gapy/gen-debug-info-src/ext/gcc-interface.h
new file mode 100644
index 000000000..df7db6ec1
--- /dev/null
+++ b/utils/gapy/gen-debug-info-src/ext/gcc-interface.h
@@ -0,0 +1,127 @@
+/* Generic interface between GCC and GDB
+
+   Copyright (C) 2014-2015 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
+
+#ifndef GCC_INTERFACE_H
+#define GCC_INTERFACE_H
+
+/* This header defines the interface to the GCC API.  It must be both
+   valid C and valid C++, because it is included by both programs.  */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Opaque typedefs for objects passed through the interface.  */
+
+typedef unsigned long long gcc_type;
+typedef unsigned long long gcc_decl;
+
+/* An address in the inferior.  */
+
+typedef unsigned long long gcc_address;
+
+/* Forward declaration.  */
+
+struct gcc_base_context;
+
+/* Defined versions of the generic API.  */
+
+enum gcc_base_api_version
+{
+  GCC_FE_VERSION_0 = 0
+};
+
+/* The operations defined by the GCC base API.  This is the vtable for
+   the real context structure which is passed around.
+
+   The "base" API is concerned with basics shared by all compiler
+   front ends: setting command-line arguments, the file names, etc.
+
+   Front-end-specific interfaces inherit from this one.  */
+
+struct gcc_base_vtable
+{
+  /* The actual version implemented in this interface.  This field can
+     be relied on not to move, so users can always check it if they
+     desire.  The value is one of the gcc_base_api_version constants.
+  */
+
+  unsigned int version;
+
+  /* Set the compiler's command-line options for the next compilation.
+     TRIPLET_REGEXP is a regular expression that is used to match the
+     configury triplet prefix to the compiler.
+     The arguments are copied by GCC.  ARGV need not be
+     NULL-terminated.  The arguments must be set separately for each
+     compilation; that is, after a compile is requested, the
+     previously-set arguments cannot be reused.
+
+     This returns NULL on success.  On failure, returns a malloc()d
+     error message.  The caller is responsible for freeing it.  */
+
+  char *(*set_arguments) (struct gcc_base_context *self,
+			  const char *triplet_regexp,
+			  int argc, char **argv);
+
+  /* Set the file name of the program to compile.  The string is
+     copied by the method implementation, but the caller must
+     guarantee that the file exists through the compilation.  */
+
+  void (*set_source_file) (struct gcc_base_context *self, const char *file);
+
+  /* Set a callback to use for printing error messages.  DATUM is
+     passed through to the callback unchanged.  */
+
+  void (*set_print_callback) (struct gcc_base_context *self,
+			      void (*print_function) (void *datum,
+						      const char *message),
+			      void *datum);
+
+  /* Perform the compilation.  FILENAME is the name of the resulting
+     object file.  VERBOSE can be set to cause GCC to print some
+     information as it works.  Returns true on success, false on
+     error.  */
+
+  int /* bool */ (*compile) (struct gcc_base_context *self,
+			     const char *filename,
+			     int /* bool */ verbose);
+
+  /* Destroy this object.  */
+
+  void (*destroy) (struct gcc_base_context *self);
+};
+
+/* The GCC object.  */
+
+struct gcc_base_context
+{
+  /* The virtual table.  */
+
+  const struct gcc_base_vtable *ops;
+};
+
+/* The name of the dummy wrapper function generated by gdb.  */
+
+#define GCC_FE_WRAPPER_FUNCTION "_gdb_expr"
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* GCC_INTERFACE_H */
diff --git a/utils/gapy/gen-debug-info-src/ext/gdbm.h b/utils/gapy/gen-debug-info-src/ext/gdbm.h
new file mode 100644
index 000000000..d2a600639
--- /dev/null
+++ b/utils/gapy/gen-debug-info-src/ext/gdbm.h
@@ -0,0 +1,91 @@
+/* GNU DBM - DataBase Manager include file
+   Copyright 1989, 1991  Free Software Foundation, Inc.
+   Written by Philip A. Nelson.
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA 02110-1301, USA.  */
+
+/* You may contact the author by:
+       e-mail:  phil@wwu.edu
+      us-mail:  Philip A. Nelson
+                Computer Science Department
+                Western Washington University
+                Bellingham, WA 98226
+        phone:  (206) 676-3035
+       
+*************************************************************************/
+
+/* Parameters to gdbm_open for READERS, WRITERS, and WRITERS who
+   can create the database. */
+#define  GDBM_READER  0
+#define  GDBM_WRITER  1
+#define  GDBM_WRCREAT 2
+#define  GDBM_NEWDB   3
+
+/* Parameters to gdbm_store for simple insertion or replacement. */
+#define  GDBM_INSERT  0
+#define  GDBM_REPLACE 1
+
+
+/* The data and key structure.  This structure is defined for compatibility. */
+typedef struct {
+	char *dptr;
+	int   dsize;
+      } datum;
+
+
+/* The file information header. This is good enough for most applications. */
+typedef struct {int dummy[10];} *GDBM_FILE;
+
+
+/* These are the routines! */
+
+extern GDBM_FILE gdbm_open ();
+
+extern void	 gdbm_close ();
+
+extern datum	 gdbm_fetch ();
+
+extern int	 gdbm_store ();
+
+extern int	 gdbm_delete ();
+
+extern datum	 gdbm_firstkey ();
+
+extern datum	 gdbm_nextkey ();
+
+extern int	 gdbm_reorganize ();
+
+
+/* gdbm sends back the following error codes in the variable gdbm_errno. */
+typedef enum {	NO_ERROR,
+		MALLOC_ERROR,
+		BLOCK_SIZE_ERROR,
+		FILE_OPEN_ERROR,
+		FILE_WRITE_ERROR,
+		FILE_SEEK_ERROR,
+		FILE_READ_ERROR,
+		BAD_MAGIC_NUMBER,
+		EMPTY_DATABASE,
+		CANT_BE_READER,
+	        CANT_BE_WRITER,
+		READER_CANT_RECOVER,
+		READER_CANT_DELETE,
+		READER_CANT_STORE,
+		READER_CANT_REORGANIZE,
+		UNKNOWN_UPDATE,
+		ITEM_NOT_FOUND,
+		REORGANIZE_FAILED,
+		CANNOT_REPLACE}
+	gdbm_error;
diff --git a/utils/gapy/gen-debug-info-src/ext/getopt.h b/utils/gapy/gen-debug-info-src/ext/getopt.h
new file mode 100644
index 000000000..6f496b113
--- /dev/null
+++ b/utils/gapy/gen-debug-info-src/ext/getopt.h
@@ -0,0 +1,143 @@
+/* Declarations for getopt.
+   Copyright (C) 1989-2015 Free Software Foundation, Inc.
+
+   NOTE: The canonical source of this file is maintained with the GNU C Library.
+   Bugs can be reported to bug-glibc@gnu.org.
+
+   This program is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by the
+   Free Software Foundation; either version 2, or (at your option) any
+   later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA 02110-1301,
+   USA.  */
+
+#ifndef _GETOPT_H
+#define _GETOPT_H 1
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+/* For communication from `getopt' to the caller.
+   When `getopt' finds an option that takes an argument,
+   the argument value is returned here.
+   Also, when `ordering' is RETURN_IN_ORDER,
+   each non-option ARGV-element is returned here.  */
+
+extern char *optarg;
+
+/* Index in ARGV of the next element to be scanned.
+   This is used for communication to and from the caller
+   and for communication between successive calls to `getopt'.
+
+   On entry to `getopt', zero means this is the first call; initialize.
+
+   When `getopt' returns -1, this is the index of the first of the
+   non-option elements that the caller should itself scan.
+
+   Otherwise, `optind' communicates from one call to the next
+   how much of ARGV has been scanned so far.  */
+
+extern int optind;
+
+/* Callers store zero here to inhibit the error message `getopt' prints
+   for unrecognized options.  */
+
+extern int opterr;
+
+/* Set to an option character which was unrecognized.  */
+
+extern int optopt;
+
+/* Describe the long-named options requested by the application.
+   The LONG_OPTIONS argument to getopt_long or getopt_long_only is a vector
+   of `struct option' terminated by an element containing a name which is
+   zero.
+
+   The field `has_arg' is:
+   no_argument		(or 0) if the option does not take an argument,
+   required_argument	(or 1) if the option requires an argument,
+   optional_argument 	(or 2) if the option takes an optional argument.
+
+   If the field `flag' is not NULL, it points to a variable that is set
+   to the value given in the field `val' when the option is found, but
+   left unchanged if the option is not found.
+
+   To have a long-named option do something other than set an `int' to
+   a compiled-in constant, such as set a value from `optarg', set the
+   option's `flag' field to zero and its `val' field to a nonzero
+   value (the equivalent single-letter option character, if there is
+   one).  For long options that have a zero `flag' field, `getopt'
+   returns the contents of the `val' field.  */
+
+struct option
+{
+#if defined (__STDC__) && __STDC__
+  const char *name;
+#else
+  char *name;
+#endif
+  /* has_arg can't be an enum because some compilers complain about
+     type mismatches in all the code that assumes it is an int.  */
+  int has_arg;
+  int *flag;
+  int val;
+};
+
+/* Names for the values of the `has_arg' field of `struct option'.  */
+
+#define	no_argument		0
+#define required_argument	1
+#define optional_argument	2
+
+#if defined (__STDC__) && __STDC__
+/* HAVE_DECL_* is a three-state macro: undefined, 0 or 1.  If it is
+   undefined, we haven't run the autoconf check so provide the
+   declaration without arguments.  If it is 0, we checked and failed
+   to find the declaration so provide a fully prototyped one.  If it
+   is 1, we found it so don't provide any declaration at all.  */
+#if !HAVE_DECL_GETOPT
+#if defined (__GNU_LIBRARY__) || defined (HAVE_DECL_GETOPT)
+/* Many other libraries have conflicting prototypes for getopt, with
+   differences in the consts, in unistd.h.  To avoid compilation
+   errors, only prototype getopt for the GNU C library.  */
+extern int getopt (int argc, char *const *argv, const char *shortopts);
+#else
+#ifndef __cplusplus
+extern int getopt ();
+#endif /* __cplusplus */
+#endif
+#endif /* !HAVE_DECL_GETOPT */
+
+extern int getopt_long (int argc, char *const *argv, const char *shortopts,
+		        const struct option *longopts, int *longind);
+extern int getopt_long_only (int argc, char *const *argv,
+			     const char *shortopts,
+		             const struct option *longopts, int *longind);
+
+/* Internal only.  Users should not call this directly.  */
+extern int _getopt_internal (int argc, char *const *argv,
+			     const char *shortopts,
+		             const struct option *longopts, int *longind,
+			     int long_only);
+#else /* not __STDC__ */
+extern int getopt ();
+extern int getopt_long ();
+extern int getopt_long_only ();
+
+extern int _getopt_internal ();
+#endif /* __STDC__ */
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif /* getopt.h */
diff --git a/utils/gapy/gen-debug-info-src/ext/gomp-constants.h b/utils/gapy/gen-debug-info-src/ext/gomp-constants.h
new file mode 100644
index 000000000..9bc9fa5c3
--- /dev/null
+++ b/utils/gapy/gen-debug-info-src/ext/gomp-constants.h
@@ -0,0 +1,259 @@
+/* Communication between GCC and libgomp.
+
+   Copyright (C) 2014-2015 Free Software Foundation, Inc.
+
+   Contributed by Mentor Embedded.
+
+   This file is part of the GNU Offloading and Multi Processing Library
+   (libgomp).
+
+   Libgomp is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
+   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+   more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef GOMP_CONSTANTS_H
+#define GOMP_CONSTANTS_H 1
+
+/* Memory mapping types.  */
+
+/* One byte.  */
+#define GOMP_MAP_LAST			(1 << 8)
+
+#define GOMP_MAP_FLAG_TO		(1 << 0)
+#define GOMP_MAP_FLAG_FROM		(1 << 1)
+/* Special map kinds, enumerated starting here.  */
+#define GOMP_MAP_FLAG_SPECIAL_0		(1 << 2)
+#define GOMP_MAP_FLAG_SPECIAL_1		(1 << 3)
+#define GOMP_MAP_FLAG_SPECIAL_2		(1 << 4)
+#define GOMP_MAP_FLAG_SPECIAL		(GOMP_MAP_FLAG_SPECIAL_1 \
+					 | GOMP_MAP_FLAG_SPECIAL_0)
+/* Flag to force a specific behavior (or else, trigger a run-time error).  */
+#define GOMP_MAP_FLAG_FORCE		(1 << 7)
+
+enum gomp_map_kind
+  {
+    /* If not already present, allocate.  */
+    GOMP_MAP_ALLOC =			0,
+    /* ..., and copy to device.  */
+    GOMP_MAP_TO =			(GOMP_MAP_ALLOC | GOMP_MAP_FLAG_TO),
+    /* ..., and copy from device.  */
+    GOMP_MAP_FROM =			(GOMP_MAP_ALLOC | GOMP_MAP_FLAG_FROM),
+    /* ..., and copy to and from device.  */
+    GOMP_MAP_TOFROM =			(GOMP_MAP_TO | GOMP_MAP_FROM),
+    /* The following kind is an internal only map kind, used for pointer based
+       array sections.  OMP_CLAUSE_SIZE for these is not the pointer size,
+       which is implicitly POINTER_SIZE_UNITS, but the bias.  */
+    GOMP_MAP_POINTER =			(GOMP_MAP_FLAG_SPECIAL_0 | 0),
+    /* Also internal, behaves like GOMP_MAP_TO, but additionally any
+       GOMP_MAP_POINTER records consecutive after it which have addresses
+       falling into that range will not be ignored if GOMP_MAP_TO_PSET wasn't
+       mapped already.  */
+    GOMP_MAP_TO_PSET =			(GOMP_MAP_FLAG_SPECIAL_0 | 1),
+    /* Must already be present.  */
+    GOMP_MAP_FORCE_PRESENT =		(GOMP_MAP_FLAG_SPECIAL_0 | 2),
+    /* Deallocate a mapping, without copying from device.  */
+    GOMP_MAP_DELETE =			(GOMP_MAP_FLAG_SPECIAL_0 | 3),
+    /* Is a device pointer.  OMP_CLAUSE_SIZE for these is unused; is implicitly
+       POINTER_SIZE_UNITS.  */
+    GOMP_MAP_FORCE_DEVICEPTR =		(GOMP_MAP_FLAG_SPECIAL_1 | 0),
+    /* OpenACC device_resident.  (The "do not map, copy bits for
+       firstprivate instead" kind is GOMP_MAP_FIRSTPRIVATE, below.)  */
+    GOMP_MAP_DEVICE_RESIDENT =		(GOMP_MAP_FLAG_SPECIAL_1 | 1),
+    /* OpenACC link.  */
+    GOMP_MAP_LINK =			(GOMP_MAP_FLAG_SPECIAL_1 | 2),
+    /* Do not map, copy bits for firstprivate instead.  */
+    GOMP_MAP_FIRSTPRIVATE =		(GOMP_MAP_FLAG_SPECIAL | 0),
+    /* Similarly, but store the value in the pointer rather than
+       pointed by the pointer.  */
+    GOMP_MAP_FIRSTPRIVATE_INT =		(GOMP_MAP_FLAG_SPECIAL | 1),
+    /* Pointer translate host address into device address and copy that
+       back to host.  */
+    GOMP_MAP_USE_DEVICE_PTR =		(GOMP_MAP_FLAG_SPECIAL | 2),
+    /* Allocate a zero length array section.  Prefer next non-zero length
+       mapping over previous non-zero length mapping over zero length mapping
+       at the address.  If not already mapped, do nothing (and pointer translate
+       to NULL).  */
+    GOMP_MAP_ZERO_LEN_ARRAY_SECTION = 	(GOMP_MAP_FLAG_SPECIAL | 3),
+    /* Allocate.  */
+    GOMP_MAP_FORCE_ALLOC =		(GOMP_MAP_FLAG_FORCE | GOMP_MAP_ALLOC),
+    /* ..., and copy to device.  */
+    GOMP_MAP_FORCE_TO =			(GOMP_MAP_FLAG_FORCE | GOMP_MAP_TO),
+    /* ..., and copy from device.  */
+    GOMP_MAP_FORCE_FROM =		(GOMP_MAP_FLAG_FORCE | GOMP_MAP_FROM),
+    /* ..., and copy to and from device.  */
+    GOMP_MAP_FORCE_TOFROM =		(GOMP_MAP_FLAG_FORCE | GOMP_MAP_TOFROM),
+    /* If not already present, allocate.  And unconditionally copy to
+       device.  */
+    GOMP_MAP_ALWAYS_TO =		(GOMP_MAP_FLAG_SPECIAL_2 | GOMP_MAP_TO),
+    /* If not already present, allocate.  And unconditionally copy from
+       device.  */
+    GOMP_MAP_ALWAYS_FROM =		(GOMP_MAP_FLAG_SPECIAL_2
+					 | GOMP_MAP_FROM),
+    /* If not already present, allocate.  And unconditionally copy to and from
+       device.  */
+    GOMP_MAP_ALWAYS_TOFROM =		(GOMP_MAP_FLAG_SPECIAL_2
+					 | GOMP_MAP_TOFROM),
+    /* Map a sparse struct; the address is the base of the structure, alignment
+       its required alignment, and size is the number of adjacent entries
+       that belong to the struct.  The adjacent entries should be sorted by
+       increasing address, so it is easy to determine lowest needed address
+       (address of the first adjacent entry) and highest needed address
+       (address of the last adjacent entry plus its size).  */
+    GOMP_MAP_STRUCT =			(GOMP_MAP_FLAG_SPECIAL_2
+					 | GOMP_MAP_FLAG_SPECIAL | 0),
+    /* On a location of a pointer/reference that is assumed to be already mapped
+       earlier, store the translated address of the preceding mapping.
+       No refcount is bumped by this, and the store is done unconditionally.  */
+    GOMP_MAP_ALWAYS_POINTER =		(GOMP_MAP_FLAG_SPECIAL_2
+					 | GOMP_MAP_FLAG_SPECIAL | 1),
+    /* Forced deallocation of zero length array section.  */
+    GOMP_MAP_DELETE_ZERO_LEN_ARRAY_SECTION
+      =					(GOMP_MAP_FLAG_SPECIAL_2
+					 | GOMP_MAP_FLAG_SPECIAL | 3),
+    /* Decrement usage count and deallocate if zero.  */
+    GOMP_MAP_RELEASE =			(GOMP_MAP_FLAG_SPECIAL_2
+					 | GOMP_MAP_DELETE),
+
+    /* Internal to GCC, not used in libgomp.  */
+    /* Do not map, but pointer assign a pointer instead.  */
+    GOMP_MAP_FIRSTPRIVATE_POINTER =	(GOMP_MAP_LAST | 1),
+    /* Do not map, but pointer assign a reference instead.  */
+    GOMP_MAP_FIRSTPRIVATE_REFERENCE =	(GOMP_MAP_LAST | 2)
+  };
+
+#define GOMP_MAP_COPY_TO_P(X) \
+  (!((X) & GOMP_MAP_FLAG_SPECIAL) \
+   && ((X) & GOMP_MAP_FLAG_TO))
+
+#define GOMP_MAP_COPY_FROM_P(X) \
+  (!((X) & GOMP_MAP_FLAG_SPECIAL) \
+   && ((X) & GOMP_MAP_FLAG_FROM))
+
+#define GOMP_MAP_POINTER_P(X) \
+  ((X) == GOMP_MAP_POINTER)
+
+#define GOMP_MAP_ALWAYS_TO_P(X) \
+  (((X) == GOMP_MAP_ALWAYS_TO) || ((X) == GOMP_MAP_ALWAYS_TOFROM))
+
+#define GOMP_MAP_ALWAYS_FROM_P(X) \
+  (((X) == GOMP_MAP_ALWAYS_FROM) || ((X) == GOMP_MAP_ALWAYS_TOFROM))
+
+#define GOMP_MAP_ALWAYS_P(X) \
+  (GOMP_MAP_ALWAYS_TO_P (X) || ((X) == GOMP_MAP_ALWAYS_FROM))
+
+
+/* Asynchronous behavior.  Keep in sync with
+   libgomp/{openacc.h,openacc.f90,openacc_lib.h}:acc_async_t.  */
+
+#define GOMP_ASYNC_NOVAL		-1
+#define GOMP_ASYNC_SYNC			-2
+
+
+/* Device codes.  Keep in sync with
+   libgomp/{openacc.h,openacc.f90,openacc_lib.h}:acc_device_t as well as
+   libgomp/libgomp-plugin.h.  */
+#define GOMP_DEVICE_NONE		0
+#define GOMP_DEVICE_DEFAULT		1
+#define GOMP_DEVICE_HOST		2
+/* #define GOMP_DEVICE_HOST_NONSHM	3 removed.  */
+#define GOMP_DEVICE_NOT_HOST		4
+#define GOMP_DEVICE_NVIDIA_PTX		5
+#define GOMP_DEVICE_INTEL_MIC		6
+#define GOMP_DEVICE_HSA			7
+
+#define GOMP_DEVICE_ICV			-1
+#define GOMP_DEVICE_HOST_FALLBACK	-2
+
+/* GOMP_task/GOMP_taskloop* flags argument.  */
+#define GOMP_TASK_FLAG_UNTIED		(1 << 0)
+#define GOMP_TASK_FLAG_FINAL		(1 << 1)
+#define GOMP_TASK_FLAG_MERGEABLE	(1 << 2)
+#define GOMP_TASK_FLAG_DEPEND		(1 << 3)
+#define GOMP_TASK_FLAG_PRIORITY		(1 << 4)
+#define GOMP_TASK_FLAG_UP		(1 << 8)
+#define GOMP_TASK_FLAG_GRAINSIZE	(1 << 9)
+#define GOMP_TASK_FLAG_IF		(1 << 10)
+#define GOMP_TASK_FLAG_NOGROUP		(1 << 11)
+
+/* GOMP_target{_ext,update_ext,enter_exit_data} flags argument.  */
+#define GOMP_TARGET_FLAG_NOWAIT		(1 << 0)
+#define GOMP_TARGET_FLAG_EXIT_DATA	(1 << 1)
+/* Internal to libgomp.  */
+#define GOMP_TARGET_FLAG_UPDATE		(1U << 31)
+
+/* Versions of libgomp and device-specific plugins.  GOMP_VERSION
+   should be incremented whenever an ABI-incompatible change is introduced
+   to the plugin interface defined in libgomp/libgomp.h.  */
+#define GOMP_VERSION	1
+#define GOMP_VERSION_NVIDIA_PTX 1
+#define GOMP_VERSION_INTEL_MIC 0
+#define GOMP_VERSION_HSA 0
+
+#define GOMP_VERSION_PACK(LIB, DEV) (((LIB) << 16) | (DEV))
+#define GOMP_VERSION_LIB(PACK) (((PACK) >> 16) & 0xffff)
+#define GOMP_VERSION_DEV(PACK) ((PACK) & 0xffff)
+
+#define GOMP_DIM_GANG	0
+#define GOMP_DIM_WORKER	1
+#define GOMP_DIM_VECTOR	2
+#define GOMP_DIM_MAX	3
+#define GOMP_DIM_MASK(X) (1u << (X))
+
+/* Variadic launch arguments.  End of list is marked by a zero.  */
+#define GOMP_LAUNCH_DIM		1  /* Launch dimensions, op = mask */
+#define GOMP_LAUNCH_ASYNC	2  /* Async, op = cst val if not MAX  */
+#define GOMP_LAUNCH_WAIT	3  /* Waits, op = num waits.  */
+#define GOMP_LAUNCH_CODE_SHIFT	28
+#define GOMP_LAUNCH_DEVICE_SHIFT 16
+#define GOMP_LAUNCH_OP_SHIFT 0
+#define GOMP_LAUNCH_PACK(CODE,DEVICE,OP)	\
+  (((CODE) << GOMP_LAUNCH_CODE_SHIFT)		\
+   | ((DEVICE) << GOMP_LAUNCH_DEVICE_SHIFT)	\
+   | ((OP) << GOMP_LAUNCH_OP_SHIFT))
+#define GOMP_LAUNCH_CODE(X) (((X) >> GOMP_LAUNCH_CODE_SHIFT) & 0xf)
+#define GOMP_LAUNCH_DEVICE(X) (((X) >> GOMP_LAUNCH_DEVICE_SHIFT) & 0xfff)
+#define GOMP_LAUNCH_OP(X) (((X) >> GOMP_LAUNCH_OP_SHIFT) & 0xffff)
+#define GOMP_LAUNCH_OP_MAX 0xffff
+
+/* Bitmask to apply in order to find out the intended device of a target
+   argument.  */
+#define GOMP_TARGET_ARG_DEVICE_MASK		((1 << 7) - 1)
+/* The target argument is significant for all devices.  */
+#define GOMP_TARGET_ARG_DEVICE_ALL		0
+
+/* Flag set when the value is stored in the subsequent element of the
+   device-specific argument values.  */
+#define GOMP_TARGET_ARG_SUBSEQUENT_PARAM	(1 << 7)
+
+/* Bitmask to apply to a target argument to find out the value identifier.  */
+#define GOMP_TARGET_ARG_ID_MASK			(((1 << 8) - 1) << 8)
+/* Target argument index of NUM_TEAMS.  */
+#define GOMP_TARGET_ARG_NUM_TEAMS		(1 << 8)
+/* Target argument index of THREAD_LIMIT.  */
+#define GOMP_TARGET_ARG_THREAD_LIMIT		(2 << 8)
+
+/* If the value is directly embedded in the target argument, it should be at
+   most 16 bits wide and is shifted by this many bits.  */
+#define GOMP_TARGET_ARG_VALUE_SHIFT		16
+
+/* HSA specific data structures.  */
+
+/* Identifiers of device-specific target arguments.  */
+#define GOMP_TARGET_ARG_HSA_KERNEL_ATTRIBUTES	(1 << 8)
+
+#endif
diff --git a/utils/gapy/gen-debug-info-src/ext/hashtab.h b/utils/gapy/gen-debug-info-src/ext/hashtab.h
new file mode 100644
index 000000000..b1b5877aa
--- /dev/null
+++ b/utils/gapy/gen-debug-info-src/ext/hashtab.h
@@ -0,0 +1,204 @@
+/* An expandable hash table datatype.
+   Copyright (C) 1999-2015 Free Software Foundation, Inc.
+   Contributed by Vladimir Makarov (vmakarov@cygnus.com).
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA 02110-1301, USA.  */
+
+/* This package implements basic hash table functionality.  It is possible
+   to search for an entry, create an entry and destroy an entry.
+
+   Elements in the table are generic pointers.
+
+   The size of the table is not fixed; if the occupancy of the table
+   grows too high the hash table will be expanded.
+
+   The abstract data implementation is based on generalized Algorithm D
+   from Knuth's book "The art of computer programming".  Hash table is
+   expanded by creation of new hash table and transferring elements from
+   the old table to the new table.  */
+
+#ifndef __HASHTAB_H__
+#define __HASHTAB_H__
+
+#ifdef __cplusplus
+extern "C" {
+#endif /* __cplusplus */
+
+#include "ansidecl.h"
+
+/* The type for a hash code.  */
+typedef unsigned int hashval_t;
+
+/* Callback function pointer types.  */
+
+/* Calculate hash of a table entry.  */
+typedef hashval_t (*htab_hash) (const void *);
+
+/* Compare a table entry with a possible entry.  The entry already in
+   the table always comes first, so the second element can be of a
+   different type (but in this case htab_find and htab_find_slot
+   cannot be used; instead the variants that accept a hash value
+   must be used).  */
+typedef int (*htab_eq) (const void *, const void *);
+
+/* Cleanup function called whenever a live element is removed from
+   the hash table.  */
+typedef void (*htab_del) (void *);
+  
+/* Function called by htab_traverse for each live element.  The first
+   arg is the slot of the element (which can be passed to htab_clear_slot
+   if desired), the second arg is the auxiliary pointer handed to
+   htab_traverse.  Return 1 to continue scan, 0 to stop.  */
+typedef int (*htab_trav) (void **, void *);
+
+/* Memory-allocation function, with the same functionality as calloc().
+   Iff it returns NULL, the hash table implementation will pass an error
+   code back to the user, so if your code doesn't handle errors,
+   best if you use xcalloc instead.  */
+typedef void *(*htab_alloc) (size_t, size_t);
+
+/* We also need a free() routine.  */
+typedef void (*htab_free) (void *);
+
+/* Memory allocation and deallocation; variants which take an extra
+   argument.  */
+typedef void *(*htab_alloc_with_arg) (void *, size_t, size_t);
+typedef void (*htab_free_with_arg) (void *, void *);
+
+/* This macro defines reserved value for empty table entry.  */
+
+#define HTAB_EMPTY_ENTRY    ((PTR) 0)
+
+/* This macro defines reserved value for table entry which contained
+   a deleted element. */
+
+#define HTAB_DELETED_ENTRY  ((PTR) 1)
+
+/* Hash tables are of the following type.  The structure
+   (implementation) of this type is not needed for using the hash
+   tables.  All work with hash table should be executed only through
+   functions mentioned below.  The size of this structure is subject to
+   change.  */
+
+struct htab {
+  /* Pointer to hash function.  */
+  htab_hash hash_f;
+
+  /* Pointer to comparison function.  */
+  htab_eq eq_f;
+
+  /* Pointer to cleanup function.  */
+  htab_del del_f;
+
+  /* Table itself.  */
+  void **entries;
+
+  /* Current size (in entries) of the hash table.  */
+  size_t size;
+
+  /* Current number of elements including also deleted elements.  */
+  size_t n_elements;
+
+  /* Current number of deleted elements in the table.  */
+  size_t n_deleted;
+
+  /* The following member is used for debugging. Its value is number
+     of all calls of `htab_find_slot' for the hash table. */
+  unsigned int searches;
+
+  /* The following member is used for debugging.  Its value is number
+     of collisions fixed for time of work with the hash table. */
+  unsigned int collisions;
+
+  /* Pointers to allocate/free functions.  */
+  htab_alloc alloc_f;	/* calloc-like: see the htab_alloc typedef */
+  htab_free free_f;
+
+  /* Alternate allocate/free functions, which take an extra argument.  */
+  void *alloc_arg;	/* presumably the extra argument handed to the two functions below -- confirm in hashtab.c */
+  htab_alloc_with_arg alloc_with_arg_f;
+  htab_free_with_arg free_with_arg_f;
+
+  /* Current size (in entries) of the hash table, as an index into the
+     table of primes.  */
+  unsigned int size_prime_index;
+};
+
+typedef struct htab *htab_t;
+
+/* An enum saying whether we insert into the hash table or not.  */
+enum insert_option {NO_INSERT, INSERT};
+
+/* The prototypes of the package functions. */
+
+extern htab_t	htab_create_alloc  (size_t, htab_hash,
+                                    htab_eq, htab_del,
+                                    htab_alloc, htab_free);
+
+extern htab_t	htab_create_alloc_ex (size_t, htab_hash,
+                                      htab_eq, htab_del,
+                                      void *, htab_alloc_with_arg,
+                                      htab_free_with_arg);
+
+extern htab_t  htab_create_typed_alloc (size_t, htab_hash, htab_eq, htab_del,
+					htab_alloc, htab_alloc, htab_free);
+
+/* Backward-compatibility functions.  */
+extern htab_t htab_create (size_t, htab_hash, htab_eq, htab_del);
+extern htab_t htab_try_create (size_t, htab_hash, htab_eq, htab_del);
+
+extern void	htab_set_functions_ex (htab_t, htab_hash,
+                                       htab_eq, htab_del,
+                                       void *, htab_alloc_with_arg,
+                                       htab_free_with_arg);
+
+extern void	htab_delete (htab_t);
+extern void	htab_empty (htab_t);
+
+extern void *	htab_find (htab_t, const void *);
+extern void **	htab_find_slot (htab_t, const void *, enum insert_option);
+extern void *	htab_find_with_hash (htab_t, const void *, hashval_t);
+extern void **	htab_find_slot_with_hash (htab_t, const void *,
+					  hashval_t, enum insert_option);
+extern void	htab_clear_slot	(htab_t, void **);
+extern void	htab_remove_elt	(htab_t, void *);
+extern void	htab_remove_elt_with_hash (htab_t, void *, hashval_t);
+
+extern void	htab_traverse (htab_t, htab_trav, void *);
+extern void	htab_traverse_noresize (htab_t, htab_trav, void *);
+
+extern size_t	htab_size (htab_t);
+extern size_t	htab_elements (htab_t);
+extern double	htab_collisions	(htab_t);
+
+/* A hash function for pointers.  */
+extern htab_hash htab_hash_pointer;
+
+/* An equality function for pointers.  */
+extern htab_eq htab_eq_pointer;
+
+/* A hash function for null-terminated strings.  */
+extern hashval_t htab_hash_string (const void *);
+
+/* An iterative hash function for arbitrary data.  */
+extern hashval_t iterative_hash (const void *, size_t, hashval_t);
+/* Shorthand for hashing something with an intrinsic size.  */
+#define iterative_hash_object(OB,INIT) iterative_hash (&OB, sizeof (OB), INIT)
+
+#ifdef __cplusplus
+}
+#endif /* __cplusplus */
+
+#endif /* __HASHTAB_H__ */
diff --git a/utils/gapy/gen-debug-info-src/ext/hp-symtab.h b/utils/gapy/gen-debug-info-src/ext/hp-symtab.h
new file mode 100644
index 000000000..e944e9091
--- /dev/null
+++ b/utils/gapy/gen-debug-info-src/ext/hp-symtab.h
@@ -0,0 +1,1867 @@
+/* Definitions and structures for reading debug symbols from the
+   native HP C compiler.
+
+   Written by the Center for Software Science at the University of Utah
+   and by Cygnus Support.
+
+   Copyright 1994, 1995, 1998, 1999, 2003 Free Software Foundation, Inc.
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston,
+   MA 02110-1301, USA.  */
+
+#ifndef HP_SYMTAB_INCLUDED
+#define HP_SYMTAB_INCLUDED
+
+/* General information:
+
+   This header file defines and describes only the data structures
+   necessary to read debug symbols produced by the HP C compiler,
+   HP ANSI C++ compiler, and HP FORTRAN 90 compiler using the
+   SOM object file format.  
+   (For a full description of the debug format, ftp hpux-symtab.h from
+   jaguar.cs.utah.edu:/dist).
+   
+   Additional notes (Rich Title)
+   This file is a reverse-engineered version of a file called
+   "symtab.h" which exists internal to HP's Computer Languages Organization
+   in /CLO/Components/DDE/obj/som/symtab.h. Because HP's version of
+   the file is copyrighted and not distributed, it is necessary for
+   GDB to use the reverse-engineered version that follows.
+   Work was done by Cygnus to reverse-engineer the C subset of symtab.h.
+   The WDB project has extended this to also contain the C++ 
+   symbol definitions, the F90 symbol definitions, 
+   and the DOC (debugging-optimized-code) symbol definitions.
+   In some cases (the C++ symbol definitions)
+   I have added internal documentation here that
+   goes beyond what is supplied in HP's symtab.h. If we someday
+   unify these files again, the extra comments should be merged back
+   into HP's symtab.h.
+  
+   -------------------------------------------------------------------
+
+   Debug symbols are contained entirely within an unloadable space called
+   $DEBUG$.  $DEBUG$ contains several subspaces which group related
+   debug symbols.
+
+   $GNTT$ contains information for global variables, types and constants.
+
+   $LNTT$ contains information for procedures (including nesting), scoping
+   information, local variables, types, and constants.
+
+   $SLT$ contains source line information so that code addresses may be
+   mapped to source lines.
+
+   $VT$ contains various strings and constants for named objects (variables,
+   typedefs, functions, etc).  Strings are stored as null-terminated character
+   lists.  Constants always begin on word boundaries.  The first byte of
+   the VT must be zero (a null string).
+
+   $XT$ is not currently used by GDB.
+
+   Many structures within the subspaces point to other structures within
+   the same subspace, or to structures within a different subspace.  These
+   pointers are represented as a structure index from the beginning of
+   the appropriate subspace.  */
+
+/* Used to describe where a constant is stored.  */
+enum location_type
+{
+  LOCATION_IMMEDIATE,	/* 0 */
+  LOCATION_PTR,	/* 1 */
+  LOCATION_VT,	/* 2; presumably "value lives in the VT" -- confirm against the reader */
+};
+
+/* Languages supported by this debug format.  Within the data structures
+   this type is limited to 4 bits for a maximum of 16 languages.  */
+enum hp_language
+{
+  HP_LANGUAGE_UNKNOWN,	/* 0 */
+  HP_LANGUAGE_C,	/* 1 */
+  HP_LANGUAGE_FORTRAN,	/* 2 */
+  HP_LANGUAGE_F77 = HP_LANGUAGE_FORTRAN,	/* 2: alias of FORTRAN, so numbering continues at 3 */
+  HP_LANGUAGE_PASCAL,	/* 3 */
+  HP_LANGUAGE_MODCAL,	/* 4 */
+  HP_LANGUAGE_COBOL,	/* 5 */
+  HP_LANGUAGE_BASIC,	/* 6 */
+  HP_LANGUAGE_ADA,	/* 7 */
+  HP_LANGUAGE_CPLUSPLUS,	/* 8 */
+  HP_LANGUAGE_DMPASCAL	/* 9 */
+};
+
+
+/* Basic data types available in this debug format.  Within the data
+   structures this type is limited to 5 bits for a maximum of 32 basic
+   data types.  */
+enum hp_type
+{
+  HP_TYPE_UNDEFINED, /* 0 */
+  HP_TYPE_BOOLEAN, /* 1 */
+  HP_TYPE_CHAR, /* 2 */
+  HP_TYPE_INT, /* 3 */
+  HP_TYPE_UNSIGNED_INT, /* 4 */
+  HP_TYPE_REAL, /* 5 */
+  HP_TYPE_COMPLEX, /* 6 */
+  HP_TYPE_STRING200, /* 7 */
+  HP_TYPE_LONGSTRING200, /* 8 */
+  HP_TYPE_TEXT, /* 9 */
+  HP_TYPE_FLABEL, /* 10 */
+  HP_TYPE_FTN_STRING_SPEC, /* 11 */
+  HP_TYPE_MOD_STRING_SPEC, /* 12 */
+  HP_TYPE_PACKED_DECIMAL, /* 13 */
+  HP_TYPE_REAL_3000, /* 14 */
+  HP_TYPE_MOD_STRING_3000, /* 15 */
+  HP_TYPE_ANYPOINTER, /* 16 */
+  HP_TYPE_GLOBAL_ANYPOINTER, /* 17 */
+  HP_TYPE_LOCAL_ANYPOINTER, /* 18 */
+  HP_TYPE_COMPLEXS3000, /* 19 */
+  HP_TYPE_FTN_STRING_S300_COMPAT, /* 20 */ 
+  HP_TYPE_FTN_STRING_VAX_COMPAT, /* 21 */
+  HP_TYPE_BOOLEAN_S300_COMPAT, /* 22 */
+  HP_TYPE_BOOLEAN_VAX_COMPAT, /* 23 */
+  HP_TYPE_WIDE_CHAR, /* 24 */
+  HP_TYPE_LONG, /* 25 */
+  HP_TYPE_UNSIGNED_LONG, /* 26 */
+  HP_TYPE_DOUBLE, /* 27 */
+  HP_TYPE_TEMPLATE_ARG, /* 28 */
+  HP_TYPE_VOID /* 29: highest value defined here; still fits the 5-bit type field */
+};
+
+/* An immediate name and type table entry.
+
+   extension and immediate will always be one.
+   global will always be zero.
+   hp_type is the basic type this entry describes.
+   bitlength is the length in bits for the basic type.  */
+struct dnttp_immediate
+{
+  unsigned int extension:	1;	/* always 1 */
+  unsigned int immediate:	1;	/* always 1 */
+  unsigned int global:		1;	/* always 0 */
+  unsigned int type: 		5;	/* basic type described (the "hp_type" of the comment above) */
+  unsigned int bitlength:	24;	/* length in bits of the basic type */
+};
+
+/* A nonimmediate name and type table entry.
+
+   extension will always be one.
+   immediate will always be zero.
+   if global is zero, this entry points into the LNTT
+   if global is one, this entry points into the GNTT
+   index is the index within the GNTT or LNTT for this entry.  */
+struct dnttp_nonimmediate
+{
+  unsigned int extension:	1;	/* always 1 */
+  unsigned int immediate:	1;	/* always 0 */
+  unsigned int global:		1;	/* 0: index is into the LNTT; 1: into the GNTT */
+  unsigned int index:		29;	/* index of the entry within that table */
+};
+
+/* A pointer to an entry in the GNTT and LNTT tables.  It has two
+   forms depending on the type being described.
+
+   The immediate form is used for simple entries and is one
+   word.
+
+   The nonimmediate form is used for complex entries and contains
+   an index into the LNTT or GNTT which describes the entire type.
+
+   If a dnttpointer is -1, then it is a NIL entry.  */
+
+#define DNTTNIL (-1)	/* value of a NIL dnttpointer */
+typedef union dnttpointer
+{
+  struct dnttp_immediate    dntti;	/* immediate form: simple entries, one word */
+  struct dnttp_nonimmediate dnttp;	/* nonimmediate form: index into the LNTT or GNTT */
+  int word;	/* the same word as a plain int; -1 (DNTTNIL) means NIL */
+} dnttpointer;
+
+/* An index into the source line table.  As with dnttpointers, a sltpointer
+   of -1 indicates a NIL entry.  */
+#define SLTNIL (-1)
+typedef int sltpointer;
+
+/* Index into DOC (= "Debugging Optimized Code") line table.  */
+#define LTNIL (-1)
+typedef int ltpointer;
+
+/* Index into context table.  */
+#define CTXTNIL (-1)
+typedef int ctxtpointer;
+
+/* Unsigned byte offset into the VT.  */
+typedef unsigned int vtpointer;
+
+/* A DNTT entry (used within the GNTT and LNTT).
+
+   DNTT entries are variable sized objects, but are always a multiple
+   of 3 words (we call each group of 3 words a "block").
+
+   The first bit in each block is an extension bit.  This bit is zero
+   for the first block of a DNTT entry.  If the entry requires more
+   than one block, then this bit is set to one in all blocks after
+   the first one.  */
+
+/* Each DNTT entry describes a particular debug symbol (beginning of
+   a source file, a function, variables, structures, etc.).
+
+   The type of the DNTT entry is stored in the "kind" field within the
+   DNTT entry itself.  */
+
+enum dntt_entry_type
+{
+  DNTT_TYPE_NIL = -1,
+  DNTT_TYPE_SRCFILE,	/* 0; the following kinds count up by one each */
+  DNTT_TYPE_MODULE,
+  DNTT_TYPE_FUNCTION,
+  DNTT_TYPE_ENTRY,
+  DNTT_TYPE_BEGIN,
+  DNTT_TYPE_END,
+  DNTT_TYPE_IMPORT,
+  DNTT_TYPE_LABEL,
+  DNTT_TYPE_FPARAM,
+  DNTT_TYPE_SVAR,
+  DNTT_TYPE_DVAR,
+  DNTT_TYPE_HOLE1,
+  DNTT_TYPE_CONST,
+  DNTT_TYPE_TYPEDEF,
+  DNTT_TYPE_TAGDEF,
+  DNTT_TYPE_POINTER,
+  DNTT_TYPE_ENUM,
+  DNTT_TYPE_MEMENUM,
+  DNTT_TYPE_SET,
+  DNTT_TYPE_SUBRANGE,
+  DNTT_TYPE_ARRAY,
+  DNTT_TYPE_STRUCT,
+  DNTT_TYPE_UNION,
+  DNTT_TYPE_FIELD,
+  DNTT_TYPE_VARIANT,
+  DNTT_TYPE_FILE,
+  DNTT_TYPE_FUNCTYPE,
+  DNTT_TYPE_WITH,
+  DNTT_TYPE_COMMON,
+  DNTT_TYPE_COBSTRUCT,
+  DNTT_TYPE_XREF,
+  DNTT_TYPE_SA,
+  DNTT_TYPE_MACRO,
+  DNTT_TYPE_BLOCKDATA,
+  DNTT_TYPE_CLASS_SCOPE,
+  DNTT_TYPE_REFERENCE,
+  DNTT_TYPE_PTRMEM,
+  DNTT_TYPE_PTRMEMFUNC,
+  DNTT_TYPE_CLASS,
+  DNTT_TYPE_GENFIELD,
+  DNTT_TYPE_VFUNC,
+  DNTT_TYPE_MEMACCESS,
+  DNTT_TYPE_INHERITANCE,
+  DNTT_TYPE_FRIEND_CLASS,
+  DNTT_TYPE_FRIEND_FUNC,
+  DNTT_TYPE_MODIFIER,
+  DNTT_TYPE_OBJECT_ID,
+  DNTT_TYPE_MEMFUNC,
+  DNTT_TYPE_TEMPLATE,
+  DNTT_TYPE_TEMPLATE_ARG,
+  DNTT_TYPE_FUNC_TEMPLATE,
+  DNTT_TYPE_LINK,
+  DNTT_TYPE_DYN_ARRAY_DESC,
+  DNTT_TYPE_DESC_SUBRANGE,
+  DNTT_TYPE_BEGIN_EXT,
+  DNTT_TYPE_INLN,
+  DNTT_TYPE_INLN_LIST,
+  DNTT_TYPE_ALIAS,
+  DNTT_TYPE_DOC_FUNCTION,
+  DNTT_TYPE_DOC_MEMFUNC,
+  DNTT_TYPE_MAX	/* 60; one past the last kind listed above */
+};
+
+/* DNTT_TYPE_SRCFILE:
+
+   One DNTT_TYPE_SRCFILE symbol is output for the start of each source
+   file and at the begin and end of an included file.  A DNTT_TYPE_SRCFILE
+   entry is also output before each DNTT_TYPE_FUNCTION symbol so that debuggers
+   can determine what file a function was defined in.
+
+   LANGUAGE describes the source file's language.
+
+   NAME points to a VT entry providing the source file's name.
+
+   Note the name used for DNTT_TYPE_SRCFILE entries are exactly as seen
+   by the compiler (ie they may be relative or absolute).  C include files
+   via <> inclusion must use absolute paths.
+
+   ADDRESS points to an SLT entry from which line number and code locations
+   may be determined.  */
+
+struct dntt_type_srcfile
+{
+  unsigned int extension:	1;	/* 0 in the first block of an entry, 1 in later blocks */
+  unsigned int kind:		10;    /* DNTT_TYPE_SRCFILE */
+  unsigned int language:	4;	/* source file's language */
+  unsigned int unused:		17;
+  vtpointer name;	/* VT entry holding the source file's name */
+  sltpointer address;	/* SLT entry giving line number and code locations */
+};
+
+/* DNTT_TYPE_MODULE:
+
+   A DNTT_TYPE_MODULE symbol is emitted for the start of a pascal
+   module or C source file. A module indicates a compilation unit
+   for name-scoping purposes; in that regard there should be 
+   a 1-1 correspondence between GDB "symtab"'s and MODULE symbol records.
+
+   Each DNTT_TYPE_MODULE must have an associated DNTT_TYPE_END symbol.
+
+   NAME points to a VT entry providing the module's name.  Note C
+   source files are considered nameless modules.
+
+   ALIAS points to a VT entry providing a secondary name.
+
+   ADDRESS points to an SLT entry from which line number and code locations
+   may be determined.  */
+
+struct dntt_type_module
+{
+  unsigned int extension:	1;
+  unsigned int kind:		10; 	/* DNTT_TYPE_MODULE */
+  unsigned int unused:		21;
+  vtpointer name;	/* VT entry holding the module's name */
+  vtpointer alias;	/* VT entry holding a secondary name */
+  dnttpointer unused2;
+  sltpointer address;	/* SLT entry giving line number and code locations */
+};
+
+/* DNTT_TYPE_FUNCTION,
+   DNTT_TYPE_ENTRY,
+   DNTT_TYPE_BLOCKDATA,
+   DNTT_TYPE_MEMFUNC:
+
+   A DNTT_TYPE_FUNCTION symbol is emitted for each function definition;
+   a DNTT_TYPE_ENTRY symbol is used for secondary entry points.  Both
+   symbols use the dntt_type_function structure.
+   A DNTT_TYPE_BLOCKDATA symbol is emitted ...?
+   A DNTT_TYPE_MEMFUNC symbol is emitted for inlined member functions (C++). 
+
+   Each DNTT_TYPE_FUNCTION must have a matching DNTT_TYPE_END.
+
+   GLOBAL is nonzero if the function has global scope.
+
+   LANGUAGE describes the function's source language.
+
+   OPT_LEVEL describes the optimization level the function was compiled
+   with.
+
+   VARARGS is nonzero if the function uses varargs.
+
+   NAME points to a VT entry providing the function's name.
+
+   ALIAS points to a VT entry providing a secondary name for the function.
+
+   FIRSTPARAM points to a LNTT entry which describes the parameter list.
+
+   ADDRESS points to an SLT entry from which line number and code locations
+   may be determined.
+
+   ENTRYADDR is the memory address corresponding to the function's entry point.
+
+   RETVAL points to a LNTT entry describing the function's return value.
+
+   LOWADDR is the lowest memory address associated with this function.
+
+   HIADDR is the highest memory address associated with this function.  */
+
+struct dntt_type_function
+{
+  unsigned int extension:	1;
+  unsigned int kind:		10;	/* DNTT_TYPE_FUNCTION,
+				           DNTT_TYPE_ENTRY,
+					   DNTT_TYPE_BLOCKDATA
+					   or DNTT_TYPE_MEMFUNC */
+  unsigned int global:		1;	/* nonzero if the function has global scope */
+  unsigned int language:	4;	/* function's source language */
+  unsigned int nest_level:	5;
+  unsigned int opt_level:	2;	/* optimization level the function was compiled with */
+  unsigned int varargs:		1;	/* nonzero if the function uses varargs */
+  unsigned int lang_info:	4;
+  unsigned int inlined:		1;
+  unsigned int localalloc:	1;
+  unsigned int expansion:	1;
+  unsigned int unused:		1;
+  vtpointer name;	/* VT entry holding the function's name */
+  vtpointer alias;	/* VT entry holding a secondary name */
+  dnttpointer firstparam;	/* LNTT entry describing the parameter list */
+  sltpointer address;	/* SLT entry giving line number and code locations */
+  CORE_ADDR entryaddr;	/* memory address of the function's entry point */
+  dnttpointer retval;	/* LNTT entry describing the return value */
+  CORE_ADDR lowaddr;	/* lowest memory address associated with the function */
+  CORE_ADDR hiaddr;	/* highest memory address associated with the function */
+};
+
+/* DNTT_TYPE_BEGIN:
+
+   A DNTT_TYPE_BEGIN symbol is emitted to begin a new nested scope.
+   Every DNTT_TYPE_BEGIN symbol must have a matching DNTT_TYPE_END symbol.
+
+   CLASSFLAG is nonzero if this is the beginning of a c++ class definition.
+
+   ADDRESS points to an SLT entry from which line number and code locations
+   may be determined.  */
+
+struct dntt_type_begin
+{
+  unsigned int extension:	1;
+  unsigned int kind:		10;	/* DNTT_TYPE_BEGIN */
+  unsigned int classflag:	1;	/* nonzero at the beginning of a C++ class definition */
+  unsigned int unused:		20;
+  sltpointer address;	/* SLT entry giving line number and code locations */
+};
+
+/* DNTT_TYPE_END:
+
+   A DNTT_TYPE_END symbol is emitted when closing a scope started by
+   a DNTT_TYPE_MODULE, DNTT_TYPE_FUNCTION, DNTT_TYPE_WITH,
+   DNTT_TYPE_COMMON, DNTT_TYPE_BEGIN, and DNTT_TYPE_CLASS_SCOPE symbols.
+
+   ENDKIND describes what type of scope the DNTT_TYPE_END is closing
+   (one of the above 6 kinds).
+
+   CLASSFLAG is nonzero if this is the end of a c++ class definition.
+
+   ADDRESS points to an SLT entry from which line number and code locations
+   may be determined.
+
+   BEGINSCOPE points to the LNTT entry which opened the scope.  */
+
+struct dntt_type_end
+{
+  unsigned int extension:	1;
+  unsigned int kind:		10;	/* DNTT_TYPE_END */
+  unsigned int endkind:		10;	/* kind of scope being closed */
+  unsigned int classflag:	1;	/* nonzero at the end of a C++ class definition */
+  unsigned int unused:		10;
+  sltpointer address;	/* SLT entry giving line number and code locations */
+  dnttpointer beginscope;	/* LNTT entry which opened the scope */
+};
+
+/* DNTT_TYPE_IMPORT is unused by GDB.  */
+/* DNTT_TYPE_LABEL is unused by GDB.  */
+
+/* DNTT_TYPE_FPARAM:
+
+   A DNTT_TYPE_FPARAM symbol is emitted for a function argument.  When
+   chained together the symbols represent an argument list for a function.
+
+   REGPARAM is nonzero if this parameter was passed in a register.
+
+   INDIRECT is nonzero if this parameter is a pointer to the parameter
+   (pass by reference or pass by value for large items).
+
+   LONGADDR is nonzero if the parameter is a 64bit pointer.
+
+   NAME is a pointer into the VT for the parameter's name.
+
+   LOCATION describes where the parameter is stored.  Depending on the
+   parameter type LOCATION could be a register number, or an offset
+   from the stack pointer.
+
+   TYPE points to a NTT entry describing the type of this parameter.
+
+   NEXTPARAM points to the LNTT entry describing the next parameter.  */
+
+struct dntt_type_fparam
+{
+  unsigned int extension:	1;
+  unsigned int kind:		10;	/* DNTT_TYPE_FPARAM */
+  unsigned int regparam:	1;	/* nonzero if passed in a register */
+  unsigned int indirect:	1;	/* nonzero if this is a pointer to the parameter */
+  unsigned int longaddr:	1;	/* nonzero if the parameter is a 64bit pointer */
+  unsigned int copyparam:	1;
+  unsigned int dflt:		1;
+  unsigned int doc_ranges:	1;
+  unsigned int misc_kind:       1;
+  unsigned int unused:		14;
+  vtpointer name;	/* VT pointer to the parameter's name */
+  CORE_ADDR location;	/* register number or stack-pointer offset, depending on the parameter */
+  dnttpointer type;	/* NTT entry describing the parameter's type */
+  dnttpointer nextparam;	/* LNTT entry describing the next parameter */
+  int misc;
+};
+
+/* DNTT_TYPE_SVAR:
+
+   A DNTT_TYPE_SVAR is emitted to describe a variable in static storage.
+
+   GLOBAL is nonzero if the variable has global scope.
+
+   INDIRECT is nonzero if the variable is a pointer to an object.
+
+   LONGADDR is nonzero if the variable is in long pointer space.
+
+   STATICMEM is nonzero if the variable is a member of a class.
+
+   A_UNION is nonzero if the variable is an anonymous union member.
+
+   NAME is a pointer into the VT for the variable's name.
+
+   LOCATION provides the memory address for the variable.
+
+   TYPE is a pointer into either the GNTT or LNTT which describes
+   the type of this variable.  */
+
+struct dntt_type_svar
+{
+  unsigned int extension:	1;
+  unsigned int kind:		10;	/* DNTT_TYPE_SVAR */
+  unsigned int global:		1;	/* nonzero if the variable has global scope */
+  unsigned int indirect:	1;	/* nonzero if the variable is a pointer to an object */
+  unsigned int longaddr:	1;	/* nonzero if the variable is in long pointer space */
+  unsigned int staticmem:	1;	/* nonzero if the variable is a member of a class */
+  unsigned int a_union:		1;	/* nonzero if the variable is an anonymous union member */
+  unsigned int unused1:         1;
+  unsigned int thread_specific: 1;
+  unsigned int unused2:         14;
+  vtpointer name;	/* VT pointer to the variable's name */
+  CORE_ADDR location;	/* memory address of the variable */
+  dnttpointer type;	/* GNTT or LNTT entry describing the variable's type */
+  unsigned int offset;
+  unsigned int displacement;
+};
+
+/* DNTT_TYPE_DVAR:
+
+   A DNTT_TYPE_DVAR is emitted to describe automatic variables and variables
+   held in registers.
+
+   GLOBAL is nonzero if the variable has global scope.
+
+   INDIRECT is nonzero if the variable is a pointer to an object.
+
+   REGVAR is nonzero if the variable is in a register.
+
+   A_UNION is nonzero if the variable is an anonymous union member.
+
+   NAME is a pointer into the VT for the variable's name.
+
+   LOCATION provides the memory address or register number for the variable.
+
+   TYPE is a pointer into either the GNTT or LNTT which describes
+   the type of this variable.  */
+
+struct dntt_type_dvar
+{
+  unsigned int extension:	1;
+  unsigned int kind:		10;	/* DNTT_TYPE_DVAR */
+  unsigned int global:		1;	/* nonzero if the variable has global scope */
+  unsigned int indirect:	1;	/* nonzero if the variable is a pointer to an object */
+  unsigned int regvar:		1;	/* nonzero if the variable is in a register */
+  unsigned int a_union:		1;	/* nonzero if the variable is an anonymous union member */
+  unsigned int unused:		17;
+  vtpointer name;	/* VT pointer to the variable's name */
+  int location;	/* memory address or register number of the variable */
+  dnttpointer type;	/* GNTT or LNTT entry describing the variable's type */
+  unsigned int offset;
+};
+
+/* DNTT_TYPE_CONST:
+
+   A DNTT_TYPE_CONST symbol is emitted for program constants.
+
+   GLOBAL is nonzero if the constant has global scope.
+
+   INDIRECT is nonzero if the constant is a pointer to an object.
+
+   LOCATION_TYPE describes where to find the constant's value
+   (in the VT, memory, or embedded in an instruction).
+
+   CLASSMEM is nonzero if the constant is a member of a class.
+
+   NAME is a pointer into the VT for the constant's name.
+
+   LOCATION provides the memory address, register number or pointer
+   into the VT for the constant's value.
+
+   TYPE is a pointer into either the GNTT or LNTT which describes
+   the type of this constant.  */
+
+struct dntt_type_const
+{
+  unsigned int extension:	1;
+  unsigned int kind:		10;	/* DNTT_TYPE_CONST */
+  unsigned int global:		1;	/* nonzero if the constant has global scope */
+  unsigned int indirect:	1;	/* nonzero if the constant is a pointer to an object */
+  unsigned int location_type:	3;	/* where to find the value; presumably an enum location_type -- confirm */
+  unsigned int classmem:	1;	/* nonzero if the constant is a member of a class */
+  unsigned int unused:		15;
+  vtpointer name;	/* VT pointer to the constant's name */
+  CORE_ADDR location;	/* memory address, register number or VT pointer for the value */
+  dnttpointer type;	/* GNTT or LNTT entry describing the type */
+  unsigned int offset;
+  unsigned int displacement;
+};
+
+/* DNTT_TYPE_TYPEDEF and DNTT_TYPE_TAGDEF:
+
+   The same structure is used to describe typedefs and tagdefs.
+
+   DNTT_TYPE_TYPEDEFS are associated with C "typedefs".
+
+   DNTT_TYPE_TAGDEFs are associated with C "struct", "union", and "enum"
+   tags, which may have the same name as a typedef in the same scope.
+   Also they are associated with C++ "class" tags, which implicitly have 
+   the same name as the class type.
+
+   GLOBAL is nonzero if the typedef/tagdef has global scope.
+
+   TYPEINFO is used to determine if full type information is available
+   for a tag.  (usually 1, but can be zero for opaque types in C).
+
+   NAME is a pointer into the VT for the typedef/tagdef's name.
+
+   TYPE points to the underlying type for the typedef/tagdef in the
+   GNTT or LNTT.  */
+
+struct dntt_type_type
+{
+  unsigned int extension:	1;
+  unsigned int kind:		10;    /* DNTT_TYPE_TYPEDEF or 
+                                          DNTT_TYPE_TAGDEF.  */
+  unsigned int global:		1;	/* nonzero if the typedef/tagdef has global scope */
+  unsigned int typeinfo:	1;	/* usually 1; can be 0 for opaque types in C */
+  unsigned int unused:		19;
+  vtpointer name;	/* name, as a pointer into the VT */
+  dnttpointer type;                    /* Underlying type, which for TAGDEF's may be
+                                          DNTT_TYPE_STRUCT, DNTT_TYPE_UNION,
+                                          DNTT_TYPE_ENUM, or DNTT_TYPE_CLASS. 
+                                          For TYPEDEF's other underlying types
+                                          are also possible.  */
+};
+
+/* DNTT_TYPE_POINTER:
+
+   Used to describe a pointer to an underlying type.
+
+   POINTSTO is a pointer into the GNTT or LNTT for the type which this
+   pointer points to.
+
+   BITLENGTH is the length of the pointer (not the underlying type). */
+
+struct dntt_type_pointer
+{
+  unsigned int extension:	1;
+  unsigned int kind:		10;	/* DNTT_TYPE_POINTER */
+  unsigned int unused:		21;
+  dnttpointer pointsto;	/* GNTT or LNTT entry for the pointed-to type */
+  unsigned int bitlength;	/* length of the pointer itself, not of the pointed-to type */
+};
+
+
+/* DNTT_TYPE_ENUM:
+
+   Used to describe enumerated types.
+
+   FIRSTMEM is a pointer to a DNTT_TYPE_MEMENUM in the GNTT/LNTT which
+   describes the first member (and contains a pointer to the chain of
+   members).
+
+   BITLENGTH is the number of bits used to hold the values of the enum's
+   members.  */
+
+struct dntt_type_enum
+{
+  unsigned int extension:	1;
+  unsigned int kind:	10;	/* DNTT_TYPE_ENUM */
+  unsigned int unused:		21;
+  dnttpointer firstmem;	/* first DNTT_TYPE_MEMENUM of the member chain */
+  unsigned int bitlength;	/* bits used to hold the values of the enum's members */
+};
+
+/* DNTT_TYPE_MEMENUM
+
+   Used to describe members of an enumerated type.
+
+   CLASSMEM is nonzero if this member is part of a class.
+
+   NAME points into the VT for the name of this member.
+
+   VALUE is the value of this enumeration member.
+
+   NEXTMEM points to the next DNTT_TYPE_MEMENUM in the chain.  */
+
+struct dntt_type_memenum
+{
+  unsigned int extension:	1;
+  unsigned int kind:	10;	/* DNTT_TYPE_MEMENUM */
+  unsigned int classmem:	1;	/* nonzero if this member is part of a class */
+  unsigned int unused:		20;
+  vtpointer name;	/* VT pointer to the member's name */
+  unsigned int value;	/* value of this enumeration member */
+  dnttpointer nextmem;	/* next DNTT_TYPE_MEMENUM in the chain */
+};
+
+/* DNTT_TYPE_SET
+
+   Used to describe PASCAL "set" type.
+
+   DECLARATION describes the bitpacking of the set.
+
+   SUBTYPE points to a DNTT entry describing the type of the members.
+
+   BITLENGTH is the size of the set.  */ 
+
+struct dntt_type_set
+{
+  unsigned int extension:	1;
+  unsigned int kind:	10;	/* DNTT_TYPE_SET */
+  unsigned int declaration:	2;	/* bitpacking of the set */
+  unsigned int unused:		19;
+  dnttpointer subtype;	/* DNTT entry describing the type of the members */
+  unsigned int bitlength;	/* size of the set */
+};
+
+/* DNTT_TYPE_SUBRANGE
+
+   Used to describe subrange type.
+
+   DYN_LOW describes the lower bound of the subrange:
+
+     00 for a constant lower bound (found in LOWBOUND).
+
+     01 for a dynamic lower bound with the lower bound found in the
+     memory address pointed to by LOWBOUND.
+
+     10 for a dynamic lower bound described by a variable found in the
+     DNTT/LNTT (LOWBOUND would be a pointer into the DNTT/LNTT).
+
+   DYN_HIGH is similar to DYN_LOW, except it describes the upper bound.
+
+   SUBTYPE points to the type of the subrange.
+
+   BITLENGTH is the length in bits needed to describe the subrange's
+   values.  */
+
+struct dntt_type_subrange
+{
+  unsigned int extension:	1;
+  unsigned int kind:	10;	/* DNTT_TYPE_SUBRANGE */
+  unsigned int dyn_low:		2;	/* how to interpret lowbound: 00 constant, 01 memory address, 10 DNTT/LNTT pointer */
+  unsigned int dyn_high:	2;	/* same encoding as dyn_low, for highbound */
+  unsigned int unused:		17;
+  int lowbound;	/* lower bound, interpreted according to dyn_low */
+  int highbound;	/* upper bound, interpreted according to dyn_high */
+  dnttpointer subtype;	/* type of the subrange */
+  unsigned int bitlength;	/* bits needed to describe the subrange's values */
+};
+
+/* DNTT_TYPE_ARRAY
+
+   Used to describe an array type.
+
+   DECLARATION describes the bit packing used in the array.
+
+   ARRAYISBYTES is nonzero if the field in arraylength describes the
+   length in bytes rather than in bits.  A value of zero is used to
+   describe an array with size 2**32.
+
+   ELEMISBYTES is nonzero if the length of each element in the array
+   is described in bytes rather than bits.  A value of zero is used
+   to describe an element with size 2**32.
+
+   ELEMORDER is nonzero if the elements are indexed in increasing order.
+
+   JUSTIFIED if the elements are left justified to index zero.
+
+   ARRAYLENGTH is the length of the array.
+
+   INDEXTYPE is a DNTT pointer to the type used to index the array.
+
+   ELEMTYPE is a DNTT pointer to the type for the array elements.
+
+   ELEMLENGTH is the length of each element in the array (including
+   any padding).
+
+   Multi-dimensional arrays are represented by ELEMTYPE pointing to
+   another DNTT_TYPE_ARRAY.  */
+
+struct dntt_type_array
+{
+  unsigned int extension:	1;
+  unsigned int kind:	10;	/* DNTT_TYPE_ARRAY */
+  unsigned int declaration:	2;	/* bit packing used in the array */
+  unsigned int dyn_low:		2;
+  unsigned int dyn_high:	2;
+  unsigned int arrayisbytes:	1;	/* nonzero if arraylength is in bytes rather than bits */
+  unsigned int elemisbytes:	1;	/* nonzero if elemlength is in bytes rather than bits */
+  unsigned int elemorder:	1;	/* nonzero if elements are indexed in increasing order */
+  unsigned int justified:	1;	/* set if elements are left justified to index zero */
+  unsigned int unused:		11;
+  unsigned int arraylength;	/* length of the array */
+  dnttpointer indextype;	/* type used to index the array */
+  dnttpointer elemtype;	/* element type; another DNTT_TYPE_ARRAY for multi-dimensional arrays */
+  unsigned int elemlength;	/* length of each element, including any padding */
+};
+
+/* DNTT_TYPE_STRUCT
+
+   DNTT_TYPE_STRUCT is used to describe a C structure.
+
+   DECLARATION describes the bitpacking used.
+
+   FIRSTFIELD is a DNTT pointer to the first field of the structure
+   (each field contains a pointer to the next field, walk the list
+   to access all fields of the structure).
+
+   VARTAGFIELD and VARLIST are used for Pascal variant records.
+
+   BITLENGTH is the size of the structure in bits.  */
+
+struct dntt_type_struct
+{
+  unsigned int extension:	1;
+  unsigned int kind:	10;
+  unsigned int declaration:	2;	/* Bit packing used.  */
+  unsigned int unused:		19;	/* Pad: bit-field widths total 32.  */
+  dnttpointer firstfield;	/* First field; each field points to the next.  */
+  dnttpointer vartagfield;	/* Used for Pascal variant records.  */
+  dnttpointer varlist;		/* Used for Pascal variant records.  */
+  unsigned int bitlength;	/* Size of the structure in bits.  */
+};
+
+/* DNTT_TYPE_UNION
+
+   DNTT_TYPE_UNION is used to describe a C union.
+
+   FIRSTFIELD is a DNTT pointer to the beginning of the field chain.
+
+   BITLENGTH is the size of the union in bits.  */
+
+struct dntt_type_union
+{
+  unsigned int extension:	1;
+  unsigned int kind:	10;
+  unsigned int unused:		21;	/* Pad: bit-field widths total 32.  */
+  dnttpointer firstfield;	/* Beginning of the field chain.  */
+  unsigned int bitlength;	/* Size of the union in bits.  */
+};
+
+/* DNTT_TYPE_FIELD
+
+   DNTT_TYPE_FIELD describes one field in a structure or union
+   or C++ class.
+
+   VISIBILITY is used to describe the visibility of the field
+   (for c++.  public = 0, protected = 1, private = 2).
+
+   A_UNION is nonzero if this field is a member of an anonymous union.
+
+   STATICMEM is nonzero if this field is a static member of a template.
+
+   NAME is a pointer into the VT for the name of the field.
+
+   BITOFFSET gives the offset of this field in bits from the beginning
+   of the structure or union this field is a member of.
+
+   TYPE is a DNTT pointer to the type describing this field.
+
+   BITLENGTH is the size of the entry in bits.
+
+   NEXTFIELD is a DNTT pointer to the next field in the chain.  */
+
+struct dntt_type_field
+{
+  unsigned int extension:	1;
+  unsigned int kind:	10;
+  unsigned int visibility:	2;	/* C++: public = 0, protected = 1, private = 2.  */
+  unsigned int a_union:		1;	/* Nonzero => member of an anonymous union.  */
+  unsigned int staticmem:	1;	/* Nonzero => static member of a template.  */
+  unsigned int unused:		17;	/* Pad: bit-field widths total 32.  */
+  vtpointer name;		/* Field name (pointer into the VT).  */
+  unsigned int bitoffset;	/* Offset in bits from the start of the struct/union.  */
+  dnttpointer type;		/* Type describing this field.  */
+  unsigned int bitlength;	/* Size of the entry in bits.  */
+  dnttpointer nextfield;	/* Next field in the chain.  */
+};
+
+/* DNTT_TYPE_VARIANT is unused by GDB.  */
+/* DNTT_TYPE_FILE is unused by GDB.  */
+
+/* DNTT_TYPE_FUNCTYPE
+
+   I think this is used to describe a function type (e.g., would
+   be emitted as part of a function-pointer description).
+
+   VARARGS is nonzero if this function uses varargs.
+
+   FIRSTPARAM is a DNTT pointer to the first entry in the parameter
+   chain.
+
+   RETVAL is a DNTT pointer to the type of the return value.  */
+
+struct dntt_type_functype
+{
+  unsigned int extension:	1;
+  unsigned int kind:		10;
+  unsigned int varargs:		1;	/* Nonzero => function uses varargs.  */
+  unsigned int info:		4;
+  unsigned int unused:		16;	/* Pad: bit-field widths total 32.  */
+  unsigned int bitlength;
+  dnttpointer firstparam;	/* First entry in the parameter chain.  */
+  dnttpointer retval;		/* Type of the return value.  */
+};
+
+/* DNTT_TYPE_WITH is emitted by C++ to indicate "with" scoping semantics.
+   (Probably also emitted by PASCAL to support "with"...).
+   
+   C++ example: Say "memfunc" is a method of class "c", and say
+   "m" is a data member of class "c". Then from within "memfunc",
+   it is legal to reference "m" directly (e.g. you don't have to
+   say "this->m"). The symbol table indicates
+   this by emitting a DNTT_TYPE_WITH symbol within the function "memfunc",
+   pointing to the type symbol for class "c".
+ 
+   In GDB, this symbol record is unnecessary, 
+   because GDB's symbol lookup algorithm
+   infers the "with" semantics when it sees a "this" argument to the member
+   function. So GDB can safely ignore the DNTT_TYPE_WITH record.
+
+   A DNTT_TYPE_WITH has a matching DNTT_TYPE_END symbol.  */
+
+struct dntt_type_with
+{
+  unsigned int extension:	1;    /* always zero */
+  unsigned int kind:		10;   /* always DNTT_TYPE_WITH */
+  unsigned int addrtype:  	2;    /* 0 => STATTYPE                */
+                                      /* 1 => DYNTYPE                 */
+                                      /* 2 => REGTYPE                 */
+  unsigned int indirect: 	1;    /* 1 => pointer to object       */
+  unsigned int longaddr:  	1;    /* 1 => in long pointer space   */
+  unsigned int nestlevel: 	6;    /* # of nesting levels back     */
+  unsigned int doc_ranges: 	1;    /* 1 => location is range list  */
+  unsigned int unused:   	10;   /* pad: bit-field widths total 32 */
+  long location;       		      /* where stored (allocated)     */
+  sltpointer address;                 /* NOTE(review): presumably the SLT entry for this record -- confirm */
+  dnttpointer type;                   /* type of with expression      */
+  vtpointer name;                     /* name of with expression      */
+  unsigned long  offset;              /* byte offset from location    */
+};                                   
+
+/* DNTT_TYPE_COMMON is unsupported by GDB.  */
+/* A DNTT_TYPE_COMMON symbol must have a matching DNTT_TYPE_END symbol */
+
+/* DNTT_TYPE_COBSTRUCT is unsupported by GDB.  */
+/* DNTT_TYPE_XREF is unsupported by GDB.  */
+/* DNTT_TYPE_SA is unsupported by GDB.  */
+/* DNTT_TYPE_MACRO is unsupported by GDB */
+
+/* DNTT_TYPE_BLOCKDATA has the same structure as DNTT_TYPE_FUNCTION */
+
+/* The following are the C++ specific SOM records */
+
+/*  The purpose of the DNTT_TYPE_CLASS_SCOPE is to bracket C++ methods
+    and indicate the method name belongs in the "class scope" rather
+    than in the module they are being defined in. For example:
+
+    class c {
+    ...
+    void memfunc(); // member function
+    };
+
+    void c::memfunc()   // definition of class c's "memfunc"
+    {
+    ...
+    }
+
+    main()
+    {
+    ...
+    }
+
+    In the above, the name "memfunc" is not directly visible from "main".
+    I.e., you have to say "break c::memfunc".
+    If it were a normal function (not a method), it would be visible
+    via the simple "break memfunc". Since "memfunc" otherwise looks
+    like a normal FUNCTION in the symbol table, the bracketing
+    CLASS_SCOPE is what is used to indicate it is really a method.
+    
+
+   A DNTT_TYPE_CLASS_SCOPE symbol must have a matching DNTT_TYPE_END symbol.  */
+
+struct dntt_type_class_scope
+{
+  unsigned int extension:   1;	   /* Always zero.  */
+  unsigned int kind:       10;     /* Always DNTT_TYPE_CLASS_SCOPE.  */
+  unsigned int unused:     21;     /* Pad: bit-field widths total 32.  */
+  sltpointer address         ;     /* Pointer to SLT entry.  */
+  dnttpointer type           ;     /* Pointer to class type DNTT.  */
+};
+
+/* C++ reference parameter.
+   The structure of this record is the same as DNTT_TYPE_POINTER - 
+   refer to struct dntt_type_pointer.  */
+
+/* The next two describe C++ pointer-to-data-member type, and 
+   pointer-to-member-function type, respectively.
+   DNTT_TYPE_PTRMEM and DNTT_TYPE_PTRMEMFUNC have the same structure.  */
+
+struct dntt_type_ptrmem
+{
+  unsigned int extension:   1;	   /* Always zero.  */
+  unsigned int kind:       10;     /* Always DNTT_TYPE_PTRMEM.  */
+  unsigned int unused:	   21;     /* Pad: bit-field widths total 32.  */
+  dnttpointer pointsto	     ;     /* Pointer to class DNTT.  */
+  dnttpointer memtype 	     ;     /* Type of member.  */
+};
+
+struct dntt_type_ptrmemfunc
+{				   /* Field-for-field the same as struct dntt_type_ptrmem.  */
+  unsigned int extension:   1;	   /* Always zero.  */
+  unsigned int kind:       10;     /* Always DNTT_TYPE_PTRMEMFUNC.  */
+  unsigned int unused:	   21;     /* Pad: bit-field widths total 32.  */
+  dnttpointer pointsto	     ;     /* Pointer to class DNTT.  */
+  dnttpointer memtype 	     ;     /* Type of member.  */
+};
+
+/* The DNTT_TYPE_CLASS symbol is emitted to describe a class type.
+   "memberlist" points to a chained list of FIELD or GENFIELD records
+   indicating the class members. "parentlist" points to a chained list
+   of INHERITANCE records indicating classes from which we inherit
+   fields.  */
+
+struct dntt_type_class 
+{
+  unsigned int extension:   1;     /* Always zero.  */
+  unsigned int kind:       10;     /* Always DNTT_TYPE_CLASS.  */
+  unsigned int abstract:    1;     /* Is this an abstract class?  */
+  unsigned int class_decl:  2;     /* 0=class,1=union,2=struct.  */
+  unsigned int expansion:   1;     /* 1=template expansion.  */
+  unsigned int unused:     17;     /* Pad: bit-field widths total 32.  */
+  dnttpointer memberlist     ;     /* Ptr to chain of [GEN]FIELDs.  */
+  unsigned long vtbl_loc     ;     /* Offset in obj of ptr to vtbl.  */
+  dnttpointer parentlist     ;     /* Ptr to K_INHERITANCE list.  */
+  unsigned long bitlength    ;     /* Total at this level.  */
+  dnttpointer identlist      ;     /* Ptr to chain of class ident's.  */
+  dnttpointer friendlist     ;     /* Ptr to K_FRIEND list.  */
+  dnttpointer templateptr    ;     /* Ptr to template.  */
+  dnttpointer nextexp        ;     /* Ptr to next expansion.  */
+};
+
+/* Class members are indicated via either the FIELD record (for
+   data members, same as for C struct fields), or by the GENFIELD record
+   (for member functions).  */
+
+struct dntt_type_genfield
+{
+  unsigned int extension:   1;	   /* Always zero.  */
+  unsigned int kind:       10;     /* Always DNTT_TYPE_GENFIELD.  */
+  unsigned int visibility:  2;     /* Pub = 0, prot = 1, priv = 2.  */
+  unsigned int a_union:     1;     /* 1 => anonymous union member.  */
+  unsigned int unused:	   18;     /* Pad: bit-field widths total 32.  */
+  dnttpointer field	     ;     /* Pointer to field or qualifier.  */
+  dnttpointer nextfield      ;     /* Pointer to next field.  */
+};
+
+/* C++ virtual functions.  */
+
+struct dntt_type_vfunc
+{
+  unsigned int extension:   1;	   /* always zero */
+  unsigned int kind:       10;     /* always DNTT_TYPE_VFUNC */
+  unsigned int pure:        1;     /* pure virtual function ?       */
+  unsigned int unused:	   20;     /* pad: bit-field widths total 32 */
+  dnttpointer funcptr        ;     /* points to FUNCTION symbol     */
+  unsigned long vtbl_offset  ;     /* offset into vtbl for virtual  */
+};
+
+/* Not precisely sure what this is intended for - DDE ignores it.  */
+
+struct dntt_type_memaccess
+{
+  unsigned int extension:   1;	   /* always zero */
+  unsigned int kind:       10;     /* always DNTT_TYPE_MEMACCESS */
+  unsigned int unused:	   21;     /* pad: bit-field widths total 32 */
+  dnttpointer classptr	     ;     /* pointer to base class         */
+  dnttpointer field          ;     /* pointer field                 */
+};
+
+/* The DNTT_TYPE_INHERITANCE record describes derived classes.
+   In particular, the "parentlist" field of the CLASS record points
+   to a list of INHERITANCE records for classes from which we 
+   inherit members.  */
+
+struct dntt_type_inheritance
+{
+  unsigned int extension:   1;	   /* always zero */
+  unsigned int kind:       10;     /* always DNTT_TYPE_INHERITANCE */
+  unsigned int Virtual:     1;     /* virtual base class ?          */
+  unsigned int visibility:  2;     /* pub = 0, prot = 1, priv = 2   */
+  unsigned int unused:	   18;     /* pad: bit-field widths total 32 */
+  dnttpointer classname      ;     /* first parent class, if any    */
+  unsigned long offset       ;     /* offset to start of base class */
+  dnttpointer next           ;     /* pointer to next K_INHERITANCE */
+  unsigned long future[2]    ;     /* padding to 3-word block end   */
+};
+
+/* C++ "friend" classes ... */
+
+struct dntt_type_friend_class
+{
+  unsigned int extension:   1;	   /* always zero */
+  unsigned int kind:       10;     /* always DNTT_TYPE_FRIEND_CLASS */
+  unsigned int unused:	   21;     /* pad: bit-field widths total 32 */
+  dnttpointer classptr       ;     /* pointer to class DNTT         */
+  dnttpointer next           ;     /* next DNTT_FRIEND              */
+};
+
+struct dntt_type_friend_func
+{
+  unsigned int extension:   1;	   /* always zero */
+  unsigned int kind:       10;     /* always DNTT_TYPE_FRIEND_FUNC */
+  unsigned int unused:	   21;     /* pad: bit-field widths total 32 */
+  dnttpointer funcptr        ;     /* pointer to function           */
+  dnttpointer classptr       ;     /* pointer to class DNTT         */
+  dnttpointer next           ;     /* next DNTT_FRIEND              */
+  unsigned long future[2]    ;     /* padding to 3-word block end   */
+};
+
+/* DDE appears to ignore the DNTT_TYPE_MODIFIER record.
+   It could perhaps be used to give better "ptype" output in GDB;
+   otherwise it is probably safe for GDB to ignore it also.  */
+
+struct dntt_type_modifier
+{
+  unsigned int extension:   1;	   /* always zero */
+  unsigned int kind:       10;     /* always DNTT_TYPE_MODIFIER */
+  unsigned int m_const:     1;     /* const                         */
+  unsigned int m_static:    1;     /* static                        */
+  unsigned int m_void:      1;     /* void                          */
+  unsigned int m_volatile:  1;     /* volatile                      */
+  unsigned int m_duplicate: 1;     /* duplicate                     */
+  unsigned int unused:	   16;     /* pad: bit-field widths total 32 */
+  dnttpointer type           ;     /* subtype                       */
+  unsigned long future       ;     /* padding to 3-word block end   */
+};
+
+/* I'm not sure what this was intended for - DDE ignores it.  */
+
+struct dntt_type_object_id
+{
+  unsigned int extension:   1;	   /* always zero */
+  unsigned int kind:       10;     /* always DNTT_TYPE_OBJECT_ID */
+  unsigned int indirect:    1;     /* Is object_ident addr of addr? */
+  unsigned int unused:	   20;     /* pad: bit-field widths total 32 */
+  unsigned long object_ident ;     /* object identifier             */
+  unsigned long offset       ;     /* offset to start of base class */
+  dnttpointer next           ;     /* pointer to next K_OBJECT_ID   */
+  unsigned long segoffset    ;     /* for linker fixup              */
+  unsigned long future       ;     /* padding to 3-word block end   */
+};
+
+/* No separate dntt_type_memfunc; same as dntt_type_func */
+
+/* Symbol records to support templates. These only get used
+   in DDE's "describe" output (like GDB's "ptype").  */
+
+/* The TEMPLATE record is the header for a template-class.
+   Like the CLASS record, a TEMPLATE record has a memberlist that
+   points to a list of template members. It also has an arglist
+   pointing to a list of TEMPLATE_ARG records.  */
+
+struct dntt_type_template
+{
+  unsigned int extension:   1;	   /* always zero */
+  unsigned int kind:       10;     /* always DNTT_TYPE_TEMPLATE */
+  unsigned int abstract:    1;     /* is this an abstract class?    */
+  unsigned int class_decl:  2;     /* 0=class,1=union,2=struct      */
+  unsigned int unused:	   18;     /* pad: bit-field widths total 32 */
+  dnttpointer memberlist     ;     /* ptr to chain of K_[GEN]FIELDs */
+  long unused2               ;     /* offset in obj of ptr to vtbl  */
+  dnttpointer parentlist     ;     /* ptr to K_INHERITANCE list     */
+  unsigned long bitlength    ;     /* total at this level           */
+  dnttpointer identlist      ;     /* ptr to chain of class ident's */
+  dnttpointer friendlist     ;     /* ptr to K_FRIEND list          */
+  dnttpointer arglist        ;     /* ptr to argument list          */
+  dnttpointer expansions     ;     /* ptr to expansion list         */
+};
+
+/* Template-class arguments are a list of TEMPL_ARG records
+   chained together. The "name" field is the name of the formal.
+   E.g.:
+   
+     template <class T> class q { ... };
+   
+   Then "T" is the name of the formal argument.  */
+
+struct dntt_type_templ_arg
+{
+  unsigned int extension:   1;	   /* always zero */
+  unsigned int kind:       10;     /* always DNTT_TYPE_TEMPL_ARG */
+  unsigned int usagetype:   1;     /* 0 type-name 1 expression     */
+  unsigned int unused:	   20;     /* pad: bit-field widths total 32 */
+  vtpointer name             ;     /* name of argument             */
+  dnttpointer type           ;     /* for non type arguments       */
+  dnttpointer nextarg        ;     /* Next argument if any         */
+  long future[2]             ;     /* padding to 3-word block end  */
+};
+
+/* FUNC_TEMPLATE records are sort of like FUNCTION, but are emitted
+   for template member functions. E.g.,
+   
+     template <class T> class q
+     {
+        ...
+        void f();
+        ... 
+     };
+   
+   Within the list of FIELDs/GENFIELDs defining the member list
+   of the template "q", "f" would appear as a FUNC_TEMPLATE.
+   We'll also see instances of FUNCTION "f" records for each 
+   instantiation of the template.  */
+
+struct dntt_type_func_template
+{
+  unsigned int extension:   1;	   /* always zero */
+  unsigned int kind:       10;     /* always DNTT_TYPE_FUNC_TEMPLATE */
+  unsigned int public:      1;     /* 1 => globally visible        */
+  unsigned int language:    4;     /* type of language             */
+  unsigned int level:       5;     /* nesting level (top level = 0)*/
+  unsigned int optimize:    2;     /* level of optimization        */
+  unsigned int varargs:     1;     /* ellipses.  Pascal/800 later  */ 
+  unsigned int info:        4;     /* lang-specific stuff; F_xxxx  */
+  unsigned int inlined:     1;
+  unsigned int localloc:    1;     /* 0 at top, 1 at end of block  */
+  unsigned int unused:      2;     /* pad: bit-field widths total 32 */
+  vtpointer name             ;     /* name of function             */
+  vtpointer alias            ;     /* alternate name, if any       */
+  dnttpointer firstparam     ;     /* first FPARAM, if any         */
+  dnttpointer retval         ;     /* return type, if any          */
+  dnttpointer arglist        ;     /* ptr to argument list         */
+};
+
+/* LINK is apparently intended to link together function template
+   definitions with their instantiations. However, it is not clear
+   why this would be needed, except to provide the information on
+   a "ptype" command. And as far as I can tell, aCC does not 
+   generate this record.  */
+
+struct dntt_type_link
+{
+  unsigned int extension:   1;	   /* always zero */
+  unsigned int kind:       10;     /* always DNTT_TYPE_LINK */
+  unsigned int linkKind:    4;     /* always LINK_UNKNOWN          */
+  unsigned int unused:	   17;     /* pad: bit-field widths total 32 */
+  long future1               ;     /* expansion                    */
+  dnttpointer ptr1           ;     /* link from template           */
+  dnttpointer ptr2           ;     /* to expansion                 */
+  long future[2]             ;     /* padding to 3-word block end  */
+};
+
+/* end of C++ specific SOM's.  */
+
+/* DNTT_TYPE_DYN_ARRAY_DESC is unused by GDB */
+/* DNTT_TYPE_DESC_SUBRANGE is unused by GDB */
+/* DNTT_TYPE_BEGIN_EXT is unused by GDB */
+/* DNTT_TYPE_INLN is unused by GDB */
+/* DNTT_TYPE_INLN_LIST is unused by GDB */
+/* DNTT_TYPE_ALIAS is unused by GDB */
+
+struct dntt_type_doc_function
+{
+  unsigned int extension: 1;   /* always zero                  */
+  unsigned int kind:     10;   /* K_DOC_FUNCTION or            */
+                               /* K_DOC_MEMFUNC                */
+  unsigned int global:    1;   /* 1 => globally visible        */
+  unsigned int language:  4;   /* type of language             */
+  unsigned int level:     5;   /* nesting level (top level = 0)*/
+  unsigned int optimize:  2;   /* level of optimization        */
+  unsigned int varargs:   1;   /* ellipses.  Pascal/800 later  */
+  unsigned int info:      4;   /* lang-specific stuff; F_xxxx  */
+  unsigned int inlined:   1;
+  unsigned int localloc:  1;   /* 0 at top, 1 at end of block  */
+  unsigned int expansion: 1;   /* 1 = function expansion       */
+  unsigned int doc_clone: 1;   /* bit-field widths total 32; there is no pad field */
+  vtpointer name;              /* name of function             */
+  vtpointer alias;             /* alternate name, if any       */
+  dnttpointer firstparam;      /* first FPARAM, if any         */
+  sltpointer address;          /* code and text locations      */
+  CORE_ADDR entryaddr;         /* address of entry point       */
+  dnttpointer retval;          /* return type, if any          */
+  CORE_ADDR lowaddr;           /* lowest address of function   */
+  CORE_ADDR hiaddr;            /* highest address of function  */
+  dnttpointer inline_list;     /* pointer to first inline      */
+  ltpointer lt_offset;         /* start of frag/cp line table  */
+  ctxtpointer ctxt_offset;     /* start of context table for this routine */
+};
+
+/* DNTT_TYPE_DOC_MEMFUNC is unused by GDB */
+
+/* DNTT_TYPE_GENERIC and DNTT_TYPE_BLOCK are convenience structures
+   so we can examine a DNTT entry in a generic fashion.  */
+struct dntt_type_generic
+{
+  unsigned int word[9];		/* Untyped view of an entry as nine words.  */
+};
+
+struct dntt_type_block
+{
+  unsigned int extension:	1;
+  unsigned int kind:            10;
+  unsigned int unused:		21;	/* Pad: bit-field widths total 32.  */
+  unsigned int word[2];		/* Two untyped words following the header word.  */
+};
+
+/* One entry in a DNTT (either the LNTT or GNTT).  
+   This is a union of the above 60 or so structure definitions.  */
+
+union dnttentry
+{
+  struct dntt_type_srcfile dsfile;
+  struct dntt_type_module dmodule;
+  struct dntt_type_function dfunc;
+  struct dntt_type_function dentry;	/* Same struct as dfunc.  */
+  struct dntt_type_begin dbegin;
+  struct dntt_type_end dend;
+  struct dntt_type_fparam dfparam;
+  struct dntt_type_svar dsvar;
+  struct dntt_type_dvar ddvar;
+  struct dntt_type_const dconst;
+  struct dntt_type_type dtype;
+  struct dntt_type_type dtag;		/* Same struct as dtype.  */
+  struct dntt_type_pointer dptr;
+  struct dntt_type_enum denum;
+  struct dntt_type_memenum dmember;
+  struct dntt_type_set dset;
+  struct dntt_type_subrange dsubr;
+  struct dntt_type_array darray;
+  struct dntt_type_struct dstruct;
+  struct dntt_type_union dunion;
+  struct dntt_type_field dfield;
+  struct dntt_type_functype dfunctype;
+  struct dntt_type_with dwith;
+  struct dntt_type_function dblockdata;	/* BLOCKDATA has the same structure as FUNCTION.  */
+  struct dntt_type_class_scope dclass_scope;
+  struct dntt_type_pointer dreference;	/* C++ reference: same structure as POINTER.  */
+  struct dntt_type_ptrmem dptrmem;
+  struct dntt_type_ptrmemfunc dptrmemfunc;
+  struct dntt_type_class dclass;
+  struct dntt_type_genfield dgenfield;
+  struct dntt_type_vfunc dvfunc;
+  struct dntt_type_memaccess dmemaccess;
+  struct dntt_type_inheritance dinheritance;
+  struct dntt_type_friend_class dfriend_class;
+  struct dntt_type_friend_func dfriend_func;
+  struct dntt_type_modifier dmodifier;
+  struct dntt_type_object_id dobject_id;
+  struct dntt_type_template dtemplate;
+  struct dntt_type_templ_arg dtempl_arg;
+  struct dntt_type_func_template dfunc_template;
+  struct dntt_type_link dlink;
+  struct dntt_type_doc_function ddocfunc;
+  struct dntt_type_generic dgeneric;	/* Untyped word-array view of the entry.  */
+  struct dntt_type_block dblock;	/* Header bits plus two untyped words.  */
+};
+
+/* Source line entry types.  */
+enum slttype
+{
+  SLT_NORMAL,		/* 0 */
+  SLT_SRCFILE,		/* 1 */
+  SLT_MODULE,		/* 2 */
+  SLT_FUNCTION,		/* 3 */
+  SLT_ENTRY,		/* 4 */
+  SLT_BEGIN,		/* 5 */
+  SLT_END,		/* 6 */
+  SLT_WITH,		/* 7 */
+  SLT_EXIT,		/* 8 */
+  SLT_ASSIST,		/* 9 */
+  SLT_MARKER,		/* 10 */
+  SLT_CLASS_SCOPE,	/* 11 */
+  SLT_INLN,		/* 12 */
+  SLT_NORMAL_OFFSET,	/* 13 */
+};
+
+/* A normal source line entry.  Simply provides a mapping of a source
+   line number to a code address.
+
+   SLTDESC will always be SLT_NORMAL or SLT_EXIT.  */
+
+struct slt_normal
+{
+  unsigned int sltdesc:	4;	/* Always SLT_NORMAL or SLT_EXIT.  */
+  unsigned int line:	28;	/* Source line number.  */
+  CORE_ADDR address;		/* Code address the line maps to.  */
+};
+
+struct slt_normal_off
+{
+  unsigned int sltdesc:	4;	/* NOTE(review): presumably SLT_NORMAL_OFFSET -- confirm.  */
+  unsigned int offset:	6;
+  unsigned int line:	22;	/* Only 22 bits here, versus 28 in struct slt_normal.  */
+  CORE_ADDR address;
+};
+
+/* A special source line entry.  Provides a mapping of a declaration
+   to a line number.  These entries point back into the DNTT which
+   references them.  */
+
+struct slt_special
+{
+  unsigned int sltdesc:	4;
+  unsigned int line:	28;	/* Line number of the declaration.  */
+  dnttpointer backptr;		/* Back into the DNTT, which references this entry.  */
+};
+
+/* Used to describe nesting.
+
+   For nested languages, an slt_assist entry must follow each SLT_FUNC
+   entry in the SLT.  The address field will point forward to the
+   first slt_normal entry within the function's scope.  */
+
+struct slt_assist
+{
+  unsigned int sltdesc:	4;
+  unsigned int unused:	28;	/* Pad: bit-field widths total 32.  */
+  sltpointer address;		/* Forward to the first slt_normal entry in the function's scope.  */
+};
+
+struct slt_generic
+{
+  unsigned int word[2];		/* Untyped view of an SLT entry as two words.  */
+};
+
+union sltentry
+{
+  struct slt_normal snorm;		/* Line number -> code address.  */
+  struct slt_normal_off snormoff;
+  struct slt_special sspec;		/* Declaration -> line number, with DNTT back pointer.  */
+  struct slt_assist sasst;		/* Nesting helper with a forward SLT pointer.  */
+  struct slt_generic sgeneric;		/* Untyped two-word view.  */
+};
+
+/* $LINES$ declarations
+   This is the line table used for optimized code, which is only present 
+   in the new $PROGRAM_INFO$ debug space.  */
+
+#define DST_LN_ESCAPE_FLAG1   15  /* NOTE(review): presumably matched against a 4-bit esc_flag field -- confirm.  */
+#define DST_LN_ESCAPE_FLAG2   14
+#define DST_LN_CTX_SPEC1      13  
+#define DST_LN_CTX_SPEC2      12  /* The four values 12..15 each fit in four bits.  */
+
+/* Escape function codes:  */
+
+typedef enum
+{
+  dst_ln_pad,          /* pad byte */
+  dst_ln_escape_1,     /* reserved */
+  dst_ln_dpc1_dln1,    /* 1 byte line delta, 1 byte pc delta */
+  dst_ln_dpc2_dln2,    /* 2 bytes line delta, 2 bytes pc delta */
+  dst_ln_pc4_ln4,      /* 4 bytes ABSOLUTE line number, 4 bytes ABSOLUTE pc */
+  dst_ln_dpc0_dln1,    /* 1 byte line delta, pc delta = 0 */
+  dst_ln_ln_off_1,     /* statement escape, stmt # = 1 (2nd stmt on line) */
+  dst_ln_ln_off,       /* statement escape, stmt # = next byte */
+  dst_ln_entry,        /* entry escape, next byte is entry number */
+  dst_ln_exit,         /* exit escape */
+  dst_ln_stmt_end,     /* gap escape, 4 bytes pc delta */
+  dst_ln_stmt_cp,      /* current stmt is a critical point */
+  dst_ln_escape_12,    /* reserved */
+  dst_ln_escape_13,    /* this is an exception site record */
+  dst_ln_nxt_byte,     /* next byte contains the real escape code */
+  dst_ln_end,          /* end escape, final entry follows */
+  dst_ln_escape1_END_OF_ENUM  /* equals 16, the number of codes listed above */
+}
+dst_ln_escape1_t;
+
+typedef enum
+{
+  dst_ln_ctx_1,        	/* next byte describes context switch with 5-bit */
+  			/* index into the image table and 3-bit run length. */
+			/* If run length is 0, end with another ctx specifier or ctx_end */
+  dst_ln_ctx_2,        	/* next 2 bytes switch context: 13 bit index, 3 bit run length */
+  dst_ln_ctx_4,        	/* next 4 bytes switch context: 29 bit index, 3 bit run length */
+  dst_ln_ctx_end,      	/* end current context */
+  dst_ln_col_run_1,    	/* next byte is column position of start of next statement, */
+                        /* following byte is length of statement */
+  dst_ln_col_run_2,    	/* next 2 bytes is column position of start of next statement, */
+                        /* following 2 bytes is length of statement */
+  dst_ln_init_base1,   	/* next 4 bytes are absolute PC, followed by 1 byte of line number */
+  dst_ln_init_base2,   	/* next 4 bytes are absolute PC, followed by 2 bytes of line number */
+  dst_ln_init_base3,   	/* next 4 bytes are absolute PC, followed by 3 bytes of line number */
+  dst_ln_escape2_END_OF_ENUM  /* equals 9, the number of codes listed above */
+}
+dst_ln_escape2_t;           
+
+typedef union
+{
+  struct
+  {
+    unsigned int     pc_delta : 4;      /* 4 bit pc delta */
+    int              ln_delta : 4;      /* 4 bit line number delta */
+  }
+  delta;                                /* two 4 bit deltas, 8 bits in all */
+
+  struct
+  {
+    unsigned int     esc_flag : 4;      /* alias for pc_delta  */
+    unsigned int     esc_code : 4;      /* escape function code (dst_ln_escape1_t, or ...2_t) */
+  }
+  esc;
+
+  struct
+  {
+    unsigned int     esc_flag   : 4;      /* dst_ln_ctx_spec1, or dst_ln_ctx_spec2 */
+    unsigned int     run_length : 2;      /* 2 bit run length */
+    unsigned int     ctx_index  : 2;      /* ...spec2 contains index;  ...spec1, index - 4 */
+  }
+  ctx_spec;
+
+  char               sdata;               /* signed data byte; NOTE(review): plain char is signed only where the ABI makes it so -- confirm for non-HP hosts */
+  unsigned char      udata;               /* unsigned data byte */
+}
+dst_ln_entry_t,
+  * dst_ln_entry_ptr_t;
+
+/* Warning: although the above union occupies only 1 byte the compiler treats
+   it as having size 2 (the minimum size of a struct).  Therefore a sequence of
+   dst_ln_entry_t's cannot be described as an array, and walking through such a
+   sequence requires convoluted code such as
+        ln_ptr = (dst_ln_entry_ptr_t) (char*) ln_ptr + 1
+   We regret the inconvenience.  */
+
+/* Structure for interpreting the byte following a dst_ln_ctx1 entry.  */
+typedef struct	/* Bit-field widths total 8 (5 + 3).  */
+{
+    unsigned int          ctx1_index : 5;      /* 5 bit index into context table */
+    unsigned int          ctx1_run_length : 3; /* 3 bit run length */
+} dst_ln_ctx1_t,
+  *dst_ln_ctx1_ptr_t;
+
+/* Structure for interpreting the bytes following a dst_ln_ctx2 entry.  */
+typedef struct	/* Bit-field widths total 16 (13 + 3).  */
+{
+    unsigned int          ctx2_index : 13;     /* 13 bit index into context table */
+    unsigned int          ctx2_run_length : 3; /* 3 bit run length */
+} dst_ln_ctx2_t,
+  *dst_ln_ctx2_ptr_t;
+
+/* Structure for interpreting the bytes following a dst_ln_ctx4 entry.  */
+typedef struct	/* Bit-field widths total 32 (29 + 3).  */
+{
+    unsigned int          ctx4_index : 29;     /* 29 bit index into context table */
+    unsigned int          ctx4_run_length : 3; /* 3 bit run length */
+} dst_ln_ctx4_t,
+  *dst_ln_ctx4_ptr_t;
+
+
+/*  PXDB definitions.
+  
+   PXDB is a post-processor which takes the executable file
+   and massages the debug information so that the debugger may
+   start up and run more efficiently.  Some of the tasks
+   performed by PXDB are:
+  
+   o   Remove duplicate global type and variable information
+       from the GNTT,
+  
+   o   Append the GNTT onto the end of the LNTT and place both
+       back in the LNTT section,
+  
+   o   Build quick look-up tables (description follows) for
+       files, procedures, modules, and paragraphs (for Cobol),
+       placing these in the GNTT section,
+  
+   o   Reconstruct the header appearing in the header section
+       to access this information.
+  
+   The "quick look-up" tables are in the $GNTT$ sub-space, in
+   the following order:
+  
+       Procedures    -sorted by address
+       Source files  -sorted by address (of the
+                      generated code from routines)
+       Modules       -sorted by address
+       Classes       -<unsorted?>
+       Address Alias -sorted by index <?>
+       Object IDs    -sorted by object identifier
+  
+   Most quick entries have (0-based) indices into the LNTT tables to
+   the full entries for the item it describes.
+  
+   The post-PXDB header is in the $HEADER$ sub-space.  Alas, it
+   occurs in different forms, depending on the optimization level
+   in the compilation step and whether PXDB was run or not. The
+   worst part is the forms aren't self-describing, so we'll have
+   to grovel in the bits to figure out what kind we're looking at
+   (see hp_get_header in hp-psymtab-read.c).  */
+
+/* PXDB versions.  */
+
+#define PXDB_VERSION_CPLUSPLUS	1
+#define PXDB_VERSION_7_4	2
+#define PXDB_VERSION_CPP_30	3
+#define PXDB_VERSION_DDE_3_2A	4
+#define PXDB_VERSION_DDE_3_2	5
+#define PXDB_VERSION_DDE_4_0	6
+
+#define PXDB_VERSION_2_1	1	/* Same numeric value as PXDB_VERSION_CPLUSPLUS.  */
+
+/* Header version for the case that there is no DOC info
+   but the executable has been processed by pxdb (the easy
+   case, from "cc -g").  */
+
+typedef struct PXDB_struct
+{
+  int              pd_entries;   /* # of entries in function look-up table */
+  int              fd_entries;   /* # of entries in file look-up table */
+  int              md_entries;   /* # of entries in module look-up table */
+  unsigned int     pxdbed : 1;   /* 1 => file has been preprocessed      */
+  unsigned int     bighdr : 1;   /* 1 => this header contains 'time' word */
+  unsigned int     sa_header : 1;/* 1 => created by SA version of pxdb */
+			           /*   used for version check in xdb */
+  unsigned int     inlined: 1;   /* one or more functions have been inlined */
+  unsigned int     spare:12;     /* pad: with the four flag bits above, totals 16 bits */
+  short            version;      /* pxdb header version */
+  int              globals;      /* index into the DNTT where GNTT begins */
+  unsigned int     time;         /* modify time of file before being pxdbed */
+  int              pg_entries;   /* # of entries in label look-up table */
+  int              functions;    /* actual number of functions */
+  int              files;        /* actual number of files */
+  int              cd_entries;   /* # of entries in class look-up table */
+  int              aa_entries;   /* # of entries in addr alias look-up table */
+  int              oi_entries;   /* # of entries in object id look-up table */
+} PXDB_header, *PXDB_header_ptr;
+
+/* Header version for the case that there is no DOC info and the
+   executable has NOT been processed by pxdb.  */
+
+typedef struct XDB_header_struct
+{
+  long gntt_length;  /* NOTE(review): presumably length of the GNTT; units not stated here -- confirm */
+  long lntt_length;  /* NOTE(review): presumably length of the LNTT -- confirm */
+  long slt_length;   /* NOTE(review): presumably length of the SLT -- confirm */
+  long vt_length;    /* NOTE(review): presumably length of the VT -- confirm */
+  long xt_length;    /* NOTE(review): presumably length of the XT -- confirm */
+} XDB_header;
+
+/* Header version for the case that there is DOC info and the
+   executable has been processed by pxdb. */
+
+typedef struct DOC_info_PXDB_header_struct
+{
+  unsigned int xdb_header: 1; 	      /* bit set if this is post-3.1 xdb */ 
+  unsigned int doc_header: 1;         /* bit set if this is doc-style header */
+  unsigned int version: 8;            /* version of pxdb see defines
+				         PXDB_VERSION_* in this file.  */
+  unsigned int reserved_for_flags: 16;/* for future use; -- must be 
+                                         set to zero.  */
+  unsigned int has_aux_pd_table: 1;   /* $GNTT$ has aux PD table */
+  unsigned int has_expr_table: 1;     /* space has $EXPR$ */       
+  unsigned int has_range_table: 1;    /* space has $RANGE$ */       
+  unsigned int has_context_table: 1;  /* space has $SRC_CTXT$ */    
+  unsigned int has_lines_table: 1;    /* space contains a $LINES$
+                                         subspace for line tables.  */
+  unsigned int has_lt_offset_map: 1;  /* space contains an lt_offset
+                                         subspace for line table mapping.  */
+  /* The fields below mirror PXDB_header, except that spare is 28 bits here and there is no "short version" member.  */
+  int           pd_entries;   /* # of entries in function look-up table */
+  int           fd_entries;   /* # of entries in file look-up table */
+  int           md_entries;   /* # of entries in module look-up table */
+  unsigned int  pxdbed : 1;   /* 1 => file has been preprocessed      */
+  unsigned int  bighdr : 1;   /* 1 => this header contains 'time' word */
+  unsigned int  sa_header : 1;/* 1 => created by SA version of pxdb */
+                              /*   used for version check in xdb */
+  unsigned int  inlined: 1;   /* one or more functions have been inlined */
+  unsigned int  spare : 28;   /* spare; with the 4 one-bit flags above: 32 bits */
+  int      	globals;      /* index into the DNTT where GNTT begins */
+  unsigned int  time;         /* modify time of file before being pxdbed */
+  int           pg_entries;   /* # of entries in label look-up table */
+  int           functions;    /* actual number of functions */
+  int           files;        /* actual number of files */
+  int           cd_entries;   /* # of entries in class look-up table */
+  int           aa_entries;   /* # of entries in addr alias look-up table */
+  int           oi_entries;   /* # of entries in object id look-up table */
+} DOC_info_PXDB_header;
+
+/* Header version for the case that there is DOC info and the
+   executable has NOT been processed by pxdb.  */
+
+typedef struct DOC_info_header_struct
+{
+  unsigned int xdb_header: 1; 	/* bit set if this is post-3.1 xdb */ 
+  unsigned int doc_header: 1;     /* bit set if this is doc-style header*/
+  unsigned int version: 8;      /* version of debug/header 
+                                   format. For 10.0 the value 
+                                   will be 1. For "Davis" the value is 2.  */
+  unsigned int reserved_for_flags: 18; /* for future use; -- must be set to zero.  */
+  unsigned int has_range_table: 1;     /* space contains a $RANGE$ subspace for variable ranges.  */
+  unsigned int has_context_table: 1;   /* space contains a $CTXT$ subspace for context/inline table.  */
+  unsigned int has_lines_table: 1;     /* space contains a $LINES$ subspace for line tables. */
+  unsigned int has_lt_offset_map: 1;   /* space contains an lt_offset subspace for line table mapping.  */
+  /* Table lengths follow; the first five have the same names and order as the fields of XDB_header.  */
+  long   gntt_length;  /* same as old header */
+  long   lntt_length;  /* same as old header */
+  long   slt_length;   /* same as old header */
+  long   vt_length;    /* same as old header */
+  long   xt_length;    /* same as old header */
+  long   ctxt_length;  /* present only if version >= 2 */
+  long   range_length; /* present only if version >= 2 */
+  long   expr_length;  /* present only if version >= 2 */
+  /* NOTE(review): there is an expr_length but no has_expr_table flag here, unlike DOC_info_PXDB_header.  */
+} DOC_info_header;
+
+typedef union GenericDebugHeader_union
+{
+   PXDB_header          no_doc;          /* no DOC info, processed by pxdb */
+   DOC_info_PXDB_header doc;             /* DOC info, processed by pxdb */
+   XDB_header           no_pxdb_no_doc;  /* no DOC info, NOT processed by pxdb */
+   DOC_info_header      no_pxdb_doc;     /* DOC info, NOT processed by pxdb */
+} GenericDebugHeader;
+
+
+/*  Procedure Descriptor:
+    An element of the procedure quick look-up table.  */
+
+typedef struct quick_procedure
+{
+  long           isym;		/* 0-based index of first symbol
+                                   for procedure in $LNTT$, 
+                                   i.e. the procedure itself.  */
+  CORE_ADDR	 adrStart;	/* memory adr of start of proc	*/
+  CORE_ADDR	 adrEnd;	/* memory adr of end of proc	*/
+  char         	*sbAlias;	/* alias name of procedure	*/
+  char          *sbProc;	/* real name of procedure	*/
+  CORE_ADDR	 adrBp;		/* address of entry breakpoint  */
+  CORE_ADDR	 adrExitBp;	/* address of exit breakpoint   */
+  int            icd;           /* class this is a member of (index) */	
+  unsigned int	 ipd;		/* index of template for this   */
+                                /* function (index)           */
+  unsigned int	 unused:    5;	/* unused; bit-fields from here to "level" total 32 bits */
+  unsigned int	 no_lt_offset: 1;/* no entry in lt_offset table */
+  unsigned int	 fTemplate: 1;	/* function template		*/
+  unsigned int	 fExpansion: 1;	/* function expansion		*/
+  unsigned int	 linked	  : 1;	/* linked with other expansions	*/
+  unsigned int	 duplicate: 1;  /* clone of another procedure   */
+  unsigned int	 overloaded:1;  /* overloaded function          */
+  unsigned int	 member:    1;  /* class member function        */
+  unsigned int	 constructor:1; /* constructor function         */
+  unsigned int	 destructor:1;  /* destructor function          */
+  unsigned int   Static:    1;  /* static function              */
+  unsigned int   Virtual:   1;  /* virtual function             */
+  unsigned int   constant:  1;  /* constant function            */
+  unsigned int   pure:      1;  /* pure (virtual) function      */
+  unsigned int   language:  4;  /* procedure's language         */
+  unsigned int   inlined:   1;  /* function has been inlined    */
+  unsigned int   Operator:  1;  /* operator function            */
+  unsigned int	 stub:      1;  /* bodyless function            */
+  unsigned int	 optimize:  2;	/* optimization level   	*/
+  unsigned int	 level:     5;	/* nesting level (top=0)	*/
+} quick_procedure_entry, *quick_procedure_entry_ptr;
+
+/*  Source File Descriptor:
+    An element of the source file quick look-up table.  */
+
+typedef struct quick_source
+{
+  long	         isym;		/* 0-based index in $LNTT$ of
+                                   first symbol for this file.     */
+  CORE_ADDR      adrStart;	/* mem adr of start of file's code */
+  CORE_ADDR      adrEnd;	/* mem adr of end of file's code   */
+  char	        *sbFile;	/* name of source file		   */
+  unsigned int   fHasDecl: 1;	/* 1 => we have a .d file	   */
+  unsigned int   fWarned:  1;	/* 1 => already warned about age problems */
+  unsigned int   fSrcfile: 1;   /* 0 => include, 1 => source       */
+  unsigned short ilnMac;	/* lines in file (0 if not known)  */
+  int	         ipd;		/* 0-based index of first procedure
+                                   in this file, in the quick
+                                   look-up table of procedures.    */
+  unsigned int  *rgLn;		/* line pointer array, if any	   */
+} quick_file_entry, *quick_file_entry_ptr;
+
+/*  Module Descriptor:
+    An element of the module quick reference table.  */
+
+typedef struct quick_module
+{
+  long           isym;		   /* 0-based index of first
+                                      symbol for module.        */
+  CORE_ADDR	 adrStart;	   /* adr of start of mod.	*/
+  CORE_ADDR	 adrEnd;	   /* adr of end of mod.	*/
+  char	        *sbAlias;	   /* alias name of module   	*/
+  char	        *sbMod;		   /* real name of module	*/
+  unsigned int   imports:       1; /* does module have imports? */
+  unsigned int   vars_in_front: 1; /* module globals in front?  */
+  unsigned int   vars_in_gaps:  1; /* module globals in gaps?   */
+  unsigned int   language:      4; /* type of language          */
+  unsigned int   unused      : 25; /* unused; with the 7 bits above: 32 bits */
+  unsigned int   unused2;	   /* space for future stuff	*/
+} quick_module_entry, *quick_module_entry_ptr;
+
+/*  Auxiliary Procedure Descriptor:
+    An element of the auxiliary procedure quick look-up table.  */
+
+typedef struct quick_aux_procedure
+{
+  long	 isym_inln;	/* start of inline list for proc */
+  long   spare;		/* spare; no use is shown in this header */
+} quick_aux_procedure_entry, *quick_aux_procedure_entry_ptr;
+
+/*  Paragraph Descriptor:
+    An element of the paragraph quick look-up table.  */
+
+typedef struct quick_paragraph
+{
+  long             isym;       /* first symbol for label (index)  */
+  CORE_ADDR        adrStart;   /* memory adr of start of label    */
+  CORE_ADDR        adrEnd;     /* memory adr of end of label      */
+  char            *sbLab;      /* name of label                   */
+  unsigned int     inst;       /* Used in xdb to store inst @ bp  */
+  unsigned int     sect:    1; /* 1 = section, 0 = paragraph      */
+  unsigned int     unused: 31; /* future use; with sect: 32 bits  */
+} quick_paragraph_entry, *quick_paragraph_entry_ptr;
+
+/* Class Descriptor:
+   An element of the class quick look-up table.  */
+
+typedef struct quick_class
+{
+  char	         *sbClass;	/* name of class	        */
+  long            isym;         /* class symbol (tag)           */
+  unsigned int	  type : 2;	/* 0=class, 1=union, 2=struct (3 is not listed) */
+  unsigned int	  fTemplate : 1;/* class template               */
+  unsigned int	  expansion : 1;/* template expansion           */
+  unsigned int	  unused    :28;/* unused; with the 4 bits above: 32 bits */
+  sltpointer      lowscope;	/* beginning of defined scope   */
+  sltpointer      hiscope;	/* end of defined scope         */
+} quick_class_entry, *quick_class_entry_ptr;
+
+/* Address Alias Entry
+   An element of the address alias quick look-up table.  */
+
+typedef struct quick_alias
+{
+  CORE_ADDR     low;		/* NOTE(review): presumably low end of the aliased range -- confirm */
+  CORE_ADDR     high;		/* NOTE(review): presumably high end of the aliased range -- confirm */
+  int           index;		/* NOTE(review): which table this indexes is not shown here -- confirm */
+  unsigned int	unused : 31;	/* unused; with "alternate": 32 bits */
+  unsigned int	alternate : 1;	/* alternate unnamed aliases?   */
+} quick_alias_entry, *quick_alias_entry_ptr;
+
+/* Object Identification Entry
+   An element of the object identification quick look-up table.  */
+
+typedef struct quick_obj_ID
+{
+  CORE_ADDR    obj_ident;	/* class identifier (stored as an address-sized value) */
+  long         isym;		/* class symbol             */
+  long         offset;		/* offset to object start; units not stated here */
+} quick_obj_ID_entry, *quick_obj_ID_entry_ptr;
+
+#endif /* HP_SYMTAB_INCLUDED */
diff --git a/utils/gapy/gen-debug-info-src/ext/ieee.h b/utils/gapy/gen-debug-info-src/ext/ieee.h
new file mode 100644
index 000000000..72fcad420
--- /dev/null
+++ b/utils/gapy/gen-debug-info-src/ext/ieee.h
@@ -0,0 +1,165 @@
+/* IEEE Standard 695-1980 "Universal Format for Object Modules" header file
+
+   Copyright 2001 Free Software Foundation, Inc.
+   
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 2, or (at your option)
+   any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 51 Franklin Street - Fifth Floor,
+   Boston, MA 02110-1301, USA.
+   
+   Contributed by Cygnus Support.  */
+
+#define N_W_VARIABLES 8
+#define Module_Beginning 0xe0
+
+typedef struct ieee_module
+  {
+    char *processor;	/* NOTE(review): presumably the processor name from the module-begin record -- confirm */
+    char *module_name;	/* NOTE(review): presumably the module's name; string ownership is not shown here */
+  }
+ieee_module_begin_type;
+
+#define Address_Descriptor 0xec
+typedef struct ieee_address
+  {
+    bfd_vma number_of_bits_mau;	/* NOTE(review): "mau" is not expanded in this header -- confirm meaning */
+    bfd_vma number_of_maus_in_address;
+
+    unsigned char byte_order;	/* presumably IEEE_LITTLE or IEEE_BIG (defined next) -- confirm */
+#define IEEE_LITTLE 0xcc
+#define IEEE_BIG 0xcd
+  }
+ieee_address_descriptor_type;
+
+typedef union ieee_w_variable
+  {
+    file_ptr offset[N_W_VARIABLES];	/* array view; N_W_VARIABLES is 8, one slot per named field of "r" */
+
+    struct
+      {
+	file_ptr extension_record;	/* the eight members below overlay offset[] in declaration order */
+	file_ptr environmental_record;
+	file_ptr section_part;
+	file_ptr external_part;
+	file_ptr debug_information_part;
+	file_ptr data_part;
+	file_ptr trailer_part;
+	file_ptr me_record;
+      }
+    r;	/* named view of the same storage as offset[] */
+  }
+ieee_w_variable_type;
+
+typedef enum ieee_record
+  { 
+    ieee_number_start_enum = 0x00,
+    ieee_number_end_enum=0x7f,
+    ieee_number_repeat_start_enum = 0x80,
+    ieee_number_repeat_end_enum = 0x88,
+    ieee_number_repeat_4_enum = 0x84,
+    ieee_number_repeat_3_enum = 0x83,
+    ieee_number_repeat_2_enum = 0x82,
+    ieee_number_repeat_1_enum = 0x81,
+    ieee_module_beginning_enum = 0xe0,	/* same value as Module_Beginning */
+    ieee_module_end_enum = 0xe1,
+    ieee_extension_length_1_enum = 0xde,
+    ieee_extension_length_2_enum = 0xdf,
+    ieee_section_type_enum = 0xe6,
+    ieee_section_alignment_enum = 0xe7,
+    ieee_external_symbol_enum = 0xe8,
+    ieee_comma = 0x90,
+    ieee_external_reference_enum = 0xe9,
+    ieee_set_current_section_enum = 0xe5,
+    ieee_address_descriptor_enum = 0xec,	/* same value as Address_Descriptor */
+    ieee_load_constant_bytes_enum = 0xed,
+    ieee_load_with_relocation_enum = 0xe4,
+
+    ieee_variable_A_enum = 0xc1,	/* variables A..Z are consecutive: 0xc1..0xda */
+    ieee_variable_B_enum = 0xc2,
+    ieee_variable_C_enum = 0xc3,
+    ieee_variable_D_enum = 0xc4,
+    ieee_variable_E_enum = 0xc5,
+    ieee_variable_F_enum = 0xc6,
+    ieee_variable_G_enum = 0xc7,
+    ieee_variable_H_enum = 0xc8,
+    ieee_variable_I_enum = 0xc9,
+    ieee_variable_J_enum = 0xca,
+    ieee_variable_K_enum = 0xcb,
+    ieee_variable_L_enum = 0xcc,	/* numerically equal to IEEE_LITTLE */
+    ieee_variable_M_enum = 0xcd,	/* numerically equal to IEEE_BIG */
+    ieee_variable_N_enum = 0xce,
+    ieee_variable_O_enum = 0xcf,
+    ieee_variable_P_enum = 0xd0,
+    ieee_variable_Q_enum = 0xd1,
+    ieee_variable_R_enum = 0xd2,
+    ieee_variable_S_enum = 0xd3,
+    ieee_variable_T_enum = 0xd4,
+    ieee_variable_U_enum = 0xd5,
+    ieee_variable_V_enum = 0xd6,
+    ieee_variable_W_enum = 0xd7,
+    ieee_variable_X_enum = 0xd8,
+    ieee_variable_Y_enum = 0xd9,
+    ieee_variable_Z_enum = 0xda,
+    ieee_function_plus_enum = 0xa5,
+    ieee_function_minus_enum = 0xa6,
+    ieee_function_signed_open_b_enum = 0xba,
+    ieee_function_signed_close_b_enum = 0xbb,
+
+    ieee_function_unsigned_open_b_enum = 0xbc,
+    ieee_function_unsigned_close_b_enum = 0xbd,
+
+    ieee_function_either_open_b_enum = 0xbe,
+    ieee_function_either_close_b_enum = 0xbf,
+    ieee_record_seperator_enum = 0xdb,	/* "seperator" [sic]: the spelling is part of the identifier */
+
+    ieee_e2_first_byte_enum = 0xe2,	/* high byte of the two-byte 0xe2xx codes below */
+    ieee_section_size_enum = 0xe2d3,	/* low byte of each 0xe2xx code lies in the variable range 0xc1..0xda */
+    ieee_physical_region_size_enum = 0xe2c1,
+    ieee_region_base_address_enum = 0xe2c2,
+    ieee_mau_size_enum = 0xe2c6,
+    ieee_m_value_enum = 0xe2cd,
+    ieee_section_base_address_enum = 0xe2cc,
+    ieee_asn_record_enum = 0xe2ce,
+    ieee_section_offset_enum = 0xe2d2,
+    ieee_value_starting_address_enum = 0xe2c7,
+    ieee_assign_value_to_variable_enum = 0xe2d7,
+    ieee_set_current_pc_enum = 0xe2d0,
+    ieee_value_record_enum = 0xe2c9,
+    ieee_nn_record = 0xf0,
+    ieee_at_record_enum = 0xf1,	/* also the high byte of the 0xf1xx codes below */
+    ieee_ty_record_enum = 0xf2,
+    ieee_attribute_record_enum = 0xf1c9,
+    ieee_atn_record_enum = 0xf1ce,
+    ieee_external_reference_info_record_enum = 0xf1d8,
+    ieee_weak_external_reference_enum= 0xf4,
+    ieee_repeat_data_enum = 0xf7,
+    ieee_bb_record_enum = 0xf8,
+    ieee_be_record_enum = 0xf9
+  }
+ieee_record_enum_type;
+
+typedef struct ieee_section
+  {
+    unsigned int section_index;	/* NOTE(review): numbering base not shown; IEEE_SECTION_NUMBER_BASE (1) is defined below -- confirm relation */
+    unsigned int section_type;
+    char *       section_name;	/* string ownership is not shown in this header */
+    unsigned int parent_section_index;	/* the three *_index fields below share section_index's type */
+    unsigned int sibling_section_index;
+    unsigned int context_index;
+  }
+ieee_section_type;
+
+#define IEEE_REFERENCE_BASE 11
+#define IEEE_PUBLIC_BASE 32
+#define IEEE_SECTION_NUMBER_BASE 1
+
diff --git a/utils/gapy/gen-debug-info-src/ext/leb128.h b/utils/gapy/gen-debug-info-src/ext/leb128.h
new file mode 100644
index 000000000..56016b0f7
--- /dev/null
+++ b/utils/gapy/gen-debug-info-src/ext/leb128.h
@@ -0,0 +1,136 @@
+/* Utilities for reading leb128 values.
+   Copyright (C) 2012-2015 Free Software Foundation, Inc.
+
+This file is part of the libiberty library.
+Libiberty is free software; you can redistribute it and/or
+modify it under the terms of the GNU Library General Public
+License as published by the Free Software Foundation; either
+version 2 of the License, or (at your option) any later version.
+
+Libiberty is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+Library General Public License for more details.
+
+You should have received a copy of the GNU Library General Public
+License along with libiberty; see the file COPYING.LIB.  If not, write
+to the Free Software Foundation, Inc., 51 Franklin Street - Fifth Floor,
+Boston, MA 02110-1301, USA.  */
+
+/* The functions defined here can be speed critical.
+   Since they are all pretty small we keep things simple and just define
+   them all as "static inline".
+
+   WARNING: This file is used by GDB which is stuck at C90. :-(
+   Though it can use stdint.h, inttypes.h.
+   Therefore if you want to add support for "long long" you need
+   to wrap it in #ifdef CC_HAS_LONG_LONG.  */
+
+#ifndef LEB128_H
+#define LEB128_H
+
+/* Get a definition for inline.  */
+#include "ansidecl.h"
+
+/* Get a definition for NULL, size_t.  */
+#include <stddef.h>
+
+#ifdef HAVE_STDINT_H
+#include <stdint.h>
+#endif
+#ifdef HAVE_INTTYPES_H
+#include <inttypes.h>
+#endif
+
+/* Decode the unsigned LEB128 constant at BUF into *R and return the number
+   of bytes read (a size_t count rather than a pointer to the next byte, to
+   avoid const-vs-non-const problems).  If BUF_END is reached first, return
+   zero and leave *R untouched.  Bits beyond 64 are silently discarded.  */
+
+static inline size_t
+read_uleb128_to_uint64 (const unsigned char *buf, const unsigned char *buf_end,
+			uint64_t *r)
+{
+  const unsigned char *p = buf;
+  unsigned int shift = 0;
+  uint64_t result = 0;
+  unsigned char byte;
+
+  while (1)
+    {
+      if (p >= buf_end)
+	return 0;
+      byte = *p++;
+      /* Shifting by >= the width of RESULT is undefined, so stop there.  */
+      if (shift < sizeof (result) * 8)
+	{
+	  result |= ((uint64_t) (byte & 0x7f)) << shift;
+	  shift += 7;
+	}
+      if ((byte & 0x80) == 0)
+	break;
+    }
+
+  *r = result;
+  return p - buf;
+}
+
+/* Decode the signed LEB128 constant at BUF into *R, sign-extending from bit
+   6 of the last byte, and return the number of bytes read (a size_t count,
+   not a pointer, to avoid const-vs-non-const problems).  If BUF_END is
+   reached first, return zero and leave *R untouched.  */
+
+static inline size_t
+read_sleb128_to_int64 (const unsigned char *buf, const unsigned char *buf_end,
+		       int64_t *r)
+{
+  const unsigned char *p = buf;
+  unsigned int shift = 0;
+  int64_t result = 0;
+  unsigned char byte;
+
+  while (1)
+    {
+      if (p >= buf_end)
+	return 0;
+      byte = *p++;
+      /* Shifting by >= the width of RESULT is undefined, so stop there.  */
+      if (shift < sizeof (result) * 8)
+	{
+	  result |= ((uint64_t) (byte & 0x7f)) << shift;
+	  shift += 7;
+	}
+      if ((byte & 0x80) == 0)
+	break;
+    }
+  if (shift < (sizeof (*r) * 8) && (byte & 0x40) != 0)
+    result |= -(((uint64_t) 1) << shift);
+
+  *r = result;
+  return p - buf;
+}
+
+/* Return the number of bytes to read to skip past an LEB128 number in BUF.
+   If the end isn't found before reaching BUF_END (or BUF already lies at or
+   beyond BUF_END), return zero.
+   Note: The result is a size_t count instead of a pointer to the next byte
+   to be read to avoid const-vs-non-const problems.  */
+
+static inline size_t
+skip_leb128 (const unsigned char *buf, const unsigned char *buf_end)
+{
+  const unsigned char *p = buf;
+  unsigned char byte;
+
+  while (1)
+    {
+      if (p >= buf_end)
+	return 0;
+
+      byte = *p++;
+      if ((byte & 0x80) == 0)
+	return p - buf;
+    }
+}
+
+#endif /* LEB128_H */
diff --git a/utils/gapy/gen-debug-info-src/ext/libbfd.a b/utils/gapy/gen-debug-info-src/ext/libbfd.a
new file mode 100644
index 000000000..5b848f563
Binary files /dev/null and b/utils/gapy/gen-debug-info-src/ext/libbfd.a differ
diff --git a/utils/gapy/gen-debug-info-src/ext/libiberty.a b/utils/gapy/gen-debug-info-src/ext/libiberty.a
new file mode 100644
index 000000000..b338123fa
Binary files /dev/null and b/utils/gapy/gen-debug-info-src/ext/libiberty.a differ
diff --git a/utils/gapy/gen-debug-info-src/ext/libiberty.h b/utils/gapy/gen-debug-info-src/ext/libiberty.h
new file mode 100644
index 000000000..a9c885fc6
--- /dev/null
+++ b/utils/gapy/gen-debug-info-src/ext/libiberty.h
@@ -0,0 +1,739 @@
+/* Function declarations for libiberty.
+
+   Copyright (C) 1997-2015 Free Software Foundation, Inc.
+   
+   Note - certain prototypes declared in this header file are for
+   functions whose implementation copyright does not belong to the
+   FSF.  Those prototypes are present in this file for reference
+   purposes only and their presence in this file should not be construed
+   as an indication of ownership by the FSF of the implementation of
+   those functions in any way or form whatsoever.
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 2, or (at your option)
+   any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 51 Franklin Street - Fifth Floor,
+   Boston, MA 02110-1301, USA.
+   
+   Written by Cygnus Support, 1994.
+
+   The libiberty library provides a number of functions which are
+   missing on some operating systems.  We do not declare those here,
+   to avoid conflicts with the system header files on operating
+   systems that do support those functions.  In this file we only
+   declare those functions which are specific to libiberty.  */
+
+#ifndef LIBIBERTY_H
+#define LIBIBERTY_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "ansidecl.h"
+
+/* Get a definition for size_t.  */
+#include <stddef.h>
+/* Get a definition for va_list.  */
+#include <stdarg.h>
+
+#include <stdio.h>
+
+/* If the OS supports it, ensure that the supplied stream is setup to
+   avoid any multi-threaded locking.  Otherwise leave the FILE pointer
+   unchanged.  If the stream is NULL do nothing.  */
+
+extern void unlock_stream (FILE *);
+
+/* If the OS supports it, ensure that the standard I/O streams, stdin,
+   stdout and stderr are setup to avoid any multi-threaded locking.
+   Otherwise do nothing.  */
+
+extern void unlock_std_streams (void);
+
+/* Open and return a FILE pointer.  If the OS supports it, ensure that
+   the stream is setup to avoid any multi-threaded locking.  Otherwise
+   return the FILE pointer unchanged.  */
+
+extern FILE *fopen_unlocked (const char *, const char *);
+extern FILE *fdopen_unlocked (int, const char *);
+extern FILE *freopen_unlocked (const char *, const char *, FILE *);
+
+/* Build an argument vector from a string.  Allocates memory using
+   malloc.  Use freeargv to free the vector.  */
+
+extern char **buildargv (const char *) ATTRIBUTE_MALLOC;
+
+/* Free a vector returned by buildargv.  */
+
+extern void freeargv (char **);
+
+/* Duplicate an argument vector. Allocates memory using malloc.  Use
+   freeargv to free the vector.  */
+
+extern char **dupargv (char * const *) ATTRIBUTE_MALLOC;
+
+/* Expand "@file" arguments in argv.  */
+
+extern void expandargv (int *, char ***);
+
+/* Write argv to an @-file, inserting necessary quoting.  */
+
+extern int writeargv (char * const *, FILE *);
+
+/* Return the number of elements in argv.  */
+
+extern int countargv (char * const *);
+
+/* Return the last component of a path name.  Note that we can't use a
+   prototype here because the parameter is declared inconsistently
+   across different systems, sometimes as "char *" and sometimes as
+   "const char *" */
+
+/* HAVE_DECL_* is a three-state macro: undefined, 0 or 1.  If it is
+   undefined, we haven't run the autoconf check so provide the
+   declaration without arguments.  If it is 0, we checked and failed
+   to find the declaration so provide a fully prototyped one.  If it
+   is 1, we found it so don't provide any declaration at all.  */
+#if !HAVE_DECL_BASENAME
+#if defined (__GNU_LIBRARY__ ) || defined (__linux__) \
+ || defined (__FreeBSD__) || defined (__OpenBSD__) || defined (__NetBSD__) \
+ || defined (__CYGWIN__) || defined (__CYGWIN32__) || defined (__MINGW32__) \
+ || defined (__DragonFly__) || defined (HAVE_DECL_BASENAME) 
+extern char *basename (const char *) ATTRIBUTE_RETURNS_NONNULL ATTRIBUTE_NONNULL(1);
+#else
+/* Do not allow basename to be used if there is no prototype seen.  We
+   either need to use the above prototype or have one from
+   autoconf which would result in HAVE_DECL_BASENAME being set.  */
+#define basename basename_cannot_be_used_without_a_prototype
+#endif
+#endif
+
+/* A well-defined basename () that is always compiled in.  */
+
+extern const char *lbasename (const char *) ATTRIBUTE_RETURNS_NONNULL ATTRIBUTE_NONNULL(1);
+
+/* Same, but assumes DOS semantics (drive name, backslash is also a
+   dir separator) regardless of host.  */
+
+extern const char *dos_lbasename (const char *) ATTRIBUTE_RETURNS_NONNULL ATTRIBUTE_NONNULL(1);
+
+/* Same, but assumes Unix semantics (absolute paths always start with
+   a slash, only forward slash is accepted as dir separator)
+   regardless of host.  */
+
+extern const char *unix_lbasename (const char *) ATTRIBUTE_RETURNS_NONNULL ATTRIBUTE_NONNULL(1);
+
+/* A well-defined realpath () that is always compiled in.  */
+
+extern char *lrealpath (const char *);
+
+/* Concatenate an arbitrary number of strings.  You must pass NULL as
+   the last argument of this function, to terminate the list of
+   strings.  Allocates memory using xmalloc.  */
+
+extern char *concat (const char *, ...) ATTRIBUTE_MALLOC ATTRIBUTE_RETURNS_NONNULL ATTRIBUTE_SENTINEL;
+
+/* Concatenate an arbitrary number of strings.  You must pass NULL as
+   the last argument of this function, to terminate the list of
+   strings.  Allocates memory using xmalloc.  The first argument is
+   not one of the strings to be concatenated, but if not NULL is a
+   pointer to be freed after the new string is created, similar to the
+   way xrealloc works.  */
+
+extern char *reconcat (char *, const char *, ...) ATTRIBUTE_MALLOC ATTRIBUTE_RETURNS_NONNULL ATTRIBUTE_SENTINEL;
+
+/* Determine the length of concatenating an arbitrary number of
+   strings.  You must pass NULL as the last argument of this function,
+   to terminate the list of strings.  */
+
+extern unsigned long concat_length (const char *, ...) ATTRIBUTE_SENTINEL;
+
+/* Concatenate an arbitrary number of strings into a SUPPLIED area of
+   memory.  You must pass NULL as the last argument of this function,
+   to terminate the list of strings.  The supplied memory is assumed
+   to be large enough.  */
+
+extern char *concat_copy (char *, const char *, ...) ATTRIBUTE_RETURNS_NONNULL ATTRIBUTE_NONNULL(1) ATTRIBUTE_SENTINEL;
+
+/* Concatenate an arbitrary number of strings into a GLOBAL area of
+   memory.  You must pass NULL as the last argument of this function,
+   to terminate the list of strings.  The supplied memory is assumed
+   to be large enough.  */
+
+extern char *concat_copy2 (const char *, ...) ATTRIBUTE_RETURNS_NONNULL ATTRIBUTE_SENTINEL;
+
+/* This is the global area used by concat_copy2.  */
+
+extern char *libiberty_concat_ptr;
+
+/* Concatenate an arbitrary number of strings.  You must pass NULL as
+   the last argument of this function, to terminate the list of
+   strings.  Allocates memory using alloca.  The arguments are
+   evaluated twice!  */
+#define ACONCAT(ACONCAT_PARAMS) \
+  (libiberty_concat_ptr = (char *) alloca (concat_length ACONCAT_PARAMS + 1), \
+   concat_copy2 ACONCAT_PARAMS)
+
+/* Check whether two file descriptors refer to the same file.  */
+
+extern int fdmatch (int fd1, int fd2);
+
+/* Return the position of the first bit set in the argument.  */
+/* Prototypes vary from system to system, so we only provide a
+   prototype on systems where we know that we need it.  */
+#if defined (HAVE_DECL_FFS) && !HAVE_DECL_FFS
+extern int ffs(int);
+#endif
+
+/* Get the working directory.  The result is cached, so don't call
+   chdir() between calls to getpwd().  */
+
+extern char * getpwd (void);
+
+/* Get the current time.  */
+/* Prototypes vary from system to system, so we only provide a
+   prototype on systems where we know that we need it.  */
+#ifdef __MINGW32__
+/* Forward declaration to avoid #include <sys/time.h>.   */
+struct timeval;
+extern int gettimeofday (struct timeval *, void *); 
+#endif
+
+/* Get the amount of time the process has run, in microseconds.  */
+
+extern long get_run_time (void);
+
+/* Generate a relocated path to some installation directory.  Allocates
+   return value using malloc.  */
+
+extern char *make_relative_prefix (const char *, const char *,
+                                   const char *) ATTRIBUTE_MALLOC;
+
+/* Generate a relocated path to some installation directory without
+   attempting to follow any soft links.  Allocates
+   return value using malloc.  */
+
+extern char *make_relative_prefix_ignore_links (const char *, const char *,
+						const char *) ATTRIBUTE_MALLOC;
+
+/* Returns a pointer to a directory path suitable for creating temporary
+   files in.  */
+
+extern const char *choose_tmpdir (void) ATTRIBUTE_RETURNS_NONNULL;
+
+/* Choose a temporary directory to use for scratch files.  */
+
+extern char *choose_temp_base (void) ATTRIBUTE_MALLOC ATTRIBUTE_RETURNS_NONNULL;
+
+/* Return a temporary file name or NULL if unable to create one.  */
+
+extern char *make_temp_file (const char *) ATTRIBUTE_MALLOC;
+
+/* Remove a link to a file unless it is special. */
+
+extern int unlink_if_ordinary (const char *);
+
+/* Allocate memory filled with spaces.  Allocates using malloc.  */
+
+extern const char *spaces (int count);
+
+/* Return the maximum error number for which strerror will return a
+   string.  */
+
+extern int errno_max (void);
+
+/* Return the name of an errno value (e.g., strerrno (EINVAL) returns
+   "EINVAL").  */
+
+extern const char *strerrno (int);
+
+/* Given the name of an errno value, return the value.  */
+
+extern int strtoerrno (const char *);
+
+/* ANSI's strerror(), but more robust.  */
+
+extern char *xstrerror (int) ATTRIBUTE_RETURNS_NONNULL;
+
+/* Return the maximum signal number for which strsignal will return a
+   string.  */
+
+extern int signo_max (void);
+
+/* Return a signal message string for a signal number
+   (e.g., strsignal (SIGHUP) returns something like "Hangup").  */
+/* This is commented out as it can conflict with one in system headers.
+   We still document its existence though.  */
+
+/*extern const char *strsignal (int);*/
+
+/* Return the name of a signal number (e.g., strsigno (SIGHUP) returns
+   "SIGHUP").  */
+
+extern const char *strsigno (int);
+
+/* Given the name of a signal, return its number.  */
+
+extern int strtosigno (const char *);
+
+/* Register a function to be run by xexit.  Returns 0 on success.  */
+
+extern int xatexit (void (*fn) (void));
+
+/* Exit, calling all the functions registered with xatexit.  */
+
+extern void xexit (int status) ATTRIBUTE_NORETURN;
+
+/* Set the program name used by xmalloc.  */
+
+extern void xmalloc_set_program_name (const char *);
+
+/* Report an allocation failure.  */
+extern void xmalloc_failed (size_t) ATTRIBUTE_NORETURN;
+
+/* Allocate memory without fail.  If malloc fails, this will print a
+   message to stderr (using the name set by xmalloc_set_program_name,
+   if any) and then call xexit.  */
+
+extern void *xmalloc (size_t) ATTRIBUTE_MALLOC ATTRIBUTE_RETURNS_NONNULL;
+
+/* Reallocate memory without fail.  This works like xmalloc.  Note,
+   realloc type functions are not suitable for attribute malloc since
+   they may return the same address across multiple calls. */
+
+extern void *xrealloc (void *, size_t) ATTRIBUTE_RETURNS_NONNULL;
+
+/* Allocate memory without fail and set it to zero.  This works like
+   xmalloc.  */
+
+extern void *xcalloc (size_t, size_t) ATTRIBUTE_MALLOC ATTRIBUTE_RETURNS_NONNULL;
+
+/* Copy a string into a memory buffer without fail.  */
+
+extern char *xstrdup (const char *) ATTRIBUTE_MALLOC ATTRIBUTE_RETURNS_NONNULL;
+
+/* Copy at most N characters from string into a buffer without fail.  */
+
+extern char *xstrndup (const char *, size_t) ATTRIBUTE_MALLOC ATTRIBUTE_RETURNS_NONNULL;
+
+/* Copy an existing memory buffer to a new memory buffer without fail.  */
+
+extern void *xmemdup (const void *, size_t, size_t) ATTRIBUTE_MALLOC ATTRIBUTE_RETURNS_NONNULL;
+
+/* Physical memory routines.  Return values are in BYTES.  */
+extern double physmem_total (void);
+extern double physmem_available (void);
+
+/* Compute the 32-bit CRC of a block of memory.  */
+extern unsigned int xcrc32 (const unsigned char *, int, unsigned int);
+
+/* These macros provide a K&R/C89/C++-friendly way of allocating structures
+   with nice encapsulation.  The XDELETE*() macros are technically
+   superfluous, but provided here for symmetry.  Using them consistently
+   makes it easier to update client code to use different allocators such
+   as new/delete and new[]/delete[].  */
+
+/* Scalar allocators.  */
+
+#define XALLOCA(T)		((T *) alloca (sizeof (T)))
+#define XNEW(T)			((T *) xmalloc (sizeof (T)))
+#define XCNEW(T)		((T *) xcalloc (1, sizeof (T)))
+#define XDUP(T, P)		((T *) xmemdup ((P), sizeof (T), sizeof (T)))
+#define XDELETE(P)		free ((void*) (P))
+
+/* Array allocators.  */
+
+#define XALLOCAVEC(T, N)	((T *) alloca (sizeof (T) * (N)))
+#define XNEWVEC(T, N)		((T *) xmalloc (sizeof (T) * (N)))
+#define XCNEWVEC(T, N)		((T *) xcalloc ((N), sizeof (T)))
+#define XDUPVEC(T, P, N)	((T *) xmemdup ((P), sizeof (T) * (N), sizeof (T) * (N)))
+#define XRESIZEVEC(T, P, N)	((T *) xrealloc ((void *) (P), sizeof (T) * (N)))
+#define XDELETEVEC(P)		free ((void*) (P))
+
+/* Allocators for variable-sized structures and raw buffers.  */
+
+#define XALLOCAVAR(T, S)	((T *) alloca ((S)))
+#define XNEWVAR(T, S)		((T *) xmalloc ((S)))
+#define XCNEWVAR(T, S)		((T *) xcalloc (1, (S)))
+#define XDUPVAR(T, P, S1, S2)	((T *) xmemdup ((P), (S1), (S2)))
+#define XRESIZEVAR(T, P, S)	((T *) xrealloc ((P), (S)))
+
+/* Type-safe obstack allocator.  */
+
+#define XOBNEW(O, T)		((T *) obstack_alloc ((O), sizeof (T)))
+#define XOBNEWVEC(O, T, N)	((T *) obstack_alloc ((O), sizeof (T) * (N)))
+#define XOBNEWVAR(O, T, S)	((T *) obstack_alloc ((O), (S)))
+#define XOBFINISH(O, T)         ((T) obstack_finish ((O)))
+
+/* hex character manipulation routines */
+
+#define _hex_array_size 256
+#define _hex_bad	99
+extern const unsigned char _hex_value[_hex_array_size];
+extern void hex_init (void);
+#define hex_p(c)	(hex_value (c) != _hex_bad)
+/* If you change this, note well: Some code relies on side effects in
+   the argument being performed exactly once.  */
+#define hex_value(c)	((unsigned int) _hex_value[(unsigned char) (c)])
+
+/* Flags for pex_init.  These are bits to be or'ed together.  */
+
+/* Record subprocess times, if possible.  */
+#define PEX_RECORD_TIMES	0x1
+
+/* Use pipes for communication between processes, if possible.  */
+#define PEX_USE_PIPES		0x2
+
+/* Save files used for communication between processes.  */
+#define PEX_SAVE_TEMPS		0x4
+
+/* Prepare to execute one or more programs, with standard output of
+   each program fed to standard input of the next.
+   FLAGS	As above.
+   PNAME	The name of the program to report in error messages.
+   TEMPBASE	A base name to use for temporary files; may be NULL to
+   		use a random name.
+   Does not return NULL; see ATTRIBUTE_RETURNS_NONNULL on the declaration.  */
+
+extern struct pex_obj *pex_init (int flags, const char *pname,
+				 const char *tempbase) ATTRIBUTE_RETURNS_NONNULL;
+
+/* Flags for pex_run.  These are bits to be or'ed together.  */
+
+/* Last program in pipeline.  Standard output of program goes to
+   OUTNAME, or, if OUTNAME is NULL, to standard output of caller.  Do
+   not set this if you want to call pex_read_output.  After this is
+   set, pex_run may no longer be called with the same struct
+   pex_obj.  */
+#define PEX_LAST		0x1
+
+/* Search for program in executable search path.  */
+#define PEX_SEARCH		0x2
+
+/* OUTNAME is a suffix.  */
+#define PEX_SUFFIX		0x4
+
+/* Send program's standard error to standard output.  */
+#define PEX_STDERR_TO_STDOUT	0x8
+
+/* Input file should be opened in binary mode.  This flag is ignored
+   on Unix.  */
+#define PEX_BINARY_INPUT	0x10
+
+/* Output file should be opened in binary mode.  This flag is ignored
+   on Unix.  For proper behaviour PEX_BINARY_INPUT and
+   PEX_BINARY_OUTPUT have to match appropriately--i.e., a call using
+   PEX_BINARY_OUTPUT should be followed by a call using
+   PEX_BINARY_INPUT.  */
+#define PEX_BINARY_OUTPUT	0x20
+
+/* Capture stderr to a pipe.  The output can be read by
+   calling pex_read_err and reading from the returned
+   FILE object.  This flag may be specified only for
+   the last program in a pipeline.  
+
+   This flag is supported only on Unix and Windows.  */
+#define PEX_STDERR_TO_PIPE	0x40
+
+/* Capture stderr in binary mode.  This flag is ignored
+   on Unix.  */
+#define PEX_BINARY_ERROR	0x80
+
+/* Append stdout to existing file instead of truncating it.  */
+#define PEX_STDOUT_APPEND	0x100
+
+/* The same as PEX_STDOUT_APPEND, but for STDERR.  */
+#define PEX_STDERR_APPEND	0x200
+
+/* Execute one program.  Returns NULL on success.  On error returns an
+   error string (typically just the name of a system call); the error
+   string is statically allocated.
+
+   OBJ		Returned by pex_init.
+
+   FLAGS	As above.
+
+   EXECUTABLE	The program to execute.
+
+   ARGV		NULL terminated array of arguments to pass to the program.
+
+   OUTNAME	Sets the output file name as follows:
+
+		PEX_SUFFIX set (OUTNAME may not be NULL):
+		  TEMPBASE parameter to pex_init not NULL:
+		    Output file name is the concatenation of TEMPBASE
+		    and OUTNAME.
+		  TEMPBASE is NULL:
+		    Output file name is a random file name ending in
+		    OUTNAME.
+		PEX_SUFFIX not set:
+		  OUTNAME not NULL:
+		    Output file name is OUTNAME.
+		  OUTNAME NULL, TEMPBASE not NULL:
+		    Output file name is randomly chosen using
+		    TEMPBASE.
+		  OUTNAME NULL, TEMPBASE NULL:
+		    Output file name is randomly chosen.
+
+		If PEX_LAST is not set, the output file name is the
+   		name to use for a temporary file holding stdout, if
+   		any (there will not be a file if PEX_USE_PIPES is set
+   		and the system supports pipes).  If a file is used, it
+   		will be removed when no longer needed unless
+   		PEX_SAVE_TEMPS is set.
+
+		If PEX_LAST is set, and OUTNAME is not NULL, standard
+   		output is written to the output file name.  The file
+   		will not be removed.  If PEX_LAST and PEX_SUFFIX are
+   		both set, TEMPBASE may not be NULL.
+
+   ERRNAME	If not NULL, this is the name of a file to which
+		standard error is written.  If NULL, standard error of
+		the program is standard error of the caller.
+
+   ERR		On an error return, *ERR is set to an errno value, or
+   		to 0 if there is no relevant errno.
+*/
+
+extern const char *pex_run (struct pex_obj *obj, int flags,
+			    const char *executable, char * const *argv,
+			    const char *outname, const char *errname,
+			    int *err);
+
+/* As for pex_run (), but takes an extra parameter to enable the
+   environment for the child process to be specified.
+
+   ENV		The environment for the child process, specified as
+		an array of character pointers.  Each element of the
+		array should point to a string of the form VAR=VALUE,
+                with the exception of the last element which must be
+                a null pointer.
+*/
+
+extern const char *pex_run_in_environment (struct pex_obj *obj, int flags,
+			                   const char *executable,
+                                           char * const *argv,
+                                           char * const *env,
+              	          		   const char *outname,
+					   const char *errname, int *err);
+
+/* Return a stream for a temporary file to pass to the first program
+   in the pipeline as input.  The file name is chosen as for pex_run.
+   pex_run closes the file automatically; don't close it yourself.  */
+
+extern FILE *pex_input_file (struct pex_obj *obj, int flags,
+                             const char *in_name);
+
+/* Return a stream for a pipe connected to the standard input of the
+   first program in the pipeline.  You must have passed
+   `PEX_USE_PIPES' to `pex_init'.  Close the returned stream
+   yourself.  */
+
+extern FILE *pex_input_pipe (struct pex_obj *obj, int binary);
+
+/* Read the standard output of the last program to be executed.
+   pex_run can not be called after this.  BINARY should be non-zero if
+   the file should be opened in binary mode; this is ignored on Unix.
+   Returns NULL on error.  Don't call fclose on the returned FILE; it
+   will be closed by pex_free.  */
+
+extern FILE *pex_read_output (struct pex_obj *, int binary);
+
+/* Read the standard error of the last program to be executed.
+   pex_run can not be called after this.  BINARY should be non-zero if
+   the file should be opened in binary mode; this is ignored on Unix.
+   Returns NULL on error.  Don't call fclose on the returned FILE; it
+   will be closed by pex_free.  */
+
+extern FILE *pex_read_err (struct pex_obj *, int binary);
+
+/* Return exit status of all programs in VECTOR.  COUNT indicates the
+   size of VECTOR.  The status codes in the vector are in the order of
+   the calls to pex_run.  Returns 0 on error, 1 on success.  */
+
+extern int pex_get_status (struct pex_obj *, int count, int *vector);
+
+/* Return times of all programs in VECTOR.  COUNT indicates the size
+   of VECTOR.  struct pex_time is really just struct timeval, but that
+   is not portable to all systems.  Returns 0 on error, 1 on
+   success.  */
+
+struct pex_time
+{
+  unsigned long user_seconds;
+  unsigned long user_microseconds;
+  unsigned long system_seconds;
+  unsigned long system_microseconds;
+};
+
+extern int pex_get_times (struct pex_obj *, int count,
+			  struct pex_time *vector);
+
+/* Clean up a pex_obj.  If you have not called pex_get_times or
+   pex_get_status, this will try to kill the subprocesses.  */
+
+extern void pex_free (struct pex_obj *);
+
+/* Just execute one program.  Return value is as for pex_run.
+   FLAGS	Combination of PEX_SEARCH and PEX_STDERR_TO_STDOUT.
+   EXECUTABLE	As for pex_run.
+   ARGV		As for pex_run.
+   PNAME	As for pex_init.
+   OUTNAME	As for pex_run when PEX_LAST is set.
+   ERRNAME	As for pex_run.
+   STATUS	Set to exit status on success.
+   ERR		As for pex_run.
+*/
+
+extern const char *pex_one (int flags, const char *executable,
+			    char * const *argv, const char *pname,
+			    const char *outname, const char *errname,
+			    int *status, int *err);
+
+/* pexecute and pwait are the old pexecute interface, still here for
+   backward compatibility.  Don't use these for new code.  Instead,
+   use pex_init/pex_run/pex_get_status/pex_free, or pex_one.  */
+
+/* Definitions used by the pexecute routine.  */
+
+#define PEXECUTE_FIRST   1
+#define PEXECUTE_LAST    2
+#define PEXECUTE_ONE     (PEXECUTE_FIRST + PEXECUTE_LAST)
+#define PEXECUTE_SEARCH  4
+#define PEXECUTE_VERBOSE 8
+
+/* Execute a program.  */
+
+extern int pexecute (const char *, char * const *, const char *,
+                     const char *, char **, char **, int);
+
+/* Wait for pexecute to finish.  */
+
+extern int pwait (int, int *, int);
+
+#if defined(HAVE_DECL_ASPRINTF) && !HAVE_DECL_ASPRINTF
+/* Like sprintf but provides a pointer to malloc'd storage, which must
+   be freed by the caller.  */
+
+extern int asprintf (char **, const char *, ...) ATTRIBUTE_PRINTF_2;
+#endif
+
+/* Like asprintf but allocates memory without fail. This works like
+   xmalloc.  */
+
+extern char *xasprintf (const char *, ...) ATTRIBUTE_MALLOC ATTRIBUTE_PRINTF_1;
+
+#if defined(HAVE_DECL_VASPRINTF) && !HAVE_DECL_VASPRINTF
+/* Like vsprintf but provides a pointer to malloc'd storage, which
+   must be freed by the caller.  */
+
+extern int vasprintf (char **, const char *, va_list) ATTRIBUTE_PRINTF(2,0);
+#endif
+
+/* Like vasprintf but allocates memory without fail. This works like
+   xmalloc.  */
+
+extern char *xvasprintf (const char *, va_list) ATTRIBUTE_MALLOC ATTRIBUTE_PRINTF(1,0);
+
+#if defined(HAVE_DECL_SNPRINTF) && !HAVE_DECL_SNPRINTF
+/* Like sprintf but prints at most N characters.  */
+extern int snprintf (char *, size_t, const char *, ...) ATTRIBUTE_PRINTF_3;
+#endif
+
+#if defined(HAVE_DECL_VSNPRINTF) && !HAVE_DECL_VSNPRINTF
+/* Like vsprintf but prints at most N characters.  */
+extern int vsnprintf (char *, size_t, const char *, va_list) ATTRIBUTE_PRINTF(3,0);
+#endif
+
+#if defined (HAVE_DECL_STRNLEN) && !HAVE_DECL_STRNLEN
+extern size_t strnlen (const char *, size_t);
+#endif
+
+#if defined(HAVE_DECL_STRVERSCMP) && !HAVE_DECL_STRVERSCMP
+/* Compare version strings.  */
+extern int strverscmp (const char *, const char *);
+#endif
+
+#if defined(HAVE_DECL_STRTOL) && !HAVE_DECL_STRTOL
+extern long int strtol (const char *nptr,
+                        char **endptr, int base);
+#endif
+
+#if defined(HAVE_DECL_STRTOUL) && !HAVE_DECL_STRTOUL
+extern unsigned long int strtoul (const char *nptr,
+                                  char **endptr, int base);
+#endif
+
+#if defined(HAVE_LONG_LONG) && defined(HAVE_DECL_STRTOLL) && !HAVE_DECL_STRTOLL
+__extension__
+extern long long int strtoll (const char *nptr,
+                              char **endptr, int base);
+#endif
+
+#if defined(HAVE_LONG_LONG) && defined(HAVE_DECL_STRTOULL) && !HAVE_DECL_STRTOULL
+__extension__
+extern unsigned long long int strtoull (const char *nptr,
+                                        char **endptr, int base);
+#endif
+
+#if defined(HAVE_DECL_STRVERSCMP) && !HAVE_DECL_STRVERSCMP
+/* Compare version strings.  */
+extern int strverscmp (const char *, const char *);
+#endif
+
+/* Set the title of a process */
+extern void setproctitle (const char *name, ...);
+
+/* Increase stack limit if possible.  */
+extern void stack_limit_increase (unsigned long);
+
+#define ARRAY_SIZE(a) (sizeof (a) / sizeof ((a)[0]))
+
+/* Drastically simplified alloca configurator.  If we're using GCC,
+   we use __builtin_alloca; otherwise we use the C alloca.  The C
+   alloca is always available.  You can override GCC by defining
+   USE_C_ALLOCA yourself.  The canonical autoconf macro C_ALLOCA is
+   also set/unset as it is often used to indicate whether code needs
+   to call alloca(0).  */
+extern void *C_alloca (size_t) ATTRIBUTE_MALLOC;
+#undef alloca
+#if GCC_VERSION >= 2000 && !defined USE_C_ALLOCA
+# define alloca(x) __builtin_alloca(x)
+# undef C_ALLOCA
+# define ASTRDUP(X) \
+  (__extension__ ({ const char *const libiberty_optr = (X); \
+   const unsigned long libiberty_len = strlen (libiberty_optr) + 1; \
+   char *const libiberty_nptr = (char *const) alloca (libiberty_len); \
+   (char *) memcpy (libiberty_nptr, libiberty_optr, libiberty_len); }))
+#else
+# define alloca(x) C_alloca(x)
+# undef USE_C_ALLOCA
+# define USE_C_ALLOCA 1
+# undef C_ALLOCA
+# define C_ALLOCA 1
+extern const char *libiberty_optr;
+extern char *libiberty_nptr;
+extern unsigned long libiberty_len;
+# define ASTRDUP(X) \
+  (libiberty_optr = (X), \
+   libiberty_len = strlen (libiberty_optr) + 1, \
+   libiberty_nptr = (char *) alloca (libiberty_len), \
+   (char *) memcpy (libiberty_nptr, libiberty_optr, libiberty_len))
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+
+#endif /* ! defined (LIBIBERTY_H) */
diff --git a/utils/gapy/gen-debug-info-src/ext/longlong.h b/utils/gapy/gen-debug-info-src/ext/longlong.h
new file mode 100644
index 000000000..b25a59462
--- /dev/null
+++ b/utils/gapy/gen-debug-info-src/ext/longlong.h
@@ -0,0 +1,1758 @@
+/* longlong.h -- definitions for mixed size 32/64 bit arithmetic.
+   Copyright (C) 1991-2015 Free Software Foundation, Inc.
+
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   In addition to the permissions in the GNU Lesser General Public
+   License, the Free Software Foundation gives you unlimited
+   permission to link the compiled version of this file into
+   combinations with other programs, and to distribute those
+   combinations without any restriction coming from the use of this
+   file.  (The Lesser General Public License restrictions do apply in
+   other respects; for example, they cover modification of the file,
+   and distribution when not linked into a combine executable.)
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* You have to define the following before including this file:
+
+   UWtype -- An unsigned type, default type for operations (typically a "word")
+   UHWtype -- An unsigned type, at least half the size of UWtype.
+   UDWtype -- An unsigned type, at least twice as large as UWtype
+   W_TYPE_SIZE -- size in bits of UWtype
+
+   UQItype -- Unsigned 8 bit type.
+   SItype, USItype -- Signed and unsigned 32 bit types.
+   DItype, UDItype -- Signed and unsigned 64 bit types.
+
+   On a 32 bit machine UWtype should typically be USItype;
+   on a 64 bit machine, UWtype should typically be UDItype.  */
+
+#define __BITS4 (W_TYPE_SIZE / 4)
+#define __ll_B ((UWtype) 1 << (W_TYPE_SIZE / 2))
+#define __ll_lowpart(t) ((UWtype) (t) & (__ll_B - 1))
+#define __ll_highpart(t) ((UWtype) (t) >> (W_TYPE_SIZE / 2))
+
+#ifndef W_TYPE_SIZE
+#define W_TYPE_SIZE	32
+#define UWtype		USItype
+#define UHWtype		USItype
+#define UDWtype		UDItype
+#endif
+
+/* Used in glibc only.  */
+#ifndef attribute_hidden
+#define attribute_hidden
+#endif
+
+extern const UQItype __clz_tab[256] attribute_hidden;
+
+/* Define auxiliary asm macros.
+
+   1) umul_ppmm(high_prod, low_prod, multiplier, multiplicand) multiplies two
+   UWtype integers MULTIPLIER and MULTIPLICAND, and generates a two UWtype
+   word product in HIGH_PROD and LOW_PROD.
+
+   2) __umulsidi3(a,b) multiplies two UWtype integers A and B, and returns a
+   UDWtype product.  This is just a variant of umul_ppmm.
+
+   3) udiv_qrnnd(quotient, remainder, high_numerator, low_numerator,
+   denominator) divides a UDWtype, composed by the UWtype integers
+   HIGH_NUMERATOR and LOW_NUMERATOR, by DENOMINATOR and places the quotient
+   in QUOTIENT and the remainder in REMAINDER.  HIGH_NUMERATOR must be less
+   than DENOMINATOR for correct operation.  If, in addition, the most
+   significant bit of DENOMINATOR must be 1, then the pre-processor symbol
+   UDIV_NEEDS_NORMALIZATION is defined to 1.
+
+   4) sdiv_qrnnd(quotient, remainder, high_numerator, low_numerator,
+   denominator).  Like udiv_qrnnd but the numbers are signed.  The quotient
+   is rounded towards 0.
+
+   5) count_leading_zeros(count, x) counts the number of zero-bits from the
+   msb to the first nonzero bit in the UWtype X.  This is the number of
+   steps X needs to be shifted left to set the msb.  Undefined for X == 0,
+   unless the symbol COUNT_LEADING_ZEROS_0 is defined to some value.
+
+   6) count_trailing_zeros(count, x) like count_leading_zeros, but counts
+   from the least significant end.
+
+   7) add_ssaaaa(high_sum, low_sum, high_addend_1, low_addend_1,
+   high_addend_2, low_addend_2) adds two UWtype integers, composed by
+   HIGH_ADDEND_1 and LOW_ADDEND_1, and HIGH_ADDEND_2 and LOW_ADDEND_2
+   respectively.  The result is placed in HIGH_SUM and LOW_SUM.  Overflow
+   (i.e. carry out) is not stored anywhere, and is lost.
+
+   8) sub_ddmmss(high_difference, low_difference, high_minuend, low_minuend,
+   high_subtrahend, low_subtrahend) subtracts two two-word UWtype integers,
+   composed by HIGH_MINUEND and LOW_MINUEND, and HIGH_SUBTRAHEND and
+   LOW_SUBTRAHEND respectively.  The result is placed in HIGH_DIFFERENCE
+   and LOW_DIFFERENCE.  Overflow (i.e. carry out) is not stored anywhere,
+   and is lost.
+
+   If any of these macros are left undefined for a particular CPU,
+   C macros are used.  */
+
+/* The CPUs come in alphabetical order below.
+
+   Please add support for more CPUs here, or improve the current support
+   for the CPUs below!
+   (E.g. WE32100, IBM360.)  */
+
+#if defined (__GNUC__) && !defined (NO_ASM)
+
+/* We sometimes need to clobber "cc" with gcc2, but that would not be
+   understood by gcc1.  Use cpp to avoid major code duplication.  */
+#if __GNUC__ < 2
+#define __CLOBBER_CC
+#define __AND_CLOBBER_CC
+#else /* __GNUC__ >= 2 */
+#define __CLOBBER_CC : "cc"
+#define __AND_CLOBBER_CC , "cc"
+#endif /* __GNUC__ < 2 */
+
+#if defined (__aarch64__)
+
+#if W_TYPE_SIZE == 32
+#define count_leading_zeros(COUNT, X)	((COUNT) = __builtin_clz (X))
+#define count_trailing_zeros(COUNT, X)   ((COUNT) = __builtin_ctz (X))
+#define COUNT_LEADING_ZEROS_0 32
+#endif /* W_TYPE_SIZE == 32 */
+
+#if W_TYPE_SIZE == 64
+#define count_leading_zeros(COUNT, X)	((COUNT) = __builtin_clzll (X))
+#define count_trailing_zeros(COUNT, X)   ((COUNT) = __builtin_ctzll (X))
+#define COUNT_LEADING_ZEROS_0 64
+#endif /* W_TYPE_SIZE == 64 */
+
+#endif /* __aarch64__ */
+
+#if defined (__alpha) && W_TYPE_SIZE == 64
+/* There is a bug in g++ before version 5 that
+   errors on __builtin_alpha_umulh.  */
+#if !defined(__cplusplus) || __GNUC__ >= 5
+#define umul_ppmm(ph, pl, m0, m1) \
+  do {									\
+    UDItype __m0 = (m0), __m1 = (m1);					\
+    (ph) = __builtin_alpha_umulh (__m0, __m1);				\
+    (pl) = __m0 * __m1;							\
+  } while (0)
+#define UMUL_TIME 46
+#endif /* !c++ */
+#ifndef LONGLONG_STANDALONE
+#define udiv_qrnnd(q, r, n1, n0, d) \
+  do { UDItype __r;							\
+    (q) = __udiv_qrnnd (&__r, (n1), (n0), (d));				\
+    (r) = __r;								\
+  } while (0)
+extern UDItype __udiv_qrnnd (UDItype *, UDItype, UDItype, UDItype);
+#define UDIV_TIME 220
+#endif /* LONGLONG_STANDALONE */
+#ifdef __alpha_cix__
+#define count_leading_zeros(COUNT,X)	((COUNT) = __builtin_clzl (X))
+#define count_trailing_zeros(COUNT,X)	((COUNT) = __builtin_ctzl (X))
+#define COUNT_LEADING_ZEROS_0 64
+#else
+#define count_leading_zeros(COUNT,X) \
+  do {									\
+    UDItype __xr = (X), __t, __a;					\
+    __t = __builtin_alpha_cmpbge (0, __xr);				\
+    __a = __clz_tab[__t ^ 0xff] - 1;					\
+    __t = __builtin_alpha_extbl (__xr, __a);				\
+    (COUNT) = 64 - (__clz_tab[__t] + __a*8);				\
+  } while (0)
+#define count_trailing_zeros(COUNT,X) \
+  do {									\
+    UDItype __xr = (X), __t, __a;					\
+    __t = __builtin_alpha_cmpbge (0, __xr);				\
+    __t = ~__t & -~__t;							\
+    __a = ((__t & 0xCC) != 0) * 2;					\
+    __a += ((__t & 0xF0) != 0) * 4;					\
+    __a += ((__t & 0xAA) != 0);						\
+    __t = __builtin_alpha_extbl (__xr, __a);				\
+    __a <<= 3;								\
+    __t &= -__t;							\
+    __a += ((__t & 0xCC) != 0) * 2;					\
+    __a += ((__t & 0xF0) != 0) * 4;					\
+    __a += ((__t & 0xAA) != 0);						\
+    (COUNT) = __a;							\
+  } while (0)
+#endif /* __alpha_cix__ */
+#endif /* __alpha */
+
+#if defined (__arc__) && W_TYPE_SIZE == 32
+#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
+  __asm__ ("add.f	%1, %4, %5\n\tadc	%0, %2, %3"		\
+	   : "=r" ((USItype) (sh)),					\
+	     "=&r" ((USItype) (sl))					\
+	   : "%r" ((USItype) (ah)),					\
+	     "rICal" ((USItype) (bh)),					\
+	     "%r" ((USItype) (al)),					\
+	     "rICal" ((USItype) (bl)))
+#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
+  __asm__ ("sub.f	%1, %4, %5\n\tsbc	%0, %2, %3"		\
+	   : "=r" ((USItype) (sh)),					\
+	     "=&r" ((USItype) (sl))					\
+	   : "r" ((USItype) (ah)),					\
+	     "rICal" ((USItype) (bh)),					\
+	     "r" ((USItype) (al)),					\
+	     "rICal" ((USItype) (bl)))
+
+#define __umulsidi3(u,v) ((UDItype)(USItype)(u)*(USItype)(v)) /* USItype x USItype -> UDItype product */
+#ifdef __ARC_NORM__
+#define count_leading_zeros(count, x) \
+  do									\
+    {									\
+      SItype c_;							\
+									\
+      __asm__ ("norm.f\t%0,%1\n\tmov.mi\t%0,-1" : "=r" (c_) : "r" (x) : "cc");\
+      (count) = c_ + 1;							\
+    }									\
+  while (0)
+#define COUNT_LEADING_ZEROS_0 32
+#endif /* __ARC_NORM__ */
+#endif /* __arc__ */
+
+#if defined (__arm__) && (defined (__thumb2__) || !defined (__thumb__)) \
+ && W_TYPE_SIZE == 32
+#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
+  __asm__ ("adds	%1, %4, %5\n\tadc	%0, %2, %3"		\
+	   : "=r" ((USItype) (sh)),					\
+	     "=&r" ((USItype) (sl))					\
+	   : "%r" ((USItype) (ah)),					\
+	     "rI" ((USItype) (bh)),					\
+	     "%r" ((USItype) (al)),					\
+	     "rI" ((USItype) (bl)) __CLOBBER_CC)
+#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
+  __asm__ ("subs	%1, %4, %5\n\tsbc	%0, %2, %3"		\
+	   : "=r" ((USItype) (sh)),					\
+	     "=&r" ((USItype) (sl))					\
+	   : "r" ((USItype) (ah)),					\
+	     "rI" ((USItype) (bh)),					\
+	     "r" ((USItype) (al)),					\
+	     "rI" ((USItype) (bl)) __CLOBBER_CC)
+# if defined(__ARM_ARCH_2__) || defined(__ARM_ARCH_2A__) \
+     || defined(__ARM_ARCH_3__)
+#  define umul_ppmm(xh, xl, a, b)					\
+  do {									\
+    register USItype __t0, __t1, __t2;					\
+    __asm__ ("%@ Inlined umul_ppmm\n"					\
+	   "	mov	%2, %5, lsr #16\n"				\
+	   "	mov	%0, %6, lsr #16\n"				\
+	   "	bic	%3, %5, %2, lsl #16\n"				\
+	   "	bic	%4, %6, %0, lsl #16\n"				\
+	   "	mul	%1, %3, %4\n"					\
+	   "	mul	%4, %2, %4\n"					\
+	   "	mul	%3, %0, %3\n"					\
+	   "	mul	%0, %2, %0\n"					\
+	   "	adds	%3, %4, %3\n"					\
+	   "	addcs	%0, %0, #65536\n"				\
+	   "	adds	%1, %1, %3, lsl #16\n"				\
+	   "	adc	%0, %0, %3, lsr #16"				\
+	   : "=&r" ((USItype) (xh)),					\
+	     "=r" ((USItype) (xl)),					\
+	     "=&r" (__t0), "=&r" (__t1), "=r" (__t2)			\
+	   : "r" ((USItype) (a)),					\
+	     "r" ((USItype) (b)) __CLOBBER_CC );			\
+  } while (0)
+#  define UMUL_TIME 20
+# else
+#  define umul_ppmm(xh, xl, a, b)					\
+  do {									\
+    /* Generate umull, under compiler control.  */			\
+    register UDItype __t0 = (UDItype)(USItype)(a) * (USItype)(b);	\
+    (xl) = (USItype)__t0;						\
+    (xh) = (USItype)(__t0 >> 32);					\
+  } while (0)
+#  define UMUL_TIME 3
+# endif
+# define UDIV_TIME 100
+#endif /* __arm__ */
+
+#if defined(__arm__)
+/* Let gcc decide how best to implement count_leading_zeros.  */
+#define count_leading_zeros(COUNT,X)	((COUNT) = __builtin_clz (X))
+#define count_trailing_zeros(COUNT,X)   ((COUNT) = __builtin_ctz (X))
+#define COUNT_LEADING_ZEROS_0 32
+#endif
+
+#if defined (__AVR__)
+
+#if W_TYPE_SIZE == 16
+#define count_leading_zeros(COUNT,X)  ((COUNT) = __builtin_clz (X))
+#define count_trailing_zeros(COUNT,X) ((COUNT) = __builtin_ctz (X))
+#define COUNT_LEADING_ZEROS_0 16
+#endif /* W_TYPE_SIZE == 16 */
+
+#if W_TYPE_SIZE == 32
+#define count_leading_zeros(COUNT,X)  ((COUNT) = __builtin_clzl (X))
+#define count_trailing_zeros(COUNT,X) ((COUNT) = __builtin_ctzl (X))
+#define COUNT_LEADING_ZEROS_0 32
+#endif /* W_TYPE_SIZE == 32 */
+
+#if W_TYPE_SIZE == 64
+#define count_leading_zeros(COUNT,X)  ((COUNT) = __builtin_clzll (X))
+#define count_trailing_zeros(COUNT,X) ((COUNT) = __builtin_ctzll (X))
+#define COUNT_LEADING_ZEROS_0 64
+#endif /* W_TYPE_SIZE == 64 */
+
+#endif /* defined (__AVR__) */
+
+#if defined (__CRIS__)
+
+#if __CRIS_arch_version >= 3
+#define count_leading_zeros(COUNT, X) ((COUNT) = __builtin_clz (X))
+#define COUNT_LEADING_ZEROS_0 32
+#endif /* __CRIS_arch_version >= 3 */
+
+#if __CRIS_arch_version >= 8
+#define count_trailing_zeros(COUNT, X) ((COUNT) = __builtin_ctz (X))
+#endif /* __CRIS_arch_version >= 8 */
+
+#if __CRIS_arch_version >= 10
+#define __umulsidi3(u,v) ((UDItype)(USItype) (u) * (UDItype)(USItype) (v))
+#else
+#define __umulsidi3 __umulsidi3
+extern UDItype __umulsidi3 (USItype, USItype);
+#endif /* __CRIS_arch_version >= 10 */
+
+#define umul_ppmm(w1, w0, u, v)		\
+  do {					\
+    UDItype __x = __umulsidi3 (u, v);	\
+    (w0) = (USItype) (__x);		\
+    (w1) = (USItype) (__x >> 32);	\
+  } while (0)
+
+/* FIXME: defining add_ssaaaa and sub_ddmmss should be advantageous for
+   DFmode ("double" intrinsics, avoiding two of the three insns handling
+   carry), but defining them as open-code C composing and doing the
+   operation in DImode (UDImode) shows that the DImode needs work:
+   register pressure from requiring neighboring registers and the
+   traffic to and from them come to dominate, in the 4.7 series.  */
+
+#endif /* defined (__CRIS__) */
+
+#if defined (__hppa) && W_TYPE_SIZE == 32
+#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
+  __asm__ ("add %4,%5,%1\n\taddc %2,%3,%0"				\
+	   : "=r" ((USItype) (sh)),					\
+	     "=&r" ((USItype) (sl))					\
+	   : "%rM" ((USItype) (ah)),					\
+	     "rM" ((USItype) (bh)),					\
+	     "%rM" ((USItype) (al)),					\
+	     "rM" ((USItype) (bl)))
+#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
+  __asm__ ("sub %4,%5,%1\n\tsubb %2,%3,%0"				\
+	   : "=r" ((USItype) (sh)),					\
+	     "=&r" ((USItype) (sl))					\
+	   : "rM" ((USItype) (ah)),					\
+	     "rM" ((USItype) (bh)),					\
+	     "rM" ((USItype) (al)),					\
+	     "rM" ((USItype) (bl)))
+#if defined (_PA_RISC1_1)
+#define umul_ppmm(w1, w0, u, v) \
+  do {									\
+    union								\
+      {									\
+	UDItype __f;							\
+	struct {USItype __w1, __w0;} __w1w0;				\
+      } __t;								\
+    __asm__ ("xmpyu %1,%2,%0"						\
+	     : "=x" (__t.__f)						\
+	     : "x" ((USItype) (u)),					\
+	       "x" ((USItype) (v)));					\
+    (w1) = __t.__w1w0.__w1;						\
+    (w0) = __t.__w1w0.__w0;						\
+     } while (0)
+#define UMUL_TIME 8
+#else
+#define UMUL_TIME 30
+#endif
+#define UDIV_TIME 40
+#define count_leading_zeros(count, x) \
+  do {									\
+    USItype __tmp;							\
+    __asm__ (								\
+       "ldi		1,%0\n"						\
+"	extru,=		%1,15,16,%%r0		; Bits 31..16 zero?\n"	\
+"	extru,tr	%1,15,16,%1		; No.  Shift down, skip add.\n"\
+"	ldo		16(%0),%0		; Yes.  Perform add.\n"	\
+"	extru,=		%1,23,8,%%r0		; Bits 15..8 zero?\n"	\
+"	extru,tr	%1,23,8,%1		; No.  Shift down, skip add.\n"\
+"	ldo		8(%0),%0		; Yes.  Perform add.\n"	\
+"	extru,=		%1,27,4,%%r0		; Bits 7..4 zero?\n"	\
+"	extru,tr	%1,27,4,%1		; No.  Shift down, skip add.\n"\
+"	ldo		4(%0),%0		; Yes.  Perform add.\n"	\
+"	extru,=		%1,29,2,%%r0		; Bits 3..2 zero?\n"	\
+"	extru,tr	%1,29,2,%1		; No.  Shift down, skip add.\n"\
+"	ldo		2(%0),%0		; Yes.  Perform add.\n"	\
+"	extru		%1,30,1,%1		; Extract bit 1.\n"	\
+"	sub		%0,%1,%0		; Subtract it.\n"	\
+	: "=r" (count), "=r" (__tmp) : "1" (x));			\
+  } while (0)
+#endif
+
+#if (defined (__i370__) || defined (__s390__) || defined (__mvs__)) && W_TYPE_SIZE == 32
+#if !defined (__zarch__)
+#define smul_ppmm(xh, xl, m0, m1) \
+  do {									\
+    union {DItype __ll;							\
+	   struct {USItype __h, __l;} __i;				\
+	  } __x;							\
+    __asm__ ("lr %N0,%1\n\tmr %0,%2"					\
+	     : "=&r" (__x.__ll)						\
+	     : "r" (m0), "r" (m1));					\
+    (xh) = __x.__i.__h; (xl) = __x.__i.__l;				\
+  } while (0)
+#define sdiv_qrnnd(q, r, n1, n0, d) \
+  do {									\
+    union {DItype __ll;							\
+	   struct {USItype __h, __l;} __i;				\
+	  } __x;							\
+    __x.__i.__h = n1; __x.__i.__l = n0;					\
+    __asm__ ("dr %0,%2"							\
+	     : "=r" (__x.__ll)						\
+	     : "0" (__x.__ll), "r" (d));				\
+    (q) = __x.__i.__l; (r) = __x.__i.__h;				\
+  } while (0)
+#else
+#define smul_ppmm(xh, xl, m0, m1) \
+  do {                                                                  \
+    register SItype __r0 __asm__ ("0");					\
+    register SItype __r1 __asm__ ("1") = (m0);				\
+									\
+    __asm__ ("mr\t%%r0,%3"                                              \
+	     : "=r" (__r0), "=r" (__r1)					\
+	     : "r"  (__r1),  "r" (m1));					\
+    (xh) = __r0; (xl) = __r1;						\
+  } while (0)
+
+#define sdiv_qrnnd(q, r, n1, n0, d) \
+  do {									\
+    register SItype __r0 __asm__ ("0") = (n1);				\
+    register SItype __r1 __asm__ ("1") = (n0);				\
+									\
+    __asm__ ("dr\t%%r0,%4"                                              \
+	     : "=r" (__r0), "=r" (__r1)					\
+	     : "r" (__r0), "r" (__r1), "r" (d));			\
+    (q) = __r1; (r) = __r0;						\
+  } while (0)
+#endif /* __zarch__ */
+#endif
+
+#if (defined (__i386__) || defined (__i486__)) && W_TYPE_SIZE == 32
+#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
+  __asm__ ("add{l} {%5,%1|%1,%5}\n\tadc{l} {%3,%0|%0,%3}"		\
+	   : "=r" ((USItype) (sh)),					\
+	     "=&r" ((USItype) (sl))					\
+	   : "%0" ((USItype) (ah)),					\
+	     "g" ((USItype) (bh)),					\
+	     "%1" ((USItype) (al)),					\
+	     "g" ((USItype) (bl)))
+#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
+  __asm__ ("sub{l} {%5,%1|%1,%5}\n\tsbb{l} {%3,%0|%0,%3}"		\
+	   : "=r" ((USItype) (sh)),					\
+	     "=&r" ((USItype) (sl))					\
+	   : "0" ((USItype) (ah)),					\
+	     "g" ((USItype) (bh)),					\
+	     "1" ((USItype) (al)),					\
+	     "g" ((USItype) (bl)))
+#define umul_ppmm(w1, w0, u, v) \
+  __asm__ ("mul{l} %3"							\
+	   : "=a" ((USItype) (w0)),	/* low word: "a" register */	\
+	     "=d" ((USItype) (w1))	/* high word: "d" register */	\
+	   : "%0" ((USItype) (u)),	/* u shares w0's register */	\
+	     "rm" ((USItype) (v)))
+#define udiv_qrnnd(q, r, n1, n0, dv) \
+  __asm__ ("div{l} %4"							\
+	   : "=a" ((USItype) (q)),	/* quotient: "a" register */	\
+	     "=d" ((USItype) (r))	/* remainder: "d" register */	\
+	   : "0" ((USItype) (n0)),	/* low dividend word, in "a" */	\
+	     "1" ((USItype) (n1)),	/* high dividend word, in "d" */ \
+	     "rm" ((USItype) (dv)))
+#define count_leading_zeros(count, x)	((count) = __builtin_clz (x))
+#define count_trailing_zeros(count, x)	((count) = __builtin_ctz (x))
+#define UMUL_TIME 40
+#define UDIV_TIME 40
+#endif /* 80x86 */
+
+#if defined (__x86_64__) && W_TYPE_SIZE == 64
+#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
+  __asm__ ("add{q} {%5,%1|%1,%5}\n\tadc{q} {%3,%0|%0,%3}"		\
+	   : "=r" ((UDItype) (sh)),					\
+	     "=&r" ((UDItype) (sl))					\
+	   : "%0" ((UDItype) (ah)),					\
+	     "rme" ((UDItype) (bh)),					\
+	     "%1" ((UDItype) (al)),					\
+	     "rme" ((UDItype) (bl)))
+#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
+  __asm__ ("sub{q} {%5,%1|%1,%5}\n\tsbb{q} {%3,%0|%0,%3}"		\
+	   : "=r" ((UDItype) (sh)),					\
+	     "=&r" ((UDItype) (sl))					\
+	   : "0" ((UDItype) (ah)),					\
+	     "rme" ((UDItype) (bh)),					\
+	     "1" ((UDItype) (al)),					\
+	     "rme" ((UDItype) (bl)))
+#define umul_ppmm(w1, w0, u, v) \
+  __asm__ ("mul{q} %3"							\
+	   : "=a" ((UDItype) (w0)),	/* low word: "a" register */	\
+	     "=d" ((UDItype) (w1))	/* high word: "d" register */	\
+	   : "%0" ((UDItype) (u)),	/* u shares w0's register */	\
+	     "rm" ((UDItype) (v)))
+#define udiv_qrnnd(q, r, n1, n0, dv) \
+  __asm__ ("div{q} %4"							\
+	   : "=a" ((UDItype) (q)),	/* quotient: "a" register */	\
+	     "=d" ((UDItype) (r))	/* remainder: "d" register */	\
+	   : "0" ((UDItype) (n0)),	/* low dividend word, in "a" */	\
+	     "1" ((UDItype) (n1)),	/* high dividend word, in "d" */ \
+	     "rm" ((UDItype) (dv)))
+#define count_leading_zeros(count, x)	((count) = __builtin_clzll (x))
+#define count_trailing_zeros(count, x)	((count) = __builtin_ctzll (x))
+#define UMUL_TIME 40
+#define UDIV_TIME 40
+#endif /* x86_64 */
+
+#if defined (__i960__) && W_TYPE_SIZE == 32
+#define umul_ppmm(w1, w0, u, v) \
+  ({union {UDItype __ll;						\
+	   struct {USItype __l, __h;} __i;				\
+	  } __xx;							\
+  __asm__ ("emul	%2,%1,%0"					\
+	   : "=d" (__xx.__ll)						\
+	   : "%dI" ((USItype) (u)),					\
+	     "dI" ((USItype) (v)));					\
+  (w1) = __xx.__i.__h; (w0) = __xx.__i.__l;})
+#define __umulsidi3(u, v) \
+  ({UDItype __w;							\
+    __asm__ ("emul	%2,%1,%0"					\
+	     : "=d" (__w)						\
+	     : "%dI" ((USItype) (u)),					\
+	       "dI" ((USItype) (v)));					\
+    __w; })
+#endif /* __i960__ */
+
+#if defined (__ia64) && W_TYPE_SIZE == 64
+/* This form encourages gcc (pre-release 3.4 at least) to emit predicated
+   "sub r=r,r" and "sub r=r,r,1", giving a 2 cycle latency.  The generic
+   code using "al<bl" arithmetically comes out making an actual 0 or 1 in a
+   register, which takes an extra cycle.  */
+#define sub_ddmmss(sh, sl, ah, al, bh, bl)				\
+  do {									\
+    UWtype __x;	/* low difference, held until sh is computed */	\
+    __x = (al) - (bl);							\
+    if ((al) < (bl))	/* low word borrows from the high word */	\
+      (sh) = (ah) - (bh) - 1;						\
+    else								\
+      (sh) = (ah) - (bh);						\
+    (sl) = __x;								\
+  } while (0)
+
+/* Do both product parts in assembly, since that gives better code with
+   all gcc versions.  Some callers will just use the upper part, and in
+   that situation we waste an instruction, but not any cycles.  */
+#define umul_ppmm(ph, pl, m0, m1)					\
+  __asm__ ("xma.hu %0 = %2, %3, f0\n\txma.l %1 = %2, %3, f0"		\
+	   : "=&f" (ph), "=f" (pl)					\
+	   : "f" (m0), "f" (m1))
+#define count_leading_zeros(count, x)					\
+  do {									\
+    UWtype _x = (x), _y, _a, _c;					\
+    __asm__ ("mux1 %0 = %1, @rev" : "=r" (_y) : "r" (_x));		\
+    __asm__ ("czx1.l %0 = %1" : "=r" (_a) : "r" (-_y | _y));		\
+    _c = (_a - 1) << 3;							\
+    _x >>= _c;								\
+    if (_x >= 1 << 4)							\
+      _x >>= 4, _c += 4;						\
+    if (_x >= 1 << 2)							\
+      _x >>= 2, _c += 2;						\
+    _c += _x >> 1;							\
+    (count) =  W_TYPE_SIZE - 1 - _c;					\
+  } while (0)
+/* Similar to what gcc does for __builtin_ffs, but 0-based rather than
+   1-based, and we don't need a special case for x == 0 here.  */
+#define count_trailing_zeros(count, x)					\
+  do {									\
+    UWtype __ctz_x = (x);	/* evaluate x exactly once */		\
+    __asm__ ("popcnt %0 = %1"						\
+	     : "=r" (count)						\
+	     : "r" ((__ctz_x-1) & ~__ctz_x)); /* ones below lowest set bit */ \
+  } while (0)
+#define UMUL_TIME 14
+#endif
+
+#if defined (__M32R__) && W_TYPE_SIZE == 32
+#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
+  /* The cmp clears the condition bit.  */ \
+  __asm__ ("cmp %0,%0\n\taddx %1,%5\n\taddx %0,%3"			\
+	   : "=r" ((USItype) (sh)),					\
+	     "=&r" ((USItype) (sl))					\
+	   : "0" ((USItype) (ah)),					\
+	     "r" ((USItype) (bh)),					\
+	     "1" ((USItype) (al)),					\
+	     "r" ((USItype) (bl))					\
+	   : "cbit")
+#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
+  /* The cmp clears the condition bit.  */ \
+  __asm__ ("cmp %0,%0\n\tsubx %1,%5\n\tsubx %0,%3"			\
+	   : "=r" ((USItype) (sh)),					\
+	     "=&r" ((USItype) (sl))					\
+	   : "0" ((USItype) (ah)),					\
+	     "r" ((USItype) (bh)),					\
+	     "1" ((USItype) (al)),					\
+	     "r" ((USItype) (bl))					\
+	   : "cbit")
+#endif /* __M32R__ */
+
+#if defined (__mc68000__) && W_TYPE_SIZE == 32
+#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
+  __asm__ ("add%.l %5,%1\n\taddx%.l %3,%0"				\
+	   : "=d" ((USItype) (sh)),					\
+	     "=&d" ((USItype) (sl))					\
+	   : "%0" ((USItype) (ah)),					\
+	     "d" ((USItype) (bh)),					\
+	     "%1" ((USItype) (al)),					\
+	     "g" ((USItype) (bl)))
+#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
+  __asm__ ("sub%.l %5,%1\n\tsubx%.l %3,%0"				\
+	   : "=d" ((USItype) (sh)),					\
+	     "=&d" ((USItype) (sl))					\
+	   : "0" ((USItype) (ah)),					\
+	     "d" ((USItype) (bh)),					\
+	     "1" ((USItype) (al)),					\
+	     "g" ((USItype) (bl)))
+
+/* The '020, '030, '040, '060 and CPU32 have 32x32->64 and 64/32->32q-32r.  */
+#if (defined (__mc68020__) && !defined (__mc68060__))
+#define umul_ppmm(w1, w0, u, v) \
+  __asm__ ("mulu%.l %3,%1:%0"						\
+	   : "=d" ((USItype) (w0)),					\
+	     "=d" ((USItype) (w1))					\
+	   : "%0" ((USItype) (u)),					\
+	     "dmi" ((USItype) (v)))
+#define UMUL_TIME 45
+#define udiv_qrnnd(q, r, n1, n0, d) \
+  __asm__ ("divu%.l %4,%1:%0"						\
+	   : "=d" ((USItype) (q)),					\
+	     "=d" ((USItype) (r))					\
+	   : "0" ((USItype) (n0)),					\
+	     "1" ((USItype) (n1)),					\
+	     "dmi" ((USItype) (d)))
+#define UDIV_TIME 90
+#define sdiv_qrnnd(q, r, n1, n0, d) \
+  __asm__ ("divs%.l %4,%1:%0"						\
+	   : "=d" ((USItype) (q)),					\
+	     "=d" ((USItype) (r))					\
+	   : "0" ((USItype) (n0)),					\
+	     "1" ((USItype) (n1)),					\
+	     "dmi" ((USItype) (d)))
+
+#elif defined (__mcoldfire__) /* not mc68020 */
+
+#define umul_ppmm(xh, xl, a, b) \
+  __asm__ ("| Inlined umul_ppmm\n"					\
+	   "	move%.l	%2,%/d0\n"					\
+	   "	move%.l	%3,%/d1\n"					\
+	   "	move%.l	%/d0,%/d2\n"					\
+	   "	swap	%/d0\n"						\
+	   "	move%.l	%/d1,%/d3\n"					\
+	   "	swap	%/d1\n"						\
+	   "	move%.w	%/d2,%/d4\n"					\
+	   "	mulu	%/d3,%/d4\n"					\
+	   "	mulu	%/d1,%/d2\n"					\
+	   "	mulu	%/d0,%/d3\n"					\
+	   "	mulu	%/d0,%/d1\n"					\
+	   "	move%.l	%/d4,%/d0\n"					\
+	   "	clr%.w	%/d0\n"						\
+	   "	swap	%/d0\n"						\
+	   "	add%.l	%/d0,%/d2\n"					\
+	   "	add%.l	%/d3,%/d2\n"					\
+	   "	jcc	1f\n"						\
+	   "	add%.l	%#65536,%/d1\n"					\
+	   "1:	swap	%/d2\n"						\
+	   "	moveq	%#0,%/d0\n"					\
+	   "	move%.w	%/d2,%/d0\n"					\
+	   "	move%.w	%/d4,%/d2\n"					\
+	   "	move%.l	%/d2,%1\n"					\
+	   "	add%.l	%/d1,%/d0\n"					\
+	   "	move%.l	%/d0,%0"					\
+	   : "=g" ((USItype) (xh)),					\
+	     "=g" ((USItype) (xl))					\
+	   : "g" ((USItype) (a)),					\
+	     "g" ((USItype) (b))					\
+	   : "d0", "d1", "d2", "d3", "d4")
+#define UMUL_TIME 100
+#define UDIV_TIME 400
+#else /* not ColdFire */
+/* %/ inserts REGISTER_PREFIX, %# inserts IMMEDIATE_PREFIX.  */
+#define umul_ppmm(xh, xl, a, b) \
+  __asm__ ("| Inlined umul_ppmm\n"					\
+	   "	move%.l	%2,%/d0\n"					\
+	   "	move%.l	%3,%/d1\n"					\
+	   "	move%.l	%/d0,%/d2\n"					\
+	   "	swap	%/d0\n"						\
+	   "	move%.l	%/d1,%/d3\n"					\
+	   "	swap	%/d1\n"						\
+	   "	move%.w	%/d2,%/d4\n"					\
+	   "	mulu	%/d3,%/d4\n"					\
+	   "	mulu	%/d1,%/d2\n"					\
+	   "	mulu	%/d0,%/d3\n"					\
+	   "	mulu	%/d0,%/d1\n"					\
+	   "	move%.l	%/d4,%/d0\n"					\
+	   "	eor%.w	%/d0,%/d0\n"					\
+	   "	swap	%/d0\n"						\
+	   "	add%.l	%/d0,%/d2\n"					\
+	   "	add%.l	%/d3,%/d2\n"					\
+	   "	jcc	1f\n"						\
+	   "	add%.l	%#65536,%/d1\n"					\
+	   "1:	swap	%/d2\n"						\
+	   "	moveq	%#0,%/d0\n"					\
+	   "	move%.w	%/d2,%/d0\n"					\
+	   "	move%.w	%/d4,%/d2\n"					\
+	   "	move%.l	%/d2,%1\n"					\
+	   "	add%.l	%/d1,%/d0\n"					\
+	   "	move%.l	%/d0,%0"					\
+	   : "=g" ((USItype) (xh)),					\
+	     "=g" ((USItype) (xl))					\
+	   : "g" ((USItype) (a)),					\
+	     "g" ((USItype) (b))					\
+	   : "d0", "d1", "d2", "d3", "d4")
+#define UMUL_TIME 100
+#define UDIV_TIME 400
+
+#endif /* not mc68020 */
+
+/* The '020, '030, '040 and '060 have bitfield insns.
+   cpu32 disguises as a 68020, but lacks them.  */
+#if defined (__mc68020__) && !defined (__mcpu32__)
+#define count_leading_zeros(count, x) \
+  __asm__ ("bfffo %1{%b2:%b2},%0"					\
+	   : "=d" ((USItype) (count))					\
+	   : "od" ((USItype) (x)), "n" (0))
+/* Some ColdFire architectures have a ff1 instruction supported via
+   __builtin_clz. */
+#elif defined (__mcfisaaplus__) || defined (__mcfisac__)
+#define count_leading_zeros(count,x) ((count) = __builtin_clz (x))
+#define COUNT_LEADING_ZEROS_0 32
+#endif
+#endif /* mc68000 */
+
+#if defined (__m88000__) && W_TYPE_SIZE == 32
+#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
+  __asm__ ("addu.co %1,%r4,%r5\n\taddu.ci %0,%r2,%r3"			\
+	   : "=r" ((USItype) (sh)),					\
+	     "=&r" ((USItype) (sl))					\
+	   : "%rJ" ((USItype) (ah)),					\
+	     "rJ" ((USItype) (bh)),					\
+	     "%rJ" ((USItype) (al)),					\
+	     "rJ" ((USItype) (bl)))
+#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
+  __asm__ ("subu.co %1,%r4,%r5\n\tsubu.ci %0,%r2,%r3"			\
+	   : "=r" ((USItype) (sh)),					\
+	     "=&r" ((USItype) (sl))					\
+	   : "rJ" ((USItype) (ah)),					\
+	     "rJ" ((USItype) (bh)),					\
+	     "rJ" ((USItype) (al)),					\
+	     "rJ" ((USItype) (bl)))
+#define count_leading_zeros(count, x) \
+  do {									\
+    USItype __cbtmp;							\
+    __asm__ ("ff1 %0,%1"						\
+	     : "=r" (__cbtmp)						\
+	     : "r" ((USItype) (x)));					\
+    (count) = __cbtmp ^ 31;						\
+  } while (0)
+#define COUNT_LEADING_ZEROS_0 63 /* sic */
+#if defined (__mc88110__)
+#define umul_ppmm(wh, wl, u, v) \
+  do {									\
+    union {UDItype __ll;						\
+	   struct {USItype __h, __l;} __i;				\
+	  } __xx;							\
+    __asm__ ("mulu.d	%0,%1,%2"					\
+	     : "=r" (__xx.__ll)						\
+	     : "r" ((USItype) (u)),					\
+	       "r" ((USItype) (v)));					\
+    (wh) = __xx.__i.__h;						\
+    (wl) = __xx.__i.__l;						\
+  } while (0)
+#define udiv_qrnnd(q, r, n1, n0, d) \
+  ({union {UDItype __ll;						\
+	   struct {USItype __h, __l;} __i;				\
+	  } __xx;							\
+  USItype __q;								\
+  __xx.__i.__h = (n1); __xx.__i.__l = (n0);				\
+  __asm__ ("divu.d %0,%1,%2"						\
+	   : "=r" (__q)							\
+	   : "r" (__xx.__ll),						\
+	     "r" ((USItype) (d)));					\
+  (r) = (n0) - __q * (d); (q) = __q; })
+#define UMUL_TIME 5
+#define UDIV_TIME 25
+#else
+#define UMUL_TIME 17
+#define UDIV_TIME 150
+#endif /* __mc88110__ */
+#endif /* __m88000__ */
+
+#if defined (__mn10300__)
+# if defined (__AM33__) /* __asm__, not asm: asm is not a keyword under -ansi/-std=c99.  */
+#  define count_leading_zeros(COUNT,X)	((COUNT) = __builtin_clz (X))
+#  define umul_ppmm(w1, w0, u, v)		\
+    __asm__("mulu %3,%2,%1,%0" : "=r"(w0), "=r"(w1) : "r"(u), "r"(v))
+#  define smul_ppmm(w1, w0, u, v)		\
+    __asm__("mul %3,%2,%1,%0" : "=r"(w0), "=r"(w1) : "r"(u), "r"(v))
+# else /* not __AM33__: w0 is tied to u in a "d" register, w1 uses constraint "z" */
+#  define umul_ppmm(w1, w0, u, v)		\
+    __asm__("nop; nop; mulu %3,%0" : "=d"(w0), "=z"(w1) : "%0"(u), "d"(v))
+#  define smul_ppmm(w1, w0, u, v)		\
+    __asm__("nop; nop; mul %3,%0" : "=d"(w0), "=z"(w1) : "%0"(u), "d"(v))
+# endif /* __AM33__ */
+# define add_ssaaaa(sh, sl, ah, al, bh, bl)	\
+  do { DWunion __lhs, __rhs; /* (sh,sl) = (ah,al) + (bh,bl) as one double word */ \
+    __lhs.s.low = (al); __lhs.s.high = (ah);	\
+    __rhs.s.low = (bl); __rhs.s.high = (bh);	\
+    __lhs.ll += __rhs.ll;			\
+    (sl) = __lhs.s.low;				\
+    (sh) = __lhs.s.high;			\
+  } while (0)
+# define sub_ddmmss(sh, sl, ah, al, bh, bl)	\
+  do { DWunion __lhs, __rhs; /* (sh,sl) = (ah,al) - (bh,bl) as one double word */ \
+    __lhs.s.low = (al); __lhs.s.high = (ah);	\
+    __rhs.s.low = (bl); __rhs.s.high = (bh);	\
+    __lhs.ll -= __rhs.ll;			\
+    (sl) = __lhs.s.low;				\
+    (sh) = __lhs.s.high;			\
+  } while (0)
+# define udiv_qrnnd(q, r, nh, nl, d) /* q tied to nl, r tied to nh; __asm__ keeps strict ISO modes working */ \
+  __asm__("divu %2,%0" : "=D"(q), "=z"(r) : "D"(d), "0"(nl), "1"(nh))
+# define sdiv_qrnnd(q, r, nh, nl, d) /* signed counterpart of udiv_qrnnd, same operand layout */ \
+  __asm__("div %2,%0" : "=D"(q), "=z"(r) : "D"(d), "0"(nl), "1"(nh))
+# define UMUL_TIME 3
+# define UDIV_TIME 38
+#endif
+
+#if defined (__mips__) && W_TYPE_SIZE == 32
+#define umul_ppmm(w1, w0, u, v)						\
+  do { UDItype __prod = (UDItype) (USItype) (u); /* widen, then multiply */ \
+    __prod *= (USItype) (v);						\
+    (w1) = (USItype) (__prod >> 32);	/* high word of the product */	\
+    (w0) = (USItype) __prod;		/* low word of the product */	\
+  } while (0)
+#define UMUL_TIME 10
+#define UDIV_TIME 100
+
+#if (__mips == 32 || __mips == 64) && ! defined (__mips16)
+#define count_leading_zeros(COUNT,X)	((COUNT) = __builtin_clz (X))
+#define COUNT_LEADING_ZEROS_0 32
+#endif
+#endif /* __mips__ */
+
+#if defined (__ns32000__) && W_TYPE_SIZE == 32
+#define umul_ppmm(w1, w0, u, v) \
+  ({union {UDItype __ll;						\
+	   struct {USItype __l, __h;} __i;				\
+	  } __xx;							\
+  __asm__ ("meid %2,%0"							\
+	   : "=g" (__xx.__ll)						\
+	   : "%0" ((USItype) (u)),					\
+	     "g" ((USItype) (v)));					\
+  (w1) = __xx.__i.__h; (w0) = __xx.__i.__l;})
+#define __umulsidi3(u, v) \
+  ({UDItype __w;							\
+    __asm__ ("meid %2,%0"						\
+	     : "=g" (__w)						\
+	     : "%0" ((USItype) (u)),					\
+	       "g" ((USItype) (v)));					\
+    __w; })
+#define udiv_qrnnd(q, r, n1, n0, d) \
+  ({union {UDItype __ll;						\
+	   struct {USItype __l, __h;} __i;				\
+	  } __xx;							\
+  __xx.__i.__h = (n1); __xx.__i.__l = (n0);				\
+  __asm__ ("deid %2,%0"							\
+	   : "=g" (__xx.__ll)						\
+	   : "0" (__xx.__ll),						\
+	     "g" ((USItype) (d)));					\
+  (r) = __xx.__i.__l; (q) = __xx.__i.__h; })
+#define count_trailing_zeros(count,x) \
+  do {									\
+    __asm__ ("ffsd     %2,%0"						\
+	    : "=r" ((USItype) (count))					\
+	    : "0" ((USItype) 0),					\
+	      "r" ((USItype) (x)));					\
+  } while (0)
+#endif /* __ns32000__ */
+
+/* FIXME: We should test _IBMR2 here when we add assembly support for the
+   system vendor compilers.
+   FIXME: What's needed for gcc PowerPC VxWorks?  __vxworks__ is not good
+   enough, since that hits ARM and m68k too.  */
+#if (defined (_ARCH_PPC)	/* AIX */				\
+     || defined (__powerpc__)	/* gcc */				\
+     || defined (__POWERPC__)	/* BEOS */				\
+     || defined (__ppc__)	/* Darwin */				\
+     || (defined (PPC) && ! defined (CPU_FAMILY)) /* gcc 2.7.x GNU&SysV */    \
+     || (defined (PPC) && defined (CPU_FAMILY)    /* VxWorks */               \
+	 && CPU_FAMILY == PPC)                                                \
+     ) && W_TYPE_SIZE == 32
+#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
+  do {									\
+    if (__builtin_constant_p (bh) && (bh) == 0)				\
+      __asm__ ("add%I4c %1,%3,%4\n\taddze %0,%2"		\
+	     : "=r" (sh), "=&r" (sl) : "r" (ah), "%r" (al), "rI" (bl));\
+    else if (__builtin_constant_p (bh) && (bh) == ~(USItype) 0)		\
+      __asm__ ("add%I4c %1,%3,%4\n\taddme %0,%2"		\
+	     : "=r" (sh), "=&r" (sl) : "r" (ah), "%r" (al), "rI" (bl));\
+    else								\
+      __asm__ ("add%I5c %1,%4,%5\n\tadde %0,%2,%3"		\
+	     : "=r" (sh), "=&r" (sl)					\
+	     : "%r" (ah), "r" (bh), "%r" (al), "rI" (bl));		\
+  } while (0)
+#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
+  do {									\
+    if (__builtin_constant_p (ah) && (ah) == 0)				\
+      __asm__ ("subf%I3c %1,%4,%3\n\tsubfze %0,%2"	\
+	       : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl));\
+    else if (__builtin_constant_p (ah) && (ah) == ~(USItype) 0)		\
+      __asm__ ("subf%I3c %1,%4,%3\n\tsubfme %0,%2"	\
+	       : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl));\
+    else if (__builtin_constant_p (bh) && (bh) == 0)			\
+      __asm__ ("subf%I3c %1,%4,%3\n\taddme %0,%2"		\
+	       : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "r" (bl));\
+    else if (__builtin_constant_p (bh) && (bh) == ~(USItype) 0)		\
+      __asm__ ("subf%I3c %1,%4,%3\n\taddze %0,%2"		\
+	       : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "r" (bl));\
+    else								\
+      __asm__ ("subf%I4c %1,%5,%4\n\tsubfe %0,%3,%2"	\
+	       : "=r" (sh), "=&r" (sl)					\
+	       : "r" (ah), "r" (bh), "rI" (al), "r" (bl));		\
+  } while (0)
+#define count_leading_zeros(count, x) \
+  __asm__ ("cntlzw %0,%1" : "=r" (count) : "r" (x))
+#define COUNT_LEADING_ZEROS_0 32
+#if defined (_ARCH_PPC) || defined (__powerpc__) || defined (__POWERPC__) \
+  || defined (__ppc__)                                                    \
+  || (defined (PPC) && ! defined (CPU_FAMILY)) /* gcc 2.7.x GNU&SysV */       \
+  || (defined (PPC) && defined (CPU_FAMILY)    /* VxWorks */                  \
+	 && CPU_FAMILY == PPC)
+#define umul_ppmm(ph, pl, m0, m1) \
+  do { /* ph = high word (mulhwu), pl = low word of m0 * m1.  */	\
+    USItype __m0 = (m0), __m1 = (m1);	/* evaluate each arg once */	\
+    __asm__ ("mulhwu %0,%1,%2" : "=r" (ph) : "%r" (__m0), "r" (__m1));	\
+    (pl) = __m0 * __m1;							\
+  } while (0)
+#define UMUL_TIME 15
+#define smul_ppmm(ph, pl, m0, m1) \
+  do { /* ph = high word (mulhw), pl = low word of signed m0 * m1.  */	\
+    SItype __m0 = (m0), __m1 = (m1);	/* evaluate each arg once */	\
+    __asm__ ("mulhw %0,%1,%2" : "=r" (ph) : "%r" (__m0), "r" (__m1));	\
+    (pl) = __m0 * __m1;							\
+  } while (0)
+#define SMUL_TIME 14
+#define UDIV_TIME 120
+#endif
+#endif /* 32-bit POWER architecture variants.  */
+
+/* We should test _IBMR2 here when we add assembly support for the system
+   vendor compilers.  */
+#if (defined (_ARCH_PPC64) || defined (__powerpc64__)) && W_TYPE_SIZE == 64
+#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
+  do {									\
+    if (__builtin_constant_p (bh) && (bh) == 0)				\
+      __asm__ ("add%I4c %1,%3,%4\n\taddze %0,%2"		\
+	     : "=r" (sh), "=&r" (sl) : "r" (ah), "%r" (al), "rI" (bl));\
+    else if (__builtin_constant_p (bh) && (bh) == ~(UDItype) 0)		\
+      __asm__ ("add%I4c %1,%3,%4\n\taddme %0,%2"		\
+	     : "=r" (sh), "=&r" (sl) : "r" (ah), "%r" (al), "rI" (bl));\
+    else								\
+      __asm__ ("add%I5c %1,%4,%5\n\tadde %0,%2,%3"		\
+	     : "=r" (sh), "=&r" (sl)					\
+	     : "%r" (ah), "r" (bh), "%r" (al), "rI" (bl));		\
+  } while (0)
+#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
+  do {									\
+    if (__builtin_constant_p (ah) && (ah) == 0)				\
+      __asm__ ("subf%I3c %1,%4,%3\n\tsubfze %0,%2"	\
+	       : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl));\
+    else if (__builtin_constant_p (ah) && (ah) == ~(UDItype) 0)		\
+      __asm__ ("subf%I3c %1,%4,%3\n\tsubfme %0,%2"	\
+	       : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl));\
+    else if (__builtin_constant_p (bh) && (bh) == 0)			\
+      __asm__ ("subf%I3c %1,%4,%3\n\taddme %0,%2"		\
+	       : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "r" (bl));\
+    else if (__builtin_constant_p (bh) && (bh) == ~(UDItype) 0)		\
+      __asm__ ("subf%I3c %1,%4,%3\n\taddze %0,%2"		\
+	       : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "r" (bl));\
+    else								\
+      __asm__ ("subf%I4c %1,%5,%4\n\tsubfe %0,%3,%2"	\
+	       : "=r" (sh), "=&r" (sl)					\
+	       : "r" (ah), "r" (bh), "rI" (al), "r" (bl));		\
+  } while (0)
+#define count_leading_zeros(count, x) \
+  __asm__ ("cntlzd %0,%1" : "=r" (count) : "r" (x))
+#define COUNT_LEADING_ZEROS_0 64
+#define umul_ppmm(ph, pl, m0, m1) \
+  do { /* ph = high word (mulhdu), pl = low word of m0 * m1.  */	\
+    UDItype __m0 = (m0), __m1 = (m1);	/* evaluate each arg once */	\
+    __asm__ ("mulhdu %0,%1,%2" : "=r" (ph) : "%r" (__m0), "r" (__m1));	\
+    (pl) = __m0 * __m1;							\
+  } while (0)
+#define UMUL_TIME 15
+#define smul_ppmm(ph, pl, m0, m1) \
+  do { /* ph = high word (mulhd), pl = low word of signed m0 * m1.  */	\
+    DItype __m0 = (m0), __m1 = (m1);	/* evaluate each arg once */	\
+    __asm__ ("mulhd %0,%1,%2" : "=r" (ph) : "%r" (__m0), "r" (__m1));	\
+    (pl) = __m0 * __m1;							\
+  } while (0)
+#define SMUL_TIME 14  /* ??? */
+#define UDIV_TIME 120 /* ??? */
+#endif /* 64-bit PowerPC.  */
+
+#if defined (__ibm032__) /* RT/ROMP */ && W_TYPE_SIZE == 32
+#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
+  __asm__ ("a %1,%5\n\tae %0,%3"					\
+	   : "=r" ((USItype) (sh)),					\
+	     "=&r" ((USItype) (sl))					\
+	   : "%0" ((USItype) (ah)),					\
+	     "r" ((USItype) (bh)),					\
+	     "%1" ((USItype) (al)),					\
+	     "r" ((USItype) (bl)))
+#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
+  __asm__ ("s %1,%5\n\tse %0,%3"					\
+	   : "=r" ((USItype) (sh)),					\
+	     "=&r" ((USItype) (sl))					\
+	   : "0" ((USItype) (ah)),					\
+	     "r" ((USItype) (bh)),					\
+	     "1" ((USItype) (al)),					\
+	     "r" ((USItype) (bl)))
+#define umul_ppmm(ph, pl, m0, m1) \
+  do {									\
+    USItype __m0 = (m0), __m1 = (m1);					\
+    __asm__ (								\
+       "s	r2,r2\n"						\
+"	mts	r10,%2\n"						\
+"	m	r2,%3\n"						\
+"	m	r2,%3\n"						\
+"	m	r2,%3\n"						\
+"	m	r2,%3\n"						\
+"	m	r2,%3\n"						\
+"	m	r2,%3\n"						\
+"	m	r2,%3\n"						\
+"	m	r2,%3\n"						\
+"	m	r2,%3\n"						\
+"	m	r2,%3\n"						\
+"	m	r2,%3\n"						\
+"	m	r2,%3\n"						\
+"	m	r2,%3\n"						\
+"	m	r2,%3\n"						\
+"	m	r2,%3\n"						\
+"	m	r2,%3\n"						\
+"	cas	%0,r2,r0\n"						\
+"	mfs	r10,%1"							\
+	     : "=r" ((USItype) (ph)),					\
+	       "=r" ((USItype) (pl))					\
+	     : "%r" (__m0),						\
+		"r" (__m1)						\
+	     : "r2");							\
+    (ph) += ((((SItype) __m0 >> 31) & __m1)				\
+	     + (((SItype) __m1 >> 31) & __m0));				\
+  } while (0)
+#define UMUL_TIME 20
+#define UDIV_TIME 200
+#define count_leading_zeros(count, x) \
+  do {									\
+    if ((x) >= 0x10000)		/* some bit in 31..16 is set */		\
+      __asm__ ("clz	%0,%1"						\
+	       : "=r" ((USItype) (count))				\
+	       : "r" ((USItype) (x) >> 16));	/* count in upper half */ \
+    else								\
+      {									\
+	__asm__ ("clz	%0,%1"						\
+		 : "=r" ((USItype) (count))				\
+		 : "r" ((USItype) (x)));					\
+	(count) += 16;	/* upper half was all zero */			\
+      }									\
+  } while (0)
+#endif
+
+#if defined(__sh__) && W_TYPE_SIZE == 32
+#ifndef __sh1__
+#define umul_ppmm(w1, w0, u, v) \
+  __asm__ (								\
+       "dmulu.l	%2,%3\n\tsts%M1	macl,%1\n\tsts%M0	mach,%0"	\
+	   : "=r<" ((USItype)(w1)),					\
+	     "=r<" ((USItype)(w0))					\
+	   : "r" ((USItype)(u)),					\
+	     "r" ((USItype)(v))						\
+	   : "macl", "mach")
+#define UMUL_TIME 5
+#endif
+
+/* This is the same algorithm as __udiv_qrnnd_c.  */
+#define UDIV_NEEDS_NORMALIZATION 1
+
+#ifdef __FDPIC__
+/* FDPIC needs a special version of the asm fragment to extract the
+   code address from the function descriptor. __udiv_qrnnd_16 is
+   assumed to be local and not to use the GOT, so loading r12 is
+   not needed. */
+#define udiv_qrnnd(q, r, n1, n0, d) \
+  do {									\
+    extern UWtype __udiv_qrnnd_16 (UWtype, UWtype)			\
+			__attribute__ ((visibility ("hidden")));	\
+    /* r0: rn r1: qn */ /* r0: n1 r4: n0 r5: d r6: d1 */ /* r2: __m */	\
+    __asm__ (								\
+	"mov%M4	%4,r5\n"						\
+"	swap.w	%3,r4\n"						\
+"	swap.w	r5,r6\n"						\
+"	mov.l	@%5,r2\n"						\
+"	jsr	@r2\n"							\
+"	shll16	r6\n"							\
+"	swap.w	r4,r4\n"						\
+"	mov.l	@%5,r2\n"						\
+"	jsr	@r2\n"							\
+"	swap.w	r1,%0\n"						\
+"	or	r1,%0"							\
+	: "=r" (q), "=&z" (r)						\
+	: "1" (n1), "r" (n0), "rm" (d), "r" (&__udiv_qrnnd_16)		\
+	: "r1", "r2", "r4", "r5", "r6", "pr", "t");			\
+  } while (0)
+#else
+#define udiv_qrnnd(q, r, n1, n0, d) \
+  do {									\
+    extern UWtype __udiv_qrnnd_16 (UWtype, UWtype)			\
+			__attribute__ ((visibility ("hidden")));	\
+    /* r0: rn r1: qn */ /* r0: n1 r4: n0 r5: d r6: d1 */ /* r2: __m */	\
+    __asm__ (								\
+	"mov%M4 %4,r5\n"						\
+"	swap.w %3,r4\n"							\
+"	swap.w r5,r6\n"							\
+"	jsr @%5\n"							\
+"	shll16 r6\n"							\
+"	swap.w r4,r4\n"							\
+"	jsr @%5\n"							\
+"	swap.w r1,%0\n"							\
+"	or r1,%0"							\
+	: "=r" (q), "=&z" (r)						\
+	: "1" (n1), "r" (n0), "rm" (d), "r" (&__udiv_qrnnd_16)		\
+	: "r1", "r2", "r4", "r5", "r6", "pr", "t");			\
+  } while (0)
+#endif /* __FDPIC__  */
+
+#define UDIV_TIME 80
+
+#define sub_ddmmss(sh, sl, ah, al, bh, bl) /* (sh,sl) = (ah,al) - (bh,bl) */ \
+  __asm__ ("clrt;subc %5,%1; subc %4,%0"				\
+	   : "=r" (sh), "=&r" (sl)	/* sl is written before bh is read */ \
+	   : "0" (ah), "1" (al), "r" (bh), "r" (bl) : "t")
+
+#endif /* __sh__ */
+
+#if defined (__sparc__) && !defined (__arch64__) && !defined (__sparcv9) \
+    && W_TYPE_SIZE == 32
+#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
+  __asm__ ("addcc %r4,%5,%1\n\taddx %r2,%3,%0"				\
+	   : "=r" ((USItype) (sh)),					\
+	     "=&r" ((USItype) (sl))					\
+	   : "%rJ" ((USItype) (ah)),					\
+	     "rI" ((USItype) (bh)),					\
+	     "%rJ" ((USItype) (al)),					\
+	     "rI" ((USItype) (bl))					\
+	   __CLOBBER_CC)
+#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
+  __asm__ ("subcc %r4,%5,%1\n\tsubx %r2,%3,%0"				\
+	   : "=r" ((USItype) (sh)),					\
+	     "=&r" ((USItype) (sl))					\
+	   : "rJ" ((USItype) (ah)),					\
+	     "rI" ((USItype) (bh)),					\
+	     "rJ" ((USItype) (al)),					\
+	     "rI" ((USItype) (bl))					\
+	   __CLOBBER_CC)
+#if defined (__sparc_v9__)
+#define umul_ppmm(w1, w0, u, v) \
+  do { /* Product is formed in %g1, then split into w1 (high) and w0.  */ \
+    register USItype __g1 __asm__ ("g1"); /* __asm__: valid in ISO modes */ \
+    __asm__ ("umul\t%2,%3,%1\n\t"					\
+	     "srlx\t%1, 32, %0"						\
+	     : "=r" ((USItype) (w1)),					\
+	       "=r" (__g1)						\
+	     : "r" ((USItype) (u)),					\
+	       "r" ((USItype) (v)));					\
+    (w0) = __g1;							\
+  } while (0)
+#define udiv_qrnnd(__q, __r, __n1, __n0, __d) \
+  __asm__ ("mov\t%2,%%y\n\t"						\
+	   "udiv\t%3,%4,%0\n\t"						\
+	   "umul\t%0,%4,%1\n\t"						\
+	   "sub\t%3,%1,%1"						\
+	   : "=&r" ((USItype) (__q)),					\
+	     "=&r" ((USItype) (__r))					\
+	   : "r" ((USItype) (__n1)),					\
+	     "r" ((USItype) (__n0)),					\
+	     "r" ((USItype) (__d)))
+#else
+#if defined (__sparc_v8__)
+#define umul_ppmm(w1, w0, u, v) \
+  __asm__ ("umul %2,%3,%1;rd %%y,%0"					\
+	   : "=r" ((USItype) (w1)),					\
+	     "=r" ((USItype) (w0))					\
+	   : "r" ((USItype) (u)),					\
+	     "r" ((USItype) (v)))
+#define udiv_qrnnd(__q, __r, __n1, __n0, __d) \
+  __asm__ ("mov %2,%%y;nop;nop;nop;udiv %3,%4,%0;umul %0,%4,%1;sub %3,%1,%1"\
+	   : "=&r" ((USItype) (__q)),					\
+	     "=&r" ((USItype) (__r))					\
+	   : "r" ((USItype) (__n1)),					\
+	     "r" ((USItype) (__n0)),					\
+	     "r" ((USItype) (__d)))
+#else
+#if defined (__sparclite__)
+/* This has hardware multiply but not divide.  It also has two additional
+   instructions scan (ffs from high bit) and divscc.  */
+#define umul_ppmm(w1, w0, u, v) \
+  __asm__ ("umul %2,%3,%1;rd %%y,%0"					\
+	   : "=r" ((USItype) (w1)),					\
+	     "=r" ((USItype) (w0))					\
+	   : "r" ((USItype) (u)),					\
+	     "r" ((USItype) (v)))
+#define udiv_qrnnd(q, r, n1, n0, d) \
+  __asm__ ("! Inlined udiv_qrnnd\n"					\
+"	wr	%%g0,%2,%%y	! Not a delayed write for sparclite\n"	\
+"	tst	%%g0\n"							\
+"	divscc	%3,%4,%%g1\n"						\
+"	divscc	%%g1,%4,%%g1\n"						\
+"	divscc	%%g1,%4,%%g1\n"						\
+"	divscc	%%g1,%4,%%g1\n"						\
+"	divscc	%%g1,%4,%%g1\n"						\
+"	divscc	%%g1,%4,%%g1\n"						\
+"	divscc	%%g1,%4,%%g1\n"						\
+"	divscc	%%g1,%4,%%g1\n"						\
+"	divscc	%%g1,%4,%%g1\n"						\
+"	divscc	%%g1,%4,%%g1\n"						\
+"	divscc	%%g1,%4,%%g1\n"						\
+"	divscc	%%g1,%4,%%g1\n"						\
+"	divscc	%%g1,%4,%%g1\n"						\
+"	divscc	%%g1,%4,%%g1\n"						\
+"	divscc	%%g1,%4,%%g1\n"						\
+"	divscc	%%g1,%4,%%g1\n"						\
+"	divscc	%%g1,%4,%%g1\n"						\
+"	divscc	%%g1,%4,%%g1\n"						\
+"	divscc	%%g1,%4,%%g1\n"						\
+"	divscc	%%g1,%4,%%g1\n"						\
+"	divscc	%%g1,%4,%%g1\n"						\
+"	divscc	%%g1,%4,%%g1\n"						\
+"	divscc	%%g1,%4,%%g1\n"						\
+"	divscc	%%g1,%4,%%g1\n"						\
+"	divscc	%%g1,%4,%%g1\n"						\
+"	divscc	%%g1,%4,%%g1\n"						\
+"	divscc	%%g1,%4,%%g1\n"						\
+"	divscc	%%g1,%4,%%g1\n"						\
+"	divscc	%%g1,%4,%%g1\n"						\
+"	divscc	%%g1,%4,%%g1\n"						\
+"	divscc	%%g1,%4,%%g1\n"						\
+"	divscc	%%g1,%4,%0\n"						\
+"	rd	%%y,%1\n"						\
+"	bl,a 1f\n"							\
+"	add	%1,%4,%1\n"						\
+"1:	! End of inline udiv_qrnnd"					\
+	   : "=r" ((USItype) (q)),					\
+	     "=r" ((USItype) (r))					\
+	   : "r" ((USItype) (n1)),					\
+	     "r" ((USItype) (n0)),					\
+	     "rI" ((USItype) (d))					\
+	   : "g1" __AND_CLOBBER_CC)
+#define UDIV_TIME 37
+#define count_leading_zeros(count, x) \
+  do {                                                                  \
+  __asm__ ("scan %1,1,%0"                                               \
+	   : "=r" ((USItype) (count))                                   \
+	   : "r" ((USItype) (x)));					\
+  } while (0)
+/* Early sparclites return 63 for an argument of 0, but they warn that future
+   implementations might change this.  Therefore, leave COUNT_LEADING_ZEROS_0
+   undefined.  */
+#else
+/* SPARC without integer multiplication and divide instructions.
+   (i.e. at least Sun4/20,40,60,65,75,110,260,280,330,360,380,470,490) */
+#define umul_ppmm(w1, w0, u, v) \
+  __asm__ ("! Inlined umul_ppmm\n"					\
+"	wr	%%g0,%2,%%y	! SPARC has 0-3 delay insn after a wr\n"\
+"	sra	%3,31,%%o5	! Don't move this insn\n"		\
+"	and	%2,%%o5,%%o5	! Don't move this insn\n"		\
+"	andcc	%%g0,0,%%g1	! Don't move this insn\n"		\
+"	mulscc	%%g1,%3,%%g1\n"						\
+"	mulscc	%%g1,%3,%%g1\n"						\
+"	mulscc	%%g1,%3,%%g1\n"						\
+"	mulscc	%%g1,%3,%%g1\n"						\
+"	mulscc	%%g1,%3,%%g1\n"						\
+"	mulscc	%%g1,%3,%%g1\n"						\
+"	mulscc	%%g1,%3,%%g1\n"						\
+"	mulscc	%%g1,%3,%%g1\n"						\
+"	mulscc	%%g1,%3,%%g1\n"						\
+"	mulscc	%%g1,%3,%%g1\n"						\
+"	mulscc	%%g1,%3,%%g1\n"						\
+"	mulscc	%%g1,%3,%%g1\n"						\
+"	mulscc	%%g1,%3,%%g1\n"						\
+"	mulscc	%%g1,%3,%%g1\n"						\
+"	mulscc	%%g1,%3,%%g1\n"						\
+"	mulscc	%%g1,%3,%%g1\n"						\
+"	mulscc	%%g1,%3,%%g1\n"						\
+"	mulscc	%%g1,%3,%%g1\n"						\
+"	mulscc	%%g1,%3,%%g1\n"						\
+"	mulscc	%%g1,%3,%%g1\n"						\
+"	mulscc	%%g1,%3,%%g1\n"						\
+"	mulscc	%%g1,%3,%%g1\n"						\
+"	mulscc	%%g1,%3,%%g1\n"						\
+"	mulscc	%%g1,%3,%%g1\n"						\
+"	mulscc	%%g1,%3,%%g1\n"						\
+"	mulscc	%%g1,%3,%%g1\n"						\
+"	mulscc	%%g1,%3,%%g1\n"						\
+"	mulscc	%%g1,%3,%%g1\n"						\
+"	mulscc	%%g1,%3,%%g1\n"						\
+"	mulscc	%%g1,%3,%%g1\n"						\
+"	mulscc	%%g1,%3,%%g1\n"						\
+"	mulscc	%%g1,%3,%%g1\n"						\
+"	mulscc	%%g1,0,%%g1\n"						\
+"	add	%%g1,%%o5,%0\n"						\
+"	rd	%%y,%1"							\
+	   : "=r" ((USItype) (w1)),					\
+	     "=r" ((USItype) (w0))					\
+	   : "%rI" ((USItype) (u)),					\
+	     "r" ((USItype) (v))						\
+	   : "g1", "o5" __AND_CLOBBER_CC)
+#define UMUL_TIME 39		/* 39 instructions */
+/* It's quite necessary to add this much assembler for the sparc.
+   The default udiv_qrnnd (in C) is more than 10 times slower!  */
+#define udiv_qrnnd(__q, __r, __n1, __n0, __d) \
+  __asm__ ("! Inlined udiv_qrnnd\n"					\
+"	mov	32,%%g1\n"						\
+"	subcc	%1,%2,%%g0\n"						\
+"1:	bcs	5f\n"							\
+"	 addxcc %0,%0,%0	! shift n1n0 and a q-bit in lsb\n"	\
+"	sub	%1,%2,%1	! this kills msb of n\n"		\
+"	addx	%1,%1,%1	! so this can't give carry\n"		\
+"	subcc	%%g1,1,%%g1\n"						\
+"2:	bne	1b\n"							\
+"	 subcc	%1,%2,%%g0\n"						\
+"	bcs	3f\n"							\
+"	 addxcc %0,%0,%0	! shift n1n0 and a q-bit in lsb\n"	\
+"	b	3f\n"							\
+"	 sub	%1,%2,%1	! this kills msb of n\n"		\
+"4:	sub	%1,%2,%1\n"						\
+"5:	addxcc	%1,%1,%1\n"						\
+"	bcc	2b\n"							\
+"	 subcc	%%g1,1,%%g1\n"						\
+"! Got carry from n.  Subtract next step to cancel this carry.\n"	\
+"	bne	4b\n"							\
+"	 addcc	%0,%0,%0	! shift n1n0 and a 0-bit in lsb\n"	\
+"	sub	%1,%2,%1\n"						\
+"3:	xnor	%0,0,%0\n"						\
+"	! End of inline udiv_qrnnd"					\
+	   : "=&r" ((USItype) (__q)),					\
+	     "=&r" ((USItype) (__r))					\
+	   : "r" ((USItype) (__d)),					\
+	     "1" ((USItype) (__n1)),					\
+	     "0" ((USItype) (__n0)) : "g1" __AND_CLOBBER_CC)
+#define UDIV_TIME (3+7*32)	/* 7 instructions/iteration. 32 iterations.  */
+#endif /* __sparclite__ */
+#endif /* __sparc_v8__ */
+#endif /* __sparc_v9__ */
+#endif /* sparc32 */
+
+#if ((defined (__sparc__) && defined (__arch64__)) || defined (__sparcv9)) \
+    && W_TYPE_SIZE == 64
+#define add_ssaaaa(sh, sl, ah, al, bh, bl)				\
+  do {									\
+    UDItype __carry = 0;						\
+    __asm__ ("addcc\t%r5,%6,%1\n\t"					\
+	     "add\t%r3,%4,%0\n\t"					\
+	     "movcs\t%%xcc, 1, %2\n\t"					\
+	     "add\t%0, %2, %0"						\
+	     : "=r" ((UDItype)(sh)),				      	\
+	       "=&r" ((UDItype)(sl)),				      	\
+	       "+r" (__carry)				      		\
+	     : "%rJ" ((UDItype)(ah)),				     	\
+	       "rI" ((UDItype)(bh)),				      	\
+	       "%rJ" ((UDItype)(al)),				     	\
+	       "rI" ((UDItype)(bl))				       	\
+	     __CLOBBER_CC);						\
+  } while (0)
+
+#define sub_ddmmss(sh, sl, ah, al, bh, bl)				\
+  do {									\
+    UDItype __carry = 0;						\
+    __asm__ ("subcc\t%r5,%6,%1\n\t"					\
+	     "sub\t%r3,%4,%0\n\t"					\
+	     "movcs\t%%xcc, 1, %2\n\t"					\
+	     "sub\t%0, %2, %0"						\
+	     : "=r" ((UDItype)(sh)),				      	\
+	       "=&r" ((UDItype)(sl)),				      	\
+	       "+r" (__carry)				      		\
+	     : "%rJ" ((UDItype)(ah)),				     	\
+	       "rI" ((UDItype)(bh)),				      	\
+	       "%rJ" ((UDItype)(al)),				     	\
+	       "rI" ((UDItype)(bl))				       	\
+	     __CLOBBER_CC);						\
+  } while (0)
+
+#define umul_ppmm(wh, wl, u, v)						\
+  do {									\
+	  UDItype tmp1, tmp2, tmp3, tmp4;				\
+	  __asm__ __volatile__ (					\
+		   "srl %7,0,%3\n\t"					\
+		   "mulx %3,%6,%1\n\t"					\
+		   "srlx %6,32,%2\n\t"					\
+		   "mulx %2,%3,%4\n\t"					\
+		   "sllx %4,32,%5\n\t"					\
+		   "srl %6,0,%3\n\t"					\
+		   "sub %1,%5,%5\n\t"					\
+		   "srlx %5,32,%5\n\t"					\
+		   "addcc %4,%5,%4\n\t"					\
+		   "srlx %7,32,%5\n\t"					\
+		   "mulx %3,%5,%3\n\t"					\
+		   "mulx %2,%5,%5\n\t"					\
+		   "sethi %%hi(0x80000000),%2\n\t"			\
+		   "addcc %4,%3,%4\n\t"					\
+		   "srlx %4,32,%4\n\t"					\
+		   "add %2,%2,%2\n\t"					\
+		   "movcc %%xcc,%%g0,%2\n\t"				\
+		   "addcc %5,%4,%5\n\t"					\
+		   "sllx %3,32,%3\n\t"					\
+		   "add %1,%3,%1\n\t"					\
+		   "add %5,%2,%0"					\
+	   : "=r" ((UDItype)(wh)),					\
+	     "=&r" ((UDItype)(wl)),					\
+	     "=&r" (tmp1), "=&r" (tmp2), "=&r" (tmp3), "=&r" (tmp4)	\
+	   : "r" ((UDItype)(u)),					\
+	     "r" ((UDItype)(v))						\
+	   __CLOBBER_CC);						\
+  } while (0)
+#define UMUL_TIME 96
+#define UDIV_TIME 230
+#endif /* sparc64 */
+
+#if defined (__vax__) && W_TYPE_SIZE == 32
+#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
+  __asm__ ("addl2 %5,%1\n\tadwc %3,%0"					\
+	   : "=g" ((USItype) (sh)),					\
+	     "=&g" ((USItype) (sl))					\
+	   : "%0" ((USItype) (ah)),					\
+	     "g" ((USItype) (bh)),					\
+	     "%1" ((USItype) (al)),					\
+	     "g" ((USItype) (bl)))
+#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
+  __asm__ ("subl2 %5,%1\n\tsbwc %3,%0"					\
+	   : "=g" ((USItype) (sh)),					\
+	     "=&g" ((USItype) (sl))					\
+	   : "0" ((USItype) (ah)),					\
+	     "g" ((USItype) (bh)),					\
+	     "1" ((USItype) (al)),					\
+	     "g" ((USItype) (bl)))
+#define umul_ppmm(xh, xl, m0, m1) \
+  do {									\
+    union {								\
+	UDItype __ll;							\
+	struct {USItype __l, __h;} __i;					\
+      } __xx;								\
+    USItype __m0 = (m0), __m1 = (m1);					\
+    __asm__ ("emul %1,%2,$0,%0"						\
+	     : "=r" (__xx.__ll)						\
+	     : "g" (__m0),						\
+	       "g" (__m1));						\
+    (xh) = __xx.__i.__h;						\
+    (xl) = __xx.__i.__l;						\
+    (xh) += ((((SItype) __m0 >> 31) & __m1)				\
+	     + (((SItype) __m1 >> 31) & __m0));				\
+  } while (0)
+#define sdiv_qrnnd(q, r, n1, n0, d) \
+  do {									\
+    union {DItype __ll;							\
+	   struct {SItype __l, __h;} __i;				\
+	  } __xx;							\
+    __xx.__i.__h = n1; __xx.__i.__l = n0;				\
+    __asm__ ("ediv %3,%2,%0,%1"						\
+	     : "=g" (q), "=g" (r)					\
+	     : "g" (__xx.__ll), "g" (d));				\
+  } while (0)
+#endif /* __vax__ */
+
+#ifdef _TMS320C6X
+#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
+  do									\
+    {									\
+      UDItype __ll;							\
+      __asm__ ("addu .l1 %1, %2, %0"					\
+	       : "=a" (__ll) : "a" (al), "a" (bl));			\
+      (sl) = (USItype)__ll;						\
+      (sh) = ((USItype)(__ll >> 32)) + (ah) + (bh);			\
+    }									\
+  while (0)
+
+#ifdef _TMS320C6400_PLUS
+#define __umulsidi3(u,v) ((UDItype)(USItype)(u)*(USItype)(v)) /* each cast covers the whole argument */
+#define umul_ppmm(w1, w0, u, v)						\
+  do {									\
+    UDItype __x = (UDItype) (USItype) (u) * (USItype) (v);		\
+    (w1) = (USItype) (__x >> 32);					\
+    (w0) = (USItype) (__x);						\
+  } while (0)
+#endif  /* _TMS320C6400_PLUS */
+
+#define count_leading_zeros(count, x)	((count) = __builtin_clz (x))
+#ifdef _TMS320C6400
+#define count_trailing_zeros(count, x)	((count) = __builtin_ctz (x))
+#endif
+#define UMUL_TIME 4
+#define UDIV_TIME 40
+#endif /* _TMS320C6X */
+
+#if defined (__xtensa__) && W_TYPE_SIZE == 32
+/* This code is not Xtensa-configuration-specific, so rely on the compiler
+   to expand builtin functions depending on what configuration features
+   are available.  This avoids library calls when the operation can be
+   performed in-line.  */
+#define umul_ppmm(w1, w0, u, v)						\
+  do {									\
+    DWunion __w;		/* holds the double-word product */	\
+    __w.ll = __builtin_umulsidi3 (u, v);				\
+    (w1) = __w.s.high;		/* upper word of the product */		\
+    (w0) = __w.s.low;		/* lower word of the product */		\
+  } while (0)
+#define __umulsidi3(u, v)		__builtin_umulsidi3 (u, v)
+#define count_leading_zeros(COUNT, X)	((COUNT) = __builtin_clz (X))
+#define count_trailing_zeros(COUNT, X)	((COUNT) = __builtin_ctz (X))
+#endif /* __xtensa__ */
+
+#if defined xstormy16
+extern UHItype __stormy16_count_leading_zeros (UHItype);
+#define count_leading_zeros(count, x)					\
+  do									\
+    {									\
+      UHItype size;							\
+									\
+      /* We assume that W_TYPE_SIZE is a multiple of 16...  */		\
+      for ((count) = 0, size = W_TYPE_SIZE; size; size -= 16)		\
+	{								\
+	  UHItype c;							\
+									\
+	  c = __clzhi2 ((x) >> (size - 16));				\
+	  (count) += c;							\
+	  if (c != 16)							\
+	    break;							\
+	}								\
+    }									\
+  while (0)
+#define COUNT_LEADING_ZEROS_0 W_TYPE_SIZE
+#endif
+
+#if defined (__z8000__) && W_TYPE_SIZE == 16
+#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
+  __asm__ ("add	%H1,%H5\n\tadc	%H0,%H3"				\
+	   : "=r" ((unsigned int)(sh)),					\
+	     "=&r" ((unsigned int)(sl))					\
+	   : "%0" ((unsigned int)(ah)),					\
+	     "r" ((unsigned int)(bh)),					\
+	     "%1" ((unsigned int)(al)),					\
+	     "rQR" ((unsigned int)(bl)))
+#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
+  __asm__ ("sub	%H1,%H5\n\tsbc	%H0,%H3"				\
+	   : "=r" ((unsigned int)(sh)),					\
+	     "=&r" ((unsigned int)(sl))					\
+	   : "0" ((unsigned int)(ah)),					\
+	     "r" ((unsigned int)(bh)),					\
+	     "1" ((unsigned int)(al)),					\
+	     "rQR" ((unsigned int)(bl)))
+#define umul_ppmm(xh, xl, m0, m1) \
+  do {									\
+    union {long int __ll;						\
+	   struct {unsigned int __h, __l;} __i;				\
+	  } __xx;							\
+    unsigned int __m0 = (m0), __m1 = (m1);				\
+    __asm__ ("mult	%S0,%H3"					\
+	     : "=r" (__xx.__i.__h),					\
+	       "=r" (__xx.__i.__l)					\
+	     : "%1" (__m0),						\
+	       "rQR" (__m1));						\
+    (xh) = __xx.__i.__h; (xl) = __xx.__i.__l;				\
+    (xh) += ((((signed int) __m0 >> 15) & __m1)				\
+	     + (((signed int) __m1 >> 15) & __m0));			\
+  } while (0)
+#endif /* __z8000__ */
+
+#endif /* __GNUC__ */
+
+/* If this machine has no inline assembler, use C macros.  */
+
+#if !defined (add_ssaaaa)
+#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
+  do {									\
+    UWtype __x;								\
+    __x = (al) + (bl);			/* low-word sum */		\
+    (sh) = (ah) + (bh) + (__x < (al));	/* + carry; assumes unsigned UWtype */ \
+    (sl) = __x;				/* written after al is read */	\
+  } while (0)
+#endif
+
+#if !defined (sub_ddmmss)
+#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
+  do {									\
+    UWtype __x;								\
+    __x = (al) - (bl);			/* low-word difference */	\
+    (sh) = (ah) - (bh) - (__x > (al));	/* - borrow; assumes unsigned UWtype */ \
+    (sl) = __x;				/* written after al is read */	\
+  } while (0)
+#endif
+
+/* If we lack umul_ppmm but have smul_ppmm, define umul_ppmm in terms of
+   smul_ppmm.  */
+#if !defined (umul_ppmm) && defined (smul_ppmm)
+#define umul_ppmm(w1, w0, u, v)						\
+  do {									\
+    UWtype __w1;							\
+    UWtype __xm0 = (u), __xm1 = (v);					\
+    smul_ppmm (__w1, w0, __xm0, __xm1);					\
+    (w1) = __w1 + (-(__xm0 >> (W_TYPE_SIZE - 1)) & __xm1)		\
+		+ (-(__xm1 >> (W_TYPE_SIZE - 1)) & __xm0);		\
+  } while (0)
+#endif
+
+/* If we still don't have umul_ppmm, define it using plain C.  */
+#if !defined (umul_ppmm)
+#define umul_ppmm(w1, w0, u, v)						\
+  do {									\
+    UWtype __x0, __x1, __x2, __x3;					\
+    UHWtype __ul, __vl, __uh, __vh;					\
+    /* Split u and v into their low and high parts.  */			\
+    __ul = __ll_lowpart (u);						\
+    __uh = __ll_highpart (u);						\
+    __vl = __ll_lowpart (v);						\
+    __vh = __ll_highpart (v);						\
+    /* Form the four partial products.  */				\
+    __x0 = (UWtype) __ul * __vl;					\
+    __x1 = (UWtype) __ul * __vh;					\
+    __x2 = (UWtype) __uh * __vl;					\
+    __x3 = (UWtype) __uh * __vh;					\
+    /* Sum the cross terms in __x1, carrying into __x3.  */		\
+    __x1 += __ll_highpart (__x0);/* this can't give carry */		\
+    __x1 += __x2;		/* but this indeed can */		\
+    if (__x1 < __x2)		/* did we get it? */			\
+      __x3 += __ll_B;		/* yes, add it in the proper pos.  */	\
+    /* Assemble the high word w1 and the low word w0.  */		\
+    (w1) = __x3 + __ll_highpart (__x1);					\
+    (w0) = __ll_lowpart (__x1) * __ll_B + __ll_lowpart (__x0);		\
+  } while (0)
+#endif
+
+#if !defined (__umulsidi3)
+#define __umulsidi3(u, v) \
+  ({DWunion __w;							\
+    umul_ppmm (__w.s.high, __w.s.low, u, v);				\
+    __w.ll; })
+#endif
+
+/* Define this unconditionally, so it can be used for debugging.  */
+#define __udiv_qrnnd_c(q, r, n1, n0, d) \
+  do {									\
+    UWtype __d1, __d0, __q1, __q0;					\
+    UWtype __r1, __r0, __m;						\
+    __d1 = __ll_highpart (d);						\
+    __d0 = __ll_lowpart (d);						\
+    /* First quotient digit __q1, from n1 and the high part of n0.  */	\
+    __r1 = (n1) % __d1;							\
+    __q1 = (n1) / __d1;							\
+    __m = (UWtype) __q1 * __d0;						\
+    __r1 = __r1 * __ll_B | __ll_highpart (n0);				\
+    if (__r1 < __m)							\
+      {									\
+	__q1--, __r1 += (d);						\
+	if (__r1 >= (d)) /* i.e. we didn't get carry when adding to __r1 */\
+	  if (__r1 < __m)						\
+	    __q1--, __r1 += (d);					\
+      }									\
+    __r1 -= __m;							\
+    /* Second quotient digit __q0, from __r1 and the low part of n0.  */ \
+    __r0 = __r1 % __d1;							\
+    __q0 = __r1 / __d1;							\
+    __m = (UWtype) __q0 * __d0;						\
+    __r0 = __r0 * __ll_B | __ll_lowpart (n0);				\
+    if (__r0 < __m)							\
+      {									\
+	__q0--, __r0 += (d);						\
+	if (__r0 >= (d))						\
+	  if (__r0 < __m)						\
+	    __q0--, __r0 += (d);					\
+      }									\
+    __r0 -= __m;							\
+    /* Combine both digits into q; the final remainder is __r0.  */	\
+    (q) = (UWtype) __q1 * __ll_B | __q0;				\
+    (r) = __r0;								\
+  } while (0)
+
+/* If the processor has no udiv_qrnnd but sdiv_qrnnd, go through
+   __udiv_w_sdiv (defined in libgcc or elsewhere).  */
+#if !defined (udiv_qrnnd) && defined (sdiv_qrnnd)
+#define udiv_qrnnd(q, r, nh, nl, d) \
+  do {									\
+    extern UWtype __udiv_w_sdiv (UWtype *, UWtype, UWtype, UWtype);	\
+    UWtype __r;								\
+    (q) = __udiv_w_sdiv (&__r, nh, nl, d);				\
+    (r) = __r;								\
+  } while (0)
+#endif
+
+/* If udiv_qrnnd was not defined for this processor, use __udiv_qrnnd_c.  */
+#if !defined (udiv_qrnnd)
+#define UDIV_NEEDS_NORMALIZATION 1
+#define udiv_qrnnd __udiv_qrnnd_c
+#endif
+
+#if !defined (count_leading_zeros)
+#define count_leading_zeros(count, x) \
+  do {									\
+    UWtype __xr = (x);							\
+    UWtype __a;								\
+    /* Choose the shift __a used to index __clz_tab below.  */		\
+    if (W_TYPE_SIZE <= 32)						\
+      {									\
+	__a = __xr < ((UWtype)1<<2*__BITS4)				\
+	  ? (__xr < ((UWtype)1<<__BITS4) ? 0 : __BITS4)			\
+	  : (__xr < ((UWtype)1<<3*__BITS4) ?  2*__BITS4 : 3*__BITS4);	\
+      }									\
+    else								\
+      {									\
+	for (__a = W_TYPE_SIZE - 8; __a > 0; __a -= 8)			\
+	  if (((__xr >> __a) & 0xff) != 0)				\
+	    break;							\
+      }									\
+    /* NOTE(review): assumes __clz_tab[b] is the bit length of b.  */	\
+    (count) = W_TYPE_SIZE - (__clz_tab[__xr >> __a] + __a);		\
+  } while (0)
+#define COUNT_LEADING_ZEROS_0 W_TYPE_SIZE
+#endif
+
+#if !defined (count_trailing_zeros)
+/* Define count_trailing_zeros using count_leading_zeros.  The latter might be
+   defined in asm, but if it is not, the C version above is good enough.  */
+#define count_trailing_zeros(count, x) \
+  do {									\
+    UWtype __ctz_x = (x);						\
+    UWtype __ctz_c;							\
+    count_leading_zeros (__ctz_c, __ctz_x & -__ctz_x); /* x & -x: lowest set bit */ \
+    (count) = W_TYPE_SIZE - 1 - __ctz_c;	/* that bit's index from bit 0 */ \
+  } while (0)
+#endif
+
+#ifndef UDIV_NEEDS_NORMALIZATION
+#define UDIV_NEEDS_NORMALIZATION 0
+#endif
diff --git a/utils/gapy/gen-debug-info-src/ext/lto-symtab.h b/utils/gapy/gen-debug-info-src/ext/lto-symtab.h
new file mode 100644
index 000000000..878d0502c
--- /dev/null
+++ b/utils/gapy/gen-debug-info-src/ext/lto-symtab.h
@@ -0,0 +1,41 @@
+/* Data types used in the IL symbol table.
+   Copyright (C) 2009-2015 Free Software Foundation, Inc.
+   Contributed by Rafael Espindola <espindola@google.com>
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free
+Software Foundation; either version 3, or (at your option) any later
+version.
+
+GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+WARRANTY; without even the implied warranty of MERCHANTABILITY or
+FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received a copy of the GNU General Public License
+along with GCC; see the file COPYING3.  If not see
+<http://www.gnu.org/licenses/>.  */
+
+#ifndef GCC_LTO_SYMTAB_H
+#define GCC_LTO_SYMTAB_H
+
+enum gcc_plugin_symbol_kind
+  {
+    GCCPK_DEF,
+    GCCPK_WEAKDEF,
+    GCCPK_UNDEF,
+    GCCPK_WEAKUNDEF,
+    GCCPK_COMMON
+  };
+
+enum gcc_plugin_symbol_visibility
+  {
+    GCCPV_DEFAULT,
+    GCCPV_PROTECTED,
+    GCCPV_INTERNAL,
+    GCCPV_HIDDEN
+  };
+
+#endif /* GCC_LTO_SYMTAB_H  */
diff --git a/utils/gapy/gen-debug-info-src/ext/md5.h b/utils/gapy/gen-debug-info-src/ext/md5.h
new file mode 100644
index 000000000..909f7b4df
--- /dev/null
+++ b/utils/gapy/gen-debug-info-src/ext/md5.h
@@ -0,0 +1,154 @@
+/* md5.h - Declaration of functions and data types used for MD5 sum
+   computing library functions.
+   Copyright (C) 1995-2015 Free Software Foundation, Inc.
+   NOTE: The canonical source of this file is maintained with the GNU C
+   Library.  Bugs can be reported to bug-glibc@prep.ai.mit.edu.
+
+   This program is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by the
+   Free Software Foundation; either version 2, or (at your option) any
+   later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software Foundation,
+   Inc., 51 Franklin Street - Fifth Floor, Boston, MA 02110-1301, USA.  */
+
+#ifndef _MD5_H
+#define _MD5_H 1
+
+#include <stdio.h>
+
+#if defined HAVE_LIMITS_H || _LIBC
+# include <limits.h>
+#endif
+
+#include "ansidecl.h"
+
+/* The following contortions are an attempt to use the C preprocessor
+   to determine an unsigned integral type that is 32 bits wide.  An
+   alternative approach is to use autoconf's AC_CHECK_SIZEOF macro, but
+   doing that would require that the configure script compile and *run*
+   the resulting executable.  Locally running cross-compiled executables
+   is usually not possible.  */
+
+#ifdef _LIBC
+# include <sys/types.h>
+typedef u_int32_t md5_uint32;
+typedef uintptr_t md5_uintptr;
+#elif defined (HAVE_SYS_TYPES_H) && defined (HAVE_STDINT_H)
+#include <stdint.h>
+#include <sys/types.h>
+typedef uint32_t md5_uint32;
+typedef uintptr_t md5_uintptr;
+#else
+#  define INT_MAX_32_BITS 2147483647
+
+/* If INT_MAX isn't defined, assume int is a 32-bit type.
+   This should be valid for all systems GNU cares about because
+   that doesn't include 16-bit systems, and only modern systems
+   (that certainly have <limits.h>) have 64+-bit integral types.  */
+
+# ifndef INT_MAX
+#  define INT_MAX INT_MAX_32_BITS
+# endif
+
+# if INT_MAX == INT_MAX_32_BITS
+   typedef unsigned int md5_uint32;
+# else
+#  if SHRT_MAX == INT_MAX_32_BITS
+    typedef unsigned short md5_uint32;
+#  else
+#   if LONG_MAX == INT_MAX_32_BITS
+     typedef unsigned long md5_uint32;
+#   else
+     /* The following line is intended to evoke an error.
+        Using #error is not portable enough.  */
+     "Cannot determine unsigned 32-bit data type."
+#   endif
+#  endif
+# endif
+/* We have to make a guess about the integer type equivalent in size
+   to pointers which should always be correct.  */
+typedef unsigned long int md5_uintptr;
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Structure to save state of computation between the single steps.  */
+struct md5_ctx
+{
+  md5_uint32 A;
+  md5_uint32 B;
+  md5_uint32 C;
+  md5_uint32 D;
+
+  md5_uint32 total[2];
+  md5_uint32 buflen;
+  char buffer[128] ATTRIBUTE_ALIGNED_ALIGNOF(md5_uint32);
+};
+
+/*
+ * The following three functions make up the low-level interface used
+ * by the functions `md5_stream' and `md5_buffer'.
+ */
+
+/* Initialize structure containing state of computation.
+   (RFC 1321, 3.3: Step 3)  */
+extern void md5_init_ctx (struct md5_ctx *ctx);
+
+/* Starting with the result of former calls of this function (or the
+   initialization function), update the context for the next LEN bytes
+   starting at BUFFER.
+   It is necessary that LEN is a multiple of 64!!! */
+extern void md5_process_block (const void *buffer, size_t len,
+                               struct md5_ctx *ctx);
+
+/* Starting with the result of former calls of this function (or the
+   initialization function), update the context for the next LEN bytes
+   starting at BUFFER.
+   It is NOT required that LEN is a multiple of 64.  */
+extern void md5_process_bytes (const void *buffer, size_t len,
+                               struct md5_ctx *ctx);
+
+/* Process the remaining bytes in the buffer and put result from CTX
+   in first 16 bytes following RESBUF.  The result is always in little
+   endian byte order, so that a byte-wise output yields the wanted
+   ASCII representation of the message digest.
+
+   IMPORTANT: On some systems it is required that RESBUF is correctly
+   aligned for a 32 bits value.  */
+extern void *md5_finish_ctx (struct md5_ctx *ctx, void *resbuf);
+
+
+/* Put result from CTX in first 16 bytes following RESBUF.  The result is
+   always in little endian byte order, so that a byte-wise output yields
+   the wanted ASCII representation of the message digest.
+
+   IMPORTANT: On some systems it is required that RESBUF is correctly
+   aligned for a 32 bits value.  */
+extern void *md5_read_ctx (const struct md5_ctx *ctx, void *resbuf);
+
+
+/* Compute MD5 message digest for bytes read from STREAM.  The
+   resulting message digest number will be written into the 16 bytes
+   beginning at RESBLOCK.  */
+extern int md5_stream (FILE *stream, void *resblock);
+
+/* Compute MD5 message digest for LEN bytes beginning at BUFFER.  The
+   result is always in little endian byte order, so that a byte-wise
+   output yields the wanted ASCII representation of the message
+   digest.  */
+extern void *md5_buffer (const char *buffer, size_t len, void *resblock);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/utils/gapy/gen-debug-info-src/ext/oasys.h b/utils/gapy/gen-debug-info-src/ext/oasys.h
new file mode 100644
index 000000000..92d5c266f
--- /dev/null
+++ b/utils/gapy/gen-debug-info-src/ext/oasys.h
@@ -0,0 +1,192 @@
+/* Oasys object format header file for BFD.
+
+   Copyright 2001, 2010 Free Software Foundation, Inc.
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 51 Franklin Street - Fifth Floor,
+   Boston, MA 02110-1301, USA.
+
+   Contributed by Cygnus Support.  */
+
+#define OASYS_MAX_SEC_COUNT 16
+/* **** */
+
+typedef struct oasys_archive_header
+  {
+    unsigned int  version;
+    char create_date[12];
+    char revision_date[12];
+    unsigned int mod_count;
+    file_ptr mod_tbl_offset;
+    unsigned int sym_tbl_size;
+    unsigned int sym_count;
+    file_ptr sym_tbl_offset;
+    unsigned int xref_count;
+    file_ptr xref_lst_offset;
+  }
+oasys_archive_header_type;
+
+typedef struct oasys_extarchive_header
+  {
+    bfd_byte version[4];
+    bfd_byte create_date[12];
+    bfd_byte revision_date[12];
+    bfd_byte mod_count[4];
+    bfd_byte mod_tbl_offset[4];
+    bfd_byte sym_tbl_size[4];
+    bfd_byte sym_count[4];
+    bfd_byte sym_tbl_offset[4];
+    bfd_byte xref_count[4];
+    bfd_byte xref_lst_offset[4];
+  }
+oasys_extarchive_header_type;
+
+typedef struct oasys_module_table
+  {
+    int mod_number;
+    char mod_date[12];
+    unsigned int mod_size;
+    unsigned int dep_count;
+    unsigned int depee_count;
+    file_ptr file_offset;
+    unsigned int sect_count;
+    char *module_name;
+    unsigned int module_name_size;
+  }
+oasys_module_table_type;
+
+typedef struct oasys_extmodule_table_a
+  {
+    bfd_byte mod_number[4];
+    bfd_byte mod_date[12];
+    bfd_byte mod_size[4];
+    bfd_byte dep_count[4];
+    bfd_byte depee_count[4];
+    bfd_byte sect_count[4];
+    bfd_byte file_offset[4];
+    bfd_byte mod_name[32];
+  }
+oasys_extmodule_table_type_a_type;
+
+typedef struct oasys_extmodule_table_b
+  {
+    bfd_byte mod_number[4];
+    bfd_byte mod_date[12];
+    bfd_byte mod_size[4];
+    bfd_byte dep_count[4];
+    bfd_byte depee_count[4];
+    bfd_byte sect_count[4];
+    bfd_byte file_offset[4];
+    bfd_byte mod_name_length[4];
+  }
+oasys_extmodule_table_type_b_type;
+
+typedef enum oasys_record
+  {
+    oasys_record_is_end_enum = 0,
+    oasys_record_is_data_enum = 1,
+    oasys_record_is_symbol_enum = 2,
+    oasys_record_is_header_enum = 3,
+    oasys_record_is_named_section_enum = 4,
+    oasys_record_is_com_enum = 5,
+    oasys_record_is_debug_enum = 6,
+    oasys_record_is_section_enum = 7,
+    oasys_record_is_debug_file_enum = 8,
+    oasys_record_is_module_enum = 9,
+    oasys_record_is_local_enum = 10
+  }
+oasys_record_enum_type;
+
+typedef struct oasys_record_header
+  {
+    unsigned char length;
+    unsigned char check_sum;
+    unsigned char type;
+    unsigned char fill;
+  }
+oasys_record_header_type;
+
+typedef struct oasys_data_record
+  {
+    oasys_record_header_type header;
+    unsigned char relb;
+    bfd_byte addr[4];
+    /* maximum total size of data record is 255 bytes */
+    bfd_byte data[246];
+  }
+oasys_data_record_type;
+
+typedef struct oasys_header_record
+  {
+    oasys_record_header_type header;
+    unsigned char version_number;
+    unsigned char rev_number;
+    char module_name[26-6];
+    char description[64-26];
+  }
+oasys_header_record_type;
+
+#define OASYS_VERSION_NUMBER 0
+#define OASYS_REV_NUMBER 0
+
+typedef struct oasys_symbol_record
+  {
+    oasys_record_header_type header;
+    unsigned char relb;
+    bfd_byte value[4];
+    bfd_byte refno[2];
+    char name[64];
+  }
+oasys_symbol_record_type;
+
+#define RELOCATION_PCREL_BIT 0x80
+#define RELOCATION_32BIT_BIT 0x40
+#define RELOCATION_TYPE_BITS 0x30
+#define RELOCATION_TYPE_ABS 0x00
+#define RELOCATION_TYPE_REL 0x10
+#define RELOCATION_TYPE_UND 0x20
+#define RELOCATION_TYPE_COM 0x30
+#define RELOCATION_SECT_BITS 0x0f
+
+typedef struct oasys_section_record
+  {
+    oasys_record_header_type header;
+    unsigned char relb;
+    bfd_byte value[4];
+    bfd_byte vma[4];
+    bfd_byte fill[3];
+  }
+oasys_section_record_type;
+
+typedef struct oasys_end_record
+  {
+    oasys_record_header_type header;
+    unsigned char relb;
+    bfd_byte entry[4];
+    bfd_byte fill[2];
+    bfd_byte zero;
+  }
+oasys_end_record_type;
+
+typedef union oasys_record_union
+  {
+    oasys_record_header_type header;
+    oasys_data_record_type data;
+    oasys_section_record_type section;
+    oasys_symbol_record_type symbol;
+    oasys_header_record_type first;
+    oasys_end_record_type end;
+    bfd_byte pad[256];
+  }
+oasys_record_union_type;
diff --git a/utils/gapy/gen-debug-info-src/ext/objalloc.h b/utils/gapy/gen-debug-info-src/ext/objalloc.h
new file mode 100644
index 000000000..2c0635090
--- /dev/null
+++ b/utils/gapy/gen-debug-info-src/ext/objalloc.h
@@ -0,0 +1,115 @@
+/* objalloc.h -- routines to allocate memory for objects
+   Copyright (C) 1997-2015 Free Software Foundation, Inc.
+   Written by Ian Lance Taylor, Cygnus Solutions.
+
+This program is free software; you can redistribute it and/or modify it
+under the terms of the GNU General Public License as published by the
+Free Software Foundation; either version 2, or (at your option) any
+later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, 51 Franklin Street - Fifth Floor,
+Boston, MA 02110-1301, USA.  */
+
+#ifndef OBJALLOC_H
+#define OBJALLOC_H
+
+#include "ansidecl.h"
+
+/* These routines allocate space for an object.  The assumption is
+   that the object will want to allocate space as it goes along, but
+   will never want to free any particular block.  There is a function
+   to free a block, which also frees all more recently allocated
+   blocks.  There is also a function to free all the allocated space.
+
+   This is essentially a specialization of obstacks.  The main
+   difference is that a block may not be allocated a bit at a time.
+   Another difference is that these routines are always built on top
+   of malloc, and always pass a malloc failure back to the caller,
+   unlike more recent versions of obstacks.  */
+
+/* This is what an objalloc structure looks like.  Callers should not
+   refer to these fields, nor should they allocate these structures
+   themselves.  Instead, they should only create them via
+   objalloc_create, and only access them via the functions and macros
+   listed below.  The structure is only defined here so that we can
+   access it via macros.  */
+
+struct objalloc
+{
+  char *current_ptr;
+  unsigned int current_space;
+  void *chunks;
+};
+
+/* Work out the required alignment.  */
+
+struct objalloc_align { char x; double d; };
+
+#if defined (__STDC__) && __STDC__
+#ifndef offsetof
+#include <stddef.h>
+#endif
+#endif
+#ifndef offsetof
+#define offsetof(TYPE, MEMBER) ((unsigned long) &((TYPE *)0)->MEMBER)
+#endif
+#define OBJALLOC_ALIGN offsetof (struct objalloc_align, d)
+
+/* Create an objalloc structure.  Returns NULL if malloc fails.  */
+
+extern struct objalloc *objalloc_create (void);
+
+/* Allocate space from an objalloc structure.  Returns NULL if malloc
+   fails.  */
+
+extern void *_objalloc_alloc (struct objalloc *, unsigned long);
+
+/* The macro version of objalloc_alloc.  We only define this if using
+   gcc, because otherwise we would have to evaluate the arguments
+   multiple times, or use a temporary field as obstack.h does.  */
+
+#if defined (__GNUC__) && defined (__STDC__) && __STDC__
+
+/* NextStep 2.0 cc is really gcc 1.93 but it defines __GNUC__ = 2 and
+   does not implement __extension__.  But that compiler doesn't define
+   __GNUC_MINOR__.  */
+#if __GNUC__ < 2 || (__NeXT__ && !__GNUC_MINOR__)
+#define __extension__
+#endif
+
+#define objalloc_alloc(o, l)						\
+  __extension__								\
+  ({ struct objalloc *__o = (o);					\
+     unsigned long __len = (l);						\
+     if (__len == 0)		/* a zero-byte request becomes 1 byte */ \
+       __len = 1;							\
+     __len = (__len + OBJALLOC_ALIGN - 1) &~ (OBJALLOC_ALIGN - 1);	\
+     (__len != 0 && __len <= __o->current_space /* 0: round-up wrapped */ \
+      ? (__o->current_ptr += __len,	/* fast path: take space at current_ptr */ \
+	 __o->current_space -= __len,					\
+	 (void *) (__o->current_ptr - __len))				\
+      : _objalloc_alloc (__o, __len)); }) /* otherwise defer to the function */
+
+#else /* ! __GNUC__ */
+
+#define objalloc_alloc(o, l) _objalloc_alloc ((o), (l))
+
+#endif /* ! __GNUC__ */
+
+/* Free an entire objalloc structure.  */
+
+extern void objalloc_free (struct objalloc *);
+
+/* Free a block allocated by objalloc_alloc.  This also frees all more
+   recently allocated blocks.  */
+
+extern void objalloc_free_block (struct objalloc *, void *);
+
+#endif /* OBJALLOC_H */
diff --git a/utils/gapy/gen-debug-info-src/ext/obstack.h b/utils/gapy/gen-debug-info-src/ext/obstack.h
new file mode 100644
index 000000000..0d13c72d0
--- /dev/null
+++ b/utils/gapy/gen-debug-info-src/ext/obstack.h
@@ -0,0 +1,535 @@
+/* obstack.h - object stack macros
+   Copyright (C) 1988-2015 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* Summary:
+
+   All the apparent functions defined here are macros. The idea
+   is that you would use these pre-tested macros to solve a
+   very specific set of problems, and they would run fast.
+   Caution: no side-effects in arguments please!! They may be
+   evaluated MANY times!!
+
+   These macros operate a stack of objects.  Each object starts life
+   small, and may grow to maturity.  (Consider building a word syllable
+   by syllable.)  An object can move while it is growing.  Once it has
+   been "finished" it never changes address again.  So the "top of the
+   stack" is typically an immature growing object, while the rest of the
+   stack is of mature, fixed size and fixed address objects.
+
+   These routines grab large chunks of memory, using a function you
+   supply, called 'obstack_chunk_alloc'.  On occasion, they free chunks,
+   by calling 'obstack_chunk_free'.  You must define them and declare
+   them before using any obstack macros.
+
+   Each independent stack is represented by a 'struct obstack'.
+   Each of the obstack macros expects a pointer to such a structure
+   as the first argument.
+
+   One motivation for this package is the problem of growing char strings
+   in symbol tables.  Unless you are "fascist pig with a read-only mind"
+   --Gosper's immortal quote from HAKMEM item 154, out of context--you
+   would not like to put any arbitrary upper limit on the length of your
+   symbols.
+
+   In practice this often means you will build many short symbols and a
+   few long symbols.  At the time you are reading a symbol you don't know
+   how long it is.  One traditional method is to read a symbol into a
+   buffer, realloc()ating the buffer every time you try to read a symbol
+   that is longer than the buffer.  This is beaut, but you still will
+   want to copy the symbol from the buffer to a more permanent
+   symbol-table entry say about half the time.
+
+   With obstacks, you can work differently.  Use one obstack for all symbol
+   names.  As you read a symbol, grow the name in the obstack gradually.
+   When the name is complete, finalize it.  Then, if the symbol exists already,
+   free the newly read name.
+
+   The way we do this is to take a large chunk, allocating memory from
+   low addresses.  When you want to build a symbol in the chunk you just
+   add chars above the current "high water mark" in the chunk.  When you
+   have finished adding chars, because you got to the end of the symbol,
+   you know how long the chars are, and you can create a new object.
+   Mostly the chars will not burst over the highest address of the chunk,
+   because you would typically expect a chunk to be (say) 100 times as
+   long as an average object.
+
+   In case that isn't clear, when we have enough chars to make up
+   the object, THEY ARE ALREADY CONTIGUOUS IN THE CHUNK (guaranteed)
+   so we just point to it where it lies.  No moving of chars is
+   needed and this is the second win: potentially long strings need
+   never be explicitly shuffled. Once an object is formed, it does not
+   change its address during its lifetime.
+
+   When the chars burst over a chunk boundary, we allocate a larger
+   chunk, and then copy the partly formed object from the end of the old
+   chunk to the beginning of the new larger chunk.  We then carry on
+   accreting characters to the end of the object as we normally would.
+
+   A special macro is provided to add a single char at a time to a
+   growing object.  This allows the use of register variables, which
+   break the ordinary 'growth' macro.
+
+   Summary:
+        We allocate large chunks.
+        We carve out one object at a time from the current chunk.
+        Once carved, an object never moves.
+        We are free to append data of any size to the currently
+          growing object.
+        Exactly one object is growing in an obstack at any one time.
+        You can run one obstack per control block.
+        You may have as many control blocks as you dare.
+        Because of the way we do it, you can "unwind" an obstack
+          back to a previous state. (You may remove objects much
+          as you would with a stack.)
+ */
+
+
+/* Don't do the contents of this file more than once.  */
+
+#ifndef _OBSTACK_H
+#define _OBSTACK_H 1
+
+#ifndef _OBSTACK_INTERFACE_VERSION
+# define _OBSTACK_INTERFACE_VERSION 2
+#endif
+
+#include <stddef.h>             /* For size_t and ptrdiff_t.  */
+#include <string.h>             /* For __GNU_LIBRARY__, and memcpy.  */
+
+#if _OBSTACK_INTERFACE_VERSION == 1
+/* For binary compatibility with obstack version 1, which used "int"
+   and "long" for these two types.  */
+# define _OBSTACK_SIZE_T unsigned int
+# define _CHUNK_SIZE_T unsigned long
+# define _OBSTACK_CAST(type, expr) ((type) (expr))
+#else
+/* Version 2 with sane types, especially for 64-bit hosts.  */
+# define _OBSTACK_SIZE_T size_t
+# define _CHUNK_SIZE_T size_t
+# define _OBSTACK_CAST(type, expr) (expr)
+#endif
+
+/* If B is the base of an object addressed by P, return the result of
+   aligning P to the next multiple of A + 1.  B and P must be of type
+   char *.  A + 1 must be a power of 2.  */
+
+#define __BPTR_ALIGN(B, P, A) ((B) + (((P) - (B) + (A)) & ~(A)))
+
+/* Similar to __BPTR_ALIGN (B, P, A), except optimize the common case
+   where pointers can be converted to integers, aligned as integers,
+   and converted back again.  If ptrdiff_t is narrower than a
+   pointer (e.g., the AS/400), play it safe and compute the alignment
+   relative to B.  Otherwise, use the faster strategy of computing the
+   alignment relative to 0.  */
+
+#define __PTR_ALIGN(B, P, A)						      \
+  __BPTR_ALIGN (sizeof (ptrdiff_t) < sizeof (void *) ? (B) : (char *) 0,      \
+                P, A)
+
+#ifndef __attribute_pure__
+# if defined __GNUC_MINOR__ && __GNUC__ * 1000 + __GNUC_MINOR__ >= 2096
+#  define __attribute_pure__ __attribute__ ((__pure__))
+# else
+#  define __attribute_pure__
+# endif
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct _obstack_chunk           /* Lives at front of each chunk. */
+{
+  char *limit;                  /* 1 past end of this chunk */
+  struct _obstack_chunk *prev;  /* address of prior chunk or NULL */
+  char contents[4];             /* objects begin here */
+};
+
+struct obstack          /* control current object in current chunk */
+{
+  _CHUNK_SIZE_T chunk_size;     /* preferred size to allocate chunks in */
+  struct _obstack_chunk *chunk; /* address of current struct obstack_chunk */
+  char *object_base;            /* address of object we are building */
+  char *next_free;              /* where to add next char to current object */
+  char *chunk_limit;            /* address of char after current chunk */
+  union
+  {
+    _OBSTACK_SIZE_T i;
+    void *p;
+  } temp;                       /* Temporary for some macros.  */
+  _OBSTACK_SIZE_T alignment_mask;  /* Mask of alignment for each object. */
+
+  /* These prototypes vary based on 'use_extra_arg'.  */
+  union
+  {
+    void *(*plain) (size_t);
+    void *(*extra) (void *, size_t);
+  } chunkfun;
+  union
+  {
+    void (*plain) (void *);
+    void (*extra) (void *, void *);
+  } freefun;
+
+  void *extra_arg;              /* first arg for chunk alloc/dealloc funcs */
+  unsigned use_extra_arg : 1;     /* chunk alloc/dealloc funcs take extra arg */
+  unsigned maybe_empty_object : 1; /* There is a possibility that the current
+                                      chunk contains a zero-length object.  This
+                                      prevents freeing the chunk if we allocate
+                                      a bigger chunk to replace it. */
+  unsigned alloc_failed : 1;      /* No longer used, as we now call the failed
+                                     handler on error, but retained for binary
+                                     compatibility.  */
+};
+
+/* Declare the external functions we use; they are in obstack.c.  */
+
+extern void _obstack_newchunk (struct obstack *, _OBSTACK_SIZE_T);
+extern void _obstack_free (struct obstack *, void *);
+extern int _obstack_begin (struct obstack *,
+                           _OBSTACK_SIZE_T, _OBSTACK_SIZE_T,
+                           void *(*) (size_t), void (*) (void *));
+extern int _obstack_begin_1 (struct obstack *,
+                             _OBSTACK_SIZE_T, _OBSTACK_SIZE_T,
+                             void *(*) (void *, size_t),
+                             void (*) (void *, void *), void *);
+extern _OBSTACK_SIZE_T _obstack_memory_used (struct obstack *)
+  __attribute_pure__;
+
+
+/* Error handler called when 'obstack_chunk_alloc' failed to allocate
+   more memory.  This can be set to a user defined function which
+   should either abort gracefully or use longjmp - but shouldn't
+   return.  The default action is to print a message and abort.  */
+extern void (*obstack_alloc_failed_handler) (void);
+
+/* Exit value used when 'print_and_abort' is used.  */
+extern int obstack_exit_failure;
+
+/* Pointer to beginning of object being allocated or to be allocated next.
+   Note that this might not be the final address of the object
+   because a new chunk might be needed to hold the final size.  */
+
+#define obstack_base(h) ((void *) (h)->object_base)
+
+/* Size for allocating ordinary chunks.  */
+
+#define obstack_chunk_size(h) ((h)->chunk_size)
+
+/* Pointer to next byte not yet allocated in current chunk.  */
+
+#define obstack_next_free(h) ((void *) (h)->next_free)
+
+/* Mask specifying low bits that should be clear in address of an object.  */
+
+#define obstack_alignment_mask(h) ((h)->alignment_mask)
+
+/* To prevent prototype warnings provide complete argument list.  */
+#define obstack_init(h)							      \
+  _obstack_begin ((h), 0, 0,						      \
+                  _OBSTACK_CAST (void *(*) (size_t), obstack_chunk_alloc),    \
+                  _OBSTACK_CAST (void (*) (void *), obstack_chunk_free))
+
+#define obstack_begin(h, size)						      \
+  _obstack_begin ((h), (size), 0,					      \
+                  _OBSTACK_CAST (void *(*) (size_t), obstack_chunk_alloc), \
+                  _OBSTACK_CAST (void (*) (void *), obstack_chunk_free))
+
+#define obstack_specify_allocation(h, size, alignment, chunkfun, freefun)     \
+  _obstack_begin ((h), (size), (alignment),				      \
+                  _OBSTACK_CAST (void *(*) (size_t), chunkfun),		      \
+                  _OBSTACK_CAST (void (*) (void *), freefun))
+
+#define obstack_specify_allocation_with_arg(h, size, alignment, chunkfun, freefun, arg) \
+  _obstack_begin_1 ((h), (size), (alignment),				      \
+                    _OBSTACK_CAST (void *(*) (void *, size_t), chunkfun),     \
+                    _OBSTACK_CAST (void (*) (void *, void *), freefun), arg)
+
+#define obstack_chunkfun(h, newchunkfun)				      \
+  ((void) ((h)->chunkfun.extra = (void *(*) (void *, size_t)) (newchunkfun)))
+
+#define obstack_freefun(h, newfreefun) /* cast matches freefun.extra (returns void) */ \
+  ((void) ((h)->freefun.extra = (void (*) (void *, void *)) (newfreefun)))
+
+#define obstack_1grow_fast(h, achar) ((void) (*((h)->next_free)++ = (achar)))
+
+#define obstack_blank_fast(h, n) ((void) ((h)->next_free += (n)))
+
+#define obstack_memory_used(h) _obstack_memory_used (h)
+
+#if defined __GNUC__
+# if !defined __GNUC_MINOR__ || __GNUC__ * 1000 + __GNUC_MINOR__ < 2008
+#  define __extension__
+# endif
+
+/* For GNU C, if not -traditional,
+   we can define these macros to compute all args only once
+   without using a global variable.
+   Also, we can avoid using the 'temp' slot, to make faster code.  */
+
+# define obstack_object_size(OBSTACK)					      \
+  __extension__								      \
+    ({ struct obstack const *__o = (OBSTACK);				      \
+       (_OBSTACK_SIZE_T) (__o->next_free - __o->object_base); })
+
+/* The local variable is named __o1 to avoid a shadowed variable
+   warning when invoked from other obstack macros.  */
+# define obstack_room(OBSTACK)						      \
+  __extension__								      \
+    ({ struct obstack const *__o1 = (OBSTACK);				      \
+       (_OBSTACK_SIZE_T) (__o1->chunk_limit - __o1->next_free); })
+
+# define obstack_make_room(OBSTACK, length)				      \
+  __extension__								      \
+    ({ struct obstack *__o = (OBSTACK);					      \
+       _OBSTACK_SIZE_T __len = (length);				      \
+       if (obstack_room (__o) < __len)					      \
+         _obstack_newchunk (__o, __len);				      \
+       (void) 0; })
+
+# define obstack_empty_p(OBSTACK)					      \
+  __extension__								      \
+    ({ struct obstack const *__o = (OBSTACK);				      \
+       (__o->chunk->prev == 0						      \
+        && __o->next_free == __PTR_ALIGN ((char *) __o->chunk,		      \
+                                          __o->chunk->contents,		      \
+                                          __o->alignment_mask)); })
+
+# define obstack_grow(OBSTACK, where, length)				      \
+  __extension__								      \
+    ({ struct obstack *__o = (OBSTACK);					      \
+       _OBSTACK_SIZE_T __len = (length);				      \
+       if (obstack_room (__o) < __len)					      \
+         _obstack_newchunk (__o, __len);				      \
+       memcpy (__o->next_free, where, __len);				      \
+       __o->next_free += __len;						      \
+       (void) 0; })
+
+# define obstack_grow0(OBSTACK, where, length)				      \
+  __extension__								      \
+    ({ struct obstack *__o = (OBSTACK);					      \
+       _OBSTACK_SIZE_T __len = (length);				      \
+       if (obstack_room (__o) < __len + 1)				      \
+         _obstack_newchunk (__o, __len + 1);				      \
+       memcpy (__o->next_free, where, __len);				      \
+       __o->next_free += __len;						      \
+       *(__o->next_free)++ = 0;						      \
+       (void) 0; })
+
+# define obstack_1grow(OBSTACK, datum)					      \
+  __extension__								      \
+    ({ struct obstack *__o = (OBSTACK);					      \
+       if (obstack_room (__o) < 1)					      \
+         _obstack_newchunk (__o, 1);					      \
+       obstack_1grow_fast (__o, datum); })
+
+/* These assume that the obstack alignment is good enough for pointers
+   or ints, and that the data added so far to the current object
+   shares that much alignment.  */
+
+# define obstack_ptr_grow(OBSTACK, datum)				      \
+  __extension__								      \
+    ({ struct obstack *__o = (OBSTACK);					      \
+       if (obstack_room (__o) < sizeof (void *))			      \
+         _obstack_newchunk (__o, sizeof (void *));			      \
+       obstack_ptr_grow_fast (__o, datum); })
+
+# define obstack_int_grow(OBSTACK, datum)				      \
+  __extension__								      \
+    ({ struct obstack *__o = (OBSTACK);					      \
+       if (obstack_room (__o) < sizeof (int))				      \
+         _obstack_newchunk (__o, sizeof (int));				      \
+       obstack_int_grow_fast (__o, datum); })
+
+# define obstack_ptr_grow_fast(OBSTACK, aptr)				      \
+  __extension__								      \
+    ({ struct obstack *__o1 = (OBSTACK);				      \
+       void *__p1 = __o1->next_free;					      \
+       *(const void **) __p1 = (aptr);					      \
+       __o1->next_free += sizeof (const void *);			      \
+       (void) 0; })
+
+# define obstack_int_grow_fast(OBSTACK, aint)				      \
+  __extension__								      \
+    ({ struct obstack *__o1 = (OBSTACK);				      \
+       void *__p1 = __o1->next_free;					      \
+       *(int *) __p1 = (aint);						      \
+       __o1->next_free += sizeof (int);					      \
+       (void) 0; })
+
+# define obstack_blank(OBSTACK, length)					      \
+  __extension__								      \
+    ({ struct obstack *__o = (OBSTACK);					      \
+       _OBSTACK_SIZE_T __len = (length);				      \
+       if (obstack_room (__o) < __len)					      \
+         _obstack_newchunk (__o, __len);				      \
+       obstack_blank_fast (__o, __len); })
+
+# define obstack_alloc(OBSTACK, length)					      \
+  __extension__								      \
+    ({ struct obstack *__h = (OBSTACK);					      \
+       obstack_blank (__h, (length));					      \
+       obstack_finish (__h); })
+
+# define obstack_copy(OBSTACK, where, length)				      \
+  __extension__								      \
+    ({ struct obstack *__h = (OBSTACK);					      \
+       obstack_grow (__h, (where), (length));				      \
+       obstack_finish (__h); })
+
+# define obstack_copy0(OBSTACK, where, length)				      \
+  __extension__								      \
+    ({ struct obstack *__h = (OBSTACK);					      \
+       obstack_grow0 (__h, (where), (length));				      \
+       obstack_finish (__h); })
+
+/* The local variable is named __o1 to avoid a shadowed variable
+   warning when invoked from other obstack macros, typically obstack_free.  */
+# define obstack_finish(OBSTACK)					      \
+  __extension__								      \
+    ({ struct obstack *__o1 = (OBSTACK);				      \
+       void *__value = (void *) __o1->object_base;			      \
+       if (__o1->next_free == __value)					      \
+         __o1->maybe_empty_object = 1;					      \
+       __o1->next_free							      \
+         = __PTR_ALIGN (__o1->object_base, __o1->next_free,		      \
+                        __o1->alignment_mask);				      \
+       if ((size_t) (__o1->next_free - (char *) __o1->chunk)		      \
+           > (size_t) (__o1->chunk_limit - (char *) __o1->chunk))	      \
+         __o1->next_free = __o1->chunk_limit;				      \
+       __o1->object_base = __o1->next_free;				      \
+       __value; })
+
+# define obstack_free(OBSTACK, OBJ)					      \
+  __extension__								      \
+    ({ struct obstack *__o = (OBSTACK);					      \
+       void *__obj = (void *) (OBJ);					      \
+       if (__obj > (void *) __o->chunk && __obj < (void *) __o->chunk_limit)  \
+         __o->next_free = __o->object_base = (char *) __obj;		      \
+       else								      \
+         _obstack_free (__o, __obj); })
+
+#else /* not __GNUC__ */
+
+# define obstack_object_size(h)						      \
+  ((_OBSTACK_SIZE_T) ((h)->next_free - (h)->object_base))
+
+# define obstack_room(h)						      \
+  ((_OBSTACK_SIZE_T) ((h)->chunk_limit - (h)->next_free))
+
+# define obstack_empty_p(h)						      \
+  ((h)->chunk->prev == 0						      \
+   && (h)->next_free == __PTR_ALIGN ((char *) (h)->chunk,		      \
+                                     (h)->chunk->contents,		      \
+                                     (h)->alignment_mask))
+
+/* Note that the call to _obstack_newchunk is enclosed in (..., 0)
+   so that we can avoid having void expressions
+   in the arms of the conditional expression.
+   Casting the third operand to void was tried before,
+   but some compilers won't accept it.  */
+
+# define obstack_make_room(h, length)					      \
+  ((h)->temp.i = (length),						      \
+   ((obstack_room (h) < (h)->temp.i)					      \
+    ? (_obstack_newchunk (h, (h)->temp.i), 0) : 0),			      \
+   (void) 0)
+
+# define obstack_grow(h, where, length)					      \
+  ((h)->temp.i = (length),						      \
+   ((obstack_room (h) < (h)->temp.i)					      \
+   ? (_obstack_newchunk ((h), (h)->temp.i), 0) : 0),			      \
+   memcpy ((h)->next_free, where, (h)->temp.i),				      \
+   (h)->next_free += (h)->temp.i,					      \
+   (void) 0)
+
+# define obstack_grow0(h, where, length)				      \
+  ((h)->temp.i = (length),						      \
+   ((obstack_room (h) < (h)->temp.i + 1)				      \
+   ? (_obstack_newchunk ((h), (h)->temp.i + 1), 0) : 0),		      \
+   memcpy ((h)->next_free, where, (h)->temp.i),				      \
+   (h)->next_free += (h)->temp.i,					      \
+   *((h)->next_free)++ = 0,						      \
+   (void) 0)
+
+# define obstack_1grow(h, datum)					      \
+  (((obstack_room (h) < 1)						      \
+    ? (_obstack_newchunk ((h), 1), 0) : 0),				      \
+   obstack_1grow_fast (h, datum))
+
+# define obstack_ptr_grow(h, datum)					      \
+  (((obstack_room (h) < sizeof (char *))				      \
+    ? (_obstack_newchunk ((h), sizeof (char *)), 0) : 0),		      \
+   obstack_ptr_grow_fast (h, datum))
+
+# define obstack_int_grow(h, datum)					      \
+  (((obstack_room (h) < sizeof (int))					      \
+    ? (_obstack_newchunk ((h), sizeof (int)), 0) : 0),			      \
+   obstack_int_grow_fast (h, datum))
+
+# define obstack_ptr_grow_fast(h, aptr)					      \
+  (((const void **) ((h)->next_free += sizeof (void *)))[-1] = (aptr),	      \
+   (void) 0)
+
+# define obstack_int_grow_fast(h, aint)					      \
+  (((int *) ((h)->next_free += sizeof (int)))[-1] = (aint),		      \
+   (void) 0)
+
+# define obstack_blank(h, length)					      \
+  ((h)->temp.i = (length),						      \
+   ((obstack_room (h) < (h)->temp.i)					      \
+   ? (_obstack_newchunk ((h), (h)->temp.i), 0) : 0),			      \
+   obstack_blank_fast (h, (h)->temp.i))
+
+# define obstack_alloc(h, length)					      \
+  (obstack_blank ((h), (length)), obstack_finish ((h)))
+
+# define obstack_copy(h, where, length)					      \
+  (obstack_grow ((h), (where), (length)), obstack_finish ((h)))
+
+# define obstack_copy0(h, where, length)				      \
+  (obstack_grow0 ((h), (where), (length)), obstack_finish ((h)))
+
+# define obstack_finish(h)						      \
+  (((h)->next_free == (h)->object_base					      \
+    ? (((h)->maybe_empty_object = 1), 0)				      \
+    : 0),								      \
+   (h)->temp.p = (h)->object_base,					      \
+   (h)->next_free							      \
+     = __PTR_ALIGN ((h)->object_base, (h)->next_free,			      \
+                    (h)->alignment_mask),				      \
+   (((size_t) ((h)->next_free - (char *) (h)->chunk)			      \
+     > (size_t) ((h)->chunk_limit - (char *) (h)->chunk))		      \
+   ? ((h)->next_free = (h)->chunk_limit) : 0),				      \
+   (h)->object_base = (h)->next_free,					      \
+   (h)->temp.p)
+
+# define obstack_free(h, obj)						      \
+  ((h)->temp.p = (void *) (obj),					      \
+   (((h)->temp.p > (void *) (h)->chunk					      \
+     && (h)->temp.p < (void *) (h)->chunk_limit)			      \
+    ? (void) ((h)->next_free = (h)->object_base = (char *) (h)->temp.p)       \
+    : _obstack_free ((h), (h)->temp.p)))
+
+#endif /* not __GNUC__ */
+
+#ifdef __cplusplus
+}       /* C++ */
+#endif
+
+#endif /* _OBSTACK_H */
diff --git a/utils/gapy/gen-debug-info-src/ext/os9k.h b/utils/gapy/gen-debug-info-src/ext/os9k.h
new file mode 100644
index 000000000..e8baee17f
--- /dev/null
+++ b/utils/gapy/gen-debug-info-src/ext/os9k.h
@@ -0,0 +1,181 @@
+/* os9k.h  -  OS-9000 i386 module header definitions
+   Copyright 2000 Free Software Foundation, Inc.
+
+This file is part of GNU CC.
+   
+GNU CC is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2, or (at your option)
+any later version.
+
+GNU CC is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with GNU CC; see the file COPYING.  If not, write to
+the Free Software Foundation, 51 Franklin Street - Fifth Floor,
+Boston, MA 02110-1301, USA.  */
+
+#if !defined(_MODULE_H)
+#define _MODULE_H
+
+#define _MPF386
+
+/* Size of the common header (mh_com) less its parity field.  */
+#define N_M_PARITY  (sizeof(mh_com)-sizeof(unsigned short))
+#define OLD_M_PARITY 46
+#define M_PARITY N_M_PARITY
+
+#ifdef _MPF68K
+#define MODSYNC 0x4afc		/* Module header sync code for 680x0 processors.  */
+#endif
+
+#ifdef _MPF386
+#define MODSYNC 0x4afc		/* Module header sync code for 80386 processors.  */
+#endif
+
+#define MODREV	1		/* Module format revision 1.  */
+#define CRCCON	0x800063	/* CRC polynomial constant.  */
+
+/* Module access permission values.  */
+#define MP_OWNER_READ	0x0001
+#define MP_OWNER_WRITE	0x0002
+#define MP_OWNER_EXEC	0x0004
+#define MP_GROUP_READ	0x0010
+#define MP_GROUP_WRITE	0x0020
+#define MP_GROUP_EXEC	0x0040
+#define MP_WORLD_READ	0x0100
+#define MP_WORLD_WRITE	0x0200
+#define MP_WORLD_EXEC	0x0400
+#define MP_WORLD_ACCESS	0x0777
+#define MP_OWNER_MASK	0x000f
+#define MP_GROUP_MASK	0x00f0
+#define MP_WORLD_MASK	0x0f00
+#define MP_SYSTM_MASK	0xf000
+
+/* Module Type/Language values.  */
+#define MT_ANY		0
+#define MT_PROGRAM	0x0001
+#define MT_SUBROUT	0x0002
+#define MT_MULTI	0x0003
+#define MT_DATA		0x0004
+#define MT_TRAPLIB	0x000b
+#define MT_SYSTEM	0x000c
+#define MT_FILEMAN	0x000d
+#define MT_DEVDRVR	0x000e 
+#define MT_DEVDESC	0x000f
+#define MT_MASK		0xff00
+
+#define ML_ANY		0
+#define ML_OBJECT	1
+#define ML_ICODE	2
+#define ML_PCODE	3
+#define ML_CCODE	4
+#define ML_CBLCODE	5
+#define ML_FRTNCODE	6
+#define ML_MASK		0x00ff
+
+#define mktypelang(type, lang)	(((type) << 8) | (lang))
+
+/* Module Attribute values.  */
+#define MA_REENT	0x80
+#define MA_GHOST	0x40
+#define MA_SUPER	0x20
+#define MA_MASK		0xff00
+#define MR_MASK		0x00ff
+
+#define mkattrevs(attr, revs)	(((attr) << 8) | (revs))
+
+#define m_user 		m_owner.grp_usr.usr
+#define m_group 	m_owner.grp_usr.grp
+#define m_group_user	m_owner.group_user
+
+/* Macro definitions for accessing module header fields.  */
+#define MODNAME(mod) ((u_char*)((u_char*)mod + ((Mh_com)mod)->m_name))
+#if 0
+/* Appears not to be used, and the u_int32 typedef is gone (because it
+   conflicted with a Mach header).  */
+#define MODSIZE(mod) ((u_int32)((Mh_com)mod)->m_size)
+#endif /* 0 */
+#define MHCOM_BYTES_SIZE 80
+#define N_BADMAG(a) (((a).a_info) != MODSYNC)
+
+typedef struct mh_com
+{
+  /* Sync bytes ($4afc).  */
+  unsigned char m_sync[2];
+  unsigned char m_sysrev[2];	/* System revision check value.  */
+  unsigned char m_size[4];	/* Module size.  */
+  unsigned char m_owner[4];	/* Group/user id.  */
+  unsigned char m_name[4];	/* Offset to module name.  */
+  unsigned char m_access[2];	/* Access permissions.  */
+  unsigned char m_tylan[2];	/* Type/lang.  */
+  unsigned char m_attrev[2];	/* Rev/attr.  */
+  unsigned char m_edit[2];	/* Edition.  */
+  unsigned char m_needs[4];	/* Module hardware requirements flags. (reserved).  */
+  unsigned char m_usage[4];	/* Comment string offset.  */
+  unsigned char m_symbol[4];	/* Symbol table offset.  */
+  unsigned char m_exec[4];	/* Offset to execution entry point.  */
+  unsigned char m_excpt[4];	/* Offset to exception entry point.  */
+  unsigned char m_data[4];	/* Data storage requirement.  */
+  unsigned char m_stack[4];	/* Stack size.  */
+  unsigned char m_idata[4];	/* Offset to initialized data.  */
+  unsigned char m_idref[4];	/* Offset to data reference lists.  */
+  unsigned char m_init[4];	/* Initialization routine offset.  */
+  unsigned char m_term[4];	/* Termination routine offset.  */
+  unsigned char m_ident[2];	/* Ident code for ident program.  */
+  char          m_spare[8];	/* Reserved bytes.  */
+  unsigned char m_parity[2]; 	/* Header parity.  */
+} mh_com,*Mh_com;
+
+/* Executable memory module.  */
+typedef mh_com *Mh_exec,mh_exec;
+
+/* Data memory module.  */
+typedef mh_com *Mh_data,mh_data;
+
+/* File manager memory module.  */
+typedef mh_com *Mh_fman,mh_fman;
+
+/* Device driver module.  */
+typedef mh_com *Mh_drvr,mh_drvr;
+
+/* Trap handler module.  */
+typedef	mh_com mh_trap, *Mh_trap;
+
+/* Device descriptor module.  */
+typedef	mh_com *Mh_dev,mh_dev;
+
+/* Configuration module.  */
+typedef mh_com *Mh_config, mh_config;
+
+#if 0 
+
+#if !defined(_MODDIR_H)
+/* Go get _os_fmod (and others).  */
+#include <moddir.h>
+#endif
+
+error_code _os_crc (void *, u_int32, int *);
+error_code _os_datmod (char *, u_int32, u_int16 *, u_int16 *, u_int32, void **, mh_data **);
+error_code _os_get_moddir (void *, u_int32 *);
+error_code _os_initdata (mh_com *, void *);
+error_code _os_link (char **, mh_com **, void **, u_int16 *, u_int16 *);
+error_code _os_linkm (mh_com *, void **, u_int16 *, u_int16 *);
+error_code _os_load (char *, mh_com **, void **, u_int32, u_int16 *, u_int16 *, u_int32);
+error_code _os_mkmodule (char *, u_int32, u_int16 *, u_int16 *, u_int32, void **, mh_com **, u_int32);
+error_code _os_modaddr (void *, mh_com **);
+error_code _os_setcrc (mh_com *);
+error_code _os_slink (u_int32, char *, void **, void **, mh_com **);
+error_code _os_slinkm (u_int32, mh_com *, void **, void **);
+error_code _os_unlink (mh_com *);
+error_code _os_unload (char *, u_int32);
+error_code _os_tlink (u_int32, char *, void **, mh_trap **, void *, u_int32);
+error_code _os_tlinkm (u_int32, mh_com *, void **, void *, u_int32);
+error_code _os_iodel (mh_com *);
+error_code _os_vmodul (mh_com *, mh_com *, u_int32);
+#endif /* 0 */
+
+#endif
diff --git a/utils/gapy/gen-debug-info-src/ext/partition.h b/utils/gapy/gen-debug-info-src/ext/partition.h
new file mode 100644
index 000000000..c39873b60
--- /dev/null
+++ b/utils/gapy/gen-debug-info-src/ext/partition.h
@@ -0,0 +1,82 @@
+/* List implementation of a partition of consecutive integers.
+   Copyright (C) 2000, 2001, 2002 Free Software Foundation, Inc.
+   Contributed by CodeSourcery, LLC.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 2, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with GCC; see the file COPYING.  If not, write to
+   the Free Software Foundation, 51 Franklin Street - Fifth Floor,
+   Boston, MA 02110-1301, USA.  */
+
+/* This package implements a partition of consecutive integers.  The
+   elements are partitioned into classes.  Each class is represented
+   by one of its elements, the canonical element, which is chosen
+   arbitrarily from elements in the class.  The principal operations
+   on a partition are FIND, which takes an element, determines its
+   class, and returns the canonical element for that class, and UNION,
+   which unites the two classes that contain two given elements into a
+   single class.
+
+   The list implementation used here provides constant-time finds.  By
+   storing the size of each class with the class's canonical element,
+   it is able to perform unions over all the classes in the partition
+   in O (N log N) time.  */
+
+#ifndef _PARTITION_H
+#define _PARTITION_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif /* __cplusplus */
+
+#include "ansidecl.h"
+#include <stdio.h>
+
+struct partition_elem
+{
+  /* The next element in this class.  Elements in each class form a
+     circular list.  */
+  struct partition_elem* next;
+  /* The canonical element that represents the class containing this
+     element.  */
+  int class_element;
+  /* The number of elements in this class.  Valid only if this is the
+     canonical element for its class.  */
+  unsigned class_count;
+};
+
+typedef struct partition_def 
+{
+  /* The number of elements in this partition.  */
+  int num_elements;
+  /* The elements in the partition.  */
+  struct partition_elem elements[1];
+} *partition;
+
+extern partition partition_new (int);
+extern void partition_delete (partition);
+extern int partition_union (partition, int, int);
+extern void partition_print (partition,	FILE*);
+
+/* Returns the canonical element corresponding to the class containing
+   ELEMENT__ in PARTITION__.  */
+
+#define partition_find(partition__, element__) \
+    ((partition__)->elements[(element__)].class_element)
+
+#ifdef __cplusplus
+}
+#endif /* __cplusplus */
+
+#endif /* _PARTITION_H */
diff --git a/utils/gapy/gen-debug-info-src/ext/plugin-api.h b/utils/gapy/gen-debug-info-src/ext/plugin-api.h
new file mode 100644
index 000000000..a794a3704
--- /dev/null
+++ b/utils/gapy/gen-debug-info-src/ext/plugin-api.h
@@ -0,0 +1,457 @@
+/* plugin-api.h -- External linker plugin API.  */
+
+/* Copyright (C) 2009-2016 Free Software Foundation, Inc.
+   Written by Cary Coutant <ccoutant@google.com>.
+
+   This file is part of binutils.
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston,
+   MA 02110-1301, USA.  */
+
+/* This file defines the interface for writing a linker plugin, which is
+   described at < http://gcc.gnu.org/wiki/whopr/driver >.  */
+
+#ifndef PLUGIN_API_H
+#define PLUGIN_API_H
+
+#ifdef HAVE_STDINT_H
+#include <stdint.h>
+#elif defined(HAVE_INTTYPES_H)
+#include <inttypes.h>
+#endif
+#include <sys/types.h>
+#if !defined(HAVE_STDINT_H) && !defined(HAVE_INTTYPES_H) && \
+    !defined(UINT64_MAX) && !defined(uint64_t)
+#error can not find uint64_t type
+#endif
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+/* Status code returned by most API routines.  */
+
+enum ld_plugin_status
+{
+  LDPS_OK = 0,
+  LDPS_NO_SYMS,         /* Attempt to get symbols that haven't been added. */
+  LDPS_BAD_HANDLE,      /* No claimed object associated with given handle. */
+  LDPS_ERR
+  /* Additional Error codes TBD.  */
+};
+
+/* The version of the API specification.  */
+
+enum ld_plugin_api_version
+{
+  LD_PLUGIN_API_VERSION = 1
+};
+
+/* The type of output file being generated by the linker.  */
+
+enum ld_plugin_output_file_type
+{
+  LDPO_REL,
+  LDPO_EXEC,
+  LDPO_DYN,
+  LDPO_PIE
+};
+
+/* An input file managed by the plugin library.  */
+
+struct ld_plugin_input_file
+{
+  const char *name;
+  int fd;
+  off_t offset;
+  off_t filesize;
+  void *handle;
+};
+
+/* A symbol belonging to an input file managed by the plugin library.  */
+
+struct ld_plugin_symbol
+{
+  char *name;
+  char *version;
+  int def;
+  int visibility;
+  uint64_t size;
+  char *comdat_key;
+  int resolution;
+};
+
+/* An object's section.  */
+
+struct ld_plugin_section
+{
+  const void* handle;
+  unsigned int shndx;
+};
+
+/* Whether the symbol is a definition, reference, or common, weak or not.  */
+
+enum ld_plugin_symbol_kind
+{
+  LDPK_DEF,
+  LDPK_WEAKDEF,
+  LDPK_UNDEF,
+  LDPK_WEAKUNDEF,
+  LDPK_COMMON
+};
+
+/* The visibility of the symbol.  */
+
+enum ld_plugin_symbol_visibility
+{
+  LDPV_DEFAULT,
+  LDPV_PROTECTED,
+  LDPV_INTERNAL,
+  LDPV_HIDDEN
+};
+
+/* How a symbol is resolved.  */
+
+enum ld_plugin_symbol_resolution
+{
+  LDPR_UNKNOWN = 0,
+
+  /* Symbol is still undefined at this point.  */
+  LDPR_UNDEF,
+
+  /* This is the prevailing definition of the symbol, with references from
+     regular object code.  */
+  LDPR_PREVAILING_DEF,
+
+  /* This is the prevailing definition of the symbol, with no
+     references from regular objects.  It is only referenced from IR
+     code.  */
+  LDPR_PREVAILING_DEF_IRONLY,
+
+  /* This definition was pre-empted by a definition in a regular
+     object file.  */
+  LDPR_PREEMPTED_REG,
+
+  /* This definition was pre-empted by a definition in another IR file.  */
+  LDPR_PREEMPTED_IR,
+
+  /* This symbol was resolved by a definition in another IR file.  */
+  LDPR_RESOLVED_IR,
+
+  /* This symbol was resolved by a definition in a regular object
+     linked into the main executable.  */
+  LDPR_RESOLVED_EXEC,
+
+  /* This symbol was resolved by a definition in a shared object.  */
+  LDPR_RESOLVED_DYN,
+
+  /* This is the prevailing definition of the symbol, with no
+     references from regular objects.  It is only referenced from IR
+     code, but the symbol is exported and may be referenced from
+     a dynamic object (not seen at link time).  */
+  LDPR_PREVAILING_DEF_IRONLY_EXP
+};
+
+/* The plugin library's "claim file" handler.  */
+
+typedef
+enum ld_plugin_status
+(*ld_plugin_claim_file_handler) (
+  const struct ld_plugin_input_file *file, int *claimed);
+
+/* The plugin library's "all symbols read" handler.  */
+
+typedef
+enum ld_plugin_status
+(*ld_plugin_all_symbols_read_handler) (void);
+
+/* The plugin library's cleanup handler.  */
+
+typedef
+enum ld_plugin_status
+(*ld_plugin_cleanup_handler) (void);
+
+/* The linker's interface for registering the "claim file" handler.  */
+
+typedef
+enum ld_plugin_status
+(*ld_plugin_register_claim_file) (ld_plugin_claim_file_handler handler);
+
+/* The linker's interface for registering the "all symbols read" handler.  */
+
+typedef
+enum ld_plugin_status
+(*ld_plugin_register_all_symbols_read) (
+  ld_plugin_all_symbols_read_handler handler);
+
+/* The linker's interface for registering the cleanup handler.  */
+
+typedef
+enum ld_plugin_status
+(*ld_plugin_register_cleanup) (ld_plugin_cleanup_handler handler);
+
+/* The linker's interface for adding symbols from a claimed input file.  */
+
+typedef
+enum ld_plugin_status
+(*ld_plugin_add_symbols) (void *handle, int nsyms,
+                          const struct ld_plugin_symbol *syms);
+
+/* The linker's interface for getting the input file information with
+   an open (possibly re-opened) file descriptor.  */
+
+typedef
+enum ld_plugin_status
+(*ld_plugin_get_input_file) (const void *handle,
+                             struct ld_plugin_input_file *file);
+
+typedef
+enum ld_plugin_status
+(*ld_plugin_get_view) (const void *handle, const void **viewp);
+
+/* The linker's interface for releasing the input file.  */
+
+typedef
+enum ld_plugin_status
+(*ld_plugin_release_input_file) (const void *handle);
+
+/* The linker's interface for retrieving symbol resolution information.  */
+
+typedef
+enum ld_plugin_status
+(*ld_plugin_get_symbols) (const void *handle, int nsyms,
+                          struct ld_plugin_symbol *syms);
+
+/* The linker's interface for adding a compiled input file.  */
+
+typedef
+enum ld_plugin_status
+(*ld_plugin_add_input_file) (const char *pathname);
+
+/* The linker's interface for adding a library that should be searched.  */
+
+typedef
+enum ld_plugin_status
+(*ld_plugin_add_input_library) (const char *libname);
+
+/* The linker's interface for adding a library path that should be searched.  */
+
+typedef
+enum ld_plugin_status
+(*ld_plugin_set_extra_library_path) (const char *path);
+
+/* The linker's interface for issuing a warning or error message.  */
+
+typedef
+enum ld_plugin_status
+(*ld_plugin_message) (int level, const char *format, ...);
+
+/* The linker's interface for retrieving the number of sections in an object.
+   The handle is obtained in the claim_file handler.  This interface should
+   only be invoked in the claim_file handler.   This function sets *COUNT to
+   the number of sections in the object.  */
+
+typedef
+enum ld_plugin_status
+(*ld_plugin_get_input_section_count) (const void* handle, unsigned int *count);
+
+/* The linker's interface for retrieving the section type of a specific
+   section in an object.  This interface should only be invoked in the
+   claim_file handler.  This function sets *TYPE to an ELF SHT_xxx value.  */
+
+typedef
+enum ld_plugin_status
+(*ld_plugin_get_input_section_type) (const struct ld_plugin_section section,
+                                     unsigned int *type);
+
+/* The linker's interface for retrieving the name of a specific section in
+   an object. This interface should only be invoked in the claim_file handler.
+   This function sets *SECTION_NAME_PTR to a null-terminated buffer allocated
+   by malloc.  The plugin must free *SECTION_NAME_PTR.  */
+
+typedef
+enum ld_plugin_status
+(*ld_plugin_get_input_section_name) (const struct ld_plugin_section section,
+                                     char **section_name_ptr);
+
+/* The linker's interface for retrieving the contents of a specific section
+   in an object.  This interface should only be invoked in the claim_file
+   handler.  This function sets *SECTION_CONTENTS to point to a buffer that is
+   valid until the claim_file handler returns.  It sets *LEN to the size of
+   the buffer.  */
+
+typedef
+enum ld_plugin_status
+(*ld_plugin_get_input_section_contents) (const struct ld_plugin_section section,
+                                         const unsigned char **section_contents,
+                                         size_t* len);
+
+/* The linker's interface for specifying the desired order of sections.
+   The sections should be specified using the array SECTION_LIST in the
+   order in which they should appear in the final layout.  NUM_SECTIONS
+   specifies the number of entries in that array.  This should be invoked
+   in the all_symbols_read handler.  */
+
+typedef
+enum ld_plugin_status
+(*ld_plugin_update_section_order) (const struct ld_plugin_section *section_list,
+				   unsigned int num_sections);
+
+/* The linker's interface for specifying that reordering of sections is
+   desired so that the linker can prepare for it.  This should be invoked
+   before update_section_order, preferably in the claim_file handler.  */
+
+typedef
+enum ld_plugin_status
+(*ld_plugin_allow_section_ordering) (void);
+
+/* The linker's interface for specifying that a subset of sections is
+   to be mapped to a unique segment.  If the plugin wants to call
+   unique_segment_for_sections, it must call this function from a
+   claim_file_handler or when it is first loaded.  */
+
+typedef
+enum ld_plugin_status
+(*ld_plugin_allow_unique_segment_for_sections) (void);
+
+/* The linker's interface for specifying that a specific set of sections
+   must be mapped to a unique segment.  ELF segments do not have names
+   and the NAME is used as the name of the newly created output section
+   that is then placed in the unique PT_LOAD segment.  FLAGS is used to
+   specify if any additional segment flags need to be set.  For instance,
+   a specific segment flag can be set to identify this segment.  Unsetting
+   segment flags that would be set by default is not possible.  The
+   parameter SEGMENT_ALIGNMENT when non-zero will override the default.  */
+
+typedef
+enum ld_plugin_status
+(*ld_plugin_unique_segment_for_sections) (
+    const char* segment_name,
+    uint64_t segment_flags,
+    uint64_t segment_alignment,
+    const struct ld_plugin_section * section_list,
+    unsigned int num_sections);
+
+/* The linker's interface for retrieving the section alignment requirement
+   of a specific section in an object.  This interface should only be invoked in the
+   claim_file handler.  This function sets *ADDRALIGN to the ELF sh_addralign
+   value of the input section.  */
+
+typedef
+enum ld_plugin_status
+(*ld_plugin_get_input_section_alignment) (const struct ld_plugin_section section,
+                                          unsigned int *addralign);
+
+/* The linker's interface for retrieving the section size of a specific section
+   in an object.  This interface should only be invoked in the claim_file handler.
+   This function sets *SECSIZE to the ELF sh_size
+   value of the input section.  */
+
+typedef
+enum ld_plugin_status
+(*ld_plugin_get_input_section_size) (const struct ld_plugin_section section,
+                                     uint64_t *secsize);
+
+enum ld_plugin_level
+{
+  LDPL_INFO,
+  LDPL_WARNING,
+  LDPL_ERROR,
+  LDPL_FATAL
+};
+
+/* Values for the tv_tag field of the transfer vector.  */
+
+enum ld_plugin_tag
+{
+  LDPT_NULL = 0,
+  LDPT_API_VERSION = 1,
+  LDPT_GOLD_VERSION = 2,
+  LDPT_LINKER_OUTPUT = 3,
+  LDPT_OPTION = 4,
+  LDPT_REGISTER_CLAIM_FILE_HOOK = 5,
+  LDPT_REGISTER_ALL_SYMBOLS_READ_HOOK = 6,
+  LDPT_REGISTER_CLEANUP_HOOK = 7,
+  LDPT_ADD_SYMBOLS = 8,
+  LDPT_GET_SYMBOLS = 9,
+  LDPT_ADD_INPUT_FILE = 10,
+  LDPT_MESSAGE = 11,
+  LDPT_GET_INPUT_FILE = 12,
+  LDPT_RELEASE_INPUT_FILE = 13,
+  LDPT_ADD_INPUT_LIBRARY = 14,
+  LDPT_OUTPUT_NAME = 15,
+  LDPT_SET_EXTRA_LIBRARY_PATH = 16,
+  LDPT_GNU_LD_VERSION = 17,
+  LDPT_GET_VIEW = 18,
+  LDPT_GET_INPUT_SECTION_COUNT = 19,
+  LDPT_GET_INPUT_SECTION_TYPE = 20,
+  LDPT_GET_INPUT_SECTION_NAME = 21,
+  LDPT_GET_INPUT_SECTION_CONTENTS = 22,
+  LDPT_UPDATE_SECTION_ORDER = 23,
+  LDPT_ALLOW_SECTION_ORDERING = 24,
+  LDPT_GET_SYMBOLS_V2 = 25,
+  LDPT_ALLOW_UNIQUE_SEGMENT_FOR_SECTIONS = 26,
+  LDPT_UNIQUE_SEGMENT_FOR_SECTIONS = 27,
+  LDPT_GET_SYMBOLS_V3 = 28,
+  LDPT_GET_INPUT_SECTION_ALIGNMENT = 29,
+  LDPT_GET_INPUT_SECTION_SIZE = 30
+};
+
+/* The plugin transfer vector.  */
+
+struct ld_plugin_tv
+{
+  enum ld_plugin_tag tv_tag;
+  union
+  {
+    int tv_val;
+    const char *tv_string;
+    ld_plugin_register_claim_file tv_register_claim_file;
+    ld_plugin_register_all_symbols_read tv_register_all_symbols_read;
+    ld_plugin_register_cleanup tv_register_cleanup;
+    ld_plugin_add_symbols tv_add_symbols;
+    ld_plugin_get_symbols tv_get_symbols;
+    ld_plugin_add_input_file tv_add_input_file;
+    ld_plugin_message tv_message;
+    ld_plugin_get_input_file tv_get_input_file;
+    ld_plugin_get_view tv_get_view;
+    ld_plugin_release_input_file tv_release_input_file;
+    ld_plugin_add_input_library tv_add_input_library;
+    ld_plugin_set_extra_library_path tv_set_extra_library_path;
+    ld_plugin_get_input_section_count tv_get_input_section_count;
+    ld_plugin_get_input_section_type tv_get_input_section_type;
+    ld_plugin_get_input_section_name tv_get_input_section_name;
+    ld_plugin_get_input_section_contents tv_get_input_section_contents;
+    ld_plugin_update_section_order tv_update_section_order;
+    ld_plugin_allow_section_ordering tv_allow_section_ordering;
+    ld_plugin_allow_unique_segment_for_sections tv_allow_unique_segment_for_sections; 
+    ld_plugin_unique_segment_for_sections tv_unique_segment_for_sections;
+    ld_plugin_get_input_section_alignment tv_get_input_section_alignment;
+    ld_plugin_get_input_section_size tv_get_input_section_size;
+  } tv_u;
+};
+
+/* The plugin library's "onload" entry point.  */
+
+typedef
+enum ld_plugin_status
+(*ld_plugin_onload) (struct ld_plugin_tv *tv);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* !defined(PLUGIN_API_H) */
diff --git a/utils/gapy/gen-debug-info-src/ext/progress.h b/utils/gapy/gen-debug-info-src/ext/progress.h
new file mode 100644
index 000000000..80ffbe24a
--- /dev/null
+++ b/utils/gapy/gen-debug-info-src/ext/progress.h
@@ -0,0 +1,38 @@
+/* Default definitions for progress macros.
+   Copyright 1994, 2010 Free Software Foundation, Inc.
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston,
+   MA 02110-1301, USA.  */
+
+/* The default definitions below are intended to be replaced by real
+   definitions, if building the tools for an interactive programming
+   environment.  */
+
+#ifndef _PROGRESS_H
+#define _PROGRESS_H
+
+#ifndef START_PROGRESS
+#define START_PROGRESS(STR,N)
+#endif
+
+#ifndef PROGRESS
+#define PROGRESS(X)
+#endif
+
+#ifndef END_PROGRESS
+#define END_PROGRESS(STR)
+#endif
+
+#endif /* _PROGRESS_H */
diff --git a/utils/gapy/gen-debug-info-src/ext/safe-ctype.h b/utils/gapy/gen-debug-info-src/ext/safe-ctype.h
new file mode 100644
index 000000000..a6d163e6e
--- /dev/null
+++ b/utils/gapy/gen-debug-info-src/ext/safe-ctype.h
@@ -0,0 +1,150 @@
+/* <ctype.h> replacement macros.
+
+   Copyright (C) 2000-2015 Free Software Foundation, Inc.
+   Contributed by Zack Weinberg <zackw@stanford.edu>.
+
+This file is part of the libiberty library.
+Libiberty is free software; you can redistribute it and/or
+modify it under the terms of the GNU Library General Public
+License as published by the Free Software Foundation; either
+version 2 of the License, or (at your option) any later version.
+
+Libiberty is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+Library General Public License for more details.
+
+You should have received a copy of the GNU Library General Public
+License along with libiberty; see the file COPYING.LIB.  If
+not, write to the Free Software Foundation, Inc., 51 Franklin Street - Fifth Floor,
+Boston, MA 02110-1301, USA.  */
+
+/* This is a compatible replacement of the standard C library's <ctype.h>
+   with the following properties:
+
+   - Implements all isxxx() macros required by C99.
+   - Also implements some character classes useful when
+     parsing C-like languages.
+   - Does not change behavior depending on the current locale.
+   - Behaves properly for all values in the range of a signed or
+     unsigned char.
+
+   To avoid conflicts, this header defines the isxxx functions in upper
+   case, e.g. ISALPHA not isalpha.  */
+
+#ifndef SAFE_CTYPE_H
+#define SAFE_CTYPE_H
+
+/* Determine host character set.  */
+#define HOST_CHARSET_UNKNOWN 0
+#define HOST_CHARSET_ASCII   1
+#define HOST_CHARSET_EBCDIC  2
+
+#if  '\n' == 0x0A && ' ' == 0x20 && '0' == 0x30 \
+   && 'A' == 0x41 && 'a' == 0x61 && '!' == 0x21
+#  define HOST_CHARSET HOST_CHARSET_ASCII
+#else
+# if '\n' == 0x15 && ' ' == 0x40 && '0' == 0xF0 \
+   && 'A' == 0xC1 && 'a' == 0x81 && '!' == 0x5A
+#  define HOST_CHARSET HOST_CHARSET_EBCDIC
+# else
+#  define HOST_CHARSET HOST_CHARSET_UNKNOWN
+# endif
+#endif
+
+/* Categories.  */
+
+enum {
+  /* In C99 */
+  _sch_isblank  = 0x0001,	/* space \t */
+  _sch_iscntrl  = 0x0002,	/* nonprinting characters */
+  _sch_isdigit  = 0x0004,	/* 0-9 */
+  _sch_islower  = 0x0008,	/* a-z */
+  _sch_isprint  = 0x0010,	/* any printing character including ' ' */
+  _sch_ispunct  = 0x0020,	/* all punctuation */
+  _sch_isspace  = 0x0040,	/* space \t \n \r \f \v */
+  _sch_isupper  = 0x0080,	/* A-Z */
+  _sch_isxdigit = 0x0100,	/* 0-9A-Fa-f */
+
+  /* Extra categories useful to cpplib.  */
+  _sch_isidst	= 0x0200,	/* A-Za-z_ */
+  _sch_isvsp    = 0x0400,	/* \n \r */
+  _sch_isnvsp   = 0x0800,	/* space \t \f \v \0 */
+
+  /* Combinations of the above.  */
+  _sch_isalpha  = _sch_isupper|_sch_islower,	/* A-Za-z */
+  _sch_isalnum  = _sch_isalpha|_sch_isdigit,	/* A-Za-z0-9 */
+  _sch_isidnum  = _sch_isidst|_sch_isdigit,	/* A-Za-z0-9_ */
+  _sch_isgraph  = _sch_isalnum|_sch_ispunct,	/* isprint and not space */
+  _sch_iscppsp  = _sch_isvsp|_sch_isnvsp,	/* isspace + \0 */
+  _sch_isbasic  = _sch_isprint|_sch_iscppsp     /* basic charset of ISO C
+						   (plus ` and @)  */
+};
+
+/* Character classification.  */
+extern const unsigned short _sch_istable[256];
+
+#define _sch_test(c, bit) (_sch_istable[(c) & 0xff] & (unsigned short)(bit))
+
+#define ISALPHA(c)  _sch_test(c, _sch_isalpha)
+#define ISALNUM(c)  _sch_test(c, _sch_isalnum)
+#define ISBLANK(c)  _sch_test(c, _sch_isblank)
+#define ISCNTRL(c)  _sch_test(c, _sch_iscntrl)
+#define ISDIGIT(c)  _sch_test(c, _sch_isdigit)
+#define ISGRAPH(c)  _sch_test(c, _sch_isgraph)
+#define ISLOWER(c)  _sch_test(c, _sch_islower)
+#define ISPRINT(c)  _sch_test(c, _sch_isprint)
+#define ISPUNCT(c)  _sch_test(c, _sch_ispunct)
+#define ISSPACE(c)  _sch_test(c, _sch_isspace)
+#define ISUPPER(c)  _sch_test(c, _sch_isupper)
+#define ISXDIGIT(c) _sch_test(c, _sch_isxdigit)
+
+#define ISIDNUM(c)	_sch_test(c, _sch_isidnum)
+#define ISIDST(c)	_sch_test(c, _sch_isidst)
+#define IS_ISOBASIC(c)	_sch_test(c, _sch_isbasic)
+#define IS_VSPACE(c)	_sch_test(c, _sch_isvsp)
+#define IS_NVSPACE(c)	_sch_test(c, _sch_isnvsp)
+#define IS_SPACE_OR_NUL(c)	_sch_test(c, _sch_iscppsp)
+
+/* Character transformation.  */
+extern const unsigned char  _sch_toupper[256];
+extern const unsigned char  _sch_tolower[256];
+#define TOUPPER(c) _sch_toupper[(c) & 0xff]
+#define TOLOWER(c) _sch_tolower[(c) & 0xff]
+
+/* Prevent the users of safe-ctype.h from accidentally using the routines
+   from ctype.h.  Initially, the approach was to produce an error when
+   detecting that ctype.h has been included.  But this was causing
+   trouble as ctype.h might get indirectly included as a result of
+   including another system header (for instance gnulib's stdint.h).
+   So we include ctype.h here and then immediately redefine its macros.  */
+
+#include <ctype.h>
+#undef isalpha
+#define isalpha(c) do_not_use_isalpha_with_safe_ctype
+#undef isalnum
+#define isalnum(c) do_not_use_isalnum_with_safe_ctype
+#undef iscntrl
+#define iscntrl(c) do_not_use_iscntrl_with_safe_ctype
+#undef isdigit
+#define isdigit(c) do_not_use_isdigit_with_safe_ctype
+#undef isgraph
+#define isgraph(c) do_not_use_isgraph_with_safe_ctype
+#undef islower
+#define islower(c) do_not_use_islower_with_safe_ctype
+#undef isprint
+#define isprint(c) do_not_use_isprint_with_safe_ctype
+#undef ispunct
+#define ispunct(c) do_not_use_ispunct_with_safe_ctype
+#undef isspace
+#define isspace(c) do_not_use_isspace_with_safe_ctype
+#undef isupper
+#define isupper(c) do_not_use_isupper_with_safe_ctype
+#undef isxdigit
+#define isxdigit(c) do_not_use_isxdigit_with_safe_ctype
+#undef toupper
+#define toupper(c) do_not_use_toupper_with_safe_ctype
+#undef tolower
+#define tolower(c) do_not_use_tolower_with_safe_ctype
+
+#endif /* SAFE_CTYPE_H */
diff --git a/utils/gapy/gen-debug-info-src/ext/sha1.h b/utils/gapy/gen-debug-info-src/ext/sha1.h
new file mode 100644
index 000000000..24f3ab79d
--- /dev/null
+++ b/utils/gapy/gen-debug-info-src/ext/sha1.h
@@ -0,0 +1,145 @@
+/* Declarations of functions and data types used for SHA1 sum
+   library functions.
+   Copyright (C) 2000-2015 Free Software Foundation, Inc.
+
+   This program is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by the
+   Free Software Foundation; either version 3, or (at your option) any
+   later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software Foundation,
+   Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.  */
+
+#ifndef SHA1_H
+# define SHA1_H 1
+
+#include <stdio.h>
+
+#if defined HAVE_LIMITS_H || _LIBC
+# include <limits.h>
+#endif
+
+#include "ansidecl.h"
+
+/* The following contortions are an attempt to use the C preprocessor
+   to determine an unsigned integral type that is 32 bits wide.  An
+   alternative approach is to use autoconf's AC_CHECK_SIZEOF macro, but
+   doing that would require that the configure script compile and *run*
+   the resulting executable.  Locally running cross-compiled executables
+   is usually not possible.  */
+
+#ifdef _LIBC
+# include <sys/types.h>
+typedef u_int32_t sha1_uint32;
+typedef uintptr_t sha1_uintptr;
+#elif defined (HAVE_SYS_TYPES_H) && defined (HAVE_STDINT_H)
+#include <stdint.h>
+#include <sys/types.h>
+typedef uint32_t sha1_uint32;
+typedef uintptr_t sha1_uintptr;
+#else
+#  define INT_MAX_32_BITS 2147483647
+
+/* If INT_MAX isn't defined, assume it's a 32-bit type.
+   This should be valid for all systems GNU cares about because
+   that doesn't include 16-bit systems, and only modern systems
+   (that certainly have <limits.h>) have 64+-bit integral types.  */
+
+# ifndef INT_MAX
+#  define INT_MAX INT_MAX_32_BITS
+# endif
+
+# if INT_MAX == INT_MAX_32_BITS
+   typedef unsigned int sha1_uint32;
+# else
+#  if SHRT_MAX == INT_MAX_32_BITS
+    typedef unsigned short sha1_uint32;
+#  else
+#   if LONG_MAX == INT_MAX_32_BITS
+     typedef unsigned long sha1_uint32;
+#   else
+     /* The following line is intended to evoke an error.
+        Using #error is not portable enough.  */
+     "Cannot determine unsigned 32-bit data type."
+#   endif
+#  endif
+# endif
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Structure to save state of computation between the single steps.  */
+struct sha1_ctx
+{
+  sha1_uint32 A;
+  sha1_uint32 B;
+  sha1_uint32 C;
+  sha1_uint32 D;
+  sha1_uint32 E;
+
+  sha1_uint32 total[2];
+  sha1_uint32 buflen;
+  sha1_uint32 buffer[32];
+};
+
+
+/* Initialize structure containing state of computation. */
+extern void sha1_init_ctx (struct sha1_ctx *ctx);
+
+/* Starting with the result of former calls of this function (or the
+   initialization function), update the context for the next LEN bytes
+   starting at BUFFER.
+   It is necessary that LEN is a multiple of 64!!! */
+extern void sha1_process_block (const void *buffer, size_t len,
+				struct sha1_ctx *ctx);
+
+/* Starting with the result of former calls of this function (or the
+   initialization function), update the context for the next LEN bytes
+   starting at BUFFER.
+   It is NOT required that LEN is a multiple of 64.  */
+extern void sha1_process_bytes (const void *buffer, size_t len,
+				struct sha1_ctx *ctx);
+
+/* Process the remaining bytes in the buffer and put the result from CTX
+   in the first 20 bytes following RESBUF.  The result is always in little
+   endian byte order, so that a byte-wise output yields the wanted
+   ASCII representation of the message digest.
+
+   IMPORTANT: On some systems it is required that RESBUF be correctly
+   aligned for a 32-bit value.  */
+extern void *sha1_finish_ctx (struct sha1_ctx *ctx, void *resbuf);
+
+
+/* Put the result from CTX in the first 20 bytes following RESBUF.  The
+   result is always in little endian byte order, so that a byte-wise output
+   yields the wanted ASCII representation of the message digest.
+
+   IMPORTANT: On some systems it is required that RESBUF be correctly
+   aligned for a 32-bit value.  */
+extern void *sha1_read_ctx (const struct sha1_ctx *ctx, void *resbuf);
+
+
+/* Compute SHA1 message digest for bytes read from STREAM.  The
+   resulting message digest number will be written into the 20 bytes
+   beginning at RESBLOCK.  */
+extern int sha1_stream (FILE *stream, void *resblock);
+
+/* Compute SHA1 message digest for LEN bytes beginning at BUFFER.  The
+   result is always in little endian byte order, so that a byte-wise
+   output yields the wanted ASCII representation of the message
+   digest.  */
+extern void *sha1_buffer (const char *buffer, size_t len, void *resblock);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/utils/gapy/gen-debug-info-src/ext/simple-object.h b/utils/gapy/gen-debug-info-src/ext/simple-object.h
new file mode 100644
index 000000000..5bd3d06c1
--- /dev/null
+++ b/utils/gapy/gen-debug-info-src/ext/simple-object.h
@@ -0,0 +1,204 @@
+/* simple-object.h -- simple routines to read and write object files
+   Copyright (C) 2010-2015 Free Software Foundation, Inc.
+   Written by Ian Lance Taylor, Google.
+
+This program is free software; you can redistribute it and/or modify it
+under the terms of the GNU General Public License as published by the
+Free Software Foundation; either version 2, or (at your option) any
+later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, 51 Franklin Street - Fifth Floor,
+Boston, MA 02110-1301, USA.  */
+
+#ifndef SIMPLE_OBJECT_H
+#define SIMPLE_OBJECT_H
+
+#include <stddef.h>
+#include <sys/types.h>
+
+#ifdef HAVE_UNISTD_H
+#include <unistd.h>
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* This header file provides four types with associated functions.
+   They are used to read and write object files.  This is a minimal
+   interface, intended to support the needs of gcc without bringing in
+   all the power and complexity of BFD.  */
+
+/* The type simple_object_read * is used to read an existing object
+   file.  */
+
+typedef struct simple_object_read_struct simple_object_read;
+
+/* Create a simple_object_read given DESCRIPTOR, an open file
+   descriptor, and OFFSET, an offset within the file.  The offset is
+   for use with archives, and should be 0 for an ordinary object file.
+   The descriptor must remain open until done with the returned
+   simple_object_read.  SEGMENT_NAME is used on Mach-O and is required
+   on that platform: it means to only look at sections within the
+   segment with that name.  It is ignored for other object file
+   formats.  On error, this function returns NULL, and sets *ERRMSG to
+   an error string and sets *ERR to an errno value or 0 if there is no
+   relevant errno.  */
+
+extern simple_object_read *
+simple_object_start_read (int descriptor, off_t offset,
+			  const char *segment_name, const char **errmsg,
+			  int *err);
+
+/* Call PFN for each section in SIMPLE_OBJECT, passing it the section
+   name, offset within the file of the section contents, and length of
+   the section contents.  The offset within the file is relative to
+   the offset passed to simple_object_start_read.  The DATA argument
+   to simple_object_find_sections is passed on to PFN.  If PFN returns
+   0, the loop is stopped and simple_object_find_sections returns.  If
+   PFN returns non-zero, the loop continues.  On success this returns
+   NULL.  On error it returns an error string, and sets *ERR to an
+   errno value or 0 if there is no relevant errno.  */
+
+extern const char *
+simple_object_find_sections (simple_object_read *simple_object,
+			     int (*pfn) (void *data, const char *,
+					 off_t offset, off_t length),
+			     void *data,
+			     int *err);
+
+/* Look for the section NAME in SIMPLE_OBJECT.  This returns
+   information for the first section NAME in SIMPLE_OBJECT.  Note that
+   calling this multiple times is inefficient; use
+   simple_object_find_sections instead.
+
+   If found, return 1 and set *OFFSET to the offset in the file of the
+   section contents and set *LENGTH to the length of the section
+   contents.  *OFFSET will be relative to the offset passed to
+   simple_object_start_read.
+
+   If the section is not found, and no error occurs, return 0 and set
+   *ERRMSG to NULL.
+
+   If an error occurs, return 0, set *ERRMSG to an error message, and
+   set *ERR to an errno value or 0 if there is no relevant errno.  */
+
+extern int
+simple_object_find_section (simple_object_read *simple_object,
+			    const char *name, off_t *offset, off_t *length,
+			    const char **errmsg, int *err);
+
+/* Release all resources associated with SIMPLE_OBJECT.  This does not
+   close the file descriptor.  */
+
+extern void
+simple_object_release_read (simple_object_read *);
+
+/* The type simple_object_attributes holds the attributes of an object
+   file that matter for creating a file or ensuring that two files are
+   compatible.  This is a set of magic numbers.  */
+
+typedef struct simple_object_attributes_struct simple_object_attributes;
+
+/* Fetch the attributes of SIMPLE_OBJECT.  This information will
+   persist until simple_object_release_attributes is called, even if
+   SIMPLE_OBJECT is closed.  On error this returns NULL, sets *ERRMSG
+   to an error message, and sets *ERR to an errno value or 0 if there
+   isn't one.  */
+
+extern simple_object_attributes *
+simple_object_fetch_attributes (simple_object_read *simple_object,
+				const char **errmsg, int *err);
+
+/* Merge the FROM attributes into TO.  If two objects with these
+   attributes could be linked together without error, returns NULL.
+   Otherwise, returns an error message, and sets *ERR to an errno
+   value or 0 if there isn't one.  */
+
+extern const char *
+simple_object_attributes_merge (simple_object_attributes *to,
+				simple_object_attributes *from,
+				int *err);
+
+/* Release all resources associated with ATTRS.  */
+
+extern void
+simple_object_release_attributes (simple_object_attributes *attrs);
+
+/* The type simple_object_write is used to create a new object file.  */
+
+typedef struct simple_object_write_struct simple_object_write;
+
+/* Start creating a new object file which is like ATTRS.  You must
+   fetch attribute information from an existing object file before you
+   can create a new one.  There is currently no support for creating
+   an object file de novo.  The segment name is only used on Mach-O,
+   where it is required.  It means that all sections are created
+   within that segment.  It is ignored for other object file formats.
+   On error this function returns NULL, sets *ERRMSG to an error
+   message, and sets *ERR to an errno value or 0 if there isn't
+   one.  */
+
+extern simple_object_write *
+simple_object_start_write (simple_object_attributes *attrs,
+			   const char *segment_name,
+			   const char **errmsg, int *err);
+
+/* The type simple_object_write_section is a handle for a section
+   which is being written.  */
+
+typedef struct simple_object_write_section_struct simple_object_write_section;
+
+/* Add a section to SIMPLE_OBJECT.  NAME is the name of the new
+   section.  ALIGN is the required alignment expressed as the number
+   of required low-order 0 bits (e.g., 2 for alignment to a 32-bit
+   boundary).  The section is created as containing data, readable,
+   not writable, not executable, not loaded at runtime.  On error this
+   returns NULL, sets *ERRMSG to an error message, and sets *ERR to an
+   errno value or 0 if there isn't one.  */
+
+extern simple_object_write_section *
+simple_object_write_create_section (simple_object_write *simple_object,
+				    const char *name, unsigned int align,
+				    const char **errmsg, int *err);
+
+/* Add data BUFFER/SIZE to SECTION in SIMPLE_OBJECT.  If COPY is
+   non-zero, the data will be copied into memory if necessary.  If
+   COPY is zero, BUFFER must persist until SIMPLE_OBJECT is released.
+   On success this returns NULL.  On error this returns an error
+   message, and sets *ERR to an errno value or 0 if there isn't
+   one.  */
+
+extern const char *
+simple_object_write_add_data (simple_object_write *simple_object,
+			      simple_object_write_section *section,
+			      const void *buffer, size_t size,
+			      int copy, int *err);
+
+/* Write the complete object file to DESCRIPTOR, an open file
+   descriptor.  This returns NULL on success.  On error this returns
+   an error message, and sets *ERR to an errno value or 0 if there
+   isn't one.  */
+
+extern const char *
+simple_object_write_to_file (simple_object_write *simple_object,
+			     int descriptor, int *err);
+
+/* Release all resources associated with SIMPLE_OBJECT, including any
+   simple_object_write_section's that may have been created.  */
+
+extern void
+simple_object_release_write (simple_object_write *);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/utils/gapy/gen-debug-info-src/ext/sort.h b/utils/gapy/gen-debug-info-src/ext/sort.h
new file mode 100644
index 000000000..23025d431
--- /dev/null
+++ b/utils/gapy/gen-debug-info-src/ext/sort.h
@@ -0,0 +1,48 @@
+/* Sorting algorithms.
+   Copyright (C) 2000-2015 Free Software Foundation, Inc.
+   Contributed by Mark Mitchell <mark@codesourcery.com>.
+
+This file is part of GCC.
+   
+GCC is free software; you can redistribute it and/or modify it
+under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2, or (at your option)
+any later version.
+
+GCC is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with GCC; see the file COPYING.  If not, write to
+the Free Software Foundation, 51 Franklin Street - Fifth Floor,
+Boston, MA 02110-1301, USA.  */
+
+#ifndef SORT_H
+#define SORT_H
+
+#include <sys/types.h> /* For size_t */
+#ifdef __STDC__
+#include <stddef.h>
+#endif	/* __STDC__ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif /* __cplusplus */
+
+#include "ansidecl.h"
+
+/* Sort an array of pointers.  */
+
+extern void sort_pointers (size_t, void **, void **);
+
+#ifdef __cplusplus
+}
+#endif /* __cplusplus */
+
+#endif /* SORT_H */
+
+
+   
+   
diff --git a/utils/gapy/gen-debug-info-src/ext/splay-tree.h b/utils/gapy/gen-debug-info-src/ext/splay-tree.h
new file mode 100644
index 000000000..f71d7d7b6
--- /dev/null
+++ b/utils/gapy/gen-debug-info-src/ext/splay-tree.h
@@ -0,0 +1,156 @@
+/* A splay-tree datatype.  
+   Copyright (C) 1998-2015 Free Software Foundation, Inc.
+   Contributed by Mark Mitchell (mark@markmitchell.com).
+
+   This file is part of GCC.
+   
+   GCC is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 2, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with GCC; see the file COPYING.  If not, write to
+   the Free Software Foundation, 51 Franklin Street - Fifth Floor,
+   Boston, MA 02110-1301, USA.  */
+
+/* For an easily readable description of splay-trees, see:
+
+     Lewis, Harry R. and Denenberg, Larry.  Data Structures and Their
+     Algorithms.  Harper-Collins, Inc.  1991.  
+
+   The major feature of splay trees is that all basic tree operations
+   are amortized O(log n) time for a tree with n nodes.  */
+
+#ifndef _SPLAY_TREE_H
+#define _SPLAY_TREE_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif /* __cplusplus */
+
+#include "ansidecl.h"
+
+#ifdef HAVE_STDINT_H
+#include <stdint.h>
+#endif
+#ifdef HAVE_INTTYPES_H
+#include <inttypes.h>
+#endif
+
+/* Use typedefs for the key and data types to facilitate changing
+   these types, if necessary.  These types should be sufficiently wide
+   that any pointer or scalar can be cast to these types, and then
+   cast back, without loss of precision.  */
+typedef uintptr_t splay_tree_key;
+typedef uintptr_t splay_tree_value;
+
+/* Forward declaration for a node in the tree.  */
+typedef struct splay_tree_node_s *splay_tree_node;
+
+/* The type of a function which compares two splay-tree keys.  The
+   function should return values as for qsort.  */
+typedef int (*splay_tree_compare_fn) (splay_tree_key, splay_tree_key);
+
+/* The type of a function used to deallocate any resources associated
+   with the key.  */
+typedef void (*splay_tree_delete_key_fn) (splay_tree_key);
+
+/* The type of a function used to deallocate any resources associated
+   with the value.  */
+typedef void (*splay_tree_delete_value_fn) (splay_tree_value);
+
+/* The type of a function used to iterate over the tree.  */
+typedef int (*splay_tree_foreach_fn) (splay_tree_node, void*);
+
+/* The type of a function used to allocate memory for tree root and
+   node structures.  The first argument is the number of bytes needed;
+   the second is a data pointer the splay tree functions pass through
+   to the allocator.  This function must never return zero.  */
+typedef void *(*splay_tree_allocate_fn) (int, void *);
+
+/* The type of a function used to free memory allocated using the
+   corresponding splay_tree_allocate_fn.  The first argument is the
+   memory to be freed; the latter is a data pointer the splay tree
+   functions pass through to the freer.  */
+typedef void (*splay_tree_deallocate_fn) (void *, void *);
+
+/* One node of the splay tree: a key/value pair and its child links.  */
+struct splay_tree_node_s {
+  /* The key under which this node is stored.  */
+  splay_tree_key key;
+
+  /* The value associated with KEY.  */
+  splay_tree_value value;
+
+  /* The left and right children, respectively.  */
+  splay_tree_node left;
+  splay_tree_node right;
+};
+
+/* The splay tree itself: root node plus comparison and memory hooks.  */
+struct splay_tree_s {
+  /* The root of the tree.  */
+  splay_tree_node root;
+
+  /* The comparison function used to order keys.  */
+  splay_tree_compare_fn comp;
+
+  /* The deallocate-key function.  NULL if no cleanup is necessary.  */
+  splay_tree_delete_key_fn delete_key;
+
+  /* The deallocate-value function.  NULL if no cleanup is necessary.  */
+  splay_tree_delete_value_fn delete_value;
+
+  /* Node allocate function.  Takes allocate_data as a parameter.  */
+  splay_tree_allocate_fn allocate;
+
+  /* Free function for nodes and trees.  Takes allocate_data as a parameter.  */
+  splay_tree_deallocate_fn deallocate;
+
+  /* Parameter for allocate/free functions.  */
+  void *allocate_data;
+};
+
+typedef struct splay_tree_s *splay_tree;
+
+extern splay_tree splay_tree_new (splay_tree_compare_fn,
+				  splay_tree_delete_key_fn,
+				  splay_tree_delete_value_fn);
+extern splay_tree splay_tree_new_with_allocator (splay_tree_compare_fn,
+						 splay_tree_delete_key_fn,
+						 splay_tree_delete_value_fn,
+						 splay_tree_allocate_fn,
+						 splay_tree_deallocate_fn,
+						 void *);
+extern splay_tree splay_tree_new_typed_alloc (splay_tree_compare_fn,
+					      splay_tree_delete_key_fn,
+					      splay_tree_delete_value_fn,
+					      splay_tree_allocate_fn,
+					      splay_tree_allocate_fn,
+					      splay_tree_deallocate_fn,
+					      void *);
+extern void splay_tree_delete (splay_tree);
+extern splay_tree_node splay_tree_insert (splay_tree,
+					  splay_tree_key,
+					  splay_tree_value);
+extern void splay_tree_remove	(splay_tree, splay_tree_key);
+extern splay_tree_node splay_tree_lookup (splay_tree, splay_tree_key);
+extern splay_tree_node splay_tree_predecessor (splay_tree, splay_tree_key);
+extern splay_tree_node splay_tree_successor (splay_tree, splay_tree_key);
+extern splay_tree_node splay_tree_max (splay_tree);
+extern splay_tree_node splay_tree_min (splay_tree);
+extern int splay_tree_foreach (splay_tree, splay_tree_foreach_fn, void*);
+extern int splay_tree_compare_ints (splay_tree_key, splay_tree_key);
+extern int splay_tree_compare_pointers (splay_tree_key,	splay_tree_key);
+
+#ifdef __cplusplus
+}
+#endif /* __cplusplus */
+
+#endif /* _SPLAY_TREE_H */
diff --git a/utils/gapy/gen-debug-info-src/ext/symcat.h b/utils/gapy/gen-debug-info-src/ext/symcat.h
new file mode 100644
index 000000000..95fc9b2c5
--- /dev/null
+++ b/utils/gapy/gen-debug-info-src/ext/symcat.h
@@ -0,0 +1,55 @@
+/* Symbol concatenation utilities.
+
+   Copyright (C) 1998-2015 Free Software Foundation, Inc.
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 2 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+ 
+   You should have received a copy of the GNU General Public License along
+   with this program; if not, write to the Free Software Foundation, Inc.,
+   51 Franklin Street - Fifth Floor, Boston, MA 02110-1301, USA.  */
+
+#ifndef SYM_CAT_H
+#define SYM_CAT_H
+
+#if defined (__STDC__) || defined (ALMOST_STDC) || defined (HAVE_STRINGIZE)
+#define CONCAT2(a,b)	 a##b
+#define CONCAT3(a,b,c)	 a##b##c
+#define CONCAT4(a,b,c,d) a##b##c##d
+#define CONCAT5(a,b,c,d,e) a##b##c##d##e
+#define CONCAT6(a,b,c,d,e,f) a##b##c##d##e##f
+#define STRINGX(s) #s
+#else
+/* Note one should never pass extra whitespace to the CONCATn macros,
+   e.g. CONCAT2(foo, bar) because traditional C will keep the space between
+   the two labels instead of concatenating them.  Instead, make sure to
+   write CONCAT2(foo,bar).  */
+#define CONCAT2(a,b)	 a/**/b
+#define CONCAT3(a,b,c)	 a/**/b/**/c
+#define CONCAT4(a,b,c,d) a/**/b/**/c/**/d
+#define CONCAT5(a,b,c,d,e) a/**/b/**/c/**/d/**/e
+#define CONCAT6(a,b,c,d,e,f) a/**/b/**/c/**/d/**/e/**/f
+#define STRINGX(s) "s"
+#endif
+
+#define XCONCAT2(a,b)     CONCAT2(a,b)
+#define XCONCAT3(a,b,c)   CONCAT3(a,b,c)
+#define XCONCAT4(a,b,c,d) CONCAT4(a,b,c,d)
+#define XCONCAT5(a,b,c,d,e) CONCAT5(a,b,c,d,e)
+#define XCONCAT6(a,b,c,d,e,f) CONCAT6(a,b,c,d,e,f)
+
+/* Note the layer of indirection here is typically used to allow
+   stringification of the expansion of macros.  I.e. "#define foo
+   bar", "XSTRING(foo)", to yield "bar".  Be aware that this only
+   works for __STDC__, not for traditional C which will still resolve
+   to "foo".  */
+#define XSTRING(s) STRINGX(s) 
+
+#endif /* SYM_CAT_H */
diff --git a/utils/gapy/gen-debug-info-src/ext/timeval-utils.h b/utils/gapy/gen-debug-info-src/ext/timeval-utils.h
new file mode 100644
index 000000000..adbe8183f
--- /dev/null
+++ b/utils/gapy/gen-debug-info-src/ext/timeval-utils.h
@@ -0,0 +1,40 @@
+/* Basic struct timeval utilities.
+   Copyright (C) 2011-2015 Free Software Foundation, Inc.
+
+This file is part of the libiberty library.
+Libiberty is free software; you can redistribute it and/or
+modify it under the terms of the GNU Library General Public
+License as published by the Free Software Foundation; either
+version 2 of the License, or (at your option) any later version.
+
+Libiberty is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+Library General Public License for more details.
+
+You should have received a copy of the GNU Library General Public
+License along with libiberty; see the file COPYING.LIB.  If not,
+write to the Free Software Foundation, Inc., 51 Franklin Street - Fifth Floor,
+Boston, MA 02110-1301, USA.  */
+
+#ifndef TIMEVAL_UTILS_H
+#define TIMEVAL_UTILS_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif /* __cplusplus */
+
+/* forward decl */
+struct timeval;
+
+extern void timeval_add (struct timeval *result,
+			 const struct timeval *a, const struct timeval *b);
+
+extern void timeval_sub (struct timeval *result,
+			 const struct timeval *a, const struct timeval *b);
+
+#ifdef __cplusplus
+}
+#endif /* __cplusplus */
+
+#endif /* TIMEVAL_UTILS_H */
diff --git a/utils/gapy/gen-debug-info-src/ext/vtv-change-permission.h b/utils/gapy/gen-debug-info-src/ext/vtv-change-permission.h
new file mode 100644
index 000000000..04ad8c3db
--- /dev/null
+++ b/utils/gapy/gen-debug-info-src/ext/vtv-change-permission.h
@@ -0,0 +1,58 @@
+/* Copyright (C) 2013-2015 Free Software Foundation, Inc.
+  
+   This file is part of GCC.
+   This program is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Library General Public License
+   as published by the Free Software Foundation; either version 2, or
+   (at your option) any later version.
+
+   In addition to the permissions in the GNU Library General Public
+   License, the Free Software Foundation gives you unlimited
+   permission to link the compiled version of this file into
+   combinations with other programs, and to distribute those
+   combinations without any restriction coming from the use of this
+   file.  (The Library Public License restrictions do apply in other
+   respects; for example, they cover modification of the file, and
+   distribution when not linked into a combined executable.)
+
+   This program is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Library General Public License for more details.
+
+   You should have received a copy of the GNU Library General Public
+   License along with this program; if not, write to the Free Software
+   Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA
+   02110-1301, USA.  */
+
+
+#ifndef __VTV_H__
+#define __VTV_H__
+
+/* We could have used an enumeration here but it just makes it more
+   difficult for the compiler to generate a call to this.  These are
+   used as arguments to the function __VLTChangePermission, declared
+   below.  */
+#define __VLTP_READ_ONLY  0
+#define __VLTP_READ_WRITE 1
+
+#ifdef __cplusplus
+extern "C" void __VLTChangePermission (int);
+#else
+extern void __VLTChangePermission (int);
+#endif
+
+#ifdef BIG_PAGE_SIZE
+/* TODO - Replace '4096' below with correct big page size.  */
+#define VTV_PAGE_SIZE 4096
+#else 
+#if defined(__sun__) && defined(__svr4__) && defined(__sparc__)
+#define VTV_PAGE_SIZE 8192
+#else
+#define VTV_PAGE_SIZE 4096
+#endif
+#endif
+
+
+
+#endif /* __VTV_H__ */
diff --git a/utils/gapy/gen-debug-info-src/ext/xregex.h b/utils/gapy/gen-debug-info-src/ext/xregex.h
new file mode 100644
index 000000000..4c5ef9345
--- /dev/null
+++ b/utils/gapy/gen-debug-info-src/ext/xregex.h
@@ -0,0 +1,29 @@
+/* This file redefines all regex external names before including
+   a renamed copy of glibc's regex.h.  */
+
+#ifndef _XREGEX_H
+#define _XREGEX_H 1
+
+#  define regfree xregfree 
+#  define regexec xregexec
+#  define regcomp xregcomp
+#  define regerror xregerror
+#  define regoff_t xregoff_t
+#  define re_set_registers xre_set_registers
+#  define re_match_2 xre_match_2
+#  define re_match xre_match
+#  define re_search xre_search
+#  define re_compile_pattern xre_compile_pattern
+#  define re_set_syntax xre_set_syntax
+#  define re_search_2 xre_search_2
+#  define re_compile_fastmap xre_compile_fastmap
+#  define re_syntax_options xre_syntax_options
+#  define re_max_failures xre_max_failures
+
+#  define _REGEX_RE_COMP
+#  define re_comp xre_comp
+#  define re_exec xre_exec
+
+#include "xregex2.h"
+
+#endif /* xregex.h */
diff --git a/utils/gapy/gen-debug-info-src/ext/xregex2.h b/utils/gapy/gen-debug-info-src/ext/xregex2.h
new file mode 100644
index 000000000..05066d4b2
--- /dev/null
+++ b/utils/gapy/gen-debug-info-src/ext/xregex2.h
@@ -0,0 +1,564 @@
+/* Definitions for data structures and routines for the regular
+   expression library, version 0.12.
+
+   Copyright (C) 1985-2015 Free Software Foundation, Inc.
+
+   This file is part of the GNU C Library.  Its master source is NOT part of
+   the C library, however.  The master source lives in /gd/gnu/lib.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+   02110-1301 USA.  */
+
+#ifndef _REGEX_H
+#define _REGEX_H 1
+
+/* Allow the use in C++ code.  */
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* POSIX says that <sys/types.h> must be included (by the caller) before
+   <regex.h>.  */
+
+#if !defined _POSIX_C_SOURCE && !defined _POSIX_SOURCE && defined VMS
+/* VMS doesn't have `size_t' in <sys/types.h>, even though POSIX says it
+   should be there.  */
+# include <stddef.h>
+#endif
+
+/* The following two types have to be signed and unsigned integer type
+   wide enough to hold a value of a pointer.  For most ANSI compilers
+   ptrdiff_t and size_t should be likely OK.  Still size of these two
+   types is 2 for Microsoft C.  Ugh... */
+typedef long int s_reg_t;
+typedef unsigned long int active_reg_t;
+
+/* The following bits are used to determine the regexp syntax we
+   recognize.  The set/not-set meanings are chosen so that Emacs syntax
+   remains the value 0.  The bits are given in alphabetical order, and
+   the definitions shifted by one from the previous bit; thus, when we
+   add or remove a bit, only one other definition need change.  */
+typedef unsigned long int reg_syntax_t;
+
+/* If this bit is not set, then \ inside a bracket expression is literal.
+   If set, then such a \ quotes the following character.  */
+#define RE_BACKSLASH_ESCAPE_IN_LISTS ((unsigned long int) 1)
+
+/* If this bit is not set, then + and ? are operators, and \+ and \? are
+     literals.
+   If set, then \+ and \? are operators and + and ? are literals.  */
+#define RE_BK_PLUS_QM (RE_BACKSLASH_ESCAPE_IN_LISTS << 1)
+
+/* If this bit is set, then character classes are supported.  They are:
+     [:alpha:], [:upper:], [:lower:],  [:digit:], [:alnum:], [:xdigit:],
+     [:space:], [:print:], [:punct:], [:graph:], and [:cntrl:].
+   If not set, then character classes are not supported.  */
+#define RE_CHAR_CLASSES (RE_BK_PLUS_QM << 1)
+
+/* If this bit is set, then ^ and $ are always anchors (outside bracket
+     expressions, of course).
+   If this bit is not set, then it depends:
+        ^  is an anchor if it is at the beginning of a regular
+           expression or after an open-group or an alternation operator;
+        $  is an anchor if it is at the end of a regular expression, or
+           before a close-group or an alternation operator.
+
+   This bit could be (re)combined with RE_CONTEXT_INDEP_OPS, because
+   POSIX draft 11.2 says that * etc. in leading positions is undefined.
+   We already implemented a previous draft which made those constructs
+   invalid, though, so we haven't changed the code back.  */
+#define RE_CONTEXT_INDEP_ANCHORS (RE_CHAR_CLASSES << 1)
+
+/* If this bit is set, then special characters are always special
+     regardless of where they are in the pattern.
+   If this bit is not set, then special characters are special only in
+     some contexts; otherwise they are ordinary.  Specifically,
+     * + ? and intervals are only special when not after the beginning,
+     open-group, or alternation operator.  */
+#define RE_CONTEXT_INDEP_OPS (RE_CONTEXT_INDEP_ANCHORS << 1)
+
+/* If this bit is set, then *, +, ?, and { cannot be first in an re or
+     immediately after an alternation or begin-group operator.  */
+#define RE_CONTEXT_INVALID_OPS (RE_CONTEXT_INDEP_OPS << 1)
+
+/* If this bit is set, then . matches newline.
+   If not set, then it doesn't.  */
+#define RE_DOT_NEWLINE (RE_CONTEXT_INVALID_OPS << 1)
+
+/* If this bit is set, then . doesn't match NUL.
+   If not set, then it does.  */
+#define RE_DOT_NOT_NULL (RE_DOT_NEWLINE << 1)
+
+/* If this bit is set, nonmatching lists [^...] do not match newline.
+   If not set, they do.  */
+#define RE_HAT_LISTS_NOT_NEWLINE (RE_DOT_NOT_NULL << 1)
+
+/* If this bit is set, either \{...\} or {...} defines an
+     interval, depending on RE_NO_BK_BRACES.
+   If not set, \{, \}, {, and } are literals.  */
+#define RE_INTERVALS (RE_HAT_LISTS_NOT_NEWLINE << 1)
+
+/* If this bit is set, +, ? and | aren't recognized as operators.
+   If not set, they are.  */
+#define RE_LIMITED_OPS (RE_INTERVALS << 1)
+
+/* If this bit is set, newline is an alternation operator.
+   If not set, newline is literal.  */
+#define RE_NEWLINE_ALT (RE_LIMITED_OPS << 1)
+
+/* If this bit is set, then `{...}' defines an interval, and \{ and \}
+     are literals.
+  If not set, then `\{...\}' defines an interval.  */
+#define RE_NO_BK_BRACES (RE_NEWLINE_ALT << 1)
+
+/* If this bit is set, (...) defines a group, and \( and \) are literals.
+   If not set, \(...\) defines a group, and ( and ) are literals.  */
+#define RE_NO_BK_PARENS (RE_NO_BK_BRACES << 1)
+
+/* If this bit is set, then \<digit> matches <digit>.
+   If not set, then \<digit> is a back-reference.  */
+#define RE_NO_BK_REFS (RE_NO_BK_PARENS << 1)
+
+/* If this bit is set, then | is an alternation operator, and \| is literal.
+   If not set, then \| is an alternation operator, and | is literal.  */
+#define RE_NO_BK_VBAR (RE_NO_BK_REFS << 1)
+
+/* If this bit is set, then an ending range point collating higher
+     than the starting range point, as in [z-a], is invalid.
+   If not set, then when ending range point collates higher than the
+     starting range point, the range is ignored.  */
+#define RE_NO_EMPTY_RANGES (RE_NO_BK_VBAR << 1)
+
+/* If this bit is set, then an unmatched ) is ordinary.
+   If not set, then an unmatched ) is invalid.  */
+#define RE_UNMATCHED_RIGHT_PAREN_ORD (RE_NO_EMPTY_RANGES << 1)
+
+/* If this bit is set, succeed as soon as we match the whole pattern,
+   without further backtracking.  */
+#define RE_NO_POSIX_BACKTRACKING (RE_UNMATCHED_RIGHT_PAREN_ORD << 1)
+
+/* If this bit is set, do not process the GNU regex operators.
+   If not set, then the GNU regex operators are recognized. */
+#define RE_NO_GNU_OPS (RE_NO_POSIX_BACKTRACKING << 1)
+
+/* If this bit is set, turn on internal regex debugging.
+   If not set, and debugging was on, turn it off.
+   This only works if regex.c is compiled -DDEBUG.
+   We define this bit always, so that all that's needed to turn on
+   debugging is to recompile regex.c; the calling code can always have
+   this bit set, and it won't affect anything in the normal case. */
+#define RE_DEBUG (RE_NO_GNU_OPS << 1)
+
+/* If this bit is set, a syntactically invalid interval is treated as
+   a string of ordinary characters.  For example, the ERE 'a{1' is
+   treated as 'a\{1'.  */
+#define RE_INVALID_INTERVAL_ORD (RE_DEBUG << 1)
+
+/* This global variable defines the particular regexp syntax to use (for
+   some interfaces).  When a regexp is compiled, the syntax used is
+   stored in the pattern buffer, so changing this does not affect
+   already-compiled regexps.  */
+extern reg_syntax_t re_syntax_options;
+
+/* Define combinations of the above bits for the standard possibilities.
+   (The [[[ comments delimit what gets put into the Texinfo file, so
+   don't delete them!)  */
+/* [[[begin syntaxes]]] */
+#define RE_SYNTAX_EMACS 0
+
+#define RE_SYNTAX_AWK							\
+  (RE_BACKSLASH_ESCAPE_IN_LISTS   | RE_DOT_NOT_NULL			\
+   | RE_NO_BK_PARENS              | RE_NO_BK_REFS			\
+   | RE_NO_BK_VBAR                | RE_NO_EMPTY_RANGES			\
+   | RE_DOT_NEWLINE		  | RE_CONTEXT_INDEP_ANCHORS		\
+   | RE_UNMATCHED_RIGHT_PAREN_ORD | RE_NO_GNU_OPS)
+
+#define RE_SYNTAX_GNU_AWK						\
+  ((RE_SYNTAX_POSIX_EXTENDED | RE_BACKSLASH_ESCAPE_IN_LISTS | RE_DEBUG)	\
+   & ~(RE_DOT_NOT_NULL | RE_INTERVALS | RE_CONTEXT_INDEP_OPS))
+
+#define RE_SYNTAX_POSIX_AWK 						\
+  (RE_SYNTAX_POSIX_EXTENDED | RE_BACKSLASH_ESCAPE_IN_LISTS		\
+   | RE_INTERVALS	    | RE_NO_GNU_OPS)
+
+#define RE_SYNTAX_GREP							\
+  (RE_BK_PLUS_QM              | RE_CHAR_CLASSES				\
+   | RE_HAT_LISTS_NOT_NEWLINE | RE_INTERVALS				\
+   | RE_NEWLINE_ALT)
+
+#define RE_SYNTAX_EGREP							\
+  (RE_CHAR_CLASSES        | RE_CONTEXT_INDEP_ANCHORS			\
+   | RE_CONTEXT_INDEP_OPS | RE_HAT_LISTS_NOT_NEWLINE			\
+   | RE_NEWLINE_ALT       | RE_NO_BK_PARENS				\
+   | RE_NO_BK_VBAR)
+
+#define RE_SYNTAX_POSIX_EGREP						\
+  (RE_SYNTAX_EGREP | RE_INTERVALS | RE_NO_BK_BRACES			\
+   | RE_INVALID_INTERVAL_ORD)
+
+/* P1003.2/D11.2, section 4.20.7.1, lines 5078ff.  */
+#define RE_SYNTAX_ED RE_SYNTAX_POSIX_BASIC
+
+#define RE_SYNTAX_SED RE_SYNTAX_POSIX_BASIC
+
+/* Syntax bits common to both basic and extended POSIX regex syntax.  */
+#define _RE_SYNTAX_POSIX_COMMON						\
+  (RE_CHAR_CLASSES | RE_DOT_NEWLINE      | RE_DOT_NOT_NULL		\
+   | RE_INTERVALS  | RE_NO_EMPTY_RANGES)
+
+#define RE_SYNTAX_POSIX_BASIC						\
+  (_RE_SYNTAX_POSIX_COMMON | RE_BK_PLUS_QM)
+
+/* Differs from ..._POSIX_BASIC only in that RE_BK_PLUS_QM becomes
+   RE_LIMITED_OPS, i.e., \? \+ \| are not recognized.  Actually, this
+   isn't minimal, since other operators, such as \`, aren't disabled.  */
+#define RE_SYNTAX_POSIX_MINIMAL_BASIC					\
+  (_RE_SYNTAX_POSIX_COMMON | RE_LIMITED_OPS)
+
+#define RE_SYNTAX_POSIX_EXTENDED					\
+  (_RE_SYNTAX_POSIX_COMMON  | RE_CONTEXT_INDEP_ANCHORS			\
+   | RE_CONTEXT_INDEP_OPS   | RE_NO_BK_BRACES				\
+   | RE_NO_BK_PARENS        | RE_NO_BK_VBAR				\
+   | RE_CONTEXT_INVALID_OPS | RE_UNMATCHED_RIGHT_PAREN_ORD)
+
+/* Differs from ..._POSIX_EXTENDED in that RE_CONTEXT_INDEP_OPS is
+   removed and RE_NO_BK_REFS is added.  */
+#define RE_SYNTAX_POSIX_MINIMAL_EXTENDED				\
+  (_RE_SYNTAX_POSIX_COMMON  | RE_CONTEXT_INDEP_ANCHORS			\
+   | RE_CONTEXT_INVALID_OPS | RE_NO_BK_BRACES				\
+   | RE_NO_BK_PARENS        | RE_NO_BK_REFS				\
+   | RE_NO_BK_VBAR	    | RE_UNMATCHED_RIGHT_PAREN_ORD)
+/* [[[end syntaxes]]] */
+
+/* Maximum number of duplicates an interval can allow.  Some systems
+   (erroneously) define this in other header files, but we want our
+   value, so remove any previous define.  */
+#ifdef RE_DUP_MAX
+# undef RE_DUP_MAX
+#endif
+/* If sizeof(int) == 2, then ((1 << 15) - 1) overflows.  */
+#define RE_DUP_MAX (0x7fff)
+
+
+/* POSIX `cflags' bits (i.e., information for `regcomp').  */
+
+/* If this bit is set, then use extended regular expression syntax.
+   If not set, then use basic regular expression syntax.  */
+#define REG_EXTENDED 1
+
+/* If this bit is set, then ignore case when matching.
+   If not set, then case is significant.  */
+#define REG_ICASE (REG_EXTENDED << 1)
+
+/* If this bit is set, then anchors do not match at newline
+     characters in the string.
+   If not set, then anchors do match at newlines.  */
+#define REG_NEWLINE (REG_ICASE << 1)
+
+/* If this bit is set, then report only success or fail in regexec.
+   If not set, then returns differ between not matching and errors.  */
+#define REG_NOSUB (REG_NEWLINE << 1)
+
+
+/* POSIX `eflags' bits (i.e., information for regexec).  */
+
+/* If this bit is set, then the beginning-of-line operator doesn't match
+     the beginning of the string (presumably because it's not the
+     beginning of a line).
+   If not set, then the beginning-of-line operator does match the
+     beginning of the string.  */
+#define REG_NOTBOL 1
+
+/* Like REG_NOTBOL, except for the end-of-line.  */
+#define REG_NOTEOL (1 << 1)
+
+
+/* Status and error codes for the regex routines.  If any are removed,
+   changed, or added, update the `re_error_msg' table in regex.c.  */
+typedef enum
+{
+#ifdef _XOPEN_SOURCE
+  REG_ENOSYS = -1,	/* This will never happen for this implementation.  */
+#endif
+
+  REG_NOERROR = 0,	/* Success.  */
+  REG_NOMATCH,		/* Didn't find a match (for regexec).  */
+
+  /* POSIX regcomp return error codes.  (In the order listed in the
+     standard.)  */
+  REG_BADPAT,		/* Invalid pattern.  */
+  REG_ECOLLATE,		/* Not implemented.  */
+  REG_ECTYPE,		/* Invalid character class name.  */
+  REG_EESCAPE,		/* Trailing backslash.  */
+  REG_ESUBREG,		/* Invalid back reference.  */
+  REG_EBRACK,		/* Unmatched left bracket.  */
+  REG_EPAREN,		/* Parenthesis imbalance.  */
+  REG_EBRACE,		/* Unmatched \{.  */
+  REG_BADBR,		/* Invalid contents of \{\}.  */
+  REG_ERANGE,		/* Invalid range end.  */
+  REG_ESPACE,		/* Ran out of memory.  */
+  REG_BADRPT,		/* No preceding re for repetition op.  */
+
+  /* Error codes we've added.  */
+  REG_EEND,		/* Premature end.  */
+  REG_ESIZE,		/* Compiled pattern bigger than 2^16 bytes.  */
+  REG_ERPAREN		/* Unmatched ) or \); not returned from regcomp.  */
+} reg_errcode_t;
+
+/* This data structure represents a compiled pattern.  Before calling
+   the pattern compiler, the fields `buffer', `allocated', `fastmap',
+   `translate', and `no_sub' can be set.  After the pattern has been
+   compiled, the `re_nsub' field is available.  All other fields are
+   private to the regex routines.  */
+
+#ifndef RE_TRANSLATE_TYPE
+# define RE_TRANSLATE_TYPE char *
+#endif
+
+struct re_pattern_buffer
+{
+/* [[[begin pattern_buffer]]] */
+	/* Space that holds the compiled pattern.  It is declared as
+          `unsigned char *' because its elements are
+           sometimes used as array indexes.  */
+  unsigned char *buffer;
+
+	/* Number of bytes to which `buffer' points.  */
+  unsigned long int allocated;
+
+	/* Number of bytes actually used in `buffer'.  */
+  unsigned long int used;
+
+        /* Syntax setting with which the pattern was compiled.  */
+  reg_syntax_t syntax;
+
+        /* Pointer to a fastmap, if any, otherwise zero.  re_search uses
+           the fastmap, if there is one, to skip over impossible
+           starting points for matches.  */
+  char *fastmap;
+
+        /* Either a translate table to apply to all characters before
+           comparing them, or zero for no translation.  The translation
+           is applied to a pattern when it is compiled and to a string
+           when it is matched.  */
+  RE_TRANSLATE_TYPE translate;
+
+	/* Number of subexpressions found by the compiler.  */
+  size_t re_nsub;
+
+        /* Zero if this pattern cannot match the empty string, one otherwise.
+           Well, in truth it's used only in `re_search_2', to see
+           whether or not we should use the fastmap, so we don't set
+           this absolutely perfectly; see `re_compile_fastmap' (the
+           `duplicate' case).  */
+  unsigned can_be_null : 1;
+
+        /* If REGS_UNALLOCATED, allocate space in the `regs' structure
+             for `max (RE_NREGS, re_nsub + 1)' groups.
+           If REGS_REALLOCATE, reallocate space if necessary.
+           If REGS_FIXED, use what's there.  */
+#define REGS_UNALLOCATED 0
+#define REGS_REALLOCATE 1
+#define REGS_FIXED 2
+  unsigned regs_allocated : 2;
+
+        /* Set to zero when `regex_compile' compiles a pattern; set to one
+           by `re_compile_fastmap' if it updates the fastmap.  */
+  unsigned fastmap_accurate : 1;
+
+        /* If set, `re_match_2' does not return information about
+           subexpressions.  */
+  unsigned no_sub : 1;
+
+        /* If set, a beginning-of-line anchor doesn't match at the
+           beginning of the string.  */
+  unsigned not_bol : 1;
+
+        /* If set, an end-of-line anchor doesn't match at the string's end.  */
+  unsigned not_eol : 1;
+
+        /* If true, an anchor at a newline matches.  */
+  unsigned newline_anchor : 1;
+
+/* [[[end pattern_buffer]]] */
+};
+
+typedef struct re_pattern_buffer regex_t;
+
+/* Type for byte offsets within the string.  POSIX mandates this.  */
+typedef int regoff_t;
+
+
+/* This is the structure we store register match data in.  See
+   regex.texinfo for a full description of what registers match.  */
+struct re_registers
+{
+  unsigned num_regs;
+  regoff_t *start;
+  regoff_t *end;
+};
+
+
+/* If `regs_allocated' is REGS_UNALLOCATED in the pattern buffer,
+   `re_match_2' returns information about at least this many registers
+   the first time a `regs' structure is passed.  */
+#ifndef RE_NREGS
+# define RE_NREGS 30
+#endif
+
+
+/* POSIX specification for registers.  Aside from the different names than
+   `re_registers', POSIX uses an array of structures, instead of a
+   structure of arrays.  */
+typedef struct
+{
+  regoff_t rm_so;  /* Byte offset from string's start to substring's start.  */
+  regoff_t rm_eo;  /* Byte offset from string's start to substring's end.  */
+} regmatch_t;
+
+/* Declarations for routines.  */
+
+/* NOTE: this comment once introduced a macro for declaring argument
+   types, to avoid duplicating every routine declaration -- once with
+   a prototype (if we are ANSI), and once without (if we aren't).  No
+   such macro follows; the declarations below use plain
+   prototypes.  */
+
+/* Sets the current default syntax to SYNTAX, and return the old syntax.
+   You can also simply assign to the `re_syntax_options' variable.  */
+extern reg_syntax_t re_set_syntax (reg_syntax_t syntax);
+
+/* Compile the regular expression PATTERN, with length LENGTH
+   and syntax given by the global `re_syntax_options', into the buffer
+   BUFFER.  Return NULL if successful, and an error string if not.  */
+extern const char *re_compile_pattern (const char *pattern, size_t length,
+                                       struct re_pattern_buffer *buffer);
+
+
+/* Compile a fastmap for the compiled pattern in BUFFER; used to
+   accelerate searches.  Return 0 if successful and -2 if there was an
+   internal error.  */
+extern int re_compile_fastmap (struct re_pattern_buffer *buffer);
+
+
+/* Search in the string STRING (with length LENGTH) for the pattern
+   compiled into BUFFER.  Start searching at position START, for RANGE
+   characters.  Return the starting position of the match, -1 for no
+   match, or -2 for an internal error.  Also return register
+   information in REGS (if REGS is nonzero and BUFFER->no_sub is zero).  */
+extern int re_search (struct re_pattern_buffer *buffer, const char *string,
+                      int length, int start, int range,
+                      struct re_registers *regs);
+
+
+/* Like `re_search', but search in the concatenation of STRING1 and
+   STRING2.  Also, stop searching at index START + STOP.  */
+extern int re_search_2 (struct re_pattern_buffer *buffer, const char *string1,
+                        int length1, const char *string2, int length2,
+                        int start, int range, struct re_registers *regs,
+                        int stop);
+
+
+/* Like `re_search', but return how many characters in STRING the regexp
+   in BUFFER matched, starting at position START.  */
+extern int re_match (struct re_pattern_buffer *buffer, const char *string,
+                     int length, int start, struct re_registers *regs);
+
+
+/* Relates to `re_match' as `re_search_2' relates to `re_search'.  */
+extern int re_match_2 (struct re_pattern_buffer *buffer, const char *string1,
+                       int length1, const char *string2, int length2,
+                       int start, struct re_registers *regs, int stop);
+
+
+/* Set REGS to hold NUM_REGS registers, storing them in STARTS and
+   ENDS.  Subsequent matches using BUFFER and REGS will use this memory
+   for recording register information.  STARTS and ENDS must be
+   allocated with malloc, and must each be at least `NUM_REGS * sizeof
+   (regoff_t)' bytes long.
+
+   If NUM_REGS == 0, then subsequent matches should allocate their own
+   register data.
+
+   Unless this function is called, the first search or match using
+   BUFFER will allocate its own register data, without freeing the
+   old data.  */
+extern void re_set_registers (struct re_pattern_buffer *buffer,
+                              struct re_registers *regs,
+                              unsigned num_regs, regoff_t *starts,
+                              regoff_t *ends);
+
+#if defined _REGEX_RE_COMP || defined _LIBC
+# ifndef _CRAY
+/* 4.2 bsd compatibility.  */
+extern char *re_comp (const char *);
+extern int re_exec (const char *);
+# endif
+#endif
+
+/* GCC 2.95 and later have "__restrict"; C99 compilers have
+   "restrict", and "configure" may have defined "restrict".  */
+#ifndef __restrict
+# if ! (2 < __GNUC__ || (2 == __GNUC__ && 95 <= __GNUC_MINOR__))
+#  if defined restrict || 199901L <= __STDC_VERSION__
+#   define __restrict restrict
+#  else
+#   define __restrict
+#  endif
+# endif
+#endif
+
+/* GCC 3.1 and later support declaring arrays as non-overlapping
+   using the syntax array_name[restrict]  */
+#ifndef __restrict_arr
+# if ! (3 < __GNUC__ || (3 == __GNUC__ && 1 <= __GNUC_MINOR__)) || defined (__GNUG__)
+#  define __restrict_arr
+# else
+#  define __restrict_arr __restrict
+# endif
+#endif
+
+/* POSIX compatibility.  */
+extern int regcomp (regex_t *__restrict __preg,
+                    const char *__restrict __pattern,
+                    int __cflags);
+
+#if (__GNUC__)
+__extension__
+#endif
+extern int regexec (const regex_t *__restrict __preg,
+                    const char *__restrict __string, size_t __nmatch,
+                    regmatch_t __pmatch[__restrict_arr],
+                    int __eflags);
+
+extern size_t regerror (int __errcode, const regex_t *__preg,
+                        char *__errbuf, size_t __errbuf_size);
+
+extern void regfree (regex_t *__preg);
+
+
+#ifdef __cplusplus
+}
+#endif	/* C++ */
+
+#endif /* regex.h */
+
+/*
+Local variables:
+make-backup-files: t
+version-control: t
+trim-versions-without-asking: nil
+End:
+*/
diff --git a/utils/gapy/gen-debug-info-src/ext/xtensa-config.h b/utils/gapy/gen-debug-info-src/ext/xtensa-config.h
new file mode 100644
index 000000000..5ae4c8060
--- /dev/null
+++ b/utils/gapy/gen-debug-info-src/ext/xtensa-config.h
@@ -0,0 +1,176 @@
+/* Xtensa configuration settings.
+   Copyright (C) 2001-2015 Free Software Foundation, Inc.
+   Contributed by Bob Wilson (bob.wilson@acm.org) at Tensilica.
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 2, or (at your option)
+   any later version.
+
+   This program is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, 51 Franklin Street - Fifth Floor, Boston, MA 02110-1301, USA.  */
+
+#ifndef XTENSA_CONFIG_H
+#define XTENSA_CONFIG_H
+
+/* The macros defined here match those with the same names in the Xtensa
+   compile-time HAL (Hardware Abstraction Layer).  Please refer to the
+   Xtensa System Software Reference Manual for documentation of these
+   macros.  */
+
+#undef XCHAL_HAVE_BE
+#define XCHAL_HAVE_BE			1
+
+#undef XCHAL_HAVE_DENSITY
+#define XCHAL_HAVE_DENSITY		1
+
+#undef XCHAL_HAVE_CONST16
+#define XCHAL_HAVE_CONST16		0
+
+#undef XCHAL_HAVE_ABS
+#define XCHAL_HAVE_ABS			1
+
+#undef XCHAL_HAVE_ADDX
+#define XCHAL_HAVE_ADDX			1
+
+#undef XCHAL_HAVE_L32R
+#define XCHAL_HAVE_L32R			1
+
+#undef XSHAL_USE_ABSOLUTE_LITERALS
+#define XSHAL_USE_ABSOLUTE_LITERALS	0
+
+#undef XSHAL_HAVE_TEXT_SECTION_LITERALS
+#define XSHAL_HAVE_TEXT_SECTION_LITERALS 1 /* Set if there is some memory that allows both code and literals.  */
+
+#undef XCHAL_HAVE_MAC16
+#define XCHAL_HAVE_MAC16		0
+
+#undef XCHAL_HAVE_MUL16
+#define XCHAL_HAVE_MUL16		1
+
+#undef XCHAL_HAVE_MUL32
+#define XCHAL_HAVE_MUL32		1
+
+#undef XCHAL_HAVE_MUL32_HIGH
+#define XCHAL_HAVE_MUL32_HIGH		0
+
+#undef XCHAL_HAVE_DIV32
+#define XCHAL_HAVE_DIV32		1
+
+#undef XCHAL_HAVE_NSA
+#define XCHAL_HAVE_NSA			1
+
+#undef XCHAL_HAVE_MINMAX
+#define XCHAL_HAVE_MINMAX		1
+
+#undef XCHAL_HAVE_SEXT
+#define XCHAL_HAVE_SEXT			1
+
+#undef XCHAL_HAVE_LOOPS
+#define XCHAL_HAVE_LOOPS		1
+
+#undef XCHAL_HAVE_THREADPTR
+#define XCHAL_HAVE_THREADPTR		1
+
+#undef XCHAL_HAVE_RELEASE_SYNC
+#define XCHAL_HAVE_RELEASE_SYNC		1
+
+#undef XCHAL_HAVE_S32C1I
+#define XCHAL_HAVE_S32C1I		1
+
+#undef XCHAL_HAVE_BOOLEANS
+#define XCHAL_HAVE_BOOLEANS		0
+
+#undef XCHAL_HAVE_FP
+#define XCHAL_HAVE_FP			0
+
+#undef XCHAL_HAVE_FP_DIV
+#define XCHAL_HAVE_FP_DIV		0
+
+#undef XCHAL_HAVE_FP_RECIP
+#define XCHAL_HAVE_FP_RECIP		0
+
+#undef XCHAL_HAVE_FP_SQRT
+#define XCHAL_HAVE_FP_SQRT		0
+
+#undef XCHAL_HAVE_FP_RSQRT
+#define XCHAL_HAVE_FP_RSQRT		0
+
+#undef XCHAL_HAVE_DFP_accel
+#define XCHAL_HAVE_DFP_accel			0
+#undef XCHAL_HAVE_WINDOWED
+#define XCHAL_HAVE_WINDOWED		1
+
+#undef XCHAL_NUM_AREGS
+#define XCHAL_NUM_AREGS			32
+
+#undef XCHAL_HAVE_WIDE_BRANCHES
+#define XCHAL_HAVE_WIDE_BRANCHES	0
+
+#undef XCHAL_HAVE_PREDICTED_BRANCHES
+#define XCHAL_HAVE_PREDICTED_BRANCHES	0
+
+
+#undef XCHAL_ICACHE_SIZE
+#define XCHAL_ICACHE_SIZE		16384
+
+#undef XCHAL_DCACHE_SIZE
+#define XCHAL_DCACHE_SIZE		16384
+
+#undef XCHAL_ICACHE_LINESIZE
+#define XCHAL_ICACHE_LINESIZE		32
+
+#undef XCHAL_DCACHE_LINESIZE
+#define XCHAL_DCACHE_LINESIZE		32
+
+#undef XCHAL_ICACHE_LINEWIDTH
+#define XCHAL_ICACHE_LINEWIDTH		5
+
+#undef XCHAL_DCACHE_LINEWIDTH
+#define XCHAL_DCACHE_LINEWIDTH		5
+
+#undef XCHAL_DCACHE_IS_WRITEBACK
+#define XCHAL_DCACHE_IS_WRITEBACK	1
+
+
+#undef XCHAL_HAVE_MMU
+#define XCHAL_HAVE_MMU			1
+
+#undef XCHAL_MMU_MIN_PTE_PAGE_SIZE
+#define XCHAL_MMU_MIN_PTE_PAGE_SIZE	12
+
+
+#undef XCHAL_HAVE_DEBUG
+#define XCHAL_HAVE_DEBUG		1
+
+#undef XCHAL_NUM_IBREAK
+#define XCHAL_NUM_IBREAK		2
+
+#undef XCHAL_NUM_DBREAK
+#define XCHAL_NUM_DBREAK		2
+
+#undef XCHAL_DEBUGLEVEL
+#define XCHAL_DEBUGLEVEL		6
+
+
+#undef XCHAL_MAX_INSTRUCTION_SIZE
+#define XCHAL_MAX_INSTRUCTION_SIZE	3
+
+#undef XCHAL_INST_FETCH_WIDTH
+#define XCHAL_INST_FETCH_WIDTH		4
+
+
+#undef XSHAL_ABI
+#undef XTHAL_ABI_WINDOWED
+#undef XTHAL_ABI_CALL0
+#define XSHAL_ABI			XTHAL_ABI_WINDOWED
+#define XTHAL_ABI_WINDOWED		0
+#define XTHAL_ABI_CALL0			1
+
+#endif /* !XTENSA_CONFIG_H */
diff --git a/utils/gapy/gen-debug-info-src/ext/xtensa-isa-internal.h b/utils/gapy/gen-debug-info-src/ext/xtensa-isa-internal.h
new file mode 100644
index 000000000..6c727366b
--- /dev/null
+++ b/utils/gapy/gen-debug-info-src/ext/xtensa-isa-internal.h
@@ -0,0 +1,234 @@
+/* Internal definitions for configurable Xtensa ISA support.
+   Copyright 2003, 2004, 2005, 2008, 2010 Free Software Foundation, Inc.
+
+   This file is part of BFD, the Binary File Descriptor library.
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA 02110-1301,
+   USA.  */
+
+#ifndef XTENSA_ISA_INTERNAL_H
+#define XTENSA_ISA_INTERNAL_H
+
+/* Flags.  */
+
+#define XTENSA_OPERAND_IS_REGISTER	0x00000001
+#define XTENSA_OPERAND_IS_PCRELATIVE	0x00000002
+#define XTENSA_OPERAND_IS_INVISIBLE	0x00000004
+#define XTENSA_OPERAND_IS_UNKNOWN	0x00000008
+
+#define XTENSA_OPCODE_IS_BRANCH		0x00000001
+#define XTENSA_OPCODE_IS_JUMP		0x00000002
+#define XTENSA_OPCODE_IS_LOOP		0x00000004
+#define XTENSA_OPCODE_IS_CALL		0x00000008
+
+#define XTENSA_STATE_IS_EXPORTED	0x00000001
+#define XTENSA_STATE_IS_SHARED_OR	0x00000002
+
+#define XTENSA_INTERFACE_HAS_SIDE_EFFECT 0x00000001
+
+/* Function pointer typedefs */
+typedef void (*xtensa_format_encode_fn) (xtensa_insnbuf);
+typedef void (*xtensa_get_slot_fn) (const xtensa_insnbuf, xtensa_insnbuf);
+typedef void (*xtensa_set_slot_fn) (xtensa_insnbuf, const xtensa_insnbuf);
+typedef int (*xtensa_opcode_decode_fn) (const xtensa_insnbuf);
+typedef uint32 (*xtensa_get_field_fn) (const xtensa_insnbuf);
+typedef void (*xtensa_set_field_fn) (xtensa_insnbuf, uint32);
+typedef int (*xtensa_immed_decode_fn) (uint32 *);
+typedef int (*xtensa_immed_encode_fn) (uint32 *);
+typedef int (*xtensa_do_reloc_fn) (uint32 *, uint32);
+typedef int (*xtensa_undo_reloc_fn) (uint32 *, uint32);
+typedef void (*xtensa_opcode_encode_fn) (xtensa_insnbuf);
+typedef int (*xtensa_format_decode_fn) (const xtensa_insnbuf);
+typedef int (*xtensa_length_decode_fn) (const unsigned char *);
+
+typedef struct xtensa_format_internal_struct
+{
+  const char *name;			/* Instruction format name.  */
+  int length;				/* Instruction length in bytes.  */
+  xtensa_format_encode_fn encode_fn;
+  int num_slots;
+  int *slot_id;				/* Array[num_slots] of slot IDs.  */
+} xtensa_format_internal;
+
+typedef struct xtensa_slot_internal_struct
+{
+  const char *name;			/* Not necessarily unique.  */
+  const char *format;
+  int position;
+  xtensa_get_slot_fn get_fn;
+  xtensa_set_slot_fn set_fn;
+  xtensa_get_field_fn *get_field_fns;	/* Array[field_id].  */
+  xtensa_set_field_fn *set_field_fns;	/* Array[field_id].  */
+  xtensa_opcode_decode_fn opcode_decode_fn;
+  const char *nop_name;
+} xtensa_slot_internal;
+
+typedef struct xtensa_operand_internal_struct
+{
+  const char *name;
+  int field_id;
+  xtensa_regfile regfile;		/* Register file.  */
+  int num_regs;				/* Usually 1; 2 for reg pairs, etc.  */
+  uint32 flags;				/* See XTENSA_OPERAND_* flags.  */
+  xtensa_immed_encode_fn encode;	/* Encode the operand value.  */
+  xtensa_immed_decode_fn decode;	/* Decode the value from the field.  */
+  xtensa_do_reloc_fn do_reloc;		/* Perform a PC-relative reloc.  */
+  xtensa_undo_reloc_fn undo_reloc;	/* Undo a PC-relative relocation.  */
+} xtensa_operand_internal;
+
+typedef struct xtensa_arg_internal_struct
+{
+  union {
+    int operand_id;			/* For normal operands.  */
+    xtensa_state state;			/* For stateOperands.  */
+  } u;
+  char inout;				/* Direction: 'i', 'o', or 'm'.  */
+} xtensa_arg_internal;
+
+typedef struct xtensa_iclass_internal_struct
+{
+  int num_operands;			/* Size of "operands" array.  */
+  xtensa_arg_internal *operands;	/* Array[num_operands].  */
+
+  int num_stateOperands;		/* Size of "stateOperands" array.  */
+  xtensa_arg_internal *stateOperands;	/* Array[num_stateOperands].  */
+
+  int num_interfaceOperands;		/* Size of "interfaceOperands".  */
+  xtensa_interface *interfaceOperands;	/* Array[num_interfaceOperands].  */
+} xtensa_iclass_internal;
+
+typedef struct xtensa_opcode_internal_struct
+{
+  const char *name;			/* Opcode mnemonic.  */
+  int iclass_id;			/* Iclass for this opcode.  */
+  uint32 flags;				/* See XTENSA_OPCODE_* flags.  */
+  xtensa_opcode_encode_fn *encode_fns;	/* Array[slot_id].  */
+  int num_funcUnit_uses;		/* Number of funcUnit_use entries.  */
+  xtensa_funcUnit_use *funcUnit_uses;	/* Array[num_funcUnit_uses].  */
+} xtensa_opcode_internal;
+
+typedef struct xtensa_regfile_internal_struct
+{
+  const char *name;			/* Full name of the regfile.  */
+  const char *shortname;		/* Abbreviated name.  */
+  xtensa_regfile parent;		/* View parent (or identity).  */
+  int num_bits;				/* Width of the registers.  */
+  int num_entries;			/* Number of registers.  */
+} xtensa_regfile_internal;
+
+typedef struct xtensa_interface_internal_struct
+{
+  const char *name;			/* Interface name.  */
+  int num_bits;				/* Width of the interface.  */
+  uint32 flags;				/* See XTENSA_INTERFACE_* flags.  */
+  int class_id;				/* Class of related interfaces.  */
+  char inout;				/* 'i' or 'o'.  */
+} xtensa_interface_internal;
+
+typedef struct xtensa_funcUnit_internal_struct
+{
+  const char *name;			/* Functional unit name.  */
+  int num_copies;			/* Number of instances.  */
+} xtensa_funcUnit_internal;
+
+typedef struct xtensa_state_internal_struct
+{
+  const char *name;			/* State name.  */
+  int num_bits;				/* Number of state bits.  */
+  uint32 flags;				/* See XTENSA_STATE_* flags.  */
+} xtensa_state_internal;
+
+typedef struct xtensa_sysreg_internal_struct
+{
+  const char *name;			/* Register name.  */
+  int number;				/* Register number.  */
+  int is_user;				/* Non-zero if a "user register".  */
+} xtensa_sysreg_internal;
+
+typedef struct xtensa_lookup_entry_struct
+{
+  const char *key;
+  union
+  {
+    xtensa_opcode opcode;		/* Internal opcode number.  */
+    xtensa_sysreg sysreg;		/* Internal sysreg number.  */
+    xtensa_state state;			/* Internal state number.  */
+    xtensa_interface intf;		/* Internal interface number.  */
+    xtensa_funcUnit fun;		/* Internal funcUnit number.  */
+  } u;
+} xtensa_lookup_entry;
+
+typedef struct xtensa_isa_internal_struct
+{
+  int is_big_endian;			/* Endianness.  */
+  int insn_size;			/* Maximum length in bytes.  */
+  int insnbuf_size;			/* Number of insnbuf_words.  */
+
+  int num_formats;
+  xtensa_format_internal *formats;
+  xtensa_format_decode_fn format_decode_fn;
+  xtensa_length_decode_fn length_decode_fn;
+
+  int num_slots;
+  xtensa_slot_internal *slots;
+
+  int num_fields;
+
+  int num_operands;
+  xtensa_operand_internal *operands;
+
+  int num_iclasses;
+  xtensa_iclass_internal *iclasses;
+
+  int num_opcodes;
+  xtensa_opcode_internal *opcodes;
+  xtensa_lookup_entry *opname_lookup_table;
+
+  int num_regfiles;
+  xtensa_regfile_internal *regfiles;
+
+  int num_states;
+  xtensa_state_internal *states;
+  xtensa_lookup_entry *state_lookup_table;
+
+  int num_sysregs;
+  xtensa_sysreg_internal *sysregs;
+  xtensa_lookup_entry *sysreg_lookup_table;
+
+  /* The current Xtensa ISA only supports 256 of each kind of sysreg so
+     we can get away with implementing lookups with tables indexed by
+     the register numbers.  If we ever allow larger sysreg numbers, this
+     may have to be reimplemented.  The first entry in the following
+     arrays corresponds to "special" registers and the second to "user"
+     registers.  */
+  int max_sysreg_num[2];
+  xtensa_sysreg *sysreg_table[2];
+
+  int num_interfaces;
+  xtensa_interface_internal *interfaces;
+  xtensa_lookup_entry *interface_lookup_table;
+
+  int num_funcUnits;
+  xtensa_funcUnit_internal *funcUnits;
+  xtensa_lookup_entry *funcUnit_lookup_table;
+
+} xtensa_isa_internal;
+
+extern int xtensa_isa_name_compare (const void *, const void *);
+
+extern xtensa_isa_status xtisa_errno;
+extern char xtisa_error_msg[];
+
+#endif /* !XTENSA_ISA_INTERNAL_H */
diff --git a/utils/gapy/gen-debug-info-src/ext/xtensa-isa.h b/utils/gapy/gen-debug-info-src/ext/xtensa-isa.h
new file mode 100644
index 000000000..c3c740da4
--- /dev/null
+++ b/utils/gapy/gen-debug-info-src/ext/xtensa-isa.h
@@ -0,0 +1,813 @@
+/* Interface definition for configurable Xtensa ISA support.
+   Copyright 2003, 2004, 2005, 2006, 2008, 2010 Free Software Foundation, Inc.
+
+   This file is part of BFD, the Binary File Descriptor library.
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA 02110-1301, 
+   USA.  */
+
+#ifndef XTENSA_LIBISA_H
+#define XTENSA_LIBISA_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Version number: This is intended to help support code that works with
+   versions of this library from multiple Xtensa releases.  */
+
+#define XTENSA_ISA_VERSION 7000
+
+#ifndef uint32
+#define uint32 unsigned int
+#endif
+
+/* This file defines the interface to the Xtensa ISA library.  This
+   library contains most of the ISA-specific information for a
+   particular Xtensa processor.  For example, the set of valid
+   instructions, their opcode encodings and operand fields are all
+   included here.
+
+   This interface basically defines a number of abstract data types.
+
+   . an instruction buffer - for holding the raw instruction bits
+   . ISA info - information about the ISA as a whole
+   . instruction formats - instruction size and slot structure
+   . opcodes - information about individual instructions
+   . operands - information about register and immediate instruction operands
+   . stateOperands - information about processor state instruction operands
+   . interfaceOperands - information about interface instruction operands
+   . register files - register file information
+   . processor states - internal processor state information
+   . system registers - "special registers" and "user registers"
+   . interfaces - TIE interfaces that are external to the processor
+   . functional units - TIE shared functions
+
+   The interface defines a set of functions to access each data type.
+   With the exception of the instruction buffer, the internal
+   representations of the data structures are hidden.  All accesses must
+   be made through the functions defined here.  */
+
+typedef struct xtensa_isa_opaque { int unused; } *xtensa_isa;
+
+
+/* Most of the Xtensa ISA entities (e.g., opcodes, regfiles, etc.) are
+   represented here using sequential integers beginning with 0.  The
+   specific values are only fixed for a particular instantiation of an
+   xtensa_isa structure, so these values should only be used
+   internally.  */
+
+typedef int xtensa_opcode;
+typedef int xtensa_format;
+typedef int xtensa_regfile;
+typedef int xtensa_state;
+typedef int xtensa_sysreg;
+typedef int xtensa_interface;
+typedef int xtensa_funcUnit;
+
+
+/* Define a unique value for undefined items.  */
+
+#define XTENSA_UNDEFINED -1
+
+
+/* Overview of using this interface to decode/encode instructions:
+
+   Each Xtensa instruction is associated with a particular instruction
+   format, where the format defines a fixed number of slots for
+   operations.  The formats for the core Xtensa ISA have only one slot,
+   but FLIX instructions may have multiple slots.  Within each slot,
+   there is a single opcode and some number of associated operands.
+
+   The encoding and decoding functions operate on instruction buffers,
+   not on the raw bytes of the instructions.  The same instruction
+   buffer data structure is used for both entire instructions and
+   individual slots in those instructions -- the contents of a slot need
+   to be extracted from or inserted into the buffer for the instruction
+   as a whole.
+
+   Decoding an instruction involves first finding the format, which
+   identifies the number of slots, and then decoding each slot
+   separately.  A slot is decoded by finding the opcode and then using
+   the opcode to determine how many operands there are.  For example:
+
+   xtensa_insnbuf_from_chars
+   xtensa_format_decode
+   for each slot {
+     xtensa_format_get_slot
+     xtensa_opcode_decode
+     for each operand {
+       xtensa_operand_get_field
+       xtensa_operand_decode
+     }
+   }
+
+   Encoding an instruction is roughly the same procedure in reverse:
+
+   xtensa_format_encode
+   for each slot {
+     xtensa_opcode_encode
+     for each operand {
+       xtensa_operand_encode
+       xtensa_operand_set_field
+     }
+     xtensa_format_set_slot
+   }
+   xtensa_insnbuf_to_chars
+*/
+
+
+/* Error handling.  */
+
+/* Error codes.  The code for the most recent error condition can be
+   retrieved with the "errno" function.  For any result other than
+   xtensa_isa_ok, an error message containing additional information
+   about the problem can be retrieved using the "error_msg" function.
+   The error messages are stored in an internal buffer, which should
+   not be freed and may be overwritten by subsequent operations.  */
+
+typedef enum xtensa_isa_status_enum
+{
+  xtensa_isa_ok = 0,
+  xtensa_isa_bad_format,
+  xtensa_isa_bad_slot,
+  xtensa_isa_bad_opcode,
+  xtensa_isa_bad_operand,
+  xtensa_isa_bad_field,
+  xtensa_isa_bad_iclass,
+  xtensa_isa_bad_regfile,
+  xtensa_isa_bad_sysreg,
+  xtensa_isa_bad_state,
+  xtensa_isa_bad_interface,
+  xtensa_isa_bad_funcUnit,
+  xtensa_isa_wrong_slot,
+  xtensa_isa_no_field,
+  xtensa_isa_out_of_memory,
+  xtensa_isa_buffer_overflow,
+  xtensa_isa_internal_error,
+  xtensa_isa_bad_value
+} xtensa_isa_status;
+
+extern xtensa_isa_status
+xtensa_isa_errno (xtensa_isa isa);
+
+extern char *
+xtensa_isa_error_msg (xtensa_isa isa);
+
+
+
+/* Instruction buffers.  */
+
+typedef uint32 xtensa_insnbuf_word;
+typedef xtensa_insnbuf_word *xtensa_insnbuf;
+
+
+/* Get the size in "insnbuf_words" of the xtensa_insnbuf array.  */
+
+extern int
+xtensa_insnbuf_size (xtensa_isa isa); 
+
+
+/* Allocate an xtensa_insnbuf of the right size.  */
+
+extern xtensa_insnbuf
+xtensa_insnbuf_alloc (xtensa_isa isa);
+
+
+/* Release an xtensa_insnbuf.  */
+
+extern void
+xtensa_insnbuf_free (xtensa_isa isa, xtensa_insnbuf buf);
+
+
+/* Conversion between raw memory (char arrays) and our internal
+   instruction representation.  This is complicated by the Xtensa ISA's
+   variable instruction lengths.  When converting to chars, the buffer
+   must contain a valid instruction so we know how many bytes to copy;
+   thus, the "to_chars" function returns the number of bytes copied or
+   XTENSA_UNDEFINED on error.  The "from_chars" function first reads the
+   minimal number of bytes required to decode the instruction length and
+   then proceeds to copy the entire instruction into the buffer; if the
+   memory does not contain a valid instruction, it copies the maximum
+   number of bytes required for the longest Xtensa instruction.  The
+   "num_chars" argument may be used to limit the number of bytes that
+   can be read or written.  Otherwise, if "num_chars" is zero, the
+   functions may read or write past the end of the code.  */
+
+extern int
+xtensa_insnbuf_to_chars (xtensa_isa isa, const xtensa_insnbuf insn,
+			 unsigned char *cp, int num_chars);
+
+extern void
+xtensa_insnbuf_from_chars (xtensa_isa isa, xtensa_insnbuf insn,
+			   const unsigned char *cp, int num_chars);
+
+
+
+/* ISA information.  */
+
+/* Initialize the ISA information.  */
+
+extern xtensa_isa
+xtensa_isa_init (xtensa_isa_status *errno_p, char **error_msg_p);
+
+
+/* Deallocate an xtensa_isa structure.  */
+
+extern void
+xtensa_isa_free (xtensa_isa isa);
+
+
+/* Get the maximum instruction size in bytes.  */
+
+extern int
+xtensa_isa_maxlength (xtensa_isa isa); 
+
+
+/* Decode the length in bytes of an instruction in raw memory (not an
+   insnbuf).  This function reads only the minimal number of bytes
+   required to decode the instruction length.  Returns
+   XTENSA_UNDEFINED on error.  */
+
+extern int
+xtensa_isa_length_from_chars (xtensa_isa isa, const unsigned char *cp);
+
+
+/* Get the number of stages in the processor's pipeline.  The pipeline
+   stage values returned by other functions in this library will range
+   from 0 to N-1, where N is the value returned by this function.
+   Note that the stage numbers used here may not correspond to the
+   actual processor hardware, e.g., the hardware may have additional
+   stages before stage 0.  Returns XTENSA_UNDEFINED on error.  */
+
+extern int
+xtensa_isa_num_pipe_stages (xtensa_isa isa); 
+
+
+/* Get the number of various entities that are defined for this processor.  */
+
+extern int
+xtensa_isa_num_formats (xtensa_isa isa);
+
+extern int
+xtensa_isa_num_opcodes (xtensa_isa isa);
+
+extern int
+xtensa_isa_num_regfiles (xtensa_isa isa);
+
+extern int
+xtensa_isa_num_states (xtensa_isa isa);
+
+extern int
+xtensa_isa_num_sysregs (xtensa_isa isa);
+
+extern int
+xtensa_isa_num_interfaces (xtensa_isa isa);
+
+extern int
+xtensa_isa_num_funcUnits (xtensa_isa isa);
+
+
+
+/* Instruction formats.  */
+
+/* Get the name of a format.  Returns null on error.  */
+
+extern const char *
+xtensa_format_name (xtensa_isa isa, xtensa_format fmt);
+
+
+/* Given a format name, return the format number.  Returns
+   XTENSA_UNDEFINED if the name is not a valid format.  */
+
+extern xtensa_format
+xtensa_format_lookup (xtensa_isa isa, const char *fmtname);
+
+
+/* Decode the instruction format from a binary instruction buffer.
+   Returns XTENSA_UNDEFINED if the format is not recognized.  */
+
+extern xtensa_format
+xtensa_format_decode (xtensa_isa isa, const xtensa_insnbuf insn);
+
+
+/* Set the instruction format field(s) in a binary instruction buffer.
+   All the other fields are set to zero.  Returns non-zero on error.  */
+
+extern int
+xtensa_format_encode (xtensa_isa isa, xtensa_format fmt, xtensa_insnbuf insn);
+
+
+/* Find the length (in bytes) of an instruction.  Returns
+   XTENSA_UNDEFINED on error.  */
+
+extern int
+xtensa_format_length (xtensa_isa isa, xtensa_format fmt);
+
+
+/* Get the number of slots in an instruction.  Returns XTENSA_UNDEFINED
+   on error.  */
+
+extern int
+xtensa_format_num_slots (xtensa_isa isa, xtensa_format fmt);
+
+
+/* Get the opcode for a no-op in a particular slot.
+   Returns XTENSA_UNDEFINED on error.  */
+
+extern xtensa_opcode
+xtensa_format_slot_nop_opcode (xtensa_isa isa, xtensa_format fmt, int slot);
+
+
+/* Get the bits for a specified slot out of an insnbuf for the
+   instruction as a whole and put them into an insnbuf for that one
+   slot, and do the opposite to set a slot.  Return non-zero on error.  */
+
+extern int
+xtensa_format_get_slot (xtensa_isa isa, xtensa_format fmt, int slot,
+			const xtensa_insnbuf insn, xtensa_insnbuf slotbuf);
+
+extern int
+xtensa_format_set_slot (xtensa_isa isa, xtensa_format fmt, int slot,
+			xtensa_insnbuf insn, const xtensa_insnbuf slotbuf);
+
+
+
+/* Opcode information.  */
+
+/* Translate a mnemonic name to an opcode.  Returns XTENSA_UNDEFINED if
+   the name is not a valid opcode mnemonic.  */
+
+extern xtensa_opcode
+xtensa_opcode_lookup (xtensa_isa isa, const char *opname);
+
+
+/* Decode the opcode for one instruction slot from a binary instruction
+   buffer.  Returns the opcode or XTENSA_UNDEFINED if the opcode is
+   illegal.  */
+
+extern xtensa_opcode
+xtensa_opcode_decode (xtensa_isa isa, xtensa_format fmt, int slot,
+		      const xtensa_insnbuf slotbuf);
+
+
+/* Set the opcode field(s) for an instruction slot.  All other fields
+   in the slot are set to zero.  Returns non-zero if the opcode cannot
+   be encoded.  */
+
+extern int
+xtensa_opcode_encode (xtensa_isa isa, xtensa_format fmt, int slot,
+		      xtensa_insnbuf slotbuf, xtensa_opcode opc);
+
+
+/* Get the mnemonic name for an opcode.  Returns null on error.  */
+
+extern const char *
+xtensa_opcode_name (xtensa_isa isa, xtensa_opcode opc);
+
+
+/* Check various properties of opcodes.  These functions return 0 if
+   the condition is false, 1 if the condition is true, and
+   XTENSA_UNDEFINED on error.  The instructions are classified as
+   follows:
+
+   branch: conditional branch; may fall through to next instruction (B*)
+   jump: unconditional branch (J, JX, RET*, RF*)
+   loop: zero-overhead loop (LOOP*)
+   call: unconditional call; control returns to next instruction (CALL*)
+
+   For the opcodes that affect control flow in some way, the branch
+   target may be specified by an immediate operand or it may be an
+   address stored in a register.  You can distinguish these by
+   checking if the instruction has a PC-relative immediate
+   operand.  */
+
+extern int
+xtensa_opcode_is_branch (xtensa_isa isa, xtensa_opcode opc);
+
+extern int
+xtensa_opcode_is_jump (xtensa_isa isa, xtensa_opcode opc);
+
+extern int
+xtensa_opcode_is_loop (xtensa_isa isa, xtensa_opcode opc);
+
+extern int
+xtensa_opcode_is_call (xtensa_isa isa, xtensa_opcode opc);
+
+
+/* Find the number of ordinary operands, state operands, and interface
+   operands for an instruction.  These return XTENSA_UNDEFINED on
+   error.  */
+
+extern int
+xtensa_opcode_num_operands (xtensa_isa isa, xtensa_opcode opc);
+
+extern int
+xtensa_opcode_num_stateOperands (xtensa_isa isa, xtensa_opcode opc);
+
+extern int
+xtensa_opcode_num_interfaceOperands (xtensa_isa isa, xtensa_opcode opc);
+
+
+/* Get functional unit usage requirements for an opcode.  Each "use"
+   is identified by a <functional unit, pipeline stage> pair.  The
+   "num_funcUnit_uses" function returns the number of these "uses" or
+   XTENSA_UNDEFINED on error.  The "funcUnit_use" function returns
+   a pointer to a "use" pair or null on error.  */
+
+typedef struct xtensa_funcUnit_use_struct
+{
+  xtensa_funcUnit unit;  /* functional unit of this "use" pair */
+  int stage;  /* pipeline stage of this "use" pair */
+} xtensa_funcUnit_use;
+
+extern int
+xtensa_opcode_num_funcUnit_uses (xtensa_isa isa, xtensa_opcode opc);
+
+extern xtensa_funcUnit_use *
+xtensa_opcode_funcUnit_use (xtensa_isa isa, xtensa_opcode opc, int u);
+
+
+
+/* Operand information.  */
+
+/* Get the name of an operand.  Returns null on error.  */
+
+extern const char *
+xtensa_operand_name (xtensa_isa isa, xtensa_opcode opc, int opnd);
+
+
+/* Some operands are "invisible", i.e., not explicitly specified in
+   assembly language.  When assembling an instruction, you need not set
+   the values of invisible operands, since they are either hardwired or
+   derived from other field values.  The values of invisible operands
+   can be examined in the same way as other operands, but remember that
+   an invisible operand may get its value from another visible one, so
+   the entire instruction must be available before examining the
+   invisible operand values.  This function returns 1 if an operand is
+   visible, 0 if it is invisible, or XTENSA_UNDEFINED on error.  Note
+   that whether an operand is visible is orthogonal to whether it is
+   "implicit", i.e., whether it lacks a field encoding it in the
+   instruction.  */
+
+extern int
+xtensa_operand_is_visible (xtensa_isa isa, xtensa_opcode opc, int opnd);
+
+
+/* Check if an operand is an input ('i'), output ('o'), or inout ('m')
+   operand.  Note: The output operand of a conditional assignment
+   (e.g., movnez) appears here as an inout ('m') even if it is declared
+   in the TIE code as an output ('o'); this allows the compiler to
+   properly handle register allocation for conditional assignments.
+   Returns 0 on error.  */
+
+extern char
+xtensa_operand_inout (xtensa_isa isa, xtensa_opcode opc, int opnd);
+
+
+/* Get and set the raw (encoded) value of the field for the specified
+   operand.  The "set" function does not check if the value fits in the
+   field; that is done by the "encode" function below.  Both of these
+   functions return non-zero on error, e.g., if the field is not defined
+   for the specified slot.  */
+
+extern int
+xtensa_operand_get_field (xtensa_isa isa, xtensa_opcode opc, int opnd,
+			  xtensa_format fmt, int slot,
+			  const xtensa_insnbuf slotbuf, uint32 *valp);
+
+extern int 
+xtensa_operand_set_field (xtensa_isa isa, xtensa_opcode opc, int opnd,
+			  xtensa_format fmt, int slot,
+			  xtensa_insnbuf slotbuf, uint32 val);
+
+
+/* Encode and decode operands.  The raw bits in the operand field may
+   be encoded in a variety of different ways.  These functions hide
+   the details of that encoding.  The result values are returned through
+   the argument pointer.  The return value is non-zero on error.  */
+
+extern int
+xtensa_operand_encode (xtensa_isa isa, xtensa_opcode opc, int opnd,
+		       uint32 *valp);
+
+extern int
+xtensa_operand_decode (xtensa_isa isa, xtensa_opcode opc, int opnd,
+		       uint32 *valp);
+
+
+/* An operand may be either a register operand or an immediate of some
+   sort (e.g., PC-relative or not).  The "is_register" function returns
+   0 if the operand is an immediate, 1 if it is a register, and
+   XTENSA_UNDEFINED on error.  The "regfile" function returns the
+   regfile for a register operand, or XTENSA_UNDEFINED on error.  */
+
+extern int
+xtensa_operand_is_register (xtensa_isa isa, xtensa_opcode opc, int opnd);
+
+extern xtensa_regfile
+xtensa_operand_regfile (xtensa_isa isa, xtensa_opcode opc, int opnd);
+
+
+/* Register operands may span multiple consecutive registers, e.g., a
+   64-bit data type may occupy two 32-bit registers.  Only the first
+   register is encoded in the operand field.  This function specifies
+   the number of consecutive registers occupied by this operand.  For
+   non-register operands, the return value is undefined.  Returns
+   XTENSA_UNDEFINED on error.  */
+
+extern int
+xtensa_operand_num_regs (xtensa_isa isa, xtensa_opcode opc, int opnd);
+				 
+
+/* Some register operands do not completely identify the register being
+   accessed.  For example, the operand value may be added to an internal
+   state value.  By definition, this implies that the corresponding
+   regfile is not allocatable.  Unknown registers should generally be
+   treated with worst-case assumptions.  The function returns 0 if the
+   register value is unknown, 1 if known, and XTENSA_UNDEFINED on
+   error.  */
+
+extern int
+xtensa_operand_is_known_reg (xtensa_isa isa, xtensa_opcode opc, int opnd);
+
+
+/* Check if an immediate operand is PC-relative.  Returns 0 for register
+   operands and non-PC-relative immediates, 1 for PC-relative
+   immediates, and XTENSA_UNDEFINED on error.  */
+ 
+extern int
+xtensa_operand_is_PCrelative (xtensa_isa isa, xtensa_opcode opc, int opnd);
+
+
+/* For PC-relative offset operands, the interpretation of the offset may
+   vary between opcodes, e.g., is it relative to the current PC or that
+   of the next instruction?  The following functions are defined to
+   perform PC-relative relocations and to undo them (as in the
+   disassembler).  The "do_reloc" function takes the desired address
+   value and the PC of the current instruction and sets the value to the
+   corresponding PC-relative offset (which can then be encoded and
+   stored into the operand field).  The "undo_reloc" function takes the
+   unencoded offset value and the current PC and sets the value to the
+   appropriate address.  The return values are non-zero on error.  Note
+   that these functions do not replace the encode/decode functions; the
+   operands must be encoded/decoded separately and the encode functions
+   are responsible for detecting invalid operand values.  */
+
+extern int
+xtensa_operand_do_reloc (xtensa_isa isa, xtensa_opcode opc, int opnd,
+			 uint32 *valp, uint32 pc);
+
+extern int
+xtensa_operand_undo_reloc (xtensa_isa isa, xtensa_opcode opc, int opnd,
+			   uint32 *valp, uint32 pc);
+
+
+
+/* State Operands.  */
+
+/* Get the state accessed by a state operand.  Returns XTENSA_UNDEFINED
+   on error.  */
+
+extern xtensa_state
+xtensa_stateOperand_state (xtensa_isa isa, xtensa_opcode opc, int stOp);
+
+
+/* Check if a state operand is an input ('i'), output ('o'), or inout
+   ('m') operand.  Returns 0 on error.  */
+
+extern char
+xtensa_stateOperand_inout (xtensa_isa isa, xtensa_opcode opc, int stOp);
+
+
+
+/* Interface Operands.  */
+
+/* Get the external interface accessed by an interface operand.
+   Returns XTENSA_UNDEFINED on error.  */
+
+extern xtensa_interface
+xtensa_interfaceOperand_interface (xtensa_isa isa, xtensa_opcode opc,
+				   int ifOp);
+
+
+
+/* Register Files.  */
+
+/* Regfiles include both "real" regfiles and "views", where a view
+   allows a group of adjacent registers in a real "parent" regfile to be
+   viewed as a single register.  A regfile view has all the same
+   properties as its parent except for its (long) name, bit width, number
+   of entries, and default ctype.  You can use the parent function to
+   distinguish these two classes.  */
+
+/* Look up a regfile by either its name or its abbreviated "short name".
+   Returns XTENSA_UNDEFINED on error.  The "lookup_shortname" function
+   ignores "view" regfiles since they always have the same shortname as
+   their parents.  */
+
+extern xtensa_regfile
+xtensa_regfile_lookup (xtensa_isa isa, const char *name);
+
+extern xtensa_regfile
+xtensa_regfile_lookup_shortname (xtensa_isa isa, const char *shortname);
+
+
+/* Get the name or abbreviated "short name" of a regfile.
+   Returns null on error.  */
+
+extern const char *
+xtensa_regfile_name (xtensa_isa isa, xtensa_regfile rf);
+
+extern const char *
+xtensa_regfile_shortname (xtensa_isa isa, xtensa_regfile rf);
+
+
+/* Get the parent regfile of a "view" regfile.  If the regfile is not a
+   view, the result is the same as the input parameter.  Returns
+   XTENSA_UNDEFINED on error.  */
+
+extern xtensa_regfile
+xtensa_regfile_view_parent (xtensa_isa isa, xtensa_regfile rf);
+
+
+/* Get the bit width of a regfile or regfile view.
+   Returns XTENSA_UNDEFINED on error.  */
+
+extern int
+xtensa_regfile_num_bits (xtensa_isa isa, xtensa_regfile rf);
+
+
+/* Get the number of regfile entries.  Returns XTENSA_UNDEFINED on
+   error.  */
+
+extern int
+xtensa_regfile_num_entries (xtensa_isa isa, xtensa_regfile rf);
+
+
+
+/* Processor States.  */
+
+/* Look up a state by name.  Returns XTENSA_UNDEFINED on error.  */
+
+extern xtensa_state
+xtensa_state_lookup (xtensa_isa isa, const char *name);
+
+
+/* Get the name for a processor state.  Returns null on error.  */
+
+extern const char *
+xtensa_state_name (xtensa_isa isa, xtensa_state st);
+
+
+/* Get the bit width for a processor state.
+   Returns XTENSA_UNDEFINED on error.  */
+
+extern int
+xtensa_state_num_bits (xtensa_isa isa, xtensa_state st);
+
+
+/* Check if a state is exported from the processor core.  Returns 0 if
+   the condition is false, 1 if the condition is true, and
+   XTENSA_UNDEFINED on error.  */
+
+extern int
+xtensa_state_is_exported (xtensa_isa isa, xtensa_state st);
+
+
+/* Check for a "shared_or" state.  Returns 0 if the condition is false,
+   1 if the condition is true, and XTENSA_UNDEFINED on error.  */
+
+extern int
+xtensa_state_is_shared_or (xtensa_isa isa, xtensa_state st);
+
+
+
+/* Sysregs ("special registers" and "user registers").  */
+
+/* Look up a register by its number and whether it is a "user register"
+   or a "special register".  Returns XTENSA_UNDEFINED if the sysreg does
+   not exist.  */
+
+extern xtensa_sysreg
+xtensa_sysreg_lookup (xtensa_isa isa, int num, int is_user);
+
+
+/* Check if there exists a sysreg with a given name.
+   If not, this function returns XTENSA_UNDEFINED.  */
+
+extern xtensa_sysreg
+xtensa_sysreg_lookup_name (xtensa_isa isa, const char *name);
+
+
+/* Get the name of a sysreg.  Returns null on error.  */
+
+extern const char *
+xtensa_sysreg_name (xtensa_isa isa, xtensa_sysreg sysreg);
+
+
+/* Get the register number.  Returns XTENSA_UNDEFINED on error.  */
+
+extern int
+xtensa_sysreg_number (xtensa_isa isa, xtensa_sysreg sysreg);
+
+
+/* Check if a sysreg is a "special register" or a "user register".
+   Returns 0 for special registers, 1 for user registers and
+   XTENSA_UNDEFINED on error.  */
+
+extern int
+xtensa_sysreg_is_user (xtensa_isa isa, xtensa_sysreg sysreg);
+
+
+
+/* Interfaces.  */
+
+/* Find an interface by name.  The return value is XTENSA_UNDEFINED if
+   the specified interface is not found.  */
+
+extern xtensa_interface
+xtensa_interface_lookup (xtensa_isa isa, const char *ifname);
+
+
+/* Get the name of an interface.  Returns null on error.  */
+
+extern const char *
+xtensa_interface_name (xtensa_isa isa, xtensa_interface intf);
+
+
+/* Get the bit width for an interface.
+   Returns XTENSA_UNDEFINED on error.  */
+
+extern int
+xtensa_interface_num_bits (xtensa_isa isa, xtensa_interface intf);
+
+
+/* Check if an interface is an input ('i') or output ('o') with respect
+   to the Xtensa processor core.  Returns 0 on error.  */
+
+extern char
+xtensa_interface_inout (xtensa_isa isa, xtensa_interface intf);
+
+
+/* Check if accessing an interface has potential side effects.
+   Currently "data" interfaces have side effects and "control"
+   interfaces do not.  Returns 1 if there are side effects, 0 if not,
+   and XTENSA_UNDEFINED on error.  */
+
+extern int
+xtensa_interface_has_side_effect (xtensa_isa isa, xtensa_interface intf);
+
+
+/* Some interfaces may be related such that accessing one interface
+   has side effects on a set of related interfaces.  The interfaces
+   are partitioned into equivalence classes of related interfaces, and
+   each class is assigned a unique identifier number.  This function
+   returns the class identifier for an interface, or XTENSA_UNDEFINED
+   on error.  These identifiers can be compared to determine if two
+   interfaces are related; the specific values of the identifiers have
+   no particular meaning otherwise.  */
+
+extern int
+xtensa_interface_class_id (xtensa_isa isa, xtensa_interface intf);
+
+
+
+/* Functional Units.  */
+
+/* Find a functional unit by name.  The return value is XTENSA_UNDEFINED if
+   the specified unit is not found.  */
+
+extern xtensa_funcUnit
+xtensa_funcUnit_lookup (xtensa_isa isa, const char *fname);
+
+
+/* Get the name of a functional unit.  Returns null on error.  */
+
+extern const char *
+xtensa_funcUnit_name (xtensa_isa isa, xtensa_funcUnit fun);
+
+
+/* Functional units may be replicated.  See how many instances of a
+   particular functional unit exist.  Returns XTENSA_UNDEFINED on error.  */
+
+extern int
+xtensa_funcUnit_num_copies (xtensa_isa isa, xtensa_funcUnit fun);
+
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* XTENSA_LIBISA_H */
diff --git a/utils/gapy/gen-debug-info-src/main.cpp b/utils/gapy/gen-debug-info-src/main.cpp
new file mode 100644
index 000000000..2f0c23107
--- /dev/null
+++ b/utils/gapy/gen-debug-info-src/main.cpp
@@ -0,0 +1,119 @@
+/* Main header file for the bfd library -- portable access to object files.
+
+   Copyright (C) 1990-2017 Free Software Foundation, Inc.
+
+   Contributed by Cygnus Support.
+
+   This file is part of BFD, the Binary File Descriptor library.
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA 02110-1301, USA.  */
+
+#include <bfd/config.h>
+#include <bfd/bfd.h>
+#include <string.h>
+#include <stdlib.h>
+#include <stdio.h>
+
+static asymbol **asymbols = NULL;
+static bfd *abfd;
+
+/* Print one "<addr> <function> <function> <file> <line>" record per 2-byte
+   step of each SEC_CODE section of the global abfd that BFD maps to a source
+   line, to OUTPUT or to stdout when OUTPUT is NULL.  Returns 0 on success and
+   -1 if the symbol table cannot be sized, allocated or read.  */
+static int dump_debug(FILE *output)
+{
+    FILE *out = output ? output : stdout;
+
+    if (asymbols == NULL) {
+        long symsize = bfd_get_symtab_upper_bound (abfd);
+        if (symsize < 0) return -1;
+        /* Never request 0 bytes: malloc(0) may legitimately return NULL.  */
+        asymbols = (asymbol **) malloc (symsize > 0 ? (size_t) symsize : sizeof (asymbol *));
+        if (asymbols == NULL) return -1;
+        if (bfd_canonicalize_symtab (abfd, asymbols) < 0) {
+            free (asymbols);
+            asymbols = NULL;
+            return -1;
+        }
+    }
+
+    for (asection *s = abfd->sections; s; s = s->next)
+    {
+        if (!(s->flags & SEC_CODE)) continue;
+
+        unsigned long long section_base = bfd_get_section_vma(abfd, s);
+        unsigned long long section_size = bfd_section_size (abfd, s);
+
+        for (unsigned long long addr = 0; addr < section_size; addr += 2)
+        {
+            const char *file = NULL, *function = NULL;
+            unsigned int line = 0;
+
+            if (!bfd_find_nearest_line(abfd, s, asymbols, addr, &file, &function, &line)) continue;
+            /* %s with a NULL pointer is undefined; print "(null)" explicitly.  */
+            fprintf(out, "%llx %s %s %s %u\n", section_base + addr, function ? function : "(null)",
+                    function ? function : "(null)", file ? file : "(null)", line);
+        }
+    }
+
+    return 0;
+}
+
+/* Usage: <prog> <object-file> [output-file].  Dumps the debug line records
+   of the object file to output-file, or to stdout when it is omitted.
+   Returns 0 on success and -1 on any failure.  */
+int main(int argc, char **argv)
+{
+    if (argc < 2)
+    {
+        /* Without this check argv[1] is NULL and would be handed to bfd_openr.  */
+        fprintf (stderr, "Usage: %s <object-file> [output-file]\n", argc > 0 ? argv[0] : "gen-debug-info");
+        return -1;
+    }
+
+    const char *input = argv[1];
+    const char *output = argc > 2 ? argv[2] : NULL;
+
+    abfd = bfd_openr(input, 0);
+    if (abfd == NULL)
+    {
+        fprintf (stderr, "Can't open %s: %s\n", input, bfd_errmsg (bfd_get_error ()));
+        return -1;
+    }
+
+    if (!bfd_check_format (abfd, bfd_object))
+    {
+        fprintf (stderr, "Can't load %s: %s\n", input, bfd_errmsg (bfd_get_error ()));
+        bfd_close (abfd);
+        return -1;
+    }
+
+    FILE *output_file = NULL;
+    if (output != NULL && (output_file = fopen(output, "w")) == NULL)
+    {
+        perror (output);
+        bfd_close (abfd);
+        return -1;
+    }
+
+    int status = dump_debug(output_file) ? -1 : 0;
+
+    /* A failed fclose means buffered records may not have reached the file.  */
+    if (output_file && fclose(output_file) != 0) status = -1;
+    bfd_close (abfd);
+
+    return status;
+}
\ No newline at end of file
diff --git a/utils/gapy/runner/board/board_runner.py b/utils/gapy/runner/board/board_runner.py
index 5b9580d86..f790ecef4 100644
--- a/utils/gapy/runner/board/board_runner.py
+++ b/utils/gapy/runner/board/board_runner.py
@@ -38,49 +38,77 @@ def __init__(self, args, config, system):
 
 
     def flash(self):
-        flash = self.get_boot_flash()
-        if flash.get_bool('content/flash'):
+        for flash in self.get_flashs():
+            if flash.get_bool('content/flash'):
 
-            image = flash.get_str('content/image')
+                image = flash.get_str('content/image')
 
-            if os.environ.get('GAP_USE_PLPBRIDGE') is not None:
-                cmd = 'plpbridge --chip=%s --verbose 10 --cable=%s --flash-image=%s flash wait' % (os.environ.get('TARGET_NAME'), os.environ.get("PLPBRIDGE_CABLE"), image)
+                if os.environ.get('GAP_USE_PLPBRIDGE') is not None:
+                    cmd = 'plpbridge --chip=%s --verbose 10 --cable=%s --flash-image=%s flash wait' % (os.environ.get('TARGET_NAME'), os.environ.get("PLPBRIDGE_CABLE"), image)
 
-            else:
-                if os.environ.get('GAPY_OPENOCD_CABLE') is not None:
-                    self.config.set('openocd/cable', os.environ.get('GAPY_OPENOCD_CABLE'))
+                else:
+                    if os.environ.get('GAPY_OPENOCD_CABLE') is not None:
+                        self.config.set('openocd/cable', os.environ.get('GAPY_OPENOCD_CABLE'))
+
+                    openocd = self.config.get_str("openocd/path")
+                    cable = self.config.get_str('openocd/cable')
+                    script = self.config.get_str('openocd/script')
+                    image_size = os.path.getsize(image)
+                    gap_tools = os.environ.get('GAP_OPENOCD_TOOLS')
+
+                    wsl    = self.config.get_str('runner/wsl')
+                    if wsl is None:
+                        wsl_image = image
+                    else:
+                        path_header = '\\"//wsl$/' + wsl
+                        path_footer = '\\"'
+                        wsl_image = path_header + image + path_footer
+                        script = os.environ.get('OPENOCD_CHIP_TARGET')
+
+                    if self.config.get_str('**/chip_family') == 'gap':
+
+                        if flash.get_str('datasheet/type') == 'spi':
+                            flasher_script = 'gap_flash_raw_spi'
+                        else:
+                            flasher_script = 'gap_flash_raw_hyper'
+
+                        cmd = '%s -d0 -c "gdb_port disabled; telnet_port disabled; tcl_port disabled" -c "script %s; script %s; script tcl/flash_image.tcl; script tcl/jtag_boot.tcl; %s %s %d %s; exit;"' % (openocd, cable, script, flasher_script, image, image_size, gap_tools)
 
-                openocd = self.config.get_str("openocd/path")
-                cable = self.config.get_str('openocd/cable')
-                script = self.config.get_str('openocd/script')
-                image_size = os.path.getsize(image)
-                gap_tools = os.environ.get('GAP_OPENOCD_TOOLS')
+                    elif self.config.get_str('**/chip/name') == 'vega':
 
-                if self.config.get_str('**/chip_family') == 'gap':
+                        cmd = '%s -d0 -c "gdb_port disabled; telnet_port disabled; tcl_port disabled" -f "%s" -f "%s" -f "%s/tcl/flash_image.tcl" -c "vega_flash_raw_hyper %s %d %s; exit;"' % (openocd, cable, script, gap_tools, wsl_image, image_size, gap_tools)
 
-                    if flash.get_str('datasheet/type') == 'spi':
-                        flasher_script = 'gap_flash_raw_spi'
                     else:
-                        flasher_script = 'gap_flash_raw_hyper'
 
-                    cmd = '%s -d0 -c "gdb_port disabled; telnet_port disabled; tcl_port disabled" -c "script %s; script %s; script tcl/flash_image.tcl; script tcl/jtag_boot.tcl; %s %s %d %s; exit;"' % (openocd, cable, script, flasher_script, image, image_size, gap_tools)
+                        if flash.get_str('datasheet/type') == 'mram':
 
-                else:
+                            flasher_binary = gap_tools + '/gap_bins/gap_flasher-gap9_evk-mram.elf'
+                            sector_size = 0x2000
 
-                    if flash.get_str('datasheet/type') == 'spi':
-                        flasher_script = 'gap9_flash_raw_spi'
-                    else: 
-                        if self.config.get_str('**/chip/name') == 'vega':
-                            flasher_script = 'vega_flash_raw_hyper'
                         else:
-                            flasher_script = 'gap9_flash_raw_hyper'
 
-                    cmd = '%s -d0 -c "gdb_port disabled; telnet_port disabled; tcl_port disabled" -c "script %s; script %s; script %s/tcl/flash_image.tcl; %s %s %d %s; exit;"' % (openocd, cable, script, gap_tools, flasher_script, image, image_size, gap_tools)
-
-            print ('Flashing image with command:')
-            print (cmd)
-            
-            return os.system(cmd)
+                            if os.environ.get('BOARD_NAME') == 'gap9_evk':
+                                if self.config.get_str('runner/platform') == 'fpga':
+                                    flasher_binary = gap_tools + '/gap_bins/gap_flasher-gap9_evk-fpga.elf'
+                                else:
+                                    flasher_binary = gap_tools + '/gap_bins/gap_flasher-gap9_evk.elf'
+                                sector_size = 0x1000
+                            else:
+                                if self.config.get_str('runner/platform') == 'fpga':
+                                    flasher_binary = gap_tools + '/gap_bins/gap_flasher-gapuino9.elf'
+                                    sector_size = 0x40000
+                                else:
+                                    # This is for the variant of socketed gap9mod with atxp032
+                                    flasher_binary = gap_tools + '/gap_bins/gap_flasher-gap9_v2.elf'
+                                    sector_size = 0x1000
+
+                        cmd = '%s -d0 -c "gdb_port disabled; telnet_port disabled; tcl_port disabled" -f "%s" -f "%s" -f "%s/tcl/flash_image.tcl" -c "gap9_flash_raw %s %d %s 0x%x; exit;"' % (openocd, cable, script, gap_tools, wsl_image, image_size, flasher_binary, sector_size)
+
+                print ('Flashing image with command:')
+                print (cmd)
+                
+                if os.system(cmd):
+                    return -1
         
         return 0
 
diff --git a/utils/gapy/runner/chips/gap9_v2_efuse.py b/utils/gapy/runner/chips/gap9_v2_efuse.py
index de9d6e37f..873d8a45f 100644
--- a/utils/gapy/runner/chips/gap9_v2_efuse.py
+++ b/utils/gapy/runner/chips/gap9_v2_efuse.py
@@ -329,6 +329,23 @@ def __str__(self):
         return result
 
 
+    def gen_c_struct(self, name, file):
+        """Emit to `file` the C typedef pi_fuser_reg_t and an array `name` holding every non-zero efuse."""
+        prologue = ('typedef struct\n', '{\n', '    unsigned int id;\n', '    unsigned int val;\n',
+                    '}pi_fuser_reg_t;\n', '\n', 'pi_fuser_reg_t %s[] = {\n' % name)
+        for text in prologue:
+            file.write(text)
+
+        # Registers whose value is zero are left out of the generated table
+        for reg_index in range(self.nb_regs):
+            reg_value = self.efuses_list[reg_index].get()
+            if reg_value == 0:
+                continue
+            file.write('    { .id=%d, .val=0x%x },\n' % (reg_index, reg_value))
+        file.write('};\n')
+
+
+
     def gen(self, traces, filename):
         traces.info('  Generating to file: ' + filename)
 
diff --git a/utils/gapy/runner/default_runner.py b/utils/gapy/runner/default_runner.py
index 10eb76b5c..1098e24d6 100644
--- a/utils/gapy/runner/default_runner.py
+++ b/utils/gapy/runner/default_runner.py
@@ -103,7 +103,7 @@ def __handle_target(self, target, flash_device_prefix=None):
 
                 # The flash can contain the boot binary and several partitions for FS
                 if flash_config.get('content/boot-loader') is not None:
-
+                    flash_image = True
                     gen_image = True
 
                 if flash_config.get('content/partitions') is not None:
@@ -184,6 +184,17 @@ def exec_prepare(self):
     def get_boot_mode(self, target='runner'):
         return self.config.get_str('%s/boot/mode' % target)
 
+    def get_flashs(self, target='runner', flash_device_prefix=None):
+        """Return the config node of every flash path listed under '<target>/flash_devices'.
+
+        Each path is prefixed with flash_device_prefix when one is given.  A missing or empty
+        list gives [] (NOTE(review): assumes get_py returns None for an absent key -- confirm)."""
+        flash_paths = self.config.get_py(target + '/flash_devices') or []
+        if flash_device_prefix is not None:
+            flash_paths = [flash_device_prefix + flash_path for flash_path in flash_paths]
+        return [self.config.get(flash_path) for flash_path in flash_paths]
+
+
     def get_boot_flash(self, target='runner', flash_device_prefix=None):
         flash_path = self.config.get_str("%s/boot/device" % target)
         if flash_path is None:
diff --git a/utils/gapy/runner/rtl/chips/gap9.py b/utils/gapy/runner/rtl/chips/gap9.py
index e22f578f2..400ea92de 100644
--- a/utils/gapy/runner/rtl/chips/gap9.py
+++ b/utils/gapy/runner/rtl/chips/gap9.py
@@ -48,6 +48,11 @@ def __init__(self, args, config, system):
 
         self.set_cmd_arg('+TB_DEBUG_VERBOSITY=%d' % self.config.get_int('**/rtl/verbosity'))
 
+        if os.environ.get('BOARD_NAME') == 'gap9_evk':
+            self.set_cmd_arg('+ENABLE_HYPER0_CS1_MX25U51245G_VIP=1')
+            self.set_cmd_arg('+ENABLE_HYPER0_CS0_PSRAM_VIP=1')
+            self.set_cmd_arg('+VIP_MODE=CUSTOM')
+
         boot_mode = self.config.get_str('**/runner/boot/mode')
         if boot_mode == 'jtag' or self.config.get_bool('**/runner/boot/jtag_force'):
             self.set_cmd_arg('+VSIM_BOOTMODE_CFG=1')
@@ -84,6 +89,9 @@ def __init__(self, args, config, system):
         self.set_arg('-permit_unmatched_virtual_intf')
         self.set_arg('+preload_file=efuse_preload.data')
 
+        if self.platform_tool == 'vsim':
+            self.set_arg('-suppress 12130')
+
         uart_baudrate = self.config.get_int('**/rtl/testbench/uart/baudrate')
         if uart_baudrate is not None:
             self.set_cmd_arg('+CONFIG_UART_BAUDRATE=%d' % uart_baudrate)
diff --git a/utils/gapy/runner/rtl/rtl_runner.py b/utils/gapy/runner/rtl/rtl_runner.py
index c094d500b..c1e1540d4 100644
--- a/utils/gapy/runner/rtl/rtl_runner.py
+++ b/utils/gapy/runner/rtl/rtl_runner.py
@@ -204,10 +204,7 @@ def exec(self):
             status = 0
 
         if self.args.extend_traces:
-            if os.environ.get('CONFIG_NEW_HARTS') is not None:
-                traces = ['trace_core_00_9.log', 'trace_core_00_0.log', 'trace_core_00_1.log', 'trace_core_00_2.log', 'trace_core_00_3.log', 'trace_core_00_4.log', 'trace_core_00_5.log', 'trace_core_00_6.log', 'trace_core_00_7.log', 'trace_core_00_8.log']
-            else:
-                traces = ['trace_core_1f_0.log', 'trace_core_00_0.log', 'trace_core_00_1.log', 'trace_core_00_2.log', 'trace_core_00_3.log', 'trace_core_00_4.log', 'trace_core_00_5.log', 'trace_core_00_6.log', 'trace_core_00_7.log', 'trace_core_00_8.log']
+            traces = ['trace_core_00_9.log', 'trace_core_00_0.log', 'trace_core_00_1.log', 'trace_core_00_2.log', 'trace_core_00_3.log', 'trace_core_00_4.log', 'trace_core_00_5.log', 'trace_core_00_6.log', 'trace_core_00_7.log', 'trace_core_00_8.log']
             binary = self.config.get_str('runner/boot-loader')
             rom_binary = '%s/boot/boot-gap9' % self.__get_platform_path()
 
diff --git a/utils/gapy/targets/gap9_v2.json b/utils/gapy/targets/gap9_v2.json
index 7e5ff1cc4..85b3c05d7 100644
--- a/utils/gapy/targets/gap9_v2.json
+++ b/utils/gapy/targets/gap9_v2.json
@@ -45,7 +45,7 @@
         ],
         "boot": {
             "jtag_mode": null,
-            "mode": "flash",
+            "mode": "jtag",
             "device": "target/board/devices/flash"
         },
         "efuses": {
diff --git a/utils/openocd_tools/gap_bins/gap_flasher-gap9_evk-fpga.elf b/utils/openocd_tools/gap_bins/gap_flasher-gap9_evk-fpga.elf
new file mode 100755
index 000000000..05fe0de49
Binary files /dev/null and b/utils/openocd_tools/gap_bins/gap_flasher-gap9_evk-fpga.elf differ
diff --git a/utils/openocd_tools/gap_bins/gap_flasher-gap9_evk-mram.elf b/utils/openocd_tools/gap_bins/gap_flasher-gap9_evk-mram.elf
new file mode 100755
index 000000000..78e5c9b04
Binary files /dev/null and b/utils/openocd_tools/gap_bins/gap_flasher-gap9_evk-mram.elf differ
diff --git a/utils/openocd_tools/gap_bins/gap_flasher-gap9_evk.elf b/utils/openocd_tools/gap_bins/gap_flasher-gap9_evk.elf
new file mode 100755
index 000000000..63d1e132b
Binary files /dev/null and b/utils/openocd_tools/gap_bins/gap_flasher-gap9_evk.elf differ
diff --git a/utils/openocd_tools/gap_bins/gap_flasher-gap9_v2.elf b/utils/openocd_tools/gap_bins/gap_flasher-gap9_v2.elf
new file mode 100755
index 000000000..29564a524
Binary files /dev/null and b/utils/openocd_tools/gap_bins/gap_flasher-gap9_v2.elf differ
diff --git a/utils/openocd_tools/src/flasher/Makefile b/utils/openocd_tools/src/flasher/Makefile
index 4f44f60ce..4405b569e 100644
--- a/utils/openocd_tools/src/flasher/Makefile
+++ b/utils/openocd_tools/src/flasher/Makefile
@@ -5,9 +5,12 @@ APP              = gap_flasher
 APP_SRCS        += gap_flasher.c
 APP_INC	        +=
 
+ifdef MRAM
+APP_CFLAGS      += -DUSE_MRAM=1
+else
 spiflash ?= 0
 flash = $(spiflash)
-
 APP_CFLAGS      += -DFLASH_TYPE=$(flash)
+endif
 
 include $(RULES_DIR)/pmsis_rules.mk
diff --git a/utils/openocd_tools/src/flasher/boards.mk b/utils/openocd_tools/src/flasher/boards.mk
new file mode 100644
index 000000000..8c07ab1cb
--- /dev/null
+++ b/utils/openocd_tools/src/flasher/boards.mk
@@ -0,0 +1,26 @@
+# Rebuilds the flasher binaries stored in ../../gap_bins, one target per board.
+# All variants are built in the same BUILD directory, so every build is
+# preceded by a clean and the result is copied out before the next one.
+
+flasher_elf := BUILD/GAP9_V2/GCC_RISCV_FREERTOS/gap_flasher
+bins_dir := ../../gap_bins
+
+# The targets name boards, not files: declare them phony so that they always
+# run, even if a file called gap9_evk or gap9_v2 exists here.
+.PHONY: gap9_evk gap9_v2
+
+gap9_evk:
+	$(MAKE) PMSIS_OS=freertos clean platform=fpga
+	$(MAKE) PMSIS_OS=freertos platform=fpga io=host all
+	cp $(flasher_elf) $(bins_dir)/gap_flasher-gap9_evk-fpga.elf
+	$(MAKE) PMSIS_OS=freertos clean platform=board
+	$(MAKE) PMSIS_OS=freertos platform=board io=host all
+	cp $(flasher_elf) $(bins_dir)/gap_flasher-gap9_evk.elf
+	$(MAKE) PMSIS_OS=freertos clean platform=board
+	$(MAKE) MRAM=1 PMSIS_OS=freertos platform=board io=host all
+	cp $(flasher_elf) $(bins_dir)/gap_flasher-gap9_evk-mram.elf
+
+gap9_v2:
+	$(MAKE) PMSIS_OS=freertos clean platform=board
+	$(MAKE) PMSIS_OS=freertos platform=board io=host all
+	cp $(flasher_elf) $(bins_dir)/gap_flasher-gap9_v2.elf
diff --git a/utils/openocd_tools/src/flasher/gap_flasher.c b/utils/openocd_tools/src/flasher/gap_flasher.c
index 9f354942a..631439d48 100644
--- a/utils/openocd_tools/src/flasher/gap_flasher.c
+++ b/utils/openocd_tools/src/flasher/gap_flasher.c
@@ -1,4 +1,5 @@
 #include "pmsis.h"
+#include "bsp/bsp.h"
 #include "bsp/flash.h"
 #include "bsp/flash/hyperflash.h"
 #include "bsp/flash/spiflash.h"
@@ -6,12 +7,10 @@
 #define HYPER 0
 #define QSPI 1
 
-#if (FLASH_TYPE == HYPER)
-#define PRINT_FLASH_TYPE    "Hyperflash"
+#ifdef USE_MRAM
+#define FLASH_SECTOR_SIZE (1<<13) // 8 KiB
+#else
 #define FLASH_SECTOR_SIZE (1<<18) // 256 KiB
-#elif (FLASH_TYPE == QSPI)
-#define PRINT_FLASH_TYPE    "QSPIflash"
-#define FLASH_SECTOR_SIZE (1<<16) // 64 KiB
 #endif
 
 #define BUFF_SIZE (FLASH_SECTOR_SIZE)
@@ -50,20 +49,22 @@ static int test_entry(void)
     *(volatile uint32_t *)&debug_struct.buff_pointer = (uint32_t) buff;
 
     *(volatile uint32_t *)&debug_struct.gap_ready = 1;
-    printf("[Flahser]: %s flasher is ready\n", PRINT_FLASH_TYPE);
+#ifdef USE_MRAM
+    printf("[Flahser]: MRAM flasher is ready\n");
+#else
+    printf("[Flahser]: Default flasher is ready\n");
+#endif
     while((*(volatile uint32_t *)&debug_struct.flash_run) == 0)
     {
         pi_time_wait_us(1);
     }
 
-#if (FLASH_TYPE == HYPER)
-    struct pi_hyperflash_conf flash_conf;
-    pi_hyperflash_conf_init(&flash_conf);
-#elif (FLASH_TYPE == QSPI)
-    struct pi_spiflash_conf flash_conf;
-    pi_spiflash_conf_init(&flash_conf);
+#ifdef USE_MRAM
+    struct pi_mram_conf flash_conf;
+    pi_mram_conf_init(&flash_conf);
 #else
-    printf("No this type !\n");
+    struct pi_default_flash_conf flash_conf;
+    pi_default_flash_conf_init(&flash_conf);
 #endif
 
     pi_open_from_conf(&flash, &flash_conf);
diff --git a/utils/openocd_tools/src/fuser/Makefile b/utils/openocd_tools/src/fuser/Makefile
index 209505594..6a2963b33 100644
--- a/utils/openocd_tools/src/fuser/Makefile
+++ b/utils/openocd_tools/src/fuser/Makefile
@@ -1,5 +1,5 @@
 APP              = fuser
-APP_SRCS        += gap8_fuser.c
+APP_SRCS        += gap_fuser.c
 APP_INC	        +=
 
 dump ?= 0
@@ -33,6 +33,13 @@ ifeq '$(revb_hyper)' '1'
 APP_CFLAGS      += -DREVB_HYPER
 endif
 
+ifdef GAP9
+# Regenerate gap9_fuser_map.h, the efuse map header included by gap_fuser.c.
+.PHONY: gen
+gen:
+	./gap9-efuse-gen  --output=gap9_fuser_map.h --name=fuser_map
+endif
+
 
 
 
diff --git a/utils/openocd_tools/src/fuser/gap9-efuse-gen b/utils/openocd_tools/src/fuser/gap9-efuse-gen
new file mode 100755
index 000000000..ef1a4462a
--- /dev/null
+++ b/utils/openocd_tools/src/fuser/gap9-efuse-gen
@@ -0,0 +1,92 @@
+#!/usr/bin/env python3
+
+#
+# Copyright (C) 2019 GreenWaves Technologies
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import argparse
+import math
+import runner.chips.gap9_v2_efuse as efuse
+
+parser = argparse.ArgumentParser(description='Generate gap9 efuse map')
+
+parser.add_argument("--usecase", dest="usecase", default=None, help="specify the usecase")
+parser.add_argument("--output", dest="output", default=None, help="specify the output file path")
+parser.add_argument("--name", dest="name", default=None, help="specify the structure name")
+
+args = parser.parse_args()
+
+efuse_map = efuse.Efuse_map()
+
+efuse_map.get_efuse('info_1').get_field('icache_enabled').set(1)
+
+# By default, only activate the fast clock and feed other blocks like the timer at 24 MHz / 16
+fast_osc_freq_div = 24576062.0 / 16
+efuse_map.get_efuse('info_1').get_field('osc_ctrl_setup').set(1)
+efuse_map.get_efuse('info_1').get_field('osc_ctrl').set(1)
+efuse_map.get_efuse('info_1').get_field('fast_clk_div_pow2_setup').set(1)
+efuse_map.get_efuse('fast_clk_div_pow2').set(4 | (1<<3))
+efuse_map.get_efuse('info_2').get_field('wake_osc_ctrl_setup').set(1)
+efuse_map.get_efuse('info_2').get_field('wake_osc_ctrl').set(1)
+efuse_map.get_efuse('info_2').get_field('wake_fast_clk_div_pow2_setup').set(1)
+efuse_map.get_efuse('wake_fast_clk_div_pow2').set(4 | (1<<3))
+
+# Lock FLL soc and periph
+efuse_map.get_efuse('info_1').get_field('fll_global_setup').set(1)
+efuse_map.get_efuse('info_1').get_field('fll_dco0_setup').set(1)
+# FLL DRR (DCO min | DCO max)
+efuse_map.get_efuse('fll_drr').set((0 << 0) | (0x1ff << 16))
+# Pre-lock FLL CCR1 (CLK0 DIV | CLK1 DIV)
+efuse_map.get_efuse('fll_ccr1_pre_lock').set((0 << 0) | (0 << 8))
+# Post-lock FLL CCR1 (CLK0 DIV | CLK1 DIV)
+efuse_map.get_efuse('fll_ccr1_post_lock').set((0 << 0) | (3 << 8))
+# FLL CCR2 (CLK0 SEL | CLK1 SEL | CLK2_SEL | CLK3_SEL | CKG0)
+efuse_map.get_efuse('fll_ccr2').set((0x1 << 0) | (0x1 << 4) | (0x1 << 8) | (0x2 << 12) | (1 << 16))
+# DCO0 CR1 (DCO EN | CLOSE LOOP | LOOP GAIN | LOCK TOL | ITG | ASSERT CYCLES)
+efuse_map.get_efuse('fll_f0cr1').set((1 << 0) | (1 << 1) | (4 << 4) | (10 << 8) | (24 << 16) | (6 << 26))
+# DCO0 CR2 (MFI | DCO CODE)
+efuse_map.get_efuse('fll_f0cr2').set((166 << 0) | (0x1A << 16))
+
+# FLL DRR (DCO min | DCO max)
+efuse_map.get_efuse('wakeup_fll_drr').set((0 << 0) | (0x1ff << 16))
+# Pre-lock FLL CCR1 (CLK0 DIV | CLK1 DIV)
+efuse_map.get_efuse('wakeup_fll_ccr1_pre_lock').set((0 << 0) | (0 << 8))
+# Post-lock FLL CCR1 (CLK0 DIV | CLK1 DIV)
+efuse_map.get_efuse('wakeup_fll_ccr1_post_lock').set((0 << 0) | (1 << 8))
+# FLL CCR2 (CLK0 SEL | CLK1 SEL | CLK2_SEL | CLK3_SEL | CKG0)
+efuse_map.get_efuse('wakeup_fll_ccr2').set((0x1 << 0) | (0x1 << 4) | (0x1 << 8) | (0x2 << 12) | (1 << 16))
+# DCO0 CR1 (DCO EN | CLOSE LOOP | LOOP GAIN | LOCK TOL | ITG | ASSERT CYCLES)
+efuse_map.get_efuse('wakeup_fll_f0cr1').set((1 << 0) | (1 << 1) | (4 << 4) | (10 << 8) | (24 << 16) | (6 << 26))
+# DCO0 CR2 (MFI | DCO CODE)
+efuse_map.get_efuse('wakeup_fll_f0cr2').set((166 << 0) | (0x1A << 16))
+
+
+if args.usecase == 'mram':
+    efuse_map.get_efuse('info_1').get_field('bootmode').set(3)
+    efuse_map.get_efuse('info_1').get_field('mram_reset_wait').set(1)
+    efuse_map.get_efuse('info_2').get_field('wake_mram_reset_wait').set(1)
+    efuse_map.get_efuse('mram_reset_wait_cycles').set(math.ceil(0.000003*fast_osc_freq_div))
+    efuse_map.get_efuse('wake_mram_reset_wait_cycles').set(math.ceil(0.000003*fast_osc_freq_div))
+    efuse_map.get_efuse('info_2').get_field('clkdiv_setup').set(1)
+    efuse_map.get_efuse('info_2').get_field('clkdiv').set(5)
+    efuse_map.get_efuse('info_3').get_field('flash_wait').set(1)
+    efuse_map.get_efuse('flash_wait').set(math.ceil(0.00002*fast_osc_freq_div))
+
+
+
+
+if args.output is not None:
+    with open(args.output, 'w') as output_file:
+        efuse_map.gen_c_struct(args.name, output_file)
diff --git a/utils/openocd_tools/src/fuser/gap8_fuser.c b/utils/openocd_tools/src/fuser/gap_fuser.c
similarity index 90%
rename from utils/openocd_tools/src/fuser/gap8_fuser.c
rename to utils/openocd_tools/src/fuser/gap_fuser.c
index 5e0aa6b51..85ae357af 100644
--- a/utils/openocd_tools/src/fuser/gap8_fuser.c
+++ b/utils/openocd_tools/src/fuser/gap_fuser.c
@@ -5,13 +5,22 @@
 // boot mode, check the specifications to know which efuses can be used.
 
 #include <pmsis.h>
+#ifdef __GAP9__
+#include "gap9_fuser_map.h"
+#include <hal/efuse/efuse_v1.h>
+#else
 #include "fuser_map.h"
+#endif
 
 #define FUSER_REG_NUM       (128)
 
 #ifdef DUMP_REG
+#ifdef __GAP9__
+unsigned int fuser_read[FUSER_REG_NUM];
+#else
 unsigned char fuser_read[FUSER_REG_NUM];
 #endif
+#endif
 
 pi_fuser_reg_t *fuser_map_check;
 
@@ -26,7 +35,11 @@ static int entry()
 
     for (int i=0; i<(sizeof(fuser_map)/sizeof(pi_fuser_reg_t)); i++)
     {
+#ifdef __GAP9__
+        efuse_program(fuser_map[i].id, fuser_map[i].val);
+#else
         plp_efuse_writeByte(fuser_map[i].id, fuser_map[i].val);
+#endif
     }
 
     // Close the current operation once done
diff --git a/utils/openocd_tools/tcl/flash_image.tcl b/utils/openocd_tools/tcl/flash_image.tcl
index e7a88fae0..fed5fc423 100644
--- a/utils/openocd_tools/tcl/flash_image.tcl
+++ b/utils/openocd_tools/tcl/flash_image.tcl
@@ -252,18 +252,18 @@ proc gap_flash_raw_hyper {image_name image_size gap_tools_path} {
 # specific for gap9
 # will need to adapt the same way as gap builder to 
 # pass all parameters for the name
-proc gap9_flash_raw_hyper {image_name image_size gap_tools_path} {
+proc gap9_flash_raw {image_name image_size flasher_binary sector_size} {
     # flash the flasher
     puts "--------------------------"
-    puts "begining flash session (hyperflash)"
+    puts "begining flash session"
     puts "--------------------------"
     puts "load flasher to L2 memory"
     # need to pass board name as arg -- TODO: unify command name
-    load_and_start_binary ${gap_tools_path}/gap_bins/gap_flasher-gapuino9.elf 0x1c010180
+    load_and_start_binary ${flasher_binary} 0x1c010180
     sleep 100
     # flash the flash image with the flasher
     puts "Instruct flasher to begin flash per se"
-    gap_flasher_ctrl $image_name $image_size 0 0x40000 0 0x1c010090
+    gap_flasher_ctrl $image_name $image_size 0 $sector_size 0 0x1c010090
     sleep 2
     puts "--------------------------"
     puts "flasher is done!"
diff --git a/utils/plptest/bin/plpobjects.py b/utils/plptest/bin/plpobjects.py
index 591b8daee..d213f3ab9 100644
--- a/utils/plptest/bin/plpobjects.py
+++ b/utils/plptest/bin/plpobjects.py
@@ -904,6 +904,9 @@ def getNbSuccess(self):
             result += test.getNbSuccess()
         return result
 
+    def status(self):
+        return self.getNbTests() - self.getNbSuccess() - self.getNbSkipped()
+
     def getNbSkipped(self):
         result = 0
         for test in self.topTests:
diff --git a/utils/plptest/bin/plptest b/utils/plptest/bin/plptest
index b0ba0539d..ed42546fc 100755
--- a/utils/plptest/bin/plptest
+++ b/utils/plptest/bin/plptest
@@ -122,9 +122,13 @@ parser.add_argument('command', metavar='CMD', type=str, nargs='*',
 parser.add_argument("--config", dest="config_name", default=None, help="specify the system configuration name")
 
 parser.add_argument("--config-def", dest="configDef", action="append", default=None, help="Specifies json files containing configurations definition")
+parser.add_argument("--property", dest="properties", action="append", default=[], help="Specifies property")
+parser.add_argument("--tag", dest="tags", action="append", default=[], help="Specifies tag")
 parser.add_argument("--testset", dest="testset", action="append", default=None, metavar="PATH", help="Path to the testset. Default: %(default)s")
 parser.add_argument("--cmd", dest="commands", action="append", default=None, metavar="PATH", help="Add command to be executed. Default: %(default)s")
+parser.add_argument("--cmd-exclude", dest="commands_exclude", action="append", default=None, metavar="PATH", help="Add command to be excluded. Default: %(default)s")
 parser.add_argument("--gui", dest="gui", action="store_true", help="Opens user interface")
+parser.add_argument("--no-fail", dest="no_fail", action="store_true", help="Return an error if there is any test failure")
 parser.add_argument("--dry-run", dest="dry_run", action="store_true", help="Dry run")
 parser.add_argument("--threads", dest="threads", default=None, type=int, help="Specify the number of worker threads")
 parser.add_argument("--load-average", dest="load_average", default=0.9, type=float, help="Specify the system average load that this tool should try to respect, from 0 to 1")
@@ -183,11 +187,24 @@ def command_handle():
 if len(args.command) == 0 and not args.gui:
   args.command.append('run')
 
+default_properties = os.environ.get('PLPTEST_DEFAULT_PROPERTIES')
+if default_properties is not None:
+  # split() with no argument ignores repeated, leading and trailing whitespace,
+  # whereas split(' ') would produce empty strings that are not valid properties.
+  # Defaults are placed first, ahead of the properties given with --property.
+  args.properties = default_properties.split() + args.properties
+
+if os.environ.get('PMSIS_OS') is not None:
+  args.properties.append('os=%s' % os.environ.get('PMSIS_OS'))
+
+if os.environ.get('PMSIS_PLATFORM') is not None:
+  args.properties.append('platform=%s' % os.environ.get('PMSIS_PLATFORM'))
+
 runner = TestRunner(
     nbThreads=args.threads, stdout=args.stdout, safe_stdout=args.safe_stdout, maxOutputLen=args.maxOutputLen,
     maxTimeout=args.maxTimeout, worker_pool=args.worker_pool, db=args.db, average_load=args.load_average,
-    bench_csv_file=bench_csv_file, bench_regexp=args.bench_regexp, commands=args.commands, dry_run=args.dry_run,
-    server=args.gui
+    bench_csv_file=bench_csv_file, bench_regexp=args.bench_regexp, commands=args.commands, exclude_commands=args.commands_exclude, dry_run=args.dry_run,
+    server=args.gui, properties=args.properties, tags=args.tags
 )
 for testset in args.testset:
   runner.addTestset(testset)
@@ -212,3 +229,6 @@ if bench_csv_file is not None:
     csv_writer = csv.writer(file)
     for key, value in bench_csv_file.items():
       csv_writer.writerow([key] + value)
+# Exit statuses are 8 bits wide: clamp so that 256 failures cannot wrap around to 0 (success)
+if args.no_fail:
+  exit(min(runner.plpobjects.status(), 255))
\ No newline at end of file
diff --git a/utils/plptest/bin/plptest.py b/utils/plptest/bin/plptest.py
index beef2232d..69bd88aa9 100644
--- a/utils/plptest/bin/plptest.py
+++ b/utils/plptest/bin/plptest.py
@@ -18,36 +18,100 @@
 
 class Testset(object):
 
-  def __init__(self, name, files=[], tests=[], testsets=[], parent=None, restrict=None, tags=[], description=None, parallel=True, skip=None):
-    self.name = name
-    self.files = files
-    self.parent = parent
-    self.restrict = restrict
-    self.tags = tags
-    self.description = description
-    self.parallel = parallel
-    self.tests = tests
-    self.testsets = testsets
-    self.skip = skip
+    def __init__(self, name, files=None, tests=None, testsets=None, parent=None, restrict=None, tags=None, description=None, parallel=True, skip=None):
+        if testsets is None:
+            testsets = []
+        if tests is None:
+            tests = []
+        if files is None:
+            files = []
+        if tags is None:
+            tags = []
+
+        self.name = name
+        self.files = files
+        self.parent = parent
+        self.restrict = restrict
+        self.tags = tags
+        self.description = description
+        self.parallel = parallel
+        self.tests = tests
+        self.testsets = testsets
+        self.skip = skip
 
 
 
 class Test(object):
 
-    def __init__(self, name, commands=[], timeout=-1, parent=None, path=None, restrict=None, tags=[], params=[], description=None, scores=[], skip=None, testcase=None):
+    def __init__(self, name, commands=None, timeout=-1, parent=None, path=None, restrict=None, tags=None, params=None, description=None, scores=None, skip=None, testcase=None):
+
+        if tags is None:
+            tags = []
+        if params is None:
+            params = []
+        if scores is None:
+            scores = []
+        if commands is None:
+            commands = []
+
         self.name = name
         self.commands = commands
         self.timeout = timeout
         self.parent = parent
         self.path = path
         self.restrict = restrict
-        self.tags = tags
+        self.tags = tags.copy()
         self.params = params
         self.description = description
         self.scores = scores
         self.skip = skip
         self.testcase = testcase
 
+    def add_tags(self, tags):
+        self.tags += tags
+
+    def skip_test(self, message):
+        self.skip = message
+
+    def add_testcase(self, testcase):
+        self.testcase = testcase
+
+class Sdk_test(Test):
+
+    def __init__(self, name, flags='', commands=None, timeout=1000000, parent=None, path=None, restrict=None, tags=None, params=None, description=None, scores=None, skip=None, testcase=None, checker=None, gen=None, check=None):
+
+        if params is None:
+            params = []
+        if scores is None:
+            scores = []
+        if commands is None:
+            commands = []
+
+        if len(commands) == 0:
+
+          build_dir = name.replace(':', '_')
+
+          commands = [
+            Shell('clean', 'make clean %s build_dir_ext=_%s' % (flags, build_dir)),
+          ]
+
+          if gen is not None:
+            commands.append(Shell('gen', 'make %s %s build_dir_ext=_%s' % (gen, flags, build_dir)))
+
+          commands += [
+            Shell('build', 'make build image %s build_dir_ext=_%s' % (flags, build_dir)),
+            Shell('run',   'make flash_noforce run %s build_dir_ext=_%s' % (flags, build_dir))
+          ]
+
+          if check is not None:
+            commands.append(Shell('check', 'make %s %s build_dir_ext=_%s' % (check, flags, build_dir)))
+
+          if checker is not None:
+            commands.append(Check('check', checker))
+
+        super(Sdk_test, self).__init__(name=name, commands=commands, timeout=timeout, parent=parent, path=path, restrict=restrict, tags=tags,params=params, description=description, scores=scores, skip=skip, testcase=testcase)
+
+
 class Shell(object):
 
   def __init__(self, name, cmd):
@@ -105,3 +169,41 @@ def add_category(self, name):
     category = Testplan_category(name)
     self.categories.append(category)
     return category
+
+class Testconfig(object):
+
+  def __init__(self, runner):
+    self.config = {}
+    self.config['tests'] = []
+    self.config['testsets'] = []
+    self.runner = runner
+    self.tests = {}
+    self.testsets = {}
+
+  def add_test(self, test):
+    self.config['tests'].append(test)
+    self.tests[test.name] = test
+
+  def add_testset(self, testset):
+    self.config['testsets'].append(testset)
+    self.testsets[testset.name] = testset
+
+  def gen(self):
+    return self.config
+
+  def get(self, name):
+    return self.runner.get_property(name)
+
+  def get_test(self, name):
+    test = self.tests.get(name)
+    if test is None:
+        return Test('')
+    else:
+        return test
+
+  def get_tests(self):
+    return self.config['tests'] 
+
+  def add_tag(self, tag, tests):
+      for name in tests:
+          self.get_test(name).add_tags([tag])
diff --git a/utils/plptest/bin/plptest_runner.py b/utils/plptest/bin/plptest_runner.py
index a86f818f1..56e7e8c35 100644
--- a/utils/plptest/bin/plptest_runner.py
+++ b/utils/plptest/bin/plptest_runner.py
@@ -24,11 +24,14 @@
 import pickle
 import logging
 import os
+import sys
 from twisted.internet import protocol, reactor, endpoints
 from plptest_utils import *
 import plpobjects
 import imp
 import plptest_condor
+import plptest
+import sys
 try:
   import psutil
 except:
@@ -64,7 +67,7 @@ def parse_testset(top_testset, testset, topParent, runner, path, rootdir):
   if testset.parent != None: parent = testset.struct
   else: parent = topParent
 
-  testset.struct = Testset(runner, testset.name, path, parent)
+  testset.struct = Testset(runner, testset.name, path, parent, testset)
   if top_testset is not None:
     top_testset.append(testset.struct)
 
@@ -72,12 +75,8 @@ def parse_testset(top_testset, testset, topParent, runner, path, rootdir):
   testset.struct.set_parallel(testset.parallel)
   testset.struct.skip = testset.skip
 
-
   for file in testset.files:
-    if file.find('.ini') != -1:
-      IniParser(runner, os.path.join(rootdir, file)).parse(testset.struct)
-    else:
-      CfgParser(runner, os.path.join(rootdir, file)).parse(testset.struct)
+    CfgParser(runner, os.path.join(rootdir, file)).parse(testset.struct)
 
   for sub_testset in testset.testsets:
     parse_testset(None, sub_testset, testset.struct, runner, path, rootdir)
@@ -94,7 +93,7 @@ def parse_test(top_testset, test, topParent, runner, path):
   else: parent = topParent
 
   logging.debug("Adding test (name: %s)" % (test.name))
-  test.struct = Test(runner, test.name, path, parent)
+  test.struct = Test(runner, test.name, path, parent, test)
   if top_testset is not None:
     top_testset.append(test.struct)
 
@@ -110,9 +109,6 @@ def parse_test(top_testset, test, topParent, runner, path):
   if test.skip is None and topParent is not None:
     test.struct.skip = topParent.get_skip()
 
-  for tag in test.tags:
-    test.struct.addTag(tag)
-
   test.struct.setTimeout(int(test.timeout))
 
   for param in test.params:
@@ -132,14 +128,21 @@ def parse(self, topParent=None):
     logging.debug("Starting parsing file (path: %s)" % (self.file))
 
     try:
-      module = imp.load_source('test', self.file)
+      module = imp.load_source(self.file, self.file)
     except:
       raise Exception(bcolors.FAIL + 'Unable to open test configuration file: ' + self.file + bcolors.ENDC)
 
-    try:
-      self.config.update(module.TestConfig)
-    except:
-      raise Exception(bcolors.FAIL + 'Project configuration must define the TestConfig variable: ' + self.file + bcolors.ENDC)
+    if module.__dict__.get('get_tests') is None:
+
+      try:
+        self.config.update(module.TestConfig)
+      except:
+          raise Exception(bcolors.FAIL + 'Project configuration must define the TestConfig variable: ' + self.file + bcolors.ENDC)
+
+    else:
+        testconfig = plptest.Testconfig(self.runner)
+        module.get_tests(testconfig)
+        self.config.update(testconfig.gen())
 
     top_testset = []
     result = top_testset
@@ -153,7 +156,6 @@ def parse(self, topParent=None):
       topParent = Testset(self.runner, 'top', 'top')
       result = [topParent]
 
-
     if testsets != None:
       for testset in testsets:
         parent = parse_testset(top_testset, testset, topParent, self.runner, self.path, os.path.dirname(self.file))
@@ -162,10 +164,7 @@ def parse(self, topParent=None):
     if files is not None:
         for file in files:
             file_path = os.path.join(os.path.dirname(self.file), file)
-            if file.find('.ini') != -1:
-                testset = IniParser(self.runner, file_path)
-            else:
-                testset = CfgParser(self.runner, file_path)
+            testset = CfgParser(self.runner, file_path)
             testset.parse(top_testset.struct)
 
     tests = self.config.get('tests')
@@ -177,125 +176,6 @@ def parse(self, topParent=None):
     return result
 
 
-class IniParser(object):
-
-  def __init__(self, runner, file):
-    self.file = file
-    self.runner = runner
-    self.path = os.path.dirname(self.file)
-
-  def getOptions(self, section):
-    result = []
-    fullDict = {}
-    #fullDict = dict(list(runConfig.getAll().items()) + list(self.userConf.items()))
-    #fullDict['config'] = runConfig.getDeprecatedString()
-    #if runConfig.get('flag') != None:
-    #  fullDict['flags'] = ' '.join(runConfig.get('flag'))
-    #else:
-    #  fullDict['flags'] = ''
-    for option in self.parser.options(section):
-      result.append([option, getOptionValue(self.parser.get(section, option))])
-        #, vars=fullDict))])
-    return result
-
-  def parse(self, topParent=None):
-
-    logging.debug("Starting parsing file (path: %s)" % (self.file))
-
-    config = configparser.SafeConfigParser(dict_type=collections.OrderedDict)
-    self.parser = config
-    config.optionxform = str
-    openedPaths = config.read(self.file)
-
-    if len(openedPaths) == 0:
-        logging.warning("Didn't manage to open file: %s" % (self.file))
-
-    testsets = {}
-    topTestset = None
-
-    for section in config.sections():
-
-      parent = None
-
-      sectionList = section.split(':')
-      if len(sectionList) < 2: raise Exception("Invalid section, must contains t least 2 items: [type:name]")
-      sectionType = sectionList[0]
-      if len(sectionList) > 2:
-        parentName = ':'.join(sectionList[1:len(sectionList)-1])
-        parent = testsets.get(parentName)
-      if parent == None: parent = topParent
-      name = sectionList[len(sectionList)-1]
-
-      if sectionType == 'testset':
-
-        testset = Testset(self.runner, name, self.path, parent)
-        testsets[name] = testset
-        if topTestset == None: topTestset = testset
-
-        for item in config.items(section):
-          if item[0] == 'files':
-            pass
-            files = getOptionValue(item[1]).split()
-            for childFile in files:
-              if childFile.find('.ini') != -1:
-                IniParser(self.runner, os.path.join(os.path.dirname(self.file), childFile)).parse(testset)
-              else:
-                CfgParser(self.runner, os.path.join(os.path.dirname(self.file), childFile)).parse(testset)
-          if item[0] == 'configs':
-            for conf in getOptionValue(item[1]).split():
-              testset.addConfigConstraint(pulpconfig.Configuration(useRegExp=True, name=conf))
-            #print (bcolors.FAIL + 'Caught an error while parsing test description file: ' + file + bcolors.ENDC)
-            #raise
-          else:
-            pass
-            #testset.addUserConfig(item[0], getOptionValue(item[1]))
-
-      elif sectionType == 'run':
-        # Deprecated type, just here for compatibility
-        pass
-
-      elif sectionType == 'test':
-        test = Test(self.runner, name, self.path, parent)
-        if topTestset == None: topTestset = test
-
-        for option, value in config.items(section, raw=True):
-          if option.find('command.') == 0:
-            test.addCommand([option.split('.')[1], value])
-          elif option == 'dir':
-            test.setDir(value)
-          elif option == 'configs':
-            for conf in value.split():
-              test.addConfigConstraint(conf)
-          elif option == 'tags':
-            for tag in value.split():
-              test.addTag(tag)
-          elif option == 'timeout':
-            test.setTimeout(int(value))
-          elif option == 'check':
-            for checker in value.split():
-              test.addChecker(checker)
-          elif option == 'parameters':
-            for param in value.split():
-              test.addParam(param)
-          elif option.find('probe') == 0:
-            pass
-            #probeName = option.split('[')[1].split(']')[0]
-            #if probes.get(probeName) == None:
-            #    probes[probeName] = Probe(probeName, testsuiteName, moduleName, testName)
-            #probeOption = option.split('[')[1].split(']')[1].split('.')[1]
-            #probes[probeName].setProp(probeOption, value)
-          else:
-            raise BaseException("Unknown item %s in test %s" % (option, self.name))
-
-      else:
-        raise Exception("Invalid section type: " + sectionType)
-
-    return topTestset
-
-
-
-
-
 
 class UiHandler(protocol.Protocol):
     def __init__(self, configs, tests):
@@ -347,7 +227,8 @@ def __init__(
         self, nbThreads=1, server=False, stdout=False,
         maxOutputLen=-1, maxTimeout=-1, worker_pool=None,
         db=False, pobjs=None, build=None, average_load=None, safe_stdout=False, home=None,
-        bench_csv_file=None, bench_regexp=None, commands=None, dry_run=False):
+        bench_csv_file=None, bench_regexp=None, commands=None, dry_run=False,
+        exclude_commands=None, properties=[], tags=[]):
 
         global test_runner
 
@@ -374,9 +255,11 @@ def __init__(
         self.bench_regexp = bench_regexp
         self.bench_csv_file = bench_csv_file
         self.commands = commands
+        self.exclude_commands = exclude_commands
         self.cpu_load_checker_call_id = None
         self.dry_run = dry_run
         self.testplan = None
+        self.tags = tags
 
         test_runner = self
 
@@ -386,6 +269,14 @@ def __init__(
         if worker_pool == 'condor':
             self.worker_pool = plptest_condor.Condor_pool()
 
+        self.properties = {}
+        for prop in properties:
+          name, value = prop.split('=', 1)  # split on the first '=' only, so values may contain '='
+          self.properties[name] = value
+
+    def get_property(self, name):
+      return self.properties.get(name)
+
     def testcase_result(self, testcase, status, test_name):
         if self.testplan is not None:
             return self.testplan.testcase_result(testcase, status, test_name)
@@ -418,10 +309,7 @@ def stop(self):
             reactor.stop()
 
     def addTestset(self, testset):
-        if testset.find('.ini') != -1:
-            self.tests += IniParser(self, testset).parse()
-        else:
-            self.tests += CfgParser(self, testset).parse()
+        self.tests += CfgParser(self, testset).parse()
 
         for test in self.tests:
             len = test.getMaxTestNameLen()
@@ -480,7 +368,7 @@ def testEnd(self, testrun):
       else:
         testStr = bcolors.FAIL + 'KO: '.ljust(6) + bcolors.ENDC
       print (testStr + bcolors.BOLD + testrun.test.getFullName().ljust(self.maxTestNameLen + 5) + bcolors.ENDC + ' %s' % (testrun.config))
-
+      sys.stdout.flush()
 
       test = self.plpobjects.getTest(testrun.test.getFullName())
       testResult = plpobjects.TestRun(self.plpobjects, test, testrun.status, testrun.duration, testrun.config, testrun.log, build=self.build, skip=testrun.skip)
@@ -518,8 +406,9 @@ def run(self, testrun):
 
       if testrun.skip is None:
         print (bcolors.OKBLUE + 'START'.ljust(6) + bcolors.ENDC + bcolors.BOLD + testrun.test.getFullName().ljust(self.maxTestNameLen + 5) + bcolors.ENDC + ' %s' % (testrun.config))
+        sys.stdout.flush()
 
-      testrun.run(reactor, self.testEnd, self.commands, self.dry_run, testrun)
+      testrun.run(reactor, self.testEnd, self.commands, self.exclude_commands, self.dry_run, testrun)
 
     def check_cpu_load(self):
       if len(self.runnings) >= self.nbThreads:
diff --git a/utils/plptest/bin/plptest_utils.py b/utils/plptest/bin/plptest_utils.py
index f13657146..93457bace 100644
--- a/utils/plptest/bin/plptest_utils.py
+++ b/utils/plptest/bin/plptest_utils.py
@@ -208,7 +208,7 @@ def dump(self):
 
 
 class TestCommon(object):
-    def __init__(self, runner, name, path, parent):
+    def __init__(self, runner, name, path, parent, user, is_testset=False):
         self.name = name
         self.parent = parent
         self.runner = runner
@@ -224,6 +224,8 @@ def __init__(self, runner, name, path, parent):
         self.addedConfigs = []
         self.restrict = None
         self.skip = None
+        self.user = user
+        self.is_testset = is_testset
 
     def get_skip(self):
         if self.skip is not None:
@@ -235,6 +237,12 @@ def get_skip(self):
         return None
 
     def checkConfig(self, config):
+
+        if not self.is_testset and len(self.runner.tags) != 0 and self.user is not None:
+            self.isActive = len(set(self.runner.tags).intersection(self.user.tags)) != 0
+            if not self.isActive:
+                return
+
         try:
             self.activeForConfig[config.__str__()] = \
                 self.restrict is None or \
@@ -345,8 +353,8 @@ def getMaxTestNameLen(self):
 
 class Testset(TestCommon):
 
-    def __init__(self, runner, name, path, parent=None):
-        super(Testset, self).__init__(runner, name, path, parent)
+    def __init__(self, runner, name, path, parent=None, user=None):
+        super(Testset, self).__init__(runner, name, path, parent, user, is_testset=True)
         if parent is not None:
             parent.regChild(self)
         self.childs = []
@@ -413,11 +421,10 @@ def run(self, config):
 
 class Test(TestCommon):
 
-    def __init__(self, runner, name, path, parent=None):
-        super(Test, self).__init__(runner, name, path, parent)
+    def __init__(self, runner, name, path, parent=None, user=None):
+        super(Test, self).__init__(runner, name, path, parent, user)
         self.childs = []
         self.commands = []
-        self.tags = []
         self.dir = None
         self.timeout = -1
         self.checkers = []
@@ -525,9 +532,6 @@ def addCommand(self, command):
     def addParam(self, param):
         self.params.append(param)
 
-    def addTag(self, tag):
-        self.tags.append(tag)
-
     def addChecker(self, checker):
         self.checkers.append(checker)
 
@@ -796,7 +800,7 @@ def runCommand(self):
 
             self.handle_cmd_end()
             
-    def run(self, reactor, callback=None, commands=None, dry_run=False, *kargs, **kwargs):
+    def run(self, reactor, callback=None, commands=None, exclude_commands=None, dry_run=False, *kargs, **kwargs):
 
         self.dry_run = dry_run
         self.callback = callback
@@ -813,13 +817,12 @@ def run(self, reactor, callback=None, commands=None, dry_run=False, *kargs, **kw
 
         else:
 
-            if commands is None:
-                self.commands = self.test.commands.copy()
-            else:
-                self.commands = []
-                for test_command in self.test.commands:
-                    if test_command.name in commands:
-                        self.commands.append(test_command)
+            self.commands = []
+
+            for test_command in self.test.commands:
+
+                if (commands is None or test_command.name in commands) and (exclude_commands is None or not test_command.name in exclude_commands):
+                    self.commands.append(test_command)
 
             self.appendOutput('Running: ' + self.test.getFullName() + ' / ' +
                             self.config.get_config_name() + '\n')
diff --git a/utils/rules/pmsis_defs.mk b/utils/rules/pmsis_defs.mk
new file mode 100644
index 000000000..03127b47f
--- /dev/null
+++ b/utils/rules/pmsis_defs.mk
@@ -0,0 +1,32 @@
+BOARD_NAME ?= gapuino
+# Default OS: freertos when TARGET_CHIP_FAMILY is GAP9, pulpos otherwise ('?=' keeps a value that is already set).
+ifeq ($(TARGET_CHIP_FAMILY), GAP9)
+PMSIS_OS ?= freertos
+else
+PMSIS_OS ?=pulpos
+endif				# TARGET_CHIP_FAMILY
+
+ifndef platform
+ifdef PMSIS_PLATFORM
+platform = $(PMSIS_PLATFORM)
+else
+platform = board
+endif				# PMSIS_PLATFORM
+endif				# platform
+
+SHELL=bash
+ECHO_GREEN = $(shell tput setaf 2)
+ECHO_BOLD = $(shell tput bold)
+ECHO_CLEAR = $(shell tput sgr0)
+
+ifeq '$(PMSIS_OS)' 'pulpos'
+ifeq '$(TARGET_CHIP)' 'GAP9_V2'
+export USE_PULPOS=1
+endif				# TARGET_CHIP
+endif				# PMSIS_OS
+
+# Directory containing built objects: $(CURDIR)/BUILD/<TARGET_CHIP>/GCC_RISCV_<PMSIS_OS upper-cased via tr><build_dir_ext>
+PMSIS_OS_UPPERCASE  = $(shell echo $(PMSIS_OS) | tr a-z A-Z)
+BUILDDIR            = $(CURDIR)/BUILD/$(TARGET_CHIP)/GCC_RISCV_$(PMSIS_OS_UPPERCASE)$(build_dir_ext)
+TARGET_BUILD_DIR    = $(BUILDDIR)
+#$(info ## BUILDDIR : $(BUILDDIR))
diff --git a/utils/rules/pmsis_rules.mk b/utils/rules/pmsis_rules.mk
index 73c6c59be..fb268e905 100644
--- a/utils/rules/pmsis_rules.mk
+++ b/utils/rules/pmsis_rules.mk
@@ -1,43 +1,4 @@
-BOARD_NAME ?= gapuino
-
-ifeq ($(TARGET_CHIP_FAMILY), GAP9)
-PMSIS_OS ?= freertos
-else
-PMSIS_OS ?=pulpos
-endif				# TARGET_CHIP_FAMILY
-
-ifndef platform
-ifdef PMSIS_PLATFORM
-platform = $(PMSIS_PLATFORM)
-else
-platform = board
-endif				# platform
-endif				# PMSIS_PLATFORM
-
-SHELL=bash
-ECHO_GREEN = $(shell tput setaf 2)
-ECHO_BOLD = $(shell tput bold)
-ECHO_CLEAR = $(shell tput sgr0)
-
-help:
-	@echo "=================== ${ECHO_BOLD}${ECHO_GREEN}GAP SDK Application${ECHO_CLEAR} ==================="
-	@echo ""
-	@echo "Main targets:"
-	@echo " - ${ECHO_BOLD}clean${ECHO_CLEAR} : clean the application"
-	@echo " - ${ECHO_BOLD}all${ECHO_CLEAR}   : build the application"
-	@echo " - ${ECHO_BOLD}run${ECHO_CLEAR}   : run the application"
-	@echo ""
-	@echo "Common options:"
-	@echo " - ${ECHO_BOLD}platform=<value>${ECHO_CLEAR} : select the platform (gvsoc or board)"
-	@echo " - ${ECHO_BOLD}PMSIS_OS=<value>${ECHO_CLEAR} : select the OS (freertos or pulpos)"
-	@echo ""
-	@echo "For more information, please refer to the SDK documentation."
-
-ifeq '$(PMSIS_OS)' 'pulpos'
-ifeq '$(TARGET_CHIP)' 'GAP9_V2'
-export USE_PULPOS=1
-endif
-endif
+include $(GAP_SDK_HOME)/utils/rules/pmsis_defs.mk
 
 APP_INC += $(TILER_EMU_INC)
 
@@ -64,6 +25,9 @@ PLPBRIDGE_EXTRA_FLAGS        += -ftdi
 else ifeq ($(BOARD_NAME), ai_deck)
 COMMON_CFLAGS          += -DCONFIG_AI_DECK
 
+else ifeq ($(BOARD_NAME), gap9_evk)
+COMMON_CFLAGS          += -DCONFIG_GAP9_EVK
+
 else ifeq ($(BOARD_NAME), vega)
 COMMON_CFLAGS          += -DCONFIG_VEGA -mvega -Wa,-mwinsn
 
@@ -76,6 +40,10 @@ ifdef RUNNER_CONFIG
 override config_args += --config-ini=$(RUNNER_CONFIG)
 endif
 
+ifeq ($(platform), gvsoc)
+CONFIG_BOOT_MODE ?= flash
+endif
+
 # Enable traces with GVSOC
 ifeq ($(platform), gvsoc)
 ifeq ($(ENABLE_CORE_TRACES), 1)
@@ -137,6 +105,10 @@ override APP_CFLAGS += -DCONFIG_SLOW_OSC
 override config_args += --config-opt=**/runner/efuses/content/info_2/wake_osc_ctrl=3
 endif				# CONFIG_SLOW_OSC
 
+ifdef CONFIG_BOOT_DEVICE
+CONFIG_BOOT_MODE=flash
+endif				# CONFIG_BOOT_DEVICE (tested before its '?=' hyperflash default is applied)
+
 
 CONFIG_BOOT_DEVICE           ?= hyperflash
 
@@ -158,6 +130,8 @@ override config_args += --config-opt=**/runner/boot/device=target/chip/soc/mram
 endif
 endif
 
+
+
 ifdef CONFIG_BOOT_MODE
 override config_args += --config-opt=**/runner/boot/mode=$(CONFIG_BOOT_MODE)
 
@@ -193,6 +167,17 @@ override config_args += --config-opt=**/gvsoc/debug-mode=true
 endif
 
 
+ifdef CONFIG_AUDIO_FRAMEWORK
+# Optional include ('-' tolerates a missing file); presumably defines TC_GENERATED_SRCS / TC_COMPONENT_SRCS -- TODO confirm.
+-include $(CONFIG_AUDIO_FRAMEWORK_BUILDDIR)/tc_sources.mk
+APP_SRCS += $(addprefix $(GAP_SDK_HOME)/tools/audio-framework/runtime/src/,framework.c framework_ffc.c)
+APP_SRCS += $(addprefix $(CONFIG_AUDIO_FRAMEWORK_BUILDDIR)/,$(TC_GENERATED_SRCS))
+APP_SRCS += $(addprefix $(GAP_SDK_HOME)/tools/audio-framework/components/,$(TC_COMPONENT_SRCS))
+override APP_INC += $(addprefix $(GAP_SDK_HOME)/tools/audio-framework/,runtime/include components) \
+	$(CONFIG_AUDIO_FRAMEWORK_BUILDDIR)
+endif
+
+
 ifdef CONFIG_TESTBENCH
 # Set of models connected to the RTL platform, capable of generating stimuli.
 # For example, on I2S it allows to generate samples, or capture them on I2S interfaces.
@@ -205,7 +190,9 @@ override config_args += --config-opt=$(CONFIG_BOARD_PATH)/addon_testbench_enable
 CONFIG_TESTBENCH_UART_ID ?= 1
 CONFIG_TESTBENCH_UART_BAUDRATE ?= 20000000
 
-APP_SRCS           += $(GAP_LIB_PATH)/testbench/testbench.c $(GAP_LIB_PATH)/testbench/testlib.c
+APP_SRCS           += $(GAP_LIB_PATH)/testbench/testbench.c $(GAP_LIB_PATH)/testbench/testlib.c \
+	$(GAP_LIB_PATH)/testbench/testlib_hyper.c $(GAP_LIB_PATH)/testbench/testlib_i2s.c \
+	$(GAP_LIB_PATH)/testbench/testlib_uart.c $(GAP_LIB_PATH)/testbench/testlib_i2c.c
 APP_INC            += $(GAP_LIB_PATH)/testbench
 override APP_CFLAGS         += -DCONFIG_TESTBENCH_UART_ID=$(CONFIG_TESTBENCH_UART_ID) \
                       -DCONFIG_TESTBENCH_UART_BAUDRATE=$(CONFIG_TESTBENCH_UART_BAUDRATE)
@@ -220,7 +207,6 @@ ifdef CONFIG_IO_UART_BAUDRATE
 override config_args += --config-opt=**/rtl/testbench/uart/baudrate=$(CONFIG_IO_UART_BAUDRATE)
 endif				# CONFIG_IO_UART_BAUDRATE
 
-
 # FS config
 READFS_FLASH ?= flash
 
@@ -265,17 +251,25 @@ GAPY_TARGET_OPT = --target=$(GAPY_TARGET)
 endif
 
 
-# Directory containing built objects
-PMSIS_OS_UPPERCASE  = $(shell echo $(PMSIS_OS) | tr a-z A-Z)
-BUILDDIR            = $(CURDIR)/BUILD/$(TARGET_CHIP)/GCC_RISCV_$(PMSIS_OS_UPPERCASE)$(build_dir_ext)
-TARGET_BUILD_DIR    = $(BUILDDIR)
-#$(info ## BUILDDIR : $(BUILDDIR))
-
 #$(info ## App sources : $(APP_SRCS))
 #$(info ## App includes : $(APP_INC))
 #$(info ## App cflags : $(APP_CFLAGS))
 #$(info ## App ldflags : $(APP_LDFLAGS))
 
+help:
+	@echo "=================== ${ECHO_BOLD}${ECHO_GREEN}GAP SDK Application${ECHO_CLEAR} ==================="
+	@echo ""
+	@echo "Main targets:"
+	@echo " - ${ECHO_BOLD}clean${ECHO_CLEAR} : clean the application"
+	@echo " - ${ECHO_BOLD}all${ECHO_CLEAR}   : build the application"
+	@echo " - ${ECHO_BOLD}run${ECHO_CLEAR}   : run the application"
+	@echo ""
+	@echo "Common options:"
+	@echo " - ${ECHO_BOLD}platform=<value>${ECHO_CLEAR} : select the platform (gvsoc or board)"
+	@echo " - ${ECHO_BOLD}PMSIS_OS=<value>${ECHO_CLEAR} : select the OS (freertos or pulpos)"
+	@echo ""
+	@echo "For more information, please refer to the SDK documentation."
+
 ifeq '$(PMSIS_OS)' 'freertos'
 
 # Select PMSIS drivers
@@ -283,8 +277,17 @@ PMSIS_BSP_DIR            = $(GAP_SDK_HOME)/rtos/pmsis/pmsis_bsp
 
 ifeq ($(CUSTOM_BSP),)
 include $(PMSIS_BSP_DIR)/rules/freertos_bsp_rules.mk
+override APP_CFLAGS += $(PMSIS_BSP_CFLAGS)
 endif				# CUSTOM_BSP
 
+ifdef CONFIG_IO_UART_BAUDRATE
+override APP_CFLAGS += -DCONFIG_IO_UART_BAUDRATE=$(CONFIG_IO_UART_BAUDRATE)
+endif				# CONFIG_IO_UART_BAUDRATE
+
+ifdef CONFIG_IO_UART_ITF
+override APP_CFLAGS += -DCONFIG_IO_UART_ITF=$(CONFIG_IO_UART_ITF)
+endif				# CONFIG_IO_UART_ITF
+
 # Special flag for FreeRTOS to use semihosting.
 ifeq ($(FS_TYPE), host)
 io = host
@@ -420,5 +423,9 @@ profiler:
 	cd $(BUILDDIR) && if [ -e all.bin ]; then rm all.bin; fi; mkfifo all.bin
 	cd $(BUILDDIR) && export PULP_CONFIG_FILE=$(BUILDDIR)/gvsoc_config.json && profiler $(BUILDDIR) $(BIN) gvsoc_config.json --signal-tree-file=$(PROFILER_SIGNAL_TREE)
 
+profile:
+	gapy $(GAPY_TARGET_OPT) --platform=$(platform) --work-dir=$(BUILDDIR) $(config_args) $(gapy_args) --config-opt="gvsoc/debug-mode=true" --config-opt="gvsoc/events/gen_gtkw=false" run --image --flash --exec-prepare --binary=$(BIN) $(runner_args)
+	cd $(BUILDDIR) && profiler gvsoc_config.json --signal-tree-file=$(GAP_SDK_HOME)/tools/profiler_v2/gui/images/signalstree.txt
+
 size:
 	$(GAP_SDK_HOME)/utils/bin/binary-size --binary=$(BIN) --depth=10 --groups=$(PMSIS_OS)
diff --git a/utils/rules/pulp_rules.mk b/utils/rules/pulp_rules.mk
index 10d3fce53..8f81fcb34 100644
--- a/utils/rules/pulp_rules.mk
+++ b/utils/rules/pulp_rules.mk
@@ -237,17 +237,19 @@ ifneq ($(wsl),)
 WSL_ENV="--wsl=$(wsl)"
 endif
 
+flash_noforce:
+
 flash:
-	gapy --target=$(GAPY_TARGET) --platform=$(platform) --work-dir=$(BUILDDIR) $(config_args) $(gapy_args) run --flash --force --binary=$(BIN) $(runner_args) 
+	gapy --target=$(GAPY_TARGET) --platform=$(platform) --work-dir=$(BUILDDIR) $(config_args) $(gapy_args) run --flash --force --binary=$(BIN) $(runner_args) $(WSL_ENV)
 
 flash_fs:
-	gapy --target=$(GAPY_TARGET) --platform=$(platform) --work-dir=$(BUILDDIR) $(config_args) $(gapy_args) run --flash --binary=$(BIN) $(runner_args)
+	gapy --target=$(GAPY_TARGET) --platform=$(platform) --work-dir=$(BUILDDIR) $(config_args) $(gapy_args) run --flash --binary=$(BIN) $(runner_args) $(WSL_ENV)
 
 image:
-	gapy --target=$(GAPY_TARGET) --platform=$(platform) --work-dir=$(BUILDDIR) $(config_args) $(gapy_args) run --image --binary=$(BIN) $(runner_args)
+	gapy --target=$(GAPY_TARGET) --platform=$(platform) --work-dir=$(BUILDDIR) $(config_args) $(gapy_args) run --image --binary=$(BIN) $(runner_args) $(WSL_ENV)
 
 run.prepare:
-	gapy --target=$(GAPY_TARGET) --platform=$(platform) --work-dir=$(BUILDDIR) $(config_args) $(gapy_args) run --exec-prepare --binary=$(BIN) $(runner_args)
+	gapy --target=$(GAPY_TARGET) --platform=$(platform) --work-dir=$(BUILDDIR) $(config_args) $(gapy_args) run --exec-prepare --binary=$(BIN) $(runner_args) $(WSL_ENV)
 
 run.exec:
 	gapy --target=$(GAPY_TARGET) --platform=$(platform) --work-dir=$(BUILDDIR) $(config_args) $(gapy_args) run --exec --binary=$(BIN) $(runner_args) $(WSL_ENV)