Commit 533ee30
SHL: version 2.9.0
chenf committed Jan 23, 2024
1 parent eb19642 commit 533ee30
Showing 196 changed files with 14,834 additions and 1,775 deletions.
1 change: 0 additions & 1 deletion .gitignore
@@ -17,7 +17,6 @@ example/*.elf
openvx_build
e907_build
rvv_build
-rvv_nodot_build
rvm_build
c906_static_build
c906_so_build
17 changes: 3 additions & 14 deletions CMakeLists.txt
@@ -17,6 +17,7 @@ if (CONFIG_CUSTOM_SOURCE_SELECT)
else()
set(CONFIG_USE_SHL_DEBUG ON)
set(CONFIG_SHL_LAYER_BENCHMARK ON)
+set(CONFIG_SHL_TRACE ON)
endif()

file (STRINGS "version" SHL_VERSION)
@@ -47,18 +48,6 @@ if(CONFIG_BUILD_RISCV_RVV)
install(TARGETS rvv_static DESTINATION lib)
endif()

-if(CONFIG_BUILD_RISCV_RVV_NODOT)
-# build rvv a without xtheadvdot extension
-include(cmake/rules.cmake)
-LIST(APPEND RVV_LST ${NN2_SRCS} ${REF_SRCS} ${GREF_SRCS} ${THEAD_RVV_SRCS})
-add_library(rvv_static STATIC ${RVV_LST})
-SET_TARGET_PROPERTIES(rvv_static PROPERTIES OUTPUT_NAME "shl_rvv_nodot")
-set(RVV_BUILD_FLAGS -ffp-contract=off -march=rv64gcv_zfh_xtheadc -mabi=lp64d -DSHL_BUILD_RVV -DSHL_BUILD_REF -DSHL_BUILD_GREF)
-target_compile_options(rvv_static PRIVATE ${RVV_BUILD_FLAGS})
-
-install(TARGETS rvv_static DESTINATION lib)
-endif()
-
if(CONFIG_BUILD_RISCV_C906)
# build c906 lib
set(CONFIG_GRAPH_REFERENCE_TVMGEN ON)
@@ -102,8 +91,8 @@ if(CONFIG_BUILD_RISCV_C920)

set(SHL_LIB_TARGET "c920_lib")
set(SHL_LIB_NAME shl_c920)
-LIST(APPEND SHL_BUILD_SRC_LST ${NN2_SRCS} ${REF_SRCS} ${GREF_SRCS} ${THEAD_RVV_SRCS} ${C920_SRCS})
-set(SHL_BUILD_C_FLAGS -ffp-contract=off -march=rv64gcv0p7_zfh_xtheadc -mabi=lp64d -DSHL_BUILD_C920 -DSHL_BUILD_REF -DSHL_BUILD_GREF -DSHL_BUILD_RVV)
+LIST(APPEND SHL_BUILD_SRC_LST ${NN2_SRCS} ${REF_SRCS} ${GREF_SRCS} ${THEAD_RVV_SRCS} ${C920_SRCS} ${LLM_SRCS})
+set(SHL_BUILD_C_FLAGS -ffp-contract=off -march=rv64gcv0p7_zfh_xtheadc -mabi=lp64d -DSHL_BUILD_C920 -DSHL_BUILD_REF -DSHL_BUILD_GREF -DSHL_BUILD_RVV -fopenmp)
include(cmake/target_build.cmake)
target_include_directories(${SHL_LIB_TARGET} PRIVATE module/dlpack/include/)
endif()
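Note: the C920 target now pulls in ${LLM_SRCS} and compiles with -fopenmp. As a point of reference, a minimal C sketch of the loop-level parallelism that flag enables (illustrative only; not code from this commit — without -fopenmp the pragma is simply ignored):

/* Illustrative only: -fopenmp lets a kernel split independent rows
 * across cores with a single pragma. */
void scale_rows(float *dst, const float *src, float alpha, int M, int N)
{
#pragma omp parallel for
    for (int i = 0; i < M; i++) {
        for (int j = 0; j < N; j++) {
            dst[i * N + j] = alpha * src[i * N + j];
        }
    }
}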
3 changes: 0 additions & 3 deletions Makefile
@@ -10,9 +10,6 @@ nn2_e907_elf:
nn2_rvv:
mkdir -p rvv_build; cd rvv_build; cmake ../ -DCONFIG_BUILD_RISCV_RVV=ON -DCMAKE_BUILD_TYPE=${BUILD_TYPE} -DCMAKE_INSTALL_PREFIX=${INSTALL_DIR}/rvv/; make -j${USE_CORE}; make install; cd -

-nn2_rvv_nodot:
-	mkdir -p rvv_nodot_build; cd rvv_nodot_build; cmake ../ -DCONFIG_BUILD_RISCV_RVV_NODOT=ON -DCMAKE_BUILD_TYPE=${BUILD_TYPE} -DCMAKE_INSTALL_PREFIX=${INSTALL_DIR}/rvv_nodot/; make -j${USE_CORE}; make install; cd -
-
nn2_c906:
mkdir -p c906_static_build; cd c906_static_build; cmake ../ -DCONFIG_BUILD_RISCV_C906=ON -DCONFIG_SHL_BUILD_STATIC=ON -DCMAKE_BUILD_TYPE=${BUILD_TYPE} -DCMAKE_INSTALL_PREFIX=${INSTALL_DIR}/c906/; make -j${USE_CORE}; make install; cd -

9 changes: 9 additions & 0 deletions cmake/c906_elf.cmake
@@ -359,9 +359,12 @@ set(CONFIG_THEAD_RVV_DECONVOLUTION_FP16 ON)
set(CONFIG_THEAD_RVV_DIV_FP32 ON)
set(CONFIG_THEAD_RVV_DIV_FP16 ON)
set(CONFIG_THEAD_RVV_DIV_INT8 ON)
+set(CONFIG_THEAD_RVV_EMBEDDING_INT32 ON)
set(CONFIG_THEAD_RVV_ERF_FP32 ON)
set(CONFIG_THEAD_RVV_ERF_FP16 ON)
set(CONFIG_THEAD_RVV_ERF_INT8 ON)
+set(CONFIG_THEAD_RVV_EXPAND_DIMS_FP32 ON)
+set(CONFIG_THEAD_RVV_EXPAND_DIMS_FP16 ON)
set(CONFIG_THEAD_RVV_FULLYCONNECTED_FP32 ON)
set(CONFIG_THEAD_RVV_FULLYCONNECTED_FP16 ON)
set(CONFIG_THEAD_RVV_FULLYCONNECTED_INT8 ON)
@@ -383,6 +386,7 @@ set(CONFIG_THEAD_RVV_LAYER_NORM_INT8 ON)
set(CONFIG_THEAD_RVV_LEAKY_RELU_FP32 ON)
set(CONFIG_THEAD_RVV_LEAKY_RELU_FP16 ON)
set(CONFIG_THEAD_RVV_LEAKY_RELU_INT8 ON)
+set(CONFIG_THEAD_RVV_LLM_POS_FP16 ON)
set(CONFIG_THEAD_RVV_MATMUL_FP32 ON)
set(CONFIG_THEAD_RVV_MATMUL_FP16 ON)
set(CONFIG_THEAD_RVV_MATMUL_INT8 ON)
@@ -411,6 +415,10 @@ set(CONFIG_THEAD_RVV_RESHAPE_INT8 ON)
set(CONFIG_THEAD_RVV_RMS_NORM_FP32 ON)
set(CONFIG_THEAD_RVV_RMS_NORM_FP16 ON)
set(CONFIG_THEAD_RVV_RMS_NORM_INT8 ON)
+set(CONFIG_THEAD_RVV_ROPE_FP32 ON)
+set(CONFIG_THEAD_RVV_ROPE_FP16 ON)
+set(CONFIG_THEAD_RVV_SCALED_DOT_PRODUCT_ATTENTION_FP32 ON)
+set(CONFIG_THEAD_RVV_SCALED_DOT_PRODUCT_ATTENTION_FP16 ON)
set(CONFIG_THEAD_RVV_SIGMOID_FP32 ON)
set(CONFIG_THEAD_RVV_SIGMOID_FP16 ON)
set(CONFIG_THEAD_RVV_SIGMOID_INT8 ON)
@@ -487,3 +495,4 @@ set(CONFIG_C906_SUB_FP32 ON)
set(CONFIG_C906_SUB_FP16 ON)
set(CONFIG_USE_SHL_DEBUG ON)
set(CONFIG_SHL_LAYER_BENCHMARK ON)
+set(CONFIG_SHL_TRACE ON)
3 changes: 2 additions & 1 deletion cmake/e907.cmake
@@ -336,4 +336,5 @@ set(CONFIG_E907_OPT_MUL ON)
set(CONFIG_E907_OPT_SUM ON)
set(CONFIG_E907_OPT_SOFTMAX ON)
set(CONFIG_USE_SHL_DEBUG ON)
-set(CONFIG_SHL_LAYER_BENCHMARK ON)
+set(CONFIG_SHL_LAYER_BENCHMARK ON)
+set(CONFIG_SHL_TRACE ON)
19 changes: 14 additions & 5 deletions cmake/rules.cmake
@@ -1,11 +1,10 @@
if (NOT CONFIG_USE_COMPILER_PATH)

# riscv linux compiler
-if (CONFIG_BUILD_RISCV_RVV OR CONFIG_BUILD_RISCV_RVV_NODOT OR
-CONFIG_BUILD_RISCV_C906 OR CONFIG_BUILD_RISCV_RVM OR
-CONFIG_BUILD_RISCV_C908 OR CONFIG_BUILD_RISCV_C920 OR
-CONFIG_BUILD_RISCV_C920V2 OR CONFIG_BUILD_RISCV_PNNA OR
-CONFIG_BUILD_TH1520)
+if (CONFIG_BUILD_RISCV_RVV OR CONFIG_BUILD_RISCV_C906 OR
+CONFIG_BUILD_RISCV_RVM OR CONFIG_BUILD_RISCV_C908 OR
+CONFIG_BUILD_RISCV_C920 OR CONFIG_BUILD_RISCV_C920V2 OR
+CONFIG_BUILD_RISCV_PNNA OR CONFIG_BUILD_TH1520)
set(CMAKE_C_COMPILER riscv64-unknown-linux-gnu-gcc)
set(CMAKE_CXX_COMPILER riscv64-unknown-linux-gnu-g++)
set(CMAKE_ASM_COMPILER riscv64-unknown-linux-gnu-gcc)
@@ -30,6 +29,11 @@ if(CONFIG_USE_EXPORT_MODEL)
add_definitions(-D SHL_EXPORT_MODEL)
endif()

+# SHL disable xtheadvdot extension
+if(CONFIG_DISABLE_VDOT_EXTENSION)
+add_definitions(-D SHL_DISABLE_VDOT)
+endif()
+
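For reference, a minimal sketch of how a SHL_DISABLE_VDOT compile definition would typically be consumed in C sources; the kernel names below are hypothetical, not from this commit:

#include <stdint.h>

/* Hypothetical kernel entry points; real SHL kernels differ. */
void gemm_int8_vdot(int32_t *dst, const int8_t *sa, const int8_t *sb, int m, int k, int n);
void gemm_int8_rvv(int32_t *dst, const int8_t *sa, const int8_t *sb, int m, int k, int n);

void gemm_int8(int32_t *dst, const int8_t *sa, const int8_t *sb, int m, int k, int n)
{
#ifdef SHL_DISABLE_VDOT
    /* fallback: plain RVV multiply-accumulate path */
    gemm_int8_rvv(dst, sa, sb, m, k, n);
#else
    /* fast path using the xtheadvdot dot-product extension */
    gemm_int8_vdot(dst, sa, sb, m, k, n);
#endif
}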
if (CONFIG_BUILD_ANDROID_TH1520)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DBUILD_ANDROID -Wno-deprecated-non-prototype")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DBUILD_ANDROID")
@@ -66,6 +70,11 @@ if(CONFIG_SHL_LAYER_BENCHMARK)
message(STATUS "Print the execution time of each layer - ON")
endif()

+if(CONFIG_SHL_TRACE)
+add_definitions(-DSHL_TRACE)
+message(STATUS "Generate trace data - ON")
+endif()
+
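Likewise, a minimal sketch of how an SHL_TRACE definition can gate trace output at compile time; the SHL_TRACE_LOG macro below is hypothetical, not SHL's actual trace API:

#include <stdio.h>

/* Hypothetical macro: compiled out entirely unless -DSHL_TRACE is set. */
#ifdef SHL_TRACE
#define SHL_TRACE_LOG(fmt, ...) fprintf(stderr, "[shl-trace] " fmt "\n", ##__VA_ARGS__)
#else
#define SHL_TRACE_LOG(fmt, ...) ((void)0)
#endif

void run_layer(const char *name)
{
    SHL_TRACE_LOG("enter layer %s", name);
    /* ... kernel work ... */
    SHL_TRACE_LOG("leave layer %s", name);
}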
if(CONFIG_GRAPH_REFERENCE_TVMGEN)
add_definitions(-DGRAPH_REFERENCE_TVMGEN)
LIST(APPEND GREF_SRCS source/tvm_gen/utils.c source/tvm_gen/setup.c)
122 changes: 122 additions & 0 deletions include/backend/c906/perf.h
@@ -0,0 +1,122 @@
/*
* Copyright (C) 2016-2023 T-Head Semiconductor Co., Ltd. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#ifndef INCLUDE_SHL_C906_PERF_H_
#define INCLUDE_SHL_C906_PERF_H_

#include "csi_nn.h"
#include "shl_utils.h"

int shl_c906_conv2d_perf(struct csinn_tensor *input, struct csinn_tensor *output,
struct csinn_tensor *kernel, struct csinn_tensor *bias,
struct csinn_conv2d_params *params, struct csinn_perf_info *perf_info);

int shl_c906_depthwise_conv2d_perf(struct csinn_tensor *input, struct csinn_tensor *output,
struct csinn_tensor *kernel, struct csinn_tensor *bias,
struct csinn_conv2d_params *params,
struct csinn_perf_info *perf_info);

int shl_c906_conv1d_perf(struct csinn_tensor *input, struct csinn_tensor *output,
struct csinn_tensor *kernel, struct csinn_tensor *bias,
struct csinn_conv1d_params *params, struct csinn_perf_info *perf_info);

int shl_c906_depthwise_conv1d_perf(struct csinn_tensor *input, struct csinn_tensor *output,
struct csinn_tensor *kernel, struct csinn_tensor *bias,
struct csinn_conv1d_params *params,
struct csinn_perf_info *perf_info);

int shl_c906_fullyconnected_perf(struct csinn_tensor *input, struct csinn_tensor *output,
struct csinn_tensor *weights, struct csinn_tensor *bias,
struct csinn_fc_params *params, struct csinn_perf_info *perf_info);

int shl_c906_maxpool2d_perf(struct csinn_tensor *input, struct csinn_tensor *output,
struct csinn_pool_params *params, struct csinn_perf_info *perf_info);

int shl_c906_avgpool2d_perf(struct csinn_tensor *input, struct csinn_tensor *output,
struct csinn_pool_params *params, struct csinn_perf_info *perf_info);

int shl_c906_div_perf(struct csinn_tensor *input0, struct csinn_tensor *input1,
struct csinn_tensor *output, struct csinn_diso_params *params,
struct csinn_perf_info *perf_info);

int shl_c906_abs_perf(struct csinn_tensor *input, struct csinn_tensor *output,
struct csinn_siso_params *params, struct csinn_perf_info *perf_info);

int shl_c906_add_perf(struct csinn_tensor *input0, struct csinn_tensor *input1,
struct csinn_tensor *output, struct csinn_diso_params *params,
struct csinn_perf_info *perf_info);

int shl_c906_clip_perf(struct csinn_tensor *input, struct csinn_tensor *output,
struct csinn_clip_params *params, struct csinn_perf_info *perf_info);

int shl_c906_concat_perf(struct csinn_tensor **input, struct csinn_tensor *output,
struct csinn_clip_params *params, struct csinn_perf_info *perf_info);

int shl_c906_global_avgpool2d_perf(struct csinn_tensor *input, struct csinn_tensor *output,
struct csinn_pool_params *params,
struct csinn_perf_info *perf_info);

int shl_c906_global_maxpool2d_perf(struct csinn_tensor *input, struct csinn_tensor *output,
struct csinn_pool_params *params,
struct csinn_perf_info *perf_info);

int shl_c906_leaky_relu_perf(struct csinn_tensor *input, struct csinn_tensor *output,
struct csinn_relu_params *params, struct csinn_perf_info *perf_info);

int shl_c906_lrn_perf(struct csinn_tensor *input, struct csinn_tensor *output,
struct csinn_lrn_params *params, struct csinn_perf_info *perf_info);

int shl_c906_matmul_perf(struct csinn_tensor *mat0, struct csinn_tensor *mat1,
struct csinn_tensor *output, struct csinn_matmul_params *params,
struct csinn_perf_info *perf_info);

int shl_c906_minimum_perf(struct csinn_tensor *input0, struct csinn_tensor *input1,
struct csinn_tensor *output, struct csinn_diso_params *params,
struct csinn_perf_info *perf_info);

int shl_c906_mul_perf(struct csinn_tensor *input0, struct csinn_tensor *input1,
struct csinn_tensor *output, struct csinn_diso_params *params,
struct csinn_perf_info *perf_info);

int shl_c906_prelu_perf(struct csinn_tensor *input, struct csinn_tensor *alpha,
struct csinn_tensor *output, struct csinn_prelu_params *params,
struct csinn_perf_info *perf_info);

int shl_c906_relu_perf(struct csinn_tensor *input, struct csinn_tensor *output,
struct csinn_relu_params *params, struct csinn_perf_info *perf_info);

int shl_c906_relu1_perf(struct csinn_tensor *input, struct csinn_tensor *output,
struct csinn_relu_params *params, struct csinn_perf_info *perf_info);

int shl_c906_relu6_perf(struct csinn_tensor *input, struct csinn_tensor *output,
struct csinn_relu_params *params, struct csinn_perf_info *perf_info);

int shl_c906_split_perf(struct csinn_tensor *input, struct csinn_tensor **output,
struct csinn_split_params *params, struct csinn_perf_info *perf_info);

int shl_c906_sub_perf(struct csinn_tensor *input0, struct csinn_tensor *input1,
struct csinn_tensor *output, struct csinn_diso_params *params,
struct csinn_perf_info *perf_info);

int shl_c906_reshape_perf(struct csinn_tensor *input, struct csinn_tensor *output,
struct csinn_reshape_params *params, struct csinn_perf_info *perf_info);

int shl_c906_reduce_sum_perf(struct csinn_tensor *input, struct csinn_tensor *output,
struct csinn_reduce_params *params, struct csinn_perf_info *perf_info);

#endif // INCLUDE_SHL_C906_PERF_H_
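These declarations all follow one pattern: each takes the op's usual tensors and params plus an out-parameter struct csinn_perf_info. A hedged usage sketch — query_conv2d_perf is a made-up helper, csinn_alloc_params is the existing CSI-NN allocator, and the fields of csinn_perf_info (new in this release) are assumed to be defined by the included headers, so the struct is only passed through:

#include "csi_nn.h"
#include "backend/c906/perf.h"

/* Sketch only: query per-op perf info for a conv2d on C906. */
int query_conv2d_perf(struct csinn_tensor *input, struct csinn_tensor *output,
                      struct csinn_tensor *kernel, struct csinn_tensor *bias)
{
    struct csinn_conv2d_params *params =
        csinn_alloc_params(sizeof(struct csinn_conv2d_params), NULL);
    struct csinn_perf_info perf_info = {0};

    return shl_c906_conv2d_perf(input, output, kernel, bias, params, &perf_info);
}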
14 changes: 0 additions & 14 deletions include/backend/c908/c908.h
@@ -45,20 +45,6 @@ int shl_c908_depthwise_conv2d_init_int8(struct csinn_tensor *input, struct csinn
struct csinn_tensor *kernel, struct csinn_tensor *bias,
struct csinn_conv2d_params *params);

-int shl_c908_avgpool2d_init_fp32(struct csinn_tensor *input, struct csinn_tensor *output,
-struct csinn_pool_params *params);
-int shl_c908_avgpool2d_init_fp16(struct csinn_tensor *input, struct csinn_tensor *output,
-struct csinn_pool_params *params);
-int shl_c908_avgpool2d_init_int8(struct csinn_tensor *input, struct csinn_tensor *output,
-struct csinn_pool_params *params);
-
-int shl_c908_maxpool2d_init_fp32(struct csinn_tensor *input, struct csinn_tensor *output,
-struct csinn_pool_params *params);
-int shl_c908_maxpool2d_init_fp16(struct csinn_tensor *input, struct csinn_tensor *output,
-struct csinn_pool_params *params);
-int shl_c908_maxpool2d_init_int8(struct csinn_tensor *input, struct csinn_tensor *output,
-struct csinn_pool_params *params);
-
int shl_c908_fullyconnected_init_fp32(struct csinn_tensor *input, struct csinn_tensor *output,
struct csinn_tensor *weights, struct csinn_tensor *bias,
struct csinn_fc_params *params);
48 changes: 41 additions & 7 deletions include/backend/c920/c920.h
@@ -96,12 +96,35 @@ void shl_c920_gemm_block_8xpack2n_fp16(__fp16 *dst, const __fp16 *sa, const __fp
__fp16 *bias, int m, int k, int n, const int M_BLK,
const int K_BLK, const int N_BLK);

-/************************************ fullyconnected **********************************/
+/************************************* gemm a0b1 *************************************/
+void shl_c920_gemm_a0b1_8xpack2n_fp32(float *dst, const float *sa, const float *sb, float *bias,
+int M, int K, int N);
+void shl_c920_gemm_a0b1_8xpack2n_fp16(__fp16 *dst, const __fp16 *sa, const __fp16 *sb, __fp16 *bias,
+int M, int K, int N);
+
+void shl_c920_gemm_a0nb1r_8xpack2n_fp32(float *dst, const float *sa, const float *sb, float *bias,
+int M, int K, int N);
+void shl_c920_gemm_a0nb1n_dot_fp32_q8(float *dst, const float *sa, const int8_t *sb, float *bias,
+int M, int K, int N, const __fp16 *scale);
+void shl_c920_gemm_a0nb1n_dot_fp32_q4(float *dst, const float *sa, const int8_t *sb, float *bias,
+int M, int K, int N, const __fp16 *scale);
+
+void shl_c920_gemm_a0nb1r_8xpack2n_fp16(__fp16 *dst, const __fp16 *sa, const __fp16 *sb,
+__fp16 *bias, int M, int K, int N);
+void shl_c920_gemm_a0nb1n_dot_fp16_q8(__fp16 *dst, const __fp16 *sa, const int8_t *sb, __fp16 *bias,
+int M, int K, int N, const __fp16 *scale);
+void shl_c920_gemm_a0nb1n_dot_fp16_q4(__fp16 *dst, const __fp16 *sa, const int8_t *sb, __fp16 *bias,
+int M, int K, int N, const __fp16 *scale);
+
+void shl_c920_gemm_a0nb1_dot_fp16_q8_rearrange(__fp16 *dst, const __fp16 *sa, const int8_t *sb,
+__fp16 *bias, int M, int K, int N,
+const __fp16 *scale);
+
+void shl_c920_gemm_a0nb1_dot_fp16_q4_rearrange(__fp16 *dst, const __fp16 *sa, const int8_t *sb,
+__fp16 *bias, int M, int K, int N,
+const __fp16 *scale);
+
+/************************************ fullyconnected **********************************/
int shl_c920_fullyconnected_gemm_fp32(struct csinn_tensor *input, struct csinn_tensor *output,
struct csinn_tensor *weights, struct csinn_tensor *bias,
struct csinn_fc_params *params);
@@ -110,12 +133,23 @@ int shl_c920_fullyconnected_gemm_fp16(struct csinn_t
struct csinn_fc_params *params);

/*************************************** matmul ***************************************/
-int shl_c920_matmul_fp32(struct csinn_tensor *mat0, struct csinn_tensor *mat1,
-struct csinn_tensor *output, struct csinn_matmul_params *params);
-int shl_c920_matmul_fp16(struct csinn_tensor *mat0, struct csinn_tensor *mat1,
-struct csinn_tensor *output, struct csinn_matmul_params *params);
-int shl_c920_matmul_fp16_w_int8(struct csinn_tensor *mat0, struct csinn_tensor *mat1,
-struct csinn_tensor *output, struct csinn_matmul_params *params);
+int shl_c920_matmul_a0b0_fp32(struct csinn_tensor *mat0, struct csinn_tensor *mat1,
+struct csinn_tensor *output, struct csinn_matmul_params *params);
+int shl_c920_matmul_a0b1_fp32(struct csinn_tensor *mat0, struct csinn_tensor *mat1,
+struct csinn_tensor *output, struct csinn_matmul_params *params);
+int shl_c920_matmul_a0b1_fp32_block_quant(struct csinn_tensor *mat0, struct csinn_tensor *mat1,
+struct csinn_tensor *output,
+struct csinn_matmul_params *params);
+int shl_c920_matmul_a0b0_fp16(struct csinn_tensor *mat0, struct csinn_tensor *mat1,
+struct csinn_tensor *output, struct csinn_matmul_params *params);
+int shl_c920_matmul_a0b0_fp16_w_int8(struct csinn_tensor *mat0, struct csinn_tensor *mat1,
+struct csinn_tensor *output,
+struct csinn_matmul_params *params);
+int shl_c920_matmul_a0b1_fp16(struct csinn_tensor *mat0, struct csinn_tensor *mat1,
+struct csinn_tensor *output, struct csinn_matmul_params *params);
+int shl_c920_matmul_a0b1_fp16_block_quant(struct csinn_tensor *mat0, struct csinn_tensor *mat1,
+struct csinn_tensor *output,
+struct csinn_matmul_params *params);

void shl_c920_u8_to_f32(const uint8_t *input, float *output, int32_t offset, float *scale,
uint32_t length);
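The header above gives only signatures for the new GEMM entry points; the a0/b1 layout semantics are not documented here. A minimal calling sketch under the assumption that sa is a plain M x K row-major matrix ("a0") and sb has already been reordered into the packed "b1" layout (gemm_example is a made-up driver, not SHL API):

#include <stdlib.h>
#include "backend/c920/c920.h"

/* Sketch: drive the new fp32 a0b1 GEMM with a zero bias. */
void gemm_example(const float *sa, const float *sb_packed, int M, int K, int N)
{
    float *dst = malloc(sizeof(float) * M * N);
    float *bias = calloc(N, sizeof(float)); /* zero bias */

    shl_c920_gemm_a0b1_8xpack2n_fp32(dst, sa, sb_packed, bias, M, K, N);

    free(bias);
    free(dst);
}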
[196 changed files in this commit; remaining file diffs not shown.]