From 533ee307e34ffec910443d44dd5541c212e6b6cf Mon Sep 17 00:00:00 2001 From: chenf Date: Tue, 23 Jan 2024 14:00:37 +0800 Subject: [PATCH] SHL: version 2.9.0 --- .gitignore | 1 - CMakeLists.txt | 17 +- Makefile | 3 - cmake/c906_elf.cmake | 9 + cmake/e907.cmake | 3 +- cmake/rules.cmake | 19 +- include/backend/c906/perf.h | 122 ++ include/backend/c908/c908.h | 14 - include/backend/c920/c920.h | 48 +- include/backend/c920/perf.h | 37 + include/backend/c920v2/perf.h | 29 + include/backend/reference/perf.h | 724 +++++++ include/backend/reference/ref.h | 8 +- include/backend/rvm/rvm.h | 18 +- include/backend/rvv/cap.h | 21 +- include/backend/rvv/perf.h | 161 ++ include/backend/rvv/rvv.h | 84 +- include/csinn/csi_nn.h | 8 +- include/csinn/csinn_data_structure.h | 20 + include/graph/shl_gref.h | 8 +- include/llm/shl_llm.h | 12 + include/shl_debug.h | 6 +- include/shl_multithread.h | 30 + include/shl_profiler.h | 227 ++ include/shl_utils.h | 20 +- module/json/.clang-format | 2 + source/c906_opt/fp16/convolution.c | 9 +- source/c906_opt/fp16/convolution_3x3_fp16.c | 9 +- source/c906_opt/fp16/fullyconnected.c | 57 +- source/c906_opt/fp32/convolution.c | 9 +- source/c906_opt/fp32/convolution_3x3_fp32.c | 9 +- source/c906_opt/performance.c | 327 +++ source/c906_opt/setup.c | 74 + source/c908_opt/fp16/convolution.c | 14 +- .../fp16/convolution_3x3_fp16_packn_1.c | 26 +- source/c908_opt/fp32/convolution.c | 14 +- .../fp32/convolution_3x3_fp32_packn_1.c | 26 +- source/c908_opt/int8/convolution.c | 59 +- source/c908_opt/int8/convolution_1x1_int8.c | 3 + source/c908_opt/int8/convolution_gemm_int8.c | 3 + source/c920_opt/fp16/convolution.c | 15 +- source/c920_opt/fp16/fullyconnected.c | 11 +- source/c920_opt/fp16/gemm_a0nb1n_fp16.c | 864 ++++++++ source/c920_opt/fp16/gemm_a0nb1r_fp16.c | 372 ++++ source/c920_opt/fp16/gemm_fp16_block.c | 4 +- source/c920_opt/fp16/gemm_fp16_rearrange.c | 143 ++ source/c920_opt/fp16/matmul_fp16.c | 369 +++- source/c920_opt/fp32/convolution.c | 15 +- 
source/c920_opt/fp32/gemm_a0nb1n_fp32.c | 868 ++++++++ source/c920_opt/fp32/gemm_a0nb1r_fp32.c | 372 ++++ source/c920_opt/fp32/gemm_fp32_block.c | 4 +- source/c920_opt/fp32/matmul_fp32.c | 279 ++- source/c920_opt/performance.c | 79 + source/c920_opt/setup.c | 22 +- source/c920v2_opt/fp16/convolution.c | 15 +- source/c920v2_opt/fp32/convolution.c | 15 +- source/c920v2_opt/int8/convolution.c | 35 +- source/c920v2_opt/performance.c | 61 + source/c920v2_opt/setup.c | 19 +- source/graph_ref/rms_norm.c | 13 +- source/graph_ref/setup.c | 516 ++++- source/graph_ref/subgraph.c | 87 +- source/llm/llama2.c | 130 +- source/llm/llm.c | 2 +- source/nn2/format.c | 37 +- source/nn2/rms_norm.c | 14 +- source/nn2/setup.c | 158 +- source/nn2/utils.c | 226 +- source/reference/performance.c | 1826 +++++++++++++++++ source/reference/rms_norm.c | 10 +- source/reference/rope.c | 68 +- .../reference/scaled_dot_product_attention.c | 115 +- source/reference/setup.c | 223 ++ source/thead_matrix/fullyconnected_fp16.c | 67 +- source/thead_matrix/setup.c | 18 - source/thead_rvv/CMakeLists.txt | 44 +- source/thead_rvv/Kconfig | 62 +- source/thead_rvv/binary_broadcast.c | 15 + source/thead_rvv/capability.c | 40 +- source/thead_rvv/fp16/avgpool.c | 126 +- source/thead_rvv/fp16/convolution.c | 18 +- .../fp16/convolution_3x3_fp16_packn.c | 28 +- source/thead_rvv/fp16/expand_dims.c | 42 + source/thead_rvv/fp16/fullyconnected_fp16.c | 67 +- source/thead_rvv/fp16/gemm_fp16_block.c | 5 +- source/thead_rvv/fp16/llm_pos.c | 65 + source/thead_rvv/fp16/matmul.c | 16 +- source/thead_rvv/fp16/maxpool.c | 126 +- source/thead_rvv/fp16/rms_norm.c | 90 +- source/thead_rvv/fp16/rope.c | 95 + .../fp16/scaled_dot_product_attention.c | 751 +++++++ source/thead_rvv/fp16/softmax.c | 3 +- source/thead_rvv/fp16/strided_slice.c | 3 + source/thead_rvv/fp16/transpose.c | 3 + source/thead_rvv/fp32/avgpool.c | 126 +- source/thead_rvv/fp32/convolution.c | 15 +- .../fp32/convolution_3x3_fp32_packn.c | 28 +- 
source/thead_rvv/fp32/expand_dims.c | 42 + source/thead_rvv/fp32/gemm_fp32_block.c | 5 +- source/thead_rvv/fp32/matmul.c | 12 +- source/thead_rvv/fp32/maxpool.c | 126 +- source/thead_rvv/fp32/rms_norm.c | 4 +- source/thead_rvv/fp32/rope.c | 93 + .../fp32/scaled_dot_product_attention.c | 750 +++++++ source/thead_rvv/fp32/softmax.c | 3 +- source/thead_rvv/fp32/transpose.c | 3 + source/thead_rvv/int32/embedding.c | 202 ++ source/thead_rvv/int8/avgpool.c | 44 +- source/thead_rvv/int8/convolution.c | 72 +- source/thead_rvv/int8/convolution1d.c | 7 +- source/thead_rvv/int8/convolution_1x1_int8.c | 3 + .../int8/convolution_3x3_int8_packn.c | 14 +- source/thead_rvv/int8/convolution_gemm_int8.c | 3 + .../int8/convolution_gemm_int8_pack1ton.c | 4 + .../int8/convolution_gemm_int8_packn.c | 3 +- .../int8/convolution_gemm_int8_packnto1.c | 3 +- source/thead_rvv/int8/depthwise_convolution.c | 5 +- source/thead_rvv/int8/fullyconnected.c | 8 +- source/thead_rvv/int8/matmul.c | 8 +- source/thead_rvv/int8/maxpool.c | 126 +- source/thead_rvv/int8/rms_norm.c | 6 +- source/thead_rvv/int8/transpose.c | 3 + source/thead_rvv/performance.c | 723 +++++++ source/thead_rvv/setup.c | 262 ++- source/utils/debug.c | 33 +- source/utils/multithread.c | 43 + source/utils/shl_profiler.c | 490 +++++ tests/autotest/interface_test.py | 29 +- tests/llm/Makefile | 5 + tests/llm/c920_llama2_quantize.c | 159 ++ tests/llm/llama2.c | 3 + tests/llm/llama2_quantize.c | 3 + tests/profiler/Makefile | 43 + tests/profiler/test_trace.c | 220 ++ tests/utils/test_utils.c | 125 +- tests/utils/test_utils.h | 3 + tests/validation_layer/Makefile.c906 | 2 +- tests/validation_layer/Makefile.c908 | 2 +- tests/validation_layer/Makefile.c920 | 4 +- tests/validation_layer/Makefile.c920v2 | 2 +- tests/validation_layer/Makefile.rvm | 2 +- tests/validation_layer/Makefile.rvv | 6 +- tests/validation_layer/abs.cpp | 16 +- tests/validation_layer/add.cpp | 16 +- tests/validation_layer/averagepool.cpp | 29 +- 
tests/validation_layer/averagepool_nhwc.cpp | 16 +- tests/validation_layer/broadcast_to.cpp | 16 +- tests/validation_layer/clip.cpp | 16 +- tests/validation_layer/concat.cpp | 16 +- tests/validation_layer/convolution.cpp | 53 +- tests/validation_layer/convolution1d.cpp | 24 +- tests/validation_layer/convolution_nhwc.cpp | 16 +- tests/validation_layer/deconvolution.cpp | 29 +- .../depthwise_convolution.cpp | 46 +- .../depthwise_convolution1d.cpp | 24 +- .../depthwise_convolution_nhwc.cpp | 16 +- tests/validation_layer/div.cpp | 16 +- tests/validation_layer/erf.cpp | 16 +- tests/validation_layer/fullyconnected.cpp | 27 +- tests/validation_layer/gather.cpp | 18 +- tests/validation_layer/global_avgpool.cpp | 28 +- .../validation_layer/global_avgpool_nhwc.cpp | 16 +- tests/validation_layer/global_maxpool.cpp | 28 +- .../validation_layer/global_maxpool_nhwc.cpp | 16 +- tests/validation_layer/group_convolution.cpp | 45 +- tests/validation_layer/layer_norm.cpp | 16 +- tests/validation_layer/leaky_relu.cpp | 16 +- tests/validation_layer/lrn.cpp | 16 +- tests/validation_layer/matmul.cpp | 39 +- tests/validation_layer/maxpool.cpp | 29 +- tests/validation_layer/maxpool_nhwc.cpp | 16 +- tests/validation_layer/mean_stride.cpp | 16 +- tests/validation_layer/minimum.cpp | 16 +- tests/validation_layer/mul.cpp | 16 +- tests/validation_layer/pad.cpp | 16 +- tests/validation_layer/power.cpp | 16 +- tests/validation_layer/prelu.cpp | 16 +- tests/validation_layer/reduce_sum.cpp | 16 +- tests/validation_layer/relu.cpp | 17 +- tests/validation_layer/relu1.cpp | 16 +- tests/validation_layer/relu6.cpp | 18 +- tests/validation_layer/reshape.cpp | 18 +- tests/validation_layer/rms_norm.cpp | 18 +- tests/validation_layer/sigmoid.cpp | 16 +- tests/validation_layer/silu.cpp | 16 +- tests/validation_layer/softmax.cpp | 16 +- tests/validation_layer/split.cpp | 16 +- tests/validation_layer/sqrt.cpp | 16 +- tests/validation_layer/strided_slice.cpp | 16 +- tests/validation_layer/sub.cpp | 16 +- 
tests/validation_layer/sum_stride.cpp | 16 +- tests/validation_layer/testutil.h | 721 +++++-- tests/validation_layer/transpose.cpp | 16 +- tests/validation_layer/where.cpp | 16 +- tests/validation_layer/where_softmax.cpp | 14 +- version | 2 +- 196 files changed, 14834 insertions(+), 1775 deletions(-) create mode 100644 include/backend/c906/perf.h create mode 100644 include/backend/c920/perf.h create mode 100644 include/backend/c920v2/perf.h create mode 100644 include/backend/reference/perf.h create mode 100644 include/backend/rvv/perf.h create mode 100644 include/shl_multithread.h create mode 100644 include/shl_profiler.h create mode 100644 module/json/.clang-format create mode 100644 source/c906_opt/performance.c create mode 100644 source/c920_opt/fp16/gemm_a0nb1n_fp16.c create mode 100644 source/c920_opt/fp16/gemm_a0nb1r_fp16.c create mode 100644 source/c920_opt/fp16/gemm_fp16_rearrange.c create mode 100644 source/c920_opt/fp32/gemm_a0nb1n_fp32.c create mode 100644 source/c920_opt/fp32/gemm_a0nb1r_fp32.c create mode 100644 source/c920_opt/performance.c create mode 100644 source/c920v2_opt/performance.c create mode 100644 source/reference/performance.c create mode 100644 source/thead_rvv/fp16/expand_dims.c create mode 100644 source/thead_rvv/fp16/llm_pos.c create mode 100644 source/thead_rvv/fp16/rope.c create mode 100644 source/thead_rvv/fp16/scaled_dot_product_attention.c create mode 100644 source/thead_rvv/fp32/expand_dims.c create mode 100644 source/thead_rvv/fp32/rope.c create mode 100644 source/thead_rvv/fp32/scaled_dot_product_attention.c create mode 100644 source/thead_rvv/int32/embedding.c create mode 100644 source/thead_rvv/performance.c create mode 100644 source/utils/multithread.c create mode 100644 source/utils/shl_profiler.c create mode 100644 tests/llm/c920_llama2_quantize.c create mode 100644 tests/profiler/Makefile create mode 100644 tests/profiler/test_trace.c diff --git a/.gitignore b/.gitignore index 8822f87d..4e60cffe 100644 --- a/.gitignore 
+++ b/.gitignore @@ -17,7 +17,6 @@ example/*.elf openvx_build e907_build rvv_build -rvv_nodot_build rvm_build c906_static_build c906_so_build diff --git a/CMakeLists.txt b/CMakeLists.txt index f05ba775..a97fab5a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -17,6 +17,7 @@ if (CONFIG_CUSTOM_SOURCE_SELECT) else() set(CONFIG_USE_SHL_DEBUG ON) set(CONFIG_SHL_LAYER_BENCHMARK ON) + set(CONFIG_SHL_TRACE ON) endif() file (STRINGS "version" SHL_VERSION) @@ -47,18 +48,6 @@ if(CONFIG_BUILD_RISCV_RVV) install(TARGETS rvv_static DESTINATION lib) endif() -if(CONFIG_BUILD_RISCV_RVV_NODOT) - # build rvv a without xtheadvdot extension - include(cmake/rules.cmake) - LIST(APPEND RVV_LST ${NN2_SRCS} ${REF_SRCS} ${GREF_SRCS} ${THEAD_RVV_SRCS}) - add_library(rvv_static STATIC ${RVV_LST}) - SET_TARGET_PROPERTIES(rvv_static PROPERTIES OUTPUT_NAME "shl_rvv_nodot") - set(RVV_BUILD_FLAGS -ffp-contract=off -march=rv64gcv_zfh_xtheadc -mabi=lp64d -DSHL_BUILD_RVV -DSHL_BUILD_REF -DSHL_BUILD_GREF) - target_compile_options(rvv_static PRIVATE ${RVV_BUILD_FLAGS}) - - install(TARGETS rvv_static DESTINATION lib) -endif() - if(CONFIG_BUILD_RISCV_C906) # build c906 lib set(CONFIG_GRAPH_REFERENCE_TVMGEN ON) @@ -102,8 +91,8 @@ if(CONFIG_BUILD_RISCV_C920) set(SHL_LIB_TARGET "c920_lib") set(SHL_LIB_NAME shl_c920) - LIST(APPEND SHL_BUILD_SRC_LST ${NN2_SRCS} ${REF_SRCS} ${GREF_SRCS} ${THEAD_RVV_SRCS} ${C920_SRCS}) - set(SHL_BUILD_C_FLAGS -ffp-contract=off -march=rv64gcv0p7_zfh_xtheadc -mabi=lp64d -DSHL_BUILD_C920 -DSHL_BUILD_REF -DSHL_BUILD_GREF -DSHL_BUILD_RVV) + LIST(APPEND SHL_BUILD_SRC_LST ${NN2_SRCS} ${REF_SRCS} ${GREF_SRCS} ${THEAD_RVV_SRCS} ${C920_SRCS} ${LLM_SRCS}) + set(SHL_BUILD_C_FLAGS -ffp-contract=off -march=rv64gcv0p7_zfh_xtheadc -mabi=lp64d -DSHL_BUILD_C920 -DSHL_BUILD_REF -DSHL_BUILD_GREF -DSHL_BUILD_RVV -fopenmp) include(cmake/target_build.cmake) target_include_directories(${SHL_LIB_TARGET} PRIVATE module/dlpack/include/) endif() diff --git a/Makefile b/Makefile index 1dae21a1..42a7d9bc 
100644 --- a/Makefile +++ b/Makefile @@ -10,9 +10,6 @@ nn2_e907_elf: nn2_rvv: mkdir -p rvv_build; cd rvv_build; cmake ../ -DCONFIG_BUILD_RISCV_RVV=ON -DCMAKE_BUILD_TYPE=${BUILD_TYPE} -DCMAKE_INSTALL_PREFIX=${INSTALL_DIR}/rvv/; make -j${USE_CORE}; make install; cd - -nn2_rvv_nodot: - mkdir -p rvv_nodot_build; cd rvv_nodot_build; cmake ../ -DCONFIG_BUILD_RISCV_RVV_NODOT=ON -DCMAKE_BUILD_TYPE=${BUILD_TYPE} -DCMAKE_INSTALL_PREFIX=${INSTALL_DIR}/rvv_nodot/; make -j${USE_CORE}; make install; cd - - nn2_c906: mkdir -p c906_static_build; cd c906_static_build; cmake ../ -DCONFIG_BUILD_RISCV_C906=ON -DCONFIG_SHL_BUILD_STATIC=ON -DCMAKE_BUILD_TYPE=${BUILD_TYPE} -DCMAKE_INSTALL_PREFIX=${INSTALL_DIR}/c906/; make -j${USE_CORE}; make install; cd - diff --git a/cmake/c906_elf.cmake b/cmake/c906_elf.cmake index 39b4a90f..38cbd4d8 100644 --- a/cmake/c906_elf.cmake +++ b/cmake/c906_elf.cmake @@ -359,9 +359,12 @@ set(CONFIG_THEAD_RVV_DECONVOLUTION_FP16 ON) set(CONFIG_THEAD_RVV_DIV_FP32 ON) set(CONFIG_THEAD_RVV_DIV_FP16 ON) set(CONFIG_THEAD_RVV_DIV_INT8 ON) +set(CONFIG_THEAD_RVV_EMBEDDING_INT32 ON) set(CONFIG_THEAD_RVV_ERF_FP32 ON) set(CONFIG_THEAD_RVV_ERF_FP16 ON) set(CONFIG_THEAD_RVV_ERF_INT8 ON) +set(CONFIG_THEAD_RVV_EXPAND_DIMS_FP32 ON) +set(CONFIG_THEAD_RVV_EXPAND_DIMS_FP16 ON) set(CONFIG_THEAD_RVV_FULLYCONNECTED_FP32 ON) set(CONFIG_THEAD_RVV_FULLYCONNECTED_FP16 ON) set(CONFIG_THEAD_RVV_FULLYCONNECTED_INT8 ON) @@ -383,6 +386,7 @@ set(CONFIG_THEAD_RVV_LAYER_NORM_INT8 ON) set(CONFIG_THEAD_RVV_LEAKY_RELU_FP32 ON) set(CONFIG_THEAD_RVV_LEAKY_RELU_FP16 ON) set(CONFIG_THEAD_RVV_LEAKY_RELU_INT8 ON) +set(CONFIG_THEAD_RVV_LLM_POS_FP16 ON) set(CONFIG_THEAD_RVV_MATMUL_FP32 ON) set(CONFIG_THEAD_RVV_MATMUL_FP16 ON) set(CONFIG_THEAD_RVV_MATMUL_INT8 ON) @@ -411,6 +415,10 @@ set(CONFIG_THEAD_RVV_RESHAPE_INT8 ON) set(CONFIG_THEAD_RVV_RMS_NORM_FP32 ON) set(CONFIG_THEAD_RVV_RMS_NORM_FP16 ON) set(CONFIG_THEAD_RVV_RMS_NORM_INT8 ON) +set(CONFIG_THEAD_RVV_ROPE_FP32 ON) +set(CONFIG_THEAD_RVV_ROPE_FP16 ON) 
+set(CONFIG_THEAD_RVV_SCALED_DOT_PRODUCT_ATTENTION_FP32 ON) +set(CONFIG_THEAD_RVV_SCALED_DOT_PRODUCT_ATTENTION_FP16 ON) set(CONFIG_THEAD_RVV_SIGMOID_FP32 ON) set(CONFIG_THEAD_RVV_SIGMOID_FP16 ON) set(CONFIG_THEAD_RVV_SIGMOID_INT8 ON) @@ -487,3 +495,4 @@ set(CONFIG_C906_SUB_FP32 ON) set(CONFIG_C906_SUB_FP16 ON) set(CONFIG_USE_SHL_DEBUG ON) set(CONFIG_SHL_LAYER_BENCHMARK ON) +set(CONFIG_SHL_TRACE ON) diff --git a/cmake/e907.cmake b/cmake/e907.cmake index 1fa5c72d..eed8036d 100644 --- a/cmake/e907.cmake +++ b/cmake/e907.cmake @@ -336,4 +336,5 @@ set(CONFIG_E907_OPT_MUL ON) set(CONFIG_E907_OPT_SUM ON) set(CONFIG_E907_OPT_SOFTMAX ON) set(CONFIG_USE_SHL_DEBUG ON) -set(CONFIG_SHL_LAYER_BENCHMARK ON) \ No newline at end of file +set(CONFIG_SHL_LAYER_BENCHMARK ON) +set(CONFIG_SHL_TRACE ON) \ No newline at end of file diff --git a/cmake/rules.cmake b/cmake/rules.cmake index 500ef7e2..e17cd278 100644 --- a/cmake/rules.cmake +++ b/cmake/rules.cmake @@ -1,11 +1,10 @@ if (NOT CONFIG_USE_COMPILER_PATH) # riscv linux compiler -if (CONFIG_BUILD_RISCV_RVV OR CONFIG_BUILD_RISCV_RVV_NODOT OR - CONFIG_BUILD_RISCV_C906 OR CONFIG_BUILD_RISCV_RVM OR - CONFIG_BUILD_RISCV_C908 OR CONFIG_BUILD_RISCV_C920 OR - CONFIG_BUILD_RISCV_C920V2 OR CONFIG_BUILD_RISCV_PNNA OR - CONFIG_BUILD_TH1520) +if (CONFIG_BUILD_RISCV_RVV OR CONFIG_BUILD_RISCV_C906 OR + CONFIG_BUILD_RISCV_RVM OR CONFIG_BUILD_RISCV_C908 OR + CONFIG_BUILD_RISCV_C920 OR CONFIG_BUILD_RISCV_C920V2 OR + CONFIG_BUILD_RISCV_PNNA OR CONFIG_BUILD_TH1520) set(CMAKE_C_COMPILER riscv64-unknown-linux-gnu-gcc) set(CMAKE_CXX_COMPILER riscv64-unknown-linux-gnu-g++) set(CMAKE_ASM_COMPILER riscv64-unknown-linux-gnu-gcc) @@ -30,6 +29,11 @@ if(CONFIG_USE_EXPORT_MODEL) add_definitions(-D SHL_EXPORT_MODEL) endif() +# SHL disable xtheadvdot extension +if(CONFIG_DISABLE_VDOT_EXTENSION) + add_definitions(-D SHL_DISABLE_VDOT) +endif() + if (CONFIG_BUILD_ANDROID_TH1520) set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DBUILD_ANDROID -Wno-deprecated-non-prototype") 
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DBUILD_ANDROID") @@ -66,6 +70,11 @@ if(CONFIG_SHL_LAYER_BENCHMARK) message(STATUS "Print the execution time of each layer - ON") endif() +if(CONFIG_SHL_TRACE) + add_definitions(-DSHL_TRACE) + message(STATUS "Generate trace data - ON") +endif() + if(CONFIG_GRAPH_REFERENCE_TVMGEN) add_definitions(-DGRAPH_REFERENCE_TVMGEN) LIST(APPEND GREF_SRCS source/tvm_gen/utils.c source/tvm_gen/setup.c) diff --git a/include/backend/c906/perf.h b/include/backend/c906/perf.h new file mode 100644 index 00000000..1c03719d --- /dev/null +++ b/include/backend/c906/perf.h @@ -0,0 +1,122 @@ +/* + * Copyright (C) 2016-2023 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef INCLUDE_SHL_C906_PERF_H_ +#define INCLUDE_SHL_C906_PERF_H_ + +#include "csi_nn.h" +#include "shl_utils.h" + +int shl_c906_conv2d_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params, struct csinn_perf_info *perf_info); + +int shl_c906_depthwise_conv2d_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params, + struct csinn_perf_info *perf_info); + +int shl_c906_conv1d_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv1d_params *params, struct csinn_perf_info *perf_info); + +int shl_c906_depthwise_conv1d_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv1d_params *params, + struct csinn_perf_info *perf_info); + +int shl_c906_fullyconnected_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weights, struct csinn_tensor *bias, + struct csinn_fc_params *params, struct csinn_perf_info *perf_info); + +int shl_c906_maxpool2d_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params, struct csinn_perf_info *perf_info); + +int shl_c906_avgpool2d_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params, struct csinn_perf_info *perf_info); + +int shl_c906_div_perf(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params, + struct csinn_perf_info *perf_info); + +int shl_c906_abs_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params, struct csinn_perf_info *perf_info); + +int shl_c906_add_perf(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct 
csinn_tensor *output, struct csinn_diso_params *params, + struct csinn_perf_info *perf_info); + +int shl_c906_clip_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_clip_params *params, struct csinn_perf_info *perf_info); + +int shl_c906_concat_perf(struct csinn_tensor **input, struct csinn_tensor *output, + struct csinn_clip_params *params, struct csinn_perf_info *perf_info); + +int shl_c906_global_avgpool2d_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params, + struct csinn_perf_info *perf_info); + +int shl_c906_global_maxpool2d_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params, + struct csinn_perf_info *perf_info); + +int shl_c906_leaky_relu_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params, struct csinn_perf_info *perf_info); + +int shl_c906_lrn_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_lrn_params *params, struct csinn_perf_info *perf_info); + +int shl_c906_matmul_perf(struct csinn_tensor *mat0, struct csinn_tensor *mat1, + struct csinn_tensor *output, struct csinn_matmul_params *params, + struct csinn_perf_info *perf_info); + +int shl_c906_minimum_perf(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params, + struct csinn_perf_info *perf_info); + +int shl_c906_mul_perf(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params, + struct csinn_perf_info *perf_info); + +int shl_c906_prelu_perf(struct csinn_tensor *input, struct csinn_tensor *alpha, + struct csinn_tensor *output, struct csinn_prelu_params *params, + struct csinn_perf_info *perf_info); + +int shl_c906_relu_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params, struct csinn_perf_info *perf_info); + +int shl_c906_relu1_perf(struct 
csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params, struct csinn_perf_info *perf_info); + +int shl_c906_relu6_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params, struct csinn_perf_info *perf_info); + +int shl_c906_split_perf(struct csinn_tensor *input, struct csinn_tensor **output, + struct csinn_split_params *params, struct csinn_perf_info *perf_info); + +int shl_c906_sub_perf(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params, + struct csinn_perf_info *perf_info); + +int shl_c906_reshape_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reshape_params *params, struct csinn_perf_info *perf_info); + +int shl_c906_reduce_sum_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params, struct csinn_perf_info *perf_info); + +#endif // INCLUDE_SHL_C906_PERF_H_ diff --git a/include/backend/c908/c908.h b/include/backend/c908/c908.h index 07cd0e70..633863ba 100644 --- a/include/backend/c908/c908.h +++ b/include/backend/c908/c908.h @@ -45,20 +45,6 @@ int shl_c908_depthwise_conv2d_init_int8(struct csinn_tensor *input, struct csinn struct csinn_tensor *kernel, struct csinn_tensor *bias, struct csinn_conv2d_params *params); -int shl_c908_avgpool2d_init_fp32(struct csinn_tensor *input, struct csinn_tensor *output, - struct csinn_pool_params *params); -int shl_c908_avgpool2d_init_fp16(struct csinn_tensor *input, struct csinn_tensor *output, - struct csinn_pool_params *params); -int shl_c908_avgpool2d_init_int8(struct csinn_tensor *input, struct csinn_tensor *output, - struct csinn_pool_params *params); - -int shl_c908_maxpool2d_init_fp32(struct csinn_tensor *input, struct csinn_tensor *output, - struct csinn_pool_params *params); -int shl_c908_maxpool2d_init_fp16(struct csinn_tensor *input, struct csinn_tensor *output, - struct csinn_pool_params *params); -int 
shl_c908_maxpool2d_init_int8(struct csinn_tensor *input, struct csinn_tensor *output, - struct csinn_pool_params *params); - int shl_c908_fullyconnected_init_fp32(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_tensor *weights, struct csinn_tensor *bias, struct csinn_fc_params *params); diff --git a/include/backend/c920/c920.h b/include/backend/c920/c920.h index a3881846..2a41d19d 100644 --- a/include/backend/c920/c920.h +++ b/include/backend/c920/c920.h @@ -96,12 +96,35 @@ void shl_c920_gemm_block_8xpack2n_fp16(__fp16 *dst, const __fp16 *sa, const __fp __fp16 *bias, int m, int k, int n, const int M_BLK, const int K_BLK, const int N_BLK); -/************************************ fullyconnected **********************************/ +/************************************* gemm a0b1 *************************************/ void shl_c920_gemm_a0b1_8xpack2n_fp32(float *dst, const float *sa, const float *sb, float *bias, int M, int K, int N); void shl_c920_gemm_a0b1_8xpack2n_fp16(__fp16 *dst, const __fp16 *sa, const __fp16 *sb, __fp16 *bias, int M, int K, int N); +void shl_c920_gemm_a0nb1r_8xpack2n_fp32(float *dst, const float *sa, const float *sb, float *bias, + int M, int K, int N); +void shl_c920_gemm_a0nb1n_dot_fp32_q8(float *dst, const float *sa, const int8_t *sb, float *bias, + int M, int K, int N, const __fp16 *scale); +void shl_c920_gemm_a0nb1n_dot_fp32_q4(float *dst, const float *sa, const int8_t *sb, float *bias, + int M, int K, int N, const __fp16 *scale); + +void shl_c920_gemm_a0nb1r_8xpack2n_fp16(__fp16 *dst, const __fp16 *sa, const __fp16 *sb, + __fp16 *bias, int M, int K, int N); +void shl_c920_gemm_a0nb1n_dot_fp16_q8(__fp16 *dst, const __fp16 *sa, const int8_t *sb, __fp16 *bias, + int M, int K, int N, const __fp16 *scale); +void shl_c920_gemm_a0nb1n_dot_fp16_q4(__fp16 *dst, const __fp16 *sa, const int8_t *sb, __fp16 *bias, + int M, int K, int N, const __fp16 *scale); + +void shl_c920_gemm_a0nb1_dot_fp16_q8_rearrange(__fp16 *dst, const 
__fp16 *sa, const int8_t *sb, + __fp16 *bias, int M, int K, int N, + const __fp16 *scale); + +void shl_c920_gemm_a0nb1_dot_fp16_q4_rearrange(__fp16 *dst, const __fp16 *sa, const int8_t *sb, + __fp16 *bias, int M, int K, int N, + const __fp16 *scale); + +/************************************ fullyconnected **********************************/ int shl_c920_fullyconnected_gemm_fp32(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_tensor *weights, struct csinn_tensor *bias, struct csinn_fc_params *params); @@ -110,12 +133,23 @@ int shl_c920_fullyconnected_gemm_fp16(struct csinn_tensor *input, struct csinn_t struct csinn_fc_params *params); /*************************************** matmul ***************************************/ -int shl_c920_matmul_fp32(struct csinn_tensor *mat0, struct csinn_tensor *mat1, - struct csinn_tensor *output, struct csinn_matmul_params *params); -int shl_c920_matmul_fp16(struct csinn_tensor *mat0, struct csinn_tensor *mat1, - struct csinn_tensor *output, struct csinn_matmul_params *params); -int shl_c920_matmul_fp16_w_int8(struct csinn_tensor *mat0, struct csinn_tensor *mat1, - struct csinn_tensor *output, struct csinn_matmul_params *params); +int shl_c920_matmul_a0b0_fp32(struct csinn_tensor *mat0, struct csinn_tensor *mat1, + struct csinn_tensor *output, struct csinn_matmul_params *params); +int shl_c920_matmul_a0b1_fp32(struct csinn_tensor *mat0, struct csinn_tensor *mat1, + struct csinn_tensor *output, struct csinn_matmul_params *params); +int shl_c920_matmul_a0b1_fp32_block_quant(struct csinn_tensor *mat0, struct csinn_tensor *mat1, + struct csinn_tensor *output, + struct csinn_matmul_params *params); +int shl_c920_matmul_a0b0_fp16(struct csinn_tensor *mat0, struct csinn_tensor *mat1, + struct csinn_tensor *output, struct csinn_matmul_params *params); +int shl_c920_matmul_a0b0_fp16_w_int8(struct csinn_tensor *mat0, struct csinn_tensor *mat1, + struct csinn_tensor *output, + struct csinn_matmul_params *params); +int 
shl_c920_matmul_a0b1_fp16(struct csinn_tensor *mat0, struct csinn_tensor *mat1, + struct csinn_tensor *output, struct csinn_matmul_params *params); +int shl_c920_matmul_a0b1_fp16_block_quant(struct csinn_tensor *mat0, struct csinn_tensor *mat1, + struct csinn_tensor *output, + struct csinn_matmul_params *params); void shl_c920_u8_to_f32(const uint8_t *input, float *output, int32_t offset, float *scale, uint32_t length); diff --git a/include/backend/c920/perf.h b/include/backend/c920/perf.h new file mode 100644 index 00000000..7eb4a024 --- /dev/null +++ b/include/backend/c920/perf.h @@ -0,0 +1,37 @@ +/* + * Copyright (C) 2016-2023 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef INCLUDE_SHL_C920_PERF_H_ +#define INCLUDE_SHL_C920_PERF_H_ + +#include "csi_nn.h" +#include "shl_utils.h" + +int shl_c920_conv2d_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params, struct csinn_perf_info *perf_info); + +int shl_c920_fullyconnected_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weights, struct csinn_tensor *bias, + struct csinn_fc_params *params, struct csinn_perf_info *perf_info); + +int shl_c920_matmul_perf(struct csinn_tensor *mat0, struct csinn_tensor *mat1, + struct csinn_tensor *output, struct csinn_matmul_params *params, + struct csinn_perf_info *perf_info); + +#endif // INCLUDE_SHL_C920_PERF_H_ diff --git a/include/backend/c920v2/perf.h b/include/backend/c920v2/perf.h new file mode 100644 index 00000000..e5d6a619 --- /dev/null +++ b/include/backend/c920v2/perf.h @@ -0,0 +1,29 @@ +/* + * Copyright (C) 2016-2023 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef INCLUDE_SHL_C920V2_PERF_H_ +#define INCLUDE_SHL_C920V2_PERF_H_ + +#include "csi_nn.h" +#include "shl_utils.h" + +int shl_c920v2_conv2d_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params, struct csinn_perf_info *perf_info); + +#endif // INCLUDE_SHL_C920V2_PERF_H_ diff --git a/include/backend/reference/perf.h b/include/backend/reference/perf.h new file mode 100644 index 00000000..9956bdb1 --- /dev/null +++ b/include/backend/reference/perf.h @@ -0,0 +1,724 @@ +/* + * Copyright (C) 2016-2023 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef INCLUDE_SHL_REF_PERF_H_ +#define INCLUDE_SHL_REF_PERF_H_ + +#include "csi_nn.h" +#include "shl_utils.h" + +#ifdef __cplusplus +extern "C" { +#endif + +int shl_ref_abs_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params, struct csinn_perf_info *perf_info); + +int shl_ref_acos_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params, struct csinn_perf_info *perf_info); + +int shl_ref_acosh_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params, struct csinn_perf_info *perf_info); + +int shl_ref_add_perf(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params, + struct csinn_perf_info *perf_info); + +int shl_ref_and_perf(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params, + struct csinn_perf_info *perf_info); + +int shl_ref_arange_perf(struct csinn_tensor *output, struct csinn_arange_params *params, + struct csinn_perf_info *perf_info); + +int shl_ref_argmax_stride_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params, + struct csinn_perf_info *perf_info); + +int shl_ref_argmin_stride_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params, + struct csinn_perf_info *perf_info); + +int shl_ref_asin_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params, struct csinn_perf_info *perf_info); + +int shl_ref_asinh_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params, struct csinn_perf_info *perf_info); + +int shl_ref_atan_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params, struct csinn_perf_info *perf_info); + +int shl_ref_atanh_perf(struct csinn_tensor *input, struct csinn_tensor 
*output, + struct csinn_siso_params *params, struct csinn_perf_info *perf_info); + +int shl_ref_avgpool2d_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params, struct csinn_perf_info *perf_info); + +int shl_ref_avgpool3d_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params, struct csinn_perf_info *perf_info); + +int shl_ref_batch_normalization_perf(struct csinn_tensor *input, struct csinn_tensor *mean, + struct csinn_tensor *variance, struct csinn_tensor *gamma, + struct csinn_tensor *beta, struct csinn_tensor *output, + struct csinn_bn_params *params, + struct csinn_perf_info *perf_info); + +int shl_ref_batch_to_space_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_batch_to_space_params *params, + struct csinn_perf_info *perf_info); + +int shl_ref_broadcast_to_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_broadcast_to_params *params, + struct csinn_perf_info *perf_info); + +int shl_ref_ceil_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params, struct csinn_perf_info *perf_info); + +int shl_ref_clip_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_clip_params *params, struct csinn_perf_info *perf_info); + +int shl_ref_col2im_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_col2im_params *params, + struct csinn_perf_info *perf_info); + +int shl_ref_concat_perf(struct csinn_tensor **input, struct csinn_tensor *output, + struct csinn_concat_params *params, struct csinn_perf_info *perf_info); + +int shl_ref_conv1d_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv1d_params *params, struct csinn_perf_info *perf_info); + +int shl_ref_conv2d_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct 
csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params, struct csinn_perf_info *perf_info); + +int shl_ref_conv2d_channel_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params, + struct csinn_perf_info *perf_info); + +int shl_ref_conv2d_relu_perf(struct csinn_tensor *o_input, struct csinn_tensor *o_output, + struct csinn_tensor *o_kernel, struct csinn_tensor *o_bias, + struct csinn_conv2d_params *params, struct csinn_perf_info *perf_info); + +int shl_ref_cache_matmul_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weight, struct csinn_tensor *bias, + struct csinn_cache_matmul_params *params, + struct csinn_perf_info *perf_info); + +int shl_ref_cache_conv1d_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weight, struct csinn_tensor *bias, + struct csinn_cache_conv1d_params *params, + struct csinn_perf_info *perf_info); + +int shl_ref_conv2d_channel_relu_perf(struct csinn_tensor *o_input, struct csinn_tensor *o_output, + struct csinn_tensor *o_kernel, struct csinn_tensor *o_bias, + struct csinn_conv2d_params *params, + struct csinn_perf_info *perf_info); + +int shl_ref_conv2d_relu6_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params, + struct csinn_perf_info *perf_info); + +int shl_ref_conv2d_channel_relu6_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params, + struct csinn_perf_info *perf_info); + +int shl_ref_depthwise_conv2d_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params, + struct csinn_perf_info *perf_info); + +int 
shl_ref_depthwise_conv2d_channel_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params, + struct csinn_perf_info *perf_info); + +int shl_ref_depthwise_conv2d_relu_perf(struct csinn_tensor *o_input, struct csinn_tensor *o_output, + struct csinn_tensor *o_kernel, struct csinn_tensor *o_bias, + struct csinn_conv2d_params *params, + struct csinn_perf_info *perf_info); + +int shl_ref_depthwise_conv2d_channel_relu_perf(struct csinn_tensor *o_input, + struct csinn_tensor *o_output, + struct csinn_tensor *o_kernel, + struct csinn_tensor *o_bias, + struct csinn_conv2d_params *params, + struct csinn_perf_info *perf_info); + +int shl_ref_depthwise_conv2d_relu6_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params, + struct csinn_perf_info *perf_info); + +int shl_ref_depthwise_conv2d_channel_relu6_perf(struct csinn_tensor *input, + struct csinn_tensor *output, + struct csinn_tensor *kernel, + struct csinn_tensor *bias, + struct csinn_conv2d_params *params, + struct csinn_perf_info *perf_info); + +int shl_ref_group_conv2d_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params, + struct csinn_perf_info *perf_info); + +int shl_ref_group_conv2d_channel_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params, + struct csinn_perf_info *perf_info); + +int shl_ref_group_conv2d_relu_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params, + struct csinn_perf_info *perf_info); + +int shl_ref_group_conv2d_relu6_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct 
csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params, + struct csinn_perf_info *perf_info); + +int shl_ref_group_conv2d_channel_relu_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params, + struct csinn_perf_info *perf_info); + +int shl_ref_conv3d_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv3d_params *params, struct csinn_perf_info *perf_info); + +int shl_ref_cos_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params, struct csinn_perf_info *perf_info); + +int shl_ref_cosh_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params, struct csinn_perf_info *perf_info); + +int shl_ref_cumprod_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_cumprod_params *params, struct csinn_perf_info *perf_info); + +int shl_ref_cumsum_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_cumsum_params *params, struct csinn_perf_info *perf_info); + +int shl_ref_data_convert_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params, struct csinn_perf_info *perf_info); + +int shl_ref_deconv2d_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params, struct csinn_perf_info *perf_info); + +int shl_ref_depthwise_deconv2d_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params, + struct csinn_perf_info *perf_info); + +int shl_ref_group_deconv2d_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params 
*params, + struct csinn_perf_info *perf_info); + +int shl_ref_deconv3d_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv3d_params *params, struct csinn_perf_info *perf_info); + +int shl_ref_depth_to_space_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_depth_to_space_params *params, + struct csinn_perf_info *perf_info); + +int shl_ref_div_perf(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params, + struct csinn_perf_info *perf_info); + +int shl_ref_elu_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params, struct csinn_perf_info *perf_info); + +int shl_ref_fsmn_perf(struct csinn_tensor *frame, struct csinn_tensor *l_filter, + struct csinn_tensor *r_filter, struct csinn_tensor *frame_sequence, + struct csinn_tensor *frame_counter, struct csinn_tensor *output, + struct csinn_fsmn_params *params, struct csinn_perf_info *perf_info); + +int shl_ref_equal_perf(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params, + struct csinn_perf_info *perf_info); + +int shl_ref_erf_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params, struct csinn_perf_info *perf_info); + +int shl_ref_exp_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params, struct csinn_perf_info *perf_info); + +int shl_ref_expand_dims_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_expand_dims_params *params, + struct csinn_perf_info *perf_info); + +int shl_ref_expm1_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params, struct csinn_perf_info *perf_info); + +int shl_ref_flatten_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct 
csinn_flatten_params *params, struct csinn_perf_info *perf_info); + +int shl_ref_floor_divide_perf(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params, + struct csinn_perf_info *perf_info); + +int shl_ref_floor_mod_perf(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params, + struct csinn_perf_info *perf_info); + +int shl_ref_floor_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params, struct csinn_perf_info *perf_info); + +int shl_ref_fullyconnected_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weights, struct csinn_tensor *bias, + struct csinn_fc_params *params, struct csinn_perf_info *perf_info); + +int shl_ref_gather_nd_perf(struct csinn_tensor *input, struct csinn_tensor *indices, + struct csinn_tensor *output, struct csinn_gather_nd_params *params, + struct csinn_perf_info *perf_info); + +int shl_ref_gather_perf(struct csinn_tensor *input, struct csinn_tensor *indices, + struct csinn_tensor *output, struct csinn_gather_params *params, + struct csinn_perf_info *perf_info); + +int shl_ref_global_avgpool2d_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params, + struct csinn_perf_info *perf_info); + +int shl_ref_global_maxpool2d_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params, + struct csinn_perf_info *perf_info); + +int shl_ref_greater_equal_perf(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params, + struct csinn_perf_info *perf_info); + +int shl_ref_greater_perf(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params, + struct csinn_perf_info *perf_info); + +int shl_ref_hard_sigmoid_perf(struct csinn_tensor *input, 
struct csinn_tensor *output, + struct csinn_sigmoid_params *params, + struct csinn_perf_info *perf_info); + +int shl_ref_im2col_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_im2col_params *params, struct csinn_perf_info *perf_info); + +int shl_ref_isnan_bool_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params, struct csinn_perf_info *perf_info); + +int shl_ref_l2_normalization_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_l2n_params *params, + struct csinn_perf_info *perf_info); + +int shl_ref_l2pool_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params, struct csinn_perf_info *perf_info); + +int shl_ref_layer_norm_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *gamma, struct csinn_tensor *beta, + struct csinn_layer_norm_params *params, + struct csinn_perf_info *perf_info); + +int shl_ref_leaky_relu_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params, struct csinn_perf_info *perf_info); + +int shl_ref_less_equal_perf(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params, + struct csinn_perf_info *perf_info); + +int shl_ref_less_perf(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params, + struct csinn_perf_info *perf_info); + +int shl_ref_log_softmax_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_softmax_params *params, + struct csinn_perf_info *perf_info); + +int shl_ref_log_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params, struct csinn_perf_info *perf_info); + +int shl_ref_log1p_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params, struct csinn_perf_info *perf_info); + +int 
shl_ref_logical_and_perf(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params, + struct csinn_perf_info *perf_info); + +int shl_ref_logical_not_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params, struct csinn_perf_info *perf_info); + +int shl_ref_logical_or_perf(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params, + struct csinn_perf_info *perf_info); + +int shl_ref_logical_xor_perf(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params, + struct csinn_perf_info *perf_info); + +int shl_ref_lrn_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_lrn_params *params, struct csinn_perf_info *perf_info); + +int shl_ref_matmul_perf(struct csinn_tensor *mat0, struct csinn_tensor *mat1, + struct csinn_tensor *output, struct csinn_matmul_params *params, + struct csinn_perf_info *perf_info); + +int shl_ref_max_stride_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params, struct csinn_perf_info *perf_info); + +int shl_ref_maximum_perf(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params, + struct csinn_perf_info *perf_info); + +int shl_ref_maxpool2d_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params, struct csinn_perf_info *perf_info); + +int shl_ref_maxpool2d_locat_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params, + struct csinn_perf_info *perf_info); + +int shl_ref_maxpool3d_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params, struct csinn_perf_info *perf_info); + +int shl_ref_mean_stride_perf(struct csinn_tensor *input, struct csinn_tensor *output, + 
struct csinn_reduce_params *params, struct csinn_perf_info *perf_info); + +int shl_ref_mean_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params, struct csinn_perf_info *perf_info); + +int shl_ref_min_stride_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params, struct csinn_perf_info *perf_info); + +int shl_ref_minimum_perf(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params, + struct csinn_perf_info *perf_info); + +int shl_ref_mod_perf(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params, + struct csinn_perf_info *perf_info); + +int shl_ref_mul_perf(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params, + struct csinn_perf_info *perf_info); + +int shl_ref_ndarray_size_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_ndarray_size_params *params, + struct csinn_perf_info *perf_info); + +int shl_ref_negative_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params, struct csinn_perf_info *perf_info); + +int shl_ref_non_max_suppression_std_perf(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, + struct csinn_non_max_suppression_params *params, + struct csinn_perf_info *perf_info); + +int shl_ref_not_equal_perf(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params, + struct csinn_perf_info *perf_info); + +int shl_ref_not_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params, struct csinn_perf_info *perf_info); + +int shl_ref_or_perf(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params, + struct 
csinn_perf_info *perf_info); + +int shl_ref_pad_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pad_params *params, struct csinn_perf_info *perf_info); + +int shl_ref_power_perf(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params, + struct csinn_perf_info *perf_info); + +int shl_ref_prelu_perf(struct csinn_tensor *input, struct csinn_tensor *alpha, + struct csinn_tensor *output, struct csinn_prelu_params *params, + struct csinn_perf_info *perf_info); + +int shl_ref_prod_stride_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params, struct csinn_perf_info *perf_info); + +int shl_ref_proposal_perf(struct csinn_tensor *cls_prob, struct csinn_tensor *bbox_pred, + struct csinn_tensor *im_info, struct csinn_tensor *output, + struct csinn_proposal_params *params, struct csinn_perf_info *perf_info); + +int shl_ref_psroipooling_perf(struct csinn_tensor *data, struct csinn_tensor *rois, + struct csinn_tensor *output, struct csinn_psroipooling_params *params, + struct csinn_perf_info *perf_info); + +int shl_ref_reduce_logsumexp_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params, + struct csinn_perf_info *perf_info); + +int shl_ref_reduce_max_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params, struct csinn_perf_info *perf_info); + +int shl_ref_reduce_mean_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params, struct csinn_perf_info *perf_info); + +int shl_ref_reduce_min_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params, struct csinn_perf_info *perf_info); + +int shl_ref_reduce_prod_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params, struct csinn_perf_info *perf_info); + +int 
shl_ref_reduce_sum_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params, struct csinn_perf_info *perf_info); + +int shl_ref_relu_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params, struct csinn_perf_info *perf_info); + +int shl_ref_relu1_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params, struct csinn_perf_info *perf_info); + +int shl_ref_relu6_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params, struct csinn_perf_info *perf_info); + +int shl_ref_relun_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params, struct csinn_perf_info *perf_info); + +int shl_ref_reshape_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reshape_params *params, struct csinn_perf_info *perf_info); + +int shl_ref_resize_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_resize_params *params, struct csinn_perf_info *perf_info); + +int shl_ref_reverse_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reverse_params *params, struct csinn_perf_info *perf_info); + +int shl_ref_roi_align_perf(struct csinn_tensor *data, struct csinn_tensor *rois, + struct csinn_tensor *output, struct csinn_roi_align_params *params, + struct csinn_perf_info *perf_info); + +int shl_ref_roipool_perf(struct csinn_tensor *data, struct csinn_tensor *rois, + struct csinn_tensor *output, struct csinn_roi_pool_params *params, + struct csinn_perf_info *perf_info); + +int shl_ref_round_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params, struct csinn_perf_info *perf_info); + +int shl_ref_rsqrt_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params, struct csinn_perf_info *perf_info); + +int shl_ref_scatter_nd_perf(struct csinn_tensor 
*input, struct csinn_tensor *indices, + struct csinn_tensor *updates, struct csinn_tensor *output, + struct csinn_scatter_nd_params *params, + struct csinn_perf_info *perf_info); + +int shl_ref_unsorted_segment_max_perf(struct csinn_tensor *input, struct csinn_tensor *segment_ids, + struct csinn_tensor *output, + struct csinn_segment_params *params, + struct csinn_perf_info *perf_info); + +int shl_ref_segment_max_perf(struct csinn_tensor *input, struct csinn_tensor *segment_ids, + struct csinn_tensor *output, struct csinn_segment_params *params, + struct csinn_perf_info *perf_info); + +int shl_ref_unsorted_segment_mean_perf(struct csinn_tensor *input, struct csinn_tensor *segment_ids, + struct csinn_tensor *output, + struct csinn_segment_params *params, + struct csinn_perf_info *perf_info); + +int shl_ref_segment_mean_perf(struct csinn_tensor *input, struct csinn_tensor *segment_ids, + struct csinn_tensor *output, struct csinn_segment_params *params, + struct csinn_perf_info *perf_info); + +int shl_ref_unsorted_segment_min_perf(struct csinn_tensor *input, struct csinn_tensor *segment_ids, + struct csinn_tensor *output, + struct csinn_segment_params *params, + struct csinn_perf_info *perf_info); + +int shl_ref_segment_min_perf(struct csinn_tensor *input, struct csinn_tensor *segment_ids, + struct csinn_tensor *output, struct csinn_segment_params *params, + struct csinn_perf_info *perf_info); + +int shl_ref_unsorted_segment_prod_perf(struct csinn_tensor *input, struct csinn_tensor *segment_ids, + struct csinn_tensor *output, + struct csinn_segment_params *params, + struct csinn_perf_info *perf_info); + +int shl_ref_segment_prod_perf(struct csinn_tensor *input, struct csinn_tensor *segment_ids, + struct csinn_tensor *output, struct csinn_segment_params *params, + struct csinn_perf_info *perf_info); + +int shl_ref_unsorted_segment_sum_perf(struct csinn_tensor *input, struct csinn_tensor *segment_ids, + struct csinn_tensor *output, + struct csinn_segment_params *params, 
+ struct csinn_perf_info *perf_info); + +int shl_ref_segment_sum_perf(struct csinn_tensor *input, struct csinn_tensor *segment_ids, + struct csinn_tensor *output, struct csinn_segment_params *params, + struct csinn_perf_info *perf_info); + +int shl_ref_select_perf(struct csinn_tensor *condition, struct csinn_tensor *input0, + struct csinn_tensor *input1, struct csinn_tensor *output, + struct csinn_select_params *params, struct csinn_perf_info *perf_info); + +int shl_ref_shape_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_shape_params *params, struct csinn_perf_info *perf_info); + +int shl_ref_shuffle_channel_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_shuffle_channel_params *params, + struct csinn_perf_info *perf_info); + +int shl_ref_sigmoid_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_sigmoid_params *params, struct csinn_perf_info *perf_info); + +int shl_ref_silu_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_sigmoid_params *params, struct csinn_perf_info *perf_info); + +int shl_ref_sign_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params, struct csinn_perf_info *perf_info); + +int shl_ref_sin_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params, struct csinn_perf_info *perf_info); + +int shl_ref_sinh_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params, struct csinn_perf_info *perf_info); + +int shl_ref_slice_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_slice_params *params, struct csinn_perf_info *perf_info); + +int shl_ref_softmax_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_softmax_params *params, struct csinn_perf_info *perf_info); + +int shl_ref_softplus_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct 
csinn_siso_params *params, struct csinn_perf_info *perf_info); + +int shl_ref_softrelu_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params, struct csinn_perf_info *perf_info); + +int shl_ref_softsign_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params, struct csinn_perf_info *perf_info); + +int shl_ref_space_to_batch_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_space_to_batch_params *params, + struct csinn_perf_info *perf_info); + +int shl_ref_space_to_depth_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_space_to_depth_params *params, + struct csinn_perf_info *perf_info); + +int shl_ref_split_perf(struct csinn_tensor *input, struct csinn_tensor **output, + struct csinn_split_params *params, struct csinn_perf_info *perf_info); + +int shl_ref_sqrt_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params, struct csinn_perf_info *perf_info); + +int shl_ref_square_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params, struct csinn_perf_info *perf_info); + +int shl_ref_squeeze_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_squeeze_params *params, struct csinn_perf_info *perf_info); + +int shl_ref_stack_perf(struct csinn_tensor **input, struct csinn_tensor *output, + struct csinn_stack_params *params, struct csinn_perf_info *perf_info); + +int shl_ref_strided_slice_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_strided_slice_params *params, + struct csinn_perf_info *perf_info); + +int shl_ref_sub_perf(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params, + struct csinn_perf_info *perf_info); + +int shl_ref_sum_stride_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params, 
struct csinn_perf_info *perf_info); + +int shl_ref_tan_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params, struct csinn_perf_info *perf_info); + +int shl_ref_tanh_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params, struct csinn_perf_info *perf_info); + +int shl_ref_threshold_relu_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params, + struct csinn_perf_info *perf_info); + +int shl_ref_tile_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tile_params *params, struct csinn_perf_info *perf_info); + +int shl_ref_topk_perf(struct csinn_tensor *input, struct csinn_tensor *output1, + struct csinn_tensor *output2, struct csinn_topk_params *params, + struct csinn_perf_info *perf_info); + +int shl_ref_transpose_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_transpose_params *params, + struct csinn_perf_info *perf_info); + +int shl_ref_trunc_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params, struct csinn_perf_info *perf_info); + +int shl_ref_unpooling_perf(struct csinn_tensor *input, struct csinn_tensor *mask, + struct csinn_tensor *output, struct csinn_unpooling_params *params, + struct csinn_perf_info *perf_info); + +int shl_ref_unstack_perf(struct csinn_tensor *input, struct csinn_tensor **output, + struct csinn_unstack_params *params, struct csinn_perf_info *perf_info); + +int shl_ref_xor_perf(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params, + struct csinn_perf_info *perf_info); + +int shl_ref_yuv_rgb_scale_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params, struct csinn_perf_info *perf_info); + +int shl_ref_one_hot_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_one_hot_params *params, 
struct csinn_perf_info *perf_info); + +int shl_ref_where_perf(struct csinn_tensor *condition, struct csinn_tensor *x, + struct csinn_tensor *y, struct csinn_tensor *output, + struct csinn_where_params *params, struct csinn_perf_info *perf_info); + +int shl_ref_where_softmax_perf(struct csinn_tensor *condition, struct csinn_tensor *y, + struct csinn_tensor *output, + struct csinn_where_softmax_params *params, + struct csinn_perf_info *perf_info); + +int shl_ref_cast_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_cast_params *params, struct csinn_perf_info *perf_info); + +int shl_ref_instance_norm_perf(struct csinn_tensor *input, struct csinn_tensor *scales, + struct csinn_tensor *bias, struct csinn_tensor *output, + struct csinn_instance_norm_params *params, + struct csinn_perf_info *perf_info); + +int shl_ref_rms_norm_perf(struct csinn_tensor *input, struct csinn_tensor *weight, + struct csinn_tensor *output, struct csinn_rms_norm_params *params, + struct csinn_perf_info *perf_info); + +int shl_ref_rope_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_rope_params *params, struct csinn_perf_info *perf_info); + +int shl_ref_llm_pos_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_llm_pos_params *params, struct csinn_perf_info *perf_info); + +int shl_ref_embedding_perf(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params, + struct csinn_perf_info *perf_info); + +int shl_ref_scaled_dot_product_attention_perf(struct csinn_tensor *query, struct csinn_tensor *key, + struct csinn_tensor *value, + struct csinn_tensor *output, + struct csinn_scale_dot_attention_params *params, + struct csinn_perf_info *perf_info); + +#ifdef __cplusplus +} +#endif + +#endif // INCLUDE_SHL_REF_PERF_H_ diff --git a/include/backend/reference/ref.h b/include/backend/reference/ref.h index d80168a6..ce7de9a4 100644 --- 
a/include/backend/reference/ref.h +++ b/include/backend/reference/ref.h @@ -1192,11 +1192,11 @@ int shl_ref_instance_norm_quant(struct csinn_tensor *input, struct csinn_tensor struct csinn_tensor *bias, struct csinn_tensor *output, struct csinn_instance_norm_params *params); -int shl_ref_rms_norm_f32(struct csinn_tensor *input, struct csinn_tensor *output, - struct csinn_tensor *weight, struct csinn_rms_norm_params *params); +int shl_ref_rms_norm_f32(struct csinn_tensor *input, struct csinn_tensor *weight, + struct csinn_tensor *output, struct csinn_rms_norm_params *params); -int shl_ref_rms_norm_quant(struct csinn_tensor *input, struct csinn_tensor *output, - struct csinn_tensor *weight, struct csinn_rms_norm_params *params); +int shl_ref_rms_norm_quant(struct csinn_tensor *input, struct csinn_tensor *weight, + struct csinn_tensor *output, struct csinn_rms_norm_params *params); int shl_ref_rope_f32(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_rope_params *params); diff --git a/include/backend/rvm/rvm.h b/include/backend/rvm/rvm.h index 6291bc8a..876fd928 100644 --- a/include/backend/rvm/rvm.h +++ b/include/backend/rvm/rvm.h @@ -41,20 +41,6 @@ int shl_rvm_depthwise_conv2d_init_int8(struct csinn_tensor *input, struct csinn_ struct csinn_tensor *kernel, struct csinn_tensor *bias, struct csinn_conv2d_params *params); -int shl_rvm_avgpool2d_init_fp16(struct csinn_tensor *input, struct csinn_tensor *output, - struct csinn_pool_params *params); -int shl_rvm_avgpool2d_init_int8(struct csinn_tensor *input, struct csinn_tensor *output, - struct csinn_pool_params *params); -int shl_rvm_global_avgpool2d_init(struct csinn_tensor *input, struct csinn_tensor *output, - struct csinn_pool_params *params); -int shl_rvm_maxpool2d_init_fp16(struct csinn_tensor *input, struct csinn_tensor *output, - struct csinn_pool_params *params); -int shl_rvm_maxpool2d_init_int8(struct csinn_tensor *input, struct csinn_tensor *output, - struct csinn_pool_params *params); - 
-int shl_rvm_global_maxpool2d_init(struct csinn_tensor *input, struct csinn_tensor *output, - struct csinn_pool_params *params); - int shl_rvm_fullyconnected_init_fp16(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_tensor *weights, struct csinn_tensor *bias, struct csinn_fc_params *params); @@ -134,6 +120,10 @@ void shl_rvm_fc_gemm_reorder_weight_fp16(struct csinn_tensor *weights); void shl_rvm_fc_gemm_reorder_weight_fp16_w_int8(struct csinn_tensor *weights); void shl_rvm_fc_gemm_reorder_weight_int8(struct csinn_tensor *weights); +void shl_rvm_fc_dequantize_per_channel_i8_to_f16(struct csinn_tensor *weights, + struct csinn_fc_params *params, + __fp16 *weights_fp16); + int shl_rvm_fullyconnected_gemm_fp16(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_tensor *weights, struct csinn_tensor *bias, struct csinn_fc_params *params); diff --git a/include/backend/rvv/cap.h b/include/backend/rvv/cap.h index 375d7f4f..6f6dbffb 100644 --- a/include/backend/rvv/cap.h +++ b/include/backend/rvv/cap.h @@ -121,7 +121,24 @@ int shl_rvv_split_cap(struct csinn_tensor *input, struct csinn_tensor **output, int shl_rvv_silu_cap(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_sigmoid_params *params); -int shl_rvv_rms_norm_cap(struct csinn_tensor *input, struct csinn_tensor *output, - struct csinn_tensor *weight, struct csinn_rms_norm_params *params); +int shl_rvv_rms_norm_cap(struct csinn_tensor *input, struct csinn_tensor *weight, + struct csinn_tensor *output, struct csinn_rms_norm_params *params); + +int shl_rvv_embedding_cap(struct csinn_tensor *input, struct csinn_tensor *weight, + struct csinn_tensor *output, struct csinn_diso_params *params); + +int shl_rvv_expand_dims_cap(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_expand_dims_params *params); + +int shl_rvv_rope_cap(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_rope_params *params); + +int 
shl_rvv_scaled_dot_product_attention_cap(struct csinn_tensor *query, struct csinn_tensor *key, + struct csinn_tensor *value, + struct csinn_tensor *output_tensor, + struct csinn_scale_dot_attention_params *params); + +int shl_rvv_llm_pos_cap(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_llm_pos_params *params); #endif // INCLUDE_SHL_RVV_CAP_H_ diff --git a/include/backend/rvv/perf.h b/include/backend/rvv/perf.h new file mode 100644 index 00000000..e227ddb3 --- /dev/null +++ b/include/backend/rvv/perf.h @@ -0,0 +1,161 @@ +/* + * Copyright (C) 2016-2023 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef INCLUDE_SHL_RVV_PERF_H_ +#define INCLUDE_SHL_RVV_PERF_H_ + +#include "csi_nn.h" +#include "shl_utils.h" + +int shl_rvv_conv2d_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params, struct csinn_perf_info *perf_info); + +int shl_rvv_depthwise_conv2d_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params, + struct csinn_perf_info *perf_info); + +int shl_rvv_conv1d_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv1d_params *params, struct csinn_perf_info *perf_info); + +int shl_rvv_deconv2d_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params, struct csinn_perf_info *perf_info); + +int shl_rvv_fullyconnected_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weights, struct csinn_tensor *bias, + struct csinn_fc_params *params, struct csinn_perf_info *perf_info); + +int shl_rvv_maxpool2d_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params, struct csinn_perf_info *perf_info); + +int shl_rvv_avgpool2d_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params, struct csinn_perf_info *perf_info); + +int shl_rvv_add_perf(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params, + struct csinn_perf_info *perf_info); + +int shl_rvv_sub_perf(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params, + struct csinn_perf_info *perf_info); + +int shl_rvv_mul_perf(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct 
csinn_tensor *output, struct csinn_diso_params *params, + struct csinn_perf_info *perf_info); + +int shl_rvv_div_perf(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params, + struct csinn_perf_info *perf_info); + +int shl_rvv_concat_perf(struct csinn_tensor **input, struct csinn_tensor *output, + struct csinn_clip_params *params, struct csinn_perf_info *perf_info); + +int shl_rvv_leaky_relu_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params, struct csinn_perf_info *perf_info); + +int shl_rvv_relu_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params, struct csinn_perf_info *perf_info); + +int shl_rvv_relu6_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params, struct csinn_perf_info *perf_info); + +int shl_rvv_global_avgpool2d_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params, + struct csinn_perf_info *perf_info); + +int shl_rvv_global_maxpool2d_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params, + struct csinn_perf_info *perf_info); + +int shl_rvv_reshape_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reshape_params *params, struct csinn_perf_info *perf_info); + +int shl_rvv_sigmoid_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_sigmoid_params *params, struct csinn_perf_info *perf_info); + +int shl_rvv_softmax_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_softmax_params *params, struct csinn_perf_info *perf_info); + +int shl_rvv_reduce_sum_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params, struct csinn_perf_info *perf_info); + +int shl_rvv_prelu_perf(struct csinn_tensor *input, struct csinn_tensor *alpha, + struct csinn_tensor *output, 
struct csinn_prelu_params *params, + struct csinn_perf_info *perf_info); + +int shl_rvv_layer_norm_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *gamma, struct csinn_tensor *beta, + struct csinn_layer_norm_params *params, + struct csinn_perf_info *perf_info); + +int shl_rvv_clip_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_clip_params *params, struct csinn_perf_info *perf_info); + +int shl_rvv_transpose_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_transpose_params *params, + struct csinn_perf_info *perf_info); + +int shl_rvv_matmul_perf(struct csinn_tensor *mat0, struct csinn_tensor *mat1, + struct csinn_tensor *output, struct csinn_matmul_params *params, + struct csinn_perf_info *perf_info); + +int shl_rvv_gather_perf(struct csinn_tensor *input, struct csinn_tensor *indices, + struct csinn_tensor *output, struct csinn_gather_params *params, + struct csinn_perf_info *perf_info); + +int shl_rvv_erf_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_clip_params *params, struct csinn_perf_info *perf_info); + +int shl_rvv_strided_slice_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_strided_slice_params *params, + struct csinn_perf_info *perf_info); + +int shl_rvv_split_perf(struct csinn_tensor *input, struct csinn_tensor **output, + struct csinn_split_params *params, struct csinn_perf_info *perf_info); + +int shl_rvv_silu_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_sigmoid_params *params, struct csinn_perf_info *perf_info); + +int shl_rvv_rms_norm_perf(struct csinn_tensor *input, struct csinn_tensor *weight, + struct csinn_tensor *output, struct csinn_rms_norm_params *params, + struct csinn_perf_info *perf_info); + +int shl_rvv_embedding_perf(struct csinn_tensor *input, struct csinn_tensor *weight, + struct csinn_tensor *output, struct csinn_diso_params *params, + struct 
csinn_perf_info *perf_info); + +int shl_rvv_expand_dims_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_expand_dims_params *params, + struct csinn_perf_info *perf_info); + +int shl_rvv_rope_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_rope_params *params, struct csinn_perf_info *perf_info); + +int shl_rvv_scaled_dot_product_attention_perf(struct csinn_tensor *query, struct csinn_tensor *key, + struct csinn_tensor *value, + struct csinn_tensor *output_tensor, + struct csinn_scale_dot_attention_params *params, + struct csinn_perf_info *perf_info); + +int shl_rvv_llm_pos_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_llm_pos_params *params, struct csinn_perf_info *perf_info); + +#endif // INCLUDE_SHL_RVV_PERF_H_ diff --git a/include/backend/rvv/rvv.h b/include/backend/rvv/rvv.h index 79e83f87..a813dff6 100644 --- a/include/backend/rvv/rvv.h +++ b/include/backend/rvv/rvv.h @@ -29,9 +29,11 @@ #endif #ifdef __riscv_xtheadvdot +#ifndef SHL_DISABLE_VDOT #define XTHEADVDOT #define SHL_USE_DOT_INT8 // default: support int8 dot // #define SHL_USE_DOT_INT4 // easter eggs +#endif // SHL_DISABLE_VDOT #endif // __riscv_xtheadvdot #endif // __riscv_vector @@ -749,44 +751,6 @@ int shl_rvv_global_maxpool2d_fp32(struct csinn_tensor *input, struct csinn_tenso int shl_rvv_global_maxpool2d_fp16(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_pool_params *params); -int shl_rvv_maxpool2x2s2_packn_fp32(struct csinn_tensor *input, struct csinn_tensor *output, - struct csinn_pool_params *params); -int shl_rvv_maxpool2x2s2_packn_fp16(struct csinn_tensor *input, struct csinn_tensor *output, - struct csinn_pool_params *params); -int shl_rvv_maxpool3x3s2_packn_fp32(struct csinn_tensor *input, struct csinn_tensor *output, - struct csinn_pool_params *params); -int shl_rvv_maxpool3x3s1_packn_fp32(struct csinn_tensor *input, struct csinn_tensor *output, - struct csinn_pool_params 
*params); -int shl_rvv_maxpool3x3s2_packn_fp16(struct csinn_tensor *input, struct csinn_tensor *output, - struct csinn_pool_params *params); -int shl_rvv_maxpool3x3s1_packn_fp16(struct csinn_tensor *input, struct csinn_tensor *output, - struct csinn_pool_params *params); -int shl_rvv_maxpool2x2s2_packn_int8(struct csinn_tensor *input, struct csinn_tensor *output, - struct csinn_pool_params *params); -int shl_rvv_maxpool3x3s2_packn_int8(struct csinn_tensor *input, struct csinn_tensor *output, - struct csinn_pool_params *params); -int shl_rvv_maxpool3x3s1_packn_int8(struct csinn_tensor *input, struct csinn_tensor *output, - struct csinn_pool_params *params); - -int shl_rvv_avgpool2x2s2_packn_fp32(struct csinn_tensor *input, struct csinn_tensor *output, - struct csinn_pool_params *params); -int shl_rvv_avgpool2x2s2_packn_fp16(struct csinn_tensor *input, struct csinn_tensor *output, - struct csinn_pool_params *params); -int shl_rvv_avgpool2x2s2_packn_int8(struct csinn_tensor *input, struct csinn_tensor *output, - struct csinn_pool_params *params); -int shl_rvv_avgpool3x3s2_packn_fp32(struct csinn_tensor *input, struct csinn_tensor *output, - struct csinn_pool_params *params); -int shl_rvv_avgpool3x3s1_packn_fp32(struct csinn_tensor *input, struct csinn_tensor *output, - struct csinn_pool_params *params); -int shl_rvv_avgpool3x3s2_packn_fp16(struct csinn_tensor *input, struct csinn_tensor *output, - struct csinn_pool_params *params); -int shl_rvv_avgpool3x3s1_packn_fp16(struct csinn_tensor *input, struct csinn_tensor *output, - struct csinn_pool_params *params); -int shl_rvv_avgpool3x3s2_packn_int8(struct csinn_tensor *input, struct csinn_tensor *output, - struct csinn_pool_params *params); -int shl_rvv_avgpool3x3s1_packn_int8(struct csinn_tensor *input, struct csinn_tensor *output, - struct csinn_pool_params *params); - int shl_rvv_global_maxpool2d_packn_fp32(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_pool_params *params); int 
shl_rvv_global_maxpool2d_packn_fp16(struct csinn_tensor *input, struct csinn_tensor *output, @@ -847,6 +811,10 @@ void shl_rvv_fc_gemm_reorder_weight_fp16(struct csinn_tensor *weights); void shl_rvv_fc_gemm_reorder_weight_fp16_w_int8(struct csinn_tensor *weights); void shl_rvv_fc_gemm_reorder_weight_int8(struct csinn_tensor *weights); +void shl_rvv_fc_npack2n_dequantize_per_channel_i8_to_f16(struct csinn_tensor *weights, + struct csinn_fc_params *params, + __fp16 *weights_fp16); + void shl_rvv_gemm_a0b1_12xpack2n_fp32(float *dst, const float *sa, const float *sb, float *bias, int M, int K, int N); void shl_rvv_gemm_a0b1_12xpack2n_fp16(__fp16 *dst, const __fp16 *sa, const __fp16 *sb, __fp16 *bias, @@ -964,6 +932,11 @@ int shl_rvv_gather_int8(struct csinn_tensor *input, struct csinn_tensor *indices int shl_rvv_strided_slice_fp16(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_strided_slice_params *params); +int shl_rvv_expand_dims_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_expand_dims_params *params); +int shl_rvv_expand_dims_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_expand_dims_params *params); + /************************************ basic math *********************************/ int shl_rvv_add_fp32(struct csinn_tensor *input0, struct csinn_tensor *input1, struct csinn_tensor *output, struct csinn_diso_params *params); @@ -1014,12 +987,12 @@ int shl_rvv_layer_norm_int8(struct csinn_tensor *input, struct csinn_tensor *out struct csinn_tensor *gamma, struct csinn_tensor *beta, struct csinn_layer_norm_params *params); -int shl_rvv_rms_norm_fp32(struct csinn_tensor *input, struct csinn_tensor *output, - struct csinn_tensor *weight, struct csinn_rms_norm_params *params); -int shl_rvv_rms_norm_fp16(struct csinn_tensor *input, struct csinn_tensor *output, - struct csinn_tensor *weight, struct csinn_rms_norm_params *params); -int shl_rvv_rms_norm_int8(struct csinn_tensor *input, 
struct csinn_tensor *output, - struct csinn_tensor *weight, struct csinn_rms_norm_params *params); +int shl_rvv_rms_norm_fp32(struct csinn_tensor *input, struct csinn_tensor *weight, + struct csinn_tensor *output, struct csinn_rms_norm_params *params); +int shl_rvv_rms_norm_fp16(struct csinn_tensor *input, struct csinn_tensor *weight, + struct csinn_tensor *output, struct csinn_rms_norm_params *params); +int shl_rvv_rms_norm_int8(struct csinn_tensor *input, struct csinn_tensor *weight, + struct csinn_tensor *output, struct csinn_rms_norm_params *params); /*********************************** matmul *********************************/ void shl_rvv_matmul_reorder_weight_fp32(struct csinn_tensor *mat1, const int K_BLK, @@ -1067,6 +1040,27 @@ int shl_rvv_matmul_fp16(struct csinn_tensor *mat0, struct csinn_tensor *mat1, int shl_rvv_matmul_int8(struct csinn_tensor *mat0, struct csinn_tensor *mat1, struct csinn_tensor *output, struct csinn_matmul_params *params); +/******************************** llm *****************************/ +int shl_rvv_embedding_int32(struct csinn_tensor *input, struct csinn_tensor *weight, + struct csinn_tensor *output, struct csinn_diso_params *params); + +int shl_rvv_rope_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_rope_params *params); +int shl_rvv_rope_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_rope_params *params); + +int shl_rvv_scaled_dot_product_attention_fp32(struct csinn_tensor *query, struct csinn_tensor *key, + struct csinn_tensor *value, + struct csinn_tensor *output_tensor, + struct csinn_scale_dot_attention_params *params); +int shl_rvv_scaled_dot_product_attention_fp16(struct csinn_tensor *query, struct csinn_tensor *key, + struct csinn_tensor *value, + struct csinn_tensor *output_tensor, + struct csinn_scale_dot_attention_params *params); + +int shl_rvv_llm_pos_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_llm_pos_params *params); + 
/************************************ utils *********************************/ void shl_rvv_pad_input_fp32(const float *input, float *input_padded, int inc, int inh, int inw, int padded_h, int padded_w, int pad_top, int pad_left); @@ -1189,7 +1183,7 @@ void shl_rvv_nc1xc0_fp16_to_nchw_fp32(struct csinn_tensor *dest, struct csinn_te struct csinn_callback *shl_cb_map_rvv(int op, int dtype); void shl_rvv_reg_op(enum csinn_dtype_enum dtype, enum csinn_op_enum op_name, void *init, void *exec, - void *est, void *cap); + void *est, void *cap, void *perf); int csrr_vl(); int csrr_vlenb(); diff --git a/include/csinn/csi_nn.h b/include/csinn/csi_nn.h index 0b64a436..7c31b6be 100644 --- a/include/csinn/csi_nn.h +++ b/include/csinn/csi_nn.h @@ -4546,8 +4546,8 @@ int csinn_layer_norm(struct csinn_tensor *input, struct csinn_tensor *output, * If an error occurred while executing the function, the return value is less than or * equal to 0. */ -int csinn_rms_norm_init(struct csinn_tensor *input, struct csinn_tensor *output, - struct csinn_tensor *weights, struct csinn_rms_norm_params *params); +int csinn_rms_norm_init(struct csinn_tensor *input, struct csinn_tensor *weights, + struct csinn_tensor *output, struct csinn_rms_norm_params *params); /** * @brief RMS normalization function @@ -4560,8 +4560,8 @@ int csinn_rms_norm_init(struct csinn_tensor *input, struct csinn_tensor *output, * If an error occurred while executing the function, the return value is less than or * equal to 0. 
*/ -int csinn_rms_norm(struct csinn_tensor *input, struct csinn_tensor *output, - struct csinn_tensor *weights, struct csinn_rms_norm_params *params); +int csinn_rms_norm(struct csinn_tensor *input, struct csinn_tensor *weights, + struct csinn_tensor *output, struct csinn_rms_norm_params *params); /** * @brief Cache matmul initialization function diff --git a/include/csinn/csinn_data_structure.h b/include/csinn/csinn_data_structure.h index 3390776f..53f570ce 100644 --- a/include/csinn/csinn_data_structure.h +++ b/include/csinn/csinn_data_structure.h @@ -62,6 +62,8 @@ enum csinn_mem_type_enum { CSINN_MEM_TYPE_BLOCK_Q2_K, /**< Block quantization from llama.cpp */ CSINN_MEM_TYPE_BLOCK_Q4_0, /**< Block quantization from llama.cpp */ CSINN_MEM_TYPE_BLOCK_Q8_0, /**< Block quantization from llama.cpp */ + CSINN_MEM_TYPE_BLOCK_Q8_0_REARRANGE, + CSINN_MEM_TYPE_BLOCK_Q4_0_REARRANGE, }; /** CSI-NN quant type */ @@ -421,6 +423,7 @@ enum csinn_layout_enum { CSINN_LAYOUT_1HWO, /**< NHWC constant, depthwise convolution only */ CSINN_LAYOUT_1HW16O16, /**< 16 bytes in parallel for ASP platform */ CSINN_LAYOUT_1HW32O32, /**< 32 bytes in parallel for ASP platform */ + CSINN_LAYOUT_O1HWIO0, /**< NHWC constant, 5 dimensions, winograd convolution only*/ // NC1HWC0 // ACTIVITION @@ -468,6 +471,7 @@ enum csinn_profiler_enum { the output tensor value of every layer. */ CSINN_PROFILER_LEVEL_ALL, /**< The performance analysis mode, do all operations that mentioned above. */ + CSINN_PROFILER_LEVEL_TRACE, /**< The performance analysis mode, generate trace data. 
*/ }; /** debug type */ @@ -544,6 +548,9 @@ struct csinn_session { void *td; /**< Refers to private data, which can generally point to the structure representing the graph in the driver */ bool dynamic_shape; /**< Wether to infer shape */ + + void *trace; /**< Refers to trace data, it is valid after set + profiler_level=CSINN_PROFILER_LEVEL_TRACE */ }; /** CSI-NN tensor */ @@ -555,6 +562,11 @@ struct csinn_callback { int (*perf)(); /**< profiling */ }; +/* CSI-NN perf information obtained by perf csinn_callback */ +struct csinn_perf_info { + char *kernel_name; /**< The actual kernel name used */ +}; + /** CSI-NN params base */ struct csinn_params_base { struct csinn_callback *cb; /**< The callback function pointing to the operator */ @@ -1217,6 +1229,8 @@ struct csinn_rope_params { int32_t xpos_down; int32_t n_dims; int32_t *pos; + bool use_rope_cache; + void *rope_cache; }; /** CSI-NN LLM position OP type */ @@ -1240,10 +1254,16 @@ struct csinn_llm_pos_params { /** CSI-NN scaled_dot_product_attention params */ struct csinn_scale_dot_attention_params { struct csinn_params_base base; /**< The basic information of the operator */ + float norm_factor; bool casual; bool transpose_v; // if transpose_v = true, v should be [batch,np,dim_head,sk] }; +struct csinn_enum_map { + int type; + char *name; +}; + /** * @} */ diff --git a/include/graph/shl_gref.h b/include/graph/shl_gref.h index 6a5bc2dd..3b93c576 100644 --- a/include/graph/shl_gref.h +++ b/include/graph/shl_gref.h @@ -981,11 +981,11 @@ int shl_gref_instance_norm_infer_shape(struct csinn_tensor *input, struct csinn_ struct csinn_tensor *bias, struct csinn_tensor *output, struct csinn_instance_norm_params *params); -int shl_gref_rms_norm(struct csinn_tensor *input, struct csinn_tensor *output, - struct csinn_tensor *weights, struct csinn_rms_norm_params *params); +int shl_gref_rms_norm(struct csinn_tensor *input, struct csinn_tensor *weights, + struct csinn_tensor *output, struct csinn_rms_norm_params *params); -int 
shl_gref_rms_norm_infer_shape(struct csinn_tensor *input, struct csinn_tensor *output, - struct csinn_tensor *weights, +int shl_gref_rms_norm_infer_shape(struct csinn_tensor *input, struct csinn_tensor *weights, + struct csinn_tensor *output, struct csinn_rms_norm_params *params); int shl_gref_llm_pos(struct csinn_tensor *input, struct csinn_tensor *output, diff --git a/include/llm/shl_llm.h b/include/llm/shl_llm.h index 9428e280..322b8de5 100644 --- a/include/llm/shl_llm.h +++ b/include/llm/shl_llm.h @@ -95,6 +95,12 @@ struct shl_llm_ctx { char *path; struct shl_llm_model *shl_model; + + /* for csinn_session */ + int32_t base_api; // basic computing unit + int32_t base_dtype; + int32_t base_quant_type; + int32_t save_model; }; struct shl_llm_input { @@ -112,6 +118,12 @@ struct llama_config { int vocab_size; struct shl_llm_model *shl_model; + + /* for csinn_session */ + int32_t base_api; + int32_t base_dtype; + int32_t base_quant_type; + int32_t save_model; }; struct shl_llm_ctx *llama2_build(struct llama_config *config); diff --git a/include/shl_debug.h b/include/shl_debug.h index 4c455828..3a053786 100644 --- a/include/shl_debug.h +++ b/include/shl_debug.h @@ -50,7 +50,7 @@ int shl_debug_get_level(); void shl_debug_set_level(int level); int shl_benchmark_layer(struct shl_node *node, uint64_t start_time, uint64_t end_time, int layer_idx); -int shl_dump_output_tensor(struct shl_node *node); +int shl_dump_output_tensor(struct shl_node *node, char **output_filenames); int shl_conv2d_debug_info(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_tensor *kernel, struct csinn_tensor *bias, @@ -180,8 +180,8 @@ int shl_layer_norm_debug_info(struct csinn_tensor *input, struct csinn_tensor *o struct csinn_tensor *gamma, struct csinn_tensor *beta, struct csinn_layer_norm_params *params, const char *name); -int shl_rms_norm_debug_info(struct csinn_tensor *input, struct csinn_tensor *output, - struct csinn_tensor *weights, struct csinn_rms_norm_params 
*params, +int shl_rms_norm_debug_info(struct csinn_tensor *input, struct csinn_tensor *weights, + struct csinn_tensor *output, struct csinn_rms_norm_params *params, const char *name); int shl_softmax_debug_info(struct csinn_tensor *input, struct csinn_tensor *output, diff --git a/include/shl_multithread.h b/include/shl_multithread.h new file mode 100644 index 00000000..149835f4 --- /dev/null +++ b/include/shl_multithread.h @@ -0,0 +1,30 @@ +/* + * Copyright (C) 2016-2023 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef INCLUDE_SHL_MULTITHREAD_H_ +#define INCLUDE_SHL_MULTITHREAD_H_ + +#if (!defined SHL_BUILD_RTOS) +#include +#endif +#include "csinn/csi_nn.h" + +void shl_multithread_set_threads(int threads); + +int shl_multithread_is_enable(); + +#endif // INCLUDE_SHL_MULTITHREAD_H_ diff --git a/include/shl_profiler.h b/include/shl_profiler.h new file mode 100644 index 00000000..f1d5faa7 --- /dev/null +++ b/include/shl_profiler.h @@ -0,0 +1,227 @@ +/* + * Copyright (C) 2016-2023 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef INCLUDE_PROFILER_H_ +#define INCLUDE_PROFILER_H_ + +#include +#include +#include +#include +#include +#include + +#include "shl_utils.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define SHL_TRACE_VERSION "2.9.5" + +#define SHL_TRACE_EVENT_ARGS_ITEM_KEY_MAX 64 +#define SHL_TRACE_EVENT_ARGS_CAPACITY_STEP 8 +#define SHL_TRACE_EVENT_CAPACITY_STEP 32 +#define SHL_TRACE_FILENAME_LENGTH_MAX 128 +#define SHL_TRACE_EVENT_NAME 64 + +enum shl_trace_event_category { + SHL_TRACE_EVENT_RUNTIME = 0, + SHL_TRACE_EVENT_CPU_OPERATOR, + SHL_TRACE_EVENT_MEMORY, + SHL_TRACE_EVENT_CPU_KERNEL, + SHL_TRACE_EVENT_NPU_KERNEL, + SHL_TRACE_EVENT_KERNEL, + SHL_TRACE_EVENT_CATEGORY_MAX +}; + +extern const char *SHL_TRACE_EVENT_CATEGORY_NAMES[]; + +enum shl_trace_event_type { + SHL_TRACE_EVENT_TYPE_DURATION_B = 0, + SHL_TRACE_EVENT_TYPE_DURATION_E, + SHL_TRACE_EVENT_TYPE_COMPLETE_X, + SHL_TRACE_EVENT_TYPE_INSTANT_i, + SHL_TRACE_EVENT_TYPE_COUNTER_C, + SHL_TRACE_EVENT_TYPE_ASYNC_b, + SHL_TRACE_EVENT_TYPE_ASYNC_n, + SHL_TRACE_EVENT_TYPE_ASYNC_e, + SHL_TRACE_EVENT_TYPE_FLOW_s, + SHL_TRACE_EVENT_TYPE_FLOW_t, + SHL_TRACE_EVENT_TYPE_FLOW_f, + SHL_TRACE_EVENT_TYPE_METADATA_M, + SHL_TRACE_EVENT_TYPE_MAX, +}; + +extern const char *SHL_TRACE_EVENT_TYPE_NAMES[]; + +enum shl_trace_value_type { + SHL_TRACE_VALUE_TYPE_INT64, + SHL_TRACE_VALUE_TYPE_UINT64, + SHL_TRACE_VALUE_TYPE_DOUBLE, + SHL_TRACE_VALUE_TYPE_STRING, + SHL_TRACE_VALUE_TYPE_LIST, +}; + +union shl_trace_value_content { + int64_t i64; + uint64_t u64; + double f64; + char *str; + struct shl_trace_value_list *list; +}; + 
+struct shl_trace_value { + enum shl_trace_value_type type; + union shl_trace_value_content content; +}; + +struct shl_trace_value_list { + struct shl_trace_value **value; + int size; +}; + +struct shl_trace_dict_item { + char key[32]; + struct shl_trace_value *value; +}; + +struct shl_trace_dict { + struct shl_trace_dict_item **items; + uint32_t items_capacity; + uint32_t items_size; +}; + +struct shl_trace_event_format { + char name[SHL_TRACE_EVENT_NAME]; /** The name of the event.*/ + enum shl_trace_event_category cat; /** The event categories. */ + enum shl_trace_event_type ph; /** The event type. */ + uint64_t ts; /** The tracing clock timestamps */ + // uint64_t tts; /** The thread clock timestamps */ + uint32_t pid; /** The process id. */ + uint32_t tid; /** The thread id. */ + struct shl_trace_dict *args; /** Any arguments provided for the event. */ +}; + +struct shl_trace_other_data { + char version[32]; + struct shl_trace_dict *data; +}; + +struct shl_trace { + bool enable_trace; + bool is_init; + char filename[SHL_TRACE_FILENAME_LENGTH_MAX]; + struct shl_trace_event_format **events; + uint32_t events_capacity; + uint32_t events_size; + + struct shl_trace_other_data *other_data; +}; + +uint32_t shl_trace_get_current_pid(); +uint32_t shl_trace_get_current_tid(); +uint64_t shl_trace_get_timestamps_us(); + +struct shl_trace_value *shl_trace_create_string(const char *value); +struct shl_trace_value *shl_trace_create_int64(int64_t value); +struct shl_trace_value *shl_trace_create_uint64(uint64_t value); +struct shl_trace_value *shl_trace_create_double(double value); +/** + * Create variable length list + * + * For example: + * value = shl_trace_create_list(2, + * shl_trace_create_string("value"), + * shl_trace_create_int64(10) + * ) + */ +struct shl_trace_value *shl_trace_create_list(int num, ...); +struct shl_trace_value *shl_trace_create_list_int(int num, int *arr); + +#define SHL_TRACE_STRING(value) shl_trace_create_string(value) +#define 
SHL_TRACE_INT64(value) shl_trace_create_int64(value) +#define SHL_TRACE_UINT64(value) shl_trace_create_uint64(value) +#define SHL_TRACE_DOUBLE(value) shl_trace_create_double(value) +#define SHL_TRACE_LIST(num, ...) shl_trace_create_list(num, __VA_ARGS__) +#define SHL_TRACE_LIST_INT(num, ptr) shl_trace_create_list_int(num, ptr) + +/* release value itself and its members. */ +void shl_trace_release_value(struct shl_trace_value *value); + +struct shl_trace_dict_item *shl_trace_create_dict_item(const char *key, + struct shl_trace_value *value); +struct shl_trace_dict *shl_trace_create_dict_by_item(int argc, ...); + +/** + * Create dict with variable length (key, value) + * + * For example: + * dict = shl_trace_create_dict(5, + * "string", shl_trace_create_string("string"), + * "int64", shl_trace_create_int64(-5), + * "uint64", shl_trace_create_uint64(4), + * "double", shl_trace_create_double(4.6), + * "list", shl_trace_create_list( + * 2, + * shl_trace_create_int64(256), + * shl_trace_create_int64(256) + * ) + * ); + */ +struct shl_trace_dict *shl_trace_create_dict(int argc, ...); + +/* release value itself and its members. 
*/ +void shl_trace_release_dict(struct shl_trace_dict *dict); + +struct shl_trace_event_format *shl_trace_create_common_event(); +void shl_trace_insert_event(struct shl_trace *trace, struct shl_trace_event_format *event); +void shl_trace_init(struct shl_trace *trace); +void shl_trace_deinit(struct shl_trace *trace); +void shl_trace_to_json(struct shl_trace *trace); +void shl_trace_move_events(struct shl_trace *from_trace, struct shl_trace *to_trace); + +/************************** Main functions ***************************/ +#ifdef SHL_TRACE +#define SHL_TRACE_CALL(func) func +void shl_trace_begin(struct shl_trace *trace, const char *filename); +void shl_trace_end(struct shl_trace *trace); +void shl_trace_other_data(struct shl_trace *trace, struct shl_trace_dict *data); +void shl_trace_duration_begin(struct shl_trace *trace, const char *name, + enum shl_trace_event_category cat, struct shl_trace_dict *args); +void shl_trace_duration_end(struct shl_trace *trace, const char *name, + enum shl_trace_event_category cat, struct shl_trace_dict *args); +#else +#define SHL_TRACE_CALL(func) +inline void shl_trace_begin(struct shl_trace *trace, const char *filename) {} +inline void shl_trace_end(struct shl_trace *trace) {} +inline void shl_trace_other_data(struct shl_trace *trace, struct shl_trace_dict *data) {} +inline void shl_trace_duration_begin(struct shl_trace *trace, const char *name, + enum shl_trace_event_category cat, struct shl_trace_dict *args) +{ +} +inline void shl_trace_duration_end(struct shl_trace *trace, const char *name, + enum shl_trace_event_category cat, struct shl_trace_dict *args) +{ +} +#endif + +#ifdef __cplusplus +} +#endif + +#endif // INCLUDE_PROFILER_H_ diff --git a/include/shl_utils.h b/include/shl_utils.h index f78b27e4..18e9c0da 100644 --- a/include/shl_utils.h +++ b/include/shl_utils.h @@ -26,12 +26,12 @@ #include #include #include -#if (!defined SHL_BUILD_RTOS) -#include -#endif + #include "csinn/csinn_data_structure.h" #include 
"shl_debug.h" #include "shl_memory.h" +#include "shl_multithread.h" +#include "shl_profiler.h" #ifdef SHL_MCONF_CONFIG #include "mconf_config.h" #endif @@ -140,6 +140,20 @@ struct shl_yolov5_params { float anchors[18]; /**< Anchor box of three strides */ }; +struct shl_function_map { + void *func; + char *name; +}; + +char *shl_find_function_name(struct shl_function_map *fmap, void *func); +char *shl_find_enum_name(struct csinn_enum_map *map, int map_len, int type); + +char *shl_find_dtype_name(enum csinn_dtype_enum type); +char *shl_find_quant_name(enum csinn_quant_enum type); +char *shl_find_api_name(enum csinn_api_enum type); +char *shl_find_rmod_name(enum csinn_rmode_enum type); +char *shl_find_layout_name(enum csinn_layout_enum type); + #ifdef __cplusplus } #endif diff --git a/module/json/.clang-format b/module/json/.clang-format new file mode 100644 index 00000000..9d159247 --- /dev/null +++ b/module/json/.clang-format @@ -0,0 +1,2 @@ +DisableFormat: true +SortIncludes: false diff --git a/source/c906_opt/fp16/convolution.c b/source/c906_opt/fp16/convolution.c index 57202569..b9c5235e 100644 --- a/source/c906_opt/fp16/convolution.c +++ b/source/c906_opt/fp16/convolution.c @@ -99,10 +99,11 @@ int shl_c906_conv2d_init_fp16(struct csinn_tensor *input, struct csinn_tensor *o // pack4 for winograd convolution if ((out_c % 8 == 0) && (in_c % 8 == 0)) { params->conv_extra.conv_mode = CSINN_WINOGRAD; - // TODO: params->conv_extra.kernel_tm in binary model - struct csinn_tensor *t_kernel = csinn_alloc_tensor(NULL); - shl_c906_conv3x3s1_winograd64_transform_kernel_pack8_fp16(kernel, t_kernel); - params->conv_extra.kernel_tm = t_kernel; + if (!binary_model_op_init) { + struct csinn_tensor *t_kernel = csinn_alloc_tensor(NULL); + shl_c906_conv3x3s1_winograd64_transform_kernel_pack8_fp16(kernel, t_kernel); + params->conv_extra.kernel_tm = t_kernel; + } cb->exec = shl_c906_conv3x3s1_winograd64_pack8_fp16; } else { params->conv_extra.conv_mode = CSINN_GEMM; diff --git 
a/source/c906_opt/fp16/convolution_3x3_fp16.c b/source/c906_opt/fp16/convolution_3x3_fp16.c index 70c5f2c5..e78f3360 100644 --- a/source/c906_opt/fp16/convolution_3x3_fp16.c +++ b/source/c906_opt/fp16/convolution_3x3_fp16.c @@ -235,6 +235,13 @@ void shl_c906_conv3x3s1_winograd64_transform_kernel_pack8_fp16(struct csinn_tens // }; csinn_tensor_copy(t_kernel, o_kernel); + t_kernel->dim_count = 5; + t_kernel->dim[0] = outch / 8; + t_kernel->dim[1] = 8; + t_kernel->dim[2] = 8; + t_kernel->dim[3] = inch; + t_kernel->dim[4] = 8; + t_kernel->layout = CSINN_LAYOUT_O1HWIO0; for (int p = 0; p < outch; p++) { for (int q = 0; q < inch; q++) { @@ -309,7 +316,7 @@ void shl_c906_conv3x3s1_winograd64_transform_kernel_pack8_fp16(struct csinn_tens } } } - + o_kernel->data = NULL; shl_mem_free(kernel_tm); } diff --git a/source/c906_opt/fp16/fullyconnected.c b/source/c906_opt/fp16/fullyconnected.c index 618659b4..4cd57e3b 100644 --- a/source/c906_opt/fp16/fullyconnected.c +++ b/source/c906_opt/fp16/fullyconnected.c @@ -113,6 +113,41 @@ void shl_c906_fc_gemv_transform_weight_fp16_w_int8(struct csinn_tensor *weights) shl_mem_free(pa_reorder); } +void shl_c906_fc_n16_dequantize_per_channel_i8_to_f16(struct csinn_tensor *weights, + struct csinn_fc_params *params, + __fp16 *weights_fp16) +{ + int8_t *weights_int8 = (int8_t *)weights->data; + int n = weights->dim[0]; // out_nodes + int k = weights->dim[1]; // in_nodes + + int i = 0; + int vl = vsetvl_e16m2(16); + for (; i + 15 < n; i += 16) { + int8_t *w_src = weights_int8 + i * k; + __fp16 *w_dst = weights_fp16 + i * k; + vint32m4_t _z32 = + vlse32_v_i32m4(&(weights->qinfo[i].zero_point), sizeof(struct csinn_quant_info), vl); + vint16m2_t _z16 = vnclip_wx_i16m2(_z32, 0, vl); + vint8m1_t _z = vnclip_wx_i8m1(_z16, 0, vl); + vfloat32m4_t _s32 = + vlse32_v_f32m4(&(weights->qinfo[i].scale), sizeof(struct csinn_quant_info), vl); + vfloat16m2_t _s = vfncvt_f_f_w_f16m2(_s32, vl); + for (int j = 0; j < k; j++) { + vint8m1_t _i8 = vle8_v_i8m1(w_src, 
vl); + vfloat16m2_t _f16 = shl_rvv_vdeq_vv_f16m2(_i8, _z, _s, vl); + vse16_v_f16m2(w_dst, _f16, vl); + w_src += vl; + w_dst += vl; + } + } + for (; i < n; i++) { + int32_t zp = weights->qinfo[i].zero_point; + float scale = weights->qinfo[i].scale; + shl_rvv_dequantize_i8_to_f16(weights_int8 + i * k, weights_fp16 + i * k, k, zp, scale); + } +} + /* best performance measured on D1 loop unroll: k = 1 && pack16 @@ -152,14 +187,9 @@ int shl_c906_fullyconnected_pack16_fp16(struct csinn_tensor *input, struct csinn float scale = weights->qinfo->scale; shl_rvv_dequantize_i8_to_f16(weights_int8, weights_fp16, size, zp, scale); } else if (weights->quant_channel == output_depth) { - // support channel quantization - for (int c = 0; c < output_depth; c++) { - int32_t zp = weights->qinfo[c].zero_point; - float scale = weights->qinfo[c].scale; - shl_rvv_dequantize_i8_to_f16(weights_int8 + c * accum_depth, - weights_fp16 + c * accum_depth, accum_depth, zp, - scale); - } + shl_c906_fc_n16_dequantize_per_channel_i8_to_f16(weights, params, weights_fp16); + } else { + shl_debug_error("%s unsupported quant_channel: %d\n", __func__, weights->quant_channel); } weights_data = weights_fp16; } else if (weights->dtype == CSINN_DTYPE_FLOAT16) { @@ -314,14 +344,9 @@ int shl_c906_fullyconnected_pack16_output16_fp16(struct csinn_tensor *input, float scale = weights->qinfo->scale; shl_rvv_dequantize_i8_to_f16(weights_int8, weights_fp16, size, zp, scale); } else if (weights->quant_channel == output_depth) { - // support channel quantization - for (int c = 0; c < output_depth; c++) { - int32_t zp = weights->qinfo[c].zero_point; - float scale = weights->qinfo[c].scale; - shl_rvv_dequantize_i8_to_f16(weights_int8 + c * accum_depth, - weights_fp16 + c * accum_depth, accum_depth, zp, - scale); - } + shl_c906_fc_n16_dequantize_per_channel_i8_to_f16(weights, params, weights_fp16); + } else { + shl_debug_error("%s unsupported quant_channel: %d\n", __func__, weights->quant_channel); } weights_data = 
weights_fp16; } else if (weights->dtype == CSINN_DTYPE_FLOAT16) { diff --git a/source/c906_opt/fp32/convolution.c b/source/c906_opt/fp32/convolution.c index d3997122..05b413a5 100644 --- a/source/c906_opt/fp32/convolution.c +++ b/source/c906_opt/fp32/convolution.c @@ -79,10 +79,11 @@ int shl_c906_conv2d_init_fp32(struct csinn_tensor *input, struct csinn_tensor *o // pack4 for winograd convolution if ((out_c % 4 == 0) && (in_c % 4 == 0)) { params->conv_extra.conv_mode = CSINN_WINOGRAD; - // TODO: params->conv_extra.kernel_tm in binary model - struct csinn_tensor *t_kernel = csinn_alloc_tensor(NULL); - shl_c906_conv3x3s1_winograd64_transform_kernel_pack4(kernel, t_kernel); - params->conv_extra.kernel_tm = t_kernel; + if (!binary_model_op_init) { + struct csinn_tensor *t_kernel = csinn_alloc_tensor(NULL); + shl_c906_conv3x3s1_winograd64_transform_kernel_pack4(kernel, t_kernel); + params->conv_extra.kernel_tm = t_kernel; + } cb->exec = shl_c906_conv3x3s1_winograd64_pack4; } else { params->conv_extra.conv_mode = CSINN_GEMM; diff --git a/source/c906_opt/fp32/convolution_3x3_fp32.c b/source/c906_opt/fp32/convolution_3x3_fp32.c index c7d5ad27..1d5366f4 100644 --- a/source/c906_opt/fp32/convolution_3x3_fp32.c +++ b/source/c906_opt/fp32/convolution_3x3_fp32.c @@ -233,6 +233,13 @@ void shl_c906_conv3x3s1_winograd64_transform_kernel_pack4(struct csinn_tensor *o // }; csinn_tensor_copy(t_kernel, o_kernel); + t_kernel->dim_count = 5; + t_kernel->dim[0] = outch / 4; + t_kernel->dim[1] = 8; + t_kernel->dim[2] = 8; + t_kernel->dim[3] = inch; + t_kernel->dim[4] = 4; + t_kernel->layout = CSINN_LAYOUT_O1HWIO0; for (int p = 0; p < outch; p++) { for (int q = 0; q < inch; q++) { @@ -295,7 +302,7 @@ void shl_c906_conv3x3s1_winograd64_transform_kernel_pack4(struct csinn_tensor *o } } } - + o_kernel->data = NULL; shl_mem_free(kernel_tm); } diff --git a/source/c906_opt/performance.c b/source/c906_opt/performance.c new file mode 100644 index 00000000..c5453af4 --- /dev/null +++ 
b/source/c906_opt/performance.c @@ -0,0 +1,327 @@ +/* + * Copyright (C) 2016-2023 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "c906/c906.h" +#include "c906/perf.h" + +static struct shl_function_map shl_c906_kernel_map[] = { + {shl_c906_abs_f32, "shl_c906_abs_f32"}, + {shl_c906_add_f32, "shl_c906_add_f32"}, + {shl_c906_sub_f32, "shl_c906_sub_f32"}, + {shl_c906_mul_f32, "shl_c906_mul_f32"}, + {shl_c906_minimum_f32, "shl_c906_minimum_f32"}, + {shl_c906_broadcast_to_f32, "shl_c906_broadcast_to_f32"}, + {shl_c906_clip_f32, "shl_c906_clip_f32"}, + {shl_c906_concat_f32, "shl_c906_concat_f32"}, + {shl_c906_split_f32, "shl_c906_split_f32"}, + {shl_c906_pad_f32, "shl_c906_pad_f32"}, + {shl_c906_prelu_f32, "shl_c906_prelu_f32"}, + {shl_c906_relu_f32, "shl_c906_relu_f32"}, + {shl_c906_relu1_f32, "shl_c906_relu1_f32"}, + {shl_c906_relu6_f32, "shl_c906_relu6_f32"}, + {shl_c906_leaky_relu_f32, "shl_c906_leaky_relu_f32"}, + {shl_c906_global_maxpool2d_f32, "shl_c906_global_maxpool2d_f32"}, + {shl_c906_reorder_kernel, "shl_c906_reorder_kernel"}, + {shl_c906_reorder_input_1, "shl_c906_reorder_input_1"}, + {shl_c906_sgemm_kernel_f32, "shl_c906_sgemm_kernel_f32"}, + {shl_c906_conv1x1s1_sgemm_transform_kernel, "shl_c906_conv1x1s1_sgemm_transform_kernel"}, + {shl_c906_conv_im2col_sgemm_transform_kernel, "shl_c906_conv_im2col_sgemm_transform_kernel"}, + 
{shl_c906_conv3x3s1_winograd64_transform_kernel_pack4, + "shl_c906_conv3x3s1_winograd64_transform_kernel_pack4"}, + {shl_c906_conv1x1s1_sgemm, "shl_c906_conv1x1s1_sgemm"}, + {shl_c906_conv1x1s1_sgemm_fuse_relu, "shl_c906_conv1x1s1_sgemm_fuse_relu"}, + {shl_c906_conv_im2col_sgemm, "shl_c906_conv_im2col_sgemm"}, + {shl_c906_conv_im2col_sgemm_fuse_relu, "shl_c906_conv_im2col_sgemm_fuse_relu"}, + {shl_c906_conv3x3s1_winograd64_pack4, "shl_c906_conv3x3s1_winograd64_pack4"}, + {shl_c906_dwconv3x3s1, "shl_c906_dwconv3x3s1"}, + {shl_c906_dwconv3x3s2, "shl_c906_dwconv3x3s2"}, + {shl_c906_dwconv5x5s1, "shl_c906_dwconv5x5s1"}, + {shl_c906_dwconv5x5s2, "shl_c906_dwconv5x5s2"}, + {shl_c906_dwconv3x3s1_pack4, "shl_c906_dwconv3x3s1_pack4"}, + {shl_c906_dwconv3x3s2_pack4, "shl_c906_dwconv3x3s2_pack4"}, + {shl_c906_dwconv3x3s1_fuse_relu, "shl_c906_dwconv3x3s1_fuse_relu"}, + {shl_c906_dwconv3x3s2_fuse_relu, "shl_c906_dwconv3x3s2_fuse_relu"}, + {shl_c906_dwconv5x5s1_fuse_relu, "shl_c906_dwconv5x5s1_fuse_relu"}, + {shl_c906_dwconv5x5s2_fuse_relu, "shl_c906_dwconv5x5s2_fuse_relu"}, + {shl_c906_dwconv3x3s1_pack4_fuse_relu, "shl_c906_dwconv3x3s1_pack4_fuse_relu"}, + {shl_c906_dwconv3x3s2_pack4_fuse_relu, "shl_c906_dwconv3x3s2_pack4_fuse_relu"}, + {shl_c906_dwconv2d_s1_pad0_fp16, "shl_c906_dwconv2d_s1_pad0_fp16"}, + {shl_c906_add_fp16, "shl_c906_add_fp16"}, + {shl_c906_sub_fp16, "shl_c906_sub_fp16"}, + {shl_c906_mul_fp16, "shl_c906_mul_fp16"}, + {shl_c906_minimum_fp16, "shl_c906_minimum_fp16"}, + {shl_c906_global_avgpool2d_fp16, "shl_c906_global_avgpool2d_fp16"}, + {shl_c906_global_maxpool2d_fp16, "shl_c906_global_maxpool2d_fp16"}, + {shl_c906_pad_fp16, "shl_c906_pad_fp16"}, + {shl_c906_relu_fp16, "shl_c906_relu_fp16"}, + {shl_c906_relu1_fp16, "shl_c906_relu1_fp16"}, + {shl_c906_relu6_fp16, "shl_c906_relu6_fp16"}, + {shl_c906_prelu_fp16, "shl_c906_prelu_fp16"}, + {shl_c906_leaky_relu_fp16, "shl_c906_leaky_relu_fp16"}, + {shl_c906_abs_fp16, "shl_c906_abs_fp16"}, + {shl_c906_clip_fp16, 
"shl_c906_clip_fp16"}, + {shl_c906_concat_fp16, "shl_c906_concat_fp16"}, + {shl_c906_split_fp16, "shl_c906_split_fp16"}, + {shl_c906_fullyconnected_pack16_fp16, "shl_c906_fullyconnected_pack16_fp16"}, + {shl_c906_fullyconnected_pack16_output16_fp16, "shl_c906_fullyconnected_pack16_output16_fp16"}, + {shl_c906_reorder_weight_n16_fp16, "shl_c906_reorder_weight_n16_fp16"}, + {shl_c906_reorder_kernel_fp16, "shl_c906_reorder_kernel_fp16"}, + {shl_c906_reorder_input_fp16_1, "shl_c906_reorder_input_fp16_1"}, + {shl_c906_sgemm_kernel_fp16, "shl_c906_sgemm_kernel_fp16"}, + // {shl_c906_sgemm_kernel_fp16_1, "shl_c906_sgemm_kernel_fp16_1"}, + {shl_c906_conv1x1s1_sgemm_transform_kernel_fp16, + "shl_c906_conv1x1s1_sgemm_transform_kernel_fp16"}, + {shl_c906_conv1x1s1_sgemm_transform_kernel_fp16_w_int8, + "shl_c906_conv1x1s1_sgemm_transform_kernel_fp16_w_int8"}, + {shl_c906_conv_im2col_sgemm_transform_kernel_fp16, + "shl_c906_conv_im2col_sgemm_transform_kernel_fp16"}, + {shl_c906_conv_im2col_sgemm_transform_kernel_fp16_w_int8, + "shl_c906_conv_im2col_sgemm_transform_kernel_fp16_w_int8"}, + {shl_c906_conv3x3s1_winograd64_transform_kernel_pack8_fp16, + "shl_c906_conv3x3s1_winograd64_transform_kernel_pack8_fp16"}, + {shl_c906_conv1x1s1_sgemm_fp16, "shl_c906_conv1x1s1_sgemm_fp16"}, + {shl_c906_conv_im2col_sgemm_fp16, "shl_c906_conv_im2col_sgemm_fp16"}, + {shl_c906_conv3x3s1_winograd64_pack8_fp16, "shl_c906_conv3x3s1_winograd64_pack8_fp16"}, + {shl_c906_dwconv3x3s1_fp16, "shl_c906_dwconv3x3s1_fp16"}, + {shl_c906_dwconv3x3s2_fp16, "shl_c906_dwconv3x3s2_fp16"}, + {shl_c906_dwconv3x3s1_pack8_fp16, "shl_c906_dwconv3x3s1_pack8_fp16"}, + {shl_c906_dwconv3x3s2_pack8_fp16, "shl_c906_dwconv3x3s2_pack8_fp16"}, + // {shl_c906_matmul_fp32, "shl_c906_matmul_fp32"}, + {shl_c906_cache_matmul_fp16, "shl_c906_cache_matmul_fp16"}, + {shl_c906_matmul_fp16, "shl_c906_matmul_fp16"}, + {shl_c906_reshape_fp16, "shl_c906_reshape_fp16"}, + {shl_c906_cache_conv1d_fp16, "shl_c906_cache_conv1d_fp16"}, + 
{shl_c906_lrn_fp16, "shl_c906_lrn_fp16"}, + {shl_c906_reduce_sum_fp16, "shl_c906_reduce_sum_fp16"}, + {NULL, NULL}}; + +char *shl_rvv_get_kernel_name(void *exec); + +char *shl_c906_get_kernel_name(void *exec) +{ + char *name = shl_find_function_name(shl_c906_kernel_map, exec); + if (name == NULL) { + name = shl_rvv_get_kernel_name(exec); + } + return name; +} + +int shl_c906_conv2d_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params, struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_c906_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_c906_depthwise_conv2d_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params, + struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_c906_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_c906_conv1d_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv1d_params *params, struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_c906_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_c906_depthwise_conv1d_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv1d_params *params, + struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_c906_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_c906_fullyconnected_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weights, struct csinn_tensor *bias, + struct csinn_fc_params *params, struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_c906_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int 
shl_c906_maxpool2d_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params, struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_c906_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_c906_avgpool2d_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params, struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_c906_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_c906_div_perf(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params, + struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_c906_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_c906_abs_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params, struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_c906_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_c906_add_perf(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params, + struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_c906_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_c906_clip_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_clip_params *params, struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_c906_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_c906_concat_perf(struct csinn_tensor **input, struct csinn_tensor *output, + struct csinn_clip_params *params, struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_c906_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_c906_global_avgpool2d_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params, + 
struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_c906_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_c906_global_maxpool2d_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params, + struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_c906_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_c906_leaky_relu_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params, struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_c906_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_c906_lrn_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_lrn_params *params, struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_c906_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_c906_matmul_perf(struct csinn_tensor *mat0, struct csinn_tensor *mat1, + struct csinn_tensor *output, struct csinn_matmul_params *params, + struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_c906_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_c906_minimum_perf(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params, + struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_c906_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_c906_mul_perf(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params, + struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_c906_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_c906_prelu_perf(struct csinn_tensor *input, struct csinn_tensor *alpha, + struct csinn_tensor *output, struct csinn_prelu_params *params, + struct csinn_perf_info *perf_info) +{ + 
perf_info->kernel_name = shl_c906_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_c906_relu_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params, struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_c906_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_c906_relu1_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params, struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_c906_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_c906_relu6_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params, struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_c906_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_c906_split_perf(struct csinn_tensor *input, struct csinn_tensor **output, + struct csinn_split_params *params, struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_c906_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_c906_sub_perf(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params, + struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_c906_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_c906_reshape_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reshape_params *params, struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_c906_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_c906_reduce_sum_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params, struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_c906_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} \ No newline at end of file diff --git 
a/source/c906_opt/setup.c b/source/c906_opt/setup.c index 49f04f50..85249961 100644 --- a/source/c906_opt/setup.c +++ b/source/c906_opt/setup.c @@ -18,6 +18,7 @@ #include "c906/c906.h" #include "c906/cap.h" +#include "c906/perf.h" static struct shl_cb_op_list shl_c906_cb_op_list; @@ -58,6 +59,18 @@ int shl_c906_reg_op_cap(enum csinn_dtype_enum dtype, enum csinn_op_enum op_name, return CSINN_TRUE; } +int shl_c906_reg_op_perf(enum csinn_dtype_enum dtype, enum csinn_op_enum op_name, void *perf) +{ + struct csinn_callback *cb = shl_cb_list_match(&shl_c906_cb_op_list, dtype, op_name); + if (cb == NULL) { + shl_debug_info("%s: cannot find c906 perf\n", __func__); + } else { + cb->perf = perf; + } + + return CSINN_TRUE; +} + struct csinn_callback *__attribute__((weak)) shl_cb_map_rvv(int op, int dtype); struct csinn_callback *shl_cb_map_c906(int op, int dtype) { @@ -725,4 +738,65 @@ void __attribute__((weak)) shl_target_init_c906() shl_c906_reg_op_cap(CSINN_DTYPE_FLOAT16, CSINN_OP_SPLIT, shl_c906_split_cap); shl_c906_reg_op_cap(CSINN_DTYPE_FLOAT16, CSINN_OP_SUB, shl_c906_sub_cap); shl_c906_reg_op_cap(CSINN_DTYPE_FLOAT16, CSINN_OP_REDUCE_SUM, shl_c906_reduce_sum_cap); + + /* register perf functions */ + shl_c906_reg_op_perf(CSINN_DTYPE_FLOAT32, CSINN_OP_CONV2D, shl_c906_conv2d_perf); + shl_c906_reg_op_perf(CSINN_DTYPE_FLOAT32, CSINN_OP_GROUP_CONV2D, shl_c906_conv2d_perf); + shl_c906_reg_op_perf(CSINN_DTYPE_FLOAT32, CSINN_OP_DEPTHWISE_CONV2D, + shl_c906_depthwise_conv2d_perf); + shl_c906_reg_op_perf(CSINN_DTYPE_FLOAT32, CSINN_OP_CONV1D, shl_c906_conv1d_perf); + shl_c906_reg_op_perf(CSINN_DTYPE_FLOAT32, CSINN_OP_MAXPOOL2D, shl_c906_maxpool2d_perf); + shl_c906_reg_op_perf(CSINN_DTYPE_FLOAT32, CSINN_OP_AVGPOOL2D, shl_c906_avgpool2d_perf); + shl_c906_reg_op_perf(CSINN_DTYPE_FLOAT32, CSINN_OP_DIV, shl_c906_div_perf); + shl_c906_reg_op_perf(CSINN_DTYPE_FLOAT32, CSINN_OP_ABS, shl_c906_abs_perf); + shl_c906_reg_op_perf(CSINN_DTYPE_FLOAT32, CSINN_OP_ADD, shl_c906_add_perf); + 
shl_c906_reg_op_perf(CSINN_DTYPE_FLOAT32, CSINN_OP_CLIP, shl_c906_clip_perf); + shl_c906_reg_op_perf(CSINN_DTYPE_FLOAT32, CSINN_OP_CONCAT, shl_c906_concat_perf); + shl_c906_reg_op_perf(CSINN_DTYPE_FLOAT32, CSINN_OP_GLOBAL_AVGPOOL2D, + shl_c906_global_avgpool2d_perf); + shl_c906_reg_op_perf(CSINN_DTYPE_FLOAT32, CSINN_OP_GLOBAL_MAXPOOL2D, + shl_c906_global_maxpool2d_perf); + shl_c906_reg_op_perf(CSINN_DTYPE_FLOAT32, CSINN_OP_LEAKY_RELU, shl_c906_leaky_relu_perf); + shl_c906_reg_op_perf(CSINN_DTYPE_FLOAT32, CSINN_OP_MINIMUM, shl_c906_minimum_perf); + shl_c906_reg_op_perf(CSINN_DTYPE_FLOAT32, CSINN_OP_MUL, shl_c906_mul_perf); + shl_c906_reg_op_perf(CSINN_DTYPE_FLOAT32, CSINN_OP_PRELU, shl_c906_prelu_perf); + shl_c906_reg_op_perf(CSINN_DTYPE_FLOAT32, CSINN_OP_RELU, shl_c906_relu_perf); + shl_c906_reg_op_perf(CSINN_DTYPE_FLOAT32, CSINN_OP_RELU1, shl_c906_relu1_perf); + shl_c906_reg_op_perf(CSINN_DTYPE_FLOAT32, CSINN_OP_RELU6, shl_c906_relu6_perf); + shl_c906_reg_op_perf(CSINN_DTYPE_FLOAT32, CSINN_OP_SPLIT, shl_c906_split_perf); + shl_c906_reg_op_perf(CSINN_DTYPE_FLOAT32, CSINN_OP_SUB, shl_c906_sub_perf); + + shl_c906_reg_op_perf(CSINN_DTYPE_FLOAT16, CSINN_OP_CONV2D, shl_c906_conv2d_perf); + shl_c906_reg_op_perf(CSINN_DTYPE_FLOAT16, CSINN_OP_GROUP_CONV2D, shl_c906_conv2d_perf); + shl_c906_reg_op_perf(CSINN_DTYPE_FLOAT16, CSINN_OP_DEPTHWISE_CONV2D, + shl_c906_depthwise_conv2d_perf); + shl_c906_reg_op_perf(CSINN_DTYPE_FLOAT16, CSINN_OP_FULLYCONNECTED, + shl_c906_fullyconnected_perf); + shl_c906_reg_op_perf(CSINN_DTYPE_FLOAT16, CSINN_OP_CONV1D, shl_c906_conv1d_perf); + shl_c906_reg_op_perf(CSINN_DTYPE_FLOAT16, CSINN_OP_DEPTHWISE_CONV1D, + shl_c906_depthwise_conv1d_perf); + shl_c906_reg_op_perf(CSINN_DTYPE_FLOAT16, CSINN_OP_MAXPOOL2D, shl_c906_maxpool2d_perf); + shl_c906_reg_op_perf(CSINN_DTYPE_FLOAT16, CSINN_OP_AVGPOOL2D, shl_c906_avgpool2d_perf); + shl_c906_reg_op_perf(CSINN_DTYPE_FLOAT16, CSINN_OP_DIV, shl_c906_div_perf); + shl_c906_reg_op_perf(CSINN_DTYPE_FLOAT16, 
CSINN_OP_ABS, shl_c906_abs_perf); + shl_c906_reg_op_perf(CSINN_DTYPE_FLOAT16, CSINN_OP_ADD, shl_c906_add_perf); + shl_c906_reg_op_perf(CSINN_DTYPE_FLOAT16, CSINN_OP_CLIP, shl_c906_clip_perf); + shl_c906_reg_op_perf(CSINN_DTYPE_FLOAT16, CSINN_OP_CONCAT, shl_c906_concat_perf); + shl_c906_reg_op_perf(CSINN_DTYPE_FLOAT16, CSINN_OP_GLOBAL_AVGPOOL2D, + shl_c906_global_avgpool2d_perf); + shl_c906_reg_op_perf(CSINN_DTYPE_FLOAT16, CSINN_OP_GLOBAL_MAXPOOL2D, + shl_c906_global_maxpool2d_perf); + shl_c906_reg_op_perf(CSINN_DTYPE_FLOAT16, CSINN_OP_LEAKY_RELU, shl_c906_leaky_relu_perf); + shl_c906_reg_op_perf(CSINN_DTYPE_FLOAT16, CSINN_OP_LRN, shl_c906_lrn_perf); + shl_c906_reg_op_perf(CSINN_DTYPE_FLOAT16, CSINN_OP_MATMUL, shl_c906_matmul_perf); + shl_c906_reg_op_perf(CSINN_DTYPE_FLOAT16, CSINN_OP_MINIMUM, shl_c906_minimum_perf); + shl_c906_reg_op_perf(CSINN_DTYPE_FLOAT16, CSINN_OP_MUL, shl_c906_mul_perf); + shl_c906_reg_op_perf(CSINN_DTYPE_FLOAT16, CSINN_OP_PRELU, shl_c906_prelu_perf); + shl_c906_reg_op_perf(CSINN_DTYPE_FLOAT16, CSINN_OP_RELU, shl_c906_relu_perf); + shl_c906_reg_op_perf(CSINN_DTYPE_FLOAT16, CSINN_OP_RELU1, shl_c906_relu1_perf); + shl_c906_reg_op_perf(CSINN_DTYPE_FLOAT16, CSINN_OP_RELU6, shl_c906_relu6_perf); + shl_c906_reg_op_perf(CSINN_DTYPE_FLOAT16, CSINN_OP_RESHAPE, shl_c906_reshape_perf); + shl_c906_reg_op_perf(CSINN_DTYPE_FLOAT16, CSINN_OP_SPLIT, shl_c906_split_perf); + shl_c906_reg_op_perf(CSINN_DTYPE_FLOAT16, CSINN_OP_SUB, shl_c906_sub_perf); + shl_c906_reg_op_perf(CSINN_DTYPE_FLOAT16, CSINN_OP_REDUCE_SUM, shl_c906_reduce_sum_perf); } diff --git a/source/c908_opt/fp16/convolution.c b/source/c908_opt/fp16/convolution.c index 11a2efea..f9b8d361 100644 --- a/source/c908_opt/fp16/convolution.c +++ b/source/c908_opt/fp16/convolution.c @@ -83,16 +83,20 @@ int shl_c908_conv2d_init_fp16(struct csinn_tensor *input, struct csinn_tensor *o return CSINN_TRUE; } else { params->conv_extra.conv_mode = CSINN_WINOGRAD; - // TODO: params->conv_extra.kernel_tm in binary 
model - struct csinn_tensor *t_kernel = csinn_alloc_tensor(NULL); + if (!binary_model_op_init) { + struct csinn_tensor *t_kernel = csinn_alloc_tensor(NULL); + if ((in_h < 13) && (in_w < 13)) { + shl_c908_ncxhwx_wg_b4f3s1_trans_kernel_packn_fp16(kernel, t_kernel); + } else { + shl_c908_ncxhwx_wg_b6f3s1_trans_kernel_packn_fp16(kernel, t_kernel); + } + params->conv_extra.kernel_tm = t_kernel; + } if ((in_h < 13) && (in_w < 13)) { - shl_c908_ncxhwx_wg_b4f3s1_trans_kernel_packn_fp16(kernel, t_kernel); cb->exec = shl_c908_ncxhwx_wg_b4f3s1_packn_fp16; } else { - shl_c908_ncxhwx_wg_b6f3s1_trans_kernel_packn_fp16(kernel, t_kernel); cb->exec = shl_c908_ncxhwx_wg_b6f3s1_packn_fp16; } - params->conv_extra.kernel_tm = t_kernel; } } else { params->conv_extra.conv_mode = CSINN_GEMM; diff --git a/source/c908_opt/fp16/convolution_3x3_fp16_packn_1.c b/source/c908_opt/fp16/convolution_3x3_fp16_packn_1.c index a5822315..79621ae9 100644 --- a/source/c908_opt/fp16/convolution_3x3_fp16_packn_1.c +++ b/source/c908_opt/fp16/convolution_3x3_fp16_packn_1.c @@ -1676,7 +1676,16 @@ void shl_c908_ncxhwx_wg_b4f3s1_trans_kernel_packn_fp16(struct csinn_tensor *src_ {1.0f / 24, -1.0f / 12, 1.0f / 6}, {0.0f, 0.0f, 1.0f}}; + const int packn = csrr_vlenb() / sizeof(__fp16); + const int pack2n = packn * 2; csinn_tensor_copy(dst_kernel, src_kernel); + dst_kernel->dim_count = 5; + dst_kernel->dim[0] = outch / packn; + dst_kernel->dim[1] = 6; + dst_kernel->dim[2] = 6; + dst_kernel->dim[3] = inch; + dst_kernel->dim[4] = packn; + dst_kernel->layout = CSINN_LAYOUT_O1HWIO0; for (int p = 0; p < outch; p++) { for (int q = 0; q < inch; q++) { @@ -1713,9 +1722,6 @@ void shl_c908_ncxhwx_wg_b4f3s1_trans_kernel_packn_fp16(struct csinn_tensor *src_ __fp16 *kernel_tm_packn = (__fp16 *)shl_mem_alloc(outch / 4 * 36 * inch * 4 * sizeof(__fp16)); dst_kernel->data = kernel_tm_packn; - const int packn = csrr_vlenb() / sizeof(__fp16); - const int pack2n = packn * 2; - int oc = 0; for (; oc + pack2n - 1 < outch; oc += pack2n) 
{ __fp16 *g0 = kernel_tm_packn + oc * 36 * inch; @@ -1742,6 +1748,7 @@ void shl_c908_ncxhwx_wg_b4f3s1_trans_kernel_packn_fp16(struct csinn_tensor *src_ } } } + src_kernel->data = NULL; shl_mem_free(kernel_tm); } @@ -1882,7 +1889,16 @@ void shl_c908_ncxhwx_wg_b6f3s1_trans_kernel_packn_fp16(struct csinn_tensor *src_ // {0.0f, 0.0f, 1.0f} // }; + const int packn = csrr_vlenb() / sizeof(__fp16); + const int pack2n = packn * 2; csinn_tensor_copy(dst_kernel, src_kernel); + dst_kernel->dim_count = 5; + dst_kernel->dim[0] = outch / packn; + dst_kernel->dim[1] = 8; + dst_kernel->dim[2] = 8; + dst_kernel->dim[3] = inch; + dst_kernel->dim[4] = packn; + dst_kernel->layout = CSINN_LAYOUT_O1HWIO0; for (int p = 0; p < outch; p++) { for (int q = 0; q < inch; q++) { @@ -1917,9 +1933,6 @@ void shl_c908_ncxhwx_wg_b6f3s1_trans_kernel_packn_fp16(struct csinn_tensor *src_ __fp16 *kernel_tm_packn = (__fp16 *)shl_mem_alloc(64 * outch / 4 * inch * 4 * sizeof(__fp16)); dst_kernel->data = kernel_tm_packn; - const int packn = csrr_vlenb() / sizeof(__fp16); - const int pack2n = packn * 2; - int oc = 0; for (; oc + pack2n - 1 < outch; oc += pack2n) { __fp16 *g0 = kernel_tm_packn + oc * 64 * inch; @@ -1946,6 +1959,7 @@ void shl_c908_ncxhwx_wg_b6f3s1_trans_kernel_packn_fp16(struct csinn_tensor *src_ } } } + src_kernel->data = NULL; shl_mem_free(kernel_tm); } diff --git a/source/c908_opt/fp32/convolution.c b/source/c908_opt/fp32/convolution.c index 53fd69fb..cb970c68 100644 --- a/source/c908_opt/fp32/convolution.c +++ b/source/c908_opt/fp32/convolution.c @@ -75,16 +75,20 @@ int shl_c908_conv2d_init_fp32(struct csinn_tensor *input, struct csinn_tensor *o return CSINN_TRUE; } else { params->conv_extra.conv_mode = CSINN_WINOGRAD; - // TODO: params->conv_extra.kernel_tm in binary model - struct csinn_tensor *t_kernel = csinn_alloc_tensor(NULL); + if (!binary_model_op_init) { + struct csinn_tensor *t_kernel = csinn_alloc_tensor(NULL); + if ((in_h < 13) && (in_w < 13)) { + 
shl_c908_ncxhwx_wg_b4f3s1_trans_kernel_packn_fp32(kernel, t_kernel); + } else { + shl_c908_ncxhwx_wg_b6f3s1_trans_kernel_packn_fp32(kernel, t_kernel); + } + params->conv_extra.kernel_tm = t_kernel; + } if ((in_h < 13) && (in_w < 13)) { - shl_c908_ncxhwx_wg_b4f3s1_trans_kernel_packn_fp32(kernel, t_kernel); cb->exec = shl_c908_ncxhwx_wg_b4f3s1_packn_fp32; } else { - shl_c908_ncxhwx_wg_b6f3s1_trans_kernel_packn_fp32(kernel, t_kernel); cb->exec = shl_c908_ncxhwx_wg_b6f3s1_packn_fp32; } - params->conv_extra.kernel_tm = t_kernel; } } else { params->conv_extra.conv_mode = CSINN_GEMM; diff --git a/source/c908_opt/fp32/convolution_3x3_fp32_packn_1.c b/source/c908_opt/fp32/convolution_3x3_fp32_packn_1.c index 3766733a..e61f963b 100644 --- a/source/c908_opt/fp32/convolution_3x3_fp32_packn_1.c +++ b/source/c908_opt/fp32/convolution_3x3_fp32_packn_1.c @@ -1674,7 +1674,16 @@ void shl_c908_ncxhwx_wg_b4f3s1_trans_kernel_packn_fp32(struct csinn_tensor *src_ {1.0f / 24, -1.0f / 12, 1.0f / 6}, {0.0f, 0.0f, 1.0f}}; + const int packn = csrr_vlenb() / sizeof(float); + const int pack2n = packn * 2; csinn_tensor_copy(dst_kernel, src_kernel); + dst_kernel->dim_count = 5; + dst_kernel->dim[0] = outch / packn; + dst_kernel->dim[1] = 6; + dst_kernel->dim[2] = 6; + dst_kernel->dim[3] = inch; + dst_kernel->dim[4] = packn; + dst_kernel->layout = CSINN_LAYOUT_O1HWIO0; for (int p = 0; p < outch; p++) { for (int q = 0; q < inch; q++) { @@ -1711,9 +1720,6 @@ void shl_c908_ncxhwx_wg_b4f3s1_trans_kernel_packn_fp32(struct csinn_tensor *src_ float *kernel_tm_packn = (float *)shl_mem_alloc(outch / 4 * 36 * inch * 4 * sizeof(float)); dst_kernel->data = kernel_tm_packn; - const int packn = csrr_vlenb() / sizeof(float); - const int pack2n = packn * 2; - int oc = 0; for (; oc + pack2n - 1 < outch; oc += pack2n) { float *g0 = kernel_tm_packn + oc * 36 * inch; @@ -1740,6 +1746,7 @@ void shl_c908_ncxhwx_wg_b4f3s1_trans_kernel_packn_fp32(struct csinn_tensor *src_ } } } + src_kernel->data = NULL; 
shl_mem_free(kernel_tm); } @@ -1877,7 +1884,16 @@ void shl_c908_ncxhwx_wg_b6f3s1_trans_kernel_packn_fp32(struct csinn_tensor *src_ // {0.0f, 0.0f, 1.0f} // }; + const int packn = csrr_vlenb() / sizeof(float); + const int pack2n = packn * 2; csinn_tensor_copy(dst_kernel, src_kernel); + dst_kernel->dim_count = 5; + dst_kernel->dim[0] = outch / packn; + dst_kernel->dim[1] = 8; + dst_kernel->dim[2] = 8; + dst_kernel->dim[3] = inch; + dst_kernel->dim[4] = packn; + dst_kernel->layout = CSINN_LAYOUT_O1HWIO0; for (int p = 0; p < outch; p++) { for (int q = 0; q < inch; q++) { @@ -1913,9 +1929,6 @@ void shl_c908_ncxhwx_wg_b6f3s1_trans_kernel_packn_fp32(struct csinn_tensor *src_ float *kernel_tm_packn = (float *)shl_mem_alloc(64 * outch / 4 * inch * 4 * sizeof(float)); dst_kernel->data = kernel_tm_packn; - const int packn = csrr_vlenb() / sizeof(float); - const int pack2n = packn * 2; - int oc = 0; for (; oc + pack2n - 1 < outch; oc += pack2n) { float *g0 = kernel_tm_packn + oc * 64 * inch; @@ -1942,6 +1955,7 @@ void shl_c908_ncxhwx_wg_b6f3s1_trans_kernel_packn_fp32(struct csinn_tensor *src_ } } } + src_kernel->data = NULL; shl_mem_free(kernel_tm); } diff --git a/source/c908_opt/int8/convolution.c b/source/c908_opt/int8/convolution.c index 004df969..d232df65 100644 --- a/source/c908_opt/int8/convolution.c +++ b/source/c908_opt/int8/convolution.c @@ -32,6 +32,10 @@ int shl_c908_conv2d_init_int8(struct csinn_tensor *input, struct csinn_tensor *o int32_t stride_w = params->stride_width; int32_t dilation_h = params->dilation_height; int32_t dilation_w = params->dilation_width; + int32_t *bias_data = (int32_t *)bias->data; + int8_t *kernel_data = (int8_t *)kernel->data; + int32_t input_zp = input->qinfo->zero_point; + struct csinn_callback *cb = params->base.cb; const int packn = csrr_vlenb() / sizeof(int8_t) / 2; @@ -53,13 +57,17 @@ int shl_c908_conv2d_init_int8(struct csinn_tensor *input, struct csinn_tensor *o out_elempack = out_c % packn == 0 ? 
packn : 1; } + bool binary_model_op_init = shl_rvv_get_binary_model_op_init(sess); + // packn if (in_elempack % packn == 0 && out_elempack % packn == 0) { if (kernel_h == 1 && kernel_w == 1 && stride_h == 1 && stride_w == 1 && dilation_h == 1 && dilation_w == 1) { params->conv_extra.conv_mode = CSINN_GEMM; - params->conv_extra.kernel_tm = csinn_alloc_tensor(NULL); - shl_c908_conv1x1s1_gemm_reorder_kernel_packn_int8(kernel, params); + if (!binary_model_op_init) { + params->conv_extra.kernel_tm = csinn_alloc_tensor(NULL); + shl_c908_conv1x1s1_gemm_reorder_kernel_packn_int8(kernel, params); + } cb->exec = shl_c908_conv1x1s1_gemm_packn_int8; // } else if (kernel_h == 3 && kernel_w == 3 && stride_h == 1 && stride_w == 1 && // dilation_h == 1 && dilation_w == 1) { @@ -77,8 +85,10 @@ int shl_c908_conv2d_init_int8(struct csinn_tensor *input, struct csinn_tensor *o // } } else { params->conv_extra.conv_mode = CSINN_GEMM; - params->conv_extra.kernel_tm = csinn_alloc_tensor(NULL); - shl_c908_conv_im2col_gemm_reorder_kernel_packn_int8(kernel, params); + if (!binary_model_op_init) { + params->conv_extra.kernel_tm = csinn_alloc_tensor(NULL); + shl_c908_conv_im2col_gemm_reorder_kernel_packn_int8(kernel, params); + } cb->exec = shl_c908_conv_im2col_gemm_packn_int8; } } @@ -86,13 +96,18 @@ int shl_c908_conv2d_init_int8(struct csinn_tensor *input, struct csinn_tensor *o // pack1ton if (in_elempack % packn != 0 && out_elempack % packn == 0) { params->conv_extra.conv_mode = CSINN_GEMM; - params->conv_extra.kernel_tm = csinn_alloc_tensor(NULL); if (kernel_h == 1 && kernel_w == 1 && stride_h == 1 && stride_w == 1 && dilation_h == 1 && dilation_w == 1) { - shl_c908_conv1x1s1_gemm_reorder_kernel_pack1ton_int8(kernel, params); + if (!binary_model_op_init) { + params->conv_extra.kernel_tm = csinn_alloc_tensor(NULL); + shl_c908_conv1x1s1_gemm_reorder_kernel_pack1ton_int8(kernel, params); + } cb->exec = shl_c908_conv1x1s1_gemm_pack1ton_int8; } else { - 
shl_c908_conv_im2col_gemm_reorder_kernel_pack1ton_int8(kernel, params); + if (!binary_model_op_init) { + params->conv_extra.kernel_tm = csinn_alloc_tensor(NULL); + shl_c908_conv_im2col_gemm_reorder_kernel_pack1ton_int8(kernel, params); + } cb->exec = shl_c908_conv_im2col_gemm_pack1ton_int8; } } @@ -100,13 +115,18 @@ int shl_c908_conv2d_init_int8(struct csinn_tensor *input, struct csinn_tensor *o // packnto1 if (in_elempack % packn == 0 && out_elempack % packn != 0) { params->conv_extra.conv_mode = CSINN_GEMM; - params->conv_extra.kernel_tm = csinn_alloc_tensor(NULL); if (kernel_h == 1 && kernel_w == 1 && stride_h == 1 && stride_w == 1 && dilation_h == 1 && dilation_w == 1) { - shl_c908_conv1x1s1_gemm_reorder_kernel_packnto1_int8(kernel, params); + if (!binary_model_op_init) { + params->conv_extra.kernel_tm = csinn_alloc_tensor(NULL); + shl_c908_conv1x1s1_gemm_reorder_kernel_packnto1_int8(kernel, params); + } cb->exec = shl_c908_conv1x1s1_gemm_packnto1_int8; } else { - shl_c908_conv_im2col_gemm_reorder_kernel_packnto1_int8(kernel, params); + if (!binary_model_op_init) { + params->conv_extra.kernel_tm = csinn_alloc_tensor(NULL); + shl_c908_conv_im2col_gemm_reorder_kernel_packnto1_int8(kernel, params); + } cb->exec = shl_c908_conv_im2col_gemm_packnto1_int8; } } @@ -114,13 +134,18 @@ int shl_c908_conv2d_init_int8(struct csinn_tensor *input, struct csinn_tensor *o // pack1 if (in_elempack % packn != 0 && out_elempack % packn != 0) { params->conv_extra.conv_mode = CSINN_GEMM; - params->conv_extra.kernel_tm = csinn_alloc_tensor(NULL); if (kernel_h == 1 && kernel_w == 1 && stride_h == 1 && stride_w == 1 && dilation_h == 1 && dilation_w == 1) { - shl_c908_conv1x1s1_gemm_reorder_kernel_int8(kernel, params); + if (!binary_model_op_init) { + params->conv_extra.kernel_tm = csinn_alloc_tensor(NULL); + shl_c908_conv1x1s1_gemm_reorder_kernel_int8(kernel, params); + } cb->exec = shl_c908_conv1x1s1_gemm_int8; } else { - shl_c908_conv_im2col_gemm_reorder_kernel_int8(kernel, params); 
+ if (!binary_model_op_init) { + params->conv_extra.kernel_tm = csinn_alloc_tensor(NULL); + shl_c908_conv_im2col_gemm_reorder_kernel_int8(kernel, params); + } cb->exec = shl_c908_conv_im2col_gemm_int8; } } @@ -140,10 +165,6 @@ int shl_c908_conv2d_init_int8(struct csinn_tensor *input, struct csinn_tensor *o if (params->conv_extra.conv_mode == CSINN_GEMM) { if (!params->conv_extra.fuse_zp2bias) { params->conv_extra.fuse_zp2bias = true; - int32_t *bias_data = (int32_t *)bias->data; - int8_t *kernel_data = (int8_t *)kernel->data; - int32_t input_zp = input->qinfo->zero_point; - if (bias_data == NULL) { // XXX: memory leak bias_data = (int32_t *)shl_mem_alloc(out_c * params->group * sizeof(int32_t)); @@ -163,10 +184,6 @@ int shl_c908_conv2d_init_int8(struct csinn_tensor *input, struct csinn_tensor *o // recover fuse zeropoint to bias for winograd if (params->conv_extra.conv_mode == CSINN_WINOGRAD) { if (params->conv_extra.fuse_zp2bias) { - int32_t *bias_data = (int32_t *)bias->data; - int8_t *kernel_data = (int8_t *)kernel->data; - int32_t input_zp = input->qinfo->zero_point; - int kernel_inner = in_c * kernel_h * kernel_w; for (int oc = 0; oc < out_c * params->group; oc++) { int32_t tmp = 0; diff --git a/source/c908_opt/int8/convolution_1x1_int8.c b/source/c908_opt/int8/convolution_1x1_int8.c index faaa5d50..5bb9355c 100644 --- a/source/c908_opt/int8/convolution_1x1_int8.c +++ b/source/c908_opt/int8/convolution_1x1_int8.c @@ -28,6 +28,8 @@ void shl_c908_conv1x1s1_gemm_reorder_kernel_int8(struct csinn_tensor *kernel, int k = kernel->dim[1]; // in_ch ( kernel->dim[2] = kernel->dim[3] = 1) int k4 = (k % 4 != 0) ? 
((k / 4 + 1) * 4) : k; + csinn_tensor_copy(params->conv_extra.kernel_tm, kernel); + params->conv_extra.kernel_tm->dim[1] = k4; params->conv_extra.kernel_tm->data = (int8_t *)shl_mem_alloc(group * m * k4 * sizeof(int8_t)); int8_t *pa_reorder = (int8_t *)params->conv_extra.kernel_tm->data; @@ -35,6 +37,7 @@ void shl_c908_conv1x1s1_gemm_reorder_kernel_int8(struct csinn_tensor *kernel, shl_c908_reorder_kernel_n8_int8_dot(kernel_data + g * m * k, pa_reorder + g * m * k4, m, k, k); } + kernel->data = NULL; } int shl_c908_conv1x1s1_gemm_int8(struct csinn_tensor *input, struct csinn_tensor *output, diff --git a/source/c908_opt/int8/convolution_gemm_int8.c b/source/c908_opt/int8/convolution_gemm_int8.c index 6a5697c3..5616ef48 100644 --- a/source/c908_opt/int8/convolution_gemm_int8.c +++ b/source/c908_opt/int8/convolution_gemm_int8.c @@ -28,6 +28,8 @@ void shl_c908_conv_im2col_gemm_reorder_kernel_int8(struct csinn_tensor *kernel, int k = kernel->dim[1] * kernel->dim[2] * kernel->dim[3]; int k4 = (k % 4 != 0) ? 
((k / 4 + 1) * 4) : k; + csinn_tensor_copy(params->conv_extra.kernel_tm, kernel); + params->conv_extra.kernel_tm->dim[1] = k4; params->conv_extra.kernel_tm->data = (int8_t *)shl_mem_alloc(group * m * k4 * sizeof(int8_t)); int8_t *pa_reorder = (int8_t *)params->conv_extra.kernel_tm->data; @@ -38,6 +40,7 @@ void shl_c908_conv_im2col_gemm_reorder_kernel_int8(struct csinn_tensor *kernel, // FIXME: free params->conv_extra.kernel_tm->data // memcpy(kernel_data, pa_reorder, group * m * k * sizeof(__fp16)); // shl_mem_free(pa_reorder); + kernel->data = NULL; } int shl_c908_conv_im2col_gemm_int8(struct csinn_tensor *input, struct csinn_tensor *output, diff --git a/source/c920_opt/fp16/convolution.c b/source/c920_opt/fp16/convolution.c index 58d57d86..daf0d6fa 100644 --- a/source/c920_opt/fp16/convolution.c +++ b/source/c920_opt/fp16/convolution.c @@ -83,17 +83,20 @@ int shl_c920_conv2d_init_fp16(struct csinn_tensor *input, struct csinn_tensor *o return CSINN_TRUE; } else { params->conv_extra.conv_mode = CSINN_WINOGRAD; - - // TODO: params->conv_extra.kernel_tm in binary model - struct csinn_tensor *t_kernel = csinn_alloc_tensor(NULL); + if (!binary_model_op_init) { + struct csinn_tensor *t_kernel = csinn_alloc_tensor(NULL); + if ((in_h < 13) && (in_w < 13)) { + shl_rvv_wg_b4f3s1_trans_kernel_packn_fp16(kernel, t_kernel); + } else { + shl_rvv_wg_b6f3s1_trans_kernel_packn_fp16(kernel, t_kernel); + } + params->conv_extra.kernel_tm = t_kernel; + } if ((in_h < 13) && (in_w < 13)) { - shl_rvv_wg_b4f3s1_trans_kernel_packn_fp16(kernel, t_kernel); cb->exec = shl_c920_wg_b4f3s1_packn_fp16; } else { - shl_rvv_wg_b6f3s1_trans_kernel_packn_fp16(kernel, t_kernel); cb->exec = shl_c920_wg_b6f3s1_packn_fp16; } - params->conv_extra.kernel_tm = t_kernel; } } else { params->conv_extra.conv_mode = CSINN_GEMM; diff --git a/source/c920_opt/fp16/fullyconnected.c b/source/c920_opt/fp16/fullyconnected.c index ece91be0..bfe2cbdb 100644 --- a/source/c920_opt/fp16/fullyconnected.c +++ 
b/source/c920_opt/fp16/fullyconnected.c @@ -55,14 +55,9 @@ int shl_c920_fullyconnected_gemm_fp16(struct csinn_tensor *input, struct csinn_t float scale = weights->qinfo->scale; shl_rvv_dequantize_i8_to_f16(weights_int8, weights_fp16, size, zp, scale); } else if (weights->quant_channel == output_depth) { - // support channel quantization - for (int c = 0; c < output_depth; c++) { - int32_t zp = weights->qinfo[c].zero_point; - float scale = weights->qinfo[c].scale; - shl_rvv_dequantize_i8_to_f16(weights_int8 + c * accum_depth, - weights_fp16 + c * accum_depth, accum_depth, zp, - scale); - } + shl_rvv_fc_npack2n_dequantize_per_channel_i8_to_f16(weights, params, weights_fp16); + } else { + shl_debug_error("%s unsupported quant_channel: %d\n", __func__, weights->quant_channel); } weights_data = weights_fp16; } else if (weights->dtype == CSINN_DTYPE_FLOAT16) { diff --git a/source/c920_opt/fp16/gemm_a0nb1n_fp16.c b/source/c920_opt/fp16/gemm_a0nb1n_fp16.c new file mode 100644 index 00000000..61d46c0b --- /dev/null +++ b/source/c920_opt/fp16/gemm_a0nb1n_fp16.c @@ -0,0 +1,864 @@ +/* + * Copyright (C) 2016-2023 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "c920/c920.h" + +static inline void c920_omp_get_mn_partition(int M, int N, int *M_start, int *M_end, int *N_start, + int *N_end) +{ +#ifdef _OPENMP + int rank = omp_get_thread_num(); + int threads = omp_get_num_threads(); + + if (M > 2 * N) { + int q = M / threads; + int r = M % threads; + *M_start = rank < r ? rank * (q + 1) : rank * q + r; + *M_end = rank < r ? (rank + 1) * (q + 1) : (rank + 1) * q + r; + } else if (N > 2 * M) { + int q = N / threads; + int r = N % threads; + *N_start = rank < r ? rank * (q + 1) : rank * q + r; + *N_end = rank < r ? (rank + 1) * (q + 1) : (rank + 1) * q + r; + } else { + // TODO: support any number of threads + float _s = sqrt(threads); + assert(floor(_s + 0.5) == _s); + int t_sqrt = (int)_s; + + int r_rank = rank / t_sqrt; + int c_rank = rank % (int)t_sqrt; + + int M_q = M / t_sqrt; + int M_r = M % t_sqrt; + *M_start = r_rank < M_r ? r_rank * (M_q + 1) : r_rank * M_q + M_r; + *M_end = r_rank < M_r ? (r_rank + 1) * (M_q + 1) : (r_rank + 1) * M_q + M_r; + + int N_q = N / t_sqrt; + int N_r = N % t_sqrt; + *N_start = c_rank < N_r ? c_rank * (N_q + 1) : c_rank * N_q + N_r; + *N_end = c_rank < N_r ? 
(c_rank + 1) * (N_q + 1) : (c_rank + 1) * N_q + N_r; + } +#endif +} + +static inline vfloat16m4_t vdeq_vf_f16m4(vint8m2_t _src, __fp16 scale, int vl) +{ + vint16m4_t _i16 = vwadd_vx_i16m4(_src, 0, vl); + vfloat16m4_t _f16 = vfcvt_f_x_v_f16m4(_i16, vl); + _f16 = vfmul_vf_f16m4(_f16, scale, vl); + return _f16; +} + +/************************************************************* + * constrain: vlen = 128, and K % 32 == 0 + ************************************************************/ +static inline void gemm_dot_1x1_fp16_q8(__fp16 *dst, const __fp16 *sa, const int8_t *sb, + const __fp16 *scale, __fp16 *bias, int M, int K, int N, + int lda, int ldb, int ldc, int k_idx) +{ + int block_size = 32; + int i = 0; + for (; i < M; i++) { + const __fp16 *sa_ptr = sa + i * lda; + int j = 0; + for (; j < N; j++) { + const __fp16 *a0_ptr = sa_ptr; + const int8_t *b0_ptr = sb + j * ldb; + const __fp16 *s0_ptr = scale + j * ldb / block_size; + + // vlen128 e16m4=32 + int vl = vsetvl_e16m4(block_size); + // dst[0, 0] + vfloat16m4_t _acc00 = vfmv_v_f_f16m4(0.0f, vl); + + int c = 0; + for (; c + block_size - 1 < K; c += block_size) { + vfloat16m4_t _a0 = vle16_v_f16m4(a0_ptr + c, vl); + vint8m2_t _b0_i8 = vle8_v_i8m2(b0_ptr + c, vl); + vfloat16m4_t _b0_f32 = vdeq_vf_f16m4(_b0_i8, s0_ptr[0], vl); + s0_ptr += 1; + _acc00 = vfmacc_vv_f16m4(_acc00, _a0, _b0_f32, vl); + } + + int idx00 = (i + 0) * ldc + (j + 0); + vfloat16m1_t _sum00; + if (k_idx == 0) { + _sum00 = vfmv_v_f_f16m1(bias[j + 0], 1); + } else { + _sum00 = vfmv_v_f_f16m1(dst[idx00], 1); + } + + _sum00 = vfredosum_vs_f16m4_f16m1(vundefined_f16m1(), _acc00, _sum00, vl); + dst[idx00] = vfmv_f_s_f16m1_f16(_sum00); + } + } +} + +static void gemm_dot_1x1_fp16_q8_omp(__fp16 *dst, const __fp16 *sa, const int8_t *sb, + const __fp16 *scale, __fp16 *bias, int M, int K, int N, + int lda, int ldb, int ldc, int k_idx) +{ + if (shl_multithread_is_enable()) { +#pragma omp parallel + { + int M_start = 0, M_end = M; + int N_start = 0, N_end = N; 
+ c920_omp_get_mn_partition(M, N, &M_start, &M_end, &N_start, &N_end); + + __fp16 *thread_dst = dst + M_start * ldc + N_start; + const __fp16 *thread_sa = sa + M_start * lda; + const int8_t *thread_sb = sb + N_start * ldb; + const __fp16 *thread_scale = scale + N_start * ldb / 32; + __fp16 *thread_bias = bias + N_start; + int thread_M = M_end - M_start; + int thread_N = N_end - N_start; + gemm_dot_1x1_fp16_q8(thread_dst, thread_sa, thread_sb, thread_scale, thread_bias, + thread_M, K, thread_N, lda, ldb, ldc, k_idx); + } + } else { + gemm_dot_1x1_fp16_q8(dst, sa, sb, scale, bias, M, K, N, lda, ldb, ldc, k_idx); + } +} + +/************************************************************* + * constrain: vlen = 128, and K % 32 == 0 + ************************************************************/ +static inline void gemm_dot_1x1_fp16_q4(__fp16 *dst, const __fp16 *sa, const int8_t *sb, + const __fp16 *scale, __fp16 *bias, int M, int K, int N, + int lda, int ldb, int ldc, int k_idx) +{ + int block_size = 32; + int half_block = block_size / 2; + int i = 0; + for (; i < M; i++) { + const __fp16 *sa_ptr = sa + i * lda; + int j = 0; + for (; j < N; j++) { + const __fp16 *a0_ptr = sa_ptr; + const int8_t *b0_ptr = sb + j * ldb / 2; + const __fp16 *s0_ptr = scale + j * ldb / block_size; + + // vlen128 e16m2=16 + int vl = vsetvl_e16m2(half_block); + // dst[0, 0] + vfloat16m2_t _acc00 = vfmv_v_f_f16m2(0.0f, vl); + + int c = 0; + for (; c + block_size - 1 < K; c += block_size) { + vfloat16m2_t _a00 = vle16_v_f16m2(a0_ptr + c, vl); + vfloat16m2_t _a01 = vle16_v_f16m2(a0_ptr + c + half_block, vl); + + vint8m1_t _b0_i8 = vle8_v_i8m1(b0_ptr, vl); + b0_ptr += half_block; + + vint8m1_t _low_i8 = vand_vx_i8m1(_b0_i8, 0x0f, vl); + vint8m1_t _high_i8 = vsra_vx_i8m1(_b0_i8, 4, vl); + _high_i8 = vand_vx_i8m1(_high_i8, 0x0f, vl); + vint16m2_t _low_i16 = vwsub_vx_i16m2(_low_i8, 8, vl); + vint16m2_t _high_i16 = vwsub_vx_i16m2(_high_i8, 8, vl); + vfloat16m2_t _low_f16 = vfcvt_f_x_v_f16m2(_low_i16, 
vl); + vfloat16m2_t _high_f16 = vfcvt_f_x_v_f16m2(_high_i16, vl); + _low_f16 = vfmul_vf_f16m2(_low_f16, s0_ptr[0], vl); + _high_f16 = vfmul_vf_f16m2(_high_f16, s0_ptr[0], vl); + s0_ptr += 1; + + _acc00 = vfmacc_vv_f16m2(_acc00, _a00, _low_f16, vl); + _acc00 = vfmacc_vv_f16m2(_acc00, _a01, _high_f16, vl); + } + + int idx00 = (i + 0) * ldc + (j + 0); + vfloat16m1_t _sum00; + if (k_idx == 0) { + _sum00 = vfmv_v_f_f16m1(bias[j + 0], 1); + } else { + _sum00 = vfmv_v_f_f16m1(dst[idx00], 1); + } + + _sum00 = vfredosum_vs_f16m2_f16m1(vundefined_f16m1(), _acc00, _sum00, vl); + dst[idx00] = vfmv_f_s_f16m1_f16(_sum00); + } + } +} + +static void gemm_dot_1x1_fp16_q4_omp(__fp16 *dst, const __fp16 *sa, const int8_t *sb, + const __fp16 *scale, __fp16 *bias, int M, int K, int N, + int lda, int ldb, int ldc, int k_idx) +{ + if (shl_multithread_is_enable()) { +#pragma omp parallel + { + int M_start = 0, M_end = M; + int N_start = 0, N_end = N; + c920_omp_get_mn_partition(M, N, &M_start, &M_end, &N_start, &N_end); + + __fp16 *thread_dst = dst + M_start * ldc + N_start; + const __fp16 *thread_sa = sa + M_start * lda; + const int8_t *thread_sb = sb + N_start * ldb / 2; + const __fp16 *thread_scale = scale + N_start * ldb / 32; + __fp16 *thread_bias = bias + N_start; + int thread_M = M_end - M_start; + int thread_N = N_end - N_start; + gemm_dot_1x1_fp16_q4(thread_dst, thread_sa, thread_sb, thread_scale, thread_bias, + thread_M, K, thread_N, lda, ldb, ldc, k_idx); + } + } else { + gemm_dot_1x1_fp16_q4(dst, sa, sb, scale, bias, M, K, N, lda, ldb, ldc, k_idx); + } +} + +/* q4 ****************************************************************************/ + +static inline void gemm_dot_4x4_fp16(__fp16 *dst, const __fp16 *sa, const __fp16 *sb, __fp16 *bias, + int M, int K, int N, int lda, int ldb, int ldc, int k_idx) +{ + int i = 0; + for (; i + 3 < M; i += 4) { + const __fp16 *sa_ptr = sa + i * lda; + int j = 0; + for (; j + 3 < N; j += 4) { + const __fp16 *a0_ptr = sa_ptr; + const __fp16 
*a1_ptr = sa_ptr + 1 * lda; + const __fp16 *a2_ptr = sa_ptr + 2 * lda; + const __fp16 *a3_ptr = sa_ptr + 3 * lda; + const __fp16 *b0_ptr = sb + j * ldb; + const __fp16 *b1_ptr = b0_ptr + 1 * ldb; + const __fp16 *b2_ptr = b0_ptr + 2 * ldb; + const __fp16 *b3_ptr = b0_ptr + 3 * ldb; + + int vlmax = vsetvl_e16m1(csrr_vlenb() / sizeof(__fp16)); + // dst[m, 0] + vfloat16m1_t _acc00 = vfmv_v_f_f16m1(0.0f, vlmax); + vfloat16m1_t _acc10 = vmv_v_v_f16m1(_acc00, vlmax); + vfloat16m1_t _acc20 = vmv_v_v_f16m1(_acc00, vlmax); + vfloat16m1_t _acc30 = vmv_v_v_f16m1(_acc00, vlmax); + // dst[m, 1] + vfloat16m1_t _acc01 = vmv_v_v_f16m1(_acc00, vlmax); + vfloat16m1_t _acc11 = vmv_v_v_f16m1(_acc01, vlmax); + vfloat16m1_t _acc21 = vmv_v_v_f16m1(_acc01, vlmax); + vfloat16m1_t _acc31 = vmv_v_v_f16m1(_acc01, vlmax); + // dst[m, 2] + vfloat16m1_t _acc02 = vmv_v_v_f16m1(_acc00, vlmax); + vfloat16m1_t _acc12 = vmv_v_v_f16m1(_acc02, vlmax); + vfloat16m1_t _acc22 = vmv_v_v_f16m1(_acc02, vlmax); + vfloat16m1_t _acc32 = vmv_v_v_f16m1(_acc02, vlmax); + // dst[m, 3] + vfloat16m1_t _acc03 = vmv_v_v_f16m1(_acc00, vlmax); + vfloat16m1_t _acc13 = vmv_v_v_f16m1(_acc03, vlmax); + vfloat16m1_t _acc23 = vmv_v_v_f16m1(_acc03, vlmax); + vfloat16m1_t _acc33 = vmv_v_v_f16m1(_acc03, vlmax); + + int c = 0; + while (c < K) { + int vl = vsetvl_e16m1(K - c); + vfloat16m1_t _a0 = vle16_v_f16m1(a0_ptr + c, vl); + vfloat16m1_t _a1 = vle16_v_f16m1(a1_ptr + c, vl); + vfloat16m1_t _a2 = vle16_v_f16m1(a2_ptr + c, vl); + vfloat16m1_t _a3 = vle16_v_f16m1(a3_ptr + c, vl); + vfloat16m1_t _b0 = vle16_v_f16m1(b0_ptr + c, vl); + vfloat16m1_t _b1 = vle16_v_f16m1(b1_ptr + c, vl); + vfloat16m1_t _b2 = vle16_v_f16m1(b2_ptr + c, vl); + vfloat16m1_t _b3 = vle16_v_f16m1(b3_ptr + c, vl); + + _acc00 = vfmacc_vv_f16m1(_acc00, _a0, _b0, vlmax); + _acc10 = vfmacc_vv_f16m1(_acc10, _a1, _b0, vlmax); + _acc20 = vfmacc_vv_f16m1(_acc20, _a2, _b0, vlmax); + _acc30 = vfmacc_vv_f16m1(_acc30, _a3, _b0, vlmax); + + _acc01 = vfmacc_vv_f16m1(_acc01, 
_a0, _b1, vlmax); + _acc11 = vfmacc_vv_f16m1(_acc11, _a1, _b1, vlmax); + _acc21 = vfmacc_vv_f16m1(_acc21, _a2, _b1, vlmax); + _acc31 = vfmacc_vv_f16m1(_acc31, _a3, _b1, vlmax); + + _acc02 = vfmacc_vv_f16m1(_acc02, _a0, _b2, vlmax); + _acc12 = vfmacc_vv_f16m1(_acc12, _a1, _b2, vlmax); + _acc22 = vfmacc_vv_f16m1(_acc22, _a2, _b2, vlmax); + _acc32 = vfmacc_vv_f16m1(_acc32, _a3, _b2, vlmax); + + _acc03 = vfmacc_vv_f16m1(_acc03, _a0, _b3, vlmax); + _acc13 = vfmacc_vv_f16m1(_acc13, _a1, _b3, vlmax); + _acc23 = vfmacc_vv_f16m1(_acc23, _a2, _b3, vlmax); + _acc33 = vfmacc_vv_f16m1(_acc33, _a3, _b3, vlmax); + c += vl; + } + + int idx00 = (i + 0) * ldc + (j + 0); + int idx10 = (i + 1) * ldc + (j + 0); + int idx20 = (i + 2) * ldc + (j + 0); + int idx30 = (i + 3) * ldc + (j + 0); + + int idx01 = (i + 0) * ldc + (j + 1); + int idx11 = (i + 1) * ldc + (j + 1); + int idx21 = (i + 2) * ldc + (j + 1); + int idx31 = (i + 3) * ldc + (j + 1); + + int idx02 = (i + 0) * ldc + (j + 2); + int idx12 = (i + 1) * ldc + (j + 2); + int idx22 = (i + 2) * ldc + (j + 2); + int idx32 = (i + 3) * ldc + (j + 2); + + int idx03 = (i + 0) * ldc + (j + 3); + int idx13 = (i + 1) * ldc + (j + 3); + int idx23 = (i + 2) * ldc + (j + 3); + int idx33 = (i + 3) * ldc + (j + 3); + + // dst[m, 0] + vfloat16m1_t _sum00; + vfloat16m1_t _sum10; + vfloat16m1_t _sum20; + vfloat16m1_t _sum30; + // dst[m, 1] + vfloat16m1_t _sum01; + vfloat16m1_t _sum11; + vfloat16m1_t _sum21; + vfloat16m1_t _sum31; + // dst[m, 2] + vfloat16m1_t _sum02; + vfloat16m1_t _sum12; + vfloat16m1_t _sum22; + vfloat16m1_t _sum32; + // dst[m, 3] + vfloat16m1_t _sum03; + vfloat16m1_t _sum13; + vfloat16m1_t _sum23; + vfloat16m1_t _sum33; + if (k_idx == 0) { + _sum00 = vfmv_v_f_f16m1(bias[j + 0], 1); + _sum10 = vmv_v_v_f16m1(_sum00, 1); + _sum20 = vmv_v_v_f16m1(_sum00, 1); + _sum30 = vmv_v_v_f16m1(_sum00, 1); + + _sum01 = vfmv_v_f_f16m1(bias[j + 1], 1); + _sum11 = vmv_v_v_f16m1(_sum01, 1); + _sum21 = vmv_v_v_f16m1(_sum01, 1); + _sum31 = 
vmv_v_v_f16m1(_sum01, 1); + + _sum02 = vfmv_v_f_f16m1(bias[j + 2], 1); + _sum12 = vmv_v_v_f16m1(_sum02, 1); + _sum22 = vmv_v_v_f16m1(_sum02, 1); + _sum32 = vmv_v_v_f16m1(_sum02, 1); + + _sum03 = vfmv_v_f_f16m1(bias[j + 3], 1); + _sum13 = vmv_v_v_f16m1(_sum03, 1); + _sum23 = vmv_v_v_f16m1(_sum03, 1); + _sum33 = vmv_v_v_f16m1(_sum03, 1); + } else { + _sum00 = vfmv_v_f_f16m1(dst[idx00], 1); + _sum10 = vfmv_v_f_f16m1(dst[idx10], 1); + _sum20 = vfmv_v_f_f16m1(dst[idx20], 1); + _sum30 = vfmv_v_f_f16m1(dst[idx30], 1); + + _sum01 = vfmv_v_f_f16m1(dst[idx01], 1); + _sum11 = vfmv_v_f_f16m1(dst[idx11], 1); + _sum21 = vfmv_v_f_f16m1(dst[idx21], 1); + _sum31 = vfmv_v_f_f16m1(dst[idx31], 1); + + _sum02 = vfmv_v_f_f16m1(dst[idx02], 1); + _sum12 = vfmv_v_f_f16m1(dst[idx12], 1); + _sum22 = vfmv_v_f_f16m1(dst[idx22], 1); + _sum32 = vfmv_v_f_f16m1(dst[idx32], 1); + + _sum03 = vfmv_v_f_f16m1(dst[idx03], 1); + _sum13 = vfmv_v_f_f16m1(dst[idx13], 1); + _sum23 = vfmv_v_f_f16m1(dst[idx23], 1); + _sum33 = vfmv_v_f_f16m1(dst[idx33], 1); + } + + _sum00 = vfredosum_vs_f16m1_f16m1(vundefined_f16m1(), _acc00, _sum00, vlmax); + _sum10 = vfredosum_vs_f16m1_f16m1(vundefined_f16m1(), _acc10, _sum10, vlmax); + _sum20 = vfredosum_vs_f16m1_f16m1(vundefined_f16m1(), _acc20, _sum20, vlmax); + _sum30 = vfredosum_vs_f16m1_f16m1(vundefined_f16m1(), _acc30, _sum30, vlmax); + + _sum01 = vfredosum_vs_f16m1_f16m1(vundefined_f16m1(), _acc01, _sum01, vlmax); + _sum11 = vfredosum_vs_f16m1_f16m1(vundefined_f16m1(), _acc11, _sum11, vlmax); + _sum21 = vfredosum_vs_f16m1_f16m1(vundefined_f16m1(), _acc21, _sum21, vlmax); + _sum31 = vfredosum_vs_f16m1_f16m1(vundefined_f16m1(), _acc31, _sum31, vlmax); + + _sum02 = vfredosum_vs_f16m1_f16m1(vundefined_f16m1(), _acc02, _sum02, vlmax); + _sum12 = vfredosum_vs_f16m1_f16m1(vundefined_f16m1(), _acc12, _sum12, vlmax); + _sum22 = vfredosum_vs_f16m1_f16m1(vundefined_f16m1(), _acc22, _sum22, vlmax); + _sum32 = vfredosum_vs_f16m1_f16m1(vundefined_f16m1(), _acc32, _sum32, vlmax); + 
+ _sum03 = vfredosum_vs_f16m1_f16m1(vundefined_f16m1(), _acc03, _sum03, vlmax); + _sum13 = vfredosum_vs_f16m1_f16m1(vundefined_f16m1(), _acc13, _sum13, vlmax); + _sum23 = vfredosum_vs_f16m1_f16m1(vundefined_f16m1(), _acc23, _sum23, vlmax); + _sum33 = vfredosum_vs_f16m1_f16m1(vundefined_f16m1(), _acc33, _sum33, vlmax); + + dst[idx00] = vfmv_f_s_f16m1_f16(_sum00); + dst[idx10] = vfmv_f_s_f16m1_f16(_sum10); + dst[idx20] = vfmv_f_s_f16m1_f16(_sum20); + dst[idx30] = vfmv_f_s_f16m1_f16(_sum30); + + dst[idx01] = vfmv_f_s_f16m1_f16(_sum01); + dst[idx11] = vfmv_f_s_f16m1_f16(_sum11); + dst[idx21] = vfmv_f_s_f16m1_f16(_sum21); + dst[idx31] = vfmv_f_s_f16m1_f16(_sum31); + + dst[idx02] = vfmv_f_s_f16m1_f16(_sum02); + dst[idx12] = vfmv_f_s_f16m1_f16(_sum12); + dst[idx22] = vfmv_f_s_f16m1_f16(_sum22); + dst[idx32] = vfmv_f_s_f16m1_f16(_sum32); + + dst[idx03] = vfmv_f_s_f16m1_f16(_sum03); + dst[idx13] = vfmv_f_s_f16m1_f16(_sum13); + dst[idx23] = vfmv_f_s_f16m1_f16(_sum23); + dst[idx33] = vfmv_f_s_f16m1_f16(_sum33); + } + for (; j < N; j++) { + const __fp16 *a0_ptr = sa_ptr; + const __fp16 *a1_ptr = sa_ptr + 1 * lda; + const __fp16 *a2_ptr = sa_ptr + 2 * lda; + const __fp16 *a3_ptr = sa_ptr + 3 * lda; + const __fp16 *b0_ptr = sb + j * ldb; + + int vlmax = vsetvl_e16m1(csrr_vlenb() / sizeof(__fp16)); + // dst[m, 0] + vfloat16m1_t _acc00 = vfmv_v_f_f16m1(0.0f, vlmax); + vfloat16m1_t _acc10 = vmv_v_v_f16m1(_acc00, vlmax); + vfloat16m1_t _acc20 = vmv_v_v_f16m1(_acc00, vlmax); + vfloat16m1_t _acc30 = vmv_v_v_f16m1(_acc00, vlmax); + + int c = 0; + while (c < K) { + int vl = vsetvl_e16m1(K - c); + vfloat16m1_t _a0 = vle16_v_f16m1(a0_ptr + c, vl); + vfloat16m1_t _a1 = vle16_v_f16m1(a1_ptr + c, vl); + vfloat16m1_t _a2 = vle16_v_f16m1(a2_ptr + c, vl); + vfloat16m1_t _a3 = vle16_v_f16m1(a3_ptr + c, vl); + vfloat16m1_t _b0 = vle16_v_f16m1(b0_ptr + c, vl); + + _acc00 = vfmacc_vv_f16m1(_acc00, _a0, _b0, vlmax); + _acc10 = vfmacc_vv_f16m1(_acc10, _a1, _b0, vlmax); + _acc20 = 
vfmacc_vv_f16m1(_acc20, _a2, _b0, vlmax); + _acc30 = vfmacc_vv_f16m1(_acc30, _a3, _b0, vlmax); + c += vl; + } + + int idx00 = (i + 0) * ldc + (j + 0); + int idx10 = (i + 1) * ldc + (j + 0); + int idx20 = (i + 2) * ldc + (j + 0); + int idx30 = (i + 3) * ldc + (j + 0); + + // dst[m, 0] + vfloat16m1_t _sum00; + vfloat16m1_t _sum10; + vfloat16m1_t _sum20; + vfloat16m1_t _sum30; + if (k_idx == 0) { + _sum00 = vfmv_v_f_f16m1(bias[j + 0], 1); + _sum10 = vmv_v_v_f16m1(_sum00, 1); + _sum20 = vmv_v_v_f16m1(_sum00, 1); + _sum30 = vmv_v_v_f16m1(_sum00, 1); + } else { + _sum00 = vfmv_v_f_f16m1(dst[idx00], 1); + _sum10 = vfmv_v_f_f16m1(dst[idx10], 1); + _sum20 = vfmv_v_f_f16m1(dst[idx20], 1); + _sum30 = vfmv_v_f_f16m1(dst[idx30], 1); + } + + _sum00 = vfredosum_vs_f16m1_f16m1(vundefined_f16m1(), _acc00, _sum00, vlmax); + _sum10 = vfredosum_vs_f16m1_f16m1(vundefined_f16m1(), _acc10, _sum10, vlmax); + _sum20 = vfredosum_vs_f16m1_f16m1(vundefined_f16m1(), _acc20, _sum20, vlmax); + _sum30 = vfredosum_vs_f16m1_f16m1(vundefined_f16m1(), _acc30, _sum30, vlmax); + + dst[idx00] = vfmv_f_s_f16m1_f16(_sum00); + dst[idx10] = vfmv_f_s_f16m1_f16(_sum10); + dst[idx20] = vfmv_f_s_f16m1_f16(_sum20); + dst[idx30] = vfmv_f_s_f16m1_f16(_sum30); + } + } + for (; i < M; i += 1) { + const __fp16 *sa_ptr = sa + i * lda; + int j = 0; + for (; j + 3 < N; j += 4) { + const __fp16 *a0_ptr = sa_ptr; + const __fp16 *b0_ptr = sb + j * ldb; + const __fp16 *b1_ptr = b0_ptr + 1 * ldb; + const __fp16 *b2_ptr = b0_ptr + 2 * ldb; + const __fp16 *b3_ptr = b0_ptr + 3 * ldb; + + int vlmax = vsetvl_e16m1(csrr_vlenb() / sizeof(__fp16)); + // dst[0, n] + vfloat16m1_t _acc00 = vfmv_v_f_f16m1(0.0f, vlmax); + vfloat16m1_t _acc01 = vmv_v_v_f16m1(_acc00, vlmax); + vfloat16m1_t _acc02 = vmv_v_v_f16m1(_acc00, vlmax); + vfloat16m1_t _acc03 = vmv_v_v_f16m1(_acc00, vlmax); + + int c = 0; + while (c < K) { + int vl = vsetvl_e16m1(K - c); + vfloat16m1_t _a0 = vle16_v_f16m1(a0_ptr + c, vl); + vfloat16m1_t _b0 = vle16_v_f16m1(b0_ptr + 
c, vl); + vfloat16m1_t _b1 = vle16_v_f16m1(b1_ptr + c, vl); + vfloat16m1_t _b2 = vle16_v_f16m1(b2_ptr + c, vl); + vfloat16m1_t _b3 = vle16_v_f16m1(b3_ptr + c, vl); + + _acc00 = vfmacc_vv_f16m1(_acc00, _a0, _b0, vlmax); + _acc01 = vfmacc_vv_f16m1(_acc01, _a0, _b1, vlmax); + _acc02 = vfmacc_vv_f16m1(_acc02, _a0, _b2, vlmax); + _acc03 = vfmacc_vv_f16m1(_acc03, _a0, _b3, vlmax); + c += vl; + } + + int idx00 = (i + 0) * ldc + (j + 0); + int idx01 = (i + 0) * ldc + (j + 1); + int idx02 = (i + 0) * ldc + (j + 2); + int idx03 = (i + 0) * ldc + (j + 3); + + // dst[0, n] + vfloat16m1_t _sum00; + vfloat16m1_t _sum01; + vfloat16m1_t _sum02; + vfloat16m1_t _sum03; + if (k_idx == 0) { + _sum00 = vfmv_v_f_f16m1(bias[j + 0], 1); + _sum01 = vfmv_v_f_f16m1(bias[j + 1], 1); + _sum02 = vfmv_v_f_f16m1(bias[j + 2], 1); + _sum03 = vfmv_v_f_f16m1(bias[j + 3], 1); + } else { + _sum00 = vfmv_v_f_f16m1(dst[idx00], 1); + _sum01 = vfmv_v_f_f16m1(dst[idx01], 1); + _sum02 = vfmv_v_f_f16m1(dst[idx02], 1); + _sum03 = vfmv_v_f_f16m1(dst[idx03], 1); + } + + _sum00 = vfredosum_vs_f16m1_f16m1(vundefined_f16m1(), _acc00, _sum00, vlmax); + _sum01 = vfredosum_vs_f16m1_f16m1(vundefined_f16m1(), _acc01, _sum01, vlmax); + _sum02 = vfredosum_vs_f16m1_f16m1(vundefined_f16m1(), _acc02, _sum02, vlmax); + _sum03 = vfredosum_vs_f16m1_f16m1(vundefined_f16m1(), _acc03, _sum03, vlmax); + + dst[idx00] = vfmv_f_s_f16m1_f16(_sum00); + dst[idx01] = vfmv_f_s_f16m1_f16(_sum01); + dst[idx02] = vfmv_f_s_f16m1_f16(_sum02); + dst[idx03] = vfmv_f_s_f16m1_f16(_sum03); + } + for (; j < N; j++) { + const __fp16 *a0_ptr = sa_ptr; + const __fp16 *b0_ptr = sb + j * ldb; + + int vlmax = vsetvl_e16m1(csrr_vlenb() / sizeof(__fp16)); + // dst[0, 0] + vfloat16m1_t _acc00 = vfmv_v_f_f16m1(0.0f, vlmax); + + int c = 0; + while (c < K) { + int vl = vsetvl_e16m1(K - c); + vfloat16m1_t _a0 = vle16_v_f16m1(a0_ptr + c, vl); + vfloat16m1_t _b0 = vle16_v_f16m1(b0_ptr + c, vl); + _acc00 = vfmacc_vv_f16m1(_acc00, _a0, _b0, vlmax); + c += vl; + } + + 
int idx00 = (i + 0) * ldc + (j + 0); + + // dst[0, 0] + vfloat16m1_t _sum00; + if (k_idx == 0) { + _sum00 = vfmv_v_f_f16m1(bias[j + 0], 1); + } else { + _sum00 = vfmv_v_f_f16m1(dst[idx00], 1); + } + + _sum00 = vfredosum_vs_f16m1_f16m1(vundefined_f16m1(), _acc00, _sum00, vlmax); + dst[idx00] = vfmv_f_s_f16m1_f16(_sum00); + } + } +} + +static void gemm_dot_4x4_fp16_omp(__fp16 *dst, const __fp16 *sa, const __fp16 *sb, __fp16 *bias, + int M, int K, int N, int lda, int ldb, int ldc, int k_idx) +{ + if (shl_multithread_is_enable()) { +#pragma omp parallel + { + int M_start = 0, M_end = M; + int N_start = 0, N_end = N; + c920_omp_get_mn_partition(M, N, &M_start, &M_end, &N_start, &N_end); + + __fp16 *thread_dst = dst + M_start * ldc + N_start; + const __fp16 *thread_sa = sa + M_start * lda; + const __fp16 *thread_sb = sb + N_start * ldb; + __fp16 *thread_bias = bias + N_start; + int thread_M = M_end - M_start; + int thread_N = N_end - N_start; + gemm_dot_4x4_fp16(thread_dst, thread_sa, thread_sb, thread_bias, thread_M, K, thread_N, + lda, ldb, ldc, k_idx); + } + } else { + gemm_dot_4x4_fp16(dst, sa, sb, bias, M, K, N, lda, ldb, ldc, k_idx); + } +} + +#define MIN(a, b) ((a) > (b) ? 
#define MIN(a, b) ((a) > (b) ? (b) : (a))
/* Largest third-dimension extent c such that x*y + c*(x + y) <= SIZE,
 * i.e. how much of a remaining dimension still fits in the cache budget
 * next to an x*y tile. */
#define CAL_LAST(SIZE, x, y) ((SIZE - x * y) / (x + y))

/* Choose cache-friendly block sizes (m_blk, n_blk, k_blk) for an
 * [M, K] x [K, N] fp16 GEMM.
 * Dimensions not exceeding their default block size (256/256/370) are taken
 * whole; the remaining dimension(s) are sized with CAL_LAST so the working
 * set stays within ~75% of a 1 MiB cache (counted in fp16 elements).
 * Partial n blocks are kept a multiple of 4 and partial k blocks a multiple
 * of 32 (the quantization block size).
 * Fix vs. original: the round-down steps can no longer yield a zero block
 * size (tmp_n < 4 or tmp_k < 32), which would make the callers'
 * "while (idx < dim) { ... idx += block; }" loops spin forever.
 */
static inline void c920_get_blk_size(int M, int N, int K, int *m_blk, int *n_blk, int *k_blk)
{
    const int M_BLK = 256;
    const int N_BLK = 256;
    const int K_BLK = 370;
    /* 75% of 1 MiB in fp16 elements; sizeof(__fp16) == 2 */
    const int CACHE_SIZE = 1024 * 1024 / 2 * 0.75;

    if (M <= M_BLK && N <= N_BLK && K <= K_BLK) {
        /* whole problem fits the default blocks */
        *m_blk = M;
        *n_blk = N;
        *k_blk = K;
    } else if (M > M_BLK && N > N_BLK && K > K_BLK) {
        *m_blk = M_BLK;
        *n_blk = N_BLK;
        *k_blk = K_BLK;
    } else {
        /* exactly one or two dimensions exceed their default block:
         * fix the small ones, size the rest to fill the cache budget */
        if (M <= M_BLK && N <= N_BLK && K > K_BLK) {
            *m_blk = M;
            *n_blk = N;
            *k_blk = MIN(CAL_LAST(CACHE_SIZE, *m_blk, *n_blk), K);
        } else if (M <= M_BLK && N > N_BLK && K <= K_BLK) {
            *m_blk = M;
            *k_blk = K;
            *n_blk = MIN(CAL_LAST(CACHE_SIZE, *m_blk, *k_blk), N);
        } else if (M > M_BLK && N <= N_BLK && K <= K_BLK) {
            *n_blk = N;
            *k_blk = K;
            *m_blk = MIN(CAL_LAST(CACHE_SIZE, *n_blk, *k_blk), M);
        } else if (M > M_BLK && N > N_BLK && K <= K_BLK) {
            *k_blk = K;
            int tmp_m = M_BLK;
            *n_blk = MIN(CAL_LAST(CACHE_SIZE, tmp_m, *k_blk), N);
            *m_blk = MIN(CAL_LAST(CACHE_SIZE, *n_blk, *k_blk), M);
        } else if (M > M_BLK && N <= N_BLK && K > K_BLK) {
            *n_blk = N;
            int tmp_m = M_BLK;
            *k_blk = MIN(CAL_LAST(CACHE_SIZE, tmp_m, *n_blk), K);
            *m_blk = MIN(CAL_LAST(CACHE_SIZE, *n_blk, *k_blk), M);
        } else if (M <= M_BLK && N > N_BLK && K > K_BLK) {
            *m_blk = M;
            int tmp_n = N_BLK;
            *k_blk = MIN(CAL_LAST(CACHE_SIZE, tmp_n, *m_blk), K);
            *n_blk = MIN(CAL_LAST(CACHE_SIZE, *m_blk, *k_blk), N);
        }
    }

    /* keep a partial n block a multiple of 4, but never round to zero */
    int tmp_n = *n_blk;
    if (tmp_n < N && tmp_n % 4 != 0 && tmp_n > 4) {
        *n_blk = (tmp_n / 4) * 4;
    }

    /* keep a partial k block a multiple of the quantization block (32),
     * but never round to zero */
    int tmp_k = *k_blk;
    const int block_size = 32;
    if (tmp_k < K && tmp_k % block_size != 0 && tmp_k > block_size) {
        *k_blk = (tmp_k / block_size) * block_size;
    }
}
/*************************************************************
 * constrain: vlen >= 128, and K % 32 == 0
 ************************************************************/
/* Dequantize an [n_blk, k_blk] tile of q8 weights into fp16.
 * src:    int8 weights, row stride ld_src elements.
 * scale:  one fp16 scale per 32 weights (ld_src / 32 scales per row).
 * dst:    fp16 output tile, row stride ld_dst.
 */
static void dequantize_block_q8_to_f16(const int8_t *src, const __fp16 *scale, __fp16 *dst,
                                       int n_blk, int k_blk, int ld_src, int ld_dst)
{
    int block_size = 32;
    /* vlen >= 128 lets one e8m2 op cover a whole 32-element block */
    int vl = vsetvl_e8m2(block_size);
    for (int i = 0; i < n_blk; i++) {
        const int8_t *s_ptr = src + i * ld_src;
        const __fp16 *scale_ptr = scale + i * ld_src / block_size;
        __fp16 *d_ptr = dst + i * ld_dst;
        for (int j = 0; j + block_size - 1 < k_blk; j += block_size) {
            vint8m2_t _i8 = vle8_v_i8m2(s_ptr + j, vl);
            vint16m4_t _i16 = vwadd_vx_i16m4(_i8, 0, vl); /* widen i8 -> i16 */
            vfloat16m4_t _f16 = vfcvt_f_x_v_f16m4(_i16, vl);
            _f16 = vfmul_vf_f16m4(_f16, scale_ptr[0], vl); /* per-block scale */
            scale_ptr += 1;
            vse16_v_f16m4(d_ptr + j, _f16, vl);
        }
    }
}

/* GEMM with q8-quantized weights: dst[M, N] = sa[M, K] * sb[N, K]^T + bias.
 * scale holds one fp16 scale per 32 weights (K % 32 == 0 assumed).
 * For M > 1 the problem is tiled (c920_get_blk_size) and each weight tile is
 * dequantized to fp16 before the blocked fp16 kernel runs; for M == 1 (GEMV)
 * the weights are dequantized on the fly inside the dot kernel.
 * If bias is NULL a temporary bias buffer is allocated and freed here.
 */
void shl_c920_gemm_a0nb1n_dot_fp16_q8(__fp16 *dst, const __fp16 *sa, const int8_t *sb, __fp16 *bias,
                                      int M, int K, int N, const __fp16 *scale)
{
    int flag_bias = 1;
    if (bias == NULL) {
        flag_bias = 0;
        /* assumes shl_mem_alloc returns zeroed memory - TODO confirm */
        bias = (__fp16 *)shl_mem_alloc(N * sizeof(__fp16));
    }

    if (M > 1) {
        int M_BLK, N_BLK, K_BLK;
        c920_get_blk_size(M, N, K, &M_BLK, &N_BLK, &K_BLK);

        __fp16 *b_fp16 = (__fp16 *)shl_mem_alloc(N_BLK * K_BLK * sizeof(__fp16));
        int lda = K;
        int ldb = K_BLK;  // after dequantize
        int ldc = N;

        int m_block = M_BLK;
        int m_idx = 0;
        while (m_idx < M) {
            if (M - m_idx < m_block) {
                m_block = M - m_idx;  /* last partial m block */
            }
            int n_block = N_BLK;
            int n_idx = 0;
            while (n_idx < N) {
                if (N - n_idx < n_block) {
                    n_block = N - n_idx;
                }
                int k_block = K_BLK;
                int k_idx = 0;
                while (k_idx < K) {
                    if (K - k_idx < k_block) {
                        k_block = K - k_idx;
                    }
                    __fp16 *c_ptr = dst + m_idx * N + n_idx;
                    const __fp16 *a_ptr = sa + m_idx * K + k_idx;
                    const int8_t *b_ptr = sb + n_idx * K + k_idx;
                    /* one scale per 32 weights along K */
                    const __fp16 *scale_ptr = scale + n_idx * (K / 32) + k_idx / 32;

                    // dequantize before gemm
                    dequantize_block_q8_to_f16(b_ptr, scale_ptr, b_fp16, n_block, k_block, K,
                                               K_BLK);
                    /* k_idx selects bias-init (0) vs. accumulate-into-dst */
                    gemm_dot_4x4_fp16_omp(c_ptr, a_ptr, b_fp16, bias + n_idx, m_block, k_block,
                                          n_block, lda, ldb, ldc, k_idx);

                    k_idx += k_block;
                }
                n_idx += n_block;
            }
            m_idx += m_block;
        }

        shl_mem_free(b_fp16);
    } else {
        int lda = K;
        int ldb = K;
        int ldc = N;
        // dequantize in gemm
        gemm_dot_1x1_fp16_q8_omp(dst, sa, sb, scale, bias, M, K, N, lda, ldb, ldc, 0);
    }

    if (!flag_bias) {
        shl_mem_free(bias);
        bias = NULL;
    }
}
/*************************************************************
 * constrain: vlen >= 128, and K % 32 == 0
 ************************************************************/
/* Dequantize an [n_blk, k_blk] tile of q4 weights into fp16.
 * Each source byte packs two 4-bit values (low nibble = first half of the
 * 32-element block, high nibble = second half); values are stored with a +8
 * offset, subtracted after unpacking. One fp16 scale covers each block of
 * 32 values.
 * src:    packed int4 weights, row stride ld_src/2 bytes.
 * dst:    fp16 output tile, row stride ld_dst.
 */
static void dequantize_block_q4_to_f16(const int8_t *src, const __fp16 *scale, __fp16 *dst,
                                       int n_blk, int k_blk, int ld_src, int ld_dst)
{
    int block_size = 32;
    int half_block = block_size / 2;
    int vl = vsetvl_e8m1(half_block);
    for (int i = 0; i < n_blk; i++) {
        const int8_t *s_ptr = src + i * ld_src / 2;  /* two values per byte */
        const __fp16 *scale_ptr = scale + i * ld_src / block_size;
        __fp16 *d_ptr = dst + i * ld_dst;
        for (int j = 0; j + block_size - 1 < k_blk; j += block_size) {
            vint8m1_t _in = vle8_v_i8m1(s_ptr, vl);
            s_ptr += half_block;
            /* split low / high nibbles */
            vint8m1_t _low_i8 = vand_vx_i8m1(_in, 0x0f, vl);
            vint8m1_t _high_i8 = vsra_vx_i8m1(_in, 4, vl);
            _high_i8 = vand_vx_i8m1(_high_i8, 0x0f, vl);
            /* widen and remove the +8 storage offset */
            vint16m2_t _low_i16 = vwsub_vx_i16m2(_low_i8, 8, vl);
            vint16m2_t _high_i16 = vwsub_vx_i16m2(_high_i8, 8, vl);
            vfloat16m2_t _low_f16 = vfcvt_f_x_v_f16m2(_low_i16, vl);
            vfloat16m2_t _high_f16 = vfcvt_f_x_v_f16m2(_high_i16, vl);
            _low_f16 = vfmul_vf_f16m2(_low_f16, scale_ptr[0], vl);
            _high_f16 = vfmul_vf_f16m2(_high_f16, scale_ptr[0], vl);
            scale_ptr += 1;
            vse16_v_f16m2(d_ptr, _low_f16, vl);
            vse16_v_f16m2(d_ptr + half_block, _high_f16, vl);
            d_ptr += block_size;
        }
    }
}
/* GEMM with q4-quantized weights: dst[M, N] = sa[M, K] * sb[N, K]^T + bias.
 * sb packs two 4-bit weights per byte; scale holds one fp16 scale per 32
 * weights (K % 32 == 0 assumed). For M > 1 the problem is tiled and each
 * weight tile is dequantized to fp16 before the blocked fp16 kernel runs;
 * for M == 1 (GEMV) the weights are dequantized on the fly inside the dot
 * kernel. If bias is NULL a temporary bias buffer is allocated and freed.
 */
void shl_c920_gemm_a0nb1n_dot_fp16_q4(__fp16 *dst, const __fp16 *sa, const int8_t *sb, __fp16 *bias,
                                      int M, int K, int N, const __fp16 *scale)
{
    int flag_bias = 1;
    if (bias == NULL) {
        flag_bias = 0;
        /* assumes shl_mem_alloc returns zeroed memory - TODO confirm */
        bias = (__fp16 *)shl_mem_alloc(N * sizeof(__fp16));
    }

    if (M > 1) {
        int M_BLK, N_BLK, K_BLK;
        c920_get_blk_size(M, N, K, &M_BLK, &N_BLK, &K_BLK);

        __fp16 *b_fp16 = (__fp16 *)shl_mem_alloc(N_BLK * K_BLK * sizeof(__fp16));
        int lda = K;
        int ldb = K_BLK;  // after dequantize
        int ldc = N;

        int m_block = M_BLK;
        int m_idx = 0;
        while (m_idx < M) {
            if (M - m_idx < m_block) {
                m_block = M - m_idx;  /* last partial m block */
            }

            int n_block = N_BLK;
            int n_idx = 0;
            while (n_idx < N) {
                if (N - n_idx < n_block) {
                    n_block = N - n_idx;
                }

                int k_block = K_BLK;
                int k_idx = 0;
                while (k_idx < K) {
                    if (K - k_idx < k_block) {
                        k_block = K - k_idx;
                    }

                    __fp16 *c_ptr = dst + m_idx * N + n_idx;
                    const __fp16 *a_ptr = sa + m_idx * K + k_idx;
                    /* /2: two packed weights per byte */
                    const int8_t *b_ptr = sb + n_idx * K / 2 + k_idx / 2;
                    const __fp16 *scale_ptr = scale + n_idx * (K / 32) + k_idx / 32;

                    // dequantize before gemm
                    dequantize_block_q4_to_f16(b_ptr, scale_ptr, b_fp16, n_block, k_block, K,
                                               K_BLK);
                    /* k_idx selects bias-init (0) vs. accumulate-into-dst */
                    gemm_dot_4x4_fp16_omp(c_ptr, a_ptr, b_fp16, bias + n_idx, m_block, k_block,
                                          n_block, lda, ldb, ldc, k_idx);

                    k_idx += k_block;
                }

                n_idx += n_block;
            }

            m_idx += m_block;
        }
        shl_mem_free(b_fp16);
    } else {
        int lda = K;
        int ldb = K;
        int ldc = N;
        // dequantize in gemm
        gemm_dot_1x1_fp16_q4_omp(dst, sa, sb, scale, bias, M, K, N, lda, ldb, ldc, 0);
    }

    if (!flag_bias) {
        shl_mem_free(bias);
        bias = NULL;
    }
}
#include "c920/c920.h"

/*************************************************************
 * packn = vlenb / sizeof(__fp16)
 * m_blk: 8/4/2/1
 * n_blk: pack2n/packn/n_tail
 *
 * dst - output: [M, N]
 * sa - input: [M, K]
 * sb - weights: [N/n_blk, K, n_blk]
 * bias: [N]
 ************************************************************/
/* Register-tiled fp16 GEMM for pre-reordered weights: rows of the input are
 * processed in tiers of 8/4/2/1, columns in tiers of pack2n then a vl-sized
 * tail. Accumulators start from bias (a zeroed buffer is allocated when bias
 * is NULL), so dst is written, not accumulated into. */
void shl_c920_gemm_a0nb1r_8xpack2n_fp16(__fp16 *dst, const __fp16 *sa, const __fp16 *sb,
                                        __fp16 *bias, int M, int K, int N)
{
    const int packn = csrr_vlenb() / sizeof(__fp16);
    const int pack2n = packn * 2;

    int flag_bias = 1;
    if (bias == NULL) {
        flag_bias = 0;
        /* assumes shl_mem_alloc returns zeroed memory - TODO confirm */
        bias = (__fp16 *)shl_mem_alloc(N * sizeof(__fp16));
    }

    int i = 0;
    /* ---- 8-row tier ---- */
    for (; i + 7 < M; i += 8) {
        const __fp16 *sa_ptr = sa + i * K;
        int j = 0;
        int vl = vsetvl_e16m1(packn);
        for (; j + pack2n - 1 < N; j += pack2n) {
            const __fp16 *a_ptr = sa_ptr;
            const __fp16 *b0_ptr = sb + j * K;
            const __fp16 *b1_ptr = b0_ptr + packn;
            __fp16 *c0_ptr = dst + i * N + j;
            __fp16 *c1_ptr = c0_ptr + packn;

            // [n, 0]
            vfloat16m1_t _acc00 = vle16_v_f16m1(bias + j, vl);
            vfloat16m1_t _acc10 = vmv_v_v_f16m1(_acc00, vl);
            vfloat16m1_t _acc20 = vmv_v_v_f16m1(_acc00, vl);
            vfloat16m1_t _acc30 = vmv_v_v_f16m1(_acc00, vl);
            vfloat16m1_t _acc40 = vmv_v_v_f16m1(_acc00, vl);
            vfloat16m1_t _acc50 = vmv_v_v_f16m1(_acc00, vl);
            vfloat16m1_t _acc60 = vmv_v_v_f16m1(_acc00, vl);
            vfloat16m1_t _acc70 = vmv_v_v_f16m1(_acc00, vl);
            // [n, 1]
            vfloat16m1_t _acc01 = vle16_v_f16m1(bias + j + packn, vl);
            vfloat16m1_t _acc11 = vmv_v_v_f16m1(_acc01, vl);
            vfloat16m1_t _acc21 = vmv_v_v_f16m1(_acc01, vl);
            vfloat16m1_t _acc31 = vmv_v_v_f16m1(_acc01, vl);
            vfloat16m1_t _acc41 = vmv_v_v_f16m1(_acc01, vl);
            vfloat16m1_t _acc51 = vmv_v_v_f16m1(_acc01, vl);
            vfloat16m1_t _acc61 = vmv_v_v_f16m1(_acc01, vl);
            vfloat16m1_t _acc71 = vmv_v_v_f16m1(_acc01, vl);

            /* broadcast one a value per row, FMA against two weight vectors */
            for (int c = 0; c < K; c++) {
                vfloat16m1_t _a0 = vfmv_v_f_f16m1(a_ptr[0], vl);
                vfloat16m1_t _a1 = vfmv_v_f_f16m1(a_ptr[1 * K], vl);
                vfloat16m1_t _a2 = vfmv_v_f_f16m1(a_ptr[2 * K], vl);
                vfloat16m1_t _a3 = vfmv_v_f_f16m1(a_ptr[3 * K], vl);
                vfloat16m1_t _a4 = vfmv_v_f_f16m1(a_ptr[4 * K], vl);
                vfloat16m1_t _a5 = vfmv_v_f_f16m1(a_ptr[5 * K], vl);
                vfloat16m1_t _a6 = vfmv_v_f_f16m1(a_ptr[6 * K], vl);
                vfloat16m1_t _a7 = vfmv_v_f_f16m1(a_ptr[7 * K], vl);
                a_ptr += 1;
                vfloat16m1_t _b0 = vle16_v_f16m1(b0_ptr, vl);
                vfloat16m1_t _b1 = vle16_v_f16m1(b1_ptr, vl);
                b0_ptr += pack2n;
                b1_ptr += pack2n;

                _acc00 = vfmacc_vv_f16m1(_acc00, _a0, _b0, vl);
                _acc10 = vfmacc_vv_f16m1(_acc10, _a1, _b0, vl);
                _acc20 = vfmacc_vv_f16m1(_acc20, _a2, _b0, vl);
                _acc30 = vfmacc_vv_f16m1(_acc30, _a3, _b0, vl);
                _acc40 = vfmacc_vv_f16m1(_acc40, _a4, _b0, vl);
                _acc50 = vfmacc_vv_f16m1(_acc50, _a5, _b0, vl);
                _acc60 = vfmacc_vv_f16m1(_acc60, _a6, _b0, vl);
                _acc70 = vfmacc_vv_f16m1(_acc70, _a7, _b0, vl);
                _acc01 = vfmacc_vv_f16m1(_acc01, _a0, _b1, vl);
                _acc11 = vfmacc_vv_f16m1(_acc11, _a1, _b1, vl);
                _acc21 = vfmacc_vv_f16m1(_acc21, _a2, _b1, vl);
                _acc31 = vfmacc_vv_f16m1(_acc31, _a3, _b1, vl);
                _acc41 = vfmacc_vv_f16m1(_acc41, _a4, _b1, vl);
                _acc51 = vfmacc_vv_f16m1(_acc51, _a5, _b1, vl);
                _acc61 = vfmacc_vv_f16m1(_acc61, _a6, _b1, vl);
                _acc71 = vfmacc_vv_f16m1(_acc71, _a7, _b1, vl);
            }

            vse16_v_f16m1(c0_ptr, _acc00, vl);
            vse16_v_f16m1(c0_ptr + N, _acc10, vl);
            vse16_v_f16m1(c0_ptr + N * 2, _acc20, vl);
            vse16_v_f16m1(c0_ptr + N * 3, _acc30, vl);
            vse16_v_f16m1(c0_ptr + N * 4, _acc40, vl);
            vse16_v_f16m1(c0_ptr + N * 5, _acc50, vl);
            vse16_v_f16m1(c0_ptr + N * 6, _acc60, vl);
            vse16_v_f16m1(c0_ptr + N * 7, _acc70, vl);
            vse16_v_f16m1(c1_ptr, _acc01, vl);
            vse16_v_f16m1(c1_ptr + N, _acc11, vl);
            vse16_v_f16m1(c1_ptr + N * 2, _acc21, vl);
            vse16_v_f16m1(c1_ptr + N * 3, _acc31, vl);
            vse16_v_f16m1(c1_ptr + N * 4, _acc41, vl);
            vse16_v_f16m1(c1_ptr + N * 5, _acc51, vl);
            vse16_v_f16m1(c1_ptr + N * 6, _acc61, vl);
            vse16_v_f16m1(c1_ptr + N * 7, _acc71, vl);
        }
        /* column tail: vl-sized steps */
        while (j < N) {
            int vl = vsetvl_e16m1(N - j);
            const __fp16 *a_ptr = sa_ptr;
            const __fp16 *b0_ptr = sb + j * K;
            __fp16 *c0_ptr = dst + i * N + j;

            vfloat16m1_t _acc00 = vle16_v_f16m1(bias + j, vl);
            vfloat16m1_t _acc10 = vmv_v_v_f16m1(_acc00, vl);
            vfloat16m1_t _acc20 = vmv_v_v_f16m1(_acc00, vl);
            vfloat16m1_t _acc30 = vmv_v_v_f16m1(_acc00, vl);
            vfloat16m1_t _acc40 = vmv_v_v_f16m1(_acc00, vl);
            vfloat16m1_t _acc50 = vmv_v_v_f16m1(_acc00, vl);
            vfloat16m1_t _acc60 = vmv_v_v_f16m1(_acc00, vl);
            vfloat16m1_t _acc70 = vmv_v_v_f16m1(_acc00, vl);

            for (int c = 0; c < K; c++) {
                vfloat16m1_t _a0 = vfmv_v_f_f16m1(a_ptr[0], vl);
                vfloat16m1_t _a1 = vfmv_v_f_f16m1(a_ptr[1 * K], vl);
                vfloat16m1_t _a2 = vfmv_v_f_f16m1(a_ptr[2 * K], vl);
                vfloat16m1_t _a3 = vfmv_v_f_f16m1(a_ptr[3 * K], vl);
                vfloat16m1_t _a4 = vfmv_v_f_f16m1(a_ptr[4 * K], vl);
                vfloat16m1_t _a5 = vfmv_v_f_f16m1(a_ptr[5 * K], vl);
                vfloat16m1_t _a6 = vfmv_v_f_f16m1(a_ptr[6 * K], vl);
                vfloat16m1_t _a7 = vfmv_v_f_f16m1(a_ptr[7 * K], vl);
                a_ptr += 1;
                vfloat16m1_t _b0 = vle16_v_f16m1(b0_ptr, vl);
                b0_ptr += vl;

                _acc00 = vfmacc_vv_f16m1(_acc00, _a0, _b0, vl);
                _acc10 = vfmacc_vv_f16m1(_acc10, _a1, _b0, vl);
                _acc20 = vfmacc_vv_f16m1(_acc20, _a2, _b0, vl);
                _acc30 = vfmacc_vv_f16m1(_acc30, _a3, _b0, vl);
                _acc40 = vfmacc_vv_f16m1(_acc40, _a4, _b0, vl);
                _acc50 = vfmacc_vv_f16m1(_acc50, _a5, _b0, vl);
                _acc60 = vfmacc_vv_f16m1(_acc60, _a6, _b0, vl);
                _acc70 = vfmacc_vv_f16m1(_acc70, _a7, _b0, vl);
            }

            vse16_v_f16m1(c0_ptr, _acc00, vl);
            vse16_v_f16m1(c0_ptr + N, _acc10, vl);
            vse16_v_f16m1(c0_ptr + N * 2, _acc20, vl);
            vse16_v_f16m1(c0_ptr + N * 3, _acc30, vl);
            vse16_v_f16m1(c0_ptr + N * 4, _acc40, vl);
            vse16_v_f16m1(c0_ptr + N * 5, _acc50, vl);
            vse16_v_f16m1(c0_ptr + N * 6, _acc60, vl);
            vse16_v_f16m1(c0_ptr + N * 7, _acc70, vl);
            j += vl;
        }
    }
    /* ---- 4-row tier ---- */
    for (; i + 3 < M; i += 4) {
        const __fp16 *sa_ptr = sa + i * K;
        int j = 0;
        int vl = vsetvl_e16m1(packn);
        for (; j + pack2n - 1 < N; j += pack2n) {
            const __fp16 *a_ptr = sa_ptr;
            const __fp16 *b0_ptr = sb + j * K;
            const __fp16 *b1_ptr = b0_ptr + packn;
            __fp16 *c0_ptr = dst + i * N + j;
            __fp16 *c1_ptr = c0_ptr + packn;

            vfloat16m1_t _acc00 = vle16_v_f16m1(bias + j, vl);
            vfloat16m1_t _acc10 = vmv_v_v_f16m1(_acc00, vl);
            vfloat16m1_t _acc20 = vmv_v_v_f16m1(_acc00, vl);
            vfloat16m1_t _acc30 = vmv_v_v_f16m1(_acc00, vl);
            vfloat16m1_t _acc01 = vle16_v_f16m1(bias + j + packn, vl);
            vfloat16m1_t _acc11 = vmv_v_v_f16m1(_acc01, vl);
            vfloat16m1_t _acc21 = vmv_v_v_f16m1(_acc01, vl);
            vfloat16m1_t _acc31 = vmv_v_v_f16m1(_acc01, vl);

            for (int c = 0; c < K; c++) {
                vfloat16m1_t _a0 = vfmv_v_f_f16m1(a_ptr[0], vl);
                vfloat16m1_t _a1 = vfmv_v_f_f16m1(a_ptr[1 * K], vl);
                vfloat16m1_t _a2 = vfmv_v_f_f16m1(a_ptr[2 * K], vl);
                vfloat16m1_t _a3 = vfmv_v_f_f16m1(a_ptr[3 * K], vl);
                a_ptr += 1;
                vfloat16m1_t _b0 = vle16_v_f16m1(b0_ptr, vl);
                vfloat16m1_t _b1 = vle16_v_f16m1(b1_ptr, vl);
                b0_ptr += pack2n;
                b1_ptr += pack2n;

                _acc00 = vfmacc_vv_f16m1(_acc00, _a0, _b0, vl);
                _acc10 = vfmacc_vv_f16m1(_acc10, _a1, _b0, vl);
                _acc20 = vfmacc_vv_f16m1(_acc20, _a2, _b0, vl);
                _acc30 = vfmacc_vv_f16m1(_acc30, _a3, _b0, vl);
                _acc01 = vfmacc_vv_f16m1(_acc01, _a0, _b1, vl);
                _acc11 = vfmacc_vv_f16m1(_acc11, _a1, _b1, vl);
                _acc21 = vfmacc_vv_f16m1(_acc21, _a2, _b1, vl);
                _acc31 = vfmacc_vv_f16m1(_acc31, _a3, _b1, vl);
            }

            vse16_v_f16m1(c0_ptr, _acc00, vl);
            vse16_v_f16m1(c0_ptr + N, _acc10, vl);
            vse16_v_f16m1(c0_ptr + N * 2, _acc20, vl);
            vse16_v_f16m1(c0_ptr + N * 3, _acc30, vl);
            vse16_v_f16m1(c1_ptr, _acc01, vl);
            vse16_v_f16m1(c1_ptr + N, _acc11, vl);
            vse16_v_f16m1(c1_ptr + N * 2, _acc21, vl);
            vse16_v_f16m1(c1_ptr + N * 3, _acc31, vl);
        }
        while (j < N) {
            int vl = vsetvl_e16m1(N - j);
            const __fp16 *a_ptr = sa_ptr;
            const __fp16 *b0_ptr = sb + j * K;
            __fp16 *c0_ptr = dst + i * N + j;

            vfloat16m1_t _acc00 = vle16_v_f16m1(bias + j, vl);
            vfloat16m1_t _acc10 = vmv_v_v_f16m1(_acc00, vl);
            vfloat16m1_t _acc20 = vmv_v_v_f16m1(_acc00, vl);
            vfloat16m1_t _acc30 = vmv_v_v_f16m1(_acc00, vl);

            for (int c = 0; c < K; c++) {
                vfloat16m1_t _a0 = vfmv_v_f_f16m1(a_ptr[0], vl);
                vfloat16m1_t _a1 = vfmv_v_f_f16m1(a_ptr[1 * K], vl);
                vfloat16m1_t _a2 = vfmv_v_f_f16m1(a_ptr[2 * K], vl);
                vfloat16m1_t _a3 = vfmv_v_f_f16m1(a_ptr[3 * K], vl);
                a_ptr += 1;
                vfloat16m1_t _b0 = vle16_v_f16m1(b0_ptr, vl);
                b0_ptr += vl;

                _acc00 = vfmacc_vv_f16m1(_acc00, _a0, _b0, vl);
                _acc10 = vfmacc_vv_f16m1(_acc10, _a1, _b0, vl);
                _acc20 = vfmacc_vv_f16m1(_acc20, _a2, _b0, vl);
                _acc30 = vfmacc_vv_f16m1(_acc30, _a3, _b0, vl);
            }

            vse16_v_f16m1(c0_ptr, _acc00, vl);
            vse16_v_f16m1(c0_ptr + N, _acc10, vl);
            vse16_v_f16m1(c0_ptr + N * 2, _acc20, vl);
            vse16_v_f16m1(c0_ptr + N * 3, _acc30, vl);
            j += vl;
        }
    }
    /* ---- 2-row tier ---- */
    for (; i + 1 < M; i += 2) {
        const __fp16 *sa_ptr = sa + i * K;
        int j = 0;
        int vl = vsetvl_e16m1(packn);
        for (; j + pack2n - 1 < N; j += pack2n) {
            const __fp16 *a_ptr = sa_ptr;
            const __fp16 *b0_ptr = sb + j * K;
            const __fp16 *b1_ptr = b0_ptr + packn;
            __fp16 *c0_ptr = dst + i * N + j;
            __fp16 *c1_ptr = c0_ptr + packn;

            vfloat16m1_t _acc00 = vle16_v_f16m1(bias + j, vl);
            vfloat16m1_t _acc10 = vmv_v_v_f16m1(_acc00, vl);
            vfloat16m1_t _acc01 = vle16_v_f16m1(bias + j + packn, vl);
            vfloat16m1_t _acc11 = vmv_v_v_f16m1(_acc01, vl);

            for (int c = 0; c < K; c++) {
                vfloat16m1_t _a0 = vfmv_v_f_f16m1(a_ptr[0], vl);
                vfloat16m1_t _a1 = vfmv_v_f_f16m1(a_ptr[1 * K], vl);
                a_ptr += 1;
                vfloat16m1_t _b0 = vle16_v_f16m1(b0_ptr, vl);
                vfloat16m1_t _b1 = vle16_v_f16m1(b1_ptr, vl);
                b0_ptr += pack2n;
                b1_ptr += pack2n;

                _acc00 = vfmacc_vv_f16m1(_acc00, _a0, _b0, vl);
                _acc10 = vfmacc_vv_f16m1(_acc10, _a1, _b0, vl);
                _acc01 = vfmacc_vv_f16m1(_acc01, _a0, _b1, vl);
                _acc11 = vfmacc_vv_f16m1(_acc11, _a1, _b1, vl);
            }

            vse16_v_f16m1(c0_ptr, _acc00, vl);
            vse16_v_f16m1(c0_ptr + N, _acc10, vl);
            vse16_v_f16m1(c1_ptr, _acc01, vl);
            vse16_v_f16m1(c1_ptr + N, _acc11, vl);
        }
        while (j < N) {
            int vl = vsetvl_e16m1(N - j);
            const __fp16 *a_ptr = sa_ptr;
            const __fp16 *b0_ptr = sb + j * K;
            __fp16 *c0_ptr = dst + i * N + j;

            vfloat16m1_t _acc00 = vle16_v_f16m1(bias + j, vl);
            vfloat16m1_t _acc10 = vmv_v_v_f16m1(_acc00, vl);

            for (int c = 0; c < K; c++) {
                vfloat16m1_t _a0 = vfmv_v_f_f16m1(a_ptr[0], vl);
                vfloat16m1_t _a1 = vfmv_v_f_f16m1(a_ptr[1 * K], vl);
                a_ptr += 1;
                vfloat16m1_t _b0 = vle16_v_f16m1(b0_ptr, vl);
                b0_ptr += vl;

                _acc00 = vfmacc_vv_f16m1(_acc00, _a0, _b0, vl);
                _acc10 = vfmacc_vv_f16m1(_acc10, _a1, _b0, vl);
            }

            vse16_v_f16m1(c0_ptr, _acc00, vl);
            vse16_v_f16m1(c0_ptr + N, _acc10, vl);
            j += vl;
        }
    }
    /* ---- 1-row tier ---- */
    for (; i < M; i++) {
        const __fp16 *sa_ptr = sa + i * K;
        int j = 0;
        int vl = vsetvl_e16m1(packn);
        for (; j + pack2n - 1 < N; j += pack2n) {
            const __fp16 *a_ptr = sa_ptr;
            const __fp16 *b0_ptr = sb + j * K;
            const __fp16 *b1_ptr = b0_ptr + packn;
            __fp16 *c0_ptr = dst + i * N + j;
            __fp16 *c1_ptr = c0_ptr + packn;

            vfloat16m1_t _acc00 = vle16_v_f16m1(bias + j, vl);
            vfloat16m1_t _acc01 = vle16_v_f16m1(bias + j + packn, vl);

            for (int c = 0; c < K; c++) {
                vfloat16m1_t _a0 = vfmv_v_f_f16m1(a_ptr[0], vl);
                a_ptr += 1;
                vfloat16m1_t _b0 = vle16_v_f16m1(b0_ptr, vl);
                vfloat16m1_t _b1 = vle16_v_f16m1(b1_ptr, vl);
                b0_ptr += pack2n;
                b1_ptr += pack2n;

                _acc00 = vfmacc_vv_f16m1(_acc00, _a0, _b0, vl);
                _acc01 = vfmacc_vv_f16m1(_acc01, _a0, _b1, vl);
            }

            vse16_v_f16m1(c0_ptr, _acc00, vl);
            vse16_v_f16m1(c1_ptr, _acc01, vl);
        }
        while (j < N) {
            int vl = vsetvl_e16m1(N - j);
            const __fp16 *a_ptr = sa_ptr;
            const __fp16 *b0_ptr = sb + j * K;
            __fp16 *c0_ptr = dst + i * N + j;

            vfloat16m1_t _acc00 = vle16_v_f16m1(bias + j, vl);

            for (int c = 0; c < K; c++) {
                vfloat16m1_t _a0 = vfmv_v_f_f16m1(a_ptr[0], vl);
                a_ptr += 1;
                vfloat16m1_t _b0 = vle16_v_f16m1(b0_ptr, vl);
                b0_ptr += vl;

                _acc00 = vfmacc_vv_f16m1(_acc00, _a0, _b0, vl);
            }

            vse16_v_f16m1(c0_ptr, _acc00, vl);
            j += vl;
        }
    }

    if (!flag_bias) {
        shl_mem_free(bias);
        bias = NULL;
    }
}
/* Dequantize a vector of q8 weights to fp16: widen i8 -> i16, convert to
 * fp16, and multiply by the per-block scale. */
static inline vfloat16m4_t vdeq_vf_f16m4(vint8m2_t _src, __fp16 scale, int vl)
{
    vint16m4_t _i16 = vwadd_vx_i16m4(_src, 0, vl);
    vfloat16m4_t _f16 = vfcvt_f_x_v_f16m4(_i16, vl);
    _f16 = vfmul_vf_f16m4(_f16, scale, vl);
    return _f16;
}

/**
 * 1 the multithread is simply divided by 32, if necessary, it can be redesigned by thread rank.
 */
/* GEMV-style dot kernel for q8 weights rearranged in 32-column groups:
 * dst[i, j..j+31] = sum_k sa[i, k] * dequant(sb). Each group carries K
 * scales (one per k). Columns are only covered in whole groups of 32 -
 * assumes N % 32 == 0, TODO confirm. The loop body is duplicated because
 * the omp pragma must be attached at compile time. */
static inline void gemm_dot_1x1_fp16_q8_rearrange(__fp16 *dst, const __fp16 *sa, const int8_t *sb,
                                                  const __fp16 *scale, int M, int K, int N)
{
    int block_size = 32;
    for (int i = 0; i < M; i++) {
        int vl = vsetvl_e16m4(block_size);
        const __fp16 *a0_ptr = sa + i * K;

        if (shl_multithread_is_enable()) {
#pragma omp parallel for
            for (int j = 0; j < N - 31; j += 32) {
                const __fp16 *s0_ptr = scale + j / 32 * K;  /* K scales per group */
                const int8_t *b0_ptr = sb + j * K;
                __fp16 *dst_ptr = dst + i * N + j;
                vfloat16m4_t _acc00 = vfmv_v_f_f16m4(0.0f, vl);
                for (int k = 0; k < K; k++) {
                    vint8m2_t _b0_i8 = vle8_v_i8m2(b0_ptr, vl);
                    /* fp16 value despite the _f32 name */
                    vfloat16m4_t _b0_f32 = vdeq_vf_f16m4(_b0_i8, s0_ptr[0], vl);
                    _acc00 = vfmacc_vf_f16m4(_acc00, a0_ptr[k], _b0_f32, vl);
                    s0_ptr += 1;
                    b0_ptr += block_size;
                }
                vse16_v_f16m4(dst_ptr, _acc00, vl);
            }
        } else {
            for (int j = 0; j < N - 31; j += 32) {
                const __fp16 *s0_ptr = scale + j / 32 * K;
                const int8_t *b0_ptr = sb + j * K;
                __fp16 *dst_ptr = dst + i * N + j;
                vfloat16m4_t _acc00 = vfmv_v_f_f16m4(0.0f, vl);
                for (int k = 0; k < K; k++) {
                    vint8m2_t _b0_i8 = vle8_v_i8m2(b0_ptr, vl);
                    vfloat16m4_t _b0_f32 = vdeq_vf_f16m4(_b0_i8, s0_ptr[0], vl);
                    _acc00 = vfmacc_vf_f16m4(_acc00, a0_ptr[k], _b0_f32, vl);
                    s0_ptr += 1;
                    b0_ptr += block_size;
                }
                vse16_v_f16m4(dst_ptr, _acc00, vl);
            }
        }
    }
}

/* Public entry for q8-rearranged GEMV. NOTE(review): the bias parameter is
 * accepted for interface symmetry but not applied here - confirm callers
 * handle bias separately. */
void shl_c920_gemm_a0nb1_dot_fp16_q8_rearrange(__fp16 *dst, const __fp16 *sa, const int8_t *sb,
                                               __fp16 *bias, int M, int K, int N,
                                               const __fp16 *scale)
{
    // dequantize in gemm
    gemm_dot_1x1_fp16_q8_rearrange(dst, sa, sb, scale, M, K, N);
}
/**
 * 1 for int4 weight in single core, the computational efficiency may be improved.
 * 2 the multithread is simply divided by 32, if necessary, it can be redesigned by thread rank.
 */
/* GEMV-style dot kernel for q4 weights rearranged in 32-column groups.
 * Each byte packs two 4-bit values (+8 storage offset, removed after
 * unpacking); each group carries K scales. Columns are only covered in
 * whole groups of 32 - assumes N % 32 == 0, TODO confirm. Loop body is
 * duplicated because the omp pragma must be attached at compile time. */
static inline void gemm_dot_1x1_fp16_q4_rearrange(__fp16 *dst, const __fp16 *sa, const int8_t *sb,
                                                  const __fp16 *scale, int M, int K, int N)
{
    int block_size = 32;
    int half_block = block_size / 2;  // 16
    for (int i = 0; i < M; i++) {
        int vl = vsetvl_e16m2(half_block);
        const __fp16 *a0_ptr = sa + i * K;

        if (shl_multithread_is_enable()) {
#pragma omp parallel for
            for (int j = 0; j < N - 31; j += 32) {
                const __fp16 *s0_ptr = scale + j / 32 * K;  /* K scales per group */
                const int8_t *b0_ptr = sb + j / 2 * K;      /* two values per byte */
                __fp16 *dst_ptr = dst + i * N + j;

                vfloat16m2_t _acc00 = vfmv_v_f_f16m2(0.0f, vl);
                vfloat16m2_t _acc01 = vfmv_v_f_f16m2(0.0f, vl);
                for (int k = 0; k < K; k++) {
                    vint8m1_t _b0_i8 = vle8_v_i8m1(b0_ptr, vl);
                    /* split nibbles, widen, remove +8 offset, scale */
                    vint8m1_t _low_i8 = vand_vx_i8m1(_b0_i8, 0x0f, vl);
                    vint8m1_t _high_i8 = vsra_vx_i8m1(_b0_i8, 4, vl);
                    _high_i8 = vand_vx_i8m1(_high_i8, 0x0f, vl);
                    vint16m2_t _low_i16 = vwsub_vx_i16m2(_low_i8, 8, vl);
                    vint16m2_t _high_i16 = vwsub_vx_i16m2(_high_i8, 8, vl);
                    vfloat16m2_t _low_f16 = vfcvt_f_x_v_f16m2(_low_i16, vl);
                    vfloat16m2_t _high_f16 = vfcvt_f_x_v_f16m2(_high_i16, vl);
                    _low_f16 = vfmul_vf_f16m2(_low_f16, s0_ptr[0], vl);
                    _high_f16 = vfmul_vf_f16m2(_high_f16, s0_ptr[0], vl);
                    _acc00 = vfmacc_vf_f16m2(_acc00, a0_ptr[k], _low_f16, vl);
                    _acc01 = vfmacc_vf_f16m2(_acc01, a0_ptr[k], _high_f16, vl);
                    s0_ptr += 1;
                    b0_ptr += half_block;
                }
                vse16_v_f16m2(dst_ptr, _acc00, vl);
                vse16_v_f16m2(dst_ptr + half_block, _acc01, vl);
            }
        } else {
            for (int j = 0; j < N - 31; j += 32) {
                const __fp16 *s0_ptr = scale + j / 32 * K;
                const int8_t *b0_ptr = sb + j / 2 * K;
                __fp16 *dst_ptr = dst + i * N + j;

                vfloat16m2_t _acc00 = vfmv_v_f_f16m2(0.0f, vl);
                vfloat16m2_t _acc01 = vfmv_v_f_f16m2(0.0f, vl);
                for (int k = 0; k < K; k++) {
                    vint8m1_t _b0_i8 = vle8_v_i8m1(b0_ptr, vl);
                    vint8m1_t _low_i8 = vand_vx_i8m1(_b0_i8, 0x0f, vl);
                    vint8m1_t _high_i8 = vsra_vx_i8m1(_b0_i8, 4, vl);
                    _high_i8 = vand_vx_i8m1(_high_i8, 0x0f, vl);
                    vint16m2_t _low_i16 = vwsub_vx_i16m2(_low_i8, 8, vl);
                    vint16m2_t _high_i16 = vwsub_vx_i16m2(_high_i8, 8, vl);
                    vfloat16m2_t _low_f16 = vfcvt_f_x_v_f16m2(_low_i16, vl);
                    vfloat16m2_t _high_f16 = vfcvt_f_x_v_f16m2(_high_i16, vl);
                    _low_f16 = vfmul_vf_f16m2(_low_f16, s0_ptr[0], vl);
                    _high_f16 = vfmul_vf_f16m2(_high_f16, s0_ptr[0], vl);
                    _acc00 = vfmacc_vf_f16m2(_acc00, a0_ptr[k], _low_f16, vl);
                    _acc01 = vfmacc_vf_f16m2(_acc01, a0_ptr[k], _high_f16, vl);
                    s0_ptr += 1;
                    b0_ptr += half_block;
                }
                vse16_v_f16m2(dst_ptr, _acc00, vl);
                vse16_v_f16m2(dst_ptr + half_block, _acc01, vl);
            }
        }
    }
}

/* Public entry for q4-rearranged GEMV. NOTE(review): the bias parameter is
 * accepted for interface symmetry but not applied here - confirm callers
 * handle bias separately. */
void shl_c920_gemm_a0nb1_dot_fp16_q4_rearrange(__fp16 *dst, const __fp16 *sa, const int8_t *sb,
                                               __fp16 *bias, int M, int K, int N,
                                               const __fp16 *scale)
{
    // dequantize in gemm
    gemm_dot_1x1_fp16_q4_rearrange(dst, sa, sb, scale, M, K, N);
}
2 : 1)]; - if (!params->trans_a && !params->trans_b) { - if (batches_a == batches_b) { - __fp16 *in0 = (__fp16 *)shl_mem_alloc(dim_m * dim_k * sizeof(__fp16)); - __fp16 *in1; - if (!(mat1->is_const)) { - in1 = (__fp16 *)shl_mem_alloc(dim_k * dim_n * sizeof(__fp16)); - } - - for (int b = 0; b < batches_a; b++) { - shl_c920_reorder_a_block_8xk_fp16(mat0_data, in0, dim_m, dim_k, MATMUL_M_BLK, - MATMUL_K_BLK); - if (!(mat1->is_const)) { - shl_rvv_reorder_b_block_pack2nxk_fp16(mat1_data, in1, dim_k, dim_n, - MATMUL_K_BLK, MATMUL_N_BLK); - } else { - in1 = mat1_data; - } - - shl_c920_gemm_block_8xpack2n_fp16(output_data, in0, in1, NULL, dim_m, dim_k, dim_n, - MATMUL_M_BLK, MATMUL_K_BLK, MATMUL_N_BLK); + if (batches_a == batches_b) { + __fp16 *in0 = (__fp16 *)shl_mem_alloc(dim_m * dim_k * sizeof(__fp16)); + __fp16 *in1; + if (!(mat1->is_const)) { + in1 = (__fp16 *)shl_mem_alloc(dim_k * dim_n * sizeof(__fp16)); + } - mat0_data += dim_m * dim_k; - mat1_data += dim_k * dim_n; - output_data += dim_m * dim_n; - } - shl_mem_free(in0); - if (!(mat1->is_const)) { - shl_mem_free(in1); - } - // requantize - shl_rvv_sidcso_op_requantize_fp16(mat0, output, mat1); - } else if (batches_a > 1 && batches_b == 1) { - __fp16 *in0 = (__fp16 *)shl_mem_alloc(dim_m * dim_k * sizeof(__fp16)); - __fp16 *in1; + for (int b = 0; b < batches_a; b++) { + shl_c920_reorder_a_block_8xk_fp16(mat0_data, in0, dim_m, dim_k, MATMUL_M_BLK, + MATMUL_K_BLK); if (!(mat1->is_const)) { - in1 = (__fp16 *)shl_mem_alloc(dim_k * dim_n * sizeof(__fp16)); shl_rvv_reorder_b_block_pack2nxk_fp16(mat1_data, in1, dim_k, dim_n, MATMUL_K_BLK, MATMUL_N_BLK); } else { in1 = mat1_data; } - for (int b = 0; b < batches_a; b++) { - shl_c920_reorder_a_block_8xk_fp16(mat0_data, in0, dim_m, dim_k, MATMUL_M_BLK, - MATMUL_K_BLK); - - shl_c920_gemm_block_8xpack2n_fp16(output_data, in0, in1, NULL, dim_m, dim_k, dim_n, - MATMUL_M_BLK, MATMUL_K_BLK, MATMUL_N_BLK); + shl_c920_gemm_block_8xpack2n_fp16(output_data, in0, in1, NULL, dim_m, dim_k, 
dim_n, + MATMUL_M_BLK, MATMUL_K_BLK, MATMUL_N_BLK); - mat0_data += dim_m * dim_k; - output_data += dim_m * dim_n; - } - shl_mem_free(in0); - if (!(mat1->is_const)) { - shl_mem_free(in1); - } - // requantize - shl_rvv_sidcso_op_requantize_fp16(mat0, output, mat1); + mat0_data += dim_m * dim_k; + mat1_data += dim_k * dim_n; + output_data += dim_m * dim_n; + } + shl_mem_free(in0); + if (!(mat1->is_const)) { + shl_mem_free(in1); + } + // requantize + shl_rvv_sidcso_op_requantize_fp16(mat0, output, mat1); + } else if (batches_a > 1 && batches_b == 1) { + __fp16 *in0 = (__fp16 *)shl_mem_alloc(dim_m * dim_k * sizeof(__fp16)); + __fp16 *in1; + if (!(mat1->is_const)) { + in1 = (__fp16 *)shl_mem_alloc(dim_k * dim_n * sizeof(__fp16)); + shl_rvv_reorder_b_block_pack2nxk_fp16(mat1_data, in1, dim_k, dim_n, MATMUL_K_BLK, + MATMUL_N_BLK); } else { - shl_debug_error("matmul unsupported this broadcast\n"); - return CSINN_FALSE; + in1 = mat1_data; + } + + for (int b = 0; b < batches_a; b++) { + shl_c920_reorder_a_block_8xk_fp16(mat0_data, in0, dim_m, dim_k, MATMUL_M_BLK, + MATMUL_K_BLK); + + shl_c920_gemm_block_8xpack2n_fp16(output_data, in0, in1, NULL, dim_m, dim_k, dim_n, + MATMUL_M_BLK, MATMUL_K_BLK, MATMUL_N_BLK); + + mat0_data += dim_m * dim_k; + output_data += dim_m * dim_n; } + shl_mem_free(in0); + if (!(mat1->is_const)) { + shl_mem_free(in1); + } + // requantize + shl_rvv_sidcso_op_requantize_fp16(mat0, output, mat1); } else { - return shl_ref_matmul_quant(mat0, mat1, output, params); + shl_debug_error("matmul unsupported this broadcast\n"); + return CSINN_FALSE; } return CSINN_TRUE; } -int shl_c920_matmul_fp16_w_int8(struct csinn_tensor *mat0, struct csinn_tensor *mat1, - struct csinn_tensor *output, struct csinn_matmul_params *params) +int shl_c920_matmul_a0b0_fp16_w_int8(struct csinn_tensor *mat0, struct csinn_tensor *mat1, + struct csinn_tensor *output, + struct csinn_matmul_params *params) { if (mat0->layout >= CSINN_LAYOUT_NC1C0 && mat0->layout <= CSINN_LAYOUT_NC1DHWC0) 
{ shl_rvv_tensor_nc1xc0_to_ndarray_replace_fp16(mat0); @@ -164,53 +161,222 @@ int shl_c920_matmul_fp16_w_int8(struct csinn_tensor *mat0, struct csinn_tensor * float scale = mat1->qinfo->scale; int size1 = csinn_tensor_size(mat1); - if (!params->trans_a && !params->trans_b) { - if (batches_a == batches_b) { - __fp16 *in0 = (__fp16 *)shl_mem_alloc(dim_m * dim_k * sizeof(__fp16)); - __fp16 *in1 = (__fp16 *)shl_mem_alloc(size1 * sizeof(__fp16)); - shl_rvv_dequantize_i8_to_f16(mat1_data, in1, size1, zp, scale); + if (batches_a == batches_b) { + __fp16 *in0 = (__fp16 *)shl_mem_alloc(dim_m * dim_k * sizeof(__fp16)); + __fp16 *in1 = (__fp16 *)shl_mem_alloc(size1 * sizeof(__fp16)); + shl_rvv_dequantize_i8_to_f16(mat1_data, in1, size1, zp, scale); - for (int b = 0; b < batches_a; b++) { - shl_c920_reorder_a_block_8xk_fp16(mat0_data, in0, dim_m, dim_k, MATMUL_M_BLK, - MATMUL_K_BLK); + for (int b = 0; b < batches_a; b++) { + shl_c920_reorder_a_block_8xk_fp16(mat0_data, in0, dim_m, dim_k, MATMUL_M_BLK, + MATMUL_K_BLK); - shl_c920_gemm_block_8xpack2n_fp16(output_data, in0, in1 + b * dim_k * dim_n, NULL, - dim_m, dim_k, dim_n, MATMUL_M_BLK, MATMUL_K_BLK, - MATMUL_N_BLK); + shl_c920_gemm_block_8xpack2n_fp16(output_data, in0, in1 + b * dim_k * dim_n, NULL, + dim_m, dim_k, dim_n, MATMUL_M_BLK, MATMUL_K_BLK, + MATMUL_N_BLK); - mat0_data += dim_m * dim_k; - output_data += dim_m * dim_n; - } - shl_mem_free(in0); - shl_mem_free(in1); - } else if (batches_a > 1 && batches_b == 1) { - __fp16 *in0 = (__fp16 *)shl_mem_alloc(dim_m * dim_k * sizeof(__fp16)); - __fp16 *in1 = (__fp16 *)shl_mem_alloc(size1 * sizeof(__fp16)); - shl_rvv_dequantize_i8_to_f16(mat1_data, in1, size1, zp, scale); + mat0_data += dim_m * dim_k; + output_data += dim_m * dim_n; + } + shl_mem_free(in0); + shl_mem_free(in1); + } else if (batches_a > 1 && batches_b == 1) { + __fp16 *in0 = (__fp16 *)shl_mem_alloc(dim_m * dim_k * sizeof(__fp16)); + __fp16 *in1 = (__fp16 *)shl_mem_alloc(size1 * sizeof(__fp16)); + 
shl_rvv_dequantize_i8_to_f16(mat1_data, in1, size1, zp, scale); - for (int b = 0; b < batches_a; b++) { - shl_c920_reorder_a_block_8xk_fp16(mat0_data, in0, dim_m, dim_k, MATMUL_M_BLK, - MATMUL_K_BLK); + for (int b = 0; b < batches_a; b++) { + shl_c920_reorder_a_block_8xk_fp16(mat0_data, in0, dim_m, dim_k, MATMUL_M_BLK, + MATMUL_K_BLK); - shl_c920_gemm_block_8xpack2n_fp16(output_data, in0, in1, NULL, dim_m, dim_k, dim_n, - MATMUL_M_BLK, MATMUL_K_BLK, MATMUL_N_BLK); + shl_c920_gemm_block_8xpack2n_fp16(output_data, in0, in1, NULL, dim_m, dim_k, dim_n, + MATMUL_M_BLK, MATMUL_K_BLK, MATMUL_N_BLK); - mat0_data += dim_m * dim_k; - output_data += dim_m * dim_n; - } - shl_mem_free(in0); - shl_mem_free(in1); - } else { - shl_debug_error("matmul unsupported this broadcast\n"); - return CSINN_FALSE; + mat0_data += dim_m * dim_k; + output_data += dim_m * dim_n; } + shl_mem_free(in0); + shl_mem_free(in1); } else { - return shl_ref_matmul_quant(mat0, mat1, output, params); + shl_debug_error("matmul unsupported this broadcast\n"); + return CSINN_FALSE; } return CSINN_TRUE; } +/************************************************************* + * packn = vlenb / sizeof(__fp16) + * n_blk: pack2n/packn/n_tail + * src: [n, k] + * dst: [n/n_blk, k, n_blk] + ************************************************************/ +static void reorder_mat1_npack2n_fp16(const __fp16 *src, __fp16 *dst, int n, int k) +{ + const int packn = csrr_vlenb() / sizeof(__fp16); + const int pack2n = packn * 2; + + int i = 0; + int vl = vsetvl_e16m2(pack2n); + for (; i + pack2n - 1 < n; i += pack2n) { + const __fp16 *s_ptr = src + i * k; + for (int j = 0; j < k; j++) { + vfloat16m2_t _src = vlse16_v_f16m2(s_ptr, k * sizeof(__fp16), vl); + vse16_v_f16m2(dst, _src, vl); + s_ptr += 1; + dst += vl; + } + } + while (i < n) { + int vl = vsetvl_e16m1(n - i); + const __fp16 *s_ptr = src + i * k; + for (int j = 0; j < k; j++) { + vfloat16m1_t _src = vlse16_v_f16m1(s_ptr, k * sizeof(__fp16), vl); + vse16_v_f16m1(dst, _src, 
vl); + s_ptr += 1; + dst += vl; + } + i += vl; + } +} + +int shl_c920_matmul_a0b1_fp16(struct csinn_tensor *mat0, struct csinn_tensor *mat1, + struct csinn_tensor *output, struct csinn_matmul_params *params) +{ + if (mat0->layout >= CSINN_LAYOUT_NC1C0 && mat0->layout <= CSINN_LAYOUT_NC1DHWC0) { + shl_rvv_tensor_nc1xc0_to_ndarray_replace_fp16(mat0); + } + if (mat1->layout >= CSINN_LAYOUT_NC1C0 && mat1->layout <= CSINN_LAYOUT_NC1DHWC0) { + shl_rvv_tensor_nc1xc0_to_ndarray_replace_fp16(mat1); + } + + __fp16 *mat0_data = (__fp16 *)mat0->data; + __fp16 *mat1_data = (__fp16 *)mat1->data; + __fp16 *output_data = (__fp16 *)output->data; + + const int dims_count = mat0->dim_count; + int batches_a = 1; + int batches_b = 1; + + /* compute the outer size */ + for (int i = 0; i < dims_count - 2; i++) { + batches_a *= mat0->dim[i]; + } + for (int i = 0; i < mat1->dim_count - 2; i++) { + batches_b *= mat1->dim[i]; + } + + const int dim_m = mat0->dim[dims_count - (params->trans_a ? 1 : 2)]; + const int dim_k = mat0->dim[dims_count - (params->trans_a ? 2 : 1)]; + const int dim_n = mat1->dim[mat1->dim_count - (params->trans_b ? 
2 : 1)]; + + if (batches_a == batches_b) { + __fp16 *in1 = (__fp16 *)shl_mem_alloc(dim_k * dim_n * sizeof(__fp16)); + + for (int b = 0; b < batches_a; b++) { + reorder_mat1_npack2n_fp16(mat1_data, in1, dim_n, dim_k); + shl_c920_gemm_a0nb1r_8xpack2n_fp16(output_data, mat0_data, in1, NULL, dim_m, dim_k, + dim_n); + mat0_data += dim_m * dim_k; + mat1_data += dim_k * dim_n; + output_data += dim_m * dim_n; + } + + shl_mem_free(in1); + } else if (batches_a > 1 && batches_b == 1) { + __fp16 *in1 = (__fp16 *)shl_mem_alloc(dim_k * dim_n * sizeof(__fp16)); + reorder_mat1_npack2n_fp16(mat1_data, in1, dim_n, dim_k); + + for (int b = 0; b < batches_a; b++) { + shl_c920_gemm_a0nb1r_8xpack2n_fp16(output_data, mat0_data, in1, NULL, dim_m, dim_k, + dim_n); + mat0_data += dim_m * dim_k; + output_data += dim_m * dim_n; + } + shl_mem_free(in1); + } else { + shl_debug_error("matmul unsupported this broadcast\n"); + return CSINN_FALSE; + } + + return CSINN_TRUE; +} + +int shl_c920_matmul_a0b1_fp16_block_quant(struct csinn_tensor *mat0, struct csinn_tensor *mat1, + struct csinn_tensor *output, + struct csinn_matmul_params *params) +{ + if (mat0->layout >= CSINN_LAYOUT_NC1C0 && mat0->layout <= CSINN_LAYOUT_NC1DHWC0) { + shl_rvv_tensor_nc1xc0_to_ndarray_replace_fp16(mat0); + } + + __fp16 *mat0_data = (__fp16 *)mat0->data; + int8_t *mat1_data = (int8_t *)mat1->data; + __fp16 *output_data = (__fp16 *)output->data; + + const int dims_count = mat0->dim_count; + int batches_a = 1; + int batches_b = 1; + + /* compute the outer size */ + for (int i = 0; i < dims_count - 2; i++) { + batches_a *= mat0->dim[i]; + } + for (int i = 0; i < mat1->dim_count - 2; i++) { + batches_b *= mat1->dim[i]; + } + + const int dim_m = mat0->dim[dims_count - (params->trans_a ? 1 : 2)]; + const int dim_k = mat0->dim[dims_count - (params->trans_a ? 2 : 1)]; + const int dim_n = mat1->dim[mat1->dim_count - (params->trans_b ? 
2 : 1)]; + + int size1 = csinn_tensor_size(mat1); + __fp16 *scale_data; + int weight_k = dim_k; + void (*gemm_a0nb1n_dot_fp16)(); + if (mat1->mtype == CSINN_MEM_TYPE_BLOCK_Q8_0) { + scale_data = (__fp16 *)(mat1_data + size1); + gemm_a0nb1n_dot_fp16 = shl_c920_gemm_a0nb1n_dot_fp16_q8; + } else if (mat1->mtype == CSINN_MEM_TYPE_BLOCK_Q8_0_REARRANGE) { + scale_data = (__fp16 *)(mat1_data + size1); + gemm_a0nb1n_dot_fp16 = shl_c920_gemm_a0nb1_dot_fp16_q8_rearrange; + } else if (mat1->mtype == CSINN_MEM_TYPE_BLOCK_Q4_0) { + // uint4 is only half of tensor size + scale_data = (__fp16 *)(mat1_data + size1 / 2); + weight_k = dim_k / 2; + gemm_a0nb1n_dot_fp16 = shl_c920_gemm_a0nb1n_dot_fp16_q4; + } else if (mat1->mtype == CSINN_MEM_TYPE_BLOCK_Q4_0_REARRANGE) { + // uint4 is only half of tensor size + scale_data = (__fp16 *)(mat1_data + size1 / 2); + weight_k = dim_k / 2; + gemm_a0nb1n_dot_fp16 = shl_c920_gemm_a0nb1_dot_fp16_q4_rearrange; + } else { + shl_debug_error("%s: unsupported mtype %d\n", __func__, mat1->mtype); + return CSINN_FALSE; + } + + if (batches_a == batches_b) { + for (int b = 0; b < batches_a; b++) { + gemm_a0nb1n_dot_fp16(output_data, mat0_data, mat1_data, NULL, dim_m, dim_k, dim_n, + scale_data); + mat0_data += dim_m * dim_k; + mat1_data += dim_n * weight_k; + scale_data += dim_n * dim_k / 32; + output_data += dim_m * dim_n; + } + } else if (batches_a > 1 && batches_b == 1) { + for (int b = 0; b < batches_a; b++) { + gemm_a0nb1n_dot_fp16(output_data, mat0_data, mat1_data, NULL, dim_m, dim_k, dim_n, + scale_data); + mat0_data += dim_m * dim_k; + output_data += dim_m * dim_n; + } + } else { + shl_debug_error("matmul unsupported this broadcast\n"); + return CSINN_FALSE; + } + return CSINN_TRUE; +} + int shl_c920_matmul_init_fp16(struct csinn_tensor *mat0, struct csinn_tensor *mat1, struct csinn_tensor *output, struct csinn_matmul_params *params) { @@ -223,21 +389,36 @@ int shl_c920_matmul_init_fp16(struct csinn_tensor *mat0, struct csinn_tensor *ma if 
(!binary_model_op_init) { shl_rvv_matmul_reorder_weight_fp16_w_int8(mat1, MATMUL_K_BLK, MATMUL_N_BLK); } - cb->exec = shl_c920_matmul_fp16_w_int8; + cb->exec = shl_c920_matmul_a0b0_fp16_w_int8; } else if (mat1->dtype == CSINN_DTYPE_FLOAT16) { if (mat1->is_const) { if (!binary_model_op_init) { shl_rvv_matmul_reorder_weight_fp16(mat1, MATMUL_K_BLK, MATMUL_N_BLK); } } - cb->exec = shl_c920_matmul_fp16; + cb->exec = shl_c920_matmul_a0b0_fp16; } } } + + if (!params->trans_a && params->trans_b) { + if (mat0->dtype == CSINN_DTYPE_FLOAT16 && mat1->dtype == CSINN_DTYPE_FLOAT16) { + cb->exec = shl_c920_matmul_a0b1_fp16; + } else if (mat0->dtype == CSINN_DTYPE_FLOAT16 && + ((mat1->dtype == CSINN_DTYPE_INT8 && mat1->mtype == CSINN_MEM_TYPE_BLOCK_Q8_0) || + (mat1->dtype == CSINN_DTYPE_INT4 && mat1->mtype == CSINN_MEM_TYPE_BLOCK_Q4_0) || + (mat1->dtype == CSINN_DTYPE_INT8 && + mat1->mtype == CSINN_MEM_TYPE_BLOCK_Q8_0_REARRANGE) || + (mat1->dtype == CSINN_DTYPE_INT4 && + mat1->mtype == CSINN_MEM_TYPE_BLOCK_Q4_0_REARRANGE))) { + cb->exec = shl_c920_matmul_a0b1_fp16_block_quant; + } + } + if (cb->exec == NULL) { shl_debug_warning( - "matmul is not optimized to achieve under this condition, call reference func " - "replaced.\n"); + "matmul is not optimized to achieve under this condition on C920 FP16, call reference " + "func replaced.\n"); cb->exec = shl_ref_matmul_quant; } return CSINN_TRUE; diff --git a/source/c920_opt/fp32/convolution.c b/source/c920_opt/fp32/convolution.c index 6f06f4c0..d38aa1c3 100644 --- a/source/c920_opt/fp32/convolution.c +++ b/source/c920_opt/fp32/convolution.c @@ -75,17 +75,20 @@ int shl_c920_conv2d_init_fp32(struct csinn_tensor *input, struct csinn_tensor *o return CSINN_TRUE; } else { params->conv_extra.conv_mode = CSINN_WINOGRAD; - - // TODO: params->conv_extra.kernel_tm in binary model - struct csinn_tensor *t_kernel = csinn_alloc_tensor(NULL); + if (!binary_model_op_init) { + struct csinn_tensor *t_kernel = csinn_alloc_tensor(NULL); + if ((in_h < 
13) && (in_w < 13)) { + shl_rvv_wg_b4f3s1_trans_kernel_packn_fp32(kernel, t_kernel); + } else { + shl_rvv_wg_b6f3s1_trans_kernel_packn_fp32(kernel, t_kernel); + } + params->conv_extra.kernel_tm = t_kernel; + } if ((in_h < 13) && (in_w < 13)) { - shl_rvv_wg_b4f3s1_trans_kernel_packn_fp32(kernel, t_kernel); cb->exec = shl_c920_wg_b4f3s1_packn_fp32; } else { - shl_rvv_wg_b6f3s1_trans_kernel_packn_fp32(kernel, t_kernel); cb->exec = shl_c920_wg_b6f3s1_packn_fp32; } - params->conv_extra.kernel_tm = t_kernel; } } else { params->conv_extra.conv_mode = CSINN_GEMM; diff --git a/source/c920_opt/fp32/gemm_a0nb1n_fp32.c b/source/c920_opt/fp32/gemm_a0nb1n_fp32.c new file mode 100644 index 00000000..5a4e8297 --- /dev/null +++ b/source/c920_opt/fp32/gemm_a0nb1n_fp32.c @@ -0,0 +1,868 @@ +/* + * Copyright (C) 2016-2023 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "c920/c920.h" + +static inline void c920_omp_get_mn_partition(int M, int N, int *M_start, int *M_end, int *N_start, + int *N_end) +{ +#ifdef _OPENMP + int rank = omp_get_thread_num(); + int threads = omp_get_num_threads(); + + if (M > 2 * N) { + int q = M / threads; + int r = M % threads; + *M_start = rank < r ? rank * (q + 1) : rank * q + r; + *M_end = rank < r ? (rank + 1) * (q + 1) : (rank + 1) * q + r; + } else if (N > 2 * M) { + int q = N / threads; + int r = N % threads; + *N_start = rank < r ? 
rank * (q + 1) : rank * q + r; + *N_end = rank < r ? (rank + 1) * (q + 1) : (rank + 1) * q + r; + } else { + // TODO: support any number of threads + float _s = sqrt(threads); + assert(floor(_s + 0.5) == _s); + int t_sqrt = (int)_s; + + int r_rank = rank / t_sqrt; + int c_rank = rank % (int)t_sqrt; + + int M_q = M / t_sqrt; + int M_r = M % t_sqrt; + *M_start = r_rank < M_r ? r_rank * (M_q + 1) : r_rank * M_q + M_r; + *M_end = r_rank < M_r ? (r_rank + 1) * (M_q + 1) : (r_rank + 1) * M_q + M_r; + + int N_q = N / t_sqrt; + int N_r = N % t_sqrt; + *N_start = c_rank < N_r ? c_rank * (N_q + 1) : c_rank * N_q + N_r; + *N_end = c_rank < N_r ? (c_rank + 1) * (N_q + 1) : (c_rank + 1) * N_q + N_r; + } +#endif +} + +static inline vfloat32m8_t vdeq_vf_f32m8(vint8m2_t _src, __fp16 scale, int vl) +{ + vint16m4_t _i16 = vwadd_vx_i16m4(_src, 0, vl); + vfloat16m4_t _f16 = vfcvt_f_x_v_f16m4(_i16, vl); + _f16 = vfmul_vf_f16m4(_f16, scale, vl); + vfloat32m8_t _f32 = vfwcvt_f_f_v_f32m8(_f16, vl); + return _f32; +} + +/************************************************************* + * constrain: vlen = 128, and K % 32 == 0 + ************************************************************/ +static inline void gemm_dot_1x1_fp32_q8(float *dst, const float *sa, const int8_t *sb, + const __fp16 *scale, float *bias, int M, int K, int N, + int lda, int ldb, int ldc, int k_idx) +{ + int block_size = 32; + int i = 0; + for (; i < M; i++) { + const float *sa_ptr = sa + i * lda; + int j = 0; + for (; j < N; j++) { + const float *a0_ptr = sa_ptr; + const int8_t *b0_ptr = sb + j * ldb; + const __fp16 *s0_ptr = scale + j * ldb / block_size; + + // vlen128 e32m8=32 + int vl = vsetvl_e32m8(block_size); + // dst[0, 0] + vfloat32m8_t _acc00 = vfmv_v_f_f32m8(0.0f, vl); + + int c = 0; + for (; c + block_size - 1 < K; c += block_size) { + vfloat32m8_t _a0 = vle32_v_f32m8(a0_ptr + c, vl); + vint8m2_t _b0_i8 = vle8_v_i8m2(b0_ptr + c, vl); + vfloat32m8_t _b0_f32 = vdeq_vf_f32m8(_b0_i8, s0_ptr[0], vl); + s0_ptr += 
1; + _acc00 = vfmacc_vv_f32m8(_acc00, _a0, _b0_f32, vl); + } + + int idx00 = (i + 0) * ldc + (j + 0); + vfloat32m1_t _sum00; + if (k_idx == 0) { + _sum00 = vfmv_v_f_f32m1(bias[j + 0], 1); + } else { + _sum00 = vfmv_v_f_f32m1(dst[idx00], 1); + } + + _sum00 = vfredosum_vs_f32m8_f32m1(vundefined_f32m1(), _acc00, _sum00, vl); + dst[idx00] = vfmv_f_s_f32m1_f32(_sum00); + } + } +} + +static void gemm_dot_1x1_fp32_q8_omp(float *dst, const float *sa, const int8_t *sb, + const __fp16 *scale, float *bias, int M, int K, int N, int lda, + int ldb, int ldc, int k_idx) +{ + if (shl_multithread_is_enable()) { +#pragma omp parallel + { + int M_start = 0, M_end = M; + int N_start = 0, N_end = N; + c920_omp_get_mn_partition(M, N, &M_start, &M_end, &N_start, &N_end); + + float *thread_dst = dst + M_start * ldc + N_start; + const float *thread_sa = sa + M_start * lda; + const int8_t *thread_sb = sb + N_start * ldb; + const __fp16 *thread_scale = scale + N_start * ldb / 32; + float *thread_bias = bias + N_start; + int thread_M = M_end - M_start; + int thread_N = N_end - N_start; + gemm_dot_1x1_fp32_q8(thread_dst, thread_sa, thread_sb, thread_scale, thread_bias, + thread_M, K, thread_N, lda, ldb, ldc, k_idx); + } + } else { + gemm_dot_1x1_fp32_q8(dst, sa, sb, scale, bias, M, K, N, lda, ldb, ldc, k_idx); + } +} + +/************************************************************* + * constrain: vlen = 128, and K % 32 == 0 + ************************************************************/ +static inline void gemm_dot_1x1_fp32_q4(float *dst, const float *sa, const int8_t *sb, + const __fp16 *scale, float *bias, int M, int K, int N, + int lda, int ldb, int ldc, int k_idx) +{ + int block_size = 32; + int half_block = block_size / 2; + int i = 0; + for (; i < M; i++) { + const float *sa_ptr = sa + i * lda; + int j = 0; + for (; j < N; j++) { + const float *a0_ptr = sa_ptr; + const int8_t *b0_ptr = sb + j * ldb / 2; + const __fp16 *s0_ptr = scale + j * ldb / block_size; + + // vlen128 e32m4=16 + int 
vl = vsetvl_e32m4(half_block); + // dst[0, 0] + vfloat32m4_t _acc00 = vfmv_v_f_f32m4(0.0f, vl); + + int c = 0; + for (; c + block_size - 1 < K; c += block_size) { + vfloat32m4_t _a00 = vle32_v_f32m4(a0_ptr + c, vl); + vfloat32m4_t _a01 = vle32_v_f32m4(a0_ptr + c + half_block, vl); + + vint8m1_t _b0_i8 = vle8_v_i8m1(b0_ptr, vl); + b0_ptr += half_block; + + vint8m1_t _low_i8 = vand_vx_i8m1(_b0_i8, 0x0f, vl); + vint8m1_t _high_i8 = vsra_vx_i8m1(_b0_i8, 4, vl); + _high_i8 = vand_vx_i8m1(_high_i8, 0x0f, vl); + vint16m2_t _low_i16 = vwsub_vx_i16m2(_low_i8, 8, vl); + vint16m2_t _high_i16 = vwsub_vx_i16m2(_high_i8, 8, vl); + vfloat16m2_t _low_f16 = vfcvt_f_x_v_f16m2(_low_i16, vl); + vfloat16m2_t _high_f16 = vfcvt_f_x_v_f16m2(_high_i16, vl); + _low_f16 = vfmul_vf_f16m2(_low_f16, s0_ptr[0], vl); + _high_f16 = vfmul_vf_f16m2(_high_f16, s0_ptr[0], vl); + s0_ptr += 1; + vfloat32m4_t _low_f32 = vfwcvt_f_f_v_f32m4(_low_f16, vl); + vfloat32m4_t _high_f32 = vfwcvt_f_f_v_f32m4(_high_f16, vl); + + _acc00 = vfmacc_vv_f32m4(_acc00, _a00, _low_f32, vl); + _acc00 = vfmacc_vv_f32m4(_acc00, _a01, _high_f32, vl); + } + + int idx00 = (i + 0) * ldc + (j + 0); + vfloat32m1_t _sum00; + if (k_idx == 0) { + _sum00 = vfmv_v_f_f32m1(bias[j + 0], 1); + } else { + _sum00 = vfmv_v_f_f32m1(dst[idx00], 1); + } + + _sum00 = vfredosum_vs_f32m4_f32m1(vundefined_f32m1(), _acc00, _sum00, vl); + dst[idx00] = vfmv_f_s_f32m1_f32(_sum00); + } + } +} + +static void gemm_dot_1x1_fp32_q4_omp(float *dst, const float *sa, const int8_t *sb, + const __fp16 *scale, float *bias, int M, int K, int N, int lda, + int ldb, int ldc, int k_idx) +{ + if (shl_multithread_is_enable()) { +#pragma omp parallel + { + int M_start = 0, M_end = M; + int N_start = 0, N_end = N; + c920_omp_get_mn_partition(M, N, &M_start, &M_end, &N_start, &N_end); + + float *thread_dst = dst + M_start * ldc + N_start; + const float *thread_sa = sa + M_start * lda; + const int8_t *thread_sb = sb + N_start * ldb / 2; + const __fp16 *thread_scale = scale + 
N_start * ldb / 32; + float *thread_bias = bias + N_start; + int thread_M = M_end - M_start; + int thread_N = N_end - N_start; + gemm_dot_1x1_fp32_q4(thread_dst, thread_sa, thread_sb, thread_scale, thread_bias, + thread_M, K, thread_N, lda, ldb, ldc, k_idx); + } + } else { + gemm_dot_1x1_fp32_q4(dst, sa, sb, scale, bias, M, K, N, lda, ldb, ldc, k_idx); + } +} + +static inline void gemm_dot_4x4_fp32(float *dst, const float *sa, const float *sb, float *bias, + int M, int K, int N, int lda, int ldb, int ldc, int k_idx) +{ + int i = 0; + for (; i + 3 < M; i += 4) { + const float *sa_ptr = sa + i * lda; + int j = 0; + for (; j + 3 < N; j += 4) { + const float *a0_ptr = sa_ptr; + const float *a1_ptr = sa_ptr + 1 * lda; + const float *a2_ptr = sa_ptr + 2 * lda; + const float *a3_ptr = sa_ptr + 3 * lda; + const float *b0_ptr = sb + j * ldb; + const float *b1_ptr = b0_ptr + 1 * ldb; + const float *b2_ptr = b0_ptr + 2 * ldb; + const float *b3_ptr = b0_ptr + 3 * ldb; + + int vlmax = vsetvl_e32m1(csrr_vlenb() / sizeof(float)); + // dst[m, 0] + vfloat32m1_t _acc00 = vfmv_v_f_f32m1(0.0f, vlmax); + vfloat32m1_t _acc10 = vmv_v_v_f32m1(_acc00, vlmax); + vfloat32m1_t _acc20 = vmv_v_v_f32m1(_acc00, vlmax); + vfloat32m1_t _acc30 = vmv_v_v_f32m1(_acc00, vlmax); + // dst[m, 1] + vfloat32m1_t _acc01 = vmv_v_v_f32m1(_acc00, vlmax); + vfloat32m1_t _acc11 = vmv_v_v_f32m1(_acc01, vlmax); + vfloat32m1_t _acc21 = vmv_v_v_f32m1(_acc01, vlmax); + vfloat32m1_t _acc31 = vmv_v_v_f32m1(_acc01, vlmax); + // dst[m, 2] + vfloat32m1_t _acc02 = vmv_v_v_f32m1(_acc00, vlmax); + vfloat32m1_t _acc12 = vmv_v_v_f32m1(_acc02, vlmax); + vfloat32m1_t _acc22 = vmv_v_v_f32m1(_acc02, vlmax); + vfloat32m1_t _acc32 = vmv_v_v_f32m1(_acc02, vlmax); + // dst[m, 3] + vfloat32m1_t _acc03 = vmv_v_v_f32m1(_acc00, vlmax); + vfloat32m1_t _acc13 = vmv_v_v_f32m1(_acc03, vlmax); + vfloat32m1_t _acc23 = vmv_v_v_f32m1(_acc03, vlmax); + vfloat32m1_t _acc33 = vmv_v_v_f32m1(_acc03, vlmax); + + int c = 0; + while (c < K) { + int vl = 
vsetvl_e32m1(K - c); + vfloat32m1_t _a0 = vle32_v_f32m1(a0_ptr + c, vl); + vfloat32m1_t _a1 = vle32_v_f32m1(a1_ptr + c, vl); + vfloat32m1_t _a2 = vle32_v_f32m1(a2_ptr + c, vl); + vfloat32m1_t _a3 = vle32_v_f32m1(a3_ptr + c, vl); + vfloat32m1_t _b0 = vle32_v_f32m1(b0_ptr + c, vl); + vfloat32m1_t _b1 = vle32_v_f32m1(b1_ptr + c, vl); + vfloat32m1_t _b2 = vle32_v_f32m1(b2_ptr + c, vl); + vfloat32m1_t _b3 = vle32_v_f32m1(b3_ptr + c, vl); + + _acc00 = vfmacc_vv_f32m1(_acc00, _a0, _b0, vlmax); + _acc10 = vfmacc_vv_f32m1(_acc10, _a1, _b0, vlmax); + _acc20 = vfmacc_vv_f32m1(_acc20, _a2, _b0, vlmax); + _acc30 = vfmacc_vv_f32m1(_acc30, _a3, _b0, vlmax); + + _acc01 = vfmacc_vv_f32m1(_acc01, _a0, _b1, vlmax); + _acc11 = vfmacc_vv_f32m1(_acc11, _a1, _b1, vlmax); + _acc21 = vfmacc_vv_f32m1(_acc21, _a2, _b1, vlmax); + _acc31 = vfmacc_vv_f32m1(_acc31, _a3, _b1, vlmax); + + _acc02 = vfmacc_vv_f32m1(_acc02, _a0, _b2, vlmax); + _acc12 = vfmacc_vv_f32m1(_acc12, _a1, _b2, vlmax); + _acc22 = vfmacc_vv_f32m1(_acc22, _a2, _b2, vlmax); + _acc32 = vfmacc_vv_f32m1(_acc32, _a3, _b2, vlmax); + + _acc03 = vfmacc_vv_f32m1(_acc03, _a0, _b3, vlmax); + _acc13 = vfmacc_vv_f32m1(_acc13, _a1, _b3, vlmax); + _acc23 = vfmacc_vv_f32m1(_acc23, _a2, _b3, vlmax); + _acc33 = vfmacc_vv_f32m1(_acc33, _a3, _b3, vlmax); + c += vl; + } + + int idx00 = (i + 0) * ldc + (j + 0); + int idx10 = (i + 1) * ldc + (j + 0); + int idx20 = (i + 2) * ldc + (j + 0); + int idx30 = (i + 3) * ldc + (j + 0); + + int idx01 = (i + 0) * ldc + (j + 1); + int idx11 = (i + 1) * ldc + (j + 1); + int idx21 = (i + 2) * ldc + (j + 1); + int idx31 = (i + 3) * ldc + (j + 1); + + int idx02 = (i + 0) * ldc + (j + 2); + int idx12 = (i + 1) * ldc + (j + 2); + int idx22 = (i + 2) * ldc + (j + 2); + int idx32 = (i + 3) * ldc + (j + 2); + + int idx03 = (i + 0) * ldc + (j + 3); + int idx13 = (i + 1) * ldc + (j + 3); + int idx23 = (i + 2) * ldc + (j + 3); + int idx33 = (i + 3) * ldc + (j + 3); + + // dst[m, 0] + vfloat32m1_t _sum00; + vfloat32m1_t 
_sum10; + vfloat32m1_t _sum20; + vfloat32m1_t _sum30; + // dst[m, 1] + vfloat32m1_t _sum01; + vfloat32m1_t _sum11; + vfloat32m1_t _sum21; + vfloat32m1_t _sum31; + // dst[m, 2] + vfloat32m1_t _sum02; + vfloat32m1_t _sum12; + vfloat32m1_t _sum22; + vfloat32m1_t _sum32; + // dst[m, 3] + vfloat32m1_t _sum03; + vfloat32m1_t _sum13; + vfloat32m1_t _sum23; + vfloat32m1_t _sum33; + if (k_idx == 0) { + _sum00 = vfmv_v_f_f32m1(bias[j + 0], 1); + _sum10 = vmv_v_v_f32m1(_sum00, 1); + _sum20 = vmv_v_v_f32m1(_sum00, 1); + _sum30 = vmv_v_v_f32m1(_sum00, 1); + + _sum01 = vfmv_v_f_f32m1(bias[j + 1], 1); + _sum11 = vmv_v_v_f32m1(_sum01, 1); + _sum21 = vmv_v_v_f32m1(_sum01, 1); + _sum31 = vmv_v_v_f32m1(_sum01, 1); + + _sum02 = vfmv_v_f_f32m1(bias[j + 2], 1); + _sum12 = vmv_v_v_f32m1(_sum02, 1); + _sum22 = vmv_v_v_f32m1(_sum02, 1); + _sum32 = vmv_v_v_f32m1(_sum02, 1); + + _sum03 = vfmv_v_f_f32m1(bias[j + 3], 1); + _sum13 = vmv_v_v_f32m1(_sum03, 1); + _sum23 = vmv_v_v_f32m1(_sum03, 1); + _sum33 = vmv_v_v_f32m1(_sum03, 1); + } else { + _sum00 = vfmv_v_f_f32m1(dst[idx00], 1); + _sum10 = vfmv_v_f_f32m1(dst[idx10], 1); + _sum20 = vfmv_v_f_f32m1(dst[idx20], 1); + _sum30 = vfmv_v_f_f32m1(dst[idx30], 1); + + _sum01 = vfmv_v_f_f32m1(dst[idx01], 1); + _sum11 = vfmv_v_f_f32m1(dst[idx11], 1); + _sum21 = vfmv_v_f_f32m1(dst[idx21], 1); + _sum31 = vfmv_v_f_f32m1(dst[idx31], 1); + + _sum02 = vfmv_v_f_f32m1(dst[idx02], 1); + _sum12 = vfmv_v_f_f32m1(dst[idx12], 1); + _sum22 = vfmv_v_f_f32m1(dst[idx22], 1); + _sum32 = vfmv_v_f_f32m1(dst[idx32], 1); + + _sum03 = vfmv_v_f_f32m1(dst[idx03], 1); + _sum13 = vfmv_v_f_f32m1(dst[idx13], 1); + _sum23 = vfmv_v_f_f32m1(dst[idx23], 1); + _sum33 = vfmv_v_f_f32m1(dst[idx33], 1); + } + + _sum00 = vfredosum_vs_f32m1_f32m1(vundefined_f32m1(), _acc00, _sum00, vlmax); + _sum10 = vfredosum_vs_f32m1_f32m1(vundefined_f32m1(), _acc10, _sum10, vlmax); + _sum20 = vfredosum_vs_f32m1_f32m1(vundefined_f32m1(), _acc20, _sum20, vlmax); + _sum30 = 
vfredosum_vs_f32m1_f32m1(vundefined_f32m1(), _acc30, _sum30, vlmax); + + _sum01 = vfredosum_vs_f32m1_f32m1(vundefined_f32m1(), _acc01, _sum01, vlmax); + _sum11 = vfredosum_vs_f32m1_f32m1(vundefined_f32m1(), _acc11, _sum11, vlmax); + _sum21 = vfredosum_vs_f32m1_f32m1(vundefined_f32m1(), _acc21, _sum21, vlmax); + _sum31 = vfredosum_vs_f32m1_f32m1(vundefined_f32m1(), _acc31, _sum31, vlmax); + + _sum02 = vfredosum_vs_f32m1_f32m1(vundefined_f32m1(), _acc02, _sum02, vlmax); + _sum12 = vfredosum_vs_f32m1_f32m1(vundefined_f32m1(), _acc12, _sum12, vlmax); + _sum22 = vfredosum_vs_f32m1_f32m1(vundefined_f32m1(), _acc22, _sum22, vlmax); + _sum32 = vfredosum_vs_f32m1_f32m1(vundefined_f32m1(), _acc32, _sum32, vlmax); + + _sum03 = vfredosum_vs_f32m1_f32m1(vundefined_f32m1(), _acc03, _sum03, vlmax); + _sum13 = vfredosum_vs_f32m1_f32m1(vundefined_f32m1(), _acc13, _sum13, vlmax); + _sum23 = vfredosum_vs_f32m1_f32m1(vundefined_f32m1(), _acc23, _sum23, vlmax); + _sum33 = vfredosum_vs_f32m1_f32m1(vundefined_f32m1(), _acc33, _sum33, vlmax); + + dst[idx00] = vfmv_f_s_f32m1_f32(_sum00); + dst[idx10] = vfmv_f_s_f32m1_f32(_sum10); + dst[idx20] = vfmv_f_s_f32m1_f32(_sum20); + dst[idx30] = vfmv_f_s_f32m1_f32(_sum30); + + dst[idx01] = vfmv_f_s_f32m1_f32(_sum01); + dst[idx11] = vfmv_f_s_f32m1_f32(_sum11); + dst[idx21] = vfmv_f_s_f32m1_f32(_sum21); + dst[idx31] = vfmv_f_s_f32m1_f32(_sum31); + + dst[idx02] = vfmv_f_s_f32m1_f32(_sum02); + dst[idx12] = vfmv_f_s_f32m1_f32(_sum12); + dst[idx22] = vfmv_f_s_f32m1_f32(_sum22); + dst[idx32] = vfmv_f_s_f32m1_f32(_sum32); + + dst[idx03] = vfmv_f_s_f32m1_f32(_sum03); + dst[idx13] = vfmv_f_s_f32m1_f32(_sum13); + dst[idx23] = vfmv_f_s_f32m1_f32(_sum23); + dst[idx33] = vfmv_f_s_f32m1_f32(_sum33); + } + for (; j < N; j++) { + const float *a0_ptr = sa_ptr; + const float *a1_ptr = sa_ptr + 1 * lda; + const float *a2_ptr = sa_ptr + 2 * lda; + const float *a3_ptr = sa_ptr + 3 * lda; + const float *b0_ptr = sb + j * ldb; + + int vlmax = vsetvl_e32m1(csrr_vlenb() / 
sizeof(float)); + // dst[m, 0] + vfloat32m1_t _acc00 = vfmv_v_f_f32m1(0.0f, vlmax); + vfloat32m1_t _acc10 = vmv_v_v_f32m1(_acc00, vlmax); + vfloat32m1_t _acc20 = vmv_v_v_f32m1(_acc00, vlmax); + vfloat32m1_t _acc30 = vmv_v_v_f32m1(_acc00, vlmax); + + int c = 0; + while (c < K) { + int vl = vsetvl_e32m1(K - c); + vfloat32m1_t _a0 = vle32_v_f32m1(a0_ptr + c, vl); + vfloat32m1_t _a1 = vle32_v_f32m1(a1_ptr + c, vl); + vfloat32m1_t _a2 = vle32_v_f32m1(a2_ptr + c, vl); + vfloat32m1_t _a3 = vle32_v_f32m1(a3_ptr + c, vl); + vfloat32m1_t _b0 = vle32_v_f32m1(b0_ptr + c, vl); + + _acc00 = vfmacc_vv_f32m1(_acc00, _a0, _b0, vlmax); + _acc10 = vfmacc_vv_f32m1(_acc10, _a1, _b0, vlmax); + _acc20 = vfmacc_vv_f32m1(_acc20, _a2, _b0, vlmax); + _acc30 = vfmacc_vv_f32m1(_acc30, _a3, _b0, vlmax); + c += vl; + } + + int idx00 = (i + 0) * ldc + (j + 0); + int idx10 = (i + 1) * ldc + (j + 0); + int idx20 = (i + 2) * ldc + (j + 0); + int idx30 = (i + 3) * ldc + (j + 0); + + // dst[m, 0] + vfloat32m1_t _sum00; + vfloat32m1_t _sum10; + vfloat32m1_t _sum20; + vfloat32m1_t _sum30; + if (k_idx == 0) { + _sum00 = vfmv_v_f_f32m1(bias[j + 0], 1); + _sum10 = vmv_v_v_f32m1(_sum00, 1); + _sum20 = vmv_v_v_f32m1(_sum00, 1); + _sum30 = vmv_v_v_f32m1(_sum00, 1); + } else { + _sum00 = vfmv_v_f_f32m1(dst[idx00], 1); + _sum10 = vfmv_v_f_f32m1(dst[idx10], 1); + _sum20 = vfmv_v_f_f32m1(dst[idx20], 1); + _sum30 = vfmv_v_f_f32m1(dst[idx30], 1); + } + + _sum00 = vfredosum_vs_f32m1_f32m1(vundefined_f32m1(), _acc00, _sum00, vlmax); + _sum10 = vfredosum_vs_f32m1_f32m1(vundefined_f32m1(), _acc10, _sum10, vlmax); + _sum20 = vfredosum_vs_f32m1_f32m1(vundefined_f32m1(), _acc20, _sum20, vlmax); + _sum30 = vfredosum_vs_f32m1_f32m1(vundefined_f32m1(), _acc30, _sum30, vlmax); + + dst[idx00] = vfmv_f_s_f32m1_f32(_sum00); + dst[idx10] = vfmv_f_s_f32m1_f32(_sum10); + dst[idx20] = vfmv_f_s_f32m1_f32(_sum20); + dst[idx30] = vfmv_f_s_f32m1_f32(_sum30); + } + } + for (; i < M; i += 1) { + const float *sa_ptr = sa + i * lda; + int j 
= 0; + for (; j + 3 < N; j += 4) { + const float *a0_ptr = sa_ptr; + const float *b0_ptr = sb + j * ldb; + const float *b1_ptr = b0_ptr + 1 * ldb; + const float *b2_ptr = b0_ptr + 2 * ldb; + const float *b3_ptr = b0_ptr + 3 * ldb; + + int vlmax = vsetvl_e32m1(csrr_vlenb() / sizeof(float)); + // dst[0, n] + vfloat32m1_t _acc00 = vfmv_v_f_f32m1(0.0f, vlmax); + vfloat32m1_t _acc01 = vmv_v_v_f32m1(_acc00, vlmax); + vfloat32m1_t _acc02 = vmv_v_v_f32m1(_acc00, vlmax); + vfloat32m1_t _acc03 = vmv_v_v_f32m1(_acc00, vlmax); + + int c = 0; + while (c < K) { + int vl = vsetvl_e32m1(K - c); + vfloat32m1_t _a0 = vle32_v_f32m1(a0_ptr + c, vl); + vfloat32m1_t _b0 = vle32_v_f32m1(b0_ptr + c, vl); + vfloat32m1_t _b1 = vle32_v_f32m1(b1_ptr + c, vl); + vfloat32m1_t _b2 = vle32_v_f32m1(b2_ptr + c, vl); + vfloat32m1_t _b3 = vle32_v_f32m1(b3_ptr + c, vl); + + _acc00 = vfmacc_vv_f32m1(_acc00, _a0, _b0, vlmax); + _acc01 = vfmacc_vv_f32m1(_acc01, _a0, _b1, vlmax); + _acc02 = vfmacc_vv_f32m1(_acc02, _a0, _b2, vlmax); + _acc03 = vfmacc_vv_f32m1(_acc03, _a0, _b3, vlmax); + c += vl; + } + + int idx00 = (i + 0) * ldc + (j + 0); + int idx01 = (i + 0) * ldc + (j + 1); + int idx02 = (i + 0) * ldc + (j + 2); + int idx03 = (i + 0) * ldc + (j + 3); + + // dst[0, n] + vfloat32m1_t _sum00; + vfloat32m1_t _sum01; + vfloat32m1_t _sum02; + vfloat32m1_t _sum03; + if (k_idx == 0) { + _sum00 = vfmv_v_f_f32m1(bias[j + 0], 1); + _sum01 = vfmv_v_f_f32m1(bias[j + 1], 1); + _sum02 = vfmv_v_f_f32m1(bias[j + 2], 1); + _sum03 = vfmv_v_f_f32m1(bias[j + 3], 1); + } else { + _sum00 = vfmv_v_f_f32m1(dst[idx00], 1); + _sum01 = vfmv_v_f_f32m1(dst[idx01], 1); + _sum02 = vfmv_v_f_f32m1(dst[idx02], 1); + _sum03 = vfmv_v_f_f32m1(dst[idx03], 1); + } + + _sum00 = vfredosum_vs_f32m1_f32m1(vundefined_f32m1(), _acc00, _sum00, vlmax); + _sum01 = vfredosum_vs_f32m1_f32m1(vundefined_f32m1(), _acc01, _sum01, vlmax); + _sum02 = vfredosum_vs_f32m1_f32m1(vundefined_f32m1(), _acc02, _sum02, vlmax); + _sum03 = 
vfredosum_vs_f32m1_f32m1(vundefined_f32m1(), _acc03, _sum03, vlmax); + + dst[idx00] = vfmv_f_s_f32m1_f32(_sum00); + dst[idx01] = vfmv_f_s_f32m1_f32(_sum01); + dst[idx02] = vfmv_f_s_f32m1_f32(_sum02); + dst[idx03] = vfmv_f_s_f32m1_f32(_sum03); + } + for (; j < N; j++) { + const float *a0_ptr = sa_ptr; + const float *b0_ptr = sb + j * ldb; + + int vlmax = vsetvl_e32m1(csrr_vlenb() / sizeof(float)); + // dst[0, 0] + vfloat32m1_t _acc00 = vfmv_v_f_f32m1(0.0f, vlmax); + + int c = 0; + while (c < K) { + int vl = vsetvl_e32m1(K - c); + vfloat32m1_t _a0 = vle32_v_f32m1(a0_ptr + c, vl); + vfloat32m1_t _b0 = vle32_v_f32m1(b0_ptr + c, vl); + _acc00 = vfmacc_vv_f32m1(_acc00, _a0, _b0, vlmax); + c += vl; + } + + int idx00 = (i + 0) * ldc + (j + 0); + + // dst[0, 0] + vfloat32m1_t _sum00; + if (k_idx == 0) { + _sum00 = vfmv_v_f_f32m1(bias[j + 0], 1); + } else { + _sum00 = vfmv_v_f_f32m1(dst[idx00], 1); + } + + _sum00 = vfredosum_vs_f32m1_f32m1(vundefined_f32m1(), _acc00, _sum00, vlmax); + dst[idx00] = vfmv_f_s_f32m1_f32(_sum00); + } + } +} + +static void gemm_dot_4x4_fp32_omp(float *dst, const float *sa, const float *sb, float *bias, int M, + int K, int N, int lda, int ldb, int ldc, int k_idx) +{ + if (shl_multithread_is_enable()) { +#pragma omp parallel + { + int M_start = 0, M_end = M; + int N_start = 0, N_end = N; + c920_omp_get_mn_partition(M, N, &M_start, &M_end, &N_start, &N_end); + + float *thread_dst = dst + M_start * ldc + N_start; + const float *thread_sa = sa + M_start * lda; + const float *thread_sb = sb + N_start * ldb; + float *thread_bias = bias + N_start; + int thread_M = M_end - M_start; + int thread_N = N_end - N_start; + gemm_dot_4x4_fp32(thread_dst, thread_sa, thread_sb, thread_bias, thread_M, K, thread_N, + lda, ldb, ldc, k_idx); + } + } else { + gemm_dot_4x4_fp32(dst, sa, sb, bias, M, K, N, lda, ldb, ldc, k_idx); + } +} + +#define MIN(a, b) ((a) > (b) ? 
(b) : (a)) +#define CAL_LAST(SIZE, x, y) ((SIZE - x * y) / (x + y)) + +static inline void c920_get_blk_size(int M, int N, int K, int *m_blk, int *n_blk, int *k_blk) +{ + const int M_BLK = 256; + const int N_BLK = 256; + const int K_BLK = 370; + const int CACHE_SIZE = 1024 * 1024 / sizeof(float) * 0.75; + + if (M <= M_BLK && N <= N_BLK && K <= K_BLK) { + *m_blk = M; + *n_blk = N; + *k_blk = K; + } else if (M > M_BLK && N > N_BLK && K > K_BLK) { + *m_blk = M_BLK; + *n_blk = N_BLK; + *k_blk = K_BLK; + } else { + if (M <= M_BLK && N <= N_BLK && K > K_BLK) { + *m_blk = M; + *n_blk = N; + *k_blk = MIN(CAL_LAST(CACHE_SIZE, *m_blk, *n_blk), K); + } else if (M <= M_BLK && N > N_BLK && K <= K_BLK) { + *m_blk = M; + *k_blk = K; + *n_blk = MIN(CAL_LAST(CACHE_SIZE, *m_blk, *k_blk), N); + } else if (M > M_BLK && N <= N_BLK && K <= K_BLK) { + *n_blk = N; + *k_blk = K; + *m_blk = MIN(CAL_LAST(CACHE_SIZE, *n_blk, *k_blk), M); + } else if (M > M_BLK && N > N_BLK && K <= K_BLK) { + *k_blk = K; + int tmp_m = M_BLK; + *n_blk = MIN(CAL_LAST(CACHE_SIZE, tmp_m, *k_blk), N); + *m_blk = MIN(CAL_LAST(CACHE_SIZE, *n_blk, *k_blk), M); + } else if (M > M_BLK && N <= N_BLK && K > K_BLK) { + *n_blk = N; + int tmp_m = M_BLK; + *k_blk = MIN(CAL_LAST(CACHE_SIZE, tmp_m, *n_blk), K); + *m_blk = MIN(CAL_LAST(CACHE_SIZE, *n_blk, *k_blk), M); + } else if (M <= M_BLK && N > N_BLK && K > K_BLK) { + *m_blk = M; + int tmp_n = N_BLK; + *k_blk = MIN(CAL_LAST(CACHE_SIZE, tmp_n, *m_blk), K); + *n_blk = MIN(CAL_LAST(CACHE_SIZE, *m_blk, *k_blk), N); + } + } + + int tmp_n = *n_blk; + if (tmp_n < N && tmp_n % 4 != 0) { + *n_blk = (tmp_n / 4) * 4; + } + + int tmp_k = *k_blk; + const int block_size = 32; + if (tmp_k < K && tmp_k % block_size != 0) { + *k_blk = (tmp_k / block_size) * block_size; + } +} + +/************************************************************* + * constrain: vlen >= 128, and K % 32 == 0 + ************************************************************/ +static void dequantize_block_q8_to_f32(const 
int8_t *src, const __fp16 *scale, float *dst, + int n_blk, int k_blk, int ld_src, int ld_dst) +{ + int block_size = 32; + int vl = vsetvl_e8m2(block_size); + for (int i = 0; i < n_blk; i++) { + const int8_t *s_ptr = src + i * ld_src; + const __fp16 *scale_ptr = scale + i * ld_src / block_size; + float *d_ptr = dst + i * ld_dst; + for (int j = 0; j + block_size - 1 < k_blk; j += block_size) { + vint8m2_t _i8 = vle8_v_i8m2(s_ptr + j, vl); + vint16m4_t _i16 = vwadd_vx_i16m4(_i8, 0, vl); + vfloat16m4_t _f16 = vfcvt_f_x_v_f16m4(_i16, vl); + _f16 = vfmul_vf_f16m4(_f16, scale_ptr[0], vl); + scale_ptr += 1; + vfloat32m8_t _f32 = vfwcvt_f_f_v_f32m8(_f16, vl); + vse32_v_f32m8(d_ptr + j, _f32, vl); + } + } +} + +void shl_c920_gemm_a0nb1n_dot_fp32_q8(float *dst, const float *sa, const int8_t *sb, float *bias, + int M, int K, int N, const __fp16 *scale) +{ + int flag_bias = 1; + if (bias == NULL) { + flag_bias = 0; + bias = (float *)shl_mem_alloc(N * sizeof(float)); + } + + if (M > 1) { + int M_BLK, N_BLK, K_BLK; + c920_get_blk_size(M, N, K, &M_BLK, &N_BLK, &K_BLK); + + float *b_fp32 = (float *)shl_mem_alloc(N_BLK * K_BLK * sizeof(float)); + int lda = K; + int ldb = K_BLK; // after dequantize + int ldc = N; + + int m_block = M_BLK; + int m_idx = 0; + while (m_idx < M) { + if (M - m_idx < m_block) { + m_block = M - m_idx; + } + int n_block = N_BLK; + int n_idx = 0; + while (n_idx < N) { + if (N - n_idx < n_block) { + n_block = N - n_idx; + } + int k_block = K_BLK; + int k_idx = 0; + while (k_idx < K) { + if (K - k_idx < k_block) { + k_block = K - k_idx; + } + float *c_ptr = dst + m_idx * N + n_idx; + const float *a_ptr = sa + m_idx * K + k_idx; + const int8_t *b_ptr = sb + n_idx * K + k_idx; + const __fp16 *scale_ptr = scale + n_idx * (K / 32) + k_idx / 32; + + // dequantize before gemm + dequantize_block_q8_to_f32(b_ptr, scale_ptr, b_fp32, n_block, k_block, K, + K_BLK); + gemm_dot_4x4_fp32_omp(c_ptr, a_ptr, b_fp32, bias + n_idx, m_block, k_block, + n_block, lda, ldb, ldc, 
k_idx); + + k_idx += k_block; + } + n_idx += n_block; + } + m_idx += m_block; + } + + shl_mem_free(b_fp32); + } else { + int lda = K; + int ldb = K; + int ldc = N; + // dequantize in gemm + gemm_dot_1x1_fp32_q8_omp(dst, sa, sb, scale, bias, M, K, N, lda, ldb, ldc, 0); + } + + if (!flag_bias) { + shl_mem_free(bias); + bias = NULL; + } +} + +/************************************************************* + * constrain: vlen >= 128, and K % 32 == 0 + ************************************************************/ +static void dequantize_block_q4_to_f32(const int8_t *src, const __fp16 *scale, float *dst, + int n_blk, int k_blk, int ld_src, int ld_dst) +{ + int block_size = 32; + int half_block = block_size / 2; + int vl = vsetvl_e8m1(half_block); + for (int i = 0; i < n_blk; i++) { + const int8_t *s_ptr = src + i * ld_src / 2; + const __fp16 *scale_ptr = scale + i * ld_src / block_size; + float *d_ptr = dst + i * ld_dst; + for (int j = 0; j + block_size - 1 < k_blk; j += block_size) { + vint8m1_t _in = vle8_v_i8m1(s_ptr, vl); + s_ptr += half_block; + vint8m1_t _low_i8 = vand_vx_i8m1(_in, 0x0f, vl); + vint8m1_t _high_i8 = vsra_vx_i8m1(_in, 4, vl); + _high_i8 = vand_vx_i8m1(_high_i8, 0x0f, vl); + vint16m2_t _low_i16 = vwsub_vx_i16m2(_low_i8, 8, vl); + vint16m2_t _high_i16 = vwsub_vx_i16m2(_high_i8, 8, vl); + vfloat16m2_t _low_f16 = vfcvt_f_x_v_f16m2(_low_i16, vl); + vfloat16m2_t _high_f16 = vfcvt_f_x_v_f16m2(_high_i16, vl); + _low_f16 = vfmul_vf_f16m2(_low_f16, scale_ptr[0], vl); + _high_f16 = vfmul_vf_f16m2(_high_f16, scale_ptr[0], vl); + scale_ptr += 1; + vfloat32m4_t _low_f32 = vfwcvt_f_f_v_f32m4(_low_f16, vl); + vfloat32m4_t _high_f32 = vfwcvt_f_f_v_f32m4(_high_f16, vl); + vse32_v_f32m4(d_ptr, _low_f32, vl); + vse32_v_f32m4(d_ptr + half_block, _high_f32, vl); + d_ptr += block_size; + } + } +} + +void shl_c920_gemm_a0nb1n_dot_fp32_q4(float *dst, const float *sa, const int8_t *sb, float *bias, + int M, int K, int N, const __fp16 *scale) +{ + int flag_bias = 1; + if (bias 
== NULL) { + flag_bias = 0; + bias = (float *)shl_mem_alloc(N * sizeof(float)); + } + + if (M > 1) { + int M_BLK, N_BLK, K_BLK; + c920_get_blk_size(M, N, K, &M_BLK, &N_BLK, &K_BLK); + + float *b_fp32 = (float *)shl_mem_alloc(N_BLK * K_BLK * sizeof(float)); + int lda = K; + int ldb = K_BLK; // after dequantize + int ldc = N; + + int m_block = M_BLK; + int m_idx = 0; + while (m_idx < M) { + if (M - m_idx < m_block) { + m_block = M - m_idx; + } + + int n_block = N_BLK; + int n_idx = 0; + while (n_idx < N) { + if (N - n_idx < n_block) { + n_block = N - n_idx; + } + + int k_block = K_BLK; + int k_idx = 0; + while (k_idx < K) { + if (K - k_idx < k_block) { + k_block = K - k_idx; + } + + float *c_ptr = dst + m_idx * N + n_idx; + const float *a_ptr = sa + m_idx * K + k_idx; + const int8_t *b_ptr = sb + n_idx * K / 2 + k_idx / 2; + const __fp16 *scale_ptr = scale + n_idx * (K / 32) + k_idx / 32; + + // dequantize before gemm + dequantize_block_q4_to_f32(b_ptr, scale_ptr, b_fp32, n_block, k_block, K, + K_BLK); + gemm_dot_4x4_fp32_omp(c_ptr, a_ptr, b_fp32, bias + n_idx, m_block, k_block, + n_block, lda, ldb, ldc, k_idx); + + k_idx += k_block; + } + + n_idx += n_block; + } + + m_idx += m_block; + } + shl_mem_free(b_fp32); + } else { + int lda = K; + int ldb = K; + int ldc = N; + // dequantize in gemm + gemm_dot_1x1_fp32_q4_omp(dst, sa, sb, scale, bias, M, K, N, lda, ldb, ldc, 0); + } + + if (!flag_bias) { + shl_mem_free(bias); + bias = NULL; + } +} diff --git a/source/c920_opt/fp32/gemm_a0nb1r_fp32.c b/source/c920_opt/fp32/gemm_a0nb1r_fp32.c new file mode 100644 index 00000000..ae7ef5ae --- /dev/null +++ b/source/c920_opt/fp32/gemm_a0nb1r_fp32.c @@ -0,0 +1,372 @@ +/* + * Copyright (C) 2016-2023 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "c920/c920.h" + +/************************************************************* + * packn = vlenb / sizeof(float) + * m_blk: 8/4/2/1 + * n_blk: pack2n/packn/n_tail + * + * dst - output: [M, N] + * sa - input: [M, K] + * sb - weights: [N/n_blk, K, n_blk] + * bias: [N] + ************************************************************/ +void shl_c920_gemm_a0nb1r_8xpack2n_fp32(float *dst, const float *sa, const float *sb, float *bias, + int M, int K, int N) +{ + const int packn = csrr_vlenb() / sizeof(float); + const int pack2n = packn * 2; + + int flag_bias = 1; + if (bias == NULL) { + flag_bias = 0; + bias = (float *)shl_mem_alloc(N * sizeof(float)); + } + + int i = 0; + for (; i + 7 < M; i += 8) { + const float *sa_ptr = sa + i * K; + int j = 0; + int vl = vsetvl_e32m1(packn); + for (; j + pack2n - 1 < N; j += pack2n) { + const float *a_ptr = sa_ptr; + const float *b0_ptr = sb + j * K; + const float *b1_ptr = b0_ptr + packn; + float *c0_ptr = dst + i * N + j; + float *c1_ptr = c0_ptr + packn; + + // [n, 0] + vfloat32m1_t _acc00 = vle32_v_f32m1(bias + j, vl); + vfloat32m1_t _acc10 = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc20 = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc30 = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc40 = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc50 = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc60 = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc70 = vmv_v_v_f32m1(_acc00, vl); + // [n, 1] + vfloat32m1_t _acc01 = vle32_v_f32m1(bias + j + packn, vl); + vfloat32m1_t _acc11 = vmv_v_v_f32m1(_acc01, vl); + vfloat32m1_t 
_acc21 = vmv_v_v_f32m1(_acc01, vl); + vfloat32m1_t _acc31 = vmv_v_v_f32m1(_acc01, vl); + vfloat32m1_t _acc41 = vmv_v_v_f32m1(_acc01, vl); + vfloat32m1_t _acc51 = vmv_v_v_f32m1(_acc01, vl); + vfloat32m1_t _acc61 = vmv_v_v_f32m1(_acc01, vl); + vfloat32m1_t _acc71 = vmv_v_v_f32m1(_acc01, vl); + + for (int c = 0; c < K; c++) { + vfloat32m1_t _a0 = vfmv_v_f_f32m1(a_ptr[0], vl); + vfloat32m1_t _a1 = vfmv_v_f_f32m1(a_ptr[1 * K], vl); + vfloat32m1_t _a2 = vfmv_v_f_f32m1(a_ptr[2 * K], vl); + vfloat32m1_t _a3 = vfmv_v_f_f32m1(a_ptr[3 * K], vl); + vfloat32m1_t _a4 = vfmv_v_f_f32m1(a_ptr[4 * K], vl); + vfloat32m1_t _a5 = vfmv_v_f_f32m1(a_ptr[5 * K], vl); + vfloat32m1_t _a6 = vfmv_v_f_f32m1(a_ptr[6 * K], vl); + vfloat32m1_t _a7 = vfmv_v_f_f32m1(a_ptr[7 * K], vl); + a_ptr += 1; + vfloat32m1_t _b0 = vle32_v_f32m1(b0_ptr, vl); + vfloat32m1_t _b1 = vle32_v_f32m1(b1_ptr, vl); + b0_ptr += pack2n; + b1_ptr += pack2n; + + _acc00 = vfmacc_vv_f32m1(_acc00, _a0, _b0, vl); + _acc10 = vfmacc_vv_f32m1(_acc10, _a1, _b0, vl); + _acc20 = vfmacc_vv_f32m1(_acc20, _a2, _b0, vl); + _acc30 = vfmacc_vv_f32m1(_acc30, _a3, _b0, vl); + _acc40 = vfmacc_vv_f32m1(_acc40, _a4, _b0, vl); + _acc50 = vfmacc_vv_f32m1(_acc50, _a5, _b0, vl); + _acc60 = vfmacc_vv_f32m1(_acc60, _a6, _b0, vl); + _acc70 = vfmacc_vv_f32m1(_acc70, _a7, _b0, vl); + _acc01 = vfmacc_vv_f32m1(_acc01, _a0, _b1, vl); + _acc11 = vfmacc_vv_f32m1(_acc11, _a1, _b1, vl); + _acc21 = vfmacc_vv_f32m1(_acc21, _a2, _b1, vl); + _acc31 = vfmacc_vv_f32m1(_acc31, _a3, _b1, vl); + _acc41 = vfmacc_vv_f32m1(_acc41, _a4, _b1, vl); + _acc51 = vfmacc_vv_f32m1(_acc51, _a5, _b1, vl); + _acc61 = vfmacc_vv_f32m1(_acc61, _a6, _b1, vl); + _acc71 = vfmacc_vv_f32m1(_acc71, _a7, _b1, vl); + } + + vse32_v_f32m1(c0_ptr, _acc00, vl); + vse32_v_f32m1(c0_ptr + N, _acc10, vl); + vse32_v_f32m1(c0_ptr + N * 2, _acc20, vl); + vse32_v_f32m1(c0_ptr + N * 3, _acc30, vl); + vse32_v_f32m1(c0_ptr + N * 4, _acc40, vl); + vse32_v_f32m1(c0_ptr + N * 5, _acc50, vl); + vse32_v_f32m1(c0_ptr 
+ N * 6, _acc60, vl); + vse32_v_f32m1(c0_ptr + N * 7, _acc70, vl); + vse32_v_f32m1(c1_ptr, _acc01, vl); + vse32_v_f32m1(c1_ptr + N, _acc11, vl); + vse32_v_f32m1(c1_ptr + N * 2, _acc21, vl); + vse32_v_f32m1(c1_ptr + N * 3, _acc31, vl); + vse32_v_f32m1(c1_ptr + N * 4, _acc41, vl); + vse32_v_f32m1(c1_ptr + N * 5, _acc51, vl); + vse32_v_f32m1(c1_ptr + N * 6, _acc61, vl); + vse32_v_f32m1(c1_ptr + N * 7, _acc71, vl); + } + while (j < N) { + int vl = vsetvl_e32m1(N - j); + const float *a_ptr = sa_ptr; + const float *b0_ptr = sb + j * K; + float *c0_ptr = dst + i * N + j; + + vfloat32m1_t _acc00 = vle32_v_f32m1(bias + j, vl); + vfloat32m1_t _acc10 = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc20 = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc30 = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc40 = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc50 = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc60 = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc70 = vmv_v_v_f32m1(_acc00, vl); + + for (int c = 0; c < K; c++) { + vfloat32m1_t _a0 = vfmv_v_f_f32m1(a_ptr[0], vl); + vfloat32m1_t _a1 = vfmv_v_f_f32m1(a_ptr[1 * K], vl); + vfloat32m1_t _a2 = vfmv_v_f_f32m1(a_ptr[2 * K], vl); + vfloat32m1_t _a3 = vfmv_v_f_f32m1(a_ptr[3 * K], vl); + vfloat32m1_t _a4 = vfmv_v_f_f32m1(a_ptr[4 * K], vl); + vfloat32m1_t _a5 = vfmv_v_f_f32m1(a_ptr[5 * K], vl); + vfloat32m1_t _a6 = vfmv_v_f_f32m1(a_ptr[6 * K], vl); + vfloat32m1_t _a7 = vfmv_v_f_f32m1(a_ptr[7 * K], vl); + a_ptr += 1; + vfloat32m1_t _b0 = vle32_v_f32m1(b0_ptr, vl); + b0_ptr += vl; + + _acc00 = vfmacc_vv_f32m1(_acc00, _a0, _b0, vl); + _acc10 = vfmacc_vv_f32m1(_acc10, _a1, _b0, vl); + _acc20 = vfmacc_vv_f32m1(_acc20, _a2, _b0, vl); + _acc30 = vfmacc_vv_f32m1(_acc30, _a3, _b0, vl); + _acc40 = vfmacc_vv_f32m1(_acc40, _a4, _b0, vl); + _acc50 = vfmacc_vv_f32m1(_acc50, _a5, _b0, vl); + _acc60 = vfmacc_vv_f32m1(_acc60, _a6, _b0, vl); + _acc70 = vfmacc_vv_f32m1(_acc70, _a7, _b0, vl); + } + + vse32_v_f32m1(c0_ptr, _acc00, vl); + vse32_v_f32m1(c0_ptr 
+ N, _acc10, vl); + vse32_v_f32m1(c0_ptr + N * 2, _acc20, vl); + vse32_v_f32m1(c0_ptr + N * 3, _acc30, vl); + vse32_v_f32m1(c0_ptr + N * 4, _acc40, vl); + vse32_v_f32m1(c0_ptr + N * 5, _acc50, vl); + vse32_v_f32m1(c0_ptr + N * 6, _acc60, vl); + vse32_v_f32m1(c0_ptr + N * 7, _acc70, vl); + j += vl; + } + } + for (; i + 3 < M; i += 4) { + const float *sa_ptr = sa + i * K; + int j = 0; + int vl = vsetvl_e32m1(packn); + for (; j + pack2n - 1 < N; j += pack2n) { + const float *a_ptr = sa_ptr; + const float *b0_ptr = sb + j * K; + const float *b1_ptr = b0_ptr + packn; + float *c0_ptr = dst + i * N + j; + float *c1_ptr = c0_ptr + packn; + + vfloat32m1_t _acc00 = vle32_v_f32m1(bias + j, vl); + vfloat32m1_t _acc10 = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc20 = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc30 = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc01 = vle32_v_f32m1(bias + j + packn, vl); + vfloat32m1_t _acc11 = vmv_v_v_f32m1(_acc01, vl); + vfloat32m1_t _acc21 = vmv_v_v_f32m1(_acc01, vl); + vfloat32m1_t _acc31 = vmv_v_v_f32m1(_acc01, vl); + + for (int c = 0; c < K; c++) { + vfloat32m1_t _a0 = vfmv_v_f_f32m1(a_ptr[0], vl); + vfloat32m1_t _a1 = vfmv_v_f_f32m1(a_ptr[1 * K], vl); + vfloat32m1_t _a2 = vfmv_v_f_f32m1(a_ptr[2 * K], vl); + vfloat32m1_t _a3 = vfmv_v_f_f32m1(a_ptr[3 * K], vl); + a_ptr += 1; + vfloat32m1_t _b0 = vle32_v_f32m1(b0_ptr, vl); + vfloat32m1_t _b1 = vle32_v_f32m1(b1_ptr, vl); + b0_ptr += pack2n; + b1_ptr += pack2n; + + _acc00 = vfmacc_vv_f32m1(_acc00, _a0, _b0, vl); + _acc10 = vfmacc_vv_f32m1(_acc10, _a1, _b0, vl); + _acc20 = vfmacc_vv_f32m1(_acc20, _a2, _b0, vl); + _acc30 = vfmacc_vv_f32m1(_acc30, _a3, _b0, vl); + _acc01 = vfmacc_vv_f32m1(_acc01, _a0, _b1, vl); + _acc11 = vfmacc_vv_f32m1(_acc11, _a1, _b1, vl); + _acc21 = vfmacc_vv_f32m1(_acc21, _a2, _b1, vl); + _acc31 = vfmacc_vv_f32m1(_acc31, _a3, _b1, vl); + } + + vse32_v_f32m1(c0_ptr, _acc00, vl); + vse32_v_f32m1(c0_ptr + N, _acc10, vl); + vse32_v_f32m1(c0_ptr + N * 2, _acc20, vl); + 
vse32_v_f32m1(c0_ptr + N * 3, _acc30, vl); + vse32_v_f32m1(c1_ptr, _acc01, vl); + vse32_v_f32m1(c1_ptr + N, _acc11, vl); + vse32_v_f32m1(c1_ptr + N * 2, _acc21, vl); + vse32_v_f32m1(c1_ptr + N * 3, _acc31, vl); + } + while (j < N) { + int vl = vsetvl_e32m1(N - j); + const float *a_ptr = sa_ptr; + const float *b0_ptr = sb + j * K; + float *c0_ptr = dst + i * N + j; + + vfloat32m1_t _acc00 = vle32_v_f32m1(bias + j, vl); + vfloat32m1_t _acc10 = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc20 = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc30 = vmv_v_v_f32m1(_acc00, vl); + + for (int c = 0; c < K; c++) { + vfloat32m1_t _a0 = vfmv_v_f_f32m1(a_ptr[0], vl); + vfloat32m1_t _a1 = vfmv_v_f_f32m1(a_ptr[1 * K], vl); + vfloat32m1_t _a2 = vfmv_v_f_f32m1(a_ptr[2 * K], vl); + vfloat32m1_t _a3 = vfmv_v_f_f32m1(a_ptr[3 * K], vl); + a_ptr += 1; + vfloat32m1_t _b0 = vle32_v_f32m1(b0_ptr, vl); + b0_ptr += vl; + + _acc00 = vfmacc_vv_f32m1(_acc00, _a0, _b0, vl); + _acc10 = vfmacc_vv_f32m1(_acc10, _a1, _b0, vl); + _acc20 = vfmacc_vv_f32m1(_acc20, _a2, _b0, vl); + _acc30 = vfmacc_vv_f32m1(_acc30, _a3, _b0, vl); + } + + vse32_v_f32m1(c0_ptr, _acc00, vl); + vse32_v_f32m1(c0_ptr + N, _acc10, vl); + vse32_v_f32m1(c0_ptr + N * 2, _acc20, vl); + vse32_v_f32m1(c0_ptr + N * 3, _acc30, vl); + j += vl; + } + } + for (; i + 1 < M; i += 2) { + const float *sa_ptr = sa + i * K; + int j = 0; + int vl = vsetvl_e32m1(packn); + for (; j + pack2n - 1 < N; j += pack2n) { + const float *a_ptr = sa_ptr; + const float *b0_ptr = sb + j * K; + const float *b1_ptr = b0_ptr + packn; + float *c0_ptr = dst + i * N + j; + float *c1_ptr = c0_ptr + packn; + + vfloat32m1_t _acc00 = vle32_v_f32m1(bias + j, vl); + vfloat32m1_t _acc10 = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc01 = vle32_v_f32m1(bias + j + packn, vl); + vfloat32m1_t _acc11 = vmv_v_v_f32m1(_acc01, vl); + + for (int c = 0; c < K; c++) { + vfloat32m1_t _a0 = vfmv_v_f_f32m1(a_ptr[0], vl); + vfloat32m1_t _a1 = vfmv_v_f_f32m1(a_ptr[1 * K], vl); + a_ptr += 
1; + vfloat32m1_t _b0 = vle32_v_f32m1(b0_ptr, vl); + vfloat32m1_t _b1 = vle32_v_f32m1(b1_ptr, vl); + b0_ptr += pack2n; + b1_ptr += pack2n; + + _acc00 = vfmacc_vv_f32m1(_acc00, _a0, _b0, vl); + _acc10 = vfmacc_vv_f32m1(_acc10, _a1, _b0, vl); + _acc01 = vfmacc_vv_f32m1(_acc01, _a0, _b1, vl); + _acc11 = vfmacc_vv_f32m1(_acc11, _a1, _b1, vl); + } + + vse32_v_f32m1(c0_ptr, _acc00, vl); + vse32_v_f32m1(c0_ptr + N, _acc10, vl); + vse32_v_f32m1(c1_ptr, _acc01, vl); + vse32_v_f32m1(c1_ptr + N, _acc11, vl); + } + while (j < N) { + int vl = vsetvl_e32m1(N - j); + const float *a_ptr = sa_ptr; + const float *b0_ptr = sb + j * K; + float *c0_ptr = dst + i * N + j; + + vfloat32m1_t _acc00 = vle32_v_f32m1(bias + j, vl); + vfloat32m1_t _acc10 = vmv_v_v_f32m1(_acc00, vl); + + for (int c = 0; c < K; c++) { + vfloat32m1_t _a0 = vfmv_v_f_f32m1(a_ptr[0], vl); + vfloat32m1_t _a1 = vfmv_v_f_f32m1(a_ptr[1 * K], vl); + a_ptr += 1; + vfloat32m1_t _b0 = vle32_v_f32m1(b0_ptr, vl); + b0_ptr += vl; + + _acc00 = vfmacc_vv_f32m1(_acc00, _a0, _b0, vl); + _acc10 = vfmacc_vv_f32m1(_acc10, _a1, _b0, vl); + } + + vse32_v_f32m1(c0_ptr, _acc00, vl); + vse32_v_f32m1(c0_ptr + N, _acc10, vl); + j += vl; + } + } + for (; i < M; i++) { + const float *sa_ptr = sa + i * K; + int j = 0; + int vl = vsetvl_e32m1(packn); + for (; j + pack2n - 1 < N; j += pack2n) { + const float *a_ptr = sa_ptr; + const float *b0_ptr = sb + j * K; + const float *b1_ptr = b0_ptr + packn; + float *c0_ptr = dst + i * N + j; + float *c1_ptr = c0_ptr + packn; + + vfloat32m1_t _acc00 = vle32_v_f32m1(bias + j, vl); + vfloat32m1_t _acc01 = vle32_v_f32m1(bias + j + packn, vl); + + for (int c = 0; c < K; c++) { + vfloat32m1_t _a0 = vfmv_v_f_f32m1(a_ptr[0], vl); + a_ptr += 1; + vfloat32m1_t _b0 = vle32_v_f32m1(b0_ptr, vl); + vfloat32m1_t _b1 = vle32_v_f32m1(b1_ptr, vl); + b0_ptr += pack2n; + b1_ptr += pack2n; + + _acc00 = vfmacc_vv_f32m1(_acc00, _a0, _b0, vl); + _acc01 = vfmacc_vv_f32m1(_acc01, _a0, _b1, vl); + } + + vse32_v_f32m1(c0_ptr, 
_acc00, vl); + vse32_v_f32m1(c1_ptr, _acc01, vl); + } + while (j < N) { + int vl = vsetvl_e32m1(N - j); + const float *a_ptr = sa_ptr; + const float *b0_ptr = sb + j * K; + float *c0_ptr = dst + i * N + j; + + vfloat32m1_t _acc00 = vle32_v_f32m1(bias + j, vl); + + for (int c = 0; c < K; c++) { + vfloat32m1_t _a0 = vfmv_v_f_f32m1(a_ptr[0], vl); + a_ptr += 1; + vfloat32m1_t _b0 = vle32_v_f32m1(b0_ptr, vl); + b0_ptr += vl; + + _acc00 = vfmacc_vv_f32m1(_acc00, _a0, _b0, vl); + } + + vse32_v_f32m1(c0_ptr, _acc00, vl); + j += vl; + } + } + + if (!flag_bias) { + shl_mem_free(bias); + bias = NULL; + } +} diff --git a/source/c920_opt/fp32/gemm_fp32_block.c b/source/c920_opt/fp32/gemm_fp32_block.c index 70a09e21..b1900897 100644 --- a/source/c920_opt/fp32/gemm_fp32_block.c +++ b/source/c920_opt/fp32/gemm_fp32_block.c @@ -483,7 +483,7 @@ static inline void gemm_8xpack2n_fp32(float *dst, const float *sa, const float * * k_blk: K_BLK, K_tail * * dst - output: [m, n] - * sa - kernel: [m/m_blk, k/k_blk, m_blk/8, 8, k_blk] + * sa - kernel: [m/m_blk, k/k_blk, m_blk/8, k_blk, 8] * sb - input: [n/n_blk, k/k_blk, n_blk/pack2n, k_blk, pack2n] * bias: [m] ************************************************************/ @@ -529,7 +529,7 @@ void shl_c920_gemm_block_8xpack2n_fp32(float *dst, const float *sa, const float float *out = output_data + m_idx * n + n_idx; const float *ker = kernel_data + m_idx * k + k_idx * m_block; const float *in = input_data + n_idx * k + k_idx * n_block; - gemm_8xpack2n_fp32(out, ker, in, bias, m_block, n_block, k_block, n, k_idx); + gemm_8xpack2n_fp32(out, ker, in, bias + m_idx, m_block, n_block, k_block, n, k_idx); k_idx += k_block; } diff --git a/source/c920_opt/fp32/matmul_fp32.c b/source/c920_opt/fp32/matmul_fp32.c index c97102f1..6af199bd 100644 --- a/source/c920_opt/fp32/matmul_fp32.c +++ b/source/c920_opt/fp32/matmul_fp32.c @@ -34,8 +34,8 @@ #define MATMUL_K_BLK 64 #define MATMUL_N_BLK 64 -int shl_c920_matmul_fp32(struct csinn_tensor *mat0, struct 
csinn_tensor *mat1, - struct csinn_tensor *output, struct csinn_matmul_params *params) +int shl_c920_matmul_a0b0_fp32(struct csinn_tensor *mat0, struct csinn_tensor *mat1, + struct csinn_tensor *output, struct csinn_matmul_params *params) { if (mat0->layout >= CSINN_LAYOUT_NC1C0 && mat0->layout <= CSINN_LAYOUT_NC1DHWC0) { shl_rvv_tensor_nc1xc0_to_ndarray_replace_fp32(mat0); @@ -64,71 +64,232 @@ int shl_c920_matmul_fp32(struct csinn_tensor *mat0, struct csinn_tensor *mat1, const int dim_k = mat0->dim[dims_count - (params->trans_a ? 2 : 1)]; const int dim_n = mat1->dim[mat1->dim_count - (params->trans_b ? 2 : 1)]; - if (!params->trans_a && !params->trans_b) { - if (batches_a == batches_b) { - float *in0 = (float *)shl_mem_alloc(dim_m * dim_k * sizeof(float)); - float *in1; - if (!(mat1->is_const)) { - in1 = (float *)shl_mem_alloc(dim_k * dim_n * sizeof(float)); - } - - for (int b = 0; b < batches_a; b++) { - shl_c920_reorder_a_block_8xk_fp32(mat0_data, in0, dim_m, dim_k, MATMUL_M_BLK, - MATMUL_K_BLK); - if (!(mat1->is_const)) { - shl_rvv_reorder_b_block_pack2nxk_fp32(mat1_data, in1, dim_k, dim_n, - MATMUL_K_BLK, MATMUL_N_BLK); - } else { - in1 = mat1_data; - } - - shl_c920_gemm_block_8xpack2n_fp32(output_data, in0, in1, NULL, dim_m, dim_k, dim_n, - MATMUL_M_BLK, MATMUL_K_BLK, MATMUL_N_BLK); + if (batches_a == batches_b) { + float *in0 = (float *)shl_mem_alloc(dim_m * dim_k * sizeof(float)); + float *in1; + if (!(mat1->is_const)) { + in1 = (float *)shl_mem_alloc(dim_k * dim_n * sizeof(float)); + } - mat0_data += dim_m * dim_k; - mat1_data += dim_k * dim_n; - output_data += dim_m * dim_n; - } - shl_mem_free(in0); - if (!(mat1->is_const)) { - shl_mem_free(in1); - } - } else if (batches_a > 1 && batches_b == 1) { - float *in0 = (float *)shl_mem_alloc(dim_m * dim_k * sizeof(float)); - float *in1; + for (int b = 0; b < batches_a; b++) { + shl_c920_reorder_a_block_8xk_fp32(mat0_data, in0, dim_m, dim_k, MATMUL_M_BLK, + MATMUL_K_BLK); if (!(mat1->is_const)) { - in1 = (float 
*)shl_mem_alloc(dim_k * dim_n * sizeof(float)); shl_rvv_reorder_b_block_pack2nxk_fp32(mat1_data, in1, dim_k, dim_n, MATMUL_K_BLK, MATMUL_N_BLK); } else { in1 = mat1_data; } - for (int b = 0; b < batches_a; b++) { - shl_c920_reorder_a_block_8xk_fp32(mat0_data, in0, dim_m, dim_k, MATMUL_M_BLK, - MATMUL_K_BLK); + shl_c920_gemm_block_8xpack2n_fp32(output_data, in0, in1, NULL, dim_m, dim_k, dim_n, + MATMUL_M_BLK, MATMUL_K_BLK, MATMUL_N_BLK); - shl_c920_gemm_block_8xpack2n_fp32(output_data, in0, in1, NULL, dim_m, dim_k, dim_n, - MATMUL_M_BLK, MATMUL_K_BLK, MATMUL_N_BLK); - - mat0_data += dim_m * dim_k; - output_data += dim_m * dim_n; - } - shl_mem_free(in0); - if (!(mat1->is_const)) { - shl_mem_free(in1); - } + mat0_data += dim_m * dim_k; + mat1_data += dim_k * dim_n; + output_data += dim_m * dim_n; + } + shl_mem_free(in0); + if (!(mat1->is_const)) { + shl_mem_free(in1); + } + } else if (batches_a > 1 && batches_b == 1) { + float *in0 = (float *)shl_mem_alloc(dim_m * dim_k * sizeof(float)); + float *in1; + if (!(mat1->is_const)) { + in1 = (float *)shl_mem_alloc(dim_k * dim_n * sizeof(float)); + shl_rvv_reorder_b_block_pack2nxk_fp32(mat1_data, in1, dim_k, dim_n, MATMUL_K_BLK, + MATMUL_N_BLK); } else { - shl_debug_error("matmul unsupported this broadcast\n"); - return CSINN_FALSE; + in1 = mat1_data; + } + + for (int b = 0; b < batches_a; b++) { + shl_c920_reorder_a_block_8xk_fp32(mat0_data, in0, dim_m, dim_k, MATMUL_M_BLK, + MATMUL_K_BLK); + + shl_c920_gemm_block_8xpack2n_fp32(output_data, in0, in1, NULL, dim_m, dim_k, dim_n, + MATMUL_M_BLK, MATMUL_K_BLK, MATMUL_N_BLK); + + mat0_data += dim_m * dim_k; + output_data += dim_m * dim_n; + } + shl_mem_free(in0); + if (!(mat1->is_const)) { + shl_mem_free(in1); } } else { - return shl_ref_matmul_quant(mat0, mat1, output, params); + shl_debug_error("matmul unsupported this broadcast\n"); + return CSINN_FALSE; } return CSINN_TRUE; } +/************************************************************* + * packn = vlenb / sizeof(float) + 
* n_blk: pack2n/packn/n_tail + * src: [n, k] + * dst: [n/n_blk, k, n_blk] + ************************************************************/ +static void reorder_mat1_npack2n_fp32(const float *src, float *dst, int n, int k) +{ + const int packn = csrr_vlenb() / sizeof(float); + const int pack2n = packn * 2; + + int i = 0; + int vl = vsetvl_e32m2(pack2n); + for (; i + pack2n - 1 < n; i += pack2n) { + const float *s_ptr = src + i * k; + for (int j = 0; j < k; j++) { + vfloat32m2_t _src = vlse32_v_f32m2(s_ptr, k * sizeof(float), vl); + vse32_v_f32m2(dst, _src, vl); + s_ptr += 1; + dst += vl; + } + } + while (i < n) { + int vl = vsetvl_e32m1(n - i); + const float *s_ptr = src + i * k; + for (int j = 0; j < k; j++) { + vfloat32m1_t _src = vlse32_v_f32m1(s_ptr, k * sizeof(float), vl); + vse32_v_f32m1(dst, _src, vl); + s_ptr += 1; + dst += vl; + } + i += vl; + } +} + +int shl_c920_matmul_a0b1_fp32(struct csinn_tensor *mat0, struct csinn_tensor *mat1, + struct csinn_tensor *output, struct csinn_matmul_params *params) +{ + if (mat0->layout >= CSINN_LAYOUT_NC1C0 && mat0->layout <= CSINN_LAYOUT_NC1DHWC0) { + shl_rvv_tensor_nc1xc0_to_ndarray_replace_fp32(mat0); + } + if (mat1->layout >= CSINN_LAYOUT_NC1C0 && mat1->layout <= CSINN_LAYOUT_NC1DHWC0) { + shl_rvv_tensor_nc1xc0_to_ndarray_replace_fp32(mat1); + } + + float *mat0_data = (float *)mat0->data; + float *mat1_data = (float *)mat1->data; + float *output_data = (float *)output->data; + + const int dims_count = mat0->dim_count; + int batches_a = 1; + int batches_b = 1; + + /* compute the outer size */ + for (int i = 0; i < dims_count - 2; i++) { + batches_a *= mat0->dim[i]; + } + for (int i = 0; i < mat1->dim_count - 2; i++) { + batches_b *= mat1->dim[i]; + } + + const int dim_m = mat0->dim[dims_count - (params->trans_a ? 1 : 2)]; + const int dim_k = mat0->dim[dims_count - (params->trans_a ? 2 : 1)]; + const int dim_n = mat1->dim[mat1->dim_count - (params->trans_b ? 
2 : 1)]; + + if (batches_a == batches_b) { + float *in1 = (float *)shl_mem_alloc(dim_k * dim_n * sizeof(float)); + + for (int b = 0; b < batches_a; b++) { + reorder_mat1_npack2n_fp32(mat1_data, in1, dim_n, dim_k); + shl_c920_gemm_a0nb1r_8xpack2n_fp32(output_data, mat0_data, in1, NULL, dim_m, dim_k, + dim_n); + mat0_data += dim_m * dim_k; + mat1_data += dim_k * dim_n; + output_data += dim_m * dim_n; + } + + shl_mem_free(in1); + } else if (batches_a > 1 && batches_b == 1) { + float *in1 = (float *)shl_mem_alloc(dim_k * dim_n * sizeof(float)); + reorder_mat1_npack2n_fp32(mat1_data, in1, dim_n, dim_k); + + for (int b = 0; b < batches_a; b++) { + shl_c920_gemm_a0nb1r_8xpack2n_fp32(output_data, mat0_data, in1, NULL, dim_m, dim_k, + dim_n); + mat0_data += dim_m * dim_k; + output_data += dim_m * dim_n; + } + shl_mem_free(in1); + } else { + shl_debug_error("matmul unsupported this broadcast\n"); + return CSINN_FALSE; + } + + return CSINN_TRUE; +} + +int shl_c920_matmul_a0b1_fp32_block_quant(struct csinn_tensor *mat0, struct csinn_tensor *mat1, + struct csinn_tensor *output, + struct csinn_matmul_params *params) +{ + if (mat0->layout >= CSINN_LAYOUT_NC1C0 && mat0->layout <= CSINN_LAYOUT_NC1DHWC0) { + shl_rvv_tensor_nc1xc0_to_ndarray_replace_fp32(mat0); + } + + float *mat0_data = (float *)mat0->data; + int8_t *mat1_data = (int8_t *)mat1->data; + float *output_data = (float *)output->data; + + const int dims_count = mat0->dim_count; + int batches_a = 1; + int batches_b = 1; + + /* compute the outer size */ + for (int i = 0; i < dims_count - 2; i++) { + batches_a *= mat0->dim[i]; + } + for (int i = 0; i < mat1->dim_count - 2; i++) { + batches_b *= mat1->dim[i]; + } + + const int dim_m = mat0->dim[dims_count - (params->trans_a ? 1 : 2)]; + const int dim_k = mat0->dim[dims_count - (params->trans_a ? 2 : 1)]; + const int dim_n = mat1->dim[mat1->dim_count - (params->trans_b ? 
2 : 1)]; + + int size1 = csinn_tensor_size(mat1); + __fp16 *scale_data; + int weight_k = dim_k; + void (*gemm_a0nb1n_dot_fp32)(); + if (mat1->mtype == CSINN_MEM_TYPE_BLOCK_Q8_0) { + scale_data = (__fp16 *)(mat1_data + size1); + gemm_a0nb1n_dot_fp32 = shl_c920_gemm_a0nb1n_dot_fp32_q8; + } else if (mat1->mtype == CSINN_MEM_TYPE_BLOCK_Q4_0) { + // uint4 is only half of tensor size + scale_data = (__fp16 *)(mat1_data + size1 / 2); + weight_k = dim_k / 2; + gemm_a0nb1n_dot_fp32 = shl_c920_gemm_a0nb1n_dot_fp32_q4; + } else { + shl_debug_error("%s: unsupported mtype %d\n", __func__, mat1->mtype); + return CSINN_FALSE; + } + + if (batches_a == batches_b) { + for (int b = 0; b < batches_a; b++) { + gemm_a0nb1n_dot_fp32(output_data, mat0_data, mat1_data, NULL, dim_m, dim_k, dim_n, + scale_data); + mat0_data += dim_m * dim_k; + mat1_data += dim_n * weight_k; + scale_data += dim_n * dim_k / 32; + output_data += dim_m * dim_n; + } + } else if (batches_a > 1 && batches_b == 1) { + for (int b = 0; b < batches_a; b++) { + gemm_a0nb1n_dot_fp32(output_data, mat0_data, mat1_data, NULL, dim_m, dim_k, dim_n, + scale_data); + mat0_data += dim_m * dim_k; + output_data += dim_m * dim_n; + } + } else { + shl_debug_error("matmul unsupported this broadcast\n"); + return CSINN_FALSE; + } + return CSINN_TRUE; +} + int shl_c920_matmul_init_fp32(struct csinn_tensor *mat0, struct csinn_tensor *mat1, struct csinn_tensor *output, struct csinn_matmul_params *params) { @@ -142,13 +303,25 @@ int shl_c920_matmul_init_fp32(struct csinn_tensor *mat0, struct csinn_tensor *ma shl_rvv_matmul_reorder_weight_fp32(mat1, MATMUL_K_BLK, MATMUL_N_BLK); } } - cb->exec = shl_c920_matmul_fp32; + cb->exec = shl_c920_matmul_a0b0_fp32; + } + } + + if (!params->trans_a && params->trans_b) { + if (mat0->dtype == CSINN_DTYPE_FLOAT32 && mat1->dtype == CSINN_DTYPE_FLOAT32) { + cb->exec = shl_c920_matmul_a0b1_fp32; + } else if (mat0->dtype == CSINN_DTYPE_FLOAT32 && + ((mat1->dtype == CSINN_DTYPE_INT8 && mat1->mtype == 
CSINN_MEM_TYPE_BLOCK_Q8_0) || + (mat1->dtype == CSINN_DTYPE_INT4 && + mat1->mtype == CSINN_MEM_TYPE_BLOCK_Q4_0))) { + cb->exec = shl_c920_matmul_a0b1_fp32_block_quant; } } + if (cb->exec == NULL) { shl_debug_warning( - "matmul is not optimized to achieve under this condition, call reference func " - "replaced.\n"); + "matmul is not optimized to achieve under this condition on C920 FP32, call reference " + "func replaced.\n"); cb->exec = shl_ref_matmul_quant; } return CSINN_TRUE; diff --git a/source/c920_opt/performance.c b/source/c920_opt/performance.c new file mode 100644 index 00000000..a9dd788e --- /dev/null +++ b/source/c920_opt/performance.c @@ -0,0 +1,79 @@ +/* + * Copyright (C) 2016-2023 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "c920/c920.h" +#include "c920/perf.h" + +static struct shl_function_map shl_c920_kernel_map[] = { + {shl_c920_conv_im2col_gemm_packn_fp32, "shl_c920_conv_im2col_gemm_packn_fp32"}, + {shl_c920_conv_im2col_gemm_packn_fp16, "shl_c920_conv_im2col_gemm_packn_fp16"}, + {shl_c920_conv1x1s1_gemm_packn_fp32, "shl_c920_conv1x1s1_gemm_packn_fp32"}, + {shl_c920_conv1x1s1_gemm_packn_fp16, "shl_c920_conv1x1s1_gemm_packn_fp16"}, + {shl_c920_wg_b4f3s1_packn_fp32, "shl_c920_wg_b4f3s1_packn_fp32"}, + {shl_c920_wg_b6f3s1_packn_fp32, "shl_c920_wg_b6f3s1_packn_fp32"}, + {shl_c920_wg_b4f3s1_packn_fp16, "shl_c920_wg_b4f3s1_packn_fp16"}, + {shl_c920_wg_b6f3s1_packn_fp16, "shl_c920_wg_b6f3s1_packn_fp16"}, + {shl_c920_ncxhwx_gemm_8xpack2n_fp32, "shl_c920_ncxhwx_gemm_8xpack2n_fp32"}, + {shl_c920_ncxhwx_gemm_8xpack2n_fp16, "shl_c920_ncxhwx_gemm_8xpack2n_fp16"}, + {shl_c920_reorder_a_block_8xk_fp32, "shl_c920_reorder_a_block_8xk_fp32"}, + {shl_c920_gemm_block_8xpack2n_fp32, "shl_c920_gemm_block_8xpack2n_fp32"}, + {shl_c920_reorder_a_block_8xk_fp16, "shl_c920_reorder_a_block_8xk_fp16"}, + {shl_c920_gemm_block_8xpack2n_fp16, "shl_c920_gemm_block_8xpack2n_fp16"}, + {shl_c920_gemm_a0b1_8xpack2n_fp32, "shl_c920_gemm_a0b1_8xpack2n_fp32"}, + {shl_c920_gemm_a0b1_8xpack2n_fp16, "shl_c920_gemm_a0b1_8xpack2n_fp16"}, + {shl_c920_fullyconnected_gemm_fp32, "shl_c920_fullyconnected_gemm_fp32"}, + {shl_c920_fullyconnected_gemm_fp16, "shl_c920_fullyconnected_gemm_fp16"}, + // {shl_c920_matmul_fp32, "shl_c920_matmul_fp32"}, + // {shl_c920_matmul_fp16, "shl_c920_matmul_fp16"}, + // {shl_c920_matmul_fp16_w_int8, "shl_c920_matmul_fp16_w_int8"}, + {NULL, NULL}}; + +char *shl_rvv_get_kernel_name(void *exec); + +char *shl_c920_get_kernel_name(void *exec) +{ + char *name = shl_find_function_name(shl_c920_kernel_map, exec); + if (name == NULL) { + name = shl_rvv_get_kernel_name(exec); + } + return name; +} + +int shl_c920_conv2d_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct 
csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params, struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_c920_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_c920_fullyconnected_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weights, struct csinn_tensor *bias, + struct csinn_fc_params *params, struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_c920_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_c920_matmul_perf(struct csinn_tensor *mat0, struct csinn_tensor *mat1, + struct csinn_tensor *output, struct csinn_matmul_params *params, + struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_c920_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} \ No newline at end of file diff --git a/source/c920_opt/setup.c b/source/c920_opt/setup.c index e03faa29..ee554c3d 100644 --- a/source/c920_opt/setup.c +++ b/source/c920_opt/setup.c @@ -18,12 +18,13 @@ #include "c920/c920.h" #include "c920/cap.h" +#include "c920/perf.h" #define C920_OP_PATTERN_MAX 40 static struct shl_cb_table shl_c920_cb_table[C920_OP_PATTERN_MAX]; void shl_c920_reg_op(enum csinn_dtype_enum dtype, enum csinn_op_enum op_name, void *init, - void *exec, void *est, void *cap) + void *exec, void *est, void *cap, void *perf) { static int i = 0; if (i >= C920_OP_PATTERN_MAX) { @@ -34,6 +35,7 @@ void shl_c920_reg_op(enum csinn_dtype_enum dtype, enum csinn_op_enum op_name, vo shl_c920_cb_table[i].shl_cb_value.exec = exec; shl_c920_cb_table[i].shl_cb_value.est = est; shl_c920_cb_table[i].shl_cb_value.caps = cap; + shl_c920_cb_table[i].shl_cb_value.perf = perf; i++; } @@ -389,31 +391,33 @@ void shl_target_init_c920() { #ifndef CONFIG_C920_CONVOLUTION_FP32_DISABLED shl_c920_reg_op(CSINN_DTYPE_FLOAT32, CSINN_OP_CONV2D, shl_c920_conv2d_init_fp32, NULL, - shl_gref_conv2d, shl_c920_conv2d_cap); + shl_gref_conv2d, shl_c920_conv2d_cap, 
shl_c920_conv2d_perf); shl_c920_reg_op(CSINN_DTYPE_FLOAT32, CSINN_OP_GROUP_CONV2D, shl_c920_conv2d_init_fp32, NULL, - shl_gref_group_conv2d, shl_c920_conv2d_cap); + shl_gref_group_conv2d, shl_c920_conv2d_cap, shl_c920_conv2d_perf); #endif #ifndef CONFIG_C920_CONVOLUTION_FP16_DISABLED shl_c920_reg_op(CSINN_DTYPE_FLOAT16, CSINN_OP_CONV2D, shl_c920_conv2d_init_fp16, NULL, - shl_gref_conv2d, shl_c920_conv2d_cap); + shl_gref_conv2d, shl_c920_conv2d_cap, shl_c920_conv2d_perf); shl_c920_reg_op(CSINN_DTYPE_FLOAT16, CSINN_OP_GROUP_CONV2D, shl_c920_conv2d_init_fp16, NULL, - shl_gref_group_conv2d, shl_c920_conv2d_cap); + shl_gref_group_conv2d, shl_c920_conv2d_cap, shl_c920_conv2d_perf); #endif #ifndef CONFIG_C920_FULLYCONNECTED_FP32_DISABLED shl_c920_reg_op(CSINN_DTYPE_FLOAT32, CSINN_OP_FULLYCONNECTED, shl_c920_fullyconnected_init_fp32, - NULL, shl_gref_fullyconnected, shl_c920_fullyconnected_cap); + NULL, shl_gref_fullyconnected, shl_c920_fullyconnected_cap, + shl_c920_fullyconnected_perf); #endif #ifndef CONFIG_C920_FULLYCONNECTED_FP16_DISABLED shl_c920_reg_op(CSINN_DTYPE_FLOAT16, CSINN_OP_FULLYCONNECTED, shl_c920_fullyconnected_init_fp16, - NULL, shl_gref_fullyconnected, shl_c920_fullyconnected_cap); + NULL, shl_gref_fullyconnected, shl_c920_fullyconnected_cap, + shl_c920_fullyconnected_perf); #endif #ifndef CONFIG_C920_MATMUL_FP32_DISABLED shl_c920_reg_op(CSINN_DTYPE_FLOAT32, CSINN_OP_MATMUL, shl_c920_matmul_init_fp32, NULL, - shl_gref_matmul, shl_c920_matmul_cap); + shl_gref_matmul, shl_c920_matmul_cap, shl_c920_matmul_perf); #endif #ifndef CONFIG_C920_MATMUL_FP16_DISABLED shl_c920_reg_op(CSINN_DTYPE_FLOAT16, CSINN_OP_MATMUL, shl_c920_matmul_init_fp16, NULL, - shl_gref_matmul, shl_c920_matmul_cap); + shl_gref_matmul, shl_c920_matmul_cap, shl_c920_matmul_perf); #endif shl_register_op_callback(CSINN_C920, shl_cb_map_c920); shl_register_runtime_callback(CSINN_C920, shl_c920_runtime_callback); diff --git a/source/c920v2_opt/fp16/convolution.c 
b/source/c920v2_opt/fp16/convolution.c index 8b6e9245..9d98541b 100644 --- a/source/c920v2_opt/fp16/convolution.c +++ b/source/c920v2_opt/fp16/convolution.c @@ -83,17 +83,20 @@ int shl_c920v2_conv2d_init_fp16(struct csinn_tensor *input, struct csinn_tensor return CSINN_TRUE; } else { params->conv_extra.conv_mode = CSINN_WINOGRAD; - - // TODO: params->conv_extra.kernel_tm in binary model - struct csinn_tensor *t_kernel = csinn_alloc_tensor(NULL); + if (!binary_model_op_init) { + struct csinn_tensor *t_kernel = csinn_alloc_tensor(NULL); + if ((in_h < 13) && (in_w < 13)) { + shl_rvv_wg_b4f3s1_trans_kernel_packn_fp16(kernel, t_kernel); + } else { + shl_rvv_wg_b6f3s1_trans_kernel_packn_fp16(kernel, t_kernel); + } + params->conv_extra.kernel_tm = t_kernel; + } if ((in_h < 13) && (in_w < 13)) { - shl_rvv_wg_b4f3s1_trans_kernel_packn_fp16(kernel, t_kernel); cb->exec = shl_rvv_wg_b4f3s1_packn_fp16; } else { - shl_rvv_wg_b6f3s1_trans_kernel_packn_fp16(kernel, t_kernel); cb->exec = shl_rvv_wg_b6f3s1_packn_fp16; } - params->conv_extra.kernel_tm = t_kernel; } } else { params->conv_extra.conv_mode = CSINN_GEMM; diff --git a/source/c920v2_opt/fp32/convolution.c b/source/c920v2_opt/fp32/convolution.c index 8f8ae57b..4c9062ab 100644 --- a/source/c920v2_opt/fp32/convolution.c +++ b/source/c920v2_opt/fp32/convolution.c @@ -75,17 +75,20 @@ int shl_c920v2_conv2d_init_fp32(struct csinn_tensor *input, struct csinn_tensor return CSINN_TRUE; } else { params->conv_extra.conv_mode = CSINN_WINOGRAD; - - // TODO: params->conv_extra.kernel_tm in binary model - struct csinn_tensor *t_kernel = csinn_alloc_tensor(NULL); + if (!binary_model_op_init) { + struct csinn_tensor *t_kernel = csinn_alloc_tensor(NULL); + if ((in_h < 13) && (in_w < 13)) { + shl_rvv_wg_b4f3s1_trans_kernel_packn_fp32(kernel, t_kernel); + } else { + shl_rvv_wg_b6f3s1_trans_kernel_packn_fp32(kernel, t_kernel); + } + params->conv_extra.kernel_tm = t_kernel; + } if ((in_h < 13) && (in_w < 13)) { - 
shl_rvv_wg_b4f3s1_trans_kernel_packn_fp32(kernel, t_kernel); cb->exec = shl_rvv_wg_b4f3s1_packn_fp32; } else { - shl_rvv_wg_b6f3s1_trans_kernel_packn_fp32(kernel, t_kernel); cb->exec = shl_rvv_wg_b6f3s1_packn_fp32; } - params->conv_extra.kernel_tm = t_kernel; } } else { params->conv_extra.conv_mode = CSINN_GEMM; diff --git a/source/c920v2_opt/int8/convolution.c b/source/c920v2_opt/int8/convolution.c index a75260a1..16f05295 100644 --- a/source/c920v2_opt/int8/convolution.c +++ b/source/c920v2_opt/int8/convolution.c @@ -34,6 +34,10 @@ int shl_c920v2_conv2d_init_int8(struct csinn_tensor *input, struct csinn_tensor int32_t dilation_w = params->dilation_width; struct csinn_callback *cb = params->base.cb; + int32_t *bias_data = (int32_t *)bias->data; + int8_t *kernel_data = (int8_t *)kernel->data; + int32_t input_zp = input->qinfo->zero_point; + const int packn = csrr_vlenb() / sizeof(int8_t) / 2; int in_elempack = 1; int out_elempack = 1; @@ -60,8 +64,8 @@ int shl_c920v2_conv2d_init_int8(struct csinn_tensor *input, struct csinn_tensor if (kernel_h == 1 && kernel_w == 1 && stride_h == 1 && stride_w == 1 && dilation_h == 1 && dilation_w == 1) { params->conv_extra.conv_mode = CSINN_GEMM; - params->conv_extra.kernel_tm = csinn_alloc_tensor(NULL); if (!binary_model_op_init) { + params->conv_extra.kernel_tm = csinn_alloc_tensor(NULL); shl_rvv_conv1x1s1_gemm_reorder_kernel_packn_int8(kernel, params); } cb->exec = shl_c920v2_conv1x1s1_gemm_packn_int8; @@ -69,22 +73,24 @@ int shl_c920v2_conv2d_init_int8(struct csinn_tensor *input, struct csinn_tensor dilation_h == 1 && dilation_w == 1) { if (params->group > 1) { params->conv_extra.conv_mode = CSINN_GEMM; - params->conv_extra.kernel_tm = csinn_alloc_tensor(NULL); if (!binary_model_op_init) { + params->conv_extra.kernel_tm = csinn_alloc_tensor(NULL); shl_rvv_conv_im2col_gemm_reorder_kernel_packn_int8(kernel, params); } cb->exec = shl_rvv_conv_im2col_gemm_packn_int8; } else { params->conv_extra.conv_mode = CSINN_WINOGRAD; - struct 
csinn_tensor *t_kernel = csinn_alloc_tensor(NULL); - shl_rvv_wg_b4f3s1_trans_kernel_packn_int8(kernel, t_kernel); + if (!binary_model_op_init) { + struct csinn_tensor *t_kernel = csinn_alloc_tensor(NULL); + shl_rvv_wg_b4f3s1_trans_kernel_packn_int8(kernel, t_kernel); + params->conv_extra.kernel_tm = t_kernel; + } cb->exec = shl_rvv_wg_b4f3s1_packn_int8; - params->conv_extra.kernel_tm = t_kernel; } } else { params->conv_extra.conv_mode = CSINN_GEMM; - params->conv_extra.kernel_tm = csinn_alloc_tensor(NULL); if (!binary_model_op_init) { + params->conv_extra.kernel_tm = csinn_alloc_tensor(NULL); shl_rvv_conv_im2col_gemm_reorder_kernel_packn_int8(kernel, params); } cb->exec = shl_rvv_conv_im2col_gemm_packn_int8; @@ -94,15 +100,16 @@ int shl_c920v2_conv2d_init_int8(struct csinn_tensor *input, struct csinn_tensor // pack1ton if (in_elempack % packn != 0 && out_elempack % packn == 0) { params->conv_extra.conv_mode = CSINN_GEMM; - params->conv_extra.kernel_tm = csinn_alloc_tensor(NULL); if (kernel_h == 1 && kernel_w == 1 && stride_h == 1 && stride_w == 1 && dilation_h == 1 && dilation_w == 1) { if (!binary_model_op_init) { + params->conv_extra.kernel_tm = csinn_alloc_tensor(NULL); shl_rvv_conv1x1s1_gemm_reorder_kernel_pack1ton_int8(kernel, params); } cb->exec = shl_c920v2_conv1x1s1_gemm_pack1ton_int8; } else { if (!binary_model_op_init) { + params->conv_extra.kernel_tm = csinn_alloc_tensor(NULL); shl_rvv_conv_im2col_gemm_reorder_kernel_pack1ton_int8(kernel, params); } cb->exec = shl_rvv_conv_im2col_gemm_pack1ton_int8; @@ -112,15 +119,16 @@ int shl_c920v2_conv2d_init_int8(struct csinn_tensor *input, struct csinn_tensor // packnto1 if (in_elempack % packn == 0 && out_elempack % packn != 0) { params->conv_extra.conv_mode = CSINN_GEMM; - params->conv_extra.kernel_tm = csinn_alloc_tensor(NULL); if (kernel_h == 1 && kernel_w == 1 && stride_h == 1 && stride_w == 1 && dilation_h == 1 && dilation_w == 1) { if (!binary_model_op_init) { + params->conv_extra.kernel_tm = 
csinn_alloc_tensor(NULL); shl_rvv_conv1x1s1_gemm_reorder_kernel_packnto1_int8(kernel, params); } cb->exec = shl_c920v2_conv1x1s1_gemm_packnto1_int8; } else { if (!binary_model_op_init) { + params->conv_extra.kernel_tm = csinn_alloc_tensor(NULL); shl_rvv_conv_im2col_gemm_reorder_kernel_packnto1_int8(kernel, params); } cb->exec = shl_rvv_conv_im2col_gemm_packnto1_int8; @@ -130,15 +138,16 @@ int shl_c920v2_conv2d_init_int8(struct csinn_tensor *input, struct csinn_tensor // pack1 if (in_elempack % packn != 0 && out_elempack % packn != 0) { params->conv_extra.conv_mode = CSINN_GEMM; - params->conv_extra.kernel_tm = csinn_alloc_tensor(NULL); if (kernel_h == 1 && kernel_w == 1 && stride_h == 1 && stride_w == 1 && dilation_h == 1 && dilation_w == 1) { if (!binary_model_op_init) { + params->conv_extra.kernel_tm = csinn_alloc_tensor(NULL); shl_rvv_conv1x1s1_gemm_reorder_kernel_int8(kernel, params); } cb->exec = shl_rvv_conv1x1s1_gemm_int8; } else { if (!binary_model_op_init) { + params->conv_extra.kernel_tm = csinn_alloc_tensor(NULL); shl_rvv_conv_im2col_gemm_reorder_kernel_int8(kernel, params); } cb->exec = shl_rvv_conv_im2col_gemm_int8; @@ -160,10 +169,6 @@ int shl_c920v2_conv2d_init_int8(struct csinn_tensor *input, struct csinn_tensor if (params->conv_extra.conv_mode == CSINN_GEMM) { if (!params->conv_extra.fuse_zp2bias) { params->conv_extra.fuse_zp2bias = true; - int32_t *bias_data = (int32_t *)bias->data; - int8_t *kernel_data = (int8_t *)kernel->data; - int32_t input_zp = input->qinfo->zero_point; - if (bias_data == NULL) { // XXX: memory leak bias_data = (int32_t *)shl_mem_alloc(out_c * params->group * sizeof(int32_t)); @@ -183,10 +188,6 @@ int shl_c920v2_conv2d_init_int8(struct csinn_tensor *input, struct csinn_tensor // recover fuse zeropoint to bias for winograd if (params->conv_extra.conv_mode == CSINN_WINOGRAD) { if (params->conv_extra.fuse_zp2bias) { - int32_t *bias_data = (int32_t *)bias->data; - int8_t *kernel_data = (int8_t *)kernel->data; - int32_t input_zp 
= input->qinfo->zero_point; - int kernel_inner = in_c * kernel_h * kernel_w; for (int oc = 0; oc < out_c * params->group; oc++) { int32_t tmp = 0; diff --git a/source/c920v2_opt/performance.c b/source/c920v2_opt/performance.c new file mode 100644 index 00000000..66dbf3e1 --- /dev/null +++ b/source/c920v2_opt/performance.c @@ -0,0 +1,61 @@ +/* + * Copyright (C) 2016-2023 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "c920v2/c920v2.h" +#include "c920v2/perf.h" + +static struct shl_function_map shl_c920v2_kernel_map[] = { + {shl_c920v2_conv_im2col_gemm_packn_fp32, "shl_c920v2_conv_im2col_gemm_packn_fp32"}, + {shl_c920v2_conv_im2col_gemm_packn_fp16, "shl_c920v2_conv_im2col_gemm_packn_fp16"}, + {shl_c920v2_conv_im2col_gemm_pack1ton_fp32, "shl_c920v2_conv_im2col_gemm_pack1ton_fp32"}, + {shl_c920v2_conv_im2col_gemm_pack1ton_fp16, "shl_c920v2_conv_im2col_gemm_pack1ton_fp16"}, + {shl_c920v2_conv_im2col_gemm_packnto1_fp32, "shl_c920v2_conv_im2col_gemm_packnto1_fp32"}, + {shl_c920v2_conv_im2col_gemm_packnto1_fp16, "shl_c920v2_conv_im2col_gemm_packnto1_fp16"}, + {shl_c920v2_conv1x1s1_gemm_packn_fp32, "shl_c920v2_conv1x1s1_gemm_packn_fp32"}, + {shl_c920v2_conv1x1s1_gemm_packn_fp16, "shl_c920v2_conv1x1s1_gemm_packn_fp16"}, + {shl_c920v2_conv1x1s1_gemm_packn_int8, "shl_c920v2_conv1x1s1_gemm_packn_int8"}, + {shl_c920v2_conv1x1s1_gemm_pack1ton_fp32, "shl_c920v2_conv1x1s1_gemm_pack1ton_fp32"}, + {shl_c920v2_conv1x1s1_gemm_pack1ton_fp16, "shl_c920v2_conv1x1s1_gemm_pack1ton_fp16"}, + {shl_c920v2_conv1x1s1_gemm_pack1ton_int8, "shl_c920v2_conv1x1s1_gemm_pack1ton_int8"}, + {shl_c920v2_conv1x1s1_gemm_packnto1_fp32, "shl_c920v2_conv1x1s1_gemm_packnto1_fp32"}, + {shl_c920v2_conv1x1s1_gemm_packnto1_fp16, "shl_c920v2_conv1x1s1_gemm_packnto1_fp16"}, + {shl_c920v2_conv1x1s1_gemm_packnto1_int8, "shl_c920v2_conv1x1s1_gemm_packnto1_int8"}, + {shl_c920v2_ncxhwx_gemm_12xpack2n_fp32, "shl_c920v2_ncxhwx_gemm_12xpack2n_fp32"}, + {shl_c920v2_ncxhwx_gemm_12xpack2n_fp16, "shl_c920v2_ncxhwx_gemm_12xpack2n_fp16"}, + {shl_c920v2_ncxhwx_gemm_12xpackn_int8_dot, "shl_c920v2_ncxhwx_gemm_12xpackn_int8_dot"}, + {shl_c920v2_ncxhwx_gemm_4xpack2n_int8, "shl_c920v2_ncxhwx_gemm_4xpack2n_int8"}, + {NULL, NULL}}; + +char *shl_rvv_get_kernel_name(void *exec); + +char *shl_c920v2_get_kernel_name(void *exec) +{ + char *name = shl_find_function_name(shl_c920v2_kernel_map, exec); + if (name == NULL) { + name = 
shl_rvv_get_kernel_name(exec); + } + return name; +} + +int shl_c920v2_conv2d_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params, struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_c920v2_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} \ No newline at end of file diff --git a/source/c920v2_opt/setup.c b/source/c920v2_opt/setup.c index faf0be2c..19a9a1b0 100644 --- a/source/c920v2_opt/setup.c +++ b/source/c920v2_opt/setup.c @@ -18,12 +18,13 @@ #include "c920v2/c920v2.h" #include "c920v2/cap.h" +#include "c920v2/perf.h" #define C920V2_OP_PATTERN_MAX 40 static struct shl_cb_table shl_c920v2_cb_table[C920V2_OP_PATTERN_MAX]; void shl_c920v2_reg_op(enum csinn_dtype_enum dtype, enum csinn_op_enum op_name, void *init, - void *exec, void *est, void *cap) + void *exec, void *est, void *cap, void *perf) { static int i = 0; if (i >= C920V2_OP_PATTERN_MAX) { @@ -34,6 +35,7 @@ void shl_c920v2_reg_op(enum csinn_dtype_enum dtype, enum csinn_op_enum op_name, shl_c920v2_cb_table[i].shl_cb_value.exec = exec; shl_c920v2_cb_table[i].shl_cb_value.est = est; shl_c920v2_cb_table[i].shl_cb_value.caps = cap; + shl_c920v2_cb_table[i].shl_cb_value.perf = perf; i++; } @@ -180,7 +182,8 @@ void shl_c920v2_session_setup(struct csinn_session *sess) bool save_binary_model = false; if (sess->model.save_mode == CSINN_SAVE_AND_RUN || sess->model.save_mode == CSINN_SAVE_ONLY) { - if (sess->base_dtype == CSINN_DTYPE_FLOAT16 || sess->base_dtype == CSINN_DTYPE_FLOAT32) { + if (sess->base_dtype == CSINN_DTYPE_INT8 || sess->base_dtype == CSINN_DTYPE_FLOAT16 || + sess->base_dtype == CSINN_DTYPE_FLOAT32) { save_binary_model = true; } else { shl_debug_warning("Unsupport to save this dtype binary model yet\n"); @@ -389,21 +392,21 @@ void shl_target_init_c920v2() { #ifndef CONFIG_C920V2_CONVOLUTION_FP32_DISABLED shl_c920v2_reg_op(CSINN_DTYPE_FLOAT32, CSINN_OP_CONV2D, 
shl_c920v2_conv2d_init_fp32, NULL, - shl_gref_conv2d, shl_c920v2_conv2d_cap); + shl_gref_conv2d, shl_c920v2_conv2d_cap, shl_c920v2_conv2d_perf); shl_c920v2_reg_op(CSINN_DTYPE_FLOAT32, CSINN_OP_GROUP_CONV2D, shl_c920v2_conv2d_init_fp32, NULL, - shl_gref_group_conv2d, shl_c920v2_conv2d_cap); + shl_gref_group_conv2d, shl_c920v2_conv2d_cap, shl_c920v2_conv2d_perf); #endif #ifndef CONFIG_C920V2_CONVOLUTION_FP16_DISABLED shl_c920v2_reg_op(CSINN_DTYPE_FLOAT16, CSINN_OP_CONV2D, shl_c920v2_conv2d_init_fp16, NULL, - shl_gref_conv2d, shl_c920v2_conv2d_cap); + shl_gref_conv2d, shl_c920v2_conv2d_cap, shl_c920v2_conv2d_perf); shl_c920v2_reg_op(CSINN_DTYPE_FLOAT16, CSINN_OP_GROUP_CONV2D, shl_c920v2_conv2d_init_fp16, NULL, - shl_gref_group_conv2d, shl_c920v2_conv2d_cap); + shl_gref_group_conv2d, shl_c920v2_conv2d_cap, shl_c920v2_conv2d_perf); #endif #ifndef CONFIG_C920V2_CONVOLUTION_INT8_DISABLED shl_c920v2_reg_op(CSINN_DTYPE_INT8, CSINN_OP_CONV2D, shl_c920v2_conv2d_init_int8, NULL, - shl_gref_conv2d, shl_c920v2_conv2d_cap); + shl_gref_conv2d, shl_c920v2_conv2d_cap, shl_c920v2_conv2d_perf); shl_c920v2_reg_op(CSINN_DTYPE_INT8, CSINN_OP_GROUP_CONV2D, shl_c920v2_conv2d_init_int8, NULL, - shl_gref_group_conv2d, shl_c920v2_conv2d_cap); + shl_gref_group_conv2d, shl_c920v2_conv2d_cap, shl_c920v2_conv2d_perf); #endif shl_register_op_callback(CSINN_C920V2, shl_cb_map_c920v2); shl_register_runtime_callback(CSINN_C920V2, shl_c920v2_runtime_callback); diff --git a/source/graph_ref/rms_norm.c b/source/graph_ref/rms_norm.c index 68e854bb..5437df35 100644 --- a/source/graph_ref/rms_norm.c +++ b/source/graph_ref/rms_norm.c @@ -18,16 +18,15 @@ #include "shl_gref.h" -int shl_gref_rms_norm(struct csinn_tensor *input, struct csinn_tensor *output, - struct csinn_tensor *weights, struct csinn_rms_norm_params *params) +int shl_gref_rms_norm(struct csinn_tensor *input, struct csinn_tensor *weights, + struct csinn_tensor *output, struct csinn_rms_norm_params *params) { - shl_gref_diso_op(input, output, 
weights, CSINN_OP_RMS_NORM, params); + shl_gref_diso_op(input, weights, output, CSINN_OP_RMS_NORM, params); return CSINN_TRUE; } -int shl_gref_rms_norm_infer_shape(struct csinn_tensor *input, struct csinn_tensor *output, - struct csinn_tensor *weights, - struct csinn_rms_norm_params *params) +int shl_gref_rms_norm_infer_shape(struct csinn_tensor *input, struct csinn_tensor *weights, + struct csinn_tensor *output, struct csinn_rms_norm_params *params) { shl_tensor_try_nc1xc0_to_ndarray_shape(input); output->dim_count = input->dim_count; @@ -35,6 +34,6 @@ int shl_gref_rms_norm_infer_shape(struct csinn_tensor *input, struct csinn_tenso output->dim[i] = input->dim[i]; } - SHL_DEBUG_CALL(shl_rms_norm_debug_info(input, output, weights, params, __func__)); + SHL_DEBUG_CALL(shl_rms_norm_debug_info(input, weights, output, params, __func__)); return CSINN_TRUE; } diff --git a/source/graph_ref/setup.c b/source/graph_ref/setup.c index dda414e5..c369bd7f 100644 --- a/source/graph_ref/setup.c +++ b/source/graph_ref/setup.c @@ -297,7 +297,7 @@ int shl_gref_call_layer_func(void *fn, struct shl_node *node) ret = func(node->in[0]->data, node->in[1]->data, node->out[0]->data, params); break; case CSINN_OP_RMS_NORM: - ret = func(node->in[0]->data, node->out[0]->data, node->in[1]->data, params); + ret = func(node->in[0]->data, node->in[1]->data, node->out[0]->data, params); break; case CSINN_OP_SCALED_DOT_PRODUCT_ATTENTION: ret = func(node->in[0]->data, node->in[1]->data, node->in[2]->data, node->out[0]->data, @@ -343,6 +343,277 @@ int shl_gref_call_layer_func(void *fn, struct shl_node *node) return ret; } +int shl_gref_call_layer_perf(void *fn, struct shl_node *node, struct csinn_perf_info *perf_info) +{ + /* base has same address with params */ + struct csinn_params_base *params = node->data; + int (*func)(); + func = fn; + if (!func) { + shl_debug_fatal("Can't find exec func %s\n", node->name); + } + int ret = CSINN_TRUE; + struct csinn_tensor **inputs; + struct csinn_tensor 
**outputs; + + switch (node->type) { + case CSINN_OP_ABS: + case CSINN_OP_ACOS: + case CSINN_OP_ACOSH: + case CSINN_OP_ANY: + case CSINN_OP_ARGMAX: + case CSINN_OP_ARGMIN: + case CSINN_OP_ASIN: + case CSINN_OP_ASINH: + case CSINN_OP_ATAN: + case CSINN_OP_ATANH: + case CSINN_OP_AVGPOOL2D: + case CSINN_OP_AVGPOOL3D: + case CSINN_OP_BATCH_TO_SPACE: + case CSINN_OP_BATCH_TO_SPACE_ND: + case CSINN_OP_BROADCOST: + case CSINN_OP_CEIL: + case CSINN_OP_CLIP: + case CSINN_OP_COL2IM: + case CSINN_OP_COS: + case CSINN_OP_COSH: + case CSINN_OP_CROP: + case CSINN_OP_CUMPROD: + case CSINN_OP_CUMSUM: + case CSINN_OP_DATA_CONVERT: + case CSINN_OP_DEPTH_TO_SPACE: + case CSINN_OP_ELU: + case CSINN_OP_ERF: + case CSINN_OP_EXP: + case CSINN_OP_EXPAND_DIMS: + case CSINN_OP_EXPM1: + case CSINN_OP_FLATTEN: + case CSINN_OP_FLOOR: + case CSINN_OP_GLOBAL_AVGPOOL2D: + case CSINN_OP_GLOBAL_MAXPOOL2D: + case CSINN_OP_HARD_SIGMOID: + case CSINN_OP_IM2COL: + case CSINN_OP_ISNAN: + case CSINN_OP_L2N: + case CSINN_OP_L2POOL2D: + case CSINN_OP_LEAKY_RELU: + case CSINN_OP_LOG_SOFTMAX: + case CSINN_OP_LOG: + case CSINN_OP_LOG1P: + case CSINN_OP_LOGICAL_NOT: + case CSINN_OP_LRN: + case CSINN_OP_MAX: + case CSINN_OP_MAXPOOL2D: + case CSINN_OP_MAXPOOL2D_LOCAT: + case CSINN_OP_MAXPOOL3D: + case CSINN_OP_MEAN: + case CSINN_OP_MEAN_STRIDE: + case CSINN_OP_MIN: + case CSINN_OP_NDARRAY_SIZE: + case CSINN_OP_NEGATIVE: + case CSINN_OP_NOT: + case CSINN_OP_ONE_HOT: + case CSINN_OP_PAD: + case CSINN_OP_PROD: + case CSINN_OP_REDUCE_LOGSUMEXP: + case CSINN_OP_REDUCE_MAX: + case CSINN_OP_REDUCE_MEAN: + case CSINN_OP_REDUCE_MIN: + case CSINN_OP_REDUCE_PROD: + case CSINN_OP_REDUCE_SUM: + case CSINN_OP_RELU: + case CSINN_OP_RELU1: + case CSINN_OP_RELU6: + case CSINN_OP_RELUN: + case CSINN_OP_REORG: + case CSINN_OP_RESHAPE: + case CSINN_OP_RESIZE: + case CSINN_OP_REVERSE: + case CSINN_OP_ROUND: + case CSINN_OP_RSQRT: + case CSINN_OP_SHAPE: + case CSINN_OP_SHUFFLE_CHANNEL: + case CSINN_OP_SIGMOID: + case CSINN_OP_SIGN: + 
case CSINN_OP_SIN: + case CSINN_OP_SINH: + case CSINN_OP_SLICE: + case CSINN_OP_SOFTMAX: + case CSINN_OP_SOFTPLUS: + case CSINN_OP_SOFTRELU: + case CSINN_OP_SOFTSIGN: + case CSINN_OP_SPACE_TO_BATCH: + case CSINN_OP_SPACE_TO_BATCH_ND: + case CSINN_OP_SPACE_TO_DEPTH: + case CSINN_OP_SQRT: + case CSINN_OP_SQUARE: + case CSINN_OP_SQUEEZE: + case CSINN_OP_STACK: + case CSINN_OP_STRIDED_SLICE: + case CSINN_OP_SUM: + case CSINN_OP_TAN: + case CSINN_OP_TANH: + case CSINN_OP_THRESHOLD_RELU: + case CSINN_OP_TILE: + case CSINN_OP_TRANSPOSE: + case CSINN_OP_TRUNC: + case CSINN_OP_UNPOOLING: + case CSINN_OP_UNSTACK: + case CSINN_OP_CAST: + case CSINN_OP_YUV_RGB_SCALE: + case CSINN_OP_SILU: + case CSINN_OP_ROPE: + case CSINN_OP_LLM_POS: + ret = func(node->in[0]->data, node->out[0]->data, params, perf_info); + break; + case CSINN_OP_ADD: + case CSINN_OP_AND: + case CSINN_OP_DIV: + case CSINN_OP_EQUANL: + case CSINN_OP_FLOOR_DIVIDE: + case CSINN_OP_FLOOR_MOD: + case CSINN_OP_GATHER_ND: + case CSINN_OP_GATHER: + case CSINN_OP_GREATHER_EQUAL: + case CSINN_OP_GREATHER: + case CSINN_OP_LESS_EQUAL: + case CSINN_OP_LESS: + case CSINN_OP_LOGICAL_AND: + case CSINN_OP_LOGICAL_OR: + case CSINN_OP_LOGICAL_XOR: + case CSINN_OP_MATMUL: + case CSINN_OP_MAXIMUM: + case CSINN_OP_MINIMUM: + case CSINN_OP_MOD: + case CSINN_OP_MUL: + case CSINN_OP_NON_MAX_SUPPRESSION: + case CSINN_OP_NOT_EQUAL: + case CSINN_OP_OR: + case CSINN_OP_POWER: + case CSINN_OP_PRELU: + case CSINN_OP_SEQUENCE_MASK: + case CSINN_OP_SEGMENT_MAX: + case CSINN_OP_UNSORTED_SEGMENT_MAX: + case CSINN_OP_SEGMENT_MEAN: + case CSINN_OP_UNSORTED_SEGMENT_MEAN: + case CSINN_OP_SEGMENT_MIN: + case CSINN_OP_UNSORTED_SEGMENT_MIN: + case CSINN_OP_SEGMENT_PROD: + case CSINN_OP_UNSORTED_SEGMENT_PROD: + case CSINN_OP_SEGMENT_SUM: + case CSINN_OP_UNSORTED_SEGMENT_SUM: + case CSINN_OP_SUB: + case CSINN_OP_XOR: + case CSINN_OP_EMBEDDING: + ret = func(node->in[0]->data, node->in[1]->data, node->out[0]->data, params, perf_info); + break; + case 
CSINN_OP_CONV1D: + case CSINN_OP_CONV2D: + case CSINN_OP_CONV2D_RELU: + case CSINN_OP_CONV2D_RELU6: + case CSINN_OP_CONV2D_CHANNEL: + case CSINN_OP_CONV2D_CHANNEL_RELU: + case CSINN_OP_CONV2D_CHANNEL_RELU6: + case CSINN_OP_DEPTHWISE_CONV1D: + case CSINN_OP_DEPTHWISE_CONV2D: + case CSINN_OP_DEPTHWISE_CONV2D_RELU: + case CSINN_OP_DEPTHWISE_CONV2D_RELU6: + case CSINN_OP_DEPTHWISE_CONV2D_CHANNEL: + case CSINN_OP_DEPTHWISE_CONV2D_CHANNEL_RELU: + case CSINN_OP_DEPTHWISE_CONV2D_CHANNEL_RELU6: + case CSINN_OP_GROUP_CONV2D: + case CSINN_OP_GROUP_CONV2D_RELU: + case CSINN_OP_GROUP_CONV2D_RELU6: + case CSINN_OP_GROUP_CONV2D_CHANNEL: + case CSINN_OP_GROUP_CONV2D_CHANNEL_RELU: + case CSINN_OP_CONV3D: + case CSINN_OP_DECONV2D: + case CSINN_OP_DEPTHWISE_DECONV2D: + case CSINN_OP_GROUP_DECONV2D: + case CSINN_OP_DECONV3D: + case CSINN_OP_FULLYCONNECTED: + case CSINN_OP_LAYER_NORM: + case CSINN_OP_CACHE_MATMUL: + case CSINN_OP_CACHE_CONV1D: + ret = func(node->in[0]->data, node->out[0]->data, node->in[1]->data, node->in[2]->data, + params, perf_info); + break; + case CSINN_OP_FSMN: + ret = func(node->in[0]->data, node->in[1]->data, node->in[2]->data, node->in[3]->data, + node->in[4]->data, node->out[0]->data, params, perf_info); + break; + case CSINN_OP_CONCAT: + inputs = shl_mem_alloc(sizeof(struct csinn_tensor *) * + ((struct csinn_concat_params *)params)->inputs_count); + for (int i = 0; i < ((struct csinn_concat_params *)params)->inputs_count; i++) { + inputs[i] = node->in[i]->data; + } + ret = func(inputs, node->out[0]->data, params, perf_info); + shl_mem_free(inputs); + break; + case CSINN_OP_SPLIT: + outputs = shl_mem_alloc(sizeof(struct csinn_tensor *) * + ((struct csinn_split_params *)params)->output_num); + for (int i = 0; i < ((struct csinn_split_params *)params)->output_num; i++) { + outputs[i] = node->out[i]->data; + } + ret = func(node->in[0]->data, outputs, params, perf_info); + shl_mem_free(outputs); + break; + case CSINN_OP_WHERE: + ret = func(node->in[0]->data, 
node->in[1]->data, node->in[2]->data, node->out[0]->data, + params, perf_info); + break; + case CSINN_OP_WHERE_SOFTMAX: + ret = func(node->in[0]->data, node->in[1]->data, node->out[0]->data, params, perf_info); + break; + case CSINN_OP_RMS_NORM: + ret = func(node->in[0]->data, node->in[1]->data, node->out[0]->data, params, perf_info); + break; + case CSINN_OP_SCALED_DOT_PRODUCT_ATTENTION: + ret = func(node->in[0]->data, node->in[1]->data, node->in[2]->data, node->out[0]->data, + params, perf_info); + break; + case CSINN_OP_ALL: + shl_debug_error("unsupported CSINN_OP_ALL\n"); + break; + case CSINN_OP_ARANGE: + shl_debug_error("unsupported CSINN_OP_ARANGE\n"); + break; + case CSINN_OP_BN: + shl_debug_error("unsupported CSINN_OP_BN\n"); + break; + case CSINN_OP_MIN_STRIDE: + shl_debug_error("unsupported CSINN_OP_MIN_STRIDE\n"); + break; + case CSINN_OP_PROPOSAL: + shl_debug_error("unsupported CSINN_OP_PROPOSAL\n"); + break; + case CSINN_OP_PSROIPOOLING: + shl_debug_error("unsupported CSINN_OP_PSROIPOOLING\n"); + break; + case CSINN_OP_ROIALIGN: + shl_debug_error("unsupported CSINN_OP_ROIALIGN\n"); + break; + case CSINN_OP_ROIPOOL: + shl_debug_error("unsupported CSINN_OP_ROIPOOL\n"); + break; + case CSINN_OP_SCATTER_ND: + shl_debug_error("unsupported CSINN_OP_SCATTER_ND\n"); + break; + case CSINN_OP_SELECT: + shl_debug_error("unsupported CSINN_OP_SELECT\n"); + break; + case CSINN_OP_TOPK: + shl_debug_error("unsupported CSINN_OP_TOPK\n"); + break; + default: + shl_debug_error("%s: unknown op %d\n", __func__, node->type); + return CSINN_FALSE; + } + return ret; +} + struct csinn_callback *shl_gref_best_callback(struct shl_node *node) { struct csinn_params_base *params = node->data; @@ -386,6 +657,10 @@ static int init_op(struct shl_node *node) { /* base has same address with params */ struct csinn_params_base *params = node->data; + int ret = CSINN_TRUE; + + SHL_TRACE_CALL(shl_trace_duration_begin(params->sess->trace, __func__, + SHL_TRACE_EVENT_CPU_OPERATOR, NULL)); 
int (*func)(); @@ -395,12 +670,13 @@ static int init_op(struct shl_node *node) params->sess->base_run_mode = org_rm; if (cb->init != NULL) { - if (shl_gref_call_layer_func(cb->init, node) != CSINN_TRUE) { - return CSINN_FALSE; - } + ret = shl_gref_call_layer_func(cb->init, node); } - return CSINN_TRUE; + SHL_TRACE_CALL( + shl_trace_duration_end(params->sess->trace, __func__, SHL_TRACE_EVENT_CPU_OPERATOR, NULL)); + + return ret; } int shl_gref_size_align(int orig, int align) @@ -877,6 +1153,106 @@ static int op_run_deinit(struct shl_node *node, struct shl_ref_graph *graph) return CSINN_TRUE; } +static struct shl_trace_value *create_strings_with_trace(char **strs, int num) +{ + struct shl_trace_value *res = + (struct shl_trace_value *)shl_mem_alloc(sizeof(struct shl_trace_value)); + + struct shl_trace_value_list *list = + (struct shl_trace_value_list *)shl_mem_alloc(sizeof(struct shl_trace_value_list)); + list->size = num; + list->value = (struct shl_trace_value **)shl_mem_alloc(sizeof(struct shl_trace_value *) * num); + + res->type = SHL_TRACE_VALUE_TYPE_LIST; + res->content.list = list; + + for (int i = 0; i < num; i++) { + struct shl_trace_value *value = SHL_TRACE_STRING(strs[i]); + list->value[i] = value; + } + + return res; +} + +static char **get_node_output_names(struct shl_node *node) +{ + struct shl_ref_graph *sgraph = NULL; + int output_num; + if (node->type == CSINN_SUBGRAPH) { + sgraph = node->data; + output_num = sgraph->output_num; + } else { + output_num = node->out_num; + } + + /* get subgraph output names(including layer name) */ + char **output_names = (char **)shl_mem_alloc(sizeof(char *) * output_num); + for (int i = 0; i < output_num; i++) { + output_names[i] = (char *)shl_mem_alloc(sizeof(char) * 1024); + + if (node->type == CSINN_SUBGRAPH) { + for (int j = 0; j < sgraph->layer_index; j++) { + struct shl_node *curr_n = sgraph->layer[j]; + if (curr_n->type >= CSINN_OP_SIZE) continue; + + for (int k = 0; k < curr_n->out_num; k++) { + if 
(sgraph->output[i] == curr_n->out[k]) { + snprintf(output_names[i], 1024, "%s:out%d", curr_n->name, k); + break; + } + } + } + } else { + snprintf(output_names[i], 1024, "%s:out%d", node->name, i); + } + } + return output_names; +} + +static struct shl_trace_value *create_node_dtype_with_trace(struct shl_node **node, int num) +{ + struct shl_trace_value *res = + (struct shl_trace_value *)shl_mem_alloc(sizeof(struct shl_trace_value)); + + struct shl_trace_value_list *list = + (struct shl_trace_value_list *)shl_mem_alloc(sizeof(struct shl_trace_value_list)); + list->size = num; + list->value = (struct shl_trace_value **)shl_mem_alloc(sizeof(struct shl_trace_value *) * num); + + res->type = SHL_TRACE_VALUE_TYPE_LIST; + res->content.list = list; + + for (int i = 0; i < num; i++) { + struct csinn_tensor *tensor = node[i]->data; + struct shl_trace_value *value = SHL_TRACE_STRING(shl_find_dtype_name(tensor->dtype)); + list->value[i] = value; + } + + return res; +} + +static struct shl_trace_value *create_node_shape_with_trace(struct shl_node **node, int num) +{ + struct shl_trace_value *res = + (struct shl_trace_value *)shl_mem_alloc(sizeof(struct shl_trace_value)); + + struct shl_trace_value_list *list = + (struct shl_trace_value_list *)shl_mem_alloc(sizeof(struct shl_trace_value_list)); + list->size = num; + list->value = (struct shl_trace_value **)shl_mem_alloc(sizeof(struct shl_trace_value *) * num); + + res->type = SHL_TRACE_VALUE_TYPE_LIST; + res->content.list = list; + + for (int i = 0; i < num; i++) { + struct csinn_tensor *tensor = node[i]->data; + struct shl_trace_value *value = SHL_TRACE_LIST_INT(tensor->dim_count, tensor->dim); + list->value[i] = value; + } + + return res; +} + static int op_run(struct shl_node *node) { /* base has same address with params */ @@ -891,11 +1267,46 @@ static int op_run(struct shl_node *node) int (*func)(); struct csinn_callback *cb = params->cb; func = cb->exec; - return shl_gref_call_layer_func(func, node); + char *kernel_name = 
""; + if (params->sess->profiler_level >= CSINN_PROFILER_LEVEL_TRACE) { + if (cb->perf) { + struct csinn_perf_info *perf_info = + (struct csinn_perf_info *)shl_mem_alloc(sizeof(struct csinn_perf_info)); + shl_gref_call_layer_perf(cb->perf, node, perf_info); + if (perf_info->kernel_name) { + kernel_name = perf_info->kernel_name; + } + shl_mem_free(perf_info); + } + SHL_TRACE_CALL(shl_trace_duration_begin( + params->sess->trace, kernel_name, SHL_TRACE_EVENT_CPU_KERNEL, + shl_trace_create_dict( + 6, "name", SHL_TRACE_STRING(params->name), "layout", + SHL_TRACE_STRING(shl_find_layout_name(params->layout)), "api", + SHL_TRACE_STRING(shl_find_api_name(params->api)), "quant_type", + SHL_TRACE_STRING(shl_find_quant_name(params->quant_type)), "input_shape", + create_node_shape_with_trace(node->in, node->in_num), "input_dtype", + create_node_dtype_with_trace(node->in, node->in_num)))); + } + + int ret = shl_gref_call_layer_func(func, node); + + if (params->sess->profiler_level >= CSINN_PROFILER_LEVEL_TRACE) { + SHL_TRACE_CALL(shl_trace_duration_end( + params->sess->trace, kernel_name, SHL_TRACE_EVENT_CPU_KERNEL, + shl_trace_create_dict( + 2, "output_shape", create_node_shape_with_trace(node->out, node->out_num), + "output_dtype", create_node_dtype_with_trace(node->out, node->out_num)))); + } + + return ret; } int shl_gref_session_run(struct csinn_session *sess) { + SHL_TRACE_CALL(shl_trace_duration_begin(sess->trace, __func__, SHL_TRACE_EVENT_RUNTIME, NULL)); + + int ret = CSINN_TRUE; struct shl_ref_graph *g = shl_gref_get_graph(sess); uint64_t time_acc = 0; node_ref_reset(sess); @@ -906,8 +1317,14 @@ int shl_gref_session_run(struct csinn_session *sess) for (int i = 0; i < g->layer_index; i++) { struct shl_node *n = g->layer[i]; + + char **output_filenames = NULL; + char **output_names = NULL; + int output_num = 0; if (n->type == CSINN_SUBGRAPH) { if (sess->base_run_mode == CSINN_RM_CPU_BASE_HYBRID) { + SHL_TRACE_CALL(shl_trace_duration_begin(sess->trace, 
"subgraph_execution", + SHL_TRACE_EVENT_CPU_OPERATOR, NULL)); shl_subgraph_run_init(n); #ifdef SHL_LAYER_BENCHMARK if (sess->profiler_level == CSINN_PROFILER_LEVEL_TIMER || @@ -930,15 +1347,40 @@ int shl_gref_session_run(struct csinn_session *sess) shl_subgraph_run_deinit(n, g); if (sess->profiler_level == CSINN_PROFILER_LEVEL_DUMP || - sess->profiler_level == CSINN_PROFILER_LEVEL_ALL) { - shl_dump_output_tensor(n); + sess->profiler_level == CSINN_PROFILER_LEVEL_ALL || + (sess->profiler_level > CSINN_PROFILER_LEVEL_TRACE && + (sess->profiler_level - CSINN_PROFILER_LEVEL_TRACE) == + CSINN_PROFILER_LEVEL_DUMP)) { + struct shl_ref_graph *sgraph = n->data; + output_num = sgraph->output_num; + + output_filenames = (char **)shl_mem_alloc(sizeof(char *) * output_num); + for (int i = 0; i < output_num; i++) { + output_filenames[i] = (char *)shl_mem_alloc(sizeof(char) * 1024); + } + shl_dump_output_tensor(n, output_filenames); } #else shl_subgraph_run(n); shl_subgraph_run_deinit(n, g); #endif + if (output_filenames == NULL) { + SHL_TRACE_CALL(shl_trace_duration_end(sess->trace, "subgraph_execution", + SHL_TRACE_EVENT_CPU_OPERATOR, NULL)); + } else { + output_names = get_node_output_names(n); + SHL_TRACE_CALL(shl_trace_duration_end( + sess->trace, "subgraph_execution", SHL_TRACE_EVENT_CPU_OPERATOR, + shl_trace_create_dict( + 3, "name", SHL_TRACE_STRING(n->name), "output_files", + create_strings_with_trace(output_filenames, output_num), "output_names", + create_strings_with_trace(output_names, output_num)))); + } } } else if (n->type >= 0 && n->type < CSINN_OP_SIZE) { + SHL_TRACE_CALL(shl_trace_duration_begin(sess->trace, "cpu_ops_execution", + SHL_TRACE_EVENT_CPU_OPERATOR, NULL)); + op_run_init(n); #ifdef SHL_LAYER_BENCHMARK if (sess->profiler_level == CSINN_PROFILER_LEVEL_TIMER || @@ -952,21 +1394,59 @@ int shl_gref_session_run(struct csinn_session *sess) op_run(n); } if (sess->profiler_level == CSINN_PROFILER_LEVEL_DUMP || - sess->profiler_level == 
CSINN_PROFILER_LEVEL_ALL) { - shl_dump_output_tensor(n); + sess->profiler_level == CSINN_PROFILER_LEVEL_ALL || + (sess->profiler_level > CSINN_PROFILER_LEVEL_TRACE && + (sess->profiler_level - CSINN_PROFILER_LEVEL_TRACE) == + CSINN_PROFILER_LEVEL_DUMP)) { + output_num = n->out_num; + output_filenames = (char **)shl_mem_alloc(sizeof(char *) * output_num); + for (int idx = 0; idx < output_num; idx++) { + output_filenames[idx] = (char *)shl_mem_alloc(sizeof(char) * 1024); + } + shl_dump_output_tensor(n, output_filenames); } #else op_run(n); #endif op_run_deinit(n, g); + if (output_filenames == NULL) { + SHL_TRACE_CALL(shl_trace_duration_end(sess->trace, "cpu_ops_execution", + SHL_TRACE_EVENT_CPU_OPERATOR, NULL)); + } else { + output_names = get_node_output_names(n); + SHL_TRACE_CALL(shl_trace_duration_end( + sess->trace, "cpu_ops_execution", SHL_TRACE_EVENT_CPU_OPERATOR, + shl_trace_create_dict(3, "name", SHL_TRACE_STRING(n->name), "output_files", + create_strings_with_trace(output_filenames, output_num), + "output_names", + create_strings_with_trace(output_names, output_num)))); + } } else { - return CSINN_FALSE; + ret = CSINN_FALSE; + } + + if (output_filenames) { + for (int idx = 0; idx < output_num; idx++) { + shl_mem_free(output_filenames[idx]); + } + shl_mem_free(output_filenames); + output_filenames = NULL; + } + if (output_names) { + for (int idx = 0; idx < output_num; idx++) { + shl_mem_free(output_names[idx]); + } + shl_mem_free(output_names); + output_names = NULL; } } #ifdef SHL_LAYER_BENCHMARK shl_debug_info("[layer-benchmark]: network exec time = %f\n\n", time_acc / 1000000.0f); #endif - return CSINN_TRUE; + + SHL_TRACE_CALL(shl_trace_duration_end(sess->trace, __func__, SHL_TRACE_EVENT_RUNTIME, NULL)); + + return ret; } void shl_gref_set_tensor(struct csinn_tensor *input, struct csinn_session *sess) @@ -1001,6 +1481,18 @@ void shl_gref_session_deinit(struct csinn_session *sess) for (int i = 0; i < g->layer_index; i++) { struct shl_node *n = g->layer[i]; 
if (n->type == CSINN_SUBGRAPH) { + if (sess->profiler_level >= CSINN_PROFILER_LEVEL_TRACE) { + struct shl_ref_graph *sgraph = n->data; + struct shl_node *node = sgraph->layer[0]; + struct csinn_params_base *params = node->data; + + // move trace data of subgraph into that of main graph. + shl_trace_move_events(params->sess->trace, sess->trace); + + // (Note:@chenf) disable trace temporarily + struct shl_trace *sub_trace = params->sess->trace; + sub_trace->enable_trace = false; + } shl_subgraph_deinit(n); } } diff --git a/source/graph_ref/subgraph.c b/source/graph_ref/subgraph.c index 311a4fbf..2db89cb0 100644 --- a/source/graph_ref/subgraph.c +++ b/source/graph_ref/subgraph.c @@ -352,6 +352,7 @@ static void set_sub_session(struct csinn_session *sub_sess, struct csinn_params_ { struct csinn_session *base_sess = params->sess; sub_sess->base_api = params->api; + sub_sess->profiler_level = base_sess->profiler_level; if (params->api == CSINN_TH1520) { sub_sess->base_dtype = base_sess->base_dtype; sub_sess->debug_level = base_sess->debug_level; @@ -384,6 +385,11 @@ int shl_subgraph_setup(struct shl_node *n) struct shl_ref_graph *sgraph = n->data; struct shl_node *init_node = sgraph->layer[0]; struct csinn_params_base *init_params = init_node->data; + struct csinn_session *ori_sess = init_params->sess; + + SHL_TRACE_CALL( + shl_trace_duration_begin(ori_sess->trace, __func__, SHL_TRACE_EVENT_CPU_OPERATOR, NULL)); + struct csinn_session *sub_sess = csinn_alloc_session(); set_sub_session(sub_sess, init_params, sgraph); csinn_session_init(sub_sess); @@ -586,6 +592,8 @@ int shl_subgraph_setup(struct shl_node *n) } default: shl_debug_error("%s unknown op\n", __func__); + SHL_TRACE_CALL(shl_trace_duration_end(ori_sess->trace, __func__, + SHL_TRACE_EVENT_CPU_OPERATOR, NULL)); return CSINN_FALSE; } } @@ -606,6 +614,9 @@ int shl_subgraph_setup(struct shl_node *n) csinn_session_setup(sub_sess); + SHL_TRACE_CALL( + shl_trace_duration_end(ori_sess->trace, __func__, 
SHL_TRACE_EVENT_CPU_OPERATOR, NULL)); + return ret; } @@ -901,6 +912,47 @@ static int is_memory_op(enum csinn_op_enum op) return 0; } +static int is_subgraph_nodes_th1520(enum csinn_op_enum op) +{ + enum csinn_op_enum ops[CSINN_OP_SIZE] = { + CSINN_OP_CONV1D, + CSINN_OP_CONV2D, + CSINN_OP_CONV2D_RELU, + CSINN_OP_CONV2D_RELU6, + CSINN_OP_CONV2D_CHANNEL, + CSINN_OP_CONV2D_CHANNEL_RELU, + CSINN_OP_CONV2D_CHANNEL_RELU6, + CSINN_OP_DEPTHWISE_CONV1D, + CSINN_OP_DEPTHWISE_CONV2D, + CSINN_OP_DEPTHWISE_CONV2D_RELU, + CSINN_OP_DEPTHWISE_CONV2D_RELU6, + CSINN_OP_DEPTHWISE_CONV2D_CHANNEL, + CSINN_OP_DEPTHWISE_CONV2D_CHANNEL_RELU, + CSINN_OP_DEPTHWISE_CONV2D_CHANNEL_RELU6, + CSINN_OP_GROUP_CONV1D, + CSINN_OP_GROUP_CONV2D, + CSINN_OP_GROUP_CONV2D_RELU, + CSINN_OP_GROUP_CONV2D_RELU6, + CSINN_OP_GROUP_CONV2D_CHANNEL, + CSINN_OP_GROUP_CONV2D_CHANNEL_RELU, + CSINN_OP_CONV3D, + CSINN_OP_DECONV2D, + CSINN_OP_DEPTHWISE_DECONV2D, + CSINN_OP_GROUP_DECONV2D, + CSINN_OP_DECONV3D, + CSINN_OP_FULLYCONNECTED, + + CSINN_OP_ADD, + }; + + for (int idx = 0; idx < CSINN_OP_SIZE; idx++) { + if (ops[idx] == op) { + return 1; + } + } + return 0; +} + void shl_subgraph_fvisit_fuse(struct shl_ref_graph *graph, struct shl_node *node) { /* CPU nodes needn't be added into subgraph. */ @@ -921,7 +973,8 @@ void shl_subgraph_fvisit_fuse(struct shl_ref_graph *graph, struct shl_node *node int is_th1520 = shl_subgraph_get_device(node) == CSINN_TH1520 ? 1 : 0; int is_profiler = params->sess->profiler_level == CSINN_PROFILER_LEVEL_UNSET ? 
0 : 1; - if (shl_gref_is_root_node(graph, node) || (is_profiler && is_th1520)) { + if (shl_gref_is_root_node(graph, node) || + (is_profiler && is_th1520 && is_subgraph_nodes_th1520(node->type))) { // if (shl_gref_is_root_node(graph, node) || (is_profiler && !is_th1520) || // (is_profiler && is_th1520 && !is_memory_op(node->type) && node->type != // CSINN_OP_ADD)) { @@ -1047,12 +1100,12 @@ void shl_subgraph_fvisit_fuse(struct shl_ref_graph *graph, struct shl_node *node td->is_hybrid_quantization_type && (is_concat_case || is_before_concat || is_abnormal_concat || is_special_reshape); - int is_th1520_profiler = 0; - if (is_profiler && is_th1520 && !is_memory_op(i_node->type)) { - is_th1520_profiler = 1; - } + // int is_th1520_profiler = 0; + // if (is_profiler && is_th1520 && !is_memory_op(i_node->type)) { + // is_th1520_profiler = 1; + // } - if (!is_restrict && !filter_flag && !is_th1520_profiler) { + if (!is_restrict && !filter_flag) { /* add current node into its i-th input subgraph. */ node->subgraph_idx = i_node->subgraph_idx; struct shl_ref_graph *sgraph = graph->layer[i_node->subgraph_idx]->data; @@ -1126,18 +1179,30 @@ void shl_subgraph_fvisit_fuse(struct shl_ref_graph *graph, struct shl_node *node // } find_flag = shl_is_restricted_by_node(m_node->subgraph_idx, graph->layer[in_m_subgraph_index], graph); + + if (is_profiler && is_th1520) { + struct shl_ref_graph *curr_in_sgraph = + graph->layer[in_m_subgraph_index]->data; + for (int kk = 0; kk < curr_in_sgraph->layer_index; kk++) { + if (is_subgraph_nodes_th1520(curr_in_sgraph->layer[kk]->type)) { + find_flag = 1; + break; + } + } + } + if (find_flag) { is_restrict2 = 1; break; } } - int is_th1520_profiler = 0; - if (is_profiler && is_th1520 && !is_memory_op(sgraph->layer[0]->type)) { - is_th1520_profiler = 1; - } + // int is_th1520_profiler = 0; + // if (is_profiler && is_th1520 && !is_memory_op(sgraph->layer[0]->type)) { + // is_th1520_profiler = 1; + // } - if (!is_restrict && !is_restrict2 && 
!is_th1520_profiler) { + if (!is_restrict && !is_restrict2) { /* can fuse subgraph into current subgraph. */ for (int n = 0; n < sgraph->layer_index; n++) { struct shl_node *subgraph_node = sgraph->layer[n]; diff --git a/source/llm/llama2.c b/source/llm/llama2.c index b048a7fa..815cc378 100644 --- a/source/llm/llama2.c +++ b/source/llm/llama2.c @@ -10,14 +10,14 @@ static char *alloc_name(char *name) static char *alloc_index_name(int index, char *name) { char *ret = shl_mem_alloc(strlen(name) + 10); - sprintf(ret, "%s_%d_", name, index); + sprintf(ret, "%s_%d", name, index); return ret; } static char *concat_name(char *name, char *append) { char *ret = shl_mem_alloc(strlen(name) + strlen(append) + 10); - sprintf(ret, "%s%s", name, append); + sprintf(ret, "%s_%s", name, append); return ret; } @@ -43,10 +43,12 @@ static struct csinn_tensor *linear(struct csinn_session *sess, struct csinn_tens { struct csinn_tensor *linear_output = csinn_alloc_tensor(sess); linear_output->name = concat_name(name, "output"); + linear_output->dtype = sess->base_dtype; y->is_const = 1; struct csinn_matmul_params *linear_params = csinn_alloc_params(sizeof(struct csinn_matmul_params), sess); + linear_params->base.name = concat_name(name, "params"); linear_params->trans_b = true; csinn_matmul_init(x, y, linear_output, linear_params); csinn_matmul(x, y, linear_output, linear_params); @@ -58,10 +60,11 @@ static struct csinn_tensor *matmul(struct csinn_session *sess, struct csinn_tens { struct csinn_tensor *matmul_output = csinn_alloc_tensor(sess); matmul_output->name = concat_name(name, "output"); - matmul_output->dtype = CSINN_DTYPE_FLOAT32; + matmul_output->dtype = sess->base_dtype; struct csinn_matmul_params *matmul_params = csinn_alloc_params(sizeof(struct csinn_matmul_params), sess); + matmul_params->base.name = concat_name(name, "params"); csinn_matmul_init(x, y, matmul_output, matmul_params); csinn_matmul(x, y, matmul_output, matmul_params); return matmul_output; @@ -71,10 +74,11 @@ 
static struct csinn_tensor *silu(struct csinn_session *sess, struct csinn_tensor { struct csinn_tensor *silu_output = csinn_alloc_tensor(sess); silu_output->name = concat_name(name, "output"); - silu_output->dtype = CSINN_DTYPE_FLOAT32; + silu_output->dtype = sess->base_dtype; struct csinn_sigmoid_params *silu_params = csinn_alloc_params(sizeof(struct csinn_sigmoid_params), sess); + silu_params->base.name = concat_name(name, "params"); csinn_silu_init(x, silu_output, silu_params); csinn_silu(x, silu_output, silu_params); return silu_output; @@ -85,13 +89,13 @@ static struct csinn_tensor *norm(struct csinn_session *sess, struct csinn_tensor { struct csinn_tensor *output = csinn_alloc_tensor(sess); output->name = concat_name(name, "output"); - output->dtype = CSINN_DTYPE_FLOAT32; + output->dtype = sess->base_dtype; /* * output = x * rsqrt(x.pow(2).mean(-1, keepdim=True) + eps) * weight */ struct csinn_rms_norm_params *rms_params = csinn_alloc_params(sizeof(struct csinn_rms_norm_params), sess); - + rms_params->base.name = concat_name(name, "params"); // FIXME: from params.json's norm_eps rms_params->epsilon = 1e-05; // last dim @@ -102,20 +106,6 @@ static struct csinn_tensor *norm(struct csinn_session *sess, struct csinn_tensor return output; } -static struct csinn_tensor *view(struct csinn_session *sess, struct csinn_tensor *in, char *name) -{ - struct csinn_tensor *output = csinn_alloc_tensor(sess); - output->name = concat_name(name, "output"); - output->dtype = CSINN_DTYPE_FLOAT32; - - struct csinn_reshape_params *params = - csinn_alloc_params(sizeof(struct csinn_reshape_params), sess); - - csinn_reshape_init(in, output, params); - csinn_reshape(in, output, params); - return output; -} - static struct csinn_tensor *attention(struct shl_transformer_block *block, struct csinn_tensor *x, struct shl_llm_layer *llayer, char *name) { @@ -137,6 +127,7 @@ static struct csinn_tensor *attention(struct shl_transformer_block *block, struc // xk = xk.view(bsz, seqlen, 
self.n_local_kv_heads, self.head_dim) struct csinn_reshape_params *xk_reshape_params = csinn_alloc_params(sizeof(struct csinn_reshape_params), sess); + xk_reshape_params->base.name = concat_name(name, "xk_reshape_params"); xk_reshape_params->shape_num = 4; xk_reshape_params->shape = shl_mem_alloc(4 * sizeof(int32_t)); xk_reshape_params->shape[0] = bsz; @@ -145,13 +136,14 @@ static struct csinn_tensor *attention(struct shl_transformer_block *block, struc xk_reshape_params->shape[3] = head_dim; struct csinn_tensor *xk_reshape_output = csinn_alloc_tensor(sess); - xk_reshape_output->name = alloc_name("xk_reshape_output"); + xk_reshape_output->name = concat_name(name, "xk_reshape_output"); csinn_reshape_init(xk, xk_reshape_output, xk_reshape_params); csinn_reshape(xk, xk_reshape_output, xk_reshape_params); // xq = xq.view(bsz, seqlen, self.n_local_heads, self.head_dim) struct csinn_reshape_params *xq_reshape_params = csinn_alloc_params(sizeof(struct csinn_reshape_params), sess); + xq_reshape_params->base.name = concat_name(name, "xq_reshape_params"); xq_reshape_params->shape_num = 4; xq_reshape_params->shape = shl_mem_alloc(4 * sizeof(int32_t)); xq_reshape_params->shape[0] = bsz; @@ -160,7 +152,7 @@ static struct csinn_tensor *attention(struct shl_transformer_block *block, struc xq_reshape_params->shape[3] = head_dim; struct csinn_tensor *xq_reshape_output = csinn_alloc_tensor(sess); - xq_reshape_output->name = alloc_name("xq_reshape_output"); + xq_reshape_output->name = concat_name(name, "xq_reshape_output"); csinn_reshape_init(xq, xq_reshape_output, xq_reshape_params); csinn_reshape(xq, xq_reshape_output, xq_reshape_params); @@ -168,6 +160,7 @@ static struct csinn_tensor *attention(struct shl_transformer_block *block, struc xq_rope->name = concat_name(name, "xq_rope"); struct csinn_rope_params *rope_params = csinn_alloc_params(sizeof(struct csinn_rope_params), sess); + rope_params->base.name = concat_name(name, "rope_params"); rope_params->freq_base = 10000; 
rope_params->freq_scale = 1; rope_params->xpos_base = 0; @@ -189,6 +182,7 @@ static struct csinn_tensor *attention(struct shl_transformer_block *block, struc // xv = xv.view(bsz, seqlen, self.n_local_kv_heads, self.head_dim) struct csinn_reshape_params *xv_reshape_params = csinn_alloc_params(sizeof(struct csinn_reshape_params), sess); + xv_reshape_params->base.name = concat_name(name, "xv_reshape_params"); xv_reshape_params->shape_num = 4; xv_reshape_params->shape = shl_mem_alloc(4 * sizeof(int32_t)); xv_reshape_params->shape[0] = bsz; @@ -197,14 +191,14 @@ static struct csinn_tensor *attention(struct shl_transformer_block *block, struc xv_reshape_params->shape[3] = head_dim; struct csinn_tensor *xv_reshape_output = csinn_alloc_tensor(sess); - xv_reshape_output->name = alloc_name("xv_reshape_output"); + xv_reshape_output->name = concat_name(name, "xv_reshape_output"); csinn_reshape_init(xv, xv_reshape_output, xv_reshape_params); csinn_reshape(xv, xv_reshape_output, xv_reshape_params); // cache_k[:bsz, start_pos : start_pos + seqlen] = xk struct csinn_tensor *cache_k = csinn_alloc_tensor(sess); - cache_k->name = alloc_name("cache_k"); - cache_k->dtype = CSINN_DTYPE_FLOAT32; + cache_k->name = concat_name(name, "cache_k"); + cache_k->dtype = sess->base_dtype; cache_k->dim_count = 4; cache_k->dim[0] = 1; cache_k->dim[1] = 2048; // max_seq_len @@ -216,6 +210,7 @@ static struct csinn_tensor *attention(struct shl_transformer_block *block, struc struct csinn_llm_pos_params *xk_cache_params = csinn_alloc_params(sizeof(struct csinn_llm_pos_params), sess); + xk_cache_params->base.name = concat_name(name, "xk_cache_params"); xk_cache_params->bsz = bsz; xk_cache_params->seqlen = seqlen; xk_cache_params->mode = CSINN_LLM_POS_CACHE_COPY_IN; @@ -225,8 +220,8 @@ static struct csinn_tensor *attention(struct shl_transformer_block *block, struc // cache_v[:bsz, start_pos : start_pos + seqlen] = xv struct csinn_tensor *cache_v = csinn_alloc_tensor(sess); - cache_v->name = 
alloc_name("cache_v"); - cache_v->dtype = CSINN_DTYPE_FLOAT32; + cache_v->name = concat_name(name, "cache_v"); + cache_v->dtype = sess->base_dtype; cache_v->dim_count = 4; cache_v->dim[0] = 1; cache_v->dim[1] = 2048; // max_seq_len @@ -238,6 +233,7 @@ static struct csinn_tensor *attention(struct shl_transformer_block *block, struc struct csinn_llm_pos_params *xv_cache_params = csinn_alloc_params(sizeof(struct csinn_llm_pos_params), sess); + xv_cache_params->base.name = concat_name(name, "xv_cache_params"); xv_cache_params->bsz = bsz; xv_cache_params->seqlen = seqlen; xv_cache_params->mode = CSINN_LLM_POS_CACHE_COPY_IN; @@ -251,6 +247,7 @@ static struct csinn_tensor *attention(struct shl_transformer_block *block, struc struct csinn_llm_pos_params *keys_params = csinn_alloc_params(sizeof(struct csinn_llm_pos_params), sess); + keys_params->base.name = concat_name(name, "keys_params"); keys_params->bsz = bsz; keys_params->seqlen = seqlen; keys_params->mode = CSINN_LLM_POS_CACHE_COPY_OUT; @@ -263,10 +260,11 @@ static struct csinn_tensor *attention(struct shl_transformer_block *block, struc // xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim) struct csinn_tensor *xq_transpose = csinn_alloc_tensor(sess); - xq_transpose->name = alloc_name("xq_transpose"); + xq_transpose->name = concat_name(name, "xq_transpose"); struct csinn_transpose_params *xq_transpose_params = csinn_alloc_params(sizeof(struct csinn_transpose_params), sess); + xq_transpose_params->base.name = concat_name(name, "xq_transpose_params"); xq_transpose_params->permute_num = 4; xq_transpose_params->permute = shl_mem_alloc(4 * sizeof(int32_t)); xq_transpose_params->permute[0] = 0; @@ -278,10 +276,11 @@ static struct csinn_tensor *attention(struct shl_transformer_block *block, struc // keys = keys.transpose(1, 2) struct csinn_tensor *keys_transpose = csinn_alloc_tensor(sess); - keys_transpose->name = alloc_name("keys_transpose"); + keys_transpose->name = concat_name(name, "keys_transpose"); struct 
csinn_transpose_params *keys_transpose_params = csinn_alloc_params(sizeof(struct csinn_transpose_params), sess); + keys_transpose_params->base.name = concat_name(name, "keys_transpose_params"); keys_transpose_params->permute_num = 4; keys_transpose_params->permute = shl_mem_alloc(4 * sizeof(int32_t)); keys_transpose_params->permute[0] = 0; @@ -297,6 +296,7 @@ static struct csinn_tensor *attention(struct shl_transformer_block *block, struc struct csinn_matmul_params *scores_matmul_params = csinn_alloc_params(sizeof(struct csinn_matmul_params), sess); + scores_matmul_params->base.name = concat_name(name, "scores_matmul_params"); scores_matmul_params->trans_b = true; csinn_matmul_init(xq_transpose, keys_transpose, scores, scores_matmul_params); csinn_matmul(xq_transpose, keys_transpose, scores, scores_matmul_params); @@ -306,13 +306,20 @@ static struct csinn_tensor *attention(struct shl_transformer_block *block, struc struct csinn_tensor *scale = csinn_alloc_tensor(sess); scale->is_const = 1; - float *scale_value = shl_mem_alloc(4); - scale_value[0] = 0.088388347648318; - scale->data = scale_value; scale->dim_count = 1; scale->dim[0] = 1; + scale->data = shl_mem_alloc(csinn_tensor_byte_size(scale)); + float scale_value = 0.088388347648318; + if (sess->base_dtype == CSINN_DTYPE_FLOAT32) { + float *scale_data = scale->data; + scale_data[0] = scale_value; + } else if (sess->base_dtype == CSINN_DTYPE_FLOAT16) { + int16_t *scale_data = scale->data; + scale_data[0] = shl_ref_float32_to_float16(scale_value); + } struct csinn_diso_params *scores_mul_params = csinn_alloc_params(sizeof(struct csinn_diso_params), sess); + scores_mul_params->base.name = concat_name(name, "scores_mul_params"); csinn_mul_init(scores, scale, scores_mul, scores_mul_params); csinn_mul(scores, scale, scores_mul, scores_mul_params); @@ -323,6 +330,7 @@ static struct csinn_tensor *attention(struct shl_transformer_block *block, struc struct csinn_llm_pos_params *scores_mask_params = 
csinn_alloc_params(sizeof(struct csinn_llm_pos_params), sess); + scores_mask_params->base.name = concat_name(name, "scores_mask_params"); scores_mask_params->bsz = bsz; scores_mask_params->seqlen = seqlen; scores_mask_params->mode = CSINN_LLM_POS_MASK; @@ -335,6 +343,7 @@ static struct csinn_tensor *attention(struct shl_transformer_block *block, struc struct csinn_softmax_params *scores_softmax_params = csinn_alloc_params(sizeof(struct csinn_softmax_params), sess); + scores_softmax_params->base.name = concat_name(name, "scores_softmax_params"); scores_softmax_params->axis = 3; csinn_softmax_init(scores_mask, scores_softmax, scores_softmax_params); csinn_softmax(scores_mask, scores_softmax, scores_softmax_params); @@ -345,6 +354,7 @@ static struct csinn_tensor *attention(struct shl_transformer_block *block, struc struct csinn_llm_pos_params *values_params = csinn_alloc_params(sizeof(struct csinn_llm_pos_params), sess); + values_params->base.name = concat_name(name, "values_params"); values_params->bsz = bsz; values_params->seqlen = seqlen; values_params->mode = CSINN_LLM_POS_CACHE_COPY_OUT; @@ -354,10 +364,11 @@ static struct csinn_tensor *attention(struct shl_transformer_block *block, struc // values = values.transpose(1, 2) struct csinn_tensor *values_transpose = csinn_alloc_tensor(sess); - values_transpose->name = alloc_name("values_transpose"); + values_transpose->name = concat_name(name, "values_transpose"); struct csinn_transpose_params *values_transpose_params = csinn_alloc_params(sizeof(struct csinn_transpose_params), sess); + values_transpose_params->base.name = concat_name(name, "values_transpose_params"); values_transpose_params->permute_num = 4; values_transpose_params->permute = shl_mem_alloc(4 * sizeof(int32_t)); values_transpose_params->permute[0] = 0; @@ -373,6 +384,7 @@ static struct csinn_tensor *attention(struct shl_transformer_block *block, struc struct csinn_matmul_params *output_matmul_params = csinn_alloc_params(sizeof(struct 
csinn_matmul_params), sess); + output_matmul_params->base.name = concat_name(name, "output_matmul_params"); csinn_matmul_init(scores_softmax, values_transpose, output_matmul, output_matmul_params); csinn_matmul(scores_softmax, values_transpose, output_matmul, output_matmul_params); @@ -382,6 +394,7 @@ static struct csinn_tensor *attention(struct shl_transformer_block *block, struc struct csinn_transpose_params *output_transpose_params = csinn_alloc_params(sizeof(struct csinn_transpose_params), sess); + output_transpose_params->base.name = concat_name(name, "output_transpose_params"); output_transpose_params->permute_num = 4; output_transpose_params->permute = shl_mem_alloc(4 * sizeof(int32_t)); output_transpose_params->permute[0] = 0; @@ -393,6 +406,8 @@ static struct csinn_tensor *attention(struct shl_transformer_block *block, struc struct csinn_reshape_params *output_transpose_reshape_params = csinn_alloc_params(sizeof(struct csinn_reshape_params), sess); + output_transpose_reshape_params->base.name = + concat_name(name, "output_transpose_reshape_params"); output_transpose_reshape_params->shape_num = 3; output_transpose_reshape_params->shape = shl_mem_alloc(3 * sizeof(int32_t)); output_transpose_reshape_params->shape[0] = bsz; @@ -400,7 +415,7 @@ static struct csinn_tensor *attention(struct shl_transformer_block *block, struc output_transpose_reshape_params->shape[2] = n_heads * head_dim; struct csinn_tensor *output_transpose_reshape_output = csinn_alloc_tensor(sess); - output_transpose_reshape_output->name = alloc_name("output_transpose_reshape_output"); + output_transpose_reshape_output->name = concat_name(name, "output_transpose_reshape_output"); csinn_reshape_init(output_transpose, output_transpose_reshape_output, output_transpose_reshape_params); csinn_reshape(output_transpose, output_transpose_reshape_output, @@ -430,9 +445,9 @@ static struct csinn_tensor *feed_forward(struct csinn_session *sess, struct csin x2->name = concat_name(name, 
"ff_0_x2_mul_output"); struct csinn_diso_params *x2_mul_params = csinn_alloc_params(sizeof(struct csinn_diso_params), sess); + x2_mul_params->base.name = concat_name(name, "x2_mul_params"); csinn_mul_init(silu_output, x3, x2, x2_mul_params); csinn_mul(silu_output, x3, x2, x2_mul_params); - // struct csinn_tensor *x2 = matmul(sess, silu_output, x3, concat_name(name, "x2_matmul")); // x2 = linear(x2, w2) struct csinn_tensor *x2_linear_output = linear(sess, x2, w2, concat_name(name, "x2_linear")); @@ -449,11 +464,11 @@ static struct shl_transformer_block *layer(struct shl_llm_ctx *ctx, struct csinn struct csinn_session *sess = csinn_alloc_session(); sess->base_run_mode = CSINN_RM_CPU_GRAPH; - sess->base_quant_type = CSINN_QUANT_FLOAT32; + sess->base_quant_type = ctx->base_quant_type; sess->model.save_mode = CSINN_RUN_ONLY; sess->base_layout = CSINN_LAYOUT_NCHW; - sess->base_api = CSINN_REF; - sess->base_dtype = CSINN_DTYPE_FLOAT32; + sess->base_api = ctx->base_api; + sess->base_dtype = ctx->base_dtype; sess->dynamic_shape = CSINN_FALSE; // sess->debug_level = CSINN_DEBUG_LEVEL_INFO; csinn_session_init(sess); @@ -480,6 +495,7 @@ static struct shl_transformer_block *layer(struct shl_llm_ctx *ctx, struct csinn h_attention->name = alloc_index_name(layer_id, "h_attention"); struct csinn_diso_params *x_add_params = csinn_alloc_params(sizeof(struct csinn_diso_params), sess); + x_add_params->base.name = alloc_index_name(layer_id, "x_add_params"); csinn_add_init(x, attention_output, h_attention, x_add_params); csinn_add(x, attention_output, h_attention, x_add_params); @@ -502,6 +518,7 @@ static struct shl_transformer_block *layer(struct shl_llm_ctx *ctx, struct csinn struct csinn_diso_params *h_add_params = csinn_alloc_params(sizeof(struct csinn_diso_params), sess); + h_add_params->base.name = alloc_index_name(layer_id, "h_add_params"); csinn_add_init(h_attention, ff_output, h_ff, h_add_params); csinn_add(h_attention, ff_output, h_ff, h_add_params); @@ -512,15 +529,15 @@ 
static struct shl_transformer_block *layer(struct shl_llm_ctx *ctx, struct csinn return ret; } -static struct csinn_session *tok_embedding(struct llama_config *config) +static struct csinn_session *tok_embedding(struct shl_llm_model *model, struct shl_llm_ctx *config) { struct csinn_session *sess = csinn_alloc_session(); sess->base_run_mode = CSINN_RM_CPU_GRAPH; - sess->base_quant_type = CSINN_QUANT_FLOAT16; + sess->base_quant_type = config->base_quant_type; sess->model.save_mode = CSINN_RUN_ONLY; sess->base_layout = CSINN_LAYOUT_NCHW; - sess->base_api = CSINN_REF; - sess->base_dtype = CSINN_DTYPE_FLOAT16; + sess->base_api = config->base_api; + sess->base_dtype = config->base_dtype; sess->dynamic_shape = CSINN_TRUE; // sess->debug_level = CSINN_DEBUG_LEVEL_INFO; csinn_session_init(sess); @@ -534,26 +551,27 @@ static struct csinn_session *tok_embedding(struct llama_config *config) csinn_set_input(0, embd, sess); struct csinn_tensor *embd_output = csinn_alloc_tensor(sess); embd_output->name = "embd_output"; - embd_output->dtype = CSINN_DTYPE_FLOAT32; + embd_output->dtype = sess->base_dtype; embd_output->dim_count = 2; embd_output->dim[0] = 0; - embd_output->dim[1] = config->shl_model->tok_embeddings->dim[1]; + embd_output->dim[1] = model->tok_embeddings->dim[1]; struct csinn_tensor *embd_weight = csinn_alloc_tensor(sess); embd_weight->name = "embd_weight"; embd_weight->is_const = 1; - embd_weight->dtype = config->shl_model->tok_embeddings->dtype; - embd_weight->mtype = config->shl_model->tok_embeddings->mtype; - embd_weight->dim_count = config->shl_model->tok_embeddings->dim_count; - embd_weight->dim[0] = config->shl_model->tok_embeddings->dim[0]; - embd_weight->dim[1] = config->shl_model->tok_embeddings->dim[1]; + embd_weight->dtype = model->tok_embeddings->dtype; + embd_weight->mtype = model->tok_embeddings->mtype; + embd_weight->dim_count = model->tok_embeddings->dim_count; + embd_weight->dim[0] = model->tok_embeddings->dim[0]; + embd_weight->dim[1] = 
model->tok_embeddings->dim[1]; - embd_weight->data = config->shl_model->tok_embeddings->data; + embd_weight->data = model->tok_embeddings->data; struct csinn_diso_params *embd_params = csinn_alloc_params(sizeof(struct csinn_diso_params), sess); + embd_params->base.name = alloc_name("embd_params"); csinn_embedding_init(embd, embd_weight, embd_output, embd_params); csinn_embedding(embd, embd_weight, embd_output, embd_params); @@ -567,11 +585,11 @@ static struct csinn_session *llama2_output(struct shl_llm_ctx *ctx) { struct csinn_session *sess = csinn_alloc_session(); sess->base_run_mode = CSINN_RM_CPU_GRAPH; - sess->base_quant_type = CSINN_QUANT_FLOAT32; + sess->base_quant_type = ctx->base_quant_type; sess->model.save_mode = CSINN_RUN_ONLY; sess->base_layout = CSINN_LAYOUT_NCHW; - sess->base_api = CSINN_REF; - sess->base_dtype = CSINN_DTYPE_FLOAT32; + sess->base_api = ctx->base_api; + sess->base_dtype = ctx->base_dtype; sess->dynamic_shape = CSINN_FALSE; // sess->debug_level = CSINN_DEBUG_LEVEL_INFO; csinn_session_init(sess); @@ -606,10 +624,14 @@ static struct csinn_session *llama2_output(struct shl_llm_ctx *ctx) struct shl_llm_ctx *llama2_build(struct llama_config *config) { struct shl_llm_ctx *ctx = shl_mem_alloc(sizeof(struct shl_llm_ctx)); + /* TODO: target may have multiple computing units */ + ctx->base_api = config->base_api; + ctx->base_dtype = config->base_dtype; + ctx->base_quant_type = config->base_quant_type; ctx->shl_model = config->shl_model; // h = tok_embedding(tokens) - ctx->embeding_session = tok_embedding(config); + ctx->embeding_session = tok_embedding(config->shl_model, ctx); // TransformerBlocks: h = layer(h, start_pos, freqes_cis, mask) ctx->layers_num = config->n_layers; diff --git a/source/llm/llm.c b/source/llm/llm.c index 9146899b..ab204f42 100644 --- a/source/llm/llm.c +++ b/source/llm/llm.c @@ -126,7 +126,7 @@ static void llm_session_dynamic_infer_shape(struct csinn_session *sess, struct s shl_gref_rope_infer_shape(n->in[0]->data, 
n->out[0]->data, rope_params); break; case CSINN_OP_RMS_NORM: - shl_gref_rms_norm_infer_shape(n->in[0]->data, n->out[0]->data, n->in[1]->data, + shl_gref_rms_norm_infer_shape(n->in[0]->data, n->in[1]->data, n->out[0]->data, (struct csinn_rms_norm_params *)params); break; case CSINN_OP_SILU: diff --git a/source/nn2/format.c b/source/nn2/format.c index 3b735188..51b39e4f 100644 --- a/source/nn2/format.c +++ b/source/nn2/format.c @@ -198,7 +198,7 @@ static char *tensor_dump(struct csinn_tensor *tensor, int *size) ret->layout = tensor->layout; ret->quant_channel = tensor->quant_channel; - if (tensor->is_const) { + if (tensor->is_const && tensor->data != NULL) { ret = shl_mem_realloc(ret, tensor_size + csinn_tensor_byte_size(tensor), tensor_size); append_ptr = (char *)ret + tensor_size; memcpy(append_ptr, tensor->data, csinn_tensor_byte_size(tensor)); @@ -206,7 +206,7 @@ static char *tensor_dump(struct csinn_tensor *tensor, int *size) tensor_size += csinn_tensor_byte_size(tensor); } else { /* ignore data */ - ret->data = 0; + ret->data = NULL; } *size = tensor_size; @@ -228,7 +228,7 @@ static void tensor_load(struct csinn_tensor *dest, struct csinn_tensor *src) dest->is_const = src->is_const; char *src_qinfo = (char *)src + read_offset(src->qinfo); memcpy(dest->qinfo, src_qinfo, sizeof(struct csinn_quant_info) * src->quant_channel); - if (src->is_const) { + if (src->is_const && src->data != NULL) { dest->data = copy_from_bm(ptr_offset_to_addr(src, src->data), csinn_tensor_byte_size(src)); } } @@ -310,6 +310,7 @@ void shl_bm_session_load(struct csinn_session *dest, struct csinn_session *src) dest->base_dtype = src->base_dtype; dest->base_run_mode = src->base_run_mode; dest->debug_level = src->debug_level; + dest->profiler_level = src->profiler_level; csinn_session_init(dest); csinn_set_input_number(src->input_num, dest); csinn_set_output_number(src->output_num, dest); @@ -428,7 +429,22 @@ static char *layer_data_dump(struct shl_node *layer, int *size) *size = 
extend_size; - if (layer->type == CSINN_OP_RESHAPE) { + if (layer->type == CSINN_OP_CONV2D || layer->type == CSINN_OP_DEPTHWISE_CONV2D || + layer->type == CSINN_OP_GROUP_CONV2D) { + struct csinn_conv2d_params *conv2d_params = layer->data; + if (conv2d_params->conv_extra.kernel_tm != NULL) { + int kernel_tm_size; + char *kernel_tm_buf = tensor_dump(conv2d_params->conv_extra.kernel_tm, &kernel_tm_size); + ret = shl_mem_realloc(ret, extend_size + kernel_tm_size, extend_size); + struct csinn_conv2d_params *ret_conv2d_params = (struct csinn_conv2d_params *)ret; + ret_conv2d_params->conv_extra.kernel_tm = + (struct csinn_tensor *)offset_to_ptr(extend_size); + memcpy((char *)ret + extend_size, kernel_tm_buf, kernel_tm_size); + shl_mem_free(kernel_tm_buf); + extend_size += kernel_tm_size; + *size = extend_size; + } + } else if (layer->type == CSINN_OP_RESHAPE) { struct csinn_reshape_params *reshape_params = layer->data; int shape_size = reshape_params->shape_num * sizeof(int32_t); ret = shl_mem_realloc(ret, extend_size + shape_size, extend_size); @@ -567,8 +583,17 @@ static void layer_data_load(struct shl_node *dest, struct shl_node *src) // /* dest's input have been loaded */ // struct csinn_tensor *input = dest->in[0]->data; // shl_op_callback_map(ret, src->type, input->dtype); - - if (src->type == CSINN_OP_RESHAPE) { + if (src->type == CSINN_OP_CONV2D || src->type == CSINN_OP_DEPTHWISE_CONV2D || + src->type == CSINN_OP_GROUP_CONV2D) { + struct csinn_conv2d_params *src_conv2d_params = ptr_offset_to_addr(src, src->data); + struct csinn_conv2d_params *conv2d_params = (struct csinn_conv2d_params *)ret; + if (src_conv2d_params->conv_extra.kernel_tm != NULL) { + conv2d_params->conv_extra.kernel_tm = csinn_alloc_tensor(NULL); + tensor_load( + conv2d_params->conv_extra.kernel_tm, + ptr_offset_to_addr(src_conv2d_params, src_conv2d_params->conv_extra.kernel_tm)); + } + } else if (src->type == CSINN_OP_RESHAPE) { struct csinn_reshape_params *reshape_params = (struct 
csinn_reshape_params *)ret; char *shape_addr = ptr_offset_to_addr(ptr_offset_to_addr(src, src->data), reshape_params->shape); diff --git a/source/nn2/rms_norm.c b/source/nn2/rms_norm.c index 7ebbd3c3..3e32cb03 100644 --- a/source/nn2/rms_norm.c +++ b/source/nn2/rms_norm.c @@ -23,13 +23,13 @@ * @addtogroup INIT * @{ */ -int csinn_rms_norm_init(struct csinn_tensor *input, struct csinn_tensor *output, - struct csinn_tensor *weights, struct csinn_rms_norm_params *params) +int csinn_rms_norm_init(struct csinn_tensor *input, struct csinn_tensor *weights, + struct csinn_tensor *output, struct csinn_rms_norm_params *params) { shl_op_callback_map(¶ms->base, CSINN_OP_RMS_NORM, input->dtype); int (*func)() = shl_get_init_cb(¶ms->base); if (func != NULL) { - func(input, output, weights, params); + func(input, weights, output, params); } return CSINN_TRUE; } @@ -41,13 +41,13 @@ int csinn_rms_norm_init(struct csinn_tensor *input, struct csinn_tensor *output, * @addtogroup NN * @{ */ -int csinn_rms_norm(struct csinn_tensor *input, struct csinn_tensor *output, - struct csinn_tensor *weights, struct csinn_rms_norm_params *params) +int csinn_rms_norm(struct csinn_tensor *input, struct csinn_tensor *weights, + struct csinn_tensor *output, struct csinn_rms_norm_params *params) { - SHL_DEBUG_CALL(shl_rms_norm_debug_info(input, output, weights, params, __func__)); + SHL_DEBUG_CALL(shl_rms_norm_debug_info(input, weights, output, params, __func__)); int (*func)() = shl_get_p0_cb(¶ms->base); if (func != NULL) { - func(input, output, weights, params); + func(input, weights, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/setup.c b/source/nn2/setup.c index da30b0a3..d27aa67c 100644 --- a/source/nn2/setup.c +++ b/source/nn2/setup.c @@ -153,11 +153,30 @@ void *shl_get_runtime_callback(struct csinn_session *sess, int op) void csinn_session_init(struct csinn_session *sess) { shl_debug_set_level(sess->debug_level); + if (sess->profiler_level >= 
CSINN_PROFILER_LEVEL_TRACE) { + struct shl_trace *trace = (struct shl_trace *)shl_mem_alloc(sizeof(struct shl_trace)); + trace->enable_trace = true; + sess->trace = trace; + + SHL_TRACE_CALL(shl_trace_begin(trace, NULL)); + + // add some meta-data + SHL_TRACE_CALL(shl_trace_other_data( + trace, shl_trace_create_dict( + 4, "source", SHL_TRACE_STRING("csinn"), "base_api", + SHL_TRACE_STRING(shl_find_api_name(sess->base_api)), "base_run_mode", + SHL_TRACE_STRING(shl_find_rmod_name(sess->base_run_mode)), "base_quant_type", + SHL_TRACE_STRING(shl_find_quant_name(sess->base_quant_type))))); + } + + SHL_TRACE_CALL(shl_trace_duration_begin(sess->trace, __func__, SHL_TRACE_EVENT_RUNTIME, NULL)); void *(*func)() = shl_get_runtime_callback(sess, CSINN_SESSION_INIT); if (func != NULL) { func(sess); } + + SHL_TRACE_CALL(shl_trace_duration_end(sess->trace, __func__, SHL_TRACE_EVENT_RUNTIME, NULL)); } /** * @} @@ -169,11 +188,20 @@ void csinn_session_init(struct csinn_session *sess) */ void csinn_session_deinit(struct csinn_session *sess) { + SHL_TRACE_CALL(shl_trace_duration_begin(sess->trace, __func__, SHL_TRACE_EVENT_RUNTIME, NULL)); + void *(*func)(); func = shl_get_runtime_callback(sess, CSINN_SESSION_DEINIT); if (func != NULL) { func(sess); } + + SHL_TRACE_CALL(shl_trace_duration_end(sess->trace, __func__, SHL_TRACE_EVENT_RUNTIME, NULL)); + + if (sess->profiler_level >= CSINN_PROFILER_LEVEL_TRACE) { + SHL_TRACE_CALL(shl_trace_end(sess->trace)); + shl_mem_free(sess->trace); + } } /** * @} @@ -185,6 +213,8 @@ void csinn_session_deinit(struct csinn_session *sess) */ void csinn_set_output_number(int number, struct csinn_session *sess) { + SHL_TRACE_CALL(shl_trace_duration_begin(sess->trace, __func__, SHL_TRACE_EVENT_RUNTIME, NULL)); + sess->output_num = number; sess->output = shl_mem_alloc(sess->output_num * sizeof(struct csinn_tensor *)); void (*func)(); @@ -192,6 +222,8 @@ void csinn_set_output_number(int number, struct csinn_session *sess) if (func != NULL) { func(number, 
sess); } + + SHL_TRACE_CALL(shl_trace_duration_end(sess->trace, __func__, SHL_TRACE_EVENT_RUNTIME, NULL)); } /** * @} @@ -203,6 +235,8 @@ void csinn_set_output_number(int number, struct csinn_session *sess) */ void csinn_set_input_number(int number, struct csinn_session *sess) { + SHL_TRACE_CALL(shl_trace_duration_begin(sess->trace, __func__, SHL_TRACE_EVENT_RUNTIME, NULL)); + sess->input_num = number; sess->input = shl_mem_alloc(sess->input_num * sizeof(struct csinn_tensor *)); void (*func)(); @@ -210,6 +244,8 @@ void csinn_set_input_number(int number, struct csinn_session *sess) if (func != NULL) { func(number, sess); } + + SHL_TRACE_CALL(shl_trace_duration_end(sess->trace, __func__, SHL_TRACE_EVENT_RUNTIME, NULL)); } /** * @} @@ -221,13 +257,20 @@ void csinn_set_input_number(int number, struct csinn_session *sess) */ int csinn_get_output_number(struct csinn_session *sess) { + SHL_TRACE_CALL(shl_trace_duration_begin(sess->trace, __func__, SHL_TRACE_EVENT_RUNTIME, NULL)); + + int ret = 0; int (*func)(); func = shl_get_runtime_callback(sess, CSINN_GET_OUTPUT_NUMBER); if (func != NULL) { - return func(sess); + ret = func(sess); } else { - return sess->output_num; + ret = sess->output_num; } + + SHL_TRACE_CALL(shl_trace_duration_end(sess->trace, __func__, SHL_TRACE_EVENT_RUNTIME, NULL)); + + return ret; } /** * @} @@ -239,13 +282,20 @@ int csinn_get_output_number(struct csinn_session *sess) */ int csinn_get_input_number(struct csinn_session *sess) { + SHL_TRACE_CALL(shl_trace_duration_begin(sess->trace, __func__, SHL_TRACE_EVENT_RUNTIME, NULL)); + + int ret = 0; int (*func)(); func = shl_get_runtime_callback(sess, CSINN_GET_INPUT_NUMBER); if (func != NULL) { - return func(sess); + ret = func(sess); } else { - return sess->input_num; + ret = sess->input_num; } + + SHL_TRACE_CALL(shl_trace_duration_end(sess->trace, __func__, SHL_TRACE_EVENT_RUNTIME, NULL)); + + return ret; } /** * @} @@ -257,13 +307,19 @@ int csinn_get_input_number(struct csinn_session *sess) */ int 
csinn_set_output(int index, struct csinn_tensor *output, struct csinn_session *sess) { + SHL_TRACE_CALL(shl_trace_duration_begin(sess->trace, __func__, SHL_TRACE_EVENT_RUNTIME, NULL)); + + int ret = CSINN_TRUE; sess->output[index] = output; int (*func)(); func = shl_get_runtime_callback(sess, CSINN_SET_OUTPUT); if (func != NULL) { - return func(index, output, sess); + ret = func(index, output, sess); } - return CSINN_TRUE; + + SHL_TRACE_CALL(shl_trace_duration_end(sess->trace, __func__, SHL_TRACE_EVENT_RUNTIME, NULL)); + + return ret; } /** * @} @@ -275,13 +331,19 @@ int csinn_set_output(int index, struct csinn_tensor *output, struct csinn_sessio */ int csinn_set_input(int index, struct csinn_tensor *input, struct csinn_session *sess) { + SHL_TRACE_CALL(shl_trace_duration_begin(sess->trace, __func__, SHL_TRACE_EVENT_RUNTIME, NULL)); + + int ret = CSINN_TRUE; sess->input[index] = input; int (*func)(); func = shl_get_runtime_callback(sess, CSINN_SET_INPUT); if (func != NULL) { - return func(index, input, sess); + ret = func(index, input, sess); } - return CSINN_TRUE; + + SHL_TRACE_CALL(shl_trace_duration_end(sess->trace, __func__, SHL_TRACE_EVENT_RUNTIME, NULL)); + + return ret; } /** * @} @@ -293,13 +355,19 @@ int csinn_set_input(int index, struct csinn_tensor *input, struct csinn_session */ int csinn_get_output(int index, struct csinn_tensor *output, struct csinn_session *sess) { + SHL_TRACE_CALL(shl_trace_duration_begin(sess->trace, __func__, SHL_TRACE_EVENT_RUNTIME, NULL)); + + int ret = CSINN_TRUE; csinn_tensor_copy(output, sess->output[index]); int (*func)(); func = shl_get_runtime_callback(sess, CSINN_GET_OUTPUT); if (func != NULL) { - return func(index, output, sess); + ret = func(index, output, sess); } - return CSINN_TRUE; + + SHL_TRACE_CALL(shl_trace_duration_end(sess->trace, __func__, SHL_TRACE_EVENT_RUNTIME, NULL)); + + return ret; } /** * @} @@ -311,13 +379,19 @@ int csinn_get_output(int index, struct csinn_tensor *output, struct csinn_sessio */ int 
csinn_get_input(int index, struct csinn_tensor *input, struct csinn_session *sess) { + SHL_TRACE_CALL(shl_trace_duration_begin(sess->trace, __func__, SHL_TRACE_EVENT_RUNTIME, NULL)); + + int ret = CSINN_TRUE; csinn_tensor_copy(input, sess->input[index]); int (*func)(); func = shl_get_runtime_callback(sess, CSINN_GET_INPUT); if (func != NULL) { - return func(index, input, sess); + ret = func(index, input, sess); } - return CSINN_TRUE; + + SHL_TRACE_CALL(shl_trace_duration_end(sess->trace, __func__, SHL_TRACE_EVENT_RUNTIME, NULL)); + + return ret; } /** * @} @@ -329,6 +403,9 @@ int csinn_get_input(int index, struct csinn_tensor *input, struct csinn_session */ int csinn_update_input(int index, struct csinn_tensor *input, struct csinn_session *sess) { + SHL_TRACE_CALL(shl_trace_duration_begin(sess->trace, __func__, SHL_TRACE_EVENT_RUNTIME, NULL)); + + int ret = CSINN_TRUE; sess->input[index]->data = input->data; if (sess->dynamic_shape) { memcpy(sess->input[index]->dim, input->dim, sizeof(int32_t) * MAX_DIM); @@ -337,7 +414,6 @@ int csinn_update_input(int index, struct csinn_tensor *input, struct csinn_sessi int (*func)(); func = shl_get_runtime_callback(sess, CSINN_UPDATE_INPUT); if (func != NULL) { - int ret = CSINN_FALSE; if (sess->profiler_level == CSINN_PROFILER_LEVEL_TIMER) { uint64_t start = shl_get_timespec(); ret = func(index, input, sess); @@ -346,9 +422,11 @@ int csinn_update_input(int index, struct csinn_tensor *input, struct csinn_sessi } else { ret = func(index, input, sess); } - return ret; } - return CSINN_TRUE; + + SHL_TRACE_CALL(shl_trace_duration_end(sess->trace, __func__, SHL_TRACE_EVENT_RUNTIME, NULL)); + + return ret; } /** * @} @@ -360,13 +438,19 @@ int csinn_update_input(int index, struct csinn_tensor *input, struct csinn_sessi */ int csinn_update_output(int index, struct csinn_tensor *output, struct csinn_session *sess) { + SHL_TRACE_CALL(shl_trace_duration_begin(sess->trace, __func__, SHL_TRACE_EVENT_RUNTIME, NULL)); + + int ret = CSINN_TRUE; 
sess->output[index]->data = output->data; int (*func)(); func = shl_get_runtime_callback(sess, CSINN_UPDATE_OUTPUT); if (func != NULL) { - return func(index, output, sess); + ret = func(index, output, sess); } - return CSINN_TRUE; + + SHL_TRACE_CALL(shl_trace_duration_end(sess->trace, __func__, SHL_TRACE_EVENT_RUNTIME, NULL)); + + return ret; } /** * @} @@ -378,10 +462,12 @@ int csinn_update_output(int index, struct csinn_tensor *output, struct csinn_ses */ int csinn_session_setup(struct csinn_session *sess) { + SHL_TRACE_CALL(shl_trace_duration_begin(sess->trace, __func__, SHL_TRACE_EVENT_RUNTIME, NULL)); + + int ret = CSINN_FALSE; int (*func)(); func = shl_get_runtime_callback(sess, CSINN_SESSION_SETUP); if (func != NULL) { - int ret = CSINN_FALSE; if (sess->profiler_level == CSINN_PROFILER_LEVEL_TIMER) { uint64_t start = shl_get_timespec(); ret = func(sess); @@ -390,9 +476,11 @@ int csinn_session_setup(struct csinn_session *sess) } else { ret = func(sess); } - return ret; } - return CSINN_FALSE; + + SHL_TRACE_CALL(shl_trace_duration_end(sess->trace, __func__, SHL_TRACE_EVENT_RUNTIME, NULL)); + + return ret; } /** * @} @@ -404,10 +492,12 @@ int csinn_session_setup(struct csinn_session *sess) */ int csinn_session_run(struct csinn_session *sess) { + SHL_TRACE_CALL(shl_trace_duration_begin(sess->trace, __func__, SHL_TRACE_EVENT_RUNTIME, NULL)); + + int ret = CSINN_FALSE; int (*func)(); func = shl_get_runtime_callback(sess, CSINN_SESSION_RUN); if (func != NULL) { - int ret = CSINN_FALSE; if (sess->profiler_level == CSINN_PROFILER_LEVEL_TIMER) { uint64_t start = shl_get_timespec(); ret = func(sess); @@ -416,9 +506,11 @@ int csinn_session_run(struct csinn_session *sess) } else { ret = func(sess); } - return ret; } - return CSINN_FALSE; + + SHL_TRACE_CALL(shl_trace_duration_end(sess->trace, __func__, SHL_TRACE_EVENT_RUNTIME, NULL)); + + return ret; } /** * @} @@ -430,12 +522,18 @@ int csinn_session_run(struct csinn_session *sess) */ int csinn_set_tensor_entry(struct 
csinn_tensor *t, struct csinn_session *sess) { + SHL_TRACE_CALL(shl_trace_duration_begin(sess->trace, __func__, SHL_TRACE_EVENT_RUNTIME, NULL)); + + int ret = CSINN_FALSE; int (*func)(); func = shl_get_runtime_callback(sess, CSINN_TENSOR_ENTRY); if (func != NULL) { - return func(t, sess); + ret = func(t, sess); } - return CSINN_FALSE; + + SHL_TRACE_CALL(shl_trace_duration_end(sess->trace, __func__, SHL_TRACE_EVENT_RUNTIME, NULL)); + + return ret; } /** * @} @@ -447,10 +545,12 @@ int csinn_set_tensor_entry(struct csinn_tensor *t, struct csinn_session *sess) */ int csinn_load_binary_model(struct csinn_session *sess) { + SHL_TRACE_CALL(shl_trace_duration_begin(sess->trace, __func__, SHL_TRACE_EVENT_RUNTIME, NULL)); + + int ret = CSINN_FALSE; int (*func)(); func = shl_get_runtime_callback(sess, CSINN_LOAD_BG); if (func != NULL) { - int ret = CSINN_FALSE; if (sess->profiler_level == CSINN_PROFILER_LEVEL_TIMER) { uint64_t start = shl_get_timespec(); ret = func(sess); @@ -459,9 +559,11 @@ int csinn_load_binary_model(struct csinn_session *sess) } else { ret = func(sess); } - return ret; } - return CSINN_FALSE; + + SHL_TRACE_CALL(shl_trace_duration_end(sess->trace, __func__, SHL_TRACE_EVENT_RUNTIME, NULL)); + + return ret; } /** * @} diff --git a/source/nn2/utils.c b/source/nn2/utils.c index 2e53b81c..3231ccbf 100644 --- a/source/nn2/utils.c +++ b/source/nn2/utils.c @@ -21,6 +21,90 @@ #include "csi_nn.h" #include "shl_utils.h" +static struct csinn_enum_map csinn_dtype_map[] = { + {CSINN_DTYPE_BOOL, "CSINN_DTYPE_BOOL"}, {CSINN_DTYPE_INT4, "CSINN_DTYPE_INT4"}, + {CSINN_DTYPE_UINT8, "CSINN_DTYPE_UINT8"}, {CSINN_DTYPE_INT8, "CSINN_DTYPE_INT8"}, + {CSINN_DTYPE_UINT16, "CSINN_DTYPE_UINT16"}, {CSINN_DTYPE_INT16, "CSINN_DTYPE_INT16"}, + {CSINN_DTYPE_UINT32, "CSINN_DTYPE_UINT32"}, {CSINN_DTYPE_INT32, "CSINN_DTYPE_INT32"}, + {CSINN_DTYPE_FLOAT16, "CSINN_DTYPE_FLOAT16"}, {CSINN_DTYPE_BFLOAT16, "CSINN_DTYPE_BFLOAT16"}, + {CSINN_DTYPE_FLOAT32, "CSINN_DTYPE_FLOAT32"}, 
{CSINN_DTYPE_FLOAT64, "CSINN_DTYPE_FLOAT64"}, + {CSINN_DTYPE_INT64, "CSINN_DTYPE_INT64"}, +}; + +static struct csinn_enum_map csinn_quant_map[] = { + {CSINN_QUANT_UNSET, "CSINN_QUANT_UNSET"}, + {CSINN_QUANT_INT4_SYM, "CSINN_QUANT_INT4_SYM"}, + {CSINN_QUANT_UINT8_ASYM, "CSINN_QUANT_UINT8_ASYM"}, + {CSINN_QUANT_UINT8_SYM, "CSINN_QUANT_UINT8_SYM"}, + {CSINN_QUANT_INT8_ASYM, "CSINN_QUANT_INT8_ASYM"}, + {CSINN_QUANT_INT8_SYM, "CSINN_QUANT_INT8_SYM"}, + {CSINN_QUANT_INT16_SYM, "CSINN_QUANT_INT16_SYM"}, + {CSINN_QUANT_FLOAT16, "CSINN_QUANT_FLOAT16"}, + {CSINN_QUANT_BFLOAT16, "CSINN_QUANT_BFLOAT16"}, + {CSINN_QUANT_FLOAT32, "CSINN_QUANT_FLOAT32"}, + {CSINN_QUANT_INT4_ASYM_W_SYM, "CSINN_QUANT_INT4_ASYM_W_SYM"}, + {CSINN_QUANT_INT8_ASYM_W_SYM, "CSINN_QUANT_INT8_ASYM_W_SYM"}, + {CSINN_QUANT_FLOAT16_W_INT8, "CSINN_QUANT_FLOAT16_W_INT8"}, + {CSINN_QUANT_BLOCK_Q2_K, "CSINN_QUANT_BLOCK_Q2_K"}, + {CSINN_QUANT_BLOCK_Q4_0, "CSINN_QUANT_BLOCK_Q4_0"}, + {CSINN_QUANT_BLOCK_Q8_0, "CSINN_QUANT_BLOCK_Q8_0"}, +}; + +static struct csinn_enum_map csinn_api_map[] = { + {CSINN_REF, "CSINN_REF"}, {CSINN_GREF, "CSINN_GREF"}, + {CSINN_C860, "CSINN_C860"}, {CSINN_C906, "CSINN_C906"}, + {CSINN_C920, "CSINN_C920"}, {CSINN_ANOLE, "CSINN_ANOLE"}, + {CSINN_CH8601, "CSINN_CH8601"}, {CSINN_TH1520, "CSINN_TH1520"}, + {CSINN_DP1K, "CSINN_DP1K"}, {CSINN_I805, "CSINN_I805"}, + {CSINN_E804, "CSINN_E804"}, {CSINN_REF_I805, "CSINN_REF_I805"}, + {CSINN_C908, "CSINN_C908"}, {CSINN_TVMGEN, "CSINN_TVMGEN"}, + {CSINN_ASP, "CSINN_ASP"}, {CSINN_RVV, "CSINN_RVV"}, + {CSINN_RVM, "CSINN_RVM"}, {CSINN_E907, "CSINN_E907"}, + {CSINN_C920V2, "CSINN_C920V2"}, +}; + +static struct csinn_enum_map csinn_rmod_map[] = { + {CSINN_RM_LAYER, "CSINN_RM_LAYER"}, + {CSINN_RM_CPU_GRAPH, "CSINN_RM_CPU_GRAPH"}, + {CSINN_RM_NPU_GRAPH, "CSINN_RM_NPU_GRAPH"}, + {CSINN_RM_CPU_BASE_HYBRID, "CSINN_RM_CPU_BASE_HYBRID"}, +}; + +static struct csinn_enum_map csinn_layout_map[] = { + {CSINN_LAYOUT_NULL, "CSINN_LAYOUT_NULL"}, + {CSINN_LAYOUT_N, 
"CSINN_LAYOUT_N"}, + {CSINN_LAYOUT_NC, "CSINN_LAYOUT_NC"}, + {CSINN_LAYOUT_NCW, "CSINN_LAYOUT_NCW"}, + {CSINN_LAYOUT_NCHW, "CSINN_LAYOUT_NCHW"}, + {CSINN_LAYOUT_NCDHW, "CSINN_LAYOUT_NCDHW"}, + {CSINN_LAYOUT_O, "CSINN_LAYOUT_O"}, + {CSINN_LAYOUT_OI, "CSINN_LAYOUT_OI"}, + {CSINN_LAYOUT_O16I16, "CSINN_LAYOUT_O16I16"}, + {CSINN_LAYOUT_O32I32, "CSINN_LAYOUT_O32I32"}, + {CSINN_LAYOUT_OIW, "CSINN_LAYOUT_OIW"}, + {CSINN_LAYOUT_OIHW, "CSINN_LAYOUT_OIHW"}, + {CSINN_LAYOUT_OIDHW, "CSINN_LAYOUT_OIDHW"}, + {CSINN_LAYOUT_O1HW, "CSINN_LAYOUT_O1HW"}, + {CSINN_LAYOUT_NWC, "CSINN_LAYOUT_NWC"}, + {CSINN_LAYOUT_NHWC, "CSINN_LAYOUT_NHWC"}, + {CSINN_LAYOUT_NDHWC, "CSINN_LAYOUT_NDHWC"}, + {CSINN_LAYOUT_OWI, "CSINN_LAYOUT_OWI"}, + {CSINN_LAYOUT_OHWI, "CSINN_LAYOUT_OHWI"}, + {CSINN_LAYOUT_O16HWI16, "CSINN_LAYOUT_O16HWI16"}, + {CSINN_LAYOUT_O32HWI32, "CSINN_LAYOUT_O32HWI32"}, + {CSINN_LAYOUT_ODHWI, "CSINN_LAYOUT_ODHWI"}, + {CSINN_LAYOUT_1HWO, "CSINN_LAYOUT_1HWO"}, + {CSINN_LAYOUT_1HW16O16, "CSINN_LAYOUT_1HW16O16"}, + {CSINN_LAYOUT_1HW32O32, "CSINN_LAYOUT_1HW32O32"}, + {CSINN_LAYOUT_O1HWIO0, "CSINN_LAYOUT_O1HWIO0"}, + {CSINN_LAYOUT_NC1C0, "CSINN_LAYOUT_NC1C0"}, + {CSINN_LAYOUT_NC1WC0, "CSINN_LAYOUT_NC1WC0"}, + {CSINN_LAYOUT_NC1HWC0, "CSINN_LAYOUT_NC1HWC0"}, + {CSINN_LAYOUT_NC1DHWC0, "CSINN_LAYOUT_NC1DHWC0"}, + {CSINN_LAYOUT_NLCDHW, "CSINN_LAYOUT_NLCDHW"}, + {CSINN_LAYOUT_IOHW, "CSINN_LAYOUT_IOHW"}, +}; + /* https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/kernels/internal/quantization_util.cc */ static int64_t integer_from_exp(double input, int32_t *shift) @@ -2013,7 +2097,6 @@ static int block_dequantize_q4(struct csinn_tensor *dst, struct csinn_tensor *sr dst_data[output_index] = ((float)(value & 0xf) - 8) * fp32_scale; dst_data[output_index + block_size / 2] = ((float)((value & 0xf0) >> 4) - 8) * fp32_scale; - ; } } @@ -2044,6 +2127,78 @@ static int block_dequantize_q8(struct csinn_tensor *dst, struct csinn_tensor *sr return CSINN_TRUE; } +static int 
block_quantize_q4(struct csinn_tensor *dst, struct csinn_tensor *src) +{ + if (dst->dtype != CSINN_DTYPE_INT4 || src->dtype != CSINN_DTYPE_FLOAT32) { + shl_debug_error("%s: unsupported convert dtype\n", __func__); + return CSINN_FALSE; + } + + float *src_data = src->data; + int8_t *dst_data = dst->data; + int16_t *scale_data = dst->data + csinn_tensor_size(dst) / 2; + int block_size = 32; + int block_num = csinn_tensor_size(src) / block_size; + + for (int i = 0; i < block_num; i++) { + float max_value = 0.0f; + float abs_max_value = 0.0f; + + for (int j = 0; j < block_size; j++) { + float value = src_data[i * block_size + j]; + if (abs_max_value < fabsf(value)) { + abs_max_value = fabsf(value); + max_value = value; + } + } + + float fp32_scale = max_value / -8.0f; + float id = fp32_scale ? 1.0f / fp32_scale : 0.0f; + scale_data[i] = float32_to_float16_base(fp32_scale); + + for (int j = 0; j < block_size / 2; ++j) { + float value0 = src_data[i * block_size + j]; + float value1 = src_data[i * block_size + block_size / 2 + j]; + uint8_t q4_value0 = fminf((int8_t)(value0 * id + 8.5f), 15); + uint8_t q4_value1 = fminf((int8_t)(value1 * id + 8.5f), 15); + dst_data[i * block_size / 2 + j] = q4_value0 | (q4_value1 << 4); + } + } +} + +static int block_quantize_q8(struct csinn_tensor *dst, struct csinn_tensor *src) +{ + if (dst->dtype != CSINN_DTYPE_INT8 || src->dtype != CSINN_DTYPE_FLOAT32) { + shl_debug_error("%s: unsupported convert dtype\n", __func__); + return CSINN_FALSE; + } + + float *src_data = src->data; + int8_t *dst_data = dst->data; + int16_t *scale_data = dst->data + csinn_tensor_size(dst); + int block_size = 32; + int block_num = csinn_tensor_size(src) / block_size; + + for (int i = 0; i < block_num; i++) { + float max_value = 0.0f; + + for (int j = 0; j < block_size; j++) { + float value = src_data[i * block_size + j]; + max_value = fmaxf(max_value, fabsf(value)); + } + + float fp32_scale = max_value / ((1 << 7) - 1); + float id = fp32_scale ? 
1.0f / fp32_scale : 0.0f; + scale_data[i] = float32_to_float16_base(fp32_scale); + + for (int j = 0; j < block_size; j++) { + float value = src_data[i * block_size + j]; + const float q8_value = value * id; + dst_data[i * block_size + j] = roundf(q8_value); + } + } +} + /** * @addtogroup TENSOR * @{ @@ -2054,6 +2209,10 @@ int csinn_tensor_data_convert(struct csinn_tensor *dest, struct csinn_tensor *sr return block_dequantize_q8(dest, src); } else if (src->mtype == CSINN_MEM_TYPE_BLOCK_Q4_0) { return block_dequantize_q4(dest, src); + } else if (dest->mtype == CSINN_MEM_TYPE_BLOCK_Q8_0) { + return block_quantize_q8(dest, src); + } else if (dest->mtype == CSINN_MEM_TYPE_BLOCK_Q4_0) { + return block_quantize_q4(dest, src); } if (dest->layout == src->layout && dest->dtype == src->dtype) { @@ -2222,3 +2381,68 @@ int csinn_version(char *vstr) } return (major << (VERSION_SHIFT * 2)) | (minor << VERSION_SHIFT) | patch; } + +char *shl_find_function_name(struct shl_function_map *fmap, void *func) +{ + char *res = NULL; + if (fmap == NULL || func == NULL) { + return res; + } + int idx = 0; + struct shl_function_map curr = fmap[idx]; + while (curr.func != NULL && curr.name != NULL) { + if (func == curr.func) { + res = curr.name; + break; + } + idx++; + curr = fmap[idx]; + } + return res; +} + +char *shl_find_enum_name(struct csinn_enum_map *map, int map_len, int type) +{ + char *res = NULL; + if (!map || map_len <= 0) { + return res; + } + for (int i = 0; i < map_len; i++) { + if (map[i].type == type) { + res = map[i].name; + break; + } + } + + return res; +} + +char *shl_find_dtype_name(enum csinn_dtype_enum type) +{ + return shl_find_enum_name(csinn_dtype_map, + sizeof(csinn_dtype_map) / sizeof(struct csinn_enum_map), type); +} + +char *shl_find_quant_name(enum csinn_quant_enum type) +{ + return shl_find_enum_name(csinn_quant_map, + sizeof(csinn_quant_map) / sizeof(struct csinn_enum_map), type); +} + +char *shl_find_api_name(enum csinn_api_enum type) +{ + return 
shl_find_enum_name(csinn_api_map, sizeof(csinn_api_map) / sizeof(struct csinn_enum_map), + type); +} + +char *shl_find_rmod_name(enum csinn_rmode_enum type) +{ + return shl_find_enum_name(csinn_rmod_map, + sizeof(csinn_rmod_map) / sizeof(struct csinn_enum_map), type); +} + +char *shl_find_layout_name(enum csinn_layout_enum type) +{ + return shl_find_enum_name(csinn_layout_map, + sizeof(csinn_layout_map) / sizeof(struct csinn_enum_map), type); +} diff --git a/source/reference/performance.c b/source/reference/performance.c new file mode 100644 index 00000000..c3eef328 --- /dev/null +++ b/source/reference/performance.c @@ -0,0 +1,1826 @@ +/* + * Copyright (C) 2016-2023 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "reference/perf.h" +#include "reference/ref.h" + +static struct shl_function_map shl_ref_kernel_map[] = { + {shl_ref_abs_f32, "shl_ref_abs_f32"}, + {shl_ref_abs_quant, "shl_ref_abs_quant"}, + {shl_ref_acos_f32, "shl_ref_acos_f32"}, + {shl_ref_acos_quant, "shl_ref_acos_quant"}, + {shl_ref_acosh_f32, "shl_ref_acosh_f32"}, + {shl_ref_acosh_quant, "shl_ref_acosh_quant"}, + {shl_ref_add_f32, "shl_ref_add_f32"}, + // {shl_ref_add_u8, "shl_ref_add_u8"}, + {shl_ref_add_f32, "shl_ref_add_f32"}, + {shl_ref_add_quant, "shl_ref_add_quant"}, + {shl_ref_and_u32, "shl_ref_and_u32"}, + {shl_ref_and_u8, "shl_ref_and_u8"}, + {shl_ref_and_i8, "shl_ref_and_i8"}, + {shl_ref_arange_f32, "shl_ref_arange_f32"}, + {shl_ref_arange_quant, "shl_ref_arange_quant"}, + {shl_ref_argmax_stride_i32_f32, "shl_ref_argmax_stride_i32_f32"}, + {shl_ref_argmax_stride_quant, "shl_ref_argmax_stride_quant"}, + {shl_ref_argmin_stride_i32_f32, "shl_ref_argmin_stride_i32_f32"}, + {shl_ref_argmin_stride_quant, "shl_ref_argmin_stride_quant"}, + {shl_ref_asin_f32, "shl_ref_asin_f32"}, + {shl_ref_asin_quant, "shl_ref_asin_quant"}, + {shl_ref_asinh_f32, "shl_ref_asinh_f32"}, + {shl_ref_asinh_quant, "shl_ref_asinh_quant"}, + {shl_ref_atan_f32, "shl_ref_atan_f32"}, + {shl_ref_atan_quant, "shl_ref_atan_quant"}, + {shl_ref_atanh_f32, "shl_ref_atanh_f32"}, + {shl_ref_atanh_quant, "shl_ref_atanh_quant"}, + {shl_ref_avgpool2d_f32, "shl_ref_avgpool2d_f32"}, + {shl_ref_avgpool2d_quant, "shl_ref_avgpool2d_quant"}, + {shl_ref_avgpool3d_f32, "shl_ref_avgpool3d_f32"}, + {shl_ref_avgpool3d_quant, "shl_ref_avgpool3d_quant"}, + {shl_ref_batch_normalization_f32, "shl_ref_batch_normalization_f32"}, + {shl_ref_batch_normalization_quant, "shl_ref_batch_normalization_quant"}, + {shl_ref_batch_to_space_f32, "shl_ref_batch_to_space_f32"}, + {shl_ref_batch_to_space_quant, "shl_ref_batch_to_space_quant"}, + {shl_ref_broadcast_to_f32, "shl_ref_broadcast_to_f32"}, + {shl_ref_broadcast_to_quant, "shl_ref_broadcast_to_quant"}, 
+ {shl_ref_ceil_f32, "shl_ref_ceil_f32"}, + {shl_ref_ceil_quant, "shl_ref_ceil_quant"}, + {shl_ref_clip_f32, "shl_ref_clip_f32"}, + {shl_ref_clip_quant, "shl_ref_clip_quant"}, + {shl_ref_col2im_f32, "shl_ref_col2im_f32"}, + {shl_ref_concat_f32, "shl_ref_concat_f32"}, + {shl_ref_concat_quant, "shl_ref_concat_quant"}, + {shl_ref_conv1d_f32, "shl_ref_conv1d_f32"}, + {shl_ref_conv1d_quant, "shl_ref_conv1d_quant"}, + {shl_ref_conv2d_f32, "shl_ref_conv2d_f32"}, + {shl_ref_conv2d_quant, "shl_ref_conv2d_quant"}, + {shl_ref_conv2d_channel_quant, "shl_ref_conv2d_channel_quant"}, + {shl_ref_conv2d_relu_f32, "shl_ref_conv2d_relu_f32"}, + {shl_ref_conv2d_relu_quant, "shl_ref_conv2d_relu_quant"}, + {shl_ref_cache_matmul_f32, "shl_ref_cache_matmul_f32"}, + {shl_ref_cache_matmul_quant, "shl_ref_cache_matmul_quant"}, + {shl_ref_cache_conv1d_f32, "shl_ref_cache_conv1d_f32"}, + {shl_ref_cache_conv1d_quant, "shl_ref_cache_conv1d_quant"}, + {shl_ref_conv2d_channel_relu_quant, "shl_ref_conv2d_channel_relu_quant"}, + {shl_ref_conv2d_relu6_quant, "shl_ref_conv2d_relu6_quant"}, + {shl_ref_conv2d_channel_relu6_quant, "shl_ref_conv2d_channel_relu6_quant"}, + {shl_ref_depthwise_conv2d_f32, "shl_ref_depthwise_conv2d_f32"}, + {shl_ref_depthwise_conv2d_quant, "shl_ref_depthwise_conv2d_quant"}, + {shl_ref_depthwise_conv2d_channel_quant, "shl_ref_depthwise_conv2d_channel_quant"}, + {shl_ref_depthwise_conv2d_relu_f32, "shl_ref_depthwise_conv2d_relu_f32"}, + {shl_ref_depthwise_conv2d_relu_quant, "shl_ref_depthwise_conv2d_relu_quant"}, + {shl_ref_depthwise_conv2d_channel_relu_quant, "shl_ref_depthwise_conv2d_channel_relu_quant"}, + {shl_ref_depthwise_conv2d_relu6_quant, "shl_ref_depthwise_conv2d_relu6_quant"}, + {shl_ref_depthwise_conv2d_channel_relu6_quant, "shl_ref_depthwise_conv2d_channel_relu6_quant"}, + {shl_ref_group_conv2d_f32, "shl_ref_group_conv2d_f32"}, + {shl_ref_group_conv2d_quant, "shl_ref_group_conv2d_quant"}, + {shl_ref_group_conv2d_channel_quant, "shl_ref_group_conv2d_channel_quant"}, 
+ {shl_ref_group_conv2d_relu_quant, "shl_ref_group_conv2d_relu_quant"}, + {shl_ref_group_conv2d_relu6_quant, "shl_ref_group_conv2d_relu6_quant"}, + {shl_ref_group_conv2d_channel_relu_quant, "shl_ref_group_conv2d_channel_relu_quant"}, + {shl_ref_conv3d_f32, "shl_ref_conv3d_f32"}, + {shl_ref_conv3d_quant, "shl_ref_conv3d_quant"}, + {shl_ref_cos_f32, "shl_ref_cos_f32"}, + {shl_ref_cos_quant, "shl_ref_cos_quant"}, + {shl_ref_cosh_f32, "shl_ref_cosh_f32"}, + {shl_ref_cosh_quant, "shl_ref_cosh_quant"}, + {shl_ref_cumprod_f32, "shl_ref_cumprod_f32"}, + {shl_ref_cumprod_quant, "shl_ref_cumprod_quant"}, + {shl_ref_cumsum_f32, "shl_ref_cumsum_f32"}, + {shl_ref_cumsum_quant, "shl_ref_cumsum_quant"}, + {shl_ref_data_convert_f32, "shl_ref_data_convert_f32"}, + {shl_ref_data_convert_quant, "shl_ref_data_convert_quant"}, + {shl_ref_deconv2d_f32, "shl_ref_deconv2d_f32"}, + {shl_ref_deconv2d_quant, "shl_ref_deconv2d_quant"}, + {shl_ref_depthwise_deconv2d_f32, "shl_ref_depthwise_deconv2d_f32"}, + {shl_ref_depthwise_deconv2d_quant, "shl_ref_depthwise_deconv2d_quant"}, + {shl_ref_group_deconv2d_f32, "shl_ref_group_deconv2d_f32"}, + {shl_ref_group_deconv2d_quant, "shl_ref_group_deconv2d_quant"}, + {shl_ref_deconv3d_f32, "shl_ref_deconv3d_f32"}, + {shl_ref_deconv3d_quant, "shl_ref_deconv3d_quant"}, + {shl_ref_depth_to_space_f32, "shl_ref_depth_to_space_f32"}, + {shl_ref_depth_to_space_quant, "shl_ref_depth_to_space_quant"}, + {shl_ref_div_f32, "shl_ref_div_f32"}, + {shl_ref_div_quant, "shl_ref_div_quant"}, + {shl_ref_elu_f32, "shl_ref_elu_f32"}, + {shl_ref_elu_quant, "shl_ref_elu_quant"}, + {shl_ref_fsmn_f32, "shl_ref_fsmn_f32"}, + {shl_ref_fsmn_quant, "shl_ref_fsmn_quant"}, + {shl_ref_equal_f32, "shl_ref_equal_f32"}, + {shl_ref_equal_quant, "shl_ref_equal_quant"}, + {shl_ref_erf_f32, "shl_ref_erf_f32"}, + {shl_ref_erf_quant, "shl_ref_erf_quant"}, + {shl_ref_exp_f32, "shl_ref_exp_f32"}, + {shl_ref_exp_quant, "shl_ref_exp_quant"}, + {shl_ref_expand_dims_f32, "shl_ref_expand_dims_f32"}, + 
{shl_ref_expand_dims_quant, "shl_ref_expand_dims_quant"}, + {shl_ref_expm1_f32, "shl_ref_expm1_f32"}, + {shl_ref_expm1_quant, "shl_ref_expm1_quant"}, + {shl_ref_flatten, "shl_ref_flatten"}, + {shl_ref_flatten_quant, "shl_ref_flatten_quant"}, + {shl_ref_floor_divide_f32, "shl_ref_floor_divide_f32"}, + {shl_ref_floor_divide_quant, "shl_ref_floor_divide_quant"}, + {shl_ref_floor_mod_f32, "shl_ref_floor_mod_f32"}, + {shl_ref_floor_mod_quant, "shl_ref_floor_mod_quant"}, + {shl_ref_floor_f32, "shl_ref_floor_f32"}, + {shl_ref_floor_quant, "shl_ref_floor_quant"}, + {shl_ref_fullyconnected_f32, "shl_ref_fullyconnected_f32"}, + {shl_ref_fullyconnected_quant, "shl_ref_fullyconnected_quant"}, + {shl_ref_gather_nd_f32, "shl_ref_gather_nd_f32"}, + {shl_ref_gather_nd_quant, "shl_ref_gather_nd_quant"}, + {shl_ref_gather_f32, "shl_ref_gather_f32"}, +#if __riscv + {shl_ref_gather_f16, "shl_ref_gather_f16"}, +#endif + {shl_ref_gather_int8, "shl_ref_gather_int8"}, + {shl_ref_gather_quant, "shl_ref_gather_quant"}, + {shl_ref_global_avgpool2d_f32, "shl_ref_global_avgpool2d_f32"}, + {shl_ref_global_avgpool2d_quant, "shl_ref_global_avgpool2d_quant"}, + {shl_ref_global_maxpool2d_f32, "shl_ref_global_maxpool2d_f32"}, + {shl_ref_global_maxpool2d_quant, "shl_ref_global_maxpool2d_quant"}, + {shl_ref_greater_equal_f32, "shl_ref_greater_equal_f32"}, + {shl_ref_greater_equal_quant, "shl_ref_greater_equal_quant"}, + {shl_ref_greater_f32, "shl_ref_greater_f32"}, + {shl_ref_greater_quant, "shl_ref_greater_quant"}, + {shl_ref_hard_sigmoid_f32, "shl_ref_hard_sigmoid_f32"}, + {shl_ref_hard_sigmoid_quant, "shl_ref_hard_sigmoid_quant"}, + {shl_ref_im2col_f32, "shl_ref_im2col_f32"}, + {shl_ref_im2col_quant, "shl_ref_im2col_quant"}, + {shl_ref_isnan_bool_f32, "shl_ref_isnan_bool_f32"}, + {shl_ref_l2_normalization_f32, "shl_ref_l2_normalization_f32"}, + {shl_ref_l2_normalization_quant, "shl_ref_l2_normalization_quant"}, + {shl_ref_l2pool_f32, "shl_ref_l2pool_f32"}, + {shl_ref_layer_norm_f32, 
"shl_ref_layer_norm_f32"}, + {shl_ref_layer_norm_quant, "shl_ref_layer_norm_quant"}, + {shl_ref_leaky_relu_f32, "shl_ref_leaky_relu_f32"}, + {shl_ref_leaky_relu_quant, "shl_ref_leaky_relu_quant"}, + {shl_ref_less_equal_f32, "shl_ref_less_equal_f32"}, + {shl_ref_less_equal_quant, "shl_ref_less_equal_quant"}, + {shl_ref_less_f32, "shl_ref_less_f32"}, + {shl_ref_less_quant, "shl_ref_less_quant"}, + {shl_ref_log_softmax_f32, "shl_ref_log_softmax_f32"}, + {shl_ref_log_softmax_quant, "shl_ref_log_softmax_quant"}, + {shl_ref_log_f32, "shl_ref_log_f32"}, + {shl_ref_log_quant, "shl_ref_log_quant"}, + {shl_ref_log1p_f32, "shl_ref_log1p_f32"}, + {shl_ref_log1p_quant, "shl_ref_log1p_quant"}, + {shl_ref_logical_and_f32, "shl_ref_logical_and_f32"}, + {shl_ref_logical_and_quant, "shl_ref_logical_and_quant"}, + {shl_ref_logical_not_f32, "shl_ref_logical_not_f32"}, + {shl_ref_logical_not_quant, "shl_ref_logical_not_quant"}, + {shl_ref_logical_or_f32, "shl_ref_logical_or_f32"}, + {shl_ref_logical_or_quant, "shl_ref_logical_or_quant"}, + {shl_ref_logical_xor_f32, "shl_ref_logical_xor_f32"}, + {shl_ref_logical_xor_quant, "shl_ref_logical_xor_quant"}, + {shl_ref_lrn_f32, "shl_ref_lrn_f32"}, + {shl_ref_lrn_quant, "shl_ref_lrn_quant"}, + {shl_ref_matmul_f32, "shl_ref_matmul_f32"}, + {shl_ref_matmul_quant, "shl_ref_matmul_quant"}, + {shl_ref_max_stride_f32, "shl_ref_max_stride_f32"}, + {shl_ref_max_stride_quant, "shl_ref_max_stride_quant"}, + {shl_ref_maximum_f32, "shl_ref_maximum_f32"}, + {shl_ref_maximum_quant, "shl_ref_maximum_quant"}, + {shl_ref_maxpool2d_f32, "shl_ref_maxpool2d_f32"}, + {shl_ref_maxpool2d_quant, "shl_ref_maxpool2d_quant"}, + {shl_ref_maxpool2d_locat_f32, "shl_ref_maxpool2d_locat_f32"}, + {shl_ref_maxpool2d_locat_quant, "shl_ref_maxpool2d_locat_quant"}, + {shl_ref_maxpool3d_f32, "shl_ref_maxpool3d_f32"}, + {shl_ref_maxpool3d_quant, "shl_ref_maxpool3d_quant"}, + {shl_ref_mean_stride_f32, "shl_ref_mean_stride_f32"}, + {shl_ref_mean_stride_quant, 
"shl_ref_mean_stride_quant"}, + {shl_ref_mean_quant, "shl_ref_mean_quant"}, + {shl_ref_min_stride_f32, "shl_ref_min_stride_f32"}, + {shl_ref_min_stride_quant, "shl_ref_min_stride_quant"}, + {shl_ref_minimum_f32, "shl_ref_minimum_f32"}, + {shl_ref_minimum_quant, "shl_ref_minimum_quant"}, + {shl_ref_mod_f32, "shl_ref_mod_f32"}, + {shl_ref_mod_quant, "shl_ref_mod_quant"}, + {shl_ref_mul_f32, "shl_ref_mul_f32"}, + {shl_ref_mul_quant, "shl_ref_mul_quant"}, + {shl_ref_ndarray_size_f32, "shl_ref_ndarray_size_f32"}, + {shl_ref_ndarray_size_u8, "shl_ref_ndarray_size_u8"}, + {shl_ref_ndarray_size_i8, "shl_ref_ndarray_size_i8"}, + {shl_ref_ndarray_size_i32, "shl_ref_ndarray_size_i32"}, + {shl_ref_negative_f32, "shl_ref_negative_f32"}, + {shl_ref_negative_quant, "shl_ref_negative_quant"}, + {shl_ref_non_max_suppression_std, "shl_ref_non_max_suppression_std"}, + {shl_ref_not_equal_f32, "shl_ref_not_equal_f32"}, + {shl_ref_not_equal_quant, "shl_ref_not_equal_quant"}, + {shl_ref_not_u32, "shl_ref_not_u32"}, + {shl_ref_not_u8, "shl_ref_not_u8"}, + {shl_ref_not_i8, "shl_ref_not_i8"}, + {shl_ref_or_u32, "shl_ref_or_u32"}, + {shl_ref_or_u8, "shl_ref_or_u8"}, + {shl_ref_or_i8, "shl_ref_or_i8"}, + {shl_ref_pad_f32, "shl_ref_pad_f32"}, + {shl_ref_pad_quant, "shl_ref_pad_quant"}, + {shl_ref_power_f32, "shl_ref_power_f32"}, + {shl_ref_power_quant, "shl_ref_power_quant"}, + {shl_ref_prelu_f32, "shl_ref_prelu_f32"}, + {shl_ref_prelu_quant, "shl_ref_prelu_quant"}, + {shl_ref_prod_stride_f32, "shl_ref_prod_stride_f32"}, + {shl_ref_prod_stride_quant, "shl_ref_prod_stride_quant"}, + {shl_ref_proposal_f32, "shl_ref_proposal_f32"}, + {shl_ref_proposal_quant, "shl_ref_proposal_quant"}, + {shl_ref_psroipooling_f32, "shl_ref_psroipooling_f32"}, + {shl_ref_psroipooling_quant, "shl_ref_psroipooling_quant"}, + {shl_ref_reduce_logsumexp_f32, "shl_ref_reduce_logsumexp_f32"}, + {shl_ref_reduce_logsumexp_quant, "shl_ref_reduce_logsumexp_quant"}, + {shl_ref_reduce_max_f32, "shl_ref_reduce_max_f32"}, + 
{shl_ref_reduce_max_quant, "shl_ref_reduce_max_quant"}, + {shl_ref_reduce_mean_f32, "shl_ref_reduce_mean_f32"}, + {shl_ref_reduce_mean_quant, "shl_ref_reduce_mean_quant"}, + {shl_ref_reduce_min_f32, "shl_ref_reduce_min_f32"}, + {shl_ref_reduce_min_quant, "shl_ref_reduce_min_quant"}, + {shl_ref_reduce_prod_f32, "shl_ref_reduce_prod_f32"}, + {shl_ref_reduce_prod_quant, "shl_ref_reduce_prod_quant"}, + {shl_ref_reduce_sum_f32, "shl_ref_reduce_sum_f32"}, + {shl_ref_reduce_sum_quant, "shl_ref_reduce_sum_quant"}, + {shl_ref_relu_f32, "shl_ref_relu_f32"}, + {shl_ref_relu_quant, "shl_ref_relu_quant"}, + {shl_ref_relu1_f32, "shl_ref_relu1_f32"}, + {shl_ref_relu1_quant, "shl_ref_relu1_quant"}, + {shl_ref_relu6_f32, "shl_ref_relu6_f32"}, + {shl_ref_relu6_quant, "shl_ref_relu6_quant"}, + {shl_ref_relun_f32, "shl_ref_relun_f32"}, + {shl_ref_relun_quant, "shl_ref_relun_quant"}, + {shl_ref_reshape, "shl_ref_reshape"}, + {shl_ref_reshape_quant, "shl_ref_reshape_quant"}, + {shl_ref_resize_f32, "shl_ref_resize_f32"}, +#if __riscv + {shl_ref_resize_f16, "shl_ref_resize_f16"}, +#endif + {shl_ref_resize_i8, "shl_ref_resize_i8"}, + {shl_ref_resize_quant, "shl_ref_resize_quant"}, + {shl_ref_reverse_f32, "shl_ref_reverse_f32"}, + {shl_ref_reverse_quant, "shl_ref_reverse_quant"}, + {shl_ref_roi_align_f32, "shl_ref_roi_align_f32"}, + {shl_ref_roipool_f32, "shl_ref_roipool_f32"}, + {shl_ref_roipool_quant, "shl_ref_roipool_quant"}, + {shl_ref_round_f32, "shl_ref_round_f32"}, + {shl_ref_round_quant, "shl_ref_round_quant"}, + {shl_ref_rsqrt_f32, "shl_ref_rsqrt_f32"}, + {shl_ref_rsqrt_quant, "shl_ref_rsqrt_quant"}, + {shl_ref_scatter_nd_f32, "shl_ref_scatter_nd_f32"}, + {shl_ref_scatter_nd_quant, "shl_ref_scatter_nd_quant"}, + {shl_ref_unsorted_segment_max_f32, "shl_ref_unsorted_segment_max_f32"}, + {shl_ref_segment_max_f32, "shl_ref_segment_max_f32"}, + {shl_ref_unsorted_segment_max_quant, "shl_ref_unsorted_segment_max_quant"}, + {shl_ref_segment_max_quant, "shl_ref_segment_max_quant"}, + 
{shl_ref_unsorted_segment_mean_f32, "shl_ref_unsorted_segment_mean_f32"}, + {shl_ref_segment_mean_f32, "shl_ref_segment_mean_f32"}, + {shl_ref_unsorted_segment_mean_quant, "shl_ref_unsorted_segment_mean_quant"}, + {shl_ref_segment_mean_quant, "shl_ref_segment_mean_quant"}, + {shl_ref_unsorted_segment_min_f32, "shl_ref_unsorted_segment_min_f32"}, + {shl_ref_segment_min_f32, "shl_ref_segment_min_f32"}, + {shl_ref_unsorted_segment_min_quant, "shl_ref_unsorted_segment_min_quant"}, + {shl_ref_segment_min_quant, "shl_ref_segment_min_quant"}, + {shl_ref_unsorted_segment_prod_f32, "shl_ref_unsorted_segment_prod_f32"}, + {shl_ref_segment_prod_f32, "shl_ref_segment_prod_f32"}, + {shl_ref_unsorted_segment_prod_quant, "shl_ref_unsorted_segment_prod_quant"}, + {shl_ref_segment_prod_quant, "shl_ref_segment_prod_quant"}, + {shl_ref_unsorted_segment_sum_f32, "shl_ref_unsorted_segment_sum_f32"}, + {shl_ref_segment_sum_f32, "shl_ref_segment_sum_f32"}, + {shl_ref_unsorted_segment_sum_quant, "shl_ref_unsorted_segment_sum_quant"}, + {shl_ref_segment_sum_quant, "shl_ref_segment_sum_quant"}, + {shl_ref_select_f32, "shl_ref_select_f32"}, + {shl_ref_select_u8, "shl_ref_select_u8"}, + {shl_ref_select_i8, "shl_ref_select_i8"}, + {shl_ref_shape_i32, "shl_ref_shape_i32"}, + {shl_ref_shape_u8, "shl_ref_shape_u8"}, + {shl_ref_shape_i8, "shl_ref_shape_i8"}, + {shl_ref_shuffle_channel_f32, "shl_ref_shuffle_channel_f32"}, + {shl_ref_shuffle_channel_quant, "shl_ref_shuffle_channel_quant"}, + {shl_ref_sigmoid_f32, "shl_ref_sigmoid_f32"}, + {shl_ref_sigmoid_quant, "shl_ref_sigmoid_quant"}, + {shl_ref_silu_f32, "shl_ref_silu_f32"}, + {shl_ref_silu_quant, "shl_ref_silu_quant"}, + {shl_ref_sign_f32, "shl_ref_sign_f32"}, + {shl_ref_sign_quant, "shl_ref_sign_quant"}, + {shl_ref_sin_f32, "shl_ref_sin_f32"}, + {shl_ref_sin_quant, "shl_ref_sin_quant"}, + {shl_ref_sinh_f32, "shl_ref_sinh_f32"}, + {shl_ref_sinh_quant, "shl_ref_sinh_quant"}, + {shl_ref_slice_f32, "shl_ref_slice_f32"}, + {shl_ref_slice_quant, 
"shl_ref_slice_quant"}, + {shl_ref_softmax_f32, "shl_ref_softmax_f32"}, + {shl_ref_softmax_quant, "shl_ref_softmax_quant"}, + {shl_ref_softplus_f32, "shl_ref_softplus_f32"}, + {shl_ref_softplus_quant, "shl_ref_softplus_quant"}, + {shl_ref_softrelu_f32, "shl_ref_softrelu_f32"}, + {shl_ref_softrelu_quant, "shl_ref_softrelu_quant"}, + {shl_ref_softsign_f32, "shl_ref_softsign_f32"}, + {shl_ref_softsign_quant, "shl_ref_softsign_quant"}, + {shl_ref_space_to_batch_f32, "shl_ref_space_to_batch_f32"}, + {shl_ref_space_to_batch_quant, "shl_ref_space_to_batch_quant"}, + {shl_ref_space_to_depth_f32, "shl_ref_space_to_depth_f32"}, + {shl_ref_space_to_depth_quant, "shl_ref_space_to_depth_quant"}, + {shl_ref_split_f32, "shl_ref_split_f32"}, + {shl_ref_split_quant, "shl_ref_split_quant"}, + {shl_ref_sqrt_f32, "shl_ref_sqrt_f32"}, + {shl_ref_sqrt_quant, "shl_ref_sqrt_quant"}, + {shl_ref_square_f32, "shl_ref_square_f32"}, + {shl_ref_square_quant, "shl_ref_square_quant"}, + {shl_ref_squeeze, "shl_ref_squeeze"}, + {shl_ref_squeeze_quant, "shl_ref_squeeze_quant"}, + {shl_ref_stack_f32, "shl_ref_stack_f32"}, + {shl_ref_stack_quant, "shl_ref_stack_quant"}, + {shl_ref_strided_slice, "shl_ref_strided_slice"}, + {shl_ref_strided_slice_f32, "shl_ref_strided_slice_f32"}, +#if __riscv + {shl_ref_strided_slice_f16, "shl_ref_strided_slice_f16"}, +#endif + {shl_ref_strided_slice_i8, "shl_ref_strided_slice_i8"}, + {shl_ref_strided_slice_quant, "shl_ref_strided_slice_quant"}, + {shl_ref_sub_f32, "shl_ref_sub_f32"}, + {shl_ref_sub_quant, "shl_ref_sub_quant"}, + {shl_ref_sum_stride_f32, "shl_ref_sum_stride_f32"}, + {shl_ref_sum_stride_quant, "shl_ref_sum_stride_quant"}, + {shl_ref_tan_f32, "shl_ref_tan_f32"}, + {shl_ref_tan_quant, "shl_ref_tan_quant"}, + {shl_ref_tanh_f32, "shl_ref_tanh_f32"}, + {shl_ref_tanh_f64, "shl_ref_tanh_f64"}, + {shl_ref_tanh_quant, "shl_ref_tanh_quant"}, + {shl_ref_threshold_relu_f32, "shl_ref_threshold_relu_f32"}, + {shl_ref_threshold_relu_quant, 
"shl_ref_threshold_relu_quant"}, + {shl_ref_tile_f32, "shl_ref_tile_f32"}, + {shl_ref_tile_quant, "shl_ref_tile_quant"}, + {shl_ref_topk_f32, "shl_ref_topk_f32"}, + {shl_ref_topk_quant, "shl_ref_topk_quant"}, + {shl_ref_transpose, "shl_ref_transpose"}, + {shl_ref_transpose_quant, "shl_ref_transpose_quant"}, + {shl_ref_trunc_f32, "shl_ref_trunc_f32"}, + {shl_ref_trunc_quant, "shl_ref_trunc_quant"}, + {shl_ref_unpooling_f32, "shl_ref_unpooling_f32"}, + {shl_ref_unpooling_quant, "shl_ref_unpooling_quant"}, + {shl_ref_unstack_f32, "shl_ref_unstack_f32"}, + {shl_ref_unstack_qunat, "shl_ref_unstack_qunat"}, + {shl_ref_xor_u32, "shl_ref_xor_u32"}, + {shl_ref_xor_u8, "shl_ref_xor_u8"}, + {shl_ref_xor_i8, "shl_ref_xor_i8"}, + {shl_ref_yuv_rgb_scale_f32, "shl_ref_yuv_rgb_scale_f32"}, + {shl_ref_yuv_rgb_scale_quant, "shl_ref_yuv_rgb_scale_quant"}, + {shl_ref_one_hot_f32, "shl_ref_one_hot_f32"}, + {shl_ref_one_hot_quant, "shl_ref_one_hot_quant"}, + {shl_ref_where_f32, "shl_ref_where_f32"}, + {shl_ref_where_quant, "shl_ref_where_quant"}, + {shl_ref_where_softmax_f32, "shl_ref_where_softmax_f32"}, + {shl_ref_where_softmax_quant, "shl_ref_where_softmax_quant"}, + {shl_ref_cast_f32, "shl_ref_cast_f32"}, + {shl_ref_cast_bool, "shl_ref_cast_bool"}, + {shl_ref_cast_i64, "shl_ref_cast_i64"}, + {shl_ref_cast_quant, "shl_ref_cast_quant"}, + {shl_ref_instance_norm_f32, "shl_ref_instance_norm_f32"}, + {shl_ref_instance_norm_quant, "shl_ref_instance_norm_quant"}, + {shl_ref_rms_norm_f32, "shl_ref_rms_norm_f32"}, + {shl_ref_rms_norm_quant, "shl_ref_rms_norm_quant"}, + {shl_ref_rope_f32, "shl_ref_rope_f32"}, + {shl_ref_rope_quant, "shl_ref_rope_quant"}, + {shl_ref_llm_pos_f32, "shl_ref_llm_pos_f32"}, + {shl_ref_llm_pos_quant, "shl_ref_llm_pos_quant"}, + {shl_ref_embedding_f32, "shl_ref_embedding_f32"}, + {shl_ref_embedding_quant, "shl_ref_embedding_quant"}, + {shl_ref_scaled_dot_product_attention_f32, "shl_ref_scaled_dot_product_attention_f32"}, + {shl_ref_scaled_dot_product_attention_quant, 
"shl_ref_scaled_dot_product_attention_quant"}, + {NULL, NULL}}; + +char *shl_ref_get_kernel_name(void *exec) +{ + return shl_find_function_name(shl_ref_kernel_map, exec); +} + +int shl_ref_abs_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params, struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_ref_acos_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params, struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_ref_acosh_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params, struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_ref_add_perf(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params, + struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_ref_and_perf(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params, + struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_ref_arange_perf(struct csinn_tensor *output, struct csinn_arange_params *params, + struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_ref_argmax_stride_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params, + struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return 
CSINN_TRUE; +} + +int shl_ref_argmin_stride_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params, + struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_ref_asin_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params, struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_ref_asinh_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params, struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_ref_atan_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params, struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_ref_atanh_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params, struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_ref_avgpool2d_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params, struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_ref_avgpool3d_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params, struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_ref_batch_normalization_perf(struct csinn_tensor *input, struct csinn_tensor *mean, + struct csinn_tensor *variance, struct csinn_tensor *gamma, + struct 
csinn_tensor *beta, struct csinn_tensor *output, + struct csinn_bn_params *params, + struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_ref_batch_to_space_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_batch_to_space_params *params, + struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_ref_broadcast_to_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_broadcast_to_params *params, + struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_ref_ceil_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params, struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_ref_clip_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_clip_params *params, struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_ref_col2im_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_col2im_params *params, + struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_ref_concat_perf(struct csinn_tensor **input, struct csinn_tensor *output, + struct csinn_concat_params *params, struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_ref_conv1d_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct 
csinn_conv1d_params *params, struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_ref_conv2d_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params, struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_ref_conv2d_channel_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params, + struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_ref_conv2d_relu_perf(struct csinn_tensor *o_input, struct csinn_tensor *o_output, + struct csinn_tensor *o_kernel, struct csinn_tensor *o_bias, + struct csinn_conv2d_params *params, struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_ref_cache_matmul_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weight, struct csinn_tensor *bias, + struct csinn_cache_matmul_params *params, + struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_ref_cache_conv1d_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weight, struct csinn_tensor *bias, + struct csinn_cache_conv1d_params *params, + struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_ref_conv2d_channel_relu_perf(struct csinn_tensor *o_input, struct csinn_tensor *o_output, + struct csinn_tensor *o_kernel, struct csinn_tensor *o_bias, + struct csinn_conv2d_params 
*params, + struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_ref_conv2d_relu6_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params, struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_ref_conv2d_channel_relu6_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params, + struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_ref_depthwise_conv2d_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params, + struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_ref_depthwise_conv2d_channel_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params, + struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_ref_depthwise_conv2d_relu_perf(struct csinn_tensor *o_input, struct csinn_tensor *o_output, + struct csinn_tensor *o_kernel, struct csinn_tensor *o_bias, + struct csinn_conv2d_params *params, + struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_ref_depthwise_conv2d_channel_relu_perf(struct csinn_tensor *o_input, + struct csinn_tensor *o_output, + struct csinn_tensor *o_kernel, + struct csinn_tensor *o_bias, + struct 
csinn_conv2d_params *params, + struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_ref_depthwise_conv2d_relu6_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params, + struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_ref_depthwise_conv2d_channel_relu6_perf(struct csinn_tensor *input, + struct csinn_tensor *output, + struct csinn_tensor *kernel, + struct csinn_tensor *bias, + struct csinn_conv2d_params *params, + struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_ref_group_conv2d_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params, struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_ref_group_conv2d_channel_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params, + struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_ref_group_conv2d_relu_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params, + struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_ref_group_conv2d_relu6_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct 
csinn_conv2d_params *params, + struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_ref_group_conv2d_channel_relu_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params, + struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_ref_conv3d_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv3d_params *params, struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_ref_cos_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params, struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_ref_cosh_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params, struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_ref_cumprod_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_cumprod_params *params, struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_ref_cumsum_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_cumsum_params *params, struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_ref_data_convert_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params, struct 
csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_ref_deconv2d_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params, struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_ref_depthwise_deconv2d_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params, + struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_ref_group_deconv2d_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params, + struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_ref_deconv3d_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv3d_params *params, struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_ref_depth_to_space_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_depth_to_space_params *params, + struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_ref_div_perf(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params, + struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int 
shl_ref_elu_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params, struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_ref_fsmn_perf(struct csinn_tensor *frame, struct csinn_tensor *l_filter, + struct csinn_tensor *r_filter, struct csinn_tensor *frame_sequence, + struct csinn_tensor *frame_counter, struct csinn_tensor *output, + struct csinn_fsmn_params *params, struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_ref_equal_perf(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params, + struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_ref_erf_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params, struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_ref_exp_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params, struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_ref_expand_dims_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_expand_dims_params *params, + struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_ref_expm1_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params, struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_ref_flatten_perf(struct 
csinn_tensor *input, struct csinn_tensor *output, + struct csinn_flatten_params *params, struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_ref_floor_divide_perf(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params, + struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_ref_floor_mod_perf(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params, + struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_ref_floor_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params, struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_ref_fullyconnected_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weights, struct csinn_tensor *bias, + struct csinn_fc_params *params, struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_ref_gather_nd_perf(struct csinn_tensor *input, struct csinn_tensor *indices, + struct csinn_tensor *output, struct csinn_gather_nd_params *params, + struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_ref_gather_perf(struct csinn_tensor *input, struct csinn_tensor *indices, + struct csinn_tensor *output, struct csinn_gather_params *params, + struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int 
shl_ref_global_avgpool2d_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params, + struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_ref_global_maxpool2d_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params, + struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_ref_greater_equal_perf(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params, + struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_ref_greater_perf(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params, + struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_ref_hard_sigmoid_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_sigmoid_params *params, + struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_ref_im2col_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_im2col_params *params, struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_ref_isnan_bool_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params, struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_ref_l2_normalization_perf(struct csinn_tensor *input, struct csinn_tensor *output, 
+ struct csinn_l2n_params *params, + struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_ref_l2pool_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params, struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_ref_layer_norm_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *gamma, struct csinn_tensor *beta, + struct csinn_layer_norm_params *params, + struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_ref_leaky_relu_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params, struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_ref_less_equal_perf(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params, + struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_ref_less_perf(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params, + struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_ref_log_softmax_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_softmax_params *params, struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_ref_log_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params, struct 
csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_ref_log1p_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params, struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_ref_logical_and_perf(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params, + struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_ref_logical_not_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params, struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_ref_logical_or_perf(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params, + struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_ref_logical_xor_perf(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params, + struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_ref_lrn_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_lrn_params *params, struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_ref_matmul_perf(struct csinn_tensor *mat0, struct csinn_tensor *mat1, + struct csinn_tensor *output, struct csinn_matmul_params *params, + struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = 
shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_ref_max_stride_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params, struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_ref_maximum_perf(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params, + struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_ref_maxpool2d_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params, struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_ref_maxpool2d_locat_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params, + struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_ref_maxpool3d_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params, struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_ref_mean_stride_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params, struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_ref_mean_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params, struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_ref_min_stride_perf(struct csinn_tensor *input, struct 
csinn_tensor *output, + struct csinn_reduce_params *params, struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_ref_minimum_perf(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params, + struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_ref_mod_perf(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params, + struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_ref_mul_perf(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params, + struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_ref_ndarray_size_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_ndarray_size_params *params, + struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_ref_negative_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params, struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_ref_non_max_suppression_std_perf(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, + struct csinn_non_max_suppression_params *params, + struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_ref_not_equal_perf(struct csinn_tensor *input0, struct csinn_tensor 
*input1, + struct csinn_tensor *output, struct csinn_diso_params *params, + struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_ref_not_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params, struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_ref_or_perf(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params, + struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_ref_pad_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pad_params *params, struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_ref_power_perf(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params, + struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_ref_prelu_perf(struct csinn_tensor *input, struct csinn_tensor *alpha, + struct csinn_tensor *output, struct csinn_prelu_params *params, + struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_ref_prod_stride_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params, struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_ref_proposal_perf(struct csinn_tensor *cls_prob, struct csinn_tensor *bbox_pred, + struct csinn_tensor *im_info, struct csinn_tensor *output, + 
struct csinn_proposal_params *params, struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_ref_psroipooling_perf(struct csinn_tensor *data, struct csinn_tensor *rois, + struct csinn_tensor *output, struct csinn_psroipooling_params *params, + struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_ref_reduce_logsumexp_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params, + struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_ref_reduce_max_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params, struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_ref_reduce_mean_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params, struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_ref_reduce_min_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params, struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_ref_reduce_prod_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params, struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_ref_reduce_sum_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params, struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = 
shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_ref_relu_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params, struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_ref_relu1_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params, struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_ref_relu6_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params, struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_ref_relun_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params, struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_ref_reshape_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reshape_params *params, struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_ref_resize_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_resize_params *params, struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_ref_reverse_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reverse_params *params, struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_ref_roi_align_perf(struct csinn_tensor *data, struct csinn_tensor *rois, + struct csinn_tensor *output, struct 
csinn_roi_align_params *params, + struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_ref_roipool_perf(struct csinn_tensor *data, struct csinn_tensor *rois, + struct csinn_tensor *output, struct csinn_roi_pool_params *params, + struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_ref_round_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params, struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_ref_rsqrt_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params, struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_ref_scatter_nd_perf(struct csinn_tensor *input, struct csinn_tensor *indices, + struct csinn_tensor *updates, struct csinn_tensor *output, + struct csinn_scatter_nd_params *params, + struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_ref_unsorted_segment_max_perf(struct csinn_tensor *input, struct csinn_tensor *segment_ids, + struct csinn_tensor *output, + struct csinn_segment_params *params, + struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_ref_segment_max_perf(struct csinn_tensor *input, struct csinn_tensor *segment_ids, + struct csinn_tensor *output, struct csinn_segment_params *params, + struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_ref_unsorted_segment_mean_perf(struct csinn_tensor *input, struct 
csinn_tensor *segment_ids, + struct csinn_tensor *output, + struct csinn_segment_params *params, + struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_ref_segment_mean_perf(struct csinn_tensor *input, struct csinn_tensor *segment_ids, + struct csinn_tensor *output, struct csinn_segment_params *params, + struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_ref_unsorted_segment_min_perf(struct csinn_tensor *input, struct csinn_tensor *segment_ids, + struct csinn_tensor *output, + struct csinn_segment_params *params, + struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_ref_segment_min_perf(struct csinn_tensor *input, struct csinn_tensor *segment_ids, + struct csinn_tensor *output, struct csinn_segment_params *params, + struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_ref_unsorted_segment_prod_perf(struct csinn_tensor *input, struct csinn_tensor *segment_ids, + struct csinn_tensor *output, + struct csinn_segment_params *params, + struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_ref_segment_prod_perf(struct csinn_tensor *input, struct csinn_tensor *segment_ids, + struct csinn_tensor *output, struct csinn_segment_params *params, + struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_ref_unsorted_segment_sum_perf(struct csinn_tensor *input, struct csinn_tensor *segment_ids, + struct csinn_tensor *output, + struct csinn_segment_params *params, + struct csinn_perf_info *perf_info) +{ + 
perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_ref_segment_sum_perf(struct csinn_tensor *input, struct csinn_tensor *segment_ids, + struct csinn_tensor *output, struct csinn_segment_params *params, + struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_ref_select_perf(struct csinn_tensor *condition, struct csinn_tensor *input0, + struct csinn_tensor *input1, struct csinn_tensor *output, + struct csinn_select_params *params, struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_ref_shape_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_shape_params *params, struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_ref_shuffle_channel_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_shuffle_channel_params *params, + struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_ref_sigmoid_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_sigmoid_params *params, struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_ref_silu_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_sigmoid_params *params, struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_ref_sign_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params, struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = 
shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_ref_sin_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params, struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_ref_sinh_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params, struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_ref_slice_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_slice_params *params, struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_ref_softmax_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_softmax_params *params, struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_ref_softplus_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params, struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_ref_softrelu_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params, struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_ref_softsign_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params, struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_ref_space_to_batch_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct 
csinn_space_to_batch_params *params, + struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_ref_space_to_depth_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_space_to_depth_params *params, + struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_ref_split_perf(struct csinn_tensor *input, struct csinn_tensor **output, + struct csinn_split_params *params, struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_ref_sqrt_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params, struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_ref_square_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params, struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_ref_squeeze_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_squeeze_params *params, struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_ref_stack_perf(struct csinn_tensor **input, struct csinn_tensor *output, + struct csinn_stack_params *params, struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_ref_strided_slice_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_strided_slice_params *params, + struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + 
return CSINN_TRUE; +} + +int shl_ref_sub_perf(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params, + struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_ref_sum_stride_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params, struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_ref_tan_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params, struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_ref_tanh_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params, struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_ref_threshold_relu_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params, struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_ref_tile_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tile_params *params, struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_ref_topk_perf(struct csinn_tensor *input, struct csinn_tensor *output1, + struct csinn_tensor *output2, struct csinn_topk_params *params, + struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_ref_transpose_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct 
csinn_transpose_params *params, struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_ref_trunc_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params, struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_ref_unpooling_perf(struct csinn_tensor *input, struct csinn_tensor *mask, + struct csinn_tensor *output, struct csinn_unpooling_params *params, + struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_ref_unstack_perf(struct csinn_tensor *input, struct csinn_tensor **output, + struct csinn_unstack_params *params, struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_ref_xor_perf(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params, + struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_ref_yuv_rgb_scale_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params, struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_ref_one_hot_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_one_hot_params *params, struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_ref_where_perf(struct csinn_tensor *condition, struct csinn_tensor *x, + struct csinn_tensor *y, struct csinn_tensor *output, + struct csinn_where_params *params, struct csinn_perf_info 
*perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_ref_where_softmax_perf(struct csinn_tensor *condition, struct csinn_tensor *y, + struct csinn_tensor *output, + struct csinn_where_softmax_params *params, + struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_ref_cast_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_cast_params *params, struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_ref_instance_norm_perf(struct csinn_tensor *input, struct csinn_tensor *scales, + struct csinn_tensor *bias, struct csinn_tensor *output, + struct csinn_instance_norm_params *params, + struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_ref_rms_norm_perf(struct csinn_tensor *input, struct csinn_tensor *weight, + struct csinn_tensor *output, struct csinn_rms_norm_params *params, + struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_ref_rope_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_rope_params *params, struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_ref_llm_pos_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_llm_pos_params *params, struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_ref_embedding_perf(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params, + struct csinn_perf_info 
*perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_ref_scaled_dot_product_attention_perf(struct csinn_tensor *query, struct csinn_tensor *key, + struct csinn_tensor *value, + struct csinn_tensor *output, + struct csinn_scale_dot_attention_params *params, + struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_ref_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} \ No newline at end of file diff --git a/source/reference/rms_norm.c b/source/reference/rms_norm.c index 08e9a823..dadc63cb 100644 --- a/source/reference/rms_norm.c +++ b/source/reference/rms_norm.c @@ -18,8 +18,8 @@ #include "reference/ref.h" -int shl_ref_rms_norm_f32(struct csinn_tensor *input, struct csinn_tensor *output, - struct csinn_tensor *weight, struct csinn_rms_norm_params *params) +int shl_ref_rms_norm_f32(struct csinn_tensor *input, struct csinn_tensor *weight, + struct csinn_tensor *output, struct csinn_rms_norm_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -56,14 +56,14 @@ int shl_ref_rms_norm_f32(struct csinn_tensor *input, struct csinn_tensor *output return CSINN_TRUE; } -int shl_ref_rms_norm_quant(struct csinn_tensor *input, struct csinn_tensor *output, - struct csinn_tensor *weight, struct csinn_rms_norm_params *params) +int shl_ref_rms_norm_quant(struct csinn_tensor *input, struct csinn_tensor *weight, + struct csinn_tensor *output, struct csinn_rms_norm_params *params) { struct csinn_tensor *float_input = shl_ref_tensor_transform_f32(input); struct csinn_tensor *float_output = shl_ref_tensor_transform_f32(output); struct csinn_tensor *float_gamma = shl_ref_tensor_transform_f32(weight); - int ret = shl_ref_rms_norm_f32(float_input, float_output, float_gamma, params); + int ret = shl_ref_rms_norm_f32(float_input, float_gamma, float_output, params); csinn_tensor_data_convert(output, float_output); diff --git 
a/source/reference/rope.c b/source/reference/rope.c index 255b72a5..b1f586e5 100644 --- a/source/reference/rope.c +++ b/source/reference/rope.c @@ -33,32 +33,58 @@ int shl_ref_rope_f32(struct csinn_tensor *input, struct csinn_tensor *output, float *dst_data = output->data; int32_t *pos = params->pos; - for (int i3 = 0; i3 < input->dim[0]; i3++) { - for (int i2 = 0; i2 < input->dim[1]; i2++) { - int p = pos[i2]; - for (int i1 = 0; i1 < input->dim[2]; i1++) { - float theta = freq_scale * (float)p; + if (!params->use_rope_cache) { + for (int i3 = 0; i3 < input->dim[0]; i3++) { + for (int i2 = 0; i2 < input->dim[1]; i2++) { + int p = pos[i2]; + for (int i1 = 0; i1 < input->dim[2]; i1++) { + float theta = freq_scale * (float)p; - for (int i0 = 0; i0 < input->dim[3]; i0 += 2) { - float cos_theta = cosf(theta); - float sin_theta = sinf(theta); - // zeta scaling for xPos only: - float zeta = xpos_base != 0.0f - ? powf((i0 + 0.4f * input->dim[0]) / (1.4f * input->dim[0]), - p / xpos_base) - : 1.0f; - if (xpos_down) zeta = 1.0f / zeta; + for (int i0 = 0; i0 < input->dim[3]; i0 += 2) { + float cos_theta = cosf(theta); + float sin_theta = sinf(theta); + // zeta scaling for xPos only: + float zeta = + xpos_base != 0.0f + ? 
powf((i0 + 0.4f * input->dim[0]) / (1.4f * input->dim[0]), + p / xpos_base) + : 1.0f; + if (xpos_down) zeta = 1.0f / zeta; - theta *= theta_scale; + theta *= theta_scale; - int index = i3 * (input->dim[3] * input->dim[2] * input->dim[1]) + - i2 * (input->dim[3] * input->dim[2]) + i1 * input->dim[3] + i0; + int index = i3 * (input->dim[3] * input->dim[2] * input->dim[1]) + + i2 * (input->dim[3] * input->dim[2]) + i1 * input->dim[3] + i0; - float x0 = src_data[index]; - float x1 = src_data[index + 1]; + float x0 = src_data[index]; + float x1 = src_data[index + 1]; - dst_data[index] = x0 * cos_theta * zeta - x1 * sin_theta * zeta; - dst_data[index + 1] = x0 * sin_theta * zeta + x1 * cos_theta * zeta; + dst_data[index] = x0 * cos_theta * zeta - x1 * sin_theta * zeta; + dst_data[index + 1] = x0 * sin_theta * zeta + x1 * cos_theta * zeta; + } + } + } + } + } else { + float *rope_cache = &((float *)params->rope_cache)[pos[0] * input->dim[2] * input->dim[3]]; + for (int i3 = 0; i3 < input->dim[0]; i3++) { + for (int i2 = 0; i2 < input->dim[1]; i2++) { + for (int i1 = 0; i1 < input->dim[2]; i1++) { + for (int i0 = 0; i0 < input->dim[3]; i0 += 2) { + int index = i3 * (input->dim[3] * input->dim[2] * input->dim[1]) + + i2 * (input->dim[3] * input->dim[2]) + i1 * input->dim[3] + i0; + + int rope_cache_index = + i2 * (input->dim[3] * input->dim[2]) + i1 * input->dim[3] + i0; + + float x0 = src_data[index]; + float x1 = src_data[index + 1]; + float sin_theta = rope_cache[rope_cache_index]; + float cos_theta = rope_cache[rope_cache_index + 1]; + + dst_data[index] = x0 * cos_theta - x1 * sin_theta; + dst_data[index + 1] = x0 * sin_theta + x1 * cos_theta; + } } } } diff --git a/source/reference/scaled_dot_product_attention.c b/source/reference/scaled_dot_product_attention.c index 457f4d4c..8b6d1693 100644 --- a/source/reference/scaled_dot_product_attention.c +++ b/source/reference/scaled_dot_product_attention.c @@ -39,66 +39,12 @@ int shl_ref_scaled_dot_product_attention_f32(struct csinn_tensor *query, 
struct int32_t sk = key->dim[2]; int32_t sq = query->dim[2]; int32_t head_dim = query->dim[3]; - float norm_factor = sqrt(128); + float norm_factor = 1.0f / params->norm_factor; - // matmul_result = torch.matmul(query, key.transpose(-1, -2)) - // matmul_result = matmul_result / norm_factor - float *matmul_res_data = shl_mem_alloc(batch * np * sq * sk * sizeof(float)); - int cnt = 0; - for (int i = 0; i < np; i++) { - float *mat_input1 = query_data + i * sq * head_dim; - float *mat_input2 = key_data + i * sk * head_dim; - - for (int j = 0; j < sq; j++) { - for (int k = 0; k < sk; k++) { - float sum = 0; - for (int l = 0; l < head_dim; l++) { - sum += (mat_input1[j * head_dim + l] * mat_input2[k * head_dim + l]); - } - - matmul_res_data[cnt] = sum / norm_factor; - cnt++; - } - } - } - - // attention_mask and softmax - // attention_scores: [batch,np,sq,sk] - - float *input = matmul_res_data; - float *output = matmul_res_data; - - for (int i = 0; i < np; i++) { - for (int k = 0; k < sq; k++) { - float acc_exp = 0.0f; - float max = -FLT_MAX; - int cnt = sk; - if (params->casual) { - cnt = k + 1 + (sk - sq); - } - for (int j = 0; j < cnt; j++) { - max = fmax(max, *(input + j)); - } - // compute sum - for (int j = 0; j < cnt; j++) { - acc_exp += exp(*(input + j) - max); - } - // compute final result - for (int j = 0; j < cnt; j++) { - *(output + j) = exp(*(input + j) - max) / acc_exp; - } - if (params->casual) { - for (int j = cnt; j < sk; j++) { - *(output + j) = 0; - } - } - input += sk; - output += sk; - } - } - - // if value is [batch,np,sk,dim_head],do transpose(-2,-1) - if (!params->transpose_v) { + // deal with transpose_v first + // batch,np,sq,sk * batch,np,sk,dim_head + if (!params->transpose_v) // if value is [batch,np,sk,dim_head],do transpose(-2,-1) + { float *value_transpose_tmp = shl_mem_alloc(batch * np * sk * head_dim * sizeof(float)); memcpy(value_transpose_tmp, value_data, batch * np * sk * head_dim * sizeof(float)); for (int i = 0; i < np; i++) { @@ 
-114,11 +60,47 @@ int shl_ref_scaled_dot_product_attention_f32(struct csinn_tensor *query, struct shl_mem_free(value_transpose_tmp); } - cnt = 0; - // context_layer = torch.matmul(attention_probs, value_layer1) - for (int i = 0; i < np; i++) { - float *mat_input1 = matmul_res_data + i * sq * sk; - float *mat_input2 = value_data + i * head_dim * sk; + // matmul_result = torch.matmul(query_layer, key_layer.transpose(-1, -2)) + // matmul_result = matmul_result / norm_factor + size_t matmul_res_size = batch * np * sq * sk * sizeof(float); + float *matmul_res_data = shl_mem_alloc(matmul_res_size); + memset(matmul_res_data, 0, matmul_res_size); + for (int i = 0; i < np; i++) // split into multiple threads from here. + { + float *mat_input1 = query_data + i * sq * head_dim; + float *mat_input2 = key_data + i * sk * head_dim; + + for (int j = 0; j < sq; j++) { + float max = -FLT_MAX; + float acc_exp = 0; + int casual_cnt = sk; + if (params->casual) { + casual_cnt = j + 1 + (sk - sq); + } + for (int k = 0; k < casual_cnt; k++) { + float sum = 0; + for (int l = 0; l < head_dim; l++) { + sum += (mat_input1[j * head_dim + l] * mat_input2[k * head_dim + l]); + } + sum *= norm_factor; + // cal exp_sum + float tmp = max; + max = fmax(max, sum); + acc_exp *= exp(tmp - max); + acc_exp += exp(sum - max); + matmul_res_data[i * sq * sk + j * sk + k] = sum; + } + // do softmax + for (int k = 0; k < casual_cnt; k++) { + *(matmul_res_data + i * sq * sk + j * sk + k) = + exp(*(matmul_res_data + i * sq * sk + j * sk + k) - max) / acc_exp; + } + } + + // context_layer = torch.matmul(attention_probs, value_layer) + + mat_input1 = matmul_res_data + i * sq * sk; + mat_input2 = value_data + i * head_dim * sk; for (int j = 0; j < sq; j++) { for (int k = 0; k < head_dim; k++) { @@ -126,8 +108,7 @@ int shl_ref_scaled_dot_product_attention_f32(struct csinn_tensor *query, struct for (int l = 0; l < sk; l++) { sum += (mat_input1[j * sk + l] * mat_input2[k * sk + l]); } - output_data[cnt] = sum; - 
cnt++; + output_data[i * sq * head_dim + j * head_dim + k] = sum; } } } @@ -149,7 +130,7 @@ int shl_ref_scaled_dot_product_attention_quant(struct csinn_tensor *query, struc float_output, params); csinn_tensor_data_convert(output, float_output); shl_ref_tensor_transform_free_f32(float_query); - shl_ref_tensor_transform_free_f32(float_output); + shl_ref_tensor_transform_free_f32(float_value); shl_ref_tensor_transform_free_f32(float_key); shl_ref_tensor_transform_free_f32(float_output); return ret; diff --git a/source/reference/setup.c b/source/reference/setup.c index 4b03fac1..67fbf950 100644 --- a/source/reference/setup.c +++ b/source/reference/setup.c @@ -16,6 +16,7 @@ * limitations under the License. */ +#include "reference/perf.h" #include "reference/ref.h" void shl_ref_nn_init(struct csinn_tensor *input, struct csinn_tensor *output) @@ -211,594 +212,816 @@ static void *setup_cb_map() for (int i = CSINN_DTYPE_INT4; i <= CSINN_DTYPE_FLOAT32; i++) { #ifndef CONFIG_C_REFERENCE_ABS_DISABLED cb_map[CSINN_OP_ABS][i].exec = shl_ref_abs_quant; + cb_map[CSINN_OP_ABS][i].perf = shl_ref_abs_perf; #endif #ifndef CONFIG_C_REFERENCE_ACOS_DISABLED cb_map[CSINN_OP_ACOS][i].exec = shl_ref_acos_quant; + cb_map[CSINN_OP_ACOS][i].perf = shl_ref_acos_perf; #endif #ifndef CONFIG_C_REFERENCE_ACOSH_DISABLED cb_map[CSINN_OP_ACOSH][i].exec = shl_ref_acosh_quant; + cb_map[CSINN_OP_ACOSH][i].perf = shl_ref_acosh_perf; #endif #ifndef CONFIG_C_REFERENCE_ADD_DISABLED cb_map[CSINN_OP_ADD][i].exec = shl_ref_add_quant; + cb_map[CSINN_OP_ADD][i].perf = shl_ref_add_perf; #endif #ifndef CONFIG_C_REFERENCE_ARANGE_DISABLED cb_map[CSINN_OP_ARANGE][i].exec = shl_ref_arange_quant; + cb_map[CSINN_OP_ARANGE][i].perf = shl_ref_arange_perf; #endif #ifndef CONFIG_C_REFERENCE_ARGMAX_DISABLED cb_map[CSINN_OP_ARGMAX][i].exec = shl_ref_argmax_stride_quant; + cb_map[CSINN_OP_ARGMAX][i].perf = shl_ref_argmax_stride_perf; #endif #ifndef CONFIG_C_REFERENCE_ARGMIN_DISABLED cb_map[CSINN_OP_ARGMIN][i].exec = 
shl_ref_argmin_stride_quant; + cb_map[CSINN_OP_ARGMIN][i].perf = shl_ref_argmin_stride_perf; #endif #ifndef CONFIG_C_REFERENCE_ASIN_DISABLED cb_map[CSINN_OP_ASIN][i].exec = shl_ref_asin_quant; + cb_map[CSINN_OP_ASIN][i].perf = shl_ref_asin_perf; #endif #ifndef CONFIG_C_REFERENCE_ASINH_DISABLED cb_map[CSINN_OP_ASINH][i].exec = shl_ref_asinh_quant; + cb_map[CSINN_OP_ASINH][i].perf = shl_ref_asinh_perf; #endif #ifndef CONFIG_C_REFERENCE_ATAN_DISABLED cb_map[CSINN_OP_ATAN][i].exec = shl_ref_atan_quant; + cb_map[CSINN_OP_ATAN][i].perf = shl_ref_atan_perf; #endif #ifndef CONFIG_C_REFERENCE_ATANH_DISABLED cb_map[CSINN_OP_ATANH][i].exec = shl_ref_atanh_quant; + cb_map[CSINN_OP_ATANH][i].perf = shl_ref_atanh_perf; #endif #ifndef CONFIG_C_REFERENCE_AVERAGEPOOL_DISABLED cb_map[CSINN_OP_AVGPOOL2D][i].exec = shl_ref_avgpool2d_quant; + cb_map[CSINN_OP_AVGPOOL2D][i].perf = shl_ref_avgpool2d_perf; #endif #ifndef CONFIG_C_REFERENCE_AVERAGEPOOL3D_DISABLED cb_map[CSINN_OP_AVGPOOL3D][i].exec = shl_ref_avgpool3d_quant; + cb_map[CSINN_OP_AVGPOOL3D][i].perf = shl_ref_avgpool3d_perf; #endif #ifndef CONFIG_C_REFERENCE_BATCH_NORMALIZATION_DISABLED cb_map[CSINN_OP_BN][i].exec = shl_ref_batch_normalization_quant; + cb_map[CSINN_OP_BN][i].perf = shl_ref_batch_normalization_perf; #endif #ifndef CONFIG_C_REFERENCE_BATCH_TO_SPACE_DISABLED cb_map[CSINN_OP_BATCH_TO_SPACE][i].exec = shl_ref_batch_to_space_quant; + cb_map[CSINN_OP_BATCH_TO_SPACE][i].perf = shl_ref_batch_to_space_perf; #endif #ifndef CONFIG_C_REFERENCE_BROADCAST_TO_DISABLED cb_map[CSINN_OP_BROADCOST][i].exec = shl_ref_broadcast_to_quant; + cb_map[CSINN_OP_BROADCOST][i].perf = shl_ref_broadcast_to_perf; #endif #ifndef CONFIG_C_REFERENCE_CACHE_MATMUL_DISABLED cb_map[CSINN_OP_CACHE_MATMUL][i].exec = shl_ref_cache_matmul_quant; + cb_map[CSINN_OP_CACHE_MATMUL][i].perf = shl_ref_cache_matmul_perf; cb_map[CSINN_OP_CACHE_MATMUL][i].init = shl_ref_cache_matmul_init; #endif #ifndef CONFIG_C_REFERENCE_CACHE_CONV1D_DISABLED 
cb_map[CSINN_OP_CACHE_CONV1D][i].exec = shl_ref_cache_conv1d_quant; + cb_map[CSINN_OP_CACHE_CONV1D][i].perf = shl_ref_cache_conv1d_perf; cb_map[CSINN_OP_CACHE_CONV1D][i].init = shl_ref_cache_conv1d_init; #endif #ifndef CONFIG_C_REFERENCE_CONV1D_DISABLED cb_map[CSINN_OP_CONV1D][i].exec = shl_ref_conv1d_quant; + cb_map[CSINN_OP_CONV1D][i].perf = shl_ref_conv1d_perf; cb_map[CSINN_OP_DEPTHWISE_CONV1D][i].exec = shl_ref_conv1d_quant; + cb_map[CSINN_OP_DEPTHWISE_CONV1D][i].perf = shl_ref_conv1d_perf; cb_map[CSINN_OP_GROUP_CONV1D][i].exec = shl_ref_conv1d_quant; + cb_map[CSINN_OP_GROUP_CONV1D][i].perf = shl_ref_conv1d_perf; #endif #ifndef CONFIG_C_REFERENCE_CEIL_DISABLED cb_map[CSINN_OP_CEIL][i].exec = shl_ref_ceil_quant; + cb_map[CSINN_OP_CEIL][i].perf = shl_ref_ceil_perf; #endif #ifndef CONFIG_C_REFERENCE_CLIP_DISABLED cb_map[CSINN_OP_CLIP][i].exec = shl_ref_clip_quant; + cb_map[CSINN_OP_CLIP][i].perf = shl_ref_clip_perf; #endif #ifndef CONFIG_C_REFERENCE_CONCAT_DISABLED cb_map[CSINN_OP_CONCAT][i].exec = shl_ref_concat_quant; + cb_map[CSINN_OP_CONCAT][i].perf = shl_ref_concat_perf; #endif #ifndef CONFIG_C_REFERENCE_COS_DISABLED cb_map[CSINN_OP_COS][i].exec = shl_ref_cos_quant; + cb_map[CSINN_OP_COS][i].perf = shl_ref_cos_perf; #endif #ifndef CONFIG_C_REFERENCE_COSH_DISABLED cb_map[CSINN_OP_COSH][i].exec = shl_ref_cosh_quant; + cb_map[CSINN_OP_COSH][i].perf = shl_ref_cosh_perf; #endif #ifndef CONFIG_C_REFERENCE_CUMPROD_DISABLED cb_map[CSINN_OP_CUMPROD][i].exec = shl_ref_cumprod_quant; + cb_map[CSINN_OP_CUMPROD][i].perf = shl_ref_cumprod_perf; #endif #ifndef CONFIG_C_REFERENCE_CUMSUM_DISABLED cb_map[CSINN_OP_CUMSUM][i].exec = shl_ref_cumsum_quant; + cb_map[CSINN_OP_CUMSUM][i].perf = shl_ref_cumsum_perf; #endif #ifndef CONFIG_C_REFERENCE_DEPTH_TO_SPACE_DISABLED cb_map[CSINN_OP_DEPTH_TO_SPACE][i].exec = shl_ref_depth_to_space_quant; + cb_map[CSINN_OP_DEPTH_TO_SPACE][i].perf = shl_ref_depth_to_space_perf; #endif #ifndef CONFIG_C_REFERENCE_DIV_DISABLED 
cb_map[CSINN_OP_DIV][i].exec = shl_ref_div_quant; + cb_map[CSINN_OP_DIV][i].perf = shl_ref_div_perf; #endif #ifndef CONFIG_C_REFERENCE_ELU_DISABLED cb_map[CSINN_OP_ELU][i].exec = shl_ref_elu_quant; + cb_map[CSINN_OP_ELU][i].perf = shl_ref_elu_perf; #endif #ifndef CONFIG_C_REFERENCE_EMBEDDING_DISABLED cb_map[CSINN_OP_EMBEDDING][i].exec = shl_ref_embedding_quant; + cb_map[CSINN_OP_EMBEDDING][i].perf = shl_ref_embedding_perf; #endif #ifndef CONFIG_C_REFERENCE_EQUAL_DISABLED cb_map[CSINN_OP_EQUANL][i].exec = shl_ref_equal_quant; + cb_map[CSINN_OP_EQUANL][i].perf = shl_ref_equal_perf; #endif #ifndef CONFIG_C_REFERENCE_ERF_DISABLED cb_map[CSINN_OP_ERF][i].exec = shl_ref_erf_quant; + cb_map[CSINN_OP_ERF][i].perf = shl_ref_erf_perf; #endif #ifndef CONFIG_C_REFERENCE_EXP_DISABLED cb_map[CSINN_OP_EXP][i].exec = shl_ref_exp_quant; + cb_map[CSINN_OP_EXP][i].perf = shl_ref_exp_perf; #endif #ifndef CONFIG_C_REFERENCE_EXPAND_DIMS_DISABLED cb_map[CSINN_OP_EXPAND_DIMS][i].exec = shl_ref_expand_dims_quant; + cb_map[CSINN_OP_EXPAND_DIMS][i].perf = shl_ref_expand_dims_perf; #endif #ifndef CONFIG_C_REFERENCE_EXPM1_DISABLED cb_map[CSINN_OP_EXPM1][i].exec = shl_ref_expm1_quant; + cb_map[CSINN_OP_EXPM1][i].perf = shl_ref_expm1_perf; #endif #ifndef CONFIG_C_REFERENCE_FLATTEN_DISABLED cb_map[CSINN_OP_FLATTEN][i].exec = shl_ref_flatten; + cb_map[CSINN_OP_FLATTEN][i].perf = shl_ref_flatten_perf; cb_map[CSINN_OP_FLATTEN][i].init = shl_ref_flatten_init; #endif #ifndef CONFIG_C_REFERENCE_FLOOR_DIVIDE_DISABLED cb_map[CSINN_OP_FLOOR_DIVIDE][i].exec = shl_ref_floor_divide_quant; + cb_map[CSINN_OP_FLOOR_DIVIDE][i].perf = shl_ref_floor_divide_perf; #endif #ifndef CONFIG_C_REFERENCE_FLOOR_MOD_DISABLED cb_map[CSINN_OP_FLOOR_MOD][i].exec = shl_ref_floor_mod_quant; + cb_map[CSINN_OP_FLOOR_MOD][i].perf = shl_ref_floor_mod_perf; #endif #ifndef CONFIG_C_REFERENCE_FLOOR_DISABLED cb_map[CSINN_OP_FLOOR][i].exec = shl_ref_floor_quant; + cb_map[CSINN_OP_FLOOR][i].perf = shl_ref_floor_perf; #endif #ifndef 
CONFIG_C_REFERENCE_FSMN_DISABLED cb_map[CSINN_OP_FSMN][i].exec = shl_ref_fsmn_quant; + cb_map[CSINN_OP_FSMN][i].perf = shl_ref_fsmn_perf; #endif #ifndef CONFIG_C_REFERENCE_GATHER_ND_DISABLED cb_map[CSINN_OP_GATHER_ND][i].exec = shl_ref_gather_nd_quant; + cb_map[CSINN_OP_GATHER_ND][i].perf = shl_ref_gather_nd_perf; #endif #ifndef CONFIG_C_REFERENCE_GATHER_DISABLED cb_map[CSINN_OP_GATHER][i].exec = shl_ref_gather_quant; + cb_map[CSINN_OP_GATHER][i].perf = shl_ref_gather_perf; #endif #ifndef CONFIG_C_REFERENCE_GLOBAL_AVERAGEPOOL_DISABLED cb_map[CSINN_OP_GLOBAL_AVGPOOL2D][i].exec = shl_ref_global_avgpool2d_quant; + cb_map[CSINN_OP_GLOBAL_AVGPOOL2D][i].perf = shl_ref_global_avgpool2d_perf; #endif #ifndef CONFIG_C_REFERENCE_GLOBAL_MAXPOOL_DISABLED cb_map[CSINN_OP_GLOBAL_MAXPOOL2D][i].exec = shl_ref_global_maxpool2d_quant; + cb_map[CSINN_OP_GLOBAL_MAXPOOL2D][i].perf = shl_ref_global_maxpool2d_perf; #endif #ifndef CONFIG_C_REFERENCE_GREATER_EQUAL_DISABLED cb_map[CSINN_OP_GREATHER_EQUAL][i].exec = shl_ref_greater_equal_quant; + cb_map[CSINN_OP_GREATHER_EQUAL][i].perf = shl_ref_greater_equal_perf; #endif #ifndef CONFIG_C_REFERENCE_GREATER_DISABLED cb_map[CSINN_OP_GREATHER][i].exec = shl_ref_greater_quant; + cb_map[CSINN_OP_GREATHER][i].perf = shl_ref_greater_perf; #endif #ifndef CONFIG_C_REFERENCE_HARD_SIGMOID_DISABLED cb_map[CSINN_OP_HARD_SIGMOID][i].exec = shl_ref_hard_sigmoid_quant; + cb_map[CSINN_OP_HARD_SIGMOID][i].perf = shl_ref_hard_sigmoid_perf; #endif #ifndef CONFIG_C_REFERENCE_IM2COL_DISABLED cb_map[CSINN_OP_IM2COL][i].exec = shl_ref_im2col_quant; + cb_map[CSINN_OP_IM2COL][i].perf = shl_ref_im2col_perf; #endif #ifndef CONFIG_C_REFERENCE_L2_NORMALIZATION_DISABLED cb_map[CSINN_OP_L2N][i].exec = shl_ref_l2_normalization_quant; + cb_map[CSINN_OP_L2N][i].perf = shl_ref_l2_normalization_perf; #endif #ifndef CONFIG_C_REFERENCE_LAYER_NORM_DISABLED cb_map[CSINN_OP_LAYER_NORM][i].exec = shl_ref_layer_norm_quant; + cb_map[CSINN_OP_LAYER_NORM][i].perf = 
shl_ref_layer_norm_perf; #endif #ifndef CONFIG_C_REFERENCE_LEAKY_RELU_DISABLED cb_map[CSINN_OP_LEAKY_RELU][i].exec = shl_ref_leaky_relu_quant; + cb_map[CSINN_OP_LEAKY_RELU][i].perf = shl_ref_leaky_relu_perf; #endif #ifndef CONFIG_C_REFERENCE_LESS_EQUAL_DISABLED cb_map[CSINN_OP_LESS_EQUAL][i].exec = shl_ref_less_equal_quant; + cb_map[CSINN_OP_LESS_EQUAL][i].perf = shl_ref_less_equal_perf; #endif #ifndef CONFIG_C_REFERENCE_LESS_DISABLED cb_map[CSINN_OP_LESS][i].exec = shl_ref_less_quant; + cb_map[CSINN_OP_LESS][i].perf = shl_ref_less_perf; #endif #ifndef CONFIG_C_REFERENCE_LLM_POS_DISABLED cb_map[CSINN_OP_LLM_POS][i].exec = shl_ref_llm_pos_quant; + cb_map[CSINN_OP_LLM_POS][i].perf = shl_ref_llm_pos_perf; #endif #ifndef CONFIG_C_REFERENCE_LOG_SOFTMAX_DISABLED cb_map[CSINN_OP_LOG_SOFTMAX][i].exec = shl_ref_log_softmax_quant; + cb_map[CSINN_OP_LOG_SOFTMAX][i].perf = shl_ref_log_softmax_perf; #endif #ifndef CONFIG_C_REFERENCE_LOG_DISABLED cb_map[CSINN_OP_LOG][i].exec = shl_ref_log_quant; + cb_map[CSINN_OP_LOG][i].perf = shl_ref_log_perf; #endif #ifndef CONFIG_C_REFERENCE_LOG1P_DISABLED cb_map[CSINN_OP_LOG1P][i].exec = shl_ref_log1p_quant; + cb_map[CSINN_OP_LOG1P][i].perf = shl_ref_log1p_perf; #endif #ifndef CONFIG_C_REFERENCE_LOGICAL_AND_DISABLED cb_map[CSINN_OP_LOGICAL_AND][i].exec = shl_ref_logical_and_quant; + cb_map[CSINN_OP_LOGICAL_AND][i].perf = shl_ref_logical_and_perf; #endif #ifndef CONFIG_C_REFERENCE_LOGICAL_NOT_DISABLED cb_map[CSINN_OP_LOGICAL_NOT][i].exec = shl_ref_logical_not_quant; + cb_map[CSINN_OP_LOGICAL_NOT][i].perf = shl_ref_logical_not_perf; #endif #ifndef CONFIG_C_REFERENCE_LOGICAL_OR_DISABLED cb_map[CSINN_OP_LOGICAL_OR][i].exec = shl_ref_logical_or_quant; + cb_map[CSINN_OP_LOGICAL_OR][i].perf = shl_ref_logical_or_perf; #endif #ifndef CONFIG_C_REFERENCE_LOGICAL_XOR_DISABLED cb_map[CSINN_OP_LOGICAL_XOR][i].exec = shl_ref_logical_xor_quant; + cb_map[CSINN_OP_LOGICAL_XOR][i].perf = shl_ref_logical_xor_perf; #endif #ifndef CONFIG_C_REFERENCE_LRN_DISABLED 
cb_map[CSINN_OP_LRN][i].exec = shl_ref_lrn_quant; + cb_map[CSINN_OP_LRN][i].perf = shl_ref_lrn_perf; #endif #ifndef CONFIG_C_REFERENCE_MATMUL_DISABLED cb_map[CSINN_OP_MATMUL][i].exec = shl_ref_matmul_quant; + cb_map[CSINN_OP_MATMUL][i].perf = shl_ref_matmul_perf; #endif #ifndef CONFIG_C_REFERENCE_MAX_DISABLED cb_map[CSINN_OP_MAX][i].exec = shl_ref_max_stride_quant; + cb_map[CSINN_OP_MAX][i].perf = shl_ref_max_stride_perf; #endif #ifndef CONFIG_C_REFERENCE_MAXIMUM_DISABLED cb_map[CSINN_OP_MAXIMUM][i].exec = shl_ref_maximum_quant; + cb_map[CSINN_OP_MAXIMUM][i].perf = shl_ref_maximum_perf; #endif #ifndef CONFIG_C_REFERENCE_MAXPOOL_DISABLED cb_map[CSINN_OP_MAXPOOL2D][i].exec = shl_ref_maxpool2d_quant; + cb_map[CSINN_OP_MAXPOOL2D][i].perf = shl_ref_maxpool2d_perf; #endif #ifndef CONFIG_C_REFERENCE_MAXPOOL2D_LOCAT_DISABLED cb_map[CSINN_OP_MAXPOOL2D_LOCAT][i].exec = shl_ref_maxpool2d_locat_quant; + cb_map[CSINN_OP_MAXPOOL2D_LOCAT][i].perf = shl_ref_maxpool2d_locat_perf; #endif #ifndef CONFIG_C_REFERENCE_MAXPOOL3D_DISABLED cb_map[CSINN_OP_MAXPOOL3D][i].exec = shl_ref_maxpool3d_quant; + cb_map[CSINN_OP_MAXPOOL3D][i].perf = shl_ref_maxpool3d_perf; #endif #ifndef CONFIG_C_REFERENCE_MEAN_DISABLED cb_map[CSINN_OP_MEAN][i].exec = shl_ref_mean_stride_quant; + cb_map[CSINN_OP_MEAN][i].perf = shl_ref_mean_stride_perf; cb_map[CSINN_OP_MEAN_STRIDE][i].exec = shl_ref_mean_stride_quant; + cb_map[CSINN_OP_MEAN_STRIDE][i].perf = shl_ref_mean_stride_perf; #endif #ifndef CONFIG_C_REFERENCE_MIN_DISABLED cb_map[CSINN_OP_MIN][i].exec = shl_ref_min_stride_quant; + cb_map[CSINN_OP_MIN][i].perf = shl_ref_min_stride_perf; #endif #ifndef CONFIG_C_REFERENCE_MINIMUM_DISABLED cb_map[CSINN_OP_MINIMUM][i].exec = shl_ref_minimum_quant; + cb_map[CSINN_OP_MINIMUM][i].perf = shl_ref_minimum_perf; #endif #ifndef CONFIG_C_REFERENCE_MOD_DISABLED cb_map[CSINN_OP_MOD][i].exec = shl_ref_mod_quant; + cb_map[CSINN_OP_MOD][i].perf = shl_ref_mod_perf; #endif #ifndef CONFIG_C_REFERENCE_MUL_DISABLED 
cb_map[CSINN_OP_MUL][i].exec = shl_ref_mul_quant; + cb_map[CSINN_OP_MUL][i].perf = shl_ref_mul_perf; #endif #ifndef CONFIG_C_REFERENCE_NEGATIVE_DISABLED cb_map[CSINN_OP_NEGATIVE][i].exec = shl_ref_negative_quant; + cb_map[CSINN_OP_NEGATIVE][i].perf = shl_ref_negative_perf; #endif #ifndef CONFIG_C_REFERENCE_NOT_EQUAL_DISABLED cb_map[CSINN_OP_NOT_EQUAL][i].exec = shl_ref_not_equal_quant; + cb_map[CSINN_OP_NOT_EQUAL][i].perf = shl_ref_not_equal_perf; #endif #ifndef CONFIG_C_REFERENCE_PAD_DISABLED cb_map[CSINN_OP_PAD][i].exec = shl_ref_pad_quant; + cb_map[CSINN_OP_PAD][i].perf = shl_ref_pad_perf; #endif #ifndef CONFIG_C_REFERENCE_POWER_DISABLED cb_map[CSINN_OP_POWER][i].exec = shl_ref_power_quant; + cb_map[CSINN_OP_POWER][i].perf = shl_ref_power_perf; #endif #ifndef CONFIG_C_REFERENCE_PRELU_DISABLED cb_map[CSINN_OP_PRELU][i].exec = shl_ref_prelu_quant; + cb_map[CSINN_OP_PRELU][i].perf = shl_ref_prelu_perf; #endif #ifndef CONFIG_C_REFERENCE_PROD_DISABLED cb_map[CSINN_OP_PROD][i].exec = shl_ref_prod_stride_quant; + cb_map[CSINN_OP_PROD][i].perf = shl_ref_prod_stride_perf; #endif #ifndef CONFIG_C_REFERENCE_PROPOSAL_DISABLED cb_map[CSINN_OP_PROPOSAL][i].exec = shl_ref_proposal_quant; + cb_map[CSINN_OP_PROPOSAL][i].perf = shl_ref_proposal_perf; #endif #ifndef CONFIG_C_REFERENCE_PSROIPOOLING_DISABLED cb_map[CSINN_OP_PSROIPOOLING][i].exec = shl_ref_psroipooling_quant; + cb_map[CSINN_OP_PSROIPOOLING][i].perf = shl_ref_psroipooling_perf; #endif #ifndef CONFIG_C_REFERENCE_REDUCE_LOGSUMEXP_DISABLED cb_map[CSINN_OP_REDUCE_LOGSUMEXP][i].exec = shl_ref_reduce_logsumexp_quant; + cb_map[CSINN_OP_REDUCE_LOGSUMEXP][i].perf = shl_ref_reduce_logsumexp_perf; #endif #ifndef CONFIG_C_REFERENCE_REDUCE_MAX_DISABLED cb_map[CSINN_OP_REDUCE_MAX][i].exec = shl_ref_reduce_max_quant; + cb_map[CSINN_OP_REDUCE_MAX][i].perf = shl_ref_reduce_max_perf; #endif #ifndef CONFIG_C_REFERENCE_REDUCE_MEAN_DISABLED cb_map[CSINN_OP_REDUCE_MEAN][i].exec = shl_ref_reduce_mean_quant; + 
cb_map[CSINN_OP_REDUCE_MEAN][i].perf = shl_ref_reduce_mean_perf; #endif #ifndef CONFIG_C_REFERENCE_REDUCE_MIN_DISABLED cb_map[CSINN_OP_REDUCE_MIN][i].exec = shl_ref_reduce_min_quant; + cb_map[CSINN_OP_REDUCE_MIN][i].perf = shl_ref_reduce_min_perf; #endif #ifndef CONFIG_C_REFERENCE_REDUCE_PROD_DISABLED cb_map[CSINN_OP_REDUCE_PROD][i].exec = shl_ref_reduce_prod_quant; + cb_map[CSINN_OP_REDUCE_PROD][i].perf = shl_ref_reduce_prod_perf; #endif #ifndef CONFIG_C_REFERENCE_REDUCE_SUM_DISABLED cb_map[CSINN_OP_REDUCE_SUM][i].exec = shl_ref_reduce_sum_quant; + cb_map[CSINN_OP_REDUCE_SUM][i].perf = shl_ref_reduce_sum_perf; #endif #ifndef CONFIG_C_REFERENCE_RELU_DISABLED cb_map[CSINN_OP_RELU][i].exec = shl_ref_relu_quant; + cb_map[CSINN_OP_RELU][i].perf = shl_ref_relu_perf; #endif #ifndef CONFIG_C_REFERENCE_RELU1_DISABLED cb_map[CSINN_OP_RELU1][i].exec = shl_ref_relu1_quant; + cb_map[CSINN_OP_RELU1][i].perf = shl_ref_relu1_perf; #endif #ifndef CONFIG_C_REFERENCE_RELU6_DISABLED cb_map[CSINN_OP_RELU6][i].exec = shl_ref_relu6_quant; + cb_map[CSINN_OP_RELU6][i].perf = shl_ref_relu6_perf; #endif #ifndef CONFIG_C_REFERENCE_RELUN_DISABLED cb_map[CSINN_OP_RELUN][i].exec = shl_ref_relun_quant; + cb_map[CSINN_OP_RELUN][i].perf = shl_ref_relun_perf; #endif #ifndef CONFIG_C_REFERENCE_RESHAPE_DISABLED cb_map[CSINN_OP_RESHAPE][i].exec = shl_ref_reshape; + cb_map[CSINN_OP_RESHAPE][i].perf = shl_ref_reshape_perf; cb_map[CSINN_OP_RESHAPE][i].init = shl_ref_reshape_init; #endif #ifndef CONFIG_C_REFERENCE_RESIZE_DISABLED cb_map[CSINN_OP_RESIZE][i].exec = shl_ref_resize_quant; + cb_map[CSINN_OP_RESIZE][i].perf = shl_ref_resize_perf; #endif #ifndef CONFIG_C_REFERENCE_REVERSE_DISABLED cb_map[CSINN_OP_REVERSE][i].exec = shl_ref_reverse_quant; + cb_map[CSINN_OP_REVERSE][i].perf = shl_ref_reverse_perf; #endif #ifndef CONFIG_C_REFERENCE_ROIPOOL_DISABLED cb_map[CSINN_OP_ROIPOOL][i].exec = shl_ref_roipool_quant; + cb_map[CSINN_OP_ROIPOOL][i].perf = shl_ref_roipool_perf; #endif #ifndef 
CONFIG_C_REFERENCE_ROPE_DISABLED cb_map[CSINN_OP_ROPE][i].exec = shl_ref_rope_quant; + cb_map[CSINN_OP_ROPE][i].perf = shl_ref_rope_perf; #endif #ifndef CONFIG_C_REFERENCE_ROUND_DISABLED cb_map[CSINN_OP_ROUND][i].exec = shl_ref_round_quant; + cb_map[CSINN_OP_ROUND][i].perf = shl_ref_round_perf; #endif #ifndef CONFIG_C_REFERENCE_RSQRT_DISABLED cb_map[CSINN_OP_RSQRT][i].exec = shl_ref_rsqrt_quant; + cb_map[CSINN_OP_RSQRT][i].perf = shl_ref_rsqrt_perf; #endif #ifndef CONFIG_C_REFERENCE_SEGMENT_MAX_DISABLED cb_map[CSINN_OP_SEGMENT_MAX][i].exec = shl_ref_segment_max_quant; + cb_map[CSINN_OP_SEGMENT_MAX][i].perf = shl_ref_segment_max_perf; cb_map[CSINN_OP_UNSORTED_SEGMENT_MAX][i].exec = shl_ref_unsorted_segment_max_quant; + cb_map[CSINN_OP_UNSORTED_SEGMENT_MAX][i].perf = shl_ref_unsorted_segment_max_perf; #endif #ifndef CONFIG_C_REFERENCE_SEGMENT_MEAN_DISABLED cb_map[CSINN_OP_SEGMENT_MEAN][i].exec = shl_ref_segment_mean_quant; + cb_map[CSINN_OP_SEGMENT_MEAN][i].perf = shl_ref_segment_mean_perf; cb_map[CSINN_OP_UNSORTED_SEGMENT_MEAN][i].exec = shl_ref_unsorted_segment_mean_quant; + cb_map[CSINN_OP_UNSORTED_SEGMENT_MEAN][i].perf = shl_ref_unsorted_segment_mean_perf; #endif #ifndef CONFIG_C_REFERENCE_SEGMENT_MIN_DISABLED cb_map[CSINN_OP_SEGMENT_MIN][i].exec = shl_ref_segment_min_quant; + cb_map[CSINN_OP_SEGMENT_MIN][i].perf = shl_ref_segment_min_perf; cb_map[CSINN_OP_UNSORTED_SEGMENT_MIN][i].exec = shl_ref_unsorted_segment_min_quant; + cb_map[CSINN_OP_UNSORTED_SEGMENT_MIN][i].perf = shl_ref_unsorted_segment_min_perf; #endif #ifndef CONFIG_C_REFERENCE_SEGMENT_PROD_DISABLED cb_map[CSINN_OP_SEGMENT_PROD][i].exec = shl_ref_segment_prod_quant; + cb_map[CSINN_OP_SEGMENT_PROD][i].perf = shl_ref_segment_prod_perf; cb_map[CSINN_OP_UNSORTED_SEGMENT_PROD][i].exec = shl_ref_unsorted_segment_prod_quant; + cb_map[CSINN_OP_UNSORTED_SEGMENT_PROD][i].perf = shl_ref_unsorted_segment_prod_perf; #endif #ifndef CONFIG_C_REFERENCE_SEGMENT_SUM_DISABLED cb_map[CSINN_OP_SEGMENT_SUM][i].exec = 
shl_ref_segment_sum_quant; + cb_map[CSINN_OP_SEGMENT_SUM][i].perf = shl_ref_segment_sum_perf; cb_map[CSINN_OP_UNSORTED_SEGMENT_SUM][i].exec = shl_ref_unsorted_segment_sum_quant; + cb_map[CSINN_OP_UNSORTED_SEGMENT_SUM][i].perf = shl_ref_unsorted_segment_sum_perf; #endif #ifndef CONFIG_C_REFERENCE_SHUFFLE_CHANNEL_DISABLED cb_map[CSINN_OP_SHUFFLE_CHANNEL][i].exec = shl_ref_shuffle_channel_quant; + cb_map[CSINN_OP_SHUFFLE_CHANNEL][i].perf = shl_ref_shuffle_channel_perf; #endif #ifndef CONFIG_C_REFERENCE_SIGMOID_DISABLED cb_map[CSINN_OP_SIGMOID][i].exec = shl_ref_sigmoid_quant; + cb_map[CSINN_OP_SIGMOID][i].perf = shl_ref_sigmoid_perf; #endif #ifndef CONFIG_C_REFERENCE_SILU_DISABLED cb_map[CSINN_OP_SILU][i].exec = shl_ref_silu_quant; + cb_map[CSINN_OP_SILU][i].perf = shl_ref_silu_perf; #endif #ifndef CONFIG_C_REFERENCE_SIGN_DISABLED cb_map[CSINN_OP_SIGN][i].exec = shl_ref_sign_quant; + cb_map[CSINN_OP_SIGN][i].perf = shl_ref_sign_perf; #endif #ifndef CONFIG_C_REFERENCE_SIN_DISABLED cb_map[CSINN_OP_SIN][i].exec = shl_ref_sin_quant; + cb_map[CSINN_OP_SIN][i].perf = shl_ref_sin_perf; #endif #ifndef CONFIG_C_REFERENCE_SINH_DISABLED cb_map[CSINN_OP_SINH][i].exec = shl_ref_sinh_quant; + cb_map[CSINN_OP_SINH][i].perf = shl_ref_sinh_perf; #endif #ifndef CONFIG_C_REFERENCE_SLICE_DISABLED cb_map[CSINN_OP_SLICE][i].exec = shl_ref_slice_quant; + cb_map[CSINN_OP_SLICE][i].perf = shl_ref_slice_perf; #endif #ifndef CONFIG_C_REFERENCE_SOFTMAX_DISABLED cb_map[CSINN_OP_SOFTMAX][i].exec = shl_ref_softmax_quant; + cb_map[CSINN_OP_SOFTMAX][i].perf = shl_ref_softmax_perf; #endif #ifndef CONFIG_C_REFERENCE_SOFTPLUS_DISABLED cb_map[CSINN_OP_SOFTPLUS][i].exec = shl_ref_softplus_quant; + cb_map[CSINN_OP_SOFTPLUS][i].perf = shl_ref_softplus_perf; #endif #ifndef CONFIG_C_REFERENCE_SOFTRELU_DISABLED cb_map[CSINN_OP_SOFTRELU][i].exec = shl_ref_softrelu_quant; + cb_map[CSINN_OP_SOFTRELU][i].perf = shl_ref_softrelu_perf; #endif #ifndef CONFIG_C_REFERENCE_SOFTSIGN_DISABLED 
cb_map[CSINN_OP_SOFTSIGN][i].exec = shl_ref_softsign_quant; + cb_map[CSINN_OP_SOFTSIGN][i].perf = shl_ref_softsign_perf; #endif #ifndef CONFIG_C_REFERENCE_SPACE_TO_BATCH_DISABLED cb_map[CSINN_OP_SPACE_TO_BATCH][i].exec = shl_ref_space_to_batch_quant; + cb_map[CSINN_OP_SPACE_TO_BATCH][i].perf = shl_ref_space_to_batch_perf; #endif #ifndef CONFIG_C_REFERENCE_SPACE_TO_DEPTH_DISABLED cb_map[CSINN_OP_SPACE_TO_DEPTH][i].exec = shl_ref_space_to_depth_quant; + cb_map[CSINN_OP_SPACE_TO_DEPTH][i].perf = shl_ref_space_to_depth_perf; #endif #ifndef CONFIG_C_REFERENCE_SQRT_DISABLED cb_map[CSINN_OP_SQRT][i].exec = shl_ref_sqrt_quant; + cb_map[CSINN_OP_SQRT][i].perf = shl_ref_sqrt_perf; #endif #ifndef CONFIG_C_REFERENCE_SQUARE_DISABLED cb_map[CSINN_OP_SQUARE][i].exec = shl_ref_square_quant; + cb_map[CSINN_OP_SQUARE][i].perf = shl_ref_square_perf; #endif #ifndef CONFIG_C_REFERENCE_SQUEEZE_DISABLED cb_map[CSINN_OP_SQUEEZE][i].exec = shl_ref_squeeze_quant; + cb_map[CSINN_OP_SQUEEZE][i].perf = shl_ref_squeeze_perf; #endif #ifndef CONFIG_C_REFERENCE_STACK_DISABLED cb_map[CSINN_OP_STACK][i].exec = shl_ref_stack_quant; + cb_map[CSINN_OP_STACK][i].perf = shl_ref_stack_perf; #endif #ifndef CONFIG_C_REFERENCE_STRIDED_SLICE_DISABLED cb_map[CSINN_OP_STRIDED_SLICE][i].exec = shl_ref_strided_slice_quant; + cb_map[CSINN_OP_STRIDED_SLICE][i].perf = shl_ref_strided_slice_perf; #endif #ifndef CONFIG_C_REFERENCE_SUB_DISABLED cb_map[CSINN_OP_SUB][i].exec = shl_ref_sub_quant; + cb_map[CSINN_OP_SUB][i].perf = shl_ref_sub_perf; #endif #ifndef CONFIG_C_REFERENCE_SUM_DISABLED cb_map[CSINN_OP_SUM][i].exec = shl_ref_sum_stride_quant; + cb_map[CSINN_OP_SUM][i].perf = shl_ref_sum_stride_perf; #endif #ifndef CONFIG_C_REFERENCE_TAN_DISABLED cb_map[CSINN_OP_TAN][i].exec = shl_ref_tan_quant; + cb_map[CSINN_OP_TAN][i].perf = shl_ref_tan_perf; #endif #ifndef CONFIG_C_REFERENCE_TANH_DISABLED cb_map[CSINN_OP_TANH][i].exec = shl_ref_tanh_quant; + cb_map[CSINN_OP_TANH][i].perf = shl_ref_tanh_perf; #endif #ifndef 
CONFIG_C_REFERENCE_THRESHOLD_RELU_DISABLED cb_map[CSINN_OP_THRESHOLD_RELU][i].exec = shl_ref_threshold_relu_quant; + cb_map[CSINN_OP_THRESHOLD_RELU][i].perf = shl_ref_threshold_relu_perf; #endif #ifndef CONFIG_C_REFERENCE_TILE_DISABLED cb_map[CSINN_OP_TILE][i].exec = shl_ref_tile_quant; + cb_map[CSINN_OP_TILE][i].perf = shl_ref_tile_perf; #endif #ifndef CONFIG_C_REFERENCE_TOPK_DISABLED cb_map[CSINN_OP_TOPK][i].exec = shl_ref_topk_quant; + cb_map[CSINN_OP_TOPK][i].perf = shl_ref_topk_perf; #endif #ifndef CONFIG_C_REFERENCE_TRANSPOSE_DISABLED cb_map[CSINN_OP_TRANSPOSE][i].exec = shl_ref_transpose; + cb_map[CSINN_OP_TRANSPOSE][i].perf = shl_ref_transpose_perf; cb_map[CSINN_OP_TRANSPOSE][i].init = shl_ref_transpose_init; #endif #ifndef CONFIG_C_REFERENCE_TRUNC_DISABLED cb_map[CSINN_OP_TRUNC][i].exec = shl_ref_trunc_quant; + cb_map[CSINN_OP_TRUNC][i].perf = shl_ref_trunc_perf; #endif #ifndef CONFIG_C_REFERENCE_UNPOOLING_DISABLED cb_map[CSINN_OP_UNPOOLING][i].exec = shl_ref_unpooling_quant; + cb_map[CSINN_OP_UNPOOLING][i].perf = shl_ref_unpooling_perf; #endif #ifndef CONFIG_C_REFERENCE_YUV_RGB_SCALE_DISABLED cb_map[CSINN_OP_YUV_RGB_SCALE][i].exec = shl_ref_yuv_rgb_scale_quant; + cb_map[CSINN_OP_YUV_RGB_SCALE][i].perf = shl_ref_yuv_rgb_scale_perf; #endif #ifndef CONFIG_C_REFERENCE_CONVOLUTION_DISABLED cb_map[CSINN_OP_CONV2D][i].exec = shl_ref_conv2d_quant; + cb_map[CSINN_OP_CONV2D][i].perf = shl_ref_conv2d_perf; cb_map[CSINN_OP_DEPTHWISE_CONV2D][i].exec = shl_ref_depthwise_conv2d_quant; + cb_map[CSINN_OP_DEPTHWISE_CONV2D][i].perf = shl_ref_depthwise_conv2d_perf; cb_map[CSINN_OP_GROUP_CONV2D][i].exec = shl_ref_group_conv2d_quant; + cb_map[CSINN_OP_GROUP_CONV2D][i].perf = shl_ref_group_conv2d_perf; #endif #ifndef CONFIG_C_REFERENCE_CONVOLUTION_RELU_DISABLED cb_map[CSINN_OP_CONV2D_RELU][i].exec = shl_ref_conv2d_relu_quant; + cb_map[CSINN_OP_CONV2D_RELU][i].perf = shl_ref_conv2d_relu_perf; cb_map[CSINN_OP_DEPTHWISE_CONV2D_RELU][i].exec = shl_ref_depthwise_conv2d_relu_quant; + 
cb_map[CSINN_OP_DEPTHWISE_CONV2D_RELU][i].perf = shl_ref_depthwise_conv2d_relu_perf; cb_map[CSINN_OP_GROUP_CONV2D_RELU][i].exec = shl_ref_group_conv2d_relu_quant; + cb_map[CSINN_OP_GROUP_CONV2D_RELU][i].perf = shl_ref_group_conv2d_relu_perf; #endif #ifndef CONFIG_C_REFERENCE_CONVOLUTION_RELU6_DISABLED cb_map[CSINN_OP_CONV2D_RELU6][i].exec = shl_ref_conv2d_relu6_quant; + cb_map[CSINN_OP_CONV2D_RELU6][i].perf = shl_ref_conv2d_relu6_perf; cb_map[CSINN_OP_DEPTHWISE_CONV2D_RELU6][i].exec = shl_ref_depthwise_conv2d_relu6_quant; + cb_map[CSINN_OP_DEPTHWISE_CONV2D_RELU6][i].perf = shl_ref_depthwise_conv2d_relu6_perf; cb_map[CSINN_OP_GROUP_CONV2D_RELU6][i].exec = shl_ref_group_conv2d_relu6_quant; + cb_map[CSINN_OP_GROUP_CONV2D_RELU6][i].perf = shl_ref_group_conv2d_relu6_perf; #endif #ifndef CONFIG_C_REFERENCE_CONVOLUTION_CHANNEL_DISABLED cb_map[CSINN_OP_CONV2D_CHANNEL][i].exec = shl_ref_conv2d_channel_quant; + cb_map[CSINN_OP_CONV2D_CHANNEL][i].perf = shl_ref_conv2d_channel_perf; cb_map[CSINN_OP_CONV2D_CHANNEL_RELU][i].exec = shl_ref_conv2d_channel_relu_quant; + cb_map[CSINN_OP_CONV2D_CHANNEL_RELU][i].perf = shl_ref_conv2d_channel_relu_perf; cb_map[CSINN_OP_CONV2D_CHANNEL_RELU6][i].exec = shl_ref_conv2d_channel_relu6_quant; + cb_map[CSINN_OP_CONV2D_CHANNEL_RELU6][i].perf = shl_ref_conv2d_channel_relu6_perf; cb_map[CSINN_OP_DEPTHWISE_CONV2D_CHANNEL][i].exec = shl_ref_depthwise_conv2d_channel_quant; + cb_map[CSINN_OP_DEPTHWISE_CONV2D_CHANNEL][i].perf = shl_ref_depthwise_conv2d_channel_perf; cb_map[CSINN_OP_DEPTHWISE_CONV2D_CHANNEL_RELU][i].exec = shl_ref_depthwise_conv2d_channel_relu_quant; + cb_map[CSINN_OP_DEPTHWISE_CONV2D_CHANNEL_RELU][i].perf = + shl_ref_depthwise_conv2d_channel_relu_perf; cb_map[CSINN_OP_DEPTHWISE_CONV2D_CHANNEL_RELU6][i].exec = shl_ref_depthwise_conv2d_channel_relu6_quant; + cb_map[CSINN_OP_DEPTHWISE_CONV2D_CHANNEL_RELU6][i].perf = + shl_ref_depthwise_conv2d_channel_relu6_perf; cb_map[CSINN_OP_GROUP_CONV2D_CHANNEL][i].exec = 
shl_ref_group_conv2d_channel_quant; + cb_map[CSINN_OP_GROUP_CONV2D_CHANNEL][i].perf = shl_ref_group_conv2d_channel_perf; cb_map[CSINN_OP_GROUP_CONV2D_CHANNEL_RELU][i].exec = shl_ref_group_conv2d_channel_relu_quant; + cb_map[CSINN_OP_GROUP_CONV2D_CHANNEL_RELU][i].perf = shl_ref_group_conv2d_channel_relu_perf; #endif #ifndef CONFIG_C_REFERENCE_CONVOLUTION3D_DISABLED cb_map[CSINN_OP_CONV3D][i].exec = shl_ref_conv3d_quant; + cb_map[CSINN_OP_CONV3D][i].perf = shl_ref_conv3d_perf; #endif #ifndef CONFIG_C_REFERENCE_DECONVOLUTION_DISABLED cb_map[CSINN_OP_DECONV2D][i].exec = shl_ref_deconv2d_quant; + cb_map[CSINN_OP_DECONV2D][i].perf = shl_ref_deconv2d_perf; cb_map[CSINN_OP_DEPTHWISE_DECONV2D][i].exec = shl_ref_depthwise_deconv2d_quant; + cb_map[CSINN_OP_DEPTHWISE_DECONV2D][i].perf = shl_ref_depthwise_deconv2d_perf; cb_map[CSINN_OP_GROUP_DECONV2D][i].exec = shl_ref_group_deconv2d_quant; + cb_map[CSINN_OP_GROUP_DECONV2D][i].perf = shl_ref_group_deconv2d_perf; #endif #ifndef CONFIG_C_REFERENCE_DECONVOLUTION3D_DISABLED cb_map[CSINN_OP_DECONV3D][i].exec = shl_ref_deconv3d_quant; + cb_map[CSINN_OP_DECONV3D][i].perf = shl_ref_deconv3d_perf; #endif #ifndef CONFIG_C_REFERENCE_FULLYCONNECTED_DISABLED cb_map[CSINN_OP_FULLYCONNECTED][i].exec = shl_ref_fullyconnected_quant; + cb_map[CSINN_OP_FULLYCONNECTED][i].perf = shl_ref_fullyconnected_perf; #endif #ifndef CONFIG_C_REFERENCE_SCATTER_DISABLED cb_map[CSINN_OP_SCATTER_ND][i].exec = shl_ref_scatter_nd_quant; + cb_map[CSINN_OP_SCATTER_ND][i].perf = shl_ref_scatter_nd_perf; #endif #ifndef CONFIG_C_REFERENCE_SPLIT_DISABLED cb_map[CSINN_OP_SPLIT][i].exec = shl_ref_split_quant; + cb_map[CSINN_OP_SPLIT][i].perf = shl_ref_split_perf; #endif #ifndef CONFIG_C_REFERENCE_ONE_HOT_DISABLED cb_map[CSINN_OP_ONE_HOT][i].exec = shl_ref_one_hot_quant; + cb_map[CSINN_OP_ONE_HOT][i].perf = shl_ref_one_hot_perf; #endif #ifndef CONFIG_C_REFERENCE_WHERE_DISABLED cb_map[CSINN_OP_WHERE][i].exec = shl_ref_where_quant; + cb_map[CSINN_OP_WHERE][i].perf = 
shl_ref_where_perf; #endif #ifndef CONFIG_C_REFERENCE_WHERE_SOFTMAX_DISABLED cb_map[CSINN_OP_WHERE_SOFTMAX][i].exec = shl_ref_where_softmax_quant; + cb_map[CSINN_OP_WHERE_SOFTMAX][i].perf = shl_ref_where_softmax_perf; #endif #ifndef CONFIG_C_REFERENCE_INSTANCE_NORM_DISABLED cb_map[CSINN_OP_INSTANCE_NORM][i].exec = shl_ref_instance_norm_quant; + cb_map[CSINN_OP_INSTANCE_NORM][i].perf = shl_ref_instance_norm_perf; #endif #ifndef CONFIG_C_REFERENCE_RMS_NORM_DISABLED cb_map[CSINN_OP_RMS_NORM][i].exec = shl_ref_rms_norm_quant; + cb_map[CSINN_OP_RMS_NORM][i].perf = shl_ref_rms_norm_perf; #endif } for (int i = CSINN_DTYPE_INT4; i < CSINN_DTYPE_FLOAT64; i++) { #ifndef CONFIG_C_REFERENCE_DATA_CONVERT_DISABLED cb_map[CSINN_OP_DATA_CONVERT][i].exec = shl_ref_data_convert_quant; + cb_map[CSINN_OP_DATA_CONVERT][i].perf = shl_ref_data_convert_perf; #endif } #ifndef CONFIG_C_REFERENCE_RESHAPE_DISABLED cb_map[CSINN_OP_RESHAPE][CSINN_DTYPE_INT64].exec = shl_ref_reshape; + cb_map[CSINN_OP_RESHAPE][CSINN_DTYPE_INT64].perf = shl_ref_reshape_perf; cb_map[CSINN_OP_RESHAPE][CSINN_DTYPE_INT64].init = shl_ref_reshape_init; cb_map[CSINN_OP_RESHAPE][CSINN_DTYPE_BOOL].exec = shl_ref_reshape; + cb_map[CSINN_OP_RESHAPE][CSINN_DTYPE_BOOL].perf = shl_ref_reshape_perf; cb_map[CSINN_OP_RESHAPE][CSINN_DTYPE_BOOL].init = shl_ref_reshape_init; #endif #ifndef CONFIG_C_REFERENCE_CONCAT_DISABLED cb_map[CSINN_OP_CONCAT][CSINN_DTYPE_INT64].exec = shl_ref_concat_quant; + cb_map[CSINN_OP_CONCAT][CSINN_DTYPE_INT64].perf = shl_ref_concat_perf; #endif #ifndef CONFIG_C_REFERENCE_MUL_DISABLED cb_map[CSINN_OP_MUL][CSINN_DTYPE_INT64].exec = shl_ref_mul_quant; + cb_map[CSINN_OP_MUL][CSINN_DTYPE_INT64].perf = shl_ref_mul_perf; #endif #ifndef CONFIG_C_REFERENCE_ADD_DISABLED cb_map[CSINN_OP_ADD][CSINN_DTYPE_INT64].exec = shl_ref_add_quant; + cb_map[CSINN_OP_ADD][CSINN_DTYPE_INT64].perf = shl_ref_add_perf; #endif #ifndef CONFIG_C_REFERENCE_AND_DISABLED cb_map[CSINN_OP_AND][CSINN_DTYPE_UINT8].exec = shl_ref_and_u8; + 
cb_map[CSINN_OP_AND][CSINN_DTYPE_UINT8].perf = shl_ref_and_perf; cb_map[CSINN_OP_AND][CSINN_DTYPE_INT8].exec = shl_ref_and_i8; + cb_map[CSINN_OP_AND][CSINN_DTYPE_INT8].perf = shl_ref_and_perf; cb_map[CSINN_OP_AND][CSINN_DTYPE_UINT32].exec = shl_ref_and_u32; + cb_map[CSINN_OP_AND][CSINN_DTYPE_UINT32].perf = shl_ref_and_perf; #endif #ifndef CONFIG_C_REFERENCE_NDARRAY_SIZE_DISABLED cb_map[CSINN_OP_NDARRAY_SIZE][CSINN_DTYPE_UINT8].exec = shl_ref_ndarray_size_u8; + cb_map[CSINN_OP_NDARRAY_SIZE][CSINN_DTYPE_UINT8].perf = shl_ref_ndarray_size_perf; cb_map[CSINN_OP_NDARRAY_SIZE][CSINN_DTYPE_INT8].exec = shl_ref_ndarray_size_i8; + cb_map[CSINN_OP_NDARRAY_SIZE][CSINN_DTYPE_INT8].perf = shl_ref_ndarray_size_perf; cb_map[CSINN_OP_NDARRAY_SIZE][CSINN_DTYPE_INT32].exec = shl_ref_ndarray_size_i32; + cb_map[CSINN_OP_NDARRAY_SIZE][CSINN_DTYPE_INT32].perf = shl_ref_ndarray_size_perf; cb_map[CSINN_OP_NDARRAY_SIZE][CSINN_DTYPE_FLOAT32].exec = shl_ref_ndarray_size_f32; + cb_map[CSINN_OP_NDARRAY_SIZE][CSINN_DTYPE_FLOAT32].perf = shl_ref_ndarray_size_perf; #endif #ifndef CONFIG_C_REFERENCE_NOT_DISABLED cb_map[CSINN_OP_NOT][CSINN_DTYPE_UINT8].exec = shl_ref_not_u8; + cb_map[CSINN_OP_NOT][CSINN_DTYPE_UINT8].perf = shl_ref_not_perf; cb_map[CSINN_OP_NOT][CSINN_DTYPE_INT8].exec = shl_ref_not_i8; + cb_map[CSINN_OP_NOT][CSINN_DTYPE_INT8].perf = shl_ref_not_perf; cb_map[CSINN_OP_NOT][CSINN_DTYPE_UINT32].exec = shl_ref_not_u32; + cb_map[CSINN_OP_NOT][CSINN_DTYPE_UINT32].perf = shl_ref_not_perf; #endif #ifndef CONFIG_C_REFERENCE_OR_DISABLED cb_map[CSINN_OP_OR][CSINN_DTYPE_UINT8].exec = shl_ref_or_u8; + cb_map[CSINN_OP_OR][CSINN_DTYPE_UINT8].perf = shl_ref_or_perf; cb_map[CSINN_OP_OR][CSINN_DTYPE_INT8].exec = shl_ref_or_i8; + cb_map[CSINN_OP_OR][CSINN_DTYPE_INT8].perf = shl_ref_or_perf; cb_map[CSINN_OP_OR][CSINN_DTYPE_UINT32].exec = shl_ref_or_u32; + cb_map[CSINN_OP_OR][CSINN_DTYPE_UINT32].perf = shl_ref_or_perf; #endif #ifndef CONFIG_C_REFERENCE_SELECT_DISABLED 
cb_map[CSINN_OP_SELECT][CSINN_DTYPE_UINT8].exec = shl_ref_select_u8; + cb_map[CSINN_OP_SELECT][CSINN_DTYPE_UINT8].perf = shl_ref_select_perf; cb_map[CSINN_OP_SELECT][CSINN_DTYPE_INT8].exec = shl_ref_select_i8; + cb_map[CSINN_OP_SELECT][CSINN_DTYPE_INT8].perf = shl_ref_select_perf; cb_map[CSINN_OP_SELECT][CSINN_DTYPE_FLOAT32].exec = shl_ref_select_f32; + cb_map[CSINN_OP_SELECT][CSINN_DTYPE_FLOAT32].perf = shl_ref_select_perf; #endif #ifndef CONFIG_C_REFERENCE_SHAPE_DISABLED cb_map[CSINN_OP_SHAPE][CSINN_DTYPE_UINT8].exec = shl_ref_shape_u8; + cb_map[CSINN_OP_SHAPE][CSINN_DTYPE_UINT8].perf = shl_ref_shape_perf; cb_map[CSINN_OP_SHAPE][CSINN_DTYPE_INT8].exec = shl_ref_shape_i8; + cb_map[CSINN_OP_SHAPE][CSINN_DTYPE_INT8].perf = shl_ref_shape_perf; cb_map[CSINN_OP_SHAPE][CSINN_DTYPE_INT32].exec = shl_ref_shape_i32; + cb_map[CSINN_OP_SHAPE][CSINN_DTYPE_INT32].perf = shl_ref_shape_perf; #endif #ifndef CONFIG_C_REFERENCE_XOR_DISABLED cb_map[CSINN_OP_XOR][CSINN_DTYPE_UINT8].exec = shl_ref_xor_u8; + cb_map[CSINN_OP_XOR][CSINN_DTYPE_UINT8].perf = shl_ref_xor_perf; cb_map[CSINN_OP_XOR][CSINN_DTYPE_INT8].exec = shl_ref_xor_i8; + cb_map[CSINN_OP_XOR][CSINN_DTYPE_INT8].perf = shl_ref_xor_perf; cb_map[CSINN_OP_XOR][CSINN_DTYPE_UINT32].exec = shl_ref_xor_u32; + cb_map[CSINN_OP_XOR][CSINN_DTYPE_UINT32].perf = shl_ref_xor_perf; #endif #ifndef CONFIG_C_REFERENCE_NON_MAX_SUPPRESSION_DISABLED cb_map[CSINN_OP_NON_MAX_SUPPRESSION][CSINN_DTYPE_FLOAT32].exec = shl_ref_non_max_suppression_std; + cb_map[CSINN_OP_NON_MAX_SUPPRESSION][CSINN_DTYPE_FLOAT32].perf = + shl_ref_non_max_suppression_std_perf; #endif #ifndef CONFIG_C_REFERENCE_ROIALIGN_DISABLED cb_map[CSINN_OP_ROIALIGN][CSINN_DTYPE_FLOAT32].exec = shl_ref_roi_align_f32; + cb_map[CSINN_OP_ROIALIGN][CSINN_DTYPE_FLOAT32].perf = shl_ref_roi_align_perf; #endif #ifndef CONFIG_C_REFERENCE_SCATTER_DISABLED cb_map[CSINN_OP_SCATTER_ND][CSINN_DTYPE_FLOAT32].exec = shl_ref_scatter_nd_f32; + cb_map[CSINN_OP_SCATTER_ND][CSINN_DTYPE_FLOAT32].perf = 
shl_ref_scatter_nd_perf; #endif #ifndef CONFIG_C_REFERENCE_COL2IM_DISABLED cb_map[CSINN_OP_COL2IM][CSINN_DTYPE_FLOAT32].exec = shl_ref_col2im_f32; + cb_map[CSINN_OP_COL2IM][CSINN_DTYPE_FLOAT32].perf = shl_ref_col2im_perf; #endif #ifndef CONFIG_C_REFERENCE_ISNAN_DISABLED cb_map[CSINN_OP_ISNAN][CSINN_DTYPE_FLOAT32].exec = shl_ref_isnan_bool_f32; + cb_map[CSINN_OP_ISNAN][CSINN_DTYPE_FLOAT32].perf = shl_ref_isnan_bool_perf; #endif #ifndef CONFIG_C_REFERENCE_L2POOL_DISABLED cb_map[CSINN_OP_L2POOL2D][CSINN_DTYPE_FLOAT32].exec = shl_ref_l2pool_f32; + cb_map[CSINN_OP_L2POOL2D][CSINN_DTYPE_FLOAT32].perf = shl_ref_l2pool_perf; #endif #ifndef CONFIG_C_REFERENCE_WHERE_DISABLED cb_map[CSINN_OP_WHERE][CSINN_DTYPE_BOOL].exec = shl_ref_where_quant; + cb_map[CSINN_OP_WHERE][CSINN_DTYPE_BOOL].perf = shl_ref_where_perf; #endif #ifndef CONFIG_C_REFERENCE_WHERE_SOFTMAX_DISABLED cb_map[CSINN_OP_WHERE_SOFTMAX][CSINN_DTYPE_BOOL].exec = shl_ref_where_softmax_quant; + cb_map[CSINN_OP_WHERE_SOFTMAX][CSINN_DTYPE_BOOL].perf = shl_ref_where_softmax_perf; #endif #ifndef CONFIG_C_REFERENCE_INSTANCE_NORM_DISABLED cb_map[CSINN_OP_INSTANCE_NORM][CSINN_DTYPE_FLOAT32].exec = shl_ref_instance_norm_f32; + cb_map[CSINN_OP_INSTANCE_NORM][CSINN_DTYPE_FLOAT32].perf = shl_ref_instance_norm_perf; #endif #ifndef CONFIG_C_REFERENCE_RMS_NORM_DISABLED cb_map[CSINN_OP_RMS_NORM][CSINN_DTYPE_FLOAT32].exec = shl_ref_rms_norm_f32; + cb_map[CSINN_OP_RMS_NORM][CSINN_DTYPE_FLOAT32].perf = shl_ref_rms_norm_perf; #endif #ifndef CONFIG_C_REFERENCE_SCALED_DOT_PRODUCT_ATTENTION_DISABLED cb_map[CSINN_OP_SCALED_DOT_PRODUCT_ATTENTION][CSINN_DTYPE_FLOAT16].exec = shl_ref_scaled_dot_product_attention_quant; + cb_map[CSINN_OP_SCALED_DOT_PRODUCT_ATTENTION][CSINN_DTYPE_FLOAT16].perf = + shl_ref_scaled_dot_product_attention_perf; cb_map[CSINN_OP_SCALED_DOT_PRODUCT_ATTENTION][CSINN_DTYPE_FLOAT32].exec = shl_ref_scaled_dot_product_attention_f32; + cb_map[CSINN_OP_SCALED_DOT_PRODUCT_ATTENTION][CSINN_DTYPE_FLOAT32].perf = + 
shl_ref_scaled_dot_product_attention_perf; #endif #ifndef CONFIG_C_REFERENCE_CAST_DISABLED cb_map[CSINN_OP_CAST][CSINN_DTYPE_UINT8].exec = shl_ref_cast_quant; + cb_map[CSINN_OP_CAST][CSINN_DTYPE_UINT8].perf = shl_ref_cast_perf; cb_map[CSINN_OP_CAST][CSINN_DTYPE_INT8].exec = shl_ref_cast_quant; + cb_map[CSINN_OP_CAST][CSINN_DTYPE_INT8].perf = shl_ref_cast_perf; cb_map[CSINN_OP_CAST][CSINN_DTYPE_INT32].exec = shl_ref_cast_quant; + cb_map[CSINN_OP_CAST][CSINN_DTYPE_INT32].perf = shl_ref_cast_perf; cb_map[CSINN_OP_CAST][CSINN_DTYPE_FLOAT16].exec = shl_ref_cast_quant; + cb_map[CSINN_OP_CAST][CSINN_DTYPE_FLOAT16].perf = shl_ref_cast_perf; cb_map[CSINN_OP_CAST][CSINN_DTYPE_FLOAT32].exec = shl_ref_cast_f32; + cb_map[CSINN_OP_CAST][CSINN_DTYPE_FLOAT32].perf = shl_ref_cast_perf; cb_map[CSINN_OP_CAST][CSINN_DTYPE_BOOL].exec = shl_ref_cast_bool; + cb_map[CSINN_OP_CAST][CSINN_DTYPE_BOOL].perf = shl_ref_cast_perf; cb_map[CSINN_OP_CAST][CSINN_DTYPE_INT64].exec = shl_ref_cast_i64; + cb_map[CSINN_OP_CAST][CSINN_DTYPE_INT64].perf = shl_ref_cast_perf; #endif #ifdef SHL_BUILD_GREF diff --git a/source/thead_matrix/fullyconnected_fp16.c b/source/thead_matrix/fullyconnected_fp16.c index d9661eac..4146c8eb 100644 --- a/source/thead_matrix/fullyconnected_fp16.c +++ b/source/thead_matrix/fullyconnected_fp16.c @@ -142,6 +142,62 @@ void shl_rvm_fc_gemm_reorder_weight_fp16_w_int8(struct csinn_tensor *weights) shl_mem_free(weight_reorder); } +/************************************************************************************* + * Per-channel dequantize int8 -> fp16 + * mrows = rlenb / 4 + * m2rows = mrows * 2 + * mcols = rlenb / sizeof(__fp16) + * msize_n: m2rows, mrows, n_tail + * msize_k: mcols, k_tail + * weight: [n/msize_n, k/msize_k, msize_n, msize_k] + ************************************************************************************/ +void shl_rvm_fc_dequantize_per_channel_i8_to_f16(struct csinn_tensor *weights, + struct csinn_fc_params *params, + __fp16 *weights_fp16) +{ + int8_t 
*weights_int8 = (int8_t *)weights->data; + int n = weights->dim[0]; // out_nodes + int k = weights->dim[1]; // in_nodes + + int mrows = csrr_xrlenb() / 4; + int m2rows = mrows * 2; + int mcols = m2rows; + + int r = 0; + for (; r + m2rows - 1 < n; r += m2rows) { + int c = 0; + while (c < k) { + uint16_t msize_k = (k - c >= mcols) ? mcols : (k - c); + int8_t *w_src = weights_int8 + r * k + c * m2rows; + __fp16 *w_dst = weights_fp16 + r * k + c * m2rows; + for (int i = 0; i < m2rows; i++) { + int32_t zp = weights->qinfo[r + i].zero_point; + float scale = weights->qinfo[r + i].scale; + shl_rvv_dequantize_i8_to_f16(w_src + i * msize_k, w_dst + i * msize_k, msize_k, zp, + scale); + } + c += msize_k; + } + } + while (r < n) { + int msize_n = (n - r >= mrows) ? mrows : (n - r); + int c = 0; + while (c < k) { + uint16_t msize_k = (k - c >= mcols) ? mcols : (k - c); + int8_t *w_src = weights_int8 + r * k + c * msize_n; + __fp16 *w_dst = weights_fp16 + r * k + c * msize_n; + for (int i = 0; i < msize_n; i++) { + int32_t zp = weights->qinfo[r + i].zero_point; + float scale = weights->qinfo[r + i].scale; + shl_rvv_dequantize_i8_to_f16(w_src + i * msize_k, w_dst + i * msize_k, msize_k, zp, + scale); + } + c += msize_k; + } + r += msize_n; + } +} + int shl_rvm_fullyconnected_gemm_fp16(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_tensor *weights, struct csinn_tensor *bias, struct csinn_fc_params *params) @@ -179,14 +235,9 @@ int shl_rvm_fullyconnected_gemm_fp16(struct csinn_tensor *input, struct csinn_te float scale = weights->qinfo->scale; shl_rvv_dequantize_i8_to_f16(weights_int8, weights_fp16, size, zp, scale); } else if (weights->quant_channel == output_depth) { - // support channel quantization - for (int c = 0; c < output_depth; c++) { - int32_t zp = weights->qinfo[c].zero_point; - float scale = weights->qinfo[c].scale; - shl_rvv_dequantize_i8_to_f16(weights_int8 + c * accum_depth, - weights_fp16 + c * accum_depth, accum_depth, zp, - scale); - } + 
shl_rvm_fc_dequantize_per_channel_i8_to_f16(weights, params, weights_fp16); + } else { + shl_debug_error("%s unsupported quant_channel: %d\n", __func__, weights->quant_channel); } weights_data = weights_fp16; } else if (weights->dtype == CSINN_DTYPE_FLOAT16) { diff --git a/source/thead_matrix/setup.c b/source/thead_matrix/setup.c index 66489e2d..75a7a035 100644 --- a/source/thead_matrix/setup.c +++ b/source/thead_matrix/setup.c @@ -344,24 +344,6 @@ void shl_target_init_rvm() shl_rvm_reg_op(CSINN_DTYPE_INT8, CSINN_OP_DEPTHWISE_CONV2D_RELU6, shl_rvm_depthwise_conv2d_init_int8, NULL, shl_gref_depthwise_conv2d_relu6); - shl_rvm_reg_op(CSINN_DTYPE_FLOAT16, CSINN_OP_MAXPOOL2D, shl_rvm_maxpool2d_init_fp16, NULL, - shl_gref_maxpool2d); - shl_rvm_reg_op(CSINN_DTYPE_INT8, CSINN_OP_MAXPOOL2D, shl_rvm_maxpool2d_init_int8, NULL, - shl_gref_maxpool2d); - shl_rvm_reg_op(CSINN_DTYPE_FLOAT16, CSINN_OP_AVGPOOL2D, shl_rvm_avgpool2d_init_fp16, NULL, - shl_gref_avgpool2d); - shl_rvm_reg_op(CSINN_DTYPE_INT8, CSINN_OP_AVGPOOL2D, shl_rvm_avgpool2d_init_int8, NULL, - shl_gref_avgpool2d); - - shl_rvm_reg_op(CSINN_DTYPE_FLOAT16, CSINN_OP_GLOBAL_AVGPOOL2D, shl_rvm_global_avgpool2d_init, - NULL, shl_gref_global_avgpool2d); - shl_rvm_reg_op(CSINN_DTYPE_INT8, CSINN_OP_GLOBAL_AVGPOOL2D, shl_rvm_global_avgpool2d_init, NULL, - shl_gref_global_avgpool2d); - shl_rvm_reg_op(CSINN_DTYPE_FLOAT16, CSINN_OP_GLOBAL_MAXPOOL2D, shl_rvm_global_maxpool2d_init, - NULL, shl_gref_global_maxpool2d); - shl_rvm_reg_op(CSINN_DTYPE_INT8, CSINN_OP_GLOBAL_MAXPOOL2D, shl_rvm_global_maxpool2d_init, NULL, - shl_gref_global_maxpool2d); - shl_rvm_reg_op(CSINN_DTYPE_FLOAT16, CSINN_OP_FULLYCONNECTED, shl_rvm_fullyconnected_init_fp16, NULL, shl_gref_fullyconnected); shl_rvm_reg_op(CSINN_DTYPE_INT8, CSINN_OP_FULLYCONNECTED, shl_rvm_fullyconnected_init_int8, diff --git a/source/thead_rvv/CMakeLists.txt b/source/thead_rvv/CMakeLists.txt index 3f95bd14..9b206aa2 100644 --- a/source/thead_rvv/CMakeLists.txt +++ 
b/source/thead_rvv/CMakeLists.txt @@ -21,9 +21,7 @@ if(CONFIG_THEAD_RVV_ADD_INT8) endif() if(CONFIG_THEAD_RVV_AVERAGEPOOL_FP32) - list(APPEND THEAD_RVV_SRCS_MOD source/thead_rvv/fp32/avgpool_2x2_fp32_packn.c) list(APPEND THEAD_RVV_SRCS_MOD source/thead_rvv/fp32/avgpool_2x2_fp32.c) - list(APPEND THEAD_RVV_SRCS_MOD source/thead_rvv/fp32/avgpool_3x3_fp32_packn.c) list(APPEND THEAD_RVV_SRCS_MOD source/thead_rvv/fp32/avgpool_3x3_fp32.c) list(APPEND THEAD_RVV_SRCS_MOD source/thead_rvv/fp32/avgpool_fp32_nhwc.c) list(APPEND THEAD_RVV_SRCS_MOD source/thead_rvv/fp32/avgpool_fp32_packn.c) @@ -32,9 +30,7 @@ if(CONFIG_THEAD_RVV_AVERAGEPOOL_FP32) endif() if(CONFIG_THEAD_RVV_AVERAGEPOOL_FP16) - list(APPEND THEAD_RVV_SRCS_MOD source/thead_rvv/fp16/avgpool_2x2_fp16_packn.c) list(APPEND THEAD_RVV_SRCS_MOD source/thead_rvv/fp16/avgpool_2x2_fp16.c) - list(APPEND THEAD_RVV_SRCS_MOD source/thead_rvv/fp16/avgpool_3x3_fp16_packn.c) list(APPEND THEAD_RVV_SRCS_MOD source/thead_rvv/fp16/avgpool_3x3_fp16.c) list(APPEND THEAD_RVV_SRCS_MOD source/thead_rvv/fp16/avgpool_fp16_nhwc.c) list(APPEND THEAD_RVV_SRCS_MOD source/thead_rvv/fp16/avgpool_fp16_packn.c) @@ -42,8 +38,6 @@ if(CONFIG_THEAD_RVV_AVERAGEPOOL_FP16) endif() if(CONFIG_THEAD_RVV_AVERAGEPOOL_INT8) - list(APPEND THEAD_RVV_SRCS_MOD source/thead_rvv/int8/avgpool_2x2_int8_packn.c) - list(APPEND THEAD_RVV_SRCS_MOD source/thead_rvv/int8/avgpool_3x3_int8_packn.c) list(APPEND THEAD_RVV_SRCS_MOD source/thead_rvv/int8/avgpool_int8_nhwc.c) list(APPEND THEAD_RVV_SRCS_MOD source/thead_rvv/int8/avgpool_int8_packn.c) list(APPEND THEAD_RVV_SRCS_MOD source/thead_rvv/int8/avgpool.c) @@ -191,6 +185,10 @@ if(CONFIG_THEAD_RVV_DIV_INT8) list(APPEND THEAD_RVV_SRCS_MOD source/thead_rvv/int8/div.c) endif() +if(CONFIG_THEAD_RVV_EMBEDDING_INT32) + list(APPEND THEAD_RVV_SRCS_MOD source/thead_rvv/int32/embedding.c) +endif() + if(CONFIG_THEAD_RVV_ERF_FP32) list(APPEND THEAD_RVV_SRCS_MOD source/thead_rvv/fp32/erf.c) endif() @@ -203,6 +201,14 @@ 
if(CONFIG_THEAD_RVV_ERF_INT8) list(APPEND THEAD_RVV_SRCS_MOD source/thead_rvv/int8/erf.c) endif() +if(CONFIG_THEAD_RVV_EXPAND_DIMS_FP32) + list(APPEND THEAD_RVV_SRCS_MOD source/thead_rvv/fp32/expand_dims.c) +endif() + +if(CONFIG_THEAD_RVV_EXPAND_DIMS_FP16) + list(APPEND THEAD_RVV_SRCS_MOD source/thead_rvv/fp16/expand_dims.c) +endif() + if(CONFIG_THEAD_RVV_FULLYCONNECTED_FP32) list(APPEND THEAD_RVV_SRCS_MOD source/thead_rvv/fp32/gemm_fp32_a0b1.c) list(APPEND THEAD_RVV_SRCS_MOD source/thead_rvv/fp32/fullyconnected_fp32.c) @@ -317,6 +323,10 @@ if(CONFIG_THEAD_RVV_LEAKY_RELU_INT8) list(APPEND THEAD_RVV_SRCS_MOD source/thead_rvv/int8/leaky_relu.c) endif() +if(CONFIG_THEAD_RVV_LLM_POS_FP16) + list(APPEND THEAD_RVV_SRCS_MOD source/thead_rvv/fp16/llm_pos.c) +endif() + if(CONFIG_THEAD_RVV_MATMUL_FP32) list(APPEND THEAD_RVV_SRCS_MOD source/thead_rvv/fp32/matmul.c) list(APPEND THEAD_RVV_SRCS_MOD source/thead_rvv/fp32/gemm_fp32_block.c) @@ -334,9 +344,7 @@ if(CONFIG_THEAD_RVV_MATMUL_INT8) endif() if(CONFIG_THEAD_RVV_MAXPOOL_FP32) - list(APPEND THEAD_RVV_SRCS_MOD source/thead_rvv/fp32/maxpool_2x2_fp32_packn.c) list(APPEND THEAD_RVV_SRCS_MOD source/thead_rvv/fp32/maxpool_2x2_fp32.c) - list(APPEND THEAD_RVV_SRCS_MOD source/thead_rvv/fp32/maxpool_3x3_fp32_packn.c) list(APPEND THEAD_RVV_SRCS_MOD source/thead_rvv/fp32/maxpool_3x3_fp32.c) list(APPEND THEAD_RVV_SRCS_MOD source/thead_rvv/fp32/maxpool_fp32_packn.c) list(APPEND THEAD_RVV_SRCS_MOD source/thead_rvv/fp32/maxpool_fp32_nhwc.c) @@ -344,9 +352,7 @@ if(CONFIG_THEAD_RVV_MAXPOOL_FP32) endif() if(CONFIG_THEAD_RVV_MAXPOOL_FP16) - list(APPEND THEAD_RVV_SRCS_MOD source/thead_rvv/fp16/maxpool_2x2_fp16_packn.c) list(APPEND THEAD_RVV_SRCS_MOD source/thead_rvv/fp16/maxpool_2x2_fp16.c) - list(APPEND THEAD_RVV_SRCS_MOD source/thead_rvv/fp16/maxpool_3x3_fp16_packn.c) list(APPEND THEAD_RVV_SRCS_MOD source/thead_rvv/fp16/maxpool_3x3_fp16.c) list(APPEND THEAD_RVV_SRCS_MOD source/thead_rvv/fp16/maxpool_fp16_packn.c) list(APPEND 
THEAD_RVV_SRCS_MOD source/thead_rvv/fp16/maxpool_fp16_nhwc.c) @@ -354,9 +360,7 @@ if(CONFIG_THEAD_RVV_MAXPOOL_FP16) endif() if(CONFIG_THEAD_RVV_MAXPOOL_INT8) - list(APPEND THEAD_RVV_SRCS_MOD source/thead_rvv/int8/maxpool_2x2_int8_packn.c) list(APPEND THEAD_RVV_SRCS_MOD source/thead_rvv/int8/maxpool_2x2_int8.c) - list(APPEND THEAD_RVV_SRCS_MOD source/thead_rvv/int8/maxpool_3x3_int8_packn.c) list(APPEND THEAD_RVV_SRCS_MOD source/thead_rvv/int8/maxpool_3x3_int8.c) list(APPEND THEAD_RVV_SRCS_MOD source/thead_rvv/int8/maxpool_int8_packn.c) list(APPEND THEAD_RVV_SRCS_MOD source/thead_rvv/int8/maxpool_int8_nhwc.c) @@ -451,6 +455,22 @@ if(CONFIG_THEAD_RVV_RMS_NORM_INT8) list(APPEND THEAD_RVV_SRCS_MOD source/thead_rvv/int8/rms_norm.c) endif() +if(CONFIG_THEAD_RVV_ROPE_FP32) + list(APPEND THEAD_RVV_SRCS_MOD source/thead_rvv/fp32/rope.c) +endif() + +if(CONFIG_THEAD_RVV_ROPE_FP16) + list(APPEND THEAD_RVV_SRCS_MOD source/thead_rvv/fp16/rope.c) +endif() + +if(CONFIG_THEAD_RVV_SCALED_DOT_PRODUCT_ATTENTION_FP32) + list(APPEND THEAD_RVV_SRCS_MOD source/thead_rvv/fp32/scaled_dot_product_attention.c) +endif() + +if(CONFIG_THEAD_RVV_SCALED_DOT_PRODUCT_ATTENTION_FP16) + list(APPEND THEAD_RVV_SRCS_MOD source/thead_rvv/fp16/scaled_dot_product_attention.c) +endif() + if(CONFIG_THEAD_RVV_SIGMOID_FP32) list(APPEND THEAD_RVV_SRCS_MOD source/thead_rvv/fp32/sigmoid.c) endif() diff --git a/source/thead_rvv/Kconfig b/source/thead_rvv/Kconfig index c559e3ec..ebffd122 100644 --- a/source/thead_rvv/Kconfig +++ b/source/thead_rvv/Kconfig @@ -210,27 +210,48 @@ config THEAD_RVV_DIV_INT8 help Select SHL build v extension optimized div -config CONFIG_THEAD_RVV_ERF_FP32 +config THEAD_RVV_EMBEDDING_INT32 + depends on THEAD_RVV_SOURCE + bool "Layer embedding int32" + default y + help + Select SHL build v extension optimized embedding + +config THEAD_RVV_ERF_FP32 depends on THEAD_RVV_SOURCE bool "Layer erf fp32" default y help Select SHL build v extension optimized erf -config CONFIG_THEAD_RVV_ERF_FP16 
+config THEAD_RVV_ERF_FP16 depends on THEAD_RVV_SOURCE bool "Layer erf fp16" default y help Select SHL build v extension optimized erf -config CONFIG_THEAD_RVV_ERF_INT8 +config THEAD_RVV_ERF_INT8 depends on THEAD_RVV_SOURCE bool "Layer erf int8" default y help Select SHL build v extension optimized erf +config THEAD_RVV_EXPAND_DIMS_FP32 + depends on THEAD_RVV_SOURCE + bool "Layer expand_dims fp32" + default y + help + Select SHL build v extension optimized expand_dims + +config THEAD_RVV_EXPAND_DIMS_FP16 + depends on THEAD_RVV_SOURCE + bool "Layer expand_dims fp16" + default y + help + Select SHL build v extension optimized expand_dims + config THEAD_RVV_FULLYCONNECTED_FP32 depends on THEAD_RVV_SOURCE bool "Layer fullyconnected fp32" @@ -392,6 +413,13 @@ config THEAD_RVV_LEAKY_RELU_INT8 help Select SHL build v extension optimized leaky_relu +config THEAD_RVV_LLM_POS_FP16 + depends on THEAD_RVV_SOURCE + bool "Layer llm_pos fp16" + default y + help + Select SHL build v extension optimized llm_pos + config THEAD_RVV_MATMUL_FP32 depends on THEAD_RVV_SOURCE bool "Layer matmul fp32" @@ -588,6 +616,34 @@ config THEAD_RVV_RMS_NORM_INT8 help Select SHL build v extension optimized rms_norm +config THEAD_RVV_ROPE_FP32 + depends on THEAD_RVV_SOURCE + bool "Layer rope fp32" + default y + help + Select SHL build v extension optimized rope + +config THEAD_RVV_ROPE_FP16 + depends on THEAD_RVV_SOURCE + bool "Layer rope fp16" + default y + help + Select SHL build v extension optimized rope + +config THEAD_RVV_SCALED_DOT_PRODUCT_ATTENTION_FP32 + depends on THEAD_RVV_SOURCE + bool "Layer scaled_dot_product_attention fp32" + default y + help + Select SHL build v extension optimized scaled_dot_product_attention + +config THEAD_RVV_SCALED_DOT_PRODUCT_ATTENTION_FP16 + depends on THEAD_RVV_SOURCE + bool "Layer scaled_dot_product_attention fp16" + default y + help + Select SHL build v extension optimized scaled_dot_product_attention + config THEAD_RVV_SIGMOID_FP32 depends on 
THEAD_RVV_SOURCE bool "Layer sigmoid fp32" diff --git a/source/thead_rvv/binary_broadcast.c b/source/thead_rvv/binary_broadcast.c index 06cc3866..bab41c29 100644 --- a/source/thead_rvv/binary_broadcast.c +++ b/source/thead_rvv/binary_broadcast.c @@ -354,6 +354,7 @@ int shl_rvv_binary_op_broadcast_fp32(struct csinn_tensor *input0, struct csinn_t int32_t *out_dim = output->dim; int32_t dim_count = output->dim_count; + // Mark an index that traverses each dimension. int32_t *idx = (int32_t *)shl_mem_alloc(dim_count * sizeof(int32_t)); int cur = 0; @@ -366,15 +367,23 @@ int shl_rvv_binary_op_broadcast_fp32(struct csinn_tensor *input0, struct csinn_t binary_op = binary_op_callback[CSINN_BROADCAST_SV]; } + // Work like a stack, "push" the higher dimension until reach the last dimension, + // "pop" when done traversing current dimension. while (idx[0] < out_dim[0]) { if (cur == dim_count - 1) { + // Do broadcast in the last dimension float *in0_ptr = input0_data + broadcast_get_index(in0_dim, idx, dim_count); float *in1_ptr = input1_data + broadcast_get_index(in1_dim, idx, dim_count); float *out_ptr = output_data + broadcast_get_index(out_dim, idx, dim_count); binary_op(in0_ptr, in1_ptr, out_ptr, out_dim[cur]); + if (cur == 0) { + break; + } cur -= 1; idx[cur] += 1; } else { + // If the current index is less than the current dim size, traverse the next dimension; + // Otherwise, set the index to 0, and return to the previous dimension. 
if (idx[cur] < out_dim[cur]) { cur += 1; } else { @@ -469,6 +478,9 @@ int shl_rvv_binary_op_broadcast_fp16(struct csinn_tensor *input0, struct csinn_t __fp16 *in1_ptr = input1_data + broadcast_get_index(in1_dim, idx, dim_count); __fp16 *out_ptr = output_data + broadcast_get_index(out_dim, idx, dim_count); binary_op(in0_ptr, in1_ptr, out_ptr, out_dim[cur]); + if (cur == 0) { + break; + } cur -= 1; idx[cur] += 1; } else { @@ -570,6 +582,9 @@ int shl_rvv_binary_op_broadcast_int8(struct csinn_tensor *input0, struct csinn_t int8_t *in1_ptr = input1_data + broadcast_get_index(in1_dim, idx, dim_count); int8_t *out_ptr = output_data + broadcast_get_index(out_dim, idx, dim_count); binary_op(in0_ptr, in1_ptr, out_ptr, out_dim[cur], scale, zero_point); + if (cur == 0) { + break; + } cur -= 1; idx[cur] += 1; } else { diff --git a/source/thead_rvv/capability.c b/source/thead_rvv/capability.c index ae19893d..2deb8b32 100644 --- a/source/thead_rvv/capability.c +++ b/source/thead_rvv/capability.c @@ -713,8 +713,44 @@ int shl_rvv_silu_cap(struct csinn_tensor *input, struct csinn_tensor *output, return common_all_support(input, &(params->base)); } -int shl_rvv_rms_norm_cap(struct csinn_tensor *input, struct csinn_tensor *output, - struct csinn_tensor *weight, struct csinn_rms_norm_params *params) +int shl_rvv_rms_norm_cap(struct csinn_tensor *input, struct csinn_tensor *weight, + struct csinn_tensor *output, struct csinn_rms_norm_params *params) { return common_all_support(input, &(params->base)); } + +int shl_rvv_embedding_cap(struct csinn_tensor *input, struct csinn_tensor *weight, + struct csinn_tensor *output, struct csinn_diso_params *params) +{ + return float_all_support(input, &(params->base)); +} + +int shl_rvv_expand_dims_cap(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_expand_dims_params *params) +{ + return float_all_support(input, &(params->base)); +} + +int shl_rvv_rope_cap(struct csinn_tensor *input, struct csinn_tensor *output, + struct 
csinn_rope_params *params) +{ + return float_all_support(input, &(params->base)); +} + +int shl_rvv_scaled_dot_product_attention_cap(struct csinn_tensor *query, struct csinn_tensor *key, + struct csinn_tensor *value, + struct csinn_tensor *output_tensor, + struct csinn_scale_dot_attention_params *params) +{ + return float_all_support(query, &(params->base)); +} + +int shl_rvv_llm_pos_cap(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_llm_pos_params *params) +{ + if (input->dtype == CSINN_DTYPE_FLOAT16) { + return CSINN_OPT_INTRINSIC; + } + + return CSINN_OPT_UNSUPPORTED; +} diff --git a/source/thead_rvv/fp16/avgpool.c b/source/thead_rvv/fp16/avgpool.c index 12597000..398bca83 100644 --- a/source/thead_rvv/fp16/avgpool.c +++ b/source/thead_rvv/fp16/avgpool.c @@ -51,69 +51,82 @@ int shl_rvv_avgpool2d_init_fp16(struct csinn_tensor *input, struct csinn_tensor elempack = in_c % packn == 0 ? packn : 1; } - // global avgpool2d - if (in_h == kernel_h && in_w == kernel_w) { - cb->exec = (elempack % packn == 0) ? shl_rvv_global_avgpool2d_packn_fp16 - : shl_rvv_global_avgpool2d_fp16; - return CSINN_TRUE; - } + if (input->layout == CSINN_LAYOUT_NCHW) { + // global avgpool2d + if (in_h == kernel_h && in_w == kernel_w) { + cb->exec = (elempack % packn == 0) ? shl_rvv_global_avgpool2d_packn_fp16 + : shl_rvv_global_avgpool2d_fp16; + return CSINN_TRUE; + } - if (elempack % packn == 0) { - cb->exec = shl_rvv_avgpool_packn_fp16; - } else { - if (stride_h == 2 && stride_w == 2) { - if (kernel_h == 2 && kernel_w == 2) { - if (pad_left == 0 && pad_top == 0) { - // adjust pad according to ceil_mode (ceil mode on caffe pytorch..) 
- if (in_h % 2 == 1 && params->ceil_mode == 1) { - if (params->pad_down == 0) params->pad_down++; - } - if (in_w % 2 == 1 && params->ceil_mode == 1) { - if (params->pad_right == 0) params->pad_right++; - } - // end consider ceil_mode 2x2s2p0 - cb->exec = shl_rvv_avgpool2x2s2_fp16; - } else if (pad_left == 1 && pad_top == 1) { - cb->exec = shl_rvv_avgpool2x2s2_p1_fp16; - } - } else if (kernel_h == 3 && kernel_w == 3) { - if (pad_left == 0 && pad_top == 0) { - // adjust pad according to ceil_mode (ceil mode on caffe pytorch..) - if (in_h % 2 == 0 && params->ceil_mode == 1) { - if (params->pad_down == 0) - params->pad_down++; // origin pad_down mast be equal to zero ? - } - if (in_w % 2 == 0 && params->ceil_mode == 1) { - if (params->pad_right == 0) params->pad_right++; + if (elempack % packn == 0) { + cb->exec = shl_rvv_avgpool_packn_fp16; + } else { + if (stride_h == 2 && stride_w == 2) { + if (kernel_h == 2 && kernel_w == 2) { + if (pad_left == 0 && pad_top == 0) { + // adjust pad according to ceil_mode (ceil mode on caffe pytorch..) + if (in_h % 2 == 1 && params->ceil_mode == 1) { + if (params->pad_down == 0) params->pad_down++; + } + if (in_w % 2 == 1 && params->ceil_mode == 1) { + if (params->pad_right == 0) params->pad_right++; + } + // end consider ceil_mode 2x2s2p0 + cb->exec = shl_rvv_avgpool2x2s2_fp16; + } else if (pad_left == 1 && pad_top == 1) { + cb->exec = shl_rvv_avgpool2x2s2_p1_fp16; } - // end consider ceil_mode 3x3s2p0 - cb->exec = shl_rvv_avgpool3x3s2_fp16; - } else if (pad_left == 1 && pad_top == 1) { - if (params->ceil_mode == 0) { - cb->exec = shl_rvv_avgpool3x3s2_p1_fp16; - } else { - if ((in_w % 2 == 0 && pad_right == 1) || (in_h % 2 == 0 && pad_down == 1)) { - cb->exec = shl_ref_avgpool2d_quant; - } else { + } else if (kernel_h == 3 && kernel_w == 3) { + if (pad_left == 0 && pad_top == 0) { + // adjust pad according to ceil_mode (ceil mode on caffe pytorch..) 
+ if (in_h % 2 == 0 && params->ceil_mode == 1) { + if (params->pad_down == 0) + params->pad_down++; // origin pad_down mast be equal to zero ? + } + if (in_w % 2 == 0 && params->ceil_mode == 1) { + if (params->pad_right == 0) params->pad_right++; + } + // end consider ceil_mode 3x3s2p0 + cb->exec = shl_rvv_avgpool3x3s2_fp16; + } else if (pad_left == 1 && pad_top == 1) { + if (params->ceil_mode == 0) { cb->exec = shl_rvv_avgpool3x3s2_p1_fp16; + } else { + if ((in_w % 2 == 0 && pad_right == 1) || + (in_h % 2 == 0 && pad_down == 1)) { + cb->exec = shl_ref_avgpool2d_quant; + } else { + cb->exec = shl_rvv_avgpool3x3s2_p1_fp16; + } } } } - } - } else if (stride_h == 1 && stride_w == 1) { - if (kernel_h == 3 && kernel_w == 3) { - if (pad_left == 1 && pad_top == 1 && pad_right == 1 && pad_down == 1) { - cb->exec = shl_rvv_avgpool3x3s1_p1_fp16; + } else if (stride_h == 1 && stride_w == 1) { + if (kernel_h == 3 && kernel_w == 3) { + if (pad_left == 1 && pad_top == 1 && pad_right == 1 && pad_down == 1) { + cb->exec = shl_rvv_avgpool3x3s1_p1_fp16; + } } } } - if (cb->exec == NULL) { - shl_debug_warning( - "avgpool is not optimized to achieve under this condition on rvv, call reference " - "func replaced.\n"); - cb->exec = shl_ref_avgpool2d_quant; + + } else if (input->layout == CSINN_LAYOUT_NHWC) { + // global avgpool2d + if (in_h == kernel_h && in_w == kernel_w) { + cb->exec = shl_rvv_global_avgpool2d_nhwc_fp16; + return CSINN_TRUE; } + cb->exec = shl_rvv_avgpool_nhwc_fp16; + } + + if (cb->exec == NULL) { + shl_debug_warning( + "avgpool is not optimized to achieve under this condition on rvv, call reference " + "func replaced.\n"); + cb->exec = shl_ref_avgpool2d_quant; } + return CSINN_TRUE; } @@ -140,7 +153,12 @@ int shl_rvv_global_avgpool2d_init_fp16(struct csinn_tensor *input, struct csinn_ elempack = in_c % packn == 0 ? packn : 1; } - cb->exec = (elempack % packn == 0) ? 
shl_rvv_global_avgpool2d_packn_fp16 - : shl_rvv_global_avgpool2d_fp16; + if (input->layout == CSINN_LAYOUT_NCHW) { + cb->exec = (elempack % packn == 0) ? shl_rvv_global_avgpool2d_packn_fp16 + : shl_rvv_global_avgpool2d_fp16; + } else if (input->layout == CSINN_LAYOUT_NHWC) { + cb->exec = shl_rvv_global_avgpool2d_nhwc_fp16; + } + return CSINN_TRUE; } diff --git a/source/thead_rvv/fp16/convolution.c b/source/thead_rvv/fp16/convolution.c index cadb955f..d7496c9f 100644 --- a/source/thead_rvv/fp16/convolution.c +++ b/source/thead_rvv/fp16/convolution.c @@ -61,7 +61,9 @@ int shl_rvv_conv2d_init_fp16(struct csinn_tensor *input, struct csinn_tensor *ou if (params->group == 1 && kernel_h == 3 && kernel_w == 3 && stride_h == 1 && stride_w == 1 && dilation_h == 1 && dilation_w == 1) { params->conv_extra.conv_mode = CSINN_DIRECT; - shl_rvv_conv3x3s1_direct_reorder_kernel_pack4n_fp16(kernel, params); + if (!binary_model_op_init) { + shl_rvv_conv3x3s1_direct_reorder_kernel_pack4n_fp16(kernel, params); + } cb->exec = shl_rvv_conv3x3s1_direct_fp16_nhwc; return CSINN_TRUE; } @@ -95,16 +97,20 @@ int shl_rvv_conv2d_init_fp16(struct csinn_tensor *input, struct csinn_tensor *ou return CSINN_TRUE; } else { params->conv_extra.conv_mode = CSINN_WINOGRAD; - // TODO: params->conv_extra.kernel_tm in binary model - struct csinn_tensor *t_kernel = csinn_alloc_tensor(NULL); + if (!binary_model_op_init) { + struct csinn_tensor *t_kernel = csinn_alloc_tensor(NULL); + if ((in_h < 13) && (in_w < 13)) { + shl_rvv_wg_b4f3s1_trans_kernel_packn_fp16(kernel, t_kernel); + } else { + shl_rvv_wg_b6f3s1_trans_kernel_packn_fp16(kernel, t_kernel); + } + params->conv_extra.kernel_tm = t_kernel; + } if ((in_h < 13) && (in_w < 13)) { - shl_rvv_wg_b4f3s1_trans_kernel_packn_fp16(kernel, t_kernel); cb->exec = shl_rvv_wg_b4f3s1_packn_fp16; } else { - shl_rvv_wg_b6f3s1_trans_kernel_packn_fp16(kernel, t_kernel); cb->exec = shl_rvv_wg_b6f3s1_packn_fp16; } - params->conv_extra.kernel_tm = t_kernel; } } else { 
params->conv_extra.conv_mode = CSINN_GEMM; diff --git a/source/thead_rvv/fp16/convolution_3x3_fp16_packn.c b/source/thead_rvv/fp16/convolution_3x3_fp16_packn.c index c1ceefa8..27aec768 100644 --- a/source/thead_rvv/fp16/convolution_3x3_fp16_packn.c +++ b/source/thead_rvv/fp16/convolution_3x3_fp16_packn.c @@ -920,7 +920,16 @@ void shl_rvv_wg_b4f3s1_trans_kernel_packn_fp16(struct csinn_tensor *src_kernel, {1.0f / 24, -1.0f / 12, 1.0f / 6}, {0.0f, 0.0f, 1.0f}}; + const int packn = csrr_vlenb() / sizeof(__fp16); + const int pack2n = packn * 2; csinn_tensor_copy(dst_kernel, src_kernel); + dst_kernel->dim_count = 5; + dst_kernel->dim[0] = outch / packn; + dst_kernel->dim[1] = 6; + dst_kernel->dim[2] = 6; + dst_kernel->dim[3] = inch; + dst_kernel->dim[4] = packn; + dst_kernel->layout = CSINN_LAYOUT_O1HWIO0; for (int p = 0; p < outch; p++) { for (int q = 0; q < inch; q++) { @@ -957,9 +966,6 @@ void shl_rvv_wg_b4f3s1_trans_kernel_packn_fp16(struct csinn_tensor *src_kernel, __fp16 *kernel_tm_packn = (__fp16 *)shl_mem_alloc(outch / 4 * 36 * inch * 4 * sizeof(__fp16)); dst_kernel->data = kernel_tm_packn; - const int packn = csrr_vlenb() / sizeof(__fp16); - const int pack2n = packn * 2; - int oc = 0; for (; oc + pack2n - 1 < outch; oc += pack2n) { __fp16 *g0 = kernel_tm_packn + oc * 36 * inch; @@ -986,6 +992,7 @@ void shl_rvv_wg_b4f3s1_trans_kernel_packn_fp16(struct csinn_tensor *src_kernel, } } } + src_kernel->data = NULL; shl_mem_free(kernel_tm); } @@ -1089,7 +1096,7 @@ int shl_rvv_wg_b4f3s1_packn_fp16(struct csinn_tensor *input, struct csinn_tensor /****************************************************************************************** * kernel layout before: [O, I, 3, 3] - * kernel layout after : [O/pack2n, 36, I, pack2n] --> [O/packn, 36, I, packn] + * kernel layout after : [O/pack2n, 64, I, pack2n] --> [O/packn, 64, I, packn] * constrain: output channel % packn = 0 * input channel % packn = 0 
******************************************************************************************/ @@ -1123,7 +1130,16 @@ void shl_rvv_wg_b6f3s1_trans_kernel_packn_fp16(struct csinn_tensor *src_kernel, // {0.0f, 0.0f, 1.0f} // }; + const int packn = csrr_vlenb() / sizeof(__fp16); + const int pack2n = packn * 2; csinn_tensor_copy(dst_kernel, src_kernel); + dst_kernel->dim_count = 5; + dst_kernel->dim[0] = outch / packn; + dst_kernel->dim[1] = 8; + dst_kernel->dim[2] = 8; + dst_kernel->dim[3] = inch; + dst_kernel->dim[4] = packn; + dst_kernel->layout = CSINN_LAYOUT_O1HWIO0; for (int p = 0; p < outch; p++) { for (int q = 0; q < inch; q++) { @@ -1158,9 +1174,6 @@ void shl_rvv_wg_b6f3s1_trans_kernel_packn_fp16(struct csinn_tensor *src_kernel, __fp16 *kernel_tm_packn = (__fp16 *)shl_mem_alloc(64 * outch / 4 * inch * 4 * sizeof(__fp16)); dst_kernel->data = kernel_tm_packn; - const int packn = csrr_vlenb() / sizeof(__fp16); - const int pack2n = packn * 2; - int oc = 0; for (; oc + pack2n - 1 < outch; oc += pack2n) { __fp16 *g0 = kernel_tm_packn + oc * 64 * inch; @@ -1187,6 +1200,7 @@ void shl_rvv_wg_b6f3s1_trans_kernel_packn_fp16(struct csinn_tensor *src_kernel, } } } + src_kernel->data = NULL; shl_mem_free(kernel_tm); } diff --git a/source/thead_rvv/fp16/expand_dims.c b/source/thead_rvv/fp16/expand_dims.c new file mode 100644 index 00000000..0f222e33 --- /dev/null +++ b/source/thead_rvv/fp16/expand_dims.c @@ -0,0 +1,42 @@ +/* + * Copyright (C) 2016-2023 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "rvv/rvv.h" + +int shl_rvv_expand_dims_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_expand_dims_params *params) +{ + __fp16 *input_data = (__fp16 *)input->data; + __fp16 *output_data = (__fp16 *)output->data; + int size = 1; + if (input_data != output_data) { + for (int i = 0; i < input->dim_count; i++) { + size *= input->dim[i]; + } + int j = 0; + while (j < size) { + int vl = vsetvl_e16m4(size - j); + vfloat16m4_t _in = vle16_v_f16m4(input_data, vl); + vse16_v_f16m4(output_data, _in, vl); + input_data += vl; + output_data += vl; + j += vl; + } + } + return CSINN_TRUE; +} diff --git a/source/thead_rvv/fp16/fullyconnected_fp16.c b/source/thead_rvv/fp16/fullyconnected_fp16.c index 48f3f02b..61499d25 100644 --- a/source/thead_rvv/fp16/fullyconnected_fp16.c +++ b/source/thead_rvv/fp16/fullyconnected_fp16.c @@ -112,6 +112,62 @@ void shl_rvv_fc_gemm_reorder_weight_fp16_w_int8(struct csinn_tensor *weights) shl_mem_free(pa_reorder); } +/************************************************************************************* + * Per-channel dequantize int8 -> fp16 + ************************************************************************************/ +void shl_rvv_fc_npack2n_dequantize_per_channel_i8_to_f16(struct csinn_tensor *weights, + struct csinn_fc_params *params, + __fp16 *weights_fp16) +{ + int8_t *weights_int8 = (int8_t *)weights->data; + int n = weights->dim[0]; // out_nodes + int k = weights->dim[1]; // in_nodes + + const int packn = csrr_vlenb() / sizeof(__fp16); + const int pack2n = packn * 2; + + int i = 
0; + int vl = vsetvl_e16m2(pack2n); + for (; i + pack2n - 1 < n; i += pack2n) { + int8_t *w_src = weights_int8 + i * k; + __fp16 *w_dst = weights_fp16 + i * k; + vint32m4_t _z32 = + vlse32_v_i32m4(&(weights->qinfo[i].zero_point), sizeof(struct csinn_quant_info), vl); + vint16m2_t _z16 = vnclip_wx_i16m2(_z32, 0, vl); + vint8m1_t _z = vnclip_wx_i8m1(_z16, 0, vl); + vfloat32m4_t _s32 = + vlse32_v_f32m4(&(weights->qinfo[i].scale), sizeof(struct csinn_quant_info), vl); + vfloat16m2_t _s = vfncvt_f_f_w_f16m2(_s32, vl); + for (int j = 0; j < k; j++) { + vint8m1_t _i8 = vle8_v_i8m1(w_src, vl); + vfloat16m2_t _f16 = shl_rvv_vdeq_vv_f16m2(_i8, _z, _s, vl); + vse16_v_f16m2(w_dst, _f16, vl); + w_src += vl; + w_dst += vl; + } + } + while (i < n) { + int vl = vsetvl_e16m1(n - i); + int8_t *w_src = weights_int8 + i * k; + __fp16 *w_dst = weights_fp16 + i * k; + vint32m4_t _z32 = + vlse32_v_i32m4(&(weights->qinfo[i].zero_point), sizeof(struct csinn_quant_info), vl); + vint16m2_t _z16 = vnclip_wx_i16m2(_z32, 0, vl); + vint8m1_t _z = vnclip_wx_i8m1(_z16, 0, vl); + vfloat32m4_t _s32 = + vlse32_v_f32m4(&(weights->qinfo[i].scale), sizeof(struct csinn_quant_info), vl); + vfloat16m2_t _s = vfncvt_f_f_w_f16m2(_s32, vl); + for (int j = 0; j < k; j++) { + vint8m1_t _i8 = vle8_v_i8m1(w_src, vl); + vfloat16m2_t _f16 = shl_rvv_vdeq_vv_f16m2(_i8, _z, _s, vl); + vse16_v_f16m2(w_dst, _f16, vl); + w_src += vl; + w_dst += vl; + } + i += vl; + } +} + int shl_rvv_fullyconnected_gemm_fp16(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_tensor *weights, struct csinn_tensor *bias, struct csinn_fc_params *params) @@ -149,14 +205,9 @@ int shl_rvv_fullyconnected_gemm_fp16(struct csinn_tensor *input, struct csinn_te float scale = weights->qinfo->scale; shl_rvv_dequantize_i8_to_f16(weights_int8, weights_fp16, size, zp, scale); } else if (weights->quant_channel == output_depth) { - // support channel quantization - for (int c = 0; c < output_depth; c++) { - int32_t zp = 
weights->qinfo[c].zero_point; - float scale = weights->qinfo[c].scale; - shl_rvv_dequantize_i8_to_f16(weights_int8 + c * accum_depth, - weights_fp16 + c * accum_depth, accum_depth, zp, - scale); - } + shl_rvv_fc_npack2n_dequantize_per_channel_i8_to_f16(weights, params, weights_fp16); + } else { + shl_debug_error("%s unsupported quant_channel: %d\n", __func__, weights->quant_channel); } weights_data = weights_fp16; } else if (weights->dtype == CSINN_DTYPE_FLOAT16) { diff --git a/source/thead_rvv/fp16/gemm_fp16_block.c b/source/thead_rvv/fp16/gemm_fp16_block.c index 8eef76f1..366a6c80 100644 --- a/source/thead_rvv/fp16/gemm_fp16_block.c +++ b/source/thead_rvv/fp16/gemm_fp16_block.c @@ -682,7 +682,7 @@ static inline void gemm_12xpack2n_fp16(__fp16 *dst, const __fp16 *sa, const __fp * k_blk: K_BLK, K_tail * * dst - output: [m, n] - * sa - kernel: [m/m_blk, k/k_blk, m_blk/12, 12, k_blk] + * sa - kernel: [m/m_blk, k/k_blk, m_blk/12, k_blk, 12] * sb - input: [n/n_blk, k/k_blk, n_blk/pack2n, k_blk, pack2n] * bias: [m] ************************************************************/ @@ -723,7 +723,8 @@ void shl_rvv_gemm_block_12xpack2n_fp16(__fp16 *dst, const __fp16 *sa, const __fp __fp16 *out = output_data + m_idx * n + n_idx; const __fp16 *ker = kernel_data + m_idx * k + k_idx * m_block; const __fp16 *in = input_data + n_idx * k + k_idx * n_block; - gemm_12xpack2n_fp16(out, ker, in, bias, m_block, n_block, k_block, n, k_idx); + gemm_12xpack2n_fp16(out, ker, in, bias + m_idx, m_block, n_block, k_block, n, + k_idx); k_idx += k_block; } diff --git a/source/thead_rvv/fp16/llm_pos.c b/source/thead_rvv/fp16/llm_pos.c new file mode 100644 index 00000000..4456a6f2 --- /dev/null +++ b/source/thead_rvv/fp16/llm_pos.c @@ -0,0 +1,65 @@ +/* + * Copyright (C) 2016-2023 T-Head Semiconductor Co., Ltd. All rights reserved. 
+ * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "rvv/rvv.h" + +int shl_rvv_llm_pos_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_llm_pos_params *params) +{ + __fp16 *output_data = output->data; + __fp16 *input_data = input->data; + + int batch = params->bsz; + int seqlen = params->seqlen; + int start_pos = params->pos[0]; + int inner_size = input->dim[2] * input->dim[3]; + if (params->mode == CSINN_LLM_POS_CACHE_COPY_IN) { + for (int i = 0; i < batch; i++) { + int output_index = i * output->dim[1] * inner_size + start_pos * inner_size; + int input_index = i * input->dim[1] * inner_size; + int cpy_size = seqlen * inner_size * sizeof(__fp16); + + output_data = params->cache_buffer; + memcpy(output_data + output_index, input_data + input_index, cpy_size); + } + } else if (params->mode == CSINN_LLM_POS_CACHE_COPY_OUT) { + for (int i = 0; i < batch; i++) { + int output_index = i * output->dim[1] * inner_size; + int input_index = i * input->dim[1] * inner_size; + int cpy_size = (start_pos + seqlen) * inner_size * sizeof(__fp16); + input_data = params->cache_buffer; + + memcpy(output_data + output_index, input_data + input_index, cpy_size); + } + } else if (params->mode == CSINN_LLM_POS_MASK) { + memcpy(output->data, input->data, csinn_tensor_byte_size(output)); + for (int i = 0; i < input->dim[0] * input->dim[1]; i++) { + for (int j = 0; j < seqlen; ++j) { + int32_t *pos = 
params->pos; + for (int k = pos[j] + 1; k < seqlen; k++) { + int output_index = i * seqlen * seqlen + j * seqlen + k; + output_data[output_index] = (__fp16)-INFINITY; + } + } + } + return CSINN_TRUE; + } else { + shl_debug_error("Unsupport mode in %s\n", __func__); + } + return CSINN_TRUE; +} diff --git a/source/thead_rvv/fp16/matmul.c b/source/thead_rvv/fp16/matmul.c index 306e2821..6534f42f 100644 --- a/source/thead_rvv/fp16/matmul.c +++ b/source/thead_rvv/fp16/matmul.c @@ -330,13 +330,17 @@ int shl_rvv_matmul_init_fp16(struct csinn_tensor *mat0, struct csinn_tensor *mat struct csinn_tensor *output, struct csinn_matmul_params *params) { struct csinn_callback *cb = params->base.cb; + struct csinn_session *sess = params->base.sess; + bool binary_model_op_init = shl_rvv_get_binary_model_op_init(sess); if (!params->trans_a && !params->trans_b) { if (mat0->dtype == CSINN_DTYPE_FLOAT16) { - if (mat1->is_const && mat1->dtype == CSINN_DTYPE_INT8) { - shl_rvv_matmul_reorder_weight_fp16_w_int8(mat1, MATMUL_K_BLK, MATMUL_N_BLK); - } else if (mat1->dtype == CSINN_DTYPE_FLOAT16) { - if (mat1->is_const) { - shl_rvv_matmul_reorder_weight_fp16(mat1, MATMUL_K_BLK, MATMUL_N_BLK); + if (!binary_model_op_init) { + if (mat1->is_const && mat1->dtype == CSINN_DTYPE_INT8) { + shl_rvv_matmul_reorder_weight_fp16_w_int8(mat1, MATMUL_K_BLK, MATMUL_N_BLK); + } else if (mat1->dtype == CSINN_DTYPE_FLOAT16) { + if (mat1->is_const) { + shl_rvv_matmul_reorder_weight_fp16(mat1, MATMUL_K_BLK, MATMUL_N_BLK); + } } } cb->exec = shl_rvv_matmul_fp16; @@ -344,7 +348,7 @@ int shl_rvv_matmul_init_fp16(struct csinn_tensor *mat0, struct csinn_tensor *mat } if (cb->exec == NULL) { shl_debug_warning( - "matmul is not optimized to achieve under this condition, call reference func " + "matmul is not optimized to achieve under this condition on RVV, call reference func " "replaced.\n"); cb->exec = shl_ref_matmul_quant; } diff --git a/source/thead_rvv/fp16/maxpool.c b/source/thead_rvv/fp16/maxpool.c index 
b7e06a71..0dd86857 100644 --- a/source/thead_rvv/fp16/maxpool.c +++ b/source/thead_rvv/fp16/maxpool.c @@ -50,69 +50,82 @@ int shl_rvv_maxpool2d_init_fp16(struct csinn_tensor *input, struct csinn_tensor elempack = in_c % packn == 0 ? packn : 1; } - // global maxpool2d // TODO: remove - if (in_h == kernel_h && in_w == kernel_w) { - cb->exec = (elempack % packn == 0) ? shl_rvv_global_maxpool2d_packn_fp16 - : shl_rvv_global_maxpool2d_fp16; - return CSINN_TRUE; - } + if (input->layout == CSINN_LAYOUT_NCHW) { + // global maxpool2d // TODO: remove + if (in_h == kernel_h && in_w == kernel_w) { + cb->exec = (elempack % packn == 0) ? shl_rvv_global_maxpool2d_packn_fp16 + : shl_rvv_global_maxpool2d_fp16; + return CSINN_TRUE; + } - if (elempack % packn == 0) { - cb->exec = shl_rvv_maxpool_packn_fp16; - } else { - if (stride_h == 2 && stride_w == 2) { - if (kernel_h == 2 && kernel_w == 2) { // 2x2s2 - if (pad_left == 0 && pad_top == 0) { - // adjust pad according to ceil_mode (ceil mode on caffe pytorch..) - if (in_h % 2 == 1 && params->ceil_mode == 1) { - if (params->pad_down == 0) params->pad_down++; - } - if (in_w % 2 == 1 && params->ceil_mode == 1) { - if (params->pad_right == 0) params->pad_right++; - } - // end consider ceil_mode 2x2s2p0 - cb->exec = shl_rvv_maxpool2x2s2_fp16; - } else if (pad_left == 1 && pad_top == 1) { - cb->exec = shl_rvv_maxpool2x2s2_p1_fp16; - } - } else if (kernel_h == 3 && kernel_w == 3) { // 3x3s2 - if (pad_left == 0 && pad_top == 0) { - // adjust pad according to ceil_mode (ceil mode on caffe pytorch..) - if (in_h % 2 == 0 && params->ceil_mode == 1) { - if (params->pad_down == 0) - params->pad_down++; // origin pad_down mast be equal to zero ? 
- } - if (in_w % 2 == 0 && params->ceil_mode == 1) { - if (params->pad_right == 0) params->pad_right++; + if (elempack % packn == 0) { + cb->exec = shl_rvv_maxpool_packn_fp16; + } else { + if (stride_h == 2 && stride_w == 2) { + if (kernel_h == 2 && kernel_w == 2) { // 2x2s2 + if (pad_left == 0 && pad_top == 0) { + // adjust pad according to ceil_mode (ceil mode on caffe pytorch..) + if (in_h % 2 == 1 && params->ceil_mode == 1) { + if (params->pad_down == 0) params->pad_down++; + } + if (in_w % 2 == 1 && params->ceil_mode == 1) { + if (params->pad_right == 0) params->pad_right++; + } + // end consider ceil_mode 2x2s2p0 + cb->exec = shl_rvv_maxpool2x2s2_fp16; + } else if (pad_left == 1 && pad_top == 1) { + cb->exec = shl_rvv_maxpool2x2s2_p1_fp16; } - // end consider ceil_mode 3x3s2p0 - cb->exec = shl_rvv_maxpool3x3s2_fp16; - } else if (pad_left == 1 && pad_top == 1) { - if (params->ceil_mode == 0) { - cb->exec = shl_rvv_maxpool3x3s2_p1_fp16; - } else { - if ((in_w % 2 == 0 && pad_right == 1) || (in_h % 2 == 0 && pad_down == 1)) { - cb->exec = shl_ref_maxpool2d_quant; - } else { + } else if (kernel_h == 3 && kernel_w == 3) { // 3x3s2 + if (pad_left == 0 && pad_top == 0) { + // adjust pad according to ceil_mode (ceil mode on caffe pytorch..) + if (in_h % 2 == 0 && params->ceil_mode == 1) { + if (params->pad_down == 0) + params->pad_down++; // origin pad_down mast be equal to zero ? 
+ } + if (in_w % 2 == 0 && params->ceil_mode == 1) { + if (params->pad_right == 0) params->pad_right++; + } + // end consider ceil_mode 3x3s2p0 + cb->exec = shl_rvv_maxpool3x3s2_fp16; + } else if (pad_left == 1 && pad_top == 1) { + if (params->ceil_mode == 0) { cb->exec = shl_rvv_maxpool3x3s2_p1_fp16; + } else { + if ((in_w % 2 == 0 && pad_right == 1) || + (in_h % 2 == 0 && pad_down == 1)) { + cb->exec = shl_ref_maxpool2d_quant; + } else { + cb->exec = shl_rvv_maxpool3x3s2_p1_fp16; + } } } } - } - } else if (stride_h == 1 && stride_w == 1) { - if (kernel_h == 3 && kernel_w == 3) { - if (pad_left == 1 && pad_top == 1 && pad_right == 1 && pad_down == 1) { - cb->exec = shl_rvv_maxpool3x3s1_p1_fp16; + } else if (stride_h == 1 && stride_w == 1) { + if (kernel_h == 3 && kernel_w == 3) { + if (pad_left == 1 && pad_top == 1 && pad_right == 1 && pad_down == 1) { + cb->exec = shl_rvv_maxpool3x3s1_p1_fp16; + } } } } - if (cb->exec == NULL) { - shl_debug_warning( - "maxpool is not optimized to achieve under this condition on rvv, call reference " - "func replaced.\n"); - cb->exec = shl_ref_maxpool2d_quant; + + } else if (input->layout == CSINN_LAYOUT_NHWC) { + // global maxpool2d + if (in_h == kernel_h && in_w == kernel_w) { + cb->exec = shl_rvv_global_maxpool2d_nhwc_fp16; + return CSINN_TRUE; } + cb->exec = shl_rvv_maxpool_nhwc_fp16; + } + + if (cb->exec == NULL) { + shl_debug_warning( + "maxpool is not optimized to achieve under this condition on rvv, call reference " + "func replaced.\n"); + cb->exec = shl_ref_maxpool2d_quant; } + return CSINN_TRUE; } @@ -139,7 +152,12 @@ int shl_rvv_global_maxpool2d_init_fp16(struct csinn_tensor *input, struct csinn_ elempack = in_c % packn == 0 ? packn : 1; } - cb->exec = (elempack % packn == 0) ? shl_rvv_global_maxpool2d_packn_fp16 - : shl_rvv_global_maxpool2d_fp16; + if (input->layout == CSINN_LAYOUT_NCHW) { + cb->exec = (elempack % packn == 0) ? 
shl_rvv_global_maxpool2d_packn_fp16 + : shl_rvv_global_maxpool2d_fp16; + } else if (input->layout == CSINN_LAYOUT_NHWC) { + cb->exec = shl_rvv_global_maxpool2d_nhwc_fp16; + } + return CSINN_TRUE; } diff --git a/source/thead_rvv/fp16/rms_norm.c b/source/thead_rvv/fp16/rms_norm.c index ef07d6b3..b6d12c8c 100644 --- a/source/thead_rvv/fp16/rms_norm.c +++ b/source/thead_rvv/fp16/rms_norm.c @@ -18,8 +18,8 @@ #include "rvv/rvv.h" -int shl_rvv_rms_norm_fp16(struct csinn_tensor *input, struct csinn_tensor *output, - struct csinn_tensor *weight, struct csinn_rms_norm_params *params) +static int rms_norm_fp16(struct csinn_tensor *input, struct csinn_tensor *weight, + struct csinn_tensor *output, struct csinn_rms_norm_params *params) { if (input->layout == CSINN_LAYOUT_NC1HWC0) { shl_rvv_tensor_nc1xc0_to_ndarray_replace_fp16(input); @@ -43,26 +43,30 @@ int shl_rvv_rms_norm_fp16(struct csinn_tensor *input, struct csinn_tensor *outpu __fp16 *input_ptr = input_data + b * norm_size; __fp16 *output_ptr = output_data + b * norm_size; - vfloat16m1_t _sum = vfmv_v_f_f16m1(0.0f, 1); + vfloat32m1_t _sum = vfmv_v_f_f32m1(0.0f, 1); int i = 0; while (i < norm_size) { int vl = vsetvl_e16m2(norm_size - i); vfloat16m2_t _x = vle16_v_f16m2(input_ptr + i, vl); - vfloat16m2_t _x2 = vfmul_vv_f16m2(_x, _x, vl); - _sum = vfredosum_vs_f16m2_f16m1(vundefined_f16m1(), _x2, _sum, vl); + vfloat32m4_t _x_f32 = vfwcvt_f_f_v_f32m4(_x, vl); + vfloat32m4_t _x2 = vfmul_vv_f32m4(_x_f32, _x_f32, vl); + _sum = vfredosum_vs_f32m4_f32m1(vundefined_f32m1(), _x2, _sum, vl); i += vl; } - float sum = vfmv_f_s_f16m1_f16(_sum); + float sum = vfmv_f_s_f32m1_f32(_sum); float scale = 1.0 / sqrt(sum / norm_size + eps); i = 0; while (i < norm_size) { int vl = vsetvl_e16m2(norm_size - i); vfloat16m2_t _x = vle16_v_f16m2(input_ptr + i, vl); + vfloat32m4_t _x_f32 = vfwcvt_f_f_v_f32m4(_x, vl); vfloat16m2_t _w = vle16_v_f16m2(weight_data + i, vl); - vfloat16m2_t _res = vfmul_vf_f16m2(_x, scale, vl); - _res = vfmul_vv_f16m2(_res, 
_w, vl); + vfloat32m4_t _w_f32 = vfwcvt_f_f_v_f32m4(_w, vl); + vfloat32m4_t _res_f32 = vfmul_vf_f32m4(_x_f32, scale, vl); + _res_f32 = vfmul_vv_f32m4(_res_f32, _w_f32, vl); + vfloat16m2_t _res = vfncvt_f_f_w_f16m2(_res_f32, vl); vse16_v_f16m2(output_ptr + i, _res, vl); i += vl; } @@ -70,3 +74,73 @@ int shl_rvv_rms_norm_fp16(struct csinn_tensor *input, struct csinn_tensor *outpu return CSINN_TRUE; } + +int rms_norm_fp16_w_fp32(struct csinn_tensor *input, struct csinn_tensor *weight, + struct csinn_tensor *output, struct csinn_rms_norm_params *params) +{ + if (input->layout == CSINN_LAYOUT_NC1HWC0) { + shl_rvv_tensor_nc1xc0_to_ndarray_replace_fp16(input); + } + __fp16 *input_data = (__fp16 *)input->data; + __fp16 *output_data = (__fp16 *)output->data; + float *weight_data = (float *)weight->data; + float eps = params->epsilon; + /* support negative axis */ + int axis = params->axis >= 0 ? params->axis : (params->axis + input->dim_count); + int32_t batches = 1; + for (int i = 0; i < axis; i++) { + batches *= input->dim[i]; + } + int32_t norm_size = 1; + for (int i = axis; i < input->dim_count; i++) { + norm_size *= input->dim[i]; + } + + for (int b = 0; b < batches; b++) { + __fp16 *input_ptr = input_data + b * norm_size; + __fp16 *output_ptr = output_data + b * norm_size; + + vfloat32m1_t _sum = vfmv_v_f_f32m1(0.0f, 1); + int i = 0; + while (i < norm_size) { + int vl = vsetvl_e16m2(norm_size - i); + vfloat16m2_t _x = vle16_v_f16m2(input_ptr + i, vl); + vfloat32m4_t _x_f32 = vfwcvt_f_f_v_f32m4(_x, vl); + vfloat32m4_t _x2 = vfmul_vv_f32m4(_x_f32, _x_f32, vl); + _sum = vfredosum_vs_f32m4_f32m1(vundefined_f32m1(), _x2, _sum, vl); + i += vl; + } + + float sum = vfmv_f_s_f32m1_f32(_sum); + float scale = 1.0 / sqrt(sum / norm_size + eps); + + i = 0; + while (i < norm_size) { + int vl = vsetvl_e16m2(norm_size - i); + vfloat16m2_t _x = vle16_v_f16m2(input_ptr + i, vl); + vfloat32m4_t _x_f32 = vfwcvt_f_f_v_f32m4(_x, vl); + vfloat32m4_t _w_f32 = vle32_v_f32m4(weight_data + i, 
vl); + vfloat32m4_t _res_f32 = vfmul_vf_f32m4(_x_f32, scale, vl); + _res_f32 = vfmul_vv_f32m4(_res_f32, _w_f32, vl); + vfloat16m2_t _res = vfncvt_f_f_w_f16m2(_res_f32, vl); + vse16_v_f16m2(output_ptr + i, _res, vl); + i += vl; + } + } + + return CSINN_TRUE; +} + +int shl_rvv_rms_norm_fp16(struct csinn_tensor *input, struct csinn_tensor *weight, + struct csinn_tensor *output, struct csinn_rms_norm_params *params) +{ + if (output->dtype == CSINN_DTYPE_FLOAT16) { + if (weight->dtype == CSINN_DTYPE_FLOAT16) { + return rms_norm_fp16(input, weight, output, params); + } else if (weight->dtype == CSINN_DTYPE_FLOAT32) { + return rms_norm_fp16_w_fp32(input, weight, output, params); + } + } + + return shl_ref_rms_norm_quant(input, weight, output, params); +} diff --git a/source/thead_rvv/fp16/rope.c b/source/thead_rvv/fp16/rope.c new file mode 100644 index 00000000..b986f535 --- /dev/null +++ b/source/thead_rvv/fp16/rope.c @@ -0,0 +1,95 @@ +/* + * Copyright (C) 2016-2023 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "rvv/rvv.h" + +int shl_rvv_rope_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_rope_params *params) +{ + float freq_base = params->freq_base; + float freq_scale = params->freq_scale; + float xpos_base = params->xpos_base; + int32_t xpos_down = params->xpos_down; + int n_dims = params->n_dims; + + float theta_scale = powf(freq_base, -2.0f / n_dims); + + __fp16 *src_data = (__fp16 *)input->data; + __fp16 *dst_data = (__fp16 *)output->data; + int32_t *pos = params->pos; + + if (!params->use_rope_cache) { + for (int i3 = 0; i3 < input->dim[0]; i3++) { + for (int i2 = 0; i2 < input->dim[1]; i2++) { + int p = pos[i2]; + for (int i1 = 0; i1 < input->dim[2]; i1++) { + float theta = freq_scale * (float)p; + + for (int i0 = 0; i0 < input->dim[3]; i0 += 2) { + __fp16 cos_theta = (__fp16)cosf(theta); + __fp16 sin_theta = (__fp16)sinf(theta); + // zeta scaling for xPos only: + float zeta = + xpos_base != 0.0f + ? powf((i0 + 0.4f * input->dim[0]) / (1.4f * input->dim[0]), + p / xpos_base) + : 1.0f; + if (xpos_down) zeta = 1.0f / zeta; + __fp16 fin_zeta = (__fp16)zeta; + + theta *= theta_scale; + + int index = i3 * (input->dim[3] * input->dim[2] * input->dim[1]) + + i2 * (input->dim[3] * input->dim[2]) + i1 * input->dim[3] + i0; + + __fp16 x0 = src_data[index]; + __fp16 x1 = src_data[index + 1]; + + dst_data[index] = x0 * cos_theta * fin_zeta - x1 * sin_theta * fin_zeta; + dst_data[index + 1] = x0 * sin_theta * fin_zeta + x1 * cos_theta * fin_zeta; + } + } + } + } + } else { + __fp16 *rope_cache = + &((__fp16 *)params->rope_cache)[pos[0] * input->dim[2] * input->dim[3]]; + for (int i3 = 0; i3 < input->dim[0]; i3++) { + for (int i2 = 0; i2 < input->dim[1]; i2++) { + for (int i1 = 0; i1 < input->dim[2]; i1++) { + for (int i0 = 0; i0 < input->dim[3]; i0 += 2) { + int index = i3 * (input->dim[3] * input->dim[2] * input->dim[1]) + + i2 * (input->dim[3] * input->dim[2]) + i1 * input->dim[3] + i0; + + int rope_cache_index = + i2 * 
(input->dim[3] * input->dim[2]) + i1 * input->dim[3] + i0; + + __fp16 x0 = src_data[index]; + __fp16 x1 = src_data[index + 1]; + __fp16 sin_theta = rope_cache[index]; + __fp16 cos_theta = rope_cache[index + 1]; + + dst_data[index] = x0 * cos_theta - x1 * sin_theta; + dst_data[index + 1] = x0 * sin_theta + x1 * cos_theta; + } + } + } + } + } + return CSINN_TRUE; +} diff --git a/source/thead_rvv/fp16/scaled_dot_product_attention.c b/source/thead_rvv/fp16/scaled_dot_product_attention.c new file mode 100644 index 00000000..0663a2df --- /dev/null +++ b/source/thead_rvv/fp16/scaled_dot_product_attention.c @@ -0,0 +1,751 @@ +/* + * Copyright (C) 2016-2023 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "rvv/rvv.h" +#include "rvv_mathfun_fp16.h" + +static inline void qk_t1_dot_4x4_fp16(__fp16 *dst, const __fp16 *sa, const __fp16 *sb, int M, int K, + int N, int lda, int ldb, int ldc); +static inline void trans_q_0132_fp16(__fp16 *src, __fp16 *dst, int sv, int head_dim); +static void q0k1_softmax_v1_fp16(__fp16 *q, __fp16 *k, __fp16 *v, __fp16 *o, + struct csinn_scale_dot_attention_params *params, int32_t sq, + int32_t sk, int32_t head_dim); + +int shl_rvv_scaled_dot_product_attention_fp16(struct csinn_tensor *query, struct csinn_tensor *key, + struct csinn_tensor *value, + struct csinn_tensor *output_tensor, + struct csinn_scale_dot_attention_params *params) +{ + __fp16 *query_data = query->data; + __fp16 *key_data = key->data; + __fp16 *value_data = value->data; + __fp16 *output_data = output_tensor->data; + // np: number of heads + // sk: sequence number of kv + // sq: sequence number of q + int32_t batch = query->dim[0]; // batch = 1 only + int32_t np = query->dim[1]; + int32_t sk = key->dim[2]; + int32_t sq = query->dim[2]; + int32_t head_dim = query->dim[3]; + + if (shl_multithread_is_enable()) { +#pragma omp parallel for + for (int i = 0; i < batch * np; i++) { + __fp16 *q = query_data + i * sq * head_dim; + __fp16 *k = key_data + i * sk * head_dim; + __fp16 *v = value_data + i * sk * head_dim; + __fp16 *o = output_data + i * sq * head_dim; + if (params->transpose_v == 0) { + __fp16 *value_transpose_tmp = malloc(sk * head_dim * sizeof(__fp16)); + trans_q_0132_fp16(v, value_transpose_tmp, sk, head_dim); + q0k1_softmax_v1_fp16(q, k, value_transpose_tmp, o, params, sq, sk, head_dim); + free(value_transpose_tmp); + } else { + q0k1_softmax_v1_fp16(q, k, v, o, params, sq, sk, head_dim); + } + } + } else { + for (int i = 0; i < batch * np; i++) { + __fp16 *q = query_data + i * sq * head_dim; + __fp16 *k = key_data + i * sk * head_dim; + __fp16 *v = value_data + i * sk * head_dim; + __fp16 *o = output_data + i * sq * head_dim; + if 
(params->transpose_v == 0) { + __fp16 *value_transpose_tmp = malloc(sk * head_dim * sizeof(__fp16)); + trans_q_0132_fp16(v, value_transpose_tmp, sk, head_dim); + q0k1_softmax_v1_fp16(q, k, value_transpose_tmp, o, params, sq, sk, head_dim); + free(value_transpose_tmp); + } else { + q0k1_softmax_v1_fp16(q, k, v, o, params, sq, sk, head_dim); + } + } + } + return CSINN_TRUE; +} + +static inline void trans_q_0132_fp16(__fp16 *src, __fp16 *dst, int sv, int head_dim) +{ + for (int i = 0; i < sv; i++) { + int size = head_dim; + __fp16 *d_ptr = dst + i; + while (size > 0) { + int vl = vsetvl_e16m4(size); + vfloat16m4_t _in = vle16_v_f16m4(src, vl); + src += vl; + vsse16_v_f16m4(d_ptr, sv * sizeof(__fp16), _in, vl); + d_ptr += vl * sv; + size -= vl; + } + } + dst += head_dim * sv; +} + +static inline void qk_t1_dot_4x4_fp16(__fp16 *dst, const __fp16 *sa, const __fp16 *sb, int M, int K, + int N, int lda, int ldb, int ldc) +{ + int i = 0; + for (; i + 3 < M; i += 4) { + const __fp16 *sa_ptr = sa + i * lda; + int j = 0; + for (; j + 3 < N; j += 4) { + const __fp16 *a0_ptr = sa_ptr; + const __fp16 *a1_ptr = sa_ptr + 1 * lda; + const __fp16 *a2_ptr = sa_ptr + 2 * lda; + const __fp16 *a3_ptr = sa_ptr + 3 * lda; + const __fp16 *b0_ptr = sb + j * ldb; + const __fp16 *b1_ptr = b0_ptr + 1 * ldb; + const __fp16 *b2_ptr = b0_ptr + 2 * ldb; + const __fp16 *b3_ptr = b0_ptr + 3 * ldb; + + int vlmax = vsetvl_e16m1(csrr_vlenb() / sizeof(__fp16)); + // dst[m, 0] + vfloat16m1_t _acc00 = vfmv_v_f_f16m1(0.0f, vlmax); + vfloat16m1_t _acc10 = vmv_v_v_f16m1(_acc00, vlmax); + vfloat16m1_t _acc20 = vmv_v_v_f16m1(_acc00, vlmax); + vfloat16m1_t _acc30 = vmv_v_v_f16m1(_acc00, vlmax); + // dst[m, 1] + vfloat16m1_t _acc01 = vmv_v_v_f16m1(_acc00, vlmax); + vfloat16m1_t _acc11 = vmv_v_v_f16m1(_acc01, vlmax); + vfloat16m1_t _acc21 = vmv_v_v_f16m1(_acc01, vlmax); + vfloat16m1_t _acc31 = vmv_v_v_f16m1(_acc01, vlmax); + // dst[m, 2] + vfloat16m1_t _acc02 = vmv_v_v_f16m1(_acc00, vlmax); + vfloat16m1_t _acc12 = 
vmv_v_v_f16m1(_acc02, vlmax); + vfloat16m1_t _acc22 = vmv_v_v_f16m1(_acc02, vlmax); + vfloat16m1_t _acc32 = vmv_v_v_f16m1(_acc02, vlmax); + // dst[m, 3] + vfloat16m1_t _acc03 = vmv_v_v_f16m1(_acc00, vlmax); + vfloat16m1_t _acc13 = vmv_v_v_f16m1(_acc03, vlmax); + vfloat16m1_t _acc23 = vmv_v_v_f16m1(_acc03, vlmax); + vfloat16m1_t _acc33 = vmv_v_v_f16m1(_acc03, vlmax); + + int c = 0; + while (c < K) { + int vl = vsetvl_e16m1(K - c); + vfloat16m1_t _a0 = vle16_v_f16m1(a0_ptr + c, vl); + vfloat16m1_t _a1 = vle16_v_f16m1(a1_ptr + c, vl); + vfloat16m1_t _a2 = vle16_v_f16m1(a2_ptr + c, vl); + vfloat16m1_t _a3 = vle16_v_f16m1(a3_ptr + c, vl); + vfloat16m1_t _b0 = vle16_v_f16m1(b0_ptr + c, vl); + vfloat16m1_t _b1 = vle16_v_f16m1(b1_ptr + c, vl); + vfloat16m1_t _b2 = vle16_v_f16m1(b2_ptr + c, vl); + vfloat16m1_t _b3 = vle16_v_f16m1(b3_ptr + c, vl); + + _acc00 = vfmacc_vv_f16m1(_acc00, _a0, _b0, vlmax); + _acc10 = vfmacc_vv_f16m1(_acc10, _a1, _b0, vlmax); + _acc20 = vfmacc_vv_f16m1(_acc20, _a2, _b0, vlmax); + _acc30 = vfmacc_vv_f16m1(_acc30, _a3, _b0, vlmax); + + _acc01 = vfmacc_vv_f16m1(_acc01, _a0, _b1, vlmax); + _acc11 = vfmacc_vv_f16m1(_acc11, _a1, _b1, vlmax); + _acc21 = vfmacc_vv_f16m1(_acc21, _a2, _b1, vlmax); + _acc31 = vfmacc_vv_f16m1(_acc31, _a3, _b1, vlmax); + + _acc02 = vfmacc_vv_f16m1(_acc02, _a0, _b2, vlmax); + _acc12 = vfmacc_vv_f16m1(_acc12, _a1, _b2, vlmax); + _acc22 = vfmacc_vv_f16m1(_acc22, _a2, _b2, vlmax); + _acc32 = vfmacc_vv_f16m1(_acc32, _a3, _b2, vlmax); + + _acc03 = vfmacc_vv_f16m1(_acc03, _a0, _b3, vlmax); + _acc13 = vfmacc_vv_f16m1(_acc13, _a1, _b3, vlmax); + _acc23 = vfmacc_vv_f16m1(_acc23, _a2, _b3, vlmax); + _acc33 = vfmacc_vv_f16m1(_acc33, _a3, _b3, vlmax); + c += vl; + } + + int idx00 = (i + 0) * ldc + (j + 0); + int idx10 = (i + 1) * ldc + (j + 0); + int idx20 = (i + 2) * ldc + (j + 0); + int idx30 = (i + 3) * ldc + (j + 0); + + int idx01 = (i + 0) * ldc + (j + 1); + int idx11 = (i + 1) * ldc + (j + 1); + int idx21 = (i + 2) * ldc + (j + 1); + 
int idx31 = (i + 3) * ldc + (j + 1); + + int idx02 = (i + 0) * ldc + (j + 2); + int idx12 = (i + 1) * ldc + (j + 2); + int idx22 = (i + 2) * ldc + (j + 2); + int idx32 = (i + 3) * ldc + (j + 2); + + int idx03 = (i + 0) * ldc + (j + 3); + int idx13 = (i + 1) * ldc + (j + 3); + int idx23 = (i + 2) * ldc + (j + 3); + int idx33 = (i + 3) * ldc + (j + 3); + + // dst[m, 0] + vfloat16m1_t _sum00; + vfloat16m1_t _sum10; + vfloat16m1_t _sum20; + vfloat16m1_t _sum30; + // dst[m, 1] + vfloat16m1_t _sum01; + vfloat16m1_t _sum11; + vfloat16m1_t _sum21; + vfloat16m1_t _sum31; + // dst[m, 2] + vfloat16m1_t _sum02; + vfloat16m1_t _sum12; + vfloat16m1_t _sum22; + vfloat16m1_t _sum32; + // dst[m, 3] + vfloat16m1_t _sum03; + vfloat16m1_t _sum13; + vfloat16m1_t _sum23; + vfloat16m1_t _sum33; + + _sum00 = vfmv_v_f_f16m1(dst[idx00], 1); + _sum10 = vfmv_v_f_f16m1(dst[idx10], 1); + _sum20 = vfmv_v_f_f16m1(dst[idx20], 1); + _sum30 = vfmv_v_f_f16m1(dst[idx30], 1); + + _sum01 = vfmv_v_f_f16m1(dst[idx01], 1); + _sum11 = vfmv_v_f_f16m1(dst[idx11], 1); + _sum21 = vfmv_v_f_f16m1(dst[idx21], 1); + _sum31 = vfmv_v_f_f16m1(dst[idx31], 1); + + _sum02 = vfmv_v_f_f16m1(dst[idx02], 1); + _sum12 = vfmv_v_f_f16m1(dst[idx12], 1); + _sum22 = vfmv_v_f_f16m1(dst[idx22], 1); + _sum32 = vfmv_v_f_f16m1(dst[idx32], 1); + + _sum03 = vfmv_v_f_f16m1(dst[idx03], 1); + _sum13 = vfmv_v_f_f16m1(dst[idx13], 1); + _sum23 = vfmv_v_f_f16m1(dst[idx23], 1); + _sum33 = vfmv_v_f_f16m1(dst[idx33], 1); + + _sum00 = vfredosum_vs_f16m1_f16m1(vundefined_f16m1(), _acc00, _sum00, vlmax); + _sum10 = vfredosum_vs_f16m1_f16m1(vundefined_f16m1(), _acc10, _sum10, vlmax); + _sum20 = vfredosum_vs_f16m1_f16m1(vundefined_f16m1(), _acc20, _sum20, vlmax); + _sum30 = vfredosum_vs_f16m1_f16m1(vundefined_f16m1(), _acc30, _sum30, vlmax); + + _sum01 = vfredosum_vs_f16m1_f16m1(vundefined_f16m1(), _acc01, _sum01, vlmax); + _sum11 = vfredosum_vs_f16m1_f16m1(vundefined_f16m1(), _acc11, _sum11, vlmax); + _sum21 = 
vfredosum_vs_f16m1_f16m1(vundefined_f16m1(), _acc21, _sum21, vlmax); + _sum31 = vfredosum_vs_f16m1_f16m1(vundefined_f16m1(), _acc31, _sum31, vlmax); + + _sum02 = vfredosum_vs_f16m1_f16m1(vundefined_f16m1(), _acc02, _sum02, vlmax); + _sum12 = vfredosum_vs_f16m1_f16m1(vundefined_f16m1(), _acc12, _sum12, vlmax); + _sum22 = vfredosum_vs_f16m1_f16m1(vundefined_f16m1(), _acc22, _sum22, vlmax); + _sum32 = vfredosum_vs_f16m1_f16m1(vundefined_f16m1(), _acc32, _sum32, vlmax); + + _sum03 = vfredosum_vs_f16m1_f16m1(vundefined_f16m1(), _acc03, _sum03, vlmax); + _sum13 = vfredosum_vs_f16m1_f16m1(vundefined_f16m1(), _acc13, _sum13, vlmax); + _sum23 = vfredosum_vs_f16m1_f16m1(vundefined_f16m1(), _acc23, _sum23, vlmax); + _sum33 = vfredosum_vs_f16m1_f16m1(vundefined_f16m1(), _acc33, _sum33, vlmax); + + dst[idx00] = vfmv_f_s_f16m1_f16(_sum00); + dst[idx10] = vfmv_f_s_f16m1_f16(_sum10); + dst[idx20] = vfmv_f_s_f16m1_f16(_sum20); + dst[idx30] = vfmv_f_s_f16m1_f16(_sum30); + + dst[idx01] = vfmv_f_s_f16m1_f16(_sum01); + dst[idx11] = vfmv_f_s_f16m1_f16(_sum11); + dst[idx21] = vfmv_f_s_f16m1_f16(_sum21); + dst[idx31] = vfmv_f_s_f16m1_f16(_sum31); + + dst[idx02] = vfmv_f_s_f16m1_f16(_sum02); + dst[idx12] = vfmv_f_s_f16m1_f16(_sum12); + dst[idx22] = vfmv_f_s_f16m1_f16(_sum22); + dst[idx32] = vfmv_f_s_f16m1_f16(_sum32); + + dst[idx03] = vfmv_f_s_f16m1_f16(_sum03); + dst[idx13] = vfmv_f_s_f16m1_f16(_sum13); + dst[idx23] = vfmv_f_s_f16m1_f16(_sum23); + dst[idx33] = vfmv_f_s_f16m1_f16(_sum33); + } + for (; j < N; j++) { + const __fp16 *a0_ptr = sa_ptr; + const __fp16 *a1_ptr = sa_ptr + 1 * lda; + const __fp16 *a2_ptr = sa_ptr + 2 * lda; + const __fp16 *a3_ptr = sa_ptr + 3 * lda; + const __fp16 *b0_ptr = sb + j * ldb; + + int vlmax = vsetvl_e16m1(csrr_vlenb() / sizeof(__fp16)); + // dst[m, 0] + vfloat16m1_t _acc00 = vfmv_v_f_f16m1(0.0f, vlmax); + vfloat16m1_t _acc10 = vmv_v_v_f16m1(_acc00, vlmax); + vfloat16m1_t _acc20 = vmv_v_v_f16m1(_acc00, vlmax); + vfloat16m1_t _acc30 = vmv_v_v_f16m1(_acc00, 
vlmax); + + int c = 0; + while (c < K) { + int vl = vsetvl_e16m1(K - c); + vfloat16m1_t _a0 = vle16_v_f16m1(a0_ptr + c, vl); + vfloat16m1_t _a1 = vle16_v_f16m1(a1_ptr + c, vl); + vfloat16m1_t _a2 = vle16_v_f16m1(a2_ptr + c, vl); + vfloat16m1_t _a3 = vle16_v_f16m1(a3_ptr + c, vl); + vfloat16m1_t _b0 = vle16_v_f16m1(b0_ptr + c, vl); + + _acc00 = vfmacc_vv_f16m1(_acc00, _a0, _b0, vlmax); + _acc10 = vfmacc_vv_f16m1(_acc10, _a1, _b0, vlmax); + _acc20 = vfmacc_vv_f16m1(_acc20, _a2, _b0, vlmax); + _acc30 = vfmacc_vv_f16m1(_acc30, _a3, _b0, vlmax); + c += vl; + } + + int idx00 = (i + 0) * ldc + (j + 0); + int idx10 = (i + 1) * ldc + (j + 0); + int idx20 = (i + 2) * ldc + (j + 0); + int idx30 = (i + 3) * ldc + (j + 0); + + // dst[m, 0] + vfloat16m1_t _sum00; + vfloat16m1_t _sum10; + vfloat16m1_t _sum20; + vfloat16m1_t _sum30; + + _sum00 = vfmv_v_f_f16m1(dst[idx00], 1); + _sum10 = vfmv_v_f_f16m1(dst[idx10], 1); + _sum20 = vfmv_v_f_f16m1(dst[idx20], 1); + _sum30 = vfmv_v_f_f16m1(dst[idx30], 1); + + _sum00 = vfredosum_vs_f16m1_f16m1(vundefined_f16m1(), _acc00, _sum00, vlmax); + _sum10 = vfredosum_vs_f16m1_f16m1(vundefined_f16m1(), _acc10, _sum10, vlmax); + _sum20 = vfredosum_vs_f16m1_f16m1(vundefined_f16m1(), _acc20, _sum20, vlmax); + _sum30 = vfredosum_vs_f16m1_f16m1(vundefined_f16m1(), _acc30, _sum30, vlmax); + + dst[idx00] = vfmv_f_s_f16m1_f16(_sum00); + dst[idx10] = vfmv_f_s_f16m1_f16(_sum10); + dst[idx20] = vfmv_f_s_f16m1_f16(_sum20); + dst[idx30] = vfmv_f_s_f16m1_f16(_sum30); + } + } + for (; i < M; i += 1) { + const __fp16 *sa_ptr = sa + i * lda; + int j = 0; + for (; j + 3 < N; j += 4) { + const __fp16 *a0_ptr = sa_ptr; + const __fp16 *b0_ptr = sb + j * ldb; + const __fp16 *b1_ptr = b0_ptr + 1 * ldb; + const __fp16 *b2_ptr = b0_ptr + 2 * ldb; + const __fp16 *b3_ptr = b0_ptr + 3 * ldb; + + int vlmax = vsetvl_e16m1(csrr_vlenb() / sizeof(__fp16)); + // dst[0, n] + vfloat16m1_t _acc00 = vfmv_v_f_f16m1(0.0f, vlmax); + vfloat16m1_t _acc01 = vmv_v_v_f16m1(_acc00, vlmax); + 
vfloat16m1_t _acc02 = vmv_v_v_f16m1(_acc00, vlmax); + vfloat16m1_t _acc03 = vmv_v_v_f16m1(_acc00, vlmax); + + int c = 0; + while (c < K) { + int vl = vsetvl_e16m1(K - c); + vfloat16m1_t _a0 = vle16_v_f16m1(a0_ptr + c, vl); + vfloat16m1_t _b0 = vle16_v_f16m1(b0_ptr + c, vl); + vfloat16m1_t _b1 = vle16_v_f16m1(b1_ptr + c, vl); + vfloat16m1_t _b2 = vle16_v_f16m1(b2_ptr + c, vl); + vfloat16m1_t _b3 = vle16_v_f16m1(b3_ptr + c, vl); + + _acc00 = vfmacc_vv_f16m1(_acc00, _a0, _b0, vlmax); + _acc01 = vfmacc_vv_f16m1(_acc01, _a0, _b1, vlmax); + _acc02 = vfmacc_vv_f16m1(_acc02, _a0, _b2, vlmax); + _acc03 = vfmacc_vv_f16m1(_acc03, _a0, _b3, vlmax); + c += vl; + } + + int idx00 = (i + 0) * ldc + (j + 0); + int idx01 = (i + 0) * ldc + (j + 1); + int idx02 = (i + 0) * ldc + (j + 2); + int idx03 = (i + 0) * ldc + (j + 3); + + // dst[0, n] + vfloat16m1_t _sum00; + vfloat16m1_t _sum01; + vfloat16m1_t _sum02; + vfloat16m1_t _sum03; + + // _sum00 = vfmv_v_f_f16m1(dst[idx00], 1); + // _sum01 = vfmv_v_f_f16m1(dst[idx01], 1); + // _sum02 = vfmv_v_f_f16m1(dst[idx02], 1); + // _sum03 = vfmv_v_f_f16m1(dst[idx03], 1); + _sum00 = vfmv_v_f_f16m1(0.0f, 1); + _sum01 = vfmv_v_f_f16m1(0.0f, 1); + _sum02 = vfmv_v_f_f16m1(0.0f, 1); + _sum03 = vfmv_v_f_f16m1(0.0f, 1); + + _sum00 = vfredosum_vs_f16m1_f16m1(vundefined_f16m1(), _acc00, _sum00, vlmax); + _sum01 = vfredosum_vs_f16m1_f16m1(vundefined_f16m1(), _acc01, _sum01, vlmax); + _sum02 = vfredosum_vs_f16m1_f16m1(vundefined_f16m1(), _acc02, _sum02, vlmax); + _sum03 = vfredosum_vs_f16m1_f16m1(vundefined_f16m1(), _acc03, _sum03, vlmax); + + dst[idx00] = vfmv_f_s_f16m1_f16(_sum00); + dst[idx01] = vfmv_f_s_f16m1_f16(_sum01); + dst[idx02] = vfmv_f_s_f16m1_f16(_sum02); + dst[idx03] = vfmv_f_s_f16m1_f16(_sum03); + } + for (; j < N; j++) { + const __fp16 *a0_ptr = sa_ptr; + const __fp16 *b0_ptr = sb + j * ldb; + + int vlmax = vsetvl_e16m1(csrr_vlenb() / sizeof(__fp16)); + // dst[0, 0] + vfloat16m1_t _acc00 = vfmv_v_f_f16m1(0.0f, vlmax); + + int c = 0; + while 
(c < K) { + int vl = vsetvl_e16m1(K - c); + vfloat16m1_t _a0 = vle16_v_f16m1(a0_ptr + c, vl); + vfloat16m1_t _b0 = vle16_v_f16m1(b0_ptr + c, vl); + _acc00 = vfmacc_vv_f16m1(_acc00, _a0, _b0, vlmax); + c += vl; + } + + int idx00 = (i + 0) * ldc + (j + 0); + + // dst[0, 0] + vfloat16m1_t _sum00; + + _sum00 = vfmv_v_f_f16m1(dst[idx00], 1); + + _sum00 = vfredosum_vs_f16m1_f16m1(vundefined_f16m1(), _acc00, _sum00, vlmax); + dst[idx00] = vfmv_f_s_f16m1_f16(_sum00); + } + } +} + +/** + * for llm + * if prefill: q [batch,np,sq,dim_head] + * k [batch,np,sk,dim_head] + * v [batch,np,dim_head,sv] + * sq = sk =sv > 1 + * if decoder: q [batch,np,sq,dim_head] + * k [batch,np,sk,dim_head] + * v [batch,np,dim_head,sv] + * sq = 1, sk = sv > 1 + * + */ +static void q0k1_softmax_v1_fp16(__fp16 *q, __fp16 *k, __fp16 *v, __fp16 *o, + struct csinn_scale_dot_attention_params *params, int32_t sq, + int32_t sk, int32_t head_dim) +{ + __fp16 norm_factor = 1.0f / params->norm_factor; // sqrt(128) + size_t matmul_res_size = sq * sk * sizeof(__fp16); + __fp16 *matmul_res_data = malloc(matmul_res_size); + memset(matmul_res_data, 0, matmul_res_size); + if (sq > 1) { + const __fp16 *q_in = q; + const __fp16 *k_in = k; + const __fp16 *v_in = v; + for (int i = 0; i < sq; i++) { + __fp16 max = -65504; + __fp16 acc_exp = 0.0f; + int casual_cnt = sk; + if (params->casual) casual_cnt = i + 1 + (sk - sq); + const __fp16 *q_ptr = q_in + i * head_dim; + int j = 0; + const int stride = 4; + int m1_vl = vsetvl_e16m1(csrr_vlenb() / sizeof(__fp16)); + ; + vfloat16m1_t _max = vfmv_v_f_f16m1(max, m1_vl); + for (; j + stride - 1 < casual_cnt; j += stride) { + const __fp16 *k_ptr0 = k_in + j * head_dim; + const __fp16 *k_ptr1 = k_in + (j + 1) * head_dim; + const __fp16 *k_ptr2 = k_in + (j + 2) * head_dim; + const __fp16 *k_ptr3 = k_in + (j + 3) * head_dim; + int vl = vsetvl_e16m2(csrr_vlenb() / sizeof(__fp16) * 2); + + vfloat16m2_t _acc00 = vfmv_v_f_f16m2(0.0f, vl); + vfloat16m2_t _acc01 = vfmv_v_f_f16m2(0.0f, 
vl); + vfloat16m2_t _acc02 = vfmv_v_f_f16m2(0.0f, vl); + vfloat16m2_t _acc03 = vfmv_v_f_f16m2(0.0f, vl); + vfloat16m1_t _sum00 = vfmv_v_f_f16m1(matmul_res_data[i * sk + j], 1); + vfloat16m1_t _sum01 = vfmv_v_f_f16m1(matmul_res_data[i * sk + j + 1], 1); + vfloat16m1_t _sum02 = vfmv_v_f_f16m1(matmul_res_data[i * sk + j + 2], 1); + vfloat16m1_t _sum03 = vfmv_v_f_f16m1(matmul_res_data[i * sk + j + 3], 1); + + int l = 0; + while (l < head_dim) { + // vlen128 e16m2 = 16 + int vl_ = vsetvl_e16m2(head_dim - l); + vfloat16m2_t _q0 = vle16_v_f16m2(q_ptr + l, vl_); + vfloat16m2_t _k0 = vle16_v_f16m2(k_ptr0 + l, vl_); + vfloat16m2_t _k1 = vle16_v_f16m2(k_ptr1 + l, vl_); + vfloat16m2_t _k2 = vle16_v_f16m2(k_ptr2 + l, vl_); + vfloat16m2_t _k3 = vle16_v_f16m2(k_ptr3 + l, vl_); + + _acc00 = vfmacc_vv_f16m2(_acc00, _q0, _k0, vl); + _acc01 = vfmacc_vv_f16m2(_acc01, _q0, _k1, vl); + _acc02 = vfmacc_vv_f16m2(_acc02, _q0, _k2, vl); + _acc03 = vfmacc_vv_f16m2(_acc03, _q0, _k3, vl); + l += vl_; + } + __fp16 res[stride]; + _sum00 = vfredosum_vs_f16m2_f16m1(vundefined_f16m1(), _acc00, _sum00, vl); + res[0] = vfmv_f_s_f16m1_f16(_sum00); + _sum01 = vfredosum_vs_f16m2_f16m1(vundefined_f16m1(), _acc01, _sum01, vl); + res[1] = vfmv_f_s_f16m1_f16(_sum01); + _sum02 = vfredosum_vs_f16m2_f16m1(vundefined_f16m1(), _acc02, _sum02, vl); + res[2] = vfmv_f_s_f16m1_f16(_sum02); + _sum03 = vfredosum_vs_f16m2_f16m1(vundefined_f16m1(), _acc03, _sum03, vl); + res[3] = vfmv_f_s_f16m1_f16(_sum03); + int min_vl = vsetvl_e16m1(csrr_vlenb() / sizeof(__fp16) / 2); // min_vl = 4 + vfloat16m1_t save = vle16_v_f16m1(res, min_vl); + save = vfmul_vf_f16m1(save, norm_factor, min_vl); + vse16_v_f16m1(matmul_res_data + i * sk + j, save, min_vl); + _max = vfmax_vv_f16m1(save, _max, min_vl); + } + + vfloat16m1_t _min_f = vfmv_v_f_f16m1(max, m1_vl); + vfloat16m1_t _max0 = vfredmax_vs_f16m1_f16m1(vundefined_f16m1(), _max, _min_f, m1_vl); + max = vfmv_f_s_f16m1_f16(_max0); + for (; j < casual_cnt; j++) { + const __fp16 *k_ptr 
= k_in + j * head_dim; + int vl = vsetvl_e16m4(csrr_vlenb() / sizeof(__fp16) * 4); + vfloat16m4_t _acc00 = vfmv_v_f_f16m4(0.0f, vl); + vfloat16m1_t _sum00 = vfmv_v_f_f16m1(matmul_res_data[i * sk + j], 1); + ; + int l = 0; + while (l < head_dim) { + // vlen128 e16m4 = 32 + int vl_ = vsetvl_e16m4(head_dim - l); + vfloat16m4_t _q0 = vle16_v_f16m4(q_ptr + l, vl_); + vfloat16m4_t _k0 = vle16_v_f16m4(k_ptr + l, vl_); + _acc00 = vfmacc_vv_f16m4(_acc00, _q0, _k0, vl); + l += vl_; + } + _sum00 = vfredosum_vs_f16m4_f16m1(vundefined_f16m1(), _acc00, _sum00, vl); + __fp16 res = vfmv_f_s_f16m1_f16(_sum00); + res *= norm_factor; + matmul_res_data[i * sk + j] = res; + max = fmax(max, res); + } + + vfloat16m1_t fred_sum1 = vfmv_v_f_f16m1(0.0f, 1); + __fp16 *res_in = &matmul_res_data[i * sk]; + int vl_m4 = vsetvl_e16m4(csrr_vlenb() / sizeof(__fp16) * 4); + int len = 0; + vfloat16m4_t div_sum0 = vfmv_v_f_f16m4(0.0f, vl_m4); + while (len < casual_cnt) { + int vl_in = vsetvl_e16m4(casual_cnt - len); + vfloat16m4_t _res = vle16_v_f16m4(res_in + len, vl_in); + _res = vfadd_vf_f16m4(_res, -max, vl_in); + _res = exp_ps_vfloat16m4(_res, vl_in); + vse16_v_f16m4(res_in + len, _res, vl_in); + div_sum0 = vfadd_vv_f16m4(div_sum0, _res, vl_m4); + len += vl_in; + } + fred_sum1 = vfredosum_vs_f16m4_f16m1(vundefined_f16m1(), div_sum0, fred_sum1, vl_m4); + acc_exp = vfmv_f_s_f16m1_f16(fred_sum1); + len = 0; + const __fp16 _mul_exp = 1.0f / acc_exp; + while (len < casual_cnt) { + int vl_in = vsetvl_e16m4(casual_cnt - len); + vfloat16m4_t _mul_in = vle16_v_f16m4(res_in + len, vl_in); + vfloat16m4_t _output_data = vfmul_vf_f16m4(_mul_in, _mul_exp, vl_in); + vse16_v_f16m4(res_in + len, _output_data, vl_in); + len += vl_in; + } + } + __fp16 *o_out = o; + const __fp16 *qk_ptr = &matmul_res_data[0]; + const __fp16 *v_ptr = v_in; + int M = sq; + int K = sk; // not casual_cnt + int N = head_dim; + int lda = sk; + int ldb = sk; + int ldc = head_dim; + qk_t1_dot_4x4_fp16(o_out, qk_ptr, v_ptr, M, K, N, lda, 
ldb, ldc); + } else if (sq == 1) { + const __fp16 *q_in = q; + const __fp16 *k_in = k; + const __fp16 *v_in = v; + // for sq, but sq = 1 + __fp16 max = -65504; + __fp16 acc_exp = 0.0f; + int casual_cnt = sk; + if (params->casual) casual_cnt = 1 + (sk - sq); + const __fp16 *q_ptr = q_in; + + { + const int stride = 4; + int m1_vl = vsetvl_e16m1(csrr_vlenb() / sizeof(__fp16)); + vfloat16m1_t _max = vfmv_v_f_f16m1(max, m1_vl); + int j = 0; + for (; j + stride - 1 < casual_cnt; j += stride) { + const __fp16 *k_ptr0 = k_in + j * head_dim; + const __fp16 *k_ptr1 = k_in + (j + 1) * head_dim; + const __fp16 *k_ptr2 = k_in + (j + 2) * head_dim; + const __fp16 *k_ptr3 = k_in + (j + 3) * head_dim; + int vl = vsetvl_e16m2(csrr_vlenb() / sizeof(__fp16) * 2); + + vfloat16m2_t _acc00 = vfmv_v_f_f16m2(0.0f, vl); + vfloat16m2_t _acc01 = vfmv_v_f_f16m2(0.0f, vl); + vfloat16m2_t _acc02 = vfmv_v_f_f16m2(0.0f, vl); + vfloat16m2_t _acc03 = vfmv_v_f_f16m2(0.0f, vl); + vfloat16m1_t _sum00 = vfmv_v_f_f16m1(matmul_res_data[j], 1); + vfloat16m1_t _sum01 = vfmv_v_f_f16m1(matmul_res_data[j + 1], 1); + vfloat16m1_t _sum02 = vfmv_v_f_f16m1(matmul_res_data[j + 2], 1); + vfloat16m1_t _sum03 = vfmv_v_f_f16m1(matmul_res_data[j + 3], 1); + + int l = 0; + while (l < head_dim) { + // vlen128 e16m2 = 16 + int vl_ = vsetvl_e16m2(head_dim - l); + vfloat16m2_t _q0 = vle16_v_f16m2(q_ptr + l, vl_); + vfloat16m2_t _k0 = vle16_v_f16m2(k_ptr0 + l, vl_); + vfloat16m2_t _k1 = vle16_v_f16m2(k_ptr1 + l, vl_); + vfloat16m2_t _k2 = vle16_v_f16m2(k_ptr2 + l, vl_); + vfloat16m2_t _k3 = vle16_v_f16m2(k_ptr3 + l, vl_); + + _acc00 = vfmacc_vv_f16m2(_acc00, _q0, _k0, vl); + _acc01 = vfmacc_vv_f16m2(_acc01, _q0, _k1, vl); + _acc02 = vfmacc_vv_f16m2(_acc02, _q0, _k2, vl); + _acc03 = vfmacc_vv_f16m2(_acc03, _q0, _k3, vl); + l += vl_; + } + __fp16 res[stride]; + _sum00 = vfredosum_vs_f16m2_f16m1(vundefined_f16m1(), _acc00, _sum00, vl); + res[0] = vfmv_f_s_f16m1_f16(_sum00); + _sum01 = 
vfredosum_vs_f16m2_f16m1(vundefined_f16m1(), _acc01, _sum01, vl); + res[1] = vfmv_f_s_f16m1_f16(_sum01); + _sum02 = vfredosum_vs_f16m2_f16m1(vundefined_f16m1(), _acc02, _sum02, vl); + res[2] = vfmv_f_s_f16m1_f16(_sum02); + _sum03 = vfredosum_vs_f16m2_f16m1(vundefined_f16m1(), _acc03, _sum03, vl); + res[3] = vfmv_f_s_f16m1_f16(_sum03); + int min_vl = vsetvl_e16m1(csrr_vlenb() / sizeof(__fp16) / 2); // min_vl = 4 + vfloat16m1_t save = vle16_v_f16m1(res, min_vl); + save = vfmul_vf_f16m1(save, norm_factor, min_vl); + vse16_v_f16m1(matmul_res_data + j, save, min_vl); + _max = vfmax_vv_f16m1(save, _max, min_vl); + } + + vfloat16m1_t _min_f = vfmv_v_f_f16m1(max, m1_vl); + vfloat16m1_t _max0 = vfredmax_vs_f16m1_f16m1(vundefined_f16m1(), _max, _min_f, m1_vl); + max = vfmv_f_s_f16m1_f16(_max0); + + for (; j < casual_cnt; j++) { + const __fp16 *k_ptr = k_in + j * head_dim; + int vl = vsetvl_e16m4(csrr_vlenb() / sizeof(__fp16) * 4); + vfloat16m4_t _acc00 = vfmv_v_f_f16m4(0.0f, vl); + vfloat16m1_t _sum00 = vfmv_v_f_f16m1(matmul_res_data[j], 1); + ; + int l = 0; + while (l < head_dim) { + // vlen128 e16m4=32 + int vl_ = vsetvl_e16m4(head_dim - l); + vfloat16m4_t _q0 = vle16_v_f16m4(q_ptr + l, vl_); + vfloat16m4_t _k0 = vle16_v_f16m4(k_ptr + l, vl_); + _acc00 = vfmacc_vv_f16m4(_acc00, _q0, _k0, vl); + l += vl_; + } + + _sum00 = vfredosum_vs_f16m4_f16m1(vundefined_f16m1(), _acc00, _sum00, vl); + __fp16 res = vfmv_f_s_f16m1_f16(_sum00); + res *= norm_factor; + matmul_res_data[j] = res; + max = fmax(max, res); + } + + vfloat16m1_t fred_sum1 = vfmv_v_f_f16m1(0.0f, 1); + int len = 0; + __fp16 *res_in = &matmul_res_data[0]; + int vl_m4 = vsetvl_e16m4(32); + vfloat16m4_t div_sum0 = vfmv_v_f_f16m4(0.0f, vl_m4); + while (len < casual_cnt) { + int vl_in = vsetvl_e16m4(casual_cnt - len); + vfloat16m4_t _res = vle16_v_f16m4(res_in + len, vl_in); + _res = vfadd_vf_f16m4(_res, -max, vl_in); + _res = exp_ps_vfloat16m4(_res, vl_in); + vse16_v_f16m4(res_in + len, _res, vl_in); + div_sum0 = 
vfadd_vv_f16m4(div_sum0, _res, vl_m4); + len += vl_in; + } + fred_sum1 = vfredosum_vs_f16m4_f16m1(vundefined_f16m1(), div_sum0, fred_sum1, vl_m4); + acc_exp = vfmv_f_s_f16m1_f16(fred_sum1); + } + + { + __fp16 *o_out = o; // for sq = 1 + const __fp16 *_in = matmul_res_data; + const __fp16 _mul_exp = 1.0f / acc_exp; + const int stride = 4; + int m1_vl = vsetvl_e16m1(csrr_vlenb() / sizeof(__fp16)); + ; + int dim = 0; + for (; dim + stride - 1 < head_dim; dim += stride) { + const __fp16 *v_ptr0 = v_in + dim * sk; + const __fp16 *v_ptr1 = v_in + (dim + 1) * sk; + const __fp16 *v_ptr2 = v_in + (dim + 2) * sk; + const __fp16 *v_ptr3 = v_in + (dim + 3) * sk; + int vl = vsetvl_e16m2(csrr_vlenb() / sizeof(__fp16) * 2); + + vfloat16m2_t _acc00 = vfmv_v_f_f16m2(0.0f, vl); + vfloat16m2_t _acc01 = vfmv_v_f_f16m2(0.0f, vl); + vfloat16m2_t _acc02 = vfmv_v_f_f16m2(0.0f, vl); + vfloat16m2_t _acc03 = vfmv_v_f_f16m2(0.0f, vl); + vfloat16m1_t _sum00 = vfmv_v_f_f16m1(0.0f, 1); + vfloat16m1_t _sum01 = vfmv_v_f_f16m1(0.0f, 1); + vfloat16m1_t _sum02 = vfmv_v_f_f16m1(0.0f, 1); + vfloat16m1_t _sum03 = vfmv_v_f_f16m1(0.0f, 1); + + int j = 0; + while (j < casual_cnt) { + // vlen128 e16m2 = 8 + int vl_v = vsetvl_e16m2(casual_cnt - j); + vfloat16m2_t _in0 = vle16_v_f16m2(_in + j, vl_v); + vfloat16m2_t _v0 = vle16_v_f16m2(v_ptr0 + j, vl_v); + vfloat16m2_t _v1 = vle16_v_f16m2(v_ptr1 + j, vl_v); + vfloat16m2_t _v2 = vle16_v_f16m2(v_ptr2 + j, vl_v); + vfloat16m2_t _v3 = vle16_v_f16m2(v_ptr3 + j, vl_v); + + _acc00 = vfmacc_vv_f16m2(_acc00, _in0, _v0, vl); + _acc01 = vfmacc_vv_f16m2(_acc01, _in0, _v1, vl); + _acc02 = vfmacc_vv_f16m2(_acc02, _in0, _v2, vl); + _acc03 = vfmacc_vv_f16m2(_acc03, _in0, _v3, vl); + j += vl_v; + } + __fp16 res[stride]; + _sum00 = vfredosum_vs_f16m2_f16m1(vundefined_f16m1(), _acc00, _sum00, vl); + res[0] = vfmv_f_s_f16m1_f16(_sum00); + _sum01 = vfredosum_vs_f16m2_f16m1(vundefined_f16m1(), _acc01, _sum01, vl); + res[1] = vfmv_f_s_f16m1_f16(_sum01); + _sum02 = 
vfredosum_vs_f16m2_f16m1(vundefined_f16m1(), _acc02, _sum02, vl); + res[2] = vfmv_f_s_f16m1_f16(_sum02); + _sum03 = vfredosum_vs_f16m2_f16m1(vundefined_f16m1(), _acc03, _sum03, vl); + res[3] = vfmv_f_s_f16m1_f16(_sum03); + vfloat16m1_t save = vle16_v_f16m1(res, m1_vl); + save = vfmul_vf_f16m1(save, _mul_exp, m1_vl); + vse16_v_f16m1(o_out + dim, save, m1_vl); + } + for (; dim < head_dim; dim++) { + int j = 0; + int vl_size = vsetvl_e16m4(csrr_vlenb() / sizeof(__fp16) * 4); + vfloat16m4_t _acc_o0 = vfmv_v_f_f16m4(0.0f, vl_size); + vfloat16m1_t _out; + while (j < casual_cnt) { + const __fp16 *res_in = matmul_res_data + j; + int vl_v = vsetvl_e16m4(casual_cnt - j); + vfloat16m4_t _r0 = vle16_v_f16m4(res_in, vl_v); + const __fp16 *v_ptr = v_in + dim * sk + j; + vfloat16m4_t _v0 = vle16_v_f16m4(v_ptr, vl_v); + _acc_o0 = vfmacc_vv_f16m4(_acc_o0, _r0, _v0, vl_size); + j += vl_v; + } + _out = vfmv_v_f_f16m1(0.0f, 1); // colud not be blocked + _out = vfredosum_vs_f16m4_f16m1(vundefined_f16m1(), _acc_o0, _out, vl_size); + o_out[dim] = vfmv_f_s_f16m1_f16(_out); + o_out[dim] = o_out[dim] / acc_exp; + } + } + } + free(matmul_res_data); +} \ No newline at end of file diff --git a/source/thead_rvv/fp16/softmax.c b/source/thead_rvv/fp16/softmax.c index 14b13388..91bd049b 100644 --- a/source/thead_rvv/fp16/softmax.c +++ b/source/thead_rvv/fp16/softmax.c @@ -112,10 +112,11 @@ int shl_rvv_softmax_fp16(struct csinn_tensor *input, struct csinn_tensor *output ptr = exp_buffer + k; ptr2 = output_data + k; int n = cnt; + __fp16 acc_axp_multi_coeff = 1.0f / acc_exp; while (n > 0) { size_t vl = vsetvl_e16m2(n); vfloat16m2_t _exp = vlse16_v_f16m2(ptr, inner_size * sizeof(__fp16), vl); - vfloat16m2_t _output_data = vfdiv_vf_f16m2(_exp, acc_exp, vl); + vfloat16m2_t _output_data = vfmul_vf_f16m2(_exp, acc_axp_multi_coeff, vl); vsse16_v_f16m2(ptr2, inner_size * sizeof(__fp16), _output_data, vl); ptr += vl * inner_size; diff --git a/source/thead_rvv/fp16/strided_slice.c 
b/source/thead_rvv/fp16/strided_slice.c index cf6a6475..dbd55a50 100644 --- a/source/thead_rvv/fp16/strided_slice.c +++ b/source/thead_rvv/fp16/strided_slice.c @@ -187,6 +187,9 @@ int shl_rvv_strided_slice_fp16(struct csinn_tensor *input, struct csinn_tensor * output_data += vl; size -= vl; } + if (cur == 0) { + break; + } cur -= 1; idx[cur] += stride[cur]; } else { diff --git a/source/thead_rvv/fp16/transpose.c b/source/thead_rvv/fp16/transpose.c index 39f6a48b..c1ff62ad 100644 --- a/source/thead_rvv/fp16/transpose.c +++ b/source/thead_rvv/fp16/transpose.c @@ -69,6 +69,9 @@ static int transpose_tail_coincide_fp16(struct csinn_tensor *input, struct csinn dst += vl; i += vl; } + if (d == 0) { + break; + } d -= 1; idx[d] += 1; } else { diff --git a/source/thead_rvv/fp32/avgpool.c b/source/thead_rvv/fp32/avgpool.c index 9c683960..f59a1d95 100644 --- a/source/thead_rvv/fp32/avgpool.c +++ b/source/thead_rvv/fp32/avgpool.c @@ -51,69 +51,82 @@ int shl_rvv_avgpool2d_init_fp32(struct csinn_tensor *input, struct csinn_tensor elempack = in_c % packn == 0 ? packn : 1; } - // global avgpool2d - if (in_h == kernel_h && in_w == kernel_w) { - cb->exec = (elempack % packn == 0) ? shl_rvv_global_avgpool2d_packn_fp32 - : shl_rvv_global_avgpool2d_fp32; - return CSINN_TRUE; - } + if (input->layout == CSINN_LAYOUT_NCHW) { + // global avgpool2d + if (in_h == kernel_h && in_w == kernel_w) { + cb->exec = (elempack % packn == 0) ? shl_rvv_global_avgpool2d_packn_fp32 + : shl_rvv_global_avgpool2d_fp32; + return CSINN_TRUE; + } - if (elempack % packn == 0) { - cb->exec = shl_rvv_avgpool_packn_fp32; - } else { - if (stride_h == 2 && stride_w == 2) { - if (kernel_h == 2 && kernel_w == 2) { - if (pad_left == 0 && pad_top == 0) { - // adjust pad according to ceil_mode (ceil mode on caffe pytorch..) 
- if (in_h % 2 == 1 && params->ceil_mode == 1) { - if (params->pad_down == 0) params->pad_down++; - } - if (in_w % 2 == 1 && params->ceil_mode == 1) { - if (params->pad_right == 0) params->pad_right++; - } - // end consider ceil_mode 2x2s2p0 - cb->exec = shl_rvv_avgpool2x2s2_fp32; - } else if (pad_left == 1 && pad_top == 1) { - cb->exec = shl_rvv_avgpool2x2s2_p1_fp32; - } - } else if (kernel_h == 3 && kernel_w == 3) { - if (pad_left == 0 && pad_top == 0) { - // adjust pad according to ceil_mode (ceil mode on caffe pytorch..) - if (in_h % 2 == 0 && params->ceil_mode == 1) { - if (params->pad_down == 0) - params->pad_down++; // origin pad_down mast be equal to zero ? - } - if (in_w % 2 == 0 && params->ceil_mode == 1) { - if (params->pad_right == 0) params->pad_right++; + if (elempack % packn == 0) { + cb->exec = shl_rvv_avgpool_packn_fp32; + } else { + if (stride_h == 2 && stride_w == 2) { + if (kernel_h == 2 && kernel_w == 2) { + if (pad_left == 0 && pad_top == 0) { + // adjust pad according to ceil_mode (ceil mode on caffe pytorch..) + if (in_h % 2 == 1 && params->ceil_mode == 1) { + if (params->pad_down == 0) params->pad_down++; + } + if (in_w % 2 == 1 && params->ceil_mode == 1) { + if (params->pad_right == 0) params->pad_right++; + } + // end consider ceil_mode 2x2s2p0 + cb->exec = shl_rvv_avgpool2x2s2_fp32; + } else if (pad_left == 1 && pad_top == 1) { + cb->exec = shl_rvv_avgpool2x2s2_p1_fp32; } - // end consider ceil_mode 3x3s2p0 - cb->exec = shl_rvv_avgpool3x3s2_fp32; - } else if (pad_left == 1 && pad_top == 1) { - if (params->ceil_mode == 0) { - cb->exec = shl_rvv_avgpool3x3s2_p1_fp32; - } else { - if ((in_w % 2 == 0 && pad_right == 1) || (in_h % 2 == 0 && pad_down == 1)) { - cb->exec = shl_ref_avgpool2d_f32; - } else { + } else if (kernel_h == 3 && kernel_w == 3) { + if (pad_left == 0 && pad_top == 0) { + // adjust pad according to ceil_mode (ceil mode on caffe pytorch..) 
+ if (in_h % 2 == 0 && params->ceil_mode == 1) { + if (params->pad_down == 0) + params->pad_down++; // origin pad_down mast be equal to zero ? + } + if (in_w % 2 == 0 && params->ceil_mode == 1) { + if (params->pad_right == 0) params->pad_right++; + } + // end consider ceil_mode 3x3s2p0 + cb->exec = shl_rvv_avgpool3x3s2_fp32; + } else if (pad_left == 1 && pad_top == 1) { + if (params->ceil_mode == 0) { cb->exec = shl_rvv_avgpool3x3s2_p1_fp32; + } else { + if ((in_w % 2 == 0 && pad_right == 1) || + (in_h % 2 == 0 && pad_down == 1)) { + cb->exec = shl_ref_avgpool2d_f32; + } else { + cb->exec = shl_rvv_avgpool3x3s2_p1_fp32; + } } } } - } - } else if (stride_h == 1 && stride_w == 1) { - if (kernel_h == 3 && kernel_w == 3) { - if (pad_left == 1 && pad_top == 1 && pad_right == 1 && pad_down == 1) { - cb->exec = shl_rvv_avgpool3x3s1_p1_fp32; + } else if (stride_h == 1 && stride_w == 1) { + if (kernel_h == 3 && kernel_w == 3) { + if (pad_left == 1 && pad_top == 1 && pad_right == 1 && pad_down == 1) { + cb->exec = shl_rvv_avgpool3x3s1_p1_fp32; + } } } } - if (cb->exec == NULL) { - shl_debug_warning( - "avgpool is not optimized to achieve under this condition on rvv, call reference " - "func replaced.\n"); - cb->exec = shl_ref_avgpool2d_f32; + + } else if (input->layout == CSINN_LAYOUT_NHWC) { + // global avgpool2d + if (in_h == kernel_h && in_w == kernel_w) { + cb->exec = shl_rvv_global_avgpool2d_nhwc_fp32; + return CSINN_TRUE; } + cb->exec = shl_rvv_avgpool_nhwc_fp32; + } + + if (cb->exec == NULL) { + shl_debug_warning( + "avgpool is not optimized to achieve under this condition on rvv, call reference " + "func replaced.\n"); + cb->exec = shl_ref_avgpool2d_f32; } + return CSINN_TRUE; } @@ -140,7 +153,12 @@ int shl_rvv_global_avgpool2d_init_fp32(struct csinn_tensor *input, struct csinn_ elempack = in_c % packn == 0 ? packn : 1; } - cb->exec = (elempack % packn == 0) ? 
shl_rvv_global_avgpool2d_packn_fp32 - : shl_rvv_global_avgpool2d_fp32; + if (input->layout == CSINN_LAYOUT_NCHW) { + cb->exec = (elempack % packn == 0) ? shl_rvv_global_avgpool2d_packn_fp32 + : shl_rvv_global_avgpool2d_fp32; + } else if (input->layout == CSINN_LAYOUT_NHWC) { + cb->exec = shl_rvv_global_avgpool2d_nhwc_fp32; + } + return CSINN_TRUE; } diff --git a/source/thead_rvv/fp32/convolution.c b/source/thead_rvv/fp32/convolution.c index 0f1b00dd..3c5e45c4 100644 --- a/source/thead_rvv/fp32/convolution.c +++ b/source/thead_rvv/fp32/convolution.c @@ -73,17 +73,20 @@ int shl_rvv_conv2d_init_fp32(struct csinn_tensor *input, struct csinn_tensor *ou return CSINN_TRUE; } else { params->conv_extra.conv_mode = CSINN_WINOGRAD; - - // TODO: params->conv_extra.kernel_tm in binary model - struct csinn_tensor *t_kernel = csinn_alloc_tensor(NULL); + if (!binary_model_op_init) { + struct csinn_tensor *t_kernel = csinn_alloc_tensor(NULL); + if ((in_h < 13) && (in_w < 13)) { + shl_rvv_wg_b4f3s1_trans_kernel_packn_fp32(kernel, t_kernel); + } else { + shl_rvv_wg_b6f3s1_trans_kernel_packn_fp32(kernel, t_kernel); + } + params->conv_extra.kernel_tm = t_kernel; + } if ((in_h < 13) && (in_w < 13)) { - shl_rvv_wg_b4f3s1_trans_kernel_packn_fp32(kernel, t_kernel); cb->exec = shl_rvv_wg_b4f3s1_packn_fp32; } else { - shl_rvv_wg_b6f3s1_trans_kernel_packn_fp32(kernel, t_kernel); cb->exec = shl_rvv_wg_b6f3s1_packn_fp32; } - params->conv_extra.kernel_tm = t_kernel; } } else { params->conv_extra.conv_mode = CSINN_GEMM; diff --git a/source/thead_rvv/fp32/convolution_3x3_fp32_packn.c b/source/thead_rvv/fp32/convolution_3x3_fp32_packn.c index d548c2f0..ba2f6fc0 100644 --- a/source/thead_rvv/fp32/convolution_3x3_fp32_packn.c +++ b/source/thead_rvv/fp32/convolution_3x3_fp32_packn.c @@ -918,7 +918,16 @@ void shl_rvv_wg_b4f3s1_trans_kernel_packn_fp32(struct csinn_tensor *src_kernel, {1.0f / 24, -1.0f / 12, 1.0f / 6}, {0.0f, 0.0f, 1.0f}}; + const int packn = csrr_vlenb() / sizeof(float); + const int 
pack2n = packn * 2; csinn_tensor_copy(dst_kernel, src_kernel); + dst_kernel->dim_count = 5; + dst_kernel->dim[0] = outch / packn; + dst_kernel->dim[1] = 6; + dst_kernel->dim[2] = 6; + dst_kernel->dim[3] = inch; + dst_kernel->dim[4] = packn; + dst_kernel->layout = CSINN_LAYOUT_O1HWIO0; for (int p = 0; p < outch; p++) { for (int q = 0; q < inch; q++) { @@ -955,9 +964,6 @@ void shl_rvv_wg_b4f3s1_trans_kernel_packn_fp32(struct csinn_tensor *src_kernel, float *kernel_tm_packn = (float *)shl_mem_alloc(outch / 4 * 36 * inch * 4 * sizeof(float)); dst_kernel->data = kernel_tm_packn; - const int packn = csrr_vlenb() / sizeof(float); - const int pack2n = packn * 2; - int oc = 0; for (; oc + pack2n - 1 < outch; oc += pack2n) { float *g0 = kernel_tm_packn + oc * 36 * inch; @@ -984,6 +990,7 @@ void shl_rvv_wg_b4f3s1_trans_kernel_packn_fp32(struct csinn_tensor *src_kernel, } } } + src_kernel->data = NULL; shl_mem_free(kernel_tm); } @@ -1084,7 +1091,7 @@ int shl_rvv_wg_b4f3s1_packn_fp32(struct csinn_tensor *input, struct csinn_tensor /****************************************************************************************** * kernel layout before: [O, I, 3, 3] - * kernel layout after : [O/pack2n, 36, I, pack2n] --> [O/packn, 36, I, packn] + * kernel layout after : [O/pack2n, 64, I, pack2n] --> [O/packn, 64, I, packn] * constrain: output channel % packn = 0 * input channel % packn = 0 ******************************************************************************************/ @@ -1118,7 +1125,16 @@ void shl_rvv_wg_b6f3s1_trans_kernel_packn_fp32(struct csinn_tensor *src_kernel, // {0.0f, 0.0f, 1.0f} // }; + const int packn = csrr_vlenb() / sizeof(float); + const int pack2n = packn * 2; csinn_tensor_copy(dst_kernel, src_kernel); + dst_kernel->dim_count = 5; + dst_kernel->dim[0] = outch / packn; + dst_kernel->dim[1] = 8; + dst_kernel->dim[2] = 8; + dst_kernel->dim[3] = inch; + dst_kernel->dim[4] = packn; + dst_kernel->layout = CSINN_LAYOUT_O1HWIO0; for (int p = 0; p < outch; p++) { for 
(int q = 0; q < inch; q++) { @@ -1153,9 +1169,6 @@ void shl_rvv_wg_b6f3s1_trans_kernel_packn_fp32(struct csinn_tensor *src_kernel, float *kernel_tm_packn = (float *)shl_mem_alloc(64 * outch / 4 * inch * 4 * sizeof(float)); dst_kernel->data = kernel_tm_packn; - const int packn = csrr_vlenb() / sizeof(float); - const int pack2n = packn * 2; - int oc = 0; for (; oc + pack2n - 1 < outch; oc += pack2n) { float *g0 = kernel_tm_packn + oc * 64 * inch; @@ -1182,6 +1195,7 @@ void shl_rvv_wg_b6f3s1_trans_kernel_packn_fp32(struct csinn_tensor *src_kernel, } } } + src_kernel->data = NULL; shl_mem_free(kernel_tm); } diff --git a/source/thead_rvv/fp32/expand_dims.c b/source/thead_rvv/fp32/expand_dims.c new file mode 100644 index 00000000..de58e8b1 --- /dev/null +++ b/source/thead_rvv/fp32/expand_dims.c @@ -0,0 +1,42 @@ +/* + * Copyright (C) 2016-2023 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "rvv/rvv.h" + +int shl_rvv_expand_dims_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_expand_dims_params *params) +{ + float *input_data = (float *)input->data; + float *output_data = (float *)output->data; + int size = 1; + if (input_data != output_data) { + for (int i = 0; i < input->dim_count; i++) { + size *= input->dim[i]; + } + int j = 0; + while (j < size) { + int vl = vsetvl_e32m4(size - j); + vfloat32m4_t _in = vle32_v_f32m4(input_data, vl); + vse32_v_f32m4(output_data, _in, vl); + input_data += vl; + output_data += vl; + j += vl; + } + } + return CSINN_TRUE; +} diff --git a/source/thead_rvv/fp32/gemm_fp32_block.c b/source/thead_rvv/fp32/gemm_fp32_block.c index 6fd13092..9a7f38aa 100644 --- a/source/thead_rvv/fp32/gemm_fp32_block.c +++ b/source/thead_rvv/fp32/gemm_fp32_block.c @@ -681,7 +681,7 @@ static inline void gemm_12xpack2n_fp32(float *dst, const float *sa, const float * k_blk: K_BLK, K_tail * * dst - output: [m, n] - * sa - kernel: [m/m_blk, k/k_blk, m_blk/12, 12, k_blk] + * sa - kernel: [m/m_blk, k/k_blk, m_blk/12, k_blk, 12] * sb - input: [n/n_blk, k/k_blk, n_blk/pack2n, k_blk, pack2n] * bias: [m] ************************************************************/ @@ -722,7 +722,8 @@ void shl_rvv_gemm_block_12xpack2n_fp32(float *dst, const float *sa, const float float *out = output_data + m_idx * n + n_idx; const float *ker = kernel_data + m_idx * k + k_idx * m_block; const float *in = input_data + n_idx * k + k_idx * n_block; - gemm_12xpack2n_fp32(out, ker, in, bias, m_block, n_block, k_block, n, k_idx); + gemm_12xpack2n_fp32(out, ker, in, bias + m_idx, m_block, n_block, k_block, n, + k_idx); k_idx += k_block; } diff --git a/source/thead_rvv/fp32/matmul.c b/source/thead_rvv/fp32/matmul.c index f8cc6408..1d3a9bcc 100644 --- a/source/thead_rvv/fp32/matmul.c +++ b/source/thead_rvv/fp32/matmul.c @@ -116,7 +116,7 @@ int shl_rvv_matmul_block_fp32(struct csinn_tensor *mat0, struct csinn_tensor *ma } 
/************************************************************* - * packn = vlenb / sizeof(__fp16) + * packn = vlenb / sizeof(float) * src: [k, n] * dst: [n/n_blk, k/k_blk, n_blk/pack2n, k_blk, pack2n] * n_blk: N_BLK, N_tail @@ -154,19 +154,21 @@ int shl_rvv_matmul_init_fp32(struct csinn_tensor *mat0, struct csinn_tensor *mat struct csinn_tensor *output, struct csinn_matmul_params *params) { struct csinn_callback *cb = params->base.cb; + struct csinn_session *sess = params->base.sess; + bool binary_model_op_init = shl_rvv_get_binary_model_op_init(sess); if (!params->trans_a && !params->trans_b) { - if (mat0->dtype == CSINN_DTYPE_FLOAT32) { - if (mat1->dtype == CSINN_DTYPE_FLOAT32) { + if (mat0->dtype == CSINN_DTYPE_FLOAT32 && mat1->dtype == CSINN_DTYPE_FLOAT32) { + if (!binary_model_op_init) { if (mat1->is_const) { shl_rvv_matmul_reorder_weight_fp32(mat1, MATMUL_K_BLK, MATMUL_N_BLK); } - cb->exec = shl_rvv_matmul_fp32; } + cb->exec = shl_rvv_matmul_fp32; } } if (cb->exec == NULL) { shl_debug_warning( - "matmul is not optimized to achieve under this condition, call reference func " + "matmul is not optimized to achieve under this condition on RVV, call reference func " "replaced.\n"); cb->exec = shl_ref_matmul_quant; } diff --git a/source/thead_rvv/fp32/maxpool.c b/source/thead_rvv/fp32/maxpool.c index 719f8b4b..5586b3ff 100644 --- a/source/thead_rvv/fp32/maxpool.c +++ b/source/thead_rvv/fp32/maxpool.c @@ -51,69 +51,82 @@ int shl_rvv_maxpool2d_init_fp32(struct csinn_tensor *input, struct csinn_tensor elempack = in_c % packn == 0 ? packn : 1; } - // global maxpool2d // TODO: remove - if (in_h == kernel_h && in_w == kernel_w) { - cb->exec = (elempack % packn == 0) ? shl_rvv_global_maxpool2d_packn_fp32 - : shl_rvv_global_maxpool2d_fp32; - return CSINN_TRUE; - } + if (input->layout == CSINN_LAYOUT_NCHW) { + // global maxpool2d // TODO: remove + if (in_h == kernel_h && in_w == kernel_w) { + cb->exec = (elempack % packn == 0) ? 
shl_rvv_global_maxpool2d_packn_fp32 + : shl_rvv_global_maxpool2d_fp32; + return CSINN_TRUE; + } - if (elempack % packn == 0) { - cb->exec = shl_rvv_maxpool_packn_fp32; - } else { - if (stride_h == 2 && stride_w == 2) { - if (kernel_h == 2 && kernel_w == 2) { // 2x2s2 - if (pad_left == 0 && pad_top == 0) { - // adjust pad according to ceil_mode (ceil mode on caffe pytorch..) - if (in_h % 2 == 1 && params->ceil_mode == 1) { - if (params->pad_down == 0) params->pad_down++; - } - if (in_w % 2 == 1 && params->ceil_mode == 1) { - if (params->pad_right == 0) params->pad_right++; - } - // end consider ceil_mode 2x2s2p0 - cb->exec = shl_rvv_maxpool2x2s2_fp32; - } else if (pad_left == 1 && pad_top == 1) { - cb->exec = shl_rvv_maxpool2x2s2_p1_fp32; - } - } else if (kernel_h == 3 && kernel_w == 3) { // 3x3s2 - if (pad_left == 0 && pad_top == 0) { - // adjust pad according to ceil_mode (ceil mode on caffe pytorch..) - if (in_h % 2 == 0 && params->ceil_mode == 1) { - if (params->pad_down == 0) - params->pad_down++; // origin pad_down mast be equal to zero ? - } - if (in_w % 2 == 0 && params->ceil_mode == 1) { - if (params->pad_right == 0) params->pad_right++; + if (elempack % packn == 0) { + cb->exec = shl_rvv_maxpool_packn_fp32; + } else { + if (stride_h == 2 && stride_w == 2) { + if (kernel_h == 2 && kernel_w == 2) { // 2x2s2 + if (pad_left == 0 && pad_top == 0) { + // adjust pad according to ceil_mode (ceil mode on caffe pytorch..) 
+ if (in_h % 2 == 1 && params->ceil_mode == 1) { + if (params->pad_down == 0) params->pad_down++; + } + if (in_w % 2 == 1 && params->ceil_mode == 1) { + if (params->pad_right == 0) params->pad_right++; + } + // end consider ceil_mode 2x2s2p0 + cb->exec = shl_rvv_maxpool2x2s2_fp32; + } else if (pad_left == 1 && pad_top == 1) { + cb->exec = shl_rvv_maxpool2x2s2_p1_fp32; } - // end consider ceil_mode 3x3s2p0 - cb->exec = shl_rvv_maxpool3x3s2_fp32; - } else if (pad_left == 1 && pad_top == 1) { - if (params->ceil_mode == 0) { - cb->exec = shl_rvv_maxpool3x3s2_p1_fp32; - } else { - if ((in_w % 2 == 0 && pad_right == 1) || (in_h % 2 == 0 && pad_down == 1)) { - cb->exec = shl_ref_maxpool2d_f32; - } else { + } else if (kernel_h == 3 && kernel_w == 3) { // 3x3s2 + if (pad_left == 0 && pad_top == 0) { + // adjust pad according to ceil_mode (ceil mode on caffe pytorch..) + if (in_h % 2 == 0 && params->ceil_mode == 1) { + if (params->pad_down == 0) + params->pad_down++; // origin pad_down mast be equal to zero ? 
+ } + if (in_w % 2 == 0 && params->ceil_mode == 1) { + if (params->pad_right == 0) params->pad_right++; + } + // end consider ceil_mode 3x3s2p0 + cb->exec = shl_rvv_maxpool3x3s2_fp32; + } else if (pad_left == 1 && pad_top == 1) { + if (params->ceil_mode == 0) { cb->exec = shl_rvv_maxpool3x3s2_p1_fp32; + } else { + if ((in_w % 2 == 0 && pad_right == 1) || + (in_h % 2 == 0 && pad_down == 1)) { + cb->exec = shl_ref_maxpool2d_f32; + } else { + cb->exec = shl_rvv_maxpool3x3s2_p1_fp32; + } } } } - } - } else if (stride_h == 1 && stride_w == 1) { - if (kernel_h == 3 && kernel_w == 3) { - if (pad_left == 1 && pad_top == 1 && pad_right == 1 && pad_down == 1) { - cb->exec = shl_rvv_maxpool3x3s1_p1_fp32; + } else if (stride_h == 1 && stride_w == 1) { + if (kernel_h == 3 && kernel_w == 3) { + if (pad_left == 1 && pad_top == 1 && pad_right == 1 && pad_down == 1) { + cb->exec = shl_rvv_maxpool3x3s1_p1_fp32; + } } } } - if (cb->exec == NULL) { - shl_debug_warning( - "maxpool is not optimized to achieve under this condition on rvv, call reference " - "func replaced.\n"); - cb->exec = shl_ref_maxpool2d_f32; + + } else if (input->layout == CSINN_LAYOUT_NHWC) { + // global maxpool2d + if (in_h == kernel_h && in_w == kernel_w) { + cb->exec = shl_rvv_global_maxpool2d_nhwc_fp32; + return CSINN_TRUE; } + cb->exec = shl_rvv_maxpool_nhwc_fp32; + } + + if (cb->exec == NULL) { + shl_debug_warning( + "maxpool is not optimized to achieve under this condition on rvv, call reference " + "func replaced.\n"); + cb->exec = shl_ref_maxpool2d_f32; } + return CSINN_TRUE; } @@ -140,7 +153,12 @@ int shl_rvv_global_maxpool2d_init_fp32(struct csinn_tensor *input, struct csinn_ elempack = in_c % packn == 0 ? packn : 1; } - cb->exec = (elempack % packn == 0) ? shl_rvv_global_maxpool2d_packn_fp32 - : shl_rvv_global_maxpool2d_fp32; + if (input->layout == CSINN_LAYOUT_NCHW) { + cb->exec = (elempack % packn == 0) ? 
shl_rvv_global_maxpool2d_packn_fp32 + : shl_rvv_global_maxpool2d_fp32; + } else if (input->layout == CSINN_LAYOUT_NHWC) { + cb->exec = shl_rvv_global_maxpool2d_nhwc_fp32; + } + return CSINN_TRUE; } diff --git a/source/thead_rvv/fp32/rms_norm.c b/source/thead_rvv/fp32/rms_norm.c index 0cbed1b9..cd2ca515 100644 --- a/source/thead_rvv/fp32/rms_norm.c +++ b/source/thead_rvv/fp32/rms_norm.c @@ -18,8 +18,8 @@ #include "rvv/rvv.h" -int shl_rvv_rms_norm_fp32(struct csinn_tensor *input, struct csinn_tensor *output, - struct csinn_tensor *weight, struct csinn_rms_norm_params *params) +int shl_rvv_rms_norm_fp32(struct csinn_tensor *input, struct csinn_tensor *weight, + struct csinn_tensor *output, struct csinn_rms_norm_params *params) { if (input->layout == CSINN_LAYOUT_NC1HWC0) { shl_rvv_tensor_nc1xc0_to_ndarray_replace_fp32(input); diff --git a/source/thead_rvv/fp32/rope.c b/source/thead_rvv/fp32/rope.c new file mode 100644 index 00000000..a86a51cc --- /dev/null +++ b/source/thead_rvv/fp32/rope.c @@ -0,0 +1,93 @@ +/* + * Copyright (C) 2016-2023 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "rvv/rvv.h" + +int shl_rvv_rope_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_rope_params *params) +{ + float freq_base = params->freq_base; + float freq_scale = params->freq_scale; + float xpos_base = params->xpos_base; + int32_t xpos_down = params->xpos_down; + int n_dims = params->n_dims; + + float theta_scale = powf(freq_base, -2.0f / n_dims); + + float *src_data = input->data; + float *dst_data = output->data; + int32_t *pos = params->pos; + + if (!params->use_rope_cache) { + for (int i3 = 0; i3 < input->dim[0]; i3++) { + for (int i2 = 0; i2 < input->dim[1]; i2++) { + int p = pos[i2]; + for (int i1 = 0; i1 < input->dim[2]; i1++) { + float theta = freq_scale * (float)p; + + for (int i0 = 0; i0 < input->dim[3]; i0 += 2) { + float cos_theta = cosf(theta); + float sin_theta = sinf(theta); + // zeta scaling for xPos only: + float zeta = + xpos_base != 0.0f + ? powf((i0 + 0.4f * input->dim[0]) / (1.4f * input->dim[0]), + p / xpos_base) + : 1.0f; + if (xpos_down) zeta = 1.0f / zeta; + + theta *= theta_scale; + + int index = i3 * (input->dim[3] * input->dim[2] * input->dim[1]) + + i2 * (input->dim[3] * input->dim[2]) + i1 * input->dim[3] + i0; + + float x0 = src_data[index]; + float x1 = src_data[index + 1]; + + dst_data[index] = x0 * cos_theta * zeta - x1 * sin_theta * zeta; + dst_data[index + 1] = x0 * sin_theta * zeta + x1 * cos_theta * zeta; + } + } + } + } + } else { + float *rope_cache = &((float *)params->rope_cache)[pos[0] * input->dim[2] * input->dim[3]]; + for (int i3 = 0; i3 < input->dim[0]; i3++) { + for (int i2 = 0; i2 < input->dim[1]; i2++) { + for (int i1 = 0; i1 < input->dim[2]; i1++) { + for (int i0 = 0; i0 < input->dim[3]; i0 += 2) { + int index = i3 * (input->dim[3] * input->dim[2] * input->dim[1]) + + i2 * (input->dim[3] * input->dim[2]) + i1 * input->dim[3] + i0; + + int rope_cache_index = + i2 * (input->dim[3] * input->dim[2]) + i1 * input->dim[3] + i0; + + float x0 = src_data[index]; + float x1 = 
src_data[index + 1]; + float sin_theta = rope_cache[index]; + float cos_theta = rope_cache[index + 1]; + + dst_data[index] = x0 * cos_theta - x1 * sin_theta; + dst_data[index + 1] = x0 * sin_theta + x1 * cos_theta; + } + } + } + } + } + return CSINN_TRUE; +} diff --git a/source/thead_rvv/fp32/scaled_dot_product_attention.c b/source/thead_rvv/fp32/scaled_dot_product_attention.c new file mode 100644 index 00000000..61108ab4 --- /dev/null +++ b/source/thead_rvv/fp32/scaled_dot_product_attention.c @@ -0,0 +1,750 @@ +/* + * Copyright (C) 2016-2023 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "rvv/rvv.h" +#include "rvv_mathfun_fp32.h" + +static inline void qk_t1_dot_4x4_fp32(float *dst, const float *sa, const float *sb, int M, int K, + int N, int lda, int ldb, int ldc); + +static inline void trans_q_0132_fp32(float *src, float *dst, int sv, int head_dim); + +static void q0k1_softmax_v1_fp32(float *q, float *k, float *v, float *o, + struct csinn_scale_dot_attention_params *params, int32_t sq, + int32_t sk, int32_t head_dim); + +int shl_rvv_scaled_dot_product_attention_fp32(struct csinn_tensor *query, struct csinn_tensor *key, + struct csinn_tensor *value, + struct csinn_tensor *output_tensor, + struct csinn_scale_dot_attention_params *params) +{ + float *query_data = query->data; + float *key_data = key->data; + float *value_data = value->data; + float *output_data = output_tensor->data; + // np: number of heads + // sk: sequence number of k and v + // sq: sequence number of q + int32_t batch = query->dim[0]; // batch = 1 only + int32_t np = query->dim[1]; + int32_t sk = key->dim[2]; + int32_t sq = query->dim[2]; + int32_t head_dim = query->dim[3]; + + if (shl_multithread_is_enable()) { +#pragma omp parallel for + for (int i = 0; i < batch * np; i++) { + float *q = query_data + i * sq * head_dim; + float *k = key_data + i * sk * head_dim; + float *v = value_data + i * sk * head_dim; + float *o = output_data + i * sq * head_dim; + if (params->transpose_v == 0) { + float *value_transpose_tmp = malloc(sk * head_dim * sizeof(float)); + trans_q_0132_fp32(v, value_transpose_tmp, sk, head_dim); + q0k1_softmax_v1_fp32(q, k, value_transpose_tmp, o, params, sq, sk, head_dim); + free(value_transpose_tmp); + } else { + q0k1_softmax_v1_fp32(q, k, v, o, params, sq, sk, head_dim); + } + } + + } else { + for (int i = 0; i < batch * np; i++) { + float *q = query_data + i * sq * head_dim; + float *k = key_data + i * sk * head_dim; + float *v = value_data + i * sk * head_dim; + float *o = output_data + i * sq * head_dim; + if (params->transpose_v == 0) { + 
float *value_transpose_tmp = malloc(sk * head_dim * sizeof(float)); + trans_q_0132_fp32(v, value_transpose_tmp, sk, head_dim); + q0k1_softmax_v1_fp32(q, k, value_transpose_tmp, o, params, sq, sk, head_dim); + free(value_transpose_tmp); + } else { + q0k1_softmax_v1_fp32(q, k, v, o, params, sq, sk, head_dim); + } + } + } + return CSINN_TRUE; +} + +static inline void trans_q_0132_fp32(float *src, float *dst, int sv, int head_dim) +{ + for (int i = 0; i < sv; i++) { + int size = head_dim; + float *d_ptr = dst + i; + while (size > 0) { + int vl = vsetvl_e32m4(size); + vfloat32m4_t _in = vle32_v_f32m4(src, vl); + src += vl; + vsse32_v_f32m4(d_ptr, sv * sizeof(float), _in, vl); + d_ptr += vl * sv; + size -= vl; + } + } + dst += head_dim * sv; +} + +static inline void qk_t1_dot_4x4_fp32(float *dst, const float *sa, const float *sb, int M, int K, + int N, int lda, int ldb, int ldc) +{ + int i = 0; + for (; i + 3 < M; i += 4) { + const float *sa_ptr = sa + i * lda; + int j = 0; + for (; j + 3 < N; j += 4) { + const float *a0_ptr = sa_ptr; + const float *a1_ptr = sa_ptr + 1 * lda; + const float *a2_ptr = sa_ptr + 2 * lda; + const float *a3_ptr = sa_ptr + 3 * lda; + const float *b0_ptr = sb + j * ldb; + const float *b1_ptr = b0_ptr + 1 * ldb; + const float *b2_ptr = b0_ptr + 2 * ldb; + const float *b3_ptr = b0_ptr + 3 * ldb; + + int vlmax = vsetvl_e32m1(csrr_vlenb() / sizeof(float)); + // dst[m, 0] + vfloat32m1_t _acc00 = vfmv_v_f_f32m1(0.0f, vlmax); + vfloat32m1_t _acc10 = vmv_v_v_f32m1(_acc00, vlmax); + vfloat32m1_t _acc20 = vmv_v_v_f32m1(_acc00, vlmax); + vfloat32m1_t _acc30 = vmv_v_v_f32m1(_acc00, vlmax); + // dst[m, 1] + vfloat32m1_t _acc01 = vmv_v_v_f32m1(_acc00, vlmax); + vfloat32m1_t _acc11 = vmv_v_v_f32m1(_acc01, vlmax); + vfloat32m1_t _acc21 = vmv_v_v_f32m1(_acc01, vlmax); + vfloat32m1_t _acc31 = vmv_v_v_f32m1(_acc01, vlmax); + // dst[m, 2] + vfloat32m1_t _acc02 = vmv_v_v_f32m1(_acc00, vlmax); + vfloat32m1_t _acc12 = vmv_v_v_f32m1(_acc02, vlmax); + vfloat32m1_t 
_acc22 = vmv_v_v_f32m1(_acc02, vlmax); + vfloat32m1_t _acc32 = vmv_v_v_f32m1(_acc02, vlmax); + // dst[m, 3] + vfloat32m1_t _acc03 = vmv_v_v_f32m1(_acc00, vlmax); + vfloat32m1_t _acc13 = vmv_v_v_f32m1(_acc03, vlmax); + vfloat32m1_t _acc23 = vmv_v_v_f32m1(_acc03, vlmax); + vfloat32m1_t _acc33 = vmv_v_v_f32m1(_acc03, vlmax); + + int c = 0; + while (c < K) { + int vl = vsetvl_e32m1(K - c); + vfloat32m1_t _a0 = vle32_v_f32m1(a0_ptr + c, vl); + vfloat32m1_t _a1 = vle32_v_f32m1(a1_ptr + c, vl); + vfloat32m1_t _a2 = vle32_v_f32m1(a2_ptr + c, vl); + vfloat32m1_t _a3 = vle32_v_f32m1(a3_ptr + c, vl); + vfloat32m1_t _b0 = vle32_v_f32m1(b0_ptr + c, vl); + vfloat32m1_t _b1 = vle32_v_f32m1(b1_ptr + c, vl); + vfloat32m1_t _b2 = vle32_v_f32m1(b2_ptr + c, vl); + vfloat32m1_t _b3 = vle32_v_f32m1(b3_ptr + c, vl); + + _acc00 = vfmacc_vv_f32m1(_acc00, _a0, _b0, vlmax); + _acc10 = vfmacc_vv_f32m1(_acc10, _a1, _b0, vlmax); + _acc20 = vfmacc_vv_f32m1(_acc20, _a2, _b0, vlmax); + _acc30 = vfmacc_vv_f32m1(_acc30, _a3, _b0, vlmax); + + _acc01 = vfmacc_vv_f32m1(_acc01, _a0, _b1, vlmax); + _acc11 = vfmacc_vv_f32m1(_acc11, _a1, _b1, vlmax); + _acc21 = vfmacc_vv_f32m1(_acc21, _a2, _b1, vlmax); + _acc31 = vfmacc_vv_f32m1(_acc31, _a3, _b1, vlmax); + + _acc02 = vfmacc_vv_f32m1(_acc02, _a0, _b2, vlmax); + _acc12 = vfmacc_vv_f32m1(_acc12, _a1, _b2, vlmax); + _acc22 = vfmacc_vv_f32m1(_acc22, _a2, _b2, vlmax); + _acc32 = vfmacc_vv_f32m1(_acc32, _a3, _b2, vlmax); + + _acc03 = vfmacc_vv_f32m1(_acc03, _a0, _b3, vlmax); + _acc13 = vfmacc_vv_f32m1(_acc13, _a1, _b3, vlmax); + _acc23 = vfmacc_vv_f32m1(_acc23, _a2, _b3, vlmax); + _acc33 = vfmacc_vv_f32m1(_acc33, _a3, _b3, vlmax); + c += vl; + } + + int idx00 = (i + 0) * ldc + (j + 0); + int idx10 = (i + 1) * ldc + (j + 0); + int idx20 = (i + 2) * ldc + (j + 0); + int idx30 = (i + 3) * ldc + (j + 0); + + int idx01 = (i + 0) * ldc + (j + 1); + int idx11 = (i + 1) * ldc + (j + 1); + int idx21 = (i + 2) * ldc + (j + 1); + int idx31 = (i + 3) * ldc + (j + 1); + + int 
idx02 = (i + 0) * ldc + (j + 2); + int idx12 = (i + 1) * ldc + (j + 2); + int idx22 = (i + 2) * ldc + (j + 2); + int idx32 = (i + 3) * ldc + (j + 2); + + int idx03 = (i + 0) * ldc + (j + 3); + int idx13 = (i + 1) * ldc + (j + 3); + int idx23 = (i + 2) * ldc + (j + 3); + int idx33 = (i + 3) * ldc + (j + 3); + + // dst[m, 0] + vfloat32m1_t _sum00; + vfloat32m1_t _sum10; + vfloat32m1_t _sum20; + vfloat32m1_t _sum30; + // dst[m, 1] + vfloat32m1_t _sum01; + vfloat32m1_t _sum11; + vfloat32m1_t _sum21; + vfloat32m1_t _sum31; + // dst[m, 2] + vfloat32m1_t _sum02; + vfloat32m1_t _sum12; + vfloat32m1_t _sum22; + vfloat32m1_t _sum32; + // dst[m, 3] + vfloat32m1_t _sum03; + vfloat32m1_t _sum13; + vfloat32m1_t _sum23; + vfloat32m1_t _sum33; + + _sum00 = vfmv_v_f_f32m1(dst[idx00], 1); + _sum10 = vfmv_v_f_f32m1(dst[idx10], 1); + _sum20 = vfmv_v_f_f32m1(dst[idx20], 1); + _sum30 = vfmv_v_f_f32m1(dst[idx30], 1); + + _sum01 = vfmv_v_f_f32m1(dst[idx01], 1); + _sum11 = vfmv_v_f_f32m1(dst[idx11], 1); + _sum21 = vfmv_v_f_f32m1(dst[idx21], 1); + _sum31 = vfmv_v_f_f32m1(dst[idx31], 1); + + _sum02 = vfmv_v_f_f32m1(dst[idx02], 1); + _sum12 = vfmv_v_f_f32m1(dst[idx12], 1); + _sum22 = vfmv_v_f_f32m1(dst[idx22], 1); + _sum32 = vfmv_v_f_f32m1(dst[idx32], 1); + + _sum03 = vfmv_v_f_f32m1(dst[idx03], 1); + _sum13 = vfmv_v_f_f32m1(dst[idx13], 1); + _sum23 = vfmv_v_f_f32m1(dst[idx23], 1); + _sum33 = vfmv_v_f_f32m1(dst[idx33], 1); + + _sum00 = vfredosum_vs_f32m1_f32m1(vundefined_f32m1(), _acc00, _sum00, vlmax); + _sum10 = vfredosum_vs_f32m1_f32m1(vundefined_f32m1(), _acc10, _sum10, vlmax); + _sum20 = vfredosum_vs_f32m1_f32m1(vundefined_f32m1(), _acc20, _sum20, vlmax); + _sum30 = vfredosum_vs_f32m1_f32m1(vundefined_f32m1(), _acc30, _sum30, vlmax); + + _sum01 = vfredosum_vs_f32m1_f32m1(vundefined_f32m1(), _acc01, _sum01, vlmax); + _sum11 = vfredosum_vs_f32m1_f32m1(vundefined_f32m1(), _acc11, _sum11, vlmax); + _sum21 = vfredosum_vs_f32m1_f32m1(vundefined_f32m1(), _acc21, _sum21, vlmax); + _sum31 = 
vfredosum_vs_f32m1_f32m1(vundefined_f32m1(), _acc31, _sum31, vlmax); + + _sum02 = vfredosum_vs_f32m1_f32m1(vundefined_f32m1(), _acc02, _sum02, vlmax); + _sum12 = vfredosum_vs_f32m1_f32m1(vundefined_f32m1(), _acc12, _sum12, vlmax); + _sum22 = vfredosum_vs_f32m1_f32m1(vundefined_f32m1(), _acc22, _sum22, vlmax); + _sum32 = vfredosum_vs_f32m1_f32m1(vundefined_f32m1(), _acc32, _sum32, vlmax); + + _sum03 = vfredosum_vs_f32m1_f32m1(vundefined_f32m1(), _acc03, _sum03, vlmax); + _sum13 = vfredosum_vs_f32m1_f32m1(vundefined_f32m1(), _acc13, _sum13, vlmax); + _sum23 = vfredosum_vs_f32m1_f32m1(vundefined_f32m1(), _acc23, _sum23, vlmax); + _sum33 = vfredosum_vs_f32m1_f32m1(vundefined_f32m1(), _acc33, _sum33, vlmax); + + dst[idx00] = vfmv_f_s_f32m1_f32(_sum00); + dst[idx10] = vfmv_f_s_f32m1_f32(_sum10); + dst[idx20] = vfmv_f_s_f32m1_f32(_sum20); + dst[idx30] = vfmv_f_s_f32m1_f32(_sum30); + + dst[idx01] = vfmv_f_s_f32m1_f32(_sum01); + dst[idx11] = vfmv_f_s_f32m1_f32(_sum11); + dst[idx21] = vfmv_f_s_f32m1_f32(_sum21); + dst[idx31] = vfmv_f_s_f32m1_f32(_sum31); + + dst[idx02] = vfmv_f_s_f32m1_f32(_sum02); + dst[idx12] = vfmv_f_s_f32m1_f32(_sum12); + dst[idx22] = vfmv_f_s_f32m1_f32(_sum22); + dst[idx32] = vfmv_f_s_f32m1_f32(_sum32); + + dst[idx03] = vfmv_f_s_f32m1_f32(_sum03); + dst[idx13] = vfmv_f_s_f32m1_f32(_sum13); + dst[idx23] = vfmv_f_s_f32m1_f32(_sum23); + dst[idx33] = vfmv_f_s_f32m1_f32(_sum33); + } + for (; j < N; j++) { + const float *a0_ptr = sa_ptr; + const float *a1_ptr = sa_ptr + 1 * lda; + const float *a2_ptr = sa_ptr + 2 * lda; + const float *a3_ptr = sa_ptr + 3 * lda; + const float *b0_ptr = sb + j * ldb; + + int vlmax = vsetvl_e32m1(csrr_vlenb() / sizeof(float)); + // dst[m, 0] + vfloat32m1_t _acc00 = vfmv_v_f_f32m1(0.0f, vlmax); + vfloat32m1_t _acc10 = vmv_v_v_f32m1(_acc00, vlmax); + vfloat32m1_t _acc20 = vmv_v_v_f32m1(_acc00, vlmax); + vfloat32m1_t _acc30 = vmv_v_v_f32m1(_acc00, vlmax); + + int c = 0; + while (c < K) { + int vl = vsetvl_e32m1(K - c); + 
vfloat32m1_t _a0 = vle32_v_f32m1(a0_ptr + c, vl); + vfloat32m1_t _a1 = vle32_v_f32m1(a1_ptr + c, vl); + vfloat32m1_t _a2 = vle32_v_f32m1(a2_ptr + c, vl); + vfloat32m1_t _a3 = vle32_v_f32m1(a3_ptr + c, vl); + vfloat32m1_t _b0 = vle32_v_f32m1(b0_ptr + c, vl); + + _acc00 = vfmacc_vv_f32m1(_acc00, _a0, _b0, vlmax); + _acc10 = vfmacc_vv_f32m1(_acc10, _a1, _b0, vlmax); + _acc20 = vfmacc_vv_f32m1(_acc20, _a2, _b0, vlmax); + _acc30 = vfmacc_vv_f32m1(_acc30, _a3, _b0, vlmax); + c += vl; + } + + int idx00 = (i + 0) * ldc + (j + 0); + int idx10 = (i + 1) * ldc + (j + 0); + int idx20 = (i + 2) * ldc + (j + 0); + int idx30 = (i + 3) * ldc + (j + 0); + + // dst[m, 0] + vfloat32m1_t _sum00; + vfloat32m1_t _sum10; + vfloat32m1_t _sum20; + vfloat32m1_t _sum30; + + _sum00 = vfmv_v_f_f32m1(dst[idx00], 1); + _sum10 = vfmv_v_f_f32m1(dst[idx10], 1); + _sum20 = vfmv_v_f_f32m1(dst[idx20], 1); + _sum30 = vfmv_v_f_f32m1(dst[idx30], 1); + + _sum00 = vfredosum_vs_f32m1_f32m1(vundefined_f32m1(), _acc00, _sum00, vlmax); + _sum10 = vfredosum_vs_f32m1_f32m1(vundefined_f32m1(), _acc10, _sum10, vlmax); + _sum20 = vfredosum_vs_f32m1_f32m1(vundefined_f32m1(), _acc20, _sum20, vlmax); + _sum30 = vfredosum_vs_f32m1_f32m1(vundefined_f32m1(), _acc30, _sum30, vlmax); + + dst[idx00] = vfmv_f_s_f32m1_f32(_sum00); + dst[idx10] = vfmv_f_s_f32m1_f32(_sum10); + dst[idx20] = vfmv_f_s_f32m1_f32(_sum20); + dst[idx30] = vfmv_f_s_f32m1_f32(_sum30); + } + } + for (; i < M; i += 1) { + const float *sa_ptr = sa + i * lda; + int j = 0; + for (; j + 3 < N; j += 4) { + const float *a0_ptr = sa_ptr; + const float *b0_ptr = sb + j * ldb; + const float *b1_ptr = b0_ptr + 1 * ldb; + const float *b2_ptr = b0_ptr + 2 * ldb; + const float *b3_ptr = b0_ptr + 3 * ldb; + + int vlmax = vsetvl_e32m1(csrr_vlenb() / sizeof(float)); + // dst[0, n] + vfloat32m1_t _acc00 = vfmv_v_f_f32m1(0.0f, vlmax); + vfloat32m1_t _acc01 = vmv_v_v_f32m1(_acc00, vlmax); + vfloat32m1_t _acc02 = vmv_v_v_f32m1(_acc00, vlmax); + vfloat32m1_t _acc03 = 
vmv_v_v_f32m1(_acc00, vlmax); + + int c = 0; + while (c < K) { + int vl = vsetvl_e32m1(K - c); + vfloat32m1_t _a0 = vle32_v_f32m1(a0_ptr + c, vl); + vfloat32m1_t _b0 = vle32_v_f32m1(b0_ptr + c, vl); + vfloat32m1_t _b1 = vle32_v_f32m1(b1_ptr + c, vl); + vfloat32m1_t _b2 = vle32_v_f32m1(b2_ptr + c, vl); + vfloat32m1_t _b3 = vle32_v_f32m1(b3_ptr + c, vl); + + _acc00 = vfmacc_vv_f32m1(_acc00, _a0, _b0, vlmax); + _acc01 = vfmacc_vv_f32m1(_acc01, _a0, _b1, vlmax); + _acc02 = vfmacc_vv_f32m1(_acc02, _a0, _b2, vlmax); + _acc03 = vfmacc_vv_f32m1(_acc03, _a0, _b3, vlmax); + c += vl; + } + + int idx00 = (i + 0) * ldc + (j + 0); + int idx01 = (i + 0) * ldc + (j + 1); + int idx02 = (i + 0) * ldc + (j + 2); + int idx03 = (i + 0) * ldc + (j + 3); + + // dst[0, n] + vfloat32m1_t _sum00; + vfloat32m1_t _sum01; + vfloat32m1_t _sum02; + vfloat32m1_t _sum03; + + // _sum00 = vfmv_v_f_f32m1(dst[idx00], 1); + // _sum01 = vfmv_v_f_f32m1(dst[idx01], 1); + // _sum02 = vfmv_v_f_f32m1(dst[idx02], 1); + // _sum03 = vfmv_v_f_f32m1(dst[idx03], 1); + _sum00 = vfmv_v_f_f32m1(0.0f, 1); + _sum01 = vfmv_v_f_f32m1(0.0f, 1); + _sum02 = vfmv_v_f_f32m1(0.0f, 1); + _sum03 = vfmv_v_f_f32m1(0.0f, 1); + + _sum00 = vfredosum_vs_f32m1_f32m1(vundefined_f32m1(), _acc00, _sum00, vlmax); + _sum01 = vfredosum_vs_f32m1_f32m1(vundefined_f32m1(), _acc01, _sum01, vlmax); + _sum02 = vfredosum_vs_f32m1_f32m1(vundefined_f32m1(), _acc02, _sum02, vlmax); + _sum03 = vfredosum_vs_f32m1_f32m1(vundefined_f32m1(), _acc03, _sum03, vlmax); + + dst[idx00] = vfmv_f_s_f32m1_f32(_sum00); + dst[idx01] = vfmv_f_s_f32m1_f32(_sum01); + dst[idx02] = vfmv_f_s_f32m1_f32(_sum02); + dst[idx03] = vfmv_f_s_f32m1_f32(_sum03); + } + for (; j < N; j++) { + const float *a0_ptr = sa_ptr; + const float *b0_ptr = sb + j * ldb; + + int vlmax = vsetvl_e32m1(csrr_vlenb() / sizeof(float)); + // dst[0, 0] + vfloat32m1_t _acc00 = vfmv_v_f_f32m1(0.0f, vlmax); + + int c = 0; + while (c < K) { + int vl = vsetvl_e32m1(K - c); + vfloat32m1_t _a0 = 
vle32_v_f32m1(a0_ptr + c, vl); + vfloat32m1_t _b0 = vle32_v_f32m1(b0_ptr + c, vl); + _acc00 = vfmacc_vv_f32m1(_acc00, _a0, _b0, vlmax); + c += vl; + } + + int idx00 = (i + 0) * ldc + (j + 0); + + // dst[0, 0] + vfloat32m1_t _sum00; + + _sum00 = vfmv_v_f_f32m1(dst[idx00], 1); + + _sum00 = vfredosum_vs_f32m1_f32m1(vundefined_f32m1(), _acc00, _sum00, vlmax); + dst[idx00] = vfmv_f_s_f32m1_f32(_sum00); + } + } +} + +/** + * for llm + * if prefill: q [batch,np,sq,dim_head] + * k [batch,np,sk,dim_head] + * v [batch,np,dim_head,sv] + * sq = sk =sv > 1 + * if decoder: q [batch,np,sq,dim_head] + * k [batch,np,sk,dim_head] + * v [batch,np,dim_head,sv] + * sq = 1, sk = sv > 1 + * + */ +static void q0k1_softmax_v1_fp32(float *q, float *k, float *v, float *o, + struct csinn_scale_dot_attention_params *params, int32_t sq, + int32_t sk, int32_t head_dim) +{ + float norm_factor = 1.0f / params->norm_factor; // sqrt(128) + size_t matmul_res_size = sq * sk * sizeof(float); + float *matmul_res_data = shl_mem_alloc(matmul_res_size); + memset(matmul_res_data, 0, matmul_res_size); + if (sq > 1) { + const float *q_in = q; + const float *k_in = k; + const float *v_in = v; + for (int i = 0; i < sq; i++) { + float max = -FLT_MAX; + float acc_exp = 0.0f; + int casual_cnt = sk; + if (params->casual) casual_cnt = i + 1 + (sk - sq); + const float *q_ptr = q_in + i * head_dim; + int j = 0; + const int stride = 4; + int m1_vl = vsetvl_e32m1(csrr_vlenb() / sizeof(float)); + vfloat32m1_t _max = vfmv_v_f_f32m1(max, m1_vl); + // calculate q * k and max value in result + for (; j + stride - 1 < casual_cnt; j += stride) { + const float *k_ptr0 = k_in + j * head_dim; + const float *k_ptr1 = k_in + (j + 1) * head_dim; + const float *k_ptr2 = k_in + (j + 2) * head_dim; + const float *k_ptr3 = k_in + (j + 3) * head_dim; + int vl = vsetvl_e32m2(csrr_vlenb() / sizeof(float) * 2); + + vfloat32m2_t _acc00 = vfmv_v_f_f32m2(0.0f, vl); + vfloat32m2_t _acc01 = vfmv_v_f_f32m2(0.0f, vl); + vfloat32m2_t _acc02 = 
vfmv_v_f_f32m2(0.0f, vl); + vfloat32m2_t _acc03 = vfmv_v_f_f32m2(0.0f, vl); + vfloat32m1_t _sum00 = vfmv_v_f_f32m1(matmul_res_data[i * sk + j], 1); + vfloat32m1_t _sum01 = vfmv_v_f_f32m1(matmul_res_data[i * sk + j + 1], 1); + vfloat32m1_t _sum02 = vfmv_v_f_f32m1(matmul_res_data[i * sk + j + 2], 1); + vfloat32m1_t _sum03 = vfmv_v_f_f32m1(matmul_res_data[i * sk + j + 3], 1); + + int l = 0; + while (l < head_dim) { + // vlen128 e32m2 = 8 + int vl_ = vsetvl_e32m2(head_dim - l); + vfloat32m2_t _q0 = vle32_v_f32m2(q_ptr + l, vl_); + vfloat32m2_t _k0 = vle32_v_f32m2(k_ptr0 + l, vl_); + vfloat32m2_t _k1 = vle32_v_f32m2(k_ptr1 + l, vl_); + vfloat32m2_t _k2 = vle32_v_f32m2(k_ptr2 + l, vl_); + vfloat32m2_t _k3 = vle32_v_f32m2(k_ptr3 + l, vl_); + + _acc00 = vfmacc_vv_f32m2(_acc00, _q0, _k0, vl); + _acc01 = vfmacc_vv_f32m2(_acc01, _q0, _k1, vl); + _acc02 = vfmacc_vv_f32m2(_acc02, _q0, _k2, vl); + _acc03 = vfmacc_vv_f32m2(_acc03, _q0, _k3, vl); + l += vl_; + } + float res[stride]; + _sum00 = vfredosum_vs_f32m2_f32m1(vundefined_f32m1(), _acc00, _sum00, vl); + res[0] = vfmv_f_s_f32m1_f32(_sum00); + _sum01 = vfredosum_vs_f32m2_f32m1(vundefined_f32m1(), _acc01, _sum01, vl); + res[1] = vfmv_f_s_f32m1_f32(_sum01); + _sum02 = vfredosum_vs_f32m2_f32m1(vundefined_f32m1(), _acc02, _sum02, vl); + res[2] = vfmv_f_s_f32m1_f32(_sum02); + _sum03 = vfredosum_vs_f32m2_f32m1(vundefined_f32m1(), _acc03, _sum03, vl); + res[3] = vfmv_f_s_f32m1_f32(_sum03); + vfloat32m1_t save = vle32_v_f32m1(res, m1_vl); + save = vfmul_vf_f32m1(save, norm_factor, m1_vl); + vse32_v_f32m1(matmul_res_data + i * sk + j, save, m1_vl); + _max = vfmax_vv_f32m1(save, _max, m1_vl); + } + + vfloat32m1_t _min_f = vfmv_v_f_f32m1(max, m1_vl); + vfloat32m1_t _max0 = vfredmax_vs_f32m1_f32m1(vundefined_f32m1(), _max, _min_f, m1_vl); + max = vfmv_f_s_f32m1_f32(_max0); + for (; j < casual_cnt; j++) { + const float *k_ptr = k_in + j * head_dim; + int vl = vsetvl_e32m4(csrr_vlenb() / sizeof(float) * 4); + vfloat32m4_t _acc00 = 
vfmv_v_f_f32m4(0.0f, vl); + vfloat32m1_t _sum00 = vfmv_v_f_f32m1(matmul_res_data[i * sk + j], 1); + ; + int l = 0; + while (l < head_dim) { + // vlen128 e32m4=16 + int vl_ = vsetvl_e32m4(head_dim - l); + vfloat32m4_t _q0 = vle32_v_f32m4(q_ptr + l, vl_); + vfloat32m4_t _k0 = vle32_v_f32m4(k_ptr + l, vl_); + _acc00 = vfmacc_vv_f32m4(_acc00, _q0, _k0, vl); + l += vl_; + } + _sum00 = vfredosum_vs_f32m4_f32m1(vundefined_f32m1(), _acc00, _sum00, vl); + float res = vfmv_f_s_f32m1_f32(_sum00); + res *= norm_factor; + matmul_res_data[i * sk + j] = res; + max = fmax(max, res); + } + // calculate exp and sum + vfloat32m1_t fred_sum1 = vfmv_v_f_f32m1(0.0f, 1); + float *res_in = &matmul_res_data[i * sk]; + int vl_m4 = vsetvl_e32m4(csrr_vlenb() / sizeof(float) * 4); + int len = 0; + vfloat32m4_t div_sum0 = vfmv_v_f_f32m4(0.0f, vl_m4); + while (len < casual_cnt) { + int vl_in = vsetvl_e32m4(casual_cnt - len); + vfloat32m4_t _res = vle32_v_f32m4(res_in + len, vl_in); + _res = vfadd_vf_f32m4(_res, -max, vl_in); + _res = exp_ps_vfloat32m4(_res, vl_in); + vse32_v_f32m4(res_in + len, _res, vl_in); + div_sum0 = vfadd_vv_f32m4(div_sum0, _res, vl_m4); + len += vl_in; + } + fred_sum1 = vfredosum_vs_f32m4_f32m1(vundefined_f32m1(), div_sum0, fred_sum1, vl_m4); + acc_exp = vfmv_f_s_f32m1_f32(fred_sum1); + // every value div acc_exp + len = 0; + const float _mul_exp = 1.0f / acc_exp; + while (len < casual_cnt) { + int vl_in = vsetvl_e32m4(casual_cnt - len); + vfloat32m4_t _mul_in = vle32_v_f32m4(res_in + len, vl_in); + vfloat32m4_t _output_data = vfmul_vf_f32m4(_mul_in, _mul_exp, vl_in); + vse32_v_f32m4(res_in + len, _output_data, vl_in); + len += vl_in; + } + } + + float *o_out = o; + const float *qk_ptr = &matmul_res_data[0]; + const float *v_ptr = v_in; + int M = sq; + int K = sk; // not casual_cnt + int N = head_dim; + int lda = sk; + int ldb = sk; + int ldc = head_dim; + qk_t1_dot_4x4_fp32(o_out, qk_ptr, v_ptr, M, K, N, lda, ldb, ldc); + } else if (sq == 1) { + const float *q_in = q; + 
const float *k_in = k; + const float *v_in = v; + float max = -FLT_MAX; + float acc_exp = 0.0f; + int casual_cnt = sk; + if (params->casual) casual_cnt = 1 + (sk - sq); + const float *q_ptr = q_in; + // calculate q * k and part of softmax + { + int j = 0; + const int stride = 4; + const int m1_vl = vsetvl_e32m1(csrr_vlenb() / sizeof(float)); + vfloat32m1_t _max = vfmv_v_f_f32m1(max, m1_vl); + for (; j + stride - 1 < casual_cnt; j += stride) { + const float *k_ptr0 = k_in + j * head_dim; + const float *k_ptr1 = k_in + (j + 1) * head_dim; + const float *k_ptr2 = k_in + (j + 2) * head_dim; + const float *k_ptr3 = k_in + (j + 3) * head_dim; + int vl = vsetvl_e32m2(csrr_vlenb() / sizeof(float) * 2); + + vfloat32m2_t _acc00 = vfmv_v_f_f32m2(0.0f, vl); + vfloat32m2_t _acc01 = vfmv_v_f_f32m2(0.0f, vl); + vfloat32m2_t _acc02 = vfmv_v_f_f32m2(0.0f, vl); + vfloat32m2_t _acc03 = vfmv_v_f_f32m2(0.0f, vl); + vfloat32m1_t _sum00 = vfmv_v_f_f32m1(matmul_res_data[j], 1); + vfloat32m1_t _sum01 = vfmv_v_f_f32m1(matmul_res_data[j + 1], 1); + vfloat32m1_t _sum02 = vfmv_v_f_f32m1(matmul_res_data[j + 2], 1); + vfloat32m1_t _sum03 = vfmv_v_f_f32m1(matmul_res_data[j + 3], 1); + + int l = 0; + while (l < head_dim) { + // vlen128 e32m2 = 8 + int vl_ = vsetvl_e32m2(head_dim - l); + vfloat32m2_t _q0 = vle32_v_f32m2(q_ptr + l, vl_); + vfloat32m2_t _k0 = vle32_v_f32m2(k_ptr0 + l, vl_); + vfloat32m2_t _k1 = vle32_v_f32m2(k_ptr1 + l, vl_); + vfloat32m2_t _k2 = vle32_v_f32m2(k_ptr2 + l, vl_); + vfloat32m2_t _k3 = vle32_v_f32m2(k_ptr3 + l, vl_); + + _acc00 = vfmacc_vv_f32m2(_acc00, _q0, _k0, vl); + _acc01 = vfmacc_vv_f32m2(_acc01, _q0, _k1, vl); + _acc02 = vfmacc_vv_f32m2(_acc02, _q0, _k2, vl); + _acc03 = vfmacc_vv_f32m2(_acc03, _q0, _k3, vl); + l += vl_; + } + float res[stride]; + _sum00 = vfredosum_vs_f32m2_f32m1(vundefined_f32m1(), _acc00, _sum00, vl); + res[0] = vfmv_f_s_f32m1_f32(_sum00); + _sum01 = vfredosum_vs_f32m2_f32m1(vundefined_f32m1(), _acc01, _sum01, vl); + res[1] = 
vfmv_f_s_f32m1_f32(_sum01); + _sum02 = vfredosum_vs_f32m2_f32m1(vundefined_f32m1(), _acc02, _sum02, vl); + res[2] = vfmv_f_s_f32m1_f32(_sum02); + _sum03 = vfredosum_vs_f32m2_f32m1(vundefined_f32m1(), _acc03, _sum03, vl); + res[3] = vfmv_f_s_f32m1_f32(_sum03); + vfloat32m1_t save = vle32_v_f32m1(res, m1_vl); + save = vfmul_vf_f32m1(save, norm_factor, m1_vl); + vse32_v_f32m1(matmul_res_data + j, save, m1_vl); + _max = vfmax_vv_f32m1(save, _max, m1_vl); + } + + vfloat32m1_t _min_f = vfmv_v_f_f32m1(max, m1_vl); + vfloat32m1_t _max0 = vfredmax_vs_f32m1_f32m1(vundefined_f32m1(), _max, _min_f, m1_vl); + max = vfmv_f_s_f32m1_f32(_max0); + for (; j < casual_cnt; j++) { + const float *k_ptr = k_in + j * head_dim; + int vl = vsetvl_e32m4(csrr_vlenb() / sizeof(float) * 4); + vfloat32m4_t _acc00 = vfmv_v_f_f32m4(0.0f, vl); + vfloat32m1_t _sum00 = vfmv_v_f_f32m1(matmul_res_data[j], 1); + ; + int l = 0; + while (l < head_dim) { + // vlen128 e32m4=16 + int vl_ = vsetvl_e32m4(head_dim - l); + vfloat32m4_t _q0 = vle32_v_f32m4(q_ptr + l, vl_); + vfloat32m4_t _k0 = vle32_v_f32m4(k_ptr + l, vl_); + _acc00 = vfmacc_vv_f32m4(_acc00, _q0, _k0, vl); + l += vl_; + } + _sum00 = vfredosum_vs_f32m4_f32m1(vundefined_f32m1(), _acc00, _sum00, vl); + float res = vfmv_f_s_f32m1_f32(_sum00); + res *= norm_factor; + matmul_res_data[j] = res; + max = fmax(max, res); + } + + vfloat32m1_t fred_sum1 = vfmv_v_f_f32m1(0.0f, 1); + int len = 0; + float *res_in = &matmul_res_data[0]; + int vl_m4 = vsetvl_e32m4(csrr_vlenb() / sizeof(float) * 4); + vfloat32m4_t div_sum0 = vfmv_v_f_f32m4(0.0f, vl_m4); + while (len < casual_cnt) { + int vl_in = vsetvl_e32m4(casual_cnt - len); + vfloat32m4_t _res = vle32_v_f32m4(res_in + len, vl_in); + _res = vfadd_vf_f32m4(_res, -max, vl_in); + _res = exp_ps_vfloat32m4(_res, vl_in); + vse32_v_f32m4(res_in + len, _res, vl_in); + div_sum0 = vfadd_vv_f32m4(div_sum0, _res, vl_m4); + len += vl_in; + } + fred_sum1 = vfredosum_vs_f32m4_f32m1(vundefined_f32m1(), div_sum0, fred_sum1, 
vl_m4); + acc_exp = vfmv_f_s_f32m1_f32(fred_sum1); + } + // matmul with v + { + float *o_out = o; // for sq = 1 + const float *_in = matmul_res_data; + const float _mul_exp = 1.0f / acc_exp; + const int stride = 4; + const int m1_vl = vsetvl_e32m1(csrr_vlenb() / sizeof(float)); + int dim = 0; + for (; dim + stride - 1 < head_dim; dim += stride) { + const float *v_ptr0 = v_in + dim * sk; + const float *v_ptr1 = v_in + (dim + 1) * sk; + const float *v_ptr2 = v_in + (dim + 2) * sk; + const float *v_ptr3 = v_in + (dim + 3) * sk; + int vl = vsetvl_e32m2(csrr_vlenb() / sizeof(float) * 2); + + vfloat32m2_t _acc00 = vfmv_v_f_f32m2(0.0f, vl); + vfloat32m2_t _acc01 = vfmv_v_f_f32m2(0.0f, vl); + vfloat32m2_t _acc02 = vfmv_v_f_f32m2(0.0f, vl); + vfloat32m2_t _acc03 = vfmv_v_f_f32m2(0.0f, vl); + vfloat32m1_t _sum00 = vfmv_v_f_f32m1(0.0f, 1); + vfloat32m1_t _sum01 = vfmv_v_f_f32m1(0.0f, 1); + vfloat32m1_t _sum02 = vfmv_v_f_f32m1(0.0f, 1); + vfloat32m1_t _sum03 = vfmv_v_f_f32m1(0.0f, 1); + + int j = 0; + while (j < casual_cnt) { + // vlen128 e32m2 = 8 + int vl_v = vsetvl_e32m2(casual_cnt - j); + vfloat32m2_t _in0 = vle32_v_f32m2(_in + j, vl_v); + vfloat32m2_t _v0 = vle32_v_f32m2(v_ptr0 + j, vl_v); + vfloat32m2_t _v1 = vle32_v_f32m2(v_ptr1 + j, vl_v); + vfloat32m2_t _v2 = vle32_v_f32m2(v_ptr2 + j, vl_v); + vfloat32m2_t _v3 = vle32_v_f32m2(v_ptr3 + j, vl_v); + + _acc00 = vfmacc_vv_f32m2(_acc00, _in0, _v0, vl); + _acc01 = vfmacc_vv_f32m2(_acc01, _in0, _v1, vl); + _acc02 = vfmacc_vv_f32m2(_acc02, _in0, _v2, vl); + _acc03 = vfmacc_vv_f32m2(_acc03, _in0, _v3, vl); + j += vl_v; + } + float res[stride]; + _sum00 = vfredosum_vs_f32m2_f32m1(vundefined_f32m1(), _acc00, _sum00, vl); + res[0] = vfmv_f_s_f32m1_f32(_sum00); + _sum01 = vfredosum_vs_f32m2_f32m1(vundefined_f32m1(), _acc01, _sum01, vl); + res[1] = vfmv_f_s_f32m1_f32(_sum01); + _sum02 = vfredosum_vs_f32m2_f32m1(vundefined_f32m1(), _acc02, _sum02, vl); + res[2] = vfmv_f_s_f32m1_f32(_sum02); + _sum03 = 
vfredosum_vs_f32m2_f32m1(vundefined_f32m1(), _acc03, _sum03, vl); + res[3] = vfmv_f_s_f32m1_f32(_sum03); + vfloat32m1_t save = vle32_v_f32m1(res, m1_vl); + save = vfmul_vf_f32m1(save, _mul_exp, m1_vl); + vse32_v_f32m1(o_out + dim, save, m1_vl); + } + for (; dim < head_dim; dim++) { + int j = 0; + int vl_size = vsetvl_e32m4(4 * sizeof(float)); + vfloat32m4_t _acc_o0 = vfmv_v_f_f32m4(0.0f, vl_size); + vfloat32m1_t _out; + while (j < casual_cnt) { + const float *res_in = matmul_res_data + j; + int vl_v = vsetvl_e32m4(casual_cnt - j); + vfloat32m4_t _r0 = vle32_v_f32m4(res_in, vl_v); + const float *v_ptr = v_in + dim * sk + j; + vfloat32m4_t _v0 = vle32_v_f32m4(v_ptr, vl_v); + _acc_o0 = vfmacc_vv_f32m4(_acc_o0, _r0, _v0, vl_size); + j += vl_v; + } + _out = vfmv_v_f_f32m1(0.0f, 1); + _out = vfredosum_vs_f32m4_f32m1(vundefined_f32m1(), _acc_o0, _out, vl_size); + o_out[dim] = vfmv_f_s_f32m1_f32(_out); + o_out[dim] = o_out[dim] / acc_exp; // div acc_exp here ! + } + } + } + shl_mem_free(matmul_res_data); +} \ No newline at end of file diff --git a/source/thead_rvv/fp32/softmax.c b/source/thead_rvv/fp32/softmax.c index 128fad64..1b02f53d 100644 --- a/source/thead_rvv/fp32/softmax.c +++ b/source/thead_rvv/fp32/softmax.c @@ -90,10 +90,11 @@ int shl_rvv_softmax_fp32(struct csinn_tensor *input, struct csinn_tensor *output ptr = exp_buffer + k; float *ptr2 = output_data + k; int n = cnt; + float acc_axp_multi_coeff = 1.0f / acc_exp; while (n > 0) { size_t vl = vsetvl_e32m2(n); vfloat32m2_t _exp = vlse32_v_f32m2(ptr, inner_size * sizeof(float), vl); - vfloat32m2_t _output_data = vfdiv_vf_f32m2(_exp, acc_exp, vl); + vfloat32m2_t _output_data = vfmul_vf_f32m2(_exp, acc_axp_multi_coeff, vl); vsse32_v_f32m2(ptr2, inner_size * sizeof(float), _output_data, vl); ptr += vl * inner_size; diff --git a/source/thead_rvv/fp32/transpose.c b/source/thead_rvv/fp32/transpose.c index d9e47b00..ddc835d1 100644 --- a/source/thead_rvv/fp32/transpose.c +++ b/source/thead_rvv/fp32/transpose.c @@ -69,6 
+69,9 @@ static int transpose_tail_coincide_fp32(struct csinn_tensor *input, struct csinn dst += vl; i += vl; } + if (d == 0) { + break; + } d -= 1; idx[d] += 1; } else { diff --git a/source/thead_rvv/int32/embedding.c b/source/thead_rvv/int32/embedding.c new file mode 100644 index 00000000..a958bb54 --- /dev/null +++ b/source/thead_rvv/int32/embedding.c @@ -0,0 +1,202 @@ +/* + * Copyright (C) 2016-2023 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "rvv/rvv.h" + +int shl_rvv_embedding_fp32_fp32(struct csinn_tensor *input, struct csinn_tensor *weight, + struct csinn_tensor *output, struct csinn_diso_params *params) +{ + int input_len = input->dim[0]; + int embd_size = weight->dim[1]; + int32_t *input_data = input->data; + float *output_data = output->data; + float *weight_data = weight->data; + for (int i = 0; i < input_len; i++) { + int token = input_data[i]; + memcpy(output_data + i * embd_size, weight_data + token * embd_size, + embd_size * sizeof(float)); + } + + return CSINN_TRUE; +} + +int shl_rvv_embedding_fp32_q8(struct csinn_tensor *input, struct csinn_tensor *weight, + struct csinn_tensor *output, struct csinn_diso_params *params) +{ + int input_len = input->dim[0]; + int embd_size = weight->dim[1]; + int32_t *input_data = input->data; + float *output_data = output->data; + + int q8_block_size = 32; + + int8_t *weight_data = weight->data; + __fp16 *scale_data = weight->data + csinn_tensor_size(weight); + for (int i = 0; i < input_len; i++) { + int token = input_data[i]; + + for (int j = 0; j < embd_size; ++j) { + int input_index = token * embd_size + j; + int output_index = i * embd_size + j; + + int8_t value = weight_data[input_index]; + __fp16 scale = scale_data[input_index / q8_block_size]; + output_data[output_index] = (float)(value * scale); + } + } + + return CSINN_TRUE; +} + +int shl_rvv_embedding_fp32_q4(struct csinn_tensor *input, struct csinn_tensor *weight, + struct csinn_tensor *output, struct csinn_diso_params *params) +{ + int input_len = input->dim[0]; + int embd_size = weight->dim[1]; + int32_t *input_data = input->data; + float *output_data = output->data; + + int block_size = 32; + int embd_block_num = embd_size / block_size; + + int8_t *weight_data = weight->data; + __fp16 *scale_data = weight->data + csinn_tensor_size(weight) / 2; + for (int i = 0; i < input_len; i++) { + int token = input_data[i]; + + for (int j = 0; j < embd_block_num; j++) { + for (int k = 0; k 
< block_size / 2; k++) { + int input_index = token * embd_size / 2 + j * block_size / 2 + k; + int output_index = i * embd_size + j * block_size + k; + + int8_t value = weight_data[input_index]; + __fp16 scale = scale_data[input_index * 2 / block_size]; + output_data[output_index] = ((float)(value & 0xf) - 8) * (float)scale; + output_data[output_index + block_size / 2] = + ((float)((value & 0xf0) >> 4) - 8) * (float)scale; + } + } + } + + return CSINN_TRUE; +} + +int shl_rvv_embedding_fp16_fp16(struct csinn_tensor *input, struct csinn_tensor *weight, + struct csinn_tensor *output, struct csinn_diso_params *params) +{ + int input_len = input->dim[0]; + int embd_size = weight->dim[1]; + int32_t *input_data = input->data; + __fp16 *output_data = output->data; + __fp16 *weight_data = weight->data; + for (int i = 0; i < input_len; i++) { + int token = input_data[i]; + memcpy(output_data + i * embd_size, weight_data + token * embd_size, + embd_size * sizeof(__fp16)); + } + + return CSINN_TRUE; +} + +int shl_rvv_embedding_fp16_q8(struct csinn_tensor *input, struct csinn_tensor *weight, + struct csinn_tensor *output, struct csinn_diso_params *params) +{ + int input_len = input->dim[0]; + int embd_size = weight->dim[1]; + int32_t *input_data = input->data; + __fp16 *output_data = output->data; + + int q8_block_size = 32; + + int8_t *weight_data = weight->data; + __fp16 *scale_data = weight->data + csinn_tensor_size(weight); + for (int i = 0; i < input_len; i++) { + int token = input_data[i]; + + for (int j = 0; j < embd_size; ++j) { + int input_index = token * embd_size + j; + int output_index = i * embd_size + j; + + int8_t value = weight_data[input_index]; + __fp16 scale = scale_data[input_index / q8_block_size]; + output_data[output_index] = (__fp16)(value * scale); + } + } + + return CSINN_TRUE; +} + +int shl_rvv_embedding_fp16_q4(struct csinn_tensor *input, struct csinn_tensor *weight, + struct csinn_tensor *output, struct csinn_diso_params *params) +{ + int input_len 
= input->dim[0]; + int embd_size = weight->dim[1]; + int32_t *input_data = input->data; + __fp16 *output_data = output->data; + + int block_size = 32; + int embd_block_num = embd_size / block_size; + + int8_t *weight_data = weight->data; + __fp16 *scale_data = weight->data + csinn_tensor_size(weight) / 2; + for (int i = 0; i < input_len; i++) { + int token = input_data[i]; + + for (int j = 0; j < embd_block_num; j++) { + for (int k = 0; k < block_size / 2; k++) { + int input_index = token * embd_size / 2 + j * block_size / 2 + k; + int output_index = i * embd_size + j * block_size + k; + + int8_t value = weight_data[input_index]; + __fp16 scale = scale_data[input_index * 2 / block_size]; + output_data[output_index] = ((__fp16)(value & 0xf) - 8) * (__fp16)scale; + output_data[output_index + block_size / 2] = + ((__fp16)((value & 0xf0) >> 4) - 8) * (__fp16)scale; + } + } + } + + return CSINN_TRUE; +} + +int shl_rvv_embedding_int32(struct csinn_tensor *input, struct csinn_tensor *weight, + struct csinn_tensor *output, struct csinn_diso_params *params) +{ + if (output->dtype == CSINN_DTYPE_FLOAT32) { + if (weight->dtype == CSINN_DTYPE_INT8 && weight->mtype == CSINN_MEM_TYPE_BLOCK_Q8_0) { + return shl_rvv_embedding_fp32_q8(input, weight, output, params); + } else if (weight->dtype == CSINN_DTYPE_INT4 && + weight->mtype == CSINN_MEM_TYPE_BLOCK_Q4_0) { + return shl_rvv_embedding_fp32_q4(input, weight, output, params); + } else if (weight->dtype == CSINN_DTYPE_FLOAT32) { + return shl_rvv_embedding_fp32_fp32(input, weight, output, params); + } + + } else if (output->dtype == CSINN_DTYPE_FLOAT16) { + if (weight->dtype == CSINN_DTYPE_INT8 && weight->mtype == CSINN_MEM_TYPE_BLOCK_Q8_0) { + return shl_rvv_embedding_fp16_q8(input, weight, output, params); + } else if (weight->dtype == CSINN_DTYPE_INT4 && + weight->mtype == CSINN_MEM_TYPE_BLOCK_Q4_0) { + return shl_rvv_embedding_fp16_q4(input, weight, output, params); + } else if (weight->dtype == CSINN_DTYPE_FLOAT16) { + return 
shl_rvv_embedding_fp16_fp16(input, weight, output, params); + } + } + + return shl_ref_embedding_quant(input, weight, output, params); +} diff --git a/source/thead_rvv/int8/avgpool.c b/source/thead_rvv/int8/avgpool.c index dc329335..6d81dbc5 100644 --- a/source/thead_rvv/int8/avgpool.c +++ b/source/thead_rvv/int8/avgpool.c @@ -51,21 +51,32 @@ int shl_rvv_avgpool2d_init_int8(struct csinn_tensor *input, struct csinn_tensor elempack = in_c % packn == 0 ? packn : 1; } - // global avgpool2d - if (in_h == kernel_h && in_w == kernel_w) { - cb->exec = (elempack % packn == 0) ? shl_rvv_global_avgpool2d_packn_int8 - : shl_ref_global_avgpool2d_quant; - return CSINN_TRUE; - } - if (cb->exec == NULL) { + if (input->layout == CSINN_LAYOUT_NCHW) { + // global avgpool2d + if (in_h == kernel_h && in_w == kernel_w) { + cb->exec = (elempack % packn == 0) ? shl_rvv_global_avgpool2d_packn_int8 + : shl_ref_global_avgpool2d_quant; + return CSINN_TRUE; + } + if (elempack % packn == 0) { cb->exec = shl_rvv_avgpool_packn_int8; - } else { - shl_debug_warning( - "avgpool is not optimized to achieve under this condition on rvv, call reference " - "func replaced.\n"); - cb->exec = shl_ref_avgpool2d_quant; } + + } else if (input->layout == CSINN_LAYOUT_NHWC) { + // global avgpool2d + if (in_h == kernel_h && in_w == kernel_w) { + cb->exec = shl_rvv_global_avgpool2d_nhwc_int8; + return CSINN_TRUE; + } + cb->exec = shl_rvv_avgpool_nhwc_int8; + } + + if (cb->exec == NULL) { + shl_debug_warning( + "avgpool is not optimized to achieve under this condition on rvv, call reference " + "func replaced.\n"); + cb->exec = shl_ref_avgpool2d_quant; } return CSINN_TRUE; } @@ -93,7 +104,12 @@ int shl_rvv_global_avgpool2d_init_int8(struct csinn_tensor *input, struct csinn_ elempack = in_c % packn == 0 ? packn : 1; } - cb->exec = (elempack % packn == 0) ? shl_rvv_global_avgpool2d_packn_int8 - : shl_ref_global_avgpool2d_quant; + if (input->layout == CSINN_LAYOUT_NCHW) { + cb->exec = (elempack % packn == 0) ? 
shl_rvv_global_avgpool2d_packn_int8 + : shl_ref_global_avgpool2d_quant; + } else if (input->layout == CSINN_LAYOUT_NHWC) { + cb->exec = shl_rvv_global_avgpool2d_nhwc_int8; + } + return CSINN_TRUE; } diff --git a/source/thead_rvv/int8/convolution.c b/source/thead_rvv/int8/convolution.c index f2a033c2..ffa44b60 100644 --- a/source/thead_rvv/int8/convolution.c +++ b/source/thead_rvv/int8/convolution.c @@ -32,6 +32,9 @@ int shl_rvv_conv2d_init_int8(struct csinn_tensor *input, struct csinn_tensor *ou int32_t stride_w = params->stride_width; int32_t dilation_h = params->dilation_height; int32_t dilation_w = params->dilation_width; + int32_t *bias_data = (int32_t *)bias->data; + int8_t *kernel_data = (int8_t *)kernel->data; + int32_t input_zp = input->qinfo->zero_point; struct csinn_callback *cb = params->base.cb; if (params->base.quant_type != CSINN_QUANT_INT8_ASYM_W_SYM) { @@ -58,32 +61,42 @@ int shl_rvv_conv2d_init_int8(struct csinn_tensor *input, struct csinn_tensor *ou out_elempack = out_c % packn == 0 ? 
packn : 1; } + bool binary_model_op_init = shl_rvv_get_binary_model_op_init(sess); + // packn if (in_elempack % packn == 0 && out_elempack % packn == 0) { if (kernel_h == 1 && kernel_w == 1 && stride_h == 1 && stride_w == 1 && dilation_h == 1 && dilation_w == 1) { params->conv_extra.conv_mode = CSINN_GEMM; - params->conv_extra.kernel_tm = csinn_alloc_tensor(NULL); - shl_rvv_conv1x1s1_gemm_reorder_kernel_packn_int8(kernel, params); + if (!binary_model_op_init) { + params->conv_extra.kernel_tm = csinn_alloc_tensor(NULL); + shl_rvv_conv1x1s1_gemm_reorder_kernel_packn_int8(kernel, params); + } cb->exec = shl_rvv_conv1x1s1_gemm_packn_int8; } else if (kernel_h == 3 && kernel_w == 3 && stride_h == 1 && stride_w == 1 && dilation_h == 1 && dilation_w == 1) { if (params->group > 1) { params->conv_extra.conv_mode = CSINN_GEMM; - params->conv_extra.kernel_tm = csinn_alloc_tensor(NULL); - shl_rvv_conv_im2col_gemm_reorder_kernel_packn_int8(kernel, params); + if (!binary_model_op_init) { + params->conv_extra.kernel_tm = csinn_alloc_tensor(NULL); + shl_rvv_conv_im2col_gemm_reorder_kernel_packn_int8(kernel, params); + } cb->exec = shl_rvv_conv_im2col_gemm_packn_int8; } else { params->conv_extra.conv_mode = CSINN_WINOGRAD; - struct csinn_tensor *t_kernel = csinn_alloc_tensor(NULL); - shl_rvv_wg_b4f3s1_trans_kernel_packn_int8(kernel, t_kernel); + if (!binary_model_op_init) { + struct csinn_tensor *t_kernel = csinn_alloc_tensor(NULL); + shl_rvv_wg_b4f3s1_trans_kernel_packn_int8(kernel, t_kernel); + params->conv_extra.kernel_tm = t_kernel; + } cb->exec = shl_rvv_wg_b4f3s1_packn_int8; - params->conv_extra.kernel_tm = t_kernel; } } else { params->conv_extra.conv_mode = CSINN_GEMM; - params->conv_extra.kernel_tm = csinn_alloc_tensor(NULL); - shl_rvv_conv_im2col_gemm_reorder_kernel_packn_int8(kernel, params); + if (!binary_model_op_init) { + params->conv_extra.kernel_tm = csinn_alloc_tensor(NULL); + shl_rvv_conv_im2col_gemm_reorder_kernel_packn_int8(kernel, params); + } cb->exec = 
shl_rvv_conv_im2col_gemm_packn_int8; } } @@ -91,13 +104,18 @@ int shl_rvv_conv2d_init_int8(struct csinn_tensor *input, struct csinn_tensor *ou // pack1ton if (in_elempack % packn != 0 && out_elempack % packn == 0) { params->conv_extra.conv_mode = CSINN_GEMM; - params->conv_extra.kernel_tm = csinn_alloc_tensor(NULL); if (kernel_h == 1 && kernel_w == 1 && stride_h == 1 && stride_w == 1 && dilation_h == 1 && dilation_w == 1) { - shl_rvv_conv1x1s1_gemm_reorder_kernel_pack1ton_int8(kernel, params); + if (!binary_model_op_init) { + params->conv_extra.kernel_tm = csinn_alloc_tensor(NULL); + shl_rvv_conv1x1s1_gemm_reorder_kernel_pack1ton_int8(kernel, params); + } cb->exec = shl_rvv_conv1x1s1_gemm_pack1ton_int8; } else { - shl_rvv_conv_im2col_gemm_reorder_kernel_pack1ton_int8(kernel, params); + if (!binary_model_op_init) { + params->conv_extra.kernel_tm = csinn_alloc_tensor(NULL); + shl_rvv_conv_im2col_gemm_reorder_kernel_pack1ton_int8(kernel, params); + } cb->exec = shl_rvv_conv_im2col_gemm_pack1ton_int8; } } @@ -105,13 +123,18 @@ int shl_rvv_conv2d_init_int8(struct csinn_tensor *input, struct csinn_tensor *ou // packnto1 if (in_elempack % packn == 0 && out_elempack % packn != 0) { params->conv_extra.conv_mode = CSINN_GEMM; - params->conv_extra.kernel_tm = csinn_alloc_tensor(NULL); if (kernel_h == 1 && kernel_w == 1 && stride_h == 1 && stride_w == 1 && dilation_h == 1 && dilation_w == 1) { - shl_rvv_conv1x1s1_gemm_reorder_kernel_packnto1_int8(kernel, params); + if (!binary_model_op_init) { + params->conv_extra.kernel_tm = csinn_alloc_tensor(NULL); + shl_rvv_conv1x1s1_gemm_reorder_kernel_packnto1_int8(kernel, params); + } cb->exec = shl_rvv_conv1x1s1_gemm_packnto1_int8; } else { - shl_rvv_conv_im2col_gemm_reorder_kernel_packnto1_int8(kernel, params); + if (!binary_model_op_init) { + params->conv_extra.kernel_tm = csinn_alloc_tensor(NULL); + shl_rvv_conv_im2col_gemm_reorder_kernel_packnto1_int8(kernel, params); + } cb->exec = shl_rvv_conv_im2col_gemm_packnto1_int8; } } @@ 
-119,13 +142,18 @@ int shl_rvv_conv2d_init_int8(struct csinn_tensor *input, struct csinn_tensor *ou // pack1 if (in_elempack % packn != 0 && out_elempack % packn != 0) { params->conv_extra.conv_mode = CSINN_GEMM; - params->conv_extra.kernel_tm = csinn_alloc_tensor(NULL); if (kernel_h == 1 && kernel_w == 1 && stride_h == 1 && stride_w == 1 && dilation_h == 1 && dilation_w == 1) { - shl_rvv_conv1x1s1_gemm_reorder_kernel_int8(kernel, params); + if (!binary_model_op_init) { + params->conv_extra.kernel_tm = csinn_alloc_tensor(NULL); + shl_rvv_conv1x1s1_gemm_reorder_kernel_int8(kernel, params); + } cb->exec = shl_rvv_conv1x1s1_gemm_int8; } else { - shl_rvv_conv_im2col_gemm_reorder_kernel_int8(kernel, params); + if (!binary_model_op_init) { + params->conv_extra.kernel_tm = csinn_alloc_tensor(NULL); + shl_rvv_conv_im2col_gemm_reorder_kernel_int8(kernel, params); + } cb->exec = shl_rvv_conv_im2col_gemm_int8; } } @@ -145,10 +173,6 @@ int shl_rvv_conv2d_init_int8(struct csinn_tensor *input, struct csinn_tensor *ou if (params->conv_extra.conv_mode == CSINN_GEMM) { if (!params->conv_extra.fuse_zp2bias) { params->conv_extra.fuse_zp2bias = true; - int32_t *bias_data = (int32_t *)bias->data; - int8_t *kernel_data = (int8_t *)kernel->data; - int32_t input_zp = input->qinfo->zero_point; - if (bias_data == NULL) { // XXX: memory leak bias_data = (int32_t *)shl_mem_alloc(out_c * params->group * sizeof(int32_t)); @@ -168,10 +192,6 @@ int shl_rvv_conv2d_init_int8(struct csinn_tensor *input, struct csinn_tensor *ou // recover fuse zeropoint to bias for winograd if (params->conv_extra.conv_mode == CSINN_WINOGRAD) { if (params->conv_extra.fuse_zp2bias) { - int32_t *bias_data = (int32_t *)bias->data; - int8_t *kernel_data = (int8_t *)kernel->data; - int32_t input_zp = input->qinfo->zero_point; - int kernel_inner = in_c * kernel_h * kernel_w; for (int oc = 0; oc < out_c * params->group; oc++) { int32_t tmp = 0; diff --git a/source/thead_rvv/int8/convolution1d.c 
b/source/thead_rvv/int8/convolution1d.c index 34a57a6a..3533920c 100644 --- a/source/thead_rvv/int8/convolution1d.c +++ b/source/thead_rvv/int8/convolution1d.c @@ -30,7 +30,8 @@ int shl_rvv_conv1d_init_int8(struct csinn_tensor *input, struct csinn_tensor *ou int32_t dilation_w = params->dilation_width; int32_t group = params->group; struct csinn_callback *cb = params->base.cb; - + struct csinn_session *sess = params->base.sess; + bool binary_model_op_init = shl_rvv_get_binary_model_op_init(sess); if (params->base.quant_type != CSINN_QUANT_INT8_ASYM_W_SYM) { cb->exec = shl_ref_conv1d_quant; return CSINN_TRUE; @@ -58,7 +59,9 @@ int shl_rvv_conv1d_init_int8(struct csinn_tensor *input, struct csinn_tensor *ou bias_data[oc] -= tmp; } } - shl_rvv_conv1d_gemm_reorder_kernel_int8(kernel, params); + if (!binary_model_op_init) { + shl_rvv_conv1d_gemm_reorder_kernel_int8(kernel, params); + } cb->exec = shl_rvv_conv1d_gemm_int8; } else { cb->exec = shl_ref_conv1d_quant; diff --git a/source/thead_rvv/int8/convolution_1x1_int8.c b/source/thead_rvv/int8/convolution_1x1_int8.c index 66142d14..e9450d34 100644 --- a/source/thead_rvv/int8/convolution_1x1_int8.c +++ b/source/thead_rvv/int8/convolution_1x1_int8.c @@ -27,8 +27,10 @@ void shl_rvv_conv1x1s1_gemm_reorder_kernel_int8(struct csinn_tensor *kernel, int m = kernel->dim[0] / group; // out_ch int k = kernel->dim[1]; // in_ch ( kernel->dim[2] = kernel->dim[3] = 1) + csinn_tensor_copy(params->conv_extra.kernel_tm, kernel); #ifdef SHL_USE_DOT_INT8 int k4 = (k % 4 != 0) ? 
((k / 4 + 1) * 4) : k; + params->conv_extra.kernel_tm->dim[1] = k4; params->conv_extra.kernel_tm->data = (int8_t *)shl_mem_alloc(group * m * k4 * sizeof(int8_t)); int8_t *pa_reorder = (int8_t *)params->conv_extra.kernel_tm->data; @@ -48,6 +50,7 @@ void shl_rvv_conv1x1s1_gemm_reorder_kernel_int8(struct csinn_tensor *kernel, // FIXME: free params->conv_extra.kernel_tm->data // memcpy(kernel_data, pa_reorder, group * m * k * sizeof(int8_t)); // shl_mem_free(pa_reorder); + kernel->data = NULL; } int shl_rvv_common_conv1x1_gemm_int8(struct csinn_tensor *input, struct csinn_tensor *output, diff --git a/source/thead_rvv/int8/convolution_3x3_int8_packn.c b/source/thead_rvv/int8/convolution_3x3_int8_packn.c index efa5398a..f6dcb21d 100644 --- a/source/thead_rvv/int8/convolution_3x3_int8_packn.c +++ b/source/thead_rvv/int8/convolution_3x3_int8_packn.c @@ -584,7 +584,17 @@ void shl_rvv_wg_b4f3s1_trans_kernel_packn_int8(struct csinn_tensor *src_kernel, // kernel transform matrix: G const int16_t ktm[6][3] = {{6, 0, 0}, {-4, -4, -4}, {-4, 4, -4}, {1, 2, 4}, {1, -2, 4}, {0, 0, 6}}; - csinn_tensor_copy(dst_kernel, src_kernel); // tensor->dtype ?? 
+ const int packn = csrr_vlenb() / sizeof(int16_t); + const int pack2n = packn * 2; + csinn_tensor_copy(dst_kernel, src_kernel); + dst_kernel->dim_count = 5; + dst_kernel->dim[0] = outch / packn; + dst_kernel->dim[1] = 6; + dst_kernel->dim[2] = 6; + dst_kernel->dim[3] = inch; + dst_kernel->dim[4] = packn; + dst_kernel->layout = CSINN_LAYOUT_O1HWIO0; + dst_kernel->dtype = CSINN_DTYPE_INT16; for (int p = 0; p < outch; p++) { for (int q = 0; q < inch; q++) { const int8_t *kernel0 = kernel_data + p * inch * 9 + q * 9; @@ -616,7 +626,6 @@ void shl_rvv_wg_b4f3s1_trans_kernel_packn_int8(struct csinn_tensor *src_kernel, (int16_t *)shl_mem_alloc(outch / 8 * 36 * inch * 8 * sizeof(int16_t)); dst_kernel->data = kernel_tm_packn; - const int packn = csrr_vlenb() / sizeof(int16_t); for (int oc = 0; oc + packn - 1 < outch; oc += packn) { int16_t *g0 = kernel_tm_packn + oc * 36 * inch; for (int k = 0; k < 36; k++) { @@ -629,6 +638,7 @@ void shl_rvv_wg_b4f3s1_trans_kernel_packn_int8(struct csinn_tensor *src_kernel, } } } + src_kernel->data = 0; shl_mem_free(kernel_tm); } diff --git a/source/thead_rvv/int8/convolution_gemm_int8.c b/source/thead_rvv/int8/convolution_gemm_int8.c index 24a42659..d68ea34a 100644 --- a/source/thead_rvv/int8/convolution_gemm_int8.c +++ b/source/thead_rvv/int8/convolution_gemm_int8.c @@ -27,8 +27,10 @@ void shl_rvv_conv_im2col_gemm_reorder_kernel_int8(struct csinn_tensor *kernel, int m = kernel->dim[0] / group; // m = out_ch / group int k = kernel->dim[1] * kernel->dim[2] * kernel->dim[3]; + csinn_tensor_copy(params->conv_extra.kernel_tm, kernel); #ifdef SHL_USE_DOT_INT8 int k4 = (k % 4 != 0) ? 
((k / 4 + 1) * 4) : k; + params->conv_extra.kernel_tm->dim[1] = k4; params->conv_extra.kernel_tm->data = (int8_t *)shl_mem_alloc(group * m * k4 * sizeof(int8_t)); int8_t *pa_reorder = (int8_t *)params->conv_extra.kernel_tm->data; @@ -48,6 +50,7 @@ void shl_rvv_conv_im2col_gemm_reorder_kernel_int8(struct csinn_tensor *kernel, // FIXME: free params->conv_extra.kernel_tm->data // memcpy(kernel_data, pa_reorder, group * m * k * sizeof(__fp16)); // shl_mem_free(pa_reorder); + kernel->data = NULL; } int shl_rvv_common_conv_gemm_int8(struct csinn_tensor *input, struct csinn_tensor *output, diff --git a/source/thead_rvv/int8/convolution_gemm_int8_pack1ton.c b/source/thead_rvv/int8/convolution_gemm_int8_pack1ton.c index e86e22ea..5ef59e68 100644 --- a/source/thead_rvv/int8/convolution_gemm_int8_pack1ton.c +++ b/source/thead_rvv/int8/convolution_gemm_int8_pack1ton.c @@ -156,8 +156,11 @@ void shl_rvv_conv_im2col_gemm_reorder_kernel_pack1ton_int8(struct csinn_tensor * int in_c = kernel->dim[1]; int maxk = kernel->dim[2] * kernel->dim[3]; + csinn_tensor_copy(params->conv_extra.kernel_tm, kernel); + #ifdef SHL_USE_DOT_INT8 int in_c4 = ((in_c - 1) & -4) + 4; // align 4 for input_channel + params->conv_extra.kernel_tm->dim[1] = in_c4; params->conv_extra.kernel_tm->data = (int8_t *)shl_mem_alloc(out_c * in_c4 * maxk * sizeof(int8_t)); int8_t *pa_reorder = (int8_t *)params->conv_extra.kernel_tm->data; @@ -177,6 +180,7 @@ void shl_rvv_conv_im2col_gemm_reorder_kernel_pack1ton_int8(struct csinn_tensor * im2col_gemm_reorder_kernel_pack1ton_per_group_int8(ker_ptr, ker_tm_ptr, out_cp, in_c, maxk); } #endif // SHL_USE_DOT_INT8 + kernel->data = NULL; } int shl_rvv_common_conv_gemm_pack1ton_int8( diff --git a/source/thead_rvv/int8/convolution_gemm_int8_packn.c b/source/thead_rvv/int8/convolution_gemm_int8_packn.c index fbdba88f..ab2933c3 100644 --- a/source/thead_rvv/int8/convolution_gemm_int8_packn.c +++ b/source/thead_rvv/int8/convolution_gemm_int8_packn.c @@ -104,6 +104,7 @@ void 
shl_rvv_conv_im2col_gemm_reorder_kernel_packn_int8(struct csinn_tensor *ker int in_c = kernel->dim[1]; int maxk = kernel->dim[2] * kernel->dim[3]; + csinn_tensor_copy(params->conv_extra.kernel_tm, kernel); params->conv_extra.kernel_tm->data = (int8_t *)shl_mem_alloc(out_c * in_c * maxk * sizeof(int8_t)); @@ -112,7 +113,7 @@ void shl_rvv_conv_im2col_gemm_reorder_kernel_packn_int8(struct csinn_tensor *ker int8_t *ker_tm_ptr = params->conv_extra.kernel_tm->data + g * out_cp * in_c * maxk; im2col_gemm_reorder_kernel_packn_per_group_int8(ker_ptr, ker_tm_ptr, out_cp, in_c, maxk); } - + kernel->data = NULL; // FIXME: free params->conv_extra.kernel_tm->data // memcpy(kernel_data, pa_reorder, group * m * k * sizeof(__fp16)); // shl_mem_free(pa_reorder); diff --git a/source/thead_rvv/int8/convolution_gemm_int8_packnto1.c b/source/thead_rvv/int8/convolution_gemm_int8_packnto1.c index cb228d66..baa57b49 100644 --- a/source/thead_rvv/int8/convolution_gemm_int8_packnto1.c +++ b/source/thead_rvv/int8/convolution_gemm_int8_packnto1.c @@ -146,6 +146,7 @@ void shl_rvv_conv_im2col_gemm_reorder_kernel_packnto1_int8(struct csinn_tensor * int in_c = kernel->dim[1]; int maxk = kernel->dim[2] * kernel->dim[3]; + csinn_tensor_copy(params->conv_extra.kernel_tm, kernel); params->conv_extra.kernel_tm->data = (int8_t *)shl_mem_alloc(out_c * in_c * maxk * sizeof(int8_t)); @@ -154,7 +155,7 @@ void shl_rvv_conv_im2col_gemm_reorder_kernel_packnto1_int8(struct csinn_tensor * int8_t *ker_tm_ptr = params->conv_extra.kernel_tm->data + g * out_cp * in_c * maxk; im2col_gemm_reorder_kernel_packnto1_per_group_int8(ker_ptr, ker_tm_ptr, out_cp, in_c, maxk); } - + kernel->data = NULL; // FIXME: free params->conv_extra.kernel_tm->data // memcpy(kernel_data, pa_reorder, group * m * k * sizeof(__fp16)); // shl_mem_free(pa_reorder); diff --git a/source/thead_rvv/int8/depthwise_convolution.c b/source/thead_rvv/int8/depthwise_convolution.c index 53a6ca30..7dddc13e 100644 --- 
a/source/thead_rvv/int8/depthwise_convolution.c +++ b/source/thead_rvv/int8/depthwise_convolution.c @@ -39,6 +39,7 @@ int shl_rvv_depthwise_conv2d_init_int8(struct csinn_tensor *input, struct csinn_ int in_elempack = 1; int out_elempack = 1; struct csinn_session *sess = params->base.sess; + bool binary_model_op_init = shl_rvv_get_binary_model_op_init(sess); if (sess->base_run_mode == CSINN_RM_CPU_GRAPH) { struct shl_rvv_option *option = shl_rvv_get_graph_option(sess); if (option && option->use_packn_layout) { @@ -78,7 +79,9 @@ int shl_rvv_depthwise_conv2d_init_int8(struct csinn_tensor *input, struct csinn_ } if (in_elempack % packn == 0 && out_elempack % packn == 0) { - shl_rvv_dwconv_reorder_kernel_packn_int8(kernel, params); + if (!binary_model_op_init) { + shl_rvv_dwconv_reorder_kernel_packn_int8(kernel, params); + } if (kernel_h == 3 && kernel_w == 3 && stride_h == 1 && stride_w == 1) { cb->exec = shl_rvv_dwconv3x3s1_packn_int8; } else if (kernel_h == 3 && kernel_w == 3 && stride_h == 2 && stride_w == 2) { diff --git a/source/thead_rvv/int8/fullyconnected.c b/source/thead_rvv/int8/fullyconnected.c index 31fdf83c..9808a88d 100644 --- a/source/thead_rvv/int8/fullyconnected.c +++ b/source/thead_rvv/int8/fullyconnected.c @@ -26,6 +26,8 @@ int shl_rvv_fullyconnected_init_int8(struct csinn_tensor *input, struct csinn_te const int out_nodes = weights->dim[weights_dims_count - 2]; const int in_nodes = weights->dim[weights_dims_count - 1]; struct csinn_callback *cb = params->base.cb; + struct csinn_session *sess = params->base.sess; + bool binary_model_op_init = shl_rvv_get_binary_model_op_init(sess); if (params->base.quant_type != CSINN_QUANT_INT8_ASYM_W_SYM) { cb->exec = shl_ref_fullyconnected_quant; @@ -34,6 +36,7 @@ int shl_rvv_fullyconnected_init_int8(struct csinn_tensor *input, struct csinn_te // enable fuse zeropoint to bias if (!params->fc_extra.fuse_zp2bias) { + params->fc_extra.fuse_zp2bias = true; int32_t *bias_data = (int32_t *)bias->data; int8_t 
*weights_data = (int8_t *)weights->data; int32_t input_zp = input->qinfo->zero_point; @@ -58,8 +61,9 @@ int shl_rvv_fullyconnected_init_int8(struct csinn_tensor *input, struct csinn_te shl_quantize_multiplier(real_scale, &(weights->qinfo[i].multiplier), &(weights->qinfo[i].shift)); } - - shl_rvv_fc_gemm_reorder_weight_int8(weights); + if (!binary_model_op_init) { + shl_rvv_fc_gemm_reorder_weight_int8(weights); + } cb->exec = shl_rvv_fullyconnected_gemm_int8; return CSINN_TRUE; diff --git a/source/thead_rvv/int8/matmul.c b/source/thead_rvv/int8/matmul.c index 31358937..9c668f0d 100644 --- a/source/thead_rvv/int8/matmul.c +++ b/source/thead_rvv/int8/matmul.c @@ -176,10 +176,14 @@ int shl_rvv_matmul_init_int8(struct csinn_tensor *mat0, struct csinn_tensor *mat struct csinn_tensor *output, struct csinn_matmul_params *params) { struct csinn_callback *cb = params->base.cb; + struct csinn_session *sess = params->base.sess; + bool binary_model_op_init = shl_rvv_get_binary_model_op_init(sess); if (!params->trans_a && !params->trans_b) { if (mat0->dtype == CSINN_DTYPE_INT8 && mat1->dtype == CSINN_DTYPE_INT8) { - if (mat1->is_const) { - shl_rvv_matmul_reorder_weight_int8(mat0, mat1); + if (!binary_model_op_init) { + if (mat1->is_const) { + shl_rvv_matmul_reorder_weight_int8(mat0, mat1); + } } cb->exec = shl_rvv_matmul_int8; } diff --git a/source/thead_rvv/int8/maxpool.c b/source/thead_rvv/int8/maxpool.c index 838324f8..68f789ed 100644 --- a/source/thead_rvv/int8/maxpool.c +++ b/source/thead_rvv/int8/maxpool.c @@ -52,69 +52,82 @@ int shl_rvv_maxpool2d_init_int8(struct csinn_tensor *input, struct csinn_tensor elempack = in_c % packn == 0 ? packn : 1; } - // global maxpool2d // TODO: remove - if (in_h == kernel_h && in_w == kernel_w) { - cb->exec = (elempack % packn == 0) ? 
shl_rvv_global_maxpool2d_packn_int8 - : shl_ref_global_maxpool2d_quant; - return CSINN_TRUE; - } + if (input->layout == CSINN_LAYOUT_NCHW) { + // global maxpool2d // TODO: remove + if (in_h == kernel_h && in_w == kernel_w) { + cb->exec = (elempack % packn == 0) ? shl_rvv_global_maxpool2d_packn_int8 + : shl_ref_global_maxpool2d_quant; + return CSINN_TRUE; + } - if (elempack % packn == 0) { - cb->exec = shl_rvv_maxpool_packn_int8; - } else { - if (stride_h == 2 && stride_w == 2) { - if (kernel_h == 2 && kernel_w == 2) { // 2x2s2 - if (pad_left == 0 && pad_top == 0) { - // adjust pad according to ceil_mode (ceil mode on caffe pytorch..) - if (in_h % 2 == 1 && params->ceil_mode == 1) { - if (params->pad_down == 0) params->pad_down++; - } - if (in_w % 2 == 1 && params->ceil_mode == 1) { - if (params->pad_right == 0) params->pad_right++; - } - // end consider ceil_mode 2x2s2p0 - cb->exec = shl_rvv_maxpool2x2s2_int8; - } else if (pad_left == 1 && pad_top == 1) { - cb->exec = shl_rvv_maxpool2x2s2_p1_int8; - } - } else if (kernel_h == 3 && kernel_w == 3) { // 3x3s2 - if (pad_left == 0 && pad_top == 0) { - // adjust pad according to ceil_mode (ceil mode on caffe pytorch..) - if (in_h % 2 == 0 && params->ceil_mode == 1) { - if (params->pad_down == 0) - params->pad_down++; // origin pad_down mast be equal to zero ? - } - if (in_w % 2 == 0 && params->ceil_mode == 1) { - if (params->pad_right == 0) params->pad_right++; + if (elempack % packn == 0) { + cb->exec = shl_rvv_maxpool_packn_int8; + } else { + if (stride_h == 2 && stride_w == 2) { + if (kernel_h == 2 && kernel_w == 2) { // 2x2s2 + if (pad_left == 0 && pad_top == 0) { + // adjust pad according to ceil_mode (ceil mode on caffe pytorch..) 
+ if (in_h % 2 == 1 && params->ceil_mode == 1) { + if (params->pad_down == 0) params->pad_down++; + } + if (in_w % 2 == 1 && params->ceil_mode == 1) { + if (params->pad_right == 0) params->pad_right++; + } + // end consider ceil_mode 2x2s2p0 + cb->exec = shl_rvv_maxpool2x2s2_int8; + } else if (pad_left == 1 && pad_top == 1) { + cb->exec = shl_rvv_maxpool2x2s2_p1_int8; } - // end consider ceil_mode 3x3s2p0 - cb->exec = shl_rvv_maxpool3x3s2_int8; - } else if (pad_left == 1 && pad_top == 1) { - if (params->ceil_mode == 0) { - cb->exec = shl_rvv_maxpool3x3s2_p1_int8; - } else { - if ((in_w % 2 == 0 && pad_right == 1) || (in_h % 2 == 0 && pad_down == 1)) { - cb->exec = shl_ref_maxpool2d_quant; - } else { + } else if (kernel_h == 3 && kernel_w == 3) { // 3x3s2 + if (pad_left == 0 && pad_top == 0) { + // adjust pad according to ceil_mode (ceil mode on caffe pytorch..) + if (in_h % 2 == 0 && params->ceil_mode == 1) { + if (params->pad_down == 0) + params->pad_down++; // origin pad_down mast be equal to zero ? 
+ } + if (in_w % 2 == 0 && params->ceil_mode == 1) { + if (params->pad_right == 0) params->pad_right++; + } + // end consider ceil_mode 3x3s2p0 + cb->exec = shl_rvv_maxpool3x3s2_int8; + } else if (pad_left == 1 && pad_top == 1) { + if (params->ceil_mode == 0) { cb->exec = shl_rvv_maxpool3x3s2_p1_int8; + } else { + if ((in_w % 2 == 0 && pad_right == 1) || + (in_h % 2 == 0 && pad_down == 1)) { + cb->exec = shl_ref_maxpool2d_quant; + } else { + cb->exec = shl_rvv_maxpool3x3s2_p1_int8; + } } } } - } - } else if (stride_h == 1 && stride_w == 1) { - if (kernel_h == 3 && kernel_w == 3) { - if (pad_left == 1 && pad_top == 1 && pad_right == 1 && pad_down == 1) { - cb->exec = shl_rvv_maxpool3x3s1_p1_int8; + } else if (stride_h == 1 && stride_w == 1) { + if (kernel_h == 3 && kernel_w == 3) { + if (pad_left == 1 && pad_top == 1 && pad_right == 1 && pad_down == 1) { + cb->exec = shl_rvv_maxpool3x3s1_p1_int8; + } } } } - if (cb->exec == NULL) { - shl_debug_warning( - "maxpool is not optimized to achieve under this condition on rvv, call reference " - "func replaced.\n"); - cb->exec = shl_ref_maxpool2d_quant; + + } else if (input->layout == CSINN_LAYOUT_NHWC) { + // global maxpool2d + if (in_h == kernel_h && in_w == kernel_w) { + cb->exec = shl_rvv_global_maxpool2d_nhwc_int8; + return CSINN_TRUE; } + cb->exec = shl_rvv_maxpool_nhwc_int8; + } + + if (cb->exec == NULL) { + shl_debug_warning( + "maxpool is not optimized to achieve under this condition on rvv, call reference " + "func replaced.\n"); + cb->exec = shl_ref_maxpool2d_quant; } + return CSINN_TRUE; } @@ -141,7 +154,12 @@ int shl_rvv_global_maxpool2d_init_int8(struct csinn_tensor *input, struct csinn_ elempack = in_c % packn == 0 ? packn : 1; } - cb->exec = (elempack % packn == 0) ? shl_rvv_global_maxpool2d_packn_int8 - : shl_ref_global_maxpool2d_quant; + if (input->layout == CSINN_LAYOUT_NCHW) { + cb->exec = (elempack % packn == 0) ? 
shl_rvv_global_maxpool2d_packn_int8 + : shl_ref_global_maxpool2d_quant; + } else if (input->layout == CSINN_LAYOUT_NHWC) { + cb->exec = shl_rvv_global_maxpool2d_nhwc_int8; + } + return CSINN_TRUE; } diff --git a/source/thead_rvv/int8/rms_norm.c b/source/thead_rvv/int8/rms_norm.c index 6b391984..a55d867d 100644 --- a/source/thead_rvv/int8/rms_norm.c +++ b/source/thead_rvv/int8/rms_norm.c @@ -18,8 +18,8 @@ #include "rvv/rvv.h" -int shl_rvv_rms_norm_int8(struct csinn_tensor *input, struct csinn_tensor *output, - struct csinn_tensor *weight, struct csinn_rms_norm_params *params) +int shl_rvv_rms_norm_int8(struct csinn_tensor *input, struct csinn_tensor *weight, + struct csinn_tensor *output, struct csinn_rms_norm_params *params) { struct csinn_tensor *float_input = shl_rvv_tensor_transform_f32(input); struct csinn_tensor *float_output = shl_rvv_tensor_transform_f32(output); @@ -44,7 +44,7 @@ int shl_rvv_rms_norm_int8(struct csinn_tensor *input, struct csinn_tensor *outpu float_weight = shl_ref_tensor_transform_f32(weight); } - int ret = shl_rvv_rms_norm_fp32(float_input, float_output, float_weight, params); + int ret = shl_rvv_rms_norm_fp32(float_input, float_weight, float_output, params); if (shl_rvv_tensor_data_convert(float_output, output) != CSINN_TRUE) { shl_debug_warning( diff --git a/source/thead_rvv/int8/transpose.c b/source/thead_rvv/int8/transpose.c index 1736de63..85dfb3de 100644 --- a/source/thead_rvv/int8/transpose.c +++ b/source/thead_rvv/int8/transpose.c @@ -69,6 +69,9 @@ static int transpose_tail_coincide_int8(struct csinn_tensor *input, struct csinn dst += vl; i += vl; } + if (d == 0) { + break; + } d -= 1; idx[d] += 1; } else { diff --git a/source/thead_rvv/performance.c b/source/thead_rvv/performance.c new file mode 100644 index 00000000..97d50edc --- /dev/null +++ b/source/thead_rvv/performance.c @@ -0,0 +1,723 @@ +/* + * Copyright (C) 2016-2023 T-Head Semiconductor Co., Ltd. All rights reserved. 
+ * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "rvv/perf.h" +#include "rvv/rvv.h" + +static struct shl_function_map shl_rvv_kernel_map[] = { + {shl_rvv_common_conv_gemm_fp32, "shl_rvv_common_conv_gemm_fp32"}, + {shl_rvv_common_conv_gemm_fp16, "shl_rvv_common_conv_gemm_fp16"}, + {shl_rvv_common_conv_gemm_int8, "shl_rvv_common_conv_gemm_int8"}, + {shl_rvv_common_conv_gemm_packn_fp32, "shl_rvv_common_conv_gemm_packn_fp32"}, + {shl_rvv_common_conv_gemm_packn_fp16, "shl_rvv_common_conv_gemm_packn_fp16"}, + {shl_rvv_common_conv_gemm_packn_int8, "shl_rvv_common_conv_gemm_packn_int8"}, + {shl_rvv_common_conv_gemm_pack1ton_fp32, "shl_rvv_common_conv_gemm_pack1ton_fp32"}, + {shl_rvv_common_conv_gemm_pack1ton_fp16, "shl_rvv_common_conv_gemm_pack1ton_fp16"}, + {shl_rvv_common_conv_gemm_pack1ton_int8, "shl_rvv_common_conv_gemm_pack1ton_int8"}, + {shl_rvv_common_conv_gemm_packnto1_fp32, "shl_rvv_common_conv_gemm_packnto1_fp32"}, + {shl_rvv_common_conv_gemm_packnto1_fp16, "shl_rvv_common_conv_gemm_packnto1_fp16"}, + {shl_rvv_common_conv_gemm_packnto1_int8, "shl_rvv_common_conv_gemm_packnto1_int8"}, + {shl_rvv_common_conv1x1_gemm_fp32, "shl_rvv_common_conv1x1_gemm_fp32"}, + {shl_rvv_common_conv1x1_gemm_fp16, "shl_rvv_common_conv1x1_gemm_fp16"}, + {shl_rvv_common_conv1x1_gemm_int8, "shl_rvv_common_conv1x1_gemm_int8"}, + {shl_rvv_common_conv1x1_gemm_packn_fp32, "shl_rvv_common_conv1x1_gemm_packn_fp32"}, + 
{shl_rvv_common_conv1x1_gemm_packn_fp16, "shl_rvv_common_conv1x1_gemm_packn_fp16"}, + {shl_rvv_common_conv1x1_gemm_packn_int8, "shl_rvv_common_conv1x1_gemm_packn_int8"}, + {shl_rvv_common_conv1x1_gemm_pack1ton_fp32, "shl_rvv_common_conv1x1_gemm_pack1ton_fp32"}, + {shl_rvv_common_conv1x1_gemm_pack1ton_fp16, "shl_rvv_common_conv1x1_gemm_pack1ton_fp16"}, + {shl_rvv_common_conv1x1_gemm_pack1ton_int8, "shl_rvv_common_conv1x1_gemm_pack1ton_int8"}, + {shl_rvv_common_conv1x1_gemm_packnto1_fp32, "shl_rvv_common_conv1x1_gemm_packnto1_fp32"}, + {shl_rvv_common_conv1x1_gemm_packnto1_fp16, "shl_rvv_common_conv1x1_gemm_packnto1_fp16"}, + {shl_rvv_common_conv1x1_gemm_packnto1_int8, "shl_rvv_common_conv1x1_gemm_packnto1_int8"}, + {shl_rvv_conv1d_im2col_gemm_reorder_kernel_fp32, + "shl_rvv_conv1d_im2col_gemm_reorder_kernel_fp32"}, + {shl_rvv_conv1d_im2col_gemm_reorder_kernel_fp16, + "shl_rvv_conv1d_im2col_gemm_reorder_kernel_fp16"}, + {shl_rvv_conv1d_im2col_gemm_reorder_kernel_fp16_w_int8, + "shl_rvv_conv1d_im2col_gemm_reorder_kernel_fp16_w_int8"}, + {shl_rvv_conv1d_im2col_gemm_dequantize_per_channel_i8_to_f16, + "shl_rvv_conv1d_im2col_gemm_dequantize_per_channel_i8_to_f16"}, + {shl_rvv_conv_im2col_gemm_reorder_kernel_fp32, "shl_rvv_conv_im2col_gemm_reorder_kernel_fp32"}, + {shl_rvv_conv_im2col_gemm_reorder_kernel_fp16, "shl_rvv_conv_im2col_gemm_reorder_kernel_fp16"}, + {shl_rvv_conv_im2col_gemm_reorder_kernel_fp16_w_int8, + "shl_rvv_conv_im2col_gemm_reorder_kernel_fp16_w_int8"}, + {shl_rvv_conv_im2col_gemm_dequantize_per_channel_i8_to_f16, + "shl_rvv_conv_im2col_gemm_dequantize_per_channel_i8_to_f16"}, + {shl_rvv_conv_im2col_gemm_reorder_kernel_int8, "shl_rvv_conv_im2col_gemm_reorder_kernel_int8"}, + {shl_rvv_conv1d_im2col_gemm_fp32, "shl_rvv_conv1d_im2col_gemm_fp32"}, + {shl_rvv_conv1d_im2col_gemm_fp16, "shl_rvv_conv1d_im2col_gemm_fp16"}, + {shl_rvv_conv_im2col_gemm_fp32, "shl_rvv_conv_im2col_gemm_fp32"}, + {shl_rvv_conv_im2col_gemm_fp16, "shl_rvv_conv_im2col_gemm_fp16"}, + 
{shl_rvv_conv_im2col_gemm_int8, "shl_rvv_conv_im2col_gemm_int8"}, + {shl_rvv_conv_im2col_gemm_reorder_kernel_packn_fp32, + "shl_rvv_conv_im2col_gemm_reorder_kernel_packn_fp32"}, + {shl_rvv_conv_im2col_gemm_reorder_kernel_packn_fp16, + "shl_rvv_conv_im2col_gemm_reorder_kernel_packn_fp16"}, + {shl_rvv_conv_im2col_gemm_reorder_kernel_packn_fp16_w_int8, + "shl_rvv_conv_im2col_gemm_reorder_kernel_packn_fp16_w_int8"}, + {shl_rvv_conv_im2col_gemm_packn_dequantize_per_channel_i8_to_f16, + "shl_rvv_conv_im2col_gemm_packn_dequantize_per_channel_i8_to_f16"}, + {shl_rvv_conv_im2col_gemm_reorder_kernel_packn_int8, + "shl_rvv_conv_im2col_gemm_reorder_kernel_packn_int8"}, + {shl_rvv_conv_im2col_gemm_packn_fp32, "shl_rvv_conv_im2col_gemm_packn_fp32"}, + {shl_rvv_conv_im2col_gemm_packn_fp16, "shl_rvv_conv_im2col_gemm_packn_fp16"}, + {shl_rvv_conv_im2col_gemm_packn_int8, "shl_rvv_conv_im2col_gemm_packn_int8"}, + {shl_rvv_conv_im2col_gemm_reorder_kernel_pack1ton_fp32, + "shl_rvv_conv_im2col_gemm_reorder_kernel_pack1ton_fp32"}, + {shl_rvv_conv_im2col_gemm_reorder_kernel_pack1ton_fp16, + "shl_rvv_conv_im2col_gemm_reorder_kernel_pack1ton_fp16"}, + {shl_rvv_conv_im2col_gemm_reorder_kernel_pack1ton_fp16_w_int8, + "shl_rvv_conv_im2col_gemm_reorder_kernel_pack1ton_fp16_w_int8"}, + {shl_rvv_conv_im2col_gemm_pack1ton_dequantize_per_channel_i8_to_f16, + "shl_rvv_conv_im2col_gemm_pack1ton_dequantize_per_channel_i8_to_f16"}, + {shl_rvv_conv_im2col_gemm_reorder_kernel_pack1ton_int8, + "shl_rvv_conv_im2col_gemm_reorder_kernel_pack1ton_int8"}, + {shl_rvv_conv_im2col_gemm_pack1ton_fp32, "shl_rvv_conv_im2col_gemm_pack1ton_fp32"}, + {shl_rvv_conv_im2col_gemm_pack1ton_fp16, "shl_rvv_conv_im2col_gemm_pack1ton_fp16"}, + {shl_rvv_conv_im2col_gemm_pack1ton_int8, "shl_rvv_conv_im2col_gemm_pack1ton_int8"}, + {shl_rvv_conv_im2col_gemm_reorder_kernel_packnto1_fp32, + "shl_rvv_conv_im2col_gemm_reorder_kernel_packnto1_fp32"}, + {shl_rvv_conv_im2col_gemm_reorder_kernel_packnto1_fp16, + 
"shl_rvv_conv_im2col_gemm_reorder_kernel_packnto1_fp16"}, + {shl_rvv_conv_im2col_gemm_reorder_kernel_packnto1_fp16_w_int8, + "shl_rvv_conv_im2col_gemm_reorder_kernel_packnto1_fp16_w_int8"}, + {shl_rvv_conv_im2col_gemm_packnto1_dequantize_per_channel_i8_to_f16, + "shl_rvv_conv_im2col_gemm_packnto1_dequantize_per_channel_i8_to_f16"}, + {shl_rvv_conv_im2col_gemm_reorder_kernel_packnto1_int8, + "shl_rvv_conv_im2col_gemm_reorder_kernel_packnto1_int8"}, + {shl_rvv_conv_im2col_gemm_packnto1_fp32, "shl_rvv_conv_im2col_gemm_packnto1_fp32"}, + {shl_rvv_conv_im2col_gemm_packnto1_fp16, "shl_rvv_conv_im2col_gemm_packnto1_fp16"}, + {shl_rvv_conv_im2col_gemm_packnto1_int8, "shl_rvv_conv_im2col_gemm_packnto1_int8"}, + {shl_rvv_conv1x1s1_gemm_reorder_kernel_fp32, "shl_rvv_conv1x1s1_gemm_reorder_kernel_fp32"}, + {shl_rvv_conv1x1s1_gemm_reorder_kernel_fp16, "shl_rvv_conv1x1s1_gemm_reorder_kernel_fp16"}, + {shl_rvv_conv1x1s1_gemm_reorder_kernel_fp16_w_int8, + "shl_rvv_conv1x1s1_gemm_reorder_kernel_fp16_w_int8"}, + {shl_rvv_conv1x1s1_gemm_reorder_kernel_int8, "shl_rvv_conv1x1s1_gemm_reorder_kernel_int8"}, + {shl_rvv_conv1x1s1_gemm_fp32, "shl_rvv_conv1x1s1_gemm_fp32"}, + {shl_rvv_conv1x1s1_gemm_fp16, "shl_rvv_conv1x1s1_gemm_fp16"}, + {shl_rvv_conv1x1s1_gemm_int8, "shl_rvv_conv1x1s1_gemm_int8"}, + {shl_rvv_conv1x1s1_gemm_reorder_kernel_packn_fp32, + "shl_rvv_conv1x1s1_gemm_reorder_kernel_packn_fp32"}, + {shl_rvv_conv1x1s1_gemm_reorder_kernel_packn_fp16, + "shl_rvv_conv1x1s1_gemm_reorder_kernel_packn_fp16"}, + {shl_rvv_conv1x1s1_gemm_reorder_kernel_packn_fp16_w_int8, + "shl_rvv_conv1x1s1_gemm_reorder_kernel_packn_fp16_w_int8"}, + {shl_rvv_conv1x1s1_gemm_reorder_kernel_packn_int8, + "shl_rvv_conv1x1s1_gemm_reorder_kernel_packn_int8"}, + {shl_rvv_conv1x1s1_gemm_packn_fp32, "shl_rvv_conv1x1s1_gemm_packn_fp32"}, + {shl_rvv_conv1x1s1_gemm_packn_fp16, "shl_rvv_conv1x1s1_gemm_packn_fp16"}, + {shl_rvv_conv1x1s1_gemm_packn_int8, "shl_rvv_conv1x1s1_gemm_packn_int8"}, + 
{shl_rvv_conv1x1s1_gemm_reorder_kernel_pack1ton_fp32, + "shl_rvv_conv1x1s1_gemm_reorder_kernel_pack1ton_fp32"}, + {shl_rvv_conv1x1s1_gemm_reorder_kernel_pack1ton_fp16, + "shl_rvv_conv1x1s1_gemm_reorder_kernel_pack1ton_fp16"}, + {shl_rvv_conv1x1s1_gemm_reorder_kernel_pack1ton_fp16_w_int8, + "shl_rvv_conv1x1s1_gemm_reorder_kernel_pack1ton_fp16_w_int8"}, + {shl_rvv_conv1x1s1_gemm_reorder_kernel_pack1ton_int8, + "shl_rvv_conv1x1s1_gemm_reorder_kernel_pack1ton_int8"}, + {shl_rvv_conv1x1s1_gemm_pack1ton_fp32, "shl_rvv_conv1x1s1_gemm_pack1ton_fp32"}, + {shl_rvv_conv1x1s1_gemm_pack1ton_fp16, "shl_rvv_conv1x1s1_gemm_pack1ton_fp16"}, + {shl_rvv_conv1x1s1_gemm_pack1ton_int8, "shl_rvv_conv1x1s1_gemm_pack1ton_int8"}, + {shl_rvv_conv1x1s1_gemm_reorder_kernel_packnto1_fp32, + "shl_rvv_conv1x1s1_gemm_reorder_kernel_packnto1_fp32"}, + {shl_rvv_conv1x1s1_gemm_reorder_kernel_packnto1_fp16, + "shl_rvv_conv1x1s1_gemm_reorder_kernel_packnto1_fp16"}, + {shl_rvv_conv1x1s1_gemm_reorder_kernel_packnto1_fp16_w_int8, + "shl_rvv_conv1x1s1_gemm_reorder_kernel_packnto1_fp16_w_int8"}, + {shl_rvv_conv1x1s1_gemm_reorder_kernel_packnto1_int8, + "shl_rvv_conv1x1s1_gemm_reorder_kernel_packnto1_int8"}, + {shl_rvv_conv1x1s1_gemm_packnto1_fp32, "shl_rvv_conv1x1s1_gemm_packnto1_fp32"}, + {shl_rvv_conv1x1s1_gemm_packnto1_fp16, "shl_rvv_conv1x1s1_gemm_packnto1_fp16"}, + {shl_rvv_conv1x1s1_gemm_packnto1_int8, "shl_rvv_conv1x1s1_gemm_packnto1_int8"}, + {shl_rvv_wg_b6f3s1_trans_kernel_packn_fp32, "shl_rvv_wg_b6f3s1_trans_kernel_packn_fp32"}, + {shl_rvv_wg_b6f3s1_trans_kernel_packn_fp16, "shl_rvv_wg_b6f3s1_trans_kernel_packn_fp16"}, + {shl_rvv_wg_b6f3s1_packn_fp32, "shl_rvv_wg_b6f3s1_packn_fp32"}, + {shl_rvv_wg_b6f3s1_packn_fp16, "shl_rvv_wg_b6f3s1_packn_fp16"}, + {shl_rvv_wg_b4f3s1_trans_kernel_packn_fp32, "shl_rvv_wg_b4f3s1_trans_kernel_packn_fp32"}, + {shl_rvv_wg_b4f3s1_trans_kernel_packn_fp16, "shl_rvv_wg_b4f3s1_trans_kernel_packn_fp16"}, + {shl_rvv_wg_b4f3s1_trans_kernel_packn_int8, 
"shl_rvv_wg_b4f3s1_trans_kernel_packn_int8"}, + {shl_rvv_wg_b4f3s1_packn_fp32, "shl_rvv_wg_b4f3s1_packn_fp32"}, + {shl_rvv_wg_b4f3s1_packn_fp16, "shl_rvv_wg_b4f3s1_packn_fp16"}, + {shl_rvv_wg_b4f3s1_packn_int8, "shl_rvv_wg_b4f3s1_packn_int8"}, + {shl_rvv_conv3x3s1_direct_reorder_kernel_pack4n_fp16, + "shl_rvv_conv3x3s1_direct_reorder_kernel_pack4n_fp16"}, + {shl_rvv_conv3x3s1_direct_fp16_nhwc, "shl_rvv_conv3x3s1_direct_fp16_nhwc"}, + {shl_rvv_dwconv3x3s1_fp32, "shl_rvv_dwconv3x3s1_fp32"}, + {shl_rvv_dwconv3x3s2_fp32, "shl_rvv_dwconv3x3s2_fp32"}, + {shl_rvv_dwconv3x3s1_fp16, "shl_rvv_dwconv3x3s1_fp16"}, + {shl_rvv_dwconv3x3s2_fp16, "shl_rvv_dwconv3x3s2_fp16"}, + {shl_rvv_dwconv3x3s1_int8, "shl_rvv_dwconv3x3s1_int8"}, + {shl_rvv_dwconv3x3s2_int8, "shl_rvv_dwconv3x3s2_int8"}, + {shl_rvv_dwconv3x3s1_int4, "shl_rvv_dwconv3x3s1_int4"}, + {shl_rvv_dwconv3x3s2_int4, "shl_rvv_dwconv3x3s2_int4"}, + {shl_rvv_dwconv_reorder_kernel_packn_fp32, "shl_rvv_dwconv_reorder_kernel_packn_fp32"}, + {shl_rvv_dwconv_reorder_kernel_packn_fp16, "shl_rvv_dwconv_reorder_kernel_packn_fp16"}, + {shl_rvv_dwconv_reorder_kernel_packn_fp16_w_int8, + "shl_rvv_dwconv_reorder_kernel_packn_fp16_w_int8"}, + {shl_rvv_dwconv_reorder_kernel_packn_int8, "shl_rvv_dwconv_reorder_kernel_packn_int8"}, + {shl_rvv_dwconv3x3s1_packn_fp32, "shl_rvv_dwconv3x3s1_packn_fp32"}, + {shl_rvv_dwconv3x3s2_packn_fp32, "shl_rvv_dwconv3x3s2_packn_fp32"}, + {shl_rvv_dwconv3x3s1_packn_fp16, "shl_rvv_dwconv3x3s1_packn_fp16"}, + {shl_rvv_dwconv3x3s2_packn_fp16, "shl_rvv_dwconv3x3s2_packn_fp16"}, + {shl_rvv_dwconv3x3s1_packn_int8, "shl_rvv_dwconv3x3s1_packn_int8"}, + {shl_rvv_dwconv3x3s2_packn_int8, "shl_rvv_dwconv3x3s2_packn_int8"}, + {shl_rvv_dwconv_packn_fp32, "shl_rvv_dwconv_packn_fp32"}, + {shl_rvv_dwconv_packn_fp16, "shl_rvv_dwconv_packn_fp16"}, + {shl_rvv_dwconv_packn_int8, "shl_rvv_dwconv_packn_int8"}, + {shl_rvv_dwconv_nhwc_fp32, "shl_rvv_dwconv_nhwc_fp32"}, + {shl_rvv_dwconv_nhwc_fp16, "shl_rvv_dwconv_nhwc_fp16"}, + 
{shl_rvv_dwconv_nhwc_int8, "shl_rvv_dwconv_nhwc_int8"}, + {shl_rvv_deconv2d_gemm_col2im_reorder_kernel_fp32, + "shl_rvv_deconv2d_gemm_col2im_reorder_kernel_fp32"}, + {shl_rvv_deconv2d_gemm_col2im_fp32, "shl_rvv_deconv2d_gemm_col2im_fp32"}, + {shl_rvv_deconv2d_gemm_col2im_reorder_kernel_fp16, + "shl_rvv_deconv2d_gemm_col2im_reorder_kernel_fp16"}, + {shl_rvv_deconv2d_gemm_col2im_reorder_kernel_fp16_w_int8, + "shl_rvv_deconv2d_gemm_col2im_reorder_kernel_fp16_w_int8"}, + {shl_rvv_deconv2d_gemm_col2im_dequantize_per_channel_i8_to_f16, + "shl_rvv_deconv2d_gemm_col2im_dequantize_per_channel_i8_to_f16"}, + {shl_rvv_deconv2d_gemm_col2im_fp16, "shl_rvv_deconv2d_gemm_col2im_fp16"}, + {shl_rvv_reorder_kernel_n8_fp32, "shl_rvv_reorder_kernel_n8_fp32"}, + {shl_rvv_reorder_input_z8_fp32, "shl_rvv_reorder_input_z8_fp32"}, + {shl_rvv_gemm_8x8_fp32, "shl_rvv_gemm_8x8_fp32"}, + {shl_rvv_reorder_kernel_n8_fp16, "shl_rvv_reorder_kernel_n8_fp16"}, + {shl_rvv_reorder_input_z16_fp16, "shl_rvv_reorder_input_z16_fp16"}, + {shl_rvv_gemm_8x16_fp16, "shl_rvv_gemm_8x16_fp16"}, + {shl_rvv_reorder_kernel_n8_int8_dot, "shl_rvv_reorder_kernel_n8_int8_dot"}, + {shl_rvv_reorder_input_z8_int8_dot, "shl_rvv_reorder_input_z8_int8_dot"}, +#ifdef SHL_USE_DOT_INT8 + {shl_rvv_gemm_8x8_int8_dot, "shl_rvv_gemm_8x8_int8_dot"}, + {shl_rvv_ncxhwx_gemm_12xpackn_int8_dot, "shl_rvv_ncxhwx_gemm_12xpackn_int8_dot"}, + {shl_rvv_gemm_a0b1_8xmf2_int8_dot, "shl_rvv_gemm_a0b1_8xmf2_int8_dot"}, + {shl_rvv_matmul_reorder_mat0_n8z4_int8_dot, "shl_rvv_matmul_reorder_mat0_n8z4_int8_dot"}, + {shl_rvv_matmul_reorder_mat1_zmf2n4_int8_dot, "shl_rvv_matmul_reorder_mat1_zmf2n4_int8_dot"}, + {shl_rvv_matmul_8xmf2_int8_dot, "shl_rvv_matmul_8xmf2_int8_dot"}, +#endif + {shl_rvv_reorder_kernel_n4_int8_v128, "shl_rvv_reorder_kernel_n4_int8_v128"}, + {shl_rvv_reorder_input_z16_int8_v128, "shl_rvv_reorder_input_z16_int8_v128"}, + {shl_rvv_gemm_4x16_int8_v128, "shl_rvv_gemm_4x16_int8_v128"}, +#ifdef SHL_USE_DOT_INT4 + 
{shl_rvv_reorder_input_n8_int4_dot, "shl_rvv_reorder_input_n8_int4_dot"}, + {shl_rvv_reorder_kernel_n8_int4, "shl_rvv_reorder_kernel_n8_int4"}, + {shl_rvv_gemm_8x8_int4_dot, "shl_rvv_gemm_8x8_int4_dot"}, + {shl_rvv_ncxhwx_gemm_8xpackn_int4, "shl_rvv_ncxhwx_gemm_8xpackn_int4"}, + {shl_rvv_ncxhwx_gemm_12xpackn_int4, "shl_rvv_ncxhwx_gemm_12xpackn_int4"}, + {shl_rvv_gemm_a0b1_4xpackn_int8, "shl_rvv_gemm_a0b1_4xpackn_int8"}, +#endif + {shl_rvv_reorder_kernel_packn_fp32, "shl_rvv_reorder_kernel_packn_fp32"}, + {shl_rvv_reorder_input_z8_packn_fp32, "shl_rvv_reorder_input_z8_packn_fp32"}, + {shl_rvv_reorder_input_z12_packn_fp32, "shl_rvv_reorder_input_z12_packn_fp32"}, + {shl_rvv_ncxhwx_gemm_12xpack2n_fp32, "shl_rvv_ncxhwx_gemm_12xpack2n_fp32"}, + {shl_rvv_reorder_kernel_packn_fp16, "shl_rvv_reorder_kernel_packn_fp16"}, + {shl_rvv_reorder_input_z8_packn_fp16, "shl_rvv_reorder_input_z8_packn_fp16"}, + {shl_rvv_reorder_input_z12_packn_fp16, "shl_rvv_reorder_input_z12_packn_fp16"}, + {shl_rvv_ncxhwx_gemm_12xpack2n_fp16, "shl_rvv_ncxhwx_gemm_12xpack2n_fp16"}, + {shl_rvv_reorder_input_z8_packn_int8_dot, "shl_rvv_reorder_input_z8_packn_int8_dot"}, + {shl_rvv_reorder_input_z12_packn_int8_dot, "shl_rvv_reorder_input_z12_packn_int8_dot"}, + {shl_rvv_reorder_input_z8_packn_int4, "shl_rvv_reorder_input_z8_packn_int4"}, + {shl_rvv_reorder_input_z12_packn_int4, "shl_rvv_reorder_input_z12_packn_int4"}, + {shl_rvv_reorder_input_z12_pack1ton_fp32, "shl_rvv_reorder_input_z12_pack1ton_fp32"}, + {shl_rvv_reorder_input_z12_pack1ton_fp16, "shl_rvv_reorder_input_z12_pack1ton_fp16"}, + {shl_rvv_reorder_input_z4_pack1ton_int8, "shl_rvv_reorder_input_z4_pack1ton_int8"}, + {shl_rvv_reorder_input_z12_pack1ton_int8_dot, "shl_rvv_reorder_input_z12_pack1ton_int8_dot"}, + {shl_rvv_reorder_input_z4_packn_int8, "shl_rvv_reorder_input_z4_packn_int8"}, + {shl_rvv_ncxhwx_gemm_4xpack2n_int8, "shl_rvv_ncxhwx_gemm_4xpack2n_int8"}, + {shl_rvv_reorder_a_block_12xk_fp32, "shl_rvv_reorder_a_block_12xk_fp32"}, + 
{shl_rvv_reorder_b_block_pack2nxk_fp32, "shl_rvv_reorder_b_block_pack2nxk_fp32"}, + {shl_rvv_gemm_block_12xpack2n_fp32, "shl_rvv_gemm_block_12xpack2n_fp32"}, + {shl_rvv_reorder_a_block_12xk_fp16, "shl_rvv_reorder_a_block_12xk_fp16"}, + {shl_rvv_reorder_b_block_pack2nxk_fp16, "shl_rvv_reorder_b_block_pack2nxk_fp16"}, + {shl_rvv_gemm_block_12xpack2n_fp16, "shl_rvv_gemm_block_12xpack2n_fp16"}, + {shl_rvv_avgpool2x2s2_fp32, "shl_rvv_avgpool2x2s2_fp32"}, + {shl_rvv_avgpool2x2s2_fp16, "shl_rvv_avgpool2x2s2_fp16"}, + {shl_rvv_avgpool2x2s2_p1_fp32, "shl_rvv_avgpool2x2s2_p1_fp32"}, + {shl_rvv_avgpool2x2s2_p1_fp16, "shl_rvv_avgpool2x2s2_p1_fp16"}, + {shl_rvv_avgpool3x3s2_fp32, "shl_rvv_avgpool3x3s2_fp32"}, + {shl_rvv_avgpool3x3s2_fp16, "shl_rvv_avgpool3x3s2_fp16"}, + {shl_rvv_avgpool3x3s2_p1_fp32, "shl_rvv_avgpool3x3s2_p1_fp32"}, + {shl_rvv_avgpool3x3s2_p1_fp16, "shl_rvv_avgpool3x3s2_p1_fp16"}, + {shl_rvv_avgpool3x3s1_p1_fp32, "shl_rvv_avgpool3x3s1_p1_fp32"}, + {shl_rvv_avgpool3x3s1_p1_fp16, "shl_rvv_avgpool3x3s1_p1_fp16"}, + {shl_rvv_maxpool2x2s2_fp32, "shl_rvv_maxpool2x2s2_fp32"}, + {shl_rvv_maxpool2x2s2_fp16, "shl_rvv_maxpool2x2s2_fp16"}, + {shl_rvv_maxpool2x2s2_int8, "shl_rvv_maxpool2x2s2_int8"}, + {shl_rvv_maxpool2x2s2_p1_fp32, "shl_rvv_maxpool2x2s2_p1_fp32"}, + {shl_rvv_maxpool2x2s2_p1_fp16, "shl_rvv_maxpool2x2s2_p1_fp16"}, + {shl_rvv_maxpool2x2s2_p1_int8, "shl_rvv_maxpool2x2s2_p1_int8"}, + {shl_rvv_maxpool3x3s2_fp32, "shl_rvv_maxpool3x3s2_fp32"}, + {shl_rvv_maxpool3x3s2_fp16, "shl_rvv_maxpool3x3s2_fp16"}, + {shl_rvv_maxpool3x3s2_int8, "shl_rvv_maxpool3x3s2_int8"}, + {shl_rvv_maxpool3x3s2_p1_fp32, "shl_rvv_maxpool3x3s2_p1_fp32"}, + {shl_rvv_maxpool3x3s2_p1_fp16, "shl_rvv_maxpool3x3s2_p1_fp16"}, + {shl_rvv_maxpool3x3s2_p1_int8, "shl_rvv_maxpool3x3s2_p1_int8"}, + {shl_rvv_maxpool3x3s1_p1_fp32, "shl_rvv_maxpool3x3s1_p1_fp32"}, + {shl_rvv_maxpool3x3s1_p1_fp16, "shl_rvv_maxpool3x3s1_p1_fp16"}, + {shl_rvv_maxpool3x3s1_p1_int8, "shl_rvv_maxpool3x3s1_p1_int8"}, + 
{shl_rvv_global_avgpool2d_fp32, "shl_rvv_global_avgpool2d_fp32"}, + {shl_rvv_global_avgpool2d_fp16, "shl_rvv_global_avgpool2d_fp16"}, + {shl_rvv_global_maxpool2d_fp32, "shl_rvv_global_maxpool2d_fp32"}, + {shl_rvv_global_maxpool2d_fp16, "shl_rvv_global_maxpool2d_fp16"}, + {shl_rvv_global_maxpool2d_packn_fp32, "shl_rvv_global_maxpool2d_packn_fp32"}, + {shl_rvv_global_maxpool2d_packn_fp16, "shl_rvv_global_maxpool2d_packn_fp16"}, + {shl_rvv_global_maxpool2d_packn_int8, "shl_rvv_global_maxpool2d_packn_int8"}, + {shl_rvv_global_avgpool2d_packn_fp32, "shl_rvv_global_avgpool2d_packn_fp32"}, + {shl_rvv_global_avgpool2d_packn_fp16, "shl_rvv_global_avgpool2d_packn_fp16"}, + {shl_rvv_global_avgpool2d_packn_int8, "shl_rvv_global_avgpool2d_packn_int8"}, + {shl_rvv_maxpool_packn_fp32, "shl_rvv_maxpool_packn_fp32"}, + {shl_rvv_maxpool_packn_fp16, "shl_rvv_maxpool_packn_fp16"}, + {shl_rvv_maxpool_packn_int8, "shl_rvv_maxpool_packn_int8"}, + {shl_rvv_avgpool_packn_fp32, "shl_rvv_avgpool_packn_fp32"}, + {shl_rvv_avgpool_packn_fp16, "shl_rvv_avgpool_packn_fp16"}, + {shl_rvv_avgpool_packn_int8, "shl_rvv_avgpool_packn_int8"}, + {shl_rvv_maxpool_nhwc_fp32, "shl_rvv_maxpool_nhwc_fp32"}, + {shl_rvv_maxpool_nhwc_fp16, "shl_rvv_maxpool_nhwc_fp16"}, + {shl_rvv_maxpool_nhwc_int8, "shl_rvv_maxpool_nhwc_int8"}, + {shl_rvv_avgpool_nhwc_fp32, "shl_rvv_avgpool_nhwc_fp32"}, + {shl_rvv_avgpool_nhwc_fp16, "shl_rvv_avgpool_nhwc_fp16"}, + {shl_rvv_avgpool_nhwc_int8, "shl_rvv_avgpool_nhwc_int8"}, + {shl_rvv_global_maxpool2d_nhwc_fp32, "shl_rvv_global_maxpool2d_nhwc_fp32"}, + {shl_rvv_global_maxpool2d_nhwc_fp16, "shl_rvv_global_maxpool2d_nhwc_fp16"}, + {shl_rvv_global_maxpool2d_nhwc_int8, "shl_rvv_global_maxpool2d_nhwc_int8"}, + {shl_rvv_global_avgpool2d_nhwc_fp32, "shl_rvv_global_avgpool2d_nhwc_fp32"}, + {shl_rvv_global_avgpool2d_nhwc_fp16, "shl_rvv_global_avgpool2d_nhwc_fp16"}, + {shl_rvv_global_avgpool2d_nhwc_int8, "shl_rvv_global_avgpool2d_nhwc_int8"}, + {shl_rvv_fc_gemm_reorder_weight_fp32, 
"shl_rvv_fc_gemm_reorder_weight_fp32"}, + {shl_rvv_fc_gemm_reorder_weight_fp16, "shl_rvv_fc_gemm_reorder_weight_fp16"}, + {shl_rvv_fc_gemm_reorder_weight_fp16_w_int8, "shl_rvv_fc_gemm_reorder_weight_fp16_w_int8"}, + {shl_rvv_fc_gemm_reorder_weight_int8, "shl_rvv_fc_gemm_reorder_weight_int8"}, + {shl_rvv_gemm_a0b1_12xpack2n_fp32, "shl_rvv_gemm_a0b1_12xpack2n_fp32"}, + {shl_rvv_gemm_a0b1_12xpack2n_fp16, "shl_rvv_gemm_a0b1_12xpack2n_fp16"}, + {shl_rvv_fullyconnected_gemm_fp32, "shl_rvv_fullyconnected_gemm_fp32"}, + {shl_rvv_fullyconnected_gemm_fp16, "shl_rvv_fullyconnected_gemm_fp16"}, + {shl_rvv_fullyconnected_gemm_int8, "shl_rvv_fullyconnected_gemm_int8"}, + {shl_rvv_relu_fp32, "shl_rvv_relu_fp32"}, + {shl_rvv_relu_fp16, "shl_rvv_relu_fp16"}, + {shl_rvv_relu_int8, "shl_rvv_relu_int8"}, + {shl_rvv_relu6_fp32, "shl_rvv_relu6_fp32"}, + {shl_rvv_relu6_fp16, "shl_rvv_relu6_fp16"}, + {shl_rvv_relu6_int8, "shl_rvv_relu6_int8"}, + {shl_rvv_leaky_relu_fp32, "shl_rvv_leaky_relu_fp32"}, + {shl_rvv_leaky_relu_fp16, "shl_rvv_leaky_relu_fp16"}, + {shl_rvv_leaky_relu_int8, "shl_rvv_leaky_relu_int8"}, + {shl_rvv_sigmoid_fp32, "shl_rvv_sigmoid_fp32"}, + {shl_rvv_sigmoid_fp16, "shl_rvv_sigmoid_fp16"}, + {shl_rvv_sigmoid_int8, "shl_rvv_sigmoid_int8"}, + {shl_rvv_softmax_fp32, "shl_rvv_softmax_fp32"}, + {shl_rvv_softmax_fp16, "shl_rvv_softmax_fp16"}, + {shl_rvv_softmax_int8, "shl_rvv_softmax_int8"}, + {shl_rvv_prelu_fp32, "shl_rvv_prelu_fp32"}, + {shl_rvv_prelu_fp16, "shl_rvv_prelu_fp16"}, + {shl_rvv_prelu_int8, "shl_rvv_prelu_int8"}, + {shl_rvv_clip_fp32, "shl_rvv_clip_fp32"}, + {shl_rvv_clip_fp16, "shl_rvv_clip_fp16"}, + {shl_rvv_clip_int8, "shl_rvv_clip_int8"}, + {shl_rvv_silu_fp32, "shl_rvv_silu_fp32"}, + {shl_rvv_silu_fp16, "shl_rvv_silu_fp16"}, + {shl_rvv_silu_int8, "shl_rvv_silu_int8"}, + {shl_rvv_concat_fp32, "shl_rvv_concat_fp32"}, + {shl_rvv_concat_fp16, "shl_rvv_concat_fp16"}, + {shl_rvv_concat_int8, "shl_rvv_concat_int8"}, + {shl_rvv_split_fp32, "shl_rvv_split_fp32"}, + 
{shl_rvv_split_fp16, "shl_rvv_split_fp16"}, + {shl_rvv_split_int8, "shl_rvv_split_int8"}, + {shl_rvv_reshape_fp32, "shl_rvv_reshape_fp32"}, + {shl_rvv_reshape_fp16, "shl_rvv_reshape_fp16"}, + {shl_rvv_reshape_int8, "shl_rvv_reshape_int8"}, + {shl_rvv_transpose_fp32, "shl_rvv_transpose_fp32"}, + {shl_rvv_transpose_fp16, "shl_rvv_transpose_fp16"}, + {shl_rvv_transpose_int8, "shl_rvv_transpose_int8"}, + {shl_rvv_gather_fp32, "shl_rvv_gather_fp32"}, + {shl_rvv_gather_fp16, "shl_rvv_gather_fp16"}, + {shl_rvv_gather_int8, "shl_rvv_gather_int8"}, + {shl_rvv_strided_slice_fp16, "shl_rvv_strided_slice_fp16"}, + {shl_rvv_add_fp32, "shl_rvv_add_fp32"}, + {shl_rvv_add_fp16, "shl_rvv_add_fp16"}, + {shl_rvv_add_int8, "shl_rvv_add_int8"}, + {shl_rvv_sub_fp32, "shl_rvv_sub_fp32"}, + {shl_rvv_sub_fp16, "shl_rvv_sub_fp16"}, + {shl_rvv_sub_int8, "shl_rvv_sub_int8"}, + {shl_rvv_mul_fp32, "shl_rvv_mul_fp32"}, + {shl_rvv_mul_fp16, "shl_rvv_mul_fp16"}, + {shl_rvv_mul_int8, "shl_rvv_mul_int8"}, + {shl_rvv_div_fp32, "shl_rvv_div_fp32"}, + {shl_rvv_div_fp16, "shl_rvv_div_fp16"}, + {shl_rvv_div_int8, "shl_rvv_div_int8"}, + {shl_rvv_reduce_sum_int8, "shl_rvv_reduce_sum_int8"}, + {shl_rvv_erf_fp32, "shl_rvv_erf_fp32"}, + {shl_rvv_erf_fp16, "shl_rvv_erf_fp16"}, + {shl_rvv_erf_int8, "shl_rvv_erf_int8"}, + {shl_rvv_layer_norm_fp32, "shl_rvv_layer_norm_fp32"}, + {shl_rvv_layer_norm_fp16, "shl_rvv_layer_norm_fp16"}, + {shl_rvv_layer_norm_int8, "shl_rvv_layer_norm_int8"}, + {shl_rvv_rms_norm_fp32, "shl_rvv_rms_norm_fp32"}, + {shl_rvv_rms_norm_fp16, "shl_rvv_rms_norm_fp16"}, + {shl_rvv_rms_norm_int8, "shl_rvv_rms_norm_int8"}, + {shl_rvv_matmul_reorder_weight_fp32, "shl_rvv_matmul_reorder_weight_fp32"}, + {shl_rvv_matmul_reorder_weight_fp16, "shl_rvv_matmul_reorder_weight_fp16"}, + {shl_rvv_matmul_reorder_weight_fp16_w_int8, "shl_rvv_matmul_reorder_weight_fp16_w_int8"}, + {shl_rvv_matmul_reorder_weight_int8, "shl_rvv_matmul_reorder_weight_int8"}, + {shl_rvv_matmul_block_fp32, 
"shl_rvv_matmul_block_fp32"}, + {shl_rvv_matmul_block_fp16, "shl_rvv_matmul_block_fp16"}, + {shl_rvv_matmul_block_fp16_w_int8, "shl_rvv_matmul_block_fp16_w_int8"}, + {shl_rvv_matmul_common_int8, "shl_rvv_matmul_common_int8"}, + {shl_rvv_matmul_reorder_mat0_n4_int8, "shl_rvv_matmul_reorder_mat0_n4_int8"}, + {shl_rvv_matmul_reorder_mat1_zpackn_int8, "shl_rvv_matmul_reorder_mat1_zpackn_int8"}, + {shl_rvv_matmul_4xpackn_int8, "shl_rvv_matmul_4xpackn_int8"}, + {shl_rvv_matmul_fp32, "shl_rvv_matmul_fp32"}, + {shl_rvv_matmul_fp16, "shl_rvv_matmul_fp16"}, + {shl_rvv_matmul_int8, "shl_rvv_matmul_int8"}, + {shl_rvv_pad_input_fp32, "shl_rvv_pad_input_fp32"}, + {shl_rvv_pad_input_fp16, "shl_rvv_pad_input_fp16"}, + {shl_rvv_pad_input_int8, "shl_rvv_pad_input_int8"}, + {shl_rvv_pad_input_packn_fp32, "shl_rvv_pad_input_packn_fp32"}, + {shl_rvv_pad_input_packn_fp16, "shl_rvv_pad_input_packn_fp16"}, + {shl_rvv_pad_input_packn_int8, "shl_rvv_pad_input_packn_int8"}, + {shl_rvv_pad_input_pack1ton_fp32, "shl_rvv_pad_input_pack1ton_fp32"}, + {shl_rvv_pad_input_pack1ton_fp16, "shl_rvv_pad_input_pack1ton_fp16"}, + {shl_rvv_pad_input_pack1ton_int8, "shl_rvv_pad_input_pack1ton_int8"}, + {shl_rvv_pad_input_nhwc_fp32, "shl_rvv_pad_input_nhwc_fp32"}, + {shl_rvv_pad_input_nhwc_fp16, "shl_rvv_pad_input_nhwc_fp16"}, + {shl_rvv_pad_input_nhwc_int8, "shl_rvv_pad_input_nhwc_int8"}, + {shl_rvv_avgpool_get_window_size, "shl_rvv_avgpool_get_window_size"}, + {shl_rvv_conv1d_gemm_reorder_kernel_int8, "shl_rvv_conv1d_gemm_reorder_kernel_int8"}, + {shl_rvv_conv1d_gemm_int8, "shl_rvv_conv1d_gemm_int8"}, + {shl_rvv_dwconv1d_int8, "shl_rvv_dwconv1d_int8"}, + {shl_rvv_transpose_get_tail, "shl_rvv_transpose_get_tail"}, + {shl_rvv_transpose_get_in_index, "shl_rvv_transpose_get_in_index"}, + {shl_rvv_transpose_get_out_index, "shl_rvv_transpose_get_out_index"}, + {shl_rvv_binary_op_broadcast_fp32, "shl_rvv_binary_op_broadcast_fp32"}, + {shl_rvv_binary_op_broadcast_fp16, "shl_rvv_binary_op_broadcast_fp16"}, + 
{shl_rvv_binary_op_broadcast_int8, "shl_rvv_binary_op_broadcast_int8"}, + {shl_rvv_embedding_int32, "shl_rvv_embedding_int32"}, + {shl_rvv_expand_dims_fp32, "shl_rvv_expand_dims_fp32"}, + {shl_rvv_expand_dims_fp16, "shl_rvv_expand_dims_fp16"}, + {shl_rvv_rope_fp32, "shl_rvv_rope_fp32"}, + {shl_rvv_rope_fp16, "shl_rvv_rope_fp16"}, + {shl_rvv_scaled_dot_product_attention_fp32, "shl_rvv_scaled_dot_product_attention_fp32"}, + {shl_rvv_scaled_dot_product_attention_fp16, "shl_rvv_scaled_dot_product_attention_fp16"}, + {shl_rvv_llm_pos_fp16, "shl_rvv_llm_pos_fp16"}, +#ifdef SHL_USE_DOT_INT4 + {shl_rvv_conv2d_init_int4, "shl_rvv_conv2d_init_int4"}, + {shl_rvv_conv_im2col_gemm_reorder_kernel_int4, "shl_rvv_conv_im2col_gemm_reorder_kernel_int4"}, + {shl_rvv_conv_im2col_gemm_int4, "shl_rvv_conv_im2col_gemm_int4"}, + {shl_rvv_conv_im2col_gemm_reorder_kernel_packn_int4, + "shl_rvv_conv_im2col_gemm_reorder_kernel_packn_int4"}, + {shl_rvv_conv_im2col_gemm_packn_int4, "shl_rvv_conv_im2col_gemm_packn_int4"}, + {shl_rvv_conv1x1s1_gemm_reorder_kernel_int4, "shl_rvv_conv1x1s1_gemm_reorder_kernel_int4"}, + {shl_rvv_conv1x1s1_gemm_int4, "shl_rvv_conv1x1s1_gemm_int4"}, + {shl_rvv_conv1x1s1_gemm_reorder_kernel_packn_int4, + "shl_rvv_conv1x1s1_gemm_reorder_kernel_packn_int4"}, + {shl_rvv_conv1x1s1_gemm_packn_int4, "shl_rvv_conv1x1s1_gemm_packn_int4"}, + {shl_rvv_fc_gemv_transform_weight_int4_dot, "shl_rvv_fc_gemv_transform_weight_int4_dot"}, + {shl_rvv_fullyconnected_packn_int4_dot, "shl_rvv_fullyconnected_packn_int4_dot"}, +#endif + {NULL, NULL}}; + +char *shl_ref_get_kernel_name(void *exec); + +char *shl_rvv_get_kernel_name(void *exec) +{ + char *name = shl_find_function_name(shl_rvv_kernel_map, exec); + if (name == NULL) { + name = shl_ref_get_kernel_name(exec); + } + return name; +} + +int shl_rvv_conv2d_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params, struct csinn_perf_info 
*perf_info) +{ + perf_info->kernel_name = shl_rvv_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_rvv_depthwise_conv2d_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params, + struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_rvv_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_rvv_conv1d_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv1d_params *params, struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_rvv_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_rvv_deconv2d_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params, struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_rvv_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_rvv_fullyconnected_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weights, struct csinn_tensor *bias, + struct csinn_fc_params *params, struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_rvv_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_rvv_maxpool2d_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params, struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_rvv_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_rvv_avgpool2d_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params, struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_rvv_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_rvv_add_perf(struct csinn_tensor *input0, struct csinn_tensor 
*input1, + struct csinn_tensor *output, struct csinn_diso_params *params, + struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_rvv_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_rvv_sub_perf(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params, + struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_rvv_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_rvv_mul_perf(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params, + struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_rvv_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_rvv_div_perf(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params, + struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_rvv_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_rvv_concat_perf(struct csinn_tensor **input, struct csinn_tensor *output, + struct csinn_clip_params *params, struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_rvv_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_rvv_leaky_relu_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params, struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_rvv_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_rvv_relu_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params, struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_rvv_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_rvv_relu6_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params, struct csinn_perf_info *perf_info) 
+{ + perf_info->kernel_name = shl_rvv_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_rvv_global_avgpool2d_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params, + struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_rvv_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_rvv_global_maxpool2d_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params, + struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_rvv_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_rvv_reshape_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reshape_params *params, struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_rvv_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_rvv_sigmoid_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_sigmoid_params *params, struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_rvv_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_rvv_softmax_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_softmax_params *params, struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_rvv_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_rvv_reduce_sum_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params, struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_rvv_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_rvv_prelu_perf(struct csinn_tensor *input, struct csinn_tensor *alpha, + struct csinn_tensor *output, struct csinn_prelu_params *params, + struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_rvv_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int 
shl_rvv_layer_norm_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *gamma, struct csinn_tensor *beta, + struct csinn_layer_norm_params *params, + struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_rvv_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_rvv_clip_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_clip_params *params, struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_rvv_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_rvv_transpose_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_transpose_params *params, struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_rvv_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_rvv_matmul_perf(struct csinn_tensor *mat0, struct csinn_tensor *mat1, + struct csinn_tensor *output, struct csinn_matmul_params *params, + struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_rvv_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_rvv_gather_perf(struct csinn_tensor *input, struct csinn_tensor *indices, + struct csinn_tensor *output, struct csinn_gather_params *params, + struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_rvv_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_rvv_erf_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_clip_params *params, struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_rvv_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_rvv_strided_slice_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_strided_slice_params *params, + struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_rvv_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_rvv_split_perf(struct csinn_tensor *input, 
struct csinn_tensor **output, + struct csinn_split_params *params, struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_rvv_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_rvv_silu_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_sigmoid_params *params, struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_rvv_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_rvv_rms_norm_perf(struct csinn_tensor *input, struct csinn_tensor *weight, + struct csinn_tensor *output, struct csinn_rms_norm_params *params, + struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_rvv_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_rvv_embedding_perf(struct csinn_tensor *input, struct csinn_tensor *weight, + struct csinn_tensor *output, struct csinn_diso_params *params, + struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_rvv_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_rvv_expand_dims_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_expand_dims_params *params, + struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_rvv_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_rvv_rope_perf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_rope_params *params, struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_rvv_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_rvv_scaled_dot_product_attention_perf(struct csinn_tensor *query, struct csinn_tensor *key, + struct csinn_tensor *value, + struct csinn_tensor *output_tensor, + struct csinn_scale_dot_attention_params *params, + struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_rvv_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} + +int shl_rvv_llm_pos_perf(struct csinn_tensor *input, struct 
csinn_tensor *output, + struct csinn_llm_pos_params *params, struct csinn_perf_info *perf_info) +{ + perf_info->kernel_name = shl_rvv_get_kernel_name(params->base.cb->exec); + return CSINN_TRUE; +} diff --git a/source/thead_rvv/setup.c b/source/thead_rvv/setup.c index 8e01c29e..2fefeeef 100644 --- a/source/thead_rvv/setup.c +++ b/source/thead_rvv/setup.c @@ -17,13 +17,14 @@ */ #include "rvv/cap.h" +#include "rvv/perf.h" #include "rvv/rvv.h" #define RVV_OP_PATTERN_MAX 120 static struct shl_cb_table shl_rvv_cb_table[RVV_OP_PATTERN_MAX]; void shl_rvv_reg_op(enum csinn_dtype_enum dtype, enum csinn_op_enum op_name, void *init, void *exec, - void *est, void *cap) + void *est, void *cap, void *perf) { static int i = 0; if (i >= RVV_OP_PATTERN_MAX) { @@ -34,6 +35,7 @@ void shl_rvv_reg_op(enum csinn_dtype_enum dtype, enum csinn_op_enum op_name, voi shl_rvv_cb_table[i].shl_cb_value.exec = exec; shl_rvv_cb_table[i].shl_cb_value.est = est; shl_rvv_cb_table[i].shl_cb_value.caps = cap; + shl_rvv_cb_table[i].shl_cb_value.perf = perf; i++; } @@ -67,419 +69,461 @@ void __attribute__((weak)) shl_target_init_rvv() { #ifndef CONFIG_THEAD_RVV_CONVOLUTION_FP32_DISABLED shl_rvv_reg_op(CSINN_DTYPE_FLOAT32, CSINN_OP_CONV2D, shl_rvv_conv2d_init_fp32, NULL, - shl_gref_conv2d, shl_rvv_conv2d_cap); + shl_gref_conv2d, shl_rvv_conv2d_cap, shl_rvv_conv2d_perf); shl_rvv_reg_op(CSINN_DTYPE_FLOAT32, CSINN_OP_GROUP_CONV2D, shl_rvv_conv2d_init_fp32, NULL, - shl_gref_group_conv2d, shl_rvv_conv2d_cap); + shl_gref_group_conv2d, shl_rvv_conv2d_cap, shl_rvv_conv2d_perf); #endif #ifndef CONFIG_THEAD_RVV_CONVOLUTION_FP16_DISABLED shl_rvv_reg_op(CSINN_DTYPE_FLOAT16, CSINN_OP_CONV2D, shl_rvv_conv2d_init_fp16, NULL, - shl_gref_conv2d, shl_rvv_conv2d_cap); + shl_gref_conv2d, shl_rvv_conv2d_cap, shl_rvv_conv2d_perf); shl_rvv_reg_op(CSINN_DTYPE_FLOAT16, CSINN_OP_GROUP_CONV2D, shl_rvv_conv2d_init_fp16, NULL, - shl_gref_group_conv2d, shl_rvv_conv2d_cap); + shl_gref_group_conv2d, shl_rvv_conv2d_cap, 
shl_rvv_conv2d_perf); #endif #ifndef CONFIG_THEAD_RVV_DEPTHWISE_CONVOLUTION_FP32_DISABLED shl_rvv_reg_op(CSINN_DTYPE_FLOAT32, CSINN_OP_DEPTHWISE_CONV2D, shl_rvv_depthwise_conv2d_init_fp32, NULL, shl_gref_depthwise_conv2d, - shl_rvv_depthwise_conv2d_cap); + shl_rvv_depthwise_conv2d_cap, shl_rvv_depthwise_conv2d_perf); #endif #ifndef CONFIG_THEAD_RVV_DEPTHWISE_CONVOLUTION_FP16_DISABLED shl_rvv_reg_op(CSINN_DTYPE_FLOAT16, CSINN_OP_DEPTHWISE_CONV2D, shl_rvv_depthwise_conv2d_init_fp16, NULL, shl_gref_depthwise_conv2d, - shl_rvv_depthwise_conv2d_cap); + shl_rvv_depthwise_conv2d_cap, shl_rvv_depthwise_conv2d_perf); #endif #ifndef CONFIG_THEAD_RVV_DEPTHWISE_CONVOLUTION_INT8_DISABLED shl_rvv_reg_op(CSINN_DTYPE_INT8, CSINN_OP_DEPTHWISE_CONV2D, shl_rvv_depthwise_conv2d_init_int8, - NULL, shl_gref_depthwise_conv2d, shl_rvv_depthwise_conv2d_cap); + NULL, shl_gref_depthwise_conv2d, shl_rvv_depthwise_conv2d_cap, + shl_rvv_depthwise_conv2d_perf); shl_rvv_reg_op(CSINN_DTYPE_INT8, CSINN_OP_DEPTHWISE_CONV2D_RELU, shl_rvv_depthwise_conv2d_init_int8, NULL, shl_gref_depthwise_conv2d_relu, - shl_rvv_depthwise_conv2d_cap); + shl_rvv_depthwise_conv2d_cap, shl_rvv_depthwise_conv2d_perf); shl_rvv_reg_op(CSINN_DTYPE_INT8, CSINN_OP_DEPTHWISE_CONV2D_RELU6, shl_rvv_depthwise_conv2d_init_int8, NULL, shl_gref_depthwise_conv2d_relu6, - shl_rvv_depthwise_conv2d_cap); + shl_rvv_depthwise_conv2d_cap, shl_rvv_depthwise_conv2d_perf); #endif #ifndef CONFIG_THEAD_RVV_DECONVOLUTION_FP32_DISABLED shl_rvv_reg_op(CSINN_DTYPE_FLOAT32, CSINN_OP_DECONV2D, shl_rvv_deconv2d_init_fp32, NULL, - shl_gref_deconv2d, shl_rvv_deconv2d_cap); + shl_gref_deconv2d, shl_rvv_deconv2d_cap, shl_rvv_deconv2d_perf); shl_rvv_reg_op(CSINN_DTYPE_FLOAT32, CSINN_OP_GROUP_DECONV2D, shl_rvv_deconv2d_init_fp32, NULL, - shl_gref_group_deconv2d, shl_rvv_deconv2d_cap); + shl_gref_group_deconv2d, shl_rvv_deconv2d_cap, shl_rvv_deconv2d_perf); #endif #ifndef CONFIG_THEAD_RVV_DECONVOLUTION_FP16_DISABLED shl_rvv_reg_op(CSINN_DTYPE_FLOAT16, 
CSINN_OP_DECONV2D, shl_rvv_deconv2d_init_fp16, NULL, - shl_gref_deconv2d, shl_rvv_deconv2d_cap); + shl_gref_deconv2d, shl_rvv_deconv2d_cap, shl_rvv_deconv2d_perf); shl_rvv_reg_op(CSINN_DTYPE_FLOAT16, CSINN_OP_GROUP_DECONV2D, shl_rvv_deconv2d_init_fp16, NULL, - shl_gref_group_deconv2d, shl_rvv_deconv2d_cap); + shl_gref_group_deconv2d, shl_rvv_deconv2d_cap, shl_rvv_deconv2d_perf); #endif #ifndef CONFIG_THEAD_RVV_MAXPOOL_FP32_DISABLED shl_rvv_reg_op(CSINN_DTYPE_FLOAT32, CSINN_OP_MAXPOOL2D, shl_rvv_maxpool2d_init_fp32, NULL, - shl_gref_maxpool2d, shl_rvv_maxpool2d_cap); + shl_gref_maxpool2d, shl_rvv_maxpool2d_cap, shl_rvv_maxpool2d_perf); #endif #ifndef CONFIG_THEAD_RVV_MAXPOOL_FP16_DISABLED shl_rvv_reg_op(CSINN_DTYPE_FLOAT16, CSINN_OP_MAXPOOL2D, shl_rvv_maxpool2d_init_fp16, NULL, - shl_gref_maxpool2d, shl_rvv_maxpool2d_cap); + shl_gref_maxpool2d, shl_rvv_maxpool2d_cap, shl_rvv_maxpool2d_perf); #endif #ifndef CONFIG_THEAD_RVV_MAXPOOL_INT8_DISABLED shl_rvv_reg_op(CSINN_DTYPE_INT8, CSINN_OP_MAXPOOL2D, shl_rvv_maxpool2d_init_int8, NULL, - shl_gref_maxpool2d, shl_rvv_maxpool2d_cap); + shl_gref_maxpool2d, shl_rvv_maxpool2d_cap, shl_rvv_maxpool2d_perf); #endif #ifndef CONFIG_THEAD_RVV_AVERAGEPOOL_FP32_DISABLED shl_rvv_reg_op(CSINN_DTYPE_FLOAT32, CSINN_OP_AVGPOOL2D, shl_rvv_avgpool2d_init_fp32, NULL, - shl_gref_avgpool2d, shl_rvv_avgpool2d_cap); + shl_gref_avgpool2d, shl_rvv_avgpool2d_cap, shl_rvv_avgpool2d_perf); #endif #ifndef CONFIG_THEAD_RVV_AVERAGEPOOL_FP16_DISABLED shl_rvv_reg_op(CSINN_DTYPE_FLOAT16, CSINN_OP_AVGPOOL2D, shl_rvv_avgpool2d_init_fp16, NULL, - shl_gref_avgpool2d, shl_rvv_avgpool2d_cap); + shl_gref_avgpool2d, shl_rvv_avgpool2d_cap, shl_rvv_avgpool2d_perf); #endif #ifndef CONFIG_THEAD_RVV_AVERAGEPOOL_INT8_DISABLED shl_rvv_reg_op(CSINN_DTYPE_INT8, CSINN_OP_AVGPOOL2D, shl_rvv_avgpool2d_init_int8, NULL, - shl_gref_avgpool2d, shl_rvv_avgpool2d_cap); + shl_gref_avgpool2d, shl_rvv_avgpool2d_cap, shl_rvv_avgpool2d_perf); #endif #ifndef 
CONFIG_THEAD_RVV_FULLYCONNECTED_FP32_DISABLED shl_rvv_reg_op(CSINN_DTYPE_FLOAT32, CSINN_OP_FULLYCONNECTED, shl_rvv_fullyconnected_init_fp32, - NULL, shl_gref_fullyconnected, shl_rvv_fullyconnected_cap); + NULL, shl_gref_fullyconnected, shl_rvv_fullyconnected_cap, + shl_rvv_fullyconnected_perf); #endif #ifndef CONFIG_THEAD_RVV_FULLYCONNECTED_FP16_DISABLED shl_rvv_reg_op(CSINN_DTYPE_FLOAT16, CSINN_OP_FULLYCONNECTED, shl_rvv_fullyconnected_init_fp16, - NULL, shl_gref_fullyconnected, shl_rvv_fullyconnected_cap); + NULL, shl_gref_fullyconnected, shl_rvv_fullyconnected_cap, + shl_rvv_fullyconnected_perf); #endif #ifndef CONFIG_THEAD_RVV_FULLYCONNECTED_INT8_DISABLED shl_rvv_reg_op(CSINN_DTYPE_INT8, CSINN_OP_FULLYCONNECTED, shl_rvv_fullyconnected_init_int8, - NULL, shl_gref_fullyconnected, shl_rvv_fullyconnected_cap); + NULL, shl_gref_fullyconnected, shl_rvv_fullyconnected_cap, + shl_rvv_fullyconnected_perf); #endif #ifndef CONFIG_THEAD_RVV_ADD_FP32_DISABLED shl_rvv_reg_op(CSINN_DTYPE_FLOAT32, CSINN_OP_ADD, NULL, shl_rvv_add_fp32, shl_gref_add, - shl_rvv_add_cap); + shl_rvv_add_cap, shl_rvv_add_perf); #endif #ifndef CONFIG_THEAD_RVV_ADD_FP16_DISABLED shl_rvv_reg_op(CSINN_DTYPE_FLOAT16, CSINN_OP_ADD, NULL, shl_rvv_add_fp16, shl_gref_add, - shl_rvv_add_cap); + shl_rvv_add_cap, shl_rvv_add_perf); #endif #ifndef CONFIG_THEAD_RVV_ADD_INT8_DISABLED shl_rvv_reg_op(CSINN_DTYPE_INT8, CSINN_OP_ADD, NULL, shl_rvv_add_int8, shl_gref_add, - shl_rvv_add_cap); + shl_rvv_add_cap, shl_rvv_add_perf); #endif #ifndef CONFIG_THEAD_RVV_SUB_FP32_DISABLED shl_rvv_reg_op(CSINN_DTYPE_FLOAT32, CSINN_OP_SUB, NULL, shl_rvv_sub_fp32, shl_gref_sub, - shl_rvv_sub_cap); + shl_rvv_sub_cap, shl_rvv_sub_perf); #endif #ifndef CONFIG_THEAD_RVV_SUB_FP16_DISABLED shl_rvv_reg_op(CSINN_DTYPE_FLOAT16, CSINN_OP_SUB, NULL, shl_rvv_sub_fp16, shl_gref_sub, - shl_rvv_sub_cap); + shl_rvv_sub_cap, shl_rvv_sub_perf); #endif #ifndef CONFIG_THEAD_RVV_SUB_INT8_DISABLED shl_rvv_reg_op(CSINN_DTYPE_INT8, CSINN_OP_SUB, NULL, 
shl_rvv_sub_int8, shl_gref_sub, - shl_rvv_sub_cap); + shl_rvv_sub_cap, shl_rvv_sub_perf); #endif #ifndef CONFIG_THEAD_RVV_MUL_FP32_DISABLED shl_rvv_reg_op(CSINN_DTYPE_FLOAT32, CSINN_OP_MUL, NULL, shl_rvv_mul_fp32, shl_gref_mul, - shl_rvv_mul_cap); + shl_rvv_mul_cap, shl_rvv_mul_perf); #endif #ifndef CONFIG_THEAD_RVV_MUL_FP16_DISABLED shl_rvv_reg_op(CSINN_DTYPE_FLOAT16, CSINN_OP_MUL, NULL, shl_rvv_mul_fp16, shl_gref_mul, - shl_rvv_mul_cap); + shl_rvv_mul_cap, shl_rvv_mul_perf); #endif #ifndef CONFIG_THEAD_RVV_MUL_INT8_DISABLED shl_rvv_reg_op(CSINN_DTYPE_INT8, CSINN_OP_MUL, NULL, shl_rvv_mul_int8, shl_gref_mul, - shl_rvv_mul_cap); + shl_rvv_mul_cap, shl_rvv_mul_perf); #endif #ifndef CONFIG_THEAD_RVV_DIV_FP32_DISABLED shl_rvv_reg_op(CSINN_DTYPE_FLOAT32, CSINN_OP_DIV, NULL, shl_rvv_div_fp32, shl_gref_div, - shl_rvv_div_cap); + shl_rvv_div_cap, shl_rvv_div_perf); #endif #ifndef CONFIG_THEAD_RVV_DIV_FP16_DISABLED shl_rvv_reg_op(CSINN_DTYPE_FLOAT16, CSINN_OP_DIV, NULL, shl_rvv_div_fp16, shl_gref_div, - shl_rvv_div_cap); + shl_rvv_div_cap, shl_rvv_div_perf); #endif #ifndef CONFIG_THEAD_RVV_DIV_INT8_DISABLED shl_rvv_reg_op(CSINN_DTYPE_INT8, CSINN_OP_DIV, NULL, shl_rvv_div_int8, shl_gref_div, - shl_rvv_div_cap); + shl_rvv_div_cap, shl_rvv_div_perf); #endif #ifndef CONFIG_THEAD_RVV_CONCAT_FP32_DISABLED shl_rvv_reg_op(CSINN_DTYPE_FLOAT32, CSINN_OP_CONCAT, NULL, shl_rvv_concat_fp32, shl_gref_concat, - shl_rvv_concat_cap); + shl_rvv_concat_cap, shl_rvv_concat_perf); #endif #ifndef CONFIG_THEAD_RVV_CONCAT_FP16_DISABLED shl_rvv_reg_op(CSINN_DTYPE_FLOAT16, CSINN_OP_CONCAT, NULL, shl_rvv_concat_fp16, shl_gref_concat, - shl_rvv_concat_cap); + shl_rvv_concat_cap, shl_rvv_concat_perf); #endif #ifndef CONFIG_THEAD_RVV_CONCAT_INT8_DISABLED shl_rvv_reg_op(CSINN_DTYPE_INT8, CSINN_OP_CONCAT, NULL, shl_rvv_concat_int8, shl_gref_concat, - shl_rvv_concat_cap); + shl_rvv_concat_cap, shl_rvv_concat_perf); #endif #ifndef CONFIG_THEAD_RVV_LEAKY_RELU_FP32_DISABLED 
shl_rvv_reg_op(CSINN_DTYPE_FLOAT32, CSINN_OP_LEAKY_RELU, NULL, shl_rvv_leaky_relu_fp32, - shl_gref_leaky_relu, shl_rvv_leaky_relu_cap); + shl_gref_leaky_relu, shl_rvv_leaky_relu_cap, shl_rvv_leaky_relu_perf); #endif #ifndef CONFIG_THEAD_RVV_LEAKY_RELU_FP16_DISABLED shl_rvv_reg_op(CSINN_DTYPE_FLOAT16, CSINN_OP_LEAKY_RELU, NULL, shl_rvv_leaky_relu_fp16, - shl_gref_leaky_relu, shl_rvv_leaky_relu_cap); + shl_gref_leaky_relu, shl_rvv_leaky_relu_cap, shl_rvv_leaky_relu_perf); #endif #ifndef CONFIG_THEAD_RVV_LEAKY_RELU_INT8_DISABLED shl_rvv_reg_op(CSINN_DTYPE_INT8, CSINN_OP_LEAKY_RELU, NULL, shl_rvv_leaky_relu_int8, - shl_gref_leaky_relu, shl_rvv_leaky_relu_cap); + shl_gref_leaky_relu, shl_rvv_leaky_relu_cap, shl_rvv_leaky_relu_perf); #endif #ifndef CONFIG_THEAD_RVV_RELU_FP32_DISABLED shl_rvv_reg_op(CSINN_DTYPE_FLOAT32, CSINN_OP_RELU, NULL, shl_rvv_relu_fp32, shl_gref_relu, - shl_rvv_relu_cap); + shl_rvv_relu_cap, shl_rvv_relu_perf); #endif #ifndef CONFIG_THEAD_RVV_RELU_FP16_DISABLED shl_rvv_reg_op(CSINN_DTYPE_FLOAT16, CSINN_OP_RELU, NULL, shl_rvv_relu_fp16, shl_gref_relu, - shl_rvv_relu_cap); + shl_rvv_relu_cap, shl_rvv_relu_perf); #endif #ifndef CONFIG_THEAD_RVV_RELU_INT8_DISABLED shl_rvv_reg_op(CSINN_DTYPE_INT8, CSINN_OP_RELU, NULL, shl_rvv_relu_int8, shl_gref_relu, - shl_rvv_relu_cap); + shl_rvv_relu_cap, shl_rvv_relu_perf); #endif #ifndef CONFIG_THEAD_RVV_RELU6_FP32_DISABLED shl_rvv_reg_op(CSINN_DTYPE_FLOAT32, CSINN_OP_RELU6, NULL, shl_rvv_relu6_fp32, shl_gref_relu6, - shl_rvv_relu6_cap); + shl_rvv_relu6_cap, shl_rvv_relu6_perf); #endif #ifndef CONFIG_THEAD_RVV_RELU6_FP16_DISABLED shl_rvv_reg_op(CSINN_DTYPE_FLOAT16, CSINN_OP_RELU6, NULL, shl_rvv_relu6_fp16, shl_gref_relu6, - shl_rvv_relu6_cap); + shl_rvv_relu6_cap, shl_rvv_relu6_perf); #endif #ifndef CONFIG_THEAD_RVV_RELU6_INT8_DISABLED shl_rvv_reg_op(CSINN_DTYPE_INT8, CSINN_OP_RELU6, NULL, shl_rvv_relu6_int8, shl_gref_relu6, - shl_rvv_relu6_cap); + shl_rvv_relu6_cap, shl_rvv_relu6_perf); #endif #ifndef 
CONFIG_THEAD_RVV_GLOBAL_AVERAGEPOOL_FP32_DISABLED shl_rvv_reg_op(CSINN_DTYPE_FLOAT32, CSINN_OP_GLOBAL_AVGPOOL2D, shl_rvv_global_avgpool2d_init_fp32, NULL, shl_gref_global_avgpool2d, - shl_rvv_global_avgpool2d_cap); + shl_rvv_global_avgpool2d_cap, shl_rvv_global_avgpool2d_perf); #endif #ifndef CONFIG_THEAD_RVV_GLOBAL_AVERAGEPOOL_FP16_DISABLED shl_rvv_reg_op(CSINN_DTYPE_FLOAT16, CSINN_OP_GLOBAL_AVGPOOL2D, shl_rvv_global_avgpool2d_init_fp16, NULL, shl_gref_global_avgpool2d, - shl_rvv_global_avgpool2d_cap); + shl_rvv_global_avgpool2d_cap, shl_rvv_global_avgpool2d_perf); #endif #ifndef CONFIG_THEAD_RVV_GLOBAL_AVERAGEPOOL_INT8_DISABLED shl_rvv_reg_op(CSINN_DTYPE_INT8, CSINN_OP_GLOBAL_AVGPOOL2D, shl_rvv_global_avgpool2d_init_int8, - NULL, shl_gref_global_avgpool2d, shl_rvv_global_avgpool2d_cap); + NULL, shl_gref_global_avgpool2d, shl_rvv_global_avgpool2d_cap, + shl_rvv_global_avgpool2d_perf); #endif #ifndef CONFIG_THEAD_RVV_GLOBAL_MAXPOOL_FP32_DISABLED shl_rvv_reg_op(CSINN_DTYPE_FLOAT32, CSINN_OP_GLOBAL_MAXPOOL2D, shl_rvv_global_maxpool2d_init_fp32, NULL, shl_gref_global_maxpool2d, - shl_rvv_global_maxpool2d_cap); + shl_rvv_global_maxpool2d_cap, shl_rvv_global_maxpool2d_perf); #endif #ifndef CONFIG_THEAD_RVV_GLOBAL_MAXPOOL_FP16_DISABLED shl_rvv_reg_op(CSINN_DTYPE_FLOAT16, CSINN_OP_GLOBAL_MAXPOOL2D, shl_rvv_global_maxpool2d_init_fp16, NULL, shl_gref_global_maxpool2d, - shl_rvv_global_maxpool2d_cap); + shl_rvv_global_maxpool2d_cap, shl_rvv_global_maxpool2d_perf); #endif #ifndef CONFIG_THEAD_RVV_GLOBAL_MAXPOOL_INT8_DISABLED shl_rvv_reg_op(CSINN_DTYPE_INT8, CSINN_OP_GLOBAL_MAXPOOL2D, shl_rvv_global_maxpool2d_init_int8, - NULL, shl_gref_global_maxpool2d, shl_rvv_global_maxpool2d_cap); + NULL, shl_gref_global_maxpool2d, shl_rvv_global_maxpool2d_cap, + shl_rvv_global_maxpool2d_perf); #endif #ifndef CONFIG_THEAD_RVV_RESHAPE_FP32_DISABLED shl_rvv_reg_op(CSINN_DTYPE_FLOAT32, CSINN_OP_RESHAPE, NULL, shl_rvv_reshape_fp32, - shl_gref_reshape, shl_rvv_reshape_cap); + shl_gref_reshape, 
shl_rvv_reshape_cap, shl_rvv_reshape_perf); #endif #ifndef CONFIG_THEAD_RVV_RESHAPE_FP16_DISABLED shl_rvv_reg_op(CSINN_DTYPE_FLOAT16, CSINN_OP_RESHAPE, NULL, shl_rvv_reshape_fp16, - shl_gref_reshape, shl_rvv_reshape_cap); + shl_gref_reshape, shl_rvv_reshape_cap, shl_rvv_reshape_perf); #endif #ifndef CONFIG_THEAD_RVV_RESHAPE_INT8_DISABLED shl_rvv_reg_op(CSINN_DTYPE_INT8, CSINN_OP_RESHAPE, NULL, shl_rvv_reshape_int8, shl_gref_reshape, - shl_rvv_reshape_cap); + shl_rvv_reshape_cap, shl_rvv_reshape_perf); #endif #ifndef CONFIG_THEAD_RVV_SIGMOID_FP32_DISABLED shl_rvv_reg_op(CSINN_DTYPE_FLOAT32, CSINN_OP_SIGMOID, NULL, shl_rvv_sigmoid_fp32, - shl_gref_sigmoid, shl_rvv_sigmoid_cap); + shl_gref_sigmoid, shl_rvv_sigmoid_cap, shl_rvv_sigmoid_perf); #endif #ifndef CONFIG_THEAD_RVV_SIGMOID_FP16_DISABLED shl_rvv_reg_op(CSINN_DTYPE_FLOAT16, CSINN_OP_SIGMOID, NULL, shl_rvv_sigmoid_fp16, - shl_gref_sigmoid, shl_rvv_sigmoid_cap); + shl_gref_sigmoid, shl_rvv_sigmoid_cap, shl_rvv_sigmoid_perf); #endif #ifndef CONFIG_THEAD_RVV_SIGMOID_INT8_DISABLED shl_rvv_reg_op(CSINN_DTYPE_INT8, CSINN_OP_SIGMOID, NULL, shl_rvv_sigmoid_int8, shl_gref_sigmoid, - shl_rvv_sigmoid_cap); + shl_rvv_sigmoid_cap, shl_rvv_sigmoid_perf); #endif #ifndef CONFIG_THEAD_RVV_SOFTMAX_FP32_DISABLED shl_rvv_reg_op(CSINN_DTYPE_FLOAT32, CSINN_OP_SOFTMAX, NULL, shl_rvv_softmax_fp32, - shl_gref_softmax, shl_rvv_softmax_cap); + shl_gref_softmax, shl_rvv_softmax_cap, shl_rvv_softmax_perf); #endif #ifndef CONFIG_THEAD_RVV_SOFTMAX_FP16_DISABLED shl_rvv_reg_op(CSINN_DTYPE_FLOAT16, CSINN_OP_SOFTMAX, NULL, shl_rvv_softmax_fp16, - shl_gref_softmax, shl_rvv_softmax_cap); + shl_gref_softmax, shl_rvv_softmax_cap, shl_rvv_softmax_perf); #endif #ifndef CONFIG_THEAD_RVV_SOFTMAX_INT8_DISABLED shl_rvv_reg_op(CSINN_DTYPE_INT8, CSINN_OP_SOFTMAX, NULL, shl_rvv_softmax_int8, shl_gref_softmax, - shl_rvv_softmax_cap); + shl_rvv_softmax_cap, shl_rvv_softmax_perf); #endif #ifndef CONFIG_THEAD_RVV_REDUCE_SUM_INT8_DISABLED 
shl_rvv_reg_op(CSINN_DTYPE_INT8, CSINN_OP_REDUCE_SUM, NULL, shl_rvv_reduce_sum_int8, - shl_gref_reduce_sum, shl_rvv_reduce_sum_cap); + shl_gref_reduce_sum, shl_rvv_reduce_sum_cap, shl_rvv_reduce_sum_perf); #endif #ifndef CONFIG_THEAD_RVV_PRELU_FP32_DISABLED shl_rvv_reg_op(CSINN_DTYPE_FLOAT32, CSINN_OP_PRELU, NULL, shl_rvv_prelu_fp32, shl_gref_prelu, - shl_rvv_prelu_cap); + shl_rvv_prelu_cap, shl_rvv_prelu_perf); #endif #ifndef CONFIG_THEAD_RVV_PRELU_FP16_DISABLED shl_rvv_reg_op(CSINN_DTYPE_FLOAT16, CSINN_OP_PRELU, NULL, shl_rvv_prelu_fp16, shl_gref_prelu, - shl_rvv_prelu_cap); + shl_rvv_prelu_cap, shl_rvv_prelu_perf); #endif #ifndef CONFIG_THEAD_RVV_PRELU_INT8_DISABLED shl_rvv_reg_op(CSINN_DTYPE_INT8, CSINN_OP_PRELU, NULL, shl_rvv_prelu_int8, shl_gref_prelu, - shl_rvv_prelu_cap); + shl_rvv_prelu_cap, shl_rvv_prelu_perf); #endif #ifndef CONFIG_THEAD_RVV_LAYER_NORM_FP32_DISABLED shl_rvv_reg_op(CSINN_DTYPE_FLOAT32, CSINN_OP_LAYER_NORM, NULL, shl_rvv_layer_norm_fp32, - shl_gref_layer_norm, shl_rvv_layer_norm_cap); + shl_gref_layer_norm, shl_rvv_layer_norm_cap, shl_rvv_layer_norm_perf); #endif #ifndef CONFIG_THEAD_RVV_LAYER_NORM_FP16_DISABLED shl_rvv_reg_op(CSINN_DTYPE_FLOAT16, CSINN_OP_LAYER_NORM, NULL, shl_rvv_layer_norm_fp16, - shl_gref_layer_norm, shl_rvv_layer_norm_cap); + shl_gref_layer_norm, shl_rvv_layer_norm_cap, shl_rvv_layer_norm_perf); #endif #ifndef CONFIG_THEAD_RVV_LAYER_NORM_INT8_DISABLED shl_rvv_reg_op(CSINN_DTYPE_INT8, CSINN_OP_LAYER_NORM, NULL, shl_rvv_layer_norm_int8, - shl_gref_layer_norm, shl_rvv_layer_norm_cap); + shl_gref_layer_norm, shl_rvv_layer_norm_cap, shl_rvv_layer_norm_perf); #endif #ifndef CONFIG_THEAD_RVV_CLIP_FP32_DISABLED shl_rvv_reg_op(CSINN_DTYPE_FLOAT32, CSINN_OP_CLIP, NULL, shl_rvv_clip_fp32, shl_gref_clip, - shl_rvv_clip_cap); + shl_rvv_clip_cap, shl_rvv_clip_perf); #endif #ifndef CONFIG_THEAD_RVV_CLIP_FP16_DISABLED shl_rvv_reg_op(CSINN_DTYPE_FLOAT16, CSINN_OP_CLIP, NULL, shl_rvv_clip_fp16, shl_gref_clip, - shl_rvv_clip_cap); + 
shl_rvv_clip_cap, shl_rvv_clip_perf); #endif #ifndef CONFIG_THEAD_RVV_CLIP_INT8_DISABLED shl_rvv_reg_op(CSINN_DTYPE_INT8, CSINN_OP_CLIP, NULL, shl_rvv_clip_int8, shl_gref_clip, - shl_rvv_clip_cap); + shl_rvv_clip_cap, shl_rvv_clip_perf); #endif #ifndef CONFIG_THEAD_RVV_CONVOLUTION1D_FP32_DISABLED shl_rvv_reg_op(CSINN_DTYPE_FLOAT32, CSINN_OP_CONV1D, shl_rvv_conv1d_init_fp32, NULL, - shl_gref_conv1d, shl_rvv_conv1d_cap); + shl_gref_conv1d, shl_rvv_conv1d_cap, shl_rvv_conv1d_perf); #endif #ifndef CONFIG_THEAD_RVV_CONVOLUTION1D_FP16_DISABLED shl_rvv_reg_op(CSINN_DTYPE_FLOAT16, CSINN_OP_CONV1D, shl_rvv_conv1d_init_fp16, NULL, - shl_gref_conv1d, shl_rvv_conv1d_cap); + shl_gref_conv1d, shl_rvv_conv1d_cap, shl_rvv_conv1d_perf); #endif #ifndef CONFIG_THEAD_RVV_CONVOLUTION1D_INT8_DISABLED shl_rvv_reg_op(CSINN_DTYPE_INT8, CSINN_OP_CONV1D, shl_rvv_conv1d_init_int8, NULL, - shl_gref_conv1d, shl_rvv_conv1d_cap); + shl_gref_conv1d, shl_rvv_conv1d_cap, shl_rvv_conv1d_perf); #endif #ifndef CONFIG_THEAD_RVV_DEPTHWISE_CONVOLUTION1D_INT8_DISABLED shl_rvv_reg_op(CSINN_DTYPE_INT8, CSINN_OP_DEPTHWISE_CONV1D, shl_rvv_conv1d_init_int8, NULL, - shl_gref_depthwise_conv1d, shl_rvv_conv1d_cap); + shl_gref_depthwise_conv1d, shl_rvv_conv1d_cap, shl_rvv_conv1d_perf); #endif #ifndef CONFIG_THEAD_RVV_CONVOLUTION_INT8_DISABLED shl_rvv_reg_op(CSINN_DTYPE_INT8, CSINN_OP_CONV2D, shl_rvv_conv2d_init_int8, NULL, - shl_gref_conv2d, shl_rvv_conv2d_cap); + shl_gref_conv2d, shl_rvv_conv2d_cap, shl_rvv_conv2d_perf); shl_rvv_reg_op(CSINN_DTYPE_INT8, CSINN_OP_GROUP_CONV2D, shl_rvv_conv2d_init_int8, NULL, - shl_gref_group_conv2d, shl_rvv_conv2d_cap); + shl_gref_group_conv2d, shl_rvv_conv2d_cap, shl_rvv_conv2d_perf); shl_rvv_reg_op(CSINN_DTYPE_INT8, CSINN_OP_CONV2D_RELU, shl_rvv_conv2d_init_int8, NULL, - shl_gref_conv2d_relu, shl_rvv_conv2d_cap); + shl_gref_conv2d_relu, shl_rvv_conv2d_cap, shl_rvv_conv2d_perf); shl_rvv_reg_op(CSINN_DTYPE_INT8, CSINN_OP_GROUP_CONV2D_RELU, shl_rvv_conv2d_init_int8, NULL, - 
shl_gref_group_conv2d_relu, shl_rvv_conv2d_cap); + shl_gref_group_conv2d_relu, shl_rvv_conv2d_cap, shl_rvv_conv2d_perf); shl_rvv_reg_op(CSINN_DTYPE_INT8, CSINN_OP_CONV2D_RELU6, shl_rvv_conv2d_init_int8, NULL, - shl_gref_conv2d_relu6, shl_rvv_conv2d_cap); + shl_gref_conv2d_relu6, shl_rvv_conv2d_cap, shl_rvv_conv2d_perf); #endif #ifndef CONFIG_THEAD_RVV_TRANSPOSE_FP32_DISABLED shl_rvv_reg_op(CSINN_DTYPE_FLOAT32, CSINN_OP_TRANSPOSE, NULL, shl_rvv_transpose_fp32, - shl_gref_transpose, shl_rvv_transpose_cap); + shl_gref_transpose, shl_rvv_transpose_cap, shl_rvv_transpose_perf); #endif #ifndef CONFIG_THEAD_RVV_TRANSPOSE_FP16_DISABLED shl_rvv_reg_op(CSINN_DTYPE_FLOAT16, CSINN_OP_TRANSPOSE, NULL, shl_rvv_transpose_fp16, - shl_gref_transpose, shl_rvv_transpose_cap); + shl_gref_transpose, shl_rvv_transpose_cap, shl_rvv_transpose_perf); #endif #ifndef CONFIG_THEAD_RVV_TRANSPOSE_INT8_DISABLED shl_rvv_reg_op(CSINN_DTYPE_INT8, CSINN_OP_TRANSPOSE, NULL, shl_rvv_transpose_int8, - shl_gref_transpose, shl_rvv_transpose_cap); + shl_gref_transpose, shl_rvv_transpose_cap, shl_rvv_transpose_perf); #endif #ifndef CONFIG_THEAD_RVV_MATMUL_FP32_DISABLED shl_rvv_reg_op(CSINN_DTYPE_FLOAT32, CSINN_OP_MATMUL, shl_rvv_matmul_init_fp32, NULL, - shl_gref_matmul, shl_rvv_matmul_cap); + shl_gref_matmul, shl_rvv_matmul_cap, shl_rvv_matmul_perf); #endif #ifndef CONFIG_THEAD_RVV_MATMUL_FP16_DISABLED shl_rvv_reg_op(CSINN_DTYPE_FLOAT16, CSINN_OP_MATMUL, shl_rvv_matmul_init_fp16, NULL, - shl_gref_matmul, shl_rvv_matmul_cap); + shl_gref_matmul, shl_rvv_matmul_cap, shl_rvv_matmul_perf); #endif #ifndef CONFIG_THEAD_RVV_MATMUL_INT8_DISABLED shl_rvv_reg_op(CSINN_DTYPE_INT8, CSINN_OP_MATMUL, shl_rvv_matmul_init_int8, NULL, - shl_gref_matmul, shl_rvv_matmul_cap); + shl_gref_matmul, shl_rvv_matmul_cap, shl_rvv_matmul_perf); #endif #ifndef CONFIG_THEAD_RVV_GATHER_FP32_DISABLED shl_rvv_reg_op(CSINN_DTYPE_FLOAT32, CSINN_OP_GATHER, NULL, shl_rvv_gather_fp32, shl_gref_gather, - shl_rvv_gather_cap); + 
shl_rvv_gather_cap, shl_rvv_gather_perf); #endif #ifndef CONFIG_THEAD_RVV_GATHER_FP16_DISABLED shl_rvv_reg_op(CSINN_DTYPE_FLOAT16, CSINN_OP_GATHER, NULL, shl_rvv_gather_fp16, shl_gref_gather, - shl_rvv_gather_cap); + shl_rvv_gather_cap, shl_rvv_gather_perf); #endif #ifndef CONFIG_THEAD_RVV_GATHER_INT8_DISABLED shl_rvv_reg_op(CSINN_DTYPE_INT8, CSINN_OP_GATHER, NULL, shl_rvv_gather_int8, shl_gref_gather, - shl_rvv_gather_cap); + shl_rvv_gather_cap, shl_rvv_gather_perf); #endif #ifndef CONFIG_THEAD_RVV_STRIDED_SLICE_FP16_DISABLED shl_rvv_reg_op(CSINN_DTYPE_FLOAT16, CSINN_OP_STRIDED_SLICE, NULL, shl_rvv_strided_slice_fp16, - shl_gref_strided_slice, shl_rvv_strided_slice_cap); + shl_gref_strided_slice, shl_rvv_strided_slice_cap, shl_rvv_strided_slice_perf); #endif #ifndef CONFIG_THEAD_RVV_ERF_FP32_DISABLED shl_rvv_reg_op(CSINN_DTYPE_FLOAT32, CSINN_OP_ERF, NULL, shl_rvv_erf_fp32, shl_gref_erf, - shl_rvv_erf_cap); + shl_rvv_erf_cap, shl_rvv_erf_perf); #endif #ifndef CONFIG_THEAD_RVV_ERF_FP16_DISABLED shl_rvv_reg_op(CSINN_DTYPE_FLOAT16, CSINN_OP_ERF, NULL, shl_rvv_erf_fp16, shl_gref_erf, - shl_rvv_erf_cap); + shl_rvv_erf_cap, shl_rvv_erf_perf); #endif #ifndef CONFIG_THEAD_RVV_ERF_INT8_DISABLED shl_rvv_reg_op(CSINN_DTYPE_INT8, CSINN_OP_ERF, NULL, shl_rvv_erf_int8, shl_gref_erf, - shl_rvv_erf_cap); + shl_rvv_erf_cap, shl_rvv_erf_perf); #endif #ifndef CONFIG_THEAD_RVV_SPLIT_FP32_DISABLED shl_rvv_reg_op(CSINN_DTYPE_FLOAT32, CSINN_OP_SPLIT, NULL, shl_rvv_split_fp32, shl_gref_split, - shl_rvv_split_cap); + shl_rvv_split_cap, shl_rvv_split_perf); #endif #ifndef CONFIG_THEAD_RVV_SPLIT_FP16_DISABLED shl_rvv_reg_op(CSINN_DTYPE_FLOAT16, CSINN_OP_SPLIT, NULL, shl_rvv_split_fp16, shl_gref_split, - shl_rvv_split_cap); + shl_rvv_split_cap, shl_rvv_split_perf); #endif #ifndef CONFIG_THEAD_RVV_SPLIT_INT8_DISABLED shl_rvv_reg_op(CSINN_DTYPE_INT8, CSINN_OP_SPLIT, NULL, shl_rvv_split_int8, shl_gref_split, - shl_rvv_split_cap); + shl_rvv_split_cap, shl_rvv_split_perf); #endif #ifndef 
CONFIG_THEAD_RVV_SILU_FP32_DISABLED shl_rvv_reg_op(CSINN_DTYPE_FLOAT32, CSINN_OP_SILU, NULL, shl_rvv_silu_fp32, shl_gref_silu, - shl_rvv_silu_cap); + shl_rvv_silu_cap, shl_rvv_silu_perf); #endif #ifndef CONFIG_THEAD_RVV_SILU_FP16_DISABLED shl_rvv_reg_op(CSINN_DTYPE_FLOAT16, CSINN_OP_SILU, NULL, shl_rvv_silu_fp16, shl_gref_silu, - shl_rvv_silu_cap); + shl_rvv_silu_cap, shl_rvv_silu_perf); #endif #ifndef CONFIG_THEAD_RVV_SILU_INT8_DISABLED shl_rvv_reg_op(CSINN_DTYPE_INT8, CSINN_OP_SILU, NULL, shl_rvv_silu_int8, shl_gref_silu, - shl_rvv_silu_cap); + shl_rvv_silu_cap, shl_rvv_silu_perf); #endif #ifndef CONFIG_THEAD_RVV_RMS_NORM_FP32_DISABLED shl_rvv_reg_op(CSINN_DTYPE_FLOAT32, CSINN_OP_RMS_NORM, NULL, shl_rvv_rms_norm_fp32, - shl_gref_rms_norm, shl_rvv_rms_norm_cap); + shl_gref_rms_norm, shl_rvv_rms_norm_cap, shl_rvv_rms_norm_perf); #endif #ifndef CONFIG_THEAD_RVV_RMS_NORM_FP16_DISABLED shl_rvv_reg_op(CSINN_DTYPE_FLOAT16, CSINN_OP_RMS_NORM, NULL, shl_rvv_rms_norm_fp16, - shl_gref_rms_norm, shl_rvv_rms_norm_cap); + shl_gref_rms_norm, shl_rvv_rms_norm_cap, shl_rvv_rms_norm_perf); #endif #ifndef CONFIG_THEAD_RVV_RMS_NORM_INT8_DISABLED shl_rvv_reg_op(CSINN_DTYPE_INT8, CSINN_OP_RMS_NORM, NULL, shl_rvv_rms_norm_int8, - shl_gref_rms_norm, shl_rvv_rms_norm_cap); + shl_gref_rms_norm, shl_rvv_rms_norm_cap, shl_rvv_rms_norm_perf); +#endif +#ifndef CONFIG_THEAD_RVV_EMBEDDING_INT32_DISABLED + shl_rvv_reg_op(CSINN_DTYPE_INT32, CSINN_OP_EMBEDDING, NULL, shl_rvv_embedding_int32, + shl_gref_embedding, shl_rvv_embedding_cap, shl_rvv_embedding_perf); +#endif +#ifndef CONFIG_THEAD_RVV_EXPAND_DIMS_FP32_DISABLED + shl_rvv_reg_op(CSINN_DTYPE_FLOAT32, CSINN_OP_EXPAND_DIMS, NULL, shl_rvv_expand_dims_fp32, + shl_gref_expand_dims, shl_rvv_expand_dims_cap, shl_rvv_expand_dims_perf); +#endif +#ifndef CONFIG_THEAD_RVV_EXPAND_DIMS_FP16_DISABLED + shl_rvv_reg_op(CSINN_DTYPE_FLOAT16, CSINN_OP_EXPAND_DIMS, NULL, shl_rvv_expand_dims_fp16, + shl_gref_expand_dims, shl_rvv_expand_dims_cap, 
shl_rvv_expand_dims_perf); +#endif +#ifndef CONFIG_THEAD_RVV_ROPE_FP32_DISABLED + shl_rvv_reg_op(CSINN_DTYPE_FLOAT32, CSINN_OP_ROPE, NULL, shl_rvv_rope_fp32, shl_gref_rope, + shl_rvv_rope_cap, shl_rvv_rope_perf); +#endif +#ifndef CONFIG_THEAD_RVV_ROPE_FP16_DISABLED + shl_rvv_reg_op(CSINN_DTYPE_FLOAT16, CSINN_OP_ROPE, NULL, shl_rvv_rope_fp16, shl_gref_rope, + shl_rvv_rope_cap, shl_rvv_rope_perf); +#endif +#ifndef CONFIG_THEAD_RVV_SCALED_DOT_PRODUCT_ATTENTION_FP32_DISABLED + shl_rvv_reg_op(CSINN_DTYPE_FLOAT32, CSINN_OP_SCALED_DOT_PRODUCT_ATTENTION, NULL, + shl_rvv_scaled_dot_product_attention_fp32, shl_gref_scaled_dot_product_attention, + shl_rvv_scaled_dot_product_attention_cap, + shl_rvv_scaled_dot_product_attention_perf); +#endif +#ifndef CONFIG_THEAD_RVV_SCALED_DOT_PRODUCT_ATTENTION_FP16_DISABLED + shl_rvv_reg_op(CSINN_DTYPE_FLOAT16, CSINN_OP_SCALED_DOT_PRODUCT_ATTENTION, NULL, + shl_rvv_scaled_dot_product_attention_fp16, shl_gref_scaled_dot_product_attention, + shl_rvv_scaled_dot_product_attention_cap, + shl_rvv_scaled_dot_product_attention_perf); +#endif +#ifndef CONFIG_THEAD_RVV_LLM_POS_FP16_DISABLED + shl_rvv_reg_op(CSINN_DTYPE_FLOAT16, CSINN_OP_LLM_POS, NULL, shl_rvv_llm_pos_fp16, + shl_gref_llm_pos, shl_rvv_llm_pos_cap, shl_rvv_llm_pos_perf); #endif #ifdef SHL_USE_DOT_INT4 #ifndef CONFIG_THEAD_RVV_CONVOLUTION_INT4_DISABLED shl_rvv_reg_op(CSINN_DTYPE_INT4, CSINN_OP_CONV2D, shl_rvv_conv2d_init_int4, NULL, - shl_gref_conv2d, shl_rvv_conv2d_cap); + shl_gref_conv2d, shl_rvv_conv2d_cap, shl_rvv_conv2d_perf); shl_rvv_reg_op(CSINN_DTYPE_INT4, CSINN_OP_GROUP_CONV2D, shl_rvv_conv2d_init_int4, NULL, - shl_gref_group_conv2d, shl_rvv_conv2d_cap); + shl_gref_group_conv2d, shl_rvv_conv2d_cap, shl_rvv_conv2d_perf); shl_rvv_reg_op(CSINN_DTYPE_INT4, CSINN_OP_CONV2D_RELU, shl_rvv_conv2d_init_int4, NULL, - shl_gref_conv2d_relu, shl_rvv_conv2d_cap); + shl_gref_conv2d_relu, shl_rvv_conv2d_cap, shl_rvv_conv2d_perf); shl_rvv_reg_op(CSINN_DTYPE_INT4, CSINN_OP_GROUP_CONV2D_RELU, 
shl_rvv_conv2d_init_int4, NULL, - shl_gref_group_conv2d_relu, shl_rvv_conv2d_cap); + shl_gref_group_conv2d_relu, shl_rvv_conv2d_cap, shl_rvv_conv2d_perf); #endif #ifndef CONFIG_THEAD_RVV_DEPTHWISE_CONVOLUTION_INT4_DISABLED shl_rvv_reg_op(CSINN_DTYPE_INT4, CSINN_OP_DEPTHWISE_CONV2D, shl_rvv_depthwise_conv2d_init_int4, - NULL, shl_gref_depthwise_conv2d, shl_rvv_conv2d_cap); + NULL, shl_gref_depthwise_conv2d, shl_rvv_conv2d_cap, shl_rvv_conv2d_perf); shl_rvv_reg_op(CSINN_DTYPE_INT4, CSINN_OP_DEPTHWISE_CONV2D_RELU, shl_rvv_depthwise_conv2d_init_int4, NULL, shl_gref_depthwise_conv2d_relu, - shl_rvv_conv2d_cap); + shl_rvv_conv2d_cap, shl_rvv_conv2d_perf); #endif // shl_rvv_reg_op(CSINN_DTYPE_INT4, CSINN_OP_FULLYCONNECTED, shl_rvv_fullyconnected_init, NULL, // shl_gref_fullyconnected); diff --git a/source/utils/debug.c b/source/utils/debug.c index ff358481..b0b9cdc3 100644 --- a/source/utils/debug.c +++ b/source/utils/debug.c @@ -268,8 +268,8 @@ int shl_layer_norm_debug_info(struct csinn_tensor *input, struct csinn_tensor *o return CSINN_TRUE; } -int shl_rms_norm_debug_info(struct csinn_tensor *input, struct csinn_tensor *output, - struct csinn_tensor *weights, struct csinn_rms_norm_params *params, +int shl_rms_norm_debug_info(struct csinn_tensor *input, struct csinn_tensor *weights, + struct csinn_tensor *output, struct csinn_rms_norm_params *params, const char *name) { shl_debug_print_siso_base(input, output, &(params->base), name); @@ -1025,6 +1025,12 @@ char *op_strings[] = { [CSINN_OP_ERF] = "erf", [CSINN_OP_CAST] = "cast", [CSINN_OP_DECONV2D] = "deconv2d", + [CSINN_OP_RMS_NORM] = "rms_norm", + [CSINN_OP_ROPE] = "rope", + [CSINN_OP_SILU] = "silu", + [CSINN_OP_LLM_POS] = "llm_pos", + [CSINN_OP_EMBEDDING] = "embedding", + [CSINN_OP_SCALED_DOT_PRODUCT_ATTENTION] = "scaled_dot_product_attention", }; // #define FREQ 50 // FPGA: 50MHz @@ -1166,7 +1172,7 @@ static char *shl_debug_filter_invalid_char(char *src) return dst; } -int __attribute__((weak)) 
shl_dump_output_tensor(struct shl_node *node) +int __attribute__((weak)) shl_dump_output_tensor(struct shl_node *node, char **output_filenames) { #ifndef SHL_BUILD_RTOS const char TENSOR_DUMP_DIR[] = "shl_dump"; @@ -1197,22 +1203,25 @@ int __attribute__((weak)) shl_dump_output_tensor(struct shl_node *node) snprintf(filename, 1024, "%s/%s_%s.txt", TENSOR_DUMP_DIR, output_name, shape); struct csinn_tensor *foutput = shl_ref_tensor_transform_f32(output); shl_debug_dump_data(foutput, filename); + strcpy(output_filenames[i], filename); shl_ref_tensor_transform_free_f32(foutput); shl_mem_free(output_name); } if (node->type == CSINN_OP_CONV2D || node->type == CSINN_OP_DEPTHWISE_CONV2D || node->type == CSINN_OP_FULLYCONNECTED && is_cpu_node) { // dump output - struct csinn_tensor *kernel_node = node->in[1]->data; - char shape[128] = {0}; - shl_debug_shape2string(kernel_node->dim, kernel_node->dim_count, shape, 128); - char *kernel_name = shl_debug_filter_invalid_char(kernel_node->name); char filename[1024] = {0}; - snprintf(filename, 1024, "%s/%s_%s.txt", TENSOR_DUMP_DIR, kernel_name, shape); - struct csinn_tensor *foutput = shl_ref_tensor_transform_f32(kernel_node); - shl_debug_dump_data(foutput, filename); - shl_ref_tensor_transform_free_f32(foutput); - shl_mem_free(kernel_name); + char shape[128] = {0}; + struct csinn_tensor *kernel_node = node->in[1]->data; + if (kernel_node->data) { + shl_debug_shape2string(kernel_node->dim, kernel_node->dim_count, shape, 128); + char *kernel_name = shl_debug_filter_invalid_char(kernel_node->name); + snprintf(filename, 1024, "%s/%s_%s.txt", TENSOR_DUMP_DIR, kernel_name, shape); + struct csinn_tensor *foutput = shl_ref_tensor_transform_f32(kernel_node); + shl_debug_dump_data(foutput, filename); + shl_ref_tensor_transform_free_f32(foutput); + shl_mem_free(kernel_name); + } // dump input struct csinn_tensor *input_node = node->in[0]->data; diff --git a/source/utils/multithread.c b/source/utils/multithread.c new file mode 100644 index 
00000000..5184e801 --- /dev/null +++ b/source/utils/multithread.c @@ -0,0 +1,43 @@ +/* + * Copyright (C) 2016-2023 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "shl_debug.h" +#include "shl_multithread.h" + +int shl_thread_num = 1; + +void shl_multithread_set_threads(int threads) +{ +#ifdef _OPENMP + shl_thread_num = threads; + omp_set_num_threads(threads); +#else + shl_debug_warning("OPENMP is not defined!\n"); +#endif +} + +int shl_multithread_is_enable() +{ +#ifdef _OPENMP + omp_set_num_threads(shl_thread_num); + if (omp_get_max_threads() > 1) { + return CSINN_TRUE; + } +#endif + return CSINN_FALSE; +} diff --git a/source/utils/shl_profiler.c b/source/utils/shl_profiler.c new file mode 100644 index 00000000..a64a1afb --- /dev/null +++ b/source/utils/shl_profiler.c @@ -0,0 +1,490 @@ +/* + * Copyright (C) 2016-2023 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "shl_profiler.h" + +#include +#include +#include + +#include "csinn_runtime.h" +#include "shl_debug.h" +#include "shl_memory.h" + +#ifdef SHL_TRACE + +const char *SHL_TRACE_EVENT_CATEGORY_NAMES[SHL_TRACE_EVENT_CATEGORY_MAX] = { + "runtime", "cpu_operator", "memory", "cpu_kernel", "npu_kernel", "kernel", +}; + +const char *SHL_TRACE_EVENT_TYPE_NAMES[SHL_TRACE_EVENT_TYPE_MAX] = {"B", "E", "X", "i", "C", "b", + "n", "e", "s", "t", "f", "M"}; + +uint32_t shl_trace_get_current_pid() { return (uint32_t)getpid(); } + +uint32_t shl_trace_get_current_tid() +{ + uint32_t tid = syscall(__NR_gettid); + return tid; +} + +uint64_t shl_trace_get_timestamps_us() +{ + uint64_t ts = shl_get_timespec(); // ns + ts /= 1000; + return ts; +} + +void *shl_trace_alloc(int64_t size) { return calloc(1, size); } + +void shl_trace_free(void *ptr) +{ + if (ptr) { + free(ptr); + } +} + +struct shl_trace_value *shl_trace_create_string(const char *value) +{ + char *cpy_value = (char *)shl_trace_alloc(strlen(value) + 1); + memcpy(cpy_value, value, strlen(value) + 1); + + struct shl_trace_value *res = + (struct shl_trace_value *)shl_trace_alloc(sizeof(struct shl_trace_value)); + res->type = SHL_TRACE_VALUE_TYPE_STRING; + res->content.str = cpy_value; + + return res; +} + +struct shl_trace_value *shl_trace_create_int64(int64_t value) +{ + struct shl_trace_value *res = + (struct shl_trace_value *)shl_trace_alloc(sizeof(struct shl_trace_value)); + res->content.i64 = value; + res->type = SHL_TRACE_VALUE_TYPE_INT64; + return res; +} + +struct shl_trace_value *shl_trace_create_uint64(uint64_t value) +{ + struct shl_trace_value *res = + (struct shl_trace_value *)shl_trace_alloc(sizeof(struct shl_trace_value)); + res->content.u64 = value; + res->type = SHL_TRACE_VALUE_TYPE_UINT64; + return res; +} + +struct shl_trace_value *shl_trace_create_double(double value) +{ + struct 
shl_trace_value *res = + (struct shl_trace_value *)shl_trace_alloc(sizeof(struct shl_trace_value)); + res->content.f64 = value; + res->type = SHL_TRACE_VALUE_TYPE_DOUBLE; + return res; +} + +struct shl_trace_value *shl_trace_create_list(int num, ...) +{ + struct shl_trace_value *res = + (struct shl_trace_value *)shl_trace_alloc(sizeof(struct shl_trace_value)); + + struct shl_trace_value_list *list = + (struct shl_trace_value_list *)shl_trace_alloc(sizeof(struct shl_trace_value_list)); + list->size = num; + list->value = + (struct shl_trace_value **)shl_trace_alloc(sizeof(struct shl_trace_value *) * num); + + res->type = SHL_TRACE_VALUE_TYPE_LIST; + res->content.list = list; + + va_list args; + va_start(args, num); + for (int i = 0; i < num; i++) { + struct shl_trace_value *value = va_arg(args, struct shl_trace_value *); + list->value[i] = value; + } + + va_end(args); + return res; +} + +struct shl_trace_value *shl_trace_create_list_int(int num, int *arr) +{ + struct shl_trace_value *res = + (struct shl_trace_value *)shl_trace_alloc(sizeof(struct shl_trace_value)); + + struct shl_trace_value_list *list = + (struct shl_trace_value_list *)shl_trace_alloc(sizeof(struct shl_trace_value_list)); + list->size = num; + list->value = + (struct shl_trace_value **)shl_trace_alloc(sizeof(struct shl_trace_value *) * num); + + res->type = SHL_TRACE_VALUE_TYPE_LIST; + res->content.list = list; + + for (int i = 0; i < num; i++) { + list->value[i] = shl_trace_create_int64(arr[i]); + } + + return res; +} + +struct shl_trace_event_format *shl_trace_create_common_event() +{ + struct shl_trace_event_format *event = + (struct shl_trace_event_format *)shl_trace_alloc(sizeof(struct shl_trace_event_format)); + event->ts = shl_trace_get_timestamps_us(); + event->pid = shl_trace_get_current_pid(); + event->tid = shl_trace_get_current_tid(); + return event; +} + +struct shl_trace_dict_item *shl_trace_create_dict_item(const char *key, + struct shl_trace_value *value) +{ + struct 
shl_trace_dict_item *item = + (struct shl_trace_dict_item *)shl_trace_alloc(sizeof(struct shl_trace_dict_item)); + memcpy(item->key, key, strlen(key) + 1); + item->value = value; + return item; +} + +struct shl_trace_dict *shl_trace_create_dict_by_item(int argc, ...) +{ + if (argc <= 0) return NULL; + struct shl_trace_dict *data = + (struct shl_trace_dict *)shl_trace_alloc(sizeof(struct shl_trace_dict)); + + if (argc < SHL_TRACE_EVENT_ARGS_CAPACITY_STEP) { + data->items_capacity = SHL_TRACE_EVENT_ARGS_CAPACITY_STEP; + } else { + data->items_capacity = argc; + } + data->items_size = 0; + data->items = (struct shl_trace_dict_item **)shl_trace_alloc( + sizeof(struct shl_trace_dict_item *) * data->items_capacity); + + va_list args; + va_start(args, argc); + for (int i = 0; i < argc; i++) { + data->items[i] = va_arg(args, struct shl_trace_dict_item *); + data->items_size++; + } + va_end(args); + + return data; +} + +struct shl_trace_dict *shl_trace_create_dict(int argc, ...) +{ + if (argc <= 0) return NULL; + struct shl_trace_dict *data = + (struct shl_trace_dict *)shl_trace_alloc(sizeof(struct shl_trace_dict)); + + if (argc < SHL_TRACE_EVENT_ARGS_CAPACITY_STEP) { + data->items_capacity = SHL_TRACE_EVENT_ARGS_CAPACITY_STEP; + } else { + data->items_capacity = argc; + } + data->items_size = 0; + data->items = (struct shl_trace_dict_item **)shl_trace_alloc( + sizeof(struct shl_trace_dict_item *) * data->items_capacity); + + va_list args; + va_start(args, argc); + for (int i = 0; i < argc; i++) { + struct shl_trace_dict_item *item = + (struct shl_trace_dict_item *)shl_trace_alloc(sizeof(struct shl_trace_dict_item)); + + char *key = va_arg(args, char *); + memcpy(item->key, key, strlen(key) + 1); + item->value = va_arg(args, struct shl_trace_value *); + + data->items[i] = item; + data->items_size++; + } + va_end(args); + + return data; +} + +void shl_trace_release_value(struct shl_trace_value *value) +{ + if (value->type == SHL_TRACE_VALUE_TYPE_STRING) { + 
shl_trace_free(value->content.str); + shl_trace_free(value); + } else if (value->type == SHL_TRACE_VALUE_TYPE_LIST) { + // there may be a list nested within a list + for (int i = 0; i < value->content.list->size; i++) { + shl_trace_release_value(value->content.list->value[i]); + } + shl_trace_free(value->content.list->value); + shl_trace_free(value->content.list); + shl_trace_free(value); + } else { + shl_trace_free(value); + } +} + +void shl_trace_release_dict(struct shl_trace_dict *args) +{ + for (int i = 0; i < args->items_size; i++) { + struct shl_trace_dict_item *item = args->items[i]; + shl_trace_release_value(item->value); + shl_trace_free(item); + } + shl_trace_free(args->items); + shl_trace_free(args); +} + +void shl_trace_insert_event(struct shl_trace *trace, struct shl_trace_event_format *event) +{ + if (trace->events_size + 1 > trace->events_capacity) { + trace->events = (struct shl_trace_event_format **)shl_mem_realloc( + trace->events, + sizeof(struct shl_trace_event_format *) * + (trace->events_capacity + SHL_TRACE_EVENT_CAPACITY_STEP), + sizeof(struct shl_trace_event_format *) * trace->events_capacity); + trace->events_capacity += SHL_TRACE_EVENT_CAPACITY_STEP; + } + trace->events[trace->events_size] = event; + trace->events_size++; +} + +void shl_trace_init(struct shl_trace *trace) +{ + // initialize data field + trace->events_capacity = SHL_TRACE_EVENT_CAPACITY_STEP; + trace->events_size = 0; + trace->events = (struct shl_trace_event_format **)shl_trace_alloc( + sizeof(struct shl_trace_event_format *) * trace->events_capacity); + trace->other_data = + (struct shl_trace_other_data *)shl_trace_alloc(sizeof(struct shl_trace_other_data)); + strcpy(trace->other_data->version, SHL_TRACE_VERSION); + trace->is_init = true; + + uint64_t ts = shl_trace_get_timestamps_us(); + snprintf(trace->filename, sizeof(trace->filename), "model_csinn.trace.%lu.json", ts); +} + +void shl_trace_deinit(struct shl_trace *trace) +{ + if (!trace->is_init) return; + // release 
events + for (int i = 0; i < trace->events_size; i++) { + struct shl_trace_event_format *event = trace->events[i]; + if (event->args && event->args->items_size > 0) { + shl_trace_release_dict(event->args); + } + shl_trace_free(event); + } + shl_trace_free(trace->events); + trace->events = NULL; + trace->events_capacity = 0; + trace->events_size = 0; + + // release other_data + if (trace->other_data->data && trace->other_data->data->items_size > 0) { + shl_trace_release_dict(trace->other_data->data); + } + shl_trace_free(trace->other_data); + trace->other_data = NULL; + + trace->is_init = false; +} + +static void indent(FILE *file, int num) +{ + for (int i = 0; i < num; i++) { + fprintf(file, " "); + } +} + +#define WRITE_ONELINE(file, indent_num, ...) \ + indent(file, indent_num); \ + fprintf(file, __VA_ARGS__); + +static void write_trace_value_to_file(FILE *file, struct shl_trace_value value) +{ + switch (value.type) { + case SHL_TRACE_VALUE_TYPE_INT64: + fprintf(file, "%ld", value.content.i64); + break; + case SHL_TRACE_VALUE_TYPE_UINT64: + fprintf(file, "%lu", value.content.u64); + break; + case SHL_TRACE_VALUE_TYPE_DOUBLE: + fprintf(file, "%f", value.content.f64); + break; + case SHL_TRACE_VALUE_TYPE_STRING: + fprintf(file, "\"%s\"", value.content.str); + break; + case SHL_TRACE_VALUE_TYPE_LIST: + if (value.content.list->size >= 0) { + fprintf(file, "["); + for (int i = 0; i < value.content.list->size; i++) { + write_trace_value_to_file(file, *value.content.list->value[i]); + if (i != value.content.list->size - 1) { + fprintf(file, ", "); + } + } + fprintf(file, "]"); + } + break; + default: + break; + } +} + +static void write_trace_dict_to_file(FILE *file, struct shl_trace_dict *dict, int space) +{ + for (int i = 0; i < dict->items_size; i++) { + struct shl_trace_dict_item *item = dict->items[i]; + WRITE_ONELINE(file, space, "\"%s\": ", item->key); + write_trace_value_to_file(file, *item->value); + if (i == dict->items_size - 1) { + fprintf(file, "\n"); + } 
else { + fprintf(file, ",\n"); + } + } +} + +void shl_trace_to_json(struct shl_trace *trace) +{ + if (!trace->events || trace->events_size == 0 || trace->events_capacity == 0) return; + + int space_step = 2; + FILE *file = fopen(trace->filename, "w"); + if (!file) { + shl_debug_error("Failed to open file: %s\n", trace->filename); + return; + } + fprintf(file, "{\n"); + + // other data + WRITE_ONELINE(file, space_step, "\"otherData\": {\n"); + WRITE_ONELINE(file, space_step * 2, "\"version\": \"%s\"", trace->other_data->version); + struct shl_trace_dict *extra_data = trace->other_data->data; + if (extra_data && extra_data->items_size > 0) { + fprintf(file, ",\n"); + write_trace_dict_to_file(file, extra_data, space_step * 2); + } else { + fprintf(file, "\n"); + } + WRITE_ONELINE(file, space_step, "},\n"); // otherData end + + // events + WRITE_ONELINE(file, space_step, "\"traceEvents\": [\n"); + for (int i = 0; i < trace->events_size; i++) { + WRITE_ONELINE(file, space_step * 2, "{\n"); + + struct shl_trace_event_format *event = trace->events[i]; + WRITE_ONELINE(file, space_step * 3, "\"name\": \"%s\",\n", event->name); + WRITE_ONELINE(file, space_step * 3, "\"cat\": \"%s\",\n", + SHL_TRACE_EVENT_CATEGORY_NAMES[event->cat]); + WRITE_ONELINE(file, space_step * 3, "\"ph\": \"%s\",\n", + SHL_TRACE_EVENT_TYPE_NAMES[event->ph]); + WRITE_ONELINE(file, space_step * 3, "\"ts\": %lu,\n", event->ts); + WRITE_ONELINE(file, space_step * 3, "\"pid\": %u,\n", event->pid); + WRITE_ONELINE(file, space_step * 3, "\"tid\": %u", event->tid); + + if (event->args && event->args->items_size > 0) { + fprintf(file, ",\n"); + WRITE_ONELINE(file, space_step * 3, "\"args\": {\n"); + write_trace_dict_to_file(file, event->args, space_step * 4); + WRITE_ONELINE(file, space_step * 3, "}\n"); + } else { + fprintf(file, "\n"); + } + + if (i == trace->events_size - 1) { + WRITE_ONELINE(file, space_step * 2, "}\n"); + } else { + WRITE_ONELINE(file, space_step * 2, "},\n"); + } + } + 
WRITE_ONELINE(file, space_step, "]\n"); // traceEvents end + + fprintf(file, "}\n"); // json end + fclose(file); + shl_debug_info("Trace data saved to %s\n", trace->filename); +} + +void shl_trace_move_events(struct shl_trace *from_trace, struct shl_trace *to_trace) +{ + if (!from_trace || !from_trace->events_size) return; + if (!to_trace || !to_trace->events_size) return; + for (int i = 0; i < from_trace->events_size; i++) { + shl_trace_insert_event(to_trace, from_trace->events[i]); + } + from_trace->events_size = 0; + from_trace->events_capacity = 0; +} + +void shl_trace_begin(struct shl_trace *trace, const char *filename) +{ + if (!trace || !trace->enable_trace) return; + shl_trace_init(trace); + if (filename != NULL) { + memcpy(trace->filename, filename, strlen(filename) + 1); + } +} + +void shl_trace_end(struct shl_trace *trace) +{ + if (!trace) return; + shl_trace_to_json(trace); + shl_trace_deinit(trace); +} + +void shl_trace_other_data(struct shl_trace *trace, struct shl_trace_dict *data) +{ + if (!trace || !trace->enable_trace || !trace->is_init) return; + trace->other_data->data = data; +} + +void shl_trace_duration_begin(struct shl_trace *trace, const char *name, + enum shl_trace_event_category cat, struct shl_trace_dict *args) +{ + if (!trace || !trace->enable_trace || !trace->is_init) return; + struct shl_trace_event_format *event = shl_trace_create_common_event(); + memcpy(event->name, name, strlen(name) + 1); + event->cat = cat; + event->ph = SHL_TRACE_EVENT_TYPE_DURATION_B; + event->args = args; + + // update + shl_trace_insert_event(trace, event); +} + +void shl_trace_duration_end(struct shl_trace *trace, const char *name, + enum shl_trace_event_category cat, struct shl_trace_dict *args) +{ + if (!trace || !trace->enable_trace || !trace->is_init) return; + struct shl_trace_event_format *event = shl_trace_create_common_event(); + memcpy(event->name, name, strlen(name) + 1); + event->cat = cat; + event->ph = SHL_TRACE_EVENT_TYPE_DURATION_E; + 
event->args = args; + + // update + shl_trace_insert_event(trace, event); +} +#endif diff --git a/tests/autotest/interface_test.py b/tests/autotest/interface_test.py index c0053bd4..41be47a0 100644 --- a/tests/autotest/interface_test.py +++ b/tests/autotest/interface_test.py @@ -99,13 +99,13 @@ def caseParams(dirname, dtype, cpu_type, flow=FLOW_ID): for key, value in case_type.items(): case_name = key if isinstance(value, dict): - for k,v in value.items(): + for k,v in value.items(): if v.get("layout", "nchw") == "nhwc": elf_name = f"{case_name}_nhwc" else: elf_name = case_name elf_data = os.path.join(elf_path, f"{elf_name}.o.elf") - temp_case.append(pytest.param((elf_data, case_name, v), id=id_flag)) + temp_case.append(pytest.param((elf_data, case_name, k, v), id=id_flag)) else: continue except: @@ -119,6 +119,7 @@ def run_base( cmd_execute, elf_data, python_data, + data_params, test_accuracy, python_cmd, ): @@ -145,7 +146,7 @@ def run_base( print(ret) err = ret.stderr.decode("utf-8") # out = out - assert ret.returncode == 0, f"\nexecute cmd:\n{cmd}\ngenerate python:\n{python_cmd}\n{p_out}out:\n{out}\nerr:\n{err}" + assert ret.returncode == 0, f"\nexecute cmd:\n{cmd}\ngenerate python:\n{python_cmd}\n{p_out}out:\n{out}\nerr:\n{err}\nparams:{data_params}" class Test_CSINN_Base: @@ -173,8 +174,8 @@ def setup_class(self): class TestCSINN(Test_CSINN_Base): #####TODO fix########### @pytest.mark.parametrize('test_data', caseParams(elf_path, DTYPE, CPU_TYPE)) - def test_layer(self, test_data): - python_data = test_data[1:] + def test_layer(self, test_data): + python_data = (test_data[1], test_data[3]) if test_data[1] == "convolution" or test_data[1] == "group_convolution" or test_data[1] == "depthwise_convolution": convolution(python_data) elif test_data[1] == "convolution_relu": @@ -198,15 +199,15 @@ def test_layer(self, test_data): elif test_data[1] == "abs" or test_data[1] == "relu" or test_data[1] == "erf" or test_data[1] == "sigmoid": unary(python_data) elif 
test_data[1] == "relu1" or test_data[1] == "relu6": - thresholdedrelu(python_data) + thresholdedrelu(python_data) elif test_data[1] == "minimum": - muti_min(python_data) + muti_min(python_data) elif test_data[1] == "strided_slice": strided_slice(python_data) elif test_data[1] == "reduce_sum": - reduce_op(python_data) + reduce_op(python_data) elif test_data[1] == "reshape": - reshape(python_data) + reshape(python_data) elif test_data[1] == "silu": silu(python_data) elif test_data[1] == "clip": @@ -229,21 +230,21 @@ def test_layer(self, test_data): transpose(python_data) elif test_data[1] == "lrn": lrn(python_data) - elif test_data[1] == "convolution1d": + elif test_data[1] == "convolution1d" or test_data[1] == "depthwise_convolution1d": convolution1d(python_data) elif test_data[1] == "softmax": softmax(python_data) elif test_data[1] == "layer_norm": layer_norm(python_data) else: - return - - run_base(self.qemu, test_data[0], TOPDIR + test_data[1] + "_test_data_f32.bin", self.accuracy, "") + return + + run_base(self.qemu, test_data[0], TOPDIR + test_data[1] + "_test_data_f32.bin", f"{test_data[2]}: {test_data[3]}", self.accuracy, "") @pytest.mark.parametrize('unit_test_elf_data', numberOffile(unit_test_elf_path, "elf")) def test_opt_interface(self, unit_test_elf_data): - run_base(self.qemu, unit_test_elf_data, "", self.accuracy, "") + run_base(self.qemu, unit_test_elf_data, "", "", self.accuracy, "") class TestHeterogeneous: diff --git a/tests/llm/Makefile b/tests/llm/Makefile index 3741f418..db4c2ac1 100644 --- a/tests/llm/Makefile +++ b/tests/llm/Makefile @@ -8,5 +8,10 @@ x86_ref_llama_quantize: gcc -c -g model-f16.c -I../../include -I../../include/csinn g++ llama2_quantize.o model-f16.o -o llama2_quantize.elf ../../install_nn2/x86/lib/libshl.a -lm -static -fopenmp -g +c920_llama_quantize: + riscv64-unknown-linux-gnu-gcc -c -g c920_llama2_quantize.c -I../../include -I../../include/csinn + riscv64-unknown-linux-gnu-gcc -c -g model-f16.c -I../../include 
-I../../include/csinn + riscv64-unknown-linux-gnu-g++ c920_llama2_quantize.o model-f16.o -o c920_llama2_quantize.elf ../../install_nn2/c920/lib/libshl_c920.a -lm -static -fopenmp -g + clean: rm -rf *.o *.elf diff --git a/tests/llm/c920_llama2_quantize.c b/tests/llm/c920_llama2_quantize.c new file mode 100644 index 00000000..577028ff --- /dev/null +++ b/tests/llm/c920_llama2_quantize.c @@ -0,0 +1,159 @@ +/* + * Copyright (C) 2016-2023 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * Block quantization from llama.cpp + */ + +#include "llm/shl_llm.h" +#include "llm/shl_llm_json.h" + +// calculate cosine similarity +static float compute_cs(float *a, float *b, uint32_t size) +{ + double dot_sum = 0.0; + double a_norm = 0.0; + double b_norm = 0.0; + float res = 0.0; + + for (int i = 0; i < size; i++) { + dot_sum += (a[i] * b[i]); + a_norm += (a[i] * a[i]); + b_norm += (b[i] * b[i]); + } + res = dot_sum / (sqrt(a_norm * b_norm)); + return res; +} + +int main(int argc, char **argv) +{ + struct llama_config *config = shl_mem_alloc(sizeof(struct llama_config)); + config->dim = 4096; + config->multiple_of = 256; + config->n_heads = 32; + config->n_layers = 32; + config->nor_eps = 1e-05; + config->vocab_size = -1; + + /* + * arg1: q8/q4 model path + * arg2: data type (32/16) + */ + struct shl_llm_model *reload_model = shl_llm_load_json(argv[1]); + config->shl_model = reload_model; + config->base_api = CSINN_C920; + + int dtype = 32; + if (atoi(argv[2]) == 32) { + dtype = 32; + config->base_quant_type = CSINN_QUANT_FLOAT32; + config->base_dtype = CSINN_DTYPE_FLOAT32; + } else if (atoi(argv[2]) == 16) { + dtype = 16; + config->base_quant_type = CSINN_QUANT_FLOAT16; + config->base_dtype = CSINN_DTYPE_FLOAT16; + } else { + printf("error dtype, only support 32/16\n"); + return 0; + } + + shl_multithread_set_threads(4); + + struct shl_llm_ctx *ctx = llama2_build(config); + + /* + * prompt: "Building a website can be done in 10 simple steps:\nStep 1:" + */ + int32_t token[] = {1, 17166, 263, 4700, 508, 367, 2309, 297, 29871, 29896, + 29900, 2560, 6576, 29901, 13, 14448, 29871, 29896, 29901}; + + struct shl_llm_input *embd = + (struct shl_llm_input *)shl_mem_alloc(sizeof(struct shl_llm_input)); + embd->n_tokens = 19; + embd->token = token; + embd->pos = (int32_t *)malloc(4 * 19); + for (int i = 0; i < 19; i++) { + embd->pos[i] = i; + } + + llm_run(ctx, embd); + + // check prefill result + float *result; + if (dtype == 32) { + result = (float 
*)ctx->output_session->output[0]->data; + // last logits + result += 32000 * 18; + } else if (dtype == 16) { + result = (float *)shl_mem_alloc(32000 * sizeof(float)); + int16_t *result_fp16 = ctx->output_session->output[0]->data; + result_fp16 += 32000 * 18; + for (int i = 0; i < 32000; i++) { + result[i] = shl_ref_float16_to_float32(result_fp16[i]); + } + } + float reference_result[] = { + -5.71030331, -6.5068779, 4.49947596, 1.61511719, 2.1548543, 0.0926032066, + 2.82565427, 0.221694469, -0.802444339, 0.397152185, 3.21004057, 2.24275088, + 10.127943, 7.96558952, 0.993502378, -0.401111603, -5.71008682, -1.67284417, + -0.722307682, 1.00253534, -0.748121202, -1.11147189, -0.527304411, -0.988370299, + -1.5118947, -1.75848043, -0.597546458, -0.898284316, -1.02883792, -0.916219473, + 0.592717409, -0.389472723, -1.51692116, -1.74400616, -0.0866698027, -5.71152353, + -5.71036053, -5.70992756, -5.7100029, -5.71010208, -5.71188021, -5.710711, + }; + float cs_ret = compute_cs(result, reference_result, 42); + printf("first five: %f, %f, %f, %f, %f\n", result[0], result[1], result[2], result[3], + result[4]); + printf("result cos = %f\n", cs_ret); + + // save_model(ctx); + /* temperature = 0, greedy sampling */ + float prob[5]; + uint32_t index[5]; + shl_get_top5(result, 32000, prob, index); + printf("Next id: %d = %f\n", index[0], prob[0]); + + /* + * --temp 0 --repeat-penalty 1.0 + * next decode reference results: + * Cho ose a domain name . + * 14542,852,263, 5354, 1024, 29889, 13 + * Step 2 : Cho ose a hosting provider. 
+ * 14448,29871,29906,29901,14542,852,263,23376, 13113, 29889 + */ + for (int i = 19; i < 19 + 16; i++) { + embd->n_tokens = 1; + embd->token[0] = index[0]; + embd->pos[0] = i; + llm_run(ctx, embd); + if (dtype == 32) { + result = (float *)ctx->output_session->output[0]->data; + } else if (dtype == 16) { + int16_t *result_fp16 = ctx->output_session->output[0]->data; + for (int i = 0; i < 32000; i++) { + result[i] = shl_ref_float16_to_float32(result_fp16[i]); + } + } + shl_get_top5(result, 32000, prob, index); + printf("Next id: %d = %f\n", index[0], prob[0]); + } + if (dtype == 16) { + shl_mem_free(result); + } + return 0; +} diff --git a/tests/llm/llama2.c b/tests/llm/llama2.c index 31247727..3e92409d 100644 --- a/tests/llm/llama2.c +++ b/tests/llm/llama2.c @@ -65,6 +65,9 @@ void main() void *base_addr = shl_mmap(path); config->shl_model = load_shl_model(base_addr); + config->base_api = CSINN_REF; + config->base_quant_type = CSINN_QUANT_FLOAT32; + config->base_dtype = CSINN_DTYPE_FLOAT32; struct shl_llm_ctx *ctx = llama2_build(config); diff --git a/tests/llm/llama2_quantize.c b/tests/llm/llama2_quantize.c index 9e41b9d7..c54afcc9 100644 --- a/tests/llm/llama2_quantize.c +++ b/tests/llm/llama2_quantize.c @@ -145,6 +145,9 @@ int main(int argc, char **argv) struct shl_llm_model *reload_model = shl_llm_load_json(argv[2]); config->shl_model = reload_model; // config->shl_model = new_model; + config->base_api = CSINN_REF; + config->base_quant_type = CSINN_QUANT_FLOAT32; + config->base_dtype = CSINN_DTYPE_FLOAT32; struct shl_llm_ctx *ctx = llama2_build(config); diff --git a/tests/profiler/Makefile b/tests/profiler/Makefile new file mode 100644 index 00000000..baad470f --- /dev/null +++ b/tests/profiler/Makefile @@ -0,0 +1,43 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +ROOT_DIR=$(shell cd ../..; pwd) +LIB_DIR=${ROOT_DIR}/install_nn2/x86 +INCLUDE += -I${LIB_DIR}/include +LDFLAGS = -L${LIB_DIR}/lib + +CC = gcc + +CFLAGS += -O2 -g ${INCLUDE} -DSHL_TRACE + +LDFLAGS += -lshl -lstdc++ -Wl,--gc-sections + +.PHONY: clean all + +all: test_shl_trace + +test_shl_trace: test_trace.c + $(CC) $(CFLAGS) -o $@ $^ $(LDFLAGS) + +run: test_shl_trace + ./test_shl_trace + +run_with_valgrind: test_shl_trace + valgrind --tool=memcheck --leak-check=full ./test_shl_trace + +clean: + -rm test_shl_trace \ No newline at end of file diff --git a/tests/profiler/test_trace.c b/tests/profiler/test_trace.c new file mode 100644 index 00000000..7c524132 --- /dev/null +++ b/tests/profiler/test_trace.c @@ -0,0 +1,220 @@ +/* + * Copyright (C) 2016-2023 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include + +#include "shl_profiler.h" + +#define TEST_WRAPPER(func, msg) \ + { \ + printf("Testing: %s\n", msg); \ + int fail = func; \ + if (fail > 0) { \ + printf("Testing: %s - fail\n", msg); \ + } else { \ + printf("Testing: %s - Pass\n", msg); \ + } \ + } + +int _test_shl_trace_create_string(const char *data) +{ + int ret = 0; + struct shl_trace_value *value = shl_trace_create_string(data); + if (value->type != SHL_TRACE_VALUE_TYPE_STRING || strcmp(value->content.str, data) != 0) { + printf("%s: %s fail\n", __func__, data); + ret = 1; + } + shl_trace_release_value(value); + return ret; +} + +int _test_shl_trace_create_int64(int64_t data) +{ + int ret = 0; + struct shl_trace_value *value = shl_trace_create_int64(data); + if (value->type != SHL_TRACE_VALUE_TYPE_INT64 || value->content.i64 != data) { + printf("%s: %ld fail\n", __func__, data); + ret = 1; + } + shl_trace_release_value(value); + return ret; +} + +int _test_shl_trace_create_uint64(uint64_t data) +{ + int ret = 0; + struct shl_trace_value *value = shl_trace_create_uint64(data); + if (value->type != SHL_TRACE_VALUE_TYPE_UINT64 || value->content.u64 != data) { + printf("%s: %lu fail\n", __func__, data); + ret = 1; + } + shl_trace_release_value(value); + return ret; +} + +int _test_shl_trace_create_double(double data) +{ + int ret = 0; + struct shl_trace_value *value = shl_trace_create_double(data); + if (value->type != SHL_TRACE_VALUE_TYPE_DOUBLE || fabs(value->content.f64 - data) > 1e-10) { + printf("%s: %f fail\n", __func__, data); + ret = 1; + } + shl_trace_release_value(value); + return ret; +} + +int _test_shl_trace_create_list() +{ + int ret = 0; + struct shl_trace_value *value = + shl_trace_create_list(4, shl_trace_create_string("data"), shl_trace_create_int64(-100), + shl_trace_create_uint64(100), shl_trace_create_double(3.3)); + if (value->type != SHL_TRACE_VALUE_TYPE_LIST || value->content.list->size != 4) { + printf("%s: wrong type(%d) or size(%d) fail\n", __func__, 
value->type,
+               value->content.list->size);
+        ret = 1;
+    }
+    struct shl_trace_value_list *list = value->content.list;
+    ret += _test_shl_trace_create_string(list->value[0]->content.str);
+    ret += _test_shl_trace_create_int64(list->value[1]->content.i64);
+    ret += _test_shl_trace_create_uint64(list->value[2]->content.u64);
+    ret += _test_shl_trace_create_double(list->value[3]->content.f64);
+
+    shl_trace_release_value(value);
+    return ret > 0;
+}
+
+int test_shl_trace_value()
+{
+    int fail_num = 0;
+    fail_num += _test_shl_trace_create_string("data");
+    fail_num += _test_shl_trace_create_int64(-1);
+    fail_num += _test_shl_trace_create_uint64(2);
+    fail_num += _test_shl_trace_create_double(2.5);
+    fail_num += _test_shl_trace_create_list();
+
+    return fail_num;
+}
+
+int test_shl_trace_dict()
+{
+    int fail_num = 0;
+    struct shl_trace_dict *dict;
+    dict = shl_trace_create_dict(
+        5, "string", shl_trace_create_string("string"), "int64", shl_trace_create_int64(-5),
+        "uint64", shl_trace_create_uint64(4), "double", shl_trace_create_double(4.6), "list",
+        shl_trace_create_list(2, shl_trace_create_int64(256), shl_trace_create_int64(256)));
+    if (dict && dict->items_size == 5) {
+        if (strcmp(dict->items[0]->key, "string") != 0 ||
+            strcmp(dict->items[1]->key, "int64") != 0 ||
+            strcmp(dict->items[2]->key, "uint64") != 0 ||
+            strcmp(dict->items[3]->key, "double") != 0 ||
+            strcmp(dict->items[4]->key, "list") != 0) {
+            printf("Wrong item key...\n");
+            fail_num++;
+        }
+    } else {
+        fail_num++;
+    }
+
+    shl_trace_release_dict(dict);
+    return fail_num;
+}
+
+int test_shl_trace_begin_end()
+{
+    int fail_num = 0;
+    struct shl_trace *trace = (struct shl_trace *)shl_mem_alloc(sizeof(struct shl_trace));
+
+    shl_trace_begin(trace, "trace.json");
+    if (trace->events != NULL || trace->events_capacity != 0 || trace->events_size != 0 ||
+        strcmp(trace->filename, "trace.json") == 0) {
+        printf("shouldn't initialize trace while enable_trace is false...\n");
+        fail_num++;
+    }
+
+    
trace->enable_trace = true; + shl_trace_begin(trace, "trace.json"); + if (trace->is_init == false || strcmp(trace->filename, "trace.json") != 0 || + trace->events == NULL || trace->events_capacity == 0 || trace->events_size != 0) { + printf("fail to initialize trace...\n"); + fail_num++; + } + + shl_trace_end(trace); + if (trace->is_init != false || trace->events_capacity != 0 || trace->events_size != 0 || + trace->events != NULL) { + printf("fail to deinit trace...\n"); + fail_num++; + } + shl_mem_free(trace); + return fail_num; +} + +void func_procedure() +{ + // simulate doing something + sleep(1); +} + +int test_shl_trace_end2end() +{ + int fail_num = 0; + + struct shl_trace *trace = (struct shl_trace *)shl_mem_alloc(sizeof(struct shl_trace)); + trace->enable_trace = true; + + SHL_TRACE_CALL(shl_trace_begin(trace, NULL)); + + // generate custom data into otherData + SHL_TRACE_CALL(shl_trace_other_data( + trace, + shl_trace_create_dict(2, "hardware", SHL_TRACE_STRING("x86"), "key", SHL_TRACE_INT64(10)))); + + int tmp[] = {1, 3, 224, 224}; + SHL_TRACE_CALL(shl_trace_duration_begin( + trace, "func_procedure", SHL_TRACE_EVENT_CPU_OPERATOR, + shl_trace_create_dict(4, "type", shl_trace_create_string("csinn cpu"), "shape", + shl_trace_create_list(3, SHL_TRACE_INT64(3), SHL_TRACE_INT64(112), + SHL_TRACE_INT64(112)), + "dim", SHL_TRACE_LIST_INT(4, tmp), "input_shape", + SHL_TRACE_LIST(1, SHL_TRACE_LIST_INT(4, tmp))))); + + func_procedure(); + + SHL_TRACE_CALL( + shl_trace_duration_end(trace, "func_procedure", SHL_TRACE_EVENT_CPU_OPERATOR, NULL)); + + SHL_TRACE_CALL(shl_trace_end(trace)); + + shl_mem_free(trace); + return fail_num; +} + +int main() +{ + TEST_WRAPPER(test_shl_trace_value(), "shl_trace_value: create/release"); + TEST_WRAPPER(test_shl_trace_dict(), "shl_trace_dict: create/release dict_item/dict"); + TEST_WRAPPER(test_shl_trace_begin_end(), "shl_trace_begin_end"); + + TEST_WRAPPER(test_shl_trace_end2end(), "shl_trace_end2end"); + + return 0; +} \ No newline 
at end of file diff --git a/tests/utils/test_utils.c b/tests/utils/test_utils.c index 69c8c443..0a273a66 100644 --- a/tests/utils/test_utils.c +++ b/tests/utils/test_utils.c @@ -470,57 +470,71 @@ void find_min_max(float *input, float *max_value, float *min_value, int size) void set_quant_info(struct csinn_tensor *tensor, enum csinn_quant_enum qtype, enum csinn_api_enum api) { - float max, min, scale; - int32_t zp, quantized_multiplier, shift; - if (tensor->qinfo == NULL) { - tensor->qinfo = malloc(sizeof(struct csinn_quant_info)); + if (qtype == CSINN_QUANT_BLOCK_Q8_0 || qtype == CSINN_QUANT_BLOCK_Q4_0) { + return; } + int size = csinn_tensor_size(tensor); - find_min_max(tensor->data, &max, &min, size); + int q_size = tensor->quant_channel; + int inner_size = size / q_size; - if (qtype == CSINN_QUANT_INT8_SYM) { - if (api == CSINN_TH1520) { - get_scale_and_zp_power2_i8(max, min, &scale, &zp); - if (min >= 0 && max > 0) { - min = -max; + if (q_size > 1 && !(tensor->layout >= CSINN_LAYOUT_N && tensor->layout <= CSINN_LAYOUT_O1HW)) { + printf("only support NCHW layout\n"); + } + if (tensor->qinfo == NULL) { + tensor->qinfo = (struct csinn_quant_info *)malloc(q_size * sizeof(struct csinn_quant_info)); + } + + for (int i = 0; i < q_size; i++) { + float max, min, scale; + int32_t zp, quantized_multiplier, shift; + float *data = (float *)tensor->data + i * inner_size; + find_min_max(data, &max, &min, inner_size); + + if (qtype == CSINN_QUANT_INT8_SYM) { + if (api == CSINN_TH1520) { + get_scale_and_zp_power2_i8(max, min, &scale, &zp); + if (min >= 0 && max > 0) { + min = -max; + } else { + min = min; + } } else { - min = min; + get_scale_and_zp_i8(max, min, &scale, &zp); } - } else { - get_scale_and_zp_i8(max, min, &scale, &zp); - } - } else if (qtype == CSINN_QUANT_UINT8_ASYM) { - get_scale_and_zp(max, min, &scale, &zp); - } else if (qtype == CSINN_QUANT_INT8_ASYM) { - get_scale_and_zp_i8_asym(max, min, &scale, &zp); - } else if (qtype == CSINN_QUANT_INT16_SYM) { - if 
(api == CSINN_TH1520) { - get_scale_and_zp_power2_i16(max, min, &scale, &zp); - if (min >= 0 && max > 0) { - min = -max; + } else if (qtype == CSINN_QUANT_UINT8_ASYM) { + get_scale_and_zp(max, min, &scale, &zp); + } else if (qtype == CSINN_QUANT_INT8_ASYM) { + get_scale_and_zp_i8_asym(max, min, &scale, &zp); + } else if (qtype == CSINN_QUANT_INT16_SYM) { + if (api == CSINN_TH1520) { + get_scale_and_zp_power2_i16(max, min, &scale, &zp); + if (min >= 0 && max > 0) { + min = -max; + } else { + min = min; + } } else { - min = min; + printf("unsupport qinfo\n"); } + } else if (qtype == CSINN_QUANT_FLOAT16) { + scale = 1; + zp = 0; + } else if (qtype == CSINN_QUANT_FLOAT32) { + scale = 1; + zp = 0; } else { printf("unsupport qinfo\n"); } - } else if (qtype == CSINN_QUANT_FLOAT16) { - scale = 1; - zp = 0; - } else if (qtype == CSINN_QUANT_FLOAT32) { - scale = 1; - zp = 0; - } else { - printf("unsupport qinfo\n"); - } - tensor->qinfo->max = max; - tensor->qinfo->min = min; - shl_quantize_multiplier(scale, &quantized_multiplier, &shift); - tensor->qinfo->scale = scale; - tensor->qinfo->zero_point = zp; - tensor->qinfo->multiplier = quantized_multiplier; - tensor->qinfo->shift = shift; + tensor->qinfo[i].max = max; + tensor->qinfo[i].min = min; + shl_quantize_multiplier(scale, &quantized_multiplier, &shift); + tensor->qinfo[i].scale = scale; + tensor->qinfo[i].zero_point = zp; + tensor->qinfo[i].multiplier = quantized_multiplier; + tensor->qinfo[i].shift = shift; + } } void get_quant_info(struct csinn_tensor *tensor) @@ -557,6 +571,19 @@ void get_quant_info(struct csinn_tensor *tensor) tensor->qinfo->shift = shift; } +struct csinn_tensor *broadcast_quant_info(struct csinn_tensor *i_tensor, + struct csinn_tensor *o_tensor, + enum csinn_dtype_enum dtype) +{ + struct csinn_tensor *ret = csinn_alloc_tensor(NULL); + o_tensor->qinfo = i_tensor->qinfo; + csinn_tensor_copy(ret, o_tensor); + ret->dtype = dtype; + ret->data = malloc(csinn_tensor_byte_size(ret)); + 
csinn_tensor_data_convert(ret, o_tensor); + return ret; +} + struct csinn_tensor *convert_input(struct csinn_tensor *tensor, int dtype) { struct csinn_tensor *ret = csinn_alloc_tensor(tensor->sess); @@ -598,11 +625,27 @@ struct csinn_tensor *convert_f32_layer(struct csinn_tensor *tensor, enum csinn_q ret->dtype = CSINN_DTYPE_FLOAT16; } else if (qtype == CSINN_QUANT_FLOAT32) { ret->dtype = CSINN_DTYPE_FLOAT32; + } else if (qtype == CSINN_QUANT_BLOCK_Q8_0) { + ret->dtype = CSINN_DTYPE_INT8; + ret->mtype = CSINN_MEM_TYPE_BLOCK_Q8_0; + } else if (qtype == CSINN_QUANT_BLOCK_Q4_0) { + ret->dtype = CSINN_DTYPE_INT4; + ret->mtype = CSINN_MEM_TYPE_BLOCK_Q4_0; } else { printf("unsupport qinfo\n"); } - ret->data = malloc(csinn_tensor_byte_size(ret)); + if (qtype == CSINN_QUANT_BLOCK_Q8_0) { + int q8_block_size = 32; + int scale_size = csinn_tensor_size(ret) / q8_block_size * sizeof(int16_t); + ret->data = shl_mem_alloc(csinn_tensor_size(ret) + scale_size); + } else if (qtype == CSINN_QUANT_BLOCK_Q4_0) { + int q4_block_size = 32; + int scale_size = csinn_tensor_size(ret) / q4_block_size * sizeof(int16_t); + ret->data = shl_mem_alloc(csinn_tensor_size(ret) / 2 + scale_size); + } else { + ret->data = malloc(csinn_tensor_byte_size(ret)); + } csinn_tensor_data_convert(ret, tensor); return ret; diff --git a/tests/utils/test_utils.h b/tests/utils/test_utils.h index d15cc2fc..e8c6d788 100644 --- a/tests/utils/test_utils.h +++ b/tests/utils/test_utils.h @@ -54,6 +54,9 @@ void set_quant_info(struct csinn_tensor *tensor, enum csinn_quant_enum qtype, struct csinn_tensor *convert_input(struct csinn_tensor *tensor, int dtype); struct csinn_tensor *convert_f32_input(struct csinn_tensor *tensor, int dtype, struct csinn_session *sess); +struct csinn_tensor *broadcast_quant_info(struct csinn_tensor *i_tensor, + struct csinn_tensor *o_tensor, + enum csinn_dtype_enum dtype); struct csinn_tensor *convert_f32_layer(struct csinn_tensor *tensor, enum csinn_quant_enum qtype, enum csinn_api_enum api); 
struct csinn_tensor *fuse_zp_to_bias(struct csinn_tensor *input, struct csinn_tensor *weight, diff --git a/tests/validation_layer/Makefile.c906 b/tests/validation_layer/Makefile.c906 index 66111106..57f97202 100644 --- a/tests/validation_layer/Makefile.c906 +++ b/tests/validation_layer/Makefile.c906 @@ -1,5 +1,5 @@ LIB_DIR = ../../c906_static_build -INCLUDE = -I../../include -I../../include/shl_public -I../utils -I./layer +INCLUDE = -I../../include -I../../include/csinn -I../../include/shl_public -I../utils -I./layer CFLAGS = -O0 -g3 -static CFLAGS += -march=rv64gcv0p7_zfh_xtheadc -mabi=lp64d CFLAGS += -ffunction-sections -fdata-sections -Wl,--gc-sections diff --git a/tests/validation_layer/Makefile.c908 b/tests/validation_layer/Makefile.c908 index b5e6edaf..42324998 100644 --- a/tests/validation_layer/Makefile.c908 +++ b/tests/validation_layer/Makefile.c908 @@ -1,5 +1,5 @@ LIB_DIR = ../../c908_build -INCLUDE = -I../../include -I../../include/shl_public -I../utils -I./layer +INCLUDE = -I../../include -I../../include/csinn -I../../include/shl_public -I../utils -I./layer CFLAGS = -O0 -g3 -static CFLAGS += -march=rv64gcv_zfh_xtheadc_xtheadvdot -mabi=lp64d CFLAGS += -ffunction-sections -fdata-sections -Wl,--gc-sections diff --git a/tests/validation_layer/Makefile.c920 b/tests/validation_layer/Makefile.c920 index f3f9ce3c..86c85842 100644 --- a/tests/validation_layer/Makefile.c920 +++ b/tests/validation_layer/Makefile.c920 @@ -1,5 +1,5 @@ LIB_DIR = ../../c920_build -INCLUDE = -I../../include -I../../include/shl_public -I../utils -I./layer +INCLUDE = -I../../include -I../../include/csinn -I../../include/shl_public -I../utils -I./layer CFLAGS = -O0 -g3 -static CFLAGS += -march=rv64gcv0p7_zfh_xtheadc -mabi=lp64d CFLAGS += -ffunction-sections -fdata-sections -Wl,--gc-sections @@ -62,7 +62,7 @@ $(utils_objs): %.o: %.c $(test_objs): %.o: %.cpp $(CPLUS) -c $(CFLAGS) $(INCLUDE) -D DTYPE=$(TYPE) $< -o $@ - $(CPLUS) $@ $(CFLAGS) $(BOARD) $(utils_objs) -L$(LIB_DIR) -l$(LIB_NAME) 
-lc -lm -o $@.elf -lgcov + $(CPLUS) $@ $(CFLAGS) $(BOARD) $(utils_objs) -L$(LIB_DIR) -l$(LIB_NAME) -lc -lm -fopenmp -o $@.elf -lgcov clean: rm -rf $(test_objs) $(utils_objs) *.a *.asm *.elf *.bin *.asm diff --git a/tests/validation_layer/Makefile.c920v2 b/tests/validation_layer/Makefile.c920v2 index 96bf4b8b..d3fd4503 100644 --- a/tests/validation_layer/Makefile.c920v2 +++ b/tests/validation_layer/Makefile.c920v2 @@ -1,5 +1,5 @@ LIB_DIR = ../../c920v2_build -INCLUDE = -I../../include -I../../include/shl_public -I../utils -I./layer +INCLUDE = -I../../include -I../../include/csinn -I../../include/shl_public -I../utils -I./layer CFLAGS = -O0 -g3 -static CFLAGS += -march=rv64gcv_zfh_xtheadc_xtheadvdot -mabi=lp64d CFLAGS += -ffunction-sections -fdata-sections -Wl,--gc-sections diff --git a/tests/validation_layer/Makefile.rvm b/tests/validation_layer/Makefile.rvm index 696ec740..9b0414cf 100644 --- a/tests/validation_layer/Makefile.rvm +++ b/tests/validation_layer/Makefile.rvm @@ -1,5 +1,5 @@ LIB_DIR = ../../rvm_build -INCLUDE = -I../../include -I../../include/shl_public -I../utils -I./layer +INCLUDE = -I../../include -I../../include/csinn -I../../include/shl_public -I../utils -I./layer CFLAGS = -O0 -g3 -static CFLAGS += -march=rv64gcv_zfh_xtheadc_xtheadvdot_xtheadmatrix -mabi=lp64d CFLAGS += -ffunction-sections -fdata-sections -Wl,--gc-sections diff --git a/tests/validation_layer/Makefile.rvv b/tests/validation_layer/Makefile.rvv index e519b44e..c2129c43 100644 --- a/tests/validation_layer/Makefile.rvv +++ b/tests/validation_layer/Makefile.rvv @@ -1,5 +1,5 @@ LIB_DIR = ../../rvv_build -INCLUDE = -I../../include -I../../include/shl_public -I../utils -I./layer +INCLUDE = -I../../include -I../../include/csinn -I../../include/shl_public -I../utils -I./layer CFLAGS = -O0 -g3 -static CFLAGS += -march=rv64gcv_zfh_xtheadc_xtheadvdot -mabi=lp64d CFLAGS += -ffunction-sections -fdata-sections -Wl,--gc-sections @@ -14,6 +14,7 @@ test_objs = test_objs += abs.o test_objs += add.o 
test_objs += averagepool.o +test_objs += averagepool_nhwc.o test_objs += broadcast_to.o test_objs += div.o test_objs += clip.o @@ -29,10 +30,13 @@ test_objs += erf.o test_objs += fullyconnected.o test_objs += gather.o test_objs += global_avgpool.o +test_objs += global_avgpool_nhwc.o test_objs += global_maxpool.o +test_objs += global_maxpool_nhwc.o test_objs += layer_norm.o test_objs += leaky_relu.o test_objs += maxpool.o +test_objs += maxpool_nhwc.o test_objs += mul.o test_objs += minimum.o test_objs += matmul.o diff --git a/tests/validation_layer/abs.cpp b/tests/validation_layer/abs.cpp index 8021e316..9b22f019 100644 --- a/tests/validation_layer/abs.cpp +++ b/tests/validation_layer/abs.cpp @@ -23,7 +23,9 @@ int main(int argc, char **argv) init_testsuite("Testing function of abs(layer).\n"); struct csinn_session *sess = csinn_alloc_session(); - sess->base_run_mode = CSINN_RM_LAYER; + sess->base_run_mode = CSINN_RM_CPU_GRAPH; + sess->model.save_mode = CSINN_RUN_ONLY; + sess->dynamic_shape = CSINN_FALSE; struct csinn_tensor *input = csinn_alloc_tensor(sess); struct csinn_tensor *output = csinn_alloc_tensor(sess); struct csinn_tensor *reference = csinn_alloc_tensor(sess); @@ -59,14 +61,14 @@ int main(int argc, char **argv) float difference = argc > 2 ? 
atof(argv[2]) : 0.99; #if (DTYPE == 32) - test_unary_op(input, output, params, CSINN_QUANT_FLOAT32, csinn_abs_init, csinn_abs, - &difference); + test_unary_op(input, output, params, CSINN_DTYPE_FLOAT32, CSINN_QUANT_FLOAT32, sess, + csinn_abs_init, csinn_abs, &difference); #elif (DTYPE == 16) - test_unary_op(input, output, params, CSINN_QUANT_FLOAT16, csinn_abs_init, csinn_abs, - &difference); + test_unary_op(input, output, params, CSINN_DTYPE_FLOAT16, CSINN_QUANT_FLOAT16, sess, + csinn_abs_init, csinn_abs, &difference); #elif (DTYPE == 8) - test_unary_op(input, output, params, CSINN_QUANT_INT8_ASYM, csinn_abs_init, csinn_abs, - &difference); + test_unary_op(input, output, params, CSINN_DTYPE_INT8, CSINN_QUANT_INT8_ASYM, sess, + csinn_abs_init, csinn_abs, &difference); #endif return done_testing(); diff --git a/tests/validation_layer/add.cpp b/tests/validation_layer/add.cpp index 727b1aa8..9b3c6b21 100644 --- a/tests/validation_layer/add.cpp +++ b/tests/validation_layer/add.cpp @@ -23,7 +23,9 @@ int main(int argc, char **argv) init_testsuite("Testing function of add(layer).\n"); struct csinn_session *sess = csinn_alloc_session(); - sess->base_run_mode = CSINN_RM_LAYER; + sess->base_run_mode = CSINN_RM_CPU_GRAPH; + sess->model.save_mode = CSINN_RUN_ONLY; + sess->dynamic_shape = CSINN_FALSE; struct csinn_tensor *input0 = csinn_alloc_tensor(sess); struct csinn_tensor *input1 = csinn_alloc_tensor(sess); struct csinn_tensor *output = csinn_alloc_tensor(sess); @@ -77,14 +79,14 @@ int main(int argc, char **argv) float difference = argc > 2 ? 
atof(argv[2]) : 0.9; #if (DTYPE == 32) - test_binary_op(input0, input1, output, params, CSINN_QUANT_FLOAT32, csinn_add_init, csinn_add, - &difference); + test_binary_op(input0, input1, output, params, CSINN_DTYPE_FLOAT32, CSINN_QUANT_FLOAT32, sess, + csinn_add_init, csinn_add, &difference); #elif (DTYPE == 16) - test_binary_op(input0, input1, output, params, CSINN_QUANT_FLOAT16, csinn_add_init, csinn_add, - &difference); + test_binary_op(input0, input1, output, params, CSINN_DTYPE_FLOAT16, CSINN_QUANT_FLOAT16, sess, + csinn_add_init, csinn_add, &difference); #elif (DTYPE == 8) - test_binary_op(input0, input1, output, params, CSINN_QUANT_INT8_ASYM, csinn_add_init, csinn_add, - &difference); + test_binary_op(input0, input1, output, params, CSINN_DTYPE_INT8, CSINN_QUANT_INT8_ASYM, sess, + csinn_add_init, csinn_add, &difference); #endif return done_testing(); diff --git a/tests/validation_layer/averagepool.cpp b/tests/validation_layer/averagepool.cpp index e262e049..76517c27 100644 --- a/tests/validation_layer/averagepool.cpp +++ b/tests/validation_layer/averagepool.cpp @@ -23,6 +23,9 @@ int main(int argc, char **argv) init_testsuite("Testing function of avgpool2d(layer).\n"); struct csinn_session *sess = csinn_alloc_session(); + // sess->base_run_mode = CSINN_RM_CPU_GRAPH; + // sess->model.save_mode = CSINN_RUN_ONLY; + // sess->dynamic_shape = CSINN_FALSE; sess->base_run_mode = CSINN_RM_LAYER; struct csinn_tensor *input = csinn_alloc_tensor(sess); struct csinn_tensor *output = csinn_alloc_tensor(sess); @@ -76,15 +79,29 @@ int main(int argc, char **argv) output->data = reference->data; float difference = argc > 2 ? 
atof(argv[2]) : 0.99; +/* CSINN_RM_CPU_GRAPH */ +// #if (DTYPE == 32) +// test_unary_op(input, output, params, CSINN_DTYPE_FLOAT32, CSINN_QUANT_FLOAT32, sess, +// csinn_avgpool2d_init, csinn_avgpool2d, &difference); +// #elif (DTYPE == 16) +// test_unary_op(input, output, params, CSINN_DTYPE_FLOAT16, CSINN_QUANT_FLOAT16, sess, +// csinn_avgpool2d_init, csinn_avgpool2d, &difference); +// #elif (DTYPE == 8) +// test_unary_op(input, output, params, CSINN_DTYPE_INT8, CSINN_QUANT_INT8_ASYM, sess, +// csinn_avgpool2d_init, csinn_avgpool2d, &difference); +// #endif + +/* CSINN_RM_LAYER */ #if (DTYPE == 32) - test_unary_op(input, output, params, CSINN_QUANT_FLOAT32, csinn_avgpool2d_init, csinn_avgpool2d, - &difference); + test_unary_layer(input, output, params, CSINN_QUANT_FLOAT32, csinn_avgpool2d_init, + csinn_avgpool2d, &difference); #elif (DTYPE == 16) - test_unary_op(input, output, params, CSINN_QUANT_FLOAT16, csinn_avgpool2d_init, csinn_avgpool2d, - &difference); + test_unary_layer(input, output, params, CSINN_QUANT_FLOAT16, csinn_avgpool2d_init, + csinn_avgpool2d, &difference); #elif (DTYPE == 8) - test_unary_op(input, output, params, CSINN_QUANT_INT8_SYM, csinn_avgpool2d_init, - csinn_avgpool2d, &difference); + test_unary_layer(input, output, params, CSINN_QUANT_INT8_ASYM, csinn_avgpool2d_init, + csinn_avgpool2d, &difference); #endif + return done_testing(); } \ No newline at end of file diff --git a/tests/validation_layer/averagepool_nhwc.cpp b/tests/validation_layer/averagepool_nhwc.cpp index b04cae22..880c9add 100644 --- a/tests/validation_layer/averagepool_nhwc.cpp +++ b/tests/validation_layer/averagepool_nhwc.cpp @@ -23,7 +23,9 @@ int main(int argc, char **argv) init_testsuite("Testing function of avgpool2d(layer).\n"); struct csinn_session *sess = csinn_alloc_session(); - sess->base_run_mode = CSINN_RM_LAYER; + sess->base_run_mode = CSINN_RM_CPU_GRAPH; + sess->model.save_mode = CSINN_RUN_ONLY; + sess->dynamic_shape = CSINN_FALSE; struct csinn_tensor *input = 
csinn_alloc_tensor(sess); struct csinn_tensor *output = csinn_alloc_tensor(sess); struct csinn_tensor *reference = csinn_alloc_tensor(sess); @@ -77,14 +79,14 @@ int main(int argc, char **argv) float difference = argc > 2 ? atof(argv[2]) : 0.99; #if (DTYPE == 32) - test_unary_op(input, output, params, CSINN_QUANT_FLOAT32, csinn_avgpool2d_init, csinn_avgpool2d, - &difference); + test_unary_op(input, output, params, CSINN_DTYPE_FLOAT32, CSINN_QUANT_FLOAT32, sess, + csinn_avgpool2d_init, csinn_avgpool2d, &difference); #elif (DTYPE == 16) - test_unary_op(input, output, params, CSINN_QUANT_FLOAT16, csinn_avgpool2d_init, csinn_avgpool2d, - &difference); + test_unary_op(input, output, params, CSINN_DTYPE_FLOAT16, CSINN_QUANT_FLOAT16, sess, + csinn_avgpool2d_init, csinn_avgpool2d, &difference); #elif (DTYPE == 8) - test_unary_op(input, output, params, CSINN_QUANT_INT8_SYM, csinn_avgpool2d_init, - csinn_avgpool2d, &difference); + test_unary_op(input, output, params, CSINN_DTYPE_INT8, CSINN_QUANT_INT8_ASYM, sess, + csinn_avgpool2d_init, csinn_avgpool2d, &difference); #endif return done_testing(); } \ No newline at end of file diff --git a/tests/validation_layer/broadcast_to.cpp b/tests/validation_layer/broadcast_to.cpp index 9a655666..c0802820 100644 --- a/tests/validation_layer/broadcast_to.cpp +++ b/tests/validation_layer/broadcast_to.cpp @@ -23,7 +23,9 @@ int main(int argc, char **argv) init_testsuite("Testing function of broadcast_to(layer).\n"); struct csinn_session *sess = csinn_alloc_session(); - sess->base_run_mode = CSINN_RM_LAYER; + sess->base_run_mode = CSINN_RM_CPU_GRAPH; + sess->model.save_mode = CSINN_RUN_ONLY; + sess->dynamic_shape = CSINN_FALSE; struct csinn_tensor *input = csinn_alloc_tensor(sess); struct csinn_tensor *output = csinn_alloc_tensor(sess); struct csinn_tensor *reference = csinn_alloc_tensor(sess); @@ -66,14 +68,14 @@ int main(int argc, char **argv) float difference = argc > 2 ? 
atof(argv[2]) : 0.99; #if (DTYPE == 32) - test_unary_op(input, output, params, CSINN_QUANT_FLOAT32, csinn_broadcast_to_init, - csinn_broadcast_to, &difference); + test_unary_op(input, output, params, CSINN_DTYPE_FLOAT32, CSINN_QUANT_FLOAT32, sess, + csinn_broadcast_to_init, csinn_broadcast_to, &difference); #elif (DTYPE == 16) - test_unary_op(input, output, params, CSINN_QUANT_FLOAT16, csinn_broadcast_to_init, - csinn_broadcast_to, &difference); + test_unary_op(input, output, params, CSINN_DTYPE_FLOAT16, CSINN_QUANT_FLOAT16, sess, + csinn_broadcast_to_init, csinn_broadcast_to, &difference); #elif (DTYPE == 8) - test_unary_op(input, output, params, CSINN_QUANT_INT8_ASYM, csinn_broadcast_to_init, - csinn_broadcast_to, &difference); + test_unary_op(input, output, params, CSINN_DTYPE_INT8, CSINN_QUANT_INT8_ASYM, sess, + csinn_broadcast_to_init, csinn_broadcast_to, &difference); #endif return done_testing(); diff --git a/tests/validation_layer/clip.cpp b/tests/validation_layer/clip.cpp index f5e9fab7..6fb00bc1 100644 --- a/tests/validation_layer/clip.cpp +++ b/tests/validation_layer/clip.cpp @@ -23,7 +23,9 @@ int main(int argc, char **argv) init_testsuite("Testing function of clip(layer).\n"); struct csinn_session *sess = csinn_alloc_session(); - sess->base_run_mode = CSINN_RM_LAYER; + sess->base_run_mode = CSINN_RM_CPU_GRAPH; + sess->model.save_mode = CSINN_RUN_ONLY; + sess->dynamic_shape = CSINN_FALSE; struct csinn_tensor *input = csinn_alloc_tensor(sess); struct csinn_tensor *output = csinn_alloc_tensor(sess); struct csinn_tensor *reference = csinn_alloc_tensor(sess); @@ -67,14 +69,14 @@ int main(int argc, char **argv) float difference = argc > 2 ? 
atof(argv[2]) : 0.99; #if (DTYPE == 32) - test_unary_op(input, output, params, CSINN_QUANT_FLOAT32, csinn_clip_init, csinn_clip, - &difference); + test_unary_op(input, output, params, CSINN_DTYPE_FLOAT32, CSINN_QUANT_FLOAT32, sess, + csinn_clip_init, csinn_clip, &difference); #elif (DTYPE == 16) - test_unary_op(input, output, params, CSINN_QUANT_FLOAT16, csinn_clip_init, csinn_clip, - &difference); + test_unary_op(input, output, params, CSINN_DTYPE_FLOAT16, CSINN_QUANT_FLOAT16, sess, + csinn_clip_init, csinn_clip, &difference); #elif (DTYPE == 8) - test_unary_op(input, output, params, CSINN_QUANT_INT8_ASYM, csinn_clip_init, csinn_clip, - &difference); + test_unary_op(input, output, params, CSINN_DTYPE_INT8, CSINN_QUANT_INT8_ASYM, sess, + csinn_clip_init, csinn_clip, &difference); #endif return done_testing(); diff --git a/tests/validation_layer/concat.cpp b/tests/validation_layer/concat.cpp index 800b5323..6b2cc019 100644 --- a/tests/validation_layer/concat.cpp +++ b/tests/validation_layer/concat.cpp @@ -25,7 +25,9 @@ int main(int argc, char **argv) int out_size = 1; int *buffer = read_input_data_f32(argv[1]); struct csinn_session *sess = csinn_alloc_session(); - sess->base_run_mode = CSINN_RM_LAYER; + sess->base_run_mode = CSINN_RM_CPU_GRAPH; + sess->model.save_mode = CSINN_RUN_ONLY; + sess->dynamic_shape = CSINN_FALSE; struct csinn_concat_params *params = (csinn_concat_params *)csinn_alloc_params(sizeof(struct csinn_concat_params), sess); @@ -75,14 +77,14 @@ int main(int argc, char **argv) float difference = argc > 2 ? 
atof(argv[2]) : 0.99; #if (DTYPE == 32) - test_concat_op((struct csinn_tensor **)input, output, params, CSINN_QUANT_FLOAT32, - csinn_concat_init, csinn_concat, &difference); + test_concat_op((struct csinn_tensor **)input, output, params, CSINN_DTYPE_FLOAT32, + CSINN_QUANT_FLOAT32, sess, csinn_concat_init, csinn_concat, &difference); #elif (DTYPE == 16) - test_concat_op((struct csinn_tensor **)input, output, params, CSINN_QUANT_FLOAT16, - csinn_concat_init, csinn_concat, &difference); + test_concat_op((struct csinn_tensor **)input, output, params, CSINN_DTYPE_FLOAT16, + CSINN_QUANT_FLOAT16, sess, csinn_concat_init, csinn_concat, &difference); #elif (DTYPE == 8) - test_concat_op((struct csinn_tensor **)input, output, params, CSINN_QUANT_INT8_SYM, - csinn_concat_init, csinn_concat, &difference); + test_concat_op((struct csinn_tensor **)input, output, params, CSINN_DTYPE_INT8, + CSINN_QUANT_INT8_ASYM, sess, csinn_concat_init, csinn_concat, &difference); #endif return done_testing(); diff --git a/tests/validation_layer/convolution.cpp b/tests/validation_layer/convolution.cpp index 5f4f16d9..dc7ad6e9 100644 --- a/tests/validation_layer/convolution.cpp +++ b/tests/validation_layer/convolution.cpp @@ -23,6 +23,9 @@ int main(int argc, char **argv) init_testsuite("Testing function of convolution(layer).\n"); struct csinn_session *sess = csinn_alloc_session(); + // sess->base_run_mode = CSINN_RM_CPU_GRAPH; + // sess->model.save_mode = CSINN_RUN_ONLY; + // sess->dynamic_shape = CSINN_FALSE; sess->base_run_mode = CSINN_RM_LAYER; struct csinn_tensor *input = csinn_alloc_tensor(sess); struct csinn_tensor *output = csinn_alloc_tensor(sess); @@ -103,26 +106,46 @@ int main(int argc, char **argv) output->data = reference->data; float difference = argc > 2 ? 
atof(argv[2]) : 0.99; +/* CSINN_RM_CPU_GRAPH */ +// #if (DTYPE == 32) +// test_conv2d_op(input, output, kernel, bias, params, CSINN_DTYPE_FLOAT32, CSINN_QUANT_FLOAT32, +// sess, csinn_conv2d_init, csinn_conv2d, &difference); +// #elif (DTYPE == 16) +// test_conv2d_op(input, output, kernel, bias, params, CSINN_DTYPE_FLOAT16, CSINN_QUANT_FLOAT16, +// sess, csinn_conv2d_init, csinn_conv2d, &difference); +// #elif (DTYPE == 8) +// test_conv2d_op(input, output, kernel, bias, params, CSINN_DTYPE_INT8, +// CSINN_QUANT_INT8_ASYM_W_SYM, sess, csinn_conv2d_init, csinn_conv2d, +// &difference); +// #elif (DTYPE == 168) +// test_conv2d_op(input, output, kernel, bias, params, CSINN_DTYPE_FLOAT16, +// CSINN_QUANT_FLOAT16_W_INT8, sess, csinn_conv2d_init, csinn_conv2d, +// &difference); +// #elif (DTYPE == 0x168C) +// csinn_realloc_quant_info(kernel, kernel->dim[0]); +// test_conv2d_op(input, output, kernel, bias, params, CSINN_DTYPE_FLOAT16, +// CSINN_QUANT_FLOAT16_W_INT8, sess, csinn_conv2d_init, csinn_conv2d, +// &difference); +// #endif + +/* CSINN_RM_LAYER */ #if (DTYPE == 32) - test_conv2d_op(input, output, kernel, bias, params, CSINN_QUANT_FLOAT32, csinn_conv2d_init, - csinn_conv2d, &difference); + test_conv2d_layer(input, output, kernel, bias, params, CSINN_QUANT_FLOAT32, CSINN_QUANT_FLOAT32, + csinn_conv2d_init, csinn_conv2d, &difference); #elif (DTYPE == 16) - test_conv2d_op(input, output, kernel, bias, params, CSINN_QUANT_FLOAT16, csinn_conv2d_init, - csinn_conv2d, &difference); + test_conv2d_layer(input, output, kernel, bias, params, CSINN_QUANT_FLOAT16, CSINN_QUANT_FLOAT16, + csinn_conv2d_init, csinn_conv2d, &difference); #elif (DTYPE == 8) - test_conv2d_op(input, output, kernel, bias, params, CSINN_QUANT_INT8_SYM, csinn_conv2d_init, - csinn_conv2d, &difference); + test_conv2d_layer(input, output, kernel, bias, params, CSINN_QUANT_INT8_ASYM, + CSINN_QUANT_INT8_SYM, csinn_conv2d_init, csinn_conv2d, &difference); #elif (DTYPE == 168) - test_conv2d_op_fp16_w_int8(input, 
output, kernel, bias, params, CSINN_QUANT_FLOAT16_W_INT8, - csinn_conv2d_init, csinn_conv2d, &difference); - + test_conv2d_layer(input, output, kernel, bias, params, CSINN_QUANT_FLOAT16, + CSINN_QUANT_INT8_SYM, csinn_conv2d_init, csinn_conv2d, &difference); +#elif (DTYPE == 0x168C) + csinn_realloc_quant_info(kernel, kernel->dim[0]); + test_conv2d_layer(input, output, kernel, bias, params, CSINN_QUANT_FLOAT16, + CSINN_QUANT_INT8_SYM, csinn_conv2d_init, csinn_conv2d, &difference); #endif - // if (params->base.api != CSINN_RVV && params->base.api != CSINN_C906 && params->base.api != - // CSINN_C920) { - // test_conv2d_op(input, output, kernel, bias, params, CSINN_QUANT_INT8_ASYM, - // csinn_conv2d_init, csinn_conv2d, &difference); - // } - return done_testing(); } diff --git a/tests/validation_layer/convolution1d.cpp b/tests/validation_layer/convolution1d.cpp index 6b6b55d7..73adbd6a 100644 --- a/tests/validation_layer/convolution1d.cpp +++ b/tests/validation_layer/convolution1d.cpp @@ -23,7 +23,9 @@ int main(int argc, char **argv) init_testsuite("Testing function of convolution1d(layer).\n"); struct csinn_session *sess = csinn_alloc_session(); - sess->base_run_mode = CSINN_RM_LAYER; + sess->base_run_mode = CSINN_RM_CPU_GRAPH; + sess->model.save_mode = CSINN_RUN_ONLY; + sess->dynamic_shape = CSINN_FALSE; struct csinn_tensor *input = csinn_alloc_tensor(sess); struct csinn_tensor *output = csinn_alloc_tensor(sess); struct csinn_tensor *reference = csinn_alloc_tensor(sess); @@ -95,17 +97,21 @@ int main(int argc, char **argv) float difference = argc > 2 ? 
atof(argv[2]) : 0.99; #if (DTYPE == 32) - test_fully_op(input, output, kernel, bias, params, CSINN_QUANT_FLOAT32, csinn_conv1d_init, - csinn_conv1d, &difference); + test_fully_op(input, output, kernel, bias, params, CSINN_DTYPE_FLOAT32, CSINN_QUANT_FLOAT32, + sess, csinn_conv1d_init, csinn_conv1d, &difference); #elif (DTYPE == 16) - test_fully_op(input, output, kernel, bias, params, CSINN_QUANT_FLOAT16, csinn_conv1d_init, - csinn_conv1d, &difference); + test_fully_op(input, output, kernel, bias, params, CSINN_DTYPE_FLOAT16, CSINN_QUANT_FLOAT16, + sess, csinn_conv1d_init, csinn_conv1d, &difference); #elif (DTYPE == 8) - test_fully_op(input, output, kernel, bias, params, CSINN_QUANT_INT8_SYM, csinn_conv1d_init, - csinn_conv1d, &difference); + test_fully_op(input, output, kernel, bias, params, CSINN_DTYPE_INT8, + CSINN_QUANT_INT8_ASYM_W_SYM, sess, csinn_conv1d_init, csinn_conv1d, &difference); #elif (DTYPE == 168) - test_conv1d_op_fp16_w_int8(input, output, kernel, bias, params, CSINN_QUANT_FLOAT16_W_INT8, - csinn_conv1d_init, csinn_conv1d, &difference); + test_fully_op(input, output, kernel, bias, params, CSINN_DTYPE_FLOAT16, + CSINN_QUANT_FLOAT16_W_INT8, sess, csinn_conv1d_init, csinn_conv1d, &difference); +#elif (DTYPE == 0x168C) + csinn_realloc_quant_info(kernel, kernel->dim[0]); + test_fully_op(input, output, kernel, bias, params, CSINN_DTYPE_FLOAT16, + CSINN_QUANT_FLOAT16_W_INT8, sess, csinn_conv1d_init, csinn_conv1d, &difference); #endif return done_testing(); diff --git a/tests/validation_layer/convolution_nhwc.cpp b/tests/validation_layer/convolution_nhwc.cpp index c5472ff4..89026a11 100644 --- a/tests/validation_layer/convolution_nhwc.cpp +++ b/tests/validation_layer/convolution_nhwc.cpp @@ -23,7 +23,9 @@ int main(int argc, char **argv) init_testsuite("Testing function of convolution(layer).\n"); struct csinn_session *sess = csinn_alloc_session(); - sess->base_run_mode = CSINN_RM_LAYER; + sess->base_run_mode = CSINN_RM_CPU_GRAPH; + sess->model.save_mode = 
CSINN_RUN_ONLY; + sess->dynamic_shape = CSINN_FALSE; struct csinn_tensor *input = csinn_alloc_tensor(sess); struct csinn_tensor *output = csinn_alloc_tensor(sess); struct csinn_tensor *reference = csinn_alloc_tensor(sess); @@ -103,14 +105,14 @@ int main(int argc, char **argv) float difference = argc > 2 ? atof(argv[2]) : 0.99; #if (DTYPE == 32) - test_conv2d_op(input, output, kernel, bias, params, CSINN_QUANT_FLOAT32, csinn_conv2d_init, - csinn_conv2d, &difference); + test_conv2d_op(input, output, kernel, bias, params, CSINN_DTYPE_FLOAT32, CSINN_QUANT_FLOAT32, + sess, csinn_conv2d_init, csinn_conv2d, &difference); #elif (DTYPE == 16) - test_conv2d_op(input, output, kernel, bias, params, CSINN_QUANT_FLOAT16, csinn_conv2d_init, - csinn_conv2d, &difference); + test_conv2d_op(input, output, kernel, bias, params, CSINN_DTYPE_FLOAT16, CSINN_QUANT_FLOAT16, + sess, csinn_conv2d_init, csinn_conv2d, &difference); #elif (DTYPE == 8) - test_conv2d_op(input, output, kernel, bias, params, CSINN_QUANT_INT8_SYM, csinn_conv2d_init, - csinn_conv2d, &difference); + test_conv2d_op(input, output, kernel, bias, params, CSINN_DTYPE_INT8, + CSINN_QUANT_INT8_ASYM_W_SYM, sess, csinn_conv2d_init, csinn_conv2d, &difference); #endif diff --git a/tests/validation_layer/deconvolution.cpp b/tests/validation_layer/deconvolution.cpp index 36a62bd9..dd48d995 100644 --- a/tests/validation_layer/deconvolution.cpp +++ b/tests/validation_layer/deconvolution.cpp @@ -23,7 +23,9 @@ int main(int argc, char **argv) init_testsuite("Testing function of deconvolution(layer).\n"); struct csinn_session *sess = csinn_alloc_session(); - sess->base_run_mode = CSINN_RM_LAYER; + sess->base_run_mode = CSINN_RM_CPU_GRAPH; + sess->model.save_mode = CSINN_RUN_ONLY; + sess->dynamic_shape = CSINN_FALSE; struct csinn_tensor *input = csinn_alloc_tensor(sess); struct csinn_tensor *output = csinn_alloc_tensor(sess); struct csinn_tensor *reference = csinn_alloc_tensor(sess); @@ -104,17 +106,24 @@ int main(int argc, char **argv) 
float *output_data = (float *)output->data; #if (DTYPE == 32) - test_conv2d_op(input, output, kernel, bias, params, CSINN_QUANT_FLOAT32, csinn_deconv2d_init, - csinn_deconv2d, &difference); + test_conv2d_op(input, output, kernel, bias, params, CSINN_DTYPE_FLOAT32, CSINN_QUANT_FLOAT32, + sess, csinn_deconv2d_init, csinn_deconv2d, &difference); #elif (DTYPE == 16) - test_conv2d_op(input, output, kernel, bias, params, CSINN_QUANT_FLOAT16, csinn_deconv2d_init, - csinn_deconv2d, &difference); -#elif (DTYPE == 168) - test_conv2d_op_fp16_w_int8(input, output, kernel, bias, params, CSINN_QUANT_FLOAT16_W_INT8, - csinn_deconv2d_init, csinn_deconv2d, &difference); + test_conv2d_op(input, output, kernel, bias, params, CSINN_DTYPE_FLOAT16, CSINN_QUANT_FLOAT16, + sess, csinn_deconv2d_init, csinn_deconv2d, &difference); #elif (DTYPE == 8) - test_conv2d_op(input, output, kernel, bias, params, CSINN_QUANT_INT8_SYM, csinn_deconv2d_init, - csinn_deconv2d, &difference); + test_conv2d_op(input, output, kernel, bias, params, CSINN_DTYPE_INT8, + CSINN_QUANT_INT8_ASYM_W_SYM, sess, csinn_deconv2d_init, csinn_deconv2d, + &difference); +#elif (DTYPE == 168) + test_conv2d_op(input, output, kernel, bias, params, CSINN_DTYPE_FLOAT16, + CSINN_QUANT_FLOAT16_W_INT8, sess, csinn_deconv2d_init, csinn_deconv2d, + &difference); +#elif (DTYPE == 0x168C) + csinn_realloc_quant_info(kernel, kernel->dim[0]); + test_conv2d_op(input, output, kernel, bias, params, CSINN_DTYPE_FLOAT16, + CSINN_QUANT_FLOAT16_W_INT8, sess, csinn_deconv2d_init, csinn_deconv2d, + &difference); #endif return done_testing(); } diff --git a/tests/validation_layer/depthwise_convolution.cpp b/tests/validation_layer/depthwise_convolution.cpp index b9e3d53c..5fefb77f 100644 --- a/tests/validation_layer/depthwise_convolution.cpp +++ b/tests/validation_layer/depthwise_convolution.cpp @@ -23,6 +23,9 @@ int main(int argc, char **argv) init_testsuite("Testing function of depthwise convolution(layer).\n"); struct csinn_session *sess = 
csinn_alloc_session(); + // sess->base_run_mode = CSINN_RM_CPU_GRAPH; + // sess->model.save_mode = CSINN_RUN_ONLY; + // sess->dynamic_shape = CSINN_FALSE; sess->base_run_mode = CSINN_RM_LAYER; struct csinn_tensor *input = csinn_alloc_tensor(sess); struct csinn_tensor *output = csinn_alloc_tensor(sess); @@ -102,18 +105,45 @@ int main(int argc, char **argv) output->data = reference->data; float difference = argc > 2 ? atof(argv[2]) : 0.99; +/* CSINN_RM_CPU_GRAPH */ +// #if (DTYPE == 32) +// test_conv2d_op(input, output, kernel, bias, params, CSINN_DTYPE_FLOAT32, CSINN_QUANT_FLOAT32, +// sess, csinn_conv2d_init, csinn_conv2d, &difference); +// #elif (DTYPE == 16) +// test_conv2d_op(input, output, kernel, bias, params, CSINN_DTYPE_FLOAT16, CSINN_QUANT_FLOAT16, +// sess, csinn_conv2d_init, csinn_conv2d, &difference); +// #elif (DTYPE == 8) +// test_conv2d_op(input, output, kernel, bias, params, CSINN_DTYPE_INT8, +// CSINN_QUANT_INT8_ASYM_W_SYM, sess, csinn_conv2d_init, csinn_conv2d, +// &difference); +// #elif (DTYPE == 168) +// test_conv2d_op(input, output, kernel, bias, params, CSINN_DTYPE_FLOAT16, +// CSINN_QUANT_FLOAT16_W_INT8, sess, csinn_conv2d_init, csinn_conv2d, +// &difference); +// #elif (DTYPE == 0x168C) +// csinn_realloc_quant_info(kernel, kernel->dim[0]); +// test_conv2d_op(input, output, kernel, bias, params, CSINN_DTYPE_FLOAT16, +// CSINN_QUANT_FLOAT16_W_INT8, sess, csinn_conv2d_init, csinn_conv2d, +// &difference); +// #endif + +/* CSINN_RM_LAYER */ #if (DTYPE == 32) - test_conv2d_op(input, output, kernel, bias, params, CSINN_QUANT_FLOAT32, csinn_conv2d_init, - csinn_conv2d, &difference); + test_conv2d_layer(input, output, kernel, bias, params, CSINN_QUANT_FLOAT32, CSINN_QUANT_FLOAT32, + csinn_conv2d_init, csinn_conv2d, &difference); #elif (DTYPE == 16) - test_conv2d_op(input, output, kernel, bias, params, CSINN_QUANT_FLOAT16, csinn_conv2d_init, - csinn_conv2d, &difference); + test_conv2d_layer(input, output, kernel, bias, params, CSINN_QUANT_FLOAT16, 
CSINN_QUANT_FLOAT16, + csinn_conv2d_init, csinn_conv2d, &difference); #elif (DTYPE == 8) - test_conv2d_op(input, output, kernel, bias, params, CSINN_QUANT_INT8_SYM, csinn_conv2d_init, - csinn_conv2d, &difference); + test_conv2d_layer(input, output, kernel, bias, params, CSINN_QUANT_INT8_ASYM, + CSINN_QUANT_INT8_SYM, csinn_conv2d_init, csinn_conv2d, &difference); #elif (DTYPE == 168) - test_conv2d_op_fp16_w_int8(input, output, kernel, bias, params, CSINN_QUANT_FLOAT16_W_INT8, - csinn_conv2d_init, csinn_conv2d, &difference); + test_conv2d_layer(input, output, kernel, bias, params, CSINN_QUANT_FLOAT16, + CSINN_QUANT_INT8_SYM, csinn_conv2d_init, csinn_conv2d, &difference); +#elif (DTYPE == 0x168C) + csinn_realloc_quant_info(kernel, kernel->dim[0]); + test_conv2d_layer(input, output, kernel, bias, params, CSINN_QUANT_FLOAT16, + CSINN_QUANT_INT8_SYM, csinn_conv2d_init, csinn_conv2d, &difference); #endif return done_testing(); diff --git a/tests/validation_layer/depthwise_convolution1d.cpp b/tests/validation_layer/depthwise_convolution1d.cpp index 0dbb799e..319d224a 100644 --- a/tests/validation_layer/depthwise_convolution1d.cpp +++ b/tests/validation_layer/depthwise_convolution1d.cpp @@ -23,7 +23,9 @@ int main(int argc, char **argv) init_testsuite("Testing function of convolution1d(layer).\n"); struct csinn_session *sess = csinn_alloc_session(); - sess->base_run_mode = CSINN_RM_LAYER; + sess->base_run_mode = CSINN_RM_CPU_GRAPH; + sess->model.save_mode = CSINN_RUN_ONLY; + sess->dynamic_shape = CSINN_FALSE; struct csinn_tensor *input = csinn_alloc_tensor(sess); struct csinn_tensor *output = csinn_alloc_tensor(sess); struct csinn_tensor *reference = csinn_alloc_tensor(sess); @@ -95,17 +97,21 @@ int main(int argc, char **argv) float difference = argc > 2 ? 
atof(argv[2]) : 0.99; #if (DTYPE == 32) - test_fully_op(input, output, kernel, bias, params, CSINN_QUANT_FLOAT32, csinn_conv1d_init, - csinn_conv1d, &difference); + test_fully_op(input, output, kernel, bias, params, CSINN_DTYPE_FLOAT32, CSINN_QUANT_FLOAT32, + sess, csinn_conv1d_init, csinn_conv1d, &difference); #elif (DTYPE == 16) - test_fully_op(input, output, kernel, bias, params, CSINN_QUANT_FLOAT16, csinn_conv1d_init, - csinn_conv1d, &difference); + test_fully_op(input, output, kernel, bias, params, CSINN_DTYPE_FLOAT16, CSINN_QUANT_FLOAT16, + sess, csinn_conv1d_init, csinn_conv1d, &difference); #elif (DTYPE == 8) - test_fully_op(input, output, kernel, bias, params, CSINN_QUANT_INT8_SYM, csinn_conv1d_init, - csinn_conv1d, &difference); + test_fully_op(input, output, kernel, bias, params, CSINN_DTYPE_INT8, + CSINN_QUANT_INT8_ASYM_W_SYM, sess, csinn_conv1d_init, csinn_conv1d, &difference); #elif (DTYPE == 168) - test_conv1d_op_fp16_w_int8(input, output, kernel, bias, params, CSINN_QUANT_FLOAT16_W_INT8, - csinn_conv1d_init, csinn_conv1d, &difference); + test_fully_op(input, output, kernel, bias, params, CSINN_DTYPE_FLOAT16, + CSINN_QUANT_FLOAT16_W_INT8, sess, csinn_conv1d_init, csinn_conv1d, &difference); +#elif (DTYPE == 0x168C) + csinn_realloc_quant_info(kernel, kernel->dim[0]); + test_fully_op(input, output, kernel, bias, params, CSINN_DTYPE_FLOAT16, + CSINN_QUANT_FLOAT16_W_INT8, sess, csinn_conv1d_init, csinn_conv1d, &difference); #endif return done_testing(); diff --git a/tests/validation_layer/depthwise_convolution_nhwc.cpp b/tests/validation_layer/depthwise_convolution_nhwc.cpp index cabbeb3d..0c8ef8b2 100644 --- a/tests/validation_layer/depthwise_convolution_nhwc.cpp +++ b/tests/validation_layer/depthwise_convolution_nhwc.cpp @@ -23,7 +23,9 @@ int main(int argc, char **argv) init_testsuite("Testing function of depthwise convolution(layer).\n"); struct csinn_session *sess = csinn_alloc_session(); - sess->base_run_mode = CSINN_RM_LAYER; + sess->base_run_mode 
= CSINN_RM_CPU_GRAPH; + sess->model.save_mode = CSINN_RUN_ONLY; + sess->dynamic_shape = CSINN_FALSE; struct csinn_tensor *input = csinn_alloc_tensor(sess); struct csinn_tensor *output = csinn_alloc_tensor(sess); struct csinn_tensor *reference = csinn_alloc_tensor(sess); @@ -102,14 +104,14 @@ int main(int argc, char **argv) float difference = argc > 2 ? atof(argv[2]) : 0.99; #if (DTYPE == 32) - test_conv2d_op(input, output, kernel, bias, params, CSINN_QUANT_FLOAT32, csinn_conv2d_init, - csinn_conv2d, &difference); + test_conv2d_op(input, output, kernel, bias, params, CSINN_DTYPE_FLOAT32, CSINN_QUANT_FLOAT32, + sess, csinn_conv2d_init, csinn_conv2d, &difference); #elif (DTYPE == 16) - test_conv2d_op(input, output, kernel, bias, params, CSINN_QUANT_FLOAT16, csinn_conv2d_init, - csinn_conv2d, &difference); + test_conv2d_op(input, output, kernel, bias, params, CSINN_DTYPE_FLOAT16, CSINN_QUANT_FLOAT16, + sess, csinn_conv2d_init, csinn_conv2d, &difference); #elif (DTYPE == 8) - test_conv2d_op(input, output, kernel, bias, params, CSINN_QUANT_INT8_SYM, csinn_conv2d_init, - csinn_conv2d, &difference); + test_conv2d_op(input, output, kernel, bias, params, CSINN_DTYPE_INT8, + CSINN_QUANT_INT8_ASYM_W_SYM, sess, csinn_conv2d_init, csinn_conv2d, &difference); #endif return done_testing(); diff --git a/tests/validation_layer/div.cpp b/tests/validation_layer/div.cpp index 62736df7..072799eb 100644 --- a/tests/validation_layer/div.cpp +++ b/tests/validation_layer/div.cpp @@ -23,7 +23,9 @@ int main(int argc, char **argv) init_testsuite("Testing function of div(layer).\n"); struct csinn_session *sess = csinn_alloc_session(); - sess->base_run_mode = CSINN_RM_LAYER; + sess->base_run_mode = CSINN_RM_CPU_GRAPH; + sess->model.save_mode = CSINN_RUN_ONLY; + sess->dynamic_shape = CSINN_FALSE; struct csinn_tensor *input0 = csinn_alloc_tensor(sess); struct csinn_tensor *input1 = csinn_alloc_tensor(sess); struct csinn_tensor *output = csinn_alloc_tensor(sess); @@ -77,14 +79,14 @@ int main(int 
argc, char **argv) float difference = argc > 2 ? atof(argv[2]) : 0.9; #if (DTYPE == 32) - test_binary_op(input0, input1, output, params, CSINN_QUANT_FLOAT32, csinn_div_init, csinn_div, - &difference); + test_binary_op(input0, input1, output, params, CSINN_DTYPE_FLOAT32, CSINN_QUANT_FLOAT32, sess, + csinn_div_init, csinn_div, &difference); #elif (DTYPE == 16) - test_binary_op(input0, input1, output, params, CSINN_QUANT_FLOAT16, csinn_div_init, csinn_div, - &difference); + test_binary_op(input0, input1, output, params, CSINN_DTYPE_FLOAT16, CSINN_QUANT_FLOAT16, sess, + csinn_div_init, csinn_div, &difference); #elif (DTYPE == 8) - test_binary_op(input0, input1, output, params, CSINN_QUANT_INT8_ASYM, csinn_div_init, csinn_div, - &difference); + test_binary_op(input0, input1, output, params, CSINN_DTYPE_INT8, CSINN_QUANT_INT8_ASYM, sess, + csinn_div_init, csinn_div, &difference); #endif return done_testing(); diff --git a/tests/validation_layer/erf.cpp b/tests/validation_layer/erf.cpp index fc4104ae..afea3364 100644 --- a/tests/validation_layer/erf.cpp +++ b/tests/validation_layer/erf.cpp @@ -23,7 +23,9 @@ int main(int argc, char **argv) init_testsuite("Testing function of erf(layer).\n"); struct csinn_session *sess = csinn_alloc_session(); - sess->base_run_mode = CSINN_RM_LAYER; + sess->base_run_mode = CSINN_RM_CPU_GRAPH; + sess->model.save_mode = CSINN_RUN_ONLY; + sess->dynamic_shape = CSINN_FALSE; struct csinn_tensor *input = csinn_alloc_tensor(sess); struct csinn_tensor *output = csinn_alloc_tensor(sess); struct csinn_tensor *reference = csinn_alloc_tensor(sess); @@ -57,14 +59,14 @@ int main(int argc, char **argv) float difference = argc > 2 ? 
atof(argv[2]) : 0.99; #if (DTYPE == 32) - test_unary_op(input, output, params, CSINN_QUANT_FLOAT32, csinn_erf_init, csinn_erf, - &difference); + test_unary_op(input, output, params, CSINN_DTYPE_FLOAT32, CSINN_QUANT_FLOAT32, sess, + csinn_erf_init, csinn_erf, &difference); #elif (DTYPE == 16) - test_unary_op(input, output, params, CSINN_QUANT_FLOAT16, csinn_erf_init, csinn_erf, - &difference); + test_unary_op(input, output, params, CSINN_DTYPE_FLOAT16, CSINN_QUANT_FLOAT16, sess, + csinn_erf_init, csinn_erf, &difference); #elif (DTYPE == 8) - test_unary_op(input, output, params, CSINN_QUANT_INT8_SYM, csinn_erf_init, csinn_erf, - &difference); + test_unary_op(input, output, params, CSINN_DTYPE_INT8, CSINN_QUANT_INT8_ASYM, sess, + csinn_erf_init, csinn_erf, &difference); #endif return done_testing(); } \ No newline at end of file diff --git a/tests/validation_layer/fullyconnected.cpp b/tests/validation_layer/fullyconnected.cpp index 8bd68530..62eabd5b 100644 --- a/tests/validation_layer/fullyconnected.cpp +++ b/tests/validation_layer/fullyconnected.cpp @@ -23,7 +23,9 @@ int main(int argc, char **argv) init_testsuite("Testing function of fullyconnected(layer).\n"); struct csinn_session *sess = csinn_alloc_session(); - sess->base_run_mode = CSINN_RM_LAYER; + sess->base_run_mode = CSINN_RM_CPU_GRAPH; + sess->model.save_mode = CSINN_RUN_ONLY; + sess->dynamic_shape = CSINN_FALSE; struct csinn_tensor *input = csinn_alloc_tensor(sess); struct csinn_tensor *output = csinn_alloc_tensor(sess); struct csinn_tensor *reference = csinn_alloc_tensor(sess); @@ -76,17 +78,24 @@ int main(int argc, char **argv) float difference = argc > 2 ? 
atof(argv[2]) : 0.99; #if (DTYPE == 32) - test_fully_op(input, output, weight, bias, params, CSINN_QUANT_FLOAT32, - csinn_fullyconnected_init, csinn_fullyconnected, &difference); + test_fully_op(input, output, weight, bias, params, CSINN_DTYPE_FLOAT32, CSINN_QUANT_FLOAT32, + sess, csinn_fullyconnected_init, csinn_fullyconnected, &difference); #elif (DTYPE == 16) - test_fully_op(input, output, weight, bias, params, CSINN_QUANT_FLOAT16, - csinn_fullyconnected_init, csinn_fullyconnected, &difference); + test_fully_op(input, output, weight, bias, params, CSINN_DTYPE_FLOAT16, CSINN_QUANT_FLOAT16, + sess, csinn_fullyconnected_init, csinn_fullyconnected, &difference); #elif (DTYPE == 8) - test_fully_op(input, output, weight, bias, params, CSINN_QUANT_INT8_SYM, - csinn_fullyconnected_init, csinn_fullyconnected, &difference); + test_fully_op(input, output, weight, bias, params, CSINN_DTYPE_INT8, + CSINN_QUANT_INT8_ASYM_W_SYM, sess, csinn_fullyconnected_init, + csinn_fullyconnected, &difference); #elif (DTYPE == 168) - test_fully_op_fp16_w_int8(input, output, weight, bias, params, CSINN_QUANT_FLOAT16_W_INT8, - csinn_fullyconnected_init, csinn_fullyconnected, &difference); + test_fully_op(input, output, weight, bias, params, CSINN_DTYPE_FLOAT16, + CSINN_QUANT_FLOAT16_W_INT8, sess, csinn_fullyconnected_init, csinn_fullyconnected, + &difference); +#elif (DTYPE == 0x168C) + csinn_realloc_quant_info(weight, weight->dim[0]); + test_fully_op(input, output, weight, bias, params, CSINN_DTYPE_FLOAT16, + CSINN_QUANT_FLOAT16_W_INT8, sess, csinn_fullyconnected_init, csinn_fullyconnected, + &difference); #endif return done_testing(); diff --git a/tests/validation_layer/gather.cpp b/tests/validation_layer/gather.cpp index 93b9d09c..d2c46b42 100644 --- a/tests/validation_layer/gather.cpp +++ b/tests/validation_layer/gather.cpp @@ -22,7 +22,9 @@ int main(int argc, char **argv) { init_testsuite("Testing function of gather(layer).\n"); struct csinn_session *sess = csinn_alloc_session(); - 
sess->base_run_mode = CSINN_RM_LAYER; + sess->base_run_mode = CSINN_RM_CPU_GRAPH; + sess->model.save_mode = CSINN_RUN_ONLY; + sess->dynamic_shape = CSINN_FALSE; struct csinn_tensor *input = csinn_alloc_tensor(sess); struct csinn_tensor *indices = csinn_alloc_tensor(sess); struct csinn_tensor *output = csinn_alloc_tensor(sess); @@ -69,7 +71,7 @@ int main(int argc, char **argv) input->quant_channel = 1; indices->dtype = CSINN_DTYPE_INT64; indices->layout = CSINN_LAYOUT_NCHW; - indices->is_const = 0; + indices->is_const = 1; indices->quant_channel = 1; output->dtype = CSINN_DTYPE_FLOAT32; output->layout = CSINN_LAYOUT_NCHW; @@ -93,14 +95,14 @@ int main(int argc, char **argv) indices->data = data_i64; #if (DTYPE == 32) - test_gather_op(input, indices, output, params, CSINN_QUANT_FLOAT32, csinn_gather_init, - csinn_gather, &difference); + test_gather_op(input, indices, output, params, CSINN_DTYPE_FLOAT32, CSINN_QUANT_FLOAT32, sess, + csinn_gather_init, csinn_gather, &difference); #elif (DTYPE == 16) - test_gather_op(input, indices, output, params, CSINN_QUANT_FLOAT16, csinn_gather_init, - csinn_gather, &difference); + test_gather_op(input, indices, output, params, CSINN_DTYPE_FLOAT16, CSINN_QUANT_FLOAT16, sess, + csinn_gather_init, csinn_gather, &difference); #elif (DTYPE == 8) - test_gather_op(input, indices, output, params, CSINN_QUANT_INT8_ASYM, csinn_gather_init, - csinn_gather, &difference); + test_gather_op(input, indices, output, params, CSINN_DTYPE_INT8, CSINN_QUANT_INT8_ASYM, sess, + csinn_gather_init, csinn_gather, &difference); #endif return done_testing(); diff --git a/tests/validation_layer/global_avgpool.cpp b/tests/validation_layer/global_avgpool.cpp index 9acf51ae..bde5001c 100644 --- a/tests/validation_layer/global_avgpool.cpp +++ b/tests/validation_layer/global_avgpool.cpp @@ -23,6 +23,9 @@ int main(int argc, char **argv) init_testsuite("Testing function of global avgpool(layer).\n"); struct csinn_session *sess = csinn_alloc_session(); + // 
sess->base_run_mode = CSINN_RM_CPU_GRAPH; + // sess->model.save_mode = CSINN_RUN_ONLY; + // sess->dynamic_shape = CSINN_FALSE; sess->base_run_mode = CSINN_RM_LAYER; struct csinn_tensor *input = csinn_alloc_tensor(sess); struct csinn_tensor *output = csinn_alloc_tensor(sess); @@ -66,15 +69,28 @@ int main(int argc, char **argv) output->data = reference->data; float difference = argc > 2 ? atof(argv[2]) : 0.99; +/* CSINN_RM_CPU_GRAPH */ +// #if (DTYPE == 32) +// test_unary_op(input, output, params, CSINN_DTYPE_FLOAT32, CSINN_QUANT_FLOAT32, sess, +// csinn_global_avgpool2d_init, csinn_global_avgpool2d, &difference); +// #elif (DTYPE == 16) +// test_unary_op(input, output, params, CSINN_DTYPE_FLOAT16, CSINN_QUANT_FLOAT16, sess, +// csinn_global_avgpool2d_init, csinn_global_avgpool2d, &difference); +// #elif (DTYPE == 8) +// test_unary_op(input, output, params, CSINN_DTYPE_INT8, CSINN_QUANT_INT8_ASYM, sess, +// csinn_global_avgpool2d_init, csinn_global_avgpool2d, &difference); +// #endif + +/* CSINN_RM_LAYER */ #if (DTYPE == 32) - test_unary_op(input, output, params, CSINN_QUANT_FLOAT32, csinn_global_avgpool2d_init, - csinn_global_avgpool2d, &difference); + test_unary_layer(input, output, params, CSINN_QUANT_FLOAT32, csinn_global_avgpool2d_init, + csinn_global_avgpool2d, &difference); #elif (DTYPE == 16) - test_unary_op(input, output, params, CSINN_QUANT_FLOAT16, csinn_global_avgpool2d_init, - csinn_global_avgpool2d, &difference); + test_unary_layer(input, output, params, CSINN_QUANT_FLOAT16, csinn_global_avgpool2d_init, + csinn_global_avgpool2d, &difference); #elif (DTYPE == 8) - test_unary_op(input, output, params, CSINN_QUANT_INT8_SYM, csinn_global_avgpool2d_init, - csinn_global_avgpool2d, &difference); + test_unary_layer(input, output, params, CSINN_QUANT_INT8_ASYM, csinn_global_avgpool2d_init, + csinn_global_avgpool2d, &difference); #endif return done_testing(); diff --git a/tests/validation_layer/global_avgpool_nhwc.cpp 
b/tests/validation_layer/global_avgpool_nhwc.cpp index a3b6e30d..395be3af 100644 --- a/tests/validation_layer/global_avgpool_nhwc.cpp +++ b/tests/validation_layer/global_avgpool_nhwc.cpp @@ -23,7 +23,9 @@ int main(int argc, char **argv) init_testsuite("Testing function of global avgpool(layer).\n"); struct csinn_session *sess = csinn_alloc_session(); - sess->base_run_mode = CSINN_RM_LAYER; + sess->base_run_mode = CSINN_RM_CPU_GRAPH; + sess->model.save_mode = CSINN_RUN_ONLY; + sess->dynamic_shape = CSINN_FALSE; struct csinn_tensor *input = csinn_alloc_tensor(sess); struct csinn_tensor *output = csinn_alloc_tensor(sess); struct csinn_tensor *reference = csinn_alloc_tensor(sess); @@ -67,14 +69,14 @@ int main(int argc, char **argv) float difference = argc > 2 ? atof(argv[2]) : 0.99; #if (DTYPE == 32) - test_unary_op(input, output, params, CSINN_QUANT_FLOAT32, csinn_global_avgpool2d_init, - csinn_global_avgpool2d, &difference); + test_unary_op(input, output, params, CSINN_DTYPE_FLOAT32, CSINN_QUANT_FLOAT32, sess, + csinn_global_avgpool2d_init, csinn_global_avgpool2d, &difference); #elif (DTYPE == 16) - test_unary_op(input, output, params, CSINN_QUANT_FLOAT16, csinn_global_avgpool2d_init, - csinn_global_avgpool2d, &difference); + test_unary_op(input, output, params, CSINN_DTYPE_FLOAT16, CSINN_QUANT_FLOAT16, sess, + csinn_global_avgpool2d_init, csinn_global_avgpool2d, &difference); #elif (DTYPE == 8) - test_unary_op(input, output, params, CSINN_QUANT_INT8_SYM, csinn_global_avgpool2d_init, - csinn_global_avgpool2d, &difference); + test_unary_op(input, output, params, CSINN_DTYPE_INT8, CSINN_QUANT_INT8_ASYM, sess, + csinn_global_avgpool2d_init, csinn_global_avgpool2d, &difference); #endif return done_testing(); diff --git a/tests/validation_layer/global_maxpool.cpp b/tests/validation_layer/global_maxpool.cpp index bcc855d4..7ac71ecb 100644 --- a/tests/validation_layer/global_maxpool.cpp +++ b/tests/validation_layer/global_maxpool.cpp @@ -23,6 +23,9 @@ int main(int argc, 
char **argv) init_testsuite("Testing function of global maxpool(layer).\n"); struct csinn_session *sess = csinn_alloc_session(); + // sess->base_run_mode = CSINN_RM_CPU_GRAPH; + // sess->model.save_mode = CSINN_RUN_ONLY; + // sess->dynamic_shape = CSINN_FALSE; sess->base_run_mode = CSINN_RM_LAYER; struct csinn_tensor *input = csinn_alloc_tensor(sess); struct csinn_tensor *output = csinn_alloc_tensor(sess); @@ -66,15 +69,28 @@ int main(int argc, char **argv) output->data = reference->data; float difference = argc > 2 ? atof(argv[2]) : 0.99; +/* CSINN_RM_CPU_GRAPH */ +// #if (DTYPE == 32) +// test_maxpool_op(input, output, params, CSINN_DTYPE_FLOAT32, CSINN_QUANT_FLOAT32, sess, +// csinn_global_maxpool2d_init, csinn_global_maxpool2d, &difference); +// #elif (DTYPE == 16) +// test_maxpool_op(input, output, params, CSINN_DTYPE_FLOAT16, CSINN_QUANT_FLOAT16, sess, +// csinn_global_maxpool2d_init, csinn_global_maxpool2d, &difference); +// #elif (DTYPE == 8) +// test_maxpool_op(input, output, params, CSINN_DTYPE_INT8, CSINN_QUANT_INT8_ASYM, sess, +// csinn_global_maxpool2d_init, csinn_global_maxpool2d, &difference); +// #endif + +/* CSINN_RM_LAYER */ #if (DTYPE == 32) - test_unary_op(input, output, params, CSINN_QUANT_FLOAT32, csinn_global_maxpool2d_init, - csinn_global_maxpool2d, &difference); + test_maxpool_layer(input, output, params, CSINN_QUANT_FLOAT32, csinn_global_maxpool2d_init, + csinn_global_maxpool2d, &difference); #elif (DTYPE == 16) - test_unary_op(input, output, params, CSINN_QUANT_FLOAT16, csinn_global_maxpool2d_init, - csinn_global_maxpool2d, &difference); + test_maxpool_layer(input, output, params, CSINN_QUANT_FLOAT16, csinn_global_maxpool2d_init, + csinn_global_maxpool2d, &difference); #elif (DTYPE == 8) - test_unary_op(input, output, params, CSINN_QUANT_INT8_SYM, csinn_global_maxpool2d_init, - csinn_global_maxpool2d, &difference); + test_maxpool_layer(input, output, params, CSINN_QUANT_INT8_ASYM, csinn_global_maxpool2d_init, + csinn_global_maxpool2d, 
&difference); #endif return done_testing(); diff --git a/tests/validation_layer/global_maxpool_nhwc.cpp b/tests/validation_layer/global_maxpool_nhwc.cpp index 29946d89..2213ed8c 100644 --- a/tests/validation_layer/global_maxpool_nhwc.cpp +++ b/tests/validation_layer/global_maxpool_nhwc.cpp @@ -23,7 +23,9 @@ int main(int argc, char **argv) init_testsuite("Testing function of global maxpool(layer).\n"); struct csinn_session *sess = csinn_alloc_session(); - sess->base_run_mode = CSINN_RM_LAYER; + sess->base_run_mode = CSINN_RM_CPU_GRAPH; + sess->model.save_mode = CSINN_RUN_ONLY; + sess->dynamic_shape = CSINN_FALSE; struct csinn_tensor *input = csinn_alloc_tensor(sess); struct csinn_tensor *output = csinn_alloc_tensor(sess); struct csinn_tensor *reference = csinn_alloc_tensor(sess); @@ -67,14 +69,14 @@ int main(int argc, char **argv) float difference = argc > 2 ? atof(argv[2]) : 0.99; #if (DTYPE == 32) - test_unary_op(input, output, params, CSINN_QUANT_FLOAT32, csinn_global_maxpool2d_init, - csinn_global_maxpool2d, &difference); + test_maxpool_op(input, output, params, CSINN_DTYPE_FLOAT32, CSINN_QUANT_FLOAT32, sess, + csinn_global_maxpool2d_init, csinn_global_maxpool2d, &difference); #elif (DTYPE == 16) - test_unary_op(input, output, params, CSINN_QUANT_FLOAT16, csinn_global_maxpool2d_init, - csinn_global_maxpool2d, &difference); + test_maxpool_op(input, output, params, CSINN_DTYPE_FLOAT16, CSINN_QUANT_FLOAT16, sess, + csinn_global_maxpool2d_init, csinn_global_maxpool2d, &difference); #elif (DTYPE == 8) - test_unary_op(input, output, params, CSINN_QUANT_INT8_SYM, csinn_global_maxpool2d_init, - csinn_global_maxpool2d, &difference); + test_maxpool_op(input, output, params, CSINN_DTYPE_INT8, CSINN_QUANT_INT8_ASYM, sess, + csinn_global_maxpool2d_init, csinn_global_maxpool2d, &difference); #endif return done_testing(); diff --git a/tests/validation_layer/group_convolution.cpp b/tests/validation_layer/group_convolution.cpp index 08d514d3..7bb4421c 100644 --- 
a/tests/validation_layer/group_convolution.cpp +++ b/tests/validation_layer/group_convolution.cpp @@ -23,6 +23,9 @@ int main(int argc, char **argv) init_testsuite("Testing function of group convolution(layer).\n"); struct csinn_session *sess = csinn_alloc_session(); + // sess->base_run_mode = CSINN_RM_CPU_GRAPH; + // sess->model.save_mode = CSINN_RUN_ONLY; + // sess->dynamic_shape = CSINN_FALSE; sess->base_run_mode = CSINN_RM_LAYER; struct csinn_tensor *input = csinn_alloc_tensor(sess); struct csinn_tensor *output = csinn_alloc_tensor(sess); @@ -104,15 +107,45 @@ int main(int argc, char **argv) output->data = reference->data; float difference = argc > 2 ? atof(argv[2]) : 0.99; +/* CSINN_RM_CPU_GRAPH */ +// #if (DTYPE == 32) +// test_conv2d_op(input, output, kernel, bias, params, CSINN_DTYPE_FLOAT32, CSINN_QUANT_FLOAT32, +// sess, csinn_conv2d_init, csinn_conv2d, &difference); +// #elif (DTYPE == 16) +// test_conv2d_op(input, output, kernel, bias, params, CSINN_DTYPE_FLOAT16, CSINN_QUANT_FLOAT16, +// sess, csinn_conv2d_init, csinn_conv2d, &difference); +// #elif (DTYPE == 8) +// test_conv2d_op(input, output, kernel, bias, params, CSINN_DTYPE_INT8, +// CSINN_QUANT_INT8_ASYM_W_SYM, sess, csinn_conv2d_init, csinn_conv2d, +// &difference); +// #elif (DTYPE == 168) +// test_conv2d_op(input, output, kernel, bias, params, CSINN_DTYPE_FLOAT16, +// CSINN_QUANT_FLOAT16_W_INT8, sess, csinn_conv2d_init, csinn_conv2d, +// &difference); +// #elif (DTYPE == 0x168C) +// csinn_realloc_quant_info(kernel, kernel->dim[0]); +// test_conv2d_op(input, output, kernel, bias, params, CSINN_DTYPE_INT8, +// CSINN_QUANT_FLOAT16_W_INT8, sess, csinn_conv2d_init, csinn_conv2d, +// &difference); +// #endif + +/* CSINN_RM_LAYER */ #if (DTYPE == 32) - test_conv2d_op(input, output, kernel, bias, params, CSINN_QUANT_FLOAT32, csinn_conv2d_init, - csinn_conv2d, &difference); + test_conv2d_layer(input, output, kernel, bias, params, CSINN_QUANT_FLOAT32, CSINN_QUANT_FLOAT32, + csinn_conv2d_init, 
csinn_conv2d, &difference); #elif (DTYPE == 16) - test_conv2d_op(input, output, kernel, bias, params, CSINN_QUANT_FLOAT16, csinn_conv2d_init, - csinn_conv2d, &difference); + test_conv2d_layer(input, output, kernel, bias, params, CSINN_QUANT_FLOAT16, CSINN_QUANT_FLOAT16, + csinn_conv2d_init, csinn_conv2d, &difference); #elif (DTYPE == 8) - test_conv2d_op(input, output, kernel, bias, params, CSINN_QUANT_INT8_SYM, csinn_conv2d_init, - csinn_conv2d, &difference); + test_conv2d_layer(input, output, kernel, bias, params, CSINN_QUANT_INT8_ASYM, + CSINN_QUANT_INT8_SYM, csinn_conv2d_init, csinn_conv2d, &difference); +#elif (DTYPE == 168) + test_conv2d_layer(input, output, kernel, bias, params, CSINN_QUANT_FLOAT16, + CSINN_QUANT_INT8_SYM, csinn_conv2d_init, csinn_conv2d, &difference); +#elif (DTYPE == 0x168C) + csinn_realloc_quant_info(kernel, kernel->dim[0]); + test_conv2d_layer(input, output, kernel, bias, params, CSINN_QUANT_FLOAT16, + CSINN_QUANT_INT8_SYM, csinn_conv2d_init, csinn_conv2d, &difference); #endif return done_testing(); diff --git a/tests/validation_layer/layer_norm.cpp b/tests/validation_layer/layer_norm.cpp index 9a96fe03..ff5f033c 100644 --- a/tests/validation_layer/layer_norm.cpp +++ b/tests/validation_layer/layer_norm.cpp @@ -23,7 +23,9 @@ int main(int argc, char **argv) init_testsuite("Testing function of layer_norm(layer)\n"); struct csinn_session *sess = csinn_alloc_session(); - sess->base_run_mode = CSINN_RM_LAYER; + sess->base_run_mode = CSINN_RM_CPU_GRAPH; + sess->model.save_mode = CSINN_RUN_ONLY; + sess->dynamic_shape = CSINN_FALSE; struct csinn_tensor *input = csinn_alloc_tensor(sess); struct csinn_tensor *output = csinn_alloc_tensor(sess); struct csinn_tensor *reference = csinn_alloc_tensor(sess); @@ -89,14 +91,14 @@ int main(int argc, char **argv) float difference = argc > 2 ? 
atof(argv[2]) : 0.99; #if (DTYPE == 32) - test_ternary_op(input, output, gamma, beta, params, CSINN_QUANT_FLOAT32, csinn_layer_norm_init, - csinn_layer_norm, &difference); + test_ternary_op(input, output, gamma, beta, params, CSINN_DTYPE_FLOAT32, CSINN_QUANT_FLOAT32, + sess, csinn_layer_norm_init, csinn_layer_norm, &difference); #elif (DTYPE == 16) - test_ternary_op(input, output, gamma, beta, params, CSINN_QUANT_FLOAT16, csinn_layer_norm_init, - csinn_layer_norm, &difference); + test_ternary_op(input, output, gamma, beta, params, CSINN_DTYPE_FLOAT16, CSINN_QUANT_FLOAT16, + sess, csinn_layer_norm_init, csinn_layer_norm, &difference); #elif (DTYPE == 8) - test_ternary_op(input, output, gamma, beta, params, CSINN_QUANT_INT8_SYM, csinn_layer_norm_init, - csinn_layer_norm, &difference); + test_ternary_op(input, output, gamma, beta, params, CSINN_DTYPE_INT8, CSINN_QUANT_INT8_ASYM, + sess, csinn_layer_norm_init, csinn_layer_norm, &difference); #endif return done_testing(); diff --git a/tests/validation_layer/leaky_relu.cpp b/tests/validation_layer/leaky_relu.cpp index 3240285d..fe1a88d4 100644 --- a/tests/validation_layer/leaky_relu.cpp +++ b/tests/validation_layer/leaky_relu.cpp @@ -23,7 +23,9 @@ int main(int argc, char **argv) init_testsuite("Testing function of leaky_relu(layer).\n"); struct csinn_session *sess = csinn_alloc_session(); - sess->base_run_mode = CSINN_RM_LAYER; + sess->base_run_mode = CSINN_RM_CPU_GRAPH; + sess->model.save_mode = CSINN_RUN_ONLY; + sess->dynamic_shape = CSINN_FALSE; struct csinn_tensor *input = csinn_alloc_tensor(sess); struct csinn_tensor *output = csinn_alloc_tensor(sess); struct csinn_tensor *reference = csinn_alloc_tensor(sess); @@ -65,14 +67,14 @@ int main(int argc, char **argv) float difference = argc > 2 ? 
atof(argv[2]) : 0.99; #if (DTYPE == 32) - test_unary_op(input, output, params, CSINN_QUANT_FLOAT32, csinn_leaky_relu_init, - csinn_leaky_relu, &difference); + test_unary_op(input, output, params, CSINN_DTYPE_FLOAT32, CSINN_QUANT_FLOAT32, sess, + csinn_leaky_relu_init, csinn_leaky_relu, &difference); #elif (DTYPE == 16) - test_unary_op(input, output, params, CSINN_QUANT_FLOAT16, csinn_leaky_relu_init, - csinn_leaky_relu, &difference); + test_unary_op(input, output, params, CSINN_DTYPE_FLOAT16, CSINN_QUANT_FLOAT16, sess, + csinn_leaky_relu_init, csinn_leaky_relu, &difference); #elif (DTYPE == 8) - test_unary_op(input, output, params, CSINN_QUANT_INT8_ASYM, csinn_leaky_relu_init, - csinn_leaky_relu, &difference); + test_unary_op(input, output, params, CSINN_DTYPE_INT8, CSINN_QUANT_INT8_ASYM, sess, + csinn_leaky_relu_init, csinn_leaky_relu, &difference); #endif return done_testing(); diff --git a/tests/validation_layer/lrn.cpp b/tests/validation_layer/lrn.cpp index 65c660c5..b7c96f61 100644 --- a/tests/validation_layer/lrn.cpp +++ b/tests/validation_layer/lrn.cpp @@ -23,7 +23,9 @@ int main(int argc, char **argv) init_testsuite("Testing function of lrn(layer).\n"); struct csinn_session *sess = csinn_alloc_session(); - sess->base_run_mode = CSINN_RM_LAYER; + sess->base_run_mode = CSINN_RM_CPU_GRAPH; + sess->model.save_mode = CSINN_RUN_ONLY; + sess->dynamic_shape = CSINN_FALSE; struct csinn_tensor *input = csinn_alloc_tensor(sess); struct csinn_tensor *output = csinn_alloc_tensor(sess); struct csinn_tensor *reference = csinn_alloc_tensor(sess); @@ -71,17 +73,17 @@ int main(int argc, char **argv) float difference = argc > 2 ? 
atof(argv[2]) : 0.99; #if (DTYPE == 32) - test_unary_op(input, output, params, CSINN_QUANT_FLOAT32, csinn_lrn_init, csinn_lrn, - &difference); + test_unary_op(input, output, params, CSINN_DTYPE_FLOAT32, CSINN_QUANT_FLOAT32, sess, + csinn_lrn_init, csinn_lrn, &difference); #elif (DTYPE == 16) - test_unary_op(input, output, params, CSINN_QUANT_FLOAT16, csinn_lrn_init, csinn_lrn, - &difference); + test_unary_op(input, output, params, CSINN_DTYPE_FLOAT16, CSINN_QUANT_FLOAT16, sess, + csinn_lrn_init, csinn_lrn, &difference); #elif (DTYPE == 8) shl_quantize_multiplier(params->bias, ¶ms->bias_multiplier, ¶ms->bias_shift); shl_quantize_multiplier(params->alpha, ¶ms->alpha_multiplier, ¶ms->alpha_shift); shl_quantize_multiplier(params->beta, ¶ms->beta_multiplier, ¶ms->beta_shift); - test_unary_op(input, output, params, CSINN_QUANT_INT8_ASYM, csinn_lrn_init, csinn_lrn, - &difference); + test_unary_op(input, output, params, CSINN_DTYPE_INT8, CSINN_QUANT_INT8_ASYM, sess, + csinn_lrn_init, csinn_lrn, &difference); #endif diff --git a/tests/validation_layer/matmul.cpp b/tests/validation_layer/matmul.cpp index d76acdb0..1ba131eb 100644 --- a/tests/validation_layer/matmul.cpp +++ b/tests/validation_layer/matmul.cpp @@ -22,7 +22,9 @@ int main(int argc, char **argv) { init_testsuite("Testing function of matmul(layer).\n"); struct csinn_session *sess = csinn_alloc_session(); - sess->base_run_mode = CSINN_RM_LAYER; + sess->base_run_mode = CSINN_RM_CPU_GRAPH; + sess->model.save_mode = CSINN_RUN_ONLY; + sess->dynamic_shape = CSINN_FALSE; struct csinn_tensor *input0 = csinn_alloc_tensor(sess); struct csinn_tensor *input1 = csinn_alloc_tensor(sess); struct csinn_tensor *output = csinn_alloc_tensor(sess); @@ -61,14 +63,17 @@ int main(int argc, char **argv) input0->layout = CSINN_LAYOUT_NCHW; input0->is_const = 0; input0->quant_channel = 1; + set_layout(input0); input1->dtype = CSINN_DTYPE_FLOAT32; input1->layout = CSINN_LAYOUT_NCHW; input1->is_const = 1; input1->quant_channel = 1; + 
set_layout(input1); output->dtype = CSINN_DTYPE_FLOAT32; output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; + set_layout(output); params->base.api = CSINN_API; input0->data = (float *)(buffer + 3 + 3 * input0->dim_count); @@ -78,17 +83,33 @@ int main(int argc, char **argv) float difference = argc > 2 ? atof(argv[2]) : 0.99; #if (DTYPE == 32) - test_binary_op(input0, input1, output, params, CSINN_QUANT_FLOAT32, csinn_matmul_init, - csinn_matmul, &difference); + test_matmul_op(input0, input1, output, params, CSINN_DTYPE_FLOAT32, CSINN_QUANT_FLOAT32, sess, + csinn_matmul_init, csinn_matmul, &difference); #elif (DTYPE == 16) - test_binary_op(input0, input1, output, params, CSINN_QUANT_FLOAT16, csinn_matmul_init, - csinn_matmul, &difference); + test_matmul_op(input0, input1, output, params, CSINN_DTYPE_FLOAT16, CSINN_QUANT_FLOAT16, sess, + csinn_matmul_init, csinn_matmul, &difference); #elif (DTYPE == 8) - test_binary_op(input0, input1, output, params, CSINN_QUANT_INT8_ASYM, csinn_matmul_init, - csinn_matmul, &difference); + test_matmul_op(input0, input1, output, params, CSINN_DTYPE_INT8, CSINN_QUANT_INT8_ASYM, sess, + csinn_matmul_init, csinn_matmul, &difference); #elif (DTYPE == 168) - test_matmul_op_fp16_w_int8(input0, input1, output, params, CSINN_QUANT_FLOAT16_W_INT8, - csinn_matmul_init, csinn_matmul, &difference); + test_matmul_op(input0, input1, output, params, CSINN_DTYPE_FLOAT16, CSINN_QUANT_FLOAT16_W_INT8, + sess, csinn_matmul_init, csinn_matmul, &difference); +#elif (DTYPE == 0x32B8) + test_matmul_op_hybrid_quant(input0, input1, output, params, CSINN_DTYPE_FLOAT32, + CSINN_QUANT_FLOAT32, CSINN_QUANT_BLOCK_Q8_0, sess, + csinn_matmul_init, csinn_matmul, &difference); +#elif (DTYPE == 0x32B4) + test_matmul_op_hybrid_quant(input0, input1, output, params, CSINN_DTYPE_FLOAT32, + CSINN_QUANT_FLOAT32, CSINN_QUANT_BLOCK_Q4_0, sess, + csinn_matmul_init, csinn_matmul, &difference); +#elif (DTYPE == 0x16B8) + 
test_matmul_op_hybrid_quant(input0, input1, output, params, CSINN_DTYPE_FLOAT16, + CSINN_QUANT_FLOAT16, CSINN_QUANT_BLOCK_Q8_0, sess, + csinn_matmul_init, csinn_matmul, &difference); +#elif (DTYPE == 0x16B4) + test_matmul_op_hybrid_quant(input0, input1, output, params, CSINN_DTYPE_FLOAT16, + CSINN_QUANT_FLOAT16, CSINN_QUANT_BLOCK_Q4_0, sess, + csinn_matmul_init, csinn_matmul, &difference); #endif return done_testing(); diff --git a/tests/validation_layer/maxpool.cpp b/tests/validation_layer/maxpool.cpp index ca157c1e..aee5d554 100644 --- a/tests/validation_layer/maxpool.cpp +++ b/tests/validation_layer/maxpool.cpp @@ -23,6 +23,9 @@ int main(int argc, char **argv) init_testsuite("Testing function of maxpool(layer).\n"); struct csinn_session *sess = csinn_alloc_session(); + // sess->base_run_mode = CSINN_RM_CPU_GRAPH; + // sess->model.save_mode = CSINN_RUN_ONLY; + // sess->dynamic_shape = CSINN_FALSE; sess->base_run_mode = CSINN_RM_LAYER; struct csinn_tensor *input = csinn_alloc_tensor(sess); struct csinn_tensor *output = csinn_alloc_tensor(sess); @@ -75,15 +78,29 @@ int main(int argc, char **argv) output->data = reference->data; float difference = argc > 2 ? 
atof(argv[2]) : 0.99; +/* CSINN_RM_CPU_GRAPH */ +// #if (DTYPE == 32) +// test_maxpool_op(input, output, params, CSINN_DTYPE_FLOAT32, CSINN_QUANT_FLOAT32, sess, +// csinn_maxpool2d_init, csinn_maxpool2d, &difference); +// #elif (DTYPE == 16) +// test_maxpool_op(input, output, params, CSINN_DTYPE_FLOAT16, CSINN_QUANT_FLOAT16, sess, +// csinn_maxpool2d_init, csinn_maxpool2d, &difference); +// #elif (DTYPE == 8) +// test_maxpool_op(input, output, params, CSINN_DTYPE_INT8, CSINN_QUANT_INT8_ASYM, sess, +// csinn_maxpool2d_init, csinn_maxpool2d, &difference); +// #endif + +/* CSINN_RM_LAYER */ #if (DTYPE == 32) - test_unary_op(input, output, params, CSINN_QUANT_FLOAT32, csinn_maxpool2d_init, csinn_maxpool2d, - &difference); + test_maxpool_layer(input, output, params, CSINN_QUANT_FLOAT32, csinn_maxpool2d_init, + csinn_maxpool2d, &difference); #elif (DTYPE == 16) - test_unary_op(input, output, params, CSINN_QUANT_FLOAT16, csinn_maxpool2d_init, csinn_maxpool2d, - &difference); + test_maxpool_layer(input, output, params, CSINN_QUANT_FLOAT16, csinn_maxpool2d_init, + csinn_maxpool2d, &difference); #elif (DTYPE == 8) - test_unary_op(input, output, params, CSINN_QUANT_INT8_SYM, csinn_maxpool2d_init, - csinn_maxpool2d, &difference); + test_maxpool_layer(input, output, params, CSINN_QUANT_INT8_ASYM, csinn_maxpool2d_init, + csinn_maxpool2d, &difference); #endif + return done_testing(); } diff --git a/tests/validation_layer/maxpool_nhwc.cpp b/tests/validation_layer/maxpool_nhwc.cpp index 03b94110..d5920ece 100644 --- a/tests/validation_layer/maxpool_nhwc.cpp +++ b/tests/validation_layer/maxpool_nhwc.cpp @@ -23,7 +23,9 @@ int main(int argc, char **argv) init_testsuite("Testing function of maxpool(layer).\n"); struct csinn_session *sess = csinn_alloc_session(); - sess->base_run_mode = CSINN_RM_LAYER; + sess->base_run_mode = CSINN_RM_CPU_GRAPH; + sess->model.save_mode = CSINN_RUN_ONLY; + sess->dynamic_shape = CSINN_FALSE; struct csinn_tensor *input = csinn_alloc_tensor(sess); struct 
csinn_tensor *output = csinn_alloc_tensor(sess); struct csinn_tensor *reference = csinn_alloc_tensor(sess); @@ -76,14 +78,14 @@ int main(int argc, char **argv) float difference = argc > 2 ? atof(argv[2]) : 0.99; #if (DTYPE == 32) - test_unary_op(input, output, params, CSINN_QUANT_FLOAT32, csinn_maxpool2d_init, csinn_maxpool2d, - &difference); + test_maxpool_op(input, output, params, CSINN_DTYPE_FLOAT32, CSINN_QUANT_FLOAT32, sess, + csinn_maxpool2d_init, csinn_maxpool2d, &difference); #elif (DTYPE == 16) - test_unary_op(input, output, params, CSINN_QUANT_FLOAT16, csinn_maxpool2d_init, csinn_maxpool2d, - &difference); + test_maxpool_op(input, output, params, CSINN_DTYPE_FLOAT16, CSINN_QUANT_FLOAT16, sess, + csinn_maxpool2d_init, csinn_maxpool2d, &difference); #elif (DTYPE == 8) - test_unary_op(input, output, params, CSINN_QUANT_INT8_SYM, csinn_maxpool2d_init, - csinn_maxpool2d, &difference); + test_maxpool_op(input, output, params, CSINN_DTYPE_INT8, CSINN_QUANT_INT8_ASYM, sess, + csinn_maxpool2d_init, csinn_maxpool2d, &difference); #endif return done_testing(); } diff --git a/tests/validation_layer/mean_stride.cpp b/tests/validation_layer/mean_stride.cpp index db5ddf86..3a4200eb 100644 --- a/tests/validation_layer/mean_stride.cpp +++ b/tests/validation_layer/mean_stride.cpp @@ -23,7 +23,9 @@ int main(int argc, char **argv) init_testsuite("Testing function of mean(layer).\n"); struct csinn_session *sess = csinn_alloc_session(); - sess->base_run_mode = CSINN_RM_LAYER; + sess->base_run_mode = CSINN_RM_CPU_GRAPH; + sess->model.save_mode = CSINN_RUN_ONLY; + sess->dynamic_shape = CSINN_FALSE; struct csinn_tensor *input = csinn_alloc_tensor(sess); struct csinn_tensor *output = csinn_alloc_tensor(sess); struct csinn_tensor *reference = csinn_alloc_tensor(sess); @@ -89,14 +91,14 @@ int main(int argc, char **argv) params->base.layout = CSINN_LAYOUT_NCHW; #if (DTYPE == 32) - test_unary_op(input, output, params, CSINN_QUANT_FLOAT32, csinn_mean_init, csinn_mean, - &difference); + 
test_unary_op(input, output, params, CSINN_DTYPE_FLOAT32, CSINN_QUANT_FLOAT32, sess, + csinn_mean_init, csinn_mean, &difference); #elif (DTYPE == 16) - test_unary_op(input, output, params, CSINN_QUANT_FLOAT16, csinn_mean_init, csinn_mean, - &difference); + test_unary_op(input, output, params, CSINN_DTYPE_FLOAT16, CSINN_QUANT_FLOAT16, sess, + csinn_mean_init, csinn_mean, &difference); #elif (DTYPE == 8) - test_unary_op(input, output, params, CSINN_QUANT_INT8_ASYM, csinn_mean_init, csinn_mean, - &difference); + test_unary_op(input, output, params, CSINN_DTYPE_INT8, CSINN_QUANT_INT8_ASYM, sess, + csinn_mean_init, csinn_mean, &difference); #endif return done_testing(); diff --git a/tests/validation_layer/minimum.cpp b/tests/validation_layer/minimum.cpp index fe4e6e2a..c1b7937c 100644 --- a/tests/validation_layer/minimum.cpp +++ b/tests/validation_layer/minimum.cpp @@ -22,7 +22,9 @@ int main(int argc, char **argv) { init_testsuite("Testing function of minimum(layer).\n"); struct csinn_session *sess = csinn_alloc_session(); - sess->base_run_mode = CSINN_RM_LAYER; + sess->base_run_mode = CSINN_RM_CPU_GRAPH; + sess->model.save_mode = CSINN_RUN_ONLY; + sess->dynamic_shape = CSINN_FALSE; struct csinn_tensor *input0 = csinn_alloc_tensor(sess); struct csinn_tensor *input1 = csinn_alloc_tensor(sess); struct csinn_tensor *output = csinn_alloc_tensor(sess); @@ -77,14 +79,14 @@ int main(int argc, char **argv) float difference = argc > 2 ? 
atof(argv[2]) : 0.9; #if (DTYPE == 32) - test_binary_op(input0, input1, output, params, CSINN_QUANT_FLOAT32, csinn_minimum_init, - csinn_minimum, &difference); + test_binary_op(input0, input1, output, params, CSINN_DTYPE_FLOAT32, CSINN_QUANT_FLOAT32, sess, + csinn_minimum_init, csinn_minimum, &difference); #elif (DTYPE == 16) - test_binary_op(input0, input1, output, params, CSINN_QUANT_FLOAT16, csinn_minimum_init, - csinn_minimum, &difference); + test_binary_op(input0, input1, output, params, CSINN_DTYPE_FLOAT16, CSINN_QUANT_FLOAT16, sess, + csinn_minimum_init, csinn_minimum, &difference); #elif (DTYPE == 8) - test_binary_op(input0, input1, output, params, CSINN_QUANT_INT8_ASYM, csinn_minimum_init, - csinn_minimum, &difference); + test_binary_op(input0, input1, output, params, CSINN_DTYPE_INT8, CSINN_QUANT_INT8_ASYM, sess, + csinn_minimum_init, csinn_minimum, &difference); #endif return done_testing(); } \ No newline at end of file diff --git a/tests/validation_layer/mul.cpp b/tests/validation_layer/mul.cpp index b6da447c..1571971c 100644 --- a/tests/validation_layer/mul.cpp +++ b/tests/validation_layer/mul.cpp @@ -23,7 +23,9 @@ int main(int argc, char **argv) init_testsuite("Testing function of mul(layer).\n"); struct csinn_session *sess = csinn_alloc_session(); - sess->base_run_mode = CSINN_RM_LAYER; + sess->base_run_mode = CSINN_RM_CPU_GRAPH; + sess->model.save_mode = CSINN_RUN_ONLY; + sess->dynamic_shape = CSINN_FALSE; struct csinn_tensor *input0 = csinn_alloc_tensor(sess); struct csinn_tensor *input1 = csinn_alloc_tensor(sess); struct csinn_tensor *output = csinn_alloc_tensor(sess); @@ -77,14 +79,14 @@ int main(int argc, char **argv) float difference = argc > 2 ? 
atof(argv[2]) : 0.9; #if (DTYPE == 32) - test_binary_op(input0, input1, output, params, CSINN_QUANT_FLOAT32, csinn_mul_init, csinn_mul, - &difference); + test_binary_op(input0, input1, output, params, CSINN_DTYPE_FLOAT32, CSINN_QUANT_FLOAT32, sess, + csinn_mul_init, csinn_mul, &difference); #elif (DTYPE == 16) - test_binary_op(input0, input1, output, params, CSINN_QUANT_FLOAT16, csinn_mul_init, csinn_mul, - &difference); + test_binary_op(input0, input1, output, params, CSINN_DTYPE_FLOAT16, CSINN_QUANT_FLOAT16, sess, + csinn_mul_init, csinn_mul, &difference); #elif (DTYPE == 8) - test_binary_op(input0, input1, output, params, CSINN_QUANT_INT8_ASYM, csinn_mul_init, csinn_mul, - &difference); + test_binary_op(input0, input1, output, params, CSINN_DTYPE_INT8, CSINN_QUANT_INT8_ASYM, sess, + csinn_mul_init, csinn_mul, &difference); #endif return done_testing(); diff --git a/tests/validation_layer/pad.cpp b/tests/validation_layer/pad.cpp index c6a38ce5..747140c7 100644 --- a/tests/validation_layer/pad.cpp +++ b/tests/validation_layer/pad.cpp @@ -23,7 +23,9 @@ int main(int argc, char **argv) init_testsuite("Testing function of pad(layer).\n"); struct csinn_session *sess = csinn_alloc_session(); - sess->base_run_mode = CSINN_RM_LAYER; + sess->base_run_mode = CSINN_RM_CPU_GRAPH; + sess->model.save_mode = CSINN_RUN_ONLY; + sess->dynamic_shape = CSINN_FALSE; struct csinn_tensor *input = csinn_alloc_tensor(sess); struct csinn_tensor *output = csinn_alloc_tensor(sess); struct csinn_tensor *reference = csinn_alloc_tensor(sess); @@ -81,14 +83,14 @@ int main(int argc, char **argv) return 0 #else #if (DTYPE == 32) - test_unary_op(input, output, params, CSINN_QUANT_FLOAT32, csinn_pad_init, csinn_pad, - &difference); + test_unary_op(input, output, params, CSINN_DTYPE_FLOAT32, CSINN_QUANT_FLOAT32, sess, + csinn_pad_init, csinn_pad, &difference); #elif (DTYPE == 16) - test_unary_op(input, output, params, CSINN_QUANT_FLOAT16, csinn_pad_init, csinn_pad, - &difference); + 
test_unary_op(input, output, params, CSINN_DTYPE_FLOAT16, CSINN_QUANT_FLOAT16, sess, + csinn_pad_init, csinn_pad, &difference); #elif (DTYPE == 8) - test_unary_op(input, output, params, CSINN_QUANT_INT8_SYM, csinn_pad_init, csinn_pad, - &difference); + test_unary_op(input, output, params, CSINN_DTYPE_INT8, CSINN_QUANT_INT8_ASYM, sess, + csinn_pad_init, csinn_pad, &difference); #endif #endif diff --git a/tests/validation_layer/power.cpp b/tests/validation_layer/power.cpp index 18a14e24..69b8464e 100644 --- a/tests/validation_layer/power.cpp +++ b/tests/validation_layer/power.cpp @@ -22,7 +22,9 @@ int main(int argc, char **argv) { init_testsuite("Testing function of power(layer).\n"); struct csinn_session *sess = csinn_alloc_session(); - sess->base_run_mode = CSINN_RM_LAYER; + sess->base_run_mode = CSINN_RM_CPU_GRAPH; + sess->model.save_mode = CSINN_RUN_ONLY; + sess->dynamic_shape = CSINN_FALSE; struct csinn_tensor *input0 = csinn_alloc_tensor(sess); struct csinn_tensor *input1 = csinn_alloc_tensor(sess); struct csinn_tensor *output = csinn_alloc_tensor(sess); @@ -69,14 +71,14 @@ int main(int argc, char **argv) float difference = argc > 2 ? 
atof(argv[2]) : 0.99; #if (DTYPE == 32) - test_binary_op(input0, input1, output, params, CSINN_QUANT_FLOAT32, csinn_power_init, - csinn_power, &difference); + test_binary_op(input0, input1, output, params, CSINN_DTYPE_FLOAT32, CSINN_QUANT_FLOAT32, sess, + csinn_power_init, csinn_power, &difference); #elif (DTYPE == 16) - test_binary_op(input0, input1, output, params, CSINN_QUANT_FLOAT16, csinn_power_init, - csinn_power, &difference); + test_binary_op(input0, input1, output, params, CSINN_DTYPE_FLOAT16, CSINN_QUANT_FLOAT16, sess, + csinn_power_init, csinn_power, &difference); #elif (DTYPE == 8) - test_binary_op(input0, input1, output, params, CSINN_QUANT_INT8_ASYM, csinn_power_init, - csinn_power, &difference); + test_binary_op(input0, input1, output, params, CSINN_DTYPE_INT8, CSINN_QUANT_INT8_ASYM, sess, + csinn_power_init, csinn_power, &difference); #endif return done_testing(); diff --git a/tests/validation_layer/prelu.cpp b/tests/validation_layer/prelu.cpp index 8c30b64b..9bd128c1 100644 --- a/tests/validation_layer/prelu.cpp +++ b/tests/validation_layer/prelu.cpp @@ -23,7 +23,9 @@ int main(int argc, char **argv) init_testsuite("Testing function of prelu(layer).\n"); struct csinn_session *sess = csinn_alloc_session(); - sess->base_run_mode = CSINN_RM_LAYER; + sess->base_run_mode = CSINN_RM_CPU_GRAPH; + sess->model.save_mode = CSINN_RUN_ONLY; + sess->dynamic_shape = CSINN_FALSE; struct csinn_tensor *input = csinn_alloc_tensor(sess); struct csinn_tensor *output = csinn_alloc_tensor(sess); struct csinn_tensor *reference = csinn_alloc_tensor(sess); @@ -75,14 +77,14 @@ int main(int argc, char **argv) float difference = argc > 2 ? 
atof(argv[2]) : 0.99; #if (DTYPE == 32) - test_binary_op(input, alpha_data, output, params, CSINN_QUANT_FLOAT32, csinn_prelu_init, - csinn_prelu, &difference); + test_binary_op(input, alpha_data, output, params, CSINN_DTYPE_FLOAT32, CSINN_QUANT_FLOAT32, + sess, csinn_prelu_init, csinn_prelu, &difference); #elif (DTYPE == 16) - test_binary_op(input, alpha_data, output, params, CSINN_QUANT_FLOAT16, csinn_prelu_init, - csinn_prelu, &difference); + test_binary_op(input, alpha_data, output, params, CSINN_DTYPE_FLOAT16, CSINN_QUANT_FLOAT16, + sess, csinn_prelu_init, csinn_prelu, &difference); #elif (DTYPE == 8) - test_binary_op(input, alpha_data, output, params, CSINN_QUANT_INT8_ASYM, csinn_prelu_init, - csinn_prelu, &difference); + test_binary_op(input, alpha_data, output, params, CSINN_DTYPE_INT8, CSINN_QUANT_INT8_ASYM, sess, + csinn_prelu_init, csinn_prelu, &difference); #endif return done_testing(); diff --git a/tests/validation_layer/reduce_sum.cpp b/tests/validation_layer/reduce_sum.cpp index 13786044..02aabb13 100644 --- a/tests/validation_layer/reduce_sum.cpp +++ b/tests/validation_layer/reduce_sum.cpp @@ -23,7 +23,9 @@ int main(int argc, char **argv) init_testsuite("Testing function of sum(layer).\n"); struct csinn_session *sess = csinn_alloc_session(); - sess->base_run_mode = CSINN_RM_LAYER; + sess->base_run_mode = CSINN_RM_CPU_GRAPH; + sess->model.save_mode = CSINN_RUN_ONLY; + sess->dynamic_shape = CSINN_FALSE; struct csinn_tensor *input = csinn_alloc_tensor(sess); struct csinn_tensor *output = csinn_alloc_tensor(sess); struct csinn_tensor *reference = csinn_alloc_tensor(sess); @@ -83,14 +85,14 @@ int main(int argc, char **argv) params->base.layout = CSINN_LAYOUT_NCHW; #if (DTYPE == 32) - test_unary_op(input, output, params, CSINN_QUANT_FLOAT32, csinn_reduce_sum_init, - csinn_reduce_sum, &difference); + test_unary_op(input, output, params, CSINN_DTYPE_FLOAT32, CSINN_QUANT_FLOAT32, sess, + csinn_reduce_sum_init, csinn_reduce_sum, &difference); #elif (DTYPE == 
16) - test_unary_op(input, output, params, CSINN_QUANT_FLOAT16, csinn_reduce_sum_init, - csinn_reduce_sum, &difference); + test_unary_op(input, output, params, CSINN_DTYPE_FLOAT16, CSINN_QUANT_FLOAT16, sess, + csinn_reduce_sum_init, csinn_reduce_sum, &difference); #elif (DTYPE == 8) - test_unary_op(input, output, params, CSINN_QUANT_INT8_ASYM, csinn_reduce_sum_init, - csinn_reduce_sum, &difference); + test_unary_op(input, output, params, CSINN_DTYPE_INT8, CSINN_QUANT_INT8_ASYM, sess, + csinn_reduce_sum_init, csinn_reduce_sum, &difference); #endif return done_testing(); diff --git a/tests/validation_layer/relu.cpp b/tests/validation_layer/relu.cpp index e05579c8..4c9b28e7 100644 --- a/tests/validation_layer/relu.cpp +++ b/tests/validation_layer/relu.cpp @@ -23,7 +23,9 @@ int main(int argc, char **argv) init_testsuite("Testing function of relu(layer).\n"); struct csinn_session *sess = csinn_alloc_session(); - sess->base_run_mode = CSINN_RM_LAYER; + sess->base_run_mode = CSINN_RM_CPU_GRAPH; + sess->model.save_mode = CSINN_RUN_ONLY; + sess->dynamic_shape = CSINN_FALSE; struct csinn_tensor *input = csinn_alloc_tensor(sess); struct csinn_tensor *output = csinn_alloc_tensor(sess); struct csinn_tensor *reference = csinn_alloc_tensor(sess); @@ -58,16 +60,15 @@ int main(int argc, char **argv) output->data = reference->data; float difference = argc > 2 ? 
atof(argv[2]) : 0.99; - #if (DTYPE == 32) - test_unary_op(input, output, params, CSINN_QUANT_FLOAT32, csinn_relu_init, csinn_relu, - &difference); + test_unary_op(input, output, params, CSINN_DTYPE_FLOAT32, CSINN_QUANT_FLOAT32, sess, + csinn_relu_init, csinn_relu, &difference); #elif (DTYPE == 16) - test_unary_op(input, output, params, CSINN_QUANT_FLOAT16, csinn_relu_init, csinn_relu, - &difference); + test_unary_op(input, output, params, CSINN_DTYPE_FLOAT16, CSINN_QUANT_FLOAT16, sess, + csinn_relu_init, csinn_relu, &difference); #elif (DTYPE == 8) - test_unary_op(input, output, params, CSINN_QUANT_INT8_ASYM, csinn_relu_init, csinn_relu, - &difference); + test_unary_op(input, output, params, CSINN_DTYPE_INT8, CSINN_QUANT_INT8_ASYM, sess, + csinn_relu_init, csinn_relu, &difference); #endif return done_testing(); diff --git a/tests/validation_layer/relu1.cpp b/tests/validation_layer/relu1.cpp index 98eac970..a26defa7 100644 --- a/tests/validation_layer/relu1.cpp +++ b/tests/validation_layer/relu1.cpp @@ -23,7 +23,9 @@ int main(int argc, char **argv) init_testsuite("Testing function of relu1(layer).\n"); struct csinn_session *sess = csinn_alloc_session(); - sess->base_run_mode = CSINN_RM_LAYER; + sess->base_run_mode = CSINN_RM_CPU_GRAPH; + sess->model.save_mode = CSINN_RUN_ONLY; + sess->dynamic_shape = CSINN_FALSE; struct csinn_tensor *input = csinn_alloc_tensor(sess); struct csinn_tensor *output = csinn_alloc_tensor(sess); struct csinn_tensor *reference = csinn_alloc_tensor(sess); @@ -59,14 +61,14 @@ int main(int argc, char **argv) float difference = argc > 2 ? 
atof(argv[2]) : 0.99; #if (DTYPE == 32) - test_unary_op(input, output, params, CSINN_QUANT_FLOAT32, csinn_relu1_init, csinn_relu1, - &difference); + test_unary_op(input, output, params, CSINN_DTYPE_FLOAT32, CSINN_QUANT_FLOAT32, sess, + csinn_relu1_init, csinn_relu1, &difference); #elif (DTYPE == 16) - test_unary_op(input, output, params, CSINN_QUANT_FLOAT16, csinn_relu1_init, csinn_relu1, - &difference); + test_unary_op(input, output, params, CSINN_DTYPE_FLOAT16, CSINN_QUANT_FLOAT16, sess, + csinn_relu1_init, csinn_relu1, &difference); #elif (DTYPE == 8) - test_unary_op(input, output, params, CSINN_QUANT_INT8_ASYM, csinn_relu1_init, csinn_relu1, - &difference); + test_unary_op(input, output, params, CSINN_DTYPE_INT8, CSINN_QUANT_INT8_ASYM, sess, + csinn_relu1_init, csinn_relu1, &difference); #endif return done_testing(); diff --git a/tests/validation_layer/relu6.cpp b/tests/validation_layer/relu6.cpp index 2a53472a..fc24f478 100644 --- a/tests/validation_layer/relu6.cpp +++ b/tests/validation_layer/relu6.cpp @@ -23,7 +23,9 @@ int main(int argc, char **argv) init_testsuite("Testing function of relu6(layer).\n"); struct csinn_session *sess = csinn_alloc_session(); - sess->base_run_mode = CSINN_RM_LAYER; + sess->base_run_mode = CSINN_RM_CPU_GRAPH; + sess->model.save_mode = CSINN_RUN_ONLY; + sess->dynamic_shape = CSINN_FALSE; struct csinn_tensor *input = csinn_alloc_tensor(sess); struct csinn_tensor *output = csinn_alloc_tensor(sess); struct csinn_tensor *reference = csinn_alloc_tensor(sess); @@ -57,16 +59,16 @@ int main(int argc, char **argv) reference->data = (float *)(buffer + 1 + input->dim_count + in_size); output->data = reference->data; float difference = argc > 2 ? 
atof(argv[2]) : 0.99; - + #if (DTYPE == 32) - test_unary_op(input, output, params, CSINN_QUANT_FLOAT32, csinn_relu6_init, csinn_relu6, - &difference); + test_unary_op(input, output, params, CSINN_DTYPE_FLOAT32, CSINN_QUANT_FLOAT32, sess, + csinn_relu6_init, csinn_relu6, &difference); #elif (DTYPE == 16) - test_unary_op(input, output, params, CSINN_QUANT_FLOAT16, csinn_relu6_init, csinn_relu6, - &difference); + test_unary_op(input, output, params, CSINN_DTYPE_FLOAT16, CSINN_QUANT_FLOAT16, sess, + csinn_relu6_init, csinn_relu6, &difference); #elif (DTYPE == 8) - test_unary_op(input, output, params, CSINN_QUANT_INT8_ASYM, csinn_relu6_init, csinn_relu6, - &difference); + test_unary_op(input, output, params, CSINN_DTYPE_INT8, CSINN_QUANT_INT8_ASYM, sess, + csinn_relu6_init, csinn_relu6, &difference); #endif return done_testing(); } diff --git a/tests/validation_layer/reshape.cpp b/tests/validation_layer/reshape.cpp index 0e850a1e..016e5b2e 100644 --- a/tests/validation_layer/reshape.cpp +++ b/tests/validation_layer/reshape.cpp @@ -23,7 +23,9 @@ int main(int argc, char **argv) init_testsuite("Testing function of reshape(layer).\n"); struct csinn_session *sess = csinn_alloc_session(); - sess->base_run_mode = CSINN_RM_LAYER; + sess->base_run_mode = CSINN_RM_CPU_GRAPH; + sess->model.save_mode = CSINN_RUN_ONLY; + sess->dynamic_shape = CSINN_FALSE; struct csinn_tensor *input = csinn_alloc_tensor(sess); struct csinn_tensor *output = csinn_alloc_tensor(sess); struct csinn_tensor *reference = csinn_alloc_tensor(sess); @@ -41,7 +43,7 @@ int main(int argc, char **argv) input->dim[i] = buffer[2 + i]; in_size *= input->dim[i]; } - + for (int i = 0; i < reshape_count; i++) { reshape[i] = buffer[2 + input->dim_count + i]; output->dim[i] = reshape[i]; @@ -74,14 +76,14 @@ int main(int argc, char **argv) float difference = argc > 2 ? 
atof(argv[2]) : 0.99; #if (DTYPE == 32) - test_unary_op(input, output, params, CSINN_QUANT_FLOAT32, csinn_reshape_init, csinn_reshape, - &difference); + test_unary_op(input, output, params, CSINN_DTYPE_FLOAT32, CSINN_QUANT_FLOAT32, sess, + csinn_reshape_init, csinn_reshape, &difference); #elif (DTYPE == 16) - test_unary_op(input, output, params, CSINN_QUANT_FLOAT16, csinn_reshape_init, csinn_reshape, - &difference); + test_unary_op(input, output, params, CSINN_DTYPE_FLOAT16, CSINN_QUANT_FLOAT16, sess, + csinn_reshape_init, csinn_reshape, &difference); #elif (DTYPE == 8) - test_unary_op(input, output, params, CSINN_QUANT_INT8_SYM, csinn_reshape_init, csinn_reshape, - &difference); + test_unary_op(input, output, params, CSINN_DTYPE_INT8, CSINN_QUANT_INT8_ASYM, sess, + csinn_reshape_init, csinn_reshape, &difference); #endif return done_testing(); diff --git a/tests/validation_layer/rms_norm.cpp b/tests/validation_layer/rms_norm.cpp index 30ed4153..f4b1b090 100644 --- a/tests/validation_layer/rms_norm.cpp +++ b/tests/validation_layer/rms_norm.cpp @@ -23,7 +23,9 @@ int main(int argc, char **argv) init_testsuite("Testing function of rms_norm(layer)\n"); struct csinn_session *sess = csinn_alloc_session(); - sess->base_run_mode = CSINN_RM_LAYER; + sess->base_run_mode = CSINN_RM_CPU_GRAPH; + sess->model.save_mode = CSINN_RUN_ONLY; + sess->dynamic_shape = CSINN_FALSE; struct csinn_tensor *input = csinn_alloc_tensor(sess); struct csinn_tensor *weight = csinn_alloc_tensor(sess); struct csinn_tensor *output = csinn_alloc_tensor(sess); @@ -56,7 +58,7 @@ int main(int argc, char **argv) } weight->dtype = CSINN_DTYPE_FLOAT32; weight->layout = CSINN_LAYOUT_NCHW; - weight->is_const = 0; + weight->is_const = 1; weight->quant_channel = 1; output->dim[0] = input->dim[0]; @@ -82,14 +84,14 @@ int main(int argc, char **argv) float difference = argc > 2 ? 
atof(argv[2]) : 0.99; #if (DTYPE == 32) - test_binary2_op(input, output, weight, params, CSINN_QUANT_FLOAT32, csinn_rms_norm_init, - csinn_rms_norm, &difference); + test_binary2_op(input, output, weight, params, CSINN_DTYPE_FLOAT32, CSINN_QUANT_FLOAT32, sess, + csinn_rms_norm_init, csinn_rms_norm, &difference); #elif (DTYPE == 16) - test_binary2_op(input, output, weight, params, CSINN_QUANT_FLOAT16, csinn_rms_norm_init, - csinn_rms_norm, &difference); + test_binary2_op(input, output, weight, params, CSINN_DTYPE_FLOAT16, CSINN_QUANT_FLOAT16, sess, + csinn_rms_norm_init, csinn_rms_norm, &difference); #elif (DTYPE == 8) - test_binary2_op(input, output, weight, params, CSINN_QUANT_INT8_SYM, csinn_rms_norm_init, - csinn_rms_norm, &difference); + test_binary2_op(input, output, weight, params, CSINN_DTYPE_INT8, CSINN_QUANT_INT8_ASYM, sess, + csinn_rms_norm_init, csinn_rms_norm, &difference); #endif return done_testing(); diff --git a/tests/validation_layer/sigmoid.cpp b/tests/validation_layer/sigmoid.cpp index b59a146d..76f555bf 100644 --- a/tests/validation_layer/sigmoid.cpp +++ b/tests/validation_layer/sigmoid.cpp @@ -23,7 +23,9 @@ int main(int argc, char **argv) init_testsuite("Testing function of sigmoid(layer).\n"); struct csinn_session *sess = csinn_alloc_session(); - sess->base_run_mode = CSINN_RM_LAYER; + sess->base_run_mode = CSINN_RM_CPU_GRAPH; + sess->model.save_mode = CSINN_RUN_ONLY; + sess->dynamic_shape = CSINN_FALSE; struct csinn_tensor *input = csinn_alloc_tensor(sess); struct csinn_tensor *output = csinn_alloc_tensor(sess); struct csinn_tensor *reference = csinn_alloc_tensor(sess); @@ -59,14 +61,14 @@ int main(int argc, char **argv) float difference = argc > 2 ? 
atof(argv[2]) : 0.99; #if (DTYPE == 32) - test_unary_op(input, output, params, CSINN_QUANT_FLOAT32, csinn_sigmoid_init, csinn_sigmoid, - &difference); + test_unary_op(input, output, params, CSINN_DTYPE_FLOAT32, CSINN_QUANT_FLOAT32, sess, + csinn_sigmoid_init, csinn_sigmoid, &difference); #elif (DTYPE == 16) - test_unary_op(input, output, params, CSINN_QUANT_FLOAT16, csinn_sigmoid_init, csinn_sigmoid, - &difference); + test_unary_op(input, output, params, CSINN_DTYPE_FLOAT16, CSINN_QUANT_FLOAT16, sess, + csinn_sigmoid_init, csinn_sigmoid, &difference); #elif (DTYPE == 8) - test_unary_op(input, output, params, CSINN_QUANT_INT8_SYM, csinn_sigmoid_init, csinn_sigmoid, - &difference); + test_unary_op(input, output, params, CSINN_DTYPE_INT8, CSINN_QUANT_INT8_ASYM, sess, + csinn_sigmoid_init, csinn_sigmoid, &difference); #endif return done_testing(); diff --git a/tests/validation_layer/silu.cpp b/tests/validation_layer/silu.cpp index 6f3e758a..11afaf20 100644 --- a/tests/validation_layer/silu.cpp +++ b/tests/validation_layer/silu.cpp @@ -23,7 +23,9 @@ int main(int argc, char **argv) init_testsuite("Testing function of silu(layer).\n"); struct csinn_session *sess = csinn_alloc_session(); - sess->base_run_mode = CSINN_RM_LAYER; + sess->base_run_mode = CSINN_RM_CPU_GRAPH; + sess->model.save_mode = CSINN_RUN_ONLY; + sess->dynamic_shape = CSINN_FALSE; struct csinn_tensor *input = csinn_alloc_tensor(sess); struct csinn_tensor *output = csinn_alloc_tensor(sess); struct csinn_tensor *reference = csinn_alloc_tensor(sess); @@ -59,14 +61,14 @@ int main(int argc, char **argv) float difference = argc > 2 ? 
atof(argv[2]) : 0.99; #if (DTYPE == 32) - test_unary_op(input, output, params, CSINN_QUANT_FLOAT32, csinn_silu_init, csinn_silu, - &difference); + test_unary_op(input, output, params, CSINN_DTYPE_FLOAT32, CSINN_QUANT_FLOAT32, sess, + csinn_silu_init, csinn_silu, &difference); #elif (DTYPE == 16) - test_unary_op(input, output, params, CSINN_QUANT_FLOAT16, csinn_silu_init, csinn_silu, - &difference); + test_unary_op(input, output, params, CSINN_DTYPE_FLOAT16, CSINN_QUANT_FLOAT16, sess, + csinn_silu_init, csinn_silu, &difference); #elif (DTYPE == 8) - test_unary_op(input, output, params, CSINN_QUANT_INT8_ASYM, csinn_silu_init, csinn_silu, - &difference); + test_unary_op(input, output, params, CSINN_DTYPE_INT8, CSINN_QUANT_INT8_ASYM, sess, + csinn_silu_init, csinn_silu, &difference); #endif return done_testing(); diff --git a/tests/validation_layer/softmax.cpp b/tests/validation_layer/softmax.cpp index 86f12ebb..09f3fe9b 100644 --- a/tests/validation_layer/softmax.cpp +++ b/tests/validation_layer/softmax.cpp @@ -23,7 +23,9 @@ int main(int argc, char **argv) init_testsuite("Testing function of softmax(layer)\n"); struct csinn_session *sess = csinn_alloc_session(); - sess->base_run_mode = CSINN_RM_LAYER; + sess->base_run_mode = CSINN_RM_CPU_GRAPH; + sess->model.save_mode = CSINN_RUN_ONLY; + sess->dynamic_shape = CSINN_FALSE; struct csinn_tensor *input = csinn_alloc_tensor(sess); struct csinn_tensor *output = csinn_alloc_tensor(sess); struct csinn_tensor *reference = csinn_alloc_tensor(sess); @@ -60,14 +62,14 @@ int main(int argc, char **argv) float difference = argc > 2 ? 
atof(argv[2]) : 0.99; #if (DTYPE == 32) - test_unary_op(input, output, params, CSINN_QUANT_FLOAT32, csinn_softmax_init, csinn_softmax, - &difference); + test_unary_op(input, output, params, CSINN_DTYPE_FLOAT32, CSINN_QUANT_FLOAT32, sess, + csinn_softmax_init, csinn_softmax, &difference); #elif (DTYPE == 16) - test_unary_op(input, output, params, CSINN_QUANT_FLOAT16, csinn_softmax_init, csinn_softmax, - &difference); + test_unary_op(input, output, params, CSINN_DTYPE_FLOAT16, CSINN_QUANT_FLOAT16, sess, + csinn_softmax_init, csinn_softmax, &difference); #elif (DTYPE == 8) - test_unary_op(input, output, params, CSINN_QUANT_INT8_SYM, csinn_softmax_init, csinn_softmax, - &difference); + test_unary_op(input, output, params, CSINN_DTYPE_INT8, CSINN_QUANT_INT8_ASYM, sess, + csinn_softmax_init, csinn_softmax, &difference); #endif return done_testing(); diff --git a/tests/validation_layer/split.cpp b/tests/validation_layer/split.cpp index 57eade29..975ab428 100644 --- a/tests/validation_layer/split.cpp +++ b/tests/validation_layer/split.cpp @@ -23,7 +23,9 @@ int main(int argc, char **argv) init_testsuite("Testing function of split(layer).\n"); struct csinn_session *sess = csinn_alloc_session(); - sess->base_run_mode = CSINN_RM_LAYER; + sess->base_run_mode = CSINN_RM_CPU_GRAPH; + sess->model.save_mode = CSINN_RUN_ONLY; + sess->dynamic_shape = CSINN_FALSE; struct csinn_tensor *input = csinn_alloc_tensor(sess); struct csinn_split_params *params = (csinn_split_params *)csinn_alloc_params(sizeof(struct csinn_split_params), sess); @@ -91,14 +93,14 @@ int main(int argc, char **argv) float difference = argc > 2 ? 
atof(argv[2]) : 0.99; #if (DTYPE == 32) - test_split_op(input, output, params, CSINN_QUANT_FLOAT32, csinn_split_init, csinn_split, - &difference); + test_split_op(input, output, params, CSINN_DTYPE_FLOAT32, CSINN_QUANT_FLOAT32, sess, + csinn_split_init, csinn_split, &difference); #elif (DTYPE == 16) - test_split_op(input, output, params, CSINN_QUANT_FLOAT16, csinn_split_init, csinn_split, - &difference); + test_split_op(input, output, params, CSINN_DTYPE_FLOAT16, CSINN_QUANT_FLOAT16, sess, + csinn_split_init, csinn_split, &difference); #elif (DTYPE == 8) - test_split_op(input, output, params, CSINN_QUANT_INT8_SYM, csinn_split_init, csinn_split, - &difference); + test_split_op(input, output, params, CSINN_DTYPE_INT8, CSINN_QUANT_INT8_ASYM, sess, + csinn_split_init, csinn_split, &difference); #endif return done_testing(); } diff --git a/tests/validation_layer/sqrt.cpp b/tests/validation_layer/sqrt.cpp index a06c6f0e..6d9e014c 100644 --- a/tests/validation_layer/sqrt.cpp +++ b/tests/validation_layer/sqrt.cpp @@ -23,7 +23,9 @@ int main(int argc, char **argv) init_testsuite("Testing function of sqrt(layer).\n"); struct csinn_session *sess = csinn_alloc_session(); - sess->base_run_mode = CSINN_RM_LAYER; + sess->base_run_mode = CSINN_RM_CPU_GRAPH; + sess->model.save_mode = CSINN_RUN_ONLY; + sess->dynamic_shape = CSINN_FALSE; struct csinn_tensor *input = csinn_alloc_tensor(sess); struct csinn_tensor *output = csinn_alloc_tensor(sess); struct csinn_tensor *reference = csinn_alloc_tensor(sess); @@ -62,14 +64,14 @@ int main(int argc, char **argv) float difference = argc > 2 ? 
atof(argv[2]) : 0.99; #if (DTYPE == 32) - test_unary_op(input, output, params, CSINN_QUANT_FLOAT32, csinn_sqrt_init, csinn_sqrt, - &difference); + test_unary_op(input, output, params, CSINN_DTYPE_FLOAT32, CSINN_QUANT_FLOAT32, sess, + csinn_sqrt_init, csinn_sqrt, &difference); #elif (DTYPE == 16) - test_unary_op(input, output, params, CSINN_QUANT_FLOAT16, csinn_sqrt_init, csinn_sqrt, - &difference); + test_unary_op(input, output, params, CSINN_DTYPE_FLOAT16, CSINN_QUANT_FLOAT16, sess, + csinn_sqrt_init, csinn_sqrt, &difference); #elif (DTYPE == 8) - test_unary_op(input, output, params, CSINN_QUANT_INT8_SYM, csinn_sqrt_init, csinn_sqrt, - &difference); + test_unary_op(input, output, params, CSINN_DTYPE_INT8, CSINN_QUANT_INT8_ASYM, sess, + csinn_sqrt_init, csinn_sqrt, &difference); #endif return done_testing(); diff --git a/tests/validation_layer/strided_slice.cpp b/tests/validation_layer/strided_slice.cpp index 43c4aaf8..f0740019 100644 --- a/tests/validation_layer/strided_slice.cpp +++ b/tests/validation_layer/strided_slice.cpp @@ -24,7 +24,9 @@ int main(int argc, char **argv) init_testsuite("Testing function of strided_slice(layer).\n"); struct csinn_session *sess = csinn_alloc_session(); - sess->base_run_mode = CSINN_RM_LAYER; + sess->base_run_mode = CSINN_RM_CPU_GRAPH; + sess->model.save_mode = CSINN_RUN_ONLY; + sess->dynamic_shape = CSINN_FALSE; struct csinn_tensor *input = csinn_alloc_tensor(sess); struct csinn_tensor *output = csinn_alloc_tensor(sess); struct csinn_tensor *reference = csinn_alloc_tensor(sess); @@ -78,14 +80,14 @@ int main(int argc, char **argv) float difference = argc > 2 ? 
atof(argv[2]) : 0.99; #if (DTYPE == 32) - test_unary_op(input, output, params, CSINN_QUANT_FLOAT32, csinn_strided_slice_init, - csinn_strided_slice, &difference); + test_unary_op(input, output, params, CSINN_DTYPE_FLOAT32, CSINN_QUANT_FLOAT32, sess, + csinn_strided_slice_init, csinn_strided_slice, &difference); #elif (DTYPE == 16) - test_unary_op(input, output, params, CSINN_QUANT_FLOAT16, csinn_strided_slice_init, - csinn_strided_slice, &difference); + test_unary_op(input, output, params, CSINN_DTYPE_FLOAT16, CSINN_QUANT_FLOAT16, sess, + csinn_strided_slice_init, csinn_strided_slice, &difference); #elif (DTYPE == 8) - test_unary_op(input, output, params, CSINN_QUANT_INT8_SYM, csinn_strided_slice_init, - csinn_strided_slice, &difference); + test_unary_op(input, output, params, CSINN_DTYPE_INT8, CSINN_QUANT_INT8_ASYM, sess, + csinn_strided_slice_init, csinn_strided_slice, &difference); #endif return done_testing(); diff --git a/tests/validation_layer/sub.cpp b/tests/validation_layer/sub.cpp index 16e2bdaf..8942f0c5 100644 --- a/tests/validation_layer/sub.cpp +++ b/tests/validation_layer/sub.cpp @@ -23,7 +23,9 @@ int main(int argc, char **argv) init_testsuite("Testing function of sub(layer).\n"); struct csinn_session *sess = csinn_alloc_session(); - sess->base_run_mode = CSINN_RM_LAYER; + sess->base_run_mode = CSINN_RM_CPU_GRAPH; + sess->model.save_mode = CSINN_RUN_ONLY; + sess->dynamic_shape = CSINN_FALSE; struct csinn_tensor *input0 = csinn_alloc_tensor(sess); struct csinn_tensor *input1 = csinn_alloc_tensor(sess); struct csinn_tensor *output = csinn_alloc_tensor(sess); @@ -77,14 +79,14 @@ int main(int argc, char **argv) float difference = argc > 2 ? 
atof(argv[2]) : 0.9; #if (DTYPE == 32) - test_binary_op(input0, input1, output, params, CSINN_QUANT_FLOAT32, csinn_sub_init, csinn_sub, - &difference); + test_binary_op(input0, input1, output, params, CSINN_DTYPE_FLOAT32, CSINN_QUANT_FLOAT32, sess, + csinn_sub_init, csinn_sub, &difference); #elif (DTYPE == 16) - test_binary_op(input0, input1, output, params, CSINN_QUANT_FLOAT16, csinn_sub_init, csinn_sub, - &difference); + test_binary_op(input0, input1, output, params, CSINN_DTYPE_FLOAT16, CSINN_QUANT_FLOAT16, sess, + csinn_sub_init, csinn_sub, &difference); #elif (DTYPE == 8) - test_binary_op(input0, input1, output, params, CSINN_QUANT_INT8_ASYM, csinn_sub_init, csinn_sub, - &difference); + test_binary_op(input0, input1, output, params, CSINN_DTYPE_INT8, CSINN_QUANT_INT8_ASYM, sess, + csinn_sub_init, csinn_sub, &difference); #endif return done_testing(); diff --git a/tests/validation_layer/sum_stride.cpp b/tests/validation_layer/sum_stride.cpp index 5222d379..43a69f71 100644 --- a/tests/validation_layer/sum_stride.cpp +++ b/tests/validation_layer/sum_stride.cpp @@ -23,7 +23,9 @@ int main(int argc, char **argv) init_testsuite("Testing function of sum(layer).\n"); struct csinn_session *sess = csinn_alloc_session(); - sess->base_run_mode = CSINN_RM_LAYER; + sess->base_run_mode = CSINN_RM_CPU_GRAPH; + sess->model.save_mode = CSINN_RUN_ONLY; + sess->dynamic_shape = CSINN_FALSE; struct csinn_tensor *input = csinn_alloc_tensor(sess); struct csinn_tensor *output = csinn_alloc_tensor(sess); struct csinn_tensor *reference = csinn_alloc_tensor(sess); @@ -90,14 +92,14 @@ int main(int argc, char **argv) params->base.layout = CSINN_LAYOUT_NCHW; #if (DTYPE == 32) - test_unary_op(input, output, params, CSINN_QUANT_FLOAT32, csinn_sum_init, csinn_sum, - &difference); + test_unary_op(input, output, params, CSINN_DTYPE_FLOAT32, CSINN_QUANT_FLOAT32, sess, + csinn_sum_init, csinn_sum, &difference); #elif (DTYPE == 16) - test_unary_op(input, output, params, CSINN_QUANT_FLOAT16, 
csinn_sum_init, csinn_sum, - &difference); + test_unary_op(input, output, params, CSINN_DTYPE_FLOAT16, CSINN_QUANT_FLOAT16, sess, + csinn_sum_init, csinn_sum, &difference); #elif (DTYPE == 8) - test_unary_op(input, output, params, CSINN_QUANT_INT8_ASYM, csinn_sum_init, csinn_sum, - &difference); + test_unary_op(input, output, params, CSINN_DTYPE_INT8, CSINN_QUANT_INT8_ASYM, sess, + csinn_sum_init, csinn_sum, &difference); #endif return done_testing(); diff --git a/tests/validation_layer/testutil.h b/tests/validation_layer/testutil.h index 382dcf2f..da124596 100644 --- a/tests/validation_layer/testutil.h +++ b/tests/validation_layer/testutil.h @@ -21,6 +21,7 @@ #include #include +#include "csi_nn.h" #include "shl_ref.h" #include "test_utils.h" @@ -41,23 +42,89 @@ void set_layout(struct csinn_tensor *t) template void test_unary_op(struct csinn_tensor *input, struct csinn_tensor *output, T *params, - enum csinn_quant_enum quant_dtype, + enum csinn_dtype_enum dtype, enum csinn_quant_enum quant_type, + struct csinn_session *sess, int (*init_op)(struct csinn_tensor *, struct csinn_tensor *, T *), int (*unary_op)(struct csinn_tensor *, struct csinn_tensor *, T *), float *difference) { - enum csinn_quant_enum test_dtype = quant_dtype; + sess->base_dtype = dtype; + sess->base_quant_type = quant_type; int test_api = params->base.api; struct csinn_tensor *qinput = - convert_f32_layer(input, test_dtype, (enum csinn_api_enum)test_api); + convert_f32_layer(input, quant_type, (enum csinn_api_enum)test_api); struct csinn_tensor *qoutput = - convert_f32_layer(output, test_dtype, (enum csinn_api_enum)test_api); + convert_f32_layer(output, quant_type, (enum csinn_api_enum)test_api); + + struct csinn_tensor *real_input = + convert_f32_layer(input, quant_type, (enum csinn_api_enum)test_api); + csinn_session_init(sess); + csinn_set_input_number(1, sess); + csinn_set_output_number(1, sess); + if (init_op(qinput, qoutput, params) == CSINN_TRUE) { + csinn_set_tensor_entry(qinput, sess); + 
csinn_set_input(0, qinput, sess); unary_op(qinput, qoutput, params); + csinn_set_output(0, qoutput, sess); + csinn_session_setup(sess); + csinn_update_input(0, real_input, sess); + csinn_session_run(sess); + + csinn_get_output(0, qoutput, sess); struct csinn_tensor *foutput = shl_ref_tensor_transform_f32(qoutput); result_verify_f32((float *)output->data, (float *)foutput->data, (float *)input->data, *difference, csinn_tensor_size(output), false); + + free_input(real_input); + shl_ref_tensor_transform_free_f32(foutput); + csinn_session_deinit(sess); + csinn_free_session(sess); + } else { + printf("Function init failed\n"); + exit(-1); + } +} + +template +void test_maxpool_op(struct csinn_tensor *input, struct csinn_tensor *output, T *params, + enum csinn_dtype_enum dtype, enum csinn_quant_enum quant_type, + struct csinn_session *sess, + int (*init_op)(struct csinn_tensor *, struct csinn_tensor *, T *), + int (*unary_op)(struct csinn_tensor *, struct csinn_tensor *, T *), + float *difference) +{ + sess->base_dtype = dtype; + sess->base_quant_type = quant_type; + int test_api = params->base.api; + struct csinn_tensor *qinput = + convert_f32_layer(input, quant_type, (enum csinn_api_enum)test_api); + struct csinn_tensor *qoutput = broadcast_quant_info(qinput, output, dtype); + + struct csinn_tensor *real_input = + convert_f32_layer(input, quant_type, (enum csinn_api_enum)test_api); + csinn_session_init(sess); + csinn_set_input_number(1, sess); + csinn_set_output_number(1, sess); + + if (init_op(qinput, qoutput, params) == CSINN_TRUE) { + csinn_set_tensor_entry(qinput, sess); + csinn_set_input(0, qinput, sess); + unary_op(qinput, qoutput, params); + csinn_set_output(0, qoutput, sess); + csinn_session_setup(sess); + csinn_update_input(0, real_input, sess); + csinn_session_run(sess); + + csinn_get_output(0, qoutput, sess); + struct csinn_tensor *foutput = shl_ref_tensor_transform_f32(qoutput); + result_verify_f32((float *)output->data, (float *)foutput->data, (float 
*)input->data, + *difference, csinn_tensor_size(output), false); + + free_input(real_input); shl_ref_tensor_transform_free_f32(foutput); + csinn_session_deinit(sess); + csinn_free_session(sess); } else { printf("Function init failed\n"); exit(-1); @@ -66,27 +133,69 @@ void test_unary_op(struct csinn_tensor *input, struct csinn_tensor *output, T *p template void test_binary_op(struct csinn_tensor *input0, struct csinn_tensor *input1, - struct csinn_tensor *output, T *params, enum csinn_quant_enum quant_dtype, + struct csinn_tensor *output, T *params, enum csinn_dtype_enum dtype, + enum csinn_quant_enum quant_type, struct csinn_session *sess, int (*init_op)(struct csinn_tensor *, struct csinn_tensor *, struct csinn_tensor *, T *), int (*binary_op)(struct csinn_tensor *, struct csinn_tensor *, struct csinn_tensor *, T *), float *difference) { - enum csinn_quant_enum test_dtype = quant_dtype; + sess->base_dtype = dtype; + sess->base_quant_type = quant_type; int test_api = params->base.api; - struct csinn_tensor *qinput0 = - convert_f32_layer(input0, test_dtype, (enum csinn_api_enum)test_api); - struct csinn_tensor *qinput1 = - convert_f32_layer(input1, test_dtype, (enum csinn_api_enum)test_api); - struct csinn_tensor *qoutput = - convert_f32_layer(output, test_dtype, (enum csinn_api_enum)test_api); + + struct csinn_tensor *qinput0; + struct csinn_tensor *qinput1; + + struct csinn_tensor *qoutput; + struct csinn_tensor *real_input0; + struct csinn_tensor *real_input1; + + if (quant_type == CSINN_QUANT_FLOAT16_W_INT8) { + qinput0 = convert_f32_layer(input0, CSINN_QUANT_FLOAT16, (enum csinn_api_enum)test_api); + qinput1 = convert_f32_layer(input1, CSINN_QUANT_INT8_SYM, (enum csinn_api_enum)test_api); + + qoutput = convert_f32_layer(output, CSINN_QUANT_FLOAT16, (enum csinn_api_enum)test_api); + real_input0 = convert_f32_layer(input0, CSINN_QUANT_FLOAT16, (enum csinn_api_enum)test_api); + real_input1 = + convert_f32_layer(input1, CSINN_QUANT_INT8_SYM, (enum 
csinn_api_enum)test_api); + + } else { + qinput0 = convert_f32_layer(input0, quant_type, (enum csinn_api_enum)test_api); + qinput1 = convert_f32_layer(input1, quant_type, (enum csinn_api_enum)test_api); + qoutput = convert_f32_layer(output, quant_type, (enum csinn_api_enum)test_api); + + real_input0 = convert_f32_layer(input0, quant_type, (enum csinn_api_enum)test_api); + real_input1 = convert_f32_layer(input1, quant_type, (enum csinn_api_enum)test_api); + } + + csinn_session_init(sess); + csinn_set_input_number(2, sess); + csinn_set_output_number(1, sess); + if (init_op(qinput0, qinput1, qoutput, params) == CSINN_TRUE) { + csinn_set_tensor_entry(qinput0, sess); + csinn_set_tensor_entry(qinput1, sess); + csinn_set_input(0, qinput0, sess); + csinn_set_input(1, qinput1, sess); binary_op(qinput0, qinput1, qoutput, params); + csinn_set_output(0, qoutput, sess); + csinn_session_setup(sess); + csinn_update_input(0, real_input0, sess); + csinn_update_input(1, real_input1, sess); + csinn_session_run(sess); + + csinn_get_output(0, qoutput, sess); struct csinn_tensor *foutput = shl_ref_tensor_transform_f32(qoutput); result_verify_f32((float *)output->data, (float *)foutput->data, (float *)input0->data, *difference, csinn_tensor_size(output), false); + + free_input(real_input0); + free_input(real_input1); shl_ref_tensor_transform_free_f32(foutput); + csinn_session_deinit(sess); + csinn_free_session(sess); } else { printf("Function init failed\n"); exit(-1); @@ -94,28 +203,113 @@ void test_binary_op(struct csinn_tensor *input0, struct csinn_tensor *input1, } template -void test_binary2_op(struct csinn_tensor *input, struct csinn_tensor *output, - struct csinn_tensor *weight, T *params, enum csinn_quant_enum quant_dtype, - int (*init_op)(struct csinn_tensor *, struct csinn_tensor *, - struct csinn_tensor *, T *), - int (*conv2d_op)(struct csinn_tensor *, struct csinn_tensor *, - struct csinn_tensor *, T *), - float *difference) +void test_matmul_op(struct csinn_tensor *input0, 
struct csinn_tensor *input1, + struct csinn_tensor *output, T *params, enum csinn_dtype_enum dtype, + enum csinn_quant_enum quant_type, struct csinn_session *sess, + int (*init_op)(struct csinn_tensor *, struct csinn_tensor *, + struct csinn_tensor *, T *), + int (*binary_op)(struct csinn_tensor *, struct csinn_tensor *, + struct csinn_tensor *, T *), + float *difference) { - enum csinn_quant_enum test_dtype = quant_dtype; + sess->base_dtype = dtype; + sess->base_quant_type = quant_type; + int test_api = params->base.api; + + struct csinn_tensor *qinput0; + struct csinn_tensor *qinput1; + + struct csinn_tensor *qoutput; + struct csinn_tensor *real_input0; + struct csinn_tensor *real_input1; + + if (quant_type == CSINN_QUANT_FLOAT16_W_INT8) { + qinput0 = convert_f32_layer(input0, CSINN_QUANT_FLOAT16, (enum csinn_api_enum)test_api); + qinput1 = convert_f32_layer(input1, CSINN_QUANT_INT8_SYM, (enum csinn_api_enum)test_api); + + qoutput = convert_f32_layer(output, CSINN_QUANT_FLOAT16, (enum csinn_api_enum)test_api); + real_input0 = convert_f32_layer(input0, CSINN_QUANT_FLOAT16, (enum csinn_api_enum)test_api); + + } else { + qinput0 = convert_f32_layer(input0, quant_type, (enum csinn_api_enum)test_api); + qinput1 = convert_f32_layer(input1, quant_type, (enum csinn_api_enum)test_api); + + qoutput = convert_f32_layer(output, quant_type, (enum csinn_api_enum)test_api); + real_input0 = convert_f32_layer(input0, quant_type, (enum csinn_api_enum)test_api); + } + + csinn_session_init(sess); + csinn_set_input_number(1, sess); + csinn_set_output_number(1, sess); + + if (init_op(qinput0, qinput1, qoutput, params) == CSINN_TRUE) { + csinn_set_tensor_entry(qinput0, sess); + csinn_set_input(0, qinput0, sess); + + binary_op(qinput0, qinput1, qoutput, params); + csinn_set_output(0, qoutput, sess); + csinn_session_setup(sess); + csinn_update_input(0, real_input0, sess); + csinn_session_run(sess); + + csinn_get_output(0, qoutput, sess); + struct csinn_tensor *foutput = 
shl_ref_tensor_transform_f32(qoutput); + result_verify_f32((float *)output->data, (float *)foutput->data, (float *)input0->data, + *difference, csinn_tensor_size(output), false); + + free_input(real_input0); + shl_ref_tensor_transform_free_f32(foutput); + csinn_session_deinit(sess); + csinn_free_session(sess); + } else { + printf("Function init failed\n"); + exit(-1); + } +} + +template +void test_binary2_op( + struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_tensor *weight, T *params, + enum csinn_dtype_enum dtype, enum csinn_quant_enum quant_type, struct csinn_session *sess, + int (*init_op)(struct csinn_tensor *, struct csinn_tensor *, struct csinn_tensor *, T *), + int (*binary2_op)(struct csinn_tensor *, struct csinn_tensor *, struct csinn_tensor *, T *), + float *difference) +{ + sess->base_dtype = dtype; + sess->base_quant_type = quant_type; int test_api = params->base.api; struct csinn_tensor *qinput = - convert_f32_layer(input, test_dtype, (enum csinn_api_enum)test_api); + convert_f32_layer(input, quant_type, (enum csinn_api_enum)test_api); struct csinn_tensor *qoutput = - convert_f32_layer(output, test_dtype, (enum csinn_api_enum)test_api); + convert_f32_layer(output, quant_type, (enum csinn_api_enum)test_api); struct csinn_tensor *qweight = - convert_f32_layer(weight, test_dtype, (enum csinn_api_enum)test_api); - if (init_op(qinput, qoutput, qweight, params) == CSINN_TRUE) { - conv2d_op(qinput, qoutput, qweight, params); + convert_f32_layer(weight, quant_type, (enum csinn_api_enum)test_api); + + struct csinn_tensor *real_input = + convert_f32_layer(input, quant_type, (enum csinn_api_enum)test_api); + csinn_session_init(sess); + csinn_set_input_number(1, sess); + csinn_set_output_number(1, sess); + + if (init_op(qinput, qweight, qoutput, params) == CSINN_TRUE) { + csinn_set_tensor_entry(qinput, sess); + csinn_set_input(0, qinput, sess); + + binary2_op(qinput, qweight, qoutput, params); + csinn_set_output(0, qoutput, sess); + 
csinn_session_setup(sess); + csinn_update_input(0, real_input, sess); + csinn_session_run(sess); + + csinn_get_output(0, qoutput, sess); struct csinn_tensor *foutput = shl_ref_tensor_transform_f32(qoutput); result_verify_f32((float *)output->data, (float *)foutput->data, (float *)input->data, *difference, csinn_tensor_size(output), false); + + free_input(real_input); shl_ref_tensor_transform_free_f32(foutput); + csinn_session_deinit(sess); + csinn_free_session(sess); } else { printf("Function init failed\n"); exit(-1); @@ -125,29 +319,49 @@ void test_binary2_op(struct csinn_tensor *input, struct csinn_tensor *output, template void test_ternary_op(struct csinn_tensor *input0, struct csinn_tensor *output, struct csinn_tensor *input1, struct csinn_tensor *input2, T *params, - enum csinn_quant_enum quant_dtype, + enum csinn_dtype_enum dtype, enum csinn_quant_enum quant_type, + struct csinn_session *sess, int (*init_op)(struct csinn_tensor *, struct csinn_tensor *, struct csinn_tensor *, struct csinn_tensor *, T *), - int (*conv2d_op)(struct csinn_tensor *, struct csinn_tensor *, - struct csinn_tensor *, struct csinn_tensor *, T *), + int (*ternary_op)(struct csinn_tensor *, struct csinn_tensor *, + struct csinn_tensor *, struct csinn_tensor *, T *), float *difference) { - enum csinn_quant_enum test_dtype = quant_dtype; + sess->base_dtype = dtype; + sess->base_quant_type = quant_type; int test_api = params->base.api; struct csinn_tensor *qinput0 = - convert_f32_layer(input0, test_dtype, (enum csinn_api_enum)test_api); + convert_f32_layer(input0, quant_type, (enum csinn_api_enum)test_api); struct csinn_tensor *qinput1 = - convert_f32_layer(input1, test_dtype, (enum csinn_api_enum)test_api); + convert_f32_layer(input1, quant_type, (enum csinn_api_enum)test_api); struct csinn_tensor *qinput2 = - convert_f32_layer(input2, test_dtype, (enum csinn_api_enum)test_api); + convert_f32_layer(input2, quant_type, (enum csinn_api_enum)test_api); struct csinn_tensor *qoutput = - 
convert_f32_layer(output, test_dtype, (enum csinn_api_enum)test_api); + convert_f32_layer(output, quant_type, (enum csinn_api_enum)test_api); + + struct csinn_tensor *real_input = + convert_f32_layer(input0, quant_type, (enum csinn_api_enum)test_api); + + csinn_session_init(sess); + csinn_set_input_number(1, sess); + csinn_set_output_number(1, sess); if (init_op(qinput0, qoutput, qinput1, qinput2, params) == CSINN_TRUE) { - conv2d_op(qinput0, qoutput, qinput1, qinput2, params); + csinn_set_tensor_entry(qinput0, sess); + csinn_set_input(0, qinput0, sess); + ternary_op(qinput0, qoutput, qinput1, qinput2, params); + csinn_set_output(0, qoutput, sess); + csinn_session_setup(sess); + csinn_update_input(0, real_input, sess); + csinn_session_run(sess); + + csinn_get_output(0, qoutput, sess); struct csinn_tensor *foutput = shl_ref_tensor_transform_f32(qoutput); result_verify_f32((float *)output->data, (float *)foutput->data, (float *)input0->data, *difference, csinn_tensor_size(output), false); + free_input(real_input); shl_ref_tensor_transform_free_f32(foutput); + csinn_session_deinit(sess); + csinn_free_session(sess); } else { printf("Function init failed\n"); exit(-1); @@ -156,25 +370,53 @@ void test_ternary_op(struct csinn_tensor *input0, struct csinn_tensor *output, template void test_concat_op(struct csinn_tensor **input, struct csinn_tensor *output, T *params, - enum csinn_quant_enum quant_dtype, + enum csinn_dtype_enum dtype, enum csinn_quant_enum quant_type, + struct csinn_session *sess, int (*init_op)(struct csinn_tensor **, struct csinn_tensor *, T *), int (*unary_op)(struct csinn_tensor **, struct csinn_tensor *, T *), float *difference) { - enum csinn_quant_enum test_dtype = quant_dtype; + sess->base_dtype = dtype; + sess->base_quant_type = quant_type; int test_api = params->base.api; struct csinn_tensor *qinput[params->inputs_count]; + struct csinn_tensor *real_input[params->inputs_count]; for (int i = 0; i < params->inputs_count; i++) { - qinput[i] = 
convert_f32_layer(input[i], test_dtype, (enum csinn_api_enum)test_api); + qinput[i] = convert_f32_layer(input[i], quant_type, (enum csinn_api_enum)test_api); + real_input[i] = convert_f32_layer(input[i], quant_type, (enum csinn_api_enum)test_api); } struct csinn_tensor *qoutput = - convert_f32_layer(output, test_dtype, (enum csinn_api_enum)test_api); + convert_f32_layer(output, quant_type, (enum csinn_api_enum)test_api); + csinn_session_init(sess); + csinn_set_input_number(params->inputs_count, sess); + csinn_set_output_number(1, sess); + if (init_op((struct csinn_tensor **)qinput, qoutput, params) == CSINN_TRUE) { + for (int i = 0; i < params->inputs_count; i++) { + csinn_set_tensor_entry(qinput[i], sess); + csinn_set_input(i, qinput[i], sess); + } unary_op((struct csinn_tensor **)qinput, qoutput, params); + + csinn_set_output(0, qoutput, sess); + csinn_session_setup(sess); + + for (int i = 0; i < params->inputs_count; i++) { + csinn_update_input(i, real_input[i], sess); + } + csinn_session_run(sess); + csinn_get_output(0, qoutput, sess); struct csinn_tensor *foutput = shl_ref_tensor_transform_f32(qoutput); result_verify_f32((float *)output->data, (float *)foutput->data, (float *)input[0]->data, *difference, csinn_tensor_size(output), false); + + for (int i = 0; i < params->inputs_count; i++) { + free_input(real_input[i]); + } + shl_ref_tensor_transform_free_f32(foutput); + csinn_session_deinit(sess); + csinn_free_session(sess); } else { printf("Function init failed\n"); exit(-1); @@ -183,30 +425,50 @@ void test_concat_op(struct csinn_tensor **input, struct csinn_tensor *output, T template void test_split_op(struct csinn_tensor *input, struct csinn_tensor **output, T *params, - enum csinn_quant_enum quant_dtype, + enum csinn_dtype_enum dtype, enum csinn_quant_enum quant_type, + struct csinn_session *sess, int (*init_op)(struct csinn_tensor *, struct csinn_tensor **, T *), int (*unary_op)(struct csinn_tensor *, struct csinn_tensor **, T *), float *difference) { - 
enum csinn_quant_enum test_dtype = quant_dtype; + sess->base_dtype = dtype; + sess->base_quant_type = quant_type; int test_api = params->base.api; struct csinn_tensor *qinput = - convert_f32_layer(input, test_dtype, (enum csinn_api_enum)test_api); + convert_f32_layer(input, quant_type, (enum csinn_api_enum)test_api); struct csinn_tensor *qoutput[params->output_num]; int output_size = 0; int o_size[params->output_num]; for (int i = 0; i < params->output_num; i++) { - qoutput[i] = convert_f32_layer(output[i], test_dtype, (enum csinn_api_enum)test_api); + qoutput[i] = broadcast_quant_info(qinput, output[i], dtype); o_size[i] = csinn_tensor_size(output[i]); output_size += o_size[i]; } + struct csinn_tensor *real_input = + convert_f32_layer(input, quant_type, (enum csinn_api_enum)test_api); + csinn_session_init(sess); + csinn_set_input_number(1, sess); + csinn_set_output_number(params->output_num, sess); + if (init_op(qinput, (struct csinn_tensor **)qoutput, params) == CSINN_TRUE) { + csinn_set_tensor_entry(qinput, sess); + csinn_set_input(0, qinput, sess); + unary_op(qinput, (struct csinn_tensor **)qoutput, params); + + for (int i = 0; i < params->output_num; i++) { + csinn_set_output(i, qoutput[i], sess); + } + csinn_session_setup(sess); + + csinn_update_input(0, real_input, sess); + csinn_session_run(sess); struct csinn_tensor *foutput[params->output_num]; float *output_data = (float *)malloc(output_size * sizeof(float)); float *foutput_data = (float *)malloc(output_size * sizeof(float)); int acc_size = 0; for (int i = 0; i < params->output_num; i++) { + csinn_get_output(i, qoutput[i], sess); foutput[i] = shl_ref_tensor_transform_f32(qoutput[i]); memcpy(output_data + acc_size, output[i]->data, o_size[i] * sizeof(float)); memcpy(foutput_data + acc_size, foutput[i]->data, o_size[i] * sizeof(float)); @@ -217,6 +479,9 @@ void test_split_op(struct csinn_tensor *input, struct csinn_tensor **output, T * false); free(output_data); free(foutput_data); + 
free_input(real_input); + csinn_session_deinit(sess); + csinn_free_session(sess); } else { printf("Function init failed\n"); exit(-1); @@ -226,46 +491,79 @@ void test_split_op(struct csinn_tensor *input, struct csinn_tensor **output, T * template void test_conv2d_op(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_tensor *kernel, struct csinn_tensor *bias, T *params, - enum csinn_quant_enum quant_dtype, + enum csinn_dtype_enum dtype, enum csinn_quant_enum quant_type, + struct csinn_session *sess, int (*init_op)(struct csinn_tensor *, struct csinn_tensor *, struct csinn_tensor *, struct csinn_tensor *, T *), int (*conv2d_op)(struct csinn_tensor *, struct csinn_tensor *, struct csinn_tensor *, struct csinn_tensor *, T *), float *difference) { - enum csinn_quant_enum test_dtype = quant_dtype; + sess->base_dtype = dtype; + sess->base_quant_type = quant_type; + params->base.quant_type = quant_type; int test_api = params->base.api; struct csinn_tensor *qbias; struct csinn_tensor *qinput; + struct csinn_tensor *qkernel; - struct csinn_tensor *qkernel = - convert_f32_layer(kernel, test_dtype, (enum csinn_api_enum)test_api); + struct csinn_tensor *qoutput; + struct csinn_tensor *real_input; - if (test_dtype == CSINN_QUANT_INT8_SYM) { - params->base.quant_type = CSINN_QUANT_INT8_ASYM_W_SYM; + if (quant_type == CSINN_QUANT_INT8_ASYM_W_SYM) { if (!params->conv_extra.fuse_zp2bias) { + qkernel = + convert_f32_layer(kernel, CSINN_QUANT_INT8_SYM, (enum csinn_api_enum)test_api); qinput = convert_f32_layer(input, CSINN_QUANT_INT8_ASYM, (enum csinn_api_enum)test_api); qbias = convert_f32_bias(input, kernel, bias, (enum csinn_api_enum)test_api); } else { + qkernel = + convert_f32_layer(kernel, CSINN_QUANT_INT8_SYM, (enum csinn_api_enum)test_api); qbias = fuse_zp_to_bias(input, kernel, bias, (enum csinn_api_enum)test_api); qinput = convert_f32_layer(input, CSINN_QUANT_INT8_ASYM, (enum csinn_api_enum)test_api); qinput->qinfo->zero_point = 0; } + qoutput = 
convert_f32_layer(output, CSINN_QUANT_INT8_ASYM, (enum csinn_api_enum)test_api); + real_input = convert_f32_layer(input, CSINN_QUANT_INT8_ASYM, (enum csinn_api_enum)test_api); + + } else if (quant_type == CSINN_QUANT_FLOAT16_W_INT8) { + qkernel = convert_f32_layer(kernel, CSINN_QUANT_INT8_SYM, (enum csinn_api_enum)test_api); + qinput = convert_f32_layer(input, CSINN_QUANT_FLOAT16, (enum csinn_api_enum)test_api); + qbias = convert_f32_layer(bias, CSINN_QUANT_FLOAT16, (enum csinn_api_enum)test_api); + + qoutput = convert_f32_layer(output, CSINN_QUANT_FLOAT16, (enum csinn_api_enum)test_api); + real_input = convert_f32_layer(input, CSINN_QUANT_FLOAT16, (enum csinn_api_enum)test_api); + } else { - qbias = convert_f32_layer(bias, test_dtype, (enum csinn_api_enum)test_api); - qinput = convert_f32_layer(input, test_dtype, (enum csinn_api_enum)test_api); + qkernel = convert_f32_layer(kernel, quant_type, (enum csinn_api_enum)test_api); + qbias = convert_f32_layer(bias, quant_type, (enum csinn_api_enum)test_api); + qinput = convert_f32_layer(input, quant_type, (enum csinn_api_enum)test_api); + qoutput = convert_f32_layer(output, quant_type, (enum csinn_api_enum)test_api); + real_input = convert_f32_layer(input, quant_type, (enum csinn_api_enum)test_api); } - struct csinn_tensor *qoutput = - convert_f32_layer(output, test_dtype, (enum csinn_api_enum)test_api); + csinn_session_init(sess); + csinn_set_input_number(1, sess); + csinn_set_output_number(1, sess); if (init_op(qinput, qoutput, qkernel, qbias, params) == CSINN_TRUE) { + csinn_set_tensor_entry(qinput, sess); + csinn_set_input(0, qinput, sess); conv2d_op(qinput, qoutput, qkernel, qbias, params); + csinn_set_output(0, qoutput, sess); + csinn_session_setup(sess); + csinn_update_input(0, real_input, sess); + csinn_session_run(sess); + + csinn_get_output(0, qoutput, sess); struct csinn_tensor *foutput = shl_ref_tensor_transform_f32(qoutput); result_verify_f32((float *)output->data, (float *)foutput->data, (float 
*)input->data, *difference, csinn_tensor_size(output), false); + free_input(real_input); shl_ref_tensor_transform_free_f32(foutput); + csinn_session_deinit(sess); + csinn_free_session(sess); } else { printf("Function init failed\n"); exit(-1); @@ -275,40 +573,72 @@ void test_conv2d_op(struct csinn_tensor *input, struct csinn_tensor *output, template void test_fully_op(struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_tensor *kernel, struct csinn_tensor *bias, T *params, - enum csinn_quant_enum quant_dtype, + enum csinn_dtype_enum dtype, enum csinn_quant_enum quant_type, + struct csinn_session *sess, int (*init_op)(struct csinn_tensor *, struct csinn_tensor *, struct csinn_tensor *, struct csinn_tensor *, T *), int (*conv2d_op)(struct csinn_tensor *, struct csinn_tensor *, struct csinn_tensor *, struct csinn_tensor *, T *), float *difference) { - enum csinn_quant_enum test_dtype = quant_dtype; + sess->base_dtype = dtype; + sess->base_quant_type = quant_type; + params->base.quant_type = quant_type; int test_api = params->base.api; struct csinn_tensor *qbias; struct csinn_tensor *qinput; + struct csinn_tensor *qkernel; - struct csinn_tensor *qkernel = - convert_f32_layer(kernel, test_dtype, (enum csinn_api_enum)test_api); + struct csinn_tensor *qoutput; + struct csinn_tensor *real_input; - if (test_dtype == CSINN_QUANT_INT8_SYM) { - params->base.quant_type = CSINN_QUANT_INT8_ASYM_W_SYM; + if (quant_type == CSINN_QUANT_INT8_ASYM_W_SYM) { + qkernel = convert_f32_layer(kernel, CSINN_QUANT_INT8_SYM, (enum csinn_api_enum)test_api); qbias = fuse_zp_to_bias(input, kernel, bias, (enum csinn_api_enum)test_api); qinput = convert_f32_layer(input, CSINN_QUANT_INT8_ASYM, (enum csinn_api_enum)test_api); qinput->qinfo->zero_point = 0; + + qoutput = convert_f32_layer(output, CSINN_QUANT_INT8_ASYM, (enum csinn_api_enum)test_api); + real_input = convert_f32_layer(input, CSINN_QUANT_INT8_ASYM, (enum csinn_api_enum)test_api); + + } else if (quant_type == 
CSINN_QUANT_FLOAT16_W_INT8) { + qkernel = convert_f32_layer(kernel, CSINN_QUANT_INT8_SYM, (enum csinn_api_enum)test_api); + qinput = convert_f32_layer(input, CSINN_QUANT_FLOAT16, (enum csinn_api_enum)test_api); + qbias = convert_f32_layer(bias, CSINN_QUANT_FLOAT16, (enum csinn_api_enum)test_api); + + qoutput = convert_f32_layer(output, CSINN_QUANT_FLOAT16, (enum csinn_api_enum)test_api); + real_input = convert_f32_layer(input, CSINN_QUANT_FLOAT16, (enum csinn_api_enum)test_api); + } else { - qbias = convert_f32_layer(bias, test_dtype, (enum csinn_api_enum)test_api); - qinput = convert_f32_layer(input, test_dtype, (enum csinn_api_enum)test_api); + qbias = convert_f32_layer(bias, quant_type, (enum csinn_api_enum)test_api); + qinput = convert_f32_layer(input, quant_type, (enum csinn_api_enum)test_api); + qkernel = convert_f32_layer(kernel, quant_type, (enum csinn_api_enum)test_api); + + qoutput = convert_f32_layer(output, quant_type, (enum csinn_api_enum)test_api); + real_input = convert_f32_layer(input, quant_type, (enum csinn_api_enum)test_api); } - struct csinn_tensor *qoutput = - convert_f32_layer(output, test_dtype, (enum csinn_api_enum)test_api); + csinn_session_init(sess); + csinn_set_input_number(1, sess); + csinn_set_output_number(1, sess); if (init_op(qinput, qoutput, qkernel, qbias, params) == CSINN_TRUE) { + csinn_set_tensor_entry(qinput, sess); + csinn_set_input(0, qinput, sess); conv2d_op(qinput, qoutput, qkernel, qbias, params); + csinn_set_output(0, qoutput, sess); + csinn_session_setup(sess); + csinn_update_input(0, real_input, sess); + csinn_session_run(sess); + + csinn_get_output(0, qoutput, sess); struct csinn_tensor *foutput = shl_ref_tensor_transform_f32(qoutput); result_verify_f32((float *)output->data, (float *)foutput->data, (float *)input->data, *difference, csinn_tensor_size(output), false); + free_input(real_input); shl_ref_tensor_transform_free_f32(foutput); + csinn_session_deinit(sess); + csinn_free_session(sess); } else { 
printf("Function init failed\n"); exit(-1); @@ -318,27 +648,47 @@ void test_fully_op(struct csinn_tensor *input, struct csinn_tensor *output, template void test_where_op(struct csinn_tensor *input0, struct csinn_tensor *input1, struct csinn_tensor *input2, struct csinn_tensor *output, T *params, - enum csinn_quant_enum quant_dtype, + enum csinn_dtype_enum dtype, enum csinn_quant_enum quant_type, + struct csinn_session *sess, int (*init_op)(struct csinn_tensor *, struct csinn_tensor *, struct csinn_tensor *, struct csinn_tensor *, T *), int (*trinary_op)(struct csinn_tensor *, struct csinn_tensor *, struct csinn_tensor *, struct csinn_tensor *, T *), float *difference) { - enum csinn_quant_enum test_dtype = quant_dtype; + sess->base_dtype = dtype; + sess->base_quant_type = quant_type; int test_api = params->base.api; struct csinn_tensor *qinput1 = - convert_f32_layer(input1, test_dtype, (enum csinn_api_enum)test_api); + convert_f32_layer(input1, quant_type, (enum csinn_api_enum)test_api); struct csinn_tensor *qinput2 = - convert_f32_layer(input2, test_dtype, (enum csinn_api_enum)test_api); + convert_f32_layer(input2, quant_type, (enum csinn_api_enum)test_api); struct csinn_tensor *qoutput = - convert_f32_layer(output, test_dtype, (enum csinn_api_enum)test_api); + convert_f32_layer(output, quant_type, (enum csinn_api_enum)test_api); + + struct csinn_tensor *real_input = input0; + csinn_session_init(sess); + csinn_set_input_number(1, sess); + csinn_set_output_number(1, sess); + if (init_op(input0, qinput1, qinput2, qoutput, params) == CSINN_TRUE) { + csinn_set_tensor_entry(input0, sess); + csinn_set_input(0, input0, sess); trinary_op(input0, qinput1, qinput2, qoutput, params); + + csinn_set_output(0, qoutput, sess); + csinn_session_setup(sess); + csinn_update_input(0, real_input, sess); + csinn_session_run(sess); + + csinn_get_output(0, qoutput, sess); struct csinn_tensor *foutput = shl_ref_tensor_transform_f32(qoutput); result_verify_f32((float *)output->data, (float 
*)foutput->data, (float *)input0->data, *difference, csinn_tensor_size(output), false); + free_input(real_input); shl_ref_tensor_transform_free_f32(foutput); + csinn_session_deinit(sess); + csinn_free_session(sess); } else { printf("Function init failed\n"); exit(-1); @@ -346,25 +696,45 @@ void test_where_op(struct csinn_tensor *input0, struct csinn_tensor *input1, } template -void test_where_softmax_op( - struct csinn_tensor *input0, struct csinn_tensor *input1, struct csinn_tensor *output, - T *params, enum csinn_quant_enum quant_dtype, - int (*init_op)(struct csinn_tensor *, struct csinn_tensor *, struct csinn_tensor *, T *), - int (*binary_op)(struct csinn_tensor *, struct csinn_tensor *, struct csinn_tensor *, T *), - float *difference) +void test_where_softmax_op(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, T *params, enum csinn_dtype_enum dtype, + enum csinn_quant_enum quant_type, struct csinn_session *sess, + int (*init_op)(struct csinn_tensor *, struct csinn_tensor *, + struct csinn_tensor *, T *), + int (*binary_op)(struct csinn_tensor *, struct csinn_tensor *, + struct csinn_tensor *, T *), + float *difference) { - enum csinn_quant_enum test_dtype = quant_dtype; + sess->base_dtype = dtype; + sess->base_quant_type = quant_type; int test_api = params->base.api; struct csinn_tensor *qinput1 = - convert_f32_layer(input1, test_dtype, (enum csinn_api_enum)test_api); + convert_f32_layer(input1, quant_type, (enum csinn_api_enum)test_api); struct csinn_tensor *qoutput = - convert_f32_layer(output, test_dtype, (enum csinn_api_enum)test_api); + convert_f32_layer(output, quant_type, (enum csinn_api_enum)test_api); + + struct csinn_tensor *real_input = input0; + csinn_session_init(sess); + csinn_set_input_number(1, sess); + csinn_set_output_number(1, sess); if (init_op(input0, qinput1, qoutput, params) == CSINN_TRUE) { + csinn_set_tensor_entry(input0, sess); + csinn_set_input(0, input0, sess); binary_op(input0, qinput1, 
qoutput, params); + + csinn_set_output(0, qoutput, sess); + csinn_session_setup(sess); + csinn_update_input(0, real_input, sess); + csinn_session_run(sess); + + csinn_get_output(0, qoutput, sess); struct csinn_tensor *foutput = shl_ref_tensor_transform_f32(qoutput); result_verify_f32((float *)output->data, (float *)foutput->data, (float *)input0->data, *difference, csinn_tensor_size(output), false); + free_input(real_input); shl_ref_tensor_transform_free_f32(foutput); + csinn_session_deinit(sess); + csinn_free_session(sess); } else { printf("Function init failed\n"); exit(-1); @@ -373,83 +743,143 @@ void test_where_softmax_op( template void test_gather_op(struct csinn_tensor *input0, struct csinn_tensor *input1, - struct csinn_tensor *output, T *params, enum csinn_quant_enum quant_dtype, + struct csinn_tensor *output, T *params, enum csinn_dtype_enum dtype, + enum csinn_quant_enum quant_type, struct csinn_session *sess, int (*init_op)(struct csinn_tensor *, struct csinn_tensor *, struct csinn_tensor *, T *), int (*binary_op)(struct csinn_tensor *, struct csinn_tensor *, struct csinn_tensor *, T *), float *difference) { - enum csinn_quant_enum test_dtype = quant_dtype; + sess->base_dtype = dtype; + sess->base_quant_type = quant_type; int test_api = params->base.api; struct csinn_tensor *qinput0 = - convert_f32_layer(input0, test_dtype, (enum csinn_api_enum)test_api); - struct csinn_tensor *qoutput = - convert_f32_layer(output, test_dtype, (enum csinn_api_enum)test_api); - /* broadcast quantization */ - if (qinput0->dtype == CSINN_DTYPE_INT8) { - memcpy(qoutput->qinfo, qinput0->qinfo, - qinput0->quant_channel * sizeof(struct csinn_quant_info)); - } + convert_f32_layer(input0, quant_type, (enum csinn_api_enum)test_api); + + struct csinn_tensor *qoutput = broadcast_quant_info(qinput0, output, dtype); + + struct csinn_tensor *real_input = + convert_f32_layer(input0, quant_type, (enum csinn_api_enum)test_api); + + csinn_session_init(sess); + csinn_set_input_number(1, 
sess); + csinn_set_output_number(1, sess); + if (init_op(qinput0, input1, qoutput, params) == CSINN_TRUE) { + csinn_set_tensor_entry(qinput0, sess); + csinn_set_input(0, qinput0, sess); + binary_op(qinput0, input1, qoutput, params); + csinn_set_output(0, qoutput, sess); + csinn_session_setup(sess); + csinn_update_input(0, real_input, sess); + csinn_session_run(sess); + + csinn_get_output(0, qoutput, sess); struct csinn_tensor *foutput = shl_ref_tensor_transform_f32(qoutput); result_verify_f32((float *)output->data, (float *)foutput->data, (float *)input0->data, *difference, csinn_tensor_size(output), false); + free_input(real_input); shl_ref_tensor_transform_free_f32(foutput); + csinn_session_deinit(sess); + csinn_free_session(sess); } else { printf("Function init failed\n"); exit(-1); } } -void test_fully_op_fp16_w_int8( - struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_tensor *kernel, - struct csinn_tensor *bias, csinn_fc_params *params, enum csinn_quant_enum quant_dtype, - int (*init_op)(struct csinn_tensor *, struct csinn_tensor *, struct csinn_tensor *, - struct csinn_tensor *, csinn_fc_params *), - int (*fully_op)(struct csinn_tensor *, struct csinn_tensor *, struct csinn_tensor *, - struct csinn_tensor *, csinn_fc_params *), +template +void test_matmul_op_hybrid_quant( + struct csinn_tensor *input0, struct csinn_tensor *input1, struct csinn_tensor *output, + T *params, enum csinn_dtype_enum dtype, enum csinn_quant_enum quant_type, + enum csinn_quant_enum quant_type_w, struct csinn_session *sess, + int (*init_op)(struct csinn_tensor *, struct csinn_tensor *, struct csinn_tensor *, T *), + int (*binary_op)(struct csinn_tensor *, struct csinn_tensor *, struct csinn_tensor *, T *), float *difference) { - enum csinn_api_enum test_api = (enum csinn_api_enum)params->base.api; - struct csinn_tensor *qbias = convert_f32_layer(bias, CSINN_QUANT_FLOAT16, test_api); - struct csinn_tensor *qinput = convert_f32_layer(input, CSINN_QUANT_FLOAT16, 
test_api); - struct csinn_tensor *qkernel = convert_f32_layer(kernel, CSINN_QUANT_INT8_SYM, test_api); + sess->base_dtype = dtype; + sess->base_quant_type = quant_type; + int test_api = params->base.api; - struct csinn_tensor *qoutput = convert_f32_layer(output, CSINN_QUANT_FLOAT16, test_api); + struct csinn_tensor *qinput0 = + convert_f32_layer(input0, quant_type, (enum csinn_api_enum)test_api); + struct csinn_tensor *qinput1 = + convert_f32_layer(input1, quant_type_w, (enum csinn_api_enum)test_api); + struct csinn_tensor *qoutput = + convert_f32_layer(output, quant_type, (enum csinn_api_enum)test_api); + struct csinn_tensor *real_input0 = + convert_f32_layer(input0, quant_type, (enum csinn_api_enum)test_api); - if (init_op(qinput, qoutput, qkernel, qbias, params) == CSINN_TRUE) { - fully_op(qinput, qoutput, qkernel, qbias, params); + csinn_session_init(sess); + csinn_set_input_number(1, sess); + csinn_set_output_number(1, sess); + + if (init_op(qinput0, qinput1, qoutput, params) == CSINN_TRUE) { + csinn_set_tensor_entry(qinput0, sess); + csinn_set_input(0, qinput0, sess); + + binary_op(qinput0, qinput1, qoutput, params); + csinn_set_output(0, qoutput, sess); + csinn_session_setup(sess); + csinn_update_input(0, real_input0, sess); + csinn_session_run(sess); + + csinn_get_output(0, qoutput, sess); struct csinn_tensor *foutput = shl_ref_tensor_transform_f32(qoutput); - result_verify_f32((float *)output->data, (float *)foutput->data, (float *)input->data, + result_verify_f32((float *)output->data, (float *)foutput->data, (float *)input0->data, *difference, csinn_tensor_size(output), false); + + free_input(real_input0); shl_ref_tensor_transform_free_f32(foutput); + csinn_session_deinit(sess); + csinn_free_session(sess); } else { printf("Function init failed\n"); exit(-1); } } -void test_matmul_op_fp16_w_int8(struct csinn_tensor *input0, struct csinn_tensor *input1, - struct csinn_tensor *output, csinn_matmul_params *params, - enum csinn_quant_enum quant_dtype, - int 
(*init_op)(struct csinn_tensor *, struct csinn_tensor *, - struct csinn_tensor *, csinn_matmul_params *), - int (*matmul_op)(struct csinn_tensor *, struct csinn_tensor *, - struct csinn_tensor *, csinn_matmul_params *), - float *difference) +template +void test_conv2d_layer(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, T *params, + enum csinn_quant_enum quant_dtype, enum csinn_quant_enum quant_dtype_w, + int (*init_op)(struct csinn_tensor *, struct csinn_tensor *, + struct csinn_tensor *, struct csinn_tensor *, T *), + int (*conv2d_op)(struct csinn_tensor *, struct csinn_tensor *, + struct csinn_tensor *, struct csinn_tensor *, T *), + float *difference) { - enum csinn_api_enum test_api = (enum csinn_api_enum)params->base.api; - struct csinn_tensor *qinput0 = convert_f32_layer(input0, CSINN_QUANT_FLOAT16, test_api); - struct csinn_tensor *qinput1 = convert_f32_layer(input1, CSINN_QUANT_INT8_SYM, test_api); + int test_api = params->base.api; + struct csinn_tensor *qbias; + struct csinn_tensor *qinput; + + struct csinn_tensor *qkernel = + convert_f32_layer(kernel, quant_dtype_w, (enum csinn_api_enum)test_api); - struct csinn_tensor *qoutput = convert_f32_layer(output, CSINN_QUANT_FLOAT16, test_api); + if (quant_dtype == CSINN_QUANT_INT8_ASYM) { + params->base.quant_type = CSINN_QUANT_INT8_ASYM_W_SYM; + if (!params->conv_extra.fuse_zp2bias) { + qinput = convert_f32_layer(input, quant_dtype, (enum csinn_api_enum)test_api); + qbias = convert_f32_bias(input, kernel, bias, (enum csinn_api_enum)test_api); + } else { + qbias = fuse_zp_to_bias(input, kernel, bias, (enum csinn_api_enum)test_api); + qinput = convert_f32_layer(input, quant_dtype, (enum csinn_api_enum)test_api); + qinput->qinfo->zero_point = 0; + } + } else { + qbias = convert_f32_layer(bias, quant_dtype, (enum csinn_api_enum)test_api); + qinput = convert_f32_layer(input, quant_dtype, (enum csinn_api_enum)test_api); + } - if (init_op(qinput0, 
qinput1, qoutput, params) == CSINN_TRUE) { - matmul_op(qinput0, qinput1, qoutput, params); + struct csinn_tensor *qoutput = + convert_f32_layer(output, quant_dtype, (enum csinn_api_enum)test_api); + + if (init_op(qinput, qoutput, qkernel, qbias, params) == CSINN_TRUE) { + conv2d_op(qinput, qoutput, qkernel, qbias, params); struct csinn_tensor *foutput = shl_ref_tensor_transform_f32(qoutput); - result_verify_f32((float *)output->data, (float *)foutput->data, (float *)input0->data, + result_verify_f32((float *)output->data, (float *)foutput->data, (float *)input->data, *difference, csinn_tensor_size(output), false); shl_ref_tensor_transform_free_f32(foutput); } else { @@ -458,24 +888,20 @@ void test_matmul_op_fp16_w_int8(struct csinn_tensor *input0, struct csinn_tensor } } -void test_conv2d_op_fp16_w_int8( - struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_tensor *kernel, - struct csinn_tensor *bias, csinn_conv2d_params *params, enum csinn_quant_enum quant_dtype, - int (*init_op)(struct csinn_tensor *, struct csinn_tensor *, struct csinn_tensor *, - struct csinn_tensor *, csinn_conv2d_params *), - int (*conv_op)(struct csinn_tensor *, struct csinn_tensor *, struct csinn_tensor *, - struct csinn_tensor *, csinn_conv2d_params *), - float *difference) +template +void test_unary_layer(struct csinn_tensor *input, struct csinn_tensor *output, T *params, + enum csinn_quant_enum quant_dtype, + int (*init_op)(struct csinn_tensor *, struct csinn_tensor *, T *), + int (*unary_op)(struct csinn_tensor *, struct csinn_tensor *, T *), + float *difference) { - enum csinn_api_enum test_api = (enum csinn_api_enum)params->base.api; - struct csinn_tensor *qbias = convert_f32_layer(bias, CSINN_QUANT_FLOAT16, test_api); - struct csinn_tensor *qinput = convert_f32_layer(input, CSINN_QUANT_FLOAT16, test_api); - struct csinn_tensor *qkernel = convert_f32_layer(kernel, CSINN_QUANT_INT8_SYM, test_api); - - struct csinn_tensor *qoutput = convert_f32_layer(output, 
CSINN_QUANT_FLOAT16, test_api); - - if (init_op(qinput, qoutput, qkernel, qbias, params) == CSINN_TRUE) { - conv_op(qinput, qoutput, qkernel, qbias, params); + int test_api = params->base.api; + struct csinn_tensor *qinput = + convert_f32_layer(input, quant_dtype, (enum csinn_api_enum)test_api); + struct csinn_tensor *qoutput = + convert_f32_layer(output, quant_dtype, (enum csinn_api_enum)test_api); + if (init_op(qinput, qoutput, params) == CSINN_TRUE) { + unary_op(qinput, qoutput, params); struct csinn_tensor *foutput = shl_ref_tensor_transform_f32(qoutput); result_verify_f32((float *)output->data, (float *)foutput->data, (float *)input->data, *difference, csinn_tensor_size(output), false); @@ -486,24 +912,19 @@ void test_conv2d_op_fp16_w_int8( } } -void test_conv1d_op_fp16_w_int8( - struct csinn_tensor *input, struct csinn_tensor *output, struct csinn_tensor *kernel, - struct csinn_tensor *bias, csinn_conv1d_params *params, enum csinn_quant_enum quant_dtype, - int (*init_op)(struct csinn_tensor *, struct csinn_tensor *, struct csinn_tensor *, - struct csinn_tensor *, csinn_conv1d_params *), - int (*conv_op)(struct csinn_tensor *, struct csinn_tensor *, struct csinn_tensor *, - struct csinn_tensor *, csinn_conv1d_params *), - float *difference) +template +void test_maxpool_layer(struct csinn_tensor *input, struct csinn_tensor *output, T *params, + enum csinn_quant_enum quant_dtype, + int (*init_op)(struct csinn_tensor *, struct csinn_tensor *, T *), + int (*unary_op)(struct csinn_tensor *, struct csinn_tensor *, T *), + float *difference) { - enum csinn_api_enum test_api = (enum csinn_api_enum)params->base.api; - struct csinn_tensor *qbias = convert_f32_layer(bias, CSINN_QUANT_FLOAT16, test_api); - struct csinn_tensor *qinput = convert_f32_layer(input, CSINN_QUANT_FLOAT16, test_api); - struct csinn_tensor *qkernel = convert_f32_layer(kernel, CSINN_QUANT_INT8_SYM, test_api); - - struct csinn_tensor *qoutput = convert_f32_layer(output, CSINN_QUANT_FLOAT16, 
test_api); - - if (init_op(qinput, qoutput, qkernel, qbias, params) == CSINN_TRUE) { - conv_op(qinput, qoutput, qkernel, qbias, params); + int test_api = params->base.api; + struct csinn_tensor *qinput = + convert_f32_layer(input, quant_dtype, (enum csinn_api_enum)test_api); + struct csinn_tensor *qoutput = broadcast_quant_info(qinput, output, qinput->dtype); + if (init_op(qinput, qoutput, params) == CSINN_TRUE) { + unary_op(qinput, qoutput, params); struct csinn_tensor *foutput = shl_ref_tensor_transform_f32(qoutput); result_verify_f32((float *)output->data, (float *)foutput->data, (float *)input->data, *difference, csinn_tensor_size(output), false); diff --git a/tests/validation_layer/transpose.cpp b/tests/validation_layer/transpose.cpp index fe35a828..c5b7b415 100644 --- a/tests/validation_layer/transpose.cpp +++ b/tests/validation_layer/transpose.cpp @@ -22,7 +22,9 @@ int main(int argc, char **argv) { init_testsuite("Testing function of transpose(layer).\n"); struct csinn_session *sess = csinn_alloc_session(); - sess->base_run_mode = CSINN_RM_LAYER; + sess->base_run_mode = CSINN_RM_CPU_GRAPH; + sess->model.save_mode = CSINN_RUN_ONLY; + sess->dynamic_shape = CSINN_FALSE; struct csinn_tensor *input = csinn_alloc_tensor(sess); struct csinn_tensor *output = csinn_alloc_tensor(sess); struct csinn_tensor *reference = csinn_alloc_tensor(sess); @@ -65,14 +67,14 @@ int main(int argc, char **argv) float difference = argc > 2 ? 
atof(argv[2]) : 0.99; #if (DTYPE == 32) - test_unary_op(input, output, params, CSINN_QUANT_FLOAT32, csinn_transpose_init, csinn_transpose, - &difference); + test_unary_op(input, output, params, CSINN_DTYPE_FLOAT32, CSINN_QUANT_FLOAT32, sess, + csinn_transpose_init, csinn_transpose, &difference); #elif (DTYPE == 16) - test_unary_op(input, output, params, CSINN_QUANT_FLOAT16, csinn_transpose_init, csinn_transpose, - &difference); + test_unary_op(input, output, params, CSINN_DTYPE_FLOAT16, CSINN_QUANT_FLOAT16, sess, + csinn_transpose_init, csinn_transpose, &difference); #elif (DTYPE == 8) - test_unary_op(input, output, params, CSINN_QUANT_INT8_SYM, csinn_transpose_init, - csinn_transpose, &difference); + test_unary_op(input, output, params, CSINN_DTYPE_INT8, CSINN_QUANT_INT8_ASYM, sess, + csinn_transpose_init, csinn_transpose, &difference); #endif return done_testing(); diff --git a/tests/validation_layer/where.cpp b/tests/validation_layer/where.cpp index f7f7fcd3..0b085fb2 100644 --- a/tests/validation_layer/where.cpp +++ b/tests/validation_layer/where.cpp @@ -23,7 +23,9 @@ int main(int argc, char **argv) init_testsuite("Testing function of where(layer).\n"); struct csinn_session *sess = csinn_alloc_session(); - sess->base_run_mode = CSINN_RM_LAYER; + sess->base_run_mode = CSINN_RM_CPU_GRAPH; + sess->model.save_mode = CSINN_RUN_ONLY; + sess->dynamic_shape = CSINN_FALSE; struct csinn_tensor *condition = csinn_alloc_tensor(sess); struct csinn_tensor *x = csinn_alloc_tensor(sess); struct csinn_tensor *y = csinn_alloc_tensor(sess); @@ -81,14 +83,14 @@ int main(int argc, char **argv) condition->data = data_u8; #if (DTYPE == 32) - test_where_op(condition, x, y, output, params, CSINN_QUANT_FLOAT32, csinn_where_init, - csinn_where, &difference); + test_where_op(condition, x, y, output, params, CSINN_DTYPE_FLOAT32, CSINN_QUANT_FLOAT32, sess, + csinn_where_init, csinn_where, &difference); #elif (DTYPE == 16) - test_where_op(condition, x, y, output, params, CSINN_QUANT_FLOAT16, 
csinn_where_init, - csinn_where, &difference); + test_where_op(condition, x, y, output, params, CSINN_DTYPE_FLOAT16, CSINN_QUANT_FLOAT16, sess, + csinn_where_init, csinn_where, &difference); #elif (DTYPE == 8) - test_where_op(condition, x, y, output, params, CSINN_QUANT_INT8_SYM, csinn_where_init, - csinn_where, &difference); + test_where_op(condition, x, y, output, params, CSINN_DTYPE_INT8, CSINN_QUANT_INT8_ASYM, sess, + csinn_where_init, csinn_where, &difference); #endif return done_testing(); diff --git a/tests/validation_layer/where_softmax.cpp b/tests/validation_layer/where_softmax.cpp index bb23ab9f..957b486b 100644 --- a/tests/validation_layer/where_softmax.cpp +++ b/tests/validation_layer/where_softmax.cpp @@ -23,7 +23,9 @@ int main(int argc, char **argv) init_testsuite("Testing function of where_softmax(layer).\n"); struct csinn_session *sess = csinn_alloc_session(); - sess->base_run_mode = CSINN_RM_LAYER; + sess->base_run_mode = CSINN_RM_CPU_GRAPH; + sess->model.save_mode = CSINN_RUN_ONLY; + sess->dynamic_shape = CSINN_FALSE; struct csinn_tensor *condition = csinn_alloc_tensor(sess); struct csinn_tensor *y = csinn_alloc_tensor(sess); struct csinn_tensor *output = csinn_alloc_tensor(sess); @@ -76,13 +78,13 @@ int main(int argc, char **argv) condition->data = data_u8; #if (DTYPE == 32) - test_where_softmax_op(condition, y, output, params, CSINN_QUANT_FLOAT32, - csinn_where_softmax_init, csinn_where_softmax, &difference); + test_where_softmax_op(condition, y, output, params, CSINN_DTYPE_FLOAT32, CSINN_QUANT_FLOAT32, + sess, csinn_where_softmax_init, csinn_where_softmax, &difference); #elif (DTYPE == 16) - test_where_softmax_op(condition, y, output, params, CSINN_QUANT_FLOAT16, - csinn_where_softmax_init, csinn_where_softmax, &difference); + test_where_softmax_op(condition, y, output, params, CSINN_DTYPE_FLOAT16, CSINN_QUANT_FLOAT16, + sess, csinn_where_softmax_init, csinn_where_softmax, &difference); #elif (DTYPE == 8) - test_where_softmax_op(condition, y, 
output, params, CSINN_QUANT_INT8_SYM, + test_where_softmax_op(condition, y, output, params, CSINN_DTYPE_INT8, CSINN_QUANT_INT8_ASYM, csinn_where_softmax_init, csinn_where_softmax, &difference); #endif diff --git a/version b/version index dbe59006..1acd4dac 100644 --- a/version +++ b/version @@ -1 +1 @@ -2.8.1 +2.9.5