From f797193706c75d03edb77700abe6295805c31673 Mon Sep 17 00:00:00 2001 From: zhangwm Date: Mon, 18 Oct 2021 16:48:33 +0800 Subject: [PATCH] version 1.2 --- LICENSE | 201 ++++ include/csi_internal.h | 342 +++++- include/csi_internal_c906.h | 119 ++ include/csi_internal_ref.h | 1445 ++++++++++++++++++++++++ include/csi_nn.h | 344 +++--- include/csi_ovx.h | 45 +- include/csi_pnna.h | 319 ++++++ include/csi_pnna_wrapper.h | 41 + include/csi_utils.h | 72 +- source/c860_opt/utils.S | 100 ++ source/c906_opt/abs.c | 77 ++ source/c906_opt/add.c | 177 +++ source/c906_opt/broadcast_to.c | 89 ++ source/c906_opt/clip.c | 95 ++ source/c906_opt/fullyconnected.c | 143 +++ source/c906_opt/leaky_relu.c | 83 ++ source/c906_opt/prelu.c | 261 +++++ source/c906_opt/relu.c | 85 ++ source/c906_opt/relu1.c | 88 ++ source/c906_opt/relu6.c | 88 ++ source/c906_opt/setup.c | 212 ++++ source/openvx/abs.c | 5 +- source/openvx/add.c | 5 +- source/openvx/and.c | 5 +- source/openvx/argmax.c | 5 +- source/openvx/argmin.c | 5 +- source/openvx/averagepool.c | 5 +- source/openvx/batch_normalization.c | 5 +- source/openvx/batch_to_space.c | 5 +- source/openvx/concat.c | 11 +- source/openvx/convolution.c | 20 +- source/openvx/crop.c | 9 +- source/openvx/deconvolution.c | 56 +- source/openvx/depth_to_space.c | 5 +- source/openvx/div.c | 5 +- source/openvx/elu.c | 5 +- source/openvx/equal.c | 5 +- source/openvx/exp.c | 5 +- source/openvx/expand_dims.c | 10 +- source/openvx/flatten.c | 10 +- source/openvx/floor.c | 5 +- source/openvx/floor_divide.c | 5 +- source/openvx/fullyconnected.c | 10 +- source/openvx/global_averagepool.c | 5 +- source/openvx/global_maxpool.c | 5 +- source/openvx/greater.c | 5 +- source/openvx/greater_equal.c | 5 +- source/openvx/l2_normalization.c | 5 +- source/openvx/l2pool.c | 5 +- source/openvx/leaky_relu.c | 5 +- source/openvx/less.c | 5 +- source/openvx/less_equal.c | 5 +- source/openvx/lrn.c | 5 +- source/openvx/matmul.c | 5 +- source/openvx/max.c | 5 +- source/openvx/maximum.c 
| 5 +- source/openvx/maxpool.c | 5 +- source/openvx/maxpool_locat.c | 5 +- source/openvx/mean.c | 5 +- source/openvx/min.c | 5 +- source/openvx/minimum.c | 5 +- source/openvx/mul.c | 5 +- source/openvx/negative.c | 5 +- source/openvx/not_equal.c | 5 +- source/openvx/or.c | 5 +- source/openvx/pad.c | 5 +- source/openvx/pool_with_argmax.c | 31 +- source/openvx/power.c | 5 +- source/openvx/prelu.c | 5 +- source/openvx/prod.c | 5 +- source/openvx/proposal.c | 5 +- source/openvx/psroipooling.c | 5 +- source/openvx/relu.c | 5 +- source/openvx/relu1.c | 5 +- source/openvx/relu6.c | 5 +- source/openvx/relun.c | 5 +- source/openvx/reorg.c | 5 +- source/openvx/reshape.c | 10 +- source/openvx/resize.c | 5 +- source/openvx/reverse.c | 5 +- source/openvx/rsqrt.c | 5 +- source/openvx/select.c | 5 +- source/openvx/setup.c | 660 ++++++----- source/openvx/sigmoid.c | 5 +- source/openvx/slice.c | 10 +- source/openvx/softmax.c | 5 +- source/openvx/softplus.c | 5 +- source/openvx/space_to_batch.c | 5 +- source/openvx/space_to_depth.c | 5 +- source/openvx/split.c | 13 +- source/openvx/sqrt.c | 5 +- source/openvx/square.c | 5 +- source/openvx/squeeze.c | 10 +- source/openvx/stack.c | 5 +- source/openvx/sub.c | 5 +- source/openvx/sum.c | 5 +- source/openvx/tanh.c | 5 +- source/openvx/tile.c | 5 +- source/openvx/transpose.c | 5 +- source/openvx/unpool.c | 5 +- source/openvx/unstack.c | 15 +- source/reference/abs.c | 27 +- source/reference/acos.c | 27 +- source/reference/acosh.c | 31 +- source/reference/add.c | 35 +- source/reference/and.c | 23 +- source/reference/arange.c | 22 +- source/reference/argmax.c | 23 +- source/reference/argmin.c | 22 +- source/reference/asin.c | 33 +- source/reference/asinh.c | 31 +- source/reference/atan.c | 31 +- source/reference/atanh.c | 31 +- source/reference/averagepool.c | 88 +- source/reference/averagepool3d.c | 64 +- source/reference/batch_normalization.c | 59 +- source/reference/batch_to_space.c | 23 +- source/reference/broadcast_to.c | 29 +- 
source/reference/ceil.c | 23 +- source/reference/clip.c | 23 +- source/reference/col2im.c | 13 +- source/reference/concat.c | 41 +- source/reference/convolution.c | 957 +++++++++------- source/reference/convolution3d.c | 77 +- source/reference/convolution_channel.c | 953 ++++++++++++++++ source/reference/convolution_relu.c | 635 +++++++++++ source/reference/convolution_relu6.c | 545 +++++++++ source/reference/cos.c | 27 +- source/reference/cosh.c | 19 +- source/reference/cumprod.c | 30 +- source/reference/cumsum.c | 26 +- source/reference/deconvolution.c | 99 +- source/reference/deconvolution3d.c | 73 +- source/reference/depth_to_space.c | 20 +- source/reference/div.c | 29 +- source/reference/elu.c | 23 +- source/reference/equal.c | 29 +- source/reference/erf.c | 19 +- source/reference/exp.c | 23 +- source/reference/expand_dims.c | 19 +- source/reference/expm1.c | 31 +- source/reference/flatten.c | 19 +- source/reference/floor.c | 23 +- source/reference/floor_divide.c | 29 +- source/reference/floor_mod.c | 29 +- source/reference/fullyconnected.c | 31 +- source/reference/gather.c | 34 +- source/reference/gather_nd.c | 32 +- source/reference/global_averagepool.c | 209 +++- source/reference/global_maxpool.c | 52 +- source/reference/greater.c | 35 +- source/reference/greater_equal.c | 41 +- source/reference/hard_sigmoid.c | 31 +- source/reference/im2col.c | 271 ++++- source/reference/isnan.c | 29 +- source/reference/l2_normalization.c | 31 +- source/reference/l2pool.c | 19 +- source/reference/leaky_relu.c | 25 +- source/reference/less.c | 29 +- source/reference/less_equal.c | 42 +- source/reference/log.c | 27 +- source/reference/log1p.c | 31 +- source/reference/log_softmax.c | 93 +- source/reference/logical_and.c | 41 +- source/reference/logical_not.c | 31 +- source/reference/logical_or.c | 41 +- source/reference/logical_xor.c | 35 +- source/reference/lrn.c | 65 +- source/reference/matmul.c | 55 +- source/reference/max.c | 23 +- source/reference/maximum.c | 29 +- 
source/reference/maxpool.c | 76 +- source/reference/maxpool2d_locat.c | 63 +- source/reference/maxpool3d.c | 54 +- source/reference/mean.c | 38 +- source/reference/min.c | 35 +- source/reference/minimum.c | 29 +- source/reference/mod.c | 41 +- source/reference/mul.c | 59 +- source/reference/ndarray_size.c | 35 +- source/reference/negative.c | 27 +- source/reference/non_max_suppression.c | 122 ++ source/reference/not.c | 19 +- source/reference/not_equal.c | 41 +- source/reference/or.c | 36 +- source/reference/pad.c | 63 +- source/reference/power.c | 29 +- source/reference/prelu.c | 99 +- source/reference/prod.c | 31 +- source/reference/proposal.c | 59 +- source/reference/psroipooling.c | 31 +- source/reference/reduce_logsumexp.c | 32 +- source/reference/reduce_max.c | 32 +- source/reference/reduce_mean.c | 32 +- source/reference/reduce_min.c | 31 +- source/reference/reduce_prod.c | 32 +- source/reference/reduce_sum.c | 28 +- source/reference/relu.c | 23 +- source/reference/relu1.c | 23 +- source/reference/relu6.c | 23 +- source/reference/relun.c | 25 +- source/reference/reshape.c | 27 +- source/reference/resize.c | 27 +- source/reference/reverse.c | 34 +- source/reference/roialign.c | 7 +- source/reference/roipool.c | 162 ++- source/reference/round.c | 23 +- source/reference/rsqrt.c | 24 +- source/reference/segment_max.c | 66 +- source/reference/segment_mean.c | 70 +- source/reference/segment_min.c | 60 +- source/reference/segment_prod.c | 70 +- source/reference/segment_sum.c | 70 +- source/reference/select.c | 27 +- source/reference/setup.c | 409 ++++++- source/reference/shape.c | 19 +- source/reference/shuffle_channel.c | 200 ++++ source/reference/sigmoid.c | 23 +- source/reference/sign.c | 27 +- source/reference/sin.c | 27 +- source/reference/sinh.c | 31 +- source/reference/slice.c | 27 +- source/reference/softmax.c | 162 ++- source/reference/softplus.c | 24 +- source/reference/softrelu.c | 35 +- source/reference/softsign.c | 23 +- 
source/reference/space_to_batch.c | 23 +- source/reference/space_to_depth.c | 19 +- source/reference/split.c | 23 +- source/reference/sqrt.c | 23 +- source/reference/square.c | 11 +- source/reference/squeeze.c | 19 +- source/reference/stack.c | 34 +- source/reference/strided_slice.c | 33 +- source/reference/sub.c | 29 +- source/reference/sum.c | 27 +- source/reference/tan.c | 27 +- source/reference/tanh.c | 35 +- source/reference/threshold_relu.c | 27 +- source/reference/tile.c | 19 +- source/reference/topk.c | 124 ++ source/reference/transpose.c | 90 +- source/reference/trunc.c | 23 +- source/reference/unpooling.c | 49 +- source/reference/unstack.c | 30 +- source/reference/utils.c | 298 ++++- source/reference/xor.c | 23 +- source/reference/yuv_rgb_scale.c | 39 +- 248 files changed, 11417 insertions(+), 3996 deletions(-) create mode 100644 LICENSE create mode 100644 include/csi_internal_c906.h create mode 100644 include/csi_internal_ref.h create mode 100644 include/csi_pnna.h create mode 100644 include/csi_pnna_wrapper.h create mode 100644 source/c860_opt/utils.S create mode 100644 source/c906_opt/abs.c create mode 100644 source/c906_opt/add.c create mode 100644 source/c906_opt/broadcast_to.c create mode 100644 source/c906_opt/clip.c create mode 100644 source/c906_opt/fullyconnected.c create mode 100644 source/c906_opt/leaky_relu.c create mode 100644 source/c906_opt/prelu.c create mode 100644 source/c906_opt/relu.c create mode 100644 source/c906_opt/relu1.c create mode 100644 source/c906_opt/relu6.c create mode 100644 source/c906_opt/setup.c create mode 100644 source/reference/convolution_channel.c create mode 100644 source/reference/convolution_relu.c create mode 100644 source/reference/convolution_relu6.c create mode 100644 source/reference/non_max_suppression.c create mode 100644 source/reference/shuffle_channel.c create mode 100644 source/reference/topk.c diff --git a/LICENSE b/LICENSE new file mode 100644 index 00000000..29f81d81 --- /dev/null +++ b/LICENSE @@ 
-0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. 
For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of 
the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. 
Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
diff --git a/include/csi_internal.h b/include/csi_internal.h index 29a47d1b..a1f04ff3 100644 --- a/include/csi_internal.h +++ b/include/csi_internal.h @@ -21,15 +21,222 @@ /* data type */ enum { - CSINN_DTYPE_UINT8 = 0x0, - CSINN_DTYPE_INT8 = 0x1, - CSINN_DTYPE_UINT16 = 0x2, - CSINN_DTYPE_INT16 = 0x3, - CSINN_DTYPE_UINT32 = 0x4, - CSINN_DTYPE_INT32 = 0x5, - CSINN_DTYPE_FLOAT16 = 0x6, - CSINN_DTYPE_FLOAT32 = 0x7, - CSINN_DTYPE_FLOAT64 = 0x8, + CSINN_DTYPE_UINT8 = 0, + CSINN_DTYPE_INT8, + CSINN_DTYPE_UINT16, + CSINN_DTYPE_INT16, + CSINN_DTYPE_UINT32, + CSINN_DTYPE_INT32, + CSINN_DTYPE_FLOAT16, + CSINN_DTYPE_FLOAT32, + CSINN_DTYPE_FLOAT64, + CSINN_DTYPE_SIZE, +}; + +/* API type */ +enum +{ + CSINN_REF = 0, + CSINN_C860, + CSINN_C906, + CSINN_C910, + CSINN_ANOLE, + CSINN_TX510, + CSINN_LIGHT, + CSINN_TVMGEN, + CSINN_API_SIZE, +}; + +/* op and utils */ +enum +{ + CSINN_OP_ABS = 0, + CSINN_OP_ACOS, + CSINN_OP_ACOSH, + CSINN_OP_ADD, + CSINN_OP_ALL, + CSINN_OP_AND, + CSINN_OP_ANY, + CSINN_OP_ARANGE, + CSINN_OP_ARGMAX, + CSINN_OP_ARGMIN, + CSINN_OP_ASIN, + CSINN_OP_ASINH, + CSINN_OP_ATAN, + CSINN_OP_ATANH, + CSINN_OP_AVGPOOL2D, + CSINN_OP_AVGPOOL3D, + CSINN_OP_BN, + CSINN_OP_BATCH_TO_SPACE, + CSINN_OP_BROADCOST, + CSINN_OP_CEIL, + CSINN_OP_CLIP, + CSINN_OP_COL2IM, + CSINN_OP_CONCAT, + CSINN_OP_CONV2D, + CSINN_OP_CONV2D_RELU, + CSINN_OP_CONV2D_RELU6, + CSINN_OP_CONV2D_CHANNEL, + CSINN_OP_CONV2D_CHANNEL_RELU, + CSINN_OP_CONV2D_CHANNEL_RELU6, + CSINN_OP_DEPTHWISE_CONV2D, + CSINN_OP_DEPTHWISE_CONV2D_RELU, + CSINN_OP_DEPTHWISE_CONV2D_RELU6, + CSINN_OP_DEPTHWISE_CONV2D_CHANNEL, + CSINN_OP_DEPTHWISE_CONV2D_CHANNEL_RELU, + CSINN_OP_DEPTHWISE_CONV2D_CHANNEL_RELU6, + CSINN_OP_GROUP_CONV2D, + CSINN_OP_GROUP_CONV2D_RELU, + CSINN_OP_GROUP_CONV2D_CHANNEL, + CSINN_OP_GROUP_CONV2D_CHANNEL_RELU, + CSINN_OP_CONV3D, + CSINN_OP_COS, + CSINN_OP_COSH, + CSINN_OP_CUMPROD, + CSINN_OP_CUMSUM, + CSINN_OP_DECONV2D, + CSINN_OP_DEPTHWISE_DECONV2D, + CSINN_OP_DECONV3D, + CSINN_OP_DEPTH_TO_SPACE, + 
CSINN_OP_DIV, + CSINN_OP_ELU, + CSINN_OP_EQUANL, + CSINN_OP_ERF, + CSINN_OP_EXP, + CSINN_OP_EXPAND_DIMS, + CSINN_OP_EXPM1, + CSINN_OP_FLATTEN, + CSINN_OP_FLOOR_DIVIDE, + CSINN_OP_FLOOR_MOD, + CSINN_OP_FLOOR, + CSINN_OP_FULLYCONNECTED, + CSINN_OP_GATHER_ND, + CSINN_OP_GATHER, + CSINN_OP_GLOBAL_AVGPOOL2D, + CSINN_OP_GLOBAL_MAXPOOL2D, + CSINN_OP_GREATHER_EQUAL, + CSINN_OP_GREATHER, + CSINN_OP_HARD_SIGMOID, + CSINN_OP_IM2COL, + CSINN_OP_ISNAN, + CSINN_OP_L2N, + CSINN_OP_L2POOL2D, + CSINN_OP_LEAKY_RELU, + CSINN_OP_LESS_EQUAL, + CSINN_OP_LESS, + CSINN_OP_LOG_SOFTMAX, + CSINN_OP_LOG, + CSINN_OP_LOG1P, + CSINN_OP_LOGICAL_AND, + CSINN_OP_LOGICAL_NOT, + CSINN_OP_LOGICAL_OR, + CSINN_OP_LOGICAL_XOR, + CSINN_OP_LRN, + CSINN_OP_MATMUL, + CSINN_OP_MAX, + CSINN_OP_MAXINUM, + CSINN_OP_MAXPOOL2D, + CSINN_OP_MAXPOOL2D_LOCAT, + CSINN_OP_MAXPOOL3D, + CSINN_OP_MEAN, + CSINN_OP_MEAN_STRIDE, + CSINN_OP_MIN, + CSINN_OP_MIN_STRIDE, + CSINN_OP_MINIMUM, + CSINN_OP_MOD, + CSINN_OP_MUL, + CSINN_OP_NDARRAY_SIZE, + CSINN_OP_NEGATIIVE, + CSINN_OP_NON_MAX_SUPPRESSION, + CSINN_OP_NOT_EQUAL, + CSINN_OP_NOT, + CSINN_OP_ONE_HOT, + CSINN_OP_OR, + CSINN_OP_PAD, + CSINN_OP_POWER, + CSINN_OP_PRELU, + CSINN_OP_PROD, + CSINN_OP_PROPOSAL, + CSINN_OP_PSROIPOOLING, + CSINN_OP_REDUCE_LOGSUMEXP, + CSINN_OP_REDUCE_MAX, + CSINN_OP_REDUCE_MEAN, + CSINN_OP_REDUCE_MIN, + CSINN_OP_REDUCE_PROD, + CSINN_OP_REDUCE_SUM, + CSINN_OP_RELU, + CSINN_OP_RELU1, + CSINN_OP_RELU6, + CSINN_OP_RELUN, + CSINN_OP_REORG, + CSINN_OP_RESHAPE, + CSINN_OP_RESIZE, + CSINN_OP_REVERSE, + CSINN_OP_ROIALIGN, + CSINN_OP_ROIPOOL, + CSINN_OP_ROUND, + CSINN_OP_RSQRT, + CSINN_OP_SEGMENT_MAX, + CSINN_OP_UNSORTED_SEGMENT_MAX, + CSINN_OP_SEGMENT_MEAN, + CSINN_OP_UNSORTED_SEGMENT_MEAN, + CSINN_OP_SEGMENT_MIN, + CSINN_OP_UNSORTED_SEGMENT_MIN, + CSINN_OP_SEGMENT_PROD, + CSINN_OP_UNSORTED_SEGMENT_PROD, + CSINN_OP_SEGMENT_SUM, + CSINN_OP_UNSORTED_SEGMENT_SUM, + CSINN_OP_SELECT, + CSINN_OP_SEQUENCE_MASK, + CSINN_OP_SHAPE, + CSINN_OP_SHUFFLE_CHANNEL, + 
CSINN_OP_SIGMOID, + CSINN_OP_SIGN, + CSINN_OP_SIN, + CSINN_OP_SINH, + CSINN_OP_SLICE, + CSINN_OP_SOFTMAX, + CSINN_OP_SOFTPLUS, + CSINN_OP_SOFTRELU, + CSINN_OP_SOFTSIGN, + CSINN_OP_SPACE_TO_BATCH, + CSINN_OP_SPACE_TO_DEPTH, + CSINN_OP_SPLIT, + CSINN_OP_SQRT, + CSINN_OP_SQUARE, + CSINN_OP_SQUEEZE, + CSINN_OP_STACK, + CSINN_OP_STRIDED_SLICE, + CSINN_OP_SUB, + CSINN_OP_SUM, + CSINN_OP_TAN, + CSINN_OP_TANH, + CSINN_OP_THRESHOLD_RELU, + CSINN_OP_TILE, + CSINN_OP_TOPK, + CSINN_OP_TRANSPOSE, + CSINN_OP_TRUNC, + CSINN_OP_UNPOOLING, + CSINN_OP_UNSTACK, + CSINN_OP_WHERE, + CSINN_OP_XOR, + CSINN_OP_YUV_RGB_SCALE, + + /* utils functions */ + CSINN_SESSION_INIT, + CSINN_SESSION_DEINIT, + CSINN_SESSION_SETUP, + CSINN_SESSION_RUN, + CSINN_UPDATE_INPUT, + CSINN_SET_INPUT_NUMBER, + CSINN_SET_OUTPUT_NUMBER, + CSINN_GET_INPUT_NUMBER, + CSINN_GET_OUTPUT_NUMBER, + CSINN_SET_INPUT, + CSINN_SET_OUTPUT, + CSINN_GET_INPUT, + CSINN_GET_OUTPUT, + CSINN_OP_SIZE, }; /* pad mode */ @@ -72,19 +279,22 @@ struct csi_tensor int32_t dtype; int32_t dim[MAX_DIM]; int32_t dim_count; + char *name; int32_t zero_point; float scale; - int32_t offset; int32_t multiplier; int32_t shift; int32_t layout; - void *t_private; + float min; + float max; + struct csi_session *sess; } __attribute__((packed)); struct conv2d_params { int (*bc)(); int32_t layout; + int32_t api; int32_t group; int32_t stride_height; int32_t stride_width; @@ -94,6 +304,9 @@ struct conv2d_params int32_t pad_right; int32_t dilation_height; int32_t dilation_width; + char *name; + float *wscales; + int32_t *wzps; }; struct conv3d_params @@ -123,12 +336,14 @@ struct fc_params { int (*bc)(); int32_t layout; + int32_t api; }; struct pool_params { int (*bc)(); int32_t layout; + int32_t api; int32_t pool_type; int32_t filter_height; int32_t filter_width; @@ -148,6 +363,7 @@ struct unpooling_params { int (*bc)(); int32_t layout; + int32_t api; int32_t scale_height; int32_t scale_width; int32_t pad_out_height; @@ -158,6 +374,7 @@ struct 
roi_align_params { int (*bc)(); int32_t layout; + int32_t api; int32_t pooled_size_h; int32_t pooled_size_w; float spatial_scale; @@ -170,6 +387,7 @@ struct roi_pool_params { int (*bc)(); int32_t layout; + int32_t api; int32_t pooled_size_h; int32_t pooled_size_w; float spatial_scale; @@ -181,18 +399,22 @@ struct siso_params { int (*bc)(); int32_t layout; + int32_t api; }; struct sigmoid_params { int (*bc)(); int32_t layout; + int32_t api; }; struct relu_params { int (*bc)(); int32_t layout; + int32_t api; + char *name; /* n / alpha / threshold */ float n; @@ -204,6 +426,7 @@ struct prelu_params { int (*bc)(); int32_t layout; + int32_t api; int32_t axis; }; @@ -211,6 +434,7 @@ struct softmax_params { int (*bc)(); int32_t layout; + int32_t api; int32_t axis; }; @@ -218,7 +442,7 @@ struct bn_params { int (*bc)(); int32_t layout; - + int32_t api; float epsilon; int32_t epsilon_multiplier; int32_t epsilon_shift; @@ -228,7 +452,7 @@ struct l2n_params { int (*bc)(); int32_t layout; - + int32_t api; float epsilon; int32_t epsilon_multiplier; int32_t epsilon_shift; @@ -240,7 +464,7 @@ struct lrn_params { int (*bc)(); int32_t layout; - + int32_t api; int32_t range; double bias; int32_t bias_multiplier; @@ -257,7 +481,7 @@ struct matmul_params { int (*bc)(); int32_t layout; - + int32_t api; bool trans_a; bool trans_b; }; @@ -266,19 +490,21 @@ struct diso_params { int (*bc)(); int32_t layout; + int32_t api; }; struct select_params { int (*bc)(); int32_t layout; + int32_t api; }; struct pad_params { int (*bc)(); int32_t layout; - + int32_t api; int32_t *pad_before; int32_t *pad_after; float pad_value; @@ -289,7 +515,7 @@ struct resize_params { int (*bc)(); int32_t layout; - + int32_t api; int32_t resize_mode; bool align_corners; }; @@ -298,7 +524,7 @@ struct concat_params { int (*bc)(); int32_t layout; - + int32_t api; int32_t inputs_count; int32_t axis; }; @@ -307,7 +533,7 @@ struct proposal_params { int (*bc)(); int32_t layout; - + int32_t api; float *scales; int32_t 
*scale_multipliers; int32_t *scale_shifts; @@ -330,7 +556,7 @@ struct psroipooling_params { int (*bc)(); int32_t layout; - + int32_t api; int32_t output_dim; int32_t group_size; float spatial_scale; @@ -342,7 +568,7 @@ struct transpose_params { int (*bc)(); int32_t layout; - + int32_t api; int32_t *permute; }; @@ -350,18 +576,21 @@ struct reshape_params { int (*bc)(); int32_t layout; + int32_t api; }; struct shape_params { int (*bc)(); int32_t layout; + int32_t api; }; struct expand_dims_params { int (*bc)(); int32_t layout; + int32_t api; int32_t axis; }; @@ -369,7 +598,7 @@ struct reverse_params { int (*bc)(); int32_t layout; - + int32_t api; int32_t axis; }; @@ -377,13 +606,14 @@ struct flatten_params { int (*bc)(); int32_t layout; + int32_t api; }; struct crop_params { int (*bc)(); int32_t layout; - + int32_t api; int32_t axis; int32_t *offset; }; @@ -392,7 +622,7 @@ struct slice_params { int (*bc)(); int32_t layout; - + int32_t api; int32_t *begin; int32_t *end; int32_t *strides; @@ -402,7 +632,7 @@ struct split_params { int (*bc)(); int32_t layout; - + int32_t api; int32_t *split_index; int32_t output_num; int32_t axis; @@ -412,7 +642,7 @@ struct stack_params { int (*bc)(); int32_t layout; - + int32_t api; int32_t inputs_count; int32_t axis; }; @@ -421,7 +651,7 @@ struct tile_params { int (*bc)(); int32_t layout; - + int32_t api; int32_t *reps; int32_t reps_num; }; @@ -430,7 +660,7 @@ struct arange_params { int (*bc)(); int32_t layout; - + int32_t api; float start; int32_t start_multiplier; int32_t start_shift; @@ -446,6 +676,7 @@ struct where_params { int (*bc)(); int32_t layout; + int32_t api; }; struct unstack_params @@ -461,7 +692,7 @@ struct take_params { int (*bc)(); int32_t layout; - + int32_t api; int32_t axis; const char *mode; }; @@ -478,24 +709,28 @@ struct gather_nd_params { int (*bc)(); int32_t layout; + int32_t api; }; struct squeeze_params { int (*bc)(); int32_t layout; + int32_t api; }; struct ndarray_size_params { int (*bc)(); int32_t layout; 
+ int32_t api; }; struct space_to_batch_params { int (*bc)(); int32_t layout; + int32_t api; int32_t pad_top; int32_t pad_bottom; int32_t pad_left; @@ -507,6 +742,7 @@ struct batch_to_space_params { int (*bc)(); int32_t layout; + int32_t api; int32_t crop_top; int32_t crop_bottom; int32_t crop_left; @@ -518,6 +754,7 @@ struct space_to_depth_params { int (*bc)(); int32_t layout; + int32_t api; int32_t block_size; }; @@ -525,6 +762,7 @@ struct depth_to_space_params { int (*bc)(); int32_t layout; + int32_t api; int32_t block_size; }; @@ -532,6 +770,7 @@ struct one_hot_params { int (*bc)(); int32_t layout; + int32_t api; float f_on_value; float f_off_value; int32_t on_value; @@ -544,6 +783,7 @@ struct sequence_mask_params { int (*bc)(); int32_t layout; + int32_t api; float mask_value; int32_t mask_value_multiplier; int32_t mask_value_shift; @@ -554,16 +794,22 @@ struct im2col_params { int (*bc)(); int32_t layout; - int32_t pad_h; - int32_t pad_w; + int32_t api; + int32_t pad_top; + int32_t pad_down; + int32_t pad_left; + int32_t pad_right; int32_t stride_h; int32_t stride_w; + int32_t kernel_h; + int32_t kernel_w; }; struct col2im_params { int (*bc)(); int32_t layout; + int32_t api; int32_t pad_h; int32_t pad_w; int32_t stride_h; @@ -574,7 +820,7 @@ struct reduce_params { int (*bc)(); int32_t layout; - + int32_t api; int32_t *out_strides; int32_t *out_extents; int32_t n; @@ -591,7 +837,7 @@ struct reorg_params { int (*bc)(); int32_t layout; - + int32_t api; int32_t stride; }; @@ -599,6 +845,7 @@ struct segment_params { int (*bc)(); int32_t layout; + int32_t api; int32_t num_segments; bool unsorted; }; @@ -607,6 +854,7 @@ struct cumsum_params { int (*bc)(); int32_t layout; + int32_t api; int32_t axis; bool exclusive; }; @@ -615,6 +863,7 @@ struct cumprod_params { int (*bc)(); int32_t layout; + int32_t api; int32_t axis; bool exclusive; }; @@ -622,6 +871,7 @@ struct cumprod_params struct broadcast_to_params { int (*bc)(); + int32_t api; int32_t layout; int32_t *shape; 
int32_t shape_count; @@ -646,4 +896,30 @@ struct strided_slice_params int32_t slice_count; }; +struct shuffle_channel_params +{ + int (*bc)(); + int32_t api; + int32_t layout; + int32_t group; +}; + +struct topk_params +{ + int (*bc)(); + int32_t api; + int32_t layout; + int32_t k; +}; + +struct non_max_suppression_params +{ + int (*bc)(); + int32_t api; + int32_t layout; + int32_t max_output_size; + float iou_threshold; + // float score_threshold; +}; + #endif diff --git a/include/csi_internal_c906.h b/include/csi_internal_c906.h new file mode 100644 index 00000000..412a287a --- /dev/null +++ b/include/csi_internal_c906.h @@ -0,0 +1,119 @@ +/* + * Copyright (C) 2016-2020 C-SKY Limited. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef _CSI_INTERNAL_C906_H +#define _CSI_INTERNAL_C906_H + +#include +#include +#include +#include +#include "csi_internal.h" +#include "csi_utils.h" + +int csi_abs_f32_c906(struct csi_tensor *input, + struct csi_tensor *output, + struct siso_params *params); + +int csi_abs_u8_c906(struct csi_tensor *input, + struct csi_tensor *output, + struct siso_params *params); + +int csi_add_f32_c906(struct csi_tensor *input0, + struct csi_tensor *input1, + struct csi_tensor *output, + struct diso_params *params); + +int csi_add_u8_c906(struct csi_tensor *input0, + struct csi_tensor *input1, + struct csi_tensor *output, + struct diso_params *params); + +int csi_broadcast_to_f32_c906(struct csi_tensor *input, + struct csi_tensor *output, + struct broadcast_to_params *params); + +int csi_broadcast_to_u8_c906(struct csi_tensor *input, + struct csi_tensor *output, + struct broadcast_to_params *params); + +int csi_clip_f32_c906(struct csi_tensor *input, + struct csi_tensor *output, + struct clip_params *params); + +int csi_clip_u8_c906(struct csi_tensor *input, + struct csi_tensor *output, + struct clip_params *params); + +int csi_fullyconnected_f32_c906(struct csi_tensor *input, + struct csi_tensor *output, + struct csi_tensor *weights, + struct csi_tensor *bias, + struct fc_params *params); + +int csi_fullyconnected_u8_c906(struct csi_tensor *input, + struct csi_tensor *output, + struct csi_tensor *weights, + struct csi_tensor *bias, + struct fc_params *params); + +int csi_prelu_f32_c906(struct csi_tensor *input, + struct csi_tensor *alpha, + struct csi_tensor *output, + struct prelu_params *params); + +int csi_prelu_u8_c906(struct csi_tensor *input, + struct csi_tensor *alpha, + struct csi_tensor *output, + struct prelu_params *params); + +int csi_relu_f32_c906(struct csi_tensor *input, + struct csi_tensor *output, + struct relu_params *params); + +int csi_relu_u8_c906(struct csi_tensor *input, + struct csi_tensor *output, + struct relu_params *params); + +int 
csi_relu1_f32_c906(struct csi_tensor *input, + struct csi_tensor *output, + struct relu_params *params); + +int csi_relu1_u8_c906(struct csi_tensor *input, + struct csi_tensor *output, + struct relu_params *params); + +int csi_relu6_f32_c906(struct csi_tensor *input, + struct csi_tensor *output, + struct relu_params *params); + +int csi_relu6_u8_c906(struct csi_tensor *input, + struct csi_tensor *output, + struct relu_params *params); + +int csi_leaky_relu_f32_c906(struct csi_tensor *input, + struct csi_tensor *output, + struct relu_params *params); + +int csi_leaky_relu_u8_c906(struct csi_tensor *input, + struct csi_tensor *output, + struct relu_params *params); + + + +#endif diff --git a/include/csi_internal_ref.h b/include/csi_internal_ref.h new file mode 100644 index 00000000..45c94cf0 --- /dev/null +++ b/include/csi_internal_ref.h @@ -0,0 +1,1445 @@ +/* + * Copyright (C) 2016-2020 C-SKY Limited. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef _CSI_INTERNAL_REF_H +#define _CSI_INTERNAL_REF_H + +#include +#include +#include +#include +#include "csi_internal.h" +#include "csi_utils.h" + +int csi_abs_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct siso_params *params); + +int csi_abs_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct siso_params *params); + +int csi_acos_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct siso_params *params); + +int csi_acos_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct siso_params *params); + +int csi_acosh_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct siso_params *params); + +int csi_acosh_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct siso_params *params); + +int csi_add_f32(struct csi_tensor *input0, + struct csi_tensor *input1, + struct csi_tensor *output, + struct diso_params *params); + +int csi_add_u8(struct csi_tensor *input0, + struct csi_tensor *input1, + struct csi_tensor *output, + struct diso_params *params); + +int csi_and_u32(struct csi_tensor *input0, + struct csi_tensor *input1, + struct csi_tensor *output, + struct diso_params *params); + +int csi_and_u8(struct csi_tensor *input0, + struct csi_tensor *input1, + struct csi_tensor *output, + struct diso_params *params); + +int csi_arange_f32(struct csi_tensor *output, + struct arange_params *params); + +int csi_arange_u8(struct csi_tensor *output, + struct arange_params *params); + +int csi_argmax_stride_i32_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct reduce_params *params); + +int csi_argmax_stride_i32_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct reduce_params *params); + +int csi_argmin_stride_i32_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct reduce_params *params); + +int csi_argmin_stride_i32_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct reduce_params *params); + +int 
csi_asin_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct siso_params *params); + +int csi_asin_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct siso_params *params); + +int csi_asinh_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct siso_params *params); + +int csi_asinh_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct siso_params *params); + +int csi_atan_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct siso_params *params); + +int csi_atan_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct siso_params *params); + +int csi_atanh_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct siso_params *params); + +int csi_atanh_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct siso_params *params); + +int csi_averagepool_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct pool_params *params); + +int csi_averagepool_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct pool_params *params); + +int csi_averagepool3d_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct pool_params *params); + +int csi_averagepool3d_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct pool_params *params); + +int csi_batch_normalization_f32(struct csi_tensor *input, + struct csi_tensor *mean, + struct csi_tensor *variance, + struct csi_tensor *gamma, + struct csi_tensor *beta, + struct csi_tensor *output, + struct bn_params *params); + +int csi_batch_normalization_u8(struct csi_tensor *input, + struct csi_tensor *mean, + struct csi_tensor *variance, + struct csi_tensor *gamma, + struct csi_tensor *beta, + struct csi_tensor *output, + struct bn_params *params); + +int csi_batch_to_space_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct batch_to_space_params *params); + +int csi_batch_to_space_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct batch_to_space_params 
*params); + +int csi_broadcast_to_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct broadcast_to_params *params); + +int csi_broadcast_to_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct broadcast_to_params *params); + +int csi_ceil_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct siso_params *params); + +int csi_ceil_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct siso_params *params); + +int csi_clip_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct clip_params *params); + +int csi_clip_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct clip_params *params); + +int csi_col2im_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct csi_tensor *kernel, + struct col2im_params *params); + +int csi_concat_f32(struct csi_tensor **input, + struct csi_tensor *output, + struct concat_params *params); + +int csi_concat_u8(struct csi_tensor **input, + struct csi_tensor *output, + struct concat_params *params); + +int csi_conv2d_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct csi_tensor *kernel, + struct csi_tensor *bias, + struct conv2d_params *params); + +int csi_conv2d_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct csi_tensor *kernel, + struct csi_tensor *bias, + struct conv2d_params *params); + +int csi_conv2d_i8(struct csi_tensor *input, + struct csi_tensor *output, + struct csi_tensor *kernel, + struct csi_tensor *bias, + struct conv2d_params *params); + +int csi_conv2d_channel_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct csi_tensor *kernel, + struct csi_tensor *bias, + struct conv2d_params *params); + +int csi_conv2d_relu_u8(struct csi_tensor *o_input, + struct csi_tensor *o_output, + struct csi_tensor *o_kernel, + struct csi_tensor *o_bias, + struct conv2d_params *params); +int csi_conv2d_relu_i8(struct csi_tensor *o_input, + struct csi_tensor *o_output, + struct csi_tensor *o_kernel, + 
struct csi_tensor *o_bias, + struct conv2d_params *params); + +int csi_conv2d_channel_relu_u8(struct csi_tensor *o_input, + struct csi_tensor *o_output, + struct csi_tensor *o_kernel, + struct csi_tensor *o_bias, + struct conv2d_params *params); + +int csi_conv2d_relu6_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct csi_tensor *kernel, + struct csi_tensor *bias, + struct conv2d_params *params); + +int csi_conv2d_relu6_i8(struct csi_tensor *input, + struct csi_tensor *output, + struct csi_tensor *kernel, + struct csi_tensor *bias, + struct conv2d_params *params); + +int csi_conv2d_channel_relu6_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct csi_tensor *kernel, + struct csi_tensor *bias, + struct conv2d_params *params); + +int csi_depthwise_conv2d_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct csi_tensor *kernel, + struct csi_tensor *bias, + struct conv2d_params *params); + +int csi_depthwise_conv2d_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct csi_tensor *kernel, + struct csi_tensor *bias, + struct conv2d_params *params); + +int csi_depthwise_conv2d_i8(struct csi_tensor *input, + struct csi_tensor *output, + struct csi_tensor *kernel, + struct csi_tensor *bias, + struct conv2d_params *params); + +int csi_depthwise_conv2d_channel_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct csi_tensor *kernel, + struct csi_tensor *bias, + struct conv2d_params *params); + +int csi_depthwise_conv2d_relu_u8(struct csi_tensor *o_input, + struct csi_tensor *o_output, + struct csi_tensor *o_kernel, + struct csi_tensor *o_bias, + struct conv2d_params *params); + +int csi_depthwise_conv2d_relu_i8(struct csi_tensor *o_input, + struct csi_tensor *o_output, + struct csi_tensor *o_kernel, + struct csi_tensor *o_bias, + struct conv2d_params *params); + +int csi_depthwise_conv2d_channel_relu_u8(struct csi_tensor *o_input, + struct csi_tensor *o_output, + struct csi_tensor *o_kernel, + struct 
csi_tensor *o_bias, + struct conv2d_params *params); + +int csi_depthwise_conv2d_relu6_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct csi_tensor *kernel, + struct csi_tensor *bias, + struct conv2d_params *params); + +int csi_depthwise_conv2d_relu6_i8(struct csi_tensor *input, + struct csi_tensor *output, + struct csi_tensor *kernel, + struct csi_tensor *bias, + struct conv2d_params *params); + +int csi_depthwise_conv2d_channel_relu6_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct csi_tensor *kernel, + struct csi_tensor *bias, + struct conv2d_params *params); + +int csi_group_conv2d_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct csi_tensor *kernel, + struct csi_tensor *bias, + struct conv2d_params *params); + +int csi_group_conv2d_i8(struct csi_tensor *input, + struct csi_tensor *output, + struct csi_tensor *kernel, + struct csi_tensor *bias, + struct conv2d_params *params); + +int csi_group_conv2d_channel_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct csi_tensor *kernel, + struct csi_tensor *bias, + struct conv2d_params *params); + +int csi_group_conv2d_relu_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct csi_tensor *kernel, + struct csi_tensor *bias, + struct conv2d_params *params); + +int csi_group_conv2d_relu_i8(struct csi_tensor *input, + struct csi_tensor *output, + struct csi_tensor *kernel, + struct csi_tensor *bias, + struct conv2d_params *params); + +int csi_group_conv2d_channel_relu_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct csi_tensor *kernel, + struct csi_tensor *bias, + struct conv2d_params *params); + +int csi_conv3d_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct csi_tensor *kernel, + struct csi_tensor *bias, + struct conv3d_params *params); + +int csi_conv3d_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct csi_tensor *kernel, + struct csi_tensor *bias, + struct conv3d_params *params); + +int 
csi_cos_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct siso_params *params); + +int csi_cos_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct siso_params *params); + +int csi_cosh_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct siso_params *params); + +int csi_cosh_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct siso_params *params); + +int csi_cumprod_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct cumprod_params *params); + +int csi_cumprod_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct cumprod_params *params); + +int csi_cumsum_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct cumsum_params *params); + +int csi_cumsum_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct cumsum_params *params); + +int csi_deconv2d_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct csi_tensor *kernel, + struct csi_tensor *bias, + struct conv2d_params *params); + +int csi_depthwise_deconv2d_u8(struct csi_tensor *o_input, + struct csi_tensor *o_output, + struct csi_tensor *o_kernel, + struct csi_tensor *o_bias, + struct conv2d_params *params); + +int csi_deconv3d_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct csi_tensor *kernel, + struct csi_tensor *bias, + struct conv3d_params *params); + +int csi_depth_to_space_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct depth_to_space_params *params); + +int csi_depth_to_space_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct depth_to_space_params *params); + +int csi_div_f32(struct csi_tensor *input0, + struct csi_tensor *input1, + struct csi_tensor *output, + struct diso_params *params); + +int csi_div_u8(struct csi_tensor *input0, + struct csi_tensor *input1, + struct csi_tensor *output, + struct diso_params *params); + +int csi_elu_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct relu_params *params); 
+ +int csi_elu_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct relu_params *params); + +int csi_equal_f32(struct csi_tensor *input0, + struct csi_tensor *input1, + struct csi_tensor *output, + struct diso_params *params); + +int csi_equal_u8(struct csi_tensor *input0, + struct csi_tensor *input1, + struct csi_tensor *output, + struct diso_params *params); + +int csi_erf_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct siso_params *params); + +int csi_erf_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct siso_params *params); + +int csi_exp_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct siso_params *params); + +int csi_exp_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct siso_params *params); + +int csi_expand_dims_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct expand_dims_params *params); + +int csi_expand_dims_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct expand_dims_params *params); + +int csi_expm1_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct siso_params *params); + +int csi_expm1_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct siso_params *params); + +int csi_flatten_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct flatten_params *params); + +int csi_flatten_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct flatten_params *params); + +int csi_floor_divide_f32(struct csi_tensor *input0, + struct csi_tensor *input1, + struct csi_tensor *output, + struct diso_params *params); + +int csi_floor_divide_u8(struct csi_tensor *input0, + struct csi_tensor *input1, + struct csi_tensor *output, + struct diso_params *params); + +int csi_floor_mod_f32(struct csi_tensor *input0, + struct csi_tensor *input1, + struct csi_tensor *output, + struct diso_params *params); + +int csi_floor_mod_u8(struct csi_tensor *input0, + struct csi_tensor *input1, + struct csi_tensor 
*output, + struct diso_params *params); + +int csi_floor_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct siso_params *params); + +int csi_floor_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct siso_params *params); + +int csi_fullyconnected_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct csi_tensor *weights, + struct csi_tensor *bias, + struct fc_params *params); + +int csi_fullyconnected_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct csi_tensor *weights, + struct csi_tensor *bias, + struct fc_params *params); + +int csi_gather_nd_f32(struct csi_tensor *input, + struct csi_tensor *indices, + struct csi_tensor *output, + struct gather_nd_params *params); + +int csi_gather_nd_u8(struct csi_tensor *input, + struct csi_tensor *indices, + struct csi_tensor *output, + struct gather_nd_params *params); + +int csi_gather_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct gather_params *params); + +int csi_gather_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct gather_params *params); + +int csi_global_averagepool_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct pool_params *params); + +int csi_global_averagepool_i8(struct csi_tensor *input, + struct csi_tensor *output, + struct pool_params *params); + +int csi_global_maxpool_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct pool_params *params); + +int csi_greater_equal_f32(struct csi_tensor *input0, + struct csi_tensor *input1, + struct csi_tensor *output, + struct diso_params *params); + +int csi_greater_equal_u8(struct csi_tensor *input0, + struct csi_tensor *input1, + struct csi_tensor *output, + struct diso_params *params); + +int csi_greater_f32(struct csi_tensor *input0, + struct csi_tensor *input1, + struct csi_tensor *output, + struct diso_params *params); + +int csi_greater_u8(struct csi_tensor *input0, + struct csi_tensor *input1, + struct csi_tensor *output, + 
struct diso_params *params); + +int csi_hard_sigmoid_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct sigmoid_params *params); + +int csi_hard_sigmoid_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct sigmoid_params *params); + +int csi_im2col_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct im2col_params *params); + +int csi_im2col_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct im2col_params *params); + +int csi_isnan_bool_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct siso_params *params); + +int csi_isnan_bool_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct siso_params *params); + +int csi_l2_normalization_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct l2n_params *params); + +int csi_l2_normalization_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct l2n_params *params); + +int csi_l2pool_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct pool_params *params); + +int csi_leaky_relu_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct relu_params *params); + +int csi_leaky_relu_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct relu_params *params); + +int csi_less_equal_f32(struct csi_tensor *input0, + struct csi_tensor *input1, + struct csi_tensor *output, + struct diso_params *params); + +int csi_less_equal_u8(struct csi_tensor *input0, + struct csi_tensor *input1, + struct csi_tensor *output, + struct diso_params *params); + +int csi_less_f32(struct csi_tensor *input0, + struct csi_tensor *input1, + struct csi_tensor *output, + struct diso_params *params); + +int csi_less_u8(struct csi_tensor *input0, + struct csi_tensor *input1, + struct csi_tensor *output, + struct diso_params *params); + +int csi_log_softmax_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct softmax_params *params); + +int csi_log_softmax_u8(struct csi_tensor *input, + struct 
csi_tensor *output, + struct softmax_params *params); + +int csi_log_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct siso_params *params); + +int csi_log_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct siso_params *params); + +int csi_log1p_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct siso_params *params); + +int csi_log1p_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct siso_params *params); + +int csi_logical_and_f32(struct csi_tensor *input0, + struct csi_tensor *input1, + struct csi_tensor *output, + struct diso_params *params); + +int csi_logical_and_u8(struct csi_tensor *input0, + struct csi_tensor *input1, + struct csi_tensor *output, + struct diso_params *params); + +int csi_logical_not_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct siso_params *params); + +int csi_logical_not_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct siso_params *params); + +int csi_logical_or_f32(struct csi_tensor *input0, + struct csi_tensor *input1, + struct csi_tensor *output, + struct diso_params *params); + +int csi_logical_or_u8(struct csi_tensor *input0, + struct csi_tensor *input1, + struct csi_tensor *output, + struct diso_params *params); + +int csi_logical_xor_f32(struct csi_tensor *input0, + struct csi_tensor *input1, + struct csi_tensor *output, + struct diso_params *params); + +int csi_logical_xor_u8(struct csi_tensor *input0, + struct csi_tensor *input1, + struct csi_tensor *output, + struct diso_params *params); + +int csi_lrn_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct lrn_params *params); + +int csi_lrn_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct lrn_params *params); + +int csi_matmul_f32(struct csi_tensor *mat0, + struct csi_tensor *mat1, + struct csi_tensor *output, + struct matmul_params *params); + +int csi_matmul_u8(struct csi_tensor *mat0, + struct csi_tensor *mat1, + struct csi_tensor *output, 
+ struct matmul_params *params); + +int csi_max_stride_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct reduce_params *params); + +int csi_max_stride_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct reduce_params *params); + +int csi_maximum_f32(struct csi_tensor *input0, + struct csi_tensor *input1, + struct csi_tensor *output, + struct diso_params *params); + +int csi_maximum_u8(struct csi_tensor *input0, + struct csi_tensor *input1, + struct csi_tensor *output, + struct diso_params *params); + +int csi_maxpool_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct pool_params *params); + +int csi_maxpool_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct pool_params *params); + +int csi_maxpool2d_locat_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct pool_params *params); + +int csi_maxpool2d_locat_i32_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct pool_params *params); + +int csi_maxpool3d_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct pool_params *params); + +int csi_maxpool3d_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct pool_params *params); + +int csi_mean_stride_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct reduce_params *params); + +int csi_mean_stride_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct reduce_params *params); + +int csi_min_stride_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct reduce_params *params); + +int csi_min_stride_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct reduce_params *params); + +int csi_minimum_f32(struct csi_tensor *input0, + struct csi_tensor *input1, + struct csi_tensor *output, + struct diso_params *params); + +int csi_minimum_u8(struct csi_tensor *input0, + struct csi_tensor *input1, + struct csi_tensor *output, + struct diso_params *params); + +int csi_mod_f32(struct csi_tensor *input0, + struct 
csi_tensor *input1, + struct csi_tensor *output, + struct diso_params *params); + +int csi_mod_u8(struct csi_tensor *input0, + struct csi_tensor *input1, + struct csi_tensor *output, + struct diso_params *params); + +int csi_mul_f32(struct csi_tensor *input0, + struct csi_tensor *input1, + struct csi_tensor *output, + struct diso_params *params); + +int csi_mul_u8(struct csi_tensor *input0, + struct csi_tensor *input1, + struct csi_tensor *output, + struct diso_params *params); + +int csi_ndarray_size_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct ndarray_size_params *params); + +int csi_ndarray_size_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct ndarray_size_params *params); + +int csi_ndarray_size_i32(struct csi_tensor *input, + struct csi_tensor *output, + struct ndarray_size_params *params); + +int csi_negative_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct siso_params *params); + +int csi_negative_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct siso_params *params); + +int csi_non_max_suppression_std(struct csi_tensor *input0, + struct csi_tensor *input1, + struct csi_tensor *output, + struct non_max_suppression_params *params); + +int csi_not_equal_f32(struct csi_tensor *input0, + struct csi_tensor *input1, + struct csi_tensor *output, + struct diso_params *params); + +int csi_not_equal_u8(struct csi_tensor *input0, + struct csi_tensor *input1, + struct csi_tensor *output, + struct diso_params *params); + +int csi_not_u32(struct csi_tensor *input, + struct csi_tensor *output, + struct siso_params *params); + +int csi_not_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct siso_params *params); + +int csi_or_u32(struct csi_tensor *input0, + struct csi_tensor *input1, + struct csi_tensor *output, + struct diso_params *params); + +int csi_or_u8(struct csi_tensor *input0, + struct csi_tensor *input1, + struct csi_tensor *output, + struct diso_params *params); + +int 
csi_pad_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct pad_params *params); + +int csi_pad_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct pad_params *params); + +int csi_power_f32(struct csi_tensor *input0, + struct csi_tensor *input1, + struct csi_tensor *output, + struct diso_params *params); + +int csi_power_u8(struct csi_tensor *input0, + struct csi_tensor *input1, + struct csi_tensor *output, + struct diso_params *params); + +int csi_prelu_f32(struct csi_tensor *input, + struct csi_tensor *alpha, + struct csi_tensor *output, + struct prelu_params *params); + +int csi_prelu_u8(struct csi_tensor *input, + struct csi_tensor *alpha, + struct csi_tensor *output, + struct prelu_params *params); + +int csi_prod_stride_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct reduce_params *params); + +int csi_prod_stride_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct reduce_params *params); + +int csi_proposal_f32(struct csi_tensor *cls_prob, + struct csi_tensor *bbox_pred, + struct csi_tensor *im_info, + struct csi_tensor *output, + struct proposal_params *params); + +int csi_proposal_u8(struct csi_tensor *cls_prob, + struct csi_tensor *bbox_pred, + struct csi_tensor *im_info, + struct csi_tensor *output, + struct proposal_params *params); + +int csi_psroipooling_f32(struct csi_tensor *data, + struct csi_tensor *rois, + struct csi_tensor *output, + struct psroipooling_params *params); + +int csi_psroipooling_u8(struct csi_tensor *data, + struct csi_tensor *rois, + struct csi_tensor *output, + struct psroipooling_params *params); + +int csi_reduce_logsumexp_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct reduce_params *params); + +int csi_reduce_logsumexp_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct reduce_params *params); + +int csi_reduce_max_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct reduce_params *params); + +int 
csi_reduce_max_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct reduce_params *params); + +int csi_reduce_mean_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct reduce_params *params); + +int csi_reduce_mean_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct reduce_params *params); + +int csi_reduce_min_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct reduce_params *params); + +int csi_reduce_min_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct reduce_params *params); + +int csi_reduce_prod_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct reduce_params *params); + +int csi_reduce_prod_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct reduce_params *params); + +int csi_reduce_sum_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct reduce_params *params); + +int csi_reduce_sum_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct reduce_params *params); + +int csi_relu_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct relu_params *params); + +int csi_relu_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct relu_params *params); + +int csi_relu1_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct relu_params *params); + +int csi_relu1_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct relu_params *params); + +int csi_relu6_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct relu_params *params); + +int csi_relu6_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct relu_params *params); + +int csi_relun_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct relu_params *params); + +int csi_relun_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct relu_params *params); + +int csi_reshape_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct reshape_params *params); + +int 
csi_reshape_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct reshape_params *params); + +int csi_resize_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct resize_params *params); + +int csi_resize_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct resize_params *params); + +int csi_reverse_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct reverse_params *params); + +int csi_reverse_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct reverse_params *params); + +int csi_roi_align_f32(struct csi_tensor *data, + struct csi_tensor *rois, + struct csi_tensor *output, + struct roi_align_params *params); + +int csi_roipool_f32(struct csi_tensor *data, + struct csi_tensor *rois, + struct csi_tensor *output, + struct roi_pool_params *params); + +int csi_roipool_u8(struct csi_tensor *data, + struct csi_tensor *rois, + struct csi_tensor *output, + struct roi_pool_params *params); + +int csi_round_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct siso_params *params); + +int csi_round_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct siso_params *params); + +int csi_rsqrt_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct siso_params *params); + +int csi_rsqrt_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct siso_params *params); + +int csi_unsorted_segment_max_f32(struct csi_tensor *input, + struct csi_tensor *segment_ids, + struct csi_tensor *output, + struct segment_params *params); + +int csi_segment_max_f32(struct csi_tensor *input, + struct csi_tensor *segment_ids, + struct csi_tensor *output, + struct segment_params *params); + +int csi_unsorted_segment_max_u8(struct csi_tensor *input, + struct csi_tensor *segment_ids, + struct csi_tensor *output, + struct segment_params *params); + +int csi_segment_max_u8(struct csi_tensor *input, + struct csi_tensor *segment_ids, + struct csi_tensor *output, + struct segment_params 
*params); + +int csi_unsorted_segment_mean_f32(struct csi_tensor *input, + struct csi_tensor *segment_ids, + struct csi_tensor *output, + struct segment_params *params); + +int csi_segment_mean_f32(struct csi_tensor *input, + struct csi_tensor *segment_ids, + struct csi_tensor *output, + struct segment_params *params); + +int csi_unsorted_segment_mean_u8(struct csi_tensor *input, + struct csi_tensor *segment_ids, + struct csi_tensor *output, + struct segment_params *params); + +int csi_segment_mean_u8(struct csi_tensor *input, + struct csi_tensor *segment_ids, + struct csi_tensor *output, + struct segment_params *params); + +int csi_unsorted_segment_min_f32(struct csi_tensor *input, + struct csi_tensor *segment_ids, + struct csi_tensor *output, + struct segment_params *params); + +int csi_segment_min_f32(struct csi_tensor *input, + struct csi_tensor *segment_ids, + struct csi_tensor *output, + struct segment_params *params); + +int csi_unsorted_segment_min_u8(struct csi_tensor *input, + struct csi_tensor *segment_ids, + struct csi_tensor *output, + struct segment_params *params); + +int csi_segment_min_u8(struct csi_tensor *input, + struct csi_tensor *segment_ids, + struct csi_tensor *output, + struct segment_params *params); + +int csi_unsorted_segment_prod_f32(struct csi_tensor *input, + struct csi_tensor *segment_ids, + struct csi_tensor *output, + struct segment_params *params); + +int csi_segment_prod_f32(struct csi_tensor *input, + struct csi_tensor *segment_ids, + struct csi_tensor *output, + struct segment_params *params); + +int csi_unsorted_segment_prod_u8(struct csi_tensor *input, + struct csi_tensor *segment_ids, + struct csi_tensor *output, + struct segment_params *params); + +int csi_segment_prod_u8(struct csi_tensor *input, + struct csi_tensor *segment_ids, + struct csi_tensor *output, + struct segment_params *params); + +int csi_unsorted_segment_sum_f32(struct csi_tensor *input, + struct csi_tensor *segment_ids, + struct csi_tensor *output, + struct 
segment_params *params); + +int csi_segment_sum_f32(struct csi_tensor *input, + struct csi_tensor *segment_ids, + struct csi_tensor *output, + struct segment_params *params); + +int csi_unsorted_segment_sum_u8(struct csi_tensor *input, + struct csi_tensor *segment_ids, + struct csi_tensor *output, + struct segment_params *params); + +int csi_segment_sum_u8(struct csi_tensor *input, + struct csi_tensor *segment_ids, + struct csi_tensor *output, + struct segment_params *params); + +int csi_select_f32(struct csi_tensor *condition, + struct csi_tensor *input0, + struct csi_tensor *input1, + struct csi_tensor *output, + struct select_params *params); + +int csi_select_u8(struct csi_tensor *condition, + struct csi_tensor *input0, + struct csi_tensor *input1, + struct csi_tensor *output, + struct select_params *params); + +int csi_shape_i32(struct csi_tensor *input, + struct csi_tensor *output, + struct shape_params *params); + +int csi_shape_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct shape_params *params); + +int csi_shuffle_channel_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct shuffle_channel_params *params); + +int csi_shuffle_channel_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct shuffle_channel_params *params); + +int csi_sigmoid_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct sigmoid_params *params); + +int csi_sigmoid_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct sigmoid_params *params); + +int csi_sign_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct siso_params *params); + +int csi_sign_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct siso_params *params); + +int csi_sin_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct siso_params *params); + +int csi_sin_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct siso_params *params); + +int csi_sinh_f32(struct csi_tensor *input, + struct 
csi_tensor *output, + struct siso_params *params); + +int csi_sinh_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct siso_params *params); + +int csi_slice_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct slice_params *params); + +int csi_slice_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct slice_params *params); + +int csi_softmax_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct softmax_params *params); + +int csi_softmax_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct softmax_params *params); + +int csi_softmax_i8(struct csi_tensor *input, + struct csi_tensor *output, + struct softmax_params *params); + +int csi_softplus_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct siso_params *params); + +int csi_softplus_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct siso_params *params); + +int csi_softrelu_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct relu_params *params); + +int csi_softrelu_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct relu_params *params); + +int csi_softsign_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct siso_params *params); + +int csi_softsign_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct siso_params *params); + +int csi_space_to_batch_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct space_to_batch_params *params); + +int csi_space_to_batch_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct space_to_batch_params *params); + +int csi_space_to_depth_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct space_to_depth_params *params); + +int csi_space_to_depth_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct space_to_depth_params *params); + +int csi_split_u8(struct csi_tensor *input, + struct csi_tensor **output, + struct split_params *params); + +int csi_sqrt_f32(struct 
csi_tensor *input, + struct csi_tensor *output, + struct siso_params *params); + +int csi_sqrt_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct siso_params *params); + +int csi_square_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct siso_params *params); + +int csi_squeeze_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct squeeze_params *params); + +int csi_squeeze_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct squeeze_params *params); + +int csi_stack_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct stack_params *params); + +int csi_stack_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct stack_params *params); + +int csi_strided_slice_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct strided_slice_params *params); + +int csi_strided_slice_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct strided_slice_params *params); + +int csi_sub_f32(struct csi_tensor *input0, + struct csi_tensor *input1, + struct csi_tensor *output, + struct diso_params *params); + +int csi_sub_u8(struct csi_tensor *input0, + struct csi_tensor *input1, + struct csi_tensor *output, + struct diso_params *params); + +int csi_sum_stride_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct reduce_params *params); + +int csi_sum_stride_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct reduce_params *params); + +int csi_tan_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct siso_params *params); + +int csi_tan_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct siso_params *params); + +int csi_tanh_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct siso_params *params); + +int csi_tanh_f64(struct csi_tensor *input, + struct csi_tensor *output, + struct siso_params *params); + +int csi_tanh_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct siso_params *params); 
+ +int csi_threshold_relu_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct relu_params *params); + +int csi_threshold_relu_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct relu_params *params); + +int csi_tile_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct tile_params *params); + +int csi_tile_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct tile_params *params); + +int csi_topk_f32(struct csi_tensor *input, + struct csi_tensor *output1, + struct csi_tensor *output2, + struct topk_params *params); + +int csi_topk_u8(struct csi_tensor *input, + struct csi_tensor *output1, + struct csi_tensor *output2, + struct topk_params *params); + +int csi_transpose_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct transpose_params *params); + +int csi_transpose_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct transpose_params *params); + +int csi_transpose_i8(struct csi_tensor *input, + struct csi_tensor *output, + struct transpose_params *params); + +int csi_trunc_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct siso_params *params); + +int csi_trunc_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct siso_params *params); + +int csi_unpooling_f32(struct csi_tensor *input, + struct csi_tensor *mask, + struct csi_tensor *output, + struct unpooling_params *params); + +int csi_unpooling_u8(struct csi_tensor *input, + struct csi_tensor *mask, + struct csi_tensor *output, + struct unpooling_params *params); + +int csi_unstack_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct unstack_params *params); + +int csi_unstack_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct unstack_params *params); + +int csi_xor_u32(struct csi_tensor *input0, + struct csi_tensor *input1, + struct csi_tensor *output, + struct diso_params *params); + +int csi_xor_u8(struct csi_tensor *input0, + struct csi_tensor *input1, + struct 
csi_tensor *output, + struct diso_params *params); + +int csi_yuv_rgb_scale_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct siso_params *params); + +int csi_yuv_rgb_scale_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct siso_params *params); + +#endif diff --git a/include/csi_nn.h b/include/csi_nn.h index d1d2210b..8a846cf3 100644 --- a/include/csi_nn.h +++ b/include/csi_nn.h @@ -81,16 +81,16 @@ int csi_conv3d_init(struct csi_tensor *input, struct conv3d_params *params); int csi_conv3d(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv3d_params *params); + struct csi_tensor *output, + struct csi_tensor *kernel, + struct csi_tensor *bias, + struct conv3d_params *params); int csi_deconv3d_init(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv3d_params *params); + struct csi_tensor *output, + struct csi_tensor *kernel, + struct csi_tensor *bias, + struct conv3d_params *params); int csi_deconv3d(struct csi_tensor *input, struct csi_tensor *output, @@ -131,12 +131,12 @@ int csi_maxpool(struct csi_tensor *input, struct pool_params *params); int csi_maxpool3d_init(struct csi_tensor *input, - struct csi_tensor *output, - struct pool_params *params); + struct csi_tensor *output, + struct pool_params *params); int csi_maxpool3d(struct csi_tensor *input, - struct csi_tensor *output, - struct pool_params *params); + struct csi_tensor *output, + struct pool_params *params); int csi_global_maxpool_init(struct csi_tensor *input, struct csi_tensor *output, @@ -159,8 +159,8 @@ int csi_averagepool3d_init(struct csi_tensor *input, struct pool_params *params); int csi_averagepool3d(struct csi_tensor *input, - struct csi_tensor *output, - struct pool_params *params); + struct csi_tensor *output, + struct pool_params *params); int csi_global_averagepool_init(struct csi_tensor *input, struct csi_tensor 
*output, @@ -305,12 +305,12 @@ int csi_exp(struct csi_tensor *input, struct siso_params *params); int csi_expm1_init(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params); + struct csi_tensor *output, + struct siso_params *params); int csi_expm1(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params); + struct csi_tensor *output, + struct siso_params *params); int csi_sin_init(struct csi_tensor *input, struct csi_tensor *output, @@ -377,12 +377,12 @@ int csi_sigmoid(struct csi_tensor *input, struct sigmoid_params *params); int csi_hard_sigmoid_init(struct csi_tensor *input, - struct csi_tensor *output, - struct sigmoid_params *params); + struct csi_tensor *output, + struct sigmoid_params *params); int csi_hard_sigmoid(struct csi_tensor *input, - struct csi_tensor *output, - struct sigmoid_params *params); + struct csi_tensor *output, + struct sigmoid_params *params); int csi_elu_init(struct csi_tensor *input, struct csi_tensor *output, @@ -467,8 +467,8 @@ int csi_softmax(struct csi_tensor *input, struct softmax_params *params); int csi_log_softmax_init(struct csi_tensor *input, - struct csi_tensor *output, - struct softmax_params *params); + struct csi_tensor *output, + struct softmax_params *params); int csi_log_softmax(struct csi_tensor *input, struct csi_tensor *output, @@ -637,37 +637,37 @@ int csi_less(struct csi_tensor *input0, struct diso_params *params); int csi_logical_and_init(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params); + struct csi_tensor *input1, + struct csi_tensor *output, + struct diso_params *params); int csi_logical_and(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params); + struct csi_tensor *input1, + struct csi_tensor *output, + struct diso_params *params); int csi_logical_or_init(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor 
*output, - struct diso_params *params); + struct csi_tensor *input1, + struct csi_tensor *output, + struct diso_params *params); int csi_logical_or(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params); + struct csi_tensor *input1, + struct csi_tensor *output, + struct diso_params *params); int csi_logical_not_init(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params); + struct csi_tensor *output, + struct siso_params *params); int csi_logical_not(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params); + struct csi_tensor *output, + struct siso_params *params); int csi_logical_xor_init(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params); + struct csi_tensor *input1, + struct csi_tensor *output, + struct diso_params *params); int csi_logical_xor(struct csi_tensor *input0, struct csi_tensor *input1, @@ -780,11 +780,11 @@ int csi_resize(struct csi_tensor *input, struct csi_tensor *output, struct resize_params *params); -int csi_concat_init(struct csi_tensor *input, +int csi_concat_init(struct csi_tensor **input, struct csi_tensor *output, struct concat_params *params); -int csi_concat(struct csi_tensor *input, +int csi_concat(struct csi_tensor **input, struct csi_tensor *output, struct concat_params *params); @@ -875,11 +875,11 @@ int csi_slice(struct csi_tensor *input, struct slice_params *params); int csi_split_init(struct csi_tensor *input, - struct csi_tensor *output, + struct csi_tensor **output, struct split_params *params); int csi_split(struct csi_tensor *input, - struct csi_tensor *output, + struct csi_tensor **output, struct split_params *params); int csi_stack_init(struct csi_tensor *inputs, @@ -891,8 +891,8 @@ int csi_stack(struct csi_tensor *inputs, struct stack_params *params); int csi_unstack_init(struct csi_tensor *input, - struct csi_tensor *output, - struct 
unstack_params *params); + struct csi_tensor *output, + struct unstack_params *params); int csi_unstack(struct csi_tensor *input, struct csi_tensor *output, @@ -933,12 +933,12 @@ int csi_unstack(struct csi_tensor *input, struct unstack_params *params); int csi_gather_init(struct csi_tensor *input, - struct csi_tensor *output, - struct gather_params *params); + struct csi_tensor *output, + struct gather_params *params); int csi_gather(struct csi_tensor *input, - struct csi_tensor *output, - struct gather_params *params); + struct csi_tensor *output, + struct gather_params *params); int csi_gather_nd_init(struct csi_tensor *input, struct csi_tensor *indices, @@ -1018,12 +1018,10 @@ int csi_sequence_mask(struct csi_tensor *input0, int csi_im2col_init(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct im2col_params *params); int csi_im2col(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct im2col_params *params); int csi_col2im_init(struct csi_tensor *input, @@ -1117,149 +1115,149 @@ int csi_reorg(struct csi_tensor *input, struct reorg_params *params); int csi_yuv_rgb_scale_init(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params); + struct csi_tensor *output, + struct siso_params *params); int csi_yuv_rgb_scale(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params); + struct csi_tensor *output, + struct siso_params *params); int csi_segment_max_init(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct segment_params *params); + struct csi_tensor *input1, + struct csi_tensor *output, + struct segment_params *params); int csi_segment_max(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct segment_params *params); + struct csi_tensor *input1, + struct csi_tensor *output, + struct segment_params *params); int csi_segment_min_init(struct csi_tensor 
*input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct segment_params *params); + struct csi_tensor *input1, + struct csi_tensor *output, + struct segment_params *params); int csi_segment_min(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct segment_params *params); + struct csi_tensor *input1, + struct csi_tensor *output, + struct segment_params *params); int csi_segment_sum_init(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct segment_params *params); + struct csi_tensor *input1, + struct csi_tensor *output, + struct segment_params *params); int csi_segment_sum(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct segment_params *params); + struct csi_tensor *input1, + struct csi_tensor *output, + struct segment_params *params); int csi_segment_mean_init(struct csi_tensor *input0, + struct csi_tensor *input1, + struct csi_tensor *output, + struct segment_params *params); + +int csi_segment_mean(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, struct segment_params *params); -int csi_segment_mean(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct segment_params *params); - int csi_segment_prod_init(struct csi_tensor *input0, + struct csi_tensor *input1, + struct csi_tensor *output, + struct segment_params *params); + +int csi_segment_prod(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, struct segment_params *params); -int csi_segment_prod(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct segment_params *params); - int csi_threshold_relu_init(struct csi_tensor *input, - struct csi_tensor *output, - struct relu_params *params); + struct csi_tensor *output, + struct relu_params *params); int csi_threshold_relu(struct csi_tensor *input, - struct csi_tensor *output, - 
struct relu_params *params); + struct csi_tensor *output, + struct relu_params *params); int csi_acos_init(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params); + struct csi_tensor *output, + struct siso_params *params); int csi_acos(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params); + struct csi_tensor *output, + struct siso_params *params); int csi_acosh_init(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params); + struct csi_tensor *output, + struct siso_params *params); int csi_acosh(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params); + struct csi_tensor *output, + struct siso_params *params); int csi_asin_init(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params); + struct csi_tensor *output, + struct siso_params *params); int csi_asin(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params); + struct csi_tensor *output, + struct siso_params *params); int csi_asinh_init(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params); + struct csi_tensor *output, + struct siso_params *params); int csi_asinh(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params); + struct csi_tensor *output, + struct siso_params *params); int csi_atan_init(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params); + struct csi_tensor *output, + struct siso_params *params); int csi_atan(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params); + struct csi_tensor *output, + struct siso_params *params); int csi_atanh_init(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params); + struct csi_tensor *output, + struct siso_params *params); int csi_atanh(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params); + struct csi_tensor 
*output, + struct siso_params *params); int csi_cosh_init(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params); + struct csi_tensor *output, + struct siso_params *params); int csi_cosh(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params); + struct csi_tensor *output, + struct siso_params *params); int csi_sinh_init(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params); + struct csi_tensor *output, + struct siso_params *params); int csi_sinh(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params); + struct csi_tensor *output, + struct siso_params *params); int csi_tan_init(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params); int csi_tan(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params); + struct csi_tensor *output, + struct siso_params *params); int csi_log1p_init(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params); + struct csi_tensor *output, + struct siso_params *params); int csi_log1p(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params); + struct csi_tensor *output, + struct siso_params *params); int csi_softsign_init(struct csi_tensor *input, struct csi_tensor *output, @@ -1282,12 +1280,12 @@ int csi_cumsum_init(struct csi_tensor *input, struct cumsum_params *params); int csi_cumsum(struct csi_tensor *input, - struct csi_tensor *output, - struct cumsum_params *params); + struct csi_tensor *output, + struct cumsum_params *params); int csi_cumprod_init(struct csi_tensor *input, - struct csi_tensor *output, - struct cumprod_params *params); + struct csi_tensor *output, + struct cumprod_params *params); int csi_cumprod(struct csi_tensor *input, struct csi_tensor *output, @@ -1298,20 +1296,20 @@ int csi_reduce_max_init(struct csi_tensor *input, struct reduce_params *params); int csi_reduce_max(struct csi_tensor 
*input, - struct csi_tensor *output, - struct reduce_params *params); + struct csi_tensor *output, + struct reduce_params *params); int csi_reduce_min_init(struct csi_tensor *input, struct csi_tensor *output, struct reduce_params *params); int csi_reduce_min(struct csi_tensor *input, - struct csi_tensor *output, - struct reduce_params *params); + struct csi_tensor *output, + struct reduce_params *params); int csi_reduce_mean_init(struct csi_tensor *input, - struct csi_tensor *output, - struct reduce_params *params); + struct csi_tensor *output, + struct reduce_params *params); int csi_reduce_mean(struct csi_tensor *input, struct csi_tensor *output, @@ -1322,32 +1320,32 @@ int csi_reduce_sum_init(struct csi_tensor *input, struct reduce_params *params); int csi_reduce_sum(struct csi_tensor *input, - struct csi_tensor *output, - struct reduce_params *params); + struct csi_tensor *output, + struct reduce_params *params); int csi_reduce_prod_init(struct csi_tensor *input, - struct csi_tensor *output, - struct reduce_params *params); + struct csi_tensor *output, + struct reduce_params *params); int csi_reduce_prod(struct csi_tensor *input, struct csi_tensor *output, struct reduce_params *params); int csi_reduce_logsumexp_init(struct csi_tensor *input, - struct csi_tensor *output, - struct reduce_params *params); + struct csi_tensor *output, + struct reduce_params *params); int csi_reduce_logsumexp(struct csi_tensor *input, - struct csi_tensor *output, - struct reduce_params *params); + struct csi_tensor *output, + struct reduce_params *params); int csi_broadcast_to_init(struct csi_tensor *input, - struct csi_tensor *output, - struct broadcast_to_params *params); + struct csi_tensor *output, + struct broadcast_to_params *params); int csi_broadcast_to(struct csi_tensor *input, - struct csi_tensor *output, - struct broadcast_to_params *params); + struct csi_tensor *output, + struct broadcast_to_params *params); int csi_clip_init(struct csi_tensor *input, struct csi_tensor 
*output, @@ -1358,11 +1356,49 @@ int csi_clip(struct csi_tensor *input, struct clip_params *params); int csi_strided_slice_init(struct csi_tensor *input, - struct csi_tensor *output, - struct strided_slice_params *params); + struct csi_tensor *output, + struct strided_slice_params *params); int csi_strided_slice(struct csi_tensor *input, + struct csi_tensor *output, + struct strided_slice_params *params); + +int csi_topk_init(struct csi_tensor *input, + struct csi_tensor *output1, + struct csi_tensor *output2, + struct topk_params *params); + +int csi_topk(struct csi_tensor *input, + struct csi_tensor *output1, + struct csi_tensor *output2, + struct topk_params *params); + +int csi_non_max_suppression_init(struct csi_tensor *input0, + struct csi_tensor *input1, + struct csi_tensor *output, + struct non_max_suppression_params *params); + +int csi_non_max_suppression(struct csi_tensor *input0, + struct csi_tensor *input1, + struct csi_tensor *output, + struct non_max_suppression_params *params); + +int csi_shuffle_channel_init(struct csi_tensor *input, + struct csi_tensor *output, + struct shuffle_channel_params *params); + +int csi_shuffle_channel(struct csi_tensor *input, + struct csi_tensor *output, + struct shuffle_channel_params *params); + +int csi_roipool_init(struct csi_tensor *data, + struct csi_tensor *rois, struct csi_tensor *output, - struct strided_slice_params *params); + struct roi_pool_params *params); + +int csi_roipool(struct csi_tensor *data, + struct csi_tensor *rois, + struct csi_tensor *output, + struct roi_pool_params *params); #endif diff --git a/include/csi_ovx.h b/include/csi_ovx.h index 49ccc2d9..6417881e 100644 --- a/include/csi_ovx.h +++ b/include/csi_ovx.h @@ -52,6 +52,12 @@ int csi_ovx_deconv2d(struct csi_tensor *input, struct csi_tensor *bias, struct conv2d_params *params); +int csi_ovx_depthwise_deconv2d(struct csi_tensor *input, + struct csi_tensor *output, + struct csi_tensor *kernel, + struct csi_tensor *bias, + struct 
conv2d_params *params); + int csi_ovx_fullyconnected(struct csi_tensor *input, struct csi_tensor *output, struct csi_tensor *weights, @@ -301,7 +307,7 @@ int csi_ovx_resize(struct csi_tensor *input, struct csi_tensor *output, struct resize_params *params); -int csi_ovx_concat(struct csi_tensor *input, +int csi_ovx_concat(struct csi_tensor **input, struct csi_tensor *output, struct concat_params *params); @@ -365,7 +371,7 @@ int csi_ovx_slice_tail(struct csi_tensor *input, struct slice_params *params); int csi_ovx_split(struct csi_tensor *input, - struct csi_tensor *output, + struct csi_tensor **output, struct split_params *params); int csi_ovx_stack(struct csi_tensor *inputs, @@ -482,23 +488,20 @@ int csi_ovx_reorg(struct csi_tensor *input, struct reorg_params *params); int32_t csi_get_ceil_mode_fix(int32_t input, int32_t kernel, int32_t stride, int32_t pad); -int csi_nn_create_tensor(struct csi_tensor *input, - struct csi_tensor *output, - void *td); -int csi_nn_ovx_create_const(struct csi_tensor *input, void *td); -uint8_t *csi_nn_input_f32_to_u8(uint32_t idx, float *data, void *td); -void csi_nn_update_input(uint32_t idx, uint8_t *data, void *td); -void csi_nn_set_ovx_input(int index, int input, struct __target_data *td); -void csi_nn_set_ovx_output(int index, struct csi_tensor *output, struct __target_data *td); - -int csi_nn_get_output_number(void *td); -int csi_nn_get_input_number(void *td); -struct csi_tensor *csi_nn_get_output(void *td, int index); -struct csi_tensor *csi_nn_get_input(void *td, int index); -struct csi_tensor *csi_nn_ovx_get_tensor(void *td, int index); -void csi_nn_save_output(void *td, int index, const char *filename); -void csi_nn_show_top5(void *td, int index); -uint64_t csi_get_perf_count(); - -void csi_ovx_free(struct __target_data *td); + +struct csi_ovx_target_data { + void *graph; +}; + +void *csi_ovx_get_graph(struct csi_session *sess); + +void csi_ovx_set_tensor(struct csi_tensor *tensor, struct csi_session *sess); +void 
csi_ovx_set_const_tensor(struct csi_tensor *tensor, struct csi_session *sess); +uint8_t *csi_ovx_input_f32_to_u8(uint32_t idx, float *data, struct csi_session *sess); +int csi_ovx_get_tensor(int index, struct csi_tensor *ret, struct csi_session *sess); +void csi_ovx_save_output(int index, const char *filename, struct csi_session *sess); +void csi_ovx_show_top5(int index, struct csi_session *sess); +void csi_ovx_nbg(struct csi_tensor **input, struct csi_tensor **output, + uint32_t inputs_count, uint32_t outputs_count, const char *url); + #endif diff --git a/include/csi_pnna.h b/include/csi_pnna.h new file mode 100644 index 00000000..47e3e94d --- /dev/null +++ b/include/csi_pnna.h @@ -0,0 +1,319 @@ +/* + * Copyright (C) 2016-2020 C-SKY Limited. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef _CSI_NN_PNNA_H +#define _CSI_NN_PNNA_H +#include "csi_nn.h" + +int csi_pnna_conv2d(struct csi_tensor *input, + struct csi_tensor *output, + struct csi_tensor *kernel, + struct csi_tensor *bias, + struct conv2d_params *params); + +int csi_pnna_depthwise_conv2d(struct csi_tensor *input, + struct csi_tensor *output, + struct csi_tensor *kernel, + struct csi_tensor *bias, + struct conv2d_params *params); + +int csi_pnna_group_conv2d(struct csi_tensor *input, + struct csi_tensor *output, + struct csi_tensor *kernel, + struct csi_tensor *bias, + struct conv2d_params *params); + +int csi_pnna_deconv2d(struct csi_tensor *input, + struct csi_tensor *output, + struct csi_tensor *kernel, + struct csi_tensor *bias, + struct conv2d_params *params); + +int csi_pnna_depthwise_deconv2d(struct csi_tensor *input, + struct csi_tensor *output, + struct csi_tensor *kernel, + struct csi_tensor *bias, + struct conv2d_params *params); + +int csi_pnna_fullyconnected(struct csi_tensor *input, + struct csi_tensor *output, + struct csi_tensor *weights, + struct csi_tensor *bias, + struct fc_params *params); + +int csi_pnna_maxpool(struct csi_tensor *input, + struct csi_tensor *output, + struct pool_params *params); + +int csi_pnna_global_maxpool(struct csi_tensor *input, + struct csi_tensor *output, + struct pool_params *params); + +int csi_pnna_averagepool(struct csi_tensor *input, + struct csi_tensor *output, + struct pool_params *params); + +int csi_pnna_global_averagepool(struct csi_tensor *input, + struct csi_tensor *output, + struct pool_params *params); + +int csi_pnna_global_maxpool(struct csi_tensor *input, + struct csi_tensor *output, + struct pool_params *params); + +int csi_pnna_negative(struct csi_tensor *input, + struct csi_tensor *output, + struct siso_params *params); + +int csi_pnna_tanh(struct csi_tensor *input, + struct csi_tensor *output, + struct siso_params *params); + +int csi_pnna_sigmoid(struct csi_tensor *input, + struct csi_tensor *output, + struct 
sigmoid_params *params); + +int csi_pnna_elu(struct csi_tensor *input, + struct csi_tensor *output, + struct relu_params *params); + +int csi_pnna_relu(struct csi_tensor *input, + struct csi_tensor *output, + struct relu_params *params); + +int csi_pnna_relu6(struct csi_tensor *input, + struct csi_tensor *output, + struct relu_params *params); + +int csi_pnna_leaky_relu(struct csi_tensor *input, + struct csi_tensor *output, + struct relu_params *params); + +int csi_pnna_prelu(struct csi_tensor *input, + struct csi_tensor *alpha, + struct csi_tensor *output, + struct prelu_params *params); + +int csi_pnna_softmax(struct csi_tensor *input, + struct csi_tensor *output, + struct softmax_params *params); + +int csi_pnna_batch_normalization(struct csi_tensor *input, + struct csi_tensor *mean, + struct csi_tensor *variance, + struct csi_tensor *gamma, + struct csi_tensor *beta, + struct csi_tensor *output, + struct bn_params *params); + +int csi_pnna_lrn(struct csi_tensor *input, + struct csi_tensor *output, + struct lrn_params *params); + +int csi_pnna_matmul(struct csi_tensor *mat0, + struct csi_tensor *mat1, + struct csi_tensor *output, + struct matmul_params *params); + +int csi_pnna_add(struct csi_tensor *input0, + struct csi_tensor *input1, + struct csi_tensor *output, + struct diso_params *params); + +int csi_pnna_sub(struct csi_tensor *input0, + struct csi_tensor *input1, + struct csi_tensor *output, + struct diso_params *params); + +int csi_pnna_mul(struct csi_tensor *input0, + struct csi_tensor *input1, + struct csi_tensor *output, + struct diso_params *params); + +int csi_pnna_div(struct csi_tensor *input0, + struct csi_tensor *input1, + struct csi_tensor *output, + struct diso_params *params); + +int csi_pnna_maximum(struct csi_tensor *input0, + struct csi_tensor *input1, + struct csi_tensor *output, + struct diso_params *params); + +int csi_pnna_minimum(struct csi_tensor *input0, + struct csi_tensor *input1, + struct csi_tensor *output, + struct diso_params 
*params); + +int csi_pnna_power(struct csi_tensor *input0, + struct csi_tensor *input1, + struct csi_tensor *output, + struct diso_params *params); + +int csi_pnna_greater(struct csi_tensor *input0, + struct csi_tensor *input1, + struct csi_tensor *output, + struct diso_params *params); + +int csi_pnna_less(struct csi_tensor *input0, + struct csi_tensor *input1, + struct csi_tensor *output, + struct diso_params *params); + +int csi_pnna_equal(struct csi_tensor *input0, + struct csi_tensor *input1, + struct csi_tensor *output, + struct diso_params *params); + +int csi_pnna_not_equal(struct csi_tensor *input0, + struct csi_tensor *input1, + struct csi_tensor *output, + struct diso_params *params); + +int csi_pnna_greater_equal(struct csi_tensor *input0, + struct csi_tensor *input1, + struct csi_tensor *output, + struct diso_params *params); + +int csi_pnna_less_equal(struct csi_tensor *input0, + struct csi_tensor *input1, + struct csi_tensor *output, + struct diso_params *params); + +int csi_pnna_select(struct csi_tensor *condition, + struct csi_tensor *input0, + struct csi_tensor *input1, + struct csi_tensor *output, + struct diso_params *params); + +int csi_pnna_and(struct csi_tensor *input0, + struct csi_tensor *input1, + struct csi_tensor *output, + struct diso_params *params); + +int csi_pnna_or(struct csi_tensor *input0, + struct csi_tensor *input1, + struct csi_tensor *output, + struct diso_params *params); + +int csi_pnna_pad(struct csi_tensor *input, + struct csi_tensor *output, + struct pad_params *params); + +int csi_pnna_resize(struct csi_tensor *input, + struct csi_tensor *output, + struct resize_params *params); + +int csi_pnna_concat(struct csi_tensor **input, + struct csi_tensor *output, + struct concat_params *params); + +int csi_pnna_transpose(struct csi_tensor *input, + struct csi_tensor *output, + struct transpose_params *params); + +int csi_pnna_reshape(struct csi_tensor *input, + struct csi_tensor *output, + struct reshape_params *params); + 
+int csi_pnna_shape(struct csi_tensor *input, + struct csi_tensor *output, + struct shape_params *params); + +int csi_pnna_flatten(struct csi_tensor *input, + struct csi_tensor *output, + struct flatten_params *params); + +int csi_pnna_crop(struct csi_tensor *input, + struct csi_tensor *output, + struct crop_params *params); + +int csi_pnna_slice(struct csi_tensor *input, + struct csi_tensor *output, + struct slice_params *params); + +int csi_pnna_split(struct csi_tensor *input, + struct csi_tensor **output, + struct split_params *params); + +int csi_pnna_squeeze(struct csi_tensor *input, + struct csi_tensor *output, + struct squeeze_params *params); + +int csi_pnna_space_to_batch(struct csi_tensor *input, + struct csi_tensor *output, + struct space_to_batch_params *params); + +int csi_pnna_batch_to_space(struct csi_tensor *input, + struct csi_tensor *output, + struct batch_to_space_params *params); + +int csi_pnna_space_to_depth(struct csi_tensor *input, + struct csi_tensor *output, + struct space_to_depth_params *params); + +int csi_pnna_depth_to_space(struct csi_tensor *input, + struct csi_tensor *output, + struct depth_to_space_params *params); + +int csi_pnna_sum(struct csi_tensor *input, + struct csi_tensor *output, + struct reduce_params *params); + +int csi_pnna_mean(struct csi_tensor *input, + struct csi_tensor *output, + struct reduce_params *params); + +int csi_pnna_max(struct csi_tensor *input, + struct csi_tensor *output, + struct reduce_params *params); + +int csi_pnna_min(struct csi_tensor *input, + struct csi_tensor *output, + struct reduce_params *params); + +int csi_pnna_prod(struct csi_tensor *input, + struct csi_tensor *output, + struct reduce_params *params); + +int csi_pnna_argmin(struct csi_tensor *input, + struct csi_tensor *output, + struct reduce_params *params); + +int csi_pnna_argmax(struct csi_tensor *input, + struct csi_tensor *output, + struct reduce_params *params); + +int csi_pnna_all(struct csi_tensor *input, + struct csi_tensor 
*output, + struct reduce_params *params); + +int csi_pnna_any(struct csi_tensor *input, + struct csi_tensor *output, + struct reduce_params *params); + +struct csi_pnna_target_data { + void *network; + void *net_obj; + void *context; + void *attrs; + void *graph; + void *nodes; +}; + +#endif diff --git a/include/csi_pnna_wrapper.h b/include/csi_pnna_wrapper.h new file mode 100644 index 00000000..28bfebe1 --- /dev/null +++ b/include/csi_pnna_wrapper.h @@ -0,0 +1,41 @@ +/* + * Copyright (C) 2016-2020 C-SKY Limited. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef _CSI_NN_PNNA_WRAPPER_H +#define _CSI_NN_PNNA_WRAPPER_H + +#ifdef __cplusplus +extern "C" { +#endif +int csi_pnna_session_init_internal(struct csi_pnna_target_data *td, void *params_buf); +int csi_pnna_session_deinit_internal(struct csi_pnna_target_data *td); +int csi_pnna_session_setup_internal(struct csi_pnna_target_data *td); +int csi_pnna_session_run_internal(struct csi_pnna_target_data* td, void *input_buf, int input_num); +int csi_pnna_create_val(struct csi_tensor *t); +int csi_pnna_create_const_val(struct csi_tensor *t); +int csi_pnna_create_conv2d(struct conv2d_params *params, struct csi_tensor *kernel, + struct csi_tensor *output, int channel, bool use_bias); +int csi_pnna_create_relu(struct relu_params *params, struct csi_tensor *t); +int csi_pnna_set_graph_attrs(void *attrs, double max, double min, char *name); +void csi_pnna_set_node_input(void *nodes_vec, int node, int input, int index); +void csi_pnna_set_output_internal(void *graph, void *nodes_vec, int output, int index); +#ifdef __cplusplus +} +#endif + +#endif diff --git a/include/csi_utils.h b/include/csi_utils.h index 7115bd76..17c10d35 100644 --- a/include/csi_utils.h +++ b/include/csi_utils.h @@ -28,14 +28,17 @@ #include #include -struct __target_data { - void *graph; +#define CSINN_MAX_INPUT 4 +#define CSINN_MAX_OUTPUT 8 +struct csi_session { + int32_t base_dtype; + int32_t base_layout; + int32_t base_api; int32_t input_num; int32_t output_num; - int32_t layer_num; - struct csi_tensor *input; - struct layer_item *net; - int32_t *output_index; + struct csi_tensor *input[CSINN_MAX_INPUT]; + struct csi_tensor *output[CSINN_MAX_OUTPUT]; + void *td; }; int32_t csi_max_internal_s32(int32_t a, int32_t b); @@ -49,30 +52,55 @@ int32_t csi_get_index_6(int32_t *dim, int32_t index0, int32_t index1, int32_t in float csi_get_scale(int32_t multiplier, int32_t shift); int32_t csi_dequantize_u8(uint8_t input, int32_t offset, int32_t multiplier, int32_t shift); uint8_t csi_quantize_u8(int32_t 
input, int32_t offset, int32_t multiplier, int32_t shift); -float csi_dequantize_f32(uint8_t input, int32_t offset, int32_t multiplier, int32_t shift); -uint8_t csi_quantize_f32(float input, int32_t offset, int32_t multiplier, int32_t shift); +int8_t csi_quantize_i8(int32_t input, int32_t offset, int32_t multiplier, int32_t shift); +float csi_dequantize_u8_to_f32(uint8_t input, int32_t offset, int32_t multiplier, int32_t shift); +float csi_dequantize_i8_to_f32(int8_t input, int32_t offset, int32_t multiplier, int32_t shift); +void csi_dequantize_f32_c860(uint8_t *input, float * output, int32_t offset, int32_t multiplier, int32_t shift, int32_t length); +uint8_t csi_quantize_f32_to_u8(float input, int32_t offset, int32_t multiplier, int32_t shift); +int8_t csi_quantize_f32_to_i8(float input, int32_t offset, int32_t multiplier, int32_t shift); uint8_t csi_requantize_u8(uint8_t input, int32_t input_offset, int32_t input_multiplier, int32_t input_shift, int32_t output_offset, int32_t output_multiplier, int32_t output_shift); - -struct csi_tensor *csi_nchw_to_nhwc_u8(struct csi_tensor *t); -void csi_nhwc_to_nchw_u8(struct csi_tensor *nt, struct csi_tensor *t); +uint8_t csi_quantize_channel_u8(int32_t data, struct csi_tensor* input, struct csi_tensor* output, float wscale); +float uint8_to_float_channel(uint8_t i, float scale, int32_t zero_point); +float uint8_to_float(uint8_t i, struct csi_tensor *t); +float int8_to_float(int8_t i, struct csi_tensor *t); +uint8_t float_to_uint8(float i, struct csi_tensor *t); +int8_t float_to_int8(float i, struct csi_tensor *t); +int64_t conv_out_u8(int64_t res, struct csi_tensor *input, struct csi_tensor *output, struct csi_tensor *kernel); +int64_t conv_out_i8(int64_t res, struct csi_tensor *input, struct csi_tensor *output, struct csi_tensor *kernel); +int64_t conv_relu6_out_u8(int64_t res, struct csi_tensor *input, struct csi_tensor *output, struct csi_tensor *kernel); +int64_t conv_relu6_out_i8(int64_t res, struct csi_tensor 
*input, struct csi_tensor *output, struct csi_tensor *kernel); +int64_t conv_channel_out_u8(int64_t res, struct csi_tensor *input, struct csi_tensor *output, float kscale); +int64_t conv_channel_relu6_u8(int64_t res, struct csi_tensor *input, struct csi_tensor *output, float kscale); +struct csi_tensor *csi_nchw_to_nhwc_8(struct csi_tensor *t); +void csi_nhwc_to_nchw_8(struct csi_tensor *nt, struct csi_tensor *t); struct csi_tensor *csi_deconv_kernel_nchw_to_nhwc_u8(struct csi_tensor *t, int32_t permute[4]); struct csi_tensor *csi_nchw_to_nhwc_f32(struct csi_tensor *t); void csi_nhwc_to_nchw_f32(struct csi_tensor *nt, struct csi_tensor *t); -#define MAX_INPUT_INDEX 8 -#define MAX_OUTPUT_INDEX 8 -struct layer_item { - int32_t input_index[MAX_INPUT_INDEX]; - int32_t output_index[MAX_OUTPUT_INDEX]; - int32_t input_num; - int32_t output_num; -}; +void csi_get_top5(float *buf, uint32_t size, float *prob, uint32_t *cls); +uint64_t csi_get_timespec(); struct csi_tensor *csi_nchw_to_nhwc_u8_new(struct csi_tensor *t, int32_t permute[4]); int32_t get_reduction_index(int32_t k, const int32_t *strides, - const int32_t *extents, int32_t n); - + const int32_t *extents, int32_t n); + +struct csi_tensor *csi_alloc_tensor(struct csi_session *session); +struct csi_session *csi_alloc_session(); +void csi_free_session(struct csi_session *session); +void csi_session_init(struct csi_session *session); +void csi_session_deinit(struct csi_session *session); +int csi_session_setup(struct csi_session *session); +int csi_session_run(struct csi_session *session); +void csi_set_input_number(int number, struct csi_session *sess); +void csi_set_output_number(int number, struct csi_session *sess); +int csi_get_input_number(struct csi_session *sess); +int csi_get_output_number(struct csi_session *sess); +int csi_set_input(int index, struct csi_tensor *input, struct csi_session *sess); +int csi_set_output(int index, struct csi_tensor *output, struct csi_session *sess); +int csi_get_input(int index, 
struct csi_tensor *input, struct csi_session *sess); +int csi_get_output(int index, struct csi_tensor *output, struct csi_session *sess); +int csi_update_input(int index, struct csi_tensor *input, struct csi_session *sess); /* * model setup and run @@ -91,6 +119,6 @@ void csi_nn_deinit(struct csi_tensor *input, struct csi_tensor *output); void *csi_nn_presetup(int input, int output); - +void *csi_bc_map(int api, int op, int dtype); #endif diff --git a/source/c860_opt/utils.S b/source/c860_opt/utils.S new file mode 100644 index 00000000..0197f2f3 --- /dev/null +++ b/source/c860_opt/utils.S @@ -0,0 +1,100 @@ +/* + * Copyright (C) 2016-2020 C-SKY Limited. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/** + * + * void csi_dequantize_f32_c860( + * uint8_t *input, + * float *output, + * int32_t offset, + * int32_t multiplier, + * int32_t shift, + * uint32_t length) + * +**/ + + .file "utils.S" + .section .text.csi_dequantize_f32_c860,"ax",@progbits + .align 2 + .global csi_dequantize_f32_c860 + +csi_dequantize_f32_c860: + ld.w t0, (sp, 0x4) // length + ld.w t3, (sp, 0x0) // shift + vdupg.32 vr0, a3 + addi t1, t3, 96 // shift - 31 + 127 + lsli t1, t1, 23 // the float value + vitof.s32.f32 vr1, vr0 + vdupg.32 vr7, a2 // offset + vdupg.32 vr6, t1 // scale + vmul.f32 vr6, vr6, vr1 + lsri t2, t0, 4 // length >> 4 + bez t2, .L2 + +.L0: + vldmu.8 vr0-vr0, (a0) // input + vmov.u8.e vr2, vr0 + vmov.u16.e vr4, vr2 + vmov.u16.e vr0, vr3 + vadd.s32 vr4, vr4, vr7 // add offset + vadd.s32 vr5, vr5, vr7 + vadd.s32 vr0, vr0, vr7 + vadd.s32 vr1, vr1, vr7 + vitof.s32.f32 vr2, vr4 + vitof.s32.f32 vr3, vr5 + vitof.s32.f32 vr4, vr0 + vitof.s32.f32 vr5, vr1 + vmul.f32 vr2, vr2, vr6 + vmul.f32 vr3, vr3, vr6 + vmul.f32 vr4, vr4, vr6 + vmul.f32 vr5, vr5, vr6 + vstmu.32 vr2-vr5, (a1) + bnezad t2, .L0 + +.L1: + andi t2, t0, 15 // length & 15 + lsri t1, t2, 2 + bez t1, .L3 + +.L2: + vldu.8.4 vr0, (a0) // input + vmov.u8.e vr2, vr0 + vmov.u16.e vr4, vr2 + vadd.s32 vr4, vr4, vr7 // add offset + vitof.s32.f32 vr2, vr4 + vmul.f32 vr2, vr2, vr6 + vstmu.32 vr2-vr2, (a1) + + bnezad t1, .L2 + +.L3: + andi t1, t2, 3 + bez t1, .L4 + + vldx.8 vr0, (a0), t1 // input + vmov.u8.e vr2, vr0 + vmov.u16.e vr4, vr2 + vadd.s32 vr4, vr4, vr7 // add offset + vitof.s32.f32 vr2, vr4 + vmul.f32 vr2, vr2, vr6 + vstx.32 vr2, (a1), t1 + +.L4: + rts + .size csi_dequantize_f32_c860, .-csi_dequantize_f32_c860 + diff --git a/source/c906_opt/abs.c b/source/c906_opt/abs.c new file mode 100644 index 00000000..9e341c06 --- /dev/null +++ b/source/c906_opt/abs.c @@ -0,0 +1,77 @@ +/* + * Copyright (C) 2016-2020 C-SKY Limited. All rights reserved. 
+ * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "csi_nn.h" +#include "csi_utils.h" + +int csi_abs_f32_c906(struct csi_tensor *input, + struct csi_tensor *output, + struct siso_params *params) +{ + float *input_data = (float *)input->data; + float *output_data = (float *)output->data; + int size = 1; + for (int i = 0; i < input->dim_count; i++) { + size = size * input->dim[i]; + } + + asm volatile( + "loop:\n\t" + "vsetvli t0, %3, e32, m2\n\t" + "vlw.v v2, (%2)\n\t" + "slli t1, t0, 2\n\t" + "add %2, %2, t1\n\t" + "vfsgnjx.vv v4, v2, v2\n\t" + "vsw.v v4, (%0)\n\t" + "add %0, %0, t1\n\t" + "sub %3, %3, t0\n\t" + "bnez %3, loop\n\t" + + :"=r"(output_data) // %0 + :"0"(output_data), // %1 + "r"(input_data), // %2 + "r"(size) // %3 + : "v2", "v3", "v4", "v5", "t0", "t1" + ); + + // for (int i = 0; i < size; i++) { + // output_data[i] = fabs(input_data[i]); + // } + return CSINN_TRUE; +} + + + +int csi_abs_u8_c906(struct csi_tensor *input, + struct csi_tensor *output, + struct siso_params *params) +{ + uint8_t *input_data = input->data; + uint8_t *output_data = output->data; + int size = 1; + for (int i = 0; i < input->dim_count; i++) { + size = size * input->dim[i]; + } + + for (int i = 0; i < size; i++) { + float input_val = csi_dequantize_u8_to_f32(input_data[i], input->zero_point, input->multiplier, input->shift); + float abs_val = fabs(input_val); + output_data[i] = csi_quantize_f32_to_u8(abs_val, 
output->zero_point, output->multiplier, output->shift); + } + return CSINN_TRUE; +} diff --git a/source/c906_opt/add.c b/source/c906_opt/add.c new file mode 100644 index 00000000..c713f2b9 --- /dev/null +++ b/source/c906_opt/add.c @@ -0,0 +1,177 @@ +/* + * Copyright (C) 2016-2020 C-SKY Limited. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "csi_nn.h" +#include "csi_utils.h" + +int csi_add_f32_c906(struct csi_tensor *input0, + struct csi_tensor *input1, + struct csi_tensor *output, + struct diso_params *params) +{ + float *input0_data = input0->data; + float *input1_data = input1->data; + float *output_data = output->data; + int size0 = 1; + for (int i = 0; i < input0->dim_count; i++) { + size0 = size0 * input0->dim[i]; + } + + int size1 = 1; + for (int i = 0; i < input1->dim_count; i++) { + size1 = size1 * input1->dim[i]; + } + + if(size0 == size1){ + + asm volatile( + "0:\n\t" + "vsetvli t0, %4, e32, m2\n\t" + "vlw.v v2, (%2)\n\t" + "sub %4, %4, t0\n\t" + "slli t0, t0, 2\n\t" + "add %2, %2, t0\n\t" + "vlw.v v4, (%3)\n\t" + "add %3, %3, t0\n\t" + "vfadd.vv v6, v2, v4\n\t" + "vsw.v v6, (%0)\n\t" + "add %0, %0, t0\n\t" + "bnez %4, 0b\n\t" + + :"=r"(output_data) // %0 + :"0"(output_data), // %1 + "r"(input0_data), // %2 + "r"(input1_data), // %3 + "r"(size0) // %4 + : "v2", "v3", "v4", "v5", "v6", "v7", "t0" + ); + + // for (int i = 0; i < size0; i++) { + // output_data[i] = 
input0_data[i] + input1_data[i]; + // } + } + else if(input1->dim[0] == input0->dim[3] && size1 == input1->dim[0]){ + + int inner_size = input0->dim[3]; + int outer_size = input0->dim[0] * input0->dim[1] * input0->dim[2]; + + asm volatile( + "outer_loop:\n\t" + "mv a1, %4\n\t" + "inner_loop:\n\t" + "vsetvli t0, a1, e32, m2\n\t" + "vlw.v v2, (%2)\n\t" + "sub a1, a1, t0\n\t" + "slli t0, t0, 2\t\n" + "add %2, %2, t0\n\t" + "vlw.v v4, (%3)\n\t" + "add %3, %3, t0\n\t" + "vfadd.vv v6, v2, v4\n\t" + "vsw.v v6, (%0)\n\t" + "add %0, %0, t0\n\t" + "bnez a1, inner_loop\n\t" + "slli a2, %4, 2\n\t" + "sub %3, %3, a2\n\t" + "addi %5, %5, -1\n\t" + "bnez %5, outer_loop\n\t" + + :"=r"(output_data) // %0 + :"0"(output_data), // %1 + "r"(input0_data), // %2 + "r"(input1_data), // %3 + "r"(inner_size), // %4 + "r"(outer_size) // %5 + : "v2", "v3", "v4", "v5", "v6", "v7", "a1", "a2", "t0" + ); + // for(int n = 0; n < input0->dim[0]; n++){ + // for(int h = 0; h < input0->dim[1]; h++){ + // for(int w = 0; w < input0->dim[2]; w++){ + // for(int c = 0; c < input0->dim[3]; c++){ + // int index = csi_get_index(input0->dim, n, h, w, c); + // output_data[index] = input1_data[c] + input0_data[index]; + // } + // } + // } + // } + } + + return CSINN_TRUE; +} + +int csi_add_u8_c906(struct csi_tensor *input0, + struct csi_tensor *input1, + struct csi_tensor *output, + struct diso_params *params) +{ + uint8_t *input0_data = input0->data; + uint8_t *input1_data = input1->data; + uint8_t *output_data = output->data; + + int channel; + if (params->layout == CSINN_NHWC){channel = input0->dim[3];} + else if (params->layout == CSINN_NCHW){channel = input0->dim[1];} + + + int size0 = 1; + for (int i = 0; i < input0->dim_count; i++) { + size0 = size0 * input0->dim[i]; + } + + int size1 = 1; + int axis = 0; + for (int i = 0; i < input1->dim_count; i++) { + size1 = size1 * input1->dim[i]; + if (input1->dim[i] != 1){ + axis = i; + } + } + + if(size0 == size1){ + for (int i = 0; i < size0; i++) { + float 
input0_val = + csi_dequantize_u8_to_f32(input0_data[i], input0->zero_point, input0->multiplier, input0->shift); + float input1_val = + csi_dequantize_u8_to_f32(input1_data[i], input1->zero_point, input1->multiplier, input1->shift); + float res = input0_val + input1_val; + output_data[i] = csi_quantize_f32_to_u8(res, output->zero_point, output->multiplier, output->shift); + } + } + else if(input1->dim[axis] == channel && size1 == input1->dim[axis]){ + for(int n = 0; n < input0->dim[0]; n++){ + for(int h = 0; h < input0->dim[1]; h++){ + for(int w = 0; w < input0->dim[2]; w++){ + for(int c = 0; c < input0->dim[3]; c++){ + + if (params->layout == CSINN_NHWC){channel = c;} + else if (params->layout == CSINN_NCHW){channel = h;} + + float input1_val = + csi_dequantize_u8_to_f32(input1_data[channel], input1->zero_point, input1->multiplier, input1->shift); + + int index = csi_get_index(input0->dim, n, h, w, c); + float input0_val = + csi_dequantize_u8_to_f32(input0_data[index], input0->zero_point, input0->multiplier, input0->shift); + float res = input0_val + input1_val; + output_data[index] = csi_quantize_f32_to_u8(res, output->zero_point, output->multiplier, output->shift); + } + } + } + } + } + return CSINN_TRUE; +} diff --git a/source/c906_opt/broadcast_to.c b/source/c906_opt/broadcast_to.c new file mode 100644 index 00000000..c155788e --- /dev/null +++ b/source/c906_opt/broadcast_to.c @@ -0,0 +1,89 @@ +/* + * Copyright (C) 2016-2020 C-SKY Limited. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "csi_nn.h" +#include "csi_utils.h" + + +int csi_broadcast_to_f32_c906(struct csi_tensor *input, + struct csi_tensor *output, + struct broadcast_to_params *params) +{ + float *input_data = (float *)input->data; + float *output_data = (float *)output->data; + int size0 = 1; + for(int i=0; i < input->dim_count; i++) { + size0 = size0 * input->dim[i]; + } + + int size1 = 1; + for(int i=0; i < params->shape_count - input->dim_count; i++) { + size1 = size1 * params->shape[i]; + } + + asm volatile( + "outer_loop:\n\t" + "mv a1, %3\n\t" + "memcpy_loop:\n\t" + "vsetvli t0, a1, e32, m2\n\t" + "vlw.v v4, (%2)\n\t" + "slli t1, t0, 2\n\t" + "add %2, %2, t1\n\t" + "sub a1, a1, t0\n\t" + "vsw.v v4, (%0)\n\t" + "add %0, %0, t1\n\t" + "bnez a1, memcpy_loop\n\t" + "addi %4, %4, -1\n\t" + "bnez %4, outer_loop\n\t" + + :"=r"(output_data) // %0 + :"0"(output_data), // %1 + "r"(input_data), // %2 + "r"(size0), // %3 + "r"(size1) // %4 + : "a1", "t0", "t1", "v4", "v5" + ); + + // for(int i=0; idata; + uint8_t *output_data = (uint8_t *)output->data; + int size0 = 1; + for(int i=0; idim_count; i++) { + size0 = size0 * input->dim[i]; + } + + int size1 = 1; + for(int i=0; i < params->shape_count - input->dim_count; i++) { + size1 = size1 * params->shape[i]; + } + for(int i=0; i + +int csi_clip_f32_c906(struct csi_tensor *input, + struct csi_tensor *output, + struct clip_params *params) +{ + float *input_data = (float *)input->data; + float *output_data = (float *)output->data; + int size = 1; + for (int i = 0; i < input->dim_count; i++) { + size = size * input->dim[i]; + } + float min_value = params->min_value; + float max_value = params->max_value; + + asm volatile( + "loop:\n\t" + "vsetvli t0, %3, e32, m2\n\t" + "vlw.v v2, (%2)\n\t" + "sub %3, %3, t0\n\t" + "slli t0, t0, 2\n\t" + "add %2, %2, t0\n\t" + "vfmax.vf v2, v2, %4\n\t" // v2[i] = min(v2[i], min_value) + 
"vfmin.vf v2, v2, %5\n\t" // v2[i] = max(v2[i], max_value) + "vsw.v v2, (%0)\n\t" + "add %0, %0, t0\n\t" + "bnez %3, loop\n\t" + + :"=r"(output_data) // %0 + :"0"(output_data), // %1 + "r"(input_data), // %2 + "r"(size), // %3 + "f"(min_value), // %4 + "f"(max_value) // %5 + : "v0", "v2", "v3", "t0", "t1", "t2", "t3" + ); + + // for (int i = 0; i < size; i++) { + // if(input_data[i] < params->min_value) { + // output_data[i] = params->min_value; + // } else if(input_data[i] > params->max_value) { + // output_data[i] = params->max_value; + // } else { + // output_data[i] = input_data[i]; + // } + // } + return CSINN_TRUE; +} + +int csi_clip_u8_c906(struct csi_tensor *input, + struct csi_tensor *output, + struct clip_params *params) +{ + uint8_t *input_data = (uint8_t *)input->data; + uint8_t *output_data = (uint8_t *)output->data; + int size = 1; + for (int i = 0; i < input->dim_count; i++) { + size = size * input->dim[i]; + } + + for (int i = 0; i < size; i++) { + float input_val = csi_dequantize_u8_to_f32(input_data[i], input->zero_point, input->multiplier, + input->shift); + float res = 0.0f; + if(input_val < params->min_value) { + res = params->min_value; + } else if(input_val > params->max_value) { + res = params->max_value; + } else { + res = output_data[i]; + } + output_data[i] = csi_quantize_f32_to_u8(res, output->zero_point, output->multiplier, output->shift); + } + return CSINN_TRUE; +} diff --git a/source/c906_opt/fullyconnected.c b/source/c906_opt/fullyconnected.c new file mode 100644 index 00000000..e6f5f375 --- /dev/null +++ b/source/c906_opt/fullyconnected.c @@ -0,0 +1,143 @@ +/* + * Copyright (C) 2016-2020 C-SKY Limited. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "csi_nn.h" +#include "csi_utils.h" + +int csi_fullyconnected_f32_c906(struct csi_tensor *input, + struct csi_tensor *output, + struct csi_tensor *weights, + struct csi_tensor *bias, + struct fc_params *params) +{ + float *input_data = input->data; + float *output_data = output->data; + float *weights_data = weights->data; + float *bias_data = bias->data; + const int output_dims_count = output->dim_count; + const int weights_dims_count = weights->dim_count; + const int batches = output->dim[0]; + const int output_depth = weights->dim[weights_dims_count - 2]; // output_nodes + const int accum_depth = weights->dim[weights_dims_count - 1]; // input_nodes + + float zero = 0.0f; + asm volatile( + "mv a0, %5\n\t" + "loop3:\n\t" + "mv a1, %6\n\t" + "loop2:\n\t" + "mv a2, %7\n\t" + "vfmv.s.f v8, %8\n\t" + "loop1:\n\t" + "vsetvli t0, a2, e32, m1\n\t" + "vlw.v v2, (%2)\n\t" // load input_data + "sub a2, a2, t0\n\t" + "slli t0, t0, 2\n\t" + "add %2, %2, t0\n\t" // bump input_data pointer + "vlw.v v4, (%3)\n\t" // load weight_data + "add %3, %3, t0\n\t" // bump weight_data pointer + "vfsub.vv v6, v6, v6\n\t" // clear v6 + "vfmacc.vv v6, v2, v4\n\t" + "vfredsum.vs v8, v6, v8\n\t" // v8[0] = v8[0] + sum(v6[0..i]) + + "bnez a2, loop1\n\t" + + "flw ft0, 0(%4)\n\t" // load bias_data + "addi %4, %4, 4\n\t" // bump bias_data pointer + "vfmv.f.s ft1, v8\n\t" + "fadd.s ft2, ft1, ft0\n\t" + "fsw ft2, 0(%0)\n\t" // store output_data + "addi %0, %0, 4\n\t" // bump output_data pointer + + "slli a3, %7, 2\n\t" + "sub %2, %2, a3\n\t" + "addi a1, a1, -1\n\t" + "bnez 
a1, loop2\n\t" + + "add %2, %2, a3\n\t" + "mul t1, %6, %7\n\t" + "slli t1, t1, 2\n\t" + "sub %3, %3, t1\n\t" // finish all output_nodes, jump weights_data pointer + "slli t2, %6, 2\n\t" + "sub %4, %4, t2\n\t" // finish all output_nodes, jump bias_data pointer + + "addi a0, a0, -1\n\t" + "bnez a0, loop3\n\t" + + :"=r"(output_data) // %0 + :"0"(output_data), // %1 + "r"(input_data), // %2 + "r"(weights_data), // %3 + "r"(bias_data), // %4 + "r"(batches), // %5 + "r"(output_depth), // %6 + "r"(accum_depth), // %7 + "f"(zero) // %8 + : "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "a0", "a1", "a2", "a3", "t0", "t1", "t2", "ft0", "ft1", "ft2" + ); + + // for (int b = 0; b < batches; ++b) { + // for (int out_c = 0; out_c < output_depth; ++out_c) { + // float total = 0.f; + // for (int d = 0; d < accum_depth; ++d) { + // total += input_data[b * accum_depth + d] * weights_data[out_c * accum_depth + d]; + // } + // float bias_value = 0.0f; + // if (bias_data != NULL) { + // bias_value = bias_data[out_c]; + // } + // output_data[out_c + output_depth * b] = total + bias_value; + // } + // } + return CSINN_TRUE; +} + +int csi_fullyconnected_u8_c906(struct csi_tensor *input, + struct csi_tensor *output, + struct csi_tensor *weights, + struct csi_tensor *bias, + struct fc_params *params) +{ + uint8_t *input_data = input->data; + uint8_t *output_data = output->data; + uint8_t *weights_data = weights->data; + int32_t *bias_data = bias->data; + const int output_dims_count = output->dim_count; + const int weights_dims_count = weights->dim_count; + const int batches = output->dim[0]; + const int output_depth = weights->dim[weights_dims_count - 2]; + const int accum_depth = weights->dim[weights_dims_count - 1]; + for (int b = 0; b < batches; ++b) { + #pragma omp parallel for num_threads(8) + for (int out_c = 0; out_c < output_depth; ++out_c) { + int32_t acc = 0; + for (int d = 0; d < accum_depth; ++d) { + int32_t input_val = input_data[b * accum_depth + d]; + int32_t filter_val = 
weights_data[out_c * accum_depth + d]; + acc += (filter_val + weights->zero_point) * (input_val + input->zero_point); + } + if (bias_data != NULL) { + acc += bias_data[out_c]; + } + + output_data[out_c + output_depth * b] = + csi_quantize_u8(acc, output->zero_point, output->multiplier, output->shift); + } + } + return CSINN_TRUE; +} diff --git a/source/c906_opt/leaky_relu.c b/source/c906_opt/leaky_relu.c new file mode 100644 index 00000000..f04599c3 --- /dev/null +++ b/source/c906_opt/leaky_relu.c @@ -0,0 +1,83 @@ +/* + * Copyright (C) 2016-2020 C-SKY Limited. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "csi_nn.h" +#include "csi_utils.h" + +int csi_leaky_relu_f32_c906(struct csi_tensor *input, + struct csi_tensor *output, + struct relu_params *params) +{ + float *input_data = (float *)input->data; + float *output_data = (float *)output->data; + int size = 1; + for (int i = 0; i < input->dim_count; i++) { + size = size * input->dim[i]; + } + float alpha = params->n; + float gata = 0.0f; + asm volatile( + "loop:\n\t" + "vsetvli t0, %3, e32, m1\n\t" + "vlw.v v2, (%2)\n\t" + "sub %3, %3, t0\n\t" + "slli t0, t0, 2\n\t" + "add %2, %2, t0\n\t" + "vmflt.vf v0, v2, %4\n\t" + "vfmul.vf v2, v2, %5, v0.t\n\t" + "vsw.v v2, (%0)\n\t" + "add %0, %0, t0\n\t" + "bnez %3, loop\n\t" + + :"=r"(output_data) // %0 + :"0"(output_data), // %1 + "r"(input_data), // %2 + "r"(size), // %3 + "f"(gata), // %4 + "f"(alpha) // %5 + : "v0", "v2", "v3", "v4", "v5", "t0" + ); + + // for (int i = 0; i < size; i++) { + // float val = input_data[i]; + // output_data[i] = val > 0 ? val : val * params->n; + // } + return CSINN_TRUE; +} + +int csi_leaky_relu_u8_c906(struct csi_tensor *input, + struct csi_tensor *output, + struct relu_params *params) +{ + uint8_t *input_data = input->data; + uint8_t *output_data = output->data; + int size = 1; + for (int i = 0; i < input->dim_count; i++) { + size = size * input->dim[i]; + } + + float alpha_f = csi_dequantize_u8_to_f32(1, 0, params->n_multiplier, params->n_shift); + for (int i = 0; i < size; i++) { + float input_val = csi_dequantize_u8_to_f32(input_data[i], input->zero_point, input->multiplier, + input->shift); + float res = input_val > 0 ? input_val : input_val * alpha_f; + + output_data[i] = csi_quantize_f32_to_u8(res, output->zero_point, output->multiplier, output->shift); + } + return CSINN_TRUE; +} diff --git a/source/c906_opt/prelu.c b/source/c906_opt/prelu.c new file mode 100644 index 00000000..57f5c437 --- /dev/null +++ b/source/c906_opt/prelu.c @@ -0,0 +1,261 @@ +/* + * Copyright (C) 2016-2020 C-SKY Limited. All rights reserved. 
+ * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "csi_nn.h" +#include "csi_utils.h" + +static int csi_prelu_nhwc_f32(struct csi_tensor *input, + struct csi_tensor *alpha, + struct csi_tensor *output, + struct prelu_params *params) +{ + float *input_data = (float *)input->data; + float *output_data = (float *)output->data; + float *alpha_data = (float *)alpha->data; + int outer_size = output->dim[0] * output->dim[1] * output->dim[2]; + int inner_size = output->dim[3]; + float gata = 0.0f; + asm volatile( + "mv a0, %0\n\t" // a0 = outpt_data + "mv a2, %2\n\t" // a2 = input_data + "outer_loop:\n\t" + "mv t1, %5\n\t" // t1 = inner_size + "mv a1, %3\n\t" // a1 = alpha_data + "inner_loop:\n\t" + "vsetvli t0, t1, e32, m1\n\t" + "vlw.v v2, (a2)\n\t" // load input_data to v2,v3 + "sub t1, t1, t0\n\t" + "slli t0, t0, 2\n\t" + "add a2, a2, t0\n\t" + "vlw.v v4, (a1)\n\t" // load alpha_data to v4,v5 + "add a1, a1, t0\n\t" + "vmflt.vf v0, v2, %6\n\t" + "vfmul.vv v2, v2, v4, v0.t\n\t" + "vsw.v v2, (a0)\n\t" + "add a0, a0, t0\n\t" + "bnez t1, inner_loop\n\t" // finish all channel + + "addi %4, %4, -1\n\t" + "bnez %4, outer_loop\n\t" + + :"=r"(output_data) // %0 + :"0"(output_data), // %1 + "r"(input_data), // %2 + "r"(alpha_data), // %3 + "r"(outer_size), // %4 + "r"(inner_size), // %5 + "f"(gata) // %6 + : "v0", "v2", "v3", "v4", "v5", "t0", "t1", "a0", "a1", "a2" + ); + + // for (int b = 0; b < output->dim[0]; ++b) { + // 
for (int y = 0; y < output->dim[1]; ++y) { + // for (int x = 0; x < output->dim[2]; ++x) { + // for (int c = 0; c < output->dim[3]; ++c) { + // int output_index = csi_get_index(output->dim, b, y, x, c); + // int input_index = csi_get_index(input->dim, b, y, x, c); + // float input_value = input_data[input_index]; + // if (input_value >= 0) { + // output_data[output_index] = input_data[input_index]; + // } else { + // output_data[output_index] = input_value * alpha_data[c]; + // } + // } + // } + // } + // } + return CSINN_TRUE; +} + +static int csi_prelu_nhwc_u8(struct csi_tensor *input, + struct csi_tensor *alpha, + struct csi_tensor *output, + struct prelu_params *params) +{ + int num_elements = 1; + for (int i = 0; i < output->dim_count; i++) { + num_elements *= output->dim[i]; + } + + uint8_t *input_data = input->data; + uint8_t *output_data = output->data; + uint8_t *alpha_data = alpha->data; + const int32_t input_offset = input->zero_point; + const int32_t alpha_offset = alpha->zero_point; + + for (int b = 0; b < output->dim[0]; ++b) { + for (int y = 0; y < output->dim[1]; ++y) { + for (int x = 0; x < output->dim[2]; ++x) { + for (int c = 0; c < output->dim[3]; ++c) { + int index = csi_get_index(input->dim, b, y, x, c); + const float input_value = csi_dequantize_u8_to_f32(input_data[index], input->zero_point, input->multiplier, input->shift); + if (input_value >= 0) { + output_data[index] = csi_quantize_f32_to_u8(input_value, + output->zero_point, output->multiplier, output->shift); + } else { + float alpha_val = csi_dequantize_u8_to_f32(alpha_data[c], alpha->zero_point, alpha->multiplier, alpha->shift); + output_data[index] = csi_quantize_f32_to_u8(input_value * alpha_val, + output->zero_point, output->multiplier, output->shift); + } + } + } + } + } + return CSINN_TRUE; +} + +static int csi_prelu_nchw_f32(struct csi_tensor *input, + struct csi_tensor *alpha, + struct csi_tensor *output, + struct prelu_params *params) +{ + float *input_data = (float 
*)input->data; + float *output_data = (float *)output->data; + float *alpha_data = (float *)alpha->data; + + int batch = output->dim[0]; + int channel = output->dim[1]; + int size = output->dim[2] * output->dim[3]; + float gata = 0.0f; + + asm volatile( + "mv t3, %4\n\t" // t3 = batch + "mv a2, %2\n\t" // a2 = input_data + "mv a0, %0\n\t" // a0 = output_data + "loop3:\n\t" + "mv t2, %5\n\t" // t2 = channel + "mv a1, %3\n\t" // a1 = alpha_data + "loop2:\n\t" + "mv t1, %6\n\t" // t1 = size; + "flw ft0, (a1)\n\t" + "addi a1, a1, 4\n\t" + "loop1:\n\t" + "vsetvli t0, t1, e32, m2\n\t" + "vlw.v v2, (a2)\n\t" + "sub t1, t1, t0\n\t" + "slli t0, t0, 2\n\t" + "add a2, a2, t0\n\t" + "vmflt.vf v0, v2, %7\n\t" + "vfmul.vf v2, v2, ft0, v0.t\n\t" + "vsw.v v2, (a0)\n\t" + "add a0, a0, t0\n\t" + "bnez t1, loop1\n\t" + + "addi t2, t2, -1\n\t" + "bnez t2, loop2\n\t" + + "addi t3, t3, -1\n\t" + "bnez t3, loop3\n\t" + + :"=r"(output_data) // %0 + :"0"(output_data), // %1 + "r"(input_data), // %2 + "r"(alpha_data), // %3 + "r"(batch), // %4 + "r"(channel), // %5 + "r"(size), // %6 + "f"(gata) // %7 + : "v0", "v2", "v3", "t0", "t1", "t2", "t3", "a0", "a1", "a2", "ft0" + ); + + // for (int b = 0; b < output->dim[0]; ++b) { + // for (int y = 0; y < output->dim[1]; ++y) { + // for (int x = 0; x < output->dim[2]; ++x) { + // for (int c = 0; c < output->dim[3]; ++c) { + // int output_index = csi_get_index(output->dim, b, y, x, c); + // int input_index = csi_get_index(input->dim, b, y, x, c); + // const int32_t input_value = input->offset + input_data[input_index]; + // if (input_value >= 0) { + // output_data[output_index] = input_data[input_index]; + // } else { + // output_data[output_index] = input_value * alpha_data[c]; + // } + // } + // } + // } + // } + return CSINN_TRUE; +} + +static int csi_prelu_nchw_u8(struct csi_tensor *o_input, + struct csi_tensor *alpha, + struct csi_tensor *o_output, + struct prelu_params *params) +{ + struct csi_tensor* input = csi_nchw_to_nhwc_8(o_input); + 
struct csi_tensor* output = csi_nchw_to_nhwc_8(o_output); + int num_elements = 1; + for (int i = 0; i < output->dim_count; i++) { + num_elements *= output->dim[i]; + } + + uint8_t *input_data = input->data; + uint8_t *output_data = output->data; + uint8_t *alpha_data = alpha->data; + const int32_t input_offset = input->zero_point; + const int32_t alpha_offset = alpha->zero_point; + + for (int b = 0; b < output->dim[0]; ++b) { + for (int y = 0; y < output->dim[1]; ++y) { + for (int x = 0; x < output->dim[2]; ++x) { + for (int c = 0; c < output->dim[3]; ++c) { + int index = csi_get_index(input->dim, b, y, x, c); + const float input_value = csi_dequantize_u8_to_f32(input_data[index], input->zero_point, input->multiplier, input->shift); + if (input_value >= 0) { + output_data[index] = csi_quantize_f32_to_u8(input_value, + output->zero_point, output->multiplier, output->shift); + } else { + float alpha_val = csi_dequantize_u8_to_f32(alpha_data[c], alpha->zero_point, alpha->multiplier, alpha->shift); + output_data[index] = csi_quantize_f32_to_u8(input_value * alpha_val, + output->zero_point, output->multiplier, output->shift); + } + } + } + } + } + csi_nhwc_to_nchw_8(o_output, output); + return CSINN_TRUE; +} + +int csi_prelu_f32_c906(struct csi_tensor *input, + struct csi_tensor *alpha, + struct csi_tensor *output, + struct prelu_params *params) +{ + if (params->layout == CSINN_NCHW) { + return csi_prelu_nchw_f32(input, alpha, output, params); /* propagate kernel status; function previously fell off the end */ + } else if (params->layout == CSINN_NHWC) { + return csi_prelu_nhwc_f32(input, alpha, output, params); + } else { + return CSINN_UNSUPPORT_LAYOUT; + } +} + +int csi_prelu_u8_c906(struct csi_tensor *input, + struct csi_tensor *alpha, + struct csi_tensor *output, + struct prelu_params *params) +{ + if (params->layout == CSINN_NCHW) { + return csi_prelu_nchw_u8(input, alpha, output, params); /* was csi_prelu_nhwc_u8: layouts were swapped vs. the f32 dispatcher */ + } else if (params->layout == CSINN_NHWC) { + return csi_prelu_nhwc_u8(input, alpha, output, params); /* was csi_prelu_nchw_u8 */ + } else { + return CSINN_UNSUPPORT_LAYOUT; + } +} diff --git
a/source/c906_opt/relu.c b/source/c906_opt/relu.c new file mode 100644 index 00000000..49de16db --- /dev/null +++ b/source/c906_opt/relu.c @@ -0,0 +1,85 @@ +/* + * Copyright (C) 2016-2020 C-SKY Limited. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "csi_nn.h" +#include "csi_utils.h" +#include + +static float relu(float x){ + return x > 0 ? x : 0; +} + +int csi_relu_f32_c906(struct csi_tensor *input, + struct csi_tensor *output, + struct relu_params *params) +{ + float *input_data = input->data; + float *output_data = output->data; + int size = 1; + for (int i = 0; i < input->dim_count; i++) { + size = size * input->dim[i]; + } + + float gata = 0.0f; + asm volatile( + "loop:\n\t" + "vsetvli t0, %3, e32, m2\n\t" + "vlw.v v2, (%2)\n\t" + "sub %3, %3, t0\n\t" + "slli t0, t0, 2\n\t" + "add %2, %2, t0\n\t" + "vfmax.vf v2, v2, %4\n\t" + "vsw.v v2, (%0)\n\t" + "add %0, %0, t0\n\t" + "bnez %3, loop\n\t" + + :"=r"(output_data) // %0 + :"0"(output_data), // %1 + "r"(input_data), // %2 + "r"(size), // %3 + "f"(gata) // %4 + : "v0", "v2", "v3", "v4", "v5", "t0" + ); + + // for (int i = 0; i < size; i++) { + // output_data[i] = relu(input_data[i]); + // } + return CSINN_TRUE; +} + +int csi_relu_u8_c906(struct csi_tensor *input, + struct csi_tensor *output, + struct relu_params *params) +{ + uint8_t *input_data = input->data; + uint8_t *output_data = output->data; + int size = 1; + for (int i = 0; i 
< input->dim_count; i++) { + size = size * input->dim[i]; + } + + #pragma omp parallel for num_threads(8) + for (int i = 0; i < size; i++) { + float input0_val = csi_dequantize_u8_to_f32(input_data[i], input->zero_point, input->multiplier, + input->shift); + float res = relu(input0_val); + + output_data[i] = csi_quantize_f32_to_u8(res, output->zero_point, output->multiplier, output->shift); + } + return CSINN_TRUE; +} diff --git a/source/c906_opt/relu1.c b/source/c906_opt/relu1.c new file mode 100644 index 00000000..844840dd --- /dev/null +++ b/source/c906_opt/relu1.c @@ -0,0 +1,88 @@ +/* + * Copyright (C) 2016-2020 C-SKY Limited. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "csi_nn.h" +#include "csi_utils.h" +#include + +static float relu1(float x){ + return fmin(x > 0 ? 
x : 0, 1); +} + +int csi_relu1_f32_c906(struct csi_tensor *input, + struct csi_tensor *output, + struct relu_params *params) +{ + float *input_data = (float *)input->data; + float *output_data = (float *)output->data; + int size = 1; + for (int i = 0; i < input->dim_count; i++) { + size = size * input->dim[i]; + } + + float gata = 0.0f; + float gata1 = 1.0f; + asm volatile( + "loop:\n\t" + "vsetvli t0, %3, e32, m2\n\t" + "vlw.v v2, (%2)\n\t" + "sub %3, %3, t0\n\t" + "slli t0, t0, 2\n\t" + "add %2, %2, t0\n\t" + "vfmax.vf v2, v2, %4\n\t" + "vfmin.vf v2, v2, %5\n\t" + "vsw.v v2, (%0)\n\t" + "add %0, %0, t0\n\t" + "bnez %3, loop\n\t" + + :"=r"(output_data) // %0 + :"0"(output_data), // %1 + "r"(input_data), // %2 + "r"(size), // %3 + "f"(gata), // %4 + "f"(gata1) // %5 + : "v0", "v2", "v3", "v4", "v5", "t0" + ); + + // for (int i = 0; i < size; i++) { + // output_data[i] = relu1(input_data[i]); + // } + return CSINN_TRUE; +} + +int csi_relu1_u8_c906(struct csi_tensor *input, + struct csi_tensor *output, + struct relu_params *params) +{ + uint8_t *input_data = input->data; + uint8_t *output_data = output->data; + int size = 1; + for (int i = 0; i < input->dim_count; i++) { + size = size * input->dim[i]; + } + + for (int i = 0; i < size; i++) { + float input0_val = csi_dequantize_u8_to_f32(input_data[i], input->zero_point, input->multiplier, + input->shift); + float res = relu1(input0_val); + + output_data[i] = csi_quantize_f32_to_u8(res, output->zero_point, output->multiplier, output->shift); + } + return CSINN_TRUE; +} + diff --git a/source/c906_opt/relu6.c b/source/c906_opt/relu6.c new file mode 100644 index 00000000..51479640 --- /dev/null +++ b/source/c906_opt/relu6.c @@ -0,0 +1,88 @@ +/* + * Copyright (C) 2016-2020 C-SKY Limited. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "csi_nn.h" +#include "csi_utils.h" +#include + +static float relu6(float x){ + return fmin(x > 0 ? x : 0, 6); +} + +int csi_relu6_f32_c906(struct csi_tensor *input, + struct csi_tensor *output, + struct relu_params *params) +{ + float *input_data = (float *)input->data; + float *output_data = (float *)output->data; + int size = 1; + for (int i = 0; i < input->dim_count; i++) { + size = size * input->dim[i]; + } + + float gata = 0.0f; + float gata1 = 6.0f; + asm volatile( + "loop:\n\t" + "vsetvli t0, %3, e32, m1\n\t" + "vlw.v v2, (%2)\n\t" + "sub %3, %3, t0\n\t" + "slli t0, t0, 2\n\t" + "add %2, %2, t0\n\t" + "vfmax.vf v2, v2, %4\n\t" + "vfmin.vf v2, v2, %5\n\t" + "vsw.v v2, (%0)\n\t" + "add %0, %0, t0\n\t" + "bnez %3, loop\n\t" + + :"=r"(output_data) // %0 + :"0"(output_data), // %1 + "r"(input_data), // %2 + "r"(size), // %3 + "f"(gata), // %4 + "f"(gata1) // %5 + : "v0", "v2", "v3", "v4", "v5", "t0" + ); + + // for (int i = 0; i < size; i++) { + // output_data[i] = relu6(input_data[i]); + // } + return CSINN_TRUE; +} + +int csi_relu6_u8_c906(struct csi_tensor *input, + struct csi_tensor *output, + struct relu_params *params) +{ + uint8_t *input_data = input->data; + uint8_t *output_data = output->data; + int size = 1; + for (int i = 0; i < input->dim_count; i++) { + size = size * input->dim[i]; + } + + for (int i = 0; i < size; i++) { + float input0_val = csi_dequantize_u8_to_f32(input_data[i], input->zero_point, input->multiplier, + input->shift); + float res = relu6(input0_val); + + output_data[i] = csi_quantize_f32_to_u8(res, 
output->zero_point, output->multiplier, output->shift); + } + return CSINN_TRUE; +} + diff --git a/source/c906_opt/setup.c b/source/c906_opt/setup.c new file mode 100644 index 00000000..ef3af089 --- /dev/null +++ b/source/c906_opt/setup.c @@ -0,0 +1,212 @@ +/* + * Copyright (C) 2016-2020 C-SKY Limited. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "csi_nn.h" +#include "csi_utils.h" +#include "csi_internal_c906.h" + + +void* csi_bc_map_table_c906[CSINN_OP_SIZE][2] = { + {csi_abs_u8_c906, csi_abs_f32_c906}, /* CSINN_OP_ABS */ + {NULL, NULL}, /* CSINN_OP_ACOS */ + {NULL, NULL}, /* CSINN_OP_ACOSH */ + {csi_add_u8_c906, csi_add_f32_c906}, /* CSINN_OP_ADD */ + {NULL, NULL}, /* CSINN_OP_ALL */ + {NULL, NULL}, /* CSINN_OP_AND */ + {NULL, NULL}, /* CSINN_OP_ANY */ + {NULL, NULL}, /* CSINN_OP_ARANGE */ + {NULL, NULL}, /* CSINN_OP_ARGMAX */ + {NULL, NULL}, /* CSINN_OP_ARGMIN */ + {NULL, NULL}, /* CSINN_OP_ASIN */ + {NULL, NULL}, /* CSINN_OP_ASINH */ + {NULL, NULL}, /* CSINN_OP_ATAN */ + {NULL, NULL}, /* CSINN_OP_ATANH */ + {NULL, NULL}, /* CSINN_OP_AVGPOOL2D */ + {NULL, NULL}, /* CSINN_OP_AVGPOOL3D */ + {NULL, NULL}, /* CSINN_OP_BN */ + {NULL, NULL}, /* CSINN_OP_BATCH_TO_SPACE */ + {csi_broadcast_to_u8_c906, csi_broadcast_to_f32_c906}, /* CSINN_OP_BROADCOST */ + {NULL, NULL}, /* CSINN_OP_CEIL */ + {csi_clip_u8_c906, csi_clip_f32_c906}, /* CSINN_OP_CLIP */ + {NULL, NULL}, /* CSINN_OP_COL2IM */ + 
{NULL, NULL}, /* CSINN_OP_CONCAT */ + {NULL, NULL}, /* CSINN_OP_CONV2D */ + {NULL, NULL}, /* CSINN_OP_CONV2D_RELU */ + {NULL, NULL}, /* CSINN_OP_CONV2D_RELU6 */ + {NULL, NULL}, /* CSINN_OP_CONV2D_CHANNEL */ + {NULL, NULL}, /* CSINN_OP_CONV2D_CHANNEL_RELU */ + {NULL, NULL}, /* CSINN_OP_CONV2D_CHANNEL_RELU6 */ + {NULL, NULL}, /* CSINN_OP_DEPTHWISE_CONV2D */ + {NULL, NULL}, /* CSINN_OP_DEPTHWISE_CONV2D_RELU */ + {NULL, NULL}, /* CSINN_OP_DEPTHWISE_CONV2D_RELU6 */ + {NULL, NULL}, /* CSINN_OP_DEPTHWISE_CONV2D_CHANNEL */ + {NULL, NULL}, /* CSINN_OP_DEPTHWISE_CONV2D_CHANNEL_RELU */ + {NULL, NULL}, /* CSINN_OP_DEPTHWISE_CONV2D_CHANNEL_RELU6 */ + {NULL, NULL}, /* CSINN_OP_GROUP_CONV2D */ + {NULL, NULL}, /* CSINN_OP_GROUP_CONV2D_RELU */ + {NULL, NULL}, /* CSINN_OP_GROUP_CONV2D_CHANNEL */ + {NULL, NULL}, /* CSINN_OP_GROUP_CONV2D_CHANNEL_RELU */ + {NULL, NULL}, /* CSINN_OP_CONV3D */ + {NULL, NULL}, /* CSINN_OP_COS */ + {NULL, NULL}, /* CSINN_OP_COSH */ + {NULL, NULL}, /* CSINN_OP_CUMPROD */ + {NULL, NULL}, /* CSINN_OP_CUMSUM */ + {NULL, NULL}, /* CSINN_OP_DECONV2D */ + {NULL, NULL}, /* CSINN_OP_DEPTHWISE_DECONV2D */ + {NULL, NULL}, /* CSINN_OP_DECONV3D */ + {NULL, NULL}, /* CSINN_OP_DEPTH_TO_SPACE */ + {NULL, NULL}, /* CSINN_OP_DIV */ + {NULL, NULL}, /* CSINN_OP_ELU */ + {NULL, NULL}, /* CSINN_OP_EQUANL */ + {NULL, NULL}, /* CSINN_OP_ERF */ + {NULL, NULL}, /* CSINN_OP_EXP */ + {NULL, NULL}, /* CSINN_OP_EXPAND_DIMS */ + {NULL, NULL}, /* CSINN_OP_EXPM1 */ + {NULL, NULL}, /* CSINN_OP_FLATTEN */ + {NULL, NULL}, /* CSINN_OP_FLOOR_DIVIDE */ + {NULL, NULL}, /* CSINN_OP_FLOOR_MOD */ + {NULL, NULL}, /* CSINN_OP_FLOOR */ + {csi_fullyconnected_u8_c906, csi_fullyconnected_f32_c906}, /* CSINN_OP_FULLYCONNECTED */ + {NULL, NULL}, /* CSINN_OP_GATHER_ND */ + {NULL, NULL}, /* CSINN_OP_GATHER */ + {NULL, NULL}, /* CSINN_OP_GLOBAL_AVGPOOL2D */ + {NULL, NULL}, /* CSINN_OP_GLOBAL_MAXPOOL2D */ + {NULL, NULL}, /* CSINN_OP_GREATHER_EQUAL */ + {NULL, NULL}, /* CSINN_OP_GREATHER */ + {NULL, NULL}, /* 
CSINN_OP_HARD_SIGMOID */ + {NULL, NULL}, /* CSINN_OP_IM2COL */ + {NULL, NULL}, /* CSINN_OP_ISNAN */ + {NULL, NULL}, /* CSINN_OP_L2N */ + {NULL, NULL}, /* CSINN_OP_L2POOL2D */ + {csi_leaky_relu_u8_c906, csi_leaky_relu_f32_c906}, /* CSINN_OP_LEAKY_RELU */ + {NULL, NULL}, /* CSINN_OP_LESS_EQUAL */ + {NULL, NULL}, /* CSINN_OP_LESS */ + {NULL, NULL}, /* CSINN_OP_LOG_SOFTMAX */ + {NULL, NULL}, /* CSINN_OP_LOG */ + {NULL, NULL}, /* CSINN_OP_LOG1P */ + {NULL, NULL}, /* CSINN_OP_LOGICAL_AND */ + {NULL, NULL}, /* CSINN_OP_LOGICAL_NOT */ + {NULL, NULL}, /* CSINN_OP_LOGICAL_OR */ + {NULL, NULL}, /* CSINN_OP_LOGICAL_XOR */ + {NULL, NULL}, /* CSINN_OP_LRN */ + {NULL, NULL}, /* CSINN_OP_MATMUL */ + {NULL, NULL}, /* CSINN_OP_MAX */ + {NULL, NULL}, /* CSINN_OP_MAXINUM */ + {NULL, NULL}, /* CSINN_OP_MAXPOOL2D */ + {NULL, NULL}, /* CSINN_OP_MAXPOOL2D_LOCAT */ + {NULL, NULL}, /* CSINN_OP_MAXPOOL3D */ + {NULL, NULL}, /* CSINN_OP_MEAN */ + {NULL, NULL}, /* CSINN_OP_MEAN_STRIDE */ + {NULL, NULL}, /* CSINN_OP_MIN */ + {NULL, NULL}, /* CSINN_OP_MIN_STRIDE */ + {NULL, NULL}, /* CSINN_OP_MINIMUM */ + {NULL, NULL}, /* CSINN_OP_MOD */ + {NULL, NULL}, /* CSINN_OP_MUL */ + {NULL, NULL}, /* CSINN_OP_NDARRAY_SIZE */ + {NULL, NULL}, /* CSINN_OP_NEGATIIVE */ + {NULL, NULL}, /* CSINN_OP_NON_MAX_SUPPRESSION */ + {NULL, NULL}, /* CSINN_OP_NOT_EQUAL */ + {NULL, NULL}, /* CSINN_OP_NOT */ + {NULL, NULL}, /* CSINN_OP_ONE_HOT */ + {NULL, NULL}, /* CSINN_OP_OR */ + {NULL, NULL}, /* CSINN_OP_PAD */ + {NULL, NULL}, /* CSINN_OP_POWER */ + {csi_prelu_u8_c906, csi_prelu_f32_c906}, /* CSINN_OP_PRELU */ + {NULL, NULL}, /* CSINN_OP_PROD */ + {NULL, NULL}, /* CSINN_OP_PROPOSAL */ + {NULL, NULL}, /* CSINN_OP_PSROIPOOLING */ + {NULL, NULL}, /* CSINN_OP_REDUCE_LOGSUMEXP */ + {NULL, NULL}, /* CSINN_OP_REDUCE_MAX */ + {NULL, NULL}, /* CSINN_OP_REDUCE_MEAN */ + {NULL, NULL}, /* CSINN_OP_REDUCE_MIN */ + {NULL, NULL}, /* CSINN_OP_REDUCE_PROD */ + {NULL, NULL}, /* CSINN_OP_REDUCE_SUM */ + {csi_relu_u8_c906, 
csi_relu_f32_c906}, /* CSINN_OP_RELU */ + {csi_relu1_u8_c906, csi_relu1_f32_c906}, /* CSINN_OP_RELU1 */ + {csi_relu6_u8_c906, csi_relu6_f32_c906}, /* CSINN_OP_RELU6 */ + {NULL, NULL}, /* CSINN_OP_RELUN */ + {NULL, NULL}, /* CSINN_OP_REORG */ + {NULL, NULL}, /* CSINN_OP_RESHAPE */ + {NULL, NULL}, /* CSINN_OP_RESIZE */ + {NULL, NULL}, /* CSINN_OP_REVERSE */ + {NULL, NULL}, /* CSINN_OP_ROIALIGN */ + {NULL, NULL}, /* CSINN_OP_ROIPOOL */ + {NULL, NULL}, /* CSINN_OP_ROUND */ + {NULL, NULL}, /* CSINN_OP_RSQRT */ + {NULL, NULL}, /* CSINN_OP_SEGMENT_MAX */ + {NULL, NULL}, /* CSINN_OP_UNSORTED_SEGMENT_MAX */ + {NULL, NULL}, /* CSINN_OP_SEGMENT_MEAN */ + {NULL, NULL}, /* CSINN_OP_UNSORTED_SEGMENT_MEAN */ + {NULL, NULL}, /* CSINN_OP_SEGMENT_MIN */ + {NULL, NULL}, /* CSINN_OP_UNSORTED_SEGMENT_MIN */ + {NULL, NULL}, /* CSINN_OP_SEGMENT_PROD */ + {NULL, NULL}, /* CSINN_OP_UNSORTED_SEGMENT_PROD */ + {NULL, NULL}, /* CSINN_OP_SEGMENT_SUM */ + {NULL, NULL}, /* CSINN_OP_UNSORTED_SEGMENT_SUM */ + {NULL, NULL}, /* CSINN_OP_SELECT */ + {NULL, NULL}, /* CSINN_OP_SEQUENCE_MASK */ + {NULL, NULL}, /* CSINN_OP_SHAPE */ + {NULL, NULL}, /* CSINN_OP_SHUFFLE_CHANNEL */ + {NULL, NULL}, /* CSINN_OP_SIGMOID */ + {NULL, NULL}, /* CSINN_OP_SIGN */ + {NULL, NULL}, /* CSINN_OP_SIN */ + {NULL, NULL}, /* CSINN_OP_SINH */ + {NULL, NULL}, /* CSINN_OP_SLICE */ + {NULL, NULL}, /* CSINN_OP_SOFTMAX */ + {NULL, NULL}, /* CSINN_OP_SOFTPLUS */ + {NULL, NULL}, /* CSINN_OP_SOFTRELU */ + {NULL, NULL}, /* CSINN_OP_SOFTSIGN */ + {NULL, NULL}, /* CSINN_OP_SPACE_TO_BATCH */ + {NULL, NULL}, /* CSINN_OP_SPACE_TO_DEPTH */ + {NULL, NULL}, /* CSINN_OP_SPLIT */ + {NULL, NULL}, /* CSINN_OP_SQRT */ + {NULL, NULL}, /* CSINN_OP_SQUARE */ + {NULL, NULL}, /* CSINN_OP_SQUEEZE */ + {NULL, NULL}, /* CSINN_OP_STACK */ + {NULL, NULL}, /* CSINN_OP_STRIDED_SLICE */ + {NULL, NULL}, /* CSINN_OP_SUB */ + {NULL, NULL}, /* CSINN_OP_SUM */ + {NULL, NULL}, /* CSINN_OP_TAN */ + {NULL, NULL}, /* CSINN_OP_TANH */ + {NULL, NULL}, /* 
CSINN_OP_THRESHOLD_RELU */ + {NULL, NULL}, /* CSINN_OP_TILE */ + {NULL, NULL}, /* CSINN_OP_TOPK */ + {NULL, NULL}, /* CSINN_OP_TRANSPOSE */ + {NULL, NULL}, /* CSINN_OP_TRUNC */ + {NULL, NULL}, /* CSINN_OP_UNPOOLING */ + {NULL, NULL}, /* CSINN_OP_UNSTACK */ + {NULL, NULL}, /* CSINN_OP_WHERE */ + {NULL, NULL}, /* CSINN_OP_XOR */ + {NULL, NULL}, /* CSINN_OP_YUV_RGB_SCALE */ +}; + +void *csi_bc_map_c906(int op, int dtype) +{ + int dt; + switch (dtype) { + case CSINN_DTYPE_UINT8: + dt = 0; + break; + case CSINN_DTYPE_FLOAT32: + dt = 1; + break; + default: + return NULL; + } + return csi_bc_map_table_c906[op][dt]; +} \ No newline at end of file diff --git a/source/openvx/abs.c b/source/openvx/abs.c index 99beb5eb..62e62791 100644 --- a/source/openvx/abs.c +++ b/source/openvx/abs.c @@ -27,9 +27,8 @@ int csi_ovx_abs(struct csi_tensor *input, vsi_nn_node_id_t node_id; vsi_nn_tensor_attr_t attr; vsi_nn_tensor_id_t output_id; - struct __target_data *td = input->t_private; - output->t_private = td; - vsi_nn_graph_t *graph = td->graph; + vsi_nn_graph_t *graph = csi_ovx_get_graph(input->sess); + output->sess = input->sess; uint32_t input_num = 1; uint32_t output_num = 1; node = vsi_nn_AddNode(graph, VSI_NN_OP_ABS, input_num, output_num, &node_id); diff --git a/source/openvx/add.c b/source/openvx/add.c index 894b89e7..1e7e833b 100644 --- a/source/openvx/add.c +++ b/source/openvx/add.c @@ -28,9 +28,8 @@ int csi_ovx_add(struct csi_tensor *input0, vsi_nn_node_id_t node_id; vsi_nn_tensor_attr_t attr; vsi_nn_tensor_id_t output_id; - struct __target_data *td = input0->t_private; - output->t_private = td; - vsi_nn_graph_t *graph = td->graph; + vsi_nn_graph_t *graph = csi_ovx_get_graph(input0->sess); + output->sess = input0->sess; uint32_t input_num = 2; uint32_t output_num = 1; node = vsi_nn_AddNode(graph, VSI_NN_OP_ADD, input_num, output_num, &node_id); diff --git a/source/openvx/and.c b/source/openvx/and.c index d5773493..b9f78aec 100644 --- a/source/openvx/and.c +++ 
b/source/openvx/and.c @@ -28,9 +28,8 @@ int csi_ovx_and(struct csi_tensor *input0, vsi_nn_node_id_t node_id; vsi_nn_tensor_attr_t attr; vsi_nn_tensor_id_t output_id; - struct __target_data *td = input0->t_private; - output->t_private = td; - vsi_nn_graph_t *graph = td->graph; + vsi_nn_graph_t *graph = csi_ovx_get_graph(input0->sess); + output->sess = input0->sess; uint32_t input_num = 2; uint32_t output_num = 1; node = vsi_nn_AddNode(graph, VSI_NN_OP_LOGICAL_OPS, input_num, output_num, &node_id); diff --git a/source/openvx/argmax.c b/source/openvx/argmax.c index 25fd3522..a4a3aa76 100644 --- a/source/openvx/argmax.c +++ b/source/openvx/argmax.c @@ -27,9 +27,8 @@ int csi_ovx_argmax(struct csi_tensor *input, vsi_nn_node_id_t node_id; vsi_nn_tensor_attr_t attr; vsi_nn_tensor_id_t output_id; - struct __target_data *td = input->t_private; - output->t_private = td; - vsi_nn_graph_t *graph = td->graph; + vsi_nn_graph_t *graph = csi_ovx_get_graph(input->sess); + output->sess = input->sess; uint32_t input_num = 1; uint32_t output_num = 1; node = vsi_nn_AddNode(graph, VSI_NN_OP_ARGMAX, input_num, output_num, &node_id); diff --git a/source/openvx/argmin.c b/source/openvx/argmin.c index 37411b87..9e5bf74b 100644 --- a/source/openvx/argmin.c +++ b/source/openvx/argmin.c @@ -27,9 +27,8 @@ int csi_ovx_argmin(struct csi_tensor *input, vsi_nn_node_id_t node_id; vsi_nn_tensor_attr_t attr; vsi_nn_tensor_id_t output_id; - struct __target_data *td = input->t_private; - output->t_private = td; - vsi_nn_graph_t *graph = td->graph; + vsi_nn_graph_t *graph = csi_ovx_get_graph(input->sess); + output->sess = input->sess; uint32_t input_num = 1; uint32_t output_num = 1; node = vsi_nn_AddNode(graph, VSI_NN_OP_ARGMIN, input_num, output_num, &node_id); diff --git a/source/openvx/averagepool.c b/source/openvx/averagepool.c index dd050ee0..772ed974 100644 --- a/source/openvx/averagepool.c +++ b/source/openvx/averagepool.c @@ -27,9 +27,8 @@ int csi_ovx_averagepool(struct csi_tensor *input, 
vsi_nn_node_id_t node_id; vsi_nn_tensor_attr_t attr; vsi_nn_tensor_id_t output_id; - struct __target_data *td = input->t_private; - output->t_private = td; - vsi_nn_graph_t *graph = td->graph; + vsi_nn_graph_t *graph = csi_ovx_get_graph(input->sess); + output->sess = input->sess; uint32_t input_num = 1; uint32_t output_num = 1; node = vsi_nn_AddNode(graph, VSI_NN_OP_POOL, input_num, output_num, &node_id); diff --git a/source/openvx/batch_normalization.c b/source/openvx/batch_normalization.c index 665047b1..321de989 100644 --- a/source/openvx/batch_normalization.c +++ b/source/openvx/batch_normalization.c @@ -32,9 +32,8 @@ int csi_ovx_batch_normalization(struct csi_tensor *input, vsi_nn_tensor_id_t input_id; vsi_nn_tensor_attr_t attr; vsi_nn_tensor_id_t output_id; - struct __target_data *td = input->t_private; - output->t_private = td; - vsi_nn_graph_t *graph = td->graph; + vsi_nn_graph_t *graph = csi_ovx_get_graph(input->sess); + output->sess = input->sess; uint32_t input_num = 5; uint32_t output_num = 1; node = vsi_nn_AddNode(graph, VSI_NN_OP_BATCH_NORM, input_num, output_num, &node_id); diff --git a/source/openvx/batch_to_space.c b/source/openvx/batch_to_space.c index 76228e55..5d7bdc9f 100644 --- a/source/openvx/batch_to_space.c +++ b/source/openvx/batch_to_space.c @@ -27,9 +27,8 @@ int csi_ovx_batch_to_space(struct csi_tensor *input, vsi_nn_node_id_t node_id; vsi_nn_tensor_attr_t attr; vsi_nn_tensor_id_t output_id; - struct __target_data *td = input->t_private; - output->t_private = td; - vsi_nn_graph_t *graph = td->graph; + vsi_nn_graph_t *graph = csi_ovx_get_graph(input->sess); + output->sess = input->sess; uint32_t input_num = 1; uint32_t output_num = 1; node = vsi_nn_AddNode(graph, VSI_NN_OP_REORG, input_num, output_num, &node_id); diff --git a/source/openvx/concat.c b/source/openvx/concat.c index 2cbf7b9a..34f78973 100644 --- a/source/openvx/concat.c +++ b/source/openvx/concat.c @@ -18,7 +18,7 @@ #include "csi_ovx.h" -int csi_ovx_concat(struct csi_tensor 
*input, +int csi_ovx_concat(struct csi_tensor **input, struct csi_tensor *output, struct concat_params *params) { @@ -26,13 +26,12 @@ int csi_ovx_concat(struct csi_tensor *input, vsi_nn_node_id_t node_id; vsi_nn_tensor_attr_t attr; vsi_nn_tensor_id_t output_id; - struct __target_data *td = input->t_private; - output->t_private = td; - vsi_nn_graph_t *graph = td->graph; + vsi_nn_graph_t *graph = csi_ovx_get_graph(input[0]->sess); + output->sess = input[0]->sess; uint32_t input_num = params->inputs_count; uint32_t output_num = 1; - uint32_t input_dim_num = input[0].dim_count; + uint32_t input_dim_num = input[0]->dim_count; node = vsi_nn_AddNode(graph, VSI_NN_OP_CONCAT, input_num, output_num, &node_id); node->nn_param.concat.axis = input_dim_num -1 - params->axis; @@ -40,7 +39,7 @@ int csi_ovx_concat(struct csi_tensor *input, /* input */ for (int i = 0; i < params->inputs_count; i++) { - node->input.tensors[i] = (vsi_nn_tensor_id_t)input[i].data; + node->input.tensors[i] = (vsi_nn_tensor_id_t)input[i]->data; } /* output */ diff --git a/source/openvx/convolution.c b/source/openvx/convolution.c index a231f66d..180220cf 100644 --- a/source/openvx/convolution.c +++ b/source/openvx/convolution.c @@ -29,9 +29,8 @@ int csi_ovx_conv2d(struct csi_tensor *input, vsi_nn_tensor_id_t input_id; vsi_nn_tensor_id_t output_id; vsi_nn_tensor_attr_t attr; - struct __target_data *td = input->t_private; - output->t_private = td; - vsi_nn_graph_t *graph = td->graph; + vsi_nn_graph_t *graph = csi_ovx_get_graph(input->sess); + output->sess = input->sess; uint32_t input_num = 3; uint32_t output_num = 1; @@ -129,9 +128,8 @@ int csi_ovx_conv2d_relu(struct csi_tensor *input, vsi_nn_tensor_id_t input_id; vsi_nn_tensor_id_t output_id; vsi_nn_tensor_attr_t attr; - struct __target_data *td = input->t_private; - output->t_private = td; - vsi_nn_graph_t *graph = td->graph; + vsi_nn_graph_t *graph = csi_ovx_get_graph(input->sess); + output->sess = input->sess; uint32_t input_num = 3; uint32_t 
output_num = 1; @@ -211,9 +209,8 @@ int csi_ovx_depthwise_conv2d(struct csi_tensor *input, vsi_nn_tensor_id_t input_id; vsi_nn_tensor_id_t output_id; vsi_nn_tensor_attr_t attr; - struct __target_data *td = input->t_private; - output->t_private = td; - vsi_nn_graph_t *graph = td->graph; + vsi_nn_graph_t *graph = csi_ovx_get_graph(input->sess); + output->sess = input->sess; uint32_t input_num = 3; uint32_t output_num = 1; @@ -293,9 +290,8 @@ int csi_ovx_group_conv2d(struct csi_tensor *input, vsi_nn_tensor_id_t input_id; vsi_nn_tensor_id_t output_id; vsi_nn_tensor_attr_t attr; - struct __target_data *td = input->t_private; - output->t_private = td; - vsi_nn_graph_t *graph = td->graph; + vsi_nn_graph_t *graph = csi_ovx_get_graph(input->sess); + output->sess = input->sess; uint32_t input_num = 3; uint32_t output_num = 1; diff --git a/source/openvx/crop.c b/source/openvx/crop.c index b924b67c..1ecc22f5 100644 --- a/source/openvx/crop.c +++ b/source/openvx/crop.c @@ -26,17 +26,16 @@ int csi_ovx_crop(struct csi_tensor *input, vsi_nn_node_id_t node_id; vsi_nn_tensor_attr_t attr; vsi_nn_tensor_id_t output_id; - struct __target_data *td = input->t_private; - output->t_private = td; - vsi_nn_graph_t *graph = td->graph; + vsi_nn_graph_t *graph = csi_ovx_get_graph(input->sess); + output->sess = input->sess; uint32_t input_num = 1; uint32_t output_num = 1; node = vsi_nn_AddNode(graph, VSI_NN_OP_CROP, input_num, output_num, &node_id); - node->nn_param.crop.axis = axis; + node->nn_param.crop.axis = params->axis; node->nn_param.crop.dims = output->dim_count; for (int i = 0; i < output->dim_count; i++) { - node->nn_param.crop.offset[i] = offset[i]; + node->nn_param.crop.offset[i] = params->offset[i]; } attr.dtype.fmt = VSI_NN_DIM_FMT_NCHW; diff --git a/source/openvx/deconvolution.c b/source/openvx/deconvolution.c index 58abef61..64271aac 100644 --- a/source/openvx/deconvolution.c +++ b/source/openvx/deconvolution.c @@ -31,9 +31,8 @@ int csi_ovx_deconv2d(struct csi_tensor *input, 
vsi_nn_tensor_id_t input_id; vsi_nn_tensor_attr_t attr; vsi_nn_tensor_id_t output_id; - struct __target_data *td = input->t_private; - output->t_private = td; - vsi_nn_graph_t *graph = td->graph; + vsi_nn_graph_t *graph = csi_ovx_get_graph(input->sess); + output->sess = input->sess; uint32_t input_num = 3; uint32_t output_num = 1; @@ -100,44 +99,19 @@ int csi_ovx_deconv2d(struct csi_tensor *input, output->data = (void *)output_id; } -void csi_depthwise_deconv2d_f32(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - int32_t stride_height, - int32_t stride_width, - int32_t pad_top, - int32_t pad_left, - int32_t pad_down, - int32_t pad_right, - int32_t dilation_height, - int32_t dilation_width) -{ - /* unsupport */ - assert(0); -} - -void csi_depthwise_deconv2d_u8(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - int32_t stride_height, - int32_t stride_width, - int32_t pad_top, - int32_t pad_left, - int32_t pad_down, - int32_t pad_right, - int32_t dilation_height, - int32_t dilation_width) +int csi_ovx_depthwise_deconv2d(struct csi_tensor *input, + struct csi_tensor *output, + struct csi_tensor *kernel, + struct csi_tensor *bias, + struct conv2d_params *params) { vsi_nn_node_t *node; vsi_nn_node_id_t node_id; vsi_nn_tensor_id_t input_id; vsi_nn_tensor_attr_t attr; vsi_nn_tensor_id_t output_id; - struct __target_data *td = input->t_private; - output->t_private = td; - vsi_nn_graph_t *graph = td->graph; + vsi_nn_graph_t *graph = csi_ovx_get_graph(input->sess); + output->sess = input->sess; uint32_t input_num = 3; uint32_t output_num = 1; @@ -145,12 +119,12 @@ void csi_depthwise_deconv2d_u8(struct csi_tensor *input, node->nn_param.deconv.ksize[0] = kernel->dim[3]; node->nn_param.deconv.ksize[1] = kernel->dim[2]; node->nn_param.deconv.weights = output->dim[1]; - node->nn_param.deconv.stride[0] = stride_width; - node->nn_param.deconv.stride[1] = 
stride_height; - node->nn_param.deconv.pad[0] = pad_left; - node->nn_param.deconv.pad[1] = pad_right; - node->nn_param.deconv.pad[2] = pad_top; - node->nn_param.deconv.pad[3] = pad_down; + node->nn_param.deconv.stride[0] = params->stride_width; + node->nn_param.deconv.stride[1] = params->stride_height; + node->nn_param.deconv.pad[0] = params->pad_left; + node->nn_param.deconv.pad[1] = params->pad_right; + node->nn_param.deconv.pad[2] = params->pad_top; + node->nn_param.deconv.pad[3] = params->pad_down; node->nn_param.deconv.group = output->dim[1]; // node->nn_param.deconv.dilation[0] = dilation_width; // node->nn_param.deconv.dilation[1] = dilation_height; diff --git a/source/openvx/depth_to_space.c b/source/openvx/depth_to_space.c index 09037c4d..17c029d3 100644 --- a/source/openvx/depth_to_space.c +++ b/source/openvx/depth_to_space.c @@ -27,9 +27,8 @@ int csi_ovx_depth_to_space(struct csi_tensor *input, vsi_nn_node_id_t node_id; vsi_nn_tensor_attr_t attr; vsi_nn_tensor_id_t output_id; - struct __target_data *td = input->t_private; - output->t_private = td; - vsi_nn_graph_t *graph = td->graph; + vsi_nn_graph_t *graph = csi_ovx_get_graph(input->sess); + output->sess = input->sess; uint32_t input_num = 1; uint32_t output_num = 1; node = vsi_nn_AddNode(graph, VSI_NN_OP_REORG, input_num, output_num, &node_id); diff --git a/source/openvx/div.c b/source/openvx/div.c index 8c9f57a3..abb9b2cb 100644 --- a/source/openvx/div.c +++ b/source/openvx/div.c @@ -27,9 +27,8 @@ int csi_ovx_div(struct csi_tensor *input0, vsi_nn_node_id_t node_id; vsi_nn_tensor_attr_t attr; vsi_nn_tensor_id_t output_id; - struct __target_data *td = input0->t_private; - output->t_private = td; - vsi_nn_graph_t *graph = td->graph; + vsi_nn_graph_t *graph = csi_ovx_get_graph(input0->sess); + output->sess = input0->sess; uint32_t input_num = 2; uint32_t output_num = 1; node = vsi_nn_AddNode(graph, VSI_NN_OP_DIVIDE, input_num, output_num, &node_id); diff --git a/source/openvx/elu.c b/source/openvx/elu.c 
index 6b5dc39a..61cb91d8 100644 --- a/source/openvx/elu.c +++ b/source/openvx/elu.c @@ -27,9 +27,8 @@ int csi_ovx_elu(struct csi_tensor *input, vsi_nn_node_id_t node_id; vsi_nn_tensor_attr_t attr; vsi_nn_tensor_id_t output_id; - struct __target_data *td = input->t_private; - output->t_private = td; - vsi_nn_graph_t *graph = td->graph; + vsi_nn_graph_t *graph = csi_ovx_get_graph(input->sess); + output->sess = input->sess; uint32_t input_num = 1; uint32_t output_num = 1; node = vsi_nn_AddNode(graph, VSI_NN_OP_ELU, input_num, output_num, &node_id); diff --git a/source/openvx/equal.c b/source/openvx/equal.c index 7e822e3b..bef0504f 100644 --- a/source/openvx/equal.c +++ b/source/openvx/equal.c @@ -27,9 +27,8 @@ int csi_ovx_equal(struct csi_tensor *input0, vsi_nn_node_id_t node_id; vsi_nn_tensor_attr_t attr; vsi_nn_tensor_id_t output_id; - struct __target_data *td = input0->t_private; - output->t_private = td; - vsi_nn_graph_t *graph = td->graph; + vsi_nn_graph_t *graph = csi_ovx_get_graph(input0->sess); + output->sess = input0->sess; uint32_t input_num = 2; uint32_t output_num = 1; node = vsi_nn_AddNode(graph, VSI_NN_OP_RELATIONAL_OPS, input_num, output_num, &node_id); diff --git a/source/openvx/exp.c b/source/openvx/exp.c index 50129397..01a39915 100644 --- a/source/openvx/exp.c +++ b/source/openvx/exp.c @@ -27,9 +27,8 @@ int csi_ovx_exp(struct csi_tensor *input, vsi_nn_node_id_t node_id; vsi_nn_tensor_attr_t attr; vsi_nn_tensor_id_t output_id; - struct __target_data *td = input->t_private; - output->t_private = td; - vsi_nn_graph_t *graph = td->graph; + vsi_nn_graph_t *graph = csi_ovx_get_graph(input->sess); + output->sess = input->sess; uint32_t input_num = 1; uint32_t output_num = 1; node = vsi_nn_AddNode(graph, VSI_NN_OP_EXP, input_num, output_num, &node_id); diff --git a/source/openvx/expand_dims.c b/source/openvx/expand_dims.c index 3520a188..5b110cfb 100644 --- a/source/openvx/expand_dims.c +++ b/source/openvx/expand_dims.c @@ -31,9 +31,8 @@ int 
csi_ovx_expand_dims_f32(struct csi_tensor *input, vsi_nn_tensor_id_t input_id; vsi_nn_tensor_attr_t attr; vsi_nn_tensor_id_t output_id; - struct __target_data *td = input->t_private; - output->t_private = td; - vsi_nn_graph_t *graph = td->graph; + vsi_nn_graph_t *graph = csi_ovx_get_graph(input->sess); + output->sess = input->sess; uint32_t input_num = 1; uint32_t output_num = 1; int i = 0; @@ -72,9 +71,8 @@ int csi_ovx_expand_dims_u8(struct csi_tensor *input, vsi_nn_tensor_id_t input_id; vsi_nn_tensor_attr_t attr; vsi_nn_tensor_id_t output_id; - struct __target_data *td = input->t_private; - output->t_private = td; - vsi_nn_graph_t *graph = td->graph; + vsi_nn_graph_t *graph = csi_ovx_get_graph(input->sess); + output->sess = input->sess; uint32_t input_num = 1; uint32_t output_num = 1; int i = 0; diff --git a/source/openvx/flatten.c b/source/openvx/flatten.c index 1b49661b..de3873c9 100644 --- a/source/openvx/flatten.c +++ b/source/openvx/flatten.c @@ -27,9 +27,8 @@ int csi_ovx_flatten(struct csi_tensor *input, vsi_nn_node_id_t node_id; vsi_nn_tensor_attr_t attr; vsi_nn_tensor_id_t output_id; - struct __target_data *td = input->t_private; - output->t_private = td; - vsi_nn_graph_t *graph = td->graph; + vsi_nn_graph_t *graph = csi_ovx_get_graph(input->sess); + output->sess = input->sess; uint32_t input_num = 1; uint32_t output_num = 1; node = vsi_nn_AddNode(graph, VSI_NN_OP_RESHAPE, input_num, output_num, &node_id); @@ -70,9 +69,8 @@ int csi_ovx_flatten_tail(struct csi_tensor *input, vsi_nn_node_id_t node_id; vsi_nn_tensor_attr_t attr; vsi_nn_tensor_id_t output_id; - struct __target_data *td = input->t_private; - output->t_private = td; - vsi_nn_graph_t *graph = td->graph; + vsi_nn_graph_t *graph = csi_ovx_get_graph(input->sess); + output->sess = input->sess; uint32_t input_num = 1; uint32_t output_num = 1; node = vsi_nn_AddNode(graph, VSI_NN_OP_RESHAPE, input_num, output_num, &node_id); diff --git a/source/openvx/floor.c b/source/openvx/floor.c index 
c5b57413..ddced75e 100644 --- a/source/openvx/floor.c +++ b/source/openvx/floor.c @@ -28,9 +28,8 @@ int csi_ovx_floor(struct csi_tensor *input, vsi_nn_node_id_t node_id; vsi_nn_tensor_attr_t attr; vsi_nn_tensor_id_t output_id; - struct __target_data *td = input->t_private; - output->t_private = td; - vsi_nn_graph_t *graph = td->graph; + vsi_nn_graph_t *graph = csi_ovx_get_graph(input->sess); + output->sess = input->sess; uint32_t input_num = 1; uint32_t output_num = 1; node = vsi_nn_AddNode(graph, VSI_NN_OP_FLOOR, input_num, output_num, &node_id); diff --git a/source/openvx/floor_divide.c b/source/openvx/floor_divide.c index 63919841..e6b45bab 100644 --- a/source/openvx/floor_divide.c +++ b/source/openvx/floor_divide.c @@ -27,9 +27,8 @@ int csi_ovx_floor_divide(struct csi_tensor *input0, vsi_nn_node_id_t node_id; vsi_nn_tensor_attr_t attr; vsi_nn_tensor_id_t output_id; - struct __target_data *td = input0->t_private; - output->t_private = td; - vsi_nn_graph_t *graph = td->graph; + vsi_nn_graph_t *graph = csi_ovx_get_graph(input0->sess); + output->sess = input0->sess; uint32_t input_num = 2; uint32_t output_num = 1; node = vsi_nn_AddNode(graph, VSI_NN_OP_FLOORDIV, input_num, output_num, &node_id); diff --git a/source/openvx/fullyconnected.c b/source/openvx/fullyconnected.c index a70a60c8..1c450cff 100644 --- a/source/openvx/fullyconnected.c +++ b/source/openvx/fullyconnected.c @@ -30,9 +30,8 @@ int csi_ovx_fullyconnected(struct csi_tensor *input, vsi_nn_tensor_id_t input_id; vsi_nn_tensor_id_t output_id; vsi_nn_tensor_attr_t attr; - struct __target_data *td = input->t_private; - output->t_private = td; - vsi_nn_graph_t *graph = td->graph; + vsi_nn_graph_t *graph = csi_ovx_get_graph(input->sess); + output->sess = input->sess; uint32_t input_num = 3; uint32_t output_num = 1; node = vsi_nn_AddNode(graph, VSI_NN_OP_FCL, input_num, output_num, &node_id); @@ -99,9 +98,8 @@ int csi_ovx_fullyconnected_relu(struct csi_tensor *input, vsi_nn_tensor_id_t input_id; 
vsi_nn_tensor_id_t output_id; vsi_nn_tensor_attr_t attr; - struct __target_data *td = input->t_private; - output->t_private = td; - vsi_nn_graph_t *graph = td->graph; + vsi_nn_graph_t *graph = csi_ovx_get_graph(input->sess); + output->sess = input->sess; uint32_t input_num = 3; uint32_t output_num = 1; node = vsi_nn_AddNode(graph, VSI_NN_OP_FCL, input_num, output_num, &node_id); diff --git a/source/openvx/global_averagepool.c b/source/openvx/global_averagepool.c index 38a4c703..6e3065b1 100644 --- a/source/openvx/global_averagepool.c +++ b/source/openvx/global_averagepool.c @@ -27,9 +27,8 @@ int csi_ovx_global_averagepool(struct csi_tensor *input, vsi_nn_node_id_t node_id; vsi_nn_tensor_attr_t attr; vsi_nn_tensor_id_t output_id; - struct __target_data *td = input->t_private; - output->t_private = td; - vsi_nn_graph_t *graph = td->graph; + vsi_nn_graph_t *graph = csi_ovx_get_graph(input->sess); + output->sess = input->sess; uint32_t input_num = 1; uint32_t output_num = 1; node = vsi_nn_AddNode(graph, VSI_NN_OP_POOL, input_num, output_num, &node_id); diff --git a/source/openvx/global_maxpool.c b/source/openvx/global_maxpool.c index dd31208d..305d4851 100644 --- a/source/openvx/global_maxpool.c +++ b/source/openvx/global_maxpool.c @@ -27,9 +27,8 @@ int csi_ovx_global_maxpool(struct csi_tensor *input, vsi_nn_node_id_t node_id; vsi_nn_tensor_attr_t attr; vsi_nn_tensor_id_t output_id; - struct __target_data *td = input->t_private; - output->t_private = td; - vsi_nn_graph_t *graph = td->graph; + vsi_nn_graph_t *graph = csi_ovx_get_graph(input->sess); + output->sess = input->sess; uint32_t input_num = 1; uint32_t output_num = 1; node = vsi_nn_AddNode(graph, VSI_NN_OP_POOL, input_num, output_num, &node_id); diff --git a/source/openvx/greater.c b/source/openvx/greater.c index ab9db827..b9b3ede9 100644 --- a/source/openvx/greater.c +++ b/source/openvx/greater.c @@ -28,9 +28,8 @@ int csi_ovx_greater(struct csi_tensor *input0, vsi_nn_node_id_t node_id; vsi_nn_tensor_attr_t 
attr; vsi_nn_tensor_id_t output_id; - struct __target_data *td = input0->t_private; - output->t_private = td; - vsi_nn_graph_t *graph = td->graph; + vsi_nn_graph_t *graph = csi_ovx_get_graph(input0->sess); + output->sess = input0->sess; uint32_t input_num = 2; uint32_t output_num = 1; node = vsi_nn_AddNode(graph, VSI_NN_OP_RELATIONAL_OPS, input_num, output_num, &node_id); diff --git a/source/openvx/greater_equal.c b/source/openvx/greater_equal.c index 51032fb6..b3a96706 100644 --- a/source/openvx/greater_equal.c +++ b/source/openvx/greater_equal.c @@ -28,9 +28,8 @@ int csi_ovx_greater_equal(struct csi_tensor *input0, vsi_nn_node_id_t node_id; vsi_nn_tensor_attr_t attr; vsi_nn_tensor_id_t output_id; - struct __target_data *td = input0->t_private; - output->t_private = td; - vsi_nn_graph_t *graph = td->graph; + vsi_nn_graph_t *graph = csi_ovx_get_graph(input0->sess); + output->sess = input0->sess; uint32_t input_num = 2; uint32_t output_num = 1; node = vsi_nn_AddNode(graph, VSI_NN_OP_RELATIONAL_OPS, input_num, output_num, &node_id); diff --git a/source/openvx/l2_normalization.c b/source/openvx/l2_normalization.c index 32c3e05a..101cec18 100644 --- a/source/openvx/l2_normalization.c +++ b/source/openvx/l2_normalization.c @@ -26,9 +26,8 @@ int csi_ovx_l2_normalization(struct csi_tensor *input, vsi_nn_node_id_t node_id; vsi_nn_tensor_attr_t attr; vsi_nn_tensor_id_t output_id; - struct __target_data *td = input->t_private; - output->t_private = td; - vsi_nn_graph_t *graph = td->graph; + vsi_nn_graph_t *graph = csi_ovx_get_graph(input->sess); + output->sess = input->sess; uint32_t input_num = 1; uint32_t output_num = 1; node = vsi_nn_AddNode(graph, VSI_NN_OP_L2_NORMALIZE, input_num, output_num, &node_id); diff --git a/source/openvx/l2pool.c b/source/openvx/l2pool.c index 1fda3101..4fa87b3b 100644 --- a/source/openvx/l2pool.c +++ b/source/openvx/l2pool.c @@ -30,9 +30,8 @@ int csi_ovx_l2pool(struct csi_tensor *input, vsi_nn_tensor_id_t input_id; vsi_nn_tensor_attr_t attr; 
vsi_nn_tensor_id_t output_id; - struct __target_data *td = input->t_private; - output->t_private = td; - vsi_nn_graph_t *graph = td->graph; + vsi_nn_graph_t *graph = csi_ovx_get_graph(input->sess); + output->sess = input->sess; uint32_t input_num = 1; uint32_t output_num = 1; node = vsi_nn_AddNode(graph, VSI_NN_OP_POOL, input_num, output_num, &node_id); diff --git a/source/openvx/leaky_relu.c b/source/openvx/leaky_relu.c index 8d804101..06b2e2de 100644 --- a/source/openvx/leaky_relu.c +++ b/source/openvx/leaky_relu.c @@ -26,9 +26,8 @@ int csi_ovx_leaky_relu(struct csi_tensor *input, vsi_nn_node_id_t node_id; vsi_nn_tensor_attr_t attr; vsi_nn_tensor_id_t output_id; - struct __target_data *td = input->t_private; - output->t_private = td; - vsi_nn_graph_t *graph = td->graph; + vsi_nn_graph_t *graph = csi_ovx_get_graph(input->sess); + output->sess = input->sess; uint32_t input_num = 1; uint32_t output_num = 1; node = vsi_nn_AddNode(graph, VSI_NN_OP_LEAKY_RELU, input_num, output_num, &node_id); diff --git a/source/openvx/less.c b/source/openvx/less.c index 95fdc4a0..20a9632a 100644 --- a/source/openvx/less.c +++ b/source/openvx/less.c @@ -27,9 +27,8 @@ int csi_ovx_less(struct csi_tensor *input0, vsi_nn_node_id_t node_id; vsi_nn_tensor_attr_t attr; vsi_nn_tensor_id_t output_id; - struct __target_data *td = input0->t_private; - output->t_private = td; - vsi_nn_graph_t *graph = td->graph; + vsi_nn_graph_t *graph = csi_ovx_get_graph(input0->sess); + output->sess = input0->sess; uint32_t input_num = 2; uint32_t output_num = 1; node = vsi_nn_AddNode(graph, VSI_NN_OP_RELATIONAL_OPS, input_num, output_num, &node_id); diff --git a/source/openvx/less_equal.c b/source/openvx/less_equal.c index a0418422..036a9e73 100644 --- a/source/openvx/less_equal.c +++ b/source/openvx/less_equal.c @@ -28,9 +28,8 @@ int csi_ovx_less_equal(struct csi_tensor *input0, vsi_nn_node_id_t node_id; vsi_nn_tensor_attr_t attr; vsi_nn_tensor_id_t output_id; - struct __target_data *td = input0->t_private; - 
output->t_private = td; - vsi_nn_graph_t *graph = td->graph; + vsi_nn_graph_t *graph = csi_ovx_get_graph(input0->sess); + output->sess = input0->sess; uint32_t input_num = 2; uint32_t output_num = 1; node = vsi_nn_AddNode(graph, VSI_NN_OP_RELATIONAL_OPS, input_num, output_num, &node_id); diff --git a/source/openvx/lrn.c b/source/openvx/lrn.c index f1010bc1..faf54b9c 100644 --- a/source/openvx/lrn.c +++ b/source/openvx/lrn.c @@ -26,9 +26,8 @@ int csi_ovx_lrn(struct csi_tensor *input, vsi_nn_node_id_t node_id; vsi_nn_tensor_attr_t attr; vsi_nn_tensor_id_t output_id; - struct __target_data *td = input->t_private; - output->t_private = td; - vsi_nn_graph_t *graph = td->graph; + vsi_nn_graph_t *graph = csi_ovx_get_graph(input->sess); + output->sess = input->sess; uint32_t input_num = 1; uint32_t output_num = 1; node = vsi_nn_AddNode(graph, VSI_NN_OP_LRN, input_num, output_num, &node_id); diff --git a/source/openvx/matmul.c b/source/openvx/matmul.c index c546d9bb..e032ba6d 100644 --- a/source/openvx/matmul.c +++ b/source/openvx/matmul.c @@ -28,9 +28,8 @@ int csi_ovx_matmul(struct csi_tensor *mat0, vsi_nn_node_id_t node_id; vsi_nn_tensor_attr_t attr; vsi_nn_tensor_id_t output_id; - struct __target_data *td = mat0->t_private; - output->t_private = td; - vsi_nn_graph_t *graph = td->graph; + vsi_nn_graph_t *graph = csi_ovx_get_graph(mat0->sess); + output->sess = mat0->sess; uint32_t input_num = 2; uint32_t output_num = 1; node = vsi_nn_AddNode(graph, VSI_NN_OP_MATRIXMUL, input_num, output_num, &node_id); diff --git a/source/openvx/max.c b/source/openvx/max.c index 308bb5dc..94e36639 100644 --- a/source/openvx/max.c +++ b/source/openvx/max.c @@ -29,9 +29,8 @@ int csi_ovx_max(struct csi_tensor *input, vsi_nn_tensor_id_t input_id; vsi_nn_tensor_attr_t attr; vsi_nn_tensor_id_t output_id; - struct __target_data *td = input->t_private; - output->t_private = td; - vsi_nn_graph_t *graph = td->graph; + vsi_nn_graph_t *graph = csi_ovx_get_graph(input->sess); + output->sess = 
input->sess; uint32_t input_num = 1; uint32_t output_num = 1; int i = 0; diff --git a/source/openvx/maximum.c b/source/openvx/maximum.c index 0cd38277..7120845a 100644 --- a/source/openvx/maximum.c +++ b/source/openvx/maximum.c @@ -28,9 +28,8 @@ int csi_ovx_maximum(struct csi_tensor *input0, vsi_nn_node_id_t node_id; vsi_nn_tensor_attr_t attr; vsi_nn_tensor_id_t output_id; - struct __target_data *td = input0->t_private; - output->t_private = td; - vsi_nn_graph_t *graph = td->graph; + vsi_nn_graph_t *graph = csi_ovx_get_graph(input0->sess); + output->sess = input0->sess; uint32_t input_num = 2; uint32_t output_num = 1; node = vsi_nn_AddNode(graph, VSI_NN_OP_MAXIMUM, input_num, output_num, &node_id); diff --git a/source/openvx/maxpool.c b/source/openvx/maxpool.c index 17c146f4..07eceb14 100644 --- a/source/openvx/maxpool.c +++ b/source/openvx/maxpool.c @@ -28,9 +28,8 @@ int csi_ovx_maxpool(struct csi_tensor *input, vsi_nn_tensor_id_t input_id; vsi_nn_tensor_id_t output_id; vsi_nn_tensor_attr_t attr; - struct __target_data *td = input->t_private; - output->t_private = td; - vsi_nn_graph_t *graph = td->graph; + vsi_nn_graph_t *graph = csi_ovx_get_graph(input->sess); + output->sess = input->sess; uint32_t input_num = 1; uint32_t output_num = 1; node = vsi_nn_AddNode(graph, VSI_NN_OP_POOL, input_num, output_num, &node_id); diff --git a/source/openvx/maxpool_locat.c b/source/openvx/maxpool_locat.c index 9b9d420d..24657f64 100644 --- a/source/openvx/maxpool_locat.c +++ b/source/openvx/maxpool_locat.c @@ -27,9 +27,8 @@ int csi_ovx_maxpool2d_locat(struct csi_tensor *input, vsi_nn_tensor_id_t input_id; vsi_nn_tensor_id_t output_id; vsi_nn_tensor_attr_t attr; - struct __target_data *td = input->t_private; - output->t_private = td; - vsi_nn_graph_t *graph = td->graph; + vsi_nn_graph_t *graph = csi_ovx_get_graph(input->sess); + output->sess = input->sess; uint32_t input_num = 1; uint32_t output_num = 2; node = vsi_nn_AddNode(graph, VSI_NN_OP_POOLWITHARGMAX, input_num, 
output_num, &node_id); diff --git a/source/openvx/mean.c b/source/openvx/mean.c index c2714638..b21151f1 100644 --- a/source/openvx/mean.c +++ b/source/openvx/mean.c @@ -29,9 +29,8 @@ int csi_ovx_mean(struct csi_tensor *input, vsi_nn_tensor_id_t input_id; vsi_nn_tensor_attr_t attr; vsi_nn_tensor_id_t output_id; - struct __target_data *td = input->t_private; - output->t_private = td; - vsi_nn_graph_t *graph = td->graph; + vsi_nn_graph_t *graph = csi_ovx_get_graph(input->sess); + output->sess = input->sess; uint32_t input_num = 1; uint32_t output_num = 1; int i = 0; diff --git a/source/openvx/min.c b/source/openvx/min.c index ed52d49e..6518abae 100644 --- a/source/openvx/min.c +++ b/source/openvx/min.c @@ -27,9 +27,8 @@ int csi_ovx_min(struct csi_tensor *input, vsi_nn_tensor_id_t input_id; vsi_nn_tensor_attr_t attr; vsi_nn_tensor_id_t output_id; - struct __target_data *td = input->t_private; - output->t_private = td; - vsi_nn_graph_t *graph = td->graph; + vsi_nn_graph_t *graph = csi_ovx_get_graph(input->sess); + output->sess = input->sess; uint32_t input_num = 1; uint32_t output_num = 1; int i = 0; diff --git a/source/openvx/minimum.c b/source/openvx/minimum.c index 676ffd38..da5aa6ca 100644 --- a/source/openvx/minimum.c +++ b/source/openvx/minimum.c @@ -27,9 +27,8 @@ int csi_ovx_minimum(struct csi_tensor *input0, vsi_nn_node_id_t node_id; vsi_nn_tensor_attr_t attr; vsi_nn_tensor_id_t output_id; - struct __target_data *td = input0->t_private; - output->t_private = td; - vsi_nn_graph_t *graph = td->graph; + vsi_nn_graph_t *graph = csi_ovx_get_graph(input0->sess); + output->sess = input0->sess; uint32_t input_num = 2; uint32_t output_num = 1; node = vsi_nn_AddNode(graph, VSI_NN_OP_MINIMUM, input_num, output_num, &node_id); diff --git a/source/openvx/mul.c b/source/openvx/mul.c index a70bb1d8..21c1ef43 100644 --- a/source/openvx/mul.c +++ b/source/openvx/mul.c @@ -27,9 +27,8 @@ int csi_ovx_mul(struct csi_tensor *input0, vsi_nn_node_id_t node_id; vsi_nn_tensor_attr_t 
attr; vsi_nn_tensor_id_t output_id; - struct __target_data *td = input0->t_private; - output->t_private = td; - vsi_nn_graph_t *graph = td->graph; + vsi_nn_graph_t *graph = csi_ovx_get_graph(input0->sess); + output->sess = input0->sess; uint32_t input_num = 2; uint32_t output_num = 1; node = vsi_nn_AddNode(graph, VSI_NN_OP_MULTIPLY, input_num, output_num, &node_id); diff --git a/source/openvx/negative.c b/source/openvx/negative.c index 9a4ec6a8..069ed92d 100644 --- a/source/openvx/negative.c +++ b/source/openvx/negative.c @@ -27,9 +27,8 @@ int csi_ovx_negative(struct csi_tensor *input, vsi_nn_node_id_t node_id; vsi_nn_tensor_attr_t attr; vsi_nn_tensor_id_t output_id; - struct __target_data *td = input->t_private; - output->t_private = td; - vsi_nn_graph_t *graph = td->graph; + vsi_nn_graph_t *graph = csi_ovx_get_graph(input->sess); + output->sess = input->sess; uint32_t input_num = 1; uint32_t output_num = 1; node = vsi_nn_AddNode(graph, VSI_NN_OP_NEG, input_num, output_num, &node_id); diff --git a/source/openvx/not_equal.c b/source/openvx/not_equal.c index 58a90404..6a352e7c 100644 --- a/source/openvx/not_equal.c +++ b/source/openvx/not_equal.c @@ -28,9 +28,8 @@ int csi_ovx_not_equal(struct csi_tensor *input0, vsi_nn_node_id_t node_id; vsi_nn_tensor_attr_t attr; vsi_nn_tensor_id_t output_id; - struct __target_data *td = input0->t_private; - output->t_private = td; - vsi_nn_graph_t *graph = td->graph; + vsi_nn_graph_t *graph = csi_ovx_get_graph(input0->sess); + output->sess = input0->sess; uint32_t input_num = 2; uint32_t output_num = 1; node = vsi_nn_AddNode(graph, VSI_NN_OP_RELATIONAL_OPS, input_num, output_num, &node_id); diff --git a/source/openvx/or.c b/source/openvx/or.c index 97e121cf..f7ce5f9c 100644 --- a/source/openvx/or.c +++ b/source/openvx/or.c @@ -28,9 +28,8 @@ int csi_ovx_or(struct csi_tensor *input0, vsi_nn_node_id_t node_id; vsi_nn_tensor_attr_t attr; vsi_nn_tensor_id_t output_id; - struct __target_data *td = input0->t_private; - output->t_private 
= td; - vsi_nn_graph_t *graph = td->graph; + vsi_nn_graph_t *graph = csi_ovx_get_graph(input0->sess); + output->sess = input0->sess; uint32_t input_num = 2; uint32_t output_num = 1; node = vsi_nn_AddNode(graph, VSI_NN_OP_LOGICAL_OPS, input_num, output_num, &node_id); diff --git a/source/openvx/pad.c b/source/openvx/pad.c index 150b4b0d..685794df 100644 --- a/source/openvx/pad.c +++ b/source/openvx/pad.c @@ -26,9 +26,8 @@ int csi_ovx_pad(struct csi_tensor *input, vsi_nn_node_id_t node_id; vsi_nn_tensor_attr_t attr; vsi_nn_tensor_id_t output_id; - struct __target_data *td = input->t_private; - output->t_private = td; - vsi_nn_graph_t *graph = td->graph; + vsi_nn_graph_t *graph = csi_ovx_get_graph(input->sess); + output->sess = input->sess; uint32_t input_num = 1; uint32_t output_num = 1; node = vsi_nn_AddNode(graph, VSI_NN_OP_PAD, input_num, output_num, &node_id); diff --git a/source/openvx/pool_with_argmax.c b/source/openvx/pool_with_argmax.c index 3771a607..93cdcbff 100644 --- a/source/openvx/pool_with_argmax.c +++ b/source/openvx/pool_with_argmax.c @@ -28,26 +28,25 @@ int csi_ovx_pool_with_argmax(struct csi_tensor *input, vsi_nn_tensor_id_t input_id; vsi_nn_tensor_attr_t attr; vsi_nn_tensor_id_t output_id; - struct __target_data *td = input->t_private; - output->t_private = td; - vsi_nn_graph_t *graph = td->graph; + vsi_nn_graph_t *graph = csi_ovx_get_graph(input->sess); + output->sess = input->sess; uint32_t input_num = 1; uint32_t output_num = 1; node = vsi_nn_AddNode(graph, VSI_NN_OP_POOLWITHARGMAX, input_num, output_num, &node_id); - node->nn_param.pool.ksize[0] = filter_width; - node->nn_param.pool.ksize[1] = filter_height; - node->nn_param.pool.stride[0] = stride_width; - node->nn_param.pool.stride[1] = stride_height; + node->nn_param.pool.ksize[0] = params->filter_width; + node->nn_param.pool.ksize[1] = params->filter_height; + node->nn_param.pool.stride[0] = params->stride_width; + node->nn_param.pool.stride[1] = params->stride_height; - int ceil_mode_h = 
csi_get_ceil_mode_fix(input->dim[2], filter_height, - stride_height, pad_height); - int ceil_mode_w = csi_get_ceil_mode_fix(input->dim[3], filter_width, - stride_width, pad_width); - node->nn_param.pool.pad[0] = pad_width; - node->nn_param.pool.pad[1] = pad_width + ceil_mode_w; - node->nn_param.pool.pad[2] = pad_height; - node->nn_param.pool.pad[3] = pad_height + ceil_mode_h; - node->nn_param.pool.type = pool_type; + int ceil_mode_h = csi_get_ceil_mode_fix(input->dim[2], params->filter_height, + params->stride_height, params->pad_top); + int ceil_mode_w = csi_get_ceil_mode_fix(input->dim[3], params->filter_width, + params->stride_width, params->pad_right); + node->nn_param.pool.pad[0] = params->pad_right; + node->nn_param.pool.pad[1] = params->pad_right + ceil_mode_w; + node->nn_param.pool.pad[2] = params->pad_top; + node->nn_param.pool.pad[3] = params->pad_top + ceil_mode_h; + node->nn_param.pool.type = params->pool_type; node->nn_param.pool.round_type = VSI_NN_ROUND_CEIL; node->vx_param.down_scale_size_rounding = VX_CONVOLUTIONAL_NETWORK_DS_SIZE_ROUNDING_FLOOR; diff --git a/source/openvx/power.c b/source/openvx/power.c index 11e6cbde..d6b2514c 100644 --- a/source/openvx/power.c +++ b/source/openvx/power.c @@ -28,9 +28,8 @@ int csi_ovx_power(struct csi_tensor *input0, vsi_nn_node_id_t node_id; vsi_nn_tensor_attr_t attr; vsi_nn_tensor_id_t output_id; - struct __target_data *td = input0->t_private; - output->t_private = td; - vsi_nn_graph_t *graph = td->graph; + vsi_nn_graph_t *graph = csi_ovx_get_graph(input0->sess); + output->sess = input0->sess; uint32_t input_num = 2; uint32_t output_num = 1; node = vsi_nn_AddNode(graph, VSI_NN_OP_POW, input_num, output_num, &node_id); diff --git a/source/openvx/prelu.c b/source/openvx/prelu.c index a0173494..8cd046c6 100644 --- a/source/openvx/prelu.c +++ b/source/openvx/prelu.c @@ -29,9 +29,8 @@ int csi_ovx_prelu(struct csi_tensor *input, vsi_nn_tensor_id_t input_id; vsi_nn_tensor_attr_t attr; vsi_nn_tensor_id_t output_id; - 
struct __target_data *td = input->t_private; - output->t_private = td; - vsi_nn_graph_t *graph = td->graph; + vsi_nn_graph_t *graph = csi_ovx_get_graph(input->sess); + output->sess = input->sess; uint32_t input_num = 2; uint32_t output_num = 1; node = vsi_nn_AddNode(graph, VSI_NN_OP_PRELU, input_num, output_num, &node_id); diff --git a/source/openvx/prod.c b/source/openvx/prod.c index 8ec67e67..216598fd 100644 --- a/source/openvx/prod.c +++ b/source/openvx/prod.c @@ -28,9 +28,8 @@ int csi_ovx_prod(struct csi_tensor *input, vsi_nn_tensor_id_t input_id; vsi_nn_tensor_attr_t attr; vsi_nn_tensor_id_t output_id; - struct __target_data *td = input->t_private; - output->t_private = td; - vsi_nn_graph_t *graph = td->graph; + vsi_nn_graph_t *graph = csi_ovx_get_graph(input->sess); + output->sess = input->sess; uint32_t input_num = 1; uint32_t output_num = 1; int i = 0; diff --git a/source/openvx/proposal.c b/source/openvx/proposal.c index d967c9ce..60ffecaf 100644 --- a/source/openvx/proposal.c +++ b/source/openvx/proposal.c @@ -31,9 +31,8 @@ int csi_ovx_proposal(struct csi_tensor *cls_prob, vsi_nn_tensor_id_t input_id; vsi_nn_tensor_attr_t attr; vsi_nn_tensor_id_t output_id; - struct __target_data *td = cls_prob->t_private; - output->t_private = td; - vsi_nn_graph_t *graph = td->graph; + vsi_nn_graph_t *graph = csi_ovx_get_graph(cls_prob->sess); + output->sess = cls_prob->sess; uint32_t input_num = 4; uint32_t output_num = 2; node = vsi_nn_AddNode(graph, VSI_NN_OP_PROPOSAL, input_num, output_num, diff --git a/source/openvx/psroipooling.c b/source/openvx/psroipooling.c index 3bd48460..b70113f9 100644 --- a/source/openvx/psroipooling.c +++ b/source/openvx/psroipooling.c @@ -28,9 +28,8 @@ int csi_ovx_psroipooling(struct csi_tensor *data, vsi_nn_tensor_id_t input_id; vsi_nn_tensor_attr_t attr; vsi_nn_tensor_id_t output_id; - struct __target_data *td = data->t_private; - output->t_private = td; - vsi_nn_graph_t *graph = td->graph; + vsi_nn_graph_t *graph = 
csi_ovx_get_graph(data->sess); + output->sess = data->sess; uint32_t input_num = 2; uint32_t output_num = 1; node = vsi_nn_AddNode(graph, VSI_NN_OP_CUSTOM_PSROIPOOLING, input_num, output_num, &node_id); diff --git a/source/openvx/relu.c b/source/openvx/relu.c index 113b88f0..6f3cf598 100644 --- a/source/openvx/relu.c +++ b/source/openvx/relu.c @@ -26,9 +26,8 @@ int csi_ovx_relu(struct csi_tensor *input, vsi_nn_node_id_t node_id; vsi_nn_tensor_attr_t attr; vsi_nn_tensor_id_t output_id; - struct __target_data *td = input->t_private; - output->t_private = td; - vsi_nn_graph_t *graph = td->graph; + vsi_nn_graph_t *graph = csi_ovx_get_graph(input->sess); + output->sess = input->sess; uint32_t input_num = 1; uint32_t output_num = 1; node = vsi_nn_AddNode(graph, VSI_NN_OP_RELU, input_num, output_num, &node_id); diff --git a/source/openvx/relu1.c b/source/openvx/relu1.c index 8c05496f..25f56c86 100644 --- a/source/openvx/relu1.c +++ b/source/openvx/relu1.c @@ -26,9 +26,8 @@ int csi_ovx_relu1(struct csi_tensor *input, vsi_nn_node_id_t node_id; vsi_nn_tensor_attr_t attr; vsi_nn_tensor_id_t output_id; - struct __target_data *td = input->t_private; - output->t_private = td; - vsi_nn_graph_t *graph = td->graph; + vsi_nn_graph_t *graph = csi_ovx_get_graph(input->sess); + output->sess = input->sess; uint32_t input_num = 1; uint32_t output_num = 1; node = vsi_nn_AddNode(graph, VSI_NN_OP_RELU1, input_num, output_num, &node_id); diff --git a/source/openvx/relu6.c b/source/openvx/relu6.c index e1b76d97..f0742191 100644 --- a/source/openvx/relu6.c +++ b/source/openvx/relu6.c @@ -26,9 +26,8 @@ int csi_ovx_relu6(struct csi_tensor *input, vsi_nn_node_id_t node_id; vsi_nn_tensor_attr_t attr; vsi_nn_tensor_id_t output_id; - struct __target_data *td = input->t_private; - output->t_private = td; - vsi_nn_graph_t *graph = td->graph; + vsi_nn_graph_t *graph = csi_ovx_get_graph(input->sess); + output->sess = input->sess; uint32_t input_num = 1; uint32_t output_num = 1; node = 
vsi_nn_AddNode(graph, VSI_NN_OP_RELU6, input_num, output_num, &node_id); diff --git a/source/openvx/relun.c b/source/openvx/relun.c index 623b5b1b..6cdbf107 100644 --- a/source/openvx/relun.c +++ b/source/openvx/relun.c @@ -26,9 +26,8 @@ int csi_ovx_relun(struct csi_tensor *input, vsi_nn_node_id_t node_id; vsi_nn_tensor_attr_t attr; vsi_nn_tensor_id_t output_id; - struct __target_data *td = input->t_private; - output->t_private = td; - vsi_nn_graph_t *graph = td->graph; + vsi_nn_graph_t *graph = csi_ovx_get_graph(input->sess); + output->sess = input->sess; uint32_t input_num = 1; uint32_t output_num = 1; node = vsi_nn_AddNode(graph, VSI_NN_OP_RELUN, input_num, output_num, &node_id); diff --git a/source/openvx/reorg.c b/source/openvx/reorg.c index 60ae1147..f6fa6aad 100644 --- a/source/openvx/reorg.c +++ b/source/openvx/reorg.c @@ -27,9 +27,8 @@ int csi_ovx_reorg(struct csi_tensor *input, vsi_nn_node_id_t node_id; vsi_nn_tensor_attr_t attr; vsi_nn_tensor_id_t output_id; - struct __target_data *td = input->t_private; - output->t_private = td; - vsi_nn_graph_t *graph = td->graph; + vsi_nn_graph_t *graph = csi_ovx_get_graph(input->sess); + output->sess = input->sess; uint32_t input_num = 1; uint32_t output_num = 1; node = vsi_nn_AddNode(graph, VSI_NN_OP_REORG, input_num, output_num, &node_id); diff --git a/source/openvx/reshape.c b/source/openvx/reshape.c index 2e8d4ff9..333f8653 100644 --- a/source/openvx/reshape.c +++ b/source/openvx/reshape.c @@ -28,9 +28,8 @@ int csi_ovx_reshape(struct csi_tensor *input, vsi_nn_tensor_id_t input_id; vsi_nn_tensor_attr_t attr; vsi_nn_tensor_id_t output_id; - struct __target_data *td = input->t_private; - output->t_private = td; - vsi_nn_graph_t *graph = td->graph; + vsi_nn_graph_t *graph = csi_ovx_get_graph(input->sess); + output->sess = input->sess; uint32_t input_num = 1; uint32_t output_num = 1; int i = 0; @@ -76,9 +75,8 @@ int csi_ovx_reshape_tail(struct csi_tensor *input, vsi_nn_tensor_id_t input_id; vsi_nn_tensor_attr_t attr; 
vsi_nn_tensor_id_t output_id; - struct __target_data *td = input->t_private; - output->t_private = td; - vsi_nn_graph_t *graph = td->graph; + vsi_nn_graph_t *graph = csi_ovx_get_graph(input->sess); + output->sess = input->sess; uint32_t input_num = 1; uint32_t output_num = 1; int i = 0; diff --git a/source/openvx/resize.c b/source/openvx/resize.c index 01128b4c..c9a2541a 100644 --- a/source/openvx/resize.c +++ b/source/openvx/resize.c @@ -27,9 +27,8 @@ int csi_ovx_resize(struct csi_tensor *input, vsi_nn_node_id_t node_id; vsi_nn_tensor_attr_t attr; vsi_nn_tensor_id_t output_id; - struct __target_data *td = input->t_private; - output->t_private = td; - vsi_nn_graph_t *graph = td->graph; + vsi_nn_graph_t *graph = csi_ovx_get_graph(input->sess); + output->sess = input->sess; uint32_t input_num = 1; uint32_t output_num = 1; node = vsi_nn_AddNode(graph, VSI_NN_OP_RESIZE, input_num, output_num, &node_id); diff --git a/source/openvx/reverse.c b/source/openvx/reverse.c index cf4f8522..cd535d00 100644 --- a/source/openvx/reverse.c +++ b/source/openvx/reverse.c @@ -27,9 +27,8 @@ int csi_ovx_reverse(struct csi_tensor *input, vsi_nn_node_id_t node_id; vsi_nn_tensor_attr_t attr; vsi_nn_tensor_id_t output_id; - struct __target_data *td = input->t_private; - output->t_private = td; - vsi_nn_graph_t *graph = td->graph; + vsi_nn_graph_t *graph = csi_ovx_get_graph(input->sess); + output->sess = input->sess; uint32_t input_num = 1; uint32_t output_num = 1; node = vsi_nn_AddNode(graph, VSI_NN_OP_REVERSE, input_num, output_num, &node_id); diff --git a/source/openvx/rsqrt.c b/source/openvx/rsqrt.c index 3f7962d5..74175da0 100644 --- a/source/openvx/rsqrt.c +++ b/source/openvx/rsqrt.c @@ -27,9 +27,8 @@ int csi_ovx_rsqrt(struct csi_tensor *input, vsi_nn_node_id_t node_id; vsi_nn_tensor_attr_t attr; vsi_nn_tensor_id_t output_id; - struct __target_data *td = input->t_private; - output->t_private = td; - vsi_nn_graph_t *graph = td->graph; + vsi_nn_graph_t *graph = 
csi_ovx_get_graph(input->sess); + output->sess = input->sess; uint32_t input_num = 1; uint32_t output_num = 1; node = vsi_nn_AddNode(graph, VSI_NN_OP_RSQRT, input_num, output_num, &node_id); diff --git a/source/openvx/select.c b/source/openvx/select.c index 520e10ef..11cfa7b6 100644 --- a/source/openvx/select.c +++ b/source/openvx/select.c @@ -28,9 +28,8 @@ int csi_ovx_select(struct csi_tensor *condition, vsi_nn_node_id_t node_id; vsi_nn_tensor_attr_t attr; vsi_nn_tensor_id_t output_id; - struct __target_data *td = input->t_private; - output->t_private = td; - vsi_nn_graph_t *graph = td->graph; + vsi_nn_graph_t *graph = csi_ovx_get_graph(input0->sess); + output->sess = input0->sess; uint32_t input_num = 3; uint32_t output_num = 1; node = vsi_nn_AddNode(graph, VSI_NN_OP_SELECT, input_num, output_num, &node_id); diff --git a/source/openvx/setup.c b/source/openvx/setup.c index c3e3aa10..300b5f2e 100644 --- a/source/openvx/setup.c +++ b/source/openvx/setup.c @@ -16,105 +16,19 @@ * limitations under the License. 
*/ -#include #include "csi_nn.h" #include "csi_utils.h" #include "csi_ovx.h" -#define NET_TOTAL_TENSOR_NUM 0 /* useless */ -#define NET_NODE_NUM 0 /* useless */ -#define BILLION 1000000000 - -static void get_statistical_data(float *data, int sz) -{ - int i = 0; - float max_value = data[0]; - float min_value = data[0]; - double std = 0.0; - double sum = 0.0; - for (i = 0; i < sz; i++) - { - sum += data[i]; - if (data[i] > max_value) - { - max_value = data[i]; - } - if (data[i] < min_value) - { - min_value = data[i]; - } - } - double mean = sum / sz; - sum = 0.0; - for (i = 0; i < sz; i++) - { - sum += ((data[i]-mean) * (data[i]-mean)); - } - std = sum / sz; - printf("The max_value of output: %lf\n", max_value); - printf("The min_value of output: %lf\n", min_value); - printf("The mean_value of output: %lf\n", mean); - printf("The std_value of output: %lf\n", std); -} - -static vsi_bool get_top - ( - float *pfProb, - float *pfMaxProb, - uint32_t *pMaxClass, - uint32_t outputCount, - uint32_t topNum - ) -{ - uint32_t i, j, k; - - #define MAX_TOP_NUM 20 - if (topNum > MAX_TOP_NUM) return FALSE; - - memset(pfMaxProb, 0xfe, sizeof(float) * topNum); - memset(pMaxClass, 0xff, sizeof(float) * topNum); - - for (j = 0; j < topNum; j++) - { - for (i=0; i *(pfMaxProb+j)) - { - *(pfMaxProb+j) = pfProb[i]; - *(pMaxClass+j) = i; - } - } - } - - return TRUE; -} - -uint64_t csi_get_perf_count() +void csi_ovx_show_top5(int index, struct csi_session *sess) { - struct timespec ts; - clock_gettime(CLOCK_MONOTONIC, &ts); - return (uint64_t)((uint64_t)ts.tv_nsec + (uint64_t)ts.tv_sec * BILLION); -} - -void csi_nn_show_top5(void *td, int index) -{ - uint32_t i,sz,stride; + uint32_t i, sz, stride; float *buffer = NULL; uint8_t *tensor_data = NULL; - uint32_t MaxClass[5]; - float fMaxProb[5]; - uint32_t topk = 5; + uint32_t class[5]; + float prob[5]; - vsi_nn_graph_t *graph = ((struct __target_data *)td)->graph; + vsi_nn_graph_t *graph = csi_ovx_get_graph(sess); vsi_nn_tensor_t *tensor = 
vsi_nn_GetTensor(graph, graph->output.tensors[index]); sz = 1; @@ -122,241 +36,190 @@ void csi_nn_show_top5(void *td, int index) sz *= tensor->attr.size[i]; } - if(topk > sz) - topk = sz; - stride = vsi_nn_TypeGetBytes(tensor->attr.dtype.vx_type); tensor_data = (uint8_t *)vsi_nn_ConvertTensorToData(graph, tensor); buffer = (float *)malloc(sizeof(float) * sz); - for(i = 0; i < sz; i++) { + for (i = 0; i < sz; i++) { vsi_nn_DtypeToFloat32(&tensor_data[stride * i], &buffer[i], &tensor->attr.dtype); } #ifdef DEBUG_TEST - get_statistical_data(buffer, sz); + csi_statistical_mean_std(buffer, sz); #endif - if (!get_top(buffer, fMaxProb, MaxClass, sz, topk)) - { - printf("Fail to show result.\n"); - exit(-1); - } + csi_get_top5(buffer, sz, prob, class); - printf(" --- Top%d ---\n", topk); - for(i = 0; i< topk; i++) { - printf("%3d: %8.6f\n", MaxClass[i], fMaxProb[i]); + printf(" --- Top ---\n"); + for(i = 0; i< 5; i++) { + printf("%3d: %8.6f\n", class[i], prob[i]); + } + if (tensor_data) { + vsi_nn_Free(tensor_data); + } + if (buffer) { + free(buffer); } - if(tensor_data)vsi_nn_Free(tensor_data); - if(buffer)free(buffer); } -void csi_nn_save_output(void *td, int index, const char *filename) +void csi_ovx_save_output(int index, const char *filename, struct csi_session *sess) { - vsi_nn_graph_t *graph = ((struct __target_data *)td)->graph; + vsi_nn_graph_t *graph = csi_ovx_get_graph(sess); vsi_nn_tensor_t *tensor; tensor = vsi_nn_GetTensor(graph, graph->output.tensors[index]); vsi_nn_SaveTensorToTextByFp32(graph, tensor, filename, NULL); } -int csi_nn_get_output_number(void *td) +int csi_ovx_get_output_number(struct csi_session *sess) { - return ((struct __target_data *)td)->output_num; + return sess->output_num; } -int csi_nn_get_input_number(void *td) +int csi_ovx_get_input_number(struct csi_session *sess) { - return ((struct __target_data *)td)->input_num; + return sess->input_num; } -struct csi_tensor *csi_nn_get_output(void *td, int index) +void 
csi_ovx_set_output_number(int number, struct csi_session *sess) { - struct csi_tensor *ret = malloc(sizeof(struct csi_tensor)); - vsi_nn_graph_t *graph = ((struct __target_data *)td)->graph; - vsi_nn_tensor_t *tensor = vsi_nn_GetTensor(graph, graph->output.tensors[index]); - - ret->dim_count = tensor->attr.dim_num; - for (int i = 0; i < ret->dim_count; i++) { - ret->dim[i] = tensor->attr.size[ret->dim_count - 1 - i]; - } - - ret->data = (uint8_t *)vsi_nn_ConvertTensorToData(graph, tensor); - ret->scale = tensor->attr.dtype.scale; - ret->zero_point = tensor->attr.dtype.zero_point; - if (tensor->attr.dtype.vx_type == VSI_NN_TYPE_UINT8) { - ret->dtype = CSINN_DTYPE_UINT8; - } else if (tensor->attr.dtype.vx_type == VSI_NN_TYPE_FLOAT32) { - ret->dtype = CSINN_DTYPE_FLOAT32; - } + vsi_nn_graph_t *graph = csi_ovx_get_graph(sess); + sess->output_num = number; + vsi_nn_SetGraphOutputs(graph, NULL, number); +} - return ret; +void csi_ovx_set_input_number(int number, struct csi_session *sess) +{ + vsi_nn_graph_t *graph = csi_ovx_get_graph(sess); + sess->input_num = number; + vsi_nn_SetGraphInputs(graph, NULL, number); } -struct csi_tensor *csi_nn_ovx_get_tensor(void *td, int index) +static int csi_ovx_get_tensor_internal(struct csi_tensor *ret, vsi_nn_tensor_t *tensor, + vsi_nn_graph_t *graph) { - struct csi_tensor *ret = malloc(sizeof(struct csi_tensor)); - vsi_nn_graph_t *graph = ((struct __target_data *)td)->graph; - vsi_nn_tensor_t *tensor = vsi_nn_GetTensor(graph, index); + if (ret->data == NULL) { + ret->data = (uint8_t *)vsi_nn_ConvertTensorToData(graph, tensor); + ret->dim_count = tensor->attr.dim_num; + for (int i = 0; i < ret->dim_count; i++) { + ret->dim[i] = tensor->attr.size[ret->dim_count - 1 - i]; + } - ret->dim_count = tensor->attr.dim_num; - for (int i = 0; i < ret->dim_count; i++) { - ret->dim[i] = tensor->attr.size[ret->dim_count - 1 - i]; - } + ret->scale = tensor->attr.dtype.scale; + ret->zero_point = tensor->attr.dtype.zero_point; + if 
(tensor->attr.dtype.vx_type == VSI_NN_TYPE_UINT8) { + ret->dtype = CSINN_DTYPE_UINT8; + } else if (tensor->attr.dtype.vx_type == VSI_NN_TYPE_FLOAT32) { + ret->dtype = CSINN_DTYPE_FLOAT32; + } + } else { + if (ret->dim_count != tensor->attr.dim_num) { + return CSINN_FALSE; + } + int size = 1; + for (int i = 0; i < ret->dim_count; i++) { + size *= tensor->attr.size[ret->dim_count - 1 - i]; + } - ret->data = (uint8_t *)vsi_nn_ConvertTensorToData(graph, tensor); - ret->scale = tensor->attr.dtype.scale; - ret->zero_point = tensor->attr.dtype.zero_point; - if (tensor->attr.dtype.vx_type == VSI_NN_TYPE_UINT8) { - ret->dtype = CSINN_DTYPE_UINT8; - } else if (tensor->attr.dtype.vx_type == VSI_NN_TYPE_FLOAT32) { - ret->dtype = CSINN_DTYPE_FLOAT32; + uint8_t *data = (uint8_t *)vsi_nn_ConvertTensorToData(graph, tensor); + if (tensor->attr.dtype.vx_type == VSI_NN_TYPE_UINT8) { + memcpy(ret->data, data, size); + } else if (tensor->attr.dtype.vx_type == VSI_NN_TYPE_FLOAT32) { + memcpy(ret->data, data, size * 4); + } } - - return ret; + return CSINN_TRUE; } -struct csi_tensor *csi_nn_get_input(void *td, int index) +int csi_ovx_get_output(int index, struct csi_tensor *output, struct csi_session *sess) { - struct csi_tensor *ret = malloc(sizeof(struct csi_tensor)); - vsi_nn_graph_t *graph = ((struct __target_data *)td)->graph; - vsi_nn_tensor_t *tensor = vsi_nn_GetTensor(graph, graph->input.tensors[index]); - - ret->dim_count = tensor->attr.dim_num; - for (int i = 0; i < ret->dim_count; i++) { - ret->dim[i] = tensor->attr.size[ret->dim_count - 1 - i]; - } - - ret->data = (uint8_t *)vsi_nn_ConvertTensorToData(graph, tensor); - ret->scale = tensor->attr.dtype.scale; - ret->zero_point = tensor->attr.dtype.zero_point; - if (tensor->attr.dtype.vx_type == VSI_NN_TYPE_UINT8) { - ret->dtype = CSINN_DTYPE_UINT8; - } else if (tensor->attr.dtype.vx_type == VSI_NN_TYPE_FLOAT32) { - ret->dtype = CSINN_DTYPE_FLOAT32; - } + vsi_nn_graph_t *graph = csi_ovx_get_graph(sess); + vsi_nn_tensor_t *tensor 
= vsi_nn_GetTensor(graph, graph->output.tensors[index]); - return ret; + return csi_ovx_get_tensor_internal(output, tensor, graph); } -static void update_graph(vsi_nn_graph_t *graph, struct __target_data *target_data) +int csi_ovx_get_tensor(int index, struct csi_tensor *ret, struct csi_session *sess) { - vsi_nn_node_t *node; - vsi_nn_node_t *prev_node; - int i, j; - int32_t prev_node_index; - int32_t output_index; - - // change the scales of reshape layer - const char *name = ""; - vsi_nn_tensor_t *curr_input_tensor; - vsi_nn_tensor_t *curr_output_tensor; - - vsi_nn_tensor_t *proposal_im_info_tensor; - vsi_nn_tensor_t *proposal_anchor_tensor; - for (i = 1; i < target_data->layer_num; i++) { - node = vsi_nn_GetNode(graph, i); - name = vsi_nn_OpGetName(node->op); - if (strcmp(name, "RESHAPE") == 0 || - strcmp(name, "PERMUTE") == 0) { - curr_input_tensor = vsi_nn_GetTensor(graph, node->input.tensors[0]); - curr_output_tensor = vsi_nn_GetTensor(graph, node->output.tensors[0]); - curr_output_tensor->attr.dtype.scale = curr_input_tensor->attr.dtype.scale; - curr_output_tensor->attr.dtype.zero_point = curr_input_tensor->attr.dtype.zero_point; - } - if (strcmp(name, "PROPOSAL") == 0) { - printf("current op: proposal\n"); - proposal_im_info_tensor = vsi_nn_GetTensor(graph, node->input.tensors[2]); - proposal_anchor_tensor = vsi_nn_GetTensor(graph, node->input.tensors[3]); + vsi_nn_graph_t *graph = csi_ovx_get_graph(sess); + vsi_nn_tensor_t *tensor = vsi_nn_GetTensor(graph, index); - proposal_im_info_tensor = NULL; - proposal_anchor_tensor = NULL; - } - } + return csi_ovx_get_tensor_internal(ret, tensor, graph); } -void quantize_input(struct csi_tensor *input, - struct csi_tensor *output) +int csi_ovx_get_input(int index, struct csi_tensor *input, struct csi_session *sess) { - float *input_data = input->data; - uint8_t *output_data = output->data; - int size = 1; - for (int i = 0; i < input->dim_count; i++) { - size = size * input->dim[i]; - } + vsi_nn_graph_t *graph = 
csi_ovx_get_graph(sess); + vsi_nn_tensor_t *tensor = vsi_nn_GetTensor(graph, graph->input.tensors[index]); - for (int i = 0; i < size; i++) { - uint8_t input_val = round(input_data[i] * output->scale) + output->offset; - output_data[i] = input_val; - } + return csi_ovx_get_tensor_internal(input, tensor, graph); } -int csi_nn_create_tensor(struct csi_tensor *input, - struct csi_tensor *output, - void *td) +void csi_ovx_set_tensor(struct csi_tensor *tensor, struct csi_session *sess) { - vsi_nn_graph_t *graph = ((struct __target_data *)td)->graph; + vsi_nn_graph_t *graph = csi_ovx_get_graph(sess); vsi_nn_tensor_attr_t attr; vsi_nn_tensor_id_t ret; - vsi_status status; - uint8_t *inputData; + uint8_t *input_data; uint32_t sz = 1; uint32_t stride = 1; int i = 0; - for (i = 0; i < input->dim_count; i++) { - attr.size[i] = input->dim[input->dim_count - 1 - i]; + for (i = 0; i < tensor->dim_count; i++) { + attr.size[i] = tensor->dim[tensor->dim_count - 1 - i]; } - attr.dim_num = input->dim_count; - attr.dtype.scale = output->scale; - attr.dtype.zero_point = output->zero_point; + attr.dim_num = tensor->dim_count; + attr.dtype.scale = tensor->scale; + attr.dtype.zero_point = tensor->zero_point; attr.dtype.qnt_type = VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC; attr.vtl = FALSE; attr.is_const = FALSE; attr.dtype.vx_type = VSI_NN_TYPE_UINT8; for (i = 0; i < 4; i++) { - sz *= input->dim[i]; + sz *= tensor->dim[i]; } stride = vsi_nn_TypeGetBytes(attr.dtype.vx_type); - inputData = (uint8_t *)malloc(stride * sz * sizeof(uint8_t)); + input_data = (uint8_t *)malloc(stride * sz * sizeof(uint8_t)); - ret = vsi_nn_AddTensor(graph, VSI_NN_TENSOR_ID_AUTO, &attr, inputData); - output->data = (void *)ret; - output->t_private = td; - return (int)ret; + ret = vsi_nn_AddTensor(graph, VSI_NN_TENSOR_ID_AUTO, &attr, input_data); + tensor->data = (void *)ret; + tensor->sess = sess; } -int csi_nn_ovx_create_const(struct csi_tensor *input, void *td) +void csi_ovx_set_const_tensor(struct csi_tensor *tensor, 
struct csi_session *sess) { - vsi_nn_graph_t *graph = ((struct __target_data *)td)->graph; + vsi_nn_graph_t *graph = csi_ovx_get_graph(sess); vsi_nn_tensor_attr_t attr; vsi_nn_tensor_id_t ret; - vsi_status status; - for (int i = 0; i < input->dim_count; i++) { - attr.size[i] = input->dim[input->dim_count - 1 - i]; + for (int i = 0; i < tensor->dim_count; i++) { + attr.size[i] = tensor->dim[tensor->dim_count - 1 - i]; } - attr.dim_num = input->dim_count; - attr.dtype.scale = input->scale; - attr.dtype.zero_point = input->zero_point; - attr.dtype.qnt_type = VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC; + attr.dim_num = tensor->dim_count; attr.vtl = FALSE; attr.is_const = TRUE; - attr.dtype.vx_type = VSI_NN_TYPE_UINT8; + if (tensor->dtype == CSINN_DTYPE_UINT8) { + attr.dtype.scale = tensor->scale; + attr.dtype.zero_point = tensor->zero_point; + attr.dtype.qnt_type = VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC; + attr.dtype.vx_type = VSI_NN_TYPE_UINT8; + } else { + attr.dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; + attr.dtype.vx_type = VSI_NN_TYPE_FLOAT32; + } - ret = vsi_nn_AddTensor(graph, VSI_NN_TENSOR_ID_AUTO, &attr, input->data); - input->data = (void *)ret; - input->t_private = td; - return (int)ret; + ret = vsi_nn_AddTensor(graph, VSI_NN_TENSOR_ID_AUTO, &attr, tensor->data); + tensor->data = (void *)ret; + tensor->sess = sess; } -uint8_t *csi_nn_input_f32_to_u8(uint32_t index, float *data, void *td) +uint8_t *csi_ovx_input_f32_to_u8(uint32_t index, float *data, struct csi_session *sess) { vsi_nn_tensor_t *tensor; vsi_status status = VSI_FAILURE; - vsi_nn_graph_t *graph = ((struct __target_data *)td)->graph; + vsi_nn_graph_t *graph = csi_ovx_get_graph(sess); tensor = vsi_nn_GetTensor(graph, graph->input.tensors[index]); uint32_t i = 0; uint8_t *tensorData; @@ -374,86 +237,61 @@ uint8_t *csi_nn_input_f32_to_u8(uint32_t index, float *data, void *td) return tensorData; } -static void _handle_multiple_inputs(vsi_nn_graph_t *graph, uint32_t idx, - uint8_t *input_data) +void 
csi_ovx_update_input(uint32_t idx, struct csi_tensor *input, struct csi_session *sess) { - vsi_nn_tensor_t *tensor; - vsi_status status = VSI_FAILURE; - tensor = NULL; - tensor = vsi_nn_GetTensor( graph, graph->input.tensors[idx] ); + vsi_nn_graph_t *graph = csi_ovx_get_graph(sess); + vsi_nn_tensor_t *tensor = vsi_nn_GetTensor(graph, graph->input.tensors[idx]); /* Copy the Pre-processed data to input tensor */ - status = vsi_nn_CopyDataToTensor(graph, tensor, input_data); -} - -void csi_nn_update_input(uint32_t idx, uint8_t *data, void *td) { - vsi_nn_graph_t *graph = ((struct __target_data *)td)->graph; - _handle_multiple_inputs(graph, idx, data); + vsi_nn_CopyDataToTensor(graph, tensor, input->data); } -void *csi_nn_presetup(int input, int output) +void csi_ovx_session_init(struct csi_session *sess) { vsi_nn_graph_t *graph; vsi_nn_context_t ctx; - struct __target_data *target_data = malloc(sizeof(struct __target_data)); - target_data->input_num = input; - target_data->output_num = output; - int32_t input_num = input; - int32_t output_num = output; + struct csi_ovx_target_data *target_data = calloc(sizeof(struct csi_ovx_target_data), 1); ctx = vsi_nn_CreateContext(); #define VNN_VERSION_MAJOR 1 #define VNN_VERSION_MINOR 1 #define VNN_VERSION_PATCH 12 +#define NET_TOTAL_TENSOR_NUM 0 +#define NET_NODE_NUM 0 graph = vsi_nn_CreateGraph(ctx, NET_TOTAL_TENSOR_NUM, NET_NODE_NUM); vsi_nn_SetGraphVersion(graph, VNN_VERSION_MAJOR, VNN_VERSION_MINOR, VNN_VERSION_PATCH); - vsi_nn_SetGraphInputs(graph, NULL, input_num); - vsi_nn_SetGraphOutputs(graph, NULL, output_num); target_data->graph = graph; - return target_data; + sess->td = target_data; + sess->base_dtype = CSINN_DTYPE_UINT8; + sess->base_layout = CSINN_NCHW; } -void csi_nn_init(struct csi_tensor *input, - struct csi_tensor *output) +void csi_ovx_session_setup(struct csi_session *sess) { -} - -void csi_nn_setup(void *td) -{ - vsi_nn_graph_t *graph = ((struct __target_data *)td)->graph; -// update_graph(graph, td); + 
vsi_nn_graph_t *graph = csi_ovx_get_graph(sess); vsi_nn_SetupGraph(graph, FALSE); vsi_nn_VerifyGraph(graph); } -void csi_nn_run(void *td) +void csi_ovx_session_run(struct csi_session *sess) { - vsi_nn_graph_t *graph = ((struct __target_data *)td)->graph; + vsi_nn_graph_t *graph = csi_ovx_get_graph(sess); uint64_t start_time, end_time; - start_time = csi_get_perf_count(); + start_time = csi_get_timespec(); vsi_nn_RunGraph(graph); - end_time = csi_get_perf_count(); + end_time = csi_get_timespec(); printf("Run graph execution time: %.5fms, FPS=%.2f\n", ((float)(end_time-start_time))/1000000, 1000000000.0/((float)(end_time-start_time))); } -void csi_nn_postprocess(void* td) -{ -} - -void csi_nn_deinit(struct csi_tensor *input, - struct csi_tensor *output) -{ -} - -void csi_nn_set_ovx_input(int index, int input, struct __target_data *td) +void csi_ovx_set_input(int index, struct csi_tensor *input, struct csi_session *sess) { - vsi_nn_graph_t *graph = td->graph; - graph->input.tensors[index] = input; + vsi_nn_graph_t *graph = csi_ovx_get_graph(sess); + graph->input.tensors[index] = (vsi_nn_tensor_id_t)input->data; } -void csi_nn_set_ovx_output(int index, struct csi_tensor *output, struct __target_data *td) +void csi_ovx_set_output(int index, struct csi_tensor *output, struct csi_session *sess) { - vsi_nn_graph_t *graph = td->graph; + vsi_nn_graph_t *graph = csi_ovx_get_graph(sess); vsi_nn_tensor_t *tensor; graph->output.tensors[index] = (vsi_nn_tensor_id_t)output->data; @@ -461,14 +299,248 @@ void csi_nn_set_ovx_output(int index, struct csi_tensor *output, struct __target tensor->attr.vtl = FALSE; } -void csi_ovx_free(struct __target_data *td) { - vsi_nn_graph_t *graph = td->graph; +void csi_ovx_session_deinit(struct csi_session *sess) +{ + vsi_nn_graph_t *graph = csi_ovx_get_graph(sess); vsi_nn_context_t ctx; if (graph) { ctx = graph->ctx; vsi_nn_ReleaseGraph(&graph); - vsi_nn_ReleaseContext(&ctx); } + free(sess->td); +} + +void *csi_ovx_get_graph(struct csi_session 
*sess) +{ + struct csi_ovx_target_data *td = sess->td; + return td->graph; +} + +void* csi_bc_map_table_ovx[CSINN_OP_SIZE][1] = { + {csi_ovx_abs}, /* CSINN_OP_ABS */ + {NULL}, /* CSINN_OP_ACOS */ + {NULL}, /* CSINN_OP_ACOSH */ + {csi_ovx_add}, /* CSINN_OP_ADD */ + {NULL}, /* CSINN_OP_ALL */ + {csi_ovx_and}, /* CSINN_OP_AND */ + {NULL}, /* CSINN_OP_ANY */ + {NULL}, /* CSINN_OP_ARANGE */ + {csi_ovx_argmax}, /* CSINN_OP_ARGMAX */ + {csi_ovx_argmin}, /* CSINN_OP_ARGMIN */ + {NULL}, /* CSINN_OP_ASIN */ + {NULL}, /* CSINN_OP_ASINH */ + {NULL}, /* CSINN_OP_ATAN */ + {NULL}, /* CSINN_OP_ATANH */ + {csi_ovx_averagepool}, /* CSINN_OP_AVGPOOL2D */ + {NULL}, /* CSINN_OP_AVGPOOL3D */ + {csi_ovx_batch_normalization}, /* CSINN_OP_BN */ + {csi_ovx_batch_to_space}, /* CSINN_OP_BATCH_TO_SPACE */ + {NULL}, /* CSINN_OP_BROADCOST */ + {NULL}, /* CSINN_OP_CEIL */ + {NULL}, /* CSINN_OP_CLIP */ + {NULL}, /* CSINN_OP_COL2IM */ + {csi_ovx_concat}, /* CSINN_OP_CONCAT */ + {csi_ovx_conv2d}, /* CSINN_OP_CONV2D */ + {NULL}, /* CSINN_OP_CONV2D_RELU */ + {NULL}, /* CSINN_OP_CONV2D_RELU6 */ + {NULL}, /* CSINN_OP_CONV2D_CHANNEL */ + {NULL}, /* CSINN_OP_CONV2D_CHANNEL_RELU */ + {NULL}, /* CSINN_OP_CONV2D_CHANNEL_RELU6 */ + {csi_ovx_depthwise_conv2d}, /* CSINN_OP_DEPTHWISE_CONV2D */ + {NULL}, /* CSINN_OP_DEPTHWISE_CONV2D_RELU */ + {NULL}, /* CSINN_OP_DEPTHWISE_CONV2D_RELU6 */ + {NULL}, /* CSINN_OP_DEPTHWISE_CONV2D_CHANNEL */ + {NULL}, /* CSINN_OP_DEPTHWISE_CONV2D_CHANNEL_RELU */ + {NULL}, /* CSINN_OP_DEPTHWISE_CONV2D_CHANNEL_RELU6 */ + {csi_ovx_group_conv2d}, /* CSINN_OP_GROUP_CONV2D */ + {NULL}, /* CSINN_OP_GROUP_CONV2D_RELU */ + {NULL}, /* CSINN_OP_GROUP_CONV2D_CHANNEL */ + {NULL}, /* CSINN_OP_GROUP_CONV2D_CHANNEL_RELU */ + {NULL}, /* CSINN_OP_CONV3D */ + {NULL}, /* CSINN_OP_COS */ + {NULL}, /* CSINN_OP_COSH */ + {NULL}, /* CSINN_OP_CUMPROD */ + {NULL}, /* CSINN_OP_CUMSUM */ + {csi_ovx_deconv2d}, /* CSINN_OP_DECONV2D */ + {csi_ovx_depthwise_deconv2d}, /* CSINN_OP_DEPTHWISE_DECONV2D */ + {NULL}, /* 
CSINN_OP_DECONV3D */ + {csi_ovx_depth_to_space}, /* CSINN_OP_DEPTH_TO_SPACE */ + {csi_ovx_div}, /* CSINN_OP_DIV */ + {csi_ovx_elu}, /* CSINN_OP_ELU */ + {csi_ovx_equal}, /* CSINN_OP_EQUANL */ + {NULL}, /* CSINN_OP_ERF */ + {csi_ovx_exp}, /* CSINN_OP_EXP */ + {csi_ovx_expand_dims_u8}, /* CSINN_OP_EXPAND_DIMS */ + {NULL}, /* CSINN_OP_EXPM1 */ + {csi_ovx_flatten}, /* CSINN_OP_FLATTEN */ + {csi_ovx_floor_divide}, /* CSINN_OP_FLOOR_DIVIDE */ + {NULL}, /* CSINN_OP_FLOOR_MOD */ + {csi_ovx_floor}, /* CSINN_OP_FLOOR */ + {csi_ovx_fullyconnected}, /* CSINN_OP_FULLYCONNECTED */ + {NULL}, /* CSINN_OP_GATHER_ND */ + {NULL}, /* CSINN_OP_GATHER */ + {csi_ovx_global_averagepool}, /* CSINN_OP_GLOBAL_AVGPOOL2D */ + {csi_ovx_global_maxpool}, /* CSINN_OP_GLOBAL_MAXPOOL2D */ + {csi_ovx_greater_equal}, /* CSINN_OP_GREATHER_EQUAL */ + {csi_ovx_greater}, /* CSINN_OP_GREATHER */ + {NULL}, /* CSINN_OP_HARD_SIGMOID */ + {NULL}, /* CSINN_OP_IM2COL */ + {NULL}, /* CSINN_OP_ISNAN */ + {csi_ovx_l2_normalization}, /* CSINN_OP_L2N */ + {csi_ovx_l2pool}, /* CSINN_OP_L2POOL2D */ + {csi_ovx_leaky_relu}, /* CSINN_OP_LEAKY_RELU */ + {csi_ovx_less_equal}, /* CSINN_OP_LESS_EQUAL */ + {csi_ovx_less}, /* CSINN_OP_LESS */ + {NULL}, /* CSINN_OP_LOG_SOFTMAX */ + {NULL}, /* CSINN_OP_LOG */ + {NULL}, /* CSINN_OP_LOG1P */ + {NULL}, /* CSINN_OP_LOGICAL_AND */ + {NULL}, /* CSINN_OP_LOGICAL_NOT */ + {NULL}, /* CSINN_OP_LOGICAL_OR */ + {NULL}, /* CSINN_OP_LOGICAL_XOR */ + {csi_ovx_lrn}, /* CSINN_OP_LRN */ + {csi_ovx_matmul}, /* CSINN_OP_MATMUL */ + {csi_ovx_max}, /* CSINN_OP_MAX */ + {csi_ovx_maximum}, /* CSINN_OP_MAXINUM */ + {csi_ovx_maxpool}, /* CSINN_OP_MAXPOOL2D */ + {csi_ovx_maxpool2d_locat}, /* CSINN_OP_MAXPOOL2D_LOCAT */ + {NULL}, /* CSINN_OP_MAXPOOL3D */ + {csi_ovx_mean}, /* CSINN_OP_MEAN */ + {csi_ovx_mean}, /* CSINN_OP_MEAN_STRIDE */ + {csi_ovx_min}, /* CSINN_OP_MIN */ + {NULL}, /* CSINN_OP_MIN_STRIDE */ + {csi_ovx_minimum}, /* CSINN_OP_MINIMUM */ + {NULL}, /* CSINN_OP_MOD */ + {csi_ovx_mul}, /* 
CSINN_OP_MUL */ + {NULL}, /* CSINN_OP_NDARRAY_SIZE */ + {csi_ovx_negative}, /* CSINN_OP_NEGATIIVE */ + {NULL}, /* CSINN_OP_NON_MAX_SUPPRESSION */ + {csi_ovx_not_equal}, /* CSINN_OP_NOT_EQUAL */ + {NULL}, /* CSINN_OP_NOT */ + {NULL}, /* CSINN_OP_ONE_HOT */ + {csi_ovx_or}, /* CSINN_OP_OR */ + {csi_ovx_pad}, /* CSINN_OP_PAD */ + {csi_ovx_power}, /* CSINN_OP_POWER */ + {csi_ovx_prelu}, /* CSINN_OP_PRELU */ + {csi_ovx_prod}, /* CSINN_OP_PROD */ + {csi_ovx_proposal}, /* CSINN_OP_PROPOSAL */ + {csi_ovx_psroipooling}, /* CSINN_OP_PSROIPOOLING */ + {NULL}, /* CSINN_OP_REDUCE_LOGSUMEXP */ + {NULL}, /* CSINN_OP_REDUCE_MAX */ + {NULL}, /* CSINN_OP_REDUCE_MEAN */ + {NULL}, /* CSINN_OP_REDUCE_MIN */ + {NULL}, /* CSINN_OP_REDUCE_PROD */ + {NULL}, /* CSINN_OP_REDUCE_SUM */ + {csi_ovx_relu}, /* CSINN_OP_RELU */ + {csi_ovx_relu1}, /* CSINN_OP_RELU1 */ + {csi_ovx_relu6}, /* CSINN_OP_RELU6 */ + {csi_ovx_relun}, /* CSINN_OP_RELUN */ + {csi_ovx_reorg}, /* CSINN_OP_REORG */ + {csi_ovx_reshape}, /* CSINN_OP_RESHAPE */ + {csi_ovx_resize}, /* CSINN_OP_RESIZE */ + {csi_ovx_reverse}, /* CSINN_OP_REVERSE */ + {NULL}, /* CSINN_OP_ROIALIGN */ + {NULL}, /* CSINN_OP_ROIPOOL */ + {NULL}, /* CSINN_OP_ROUND */ + {csi_ovx_rsqrt}, /* CSINN_OP_RSQRT */ + {NULL}, /* CSINN_OP_SEGMENT_MAX */ + {NULL}, /* CSINN_OP_UNSORTED_SEGMENT_MAX */ + {NULL}, /* CSINN_OP_SEGMENT_MEAN */ + {NULL}, /* CSINN_OP_UNSORTED_SEGMENT_MEAN */ + {NULL}, /* CSINN_OP_SEGMENT_MIN */ + {NULL}, /* CSINN_OP_UNSORTED_SEGMENT_MIN */ + {NULL}, /* CSINN_OP_SEGMENT_PROD */ + {NULL}, /* CSINN_OP_UNSORTED_SEGMENT_PROD */ + {NULL}, /* CSINN_OP_SEGMENT_SUM */ + {NULL}, /* CSINN_OP_UNSORTED_SEGMENT_SUM */ + {csi_ovx_select}, /* CSINN_OP_SELECT */ + {NULL}, /* CSINN_OP_SEQUENCE_MASK */ + {NULL}, /* CSINN_OP_SHAPE */ + {NULL}, /* CSINN_OP_SHUFFLE_CHANNEL */ + {csi_ovx_sigmoid}, /* CSINN_OP_SIGMOID */ + {NULL}, /* CSINN_OP_SIGN */ + {NULL}, /* CSINN_OP_SIN */ + {NULL}, /* CSINN_OP_SINH */ + {csi_ovx_slice}, /* CSINN_OP_SLICE */ + {csi_ovx_softmax}, 
/* CSINN_OP_SOFTMAX */ + {csi_ovx_softplus}, /* CSINN_OP_SOFTPLUS */ + {NULL}, /* CSINN_OP_SOFTRELU */ + {NULL}, /* CSINN_OP_SOFTSIGN */ + {csi_ovx_space_to_batch}, /* CSINN_OP_SPACE_TO_BATCH */ + {csi_ovx_space_to_depth}, /* CSINN_OP_SPACE_TO_DEPTH */ + {csi_ovx_split}, /* CSINN_OP_SPLIT */ + {csi_ovx_sqrt}, /* CSINN_OP_SQRT */ + {csi_ovx_square}, /* CSINN_OP_SQUARE */ + {csi_ovx_squeeze}, /* CSINN_OP_SQUEEZE */ + {csi_ovx_stack}, /* CSINN_OP_STACK */ + {NULL}, /* CSINN_OP_STRIDED_SLICE */ + {csi_ovx_sub}, /* CSINN_OP_SUB */ + {csi_ovx_sum}, /* CSINN_OP_SUM */ + {NULL}, /* CSINN_OP_TAN */ + {csi_ovx_tanh}, /* CSINN_OP_TANH */ + {NULL}, /* CSINN_OP_THRESHOLD_RELU */ + {csi_ovx_tile}, /* CSINN_OP_TILE */ + {NULL}, /* CSINN_OP_TOPK */ + {csi_ovx_transpose}, /* CSINN_OP_TRANSPOSE */ + {NULL}, /* CSINN_OP_TRUNC */ + {csi_ovx_unpooling}, /* CSINN_OP_UNPOOLING */ + {csi_ovx_unstack}, /* CSINN_OP_UNSTACK */ + {NULL}, /* CSINN_OP_WHERE */ + {NULL}, /* CSINN_OP_XOR */ + {NULL}, /* CSINN_OP_YUV_RGB_SCALE */ + + /* utils functions */ + {csi_ovx_session_init}, + {csi_ovx_session_deinit}, + {csi_ovx_session_setup}, + {csi_ovx_session_run}, + {csi_ovx_update_input}, + {csi_ovx_set_input_number}, + {csi_ovx_set_output_number}, + {csi_ovx_get_input_number}, + {csi_ovx_get_output_number}, + {csi_ovx_set_input}, + {csi_ovx_set_output}, + {csi_ovx_get_input}, + {csi_ovx_get_output}, +}; + +void *csi_bc_map_ovx(int op, int dtype) +{ + int dt; + switch (dtype) { + case CSINN_DTYPE_UINT8: + dt = 0; + break; + default: + return NULL; + } + + return csi_bc_map_table_ovx[op][dt]; } + +void csi_ovx_nbg(struct csi_tensor **input, struct csi_tensor **output, + uint32_t inputs_count, uint32_t outputs_count, const char *url) +{ + vsi_nn_node_t *node; + vsi_nn_graph_t *graph = csi_ovx_get_graph(input[0]->sess); + + uint32_t input_num = inputs_count; + uint32_t output_num = outputs_count; + node = vsi_nn_AddNode(graph, VSI_NN_OP_NBG, input_num, output_num, NULL); + node->nn_param.nbg.type = 
VSI_NN_NBG_FILE; + node->nn_param.nbg.url = url; + + /* input */ + for (uint32_t i = 0; i < input_num; i++) { + node->input.tensors[i] = (vsi_nn_tensor_id_t)input[i]->data; + } + + /* output */ + for (uint32_t i = 0; i < output_num; i++) { + node->output.tensors[i] = (vsi_nn_tensor_id_t)output[i]->data; + } +} + diff --git a/source/openvx/sigmoid.c b/source/openvx/sigmoid.c index 74e0fd61..e2eaa6a5 100644 --- a/source/openvx/sigmoid.c +++ b/source/openvx/sigmoid.c @@ -27,9 +27,8 @@ int csi_ovx_sigmoid(struct csi_tensor *input, vsi_nn_node_id_t node_id; vsi_nn_tensor_attr_t attr; vsi_nn_tensor_id_t output_id; - struct __target_data *td = input->t_private; - output->t_private = td; - vsi_nn_graph_t *graph = td->graph; + vsi_nn_graph_t *graph = csi_ovx_get_graph(input->sess); + output->sess = input->sess; uint32_t input_num = 1; uint32_t output_num = 1; node = vsi_nn_AddNode(graph, VSI_NN_OP_SIGMOID, input_num, output_num, &node_id); diff --git a/source/openvx/slice.c b/source/openvx/slice.c index 09b56743..9325fc44 100644 --- a/source/openvx/slice.c +++ b/source/openvx/slice.c @@ -26,9 +26,8 @@ int csi_ovx_slice(struct csi_tensor *input, vsi_nn_node_id_t node_id; vsi_nn_tensor_attr_t attr; vsi_nn_tensor_id_t output_id; - struct __target_data *td = input->t_private; - output->t_private = td; - vsi_nn_graph_t *graph = td->graph; + vsi_nn_graph_t *graph = csi_ovx_get_graph(input->sess); + output->sess = input->sess; uint32_t input_num = 1; uint32_t output_num = 1; int i; @@ -66,9 +65,8 @@ int csi_ovx_slice_tail(struct csi_tensor *input, vsi_nn_node_id_t node_id; vsi_nn_tensor_attr_t attr; vsi_nn_tensor_id_t output_id; - struct __target_data *td = input->t_private; - output->t_private = td; - vsi_nn_graph_t *graph = td->graph; + vsi_nn_graph_t *graph = csi_ovx_get_graph(input->sess); + output->sess = input->sess; uint32_t input_num = 1; uint32_t output_num = 1; int i; diff --git a/source/openvx/softmax.c b/source/openvx/softmax.c index e2c46576..42ebf63a 100644 --- 
a/source/openvx/softmax.c +++ b/source/openvx/softmax.c @@ -27,9 +27,8 @@ int csi_ovx_softmax(struct csi_tensor *input, vsi_nn_tensor_id_t input_id; vsi_nn_tensor_id_t output_id; vsi_nn_tensor_attr_t attr; - struct __target_data *td = input->t_private; - output->t_private = td; - vsi_nn_graph_t *graph = td->graph; + vsi_nn_graph_t *graph = csi_ovx_get_graph(input->sess); + output->sess = input->sess; uint32_t input_num = 1; uint32_t output_num = 1; node = vsi_nn_AddNode(graph, VSI_NN_OP_SOFTMAX, input_num, output_num, &node_id); diff --git a/source/openvx/softplus.c b/source/openvx/softplus.c index 15446b2e..c9283b8f 100644 --- a/source/openvx/softplus.c +++ b/source/openvx/softplus.c @@ -26,9 +26,8 @@ int csi_ovx_softplus(struct csi_tensor *input, vsi_nn_node_id_t node_id; vsi_nn_tensor_attr_t attr; vsi_nn_tensor_id_t output_id; - struct __target_data *td = input->t_private; - output->t_private = td; - vsi_nn_graph_t *graph = td->graph; + vsi_nn_graph_t *graph = csi_ovx_get_graph(input->sess); + output->sess = input->sess; uint32_t input_num = 1; uint32_t output_num = 1; node = vsi_nn_AddNode(graph, VSI_NN_OP_SOFTRELU, input_num, output_num, &node_id); diff --git a/source/openvx/space_to_batch.c b/source/openvx/space_to_batch.c index fe2c8cd7..e62d4b65 100644 --- a/source/openvx/space_to_batch.c +++ b/source/openvx/space_to_batch.c @@ -26,9 +26,8 @@ int csi_ovx_space_to_batch(struct csi_tensor *input, vsi_nn_node_id_t node_id; vsi_nn_tensor_attr_t attr; vsi_nn_tensor_id_t output_id; - struct __target_data *td = input->t_private; - output->t_private = td; - vsi_nn_graph_t *graph = td->graph; + vsi_nn_graph_t *graph = csi_ovx_get_graph(input->sess); + output->sess = input->sess; uint32_t input_num = 1; uint32_t output_num = 1; node = vsi_nn_AddNode(graph, VSI_NN_OP_REORG, input_num, output_num, &node_id); diff --git a/source/openvx/space_to_depth.c b/source/openvx/space_to_depth.c index eda087be..5b0d2b95 100644 --- a/source/openvx/space_to_depth.c +++ 
b/source/openvx/space_to_depth.c @@ -27,9 +27,8 @@ int csi_ovx_space_to_depth(struct csi_tensor *input, vsi_nn_node_id_t node_id; vsi_nn_tensor_attr_t attr; vsi_nn_tensor_id_t output_id; - struct __target_data *td = input->t_private; - output->t_private = td; - vsi_nn_graph_t *graph = td->graph; + vsi_nn_graph_t *graph = csi_ovx_get_graph(input->sess); + output->sess = input->sess; uint32_t input_num = 1; uint32_t output_num = 1; node = vsi_nn_AddNode(graph, VSI_NN_OP_REORG, input_num, output_num, &node_id); diff --git a/source/openvx/split.c b/source/openvx/split.c index 85938ba9..3705cd39 100644 --- a/source/openvx/split.c +++ b/source/openvx/split.c @@ -19,7 +19,7 @@ #include "csi_ovx.h" int csi_ovx_split(struct csi_tensor *input, - struct csi_tensor *output, + struct csi_tensor **output, struct split_params *params) { vsi_nn_node_t *node; @@ -27,8 +27,7 @@ int csi_ovx_split(struct csi_tensor *input, vsi_nn_tensor_id_t input_id; vsi_nn_tensor_attr_t attr; vsi_nn_tensor_id_t output_id; - struct __target_data *td = input->t_private; - vsi_nn_graph_t *graph = td->graph; + vsi_nn_graph_t *graph = csi_ovx_get_graph(input->sess); uint32_t input_num = 1; uint32_t split_count = params->output_num; int i = 0; @@ -54,8 +53,8 @@ int csi_ovx_split(struct csi_tensor *input, /* output */ for (i = 0; i < split_count; i++) { - attr.dtype.scale = output[i].scale; - attr.dtype.zero_point = output[i].zero_point; + attr.dtype.scale = output[i]->scale; + attr.dtype.zero_point = output[i]->zero_point; attr.dtype.qnt_type = VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC; memset(attr.size, 0, VSI_NN_MAX_DIM_NUM * sizeof(uint32_t)); attr.dim_num = VSI_NN_DIM_AUTO; @@ -64,7 +63,7 @@ int csi_ovx_split(struct csi_tensor *input, attr.dtype.vx_type = VSI_NN_TYPE_UINT8; output_id = vsi_nn_AddTensor(graph, VSI_NN_TENSOR_ID_AUTO, &attr, NULL); node->output.tensors[i] = output_id; - output[i].data = (void *)output_id; - output[i].t_private = td; + output[i]->data = (void *)output_id; + output[i]->sess = 
input->sess; } } diff --git a/source/openvx/sqrt.c b/source/openvx/sqrt.c index efd788b7..6d23bc52 100644 --- a/source/openvx/sqrt.c +++ b/source/openvx/sqrt.c @@ -27,9 +27,8 @@ int csi_ovx_sqrt(struct csi_tensor *input, vsi_nn_node_id_t node_id; vsi_nn_tensor_attr_t attr; vsi_nn_tensor_id_t output_id; - struct __target_data *td = input->t_private; - output->t_private = td; - vsi_nn_graph_t *graph = td->graph; + vsi_nn_graph_t *graph = csi_ovx_get_graph(input->sess); + output->sess = input->sess; uint32_t input_num = 1; uint32_t output_num = 1; node = vsi_nn_AddNode(graph, VSI_NN_OP_SQRT, input_num, output_num, &node_id); diff --git a/source/openvx/square.c b/source/openvx/square.c index 96d0d0c6..f8ba4154 100644 --- a/source/openvx/square.c +++ b/source/openvx/square.c @@ -27,9 +27,8 @@ int csi_ovx_square(struct csi_tensor *input, vsi_nn_node_id_t node_id; vsi_nn_tensor_attr_t attr; vsi_nn_tensor_id_t output_id; - struct __target_data *td = input->t_private; - output->t_private = td; - vsi_nn_graph_t *graph = td->graph; + vsi_nn_graph_t *graph = csi_ovx_get_graph(input->sess); + output->sess = input->sess; uint32_t input_num = 1; uint32_t output_num = 1; node = vsi_nn_AddNode(graph, VSI_NN_OP_SQUARE, input_num, output_num, &node_id); diff --git a/source/openvx/squeeze.c b/source/openvx/squeeze.c index 3cc82028..2d75a5d4 100644 --- a/source/openvx/squeeze.c +++ b/source/openvx/squeeze.c @@ -29,9 +29,8 @@ int csi_ovx_squeeze(struct csi_tensor *input, vsi_nn_tensor_id_t input_id; vsi_nn_tensor_attr_t attr; vsi_nn_tensor_id_t output_id; - struct __target_data *td = input->t_private; - output->t_private = td; - vsi_nn_graph_t *graph = td->graph; + vsi_nn_graph_t *graph = csi_ovx_get_graph(input->sess); + output->sess = input->sess; uint32_t input_num = 1; uint32_t output_num = 1; int i = 0; @@ -73,9 +72,8 @@ int csi_ovx_squeeze_tail(struct csi_tensor *input, vsi_nn_tensor_id_t input_id; vsi_nn_tensor_attr_t attr; vsi_nn_tensor_id_t output_id; - struct __target_data *td 
= input->t_private; - output->t_private = td; - vsi_nn_graph_t *graph = td->graph; + vsi_nn_graph_t *graph = csi_ovx_get_graph(input->sess); + output->sess = input->sess; uint32_t input_num = 1; uint32_t output_num = 1; int i = 0; diff --git a/source/openvx/stack.c b/source/openvx/stack.c index 9391e156..9de464b5 100644 --- a/source/openvx/stack.c +++ b/source/openvx/stack.c @@ -27,9 +27,8 @@ int csi_ovx_stack(struct csi_tensor *input, vsi_nn_node_id_t node_id; vsi_nn_tensor_attr_t attr; vsi_nn_tensor_id_t output_id; - struct __target_data *td = input->t_private; - output->t_private = td; - vsi_nn_graph_t *graph = td->graph; + vsi_nn_graph_t *graph = csi_ovx_get_graph(input->sess); + output->sess = input->sess; uint32_t input_num = params->inputs_count; uint32_t output_num = 1; node = vsi_nn_AddNode(graph, VSI_NN_OP_STACK, input_num, output_num, &node_id); diff --git a/source/openvx/sub.c b/source/openvx/sub.c index a42ed694..a144e1df 100644 --- a/source/openvx/sub.c +++ b/source/openvx/sub.c @@ -28,9 +28,8 @@ int csi_ovx_sub(struct csi_tensor *input0, vsi_nn_node_id_t node_id; vsi_nn_tensor_attr_t attr; vsi_nn_tensor_id_t output_id; - struct __target_data *td = input0->t_private; - output->t_private = td; - vsi_nn_graph_t *graph = td->graph; + vsi_nn_graph_t *graph = csi_ovx_get_graph(input0->sess); + output->sess = input0->sess; uint32_t input_num = 2; uint32_t output_num = 1; node = vsi_nn_AddNode(graph, VSI_NN_OP_SUBTRACT, input_num, output_num, &node_id); diff --git a/source/openvx/sum.c b/source/openvx/sum.c index 9f43e041..e5b9416a 100644 --- a/source/openvx/sum.c +++ b/source/openvx/sum.c @@ -28,9 +28,8 @@ int csi_ovx_sum(struct csi_tensor *input, vsi_nn_tensor_id_t input_id; vsi_nn_tensor_attr_t attr; vsi_nn_tensor_id_t output_id; - struct __target_data *td = input->t_private; - output->t_private = td; - vsi_nn_graph_t *graph = td->graph; + vsi_nn_graph_t *graph = csi_ovx_get_graph(input->sess); + output->sess = input->sess; uint32_t input_num = 1; 
uint32_t output_num = 1; int i = 0; diff --git a/source/openvx/tanh.c b/source/openvx/tanh.c index cefdd896..f7a9b928 100644 --- a/source/openvx/tanh.c +++ b/source/openvx/tanh.c @@ -28,9 +28,8 @@ int csi_ovx_tanh(struct csi_tensor *input, vsi_nn_node_id_t node_id; vsi_nn_tensor_attr_t attr; vsi_nn_tensor_id_t output_id; - struct __target_data *td = input->t_private; - output->t_private = td; - vsi_nn_graph_t *graph = td->graph; + vsi_nn_graph_t *graph = csi_ovx_get_graph(input->sess); + output->sess = input->sess; uint32_t input_num = 1; uint32_t output_num = 1; node = vsi_nn_AddNode(graph, VSI_NN_OP_TANH, input_num, output_num, &node_id); diff --git a/source/openvx/tile.c b/source/openvx/tile.c index 09e56c6a..4b9e4169 100644 --- a/source/openvx/tile.c +++ b/source/openvx/tile.c @@ -28,9 +28,8 @@ int csi_ovx_tile(struct csi_tensor *input, vsi_nn_node_id_t node_id; vsi_nn_tensor_attr_t attr; vsi_nn_tensor_id_t output_id; - struct __target_data *td = input->t_private; - output->t_private = td; - vsi_nn_graph_t *graph = td->graph; + vsi_nn_graph_t *graph = csi_ovx_get_graph(input->sess); + output->sess = input->sess; uint32_t input_num = 1; uint32_t output_num = 1; node = vsi_nn_AddNode(graph, VSI_NN_OP_TILE, input_num, output_num, &node_id); diff --git a/source/openvx/transpose.c b/source/openvx/transpose.c index 27d4b76a..5920c0af 100644 --- a/source/openvx/transpose.c +++ b/source/openvx/transpose.c @@ -26,9 +26,8 @@ int csi_ovx_transpose(struct csi_tensor *input, vsi_nn_node_id_t node_id; vsi_nn_tensor_attr_t attr; vsi_nn_tensor_id_t output_id; - struct __target_data *td = input->t_private; - output->t_private = td; - vsi_nn_graph_t *graph = td->graph; + vsi_nn_graph_t *graph = csi_ovx_get_graph(input->sess); + output->sess = input->sess; uint32_t input_num = 1; uint32_t output_num = 1; node = vsi_nn_AddNode(graph, VSI_NN_OP_PERMUTE, input_num, output_num, &node_id); diff --git a/source/openvx/unpool.c b/source/openvx/unpool.c index ea35eb23..886f6825 100644 --- 
a/source/openvx/unpool.c +++ b/source/openvx/unpool.c @@ -28,9 +28,8 @@ int csi_ovx_unpooling(struct csi_tensor *input, vsi_nn_tensor_id_t input_id; vsi_nn_tensor_attr_t attr; vsi_nn_tensor_id_t output_id; - struct __target_data *td = input->t_private; - output->t_private = td; - vsi_nn_graph_t *graph = td->graph; + vsi_nn_graph_t *graph = csi_ovx_get_graph(input->sess); + output->sess = input->sess; uint32_t input_num = 2; uint32_t output_num = 1; node = vsi_nn_AddNode(graph, VSI_NN_OP_UPSAMPLE, input_num, output_num, &node_id); diff --git a/source/openvx/unstack.c b/source/openvx/unstack.c index 4a0578f4..4c7810bc 100644 --- a/source/openvx/unstack.c +++ b/source/openvx/unstack.c @@ -20,7 +20,7 @@ #include "csi_ovx.h" int csi_ovx_unstack(struct csi_tensor *input, - struct csi_tensor *outputs, + struct csi_tensor *output, struct unstack_params *params) { vsi_nn_node_t *node; @@ -28,14 +28,13 @@ int csi_ovx_unstack(struct csi_tensor *input, vsi_nn_tensor_id_t input_id; vsi_nn_tensor_attr_t attr; vsi_nn_tensor_id_t output_id; - struct __target_data *td = input->t_private; - output->t_private = td; - vsi_nn_graph_t *graph = td->graph; + vsi_nn_graph_t *graph = csi_ovx_get_graph(input->sess); + output->sess = input->sess; uint32_t input_num = 1; - uint32_t output_num = outputs_count; + uint32_t output_num = params->outputs_count; int i = 0; node = vsi_nn_AddNode(graph, VSI_NN_OP_UNSTACK, input_num, output_num, &node_id); - node->nn_param.unstack.axis = axis; + node->nn_param.unstack.axis = params->axis; attr.dtype.fmt = VSI_NN_DIM_FMT_NCHW; @@ -43,7 +42,7 @@ int csi_ovx_unstack(struct csi_tensor *input, node->input.tensors[0] = (vsi_nn_tensor_id_t)input->data; /* output */ - for (i = 0; i < outputs_count; i++) { + for (i = 0; i < output_num; i++) { attr.dtype.scale = 1; attr.dtype.zero_point = 0; attr.dtype.qnt_type = VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC; @@ -54,6 +53,6 @@ int csi_ovx_unstack(struct csi_tensor *input, attr.dtype.vx_type = VSI_NN_TYPE_UINT8; output_id = 
vsi_nn_AddTensor(graph, VSI_NN_TENSOR_ID_AUTO, &attr, NULL); node->output.tensors[i] = output_id; - output[i]->data = (void *)output_id; + output[i].data = (void *)output_id; } } diff --git a/source/reference/abs.c b/source/reference/abs.c index 23b89723..438bc28d 100644 --- a/source/reference/abs.c +++ b/source/reference/abs.c @@ -19,9 +19,9 @@ #include "csi_nn.h" #include "csi_utils.h" -static int csi_abs_f32(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csi_abs_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct siso_params *params) { float *input_data = input->data; float *output_data = output->data; @@ -36,9 +36,9 @@ static int csi_abs_f32(struct csi_tensor *input, return CSINN_TRUE; } -static int csi_abs_u8(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csi_abs_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct siso_params *params) { uint8_t *input_data = input->data; uint8_t *output_data = output->data; @@ -48,9 +48,9 @@ static int csi_abs_u8(struct csi_tensor *input, } for (int i = 0; i < size; i++) { - float input_val = csi_dequantize_f32(input_data[i], input->offset, input->multiplier, input->shift); + float input_val = csi_dequantize_u8_to_f32(input_data[i], input->zero_point, input->multiplier, input->shift); float abs_val = fabs(input_val); - output_data[i] = csi_quantize_f32(abs_val, output->offset, output->multiplier, output->shift); + output_data[i] = csi_quantize_f32_to_u8(abs_val, output->zero_point, output->multiplier, output->shift); } return CSINN_TRUE; } @@ -59,19 +59,16 @@ int csi_abs_init(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params) { - if (input->dtype == CSINN_DTYPE_UINT8) { - params->bc = csi_abs_u8; - } else if (input->dtype == CSINN_DTYPE_FLOAT32) { - params->bc = csi_abs_f32; - } else { + params->bc = csi_bc_map(params->api, CSINN_OP_ABS, input->dtype); + if (params->bc == NULL) { 
return CSINN_UNSUPPORT_DTYPE; } return CSINN_TRUE; } int csi_abs(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) + struct csi_tensor *output, + struct siso_params *params) { if (params->bc != NULL) { params->bc(input, output, params); diff --git a/source/reference/acos.c b/source/reference/acos.c index 63b920b3..8be8f7c7 100644 --- a/source/reference/acos.c +++ b/source/reference/acos.c @@ -20,9 +20,9 @@ #include "csi_utils.h" #include -static int csi_acos_f32(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csi_acos_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct siso_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -37,9 +37,9 @@ static int csi_acos_f32(struct csi_tensor *input, return CSINN_TRUE; } -static int csi_acos_u8(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csi_acos_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct siso_params *params) { uint8_t *input_data = (uint8_t *)input->data; uint8_t *output_data = (uint8_t *)output->data; @@ -49,24 +49,21 @@ static int csi_acos_u8(struct csi_tensor *input, } for (int i = 0; i < size; i++) { - float input0_val = csi_dequantize_f32(input_data[i], input->offset, input->multiplier, + float input0_val = csi_dequantize_u8_to_f32(input_data[i], input->zero_point, input->multiplier, input->shift); float res = acos(input0_val); - output_data[i] = csi_quantize_f32(res, output->offset, output->multiplier, output->shift); + output_data[i] = csi_quantize_f32_to_u8(res, output->zero_point, output->multiplier, output->shift); } return CSINN_TRUE; } int csi_acos_init(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) + struct csi_tensor *output, + struct siso_params *params) { - if (input->dtype == CSINN_DTYPE_UINT8) { - params->bc = csi_acos_u8; - } else if (input->dtype == 
CSINN_DTYPE_FLOAT32) { - params->bc = csi_acos_f32; - } else { + params->bc = csi_bc_map(params->api, CSINN_OP_ACOS, input->dtype); + if (params->bc == NULL) { return CSINN_UNSUPPORT_DTYPE; } return CSINN_TRUE; diff --git a/source/reference/acosh.c b/source/reference/acosh.c index 64ebd60b..dade1d11 100644 --- a/source/reference/acosh.c +++ b/source/reference/acosh.c @@ -20,9 +20,9 @@ #include "csi_utils.h" #include -static int csi_acosh_f32(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csi_acosh_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct siso_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -37,9 +37,9 @@ static int csi_acosh_f32(struct csi_tensor *input, return CSINN_TRUE; } -static int csi_acosh_u8(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csi_acosh_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct siso_params *params) { uint8_t *input_data = (uint8_t *)input->data; uint8_t *output_data = (uint8_t *)output->data; @@ -49,32 +49,29 @@ static int csi_acosh_u8(struct csi_tensor *input, } for (int i = 0; i < size; i++) { - float input0_val = csi_dequantize_f32(input_data[i], input->offset, input->multiplier, + float input0_val = csi_dequantize_u8_to_f32(input_data[i], input->zero_point, input->multiplier, input->shift); float res = acosh(input0_val); - output_data[i] = csi_quantize_f32(res, output->offset, output->multiplier, output->shift); + output_data[i] = csi_quantize_f32_to_u8(res, output->zero_point, output->multiplier, output->shift); } return CSINN_TRUE; } int csi_acosh_init(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) + struct csi_tensor *output, + struct siso_params *params) { - if (input->dtype == CSINN_DTYPE_UINT8) { - params->bc = csi_acosh_u8; - } else if (input->dtype == CSINN_DTYPE_FLOAT32) { - params->bc = csi_acosh_f32; 
- } else { + params->bc = csi_bc_map(params->api, CSINN_OP_ACOSH, input->dtype); + if (params->bc == NULL) { return CSINN_UNSUPPORT_DTYPE; } return CSINN_TRUE; } int csi_acosh(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) + struct csi_tensor *output, + struct siso_params *params) { if (params->bc != NULL) { params->bc(input, output, params); diff --git a/source/reference/add.c b/source/reference/add.c index 7f132027..b88b8b9f 100644 --- a/source/reference/add.c +++ b/source/reference/add.c @@ -19,10 +19,10 @@ #include "csi_nn.h" #include "csi_utils.h" -static int csi_add_f32(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int csi_add_f32(struct csi_tensor *input0, + struct csi_tensor *input1, + struct csi_tensor *output, + struct diso_params *params) { float *input0_data = input0->data; float *input1_data = input1->data; @@ -58,10 +58,10 @@ static int csi_add_f32(struct csi_tensor *input0, return CSINN_TRUE; } -static int csi_add_u8(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int csi_add_u8(struct csi_tensor *input0, + struct csi_tensor *input1, + struct csi_tensor *output, + struct diso_params *params) { uint8_t *input0_data = input0->data; uint8_t *input1_data = input1->data; @@ -89,11 +89,11 @@ static int csi_add_u8(struct csi_tensor *input0, if(size0 == size1){ for (int i = 0; i < size0; i++) { float input0_val = - csi_dequantize_f32(input0_data[i], input0->offset, input0->multiplier, input0->shift); + csi_dequantize_u8_to_f32(input0_data[i], input0->zero_point, input0->multiplier, input0->shift); float input1_val = - csi_dequantize_f32(input1_data[i], input1->offset, input1->multiplier, input1->shift); + csi_dequantize_u8_to_f32(input1_data[i], input1->zero_point, input1->multiplier, input1->shift); float res = input0_val + input1_val; - output_data[i] = csi_quantize_f32(res, 
output->offset, output->multiplier, output->shift); + output_data[i] = csi_quantize_f32_to_u8(res, output->zero_point, output->multiplier, output->shift); } } else if(input1->dim[axis] == channel && size1 == input1->dim[axis]){ @@ -106,13 +106,13 @@ static int csi_add_u8(struct csi_tensor *input0, else if (params->layout == CSINN_NCHW){channel = h;} float input1_val = - csi_dequantize_f32(input1_data[channel], input1->offset, input1->multiplier, input1->shift); + csi_dequantize_u8_to_f32(input1_data[channel], input1->zero_point, input1->multiplier, input1->shift); int index = csi_get_index(input0->dim, n, h, w, c); float input0_val = - csi_dequantize_f32(input0_data[index], input0->offset, input0->multiplier, input0->shift); + csi_dequantize_u8_to_f32(input0_data[index], input0->zero_point, input0->multiplier, input0->shift); float res = input0_val + input1_val; - output_data[index] = csi_quantize_f32(res, output->offset, output->multiplier, output->shift); + output_data[index] = csi_quantize_f32_to_u8(res, output->zero_point, output->multiplier, output->shift); } } } @@ -126,11 +126,8 @@ int csi_add_init(struct csi_tensor *input0, struct csi_tensor *output, struct diso_params *params) { - if (input0->dtype == CSINN_DTYPE_UINT8) { - params->bc = csi_add_u8; - } else if (input0->dtype == CSINN_DTYPE_FLOAT32) { - params->bc = csi_add_f32; - } else { + params->bc = csi_bc_map(params->api, CSINN_OP_ADD, input0->dtype); + if (params->bc == NULL) { return CSINN_UNSUPPORT_DTYPE; } return CSINN_TRUE; diff --git a/source/reference/and.c b/source/reference/and.c index 913366f7..60eb7f4f 100644 --- a/source/reference/and.c +++ b/source/reference/and.c @@ -19,10 +19,10 @@ #include "csi_nn.h" #include -static int csi_and_u32(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int csi_and_u32(struct csi_tensor *input0, + struct csi_tensor *input1, + struct csi_tensor *output, + struct diso_params *params) { uint32_t 
*input0_data = input0->data; uint32_t *input1_data = input1->data; @@ -38,10 +38,10 @@ static int csi_and_u32(struct csi_tensor *input0, return CSINN_TRUE; } -static int csi_and_u8(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int csi_and_u8(struct csi_tensor *input0, + struct csi_tensor *input1, + struct csi_tensor *output, + struct diso_params *params) { uint8_t *input0_data = input0->data; uint8_t *input1_data = input1->data; @@ -62,11 +62,8 @@ int csi_and_init(struct csi_tensor *input0, struct csi_tensor *output, struct diso_params *params) { - if (input0->dtype == CSINN_DTYPE_UINT8) { - params->bc = csi_and_u8; - } else if (input0->dtype == CSINN_DTYPE_UINT32) { - params->bc = csi_and_u32; - } else { + params->bc = csi_bc_map(params->api, CSINN_OP_AND, input0->dtype); + if (params->bc == NULL) { return CSINN_UNSUPPORT_DTYPE; } return CSINN_TRUE; diff --git a/source/reference/arange.c b/source/reference/arange.c index 7af1d7f1..7ed87bbb 100644 --- a/source/reference/arange.c +++ b/source/reference/arange.c @@ -19,7 +19,8 @@ #include "csi_nn.h" #include "csi_utils.h" -static int csi_arange_f32(struct csi_tensor *output, struct arange_params *params) +int csi_arange_f32(struct csi_tensor *output, + struct arange_params *params) { float_t * data = output->data; int j = 0; @@ -40,12 +41,12 @@ static int csi_arange_f32(struct csi_tensor *output, struct arange_params *param return CSINN_TRUE; } -static int csi_arange_u8(struct csi_tensor *output, - struct arange_params *params) +int csi_arange_u8(struct csi_tensor *output, + struct arange_params *params) { - float start = csi_dequantize_f32(1.0, 0, params->start_multiplier, params->start_shift); - float stop = csi_dequantize_f32(1.0, 0, params->stop_multiplier, params->stop_shift); - float step = csi_dequantize_f32(1.0, 0, params->step_multiplier, params->step_shift); + float start = csi_dequantize_u8_to_f32(1.0, 0, params->start_multiplier, 
params->start_shift); + float stop = csi_dequantize_u8_to_f32(1.0, 0, params->stop_multiplier, params->stop_shift); + float step = csi_dequantize_u8_to_f32(1.0, 0, params->step_multiplier, params->step_shift); uint8_t * data = output->data; int j = 0; @@ -59,7 +60,7 @@ static int csi_arange_u8(struct csi_tensor *output, break; } - data[j] = csi_quantize_f32(i, output->offset, output->multiplier, output->shift); + data[j] = csi_quantize_f32_to_u8(i, output->zero_point, output->multiplier, output->shift); i+=step; j++; } @@ -69,11 +70,8 @@ static int csi_arange_u8(struct csi_tensor *output, int csi_arange_init(struct csi_tensor *output, struct arange_params *params) { - if (output->dtype == CSINN_DTYPE_UINT8) { - params->bc = csi_arange_u8; - } else if (output->dtype == CSINN_DTYPE_FLOAT32) { - params->bc = csi_arange_f32; - } else { + params->bc = csi_bc_map(params->api, CSINN_OP_ARANGE, output->dtype); + if (params->bc == NULL) { return CSINN_UNSUPPORT_DTYPE; } return CSINN_TRUE; diff --git a/source/reference/argmax.c b/source/reference/argmax.c index e330283e..c0fc499f 100644 --- a/source/reference/argmax.c +++ b/source/reference/argmax.c @@ -32,9 +32,10 @@ static struct ArgPos fargmax_stride(struct ArgPos lhs, struct ArgPos rhs) { return lhs; } -static int csi_argmax_stride_i32_f32(struct csi_tensor *input, struct csi_tensor *output, - struct reduce_params *params) { - +int csi_argmax_stride_i32_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct reduce_params *params) +{ float *input_data = input->data; float *output_data = output->data; @@ -66,9 +67,10 @@ static int csi_argmax_stride_i32_f32(struct csi_tensor *input, struct csi_tensor return CSINN_TRUE; } -static int csi_argmax_stride_i32_u8(struct csi_tensor *input, struct csi_tensor *output, - struct reduce_params *params) { - +int csi_argmax_stride_i32_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct reduce_params *params) +{ uint8_t *input_data = input->data; int32_t 
*output_data = output->data; @@ -90,7 +92,7 @@ static int csi_argmax_stride_i32_u8(struct csi_tensor *input, struct csi_tensor for (int32_t inner = 0; inner < inner_size; inner++) { int32_t index = out_index + get_reduction_index(inner, params->inner_strides, params->inner_extents, params->m); - float val = csi_dequantize_f32(input_data[index], input->offset, + float val = csi_dequantize_u8_to_f32(input_data[index], input->zero_point, input->multiplier, input->shift); struct ArgPos pos = {val, inner}; result = fargmax_stride(result, pos); @@ -108,11 +110,8 @@ int csi_argmax_init(struct csi_tensor *input, if (params->n == 0 && params->m == 0) { return CSINN_FALSE; } else { - if (input->dtype == CSINN_DTYPE_UINT8) { - params->bc = csi_argmax_stride_i32_u8; - } else if (input->dtype == CSINN_DTYPE_FLOAT32) { - params->bc = csi_argmax_stride_i32_f32; - } else { + params->bc = csi_bc_map(params->api, CSINN_OP_ARGMAX, input->dtype); + if (params->bc == NULL) { return CSINN_UNSUPPORT_DTYPE; } } diff --git a/source/reference/argmin.c b/source/reference/argmin.c index 148e27f2..79f3b961 100644 --- a/source/reference/argmin.c +++ b/source/reference/argmin.c @@ -31,11 +31,10 @@ static struct ArgPos fargmin_stride(struct ArgPos lhs, struct ArgPos rhs) { return lhs; } -static int csi_argmin_stride_i32_f32(struct csi_tensor *input, - struct csi_tensor *output, - struct reduce_params *params) +int csi_argmin_stride_i32_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct reduce_params *params) { - float *input_data = input->data; float *output_data = output->data; @@ -67,9 +66,9 @@ static int csi_argmin_stride_i32_f32(struct csi_tensor *input, return CSINN_TRUE; } -static int csi_argmin_stride_i32_u8(struct csi_tensor *input, - struct csi_tensor *output, - struct reduce_params *params) +int csi_argmin_stride_i32_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct reduce_params *params) { uint8_t *input_data = input->data; @@ -93,7 +92,7 @@ static int 
csi_argmin_stride_i32_u8(struct csi_tensor *input, for (int32_t inner = 0; inner < inner_size; inner++) { int32_t index = out_index + get_reduction_index(inner, params->inner_strides, params->inner_extents, params->m); - float val = csi_dequantize_f32(input_data[index], input->offset, + float val = csi_dequantize_u8_to_f32(input_data[index], input->zero_point, input->multiplier, input->shift); struct ArgPos pos = {val, inner}; result = fargmin_stride(result, pos); @@ -111,11 +110,8 @@ int csi_argmin_init(struct csi_tensor *input, if (params->n == 0 && params->m == 0) { return CSINN_FALSE; } else { - if (input->dtype == CSINN_DTYPE_UINT8) { - params->bc = csi_argmin_stride_i32_u8; - } else if (input->dtype == CSINN_DTYPE_FLOAT32) { - params->bc = csi_argmin_stride_i32_f32; - } else { + params->bc = csi_bc_map(params->api, CSINN_OP_ARGMIN, input->dtype); + if (params->bc == NULL) { return CSINN_UNSUPPORT_DTYPE; } } diff --git a/source/reference/asin.c b/source/reference/asin.c index 36112297..ce7b6203 100644 --- a/source/reference/asin.c +++ b/source/reference/asin.c @@ -20,9 +20,9 @@ #include "csi_utils.h" #include -static int csi_asin_f32(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csi_asin_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct siso_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -37,9 +37,9 @@ static int csi_asin_f32(struct csi_tensor *input, return CSINN_TRUE; } -static int csi_asin_u8(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csi_asin_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct siso_params *params) { uint8_t *input_data = (uint8_t *)input->data; uint8_t *output_data = (uint8_t *)output->data; @@ -49,32 +49,29 @@ static int csi_asin_u8(struct csi_tensor *input, } for (int i = 0; i < size; i++) { - float input0_val = csi_dequantize_f32(input_data[i], 
input->offset, input->multiplier, - input->shift); + float input0_val = csi_dequantize_u8_to_f32(input_data[i], input->zero_point, input->multiplier, + input->shift); float res = asin(input0_val); - output_data[i] = csi_quantize_f32(res, output->offset, output->multiplier, output->shift); + output_data[i] = csi_quantize_f32_to_u8(res, output->zero_point, output->multiplier, output->shift); } return CSINN_TRUE; } int csi_asin_init(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) + struct csi_tensor *output, + struct siso_params *params) { - if (input->dtype == CSINN_DTYPE_UINT8) { - params->bc = csi_asin_u8; - } else if (input->dtype == CSINN_DTYPE_FLOAT32) { - params->bc = csi_asin_f32; - } else { + params->bc = csi_bc_map(params->api, CSINN_OP_ASIN, input->dtype); + if (params->bc == NULL) { return CSINN_UNSUPPORT_DTYPE; } return CSINN_TRUE; } int csi_asin(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) + struct csi_tensor *output, + struct siso_params *params) { if (params->bc != NULL) { params->bc(input, output, params); diff --git a/source/reference/asinh.c b/source/reference/asinh.c index 9edc4aa1..d9ec9794 100644 --- a/source/reference/asinh.c +++ b/source/reference/asinh.c @@ -20,9 +20,9 @@ #include "csi_utils.h" #include -static int csi_asinh_f32(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csi_asinh_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct siso_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -37,9 +37,9 @@ static int csi_asinh_f32(struct csi_tensor *input, return CSINN_TRUE; } -static int csi_asinh_u8(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csi_asinh_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct siso_params *params) { uint8_t *input_data = (uint8_t *)input->data; uint8_t *output_data = 
(uint8_t *)output->data; @@ -49,32 +49,29 @@ static int csi_asinh_u8(struct csi_tensor *input, } for (int i = 0; i < size; i++) { - float input0_val = csi_dequantize_f32(input_data[i], input->offset, input->multiplier, + float input0_val = csi_dequantize_u8_to_f32(input_data[i], input->zero_point, input->multiplier, input->shift); float res = asinh(input0_val); - output_data[i] = csi_quantize_f32(res, output->offset, output->multiplier, output->shift); + output_data[i] = csi_quantize_f32_to_u8(res, output->zero_point, output->multiplier, output->shift); } return CSINN_TRUE; } int csi_asinh_init(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) + struct csi_tensor *output, + struct siso_params *params) { - if (input->dtype == CSINN_DTYPE_UINT8) { - params->bc = csi_asinh_u8; - } else if (input->dtype == CSINN_DTYPE_FLOAT32) { - params->bc = csi_asinh_f32; - } else { + params->bc = csi_bc_map(params->api, CSINN_OP_ASINH, input->dtype); + if (params->bc == NULL) { return CSINN_UNSUPPORT_DTYPE; } return CSINN_TRUE; } int csi_asinh(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) + struct csi_tensor *output, + struct siso_params *params) { if (params->bc != NULL) { params->bc(input, output, params); diff --git a/source/reference/atan.c b/source/reference/atan.c index 89ee9bb7..8da67e82 100644 --- a/source/reference/atan.c +++ b/source/reference/atan.c @@ -20,9 +20,9 @@ #include "csi_utils.h" #include -static int csi_atan_f32(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csi_atan_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct siso_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -37,9 +37,9 @@ static int csi_atan_f32(struct csi_tensor *input, return CSINN_TRUE; } -static int csi_atan_u8(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int 
csi_atan_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct siso_params *params) { uint8_t *input_data = (uint8_t *)input->data; uint8_t *output_data = (uint8_t *)output->data; @@ -49,32 +49,29 @@ static int csi_atan_u8(struct csi_tensor *input, } for (int i = 0; i < size; i++) { - float input0_val = csi_dequantize_f32(input_data[i], input->offset, input->multiplier, + float input0_val = csi_dequantize_u8_to_f32(input_data[i], input->zero_point, input->multiplier, input->shift); float res = atan(input0_val); - output_data[i] = csi_quantize_f32(res, output->offset, output->multiplier, output->shift); + output_data[i] = csi_quantize_f32_to_u8(res, output->zero_point, output->multiplier, output->shift); } return CSINN_TRUE; } int csi_atan_init(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) + struct csi_tensor *output, + struct siso_params *params) { - if (input->dtype == CSINN_DTYPE_UINT8) { - params->bc = csi_atan_u8; - } else if (input->dtype == CSINN_DTYPE_FLOAT32) { - params->bc = csi_atan_f32; - } else { + params->bc = csi_bc_map(params->api, CSINN_OP_ATAN, input->dtype); + if (params->bc == NULL) { return CSINN_UNSUPPORT_DTYPE; } return CSINN_TRUE; } int csi_atan(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) + struct csi_tensor *output, + struct siso_params *params) { if (params->bc != NULL) { params->bc(input, output, params); diff --git a/source/reference/atanh.c b/source/reference/atanh.c index 7189d9c9..78d2925a 100644 --- a/source/reference/atanh.c +++ b/source/reference/atanh.c @@ -20,9 +20,9 @@ #include "csi_utils.h" #include -static int csi_atanh_f32(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csi_atanh_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct siso_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -37,9 +37,9 @@ static int 
csi_atanh_f32(struct csi_tensor *input, return CSINN_TRUE; } -static int csi_atanh_u8(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csi_atanh_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct siso_params *params) { uint8_t *input_data = (uint8_t *)input->data; uint8_t *output_data = (uint8_t *)output->data; @@ -49,32 +49,29 @@ static int csi_atanh_u8(struct csi_tensor *input, } for (int i = 0; i < size; i++) { - float input0_val = csi_dequantize_f32(input_data[i], input->offset, input->multiplier, + float input0_val = csi_dequantize_u8_to_f32(input_data[i], input->zero_point, input->multiplier, input->shift); float res = atanh(input0_val); - output_data[i] = csi_quantize_f32(res, output->offset, output->multiplier, output->shift); + output_data[i] = csi_quantize_f32_to_u8(res, output->zero_point, output->multiplier, output->shift); } return CSINN_TRUE; } int csi_atanh_init(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) + struct csi_tensor *output, + struct siso_params *params) { - if (input->dtype == CSINN_DTYPE_UINT8) { - params->bc = csi_atanh_u8; - } else if (input->dtype == CSINN_DTYPE_FLOAT32) { - params->bc = csi_atanh_f32; - } else { + params->bc = csi_bc_map(params->api, CSINN_OP_ATANH, input->dtype); + if (params->bc == NULL) { return CSINN_UNSUPPORT_DTYPE; } return CSINN_TRUE; } int csi_atanh(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) + struct csi_tensor *output, + struct siso_params *params) { if (params->bc != NULL) { params->bc(input, output, params); diff --git a/source/reference/averagepool.c b/source/reference/averagepool.c index 0d7f4bba..7d14d9a8 100644 --- a/source/reference/averagepool.c +++ b/source/reference/averagepool.c @@ -19,9 +19,9 @@ #include "csi_nn.h" #include "csi_utils.h" -static int csi_averagepool_nhwc_f32(struct csi_tensor *input, - struct csi_tensor *output, - struct pool_params *params) 
+int csi_averagepool_nhwc_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct pool_params *params) { float *input_data = input->data; float *output_data = output->data; @@ -67,9 +67,9 @@ static int csi_averagepool_nhwc_f32(struct csi_tensor *input, return CSINN_TRUE; } -static int csi_averagepool_nhwc_u8(struct csi_tensor *input, - struct csi_tensor *output, - struct pool_params *params) +int csi_averagepool_nhwc_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct pool_params *params) { uint8_t *input_data = input->data; uint8_t *output_data = output->data; @@ -80,10 +80,10 @@ static int csi_averagepool_nhwc_u8(struct csi_tensor *input, const int output_height = output->dim[1]; const int output_width = output->dim[2]; - const int32_t input_offset = input->offset; + const int32_t input_offset = input->zero_point; const int32_t input_multiplier = input->multiplier; const int32_t input_shift = input->shift; - const int32_t output_offset = output->offset; + const int32_t output_offset = output->zero_point; const int32_t output_multiplier = output->multiplier; const int32_t output_shift = output->shift; @@ -109,14 +109,14 @@ static int csi_averagepool_nhwc_u8(struct csi_tensor *input, const int in_y = in_y_origin + filter_y; uint8_t input_val = input_data[csi_get_index(input->dim, batch, in_y, in_x, channel)]; - total += csi_dequantize_f32(input_val, input_offset, input_multiplier, + total += csi_dequantize_u8_to_f32(input_val, input_offset, input_multiplier, input_shift); filter_count++; } } float average = filter_count == 0 ? 
total : total / filter_count; output_data[csi_get_index(output->dim, batch, out_y, out_x, channel)] = - csi_quantize_f32(average, output_offset, output_multiplier, output_shift); + csi_quantize_f32_to_u8(average, output_offset, output_multiplier, output_shift); } } } @@ -125,8 +125,8 @@ static int csi_averagepool_nhwc_u8(struct csi_tensor *input, } static int csi_averagepool_nchw_f32(struct csi_tensor *input, - struct csi_tensor *output, - struct pool_params *params) + struct csi_tensor *output, + struct pool_params *params) { float *input_data = input->data; float *output_data = output->data; @@ -173,13 +173,13 @@ static int csi_averagepool_nchw_f32(struct csi_tensor *input, } static int csi_averagepool_nchw_u8(struct csi_tensor *o_input, - struct csi_tensor *o_output, - struct pool_params *params) + struct csi_tensor *o_output, + struct pool_params *params) { struct csi_tensor* input; struct csi_tensor* output; - input = csi_nchw_to_nhwc_u8(o_input); - output = csi_nchw_to_nhwc_u8(o_output); + input = csi_nchw_to_nhwc_8(o_input); + output = csi_nchw_to_nhwc_8(o_output); uint8_t *input_data = input->data; uint8_t *output_data = output->data; @@ -190,10 +190,10 @@ static int csi_averagepool_nchw_u8(struct csi_tensor *o_input, const int output_height = output->dim[1]; const int output_width = output->dim[2]; - const int32_t input_offset = input->offset; + const int32_t input_offset = input->zero_point; const int32_t input_multiplier = input->multiplier; const int32_t input_shift = input->shift; - const int32_t output_offset = output->offset; + const int32_t output_offset = output->zero_point; const int32_t output_multiplier = output->multiplier; const int32_t output_shift = output->shift; @@ -219,45 +219,57 @@ static int csi_averagepool_nchw_u8(struct csi_tensor *o_input, const int in_y = in_y_origin + filter_y; uint8_t input_val = input_data[csi_get_index(input->dim, batch, in_y, in_x, channel)]; - total += csi_dequantize_f32(input_val, input_offset, 
input_multiplier, + total += csi_dequantize_u8_to_f32(input_val, input_offset, input_multiplier, input_shift); filter_count++; } } float average = filter_count == 0 ? total : total / filter_count; output_data[csi_get_index(output->dim, batch, out_y, out_x, channel)] = - csi_quantize_f32(average, output_offset, output_multiplier, output_shift); + csi_quantize_f32_to_u8(average, output_offset, output_multiplier, output_shift); } } } } - csi_nhwc_to_nchw_u8(o_output, output); + csi_nhwc_to_nchw_8(o_output, output); return CSINN_TRUE; } -int csi_averagepool_init(struct csi_tensor *input, - struct csi_tensor *output, - struct pool_params *params) +int csi_averagepool_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct pool_params *params) { if (params->layout == CSINN_NCHW) { - if (input->dtype == CSINN_DTYPE_UINT8) { - params->bc = csi_averagepool_nchw_u8; - } else if (input->dtype == CSINN_DTYPE_FLOAT32) { - params->bc = csi_averagepool_nchw_f32; - } else { - return CSINN_UNSUPPORT_DTYPE; - } + csi_averagepool_nchw_f32(input, output, params); } else if (params->layout = CSINN_NHWC) { - if (input->dtype == CSINN_DTYPE_UINT8) { - params->bc = csi_averagepool_nhwc_u8; - } else if (input->dtype == CSINN_DTYPE_FLOAT32) { - params->bc = csi_averagepool_nhwc_f32; - } else { - return CSINN_UNSUPPORT_DTYPE; - } + csi_averagepool_nhwc_f32(input, output, params); + } else { + return CSINN_UNSUPPORT_LAYOUT; + } +} + +int csi_averagepool_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct pool_params *params) +{ + if (params->layout == CSINN_NCHW) { + csi_averagepool_nchw_u8(input, output, params); + } else if (params->layout = CSINN_NHWC) { + csi_averagepool_nhwc_u8(input, output, params); } else { return CSINN_UNSUPPORT_LAYOUT; } +} + +int csi_averagepool_init(struct csi_tensor *input, + struct csi_tensor *output, + struct pool_params *params) +{ + params->bc = csi_bc_map(params->api, CSINN_OP_AVGPOOL2D, input->dtype); + if (params->bc == NULL) { + 
return CSINN_UNSUPPORT_DTYPE; + } + return CSINN_TRUE; } diff --git a/source/reference/averagepool3d.c b/source/reference/averagepool3d.c index 4c8488b4..881dd5dc 100644 --- a/source/reference/averagepool3d.c +++ b/source/reference/averagepool3d.c @@ -19,10 +19,9 @@ #include "csi_nn.h" #include "csi_utils.h" - -static int csi_averagepool3d_ncdhw_f32(struct csi_tensor *input, - struct csi_tensor *output, - struct pool_params *params) +int csi_averagepool3d_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct pool_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -80,9 +79,9 @@ static int csi_averagepool3d_ncdhw_f32(struct csi_tensor *input, return CSINN_TRUE; } -static int csi_averagepool3d_ncdhw_u8(struct csi_tensor *input, - struct csi_tensor *output, - struct pool_params *params) +int csi_averagepool3d_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct pool_params *params) { uint8_t *input_data = (uint8_t *)input->data; uint8_t *output_data = (uint8_t *)output->data; @@ -124,14 +123,14 @@ static int csi_averagepool3d_ncdhw_u8(struct csi_tensor *input, int in_h = in_h_origin + filter_h; int in_w = in_w_origin + filter_w; uint8_t input_val = input_data[csi_get_index_5(input->dim, in_ch, out_ch, in_d, in_h, in_w)]; - total += csi_dequantize_f32(input_val, input->offset, input->multiplier, input->shift); + total += csi_dequantize_u8_to_f32(input_val, input->zero_point, input->multiplier, input->shift); // filter_cnt++; } } } // float average = filter_cnt==0 ? 
total : total/filter_cnt; float average = total/filter_cnt; - uint8_t output_val = csi_quantize_f32(average, output->offset, output->multiplier, output->shift); + uint8_t output_val = csi_quantize_f32_to_u8(average, output->zero_point, output->multiplier, output->shift); output_data[csi_get_index_5(output->dim, in_ch, out_ch, out_d, out_h, out_w)] = output_val; } } @@ -142,57 +141,20 @@ static int csi_averagepool3d_ncdhw_u8(struct csi_tensor *input, return CSINN_TRUE; } -static int csi_averagepool3d_ndhwc_f32(struct csi_tensor *input, - struct csi_tensor *output, - struct pool_params *params) -{ - float *input_data = (float *)input->data; - float *output_data = (float *)output->data; - - return CSINN_FALSE; -} - -static int csi_averagepool3d_ndhwc_u8(struct csi_tensor *input, - struct csi_tensor *output, - struct pool_params *params) -{ - uint8_t *input_data = (uint8_t *)input->data; - uint8_t *output_data = (uint8_t *)output->data; - - return CSINN_FALSE; -} - - int csi_averagepool3d_init(struct csi_tensor *input, struct csi_tensor *output, struct pool_params *params) { - if(params->layout == CSINN_NCDHW) { - if(input->dtype == CSINN_DTYPE_UINT8) { - params->bc = csi_averagepool3d_ncdhw_u8; - } else if(input->dtype == CSINN_DTYPE_FLOAT32) { - params->bc = csi_averagepool3d_ncdhw_f32; - } else { - return CSINN_UNSUPPORT_DTYPE; - } - } else if(params->layout == CSINN_NDHWC) { - if(input->dtype == CSINN_DTYPE_UINT8) { - params->bc = csi_averagepool3d_ndhwc_u8; - } else if(input->dtype == CSINN_DTYPE_FLOAT32) { - params->bc = csi_averagepool3d_ndhwc_f32; - } else { - return CSINN_UNSUPPORT_DTYPE; - } - } else { - return CSINN_UNSUPPORT_LAYOUT; + params->bc = csi_bc_map(params->api, CSINN_OP_AVGPOOL3D, input->dtype); + if (params->bc == NULL) { + return CSINN_UNSUPPORT_DTYPE; } return CSINN_TRUE; } - int csi_averagepool3d(struct csi_tensor *input, - struct csi_tensor *output, - struct pool_params *params) + struct csi_tensor *output, + struct pool_params *params) { 
if(params->bc !=NULL) { params->bc(input, output, params); diff --git a/source/reference/batch_normalization.c b/source/reference/batch_normalization.c index dcfebf62..22edcb68 100644 --- a/source/reference/batch_normalization.c +++ b/source/reference/batch_normalization.c @@ -20,13 +20,13 @@ #include "csi_utils.h" /* https://github.com/tensorflow/tensorflow/blob/v2.3.0/tensorflow/python/ops/nn_impl.py#L1474-L1542 */ -static int csi_batch_normalization_nhwc_f32(struct csi_tensor *input, - struct csi_tensor *mean, - struct csi_tensor *variance, - struct csi_tensor *gamma, - struct csi_tensor *beta, - struct csi_tensor *output, - struct bn_params *params) +int csi_batch_normalization_f32(struct csi_tensor *input, + struct csi_tensor *mean, + struct csi_tensor *variance, + struct csi_tensor *gamma, + struct csi_tensor *beta, + struct csi_tensor *output, + struct bn_params *params) { float *input_data = input->data; float *mean_data = mean->data; @@ -63,13 +63,13 @@ static int csi_batch_normalization_nhwc_f32(struct csi_tensor *input, return CSINN_TRUE; } -static int csi_batch_normalization_nhwc_u8(struct csi_tensor *input, - struct csi_tensor *mean, - struct csi_tensor *variance, - struct csi_tensor *gamma, - struct csi_tensor *beta, - struct csi_tensor *output, - struct bn_params *params) +int csi_batch_normalization_u8(struct csi_tensor *input, + struct csi_tensor *mean, + struct csi_tensor *variance, + struct csi_tensor *gamma, + struct csi_tensor *beta, + struct csi_tensor *output, + struct bn_params *params) { uint8_t *input_data = input->data; uint8_t *mean_data = mean->data; @@ -88,23 +88,23 @@ static int csi_batch_normalization_nhwc_u8(struct csi_tensor *input, for (int b = 0; b < batches; ++b) { for (int c = 0; c < input->dim[dims_count - 1]; ++c) { - float intput_val = csi_dequantize_f32(input_data[b * batch_offset + c], input->offset, + float intput_val = csi_dequantize_u8_to_f32(input_data[b * batch_offset + c], input->zero_point, input->multiplier, 
input->shift); - float mean_val = csi_dequantize_f32(mean_data[c], mean->offset, mean->multiplier, + float mean_val = csi_dequantize_u8_to_f32(mean_data[c], mean->zero_point, mean->multiplier, mean->shift); - float var_val = csi_dequantize_f32(var_data[c], variance->offset, variance->multiplier, + float var_val = csi_dequantize_u8_to_f32(var_data[c], variance->zero_point, variance->multiplier, variance->shift); - float beta_val = csi_dequantize_f32(beta_data[c], beta->offset, beta->multiplier, + float beta_val = csi_dequantize_u8_to_f32(beta_data[c], beta->zero_point, beta->multiplier, beta->shift); float result = 1/sqrt(var_val + params->epsilon); result *= (intput_val - mean_val); if (gamma != NULL) { uint8_t *gamma_data = gamma->data; - result *= csi_dequantize_f32(gamma_data[c], gamma->offset, gamma->multiplier, + result *= csi_dequantize_u8_to_f32(gamma_data[c], gamma->zero_point, gamma->multiplier, gamma->shift); } result += beta_val; - output_data[b * batch_offset + c] = csi_quantize_f32(result, output->offset, + output_data[b * batch_offset + c] = csi_quantize_f32_to_u8(result, output->zero_point, output->multiplier, output->shift); } } @@ -121,21 +121,14 @@ int csi_batch_normalization_init(struct csi_tensor *input, struct csi_tensor *output, struct bn_params *params) { - if (input->dtype == CSINN_DTYPE_UINT8) { - if (params->layout == CSINN_NHWC) { - params->bc = csi_batch_normalization_nhwc_u8; - } else if (params->layout == CSINN_NCHW) { - return CSINN_UNSUPPORT_DTYPE; - } - } else if (input->dtype == CSINN_DTYPE_FLOAT32) { - if (params->layout == CSINN_NHWC) { - params->bc = csi_batch_normalization_nhwc_f32; - } else if (params->layout == CSINN_NCHW) { - return CSINN_UNSUPPORT_DTYPE; - } - } else { + if (params->layout == CSINN_NCHW) { return CSINN_UNSUPPORT_DTYPE; } + params->bc = csi_bc_map(params->api, CSINN_OP_BN, input->dtype); + if (params->bc == NULL) { + return CSINN_UNSUPPORT_DTYPE; + } + return CSINN_TRUE; } diff --git 
a/source/reference/batch_to_space.c b/source/reference/batch_to_space.c index 5a48f137..99c96198 100644 --- a/source/reference/batch_to_space.c +++ b/source/reference/batch_to_space.c @@ -20,9 +20,9 @@ #include "csi_utils.h" //the input->data is a 4-D Tensor with shape [batch, depth, height, width]. -static int csi_batch_to_space_f32(struct csi_tensor *input, - struct csi_tensor *output, - struct batch_to_space_params *params) +int csi_batch_to_space_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct batch_to_space_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -69,9 +69,9 @@ static int csi_batch_to_space_f32(struct csi_tensor *input, return CSINN_TRUE; } -static int csi_batch_to_space_u8(struct csi_tensor *input, - struct csi_tensor *output, - struct batch_to_space_params *params) +int csi_batch_to_space_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct batch_to_space_params *params) { uint8_t *input_data = (uint8_t *)input->data; uint8_t *output_data = (uint8_t *)output->data; @@ -107,8 +107,8 @@ static int csi_batch_to_space_u8(struct csi_tensor *input, if(h_now >= 0 && h_now < out_height && w_now >= 0 && w_now < out_width) { int out_addr = csi_get_index(output->dim, out_b, out_c, h_now, w_now); // output_data[out_addr] = temp[h * block_size + w]; - output_data[out_addr] = csi_requantize_u8(temp[h * block_size + w], input->offset, input->multiplier, input->shift, - output->offset, output->multiplier, output->shift); + output_data[out_addr] = csi_requantize_u8(temp[h * block_size + w], input->zero_point, input->multiplier, input->shift, + output->zero_point, output->multiplier, output->shift); } } } @@ -124,11 +124,8 @@ int csi_batch_to_space_init(struct csi_tensor *input, struct csi_tensor *output, struct batch_to_space_params *params) { - if (input->dtype == CSINN_DTYPE_UINT8) { - params->bc = csi_batch_to_space_u8; - } else if (input->dtype == CSINN_DTYPE_FLOAT32) { - 
params->bc = csi_batch_to_space_f32; - } else { + params->bc = csi_bc_map(params->api, CSINN_OP_BATCH_TO_SPACE, input->dtype); + if (params->bc == NULL) { return CSINN_UNSUPPORT_DTYPE; } return CSINN_TRUE; diff --git a/source/reference/broadcast_to.c b/source/reference/broadcast_to.c index d2fc1f90..2a75c40c 100644 --- a/source/reference/broadcast_to.c +++ b/source/reference/broadcast_to.c @@ -19,10 +19,9 @@ #include "csi_nn.h" #include "csi_utils.h" - -static int csi_broadcast_to_f32(struct csi_tensor *input, - struct csi_tensor *output, - struct broadcast_to_params *params) +int csi_broadcast_to_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct broadcast_to_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -43,9 +42,9 @@ static int csi_broadcast_to_f32(struct csi_tensor *input, return CSINN_TRUE; } -static int csi_broadcast_to_u8(struct csi_tensor *input, - struct csi_tensor *output, - struct broadcast_to_params *params) +int csi_broadcast_to_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct broadcast_to_params *params) { uint8_t *input_data = (uint8_t *)input->data; uint8_t *output_data = (uint8_t *)output->data; @@ -65,24 +64,20 @@ static int csi_broadcast_to_u8(struct csi_tensor *input, return CSINN_TRUE; } - int csi_broadcast_to_init(struct csi_tensor *input, - struct csi_tensor *output, - struct broadcast_to_params *params) + struct csi_tensor *output, + struct broadcast_to_params *params) { - if (input->dtype == CSINN_DTYPE_UINT8) { - params->bc = csi_broadcast_to_u8; - } else if (input->dtype == CSINN_DTYPE_FLOAT32) { - params->bc = csi_broadcast_to_f32; - } else { + params->bc = csi_bc_map(params->api, CSINN_OP_BROADCOST, input->dtype); + if (params->bc == NULL) { return CSINN_UNSUPPORT_DTYPE; } return CSINN_TRUE; } int csi_broadcast_to(struct csi_tensor *input, - struct csi_tensor *output, - struct broadcast_to_params *params) + struct csi_tensor *output, + struct 
broadcast_to_params *params) { if (params->bc != NULL) { params->bc(input, output, params); diff --git a/source/reference/ceil.c b/source/reference/ceil.c index a9b0aa90..0bc5883f 100644 --- a/source/reference/ceil.c +++ b/source/reference/ceil.c @@ -20,9 +20,9 @@ #include "csi_utils.h" #include -static int csi_ceil_f32(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csi_ceil_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct siso_params *params) { float *input_data = input->data; float *output_data = output->data; @@ -38,9 +38,9 @@ static int csi_ceil_f32(struct csi_tensor *input, return CSINN_TRUE; } -static int csi_ceil_u8(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csi_ceil_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct siso_params *params) { uint8_t *input_data = input->data; uint8_t *output_data = output->data; @@ -50,11 +50,11 @@ static int csi_ceil_u8(struct csi_tensor *input, } for (int i = 0; i < size; i++) { - float input0_val = csi_dequantize_f32(input_data[i], input->offset, input->multiplier, + float input0_val = csi_dequantize_u8_to_f32(input_data[i], input->zero_point, input->multiplier, input->shift); float res = ceil(input0_val); - output_data[i] = csi_quantize_f32(res, output->offset, output->multiplier, output->shift); + output_data[i] = csi_quantize_f32_to_u8(res, output->zero_point, output->multiplier, output->shift); } return CSINN_TRUE; } @@ -63,11 +63,8 @@ int csi_ceil_init(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params) { - if (input->dtype == CSINN_DTYPE_UINT8) { - params->bc = csi_ceil_u8; - } else if (input->dtype == CSINN_DTYPE_FLOAT32) { - params->bc = csi_ceil_f32; - } else { + params->bc = csi_bc_map(params->api, CSINN_OP_CEIL, input->dtype); + if (params->bc == NULL) { return CSINN_UNSUPPORT_DTYPE; } return CSINN_TRUE; diff --git a/source/reference/clip.c 
b/source/reference/clip.c index 58c6942c..ed8922dc 100644 --- a/source/reference/clip.c +++ b/source/reference/clip.c @@ -20,9 +20,9 @@ #include "csi_utils.h" #include -static int csi_clip_f32(struct csi_tensor *input, - struct csi_tensor *output, - struct clip_params *params) +int csi_clip_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct clip_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -43,9 +43,9 @@ static int csi_clip_f32(struct csi_tensor *input, return CSINN_TRUE; } -static int csi_clip_u8(struct csi_tensor *input, - struct csi_tensor *output, - struct clip_params *params) +int csi_clip_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct clip_params *params) { uint8_t *input_data = (uint8_t *)input->data; uint8_t *output_data = (uint8_t *)output->data; @@ -55,7 +55,7 @@ static int csi_clip_u8(struct csi_tensor *input, } for (int i = 0; i < size; i++) { - float input_val = csi_dequantize_f32(input_data[i], input->offset, input->multiplier, + float input_val = csi_dequantize_u8_to_f32(input_data[i], input->zero_point, input->multiplier, input->shift); float res = 0.0f; if(input_val < params->min_value) { @@ -65,7 +65,7 @@ static int csi_clip_u8(struct csi_tensor *input, } else { res = output_data[i]; } - output_data[i] = csi_quantize_f32(res, output->offset, output->multiplier, output->shift); + output_data[i] = csi_quantize_f32_to_u8(res, output->zero_point, output->multiplier, output->shift); } return CSINN_TRUE; } @@ -74,11 +74,8 @@ int csi_clip_init(struct csi_tensor *input, struct csi_tensor *output, struct clip_params *params) { - if (input->dtype == CSINN_DTYPE_UINT8) { - params->bc = csi_clip_u8; - } else if (input->dtype == CSINN_DTYPE_FLOAT32) { - params->bc = csi_clip_f32; - } else { + params->bc = csi_bc_map(params->api, CSINN_OP_CLIP, input->dtype); + if (params->bc == NULL) { return CSINN_UNSUPPORT_DTYPE; } return CSINN_TRUE; diff --git 
a/source/reference/col2im.c b/source/reference/col2im.c index ef091237..3857a2e6 100644 --- a/source/reference/col2im.c +++ b/source/reference/col2im.c @@ -19,10 +19,10 @@ #include "csi_nn.h" #include "csi_utils.h" -static int csi_col2im_f32(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct col2im_params *params) +int csi_col2im_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct csi_tensor *kernel, + struct col2im_params *params) { int32_t height = input->dim[1]; int32_t width = input->dim[2]; @@ -67,9 +67,8 @@ int csi_col2im_init(struct csi_tensor *input, struct csi_tensor *kernel, struct col2im_params *params) { - if (input->dtype == CSINN_DTYPE_FLOAT32) { - params->bc = csi_col2im_f32; - } else { + params->bc = csi_bc_map(params->api, CSINN_OP_COL2IM, input->dtype); + if (params->bc == NULL) { return CSINN_UNSUPPORT_DTYPE; } diff --git a/source/reference/concat.c b/source/reference/concat.c index 276b71c1..f0ee4656 100644 --- a/source/reference/concat.c +++ b/source/reference/concat.c @@ -19,16 +19,15 @@ #include "csi_nn.h" #include "csi_utils.h" -static int csi_concat_f32(struct csi_tensor *input, - struct csi_tensor *output, - struct concat_params *params) +int csi_concat_f32(struct csi_tensor **input, + struct csi_tensor *output, + struct concat_params *params) { int64_t outer_size = 1; for (int i = 0; i < params->axis; ++i) { outer_size *= output->dim[i]; } - // For all input arrays, - // FlatSize() = outer_size * Dims(axis) * base_inner_size; + int64_t base_inner_size = 1; for (int i = params->axis + 1; i < output->dim_count; ++i) { base_inner_size *= output->dim[i]; @@ -37,7 +36,7 @@ static int csi_concat_f32(struct csi_tensor *input, float *output_ptr = output->data; for (int k = 0; k < outer_size; k++) { for (int i = 0; i < params->inputs_count; ++i) { - struct csi_tensor *input_item = input + i; + struct csi_tensor *input_item = input[i]; float *input_item_data = input_item->data; const int 
copy_size = input_item->dim[params->axis] * base_inner_size; const float *input_ptr = input_item_data + k * copy_size; @@ -48,19 +47,18 @@ static int csi_concat_f32(struct csi_tensor *input, return CSINN_TRUE; } -static int csi_concat_u8(struct csi_tensor *input, - struct csi_tensor *output, - struct concat_params *params) +int csi_concat_u8(struct csi_tensor **input, + struct csi_tensor *output, + struct concat_params *params) { if (params->axis == -1){ - params->axis= input->dim_count -1; + params->axis= input[0]->dim_count - 1; } int64_t outer_size = 1; for (int i = 0; i < params->axis; ++i) { outer_size *= output->dim[i]; } - // For all input arrays, - // FlatSize() = outer_size * Dims(axis) * base_inner_size; + int64_t base_inner_size = 1; for (int i = params->axis + 1; i < output->dim_count; ++i) { base_inner_size *= output->dim[i]; @@ -69,19 +67,19 @@ static int csi_concat_u8(struct csi_tensor *input, uint8_t *output_ptr = output->data; for (int k = 0; k < outer_size; k++) { for (int i = 0; i < params->inputs_count; ++i) { - struct csi_tensor *input_item = input + i; + struct csi_tensor *input_item = input[i]; const int copy_size = input_item->dim[params->axis] * base_inner_size; uint8_t *input_item_data = input_item->data; const uint8_t *input_ptr = input_item_data + k * copy_size; - if (input_item->offset == output->offset && + if (input_item->zero_point == output->zero_point && input_item->multiplier == output->multiplier && input_item->shift == output->shift) { memcpy(output_ptr, input_ptr, copy_size); } else { for (int j = 0; j < copy_size; ++j) { - output_ptr[j] = csi_requantize_u8(input_ptr[j], input_item->offset, + output_ptr[j] = csi_requantize_u8(input_ptr[j], input_item->zero_point, input_item->multiplier, input_item->shift, - output->offset, output->multiplier, output->shift); + output->zero_point, output->multiplier, output->shift); } } output_ptr += copy_size; @@ -90,21 +88,18 @@ static int csi_concat_u8(struct csi_tensor *input, return 
CSINN_TRUE; } -int csi_concat_init(struct csi_tensor *input, +int csi_concat_init(struct csi_tensor **input, struct csi_tensor *output, struct concat_params *params) { - if (output->dtype == CSINN_DTYPE_UINT8) { - params->bc = csi_concat_u8; - } else if (output->dtype == CSINN_DTYPE_FLOAT32) { - params->bc = csi_concat_f32; - } else { + params->bc = csi_bc_map(params->api, CSINN_OP_CONCAT, output->dtype); + if (params->bc == NULL) { return CSINN_UNSUPPORT_DTYPE; } return CSINN_TRUE; } -int csi_concat(struct csi_tensor *input, +int csi_concat(struct csi_tensor **input, struct csi_tensor *output, struct concat_params *params) { diff --git a/source/reference/convolution.c b/source/reference/convolution.c index a2ac2361..2bdaefa2 100644 --- a/source/reference/convolution.c +++ b/source/reference/convolution.c @@ -18,14 +18,17 @@ #include "csi_nn.h" #include "csi_utils.h" +#ifdef CSI_AVX_OPT +#include "conv_avx.c" +#endif /* reference https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/kernels/internal/reference/conv.h */ static int csi_conv2d_nhwc_f32(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv2d_params *params) + struct csi_tensor *output, + struct csi_tensor *kernel, + struct csi_tensor *bias, + struct conv2d_params *params) { float *input_data = input->data; float *output_data = output->data; @@ -84,6 +87,58 @@ static int csi_conv2d_nhwc_f32(struct csi_tensor *input, return CSINN_TRUE; } +static int csi_conv2d_nchw_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct csi_tensor *kernel, + struct csi_tensor *bias, + struct conv2d_params *params) +{ +#ifdef CSI_AVX_OPT + struct csi_tensor t_input; + memcpy(&t_input, input, sizeof(struct csi_tensor)); + int32_t pad_b[4] = {0, params->pad_top, params->pad_left, 0}; + int32_t pad_a[4] = {0, params->pad_down, params->pad_right, 0}; + t_input.dim[2] = input->dim[2] + params->pad_top + params->pad_down; + 
t_input.dim[3] = input->dim[3] + params->pad_left + params->pad_right; + t_input.data = malloc(t_input.dim[0] * t_input.dim[1] * + t_input.dim[2] * t_input.dim[3] * 4); + struct pad_params pparams; + pparams.layout = CSINN_NCHW; + pparams.api = CSINN_REF; + pparams.pad_before = pad_b; + pparams.pad_after = pad_a; + pparams.pad_mode = 0; + pparams.pad_value = 0; + csi_pad_init(input, &t_input, &pparams); + csi_pad(input, &t_input, &pparams); + + struct csi_tensor t_kernel; + conv_trans_kernel_avx(kernel, &t_kernel); + conv_im2col_sgemm_avx(&t_input, output, &t_kernel, bias, + kernel->dim[3], kernel->dim[2], + params->stride_width, params->stride_height); + + free(t_input.data); + free(t_kernel.data); +#else + struct csi_tensor* t_input; + struct csi_tensor* t_output; + struct csi_tensor* t_kernel; + struct csi_tensor* t_bias = bias; + t_input = csi_nchw_to_nhwc_f32(input); + t_kernel = csi_nchw_to_nhwc_f32(kernel); + t_output = csi_nchw_to_nhwc_f32(output); + int out = csi_conv2d_nhwc_f32(t_input, t_output, t_kernel, t_bias, params); + csi_nhwc_to_nchw_f32(output, t_output); + free(t_input->data); + free(t_input); + free(t_kernel->data); + free(t_kernel); + +#endif + return CSINN_TRUE; +} + static int csi_conv2d_nhwc_u8(struct csi_tensor *input, struct csi_tensor *output, struct csi_tensor *kernel, @@ -96,9 +151,9 @@ static int csi_conv2d_nhwc_u8(struct csi_tensor *input, int32_t *bias_data = bias->data; const int32_t dilation_width_factor = params->dilation_width; const int32_t dilation_height_factor = params->dilation_height; - const int32_t input_offset = input->offset; - const int32_t filter_offset = kernel->offset; - const int32_t output_offset = output->offset; + const int32_t input_offset = input->zero_point; + const int32_t filter_offset = kernel->zero_point; + const int32_t output_offset = output->zero_point; const int32_t output_multiplier = output->multiplier; const int32_t output_shift = output->shift; @@ -135,7 +190,7 @@ static int 
csi_conv2d_nhwc_u8(struct csi_tensor *input, kernel->dim, out_channel, filter_y, filter_x, in_channel); int32_t filter_val = kernel_data[filter_index]; acc += - (filter_val + filter_offset) * (input_val + input_offset); + (filter_val - filter_offset) * (input_val - input_offset); } } } @@ -152,6 +207,74 @@ static int csi_conv2d_nhwc_u8(struct csi_tensor *input, return CSINN_TRUE; } +static int csi_conv2d_nhwc_i8(struct csi_tensor *input, + struct csi_tensor *output, + struct csi_tensor *kernel, + struct csi_tensor *bias, + struct conv2d_params *params) +{ + int8_t *input_data = input->data; + int8_t *output_data = output->data; + int8_t *kernel_data = kernel->data; + int32_t *bias_data = bias->data; + const int32_t dilation_width_factor = params->dilation_width; + const int32_t dilation_height_factor = params->dilation_height; + const int32_t input_offset = input->zero_point; + const int32_t filter_offset = kernel->zero_point; + const int32_t output_offset = output->zero_point; + const int32_t output_multiplier = output->multiplier; + const int32_t output_shift = output->shift; + + const int32_t batches = input->dim[0]; + const int32_t input_depth = input->dim[3]; + const int32_t output_depth = output->dim[3]; + const int32_t input_height = input->dim[1]; + const int32_t input_width = input->dim[2]; + const int32_t filter_height = kernel->dim[1]; + const int32_t filter_width = kernel->dim[2]; + const int32_t output_height = output->dim[1]; + const int32_t output_width = output->dim[2]; + + for (int32_t batch = 0; batch < batches; ++batch) { + for (int32_t out_y = 0; out_y < output_height; ++out_y) { + for (int32_t out_x = 0; out_x < output_width; ++out_x) { + for (int32_t out_channel = 0; out_channel < output_depth; ++out_channel) { + const int32_t in_x_origin = (out_x * params->stride_width) - params->pad_left; + const int32_t in_y_origin = (out_y * params->stride_height) - params->pad_top; + int64_t acc = 0; + for (int32_t filter_y = 0; filter_y < filter_height; 
++filter_y) { + for (int32_t filter_x = 0; filter_x < filter_width; ++filter_x) { + for (int32_t in_channel = 0; in_channel < input_depth; ++in_channel) { + const int32_t in_x = in_x_origin + dilation_width_factor * filter_x; + const int32_t in_y = in_y_origin + dilation_height_factor * filter_y; + // If the location is outside the bounds of the input image, + // use zero as a default value. + if ((in_x >= 0) && (in_x < input_width) && (in_y >= 0) && + (in_y < input_height)) { + int32_t input_index = + csi_get_index(input->dim, batch, in_y, in_x, in_channel); + int32_t input_val = input_data[input_index]; + int32_t filter_index = csi_get_index( + kernel->dim, out_channel, filter_y, filter_x, in_channel); + int32_t filter_val = kernel_data[filter_index]; + acc += + (filter_val - filter_offset) * (input_val - input_offset); + } + } + } + } + if (bias->dim_count != 0) { + acc += bias_data[out_channel]; + } + acc = csi_quantize_i8(acc, output_offset, output_multiplier, output_shift); + output_data[csi_get_index(output->dim, batch, out_y, out_x, out_channel)] = acc; + } + } + } + } + return CSINN_TRUE; +} + static int csi_depthwise_conv2d_nhwc_f32(struct csi_tensor *input, struct csi_tensor *output, struct csi_tensor *kernel, @@ -173,7 +296,130 @@ static int csi_depthwise_conv2d_nhwc_f32(struct csi_tensor *input, const int32_t filter_width = kernel->dim[2]; const int32_t output_height = output->dim[1]; const int32_t output_width = output->dim[2]; + assert(input_depth == output_depth); // The input and output channels are equal for dw convolution + + for (int32_t b = 0; b < batches; ++b) { + for (int32_t out_y = 0; out_y < output_height; ++out_y) { + for (int32_t out_x = 0; out_x < output_width; ++out_x) { + for (int32_t ic = 0; ic < input_depth; ++ic) { + const int32_t in_x_origin = (out_x * params->stride_width) - params->pad_left; + const int32_t in_y_origin = (out_y * params->stride_height) - params->pad_top; + float acc = 0; + for (int32_t filter_y = 0; filter_y < 
filter_height; ++filter_y) { + for (int32_t filter_x = 0; filter_x < filter_width; ++filter_x) { + const int32_t in_x = in_x_origin + dilation_width_factor * filter_x; + const int32_t in_y = in_y_origin + dilation_height_factor * filter_y; + // If the location is outside the bounds of the input image, + // use zero as a default value. + if ((in_x >= 0) && (in_x < input_width) && (in_y >= 0) && + (in_y < input_height)) { + float input_val = + input_data[csi_get_index(input->dim, b, in_y, in_x, ic)]; + float filter_val = kernel_data[csi_get_index( + kernel->dim, 0, filter_y, filter_x, ic)]; + acc += (filter_val) * (input_val); + } + } + } + if (bias_data) { + acc += bias_data[ic]; + } + output_data[csi_get_index(output->dim, b, out_y, out_x, ic)] = acc; + } + } + } + } + return CSINN_TRUE; +} + + +static int csi_depthwise_conv2d_nchw_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct csi_tensor *kernel, + struct csi_tensor *bias, + struct conv2d_params *params) +{ + float *input_data = (float *)input->data; + float *output_data = (float *)output->data; + float *kernel_data = (float *)kernel->data; + float *bias_data = (float *)bias->data; + + const int32_t dilation_width_factor = params->dilation_width; + const int32_t dilation_height_factor = params->dilation_height; + const int32_t batches = input->dim[0]; + const int32_t input_depth = input->dim[1]; + const int32_t output_depth = output->dim[1]; + const int32_t input_height = input->dim[2]; + const int32_t input_width = input->dim[3]; + const int32_t filter_height = kernel->dim[2]; + const int32_t filter_width = kernel->dim[3]; + const int32_t output_height = output->dim[2]; + const int32_t output_width = output->dim[3]; + assert(input_depth == output_depth); // The input and output channels are equal for dw convolution + + for (int32_t b = 0; b < batches; ++b) { + for (int32_t ic = 0; ic < input_depth; ++ic) { + for (int32_t out_y = 0; out_y < output_height; ++out_y) { + for (int32_t out_x = 0; 
out_x < output_width; ++out_x) { + + const int32_t in_x_origin = (out_x * params->stride_width) - params->pad_left; + const int32_t in_y_origin = (out_y * params->stride_height) - params->pad_top; + float acc = 0; + for (int32_t filter_y = 0; filter_y < filter_height; ++filter_y) { + for (int32_t filter_x = 0; filter_x < filter_width; ++filter_x) { + const int32_t in_x = in_x_origin + dilation_width_factor * filter_x; + const int32_t in_y = in_y_origin + dilation_height_factor * filter_y; + // If the location is outside the bounds of the input image, + // use zero as a default value. + if ((in_x >= 0) && (in_x < input_width) && (in_y >= 0) && + (in_y < input_height)) { + float input_val = + input_data[csi_get_index(input->dim, b, ic, in_y, in_x)]; + float filter_val = kernel_data[csi_get_index( + kernel->dim, ic, 0, filter_y, filter_x)]; + acc += (filter_val) * (input_val); + } + } + } + if (bias_data) { + acc += bias_data[ic]; + } + output_data[csi_get_index(output->dim, b, ic, out_y, out_x)] = acc; + + } + } + } + } + +} + +static int csi_depthwise_conv2d_nhwc_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct csi_tensor *kernel, + struct csi_tensor *bias, + struct conv2d_params *params) +{ + uint8_t *input_data = input->data; + uint8_t *output_data = output->data; + uint8_t *kernel_data = kernel->data; + int32_t *bias_data = bias->data; + const int32_t dilation_width_factor = params->dilation_width; + const int32_t dilation_height_factor = params->dilation_height; + const int32_t batches = input->dim[0]; + const int32_t input_depth = input->dim[3]; + const int32_t output_depth = output->dim[3]; + const int32_t input_height = input->dim[1]; + const int32_t input_width = input->dim[2]; + const int32_t filter_height = kernel->dim[1]; + const int32_t filter_width = kernel->dim[2]; + const int32_t output_height = output->dim[1]; + const int32_t output_width = output->dim[2]; const int32_t depth_multiplier = output_depth / input_depth; + const int32_t 
input_offset = input->zero_point; + const int32_t filter_offset = kernel->zero_point; + const int32_t output_offset = output->zero_point; + const int32_t output_multiplier = output->multiplier; + const int32_t output_shift = output->shift; for (int32_t b = 0; b < batches; ++b) { for (int32_t out_y = 0; out_y < output_height; ++out_y) { @@ -183,27 +429,30 @@ static int csi_depthwise_conv2d_nhwc_f32(struct csi_tensor *input, const int32_t oc = m + ic * depth_multiplier; const int32_t in_x_origin = (out_x * params->stride_width) - params->pad_left; const int32_t in_y_origin = (out_y * params->stride_height) - params->pad_top; - float acc = 0; + int64_t acc = 0; for (int32_t filter_y = 0; filter_y < filter_height; ++filter_y) { for (int32_t filter_x = 0; filter_x < filter_width; ++filter_x) { const int32_t in_x = in_x_origin + dilation_width_factor * filter_x; - const int32_t in_y = in_y_origin + dilation_height_factor * filter_y; + const int32_t in_y = + in_y_origin + dilation_height_factor * filter_y; // If the location is outside the bounds of the input image, // use zero as a default value. 
if ((in_x >= 0) && (in_x < input_width) && (in_y >= 0) && (in_y < input_height)) { - float input_val = + int32_t input_val = input_data[csi_get_index(input->dim, b, in_y, in_x, ic)]; - float filter_val = kernel_data[csi_get_index( - kernel->dim, 0, filter_y, filter_x, oc)]; - acc += (filter_val) * (input_val); + int32_t filter_val = kernel_data[csi_get_index( + kernel->dim, ic, filter_y, filter_x, m)]; + acc += + (filter_val - filter_offset) * (input_val - input_offset); } } } if (bias->dim_count != 0) { acc += bias_data[oc]; } - output_data[csi_get_index(output->dim, b, out_y, out_x, oc)] = acc; + output_data[csi_get_index(output->dim, b, out_y, out_x, oc)] = + csi_quantize_u8(acc, output_offset, output_multiplier, output_shift); } } } @@ -212,15 +461,15 @@ static int csi_depthwise_conv2d_nhwc_f32(struct csi_tensor *input, return CSINN_TRUE; } -static int csi_depthwise_conv2d_nhwc_u8(struct csi_tensor *input, +static int csi_depthwise_conv2d_nhwc_i8(struct csi_tensor *input, struct csi_tensor *output, struct csi_tensor *kernel, struct csi_tensor *bias, struct conv2d_params *params) { - uint8_t *input_data = input->data; - uint8_t *output_data = output->data; - uint8_t *kernel_data = kernel->data; + int8_t *input_data = input->data; + int8_t *output_data = output->data; + int8_t *kernel_data = kernel->data; int32_t *bias_data = bias->data; const int32_t dilation_width_factor = params->dilation_width; const int32_t dilation_height_factor = params->dilation_height; @@ -234,9 +483,9 @@ static int csi_depthwise_conv2d_nhwc_u8(struct csi_tensor *input, const int32_t output_height = output->dim[1]; const int32_t output_width = output->dim[2]; const int32_t depth_multiplier = output_depth / input_depth; - const int32_t input_offset = input->offset; - const int32_t filter_offset = kernel->offset; - const int32_t output_offset = output->offset; + const int32_t input_offset = input->zero_point; + const int32_t filter_offset = kernel->zero_point; + const int32_t output_offset 
= output->zero_point; const int32_t output_multiplier = output->multiplier; const int32_t output_shift = output->shift; @@ -263,7 +512,7 @@ static int csi_depthwise_conv2d_nhwc_u8(struct csi_tensor *input, int32_t filter_val = kernel_data[csi_get_index( kernel->dim, ic, filter_y, filter_x, m)]; acc += - (filter_val + filter_offset) * (input_val + input_offset); + (filter_val - filter_offset) * (input_val - input_offset); } } } @@ -271,7 +520,7 @@ static int csi_depthwise_conv2d_nhwc_u8(struct csi_tensor *input, acc += bias_data[oc]; } output_data[csi_get_index(output->dim, b, out_y, out_x, oc)] = - csi_quantize_u8(acc, output_offset, output_multiplier, output_shift); + csi_quantize_i8(acc, output_offset, output_multiplier, output_shift); } } } @@ -324,25 +573,48 @@ static int csi_group_conv2d_nhwc_u8(struct csi_tensor *o_input, return CSINN_TRUE; } -#ifdef CSI_AVX_OPT -#include "conv_avx.c" -#endif - -static float uint8_to_float(uint8_t i, struct csi_tensor *t) +static int csi_group_conv2d_nhwc_i8(struct csi_tensor *o_input, + struct csi_tensor *o_output, + struct csi_tensor *o_kernel, + struct csi_tensor *o_bias, + struct conv2d_params *params) { - return ((float)i - t->zero_point) * t->scale; -} + struct csi_tensor input; + struct csi_tensor output; + struct csi_tensor kernel; + struct csi_tensor bias; -static uint8_t float_to_uint8(float i, struct csi_tensor *t) -{ - float ret = round(i / t->scale) + t->zero_point; - if (ret > 255) { - return 255; - } else if (ret < 0) { - return 0; - } else { - return ret; + memcpy(&input, o_input, sizeof(struct csi_tensor)); + memcpy(&output, o_output, sizeof(struct csi_tensor)); + memcpy(&kernel, o_kernel, sizeof(struct csi_tensor)); + memcpy(&bias, o_bias, sizeof(struct csi_tensor)); + + input.dim[3] /= params->group; + output.dim[3] /= params->group; + kernel.dim[0] /= params->group; + + int input_size = 1; + int output_size = 1; + int kernel_size = 1; + + for (int i = 0; i < input.dim_count; i++) { + input_size *= 
input.dim[i]; + output_size *= output.dim[i]; + kernel_size *= kernel.dim[i]; + } + + int8_t *input_data = o_input->data; + int8_t *output_data = o_output->data; + int8_t *kernel_data = o_kernel->data; + int32_t *bias_data = o_bias->data; + for (int i = 0; i < params->group; i++) { + input.data = input_data + i * input_size; + output.data = output_data + i * output_size; + kernel.data = kernel_data + i * kernel_size; + bias.data = bias_data + i * o_output->dim[3] / params->group; + csi_conv2d_nhwc_i8(&input, &output, &kernel, &bias, params); } + return CSINN_TRUE; } static int csi_conv2d_nchw_u8(struct csi_tensor *o_input, @@ -413,6 +685,7 @@ static int csi_conv2d_nchw_u8(struct csi_tensor *o_input, t_input.dim[2] * t_input.dim[3] * 4); struct pad_params pparams; pparams.layout = CSINN_NCHW; + pparams.api = CSINN_REF; pparams.pad_before = pad_b; pparams.pad_after = pad_a; pparams.pad_mode = 0; @@ -441,9 +714,9 @@ static int csi_conv2d_nchw_u8(struct csi_tensor *o_input, struct csi_tensor* output; struct csi_tensor* kernel; struct csi_tensor* bias = o_bias; - input = csi_nchw_to_nhwc_u8(o_input); - kernel = csi_nchw_to_nhwc_u8(o_kernel); - output = csi_nchw_to_nhwc_u8(o_output); + input = csi_nchw_to_nhwc_8(o_input); + kernel = csi_nchw_to_nhwc_8(o_kernel); + output = csi_nchw_to_nhwc_8(o_output); uint8_t *input_data = input->data; uint8_t *output_data = output->data; @@ -451,9 +724,9 @@ static int csi_conv2d_nchw_u8(struct csi_tensor *o_input, int32_t *bias_data = bias->data; const int32_t dilation_width_factor = params->dilation_width; const int32_t dilation_height_factor = params->dilation_height; - const int32_t input_offset = input->offset; - const int32_t filter_offset = kernel->offset; - const int32_t output_offset = output->offset; + const int32_t input_offset = input->zero_point; + const int32_t filter_offset = kernel->zero_point; + const int32_t output_offset = output->zero_point; const int32_t output_multiplier = output->multiplier; const int32_t 
output_shift = output->shift; @@ -491,7 +764,7 @@ static int csi_conv2d_nchw_u8(struct csi_tensor *o_input, kernel->dim, out_channel, filter_y, filter_x, in_channel); int32_t filter_val = kernel_data[filter_index]; acc += - (filter_val + filter_offset) * (input_val + input_offset); + (filter_val - filter_offset) * (input_val - input_offset); } } } @@ -505,7 +778,7 @@ static int csi_conv2d_nchw_u8(struct csi_tensor *o_input, } } } - csi_nhwc_to_nchw_u8(o_output, output); + csi_nhwc_to_nchw_8(o_output, output); free(input->data); free(input); free(kernel->data); @@ -514,52 +787,11 @@ static int csi_conv2d_nchw_u8(struct csi_tensor *o_input, return CSINN_TRUE; } -static int64_t conv_out_u8(int64_t res, - struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel) -{ - float t = res * input->scale * kernel->scale / output->scale; - if (t < 0) { - t = 0; - } - int32_t out = round(t + output->zero_point); - if (out < 0) { - return 0; - } else if (out > 255) { - return 255; - } else { - return out; - } -} - -static int64_t conv_relu6_out_u8(int64_t res, - struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel) -{ - float t = res * input->scale * kernel->scale / output->scale; - if (t < 0) { - t = 0; - } - if (t * output->scale > 6) { - t = 6 / output->scale; - } - int32_t out = round(t + output->zero_point); - if (out < 0) { - return 0; - } else if (out > 255) { - return 255; - } else { - return out; - } -} - -static int csi_conv2d_relu_nchw_u8(struct csi_tensor *o_input, - struct csi_tensor *o_output, - struct csi_tensor *o_kernel, - struct csi_tensor *o_bias, - struct conv2d_params *params) +static int csi_conv2d_nchw_i8(struct csi_tensor *o_input, + struct csi_tensor *o_output, + struct csi_tensor *o_kernel, + struct csi_tensor *o_bias, + struct conv2d_params *params) { #ifdef CSI_AVX_OPT float *float_input_data; @@ -570,10 +802,10 @@ static int csi_conv2d_relu_nchw_u8(struct csi_tensor *o_input, struct csi_tensor 
float_kernel; struct csi_tensor float_bias; struct csi_tensor float_output; - uint8_t *input_data = o_input->data; - uint8_t *kernel_data = o_kernel->data; + int8_t *input_data = o_input->data; + int8_t *kernel_data = o_kernel->data; int32_t *bias_data = o_bias->data; - uint8_t *output_data = o_output->data; + int8_t *output_data = o_output->data; int input_size = 1; int kernel_size = 1; int output_size = 1; @@ -604,10 +836,10 @@ static int csi_conv2d_relu_nchw_u8(struct csi_tensor *o_input, float_output.data = float_output_data; for (int i = 0; i < input_size; i++) { - float_input_data[i] = uint8_to_float(input_data[i], o_input); + float_input_data[i] = int8_to_float(input_data[i], o_input); } for (int i = 0; i < kernel_size; i++) { - float_kernel_data[i] = uint8_to_float(kernel_data[i], o_kernel); + float_kernel_data[i] = int8_to_float(kernel_data[i], o_kernel); } for (int i = 0; i < bias_size; i++) { float_bias_data[i] = bias_data[i] * o_kernel->scale * o_input->scale; @@ -623,6 +855,7 @@ static int csi_conv2d_relu_nchw_u8(struct csi_tensor *o_input, t_input.dim[2] * t_input.dim[3] * 4); struct pad_params pparams; pparams.layout = CSINN_NCHW; + pparams.api = CSINN_REF; pparams.pad_before = pad_b; pparams.pad_after = pad_a; pparams.pad_mode = 0; @@ -638,10 +871,7 @@ static int csi_conv2d_relu_nchw_u8(struct csi_tensor *o_input, for (int i = 0; i < output_size; i++) { - if (float_output_data[i] < 0) { - float_output_data[i] = 0; - } - output_data[i] = float_to_uint8(float_output_data[i], o_output); + output_data[i] = float_to_int8(float_output_data[i], o_output); } free(float_input_data); free(float_kernel_data); @@ -654,19 +884,19 @@ static int csi_conv2d_relu_nchw_u8(struct csi_tensor *o_input, struct csi_tensor* output; struct csi_tensor* kernel; struct csi_tensor* bias = o_bias; - input = csi_nchw_to_nhwc_u8(o_input); - kernel = csi_nchw_to_nhwc_u8(o_kernel); - output = csi_nchw_to_nhwc_u8(o_output); + input = csi_nchw_to_nhwc_8(o_input); + kernel = 
csi_nchw_to_nhwc_8(o_kernel); + output = csi_nchw_to_nhwc_8(o_output); - uint8_t *input_data = input->data; - uint8_t *output_data = output->data; - uint8_t *kernel_data = kernel->data; + int8_t *input_data = input->data; + int8_t *output_data = output->data; + int8_t *kernel_data = kernel->data; int32_t *bias_data = bias->data; const int32_t dilation_width_factor = params->dilation_width; const int32_t dilation_height_factor = params->dilation_height; - const int32_t input_offset = input->offset; - const int32_t filter_offset = kernel->offset; - const int32_t output_offset = output->offset; + const int32_t input_offset = input->zero_point; + const int32_t filter_offset = kernel->zero_point; + const int32_t output_offset = output->zero_point; const int32_t output_multiplier = output->multiplier; const int32_t output_shift = output->shift; @@ -704,7 +934,7 @@ static int csi_conv2d_relu_nchw_u8(struct csi_tensor *o_input, kernel->dim, out_channel, filter_y, filter_x, in_channel); int32_t filter_val = kernel_data[filter_index]; acc += - (filter_val + filter_offset) * (input_val + input_offset); + (filter_val - filter_offset) * (input_val - input_offset); } } } @@ -712,13 +942,13 @@ static int csi_conv2d_relu_nchw_u8(struct csi_tensor *o_input, if (bias->dim_count != 0) { acc += bias_data[out_channel]; } - acc = conv_out_u8(acc, input, output, kernel); + acc = csi_quantize_i8(acc, output_offset, output_multiplier, output_shift); output_data[csi_get_index(output->dim, batch, out_y, out_x, out_channel)] = acc; } } } } - csi_nhwc_to_nchw_u8(o_output, output); + csi_nhwc_to_nchw_8(o_output, output); free(input->data); free(input); free(kernel->data); @@ -727,87 +957,6 @@ static int csi_conv2d_relu_nchw_u8(struct csi_tensor *o_input, return CSINN_TRUE; } -static int csi_conv2d_relu6_nchw_u8(struct csi_tensor *o_input, - struct csi_tensor *o_output, - struct csi_tensor *o_kernel, - struct csi_tensor *o_bias, - struct conv2d_params *params) -{ - struct csi_tensor* input; - 
struct csi_tensor* output; - struct csi_tensor* kernel; - struct csi_tensor* bias = o_bias; - input = csi_nchw_to_nhwc_u8(o_input); - kernel = csi_nchw_to_nhwc_u8(o_kernel); - output = csi_nchw_to_nhwc_u8(o_output); - - uint8_t *input_data = input->data; - uint8_t *output_data = output->data; - uint8_t *kernel_data = kernel->data; - int32_t *bias_data = bias->data; - const int32_t dilation_width_factor = params->dilation_width; - const int32_t dilation_height_factor = params->dilation_height; - const int32_t input_offset = input->offset; - const int32_t filter_offset = kernel->offset; - const int32_t output_offset = output->offset; - const int32_t output_multiplier = output->multiplier; - const int32_t output_shift = output->shift; - - const int32_t batches = input->dim[0]; - const int32_t input_depth = input->dim[3]; - const int32_t output_depth = output->dim[3]; - const int32_t input_height = input->dim[1]; - const int32_t input_width = input->dim[2]; - const int32_t filter_height = kernel->dim[1]; - const int32_t filter_width = kernel->dim[2]; - const int32_t output_height = output->dim[1]; - const int32_t output_width = output->dim[2]; - - for (int32_t batch = 0; batch < batches; ++batch) { - #pragma omp parallel for num_threads(8) - for (int32_t out_y = 0; out_y < output_height; ++out_y) { - for (int32_t out_x = 0; out_x < output_width; ++out_x) { - for (int32_t out_channel = 0; out_channel < output_depth; ++out_channel) { - const int32_t in_x_origin = (out_x * params->stride_width) - params->pad_left; - const int32_t in_y_origin = (out_y * params->stride_height) - params->pad_top; - int64_t acc = 0; - for (int32_t filter_y = 0; filter_y < filter_height; ++filter_y) { - for (int32_t filter_x = 0; filter_x < filter_width; ++filter_x) { - for (int32_t in_channel = 0; in_channel < input_depth; ++in_channel) { - const int32_t in_x = in_x_origin + dilation_width_factor * filter_x; - const int32_t in_y = in_y_origin + dilation_height_factor * filter_y; - // If the 
location is outside the bounds of the input image, - // use zero as a default value. - if ((in_x >= 0) && (in_x < input_width) && (in_y >= 0) && - (in_y < input_height)) { - int32_t input_index = - csi_get_index(input->dim, batch, in_y, in_x, in_channel); - int32_t input_val = input_data[input_index]; - int32_t filter_index = csi_get_index( - kernel->dim, out_channel, filter_y, filter_x, in_channel); - int32_t filter_val = kernel_data[filter_index]; - acc += - (filter_val + filter_offset) * (input_val + input_offset); - } - } - } - } - if (bias->dim_count != 0) { - acc += bias_data[out_channel]; - } - acc = conv_relu6_out_u8(acc, input, output, kernel); - output_data[csi_get_index(output->dim, batch, out_y, out_x, out_channel)] = acc; - } - } - } - } - csi_nhwc_to_nchw_u8(o_output, output); - free(input->data); - free(input); - free(kernel->data); - free(kernel); - return CSINN_TRUE; -} static int csi_depthwise_conv2d_nchw_u8(struct csi_tensor *o_input, struct csi_tensor *o_output, @@ -815,19 +964,13 @@ static int csi_depthwise_conv2d_nchw_u8(struct csi_tensor *o_input, struct csi_tensor *o_bias, struct conv2d_params *params) { -#if 0 - csi_group_conv2d_u8(o_input, o_output, o_kernel, o_bias, - o_input->dim[1], stride_height, stride_width, - pad_top, pad_left, pad_down, pad_right, - dilation_height, dilation_width); -#else struct csi_tensor* input; struct csi_tensor* output; struct csi_tensor* kernel; struct csi_tensor* bias = o_bias; - input = csi_nchw_to_nhwc_u8(o_input); - kernel = csi_nchw_to_nhwc_u8(o_kernel); - output = csi_nchw_to_nhwc_u8(o_output); + input = csi_nchw_to_nhwc_8(o_input); + kernel = csi_nchw_to_nhwc_8(o_kernel); + output = csi_nchw_to_nhwc_8(o_output); uint8_t *input_data = input->data; uint8_t *output_data = output->data; @@ -845,9 +988,9 @@ static int csi_depthwise_conv2d_nchw_u8(struct csi_tensor *o_input, const int32_t output_height = output->dim[1]; const int32_t output_width = output->dim[2]; const int32_t depth_multiplier = 
output_depth / input_depth; - const int32_t input_offset = input->offset; - const int32_t filter_offset = kernel->offset; - const int32_t output_offset = output->offset; + const int32_t input_offset = input->zero_point; + const int32_t filter_offset = kernel->zero_point; + const int32_t output_offset = output->zero_point; const int32_t output_multiplier = output->multiplier; const int32_t output_shift = output->shift; @@ -875,7 +1018,7 @@ static int csi_depthwise_conv2d_nchw_u8(struct csi_tensor *o_input, int32_t filter_val = kernel_data[csi_get_index( kernel->dim, ic, filter_y, filter_x, m)]; acc += - (filter_val + filter_offset) * (input_val + input_offset); + (filter_val - filter_offset) * (input_val - input_offset); } } } @@ -889,115 +1032,31 @@ static int csi_depthwise_conv2d_nchw_u8(struct csi_tensor *o_input, } } } - csi_nhwc_to_nchw_u8(o_output, output); + csi_nhwc_to_nchw_8(o_output, output); free(input->data); free(input); free(kernel->data); free(kernel); -#endif return CSINN_TRUE; } -static int csi_depthwise_conv2d_relu_nchw_u8(struct csi_tensor *o_input, - struct csi_tensor *o_output, - struct csi_tensor *o_kernel, - struct csi_tensor *o_bias, - struct conv2d_params *params) -{ - struct csi_tensor* input; - struct csi_tensor* output; - struct csi_tensor* kernel; - struct csi_tensor* bias = o_bias; - input = csi_nchw_to_nhwc_u8(o_input); - kernel = csi_nchw_to_nhwc_u8(o_kernel); - output = csi_nchw_to_nhwc_u8(o_output); - - uint8_t *input_data = input->data; - uint8_t *output_data = output->data; - uint8_t *kernel_data = kernel->data; - int32_t *bias_data = bias->data; - const int32_t dilation_width_factor = params->dilation_width; - const int32_t dilation_height_factor = params->dilation_height; - const int32_t batches = input->dim[0]; - const int32_t input_depth = input->dim[3]; - const int32_t output_depth = output->dim[3]; - const int32_t input_height = input->dim[1]; - const int32_t input_width = input->dim[2]; - const int32_t filter_height = 
kernel->dim[1]; - const int32_t filter_width = kernel->dim[2]; - const int32_t output_height = output->dim[1]; - const int32_t output_width = output->dim[2]; - const int32_t depth_multiplier = output_depth / input_depth; - const int32_t input_offset = input->offset; - const int32_t filter_offset = kernel->offset; - const int32_t output_offset = output->offset; - const int32_t output_multiplier = output->multiplier; - const int32_t output_shift = output->shift; - - for (int32_t b = 0; b < batches; ++b) { - #pragma omp parallel for num_threads(8) - for (int32_t out_y = 0; out_y < output_height; ++out_y) { - for (int32_t out_x = 0; out_x < output_width; ++out_x) { - for (int32_t ic = 0; ic < input_depth; ++ic) { - for (int32_t m = 0; m < depth_multiplier; m++) { - const int32_t oc = m + ic * depth_multiplier; - const int32_t in_x_origin = (out_x * params->stride_width) - params->pad_left; - const int32_t in_y_origin = (out_y * params->stride_height) - params->pad_top; - int64_t acc = 0; - for (int32_t filter_y = 0; filter_y < filter_height; ++filter_y) { - for (int32_t filter_x = 0; filter_x < filter_width; ++filter_x) { - const int32_t in_x = in_x_origin + dilation_width_factor * filter_x; - const int32_t in_y = - in_y_origin + dilation_height_factor * filter_y; - // If the location is outside the bounds of the input image, - // use zero as a default value. 
- if ((in_x >= 0) && (in_x < input_width) && (in_y >= 0) && - (in_y < input_height)) { - int32_t input_val = - input_data[csi_get_index(input->dim, b, in_y, in_x, ic)]; - int32_t filter_val = kernel_data[csi_get_index( - kernel->dim, ic, filter_y, filter_x, m)]; - acc += - (filter_val + filter_offset) * (input_val + input_offset); - } - } - } - if (bias->dim_count != 0) { - acc += bias_data[oc]; - } - acc = conv_out_u8(acc, input, output, kernel); - output_data[csi_get_index(output->dim, b, out_y, out_x, oc)] = - acc; - } - } - } - } - } - csi_nhwc_to_nchw_u8(o_output, output); - free(input->data); - free(input); - free(kernel->data); - free(kernel); - return CSINN_TRUE; -} - -static int csi_depthwise_conv2d_relu6_nchw_u8(struct csi_tensor *o_input, - struct csi_tensor *o_output, - struct csi_tensor *o_kernel, - struct csi_tensor *o_bias, - struct conv2d_params *params) +static int csi_depthwise_conv2d_nchw_i8(struct csi_tensor *o_input, + struct csi_tensor *o_output, + struct csi_tensor *o_kernel, + struct csi_tensor *o_bias, + struct conv2d_params *params) { struct csi_tensor* input; struct csi_tensor* output; struct csi_tensor* kernel; struct csi_tensor* bias = o_bias; - input = csi_nchw_to_nhwc_u8(o_input); - kernel = csi_nchw_to_nhwc_u8(o_kernel); - output = csi_nchw_to_nhwc_u8(o_output); + input = csi_nchw_to_nhwc_8(o_input); + kernel = csi_nchw_to_nhwc_8(o_kernel); + output = csi_nchw_to_nhwc_8(o_output); - uint8_t *input_data = input->data; - uint8_t *output_data = output->data; - uint8_t *kernel_data = kernel->data; + int8_t *input_data = input->data; + int8_t *output_data = output->data; + int8_t *kernel_data = kernel->data; int32_t *bias_data = bias->data; const int32_t dilation_width_factor = params->dilation_width; const int32_t dilation_height_factor = params->dilation_height; @@ -1011,9 +1070,9 @@ static int csi_depthwise_conv2d_relu6_nchw_u8(struct csi_tensor *o_input, const int32_t output_height = output->dim[1]; const int32_t output_width = 
output->dim[2]; const int32_t depth_multiplier = output_depth / input_depth; - const int32_t input_offset = input->offset; - const int32_t filter_offset = kernel->offset; - const int32_t output_offset = output->offset; + const int32_t input_offset = input->zero_point; + const int32_t filter_offset = kernel->zero_point; + const int32_t output_offset = output->zero_point; const int32_t output_multiplier = output->multiplier; const int32_t output_shift = output->shift; @@ -1041,22 +1100,21 @@ static int csi_depthwise_conv2d_relu6_nchw_u8(struct csi_tensor *o_input, int32_t filter_val = kernel_data[csi_get_index( kernel->dim, ic, filter_y, filter_x, m)]; acc += - (filter_val + filter_offset) * (input_val + input_offset); + (filter_val - filter_offset) * (input_val - input_offset); } } } if (bias->dim_count != 0) { acc += bias_data[oc]; } - acc = conv_relu6_out_u8(acc, input, output, kernel); output_data[csi_get_index(output->dim, b, out_y, out_x, oc)] = - acc; + csi_quantize_i8(acc, output_offset, output_multiplier, output_shift); } } } } } - csi_nhwc_to_nchw_u8(o_output, output); + csi_nhwc_to_nchw_8(o_output, output); free(input->data); free(input); free(kernel->data); @@ -1108,11 +1166,11 @@ static int csi_group_conv2d_nchw_u8(struct csi_tensor *o_input, return CSINN_TRUE; } -static int csi_group_conv2d_relu_nchw_u8(struct csi_tensor *o_input, - struct csi_tensor *o_output, - struct csi_tensor *o_kernel, - struct csi_tensor *o_bias, - struct conv2d_params *params) +static int csi_group_conv2d_nchw_i8(struct csi_tensor *o_input, + struct csi_tensor *o_output, + struct csi_tensor *o_kernel, + struct csi_tensor *o_bias, + struct conv2d_params *params) { struct csi_tensor input; struct csi_tensor output; @@ -1138,25 +1196,25 @@ static int csi_group_conv2d_relu_nchw_u8(struct csi_tensor *o_input, kernel_size *= kernel.dim[i]; } - uint8_t *input_data = o_input->data; - uint8_t *output_data = o_output->data; - uint8_t *kernel_data = o_kernel->data; - uint8_t *bias_data = 
o_bias->data; + int8_t *input_data = o_input->data; + int8_t *output_data = o_output->data; + int8_t *kernel_data = o_kernel->data; + int8_t *bias_data = o_bias->data; for (int i = 0; i < params->group; i++) { input.data = input_data + i * input_size; output.data = output_data + i * output_size; kernel.data = kernel_data + i * kernel_size; bias.data = bias_data + i * o_output->dim[1] / params->group; - csi_conv2d_relu_nchw_u8(&input, &output, &kernel, &bias, params); + csi_conv2d_nchw_i8(&input, &output, &kernel, &bias, params); } return CSINN_TRUE; } static int csi_group_conv2d_nhwc_f32(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv2d_params *params) + struct csi_tensor *output, + struct csi_tensor *kernel, + struct csi_tensor *bias, + struct conv2d_params *params) { int input_size = 1; int output_size = 1; @@ -1229,139 +1287,172 @@ static int csi_group_conv2d_nhwc_f32(struct csi_tensor *input, return CSINN_TRUE; } -int csi_conv2d_init(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv2d_params *params) +int csi_conv2d_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct csi_tensor *kernel, + struct csi_tensor *bias, + struct conv2d_params *params) +{ + if (params->layout == CSINN_NHWC) { + csi_conv2d_nhwc_f32(input, output, kernel, bias, params); + } else if (params->layout == CSINN_NCHW) { + csi_conv2d_nchw_f32(input, output, kernel, bias, params); + } else { + return CSINN_UNSUPPORT_LAYOUT; + } +} + +int csi_conv2d_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct csi_tensor *kernel, + struct csi_tensor *bias, + struct conv2d_params *params) { if (params->layout == CSINN_NCHW) { - if (params->group == 1) { - if (input->dtype == CSINN_DTYPE_UINT8) { - params->bc = csi_conv2d_nchw_u8; - } else { - return CSINN_UNSUPPORT_DTYPE; - } - } else if (params->group == input->dim[1]) { - if 
(input->dtype == CSINN_DTYPE_UINT8) { - params->bc = csi_depthwise_conv2d_nchw_u8; - } else { - return CSINN_UNSUPPORT_DTYPE; - } - } else { - if (input->dtype == CSINN_DTYPE_UINT8) { - params->bc = csi_group_conv2d_nchw_u8; - } else { - return CSINN_UNSUPPORT_DTYPE; - } - } + csi_conv2d_nchw_u8(input, output, kernel, bias, params); } else if (params->layout == CSINN_NHWC) { - if (params->group == 1) { - if (input->dtype == CSINN_DTYPE_UINT8) { - params->bc = csi_conv2d_nhwc_u8; - } else if (input->dtype == CSINN_DTYPE_FLOAT32){ - params->bc = csi_conv2d_nhwc_f32; - } - } else if (params->group == input->dim[3]) { - if (input->dtype == CSINN_DTYPE_UINT8) { - params->bc = csi_depthwise_conv2d_nhwc_u8; - } else if (input->dtype == CSINN_DTYPE_FLOAT32) { - params->bc = csi_depthwise_conv2d_nhwc_f32; - } - } else { - if (input->dtype == CSINN_DTYPE_UINT8) { - params->bc = csi_group_conv2d_nhwc_u8; - } else if (input->dtype == CSINN_DTYPE_FLOAT32){ - params->bc = csi_group_conv2d_nhwc_f32; - } - } + csi_conv2d_nhwc_u8(input, output, kernel, bias, params); } else { return CSINN_UNSUPPORT_LAYOUT; } - return CSINN_TRUE; } -int csi_conv2d(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv2d_params *params) +int csi_conv2d_i8(struct csi_tensor *input, + struct csi_tensor *output, + struct csi_tensor *kernel, + struct csi_tensor *bias, + struct conv2d_params *params) { - if (params->bc != NULL) { - params->bc(input, output, kernel, bias, params); + if (params->layout == CSINN_NCHW) { + csi_conv2d_nchw_i8(input, output, kernel, bias, params); + } else if (params->layout == CSINN_NHWC) { + csi_conv2d_nhwc_i8(input, output, kernel, bias, params); } else { - return CSINN_CALLBACK_UNSET; + return CSINN_UNSUPPORT_LAYOUT; } - return CSINN_TRUE; } -int csi_conv2d_relu_init(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv2d_params 
*params) +int csi_depthwise_conv2d_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct csi_tensor *kernel, + struct csi_tensor *bias, + struct conv2d_params *params) +{ + if (params->layout == CSINN_NHWC) { + csi_depthwise_conv2d_nhwc_f32(input, output, kernel, bias, params); + } else if (params->layout == CSINN_NCHW) { + csi_depthwise_conv2d_nchw_f32(input, output, kernel, bias, params); + } else { + return CSINN_UNSUPPORT_LAYOUT; + } +} + +int csi_depthwise_conv2d_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct csi_tensor *kernel, + struct csi_tensor *bias, + struct conv2d_params *params) { if (params->layout == CSINN_NCHW) { - if (params->group == 1) { - if (input->dtype == CSINN_DTYPE_UINT8) { - params->bc = csi_conv2d_relu_nchw_u8; - } else { - return CSINN_UNSUPPORT_DTYPE; - } - } else if (params->group == output->dim[1]) { - if (input->dtype == CSINN_DTYPE_UINT8) { - params->bc = csi_depthwise_conv2d_relu_nchw_u8; - } else { - return CSINN_UNSUPPORT_DTYPE; - } - } else { - if (input->dtype == CSINN_DTYPE_UINT8) { - params->bc = csi_group_conv2d_relu_nchw_u8; - } else { - return CSINN_UNSUPPORT_DTYPE; - } - } + csi_depthwise_conv2d_nchw_u8(input, output, kernel, bias, params); + } else if (params->layout == CSINN_NHWC) { + csi_depthwise_conv2d_nhwc_u8(input, output, kernel, bias, params); } else { return CSINN_UNSUPPORT_LAYOUT; } - return CSINN_TRUE; } -int csi_conv2d_relu(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv2d_params *params) +int csi_depthwise_conv2d_i8(struct csi_tensor *input, + struct csi_tensor *output, + struct csi_tensor *kernel, + struct csi_tensor *bias, + struct conv2d_params *params) { - if (params->bc != NULL) { - params->bc(input, output, kernel, bias, params); + if (params->layout == CSINN_NCHW) { + csi_depthwise_conv2d_nchw_i8(input, output, kernel, bias, params); + } else if (params->layout == CSINN_NHWC) { + 
csi_depthwise_conv2d_nhwc_i8(input, output, kernel, bias, params); } else { - return CSINN_CALLBACK_UNSET; + return CSINN_UNSUPPORT_LAYOUT; + } +} + +int csi_group_conv2d_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct csi_tensor *kernel, + struct csi_tensor *bias, + struct conv2d_params *params) +{ + if (params->layout == CSINN_NHWC) { + csi_group_conv2d_nhwc_f32(input, output, kernel, bias, params); + } else { + return CSINN_UNSUPPORT_LAYOUT; + } +} + +int csi_group_conv2d_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct csi_tensor *kernel, + struct csi_tensor *bias, + struct conv2d_params *params) +{ + if (params->layout == CSINN_NCHW) { + csi_group_conv2d_nchw_u8(input, output, kernel, bias, params); + } else if (params->layout == CSINN_NHWC) { + csi_group_conv2d_nhwc_u8(input, output, kernel, bias, params); + } else { + return CSINN_UNSUPPORT_LAYOUT; } - return CSINN_TRUE; } -int csi_conv2d_relu6_init(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv2d_params *params) +int csi_group_conv2d_i8(struct csi_tensor *input, + struct csi_tensor *output, + struct csi_tensor *kernel, + struct csi_tensor *bias, + struct conv2d_params *params) { if (params->layout == CSINN_NCHW) { + csi_group_conv2d_nchw_i8(input, output, kernel, bias, params); + } else if (params->layout == CSINN_NHWC) { + csi_group_conv2d_nhwc_i8(input, output, kernel, bias, params); + } else { + return CSINN_UNSUPPORT_LAYOUT; + } +} + +int csi_conv2d_init(struct csi_tensor *input, + struct csi_tensor *output, + struct csi_tensor *kernel, + struct csi_tensor *bias, + struct conv2d_params *params) +{ + if (params->wscales != NULL && params->wzps != NULL){ + if (params->layout != CSINN_NCHW){ + return CSINN_UNSUPPORT_DTYPE; + } if (params->group == 1) { - if (input->dtype == CSINN_DTYPE_UINT8) { - params->bc = csi_conv2d_relu6_nchw_u8; - } else { - return CSINN_UNSUPPORT_DTYPE; - } - } else if 
(params->group == output->dim[1]) { - if (input->dtype == CSINN_DTYPE_UINT8) { - params->bc = csi_depthwise_conv2d_relu6_nchw_u8; - } else { + params->bc = csi_bc_map(params->api, CSINN_OP_CONV2D_CHANNEL, input->dtype); + } else if (params->group == input->dim[1]) { + params->bc = csi_bc_map(params->api, CSINN_OP_DEPTHWISE_CONV2D_CHANNEL, input->dtype); + } else { + params->bc = csi_bc_map(params->api, CSINN_OP_GROUP_CONV2D_CHANNEL, input->dtype); + } + if (params->bc == NULL) { return CSINN_UNSUPPORT_DTYPE; } + return CSINN_TRUE; + } + + if (params->layout == CSINN_NCHW || params->layout == CSINN_NHWC) { + if (params->group == 1) { + params->bc = csi_bc_map(params->api, CSINN_OP_CONV2D, input->dtype); + } else if (params->group == input->dim[1] || params->group == input->dim[3]) { + params->bc = csi_bc_map(params->api, CSINN_OP_DEPTHWISE_CONV2D, input->dtype); } else { - return CSINN_FALSE; + params->bc = csi_bc_map(params->api, CSINN_OP_GROUP_CONV2D, input->dtype); + } + if (params->bc == NULL) { + return CSINN_UNSUPPORT_DTYPE; } } else { return CSINN_UNSUPPORT_LAYOUT; @@ -1369,11 +1460,11 @@ int csi_conv2d_relu6_init(struct csi_tensor *input, return CSINN_TRUE; } -int csi_conv2d_relu6(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv2d_params *params) +int csi_conv2d(struct csi_tensor *input, + struct csi_tensor *output, + struct csi_tensor *kernel, + struct csi_tensor *bias, + struct conv2d_params *params) { if (params->bc != NULL) { params->bc(input, output, kernel, bias, params); diff --git a/source/reference/convolution3d.c b/source/reference/convolution3d.c index d4b0fc4c..bafd01af 100644 --- a/source/reference/convolution3d.c +++ b/source/reference/convolution3d.c @@ -19,11 +19,11 @@ #include "csi_nn.h" #include "csi_utils.h" -static int csi_conv3d_ncdhw_f32(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct 
conv3d_params *params) +int csi_conv3d_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct csi_tensor *kernel, + struct csi_tensor *bias, + struct conv3d_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -98,12 +98,11 @@ static int csi_conv3d_ncdhw_f32(struct csi_tensor *input, return CSINN_TRUE; } - -static int csi_conv3d_ncdhw_u8(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv3d_params *params) +int csi_conv3d_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct csi_tensor *kernel, + struct csi_tensor *bias, + struct conv3d_params *params) { uint8_t *input_data = (uint8_t *)input->data; uint8_t *output_data = (uint8_t *)output->data; @@ -132,9 +131,9 @@ static int csi_conv3d_ncdhw_u8(struct csi_tensor *input, const int32_t dilation_height = params->dilation_height; const int32_t dilation_width = params->dilation_width; - const int32_t input_offset = input->offset; - const int32_t filter_offset = kernel->offset; - const int32_t output_offset = output->offset; + const int32_t input_offset = input->zero_point; + const int32_t filter_offset = kernel->zero_point; + const int32_t output_offset = output->zero_point; const int32_t output_multiplier = output->multiplier; const int32_t output_shift = output->shift; @@ -164,7 +163,7 @@ static int csi_conv3d_ncdhw_u8(struct csi_tensor *input, int32_t input_val = input_data[input_idx]; int32_t filter_idx = csi_get_index_5(kernel->dim, out_ch, in_ch, filter_d, filter_h, filter_w); int32_t filter_val = kernel_data[filter_idx]; - acc += (input_val+input_offset) * (filter_val+filter_offset); + acc += (input_val - input_offset) * (filter_val - filter_offset); } } } @@ -186,49 +185,15 @@ static int csi_conv3d_ncdhw_u8(struct csi_tensor *input, } -static int csi_conv3d_ndhwc_f32(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct 
csi_tensor *bias, - struct conv3d_params *params) -{ - float *input_data = (float *)input->data; - float *output_data = (float *)output->data; - - return CSINN_FALSE; -} - - -static int csi_conv3d_ndhwc_u8(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv3d_params *params) -{ - - return CSINN_FALSE; -} - int csi_conv3d_init(struct csi_tensor *input, struct csi_tensor *output, struct csi_tensor *kernel, struct csi_tensor *bias, struct conv3d_params *params) { - if(params->layout == CSINN_NCDHW) { - if(input->dtype == CSINN_DTYPE_UINT8) { - params->bc = csi_conv3d_ncdhw_u8; - } else if(input->dtype == CSINN_DTYPE_FLOAT32) { - params->bc = csi_conv3d_ncdhw_f32; - } else { - return CSINN_UNSUPPORT_DTYPE; - } - } else if(params->layout == CSINN_NDHWC) { - if(input->dtype == CSINN_DTYPE_UINT8) { - params->bc = csi_conv3d_ndhwc_u8; - } else if(input->dtype == CSINN_DTYPE_FLOAT32) { - params->bc = csi_conv3d_ndhwc_f32; - } else { + if (params->layout == CSINN_NCDHW) { + params->bc = csi_bc_map(params->api, CSINN_OP_CONV3D, input->dtype); + if (params->bc == NULL) { return CSINN_UNSUPPORT_DTYPE; } } else { @@ -238,10 +203,10 @@ int csi_conv3d_init(struct csi_tensor *input, } int csi_conv3d(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv3d_params *params) + struct csi_tensor *output, + struct csi_tensor *kernel, + struct csi_tensor *bias, + struct conv3d_params *params) { if(params->bc != NULL) { params->bc(input, output, kernel, bias, params); diff --git a/source/reference/convolution_channel.c b/source/reference/convolution_channel.c new file mode 100644 index 00000000..8b0304d0 --- /dev/null +++ b/source/reference/convolution_channel.c @@ -0,0 +1,953 @@ +/* + * Copyright (C) 2016-2020 C-SKY Limited. All rights reserved. 
+ * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "csi_nn.h" +#include "csi_utils.h" + +/* reference https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/kernels/internal/reference/conv.h */ + + +static int csi_conv2d_channel_nchw_u8(struct csi_tensor *o_input, + struct csi_tensor *o_output, + struct csi_tensor *o_kernel, + struct csi_tensor *o_bias, + struct conv2d_params *params) +{ +#ifdef CSI_AVX_OPT + float *float_input_data; + float *float_kernel_data; + float *float_bias_data; + float *float_output_data; + struct csi_tensor float_input; + struct csi_tensor float_kernel; + struct csi_tensor float_bias; + struct csi_tensor float_output; + uint8_t *input_data = o_input->data; + uint8_t *kernel_data = o_kernel->data; + int32_t *bias_data = o_bias->data; + uint8_t *output_data = o_output->data; + int input_size = 1; + int kernel_size = 1; + int output_size = 1; + + for (int i = 0; i < o_input->dim_count; i++) { + input_size *= o_input->dim[i]; + } + for (int i = 0; i < o_kernel->dim_count; i++) { + kernel_size *= o_kernel->dim[i]; + } + for (int i = 0; i < o_output->dim_count; i++) { + output_size *= o_output->dim[i]; + } + int bias_size = o_output->dim[1]; + + memcpy(&float_input, o_input, sizeof(struct csi_tensor)); + memcpy(&float_kernel, o_kernel, sizeof(struct csi_tensor)); + memcpy(&float_bias, o_bias, sizeof(struct csi_tensor)); + memcpy(&float_output, o_output, sizeof(struct 
csi_tensor)); + float_input_data = malloc(input_size * sizeof(float)); + float_output_data = malloc(output_size * sizeof(float)); + float_kernel_data = malloc(kernel_size * sizeof(float)); + float_bias_data = malloc(bias_size * sizeof(float)); + float_input.dtype = CSINN_DTYPE_FLOAT32; + float_input.data = float_input_data; + float_kernel.data = float_kernel_data; + float_bias.data = float_bias_data; + float_output.data = float_output_data; + + for (int i = 0; i < input_size; i++) { + float_input_data[i] = uint8_to_float(input_data[i], o_input); + } + for (int i = 0; i < o_kernel->dim[0]; i++) { + int per_cahnnel = kernel_size / o_kernel->dim[0]; + for (int j = 0; j < per_cahnnel; j++) { + int index = i * per_cahnnel + j; + float_kernel_data[index] = uint8_to_float_channel(kernel_data[index], + params->wscales[i], params->wzps[i]); + } + } + for (int i = 0; i < bias_size; i++) { + float_bias_data[i] = bias_data[i] * params->wscales[i] * o_input->scale; + } + params->wscales = NULL; + params->wzps = NULL; + csi_conv2d_init(&float_input, &float_output, &float_kernel, &float_bias, params); + csi_conv2d(&float_input, &float_output, &float_kernel, &float_bias, params); + for (int i = 0; i < output_size; i++) { + output_data[i] = float_to_uint8(float_output_data[i], o_output); + } + free(float_input_data); + free(float_kernel_data); + free(float_bias_data); + free(float_output_data); +#else + struct csi_tensor* input; + struct csi_tensor* output; + struct csi_tensor* kernel; + struct csi_tensor* bias = o_bias; + input = csi_nchw_to_nhwc_8(o_input); + kernel = csi_nchw_to_nhwc_8(o_kernel); + output = csi_nchw_to_nhwc_8(o_output); + + uint8_t *input_data = input->data; + uint8_t *output_data = output->data; + uint8_t *kernel_data = kernel->data; + int32_t *bias_data = bias->data; + const int32_t dilation_width_factor = params->dilation_width; + const int32_t dilation_height_factor = params->dilation_height; + const int32_t input_offset = input->zero_point; + const int32_t 
*filter_offset = params->wzps; + const float *filter_scales = params->wscales; + const int32_t output_offset = output->zero_point; + const int32_t output_multiplier = output->multiplier; + const int32_t output_shift = output->shift; + + const int32_t batches = input->dim[0]; + const int32_t input_depth = input->dim[3]; + const int32_t output_depth = output->dim[3]; + const int32_t input_height = input->dim[1]; + const int32_t input_width = input->dim[2]; + const int32_t filter_height = kernel->dim[1]; + const int32_t filter_width = kernel->dim[2]; + const int32_t output_height = output->dim[1]; + const int32_t output_width = output->dim[2]; + + for (int32_t batch = 0; batch < batches; ++batch) { + #pragma omp parallel for num_threads(8) + for (int32_t out_y = 0; out_y < output_height; ++out_y) { + for (int32_t out_x = 0; out_x < output_width; ++out_x) { + for (int32_t out_channel = 0; out_channel < output_depth; ++out_channel) { + const int32_t in_x_origin = (out_x * params->stride_width) - params->pad_left; + const int32_t in_y_origin = (out_y * params->stride_height) - params->pad_top; + int64_t acc = 0; + for (int32_t filter_y = 0; filter_y < filter_height; ++filter_y) { + for (int32_t filter_x = 0; filter_x < filter_width; ++filter_x) { + for (int32_t in_channel = 0; in_channel < input_depth; ++in_channel) { + const int32_t in_x = in_x_origin + dilation_width_factor * filter_x; + const int32_t in_y = in_y_origin + dilation_height_factor * filter_y; + // If the location is outside the bounds of the input image, + // use zero as a default value. 
+ if ((in_x >= 0) && (in_x < input_width) && (in_y >= 0) && + (in_y < input_height)) { + int32_t input_index = + csi_get_index(input->dim, batch, in_y, in_x, in_channel); + int32_t input_val = input_data[input_index]; + int32_t filter_index = csi_get_index( + kernel->dim, out_channel, filter_y, filter_x, in_channel); + int32_t filter_val = kernel_data[filter_index]; + acc += + (filter_val - filter_offset[out_channel]) * (input_val - input_offset); + } + } + } + } + if (bias->dim_count != 0) { + acc += bias_data[out_channel]; + } + acc = csi_quantize_channel_u8(acc, input, output, filter_scales[out_channel]); + output_data[csi_get_index(output->dim, batch, out_y, out_x, out_channel)] = acc; + } + } + } + } + csi_nhwc_to_nchw_8(o_output, output); + free(input->data); + free(input); + free(kernel->data); + free(kernel); +#endif + return CSINN_TRUE; +} + + +static int csi_conv2d_channel_relu_nchw_u8(struct csi_tensor *o_input, + struct csi_tensor *o_output, + struct csi_tensor *o_kernel, + struct csi_tensor *o_bias, + struct conv2d_params *params) +{ +#ifdef CSI_AVX_OPT + float *float_input_data; + float *float_kernel_data; + float *float_bias_data; + float *float_output_data; + struct csi_tensor float_input; + struct csi_tensor float_kernel; + struct csi_tensor float_bias; + struct csi_tensor float_output; + uint8_t *input_data = o_input->data; + uint8_t *kernel_data = o_kernel->data; + int32_t *bias_data = o_bias->data; + uint8_t *output_data = o_output->data; + int input_size = 1; + int kernel_size = 1; + int output_size = 1; + + for (int i = 0; i < o_input->dim_count; i++) { + input_size *= o_input->dim[i]; + } + for (int i = 0; i < o_kernel->dim_count; i++) { + kernel_size *= o_kernel->dim[i]; + } + for (int i = 0; i < o_output->dim_count; i++) { + output_size *= o_output->dim[i]; + } + int bias_size = o_output->dim[1]; + + memcpy(&float_input, o_input, sizeof(struct csi_tensor)); + memcpy(&float_kernel, o_kernel, sizeof(struct csi_tensor)); + memcpy(&float_bias, 
o_bias, sizeof(struct csi_tensor)); + memcpy(&float_output, o_output, sizeof(struct csi_tensor)); + float_input_data = malloc(input_size * sizeof(float)); + float_output_data = malloc(output_size * sizeof(float)); + float_kernel_data = malloc(kernel_size * sizeof(float)); + float_bias_data = malloc(bias_size * sizeof(float)); + float_input.dtype = CSINN_DTYPE_FLOAT32; + float_input.data = float_input_data; + float_kernel.data = float_kernel_data; + float_bias.data = float_bias_data; + float_output.data = float_output_data; + + for (int i = 0; i < input_size; i++) { + float_input_data[i] = uint8_to_float(input_data[i], o_input); + } + for (int i = 0; i < o_kernel->dim[0]; i++) { + int per_cahnnel = kernel_size / o_kernel->dim[0]; + for (int j = 0; j < per_cahnnel; j++) { + int index = i * per_cahnnel + j; + float_kernel_data[index] = uint8_to_float_channel(kernel_data[index], + params->wscales[i], params->wzps[i]); + } + } + for (int i = 0; i < bias_size; i++) { + float_bias_data[i] = bias_data[i] * params->wscales[i] * o_input->scale; + } + params->wscales = NULL; + params->wzps = NULL; + csi_conv2d_init(&float_input, &float_output, &float_kernel, &float_bias, params); + csi_conv2d(&float_input, &float_output, &float_kernel, &float_bias, params); + + for (int i = 0; i < output_size; i++) { + if (float_output_data[i] < 0) { + float_output_data[i] = 0; + } + output_data[i] = float_to_uint8(float_output_data[i], o_output); + } + free(float_input_data); + free(float_kernel_data); + free(float_bias_data); + free(float_output_data); +#else + struct csi_tensor* input; + struct csi_tensor* output; + struct csi_tensor* kernel; + struct csi_tensor* bias = o_bias; + input = csi_nchw_to_nhwc_8(o_input); + kernel = csi_nchw_to_nhwc_8(o_kernel); + output = csi_nchw_to_nhwc_8(o_output); + + uint8_t *input_data = input->data; + uint8_t *output_data = output->data; + uint8_t *kernel_data = kernel->data; + int32_t *bias_data = bias->data; + const int32_t dilation_width_factor = 
params->dilation_width; + const int32_t dilation_height_factor = params->dilation_height; + const int32_t input_offset = input->zero_point; + const int32_t *filter_offset = params->wzps; + const float *filter_scales = params->wscales; + const int32_t output_offset = output->zero_point; + const int32_t output_multiplier = output->multiplier; + const int32_t output_shift = output->shift; + + const int32_t batches = input->dim[0]; + const int32_t input_depth = input->dim[3]; + const int32_t output_depth = output->dim[3]; + const int32_t input_height = input->dim[1]; + const int32_t input_width = input->dim[2]; + const int32_t filter_height = kernel->dim[1]; + const int32_t filter_width = kernel->dim[2]; + const int32_t output_height = output->dim[1]; + const int32_t output_width = output->dim[2]; + + for (int32_t batch = 0; batch < batches; ++batch) { + #pragma omp parallel for num_threads(8) + for (int32_t out_y = 0; out_y < output_height; ++out_y) { + for (int32_t out_x = 0; out_x < output_width; ++out_x) { + for (int32_t out_channel = 0; out_channel < output_depth; ++out_channel) { + const int32_t in_x_origin = (out_x * params->stride_width) - params->pad_left; + const int32_t in_y_origin = (out_y * params->stride_height) - params->pad_top; + int64_t acc = 0; + for (int32_t filter_y = 0; filter_y < filter_height; ++filter_y) { + for (int32_t filter_x = 0; filter_x < filter_width; ++filter_x) { + for (int32_t in_channel = 0; in_channel < input_depth; ++in_channel) { + const int32_t in_x = in_x_origin + dilation_width_factor * filter_x; + const int32_t in_y = in_y_origin + dilation_height_factor * filter_y; + // If the location is outside the bounds of the input image, + // use zero as a default value. 
+ if ((in_x >= 0) && (in_x < input_width) && (in_y >= 0) && + (in_y < input_height)) { + int32_t input_index = + csi_get_index(input->dim, batch, in_y, in_x, in_channel); + int32_t input_val = input_data[input_index]; + int32_t filter_index = csi_get_index( + kernel->dim, out_channel, filter_y, filter_x, in_channel); + int32_t filter_val = kernel_data[filter_index]; + acc += + (filter_val - filter_offset[out_channel]) * (input_val - input_offset); + } + } + } + } + if (bias->dim_count != 0) { + acc += bias_data[out_channel]; + } + acc = conv_channel_out_u8(acc, input, output, filter_scales[out_channel]); + output_data[csi_get_index(output->dim, batch, out_y, out_x, out_channel)] = acc; + } + } + } + } + csi_nhwc_to_nchw_8(o_output, output); + free(input->data); + free(input); + free(kernel->data); + free(kernel); +#endif + return CSINN_TRUE; +} + + +static int csi_conv2d_channel_relu6_nchw_u8(struct csi_tensor *o_input, + struct csi_tensor *o_output, + struct csi_tensor *o_kernel, + struct csi_tensor *o_bias, + struct conv2d_params *params) +{ +#ifdef CSI_AVX_OPT + float *float_input_data; + float *float_kernel_data; + float *float_bias_data; + float *float_output_data; + struct csi_tensor float_input; + struct csi_tensor float_kernel; + struct csi_tensor float_bias; + struct csi_tensor float_output; + uint8_t *input_data = o_input->data; + uint8_t *kernel_data = o_kernel->data; + int32_t *bias_data = o_bias->data; + uint8_t *output_data = o_output->data; + int input_size = 1; + int kernel_size = 1; + int output_size = 1; + + for (int i = 0; i < o_input->dim_count; i++) { + input_size *= o_input->dim[i]; + } + for (int i = 0; i < o_kernel->dim_count; i++) { + kernel_size *= o_kernel->dim[i]; + } + for (int i = 0; i < o_output->dim_count; i++) { + output_size *= o_output->dim[i]; + } + int bias_size = o_output->dim[1]; + + memcpy(&float_input, o_input, sizeof(struct csi_tensor)); + memcpy(&float_kernel, o_kernel, sizeof(struct csi_tensor)); + memcpy(&float_bias, 
o_bias, sizeof(struct csi_tensor)); + memcpy(&float_output, o_output, sizeof(struct csi_tensor)); + float_input_data = malloc(input_size * sizeof(float)); + float_output_data = malloc(output_size * sizeof(float)); + float_kernel_data = malloc(kernel_size * sizeof(float)); + float_bias_data = malloc(bias_size * sizeof(float)); + float_input.dtype = CSINN_DTYPE_FLOAT32; + float_input.data = float_input_data; + float_kernel.data = float_kernel_data; + float_bias.data = float_bias_data; + float_output.data = float_output_data; + + for (int i = 0; i < input_size; i++) { + float_input_data[i] = uint8_to_float(input_data[i], o_input); + } + for (int i = 0; i < o_kernel->dim[0]; i++) { + int per_cahnnel = kernel_size / o_kernel->dim[0]; + for (int j = 0; j < per_cahnnel; j++) { + int index = i * per_cahnnel + j; + float_kernel_data[index] = uint8_to_float_channel(kernel_data[index], + params->wscales[i], params->wzps[i]); + } + } + for (int i = 0; i < bias_size; i++) { + float_bias_data[i] = bias_data[i] * params->wscales[i] * o_input->scale; + } + params->wscales = NULL; + params->wzps = NULL; + csi_conv2d_init(&float_input, &float_output, &float_kernel, &float_bias, params); + csi_conv2d(&float_input, &float_output, &float_kernel, &float_bias, params); + + for (int i = 0; i < output_size; i++) { + if (float_output_data[i] < 0) { + float_output_data[i] = 0; + }else if (float_output_data[i] > 6) { + float_output_data[i] = 6; + } + output_data[i] = float_to_uint8(float_output_data[i], o_output); + } + free(float_input_data); + free(float_kernel_data); + free(float_bias_data); + free(float_output_data); +#else + struct csi_tensor* input; + struct csi_tensor* output; + struct csi_tensor* kernel; + struct csi_tensor* bias = o_bias; + input = csi_nchw_to_nhwc_8(o_input); + kernel = csi_nchw_to_nhwc_8(o_kernel); + output = csi_nchw_to_nhwc_8(o_output); + + uint8_t *input_data = input->data; + uint8_t *output_data = output->data; + uint8_t *kernel_data = kernel->data; + int32_t 
*bias_data = bias->data; + const int32_t dilation_width_factor = params->dilation_width; + const int32_t dilation_height_factor = params->dilation_height; + const int32_t input_offset = input->zero_point; + const int32_t *filter_offset = params->wzps; + const float *filter_scales = params->wscales; + const int32_t output_offset = output->zero_point; + const int32_t output_multiplier = output->multiplier; + const int32_t output_shift = output->shift; + + const int32_t batches = input->dim[0]; + const int32_t input_depth = input->dim[3]; + const int32_t output_depth = output->dim[3]; + const int32_t input_height = input->dim[1]; + const int32_t input_width = input->dim[2]; + const int32_t filter_height = kernel->dim[1]; + const int32_t filter_width = kernel->dim[2]; + const int32_t output_height = output->dim[1]; + const int32_t output_width = output->dim[2]; + + for (int32_t batch = 0; batch < batches; ++batch) { + #pragma omp parallel for num_threads(8) + for (int32_t out_y = 0; out_y < output_height; ++out_y) { + for (int32_t out_x = 0; out_x < output_width; ++out_x) { + for (int32_t out_channel = 0; out_channel < output_depth; ++out_channel) { + const int32_t in_x_origin = (out_x * params->stride_width) - params->pad_left; + const int32_t in_y_origin = (out_y * params->stride_height) - params->pad_top; + int64_t acc = 0; + for (int32_t filter_y = 0; filter_y < filter_height; ++filter_y) { + for (int32_t filter_x = 0; filter_x < filter_width; ++filter_x) { + for (int32_t in_channel = 0; in_channel < input_depth; ++in_channel) { + const int32_t in_x = in_x_origin + dilation_width_factor * filter_x; + const int32_t in_y = in_y_origin + dilation_height_factor * filter_y; + // If the location is outside the bounds of the input image, + // use zero as a default value. 
+ if ((in_x >= 0) && (in_x < input_width) && (in_y >= 0) && + (in_y < input_height)) { + int32_t input_index = + csi_get_index(input->dim, batch, in_y, in_x, in_channel); + int32_t input_val = input_data[input_index]; + int32_t filter_index = csi_get_index( + kernel->dim, out_channel, filter_y, filter_x, in_channel); + int32_t filter_val = kernel_data[filter_index]; + acc += + (filter_val - filter_offset[out_channel]) * (input_val - input_offset); + } + } + } + } + if (bias->dim_count != 0) { + acc += bias_data[out_channel]; + } + acc = conv_channel_relu6_u8(acc, input, output, filter_scales[out_channel]); + output_data[csi_get_index(output->dim, batch, out_y, out_x, out_channel)] = acc; + } + } + } + } + csi_nhwc_to_nchw_8(o_output, output); + free(input->data); + free(input); + free(kernel->data); + free(kernel); +#endif + return CSINN_TRUE; +} + + +static int csi_depthwise_conv2d_channel_nchw_u8(struct csi_tensor *o_input, + struct csi_tensor *o_output, + struct csi_tensor *o_kernel, + struct csi_tensor *o_bias, + struct conv2d_params *params) +{ + struct csi_tensor* input; + struct csi_tensor* output; + struct csi_tensor* kernel; + struct csi_tensor* bias = o_bias; + input = csi_nchw_to_nhwc_8(o_input); + kernel = csi_nchw_to_nhwc_8(o_kernel); + output = csi_nchw_to_nhwc_8(o_output); + + uint8_t *input_data = input->data; + uint8_t *output_data = output->data; + uint8_t *kernel_data = kernel->data; + int32_t *bias_data = bias->data; + const int32_t dilation_width_factor = params->dilation_width; + const int32_t dilation_height_factor = params->dilation_height; + const int32_t batches = input->dim[0]; + const int32_t input_depth = input->dim[3]; + const int32_t output_depth = output->dim[3]; + const int32_t input_height = input->dim[1]; + const int32_t input_width = input->dim[2]; + const int32_t filter_height = kernel->dim[1]; + const int32_t filter_width = kernel->dim[2]; + const int32_t output_height = output->dim[1]; + const int32_t output_width = 
output->dim[2]; + const int32_t depth_multiplier = output_depth / input_depth; + const int32_t input_offset = input->zero_point; + const int32_t *filter_offset = params->wzps; + const float *filter_scales = params->wscales; + const int32_t output_offset = output->zero_point; + const int32_t output_multiplier = output->multiplier; + const int32_t output_shift = output->shift; + + for (int32_t b = 0; b < batches; ++b) { + #pragma omp parallel for num_threads(8) + for (int32_t out_y = 0; out_y < output_height; ++out_y) { + for (int32_t out_x = 0; out_x < output_width; ++out_x) { + for (int32_t ic = 0; ic < input_depth; ++ic) { + for (int32_t m = 0; m < depth_multiplier; m++) { + const int32_t oc = m + ic * depth_multiplier; + const int32_t in_x_origin = (out_x * params->stride_width) - params->pad_left; + const int32_t in_y_origin = (out_y * params->stride_height) - params->pad_top; + int64_t acc = 0; + for (int32_t filter_y = 0; filter_y < filter_height; ++filter_y) { + for (int32_t filter_x = 0; filter_x < filter_width; ++filter_x) { + const int32_t in_x = in_x_origin + dilation_width_factor * filter_x; + const int32_t in_y = + in_y_origin + dilation_height_factor * filter_y; + // If the location is outside the bounds of the input image, + // use zero as a default value. 
+ if ((in_x >= 0) && (in_x < input_width) && (in_y >= 0) && + (in_y < input_height)) { + int32_t input_val = + input_data[csi_get_index(input->dim, b, in_y, in_x, ic)]; + int32_t filter_val = kernel_data[csi_get_index( + kernel->dim, ic, filter_y, filter_x, m)]; + acc += + (filter_val - filter_offset[oc]) * (input_val - input_offset); + } + } + } + if (bias->dim_count != 0) { + acc += bias_data[oc]; + } + + uint8_t out = csi_quantize_channel_u8(acc, input, output, filter_scales[oc]); + output_data[csi_get_index(output->dim, b, out_y, out_x, oc)] = out; + } + } + } + } + } + csi_nhwc_to_nchw_8(o_output, output); + free(input->data); + free(input); + free(kernel->data); + free(kernel); + return CSINN_TRUE; +} + + +static int csi_depthwise_conv2d_channel_relu_nchw_u8(struct csi_tensor *o_input, + struct csi_tensor *o_output, + struct csi_tensor *o_kernel, + struct csi_tensor *o_bias, + struct conv2d_params *params) +{ + struct csi_tensor* input; + struct csi_tensor* output; + struct csi_tensor* kernel; + struct csi_tensor* bias = o_bias; + input = csi_nchw_to_nhwc_8(o_input); + kernel = csi_nchw_to_nhwc_8(o_kernel); + output = csi_nchw_to_nhwc_8(o_output); + + uint8_t *input_data = input->data; + uint8_t *output_data = output->data; + uint8_t *kernel_data = kernel->data; + int32_t *bias_data = bias->data; + const int32_t dilation_width_factor = params->dilation_width; + const int32_t dilation_height_factor = params->dilation_height; + const int32_t batches = input->dim[0]; + const int32_t input_depth = input->dim[3]; + const int32_t output_depth = output->dim[3]; + const int32_t input_height = input->dim[1]; + const int32_t input_width = input->dim[2]; + const int32_t filter_height = kernel->dim[1]; + const int32_t filter_width = kernel->dim[2]; + const int32_t output_height = output->dim[1]; + const int32_t output_width = output->dim[2]; + const int32_t depth_multiplier = output_depth / input_depth; + const int32_t input_offset = input->zero_point; + const int32_t 
*filter_offset = params->wzps; + const float *filter_scales = params->wscales; + const int32_t output_offset = output->zero_point; + const int32_t output_multiplier = output->multiplier; + const int32_t output_shift = output->shift; + + for (int32_t b = 0; b < batches; ++b) { + #pragma omp parallel for num_threads(8) + for (int32_t out_y = 0; out_y < output_height; ++out_y) { + for (int32_t out_x = 0; out_x < output_width; ++out_x) { + for (int32_t ic = 0; ic < input_depth; ++ic) { + for (int32_t m = 0; m < depth_multiplier; m++) { + const int32_t oc = m + ic * depth_multiplier; + const int32_t in_x_origin = (out_x * params->stride_width) - params->pad_left; + const int32_t in_y_origin = (out_y * params->stride_height) - params->pad_top; + int64_t acc = 0; + for (int32_t filter_y = 0; filter_y < filter_height; ++filter_y) { + for (int32_t filter_x = 0; filter_x < filter_width; ++filter_x) { + const int32_t in_x = in_x_origin + dilation_width_factor * filter_x; + const int32_t in_y = + in_y_origin + dilation_height_factor * filter_y; + // If the location is outside the bounds of the input image, + // use zero as a default value. 
+ if ((in_x >= 0) && (in_x < input_width) && (in_y >= 0) && + (in_y < input_height)) { + int32_t input_val = + input_data[csi_get_index(input->dim, b, in_y, in_x, ic)]; + int32_t filter_val = kernel_data[csi_get_index( + kernel->dim, ic, filter_y, filter_x, m)]; + acc += + (filter_val - filter_offset[ic]) * (input_val - input_offset); + } + } + } + if (bias->dim_count != 0) { + acc += bias_data[oc]; + } + acc = conv_channel_out_u8(acc, input, output, filter_scales[ic]); + output_data[csi_get_index(output->dim, b, out_y, out_x, oc)] = acc; + } + } + } + } + } + csi_nhwc_to_nchw_8(o_output, output); + free(input->data); + free(input); + free(kernel->data); + free(kernel); + return CSINN_TRUE; +} + + +static int csi_depthwise_conv2d_channel_relu6_nchw_u8(struct csi_tensor *o_input, + struct csi_tensor *o_output, + struct csi_tensor *o_kernel, + struct csi_tensor *o_bias, + struct conv2d_params *params) +{ + struct csi_tensor* input; + struct csi_tensor* output; + struct csi_tensor* kernel; + struct csi_tensor* bias = o_bias; + input = csi_nchw_to_nhwc_8(o_input); + kernel = csi_nchw_to_nhwc_8(o_kernel); + output = csi_nchw_to_nhwc_8(o_output); + + uint8_t *input_data = input->data; + uint8_t *output_data = output->data; + uint8_t *kernel_data = kernel->data; + int32_t *bias_data = bias->data; + const int32_t dilation_width_factor = params->dilation_width; + const int32_t dilation_height_factor = params->dilation_height; + const int32_t batches = input->dim[0]; + const int32_t input_depth = input->dim[3]; + const int32_t output_depth = output->dim[3]; + const int32_t input_height = input->dim[1]; + const int32_t input_width = input->dim[2]; + const int32_t filter_height = kernel->dim[1]; + const int32_t filter_width = kernel->dim[2]; + const int32_t output_height = output->dim[1]; + const int32_t output_width = output->dim[2]; + const int32_t depth_multiplier = output_depth / input_depth; + const int32_t input_offset = input->zero_point; + const int32_t *filter_offset 
= params->wzps; + const float *filter_scales = params->wscales; + const int32_t output_offset = output->zero_point; + const int32_t output_multiplier = output->multiplier; + const int32_t output_shift = output->shift; + + for (int32_t b = 0; b < batches; ++b) { + #pragma omp parallel for num_threads(8) + for (int32_t out_y = 0; out_y < output_height; ++out_y) { + for (int32_t out_x = 0; out_x < output_width; ++out_x) { + for (int32_t ic = 0; ic < input_depth; ++ic) { + for (int32_t m = 0; m < depth_multiplier; m++) { + const int32_t oc = m + ic * depth_multiplier; + const int32_t in_x_origin = (out_x * params->stride_width) - params->pad_left; + const int32_t in_y_origin = (out_y * params->stride_height) - params->pad_top; + int64_t acc = 0; + for (int32_t filter_y = 0; filter_y < filter_height; ++filter_y) { + for (int32_t filter_x = 0; filter_x < filter_width; ++filter_x) { + const int32_t in_x = in_x_origin + dilation_width_factor * filter_x; + const int32_t in_y = + in_y_origin + dilation_height_factor * filter_y; + // If the location is outside the bounds of the input image, + // use zero as a default value. 
+ if ((in_x >= 0) && (in_x < input_width) && (in_y >= 0) && + (in_y < input_height)) { + int32_t input_val = + input_data[csi_get_index(input->dim, b, in_y, in_x, ic)]; + int32_t filter_val = kernel_data[csi_get_index( + kernel->dim, ic, filter_y, filter_x, m)]; + acc += + (filter_val - filter_offset[oc]) * (input_val - input_offset); + } + } + } + if (bias->dim_count != 0) { + acc += bias_data[oc]; + } + acc = conv_channel_relu6_u8(acc, input, output, filter_scales[oc]); + output_data[csi_get_index(output->dim, b, out_y, out_x, oc)] = acc; + } + } + } + } + } + csi_nhwc_to_nchw_8(o_output, output); + free(input->data); + free(input); + free(kernel->data); + free(kernel); + return CSINN_TRUE; +} + + +static int csi_group_conv2d_channel_nchw_u8(struct csi_tensor *o_input, + struct csi_tensor *o_output, + struct csi_tensor *o_kernel, + struct csi_tensor *o_bias, + struct conv2d_params *params) +{ + struct csi_tensor input; + struct csi_tensor output; + struct csi_tensor kernel; + struct csi_tensor bias; + struct conv2d_params pparams; + + + memcpy(&input, o_input, sizeof(struct csi_tensor)); + memcpy(&output, o_output, sizeof(struct csi_tensor)); + memcpy(&kernel, o_kernel, sizeof(struct csi_tensor)); + memcpy(&bias, o_bias, sizeof(struct csi_tensor)); + memcpy(&pparams, params, sizeof(struct conv2d_params)); + + input.dim[1] /= params->group; + output.dim[1] /= params->group; + kernel.dim[0] /= params->group; + + pparams.group = 1; + int input_size = 1; + int output_size = 1; + int kernel_size = 1; + + for (int i = 0; i < input.dim_count; i++) { + input_size *= input.dim[i]; + output_size *= output.dim[i]; + kernel_size *= kernel.dim[i]; + } + + uint8_t *input_data = o_input->data; + uint8_t *output_data = o_output->data; + uint8_t *kernel_data = o_kernel->data; + int32_t *bias_data = o_bias->data; + for (int i = 0; i < params->group; i++) { + input.data = input_data + i * input_size; + output.data = output_data + i * output_size; + kernel.data = kernel_data + i * 
kernel_size; + bias.data = bias_data + i * o_output->dim[1] / params->group; + pparams.wscales = params->wscales + i * o_output->dim[1] / params->group; + pparams.wzps = params->wzps + i * o_output->dim[1] / params->group; + + csi_conv2d_channel_nchw_u8(&input, &output, &kernel, &bias, &pparams); + } + return CSINN_TRUE; +} + + +static int csi_group_conv2d_channel_relu_nchw_u8(struct csi_tensor *o_input, + struct csi_tensor *o_output, + struct csi_tensor *o_kernel, + struct csi_tensor *o_bias, + struct conv2d_params *params) +{ + struct csi_tensor input; + struct csi_tensor output; + struct csi_tensor kernel; + struct csi_tensor bias; + struct conv2d_params pparams; + + + memcpy(&input, o_input, sizeof(struct csi_tensor)); + memcpy(&output, o_output, sizeof(struct csi_tensor)); + memcpy(&kernel, o_kernel, sizeof(struct csi_tensor)); + memcpy(&bias, o_bias, sizeof(struct csi_tensor)); + memcpy(&pparams, params, sizeof(struct conv2d_params)); + + input.dim[1] /= params->group; + output.dim[1] /= params->group; + kernel.dim[0] /= params->group; + + pparams.group = 1; + int input_size = 1; + int output_size = 1; + int kernel_size = 1; + + for (int i = 0; i < input.dim_count; i++) { + input_size *= input.dim[i]; + output_size *= output.dim[i]; + kernel_size *= kernel.dim[i]; + } + + uint8_t *input_data = o_input->data; + uint8_t *output_data = o_output->data; + uint8_t *kernel_data = o_kernel->data; + int32_t *bias_data = o_bias->data; + for (int i = 0; i < params->group; i++) { + input.data = input_data + i * input_size; + output.data = output_data + i * output_size; + kernel.data = kernel_data + i * kernel_size; + bias.data = bias_data + i * o_output->dim[1] / params->group; + pparams.wscales = params->wscales + i * o_output->dim[1] / params->group; + pparams.wzps = params->wzps + i * o_output->dim[1] / params->group; + csi_conv2d_channel_relu_nchw_u8(&input, &output, &kernel, &bias, &pparams); + } + return CSINN_TRUE; +} + +int csi_conv2d_channel_u8(struct csi_tensor 
*input, + struct csi_tensor *output, + struct csi_tensor *kernel, + struct csi_tensor *bias, + struct conv2d_params *params) +{ + if (params->layout == CSINN_NCHW) { + csi_conv2d_channel_nchw_u8(input, output, kernel, bias, params); + } else { + return CSINN_UNSUPPORT_LAYOUT; + } +} + + +int csi_conv2d_channel_relu_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct csi_tensor *kernel, + struct csi_tensor *bias, + struct conv2d_params *params) +{ + if (params->layout == CSINN_NCHW) { + csi_conv2d_channel_relu_nchw_u8(input, output, kernel, bias, params); + } else { + return CSINN_UNSUPPORT_LAYOUT; + } +} + +int csi_conv2d_channel_relu6_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct csi_tensor *kernel, + struct csi_tensor *bias, + struct conv2d_params *params) +{ + if (params->layout == CSINN_NCHW) { + csi_conv2d_channel_relu6_nchw_u8(input, output, kernel, bias, params); + } else { + return CSINN_UNSUPPORT_LAYOUT; + } +} + + +int csi_depthwise_conv2d_channel_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct csi_tensor *kernel, + struct csi_tensor *bias, + struct conv2d_params *params) +{ + if (params->layout == CSINN_NCHW) { + csi_depthwise_conv2d_channel_nchw_u8(input, output, kernel, bias, params); + } else { + return CSINN_UNSUPPORT_LAYOUT; + } +} + +int csi_depthwise_conv2d_channel_relu_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct csi_tensor *kernel, + struct csi_tensor *bias, + struct conv2d_params *params) +{ + if (params->layout == CSINN_NCHW) { + csi_depthwise_conv2d_channel_relu_nchw_u8(input, output, kernel, bias, params); + } else { + return CSINN_UNSUPPORT_LAYOUT; + } +} + +int csi_depthwise_conv2d_channel_relu6_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct csi_tensor *kernel, + struct csi_tensor *bias, + struct conv2d_params *params) +{ + if (params->layout == CSINN_NCHW) { + csi_depthwise_conv2d_channel_relu6_nchw_u8(input, output, kernel, bias, params); + } 
else { + return CSINN_UNSUPPORT_LAYOUT; + } +} + +int csi_group_conv2d_channel_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct csi_tensor *kernel, + struct csi_tensor *bias, + struct conv2d_params *params) +{ + if (params->layout == CSINN_NCHW) { + csi_group_conv2d_channel_nchw_u8(input, output, kernel, bias, params); + } else { + return CSINN_UNSUPPORT_LAYOUT; + } +} + +int csi_group_conv2d_channel_relu_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct csi_tensor *kernel, + struct csi_tensor *bias, + struct conv2d_params *params) +{ + if (params->layout == CSINN_NCHW) { + csi_group_conv2d_channel_relu_nchw_u8(input, output, kernel, bias, params); + } else { + return CSINN_UNSUPPORT_LAYOUT; + } +} \ No newline at end of file diff --git a/source/reference/convolution_relu.c b/source/reference/convolution_relu.c new file mode 100644 index 00000000..3e90f091 --- /dev/null +++ b/source/reference/convolution_relu.c @@ -0,0 +1,635 @@ +/* + * Copyright (C) 2016-2020 C-SKY Limited. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "csi_nn.h" +#include "csi_utils.h" + +int csi_conv2d_relu_u8(struct csi_tensor *o_input, + struct csi_tensor *o_output, + struct csi_tensor *o_kernel, + struct csi_tensor *o_bias, + struct conv2d_params *params) +{ +#ifdef CSI_AVX_OPT + float *float_input_data; + float *float_kernel_data; + float *float_bias_data; + float *float_output_data; + struct csi_tensor float_input; + struct csi_tensor float_kernel; + struct csi_tensor float_bias; + struct csi_tensor float_output; + uint8_t *input_data = o_input->data; + uint8_t *kernel_data = o_kernel->data; + int32_t *bias_data = o_bias->data; + uint8_t *output_data = o_output->data; + int input_size = 1; + int kernel_size = 1; + int output_size = 1; + + for (int i = 0; i < o_input->dim_count; i++) { + input_size *= o_input->dim[i]; + } + for (int i = 0; i < o_kernel->dim_count; i++) { + kernel_size *= o_kernel->dim[i]; + } + for (int i = 0; i < o_output->dim_count; i++) { + output_size *= o_output->dim[i]; + } + int bias_size = o_output->dim[1]; + + memcpy(&float_input, o_input, sizeof(struct csi_tensor)); + memcpy(&float_kernel, o_kernel, sizeof(struct csi_tensor)); + memcpy(&float_bias, o_bias, sizeof(struct csi_tensor)); + memcpy(&float_output, o_output, sizeof(struct csi_tensor)); + float_input_data = malloc(input_size * sizeof(float)); + float_output_data = malloc(output_size * sizeof(float)); + float_kernel_data = malloc(kernel_size * sizeof(float)); + float_bias_data = malloc(bias_size * sizeof(float)); + float_input.dtype = CSINN_DTYPE_FLOAT32; + float_input.data = float_input_data; + float_kernel.data = float_kernel_data; + float_bias.data = float_bias_data; + float_output.data = float_output_data; + + for (int i = 0; i < input_size; i++) { + float_input_data[i] = uint8_to_float(input_data[i], o_input); + } + for (int i = 0; i < kernel_size; i++) { + float_kernel_data[i] = uint8_to_float(kernel_data[i], o_kernel); + } + for (int i = 0; i < bias_size; i++) { + float_bias_data[i] = bias_data[i] * 
o_kernel->scale * o_input->scale; + } + + csi_conv2d_init(&float_input, &float_output, &float_kernel, &float_bias, params); + csi_conv2d(&float_input, &float_output, &float_kernel, &float_bias, params); + + for (int i = 0; i < output_size; i++) { + if (float_output_data[i] < 0) { + float_output_data[i] = 0; + } + output_data[i] = float_to_uint8(float_output_data[i], o_output); + } + free(float_input_data); + free(float_kernel_data); + free(float_bias_data); + free(float_output_data); +#else + struct csi_tensor* input; + struct csi_tensor* output; + struct csi_tensor* kernel; + struct csi_tensor* bias = o_bias; + input = csi_nchw_to_nhwc_8(o_input); + kernel = csi_nchw_to_nhwc_8(o_kernel); + output = csi_nchw_to_nhwc_8(o_output); + + uint8_t *input_data = input->data; + uint8_t *output_data = output->data; + uint8_t *kernel_data = kernel->data; + int32_t *bias_data = bias->data; + const int32_t dilation_width_factor = params->dilation_width; + const int32_t dilation_height_factor = params->dilation_height; + const int32_t input_offset = input->zero_point; + const int32_t filter_offset = kernel->zero_point; + const int32_t output_offset = output->zero_point; + const int32_t output_multiplier = output->multiplier; + const int32_t output_shift = output->shift; + + const int32_t batches = input->dim[0]; + const int32_t input_depth = input->dim[3]; + const int32_t output_depth = output->dim[3]; + const int32_t input_height = input->dim[1]; + const int32_t input_width = input->dim[2]; + const int32_t filter_height = kernel->dim[1]; + const int32_t filter_width = kernel->dim[2]; + const int32_t output_height = output->dim[1]; + const int32_t output_width = output->dim[2]; + + for (int32_t batch = 0; batch < batches; ++batch) { + #pragma omp parallel for num_threads(8) + for (int32_t out_y = 0; out_y < output_height; ++out_y) { + for (int32_t out_x = 0; out_x < output_width; ++out_x) { + for (int32_t out_channel = 0; out_channel < output_depth; ++out_channel) { + const 
int32_t in_x_origin = (out_x * params->stride_width) - params->pad_left; + const int32_t in_y_origin = (out_y * params->stride_height) - params->pad_top; + int64_t acc = 0; + for (int32_t filter_y = 0; filter_y < filter_height; ++filter_y) { + for (int32_t filter_x = 0; filter_x < filter_width; ++filter_x) { + for (int32_t in_channel = 0; in_channel < input_depth; ++in_channel) { + const int32_t in_x = in_x_origin + dilation_width_factor * filter_x; + const int32_t in_y = in_y_origin + dilation_height_factor * filter_y; + // If the location is outside the bounds of the input image, + // use zero as a default value. + if ((in_x >= 0) && (in_x < input_width) && (in_y >= 0) && + (in_y < input_height)) { + int32_t input_index = + csi_get_index(input->dim, batch, in_y, in_x, in_channel); + int32_t input_val = input_data[input_index]; + int32_t filter_index = csi_get_index( + kernel->dim, out_channel, filter_y, filter_x, in_channel); + int32_t filter_val = kernel_data[filter_index]; + acc += + (filter_val - filter_offset) * (input_val - input_offset); + } + } + } + } + if (bias->dim_count != 0) { + acc += bias_data[out_channel]; + } + acc = conv_out_u8(acc, input, output, kernel); + output_data[csi_get_index(output->dim, batch, out_y, out_x, out_channel)] = acc; + } + } + } + } + csi_nhwc_to_nchw_8(o_output, output); + free(input->data); + free(input); + free(kernel->data); + free(kernel); +#endif + return CSINN_TRUE; +} + +int csi_conv2d_relu_i8(struct csi_tensor *o_input, + struct csi_tensor *o_output, + struct csi_tensor *o_kernel, + struct csi_tensor *o_bias, + struct conv2d_params *params) +{ +#ifdef CSI_AVX_OPT + float *float_input_data; + float *float_kernel_data; + float *float_bias_data; + float *float_output_data; + struct csi_tensor float_input; + struct csi_tensor float_kernel; + struct csi_tensor float_bias; + struct csi_tensor float_output; + int8_t *input_data = o_input->data; + int8_t *kernel_data = o_kernel->data; + int32_t *bias_data = o_bias->data; + 
int8_t *output_data = o_output->data; + int input_size = 1; + int kernel_size = 1; + int output_size = 1; + + for (int i = 0; i < o_input->dim_count; i++) { + input_size *= o_input->dim[i]; + } + for (int i = 0; i < o_kernel->dim_count; i++) { + kernel_size *= o_kernel->dim[i]; + } + for (int i = 0; i < o_output->dim_count; i++) { + output_size *= o_output->dim[i]; + } + int bias_size = o_output->dim[1]; + + memcpy(&float_input, o_input, sizeof(struct csi_tensor)); + memcpy(&float_kernel, o_kernel, sizeof(struct csi_tensor)); + memcpy(&float_bias, o_bias, sizeof(struct csi_tensor)); + memcpy(&float_output, o_output, sizeof(struct csi_tensor)); + float_input_data = malloc(input_size * sizeof(float)); + float_output_data = malloc(output_size * sizeof(float)); + float_kernel_data = malloc(kernel_size * sizeof(float)); + float_bias_data = malloc(bias_size * sizeof(float)); + float_input.dtype = CSINN_DTYPE_FLOAT32; + float_input.data = float_input_data; + float_kernel.data = float_kernel_data; + float_bias.data = float_bias_data; + float_output.data = float_output_data; + + for (int i = 0; i < input_size; i++) { + float_input_data[i] = int8_to_float(input_data[i], o_input); + } + for (int i = 0; i < kernel_size; i++) { + float_kernel_data[i] = int8_to_float(kernel_data[i], o_kernel); + } + for (int i = 0; i < bias_size; i++) { + float_bias_data[i] = bias_data[i] * o_kernel->scale * o_input->scale; + } + + csi_conv2d_init(&float_input, &float_output, &float_kernel, &float_bias, params); + csi_conv2d(&float_input, &float_output, &float_kernel, &float_bias, params); + + for (int i = 0; i < output_size; i++) { + if (float_output_data[i] < 0) { + float_output_data[i] = 0; + } + output_data[i] = float_to_int8(float_output_data[i], o_output); + } + free(float_input_data); + free(float_kernel_data); + free(float_bias_data); + free(float_output_data); +#else + struct csi_tensor* input; + struct csi_tensor* output; + struct csi_tensor* kernel; + struct csi_tensor* bias = o_bias; 
+ input = csi_nchw_to_nhwc_8(o_input); + kernel = csi_nchw_to_nhwc_8(o_kernel); + output = csi_nchw_to_nhwc_8(o_output); + + int8_t *input_data = input->data; + int8_t *output_data = output->data; + int8_t *kernel_data = kernel->data; + int32_t *bias_data = bias->data; + const int32_t dilation_width_factor = params->dilation_width; + const int32_t dilation_height_factor = params->dilation_height; + const int32_t input_offset = input->zero_point; + const int32_t filter_offset = kernel->zero_point; + const int32_t output_offset = output->zero_point; + const int32_t output_multiplier = output->multiplier; + const int32_t output_shift = output->shift; + + const int32_t batches = input->dim[0]; + const int32_t input_depth = input->dim[3]; + const int32_t output_depth = output->dim[3]; + const int32_t input_height = input->dim[1]; + const int32_t input_width = input->dim[2]; + const int32_t filter_height = kernel->dim[1]; + const int32_t filter_width = kernel->dim[2]; + const int32_t output_height = output->dim[1]; + const int32_t output_width = output->dim[2]; + + for (int32_t batch = 0; batch < batches; ++batch) { + #pragma omp parallel for num_threads(8) + for (int32_t out_y = 0; out_y < output_height; ++out_y) { + for (int32_t out_x = 0; out_x < output_width; ++out_x) { + for (int32_t out_channel = 0; out_channel < output_depth; ++out_channel) { + const int32_t in_x_origin = (out_x * params->stride_width) - params->pad_left; + const int32_t in_y_origin = (out_y * params->stride_height) - params->pad_top; + int64_t acc = 0; + for (int32_t filter_y = 0; filter_y < filter_height; ++filter_y) { + for (int32_t filter_x = 0; filter_x < filter_width; ++filter_x) { + for (int32_t in_channel = 0; in_channel < input_depth; ++in_channel) { + const int32_t in_x = in_x_origin + dilation_width_factor * filter_x; + const int32_t in_y = in_y_origin + dilation_height_factor * filter_y; + // If the location is outside the bounds of the input image, + // use zero as a default value. 
+ if ((in_x >= 0) && (in_x < input_width) && (in_y >= 0) && + (in_y < input_height)) { + int32_t input_index = + csi_get_index(input->dim, batch, in_y, in_x, in_channel); + int32_t input_val = input_data[input_index]; + int32_t filter_index = csi_get_index( + kernel->dim, out_channel, filter_y, filter_x, in_channel); + int32_t filter_val = kernel_data[filter_index]; + acc += + (filter_val - filter_offset) * (input_val - input_offset); + } + } + } + } + if (bias->dim_count != 0) { + acc += bias_data[out_channel]; + } + acc = conv_out_i8(acc, input, output, kernel); + output_data[csi_get_index(output->dim, batch, out_y, out_x, out_channel)] = acc; + } + } + } + } + csi_nhwc_to_nchw_8(o_output, output); + free(input->data); + free(input); + free(kernel->data); + free(kernel); +#endif + return CSINN_TRUE; +} + + +int csi_depthwise_conv2d_relu_u8(struct csi_tensor *o_input, + struct csi_tensor *o_output, + struct csi_tensor *o_kernel, + struct csi_tensor *o_bias, + struct conv2d_params *params) +{ + struct csi_tensor* input; + struct csi_tensor* output; + struct csi_tensor* kernel; + struct csi_tensor* bias = o_bias; + input = csi_nchw_to_nhwc_8(o_input); + kernel = csi_nchw_to_nhwc_8(o_kernel); + output = csi_nchw_to_nhwc_8(o_output); + + uint8_t *input_data = input->data; + uint8_t *output_data = output->data; + uint8_t *kernel_data = kernel->data; + int32_t *bias_data = bias->data; + const int32_t dilation_width_factor = params->dilation_width; + const int32_t dilation_height_factor = params->dilation_height; + const int32_t batches = input->dim[0]; + const int32_t input_depth = input->dim[3]; + const int32_t output_depth = output->dim[3]; + const int32_t input_height = input->dim[1]; + const int32_t input_width = input->dim[2]; + const int32_t filter_height = kernel->dim[1]; + const int32_t filter_width = kernel->dim[2]; + const int32_t output_height = output->dim[1]; + const int32_t output_width = output->dim[2]; + const int32_t depth_multiplier = output_depth / 
input_depth; + const int32_t input_offset = input->zero_point; + const int32_t filter_offset = kernel->zero_point; + const int32_t output_offset = output->zero_point; + const int32_t output_multiplier = output->multiplier; + const int32_t output_shift = output->shift; + + for (int32_t b = 0; b < batches; ++b) { + #pragma omp parallel for num_threads(8) + for (int32_t out_y = 0; out_y < output_height; ++out_y) { + for (int32_t out_x = 0; out_x < output_width; ++out_x) { + for (int32_t ic = 0; ic < input_depth; ++ic) { + for (int32_t m = 0; m < depth_multiplier; m++) { + const int32_t oc = m + ic * depth_multiplier; + const int32_t in_x_origin = (out_x * params->stride_width) - params->pad_left; + const int32_t in_y_origin = (out_y * params->stride_height) - params->pad_top; + int64_t acc = 0; + for (int32_t filter_y = 0; filter_y < filter_height; ++filter_y) { + for (int32_t filter_x = 0; filter_x < filter_width; ++filter_x) { + const int32_t in_x = in_x_origin + dilation_width_factor * filter_x; + const int32_t in_y = + in_y_origin + dilation_height_factor * filter_y; + // If the location is outside the bounds of the input image, + // use zero as a default value. 
+ if ((in_x >= 0) && (in_x < input_width) && (in_y >= 0) && + (in_y < input_height)) { + int32_t input_val = + input_data[csi_get_index(input->dim, b, in_y, in_x, ic)]; + int32_t filter_val = kernel_data[csi_get_index( + kernel->dim, ic, filter_y, filter_x, m)]; + acc += + (filter_val - filter_offset) * (input_val - input_offset); + } + } + } + if (bias->dim_count != 0) { + acc += bias_data[oc]; + } + acc = conv_out_u8(acc, input, output, kernel); + output_data[csi_get_index(output->dim, b, out_y, out_x, oc)] = + acc; + } + } + } + } + } + csi_nhwc_to_nchw_8(o_output, output); + free(input->data); + free(input); + free(kernel->data); + free(kernel); + return CSINN_TRUE; +} + +int csi_depthwise_conv2d_relu_i8(struct csi_tensor *o_input, + struct csi_tensor *o_output, + struct csi_tensor *o_kernel, + struct csi_tensor *o_bias, + struct conv2d_params *params) +{ + struct csi_tensor* input; + struct csi_tensor* output; + struct csi_tensor* kernel; + struct csi_tensor* bias = o_bias; + input = csi_nchw_to_nhwc_8(o_input); + kernel = csi_nchw_to_nhwc_8(o_kernel); + output = csi_nchw_to_nhwc_8(o_output); + + int8_t *input_data = input->data; + int8_t *output_data = output->data; + int8_t *kernel_data = kernel->data; + int32_t *bias_data = bias->data; + const int32_t dilation_width_factor = params->dilation_width; + const int32_t dilation_height_factor = params->dilation_height; + const int32_t batches = input->dim[0]; + const int32_t input_depth = input->dim[3]; + const int32_t output_depth = output->dim[3]; + const int32_t input_height = input->dim[1]; + const int32_t input_width = input->dim[2]; + const int32_t filter_height = kernel->dim[1]; + const int32_t filter_width = kernel->dim[2]; + const int32_t output_height = output->dim[1]; + const int32_t output_width = output->dim[2]; + const int32_t depth_multiplier = output_depth / input_depth; + const int32_t input_offset = input->zero_point; + const int32_t filter_offset = kernel->zero_point; + const int32_t 
output_offset = output->zero_point; + const int32_t output_multiplier = output->multiplier; + const int32_t output_shift = output->shift; + + for (int32_t b = 0; b < batches; ++b) { + #pragma omp parallel for num_threads(8) + for (int32_t out_y = 0; out_y < output_height; ++out_y) { + for (int32_t out_x = 0; out_x < output_width; ++out_x) { + for (int32_t ic = 0; ic < input_depth; ++ic) { + for (int32_t m = 0; m < depth_multiplier; m++) { + const int32_t oc = m + ic * depth_multiplier; + const int32_t in_x_origin = (out_x * params->stride_width) - params->pad_left; + const int32_t in_y_origin = (out_y * params->stride_height) - params->pad_top; + int64_t acc = 0; + for (int32_t filter_y = 0; filter_y < filter_height; ++filter_y) { + for (int32_t filter_x = 0; filter_x < filter_width; ++filter_x) { + const int32_t in_x = in_x_origin + dilation_width_factor * filter_x; + const int32_t in_y = + in_y_origin + dilation_height_factor * filter_y; + // If the location is outside the bounds of the input image, + // use zero as a default value. 
+ if ((in_x >= 0) && (in_x < input_width) && (in_y >= 0) && + (in_y < input_height)) { + int32_t input_val = + input_data[csi_get_index(input->dim, b, in_y, in_x, ic)]; + int32_t filter_val = kernel_data[csi_get_index( + kernel->dim, ic, filter_y, filter_x, m)]; + acc += + (filter_val - filter_offset) * (input_val - input_offset); + } + } + } + if (bias->dim_count != 0) { + acc += bias_data[oc]; + } + acc = conv_out_i8(acc, input, output, kernel); + output_data[csi_get_index(output->dim, b, out_y, out_x, oc)] = + acc; + } + } + } + } + } + csi_nhwc_to_nchw_8(o_output, output); + free(input->data); + free(input); + free(kernel->data); + free(kernel); + return CSINN_TRUE; +} + + +int csi_group_conv2d_relu_u8(struct csi_tensor *o_input, + struct csi_tensor *o_output, + struct csi_tensor *o_kernel, + struct csi_tensor *o_bias, + struct conv2d_params *params) +{ + struct csi_tensor input; + struct csi_tensor output; + struct csi_tensor kernel; + struct csi_tensor bias; + struct conv2d_params pparams; + + memcpy(&input, o_input, sizeof(struct csi_tensor)); + memcpy(&output, o_output, sizeof(struct csi_tensor)); + memcpy(&kernel, o_kernel, sizeof(struct csi_tensor)); + memcpy(&bias, o_bias, sizeof(struct csi_tensor)); + memcpy(&pparams, params, sizeof(struct conv2d_params)); + + input.dim[1] /= params->group; + output.dim[1] /= params->group; + kernel.dim[0] /= params->group; + + pparams.group = 1; + int input_size = 1; + int output_size = 1; + int kernel_size = 1; + + for (int i = 0; i < input.dim_count; i++) { + input_size *= input.dim[i]; + output_size *= output.dim[i]; + kernel_size *= kernel.dim[i]; + } + + uint8_t *input_data = o_input->data; + uint8_t *output_data = o_output->data; + uint8_t *kernel_data = o_kernel->data; + int32_t *bias_data = o_bias->data; + for (int i = 0; i < params->group; i++) { + input.data = input_data + i * input_size; + output.data = output_data + i * output_size; + kernel.data = kernel_data + i * kernel_size; + bias.data = bias_data + i 
* o_output->dim[1] / params->group; + csi_conv2d_relu_u8(&input, &output, &kernel, &bias, &pparams); + } + return CSINN_TRUE; +} + +int csi_group_conv2d_relu_i8(struct csi_tensor *o_input, + struct csi_tensor *o_output, + struct csi_tensor *o_kernel, + struct csi_tensor *o_bias, + struct conv2d_params *params) +{ + struct csi_tensor input; + struct csi_tensor output; + struct csi_tensor kernel; + struct csi_tensor bias; + struct conv2d_params pparams; + + memcpy(&input, o_input, sizeof(struct csi_tensor)); + memcpy(&output, o_output, sizeof(struct csi_tensor)); + memcpy(&kernel, o_kernel, sizeof(struct csi_tensor)); + memcpy(&bias, o_bias, sizeof(struct csi_tensor)); + memcpy(&pparams, params, sizeof(struct conv2d_params)); + + input.dim[1] /= params->group; + output.dim[1] /= params->group; + kernel.dim[0] /= params->group; + + pparams.group = 1; + int input_size = 1; + int output_size = 1; + int kernel_size = 1; + + for (int i = 0; i < input.dim_count; i++) { + input_size *= input.dim[i]; + output_size *= output.dim[i]; + kernel_size *= kernel.dim[i]; + } + + int8_t *input_data = o_input->data; + int8_t *output_data = o_output->data; + int8_t *kernel_data = o_kernel->data; + int32_t *bias_data = o_bias->data; + for (int i = 0; i < params->group; i++) { + input.data = input_data + i * input_size; + output.data = output_data + i * output_size; + kernel.data = kernel_data + i * kernel_size; + bias.data = bias_data + i * o_output->dim[1] / params->group; + csi_conv2d_relu_i8(&input, &output, &kernel, &bias, &pparams); + } + return CSINN_TRUE; +} + +int csi_conv2d_relu_init(struct csi_tensor *input, + struct csi_tensor *output, + struct csi_tensor *kernel, + struct csi_tensor *bias, + struct conv2d_params *params) +{ + if(params->wscales != NULL && params->wzps != NULL){ + if (params->layout == CSINN_NCHW) { + if (params->group == 1) { + params->bc = csi_bc_map(params->api, CSINN_OP_CONV2D_CHANNEL_RELU, input->dtype); + } else if (params->group == output->dim[1]) { + 
params->bc = csi_bc_map(params->api, CSINN_OP_DEPTHWISE_CONV2D_CHANNEL_RELU, input->dtype); + } else { + params->bc = csi_bc_map(params->api, CSINN_OP_GROUP_CONV2D_CHANNEL_RELU, input->dtype); + } + if (params->bc == NULL) { + return CSINN_UNSUPPORT_DTYPE; + } + } else { + return CSINN_UNSUPPORT_LAYOUT; + } + return CSINN_TRUE; + } + + if (params->layout == CSINN_NCHW) { + if (params->group == 1) { + params->bc = csi_bc_map(params->api, CSINN_OP_CONV2D_RELU, input->dtype); + } else if (params->group == output->dim[1]) { + params->bc = csi_bc_map(params->api, CSINN_OP_DEPTHWISE_CONV2D_RELU, input->dtype); + } else { + params->bc = csi_bc_map(params->api, CSINN_OP_GROUP_CONV2D_RELU, input->dtype); + } + if (params->bc == NULL) { + return CSINN_UNSUPPORT_DTYPE; + } + } else { + return CSINN_UNSUPPORT_LAYOUT; + } + return CSINN_TRUE; +} + +int csi_conv2d_relu(struct csi_tensor *input, + struct csi_tensor *output, + struct csi_tensor *kernel, + struct csi_tensor *bias, + struct conv2d_params *params) +{ + if (params->bc != NULL) { + params->bc(input, output, kernel, bias, params); + } else { + return CSINN_CALLBACK_UNSET; + } + return CSINN_TRUE; +} diff --git a/source/reference/convolution_relu6.c b/source/reference/convolution_relu6.c new file mode 100644 index 00000000..88b7f60a --- /dev/null +++ b/source/reference/convolution_relu6.c @@ -0,0 +1,545 @@ +/* + * Copyright (C) 2016-2020 C-SKY Limited. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "csi_nn.h" +#include "csi_utils.h" + +int csi_conv2d_relu6_u8(struct csi_tensor *o_input, + struct csi_tensor *o_output, + struct csi_tensor *o_kernel, + struct csi_tensor *o_bias, + struct conv2d_params *params) +{ +#ifdef CSI_AVX_OPT + float *float_input_data; + float *float_kernel_data; + float *float_bias_data; + float *float_output_data; + struct csi_tensor float_input; + struct csi_tensor float_kernel; + struct csi_tensor float_bias; + struct csi_tensor float_output; + uint8_t *input_data = o_input->data; + uint8_t *kernel_data = o_kernel->data; + int32_t *bias_data = o_bias->data; + uint8_t *output_data = o_output->data; + int input_size = 1; + int kernel_size = 1; + int output_size = 1; + + for (int i = 0; i < o_input->dim_count; i++) { + input_size *= o_input->dim[i]; + } + for (int i = 0; i < o_kernel->dim_count; i++) { + kernel_size *= o_kernel->dim[i]; + } + for (int i = 0; i < o_output->dim_count; i++) { + output_size *= o_output->dim[i]; + } + int bias_size = o_output->dim[1]; + + memcpy(&float_input, o_input, sizeof(struct csi_tensor)); + memcpy(&float_kernel, o_kernel, sizeof(struct csi_tensor)); + memcpy(&float_bias, o_bias, sizeof(struct csi_tensor)); + memcpy(&float_output, o_output, sizeof(struct csi_tensor)); + float_input_data = malloc(input_size * sizeof(float)); + float_output_data = malloc(output_size * sizeof(float)); + float_kernel_data = malloc(kernel_size * sizeof(float)); + float_bias_data = malloc(bias_size * sizeof(float)); + float_input.dtype = CSINN_DTYPE_FLOAT32; + float_input.data = float_input_data; + float_kernel.data = float_kernel_data; + float_bias.data = float_bias_data; + float_output.data = float_output_data; + + for (int i = 0; i < input_size; i++) { + float_input_data[i] = uint8_to_float(input_data[i], o_input); + } + for (int i = 0; i < kernel_size; i++) { + float_kernel_data[i] = 
uint8_to_float(kernel_data[i], o_kernel); + } + for (int i = 0; i < bias_size; i++) { + float_bias_data[i] = bias_data[i] * o_kernel->scale * o_input->scale; + } + + csi_conv2d_init(&float_input, &float_output, &float_kernel, &float_bias, params); + csi_conv2d(&float_input, &float_output, &float_kernel, &float_bias, params); + + for (int i = 0; i < output_size; i++) { + if (float_output_data[i] < 0) { + float_output_data[i] = 0; + }else if (float_output_data[i] > 6) { + float_output_data[i] = 6; + } + output_data[i] = float_to_uint8(float_output_data[i], o_output); + } + free(float_input_data); + free(float_kernel_data); + free(float_bias_data); + free(float_output_data); +#else + struct csi_tensor* input; + struct csi_tensor* output; + struct csi_tensor* kernel; + struct csi_tensor* bias = o_bias; + input = csi_nchw_to_nhwc_8(o_input); + kernel = csi_nchw_to_nhwc_8(o_kernel); + output = csi_nchw_to_nhwc_8(o_output); + + uint8_t *input_data = input->data; + uint8_t *output_data = output->data; + uint8_t *kernel_data = kernel->data; + int32_t *bias_data = bias->data; + const int32_t dilation_width_factor = params->dilation_width; + const int32_t dilation_height_factor = params->dilation_height; + const int32_t input_offset = input->zero_point; + const int32_t filter_offset = kernel->zero_point; + const int32_t output_offset = output->zero_point; + const int32_t output_multiplier = output->multiplier; + const int32_t output_shift = output->shift; + + const int32_t batches = input->dim[0]; + const int32_t input_depth = input->dim[3]; + const int32_t output_depth = output->dim[3]; + const int32_t input_height = input->dim[1]; + const int32_t input_width = input->dim[2]; + const int32_t filter_height = kernel->dim[1]; + const int32_t filter_width = kernel->dim[2]; + const int32_t output_height = output->dim[1]; + const int32_t output_width = output->dim[2]; + + for (int32_t batch = 0; batch < batches; ++batch) { + #pragma omp parallel for num_threads(8) + for (int32_t 
out_y = 0; out_y < output_height; ++out_y) { + for (int32_t out_x = 0; out_x < output_width; ++out_x) { + for (int32_t out_channel = 0; out_channel < output_depth; ++out_channel) { + const int32_t in_x_origin = (out_x * params->stride_width) - params->pad_left; + const int32_t in_y_origin = (out_y * params->stride_height) - params->pad_top; + int64_t acc = 0; + for (int32_t filter_y = 0; filter_y < filter_height; ++filter_y) { + for (int32_t filter_x = 0; filter_x < filter_width; ++filter_x) { + for (int32_t in_channel = 0; in_channel < input_depth; ++in_channel) { + const int32_t in_x = in_x_origin + dilation_width_factor * filter_x; + const int32_t in_y = in_y_origin + dilation_height_factor * filter_y; + // If the location is outside the bounds of the input image, + // use zero as a default value. + if ((in_x >= 0) && (in_x < input_width) && (in_y >= 0) && + (in_y < input_height)) { + int32_t input_index = + csi_get_index(input->dim, batch, in_y, in_x, in_channel); + int32_t input_val = input_data[input_index]; + int32_t filter_index = csi_get_index( + kernel->dim, out_channel, filter_y, filter_x, in_channel); + int32_t filter_val = kernel_data[filter_index]; + acc += + (filter_val - filter_offset) * (input_val - input_offset); + } + } + } + } + if (bias->dim_count != 0) { + acc += bias_data[out_channel]; + } + acc = conv_relu6_out_u8(acc, input, output, kernel); + output_data[csi_get_index(output->dim, batch, out_y, out_x, out_channel)] = acc; + } + } + } + } + csi_nhwc_to_nchw_8(o_output, output); + free(input->data); + free(input); + free(kernel->data); + free(kernel); +#endif + return CSINN_TRUE; +} + + +int csi_conv2d_relu6_i8(struct csi_tensor *o_input, + struct csi_tensor *o_output, + struct csi_tensor *o_kernel, + struct csi_tensor *o_bias, + struct conv2d_params *params) +{ +#ifdef CSI_AVX_OPT + float *float_input_data; + float *float_kernel_data; + float *float_bias_data; + float *float_output_data; + struct csi_tensor float_input; + struct csi_tensor 
float_kernel; + struct csi_tensor float_bias; + struct csi_tensor float_output; + uint8_t *input_data = o_input->data; + uint8_t *kernel_data = o_kernel->data; + int32_t *bias_data = o_bias->data; + uint8_t *output_data = o_output->data; + int input_size = 1; + int kernel_size = 1; + int output_size = 1; + + for (int i = 0; i < o_input->dim_count; i++) { + input_size *= o_input->dim[i]; + } + for (int i = 0; i < o_kernel->dim_count; i++) { + kernel_size *= o_kernel->dim[i]; + } + for (int i = 0; i < o_output->dim_count; i++) { + output_size *= o_output->dim[i]; + } + int bias_size = o_output->dim[1]; + + memcpy(&float_input, o_input, sizeof(struct csi_tensor)); + memcpy(&float_kernel, o_kernel, sizeof(struct csi_tensor)); + memcpy(&float_bias, o_bias, sizeof(struct csi_tensor)); + memcpy(&float_output, o_output, sizeof(struct csi_tensor)); + float_input_data = malloc(input_size * sizeof(float)); + float_output_data = malloc(output_size * sizeof(float)); + float_kernel_data = malloc(kernel_size * sizeof(float)); + float_bias_data = malloc(bias_size * sizeof(float)); + float_input.dtype = CSINN_DTYPE_FLOAT32; + float_input.data = float_input_data; + float_kernel.data = float_kernel_data; + float_bias.data = float_bias_data; + float_output.data = float_output_data; + + for (int i = 0; i < input_size; i++) { + float_input_data[i] = int8_to_float(input_data[i], o_input); + } + for (int i = 0; i < kernel_size; i++) { + float_kernel_data[i] = int8_to_float(kernel_data[i], o_kernel); + } + for (int i = 0; i < bias_size; i++) { + float_bias_data[i] = bias_data[i] * o_kernel->scale * o_input->scale; + } + + csi_conv2d_init(&float_input, &float_output, &float_kernel, &float_bias, params); + csi_conv2d(&float_input, &float_output, &float_kernel, &float_bias, params); + + for (int i = 0; i < output_size; i++) { + if (float_output_data[i] < 0) { + float_output_data[i] = 0; + }else if (float_output_data[i] > 6) { + float_output_data[i] = 6; + } + output_data[i] = 
float_to_int8(float_output_data[i], o_output); + } + free(float_input_data); + free(float_kernel_data); + free(float_bias_data); + free(float_output_data); +#else + struct csi_tensor* input; + struct csi_tensor* output; + struct csi_tensor* kernel; + struct csi_tensor* bias = o_bias; + input = csi_nchw_to_nhwc_8(o_input); + kernel = csi_nchw_to_nhwc_8(o_kernel); + output = csi_nchw_to_nhwc_8(o_output); + + int8_t *input_data = input->data; + int8_t *output_data = output->data; + int8_t *kernel_data = kernel->data; + int32_t *bias_data = bias->data; + const int32_t dilation_width_factor = params->dilation_width; + const int32_t dilation_height_factor = params->dilation_height; + const int32_t input_offset = input->zero_point; + const int32_t filter_offset = kernel->zero_point; + const int32_t output_offset = output->zero_point; + const int32_t output_multiplier = output->multiplier; + const int32_t output_shift = output->shift; + + const int32_t batches = input->dim[0]; + const int32_t input_depth = input->dim[3]; + const int32_t output_depth = output->dim[3]; + const int32_t input_height = input->dim[1]; + const int32_t input_width = input->dim[2]; + const int32_t filter_height = kernel->dim[1]; + const int32_t filter_width = kernel->dim[2]; + const int32_t output_height = output->dim[1]; + const int32_t output_width = output->dim[2]; + + for (int32_t batch = 0; batch < batches; ++batch) { + #pragma omp parallel for num_threads(8) + for (int32_t out_y = 0; out_y < output_height; ++out_y) { + for (int32_t out_x = 0; out_x < output_width; ++out_x) { + for (int32_t out_channel = 0; out_channel < output_depth; ++out_channel) { + const int32_t in_x_origin = (out_x * params->stride_width) - params->pad_left; + const int32_t in_y_origin = (out_y * params->stride_height) - params->pad_top; + int64_t acc = 0; + for (int32_t filter_y = 0; filter_y < filter_height; ++filter_y) { + for (int32_t filter_x = 0; filter_x < filter_width; ++filter_x) { + for (int32_t in_channel = 0; 
in_channel < input_depth; ++in_channel) { + const int32_t in_x = in_x_origin + dilation_width_factor * filter_x; + const int32_t in_y = in_y_origin + dilation_height_factor * filter_y; + // If the location is outside the bounds of the input image, + // use zero as a default value. + if ((in_x >= 0) && (in_x < input_width) && (in_y >= 0) && + (in_y < input_height)) { + int32_t input_index = + csi_get_index(input->dim, batch, in_y, in_x, in_channel); + int32_t input_val = input_data[input_index]; + int32_t filter_index = csi_get_index( + kernel->dim, out_channel, filter_y, filter_x, in_channel); + int32_t filter_val = kernel_data[filter_index]; + acc += + (filter_val - filter_offset) * (input_val - input_offset); + } + } + } + } + if (bias->dim_count != 0) { + acc += bias_data[out_channel]; + } + acc = conv_relu6_out_i8(acc, input, output, kernel); + output_data[csi_get_index(output->dim, batch, out_y, out_x, out_channel)] = acc; + } + } + } + } + csi_nhwc_to_nchw_8(o_output, output); + free(input->data); + free(input); + free(kernel->data); + free(kernel); +#endif + return CSINN_TRUE; +} + + +int csi_depthwise_conv2d_relu6_u8(struct csi_tensor *o_input, + struct csi_tensor *o_output, + struct csi_tensor *o_kernel, + struct csi_tensor *o_bias, + struct conv2d_params *params) +{ + struct csi_tensor* input; + struct csi_tensor* output; + struct csi_tensor* kernel; + struct csi_tensor* bias = o_bias; + input = csi_nchw_to_nhwc_8(o_input); + kernel = csi_nchw_to_nhwc_8(o_kernel); + output = csi_nchw_to_nhwc_8(o_output); + + uint8_t *input_data = input->data; + uint8_t *output_data = output->data; + uint8_t *kernel_data = kernel->data; + int32_t *bias_data = bias->data; + const int32_t dilation_width_factor = params->dilation_width; + const int32_t dilation_height_factor = params->dilation_height; + const int32_t batches = input->dim[0]; + const int32_t input_depth = input->dim[3]; + const int32_t output_depth = output->dim[3]; + const int32_t input_height = 
input->dim[1]; + const int32_t input_width = input->dim[2]; + const int32_t filter_height = kernel->dim[1]; + const int32_t filter_width = kernel->dim[2]; + const int32_t output_height = output->dim[1]; + const int32_t output_width = output->dim[2]; + const int32_t depth_multiplier = output_depth / input_depth; + const int32_t input_offset = input->zero_point; + const int32_t filter_offset = kernel->zero_point; + const int32_t output_offset = output->zero_point; + const int32_t output_multiplier = output->multiplier; + const int32_t output_shift = output->shift; + + for (int32_t b = 0; b < batches; ++b) { + #pragma omp parallel for num_threads(8) + for (int32_t out_y = 0; out_y < output_height; ++out_y) { + for (int32_t out_x = 0; out_x < output_width; ++out_x) { + for (int32_t ic = 0; ic < input_depth; ++ic) { + for (int32_t m = 0; m < depth_multiplier; m++) { + const int32_t oc = m + ic * depth_multiplier; + const int32_t in_x_origin = (out_x * params->stride_width) - params->pad_left; + const int32_t in_y_origin = (out_y * params->stride_height) - params->pad_top; + int64_t acc = 0; + for (int32_t filter_y = 0; filter_y < filter_height; ++filter_y) { + for (int32_t filter_x = 0; filter_x < filter_width; ++filter_x) { + const int32_t in_x = in_x_origin + dilation_width_factor * filter_x; + const int32_t in_y = + in_y_origin + dilation_height_factor * filter_y; + // If the location is outside the bounds of the input image, + // use zero as a default value. 
+ if ((in_x >= 0) && (in_x < input_width) && (in_y >= 0) && + (in_y < input_height)) { + int32_t input_val = + input_data[csi_get_index(input->dim, b, in_y, in_x, ic)]; + int32_t filter_val = kernel_data[csi_get_index( + kernel->dim, ic, filter_y, filter_x, m)]; + acc += + (filter_val - filter_offset) * (input_val - input_offset); + } + } + } + if (bias->dim_count != 0) { + acc += bias_data[oc]; + } + acc = conv_relu6_out_u8(acc, input, output, kernel); + output_data[csi_get_index(output->dim, b, out_y, out_x, oc)] = + acc; + } + } + } + } + } + csi_nhwc_to_nchw_8(o_output, output); + free(input->data); + free(input); + free(kernel->data); + free(kernel); + return CSINN_TRUE; +} + +int csi_depthwise_conv2d_relu6_i8(struct csi_tensor *o_input, + struct csi_tensor *o_output, + struct csi_tensor *o_kernel, + struct csi_tensor *o_bias, + struct conv2d_params *params) +{ + struct csi_tensor* input; + struct csi_tensor* output; + struct csi_tensor* kernel; + struct csi_tensor* bias = o_bias; + input = csi_nchw_to_nhwc_8(o_input); + kernel = csi_nchw_to_nhwc_8(o_kernel); + output = csi_nchw_to_nhwc_8(o_output); + + int8_t *input_data = input->data; + int8_t *output_data = output->data; + int8_t *kernel_data = kernel->data; + int32_t *bias_data = bias->data; + const int32_t dilation_width_factor = params->dilation_width; + const int32_t dilation_height_factor = params->dilation_height; + const int32_t batches = input->dim[0]; + const int32_t input_depth = input->dim[3]; + const int32_t output_depth = output->dim[3]; + const int32_t input_height = input->dim[1]; + const int32_t input_width = input->dim[2]; + const int32_t filter_height = kernel->dim[1]; + const int32_t filter_width = kernel->dim[2]; + const int32_t output_height = output->dim[1]; + const int32_t output_width = output->dim[2]; + const int32_t depth_multiplier = output_depth / input_depth; + const int32_t input_offset = input->zero_point; + const int32_t filter_offset = kernel->zero_point; + const int32_t 
output_offset = output->zero_point; + const int32_t output_multiplier = output->multiplier; + const int32_t output_shift = output->shift; + + for (int32_t b = 0; b < batches; ++b) { + #pragma omp parallel for num_threads(8) + for (int32_t out_y = 0; out_y < output_height; ++out_y) { + for (int32_t out_x = 0; out_x < output_width; ++out_x) { + for (int32_t ic = 0; ic < input_depth; ++ic) { + for (int32_t m = 0; m < depth_multiplier; m++) { + const int32_t oc = m + ic * depth_multiplier; + const int32_t in_x_origin = (out_x * params->stride_width) - params->pad_left; + const int32_t in_y_origin = (out_y * params->stride_height) - params->pad_top; + int64_t acc = 0; + for (int32_t filter_y = 0; filter_y < filter_height; ++filter_y) { + for (int32_t filter_x = 0; filter_x < filter_width; ++filter_x) { + const int32_t in_x = in_x_origin + dilation_width_factor * filter_x; + const int32_t in_y = + in_y_origin + dilation_height_factor * filter_y; + // If the location is outside the bounds of the input image, + // use zero as a default value. 
+ if ((in_x >= 0) && (in_x < input_width) && (in_y >= 0) && + (in_y < input_height)) { + int32_t input_val = + input_data[csi_get_index(input->dim, b, in_y, in_x, ic)]; + int32_t filter_val = kernel_data[csi_get_index( + kernel->dim, ic, filter_y, filter_x, m)]; + acc += + (filter_val - filter_offset) * (input_val - input_offset); + } + } + } + if (bias->dim_count != 0) { + acc += bias_data[oc]; + } + acc = conv_relu6_out_i8(acc, input, output, kernel); + output_data[csi_get_index(output->dim, b, out_y, out_x, oc)] = + acc; + } + } + } + } + } + csi_nhwc_to_nchw_8(o_output, output); + free(input->data); + free(input); + free(kernel->data); + free(kernel); + return CSINN_TRUE; +} + + +int csi_conv2d_relu6_init(struct csi_tensor *input, + struct csi_tensor *output, + struct csi_tensor *kernel, + struct csi_tensor *bias, + struct conv2d_params *params) +{ + if(params->wscales != NULL && params->wzps != NULL){ + if (params->layout == CSINN_NCHW) { + if (params->group == 1) { + params->bc = csi_bc_map(params->api, CSINN_OP_CONV2D_CHANNEL_RELU6, input->dtype); + } else if (params->group == output->dim[1]) { + params->bc = csi_bc_map(params->api, CSINN_OP_DEPTHWISE_CONV2D_CHANNEL_RELU6, input->dtype); + } else { + return CSINN_FALSE; + } + if (params->bc == NULL) { + return CSINN_UNSUPPORT_DTYPE; + } + } else { + return CSINN_UNSUPPORT_LAYOUT; + } + return CSINN_TRUE; + } + if (params->layout == CSINN_NCHW) { + if (params->group == 1) { + params->bc = csi_bc_map(params->api, CSINN_OP_CONV2D_RELU6, input->dtype); + } else if (params->group == output->dim[1]) { + params->bc = csi_bc_map(params->api, CSINN_OP_DEPTHWISE_CONV2D_RELU6, input->dtype); + } else { + return CSINN_FALSE; + } + if (params->bc == NULL) { + return CSINN_UNSUPPORT_DTYPE; + } + } else { + return CSINN_UNSUPPORT_LAYOUT; + } + return CSINN_TRUE; +} + +int csi_conv2d_relu6(struct csi_tensor *input, + struct csi_tensor *output, + struct csi_tensor *kernel, + struct csi_tensor *bias, + struct conv2d_params 
*params) +{ + if (params->bc != NULL) { + params->bc(input, output, kernel, bias, params); + } else { + return CSINN_CALLBACK_UNSET; + } + return CSINN_TRUE; +} diff --git a/source/reference/cos.c b/source/reference/cos.c index 7a9ab6d4..02d18f71 100644 --- a/source/reference/cos.c +++ b/source/reference/cos.c @@ -20,9 +20,9 @@ #include "csi_utils.h" #include -static int csi_cos_f32(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csi_cos_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct siso_params *params) { float *input_data = input->data; float *output_data = output->data; @@ -37,9 +37,9 @@ static int csi_cos_f32(struct csi_tensor *input, return CSINN_TRUE; } -static int csi_cos_u8(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csi_cos_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct siso_params *params) { uint8_t *input_data = input->data; uint8_t *output_data = output->data; @@ -49,11 +49,11 @@ static int csi_cos_u8(struct csi_tensor *input, } for (int i = 0; i < size; i++) { - float input0_val = csi_dequantize_f32(input_data[i], input->offset, input->multiplier, + float input0_val = csi_dequantize_u8_to_f32(input_data[i], input->zero_point, input->multiplier, input->shift); float res = cos(input0_val); - output_data[i] = csi_quantize_f32(res, output->offset, output->multiplier, output->shift); + output_data[i] = csi_quantize_f32_to_u8(res, output->zero_point, output->multiplier, output->shift); } return CSINN_TRUE; } @@ -62,19 +62,16 @@ int csi_cos_init(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params) { - if (input->dtype == CSINN_DTYPE_UINT8) { - params->bc = csi_cos_u8; - } else if (input->dtype == CSINN_DTYPE_FLOAT32) { - params->bc = csi_cos_f32; - } else { + params->bc = csi_bc_map(params->api, CSINN_OP_COS, input->dtype); + if (params->bc == NULL) { return CSINN_UNSUPPORT_DTYPE; } return CSINN_TRUE; } 
int csi_cos(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) + struct csi_tensor *output, + struct siso_params *params) { if (params->bc != NULL) { params->bc(input, output, params); diff --git a/source/reference/cosh.c b/source/reference/cosh.c index f7d69864..3a9f20bd 100644 --- a/source/reference/cosh.c +++ b/source/reference/cosh.c @@ -20,7 +20,7 @@ #include "csi_utils.h" #include -static int csi_cosh_f32(struct csi_tensor *input, +int csi_cosh_f32(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params) { @@ -37,7 +37,7 @@ static int csi_cosh_f32(struct csi_tensor *input, return CSINN_TRUE; } -static int csi_cosh_u8(struct csi_tensor *input, +int csi_cosh_u8(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params) { @@ -49,24 +49,21 @@ static int csi_cosh_u8(struct csi_tensor *input, } for (int i = 0; i < size; i++) { - float input0_val = csi_dequantize_f32(input_data[i], input->offset, input->multiplier, + float input0_val = csi_dequantize_u8_to_f32(input_data[i], input->zero_point, input->multiplier, input->shift); float res = cosh(input0_val); - output_data[i] = csi_quantize_f32(res, output->offset, output->multiplier, output->shift); + output_data[i] = csi_quantize_f32_to_u8(res, output->zero_point, output->multiplier, output->shift); } return CSINN_TRUE; } int csi_cosh_init(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) + struct csi_tensor *output, + struct siso_params *params) { - if (input->dtype == CSINN_DTYPE_UINT8) { - params->bc = csi_cosh_u8; - } else if (input->dtype == CSINN_DTYPE_FLOAT32) { - params->bc = csi_cosh_f32; - } else { + params->bc = csi_bc_map(params->api, CSINN_OP_COSH, input->dtype); + if (params->bc == NULL) { return CSINN_UNSUPPORT_DTYPE; } return CSINN_TRUE; diff --git a/source/reference/cumprod.c b/source/reference/cumprod.c index 6085b0af..ce0daaa5 100644 --- a/source/reference/cumprod.c +++ 
b/source/reference/cumprod.c @@ -20,9 +20,9 @@ #include "csi_utils.h" #include -static int csi_cumprod_f32(struct csi_tensor *input, - struct csi_tensor *output, - struct cumprod_params *params) +int csi_cumprod_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct cumprod_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -59,10 +59,9 @@ static int csi_cumprod_f32(struct csi_tensor *input, return CSINN_TRUE; } - -static int csi_cumprod_u8(struct csi_tensor *input, - struct csi_tensor *output, - struct cumprod_params *params) +int csi_cumprod_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct cumprod_params *params) { uint8_t *input_data = (uint8_t *)input->data; uint8_t *output_data = (uint8_t *)output->data; @@ -86,13 +85,13 @@ static int csi_cumprod_u8(struct csi_tensor *input, float temp = 1.0f; for(int j = 0; j < cnt; j++) { uint8_t input_val = *(input_data + j * inner_size + k); - float input_temp = csi_dequantize_f32(input_val, input->offset, input->multiplier, input->shift); + float input_temp = csi_dequantize_u8_to_f32(input_val, input->zero_point, input->multiplier, input->shift); temp *= input_temp; float output_temp = temp; if(!params->exclusive) { - *(output_data + j * inner_size + k) = csi_quantize_f32(output_temp, output->offset, output->multiplier, output->shift); + *(output_data + j * inner_size + k) = csi_quantize_f32_to_u8(output_temp, output->zero_point, output->multiplier, output->shift); } else { - *(output_data + j * inner_size + k) = csi_quantize_f32(output_temp / input_temp, output->offset, output->multiplier, output->shift); + *(output_data + j * inner_size + k) = csi_quantize_f32_to_u8(output_temp / input_temp, output->zero_point, output->multiplier, output->shift); } } } @@ -103,14 +102,11 @@ static int csi_cumprod_u8(struct csi_tensor *input, } int csi_cumprod_init(struct csi_tensor *input, - struct csi_tensor *output, - struct cumprod_params *params) 
+ struct csi_tensor *output, + struct cumprod_params *params) { - if (input->dtype == CSINN_DTYPE_UINT8) { - params->bc = csi_cumprod_u8; - } else if (input->dtype == CSINN_DTYPE_FLOAT32) { - params->bc = csi_cumprod_f32; - } else { + params->bc = csi_bc_map(params->api, CSINN_OP_CUMPROD, input->dtype); + if (params->bc == NULL) { return CSINN_UNSUPPORT_DTYPE; } return CSINN_TRUE; diff --git a/source/reference/cumsum.c b/source/reference/cumsum.c index 09600057..8926682e 100644 --- a/source/reference/cumsum.c +++ b/source/reference/cumsum.c @@ -20,10 +20,9 @@ #include "csi_utils.h" #include - -static int csi_cumsum_f32(struct csi_tensor *input, - struct csi_tensor *output, - struct cumsum_params *params) +int csi_cumsum_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct cumsum_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -60,9 +59,9 @@ static int csi_cumsum_f32(struct csi_tensor *input, return CSINN_TRUE; } -static int csi_cumsum_u8(struct csi_tensor *input, - struct csi_tensor *output, - struct cumsum_params *params) +int csi_cumsum_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct cumsum_params *params) { uint8_t *input_data = (uint8_t *)input->data; uint8_t *output_data = (uint8_t *)output->data; @@ -86,13 +85,13 @@ static int csi_cumsum_u8(struct csi_tensor *input, float temp = 0.0f; for(int j = 0; j < cnt; j++) { uint8_t input_val = *(input_data + j * inner_size + k); - float input_temp = csi_dequantize_f32(input_val, input->offset, input->multiplier, input->shift); + float input_temp = csi_dequantize_u8_to_f32(input_val, input->zero_point, input->multiplier, input->shift); temp *= input_temp; float output_temp = temp; if(!params->exclusive) { - *(output_data + j * inner_size + k) = csi_quantize_f32(output_temp, output->offset, output->multiplier, output->shift); + *(output_data + j * inner_size + k) = csi_quantize_f32_to_u8(output_temp, output->zero_point, 
output->multiplier, output->shift); } else { - *(output_data + j * inner_size + k) = csi_quantize_f32(output_temp - input_temp, output->offset, output->multiplier, output->shift); + *(output_data + j * inner_size + k) = csi_quantize_f32_to_u8(output_temp - input_temp, output->zero_point, output->multiplier, output->shift); } } } @@ -106,11 +105,8 @@ int csi_cumsum_init(struct csi_tensor *input, struct csi_tensor *output, struct cumsum_params *params) { - if (input->dtype == CSINN_DTYPE_UINT8) { - params->bc = csi_cumsum_u8; - } else if (input->dtype == CSINN_DTYPE_FLOAT32) { - params->bc = csi_cumsum_f32; - } else { + params->bc = csi_bc_map(params->api, CSINN_OP_CUMSUM, input->dtype); + if (params->bc == NULL) { return CSINN_UNSUPPORT_DTYPE; } return CSINN_TRUE; diff --git a/source/reference/deconvolution.c b/source/reference/deconvolution.c index 8f0c3b6c..25d4876d 100644 --- a/source/reference/deconvolution.c +++ b/source/reference/deconvolution.c @@ -39,9 +39,9 @@ static int csi_deconv2d_nhwc_u8(struct csi_tensor *input, const int output_height = output->dim[1]; const int output_width = output->dim[2]; const int output_batch = output->dim[0]; - const int32_t input_offset = input->offset; - const int32_t filter_offset = kernel->offset; - const int32_t output_offset = output->offset; + const int32_t input_offset = input->zero_point; + const int32_t filter_offset = kernel->zero_point; + const int32_t output_offset = output->zero_point; const int32_t output_multiplier = output->multiplier; const int output_shift = output->shift; @@ -77,8 +77,8 @@ static int csi_deconv2d_nhwc_u8(struct csi_tensor *input, kernel->dim, out_channel, filter_y, filter_x, in_channel)]; scratch_buffer[csi_get_index(output->dim, batch, out_y, out_x, out_channel)] += - (input_value + input_offset) * - (filter_value + filter_offset); + (input_value - input_offset) * + (filter_value - filter_offset); } } } @@ -103,7 +103,7 @@ static int csi_deconv2d_nhwc_u8(struct csi_tensor *input, for (int i 
= 0; i < num_elements; ++i) { output_data[i] = - csi_quantize_u8(scratch_buffer[i], output->offset, output->multiplier, output->shift); + csi_quantize_u8(scratch_buffer[i], output->zero_point, output->multiplier, output->shift); } return CSINN_TRUE; @@ -115,8 +115,8 @@ static int csi_deconv2d_nchw_u8(struct csi_tensor *o_input, struct csi_tensor *o_bias, struct conv2d_params *params) { - struct csi_tensor* input = csi_nchw_to_nhwc_u8(o_input); - struct csi_tensor* output = csi_nchw_to_nhwc_u8(o_output); + struct csi_tensor* input = csi_nchw_to_nhwc_8(o_input); + struct csi_tensor* output = csi_nchw_to_nhwc_8(o_output); int32_t permute[4] = {1, 2, 3, 0}; struct csi_tensor* kernel = csi_deconv_kernel_nchw_to_nhwc_u8(o_kernel, permute); struct csi_tensor* bias = o_bias; @@ -135,9 +135,9 @@ static int csi_deconv2d_nchw_u8(struct csi_tensor *o_input, const int output_height = output->dim[1]; const int output_width = output->dim[2]; const int output_batch = output->dim[0]; - const int32_t input_offset = input->offset; - const int32_t filter_offset = kernel->offset; - const int32_t output_offset = output->offset; + const int32_t input_offset = input->zero_point; + const int32_t filter_offset = kernel->zero_point; + const int32_t output_offset = output->zero_point; const int32_t output_multiplier = output->multiplier; const int output_shift = output->shift; @@ -173,8 +173,8 @@ static int csi_deconv2d_nchw_u8(struct csi_tensor *o_input, kernel->dim, out_channel, filter_y, filter_x, in_channel)]; scratch_buffer[csi_get_index(output->dim, batch, out_y, out_x, out_channel)] += - (input_value + input_offset) * - (filter_value + filter_offset); + (input_value - input_offset) * + (filter_value - filter_offset); } } } @@ -198,21 +198,21 @@ static int csi_deconv2d_nchw_u8(struct csi_tensor *o_input, for (int i = 0; i < num_elements; ++i) { output_data[i] = - csi_quantize_u8(scratch_buffer[i], output->offset, output->multiplier, output->shift); + csi_quantize_u8(scratch_buffer[i], 
output->zero_point, output->multiplier, output->shift); } - csi_nhwc_to_nchw_u8(o_output, output); + csi_nhwc_to_nchw_8(o_output, output); return CSINN_TRUE; } -static int csi_depthwise_deconv2d_nchw_u8(struct csi_tensor *o_input, +int csi_depthwise_deconv2d_u8(struct csi_tensor *o_input, struct csi_tensor *o_output, struct csi_tensor *o_kernel, struct csi_tensor *o_bias, struct conv2d_params *params) { - struct csi_tensor* input = csi_nchw_to_nhwc_u8(o_input); - struct csi_tensor* output = csi_nchw_to_nhwc_u8(o_output); + struct csi_tensor* input = csi_nchw_to_nhwc_8(o_input); + struct csi_tensor* output = csi_nchw_to_nhwc_8(o_output); int32_t permute[4] = {1, 2, 3, 0}; struct csi_tensor* kernel = csi_deconv_kernel_nchw_to_nhwc_u8(o_kernel, permute); struct csi_tensor* bias = o_bias; @@ -231,9 +231,9 @@ static int csi_depthwise_deconv2d_nchw_u8(struct csi_tensor *o_input, const int output_height = output->dim[1]; const int output_width = output->dim[2]; const int output_batch = output->dim[0]; - const int32_t input_offset = input->offset; - const int32_t filter_offset = kernel->offset; - const int32_t output_offset = output->offset; + const int32_t input_offset = input->zero_point; + const int32_t filter_offset = kernel->zero_point; + const int32_t output_offset = output->zero_point; const int32_t output_multiplier = output->multiplier; const int output_shift = output->shift; @@ -268,8 +268,8 @@ static int csi_depthwise_deconv2d_nchw_u8(struct csi_tensor *o_input, kernel->dim, 0, filter_y, filter_x, in_channel)]; scratch_buffer[csi_get_index(output->dim, batch, out_y, out_x, in_channel)] += - (input_value + input_offset) * - (filter_value + filter_offset); + (input_value - input_offset) * + (filter_value - filter_offset); } } } @@ -292,47 +292,46 @@ static int csi_depthwise_deconv2d_nchw_u8(struct csi_tensor *o_input, for (int i = 0; i < num_elements; ++i) { output_data[i] = - csi_quantize_u8(scratch_buffer[i], output->offset, output->multiplier, output->shift); + 
csi_quantize_u8(scratch_buffer[i], output->zero_point, output->multiplier, output->shift); } - csi_nhwc_to_nchw_u8(o_output, output); + csi_nhwc_to_nchw_8(o_output, output); return CSINN_TRUE; } +int csi_deconv2d_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct csi_tensor *kernel, + struct csi_tensor *bias, + struct conv2d_params *params) +{ + if (params->layout == CSINN_NCHW) { + csi_deconv2d_nchw_u8(input, output, kernel, bias, params); + } else if (params->layout == CSINN_NHWC) { + csi_deconv2d_nhwc_u8(input, output, kernel, bias, params); + } else { + return CSINN_UNSUPPORT_LAYOUT; + } +} + int csi_deconv2d_init(struct csi_tensor *input, struct csi_tensor *output, struct csi_tensor *kernel, struct csi_tensor *bias, struct conv2d_params *params) { - if (params->layout == CSINN_NCHW) { - if (params->group == 1) { - if (input->dtype == CSINN_DTYPE_UINT8) { - params->bc = csi_deconv2d_nchw_u8; - } else { - return CSINN_UNSUPPORT_DTYPE; - } - } else if (params->group == output->dim[1]) { - if (input->dtype == CSINN_DTYPE_UINT8) { - params->bc = csi_depthwise_deconv2d_nchw_u8; - } else { - return CSINN_UNSUPPORT_DTYPE; - } - } else { - return CSINN_FALSE; + if (params->group == 1) { + params->bc = csi_bc_map(params->api, CSINN_OP_DECONV2D, input->dtype); + if (params->bc == NULL) { + return CSINN_UNSUPPORT_DTYPE; } - } else if (params->layout == CSINN_NHWC) { - if (params->group == 1) { - if (input->dtype == CSINN_DTYPE_UINT8) { - params->bc = csi_deconv2d_nhwc_u8; - } else { - return CSINN_UNSUPPORT_DTYPE; - } - } else { - return CSINN_FALSE; + } else if (params->group == output->dim[1] && params->layout == CSINN_NCHW) { + params->bc = csi_bc_map(params->api, CSINN_OP_DEPTHWISE_DECONV2D, input->dtype); + if (params->bc == NULL) { + return CSINN_UNSUPPORT_DTYPE; } } else { - return CSINN_UNSUPPORT_LAYOUT; + return CSINN_FALSE; } return CSINN_TRUE; } diff --git a/source/reference/deconvolution3d.c b/source/reference/deconvolution3d.c index 
10f4ba77..e003ad55 100644 --- a/source/reference/deconvolution3d.c +++ b/source/reference/deconvolution3d.c @@ -19,12 +19,11 @@ #include "csi_nn.h" #include "csi_utils.h" - -static int csi_deconv3d_ncdhw_f32(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv3d_params *params) +int csi_deconv3d_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct csi_tensor *kernel, + struct csi_tensor *bias, + struct conv3d_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -117,69 +116,21 @@ static int csi_deconv3d_ncdhw_f32(struct csi_tensor *input, return CSINN_TRUE; } -static int csi_deconv3d_ncdhw_u8(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv3d_params *params) -{ - - - return CSINN_FALSE; -} - -static int csi_deconv3d_ndhwc_f32(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv3d_params *params) -{ - - return CSINN_FALSE; -} - -static int csi_deconv3d_ndhwc_u8(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv3d_params *params) -{ - - return CSINN_FALSE; -} - - - - int csi_deconv3d_init(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv3d_params *params) + struct csi_tensor *output, + struct csi_tensor *kernel, + struct csi_tensor *bias, + struct conv3d_params *params) { - if(params->layout == CSINN_NCDHW) { - if(input->dtype == CSINN_DTYPE_UINT8) { - params->bc = csi_deconv3d_ncdhw_u8; - } else if(input->dtype == CSINN_DTYPE_FLOAT32) { - params->bc = csi_deconv3d_ncdhw_f32; - } else { - return CSINN_UNSUPPORT_DTYPE; - } - } else if(params->layout == CSINN_NDHWC) { - if(input->dtype == CSINN_DTYPE_UINT8) { - params->bc = 
csi_deconv3d_ndhwc_u8; - } else if(input->dtype == CSINN_DTYPE_FLOAT32) { - params->bc = csi_deconv3d_ndhwc_f32; - } else { + if (params->layout == CSINN_NCDHW) { + params->bc = csi_bc_map(params->api, CSINN_OP_DECONV3D, input->dtype); + if (params->bc == NULL) { return CSINN_UNSUPPORT_DTYPE; } - } else { - return CSINN_UNSUPPORT_LAYOUT; } return CSINN_TRUE; } - int csi_deconv3d(struct csi_tensor *input, struct csi_tensor *output, struct csi_tensor *kernel, diff --git a/source/reference/depth_to_space.c b/source/reference/depth_to_space.c index db348f28..af958006 100644 --- a/source/reference/depth_to_space.c +++ b/source/reference/depth_to_space.c @@ -21,9 +21,9 @@ //the input->data is a 4-D Tensor with shape [batch, depth, height, width]. -static int csi_depth_to_space_f32(struct csi_tensor *input, - struct csi_tensor *output, - struct depth_to_space_params *params) +int csi_depth_to_space_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct depth_to_space_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -65,10 +65,9 @@ static int csi_depth_to_space_f32(struct csi_tensor *input, return CSINN_TRUE; } - -static int csi_depth_to_space_u8(struct csi_tensor *input, - struct csi_tensor *output, - struct depth_to_space_params *params) +int csi_depth_to_space_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct depth_to_space_params *params) { uint8_t *input_data = (uint8_t *)input->data; uint8_t *output_data = (uint8_t *)output->data; @@ -115,11 +114,8 @@ int csi_depth_to_space_init(struct csi_tensor *input, struct csi_tensor *output, struct depth_to_space_params *params) { - if (input->dtype == CSINN_DTYPE_UINT8) { - params->bc = csi_depth_to_space_u8; - } else if (input->dtype == CSINN_DTYPE_FLOAT32) { - params->bc = csi_depth_to_space_f32; - } else { + params->bc = csi_bc_map(params->api, CSINN_OP_DEPTH_TO_SPACE, input->dtype); + if (params->bc == NULL) { return 
CSINN_UNSUPPORT_DTYPE; } return CSINN_TRUE; diff --git a/source/reference/div.c b/source/reference/div.c index 1cb59146..e0b5f777 100644 --- a/source/reference/div.c +++ b/source/reference/div.c @@ -19,10 +19,10 @@ #include "csi_nn.h" #include "csi_utils.h" -static int csi_div_f32(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int csi_div_f32(struct csi_tensor *input0, + struct csi_tensor *input1, + struct csi_tensor *output, + struct diso_params *params) { float *input0_data = input0->data; float *input1_data = input1->data; @@ -38,10 +38,10 @@ static int csi_div_f32(struct csi_tensor *input0, return CSINN_TRUE; } -static int csi_div_u8(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int csi_div_u8(struct csi_tensor *input0, + struct csi_tensor *input1, + struct csi_tensor *output, + struct diso_params *params) { uint8_t *input0_data = input0->data; uint8_t *input1_data = input1->data; @@ -53,11 +53,11 @@ static int csi_div_u8(struct csi_tensor *input0, for (int i = 0; i < size; i++) { float input0_val = - csi_dequantize_f32(input0_data[i], input0->offset, input0->multiplier, input0->shift); + csi_dequantize_u8_to_f32(input0_data[i], input0->zero_point, input0->multiplier, input0->shift); float input1_val = - csi_dequantize_f32(input1_data[i], input1->offset, input1->multiplier, input1->shift); + csi_dequantize_u8_to_f32(input1_data[i], input1->zero_point, input1->multiplier, input1->shift); float res = input0_val / input1_val; - output_data[i] = csi_quantize_f32(res, output->offset, output->multiplier, output->shift); + output_data[i] = csi_quantize_f32_to_u8(res, output->zero_point, output->multiplier, output->shift); } return CSINN_TRUE; } @@ -67,11 +67,8 @@ int csi_div_init(struct csi_tensor *input0, struct csi_tensor *output, struct diso_params *params) { - if (input0->dtype == CSINN_DTYPE_UINT8) { - params->bc = csi_div_u8; 
- } else if (input0->dtype == CSINN_DTYPE_FLOAT32) { - params->bc = csi_div_f32; - } else { + params->bc = csi_bc_map(params->api, CSINN_OP_DIV, input0->dtype); + if (params->bc == NULL) { return CSINN_UNSUPPORT_DTYPE; } return CSINN_TRUE; diff --git a/source/reference/elu.c b/source/reference/elu.c index d82ed97a..549374f4 100644 --- a/source/reference/elu.c +++ b/source/reference/elu.c @@ -24,9 +24,9 @@ static float elu(float x){ return x < 0.0 ? exp(x) - 1 : x; } -static int csi_elu_f32(struct csi_tensor *input, - struct csi_tensor *output, - struct relu_params *params) +int csi_elu_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct relu_params *params) { float *input_data = input->data; float *output_data = output->data; @@ -41,9 +41,9 @@ static int csi_elu_f32(struct csi_tensor *input, return CSINN_TRUE; } -static int csi_elu_u8(struct csi_tensor *input, - struct csi_tensor *output, - struct relu_params *params) +int csi_elu_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct relu_params *params) { uint8_t *input_data = input->data; uint8_t *output_data = output->data; @@ -53,11 +53,11 @@ static int csi_elu_u8(struct csi_tensor *input, } for (int i = 0; i < size; i++) { - float input0_val = csi_dequantize_f32(input_data[i], input->offset, input->multiplier, + float input0_val = csi_dequantize_u8_to_f32(input_data[i], input->zero_point, input->multiplier, input->shift); float res = elu(input0_val); - output_data[i] = csi_quantize_f32(res, output->offset, output->multiplier, output->shift); + output_data[i] = csi_quantize_f32_to_u8(res, output->zero_point, output->multiplier, output->shift); } return CSINN_TRUE; } @@ -66,11 +66,8 @@ int csi_elu_init(struct csi_tensor *input, struct csi_tensor *output, struct relu_params *params) { - if (input->dtype == CSINN_DTYPE_UINT8) { - params->bc = csi_elu_u8; - } else if (input->dtype == CSINN_DTYPE_FLOAT32) { - params->bc = csi_elu_f32; - } else { + params->bc = csi_bc_map(params->api, 
CSINN_OP_ELU, input->dtype); + if (params->bc == NULL) { return CSINN_UNSUPPORT_DTYPE; } return CSINN_TRUE; diff --git a/source/reference/equal.c b/source/reference/equal.c index d5731f3c..d24967a8 100644 --- a/source/reference/equal.c +++ b/source/reference/equal.c @@ -19,10 +19,10 @@ #include "csi_nn.h" #include "csi_utils.h" -static int csi_equal_f32(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int csi_equal_f32(struct csi_tensor *input0, + struct csi_tensor *input1, + struct csi_tensor *output, + struct diso_params *params) { float *input0_data = input0->data; float *input1_data = input1->data; @@ -38,10 +38,10 @@ static int csi_equal_f32(struct csi_tensor *input0, return CSINN_TRUE; } -static int csi_equal_u8(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int csi_equal_u8(struct csi_tensor *input0, + struct csi_tensor *input1, + struct csi_tensor *output, + struct diso_params *params) { uint8_t *input0_data = input0->data; uint8_t *input1_data = input1->data; @@ -52,13 +52,13 @@ static int csi_equal_u8(struct csi_tensor *input0, } for (int i = 0; i < size; i++) { - float input0_val = csi_dequantize_f32(input0_data[i], input0->offset, input0->multiplier, + float input0_val = csi_dequantize_u8_to_f32(input0_data[i], input0->zero_point, input0->multiplier, input0->shift); - float input1_val = csi_dequantize_f32(input1_data[i], input1->offset, input1->multiplier, + float input1_val = csi_dequantize_u8_to_f32(input1_data[i], input1->zero_point, input1->multiplier, input1->shift); float res = input0_val == input1_val; - output_data[i] = csi_quantize_f32(res, output->offset, output->multiplier, output->shift); + output_data[i] = csi_quantize_f32_to_u8(res, output->zero_point, output->multiplier, output->shift); } return CSINN_TRUE; } @@ -68,11 +68,8 @@ int csi_equal_init(struct csi_tensor *input0, struct csi_tensor *output, 
struct diso_params *params) { - if (input0->dtype == CSINN_DTYPE_UINT8) { - params->bc = csi_equal_u8; - } else if (input0->dtype == CSINN_DTYPE_FLOAT32) { - params->bc = csi_equal_f32; - } else { + params->bc = csi_bc_map(params->api, CSINN_OP_EQUANL, input0->dtype); + if (params->bc == NULL) { return CSINN_UNSUPPORT_DTYPE; } return CSINN_TRUE; diff --git a/source/reference/erf.c b/source/reference/erf.c index d0413556..d8a70700 100644 --- a/source/reference/erf.c +++ b/source/reference/erf.c @@ -22,9 +22,9 @@ #define ERF_PARAM 1.128379167 -static int csi_erf_f32(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csi_erf_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct siso_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -36,9 +36,9 @@ static int csi_erf_f32(struct csi_tensor *input, return CSINN_FALSE; } -static int csi_erf_u8(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csi_erf_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct siso_params *params) { uint8_t *input_data = input->data; uint8_t *output_data = output->data; @@ -54,11 +54,8 @@ int csi_erf_init(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params) { - if (input->dtype == CSINN_DTYPE_UINT8) { - params->bc = csi_erf_u8; - } else if (input->dtype == CSINN_DTYPE_FLOAT32) { - params->bc = csi_erf_f32; - } else { + params->bc = csi_bc_map(params->api, CSINN_OP_ERF, input->dtype); + if (params->bc == NULL) { return CSINN_UNSUPPORT_DTYPE; } return CSINN_TRUE; diff --git a/source/reference/exp.c b/source/reference/exp.c index e90b4f94..d158f894 100644 --- a/source/reference/exp.c +++ b/source/reference/exp.c @@ -20,9 +20,9 @@ #include "csi_utils.h" #include -static int csi_exp_f32(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csi_exp_f32(struct csi_tensor *input, + 
struct csi_tensor *output, + struct siso_params *params) { float *input_data = input->data; float *output_data = output->data; @@ -37,9 +37,9 @@ static int csi_exp_f32(struct csi_tensor *input, return CSINN_TRUE; } -static int csi_exp_u8(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csi_exp_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct siso_params *params) { uint8_t *input_data = input->data; uint8_t *output_data = output->data; @@ -49,11 +49,11 @@ static int csi_exp_u8(struct csi_tensor *input, } for (int i = 0; i < size; i++) { - float input0_val = csi_dequantize_f32(input_data[i], input->offset, input->multiplier, + float input0_val = csi_dequantize_u8_to_f32(input_data[i], input->zero_point, input->multiplier, input->shift); float res = exp(input0_val); - output_data[i] = csi_quantize_f32(res, output->offset, output->multiplier, output->shift); + output_data[i] = csi_quantize_f32_to_u8(res, output->zero_point, output->multiplier, output->shift); } return CSINN_TRUE; } @@ -62,11 +62,8 @@ int csi_exp_init(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params) { - if (input->dtype == CSINN_DTYPE_UINT8) { - params->bc = csi_exp_u8; - } else if (input->dtype == CSINN_DTYPE_FLOAT32) { - params->bc = csi_exp_f32; - } else { + params->bc = csi_bc_map(params->api, CSINN_OP_EXP, input->dtype); + if (params->bc == NULL) { return CSINN_UNSUPPORT_DTYPE; } return CSINN_TRUE; diff --git a/source/reference/expand_dims.c b/source/reference/expand_dims.c index ed1d89dd..5b5117c4 100644 --- a/source/reference/expand_dims.c +++ b/source/reference/expand_dims.c @@ -19,9 +19,9 @@ #include "csi_nn.h" #include "csi_utils.h" -static int csi_expand_dims_f32(struct csi_tensor *input, - struct csi_tensor *output, - struct expand_dims_params *params) +int csi_expand_dims_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct expand_dims_params *params) { float *input_data = (float 
*)input->data; float *output_data = (float *)output->data; @@ -35,9 +35,9 @@ static int csi_expand_dims_f32(struct csi_tensor *input, return CSINN_TRUE; } -static int csi_expand_dims_u8(struct csi_tensor *input, - struct csi_tensor *output, - struct expand_dims_params *params) +int csi_expand_dims_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct expand_dims_params *params) { uint8_t *input_data = input->data; uint8_t *output_data = output->data; @@ -55,11 +55,8 @@ int csi_expand_dims_init(struct csi_tensor *input, struct csi_tensor *output, struct expand_dims_params *params) { - if (input->dtype == CSINN_DTYPE_UINT8) { - params->bc = csi_expand_dims_u8; - } else if (input->dtype == CSINN_DTYPE_FLOAT32) { - params->bc = csi_expand_dims_f32; - } else { + params->bc = csi_bc_map(params->api, CSINN_OP_EXPAND_DIMS, input->dtype); + if (params->bc == NULL) { return CSINN_UNSUPPORT_DTYPE; } return CSINN_TRUE; diff --git a/source/reference/expm1.c b/source/reference/expm1.c index faf1c8e0..1803952d 100644 --- a/source/reference/expm1.c +++ b/source/reference/expm1.c @@ -20,9 +20,9 @@ #include "csi_utils.h" #include -static int csi_expm1_f32(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csi_expm1_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct siso_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -37,9 +37,9 @@ static int csi_expm1_f32(struct csi_tensor *input, return CSINN_TRUE; } -static int csi_expm1_u8(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csi_expm1_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct siso_params *params) { uint8_t *input_data = (uint8_t *)input->data; uint8_t *output_data = (uint8_t *)output->data; @@ -49,32 +49,29 @@ static int csi_expm1_u8(struct csi_tensor *input, } for (int i = 0; i < size; i++) { - float input0_val = 
csi_dequantize_f32(input_data[i], input->offset, input->multiplier, + float input0_val = csi_dequantize_u8_to_f32(input_data[i], input->zero_point, input->multiplier, input->shift); float res = exp(input0_val) - 1; - output_data[i] = csi_quantize_f32(res, output->offset, output->multiplier, output->shift); + output_data[i] = csi_quantize_f32_to_u8(res, output->zero_point, output->multiplier, output->shift); } return CSINN_TRUE; } int csi_expm1_init(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) + struct csi_tensor *output, + struct siso_params *params) { - if (input->dtype == CSINN_DTYPE_UINT8) { - params->bc = csi_expm1_u8; - } else if (input->dtype == CSINN_DTYPE_FLOAT32) { - params->bc = csi_expm1_f32; - } else { + params->bc = csi_bc_map(params->api, CSINN_OP_EXPM1, input->dtype); + if (params->bc == NULL) { return CSINN_UNSUPPORT_DTYPE; } return CSINN_TRUE; } int csi_expm1(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) + struct csi_tensor *output, + struct siso_params *params) { if (params->bc != NULL) { params->bc(input, output, params); diff --git a/source/reference/flatten.c b/source/reference/flatten.c index da8afc74..1ef81edc 100644 --- a/source/reference/flatten.c +++ b/source/reference/flatten.c @@ -19,9 +19,9 @@ #include "csi_nn.h" #include "csi_utils.h" -static int csi_flatten_f32(struct csi_tensor *input, - struct csi_tensor *output, - struct flatten_params *params) +int csi_flatten_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct flatten_params *params) { float *input_data = input->data; float *output_data = output->data; @@ -39,9 +39,9 @@ static int csi_flatten_f32(struct csi_tensor *input, return CSINN_TRUE; } -static int csi_flatten_u8(struct csi_tensor *input, - struct csi_tensor *output, - struct flatten_params *params) +int csi_flatten_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct flatten_params *params) { uint8_t *input_data = 
input->data; uint8_t *output_data = output->data; @@ -63,11 +63,8 @@ int csi_flatten_init(struct csi_tensor *input, struct csi_tensor *output, struct flatten_params *params) { - if (input->dtype == CSINN_DTYPE_UINT8) { - params->bc = csi_flatten_u8; - } else if (input->dtype == CSINN_DTYPE_FLOAT32) { - params->bc = csi_flatten_f32; - } else { + params->bc = csi_bc_map(params->api, CSINN_OP_FLATTEN, input->dtype); + if (params->bc == NULL) { return CSINN_UNSUPPORT_DTYPE; } return CSINN_TRUE; diff --git a/source/reference/floor.c b/source/reference/floor.c index 8ac7242e..3bb95906 100644 --- a/source/reference/floor.c +++ b/source/reference/floor.c @@ -20,9 +20,9 @@ #include "csi_utils.h" #include -static int csi_floor_f32(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csi_floor_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct siso_params *params) { float *input_data = input->data; float *output_data = output->data; @@ -37,9 +37,9 @@ static int csi_floor_f32(struct csi_tensor *input, return CSINN_TRUE; } -static int csi_floor_u8(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csi_floor_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct siso_params *params) { uint8_t *input_data = input->data; uint8_t *output_data = output->data; @@ -49,11 +49,11 @@ static int csi_floor_u8(struct csi_tensor *input, } for (int i = 0; i < size; i++) { - float input0_val = csi_dequantize_f32(input_data[i], input->offset, input->multiplier, + float input0_val = csi_dequantize_u8_to_f32(input_data[i], input->zero_point, input->multiplier, input->shift); float res = floor(input0_val); - output_data[i] = csi_quantize_f32(res, output->offset, output->multiplier, output->shift); + output_data[i] = csi_quantize_f32_to_u8(res, output->zero_point, output->multiplier, output->shift); } return CSINN_TRUE; } @@ -62,11 +62,8 @@ int csi_floor_init(struct csi_tensor *input, struct 
csi_tensor *output, struct siso_params *params) { - if (input->dtype == CSINN_DTYPE_UINT8) { - params->bc = csi_floor_u8; - } else if (input->dtype == CSINN_DTYPE_FLOAT32) { - params->bc = csi_floor_f32; - } else { + params->bc = csi_bc_map(params->api, CSINN_OP_FLOOR, input->dtype); + if (params->bc == NULL) { return CSINN_UNSUPPORT_DTYPE; } return CSINN_TRUE; diff --git a/source/reference/floor_divide.c b/source/reference/floor_divide.c index 3945b19c..1571743f 100644 --- a/source/reference/floor_divide.c +++ b/source/reference/floor_divide.c @@ -19,10 +19,10 @@ #include "csi_nn.h" #include "csi_utils.h" -static int csi_floor_divide_f32(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int csi_floor_divide_f32(struct csi_tensor *input0, + struct csi_tensor *input1, + struct csi_tensor *output, + struct diso_params *params) { float *input0_data = input0->data; float *input1_data = input1->data; @@ -38,10 +38,10 @@ static int csi_floor_divide_f32(struct csi_tensor *input0, return CSINN_TRUE; } -static int csi_floor_divide_u8(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int csi_floor_divide_u8(struct csi_tensor *input0, + struct csi_tensor *input1, + struct csi_tensor *output, + struct diso_params *params) { uint8_t *input0_data = input0->data; uint8_t *input1_data = input1->data; @@ -52,13 +52,13 @@ static int csi_floor_divide_u8(struct csi_tensor *input0, } for (int i = 0; i < size; i++) { - float input0_val = csi_dequantize_f32(input0_data[i], input0->offset, input0->multiplier, + float input0_val = csi_dequantize_u8_to_f32(input0_data[i], input0->zero_point, input0->multiplier, input0->shift); - float input1_val = csi_dequantize_f32(input1_data[i], input1->offset, input1->multiplier, + float input1_val = csi_dequantize_u8_to_f32(input1_data[i], input1->zero_point, input1->multiplier, input1->shift); float res = floor(input0_val 
/ input1_val); - output_data[i] = csi_quantize_f32(res, output->offset, output->multiplier, output->shift); + output_data[i] = csi_quantize_f32_to_u8(res, output->zero_point, output->multiplier, output->shift); } return CSINN_TRUE; } @@ -68,11 +68,8 @@ int csi_floor_divide_init(struct csi_tensor *input0, struct csi_tensor *output, struct diso_params *params) { - if (input0->dtype == CSINN_DTYPE_UINT8) { - params->bc = csi_floor_divide_u8; - } else if (input0->dtype == CSINN_DTYPE_FLOAT32) { - params->bc = csi_floor_divide_f32; - } else { + params->bc = csi_bc_map(params->api, CSINN_OP_FLOOR_DIVIDE, input0->dtype); + if (params->bc == NULL) { return CSINN_UNSUPPORT_DTYPE; } return CSINN_TRUE; diff --git a/source/reference/floor_mod.c b/source/reference/floor_mod.c index eedeac65..f4205d85 100644 --- a/source/reference/floor_mod.c +++ b/source/reference/floor_mod.c @@ -19,10 +19,10 @@ #include "csi_nn.h" #include "csi_utils.h" -static int csi_floor_mod_f32(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int csi_floor_mod_f32(struct csi_tensor *input0, + struct csi_tensor *input1, + struct csi_tensor *output, + struct diso_params *params) { float *input0_data = input0->data; float *input1_data = input1->data; @@ -40,10 +40,10 @@ static int csi_floor_mod_f32(struct csi_tensor *input0, return CSINN_TRUE; } -static int csi_floor_mod_u8(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int csi_floor_mod_u8(struct csi_tensor *input0, + struct csi_tensor *input1, + struct csi_tensor *output, + struct diso_params *params) { uint8_t *input0_data = input0->data; uint8_t *input1_data = input1->data; @@ -54,14 +54,14 @@ static int csi_floor_mod_u8(struct csi_tensor *input0, } for (int i = 0; i < size; i++) { - float input0_val = csi_dequantize_f32(input0_data[i], input0->offset, input0->multiplier, + float input0_val = 
csi_dequantize_u8_to_f32(input0_data[i], input0->zero_point, input0->multiplier, input0->shift); - float input1_val = csi_dequantize_f32(input1_data[i], input1->offset, input1->multiplier, + float input1_val = csi_dequantize_u8_to_f32(input1_data[i], input1->zero_point, input1->multiplier, input1->shift); float res = input0_val - floor(input0_val / input1_val) * input1_val; // res = floor(res); - output_data[i] = csi_quantize_f32(res, output->offset, output->multiplier, output->shift); + output_data[i] = csi_quantize_f32_to_u8(res, output->zero_point, output->multiplier, output->shift); } return CSINN_TRUE; } @@ -71,11 +71,8 @@ int csi_floor_mod_init(struct csi_tensor *input0, struct csi_tensor *output, struct diso_params *params) { - if (input0->dtype == CSINN_DTYPE_UINT8) { - params->bc = csi_floor_mod_u8; - } else if (input0->dtype == CSINN_DTYPE_FLOAT32) { - params->bc = csi_floor_mod_f32; - } else { + params->bc = csi_bc_map(params->api, CSINN_OP_FLOOR_MOD, input0->dtype); + if (params->bc == NULL) { return CSINN_UNSUPPORT_DTYPE; } return CSINN_TRUE; diff --git a/source/reference/fullyconnected.c b/source/reference/fullyconnected.c index f162cfc3..466bcfad 100644 --- a/source/reference/fullyconnected.c +++ b/source/reference/fullyconnected.c @@ -19,11 +19,11 @@ #include "csi_nn.h" #include "csi_utils.h" -static int csi_fullyconnected_f32(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *weights, - struct csi_tensor *bias, - struct fc_params *params) +int csi_fullyconnected_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct csi_tensor *weights, + struct csi_tensor *bias, + struct fc_params *params) { float *input_data = input->data; float *output_data = output->data; @@ -50,11 +50,11 @@ static int csi_fullyconnected_f32(struct csi_tensor *input, return CSINN_TRUE; } -static int csi_fullyconnected_u8(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *weights, - struct csi_tensor *bias, - struct 
fc_params *params) +int csi_fullyconnected_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct csi_tensor *weights, + struct csi_tensor *bias, + struct fc_params *params) { uint8_t *input_data = input->data; uint8_t *output_data = output->data; @@ -72,14 +72,14 @@ static int csi_fullyconnected_u8(struct csi_tensor *input, for (int d = 0; d < accum_depth; ++d) { int32_t input_val = input_data[b * accum_depth + d]; int32_t filter_val = weights_data[out_c * accum_depth + d]; - acc += (filter_val + weights->offset) * (input_val + input->offset); + acc += (filter_val - weights->zero_point) * (input_val - input->zero_point); } if (bias_data != NULL) { acc += bias_data[out_c]; } output_data[out_c + output_depth * b] = - csi_quantize_u8(acc, output->offset, output->multiplier, output->shift); + csi_quantize_u8(acc, output->zero_point, output->multiplier, output->shift); } } return CSINN_TRUE; @@ -91,11 +91,8 @@ int csi_fullyconnected_init(struct csi_tensor *input, struct csi_tensor *bias, struct fc_params *params) { - if (input->dtype == CSINN_DTYPE_UINT8) { - params->bc = csi_fullyconnected_u8; - } else if (input->dtype == CSINN_DTYPE_FLOAT32) { - params->bc = csi_fullyconnected_f32; - } else { + params->bc = csi_bc_map(params->api, CSINN_OP_FULLYCONNECTED, input->dtype); + if (params->bc == NULL) { return CSINN_UNSUPPORT_DTYPE; } return CSINN_TRUE; diff --git a/source/reference/gather.c b/source/reference/gather.c index 2fa155dc..64f75258 100644 --- a/source/reference/gather.c +++ b/source/reference/gather.c @@ -19,10 +19,9 @@ #include "csi_nn.h" #include "csi_utils.h" - -static int csi_gather_f32(struct csi_tensor *input, - struct csi_tensor *output, - struct gather_params *params) +int csi_gather_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct gather_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -45,9 +44,9 @@ static int csi_gather_f32(struct csi_tensor *input, return 
CSINN_TRUE; } -static int csi_gather_u8(struct csi_tensor *input, - struct csi_tensor *output, - struct gather_params *params) +int csi_gather_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct gather_params *params) { uint8_t *input_data = (uint8_t *)input->data; uint8_t *output_data = (uint8_t *)output->data; @@ -61,11 +60,11 @@ static int csi_gather_u8(struct csi_tensor *input, if(params->indices[i] < input->dim[0]) { for(int j = 0; j < inner_size; j++) { *(output_data + j) = csi_requantize_u8(*(input_data + params->indices[i] * inner_size + j), - input->offset, input->multiplier, input->shift, output->offset, output->multiplier, output->shift); + input->zero_point, input->multiplier, input->shift, output->zero_point, output->multiplier, output->shift); } } else { - uint8_t zero = csi_requantize_u8(0.0f, input->offset, input->multiplier, input->shift, - output->offset, output->multiplier, output->shift); + uint8_t zero = csi_requantize_u8(0.0f, input->zero_point, input->multiplier, input->shift, + output->zero_point, output->multiplier, output->shift); for(int j = 0; j < inner_size; j++) { *(output_data + j) = zero; } @@ -76,22 +75,19 @@ static int csi_gather_u8(struct csi_tensor *input, } int csi_gather_init(struct csi_tensor *input, - struct csi_tensor *output, - struct gather_params *params) + struct csi_tensor *output, + struct gather_params *params) { - if (input->dtype == CSINN_DTYPE_UINT8) { - params->bc = csi_gather_u8; - } else if (input->dtype == CSINN_DTYPE_FLOAT32) { - params->bc = csi_gather_f32; - } else { + params->bc = csi_bc_map(params->api, CSINN_OP_GATHER, input->dtype); + if (params->bc == NULL) { return CSINN_UNSUPPORT_DTYPE; } return CSINN_TRUE; } int csi_gather(struct csi_tensor *input, - struct csi_tensor *output, - struct gather_params *params) + struct csi_tensor *output, + struct gather_params *params) { if (params->bc != NULL) { params->bc(input, output, params); diff --git a/source/reference/gather_nd.c 
b/source/reference/gather_nd.c index 714811fc..a67b5725 100644 --- a/source/reference/gather_nd.c +++ b/source/reference/gather_nd.c @@ -29,10 +29,10 @@ static int Multiplication(int *input, int s, int e) return res; } -static int csi_gather_nd_f32(struct csi_tensor *input, - struct csi_tensor *indices, - struct csi_tensor *output, - struct gather_nd_params *params) +int csi_gather_nd_f32(struct csi_tensor *input, + struct csi_tensor *indices, + struct csi_tensor *output, + struct gather_nd_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -89,11 +89,10 @@ static int csi_gather_nd_f32(struct csi_tensor *input, return CSINN_TRUE; } - -static int csi_gather_nd_u8(struct csi_tensor *input, - struct csi_tensor *indices, - struct csi_tensor *output, - struct gather_nd_params *params) +int csi_gather_nd_u8(struct csi_tensor *input, + struct csi_tensor *indices, + struct csi_tensor *output, + struct gather_nd_params *params) { uint8_t *input_data = (uint8_t *)input->data; uint8_t *output_data = (uint8_t *)output->data; @@ -138,16 +137,16 @@ static int csi_gather_nd_u8(struct csi_tensor *input, } if(dim_over_flag == 1) { dim_over_flag = 0; - uint8_t zero = csi_requantize_u8(0.0f, input->offset, input->multiplier, input->shift, - output->offset, output->multiplier, output->shift); + uint8_t zero = csi_requantize_u8(0.0f, input->zero_point, input->multiplier, input->shift, + output->zero_point, output->multiplier, output->shift); for(int n = 0; n < input_inner_size; n++) { *(output_data + n) = zero; } } else { in_copy_addr = input_data + input_outer_idx * input_inner_size; for(int k = 0; k < input_inner_size; k++) { - *(output_data + k) = csi_requantize_u8(*(in_copy_addr + k), input->offset, input->multiplier, input->shift, - output->offset, output->multiplier, output->shift); + *(output_data + k) = csi_requantize_u8(*(in_copy_addr + k), input->zero_point, input->multiplier, input->shift, + output->zero_point, 
output->multiplier, output->shift); } } output_data += input_inner_size; @@ -161,11 +160,8 @@ int csi_gather_nd_init(struct csi_tensor *input, struct csi_tensor *output, struct gather_nd_params *params) { - if(input->dtype == CSINN_DTYPE_UINT8) { - params->bc = csi_gather_nd_u8; - } else if(input->dtype == CSINN_DTYPE_FLOAT32) { - params->bc = csi_gather_nd_f32; - } else { + params->bc = csi_bc_map(params->api, CSINN_OP_GATHER_ND, input->dtype); + if (params->bc == NULL) { return CSINN_UNSUPPORT_DTYPE; } return CSINN_TRUE; diff --git a/source/reference/global_averagepool.c b/source/reference/global_averagepool.c index 0a6f23ee..64a48e03 100644 --- a/source/reference/global_averagepool.c +++ b/source/reference/global_averagepool.c @@ -20,8 +20,8 @@ #include "csi_utils.h" static int csi_global_averagepool_nhwc_u8(struct csi_tensor *input, - struct csi_tensor *output, - struct pool_params *params) + struct csi_tensor *output, + struct pool_params *params) { uint8_t *input_data = input->data; uint8_t *output_data = output->data; @@ -32,10 +32,10 @@ static int csi_global_averagepool_nhwc_u8(struct csi_tensor *input, const int output_height = output->dim[1]; const int output_width = output->dim[2]; - const int32_t input_offset = input->offset; + const int32_t input_offset = input->zero_point; const int32_t input_multiplier = input->multiplier; const int32_t input_shift = input->shift; - const int32_t output_offset = output->offset; + const int32_t output_offset = output->zero_point; const int32_t output_multiplier = output->multiplier; const int32_t output_shift = output->shift; @@ -68,7 +68,7 @@ static int csi_global_averagepool_nhwc_u8(struct csi_tensor *input, const int in_y = in_y_origin + filter_y; uint8_t input_val = input_data[csi_get_index(input->dim, batch, in_y, in_x, channel)]; - total += csi_dequantize_f32(input_val, input_offset, input_multiplier, + total += csi_dequantize_u8_to_f32(input_val, input_offset, input_multiplier, input_shift); filter_count++; } 
@@ -76,7 +76,72 @@ static int csi_global_averagepool_nhwc_u8(struct csi_tensor *input, assert(filter_count != 0); float average = total / filter_count; output_data[csi_get_index(output->dim, batch, out_y, out_x, channel)] = - csi_quantize_f32(average, output_offset, output_multiplier, output_shift); + csi_quantize_f32_to_u8(average, output_offset, output_multiplier, output_shift); + } + } + } + } + return CSINN_TRUE; +} + +static int csi_global_averagepool_nhwc_i8(struct csi_tensor *input, + struct csi_tensor *output, + struct pool_params *params) +{ + int8_t *input_data = input->data; + int8_t *output_data = output->data; + const int batches = input->dim[0]; + const int depth = input->dim[3]; + const int input_height = input->dim[1]; + const int input_width = input->dim[2]; + const int output_height = output->dim[1]; + const int output_width = output->dim[2]; + + const int32_t input_offset = input->zero_point; + const int32_t input_multiplier = input->multiplier; + const int32_t input_shift = input->shift; + const int32_t output_offset = output->zero_point; + const int32_t output_multiplier = output->multiplier; + const int32_t output_shift = output->shift; + + int filter_height = input_height; + int filter_width = input_width; + int stride_height = 1; + int stride_width = 1; + int pad_height = 0; + int pad_width = 0; + + for (int batch = 0; batch < batches; ++batch) { + for (int out_y = 0; out_y < output_height; ++out_y) { + for (int out_x = 0; out_x < output_width; ++out_x) { + for (int channel = 0; channel < depth; ++channel) { + const int in_x_origin = (out_x * stride_width) - pad_width; + const int in_y_origin = (out_y * stride_height) - pad_height; + // Compute the boundaries of the filter region clamped so as to + // ensure that the filter window fits in the input array. 
+ const int filter_x_start = csi_max_internal_s32(0, -in_x_origin); + const int filter_x_end = + csi_min_internal_s32(filter_width, input_width - in_x_origin); + const int filter_y_start = csi_max_internal_s32(0, -in_y_origin); + const int filter_y_end = + csi_min_internal_s32(filter_height, input_height - in_y_origin); + float total = 0; + int32_t filter_count = 0; + for (int filter_y = filter_y_start; filter_y < filter_y_end; ++filter_y) { + for (int filter_x = filter_x_start; filter_x < filter_x_end; ++filter_x) { + const int in_x = in_x_origin + filter_x; + const int in_y = in_y_origin + filter_y; + int8_t input_val = input_data[csi_get_index(input->dim, batch, in_y, + in_x, channel)]; + total += csi_dequantize_u8_to_f32(input_val, input_offset, input_multiplier, + input_shift); + filter_count++; + } + } + assert(filter_count != 0); + float average = total / filter_count; + output_data[csi_get_index(output->dim, batch, out_y, out_x, channel)] = + csi_quantize_f32_to_i8(average, output_offset, output_multiplier, output_shift); } } } @@ -85,13 +150,13 @@ static int csi_global_averagepool_nhwc_u8(struct csi_tensor *input, } static int csi_global_averagepool_nchw_u8(struct csi_tensor *o_input, - struct csi_tensor *o_output, - struct pool_params *params) + struct csi_tensor *o_output, + struct pool_params *params) { struct csi_tensor* input; struct csi_tensor* output; - input = csi_nchw_to_nhwc_u8(o_input); - output = csi_nchw_to_nhwc_u8(o_output); + input = csi_nchw_to_nhwc_8(o_input); + output = csi_nchw_to_nhwc_8(o_output); uint8_t *input_data = input->data; uint8_t *output_data = output->data; @@ -102,10 +167,10 @@ static int csi_global_averagepool_nchw_u8(struct csi_tensor *o_input, const int output_height = output->dim[1]; const int output_width = output->dim[2]; - const int32_t input_offset = input->offset; + const int32_t input_offset = input->zero_point; const int32_t input_multiplier = input->multiplier; const int32_t input_shift = input->shift; - const 
int32_t output_offset = output->offset; + const int32_t output_offset = output->zero_point; const int32_t output_multiplier = output->multiplier; const int32_t output_shift = output->shift; @@ -138,7 +203,7 @@ static int csi_global_averagepool_nchw_u8(struct csi_tensor *o_input, const int in_y = in_y_origin + filter_y; uint8_t input_val = input_data[csi_get_index(input->dim, batch, in_y, in_x, channel)]; - total += csi_dequantize_f32(input_val, input_offset, input_multiplier, + total += csi_dequantize_u8_to_f32(input_val, input_offset, input_multiplier, input_shift); filter_count++; } @@ -146,34 +211,120 @@ static int csi_global_averagepool_nchw_u8(struct csi_tensor *o_input, assert(filter_count != 0); float average = total / filter_count; output_data[csi_get_index(output->dim, batch, out_y, out_x, channel)] = - csi_quantize_f32(average, output_offset, output_multiplier, output_shift); + csi_quantize_f32_to_u8(average, output_offset, output_multiplier, output_shift); } } } } - csi_nhwc_to_nchw_u8(o_output, output); + csi_nhwc_to_nchw_8(o_output, output); return CSINN_TRUE; } -int csi_global_averagepool_init(struct csi_tensor *input, - struct csi_tensor *output, - struct pool_params *params) +static int csi_global_averagepool_nchw_i8(struct csi_tensor *o_input, + struct csi_tensor *o_output, + struct pool_params *params) { - if (params->layout == CSINN_NCHW) { - if (input->dtype == CSINN_DTYPE_UINT8) { - params->bc = csi_global_averagepool_nchw_u8; - } else { - return CSINN_UNSUPPORT_DTYPE; - } - } else if (params->layout = CSINN_NHWC) { - if (input->dtype == CSINN_DTYPE_UINT8) { - params->bc = csi_global_averagepool_nhwc_u8; - } else { - return CSINN_UNSUPPORT_DTYPE; + struct csi_tensor* input; + struct csi_tensor* output; + input = csi_nchw_to_nhwc_8(o_input); + output = csi_nchw_to_nhwc_8(o_output); + + int8_t *input_data = input->data; + int8_t *output_data = output->data; + const int batches = input->dim[0]; + const int depth = input->dim[3]; + const int 
input_height = input->dim[1]; + const int input_width = input->dim[2]; + const int output_height = output->dim[1]; + const int output_width = output->dim[2]; + + const int32_t input_offset = input->zero_point; + const int32_t input_multiplier = input->multiplier; + const int32_t input_shift = input->shift; + const int32_t output_offset = output->zero_point; + const int32_t output_multiplier = output->multiplier; + const int32_t output_shift = output->shift; + + int filter_height = input_height; + int filter_width = input_width; + int stride_height = 1; + int stride_width = 1; + int pad_height = 0; + int pad_width = 0; + + for (int batch = 0; batch < batches; ++batch) { + for (int out_y = 0; out_y < output_height; ++out_y) { + for (int out_x = 0; out_x < output_width; ++out_x) { + for (int channel = 0; channel < depth; ++channel) { + const int in_x_origin = (out_x * stride_width) - pad_width; + const int in_y_origin = (out_y * stride_height) - pad_height; + // Compute the boundaries of the filter region clamped so as to + // ensure that the filter window fits in the input array. 
+ const int filter_x_start = csi_max_internal_s32(0, -in_x_origin); + const int filter_x_end = + csi_min_internal_s32(filter_width, input_width - in_x_origin); + const int filter_y_start = csi_max_internal_s32(0, -in_y_origin); + const int filter_y_end = + csi_min_internal_s32(filter_height, input_height - in_y_origin); + float total = 0; + int32_t filter_count = 0; + for (int filter_y = filter_y_start; filter_y < filter_y_end; ++filter_y) { + for (int filter_x = filter_x_start; filter_x < filter_x_end; ++filter_x) { + const int in_x = in_x_origin + filter_x; + const int in_y = in_y_origin + filter_y; + int8_t input_val = input_data[csi_get_index(input->dim, batch, in_y, + in_x, channel)]; + total += csi_dequantize_u8_to_f32(input_val, input_offset, input_multiplier, + input_shift); + filter_count++; + } + } + assert(filter_count != 0); + float average = total / filter_count; + output_data[csi_get_index(output->dim, batch, out_y, out_x, channel)] = + csi_quantize_f32_to_i8(average, output_offset, output_multiplier, output_shift); + } + } } + } + csi_nhwc_to_nchw_8(o_output, output); + return CSINN_TRUE; +} + +int csi_global_averagepool_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct pool_params *params) +{ + if (params->layout == CSINN_NCHW) { + return csi_global_averagepool_nchw_u8(input, output, params); + } else if (params->layout == CSINN_NHWC) { + return csi_global_averagepool_nhwc_u8(input, output, params); + } else { + return CSINN_UNSUPPORT_LAYOUT; + } +} + +int csi_global_averagepool_i8(struct csi_tensor *input, + struct csi_tensor *output, + struct pool_params *params) +{ + if (params->layout == CSINN_NCHW) { + return csi_global_averagepool_nchw_i8(input, output, params); + } else if (params->layout == CSINN_NHWC) { + return csi_global_averagepool_nhwc_i8(input, output, params); } else { return CSINN_UNSUPPORT_LAYOUT; } +} + +int csi_global_averagepool_init(struct csi_tensor *input, + struct csi_tensor *output, + struct pool_params *params) +{ + params->bc = 
csi_bc_map(params->api, CSINN_OP_GLOBAL_AVGPOOL2D, input->dtype); + if (params->bc == NULL) { + return CSINN_UNSUPPORT_DTYPE; + } return CSINN_TRUE; } diff --git a/source/reference/global_maxpool.c b/source/reference/global_maxpool.c index 61ece444..d1800019 100644 --- a/source/reference/global_maxpool.c +++ b/source/reference/global_maxpool.c @@ -33,10 +33,10 @@ static int csi_global_maxpool_nhwc_u8(struct csi_tensor *input, const int output_height = output->dim[1]; const int output_width = output->dim[2]; - const int32_t input_offset = input->offset; + const int32_t input_offset = input->zero_point; const int32_t input_multiplier = input->multiplier; const int32_t input_shift = input->shift; - const int32_t output_offset = output->offset; + const int32_t output_offset = output->zero_point; const int32_t output_multiplier = output->multiplier; const int32_t output_shift = output->shift; @@ -69,7 +69,7 @@ static int csi_global_maxpool_nhwc_u8(struct csi_tensor *input, const int in_y = in_y_origin + filter_y; uint8_t input_val = input_data[csi_get_index(input->dim, batch, in_y, in_x, channel)]; - curr_value = csi_dequantize_f32(input_val, input_offset, input_multiplier, + curr_value = csi_dequantize_u8_to_f32(input_val, input_offset, input_multiplier, input_shift); if (curr_value > max_value) { max_value = curr_value; @@ -77,7 +77,7 @@ static int csi_global_maxpool_nhwc_u8(struct csi_tensor *input, } } output_data[csi_get_index(output->dim, batch, out_y, out_x, channel)] = - csi_quantize_f32(max_value, output_offset, output_multiplier, output_shift); + csi_quantize_f32_to_u8(max_value, output_offset, output_multiplier, output_shift); } } } @@ -91,8 +91,8 @@ static int csi_global_maxpool_nchw_u8(struct csi_tensor *o_input, { struct csi_tensor* input; struct csi_tensor* output; - input = csi_nchw_to_nhwc_u8(o_input); - output = csi_nchw_to_nhwc_u8(o_output); + input = csi_nchw_to_nhwc_8(o_input); + output = csi_nchw_to_nhwc_8(o_output); uint8_t *input_data = 
input->data; uint8_t *output_data = output->data; @@ -103,10 +103,10 @@ static int csi_global_maxpool_nchw_u8(struct csi_tensor *o_input, const int output_height = output->dim[1]; const int output_width = output->dim[2]; - const int32_t input_offset = input->offset; + const int32_t input_offset = input->zero_point; const int32_t input_multiplier = input->multiplier; const int32_t input_shift = input->shift; - const int32_t output_offset = output->offset; + const int32_t output_offset = output->zero_point; const int32_t output_multiplier = output->multiplier; const int32_t output_shift = output->shift; @@ -139,7 +139,7 @@ static int csi_global_maxpool_nchw_u8(struct csi_tensor *o_input, const int in_y = in_y_origin + filter_y; uint8_t input_val = input_data[csi_get_index(input->dim, batch, in_y, in_x, channel)]; - curr_value = csi_dequantize_f32(input_val, input_offset, input_multiplier, + curr_value = csi_dequantize_u8_to_f32(input_val, input_offset, input_multiplier, input_shift); if (curr_value > max_value) { max_value = curr_value; @@ -147,34 +147,36 @@ static int csi_global_maxpool_nchw_u8(struct csi_tensor *o_input, } } output_data[csi_get_index(output->dim, batch, out_y, out_x, channel)] = - csi_quantize_f32(max_value, output_offset, output_multiplier, output_shift); + csi_quantize_f32_to_u8(max_value, output_offset, output_multiplier, output_shift); } } } } - csi_nhwc_to_nchw_u8(o_output, output); + csi_nhwc_to_nchw_8(o_output, output); return CSINN_TRUE; } -int csi_global_maxpool_init(struct csi_tensor *input, - struct csi_tensor *output, - struct pool_params *params) +int csi_global_maxpool_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct pool_params *params) { if (params->layout == CSINN_NCHW) { - if (input->dtype == CSINN_DTYPE_UINT8) { - params->bc = csi_global_maxpool_nchw_u8; - } else { - return CSINN_UNSUPPORT_DTYPE; - } - } else if (params->layout = CSINN_NHWC) { - if (input->dtype == CSINN_DTYPE_UINT8) { - params->bc = 
csi_global_maxpool_nhwc_u8; - } else { - return CSINN_UNSUPPORT_DTYPE; - } + return csi_global_maxpool_nchw_u8(input, output, params); + } else if (params->layout == CSINN_NHWC) { + return csi_global_maxpool_nhwc_u8(input, output, params); } else { return CSINN_UNSUPPORT_LAYOUT; } +} + +int csi_global_maxpool_init(struct csi_tensor *input, + struct csi_tensor *output, + struct pool_params *params) +{ + params->bc = csi_bc_map(params->api, CSINN_OP_GLOBAL_MAXPOOL2D, input->dtype); + if (params->bc == NULL) { + return CSINN_UNSUPPORT_DTYPE; + } return CSINN_TRUE; } diff --git a/source/reference/greater.c b/source/reference/greater.c index dc7e7b7c..efe4c548 100644 --- a/source/reference/greater.c +++ b/source/reference/greater.c @@ -19,10 +19,10 @@ #include "csi_nn.h" #include "csi_utils.h" -static int csi_greater_f32(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int csi_greater_f32(struct csi_tensor *input0, + struct csi_tensor *input1, + struct csi_tensor *output, + struct diso_params *params) { float *input0_data = input0->data; float *input1_data = input1->data; @@ -38,10 +38,10 @@ static int csi_greater_f32(struct csi_tensor *input0, return CSINN_TRUE; } -static int csi_greater_u8(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int csi_greater_u8(struct csi_tensor *input0, + struct csi_tensor *input1, + struct csi_tensor *output, + struct diso_params *params) { uint8_t *input0_data = input0->data; uint8_t *input1_data = input1->data; @@ -52,27 +52,24 @@ static int csi_greater_u8(struct csi_tensor *input0, } for (int i = 0; i < size; i++) { - float input0_val = csi_dequantize_f32(input0_data[i], input0->offset, input0->multiplier, + float input0_val = csi_dequantize_u8_to_f32(input0_data[i], input0->zero_point, input0->multiplier, input0->shift); - float input1_val = csi_dequantize_f32(input1_data[i], input1->offset, input1->multiplier, +
float input1_val = csi_dequantize_u8_to_f32(input1_data[i], input1->zero_point, input1->multiplier, input1->shift); float res = input0_val > input1_val; - output_data[i] = csi_quantize_f32(res, output->offset, output->multiplier, output->shift); + output_data[i] = csi_quantize_f32_to_u8(res, output->zero_point, output->multiplier, output->shift); } return CSINN_TRUE; } int csi_greater_init(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) + struct csi_tensor *input1, + struct csi_tensor *output, + struct diso_params *params) { - if (input0->dtype == CSINN_DTYPE_UINT8) { - params->bc = csi_greater_u8; - } else if (input0->dtype == CSINN_DTYPE_FLOAT32) { - params->bc = csi_greater_f32; - } else { + params->bc = csi_bc_map(params->api, CSINN_OP_GREATHER, input0->dtype); + if (params->bc == NULL) { return CSINN_UNSUPPORT_DTYPE; } return CSINN_TRUE; diff --git a/source/reference/greater_equal.c b/source/reference/greater_equal.c index 0059ef75..137892ff 100644 --- a/source/reference/greater_equal.c +++ b/source/reference/greater_equal.c @@ -19,10 +19,10 @@ #include "csi_nn.h" #include "csi_utils.h" -static int csi_greater_equal_f32(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int csi_greater_equal_f32(struct csi_tensor *input0, + struct csi_tensor *input1, + struct csi_tensor *output, + struct diso_params *params) { float *input0_data = input0->data; float *input1_data = input1->data; @@ -38,10 +38,10 @@ static int csi_greater_equal_f32(struct csi_tensor *input0, return CSINN_TRUE; } -static int csi_greater_equal_u8(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int csi_greater_equal_u8(struct csi_tensor *input0, + struct csi_tensor *input1, + struct csi_tensor *output, + struct diso_params *params) { uint8_t *input0_data = 
input1->data; @@ -52,36 +52,33 @@ static int csi_greater_equal_u8(struct csi_tensor *input0, } for (int i = 0; i < size; i++) { - float input0_val = csi_dequantize_f32(input0_data[i], input0->offset, input0->multiplier, + float input0_val = csi_dequantize_u8_to_f32(input0_data[i], input0->zero_point, input0->multiplier, input0->shift); - float input1_val = csi_dequantize_f32(input1_data[i], input1->offset, input1->multiplier, + float input1_val = csi_dequantize_u8_to_f32(input1_data[i], input1->zero_point, input1->multiplier, input1->shift); float res = input0_val >= input1_val; - output_data[i] = csi_quantize_f32(res, output->offset, output->multiplier, output->shift); + output_data[i] = csi_quantize_f32_to_u8(res, output->zero_point, output->multiplier, output->shift); } return CSINN_TRUE; } int csi_greater_equal_init(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) + struct csi_tensor *input1, + struct csi_tensor *output, + struct diso_params *params) { - if (input0->dtype == CSINN_DTYPE_UINT8) { - params->bc = csi_greater_equal_u8; - } else if (input0->dtype == CSINN_DTYPE_FLOAT32) { - params->bc = csi_greater_equal_f32; - } else { + params->bc = csi_bc_map(params->api, CSINN_OP_GREATHER_EQUAL, input0->dtype); + if (params->bc == NULL) { return CSINN_UNSUPPORT_DTYPE; } return CSINN_TRUE; } int csi_greater_equal(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) + struct csi_tensor *input1, + struct csi_tensor *output, + struct diso_params *params) { if (params->bc != NULL) { params->bc(input0, input1, output, params); diff --git a/source/reference/hard_sigmoid.c b/source/reference/hard_sigmoid.c index 3f54d608..d3649f57 100644 --- a/source/reference/hard_sigmoid.c +++ b/source/reference/hard_sigmoid.c @@ -19,9 +19,9 @@ #include "csi_nn.h" #include "csi_utils.h" -static int csi_hard_sigmoid_f32(struct csi_tensor *input, - struct 
csi_tensor *output, - struct sigmoid_params *params) +int csi_hard_sigmoid_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct sigmoid_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -41,9 +41,9 @@ static int csi_hard_sigmoid_f32(struct csi_tensor *input, return CSINN_TRUE; } -static int csi_hard_sigmoid_u8(struct csi_tensor *input, - struct csi_tensor *output, - struct sigmoid_params *params) +int csi_hard_sigmoid_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct sigmoid_params *params) { uint8_t *input_data = (uint8_t *)input->data; uint8_t *output_data = (uint8_t *)output->data; @@ -52,7 +52,7 @@ static int csi_hard_sigmoid_u8(struct csi_tensor *input, size *= input->dim[i]; } for(int i = 0; i < size; i++) { - float input_val = csi_dequantize_f32(input_data[i], input->offset, input->multiplier, + float input_val = csi_dequantize_u8_to_f32(input_data[i], input->zero_point, input->multiplier, input->shift); float output_val = 0.0f; if(input_val < -2.5) { @@ -62,28 +62,25 @@ static int csi_hard_sigmoid_u8(struct csi_tensor *input, } else { output_val = 0.2 * input_val + 0.5; } - output_data[i] = csi_quantize_f32(output_val, output->offset, output->multiplier, output->shift); + output_data[i] = csi_quantize_f32_to_u8(output_val, output->zero_point, output->multiplier, output->shift); } return CSINN_TRUE; } int csi_hard_sigmoid_init(struct csi_tensor *input, - struct csi_tensor *output, - struct sigmoid_params *params) + struct csi_tensor *output, + struct sigmoid_params *params) { - if(input->dtype == CSINN_DTYPE_UINT8) { - params->bc = csi_hard_sigmoid_u8; - } else if(input->dtype == CSINN_DTYPE_FLOAT32) { - params->bc = csi_hard_sigmoid_f32; - } else { + params->bc = csi_bc_map(params->api, CSINN_OP_HARD_SIGMOID, input->dtype); + if (params->bc == NULL) { return CSINN_UNSUPPORT_DTYPE; } return CSINN_TRUE; } int csi_hard_sigmoid(struct csi_tensor *input, - struct csi_tensor 
*output, - struct sigmoid_params *params) + struct csi_tensor *output, + struct sigmoid_params *params) { if (params->bc != NULL) { params->bc(input, output, params); diff --git a/source/reference/im2col.c b/source/reference/im2col.c index 43cba157..7e7b7a3f 100644 --- a/source/reference/im2col.c +++ b/source/reference/im2col.c @@ -19,23 +19,284 @@ #include "csi_nn.h" #include "csi_utils.h" +// input_data layout:NCHW +// https://github.com/pjreddie/darknet/blob/master/src/im2col.c +// output_data: row = channels*ksize_h*ksize_w, col = batch*height_col*width_col +static int csi_im2col_nchw_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct im2col_params *params) +{ + float *input_data = (float *)input->data; + float *output_data = (float *)output->data; + int32_t batch = input->dim[0]; + int32_t channel = input->dim[1]; + int32_t height = input->dim[2]; + int32_t width = input->dim[3]; + int32_t ksize_h = params->kernel_h; + int32_t ksize_w = params->kernel_w; + int32_t stride_h = params->stride_h; + int32_t stride_w = params->stride_w; + + int height_col = (height + params->pad_top + params->pad_down - ksize_h) / stride_h + 1; // output_height + int width_col = (width + params->pad_left + params->pad_right - ksize_w) / stride_w + 1; // output_width, batch * output_height * output_width = matrix_col + int channel_col = channel * ksize_h * ksize_w; + + for(int c = 0; c < channel_col; ++c) { + int w_offset = c % ksize_w; + int h_offset = c / ksize_w % ksize_h; + int c_im = c / ksize_h / ksize_w; + for(int b = 0; b < batch; ++b) { + for(int h = 0; h < height_col; ++h) { + for(int w = 0; w < width_col; ++w) { + int im_row = h_offset + h * stride_h; + int im_col = w_offset + w * stride_w; + int col_index = ((c * batch + b) * height_col + h) * width_col + w; + im_row = im_row - params->pad_top; + im_col = im_col - params->pad_left; + if(im_row < 0 || im_col < 0 || im_row >= height || im_col >=width) { + output_data[col_index] = 0.0f; + } else { + 
output_data[col_index] = input_data[csi_get_index(input->dim, b, c_im, im_row, im_col)]; + } + } + } + } + return CSINN_TRUE; +} + +static int csi_im2col_nchw_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct im2col_params *params) +{ + uint8_t *input_data = (uint8_t *)input->data; + uint8_t *output_data = (uint8_t *)output->data; + int32_t batch = input->dim[0]; + int32_t channel = input->dim[1]; + int32_t height = input->dim[2]; + int32_t width = input->dim[3]; + int32_t ksize_h = params->kernel_h; + int32_t ksize_w = params->kernel_w; + int32_t stride_h = params->stride_h; + int32_t stride_w = params->stride_w; + + int height_col = (height + params->pad_top + params->pad_down - ksize_h) / stride_h + 1; + int width_col = (width + params->pad_left + params->pad_right - ksize_w) / stride_w + 1; + int channel_col = channel * ksize_h * ksize_w; + + for(int c = 0; c < channel_col; ++c) { + int w_offset = c % ksize_w; + int h_offset = c / ksize_w % ksize_h; + int c_im = c / ksize_h / ksize_w; + for(int b = 0; b < batch; ++b) { + for(int h = 0; h < height_col; ++h) { + for(int w = 0; w < width_col; ++w) { + int im_row = h_offset + h * stride_h; + int im_col = w_offset + w * stride_w; + int col_index = ((c * batch + b) * height_col + h) * width_col + w; + im_row = im_row - params->pad_top; + im_col = im_col - params->pad_left; + if(im_row < 0 || im_col < 0 || im_row >= height || im_col >=width) { + output_data[col_index] = csi_quantize_f32_to_u8(0.0f, input->zero_point, input->multiplier, input->shift); + } else { + output_data[col_index] = input_data[csi_get_index(input->dim, b, c_im, im_row, im_col)]; + } + } + } + } + } + return CSINN_TRUE; +} + + +// input_data layout:NHWC +// output_data: row = batch*height_col*width_col, col = channels*ksize_h*ksize_w +static int csi_im2col_nhwc_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct im2col_params *params) +{ + float *input_data = (float *)input->data; + float *output_data = (float
*)output->data; + int32_t batch = input->dim[0]; + int32_t channel = input->dim[3]; + int32_t height = input->dim[1]; + int32_t width = input->dim[2]; + int32_t ksize_h = params->kernel_h; + int32_t ksize_w = params->kernel_w; + int32_t stride_h = params->stride_h; + int32_t stride_w = params->stride_w; + + + int height_col = (height + params->pad_top + params->pad_down - ksize_h) / stride_h + 1; // output_height + int width_col = (width + params->pad_left + params->pad_right - ksize_w) / stride_w + 1; // output_width, output_height * output_width = matrix_ + int channel_col = channel * ksize_h * ksize_w; + + for(int b = 0; b < batch; ++b) { + for(int h = 0; h < height_col; ++h) { + for(int w = 0; w < width_col; ++w) { + for(int c = 0; c < channel_col; ++c) { + int w_offset = c % ksize_w; + int h_offset = c / ksize_w % ksize_h; + int c_im = c / ksize_h / ksize_w; + + int im_row = h_offset + h * stride_h; + int im_col = w_offset + w * stride_w; + int col_index = ((b * height_col + h) * width_col + w) * channel_col + c; + im_row = im_row - params->pad_top; + im_col = im_col - params->pad_left; + if(im_row < 0 || im_col < 0 || im_row >= height || im_col >=width) { + output_data[col_index] = 0.0f; + } else { + output_data[col_index] = input_data[csi_get_index(input->dim, b, im_row, im_col, c_im)]; + } + } + } + } + } + +/* +output_data layout: row = channels*ksize_h*ksize_w, col = batch*height_col*width_col +*/ + // for(int c = 0; c < channel_col; ++c) { + // int w_offset = c % ksize_w; + // int h_offset = c / ksize_w % ksize_h; + // int c_im = c / ksize_h / ksize_w; + // for(int b = 0; b < batch; ++b) { + // for(int h = 0; h < height_col; ++h) { + // for(int w = 0; w < width_col; ++w) { + // int im_row = h_offset + h * stride_h; + // int im_col = w_offset + w * stride_w; + // int col_index = ((c * batch + b) * height_col + h) * width_col + w; + // im_row = im_row - params->pad_top; + // im_col = im_col - params->pad_left; + // if(im_row < 0 || im_col < 0 || im_row >= 
height || im_col >=width) { + // output_data[col_index] = 0.0f; + // } else { + // output_data[col_index] = input_data[csi_get_index(input->dim, b, im_row, im_col, c_im)]; + // } + // } + // } + // } + // } + + return CSINN_TRUE; +} + +static int csi_im2col_nhwc_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct im2col_params *params) +{ + uint8_t *input_data = (uint8_t *)input->data; + uint8_t *output_data = (uint8_t *)output->data; + int32_t batch = input->dim[0]; + int32_t channel = input->dim[3]; + int32_t height = input->dim[1]; + int32_t width = input->dim[2]; + int32_t ksize_h = params->kernel_h; + int32_t ksize_w = params->kernel_w; + int32_t stride_h = params->stride_h; + int32_t stride_w = params->stride_w; + + int height_col = (height + params->pad_top + params->pad_down - ksize_h) / stride_h + 1; // output_height + int width_col = (width + params->pad_left + params->pad_right - ksize_w) / stride_w + 1; // output_width, output_height * output_width = matrix_ + int channel_col = channel * ksize_h * ksize_w; + + for(int b = 0; b < batch; ++b) { + for(int h = 0; h < height_col; ++h) { + for(int w = 0; w < width_col; ++w) { + for(int c = 0; c < channel_col; ++c) { + int w_offset = c % ksize_w; + int h_offset = c / ksize_w % ksize_h; + int c_im = c / ksize_h / ksize_w; + + int im_row = h_offset + h * stride_h; + int im_col = w_offset + w * stride_w; + int col_index = ((b * height_col + h) * width_col + w) * channel_col + c; + im_row = im_row - params->pad_top; + im_col = im_col - params->pad_left; + if(im_row < 0 || im_col < 0 || im_row >= height || im_col >=width) { + output_data[col_index] = csi_quantize_f32_to_u8(0.0f, input->zero_point, input->multiplier, input->shift); + } else { + output_data[col_index] = input_data[csi_get_index(input->dim, b, im_row, im_col, c_im)]; + } + } + } + } + } + + // for(int c = 0; c < channel_col; ++c) { + // int w_offset = c % ksize_w; + // int h_offset = c / ksize_w % ksize_h; + // int c_im = c / ksize_h / 
ksize_w; + // for(int b = 0; b < batch; ++b) { + // for(int h = 0; h < height_col; ++h) { + // for(int w = 0; w < width_col; ++w) { + // int im_row = h_offset + h * stride_h; + // int im_col = w_offset + w * stride_w; + // int col_index = ((c * batch + b) * height_col + h) * width_col + w; + // im_row = im_row - params->pad_top; + // im_col = im_col - params->pad_left; + // if(im_row < 0 || im_col < 0 || im_row >= height || im_col >=width) { + // output_data[col_index] = csi_quantize_f32_to_u8(0.0f, input->zero_point, input->multiplier, input->shift); + // } else { + // output_data[col_index] = input_data[csi_get_index(input->dim, b, im_row, im_col, c_im)]; + // } + // } + // } + // } + // } + + return CSINN_TRUE; +} + + +int csi_im2col_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct im2col_params *params) +{ + if (params->layout == CSINN_NCHW) { + csi_im2col_nchw_f32(input, output, params); + } else if (params->layout == CSINN_NHWC) { + csi_im2col_nhwc_f32(input, output, params); + } else { + return CSINN_UNSUPPORT_LAYOUT; + } + return CSINN_TRUE; +} + +int csi_im2col_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct im2col_params *params) +{ + if (params->layout == CSINN_NCHW) { + csi_im2col_nchw_u8(input, output, params); + } else if (params->layout == CSINN_NHWC) { + csi_im2col_nhwc_u8(input, output, params); + } else { + return CSINN_UNSUPPORT_LAYOUT; + } + return CSINN_TRUE; +} + + int csi_im2col_init(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct im2col_params *params) { - return CSINN_FALSE; + params->bc = csi_bc_map(params->api, CSINN_OP_IM2COL, input->dtype); + if(params->bc == NULL) { + return CSINN_UNSUPPORT_DTYPE; + } + return CSINN_TRUE; } int csi_im2col(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct im2col_params *params) { if (params->bc != NULL) { - params->bc(input, output, kernel, params); + params->bc(input, output, params); } 
else { return CSINN_CALLBACK_UNSET; } return CSINN_TRUE; -} \ No newline at end of file +} diff --git a/source/reference/isnan.c b/source/reference/isnan.c index adab9093..6bc8c7c2 100644 --- a/source/reference/isnan.c +++ b/source/reference/isnan.c @@ -19,9 +19,9 @@ #include "csi_nn.h" #include "csi_utils.h" -static int csi_isnan_bool_f32(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csi_isnan_bool_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct siso_params *params) { float *input_data = input->data; bool *output_data = output->data; @@ -36,9 +36,9 @@ static int csi_isnan_bool_f32(struct csi_tensor *input, return CSINN_TRUE; } -static int csi_isnan_bool_u8(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csi_isnan_bool_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct siso_params *params) { uint8_t *input_data = input->data; bool *output_data = output->data; @@ -48,7 +48,7 @@ static int csi_isnan_bool_u8(struct csi_tensor *input, } for (int i = 0; i < size; i++) { - float input0_val = csi_dequantize_f32(input_data[i], input->offset, input->multiplier, + float input0_val = csi_dequantize_u8_to_f32(input_data[i], input->zero_point, input->multiplier, input->shift); output_data[i] = isnan(input0_val); } @@ -56,22 +56,19 @@ static int csi_isnan_bool_u8(struct csi_tensor *input, } int csi_isnan_bool_init(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) + struct csi_tensor *output, + struct siso_params *params) { - if (input->dtype == CSINN_DTYPE_UINT8) { - params->bc = csi_isnan_bool_u8; - } else if (input->dtype == CSINN_DTYPE_FLOAT32) { - params->bc = csi_isnan_bool_f32; - } else { + params->bc = csi_bc_map(params->api, CSINN_OP_ISNAN, input->dtype); + if (params->bc == NULL) { return CSINN_UNSUPPORT_DTYPE; } return CSINN_TRUE; } int csi_isnan_bool(struct csi_tensor *input, - struct csi_tensor *output, - struct 
siso_params *params) + struct csi_tensor *output, + struct siso_params *params) { if (params->bc != NULL) { params->bc(input, output, params); diff --git a/source/reference/l2_normalization.c b/source/reference/l2_normalization.c index 3a9d00c6..acdef754 100644 --- a/source/reference/l2_normalization.c +++ b/source/reference/l2_normalization.c @@ -21,9 +21,9 @@ /* https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/kernels/internal/reference/l2normalization.h */ -static int csi_l2_normalization_f32(struct csi_tensor *input, - struct csi_tensor *output, - struct l2n_params *params) +int csi_l2_normalization_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct l2n_params *params) { float *input_data = input->data; float *output_data = output->data; @@ -49,9 +49,9 @@ static int csi_l2_normalization_f32(struct csi_tensor *input, return CSINN_TRUE; } -static int csi_l2_normalization_u8(struct csi_tensor *input, - struct csi_tensor *output, - struct l2n_params *params) +int csi_l2_normalization_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct l2n_params *params) { uint8_t *input_data = input->data; uint8_t *output_data = output->data; @@ -66,13 +66,13 @@ static int csi_l2_normalization_u8(struct csi_tensor *input, for (int i = 0; i < outer_size; ++i) { float squared_l2_norm = 0; for (int c = 0; c < depth; ++c) { - const float val = csi_dequantize_f32(input_data[depth * i + c], input->offset, + const float val = csi_dequantize_u8_to_f32(input_data[depth * i + c], input->zero_point, input->multiplier, input->shift); squared_l2_norm += val * val; } const float l2_norm = sqrt(squared_l2_norm + params->epsilon); for (int c = 0; c < depth; ++c) { - output_data[depth * i + c] = csi_quantize_f32(input_data[depth * i + c] / l2_norm, output->offset, + output_data[depth * i + c] = csi_quantize_f32_to_u8(input_data[depth * i + c] / l2_norm, output->zero_point, output->multiplier, output->shift); } } @@ -80,22 +80,19 @@ static int 
csi_l2_normalization_u8(struct csi_tensor *input, } int csi_l2_normalization_init(struct csi_tensor *input, - struct csi_tensor *output, - struct l2n_params *params) + struct csi_tensor *output, + struct l2n_params *params) { - if (input->dtype == CSINN_DTYPE_FLOAT32) { - params->bc = csi_l2_normalization_f32; - } else if (input->dtype == CSINN_DTYPE_UINT8) { - params->bc = csi_l2_normalization_u8; - } else { + params->bc = csi_bc_map(params->api, CSINN_OP_L2N, input->dtype); + if (params->bc == NULL) { return CSINN_UNSUPPORT_DTYPE; } return CSINN_TRUE; } int csi_l2_normalization(struct csi_tensor *input, - struct csi_tensor *output, - struct l2n_params *params) + struct csi_tensor *output, + struct l2n_params *params) { if (params->bc != NULL) { params->bc(input, output, params); diff --git a/source/reference/l2pool.c b/source/reference/l2pool.c index c3d859d2..7df0d569 100644 --- a/source/reference/l2pool.c +++ b/source/reference/l2pool.c @@ -19,9 +19,9 @@ #include "csi_nn.h" #include "csi_utils.h" -static int csi_l2pool_f32(struct csi_tensor *input, - struct csi_tensor *output, - struct pool_params *params) +int csi_l2pool_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct pool_params *params) { float *input_data = input->data; float *output_data = output->data; @@ -67,20 +67,19 @@ static int csi_l2pool_f32(struct csi_tensor *input, } int csi_l2pool_init(struct csi_tensor *input, - struct csi_tensor *output, - struct pool_params *params) + struct csi_tensor *output, + struct pool_params *params) { - if (input->dtype == CSINN_DTYPE_FLOAT32) { - params->bc = csi_l2pool_f32; - } else { + params->bc = csi_bc_map(params->api, CSINN_OP_L2POOL2D, input->dtype); + if (params->bc == NULL) { return CSINN_UNSUPPORT_DTYPE; } return CSINN_TRUE; } int csi_l2pool(struct csi_tensor *input, - struct csi_tensor *output, - struct pool_params *params) + struct csi_tensor *output, + struct pool_params *params) { if (params->bc != NULL) { params->bc(input, output, 
params); diff --git a/source/reference/leaky_relu.c b/source/reference/leaky_relu.c index 96c022f9..508e6df0 100644 --- a/source/reference/leaky_relu.c +++ b/source/reference/leaky_relu.c @@ -19,9 +19,9 @@ #include "csi_nn.h" #include "csi_utils.h" -static int csi_leaky_relu_f32(struct csi_tensor *input, - struct csi_tensor *output, - struct relu_params *params) +int csi_leaky_relu_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct relu_params *params) { float *input_data = input->data; float *output_data = output->data; @@ -37,9 +37,9 @@ static int csi_leaky_relu_f32(struct csi_tensor *input, return CSINN_TRUE; } -static int csi_leaky_relu_u8(struct csi_tensor *input, - struct csi_tensor *output, - struct relu_params *params) +int csi_leaky_relu_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct relu_params *params) { uint8_t *input_data = input->data; uint8_t *output_data = output->data; @@ -48,13 +48,13 @@ static int csi_leaky_relu_u8(struct csi_tensor *input, size = size * input->dim[i]; } - float alpha_f = csi_dequantize_f32(1, 0, params->n_multiplier, params->n_shift); + float alpha_f = csi_dequantize_u8_to_f32(1, 0, params->n_multiplier, params->n_shift); for (int i = 0; i < size; i++) { - float input_val = csi_dequantize_f32(input_data[i], input->offset, input->multiplier, + float input_val = csi_dequantize_u8_to_f32(input_data[i], input->zero_point, input->multiplier, input->shift); float res = input_val > 0 ? 
input_val : input_val * alpha_f; - output_data[i] = csi_quantize_f32(res, output->offset, output->multiplier, output->shift); + output_data[i] = csi_quantize_f32_to_u8(res, output->zero_point, output->multiplier, output->shift); } return CSINN_TRUE; } @@ -63,11 +63,8 @@ int csi_leaky_relu_init(struct csi_tensor *input, struct csi_tensor *output, struct relu_params *params) { - if (input->dtype == CSINN_DTYPE_UINT8) { - params->bc = csi_leaky_relu_u8; - } else if (input->dtype == CSINN_DTYPE_FLOAT32) { - params->bc = csi_leaky_relu_f32; - } else { + params->bc = csi_bc_map(params->api, CSINN_OP_LEAKY_RELU, input->dtype); + if (params->bc == NULL) { return CSINN_UNSUPPORT_DTYPE; } return CSINN_TRUE; diff --git a/source/reference/less.c b/source/reference/less.c index 35bac4e1..13f0fe6f 100644 --- a/source/reference/less.c +++ b/source/reference/less.c @@ -19,7 +19,7 @@ #include "csi_nn.h" #include "csi_utils.h" -static int csi_less_f32(struct csi_tensor *input0, +int csi_less_f32(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, struct diso_params *params) @@ -38,7 +38,7 @@ static int csi_less_f32(struct csi_tensor *input0, return CSINN_TRUE; } -static int csi_less_u8(struct csi_tensor *input0, +int csi_less_u8(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, struct diso_params *params) @@ -52,36 +52,33 @@ static int csi_less_u8(struct csi_tensor *input0, } for (int i = 0; i < size; i++) { - float input0_val = csi_dequantize_f32(input0_data[i], input0->offset, input0->multiplier, + float input0_val = csi_dequantize_u8_to_f32(input0_data[i], input0->zero_point, input0->multiplier, input0->shift); - float input1_val = csi_dequantize_f32(input1_data[i], input1->offset, input1->multiplier, + float input1_val = csi_dequantize_u8_to_f32(input1_data[i], input1->zero_point, input1->multiplier, input1->shift); float res = input0_val < input1_val; - output_data[i] = csi_quantize_f32(res, output->offset, 
output->multiplier, output->shift); + output_data[i] = csi_quantize_f32_to_u8(res, output->zero_point, output->multiplier, output->shift); } return CSINN_TRUE; } int csi_less_init(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) + struct csi_tensor *input1, + struct csi_tensor *output, + struct diso_params *params) { - if (input0->dtype == CSINN_DTYPE_UINT8) { - params->bc = csi_less_u8; - } else if (input0->dtype == CSINN_DTYPE_FLOAT32) { - params->bc = csi_less_f32; - } else { + params->bc = csi_bc_map(params->api, CSINN_OP_LESS, input0->dtype); + if (params->bc == NULL) { return CSINN_UNSUPPORT_DTYPE; } return CSINN_TRUE; } int csi_less(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) + struct csi_tensor *input1, + struct csi_tensor *output, + struct diso_params *params) { if (params->bc != NULL) { params->bc(input0, input1, output, params); diff --git a/source/reference/less_equal.c b/source/reference/less_equal.c index 3717050e..80d8d773 100644 --- a/source/reference/less_equal.c +++ b/source/reference/less_equal.c @@ -19,10 +19,10 @@ #include "csi_nn.h" #include "csi_utils.h" -static int csi_less_equal_f32(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int csi_less_equal_f32(struct csi_tensor *input0, + struct csi_tensor *input1, + struct csi_tensor *output, + struct diso_params *params) { float *input0_data = input0->data; float *input1_data = input1->data; @@ -38,10 +38,10 @@ static int csi_less_equal_f32(struct csi_tensor *input0, return CSINN_TRUE; } -static int csi_less_equal_u8(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int csi_less_equal_u8(struct csi_tensor *input0, + struct csi_tensor *input1, + struct csi_tensor *output, + struct diso_params *params) { uint8_t *input0_data = 
input0->data; uint8_t *input1_data = input1->data; @@ -52,37 +52,33 @@ static int csi_less_equal_u8(struct csi_tensor *input0, } for (int i = 0; i < size; i++) { - float input0_val = csi_dequantize_f32(input0_data[i], input0->offset, input0->multiplier, + float input0_val = csi_dequantize_u8_to_f32(input0_data[i], input0->zero_point, input0->multiplier, input0->shift); - float input1_val = csi_dequantize_f32(input1_data[i], input1->offset, input1->multiplier, + float input1_val = csi_dequantize_u8_to_f32(input1_data[i], input1->zero_point, input1->multiplier, input1->shift); float res = input0_val <= input1_val; - output_data[i] = csi_quantize_f32(res, output->offset, output->multiplier, output->shift); + output_data[i] = csi_quantize_f32_to_u8(res, output->zero_point, output->multiplier, output->shift); } return CSINN_TRUE; } - int csi_less_equal_init(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) + struct csi_tensor *input1, + struct csi_tensor *output, + struct diso_params *params) { - if (input0->dtype == CSINN_DTYPE_UINT8) { - params->bc = csi_less_equal_u8; - } else if (input0->dtype == CSINN_DTYPE_FLOAT32) { - params->bc = csi_less_equal_f32; - } else { + params->bc = csi_bc_map(params->api, CSINN_OP_LESS_EQUAL, input0->dtype); + if (params->bc == NULL) { return CSINN_UNSUPPORT_DTYPE; } return CSINN_TRUE; } int csi_less_equal(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) + struct csi_tensor *input1, + struct csi_tensor *output, + struct diso_params *params) { if (params->bc != NULL) { params->bc(input0, input1, output, params); diff --git a/source/reference/log.c b/source/reference/log.c index 48f3288e..e6b9dcf2 100644 --- a/source/reference/log.c +++ b/source/reference/log.c @@ -20,9 +20,9 @@ #include "csi_utils.h" #include -static int csi_log_f32(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params 
*params) +int csi_log_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct siso_params *params) { float *input_data = input->data; float *output_data = output->data; @@ -37,9 +37,9 @@ static int csi_log_f32(struct csi_tensor *input, return CSINN_TRUE; } -static int csi_log_u8(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csi_log_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct siso_params *params) { uint8_t *input_data = input->data; uint8_t *output_data = output->data; @@ -49,11 +49,11 @@ static int csi_log_u8(struct csi_tensor *input, } for (int i = 0; i < size; i++) { - float input0_val = csi_dequantize_f32(input_data[i], input->offset, input->multiplier, + float input0_val = csi_dequantize_u8_to_f32(input_data[i], input->zero_point, input->multiplier, input->shift); float res = log(input0_val); - output_data[i] = csi_quantize_f32(res, output->offset, output->multiplier, output->shift); + output_data[i] = csi_quantize_f32_to_u8(res, output->zero_point, output->multiplier, output->shift); } return CSINN_TRUE; } @@ -62,19 +62,16 @@ int csi_log_init(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params) { - if (input->dtype == CSINN_DTYPE_UINT8) { - params->bc = csi_log_u8; - } else if (input->dtype == CSINN_DTYPE_FLOAT32) { - params->bc = csi_log_f32; - } else { + params->bc = csi_bc_map(params->api, CSINN_OP_LOG, input->dtype); + if (params->bc == NULL) { return CSINN_UNSUPPORT_DTYPE; } return CSINN_TRUE; } int csi_log(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) + struct csi_tensor *output, + struct siso_params *params) { if (params->bc != NULL) { params->bc(input, output, params); diff --git a/source/reference/log1p.c b/source/reference/log1p.c index 95f90663..c24b0f68 100644 --- a/source/reference/log1p.c +++ b/source/reference/log1p.c @@ -20,9 +20,9 @@ #include "csi_utils.h" #include -static int csi_log1p_f32(struct 
csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csi_log1p_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct siso_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -37,9 +37,9 @@ static int csi_log1p_f32(struct csi_tensor *input, return CSINN_TRUE; } -static int csi_log1p_u8(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csi_log1p_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct siso_params *params) { uint8_t *input_data = (uint8_t *)input->data; uint8_t *output_data = (uint8_t *)output->data; @@ -49,32 +49,29 @@ static int csi_log1p_u8(struct csi_tensor *input, } for (int i = 0; i < size; i++) { - float input0_val = csi_dequantize_f32(input_data[i], input->offset, input->multiplier, + float input0_val = csi_dequantize_u8_to_f32(input_data[i], input->zero_point, input->multiplier, input->shift); float res = log(1+input0_val); - output_data[i] = csi_quantize_f32(res, output->offset, output->multiplier, output->shift); + output_data[i] = csi_quantize_f32_to_u8(res, output->zero_point, output->multiplier, output->shift); } return CSINN_TRUE; } int csi_log1p_init(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) + struct csi_tensor *output, + struct siso_params *params) { - if (input->dtype == CSINN_DTYPE_UINT8) { - params->bc = csi_log1p_u8; - } else if (input->dtype == CSINN_DTYPE_FLOAT32) { - params->bc = csi_log1p_f32; - } else { + params->bc = csi_bc_map(params->api, CSINN_OP_LOG1P, input->dtype); + if (params->bc == NULL) { return CSINN_UNSUPPORT_DTYPE; } return CSINN_TRUE; } int csi_log1p(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) + struct csi_tensor *output, + struct siso_params *params) { if (params->bc != NULL) { params->bc(input, output, params); diff --git a/source/reference/log_softmax.c 
b/source/reference/log_softmax.c index 0542da6b..d3ca2e7d 100644 --- a/source/reference/log_softmax.c +++ b/source/reference/log_softmax.c @@ -19,74 +19,10 @@ #include "csi_nn.h" #include "csi_utils.h" -// logsoftmax = logits - log(reduce_sum(exp(logits), axis)) -// static int csi_log_softmax_f32(struct csi_tensor *input, -// struct csi_tensor *output, -// struct softmax_params *params) -// { -// // now only support 2D input -// assert(params->axis == 1 && input->dim_count == 2); -// float *input_data = (float *)input->data; -// float *output_data = (float *)output->data; - -// int in_size = 1, out_size = 1; -// for(int i = 0; i < input->dim_count; i++) { -// in_size *= input->dim[i]; -// } -// out_size = in_size; -// int input_outer_size = 1; -// for(int i = 0; i < params->axis; i++) { -// input_outer_size *= input->dim[i]; -// } -// int input_inner_size = 1; -// for(int i = params->axis + 1; i < input->dim_count; i++) { -// input_inner_size *= input->dim[i]; -// } -// int axis_dim = input->dim[params->axis]; - - -// struct csi_tensor *input_1 = (struct csi_tensor *)malloc(sizeof(struct csi_tensor)); -// memcpy(input_1, input, sizeof(struct csi_tensor)); -// struct csi_tensor output_1; -// struct reduce_params rparams; - -// input_1->data = (float *)malloc(in_size * sizeof(float)); -// float *input_1_data = (float *)input_1->data; -// memcpy(input_1->data, (float *)input->data, in_size * sizeof(float)); - -// for(int i = 0; i < in_size; i++) { -// input_1_data[i] = exp(input_1_data[i]); -// } - -// output_1.data = (float *)malloc(in_size / axis_dim * sizeof(float)); -// float *output_1_data = (float *)output_1.data; - -// rparams.axis_count = 1; -// rparams.axis = (int *)malloc(sizeof(int) * rparams.axis_count); -// rparams.axis[0] = params->axis; -// csi_reduce_sum_init(input_1, &output_1, &rparams); -// csi_reduce_sum(input_1, &output_1, &rparams); - -// for(int i = 0; i < input_outer_size; i++) { -// for(int j = 0; j < axis_dim; j++) { -// for(int k = 0; k < 
input_inner_size; k++) { -// int index1 = (i * axis_dim + j) * input_inner_size + k; -// int index2 = i * input_inner_size + k; -// output_data[index1] = input_data[index1] - log(output_1_data[index2]); -// } -// } -// } -// free(input_1->data); -// free(output_1.data); -// free(rparams.axis); -// return CSINN_TRUE; -// } - - /* logsoftmax = logits - log(reduce_sum(exp(logits), axis)) */ -static int csi_log_softmax_f32(struct csi_tensor *input, - struct csi_tensor *output, - struct softmax_params *params) +int csi_log_softmax_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct softmax_params *params) { // now only support 2D input assert(params->axis == 1 && input->dim_count == 2); @@ -127,9 +63,9 @@ static int csi_log_softmax_f32(struct csi_tensor *input, return CSINN_TRUE; } -static int csi_log_softmax_u8(struct csi_tensor *input, - struct csi_tensor *output, - struct softmax_params *params) +int csi_log_softmax_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct softmax_params *params) { // now only support 2D input assert(params->axis == 1 && input->dim_count == 2); @@ -157,13 +93,13 @@ static int csi_log_softmax_u8(struct csi_tensor *input, float input_temp = 0.0f; for(int j = 0; j < axis_dim; j++) { uint8_t input_val = *(input_data + j * input_inner_size + k); - input_temp = csi_dequantize_f32(input_val, input->offset, input->multiplier, input->shift); + input_temp = csi_dequantize_u8_to_f32(input_val, input->zero_point, input->multiplier, input->shift); acc += exp(input_temp); } acc = log(acc); for(int j = 0; j < axis_dim; j++) { - input_temp = csi_dequantize_f32(*(input_data + j * input_inner_size + k), input->offset, input->multiplier, input->shift); - *(output_data + j * input_inner_size + k) = csi_quantize_f32(input_temp - acc, output->offset, output->multiplier, output->shift); + input_temp = csi_dequantize_u8_to_f32(*(input_data + j * input_inner_size + k), input->zero_point, input->multiplier, input->shift); + 
*(output_data + j * input_inner_size + k) = csi_quantize_f32_to_u8(input_temp - acc, output->zero_point, output->multiplier, output->shift); } } input_data += input_inner_size * axis_dim; @@ -173,14 +109,11 @@ static int csi_log_softmax_u8(struct csi_tensor *input, } int csi_log_softmax_init(struct csi_tensor *input, - struct csi_tensor *output, - struct softmax_params *params) + struct csi_tensor *output, + struct softmax_params *params) { - if(input->dtype == CSINN_DTYPE_UINT8) { - params->bc = csi_log_softmax_u8; - } else if(input->dtype == CSINN_DTYPE_FLOAT32) { - params->bc = csi_log_softmax_f32; - } else { + params->bc = csi_bc_map(params->api, CSINN_OP_LOG_SOFTMAX, input->dtype); + if (params->bc == NULL) { return CSINN_UNSUPPORT_DTYPE; } return CSINN_TRUE; diff --git a/source/reference/logical_and.c b/source/reference/logical_and.c index d15a13de..61e55fc4 100644 --- a/source/reference/logical_and.c +++ b/source/reference/logical_and.c @@ -19,10 +19,10 @@ #include "csi_nn.h" #include "csi_utils.h" -static int csi_logical_and_f32(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int csi_logical_and_f32(struct csi_tensor *input0, + struct csi_tensor *input1, + struct csi_tensor *output, + struct diso_params *params) { float *input0_data = input0->data; float *input1_data = input1->data; @@ -38,10 +38,10 @@ static int csi_logical_and_f32(struct csi_tensor *input0, return CSINN_TRUE; } -static int csi_logical_and_u8(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int csi_logical_and_u8(struct csi_tensor *input0, + struct csi_tensor *input1, + struct csi_tensor *output, + struct diso_params *params) { uint8_t *input0_data = input0->data; uint8_t *input1_data = input1->data; @@ -52,36 +52,33 @@ static int csi_logical_and_u8(struct csi_tensor *input0, } for (int i = 0; i < size; i++) { - float input0_val = 
csi_dequantize_f32(input0_data[i], input0->offset, input0->multiplier, + float input0_val = csi_dequantize_u8_to_f32(input0_data[i], input0->zero_point, input0->multiplier, input0->shift); - float input1_val = csi_dequantize_f32(input1_data[i], input1->offset, input1->multiplier, + float input1_val = csi_dequantize_u8_to_f32(input1_data[i], input1->zero_point, input1->multiplier, input1->shift); float res = input0_val && input1_val; - output_data[i] = csi_quantize_f32(res, output->offset, output->multiplier, output->shift); + output_data[i] = csi_quantize_f32_to_u8(res, output->zero_point, output->multiplier, output->shift); } return CSINN_TRUE; } int csi_logical_and_init(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) + struct csi_tensor *input1, + struct csi_tensor *output, + struct diso_params *params) { - if (input0->dtype == CSINN_DTYPE_UINT8) { - params->bc = csi_logical_and_u8; - } else if (input0->dtype == CSINN_DTYPE_FLOAT32) { - params->bc = csi_logical_and_f32; - } else { + params->bc = csi_bc_map(params->api, CSINN_OP_LOGICAL_AND, input0->dtype); + if (params->bc == NULL) { return CSINN_UNSUPPORT_DTYPE; } return CSINN_TRUE; } int csi_logical_and(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) + struct csi_tensor *input1, + struct csi_tensor *output, + struct diso_params *params) { if (params->bc != NULL) { params->bc(input0, input1, output, params); diff --git a/source/reference/logical_not.c b/source/reference/logical_not.c index 6d74327f..3541f9fb 100644 --- a/source/reference/logical_not.c +++ b/source/reference/logical_not.c @@ -19,9 +19,9 @@ #include "csi_nn.h" #include "csi_utils.h" -static int csi_logical_not_f32(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csi_logical_not_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct siso_params *params) { float 
*input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -35,9 +35,9 @@ static int csi_logical_not_f32(struct csi_tensor *input, return CSINN_TRUE; } -static int csi_logical_not_u8(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csi_logical_not_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct siso_params *params) { uint8_t *input_data = input->data; uint8_t *output_data = output->data; @@ -47,32 +47,29 @@ static int csi_logical_not_u8(struct csi_tensor *input, } for (int i = 0; i < size; i++) { - float input_val = csi_dequantize_f32(input_data[i], input->offset, input->multiplier, + float input_val = csi_dequantize_u8_to_f32(input_data[i], input->zero_point, input->multiplier, input->shift); float res = !input_val; - output_data[i] = csi_quantize_f32(res, output->offset, output->multiplier, output->shift); + output_data[i] = csi_quantize_f32_to_u8(res, output->zero_point, output->multiplier, output->shift); } return CSINN_TRUE; } int csi_logical_not_init(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) + struct csi_tensor *output, + struct siso_params *params) { - if (input->dtype == CSINN_DTYPE_UINT8) { - params->bc = csi_logical_not_u8; - } else if (input->dtype == CSINN_DTYPE_FLOAT32) { - params->bc = csi_logical_not_f32; - } else { + params->bc = csi_bc_map(params->api, CSINN_OP_LOGICAL_NOT, input->dtype); + if (params->bc == NULL) { return CSINN_UNSUPPORT_DTYPE; } return CSINN_TRUE; } int csi_logical_not(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) + struct csi_tensor *output, + struct siso_params *params) { if (params->bc != NULL) { params->bc(input, output, params); diff --git a/source/reference/logical_or.c b/source/reference/logical_or.c index f0648a44..a1cb77e7 100644 --- a/source/reference/logical_or.c +++ b/source/reference/logical_or.c @@ -19,10 +19,10 @@ #include "csi_nn.h" #include 
"csi_utils.h" -static int csi_logical_or_f32(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int csi_logical_or_f32(struct csi_tensor *input0, + struct csi_tensor *input1, + struct csi_tensor *output, + struct diso_params *params) { float *input0_data = input0->data; float *input1_data = input1->data; @@ -38,10 +38,10 @@ static int csi_logical_or_f32(struct csi_tensor *input0, return CSINN_TRUE; } -static int csi_logical_or_u8(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int csi_logical_or_u8(struct csi_tensor *input0, + struct csi_tensor *input1, + struct csi_tensor *output, + struct diso_params *params) { uint8_t *input0_data = input0->data; uint8_t *input1_data = input1->data; @@ -52,36 +52,33 @@ static int csi_logical_or_u8(struct csi_tensor *input0, } for (int i = 0; i < size; i++) { - float input0_val = csi_dequantize_f32(input0_data[i], input0->offset, input0->multiplier, + float input0_val = csi_dequantize_u8_to_f32(input0_data[i], input0->zero_point, input0->multiplier, input0->shift); - float input1_val = csi_dequantize_f32(input1_data[i], input1->offset, input1->multiplier, + float input1_val = csi_dequantize_u8_to_f32(input1_data[i], input1->zero_point, input1->multiplier, input1->shift); float res = input0_val || input1_val; - output_data[i] = csi_quantize_f32(res, output->offset, output->multiplier, output->shift); + output_data[i] = csi_quantize_f32_to_u8(res, output->zero_point, output->multiplier, output->shift); } return CSINN_TRUE; } int csi_logical_or_init(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) + struct csi_tensor *input1, + struct csi_tensor *output, + struct diso_params *params) { - if (input0->dtype == CSINN_DTYPE_UINT8) { - params->bc = csi_logical_or_u8; - } else if (input0->dtype == CSINN_DTYPE_FLOAT32) { - params->bc = 
csi_logical_or_f32; - } else { + params->bc = csi_bc_map(params->api, CSINN_OP_LOGICAL_OR, input0->dtype); + if (params->bc == NULL) { return CSINN_UNSUPPORT_DTYPE; } return CSINN_TRUE; } int csi_logical_or(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) + struct csi_tensor *input1, + struct csi_tensor *output, + struct diso_params *params) { if (params->bc != NULL) { params->bc(input0, input1, output, params); diff --git a/source/reference/logical_xor.c b/source/reference/logical_xor.c index 280eefbc..4f6ec7bb 100644 --- a/source/reference/logical_xor.c +++ b/source/reference/logical_xor.c @@ -19,10 +19,10 @@ #include "csi_nn.h" #include "csi_utils.h" -static int csi_logical_xor_f32(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int csi_logical_xor_f32(struct csi_tensor *input0, + struct csi_tensor *input1, + struct csi_tensor *output, + struct diso_params *params) { float *input0_data = (float *)input0->data; float *input1_data = (float *)input1->data; @@ -38,10 +38,10 @@ static int csi_logical_xor_f32(struct csi_tensor *input0, return CSINN_TRUE; } -static int csi_logical_xor_u8(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int csi_logical_xor_u8(struct csi_tensor *input0, + struct csi_tensor *input1, + struct csi_tensor *output, + struct diso_params *params) { uint8_t *input0_data = input0->data; uint8_t *input1_data = input1->data; @@ -52,27 +52,24 @@ static int csi_logical_xor_u8(struct csi_tensor *input0, } for (int i = 0; i < size; i++) { - float input0_val = csi_dequantize_f32(input0_data[i], input0->offset, input0->multiplier, + float input0_val = csi_dequantize_u8_to_f32(input0_data[i], input0->zero_point, input0->multiplier, input0->shift); - float input1_val = csi_dequantize_f32(input1_data[i], input1->offset, input1->multiplier, + float input1_val = 
csi_dequantize_u8_to_f32(input1_data[i], input1->zero_point, input1->multiplier, input1->shift); float res = (int)input0_val ^ (int)input1_val; - output_data[i] = csi_quantize_f32(res, output->offset, output->multiplier, output->shift); + output_data[i] = csi_quantize_f32_to_u8(res, output->zero_point, output->multiplier, output->shift); } return CSINN_TRUE; } int csi_logical_xor_init(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) + struct csi_tensor *input1, + struct csi_tensor *output, + struct diso_params *params) { - if (input0->dtype == CSINN_DTYPE_UINT8) { - params->bc = csi_logical_xor_u8; - } else if (input0->dtype == CSINN_DTYPE_FLOAT32) { - params->bc = csi_logical_xor_f32; - } else { + params->bc = csi_bc_map(params->api, CSINN_OP_LOGICAL_XOR, input0->dtype); + if (params->bc == NULL) { return CSINN_UNSUPPORT_DTYPE; } return CSINN_TRUE; diff --git a/source/reference/lrn.c b/source/reference/lrn.c index c52d90b7..2e1bc2d5 100644 --- a/source/reference/lrn.c +++ b/source/reference/lrn.c @@ -77,13 +77,13 @@ static int csi_lrn_nhwc_u8(struct csi_tensor *input, float_output.dtype = CSINN_DTYPE_FLOAT32; for (int i = 0; i < size; i++) { - float_input_data[i] = csi_dequantize_f32(input_data[i], input->offset, + float_input_data[i] = csi_dequantize_u8_to_f32(input_data[i], input->zero_point, input->multiplier, input->shift); } - bias_f = csi_dequantize_f32(1, 0, params->bias_multiplier, params->bias_shift); - alpha_f = csi_dequantize_f32(1, 0, params->alpha_multiplier, params->alpha_shift); - beta_f = csi_dequantize_f32(1, 0, params->beta_multiplier, params->beta_shift); + bias_f = csi_dequantize_u8_to_f32(1, 0, params->bias_multiplier, params->bias_shift); + alpha_f = csi_dequantize_u8_to_f32(1, 0, params->alpha_multiplier, params->alpha_shift); + beta_f = csi_dequantize_u8_to_f32(1, 0, params->beta_multiplier, params->beta_shift); params->bias = bias_f; params->alpha = alpha_f; @@ -92,7 +92,7 
@@ static int csi_lrn_nhwc_u8(struct csi_tensor *input, csi_lrn_nhwc_f32(&float_input, &float_output, params); for (int i = 0; i < size; i++) { - output_data[i] = csi_quantize_f32(float_output_data[i], output->offset, + output_data[i] = csi_quantize_f32_to_u8(float_output_data[i], output->zero_point, output->multiplier, output->shift); } free(float_input_data); @@ -164,13 +164,13 @@ static int csi_lrn_nchw_u8(struct csi_tensor *input, float_output.dtype = CSINN_DTYPE_FLOAT32; for (int i = 0; i < size; i++) { - float_input_data[i] = csi_dequantize_f32(input_data[i], input->offset, + float_input_data[i] = csi_dequantize_u8_to_f32(input_data[i], input->zero_point, input->multiplier, input->shift); } - bias_f = csi_dequantize_f32(1, 0, params->bias_multiplier, params->bias_shift); - alpha_f = csi_dequantize_f32(1, 0, params->alpha_multiplier, params->alpha_shift); - beta_f = csi_dequantize_f32(1, 0, params->beta_multiplier, params->beta_shift); + bias_f = csi_dequantize_u8_to_f32(1, 0, params->bias_multiplier, params->bias_shift); + alpha_f = csi_dequantize_u8_to_f32(1, 0, params->alpha_multiplier, params->alpha_shift); + beta_f = csi_dequantize_u8_to_f32(1, 0, params->beta_multiplier, params->beta_shift); params->bias = bias_f; params->alpha = alpha_f; @@ -179,7 +179,7 @@ static int csi_lrn_nchw_u8(struct csi_tensor *input, csi_lrn_nchw_f32(&float_input, &float_output, params); for (int i = 0; i < size; i++) { - output_data[i] = csi_quantize_f32(float_output_data[i], output->offset, + output_data[i] = csi_quantize_f32_to_u8(float_output_data[i], output->zero_point, output->multiplier, output->shift); } free(float_input_data); @@ -187,30 +187,39 @@ static int csi_lrn_nchw_u8(struct csi_tensor *input, return CSINN_TRUE; } +int csi_lrn_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct lrn_params *params) +{ + if (params->layout == CSINN_NCHW) { + return csi_lrn_nchw_f32(input, output, params); + } else if (params->layout == CSINN_NHWC) { +
return csi_lrn_nhwc_f32(input, output, params); + } else { + return CSINN_UNSUPPORT_LAYOUT; + } +} +int csi_lrn_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct lrn_params *params) +{ + if (params->layout == CSINN_NCHW) { + return csi_lrn_nchw_u8(input, output, params); + } else if (params->layout == CSINN_NHWC) { + return csi_lrn_nhwc_u8(input, output, params); + } else { + return CSINN_UNSUPPORT_LAYOUT; + } +} int csi_lrn_init(struct csi_tensor *input, struct csi_tensor *output, struct lrn_params *params) { - if (params->layout == CSINN_NCHW) { - if (input->dtype == CSINN_DTYPE_UINT8) { - params->bc = csi_lrn_nchw_u8; - } else if (input->dtype == CSINN_DTYPE_FLOAT32) { - params->bc = csi_lrn_nchw_f32; - } else { - return CSINN_UNSUPPORT_DTYPE; - } - } else if (params->layout = CSINN_NHWC) { - if (input->dtype == CSINN_DTYPE_UINT8) { - params->bc = csi_lrn_nhwc_u8; - } else if (input->dtype == CSINN_DTYPE_FLOAT32) { - params->bc = csi_lrn_nhwc_f32; - } else { - return CSINN_UNSUPPORT_DTYPE; - } - } else { - return CSINN_UNSUPPORT_LAYOUT; + params->bc = csi_bc_map(params->api, CSINN_OP_LRN, input->dtype); + if (params->bc == NULL) { + return CSINN_UNSUPPORT_DTYPE; } return CSINN_TRUE; } diff --git a/source/reference/matmul.c b/source/reference/matmul.c index 8f99630d..f0c1e880 100644 --- a/source/reference/matmul.c +++ b/source/reference/matmul.c @@ -19,10 +19,10 @@ #include "csi_nn.h" #include "csi_utils.h" -static int csi_matmul_f32(struct csi_tensor *mat0, - struct csi_tensor *mat1, - struct csi_tensor *output, - struct matmul_params *params) +int csi_matmul_f32(struct csi_tensor *mat0, + struct csi_tensor *mat1, + struct csi_tensor *output, + struct matmul_params *params) { float *mat0_data = mat0->data; float *mat1_data = mat1->data; @@ -103,10 +103,10 @@ static int csi_matmul_f32(struct csi_tensor *mat0, return CSINN_TRUE; } -static int csi_matmul_u8(struct csi_tensor *mat0, - struct csi_tensor *mat1, - struct csi_tensor *output, - struct matmul_params *params)
+int csi_matmul_u8(struct csi_tensor *mat0, + struct csi_tensor *mat1, + struct csi_tensor *output, + struct matmul_params *params) { uint8_t *mat0_data = mat0->data; uint8_t *mat1_data = mat1->data; @@ -134,14 +134,14 @@ static int csi_matmul_u8(struct csi_tensor *mat0, for (int k = 0; k < dim_k; ++k) { int offset0 = mat0_offset * b + i * dim_k + k; int offset1 = mat1_offset * b + k * dim_j + j; - float input_val0 = csi_dequantize_f32(mat0_data[offset0], mat0->offset, + float input_val0 = csi_dequantize_u8_to_f32(mat0_data[offset0], mat0->zero_point, mat0->multiplier, mat0->shift); - float input_val1 = csi_dequantize_f32(mat1_data[offset1], mat1->offset, + float input_val1 = csi_dequantize_u8_to_f32(mat1_data[offset1], mat1->zero_point, mat1->multiplier, mat1->shift); total += input_val0 * input_val1; } - output_data[b * out_offset + i * dim_j + j] = csi_quantize_f32(total, - output->offset, output->multiplier, output->shift); + output_data[b * out_offset + i * dim_j + j] = csi_quantize_f32_to_u8(total, + output->zero_point, output->multiplier, output->shift); } } } @@ -153,14 +153,14 @@ static int csi_matmul_u8(struct csi_tensor *mat0, for (int k = 0; k < dim_k; ++k) { int offset0 = mat0_offset * b + i * dim_k + k; int offset1 = mat1_offset * b + j * dim_k + k; - float input_val0 = csi_dequantize_f32(mat0_data[offset0], mat0->offset, + float input_val0 = csi_dequantize_u8_to_f32(mat0_data[offset0], mat0->zero_point, mat0->multiplier, mat0->shift); - float input_val1 = csi_dequantize_f32(mat1_data[offset1], mat1->offset, + float input_val1 = csi_dequantize_u8_to_f32(mat1_data[offset1], mat1->zero_point, mat1->multiplier, mat1->shift); total += input_val0 * input_val1; } - output_data[b * out_offset + i * dim_j + j] = csi_quantize_f32(total, - output->offset, output->multiplier, output->shift); + output_data[b * out_offset + i * dim_j + j] = csi_quantize_f32_to_u8(total, + output->zero_point, output->multiplier, output->shift); } } } @@ -172,14 +172,14 @@ static 
int csi_matmul_u8(struct csi_tensor *mat0, for (int k = 0; k < dim_k; ++k) { int offset0 = mat0_offset * b + k * dim_i + i; int offset1 = mat1_offset * b + k * dim_j + j; - float input_val0 = csi_dequantize_f32(mat0_data[offset0], mat0->offset, + float input_val0 = csi_dequantize_u8_to_f32(mat0_data[offset0], mat0->zero_point, mat0->multiplier, mat0->shift); - float input_val1 = csi_dequantize_f32(mat1_data[offset1], mat1->offset, + float input_val1 = csi_dequantize_u8_to_f32(mat1_data[offset1], mat1->zero_point, mat1->multiplier, mat1->shift); total += input_val0 * input_val1; } - output_data[b * out_offset + i * dim_j + j] = csi_quantize_f32(total, - output->offset, output->multiplier, output->shift); + output_data[b * out_offset + i * dim_j + j] = csi_quantize_f32_to_u8(total, + output->zero_point, output->multiplier, output->shift); } } } @@ -191,14 +191,14 @@ static int csi_matmul_u8(struct csi_tensor *mat0, for (int k = 0; k < dim_k; ++k) { int offset0 = mat0_offset * b + k * dim_i + i; int offset1 = mat1_offset * b + j * dim_k + k; - float input_val0 = csi_dequantize_f32(mat0_data[offset0], mat0->offset, + float input_val0 = csi_dequantize_u8_to_f32(mat0_data[offset0], mat0->zero_point, mat0->multiplier, mat0->shift); - float input_val1 = csi_dequantize_f32(mat1_data[offset1], mat1->offset, + float input_val1 = csi_dequantize_u8_to_f32(mat1_data[offset1], mat1->zero_point, mat1->multiplier, mat1->shift); total += input_val0 * input_val1; } - output_data[b * out_offset + i * dim_j + j] = csi_quantize_f32(total, - output->offset, output->multiplier, output->shift); + output_data[b * out_offset + i * dim_j + j] = csi_quantize_f32_to_u8(total, + output->zero_point, output->multiplier, output->shift); } } } @@ -212,11 +212,8 @@ int csi_matmul_init(struct csi_tensor *mat0, struct csi_tensor *output, struct matmul_params *params) { - if (mat0->dtype == CSINN_DTYPE_UINT8) { - params->bc = csi_matmul_u8; - } else if (mat0->dtype == CSINN_DTYPE_FLOAT32) { - params->bc 
= csi_matmul_f32; - } else { + params->bc = csi_bc_map(params->api, CSINN_OP_MATMUL, mat0->dtype); + if (params->bc == NULL) { return CSINN_UNSUPPORT_DTYPE; } return CSINN_TRUE; diff --git a/source/reference/max.c b/source/reference/max.c index 8e193bde..e057a04e 100644 --- a/source/reference/max.c +++ b/source/reference/max.c @@ -19,9 +19,9 @@ #include "csi_nn.h" #include "csi_utils.h" -static int csi_max_stride_f32(struct csi_tensor *input, - struct csi_tensor *output, - struct reduce_params *params) +int csi_max_stride_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct reduce_params *params) { float *input_data = input->data; @@ -58,9 +58,9 @@ static int csi_max_stride_f32(struct csi_tensor *input, return CSINN_TRUE; } -static int csi_max_stride_u8(struct csi_tensor *input, - struct csi_tensor *output, - struct reduce_params *params) +int csi_max_stride_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct reduce_params *params) { uint8_t *input_data = input->data; @@ -88,12 +88,12 @@ static int csi_max_stride_u8(struct csi_tensor *input, { int32_t index = out_index + get_reduction_index(inner, params->inner_strides, params->inner_extents, params->m); - float val = csi_dequantize_f32(input_data[index], input->offset, + float val = csi_dequantize_u8_to_f32(input_data[index], input->zero_point, input->multiplier, input->shift); result = fmax(result, val); } - output_data[out] = csi_quantize_f32(result, output->offset, + output_data[out] = csi_quantize_f32_to_u8(result, output->zero_point, output->multiplier, output->shift); } return CSINN_TRUE; @@ -106,11 +106,8 @@ int csi_max_init(struct csi_tensor *input, if (params->n == 0 && params->m == 0) { return CSINN_FALSE; } else { - if (input->dtype == CSINN_DTYPE_UINT8) { - params->bc = csi_max_stride_u8; - } else if (input->dtype == CSINN_DTYPE_FLOAT32) { - params->bc = csi_max_stride_f32; - } else { + params->bc = csi_bc_map(params->api, CSINN_OP_MAX, input->dtype); + if (params->bc == 
NULL) { return CSINN_UNSUPPORT_DTYPE; } } diff --git a/source/reference/maximum.c b/source/reference/maximum.c index d1ce8711..9d632564 100644 --- a/source/reference/maximum.c +++ b/source/reference/maximum.c @@ -19,10 +19,10 @@ #include "csi_nn.h" #include "csi_utils.h" -static int csi_maximum_f32(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int csi_maximum_f32(struct csi_tensor *input0, + struct csi_tensor *input1, + struct csi_tensor *output, + struct diso_params *params) { float *input0_data = input0->data; float *input1_data = input1->data; @@ -38,10 +38,10 @@ static int csi_maximum_f32(struct csi_tensor *input0, return CSINN_TRUE; } -static int csi_maximum_u8(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int csi_maximum_u8(struct csi_tensor *input0, + struct csi_tensor *input1, + struct csi_tensor *output, + struct diso_params *params) { uint8_t *input0_data = input0->data; uint8_t *input1_data = input1->data; @@ -52,13 +52,13 @@ static int csi_maximum_u8(struct csi_tensor *input0, } for (int i = 0; i < size; i++) { - float input0_val = csi_dequantize_f32(input0_data[i], input0->offset, input0->multiplier, + float input0_val = csi_dequantize_u8_to_f32(input0_data[i], input0->zero_point, input0->multiplier, input0->shift); - float input1_val = csi_dequantize_f32(input1_data[i], input1->offset, input1->multiplier, + float input1_val = csi_dequantize_u8_to_f32(input1_data[i], input1->zero_point, input1->multiplier, input1->shift); float res = fmax(input0_val, input1_val); - output_data[i] = csi_quantize_f32(res, output->offset, output->multiplier, output->shift); + output_data[i] = csi_quantize_f32_to_u8(res, output->zero_point, output->multiplier, output->shift); } return CSINN_TRUE; } @@ -68,11 +68,8 @@ int csi_maximum_init(struct csi_tensor *input0, struct csi_tensor *output, struct diso_params *params) { - if 
(input0->dtype == CSINN_DTYPE_UINT8) { - params->bc = csi_maximum_u8; - } else if (input0->dtype == CSINN_DTYPE_FLOAT32) { - params->bc = csi_maximum_f32; - } else { + params->bc = csi_bc_map(params->api, CSINN_OP_MAXINUM, input0->dtype); + if (params->bc == NULL) { return CSINN_UNSUPPORT_DTYPE; } return CSINN_TRUE; diff --git a/source/reference/maxpool.c b/source/reference/maxpool.c index 395bf6e9..2074c5b8 100644 --- a/source/reference/maxpool.c +++ b/source/reference/maxpool.c @@ -20,8 +20,8 @@ #include "csi_utils.h" static int csi_maxpool_nhwc_f32(struct csi_tensor *input, - struct csi_tensor *output, - struct pool_params *params) + struct csi_tensor *output, + struct pool_params *params) { float *input_data = input->data; float *output_data = output->data; @@ -65,8 +65,8 @@ static int csi_maxpool_nhwc_f32(struct csi_tensor *input, } static int csi_maxpool_nhwc_u8(struct csi_tensor *input, - struct csi_tensor *output, - struct pool_params *params) + struct csi_tensor *output, + struct pool_params *params) { uint8_t *input_data = input->data; uint8_t *output_data = output->data; @@ -77,10 +77,10 @@ static int csi_maxpool_nhwc_u8(struct csi_tensor *input, const int output_height = output->dim[1]; const int output_width = output->dim[2]; - const int32_t input_offset = input->offset; + const int32_t input_offset = input->zero_point; const int32_t input_multiplier = input->multiplier; const int32_t input_shift = input->shift; - const int32_t output_offset = output->offset; + const int32_t output_offset = output->zero_point; const int32_t output_multiplier = output->multiplier; const int32_t output_shift = output->shift; @@ -122,8 +122,8 @@ static int csi_maxpool_nhwc_u8(struct csi_tensor *input, } static int csi_maxpool_nchw_f32(struct csi_tensor *input, - struct csi_tensor *output, - struct pool_params *params) + struct csi_tensor *output, + struct pool_params *params) { float *input_data = input->data; float *output_data = output->data; @@ -167,13 +167,13 @@ 
static int csi_maxpool_nchw_f32(struct csi_tensor *input, } static int csi_maxpool_nchw_u8(struct csi_tensor *o_input, - struct csi_tensor *o_output, - struct pool_params *params) + struct csi_tensor *o_output, + struct pool_params *params) { struct csi_tensor* input; struct csi_tensor* output; - input = csi_nchw_to_nhwc_u8(o_input); - output = csi_nchw_to_nhwc_u8(o_output); + input = csi_nchw_to_nhwc_8(o_input); + output = csi_nchw_to_nhwc_8(o_output); uint8_t *input_data = input->data; uint8_t *output_data = output->data; @@ -184,10 +184,10 @@ static int csi_maxpool_nchw_u8(struct csi_tensor *o_input, const int output_height = output->dim[1]; const int output_width = output->dim[2]; - const int32_t input_offset = input->offset; + const int32_t input_offset = input->zero_point; const int32_t input_multiplier = input->multiplier; const int32_t input_shift = input->shift; - const int32_t output_offset = output->offset; + const int32_t output_offset = output->zero_point; const int32_t output_multiplier = output->multiplier; const int32_t output_shift = output->shift; @@ -226,36 +226,46 @@ static int csi_maxpool_nchw_u8(struct csi_tensor *o_input, } } } - csi_nhwc_to_nchw_u8(o_output, output); + csi_nhwc_to_nchw_8(o_output, output); free(input->data); free(input); return CSINN_TRUE; } +int csi_maxpool_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct pool_params *params) +{ + if (params->layout == CSINN_NCHW) { + csi_maxpool_nchw_f32(input, output, params); + } else if (params->layout == CSINN_NHWC) { + csi_maxpool_nhwc_f32(input, output, params); + } else { + return CSINN_UNSUPPORT_LAYOUT; + } +} -int csi_maxpool_init(struct csi_tensor *input, - struct csi_tensor *output, - struct pool_params *params) +int csi_maxpool_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct pool_params *params) { if (params->layout == CSINN_NCHW) { - if (input->dtype == CSINN_DTYPE_UINT8) { - params->bc = csi_maxpool_nchw_u8; - } else if (input->dtype == 
CSINN_DTYPE_FLOAT32) { - params->bc = csi_maxpool_nchw_f32; - } else { - return CSINN_UNSUPPORT_DTYPE; - } - } else if (params->layout = CSINN_NHWC) { - if (input->dtype == CSINN_DTYPE_UINT8) { - params->bc = csi_maxpool_nhwc_u8; - } else if (input->dtype == CSINN_DTYPE_FLOAT32) { - params->bc = csi_maxpool_nhwc_f32; - } else { - return CSINN_UNSUPPORT_DTYPE; - } + csi_maxpool_nchw_u8(input, output, params); + } else if (params->layout == CSINN_NHWC) { + csi_maxpool_nhwc_u8(input, output, params); } else { return CSINN_UNSUPPORT_LAYOUT; } +} + +int csi_maxpool_init(struct csi_tensor *input, + struct csi_tensor *output, + struct pool_params *params) +{ + params->bc = csi_bc_map(params->api, CSINN_OP_MAXPOOL2D, input->dtype); + if (params->bc == NULL) { + return CSINN_UNSUPPORT_DTYPE; + } return CSINN_TRUE; } diff --git a/source/reference/maxpool2d_locat.c b/source/reference/maxpool2d_locat.c index 4a0015a1..7367d7d3 100644 --- a/source/reference/maxpool2d_locat.c +++ b/source/reference/maxpool2d_locat.c @@ -20,8 +20,8 @@ #include "csi_utils.h" static int csi_maxpool2d_locat_nhwc_f32(struct csi_tensor *input, - struct csi_tensor *output, - struct pool_params *params) + struct csi_tensor *output, + struct pool_params *params) { float *input_data = input->data; float *output_data = output->data; @@ -68,8 +68,8 @@ static int csi_maxpool2d_locat_nhwc_f32(struct csi_tensor *input, } static int csi_maxpool2d_locat_nhwc_i32_u8(struct csi_tensor *input, - struct csi_tensor *output, - struct pool_params *params) + struct csi_tensor *output, + struct pool_params *params) { uint8_t *input_data = input->data; int32_t *output_data = output->data; @@ -116,8 +116,8 @@ static int csi_maxpool2d_locat_nhwc_i32_u8(struct csi_tensor *input, } static int csi_maxpool2d_locat_nchw_f32(struct csi_tensor *input, - struct csi_tensor *output, - struct pool_params *params) + struct csi_tensor *output, + struct pool_params *params) { float *input_data = input->data; float *output_data = 
output->data; @@ -164,8 +164,8 @@ static int csi_maxpool2d_locat_nchw_f32(struct csi_tensor *input, } static int csi_maxpool2d_locat_nchw_i32_u8(struct csi_tensor *input, - struct csi_tensor *output, - struct pool_params *params) + struct csi_tensor *output, + struct pool_params *params) { uint8_t *input_data = input->data; int32_t *output_data = output->data; @@ -211,29 +211,40 @@ static int csi_maxpool2d_locat_nchw_i32_u8(struct csi_tensor *input, return CSINN_TRUE; } -int csi_maxpool2d_locat_init(struct csi_tensor *input, - struct csi_tensor *output, - struct pool_params *params) +int csi_maxpool2d_locat_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct pool_params *params) { if (params->layout == CSINN_NCHW) { - if (input->dtype == CSINN_DTYPE_UINT8) { - params->bc = csi_maxpool2d_locat_nchw_i32_u8; - } else if (input->dtype == CSINN_DTYPE_FLOAT32) { - params->bc = csi_maxpool2d_locat_nchw_f32; - } else { - return CSINN_UNSUPPORT_DTYPE; - } - } else if (params->layout = CSINN_NHWC) { - if (input->dtype == CSINN_DTYPE_UINT8) { - params->bc = csi_maxpool2d_locat_nhwc_i32_u8; - } else if (input->dtype == CSINN_DTYPE_FLOAT32) { - params->bc = csi_maxpool2d_locat_nhwc_f32; - } else { - return CSINN_UNSUPPORT_DTYPE; - } + csi_maxpool2d_locat_nchw_f32(input, output, params); + } else if (params->layout == CSINN_NHWC) { + csi_maxpool2d_locat_nhwc_f32(input, output, params); } else { return CSINN_UNSUPPORT_LAYOUT; } +} + +int csi_maxpool2d_locat_i32_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct pool_params *params) +{ + if (params->layout == CSINN_NCHW) { + csi_maxpool2d_locat_nchw_i32_u8(input, output, params); + } else if (params->layout == CSINN_NHWC) { + csi_maxpool2d_locat_nhwc_i32_u8(input, output, params); + } else { + return CSINN_UNSUPPORT_LAYOUT; + } +} + +int csi_maxpool2d_locat_init(struct csi_tensor *input, + struct csi_tensor *output, + struct pool_params *params) +{ + params->bc = csi_bc_map(params->api, 
CSINN_OP_MAXPOOL2D_LOCAT, input->dtype); + if (params->bc == NULL) { + return CSINN_UNSUPPORT_DTYPE; + } return CSINN_TRUE; } diff --git a/source/reference/maxpool3d.c b/source/reference/maxpool3d.c index 695ed3dd..2562669d 100644 --- a/source/reference/maxpool3d.c +++ b/source/reference/maxpool3d.c @@ -19,7 +19,7 @@ #include "csi_nn.h" #include "csi_utils.h" -static int csi_maxpool3d_ncdhw_f32(struct csi_tensor *input, +int csi_maxpool3d_f32(struct csi_tensor *input, struct csi_tensor *output, struct pool_params *params) { @@ -53,7 +53,7 @@ static int csi_maxpool3d_ncdhw_f32(struct csi_tensor *input, const int filter_w_begin = csi_max_internal_s32(0, -in_w_origin); const int filter_w_end = csi_min_internal_s32(params->filter_width, in_width - in_w_origin); - float max = -1000; + float max = -FLT_MAX; int filter_cnt = 0; for(int filter_d=filter_d_begin; filter_ddata; uint8_t *output_data = (uint8_t *)output->data; @@ -94,10 +94,10 @@ static int csi_maxpool3d_ncdhw_u8(struct csi_tensor *input, const int out_height = output->dim[3]; const int out_width = output->dim[4]; - const int32_t input_offset = input->offset; + const int32_t input_offset = input->zero_point; const int32_t input_multiplier = input->multiplier; const int32_t input_shift = input->shift; - const int32_t output_offset = output->offset; + const int32_t output_offset = output->zero_point; const int32_t output_multiplier = output->multiplier; const int32_t output_shift = output->shift; @@ -146,45 +146,13 @@ static int csi_maxpool3d_ncdhw_u8(struct csi_tensor *input, return CSINN_TRUE; } - -static int csi_maxpool3d_ndhwc_f32(struct csi_tensor *input, - struct csi_tensor *output, - struct pool_params *params) -{ - float *input_data = (float *)input->data; - float *output_data = (float *)output->data; - - return CSINN_FALSE; -} - -static int csi_maxpool3d_ndhwc_u8(struct csi_tensor *input, - struct csi_tensor *output, - struct pool_params *params) -{ - uint8_t *input_data = (uint8_t *)input->data; - 
uint8_t *output_data = (uint8_t *)output->data; - - return CSINN_FALSE; -} - int csi_maxpool3d_init(struct csi_tensor *input, - struct csi_tensor *output, - struct pool_params *params) + struct csi_tensor *output, + struct pool_params *params) { if(params->layout == CSINN_NCDHW) { - if(input->dtype == CSINN_DTYPE_UINT8) { - params->bc = csi_maxpool3d_ncdhw_u8; - } else if(input->dtype == CSINN_DTYPE_FLOAT32) { - params->bc = csi_maxpool3d_ncdhw_f32; - } else { - return CSINN_UNSUPPORT_DTYPE; - } - } else if(params->layout == CSINN_NDHWC) { - if(input->dtype == CSINN_DTYPE_UINT8) { - params->bc = csi_maxpool3d_ndhwc_u8; - } else if(input->dtype == CSINN_DTYPE_FLOAT32) { - params->bc = csi_maxpool3d_ndhwc_f32; - } else { + params->bc = csi_bc_map(params->api, CSINN_OP_MAXPOOL3D, input->dtype); + if (params->bc == NULL) { return CSINN_UNSUPPORT_DTYPE; } } else { diff --git a/source/reference/mean.c b/source/reference/mean.c index 609b5880..37cf8d03 100644 --- a/source/reference/mean.c +++ b/source/reference/mean.c @@ -19,9 +19,9 @@ #include "csi_nn.h" #include "csi_utils.h" -static int csi_mean_stride_f32(struct csi_tensor *input, - struct csi_tensor *output, - struct reduce_params *params) +int csi_mean_stride_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct reduce_params *params) { float *input_data = input->data; @@ -58,9 +58,9 @@ static int csi_mean_stride_f32(struct csi_tensor *input, return CSINN_TRUE; } -static int csi_mean_stride_u8(struct csi_tensor *input, - struct csi_tensor *output, - struct reduce_params *params) +int csi_mean_stride_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct reduce_params *params) { uint8_t *input_data = input->data; @@ -88,20 +88,20 @@ static int csi_mean_stride_u8(struct csi_tensor *input, { int32_t index = out_index + get_reduction_index(inner, params->inner_strides, params->inner_extents, params->m); - float val = csi_dequantize_f32(input_data[index], input->offset, + float val = 
csi_dequantize_u8_to_f32(input_data[index], input->zero_point, input->multiplier, input->shift); result += val; } - output_data[out] = csi_quantize_f32(result / inner_size, output->offset, + output_data[out] = csi_quantize_f32_to_u8(result / inner_size, output->zero_point, output->multiplier, output->shift); } return CSINN_TRUE; } static int csi_mean_u8(struct csi_tensor *input, - struct csi_tensor *output, - struct reduce_params *params) + struct csi_tensor *output, + struct reduce_params *params) { if (params->axis_count != 2 || params->axis[0] != 2 || params->axis[1] != 3 || input->dim_count != 4 || output->dim_count != 4) { @@ -119,21 +119,13 @@ int csi_mean_init(struct csi_tensor *input, struct reduce_params *params) { if (params->n == 0 && params->m == 0) { - if (params->layout == CSINN_NCHW) { - if (input->dtype == CSINN_DTYPE_UINT8) { - params->bc = csi_mean_u8; - } else { - return CSINN_UNSUPPORT_DTYPE; - } - } else { - return CSINN_UNSUPPORT_LAYOUT; + params->bc = csi_bc_map(params->api, CSINN_OP_MEAN, input->dtype); + if (params->bc == NULL) { + return CSINN_UNSUPPORT_DTYPE; } } else { - if (input->dtype == CSINN_DTYPE_UINT8) { - params->bc = csi_mean_stride_u8; - } else if (input->dtype == CSINN_DTYPE_FLOAT32) { - params->bc = csi_mean_stride_f32; - } else { + params->bc = csi_bc_map(params->api, CSINN_OP_MEAN_STRIDE, input->dtype); + if (params->bc == NULL) { return CSINN_UNSUPPORT_DTYPE; } } diff --git a/source/reference/min.c b/source/reference/min.c index 240af971..d2431a51 100644 --- a/source/reference/min.c +++ b/source/reference/min.c @@ -19,9 +19,9 @@ #include "csi_nn.h" #include "csi_utils.h" -static int csi_min_stride_f32(struct csi_tensor *input, - struct csi_tensor *output, - struct reduce_params *params) +int csi_min_stride_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct reduce_params *params) { float *input_data = input->data; @@ -58,9 +58,9 @@ static int csi_min_stride_f32(struct csi_tensor *input, return CSINN_TRUE; } 
-static int csi_min_stride_u8(struct csi_tensor *input, - struct csi_tensor *output, - struct reduce_params *params) +int csi_min_stride_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct reduce_params *params) { uint8_t *input_data = input->data; @@ -88,38 +88,35 @@ static int csi_min_stride_u8(struct csi_tensor *input, { int32_t index = out_index + get_reduction_index(inner, params->inner_strides, params->inner_extents, params->m); - float val = csi_dequantize_f32(input_data[index], input->offset, + float val = csi_dequantize_u8_to_f32(input_data[index], input->zero_point, input->multiplier, input->shift); result = fmin(result, val); } - output_data[out] = csi_quantize_f32(result, output->offset, + output_data[out] = csi_quantize_f32_to_u8(result, output->zero_point, output->multiplier, output->shift); } return CSINN_TRUE; } int csi_min_init(struct csi_tensor *input, - struct csi_tensor *output, - struct reduce_params *params) + struct csi_tensor *output, + struct reduce_params *params) { if (params->n == 0 && params->m == 0) { return CSINN_FALSE; } else { - if (input->dtype == CSINN_DTYPE_UINT8) { - params->bc = csi_min_stride_u8; - } else if (input->dtype == CSINN_DTYPE_FLOAT32) { - params->bc = csi_min_stride_f32; - } else { - return CSINN_UNSUPPORT_DTYPE; - } + params->bc = csi_bc_map(params->api, CSINN_OP_MIN_STRIDE, input->dtype); + if (params->bc == NULL) { + return CSINN_UNSUPPORT_DTYPE; + } } return CSINN_TRUE; } int csi_min(struct csi_tensor *input, - struct csi_tensor *output, - struct reduce_params *params) + struct csi_tensor *output, + struct reduce_params *params) { if (params->bc != NULL) { params->bc(input, output, params); diff --git a/source/reference/minimum.c b/source/reference/minimum.c index dc63d8b7..a30c080a 100644 --- a/source/reference/minimum.c +++ b/source/reference/minimum.c @@ -19,10 +19,10 @@ #include "csi_nn.h" #include "csi_utils.h" -static int csi_minimum_f32(struct csi_tensor *input0, - struct csi_tensor 
*input1, - struct csi_tensor *output, - struct diso_params *params) +int csi_minimum_f32(struct csi_tensor *input0, + struct csi_tensor *input1, + struct csi_tensor *output, + struct diso_params *params) { float *input0_data = input0->data; float *input1_data = input1->data; @@ -38,10 +38,10 @@ static int csi_minimum_f32(struct csi_tensor *input0, return CSINN_TRUE; } -static int csi_minimum_u8(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int csi_minimum_u8(struct csi_tensor *input0, + struct csi_tensor *input1, + struct csi_tensor *output, + struct diso_params *params) { uint8_t *input0_data = input0->data; uint8_t *input1_data = input1->data; @@ -52,13 +52,13 @@ static int csi_minimum_u8(struct csi_tensor *input0, } for (int i = 0; i < size; i++) { - float input0_val = csi_dequantize_f32(input0_data[i], input0->offset, input0->multiplier, + float input0_val = csi_dequantize_u8_to_f32(input0_data[i], input0->zero_point, input0->multiplier, input0->shift); - float input1_val = csi_dequantize_f32(input1_data[i], input1->offset, input1->multiplier, + float input1_val = csi_dequantize_u8_to_f32(input1_data[i], input1->zero_point, input1->multiplier, input1->shift); float res = fmin(input0_val, input1_val); - output_data[i] = csi_quantize_f32(res, output->offset, output->multiplier, output->shift); + output_data[i] = csi_quantize_f32_to_u8(res, output->zero_point, output->multiplier, output->shift); } return CSINN_TRUE; } @@ -68,11 +68,8 @@ int csi_minimum_init(struct csi_tensor *input0, struct csi_tensor *output, struct diso_params *params) { - if (input0->dtype == CSINN_DTYPE_UINT8) { - params->bc = csi_minimum_u8; - } else if (input0->dtype == CSINN_DTYPE_FLOAT32) { - params->bc = csi_minimum_f32; - } else { + params->bc = csi_bc_map(params->api, CSINN_OP_MINIMUM, input0->dtype); + if (params->bc == NULL) { return CSINN_UNSUPPORT_DTYPE; } return CSINN_TRUE; diff --git a/source/reference/mod.c 
b/source/reference/mod.c index debce0e5..9cc559df 100644 --- a/source/reference/mod.c +++ b/source/reference/mod.c @@ -19,10 +19,10 @@ #include "csi_nn.h" #include "csi_utils.h" -static int csi_mod_f32(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int csi_mod_f32(struct csi_tensor *input0, + struct csi_tensor *input1, + struct csi_tensor *output, + struct diso_params *params) { float *input0_data = input0->data; float *input1_data = input1->data; @@ -39,10 +39,10 @@ static int csi_mod_f32(struct csi_tensor *input0, return CSINN_TRUE; } -static int csi_mod_u8(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int csi_mod_u8(struct csi_tensor *input0, + struct csi_tensor *input1, + struct csi_tensor *output, + struct diso_params *params) { uint8_t *input0_data = input0->data; uint8_t *input1_data = input1->data; @@ -53,36 +53,33 @@ static int csi_mod_u8(struct csi_tensor *input0, } for (int i = 0; i < size; i++) { - float input0_val = csi_dequantize_f32(input0_data[i], input0->offset, input0->multiplier, + float input0_val = csi_dequantize_u8_to_f32(input0_data[i], input0->zero_point, input0->multiplier, input0->shift); - float input1_val = csi_dequantize_f32(input1_data[i], input1->offset, input1->multiplier, + float input1_val = csi_dequantize_u8_to_f32(input1_data[i], input1->zero_point, input1->multiplier, input1->shift); float res = input0_val - floor(input0_val / input1_val) * input1_val; - output_data[i] = csi_quantize_f32(res, output->offset, output->multiplier, output->shift); + output_data[i] = csi_quantize_f32_to_u8(res, output->zero_point, output->multiplier, output->shift); } return CSINN_TRUE; } int csi_mod_init(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) + struct csi_tensor *input1, + struct csi_tensor *output, + struct diso_params *params) { - if 
(input0->dtype == CSINN_DTYPE_UINT8) { - params->bc = csi_mod_u8; - } else if (input0->dtype == CSINN_DTYPE_FLOAT32) { - params->bc = csi_mod_f32; - } else { + params->bc = csi_bc_map(params->api, CSINN_OP_MOD, input0->dtype); + if (params->bc == NULL) { return CSINN_UNSUPPORT_DTYPE; } return CSINN_TRUE; } int csi_mod(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) + struct csi_tensor *input1, + struct csi_tensor *output, + struct diso_params *params) { if (params->bc != NULL) { params->bc(input0, input1, output, params); diff --git a/source/reference/mul.c b/source/reference/mul.c index 8d650a03..3897019f 100644 --- a/source/reference/mul.c +++ b/source/reference/mul.c @@ -19,10 +19,10 @@ #include "csi_nn.h" #include "csi_utils.h" -static int csi_mul_f32(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int csi_mul_f32(struct csi_tensor *input0, + struct csi_tensor *input1, + struct csi_tensor *output, + struct diso_params *params) { float *input0_data = input0->data; float *input1_data = input1->data; @@ -59,10 +59,10 @@ static int csi_mul_f32(struct csi_tensor *input0, return CSINN_TRUE; } -static int csi_mul_u8(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int csi_mul_u8(struct csi_tensor *input0, + struct csi_tensor *input1, + struct csi_tensor *output, + struct diso_params *params) { uint8_t *input0_data = input0->data; uint8_t *input1_data = input1->data; @@ -89,11 +89,11 @@ static int csi_mul_u8(struct csi_tensor *input0, if(size0 == size1){ for (int i = 0; i < size0; i++) { float input0_val = - csi_dequantize_f32(input0_data[i], input0->offset, input0->multiplier, input0->shift); + csi_dequantize_u8_to_f32(input0_data[i], input0->zero_point, input0->multiplier, input0->shift); float input1_val = - csi_dequantize_f32(input1_data[i], input1->offset, 
input1->multiplier, input1->shift); + csi_dequantize_u8_to_f32(input1_data[i], input1->zero_point, input1->multiplier, input1->shift); float res = input0_val * input1_val; - output_data[i] = csi_quantize_f32(res, output->offset, output->multiplier, output->shift); + output_data[i] = csi_quantize_f32_to_u8(res, output->zero_point, output->multiplier, output->shift); } } else if(input1->dim[axis] == channel && size1 == input1->dim[axis]){ @@ -105,13 +105,13 @@ static int csi_mul_u8(struct csi_tensor *input0, else if (params->layout == CSINN_NCHW){channel = h;} float input1_val = - csi_dequantize_f32(input1_data[channel], input1->offset, input1->multiplier, input1->shift); + csi_dequantize_u8_to_f32(input1_data[channel], input1->zero_point, input1->multiplier, input1->shift); int index = csi_get_index(input0->dim, n, h, w, c); float input0_val = - csi_dequantize_f32(input0_data[index], input0->offset, input0->multiplier, input0->shift); + csi_dequantize_u8_to_f32(input0_data[index], input0->zero_point, input0->multiplier, input0->shift); float res = input0_val * input1_val; - output_data[index] = csi_quantize_f32(res, output->offset, output->multiplier, output->shift); + output_data[index] = csi_quantize_f32_to_u8(res, output->zero_point, output->multiplier, output->shift); } } } @@ -119,12 +119,12 @@ static int csi_mul_u8(struct csi_tensor *input0, } else if (input1->dim_count == 0){ float input1_val = - csi_dequantize_f32(input1_data[0], input1->offset, input1->multiplier, input1->shift); + csi_dequantize_u8_to_f32(input1_data[0], input1->zero_point, input1->multiplier, input1->shift); for (int i=0; i< size0; i++){ float input0_val = - csi_dequantize_f32(input0_data[i], input0->offset, input0->multiplier, input0->shift); + csi_dequantize_u8_to_f32(input0_data[i], input0->zero_point, input0->multiplier, input0->shift); float res = input0_val * input1_val; - output_data[i] = csi_quantize_f32(res, output->offset, output->multiplier, output->shift); + output_data[i] = 
csi_quantize_f32_to_u8(res, output->zero_point, output->multiplier, output->shift); } } @@ -136,13 +136,13 @@ static int csi_mul_u8(struct csi_tensor *input0, for(int m = 0; m < input0->dim[4]; m++){ int in1_index = l * input1->dim[1] + m; float input1_val = - csi_dequantize_f32(input1_data[in1_index], input1->offset, input1->multiplier, input1->shift); + csi_dequantize_u8_to_f32(input1_data[in1_index], input1->zero_point, input1->multiplier, input1->shift); int index = csi_get_index_5(input0->dim, i, j, k, l, m); float input0_val = - csi_dequantize_f32(input0_data[index], input0->offset, input0->multiplier, input0->shift); + csi_dequantize_u8_to_f32(input0_data[index], input0->zero_point, input0->multiplier, input0->shift); float res = input0_val * input1_val; - output_data[index] = csi_quantize_f32(res, output->offset, output->multiplier, output->shift); + output_data[index] = csi_quantize_f32_to_u8(res, output->zero_point, output->multiplier, output->shift); } } } @@ -153,24 +153,21 @@ static int csi_mul_u8(struct csi_tensor *input0, } int csi_mul_init(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) + struct csi_tensor *input1, + struct csi_tensor *output, + struct diso_params *params) { - if (input0->dtype == CSINN_DTYPE_UINT8) { - params->bc = csi_mul_u8; - } else if (input0->dtype == CSINN_DTYPE_FLOAT32) { - params->bc = csi_mul_f32; - } else { + params->bc = csi_bc_map(params->api, CSINN_OP_MUL, input0->dtype); + if (params->bc == NULL) { return CSINN_UNSUPPORT_DTYPE; } return CSINN_TRUE; } int csi_mul(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) + struct csi_tensor *input1, + struct csi_tensor *output, + struct diso_params *params) { if (params->bc != NULL) { params->bc(input0, input1, output, params); diff --git a/source/reference/ndarray_size.c b/source/reference/ndarray_size.c index 5d2f30ab..27e29c0c 100644 --- 
a/source/reference/ndarray_size.c +++ b/source/reference/ndarray_size.c @@ -19,9 +19,9 @@ #include "csi_nn.h" #include "csi_utils.h" -static int csi_ndarray_size_f32(struct csi_tensor *input, - struct csi_tensor *output, - struct ndarray_size_params *params) +int csi_ndarray_size_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct ndarray_size_params *params) { float *output_data = output->data; int size = 1; @@ -33,9 +33,9 @@ static int csi_ndarray_size_f32(struct csi_tensor *input, return CSINN_TRUE; } -static int csi_ndarray_size_u8(struct csi_tensor *input, - struct csi_tensor *output, - struct ndarray_size_params *params) +int csi_ndarray_size_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct ndarray_size_params *params) { uint8_t *output_data = output->data; int size = 1; @@ -47,9 +47,9 @@ static int csi_ndarray_size_u8(struct csi_tensor *input, return CSINN_TRUE; } -static int csi_ndarray_size_i32(struct csi_tensor *input, - struct csi_tensor *output, - struct ndarray_size_params *params) +int csi_ndarray_size_i32(struct csi_tensor *input, + struct csi_tensor *output, + struct ndarray_size_params *params) { int32_t *output_data = output->data; int size = 1; @@ -62,24 +62,19 @@ static int csi_ndarray_size_i32(struct csi_tensor *input, } int csi_ndarray_size_init(struct csi_tensor *input, - struct csi_tensor *output, - struct ndarray_size_params *params) + struct csi_tensor *output, + struct ndarray_size_params *params) { - if (input->dtype == CSINN_DTYPE_UINT8) { - params->bc = csi_ndarray_size_u8; - } else if (input->dtype == CSINN_DTYPE_INT32) { - params->bc = csi_ndarray_size_i32; - } else if (input->dtype == CSINN_DTYPE_FLOAT32) { - params->bc = csi_ndarray_size_f32; - } else { + params->bc = csi_bc_map(params->api, CSINN_OP_NDARRAY_SIZE, input->dtype); + if (params->bc == NULL) { return CSINN_UNSUPPORT_DTYPE; } return CSINN_TRUE; } int csi_ndarray_size(struct csi_tensor *input, - struct csi_tensor *output, - struct 
ndarray_size_params *params) + struct csi_tensor *output, + struct ndarray_size_params *params) { if (params->bc != NULL) { params->bc(input, output, params); diff --git a/source/reference/negative.c b/source/reference/negative.c index 6e5ce6fc..6c7ec56a 100644 --- a/source/reference/negative.c +++ b/source/reference/negative.c @@ -20,9 +20,9 @@ #include "csi_utils.h" #include -static int csi_negative_f32(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csi_negative_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct siso_params *params) { float *input_data = input->data; float *output_data = output->data; @@ -37,9 +37,9 @@ static int csi_negative_f32(struct csi_tensor *input, return CSINN_TRUE; } -static int csi_negative_u8(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csi_negative_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct siso_params *params) { uint8_t *input_data = input->data; uint8_t *output_data = output->data; @@ -49,24 +49,21 @@ static int csi_negative_u8(struct csi_tensor *input, } for (int i = 0; i < size; i++) { - float input0_val = csi_dequantize_f32(input_data[i], input->offset, input->multiplier, + float input0_val = csi_dequantize_u8_to_f32(input_data[i], input->zero_point, input->multiplier, input->shift); float res = -input0_val; - output_data[i] = csi_quantize_f32(res, output->offset, output->multiplier, output->shift); + output_data[i] = csi_quantize_f32_to_u8(res, output->zero_point, output->multiplier, output->shift); } return CSINN_TRUE; } int csi_negative_init(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) + struct csi_tensor *output, + struct siso_params *params) { - if (input->dtype == CSINN_DTYPE_UINT8) { - params->bc = csi_negative_u8; - } else if (input->dtype == CSINN_DTYPE_FLOAT32) { - params->bc = csi_negative_f32; - } else { + params->bc = csi_bc_map(params->api, 
CSINN_OP_NEGATIIVE, input->dtype); + if (params->bc == NULL) { return CSINN_UNSUPPORT_DTYPE; } return CSINN_TRUE; diff --git a/source/reference/non_max_suppression.c b/source/reference/non_max_suppression.c new file mode 100644 index 00000000..9a83cea5 --- /dev/null +++ b/source/reference/non_max_suppression.c @@ -0,0 +1,122 @@ +/* + * Copyright (C) 2016-2020 C-SKY Limited. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "csi_nn.h" +#include "csi_utils.h" + + +static int find_max_score_idx(const float *scores, int *flag, int len) +{ + int res = 0; + float max = FLT_MIN; + for(int i = 0; i < len; i++) { + if(scores[i] > max && !flag[i]) { + max = scores[i]; + res = i; + } + } + return res; +} + +// box = [y1, x1, y2, x2] +static float get_iou(const float *box1, const float *box2) +{ + // determine the (x, y)-coordinates of the intersection rectangle + float x1 = fmax(box1[0], box2[0]); + float y1 = fmax(box1[1], box2[1]); + float x2 = fmin(box1[2], box2[2]); + float y2 = fmin(box1[3], box2[3]); + // compute the area of intersection rectangle + float inter_area = fmax(0, x2 - x1) * fmax(0, y2 - y1); + // compute the area of both the prediction and ground-truth rectangles + float box1_area = (box1[2] - box1[0]) * (box1[3] - box1[1]); + float box2_area = (box2[2] - box2[0]) * (box2[3] - box2[1]);; + // compute the intersection over union by taking the intersection area and + // dividing it 
by the sum of prediction + ground-truth areas - the interesection area + float iou = inter_area / (box1_area + box2_area - inter_area); + return iou; +} + +int csi_non_max_suppression_std(struct csi_tensor *input0, + struct csi_tensor *input1, + struct csi_tensor *output, + struct non_max_suppression_params *params) +{ + float *boxes = (float *)input0->data; + float *scores = (float *)input1->data; + int *indices = (int *)output->data; + + float iou_threshold = params->iou_threshold; + int max_output_size = params->max_output_size; + + int box_num = input1->dim[0]; + int box_num_exist = box_num; + + int *flag = (int *)calloc(box_num, sizeof(int)); + + int box_cnt = 0; + while(box_num_exist) { + int max_box_idx = find_max_score_idx(scores, flag, box_num); + flag[max_box_idx] = 1; + box_num_exist--; + *indices++ = max_box_idx; + box_cnt++; + if(box_cnt == max_output_size) { + break; + } + for(int i = 0; i < box_num; i++) { + if(!flag[i]) { + float *box1_addr = boxes + 4 * max_box_idx; + float *box2_addr = boxes + 4 * i; + float iou = get_iou(box1_addr, box2_addr); + if(iou > iou_threshold) { + flag[i] = 1; + box_num_exist--; + } + } + } + } + free(flag); + return CSINN_TRUE; +} + +int csi_non_max_suppression_init(struct csi_tensor *input0, + struct csi_tensor *input1, + struct csi_tensor *output, + struct non_max_suppression_params *params) +{ + params->bc = csi_bc_map(params->api, CSINN_OP_NON_MAX_SUPPRESSION, input0->dtype); + if (params->bc == NULL) { + return CSINN_UNSUPPORT_DTYPE; + } + return CSINN_TRUE; +} + + +int csi_non_max_suppression(struct csi_tensor *input0, + struct csi_tensor *input1, + struct csi_tensor *output, + struct non_max_suppression_params *params) +{ + if(params->bc != NULL) { + params->bc(input0, input1, output, params); + } else { + return CSINN_CALLBACK_UNSET; + } + return CSINN_TRUE; +} \ No newline at end of file diff --git a/source/reference/not.c b/source/reference/not.c index 3e8373b8..dc3710b3 100644 --- a/source/reference/not.c +++ 
b/source/reference/not.c @@ -19,9 +19,9 @@ #include "csi_nn.h" #include -static int csi_not_u32(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csi_not_u32(struct csi_tensor *input, + struct csi_tensor *output, + struct siso_params *params) { uint32_t *input_data = input->data; @@ -37,9 +37,9 @@ static int csi_not_u32(struct csi_tensor *input, return CSINN_TRUE; } -static int csi_not_u8(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csi_not_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct siso_params *params) { uint8_t *input_data = input->data; uint8_t *output_data = output->data; @@ -58,11 +58,8 @@ int csi_not_init(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params) { - if (input->dtype == CSINN_DTYPE_UINT8) { - params->bc = csi_not_u8; - } else if (input->dtype == CSINN_DTYPE_UINT32) { - params->bc = csi_not_u32; - } else { + params->bc = csi_bc_map(params->api, CSINN_OP_NOT, input->dtype); + if (params->bc == NULL) { return CSINN_UNSUPPORT_DTYPE; } return CSINN_TRUE; diff --git a/source/reference/not_equal.c b/source/reference/not_equal.c index 50008f20..c5e2c4eb 100644 --- a/source/reference/not_equal.c +++ b/source/reference/not_equal.c @@ -19,10 +19,10 @@ #include "csi_nn.h" #include "csi_utils.h" -static int csi_not_equal_f32(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int csi_not_equal_f32(struct csi_tensor *input0, + struct csi_tensor *input1, + struct csi_tensor *output, + struct diso_params *params) { float *input0_data = input0->data; float *input1_data = input1->data; @@ -38,10 +38,10 @@ static int csi_not_equal_f32(struct csi_tensor *input0, return CSINN_TRUE; } -static int csi_not_equal_u8(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int csi_not_equal_u8(struct csi_tensor 
*input0, + struct csi_tensor *input1, + struct csi_tensor *output, + struct diso_params *params) { uint8_t *input0_data = input0->data; uint8_t *input1_data = input1->data; @@ -52,36 +52,33 @@ static int csi_not_equal_u8(struct csi_tensor *input0, } for (int i = 0; i < size; i++) { - float input0_val = csi_dequantize_f32(input0_data[i], input0->offset, input0->multiplier, + float input0_val = csi_dequantize_u8_to_f32(input0_data[i], input0->zero_point, input0->multiplier, input0->shift); - float input1_val = csi_dequantize_f32(input1_data[i], input1->offset, input1->multiplier, + float input1_val = csi_dequantize_u8_to_f32(input1_data[i], input1->zero_point, input1->multiplier, input1->shift); float res = input0_val != input1_val; - output_data[i] = csi_quantize_f32(res, output->offset, output->multiplier, output->shift); + output_data[i] = csi_quantize_f32_to_u8(res, output->zero_point, output->multiplier, output->shift); } return CSINN_TRUE; } int csi_not_equal_init(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) + struct csi_tensor *input1, + struct csi_tensor *output, + struct diso_params *params) { - if (input0->dtype == CSINN_DTYPE_UINT8) { - params->bc = csi_not_equal_u8; - } else if (input0->dtype == CSINN_DTYPE_FLOAT32) { - params->bc = csi_not_equal_f32; - } else { + params->bc = csi_bc_map(params->api, CSINN_OP_NOT_EQUAL, input0->dtype); + if (params->bc == NULL) { return CSINN_UNSUPPORT_DTYPE; } return CSINN_TRUE; } int csi_not_equal(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) + struct csi_tensor *input1, + struct csi_tensor *output, + struct diso_params *params) { if (params->bc != NULL) { params->bc(input0, input1, output, params); diff --git a/source/reference/or.c b/source/reference/or.c index acecb83e..19fa819f 100644 --- a/source/reference/or.c +++ b/source/reference/or.c @@ -19,11 +19,10 @@ #include "csi_nn.h" 
#include - -static int csi_or_u32(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int csi_or_u32(struct csi_tensor *input0, + struct csi_tensor *input1, + struct csi_tensor *output, + struct diso_params *params) { uint32_t *input0_data = input0->data; uint32_t *input1_data = input1->data; @@ -39,10 +38,10 @@ static int csi_or_u32(struct csi_tensor *input0, return CSINN_TRUE; } -static int csi_or_u8(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int csi_or_u8(struct csi_tensor *input0, + struct csi_tensor *input1, + struct csi_tensor *output, + struct diso_params *params) { uint8_t *input0_data = input0->data; uint8_t *input1_data = input1->data; @@ -59,24 +58,21 @@ static int csi_or_u8(struct csi_tensor *input0, } int csi_or_init(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) + struct csi_tensor *input1, + struct csi_tensor *output, + struct diso_params *params) { - if (input0->dtype == CSINN_DTYPE_UINT8) { - params->bc = csi_or_u8; - } else if (input0->dtype == CSINN_DTYPE_UINT32) { - params->bc = csi_or_u32; - } else { + params->bc = csi_bc_map(params->api, CSINN_OP_OR, input0->dtype); + if (params->bc == NULL) { return CSINN_UNSUPPORT_DTYPE; } return CSINN_TRUE; } int csi_or(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) + struct csi_tensor *input1, + struct csi_tensor *output, + struct diso_params *params) { if (params->bc != NULL) { params->bc(input0, input1, output, params); diff --git a/source/reference/pad.c b/source/reference/pad.c index 3f38cadb..f8f3b4cc 100644 --- a/source/reference/pad.c +++ b/source/reference/pad.c @@ -20,8 +20,8 @@ #include "csi_utils.h" static int csi_pad_nhwc_f32(struct csi_tensor *input, - struct csi_tensor *output, - struct pad_params *params) + struct 
csi_tensor *output, + struct pad_params *params) { const int output_batch = output->dim[0]; const int output_height = output->dim[1]; @@ -72,8 +72,8 @@ static int csi_pad_nhwc_f32(struct csi_tensor *input, } static int csi_pad_nhwc_u8(struct csi_tensor *input, - struct csi_tensor *output, - struct pad_params *params) + struct csi_tensor *output, + struct pad_params *params) { const int output_batch = output->dim[0]; const int output_height = output->dim[1]; @@ -123,8 +123,8 @@ static int csi_pad_nhwc_u8(struct csi_tensor *input, } static int csi_pad_nchw_f32(struct csi_tensor *input, - struct csi_tensor *output, - struct pad_params *params) + struct csi_tensor *output, + struct pad_params *params) { const int output_batch = output->dim[0]; const int output_depth = output->dim[1]; @@ -175,8 +175,8 @@ static int csi_pad_nchw_f32(struct csi_tensor *input, } static int csi_pad_nchw_u8(struct csi_tensor *input, - struct csi_tensor *output, - struct pad_params *params) + struct csi_tensor *output, + struct pad_params *params) { const int output_batch = output->dim[0]; const int output_depth = output->dim[1]; @@ -225,29 +225,40 @@ static int csi_pad_nchw_u8(struct csi_tensor *input, return CSINN_TRUE; } -int csi_pad_init(struct csi_tensor *input, - struct csi_tensor *output, - struct pad_params *params) +int csi_pad_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct pad_params *params) { if (params->layout == CSINN_NCHW) { - if (input->dtype == CSINN_DTYPE_UINT8) { - params->bc = csi_pad_nchw_u8; - } else if (input->dtype == CSINN_DTYPE_FLOAT32) { - params->bc = csi_pad_nchw_f32; - } else { - return CSINN_UNSUPPORT_DTYPE; - } - } else if (params->layout = CSINN_NHWC) { - if (input->dtype == CSINN_DTYPE_UINT8) { - params->bc = csi_pad_nhwc_u8; - } else if (input->dtype == CSINN_DTYPE_FLOAT32) { - params->bc = csi_pad_nhwc_f32; - } else { - return CSINN_UNSUPPORT_DTYPE; - } + csi_pad_nchw_f32(input, output, params); + } else if (params->layout == 
CSINN_NHWC) { + csi_pad_nhwc_f32(input, output, params); } else { return CSINN_UNSUPPORT_LAYOUT; } +} + +int csi_pad_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct pad_params *params) +{ + if (params->layout == CSINN_NCHW) { + csi_pad_nchw_u8(input, output, params); + } else if (params->layout == CSINN_NHWC) { + csi_pad_nhwc_u8(input, output, params); + } else { + return CSINN_UNSUPPORT_LAYOUT; + } +} + +int csi_pad_init(struct csi_tensor *input, + struct csi_tensor *output, + struct pad_params *params) +{ + params->bc = csi_bc_map(params->api, CSINN_OP_PAD, input->dtype); + if (params->bc == NULL) { + return CSINN_UNSUPPORT_DTYPE; + } return CSINN_TRUE; } diff --git a/source/reference/power.c b/source/reference/power.c index 7ef04469..8cbb2b08 100644 --- a/source/reference/power.c +++ b/source/reference/power.c @@ -19,10 +19,10 @@ #include "csi_nn.h" #include "csi_utils.h" -static int csi_power_f32(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int csi_power_f32(struct csi_tensor *input0, + struct csi_tensor *input1, + struct csi_tensor *output, + struct diso_params *params) { float *input0_data = input0->data; float *input1_data = input1->data; @@ -38,10 +38,10 @@ static int csi_power_f32(struct csi_tensor *input0, return CSINN_TRUE; } -static int csi_power_u8(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int csi_power_u8(struct csi_tensor *input0, + struct csi_tensor *input1, + struct csi_tensor *output, + struct diso_params *params) { uint8_t *input0_data = input0->data; uint8_t *input1_data = input1->data; @@ -52,13 +52,13 @@ static int csi_power_u8(struct csi_tensor *input0, } for (int i = 0; i < size; i++) { - float input0_val = csi_dequantize_f32(input0_data[i], input0->offset, input0->multiplier, + float input0_val = csi_dequantize_u8_to_f32(input0_data[i], input0->zero_point, input0->multiplier, 
input0->shift); - float input1_val = csi_dequantize_f32(input1_data[i], input1->offset, input1->multiplier, + float input1_val = csi_dequantize_u8_to_f32(input1_data[i], input1->zero_point, input1->multiplier, input1->shift); float res = powf(input0_val, input1_val); - output_data[i] = csi_quantize_f32(res, output->offset, output->multiplier, output->shift); + output_data[i] = csi_quantize_f32_to_u8(res, output->zero_point, output->multiplier, output->shift); } return CSINN_TRUE; } @@ -68,11 +68,8 @@ int csi_power_init(struct csi_tensor *input0, struct csi_tensor *output, struct diso_params *params) { - if (input0->dtype == CSINN_DTYPE_UINT8) { - params->bc = csi_power_u8; - } else if (input0->dtype == CSINN_DTYPE_FLOAT32) { - params->bc = csi_power_f32; - } else { + params->bc = csi_bc_map(params->api, CSINN_OP_POWER, input0->dtype); + if (params->bc == NULL) { return CSINN_UNSUPPORT_DTYPE; } return CSINN_TRUE; diff --git a/source/reference/prelu.c b/source/reference/prelu.c index 224da545..1f81d266 100644 --- a/source/reference/prelu.c +++ b/source/reference/prelu.c @@ -59,22 +59,22 @@ static int csi_prelu_nhwc_u8(struct csi_tensor *input, uint8_t *input_data = input->data; uint8_t *output_data = output->data; uint8_t *alpha_data = alpha->data; - const int32_t input_offset = input->offset; - const int32_t alpha_offset = alpha->offset; + const int32_t input_offset = input->zero_point; + const int32_t alpha_offset = alpha->zero_point; for (int b = 0; b < output->dim[0]; ++b) { for (int y = 0; y < output->dim[1]; ++y) { for (int x = 0; x < output->dim[2]; ++x) { for (int c = 0; c < output->dim[3]; ++c) { int index = csi_get_index(input->dim, b, y, x, c); - const float input_value = csi_dequantize_f32(input_data[index], input->offset, input->multiplier, input->shift); + const float input_value = csi_dequantize_u8_to_f32(input_data[index], input->zero_point, input->multiplier, input->shift); if (input_value >= 0) { - output_data[index] = csi_quantize_f32(input_value, 
- output->offset, output->multiplier, output->shift); + output_data[index] = csi_quantize_f32_to_u8(input_value, + output->zero_point, output->multiplier, output->shift); } else { - float alpha_val = csi_dequantize_f32(alpha_data[c], alpha->offset, alpha->multiplier, alpha->shift); - output_data[index] = csi_quantize_f32(input_value * alpha_val, - output->offset, output->multiplier, output->shift); + float alpha_val = csi_dequantize_u8_to_f32(alpha_data[c], alpha->zero_point, alpha->multiplier, alpha->shift); + output_data[index] = csi_quantize_f32_to_u8(input_value * alpha_val, + output->zero_point, output->multiplier, output->shift); } } } @@ -92,12 +92,12 @@ static int csi_prelu_nchw_f32(struct csi_tensor *input, float *output_data = output->data; float *alpha_data = alpha->data; for (int b = 0; b < output->dim[0]; ++b) { - for (int y = 0; y < output->dim[1]; ++y) { - for (int x = 0; x < output->dim[2]; ++x) { - for (int c = 0; c < output->dim[3]; ++c) { - int output_index = csi_get_index(output->dim, b, y, x, c); - int input_index = csi_get_index(input->dim, b, y, x, c); - const int32_t input_value = input->offset + input_data[input_index]; + for (int y = 0; y < output->dim[2]; ++y) { + for (int x = 0; x < output->dim[3]; ++x) { + for (int c = 0; c < output->dim[1]; ++c) { + int output_index = csi_get_index(output->dim, b, c, y, x); + int input_index = csi_get_index(input->dim, b, c, y, x); + float input_value = input_data[input_index]; if (input_value >= 0) { output_data[output_index] = input_data[input_index]; } else { @@ -115,8 +115,8 @@ static int csi_prelu_nchw_u8(struct csi_tensor *o_input, struct csi_tensor *o_output, struct prelu_params *params) { - struct csi_tensor* input = csi_nchw_to_nhwc_u8(o_input);; - struct csi_tensor* output = csi_nchw_to_nhwc_u8(o_output);; + struct csi_tensor* input = csi_nchw_to_nhwc_8(o_input);; + struct csi_tensor* output = csi_nchw_to_nhwc_8(o_output);; int num_elements = 1; for (int i = 0; i < output->dim_count; i++) { 
num_elements *= output->dim[i]; @@ -125,54 +125,67 @@ static int csi_prelu_nchw_u8(struct csi_tensor *o_input, uint8_t *input_data = input->data; uint8_t *output_data = output->data; uint8_t *alpha_data = alpha->data; - const int32_t input_offset = input->offset; - const int32_t alpha_offset = alpha->offset; + const int32_t input_offset = input->zero_point; + const int32_t alpha_offset = alpha->zero_point; for (int b = 0; b < output->dim[0]; ++b) { for (int y = 0; y < output->dim[1]; ++y) { for (int x = 0; x < output->dim[2]; ++x) { for (int c = 0; c < output->dim[3]; ++c) { int index = csi_get_index(input->dim, b, y, x, c); - const float input_value = csi_dequantize_f32(input_data[index], input->offset, input->multiplier, input->shift); + const float input_value = csi_dequantize_u8_to_f32(input_data[index], input->zero_point, input->multiplier, input->shift); if (input_value >= 0) { - output_data[index] = csi_quantize_f32(input_value, - output->offset, output->multiplier, output->shift); + output_data[index] = csi_quantize_f32_to_u8(input_value, + output->zero_point, output->multiplier, output->shift); } else { - float alpha_val = csi_dequantize_f32(alpha_data[c], alpha->offset, alpha->multiplier, alpha->shift); - output_data[index] = csi_quantize_f32(input_value * alpha_val, - output->offset, output->multiplier, output->shift); + float alpha_val = csi_dequantize_u8_to_f32(alpha_data[c], alpha->zero_point, alpha->multiplier, alpha->shift); + output_data[index] = csi_quantize_f32_to_u8(input_value * alpha_val, + output->zero_point, output->multiplier, output->shift); } } } } } - csi_nhwc_to_nchw_u8(o_output, output); + csi_nhwc_to_nchw_8(o_output, output); return CSINN_TRUE; } +int csi_prelu_f32(struct csi_tensor *input, + struct csi_tensor *alpha, + struct csi_tensor *output, + struct prelu_params *params) +{ + if (params->layout == CSINN_NCHW) { + csi_prelu_nchw_f32(input, alpha, output, params); + } else if (params->layout == CSINN_NHWC) { + 
csi_prelu_nhwc_f32(input, alpha, output, params); + } else { + return CSINN_UNSUPPORT_LAYOUT; + } +} + +int csi_prelu_u8(struct csi_tensor *input, + struct csi_tensor *alpha, + struct csi_tensor *output, + struct prelu_params *params) +{ + if (params->layout == CSINN_NCHW) { + csi_prelu_nhwc_u8(input, alpha, output, params); + } else if (params->layout == CSINN_NHWC) { + csi_prelu_nchw_u8(input, alpha, output, params); + } else { + return CSINN_UNSUPPORT_LAYOUT; + } +} + int csi_prelu_init(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, struct prelu_params *params) { - if (params->layout == CSINN_NCHW) { - if (input0->dtype == CSINN_DTYPE_UINT8) { - params->bc = csi_prelu_nchw_u8; - } else if (input0->dtype == CSINN_DTYPE_FLOAT32) { - params->bc = csi_prelu_nchw_f32; - } else { - return CSINN_UNSUPPORT_DTYPE; - } - } else if (params->layout = CSINN_NHWC) { - if (input0->dtype == CSINN_DTYPE_UINT8) { - params->bc = csi_prelu_nhwc_u8; - } else if (input0->dtype == CSINN_DTYPE_FLOAT32) { - params->bc = csi_prelu_nhwc_f32; - } else { - return CSINN_UNSUPPORT_DTYPE; - } - } else { - return CSINN_UNSUPPORT_LAYOUT; + params->bc = csi_bc_map(params->api, CSINN_OP_PRELU, input0->dtype); + if (params->bc == NULL) { + return CSINN_UNSUPPORT_DTYPE; } return CSINN_TRUE; } diff --git a/source/reference/prod.c b/source/reference/prod.c index 59b3704e..2081afc8 100644 --- a/source/reference/prod.c +++ b/source/reference/prod.c @@ -19,9 +19,9 @@ #include "csi_nn.h" #include "csi_utils.h" -static int csi_prod_stride_f32(struct csi_tensor *input, - struct csi_tensor *output, - struct reduce_params *params) +int csi_prod_stride_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct reduce_params *params) { float *input_data = input->data; @@ -58,9 +58,9 @@ static int csi_prod_stride_f32(struct csi_tensor *input, return CSINN_TRUE; } -static int csi_prod_stride_u8(struct csi_tensor *input, - struct csi_tensor *output, - struct reduce_params 
*params) +int csi_prod_stride_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct reduce_params *params) { uint8_t *input_data = input->data; @@ -88,29 +88,26 @@ static int csi_prod_stride_u8(struct csi_tensor *input, { int32_t index = out_index + get_reduction_index(inner, params->inner_strides, params->inner_extents, params->m); - float val = csi_dequantize_f32(input_data[index], input->offset, + float val = csi_dequantize_u8_to_f32(input_data[index], input->zero_point, input->multiplier, input->shift); result *= val; } - output_data[out] = csi_quantize_f32(result, output->offset, + output_data[out] = csi_quantize_f32_to_u8(result, output->zero_point, output->multiplier, output->shift); } return CSINN_TRUE; } int csi_prod_init(struct csi_tensor *input, - struct csi_tensor *output, - struct reduce_params *params) + struct csi_tensor *output, + struct reduce_params *params) { if (params->n == 0 && params->m == 0) { return CSINN_FALSE; } else { - if (input->dtype == CSINN_DTYPE_UINT8) { - params->bc = csi_prod_stride_u8; - } else if (input->dtype == CSINN_DTYPE_FLOAT32) { - params->bc = csi_prod_stride_f32; - } else { + params->bc = csi_bc_map(params->api, CSINN_OP_PROD, input->dtype); + if (params->bc == NULL) { return CSINN_UNSUPPORT_DTYPE; } } @@ -118,8 +115,8 @@ int csi_prod_init(struct csi_tensor *input, } int csi_prod(struct csi_tensor *input, - struct csi_tensor *output, - struct reduce_params *params) + struct csi_tensor *output, + struct reduce_params *params) { if (params->bc != NULL) { params->bc(input, output, params); diff --git a/source/reference/proposal.c b/source/reference/proposal.c index c772a3b8..203e39fd 100644 --- a/source/reference/proposal.c +++ b/source/reference/proposal.c @@ -30,7 +30,8 @@ struct bbox { }; static struct bbox reg_iou(float x1, float y1, float x2, float y2, float dx1, - float dy1, float dx2, float dy2) { + float dy1, float dx2, float dy2) +{ struct bbox pred; pred.x1 = x1 + dx1; pred.y1 = y1 + dy1; @@ -40,7 
+41,8 @@ static struct bbox reg_iou(float x1, float y1, float x2, float y2, float dx1, } static struct bbox reg_bbox(float x1, float y1, float x2, float y2, float dx, float dy, - float dw, float dh) { + float dw, float dh) +{ float bbox_w = x2 - x1 + 1.0; float bbox_h = y2 - y1 + 1.0; float ctr_x = x1 + 0.5 * (bbox_w - 1.0); @@ -59,7 +61,8 @@ static struct bbox reg_bbox(float x1, float y1, float x2, float y2, float dx, fl return pred; } -static struct bbox generate_anchor(float ratio, float scale, int32_t base_size) { +static struct bbox generate_anchor(float ratio, float scale, int32_t base_size) +{ float w, h; w = h = (float)base_size; float x_ctr = 0.5 * (w - 1.0); @@ -78,11 +81,12 @@ static struct bbox generate_anchor(float ratio, float scale, int32_t base_size) } static float *predict_bbox(struct csi_tensor *cls_prob_tensor, - struct csi_tensor *bbox_pred_tensor, - struct csi_tensor *im_info_tensor, float *ratios, - int32_t ratios_num, float *scales, int32_t scales_num, - int32_t feature_stride, int32_t iou_loss, - int32_t rpn_min_size) { + struct csi_tensor *bbox_pred_tensor, + struct csi_tensor *im_info_tensor, float *ratios, + int32_t ratios_num, float *scales, int32_t scales_num, + int32_t feature_stride, int32_t iou_loss, + int32_t rpn_min_size) +{ int len_scales = scales_num; int len_ratios = ratios_num; int batch = cls_prob_tensor->dim[0]; @@ -167,11 +171,13 @@ typedef struct { float data; } index_value; -static int argsort(const void *a, const void *b) { +static int argsort(const void *a, const void *b) +{ return ((((index_value *)a)->data - ((index_value *)b)->data > 0) ? 
-1 : 1); } -static float calculate_overlap(float *out_tensor, int box_a_idx, int box_b_idx) { +static float calculate_overlap(float *out_tensor, int box_a_idx, int box_b_idx) +{ float w = MAX(0.0, MIN(out_tensor[box_a_idx + 2], out_tensor[box_b_idx + 2]) - MAX(out_tensor[box_a_idx], out_tensor[box_b_idx]) + 1.0); @@ -188,7 +194,8 @@ static float calculate_overlap(float *out_tensor, int box_a_idx, int box_b_idx) } static float *compute_nms(int batch, int num_bbox, float *sorted_bbox, - float threshold) { + float threshold) +{ float *out = malloc(batch * num_bbox * sizeof(float)); for (int b = 0; b < batch; b++) { int base_idx = b * num_bbox; @@ -213,7 +220,8 @@ static float *compute_nms(int batch, int num_bbox, float *sorted_bbox, } static float *prepare_output(float *sorted_bbox, float *remove_mask, int batch, - int num_bbox, int rpn_post_nms_top_n) { + int num_bbox, int rpn_post_nms_top_n) +{ int *i = malloc(batch * sizeof(int)); int *nkeep = malloc(batch * sizeof(int)); float *output = malloc(batch * rpn_post_nms_top_n * 5 * sizeof(int)); @@ -251,11 +259,11 @@ static float *prepare_output(float *sorted_bbox, float *remove_mask, int batch, return output; } -static int csi_proposal_f32(struct csi_tensor *cls_prob, - struct csi_tensor *bbox_pred, - struct csi_tensor *im_info, - struct csi_tensor *output, - struct proposal_params *params) +int csi_proposal_f32(struct csi_tensor *cls_prob, + struct csi_tensor *bbox_pred, + struct csi_tensor *im_info, + struct csi_tensor *output, + struct proposal_params *params) { float *output_data = output->data; @@ -304,11 +312,11 @@ static int csi_proposal_f32(struct csi_tensor *cls_prob, for (int i = 0; i < batch * params->rpn_post_nms_top_n * 5; i++) { output_data[i] = nms_out[i]; } - + return CSINN_TRUE; } -static int csi_proposal_u8(struct csi_tensor *cls_prob, +int csi_proposal_u8(struct csi_tensor *cls_prob, struct csi_tensor *bbox_pred, struct csi_tensor *im_info, struct csi_tensor *output, @@ -361,12 +369,12 @@ static int 
csi_proposal_u8(struct csi_tensor *cls_prob, f_bbox.data = f_bbox_data; for (int i = 0; i < c_size; i++) { - f_cls_data[i] = csi_dequantize_f32(cls_data[i], cls_prob->offset, + f_cls_data[i] = csi_dequantize_u8_to_f32(cls_data[i], cls_prob->zero_point, cls_prob->multiplier, cls_prob->shift); } for (int i = 0; i < b_size; i++) { - f_bbox_data[i] = csi_dequantize_f32(bbox_data[i], bbox_pred->offset, + f_bbox_data[i] = csi_dequantize_u8_to_f32(bbox_data[i], bbox_pred->zero_point, bbox_pred->multiplier, bbox_pred->shift); } @@ -376,7 +384,7 @@ static int csi_proposal_u8(struct csi_tensor *cls_prob, csi_proposal_f32(&f_cls, &f_bbox, im_info, &float_output, params); for (int i = 0; i < out_size; i++) { - output_data[i] = csi_quantize_f32(float_output_data[i], output->offset, + output_data[i] = csi_quantize_f32_to_u8(float_output_data[i], output->zero_point, output->multiplier, output->shift); } free(float_output_data); @@ -391,11 +399,8 @@ int csi_proposal_init(struct csi_tensor *cls_prob, struct csi_tensor *output, struct proposal_params *params) { - if (cls_prob->dtype == CSINN_DTYPE_FLOAT32) { - params->bc = csi_proposal_f32; - } else if (cls_prob->dtype == CSINN_DTYPE_UINT8) { - params->bc = csi_proposal_u8; - } else { + params->bc = csi_bc_map(params->api, CSINN_OP_PROPOSAL, output->dtype); + if (params->bc == NULL) { return CSINN_UNSUPPORT_DTYPE; } return CSINN_TRUE; diff --git a/source/reference/psroipooling.c b/source/reference/psroipooling.c index 98316b18..59e0cca0 100644 --- a/source/reference/psroipooling.c +++ b/source/reference/psroipooling.c @@ -20,10 +20,10 @@ #include "csi_utils.h" #include -static int csi_psroipooling_f32(struct csi_tensor *data, - struct csi_tensor *rois, - struct csi_tensor *output, - struct psroipooling_params *params) +int csi_psroipooling_f32(struct csi_tensor *data, + struct csi_tensor *rois, + struct csi_tensor *output, + struct psroipooling_params *params) { float *output_data = output->data; float *bottom_data = data->data; @@ 
-93,10 +93,10 @@ static int csi_psroipooling_f32(struct csi_tensor *data, return CSINN_TRUE; } -static int csi_psroipooling_u8(struct csi_tensor *data, - struct csi_tensor *rois, - struct csi_tensor *output, - struct psroipooling_params *params) +int csi_psroipooling_u8(struct csi_tensor *data, + struct csi_tensor *rois, + struct csi_tensor *output, + struct psroipooling_params *params) { uint8_t *output_data = output->data; @@ -122,7 +122,7 @@ static int csi_psroipooling_u8(struct csi_tensor *data, uint8_t *data_item_data = data->data; for (int k = 0; k < size; k++) { - float_data_item_data[k] = csi_dequantize_f32(data_item_data[k], data->offset, + float_data_item_data[k] = csi_dequantize_u8_to_f32(data_item_data[k], data->zero_point, data->multiplier, data->shift); } float_data.data = float_data_item_data; @@ -138,17 +138,17 @@ static int csi_psroipooling_u8(struct csi_tensor *data, uint8_t *rois_item_data = rois->data; for (int k = 0; k < size; k++) { - float_rois_item_data[k] = csi_dequantize_f32(rois_item_data[k], rois->offset, + float_rois_item_data[k] = csi_dequantize_u8_to_f32(rois_item_data[k], rois->zero_point, rois->multiplier, rois->shift); } float_rois.data = float_rois_item_data; - params->spatial_scale = csi_dequantize_f32(1.0, 0, params->spatial_scale_multiplier, params->spatial_scale_shift); + params->spatial_scale = csi_dequantize_u8_to_f32(1.0, 0, params->spatial_scale_multiplier, params->spatial_scale_shift); csi_psroipooling_f32(&float_data, &float_rois, &float_output, params); for (int i = 0; i < outer_size; i++) { - output_data[i] = csi_quantize_f32(float_output_data[i], output->offset, + output_data[i] = csi_quantize_f32_to_u8(float_output_data[i], output->zero_point, output->multiplier, output->shift); } free(float_data_item_data); @@ -163,11 +163,8 @@ int csi_psroipooling_init(struct csi_tensor *data, struct csi_tensor *output, struct psroipooling_params *params) { - if (data->dtype == CSINN_DTYPE_UINT8) { - params->bc = 
csi_psroipooling_u8; - } else if (data->dtype == CSINN_DTYPE_FLOAT32) { - params->bc = csi_psroipooling_f32; - } else { + params->bc = csi_bc_map(params->api, CSINN_OP_PSROIPOOLING, data->dtype); + if (params->bc == NULL) { return CSINN_UNSUPPORT_DTYPE; } return CSINN_TRUE; diff --git a/source/reference/reduce_logsumexp.c b/source/reference/reduce_logsumexp.c index 208c4ed3..5d3c6745 100644 --- a/source/reference/reduce_logsumexp.c +++ b/source/reference/reduce_logsumexp.c @@ -19,10 +19,9 @@ #include "csi_nn.h" #include "csi_utils.h" - -static int csi_reduce_logsumexp_f32(struct csi_tensor *input, - struct csi_tensor *output, - struct reduce_params *params) +int csi_reduce_logsumexp_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct reduce_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -67,7 +66,7 @@ static int csi_reduce_logsumexp_f32(struct csi_tensor *input, } -static int csi_reduce_logsumexp_u8(struct csi_tensor *input, +int csi_reduce_logsumexp_u8(struct csi_tensor *input, struct csi_tensor *output, struct reduce_params *params) { @@ -82,10 +81,10 @@ static int csi_reduce_logsumexp_u8(struct csi_tensor *input, } float res = 0.0f; for(int j = 0; j < size; j++) { - float input_temp = csi_dequantize_f32(input_data[j], input->offset, input->multiplier, input->shift); + float input_temp = csi_dequantize_u8_to_f32(input_data[j], input->zero_point, input->multiplier, input->shift); res = res + exp(input_temp); } - *output_data = csi_quantize_f32(log(res), output->offset, output->multiplier, output->shift); + *output_data = csi_quantize_f32_to_u8(log(res), output->zero_point, output->multiplier, output->shift); } else { int axis = *(params->axis); int64_t outer_size = 1; @@ -103,10 +102,10 @@ static int csi_reduce_logsumexp_u8(struct csi_tensor *input, float temp = 0.0f; for(int j = 0; j < cnt; j++) { uint8_t input_val = *(input_data + j * inner_size + k); - float input_temp = 
csi_dequantize_f32(input_val, input->offset, input->multiplier, input->shift); + float input_temp = csi_dequantize_u8_to_f32(input_val, input->zero_point, input->multiplier, input->shift); temp += exp(input_temp); } - *(output_data + k) = csi_quantize_f32(log(temp), output->offset, output->multiplier, output->shift); + *(output_data + k) = csi_quantize_f32_to_u8(log(temp), output->zero_point, output->multiplier, output->shift); } input_data += inner_size * cnt; output_data += inner_size; @@ -116,22 +115,19 @@ static int csi_reduce_logsumexp_u8(struct csi_tensor *input, } int csi_reduce_logsumexp_init(struct csi_tensor *input, - struct csi_tensor *output, - struct reduce_params *params) + struct csi_tensor *output, + struct reduce_params *params) { - if (input->dtype == CSINN_DTYPE_UINT8) { - params->bc = csi_reduce_logsumexp_u8; - } else if (input->dtype == CSINN_DTYPE_FLOAT32) { - params->bc = csi_reduce_logsumexp_f32; - } else { + params->bc = csi_bc_map(params->api, CSINN_OP_REDUCE_LOGSUMEXP, input->dtype); + if (params->bc == NULL) { return CSINN_UNSUPPORT_DTYPE; } return CSINN_TRUE; } int csi_reduce_logsumexp(struct csi_tensor *input, - struct csi_tensor *output, - struct reduce_params *params) + struct csi_tensor *output, + struct reduce_params *params) { if (params->bc != NULL) { params->bc(input, output, params); diff --git a/source/reference/reduce_max.c b/source/reference/reduce_max.c index 7f1d8c14..bcf5a477 100644 --- a/source/reference/reduce_max.c +++ b/source/reference/reduce_max.c @@ -19,10 +19,9 @@ #include "csi_nn.h" #include "csi_utils.h" - -static int csi_reduce_max_f32(struct csi_tensor *input, - struct csi_tensor *output, - struct reduce_params *params) +int csi_reduce_max_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct reduce_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -65,9 +64,9 @@ static int csi_reduce_max_f32(struct csi_tensor *input, return CSINN_TRUE; } 
-static int csi_reduce_max_u8(struct csi_tensor *input, - struct csi_tensor *output, - struct reduce_params *params) +int csi_reduce_max_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct reduce_params *params) { uint8_t *input_data = (uint8_t *)input->data; uint8_t *output_data = (uint8_t *)output->data; @@ -78,12 +77,12 @@ static int csi_reduce_max_u8(struct csi_tensor *input, for(int i=0; idim_count; i++) { size = size * input->dim[i]; } - float res = csi_dequantize_f32(input_data[0], input->offset, input->multiplier, input->shift); + float res = csi_dequantize_u8_to_f32(input_data[0], input->zero_point, input->multiplier, input->shift); for(int j = 1; j < size; j++) { - float input_temp = csi_dequantize_f32(input_data[j], input->offset, input->multiplier, input->shift); + float input_temp = csi_dequantize_u8_to_f32(input_data[j], input->zero_point, input->multiplier, input->shift); res = fmax(res, input_temp); } - *output_data = csi_quantize_f32(res, output->offset, output->multiplier, output->shift); + *output_data = csi_quantize_f32_to_u8(res, output->zero_point, output->multiplier, output->shift); } else { int axis = *(params->axis); int64_t outer_size = 1; @@ -98,13 +97,13 @@ static int csi_reduce_max_u8(struct csi_tensor *input, for(int i = 0; i < outer_size; i++) { for(int k = 0; k < inner_size; k++) { - float temp = csi_dequantize_f32(input_data[k], input->offset, input->multiplier, input->shift); + float temp = csi_dequantize_u8_to_f32(input_data[k], input->zero_point, input->multiplier, input->shift); for(int j = 1; j < cnt; j++) { uint8_t input_val = *(input_data + j * inner_size + k); - float input_temp = csi_dequantize_f32(input_val, input->offset, input->multiplier, input->shift); + float input_temp = csi_dequantize_u8_to_f32(input_val, input->zero_point, input->multiplier, input->shift); temp = fmax(temp, input_temp); } - *(output_data + k) = csi_quantize_f32(temp, output->offset, output->multiplier, output->shift); + *(output_data 
+ k) = csi_quantize_f32_to_u8(temp, output->zero_point, output->multiplier, output->shift); } input_data += inner_size * cnt; output_data += inner_size; @@ -117,11 +116,8 @@ int csi_reduce_max_init(struct csi_tensor *input, struct csi_tensor *output, struct reduce_params *params) { - if (input->dtype == CSINN_DTYPE_UINT8) { - params->bc = csi_reduce_max_u8; - } else if (input->dtype == CSINN_DTYPE_FLOAT32) { - params->bc = csi_reduce_max_f32; - } else { + params->bc = csi_bc_map(params->api, CSINN_OP_REDUCE_MAX, input->dtype); + if (params->bc == NULL) { return CSINN_UNSUPPORT_DTYPE; } return CSINN_TRUE; diff --git a/source/reference/reduce_mean.c b/source/reference/reduce_mean.c index 5ac3ba15..9c1fd997 100644 --- a/source/reference/reduce_mean.c +++ b/source/reference/reduce_mean.c @@ -19,10 +19,9 @@ #include "csi_nn.h" #include "csi_utils.h" - -static int csi_reduce_mean_f32(struct csi_tensor *input, - struct csi_tensor *output, - struct reduce_params *params) +int csi_reduce_mean_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct reduce_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -65,9 +64,9 @@ static int csi_reduce_mean_f32(struct csi_tensor *input, return CSINN_TRUE; } -static int csi_reduce_mean_u8(struct csi_tensor *input, - struct csi_tensor *output, - struct reduce_params *params) +int csi_reduce_mean_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct reduce_params *params) { uint8_t *input_data = (uint8_t *)input->data; uint8_t *output_data = (uint8_t *)output->data; @@ -80,10 +79,10 @@ static int csi_reduce_mean_u8(struct csi_tensor *input, } float res = 0.0f; for(int j = 0; j < size; j++) { - float input_temp = csi_dequantize_f32(input_data[j], input->offset, input->multiplier, input->shift); + float input_temp = csi_dequantize_u8_to_f32(input_data[j], input->zero_point, input->multiplier, input->shift); res = res + input_temp; } - *output_data = 
csi_quantize_f32(res / size, output->offset, output->multiplier, output->shift); + *output_data = csi_quantize_f32_to_u8(res / size, output->zero_point, output->multiplier, output->shift); } else { int axis = *(params->axis); int64_t outer_size = 1; @@ -101,10 +100,10 @@ static int csi_reduce_mean_u8(struct csi_tensor *input, float temp = 0.0f; for(int j = 0; j < cnt; j++) { uint8_t input_val = *(input_data + j * inner_size + k); - float input_temp = csi_dequantize_f32(input_val, input->offset, input->multiplier, input->shift); + float input_temp = csi_dequantize_u8_to_f32(input_val, input->zero_point, input->multiplier, input->shift); temp += input_temp; } - *(output_data + k) = csi_quantize_f32(temp / cnt, output->offset, output->multiplier, output->shift); + *(output_data + k) = csi_quantize_f32_to_u8(temp / cnt, output->zero_point, output->multiplier, output->shift); } input_data += inner_size * cnt; output_data += inner_size; @@ -114,14 +113,11 @@ static int csi_reduce_mean_u8(struct csi_tensor *input, } int csi_reduce_mean_init(struct csi_tensor *input, - struct csi_tensor *output, - struct reduce_params *params) + struct csi_tensor *output, + struct reduce_params *params) { - if (input->dtype == CSINN_DTYPE_UINT8) { - params->bc = csi_reduce_mean_u8; - } else if (input->dtype == CSINN_DTYPE_FLOAT32) { - params->bc = csi_reduce_mean_f32; - } else { + params->bc = csi_bc_map(params->api, CSINN_OP_REDUCE_MEAN, input->dtype); + if (params->bc == NULL) { return CSINN_UNSUPPORT_DTYPE; } return CSINN_TRUE; diff --git a/source/reference/reduce_min.c b/source/reference/reduce_min.c index a34d8c55..47efa760 100644 --- a/source/reference/reduce_min.c +++ b/source/reference/reduce_min.c @@ -19,9 +19,9 @@ #include "csi_nn.h" #include "csi_utils.h" -static int csi_reduce_min_f32(struct csi_tensor *input, - struct csi_tensor *output, - struct reduce_params *params) +int csi_reduce_min_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct reduce_params 
*params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -64,9 +64,9 @@ static int csi_reduce_min_f32(struct csi_tensor *input, return CSINN_TRUE; } -static int csi_reduce_min_u8(struct csi_tensor *input, - struct csi_tensor *output, - struct reduce_params *params) +int csi_reduce_min_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct reduce_params *params) { uint8_t *input_data = (uint8_t *)input->data; uint8_t *output_data = (uint8_t *)output->data; @@ -77,12 +77,12 @@ static int csi_reduce_min_u8(struct csi_tensor *input, for(int i=0; idim_count; i++) { size = size * input->dim[i]; } - float res = csi_dequantize_f32(input_data[0], input->offset, input->multiplier, input->shift); + float res = csi_dequantize_u8_to_f32(input_data[0], input->zero_point, input->multiplier, input->shift); for(int j = 1; j < size; j++) { - float input_temp = csi_dequantize_f32(input_data[j], input->offset, input->multiplier, input->shift); + float input_temp = csi_dequantize_u8_to_f32(input_data[j], input->zero_point, input->multiplier, input->shift); res = fmin(res, input_temp); } - *output_data = csi_quantize_f32(res, output->offset, output->multiplier, output->shift); + *output_data = csi_quantize_f32_to_u8(res, output->zero_point, output->multiplier, output->shift); } else { int axis = *(params->axis); int64_t outer_size = 1; @@ -97,13 +97,13 @@ static int csi_reduce_min_u8(struct csi_tensor *input, for(int i = 0; i < outer_size; i++) { for(int k = 0; k < inner_size; k++) { - float temp = csi_dequantize_f32(input_data[k], input->offset, input->multiplier, input->shift); + float temp = csi_dequantize_u8_to_f32(input_data[k], input->zero_point, input->multiplier, input->shift); for(int j = 1; j < cnt; j++) { uint8_t input_val = *(input_data + j * inner_size + k); - float input_temp = csi_dequantize_f32(input_val, input->offset, input->multiplier, input->shift); + float input_temp = csi_dequantize_u8_to_f32(input_val, 
input->zero_point, input->multiplier, input->shift); temp = fmin(temp, input_temp); } - *(output_data + k) = csi_quantize_f32(temp, output->offset, output->multiplier, output->shift); + *(output_data + k) = csi_quantize_f32_to_u8(temp, output->zero_point, output->multiplier, output->shift); } input_data += inner_size * cnt; output_data += inner_size; @@ -116,11 +116,8 @@ int csi_reduce_min_init(struct csi_tensor *input, struct csi_tensor *output, struct reduce_params *params) { - if (input->dtype == CSINN_DTYPE_UINT8) { - params->bc = csi_reduce_min_u8; - } else if (input->dtype == CSINN_DTYPE_FLOAT32) { - params->bc = csi_reduce_min_f32; - } else { + params->bc = csi_bc_map(params->api, CSINN_OP_REDUCE_MIN, input->dtype); + if (params->bc == NULL) { return CSINN_UNSUPPORT_DTYPE; } return CSINN_TRUE; diff --git a/source/reference/reduce_prod.c b/source/reference/reduce_prod.c index c78e5e59..bc6e7645 100644 --- a/source/reference/reduce_prod.c +++ b/source/reference/reduce_prod.c @@ -19,10 +19,9 @@ #include "csi_nn.h" #include "csi_utils.h" - -static int csi_reduce_prod_f32(struct csi_tensor *input, - struct csi_tensor *output, - struct reduce_params *params) +int csi_reduce_prod_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct reduce_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -65,9 +64,9 @@ static int csi_reduce_prod_f32(struct csi_tensor *input, return CSINN_TRUE; } -static int csi_reduce_prod_u8(struct csi_tensor *input, - struct csi_tensor *output, - struct reduce_params *params) +int csi_reduce_prod_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct reduce_params *params) { uint8_t *input_data = (uint8_t *)input->data; uint8_t *output_data = (uint8_t *)output->data; @@ -80,10 +79,10 @@ static int csi_reduce_prod_u8(struct csi_tensor *input, } float res = 1.0f; for(int j = 0; j < size; j++) { - float input_temp = csi_dequantize_f32(input_data[j], input->offset, 
input->multiplier, input->shift); + float input_temp = csi_dequantize_u8_to_f32(input_data[j], input->zero_point, input->multiplier, input->shift); res = res * input_temp; } - *output_data = csi_quantize_f32(res, output->offset, output->multiplier, output->shift); + *output_data = csi_quantize_f32_to_u8(res, output->zero_point, output->multiplier, output->shift); } else { int axis = *(params->axis); int64_t outer_size = 1; @@ -101,10 +100,10 @@ static int csi_reduce_prod_u8(struct csi_tensor *input, float temp = 1.0f; for(int j = 0; j < cnt; j++) { uint8_t input_val = *(input_data + j * inner_size + k); - float input_temp = csi_dequantize_f32(input_val, input->offset, input->multiplier, input->shift); + float input_temp = csi_dequantize_u8_to_f32(input_val, input->zero_point, input->multiplier, input->shift); temp *= input_temp; } - *(output_data + k) = csi_quantize_f32(temp, output->offset, output->multiplier, output->shift); + *(output_data + k) = csi_quantize_f32_to_u8(temp, output->zero_point, output->multiplier, output->shift); } input_data += inner_size * cnt; output_data += inner_size; @@ -114,14 +113,11 @@ static int csi_reduce_prod_u8(struct csi_tensor *input, } int csi_reduce_prod_init(struct csi_tensor *input, - struct csi_tensor *output, - struct reduce_params *params) + struct csi_tensor *output, + struct reduce_params *params) { - if (input->dtype == CSINN_DTYPE_UINT8) { - params->bc = csi_reduce_prod_u8; - } else if (input->dtype == CSINN_DTYPE_FLOAT32) { - params->bc = csi_reduce_prod_f32; - } else { + params->bc = csi_bc_map(params->api, CSINN_OP_REDUCE_PROD, input->dtype); + if (params->bc == NULL) { return CSINN_UNSUPPORT_DTYPE; } return CSINN_TRUE; diff --git a/source/reference/reduce_sum.c b/source/reference/reduce_sum.c index d5a870dd..cc5c0094 100644 --- a/source/reference/reduce_sum.c +++ b/source/reference/reduce_sum.c @@ -19,10 +19,9 @@ #include "csi_nn.h" #include "csi_utils.h" - -static int csi_reduce_sum_f32(struct csi_tensor *input, - 
struct csi_tensor *output, - struct reduce_params *params) +int csi_reduce_sum_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct reduce_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -65,9 +64,9 @@ static int csi_reduce_sum_f32(struct csi_tensor *input, return CSINN_TRUE; } -static int csi_reduce_sum_u8(struct csi_tensor *input, - struct csi_tensor *output, - struct reduce_params *params) +int csi_reduce_sum_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct reduce_params *params) { uint8_t *input_data = (uint8_t *)input->data; uint8_t *output_data = (uint8_t *)output->data; @@ -80,10 +79,10 @@ static int csi_reduce_sum_u8(struct csi_tensor *input, } float res = 0.0f; for(int j = 0; j < size; j++) { - float input_temp = csi_dequantize_f32(input_data[j], input->offset, input->multiplier, input->shift); + float input_temp = csi_dequantize_u8_to_f32(input_data[j], input->zero_point, input->multiplier, input->shift); res = res + input_temp; } - *output_data = csi_quantize_f32(res, output->offset, output->multiplier, output->shift); + *output_data = csi_quantize_f32_to_u8(res, output->zero_point, output->multiplier, output->shift); } else { int axis = *(params->axis); int64_t outer_size = 1; @@ -101,10 +100,10 @@ static int csi_reduce_sum_u8(struct csi_tensor *input, float temp = 0.0f; for(int j = 0; j < cnt; j++) { uint8_t input_val = *(input_data + j * inner_size + k); - float input_temp = csi_dequantize_f32(input_val, input->offset, input->multiplier, input->shift); + float input_temp = csi_dequantize_u8_to_f32(input_val, input->zero_point, input->multiplier, input->shift); temp += input_temp; } - *(output_data + k) = csi_quantize_f32(temp, output->offset, output->multiplier, output->shift); + *(output_data + k) = csi_quantize_f32_to_u8(temp, output->zero_point, output->multiplier, output->shift); } input_data += inner_size * cnt; output_data += inner_size; @@ -117,11 +116,8 
@@ int csi_reduce_sum_init(struct csi_tensor *input, struct csi_tensor *output, struct reduce_params *params) { - if (input->dtype == CSINN_DTYPE_UINT8) { - params->bc = csi_reduce_sum_u8; - } else if (input->dtype == CSINN_DTYPE_FLOAT32) { - params->bc = csi_reduce_sum_f32; - } else { + params->bc = csi_bc_map(params->api, CSINN_OP_REDUCE_SUM, input->dtype); + if (params->bc == NULL) { return CSINN_UNSUPPORT_DTYPE; } return CSINN_TRUE; diff --git a/source/reference/relu.c b/source/reference/relu.c index 159b272e..18fe00fc 100644 --- a/source/reference/relu.c +++ b/source/reference/relu.c @@ -24,9 +24,9 @@ static float relu(float x){ return x > 0 ? x : 0; } -static int csi_relu_f32(struct csi_tensor *input, - struct csi_tensor *output, - struct relu_params *params) +int csi_relu_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct relu_params *params) { float *input_data = input->data; float *output_data = output->data; @@ -41,9 +41,9 @@ static int csi_relu_f32(struct csi_tensor *input, return CSINN_TRUE; } -static int csi_relu_u8(struct csi_tensor *input, - struct csi_tensor *output, - struct relu_params *params) +int csi_relu_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct relu_params *params) { uint8_t *input_data = input->data; uint8_t *output_data = output->data; @@ -54,11 +54,11 @@ static int csi_relu_u8(struct csi_tensor *input, #pragma omp parallel for num_threads(8) for (int i = 0; i < size; i++) { - float input0_val = csi_dequantize_f32(input_data[i], input->offset, input->multiplier, + float input0_val = csi_dequantize_u8_to_f32(input_data[i], input->zero_point, input->multiplier, input->shift); float res = relu(input0_val); - output_data[i] = csi_quantize_f32(res, output->offset, output->multiplier, output->shift); + output_data[i] = csi_quantize_f32_to_u8(res, output->zero_point, output->multiplier, output->shift); } return CSINN_TRUE; } @@ -67,11 +67,8 @@ int csi_relu_init(struct csi_tensor *input, struct csi_tensor 
*output, struct relu_params *params) { - if (input->dtype == CSINN_DTYPE_UINT8) { - params->bc = csi_relu_u8; - } else if (input->dtype == CSINN_DTYPE_FLOAT32) { - params->bc = csi_relu_f32; - } else { + params->bc = csi_bc_map(params->api, CSINN_OP_RELU, input->dtype); + if (params->bc == NULL) { return CSINN_UNSUPPORT_DTYPE; } return CSINN_TRUE; diff --git a/source/reference/relu1.c b/source/reference/relu1.c index b476baf2..a5990370 100644 --- a/source/reference/relu1.c +++ b/source/reference/relu1.c @@ -24,9 +24,9 @@ static float relu1(float x){ return fmin(x > 0 ? x : 0, 1); } -static int csi_relu1_f32(struct csi_tensor *input, - struct csi_tensor *output, - struct relu_params *params) +int csi_relu1_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct relu_params *params) { float *input_data = input->data; float *output_data = output->data; @@ -41,9 +41,9 @@ static int csi_relu1_f32(struct csi_tensor *input, return CSINN_TRUE; } -static int csi_relu1_u8(struct csi_tensor *input, - struct csi_tensor *output, - struct relu_params *params) +int csi_relu1_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct relu_params *params) { uint8_t *input_data = input->data; uint8_t *output_data = output->data; @@ -53,11 +53,11 @@ static int csi_relu1_u8(struct csi_tensor *input, } for (int i = 0; i < size; i++) { - float input0_val = csi_dequantize_f32(input_data[i], input->offset, input->multiplier, + float input0_val = csi_dequantize_u8_to_f32(input_data[i], input->zero_point, input->multiplier, input->shift); float res = relu1(input0_val); - output_data[i] = csi_quantize_f32(res, output->offset, output->multiplier, output->shift); + output_data[i] = csi_quantize_f32_to_u8(res, output->zero_point, output->multiplier, output->shift); } return CSINN_TRUE; } @@ -66,11 +66,8 @@ int csi_relu1_init(struct csi_tensor *input, struct csi_tensor *output, struct relu_params *params) { - if (input->dtype == CSINN_DTYPE_UINT8) { - params->bc = 
csi_relu1_u8; - } else if (input->dtype == CSINN_DTYPE_FLOAT32) { - params->bc = csi_relu1_f32; - } else { + params->bc = csi_bc_map(params->api, CSINN_OP_RELU1, input->dtype); + if (params->bc == NULL) { return CSINN_UNSUPPORT_DTYPE; } return CSINN_TRUE; diff --git a/source/reference/relu6.c b/source/reference/relu6.c index 6bbb3f91..d51e3a5b 100644 --- a/source/reference/relu6.c +++ b/source/reference/relu6.c @@ -24,9 +24,9 @@ static float relu6(float x){ return fmin(x > 0 ? x : 0, 6); } -static int csi_relu6_f32(struct csi_tensor *input, - struct csi_tensor *output, - struct relu_params *params) +int csi_relu6_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct relu_params *params) { float *input_data = input->data; float *output_data = output->data; @@ -41,9 +41,9 @@ static int csi_relu6_f32(struct csi_tensor *input, return CSINN_TRUE; } -static int csi_relu6_u8(struct csi_tensor *input, - struct csi_tensor *output, - struct relu_params *params) +int csi_relu6_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct relu_params *params) { uint8_t *input_data = input->data; uint8_t *output_data = output->data; @@ -53,11 +53,11 @@ static int csi_relu6_u8(struct csi_tensor *input, } for (int i = 0; i < size; i++) { - float input0_val = csi_dequantize_f32(input_data[i], input->offset, input->multiplier, + float input0_val = csi_dequantize_u8_to_f32(input_data[i], input->zero_point, input->multiplier, input->shift); float res = relu6(input0_val); - output_data[i] = csi_quantize_f32(res, output->offset, output->multiplier, output->shift); + output_data[i] = csi_quantize_f32_to_u8(res, output->zero_point, output->multiplier, output->shift); } return CSINN_TRUE; } @@ -66,11 +66,8 @@ int csi_relu6_init(struct csi_tensor *input, struct csi_tensor *output, struct relu_params *params) { - if (input->dtype == CSINN_DTYPE_UINT8) { - params->bc = csi_relu6_u8; - } else if (input->dtype == CSINN_DTYPE_FLOAT32) { - params->bc = csi_relu6_f32; - } else 
{ + params->bc = csi_bc_map(params->api, CSINN_OP_RELU6, input->dtype); + if (params->bc == NULL) { return CSINN_UNSUPPORT_DTYPE; } return CSINN_TRUE; diff --git a/source/reference/relun.c b/source/reference/relun.c index 7060063f..b4d90fc6 100644 --- a/source/reference/relun.c +++ b/source/reference/relun.c @@ -23,9 +23,9 @@ static float relun(float x, float y){ return fmin(x > 0.0 ? x : 0.0, y); } -static int csi_relun_f32(struct csi_tensor *input, - struct csi_tensor *output, - struct relu_params *params) +int csi_relun_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct relu_params *params) { float *input_data = input->data; float *output_data = output->data; @@ -40,9 +40,9 @@ static int csi_relun_f32(struct csi_tensor *input, return CSINN_TRUE; } -static int csi_relun_u8(struct csi_tensor *input, - struct csi_tensor *output, - struct relu_params *params) +int csi_relun_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct relu_params *params) { uint8_t *input_data = input->data; uint8_t *output_data = output->data; @@ -51,13 +51,13 @@ static int csi_relun_u8(struct csi_tensor *input, size = size * input->dim[i]; } - float n = csi_dequantize_f32(1, 0, params->n_multiplier, params->n_shift); + float n = csi_dequantize_u8_to_f32(1, 0, params->n_multiplier, params->n_shift); for (int i = 0; i < size; i++) { - float input_val = csi_dequantize_f32(input_data[i], input->offset, input->multiplier, + float input_val = csi_dequantize_u8_to_f32(input_data[i], input->zero_point, input->multiplier, input->shift); float res = relun(input_val, n); - output_data[i] = csi_quantize_f32(res, output->offset, output->multiplier, output->shift); + output_data[i] = csi_quantize_f32_to_u8(res, output->zero_point, output->multiplier, output->shift); } return CSINN_TRUE; } @@ -66,11 +66,8 @@ int csi_relun_init(struct csi_tensor *input, struct csi_tensor *output, struct relu_params *params) { - if (input->dtype == CSINN_DTYPE_UINT8) { - params->bc = 
csi_relun_u8; - } else if (input->dtype == CSINN_DTYPE_FLOAT32) { - params->bc = csi_relun_f32; - } else { + params->bc = csi_bc_map(params->api, CSINN_OP_RELUN, input->dtype); + if (params->bc == NULL) { return CSINN_UNSUPPORT_DTYPE; } return CSINN_TRUE; diff --git a/source/reference/reshape.c b/source/reference/reshape.c index 7403cf42..496b6fd8 100644 --- a/source/reference/reshape.c +++ b/source/reference/reshape.c @@ -19,9 +19,9 @@ #include "csi_nn.h" #include "csi_utils.h" -static int csi_reshape_f32(struct csi_tensor *input, - struct csi_tensor *output, - struct reshape_params *params) +int csi_reshape_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct reshape_params *params) { float *input_data = input->data; float *output_data = output->data; @@ -35,9 +35,9 @@ static int csi_reshape_f32(struct csi_tensor *input, return CSINN_TRUE; } -static int csi_reshape_u8(struct csi_tensor *input, - struct csi_tensor *output, - struct reshape_params *params) +int csi_reshape_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct reshape_params *params) { uint8_t *input_data = input->data; uint8_t *output_data = output->data; @@ -52,22 +52,19 @@ static int csi_reshape_u8(struct csi_tensor *input, } int csi_reshape_init(struct csi_tensor *input, - struct csi_tensor *output, - struct reshape_params *params) + struct csi_tensor *output, + struct reshape_params *params) { - if (input->dtype == CSINN_DTYPE_UINT8) { - params->bc = csi_reshape_u8; - } else if (input->dtype == CSINN_DTYPE_FLOAT32) { - params->bc = csi_reshape_f32; - } else { + params->bc = csi_bc_map(params->api, CSINN_OP_RESHAPE, input->dtype); + if (params->bc == NULL) { return CSINN_UNSUPPORT_DTYPE; } return CSINN_TRUE; } int csi_reshape(struct csi_tensor *input, - struct csi_tensor *output, - struct reshape_params *params) + struct csi_tensor *output, + struct reshape_params *params) { if (params->bc != NULL) { params->bc(input, output, params); diff --git 
a/source/reference/resize.c b/source/reference/resize.c index ac89a2bb..7eb5ed26 100644 --- a/source/reference/resize.c +++ b/source/reference/resize.c @@ -221,11 +221,11 @@ static void csi_resize_nearest_neighbor_u8(struct csi_tensor *input, struct csi_ } static void csi_resize_nearest_neighbor_nchw_u8(struct csi_tensor *o_input, struct csi_tensor *o_output, - bool align_corners) + bool align_corners) { - struct csi_tensor* input = csi_nchw_to_nhwc_u8(o_input); - struct csi_tensor* output = csi_nchw_to_nhwc_u8(o_output); + struct csi_tensor* input = csi_nchw_to_nhwc_8(o_input); + struct csi_tensor* output = csi_nchw_to_nhwc_8(o_output); uint8_t *input_data = input->data; uint8_t *output_data = output->data; @@ -267,7 +267,7 @@ static void csi_resize_nearest_neighbor_nchw_u8(struct csi_tensor *o_input, stru } input_ptr += batch_offset; } - csi_nhwc_to_nchw_u8(o_output, output); + csi_nhwc_to_nchw_8(o_output, output); } static void csi_resize_nearest_bicubic_u8(struct csi_tensor *input, struct csi_tensor *output, @@ -277,9 +277,9 @@ static void csi_resize_nearest_bicubic_u8(struct csi_tensor *input, struct csi_t assert(0); } -static int csi_resize_f32(struct csi_tensor *input, - struct csi_tensor *output, - struct resize_params *params) +int csi_resize_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct resize_params *params) { if (params->resize_mode == CSINN_RESIZE_BILINEAR) { csi_resize_bilinear_f32(input, output, params->align_corners); @@ -291,9 +291,9 @@ static int csi_resize_f32(struct csi_tensor *input, return CSINN_TRUE; } -static int csi_resize_u8(struct csi_tensor *input, - struct csi_tensor *output, - struct resize_params *params) +int csi_resize_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct resize_params *params) { if (params->resize_mode == CSINN_RESIZE_BILINEAR) { csi_resize_bilinear_u8(input, output, params->align_corners); @@ -313,11 +313,8 @@ int csi_resize_init(struct csi_tensor *input, struct csi_tensor 
*output, struct resize_params *params) { - if (input->dtype == CSINN_DTYPE_UINT8) { - params->bc = csi_resize_u8; - } else if (input->dtype == CSINN_DTYPE_FLOAT32) { - params->bc = csi_resize_f32; - } else { + params->bc = csi_bc_map(params->api, CSINN_OP_RESIZE, input->dtype); + if (params->bc == NULL) { return CSINN_UNSUPPORT_DTYPE; } return CSINN_TRUE; diff --git a/source/reference/reverse.c b/source/reference/reverse.c index 04429349..980eca19 100644 --- a/source/reference/reverse.c +++ b/source/reference/reverse.c @@ -19,21 +19,6 @@ #include "csi_nn.h" #include "csi_utils.h" - -// static void reverse_axis(float *start, float *end, int cnt, int step) -// { -// assert(start!=NULL && end!=NULL); -// for(int i=0; idata; float *output_data = (float *)output->data; @@ -77,9 +62,9 @@ static int csi_reverse_f32(struct csi_tensor *input, return CSINN_TRUE; } -static int csi_reverse_u8(struct csi_tensor *input, - struct csi_tensor *output, - struct reverse_params *params) +int csi_reverse_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct reverse_params *params) { uint8_t *input_data = (uint8_t *)input->data; uint8_t *output_data = (uint8_t *)output->data; @@ -115,11 +100,8 @@ int csi_reverse_init(struct csi_tensor *input, struct csi_tensor *output, struct reverse_params *params) { - if(input->dtype == CSINN_DTYPE_UINT8) { - params->bc = csi_reverse_u8; - } else if(input->dtype == CSINN_DTYPE_FLOAT32) { - params->bc = csi_reverse_f32; - } else { + params->bc = csi_bc_map(params->api, CSINN_OP_REVERSE, input->dtype); + if (params->bc == NULL) { return CSINN_UNSUPPORT_DTYPE; } return CSINN_TRUE; diff --git a/source/reference/roialign.c b/source/reference/roialign.c index 1a7ce4ca..7dd79df8 100644 --- a/source/reference/roialign.c +++ b/source/reference/roialign.c @@ -71,7 +71,7 @@ static float _bilinear(const float *data, int32_t batch, int32_t channel, return val; } -static int csi_roi_align_f32(struct csi_tensor *data, +int csi_roi_align_f32(struct 
csi_tensor *data, struct csi_tensor *rois, struct csi_tensor *output, struct roi_align_params *params) @@ -155,9 +155,8 @@ int csi_roi_align_init(struct csi_tensor *data, struct csi_tensor *output, struct roi_align_params *params) { - if (data->dtype == CSINN_DTYPE_FLOAT32) { - params->bc = csi_roi_align_f32; - } else { + params->bc = csi_bc_map(params->api, CSINN_OP_ROIALIGN, data->dtype); + if (params->bc == NULL) { return CSINN_UNSUPPORT_DTYPE; } return CSINN_TRUE; diff --git a/source/reference/roipool.c b/source/reference/roipool.c index ec39eec5..210849a9 100644 --- a/source/reference/roipool.c +++ b/source/reference/roipool.c @@ -17,20 +17,162 @@ */ #include "csi_nn.h" -#include +#include "csi_utils.h" +#include -int csi_roi_pool_init(struct csi_tensor *data, - struct csi_tensor *rois, - struct csi_tensor *output, - struct roi_pool_params *params) + +// https://github.com/pytorch/pytorch/blob/master/caffe2/operators/roi_pool_op.cc +// defalut input layout: NCHW +int csi_roipool_f32(struct csi_tensor *data, + struct csi_tensor *rois, + struct csi_tensor *output, + struct roi_pool_params *params) +{ + float *output_data = (float *)output->data; + float *bottom_data = (float *)data->data; + float *bottom_rois = (float *)rois->data; + + int batch = data->dim[0]; + int channel = data->dim[1]; + int height = data->dim[2]; + int width = data->dim[3]; + int num_rois = rois->dim[0]; + + int pooled_height = params->pooled_size_h; + int pooled_width = params->pooled_size_w; + + for(int n = 0; n < num_rois; n++) { + int roi_add = n * 5; + int roi_batch_idx = bottom_rois[roi_add]; + assert(roi_batch_idx < batch); + float roi_start_w = (float)(round(bottom_rois[roi_add + 1]) * params->spatial_scale); + float roi_start_h = (float)(round(bottom_rois[roi_add + 2]) * params->spatial_scale); + float roi_end_w = (float)(round(bottom_rois[roi_add + 3]) * params->spatial_scale); + float roi_end_h = (float)(round(bottom_rois[roi_add + 4]) * params->spatial_scale); + + float 
roi_height = fmaxf(roi_end_h - roi_start_h + 1, 1); + float roi_width = fmaxf(roi_end_w - roi_start_w + 1, 1); + float bin_size_h = (float)(roi_height) / (float)(pooled_height); + float bin_size_w = (float)(roi_width) / (float)(pooled_width); + + const float *batch_data = bottom_data + roi_batch_idx * channel * height * width; + + for (int c = 0; c < channel; ++c) { + for (int ph = 0; ph < pooled_height; ++ph) { + for (int pw = 0; pw < pooled_width; ++pw) { + // Compute pooling region for this output unit: + // start (included) = floor(ph * roi_height / pooled_height_) + // end (excluded) = ceil((ph + 1) * roi_height / pooled_height_) + int hstart = (int)(floor((float)(ph) * bin_size_h + roi_start_h)); + int wstart = (int)(floor((float)(pw) * bin_size_w + roi_start_w)); + int hend = (int)(ceil((float)(ph + 1) * bin_size_h + roi_start_h)); + int wend = (int)(ceil((float)(pw + 1) * bin_size_w + roi_start_w)); + hstart = fminf(fmaxf(hstart, 0), height); + hend = fminf(fmaxf(hend , 0), height); + wstart = fminf(fmaxf(wstart, 0), width); + wend = fminf(fmaxf(wend , 0), width); + + const int pool_index = ph * pooled_width + pw; + int is_empty = (hend <= hstart) || (wend <= wstart); + + *(output_data + pool_index) = is_empty ? 
0 : -FLT_MAX; + + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + int index = h * width + w; + if(*(batch_data + index) > *(output_data + pool_index)) { + *(output_data + pool_index) = *(output_data + pool_index); + } + } + } + } + } + // Increment all data pointers by one channel + batch_data += height * width; + output_data += pooled_height * pooled_width; + } + } + return CSINN_TRUE; +} + +int csi_roipool_u8(struct csi_tensor *data, + struct csi_tensor *rois, + struct csi_tensor *output, + struct roi_pool_params *params) +{ + uint8_t *output_data = (uint8_t *)output->data; + + // init output + struct csi_tensor float_output; + memcpy(&float_output, output, sizeof(struct csi_tensor)); + int64_t out_size = 1; + for(int i = 0; i < output->dim_count; i++) { + out_size *= output->dim[i]; + } + float *float_output_data = (float *)malloc(out_size * sizeof(float)); + float_output.data = float_output_data; + + // convert input(data) to float + struct csi_tensor float_data; + memcpy(&float_data, data, sizeof(struct csi_tensor)); + int64_t in_size = 1; + for(int i = 0; i < data->dim_count; i++) { + in_size *= data->dim[i]; + } + float *float_data_item_data = (float *)malloc(in_size * sizeof(float)); + + uint8_t *data_item_data = (uint8_t *)data->data; + for(int i = 0; i < in_size; i++) { + float_data_item_data[i] = csi_dequantize_u8_to_f32(data_item_data[i], data->zero_point, data->multiplier, data->shift); + } + float_data.data = float_data_item_data; + + // convert input(rois) to float + struct csi_tensor float_rois; + memcpy(&float_rois, rois, sizeof(struct csi_tensor)); + int64_t rois_size = 1; + for(int i = 0; i < rois->dim_count; i++) { + rois_size *= rois->dim[i]; + } + float *float_rois_item_data = (float *)malloc(rois_size * sizeof(float)); + + uint8_t *rois_item_data = (uint8_t *)rois->data; + for(int i = 0; i < rois_size; i++) { + float_rois_item_data[i] = csi_dequantize_u8_to_f32(rois_item_data[i], rois->zero_point, 
rois->multiplier, rois->shift); + } + float_rois.data = float_rois_item_data; + + // convert params to float + params->spatial_scale = csi_dequantize_u8_to_f32(1.0, 0, params->spatial_scale_multiplier, params->spatial_scale_shift); + + csi_roipool_f32(&float_data, &float_rois, &float_output, params); + + for(int i = 0; i < out_size; i++) { + output_data[i] = csi_quantize_f32_to_u8(float_output_data[i], output->zero_point, output->multiplier, output->shift); + } + + free(float_data_item_data); + free(float_rois_item_data); + free(float_output_data); + return CSINN_TRUE; +} + +int csi_roipool_init(struct csi_tensor *data, + struct csi_tensor *rois, + struct csi_tensor *output, + struct roi_pool_params *params) { - return CSINN_FALSE; + params->bc = csi_bc_map(params->api, CSINN_OP_ROIPOOL, data->dtype); + if (params->bc == NULL) { + return CSINN_UNSUPPORT_DTYPE; + } + return CSINN_TRUE; } -int csi_roi_pool(struct csi_tensor *data, - struct csi_tensor *rois, - struct csi_tensor *output, - struct roi_pool_params *params) +int csi_roipool(struct csi_tensor *data, + struct csi_tensor *rois, + struct csi_tensor *output, + struct roi_pool_params *params) { if (params->bc != NULL) { params->bc(data, rois, output, params); diff --git a/source/reference/round.c b/source/reference/round.c index 75c000a1..2024fb49 100644 --- a/source/reference/round.c +++ b/source/reference/round.c @@ -20,9 +20,9 @@ #include "csi_utils.h" #include -static int csi_round_f32(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csi_round_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct siso_params *params) { float *input_data = input->data; float *output_data = output->data; @@ -37,9 +37,9 @@ static int csi_round_f32(struct csi_tensor *input, return CSINN_TRUE; } -static int csi_round_u8(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csi_round_u8(struct csi_tensor *input, + struct csi_tensor *output, 
+ struct siso_params *params) { uint8_t *input_data = input->data; uint8_t *output_data = output->data; @@ -49,11 +49,11 @@ static int csi_round_u8(struct csi_tensor *input, } for (int i = 0; i < size; i++) { - float input0_val = csi_dequantize_f32(input_data[i], input->offset, input->multiplier, + float input0_val = csi_dequantize_u8_to_f32(input_data[i], input->zero_point, input->multiplier, input->shift); float res = round(input0_val); - output_data[i] = csi_quantize_f32(res, output->offset, output->multiplier, output->shift); + output_data[i] = csi_quantize_f32_to_u8(res, output->zero_point, output->multiplier, output->shift); } return CSINN_TRUE; } @@ -62,11 +62,8 @@ int csi_round_init(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params) { - if (input->dtype == CSINN_DTYPE_UINT8) { - params->bc = csi_round_u8; - } else if (input->dtype == CSINN_DTYPE_FLOAT32) { - params->bc = csi_round_f32; - } else { + params->bc = csi_bc_map(params->api, CSINN_OP_ROUND, input->dtype); + if (params->bc == NULL) { return CSINN_UNSUPPORT_DTYPE; } return CSINN_TRUE; diff --git a/source/reference/rsqrt.c b/source/reference/rsqrt.c index 02e9c2c3..504bb860 100644 --- a/source/reference/rsqrt.c +++ b/source/reference/rsqrt.c @@ -20,7 +20,7 @@ #include "csi_utils.h" #include -static int csi_rsqrt_f32(struct csi_tensor *input, +int csi_rsqrt_f32(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params) { @@ -37,7 +37,7 @@ static int csi_rsqrt_f32(struct csi_tensor *input, return CSINN_TRUE; } -static int csi_rsqrt_u8(struct csi_tensor *input, +int csi_rsqrt_u8(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params) { @@ -49,33 +49,29 @@ static int csi_rsqrt_u8(struct csi_tensor *input, } for (int i = 0; i < size; i++) { - float input0_val = csi_dequantize_f32(input_data[i], input->offset, input->multiplier, + float input0_val = csi_dequantize_u8_to_f32(input_data[i], input->zero_point, input->multiplier, 
input->shift); float res = 1.0/sqrt(input0_val); - output_data[i] = csi_quantize_f32(res, output->offset, output->multiplier, output->shift); + output_data[i] = csi_quantize_f32_to_u8(res, output->zero_point, output->multiplier, output->shift); } return CSINN_TRUE; } - int csi_rsqrt_init(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) + struct csi_tensor *output, + struct siso_params *params) { - if (input->dtype == CSINN_DTYPE_UINT8) { - params->bc = csi_rsqrt_u8; - } else if (input->dtype == CSINN_DTYPE_FLOAT32) { - params->bc = csi_rsqrt_f32; - } else { + params->bc = csi_bc_map(params->api, CSINN_OP_RSQRT, input->dtype); + if (params->bc == NULL) { return CSINN_UNSUPPORT_DTYPE; } return CSINN_TRUE; } int csi_rsqrt(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) + struct csi_tensor *output, + struct siso_params *params) { if (params->bc != NULL) { params->bc(input, output, params); diff --git a/source/reference/segment_max.c b/source/reference/segment_max.c index 7e3124c5..5fc76dd1 100644 --- a/source/reference/segment_max.c +++ b/source/reference/segment_max.c @@ -19,10 +19,10 @@ #include "csi_nn.h" #include "csi_utils.h" -static int csi_unsorted_segment_max_f32(struct csi_tensor *input, - struct csi_tensor *segment_ids, - struct csi_tensor *output, - struct segment_params *params) +int csi_unsorted_segment_max_f32(struct csi_tensor *input, + struct csi_tensor *segment_ids, + struct csi_tensor *output, + struct segment_params *params) { float *input_data = input->data; int *segment_data = segment_ids->data; @@ -67,10 +67,10 @@ static int csi_unsorted_segment_max_f32(struct csi_tensor *input, return CSINN_TRUE; } -static int csi_segment_max_f32(struct csi_tensor *input, - struct csi_tensor *segment_ids, - struct csi_tensor *output, - struct segment_params *params) +int csi_segment_max_f32(struct csi_tensor *input, + struct csi_tensor *segment_ids, + struct csi_tensor *output, + struct 
segment_params *params) { float *input_data = input->data; int *segment_data = segment_ids->data; @@ -118,10 +118,10 @@ static int csi_segment_max_f32(struct csi_tensor *input, return CSINN_TRUE; } -static int csi_unsorted_segment_max_u8(struct csi_tensor *input, - struct csi_tensor *segment_ids, - struct csi_tensor *output, - struct segment_params *params) +int csi_unsorted_segment_max_u8(struct csi_tensor *input, + struct csi_tensor *segment_ids, + struct csi_tensor *output, + struct segment_params *params) { uint8_t *input_data = input->data; int *segment_data = segment_ids->data; @@ -166,10 +166,10 @@ static int csi_unsorted_segment_max_u8(struct csi_tensor *input, return CSINN_TRUE; } -static int csi_segment_max_u8(struct csi_tensor *input, - struct csi_tensor *segment_ids, - struct csi_tensor *output, - struct segment_params *params) +int csi_segment_max_u8(struct csi_tensor *input, + struct csi_tensor *segment_ids, + struct csi_tensor *output, + struct segment_params *params) { uint8_t *input_data = input->data; int *segment_data = segment_ids->data; @@ -218,32 +218,28 @@ static int csi_segment_max_u8(struct csi_tensor *input, } int csi_segment_max_init(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct segment_params *params) + struct csi_tensor *input1, + struct csi_tensor *output, + struct segment_params *params) { - if (input0->dtype == CSINN_DTYPE_UINT8) { - if (params->unsorted == CSINN_TRUE) { - params->bc = csi_unsorted_segment_max_u8; - } else { - params->bc = csi_segment_max_u8; - } - } else if (input0->dtype == CSINN_DTYPE_FLOAT32) { - if (params->unsorted == CSINN_TRUE) { - params->bc = csi_unsorted_segment_max_f32; - } else { - params->bc = csi_segment_max_f32; - } + if (params->unsorted == CSINN_TRUE) { + params->bc = csi_bc_map(params->api, CSINN_OP_UNSORTED_SEGMENT_MAX, input0->dtype); + if (params->bc == NULL) { + return CSINN_UNSUPPORT_DTYPE; + } } else { - return CSINN_UNSUPPORT_DTYPE; + 
params->bc = csi_bc_map(params->api, CSINN_OP_SEGMENT_MAX, input0->dtype); + if (params->bc == NULL) { + return CSINN_UNSUPPORT_DTYPE; + } } return CSINN_TRUE; } int csi_segment_max(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct segment_params *params) + struct csi_tensor *input1, + struct csi_tensor *output, + struct segment_params *params) { if (params->bc != NULL) { params->bc(input0, input1, output, params); diff --git a/source/reference/segment_mean.c b/source/reference/segment_mean.c index 63afe8e1..08504023 100644 --- a/source/reference/segment_mean.c +++ b/source/reference/segment_mean.c @@ -19,10 +19,10 @@ #include "csi_nn.h" #include "csi_utils.h" -static int csi_unsorted_segment_mean_f32(struct csi_tensor *input, - struct csi_tensor *segment_ids, - struct csi_tensor *output, - struct segment_params *params) +int csi_unsorted_segment_mean_f32(struct csi_tensor *input, + struct csi_tensor *segment_ids, + struct csi_tensor *output, + struct segment_params *params) { float *input_data = input->data; int *segment_data = segment_ids->data; @@ -70,10 +70,10 @@ static int csi_unsorted_segment_mean_f32(struct csi_tensor *input, return CSINN_TRUE; } -static int csi_segment_mean_f32(struct csi_tensor *input, - struct csi_tensor *segment_ids, - struct csi_tensor *output, - struct segment_params *params) +int csi_segment_mean_f32(struct csi_tensor *input, + struct csi_tensor *segment_ids, + struct csi_tensor *output, + struct segment_params *params) { float *input_data = input->data; int *segment_data = segment_ids->data; @@ -123,10 +123,10 @@ static int csi_segment_mean_f32(struct csi_tensor *input, return CSINN_TRUE; } -static int csi_unsorted_segment_mean_u8(struct csi_tensor *input, - struct csi_tensor *segment_ids, - struct csi_tensor *output, - struct segment_params *params) +int csi_unsorted_segment_mean_u8(struct csi_tensor *input, + struct csi_tensor *segment_ids, + struct csi_tensor *output, + struct 
segment_params *params) { uint8_t *input_data = input->data; int *segment_data = segment_ids->data; @@ -143,7 +143,7 @@ static int csi_unsorted_segment_mean_u8(struct csi_tensor *input, for(int w = 0; w < input->dim[2]; w++) { for(int c = 0; c < input->dim[3]; c++) { int output_index = csi_get_index(input->dim, n, h, w, c); - output_data[output_index] = csi_quantize_f32(0, output->offset, + output_data[output_index] = csi_quantize_f32_to_u8(0, output->zero_point, output->multiplier, output->shift); } } @@ -165,12 +165,12 @@ static int csi_unsorted_segment_mean_u8(struct csi_tensor *input, float temp_sum = 0; for(int k = 0; k < num; k++) { int32_t input_index = csi_get_index(input->dim, index[k], h, w, c); - float input_value = csi_dequantize_f32(input_data[input_index], input->offset, + float input_value = csi_dequantize_u8_to_f32(input_data[input_index], input->zero_point, input->multiplier, input->shift); temp_sum += input_value; } float mean_value = temp_sum / mean_n; - output_data[output_index] = csi_quantize_f32(mean_value, output->offset, + output_data[output_index] = csi_quantize_f32_to_u8(mean_value, output->zero_point, output->multiplier, output->shift); } } @@ -182,10 +182,10 @@ static int csi_unsorted_segment_mean_u8(struct csi_tensor *input, return CSINN_TRUE; } -static int csi_segment_mean_u8(struct csi_tensor *input, - struct csi_tensor *segment_ids, - struct csi_tensor *output, - struct segment_params *params) +int csi_segment_mean_u8(struct csi_tensor *input, + struct csi_tensor *segment_ids, + struct csi_tensor *output, + struct segment_params *params) { uint8_t *input_data = input->data; int *segment_data = segment_ids->data; @@ -203,7 +203,7 @@ static int csi_segment_mean_u8(struct csi_tensor *input, for(int w = 0; w < input->dim[2]; w++) { for(int c = 0; c < input->dim[3]; c++) { int output_index = csi_get_index(input->dim, n, h, w, c); - output_data[output_index] = csi_quantize_f32(0, output->offset, + output_data[output_index] = 
csi_quantize_f32_to_u8(0, output->zero_point, output->multiplier, output->shift); } } @@ -226,12 +226,12 @@ static int csi_segment_mean_u8(struct csi_tensor *input, float temp_sum = 0; for(int k = 0; k < num; k++) { int32_t input_index = csi_get_index(input->dim, index[k], h, w, c); - float input_value = csi_dequantize_f32(input_data[input_index], input->offset, + float input_value = csi_dequantize_u8_to_f32(input_data[input_index], input->zero_point, input->multiplier, input->shift); temp_sum += input_value; } float mean_value = temp_sum / mean_n; - output_data[output_index] = csi_quantize_f32(mean_value, output->offset, + output_data[output_index] = csi_quantize_f32_to_u8(mean_value, output->zero_point, output->multiplier, output->shift); } } @@ -243,24 +243,20 @@ static int csi_segment_mean_u8(struct csi_tensor *input, } int csi_segment_mean_init(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct segment_params *params) + struct csi_tensor *input1, + struct csi_tensor *output, + struct segment_params *params) { - if (input0->dtype == CSINN_DTYPE_UINT8) { - if (params->unsorted == CSINN_TRUE) { - params->bc = csi_unsorted_segment_mean_u8; - } else { - params->bc = csi_segment_mean_u8; - } - } else if (input0->dtype == CSINN_DTYPE_FLOAT32) { - if (params->unsorted == CSINN_TRUE) { - params->bc = csi_unsorted_segment_mean_f32; - } else { - params->bc = csi_segment_mean_f32; + if (params->unsorted == CSINN_TRUE) { + params->bc = csi_bc_map(params->api, CSINN_OP_UNSORTED_SEGMENT_MEAN, input0->dtype); + if (params->bc == NULL) { + return CSINN_UNSUPPORT_DTYPE; } } else { - return CSINN_UNSUPPORT_DTYPE; + params->bc = csi_bc_map(params->api, CSINN_OP_SEGMENT_MEAN, input0->dtype); + if (params->bc == NULL) { + return CSINN_UNSUPPORT_DTYPE; + } } return CSINN_TRUE; } diff --git a/source/reference/segment_min.c b/source/reference/segment_min.c index 342e9c61..612dda73 100644 --- a/source/reference/segment_min.c +++ 
b/source/reference/segment_min.c @@ -19,10 +19,10 @@ #include "csi_nn.h" #include "csi_utils.h" -static int csi_unsorted_segment_min_f32(struct csi_tensor *input, - struct csi_tensor *segment_ids, - struct csi_tensor *output, - struct segment_params *params) +int csi_unsorted_segment_min_f32(struct csi_tensor *input, + struct csi_tensor *segment_ids, + struct csi_tensor *output, + struct segment_params *params) { float *input_data = input->data; int *segment_data = segment_ids->data; @@ -67,10 +67,10 @@ static int csi_unsorted_segment_min_f32(struct csi_tensor *input, return CSINN_TRUE; } -static int csi_segment_min_f32(struct csi_tensor *input, - struct csi_tensor *segment_ids, - struct csi_tensor *output, - struct segment_params *params) +int csi_segment_min_f32(struct csi_tensor *input, + struct csi_tensor *segment_ids, + struct csi_tensor *output, + struct segment_params *params) { float *input_data = input->data; int *segment_data = segment_ids->data; @@ -118,10 +118,10 @@ static int csi_segment_min_f32(struct csi_tensor *input, return CSINN_TRUE; } -static int csi_unsorted_segment_min_u8(struct csi_tensor *input, - struct csi_tensor *segment_ids, - struct csi_tensor *output, - struct segment_params *params) +int csi_unsorted_segment_min_u8(struct csi_tensor *input, + struct csi_tensor *segment_ids, + struct csi_tensor *output, + struct segment_params *params) { uint8_t *input_data = input->data; int *segment_data = segment_ids->data; @@ -166,10 +166,10 @@ static int csi_unsorted_segment_min_u8(struct csi_tensor *input, return CSINN_TRUE; } -static int csi_segment_min_u8(struct csi_tensor *input, - struct csi_tensor *segment_ids, - struct csi_tensor *output, - struct segment_params *params) +int csi_segment_min_u8(struct csi_tensor *input, + struct csi_tensor *segment_ids, + struct csi_tensor *output, + struct segment_params *params) { uint8_t *input_data = input->data; int *segment_data = segment_ids->data; @@ -218,24 +218,20 @@ static int 
csi_segment_min_u8(struct csi_tensor *input, } int csi_segment_min_init(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct segment_params *params) + struct csi_tensor *input1, + struct csi_tensor *output, + struct segment_params *params) { - if (input0->dtype == CSINN_DTYPE_UINT8) { - if (params->unsorted == CSINN_TRUE) { - params->bc = csi_unsorted_segment_min_u8; - } else { - params->bc = csi_segment_min_u8; - } - } else if (input0->dtype == CSINN_DTYPE_FLOAT32) { - if (params->unsorted == CSINN_TRUE) { - params->bc = csi_unsorted_segment_min_f32; - } else { - params->bc = csi_segment_min_f32; - } + if (params->unsorted == CSINN_TRUE) { + params->bc = csi_bc_map(params->api, CSINN_OP_UNSORTED_SEGMENT_MIN, input0->dtype); + if (params->bc == NULL) { + return CSINN_UNSUPPORT_DTYPE; + } } else { - return CSINN_UNSUPPORT_DTYPE; + params->bc = csi_bc_map(params->api, CSINN_OP_SEGMENT_MIN, input0->dtype); + if (params->bc == NULL) { + return CSINN_UNSUPPORT_DTYPE; + } } return CSINN_TRUE; } diff --git a/source/reference/segment_prod.c b/source/reference/segment_prod.c index e55b6307..dfb240bb 100644 --- a/source/reference/segment_prod.c +++ b/source/reference/segment_prod.c @@ -19,10 +19,10 @@ #include "csi_nn.h" #include "csi_utils.h" -static int csi_unsorted_segment_prod_f32(struct csi_tensor *input, - struct csi_tensor *segment_ids, - struct csi_tensor *output, - struct segment_params *params) +int csi_unsorted_segment_prod_f32(struct csi_tensor *input, + struct csi_tensor *segment_ids, + struct csi_tensor *output, + struct segment_params *params) { float *input_data = input->data; int *segment_data = segment_ids->data; @@ -65,10 +65,10 @@ static int csi_unsorted_segment_prod_f32(struct csi_tensor *input, return CSINN_TRUE; } -static int csi_segment_prod_f32(struct csi_tensor *input, - struct csi_tensor *segment_ids, - struct csi_tensor *output, - struct segment_params *params) +int csi_segment_prod_f32(struct csi_tensor 
*input, + struct csi_tensor *segment_ids, + struct csi_tensor *output, + struct segment_params *params) { float *input_data = input->data; int *segment_data = segment_ids->data; @@ -114,10 +114,10 @@ static int csi_segment_prod_f32(struct csi_tensor *input, return CSINN_TRUE; } -static int csi_unsorted_segment_prod_u8(struct csi_tensor *input, - struct csi_tensor *segment_ids, - struct csi_tensor *output, - struct segment_params *params) +int csi_unsorted_segment_prod_u8(struct csi_tensor *input, + struct csi_tensor *segment_ids, + struct csi_tensor *output, + struct segment_params *params) { uint8_t *input_data = input->data; int *segment_data = segment_ids->data; @@ -134,7 +134,7 @@ static int csi_unsorted_segment_prod_u8(struct csi_tensor *input, for(int w = 0; w < input->dim[2]; w++) { for(int c = 0; c < input->dim[3]; c++) { int output_index = csi_get_index(input->dim, n, h, w, c); - output_data[output_index] = csi_quantize_f32(1, output->offset, + output_data[output_index] = csi_quantize_f32_to_u8(1, output->zero_point, output->multiplier, output->shift); } } @@ -155,11 +155,11 @@ static int csi_unsorted_segment_prod_u8(struct csi_tensor *input, float temp_sum = 1; for(int k = 0; k < num; k++) { int32_t input_index = csi_get_index(input->dim, index[k], h, w, c); - float input_value = csi_dequantize_f32(input_data[input_index], input->offset, + float input_value = csi_dequantize_u8_to_f32(input_data[input_index], input->zero_point, input->multiplier, input->shift); temp_sum *= input_value; } - output_data[output_index] = csi_quantize_f32(temp_sum, output->offset, + output_data[output_index] = csi_quantize_f32_to_u8(temp_sum, output->zero_point, output->multiplier, output->shift); } } @@ -171,10 +171,10 @@ static int csi_unsorted_segment_prod_u8(struct csi_tensor *input, return CSINN_TRUE; } -static int csi_segment_prod_u8(struct csi_tensor *input, - struct csi_tensor *segment_ids, - struct csi_tensor *output, - struct segment_params *params) +int 
csi_segment_prod_u8(struct csi_tensor *input, + struct csi_tensor *segment_ids, + struct csi_tensor *output, + struct segment_params *params) { uint8_t *input_data = input->data; int *segment_data = segment_ids->data; @@ -192,7 +192,7 @@ static int csi_segment_prod_u8(struct csi_tensor *input, for(int w = 0; w < input->dim[2]; w++) { for(int c = 0; c < input->dim[3]; c++) { int output_index = csi_get_index(input->dim, n, h, w, c); - output_data[output_index] = csi_quantize_f32(1, output->offset, + output_data[output_index] = csi_quantize_f32_to_u8(1, output->zero_point, output->multiplier, output->shift); } } @@ -215,11 +215,11 @@ static int csi_segment_prod_u8(struct csi_tensor *input, float temp_sum = 1; for(int k = 0; k < num; k++) { int32_t input_index = csi_get_index(input->dim, index[k], h, w, c); - float input_value = csi_dequantize_f32(input_data[input_index], input->offset, + float input_value = csi_dequantize_u8_to_f32(input_data[input_index], input->zero_point, input->multiplier, input->shift); temp_sum *= input_value; } - output_data[output_index] = csi_quantize_f32(temp_sum, output->offset, + output_data[output_index] = csi_quantize_f32_to_u8(temp_sum, output->zero_point, output->multiplier, output->shift); } } @@ -231,24 +231,20 @@ static int csi_segment_prod_u8(struct csi_tensor *input, } int csi_segment_prod_init(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct segment_params *params) + struct csi_tensor *input1, + struct csi_tensor *output, + struct segment_params *params) { - if (input0->dtype == CSINN_DTYPE_UINT8) { - if (params->unsorted == CSINN_TRUE) { - params->bc = csi_unsorted_segment_prod_u8; - } else { - params->bc = csi_segment_prod_u8; - } - } else if (input0->dtype == CSINN_DTYPE_FLOAT32) { - if (params->unsorted == CSINN_TRUE) { - params->bc = csi_unsorted_segment_prod_f32; - } else { - params->bc = csi_segment_prod_f32; + if (params->unsorted == CSINN_TRUE) { + params->bc = 
csi_bc_map(params->api, CSINN_OP_UNSORTED_SEGMENT_PROD, input0->dtype); + if (params->bc == NULL) { + return CSINN_UNSUPPORT_DTYPE; } } else { - return CSINN_UNSUPPORT_DTYPE; + params->bc = csi_bc_map(params->api, CSINN_OP_SEGMENT_PROD, input0->dtype); + if (params->bc == NULL) { + return CSINN_UNSUPPORT_DTYPE; + } } return CSINN_TRUE; } diff --git a/source/reference/segment_sum.c b/source/reference/segment_sum.c index 591f630d..e5974806 100644 --- a/source/reference/segment_sum.c +++ b/source/reference/segment_sum.c @@ -19,10 +19,10 @@ #include "csi_nn.h" #include "csi_utils.h" -static int csi_unsorted_segment_sum_f32(struct csi_tensor *input, - struct csi_tensor *segment_ids, - struct csi_tensor *output, - struct segment_params *params) +int csi_unsorted_segment_sum_f32(struct csi_tensor *input, + struct csi_tensor *segment_ids, + struct csi_tensor *output, + struct segment_params *params) { float *input_data = input->data; int *segment_data = segment_ids->data; @@ -66,10 +66,10 @@ static int csi_unsorted_segment_sum_f32(struct csi_tensor *input, return CSINN_TRUE; } -static int csi_segment_sum_f32(struct csi_tensor *input, - struct csi_tensor *segment_ids, - struct csi_tensor *output, - struct segment_params *params) +int csi_segment_sum_f32(struct csi_tensor *input, + struct csi_tensor *segment_ids, + struct csi_tensor *output, + struct segment_params *params) { float *input_data = input->data; int *segment_data = segment_ids->data; @@ -115,10 +115,10 @@ static int csi_segment_sum_f32(struct csi_tensor *input, return CSINN_TRUE; } -static int csi_unsorted_segment_sum_u8(struct csi_tensor *input, - struct csi_tensor *segment_ids, - struct csi_tensor *output, - struct segment_params *params) +int csi_unsorted_segment_sum_u8(struct csi_tensor *input, + struct csi_tensor *segment_ids, + struct csi_tensor *output, + struct segment_params *params) { uint8_t *input_data = input->data; int *segment_data = segment_ids->data; @@ -135,7 +135,7 @@ static int 
csi_unsorted_segment_sum_u8(struct csi_tensor *input, for(int w = 0; w < input->dim[2]; w++) { for(int c = 0; c < input->dim[3]; c++) { int output_index = csi_get_index(input->dim, n, h, w, c); - output_data[output_index] = csi_quantize_f32(0, output->offset, + output_data[output_index] = csi_quantize_f32_to_u8(0, output->zero_point, output->multiplier, output->shift); } } @@ -157,11 +157,11 @@ static int csi_unsorted_segment_sum_u8(struct csi_tensor *input, float temp_sum = 0; for(int k = 0; k < num; k++) { int32_t input_index = csi_get_index(input->dim, index[k], h, w, c); - float input_value = csi_dequantize_f32(input_data[input_index], input->offset, + float input_value = csi_dequantize_u8_to_f32(input_data[input_index], input->zero_point, input->multiplier, input->shift); temp_sum += input_value; } - output_data[output_index] = csi_quantize_f32(temp_sum, output->offset, + output_data[output_index] = csi_quantize_f32_to_u8(temp_sum, output->zero_point, output->multiplier, output->shift); } } @@ -173,10 +173,10 @@ static int csi_unsorted_segment_sum_u8(struct csi_tensor *input, return CSINN_TRUE; } -static int csi_segment_sum_u8(struct csi_tensor *input, - struct csi_tensor *segment_ids, - struct csi_tensor *output, - struct segment_params *params) +int csi_segment_sum_u8(struct csi_tensor *input, + struct csi_tensor *segment_ids, + struct csi_tensor *output, + struct segment_params *params) { uint8_t *input_data = input->data; int *segment_data = segment_ids->data; @@ -194,7 +194,7 @@ static int csi_segment_sum_u8(struct csi_tensor *input, for(int w = 0; w < input->dim[2]; w++) { for(int c = 0; c < input->dim[3]; c++) { int output_index = csi_get_index(input->dim, n, h, w, c); - output_data[output_index] = csi_quantize_f32(0, output->offset, + output_data[output_index] = csi_quantize_f32_to_u8(0, output->zero_point, output->multiplier, output->shift); } } @@ -217,11 +217,11 @@ static int csi_segment_sum_u8(struct csi_tensor *input, float temp_sum = 0; for(int k 
= 0; k < num; k++) { int32_t input_index = csi_get_index(input->dim, index[k], h, w, c); - float input_value = csi_dequantize_f32(input_data[input_index], input->offset, + float input_value = csi_dequantize_u8_to_f32(input_data[input_index], input->zero_point, input->multiplier, input->shift); temp_sum += input_value; } - output_data[output_index] = csi_quantize_f32(temp_sum, output->offset, + output_data[output_index] = csi_quantize_f32_to_u8(temp_sum, output->zero_point, output->multiplier, output->shift); } } @@ -233,24 +233,20 @@ static int csi_segment_sum_u8(struct csi_tensor *input, } int csi_segment_sum_init(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct segment_params *params) + struct csi_tensor *input1, + struct csi_tensor *output, + struct segment_params *params) { - if (input0->dtype == CSINN_DTYPE_UINT8) { - if (params->unsorted == CSINN_TRUE) { - params->bc = csi_unsorted_segment_sum_u8; - } else { - params->bc = csi_segment_sum_u8; - } - } else if (input0->dtype == CSINN_DTYPE_FLOAT32) { - if (params->unsorted == CSINN_TRUE) { - params->bc = csi_unsorted_segment_sum_f32; - } else { - params->bc = csi_segment_sum_f32; + if (params->unsorted == CSINN_TRUE) { + params->bc = csi_bc_map(params->api, CSINN_OP_UNSORTED_SEGMENT_SUM, input0->dtype); + if (params->bc == NULL) { + return CSINN_UNSUPPORT_DTYPE; } } else { - return CSINN_UNSUPPORT_DTYPE; + params->bc = csi_bc_map(params->api, CSINN_OP_SEGMENT_SUM, input0->dtype); + if (params->bc == NULL) { + return CSINN_UNSUPPORT_DTYPE; + } } return CSINN_TRUE; } diff --git a/source/reference/select.c b/source/reference/select.c index df3020b2..73ad7b03 100644 --- a/source/reference/select.c +++ b/source/reference/select.c @@ -19,11 +19,11 @@ #include "csi_nn.h" #include "csi_utils.h" -static int csi_select_f32(struct csi_tensor *condition, - struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct select_params *params) +int 
csi_select_f32(struct csi_tensor *condition, + struct csi_tensor *input0, + struct csi_tensor *input1, + struct csi_tensor *output, + struct select_params *params) { float *input_data0 = input0->data; float *input_data1 = input1->data; @@ -40,11 +40,11 @@ static int csi_select_f32(struct csi_tensor *condition, return CSINN_TRUE; } -static int csi_select_u8(struct csi_tensor *condition, - struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct select_params *params) +int csi_select_u8(struct csi_tensor *condition, + struct csi_tensor *input0, + struct csi_tensor *input1, + struct csi_tensor *output, + struct select_params *params) { uint8_t *input_data0 = input0->data; uint8_t *input_data1 = input1->data; @@ -67,11 +67,8 @@ int csi_select_init(struct csi_tensor *condition, struct csi_tensor *output, struct select_params *params) { - if (input0->dtype == CSINN_DTYPE_UINT8) { - params->bc = csi_select_u8; - } else if (input0->dtype == CSINN_DTYPE_FLOAT32) { - params->bc = csi_select_f32; - } else { + params->bc = csi_bc_map(params->api, CSINN_OP_SELECT, input0->dtype); + if (params->bc == NULL) { return CSINN_UNSUPPORT_DTYPE; } return CSINN_TRUE; diff --git a/source/reference/setup.c b/source/reference/setup.c index b4f709ad..dd9526d2 100644 --- a/source/reference/setup.c +++ b/source/reference/setup.c @@ -18,41 +18,418 @@ #include "csi_nn.h" #include "csi_utils.h" +#include "csi_internal_ref.h" void csi_nn_init(struct csi_tensor *input, struct csi_tensor *output) { - float *input_data = input->data; - uint8_t *output_data = output->data; int size = 1; for (int i = 0; i < input->dim_count; i++) { size = size * input->dim[i]; } - - for (int i = 0; i < size; i++) { - int32_t input_val = round(input_data[i] / output->scale) + output->zero_point;; - if (input_val < 0) { - input_val = 0; - } else if (input_val > 255) { - input_val = 255; + if (output->dtype == CSINN_DTYPE_UINT8){ + float *input_data = input->data; + uint8_t 
*output_data = output->data; + for (int i = 0; i < size; i++) { + int32_t input_val = round(input_data[i] / output->scale) + output->zero_point; + if (input_val < 0) { + input_val = 0; + } else if (input_val > 255) { + input_val = 255; + } + output_data[i] = input_val; + } + }else if (output->dtype == CSINN_DTYPE_INT8){ + float *input_data = input->data; + int8_t *output_data = output->data; + for (int i = 0; i < size; i++) { + int32_t input_val = round(input_data[i] / output->scale) + output->zero_point; + if (input_val < -127) { + input_val = 0; + } else if (input_val > 127) { + input_val = 127; + } + output_data[i] = input_val; } - output_data[i] = input_val; } } - void csi_nn_deinit(struct csi_tensor *input, struct csi_tensor *output) { - uint8_t *input_data = input->data; - float *output_data = output->data; int size = 1; for (int i = 0; i < input->dim_count; i++) { size = size * input->dim[i]; } + if (input->dtype == CSINN_DTYPE_UINT8){ + uint8_t *input_data = input->data; + float *output_data = output->data; + for (int i = 0; i < size; i++) { + float input_val = csi_dequantize_u8_to_f32(input_data[i], input->zero_point, input->multiplier, input->shift); + output_data[i] = input_val; + } + } else if (input->dtype == CSINN_DTYPE_INT8){ + int8_t *input_data = input->data; + float *output_data = output->data; + for (int i = 0; i < size; i++) { + float input_val = csi_dequantize_i8_to_f32(input_data[i], 0, input->multiplier, input->shift); + output_data[i] = input_val; + } + } +} + +struct csi_tensor *csi_alloc_tensor(struct csi_session *session) +{ + struct csi_tensor *ret = calloc(1, sizeof(struct csi_tensor)); + ret->dtype = session->base_dtype; + ret->layout = session->base_layout; + ret->sess = session; + return ret; +} - for (int i = 0; i < size; i++) { - float input_val = csi_dequantize_f32(input_data[i], input->offset, input->multiplier, input->shift); - output_data[i] = input_val; +struct csi_session *csi_alloc_session() +{ + return calloc(1, 
sizeof(struct csi_session)); +} + +void csi_free_session(struct csi_session *sess) +{ + free(sess); +} + +void* csi_bc_map_table_ref[CSINN_OP_SIZE][CSINN_DTYPE_SIZE] = { + {csi_abs_u8, NULL, NULL, NULL, NULL, NULL, NULL, csi_abs_f32, NULL}, /* CSINN_OP_ABS */ + {csi_acos_u8, NULL, NULL, NULL, NULL, NULL, NULL, csi_acos_f32, NULL}, /* CSINN_OP_ACOS */ + {csi_acosh_u8, NULL, NULL, NULL, NULL, NULL, NULL, csi_acosh_f32, NULL}, /* CSINN_OP_ACOSH */ + {csi_add_u8, NULL, NULL, NULL, NULL, NULL, NULL, csi_add_f32, NULL}, /* CSINN_OP_ADD */ + {NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL}, /* CSINN_OP_ALL */ + {csi_and_u8, NULL, NULL, NULL, csi_and_u32, NULL, NULL, NULL, NULL}, /* CSINN_OP_AND */ + {NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL}, /* CSINN_OP_ANY */ + {csi_arange_u8, NULL, NULL, NULL, NULL, NULL, NULL, csi_arange_f32, NULL}, /* CSINN_OP_ARANGE */ + {csi_argmax_stride_i32_u8, NULL, NULL, NULL, NULL, NULL, NULL, csi_argmax_stride_i32_f32, NULL}, /* CSINN_OP_ARGMAX */ + {csi_argmin_stride_i32_u8, NULL, NULL, NULL, NULL, NULL, NULL, csi_argmin_stride_i32_f32, NULL}, /* CSINN_OP_ARGMIN */ + {csi_asin_u8, NULL, NULL, NULL, NULL, NULL, NULL, csi_asin_f32, NULL}, /* CSINN_OP_ASIN */ + {csi_asinh_u8, NULL, NULL, NULL, NULL, NULL, NULL, csi_asinh_f32, NULL}, /* CSINN_OP_ASINH */ + {csi_atan_u8, NULL, NULL, NULL, NULL, NULL, NULL, csi_atan_f32, NULL}, /* CSINN_OP_ATAN */ + {csi_atanh_u8, NULL, NULL, NULL, NULL, NULL, NULL, csi_atanh_f32, NULL}, /* CSINN_OP_ATANH */ + {csi_averagepool_u8, NULL, NULL, NULL, NULL, NULL, NULL, csi_averagepool_f32, NULL}, /* CSINN_OP_AVGPOOL2D */ + {csi_averagepool3d_u8, NULL, NULL, NULL, NULL, NULL, NULL, csi_averagepool3d_f32, NULL}, /* CSINN_OP_AVGPOOL3D */ + {csi_batch_normalization_u8, NULL, NULL, NULL, NULL, NULL, NULL, csi_batch_normalization_f32, NULL}, /* CSINN_OP_BN */ + {csi_batch_to_space_u8, NULL, NULL, NULL, NULL, NULL, NULL, csi_batch_to_space_f32, NULL}, /* CSINN_OP_BATCH_TO_SPACE */ + {csi_broadcast_to_u8, 
NULL, NULL, NULL, NULL, NULL, NULL, csi_broadcast_to_f32, NULL}, /* CSINN_OP_BROADCOST */ + {csi_ceil_u8, NULL, NULL, NULL, NULL, NULL, NULL, csi_ceil_f32, NULL}, /* CSINN_OP_CEIL */ + {csi_clip_u8, NULL, NULL, NULL, NULL, NULL, NULL, csi_clip_f32, NULL}, /* CSINN_OP_CLIP */ + {NULL, NULL, NULL, NULL, NULL, NULL, NULL, csi_col2im_f32, NULL}, /* CSINN_OP_COL2IM */ + {csi_concat_u8, NULL, NULL, NULL, NULL, NULL, NULL, csi_concat_f32, NULL}, /* CSINN_OP_CONCAT */ + {csi_conv2d_u8, csi_conv2d_i8, NULL, NULL, NULL, NULL, NULL, csi_conv2d_f32, NULL}, /* CSINN_OP_CONV2D */ + {csi_conv2d_relu_u8, csi_conv2d_relu_i8, NULL, NULL, NULL, NULL, NULL, NULL, NULL}, /* CSINN_OP_CONV2D_RELU */ + {csi_conv2d_relu6_u8, csi_conv2d_relu6_i8, NULL, NULL, NULL, NULL, NULL, NULL, NULL}, /* CSINN_OP_CONV2D_RELU6 */ + {csi_conv2d_channel_u8, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL}, /* CSINN_OP_CONV2D_CHANNEL */ + {csi_conv2d_channel_relu_u8, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL}, /* CSINN_OP_CONV2D_CHANNEL_RELU */ + {csi_conv2d_channel_relu6_u8, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL}, /* CSINN_OP_CONV2D_CHANNEL_RELU6 */ + {csi_depthwise_conv2d_u8, csi_depthwise_conv2d_i8, NULL, NULL, NULL, NULL, NULL, csi_depthwise_conv2d_f32, NULL}, /* CSINN_OP_DEPTHWISE_CONV2D */ + {csi_depthwise_conv2d_relu_u8, csi_depthwise_conv2d_relu_i8, NULL, NULL, NULL, NULL, NULL, NULL, NULL}, /* CSINN_OP_DEPTHWISE_CONV2D_RELU */ + {csi_depthwise_conv2d_relu6_u8, csi_depthwise_conv2d_relu6_i8, NULL, NULL, NULL, NULL, NULL, NULL, NULL}, /* CSINN_OP_DEPTHWISE_CONV2D_RELU6 */ + {csi_depthwise_conv2d_channel_u8, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL}, /* CSINN_OP_DEPTHWISE_CONV2D_CHANNEL */ + {csi_depthwise_conv2d_channel_relu_u8, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL}, /* CSINN_OP_DEPTHWISE_CONV2D_CHANNEL_RELU */ + {csi_depthwise_conv2d_channel_relu6_u8, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL}, /* CSINN_OP_DEPTHWISE_CONV2D_CHANNEL_RELU6 */ + {csi_group_conv2d_u8, 
csi_group_conv2d_i8, NULL, NULL, NULL, NULL, NULL, NULL, NULL}, /* CSINN_OP_GROUP_CONV2D */ + {csi_group_conv2d_relu_u8, csi_group_conv2d_relu_i8, NULL, NULL, NULL, NULL, NULL, NULL, NULL}, /* CSINN_OP_GROUP_CONV2D_RELU */ + {csi_group_conv2d_channel_u8, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL}, /* CSINN_OP_GROUP_CONV2D_CHANNEL */ + {csi_group_conv2d_channel_relu_u8, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL}, /* CSINN_OP_GROUP_CONV2D_CHANNEL_RELU */ + {csi_conv3d_u8, NULL, NULL, NULL, NULL, NULL, NULL, csi_conv3d_f32, NULL}, /* CSINN_OP_CONV3D */ + {csi_cos_u8, NULL, NULL, NULL, NULL, NULL, NULL, csi_cos_f32, NULL}, /* CSINN_OP_COS */ + {csi_cosh_u8, NULL, NULL, NULL, NULL, NULL, NULL, csi_cosh_f32, NULL}, /* CSINN_OP_COSH */ + {csi_cumprod_u8, NULL, NULL, NULL, NULL, NULL, NULL, csi_cumprod_f32, NULL}, /* CSINN_OP_CUMPROD */ + {csi_cumsum_u8, NULL, NULL, NULL, NULL, NULL, NULL, csi_cumsum_f32, NULL}, /* CSINN_OP_CUMSUM */ + {csi_deconv2d_u8, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL}, /* CSINN_OP_DECONV2D */ + {csi_depthwise_deconv2d_u8, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL}, /* CSINN_OP_DEPTHWISE_DECONV2D */ + {NULL, NULL, NULL, NULL, NULL, NULL, NULL, csi_deconv3d_f32, NULL}, /* CSINN_OP_DECONV3D */ + {csi_depth_to_space_u8, NULL, NULL, NULL, NULL, NULL, NULL, csi_depth_to_space_f32, NULL}, /* CSINN_OP_DEPTH_TO_SPACE */ + {csi_div_u8, NULL, NULL, NULL, NULL, NULL, NULL, csi_div_f32, NULL}, /* CSINN_OP_DIV */ + {csi_elu_u8, NULL, NULL, NULL, NULL, NULL, NULL, csi_elu_f32, NULL}, /* CSINN_OP_ELU */ + {csi_equal_u8, NULL, NULL, NULL, NULL, NULL, NULL, csi_equal_f32, NULL}, /* CSINN_OP_EQUANL */ + {csi_erf_u8, NULL, NULL, NULL, NULL, NULL, NULL, csi_erf_f32, NULL}, /* CSINN_OP_ERF */ + {csi_exp_u8, NULL, NULL, NULL, NULL, NULL, NULL, csi_exp_f32, NULL}, /* CSINN_OP_EXP */ + {csi_expand_dims_u8, NULL, NULL, NULL, NULL, NULL, NULL, csi_expand_dims_f32, NULL}, /* CSINN_OP_EXPAND_DIMS */ + {csi_expm1_u8, NULL, NULL, NULL, NULL, NULL, NULL, 
csi_expm1_f32, NULL}, /* CSINN_OP_EXPM1 */ + {csi_flatten_u8, NULL, NULL, NULL, NULL, NULL, NULL, csi_flatten_f32, NULL}, /* CSINN_OP_FLATTEN */ + {csi_floor_divide_f32, NULL, NULL, NULL, NULL, NULL, NULL, csi_floor_divide_f32, NULL}, /* CSINN_OP_FLOOR_DIVIDE */ + {csi_floor_mod_u8, NULL, NULL, NULL, NULL, NULL, NULL, csi_floor_mod_f32, NULL}, /* CSINN_OP_FLOOR_MOD */ + {csi_floor_u8, NULL, NULL, NULL, NULL, NULL, NULL, csi_floor_f32, NULL}, /* CSINN_OP_FLOOR */ + {csi_fullyconnected_u8, NULL, NULL, NULL, NULL, NULL, NULL, csi_fullyconnected_f32, NULL}, /* CSINN_OP_FULLYCONNECTED */ + {csi_gather_nd_u8, NULL, NULL, NULL, NULL, NULL, NULL, csi_gather_nd_f32, NULL}, /* CSINN_OP_GATHER_ND */ + {csi_gather_u8, NULL, NULL, NULL, NULL, NULL, NULL, csi_gather_f32, NULL}, /* CSINN_OP_GATHER */ + {csi_global_averagepool_u8, csi_global_averagepool_i8, NULL, NULL, NULL, NULL, NULL, NULL, NULL}, /* CSINN_OP_GLOBAL_AVGPOOL2D */ + {csi_global_maxpool_u8, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL}, /* CSINN_OP_GLOBAL_MAXPOOL2D */ + {csi_greater_equal_u8, NULL, NULL, NULL, NULL, NULL, NULL, csi_greater_equal_f32, NULL}, /* CSINN_OP_GREATHER_EQUAL */ + {csi_greater_u8, NULL, NULL, NULL, NULL, NULL, NULL, csi_greater_f32, NULL}, /* CSINN_OP_GREATHER */ + {csi_hard_sigmoid_u8, NULL, NULL, NULL, NULL, NULL, NULL, csi_hard_sigmoid_f32, NULL}, /* CSINN_OP_HARD_SIGMOID */ + {csi_im2col_u8, NULL, NULL, NULL, NULL, NULL, NULL, csi_im2col_f32, NULL}, /* CSINN_OP_IM2COL */ + {csi_isnan_bool_u8, NULL, NULL, NULL, NULL, NULL, NULL, csi_isnan_bool_f32, NULL}, /* CSINN_OP_ISNAN */ + {csi_l2_normalization_u8, NULL, NULL, NULL, NULL, NULL, NULL, csi_l2_normalization_f32, NULL}, /* CSINN_OP_L2N */ + {NULL, NULL, NULL, NULL, NULL, NULL, NULL, csi_l2pool_f32, NULL}, /* CSINN_OP_L2POOL2D */ + {csi_leaky_relu_u8, NULL, NULL, NULL, NULL, NULL, NULL, csi_leaky_relu_f32, NULL}, /* CSINN_OP_LEAKY_RELU */ + {csi_less_equal_u8, NULL, NULL, NULL, NULL, NULL, NULL, csi_less_equal_f32, NULL}, /* 
CSINN_OP_LESS_EQUAL */ + {csi_less_u8, NULL, NULL, NULL, NULL, NULL, NULL, csi_less_f32, NULL}, /* CSINN_OP_LESS */ + {csi_log_softmax_u8, NULL, NULL, NULL, NULL, NULL, NULL, csi_log_softmax_f32, NULL}, /* CSINN_OP_LOG_SOFTMAX */ + {csi_log_u8, NULL, NULL, NULL, NULL, NULL, NULL, csi_log_f32, NULL}, /* CSINN_OP_LOG */ + {csi_log1p_u8, NULL, NULL, NULL, NULL, NULL, NULL, csi_log1p_f32, NULL}, /* CSINN_OP_LOG1P */ + {csi_logical_and_u8, NULL, NULL, NULL, NULL, NULL, NULL, csi_logical_and_f32, NULL}, /* CSINN_OP_LOGICAL_AND */ + {csi_logical_not_u8, NULL, NULL, NULL, NULL, NULL, NULL, csi_logical_not_f32, NULL}, /* CSINN_OP_LOGICAL_NOT */ + {csi_logical_or_u8, NULL, NULL, NULL, NULL, NULL, NULL, csi_logical_or_f32, NULL}, /* CSINN_OP_LOGICAL_OR */ + {csi_logical_xor_u8, NULL, NULL, NULL, NULL, NULL, NULL, csi_logical_xor_f32, NULL}, /* CSINN_OP_LOGICAL_XOR */ + {csi_lrn_u8, NULL, NULL, NULL, NULL, NULL, NULL, csi_lrn_f32, NULL}, /* CSINN_OP_LRN */ + {csi_matmul_u8, NULL, NULL, NULL, NULL, NULL, NULL, csi_matmul_f32, NULL}, /* CSINN_OP_MATMUL */ + {csi_max_stride_u8, NULL, NULL, NULL, NULL, NULL, NULL, csi_max_stride_f32, NULL}, /* CSINN_OP_MAX */ + {csi_maximum_u8, NULL, NULL, NULL, NULL, NULL, NULL, csi_maximum_f32, NULL}, /* CSINN_OP_MAXINUM */ + {csi_maxpool_u8, NULL, NULL, NULL, NULL, NULL, NULL, csi_maxpool_f32, NULL}, /* CSINN_OP_MAXPOOL2D */ + {csi_maxpool2d_locat_i32_u8, NULL, NULL, NULL, NULL, NULL, NULL, csi_maxpool2d_locat_f32, NULL}, /* CSINN_OP_MAXPOOL2D_LOCAT */ + {csi_maxpool3d_u8, NULL, NULL, NULL, NULL, NULL, NULL, csi_maxpool3d_f32, NULL}, /* CSINN_OP_MAXPOOL3D */ + {NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL}, /* CSINN_OP_MEAN */ + {csi_mean_stride_u8, NULL, NULL, NULL, NULL, NULL, NULL, csi_mean_stride_f32, NULL}, /* CSINN_OP_MEAN_STRIDE */ + {NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL}, /* CSINN_OP_MIN */ + {csi_min_stride_u8, NULL, NULL, NULL, NULL, NULL, NULL, csi_min_stride_f32, NULL}, /* CSINN_OP_MIN_STRIDE */ + 
{csi_minimum_u8, NULL, NULL, NULL, NULL, NULL, NULL, csi_minimum_f32, NULL}, /* CSINN_OP_MINIMUM */ + {csi_mod_u8, NULL, NULL, NULL, NULL, NULL, NULL, csi_mod_f32, NULL}, /* CSINN_OP_MOD */ + {csi_mul_u8, NULL, NULL, NULL, NULL, NULL, NULL, csi_mul_f32, NULL}, /* CSINN_OP_MUL */ + {csi_ndarray_size_u8, NULL, NULL, NULL, NULL, csi_ndarray_size_i32, NULL, csi_ndarray_size_f32, NULL}, /* CSINN_OP_NDARRAY_SIZE */ + {csi_negative_u8, NULL, NULL, NULL, NULL, NULL, NULL, csi_negative_f32, NULL}, /* CSINN_OP_NEGATIIVE */ + {NULL, NULL, NULL, NULL, NULL, NULL, NULL, csi_non_max_suppression_std, NULL}, /* CSINN_OP_NON_MAX_SUPPRESSION */ + {csi_not_equal_u8, NULL, NULL, NULL, NULL, NULL, NULL, csi_not_equal_f32, NULL}, /* CSINN_OP_NOT_EQUAL */ + {csi_not_u8, NULL, NULL, NULL, csi_not_u32, NULL, NULL, NULL, NULL}, /* CSINN_OP_NOT */ + {NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL}, /* CSINN_OP_ONE_HOT */ + {csi_or_u8, NULL, NULL, NULL, csi_or_u32, NULL, NULL, NULL, NULL}, /* CSINN_OP_OR */ + {csi_pad_u8, NULL, NULL, NULL, NULL, NULL, NULL, csi_pad_f32, NULL}, /* CSINN_OP_PAD */ + {csi_power_u8, NULL, NULL, NULL, NULL, NULL, NULL, csi_power_f32, NULL}, /* CSINN_OP_POWER */ + {csi_prelu_u8, NULL, NULL, NULL, NULL, NULL, NULL, csi_prelu_f32, NULL}, /* CSINN_OP_PRELU */ + {csi_prod_stride_u8, NULL, NULL, NULL, NULL, NULL, NULL, csi_prod_stride_f32, NULL}, /* CSINN_OP_PROD */ + {csi_proposal_u8, NULL, NULL, NULL, NULL, NULL, NULL, csi_proposal_f32, NULL}, /* CSINN_OP_PROPOSAL */ + {csi_psroipooling_u8, NULL, NULL, NULL, NULL, NULL, NULL, csi_psroipooling_f32, NULL}, /* CSINN_OP_PSROIPOOLING */ + {csi_reduce_logsumexp_u8, NULL, NULL, NULL, NULL, NULL, NULL, csi_reduce_logsumexp_f32, NULL}, /* CSINN_OP_REDUCE_LOGSUMEXP */ + {csi_reduce_max_u8, NULL, NULL, NULL, NULL, NULL, NULL, csi_reduce_max_f32, NULL}, /* CSINN_OP_REDUCE_MAX */ + {csi_reduce_mean_u8, NULL, NULL, NULL, NULL, NULL, NULL, csi_reduce_mean_f32, NULL}, /* CSINN_OP_REDUCE_MEAN */ + {csi_reduce_min_u8, NULL, NULL, 
NULL, NULL, NULL, NULL, csi_reduce_min_f32, NULL}, /* CSINN_OP_REDUCE_MIN */ + {csi_reduce_prod_u8, NULL, NULL, NULL, NULL, NULL, NULL, csi_reduce_prod_f32, NULL}, /* CSINN_OP_REDUCE_PROD */ + {csi_reduce_sum_u8, NULL, NULL, NULL, NULL, NULL, NULL, csi_reduce_sum_f32, NULL}, /* CSINN_OP_REDUCE_SUM */ + {csi_relu_u8, NULL, NULL, NULL, NULL, NULL, NULL, csi_relu_f32, NULL}, /* CSINN_OP_RELU */ + {csi_relu1_u8, NULL, NULL, NULL, NULL, NULL, NULL, csi_relu1_f32, NULL}, /* CSINN_OP_RELU1 */ + {csi_relu6_u8, NULL, NULL, NULL, NULL, NULL, NULL, csi_relu6_f32, NULL}, /* CSINN_OP_RELU6 */ + {csi_relun_u8, NULL, NULL, NULL, NULL, NULL, NULL, csi_relun_f32, NULL}, /* CSINN_OP_RELUN */ + {NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL}, /* CSINN_OP_REORG */ + {csi_reshape_u8, NULL, NULL, NULL, NULL, NULL, NULL, csi_reshape_f32, NULL}, /* CSINN_OP_RESHAPE */ + {csi_resize_u8, NULL, NULL, NULL, NULL, NULL, NULL, csi_resize_f32, NULL}, /* CSINN_OP_RESIZE */ + {csi_reverse_u8, NULL, NULL, NULL, NULL, NULL, NULL, csi_reverse_f32, NULL}, /* CSINN_OP_REVERSE */ + {NULL, NULL, NULL, NULL, NULL, NULL, NULL, csi_roi_align_f32, NULL}, /* CSINN_OP_ROIALIGN */ + {csi_roipool_u8, NULL, NULL, NULL, NULL, NULL, NULL, csi_roipool_f32, NULL}, /* CSINN_OP_ROIPOOL */ + {csi_round_u8, NULL, NULL, NULL, NULL, NULL, NULL, csi_round_f32, NULL}, /* CSINN_OP_ROUND */ + {csi_rsqrt_u8, NULL, NULL, NULL, NULL, NULL, NULL, csi_rsqrt_f32, NULL}, /* CSINN_OP_RSQRT */ + {csi_segment_max_u8, NULL, NULL, NULL, NULL, NULL, NULL, csi_segment_max_f32, NULL}, /* CSINN_OP_SEGMENT_MAX */ + {csi_unsorted_segment_max_u8, NULL, NULL, NULL, NULL, NULL, NULL, csi_unsorted_segment_max_f32, NULL}, /* CSINN_OP_UNSORTED_SEGMENT_MAX */ + {csi_segment_mean_u8, NULL, NULL, NULL, NULL, NULL, NULL, csi_segment_mean_f32, NULL}, /* CSINN_OP_SEGMENT_MEAN */ + {csi_unsorted_segment_mean_u8, NULL, NULL, NULL, NULL, NULL, NULL, csi_unsorted_segment_mean_f32, NULL}, /* CSINN_OP_UNSORTED_SEGMENT_MEAN */ + {csi_segment_min_u8, NULL, 
NULL, NULL, NULL, NULL, NULL, csi_segment_min_f32, NULL}, /* CSINN_OP_SEGMENT_MIN */ + {csi_unsorted_segment_min_u8, NULL, NULL, NULL, NULL, NULL, NULL, csi_unsorted_segment_min_f32, NULL}, /* CSINN_OP_UNSORTED_SEGMENT_MIN */ + {csi_segment_prod_u8, NULL, NULL, NULL, NULL, NULL, NULL, csi_segment_prod_f32, NULL}, /* CSINN_OP_SEGMENT_PROD */ + {csi_unsorted_segment_prod_u8, NULL, NULL, NULL, NULL, NULL, NULL, csi_unsorted_segment_prod_f32, NULL}, /* CSINN_OP_UNSORTED_SEGMENT_PROD */ + {csi_segment_sum_u8, NULL, NULL, NULL, NULL, NULL, NULL, csi_segment_sum_f32, NULL}, /* CSINN_OP_SEGMENT_SUM */ + {csi_unsorted_segment_sum_u8, NULL, NULL, NULL, NULL, NULL, NULL, csi_unsorted_segment_sum_f32, NULL}, /* CSINN_OP_UNSORTED_SEGMENT_SUM */ + {csi_select_u8, NULL, NULL, NULL, NULL, NULL, NULL, csi_select_f32, NULL}, /* CSINN_OP_SELECT */ + {NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL}, /* CSINN_OP_SEQUENCE_MASK */ + {csi_shape_u8, NULL, NULL, NULL, NULL, csi_shape_i32, NULL, NULL, NULL}, /* CSINN_OP_SHAPE */ + {csi_shuffle_channel_u8, NULL, NULL, NULL, NULL, NULL, NULL, csi_shuffle_channel_f32, NULL}, /* CSINN_OP_SHUFFLE_CHANNEL */ + {csi_sigmoid_u8, NULL, NULL, NULL, NULL, NULL, NULL, csi_sigmoid_f32, NULL}, /* CSINN_OP_SIGMOID */ + {csi_sign_u8, NULL, NULL, NULL, NULL, NULL, NULL, csi_sign_f32, NULL}, /* CSINN_OP_SIGN */ + {csi_sin_u8, NULL, NULL, NULL, NULL, NULL, NULL, csi_sin_f32, NULL}, /* CSINN_OP_SIN */ + {csi_sinh_u8, NULL, NULL, NULL, NULL, NULL, NULL, csi_sinh_f32, NULL}, /* CSINN_OP_SINH */ + {csi_slice_u8, NULL, NULL, NULL, NULL, NULL, NULL, csi_slice_f32, NULL}, /* CSINN_OP_SLICE */ + {csi_softmax_u8, csi_softmax_i8, NULL, NULL, NULL, NULL, NULL, csi_softmax_f32, NULL}, /* CSINN_OP_SOFTMAX */ + {csi_softplus_u8, NULL, NULL, NULL, NULL, NULL, NULL, csi_softplus_f32, NULL}, /* CSINN_OP_SOFTPLUS */ + {csi_softrelu_u8, NULL, NULL, NULL, NULL, NULL, NULL, csi_softrelu_f32, NULL}, /* CSINN_OP_SOFTRELU */ + {csi_softsign_u8, NULL, NULL, NULL, NULL, NULL, 
NULL, csi_softsign_f32, NULL}, /* CSINN_OP_SOFTSIGN */ + {csi_space_to_batch_u8, NULL, NULL, NULL, NULL, NULL, NULL, csi_space_to_batch_f32, NULL}, /* CSINN_OP_SPACE_TO_BATCH */ + {csi_space_to_depth_u8, NULL, NULL, NULL, NULL, NULL, NULL, csi_space_to_depth_f32, NULL}, /* CSINN_OP_SPACE_TO_DEPTH */ + {csi_split_u8, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL}, /* CSINN_OP_SPLIT */ + {csi_sqrt_u8, NULL, NULL, NULL, NULL, NULL, NULL, csi_sqrt_f32, NULL}, /* CSINN_OP_SQRT */ + {NULL, NULL, NULL, NULL, NULL, NULL, NULL, csi_square_f32, NULL}, /* CSINN_OP_SQUARE */ + {csi_squeeze_u8, NULL, NULL, NULL, NULL, NULL, NULL, csi_squeeze_f32, NULL}, /* CSINN_OP_SQUEEZE */ + {csi_stack_u8, NULL, NULL, NULL, NULL, NULL, NULL, csi_stack_f32, NULL}, /* CSINN_OP_STACK */ + {csi_strided_slice_u8, NULL, NULL, NULL, NULL, NULL, NULL, csi_strided_slice_f32, NULL}, /* CSINN_OP_STRIDED_SLICE */ + {csi_sub_u8, NULL, NULL, NULL, NULL, NULL, NULL, csi_sub_f32, NULL}, /* CSINN_OP_SUB */ + {csi_sum_stride_u8, NULL, NULL, NULL, NULL, NULL, NULL, csi_sum_stride_f32, NULL}, /* CSINN_OP_SUM */ + {csi_tan_u8, NULL, NULL, NULL, NULL, NULL, NULL, csi_tan_f32, NULL}, /* CSINN_OP_TAN */ + {csi_tanh_u8, NULL, NULL, NULL, NULL, NULL, NULL, csi_tanh_f32, csi_tanh_f64}, /* CSINN_OP_TANH */ + {csi_threshold_relu_u8, NULL, NULL, NULL, NULL, NULL, NULL, csi_threshold_relu_f32, NULL}, /* CSINN_OP_THRESHOLD_RELU */ + {csi_tile_u8, NULL, NULL, NULL, NULL, NULL, NULL, csi_tile_f32, NULL}, /* CSINN_OP_TILE */ + {csi_topk_u8, NULL, NULL, NULL, NULL, NULL, NULL, csi_topk_f32, NULL}, /* CSINN_OP_TOPK */ + {csi_transpose_u8, csi_transpose_i8, NULL, NULL, NULL, NULL, NULL, csi_transpose_f32, NULL}, /* CSINN_OP_TRANSPOSE */ + {csi_trunc_u8, NULL, NULL, NULL, NULL, NULL, NULL, csi_trunc_f32, NULL}, /* CSINN_OP_TRUNC */ + {csi_unpooling_u8, NULL, NULL, NULL, NULL, NULL, NULL, csi_unpooling_f32, NULL}, /* CSINN_OP_UNPOOLING */ + {csi_unstack_u8, NULL, NULL, NULL, NULL, NULL, NULL, csi_unstack_f32, NULL}, /* 
CSINN_OP_UNSTACK */ + {NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL}, /* CSINN_OP_WHERE */ + {csi_xor_u8, NULL, NULL, NULL, csi_xor_u32, NULL, NULL, NULL, NULL}, /* CSINN_OP_XOR */ + {csi_yuv_rgb_scale_u8, NULL, NULL, NULL, NULL, NULL, NULL, csi_yuv_rgb_scale_f32, NULL}, /* CSINN_OP_YUV_RGB_SCALE */ +}; + +void *csi_bc_map_ref(int op, int dtype) +{ + return csi_bc_map_table_ref[op][dtype]; +} + +void *csi_bc_map_ovx(int op, int dtype); +void *csi_bc_map_c906(int op, int dtype); +void *csi_bc_map_pnna(int op, int dtype); +void *csi_bc_func_table[CSINN_API_SIZE] = { + csi_bc_map_ref, /* ref */ + NULL, /* c860 */ +#ifdef CSI_BUILD_C906 + csi_bc_map_c906, +#else + NULL, /* c906 */ +#endif + NULL, /* c910 */ +#ifdef CSI_BUILD_OPENVX + csi_bc_map_ovx, +#else + NULL, /* anole */ +#endif + NULL, /* tx510 */ +#ifdef CSI_BUILD_PNNA + csi_bc_map_pnna, +#else + NULL, /* light */ +#endif + NULL, /* tvmgen */ +}; + +void *csi_bc_map(int api, int op, int dtype) +{ + void* (*func)() = csi_bc_func_table[api]; + return func(op, dtype); +} + +void csi_session_init(struct csi_session *sess) +{ + void* (*func)(); + func = csi_bc_map(sess->base_api, CSINN_SESSION_INIT, sess->base_dtype); + if (func != NULL) { + func(sess); + } +} + +void csi_session_deinit(struct csi_session *sess) +{ + void* (*func)(); + func = csi_bc_map(sess->base_api, CSINN_SESSION_DEINIT, sess->base_dtype); + if (func != NULL) { + func(sess); + } +} + +void csi_set_output_number(int number, struct csi_session *sess) +{ + void (*func)(); + func = csi_bc_map(sess->base_api, CSINN_SET_OUTPUT_NUMBER, sess->base_dtype); + if (func != NULL) { + func(number, sess); + } +} + +void csi_set_input_number(int number, struct csi_session *sess) +{ + void (*func)(); + func = csi_bc_map(sess->base_api, CSINN_SET_INPUT_NUMBER, sess->base_dtype); + if (func != NULL) { + func(number, sess); + } +} + +int csi_get_output_number(struct csi_session *sess) +{ + int (*func)(); + func = csi_bc_map(sess->base_api, 
CSINN_GET_OUTPUT_NUMBER, sess->base_dtype); + if (func != NULL) { + return func(sess); + } + return CSINN_FALSE; +} + +int csi_get_input_number(struct csi_session *sess) +{ + int (*func)(); + func = csi_bc_map(sess->base_api, CSINN_GET_INPUT_NUMBER, sess->base_dtype); + if (func != NULL) { + return func(sess); + } + return CSINN_FALSE; +} + +int csi_set_output(int index, struct csi_tensor *output, struct csi_session *sess) +{ + int (*func)(); + func = csi_bc_map(sess->base_api, CSINN_SET_OUTPUT, sess->base_dtype); + if (func != NULL) { + return func(index, output, sess); + } + return CSINN_FALSE; +} + +int csi_set_input(int index, struct csi_tensor *input, struct csi_session *sess) +{ + int (*func)(); + func = csi_bc_map(sess->base_api, CSINN_SET_INPUT, sess->base_dtype); + if (func != NULL) { + return func(index, input, sess); + } + return CSINN_FALSE; +} + +int csi_get_output(int index, struct csi_tensor *output, struct csi_session *sess) +{ + int (*func)(); + func = csi_bc_map(sess->base_api, CSINN_GET_OUTPUT, sess->base_dtype); + if (func != NULL) { + return func(index, output, sess); + } + return CSINN_FALSE; +} + +int csi_get_input(int index, struct csi_tensor *input, struct csi_session *sess) +{ + int (*func)(); + func = csi_bc_map(sess->base_api, CSINN_GET_INPUT, sess->base_dtype); + if (func != NULL) { + return func(index, input, sess); + } + return CSINN_FALSE; +} + +int csi_update_input(int index, struct csi_tensor *input, struct csi_session *sess) +{ + int (*func)(); + func = csi_bc_map(sess->base_api, CSINN_UPDATE_INPUT, sess->base_dtype); + if (func != NULL) { + return func(index, input, sess); + } + return CSINN_FALSE; +} + +int csi_session_setup(struct csi_session *sess) +{ + int (*func)(); + func = csi_bc_map(sess->base_api, CSINN_SESSION_SETUP, sess->base_dtype); + if (func != NULL) { + return func(sess); + } + return CSINN_FALSE; +} + +int csi_session_run(struct csi_session *sess) +{ + int (*func)(); + func = csi_bc_map(sess->base_api, 
CSINN_SESSION_RUN, sess->base_dtype); + if (func != NULL) { + return func(sess); } + return CSINN_FALSE; } diff --git a/source/reference/shape.c b/source/reference/shape.c index 22a9cfa8..116d649e 100644 --- a/source/reference/shape.c +++ b/source/reference/shape.c @@ -19,9 +19,9 @@ #include "csi_nn.h" #include "csi_utils.h" -static int csi_shape_i32(struct csi_tensor *input, - struct csi_tensor *output, - struct shape_params *params) +int csi_shape_i32(struct csi_tensor *input, + struct csi_tensor *output, + struct shape_params *params) { int32_t * data = output->data; for (int i = 0; i < input->dim_count; i++) { @@ -30,9 +30,9 @@ static int csi_shape_i32(struct csi_tensor *input, return CSINN_TRUE; } -static int csi_shape_u8(struct csi_tensor *input, - struct csi_tensor *output, - struct shape_params *params) +int csi_shape_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct shape_params *params) { uint8_t * data = output->data; for (int i = 0; i < input->dim_count; i++) { @@ -45,11 +45,8 @@ int csi_shape_init(struct csi_tensor *input, struct csi_tensor *output, struct shape_params *params) { - if (input->dtype == CSINN_DTYPE_UINT8) { - params->bc = csi_shape_u8; - } else if (input->dtype == CSINN_DTYPE_INT32) { - params->bc = csi_shape_i32; - } else { + params->bc = csi_bc_map(params->api, CSINN_OP_SHAPE, input->dtype); + if (params->bc == NULL) { return CSINN_UNSUPPORT_DTYPE; } return CSINN_TRUE; diff --git a/source/reference/shuffle_channel.c b/source/reference/shuffle_channel.c new file mode 100644 index 00000000..fc7059d0 --- /dev/null +++ b/source/reference/shuffle_channel.c @@ -0,0 +1,200 @@ + /* + * Copyright (C) 2016-2020 C-SKY Limited. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "csi_nn.h" +#include "csi_utils.h" + +static int csi_shuffle_channel_nchw_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct shuffle_channel_params *params) +{ + float *input_data = (float *)input->data; + float *output_data = (float *)output->data; + + int batch = input->dim[0]; + int channel = input->dim[1]; + int height = input->dim[2]; + int width = input->dim[3]; + int group = params->group; + int group_channel = channel / group; + int input_inner_size = input->dim[2] * input->dim[3]; + + float *input_data_addr = input_data; + for(int i = 0; i < batch; i++) { + for(int j = 0; j < group_channel; j++) { + for(int k = 0; k < group; k++) { + float *input_data_addr1 = input_data_addr + (k * group_channel + j) * input_inner_size; + memcpy(output_data, input_data_addr1, input_inner_size * sizeof(float)); + output_data += input_inner_size; + } + } + input_data_addr += channel * input_inner_size; + } + return CSINN_TRUE; +} + +static int csi_shuffle_channel_nchw_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct shuffle_channel_params *params) +{ + uint8_t *input_data = (uint8_t *)input->data; + uint8_t *output_data = (uint8_t *)output->data; + + int batch = input->dim[0]; + int channel = input->dim[1]; + int height = input->dim[2]; + int width = input->dim[3]; + int group = params->group; + int group_channel = channel / group; + int input_inner_size = input->dim[2] * input->dim[3]; + + uint8_t *input_data_addr = input_data; + for(int i = 0; i < batch; i++) { + for(int j = 0; j < group_channel; j++) { + for(int 
k = 0; k < group; k++) { + uint8_t *input_data_addr1 = input_data_addr + (k * group_channel + j) * input_inner_size; + memcpy(output_data, input_data_addr1, input_inner_size * sizeof(uint8_t)); + output_data += input_inner_size; + } + } + input_data_addr += channel * input_inner_size; + } + return CSINN_TRUE; +} + +// defalut input_layout = NCHW +static int csi_shuffle_channel_nhwc_f32(struct csi_tensor *o_input, + struct csi_tensor *o_output, + struct shuffle_channel_params *params) +{ + struct csi_tensor *input; + struct csi_tensor *output; + input = csi_nchw_to_nhwc_f32(o_input); + output = csi_nchw_to_nhwc_f32(o_output); + float *input_data = (float *)input->data; + float *output_data = (float *)output->data; + + int batch = input->dim[0]; + int height = input->dim[1]; + int width = input->dim[2]; + int channel = input->dim[3]; + int group = params->group; + int group_channel = channel / group; + int input_outer_size = input->dim[0] * input->dim[1] * input->dim[2]; + int input_inner_size = 1; + + float *input_data_addr = input_data; + for(int i = 0; i < input_outer_size; i++) { + for(int j = 0; j < group_channel; j++) { + for(int k = 0; k < group; k++) { + float *input_data_addr1 = input_data_addr + (k * group_channel + j) * input_inner_size; + memcpy(output_data, input_data_addr1, input_inner_size * sizeof(float)); + output_data += input_inner_size; + } + } + input_data_addr += channel * input_inner_size; + } + csi_nhwc_to_nchw_f32(o_output, output); + return CSINN_TRUE; +} + +// defalut input_layout = NCHW +static int csi_shuffle_channel_nhwc_u8(struct csi_tensor *o_input, + struct csi_tensor *o_output, + struct shuffle_channel_params *params) +{ + struct csi_tensor *input; + struct csi_tensor *output; + input = csi_nchw_to_nhwc_8(o_input); + output = csi_nchw_to_nhwc_8(o_output); + uint8_t *input_data = (uint8_t *)input->data; + uint8_t *output_data = (uint8_t *)output->data; + + int batch = input->dim[0]; + int height = input->dim[1]; + int width = 
input->dim[2]; + int channel = input->dim[3]; + int group = params->group; + int group_channel = channel / group; + int input_outer_size = input->dim[0] * input->dim[1] * input->dim[2]; + int input_inner_size = 1; + + uint8_t *input_data_addr = input_data; + for(int i = 0; i < input_outer_size; i++) { + for(int j = 0; j < group_channel; j++) { + for(int k = 0; k < group; k++) { + uint8_t *input_data_addr1 = input_data_addr + (k * group_channel + j) * input_inner_size; + memcpy(output_data, input_data_addr1, input_inner_size * sizeof(uint8_t)); + output_data += input_inner_size; + } + } + input_data_addr += channel * input_inner_size; + } + csi_nhwc_to_nchw_8(o_output, output); + return CSINN_TRUE; +} + +int csi_shuffle_channel_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct shuffle_channel_params *params) +{ + if (params->layout == CSINN_NCHW) { + csi_shuffle_channel_nchw_f32(input, output, params); + } else if (params->layout == CSINN_NHWC) { + csi_shuffle_channel_nhwc_f32(input, output, params); + } else { + return CSINN_UNSUPPORT_LAYOUT; + } +} + +int csi_shuffle_channel_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct shuffle_channel_params *params) +{ + if (params->layout == CSINN_NCHW) { + csi_shuffle_channel_nchw_u8(input, output, params); + } else if (params->layout == CSINN_NHWC) { + csi_shuffle_channel_nhwc_u8(input, output, params); + } else { + return CSINN_UNSUPPORT_LAYOUT; + } +} + +int csi_shuffle_channel_init(struct csi_tensor *input, + struct csi_tensor *output, + struct shuffle_channel_params *params) +{ + params->bc = csi_bc_map(params->api, CSINN_OP_SHUFFLE_CHANNEL, input->dtype); + if (params->bc == NULL) { + return CSINN_UNSUPPORT_DTYPE; + } + return CSINN_TRUE; +} + +int csi_shuffle_channel(struct csi_tensor *input, + struct csi_tensor *output, + struct shuffle_channel_params *params) +{ + if (params->bc != NULL) { + params->bc(input, output, params); + } else { + return CSINN_CALLBACK_UNSET; + } + return 
CSINN_TRUE; +} + diff --git a/source/reference/sigmoid.c b/source/reference/sigmoid.c index b7dc64c8..061a15eb 100644 --- a/source/reference/sigmoid.c +++ b/source/reference/sigmoid.c @@ -19,9 +19,9 @@ #include "csi_nn.h" #include "csi_utils.h" -static int csi_sigmoid_f32(struct csi_tensor *input, - struct csi_tensor *output, - struct sigmoid_params *params) +int csi_sigmoid_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct sigmoid_params *params) { float *input_data = input->data; float *output_data = output->data; @@ -37,9 +37,9 @@ static int csi_sigmoid_f32(struct csi_tensor *input, return CSINN_TRUE; } -static int csi_sigmoid_u8(struct csi_tensor *input, - struct csi_tensor *output, - struct sigmoid_params *params) +int csi_sigmoid_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct sigmoid_params *params) { float *float_input_data; float *float_output_data; @@ -61,14 +61,14 @@ static int csi_sigmoid_u8(struct csi_tensor *input, float_output.data = float_output_data; for (int i = 0; i < size; i++) { - float_input_data[i] = csi_dequantize_f32(input_data[i], input->offset, + float_input_data[i] = csi_dequantize_u8_to_f32(input_data[i], input->zero_point, input->multiplier, input->shift); } csi_sigmoid_f32(&float_input, &float_output, params); for (int i = 0; i < size; i++) { - output_data[i] = csi_quantize_f32(float_output_data[i], output->offset, + output_data[i] = csi_quantize_f32_to_u8(float_output_data[i], output->zero_point, output->multiplier, output->shift); } free(float_input_data); @@ -81,11 +81,8 @@ int csi_sigmoid_init(struct csi_tensor *input, struct csi_tensor *output, struct sigmoid_params *params) { - if (input->dtype == CSINN_DTYPE_UINT8) { - params->bc = csi_sigmoid_u8; - } else if (input->dtype == CSINN_DTYPE_FLOAT32) { - params->bc = csi_sigmoid_f32; - } else { + params->bc = csi_bc_map(params->api, CSINN_OP_SIGMOID, input->dtype); + if (params->bc == NULL) { return CSINN_UNSUPPORT_DTYPE; } return CSINN_TRUE; 
diff --git a/source/reference/sign.c b/source/reference/sign.c index 9314f183..1bf2e8ce 100644 --- a/source/reference/sign.c +++ b/source/reference/sign.c @@ -29,9 +29,9 @@ float sign(float v){ return -1; } -static int csi_sign_f32(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csi_sign_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct siso_params *params) { float *input_data = input->data; float *output_data = output->data; @@ -46,9 +46,9 @@ static int csi_sign_f32(struct csi_tensor *input, return CSINN_TRUE; } -static int csi_sign_u8(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csi_sign_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct siso_params *params) { uint8_t *input_data = input->data; uint8_t *output_data = output->data; @@ -58,24 +58,21 @@ static int csi_sign_u8(struct csi_tensor *input, } for (int i = 0; i < size; i++) { - float input0_val = csi_dequantize_f32(input_data[i], input->offset, input->multiplier, + float input0_val = csi_dequantize_u8_to_f32(input_data[i], input->zero_point, input->multiplier, input->shift); float res = sign(input0_val); - output_data[i] = csi_quantize_f32(res, output->offset, output->multiplier, output->shift); + output_data[i] = csi_quantize_f32_to_u8(res, output->zero_point, output->multiplier, output->shift); } return CSINN_TRUE; } int csi_sign_init(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) + struct csi_tensor *output, + struct siso_params *params) { - if (input->dtype == CSINN_DTYPE_UINT8) { - params->bc = csi_sign_u8; - } else if (input->dtype == CSINN_DTYPE_FLOAT32) { - params->bc = csi_sign_f32; - } else { + params->bc = csi_bc_map(params->api, CSINN_OP_SIGN, input->dtype); + if (params->bc == NULL) { return CSINN_UNSUPPORT_DTYPE; } return CSINN_TRUE; diff --git a/source/reference/sin.c b/source/reference/sin.c index 40ad6555..b1c41211 100644 --- 
a/source/reference/sin.c +++ b/source/reference/sin.c @@ -20,9 +20,9 @@ #include "csi_utils.h" #include -static int csi_sin_f32(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csi_sin_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct siso_params *params) { float *input_data = input->data; float *output_data = output->data; @@ -37,9 +37,9 @@ static int csi_sin_f32(struct csi_tensor *input, return CSINN_TRUE; } -static int csi_sin_u8(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csi_sin_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct siso_params *params) { uint8_t *input_data = (uint8_t *)input->data; uint8_t *output_data = (uint8_t *)output->data; @@ -49,10 +49,10 @@ static int csi_sin_u8(struct csi_tensor *input, } for (int i = 0; i < size; i++) { - float input0_val = csi_dequantize_f32(input_data[i], input->offset, input->multiplier, + float input0_val = csi_dequantize_u8_to_f32(input_data[i], input->zero_point, input->multiplier, input->shift); float res = sin(input0_val); - output_data[i] = csi_quantize_f32(res, output->offset, output->multiplier, output->shift); + output_data[i] = csi_quantize_f32_to_u8(res, output->zero_point, output->multiplier, output->shift); } return CSINN_TRUE; } @@ -61,19 +61,16 @@ int csi_sin_init(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params) { - if (input->dtype == CSINN_DTYPE_UINT8) { - params->bc = csi_sin_u8; - } else if (input->dtype == CSINN_DTYPE_FLOAT32) { - params->bc = csi_sin_f32; - } else { + params->bc = csi_bc_map(params->api, CSINN_OP_SIN, input->dtype); + if (params->bc == NULL) { return CSINN_UNSUPPORT_DTYPE; } return CSINN_TRUE; } int csi_sin(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) + struct csi_tensor *output, + struct siso_params *params) { if (params->bc != NULL) { params->bc(input, output, params); diff --git 
a/source/reference/sinh.c b/source/reference/sinh.c index eb2c15a7..3e9c9634 100644 --- a/source/reference/sinh.c +++ b/source/reference/sinh.c @@ -20,9 +20,9 @@ #include "csi_utils.h" #include -static int csi_sinh_f32(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csi_sinh_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct siso_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -37,9 +37,9 @@ static int csi_sinh_f32(struct csi_tensor *input, return CSINN_TRUE; } -static int csi_sinh_u8(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csi_sinh_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct siso_params *params) { uint8_t *input_data = (uint8_t *)input->data; uint8_t *output_data = (uint8_t *)output->data; @@ -49,32 +49,29 @@ static int csi_sinh_u8(struct csi_tensor *input, } for (int i = 0; i < size; i++) { - float input0_val = csi_dequantize_f32(input_data[i], input->offset, input->multiplier, + float input0_val = csi_dequantize_u8_to_f32(input_data[i], input->zero_point, input->multiplier, input->shift); float res = sinh(input0_val); - output_data[i] = csi_quantize_f32(res, output->offset, output->multiplier, output->shift); + output_data[i] = csi_quantize_f32_to_u8(res, output->zero_point, output->multiplier, output->shift); } return CSINN_TRUE; } int csi_sinh_init(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) + struct csi_tensor *output, + struct siso_params *params) { - if (input->dtype == CSINN_DTYPE_UINT8) { - params->bc = csi_sinh_u8; - } else if (input->dtype == CSINN_DTYPE_FLOAT32) { - params->bc = csi_sinh_f32; - } else { + params->bc = csi_bc_map(params->api, CSINN_OP_SINH, input->dtype); + if (params->bc == NULL) { return CSINN_UNSUPPORT_DTYPE; } return CSINN_TRUE; } int csi_sinh(struct csi_tensor *input, - struct csi_tensor *output, - 
struct siso_params *params) + struct csi_tensor *output, + struct siso_params *params) { if (params->bc != NULL) { params->bc(input, output, params); diff --git a/source/reference/slice.c b/source/reference/slice.c index 0c27fc17..597c96e3 100644 --- a/source/reference/slice.c +++ b/source/reference/slice.c @@ -19,9 +19,9 @@ #include "csi_nn.h" #include "csi_utils.h" -static int csi_strided_slice_f32(struct csi_tensor *input, - struct csi_tensor *output, - struct slice_params *params) +int csi_slice_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct slice_params *params) { float *input_data = input->data; float *output_data = output->data; @@ -50,9 +50,9 @@ static int csi_strided_slice_f32(struct csi_tensor *input, return CSINN_TRUE; } -static int csi_strided_slice_u8(struct csi_tensor *input, - struct csi_tensor *output, - struct slice_params *params) +int csi_slice_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct slice_params *params) { uint8_t *input_data = input->data; uint8_t *output_data = output->data; @@ -70,8 +70,8 @@ static int csi_strided_slice_u8(struct csi_tensor *input, for(int h = params->begin[2]; h < params->end[2]; h++){ for(int w = params->begin[3]; w < params->end[3]; w++){ int32_t input_index = csi_get_index(input->dim, b, c, h, w); - uint8_t out_val = csi_requantize_u8(input_data[input_index], input->offset, input->multiplier, - input->shift, output->offset, output->multiplier, output->shift); + uint8_t out_val = csi_requantize_u8(input_data[input_index], input->zero_point, input->multiplier, + input->shift, output->zero_point, output->multiplier, output->shift); int32_t out_index = csi_get_index(output->dim, b-params->begin[0], c-params->begin[1], h-params->begin[2], w-params->begin[3]); output_data[out_index] = out_val; } @@ -93,8 +93,8 @@ static int csi_strided_slice_u8(struct csi_tensor *input, for(int l = params->begin[3]; l < params->end[3]; l++){ for(int m = params->begin[4]; m < params->end[4]; m++){ 
int32_t input_index = csi_get_index_5(input->dim, i, j, k, l, m); - uint8_t out_val = csi_requantize_u8(input_data[input_index], input->offset, input->multiplier, - input->shift, output->offset, output->multiplier, output->shift); + uint8_t out_val = csi_requantize_u8(input_data[input_index], input->zero_point, input->multiplier, + input->shift, output->zero_point, output->multiplier, output->shift); int32_t out_index = csi_get_index_5(output->dim, i-params->begin[0], j-params->begin[1], k-params->begin[2], l-params->begin[3], m-params->begin[4]); output_data[out_index] = out_val; } @@ -112,11 +112,8 @@ int csi_slice_init(struct csi_tensor *input, struct slice_params *params) { if (params->begin != NULL) { - if (input->dtype == CSINN_DTYPE_UINT8) { - params->bc = csi_strided_slice_u8; - } else if (input->dtype == CSINN_DTYPE_FLOAT32) { - params->bc = csi_strided_slice_f32; - } else { + params->bc = csi_bc_map(params->api, CSINN_OP_SLICE, input->dtype); + if (params->bc == NULL) { return CSINN_UNSUPPORT_DTYPE; } } else { diff --git a/source/reference/softmax.c b/source/reference/softmax.c index 28246b72..fbfd6ddd 100644 --- a/source/reference/softmax.c +++ b/source/reference/softmax.c @@ -20,8 +20,8 @@ #include "csi_utils.h" static int csi_softmax_nhwc_f32(struct csi_tensor *input, - struct csi_tensor *output, - struct softmax_params *params) + struct csi_tensor *output, + struct softmax_params *params) { float *input_data = input->data; float *output_data = output->data; @@ -57,8 +57,8 @@ static int csi_softmax_nhwc_f32(struct csi_tensor *input, } static int csi_softmax_nhwc_u8(struct csi_tensor *input, - struct csi_tensor *output, - struct softmax_params *params) + struct csi_tensor *output, + struct softmax_params *params) { float *float_input_data; float *float_output_data; @@ -80,14 +80,14 @@ static int csi_softmax_nhwc_u8(struct csi_tensor *input, float_output.data = float_output_data; for (int i = 0; i < size; i++) { - float_input_data[i] = 
csi_dequantize_f32(input_data[i], input->offset, + float_input_data[i] = csi_dequantize_u8_to_f32(input_data[i], input->zero_point, input->multiplier, input->shift); } csi_softmax_nhwc_f32(&float_input, &float_output, params); for (int i = 0; i < size; i++) { - output_data[i] = csi_quantize_f32(float_output_data[i], output->offset, + output_data[i] = csi_quantize_f32_to_u8(float_output_data[i], output->zero_point, output->multiplier, output->shift); } free(float_input_data); @@ -95,10 +95,48 @@ static int csi_softmax_nhwc_u8(struct csi_tensor *input, return CSINN_TRUE; } +static int csi_softmax_nhwc_i8(struct csi_tensor *input, + struct csi_tensor *output, + struct softmax_params *params) +{ + float *float_input_data; + float *float_output_data; + struct csi_tensor float_input; + struct csi_tensor float_output; + int8_t *input_data = input->data; + int8_t *output_data = output->data; + int size = 1; + + for (int i = 0; i < input->dim_count; i++) { + size *= input->dim[i]; + } + + memcpy(&float_input, input, sizeof(struct csi_tensor)); + memcpy(&float_output, output, sizeof(struct csi_tensor)); + float_input_data = malloc(size * sizeof(float)); + float_output_data = malloc(size * sizeof(float)); + float_input.data = float_input_data; + float_output.data = float_output_data; + + for (int i = 0; i < size; i++) { + float_input_data[i] = csi_dequantize_u8_to_f32(input_data[i], input->zero_point, + input->multiplier, input->shift); + } + + csi_softmax_nhwc_f32(&float_input, &float_output, params); + + for (int i = 0; i < size; i++) { + output_data[i] = csi_quantize_f32_to_i8(float_output_data[i], output->zero_point, + output->multiplier, output->shift); + } + free(float_input_data); + free(float_output_data); + return CSINN_TRUE; +} static int csi_softmax_nchw_f32(struct csi_tensor *input, - struct csi_tensor *output, - struct softmax_params *params) + struct csi_tensor *output, + struct softmax_params *params) { // assert(input->dim_count - 1 >= axis); @@ -165,8 +203,8 
@@ static int csi_softmax_nchw_f32(struct csi_tensor *input, } static int csi_softmax_nchw_u8(struct csi_tensor *input, - struct csi_tensor *output, - struct softmax_params *params) + struct csi_tensor *output, + struct softmax_params *params) { // assert(input->dim_count - 1 == axis); float *float_input_data; @@ -189,14 +227,14 @@ static int csi_softmax_nchw_u8(struct csi_tensor *input, float_output.data = float_output_data; for (int i = 0; i < size; i++) { - float_input_data[i] = csi_dequantize_f32(input_data[i], input->offset, + float_input_data[i] = csi_dequantize_u8_to_f32(input_data[i], input->zero_point, input->multiplier, input->shift); } csi_softmax_nchw_f32(&float_input, &float_output, params); for (int i = 0; i < size; i++) { - output_data[i] = csi_quantize_f32(float_output_data[i], output->offset, + output_data[i] = csi_quantize_f32_to_u8(float_output_data[i], output->zero_point, output->multiplier, output->shift); } free(float_input_data); @@ -205,29 +243,93 @@ static int csi_softmax_nchw_u8(struct csi_tensor *input, } -int csi_softmax_init(struct csi_tensor *input, - struct csi_tensor *output, - struct softmax_params *params) +static int csi_softmax_nchw_i8(struct csi_tensor *input, + struct csi_tensor *output, + struct softmax_params *params) +{ + // assert(input->dim_count - 1 == axis); + float *float_input_data; + float *float_output_data; + struct csi_tensor float_input; + struct csi_tensor float_output; + int8_t *input_data = input->data; + int8_t *output_data = output->data; + int size = 1; + + for (int i = 0; i < input->dim_count; i++) { + size *= input->dim[i]; + } + + memcpy(&float_input, input, sizeof(struct csi_tensor)); + memcpy(&float_output, output, sizeof(struct csi_tensor)); + float_input_data = malloc(size * sizeof(float)); + float_output_data = malloc(size * sizeof(float)); + float_input.data = float_input_data; + float_output.data = float_output_data; + + for (int i = 0; i < size; i++) { + float_input_data[i] = 
csi_dequantize_i8_to_f32(input_data[i], input->zero_point, + input->multiplier, input->shift); + } + + csi_softmax_nchw_f32(&float_input, &float_output, params); + + for (int i = 0; i < size; i++) { + output_data[i] = csi_quantize_f32_to_i8(float_output_data[i], output->zero_point, + output->multiplier, output->shift); + } + free(float_input_data); + free(float_output_data); + return CSINN_TRUE; +} + +int csi_softmax_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct softmax_params *params) { if (params->layout == CSINN_NCHW) { - if (input->dtype == CSINN_DTYPE_UINT8) { - params->bc = csi_softmax_nchw_u8; - } else if (input->dtype == CSINN_DTYPE_FLOAT32) { - params->bc = csi_softmax_nchw_f32; - } else { - return CSINN_UNSUPPORT_DTYPE; - } - } else if (params->layout = CSINN_NHWC) { - if (input->dtype == CSINN_DTYPE_UINT8) { - params->bc = csi_softmax_nhwc_u8; - } else if (input->dtype == CSINN_DTYPE_FLOAT32) { - params->bc = csi_softmax_nhwc_f32; - } else { - return CSINN_UNSUPPORT_DTYPE; - } + return csi_softmax_nchw_f32(input, output, params); + } else if (params->layout == CSINN_NHWC) { + return csi_softmax_nhwc_f32(input, output, params); + } else { + return CSINN_UNSUPPORT_LAYOUT; + } +} + +int csi_softmax_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct softmax_params *params) +{ + if (params->layout == CSINN_NCHW) { + return csi_softmax_nchw_u8(input, output, params); + } else if (params->layout == CSINN_NHWC) { + return csi_softmax_nhwc_u8(input, output, params); + } else { + return CSINN_UNSUPPORT_LAYOUT; + } +} + +int csi_softmax_i8(struct csi_tensor *input, + struct csi_tensor *output, + struct softmax_params *params) +{ + if (params->layout == CSINN_NCHW) { + return csi_softmax_nchw_i8(input, output, params); + } else if (params->layout == CSINN_NHWC) { + return csi_softmax_nhwc_i8(input, output, params); } else { return CSINN_UNSUPPORT_LAYOUT; +} + +int csi_softmax_init(struct csi_tensor *input, + struct csi_tensor *output, + struct softmax_params *params)
+{ + params->bc = csi_bc_map(params->api, CSINN_OP_SOFTMAX, input->dtype); + if (params->bc == NULL) { + return CSINN_UNSUPPORT_DTYPE; + } return CSINN_TRUE; } diff --git a/source/reference/softplus.c b/source/reference/softplus.c index 4e5b558d..f6eca329 100644 --- a/source/reference/softplus.c +++ b/source/reference/softplus.c @@ -19,10 +19,9 @@ #include "csi_nn.h" #include "csi_utils.h" - -static int csi_softplus_f32(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csi_softplus_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct siso_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -36,9 +35,9 @@ static int csi_softplus_f32(struct csi_tensor *input, return CSINN_TRUE; } -static int csi_softplus_u8(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csi_softplus_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct siso_params *params) { uint8_t *input_data = (uint8_t *)input->data; uint8_t *output_data = (uint8_t *)output->data; @@ -47,10 +46,10 @@ static int csi_softplus_u8(struct csi_tensor *input, size = size * input->dim[i]; } for(int i = 0; i < size; i++) { - float input0_val = csi_dequantize_f32(input_data[i], input->offset, input->multiplier, + float input0_val = csi_dequantize_u8_to_f32(input_data[i], input->zero_point, input->multiplier, input->shift); float res = log(1+exp(input0_val)); - output_data[i] = csi_quantize_f32(res, output->offset, output->multiplier, output->shift); + output_data[i] = csi_quantize_f32_to_u8(res, output->zero_point, output->multiplier, output->shift); } return CSINN_TRUE; } @@ -59,11 +58,8 @@ int csi_softplus_init(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params) { - if(input->dtype == CSINN_DTYPE_UINT8) { - params->bc = csi_softplus_u8; - } else if (input->dtype == CSINN_DTYPE_FLOAT32) { - params->bc = csi_softplus_f32; - } else { 
+ params->bc = csi_bc_map(params->api, CSINN_OP_SOFTPLUS, input->dtype); + if (params->bc == NULL) { return CSINN_UNSUPPORT_DTYPE; } return CSINN_TRUE; diff --git a/source/reference/softrelu.c b/source/reference/softrelu.c index 83738b40..11e31793 100644 --- a/source/reference/softrelu.c +++ b/source/reference/softrelu.c @@ -20,12 +20,12 @@ #include "csi_utils.h" static float softrelu(float x, float y){ - return log(1+exp(fmax(fmin(x,y),y))); + return log(1 + exp(fmax(fmin(x, y), y))); } -static int csi_softrelu_f32(struct csi_tensor *input, - struct csi_tensor *output, - struct relu_params *params) +int csi_softrelu_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct relu_params *params) { float *input_data = input->data; float *output_data = output->data; @@ -40,9 +40,9 @@ static int csi_softrelu_f32(struct csi_tensor *input, return CSINN_TRUE; } -static int csi_softrelu_u8(struct csi_tensor *input, - struct csi_tensor *output, - struct relu_params *params) +int csi_softrelu_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct relu_params *params) { uint8_t *input_data = input->data; uint8_t *output_data = output->data; @@ -51,34 +51,31 @@ static int csi_softrelu_u8(struct csi_tensor *input, size = size * input->dim[i]; } - float n_f = csi_dequantize_f32(1, 0, params->n_multiplier, params->n_shift); + float n_f = csi_dequantize_u8_to_f32(1, 0, params->n_multiplier, params->n_shift); for (int i = 0; i < size; i++) { - float input_val = csi_dequantize_f32(input_data[i], input->offset, input->multiplier, + float input_val = csi_dequantize_u8_to_f32(input_data[i], input->zero_point, input->multiplier, input->shift); float res = softrelu(input_val, n_f); - output_data[i] = csi_quantize_f32(res, output->offset, output->multiplier, output->shift); + output_data[i] = csi_quantize_f32_to_u8(res, output->zero_point, output->multiplier, output->shift); } return CSINN_TRUE; } int csi_softrelu_init(struct csi_tensor *input, - struct csi_tensor 
*output, - struct relu_params *params) + struct csi_tensor *output, + struct relu_params *params) { - if (input->dtype == CSINN_DTYPE_UINT8) { - params->bc = csi_softrelu_u8; - } else if (input->dtype == CSINN_DTYPE_FLOAT32) { - params->bc = csi_softrelu_f32; - } else { + params->bc = csi_bc_map(params->api, CSINN_OP_SOFTRELU, input->dtype); + if (params->bc == NULL) { return CSINN_UNSUPPORT_DTYPE; } return CSINN_TRUE; } int csi_softrelu(struct csi_tensor *input, - struct csi_tensor *output, - struct relu_params *params) + struct csi_tensor *output, + struct relu_params *params) { if (params->bc != NULL) { params->bc(input, output, params); diff --git a/source/reference/softsign.c b/source/reference/softsign.c index 8760c5b2..4142e21f 100644 --- a/source/reference/softsign.c +++ b/source/reference/softsign.c @@ -19,9 +19,9 @@ #include "csi_nn.h" #include "csi_utils.h" -static int csi_softsign_f32(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csi_softsign_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct siso_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -35,9 +35,9 @@ static int csi_softsign_f32(struct csi_tensor *input, return CSINN_TRUE; } -static int csi_softsign_u8(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csi_softsign_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct siso_params *params) { uint8_t *input_data = (uint8_t *)input->data; uint8_t *output_data = (uint8_t *)output->data; @@ -47,11 +47,11 @@ static int csi_softsign_u8(struct csi_tensor *input, } for (int i = 0; i < size; i++) { - float input0_val = csi_dequantize_f32(input_data[i], input->offset, input->multiplier, + float input0_val = csi_dequantize_u8_to_f32(input_data[i], input->zero_point, input->multiplier, input->shift); float res = input0_val / (1+fabs(input0_val)); - output_data[i] = csi_quantize_f32(res, 
output->offset, output->multiplier, output->shift); + output_data[i] = csi_quantize_f32_to_u8(res, output->zero_point, output->multiplier, output->shift); } return CSINN_TRUE; @@ -61,11 +61,8 @@ int csi_softsign_init(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params) { - if(input->dtype == CSINN_DTYPE_UINT8) { - params->bc = csi_softsign_u8; - } else if (input->dtype == CSINN_DTYPE_FLOAT32) { - params->bc = csi_softsign_f32; - } else { + params->bc = csi_bc_map(params->api, CSINN_OP_SOFTSIGN, input->dtype); + if (params->bc == NULL) { return CSINN_UNSUPPORT_DTYPE; } return CSINN_TRUE; diff --git a/source/reference/space_to_batch.c b/source/reference/space_to_batch.c index 2734a930..02df1a8b 100644 --- a/source/reference/space_to_batch.c +++ b/source/reference/space_to_batch.c @@ -21,9 +21,9 @@ //tf.nn.space_to_batch:the input mast a 4-D Tensor with shape [batch, height, width, depth]. -static int csi_space_to_batch_f32(struct csi_tensor *input, - struct csi_tensor *output, - struct space_to_batch_params *params) +int csi_space_to_batch_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct space_to_batch_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -71,9 +71,9 @@ static int csi_space_to_batch_f32(struct csi_tensor *input, return CSINN_TRUE; } -static int csi_space_to_batch_u8(struct csi_tensor *input, - struct csi_tensor *output, - struct space_to_batch_params *params) +int csi_space_to_batch_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct space_to_batch_params *params) { uint8_t *input_data = (uint8_t *)input->data; uint8_t *output_data = (uint8_t *)output->data; @@ -112,8 +112,8 @@ static int csi_space_to_batch_u8(struct csi_tensor *input, int out_start_addr = csi_get_index(output->dim, in_b, out_c, out_h / block_size, out_w / block_size); for(int i = 0; i < block_size2; ++i) { output_data[out_start_addr + i * batch * out_channel * 
out_height * out_width] = - csi_requantize_u8(temp[i], input->offset, input->multiplier, input->shift, - output->offset, output->multiplier, output->shift); + csi_requantize_u8(temp[i], input->zero_point, input->multiplier, input->shift, + output->zero_point, output->multiplier, output->shift); } free(temp); } @@ -128,11 +128,8 @@ int csi_space_to_batch_init(struct csi_tensor *input, struct csi_tensor *output, struct space_to_batch_params *params) { - if (input->dtype == CSINN_DTYPE_UINT8) { - params->bc = csi_space_to_batch_u8; - } else if (input->dtype == CSINN_DTYPE_FLOAT32) { - params->bc = csi_space_to_batch_f32; - } else { + params->bc = csi_bc_map(params->api, CSINN_OP_SPACE_TO_BATCH, input->dtype); + if (params->bc == NULL) { return CSINN_UNSUPPORT_DTYPE; } return CSINN_TRUE; diff --git a/source/reference/space_to_depth.c b/source/reference/space_to_depth.c index 664e09b8..5315e8a2 100644 --- a/source/reference/space_to_depth.c +++ b/source/reference/space_to_depth.c @@ -20,9 +20,9 @@ #include "csi_utils.h" //the input->data is a 4-D Tensor with shape [batch, depth, height, width]. 
-static int csi_space_to_depth_f32(struct csi_tensor *input, - struct csi_tensor *output, - struct space_to_depth_params *params) +int csi_space_to_depth_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct space_to_depth_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -64,9 +64,9 @@ static int csi_space_to_depth_f32(struct csi_tensor *input, return CSINN_TRUE; } -static int csi_space_to_depth_u8(struct csi_tensor *input, - struct csi_tensor *output, - struct space_to_depth_params *params) +int csi_space_to_depth_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct space_to_depth_params *params) { uint8_t *input_data = (uint8_t *)input->data; uint8_t *output_data = (uint8_t *)output->data; @@ -113,11 +113,8 @@ int csi_space_to_depth_init(struct csi_tensor *input, struct csi_tensor *output, struct space_to_depth_params *params) { - if (input->dtype == CSINN_DTYPE_UINT8) { - params->bc = csi_space_to_depth_u8; - } else if (input->dtype == CSINN_DTYPE_FLOAT32) { - params->bc = csi_space_to_depth_f32; - } else { + params->bc = csi_bc_map(params->api, CSINN_OP_SPACE_TO_DEPTH, input->dtype); + if (params->bc == NULL) { return CSINN_UNSUPPORT_DTYPE; } return CSINN_TRUE; diff --git a/source/reference/split.c b/source/reference/split.c index ac87ec6c..a889235c 100644 --- a/source/reference/split.c +++ b/source/reference/split.c @@ -19,9 +19,9 @@ #include "csi_nn.h" #include "csi_utils.h" -static int csi_split_u8(struct csi_tensor *input, - struct csi_tensor *output, - struct split_params *params) +int csi_split_u8(struct csi_tensor *input, + struct csi_tensor **output, + struct split_params *params) { const int32_t batches = input->dim[0]; const int32_t input_depth = input->dim[1]; @@ -41,34 +41,33 @@ static int csi_split_u8(struct csi_tensor *input, } int32_t end[4] = {batches, end_1, input_width, input_height}; int32_t strides[4] = {1, 1, 1, 1}; - struct csi_tensor *output_ptr = output 
+ i; + struct csi_tensor *output_ptr = output[i]; struct slice_params sparams; sparams.layout = CSINN_NCHW; sparams.begin = begin; sparams.end = end; sparams.strides = strides; + sparams.api = CSINN_REF; csi_slice_init(input, output_ptr, &sparams); csi_slice(input, output_ptr, &sparams); } return CSINN_TRUE; } - int csi_split_init(struct csi_tensor *input, - struct csi_tensor *output, - struct split_params *params) + struct csi_tensor **output, + struct split_params *params) { - if (input->dtype == CSINN_DTYPE_UINT8) { - params->bc = csi_split_u8; - } else { + params->bc = csi_bc_map(params->api, CSINN_OP_SPLIT, input->dtype); + if (params->bc == NULL) { return CSINN_UNSUPPORT_DTYPE; } return CSINN_TRUE; } int csi_split(struct csi_tensor *input, - struct csi_tensor *output, - struct split_params *params) + struct csi_tensor **output, + struct split_params *params) { if (params->bc != NULL) { params->bc(input, output, params); diff --git a/source/reference/sqrt.c b/source/reference/sqrt.c index ccd6becf..fe37f9a5 100644 --- a/source/reference/sqrt.c +++ b/source/reference/sqrt.c @@ -20,9 +20,9 @@ #include "csi_utils.h" #include -static int csi_sqrt_f32(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csi_sqrt_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct siso_params *params) { float *input_data = input->data; float *output_data = output->data; @@ -37,9 +37,9 @@ static int csi_sqrt_f32(struct csi_tensor *input, return CSINN_TRUE; } -static int csi_sqrt_u8(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csi_sqrt_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct siso_params *params) { uint8_t *input_data = input->data; uint8_t *output_data = output->data; @@ -49,11 +49,11 @@ static int csi_sqrt_u8(struct csi_tensor *input, } for (int i = 0; i < size; i++) { - float input0_val = csi_dequantize_f32(input_data[i], input->offset, input->multiplier, + 
float input0_val = csi_dequantize_u8_to_f32(input_data[i], input->zero_point, input->multiplier, input->shift); float res = sqrt(input0_val); - output_data[i] = csi_quantize_f32(res, output->offset, output->multiplier, output->shift); + output_data[i] = csi_quantize_f32_to_u8(res, output->zero_point, output->multiplier, output->shift); } return CSINN_TRUE; } @@ -62,11 +62,8 @@ int csi_sqrt_init(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params) { - if (input->dtype == CSINN_DTYPE_UINT8) { - params->bc = csi_sqrt_u8; - } else if (input->dtype == CSINN_DTYPE_FLOAT32) { - params->bc = csi_sqrt_f32; - } else { + params->bc = csi_bc_map(params->api, CSINN_OP_SQRT, input->dtype); + if (params->bc == NULL) { return CSINN_UNSUPPORT_DTYPE; } return CSINN_TRUE; diff --git a/source/reference/square.c b/source/reference/square.c index 12a405bb..7b9ff7f8 100644 --- a/source/reference/square.c +++ b/source/reference/square.c @@ -19,9 +19,9 @@ #include "csi_nn.h" #include "csi_utils.h" -static int csi_square_f32(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csi_square_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct siso_params *params) { float *input_data = input->data; float *output_data = output->data; @@ -40,9 +40,8 @@ int csi_square_init(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params) { - if (input->dtype == CSINN_DTYPE_FLOAT32) { - params->bc = csi_square_f32; - } else { + params->bc = csi_bc_map(params->api, CSINN_OP_SQUARE, input->dtype); + if (params->bc == NULL) { return CSINN_UNSUPPORT_DTYPE; } return CSINN_TRUE; diff --git a/source/reference/squeeze.c b/source/reference/squeeze.c index 1bbf91eb..a626fcc8 100644 --- a/source/reference/squeeze.c +++ b/source/reference/squeeze.c @@ -19,9 +19,9 @@ #include "csi_nn.h" #include "csi_utils.h" -static int csi_squeeze_f32(struct csi_tensor *input, - struct csi_tensor *output, - struct squeeze_params 
*params) +int csi_squeeze_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct squeeze_params *params) { float *input_data = input->data; float *output_data = output->data; @@ -35,9 +35,9 @@ static int csi_squeeze_f32(struct csi_tensor *input, return CSINN_TRUE; } -static int csi_squeeze_u8(struct csi_tensor *input, - struct csi_tensor *output, - struct squeeze_params *params) +int csi_squeeze_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct squeeze_params *params) { uint8_t *input_data = input->data; uint8_t *output_data = output->data; @@ -55,11 +55,8 @@ int csi_squeeze_init(struct csi_tensor *input, struct csi_tensor *output, struct squeeze_params *params) { - if (input->dtype == CSINN_DTYPE_UINT8) { - params->bc = csi_squeeze_u8; - } else if (input->dtype == CSINN_DTYPE_FLOAT32) { - params->bc = csi_squeeze_f32; - } else { + params->bc = csi_bc_map(params->api, CSINN_OP_SQUEEZE, input->dtype); + if (params->bc == NULL) { return CSINN_UNSUPPORT_DTYPE; } return CSINN_TRUE; diff --git a/source/reference/stack.c b/source/reference/stack.c index f173c462..2599ec9d 100644 --- a/source/reference/stack.c +++ b/source/reference/stack.c @@ -19,10 +19,9 @@ #include "csi_nn.h" #include "csi_utils.h" - -static int csi_stack_f32(struct csi_tensor *input, - struct csi_tensor *output, - struct stack_params *params) +int csi_stack_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct stack_params *params) { int input_count = params->inputs_count; int axis = params->axis; @@ -52,9 +51,9 @@ static int csi_stack_f32(struct csi_tensor *input, return CSINN_TRUE; } -static int csi_stack_u8(struct csi_tensor *input, - struct csi_tensor *output, - struct stack_params *params) +int csi_stack_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct stack_params *params) { if (params->axis == -1){ params->axis= input->dim_count -1; @@ -80,14 +79,14 @@ static int csi_stack_u8(struct csi_tensor *input, struct csi_tensor *input_item = 
input + j; uint8_t *input_item_data = (uint8_t *)input_item->data; const uint8_t *input_ptr = input_item_data + i * copy_size; - if(input_item->offset == output->offset && + if(input_item->zero_point == output->zero_point && input_item->multiplier == output->multiplier && input_item->shift == output->shift) { memcpy(output_data, input_ptr, copy_size * sizeof(uint8_t)); } else { for(int n = 0; n < copy_size; n++) { - output_data[j] = csi_requantize_u8(input_ptr[j], input_item->offset, input_item->multiplier, input_item->shift, - output->offset, output->multiplier, output->shift); + output_data[n] = csi_requantize_u8(input_ptr[n], input_item->zero_point, input_item->multiplier, input_item->shift, + output->zero_point, output->multiplier, output->shift); } } output_data += copy_size; @@ -97,22 +96,19 @@ static int csi_stack_u8(struct csi_tensor *input, } int csi_stack_init(struct csi_tensor *input, - struct csi_tensor *output, - struct stack_params *params) + struct csi_tensor *output, + struct stack_params *params) { - if (input->dtype == CSINN_DTYPE_UINT8) { - params->bc = csi_stack_u8; - } else if (input->dtype == CSINN_DTYPE_FLOAT32) { - params->bc = csi_stack_f32; - } else { + params->bc = csi_bc_map(params->api, CSINN_OP_STACK, input->dtype); + if (params->bc == NULL) { return CSINN_UNSUPPORT_DTYPE; } return CSINN_TRUE; } int csi_stack(struct csi_tensor *input, - struct csi_tensor *output, - struct stack_params *params) + struct csi_tensor *output, + struct stack_params *params) { if (params->bc != NULL) { params->bc(input, output, params); diff --git a/source/reference/strided_slice.c b/source/reference/strided_slice.c index 2189b5c3..e2e2ea4c 100644 --- a/source/reference/strided_slice.c +++ b/source/reference/strided_slice.c @@ -19,10 +19,9 @@ #include "csi_nn.h" #include "csi_utils.h" - -static int csi_strided_slice_f32(struct csi_tensor *input, - struct csi_tensor *output, - struct strided_slice_params *params) +int csi_strided_slice_f32(struct csi_tensor 
*input, + struct csi_tensor *output, + struct strided_slice_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -84,9 +83,9 @@ static int csi_strided_slice_f32(struct csi_tensor *input, return CSINN_TRUE; } -static int csi_strided_slice_u8(struct csi_tensor *input, - struct csi_tensor *output, - struct strided_slice_params *params) +int csi_strided_slice_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct strided_slice_params *params) { uint8_t *input_data = (uint8_t *)input->data; uint8_t *output_data = (uint8_t *)output->data; @@ -144,31 +143,27 @@ static int csi_strided_slice_u8(struct csi_tensor *input, } out_size = out_size * inner_size; for(int i = 0; i < out_size; i++) { - output_data[i] = csi_requantize_u8(input_data[i],input->offset, input->multiplier, input->shift, - output->offset, output->multiplier, output->shift); + output_data[i] = csi_requantize_u8(input_data[i],input->zero_point, input->multiplier, input->shift, + output->zero_point, output->multiplier, output->shift); } free(input_data); return CSINN_TRUE; } - int csi_strided_slice_init(struct csi_tensor *input, - struct csi_tensor *output, - struct strided_slice_params *params) + struct csi_tensor *output, + struct strided_slice_params *params) { - if (input->dtype == CSINN_DTYPE_UINT8) { - params->bc = csi_strided_slice_u8; - } else if (input->dtype == CSINN_DTYPE_FLOAT32) { - params->bc = csi_strided_slice_f32; - } else { + params->bc = csi_bc_map(params->api, CSINN_OP_STRIDED_SLICE, input->dtype); + if (params->bc == NULL) { return CSINN_UNSUPPORT_DTYPE; } return CSINN_TRUE; } int csi_strided_slice(struct csi_tensor *input, - struct csi_tensor *output, - struct strided_slice_params *params) + struct csi_tensor *output, + struct strided_slice_params *params) { if (params->bc != NULL) { params->bc(input, output, params); diff --git a/source/reference/sub.c b/source/reference/sub.c index c4c656b8..0ccb6e48 100644 --- 
a/source/reference/sub.c +++ b/source/reference/sub.c @@ -19,10 +19,10 @@ #include "csi_nn.h" #include "csi_utils.h" -static int csi_sub_f32(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int csi_sub_f32(struct csi_tensor *input0, + struct csi_tensor *input1, + struct csi_tensor *output, + struct diso_params *params) { float *input0_data = input0->data; float *input1_data = input1->data; @@ -38,10 +38,10 @@ static int csi_sub_f32(struct csi_tensor *input0, return CSINN_TRUE; } -static int csi_sub_u8(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int csi_sub_u8(struct csi_tensor *input0, + struct csi_tensor *input1, + struct csi_tensor *output, + struct diso_params *params) { uint8_t *input0_data = input0->data; uint8_t *input1_data = input1->data; @@ -53,11 +53,11 @@ static int csi_sub_u8(struct csi_tensor *input0, for (int i = 0; i < size; i++) { float input0_val = - csi_dequantize_f32(input0_data[i], input0->offset, input0->multiplier, input0->shift); + csi_dequantize_u8_to_f32(input0_data[i], input0->zero_point, input0->multiplier, input0->shift); float input1_val = - csi_dequantize_f32(input1_data[i], input1->offset, input1->multiplier, input1->shift); + csi_dequantize_u8_to_f32(input1_data[i], input1->zero_point, input1->multiplier, input1->shift); float res = input0_val - input1_val; - output_data[i] = csi_quantize_f32(res, output->offset, output->multiplier, output->shift); + output_data[i] = csi_quantize_f32_to_u8(res, output->zero_point, output->multiplier, output->shift); } return CSINN_TRUE; } @@ -67,11 +67,8 @@ int csi_sub_init(struct csi_tensor *input0, struct csi_tensor *output, struct diso_params *params) { - if (input0->dtype == CSINN_DTYPE_UINT8) { - params->bc = csi_sub_u8; - } else if (input0->dtype == CSINN_DTYPE_FLOAT32) { - params->bc = csi_sub_f32; - } else { + params->bc = csi_bc_map(params->api, 
CSINN_OP_SUB, input0->dtype); + if (params->bc == NULL) { return CSINN_UNSUPPORT_DTYPE; } return CSINN_TRUE; diff --git a/source/reference/sum.c b/source/reference/sum.c index c5ebcc2e..02304faa 100644 --- a/source/reference/sum.c +++ b/source/reference/sum.c @@ -19,9 +19,9 @@ #include "csi_nn.h" #include "csi_utils.h" -static int csi_sum_stride_f32(struct csi_tensor *input, - struct csi_tensor *output, - struct reduce_params *params) +int csi_sum_stride_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct reduce_params *params) { float *input_data = input->data; @@ -58,9 +58,9 @@ static int csi_sum_stride_f32(struct csi_tensor *input, return CSINN_TRUE; } -static int csi_sum_stride_u8(struct csi_tensor *input, - struct csi_tensor *output, - struct reduce_params *params) +int csi_sum_stride_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct reduce_params *params) { uint8_t *input_data = input->data; @@ -88,29 +88,26 @@ static int csi_sum_stride_u8(struct csi_tensor *input, { int32_t index = out_index + get_reduction_index(inner, params->inner_strides, params->inner_extents, params->m); - float val = csi_dequantize_f32(input_data[index], input->offset, + float val = csi_dequantize_u8_to_f32(input_data[index], input->zero_point, input->multiplier, input->shift); result += val; } - output_data[out] = csi_quantize_f32(result, output->offset, + output_data[out] = csi_quantize_f32_to_u8(result, output->zero_point, output->multiplier, output->shift); } return CSINN_TRUE; } int csi_sum_init(struct csi_tensor *input, - struct csi_tensor *output, - struct reduce_params *params) + struct csi_tensor *output, + struct reduce_params *params) { if (params->n == 0 && params->m == 0) { return CSINN_FALSE; } else { - if (input->dtype == CSINN_DTYPE_UINT8) { - params->bc = csi_sum_stride_u8; - } else if (input->dtype == CSINN_DTYPE_FLOAT32) { - params->bc = csi_sum_stride_f32; - } else { + params->bc = csi_bc_map(params->api, CSINN_OP_SUM, 
input->dtype); + if (params->bc == NULL) { return CSINN_UNSUPPORT_DTYPE; } } diff --git a/source/reference/tan.c b/source/reference/tan.c index b4ae9e77..273b9545 100644 --- a/source/reference/tan.c +++ b/source/reference/tan.c @@ -20,9 +20,9 @@ #include "csi_utils.h" #include -static int csi_tan_f32(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csi_tan_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct siso_params *params) { float *input_data = input->data; float *output_data = output->data; @@ -37,9 +37,9 @@ static int csi_tan_f32(struct csi_tensor *input, return CSINN_TRUE; } -static int csi_tan_u8(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csi_tan_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct siso_params *params) { uint8_t *input_data = input->data; uint8_t *output_data = output->data; @@ -49,11 +49,11 @@ static int csi_tan_u8(struct csi_tensor *input, } for (int i = 0; i < size; i++) { - float input0_val = csi_dequantize_f32(input_data[i], input->offset, input->multiplier, + float input0_val = csi_dequantize_u8_to_f32(input_data[i], input->zero_point, input->multiplier, input->shift); float res = tan(input0_val); - output_data[i] = csi_quantize_f32(res, output->offset, output->multiplier, output->shift); + output_data[i] = csi_quantize_f32_to_u8(res, output->zero_point, output->multiplier, output->shift); } return CSINN_TRUE; } @@ -62,19 +62,16 @@ int csi_tan_init(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params) { - if (input->dtype == CSINN_DTYPE_UINT8) { - params->bc = csi_tan_u8; - } else if (input->dtype == CSINN_DTYPE_FLOAT32) { - params->bc = csi_tan_f32; - } else { + params->bc = csi_bc_map(params->api, CSINN_OP_TAN, input->dtype); + if (params->bc == NULL) { return CSINN_UNSUPPORT_DTYPE; } return CSINN_TRUE; } int csi_tan(struct csi_tensor *input, - struct csi_tensor *output, - struct 
siso_params *params) + struct csi_tensor *output, + struct siso_params *params) { if (params->bc != NULL) { params->bc(input, output, params); diff --git a/source/reference/tanh.c b/source/reference/tanh.c index e7177d46..1980670c 100644 --- a/source/reference/tanh.c +++ b/source/reference/tanh.c @@ -20,9 +20,9 @@ #include "csi_utils.h" #include -static int csi_tanh_f32(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csi_tanh_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct siso_params *params) { float *input_data = input->data; float *output_data = output->data; @@ -37,9 +37,9 @@ static int csi_tanh_f32(struct csi_tensor *input, return CSINN_TRUE; } -static int csi_tanh_f64(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csi_tanh_f64(struct csi_tensor *input, + struct csi_tensor *output, + struct siso_params *params) { double *input_data = input->data; double *output_data = output->data; @@ -54,9 +54,9 @@ static int csi_tanh_f64(struct csi_tensor *input, return CSINN_TRUE; } -static int csi_tanh_u8(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csi_tanh_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct siso_params *params) { uint8_t *input_data = input->data; uint8_t *output_data = output->data; @@ -66,26 +66,21 @@ static int csi_tanh_u8(struct csi_tensor *input, } for (int i = 0; i < size; i++) { - float input0_val = csi_dequantize_f32(input_data[i], input->offset, input->multiplier, + float input0_val = csi_dequantize_u8_to_f32(input_data[i], input->zero_point, input->multiplier, input->shift); float res = tanh(input0_val); - output_data[i] = csi_quantize_f32(res, output->offset, output->multiplier, output->shift); + output_data[i] = csi_quantize_f32_to_u8(res, output->zero_point, output->multiplier, output->shift); } return CSINN_TRUE; } int csi_tanh_init(struct csi_tensor *input, - struct csi_tensor 
*output, - struct siso_params *params) + struct csi_tensor *output, + struct siso_params *params) { - if (input->dtype == CSINN_DTYPE_UINT8) { - params->bc = csi_tanh_u8; - } else if (input->dtype == CSINN_DTYPE_FLOAT32) { - params->bc = csi_tanh_f32; - } else if (input->dtype == CSINN_DTYPE_FLOAT64) { - params->bc = csi_tanh_f64; - } else { + params->bc = csi_bc_map(params->api, CSINN_OP_TANH, input->dtype); + if (params->bc == NULL) { return CSINN_UNSUPPORT_DTYPE; } return CSINN_TRUE; diff --git a/source/reference/threshold_relu.c b/source/reference/threshold_relu.c index caafd3a9..93a0c1f5 100644 --- a/source/reference/threshold_relu.c +++ b/source/reference/threshold_relu.c @@ -24,9 +24,9 @@ static float threshold_relu(float x, float theta){ return x > theta ? x : 0; } -static int csi_threshold_relu_f32(struct csi_tensor *input, - struct csi_tensor *output, - struct relu_params *params) +int csi_threshold_relu_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct relu_params *params) { float *input_data = input->data; float *output_data = output->data; @@ -42,9 +42,9 @@ static int csi_threshold_relu_f32(struct csi_tensor *input, return CSINN_TRUE; } -static int csi_threshold_relu_u8(struct csi_tensor *input, - struct csi_tensor *output, - struct relu_params *params) +int csi_threshold_relu_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct relu_params *params) { uint8_t *input_data = input->data; uint8_t *output_data = output->data; @@ -56,24 +56,21 @@ static int csi_threshold_relu_u8(struct csi_tensor *input, #pragma omp parallel for num_threads(8) for (int i = 0; i < size; i++) { - float input0_val = csi_dequantize_f32(input_data[i], input->offset, input->multiplier, + float input0_val = csi_dequantize_u8_to_f32(input_data[i], input->zero_point, input->multiplier, input->shift); float res = threshold_relu(input0_val, theta); - output_data[i] = csi_quantize_f32(res, output->offset, output->multiplier, output->shift); + 
output_data[i] = csi_quantize_f32_to_u8(res, output->zero_point, output->multiplier, output->shift); } return CSINN_TRUE; } int csi_threshold_relu_init(struct csi_tensor *input, - struct csi_tensor *output, - struct relu_params *params) + struct csi_tensor *output, + struct relu_params *params) { - if (input->dtype == CSINN_DTYPE_UINT8) { - params->bc = csi_threshold_relu_u8; - } else if (input->dtype == CSINN_DTYPE_FLOAT32) { - params->bc = csi_threshold_relu_f32; - } else { + params->bc = csi_bc_map(params->api, CSINN_OP_THRESHOLD_RELU, input->dtype); + if (params->bc == NULL) { return CSINN_UNSUPPORT_DTYPE; } return CSINN_TRUE; diff --git a/source/reference/tile.c b/source/reference/tile.c index dc01267d..b35f80ec 100644 --- a/source/reference/tile.c +++ b/source/reference/tile.c @@ -28,9 +28,9 @@ static int Multiplication(int *dim, int s, int e) return res; } -static int csi_tile_f32(struct csi_tensor *input, - struct csi_tensor *output, - struct tile_params *params) +int csi_tile_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct tile_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -70,9 +70,9 @@ static int csi_tile_f32(struct csi_tensor *input, return CSINN_TRUE; } -static int csi_tile_u8(struct csi_tensor *input, - struct csi_tensor *output, - struct tile_params *params) +int csi_tile_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct tile_params *params) { uint8_t *input_data = (uint8_t *)input->data; uint8_t *output_data = (uint8_t *)output->data; @@ -116,11 +116,8 @@ int csi_tile_init(struct csi_tensor *input, struct csi_tensor *output, struct tile_params *params) { - if (input->dtype == CSINN_DTYPE_UINT8) { - params->bc = csi_tile_u8; - } else if (input->dtype == CSINN_DTYPE_FLOAT32) { - params->bc = csi_tile_f32; - } else { + params->bc = csi_bc_map(params->api, CSINN_OP_TILE, input->dtype); + if (params->bc == NULL) { return CSINN_UNSUPPORT_DTYPE; } return 
CSINN_TRUE; diff --git a/source/reference/topk.c b/source/reference/topk.c new file mode 100644 index 00000000..4fb02ead --- /dev/null +++ b/source/reference/topk.c @@ -0,0 +1,124 @@ +/* + * Copyright (C) 2016-2020 C-SKY Limited. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "csi_nn.h" +#include "csi_utils.h" + +int csi_topk_f32(struct csi_tensor *input, + struct csi_tensor *output1, + struct csi_tensor *output2, + struct topk_params *params) +{ + float *input_data = (float *)input->data; + float *values_data = (float *)output1->data; + int *indices_data = (int *)output2->data; + + int k = params->k; + int last_dim = input->dim[input->dim_count - 1]; + int inner_size = 1; + for(int i = 0; i < input->dim_count - 1; i++) + { + inner_size *= input->dim[i]; + } + float *input_sort_addr = input_data; + for(int n = 0; n < inner_size; n++) { + int *flag = (int *)calloc(last_dim, sizeof(int)); + for(int i = 0; i < k; i++) { + values_data[i] = -FLT_MAX; + for(int j = 0; j < last_dim; j++) { + if(input_sort_addr[j] > values_data[i] && !flag[j]) { + values_data[i] = input_sort_addr[j]; + indices_data[i] = j; + } + } + flag[indices_data[i]] = 1; + } + free(flag); + flag = NULL; + input_sort_addr += last_dim; + values_data += k; + indices_data += k; + } + return CSINN_TRUE; +} + +int csi_topk_u8(struct csi_tensor *input, + struct csi_tensor *output1, + struct csi_tensor *output2, + struct 
topk_params *params) +{ + uint8_t *input_data = (uint8_t *)input->data; + uint8_t *values_data = (uint8_t *)output1->data; + int *indices_data = (int *)output2->data; + + int k = params->k; + int last_dim = input->dim[input->dim_count - 1]; + int inner_size = 1; + for(int i = 0; i < input->dim_count - 1; i++) + { + inner_size *= input->dim[i]; + } + uint8_t *input_sort_addr = input_data; + for(int n = 0; n < inner_size; n++) { + int *flag = (int *)calloc(last_dim, sizeof(int)); + for(int i = 0; i < k; i++) { + values_data[i] = 0; + for(int j = 0; j < last_dim; j++) { + // >= :for k = last_dim + if(input_sort_addr[j] >= values_data[i] && !flag[j]) { + values_data[i] = input_sort_addr[j]; + indices_data[i] = j; + } + } + values_data[i] = csi_requantize_u8(values_data[i], input->zero_point, input->multiplier, input->shift, + output1->zero_point, output1->multiplier, output1->shift); + flag[indices_data[i]] = 1; + } + free(flag); + flag = NULL; + input_sort_addr += last_dim; + values_data += k; + indices_data += k; + } + return CSINN_TRUE; +} + +int csi_topk_init(struct csi_tensor *input, + struct csi_tensor *output1, + struct csi_tensor *output2, + struct topk_params *params) +{ + params->bc = csi_bc_map(params->api, CSINN_OP_TOPK, input->dtype); + if (params->bc == NULL) { + return CSINN_UNSUPPORT_DTYPE; + } + return CSINN_TRUE; +} + +int csi_topk(struct csi_tensor *input, + struct csi_tensor *output1, + struct csi_tensor *output2, + struct topk_params *params) +{ + if (params->bc != NULL) { + params->bc(input, output1, output2, params); + } else { + return CSINN_CALLBACK_UNSET; + } + return CSINN_TRUE; +} diff --git a/source/reference/transpose.c b/source/reference/transpose.c index f93a9631..ef46f1f3 100644 --- a/source/reference/transpose.c +++ b/source/reference/transpose.c @@ -20,9 +20,9 @@ #include "csi_utils.h" #include -static int csi_transpose_f32(struct csi_tensor *input, - struct csi_tensor *output, - struct transpose_params *params) +int 
csi_transpose_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct transpose_params *params) { float *input_data = input->data; float *output_data = output->data; @@ -68,9 +68,9 @@ static int csi_transpose_f32(struct csi_tensor *input, return CSINN_TRUE; } -static int csi_transpose_u8(struct csi_tensor *input, - struct csi_tensor *output, - struct transpose_params *params) +int csi_transpose_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct transpose_params *params) { uint8_t *input_data = input->data; uint8_t *output_data = output->data; @@ -135,15 +135,79 @@ static int csi_transpose_u8(struct csi_tensor *input, return CSINN_TRUE; } +int csi_transpose_i8(struct csi_tensor *input, + struct csi_tensor *output, + struct transpose_params *params) +{ + int8_t *input_data = input->data; + int8_t *output_data = output->data; + const int unextended_output_size = output->dim_count;; + assert(unextended_output_size < 8); + + const int input_ext_size = unextended_output_size - input->dim_count; + const int output_ext_size = unextended_output_size - unextended_output_size; + int extended_perm[unextended_output_size]; + for (int i = 0; i < output_ext_size; ++i) { + extended_perm[i] = i; + } + for (int i = 0; i < unextended_output_size; ++i) { + extended_perm[i + output_ext_size] = params->permute[i] + input_ext_size; + } + int out_sizes[unextended_output_size]; + for (int k = 0; k < unextended_output_size; k++) { + out_sizes[k] = output->dim[k]; + } + int o[unextended_output_size]; // loop index (on output). + int i[unextended_output_size]; + if (unextended_output_size == 4){ + // Naive transpose loop (iterate on output index and compute input index). 
+ for (o[3] = 0; o[3] < out_sizes[3]; o[3]++) { + i[extended_perm[3]] = o[3]; + for (o[2] = 0; o[2] < out_sizes[2]; o[2]++) { + i[extended_perm[2]] = o[2]; + for (o[1] = 0; o[1] < out_sizes[1]; o[1]++) { + i[extended_perm[1]] = o[1]; + for (o[0] = 0; o[0] < out_sizes[0]; o[0]++) { + i[extended_perm[0]] = o[0]; + output_data[csi_get_index(output->dim, o[0], o[1], o[2], o[3])] = + input_data[csi_get_index(input->dim, i[0], i[1], i[2], i[3])]; + } + } + } + } + } + else if (unextended_output_size == 6){ + // Naive transpose loop (iterate on output index and compute input index). + for (o[5] = 0; o[5] < out_sizes[5]; o[5]++) { + i[extended_perm[5]] = o[5]; + for (o[4] = 0; o[4] < out_sizes[4]; o[4]++) { + i[extended_perm[4]] = o[4]; + for (o[3] = 0; o[3] < out_sizes[3]; o[3]++) { + i[extended_perm[3]] = o[3]; + for (o[2] = 0; o[2] < out_sizes[2]; o[2]++) { + i[extended_perm[2]] = o[2]; + for (o[1] = 0; o[1] < out_sizes[1]; o[1]++) { + i[extended_perm[1]] = o[1]; + for (o[0] = 0; o[0] < out_sizes[0]; o[0]++) { + i[extended_perm[0]] = o[0]; + output_data[csi_get_index_6(output->dim, o[0], o[1], o[2], o[3], o[4], o[5])] = + input_data[csi_get_index_6(input->dim, i[0], i[1], i[2], i[3], i[4], i[5])]; + } + } + } + } + } + } + } + return CSINN_TRUE; +} + int csi_transpose_init(struct csi_tensor *input, - struct csi_tensor *output, - struct transpose_params *params) + struct csi_tensor *output, + struct transpose_params *params) { - if (input->dtype == CSINN_DTYPE_UINT8) { - params->bc = csi_transpose_u8; - } else if (input->dtype == CSINN_DTYPE_FLOAT32) { - params->bc = csi_transpose_f32; - } else { + params->bc = csi_bc_map(params->api, CSINN_OP_TRANSPOSE, input->dtype); + if (params->bc == NULL) { return CSINN_UNSUPPORT_DTYPE; } return CSINN_TRUE; diff --git a/source/reference/trunc.c b/source/reference/trunc.c index 24721a9b..a85e91fa 100644 --- a/source/reference/trunc.c +++ b/source/reference/trunc.c @@ -20,9 +20,9 @@ #include "csi_utils.h" #include -static int 
csi_trunc_f32(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csi_trunc_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct siso_params *params) { float *input_data = input->data; float *output_data = output->data; @@ -37,9 +37,9 @@ static int csi_trunc_f32(struct csi_tensor *input, return CSINN_TRUE; } -static int csi_trunc_u8(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csi_trunc_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct siso_params *params) { uint8_t *input_data = input->data; uint8_t *output_data = output->data; @@ -49,11 +49,11 @@ static int csi_trunc_u8(struct csi_tensor *input, } for (int i = 0; i < size; i++) { - float input0_val = csi_dequantize_f32(input_data[i], input->offset, input->multiplier, + float input0_val = csi_dequantize_u8_to_f32(input_data[i], input->zero_point, input->multiplier, input->shift); float res = trunc(input0_val); - output_data[i] = csi_quantize_f32(res, output->offset, output->multiplier, output->shift); + output_data[i] = csi_quantize_f32_to_u8(res, output->zero_point, output->multiplier, output->shift); } return CSINN_TRUE; } @@ -62,11 +62,8 @@ int csi_trunc_init(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params) { - if (input->dtype == CSINN_DTYPE_UINT8) { - params->bc = csi_trunc_u8; - } else if (input->dtype == CSINN_DTYPE_FLOAT32) { - params->bc = csi_trunc_f32; - } else { + params->bc = csi_bc_map(params->api, CSINN_OP_TRUNC, input->dtype); + if (params->bc == NULL) { return CSINN_UNSUPPORT_DTYPE; } return CSINN_TRUE; diff --git a/source/reference/unpooling.c b/source/reference/unpooling.c index 77862eb3..9e181c3c 100644 --- a/source/reference/unpooling.c +++ b/source/reference/unpooling.c @@ -187,29 +187,42 @@ static int csi_unpooling_nchw_u8(struct csi_tensor *input, return CSINN_TRUE; } +int csi_unpooling_f32(struct csi_tensor *input, + struct csi_tensor *mask, + 
struct csi_tensor *output, + struct unpooling_params *params) +{ + if (params->layout == CSINN_NCHW) { + csi_unpooling_nchw_f32(input, mask, output, params); + } else if (params->layout == CSINN_NHWC) { + csi_unpooling_nhwc_f32(input, mask, output, params); + } else { + return CSINN_UNSUPPORT_LAYOUT; + } +} + +int csi_unpooling_u8(struct csi_tensor *input, + struct csi_tensor *mask, + struct csi_tensor *output, + struct unpooling_params *params) +{ + if (params->layout == CSINN_NCHW) { + csi_unpooling_nchw_u8(input, mask, output, params); + } else if (params->layout == CSINN_NHWC) { + csi_unpooling_nhwc_u8(input, mask, output, params); + } else { + return CSINN_UNSUPPORT_LAYOUT; + } +} + int csi_unpooling_init(struct csi_tensor *input, struct csi_tensor *mask, struct csi_tensor *output, struct unpooling_params *params) { - if (params->layout == CSINN_NCHW) { - if (input->dtype == CSINN_DTYPE_UINT8) { - params->bc = csi_unpooling_nchw_u8; - } else if (input->dtype == CSINN_DTYPE_FLOAT32) { - params->bc = csi_unpooling_nchw_f32; - } else { - return CSINN_UNSUPPORT_DTYPE; - } - } else if (params->layout = CSINN_NHWC) { - if (input->dtype == CSINN_DTYPE_UINT8) { - params->bc = csi_unpooling_nhwc_u8; - } else if (input->dtype == CSINN_DTYPE_FLOAT32) { - params->bc = csi_unpooling_nhwc_f32; - } else { - return CSINN_UNSUPPORT_DTYPE; - } - } else { - return CSINN_UNSUPPORT_LAYOUT; + params->bc = csi_bc_map(params->api, CSINN_OP_UNPOOLING, input->dtype); + if (params->bc == NULL) { + return CSINN_UNSUPPORT_DTYPE; } return CSINN_TRUE; } diff --git a/source/reference/unstack.c b/source/reference/unstack.c index b70737e4..5252415d 100644 --- a/source/reference/unstack.c +++ b/source/reference/unstack.c @@ -19,10 +19,9 @@ #include "csi_nn.h" #include "csi_utils.h" - -static int csi_unstack_f32(struct csi_tensor *input, - struct csi_tensor *output, - struct unstack_params *params) +int csi_unstack_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct 
unstack_params *params) { int axis = params->axis; int output_count = input->dim[axis]; @@ -52,9 +51,9 @@ static int csi_unstack_f32(struct csi_tensor *input, return CSINN_TRUE; } -static int csi_unstack_u8(struct csi_tensor *input, - struct csi_tensor *output, - struct unstack_params *params) +int csi_unstack_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct unstack_params *params) { int axis = params->axis; int output_count = input->dim[axis]; @@ -77,14 +76,14 @@ static int csi_unstack_u8(struct csi_tensor *input, struct csi_tensor *output_item = output + j; float *output_item_data = (float *)output_item->data; float *output_ptr = output_item_data + i * copy_size; - if(output_item->offset == input->offset && + if(output_item->zero_point == input->zero_point && output_item->multiplier == input->multiplier && output_item->shift == input->shift) { memcpy(output_ptr, input_data, copy_size * sizeof(float)); } else { for(int n = 0; n < copy_size; n++) { - output_ptr[j] = csi_requantize_u8(input_data[j], input->offset, input->multiplier, input->shift, - output_item->offset, output_item->multiplier, output_item->shift); + output_ptr[j] = csi_requantize_u8(input_data[j], input->zero_point, input->multiplier, input->shift, + output_item->zero_point, output_item->multiplier, output_item->shift); } } input_data += copy_size; @@ -94,14 +93,11 @@ static int csi_unstack_u8(struct csi_tensor *input, } int csi_unstack_init(struct csi_tensor *input, - struct csi_tensor *output, - struct unstack_params *params) + struct csi_tensor *output, + struct unstack_params *params) { - if (input->dtype == CSINN_DTYPE_UINT8) { - params->bc = csi_unstack_u8; - } else if (input->dtype == CSINN_DTYPE_FLOAT32) { - params->bc = csi_unstack_f32; - } else { + params->bc = csi_bc_map(params->api, CSINN_OP_UNSTACK, input->dtype); + if (params->bc == NULL) { return CSINN_UNSUPPORT_DTYPE; } return CSINN_TRUE; diff --git a/source/reference/utils.c b/source/reference/utils.c index 
56af0bd0..94c8b212 100644 --- a/source/reference/utils.c +++ b/source/reference/utils.c @@ -18,6 +18,7 @@ #include "csi_nn.h" #include "csi_utils.h" +#include int32_t csi_max_internal_s32(int32_t a, int32_t b) { @@ -121,28 +122,58 @@ uint8_t csi_quantize_u8(int32_t input, int32_t offset, int32_t multiplier, int32 return csi_min_internal_s32(255, csi_max_internal_s32(0, output)); } -float csi_dequantize_f32(uint8_t input, int32_t offset, int32_t multiplier, int32_t shift) +int8_t csi_quantize_i8(int32_t input, int32_t offset, int32_t multiplier, int32_t shift) +{ + int left_shift = shift > 0 ? shift : 0; + int right_shift = shift > 0 ? 0 : -shift; + int32_t output = round_div_pot(high_mul_sat_round_double(input * (1 << left_shift), multiplier), right_shift); + output += offset; + return csi_min_internal_s32(127, csi_max_internal_s32(-127, output)); +} + +uint8_t csi_quantize_channel_u8(int32_t data, struct csi_tensor* input, struct csi_tensor* output, float wscale) +{ + float out = data * input->scale * wscale; + return csi_quantize_f32_to_u8(out, output->zero_point, output->multiplier, output->shift); +} + +float csi_dequantize_u8_to_f32(uint8_t input, int32_t offset, int32_t multiplier, int32_t shift) +{ + float x = input; + x -= offset; + float scale = csi_get_scale(multiplier, shift); + return x * scale; +} + +float csi_dequantize_i8_to_f32(int8_t input, int32_t offset, int32_t multiplier, int32_t shift) { float x = input; - x += offset; + x -= offset; float scale = csi_get_scale(multiplier, shift); return x * scale; } -uint8_t csi_quantize_f32(float input, int32_t offset, int32_t multiplier, int32_t shift) +uint8_t csi_quantize_f32_to_u8(float input, int32_t offset, int32_t multiplier, int32_t shift) { float scale = csi_get_scale(multiplier, shift); float output = round(input / scale + offset); return fmin(255, fmax(0, output)); } +int8_t csi_quantize_f32_to_i8(float input, int32_t offset, int32_t multiplier, int32_t shift) +{ + float scale = 
csi_get_scale(multiplier, shift); + float output = round(input / scale + offset); + return fmin(127, fmax(-127, output)); +} + uint8_t csi_requantize_u8(uint8_t input, int32_t input_offset, int32_t input_multiplier, int32_t input_shift, int32_t output_offset, int32_t output_multiplier, int32_t output_shift) { - float val = csi_dequantize_f32(input, input_offset, input_multiplier, input_shift); - return csi_quantize_f32(val, output_offset, output_multiplier, output_shift); + float val = csi_dequantize_u8_to_f32(input, input_offset, input_multiplier, input_shift); + return csi_quantize_f32_to_u8(val, output_offset, output_multiplier, output_shift); } struct csi_tensor *csi_deconv_kernel_nchw_to_nhwc_u8(struct csi_tensor *t, int32_t *permute) @@ -172,13 +203,14 @@ struct csi_tensor *csi_deconv_kernel_nchw_to_nhwc_u8(struct csi_tensor *t, int32 struct transpose_params tparams; tparams.permute = permute; + tparams.api = CSINN_REF; csi_transpose_init(t, nt, &tparams); csi_transpose(t, nt, &tparams); t->dim_count = t_dim; return nt; } -struct csi_tensor *csi_nchw_to_nhwc_u8(struct csi_tensor *t) +struct csi_tensor *csi_nchw_to_nhwc_8(struct csi_tensor *t) { struct csi_tensor *nt = malloc(sizeof(struct csi_tensor)); @@ -205,13 +237,14 @@ struct csi_tensor *csi_nchw_to_nhwc_u8(struct csi_tensor *t) struct transpose_params tparams; tparams.permute = permute; + tparams.api = CSINN_REF; csi_transpose_init(t, nt, &tparams); csi_transpose(t, nt, &tparams); t->dim_count = t_dim; return nt; } -void csi_nhwc_to_nchw_u8(struct csi_tensor *nt, struct csi_tensor *t) +void csi_nhwc_to_nchw_8(struct csi_tensor *nt, struct csi_tensor *t) { nt->dim[1] = t->dim[3]; nt->dim[2] = t->dim[1]; @@ -224,6 +257,7 @@ void csi_nhwc_to_nchw_u8(struct csi_tensor *nt, struct csi_tensor *t) struct transpose_params tparams; tparams.permute = permute; + tparams.api = CSINN_REF; csi_transpose_init(t, nt, &tparams); csi_transpose(t, nt, &tparams); @@ -260,6 +294,7 @@ struct csi_tensor 
*csi_nchw_to_nhwc_f32(struct csi_tensor *t) struct transpose_params tparams; tparams.permute = permute; + tparams.api = CSINN_REF; csi_transpose_init(t, nt, &tparams); csi_transpose(t, nt, &tparams); t->dim_count = t_dim; @@ -279,6 +314,7 @@ void csi_nhwc_to_nchw_f32(struct csi_tensor *nt, struct csi_tensor *t) struct transpose_params tparams; tparams.permute = permute; + tparams.api = CSINN_REF; csi_transpose_init(t, nt, &tparams); csi_transpose(t, nt, &tparams); @@ -289,18 +325,244 @@ void csi_nhwc_to_nchw_f32(struct csi_tensor *nt, struct csi_tensor *t) } int32_t get_reduction_index(int32_t k, const int32_t *strides, - const int32_t *extents, int32_t n) { + const int32_t *extents, int32_t n) +{ + int32_t index = 0; + for (int32_t i = 0; i < n; i++) + { + int32_t div = 1; + for (int32_t j = i + 1; j < n; j++) + { + div *= extents[j]; + } + int32_t mod = div * extents[i]; + + index += ((k % mod) / div * strides[i]); + } + + return index; +} + +float uint8_to_float(uint8_t i, struct csi_tensor *t) +{ + return ((float)i - t->zero_point) * t->scale; +} + +float int8_to_float(int8_t i, struct csi_tensor *t) +{ + return ((float)i - t->zero_point) * t->scale; +} + +uint8_t float_to_uint8(float i, struct csi_tensor *t) +{ + float ret = round(i / t->scale) + t->zero_point; + if (ret > 255) { + return 255; + } else if (ret < 0) { + return 0; + } else { + return ret; + } +} - int32_t index = 0; - for (int32_t i = 0; i < n; i++) { - int32_t div = 1; - for (int32_t j = i + 1; j < n; j++) { - div *= extents[j]; +int8_t float_to_int8(float i, struct csi_tensor *t) +{ + int8_t ret = round(i / t->scale) + t->zero_point; + if (ret > 127) { + return 127; + } else if (ret < -127) { + return 127; + } else { + return ret; } - int32_t mod = div * extents[i]; +} - index += ((k % mod) / div * strides[i]); - } +int64_t conv_out_u8(int64_t res, + struct csi_tensor *input, + struct csi_tensor *output, + struct csi_tensor *kernel) +{ + float t = res * input->scale * kernel->scale / 
output->scale; + if (t < 0) { + t = 0; + } + int32_t out = round(t + output->zero_point); + if (out < 0) { + return 0; + } else if (out > 255) { + return 255; + } else { + return out; + } +} - return index; -} \ No newline at end of file +int64_t conv_out_i8(int64_t res, + struct csi_tensor *input, + struct csi_tensor *output, + struct csi_tensor *kernel) +{ + float t = res * input->scale * kernel->scale / output->scale; + if (t < 0) { + t = 0; + } + int32_t out = round(t + output->zero_point); + if (out < 0) { + return 0; + } else if (out > 127) { + return 127; + } else { + return out; + } +} + +int64_t conv_relu6_out_u8(int64_t res, + struct csi_tensor *input, + struct csi_tensor *output, + struct csi_tensor *kernel) +{ + float t = res * input->scale * kernel->scale; + if (t < 0) { + t = 0; + } else if (t * output->scale > 6) { + t = 6; + } + int32_t out = round(t / output->scale + output->zero_point); + if (out < 0) { + return 0; + } else if (out > 255) { + return 255; + } else { + return out; + } +} + +int64_t conv_relu6_out_i8(int64_t res, + struct csi_tensor *input, + struct csi_tensor *output, + struct csi_tensor *kernel) +{ + float t = res * input->scale * kernel->scale; + if (t < 0) { + t = 0; + } else if (t * output->scale > 6) { + t = 6; + } + int32_t out = round(t / output->scale + output->zero_point); + if (out < 0) { + return 0; + } else if (out > 127) { + return 127; + } else { + return out; + } +} + +float uint8_to_float_channel(uint8_t i, float scale, int32_t zero_point) +{ + return ((float)i - zero_point) * scale; +} + +int64_t conv_channel_out_u8(int64_t res, + struct csi_tensor *input, + struct csi_tensor *output, + float kscale) +{ + float t = res * input->scale * kscale / output->scale; + if (t < 0) { + t = 0; + } + int32_t out = round(t + output->zero_point); + if (out < 0) { + return 0; + } else if (out > 255) { + return 255; + } else { + return out; + } +} +int64_t conv_channel_relu6_u8(int64_t res, + struct csi_tensor *input, + struct 
csi_tensor *output, + float kscale) +{ + float t = res * input->scale * kscale; + if (t < 0) { + t = 0; + } else if (t > 6) { + t = 6; + } + int32_t out = round(t / output->scale + output->zero_point); + if (out < 0) { + return 0; + } else if (out > 255) { + return 255; + } else { + return out; + } +} + +void csi_statistical_mean_std(float *data, int sz) +{ + int i = 0; + float max_value = data[0]; + float min_value = data[0]; + double std = 0.0; + double sum = 0.0; + for (i = 0; i < sz; i++) { + sum += data[i]; + if (data[i] > max_value) { + max_value = data[i]; + } + if (data[i] < min_value) { + min_value = data[i]; + } + } + double mean = sum / sz; + sum = 0.0; + for (i = 0; i < sz; i++) { + sum += ((data[i] - mean) * (data[i] - mean)); + } + std = sum / sz; + printf("The max_value of output: %lf\n", max_value); + printf("The min_value of output: %lf\n", min_value); + printf("The mean_value of output: %lf\n", mean); + printf("The std_value of output: %lf\n", std); +} + +void csi_get_top5(float *buf, + uint32_t size, + float *prob, + uint32_t *class) +{ + uint32_t i, j, k; + + memset(prob, 0xfe, sizeof(float) * 5); + memset(class, 0xff, sizeof(uint32_t) * 5); + + for (j = 0; j < 5; j++) { + for (i = 0; i < size; i++) { + for (k = 0; k < 5; k++) { + if (i == class[k]) { + break; + } + } + + if (k != 5) { + continue; + } + + if (buf[i] > prob[j]) { + prob[j] = buf[i]; + class[j] = i; + } + } + } +} + +#define BILLION 1000000000 +uint64_t csi_get_timespec() +{ + struct timespec ts; + clock_gettime(CLOCK_MONOTONIC, &ts); + return (uint64_t)((uint64_t)ts.tv_nsec + (uint64_t)ts.tv_sec * BILLION); +} diff --git a/source/reference/xor.c b/source/reference/xor.c index 356b1eee..085e8349 100644 --- a/source/reference/xor.c +++ b/source/reference/xor.c @@ -19,10 +19,10 @@ #include "csi_nn.h" #include -static int csi_xor_u32(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int csi_xor_u32(struct csi_tensor 
*input0, + struct csi_tensor *input1, + struct csi_tensor *output, + struct diso_params *params) { uint32_t *input0_data = input0->data; uint32_t *input1_data = input1->data; @@ -38,10 +38,10 @@ static int csi_xor_u32(struct csi_tensor *input0, return CSINN_TRUE; } -static int csi_xor_u8(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int csi_xor_u8(struct csi_tensor *input0, + struct csi_tensor *input1, + struct csi_tensor *output, + struct diso_params *params) { uint8_t *input0_data = input0->data; uint8_t *input1_data = input1->data; @@ -62,11 +62,8 @@ int csi_xor_init(struct csi_tensor *input0, struct csi_tensor *output, struct diso_params *params) { - if (input0->dtype == CSINN_DTYPE_UINT8) { - params->bc = csi_xor_u8; - } else if (input0->dtype == CSINN_DTYPE_UINT32) { - params->bc = csi_xor_u32; - } else { + params->bc = csi_bc_map(params->api, CSINN_OP_XOR, input0->dtype); + if (params->bc == NULL) { return CSINN_UNSUPPORT_DTYPE; } return CSINN_TRUE; diff --git a/source/reference/yuv_rgb_scale.c b/source/reference/yuv_rgb_scale.c index 37a7e977..3bcd42e0 100644 --- a/source/reference/yuv_rgb_scale.c +++ b/source/reference/yuv_rgb_scale.c @@ -21,9 +21,9 @@ /* https://github.com/tensorflow/tensorflow/blob/v2.3.0/tensorflow/python/ops/image_ops_impl.py#L3279-L3325 line 3279*/ -static int csi_yuv_rgb_scale_f32(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csi_yuv_rgb_scale_f32(struct csi_tensor *input, + struct csi_tensor *output, + struct siso_params *params) { float *input_data = input->data; float *output_data = output->data; @@ -51,9 +51,9 @@ static int csi_yuv_rgb_scale_f32(struct csi_tensor *input, return CSINN_TRUE; } -static int csi_yuv_rgb_scale_u8(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csi_yuv_rgb_scale_u8(struct csi_tensor *input, + struct csi_tensor *output, + struct siso_params *params) { 
uint8_t *input_data = input->data; @@ -62,18 +62,18 @@ static int csi_yuv_rgb_scale_u8(struct csi_tensor *input, for(int n = 0; n < input->dim[0]; n++){ for(int h = 0; h < input->dim[1]; h++){ for(int w = 0; w < input->dim[2]; w++){ - float y = csi_dequantize_f32(input_data[0], input->offset, input->multiplier, input->shift); - float u = csi_dequantize_f32(input_data[1], input->offset, input->multiplier, input->shift); - float v = csi_dequantize_f32(input_data[2], input->offset, input->multiplier, input->shift); + float y = csi_dequantize_u8_to_f32(input_data[0], input->zero_point, input->multiplier, input->shift); + float u = csi_dequantize_u8_to_f32(input_data[1], input->zero_point, input->multiplier, input->shift); + float v = csi_dequantize_u8_to_f32(input_data[2], input->zero_point, input->multiplier, input->shift); float r = y + 1.13988303 * v; float g = y - 0.394642334 * u - 0.58062185 * v; float b = y + 2.03206185 * u; input_data += 3; - output_data[0] = csi_quantize_f32(r, output->offset, output->multiplier, output->shift); - output_data[1] = csi_quantize_f32(g, output->offset, output->multiplier, output->shift); - output_data[2] = csi_quantize_f32(b, output->offset, output->multiplier, output->shift); + output_data[0] = csi_quantize_f32_to_u8(r, output->zero_point, output->multiplier, output->shift); + output_data[1] = csi_quantize_f32_to_u8(g, output->zero_point, output->multiplier, output->shift); + output_data[2] = csi_quantize_f32_to_u8(b, output->zero_point, output->multiplier, output->shift); output_data += 3; } } @@ -83,22 +83,19 @@ static int csi_yuv_rgb_scale_u8(struct csi_tensor *input, } int csi_yuv_rgb_scale_init(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) + struct csi_tensor *output, + struct siso_params *params) { - if (input->dtype == CSINN_DTYPE_UINT8) { - params->bc = csi_yuv_rgb_scale_u8; - } else if (input->dtype == CSINN_DTYPE_FLOAT32) { - params->bc = csi_yuv_rgb_scale_f32; - } else { + 
params->bc = csi_bc_map(params->api, CSINN_OP_YUV_RGB_SCALE, input->dtype); + if (params->bc == NULL) { return CSINN_UNSUPPORT_DTYPE; } return CSINN_TRUE; } int csi_yuv_rgb_scale(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) + struct csi_tensor *output, + struct siso_params *params) { if (params->bc != NULL) { params->bc(input, output, params);