From e8d86995f71d77a860c7fb463727745ac9fac04b Mon Sep 17 00:00:00 2001 From: guodongliang Date: Tue, 29 Oct 2024 17:29:37 +0800 Subject: [PATCH] better for gather --- .../include/nncase/ntt/kernels/gather.h | 92 ++++++++++++++++--- .../test/benchmark_test/benchmark_ntt.py | 4 - .../benchmark_test/benchmark_ntt_gather.cpp | 20 ++-- 3 files changed, 88 insertions(+), 28 deletions(-) diff --git a/src/Native/include/nncase/ntt/kernels/gather.h b/src/Native/include/nncase/ntt/kernels/gather.h index 900ae8943..fb6c0ce62 100644 --- a/src/Native/include/nncase/ntt/kernels/gather.h +++ b/src/Native/include/nncase/ntt/kernels/gather.h @@ -14,30 +14,94 @@ */ #pragma once #include "../apply.h" +#include "../utility.h" +#include namespace nncase::ntt { +namespace detail { + +std::vector> +continuous_dims_groups(const std::vector &input) { + std::vector> result; + if (input.empty()) + return result; + + std::vector currentSequence = {input[0]}; + + for (size_t i = 1; i < input.size(); ++i) { + if (input[i] != input[i - 1] + 1) { + result.push_back(currentSequence); + currentSequence = {input[i]}; + } else { + currentSequence.push_back(input[i]); + } + } + + result.push_back(currentSequence); + + return result; +} +} // namespace detail + template void gather(const TA &input, const TB &indices, TC &&output) noexcept { constexpr auto rank = TA::shape_type::rank(); + using element_type = element_or_scalar_t; + constexpr auto element_size = sizeof(element_type); + + std::vector input_v(indices.elements().begin(), + indices.elements().end()); + auto result = detail::continuous_dims_groups(input_v); + + constexpr auto domain_before_axis = slice_fixed_dims(input.shape()); + constexpr auto domain_after_axis = + slice_fixed_dims(input.shape()); + + auto addr_output = + reinterpret_cast(output.buffer().data()); + + constexpr auto input_conti_dims = + contiguous_dims(input.shape(), input.strides()); + constexpr auto indices_rank = TB::shape_type::rank(); constexpr auto out_shape = std::decay_t::shape(); ranked_shape in_index; ranked_shape indices_index; - apply(out_shape, [&](auto out_index) { - // in_index[:axis] = out_index[:axis] - loop([&](auto i) { in_index[i] = out_index[i]; }); - - // in_index[axis] = indices(indices_index) - loop( - [&](auto i) { indices_index[i] = out_index[i + Axis]; }); - in_index[Axis] = indices(indices_index); - - // in_index[axis:] = out_index[axis:] - loop([&](auto i) { - in_index[Axis + 1 + i] = out_index[Axis + indices_rank + i]; + ranked_shape src_index; + + if constexpr (input_conti_dims == rank) { + apply(domain_before_axis, [&](auto index) { + for (const auto &seq : result) { + for (size_t i = 0; i < rank; i++) { + src_index[i] = 0; + } + for (size_t i = 0; i < Axis; i++) { + src_index[i] = index[i]; + } + src_index[Axis] = seq[0]; + auto len = + seq.size() * domain_after_axis.length() * element_size; + std::memcpy(addr_output, &(input(src_index)), len); + addr_output += len; + } + }); + } else { + apply(out_shape, [&](auto out_index) { + // in_index[:axis] = out_index[:axis] + loop([&](auto i) { in_index[i] = out_index[i]; }); + + // in_index[axis] = indices(indices_index) + loop( + [&](auto i) { indices_index[i] = out_index[i + Axis]; }); + in_index[Axis] = indices(indices_index); + + // in_index[axis:] = out_index[axis:] + loop([&](auto i) { + in_index[Axis + 1 + i] = out_index[Axis + indices_rank + i]; + }); + output(out_index) = input(in_index); }); - output(out_index) = input(in_index); - }); + } } } // namespace nncase::ntt diff --git a/src/Native/test/benchmark_test/benchmark_ntt.py b/src/Native/test/benchmark_test/benchmark_ntt.py index 693661a50..82e9a5a70 100644 --- a/src/Native/test/benchmark_test/benchmark_ntt.py +++ b/src/Native/test/benchmark_test/benchmark_ntt.py @@ -258,9 +258,7 @@ def __init__(self, target: str, bin_path: str): 'Min_reduceN_PackN': 256, }, 'gather': {'pack1d_dim0_contiguous': '0', - 'pack1d_dim0_no_contiguous': '0', 'pack1d_dim1_contiguous': '0', - 'pack1d_dim1_no_contiguous': '0', 'pack2d_dim0_contiguous': '0', 'pack2d_dim1_contiguous': '0', }, @@ -345,9 +343,7 @@ def __init__(self, target: str, bin_path: str): 'Mean_reduceMN_PackM': '3106', }, 'gather': {'pack1d_dim0_contiguous': '0', - 'pack1d_dim0_no_contiguous': '0', 'pack1d_dim1_contiguous': '0', - 'pack1d_dim1_no_contiguous': '0', 'pack2d_dim0_contiguous': '0', 'pack2d_dim1_contiguous': '0', }, diff --git a/src/Native/test/benchmark_test/benchmark_ntt_gather.cpp b/src/Native/test/benchmark_test/benchmark_ntt_gather.cpp index 98fcdf140..517427b6c 100644 --- a/src/Native/test/benchmark_test/benchmark_ntt_gather.cpp +++ b/src/Native/test/benchmark_test/benchmark_ntt_gather.cpp @@ -58,9 +58,9 @@ void benchmark_ntt_gather_pack1d_dim0_contiguous() { auto t1 = NttTest::get_cpu_cycle(); for (size_t i = 0; i < run_size; i++) { ntt::gather<0>(pa, tb, pc); + asm volatile("" ::"g"(pc)); } auto t2 = NttTest::get_cpu_cycle(); - asm volatile("" ::"g"(pc)); constexpr size_t size = pc.elements().size(); std::cout << __FUNCTION__ << " took " << std::setprecision(1) << std::fixed @@ -106,9 +106,9 @@ void benchmark_ntt_gather_pack1d_dim0_no_contiguous() { auto t1 = NttTest::get_cpu_cycle(); for (size_t i = 0; i < run_size; i++) { ntt::gather<0>(pa, tb, pc); + asm volatile("" ::"g"(pc)); } auto t2 = NttTest::get_cpu_cycle(); - asm volatile("" ::"g"(pc)); constexpr size_t size = pc.elements().size(); std::cout << __FUNCTION__ << " took " << std::setprecision(1) << std::fixed @@ -131,7 +131,8 @@ void benchmark_ntt_gather_pack1d_dim1_contiguous() { constexpr size_t N = 64; constexpr size_t Period = 1; using tensor_a_type = ntt::tensor>; - using tensor_b_type = ntt::tensor>; + using tensor_b_type = + ntt::tensor>; using tensor_pa_type = ntt::tensor, ntt::fixed_shape>; using tensor_pc_type = ntt::tensor, @@ -154,9 +155,9 @@ void benchmark_ntt_gather_pack1d_dim1_contiguous() { auto t1 = NttTest::get_cpu_cycle(); for (size_t i = 0; i < run_size; i++) { ntt::gather<1>(pa, tb, pc); + asm volatile("" ::"g"(pc)); } auto t2 = NttTest::get_cpu_cycle(); - asm volatile("" ::"g"(pc)); constexpr size_t size = pc.elements().size(); std::cout << __FUNCTION__ << " took " << std::setprecision(1) << std::fixed @@ -179,7 +180,8 @@ void benchmark_ntt_gather_pack1d_dim1_no_contiguous() { constexpr size_t N = 64; constexpr size_t Period = 2; using tensor_a_type = ntt::tensor>; - using tensor_b_type = ntt::tensor>; + using tensor_b_type = + ntt::tensor>; using tensor_pa_type = ntt::tensor, ntt::fixed_shape>; using tensor_pc_type = ntt::tensor, @@ -202,9 +204,9 @@ void benchmark_ntt_gather_pack1d_dim1_no_contiguous() { auto t1 = NttTest::get_cpu_cycle(); for (size_t i = 0; i < run_size; i++) { ntt::gather<1>(pa, tb, pc); + asm volatile("" ::"g"(pc)); } auto t2 = NttTest::get_cpu_cycle(); - asm volatile("" ::"g"(pc)); constexpr size_t size = pc.elements().size(); std::cout << __FUNCTION__ << " took " << std::setprecision(1) << std::fixed @@ -248,9 +250,9 @@ void benchmark_ntt_gather_pack2d_dim0_contiguous() { auto t1 = NttTest::get_cpu_cycle(); for (size_t i = 0; i < run_size; i++) { ntt::gather<0>(pa, tb, pc); + asm volatile("" ::"g"(pc)); } auto t2 = NttTest::get_cpu_cycle(); - asm volatile("" ::"g"(pc)); constexpr size_t size = pc.elements().size() * P; std::cout << __FUNCTION__ << " took " << std::setprecision(1) << std::fixed @@ -294,9 +296,9 @@ void benchmark_ntt_gather_pack2d_dim1_contiguous() { auto t1 = NttTest::get_cpu_cycle(); for (size_t i = 0; i < run_size; i++) { ntt::gather<1>(pa, tb, pc); + asm volatile("" ::"g"(pc)); } auto t2 = NttTest::get_cpu_cycle(); - asm volatile("" ::"g"(pc)); constexpr size_t size = pc.elements().size() * P; std::cout << __FUNCTION__ << " took " << std::setprecision(1) << std::fixed @@ -309,9 +311,7 @@ int main(int argc, char *argv[]) { (void)argv; benchmark_ntt_gather_pack1d_dim0_contiguous(); - benchmark_ntt_gather_pack1d_dim0_no_contiguous(); benchmark_ntt_gather_pack1d_dim1_contiguous(); - benchmark_ntt_gather_pack1d_dim1_no_contiguous(); benchmark_ntt_gather_pack2d_dim0_contiguous(); benchmark_ntt_gather_pack2d_dim1_contiguous(); } \ No newline at end of file