diff --git a/src/Native/include/nncase/ntt/arch/riscv64/ukernels.h b/src/Native/include/nncase/ntt/arch/riscv64/ukernels.h
index 61aa9a56c..a48671ce2 100644
--- a/src/Native/include/nncase/ntt/arch/riscv64/ukernels.h
+++ b/src/Native/include/nncase/ntt/arch/riscv64/ukernels.h
@@ -23,54 +23,73 @@
 namespace nncase::ntt::ukernels {
 
 // unary
-template <typename T>
-struct u_unary_policy<ntt::ops::abs<vector<T, NTT_VLEN / sizeof(T) / 8>>,
-                      vector<T, NTT_VLEN / sizeof(T) / 8>, true> {
-    static constexpr size_t unroll = 8;
-};
-
-template <typename T>
-struct u_unary_policy<ntt::ops::ceil<vector<T, NTT_VLEN / sizeof(T) / 8>>,
-                      vector<T, NTT_VLEN / sizeof(T) / 8>, true> {
-    static constexpr size_t unroll = 8;
-};
-
-template <typename T>
-struct u_unary_policy<ntt::ops::floor<vector<T, NTT_VLEN / sizeof(T) / 8>>,
-                      vector<T, NTT_VLEN / sizeof(T) / 8>, true> {
-    static constexpr size_t unroll = 8;
-};
-
-template <typename T>
-struct u_unary_policy<ntt::ops::neg<vector<T, NTT_VLEN / sizeof(T) / 8>>,
-                      vector<T, NTT_VLEN / sizeof(T) / 8>, true> {
-    static constexpr size_t unroll = 8;
-};
-
-template <typename T>
-struct u_unary_policy<ntt::ops::round<vector<T, NTT_VLEN / sizeof(T) / 8>>,
-                      vector<T, NTT_VLEN / sizeof(T) / 8>, true> {
-    static constexpr size_t unroll = 8;
-};
-
-template <typename T>
-struct u_unary_policy<ntt::ops::sign<vector<T, NTT_VLEN / sizeof(T) / 8>>,
-                      vector<T, NTT_VLEN / sizeof(T) / 8>, true> {
-    static constexpr size_t unroll = 8;
-};
-
-template <typename T>
-struct u_unary_policy<ntt::ops::square<vector<T, NTT_VLEN / sizeof(T) / 8>>,
-                      vector<T, NTT_VLEN / sizeof(T) / 8>, true> {
-    static constexpr size_t unroll = 8;
-};
-
+// Specializes u_unary_policy (last template argument true) for ntt::ops::op
+// on vector<T, NTT_VLEN / sizeof(T) / 8>, with unroll_num as its unroll factor.
+#define SPECIALIZE_U_UNARY(op, unroll_num)                                     \
+    template <typename T>                                                      \
+    struct u_unary_policy<ntt::ops::op<vector<T, NTT_VLEN / sizeof(T) / 8>>,   \
+                          vector<T, NTT_VLEN / sizeof(T) / 8>, true> {         \
+        static constexpr size_t unroll = unroll_num;                           \
+    };
+
+SPECIALIZE_U_UNARY(abs, 8)
+SPECIALIZE_U_UNARY(ceil, 8)
+SPECIALIZE_U_UNARY(floor, 8)
+SPECIALIZE_U_UNARY(neg, 8)
+SPECIALIZE_U_UNARY(round, 8)
+SPECIALIZE_U_UNARY(sign, 8)
+SPECIALIZE_U_UNARY(square, 8)
+
+#undef SPECIALIZE_U_UNARY
+
+// binary
+// Specializes u_binary_policy (last template argument true) for ntt::ops::op,
+// with unroll_num as the unroll factor, for three operand pairings:
+// vector/vector, T1/vector and vector/T2 (lanes = NTT_VLEN / sizeof(elem) / 8).
+#define SPECIALIZE_U_BINARY(op, unroll_num)                                    \
+    template <typename T1, typename T2>                                        \
+    struct u_binary_policy<                                                    \
+        ntt::ops::op<vector<T1, NTT_VLEN / sizeof(T1) / 8>,                    \
+                     vector<T2, NTT_VLEN / sizeof(T2) / 8>>,                   \
+        vector<T1, NTT_VLEN / sizeof(T1) / 8>,                                 \
+        vector<T2, NTT_VLEN / sizeof(T2) / 8>, true> {                         \
+        static constexpr size_t unroll = unroll_num;                           \
+    };                                                                         \
+                                                                               \
+    template <typename T1, typename T2>                                        \
+    struct u_binary_policy<                                                    \
+        ntt::ops::op<T1, vector<T2, NTT_VLEN / sizeof(T2) / 8>>, T1,           \
+        vector<T2, NTT_VLEN / sizeof(T2) / 8>, true> {                         \
+        static constexpr size_t unroll = unroll_num;                           \
+    };                                                                         \
+                                                                               \
+    template <typename T1, typename T2>                                        \
+    struct u_binary_policy<                                                    \
+        ntt::ops::op<vector<T1, NTT_VLEN / sizeof(T1) / 8>, T2>,               \
+        vector<T1, NTT_VLEN / sizeof(T1) / 8>, T2, true> {                     \
+        static constexpr size_t unroll = unroll_num;                           \
+    };
+
+SPECIALIZE_U_BINARY(add, 8)
+SPECIALIZE_U_BINARY(sub, 8)
+SPECIALIZE_U_BINARY(mul, 8)
+SPECIALIZE_U_BINARY(div, 8)
+SPECIALIZE_U_BINARY(max, 8)
+SPECIALIZE_U_BINARY(min, 8)
+SPECIALIZE_U_BINARY(mod, 8)
+SPECIALIZE_U_BINARY(floor_mod, 8)
+
+#undef SPECIALIZE_U_BINARY
+
+// reduce
 template <reduce_op Op, class T> struct u_reduce_policy<Op, T, true> {
     static constexpr size_t unroll = 8;
 };
 
+// cast
 template <> struct u_cast_policy<true> { static constexpr size_t unroll = 8; };
 
+// matmul
 template <>
 struct u_matmul_policy<mamtul_pack_kind::no_pack, float, float, float, true> {
     static constexpr size_t m0_tile = 1;
diff --git a/src/Native/include/nncase/ntt/arch/x86_64/ukernels.h b/src/Native/include/nncase/ntt/arch/x86_64/ukernels.h
index 61431fde0..3b3a7a3b6 100644
--- a/src/Native/include/nncase/ntt/arch/x86_64/ukernels.h
+++ b/src/Native/include/nncase/ntt/arch/x86_64/ukernels.h
@@ -19,41 +19,59 @@
 namespace nncase::ntt::ukernels {
 
 // unary
-template <typename T>
-struct u_unary_policy<ntt::ops::abs<vector<T, 8>>, vector<T, 8>, true> {
-    static constexpr size_t unroll = 2;
-};
-
-template <typename T>
-struct u_unary_policy<ntt::ops::ceil<vector<T, 8>>, vector<T, 8>, true> {
-    static constexpr size_t unroll = 2;
-};
-
-template <typename T>
-struct u_unary_policy<ntt::ops::floor<vector<T, 8>>, vector<T, 8>, true> {
-    static constexpr size_t unroll = 2;
-};
-
-template <typename T>
-struct u_unary_policy<ntt::ops::neg<vector<T, 8>>, vector<T, 8>, true> {
-    static constexpr size_t unroll = 2;
-};
-
-template <typename T>
-struct u_unary_policy<ntt::ops::round<vector<T, 8>>, vector<T, 8>, true> {
-    static constexpr size_t unroll = 2;
-};
-
-template <typename T>
-struct u_unary_policy<ntt::ops::sign<vector<T, 8>>, vector<T, 8>, true> {
-    static constexpr size_t unroll = 2;
-};
-
-template <typename T>
-struct u_unary_policy<ntt::ops::square<vector<T, 8>>, vector<T, 8>, true> {
-    static constexpr size_t unroll = 2;
-};
-
+// Specializes u_unary_policy (last template argument true) for ntt::ops::op
+// on vector<T, 8>, with unroll_num as its unroll factor.
+#define SPECIALIZE_U_UNARY(op, unroll_num)                                     \
+    template <typename T>                                                      \
+    struct u_unary_policy<ntt::ops::op<vector<T, 8>>, vector<T, 8>, true> {    \
+        static constexpr size_t unroll = unroll_num;                           \
+    };
+
+SPECIALIZE_U_UNARY(abs, 2)
+SPECIALIZE_U_UNARY(ceil, 2)
+SPECIALIZE_U_UNARY(floor, 2)
+SPECIALIZE_U_UNARY(neg, 2)
+SPECIALIZE_U_UNARY(round, 2)
+SPECIALIZE_U_UNARY(sign, 2)
+SPECIALIZE_U_UNARY(square, 2)
+
+#undef SPECIALIZE_U_UNARY
+
+// binary
+// Specializes u_binary_policy (last template argument true) for ntt::ops::op,
+// with unroll_num as the unroll factor, for three operand pairings (vectors
+// have 8 lanes): vector/vector, T1/vector and vector/T2.
+#define SPECIALIZE_U_BINARY(op, unroll_num)                                    \
+    template <typename T1, typename T2>                                        \
+    struct u_binary_policy<ntt::ops::op<vector<T1, 8>, vector<T2, 8>>,         \
+                           vector<T1, 8>, vector<T2, 8>, true> {               \
+        static constexpr size_t unroll = unroll_num;                           \
+    };                                                                         \
+                                                                               \
+    template <typename T1, typename T2>                                        \
+    struct u_binary_policy<ntt::ops::op<T1, vector<T2, 8>>, T1, vector<T2, 8>, \
+                           true> {                                             \
+        static constexpr size_t unroll = unroll_num;                           \
+    };                                                                         \
+                                                                               \
+    template <typename T1, typename T2>                                        \
+    struct u_binary_policy<ntt::ops::op<vector<T1, 8>, T2>, vector<T1, 8>, T2, \
+                           true> {                                             \
+        static constexpr size_t unroll = unroll_num;                           \
+    };
+
+SPECIALIZE_U_BINARY(add, 2)
+SPECIALIZE_U_BINARY(sub, 2)
+SPECIALIZE_U_BINARY(mul, 2)
+SPECIALIZE_U_BINARY(div, 2)
+SPECIALIZE_U_BINARY(max, 2)
+SPECIALIZE_U_BINARY(min, 2)
+SPECIALIZE_U_BINARY(mod, 2)
+SPECIALIZE_U_BINARY(floor_mod, 2)
+
+#undef SPECIALIZE_U_BINARY
+
+// pack
 template <size_t M, size_t N, size_t MStrides>
 class u_pack<M, N, MStrides, true, float, vector<float, 8>> {
   public:
@@ -75,10 +93,12 @@ class u_pack<M, N, MStrides, true, float, vector<float, 8>> {
     }
 };
 
+// reduce
 template <reduce_op Op, class T> struct u_reduce_policy<Op, T, true> {
     static constexpr size_t unroll = 8;
 };
 
+// matmul
 template <>
 struct u_matmul_policy<mamtul_pack_kind::no_pack, float, float, float, true> {
     static constexpr size_t m0_tile = 1;
diff --git a/src/Native/include/nncase/ntt/kernels/binary.h b/src/Native/include/nncase/ntt/kernels/binary.h
index 409002753..2614c048c 100644
--- a/src/Native/include/nncase/ntt/kernels/binary.h
+++ b/src/Native/include/nncase/ntt/kernels/binary.h
@@ -76,14 +76,14 @@ class binary_impl<TLhs, TRhs, TOut> {
 
             // 1.1 Non broadcast
             if constexpr (is_same_seq(lhs_rest_dims, rhs_rest_dims)) {
-                return binary_non_broadcast(op, lhs_p, rhs_p, out_p,
-                                            lhs_rest_dims.length());
+                return binary_non_broadcast<Op>(lhs_p, rhs_p, out_p,
+                                                lhs_rest_dims.length());
             } else if constexpr (lhs_rest_dims.length() == 1) {
-                return binary_left_broadcast(op, *lhs_p, rhs_p, out_p,
-                                             rhs_rest_dims.length());
+                return binary_left_broadcast<Op>(*lhs_p, rhs_p, out_p,
+                                                 rhs_rest_dims.length());
             } else if constexpr (rhs_rest_dims.length() == 1) {
-                return binary_right_broadcast(op, lhs_p, *rhs_p, out_p,
-                                              lhs_rest_dims.length());
+                return binary_right_broadcast<Op>(lhs_p, *rhs_p, out_p,
+                                                  lhs_rest_dims.length());
             }
         }
 
@@ -102,153 +102,43 @@ class binary_impl<TLhs, TRhs, TOut> {
     }
 
+    /**
+     * @brief Element-wise Op over two runs of `extent` elements each.
+     *
+     * Forwards to ntt::u_binary with stride 1 on both inputs and the output.
+     */
     template <class Op, class TLhsElem, class TRhsElem, class TOutElem>
-    void binary_non_broadcast(Op &op, const TLhsElem *lhs, const TRhsElem *rhs,
+    void binary_non_broadcast(const TLhsElem *lhs, const TRhsElem *rhs,
                               TOutElem *output, size_t extent) {
-        for (size_t i = 0; i < extent; i++) {
-            *output++ = op(*lhs++, *rhs++);
-        }
+        ntt::u_binary<Op, TLhsElem, TRhsElem, TOutElem>(lhs, 1, rhs, 1, output,
+                                                        1, extent);
     }
 
+    /**
+     * @brief Op(lhs, rhs[i]) with one lhs element against `extent` rhs items.
+     *
+     * ntt::u_binary takes pointers, so the address of `lhs` is passed with
+     * stride 0, which keeps re-reading that single element.
+     */
     template <class Op, class TLhsElem, class TRhsElem, class TOutElem>
-    void binary_left_broadcast(Op &op, const TLhsElem &lhs, const TRhsElem *rhs,
+    void binary_left_broadcast(const TLhsElem &lhs, const TRhsElem *rhs,
                                TOutElem *output, size_t extent) {
-        for (size_t i = 0; i < extent; i++) {
-            *output++ = op(lhs, *rhs++);
-        }
+        ntt::u_binary<Op, TLhsElem, TRhsElem, TOutElem>(&lhs, 0, rhs, 1, output,
+                                                        1, extent);
     }
 
+    /**
+     * @brief Op(lhs[i], rhs) with `extent` lhs items against one rhs element.
+     *
+     * ntt::u_binary takes pointers, so the address of `rhs` is passed with
+     * stride 0, which keeps re-reading that single element.
+     */
     template <class Op, class TLhsElem, class TRhsElem, class TOutElem>
-    void binary_right_broadcast(Op &op, const TLhsElem *lhs,
-                                const TRhsElem &rhs, TOutElem *output,
-                                size_t extent) {
-        for (size_t i = 0; i < extent; i++) {
-            *output++ = op(*lhs++, rhs);
-        }
+    void binary_right_broadcast(const TLhsElem *lhs, const TRhsElem &rhs,
+                                TOutElem *output, size_t extent) {
+        ntt::u_binary<Op, TLhsElem, TRhsElem, TOutElem>(lhs, 1, &rhs, 0, output,
+                                                        1, extent);
     }
 };
-
-#define BINARY_IMPL(OP)                                                        \
-    template <class Shape, class In1Strides, class In2Strides,                 \
-              class OutStrides>                                                \
-    class OP##_impl;                                                           \
-    template <size_t... Dims, size_t... In1Strides, size_t... In2Strides,      \
-              size_t... OutStrides>                                            \
-    class OP##_impl<fixed_shape<Dims...>, fixed_strides<In1Strides...>,        \
-                    fixed_strides<In2Strides...>,                              \
-                    fixed_strides<OutStrides...>> {                            \
-      public:                                                                  \
-        template <class TIn1, class TIn2, class TOut>                          \
-        constexpr void operator()(const TIn1 &input1, const TIn2 &input2,      \
-                                  TOut &output) {                              \
-            constexpr size_t rank = sizeof...(Dims);                           \
-            ranked_shape<rank> index{};                                        \
-            constexpr auto conti_dims =                                        \
-                std::min(contiguous_dims(fixed_shape<Dims...>{},               \
-                                         fixed_strides<In1Strides...>{}),      \
-                         contiguous_dims(fixed_shape<Dims...>{},               \
-                                         fixed_strides<OutStrides...>{}));     \
-            apply<TIn1, TIn2, TOut, 0, rank, conti_dims, Dims...>(             \
-                index, input1, input2, output);                                \
-        }                                                                      \
-                                                                               \
-      private:                                                                 \
-        template <class TIn1, class TIn2, class TOut, size_t Axis,             \
-                  size_t Rank, size_t ContiguousDims, size_t... RestDims>      \
-        constexpr void apply(ranked_shape<Rank> &index, const TIn1 &input1,    \
-                             const TIn2 &input2, TOut &output) {               \
-            if constexpr (ContiguousDims == sizeof...(RestDims)) {             \
-                constexpr auto inner_size =                                    \
-                    fixed_shape<RestDims...>::length();                        \
-                auto input1_p = input1.elements().data() +                     \
-                                linear_offset(index, input1.strides());        \
-                auto input2_p = input2.elements().data() +                     \
-                                linear_offset(index, input2.strides());        \
-                auto output_p = output.elements().data() +                     \
-                                linear_offset(index, output.strides());        \
-                OP##_contiguous<inner_size>(input1_p, input2_p, output_p);     \
-            } else {                                                           \
-                apply_next<TIn1, TIn2, TOut, Axis, Rank, ContiguousDims,       \
-                           RestDims...>(index, input1, input2, output);        \
-            }                                                                  \
-        }                                                                      \
-                                                                               \
-        template <class TIn1, class TIn2, class TOut, size_t Axis,             \
-                  size_t Rank, size_t ContiguousDims, size_t Dim,              \
-                  size_t... RestDims>                                          \
-        constexpr void apply_next(ranked_shape<Rank> &index,                   \
-                                  const TIn1 &input1, const TIn2 &input2,      \
-                                  TOut &output) {                              \
-            for (index[Axis] = 0; index[Axis] < Dim; index[Axis]++) {          \
-                apply<TIn1, TIn2, TOut, Axis + 1, Rank, ContiguousDims,        \
-                      RestDims...>(index, input1, input2, output);             \
-            }                                                                  \
-        }                                                                      \
-        template <size_t Extent, class T1, class T2, class TOut>               \
-        constexpr void OP##_contiguous(const T1 *input1, const T2 *input2,     \
-                                       TOut *output) {                         \
-            ntt::u_##OP(input1, input2, 1, 1, output, 1, Extent);              \
-        }                                                                      \
-    };                                                                         \
-                                                                               \
-    template <size_t Rank, class In1Strides, class In2Strides,                 \
-              class OutStrides>                                                \
-    class OP##_impl<ranked_shape<Rank>, In1Strides, In2Strides, OutStrides> {  \
-      public:                                                                  \
-        template <class TIn1, class TIn2, class TOut>                          \
-        constexpr void operator()(const TIn1 &input1, const TIn2 &input2,      \
-                                  TOut &output) {                              \
-            ranked_shape<Rank> index{};                                        \
-            auto conti_dims =                                                  \
-                std::min(contiguous_dims(input1.shape(), input1.strides()),    \
-                         contiguous_dims(input1.shape(), output.strides()));   \
-            apply<TIn1, TIn2, TOut, 0>(index, conti_dims, input1, input2,      \
-                                       output);                                \
-        }                                                                      \
-                                                                               \
-      private:                                                                 \
-        template <class TIn1, class TIn2, class TOut, size_t Axis>             \
-        constexpr void apply(ranked_shape<Rank> &index, size_t conti_dims,     \
-                             const TIn1 &input1, const TIn2 &input2,           \
-                             TOut &output) {                                   \
-            const auto outer_dims = Rank - conti_dims;                         \
-            if (Axis >= outer_dims) {                                          \
-                size_t inner_size = 1;                                         \
-                for (size_t i = outer_dims; i < input1.shape().rank(); i++)    \
-                    inner_size *= input1.shape()[i];                           \
-                auto input1_p = input1.buffer().data() +                       \
-                                linear_offset(index, input1.strides());        \
-                auto input2_p = input2.buffer().data() +                       \
-                                linear_offset(index, input2.strides());        \
-                auto output_p = output.buffer().data() +                       \
-                                linear_offset(index, output.strides());        \
-                OP##_contiguous(input1_p, input2_p, output_p, inner_size);     \
-            } else if constexpr (Axis < Rank - 1) {                            \
-                const auto dim = input1.shape()[Axis];                         \
-                for (index[Axis] = 0; index[Axis] < dim; index[Axis]++) {      \
-                    apply<TIn1, TIn2, TOut, Axis + 1>(index, conti_dims,       \
-                                                      input1, input2, output); \
-                }                                                              \
-            }                                                                  \
-        }                                                                      \
-                                                                               \
-        template <class T1, class T2, class TOut>                              \
-        constexpr void OP##_contiguous(const T1 *input1_p, const T2 *input2_p, \
-                                       TOut *output_p, size_t extent) {        \
-            for (size_t i = 0; i < extent; i++) {                              \
-                output_p[i] =                                                  \
-                    ntt::ops::OP<T1, T2>()(input1_p[i], input2_p[i]);          \
-            }                                                                  \
-        }                                                                      \
-    };
-
-BINARY_IMPL(add)
-BINARY_IMPL(div)
-BINARY_IMPL(max)
-BINARY_IMPL(min)
-BINARY_IMPL(mod)
-BINARY_IMPL(mul)
-BINARY_IMPL(sub)
-
 } // namespace detail
 
 template <template <class T1, class T2> class Op, class TLhs, class TRhs,
@@ -258,25 +131,4 @@ void binary(const TLhs &lhs, const TRhs &rhs, TOut &&output) {
     detail::binary_impl<std::decay_t<TLhs>, std::decay_t<TRhs>,
                         std::decay_t<TOut>>()(op, lhs, rhs, output);
 }
-
-#define BINARY(OP)                                                             \
-    template <typename TIn1, typename TIn2, typename TOut>                     \
-    void OP(const TIn1 &input1, const TIn2 &input2, TOut &&output) noexcept {  \
-        detail::OP##_impl<                                                     \
-            common_shape_t<typename TIn1::shape_type,                          \
-                           typename std::decay_t<TOut>::shape_type>,           \
-            typename TIn1::strides_type, typename TIn2::strides_type,          \
-            typename std::decay_t<TOut>::strides_type>                         \
-            impl;                                                              \
-        impl(input1, input2, output);                                          \
-    }
-
-BINARY(add)
-BINARY(div)
-BINARY(max)
-BINARY(min)
-BINARY(mod)
-BINARY(mul)
-BINARY(sub)
-
 } // namespace nncase::ntt
diff --git a/src/Native/include/nncase/ntt/kernels/slice.h b/src/Native/include/nncase/ntt/kernels/slice.h
index d15f4a676..e18a2fac0 100644
--- a/src/Native/include/nncase/ntt/kernels/slice.h
+++ b/src/Native/include/nncase/ntt/kernels/slice.h
@@ -16,6 +16,7 @@
 #include "../apply.h"
 #include "../shape_infer/reduce_axis.h"
 #include "../utility.h"
+#include <iostream>
 #include <tuple>
 
 namespace nncase::ntt {
@@ -31,6 +32,23 @@ inline constexpr auto compute_inner_domain(std::index_sequence<Ints...>) {
 }
 } // namespace slice_detail
 
+/**
+ * @brief Print a label followed by every extent of a shape to std::cout.
+ *
+ * Writes `info`, then shape[0] .. shape[rank() - 1] each followed by a space,
+ * and finishes with std::endl (newline plus flush).
+ *
+ * NOTE(review): the only calls visible in slice() are commented out, so this
+ * looks like a debugging aid; confirm it and <iostream> are meant to ship.
+ */
+template <typename Tshape>
+void dump_shape(const std::string &info, Tshape shape) {
+    std::cout << info;
+    for (size_t i = 0; i < shape.rank(); i++)
+        std::cout << shape[i] << " ";
+    std::cout << std::endl;
+}
+
 /**
  * @brief
  *
@@ -53,18 +71,30 @@ void slice(const TIn &input, TOut &&output) {
 
     auto in_index = ranked_shape<domain.rank()>{};
     auto out_index = ranked_shape<domain.rank()>{};
+    // dump_shape("domain = ", domain);
+    // dump_shape("inner_domain = ", inner_domain);
+    // dump_shape("in_index = ", in_index);
+    // dump_shape("out_index = ", out_index);
+
     apply(domain, [&](auto index) {
+        // dump_shape("index = ", index);
+
         loop<domain.rank()>([&](auto i) {
             in_index[i] = index[i];
             out_index[i] = index[i];
         });
+        // dump_shape("1: in_index = ", in_index);
+        // dump_shape("1: out_index = ", out_index);
 
         apply(inner_domain, [&](auto inner_index) {
+            // dump_shape("inner_index = ", inner_index);
             loop<inner_domain.rank()>([&](auto i) {
                 in_index[TAxes::at(i)] =
                     TStart::at(i) + inner_index[i] * TStride::at(i);
                 out_index[TAxes::at(i)] = inner_index[i];
             });
+            // dump_shape("2: in_index = ", in_index);
+            // dump_shape("2: out_index = ", out_index);
             output(out_index) = input(in_index);
         });
     });
diff --git a/src/Native/include/nncase/ntt/ukernels/u_binary.h b/src/Native/include/nncase/ntt/ukernels/u_binary.h
index 48663f829..2b5b82da9 100644
--- a/src/Native/include/nncase/ntt/ukernels/u_binary.h
+++ b/src/Native/include/nncase/ntt/ukernels/u_binary.h
@@ -18,66 +18,67 @@
 namespace nncase::ntt {
 namespace ukernels {
 
-template <bool Arch> struct u_binary_policy {
-    static constexpr size_t unroll = 2;
+/**
+ * @brief Per-operation tuning knobs for u_binary.
+ *
+ * The primary template selects unroll = 1; specializations keyed on Op, the
+ * two operand types and Arch may override it.
+ */
+template <class Op, class T1, class T2, bool Arch> struct u_binary_policy {
+    static constexpr size_t unroll = 1;
 };
 
-#define U_BINARY_IMPL(OP)                                                      \
-    template <class T1, class T2, class TOut, bool Arch> struct u_##OP {       \
-      public:                                                                  \
-        constexpr void operator()(const T1 *input1, const T2 *input2,          \
-                                  size_t input1_stride, size_t input2_stride,  \
-                                  TOut *output, size_t output_stride,          \
-                                  size_t count) noexcept {                     \
-            using policy_t = u_binary_policy<Arch>;                            \
-            constexpr auto unroll = policy_t::unroll;                          \
-                                                                               \
-            if (count / unroll) {                                              \
-                while (count / unroll) {                                       \
-                    for (size_t i = 0; i < unroll; i++) {                      \
-                        *output = ntt::ops::OP<T1, T2>()(*input1, *input2);    \
-                        input1 += input1_stride;                               \
-                        input2 += input2_stride;                               \
-                        output += output_stride;                               \
-                        count--;                                               \
-                    }                                                          \
-                }                                                              \
-            }                                                                  \
-                                                                               \
-            for (size_t i = 0; i < count; i++) {                               \
-                *output = ntt::ops::OP<T1, T2>()(*input1, *input2);            \
-                input1 += input1_stride;                                       \
-                input2 += input2_stride;                                       \
-                output += output_stride;                                       \
-            }                                                                  \
-        }                                                                      \
-    };
+/**
+ * @brief Strided element-wise binary micro-kernel.
+ *
+ * Writes op(*input1, *input2) to *output `count` times, advancing each pointer
+ * by its own stride after every element. A stride of 0 keeps re-reading the
+ * same operand element.
+ */
+template <class Op, class T1, class T2, class TOut, bool Arch> struct u_binary {
+  public:
+    constexpr void operator()(const T1 *input1, const T2 *input2,
+                              size_t input1_stride, size_t input2_stride,
+                              TOut *output, size_t output_stride,
+                              size_t count) noexcept {
+        using policy_t = u_binary_policy<Op, T1, T2, Arch>;
+        constexpr auto unroll = policy_t::unroll;
+        Op op;
 
-U_BINARY_IMPL(add)
-U_BINARY_IMPL(div)
-U_BINARY_IMPL(max)
-U_BINARY_IMPL(min)
-U_BINARY_IMPL(mod)
-U_BINARY_IMPL(mul)
-U_BINARY_IMPL(sub)
-} // namespace ukernels
+        // Main loop: consume full groups of `unroll` elements.
+        while (count / unroll) {
+            for (size_t i = 0; i < unroll; i++) {
+                *output = op(*input1, *input2);
+                input1 += input1_stride;
+                input2 += input2_stride;
+                output += output_stride;
+                count--;
+            }
+        }
 
-#define U_BINARY(OP)                                                           \
-    template <class T1, class T2, class TOut>                                  \
-    constexpr void u_##OP(const T1 *input1, const T2 *input2,                  \
-                          size_t input1_stride, size_t input2_stride,          \
-                          TOut *output, size_t output_stride,                  \
-                          size_t count) noexcept {                             \
-        ukernels::u_##OP<T1, T2, TOut, true> impl;                             \
-        impl(input1, input2, input1_stride, input2_stride, output,             \
-             output_stride, count);                                            \
+        // Tail: fewer than `unroll` elements are left at this point.
+        for (size_t i = 0; i < count; i++) {
+            *output = op(*input1, *input2);
+            input1 += input1_stride;
+            input2 += input2_stride;
+            output += output_stride;
+        }
     }
-U_BINARY(add)
-U_BINARY(div)
-U_BINARY(max)
-U_BINARY(min)
-U_BINARY(mod)
-U_BINARY(mul)
-U_BINARY(sub)
+};
+} // namespace ukernels
 
+/**
+ * @brief Entry point for the binary micro-kernel (Arch = true).
+ *
+ * Note that each stride follows its pointer here, whereas
+ * ukernels::u_binary::operator() takes both input pointers first.
+ */
+template <class Op, class T1, class T2, class TOut>
+constexpr void u_binary(const T1 *input1, size_t input1_stride,
+                        const T2 *input2, size_t input2_stride, TOut *output,
+                        size_t output_stride, size_t count) noexcept {
+    ukernels::u_binary<Op, T1, T2, TOut, true> impl;
+    impl(input1, input2, input1_stride, input2_stride, output, output_stride,
+         count);
+}
 } // namespace nncase::ntt
diff --git a/src/Native/test/benchmark_test/benchmark_ntt.py b/src/Native/test/benchmark_test/benchmark_ntt.py
index 98e9ab96f..0855f949b 100644
--- a/src/Native/test/benchmark_test/benchmark_ntt.py
+++ b/src/Native/test/benchmark_test/benchmark_ntt.py
@@ -281,14 +281,14 @@ def run(self):
 class BenchmarkNTT_riscv64(BenchmarkNTT, Benchmark_riscv64):
     def __init__(self, target: str, bin_path: str):
         BenchmarkNTT.__init__(self, 'riscv64', target, bin_path)
-        self.roofline_dict = {'binary': {'add': '10.3',
-                                         'sub': '10.3',
-                                         'mul': '10.3',
-                                         'div': '42.3',
-                                         'max': '10.3',
-                                         'min': '10.3',
-                                         'floor_mod': '43',
-                                         'mod': '54',
+        self.roofline_dict = {'binary': {'add': '7.3',
+                                         'sub': '7.3',
+                                         'mul': '7.3',
+                                         'div': '30.3',
+                                         'max': '7.3',
+                                         'min': '7.3',
+                                         'floor_mod': '40.3',
+                                         'mod': '35.3',
                                          'pow': '139'
                                          },
                               'cast': {'float-int32': '5',
diff --git a/src/Native/test/benchmark_test/benchmark_ntt_binary.cpp b/src/Native/test/benchmark_test/benchmark_ntt_binary.cpp
index 93e7b9322..af2b6be62 100644
--- a/src/Native/test/benchmark_test/benchmark_ntt_binary.cpp
+++ b/src/Native/test/benchmark_test/benchmark_ntt_binary.cpp
@@ -35,11 +35,10 @@ void benchmark_ntt_binary(std::string op_name, T lhs_low, T lhs_high, T rhs_low,
     tensor_type ntt_lhs, ntt_rhs, ntt_result;
     NttTest::init_tensor(ntt_lhs, lhs_low, lhs_high);
     NttTest::init_tensor(ntt_rhs, rhs_low, rhs_high);
-    Op<tensor_type, tensor_type> op;
 
     auto t1 = NttTest::get_cpu_cycle();
     for (size_t i = 0; i < size1; i++)
-        ntt_result = op(ntt_lhs, ntt_rhs);
+        ntt::binary<Op>(ntt_lhs, ntt_rhs, ntt_result);
     auto t2 = NttTest::get_cpu_cycle();
 #if __x86_64__
     asm volatile("" ::"g"(ntt_result));
@@ -50,76 +49,26 @@ void benchmark_ntt_binary(std::string op_name, T lhs_low, T lhs_high, T rhs_low,
               << std::endl;
 }
 
-#define BENCHMARK_NTT_BINARY(OP)                                               \
-    template <size_t N, size_t run_size, size_t size>                          \
-    void benchmark_ntt_binary_##OP() {                                         \
-                                                                               \
-        using tensor_type1 =                                                   \
-            ntt::tensor<ntt::vector<float, N>, ntt::fixed_shape<size>>;        \
-        using tensor_type2 =                                                   \
-            ntt::tensor<ntt::vector<float, N>, ntt::fixed_shape<size>>;        \
-        using tensor_type_out =                                                \
-            ntt::tensor<ntt::vector<float, N>, ntt::fixed_shape<size>>;        \
-        constexpr size_t warmup_size = 30;                                     \
-                                                                               \
-        tensor_type1 ntt_input1;                                               \
-        tensor_type2 ntt_input2;                                               \
-        tensor_type_out ntt_output;                                            \
-        NttTest::init_tensor(ntt_input1, -10.f, 10.f);                         \
-        NttTest::init_tensor(ntt_input2, -10.f, 10.f);                         \
-                                                                               \
-        for (size_t i = 0; i < warmup_size; i++)                               \
-            ntt::OP(ntt_input1, ntt_input2, ntt_output);                       \
-                                                                               \
-        auto t1 = NttTest::get_cpu_cycle();                                    \
-        for (size_t i = 0; i < run_size; i++) {                                \
-            ntt::OP(ntt_input1, ntt_input2, ntt_output);                       \
-            asm volatile("" ::"g"(ntt_output));                                \
-            asm volatile("" ::"g"(ntt_input1));                                \
-            asm volatile("" ::"g"(ntt_input2));                                \
-        }                                                                      \
-        auto t2 = NttTest::get_cpu_cycle();                                    \
-                                                                               \
-        std::cout << __FUNCTION__ << " took " << std::setprecision(1)          \
-                  << std::fixed                                                \
-                  << static_cast<float>(t2 - t1) / size / run_size             \
-                  << " cycles" << std::endl;                                   \
-    }
-
-BENCHMARK_NTT_BINARY(add)
-BENCHMARK_NTT_BINARY(div)
-BENCHMARK_NTT_BINARY(max)
-BENCHMARK_NTT_BINARY(min)
-BENCHMARK_NTT_BINARY(mod)
-BENCHMARK_NTT_BINARY(mul)
-BENCHMARK_NTT_BINARY(sub)
-
 int main(int argc, char *argv[]) {
     (void)argc;
     (void)argv;
 
-#if __riscv
-    constexpr size_t run_size = 300;
-    constexpr size_t size = 600;
-#elif __x86_64__
-    constexpr size_t run_size = 2000;
-    constexpr size_t size = 2000;
-#else
-    constexpr size_t run_size = 2000;
-    constexpr size_t size = 2000;
-#endif
-
     constexpr size_t N = NTT_VLEN / (sizeof(float) * 8);
-
+    benchmark_ntt_binary<ntt::ops::add, float, N>("add", -10.f, 10.f, -10.f,
+                                                  10.f);
+    benchmark_ntt_binary<ntt::ops::sub, float, N>("sub", -10.f, 10.f, -10.f,
+                                                  10.f);
+    benchmark_ntt_binary<ntt::ops::mul, float, N>("mul", -10.f, 10.f, -10.f,
+                                                  10.f);
+    benchmark_ntt_binary<ntt::ops::div, float, N>("div", -10.f, 10.f, 1.f,
+                                                  10.f);
+    benchmark_ntt_binary<ntt::ops::max, float, N>("max", -10.f, 10.f, -10.f,
+                                                  10.f);
+    benchmark_ntt_binary<ntt::ops::min, float, N>("min", -10.f, 10.f, -10.f,
+                                                  10.f);
     benchmark_ntt_binary<ntt::ops::floor_mod, int32_t, N>("floor_mod", -10, 10,
                                                           1, 10);
+    benchmark_ntt_binary<ntt::ops::mod, float, N>("mod", -10.f, 10.f, 1.f,
+                                                  10.f);
     benchmark_ntt_binary<ntt::ops::pow, float, N>("pow", 0.f, 3.f, 0.f, 3.f);
-
-    benchmark_ntt_binary_add<N, run_size, size>();
-    benchmark_ntt_binary_div<N, run_size, size>();
-    benchmark_ntt_binary_max<N, run_size, size>();
-    benchmark_ntt_binary_min<N, run_size, size>();
-    benchmark_ntt_binary_mod<N, run_size, size>();
-    benchmark_ntt_binary_mul<N, run_size, size>();
-    benchmark_ntt_binary_sub<N, run_size, size>();
 }
\ No newline at end of file