diff --git a/src/layer/loongarch/quantize_loongarch.cpp b/src/layer/loongarch/quantize_loongarch.cpp index 73c55ba6cb1..b07de9c9a57 100644 --- a/src/layer/loongarch/quantize_loongarch.cpp +++ b/src/layer/loongarch/quantize_loongarch.cpp @@ -83,6 +83,68 @@ static void quantize(const float* ptr, signed char* s8ptr, const Mat& scale_data } } +#if __loongarch_sx +static void quantize_pack4to8(const float* ptr0, const float* ptr1, signed char* s8ptr, const Mat& scale_data, int elemcount) +{ + const int scale_data_size = scale_data.w; + + // NCNN_LOGE("quantize_pack4to8 %d %d", scale_data_size, elemcount); + + float scale = scale_data[0]; + __m128 _scale0 = (__m128)__lsx_vreplfr2vr_s(scale); + __m128 _scale1 = _scale0; + if (scale_data_size > 1) + { + _scale0 = (__m128)__lsx_vld((const float*)scale_data, 0); + _scale1 = (__m128)__lsx_vld((const float*)scale_data + 4, 0); + } + + int i = 0; + for (; i < elemcount; i++) + { + __m128 _v0 = (__m128)__lsx_vld(ptr0, 0); + __m128 _v1 = (__m128)__lsx_vld(ptr1, 0); + _v0 = __lsx_vfmul_s(_v0, _scale0); + _v1 = __lsx_vfmul_s(_v1, _scale1); + *((int64_t*)s8ptr) = float2int8(_v0, _v1); + ptr0 += 4; + ptr1 += 4; + s8ptr += 8; + } +} + +static void quantize_pack4to1(const float* ptr, signed char* s8ptr0, signed char* s8ptr1, signed char* s8ptr2, signed char* s8ptr3, const Mat& scale_data, int elemcount) +{ + const int scale_data_size = scale_data.w; + + // NCNN_LOGE("quantize_pack4to1 %d %d", scale_data_size, elemcount); + + float scale = scale_data[0]; + __m128 _scale = (__m128)__lsx_vreplfr2vr_s(scale); + if (scale_data_size > 1) + { + _scale = (__m128)__lsx_vld((const float*)scale_data, 0); + } + + int i = 0; + for (; i < elemcount; i++) + { + __m128 _v = (__m128)__lsx_vld(ptr, 0); + _v = __lsx_vfmul_s(_v, _scale); + v16i8 v = float2int8(_v, _v); + s8ptr0[0] = v[0]; + s8ptr1[0] = v[1]; + s8ptr2[0] = v[2]; + s8ptr3[0] = v[3]; + ptr += 4; + s8ptr0 += 1; + s8ptr1 += 1; + s8ptr2 += 1; + s8ptr3 += 1; + } +} +#endif // __loongarch_sx + int 
Quantize_loongarch::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const { const int dims = bottom_blob.dims; @@ -90,11 +152,20 @@ int Quantize_loongarch::forward(const Mat& bottom_blob, Mat& top_blob, const Opt const int h = bottom_blob.h; const int channels = bottom_blob.c; const int elempack = bottom_blob.elempack; - const size_t out_elemsize = elempack * 1u; if (dims == 1) { - top_blob.create(w, out_elemsize, elempack, opt.blob_allocator); + int out_elempack = 1; +#if __loongarch_sx + if (opt.use_packing_layout) + { + out_elempack = w * elempack % 8 == 0 ? 8 : 1; + } +#endif + const int outw = w * elempack / out_elempack; + const size_t out_elemsize = out_elempack * 1u; + + top_blob.create(outw, out_elemsize, out_elempack, opt.blob_allocator); if (top_blob.empty()) return -100; @@ -119,37 +190,127 @@ int Quantize_loongarch::forward(const Mat& bottom_blob, Mat& top_blob, const Opt if (dims == 2) { - top_blob.create(w, h, out_elemsize, elempack, opt.blob_allocator); + int out_elempack = 1; +#if __loongarch_sx + if (opt.use_packing_layout) + { + out_elempack = h * elempack % 8 == 0 ? 8 : 1; + } +#endif + const int outh = h * elempack / out_elempack; + const size_t out_elemsize = out_elempack * 1u; + + top_blob.create(w, outh, out_elemsize, out_elempack, opt.blob_allocator); if (top_blob.empty()) return -100; - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < h; i++) +#if __loongarch_sx + if (elempack == 4 && out_elempack == 8) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < outh; i++) + { + const float* ptr0 = bottom_blob.row(i * 2); + const float* ptr1 = bottom_blob.row(i * 2 + 1); + signed char* s8ptr = top_blob.row(i); + + const Mat scale_data_i = scale_data_size > 1 ? 
scale_data.range(i * out_elempack, out_elempack) : scale_data; + + quantize_pack4to8(ptr0, ptr1, s8ptr, scale_data_i, w); + } + } + if (elempack == 4 && out_elempack == 1) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const float* ptr = bottom_blob.row(i); + signed char* s8ptr0 = top_blob.row(i * 4); + signed char* s8ptr1 = top_blob.row(i * 4 + 1); + signed char* s8ptr2 = top_blob.row(i * 4 + 2); + signed char* s8ptr3 = top_blob.row(i * 4 + 3); + + const Mat scale_data_i = scale_data_size > 1 ? scale_data.range(i * elempack, elempack) : scale_data; + + quantize_pack4to1(ptr, s8ptr0, s8ptr1, s8ptr2, s8ptr3, scale_data_i, w); + } + } +#endif // __loongarch_sx + if (elempack == out_elempack) { - const float* ptr = bottom_blob.row(i); - signed char* s8ptr = top_blob.row(i); + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const float* ptr = bottom_blob.row(i); + signed char* s8ptr = top_blob.row(i); - const Mat scale_data_i = scale_data_size > 1 ? scale_data.range(i * elempack, elempack) : scale_data; + const Mat scale_data_i = scale_data_size > 1 ? scale_data.range(i * elempack, elempack) : scale_data; - quantize(ptr, s8ptr, scale_data_i, w, elempack); + quantize(ptr, s8ptr, scale_data_i, w, elempack); + } } } if (dims == 3) { - top_blob.create(w, h, channels, out_elemsize, elempack, opt.blob_allocator); + int out_elempack = 1; +#if __loongarch_sx + if (opt.use_packing_layout) + { + out_elempack = channels * elempack % 8 == 0 ? 
8 : 1; + } +#endif + const int outc = channels * elempack / out_elempack; + const size_t out_elemsize = out_elempack * 1u; + + top_blob.create(w, h, outc, out_elemsize, out_elempack, opt.blob_allocator); if (top_blob.empty()) return -100; - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) +#if __loongarch_sx + if (elempack == 4 && out_elempack == 8) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < outc; q++) + { + const float* ptr0 = bottom_blob.channel(q * 2); + const float* ptr1 = bottom_blob.channel(q * 2 + 1); + signed char* s8ptr = top_blob.channel(q); + + const Mat scale_data_q = scale_data_size > 1 ? scale_data.range(q * out_elempack, out_elempack) : scale_data; + + quantize_pack4to8(ptr0, ptr1, s8ptr, scale_data_q, w * h); + } + } + if (elempack == 4 && out_elempack == 1) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* ptr = bottom_blob.channel(q); + signed char* s8ptr0 = top_blob.channel(q * 4); + signed char* s8ptr1 = top_blob.channel(q * 4 + 1); + signed char* s8ptr2 = top_blob.channel(q * 4 + 2); + signed char* s8ptr3 = top_blob.channel(q * 4 + 3); + + const Mat scale_data_q = scale_data_size > 1 ? scale_data.range(q * elempack, elempack) : scale_data; + + quantize_pack4to1(ptr, s8ptr0, s8ptr1, s8ptr2, s8ptr3, scale_data_q, w * h); + } + } +#endif // __loongarch_sx + if (elempack == out_elempack) { - const float* ptr = bottom_blob.channel(q); - signed char* s8ptr = top_blob.channel(q); + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* ptr = bottom_blob.channel(q); + signed char* s8ptr = top_blob.channel(q); - const Mat scale_data_q = scale_data_size > 1 ? scale_data.range(q * elempack, elempack) : scale_data; + const Mat scale_data_q = scale_data_size > 1 ? 
scale_data.range(q * elempack, elempack) : scale_data; - quantize(ptr, s8ptr, scale_data_q, w * h, elempack); + quantize(ptr, s8ptr, scale_data_q, w * h, elempack); + } } } diff --git a/src/layer/mips/quantize_mips.cpp b/src/layer/mips/quantize_mips.cpp index 772a0d1a92d..76a7fa8eacb 100644 --- a/src/layer/mips/quantize_mips.cpp +++ b/src/layer/mips/quantize_mips.cpp @@ -83,6 +83,68 @@ static void quantize(const float* ptr, signed char* s8ptr, const Mat& scale_data } } +#if __mips_msa +static void quantize_pack4to8(const float* ptr0, const float* ptr1, signed char* s8ptr, const Mat& scale_data, int elemcount) +{ + const int scale_data_size = scale_data.w; + + // NCNN_LOGE("quantize_pack4to8 %d %d", scale_data_size, elemcount); + + float scale = scale_data[0]; + v4f32 _scale0 = (v4f32)__msa_fill_w_f32(scale); + v4f32 _scale1 = _scale0; + if (scale_data_size > 1) + { + _scale0 = (v4f32)__msa_ld_w((const float*)scale_data, 0); + _scale1 = (v4f32)__msa_ld_w((const float*)scale_data + 4, 0); + } + + int i = 0; + for (; i < elemcount; i++) + { + v4f32 _v0 = (v4f32)__msa_ld_w(ptr0, 0); + v4f32 _v1 = (v4f32)__msa_ld_w(ptr1, 0); + _v0 = __msa_fmul_w(_v0, _scale0); + _v1 = __msa_fmul_w(_v1, _scale1); + *((int64_t*)s8ptr) = float2int8(_v0, _v1); + ptr0 += 4; + ptr1 += 4; + s8ptr += 8; + } +} + +static void quantize_pack4to1(const float* ptr, signed char* s8ptr0, signed char* s8ptr1, signed char* s8ptr2, signed char* s8ptr3, const Mat& scale_data, int elemcount) +{ + const int scale_data_size = scale_data.w; + + // NCNN_LOGE("quantize_pack4to1 %d %d", scale_data_size, elemcount); + + float scale = scale_data[0]; + v4f32 _scale = (v4f32)__msa_fill_w_f32(scale); + if (scale_data_size > 1) + { + _scale = (v4f32)__msa_ld_w((const float*)scale_data, 0); + } + + int i = 0; + for (; i < elemcount; i++) + { + v4f32 _v = (v4f32)__msa_ld_w(ptr, 0); + _v = __msa_fmul_w(_v, _scale); + v16i8 v = float2int8(_v, _v); + s8ptr0[0] = v[0]; + s8ptr1[0] = v[1]; + s8ptr2[0] = v[2]; + s8ptr3[0] = 
v[3]; + ptr += 4; + s8ptr0 += 1; + s8ptr1 += 1; + s8ptr2 += 1; + s8ptr3 += 1; + } +} +#endif // __mips_msa + int Quantize_mips::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const { const int dims = bottom_blob.dims; @@ -90,11 +152,20 @@ int Quantize_mips::forward(const Mat& bottom_blob, Mat& top_blob, const Option& const int h = bottom_blob.h; const int channels = bottom_blob.c; const int elempack = bottom_blob.elempack; - const size_t out_elemsize = elempack * 1u; if (dims == 1) { - top_blob.create(w, out_elemsize, elempack, opt.blob_allocator); + int out_elempack = 1; +#if __mips_msa + if (opt.use_packing_layout) + { + out_elempack = w * elempack % 8 == 0 ? 8 : 1; + } +#endif + const int outw = w * elempack / out_elempack; + const size_t out_elemsize = out_elempack * 1u; + + top_blob.create(outw, out_elemsize, out_elempack, opt.blob_allocator); if (top_blob.empty()) return -100; @@ -119,37 +190,127 @@ int Quantize_mips::forward(const Mat& bottom_blob, Mat& top_blob, const Option& if (dims == 2) { - top_blob.create(w, h, out_elemsize, elempack, opt.blob_allocator); + int out_elempack = 1; +#if __mips_msa + if (opt.use_packing_layout) + { + out_elempack = h * elempack % 8 == 0 ? 8 : 1; + } +#endif + const int outh = h * elempack / out_elempack; + const size_t out_elemsize = out_elempack * 1u; + + top_blob.create(w, outh, out_elemsize, out_elempack, opt.blob_allocator); if (top_blob.empty()) return -100; - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < h; i++) +#if __mips_msa + if (elempack == 4 && out_elempack == 8) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < outh; i++) + { + const float* ptr0 = bottom_blob.row(i * 2); + const float* ptr1 = bottom_blob.row(i * 2 + 1); + signed char* s8ptr = top_blob.row(i); + + const Mat scale_data_i = scale_data_size > 1 ? 
scale_data.range(i * out_elempack, out_elempack) : scale_data; + + quantize_pack4to8(ptr0, ptr1, s8ptr, scale_data_i, w); + } + } + if (elempack == 4 && out_elempack == 1) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const float* ptr = bottom_blob.row(i); + signed char* s8ptr0 = top_blob.row(i * 4); + signed char* s8ptr1 = top_blob.row(i * 4 + 1); + signed char* s8ptr2 = top_blob.row(i * 4 + 2); + signed char* s8ptr3 = top_blob.row(i * 4 + 3); + + const Mat scale_data_i = scale_data_size > 1 ? scale_data.range(i * elempack, elempack) : scale_data; + + quantize_pack4to1(ptr, s8ptr0, s8ptr1, s8ptr2, s8ptr3, scale_data_i, w); + } + } +#endif // __mips_msa + if (elempack == out_elempack) { - const float* ptr = bottom_blob.row(i); - signed char* s8ptr = top_blob.row(i); + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const float* ptr = bottom_blob.row(i); + signed char* s8ptr = top_blob.row(i); - const Mat scale_data_i = scale_data_size > 1 ? scale_data.range(i * elempack, elempack) : scale_data; + const Mat scale_data_i = scale_data_size > 1 ? scale_data.range(i * elempack, elempack) : scale_data; - quantize(ptr, s8ptr, scale_data_i, w, elempack); + quantize(ptr, s8ptr, scale_data_i, w, elempack); + } } } if (dims == 3) { - top_blob.create(w, h, channels, out_elemsize, elempack, opt.blob_allocator); + int out_elempack = 1; +#if __mips_msa + if (opt.use_packing_layout) + { + out_elempack = channels * elempack % 8 == 0 ? 
8 : 1; + } +#endif + const int outc = channels * elempack / out_elempack; + const size_t out_elemsize = out_elempack * 1u; + + top_blob.create(w, h, outc, out_elemsize, out_elempack, opt.blob_allocator); if (top_blob.empty()) return -100; - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) +#if __mips_msa + if (elempack == 4 && out_elempack == 8) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < outc; q++) + { + const float* ptr0 = bottom_blob.channel(q * 2); + const float* ptr1 = bottom_blob.channel(q * 2 + 1); + signed char* s8ptr = top_blob.channel(q); + + const Mat scale_data_q = scale_data_size > 1 ? scale_data.range(q * out_elempack, out_elempack) : scale_data; + + quantize_pack4to8(ptr0, ptr1, s8ptr, scale_data_q, w * h); + } + } + if (elempack == 4 && out_elempack == 1) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* ptr = bottom_blob.channel(q); + signed char* s8ptr0 = top_blob.channel(q * 4); + signed char* s8ptr1 = top_blob.channel(q * 4 + 1); + signed char* s8ptr2 = top_blob.channel(q * 4 + 2); + signed char* s8ptr3 = top_blob.channel(q * 4 + 3); + + const Mat scale_data_q = scale_data_size > 1 ? scale_data.range(q * elempack, elempack) : scale_data; + + quantize_pack4to1(ptr, s8ptr0, s8ptr1, s8ptr2, s8ptr3, scale_data_q, w * h); + } + } +#endif // __mips_msa + if (elempack == out_elempack) { - const float* ptr = bottom_blob.channel(q); - signed char* s8ptr = top_blob.channel(q); + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* ptr = bottom_blob.channel(q); + signed char* s8ptr = top_blob.channel(q); - const Mat scale_data_q = scale_data_size > 1 ? scale_data.range(q * elempack, elempack) : scale_data; + const Mat scale_data_q = scale_data_size > 1 ? 
scale_data.range(q * elempack, elempack) : scale_data; - quantize(ptr, s8ptr, scale_data_q, w * h, elempack); + quantize(ptr, s8ptr, scale_data_q, w * h, elempack); + } } }