diff --git a/src/layer/x86/convolution_x86.cpp b/src/layer/x86/convolution_x86.cpp
index dfb7f6bdd44..4c9719f7a05 100644
--- a/src/layer/x86/convolution_x86.cpp
+++ b/src/layer/x86/convolution_x86.cpp
@@ -976,7 +976,15 @@ int Convolution_x86::forward_int8_x86(const Mat& bottom_blob, Mat& top_blob, con
         if (use_int8_requantize)
             out_elempack = num_output % 8 == 0 ? 8 : 1;
         else
+        {
+#if __AVX512F__
+            out_elempack = num_output % 16 == 0 ? 16 : num_output % 8 == 0 ? 8 : num_output % 4 == 0 ? 4 : 1;
+#elif __AVX__
+            out_elempack = num_output % 8 == 0 ? 8 : num_output % 4 == 0 ? 4 : 1;
+#else
             out_elempack = num_output % 4 == 0 ? 4 : 1;
+#endif
+        }
     }
 #endif // __SSE2__
     size_t out_elemsize = use_int8_requantize ? 1u * out_elempack : 4u * out_elempack;
@@ -995,11 +1003,7 @@ int Convolution_x86::forward_int8_x86(const Mat& bottom_blob, Mat& top_blob, con
     {
         if (use_int8_requantize)
         {
-#if __AVX__
             out_elempack_int32 = num_output % 8 == 0 ? 8 : 1;
-#else
-            out_elempack_int32 = num_output % 8 == 0 ? 4 : 1;
-#endif
         }
         else
         {
@@ -1010,7 +1014,19 @@ int Convolution_x86::forward_int8_x86(const Mat& bottom_blob, Mat& top_blob, con
 #else
             out_elempack_int32 = num_output % 4 == 0 ? 4 : 1;
 #endif
-            // out_elempack_int32 = num_output % 4 == 0 ? 4 : 1;
+        }
+    }
+#endif // __SSE2__
+
+    bool prefer_winograd = (opt.use_winograd23_convolution || opt.use_winograd43_convolution) && (num_input > 8 || num_output > 8);
+
+#if __SSE2__
+    if (opt.use_packing_layout)
+    {
+        if ((opt.use_winograd_convolution && prefer_winograd && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) || (!opt.use_sgemm_convolution))
+        {
+            // TODO implement winograd and packed int8 avx pack8 output
+            out_elempack_int32 = num_output % 4 == 0 ? 4 : 1;
         }
     }
 #endif // __SSE2__
@@ -1020,8 +1036,6 @@ int Convolution_x86::forward_int8_x86(const Mat& bottom_blob, Mat& top_blob, con
     if (top_blob_int32.empty())
         return -100;
 
-    bool prefer_winograd = (opt.use_winograd23_convolution || opt.use_winograd43_convolution) && (num_input > 8 || num_output > 8);
-
     int _nT = nT ? nT : opt.num_threads;
     if (nT != 0 && opt.num_threads != nT)
     {
@@ -1049,6 +1063,39 @@ int Convolution_x86::forward_int8_x86(const Mat& bottom_blob, Mat& top_blob, con
     if (ret != 0)
         return ret;
 
+#if __SSE2__
+    if (opt.use_packing_layout)
+    {
+        // NCNN_LOGE("top_blob_int32 %d %d", top_blob_int32.c, top_blob_int32.elempack);
+        if (use_int8_requantize)
+        {
+            // TODO implement winograd and packed int8 pack1 output
+            if (top_blob_int32.elempack == 4 && top_blob_int32.c % 2 == 1)
+            {
+                Mat tmp;
+                convert_packing(top_blob_int32, tmp, 1, opt);
+                top_blob_int32 = tmp;
+            }
+            if (top_blob_int32.elempack == 4 && top_blob_int32.c % 2 == 0)
+            {
+                Mat tmp;
+                convert_packing(top_blob_int32, tmp, 8, opt);
+                top_blob_int32 = tmp;
+            }
+        }
+        else
+        {
+            // TODO implement winograd and packed int8 avx pack8 output
+            if (top_blob_int32.elempack == 4 && top_blob_int32.c % 2 == 0)
+            {
+                Mat tmp;
+                convert_packing(top_blob_int32, tmp, 8, opt);
+                top_blob_int32 = tmp;
+            }
+        }
+    }
+#endif
+
     if (use_int8_requantize)
     {
         requantize_from_int32_to_int8(top_blob_int32, top_blob, scale_in_data, top_blob_int8_scales, bias_data, activation_type, activation_params, opt);
diff --git a/src/layer/x86/requantize_x86.cpp b/src/layer/x86/requantize_x86.cpp
index a2e7385dac1..6b64f86967d 100644
--- a/src/layer/x86/requantize_x86.cpp
+++ b/src/layer/x86/requantize_x86.cpp
@@ -330,82 +330,6 @@ static void requantize(const int* intptr, signed char* ptr, const Mat& scale_in_
     }
 }
 
-#if __SSE2__
-#if !__AVX__
-static void requantize_pack4to8(const int* intptr0, const int* intptr1, signed char* ptr, const Mat& scale_in_data, const Mat& bias_data, const Mat& scale_out_data, int activation_type, const Mat& activation_params, int elemcount)
-{
-    const int scale_in_data_size = scale_in_data.w;
-    const int bias_data_size = bias_data.w;
-    const int scale_out_data_size = scale_out_data.w;
-
-    // NCNN_LOGE("requantize_pack4to8 %d %d %d %d", scale_in_data_size, bias_data_size, scale_out_data_size, elemcount);
-
-    __m128 _scale_in0 = _mm_set1_ps(scale_in_data[0]);
-    __m128 _scale_in1 = _scale_in0;
-    if (scale_in_data_size > 1)
-    {
-        _scale_in0 = _mm_loadu_ps((const float*)scale_in_data);
-        _scale_in1 = _mm_loadu_ps((const float*)scale_in_data + 4);
-    }
-
-    __m128 _scale_out0 = _mm_set1_ps(scale_out_data[0]);
-    __m128 _scale_out1 = _scale_out0;
-    if (scale_out_data_size > 1)
-    {
-        _scale_out0 = _mm_loadu_ps((const float*)scale_out_data);
-        _scale_out1 = _mm_loadu_ps((const float*)scale_out_data + 4);
-    }
-
-    if (bias_data_size == 0)
-    {
-        int i = 0;
-        for (; i < elemcount; i++)
-        {
-            __m128 _v0 = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr0));
-            __m128 _v1 = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr1));
-            _v0 = _mm_mul_ps(_v0, _scale_in0);
-            _v1 = _mm_mul_ps(_v1, _scale_in1);
-            _v0 = activation_sse(_v0, activation_type, activation_params);
-            _v1 = activation_sse(_v1, activation_type, activation_params);
-            _v0 = _mm_mul_ps(_v0, _scale_out0);
-            _v1 = _mm_mul_ps(_v1, _scale_out1);
-            *(int64_t*)ptr = float2int8_sse(_v0, _v1);
-            intptr0 += 4;
-            intptr1 += 4;
-            ptr += 8;
-        }
-    }
-    else
-    {
-        __m128 _bias0 = _mm_set1_ps(bias_data[0]);
-        __m128 _bias1 = _bias0;
-        if (bias_data_size > 1)
-        {
-            _bias0 = _mm_loadu_ps((const float*)bias_data);
-            _bias1 = _mm_loadu_ps((const float*)bias_data + 4);
-        }
-
-        int i = 0;
-        for (; i < elemcount; i++)
-        {
-            __m128 _v0 = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr0));
-            __m128 _v1 = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr1));
-            _v0 = _mm_comp_fmadd_ps(_v0, _scale_in0, _bias0);
-            _v1 = _mm_comp_fmadd_ps(_v1, _scale_in1, _bias1);
-            _v0 = activation_sse(_v0, activation_type, activation_params);
-            _v1 = activation_sse(_v1, activation_type, activation_params);
-            _v0 = _mm_mul_ps(_v0, _scale_out0);
-            _v1 = _mm_mul_ps(_v1, _scale_out1);
-            *(int64_t*)ptr = float2int8_sse(_v0, _v1);
-            intptr0 += 4;
-            intptr1 += 4;
-            ptr += 8;
-        }
-    }
-}
-#endif // !__AVX__
-#endif // __SSE2__
-
 int Requantize_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
 {
     const int dims = bottom_blob.dims;
@@ -413,20 +337,11 @@ int Requantize_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option&
     const int h = bottom_blob.h;
     const int channels = bottom_blob.c;
     const int elempack = bottom_blob.elempack;
+    const size_t out_elemsize = elempack * 1u;
 
     if (dims == 1)
     {
-        int out_elempack = 1;
-#if __SSE2__
-        if (opt.use_packing_layout)
-        {
-            out_elempack = w * elempack % 8 == 0 ? 8 : 1;
-        }
-#endif
-        const int outw = w * elempack / out_elempack;
-        const size_t out_elemsize = out_elempack * 1u;
-
-        top_blob.create(outw, out_elemsize, out_elempack, opt.blob_allocator);
+        top_blob.create(w, out_elemsize, elempack, opt.blob_allocator);
         if (top_blob.empty())
             return -100;
 
@@ -453,107 +368,41 @@ int Requantize_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option&
 
     if (dims == 2)
     {
-        int out_elempack = 1;
-#if __SSE2__
-        if (opt.use_packing_layout)
-        {
-            out_elempack = h * elempack % 8 == 0 ? 8 : 1;
-        }
-#endif
-        const int outh = h * elempack / out_elempack;
-        const size_t out_elemsize = out_elempack * 1u;
-
-        top_blob.create(w, outh, out_elemsize, out_elempack, opt.blob_allocator);
+        top_blob.create(w, h, out_elemsize, elempack, opt.blob_allocator);
         if (top_blob.empty())
             return -100;
 
-#if __SSE2__
-#if !__AVX__
-        if (elempack == 4 && out_elempack == 8)
-        {
-            #pragma omp parallel for num_threads(opt.num_threads)
-            for (int i = 0; i < outh; i++)
-            {
-                const int* intptr0 = bottom_blob.row<const int>(i * 2);
-                const int* intptr1 = bottom_blob.row<const int>(i * 2 + 1);
-                signed char* ptr = top_blob.row<signed char>(i);
-
-                const Mat scale_in_data_i = scale_in_data_size > 1 ? scale_in_data.range(i * out_elempack, out_elempack) : scale_in_data;
-                const Mat bias_data_i = bias_data_size > 1 ? bias_data.range(i * out_elempack, out_elempack) : bias_data;
-                const Mat scale_out_data_i = scale_out_data_size > 1 ? scale_out_data.range(i * out_elempack, out_elempack) : scale_out_data;
-
-                requantize_pack4to8(intptr0, intptr1, ptr, scale_in_data_i, bias_data_i, scale_out_data_i, activation_type, activation_params, w);
-            }
-        }
-#endif // !__AVX__
-#endif // __SSE2__
-        if (elempack == out_elempack)
+        #pragma omp parallel for num_threads(opt.num_threads)
+        for (int i = 0; i < h; i++)
         {
-            #pragma omp parallel for num_threads(opt.num_threads)
-            for (int i = 0; i < h; i++)
-            {
-                const int* intptr = bottom_blob.row<const int>(i);
-                signed char* ptr = top_blob.row<signed char>(i);
+            const int* intptr = bottom_blob.row<const int>(i);
+            signed char* ptr = top_blob.row<signed char>(i);
 
-                const Mat scale_in_data_i = scale_in_data_size > 1 ? scale_in_data.range(i * elempack, elempack) : scale_in_data;
-                const Mat bias_data_i = bias_data_size > 1 ? bias_data.range(i * elempack, elempack) : bias_data;
-                const Mat scale_out_data_i = scale_out_data_size > 1 ? scale_out_data.range(i * elempack, elempack) : scale_out_data;
+            const Mat scale_in_data_i = scale_in_data_size > 1 ? scale_in_data.range(i * elempack, elempack) : scale_in_data;
+            const Mat bias_data_i = bias_data_size > 1 ? bias_data.range(i * elempack, elempack) : bias_data;
+            const Mat scale_out_data_i = scale_out_data_size > 1 ? scale_out_data.range(i * elempack, elempack) : scale_out_data;
 
-                requantize(intptr, ptr, scale_in_data_i, bias_data_i, scale_out_data_i, activation_type, activation_params, w, elempack);
-            }
+            requantize(intptr, ptr, scale_in_data_i, bias_data_i, scale_out_data_i, activation_type, activation_params, w, elempack);
         }
     }
 
     if (dims == 3)
     {
-        int out_elempack = 1;
-#if __SSE2__
-        if (opt.use_packing_layout)
-        {
-            out_elempack = channels * elempack % 8 == 0 ? 8 : 1;
-        }
-#endif
-        const int outc = channels * elempack / out_elempack;
-        const size_t out_elemsize = out_elempack * 1u;
-
-        top_blob.create(w, h, outc, out_elemsize, out_elempack, opt.blob_allocator);
+        top_blob.create(w, h, channels, out_elemsize, elempack, opt.blob_allocator);
         if (top_blob.empty())
             return -100;
 
-#if __SSE2__
-#if !__AVX__
-        if (elempack == 4 && out_elempack == 8)
-        {
-            #pragma omp parallel for num_threads(opt.num_threads)
-            for (int q = 0; q < outc; q++)
-            {
-                const int* intptr0 = bottom_blob.channel(q * 2);
-                const int* intptr1 = bottom_blob.channel(q * 2 + 1);
-                signed char* ptr = top_blob.channel(q);
-
-                const Mat scale_in_data_q = scale_in_data_size > 1 ? scale_in_data.range(q * out_elempack, out_elempack) : scale_in_data;
-                const Mat bias_data_q = bias_data_size > 1 ? bias_data.range(q * out_elempack, out_elempack) : bias_data;
-                const Mat scale_out_data_q = scale_out_data_size > 1 ? scale_out_data.range(q * out_elempack, out_elempack) : scale_out_data;
-
-                requantize_pack4to8(intptr0, intptr1, ptr, scale_in_data_q, bias_data_q, scale_out_data_q, activation_type, activation_params, w * h);
-            }
-        }
-#endif // !__AVX__
-#endif // __SSE2__
-        if (elempack == out_elempack)
+        #pragma omp parallel for num_threads(opt.num_threads)
+        for (int q = 0; q < channels; q++)
         {
-            #pragma omp parallel for num_threads(opt.num_threads)
-            for (int q = 0; q < channels; q++)
-            {
-                const int* intptr = bottom_blob.channel(q);
-                signed char* ptr = top_blob.channel(q);
+            const int* intptr = bottom_blob.channel(q);
+            signed char* ptr = top_blob.channel(q);
 
-                const Mat scale_in_data_q = scale_in_data_size > 1 ? scale_in_data.range(q * elempack, elempack) : scale_in_data;
-                const Mat bias_data_q = bias_data_size > 1 ? bias_data.range(q * elempack, elempack) : bias_data;
-                const Mat scale_out_data_q = scale_out_data_size > 1 ? scale_out_data.range(q * elempack, elempack) : scale_out_data;
+            const Mat scale_in_data_q = scale_in_data_size > 1 ? scale_in_data.range(q * elempack, elempack) : scale_in_data;
+            const Mat bias_data_q = bias_data_size > 1 ? bias_data.range(q * elempack, elempack) : bias_data;
+            const Mat scale_out_data_q = scale_out_data_size > 1 ? scale_out_data.range(q * elempack, elempack) : scale_out_data;
 
-                requantize(intptr, ptr, scale_in_data_q, bias_data_q, scale_out_data_q, activation_type, activation_params, w * h, elempack);
-            }
+            requantize(intptr, ptr, scale_in_data_q, bias_data_q, scale_out_data_q, activation_type, activation_params, w * h, elempack);
         }
     }
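
Note on the convert_packing calls above: Requantize_x86::forward no longer changes packing on its own (the out_elempack selection and the SSE requantize_pack4to8 kernel are removed), so the convolution path repacks the int32 blob itself before requantize/dequantize. As a rough mental model of the pack4-to-pack8 case, here is a simplified sketch against flat arrays; it is not ncnn's Mat layout or its actual convert_packing implementation, and the names are illustrative only:

#include <vector>

// Repack an int32 blob from elempack=4 to elempack=8: each pair of pack4
// channels is interleaved lane-wise into one pack8 channel, mirroring the
// (elempack == 4 && c % 2 == 0) case handled by convert_packing(..., 8, opt).
// "area" is w * h; src holds channels4 pack4 channels of area elements each.
static void repack_int32_pack4_to_pack8(const std::vector<int>& src, std::vector<int>& dst,
                                        int channels4, int area)
{
    dst.resize(src.size());
    for (int q = 0; q < channels4 / 2; q++)
    {
        const int* p0 = &src[(size_t)(q * 2 + 0) * area * 4]; // supplies lanes 0-3
        const int* p1 = &src[(size_t)(q * 2 + 1) * area * 4]; // supplies lanes 4-7
        int* out = &dst[(size_t)q * area * 8];
        for (int i = 0; i < area; i++)
        {
            for (int k = 0; k < 4; k++)
            {
                out[i * 8 + k] = p0[i * 4 + k];     // lanes 0-3 from channel 2q
                out[i * 8 + 4 + k] = p1[i * 4 + k]; // lanes 4-7 from channel 2q+1
            }
        }
    }
}

An odd pack4 channel count leaves the last channel with no partner to pair with, which is why the requantize branch converts that case down to pack1 (convert_packing(top_blob_int32, tmp, 1, opt)) before requantize_from_int32_to_int8 runs.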