diff --git a/src/layer/x86/convolution_1x1.h b/src/layer/x86/convolution_1x1.h
new file mode 100644
index 00000000000..b324740f35d
--- /dev/null
+++ b/src/layer/x86/convolution_1x1.h
@@ -0,0 +1,215 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+static void conv1x1s1_sse(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias)
+{
+    int w = bottom_blob.w;
+    int h = bottom_blob.h;
+    int inch = bottom_blob.c;
+
+    int outw = top_blob.w;
+    int outh = top_blob.h;
+    int outch = top_blob.c;
+
+    const float* kernel = _kernel;
+    const float* bias = _bias;
+
+    #pragma omp parallel for
+    for (int p=0; p<outch; p++)
+    {
+        Mat out = top_blob.channel(p);
+
+        const float bias0 = bias ? bias[p] : 0.f;
+
+        out.fill(bias0);
+
+        int q = 0;
+
+        for (; q+3<inch; q+=4)
+        {
+            float* outptr = out;
+
+            const float* img0 = bottom_blob.channel(q);
+            const float* img1 = bottom_blob.channel(q+1);
+            const float* img2 = bottom_blob.channel(q+2);
+            const float* img3 = bottom_blob.channel(q+3);
+
+            const float* kernel0 = kernel + p*inch + q;
+            const float k0 = kernel0[0];
+            const float k1 = kernel0[1];
+            const float k2 = kernel0[2];
+            const float k3 = kernel0[3];
+
+            const float* r0 = img0;
+            const float* r1 = img1;
+            const float* r2 = img2;
+            const float* r3 = img3;
+
+            int size = outw * outh;
+            int remain = size;
+
+            for (; remain>0; remain--)
+            {
+                float sum = *r0 * k0;
+                float sum1 = *r1 * k1;
+                float sum2 = *r2 * k2;
+                float sum3 = *r3 * k3;
+
+                *outptr += sum + sum1 + sum2 + sum3;
+
+                r0++;
+                r1++;
+                r2++;
+                r3++;
+                outptr++;
+            }
+
+        }
+
+        for (; q<inch; q++)
+        {
+            float* outptr = out;
+
+            const float* img0 = bottom_blob.channel(q);
+
+            const float* kernel0 = kernel + p*inch + q;
+            const float k0 = kernel0[0];
+
+            const float* r0 = img0;
+
+            int size = outw * outh;
+            int remain = size;
+
+            for (; remain>0; remain--)
+            {
+                float sum = *r0 * k0;
+
+                *outptr += sum;
+
+                r0++;
+                outptr++;
+            }
+
+        }
+    }
+
+}
+
+static void conv1x1s2_sse(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias)
+{
+    int w = bottom_blob.w;
+    int inch = bottom_blob.c;
+
+    int outw = top_blob.w;
+    int outh = top_blob.h;
+    int outch = top_blob.c;
+
+    const int tailstep = w - 2*outw + w;
+
+    const float* kernel = _kernel;
+    const float* bias = _bias;
+
+    #pragma omp parallel for
+    for (int p=0; p<outch; p++)
+    {
+        Mat out = top_blob.channel(p);
+
+        const float bias0 = bias ? bias[p] : 0.f;
+
+        out.fill(bias0);
+
+        int q = 0;
+
+        for (; q+3<inch; q+=4)
+        {
+            float* outptr = out;
+
+            const float* img0 = bottom_blob.channel(q);
+            const float* img1 = bottom_blob.channel(q+1);
+            const float* img2 = bottom_blob.channel(q+2);
+            const float* img3 = bottom_blob.channel(q+3);
+
+            const float* kernel0 = kernel + p*inch + q;
+            const float k0 = kernel0[0];
+            const float k1 = kernel0[1];
+            const float k2 = kernel0[2];
+            const float k3 = kernel0[3];
+
+            const float* r0 = img0;
+            const float* r1 = img1;
+            const float* r2 = img2;
+            const float* r3 = img3;
+
+            for (int i = 0; i < outh; i++)
+            {
+                int remain = outw;
+
+                for (; remain>0; remain--)
+                {
+                    float sum = *r0 * k0;
+                    float sum1 = *r1 * k1;
+                    float sum2 = *r2 * k2;
+                    float sum3 = *r3 * k3;
+
+                    *outptr += sum + sum1 + sum2 + sum3;
+
+                    r0 += 2;
+                    r1 += 2;
+                    r2 += 2;
+                    r3 += 2;
+                    outptr++;
+                }
+
+                r0 += tailstep;
+                r1 += tailstep;
+                r2 += tailstep;
+                r3 += tailstep;
+            }
+
+        }
+
+        for (; q<inch; q++)
+        {
+            float* outptr = out;
+
+            const float* img0 = bottom_blob.channel(q);
+
+            const float* kernel0 = kernel + p*inch + q;
+            const float k0 = kernel0[0];
+
+            const float* r0 = img0;
+
+            for (int i = 0; i < outh; i++)
+            {
+                int remain = outw;
+
+                for (; remain>0; remain--)
+                {
+                    float sum = *r0 * k0;
+
+                    *outptr += sum;
+
+                    r0 += 2;
+                    outptr++;
+                }
+
+                r0 += tailstep;
+            }
+
+        }
+    }
+
+}
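Note that despite the `_sse` suffix, the inner loops in convolution_1x1.h are plain scalar C++; the suffix only distinguishes the x86 implementation from the ARM/NEON one. Below is a minimal sketch of what an actually vectorized accumulation loop for conv1x1s1_sse could look like with SSE intrinsics. It is not part of the patch; the helper name and signature are made up for illustration, and it assumes each channel plane is a contiguous run of `size` floats:

    #include <xmmintrin.h> // SSE

    // Accumulate four input channels into one output plane, four pixels per step.
    static void conv1x1s1_inner_sse(const float* r0, const float* r1,
                                    const float* r2, const float* r3,
                                    float k0, float k1, float k2, float k3,
                                    float* outptr, int size)
    {
        __m128 vk0 = _mm_set1_ps(k0);
        __m128 vk1 = _mm_set1_ps(k1);
        __m128 vk2 = _mm_set1_ps(k2);
        __m128 vk3 = _mm_set1_ps(k3);

        int i = 0;
        for (; i + 3 < size; i += 4)
        {
            // acc = out + r0*k0 + r1*k1 + r2*k2 + r3*k3
            __m128 acc = _mm_loadu_ps(outptr + i);
            acc = _mm_add_ps(acc, _mm_mul_ps(_mm_loadu_ps(r0 + i), vk0));
            acc = _mm_add_ps(acc, _mm_mul_ps(_mm_loadu_ps(r1 + i), vk1));
            acc = _mm_add_ps(acc, _mm_mul_ps(_mm_loadu_ps(r2 + i), vk2));
            acc = _mm_add_ps(acc, _mm_mul_ps(_mm_loadu_ps(r3 + i), vk3));
            _mm_storeu_ps(outptr + i, acc);
        }
        for (; i < size; i++) // scalar tail for size not divisible by 4
            outptr[i] += r0[i] * k0 + r1[i] * k1 + r2[i] * k2 + r3[i] * k3;
    }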
diff --git a/src/layer/x86/convolution_x86.cpp b/src/layer/x86/convolution_x86.cpp
index 8ddf9b1e675..c36e80b9547 100644
--- a/src/layer/x86/convolution_x86.cpp
+++ b/src/layer/x86/convolution_x86.cpp
@@ -16,6 +16,7 @@
 
 namespace ncnn {
 
+#include "convolution_1x1.h"
 #include "convolution_3x3.h"
 #include "convolution_5x5.h"
 
@@ -45,8 +46,8 @@ int Convolution_x86::forward(const Mat& bottom_blob, Mat& top_blob) const
     conv_func conv_func_table[5][5] =
     {
         {
-            0,
-            0,
+            conv1x1s1_sse,
+            conv1x1s2_sse,
             0,
             0,
             0
diff --git a/src/layer/x86/convolutiondepthwise_3x3.h b/src/layer/x86/convolutiondepthwise_3x3.h
new file mode 100644
index 00000000000..d14948de3bd
--- /dev/null
+++ b/src/layer/x86/convolutiondepthwise_3x3.h
@@ -0,0 +1,201 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+static void convdw3x3s1_sse(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias)
+{
+    int w = bottom_blob.w;
+
+    int outw = top_blob.w;
+    int outh = top_blob.h;
+
+    const int group = bottom_blob.c;
+
+    const float* kernel = _kernel;
+    const float* bias = _bias;
+
+    #pragma omp parallel for
+    for (int g=0; g<group; g++)
+    {
+        Mat out = top_blob.channel(g);
+
+        const float bias0 = bias ? bias[g] : 0.f;
+
+        const float* kernel0 = kernel + g*9;
+
+        float* outptr = out;
+        float* outptr2 = outptr + outw;
+
+        const float* img0 = bottom_blob.channel(g);
+
+        const float* r0 = img0;
+        const float* r1 = img0 + w;
+        const float* r2 = img0 + w*2;
+        const float* r3 = img0 + w*3;
+
+        const float* k0 = kernel0;
+        const float* k1 = kernel0 + 3;
+        const float* k2 = kernel0 + 6;
+
+        int i = 0;
+
+        for (; i+1 < outh; i+=2)
+        {
+            int remain = outw;
+
+            for (; remain>0; remain--)
+            {
+                float sum = bias0;
+                sum += r0[0] * k0[0];
+                sum += r0[1] * k0[1];
+                sum += r0[2] * k0[2];
+                sum += r1[0] * k1[0];
+                sum += r1[1] * k1[1];
+                sum += r1[2] * k1[2];
+                sum += r2[0] * k2[0];
+                sum += r2[1] * k2[1];
+                sum += r2[2] * k2[2];
+
+                float sum2 = bias0;
+                sum2 += r1[0] * k0[0];
+                sum2 += r1[1] * k0[1];
+                sum2 += r1[2] * k0[2];
+                sum2 += r2[0] * k1[0];
+                sum2 += r2[1] * k1[1];
+                sum2 += r2[2] * k1[2];
+                sum2 += r3[0] * k2[0];
+                sum2 += r3[1] * k2[1];
+                sum2 += r3[2] * k2[2];
+
+                *outptr = sum;
+                *outptr2 = sum2;
+
+                r0++;
+                r1++;
+                r2++;
+                r3++;
+                outptr++;
+                outptr2++;
+            }
+
+            r0 += 2 + w;
+            r1 += 2 + w;
+            r2 += 2 + w;
+            r3 += 2 + w;
+
+            outptr += outw;
+            outptr2 += outw;
+        }
+
+        for (; i < outh; i++)
+        {
+            int remain = outw;
+
+            for (; remain>0; remain--)
+            {
+                float sum = bias0;
+                sum += r0[0] * k0[0];
+                sum += r0[1] * k0[1];
+                sum += r0[2] * k0[2];
+                sum += r1[0] * k1[0];
+                sum += r1[1] * k1[1];
+                sum += r1[2] * k1[2];
+                sum += r2[0] * k2[0];
+                sum += r2[1] * k2[1];
+                sum += r2[2] * k2[2];
+
+                *outptr = sum;
+
+                r0++;
+                r1++;
+                r2++;
+                outptr++;
+            }
+
+            r0 += 2;
+            r1 += 2;
+            r2 += 2;
+        }
+    }
+}
+
+static void convdw3x3s2_sse(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias)
+{
+    int w = bottom_blob.w;
+
+    int outw = top_blob.w;
+    int outh = top_blob.h;
+
+    const int group = bottom_blob.c;
+
+    const int tailstep = w - 2*outw + w;
+
+    const float* kernel = _kernel;
+    const float* bias = _bias;
+
+    #pragma omp parallel for
+    for (int g=0; g<group; g++)
+    {
+        Mat out = top_blob.channel(g);
+
+        const float bias0 = bias ? bias[g] : 0.f;
+
+        const float* kernel0 = kernel + g*9;
+
+        float* outptr = out;
+
+        const float* img0 = bottom_blob.channel(g);
+
+        const float* r0 = img0;
+        const float* r1 = img0 + w;
+        const float* r2 = img0 + w*2;
+
+        const float* k0 = kernel0;
+        const float* k1 = kernel0 + 3;
+        const float* k2 = kernel0 + 6;
+
+        int i = 0;
+
+        for (; i < outh; i++)
+        {
+            int remain = outw;
+
+            for (; remain>0; remain--)
+            {
+                float sum = bias0;
+                sum += r0[0] * k0[0];
+                sum += r0[1] * k0[1];
+                sum += r0[2] * k0[2];
+                sum += r1[0] * k1[0];
+                sum += r1[1] * k1[1];
+                sum += r1[2] * k1[2];
+                sum += r2[0] * k2[0];
+                sum += r2[1] * k2[1];
+                sum += r2[2] * k2[2];
+
+                *outptr = sum;
+
+                r0 += 2;
+                r1 += 2;
+                r2 += 2;
+                outptr++;
+            }
+
+            r0 += tailstep;
+            r1 += tailstep;
+            r2 += tailstep;
+        }
+
+    }
+}
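Two details of the pointer bookkeeping above are worth spelling out. First, convdw3x3s1_sse computes two output rows per pass (outptr/outptr2 over the four input rows r0..r3), so the r1 and r2 rows are read once but contribute to both output rows; after a pass, `r0 += 2 + w` skips the 2-pixel right border plus one full input row, while `outptr += outw` accounts for the second output row that outptr did not walk itself. Second, in the stride-2 kernels the inner loop advances each input pointer by 2 per output pixel, i.e. by 2*outw per output row, so `tailstep = w - 2*outw + w` skips the unread tail of the current input row plus the entire following row. A standalone sanity check of that arithmetic (not part of the patch):

    #include <cstdio>

    int main()
    {
        // bordered input width 7, 3x3 kernel, stride 2 -> outw = (7-3)/2+1 = 3
        const int w = 7;
        const int outw = (w - 3) / 2 + 1;
        const int tailstep = w - 2*outw + w;

        // the inner loop advances r0 by 2*outw per output row; tailstep must
        // land it at the start of the input row two lines further down
        int offset = 2*outw + tailstep;
        printf("outw=%d tailstep=%d offset=%d (expected %d)\n",
               outw, tailstep, offset, 2*w);
        return 0;
    }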
diff --git a/src/layer/x86/convolutiondepthwise_x86.cpp b/src/layer/x86/convolutiondepthwise_x86.cpp
new file mode 100644
index 00000000000..8de31d88081
--- /dev/null
+++ b/src/layer/x86/convolutiondepthwise_x86.cpp
@@ -0,0 +1,194 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "convolutiondepthwise_x86.h"
+
+#ifdef _OPENMP
+#include <omp.h>
+#endif
+
+namespace ncnn {
+
+#include "convolution_1x1.h"
+#include "convolution_3x3.h"
+#include "convolution_5x5.h"
+
+#include "convolutiondepthwise_3x3.h"
+
+DEFINE_LAYER_CREATOR(ConvolutionDepthWise_x86)
+
+int ConvolutionDepthWise_x86::forward(const Mat& bottom_blob, Mat& top_blob) const
+{
+    // convolve with NxN kernel
+    // value = value + bias
+
+    if (kernel_w != kernel_h || stride_w != stride_h)
+    {
+        return ConvolutionDepthWise::forward(bottom_blob, top_blob);
+    }
+
+    const int kernel_size = kernel_w;
+    const int stride = stride_w;
+
+    if (kernel_size > 5 || stride > 5 || dilation_w != 1 || dilation_h != 1)
+    {
+        return ConvolutionDepthWise::forward(bottom_blob, top_blob);
+    }
+
+    typedef void (*conv_func)(const Mat&, Mat&, const Mat&, const Mat&);
+
+    // kernel_size x stride
+    conv_func conv_func_table[5][5] =
+    {
+        {
+            conv1x1s1_sse,
+            conv1x1s2_sse,
+            0,
+            0,
+            0
+        }, // kernel_size = 1
+        {
+            0,
+            0,
+            0,
+            0,
+            0
+        }, // kernel_size = 2
+        {
+            conv3x3s1_sse,
+            0,
+            0,
+            0,
+            0
+        }, // kernel_size = 3
+        {
+            0,
+            0,
+            0,
+            0,
+            0
+        }, // kernel_size = 4
+        {
+            conv5x5s1_sse,
+            0,
+            0,
+            0,
+            0
+        } // kernel_size = 5
+    };
+
+    conv_func conv = conv_func_table[kernel_size-1][stride-1];
+    if (!conv)
+    {
+        return ConvolutionDepthWise::forward(bottom_blob, top_blob);
+    }
+
+    int w = bottom_blob.w;
+    int h = bottom_blob.h;
+    int channels = bottom_blob.c;
+
+    Mat bottom_blob_bordered = bottom_blob;
+    if (pad_w > 0 || pad_h > 0)
+    {
+        copy_make_border(bottom_blob, bottom_blob_bordered, pad_h, pad_h, pad_w, pad_w, BORDER_CONSTANT, 0.f);
+        if (bottom_blob_bordered.empty())
+            return -100;
+
+        w = bottom_blob_bordered.w;
+        h = bottom_blob_bordered.h;
+    }
+    else if (pad_w == -233 && pad_h == -233)
+    {
+        int wpad = kernel_size + (w - 1) / stride * stride - w;
+        int hpad = kernel_size + (h - 1) / stride * stride - h;
+        if (wpad > 0 || hpad > 0)
+        {
+            copy_make_border(bottom_blob, bottom_blob_bordered, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, BORDER_CONSTANT, 0.f);
+            if (bottom_blob_bordered.empty())
+                return -100;
+        }
+
+        w = bottom_blob_bordered.w;
+        h = bottom_blob_bordered.h;
+    }
+
+    int outw = (w - kernel_size) / stride + 1;
+    int outh = (h - kernel_size) / stride + 1;
+
+    top_blob.create(outw, outh, num_output);
+    if (top_blob.empty())
+        return -100;
+
+    const int maxk = kernel_size * kernel_size;
+
+    // depth-wise
+    if (channels == group && group == num_output)
+    {
+        if (kernel_size == 3)
+        {
+            if (stride == 1)
+            {
+                convdw3x3s1_sse(bottom_blob_bordered, top_blob, weight_data, bias_data);
+                return 0;
+            }
+            else if (stride == 2)
+            {
+                convdw3x3s2_sse(bottom_blob_bordered, top_blob, weight_data, bias_data);
+                return 0;
+            }
+        }
+
+#ifdef _OPENMP
+        int nested_current = omp_get_nested();
+        omp_set_nested(0);
+#endif
+
+        #pragma omp parallel for
+        for (int g=0; g<group; g++)
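The `omp_get_nested()`/`omp_set_nested(0)` pair before the generic per-group loop saves the current setting and disables nested OpenMP parallelism, so the `parallel for` over groups does not multiply with any parallel region inside the called conv kernel; presumably the saved value is restored with `omp_set_nested(nested_current)` once the loop finishes. The body of that loop is not shown above; a sketch of the usual per-group slicing in this depth-wise branch (one channel in, one channel out per group), where the Mat view constructors and the bias handling are assumptions for illustration rather than this patch's code:

    #pragma omp parallel for
    for (int g=0; g<group; g++)
    {
        // wrap this group's single input/output channel as 1-channel views
        // (assumed Mat(w, h, c, data) constructor aliasing existing memory)
        Mat bottom_blob_g(w, h, 1, bottom_blob_bordered.channel(g));
        Mat top_blob_g(outw, outh, 1, top_blob.channel(g));

        // per-group slice of the weights: maxk coefficients per channel;
        // a real implementation would also handle the no-bias case
        const Mat weight_data_g(maxk, (void*)((const float*)weight_data + maxk * g));
        const Mat bias_data_g(1, (void*)((const float*)bias_data + g));

        // reuse the single-blob kernel selected from conv_func_table
        conv(bottom_blob_g, top_blob_g, weight_data_g, bias_data_g);
    }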