more x86 stub for convolution and convolutiondepthwise

XiaoguangHu01 · Jan 25, 2018 · 03621aa · 03621aa
1 parent 6612178
commit 03621aa
Show file tree

Hide file tree

Showing 5 changed files with 643 additions and 2 deletions.
diff --git a/src/layer/x86/convolution_1x1.h b/src/layer/x86/convolution_1x1.h
@@ -0,0 +1,215 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+// 1x1 convolution, stride 1 (plain scalar code despite the _sse suffix).
+//
+// Each output channel p is filled with its bias and then accumulates, element by element,
+// kernel[p*inch + q] * input channel q for every input channel q.
+//
+//   bottom_blob  input; outw*outh consecutive floats are read from each of its channels
+//   top_blob     output; its w/h/c are read here and its channels are overwritten
+//   _kernel      weights, indexed as kernel[p*inch + q]
+//   _bias        per-output-channel bias; when its pointer is null, 0 is used
+static void conv1x1s1_sse(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias)
+{
+    const int inch = bottom_blob.c;
+
+    const int outw = top_blob.w;
+    const int outh = top_blob.h;
+    const int outch = top_blob.c;
+
+    // Elements walked per channel; loop-invariant, so computed once up front.
+    const int size = outw * outh;
+
+    const float* kernel = _kernel;
+    const float* bias = _bias;
+
+    #pragma omp parallel for
+    for (int p=0; p<outch; p++)
+    {
+        Mat out = top_blob.channel(p);
+
+        const float bias0 = bias ? bias[p] : 0.f;
+
+        // Start from the bias so the loops below only have to accumulate.
+        out.fill(bias0);
+
+        int q = 0;
+
+        // Main loop: four input channels per sweep over the output channel.
+        for (; q+3<inch; q+=4)
+        {
+            float* outptr = out;
+
+            const float* r0 = bottom_blob.channel(q);
+            const float* r1 = bottom_blob.channel(q+1);
+            const float* r2 = bottom_blob.channel(q+2);
+            const float* r3 = bottom_blob.channel(q+3);
+
+            // Four consecutive weights of output channel p, for input channels q..q+3.
+            const float* kernel0 = kernel + p*inch + q;
+            const float k0 = kernel0[0];
+            const float k1 = kernel0[1];
+            const float k2 = kernel0[2];
+            const float k3 = kernel0[3];
+
+            // Products are formed separately and summed left to right before accumulating.
+            for (int remain = size; remain>0; remain--)
+            {
+                float sum = *r0 * k0;
+                float sum1 = *r1 * k1;
+                float sum2 = *r2 * k2;
+                float sum3 = *r3 * k3;
+
+                *outptr += sum + sum1 + sum2 + sum3;
+
+                r0++;
+                r1++;
+                r2++;
+                r3++;
+                outptr++;
+            }
+
+        }
+
+        // Tail loop: the inch % 4 leftover input channels, one at a time.
+        for (; q<inch; q++)
+        {
+            float* outptr = out;
+
+            const float* r0 = bottom_blob.channel(q);
+
+            const float* kernel0 = kernel + p*inch + q;
+            const float k0 = kernel0[0];
+
+            for (int remain = size; remain>0; remain--)
+            {
+                float sum = *r0 * k0;
+
+                *outptr += sum;
+
+                r0++;
+                outptr++;
+            }
+
+        }
+    }
+
+}
+
+// 1x1 convolution, stride 2 (plain scalar code despite the _sse suffix).
+// Each output channel is filled with its bias and then accumulates, for every input channel,
+// weight * input, where output element (y, x) reads the input at offset 2*y*w + 2*x.
+//
+//   bottom_blob  input; its w and c are read; channels are sampled at the offsets given above
+//   top_blob     output; its w/h/c are read here and its channels are overwritten
+//   _kernel      weights, indexed as kernel[oc*inch + ic]
+//   _bias        per-output-channel bias; when its pointer is null, 0 is used
+static void conv1x1s2_sse(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias)
+{
+    const int in_w = bottom_blob.w;
+    const int in_channels = bottom_blob.c;
+
+    const int out_w = top_blob.w;
+    const int out_h = top_blob.h;
+    const int out_channels = top_blob.c;
+
+    // Input offset between the rows sampled by two successive output rows.
+    const int row_step = 2 * in_w;
+
+    const float* weights = _kernel;
+    const float* bias_data = _bias;
+
+    #pragma omp parallel for
+    for (int oc = 0; oc < out_channels; oc++)
+    {
+        Mat dst = top_blob.channel(oc);
+
+        // Seed the channel with its bias so the loops below only accumulate.
+        dst.fill(bias_data ? bias_data[oc] : 0.f);
+
+        // Weights of this output channel: one float per input channel.
+        const float* w_row = weights + oc * in_channels;
+
+        int ic = 0;
+
+        // Main loop: four input channels per sweep over the output channel.
+        for (; ic + 3 < in_channels; ic += 4)
+        {
+            const float* src0 = bottom_blob.channel(ic);
+            const float* src1 = bottom_blob.channel(ic + 1);
+            const float* src2 = bottom_blob.channel(ic + 2);
+            const float* src3 = bottom_blob.channel(ic + 3);
+
+            const float wt0 = w_row[ic];
+            const float wt1 = w_row[ic + 1];
+            const float wt2 = w_row[ic + 2];
+            const float wt3 = w_row[ic + 3];
+
+            float* dst_ptr = dst;
+
+            for (int y = 0; y < out_h; y++)
+            {
+                // Where output row y starts reading in each of the four channels.
+                const float* row0 = src0 + y * row_step;
+                const float* row1 = src1 + y * row_step;
+                const float* row2 = src2 + y * row_step;
+                const float* row3 = src3 + y * row_step;
+
+                for (int x = 0; x < out_w; x++)
+                {
+                    const int col = 2 * x;
+
+                    const float t0 = row0[col] * wt0;
+                    const float t1 = row1[col] * wt1;
+                    const float t2 = row2[col] * wt2;
+                    const float t3 = row3[col] * wt3;
+
+                    // Products are summed left to right, then added to the output.
+                    *dst_ptr += t0 + t1 + t2 + t3;
+
+                    dst_ptr++;
+                }
+            }
+
+        }
+
+        // Tail loop: leftover input channels, one at a time.
+        for (; ic < in_channels; ic++)
+        {
+            const float* src = bottom_blob.channel(ic);
+
+            const float wt = w_row[ic];
+
+            float* dst_ptr = dst;
+
+            for (int y = 0; y < out_h; y++)
+            {
+                // Where output row y starts reading in this channel.
+                const float* row = src + y * row_step;
+
+                for (int x = 0; x < out_w; x++)
+                {
+                    const float t = row[2 * x] * wt;
+
+                    *dst_ptr += t;
+                    dst_ptr++;
+                }
+            }
+
+        }
+    }
+
+}
diff --git a/src/layer/x86/convolution_x86.cpp b/src/layer/x86/convolution_x86.cpp
@@ -16,6 +16,7 @@
 
 namespace ncnn {
 
+#include "convolution_1x1.h"
 #include "convolution_3x3.h"
 #include "convolution_5x5.h"
 
@@ -45,8 +46,8 @@ int Convolution_x86::forward(const Mat& bottom_blob, Mat& top_blob) const
     conv_func conv_func_table[5][5] =
     {
         {
-            0,
-            0,
+            conv1x1s1_sse,
+            conv1x1s2_sse,
             0,
             0,
             0