Skip to content

Commit

Permalink
more x86 stub for convolution and convolutiondepthwise
Browse files Browse the repository at this point in the history
  • Loading branch information
nihui committed Jan 25, 2018
1 parent 6612178 commit 03621aa
Show file tree
Hide file tree
Showing 5 changed files with 643 additions and 2 deletions.
215 changes: 215 additions & 0 deletions src/layer/x86/convolution_1x1.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,215 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

// 1x1 convolution with stride 1, written as plain scalar loops (despite the
// _sse suffix, this body contains no SSE intrinsics).
//
// bottom_blob : input feature map; bottom_blob.c is the number of input channels.
// top_blob    : output feature map. Its w/h/c are only read here, never set, so
//               it is presumably allocated by the caller -- TODO confirm.
// _kernel     : weights, read as a flat float array indexed [p*inch + q] for
//               output channel p and input channel q.
// _bias       : per-output-channel bias, read as a flat float array. When it
//               converts to a null pointer a bias of 0.f is used instead.
//
// For every output channel p the result is
//   out[i] = bias[p] + sum over q of in_q[i] * kernel[p*inch + q]
// for i in [0, outw*outh). Each input channel is read linearly for outw*outh
// elements, i.e. the input plane is assumed to have the same number of
// elements as the output plane (expected for a 1x1 stride-1 convolution).
static void conv1x1s1_sse(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias)
{
    const int inch = bottom_blob.c;

    const int outw = top_blob.w;
    const int outh = top_blob.h;
    const int outch = top_blob.c;

    // Number of elements processed per channel plane; invariant for the whole call.
    const int size = outw * outh;

    const float* kernel = _kernel;
    const float* bias = _bias;

    // Each iteration writes only to its own output channel p.
    #pragma omp parallel for
    for (int p=0; p<outch; p++)
    {
        Mat out = top_blob.channel(p);

        const float bias0 = bias ? bias[p] : 0.f;

        // Start from the bias; the loops below accumulate on top of it.
        out.fill(bias0);

        int q = 0;

        // Main loop: consume four input channels per pass over the output plane.
        for (; q+3<inch; q+=4)
        {
            float* outptr = out;

            const float* r0 = bottom_blob.channel(q);
            const float* r1 = bottom_blob.channel(q+1);
            const float* r2 = bottom_blob.channel(q+2);
            const float* r3 = bottom_blob.channel(q+3);

            const float* kernel0 = kernel + p*inch + q;
            const float k0 = kernel0[0];
            const float k1 = kernel0[1];
            const float k2 = kernel0[2];
            const float k3 = kernel0[3];

            for (int remain = size; remain>0; remain--)
            {
                // The four products are formed separately and summed left to
                // right before being added to the output element.
                float sum = *r0 * k0;
                float sum1 = *r1 * k1;
                float sum2 = *r2 * k2;
                float sum3 = *r3 * k3;

                *outptr += sum + sum1 + sum2 + sum3;

                r0++;
                r1++;
                r2++;
                r3++;
                outptr++;
            }
        }

        // Tail loop: the remaining (inch % 4) input channels, one at a time.
        for (; q<inch; q++)
        {
            float* outptr = out;

            const float* r0 = bottom_blob.channel(q);

            const float k0 = kernel[p*inch + q];

            for (int remain = size; remain>0; remain--)
            {
                float sum = *r0 * k0;

                *outptr += sum;

                r0++;
                outptr++;
            }
        }
    }
}

// 1x1 convolution with stride 2, written as plain scalar loops.
//
// bottom_blob : input feature map (width w, bottom_blob.c input channels).
// top_blob    : output feature map; its w/h/c drive the loop bounds.
// _kernel     : weights, read as a flat float array indexed [p*inch + q].
// _bias       : per-output-channel bias, or something converting to a null
//               pointer, in which case 0.f is used.
//
// Output element (y, x) of channel p accumulates, on top of the bias,
// in_q[(2*y)*w + 2*x] * kernel[p*inch + q] over all input channels q.
static void conv1x1s2_sse(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias)
{
    const int w = bottom_blob.w;
    const int inch = bottom_blob.c;

    const int outw = top_blob.w;
    const int outh = top_blob.h;
    const int outch = top_blob.c;

    // Leftover input elements to skip after a row has been sampled, so that the
    // next sampled row starts two input rows further down.
    const int tailstep = 2 * (w - outw);

    // Distance between the starts of two consecutively sampled input rows:
    // the 2*outw elements covered while sampling plus the tail skip.
    const int rowstep = 2 * outw + tailstep;

    const float* kernel = _kernel;
    const float* bias = _bias;

    #pragma omp parallel for
    for (int p=0; p<outch; p++)
    {
        Mat out = top_blob.channel(p);

        // Seed the whole output plane with the bias (or zero).
        const float bias0 = bias ? bias[p] : 0.f;
        out.fill(bias0);

        const float* weights = kernel + p*inch;

        int q = 0;

        // Four input channels per sweep over the output plane.
        while (q+3 < inch)
        {
            const float k0 = weights[q];
            const float k1 = weights[q+1];
            const float k2 = weights[q+2];
            const float k3 = weights[q+3];

            const float* src0 = bottom_blob.channel(q);
            const float* src1 = bottom_blob.channel(q+1);
            const float* src2 = bottom_blob.channel(q+2);
            const float* src3 = bottom_blob.channel(q+3);

            float* dst = out;

            for (int y = 0; y < outh; y++)
            {
                for (int x = 0; x < outw; x++)
                {
                    // Sample every second input column.
                    const int sx = 2 * x;

                    float v0 = src0[sx] * k0;
                    float v1 = src1[sx] * k1;
                    float v2 = src2[sx] * k2;
                    float v3 = src3[sx] * k3;

                    dst[x] += v0 + v1 + v2 + v3;
                }

                // Move to the next sampled input row and the next output row.
                src0 += rowstep;
                src1 += rowstep;
                src2 += rowstep;
                src3 += rowstep;
                dst += outw;
            }

            q += 4;
        }

        // Remaining input channels, one at a time.
        while (q < inch)
        {
            const float k0 = weights[q];

            const float* src0 = bottom_blob.channel(q);

            float* dst = out;

            for (int y = 0; y < outh; y++)
            {
                for (int x = 0; x < outw; x++)
                {
                    float v0 = src0[2 * x] * k0;

                    dst[x] += v0;
                }

                src0 += rowstep;
                dst += outw;
            }

            q++;
        }
    }
}
5 changes: 3 additions & 2 deletions src/layer/x86/convolution_x86.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@

namespace ncnn {

#include "convolution_1x1.h"
#include "convolution_3x3.h"
#include "convolution_5x5.h"

Expand Down Expand Up @@ -45,8 +46,8 @@ int Convolution_x86::forward(const Mat& bottom_blob, Mat& top_blob) const
conv_func conv_func_table[5][5] =
{
{
0,
0,
conv1x1s1_sse,
conv1x1s2_sse,
0,
0,
0
Expand Down
Loading

0 comments on commit 03621aa

Please sign in to comment.