diff --git a/src/layer/x86/convolution_1x1.h b/src/layer/x86/convolution_1x1.h
new file mode 100644
index 00000000000..b324740f35d
--- /dev/null
+++ b/src/layer/x86/convolution_1x1.h
@@ -0,0 +1,215 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+static void conv1x1s1_sse(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias)
+{
+    int w = bottom_blob.w;
+    int h = bottom_blob.h;
+    int inch = bottom_blob.c;
+
+    int outw = top_blob.w;
+    int outh = top_blob.h;
+    int outch = top_blob.c;
+
+    const float* kernel = _kernel;
+    const float* bias = _bias;
+
+    #pragma omp parallel for
+    for (int p=0; p<outch; p++)
+    {
+        Mat out = top_blob.channel(p);
+
+        const float bias0 = bias ? bias[p] : 0.f;
+
+        out.fill(bias0);
+
+        int q = 0;
+
+        for (; q+3<inch; q+=4)
+        {
+            float* outptr = out;
+
+            const float* img0 = bottom_blob.channel(q);
+            const float* img1 = bottom_blob.channel(q+1);
+            const float* img2 = bottom_blob.channel(q+2);
+            const float* img3 = bottom_blob.channel(q+3);
+
+            const float* kernel0 = kernel + p*inch + q;
+            const float k0 = kernel0[0];
+            const float k1 = kernel0[1];
+            const float k2 = kernel0[2];
+            const float k3 = kernel0[3];
+
+            const float* r0 = img0;
+            const float* r1 = img1;
+            const float* r2 = img2;
+            const float* r3 = img3;
+
+            int size = outw * outh;
+            int remain = size;
+
+            for (; remain>0; remain--)
+            {
+                float sum = *r0 * k0;
+                float sum1 = *r1 * k1;
+                float sum2 = *r2 * k2;
+                float sum3 = *r3 * k3;
+
+                *outptr += sum + sum1 + sum2 + sum3;
+
+                r0++;
+                r1++;
+                r2++;
+                r3++;
+                outptr++;
+            }
+
+        }
+
+        for (; q<inch; q++)
+        {
+            float* outptr = out;
+
+            const float* img0 = bottom_blob.channel(q);
+
+            const float* kernel0 = kernel + p*inch + q;
+            const float k0 = kernel0[0];
+
+            const float* r0 = img0;
+
+            int size = outw * outh;
+            int remain = size;
+
+            for (; remain>0; remain--)
+            {
+                float sum = *r0 * k0;
+
+                *outptr += sum;
+
+                r0++;
+                outptr++;
+            }
+
+        }
+    }
+
+}
+
+static void conv1x1s2_sse(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias)
+{
+    int w = bottom_blob.w;
+    int inch = bottom_blob.c;
+
+    int outw = top_blob.w;
+    int outh = top_blob.h;
+    int outch = top_blob.c;
+
+    const int tailstep = w - 2*outw + w;
+
+    const float* kernel = _kernel;
+    const float* bias = _bias;
+
+    #pragma omp parallel for
+    for (int p=0; p<outch; p++)
+    {
+        Mat out = top_blob.channel(p);
+
+        const float bias0 = bias ? bias[p] : 0.f;
+
+        out.fill(bias0);
+
+        int q = 0;
+
+        for (; q+3<inch; q+=4)
+        {
+            float* outptr = out;
+
+            const float* img0 = bottom_blob.channel(q);
+            const float* img1 = bottom_blob.channel(q+1);
+            const float* img2 = bottom_blob.channel(q+2);
+            const float* img3 = bottom_blob.channel(q+3);
+
+            const float* kernel0 = kernel + p*inch + q;
+            const float k0 = kernel0[0];
+            const float k1 = kernel0[1];
+            const float k2 = kernel0[2];
+            const float k3 = kernel0[3];
+
+            const float* r0 = img0;
+            const float* r1 = img1;
+            const float* r2 = img2;
+            const float* r3 = img3;
+
+            for (int i = 0; i < outh; i++)
+            {
+                int remain = outw;
+
+                for (; remain>0; remain--)
+                {
+                    float sum = *r0 * k0;
+                    float sum1 = *r1 * k1;
+                    float sum2 = *r2 * k2;
+                    float sum3 = *r3 * k3;
+
+                    *outptr += sum + sum1 + sum2 + sum3;
+
+                    r0 += 2;
+                    r1 += 2;
+                    r2 += 2;
+                    r3 += 2;
+                    outptr++;
+                }
+
+                r0 += tailstep;
+                r1 += tailstep;
+                r2 += tailstep;
+                r3 += tailstep;
+            }
+
+        }
+
+        for (; q<inch; q++)
+        {
+            float* outptr = out;
+
+            const float* img0 = bottom_blob.channel(q);
+
+            const float* kernel0 = kernel + p*inch + q;
+            const float k0 = kernel0[0];
+
+            const float* r0 = img0;
+
+            for (int i = 0; i < outh; i++)
+            {
+                int remain = outw;
+
+                for (; remain>0; remain--)
+                {
+                    float sum = *r0 * k0;
+
+                    *outptr += sum;
+
+                    r0 += 2;
+                    outptr++;
+                }
+
+                r0 += tailstep;
+            }
+
+        }
+    }
+
+}
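Note that despite the `_sse` suffix, the inner loops in convolution_1x1.h are plain scalar C++; the suffix only distinguishes the x86 implementation from the ARM/NEON one. Below is a minimal sketch of what an actually vectorized accumulation loop for conv1x1s1_sse could look like with SSE intrinsics. It is not part of the patch; the helper name and signature are made up for illustration, and it assumes each channel plane is a contiguous run of `size` floats:

    #include <xmmintrin.h> // SSE

    // Accumulate four input channels into one output plane, four pixels per step.
    static void conv1x1s1_inner_sse(const float* r0, const float* r1,
                                    const float* r2, const float* r3,
                                    float k0, float k1, float k2, float k3,
                                    float* outptr, int size)
    {
        __m128 vk0 = _mm_set1_ps(k0);
        __m128 vk1 = _mm_set1_ps(k1);
        __m128 vk2 = _mm_set1_ps(k2);
        __m128 vk3 = _mm_set1_ps(k3);

        int i = 0;
        for (; i + 3 < size; i += 4)
        {
            // acc = out + r0*k0 + r1*k1 + r2*k2 + r3*k3
            __m128 acc = _mm_loadu_ps(outptr + i);
            acc = _mm_add_ps(acc, _mm_mul_ps(_mm_loadu_ps(r0 + i), vk0));
            acc = _mm_add_ps(acc, _mm_mul_ps(_mm_loadu_ps(r1 + i), vk1));
            acc = _mm_add_ps(acc, _mm_mul_ps(_mm_loadu_ps(r2 + i), vk2));
            acc = _mm_add_ps(acc, _mm_mul_ps(_mm_loadu_ps(r3 + i), vk3));
            _mm_storeu_ps(outptr + i, acc);
        }
        for (; i < size; i++) // scalar tail for size not divisible by 4
            outptr[i] += r0[i] * k0 + r1[i] * k1 + r2[i] * k2 + r3[i] * k3;
    }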
diff --git a/src/layer/x86/convolution_x86.cpp b/src/layer/x86/convolution_x86.cpp
index 8ddf9b1e675..c36e80b9547 100644
--- a/src/layer/x86/convolution_x86.cpp
+++ b/src/layer/x86/convolution_x86.cpp
@@ -16,6 +16,7 @@
 
 namespace ncnn {
 
+#include "convolution_1x1.h"
 #include "convolution_3x3.h"
 #include "convolution_5x5.h"
 
@@ -45,8 +46,8 @@ int Convolution_x86::forward(const Mat& bottom_blob, Mat& top_blob) const
     conv_func conv_func_table[5][5] =
     {
         {
-            0,
-            0,
+            conv1x1s1_sse,
+            conv1x1s2_sse,
             0,
             0,
             0
diff --git a/src/layer/x86/convolutiondepthwise_3x3.h b/src/layer/x86/convolutiondepthwise_3x3.h
new file mode 100644
index 00000000000..d14948de3bd
--- /dev/null
+++ b/src/layer/x86/convolutiondepthwise_3x3.h
@@ -0,0 +1,201 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+static void convdw3x3s1_sse(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias)
+{
+    int w = bottom_blob.w;
+
+    int outw = top_blob.w;
+    int outh = top_blob.h;
+
+    const int group = bottom_blob.c;
+
+    const float* kernel = _kernel;
+    const float* bias = _bias;
+
+    #pragma omp parallel for
+    for (int g=0; g<group; g++)
+    {
+        Mat out = top_blob.channel(g);
+
+        const float bias0 = bias ? bias[g] : 0.f;
+
+        const float* kernel0 = kernel + g*9;
+
+        float* outptr = out;
+        float* outptr2 = outptr + outw;
+
+        const float* img0 = bottom_blob.channel(g);
+
+        const float* r0 = img0;
+        const float* r1 = img0 + w;
+        const float* r2 = img0 + w*2;
+        const float* r3 = img0 + w*3;
+
+        const float* k0 = kernel0;
+        const float* k1 = kernel0 + 3;
+        const float* k2 = kernel0 + 6;
+
+        int i = 0;
+
+        for (; i+1 < outh; i+=2)
+        {
+            int remain = outw;
+
+            for (; remain>0; remain--)
+            {
+                float sum = bias0;
+                sum += r0[0] * k0[0];
+                sum += r0[1] * k0[1];
+                sum += r0[2] * k0[2];
+                sum += r1[0] * k1[0];
+                sum += r1[1] * k1[1];
+                sum += r1[2] * k1[2];
+                sum += r2[0] * k2[0];
+                sum += r2[1] * k2[1];
+                sum += r2[2] * k2[2];
+
+                float sum2 = bias0;
+                sum2 += r1[0] * k0[0];
+                sum2 += r1[1] * k0[1];
+                sum2 += r1[2] * k0[2];
+                sum2 += r2[0] * k1[0];
+                sum2 += r2[1] * k1[1];
+                sum2 += r2[2] * k1[2];
+                sum2 += r3[0] * k2[0];
+                sum2 += r3[1] * k2[1];
+                sum2 += r3[2] * k2[2];
+
+                *outptr = sum;
+                *outptr2 = sum2;
+
+                r0++;
+                r1++;
+                r2++;
+                r3++;
+                outptr++;
+                outptr2++;
+            }
+
+            r0 += 2 + w;
+            r1 += 2 + w;
+            r2 += 2 + w;
+            r3 += 2 + w;
+
+            outptr += outw;
+            outptr2 += outw;
+        }
+
+        for (; i < outh; i++)
+        {
+            int remain = outw;
+
+            for (; remain>0; remain--)
+            {
+                float sum = bias0;
+                sum += r0[0] * k0[0];
+                sum += r0[1] * k0[1];
+                sum += r0[2] * k0[2];
+                sum += r1[0] * k1[0];
+                sum += r1[1] * k1[1];
+                sum += r1[2] * k1[2];
+                sum += r2[0] * k2[0];
+                sum += r2[1] * k2[1];
+                sum += r2[2] * k2[2];
+
+                *outptr = sum;
+
+                r0++;
+                r1++;
+                r2++;
+                outptr++;
+            }
+
+            r0 += 2;
+            r1 += 2;
+            r2 += 2;
+        }
+    }
+}
+
+static void convdw3x3s2_sse(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias)
+{
+    int w = bottom_blob.w;
+
+    int outw = top_blob.w;
+    int outh = top_blob.h;
+
+    const int group = bottom_blob.c;
+
+    const int tailstep = w - 2*outw + w;
+
+    const float* kernel = _kernel;
+    const float* bias = _bias;
+
+    #pragma omp parallel for
+    for (int g=0; g<group; g++)
+    {
+        Mat out = top_blob.channel(g);
+
+        const float bias0 = bias ? bias[g] : 0.f;
+
+        const float* kernel0 = kernel + g*9;
+
+        float* outptr = out;
+
+        const float* img0 = bottom_blob.channel(g);
+
+        const float* r0 = img0;
+        const float* r1 = img0 + w;
+        const float* r2 = img0 + w*2;
+
+        const float* k0 = kernel0;
+        const float* k1 = kernel0 + 3;
+        const float* k2 = kernel0 + 6;
+
+        int i = 0;
+
+        for (; i < outh; i++)
+        {
+            int remain = outw;
+
+            for (; remain>0; remain--)
+            {
+                float sum = bias0;
+                sum += r0[0] * k0[0];
+                sum += r0[1] * k0[1];
+                sum += r0[2] * k0[2];
+                sum += r1[0] * k1[0];
+                sum += r1[1] * k1[1];
+                sum += r1[2] * k1[2];
+                sum += r2[0] * k2[0];
+                sum += r2[1] * k2[1];
+                sum += r2[2] * k2[2];
+
+                *outptr = sum;
+
+                r0 += 2;
+                r1 += 2;
+                r2 += 2;
+                outptr++;
+            }
+
+            r0 += tailstep;
+            r1 += tailstep;
+            r2 += tailstep;
+        }
+
+    }
+}
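Two details of the pointer bookkeeping above are worth spelling out. First, convdw3x3s1_sse computes two output rows per pass (outptr/outptr2 over the four input rows r0..r3), so the r1 and r2 rows are read once but contribute to both output rows; after a pass, `r0 += 2 + w` skips the 2-pixel right border plus one full input row, while `outptr += outw` accounts for the second output row that outptr did not walk itself. Second, in the stride-2 kernels the inner loop advances each input pointer by 2 per output pixel, i.e. by 2*outw per output row, so `tailstep = w - 2*outw + w` skips the unread tail of the current input row plus the entire following row. A standalone sanity check of that arithmetic (not part of the patch):

    #include <cstdio>

    int main()
    {
        // bordered input width 7, 3x3 kernel, stride 2 -> outw = (7-3)/2+1 = 3
        const int w = 7;
        const int outw = (w - 3) / 2 + 1;
        const int tailstep = w - 2*outw + w;

        // the inner loop advances r0 by 2*outw per output row; tailstep must
        // land it at the start of the input row two lines further down
        int offset = 2*outw + tailstep;
        printf("outw=%d tailstep=%d offset=%d (expected %d)\n",
               outw, tailstep, offset, 2*w);
        return 0;
    }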
diff --git a/src/layer/x86/convolutiondepthwise_x86.cpp b/src/layer/x86/convolutiondepthwise_x86.cpp
new file mode 100644
index 00000000000..8de31d88081
--- /dev/null
+++ b/src/layer/x86/convolutiondepthwise_x86.cpp
@@ -0,0 +1,194 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "convolutiondepthwise_x86.h"
+
+#ifdef _OPENMP
+#include <omp.h>
+#endif
+
+namespace ncnn {
+
+#include "convolution_1x1.h"
+#include "convolution_3x3.h"
+#include "convolution_5x5.h"
+
+#include "convolutiondepthwise_3x3.h"
+
+DEFINE_LAYER_CREATOR(ConvolutionDepthWise_x86)
+
+int ConvolutionDepthWise_x86::forward(const Mat& bottom_blob, Mat& top_blob) const
+{
+    // convolve with NxN kernel
+    // value = value + bias
+
+    if (kernel_w != kernel_h || stride_w != stride_h)
+    {
+        return ConvolutionDepthWise::forward(bottom_blob, top_blob);
+    }
+
+    const int kernel_size = kernel_w;
+    const int stride = stride_w;
+
+    if (kernel_size > 5 || stride > 5 || dilation_w != 1 || dilation_h != 1)
+    {
+        return ConvolutionDepthWise::forward(bottom_blob, top_blob);
+    }
+
+    typedef void (*conv_func)(const Mat&, Mat&, const Mat&, const Mat&);
+
+    // kernel_size x stride
+    conv_func conv_func_table[5][5] =
+    {
+        {
+            conv1x1s1_sse,
+            conv1x1s2_sse,
+            0,
+            0,
+            0
+        }, // kernel_size = 1
+        {
+            0,
+            0,
+            0,
+            0,
+            0
+        }, // kernel_size = 2
+        {
+            conv3x3s1_sse,
+            0,
+            0,
+            0,
+            0
+        }, // kernel_size = 3
+        {
+            0,
+            0,
+            0,
+            0,
+            0
+        }, // kernel_size = 4
+        {
+            conv5x5s1_sse,
+            0,
+            0,
+            0,
+            0
+        } // kernel_size = 5
+    };
+
+    conv_func conv = conv_func_table[kernel_size-1][stride-1];
+    if (!conv)
+    {
+        return ConvolutionDepthWise::forward(bottom_blob, top_blob);
+    }
+
+    int w = bottom_blob.w;
+    int h = bottom_blob.h;
+    int channels = bottom_blob.c;
+
+    Mat bottom_blob_bordered = bottom_blob;
+    if (pad_w > 0 || pad_h > 0)
+    {
+        copy_make_border(bottom_blob, bottom_blob_bordered, pad_h, pad_h, pad_w, pad_w, BORDER_CONSTANT, 0.f);
+        if (bottom_blob_bordered.empty())
+            return -100;
+
+        w = bottom_blob_bordered.w;
+        h = bottom_blob_bordered.h;
+    }
+    else if (pad_w == -233 && pad_h == -233)
+    {
+        int wpad = kernel_size + (w - 1) / stride * stride - w;
+        int hpad = kernel_size + (h - 1) / stride * stride - h;
+        if (wpad > 0 || hpad > 0)
+        {
+            copy_make_border(bottom_blob, bottom_blob_bordered, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, BORDER_CONSTANT, 0.f);
+            if (bottom_blob_bordered.empty())
+                return -100;
+        }
+
+        w = bottom_blob_bordered.w;
+        h = bottom_blob_bordered.h;
+    }
+
+    int outw = (w - kernel_size) / stride + 1;
+    int outh = (h - kernel_size) / stride + 1;
+
+    top_blob.create(outw, outh, num_output);
+    if (top_blob.empty())
+        return -100;
+
+    const int maxk = kernel_size * kernel_size;
+
+    // depth-wise
+    if (channels == group && group == num_output)
+    {
+        if (kernel_size == 3)
+        {
+            if (stride == 1)
+            {
+                convdw3x3s1_sse(bottom_blob_bordered, top_blob, weight_data, bias_data);
+                return 0;
+            }
+            else if (stride == 2)
+            {
+                convdw3x3s2_sse(bottom_blob_bordered, top_blob, weight_data, bias_data);
+                return 0;
+            }
+        }
+
+#ifdef _OPENMP
+        int nested_current = omp_get_nested();
+        omp_set_nested(0);
+#endif
+
+        #pragma omp parallel for
+        for (int g=0; g<group; g++)
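The `omp_get_nested()`/`omp_set_nested(0)` pair before the generic per-group loop saves the current setting and disables nested OpenMP parallelism, so the `parallel for` over groups does not multiply with any parallel region inside the called conv kernel; presumably the saved value is restored with `omp_set_nested(nested_current)` once the loop finishes. The body of that loop is not shown above; a sketch of the usual per-group slicing in this depth-wise branch (one channel in, one channel out per group), where the Mat view constructors and the bias handling are assumptions for illustration rather than this patch's code:

    #pragma omp parallel for
    for (int g=0; g<group; g++)
    {
        // wrap this group's single input/output channel as 1-channel views
        // (assumed Mat(w, h, c, data) constructor aliasing existing memory)
        Mat bottom_blob_g(w, h, 1, bottom_blob_bordered.channel(g));
        Mat top_blob_g(outw, outh, 1, top_blob.channel(g));

        // per-group slice of the weights: maxk coefficients per channel;
        // a real implementation would also handle the no-bias case
        const Mat weight_data_g(maxk, (void*)((const float*)weight_data + maxk * g));
        const Mat bias_data_g(1, (void*)((const float*)bias_data + g));

        // reuse the single-blob kernel selected from conv_func_table
        conv(bottom_blob_g, top_blob_g, weight_data_g, bias_data_g);
    }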