1 files changed, 1883 insertions, 0 deletions
diff --git a/compute/ncnn/src/srcn/sgemm_test.cc b/compute/ncnn/src/srcn/sgemm_test.cc
new file mode 100644
index 000000000..1b10970bb
--- /dev/null
+++ b/compute/ncnn/src/srcn/sgemm_test.cc
@@ -0,0 +1,1883 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/time.h>
+#include <unistd.h>
+
+#include "ncnn/srcn/conv_type.h"
+#include "srcn/srcn_conv.h"
+//#include "srcn_sgemm.h"
+#include "conv_sgemm_singlethread.h"
+#include "conv_sgemm_multithreads.h"
+//#include "conv_sgemm_batch.h"
+#include "sgemm_singlethread.h"
+#include "conv_winograd.h"
+#include "winograd.h"
+
+//#include "conv_gpu.h"
+//#include "convolutiondepthwise_3x3.h"
+
+namespace nnfw
+{
+namespace srcn
+{
+
+/**
+ * Naive reference convolution on planar (channel-major) buffers.
+ *
+ * Layouts implied by the indexing below:
+ *   input  : [inch][h][w]
+ *   filter : [outch][inch][kernel_h][kernel_w]
+ *   output : [outch][outh][outw]
+ *
+ * Filter taps that fall outside the input are skipped, so they add nothing to
+ * the sum. Only pad_w / pad_h are read as the left / top offsets, and the
+ * output dimensions are taken from `output` as provided by the caller.
+ */
+static void direct_conv_rowmajor(convMat_t *input, convMat_t *output, convMat_t *filter,
+                                 convParams_t *params)
+{
+  const int in_w = input->w;
+  const int in_h = input->h;
+  const int in_channels = input->c;
+  const int out_w = output->w;
+  const int out_h = output->h;
+  const int out_channels = output->c;
+  const int k_w = params->kernel_w;
+  const int k_h = params->kernel_h;
+  const int step_x = params->stride_w;
+  const int step_y = params->stride_h;
+  const int off_x = params->pad_w;
+  const int off_y = params->pad_h;
+  const int dil_x = params->dilation_w;
+  const int dil_y = params->dilation_h;
+  const float *src = input->data;
+  const float *weights = filter->data;
+
+  // Results are produced in exactly their storage order (channel, row, column),
+  // so a running cursor stands in for (oc * out_h + oy) * out_w + ox.
+  float *dst = output->data;
+
+  for (int oc = 0; oc < out_channels; oc++)
+  {
+    for (int oy = 0; oy < out_h; oy++)
+    {
+      const int iy_origin = oy * step_y - off_y;
+
+      for (int ox = 0; ox < out_w; ox++)
+      {
+        const int ix_origin = ox * step_x - off_x;
+        float acc = 0.f;
+
+        for (int ic = 0; ic < in_channels; ic++)
+        {
+          for (int ky = 0; ky < k_h; ky++)
+          {
+            const int iy = iy_origin + ky * dil_y;
+            // Negative rows wrap to large unsigned values, so one comparison
+            // rejects both "above the image" and "below the image".
+            if ((unsigned int)iy >= (unsigned int)in_h)
+              continue;
+
+            const float *src_row = src + (ic * in_h + iy) * in_w;
+            const float *w_row = weights + ((oc * in_channels + ic) * k_h + ky) * k_w;
+
+            for (int kx = 0; kx < k_w; kx++)
+            {
+              const int ix = ix_origin + kx * dil_x;
+              if ((unsigned int)ix >= (unsigned int)in_w)
+                continue;
+
+              acc += (src_row[ix] * w_row[kx]);
+            }
+          }
+        }
+
+        *dst++ = acc;
+      }
+    }
+  }
+}
+
+/**
+ * Naive reference transposed convolution (deconvolution) on planar buffers.
+ *
+ * Layouts implied by the indexing below:
+ *   input  : [inch][h][w]
+ *   filter : [inch][outch][kernel_h][kernel_w]
+ *   output : [outch][outh][outw]
+ *
+ * The output is cleared first; afterwards every input sample is scattered
+ * through the kernel into the output positions it touches. Positions outside
+ * the output extent are skipped.
+ */
+static void direct_deconv_rowmajor(convMat_t *input, convMat_t *output, convMat_t *filter,
+                                   convParams_t *params)
+{
+  const int in_w = input->w;
+  const int in_h = input->h;
+  const int in_channels = input->c;
+  const int out_w = output->w;
+  const int out_h = output->h;
+  const int out_channels = output->c;
+  const int k_w = params->kernel_w;
+  const int k_h = params->kernel_h;
+  const int step_x = params->stride_w;
+  const int step_y = params->stride_h;
+  const int off_x = params->pad_w;
+  const int off_y = params->pad_h;
+  const int dil_x = params->dilation_w;
+  const int dil_y = params->dilation_h;
+  const float *weights = filter->data;
+  float *dst = output->data;
+
+  // Start from an all-zero output because the scatter loop only accumulates.
+  const int out_count = out_w * out_h * out_channels;
+  for (int idx = 0; idx < out_count; idx++)
+  {
+    dst[idx] = 0;
+  }
+
+  // Input samples are consumed in storage order, so a cursor replaces the
+  // explicit (ic * in_h + iy) * in_w + ix index.
+  const float *src_cursor = input->data;
+
+  for (int ic = 0; ic < in_channels; ic++)
+  {
+    for (int iy = 0; iy < in_h; iy++)
+    {
+      const int oy_origin = iy * step_y - off_y;
+
+      for (int ix = 0; ix < in_w; ix++)
+      {
+        const int ox_origin = ix * step_x - off_x;
+        const float sample = *src_cursor++;
+
+        for (int oc = 0; oc < out_channels; oc++)
+        {
+          for (int ky = 0; ky < k_h; ky++)
+          {
+            const int oy = oy_origin + ky * dil_y;
+            // One unsigned comparison covers both oy < 0 and oy >= out_h.
+            if ((unsigned int)oy >= (unsigned int)out_h)
+              continue;
+
+            float *dst_row = dst + (oc * out_h + oy) * out_w;
+            const float *w_row = weights + ((ic * out_channels + oc) * k_h + ky) * k_w;
+
+            for (int kx = 0; kx < k_w; kx++)
+            {
+              const int ox = ox_origin + kx * dil_x;
+              if ((unsigned int)ox >= (unsigned int)out_w)
+                continue;
+
+              dst_row[ox] += w_row[kx] * sample;
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+/**
+ * Naive reference SGEMM for row-major matrices: C (m x n) = op(A) * op(B).
+ *
+ * @param Atrans  when equal to `trans`, A is read as a k x m matrix and is
+ *                transposed into a temporary m x k copy first
+ * @param Btrans  when equal to `trans`, B is read as an n x k matrix and is
+ *                transposed into a temporary k x n copy first
+ * @param m,n,k   result rows, result columns and shared (inner) dimension
+ * @param A,B     input matrices (not modified)
+ * @param C       output matrix, m x n, fully overwritten on success
+ *
+ * If a temporary copy cannot be allocated the function returns without
+ * touching C. Temporary copies are always released before returning.
+ */
+static void direct_sgemm_rowmajor(int Atrans, int Btrans, int m, int n, int k, float *A, float *B,
+                                  float *C)
+{
+  // Operands actually multiplied; they alias A / B unless a transpose is needed.
+  float *aa = A;
+  float *bb = B;
+  // Scratch buffers owned by this function (NULL when no transpose was made).
+  float *a_scratch = NULL;
+  float *b_scratch = NULL;
+
+  if (Atrans == trans)
+  {
+    // Widen before multiplying so the byte count is computed in size_t.
+    a_scratch = (float *)malloc((size_t)m * k * sizeof(float));
+    if (!a_scratch)
+      return;
+
+    for (int i = 0; i < k; i++)
+    {
+      for (int j = 0; j < m; j++)
+      {
+        a_scratch[j * k + i] = A[i * m + j];
+      }
+    }
+    aa = a_scratch;
+  }
+
+  if (Btrans == trans)
+  {
+    b_scratch = (float *)malloc((size_t)n * k * sizeof(float));
+    if (!b_scratch)
+    {
+      // Do not leak the A copy made above.
+      free(a_scratch);
+      return;
+    }
+
+    for (int i = 0; i < n; i++)
+    {
+      for (int j = 0; j < k; j++)
+      {
+        b_scratch[j * n + i] = B[i * k + j];
+      }
+    }
+    bb = b_scratch;
+  }
+
+  for (int i = 0; i < m; i++)
+  {
+    for (int j = 0; j < n; j++)
+    {
+      float res = 0.f;
+      for (int l = 0; l < k; l++)
+      {
+        res += aa[i * k + l] * bb[l * n + j];
+      }
+      C[i * n + j] = res;
+    }
+  }
+
+  // free(NULL) is a no-op, so no guards are needed here.
+  free(b_scratch);
+  free(a_scratch);
+}
+
+/*static void direct_sgemm_kernel(const int k, const int lhs_stride, const int rhs_stride, const int
+res_stride,
+                                    const float *lhs_ptr, const float *rhs_ptr, float *res_ptr)
+{
+    int lstride = lhs_stride << 2;
+    int rstride = rhs_stride << 2;
+    int estride = res_stride << 2;
+    int rstep = rstride << 2;
+
+    int nk = (k >> 2) - 1;
+
+    __asm __volatile (
+        "movi v16.4s, #0x0\n"
+        "movi v17.4s, #0x0\n"
+        "movi v18.4s, #0x0\n"
+        "movi v19.4s, #0x0\n"
+        "movi v20.4s, #0x0\n"
+        "movi v21.4s, #0x0\n"
+        "movi v22.4s, #0x0\n"
+        "movi v23.4s, #0x0\n"
+        "movi v24.4s, #0x0\n"
+        "movi v25.4s, #0x0\n"
+        "movi v26.4s, #0x0\n"
+        "movi v27.4s, #0x0\n"
+        "movi v28.4s, #0x0\n"
+        "movi v29.4s, #0x0\n"
+        "movi v30.4s, #0x0\n"
+        "movi v31.4s, #0x0\n"
+
+        "mov x0, %[lhs_ptr]\n"
+        "add %[lhs_ptr], %[lhs_ptr], #16\n"
+        "ld1 {v0.4s}, [x0]\n"
+        "add x0, x0, %[lstride]\n"
+        "ld1 {v1.4s}, [x0]\n"
+        "add x0, x0, %[lstride]\n"
+        "ld1 {v2.4s}, [x0]\n"
+        "add x0, x0, %[lstride]\n"
+        "ld1 {v3.4s}, [x0]\n"
+        "add x0, x0, %[lstride]\n"
+
+        "mov x1, %[rhs_ptr]\n"
+        "add %[rhs_ptr], %[rhs_ptr], %[rstep]\n"
+        "ld1 {v8.4s, v9.4s}, [x1]\n"
+        "add x1, x1, %[rstride]\n"
+        "ld1 {v10.4s, v11.4s}, [x1]\n"
+        "add x1, x1, %[rstride]\n"
+
+        "1:\n"
+        "fmla v16.4s,  v8.4s,  v0.s[0]\n"
+        "fmla v17.4s,  v9.4s,  v0.s[0]\n"
+        "fmla v16.4s,  v10.4s, v0.s[1]\n"
+        "fmla v17.4s,  v11.4s, v0.s[1]\n"
+        "fmla v18.4s,  v8.4s,  v1.s[0]\n"
+        "fmla v19.4s,  v9.4s,  v1.s[0]\n"
+        "fmla v18.4s,  v10.4s, v1.s[1]\n"
+        "fmla v19.4s,  v11.4s, v1.s[1]\n"
+        "ld1 {v12.4s, v13.4s}, [x1]\n"
+        "fmla v20.4s,  v8.4s,  v2.s[0]\n"
+        "add x1, x1, %[rstride]\n"
+        "fmla v21.4s,  v9.4s,  v2.s[0]\n"
+        "ld1 {v14.4s, v15.4s}, [x1]\n"
+        "fmla v20.4s,  v10.4s, v2.s[1]\n"
+        "add x1, x1, %[rstride]\n"
+        "fmla v21.4s,  v11.4s, v2.s[1]\n"
+        "fmla v22.4s,  v8.4s,  v3.s[0]\n"
+        "fmla v23.4s,  v9.4s,  v3.s[0]\n"
+        "fmla v22.4s,  v10.4s, v3.s[1]\n"
+        "fmla v23.4s,  v11.4s, v3.s[1]\n"
+
+        "ld1 {v4.4s}, [x0]\n"
+        "fmla v16.4s,  v12.4s, v0.s[2]\n"
+        "add x0, x0, %[lstride]\n"
+        "fmla v17.4s,  v13.4s, v0.s[2]\n"
+        "ld1 {v5.4s}, [x0]\n"
+        "fmla v16.4s,  v14.4s, v0.s[3]\n"
+        "add x0, x0, %[lstride]\n"
+        "fmla v17.4s,  v15.4s, v0.s[3]\n"
+        "ld1 {v6.4s}, [x0]\n"
+        "fmla v18.4s,  v12.4s, v1.s[2]\n"
+        "add x0, x0, %[lstride]\n"
+        "fmla v19.4s,  v13.4s, v1.s[2]\n"
+        "ld1 {v7.4s}, [x0]\n"
+        "fmla v18.4s,  v14.4s, v1.s[3]\n"
+        "add x0, x0, %[lstride]\n"
+        "fmla v19.4s,  v15.4s, v1.s[3]\n"
+        "fmla v20.4s,  v12.4s, v2.s[2]\n"
+        "fmla v21.4s,  v13.4s, v2.s[2]\n"
+        "fmla v20.4s,  v14.4s, v2.s[3]\n"
+        "fmla v21.4s,  v15.4s, v2.s[3]\n"
+        "fmla v22.4s,  v12.4s, v3.s[2]\n"
+        "fmla v23.4s,  v13.4s, v3.s[2]\n"
+        "fmla v22.4s,  v14.4s, v3.s[3]\n"
+        "fmla v23.4s,  v15.4s, v3.s[3]\n"
+
+        "mov x0, %[lhs_ptr]\n"
+        "add %[lhs_ptr], %[lhs_ptr], #16\n"
+
+        "fmla v24.4s,  v8.4s,  v4.s[0]\n"
+        "fmla v25.4s,  v9.4s,  v4.s[0]\n"
+        "ld1 {v0.4s}, [x0]\n"
+        "fmla v24.4s,  v10.4s, v4.s[1]\n"
+        "add x0, x0, %[lstride]\n"
+        "fmla v25.4s,  v11.4s, v4.s[1]\n"
+        "ld1 {v1.4s}, [x0]\n"
+        "fmla v26.4s,  v8.4s,  v5.s[0]\n"
+        "add x0, x0, %[lstride]\n"
+        "fmla v27.4s,  v9.4s,  v5.s[0]\n"
+        "ld1 {v2.4s}, [x0]\n"
+        "fmla v26.4s,  v10.4s, v5.s[1]\n"
+        "add x0, x0, %[lstride]\n"
+        "fmla v27.4s,  v11.4s, v5.s[1]\n"
+        "ld1 {v3.4s}, [x0]\n"
+        "fmla v28.4s,  v8.4s,  v6.s[0]\n"
+        "add x0, x0, %[lstride]\n"
+        "fmla v29.4s,  v9.4s,  v6.s[0]\n"
+        "fmla v28.4s,  v10.4s, v6.s[1]\n"
+        "fmla v29.4s,  v11.4s, v6.s[1]\n"
+        "fmla v30.4s,  v8.4s,  v7.s[0]\n"
+        "fmla v31.4s,  v9.4s,  v7.s[0]\n"
+        "fmla v30.4s,  v10.4s, v7.s[1]\n"
+        "fmla v31.4s,  v11.4s, v7.s[1]\n"
+
+        "mov x1, %[rhs_ptr]\n"
+        "add %[rhs_ptr], %[rhs_ptr], %[rstep]\n"
+
+        "fmla v24.4s,  v12.4s, v4.s[2]\n"
+        "fmla v25.4s,  v13.4s, v4.s[2]\n"
+        "ld1 {v8.4s, v9.4s}, [x1]\n"
+        "fmla v24.4s,  v14.4s, v4.s[3]\n"
+        "add x1, x1, %[rstride]\n"
+        "fmla v25.4s,  v15.4s, v4.s[3]\n"
+        "ld1 {v10.4s, v11.4s}, [x1]\n"
+        "fmla v26.4s,  v12.4s, v5.s[2]\n"
+        "add x1, x1, %[rstride]\n"
+        "fmla v27.4s,  v13.4s, v5.s[2]\n"
+        "fmla v26.4s,  v14.4s, v5.s[3]\n"
+        "fmla v27.4s,  v15.4s, v5.s[3]\n"
+        "fmla v28.4s,  v12.4s, v6.s[2]\n"
+        "fmla v29.4s,  v13.4s, v6.s[2]\n"
+        "fmla v28.4s,  v14.4s, v6.s[3]\n"
+        "fmla v29.4s,  v15.4s, v6.s[3]\n"
+        "fmla v30.4s,  v12.4s, v7.s[2]\n"
+        "fmla v31.4s,  v13.4s, v7.s[2]\n"
+        "subs %w[nk], %w[nk], #1\n"
+        "fmla v30.4s,  v14.4s, v7.s[3]\n"
+        "fmla v31.4s,  v15.4s, v7.s[3]\n"
+        "bne 1b\n"
+
+        "fmla v16.4s,  v8.4s,  v0.s[0]\n"
+        "fmla v17.4s,  v9.4s,  v0.s[0]\n"
+        "fmla v16.4s,  v10.4s, v0.s[1]\n"
+        "fmla v17.4s,  v11.4s, v0.s[1]\n"
+        "fmla v18.4s,  v8.4s,  v1.s[0]\n"
+        "fmla v19.4s,  v9.4s,  v1.s[0]\n"
+        "fmla v18.4s,  v10.4s, v1.s[1]\n"
+        "fmla v19.4s,  v11.4s, v1.s[1]\n"
+        "ld1 {v12.4s, v13.4s}, [x1]\n"
+        "fmla v20.4s,  v8.4s,  v2.s[0]\n"
+        "add x1, x1, %[rstride]\n"
+        "fmla v21.4s,  v9.4s,  v2.s[0]\n"
+        "ld1 {v14.4s, v15.4s}, [x1]\n"
+        "fmla v20.4s,  v10.4s, v2.s[1]\n"
+        "add x1, x1, %[rstride]\n"
+        "fmla v21.4s,  v11.4s, v2.s[1]\n"
+        "fmla v22.4s,  v8.4s,  v3.s[0]\n"
+        "fmla v23.4s,  v9.4s,  v3.s[0]\n"
+        "fmla v22.4s,  v10.4s, v3.s[1]\n"
+        "fmla v23.4s,  v11.4s, v3.s[1]\n"
+
+        "ld1 {v4.4s}, [x0]\n"
+        "fmla v16.4s,  v12.4s, v0.s[2]\n"
+        "add x0, x0, %[lstride]\n"
+        "fmla v17.4s,  v13.4s, v0.s[2]\n"
+        "ld1 {v5.4s}, [x0]\n"
+        "fmla v16.4s,  v14.4s, v0.s[3]\n"
+        "add x0, x0, %[lstride]\n"
+        "fmla v17.4s,  v15.4s, v0.s[3]\n"
+        "ld1 {v6.4s}, [x0]\n"
+        "fmla v18.4s,  v12.4s, v1.s[2]\n"
+        "add x0, x0, %[lstride]\n"
+        "fmla v19.4s,  v13.4s, v1.s[2]\n"
+        "ld1 {v7.4s}, [x0]\n"
+        "fmla v18.4s,  v14.4s, v1.s[3]\n"
+        "add x0, x0, %[lstride]\n"
+        "fmla v19.4s,  v15.4s, v1.s[3]\n"
+        "fmla v20.4s,  v12.4s, v2.s[2]\n"
+        "fmla v21.4s,  v13.4s, v2.s[2]\n"
+        "fmla v20.4s,  v14.4s, v2.s[3]\n"
+        "fmla v21.4s,  v15.4s, v2.s[3]\n"
+        "fmla v22.4s,  v12.4s, v3.s[2]\n"
+        "fmla v23.4s,  v13.4s, v3.s[2]\n"
+        "fmla v22.4s,  v14.4s, v3.s[3]\n"
+        "fmla v23.4s,  v15.4s, v3.s[3]\n"
+
+        "mov x0, %[res_ptr]\n"
+        "fmla v24.4s,  v8.4s,  v4.s[0]\n"
+        "fmla v25.4s,  v9.4s,  v4.s[0]\n"
+        "st1 {v16.4s, v17.4s}, [x0]\n"
+        "add x0, x0, %[estride]\n"
+        "fmla v24.4s,  v10.4s, v4.s[1]\n"
+        "fmla v25.4s,  v11.4s, v4.s[1]\n"
+        "st1 {v18.4s, v19.4s}, [x0]\n"
+        "add x0, x0, %[estride]\n"
+        "fmla v26.4s,  v8.4s,  v5.s[0]\n"
+        "fmla v27.4s,  v9.4s,  v5.s[0]\n"
+        "st1 {v20.4s, v21.4s}, [x0]\n"
+        "add x0, x0, %[estride]\n"
+        "fmla v26.4s,  v10.4s, v5.s[1]\n"
+        "fmla v27.4s,  v11.4s, v5.s[1]\n"
+        "st1 {v22.4s, v23.4s}, [x0]\n"
+        "add x0, x0, %[estride]\n"
+        "fmla v28.4s,  v8.4s,  v6.s[0]\n"
+        "fmla v29.4s,  v9.4s,  v6.s[0]\n"
+        "fmla v28.4s,  v10.4s, v6.s[1]\n"
+        "fmla v29.4s,  v11.4s, v6.s[1]\n"
+        "fmla v30.4s,  v8.4s,  v7.s[0]\n"
+        "fmla v31.4s,  v9.4s,  v7.s[0]\n"
+        "fmla v30.4s,  v10.4s, v7.s[1]\n"
+        "fmla v31.4s,  v11.4s, v7.s[1]\n"
+
+        "fmla v24.4s,  v12.4s, v4.s[2]\n"
+        "fmla v25.4s,  v13.4s, v4.s[2]\n"
+        "fmla v24.4s,  v14.4s, v4.s[3]\n"
+        "fmla v25.4s,  v15.4s, v4.s[3]\n"
+        "fmla v26.4s,  v12.4s, v5.s[2]\n"
+        "fmla v27.4s,  v13.4s, v5.s[2]\n"
+        "st1 {v24.4s, v25.4s}, [x0]\n"
+        "add x0, x0, %[estride]\n"
+        "fmla v26.4s,  v14.4s, v5.s[3]\n"
+        "fmla v27.4s,  v15.4s, v5.s[3]\n"
+        "fmla v28.4s,  v12.4s, v6.s[2]\n"
+        "fmla v29.4s,  v13.4s, v6.s[2]\n"
+        "st1 {v26.4s, v27.4s}, [x0]\n"
+        "add x0, x0, %[estride]\n"
+        "fmla v28.4s,  v14.4s, v6.s[3]\n"
+        "fmla v29.4s,  v15.4s, v6.s[3]\n"
+        "fmla v30.4s,  v12.4s, v7.s[2]\n"
+        "fmla v31.4s,  v13.4s, v7.s[2]\n"
+        "st1 {v28.4s, v29.4s}, [x0]\n"
+        "add x0, x0, %[estride]\n"
+        "fmla v30.4s,  v14.4s, v7.s[3]\n"
+        "fmla v31.4s,  v15.4s, v7.s[3]\n"
+        "st1 {v30.4s, v31.4s}, [x0]\n"
+        :[lhs_ptr] "+r" (lhs_ptr), [rhs_ptr] "+r" (rhs_ptr), [res_ptr] "+r" (res_ptr),
+          [nk] "+r" (nk)
+        : [lstride] "r" (lstride), [rstride] "r" (rstride), [estride] "r" (estride), [rstep] "r"
+(rstep)
+        : "x0", "x1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+          "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18",
+          "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"
+    );
+}*/
+
+/**
+ * Naive reference convolution on interleaved (channel-last) buffers.
+ *
+ * Layouts implied by the indexing below:
+ *   input  : [h][w][inch]
+ *   filter : [kernel_h][kernel_w][inch][outch]
+ *   output : [outh][outw][outch]
+ *
+ * Filter taps that fall outside the input are skipped, so they add nothing to
+ * the sum. The output dimensions are taken from `output` as provided by the
+ * caller.
+ */
+static void direct_conv_colmajor(convMat_t *input, convMat_t *output, convMat_t *filter,
+                                 convParams_t *params)
+{
+  const int in_w = input->w;
+  const int in_h = input->h;
+  const int in_channels = input->c;
+  const int out_w = output->w;
+  const int out_h = output->h;
+  const int out_channels = output->c;
+  const int k_w = params->kernel_w;
+  const int k_h = params->kernel_h;
+  const int step_x = params->stride_w;
+  const int step_y = params->stride_h;
+  const int off_x = params->pad_w;
+  const int off_y = params->pad_h;
+  const int dil_x = params->dilation_w;
+  const int dil_y = params->dilation_h;
+  const float *src = input->data;
+  const float *weights = filter->data;
+
+  // Results are produced in storage order (row, column, channel), so a running
+  // cursor stands in for (oy * out_w + ox) * out_channels + oc.
+  float *dst = output->data;
+
+  for (int oy = 0; oy < out_h; oy++)
+  {
+    const int iy_origin = oy * step_y - off_y;
+
+    for (int ox = 0; ox < out_w; ox++)
+    {
+      const int ix_origin = ox * step_x - off_x;
+
+      for (int oc = 0; oc < out_channels; oc++)
+      {
+        float acc = 0.f;
+
+        for (int ky = 0; ky < k_h; ky++)
+        {
+          const int iy = iy_origin + ky * dil_y;
+          // Negative rows wrap to large unsigned values, so one comparison
+          // rejects both out-of-range directions.
+          if ((unsigned int)iy >= (unsigned int)in_h)
+            continue;
+
+          for (int kx = 0; kx < k_w; kx++)
+          {
+            const int ix = ix_origin + kx * dil_x;
+            if ((unsigned int)ix >= (unsigned int)in_w)
+              continue;
+
+            // All input channels of one pixel are contiguous; the matching
+            // weights for a fixed output channel are out_channels apart.
+            const float *pixel = src + (iy * in_w + ix) * in_channels;
+            const float *tap = weights + ((ky * k_w + kx) * in_channels) * out_channels + oc;
+
+            for (int ic = 0; ic < in_channels; ic++)
+            {
+              acc += (pixel[ic] * tap[ic * out_channels]);
+            }
+          }
+        }
+
+        *dst++ = acc;
+      }
+    }
+  }
+}
+
+/**
+ * Naive reference SGEMM for column-major matrices: C (m x n) = op(A) * op(B).
+ *
+ * @param Atrans  non-zero: A is first transposed into a temporary copy that is
+ *                indexed as aa[l * m + i] (column-major m x k)
+ * @param Btrans  non-zero: B is first transposed into a temporary copy that is
+ *                indexed as bb[j * k + l] (column-major k x n)
+ * @param m,n,k   result rows, result columns and shared (inner) dimension
+ * @param A,B     input matrices (not modified)
+ * @param C       output matrix, written as C[j * m + i], fully overwritten on
+ *                success
+ *
+ * If a temporary copy cannot be allocated the function returns without
+ * touching C. Temporary copies are always released before returning.
+ */
+static void direct_sgemm_colmajor(int Atrans, int Btrans, int m, int n, int k, float *A, float *B,
+                                  float *C)
+{
+  // Operands actually multiplied; they alias A / B unless a transpose is needed.
+  float *aa = A;
+  float *bb = B;
+  // Scratch buffers owned by this function (NULL when no transpose was made).
+  float *a_scratch = NULL;
+  float *b_scratch = NULL;
+
+  if (Atrans)
+  {
+    // Widen before multiplying so the byte count is computed in size_t.
+    a_scratch = (float *)malloc((size_t)m * k * sizeof(float));
+    if (!a_scratch)
+      return;
+
+    for (int i = 0; i < k; i++)
+    {
+      for (int j = 0; j < m; j++)
+      {
+        a_scratch[i * m + j] = A[j * k + i];
+      }
+    }
+    aa = a_scratch;
+  }
+
+  if (Btrans)
+  {
+    b_scratch = (float *)malloc((size_t)n * k * sizeof(float));
+    if (!b_scratch)
+    {
+      // Do not leak the A copy made above.
+      free(a_scratch);
+      return;
+    }
+
+    for (int i = 0; i < n; i++)
+    {
+      for (int j = 0; j < k; j++)
+      {
+        b_scratch[i * k + j] = B[j * n + i];
+      }
+    }
+    bb = b_scratch;
+  }
+
+  for (int i = 0; i < m; i++)
+  {
+    for (int j = 0; j < n; j++)
+    {
+      float res = 0.f;
+      for (int l = 0; l < k; l++)
+      {
+        res += bb[j * k + l] * aa[l * m + i];
+      }
+      C[j * m + i] = res;
+    }
+  }
+
+  // free(NULL) is a no-op, so no guards are needed here.
+  free(b_scratch);
+  free(a_scratch);
+}
+
+#if 0 /* Compiled-out earlier variant of test_sgemm(): a blocked row-major SGEMM benchmark.
+       *
+       * As written below it:
+       *  - heap-allocates A (m*k), B (k*n) and C (m*n) and fills A and B with linear ramps;
+       *  - keeps the packed panels PA / PB in variable-length stack arrays whose sizes come
+       *    from the fixed block sizes mb, nb, kb (rounded up to mr, nr), not from m, n, k;
+       *  - per timed iteration walks nb-wide column blocks, then kb-deep slices (packing B),
+       *    then mb-tall row blocks (packing A) and calls sgemm_rowmajor_macro_kernel_divnm;
+       *  - prints up to the first and last 64 entries of C and the average time per loop.
+       *
+       * The pack_rowmajor_notrans_lhs / pack_rowmajor_notrans_rhs packers and the
+       * sgemm_rowmajor_macro_kernel_divnm / _divmn kernels are not defined in the visible
+       * part of this file.
+       *
+       * NOTE(review) before re-enabling: the function is declared int but has no return
+       * statement on its normal path, and the early returns after a failed malloc do not
+       * free the buffers allocated before them.
+       */
+static int test_sgemm(int m, int n, int k, int loops)
+{
+	struct timeval start, end;
+	float total_time = 0.f;
+
+	const int mb = 180;
+	const int nb = 1440;
+	const int kb = 512;
+
+	const int mr = 4;
+	const int nr = 12;
+
+#if 0
+	const int pm = (m + mr - 1) / mr * mr;
+	const int pn = (n + nr - 1) / nr * nr;
+	const int pk = k;
+#else
+	const int pm = (mb + mr - 1) / mr * mr;
+	const int pn = (nb + nr - 1) / nr * nr;
+	const int pk = kb;
+#endif
+	const int nm = (m + mb - 1) / mb;
+	const int nn = (n + nb - 1) / nb;
+	const int nk = (k + kb - 1) / kb;
+
+	const int rm = m % mb;
+	const int rn = n % nb;
+	const int rk = k % kb;
+
+	float *A = (float *)malloc(m * k * sizeof(float));
+	if(!A) return 0;
+
+	for(int i = 0 ; i < m * k; i++)
+	{
+		A[i] = 0.001 + i * 0.000001;
+	}
+
+	float *B = (float *)malloc(k * n * sizeof(float));
+	if(!B) return 0;
+
+	for(int i = 0 ; i < n * k; i++)
+	{
+		B[i] = 0.001 - i * 0.000001;
+	}
+
+	float *C = (float *)malloc(m * n * sizeof(float));
+	if(!C) return 0;
+
+#if 0
+	float *PA = (float *)malloc(pm * pk * sizeof(float));
+	if(!PA) return 0;
+
+	float *PB = (float *)malloc(pk * pn * sizeof(float));
+	if(!PB) return 0;
+#else
+	float PA[pm * pk];
+	float PB[pk * pn];
+#endif
+
+	for(int nloop = 0; nloop < loops; nloop++)
+
+	{
+		gettimeofday(&start, NULL);
+
+		//pack_rowmajor_notrans_lhs(mr, m, k, k, A, PA);
+		//pack_rowmajor_notrans_rhs(nr, n, k, n, B, PB);
+#if 1
+		for (int j = 0; j < nn; j++)
+		{
+			const int _nb = (j != nn - 1 || rn == 0) ? nb : rn;
+			for (int l = 0; l < nk; l++)
+			{
+				const int _kb = (l != nk - 1 || rk == 0) ? kb : rk;
+				pack_rowmajor_notrans_rhs(nr, _nb, _kb, 1, n, &B[l * kb * n + j * nb], PB);
+				for(int i = 0; i < nm; i++)
+				{
+					const int _mb = (i != nm - 1 || rm == 0) ? mb : rm;
+					pack_rowmajor_notrans_lhs(mr, _mb, _kb, 1, k, &A[i * mb * k + l * kb], PA);
+					sgemm_rowmajor_macro_kernel_divnm(mr, nr, _mb, _nb, _kb, PA, PB, &C[i * mb * n + j * nb], l, n, _kb);
+					//sgemm_rowmajor_macro_kernel_divnm(mr, nr, _mb, _nb, _kb, &PA[i * mb * k + l * kb], &PB[l * kb * pn + j * nb], &C[i * mb * n + j * nb], l, n, pk);
+				}
+			}
+		}
+#else
+		for (int j = 0; j < nm; j++)
+		{
+			const int _mb = (j != nm - 1 || rm == 0) ? mb : rm;
+			for (int l = 0; l < nk; l++)
+			{
+				const int _kb = (l != nk - 1 || rk == 0) ? kb : rk;
+				pack_rowmajor_notrans_lhs(mr, _mb, _kb, 1, k, &A[j * mb * k + l * kb], PA);
+				for(int i = 0; i < nn; i++)
+				{
+					const int _nb = (i != nn - 1 || rn == 0) ? nb : rn;
+					pack_rowmajor_notrans_rhs(nr, _nb, _kb, 1, n, &B[l * kb * n + i * nb], PB);
+					sgemm_rowmajor_macro_kernel_divmn(mr, nr, _mb, _nb, _kb, PA, PB, &C[j * mb * n + i * nb], l, n, _kb);
+					//sgemm_rowmajor_macro_kernel_divmn(mr, nr, _mb, _nb, _kb, &PA[i * mb * k + l * kb], &PB[l * kb * pn + j * nb], &C[i * mb * n + j * nb], l, n, pk);
+				}
+			}
+		}
+#endif
+		gettimeofday(&end, NULL);
+		total_time += ((end.tv_sec * 1000000 + end.tv_usec) - (start.tv_sec * 1000000 + start.tv_usec))/1000;
+	}
+
+	int div = m * n < 16 ? m * n : 16;
+	int num = m * n > 64 ? 64 : m * n;
+
+	float *c_ptr = &C[0];
+	for(int i = 0; i < num; i++)
+	{
+		printf("%f ", c_ptr[i]);
+		if((i + 1) % div == 0) printf("\n");
+	}
+
+	printf("\n");
+
+	c_ptr = &C[m * n - num];
+	for(int i = 0; i < num; i++)
+	{
+		printf("%f ", c_ptr[i]);
+		if((i + 1) % div == 0) printf("\n");
+	}
+
+	printf("\n");
+
+	long long total_size = (long long)m *n * k * 2;
+	printf("AVER Time consuming: %.2fms, total size: %lld, (GFLOP: %.2f)\n", total_time / loops , total_size, (double)total_size/(total_time / loops)/1000000);
+
+	free(A);
+	free(B);
+	free(C);
+
+	//free(PA);
+	//free(PB);
+
+}
+#endif
+
+/**
+ * SGEMM benchmark / smoke test.
+ *
+ * Builds an m x k matrix A and a k x n matrix B filled with linear ramps,
+ * multiplies them `loops` times with the implementation selected by `type`,
+ * then prints up to the first and last 64 entries of C together with the
+ * average time per iteration and the derived GFLOP figure.
+ *
+ * @param m,n,k  GEMM dimensions
+ * @param type   0: direct_sgemm_colmajor reference, 1: sgemm_singlethread
+ *               (column-major, no transposes); any other value runs no kernel,
+ *               in which case C stays zero-filled
+ * @param loops  number of timed iterations
+ * @return always 0, including when an allocation fails
+ */
+static int test_sgemm(int m, int n, int k, int type, int loops)
+{
+  struct timeval start, end;
+  float total_time = 0.f; // accumulated wall time in milliseconds
+
+  // Sizes are widened before multiplying so the byte counts are computed in
+  // size_t. C is zero-initialised so that it is never printed uninitialised
+  // when `type` selects no kernel.
+  float *A = (float *)malloc((size_t)m * k * sizeof(float));
+  float *B = (float *)malloc((size_t)k * n * sizeof(float));
+  float *C = (float *)calloc((size_t)m * n, sizeof(float));
+  if (!A || !B || !C)
+  {
+    // free(NULL) is a no-op, so whichever allocations succeeded are released.
+    free(A);
+    free(B);
+    free(C);
+    return 0;
+  }
+
+  for (int i = 0; i < m * k; i++)
+  {
+    A[i] = 0.001 + i * 0.001;
+  }
+
+  for (int i = 0; i < n * k; i++)
+  {
+    B[i] = 0.001 - i * 0.001;
+  }
+
+  for (int nloop = 0; nloop < loops; nloop++)
+  {
+    gettimeofday(&start, NULL);
+
+    if (type == 0)
+    {
+      direct_sgemm_colmajor(notrans, notrans, m, n, k, A, B, C);
+    }
+    else if (type == 1)
+    {
+      class sgemm_singlethread my_gemm(colMajor, notrans, notrans, m, n, k, A, B, C, 1);
+      my_gemm.run();
+    }
+
+    gettimeofday(&end, NULL);
+
+    // Subtract the fields first and convert in floating point: this keeps the
+    // sub-millisecond part of every iteration (integer division by 1000 used to
+    // drop it) and avoids multiplying tv_sec by 1000000 in an integer type.
+    const double elapsed_us = (double)(end.tv_sec - start.tv_sec) * 1000000.0 +
+                              (double)(end.tv_usec - start.tv_usec);
+    total_time += (float)(elapsed_us / 1000.0);
+  }
+
+  const int total = m * n;
+  const int div = total < 16 ? total : 16; // values per printed line
+  const int num = total > 64 ? 64 : total; // values per printed window
+
+  // Two windows are dumped: the head of C and the tail of C.
+  const float *windows[2] = {C, C + (total - num)};
+  for (int wnd = 0; wnd < 2; wnd++)
+  {
+    for (int i = 0; i < num; i++)
+    {
+      printf("%f ", windows[wnd][i]);
+      if ((i + 1) % div == 0)
+        printf("\n");
+    }
+
+    printf("\n");
+  }
+
+  long long total_size = (long long)m * n * k * 2;
+  printf("AVER Time consuming: %.2fms, total size: %lld, (GFLOP: %.2f)\n", total_time / loops,
+         total_size, (double)total_size / (total_time / loops) / 1000000);
+
+  free(A);
+  free(B);
+  free(C);
+
+  return 0;
+}
+
+/**
+ * Copies a 4-D weight tensor from HWCN element order into NCHW element order.
+ *
+ * @param out  destination buffer, H * W * C * N floats, written in NCHW order
+ * @param in   source buffer, H * W * C * N floats, read in HWCN order
+ * @param H,W  kernel height and width
+ * @param C,N  the two channel extents (C is the third, N the fastest-varying
+ *             axis of the source)
+ */
+void weight_tensorflow2caffe(float *out, float *in, int H, int W, int C, int N)
+{
+  for (int row = 0; row < H; ++row)
+  {
+    for (int col = 0; col < W; ++col)
+    {
+      for (int ch = 0; ch < C; ++ch)
+      {
+        for (int num = 0; num < N; ++num)
+        {
+          // Nested (Horner) form of the flat offsets for [row][col][ch][num]
+          // in the source and [num][ch][row][col] in the destination.
+          const int src_offset = ((row * W + col) * C + ch) * N + num;
+          const int dst_offset = ((num * C + ch) * H + row) * W + col;
+          out[dst_offset] = in[src_offset];
+        }
+      }
+    }
+  }
+}
+
+/**
+ * Transforms convolution weights into the Winograd domain.
+ *
+ * Steps, as performed below:
+ *   1. pick the Winograd parameters (M, N, G) for a 3x3 kernel, or the 5x5
+ *      parameter set for any other kernel size;
+ *   2. build the Kronecker product of G with itself (M*M x N*N);
+ *   3. re-order the weights with weight_tensorflow2caffe (HWCN -> NCHW);
+ *   4. multiply (row-major, B transposed) to obtain an
+ *      (M*M) x (channels * num_output) result.
+ *
+ * @param _kernel          source weights; h is used as the (square) kernel
+ *                         size, c as input channels, n as output channels
+ * @param winograd_weight  receives a new[]-allocated buffer holding the
+ *                         transformed weights; the caller releases it with
+ *                         delete[]. It is set to NULL when the function
+ *                         returns early because an allocation yielded NULL.
+ *
+ * NOTE(review): with a standard throwing operator new the NULL checks are not
+ * reached (std::bad_alloc is thrown instead); they are kept for toolchains
+ * whose operator new returns NULL, and now release what was already allocated.
+ */
+void trans_weight2winograd(const convMat_t &_kernel, float **winograd_weight)
+{
+  const double *G;
+  const int kernel_size = _kernel.h;
+  const int channels = _kernel.c;
+  const int num_output = _kernel.n;
+
+  int tile_h_in_, tile_w_in_;
+  int M, N;
+
+  /*Step 1: transfer weight to winograd domain*/
+  if (kernel_size == 3)
+  {
+    M = winograd_para_3x3s1::M;
+    N = winograd_para_3x3s1::N;
+    G = winograd_para_3x3s1::getG();
+  }
+  else
+  {
+    M = winograd_para_5x5s1::M;
+    N = winograd_para_5x5s1::N;
+    G = winograd_para_5x5s1::getG();
+  }
+
+  tile_h_in_ = tile_w_in_ = M;
+
+  // Give the caller a well-defined "no result" value on every early return.
+  *winograd_weight = NULL;
+
+  float *winograd_g = new float[M * M * N * N];
+  if (NULL == winograd_g)
+    return;
+  kronecker_product(winograd_g, G, G, M, N, M, N);
+
+  // Built in a local and published only once it has been fully computed.
+  float *transformed = new float[tile_h_in_ * tile_w_in_ * channels * num_output];
+  if (NULL == transformed)
+  {
+    delete[] winograd_g;
+    return;
+  }
+
+  float *weight_data_tran = new float[_kernel.h * _kernel.w * _kernel.c * _kernel.n];
+  if (NULL == weight_data_tran)
+  {
+    delete[] transformed;
+    delete[] winograd_g;
+    return;
+  }
+  weight_tensorflow2caffe(weight_data_tran, _kernel.data, kernel_size, kernel_size, channels,
+                          num_output);
+
+  class sgemm_singlethread sgemm(rowMajor, notrans, trans, tile_h_in_ * tile_w_in_,
+                                 channels * num_output, kernel_size * kernel_size, winograd_g,
+                                 weight_data_tran, transformed, 1);
+
+  sgemm.run();
+
+  // Both intermediates are only needed for the multiplication above.
+  delete[] weight_data_tran;
+  delete[] winograd_g;
+
+  *winograd_weight = transformed;
+}
+
+static int test_conv(const int w, const int h, const int kernel_size, const int stride,
+                     const int inch, const int outch, const int padding, const int conv_type,
+                     const int thread_num, const int loops)
+{
+  struct timeval start, end;
+  float total_time = 0.f;
+
+  struct timeval start1, end1;
+  float total_time1 = 0.f;
+
+  const int dilation = 1;
+
+  const int kernel_dilation = dilation * (kernel_size - 1) + 1;
+
+  convMat_t input;
+  convMat_t output;
+  convMat_t filter;
+  convParams_t params;
+
+  int pad_l, pad_r, pad_t, pad_b;
+  if (padding)
+  {
+    int pad_w = kernel_dilation + (w - 1) / stride * stride - w;
+    int pad_h = kernel_dilation + (h - 1) / stride * stride - h;
+    pad_l = pad_w / 2;
+    pad_r = pad_w - pad_l;
+    pad_t = pad_h / 2;
+    pad_b = pad_h - pad_t;
+  }
+  else
+  {
+    pad_l = pad_r = pad_t = pad_b = 0;
+  }
+
+  input.w = w;
+  input.h = h;
+  input.c = inch;
+  input.n = 1;
+#ifdef NCNN
+  input.data =
+      (float *)malloc(alignSize(input.w * input.h, 16 / sizeof(float)) * input.c * sizeof(float));
+#else
+  input.data = (float *)malloc(input.w * input.h * input.c * sizeof(float));
+#endif
+
+  if (!input.data)
+    return 0;
+
+  output.w = (w + pad_l + pad_r - kernel_dilation) / stride + 1;
+  output.h = (h + pad_t + pad_b - kernel_dilation) / stride + 1;
+  output.c = outch;
+  output.n = 1;
+#ifdef NCNN
+  output.data = (float *)malloc(alignSize(output.w * output.h, 16 / sizeof(float)) * output.c *
+                                sizeof(float));
+#else
+  output.data = (float *)malloc(output.w * output.h * output.c * sizeof(float));
+#endif
+
+  if (!output.data)
+    return 0;
+
+  for (int i = 0; i < output.w * output.h * output.c; i++)
+  {
+    output.data[i] = 0;
+  }
+
+  filter.w = kernel_size;
+  filter.h = kernel_size;
+  filter.c = inch;
+  filter.n = outch;
+  filter.data = (float *)malloc(filter.w * filter.h * filter.c * filter.n * sizeof(float));
+  if (!filter.data)
+    return 0;
+
+  for (int i = 0; i < input.w * input.h * input.c; i++)
+  {
+    input.data[i] = 0.001 + i * 0.000001;
+  }
+
+#if 1
+  for (int i = 0; i < filter.w * filter.h * filter.c * filter.n; i++)
+  {
+    filter.data[i] = 0.001 - i * 0.000001;
+  }
+#else
+  for (int i = 0; i < filter.w * filter.h * filter.c * filter.n; i++)
+  {
+    if ((i + 1) % 15 == 0)
+      filter.data[i] = 0.001 - i * 0.000001;
+    else
+      filter.data[i] = 0;
+  }
+#endif
+  params.kernel_w = kernel_size;
+  params.kernel_h = kernel_size;
+  params.stride_w = stride;
+  params.stride_h = stride;
+  params.padding = padding;
+  params.pad_w = pad_l;
+  params.pad_h = pad_t;
+  params.dilation_w = dilation;
+  params.dilation_h = dilation;
+
+  const int m = output.c;
+  const int n = output.w * output.h;
+  const int k = params.kernel_h * params.kernel_w * input.c;
+
+  // ocl_context_t context;
+  size_t local_min[2];
+  /**
+      if(conv_type == 14 || conv_type == 15 || conv_type == 6)
+      {
+          if(init_gpu(&context) < 0) return -1;
+          //if(conv_type ==14 || conv_type == 5) sgemm_ocltune(&context, m, n, (k < 1024 ? k :
+     1024), local_min);
+          //else if(conv_type == 6)
+          {
+              if(kernel_size == 3) directconv_3x3S1_tune(&context, &input, &filter, &output,
+     local_min);
+              else if(kernel_size == 1) directconv_1x1S1_tune(&context, &input, &filter, &output,
+     local_min);
+          }
+          //local_min[0] = 1; local_min[1] = 1;
+      }
+      **/
+  if (conv_type == 0)
+  {
+    for (int nloop = 0; nloop < loops; nloop++)
+    {
+      gettimeofday(&start, NULL);
+
+      direct_conv_rowmajor(&input, &output, &filter, &params);
+      // direct_conv_colmajor(&input, &output, &filter, &params);
+
+      gettimeofday(&end, NULL);
+      total_time +=
+          ((end.tv_sec * 1000000 + end.tv_usec) - (start.tv_sec * 1000000 + start.tv_usec)) / 1000;
+    }
+  }
+  else if (conv_type == 1)
+  {
+    for (int nloop = 0; nloop < loops; nloop++)
+    {
+      // printf("nloop = %d, thread_num = %d\n", nloop, thread_num);
+      // class srcn_sgemm my_gemm(input, filter, output, params, thread_num, col_major);
+      gettimeofday(&start, NULL);
+
+      /*if(thread_num == 1)
+      {
+          class conv_sgemm_singlethread my_gemm(input, filter, output, params, col_major);
+          my_gemm.run();
+      }
+      else
+      {
+          class conv_sgemm_multithreads my_gemm(input, filter, output, params, thread_num,
+      col_major);
+          my_gemm.run();
+      }*/
+
+      srcn_convolution2D(input, filter, output, params, NULL, thread_num, row_major);
+
+      // printf("sync\n");
+
+      gettimeofday(&end, NULL);
+      total_time +=
+          ((end.tv_sec * 1000000 + end.tv_usec) - (start.tv_sec * 1000000 + start.tv_usec)) / 1000;
+    }
+  }
+  else if (conv_type == 2)
+  {
+    float *winograd_weight;
+
+    // trans_weight2winograd(filter, &winograd_weight);
+
+    winogradParams_t wparams = {params.kernel_w,
+                                params.kernel_h,
+                                params.stride_w,
+                                params.stride_h,
+                                params.dilation_w,
+                                params.dilation_h,
+                                1,
+                                w,
+                                h,
+                                input.c,
+                                output.c,
+                                thread_num,
+                                col_major,
+                                filter.data};
+    winograd_weight = trans_weight2winograd(wparams);
+
+    for (int nloop = 0; nloop < loops; nloop++)
+    {
+      gettimeofday(&start, NULL);
+
+      // class conv_winograd my_sgemm(input, output, params, col_major, winograd_weight, thread_num,
+      // w * h, n);
+      // my_sgemm.run();
+
+      srcn_convolution2D(input, filter, output, params, winograd_weight, thread_num, row_major);
+
+      gettimeofday(&end, NULL);
+      total_time +=
+          ((end.tv_sec * 1000000 + end.tv_usec) - (start.tv_sec * 1000000 + start.tv_usec)) / 1000;
+    }
+  }
+  else if (conv_type == 3)
+  {
+    void *sparse_weight = trans_weight2sparse(filter);
+
+    for (int nloop = 0; nloop < loops; nloop++)
+    {
+      gettimeofday(&start, NULL);
+
+      srcn_sparse_convolution2D(input, output, params, sparse_weight, thread_num, row_major);
+
+      gettimeofday(&end, NULL);
+      total_time +=
+          ((end.tv_sec * 1000000 + end.tv_usec) - (start.tv_sec * 1000000 + start.tv_usec)) / 1000;
+    }
+
+    sparse_release(outch, sparse_weight);
+  } /**
+else if(conv_type == 4)
+{
+#if 0
+    cl_int err;
+    convlib::load_opencl("./libmali.so");
+    const int mpad = (m + 4 - 1) / 4 * 4;
+    const int npad = (n + 4 - 1) / 4 * 4;
+    cl_mem lhs_gpu = convlib::clCreateBuffer(context.context, CL_MEM_READ_WRITE |
+CL_MEM_ALLOC_HOST_PTR, mpad * k * sizeof(float), NULL, &err);
+    if(err != CL_SUCCESS)
+    {
+        printf("err = %d@%s:%d\n", err, __FUNCTION__, __LINE__);
+        return -1;
+    }
+
+    cl_image_format rhs_format = {CL_RGBA, CL_FLOAT};
+    cl_image_desc desc =
+    {
+        CL_MEM_OBJECT_IMAGE2D,
+        (size_t)npad / 4,
+        (size_t)k,
+        0, 0,
+        0,
+        0, 0, 0, 0
+    };
+    cl_mem rhs_gpu = convlib::clCreateImage(context.context, CL_MEM_READ_ONLY |
+CL_MEM_ALLOC_HOST_PTR, &rhs_format, &desc, NULL, &err);
+    if(err != CL_SUCCESS)
+    {
+        printf("err = %d@%s:%d\n", err, __FUNCTION__, __LINE__);
+        return -1;
+    }
+
+    cl_mem rhs_gpu = convlib::clCreateBuffer(context.context, CL_MEM_READ_WRITE |
+CL_MEM_ALLOC_HOST_PTR, npad * k * sizeof(float), NULL, &err);
+    if(err != CL_SUCCESS)
+    {
+        printf("err = %d@%s:%d\n", err, __FUNCTION__, __LINE__);
+        return -1;;
+    }
+
+    cl_mem res_gpu = convlib::clCreateBuffer(context.context, CL_MEM_READ_WRITE |
+CL_MEM_ALLOC_HOST_PTR, mpad * npad * sizeof(float), NULL, &err);
+    if(err != CL_SUCCESS)
+    {
+        printf("err = %d@%s:%d\n", err, __FUNCTION__, __LINE__);
+        return -1;
+    }
+#endif
+    for(int nloop = 0; nloop < loops + 1; nloop++)
+    {
+        gettimeofday(&start, NULL);
+
+        //cl_mem _res_gpu = conv2D_gpu_sgemm(&context, &input, &filter, &output, &params, local_min,
+lhs_gpu, rhs_gpu, res_gpu);
+
+        //get_result_gpu(&context, output.data + gpu_data_off, _res_gpu, m, n);
+        srcn_convolution2D_gpu(input, filter, output, params, row_major);
+
+        gettimeofday(&end, NULL);
+
+        if(nloop > 0) total_time += ((end.tv_sec * 1000000 + end.tv_usec) - (start.tv_sec * 1000000
++ start.tv_usec))/1000;
+    }
+}
+else if(conv_type == 5)
+{
+
+    for(int nloop = 0; nloop < loops + 1; nloop++)
+    {
+        gettimeofday(&start, NULL);
+
+        //cl_mem res_gpu = conv2D_gpu_sgemm(&context, &input, &filter, &output, &params, local_min);
+
+        //clFlush(context.cmdQueue);
+        gettimeofday(&start1, NULL);
+    #if 1
+        srcn_convolution2D(input, filter, output, params, NULL, thread_num, row_major
+
+    #endif
+        //usleep(80 * 1000);
+        gettimeofday(&end1, NULL);
+        total_time1 += ((end1.tv_sec * 1000000 + end1.tv_usec) - (start1.tv_sec * 1000000 +
+start1.tv_usec))/1000;
+
+        //get_result_gpu(&context, output.data + gpu_data_off, res_gpu, m, n);
+
+        srcn_convolution2D_dpu(input, filter, output, params, row_major);
+
+        gettimeofday(&end, NULL);
+        if(nloop > 0) total_time += ((end.tv_sec * 1000000 + end.tv_usec) - (start.tv_sec * 1000000
++ start.tv_usec))/1000;
+    }
+}
+else if(conv_type == 6)
+{
+    for(int nloop = 0; nloop < loops; nloop++)
+    {
+        gettimeofday(&start, NULL);
+
+        if(kernel_size == 3 && stride == 1 && padding == 0)
+        {
+            conv2D_gpu_directconv_3x3S1(&context, &input, &filter, &output, &params, local_min);
+        }
+        else if(kernel_size == 1 && stride == 1 && padding == 0)
+        {
+            conv2D_gpu_directconv_1x1S1(&context, &input, &filter, &output, &params, local_min);
+        }
+
+        gettimeofday(&end, NULL);
+        total_time += ((end.tv_sec * 1000000 + end.tv_usec) - (start.tv_sec * 1000000 +
+start.tv_usec))/1000;
+    }
+}**/
+
+  int div = m * n < 16 ? m * n : 16;
+  int num = m * n > 64 ? 64 : m * n;
+
+  if (conv_type < 4)
+    printf("[CPU RESULT]\n");
+  else if (conv_type == 4)
+    printf("[GPU RESULT]\n");
+  else if (conv_type == 5)
+    printf("[DPU RESULT]\n");
+  float *c_ptr = output.data;
+  for (int i = 0; i < num; i++)
+  {
+    printf("%f ", c_ptr[i]);
+    if ((i + 1) % div == 0)
+      printf("\n");
+  }
+
+  printf("\n");
+
+  c_ptr = &output.data[m * n - num];
+  for (int i = 0; i < num; i++)
+  {
+    printf("%f ", c_ptr[i]);
+    if ((i + 1) % div == 0)
+      printf("\n");
+  }
+
+  printf("\n");
+
+  long long total_size = (long long)m * n * k * 2;
+  printf(
+      "AVER Time consuming: %.2fms, CPU Time consuming: %.2fms, total size: %lld, (GFLOP: %.2f)\n",
+      total_time / loops, total_time1 / loops, total_size,
+      (double)total_size / (total_time / loops) / 1000000);
+
+  free(input.data);
+  free(output.data);
+  free(filter.data);
+
+  return 0;
+}
+
+/**
+ * @brief Benchmark a 2D deconvolution on synthetic data and print a sample of the result
+ *
+ * The input and the filter are filled with deterministic ramps, the kernel selected by
+ * @c conv_type is run @c loops times, and the first and last (at most 64) output values are
+ * printed, followed by the average time per run.
+ *
+ * @param[in] w           Input width
+ * @param[in] h           Input height
+ * @param[in] kernel_size Width and height of the square kernel
+ * @param[in] stride      Stride used for both dimensions
+ * @param[in] inch        Number of input channels
+ * @param[in] outch       Number of output channels
+ * @param[in] padding     Non-zero shrinks each output dimension by (kernel_dilation - 1),
+ *                        zero disables padding
+ * @param[in] conv_type   0 runs direct_deconv_rowmajor, 1 runs srcn_deconvolution2D; any other
+ *                        value runs no kernel and the zero-filled output is printed
+ * @param[in] thread_num  Thread count forwarded to srcn_deconvolution2D
+ * @param[in] loops       Number of timed repetitions
+ *
+ * @return Always 0, also when one of the buffers cannot be allocated
+ */
+static int test_deconv(const int w, const int h, const int kernel_size, const int stride,
+                       const int inch, const int outch, const int padding, const int conv_type,
+                       const int thread_num, const int loops)
+{
+  struct timeval start, end;
+  float total_time = 0.f;
+
+  const int dilation = 1;
+
+  const int kernel_dilation = dilation * (kernel_size - 1) + 1;
+
+  convMat_t input;
+  convMat_t output;
+  convMat_t filter;
+  convParams_t params;
+
+  int pad_l, pad_r, pad_t, pad_b;
+  if (padding)
+  {
+    int pad_w = kernel_dilation - 1;
+    int pad_h = kernel_dilation - 1;
+    pad_l = pad_w / 2;
+    pad_r = pad_w - pad_l;
+    pad_t = pad_h / 2;
+    pad_b = pad_h - pad_t;
+  }
+  else
+  {
+    pad_l = pad_r = pad_t = pad_b = 0;
+  }
+
+  input.w = w;
+  input.h = h;
+  input.c = inch;
+  // Element counts are widened to size_t before multiplying so the byte size cannot wrap in int.
+  input.data = (float *)malloc((size_t)input.w * input.h * input.c * sizeof(float));
+  if (!input.data)
+    return 0;
+
+  output.w = stride * (w - 1) + kernel_dilation - (pad_l + pad_r);
+  output.h = stride * (h - 1) + kernel_dilation - (pad_t + pad_b);
+  output.c = outch;
+  // Zero-filled so the dump below never reads indeterminate values when no kernel is selected.
+  output.data = (float *)calloc((size_t)output.w * output.h * output.c, sizeof(float));
+  if (!output.data)
+  {
+    free(input.data);
+    return 0;
+  }
+
+  filter.w = kernel_size;
+  filter.h = kernel_size;
+  filter.c = outch;
+  filter.n = inch;
+  filter.data =
+      (float *)malloc((size_t)filter.w * filter.h * filter.c * filter.n * sizeof(float));
+  if (!filter.data)
+  {
+    free(output.data);
+    free(input.data);
+    return 0;
+  }
+
+  for (int i = 0; i < input.w * input.h * input.c; i++)
+  {
+    input.data[i] = 0.001 + i * 0.000001;
+  }
+
+  for (int i = 0; i < filter.w * filter.h * filter.c * filter.n; i++)
+  {
+    filter.data[i] = 0.001 - i * 0.000001;
+  }
+
+  params.kernel_w = kernel_size;
+  params.kernel_h = kernel_size;
+  params.stride_w = stride;
+  params.stride_h = stride;
+  params.padding = padding;
+  params.pad_w = pad_l;
+  params.pad_h = pad_t;
+  params.dilation_w = dilation;
+  params.dilation_h = dilation;
+
+  // Dimensions used only for the operation count reported at the end.
+  const int m = params.kernel_h * params.kernel_w * output.c;
+  const int n = input.w * input.h;
+  const int k = input.c;
+
+  if (conv_type == 0)
+  {
+    for (int nloop = 0; nloop < loops; nloop++)
+    {
+      gettimeofday(&start, NULL);
+
+      direct_deconv_rowmajor(&input, &output, &filter, &params);
+
+      gettimeofday(&end, NULL);
+      // Subtract the seconds before scaling: tv_sec * 1000000 overflows a 32-bit time_t, and
+      // floating-point milliseconds keep the sub-millisecond part of every run.
+      total_time += (end.tv_sec - start.tv_sec) * 1000.f + (end.tv_usec - start.tv_usec) / 1000.f;
+    }
+  }
+  else if (conv_type == 1)
+  {
+    for (int nloop = 0; nloop < loops; nloop++)
+    {
+      gettimeofday(&start, NULL);
+
+      // The output is cleared inside the timed region before every run.
+      for (int i = 0; i < output.w * output.h * output.c; i++)
+      {
+        output.data[i] = 0;
+      }
+
+      srcn_deconvolution2D(input, filter, output, params, thread_num, row_major);
+
+      gettimeofday(&end, NULL);
+      total_time += (end.tv_sec - start.tv_sec) * 1000.f + (end.tv_usec - start.tv_usec) / 1000.f;
+    }
+  }
+
+  const int output_size = output.w * output.h * output.c;
+
+  // Print up to 64 values, 16 per row, from the head and from the tail of the output.
+  int div = output_size < 16 ? output_size : 16;
+  int num = output_size > 64 ? 64 : output_size;
+
+  float *c_ptr = output.data;
+  for (int i = 0; i < num; i++)
+  {
+    printf("%f ", c_ptr[i]);
+    if ((i + 1) % div == 0)
+      printf("\n");
+  }
+
+  printf("\n");
+
+  c_ptr = &output.data[output_size - num];
+  for (int i = 0; i < num; i++)
+  {
+    printf("%f ", c_ptr[i]);
+    if ((i + 1) % div == 0)
+      printf("\n");
+  }
+
+  printf("\n");
+
+  long long total_size = (long long)m * n * k * 2;
+  // Guard both divisions so a non-positive loop count or an unmeasurably short run prints 0
+  // instead of inf/NaN.
+  const float aver_time = loops > 0 ? total_time / loops : 0.f;
+  const double gflop = aver_time > 0.f ? (double)total_size / aver_time / 1000000 : 0.0;
+  printf("AVER Time consuming: %.2fms, total size: %lld, (GFLOP: %.2f)\n", aver_time,
+         total_size, gflop);
+
+  free(input.data);
+  free(output.data);
+  free(filter.data);
+
+  return 0;
+}
+
+/**
+ * @brief Benchmark a batched 2D convolution on synthetic data and print a sample of the result
+ *
+ * The input and the filter are filled with deterministic ramps, the kernel selected by
+ * @c conv_type is run @c loops times, and the first and last (at most 64) output values are
+ * printed, followed by the average time per run.
+ *
+ * @param[in] batch       Number of images in the batch
+ * @param[in] w           Input width
+ * @param[in] h           Input height
+ * @param[in] kernel_size Width and height of the square kernel
+ * @param[in] stride      Stride used for both dimensions
+ * @param[in] inch        Number of input channels
+ * @param[in] outch       Number of output channels
+ * @param[in] padding     Non-zero derives the padding from the kernel, stride and input size,
+ *                        zero disables padding
+ * @param[in] conv_type   1 runs srcn_batch_convolution2D without pre-transformed weights, 2 runs
+ *                        it with the weights returned by trans_weight2winograd; any other value
+ *                        runs no kernel and the zero-filled output is printed
+ * @param[in] thread_num  Thread count forwarded to the convolution
+ * @param[in] loops       Number of timed repetitions
+ *
+ * @return Always 0, also when one of the buffers cannot be allocated
+ */
+static int test_batch_conv(const int batch, const int w, const int h, const int kernel_size,
+                           const int stride, const int inch, const int outch, const int padding,
+                           const int conv_type, const int thread_num, const int loops)
+{
+  struct timeval start, end;
+  float total_time = 0.f;
+
+  const int dilation = 1;
+
+  const int kernel_dilation = dilation * (kernel_size - 1) + 1;
+
+  convMat_t input;
+  convMat_t output;
+  convMat_t filter;
+  convParams_t params;
+
+  int pad_l, pad_r, pad_t, pad_b;
+  if (padding)
+  {
+    int pad_w = kernel_dilation + (w - 1) / stride * stride - w;
+    int pad_h = kernel_dilation + (h - 1) / stride * stride - h;
+    pad_l = pad_w / 2;
+    pad_r = pad_w - pad_l;
+    pad_t = pad_h / 2;
+    pad_b = pad_h - pad_t;
+  }
+  else
+  {
+    pad_l = pad_r = pad_t = pad_b = 0;
+  }
+
+  input.w = w;
+  input.h = h;
+  input.c = inch;
+  input.n = batch;
+  // Element counts are widened to size_t before multiplying so the byte size cannot wrap in int.
+  input.data = (float *)malloc((size_t)input.n * input.w * input.h * input.c * sizeof(float));
+  if (!input.data)
+    return 0;
+
+  output.w = (w + pad_l + pad_r - kernel_dilation) / stride + 1;
+  output.h = (h + pad_t + pad_b - kernel_dilation) / stride + 1;
+  output.c = outch;
+  output.n = batch;
+  // Zero-filled so the dump below never reads indeterminate values when no kernel is selected.
+  output.data = (float *)calloc((size_t)output.n * output.w * output.h * output.c, sizeof(float));
+  if (!output.data)
+  {
+    free(input.data);
+    return 0;
+  }
+
+  filter.w = kernel_size;
+  filter.h = kernel_size;
+  filter.c = inch;
+  filter.n = outch;
+  filter.data =
+      (float *)malloc((size_t)filter.w * filter.h * filter.c * filter.n * sizeof(float));
+  if (!filter.data)
+  {
+    free(output.data);
+    free(input.data);
+    return 0;
+  }
+
+  for (int i = 0; i < input.w * input.h * input.c * input.n; i++)
+  {
+    input.data[i] = 0.001 + i * 0.000001;
+  }
+
+  for (int i = 0; i < filter.w * filter.h * filter.c * filter.n; i++)
+  {
+    filter.data[i] = 0.001 - i * 0.000001;
+  }
+
+  params.kernel_w = kernel_size;
+  params.kernel_h = kernel_size;
+  params.stride_w = stride;
+  params.stride_h = stride;
+  params.padding = padding;
+  params.pad_w = pad_l;
+  params.pad_h = pad_t;
+  params.dilation_w = dilation;
+  params.dilation_h = dilation;
+
+  // Per-image GEMM dimensions, used for the dump size and the reported operation count.
+  const int m = output.c;
+  const int n = output.w * output.h;
+  const int k = params.kernel_h * params.kernel_w * input.c;
+
+  if (conv_type == 1)
+  {
+    for (int nloop = 0; nloop < loops; nloop++)
+    {
+      gettimeofday(&start, NULL);
+
+      srcn_batch_convolution2D(input, filter, output, params, NULL, thread_num, col_major);
+
+      gettimeofday(&end, NULL);
+      // Subtract the seconds before scaling: tv_sec * 1000000 overflows a 32-bit time_t, and
+      // floating-point milliseconds keep the sub-millisecond part of every run.
+      total_time += (end.tv_sec - start.tv_sec) * 1000.f + (end.tv_usec - start.tv_usec) / 1000.f;
+    }
+  }
+  else if (conv_type == 2)
+  {
+    float *winograd_weight;
+
+    winogradParams_t wparams = {params.kernel_w,
+                                params.kernel_h,
+                                params.stride_w,
+                                params.stride_h,
+                                params.dilation_w,
+                                params.dilation_h,
+                                input.n,
+                                w,
+                                h,
+                                input.c,
+                                output.c,
+                                thread_num,
+                                col_major,
+                                filter.data};
+    // The weight transform is done once, outside the timed region.
+    // NOTE(review): the returned buffer is never released in this function; the matching
+    // release routine is not visible in this file - confirm and free it after the loop.
+    winograd_weight = trans_weight2winograd(wparams);
+
+    for (int nloop = 0; nloop < loops; nloop++)
+    {
+      gettimeofday(&start, NULL);
+
+      srcn_batch_convolution2D(input, filter, output, params, winograd_weight, thread_num,
+                               col_major);
+
+      gettimeofday(&end, NULL);
+      total_time += (end.tv_sec - start.tv_sec) * 1000.f + (end.tv_usec - start.tv_usec) / 1000.f;
+    }
+  }
+
+  // Print up to 64 values, 16 per row, from the head and from the tail of the output.
+  int div = m * n < 16 ? m * n : 16;
+  int num = m * n > 64 ? 64 : m * n;
+
+  float *c_ptr = output.data;
+  for (int i = 0; i < num; i++)
+  {
+    printf("%f ", c_ptr[i]);
+    if ((i + 1) % div == 0)
+      printf("\n");
+  }
+
+  printf("\n");
+
+  c_ptr = &output.data[m * n * batch - num];
+  for (int i = 0; i < num; i++)
+  {
+    printf("%f ", c_ptr[i]);
+    if ((i + 1) % div == 0)
+      printf("\n");
+  }
+
+  printf("\n");
+
+  long long total_size = (long long)batch * m * n * k * 2;
+  // Guard both divisions so a non-positive loop count or an unmeasurably short run prints 0
+  // instead of inf/NaN.
+  const float aver_time = loops > 0 ? total_time / loops : 0.f;
+  const double gflop = aver_time > 0.f ? (double)total_size / aver_time / 1000000 : 0.0;
+  printf("AVER Time consuming: %.2fms, total size: %lld, (GFLOP: %.2f)\n", aver_time,
+         total_size, gflop);
+
+  free(input.data);
+  free(output.data);
+  free(filter.data);
+
+  return 0;
+}
+
+/**
+ * @brief Benchmark a depthwise 2D convolution on synthetic data and print a sample of the result
+ *
+ * The input and the filter are filled with deterministic ramps and the bias with zeros. The
+ * kernel selected by @c conv_type is run @c loops times (at least once); before every run the
+ * output is reset to 1.0 outside the timed region. Afterwards the first and last (at most 64)
+ * output values and the average time per run are printed.
+ *
+ * @param[in] w           Input width
+ * @param[in] h           Input height
+ * @param[in] kernel_size Width and height of the square kernel
+ * @param[in] stride      Stride used for both dimensions
+ * @param[in] inch        Number of input channels
+ * @param[in] outch       Number of output channels, must equal @c inch
+ * @param[in] padding     Non-zero derives the padding from the kernel, stride and input size,
+ *                        zero disables padding
+ * @param[in] conv_type   0 runs srcn_depthwise_conv, 2 runs srcn_convolution2D once per channel
+ *                        on single-channel views; any other value runs no kernel
+ * @param[in] thread_num  Thread count forwarded to srcn_depthwise_conv
+ * @param[in] loops       Number of timed repetitions; values below 1 are treated as 1
+ *
+ * @return -1 when @c outch differs from @c inch, otherwise 0 (also when an allocation fails)
+ */
+static int test_depthwise_conv(const int w, const int h, const int kernel_size, const int stride,
+                               const int inch, const int outch, const int padding,
+                               const int conv_type, const int thread_num, const int loops)
+{
+  if (outch != inch)
+    return -1;
+  struct timeval start, end;
+  float total_time = 0.f;
+
+  const int dilation = 1;
+
+  const int kernel_dilation = dilation * (kernel_size - 1) + 1;
+
+  convMat_t input;
+  convMat_t output;
+  convMat_t filter;
+  convMat_t bias;
+  convParams_t params;
+
+  int pad_l, pad_r, pad_t, pad_b;
+  if (padding)
+  {
+    int pad_w = kernel_dilation + (w - 1) / stride * stride - w;
+    int pad_h = kernel_dilation + (h - 1) / stride * stride - h;
+    pad_l = pad_w / 2;
+    pad_r = pad_w - pad_l;
+    pad_t = pad_h / 2;
+    pad_b = pad_h - pad_t;
+  }
+  else
+  {
+    pad_l = pad_r = pad_t = pad_b = 0;
+  }
+
+  input.w = w;
+  input.h = h;
+  input.c = inch;
+  input.n = 1;
+#ifdef NCNN
+  input.data =
+      (float *)malloc(alignSize(input.w * input.h, 16 / sizeof(float)) * input.c * sizeof(float));
+#else
+  input.data = (float *)malloc((size_t)input.w * input.h * input.c * sizeof(float));
+#endif
+  if (!input.data)
+    return 0;
+
+  output.w = (w + pad_l + pad_r - kernel_dilation) / stride + 1;
+  output.h = (h + pad_t + pad_b - kernel_dilation) / stride + 1;
+  output.c = outch;
+  output.n = 1;
+
+#ifdef NCNN
+  output.data = (float *)malloc(alignSize(output.w * output.h, 16 / sizeof(float)) * output.c *
+                                sizeof(float));
+#else
+  output.data = (float *)malloc((size_t)output.w * output.h * output.c * sizeof(float));
+#endif
+  if (!output.data)
+  {
+    free(input.data);
+    return 0;
+  }
+
+  filter.w = kernel_size;
+  filter.h = kernel_size;
+  filter.c = 1;
+  filter.n = outch;
+  filter.data =
+      (float *)malloc((size_t)filter.w * filter.h * filter.c * filter.n * sizeof(float));
+  if (!filter.data)
+  {
+    free(output.data);
+    free(input.data);
+    return 0;
+  }
+
+  for (int i = 0; i < input.w * input.h * input.c; i++)
+  {
+    input.data[i] = 0.001 + i * 0.000001;
+  }
+
+  for (int i = 0; i < filter.w * filter.h * filter.c * filter.n; i++)
+  {
+    filter.data[i] = 0.001 - i * 0.000001;
+  }
+
+  bias.w = outch;
+  bias.data = (float *)malloc(bias.w * sizeof(float));
+  if (!bias.data)
+  {
+    free(filter.data);
+    free(output.data);
+    free(input.data);
+    return 0;
+  }
+  for (int i = 0; i < bias.w; i++)
+  {
+    bias.data[i] = 0.f;
+  }
+
+  params.kernel_w = kernel_size;
+  params.kernel_h = kernel_size;
+  params.stride_w = stride;
+  params.stride_h = stride;
+  params.padding = padding;
+  params.pad_w = pad_l;
+  params.pad_h = pad_t;
+  params.dilation_w = dilation;
+  params.dilation_h = dilation;
+
+  // Dimensions used for the dump size and the reported operation count.
+  const int m = output.c;
+  const int n = output.w * output.h;
+  const int k = params.kernel_h * params.kernel_w * input.c;
+
+  // The reported time is an average, so the kernel has to run once per requested loop.
+  const int runs = loops > 0 ? loops : 1;
+
+  for (int nloop = 0; nloop < runs; nloop++)
+  {
+    // Reset the output outside the timed region so every run starts from the same contents.
+    for (int i = 0; i < output.w * output.h * output.c; i++)
+    {
+      output.data[i] = 1.f;
+    }
+
+    gettimeofday(&start, NULL);
+
+    if (conv_type == 0)
+    {
+      srcn_depthwise_conv(input, filter, output, bias, params, thread_num, row_major);
+    }
+    else if (conv_type == 2)
+    {
+      // Reference path: convolve every channel separately through single-channel views.
+      for (int i = 0; i < input.c; i++)
+      {
+        convMat_t ch_input;
+        convMat_t ch_output;
+        convMat_t ch_filter;
+        convParams_t ch_params = params;
+
+        ch_input.w = input.w;
+        ch_input.h = input.h;
+        ch_input.c = 1;
+        ch_input.n = 1;
+#ifdef NCNN
+        ch_input.data = input.data + i * alignSize(input.w * input.h, 16 / sizeof(float));
+#else
+        ch_input.data = input.data + i * input.w * input.h;
+#endif
+
+        ch_output.w = output.w;
+        ch_output.h = output.h;
+        ch_output.c = 1;
+        ch_output.n = 1;
+#ifdef NCNN
+        ch_output.data = output.data + i * alignSize(output.w * output.h, 16 / sizeof(float));
+#else
+        ch_output.data = output.data + i * output.w * output.h;
+#endif
+        ch_filter.w = filter.w;
+        ch_filter.h = filter.h;
+        ch_filter.c = 1;
+        ch_filter.n = 1;
+        // One kernel_size x kernel_size plane per channel (previously hard-coded as 9 = 3 x 3).
+        ch_filter.data = filter.data + i * filter.w * filter.h;
+
+        srcn_convolution2D(ch_input, ch_filter, ch_output, ch_params, NULL, 1, row_major);
+      }
+    }
+
+    gettimeofday(&end, NULL);
+    // Subtract the seconds before scaling: tv_sec * 1000000 overflows a 32-bit time_t, and
+    // floating-point milliseconds keep the sub-millisecond part of every run.
+    total_time += (end.tv_sec - start.tv_sec) * 1000.f + (end.tv_usec - start.tv_usec) / 1000.f;
+  }
+
+  // Print up to 64 values, 16 per row, from the head and from the tail of the output.
+  int div = m * n < 16 ? m * n : 16;
+  int num = m * n > 64 ? 64 : m * n;
+
+  if (conv_type == 0)
+    printf("[CPU RESULT]\n");
+  else if (conv_type == 1)
+    printf("[GPU RESULT]\n");
+  float *c_ptr = output.data;
+  for (int i = 0; i < num; i++)
+  {
+    printf("%f ", c_ptr[i]);
+    if ((i + 1) % div == 0)
+      printf("\n");
+  }
+
+  printf("\n");
+
+  c_ptr = &output.data[m * n - num];
+  for (int i = 0; i < num; i++)
+  {
+    printf("%f ", c_ptr[i]);
+    if ((i + 1) % div == 0)
+      printf("\n");
+  }
+
+  printf("\n");
+
+  long long total_size = (long long)m * n * k * 2;
+  const float aver_time = total_time / runs;
+  // Guard the division so an unmeasurably short run prints 0 instead of inf.
+  const double gflop = aver_time > 0.f ? (double)total_size / aver_time / 1000000 : 0.0;
+  printf("AVER Time consuming: %.2fms, total size: %lld, (GFLOP: %.2f)\n", aver_time,
+         total_size, gflop);
+
+  free(input.data);
+  free(output.data);
+  free(filter.data);
+  free(bias.data);
+
+  return 0;
+}
+
+// Build-time selection of the benchmark that main() runs. main() checks these macros with an
+// #ifdef/#elif chain in the order listed here, so the first one that is defined takes effect.
+//#define TEST_SGEMM
+#define TEST_CONV
+//#define TEST_DECONV
+//#define TEST_BATCH_CONV
+//#define TEST_DEPTHWISE_CONV
+
+/**
+ * @brief Parse the command line and run the benchmark selected by the TEST_* macro
+ *
+ * Expected arguments, depending on which macro is defined:
+ *  - TEST_SGEMM:      m n k type loops
+ *  - TEST_CONV, TEST_DECONV, TEST_DEPTHWISE_CONV:
+ *                     w h kernel_size stride outch inch padding conv_type thread_num [loops]
+ *  - TEST_BATCH_CONV: batch w h kernel_size stride outch inch padding conv_type thread_num [loops]
+ *
+ * The optional @c loops argument defaults to 1; values below 1 are raised to 1 because the
+ * benchmarks divide the accumulated time by it. Arguments are converted with atoi, so
+ * non-numeric text is read as 0.
+ *
+ * NOTE(review): this function is declared inside namespace nnfw::srcn, so it is
+ * nnfw::srcn::main and not the global program entry point - confirm that this is intended.
+ *
+ * @param[in] argc Number of command-line arguments
+ * @param[in] argv Command-line arguments
+ *
+ * @return -1 when too few arguments are given, otherwise the value returned by the selected
+ *         benchmark (0 for the SGEMM benchmark, whose result is not inspected)
+ */
+int main(int argc, char **argv)
+{
+  int ret = 0;
+  const char *prog = argc > 0 ? argv[0] : "sgemm_test";
+
+#ifdef TEST_SGEMM
+  if (argc < 6)
+  {
+    fprintf(stderr, "Usage: %s m n k type loops\n", prog);
+    return -1;
+  }
+
+  const int m = atoi(argv[1]);
+  const int n = atoi(argv[2]);
+  const int k = atoi(argv[3]);
+  const int type = atoi(argv[4]);
+  const int loops = atoi(argv[5]);
+
+  test_sgemm(m, n, k, type, loops);
+#elif (defined TEST_CONV)
+  if (argc < 10)
+  {
+    fprintf(stderr,
+            "Usage: %s w h kernel_size stride outch inch padding conv_type thread_num [loops]\n",
+            prog);
+    return -1;
+  }
+  const int w = atoi(argv[1]);
+  const int h = atoi(argv[2]);
+  const int kernel_size = atoi(argv[3]);
+  const int stride = atoi(argv[4]);
+  const int outch = atoi(argv[5]);
+  const int inch = atoi(argv[6]);
+  const int padding = atoi(argv[7]);
+  const int conv_type = atoi(argv[8]);
+  const int thread_num = atoi(argv[9]);
+  int loops = 1;
+  if (argc > 10)
+    loops = atoi(argv[10]);
+  if (loops < 1)
+    loops = 1;
+  ret = test_conv(w, h, kernel_size, stride, inch, outch, padding, conv_type, thread_num, loops);
+#elif (defined TEST_DECONV)
+  if (argc < 10)
+  {
+    fprintf(stderr,
+            "Usage: %s w h kernel_size stride outch inch padding conv_type thread_num [loops]\n",
+            prog);
+    return -1;
+  }
+  const int w = atoi(argv[1]);
+  const int h = atoi(argv[2]);
+  const int kernel_size = atoi(argv[3]);
+  const int stride = atoi(argv[4]);
+  const int outch = atoi(argv[5]);
+  const int inch = atoi(argv[6]);
+  const int padding = atoi(argv[7]);
+  const int conv_type = atoi(argv[8]);
+  const int thread_num = atoi(argv[9]);
+  int loops = 1;
+  if (argc > 10)
+    loops = atoi(argv[10]);
+  if (loops < 1)
+    loops = 1;
+  ret = test_deconv(w, h, kernel_size, stride, inch, outch, padding, conv_type, thread_num, loops);
+#elif (defined TEST_BATCH_CONV)
+  if (argc < 11)
+  {
+    fprintf(stderr,
+            "Usage: %s batch w h kernel_size stride outch inch padding conv_type thread_num "
+            "[loops]\n",
+            prog);
+    return -1;
+  }
+  const int batch = atoi(argv[1]);
+  const int w = atoi(argv[2]);
+  const int h = atoi(argv[3]);
+  const int kernel_size = atoi(argv[4]);
+  const int stride = atoi(argv[5]);
+  const int outch = atoi(argv[6]);
+  const int inch = atoi(argv[7]);
+  const int padding = atoi(argv[8]);
+  const int conv_type = atoi(argv[9]);
+  const int thread_num = atoi(argv[10]);
+  int loops = 1;
+  if (argc > 11)
+    loops = atoi(argv[11]);
+  if (loops < 1)
+    loops = 1;
+  ret = test_batch_conv(batch, w, h, kernel_size, stride, inch, outch, padding, conv_type,
+                        thread_num, loops);
+#elif (defined TEST_DEPTHWISE_CONV)
+  if (argc < 10)
+  {
+    fprintf(stderr,
+            "Usage: %s w h kernel_size stride outch inch padding conv_type thread_num [loops]\n",
+            prog);
+    return -1;
+  }
+  const int w = atoi(argv[1]);
+  const int h = atoi(argv[2]);
+  const int kernel_size = atoi(argv[3]);
+  const int stride = atoi(argv[4]);
+  const int outch = atoi(argv[5]);
+  const int inch = atoi(argv[6]);
+  const int padding = atoi(argv[7]);
+  const int conv_type = atoi(argv[8]);
+  const int thread_num = atoi(argv[9]);
+  int loops = 1;
+  if (argc > 10)
+    loops = atoi(argv[10]);
+  if (loops < 1)
+    loops = 1;
+  ret = test_depthwise_conv(w, h, kernel_size, stride, inch, outch, padding, conv_type,
+                            thread_num, loops);
+#endif
+
+  // Only referenced by the usage messages; keeps builds without any TEST_* macro warning-free.
+  (void)prog;
+
+  return ret;
+}
+
+} // namespace srcn
+} // namespace nnfw