summaryrefslogtreecommitdiff
path: root/compute/ncnn/src/srcn/sgemm_test.cc
diff options
context:
space:
mode:
Diffstat (limited to 'compute/ncnn/src/srcn/sgemm_test.cc')
-rw-r--r--compute/ncnn/src/srcn/sgemm_test.cc1883
1 files changed, 1883 insertions, 0 deletions
diff --git a/compute/ncnn/src/srcn/sgemm_test.cc b/compute/ncnn/src/srcn/sgemm_test.cc
new file mode 100644
index 000000000..1b10970bb
--- /dev/null
+++ b/compute/ncnn/src/srcn/sgemm_test.cc
@@ -0,0 +1,1883 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/time.h>
+#include <unistd.h>
+
+#include <new>
+
+#include "ncnn/srcn/conv_type.h"
+#include "srcn/srcn_conv.h"
+//#include "srcn_sgemm.h"
+#include "conv_sgemm_singlethread.h"
+#include "conv_sgemm_multithreads.h"
+//#include "conv_sgemm_batch.h"
+#include "sgemm_singlethread.h"
+#include "conv_winograd.h"
+#include "winograd.h"
+
+//#include "conv_gpu.h"
+//#include "convolutiondepthwise_3x3.h"
+
+namespace nnfw
+{
+namespace srcn
+{
+
+/**
+ * Straightforward (non-optimised) 2-D convolution over channel-major data.
+ *
+ * Indexing used below:
+ *   input  : [channel][row][col]
+ *   filter : [out_channel][in_channel][kernel_row][kernel_col]
+ *   output : [out_channel][row][col]
+ *
+ * Kernel taps that fall outside the input are skipped, so they add nothing to the sum.
+ * Only pad_w / pad_h are read from params as the left / top offset.
+ *
+ * @param input   source tensor; w, h, c and data are read
+ * @param output  destination tensor; w, h, c must already be set, data is overwritten
+ * @param filter  weights; only data is read
+ * @param params  kernel size, stride, padding and dilation
+ */
+static void direct_conv_rowmajor(convMat_t *input, convMat_t *output, convMat_t *filter,
+                                 convParams_t *params)
+{
+  const float *src = input->data;
+  const float *weights = filter->data;
+  float *dst = output->data;
+
+  const int in_w = input->w;
+  const int in_h = input->h;
+  const int in_ch = input->c;
+  const int out_w = output->w;
+  const int out_h = output->h;
+  const int out_ch = output->c;
+
+  const int k_w = params->kernel_w;
+  const int k_h = params->kernel_h;
+  const int step_x = params->stride_w;
+  const int step_y = params->stride_h;
+  const int left = params->pad_w;
+  const int top = params->pad_h;
+  const int dil_x = params->dilation_w;
+  const int dil_y = params->dilation_h;
+
+  for (int oc = 0; oc < out_ch; ++oc)
+  {
+    for (int oy = 0; oy < out_h; ++oy)
+    {
+      // Input row touched by the first kernel row for this output row.
+      const int y_origin = oy * step_y - top;
+
+      for (int ox = 0; ox < out_w; ++ox)
+      {
+        const int x_origin = ox * step_x - left;
+        float acc = 0.f;
+
+        for (int ic = 0; ic < in_ch; ++ic)
+        {
+          for (int ky = 0; ky < k_h; ++ky)
+          {
+            const int iy = y_origin + ky * dil_y;
+            // One unsigned compare rejects both iy < 0 and iy >= in_h.
+            if ((unsigned int)iy >= (unsigned int)in_h)
+              continue;
+
+            for (int kx = 0; kx < k_w; ++kx)
+            {
+              const int ix = x_origin + kx * dil_x;
+              if ((unsigned int)ix >= (unsigned int)in_w)
+                continue;
+
+              const float in_val = src[(ic * in_h + iy) * in_w + ix];
+              const float w_val = weights[((oc * in_ch + ic) * k_h + ky) * k_w + kx];
+              acc += (in_val * w_val);
+            }
+          }
+        }
+
+        dst[(oc * out_h + oy) * out_w + ox] = acc;
+      }
+    }
+  }
+}
+
+/**
+ * Straightforward (non-optimised) 2-D transposed convolution over channel-major data.
+ *
+ * The output is cleared first; every input element is then scattered into the output
+ * positions it contributes to.
+ *
+ * Indexing used below:
+ *   input  : [channel][row][col]
+ *   filter : [in_channel][out_channel][kernel_row][kernel_col]
+ *   output : [out_channel][row][col]
+ *
+ * Contributions that would land outside the output are dropped.
+ *
+ * @param input   source tensor; w, h, c and data are read
+ * @param output  destination tensor; w, h, c must already be set, data is overwritten
+ * @param filter  weights; only data is read
+ * @param params  kernel size, stride, padding and dilation
+ */
+static void direct_deconv_rowmajor(convMat_t *input, convMat_t *output, convMat_t *filter,
+                                   convParams_t *params)
+{
+  const float *src = input->data;
+  const float *weights = filter->data;
+  float *dst = output->data;
+
+  const int in_w = input->w;
+  const int in_h = input->h;
+  const int in_ch = input->c;
+  const int out_w = output->w;
+  const int out_h = output->h;
+  const int out_ch = output->c;
+
+  const int k_w = params->kernel_w;
+  const int k_h = params->kernel_h;
+  const int step_x = params->stride_w;
+  const int step_y = params->stride_h;
+  const int left = params->pad_w;
+  const int top = params->pad_h;
+  const int dil_x = params->dilation_w;
+  const int dil_y = params->dilation_h;
+
+  // Start from an all-zero output because the scatter below only accumulates.
+  for (int idx = 0; idx < out_w * out_h * out_ch; ++idx)
+  {
+    dst[idx] = 0;
+  }
+
+  for (int ic = 0; ic < in_ch; ++ic)
+  {
+    for (int iy = 0; iy < in_h; ++iy)
+    {
+      const int y_origin = iy * step_y - top;
+
+      for (int ix = 0; ix < in_w; ++ix)
+      {
+        const int x_origin = ix * step_x - left;
+        const float in_val = src[(ic * in_h + iy) * in_w + ix];
+
+        for (int oc = 0; oc < out_ch; ++oc)
+        {
+          for (int ky = 0; ky < k_h; ++ky)
+          {
+            const int oy = y_origin + ky * dil_y;
+            // One unsigned compare rejects both oy < 0 and oy >= out_h.
+            if ((unsigned int)oy >= (unsigned int)out_h)
+              continue;
+
+            for (int kx = 0; kx < k_w; ++kx)
+            {
+              const int ox = x_origin + kx * dil_x;
+              if ((unsigned int)ox >= (unsigned int)out_w)
+                continue;
+
+              const float w_val = weights[((ic * out_ch + oc) * k_h + ky) * k_w + kx];
+              dst[(oc * out_h + oy) * out_w + ox] += w_val * in_val;
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+/**
+ * Naive row-major matrix multiply: C (m x n) = op(A) (m x k) * op(B) (k x n).
+ *
+ * When Atrans == trans, A is read as a k x m array and transposed into a temporary m x k
+ * copy; when Btrans == trans, B is read as an n x k array and transposed into a temporary
+ * k x n copy. Any other flag value uses the operand as-is.
+ *
+ * The temporary copies are released before returning (the previous version leaked them).
+ * If a temporary cannot be allocated the function returns without touching C.
+ *
+ * @param Atrans  transpose flag for A (compared against `trans`)
+ * @param Btrans  transpose flag for B (compared against `trans`)
+ * @param m       rows of C
+ * @param n       columns of C
+ * @param k       inner dimension
+ * @param A       left operand, not modified
+ * @param B       right operand, not modified
+ * @param C       result, m * n floats, overwritten
+ */
+static void direct_sgemm_rowmajor(int Atrans, int Btrans, int m, int n, int k, float *A, float *B,
+                                  float *C)
+{
+  float *a_copy = NULL; // owned scratch, only set when A has to be transposed
+  float *b_copy = NULL; // owned scratch, only set when B has to be transposed
+  const float *aa = A;
+  const float *bb = B;
+
+  if (Atrans == trans)
+  {
+    // Widen before multiplying so the byte count is not computed in int.
+    a_copy = (float *)malloc((size_t)m * k * sizeof(float));
+    if (!a_copy)
+      return;
+
+    for (int i = 0; i < k; i++)
+    {
+      for (int j = 0; j < m; j++)
+      {
+        a_copy[j * k + i] = A[i * m + j];
+      }
+    }
+    aa = a_copy;
+  }
+
+  if (Btrans == trans)
+  {
+    b_copy = (float *)malloc((size_t)n * k * sizeof(float));
+    if (!b_copy)
+    {
+      free(a_copy); // do not leak the A scratch on this error path
+      return;
+    }
+
+    for (int i = 0; i < n; i++)
+    {
+      for (int j = 0; j < k; j++)
+      {
+        b_copy[j * n + i] = B[i * k + j];
+      }
+    }
+    bb = b_copy;
+  }
+
+  for (int i = 0; i < m; i++)
+  {
+    for (int j = 0; j < n; j++)
+    {
+      float res = 0.f;
+      for (int l = 0; l < k; l++)
+      {
+        res += aa[i * k + l] * bb[l * n + j];
+      }
+      C[i * n + j] = res;
+    }
+  }
+
+  // free(NULL) is a no-op, so no need to test which copies were made.
+  free(b_copy);
+  free(a_copy);
+}
+
+/*static void direct_sgemm_kernel(const int k, const int lhs_stride, const int rhs_stride, const int
+res_stride,
+ const float *lhs_ptr, const float *rhs_ptr, float *res_ptr)
+{
+ int lstride = lhs_stride << 2;
+ int rstride = rhs_stride << 2;
+ int estride = res_stride << 2;
+ int rstep = rstride << 2;
+
+ int nk = (k >> 2) - 1;
+
+ __asm __volatile (
+ "movi v16.4s, #0x0\n"
+ "movi v17.4s, #0x0\n"
+ "movi v18.4s, #0x0\n"
+ "movi v19.4s, #0x0\n"
+ "movi v20.4s, #0x0\n"
+ "movi v21.4s, #0x0\n"
+ "movi v22.4s, #0x0\n"
+ "movi v23.4s, #0x0\n"
+ "movi v24.4s, #0x0\n"
+ "movi v25.4s, #0x0\n"
+ "movi v26.4s, #0x0\n"
+ "movi v27.4s, #0x0\n"
+ "movi v28.4s, #0x0\n"
+ "movi v29.4s, #0x0\n"
+ "movi v30.4s, #0x0\n"
+ "movi v31.4s, #0x0\n"
+
+ "mov x0, %[lhs_ptr]\n"
+ "add %[lhs_ptr], %[lhs_ptr], #16\n"
+ "ld1 {v0.4s}, [x0]\n"
+ "add x0, x0, %[lstride]\n"
+ "ld1 {v1.4s}, [x0]\n"
+ "add x0, x0, %[lstride]\n"
+ "ld1 {v2.4s}, [x0]\n"
+ "add x0, x0, %[lstride]\n"
+ "ld1 {v3.4s}, [x0]\n"
+ "add x0, x0, %[lstride]\n"
+
+ "mov x1, %[rhs_ptr]\n"
+ "add %[rhs_ptr], %[rhs_ptr], %[rstep]\n"
+ "ld1 {v8.4s, v9.4s}, [x1]\n"
+ "add x1, x1, %[rstride]\n"
+ "ld1 {v10.4s, v11.4s}, [x1]\n"
+ "add x1, x1, %[rstride]\n"
+
+ "1:\n"
+ "fmla v16.4s, v8.4s, v0.s[0]\n"
+ "fmla v17.4s, v9.4s, v0.s[0]\n"
+ "fmla v16.4s, v10.4s, v0.s[1]\n"
+ "fmla v17.4s, v11.4s, v0.s[1]\n"
+ "fmla v18.4s, v8.4s, v1.s[0]\n"
+ "fmla v19.4s, v9.4s, v1.s[0]\n"
+ "fmla v18.4s, v10.4s, v1.s[1]\n"
+ "fmla v19.4s, v11.4s, v1.s[1]\n"
+ "ld1 {v12.4s, v13.4s}, [x1]\n"
+ "fmla v20.4s, v8.4s, v2.s[0]\n"
+ "add x1, x1, %[rstride]\n"
+ "fmla v21.4s, v9.4s, v2.s[0]\n"
+ "ld1 {v14.4s, v15.4s}, [x1]\n"
+ "fmla v20.4s, v10.4s, v2.s[1]\n"
+ "add x1, x1, %[rstride]\n"
+ "fmla v21.4s, v11.4s, v2.s[1]\n"
+ "fmla v22.4s, v8.4s, v3.s[0]\n"
+ "fmla v23.4s, v9.4s, v3.s[0]\n"
+ "fmla v22.4s, v10.4s, v3.s[1]\n"
+ "fmla v23.4s, v11.4s, v3.s[1]\n"
+
+ "ld1 {v4.4s}, [x0]\n"
+ "fmla v16.4s, v12.4s, v0.s[2]\n"
+ "add x0, x0, %[lstride]\n"
+ "fmla v17.4s, v13.4s, v0.s[2]\n"
+ "ld1 {v5.4s}, [x0]\n"
+ "fmla v16.4s, v14.4s, v0.s[3]\n"
+ "add x0, x0, %[lstride]\n"
+ "fmla v17.4s, v15.4s, v0.s[3]\n"
+ "ld1 {v6.4s}, [x0]\n"
+ "fmla v18.4s, v12.4s, v1.s[2]\n"
+ "add x0, x0, %[lstride]\n"
+ "fmla v19.4s, v13.4s, v1.s[2]\n"
+ "ld1 {v7.4s}, [x0]\n"
+ "fmla v18.4s, v14.4s, v1.s[3]\n"
+ "add x0, x0, %[lstride]\n"
+ "fmla v19.4s, v15.4s, v1.s[3]\n"
+ "fmla v20.4s, v12.4s, v2.s[2]\n"
+ "fmla v21.4s, v13.4s, v2.s[2]\n"
+ "fmla v20.4s, v14.4s, v2.s[3]\n"
+ "fmla v21.4s, v15.4s, v2.s[3]\n"
+ "fmla v22.4s, v12.4s, v3.s[2]\n"
+ "fmla v23.4s, v13.4s, v3.s[2]\n"
+ "fmla v22.4s, v14.4s, v3.s[3]\n"
+ "fmla v23.4s, v15.4s, v3.s[3]\n"
+
+ "mov x0, %[lhs_ptr]\n"
+ "add %[lhs_ptr], %[lhs_ptr], #16\n"
+
+ "fmla v24.4s, v8.4s, v4.s[0]\n"
+ "fmla v25.4s, v9.4s, v4.s[0]\n"
+ "ld1 {v0.4s}, [x0]\n"
+ "fmla v24.4s, v10.4s, v4.s[1]\n"
+ "add x0, x0, %[lstride]\n"
+ "fmla v25.4s, v11.4s, v4.s[1]\n"
+ "ld1 {v1.4s}, [x0]\n"
+ "fmla v26.4s, v8.4s, v5.s[0]\n"
+ "add x0, x0, %[lstride]\n"
+ "fmla v27.4s, v9.4s, v5.s[0]\n"
+ "ld1 {v2.4s}, [x0]\n"
+ "fmla v26.4s, v10.4s, v5.s[1]\n"
+ "add x0, x0, %[lstride]\n"
+ "fmla v27.4s, v11.4s, v5.s[1]\n"
+ "ld1 {v3.4s}, [x0]\n"
+ "fmla v28.4s, v8.4s, v6.s[0]\n"
+ "add x0, x0, %[lstride]\n"
+ "fmla v29.4s, v9.4s, v6.s[0]\n"
+ "fmla v28.4s, v10.4s, v6.s[1]\n"
+ "fmla v29.4s, v11.4s, v6.s[1]\n"
+ "fmla v30.4s, v8.4s, v7.s[0]\n"
+ "fmla v31.4s, v9.4s, v7.s[0]\n"
+ "fmla v30.4s, v10.4s, v7.s[1]\n"
+ "fmla v31.4s, v11.4s, v7.s[1]\n"
+
+ "mov x1, %[rhs_ptr]\n"
+ "add %[rhs_ptr], %[rhs_ptr], %[rstep]\n"
+
+ "fmla v24.4s, v12.4s, v4.s[2]\n"
+ "fmla v25.4s, v13.4s, v4.s[2]\n"
+ "ld1 {v8.4s, v9.4s}, [x1]\n"
+ "fmla v24.4s, v14.4s, v4.s[3]\n"
+ "add x1, x1, %[rstride]\n"
+ "fmla v25.4s, v15.4s, v4.s[3]\n"
+ "ld1 {v10.4s, v11.4s}, [x1]\n"
+ "fmla v26.4s, v12.4s, v5.s[2]\n"
+ "add x1, x1, %[rstride]\n"
+ "fmla v27.4s, v13.4s, v5.s[2]\n"
+ "fmla v26.4s, v14.4s, v5.s[3]\n"
+ "fmla v27.4s, v15.4s, v5.s[3]\n"
+ "fmla v28.4s, v12.4s, v6.s[2]\n"
+ "fmla v29.4s, v13.4s, v6.s[2]\n"
+ "fmla v28.4s, v14.4s, v6.s[3]\n"
+ "fmla v29.4s, v15.4s, v6.s[3]\n"
+ "fmla v30.4s, v12.4s, v7.s[2]\n"
+ "fmla v31.4s, v13.4s, v7.s[2]\n"
+ "subs %w[nk], %w[nk], #1\n"
+ "fmla v30.4s, v14.4s, v7.s[3]\n"
+ "fmla v31.4s, v15.4s, v7.s[3]\n"
+ "bne 1b\n"
+
+ "fmla v16.4s, v8.4s, v0.s[0]\n"
+ "fmla v17.4s, v9.4s, v0.s[0]\n"
+ "fmla v16.4s, v10.4s, v0.s[1]\n"
+ "fmla v17.4s, v11.4s, v0.s[1]\n"
+ "fmla v18.4s, v8.4s, v1.s[0]\n"
+ "fmla v19.4s, v9.4s, v1.s[0]\n"
+ "fmla v18.4s, v10.4s, v1.s[1]\n"
+ "fmla v19.4s, v11.4s, v1.s[1]\n"
+ "ld1 {v12.4s, v13.4s}, [x1]\n"
+ "fmla v20.4s, v8.4s, v2.s[0]\n"
+ "add x1, x1, %[rstride]\n"
+ "fmla v21.4s, v9.4s, v2.s[0]\n"
+ "ld1 {v14.4s, v15.4s}, [x1]\n"
+ "fmla v20.4s, v10.4s, v2.s[1]\n"
+ "add x1, x1, %[rstride]\n"
+ "fmla v21.4s, v11.4s, v2.s[1]\n"
+ "fmla v22.4s, v8.4s, v3.s[0]\n"
+ "fmla v23.4s, v9.4s, v3.s[0]\n"
+ "fmla v22.4s, v10.4s, v3.s[1]\n"
+ "fmla v23.4s, v11.4s, v3.s[1]\n"
+
+ "ld1 {v4.4s}, [x0]\n"
+ "fmla v16.4s, v12.4s, v0.s[2]\n"
+ "add x0, x0, %[lstride]\n"
+ "fmla v17.4s, v13.4s, v0.s[2]\n"
+ "ld1 {v5.4s}, [x0]\n"
+ "fmla v16.4s, v14.4s, v0.s[3]\n"
+ "add x0, x0, %[lstride]\n"
+ "fmla v17.4s, v15.4s, v0.s[3]\n"
+ "ld1 {v6.4s}, [x0]\n"
+ "fmla v18.4s, v12.4s, v1.s[2]\n"
+ "add x0, x0, %[lstride]\n"
+ "fmla v19.4s, v13.4s, v1.s[2]\n"
+ "ld1 {v7.4s}, [x0]\n"
+ "fmla v18.4s, v14.4s, v1.s[3]\n"
+ "add x0, x0, %[lstride]\n"
+ "fmla v19.4s, v15.4s, v1.s[3]\n"
+ "fmla v20.4s, v12.4s, v2.s[2]\n"
+ "fmla v21.4s, v13.4s, v2.s[2]\n"
+ "fmla v20.4s, v14.4s, v2.s[3]\n"
+ "fmla v21.4s, v15.4s, v2.s[3]\n"
+ "fmla v22.4s, v12.4s, v3.s[2]\n"
+ "fmla v23.4s, v13.4s, v3.s[2]\n"
+ "fmla v22.4s, v14.4s, v3.s[3]\n"
+ "fmla v23.4s, v15.4s, v3.s[3]\n"
+
+ "mov x0, %[res_ptr]\n"
+ "fmla v24.4s, v8.4s, v4.s[0]\n"
+ "fmla v25.4s, v9.4s, v4.s[0]\n"
+ "st1 {v16.4s, v17.4s}, [x0]\n"
+ "add x0, x0, %[estride]\n"
+ "fmla v24.4s, v10.4s, v4.s[1]\n"
+ "fmla v25.4s, v11.4s, v4.s[1]\n"
+ "st1 {v18.4s, v19.4s}, [x0]\n"
+ "add x0, x0, %[estride]\n"
+ "fmla v26.4s, v8.4s, v5.s[0]\n"
+ "fmla v27.4s, v9.4s, v5.s[0]\n"
+ "st1 {v20.4s, v21.4s}, [x0]\n"
+ "add x0, x0, %[estride]\n"
+ "fmla v26.4s, v10.4s, v5.s[1]\n"
+ "fmla v27.4s, v11.4s, v5.s[1]\n"
+ "st1 {v22.4s, v23.4s}, [x0]\n"
+ "add x0, x0, %[estride]\n"
+ "fmla v28.4s, v8.4s, v6.s[0]\n"
+ "fmla v29.4s, v9.4s, v6.s[0]\n"
+ "fmla v28.4s, v10.4s, v6.s[1]\n"
+ "fmla v29.4s, v11.4s, v6.s[1]\n"
+ "fmla v30.4s, v8.4s, v7.s[0]\n"
+ "fmla v31.4s, v9.4s, v7.s[0]\n"
+ "fmla v30.4s, v10.4s, v7.s[1]\n"
+ "fmla v31.4s, v11.4s, v7.s[1]\n"
+
+ "fmla v24.4s, v12.4s, v4.s[2]\n"
+ "fmla v25.4s, v13.4s, v4.s[2]\n"
+ "fmla v24.4s, v14.4s, v4.s[3]\n"
+ "fmla v25.4s, v15.4s, v4.s[3]\n"
+ "fmla v26.4s, v12.4s, v5.s[2]\n"
+ "fmla v27.4s, v13.4s, v5.s[2]\n"
+ "st1 {v24.4s, v25.4s}, [x0]\n"
+ "add x0, x0, %[estride]\n"
+ "fmla v26.4s, v14.4s, v5.s[3]\n"
+ "fmla v27.4s, v15.4s, v5.s[3]\n"
+ "fmla v28.4s, v12.4s, v6.s[2]\n"
+ "fmla v29.4s, v13.4s, v6.s[2]\n"
+ "st1 {v26.4s, v27.4s}, [x0]\n"
+ "add x0, x0, %[estride]\n"
+ "fmla v28.4s, v14.4s, v6.s[3]\n"
+ "fmla v29.4s, v15.4s, v6.s[3]\n"
+ "fmla v30.4s, v12.4s, v7.s[2]\n"
+ "fmla v31.4s, v13.4s, v7.s[2]\n"
+ "st1 {v28.4s, v29.4s}, [x0]\n"
+ "add x0, x0, %[estride]\n"
+ "fmla v30.4s, v14.4s, v7.s[3]\n"
+ "fmla v31.4s, v15.4s, v7.s[3]\n"
+ "st1 {v30.4s, v31.4s}, [x0]\n"
+ :[lhs_ptr] "+r" (lhs_ptr), [rhs_ptr] "+r" (rhs_ptr), [res_ptr] "+r" (res_ptr),
+ [nk] "+r" (nk)
+ : [lstride] "r" (lstride), [rstride] "r" (rstride), [estride] "r" (estride), [rstep] "r"
+(rstep)
+ : "x0", "x1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+ "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18",
+ "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"
+ );
+}*/
+
+/**
+ * Straightforward (non-optimised) 2-D convolution over channel-innermost data.
+ *
+ * Indexing used below:
+ *   input  : [row][col][channel]
+ *   filter : [kernel_row][kernel_col][in_channel][out_channel]
+ *   output : [row][col][out_channel]
+ *
+ * Kernel taps that fall outside the input are skipped, so they add nothing to the sum.
+ *
+ * @param input   source tensor; w, h, c and data are read
+ * @param output  destination tensor; w, h, c must already be set, data is overwritten
+ * @param filter  weights; only data is read
+ * @param params  kernel size, stride, padding and dilation
+ */
+static void direct_conv_colmajor(convMat_t *input, convMat_t *output, convMat_t *filter,
+                                 convParams_t *params)
+{
+  const float *src = input->data;
+  const float *weights = filter->data;
+  float *dst = output->data;
+
+  const int in_w = input->w;
+  const int in_h = input->h;
+  const int in_ch = input->c;
+  const int out_w = output->w;
+  const int out_h = output->h;
+  const int out_ch = output->c;
+
+  const int k_w = params->kernel_w;
+  const int k_h = params->kernel_h;
+  const int step_x = params->stride_w;
+  const int step_y = params->stride_h;
+  const int left = params->pad_w;
+  const int top = params->pad_h;
+  const int dil_x = params->dilation_w;
+  const int dil_y = params->dilation_h;
+
+  for (int oy = 0; oy < out_h; ++oy)
+  {
+    const int y_origin = oy * step_y - top;
+
+    for (int ox = 0; ox < out_w; ++ox)
+    {
+      const int x_origin = ox * step_x - left;
+
+      for (int oc = 0; oc < out_ch; ++oc)
+      {
+        float acc = 0.f;
+
+        for (int ky = 0; ky < k_h; ++ky)
+        {
+          const int iy = y_origin + ky * dil_y;
+          // One unsigned compare rejects both iy < 0 and iy >= in_h.
+          if ((unsigned int)iy >= (unsigned int)in_h)
+            continue;
+
+          for (int kx = 0; kx < k_w; ++kx)
+          {
+            const int ix = x_origin + kx * dil_x;
+            if ((unsigned int)ix >= (unsigned int)in_w)
+              continue;
+
+            for (int ic = 0; ic < in_ch; ++ic)
+            {
+              const float in_val = src[(iy * in_w + ix) * in_ch + ic];
+              const float w_val = weights[((ky * k_w + kx) * in_ch + ic) * out_ch + oc];
+              acc += (in_val * w_val);
+            }
+          }
+        }
+
+        dst[(oy * out_w + ox) * out_ch + oc] = acc;
+      }
+    }
+  }
+}
+
+/**
+ * Naive column-major matrix multiply: C (m x n) = op(A) (m x k) * op(B) (k x n).
+ *
+ * Element (i, l) of the m x k left operand is read from index l * m + i, element (l, j) of
+ * the k x n right operand from index j * k + l, and C(i, j) is written to index j * m + i.
+ *
+ * A non-zero Atrans / Btrans means the corresponding operand is supplied in the transposed
+ * arrangement; it is first copied into a temporary buffer with the layout described above.
+ *
+ * The temporary copies are released before returning (the previous version leaked them).
+ * If a temporary cannot be allocated the function returns without touching C.
+ *
+ * @param Atrans  non-zero if A must be transposed first
+ * @param Btrans  non-zero if B must be transposed first
+ * @param m       rows of C
+ * @param n       columns of C
+ * @param k       inner dimension
+ * @param A       left operand, not modified
+ * @param B       right operand, not modified
+ * @param C       result, m * n floats, overwritten
+ */
+static void direct_sgemm_colmajor(int Atrans, int Btrans, int m, int n, int k, float *A, float *B,
+                                  float *C)
+{
+  float *a_copy = NULL; // owned scratch, only set when A has to be transposed
+  float *b_copy = NULL; // owned scratch, only set when B has to be transposed
+  const float *aa = A;
+  const float *bb = B;
+
+  if (Atrans)
+  {
+    // Widen before multiplying so the byte count is not computed in int.
+    a_copy = (float *)malloc((size_t)m * k * sizeof(float));
+    if (!a_copy)
+      return;
+
+    for (int i = 0; i < k; i++)
+    {
+      for (int j = 0; j < m; j++)
+      {
+        a_copy[i * m + j] = A[j * k + i];
+      }
+    }
+    aa = a_copy;
+  }
+
+  if (Btrans)
+  {
+    b_copy = (float *)malloc((size_t)n * k * sizeof(float));
+    if (!b_copy)
+    {
+      free(a_copy); // do not leak the A scratch on this error path
+      return;
+    }
+
+    for (int i = 0; i < n; i++)
+    {
+      for (int j = 0; j < k; j++)
+      {
+        b_copy[i * k + j] = B[j * n + i];
+      }
+    }
+    bb = b_copy;
+  }
+
+  for (int i = 0; i < m; i++)
+  {
+    for (int j = 0; j < n; j++)
+    {
+      float res = 0.f;
+      for (int l = 0; l < k; l++)
+      {
+        res += bb[j * k + l] * aa[l * m + i];
+      }
+      C[j * m + i] = res;
+    }
+  }
+
+  // free(NULL) is a no-op, so no need to test which copies were made.
+  free(b_copy);
+  free(a_copy);
+}
+
+#if 0 // Disabled earlier variant of test_sgemm: times a blocked row-major GEMM built from the pack_* / macro-kernel helpers.
+static int test_sgemm(int m, int n, int k, int loops)
+{
+ struct timeval start, end;
+ float total_time = 0.f;
+
+ const int mb = 180; // block size along m
+ const int nb = 1440; // block size along n
+ const int kb = 512; // block size along k
+
+ const int mr = 4; // passed to the LHS packer and macro kernel; presumably the micro-tile height - confirm
+ const int nr = 12; // passed to the RHS packer and macro kernel; presumably the micro-tile width - confirm
+
+#if 0
+ const int pm = (m + mr - 1) / mr * mr;
+ const int pn = (n + nr - 1) / nr * nr;
+ const int pk = k;
+#else
+ const int pm = (mb + mr - 1) / mr * mr; // one block of rows rounded up to a multiple of mr
+ const int pn = (nb + nr - 1) / nr * nr; // one block of columns rounded up to a multiple of nr
+ const int pk = kb;
+#endif
+ const int nm = (m + mb - 1) / mb; // number of blocks along each dimension (rounded up)
+ const int nn = (n + nb - 1) / nb;
+ const int nk = (k + kb - 1) / kb;
+
+ const int rm = m % mb; // size of the last, partial block (0 means the last block is full)
+ const int rn = n % nb;
+ const int rk = k % kb;
+
+ float *A = (float *)malloc(m * k * sizeof(float));
+ if(!A) return 0;
+
+ for(int i = 0 ; i < m * k; i++)
+ {
+ A[i] = 0.001 + i * 0.000001;
+ }
+
+ float *B = (float *)malloc(k * n * sizeof(float));
+ if(!B) return 0; // NOTE(review): A is not freed on this path
+
+ for(int i = 0 ; i < n * k; i++)
+ {
+ B[i] = 0.001 - i * 0.000001;
+ }
+
+ float *C = (float *)malloc(m * n * sizeof(float));
+ if(!C) return 0; // NOTE(review): A and B are not freed on this path
+
+#if 0
+ float *PA = (float *)malloc(pm * pk * sizeof(float));
+ if(!PA) return 0;
+
+ float *PB = (float *)malloc(pk * pn * sizeof(float));
+ if(!PB) return 0;
+#else
+ float PA[pm * pk]; // packed LHS block on the stack: 180 * 512 floats (360 KiB)
+ float PB[pk * pn]; // packed RHS block on the stack: 512 * 1440 floats (about 2.8 MiB)
+#endif
+
+ for(int nloop = 0; nloop < loops; nloop++)
+
+ {
+ gettimeofday(&start, NULL);
+
+ //pack_rowmajor_notrans_lhs(mr, m, k, k, A, PA);
+ //pack_rowmajor_notrans_rhs(nr, n, k, n, B, PB);
+#if 1
+ for (int j = 0; j < nn; j++) // active order: n blocks, then k blocks, then m blocks
+ {
+ const int _nb = (j != nn - 1 || rn == 0) ? nb : rn;
+ for (int l = 0; l < nk; l++)
+ {
+ const int _kb = (l != nk - 1 || rk == 0) ? kb : rk;
+ pack_rowmajor_notrans_rhs(nr, _nb, _kb, 1, n, &B[l * kb * n + j * nb], PB); // RHS block packed once per (j, l)
+ for(int i = 0; i < nm; i++)
+ {
+ const int _mb = (i != nm - 1 || rm == 0) ? mb : rm;
+ pack_rowmajor_notrans_lhs(mr, _mb, _kb, 1, k, &A[i * mb * k + l * kb], PA);
+ sgemm_rowmajor_macro_kernel_divnm(mr, nr, _mb, _nb, _kb, PA, PB, &C[i * mb * n + j * nb], l, n, _kb);
+ //sgemm_rowmajor_macro_kernel_divnm(mr, nr, _mb, _nb, _kb, &PA[i * mb * k + l * kb], &PB[l * kb * pn + j * nb], &C[i * mb * n + j * nb], l, n, pk);
+ }
+ }
+ }
+#else
+ for (int j = 0; j < nm; j++) // alternative order: m blocks, then k blocks, then n blocks
+ {
+ const int _mb = (j != nm - 1 || rm == 0) ? mb : rm;
+ for (int l = 0; l < nk; l++)
+ {
+ const int _kb = (l != nk - 1 || rk == 0) ? kb : rk;
+ pack_rowmajor_notrans_lhs(mr, _mb, _kb, 1, k, &A[j * mb * k + l * kb], PA);
+ for(int i = 0; i < nn; i++)
+ {
+ const int _nb = (i != nn - 1 || rn == 0) ? nb : rn;
+ pack_rowmajor_notrans_rhs(nr, _nb, _kb, 1, n, &B[l * kb * n + i * nb], PB);
+ sgemm_rowmajor_macro_kernel_divmn(mr, nr, _mb, _nb, _kb, PA, PB, &C[j * mb * n + i * nb], l, n, _kb);
+ //sgemm_rowmajor_macro_kernel_divmn(mr, nr, _mb, _nb, _kb, &PA[i * mb * k + l * kb], &PB[l * kb * pn + j * nb], &C[i * mb * n + j * nb], l, n, pk);
+ }
+ }
+ }
+#endif
+ gettimeofday(&end, NULL);
+ total_time += ((end.tv_sec * 1000000 + end.tv_usec) - (start.tv_sec * 1000000 + start.tv_usec))/1000; // integer division: each sample is truncated to whole milliseconds
+ }
+
+ int div = m * n < 16 ? m * n : 16; // values printed per line
+ int num = m * n > 64 ? 64 : m * n; // how many values to dump from each end of C
+
+ float *c_ptr = &C[0];
+ for(int i = 0; i < num; i++)
+ {
+ printf("%f ", c_ptr[i]);
+ if((i + 1) % div == 0) printf("\n");
+ }
+
+ printf("\n");
+
+ c_ptr = &C[m * n - num]; // last `num` results
+ for(int i = 0; i < num; i++)
+ {
+ printf("%f ", c_ptr[i]);
+ if((i + 1) % div == 0) printf("\n");
+ }
+
+ printf("\n");
+
+ long long total_size = (long long)m *n * k * 2; // one multiply and one add per (i, j, l) triple
+ printf("AVER Time consuming: %.2fms, total size: %lld, (GFLOP: %.2f)\n", total_time / loops , total_size, (double)total_size/(total_time / loops)/1000000);
+
+ free(A);
+ free(B);
+ free(C);
+
+ //free(PA);
+ //free(PB);
+
+} // NOTE(review): no return statement although the function is declared to return int
+#endif
+
+/**
+ * Milliseconds elapsed between two gettimeofday() samples, including the fractional part.
+ *
+ * The fields are subtracted before scaling: multiplying tv_sec by 1000000 directly
+ * overflows on targets where it is 32 bits wide.
+ */
+static float sgemm_elapsed_ms(const struct timeval &begin, const struct timeval &finish)
+{
+  const double sec = (double)(finish.tv_sec - begin.tv_sec);
+  const double usec = (double)(finish.tv_usec - begin.tv_usec);
+  return (float)(sec * 1000.0 + usec / 1000.0);
+}
+
+/**
+ * Time a column-major m x n x k SGEMM and print a sample of the result.
+ *
+ * A and B are filled with a deterministic ramp. After the timed loop the first and last
+ * (up to 64) values of C are printed, followed by the average time per iteration and the
+ * derived throughput figure.
+ *
+ * @param m, n, k  problem size
+ * @param type     0: direct_sgemm_colmajor reference, 1: sgemm_singlethread;
+ *                 any other value runs nothing and C stays zero
+ * @param loops    number of timed iterations
+ * @return always 0 (also when an allocation fails)
+ */
+static int test_sgemm(int m, int n, int k, int type, int loops)
+{
+  struct timeval start, end;
+  float total_time = 0.f;
+
+  float *A = (float *)malloc((size_t)m * k * sizeof(float));
+  float *B = (float *)malloc((size_t)k * n * sizeof(float));
+  // Zero-initialised so an unsupported `type` prints zeros instead of indeterminate memory.
+  float *C = (float *)calloc((size_t)m * n, sizeof(float));
+  if (!A || !B || !C)
+  {
+    // Release whatever was obtained; free(NULL) is a no-op.
+    free(A);
+    free(B);
+    free(C);
+    return 0;
+  }
+
+  for (int i = 0; i < m * k; i++)
+  {
+    A[i] = 0.001 + i * 0.001; // i * 0.000001;
+  }
+
+  for (int i = 0; i < n * k; i++)
+  {
+    B[i] = 0.001 - i * 0.001; // - i * 0.000001;
+  }
+
+  for (int nloop = 0; nloop < loops; nloop++)
+  {
+    gettimeofday(&start, NULL);
+
+    if (type == 0)
+    {
+      // direct_sgemm_rowmajor(notrans, notrans, m, n, k, A, B, C);
+      direct_sgemm_colmajor(notrans, notrans, m, n, k, A, B, C);
+    }
+    else if (type == 1)
+    {
+      class sgemm_singlethread my_gemm(colMajor, notrans, notrans, m, n, k, A, B, C, 1);
+      my_gemm.run();
+    }
+    // type 2 used the hand-written direct_sgemm_kernel, which is commented out in this file.
+
+    gettimeofday(&end, NULL);
+    total_time += sgemm_elapsed_ms(start, end);
+  }
+
+  const int div = m * n < 16 ? m * n : 16; // values printed per line
+  const int num = m * n > 64 ? 64 : m * n; // how many values to dump from each end of C
+
+  float *c_ptr = &C[0];
+  for (int i = 0; i < num; i++)
+  {
+    printf("%f ", c_ptr[i]);
+    if ((i + 1) % div == 0)
+      printf("\n");
+  }
+
+  printf("\n");
+
+  c_ptr = &C[m * n - num]; // last `num` results
+  for (int i = 0; i < num; i++)
+  {
+    printf("%f ", c_ptr[i]);
+    if ((i + 1) % div == 0)
+      printf("\n");
+  }
+
+  printf("\n");
+
+  // One multiply and one add per (i, j, l) triple.
+  long long total_size = (long long)m * n * k * 2;
+  printf("AVER Time consuming: %.2fms, total size: %lld, (GFLOP: %.2f)\n", total_time / loops,
+         total_size, (double)total_size / (total_time / loops) / 1000000);
+
+  free(A);
+  free(B);
+  free(C);
+
+  return 0;
+}
+
+/**
+ * Reorder a 4-D weight array from [H][W][C][N] to [N][C][H][W].
+ *
+ * @param out  destination, H * W * C * N floats
+ * @param in   source, H * W * C * N floats
+ * @param H    first (outermost) source dimension
+ * @param W    second source dimension
+ * @param C    third source dimension
+ * @param N    fourth (innermost) source dimension
+ */
+void weight_tensorflow2caffe(float *out, float *in, int H, int W, int C, int N)
+{
+  const int plane = H * W;
+  int src = 0; // the source is walked linearly because the loops follow its storage order
+
+  for (int h = 0; h < H; ++h)
+  {
+    for (int w = 0; w < W; ++w)
+    {
+      const int spatial = h * W + w;
+      for (int c = 0; c < C; ++c)
+      {
+        for (int n = 0; n < N; ++n)
+        {
+          const int dst = (n * C + c) * plane + spatial;
+          out[dst] = in[src];
+          ++src;
+        }
+      }
+    }
+  }
+}
+
+/**
+ * Transform convolution weights into the winograd domain.
+ *
+ * Steps, as performed below:
+ *   1. pick M, N and the G matrix for the kernel size (3x3, otherwise the 5x5 parameters);
+ *   2. build kronecker_product(G, G) into an (M * M) x (N * N) scratch matrix;
+ *   3. reorder the weights with weight_tensorflow2caffe (HWCN -> NCHW);
+ *   4. multiply the two with sgemm_singlethread into a freshly allocated
+ *      (M * M) x (channels * num_output) buffer.
+ *
+ * Allocations use the non-throwing form of new[] so the NULL checks are effective (plain
+ * new[] throws instead of returning NULL), and every failure path releases what was
+ * already allocated.
+ *
+ * NOTE(review): any kernel size other than 3 is treated as 5x5, and _kernel.h is used for
+ * both kernel dimensions - callers presumably only pass square 3x3 / 5x5 kernels; confirm.
+ *
+ * @param _kernel          weights; h, w, c, n and data are read
+ * @param winograd_weight  receives the new[]-allocated result, which the caller must
+ *                         delete[]; set to NULL if an allocation fails
+ */
+void trans_weight2winograd(const convMat_t &_kernel, float **winograd_weight)
+{
+  const int kernel_size = _kernel.h;
+  const int channels = _kernel.c;
+  const int num_output = _kernel.n;
+
+  // Make failure observable to the caller regardless of which allocation fails.
+  *winograd_weight = NULL;
+
+  /*Step 1: transfer weight to winograd domain*/
+  const double *G;
+  int M, N;
+  if (kernel_size == 3)
+  {
+    M = winograd_para_3x3s1::M;
+    N = winograd_para_3x3s1::N;
+    G = winograd_para_3x3s1::getG();
+  }
+  else
+  {
+    M = winograd_para_5x5s1::M;
+    N = winograd_para_5x5s1::N;
+    G = winograd_para_5x5s1::getG();
+  }
+
+  const int tile_h_in_ = M;
+  const int tile_w_in_ = M;
+
+  float *winograd_g = new (std::nothrow) float[M * M * N * N];
+  if (NULL == winograd_g)
+    return;
+  kronecker_product(winograd_g, G, G, M, N, M, N);
+
+  float *result = new (std::nothrow) float[tile_h_in_ * tile_w_in_ * channels * num_output];
+  if (NULL == result)
+  {
+    delete[] winograd_g;
+    return;
+  }
+
+  float *weight_data_tran =
+    new (std::nothrow) float[_kernel.h * _kernel.w * _kernel.c * _kernel.n];
+  if (NULL == weight_data_tran)
+  {
+    delete[] result;
+    delete[] winograd_g;
+    return;
+  }
+  weight_tensorflow2caffe(weight_data_tran, _kernel.data, kernel_size, kernel_size, channels,
+                          num_output);
+
+  class sgemm_singlethread sgemm(rowMajor, notrans, trans, tile_h_in_ * tile_w_in_,
+                                 channels * num_output, kernel_size * kernel_size, winograd_g,
+                                 weight_data_tran, result, 1);
+
+  sgemm.run();
+
+  // Both scratch buffers are only inputs to the multiplication above.
+  delete[] weight_data_tran;
+  delete[] winograd_g;
+
+  *winograd_weight = result;
+}
+
+/**
+ * Milliseconds elapsed between two gettimeofday() samples, including the fractional part.
+ *
+ * The fields are subtracted before scaling: multiplying tv_sec by 1000000 directly
+ * overflows on targets where it is 32 bits wide.
+ */
+static float conv_elapsed_ms(const struct timeval &begin, const struct timeval &finish)
+{
+  const double sec = (double)(finish.tv_sec - begin.tv_sec);
+  const double usec = (double)(finish.tv_usec - begin.tv_usec);
+  return (float)(sec * 1000.0 + usec / 1000.0);
+}
+
+/**
+ * Time one convolution configuration and print a sample of the result.
+ *
+ * Input and filter are filled with deterministic ramps, the selected implementation is run
+ * `loops` times, then the first and last (up to 64) output values are printed together
+ * with the average time per iteration and the derived throughput figure.
+ *
+ * @param w, h         input width / height
+ * @param kernel_size  square kernel size
+ * @param stride       stride used in both directions
+ * @param inch         input channels
+ * @param outch        output channels
+ * @param padding      non-zero: pad so that the output size is ceil(size / stride);
+ *                     zero: no padding
+ * @param conv_type    0: direct_conv_rowmajor, 1: srcn_convolution2D,
+ *                     2: srcn_convolution2D with winograd weights,
+ *                     3: srcn_sparse_convolution2D; any other value runs nothing
+ * @param thread_num   forwarded to the srcn_* entry points
+ * @param loops        number of timed iterations
+ * @return always 0 (also when an allocation fails)
+ */
+static int test_conv(const int w, const int h, const int kernel_size, const int stride,
+                     const int inch, const int outch, const int padding, const int conv_type,
+                     const int thread_num, const int loops)
+{
+  struct timeval start, end;
+  float total_time = 0.f;
+
+  // Only the removed CPU+DPU experiment accumulated into this; it is still part of the
+  // summary line printed at the end, so it is kept and always reports 0.
+  const float total_time1 = 0.f;
+
+  const int dilation = 1;
+  const int kernel_dilation = dilation * (kernel_size - 1) + 1;
+
+  int pad_l = 0, pad_r = 0, pad_t = 0, pad_b = 0;
+  if (padding)
+  {
+    const int pad_w = kernel_dilation + (w - 1) / stride * stride - w;
+    const int pad_h = kernel_dilation + (h - 1) / stride * stride - h;
+    pad_l = pad_w / 2;
+    pad_r = pad_w - pad_l;
+    pad_t = pad_h / 2;
+    pad_b = pad_h - pad_t;
+  }
+
+  convMat_t input;
+  convMat_t output;
+  convMat_t filter;
+  convParams_t params;
+
+  input.w = w;
+  input.h = h;
+  input.c = inch;
+  input.n = 1;
+#ifdef NCNN
+  input.data =
+    (float *)malloc(alignSize(input.w * input.h, 16 / sizeof(float)) * input.c * sizeof(float));
+#else
+  input.data = (float *)malloc(input.w * input.h * input.c * sizeof(float));
+#endif
+  if (!input.data)
+    return 0;
+
+  output.w = (w + pad_l + pad_r - kernel_dilation) / stride + 1;
+  output.h = (h + pad_t + pad_b - kernel_dilation) / stride + 1;
+  output.c = outch;
+  output.n = 1;
+#ifdef NCNN
+  output.data = (float *)malloc(alignSize(output.w * output.h, 16 / sizeof(float)) * output.c *
+                                sizeof(float));
+#else
+  output.data = (float *)malloc(output.w * output.h * output.c * sizeof(float));
+#endif
+  if (!output.data)
+  {
+    free(input.data); // do not leak the buffer obtained above
+    return 0;
+  }
+
+  filter.w = kernel_size;
+  filter.h = kernel_size;
+  filter.c = inch;
+  filter.n = outch;
+  filter.data = (float *)malloc(filter.w * filter.h * filter.c * filter.n * sizeof(float));
+  if (!filter.data)
+  {
+    free(output.data);
+    free(input.data);
+    return 0;
+  }
+
+  for (int i = 0; i < output.w * output.h * output.c; i++)
+  {
+    output.data[i] = 0;
+  }
+
+  for (int i = 0; i < input.w * input.h * input.c; i++)
+  {
+    input.data[i] = 0.001 + i * 0.000001;
+  }
+
+#if 1
+  for (int i = 0; i < filter.w * filter.h * filter.c * filter.n; i++)
+  {
+    filter.data[i] = 0.001 - i * 0.000001;
+  }
+#else
+  // Alternative fill: only every 15th weight is non-zero.
+  for (int i = 0; i < filter.w * filter.h * filter.c * filter.n; i++)
+  {
+    if ((i + 1) % 15 == 0)
+      filter.data[i] = 0.001 - i * 0.000001;
+    else
+      filter.data[i] = 0;
+  }
+#endif
+
+  params.kernel_w = kernel_size;
+  params.kernel_h = kernel_size;
+  params.stride_w = stride;
+  params.stride_h = stride;
+  params.padding = padding;
+  params.pad_w = pad_l;
+  params.pad_h = pad_t;
+  params.dilation_w = dilation;
+  params.dilation_h = dilation;
+
+  // Dimensions of the equivalent matrix multiplication; used for the sample dump and the
+  // operation count below.
+  const int m = output.c;
+  const int n = output.w * output.h;
+  const int k = params.kernel_h * params.kernel_w * input.c;
+
+  if (conv_type == 0)
+  {
+    for (int nloop = 0; nloop < loops; nloop++)
+    {
+      gettimeofday(&start, NULL);
+
+      direct_conv_rowmajor(&input, &output, &filter, &params);
+      // direct_conv_colmajor(&input, &output, &filter, &params);
+
+      gettimeofday(&end, NULL);
+      total_time += conv_elapsed_ms(start, end);
+    }
+  }
+  else if (conv_type == 1)
+  {
+    for (int nloop = 0; nloop < loops; nloop++)
+    {
+      gettimeofday(&start, NULL);
+
+      srcn_convolution2D(input, filter, output, params, NULL, thread_num, row_major);
+
+      gettimeofday(&end, NULL);
+      total_time += conv_elapsed_ms(start, end);
+    }
+  }
+  else if (conv_type == 2)
+  {
+    winogradParams_t wparams = {params.kernel_w,
+                                params.kernel_h,
+                                params.stride_w,
+                                params.stride_h,
+                                params.dilation_w,
+                                params.dilation_h,
+                                1,
+                                w,
+                                h,
+                                input.c,
+                                output.c,
+                                thread_num,
+                                col_major,
+                                filter.data};
+    // The weight transform is done once, outside the timed loop.
+    // NOTE(review): winograd_weight is never released here; the matching release API is
+    // not visible in this file - confirm who owns it.
+    float *winograd_weight = trans_weight2winograd(wparams);
+
+    for (int nloop = 0; nloop < loops; nloop++)
+    {
+      gettimeofday(&start, NULL);
+
+      srcn_convolution2D(input, filter, output, params, winograd_weight, thread_num, row_major);
+
+      gettimeofday(&end, NULL);
+      total_time += conv_elapsed_ms(start, end);
+    }
+  }
+  else if (conv_type == 3)
+  {
+    // The sparse weight conversion is done once, outside the timed loop.
+    void *sparse_weight = trans_weight2sparse(filter);
+
+    for (int nloop = 0; nloop < loops; nloop++)
+    {
+      gettimeofday(&start, NULL);
+
+      srcn_sparse_convolution2D(input, output, params, sparse_weight, thread_num, row_major);
+
+      gettimeofday(&end, NULL);
+      total_time += conv_elapsed_ms(start, end);
+    }
+
+    sparse_release(outch, sparse_weight);
+  }
+  // conv_type 4 (GPU), 5 (CPU + DPU) and 6 (GPU direct convolution) existed only as
+  // commented-out code and are not implemented in this file: for those values nothing is
+  // run and the zero-initialised output is printed.
+
+  const int div = m * n < 16 ? m * n : 16; // values printed per line
+  const int num = m * n > 64 ? 64 : m * n; // how many values to dump from each end
+
+  if (conv_type < 4)
+    printf("[CPU RESULT]\n");
+  else if (conv_type == 4)
+    printf("[GPU RESULT]\n");
+  else if (conv_type == 5)
+    printf("[DPU RESULT]\n");
+
+  float *c_ptr = output.data;
+  for (int i = 0; i < num; i++)
+  {
+    printf("%f ", c_ptr[i]);
+    if ((i + 1) % div == 0)
+      printf("\n");
+  }
+
+  printf("\n");
+
+  c_ptr = &output.data[m * n - num]; // last `num` results
+  for (int i = 0; i < num; i++)
+  {
+    printf("%f ", c_ptr[i]);
+    if ((i + 1) % div == 0)
+      printf("\n");
+  }
+
+  printf("\n");
+
+  // One multiply and one add per (m, n, k) triple.
+  long long total_size = (long long)m * n * k * 2;
+  printf(
+    "AVER Time consuming: %.2fms, CPU Time consuming: %.2fms, total size: %lld, (GFLOP: %.2f)\n",
+    total_time / loops, total_time1 / loops, total_size,
+    (double)total_size / (total_time / loops) / 1000000);
+
+  free(input.data);
+  free(output.data);
+  free(filter.data);
+
+  return 0;
+}
+
+/**
+ * @brief Benchmark one 2D deconvolution configuration and dump part of the result.
+ *
+ * Fills the input and filter with deterministic ramps, runs the implementation selected by
+ * conv_type `loops` times, prints the first and last (up to) 64 output values and reports the
+ * average wall-clock time per run.
+ *
+ * @param w            Input width
+ * @param h            Input height
+ * @param kernel_size  Width and height of the square kernel
+ * @param stride       Stride used in both directions
+ * @param inch         Number of input channels
+ * @param outch        Number of output channels
+ * @param padding      Zero for no padding; non-zero pads a total of (kernel_dilation - 1)
+ *                     per axis, split between the two sides
+ * @param conv_type    0: direct_deconv_rowmajor, 1: srcn_deconvolution2D; other values run nothing
+ * @param thread_num   Forwarded to srcn_deconvolution2D
+ * @param loops        Number of timed runs; must be positive
+ * @return 0 on success, -1 on invalid sizes/loops or allocation failure
+ */
+static int test_deconv(const int w, const int h, const int kernel_size, const int stride,
+                       const int inch, const int outch, const int padding, const int conv_type,
+                       const int thread_num, const int loops)
+{
+  // The report divides by loops, so a non-positive count cannot yield a meaningful average.
+  if (loops <= 0)
+  {
+    fprintf(stderr, "test_deconv: loops must be positive (got %d)\n", loops);
+    return -1;
+  }
+
+  struct timeval start, end;
+  float total_time = 0.f;
+
+  const int dilation = 1;
+  const int kernel_dilation = dilation * (kernel_size - 1) + 1;
+
+  convMat_t input;
+  convMat_t output;
+  convMat_t filter;
+  convParams_t params;
+
+  int pad_l, pad_r, pad_t, pad_b;
+  if (padding)
+  {
+    const int pad_w = kernel_dilation - 1;
+    const int pad_h = kernel_dilation - 1;
+    pad_l = pad_w / 2;
+    pad_r = pad_w - pad_l;
+    pad_t = pad_h / 2;
+    pad_b = pad_h - pad_t;
+  }
+  else
+  {
+    pad_l = pad_r = pad_t = pad_b = 0;
+  }
+
+  input.w = w;
+  input.h = h;
+  input.c = inch;
+  // Single image; set explicitly so the struct is not passed around with an indeterminate field.
+  input.n = 1;
+
+  output.w = stride * (w - 1) + kernel_dilation - (pad_l + pad_r);
+  output.h = stride * (h - 1) + kernel_dilation - (pad_t + pad_b);
+  output.c = outch;
+  output.n = 1;
+
+  filter.w = kernel_size;
+  filter.h = kernel_size;
+  filter.c = outch;
+  filter.n = inch;
+
+  const int input_size = input.w * input.h * input.c;
+  const int output_size = output.w * output.h * output.c;
+  const int filter_size = filter.w * filter.h * filter.c * filter.n;
+  if (input_size <= 0 || output_size <= 0 || filter_size <= 0)
+  {
+    fprintf(stderr, "test_deconv: invalid tensor size (input %d, output %d, filter %d)\n",
+            input_size, output_size, filter_size);
+    return -1;
+  }
+
+  input.data = (float *)malloc(input_size * sizeof(float));
+  // Zero-filled so the dump below never reads indeterminate memory when nothing was run.
+  output.data = (float *)calloc(output_size, sizeof(float));
+  filter.data = (float *)malloc(filter_size * sizeof(float));
+  if (!input.data || !output.data || !filter.data)
+  {
+    fprintf(stderr, "test_deconv: out of memory\n");
+    // free(NULL) is a no-op, so releasing all three is safe whichever allocation failed.
+    free(input.data);
+    free(output.data);
+    free(filter.data);
+    return -1;
+  }
+
+  for (int i = 0; i < input_size; i++)
+  {
+    input.data[i] = 0.001 + i * 0.000001;
+  }
+
+  for (int i = 0; i < filter_size; i++)
+  {
+    filter.data[i] = 0.001 - i * 0.000001;
+  }
+
+  params.kernel_w = kernel_size;
+  params.kernel_h = kernel_size;
+  params.stride_w = stride;
+  params.stride_h = stride;
+  params.padding = padding;
+  params.pad_w = pad_l;
+  params.pad_h = pad_t;
+  params.dilation_w = dilation;
+  params.dilation_h = dilation;
+
+  // GEMM-equivalent problem dimensions, used only for the operation count in the report.
+  const int m = params.kernel_h * params.kernel_w * output.c;
+  const int n = input.w * input.h;
+  const int k = input.c;
+
+  if (conv_type == 0)
+  {
+    for (int nloop = 0; nloop < loops; nloop++)
+    {
+      gettimeofday(&start, NULL);
+
+      direct_deconv_rowmajor(&input, &output, &filter, &params);
+
+      gettimeofday(&end, NULL);
+      // Subtract before scaling: avoids overflowing a 32-bit tv_sec and keeps sub-ms precision.
+      total_time += (end.tv_sec - start.tv_sec) * 1000.f + (end.tv_usec - start.tv_usec) / 1000.f;
+    }
+  }
+  else if (conv_type == 1)
+  {
+    for (int nloop = 0; nloop < loops; nloop++)
+    {
+      gettimeofday(&start, NULL);
+
+      // The output is cleared before every run, inside the timed region as in the original.
+      for (int i = 0; i < output_size; i++)
+      {
+        output.data[i] = 0;
+      }
+
+      srcn_deconvolution2D(input, filter, output, params, thread_num, row_major);
+
+      gettimeofday(&end, NULL);
+      total_time += (end.tv_sec - start.tv_sec) * 1000.f + (end.tv_usec - start.tv_usec) / 1000.f;
+    }
+  }
+  else
+  {
+    fprintf(stderr, "test_deconv: unsupported conv_type %d, nothing was run\n", conv_type);
+  }
+
+  const int div = output_size < 16 ? output_size : 16;
+  const int num = output_size > 64 ? 64 : output_size;
+
+  // Head of the output, 16 values per row.
+  float *c_ptr = output.data;
+  for (int i = 0; i < num; i++)
+  {
+    printf("%f ", c_ptr[i]);
+    if ((i + 1) % div == 0)
+      printf("\n");
+  }
+
+  printf("\n");
+
+  // Tail of the output.
+  c_ptr = &output.data[output_size - num];
+  for (int i = 0; i < num; i++)
+  {
+    printf("%f ", c_ptr[i]);
+    if ((i + 1) % div == 0)
+      printf("\n");
+  }
+
+  printf("\n");
+
+  long long total_size = (long long)m * n * k * 2;
+  printf("AVER Time consuming: %.2fms, total size: %lld, (GFLOP: %.2f)\n", total_time / loops,
+         total_size, (double)total_size / (total_time / loops) / 1000000);
+
+  free(input.data);
+  free(output.data);
+  free(filter.data);
+
+  return 0;
+}
+
+/**
+ * @brief Benchmark one batched 2D convolution configuration and dump part of the result.
+ *
+ * Fills the input and filter with deterministic ramps, runs srcn_batch_convolution2D `loops`
+ * times (with or without pre-transformed winograd weights, depending on conv_type), prints the
+ * first and last (up to) 64 output values and reports the average wall-clock time per run.
+ *
+ * @param batch        Number of images in the batch
+ * @param w            Input width
+ * @param h            Input height
+ * @param kernel_size  Width and height of the square kernel
+ * @param stride       Stride used in both directions
+ * @param inch         Number of input channels
+ * @param outch        Number of output channels
+ * @param padding      Zero for no padding; non-zero derives the padding from the kernel,
+ *                     stride and input size
+ * @param conv_type    1: srcn_batch_convolution2D without winograd weights,
+ *                     2: same call with weights from trans_weight2winograd;
+ *                     other values run nothing
+ * @param thread_num   Forwarded to the convolution call (and to the winograd parameters)
+ * @param loops        Number of timed runs; must be positive
+ * @return 0 on success, -1 on invalid sizes/loops or allocation failure
+ */
+static int test_batch_conv(const int batch, const int w, const int h, const int kernel_size,
+                           const int stride, const int inch, const int outch, const int padding,
+                           const int conv_type, const int thread_num, const int loops)
+{
+  // The report divides by loops, so a non-positive count cannot yield a meaningful average.
+  if (loops <= 0)
+  {
+    fprintf(stderr, "test_batch_conv: loops must be positive (got %d)\n", loops);
+    return -1;
+  }
+
+  struct timeval start, end;
+  float total_time = 0.f;
+
+  const int dilation = 1;
+  const int kernel_dilation = dilation * (kernel_size - 1) + 1;
+
+  convMat_t input;
+  convMat_t output;
+  convMat_t filter;
+  convParams_t params;
+
+  int pad_l, pad_r, pad_t, pad_b;
+  if (padding)
+  {
+    const int pad_w = kernel_dilation + (w - 1) / stride * stride - w;
+    const int pad_h = kernel_dilation + (h - 1) / stride * stride - h;
+    pad_l = pad_w / 2;
+    pad_r = pad_w - pad_l;
+    pad_t = pad_h / 2;
+    pad_b = pad_h - pad_t;
+  }
+  else
+  {
+    pad_l = pad_r = pad_t = pad_b = 0;
+  }
+
+  input.w = w;
+  input.h = h;
+  input.c = inch;
+  input.n = batch;
+
+  output.w = (w + pad_l + pad_r - kernel_dilation) / stride + 1;
+  output.h = (h + pad_t + pad_b - kernel_dilation) / stride + 1;
+  output.c = outch;
+  output.n = batch;
+
+  filter.w = kernel_size;
+  filter.h = kernel_size;
+  filter.c = inch;
+  filter.n = outch;
+
+  const int input_size = input.n * input.w * input.h * input.c;
+  const int output_size = output.n * output.w * output.h * output.c;
+  const int filter_size = filter.w * filter.h * filter.c * filter.n;
+  if (input_size <= 0 || output_size <= 0 || filter_size <= 0)
+  {
+    fprintf(stderr, "test_batch_conv: invalid tensor size (input %d, output %d, filter %d)\n",
+            input_size, output_size, filter_size);
+    return -1;
+  }
+
+  input.data = (float *)malloc(input_size * sizeof(float));
+  // Zero-filled so the dump below never reads indeterminate memory when nothing was run.
+  output.data = (float *)calloc(output_size, sizeof(float));
+  filter.data = (float *)malloc(filter_size * sizeof(float));
+  if (!input.data || !output.data || !filter.data)
+  {
+    fprintf(stderr, "test_batch_conv: out of memory\n");
+    // free(NULL) is a no-op, so releasing all three is safe whichever allocation failed.
+    free(input.data);
+    free(output.data);
+    free(filter.data);
+    return -1;
+  }
+
+  for (int i = 0; i < input_size; i++)
+  {
+    input.data[i] = 0.001 + i * 0.000001;
+  }
+
+  for (int i = 0; i < filter_size; i++)
+  {
+    filter.data[i] = 0.001 - i * 0.000001;
+  }
+
+  params.kernel_w = kernel_size;
+  params.kernel_h = kernel_size;
+  params.stride_w = stride;
+  params.stride_h = stride;
+  params.padding = padding;
+  params.pad_w = pad_l;
+  params.pad_h = pad_t;
+  params.dilation_w = dilation;
+  params.dilation_h = dilation;
+
+  // Per-image GEMM-equivalent dimensions, used for the dump size and the operation count.
+  const int m = output.c;
+  const int n = output.w * output.h;
+  const int k = params.kernel_h * params.kernel_w * input.c;
+
+  if (conv_type == 1)
+  {
+    for (int nloop = 0; nloop < loops; nloop++)
+    {
+      gettimeofday(&start, NULL);
+
+      srcn_batch_convolution2D(input, filter, output, params, NULL, thread_num, col_major);
+
+      gettimeofday(&end, NULL);
+      // Subtract before scaling: avoids overflowing a 32-bit tv_sec and keeps sub-ms precision.
+      total_time += (end.tv_sec - start.tv_sec) * 1000.f + (end.tv_usec - start.tv_usec) / 1000.f;
+    }
+  }
+  else if (conv_type == 2)
+  {
+    winogradParams_t wparams = {params.kernel_w,
+                                params.kernel_h,
+                                params.stride_w,
+                                params.stride_h,
+                                params.dilation_w,
+                                params.dilation_h,
+                                input.n,
+                                w,
+                                h,
+                                input.c,
+                                output.c,
+                                thread_num,
+                                col_major,
+                                filter.data};
+    // The weight transform is done once, outside the timed region.
+    // NOTE(review): the returned buffer is never released here; its ownership is not visible
+    // from this function - confirm whether the caller is expected to free it.
+    float *winograd_weight = trans_weight2winograd(wparams);
+
+    for (int nloop = 0; nloop < loops; nloop++)
+    {
+      gettimeofday(&start, NULL);
+
+      srcn_batch_convolution2D(input, filter, output, params, winograd_weight, thread_num,
+                               col_major);
+
+      gettimeofday(&end, NULL);
+      total_time += (end.tv_sec - start.tv_sec) * 1000.f + (end.tv_usec - start.tv_usec) / 1000.f;
+    }
+  }
+  else
+  {
+    fprintf(stderr, "test_batch_conv: unsupported conv_type %d, nothing was run\n", conv_type);
+  }
+
+  const int div = m * n < 16 ? m * n : 16;
+  const int num = m * n > 64 ? 64 : m * n;
+
+  // Head of the output (first image), 16 values per row.
+  float *c_ptr = output.data;
+  for (int i = 0; i < num; i++)
+  {
+    printf("%f ", c_ptr[i]);
+    if ((i + 1) % div == 0)
+      printf("\n");
+  }
+
+  printf("\n");
+
+  // Tail of the whole batched output.
+  c_ptr = &output.data[m * n * batch - num];
+  for (int i = 0; i < num; i++)
+  {
+    printf("%f ", c_ptr[i]);
+    if ((i + 1) % div == 0)
+      printf("\n");
+  }
+
+  printf("\n");
+
+  long long total_size = (long long)batch * m * n * k * 2;
+  printf("AVER Time consuming: %.2fms, total size: %lld, (GFLOP: %.2f)\n", total_time / loops,
+         total_size, (double)total_size / (total_time / loops) / 1000000);
+
+  free(input.data);
+  free(output.data);
+  free(filter.data);
+
+  return 0;
+}
+
+/**
+ * @brief Benchmark one depthwise 2D convolution configuration and dump part of the result.
+ *
+ * Fills the input and filter with deterministic ramps and the bias with zeros, runs the
+ * implementation selected by conv_type `loops` times, prints the first and last (up to) 64
+ * output values and reports the average wall-clock time per run. The output is reset to 1.0
+ * before every run, outside the timed region.
+ *
+ * @param w            Input width
+ * @param h            Input height
+ * @param kernel_size  Width and height of the square kernel
+ * @param stride       Stride used in both directions
+ * @param inch         Number of input channels
+ * @param outch        Number of output channels; must equal inch
+ * @param padding      Zero for no padding; non-zero derives the padding from the kernel,
+ *                     stride and input size
+ * @param conv_type    0: srcn_depthwise_conv, 2: one srcn_convolution2D call per channel;
+ *                     other values run nothing (the GPU path for 1 is disabled)
+ * @param thread_num   Currently unused by this function
+ * @param loops        Number of timed runs; must be positive
+ * @return 0 on success, -1 if outch != inch, on invalid sizes/loops or allocation failure
+ */
+static int test_depthwise_conv(const int w, const int h, const int kernel_size, const int stride,
+                               const int inch, const int outch, const int padding,
+                               const int conv_type, const int thread_num, const int loops)
+{
+  if (outch != inch)
+    return -1;
+
+  // The report divides by loops, so a non-positive count cannot yield a meaningful average.
+  if (loops <= 0)
+  {
+    fprintf(stderr, "test_depthwise_conv: loops must be positive (got %d)\n", loops);
+    return -1;
+  }
+
+  struct timeval start, end;
+  float total_time = 0.f;
+
+  const int dilation = 1;
+  const int kernel_dilation = dilation * (kernel_size - 1) + 1;
+
+  convMat_t input;
+  convMat_t output;
+  convMat_t filter;
+  convMat_t bias;
+  convParams_t params;
+
+  int pad_l, pad_r, pad_t, pad_b;
+  if (padding)
+  {
+    const int pad_w = kernel_dilation + (w - 1) / stride * stride - w;
+    const int pad_h = kernel_dilation + (h - 1) / stride * stride - h;
+    pad_l = pad_w / 2;
+    pad_r = pad_w - pad_l;
+    pad_t = pad_h / 2;
+    pad_b = pad_h - pad_t;
+  }
+  else
+  {
+    pad_l = pad_r = pad_t = pad_b = 0;
+  }
+
+  input.w = w;
+  input.h = h;
+  input.c = inch;
+  input.n = 1;
+
+  output.w = (w + pad_l + pad_r - kernel_dilation) / stride + 1;
+  output.h = (h + pad_t + pad_b - kernel_dilation) / stride + 1;
+  output.c = outch;
+  output.n = 1;
+
+  // One single-channel kernel per output channel.
+  filter.w = kernel_size;
+  filter.h = kernel_size;
+  filter.c = 1;
+  filter.n = outch;
+
+  bias.w = outch;
+
+  const int input_size = input.w * input.h * input.c;
+  const int output_size = output.w * output.h * output.c;
+  const int filter_size = filter.w * filter.h * filter.c * filter.n;
+  if (input_size <= 0 || output_size <= 0 || filter_size <= 0)
+  {
+    fprintf(stderr, "test_depthwise_conv: invalid tensor size (input %d, output %d, filter %d)\n",
+            input_size, output_size, filter_size);
+    return -1;
+  }
+
+#ifdef NCNN
+  // With NCNN defined each channel plane is allocated with its size rounded up by alignSize().
+  input.data =
+    (float *)malloc(alignSize(input.w * input.h, 16 / sizeof(float)) * input.c * sizeof(float));
+  output.data = (float *)malloc(alignSize(output.w * output.h, 16 / sizeof(float)) * output.c *
+                                sizeof(float));
+#else
+  input.data = (float *)malloc(input_size * sizeof(float));
+  output.data = (float *)malloc(output_size * sizeof(float));
+#endif
+  filter.data = (float *)malloc(filter_size * sizeof(float));
+  bias.data = (float *)malloc(bias.w * sizeof(float));
+  if (!input.data || !output.data || !filter.data || !bias.data)
+  {
+    fprintf(stderr, "test_depthwise_conv: out of memory\n");
+    // free(NULL) is a no-op, so releasing all four is safe whichever allocation failed.
+    free(input.data);
+    free(output.data);
+    free(filter.data);
+    free(bias.data);
+    return -1;
+  }
+
+  for (int i = 0; i < input_size; i++)
+  {
+    input.data[i] = 0.001 + i * 0.000001;
+  }
+
+  for (int i = 0; i < filter_size; i++)
+  {
+    filter.data[i] = 0.001 - i * 0.000001;
+  }
+
+  for (int i = 0; i < bias.w; i++)
+  {
+    bias.data[i] = 0.f;
+  }
+
+  params.kernel_w = kernel_size;
+  params.kernel_h = kernel_size;
+  params.stride_w = stride;
+  params.stride_h = stride;
+  params.padding = padding;
+  params.pad_w = pad_l;
+  params.pad_h = pad_t;
+  params.dilation_w = dilation;
+  params.dilation_h = dilation;
+
+  // GEMM-equivalent dimensions, used for the dump size and the operation count in the report.
+  const int m = output.c;
+  const int n = output.w * output.h;
+  const int k = params.kernel_h * params.kernel_w * input.c;
+
+  for (int nloop = 0; nloop < loops; nloop++)
+  {
+    // Reset the output outside the timed region so every run starts from the same state.
+    for (int i = 0; i < output_size; i++)
+    {
+      output.data[i] = 1.f;
+    }
+
+    gettimeofday(&start, NULL);
+
+    if (conv_type == 0)
+    {
+      // NOTE(review): the literal 4 is passed where the srcn_convolution2D calls in this file
+      // pass a thread count, and thread_num is ignored - confirm whether that is intended.
+      srcn_depthwise_conv(input, filter, output, bias, params, 4, row_major);
+    }
+    else if (conv_type == 2)
+    {
+      // Reference path: convolve every channel separately with its own single-channel kernel.
+      for (int ch = 0; ch < input.c; ch++)
+      {
+        convMat_t chan_input;
+        convMat_t chan_output;
+        convMat_t chan_filter;
+        convParams_t chan_params = params;
+
+        chan_input.w = input.w;
+        chan_input.h = input.h;
+        chan_input.c = 1;
+        chan_input.n = 1;
+#ifdef NCNN
+        chan_input.data = input.data + ch * alignSize(input.w * input.h, 16 / sizeof(float));
+#else
+        chan_input.data = input.data + ch * input.w * input.h;
+#endif
+
+        chan_output.w = output.w;
+        chan_output.h = output.h;
+        chan_output.c = 1;
+        chan_output.n = 1;
+#ifdef NCNN
+        chan_output.data = output.data + ch * alignSize(output.w * output.h, 16 / sizeof(float));
+#else
+        chan_output.data = output.data + ch * output.w * output.h;
+#endif
+
+        chan_filter.w = filter.w;
+        chan_filter.h = filter.h;
+        chan_filter.c = 1;
+        chan_filter.n = 1;
+        // Step by the real kernel area instead of a hard-coded 9 so non-3x3 kernels work too.
+        chan_filter.data = filter.data + ch * filter.w * filter.h;
+
+        srcn_convolution2D(chan_input, chan_filter, chan_output, chan_params, NULL, 1, row_major);
+      }
+    }
+
+    gettimeofday(&end, NULL);
+    // Subtract before scaling: avoids overflowing a 32-bit tv_sec and keeps sub-ms precision.
+    total_time += (end.tv_sec - start.tv_sec) * 1000.f + (end.tv_usec - start.tv_usec) / 1000.f;
+  }
+
+  const int div = m * n < 16 ? m * n : 16;
+  const int num = m * n > 64 ? 64 : m * n;
+
+  if (conv_type == 0)
+    printf("[CPU RESULT]\n");
+  else if (conv_type == 1)
+    printf("[GPU RESULT]\n");
+
+  // Head of the output, 16 values per row.
+  float *c_ptr = output.data;
+  for (int i = 0; i < num; i++)
+  {
+    printf("%f ", c_ptr[i]);
+    if ((i + 1) % div == 0)
+      printf("\n");
+  }
+
+  printf("\n");
+
+  // Tail of the output.
+  c_ptr = &output.data[m * n - num];
+  for (int i = 0; i < num; i++)
+  {
+    printf("%f ", c_ptr[i]);
+    if ((i + 1) % div == 0)
+      printf("\n");
+  }
+
+  printf("\n");
+
+  long long total_size = (long long)m * n * k * 2;
+  printf("AVER Time consuming: %.2fms, total size: %lld, (GFLOP: %.2f)\n", total_time / loops,
+         total_size, (double)total_size / (total_time / loops) / 1000000);
+
+  free(input.data);
+  free(output.data);
+  free(filter.data);
+  free(bias.data);
+
+  return 0;
+}
+
+// Benchmark selection: main() below tests these macros with an #ifdef/#elif chain in the order
+// TEST_SGEMM, TEST_CONV, TEST_DECONV, TEST_BATCH_CONV, TEST_DEPTHWISE_CONV, so the first one that
+// is defined decides which test_* function is built into main() and which arguments it expects.
+// Uncomment exactly one of them.
+//#define TEST_SGEMM
+#define TEST_CONV
+//#define TEST_DECONV
+//#define TEST_BATCH_CONV
+//#define TEST_DEPTHWISE_CONV
+
+/**
+ * @brief Command-line driver: parses positional arguments and runs the benchmark selected at
+ *        compile time by the TEST_* macros.
+ *
+ * All arguments are parsed with atoi(). The trailing `loops` argument is optional for every
+ * benchmark except TEST_SGEMM and defaults to 1.
+ *
+ * NOTE(review): this function is defined inside namespace nnfw::srcn (the namespace is closed
+ * only after it), so it is nnfw::srcn::main and not the global program entry point - confirm
+ * that this is intended.
+ *
+ * @return -1 when too few arguments are given (a usage line is printed to stderr); otherwise
+ *         the status of the selected benchmark where it is known to return one, else 0
+ */
+int main(int argc, char **argv)
+{
+  // argv[0] may be absent when argc is 0; fall back to a fixed name for the usage line.
+  const char *prog = (argc > 0 && argv[0]) ? argv[0] : "sgemm_test";
+
+#ifdef TEST_SGEMM
+  if (argc < 6)
+  {
+    fprintf(stderr, "usage: %s m n k type loops\n", prog);
+    return -1;
+  }
+
+  const int m = atoi(argv[1]);
+  const int n = atoi(argv[2]);
+  const int k = atoi(argv[3]);
+  const int type = atoi(argv[4]);
+  const int loops = atoi(argv[5]);
+
+  // The return type of test_sgemm is not visible here, so its result is not propagated.
+  test_sgemm(m, n, k, type, loops);
+#elif (defined TEST_CONV)
+  if (argc < 10)
+  {
+    fprintf(stderr,
+            "usage: %s w h kernel_size stride outch inch padding conv_type thread_num [loops]\n",
+            prog);
+    return -1;
+  }
+  const int w = atoi(argv[1]);
+  const int h = atoi(argv[2]);
+  const int kernel_size = atoi(argv[3]);
+  const int stride = atoi(argv[4]);
+  const int outch = atoi(argv[5]);
+  const int inch = atoi(argv[6]);
+  const int padding = atoi(argv[7]);
+  const int conv_type = atoi(argv[8]);
+  const int thread_num = atoi(argv[9]);
+  int loops = 1;
+  if (argc > 10)
+    loops = atoi(argv[10]);
+  return test_conv(w, h, kernel_size, stride, inch, outch, padding, conv_type, thread_num, loops);
+#elif (defined TEST_DECONV)
+  if (argc < 10)
+  {
+    fprintf(stderr,
+            "usage: %s w h kernel_size stride outch inch padding conv_type thread_num [loops]\n",
+            prog);
+    return -1;
+  }
+  const int w = atoi(argv[1]);
+  const int h = atoi(argv[2]);
+  const int kernel_size = atoi(argv[3]);
+  const int stride = atoi(argv[4]);
+  const int outch = atoi(argv[5]);
+  const int inch = atoi(argv[6]);
+  const int padding = atoi(argv[7]);
+  const int conv_type = atoi(argv[8]);
+  const int thread_num = atoi(argv[9]);
+  int loops = 1;
+  if (argc > 10)
+    loops = atoi(argv[10]);
+  return test_deconv(w, h, kernel_size, stride, inch, outch, padding, conv_type, thread_num,
+                     loops);
+#elif (defined TEST_BATCH_CONV)
+  if (argc < 11)
+  {
+    fprintf(stderr,
+            "usage: %s batch w h kernel_size stride outch inch padding conv_type thread_num "
+            "[loops]\n",
+            prog);
+    return -1;
+  }
+  const int batch = atoi(argv[1]);
+  const int w = atoi(argv[2]);
+  const int h = atoi(argv[3]);
+  const int kernel_size = atoi(argv[4]);
+  const int stride = atoi(argv[5]);
+  const int outch = atoi(argv[6]);
+  const int inch = atoi(argv[7]);
+  const int padding = atoi(argv[8]);
+  const int conv_type = atoi(argv[9]);
+  const int thread_num = atoi(argv[10]);
+  int loops = 1;
+  if (argc > 11)
+    loops = atoi(argv[11]);
+  return test_batch_conv(batch, w, h, kernel_size, stride, inch, outch, padding, conv_type,
+                         thread_num, loops);
+#elif (defined TEST_DEPTHWISE_CONV)
+  if (argc < 10)
+  {
+    fprintf(stderr,
+            "usage: %s w h kernel_size stride outch inch padding conv_type thread_num [loops]\n",
+            prog);
+    return -1;
+  }
+  const int w = atoi(argv[1]);
+  const int h = atoi(argv[2]);
+  const int kernel_size = atoi(argv[3]);
+  const int stride = atoi(argv[4]);
+  const int outch = atoi(argv[5]);
+  const int inch = atoi(argv[6]);
+  const int padding = atoi(argv[7]);
+  const int conv_type = atoi(argv[8]);
+  const int thread_num = atoi(argv[9]);
+  int loops = 1;
+  if (argc > 10)
+    loops = atoi(argv[10]);
+  return test_depthwise_conv(w, h, kernel_size, stride, inch, outch, padding, conv_type,
+                             thread_num, loops);
+#endif
+
+  return 0;
+}
+
+} // namespace srcn
+} // namespace nnfw