diff options
Diffstat (limited to 'compute/ncnn/src/srcn/sgemm_test.cc')
-rw-r--r-- | compute/ncnn/src/srcn/sgemm_test.cc | 1883 |
1 files changed, 1883 insertions, 0 deletions
diff --git a/compute/ncnn/src/srcn/sgemm_test.cc b/compute/ncnn/src/srcn/sgemm_test.cc new file mode 100644 index 000000000..1b10970bb --- /dev/null +++ b/compute/ncnn/src/srcn/sgemm_test.cc @@ -0,0 +1,1883 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include <stdio.h> +#include <stdlib.h> +#include <sys/time.h> +#include <unistd.h> + +#include "ncnn/srcn/conv_type.h" +#include "srcn/srcn_conv.h" +//#include "srcn_sgemm.h" +#include "conv_sgemm_singlethread.h" +#include "conv_sgemm_multithreads.h" +//#include "conv_sgemm_batch.h" +#include "sgemm_singlethread.h" +#include "conv_winograd.h" +#include "winograd.h" + +//#include "conv_gpu.h" +//#include "convolutiondepthwise_3x3.h" + +namespace nnfw +{ +namespace srcn +{ + +static void direct_conv_rowmajor(convMat_t *input, convMat_t *output, convMat_t *filter, + convParams_t *params) +{ + const int w = input->w; + const int h = input->h; + const int inch = input->c; + const int outw = output->w; + const int outh = output->h; + const int outch = output->c; + const int kernel_w = params->kernel_w; + const int kernel_h = params->kernel_h; + const int stride_w = params->stride_w; + const int stride_h = params->stride_h; + const int pad_w = params->pad_w; + const int pad_h = params->pad_h; + const int dilation_w = params->dilation_w; + const int dilation_h = params->dilation_h; + const float *input_data = input->data; + 
const float *filter_data = filter->data; + float *output_data = output->data; + + for (int out_c = 0; out_c < outch; out_c++) + { + for (int out_row = 0; out_row < outh; out_row++) + { + for (int out_col = 0; out_col < outw; out_col++) + { + const int in_col0 = (out_col * stride_w) - pad_w; + const int in_row0 = (out_row * stride_h) - pad_h; + float sum = 0.f; + for (int in_c = 0; in_c < inch; in_c++) + { + for (int filter_y = 0; filter_y < kernel_h; filter_y++) + { + for (int filter_x = 0; filter_x < kernel_w; filter_x++) + { + const int in_col = in_col0 + filter_x * dilation_w; + const int in_row = in_row0 + filter_y * dilation_h; + + if (((unsigned int)in_col < (unsigned int)w) && + ((unsigned int)in_row < (unsigned int)h)) + { + float input_value = input_data[(in_c * h + in_row) * w + in_col]; + float filter_value = + filter_data[((out_c * inch + in_c) * kernel_h + filter_y) * kernel_w + + filter_x]; + sum += (input_value * filter_value); + } + } + } + } + output_data[(out_c * outh + out_row) * outw + out_col] = sum; + } + } + } +} + +static void direct_deconv_rowmajor(convMat_t *input, convMat_t *output, convMat_t *filter, + convParams_t *params) +{ + const int w = input->w; + const int h = input->h; + const int inch = input->c; + const int outw = output->w; + const int outh = output->h; + const int outch = output->c; + const int kernel_w = params->kernel_w; + const int kernel_h = params->kernel_h; + const int stride_w = params->stride_w; + const int stride_h = params->stride_h; + const int pad_w = params->pad_w; + const int pad_h = params->pad_h; + const int dilation_w = params->dilation_w; + const int dilation_h = params->dilation_h; + const float *input_data = input->data; + const float *filter_data = filter->data; + float *output_data = output->data; + + for (int i = 0; i < outw * outh * outch; i++) + { + output_data[i] = 0; + } + + for (int in_c = 0; in_c < inch; in_c++) + { + for (int in_row = 0; in_row < h; in_row++) + { + for (int in_col = 0; in_col < 
w; in_col++) + { + const int out_col0 = (in_col * stride_w) - pad_w; + const int out_row0 = (in_row * stride_h) - pad_h; + float in_value = input_data[(in_c * h + in_row) * w + in_col]; + for (int out_c = 0; out_c < outch; out_c++) + { + for (int filter_y = 0; filter_y < kernel_h; filter_y++) + { + for (int filter_x = 0; filter_x < kernel_w; filter_x++) + { + const int out_col = out_col0 + filter_x * dilation_w; + const int out_row = out_row0 + filter_y * dilation_h; + + if (((unsigned int)out_col < (unsigned int)outw) && + ((unsigned int)out_row < (unsigned int)outh)) + { + float filter_value = + filter_data[((in_c * outch + out_c) * kernel_h + filter_y) * kernel_w + + filter_x]; + output_data[(out_c * outh + out_row) * outw + out_col] += filter_value * in_value; + } + } + } + } + } + } + } +} + +static void direct_sgemm_rowmajor(int Atrans, int Btrans, int m, int n, int k, float *A, float *B, + float *C) +{ + float *aa, *bb; + + if (Atrans == trans) + { + aa = (float *)malloc(m * k * sizeof(float)); + if (!aa) + return; + + for (int i = 0; i < k; i++) + { + for (int j = 0; j < m; j++) + { + aa[j * k + i] = A[i * m + j]; + } + } + } + else + { + aa = A; + } + + if (Btrans == trans) + { + bb = (float *)malloc(n * k * sizeof(float)); + if (!bb) + return; + + for (int i = 0; i < n; i++) + { + for (int j = 0; j < k; j++) + { + bb[j * n + i] = B[i * k + j]; + } + } + } + else + { + bb = B; + } + + for (int i = 0; i < m; i++) + { + for (int j = 0; j < n; j++) + { + float res = 0.f; + for (int l = 0; l < k; l++) + { + res += aa[i * k + l] * bb[l * n + j]; + } + C[i * n + j] = res; + } + } +} + +/*static void direct_sgemm_kernel(const int k, const int lhs_stride, const int rhs_stride, const int +res_stride, + const float *lhs_ptr, const float *rhs_ptr, float *res_ptr) +{ + int lstride = lhs_stride << 2; + int rstride = rhs_stride << 2; + int estride = res_stride << 2; + int rstep = rstride << 2; + + int nk = (k >> 2) - 1; + + __asm __volatile ( + "movi v16.4s, #0x0\n" + 
"movi v17.4s, #0x0\n" + "movi v18.4s, #0x0\n" + "movi v19.4s, #0x0\n" + "movi v20.4s, #0x0\n" + "movi v21.4s, #0x0\n" + "movi v22.4s, #0x0\n" + "movi v23.4s, #0x0\n" + "movi v24.4s, #0x0\n" + "movi v25.4s, #0x0\n" + "movi v26.4s, #0x0\n" + "movi v27.4s, #0x0\n" + "movi v28.4s, #0x0\n" + "movi v29.4s, #0x0\n" + "movi v30.4s, #0x0\n" + "movi v31.4s, #0x0\n" + + "mov x0, %[lhs_ptr]\n" + "add %[lhs_ptr], %[lhs_ptr], #16\n" + "ld1 {v0.4s}, [x0]\n" + "add x0, x0, %[lstride]\n" + "ld1 {v1.4s}, [x0]\n" + "add x0, x0, %[lstride]\n" + "ld1 {v2.4s}, [x0]\n" + "add x0, x0, %[lstride]\n" + "ld1 {v3.4s}, [x0]\n" + "add x0, x0, %[lstride]\n" + + "mov x1, %[rhs_ptr]\n" + "add %[rhs_ptr], %[rhs_ptr], %[rstep]\n" + "ld1 {v8.4s, v9.4s}, [x1]\n" + "add x1, x1, %[rstride]\n" + "ld1 {v10.4s, v11.4s}, [x1]\n" + "add x1, x1, %[rstride]\n" + + "1:\n" + "fmla v16.4s, v8.4s, v0.s[0]\n" + "fmla v17.4s, v9.4s, v0.s[0]\n" + "fmla v16.4s, v10.4s, v0.s[1]\n" + "fmla v17.4s, v11.4s, v0.s[1]\n" + "fmla v18.4s, v8.4s, v1.s[0]\n" + "fmla v19.4s, v9.4s, v1.s[0]\n" + "fmla v18.4s, v10.4s, v1.s[1]\n" + "fmla v19.4s, v11.4s, v1.s[1]\n" + "ld1 {v12.4s, v13.4s}, [x1]\n" + "fmla v20.4s, v8.4s, v2.s[0]\n" + "add x1, x1, %[rstride]\n" + "fmla v21.4s, v9.4s, v2.s[0]\n" + "ld1 {v14.4s, v15.4s}, [x1]\n" + "fmla v20.4s, v10.4s, v2.s[1]\n" + "add x1, x1, %[rstride]\n" + "fmla v21.4s, v11.4s, v2.s[1]\n" + "fmla v22.4s, v8.4s, v3.s[0]\n" + "fmla v23.4s, v9.4s, v3.s[0]\n" + "fmla v22.4s, v10.4s, v3.s[1]\n" + "fmla v23.4s, v11.4s, v3.s[1]\n" + + "ld1 {v4.4s}, [x0]\n" + "fmla v16.4s, v12.4s, v0.s[2]\n" + "add x0, x0, %[lstride]\n" + "fmla v17.4s, v13.4s, v0.s[2]\n" + "ld1 {v5.4s}, [x0]\n" + "fmla v16.4s, v14.4s, v0.s[3]\n" + "add x0, x0, %[lstride]\n" + "fmla v17.4s, v15.4s, v0.s[3]\n" + "ld1 {v6.4s}, [x0]\n" + "fmla v18.4s, v12.4s, v1.s[2]\n" + "add x0, x0, %[lstride]\n" + "fmla v19.4s, v13.4s, v1.s[2]\n" + "ld1 {v7.4s}, [x0]\n" + "fmla v18.4s, v14.4s, v1.s[3]\n" + "add x0, x0, %[lstride]\n" + "fmla v19.4s, v15.4s, 
v1.s[3]\n" + "fmla v20.4s, v12.4s, v2.s[2]\n" + "fmla v21.4s, v13.4s, v2.s[2]\n" + "fmla v20.4s, v14.4s, v2.s[3]\n" + "fmla v21.4s, v15.4s, v2.s[3]\n" + "fmla v22.4s, v12.4s, v3.s[2]\n" + "fmla v23.4s, v13.4s, v3.s[2]\n" + "fmla v22.4s, v14.4s, v3.s[3]\n" + "fmla v23.4s, v15.4s, v3.s[3]\n" + + "mov x0, %[lhs_ptr]\n" + "add %[lhs_ptr], %[lhs_ptr], #16\n" + + "fmla v24.4s, v8.4s, v4.s[0]\n" + "fmla v25.4s, v9.4s, v4.s[0]\n" + "ld1 {v0.4s}, [x0]\n" + "fmla v24.4s, v10.4s, v4.s[1]\n" + "add x0, x0, %[lstride]\n" + "fmla v25.4s, v11.4s, v4.s[1]\n" + "ld1 {v1.4s}, [x0]\n" + "fmla v26.4s, v8.4s, v5.s[0]\n" + "add x0, x0, %[lstride]\n" + "fmla v27.4s, v9.4s, v5.s[0]\n" + "ld1 {v2.4s}, [x0]\n" + "fmla v26.4s, v10.4s, v5.s[1]\n" + "add x0, x0, %[lstride]\n" + "fmla v27.4s, v11.4s, v5.s[1]\n" + "ld1 {v3.4s}, [x0]\n" + "fmla v28.4s, v8.4s, v6.s[0]\n" + "add x0, x0, %[lstride]\n" + "fmla v29.4s, v9.4s, v6.s[0]\n" + "fmla v28.4s, v10.4s, v6.s[1]\n" + "fmla v29.4s, v11.4s, v6.s[1]\n" + "fmla v30.4s, v8.4s, v7.s[0]\n" + "fmla v31.4s, v9.4s, v7.s[0]\n" + "fmla v30.4s, v10.4s, v7.s[1]\n" + "fmla v31.4s, v11.4s, v7.s[1]\n" + + "mov x1, %[rhs_ptr]\n" + "add %[rhs_ptr], %[rhs_ptr], %[rstep]\n" + + "fmla v24.4s, v12.4s, v4.s[2]\n" + "fmla v25.4s, v13.4s, v4.s[2]\n" + "ld1 {v8.4s, v9.4s}, [x1]\n" + "fmla v24.4s, v14.4s, v4.s[3]\n" + "add x1, x1, %[rstride]\n" + "fmla v25.4s, v15.4s, v4.s[3]\n" + "ld1 {v10.4s, v11.4s}, [x1]\n" + "fmla v26.4s, v12.4s, v5.s[2]\n" + "add x1, x1, %[rstride]\n" + "fmla v27.4s, v13.4s, v5.s[2]\n" + "fmla v26.4s, v14.4s, v5.s[3]\n" + "fmla v27.4s, v15.4s, v5.s[3]\n" + "fmla v28.4s, v12.4s, v6.s[2]\n" + "fmla v29.4s, v13.4s, v6.s[2]\n" + "fmla v28.4s, v14.4s, v6.s[3]\n" + "fmla v29.4s, v15.4s, v6.s[3]\n" + "fmla v30.4s, v12.4s, v7.s[2]\n" + "fmla v31.4s, v13.4s, v7.s[2]\n" + "subs %w[nk], %w[nk], #1\n" + "fmla v30.4s, v14.4s, v7.s[3]\n" + "fmla v31.4s, v15.4s, v7.s[3]\n" + "bne 1b\n" + + "fmla v16.4s, v8.4s, v0.s[0]\n" + "fmla v17.4s, v9.4s, v0.s[0]\n" + "fmla 
v16.4s, v10.4s, v0.s[1]\n" + "fmla v17.4s, v11.4s, v0.s[1]\n" + "fmla v18.4s, v8.4s, v1.s[0]\n" + "fmla v19.4s, v9.4s, v1.s[0]\n" + "fmla v18.4s, v10.4s, v1.s[1]\n" + "fmla v19.4s, v11.4s, v1.s[1]\n" + "ld1 {v12.4s, v13.4s}, [x1]\n" + "fmla v20.4s, v8.4s, v2.s[0]\n" + "add x1, x1, %[rstride]\n" + "fmla v21.4s, v9.4s, v2.s[0]\n" + "ld1 {v14.4s, v15.4s}, [x1]\n" + "fmla v20.4s, v10.4s, v2.s[1]\n" + "add x1, x1, %[rstride]\n" + "fmla v21.4s, v11.4s, v2.s[1]\n" + "fmla v22.4s, v8.4s, v3.s[0]\n" + "fmla v23.4s, v9.4s, v3.s[0]\n" + "fmla v22.4s, v10.4s, v3.s[1]\n" + "fmla v23.4s, v11.4s, v3.s[1]\n" + + "ld1 {v4.4s}, [x0]\n" + "fmla v16.4s, v12.4s, v0.s[2]\n" + "add x0, x0, %[lstride]\n" + "fmla v17.4s, v13.4s, v0.s[2]\n" + "ld1 {v5.4s}, [x0]\n" + "fmla v16.4s, v14.4s, v0.s[3]\n" + "add x0, x0, %[lstride]\n" + "fmla v17.4s, v15.4s, v0.s[3]\n" + "ld1 {v6.4s}, [x0]\n" + "fmla v18.4s, v12.4s, v1.s[2]\n" + "add x0, x0, %[lstride]\n" + "fmla v19.4s, v13.4s, v1.s[2]\n" + "ld1 {v7.4s}, [x0]\n" + "fmla v18.4s, v14.4s, v1.s[3]\n" + "add x0, x0, %[lstride]\n" + "fmla v19.4s, v15.4s, v1.s[3]\n" + "fmla v20.4s, v12.4s, v2.s[2]\n" + "fmla v21.4s, v13.4s, v2.s[2]\n" + "fmla v20.4s, v14.4s, v2.s[3]\n" + "fmla v21.4s, v15.4s, v2.s[3]\n" + "fmla v22.4s, v12.4s, v3.s[2]\n" + "fmla v23.4s, v13.4s, v3.s[2]\n" + "fmla v22.4s, v14.4s, v3.s[3]\n" + "fmla v23.4s, v15.4s, v3.s[3]\n" + + "mov x0, %[res_ptr]\n" + "fmla v24.4s, v8.4s, v4.s[0]\n" + "fmla v25.4s, v9.4s, v4.s[0]\n" + "st1 {v16.4s, v17.4s}, [x0]\n" + "add x0, x0, %[estride]\n" + "fmla v24.4s, v10.4s, v4.s[1]\n" + "fmla v25.4s, v11.4s, v4.s[1]\n" + "st1 {v18.4s, v19.4s}, [x0]\n" + "add x0, x0, %[estride]\n" + "fmla v26.4s, v8.4s, v5.s[0]\n" + "fmla v27.4s, v9.4s, v5.s[0]\n" + "st1 {v20.4s, v21.4s}, [x0]\n" + "add x0, x0, %[estride]\n" + "fmla v26.4s, v10.4s, v5.s[1]\n" + "fmla v27.4s, v11.4s, v5.s[1]\n" + "st1 {v22.4s, v23.4s}, [x0]\n" + "add x0, x0, %[estride]\n" + "fmla v28.4s, v8.4s, v6.s[0]\n" + "fmla v29.4s, v9.4s, v6.s[0]\n" + 
"fmla v28.4s, v10.4s, v6.s[1]\n" + "fmla v29.4s, v11.4s, v6.s[1]\n" + "fmla v30.4s, v8.4s, v7.s[0]\n" + "fmla v31.4s, v9.4s, v7.s[0]\n" + "fmla v30.4s, v10.4s, v7.s[1]\n" + "fmla v31.4s, v11.4s, v7.s[1]\n" + + "fmla v24.4s, v12.4s, v4.s[2]\n" + "fmla v25.4s, v13.4s, v4.s[2]\n" + "fmla v24.4s, v14.4s, v4.s[3]\n" + "fmla v25.4s, v15.4s, v4.s[3]\n" + "fmla v26.4s, v12.4s, v5.s[2]\n" + "fmla v27.4s, v13.4s, v5.s[2]\n" + "st1 {v24.4s, v25.4s}, [x0]\n" + "add x0, x0, %[estride]\n" + "fmla v26.4s, v14.4s, v5.s[3]\n" + "fmla v27.4s, v15.4s, v5.s[3]\n" + "fmla v28.4s, v12.4s, v6.s[2]\n" + "fmla v29.4s, v13.4s, v6.s[2]\n" + "st1 {v26.4s, v27.4s}, [x0]\n" + "add x0, x0, %[estride]\n" + "fmla v28.4s, v14.4s, v6.s[3]\n" + "fmla v29.4s, v15.4s, v6.s[3]\n" + "fmla v30.4s, v12.4s, v7.s[2]\n" + "fmla v31.4s, v13.4s, v7.s[2]\n" + "st1 {v28.4s, v29.4s}, [x0]\n" + "add x0, x0, %[estride]\n" + "fmla v30.4s, v14.4s, v7.s[3]\n" + "fmla v31.4s, v15.4s, v7.s[3]\n" + "st1 {v30.4s, v31.4s}, [x0]\n" + :[lhs_ptr] "+r" (lhs_ptr), [rhs_ptr] "+r" (rhs_ptr), [res_ptr] "+r" (res_ptr), + [nk] "+r" (nk) + : [lstride] "r" (lstride), [rstride] "r" (rstride), [estride] "r" (estride), [rstep] "r" +(rstep) + : "x0", "x1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", + "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31" + ); +}*/ + +static void direct_conv_colmajor(convMat_t *input, convMat_t *output, convMat_t *filter, + convParams_t *params) +{ + const int w = input->w; + const int h = input->h; + const int inch = input->c; + const int outw = output->w; + const int outh = output->h; + const int outch = output->c; + const int kernel_w = params->kernel_w; + const int kernel_h = params->kernel_h; + const int stride_w = params->stride_w; + const int stride_h = params->stride_h; + const int pad_w = params->pad_w; + const int pad_h = params->pad_h; + const int dilation_w = params->dilation_w; + 
/*
 * Reference column-major SGEMM: C (m x n) = op(A) (m x k) * op(B) (k x n).
 *
 * Element (i, l) of op(A) is read from aa[l * m + i], element (l, j) of op(B)
 * from bb[j * k + l], and the result is stored at C[j * m + i].
 *
 * A non-zero Atrans / Btrans means the operand is supplied transposed; it is
 * first copied into a temporary buffer in the layout above. Otherwise the
 * operand is used in place.
 *
 * C is fully overwritten. If a temporary buffer cannot be allocated the
 * function returns without touching C. Temporary buffers are always released
 * before returning.
 */
static void direct_sgemm_colmajor(int Atrans, int Btrans, int m, int n, int k, float *A, float *B,
                                  float *C)
{
  float *a_packed = NULL; // owned scratch copy of A, only when Atrans is set
  float *b_packed = NULL; // owned scratch copy of B, only when Btrans is set
  const float *aa = A;
  const float *bb = B;

  if (Atrans)
  {
    // Do the size arithmetic in size_t so m * k cannot overflow int.
    a_packed = (float *)malloc((size_t)m * k * sizeof(float));
    if (!a_packed)
      return;

    for (int i = 0; i < k; i++)
    {
      for (int j = 0; j < m; j++)
      {
        a_packed[i * m + j] = A[j * k + i];
      }
    }
    aa = a_packed;
  }

  if (Btrans)
  {
    b_packed = (float *)malloc((size_t)n * k * sizeof(float));
    if (!b_packed)
    {
      free(a_packed); // do not leak the A scratch buffer on this failure path
      return;
    }

    for (int i = 0; i < n; i++)
    {
      for (int j = 0; j < k; j++)
      {
        b_packed[i * k + j] = B[j * n + i];
      }
    }
    bb = b_packed;
  }

  for (int i = 0; i < m; i++)
  {
    for (int j = 0; j < n; j++)
    {
      float res = 0.f;
      for (int l = 0; l < k; l++)
      {
        res += bb[j * k + l] * aa[l * m + i];
      }
      C[j * m + i] = res;
    }
  }

  // free(NULL) is a no-op, so this is safe when no scratch copy was made.
  free(b_packed);
  free(a_packed);
}
struct timeval start, end; + float total_time = 0.f; + + const int mb = 180; + const int nb = 1440; + const int kb = 512; + + const int mr = 4; + const int nr = 12; + +#if 0 + const int pm = (m + mr - 1) / mr * mr; + const int pn = (n + nr - 1) / nr * nr; + const int pk = k; +#else + const int pm = (mb + mr - 1) / mr * mr; + const int pn = (nb + nr - 1) / nr * nr; + const int pk = kb; +#endif + const int nm = (m + mb - 1) / mb; + const int nn = (n + nb - 1) / nb; + const int nk = (k + kb - 1) / kb; + + const int rm = m % mb; + const int rn = n % nb; + const int rk = k % kb; + + float *A = (float *)malloc(m * k * sizeof(float)); + if(!A) return 0; + + for(int i = 0 ; i < m * k; i++) + { + A[i] = 0.001 + i * 0.000001; + } + + float *B = (float *)malloc(k * n * sizeof(float)); + if(!B) return 0; + + for(int i = 0 ; i < n * k; i++) + { + B[i] = 0.001 - i * 0.000001; + } + + float *C = (float *)malloc(m * n * sizeof(float)); + if(!C) return 0; + +#if 0 + float *PA = (float *)malloc(pm * pk * sizeof(float)); + if(!PA) return 0; + + float *PB = (float *)malloc(pk * pn * sizeof(float)); + if(!PB) return 0; +#else + float PA[pm * pk]; + float PB[pk * pn]; +#endif + + for(int nloop = 0; nloop < loops; nloop++) + + { + gettimeofday(&start, NULL); + + //pack_rowmajor_notrans_lhs(mr, m, k, k, A, PA); + //pack_rowmajor_notrans_rhs(nr, n, k, n, B, PB); +#if 1 + for (int j = 0; j < nn; j++) + { + const int _nb = (j != nn - 1 || rn == 0) ? nb : rn; + for (int l = 0; l < nk; l++) + { + const int _kb = (l != nk - 1 || rk == 0) ? kb : rk; + pack_rowmajor_notrans_rhs(nr, _nb, _kb, 1, n, &B[l * kb * n + j * nb], PB); + for(int i = 0; i < nm; i++) + { + const int _mb = (i != nm - 1 || rm == 0) ? 
mb : rm; + pack_rowmajor_notrans_lhs(mr, _mb, _kb, 1, k, &A[i * mb * k + l * kb], PA); + sgemm_rowmajor_macro_kernel_divnm(mr, nr, _mb, _nb, _kb, PA, PB, &C[i * mb * n + j * nb], l, n, _kb); + //sgemm_rowmajor_macro_kernel_divnm(mr, nr, _mb, _nb, _kb, &PA[i * mb * k + l * kb], &PB[l * kb * pn + j * nb], &C[i * mb * n + j * nb], l, n, pk); + } + } + } +#else + for (int j = 0; j < nm; j++) + { + const int _mb = (j != nm - 1 || rm == 0) ? mb : rm; + for (int l = 0; l < nk; l++) + { + const int _kb = (l != nk - 1 || rk == 0) ? kb : rk; + pack_rowmajor_notrans_lhs(mr, _mb, _kb, 1, k, &A[j * mb * k + l * kb], PA); + for(int i = 0; i < nn; i++) + { + const int _nb = (i != nn - 1 || rn == 0) ? nb : rn; + pack_rowmajor_notrans_rhs(nr, _nb, _kb, 1, n, &B[l * kb * n + i * nb], PB); + sgemm_rowmajor_macro_kernel_divmn(mr, nr, _mb, _nb, _kb, PA, PB, &C[j * mb * n + i * nb], l, n, _kb); + //sgemm_rowmajor_macro_kernel_divmn(mr, nr, _mb, _nb, _kb, &PA[i * mb * k + l * kb], &PB[l * kb * pn + j * nb], &C[i * mb * n + j * nb], l, n, pk); + } + } + } +#endif + gettimeofday(&end, NULL); + total_time += ((end.tv_sec * 1000000 + end.tv_usec) - (start.tv_sec * 1000000 + start.tv_usec))/1000; + } + + int div = m * n < 16 ? m * n : 16; + int num = m * n > 64 ? 
64 : m * n; + + float *c_ptr = &C[0]; + for(int i = 0; i < num; i++) + { + printf("%f ", c_ptr[i]); + if((i + 1) % div == 0) printf("\n"); + } + + printf("\n"); + + c_ptr = &C[m * n - num]; + for(int i = 0; i < num; i++) + { + printf("%f ", c_ptr[i]); + if((i + 1) % div == 0) printf("\n"); + } + + printf("\n"); + + long long total_size = (long long)m *n * k * 2; + printf("AVER Time consuming: %.2fms, total size: %lld, (GFLOP: %.2f)\n", total_time / loops , total_size, (double)total_size/(total_time / loops)/1000000); + + free(A); + free(B); + free(C); + + //free(PA); + //free(PB); + +} +#endif + +static int test_sgemm(int m, int n, int k, int type, int loops) +{ + struct timeval start, end; + float total_time = 0.f; + + // printf("1.\n"); + + float *A = (float *)malloc(m * k * sizeof(float)); + if (!A) + return 0; + + for (int i = 0; i < m * k; i++) + { + A[i] = 0.001 + i * 0.001; // i * 0.000001; + } + + float *B = (float *)malloc(k * n * sizeof(float)); + if (!B) + return 0; + + for (int i = 0; i < n * k; i++) + { + B[i] = 0.001 - i * 0.001; // - i * 0.000001; + } + + float *C = (float *)malloc(m * n * sizeof(float)); + if (!C) + return 0; + + for (int nloop = 0; nloop < loops; nloop++) + + { + gettimeofday(&start, NULL); + + if (type == 0) + { + // direct_sgemm_rowmajor(notrans, notrans, m, n, k, A, B, C); + direct_sgemm_colmajor(notrans, notrans, m, n, k, A, B, C); + } + + else if (type == 1) + { + class sgemm_singlethread my_gemm(colMajor, notrans, notrans, m, n, k, A, B, C, 1); + my_gemm.run(); + } + + /*else if(type == 2) + { + for(int i = 0; i < m / 8; i++) + { + for(int j = 0; j < n / 8; j++) + { + direct_sgemm_kernel(k, k, n, n, A + i * 8 * k, B + j * 8, C + i * 8 * n + j * 8); + } + } + }*/ + + gettimeofday(&end, NULL); + total_time += + ((end.tv_sec * 1000000 + end.tv_usec) - (start.tv_sec * 1000000 + start.tv_usec)) / 1000; + } + + int div = m * n < 16 ? m * n : 16; + int num = m * n > 64 ? 
/*
 * Reorders a weight tensor from HWCN to NCHW.
 *
 * `in` is read as in[((h * W + w) * C + c) * N + n] and every element is
 * written to out[((n * C + c) * H + h) * W + w]. Both buffers hold
 * H * W * C * N floats; `in` is not modified.
 */
void weight_tensorflow2caffe(float *out, float *in, int H, int W, int C, int N)
{
  const int plane = H * W;
  int src = 0; // the loops below visit `in` in storage order

  for (int row = 0; row < H; ++row)
  {
    for (int col = 0; col < W; ++col)
    {
      const int spatial = row * W + col;

      for (int ch = 0; ch < C; ++ch)
      {
        for (int num = 0; num < N; ++num)
        {
          out[(num * C + ch) * plane + spatial] = in[src];
          ++src;
        }
      }
    }
  }
}
weight_data_tran) + return; + weight_tensorflow2caffe(weight_data_tran, _kernel.data, kernel_size, kernel_size, channels, + num_output); + + class sgemm_singlethread sgemm(rowMajor, notrans, trans, tile_h_in_ * tile_w_in_, + channels * num_output, kernel_size * kernel_size, winograd_g, + weight_data_tran, *winograd_weight, 1); + + sgemm.run(); + + delete[] weight_data_tran; + + /*With winograd, original weight data is useless.*/ + delete[] winograd_g; +} + +static int test_conv(const int w, const int h, const int kernel_size, const int stride, + const int inch, const int outch, const int padding, const int conv_type, + const int thread_num, const int loops) +{ + struct timeval start, end; + float total_time = 0.f; + + struct timeval start1, end1; + float total_time1 = 0.f; + + const int dilation = 1; + + const int kernel_dilation = dilation * (kernel_size - 1) + 1; + + convMat_t input; + convMat_t output; + convMat_t filter; + convParams_t params; + + int pad_l, pad_r, pad_t, pad_b; + if (padding) + { + int pad_w = kernel_dilation + (w - 1) / stride * stride - w; + int pad_h = kernel_dilation + (h - 1) / stride * stride - h; + pad_l = pad_w / 2; + pad_r = pad_w - pad_l; + pad_t = pad_h / 2; + pad_b = pad_h - pad_t; + } + else + { + pad_l = pad_r = pad_t = pad_b = 0; + } + + input.w = w; + input.h = h; + input.c = inch; + input.n = 1; +#ifdef NCNN + input.data = + (float *)malloc(alignSize(input.w * input.h, 16 / sizeof(float)) * input.c * sizeof(float)); +#else + input.data = (float *)malloc(input.w * input.h * input.c * sizeof(float)); +#endif + + if (!input.data) + return 0; + + output.w = (w + pad_l + pad_r - kernel_dilation) / stride + 1; + output.h = (h + pad_t + pad_b - kernel_dilation) / stride + 1; + output.c = outch; + output.n = 1; +#ifdef NCNN + output.data = (float *)malloc(alignSize(output.w * output.h, 16 / sizeof(float)) * output.c * + sizeof(float)); +#else + output.data = (float *)malloc(output.w * output.h * output.c * sizeof(float)); +#endif + + 
if (!output.data) + return 0; + + for (int i = 0; i < output.w * output.h * output.c; i++) + { + output.data[i] = 0; + } + + filter.w = kernel_size; + filter.h = kernel_size; + filter.c = inch; + filter.n = outch; + filter.data = (float *)malloc(filter.w * filter.h * filter.c * filter.n * sizeof(float)); + if (!filter.data) + return 0; + + for (int i = 0; i < input.w * input.h * input.c; i++) + { + input.data[i] = 0.001 + i * 0.000001; + } + +#if 1 + for (int i = 0; i < filter.w * filter.h * filter.c * filter.n; i++) + { + filter.data[i] = 0.001 - i * 0.000001; + } +#else + for (int i = 0; i < filter.w * filter.h * filter.c * filter.n; i++) + { + if ((i + 1) % 15 == 0) + filter.data[i] = 0.001 - i * 0.000001; + else + filter.data[i] = 0; + } +#endif + params.kernel_w = kernel_size; + params.kernel_h = kernel_size; + params.stride_w = stride; + params.stride_h = stride; + params.padding = padding; + params.pad_w = pad_l; + params.pad_h = pad_t; + params.dilation_w = dilation; + params.dilation_h = dilation; + + const int m = output.c; + const int n = output.w * output.h; + const int k = params.kernel_h * params.kernel_w * input.c; + + // ocl_context_t context; + size_t local_min[2]; + /** + if(conv_type == 14 || conv_type == 15 || conv_type == 6) + { + if(init_gpu(&context) < 0) return -1; + //if(conv_type ==14 || conv_type == 5) sgemm_ocltune(&context, m, n, (k < 1024 ? 
k : + 1024), local_min); + //else if(conv_type == 6) + { + if(kernel_size == 3) directconv_3x3S1_tune(&context, &input, &filter, &output, + local_min); + else if(kernel_size == 1) directconv_1x1S1_tune(&context, &input, &filter, &output, + local_min); + } + //local_min[0] = 1; local_min[1] = 1; + } + **/ + if (conv_type == 0) + { + for (int nloop = 0; nloop < loops; nloop++) + { + gettimeofday(&start, NULL); + + direct_conv_rowmajor(&input, &output, &filter, ¶ms); + // direct_conv_colmajor(&input, &output, &filter, ¶ms); + + gettimeofday(&end, NULL); + total_time += + ((end.tv_sec * 1000000 + end.tv_usec) - (start.tv_sec * 1000000 + start.tv_usec)) / 1000; + } + } + else if (conv_type == 1) + { + for (int nloop = 0; nloop < loops; nloop++) + { + // printf("nloop = %d, thread_num = %d\n", nloop, thread_num); + // class srcn_sgemm my_gemm(input, filter, output, params, thread_num, col_major); + gettimeofday(&start, NULL); + + /*if(thread_num == 1) + { + class conv_sgemm_singlethread my_gemm(input, filter, output, params, col_major); + my_gemm.run(); + } + else + { + class conv_sgemm_multithreads my_gemm(input, filter, output, params, thread_num, + col_major); + my_gemm.run(); + }*/ + + srcn_convolution2D(input, filter, output, params, NULL, thread_num, row_major); + + // printf("sync\n"); + + gettimeofday(&end, NULL); + total_time += + ((end.tv_sec * 1000000 + end.tv_usec) - (start.tv_sec * 1000000 + start.tv_usec)) / 1000; + } + } + else if (conv_type == 2) + { + float *winograd_weight; + + // trans_weight2winograd(filter, &winograd_weight); + + winogradParams_t wparams = {params.kernel_w, + params.kernel_h, + params.stride_w, + params.stride_h, + params.dilation_w, + params.dilation_h, + 1, + w, + h, + input.c, + output.c, + thread_num, + col_major, + filter.data}; + winograd_weight = trans_weight2winograd(wparams); + + for (int nloop = 0; nloop < loops; nloop++) + { + gettimeofday(&start, NULL); + + // class conv_winograd my_sgemm(input, output, params, col_major, 
winograd_weight, thread_num, + // w * h, n); + // my_sgemm.run(); + + srcn_convolution2D(input, filter, output, params, winograd_weight, thread_num, row_major); + + gettimeofday(&end, NULL); + total_time += + ((end.tv_sec * 1000000 + end.tv_usec) - (start.tv_sec * 1000000 + start.tv_usec)) / 1000; + } + } + else if (conv_type == 3) + { + void *sparse_weight = trans_weight2sparse(filter); + + for (int nloop = 0; nloop < loops; nloop++) + { + gettimeofday(&start, NULL); + + srcn_sparse_convolution2D(input, output, params, sparse_weight, thread_num, row_major); + + gettimeofday(&end, NULL); + total_time += + ((end.tv_sec * 1000000 + end.tv_usec) - (start.tv_sec * 1000000 + start.tv_usec)) / 1000; + } + + sparse_release(outch, sparse_weight); + } /** +else if(conv_type == 4) +{ +#if 0 + cl_int err; + convlib::load_opencl("./libmali.so"); + const int mpad = (m + 4 - 1) / 4 * 4; + const int npad = (n + 4 - 1) / 4 * 4; + cl_mem lhs_gpu = convlib::clCreateBuffer(context.context, CL_MEM_READ_WRITE | +CL_MEM_ALLOC_HOST_PTR, mpad * k * sizeof(float), NULL, &err); + if(err != CL_SUCCESS) + { + printf("err = %d@%s:%d\n", err, __FUNCTION__, __LINE__); + return -1; + } + + cl_image_format rhs_format = {CL_RGBA, CL_FLOAT}; + cl_image_desc desc = + { + CL_MEM_OBJECT_IMAGE2D, + (size_t)npad / 4, + (size_t)k, + 0, 0, + 0, + 0, 0, 0, 0 + }; + cl_mem rhs_gpu = convlib::clCreateImage(context.context, CL_MEM_READ_ONLY | +CL_MEM_ALLOC_HOST_PTR, &rhs_format, &desc, NULL, &err); + if(err != CL_SUCCESS) + { + printf("err = %d@%s:%d\n", err, __FUNCTION__, __LINE__); + return -1; + } + + cl_mem rhs_gpu = convlib::clCreateBuffer(context.context, CL_MEM_READ_WRITE | +CL_MEM_ALLOC_HOST_PTR, npad * k * sizeof(float), NULL, &err); + if(err != CL_SUCCESS) + { + printf("err = %d@%s:%d\n", err, __FUNCTION__, __LINE__); + return -1;; + } + + cl_mem res_gpu = convlib::clCreateBuffer(context.context, CL_MEM_READ_WRITE | +CL_MEM_ALLOC_HOST_PTR, mpad * npad * sizeof(float), NULL, &err); + if(err != 
CL_SUCCESS) + { + printf("err = %d@%s:%d\n", err, __FUNCTION__, __LINE__); + return -1; + } +#endif + for(int nloop = 0; nloop < loops + 1; nloop++) + { + gettimeofday(&start, NULL); + + //cl_mem _res_gpu = conv2D_gpu_sgemm(&context, &input, &filter, &output, ¶ms, local_min, +lhs_gpu, rhs_gpu, res_gpu); + + //get_result_gpu(&context, output.data + gpu_data_off, _res_gpu, m, n); + srcn_convolution2D_gpu(input, filter, output, params, row_major); + + gettimeofday(&end, NULL); + + if(nloop > 0) total_time += ((end.tv_sec * 1000000 + end.tv_usec) - (start.tv_sec * 1000000 ++ start.tv_usec))/1000; + } +} +else if(conv_type == 5) +{ + + for(int nloop = 0; nloop < loops + 1; nloop++) + { + gettimeofday(&start, NULL); + + //cl_mem res_gpu = conv2D_gpu_sgemm(&context, &input, &filter, &output, ¶ms, local_min); + + //clFlush(context.cmdQueue); + gettimeofday(&start1, NULL); + #if 1 + srcn_convolution2D(input, filter, output, params, NULL, thread_num, row_major + + #endif + //usleep(80 * 1000); + gettimeofday(&end1, NULL); + total_time1 += ((end1.tv_sec * 1000000 + end1.tv_usec) - (start1.tv_sec * 1000000 + +start1.tv_usec))/1000; + + //get_result_gpu(&context, output.data + gpu_data_off, res_gpu, m, n); + + srcn_convolution2D_dpu(input, filter, output, params, row_major); + + gettimeofday(&end, NULL); + if(nloop > 0) total_time += ((end.tv_sec * 1000000 + end.tv_usec) - (start.tv_sec * 1000000 ++ start.tv_usec))/1000; + } +} +else if(conv_type == 6) +{ + for(int nloop = 0; nloop < loops; nloop++) + { + gettimeofday(&start, NULL); + + if(kernel_size == 3 && stride == 1 && padding == 0) + { + conv2D_gpu_directconv_3x3S1(&context, &input, &filter, &output, ¶ms, local_min); + } + else if(kernel_size == 1 && stride == 1 && padding == 0) + { + conv2D_gpu_directconv_1x1S1(&context, &input, &filter, &output, ¶ms, local_min); + } + + gettimeofday(&end, NULL); + total_time += ((end.tv_sec * 1000000 + end.tv_usec) - (start.tv_sec * 1000000 + +start.tv_usec))/1000; + } +}**/ + + int div 
= m * n < 16 ? m * n : 16; + int num = m * n > 64 ? 64 : m * n; + + if (conv_type < 4) + printf("[CPU RESULT]\n"); + else if (conv_type == 4) + printf("[GPU RESULT]\n"); + else if (conv_type == 5) + printf("[DPU RESULT]\n"); + float *c_ptr = output.data; + for (int i = 0; i < num; i++) + { + printf("%f ", c_ptr[i]); + if ((i + 1) % div == 0) + printf("\n"); + } + + printf("\n"); + + c_ptr = &output.data[m * n - num]; + for (int i = 0; i < num; i++) + { + printf("%f ", c_ptr[i]); + if ((i + 1) % div == 0) + printf("\n"); + } + + printf("\n"); + + long long total_size = (long long)m * n * k * 2; + printf( + "AVER Time consuming: %.2fms, CPU Time consuming: %.2fms, total size: %lld, (GFLOP: %.2f)\n", + total_time / loops, total_time1 / loops, total_size, + (double)total_size / (total_time / loops) / 1000000); + + free(input.data); + free(output.data); + free(filter.data); + + return 0; +} + +static int test_deconv(const int w, const int h, const int kernel_size, const int stride, + const int inch, const int outch, const int padding, const int conv_type, + const int thread_num, const int loops) +{ + struct timeval start, end; + float total_time = 0.f; + + const int dilation = 1; + + const int kernel_dilation = dilation * (kernel_size - 1) + 1; + + convMat_t input; + convMat_t output; + convMat_t filter; + convParams_t params; + + int pad_l, pad_r, pad_t, pad_b; + if (padding) + { + int pad_w = kernel_dilation - 1; + int pad_h = kernel_dilation - 1; + pad_l = pad_w / 2; + pad_r = pad_w - pad_l; + pad_t = pad_h / 2; + pad_b = pad_h - pad_t; + } + else + { + pad_l = pad_r = pad_t = pad_b = 0; + } + + input.w = w; + input.h = h; + input.c = inch; + input.data = (float *)malloc(input.w * input.h * input.c * sizeof(float)); + if (!input.data) + return 0; + + // output.w = (w + pad_l + pad_r - kernel_dilation) / stride + 1; + // output.h = (h + pad_t + pad_b - kernel_dilation) / stride + 1; + output.w = stride * (w - 1) + kernel_dilation - (pad_l + pad_r); + output.h = stride 
* (h - 1) + kernel_dilation - (pad_t + pad_b); + output.c = outch; + output.data = (float *)malloc(output.w * output.h * output.c * sizeof(float)); + if (!output.data) + return 0; + + filter.w = kernel_size; + filter.h = kernel_size; + filter.c = outch; + filter.n = inch; + filter.data = (float *)malloc(filter.w * filter.h * filter.c * filter.n * sizeof(float)); + if (!filter.data) + return 0; + + for (int i = 0; i < input.w * input.h * input.c; i++) + { + input.data[i] = 0.001 + i * 0.000001; + } + + for (int i = 0; i < filter.w * filter.h * filter.c * filter.n; i++) + { + filter.data[i] = 0.001 - i * 0.000001; + } + + params.kernel_w = kernel_size; + params.kernel_h = kernel_size; + params.stride_w = stride; + params.stride_h = stride; + params.padding = padding; + params.pad_w = pad_l; + params.pad_h = pad_t; + params.dilation_w = dilation; + params.dilation_h = dilation; + + const int m = params.kernel_h * params.kernel_w * output.c; + const int n = input.w * input.h; + const int k = input.c; + + if (conv_type == 0) + { + for (int nloop = 0; nloop < loops; nloop++) + + { + gettimeofday(&start, NULL); + + direct_deconv_rowmajor(&input, &output, &filter, ¶ms); + + gettimeofday(&end, NULL); + total_time += + ((end.tv_sec * 1000000 + end.tv_usec) - (start.tv_sec * 1000000 + start.tv_usec)) / 1000; + } + } + else if (conv_type == 1) + { + for (int nloop = 0; nloop < loops; nloop++) + + { + gettimeofday(&start, NULL); + + for (int i = 0; i < output.w * output.h * output.c; i++) + { + output.data[i] = 0; + } + + srcn_deconvolution2D(input, filter, output, params, thread_num, row_major); + + gettimeofday(&end, NULL); + total_time += + ((end.tv_sec * 1000000 + end.tv_usec) - (start.tv_sec * 1000000 + start.tv_usec)) / 1000; + } + } + + const int output_size = output.w * output.h * output.c; + + int div = output_size < 16 ? output_size : 16; + int num = output_size > 64 ? 
64 : output_size; + + float *c_ptr = output.data; + for (int i = 0; i < num; i++) + { + printf("%f ", c_ptr[i]); + if ((i + 1) % div == 0) + printf("\n"); + } + + printf("\n"); + + c_ptr = &output.data[output_size - num]; + for (int i = 0; i < num; i++) + { + printf("%f ", c_ptr[i]); + if ((i + 1) % div == 0) + printf("\n"); + } + + printf("\n"); + + long long total_size = (long long)m * n * k * 2; + printf("AVER Time consuming: %.2fms, total size: %lld, (GFLOP: %.2f)\n", total_time / loops, + total_size, (double)total_size / (total_time / loops) / 1000000); + + free(input.data); + free(output.data); + free(filter.data); + + return 0; +} + +static int test_batch_conv(const int batch, const int w, const int h, const int kernel_size, + const int stride, const int inch, const int outch, const int padding, + const int conv_type, const int thread_num, const int loops) +{ + struct timeval start, end; + float total_time = 0.f; + + const int dilation = 1; + + const int kernel_dilation = dilation * (kernel_size - 1) + 1; + + convMat_t input; + convMat_t output; + convMat_t filter; + convParams_t params; + + int pad_l, pad_r, pad_t, pad_b; + if (padding) + { + int pad_w = kernel_dilation + (w - 1) / stride * stride - w; + int pad_h = kernel_dilation + (h - 1) / stride * stride - h; + pad_l = pad_w / 2; + pad_r = pad_w - pad_l; + pad_t = pad_h / 2; + pad_b = pad_h - pad_t; + } + else + { + pad_l = pad_r = pad_t = pad_b = 0; + } + + input.w = w; + input.h = h; + input.c = inch; + input.n = batch; + input.data = (float *)malloc(input.n * input.w * input.h * input.c * sizeof(float)); + if (!input.data) + return 0; + + output.w = (w + pad_l + pad_r - kernel_dilation) / stride + 1; + output.h = (h + pad_t + pad_b - kernel_dilation) / stride + 1; + output.c = outch; + output.n = batch; + output.data = (float *)malloc(output.n * output.w * output.h * output.c * sizeof(float)); + if (!output.data) + return 0; + + filter.w = kernel_size; + filter.h = kernel_size; + filter.c = inch; + 
filter.n = outch; + filter.data = (float *)malloc(filter.w * filter.h * filter.c * filter.n * sizeof(float)); + if (!filter.data) + return 0; + + for (int i = 0; i < input.w * input.h * input.c * input.n; i++) + { + input.data[i] = 0.001 + i * 0.000001; + } + + for (int i = 0; i < filter.w * filter.h * filter.c * filter.n; i++) + { + filter.data[i] = 0.001 - i * 0.000001; + } + + params.kernel_w = kernel_size; + params.kernel_h = kernel_size; + params.stride_w = stride; + params.stride_h = stride; + params.padding = padding; + params.pad_w = pad_l; + params.pad_h = pad_t; + params.dilation_w = dilation; + params.dilation_h = dilation; + + const int m = output.c; + const int n = output.w * output.h; + const int k = params.kernel_h * params.kernel_w * input.c; + + if (conv_type == 1) + { + for (int nloop = 0; nloop < loops; nloop++) + + { + // printf("nloop = %d, thread_num = %d\n", nloop, thread_num); + // class srcn_sgemm my_gemm(input, filter, output, params, thread_num, col_major); + + gettimeofday(&start, NULL); + + srcn_batch_convolution2D(input, filter, output, params, NULL, thread_num, col_major); + + gettimeofday(&end, NULL); + total_time += + ((end.tv_sec * 1000000 + end.tv_usec) - (start.tv_sec * 1000000 + start.tv_usec)) / 1000; + } + } + else if (conv_type == 2) + { + float *winograd_weight; + + // trans_weight2winograd(filter, &winograd_weight); + + winogradParams_t wparams = {params.kernel_w, + params.kernel_h, + params.stride_w, + params.stride_h, + params.dilation_w, + params.dilation_h, + input.n, + w, + h, + input.c, + output.c, + thread_num, + col_major, + filter.data}; + winograd_weight = trans_weight2winograd(wparams); + + for (int nloop = 0; nloop < loops; nloop++) + + { + gettimeofday(&start, NULL); + + srcn_batch_convolution2D(input, filter, output, params, winograd_weight, thread_num, + col_major); + + gettimeofday(&end, NULL); + total_time += + ((end.tv_sec * 1000000 + end.tv_usec) - (start.tv_sec * 1000000 + start.tv_usec)) / 1000; + } + } 
+ + int div = m * n < 16 ? m * n : 16; + int num = m * n > 64 ? 64 : m * n; + + float *c_ptr = output.data; + for (int i = 0; i < num; i++) + { + printf("%f ", c_ptr[i]); + if ((i + 1) % div == 0) + printf("\n"); + } + + printf("\n"); + + c_ptr = &output.data[m * n * batch - num]; + for (int i = 0; i < num; i++) + { + printf("%f ", c_ptr[i]); + if ((i + 1) % div == 0) + printf("\n"); + } + + printf("\n"); + + long long total_size = (long long)batch * m * n * k * 2; + printf("AVER Time consuming: %.2fms, total size: %lld, (GFLOP: %.2f)\n", total_time / loops, + total_size, (double)total_size / (total_time / loops) / 1000000); + + free(input.data); + free(output.data); + free(filter.data); + + return 0; +} + +static int test_depthwise_conv(const int w, const int h, const int kernel_size, const int stride, + const int inch, const int outch, const int padding, + const int conv_type, const int thread_num, const int loops) +{ + if (outch != inch) + return -1; + struct timeval start, end; + float total_time = 0.f; + + const int dilation = 1; + + const int kernel_dilation = dilation * (kernel_size - 1) + 1; + + convMat_t input; + convMat_t output; + convMat_t filter; + convMat_t bias; + convParams_t params; + + int pad_l, pad_r, pad_t, pad_b; + if (padding) + { + int pad_w = kernel_dilation + (w - 1) / stride * stride - w; + int pad_h = kernel_dilation + (h - 1) / stride * stride - h; + pad_l = pad_w / 2; + pad_r = pad_w - pad_l; + pad_t = pad_h / 2; + pad_b = pad_h - pad_t; + } + else + { + pad_l = pad_r = pad_t = pad_b = 0; + } + + input.w = w; + input.h = h; + input.c = inch; + input.n = 1; +#ifdef NCNN + input.data = + (float *)malloc(alignSize(input.w * input.h, 16 / sizeof(float)) * input.c * sizeof(float)); +#else + input.data = (float *)malloc(input.w * input.h * input.c * sizeof(float)); +#endif + if (!input.data) + return 0; + + output.w = (w + pad_l + pad_r - kernel_dilation) / stride + 1; + output.h = (h + pad_t + pad_b - kernel_dilation) / stride + 1; + 
output.c = outch; + output.n = 1; + +#ifdef NCNN + output.data = (float *)malloc(alignSize(output.w * output.h, 16 / sizeof(float)) * output.c * + sizeof(float)); +#else + output.data = (float *)malloc(output.w * output.h * output.c * sizeof(float)); +#endif + const int gpu_data_off = output.w * output.h * output.c; + if (!output.data) + return 0; + + for (int i = 0; i < output.w * output.h * output.c; i++) + { + output.data[i] = 1.f; + } + + filter.w = kernel_size; + filter.h = kernel_size; + filter.c = 1; + filter.n = outch; + filter.data = (float *)malloc(filter.w * filter.h * filter.c * filter.n * sizeof(float)); + if (!filter.data) + return 0; + + for (int i = 0; i < input.w * input.h * input.c; i++) + { + input.data[i] = 0.001 + i * 0.000001; + } + + for (int i = 0; i < filter.w * filter.h * filter.c * filter.n; i++) + { + filter.data[i] = 0.001 - i * 0.000001; + } + + bias.w = outch; + bias.data = (float *)malloc(bias.w * sizeof(float)); + if (!bias.data) + return 0; + for (int i = 0; i < bias.w; i++) + { + bias.data[i] = 0.f; + } + + params.kernel_w = kernel_size; + params.kernel_h = kernel_size; + params.stride_w = stride; + params.stride_h = stride; + params.padding = padding; + params.pad_w = pad_l; + params.pad_h = pad_t; + params.dilation_w = dilation; + params.dilation_h = dilation; + + const int m = output.c; + const int n = output.w * output.h; + const int k = params.kernel_h * params.kernel_w * input.c; + + // ocl_context_t context; + size_t local_min[2] = {4, 4}; + /** + if(conv_type == 1) + { + if(init_gpu(&context) < 0) return -1; + depthwise_conv_3x3S1_tune(&context, &input, &filter, &output, local_min); + }**/ + + gettimeofday(&start, NULL); + if (conv_type == 0) + srcn_depthwise_conv(input, filter, output, bias, params, 4, + row_major); // convdw3x3s1_neon(input, output, filter, filter); + // else if(conv_type == 1) depthwise_conv_gpu3x3S1(&context, &input, &filter, &output, ¶ms, + // local_min); + else if (conv_type == 2) + { + for (int i = 
0; i < input.c; i++) + { + convMat_t _input; + convMat_t _output; + convMat_t _filter; + convParams_t _params = params; + + _input.w = input.w; + _input.h = input.h; + _input.c = 1; + _input.n = 1; +#ifdef NCNN + _input.data = input.data + i * alignSize(input.w * input.h, 16 / sizeof(float)); +#else + _input.data = input.data + i * input.w * input.h; +#endif + + _output.w = output.w; + _output.h = output.h; + _output.c = 1; + _output.n = 1; +#ifdef NCNN + _output.data = output.data + i * alignSize(output.w * output.h, 16 / sizeof(float)); +#else + _output.data = output.data + i * output.w * output.h; +#endif + _filter.w = filter.w; + _filter.h = filter.h; + _filter.c = 1; // filter.c; + _filter.n = 1; // filter.n; + _filter.data = filter.data + i * 9; + + srcn_convolution2D(_input, _filter, _output, _params, NULL, 1, row_major); + // direct_conv_rowmajor(&_input, &_output, &_filter, &_params); + } + } + + gettimeofday(&end, NULL); + total_time += + ((end.tv_sec * 1000000 + end.tv_usec) - (start.tv_sec * 1000000 + start.tv_usec)) / 1000; + + int div = m * n < 16 ? m * n : 16; + int num = m * n > 64 ? 
64 : m * n; + + if (conv_type == 0) + printf("[CPU RESULT]\n"); + else if (conv_type == 1) + printf("[GPU RESULT]\n"); + float *c_ptr = output.data; + for (int i = 0; i < num; i++) + { + printf("%f ", c_ptr[i]); + if ((i + 1) % div == 0) + printf("\n"); + } + + printf("\n"); + + c_ptr = &output.data[m * n - num]; + for (int i = 0; i < num; i++) + { + printf("%f ", c_ptr[i]); + if ((i + 1) % div == 0) + printf("\n"); + } + + printf("\n"); + + long long total_size = (long long)m * n * k * 2; + printf("AVER Time consuming: %.2fms, total size: %lld, (GFLOP: %.2f)\n", total_time / loops, + total_size, (double)total_size / (total_time / loops) / 1000000); + + free(input.data); + free(output.data); + free(filter.data); + free(bias.data); + + return 0; +} + +//#define TEST_SGEMM +#define TEST_CONV +//#define TEST_DECONV +//#define TEST_BATCH_CONV +//#define TEST_DEPTHWISE_CONV + +int main(int argc, char **argv) +{ +#ifdef TEST_SGEMM + if (argc < 6) + return 0; + + const int m = atoi(argv[1]); + const int n = atoi(argv[2]); + const int k = atoi(argv[3]); + const int type = atoi(argv[4]); + const int loops = atoi(argv[5]); + + test_sgemm(m, n, k, type, loops); +#elif (defined TEST_CONV) + if (argc < 10) + return 0; + const int w = atoi(argv[1]); + const int h = atoi(argv[2]); + const int kernel_size = atoi(argv[3]); + const int stride = atoi(argv[4]); + const int outch = atoi(argv[5]); + const int inch = atoi(argv[6]); + const int padding = atoi(argv[7]); + const int conv_type = atoi(argv[8]); + const int thread_num = atoi(argv[9]); + int loops = 1; + if (argc > 10) + loops = atoi(argv[10]); + test_conv(w, h, kernel_size, stride, inch, outch, padding, conv_type, thread_num, loops); +#elif (defined TEST_DECONV) + if (argc < 10) + return 0; + const int w = atoi(argv[1]); + const int h = atoi(argv[2]); + const int kernel_size = atoi(argv[3]); + const int stride = atoi(argv[4]); + const int outch = atoi(argv[5]); + const int inch = atoi(argv[6]); + const int padding = 
atoi(argv[7]); + const int conv_type = atoi(argv[8]); + const int thread_num = atoi(argv[9]); + int loops = 1; + if (argc > 10) + loops = atoi(argv[10]); + test_deconv(w, h, kernel_size, stride, inch, outch, padding, conv_type, thread_num, loops); +#elif (defined TEST_BATCH_CONV) + if (argc < 11) + return 0; + const int batch = atoi(argv[1]); + const int w = atoi(argv[2]); + const int h = atoi(argv[3]); + const int kernel_size = atoi(argv[4]); + const int stride = atoi(argv[5]); + const int outch = atoi(argv[6]); + const int inch = atoi(argv[7]); + const int padding = atoi(argv[8]); + const int conv_type = atoi(argv[9]); + const int thread_num = atoi(argv[10]); + int loops = 1; + if (argc > 11) + loops = atoi(argv[11]); + test_batch_conv(batch, w, h, kernel_size, stride, inch, outch, padding, conv_type, thread_num, + loops); +#elif (defined TEST_DEPTHWISE_CONV) + if (argc < 10) + return 0; + const int w = atoi(argv[1]); + const int h = atoi(argv[2]); + const int kernel_size = atoi(argv[3]); + const int stride = atoi(argv[4]); + const int outch = atoi(argv[5]); + const int inch = atoi(argv[6]); + const int padding = atoi(argv[7]); + const int conv_type = atoi(argv[8]); + const int thread_num = atoi(argv[9]); + int loops = 1; + if (argc > 10) + loops = atoi(argv[10]); + test_depthwise_conv(w, h, kernel_size, stride, inch, outch, padding, conv_type, thread_num, + loops); +#endif + + return 0; +} + +} // namespace srcn +} // namespace nnfw |