Diffstat (limited to 'compute/ncnn/src/srcn/srcn_conv.cc')
-rw-r--r--  compute/ncnn/src/srcn/srcn_conv.cc  614
1 file changed, 614 insertions(+), 0 deletions(-)
diff --git a/compute/ncnn/src/srcn/srcn_conv.cc b/compute/ncnn/src/srcn/srcn_conv.cc
new file mode 100644
index 000000000..bb8e4f13e
--- /dev/null
+++ b/compute/ncnn/src/srcn/srcn_conv.cc
@@ -0,0 +1,614 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef _OPENMP
+#include <omp.h>
+#endif
+
+#include <new> // std::nothrow, so the allocation failure checks below can actually fire
+
+#include "ncnn/srcn/conv_type.h"
+#include "common.h"
+#include "sgemm_singlethread.h"
+#include "conv_sgemm_singlethread.h"
+#include "conv_sgemm_multithreads.h"
+#include "conv_winograd.h"
+#include "direct_conv_colmajor.h"
+#include "winograd.h"
+
+#include "deconv_sgemm_multithreads.h"
+#include "conv_sparse.h"
+#include "conv_winograd_batch.h"
+
+namespace nnfw
+{
+namespace srcn
+{
+
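+// Repack a filter tensor from HWCN to NCHW; trans_weight2winograd() uses this
+// to normalize col-major weights before transforming them.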
+static inline void weight_transfer(float *out, const float *in, int H, int W, int C, int N)
+{
+ // HWCN ---> NCHW
+ for (int h = 0; h < H; ++h)
+ {
+ for (int w = 0; w < W; ++w)
+ {
+ for (int c = 0; c < C; ++c)
+ {
+ for (int n = 0; n < N; ++n)
+ {
+ int index_in = h * W * C * N + w * C * N + c * N + n;
+ int index_out = n * C * H * W + c * H * W + h * W + w;
+ out[index_out] = in[index_in];
+ }
+ }
+ }
+ }
+}
+
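+// Decide whether a convolution qualifies for the Winograd kernels: square
+// 3x3 or 5x5 filters, stride 1, no dilation, and a problem large enough
+// (channel product, and image area when the spatial size is known) for the
+// tile transforms to amortize. The thresholds appear to be empirical tuning
+// values; Tizen multi-threaded builds raise them.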
+int check_winograd(winogradParams_t &params)
+{
+ int winograd_flag =
+ ((params.kernel_w == params.kernel_h) && (params.stride_w == params.stride_h) &&
+ (params.kernel_w == 3 || params.kernel_w == 5) && (params.stride_w == 1) &&
+ (params.dilation_w == 1) && (params.dilation_h == 1));
+
+ int winograd_channel_cond = 64 * 64;
+ int winograd_image_cond = 10 * 10;
+
+#ifdef TIZEN
+ if (params.num_threads > 1)
+ {
+ winograd_channel_cond = 128 * 128;
+ winograd_image_cond = 20 * 20;
+ }
+#endif // TIZEN
+
+ winograd_flag &= (params.inch * params.outch >= winograd_channel_cond);
+
+ if (params.w > 0 && params.h > 0 && params.batch == 1)
+ {
+ winograd_flag &= (params.w * params.h >= winograd_image_cond);
+ }
+
+ return winograd_flag;
+}
+
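+// Transform the filters into the Winograd domain. Per filter this is
+// U = G * g * G^T; flattening every (inch, outch) filter lets all of them be
+// computed in one SGEMM of the Kronecker product (G (x) G) against the
+// flattened kernels. Returns a new[]-allocated buffer of
+// tile_h_in_ * tile_w_in_ * inch * outch floats (NULL if the shape does not
+// qualify); release it with winograd_release().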
+float *trans_weight2winograd(winogradParams_t &params, unsigned int *size = NULL)
+{
+ int M, N;
+ const double *G;
+
+ float *winograd_weight;
+
+ int winograd_channel_cond = 64 * 64;
+ int winograd_image_cond = 10 * 10;
+
+#ifdef TIZEN
+ if (params.num_threads > 1)
+ {
+ winograd_channel_cond = 128 * 128;
+    winograd_image_cond = 20 * 20;
+ }
+#endif // TIZEN
+
+ int winograd_flag =
+ ((params.kernel_w == params.kernel_h) && (params.stride_w == params.stride_h) &&
+ (params.kernel_w == 3 || params.kernel_w == 5) && (params.stride_w == 1) &&
+ (params.dilation_w == 1) && (params.dilation_h == 1));
+ if (!winograd_flag)
+ return NULL;
+
+ winograd_flag = (params.inch * params.outch >= winograd_channel_cond);
+
+ if (!winograd_flag)
+ return NULL;
+
+ if (params.w > 0 && params.h > 0 && params.batch == 1)
+ {
+ winograd_flag &= (params.w * params.h >= winograd_image_cond);
+ if (!winograd_flag)
+ return NULL;
+ }
+
+ const int kernel_size = params.kernel_w;
+ const int inch = params.inch;
+ const int outch = params.outch;
+ float *weight_data = params.weight_data;
+
+ /*Step 1: transfer weight to winograd domain*/
+ if (kernel_size == 3)
+ {
+ if (params.w == 4 && params.batch > 1)
+ {
+ M = winograd_para_3x3s1_2::M;
+ N = winograd_para_3x3s1_2::N;
+ G = winograd_para_3x3s1_2::getG();
+ }
+ else
+ {
+ M = winograd_para_3x3s1::M;
+ N = winograd_para_3x3s1::N;
+ G = winograd_para_3x3s1::getG();
+ }
+ }
+ else
+ {
+ M = winograd_para_5x5s1::M;
+ N = winograd_para_5x5s1::N;
+ G = winograd_para_5x5s1::getG();
+ }
+
+ int tile_h_in_, tile_w_in_;
+ tile_h_in_ = tile_w_in_ = M;
+
+ if (size)
+ *size = tile_h_in_ * tile_w_in_ * inch * outch;
+
+  winograd_weight = new (std::nothrow) float[tile_h_in_ * tile_w_in_ * inch * outch];
+ if (!winograd_weight)
+ return NULL;
+
+  float *winograd_g = new (std::nothrow) float[M * M * N * N];
+ if (!winograd_g)
+ {
+ delete[] winograd_weight;
+ return NULL;
+ }
+
+ kronecker_product(winograd_g, G, G, M, N, M, N);
+
+ if (params.conv_type == col_major)
+ {
+    weight_data = new (std::nothrow) float[kernel_size * kernel_size * inch * outch];
+ if (!weight_data)
+ {
+ delete[] winograd_weight;
+ delete[] winograd_g;
+ return NULL;
+ }
+ weight_transfer(weight_data, params.weight_data, kernel_size, kernel_size, inch, outch);
+ }
+
+ class sgemm_singlethread sgemm(rowMajor, notrans, trans, tile_h_in_ * tile_w_in_, inch * outch,
+ kernel_size * kernel_size, winograd_g, weight_data,
+ winograd_weight, 1);
+
+ sgemm.run();
+
+ if (params.conv_type == col_major)
+ delete[] weight_data;
+
+ delete[] winograd_g;
+
+ return winograd_weight;
+}
+
+void winograd_release(float *winograd_weight)
+{
+  delete[] winograd_weight; // delete[] on NULL is a no-op, no check needed
+}
+
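+// Top-level 2D convolution dispatcher. Routes the problem to one of three
+// kernels: Winograd (conv_winograd) for large stride-1 3x3/5x5 shapes, direct
+// convolution (direct_conv_colmajor) for very small col-major shapes, and
+// im2col + SGEMM (conv_sgemm_*) otherwise. The multi-threaded Winograd case
+// is partitioned by output rows when outch is small and by output channels
+// when it is large, then run under OpenMP.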
+void srcn_convolution2D(const convMat_t &in_mat, const convMat_t &weights_mat, convMat_t &out_mat,
+ const convParams_t &in_param, const float *winograd_weight, int num_threads,
+ convType_t conv_type)
+{
+ const int outw = out_mat.w;
+ const int outh = out_mat.h;
+ const int inch = in_mat.c;
+ const int outch = out_mat.c;
+
+ int winograd_flag =
+ ((in_param.kernel_w == in_param.kernel_h) && (in_param.stride_w == in_param.stride_h) &&
+ (in_param.kernel_w == 3 || in_param.kernel_w == 5) && (in_param.stride_w == 1) &&
+ (winograd_weight) && (in_param.dilation_w == 1) && (in_param.dilation_h == 1));
+
+ int direct_flag = ((conv_type == col_major) && (in_param.stride_w == in_param.stride_h) &&
+ (in_param.dilation_w == 1) && (in_param.dilation_h == 1));
+
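+  // Size thresholds mirroring check_winograd(); they must stay consistent with
+  // the conditions under which the Winograd weights were created.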
+ int winograd_image_cond = 10 * 10;
+ int winograd_channel_cond = 64 * 64;
+ int direct_image_cond = 4 * 4;
+ int direct_channel_cond = 16 * 16;
+
+#ifdef TIZEN
+ if (num_threads > 1)
+ {
+ winograd_image_cond = 20 * 20;
+ winograd_channel_cond = 128 * 128;
+ }
+#endif
+
+ winograd_flag &=
+ ((outw * outh >= winograd_image_cond) && (inch * outch >= winograd_channel_cond));
+ direct_flag &= ((outw * outh <= direct_image_cond) || (inch * outch <= direct_channel_cond));
+
+ if (num_threads == 1)
+ {
+ if (winograd_flag)
+ {
+ class conv_winograd conv(in_mat, out_mat, in_param, conv_type, winograd_weight, num_threads,
+ in_mat.w * in_mat.h, outw * outh, outch);
+ conv.run();
+ }
+ else if (direct_flag)
+ {
+ direct_conv_colmajor(in_mat, out_mat, weights_mat, in_param, num_threads);
+ }
+ else
+ {
+ class conv_sgemm_singlethread conv(in_mat, weights_mat, out_mat, in_param, conv_type);
+ conv.run();
+ }
+ }
+ else if (num_threads > 1)
+ {
+ if (winograd_flag)
+ {
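+      // The Winograd path uses at most 4 worker threads; presumably smaller
+      // partitions would no longer amortize the per-part tile transforms.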
+ const int npart = num_threads > 4 ? 4 : num_threads;
+
+#ifdef _OPENMP
+      omp_set_num_threads(npart);
+#endif // _OPENMP
+
+ if (conv_type == col_major)
+ {
+ if (outch < 512)
+ {
+ const int _H = (outh + npart - 1) / npart;
+
+ if (_H < in_param.pad_h)
+ {
+ class conv_winograd conv(in_mat, out_mat, in_param, conv_type, winograd_weight, 1,
+ in_mat.w * in_mat.h, outw * outh, outch);
+ conv.run();
+ return;
+ }
+
+ // const int ih = (_H - 1) * in_param.stride_w + in_param.kernel_w;
+ // const int oh = _H;
+ const int nh = (outh + _H - 1) / _H;
+ int rh = outh % _H;
+ if (rh == 0)
+ rh = _H;
+
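+          // Split the output into nh horizontal stripes of _H rows (the last
+          // one holds the rh remainder rows). Each stripe reads
+          // ih = (oh - 1) * stride + kernel input rows; stripe 0 keeps the top
+          // padding (pad_h_part), later stripes start at a computed offset.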
+#pragma omp parallel for
+ for (int i = 0; i < nh; i++)
+ {
+ int pad_h_part = 0;
+ convMat_t in_part;
+ convMat_t out_part;
+ const int oh = (i != nh - 1 || rh == 0) ? _H : rh;
+ const int ih = (oh - 1) * in_param.stride_w + in_param.kernel_w;
+
+ in_part.w = in_mat.w;
+ in_part.c = inch;
+ out_part.w = outw;
+ out_part.c = outch;
+ in_part.h = ih;
+ out_part.h = oh;
+
+ int bottom_offset = i * _H - in_param.pad_h;
+ if (bottom_offset < 0)
+ {
+ bottom_offset = 0;
+ pad_h_part = in_param.pad_h;
+ }
+ in_part.data = in_mat.data + bottom_offset * in_mat.w * inch * in_param.stride_w;
+ if (ih + bottom_offset > in_mat.h)
+ {
+ in_part.h = in_mat.h - bottom_offset;
+ }
+
+ out_part.data = out_mat.data + i * _H * outw * outch;
+
+ convParams_t params = {
+ in_param.kernel_w, in_param.kernel_h, in_param.stride_w, in_param.stride_h, 1, 1,
+ in_param.padding, in_param.pad_w, pad_h_part};
+
+ class conv_winograd conv(in_part, out_part, params, conv_type, winograd_weight,
+ num_threads, in_mat.w * in_mat.h, outw * outh, outch);
+ conv.run();
+ }
+ }
+ else
+ {
+ const int _OUTC = (outch + npart - 1) / npart;
+
+ const int nc = (outch + _OUTC - 1) / _OUTC;
+ int rc = out_mat.c % _OUTC;
+ if (rc == 0)
+ rc = _OUTC;
+
+#pragma omp parallel for
+ for (int i = 0; i < nc; i++)
+ {
+ const float *weight_part;
+ convMat_t out_part;
+
+ const int oc = (i != nc - 1 || rc == 0) ? _OUTC : rc;
+
+ out_part.w = outw;
+ out_part.h = outh;
+ out_part.c = oc;
+ out_part.data = out_mat.data + i * _OUTC;
+ weight_part = winograd_weight + i * _OUTC * inch;
+ class conv_winograd conv(in_mat, out_part, in_param, conv_type, weight_part,
+ num_threads, in_mat.w * in_mat.h, outw * outh, outch);
+ conv.run();
+ }
+ }
+ }
+ else if (conv_type == row_major)
+ {
+#ifdef TIZEN
+ if (outch < 512)
+#else // TIZEN
+ if (outh >= 20)
+#endif // TIZEN
+ {
+ const int _H = (outh + npart - 1) / npart;
+
+ if (_H < in_param.pad_h)
+ {
+ class conv_winograd conv(in_mat, out_mat, in_param, conv_type, winograd_weight, 1,
+ in_mat.w * in_mat.h, outw * outh, outch);
+ conv.run();
+ return;
+ }
+
+ // const int ih = (_H - 1) * in_param.stride_w + in_param.kernel_w;
+ // const int oh = _H;
+ const int nh = (outh + _H - 1) / _H;
+ int rh = outh % _H;
+ if (rh == 0)
+ rh = _H;
+
+#pragma omp parallel for
+ for (int i = 0; i < nh; i++)
+ {
+ int pad_h_part = 0;
+ convMat_t in_part;
+ convMat_t out_part;
+ const int oh = (i != nh - 1 || rh == 0) ? _H : rh;
+ const int ih = (oh - 1) * in_param.stride_w + in_param.kernel_w;
+
+ in_part.w = in_mat.w;
+ in_part.c = inch;
+ out_part.w = outw;
+ out_part.c = outch;
+ in_part.h = ih;
+ out_part.h = oh;
+
+ int bottom_offset = i * _H - in_param.pad_h;
+ if (bottom_offset < 0)
+ {
+ bottom_offset = 0;
+ pad_h_part = in_param.pad_h;
+ }
+ in_part.data = in_mat.data + bottom_offset * in_mat.w * in_param.stride_w;
+ if (ih + bottom_offset > in_mat.h)
+ {
+ in_part.h = in_mat.h - bottom_offset;
+ }
+
+ out_part.data = out_mat.data + i * _H * outw;
+
+            convParams_t params = {
+                in_param.kernel_w, in_param.kernel_h, in_param.stride_w, in_param.stride_h, 1, 1,
+                in_param.padding, in_param.pad_w, pad_h_part};
+
+ class conv_winograd conv(in_part, out_part, params, conv_type, winograd_weight,
+ num_threads, in_mat.w * in_mat.h, outw * outh, outch);
+ conv.run();
+ }
+ }
+ else
+ {
+ const int _OUTC = (outch + npart - 1) / npart;
+
+ const int nc = (outch + _OUTC - 1) / _OUTC;
+ int rc = out_mat.c % _OUTC;
+ if (rc == 0)
+ rc = _OUTC;
+
+#pragma omp parallel for
+ for (int i = 0; i < nc; i++)
+ {
+ const float *weight_part;
+ convMat_t out_part;
+
+ const int oc = (i != nc - 1 || rc == 0) ? _OUTC : rc;
+
+ out_part.w = outw;
+ out_part.h = outh;
+ out_part.c = oc;
+ out_part.data = out_mat.data + i * _OUTC * outw * outh;
+ weight_part = winograd_weight + i * _OUTC * inch;
+ class conv_winograd conv(in_mat, out_part, in_param, conv_type, weight_part,
+ num_threads, in_mat.w * in_mat.h, outw * outh, outch);
+ conv.run();
+ }
+ }
+ }
+ }
+ else if (direct_flag)
+ {
+ direct_conv_colmajor(in_mat, out_mat, weights_mat, in_param, num_threads);
+ }
+ else
+ {
+ class conv_sgemm_multithreads conv(in_mat, weights_mat, out_mat, in_param, num_threads,
+ conv_type);
+ conv.run();
+ }
+ }
+}
+
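+// Deconvolution (transposed convolution) is always routed to the SGEMM-based
+// multi-threaded kernel.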
+void srcn_deconvolution2D(const convMat_t &in_mat, const convMat_t &weights_mat, convMat_t &out_mat,
+ const convParams_t &in_param, int num_threads, convType_t conv_type)
+{
+ class deconv_sgemm_multithreads deconv(in_mat, weights_mat, out_mat, in_param, num_threads,
+ conv_type);
+ deconv.run();
+}
+
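+// Convert dense filters into the block-sparse layout consumed by conv_sparse:
+// output channels are grouped into blocks of BCH; a first pass counts each
+// block's nonzero coefficients (mxk), a second pass stores every nonzero as an
+// (m, k, value) triplet in k-major order.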
+void *trans_weight2sparse(const convMat_t &weights_mat)
+{
+ const int kernel_w = weights_mat.w;
+ const int kernel_h = weights_mat.h;
+ const int inch = weights_mat.c;
+ const int outch = weights_mat.n;
+
+ const int nch = (outch + BCH - 1) / BCH;
+ const int rch = outch % BCH;
+
+ const float *data = weights_mat.data;
+ const int klength = inch * kernel_h * kernel_w;
+
+  sparse_weight_t *sparse_weight = new (std::nothrow) sparse_weight_t[nch];
+ if (!sparse_weight)
+ return NULL;
+
+ for (int i = 0; i < nch; i++)
+ {
+ int _bch = (i != nch - 1 || rch == 0) ? BCH : rch;
+ sparse_weight_t *sparse_weight_n = &sparse_weight[i];
+ sparse_weight_n->mxk = 0;
+
+ for (int j = 0; j < _bch; j++)
+ {
+ for (int l = 0; l < klength; l++)
+ {
+ float val = *(data + (i * BCH + j) * klength + l);
+ if (val != 0)
+ {
+ sparse_weight_n->mxk++;
+ }
+ }
+ }
+ }
+
+ for (int i = 0; i < nch; i++)
+ {
+ int _bch = (i != nch - 1 || rch == 0) ? BCH : rch;
+ sparse_weight_t *sparse_weight_n = &sparse_weight[i];
+ sparse_weight_n->wdata = new weight_data_t[sparse_weight_n->mxk];
+ int index = 0;
+
+ for (int l = 0; l < klength; l++)
+ {
+ for (int j = 0; j < _bch; j++)
+ {
+ float val = *(data + (i * BCH + j) * klength + l);
+ if (val != 0)
+ {
+ sparse_weight_n->wdata[index].m = i * BCH + j;
+ sparse_weight_n->wdata[index].k = l;
+ sparse_weight_n->wdata[index++].data = val;
+ }
+ }
+ }
+ }
+
+ return (void *)sparse_weight;
+}
+
+void sparse_release(const int outch, void *ptr)
+{
+ sparse_weight_t *sparse_weight = (sparse_weight_t *)ptr;
+ const int nch = (outch + BCH - 1) / BCH;
+
+ if (!sparse_weight)
+ return;
+
+  for (int i = 0; i < nch; i++)
+  {
+    delete[] sparse_weight[i].wdata;
+  }
+
+  delete[] sparse_weight;
+}
+
+void srcn_sparse_convolution2D(const convMat_t &in_mat, convMat_t &out_mat,
+                               const convParams_t &in_param, const void *sparse_weight,
+                               int num_threads, convType_t conv_type)
+{
+  class conv_sparse conv(in_mat, out_mat, in_param, (const sparse_weight_t *)sparse_weight,
+                         num_threads, conv_type);
+
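+  // conv_sparse accumulates into out_mat, so the output buffer is cleared
+  // first.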
+ for (int i = 0; i < out_mat.c * out_mat.h * out_mat.w; i++)
+ {
+ *(out_mat.data + i) = 0;
+ }
+
+ conv.run();
+}
+
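+// Batched convolution. With precomputed Winograd weights the batch is split
+// into groups of npart images that run in parallel under OpenMP; without them
+// it falls back to the im2col + SGEMM kernels.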
+void srcn_batch_convolution2D(const convMat_t &in_mat, const convMat_t &weights_mat,
+ convMat_t &out_mat, const convParams_t &in_param,
+ const float *winograd_weight, int num_threads, convType_t conv_type)
+{
+ int winograd_flag = (winograd_weight != NULL);
+
+ if (winograd_flag)
+ {
+ if (num_threads > 1)
+ {
+#ifdef _OPENMP
+      omp_set_num_threads(num_threads);
+#endif // _OPENMP
+ const int batch = in_mat.n;
+ const int npart = (batch + num_threads - 1) / num_threads;
+ const int nn = (batch + npart - 1) / npart;
+ const int rn = batch % npart;
+
+#pragma omp parallel for
+ for (int i = 0; i < nn; i++)
+ {
+ const int pn = (i != nn - 1 || rn == 0) ? npart : rn;
+ convMat_t in_mat_part = {in_mat.w, in_mat.h, in_mat.c, pn,
+ in_mat.data + i * npart * in_mat.w * in_mat.h * in_mat.c};
+ convMat_t out_mat_part = {out_mat.w, out_mat.h, out_mat.c, pn,
+ out_mat.data + i * npart * out_mat.w * out_mat.h * out_mat.c};
+
+ class conv_winograd_batch conv(in_mat_part, out_mat_part, in_param, conv_type,
+ winograd_weight, num_threads);
+ conv.run();
+ }
+ }
+ else
+ {
+ class conv_winograd_batch conv(in_mat, out_mat, in_param, conv_type, winograd_weight,
+ num_threads);
+ conv.run();
+ }
+ }
+ else
+ {
+ if (num_threads == 1)
+ {
+ class conv_sgemm_singlethread conv(in_mat, weights_mat, out_mat, in_param, conv_type);
+ conv.run();
+ }
+ else
+ {
+ class conv_sgemm_multithreads conv(in_mat, weights_mat, out_mat, in_param, num_threads,
+ conv_type);
+ conv.run();
+ }
+ }
+}
+
+} // namespace srcn
+} // namespace nnfw