diff options
Diffstat (limited to 'compute/ncnn/src/srcn/srcn_conv.cc')
-rw-r--r-- | compute/ncnn/src/srcn/srcn_conv.cc | 614 |
1 files changed, 614 insertions, 0 deletions
diff --git a/compute/ncnn/src/srcn/srcn_conv.cc b/compute/ncnn/src/srcn/srcn_conv.cc new file mode 100644 index 000000000..bb8e4f13e --- /dev/null +++ b/compute/ncnn/src/srcn/srcn_conv.cc @@ -0,0 +1,614 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifdef _OPENMP +#include <omp.h> +#endif + +#include "ncnn/srcn/conv_type.h" +#include "common.h" +#include "sgemm_singlethread.h" +#include "conv_sgemm_singlethread.h" +#include "conv_sgemm_multithreads.h" +#include "conv_winograd.h" +#include "direct_conv_colmajor.h" +#include "winograd.h" + +#include "deconv_sgemm_multithreads.h" +#include "conv_sparse.h" +#include "conv_winograd_batch.h" + +namespace nnfw +{ +namespace srcn +{ + +static inline void weight_transfer(float *out, float *in, int H, int W, int C, int N) +{ + // HWCN ---> NCHW + for (int h = 0; h < H; ++h) + { + for (int w = 0; w < W; ++w) + { + for (int c = 0; c < C; ++c) + { + for (int n = 0; n < N; ++n) + { + int index_in = h * W * C * N + w * C * N + c * N + n; + int index_out = n * C * H * W + c * H * W + h * W + w; + out[index_out] = in[index_in]; + } + } + } + } +} + +int check_winograd(winogradParams_t ¶ms) +{ + int winograd_flag = + ((params.kernel_w == params.kernel_h) && (params.stride_w == params.stride_h) && + (params.kernel_w == 3 || params.kernel_w == 5) && (params.stride_w == 1) && + (params.dilation_w == 1) && (params.dilation_h == 1)); + + int winograd_channel_cond = 64 * 64; + int winograd_image_cond = 10 * 10; + +#ifdef TIZEN + if (params.num_threads > 1) + { + winograd_channel_cond = 128 * 128; + winograd_image_cond = 20 * 20; + } +#endif // TIZEN + + winograd_flag &= (params.inch * params.outch >= winograd_channel_cond); + + if (params.w > 0 && params.h > 0 && params.batch == 1) + { + winograd_flag &= (params.w * params.h >= winograd_image_cond); + } + + return winograd_flag; +} + +float *trans_weight2winograd(winogradParams_t ¶ms, unsigned int *size = NULL) +{ + int M, N; + const double *G; + + float *winograd_weight; + + int winograd_channel_cond = 64 * 64; + int winograd_image_cond = 10 * 10; + +#ifdef TIZEN + if (params.num_threads > 1) + { + winograd_channel_cond = 128 * 128; + // int winograd_image_cond = 20 * 20; + } +#endif // TIZEN + + int winograd_flag = + ((params.kernel_w == params.kernel_h) && (params.stride_w == params.stride_h) && + (params.kernel_w == 3 || params.kernel_w == 5) && (params.stride_w == 1) && + (params.dilation_w == 1) && (params.dilation_h == 1)); + if (!winograd_flag) + return NULL; + + winograd_flag = (params.inch * params.outch >= winograd_channel_cond); + + if (!winograd_flag) + return NULL; + + if (params.w > 0 && params.h > 0 && params.batch == 1) + { + winograd_flag &= (params.w * params.h >= winograd_image_cond); + if (!winograd_flag) + return NULL; + } + + const int kernel_size = params.kernel_w; + const int inch = params.inch; + const int outch = params.outch; + float *weight_data = params.weight_data; + + /*Step 1: transfer weight to winograd domain*/ + if (kernel_size == 3) + { + if (params.w == 4 && params.batch > 1) + { + M = winograd_para_3x3s1_2::M; + N = winograd_para_3x3s1_2::N; + G = winograd_para_3x3s1_2::getG(); + } + else + { + M = winograd_para_3x3s1::M; + N = winograd_para_3x3s1::N; + G = winograd_para_3x3s1::getG(); + } + } + else + { + M = winograd_para_5x5s1::M; + N = winograd_para_5x5s1::N; + G = winograd_para_5x5s1::getG(); + } + + int tile_h_in_, tile_w_in_; + tile_h_in_ = tile_w_in_ = M; + + if (size) + *size = tile_h_in_ * tile_w_in_ * inch * outch; + + winograd_weight = new float[tile_h_in_ * tile_w_in_ * inch * outch]; + if (!winograd_weight) + return NULL; + + float *winograd_g = new float[M * M * N * N]; + if (!winograd_g) + { + delete[] winograd_weight; + return NULL; + } + + kronecker_product(winograd_g, G, G, M, N, M, N); + + if (params.conv_type == col_major) + { + weight_data = new float[kernel_size * kernel_size * inch * outch]; + if (!weight_data) + { + delete[] winograd_weight; + delete[] winograd_g; + return NULL; + } + weight_transfer(weight_data, params.weight_data, kernel_size, kernel_size, inch, outch); + } + + class sgemm_singlethread sgemm(rowMajor, notrans, trans, tile_h_in_ * tile_w_in_, inch * outch, + kernel_size * kernel_size, winograd_g, weight_data, + winograd_weight, 1); + + sgemm.run(); + + if (params.conv_type == col_major) + delete[] weight_data; + + delete[] winograd_g; + + return winograd_weight; +} + +void winograd_release(float *winograd_weight) +{ + if (winograd_weight) + delete[] winograd_weight; +} + +void srcn_convolution2D(const convMat_t &in_mat, const convMat_t &weights_mat, convMat_t &out_mat, + const convParams_t &in_param, const float *winograd_weight, int num_threads, + convType_t conv_type) +{ + const int outw = out_mat.w; + const int outh = out_mat.h; + const int inch = in_mat.c; + const int outch = out_mat.c; + + int winograd_flag = + ((in_param.kernel_w == in_param.kernel_h) && (in_param.stride_w == in_param.stride_h) && + (in_param.kernel_w == 3 || in_param.kernel_w == 5) && (in_param.stride_w == 1) && + (winograd_weight) && (in_param.dilation_w == 1) && (in_param.dilation_h == 1)); + + int direct_flag = ((conv_type == col_major) && (in_param.stride_w == in_param.stride_h) && + (in_param.dilation_w == 1) && (in_param.dilation_h == 1)); + + int winograd_image_cond = 10 * 10; + int winograd_channel_cond = 64 * 64; + int direct_image_cond = 4 * 4; + int direct_channel_cond = 16 * 16; + +#ifdef TIZEN + if (num_threads > 1) + { + winograd_image_cond = 20 * 20; + winograd_channel_cond = 128 * 128; + } +#endif + + winograd_flag &= + ((outw * outh >= winograd_image_cond) && (inch * outch >= winograd_channel_cond)); + direct_flag &= ((outw * outh <= direct_image_cond) || (inch * outch <= direct_channel_cond)); + + if (num_threads == 1) + { + if (winograd_flag) + { + class conv_winograd conv(in_mat, out_mat, in_param, conv_type, winograd_weight, num_threads, + in_mat.w * in_mat.h, outw * outh, outch); + conv.run(); + } + else if (direct_flag) + { + direct_conv_colmajor(in_mat, out_mat, weights_mat, in_param, num_threads); + } + else + { + class conv_sgemm_singlethread conv(in_mat, weights_mat, out_mat, in_param, conv_type); + conv.run(); + } + } + else if (num_threads > 1) + { + if (winograd_flag) + { + const int npart = num_threads > 4 ? 4 : num_threads; + + omp_set_num_threads(npart); + + if (conv_type == col_major) + { + if (outch < 512) + { + const int _H = (outh + npart - 1) / npart; + + if (_H < in_param.pad_h) + { + class conv_winograd conv(in_mat, out_mat, in_param, conv_type, winograd_weight, 1, + in_mat.w * in_mat.h, outw * outh, outch); + conv.run(); + return; + } + + // const int ih = (_H - 1) * in_param.stride_w + in_param.kernel_w; + // const int oh = _H; + const int nh = (outh + _H - 1) / _H; + int rh = outh % _H; + if (rh == 0) + rh = _H; + +#pragma omp parallel for + for (int i = 0; i < nh; i++) + { + int pad_h_part = 0; + convMat_t in_part; + convMat_t out_part; + const int oh = (i != nh - 1 || rh == 0) ? _H : rh; + const int ih = (oh - 1) * in_param.stride_w + in_param.kernel_w; + + in_part.w = in_mat.w; + in_part.c = inch; + out_part.w = outw; + out_part.c = outch; + in_part.h = ih; + out_part.h = oh; + + int bottom_offset = i * _H - in_param.pad_h; + if (bottom_offset < 0) + { + bottom_offset = 0; + pad_h_part = in_param.pad_h; + } + in_part.data = in_mat.data + bottom_offset * in_mat.w * inch * in_param.stride_w; + if (ih + bottom_offset > in_mat.h) + { + in_part.h = in_mat.h - bottom_offset; + } + + out_part.data = out_mat.data + i * _H * outw * outch; + + convParams_t params = { + in_param.kernel_w, in_param.kernel_h, in_param.stride_w, in_param.stride_h, 1, 1, + in_param.padding, in_param.pad_w, pad_h_part}; + + class conv_winograd conv(in_part, out_part, params, conv_type, winograd_weight, + num_threads, in_mat.w * in_mat.h, outw * outh, outch); + conv.run(); + } + } + else + { + const int _OUTC = (outch + npart - 1) / npart; + + const int nc = (outch + _OUTC - 1) / _OUTC; + int rc = out_mat.c % _OUTC; + if (rc == 0) + rc = _OUTC; + +#pragma omp parallel for + for (int i = 0; i < nc; i++) + { + const float *weight_part; + convMat_t out_part; + + const int oc = (i != nc - 1 || rc == 0) ? _OUTC : rc; + + out_part.w = outw; + out_part.h = outh; + out_part.c = oc; + out_part.data = out_mat.data + i * _OUTC; + weight_part = winograd_weight + i * _OUTC * inch; + class conv_winograd conv(in_mat, out_part, in_param, conv_type, weight_part, + num_threads, in_mat.w * in_mat.h, outw * outh, outch); + conv.run(); + } + } + } + else if (conv_type == row_major) + { +#ifdef TIZEN + if (outch < 512) +#else // TIZEN + if (outh >= 20) +#endif // TIZEN + { + const int _H = (outh + npart - 1) / npart; + + if (_H < in_param.pad_h) + { + class conv_winograd conv(in_mat, out_mat, in_param, conv_type, winograd_weight, 1, + in_mat.w * in_mat.h, outw * outh, outch); + conv.run(); + return; + } + + // const int ih = (_H - 1) * in_param.stride_w + in_param.kernel_w; + // const int oh = _H; + const int nh = (outh + _H - 1) / _H; + int rh = outh % _H; + if (rh == 0) + rh = _H; + +#pragma omp parallel for + for (int i = 0; i < nh; i++) + { + int pad_h_part = 0; + convMat_t in_part; + convMat_t out_part; + const int oh = (i != nh - 1 || rh == 0) ? _H : rh; + const int ih = (oh - 1) * in_param.stride_w + in_param.kernel_w; + + in_part.w = in_mat.w; + in_part.c = inch; + out_part.w = outw; + out_part.c = outch; + in_part.h = ih; + out_part.h = oh; + + int bottom_offset = i * _H - in_param.pad_h; + if (bottom_offset < 0) + { + bottom_offset = 0; + pad_h_part = in_param.pad_h; + } + in_part.data = in_mat.data + bottom_offset * in_mat.w * in_param.stride_w; + if (ih + bottom_offset > in_mat.h) + { + in_part.h = in_mat.h - bottom_offset; + } + + out_part.data = out_mat.data + i * _H * outw; + + convParams_t params = { + in_param.kernel_w, in_param.kernel_h, in_param.stride_w, 1, 1, + in_param.stride_h, in_param.padding, in_param.pad_w, pad_h_part}; + + class conv_winograd conv(in_part, out_part, params, conv_type, winograd_weight, + num_threads, in_mat.w * in_mat.h, outw * outh, outch); + conv.run(); + } + } + else + { + const int _OUTC = (outch + npart - 1) / npart; + + const int nc = (outch + _OUTC - 1) / _OUTC; + int rc = out_mat.c % _OUTC; + if (rc == 0) + rc = _OUTC; + +#pragma omp parallel for + for (int i = 0; i < nc; i++) + { + const float *weight_part; + convMat_t out_part; + + const int oc = (i != nc - 1 || rc == 0) ? _OUTC : rc; + + out_part.w = outw; + out_part.h = outh; + out_part.c = oc; + out_part.data = out_mat.data + i * _OUTC * outw * outh; + weight_part = winograd_weight + i * _OUTC * inch; + class conv_winograd conv(in_mat, out_part, in_param, conv_type, weight_part, + num_threads, in_mat.w * in_mat.h, outw * outh, outch); + conv.run(); + } + } + } + } + else if (direct_flag) + { + direct_conv_colmajor(in_mat, out_mat, weights_mat, in_param, num_threads); + } + else + { + class conv_sgemm_multithreads conv(in_mat, weights_mat, out_mat, in_param, num_threads, + conv_type); + conv.run(); + } + } +} + +void srcn_deconvolution2D(const convMat_t &in_mat, const convMat_t &weights_mat, convMat_t &out_mat, + const convParams_t &in_param, int num_threads, convType_t conv_type) +{ + class deconv_sgemm_multithreads deconv(in_mat, weights_mat, out_mat, in_param, num_threads, + conv_type); + deconv.run(); +} + +void *trans_weight2sparse(const convMat_t &weights_mat) +{ + const int kernel_w = weights_mat.w; + const int kernel_h = weights_mat.h; + const int inch = weights_mat.c; + const int outch = weights_mat.n; + + const int nch = (outch + BCH - 1) / BCH; + const int rch = outch % BCH; + + const float *data = weights_mat.data; + const int klength = inch * kernel_h * kernel_w; + + sparse_weight_t *sparse_weight = new sparse_weight_t[nch]; + if (!sparse_weight) + return NULL; + + for (int i = 0; i < nch; i++) + { + int _bch = (i != nch - 1 || rch == 0) ? BCH : rch; + sparse_weight_t *sparse_weight_n = &sparse_weight[i]; + sparse_weight_n->mxk = 0; + + for (int j = 0; j < _bch; j++) + { + for (int l = 0; l < klength; l++) + { + float val = *(data + (i * BCH + j) * klength + l); + if (val != 0) + { + sparse_weight_n->mxk++; + } + } + } + } + + for (int i = 0; i < nch; i++) + { + int _bch = (i != nch - 1 || rch == 0) ? BCH : rch; + sparse_weight_t *sparse_weight_n = &sparse_weight[i]; + sparse_weight_n->wdata = new weight_data_t[sparse_weight_n->mxk]; + int index = 0; + + for (int l = 0; l < klength; l++) + { + for (int j = 0; j < _bch; j++) + { + float val = *(data + (i * BCH + j) * klength + l); + if (val != 0) + { + sparse_weight_n->wdata[index].m = i * BCH + j; + sparse_weight_n->wdata[index].k = l; + sparse_weight_n->wdata[index++].data = val; + } + } + } + } + + return (void *)sparse_weight; +} + +void sparse_release(const int outch, void *ptr) +{ + sparse_weight_t *sparse_weight = (sparse_weight_t *)ptr; + const int nch = (outch + BCH - 1) / BCH; + + if (!sparse_weight) + return; + + for (int i = 0; i < nch; i++) + { + sparse_weight_t *sparse_weight_n = &sparse_weight[i]; + if (sparse_weight_n->wdata) + delete[] sparse_weight_n->wdata; + } + + if (sparse_weight) + delete[] sparse_weight; +} + +void srcn_sparse_convolution2D(const convMat_t &in_mat, convMat_t &out_mat, + const convParams_t &in_param, const void *sparse_weight, + int number_threas, convType_t conv_type) +{ + class conv_sparse conv(in_mat, out_mat, in_param, (const sparse_weight_t *)sparse_weight, + number_threas, conv_type); + + for (int i = 0; i < out_mat.c * out_mat.h * out_mat.w; i++) + { + *(out_mat.data + i) = 0; + } + + conv.run(); +} + +void srcn_batch_convolution2D(const convMat_t &in_mat, const convMat_t &weights_mat, + convMat_t &out_mat, const convParams_t &in_param, + const float *winograd_weight, int num_threads, convType_t conv_type) +{ + int winograd_flag = (winograd_weight != NULL); + + if (winograd_flag) + { + if (num_threads > 1) + { + omp_set_num_threads(num_threads); + const int batch = in_mat.n; + const int npart = (batch + num_threads - 1) / num_threads; + const int nn = (batch + npart - 1) / npart; + const int rn = batch % npart; + +#pragma omp parallel for + for (int i = 0; i < nn; i++) + { + const int pn = (i != nn - 1 || rn == 0) ? npart : rn; + convMat_t in_mat_part = {in_mat.w, in_mat.h, in_mat.c, pn, + in_mat.data + i * npart * in_mat.w * in_mat.h * in_mat.c}; + convMat_t out_mat_part = {out_mat.w, out_mat.h, out_mat.c, pn, + out_mat.data + i * npart * out_mat.w * out_mat.h * out_mat.c}; + + class conv_winograd_batch conv(in_mat_part, out_mat_part, in_param, conv_type, + winograd_weight, num_threads); + conv.run(); + } + } + else + { + class conv_winograd_batch conv(in_mat, out_mat, in_param, conv_type, winograd_weight, + num_threads); + conv.run(); + } + } + else + { + if (num_threads == 1) + { + class conv_sgemm_singlethread conv(in_mat, weights_mat, out_mat, in_param, conv_type); + conv.run(); + } + else + { + class conv_sgemm_multithreads conv(in_mat, weights_mat, out_mat, in_param, num_threads, + conv_type); + conv.run(); + } + } +} + +} // namespace srcn +} // namespace nnfw |