1 files changed, 614 insertions, 0 deletions
diff --git a/compute/ncnn/src/srcn/srcn_conv.cc b/compute/ncnn/src/srcn/srcn_conv.cc
new file mode 100644
index 000000000..bb8e4f13e
--- /dev/null
+++ b/compute/ncnn/src/srcn/srcn_conv.cc
@@ -0,0 +1,614 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef _OPENMP
+#include <omp.h>
+#endif
+
+#include "ncnn/srcn/conv_type.h"
+#include "common.h"
+#include "sgemm_singlethread.h"
+#include "conv_sgemm_singlethread.h"
+#include "conv_sgemm_multithreads.h"
+#include "conv_winograd.h"
+#include "direct_conv_colmajor.h"
+#include "winograd.h"
+
+#include "deconv_sgemm_multithreads.h"
+#include "conv_sparse.h"
+#include "conv_winograd_batch.h"
+
+namespace nnfw
+{
+namespace srcn
+{
+
+static inline void weight_transfer(float *out, float *in, int H, int W, int C, int N)
+{
+  // HWCN ---> NCHW
+  for (int h = 0; h < H; ++h)
+  {
+    for (int w = 0; w < W; ++w)
+    {
+      for (int c = 0; c < C; ++c)
+      {
+        for (int n = 0; n < N; ++n)
+        {
+          int index_in = h * W * C * N + w * C * N + c * N + n;
+          int index_out = n * C * H * W + c * H * W + h * W + w;
+          out[index_out] = in[index_in];
+        }
+      }
+    }
+  }
+}
+
+int check_winograd(winogradParams_t &params)
+{
+  int winograd_flag =
+      ((params.kernel_w == params.kernel_h) && (params.stride_w == params.stride_h) &&
+       (params.kernel_w == 3 || params.kernel_w == 5) && (params.stride_w == 1) &&
+       (params.dilation_w == 1) && (params.dilation_h == 1));
+
+  int winograd_channel_cond = 64 * 64;
+  int winograd_image_cond = 10 * 10;
+
+#ifdef TIZEN
+  if (params.num_threads > 1)
+  {
+    winograd_channel_cond = 128 * 128;
+    winograd_image_cond = 20 * 20;
+  }
+#endif // TIZEN
+
+  winograd_flag &= (params.inch * params.outch >= winograd_channel_cond);
+
+  if (params.w > 0 && params.h > 0 && params.batch == 1)
+  {
+    winograd_flag &= (params.w * params.h >= winograd_image_cond);
+  }
+
+  return winograd_flag;
+}
+
+float *trans_weight2winograd(winogradParams_t &params, unsigned int *size = NULL)
+{
+  int M, N;
+  const double *G;
+
+  float *winograd_weight;
+
+  int winograd_channel_cond = 64 * 64;
+  int winograd_image_cond = 10 * 10;
+
+#ifdef TIZEN
+  if (params.num_threads > 1)
+  {
+    winograd_channel_cond = 128 * 128;
+    // int winograd_image_cond = 20 * 20;
+  }
+#endif // TIZEN
+
+  int winograd_flag =
+      ((params.kernel_w == params.kernel_h) && (params.stride_w == params.stride_h) &&
+       (params.kernel_w == 3 || params.kernel_w == 5) && (params.stride_w == 1) &&
+       (params.dilation_w == 1) && (params.dilation_h == 1));
+  if (!winograd_flag)
+    return NULL;
+
+  winograd_flag = (params.inch * params.outch >= winograd_channel_cond);
+
+  if (!winograd_flag)
+    return NULL;
+
+  if (params.w > 0 && params.h > 0 && params.batch == 1)
+  {
+    winograd_flag &= (params.w * params.h >= winograd_image_cond);
+    if (!winograd_flag)
+      return NULL;
+  }
+
+  const int kernel_size = params.kernel_w;
+  const int inch = params.inch;
+  const int outch = params.outch;
+  float *weight_data = params.weight_data;
+
+  /*Step 1: transfer weight to winograd domain*/
+  if (kernel_size == 3)
+  {
+    if (params.w == 4 && params.batch > 1)
+    {
+      M = winograd_para_3x3s1_2::M;
+      N = winograd_para_3x3s1_2::N;
+      G = winograd_para_3x3s1_2::getG();
+    }
+    else
+    {
+      M = winograd_para_3x3s1::M;
+      N = winograd_para_3x3s1::N;
+      G = winograd_para_3x3s1::getG();
+    }
+  }
+  else
+  {
+    M = winograd_para_5x5s1::M;
+    N = winograd_para_5x5s1::N;
+    G = winograd_para_5x5s1::getG();
+  }
+
+  int tile_h_in_, tile_w_in_;
+  tile_h_in_ = tile_w_in_ = M;
+
+  if (size)
+    *size = tile_h_in_ * tile_w_in_ * inch * outch;
+
+  winograd_weight = new float[tile_h_in_ * tile_w_in_ * inch * outch];
+  if (!winograd_weight)
+    return NULL;
+
+  float *winograd_g = new float[M * M * N * N];
+  if (!winograd_g)
+  {
+    delete[] winograd_weight;
+    return NULL;
+  }
+
+  kronecker_product(winograd_g, G, G, M, N, M, N);
+
+  if (params.conv_type == col_major)
+  {
+    weight_data = new float[kernel_size * kernel_size * inch * outch];
+    if (!weight_data)
+    {
+      delete[] winograd_weight;
+      delete[] winograd_g;
+      return NULL;
+    }
+    weight_transfer(weight_data, params.weight_data, kernel_size, kernel_size, inch, outch);
+  }
+
+  class sgemm_singlethread sgemm(rowMajor, notrans, trans, tile_h_in_ * tile_w_in_, inch * outch,
+                                 kernel_size * kernel_size, winograd_g, weight_data,
+                                 winograd_weight, 1);
+
+  sgemm.run();
+
+  if (params.conv_type == col_major)
+    delete[] weight_data;
+
+  delete[] winograd_g;
+
+  return winograd_weight;
+}
+
+void winograd_release(float *winograd_weight)
+{
+  if (winograd_weight)
+    delete[] winograd_weight;
+}
+
+void srcn_convolution2D(const convMat_t &in_mat, const convMat_t &weights_mat, convMat_t &out_mat,
+                        const convParams_t &in_param, const float *winograd_weight, int num_threads,
+                        convType_t conv_type)
+{
+  const int outw = out_mat.w;
+  const int outh = out_mat.h;
+  const int inch = in_mat.c;
+  const int outch = out_mat.c;
+
+  int winograd_flag =
+      ((in_param.kernel_w == in_param.kernel_h) && (in_param.stride_w == in_param.stride_h) &&
+       (in_param.kernel_w == 3 || in_param.kernel_w == 5) && (in_param.stride_w == 1) &&
+       (winograd_weight) && (in_param.dilation_w == 1) && (in_param.dilation_h == 1));
+
+  int direct_flag = ((conv_type == col_major) && (in_param.stride_w == in_param.stride_h) &&
+                     (in_param.dilation_w == 1) && (in_param.dilation_h == 1));
+
+  int winograd_image_cond = 10 * 10;
+  int winograd_channel_cond = 64 * 64;
+  int direct_image_cond = 4 * 4;
+  int direct_channel_cond = 16 * 16;
+
+#ifdef TIZEN
+  if (num_threads > 1)
+  {
+    winograd_image_cond = 20 * 20;
+    winograd_channel_cond = 128 * 128;
+  }
+#endif
+
+  winograd_flag &=
+      ((outw * outh >= winograd_image_cond) && (inch * outch >= winograd_channel_cond));
+  direct_flag &= ((outw * outh <= direct_image_cond) || (inch * outch <= direct_channel_cond));
+
+  if (num_threads == 1)
+  {
+    if (winograd_flag)
+    {
+      class conv_winograd conv(in_mat, out_mat, in_param, conv_type, winograd_weight, num_threads,
+                               in_mat.w * in_mat.h, outw * outh, outch);
+      conv.run();
+    }
+    else if (direct_flag)
+    {
+      direct_conv_colmajor(in_mat, out_mat, weights_mat, in_param, num_threads);
+    }
+    else
+    {
+      class conv_sgemm_singlethread conv(in_mat, weights_mat, out_mat, in_param, conv_type);
+      conv.run();
+    }
+  }
+  else if (num_threads > 1)
+  {
+    if (winograd_flag)
+    {
+      const int npart = num_threads > 4 ? 4 : num_threads;
+
+      omp_set_num_threads(npart);
+
+      if (conv_type == col_major)
+      {
+        if (outch < 512)
+        {
+          const int _H = (outh + npart - 1) / npart;
+
+          if (_H < in_param.pad_h)
+          {
+            class conv_winograd conv(in_mat, out_mat, in_param, conv_type, winograd_weight, 1,
+                                     in_mat.w * in_mat.h, outw * outh, outch);
+            conv.run();
+            return;
+          }
+
+          // const int ih = (_H - 1) * in_param.stride_w + in_param.kernel_w;
+          // const int oh = _H;
+          const int nh = (outh + _H - 1) / _H;
+          int rh = outh % _H;
+          if (rh == 0)
+            rh = _H;
+
+#pragma omp parallel for
+          for (int i = 0; i < nh; i++)
+          {
+            int pad_h_part = 0;
+            convMat_t in_part;
+            convMat_t out_part;
+            const int oh = (i != nh - 1 || rh == 0) ? _H : rh;
+            const int ih = (oh - 1) * in_param.stride_w + in_param.kernel_w;
+
+            in_part.w = in_mat.w;
+            in_part.c = inch;
+            out_part.w = outw;
+            out_part.c = outch;
+            in_part.h = ih;
+            out_part.h = oh;
+
+            int bottom_offset = i * _H - in_param.pad_h;
+            if (bottom_offset < 0)
+            {
+              bottom_offset = 0;
+              pad_h_part = in_param.pad_h;
+            }
+            in_part.data = in_mat.data + bottom_offset * in_mat.w * inch * in_param.stride_w;
+            if (ih + bottom_offset > in_mat.h)
+            {
+              in_part.h = in_mat.h - bottom_offset;
+            }
+
+            out_part.data = out_mat.data + i * _H * outw * outch;
+
+            convParams_t params = {
+                in_param.kernel_w, in_param.kernel_h, in_param.stride_w, in_param.stride_h, 1, 1,
+                in_param.padding,  in_param.pad_w,    pad_h_part};
+
+            class conv_winograd conv(in_part, out_part, params, conv_type, winograd_weight,
+                                     num_threads, in_mat.w * in_mat.h, outw * outh, outch);
+            conv.run();
+          }
+        }
+        else
+        {
+          const int _OUTC = (outch + npart - 1) / npart;
+
+          const int nc = (outch + _OUTC - 1) / _OUTC;
+          int rc = out_mat.c % _OUTC;
+          if (rc == 0)
+            rc = _OUTC;
+
+#pragma omp parallel for
+          for (int i = 0; i < nc; i++)
+          {
+            const float *weight_part;
+            convMat_t out_part;
+
+            const int oc = (i != nc - 1 || rc == 0) ? _OUTC : rc;
+
+            out_part.w = outw;
+            out_part.h = outh;
+            out_part.c = oc;
+            out_part.data = out_mat.data + i * _OUTC;
+            weight_part = winograd_weight + i * _OUTC * inch;
+            class conv_winograd conv(in_mat, out_part, in_param, conv_type, weight_part,
+                                     num_threads, in_mat.w * in_mat.h, outw * outh, outch);
+            conv.run();
+          }
+        }
+      }
+      else if (conv_type == row_major)
+      {
+#ifdef TIZEN
+        if (outch < 512)
+#else  // TIZEN
+        if (outh >= 20)
+#endif // TIZEN
+        {
+          const int _H = (outh + npart - 1) / npart;
+
+          if (_H < in_param.pad_h)
+          {
+            class conv_winograd conv(in_mat, out_mat, in_param, conv_type, winograd_weight, 1,
+                                     in_mat.w * in_mat.h, outw * outh, outch);
+            conv.run();
+            return;
+          }
+
+          // const int ih = (_H - 1) * in_param.stride_w + in_param.kernel_w;
+          // const int oh = _H;
+          const int nh = (outh + _H - 1) / _H;
+          int rh = outh % _H;
+          if (rh == 0)
+            rh = _H;
+
+#pragma omp parallel for
+          for (int i = 0; i < nh; i++)
+          {
+            int pad_h_part = 0;
+            convMat_t in_part;
+            convMat_t out_part;
+            const int oh = (i != nh - 1 || rh == 0) ? _H : rh;
+            const int ih = (oh - 1) * in_param.stride_w + in_param.kernel_w;
+
+            in_part.w = in_mat.w;
+            in_part.c = inch;
+            out_part.w = outw;
+            out_part.c = outch;
+            in_part.h = ih;
+            out_part.h = oh;
+
+            int bottom_offset = i * _H - in_param.pad_h;
+            if (bottom_offset < 0)
+            {
+              bottom_offset = 0;
+              pad_h_part = in_param.pad_h;
+            }
+            in_part.data = in_mat.data + bottom_offset * in_mat.w * in_param.stride_w;
+            if (ih + bottom_offset > in_mat.h)
+            {
+              in_part.h = in_mat.h - bottom_offset;
+            }
+
+            out_part.data = out_mat.data + i * _H * outw;
+
+            convParams_t params = {
+                in_param.kernel_w, in_param.kernel_h, in_param.stride_w, 1,         1,
+                in_param.stride_h, in_param.padding,  in_param.pad_w,    pad_h_part};
+
+            class conv_winograd conv(in_part, out_part, params, conv_type, winograd_weight,
+                                     num_threads, in_mat.w * in_mat.h, outw * outh, outch);
+            conv.run();
+          }
+        }
+        else
+        {
+          const int _OUTC = (outch + npart - 1) / npart;
+
+          const int nc = (outch + _OUTC - 1) / _OUTC;
+          int rc = out_mat.c % _OUTC;
+          if (rc == 0)
+            rc = _OUTC;
+
+#pragma omp parallel for
+          for (int i = 0; i < nc; i++)
+          {
+            const float *weight_part;
+            convMat_t out_part;
+
+            const int oc = (i != nc - 1 || rc == 0) ? _OUTC : rc;
+
+            out_part.w = outw;
+            out_part.h = outh;
+            out_part.c = oc;
+            out_part.data = out_mat.data + i * _OUTC * outw * outh;
+            weight_part = winograd_weight + i * _OUTC * inch;
+            class conv_winograd conv(in_mat, out_part, in_param, conv_type, weight_part,
+                                     num_threads, in_mat.w * in_mat.h, outw * outh, outch);
+            conv.run();
+          }
+        }
+      }
+    }
+    else if (direct_flag)
+    {
+      direct_conv_colmajor(in_mat, out_mat, weights_mat, in_param, num_threads);
+    }
+    else
+    {
+      class conv_sgemm_multithreads conv(in_mat, weights_mat, out_mat, in_param, num_threads,
+                                         conv_type);
+      conv.run();
+    }
+  }
+}
+
+void srcn_deconvolution2D(const convMat_t &in_mat, const convMat_t &weights_mat, convMat_t &out_mat,
+                          const convParams_t &in_param, int num_threads, convType_t conv_type)
+{
+  class deconv_sgemm_multithreads deconv(in_mat, weights_mat, out_mat, in_param, num_threads,
+                                         conv_type);
+  deconv.run();
+}
+
+void *trans_weight2sparse(const convMat_t &weights_mat)
+{
+  const int kernel_w = weights_mat.w;
+  const int kernel_h = weights_mat.h;
+  const int inch = weights_mat.c;
+  const int outch = weights_mat.n;
+
+  const int nch = (outch + BCH - 1) / BCH;
+  const int rch = outch % BCH;
+
+  const float *data = weights_mat.data;
+  const int klength = inch * kernel_h * kernel_w;
+
+  sparse_weight_t *sparse_weight = new sparse_weight_t[nch];
+  if (!sparse_weight)
+    return NULL;
+
+  for (int i = 0; i < nch; i++)
+  {
+    int _bch = (i != nch - 1 || rch == 0) ? BCH : rch;
+    sparse_weight_t *sparse_weight_n = &sparse_weight[i];
+    sparse_weight_n->mxk = 0;
+
+    for (int j = 0; j < _bch; j++)
+    {
+      for (int l = 0; l < klength; l++)
+      {
+        float val = *(data + (i * BCH + j) * klength + l);
+        if (val != 0)
+        {
+          sparse_weight_n->mxk++;
+        }
+      }
+    }
+  }
+
+  for (int i = 0; i < nch; i++)
+  {
+    int _bch = (i != nch - 1 || rch == 0) ? BCH : rch;
+    sparse_weight_t *sparse_weight_n = &sparse_weight[i];
+    sparse_weight_n->wdata = new weight_data_t[sparse_weight_n->mxk];
+    int index = 0;
+
+    for (int l = 0; l < klength; l++)
+    {
+      for (int j = 0; j < _bch; j++)
+      {
+        float val = *(data + (i * BCH + j) * klength + l);
+        if (val != 0)
+        {
+          sparse_weight_n->wdata[index].m = i * BCH + j;
+          sparse_weight_n->wdata[index].k = l;
+          sparse_weight_n->wdata[index++].data = val;
+        }
+      }
+    }
+  }
+
+  return (void *)sparse_weight;
+}
+
+void sparse_release(const int outch, void *ptr)
+{
+  sparse_weight_t *sparse_weight = (sparse_weight_t *)ptr;
+  const int nch = (outch + BCH - 1) / BCH;
+
+  if (!sparse_weight)
+    return;
+
+  for (int i = 0; i < nch; i++)
+  {
+    sparse_weight_t *sparse_weight_n = &sparse_weight[i];
+    if (sparse_weight_n->wdata)
+      delete[] sparse_weight_n->wdata;
+  }
+
+  if (sparse_weight)
+    delete[] sparse_weight;
+}
+
+void srcn_sparse_convolution2D(const convMat_t &in_mat, convMat_t &out_mat,
+                               const convParams_t &in_param, const void *sparse_weight,
+                               int number_threas, convType_t conv_type)
+{
+  class conv_sparse conv(in_mat, out_mat, in_param, (const sparse_weight_t *)sparse_weight,
+                         number_threas, conv_type);
+
+  for (int i = 0; i < out_mat.c * out_mat.h * out_mat.w; i++)
+  {
+    *(out_mat.data + i) = 0;
+  }
+
+  conv.run();
+}
+
+void srcn_batch_convolution2D(const convMat_t &in_mat, const convMat_t &weights_mat,
+                              convMat_t &out_mat, const convParams_t &in_param,
+                              const float *winograd_weight, int num_threads, convType_t conv_type)
+{
+  int winograd_flag = (winograd_weight != NULL);
+
+  if (winograd_flag)
+  {
+    if (num_threads > 1)
+    {
+      omp_set_num_threads(num_threads);
+      const int batch = in_mat.n;
+      const int npart = (batch + num_threads - 1) / num_threads;
+      const int nn = (batch + npart - 1) / npart;
+      const int rn = batch % npart;
+
+#pragma omp parallel for
+      for (int i = 0; i < nn; i++)
+      {
+        const int pn = (i != nn - 1 || rn == 0) ? npart : rn;
+        convMat_t in_mat_part = {in_mat.w, in_mat.h, in_mat.c, pn,
+                                 in_mat.data + i * npart * in_mat.w * in_mat.h * in_mat.c};
+        convMat_t out_mat_part = {out_mat.w, out_mat.h, out_mat.c, pn,
+                                  out_mat.data + i * npart * out_mat.w * out_mat.h * out_mat.c};
+
+        class conv_winograd_batch conv(in_mat_part, out_mat_part, in_param, conv_type,
+                                       winograd_weight, num_threads);
+        conv.run();
+      }
+    }
+    else
+    {
+      class conv_winograd_batch conv(in_mat, out_mat, in_param, conv_type, winograd_weight,
+                                     num_threads);
+      conv.run();
+    }
+  }
+  else
+  {
+    if (num_threads == 1)
+    {
+      class conv_sgemm_singlethread conv(in_mat, weights_mat, out_mat, in_param, conv_type);
+      conv.run();
+    }
+    else
+    {
+      class conv_sgemm_multithreads conv(in_mat, weights_mat, out_mat, in_param, num_threads,
+                                         conv_type);
+      conv.run();
+    }
+  }
+}
+
+} // namespace srcn
+} // namespace nnfw