Diffstat (limited to 'runtimes/libs/srcn/src/conv_sparse.cc')
-rw-r--r-- runtimes/libs/srcn/src/conv_sparse.cc | 271
1 file changed, 271 insertions, 0 deletions
diff --git a/runtimes/libs/srcn/src/conv_sparse.cc b/runtimes/libs/srcn/src/conv_sparse.cc
new file mode 100644
index 000000000..10e2a2b93
--- /dev/null
+++ b/runtimes/libs/srcn/src/conv_sparse.cc
@@ -0,0 +1,271 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef _OPENMP
+#include <omp.h>
+#endif
+
+#include <stdexcept>
+
+#include "common.h"
+#include "sgemm_kernel.h"
+#include "sgemm_pack.h"
+#include "conv_sparse.h"
+
+namespace nnfw
+{
+namespace srcn
+{
+
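+// Compute the blocking parameters for the sparse GEMM loops:
+//  - n_   : flattened output spatial size (rounded up to four floats when
+//           built for NCNN),
+//  - nch_ : number of output-channel blocks of size bch_ (= BCH),
+//  - bn_  : n-dimension tile, bounded so a tile of floats fits in half the
+//           L1 cache and the per-thread working set fits in half the L2,
+//  - need_im2col_ : set unless this is an unpadded 1x1, stride-1 convolution.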
+void conv_sparse::param_init()
+{
+#ifdef NCNN
+ n_ = alignSize(out_mat_.h * out_mat_.w, 16 / sizeof(float));
+#else
+ n_ = out_mat_.w * out_mat_.h;
+#endif
+
+ bch_ = BCH;
+ nch_ = (out_mat_.c + bch_ - 1) / bch_;
+
+ rch_ = out_mat_.c % bch_;
+
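+  // Bound the n tile twice: it must fit in half the L1 cache as floats, and
+  // the per-thread share of half the L2 must hold one weight block plus
+  // bch_ + 1 float rows of width bn_.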
+ bn_ = MIN(n_, L1_CACHE_SIZE / (sizeof(float) * 2));
+ bn_ = MIN(bn_, (L2_CACHE_SIZE / 2 - bch_ * sizeof(weight_data_t)) / ((bch_ + 1) * sizeof(float)) /
+ num_threads_);
+ nn_ = (n_ + bn_ - 1) / bn_;
+ rn_ = n_ % bn_;
+
+ if (in_param_.kernel_w != 1 || in_param_.kernel_h != 1 || in_param_.stride_w != 1 ||
+ in_param_.stride_h != 1 || in_param_.padding != 0)
+ {
+ need_im2col_ = 1;
+ }
+ else
+ {
+ need_im2col_ = 0;
+ }
+}
+
+conv_sparse::conv_sparse(const convMat_t &in_mat, convMat_t &out_mat, const convParams_t &in_param,
+ const sparse_weight_t *weights, int num_threads, convType_t conv_type)
+ : in_mat_(in_mat), out_mat_(out_mat), in_param_(in_param), weights_(weights),
+ num_threads_(num_threads), conv_type_(conv_type)
+{
+ param_init();
+}
+
+conv_sparse::~conv_sparse() {}
+
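+// Single-threaded path. For each output-channel block, walk its sparse
+// weight entries (m, k, value): when the input channel k changes, pack the
+// matching im2col rows into prhs_ptr, then let the sparse GEMM kernel
+// accumulate the entry into output row m. The non-im2col path reads the
+// input rows in place instead of packing.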
+void conv_sparse::compute_singlethread()
+{
+ if (need_im2col_)
+ {
+ for (int i = 0; i < nch_; i++)
+ {
+ const sparse_weight_t *weight_ptr = weights_ + i;
+ const int mxk = weight_ptr->mxk;
+ float prhs_ptr[bn_];
+
+ for (int j = 0; j < nn_; j++)
+ {
+ int k = -1;
+ const int bn = (j != nn_ - 1 || rn_ == 0) ? bn_ : rn_;
+ weight_data_t *lhs_ptr = weight_ptr->wdata;
+
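+        // Entries that share an input channel k are assumed to be stored
+        // consecutively, so the image block is packed once per channel and
+        // reused for every entry on it.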
+ for (int l = 0; l < mxk; l++)
+ {
+ if (k != lhs_ptr->k)
+ {
+ k = lhs_ptr->k;
+ _sparse_pack_rowmajor_image(bn, k, j * bn_, const_cast<convMat_t *>(&in_mat_),
+ &out_mat_, const_cast<convParams_t *>(&in_param_),
+ prhs_ptr);
+ }
+
+          // TODO: investigate why n_ = 64 x 64 is so much slower on Tizen.
+ _sparse_sgemm_kernel(bn, lhs_ptr->data, prhs_ptr,
+ &out_mat_.data[lhs_ptr->m * n_ + j * bn_]);
+
+ lhs_ptr++;
+ }
+ }
+ }
+ }
+ else
+ {
+ for (int i = 0; i < nch_; i++)
+ {
+ const sparse_weight_t *weight_ptr = weights_ + i;
+ const int mxk = weight_ptr->mxk;
+
+ for (int j = 0; j < nn_; j++)
+ {
+ const int bn = (j != nn_ - 1 || rn_ == 0) ? bn_ : rn_;
+ weight_data_t *lhs_ptr = weight_ptr->wdata;
+ float *rhs_ptr = in_mat_.data + j * bn_;
+
+ for (int l = 0; l < mxk; l++)
+ {
+          // TODO: investigate why n_ = 64 x 64 is so much slower on Tizen.
+ _sparse_sgemm_kernel(bn, lhs_ptr->data, rhs_ptr + lhs_ptr->k * n_,
+ &out_mat_.data[lhs_ptr->m * n_ + j * bn_]);
+
+ lhs_ptr++;
+ }
+ }
+ }
+ }
+}
+
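+// Multi-threaded path. When there are at least as many output-channel
+// blocks as threads (or more channel blocks than n tiles), parallelize over
+// channel blocks; otherwise parallelize over n tiles so the threads still
+// have enough work to share.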
+void conv_sparse::compute_multithreads()
+{
+ omp_set_num_threads(num_threads_);
+
+ if (nch_ >= num_threads_ || nch_ >= nn_)
+ {
+ if (need_im2col_)
+ {
+#pragma omp parallel for
+ for (int i = 0; i < nch_; i++)
+ {
+ const sparse_weight_t *weight_ptr = weights_ + i;
+ const int mxk = weight_ptr->mxk;
+ float prhs_ptr[bn_];
+
+ for (int j = 0; j < nn_; j++)
+ {
+ int k = -1;
+ const int bn = (j != nn_ - 1 || rn_ == 0) ? bn_ : rn_;
+ weight_data_t *lhs_ptr = weight_ptr->wdata;
+
+ for (int l = 0; l < mxk; l++)
+ {
+ if (k != lhs_ptr->k)
+ {
+ k = lhs_ptr->k;
+ _sparse_pack_rowmajor_image(bn, k, j * bn_, const_cast<convMat_t *>(&in_mat_),
+ &out_mat_, const_cast<convParams_t *>(&in_param_),
+ prhs_ptr);
+ }
+
+ _sparse_sgemm_kernel(bn, lhs_ptr->data, prhs_ptr,
+ &out_mat_.data[lhs_ptr->m * n_ + j * bn_]);
+
+ lhs_ptr++;
+ }
+ }
+ }
+ }
+ else
+ {
+#pragma omp parallel for
+ for (int i = 0; i < nch_; i++)
+ {
+ const sparse_weight_t *weight_ptr = weights_ + i;
+ const int mxk = weight_ptr->mxk;
+
+ for (int j = 0; j < nn_; j++)
+ {
+ const int bn = (j != nn_ - 1 || rn_ == 0) ? bn_ : rn_;
+ weight_data_t *lhs_ptr = weight_ptr->wdata;
+ float *rhs_ptr = in_mat_.data + j * bn_;
+
+ for (int l = 0; l < mxk; l++)
+ {
+ _sparse_sgemm_kernel(bn, lhs_ptr->data, rhs_ptr + lhs_ptr->k * n_,
+ &out_mat_.data[lhs_ptr->m * n_ + j * bn_]);
+
+ lhs_ptr++;
+ }
+ }
+ }
+ }
+ }
+ else
+ {
+ if (need_im2col_)
+ {
+ for (int i = 0; i < nch_; i++)
+ {
+ const sparse_weight_t *weight_ptr = weights_ + i;
+ const int mxk = weight_ptr->mxk;
+
+#pragma omp parallel for
+ for (int j = 0; j < nn_; j++)
+ {
+ int k = -1;
+ const int bn = (j != nn_ - 1 || rn_ == 0) ? bn_ : rn_;
+ weight_data_t *lhs_ptr = weight_ptr->wdata;
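+          // Declared per iteration so each OpenMP thread packs into its own
+          // buffer (note: a variable-length array, a GCC/Clang extension).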
+ float prhs_ptr[bn];
+
+ for (int l = 0; l < mxk; l++)
+ {
+ if (k != lhs_ptr->k)
+ {
+ k = lhs_ptr->k;
+ _sparse_pack_rowmajor_image(bn, k, j * bn_, const_cast<convMat_t *>(&in_mat_),
+ &out_mat_, const_cast<convParams_t *>(&in_param_),
+ prhs_ptr);
+ }
+
+ _sparse_sgemm_kernel(bn, lhs_ptr->data, prhs_ptr,
+ &out_mat_.data[lhs_ptr->m * n_ + j * bn_]);
+
+ lhs_ptr++;
+ }
+ }
+ }
+ }
+ else
+ {
+ for (int i = 0; i < nch_; i++)
+ {
+ const sparse_weight_t *weight_ptr = weights_ + i;
+ const int mxk = weight_ptr->mxk;
+
+#pragma omp parallel for
+ for (int j = 0; j < nn_; j++)
+ {
+ const int bn = (j != nn_ - 1 || rn_ == 0) ? bn_ : rn_;
+ weight_data_t *lhs_ptr = weight_ptr->wdata;
+ float *rhs_ptr = in_mat_.data + j * bn_;
+
+ for (int l = 0; l < mxk; l++)
+ {
+ _sparse_sgemm_kernel(bn, lhs_ptr->data, rhs_ptr + lhs_ptr->k * n_,
+ &out_mat_.data[lhs_ptr->m * n_ + j * bn_]);
+
+ lhs_ptr++;
+ }
+ }
+ }
+ }
+ }
+}
+
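+// Dispatch to the single- or multi-threaded implementation; a non-positive
+// thread count is rejected.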
+void conv_sparse::run()
+{
+ if (num_threads_ == 1)
+ compute_singlethread();
+ else if (num_threads_ > 1)
+ compute_multithreads();
+ else
+ throw std::runtime_error{"Invalid thread number."};
+}
+
+} // namespace srcn
+} // namespace nnfw
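
To make the inner update above concrete, here is a self-contained sketch of the sparse-times-dense product the kernels implement: each nonzero weight entry (m, k, value) scales row k of the dense right-hand side and accumulates it into output row m. The Entry type and the sparse_gemm_tile function are illustrative stand-ins, not the srcn API; grouping entries by k mirrors the packing reuse in conv_sparse.

#include <cstdio>
#include <vector>

// Illustrative stand-in for the library's weight entry (not the srcn type).
struct Entry { int m; int k; float value; };

// Core update of a sparse-times-dense product on one n tile:
// out[m][0..bn) += value * rhs[k][0..bn) for every nonzero (m, k, value).
void sparse_gemm_tile(const std::vector<Entry> &entries, const float *rhs,
                      float *out, int n, int bn)
{
  for (const Entry &e : entries)
  {
    const float *rhs_row = rhs + e.k * n; // row k of the dense rhs
    float *out_row = out + e.m * n;       // row m of the output
    for (int j = 0; j < bn; j++)
      out_row[j] += e.value * rhs_row[j];
  }
}

int main()
{
  const int n = 4, bn = 4;
  // Two nonzero weights: out row 0 gets 2 * rhs row 1; out row 1 gets -1 * rhs row 0.
  std::vector<Entry> entries = {{0, 1, 2.0f}, {1, 0, -1.0f}};
  std::vector<float> rhs = {1, 2, 3, 4, 5, 6, 7, 8}; // 2 x 4, row-major
  std::vector<float> out(2 * n, 0.0f);

  sparse_gemm_tile(entries, rhs.data(), out.data(), n, bn);

  for (float v : out)
    std::printf("%g ", v); // prints: 10 12 14 16 -1 -2 -3 -4
  std::printf("\n");
  return 0;
}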