path: root/compute/ncnn/src/srcn/deconv_sgemm_multithreads.cc
Diffstat (limited to 'compute/ncnn/src/srcn/deconv_sgemm_multithreads.cc')
-rw-r--r--  compute/ncnn/src/srcn/deconv_sgemm_multithreads.cc | 387
1 file changed, 0 insertions(+), 387 deletions(-)
diff --git a/compute/ncnn/src/srcn/deconv_sgemm_multithreads.cc b/compute/ncnn/src/srcn/deconv_sgemm_multithreads.cc
deleted file mode 100644
index f3ccf13e5..000000000
--- a/compute/ncnn/src/srcn/deconv_sgemm_multithreads.cc
+++ /dev/null
@@ -1,387 +0,0 @@
-/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifdef _OPENMP
-#include <omp.h>
-#endif
-
-#include "common.h"
-#include "sgemm_kernel.h"
-#include "sgemm_pack.h"
-#include "deconv_sgemm_multithreads.h"
-
-namespace nnfw
-{
-namespace srcn
-{
-
-void deconv_sgemm_multithreads::param_init()
-{
-#if __aarch64__
- if (conv_type_ == row_major)
- {
- mr_ = 8;
- nr_ = 12;
- }
- else if (conv_type_ == col_major)
- {
- mr_ = 12;
- nr_ = 8;
- }
-#else // __aarch64__
- if (conv_type_ == row_major)
- {
- mr_ = 6;
- nr_ = 8;
- }
- else if (conv_type_ == col_major)
- {
- mr_ = 8;
- nr_ = 6;
- }
-#endif // __aarch64__
-
- int col = n_;
-
- if (m_ > n_)
- {
- shard_type_ = shardByRow;
- col = m_;
- }
- else
- {
- shard_type_ = shardByCol;
- }
-
- int th_base = divup(col, num_threads_);
-
- th_base = MIN(MAX(th_base, MIN_COL), MAX_COL);
-
- int k_div = (nr_ * sizeof_RhsScalar);
- int k_sub = (mr_ * nr_ * sizeof_ResScalar);
-
- const int k_cache = MIN(divup((int)(L1_CACHE_SIZE - k_sub), (int)k_div * 2), MAX_K);
- bk_ = MIN(k_cache, k_);
-
- if (shard_type_ == shardByCol)
- {
- int m_sub = (bk_ * nr_ * sizeof_RhsScalar);
- int m_div = (sizeof_LhsScalar * bk_ * 2 * num_threads_);
- if (L3_CACHE_SIZE)
- m_div = (sizeof_LhsScalar * bk_ * 2);
- int m_cache = divup((L2_CACHE_SIZE - m_sub), m_div);
- bm_ = MIN(m_cache, m_);
-
- bn_ = MIN(th_base, n_);
- if (L3_CACHE_SIZE)
- {
- int n_sub = (bk_ * bm_ * sizeof_RhsScalar);
- int n_div = (sizeof_LhsScalar * bk_ * 2 * num_threads_);
- int n_cache = divup((L3_CACHE_SIZE - n_sub), n_div);
- bn_ = MIN(n_cache, bn_);
- }
- }
- else
- {
- int n_sub = (bk_ * mr_ * sizeof_LhsScalar);
- int n_div = (sizeof_LhsScalar * bk_ * 2 * num_threads_);
- if (L3_CACHE_SIZE)
- n_div = (sizeof_LhsScalar * bk_ * 2);
- int n_cache = divup((L2_CACHE_SIZE - n_sub), n_div);
- bn_ = MIN(n_cache, n_);
-
- bm_ = MIN(th_base, m_);
- if (L3_CACHE_SIZE)
- {
- int m_sub = (bk_ * bn_ * sizeof_RhsScalar);
- int m_div = (sizeof_LhsScalar * bk_ * 2 * num_threads_);
- int m_cache = divup((L3_CACHE_SIZE - m_sub), m_div);
- bm_ = MIN(m_cache, bm_);
- }
- }
-
- nm_ = divup(m_, bm_);
- nn_ = divup(n_, bn_);
- nk_ = divup(k_, bk_);
-
- rm_ = m_ % bm_;
- rn_ = n_ % bn_;
- rk_ = k_ % bk_;
-}
-
-deconv_sgemm_multithreads::deconv_sgemm_multithreads(const convMat_t &in_mat,
- const convMat_t &weights_mat,
- convMat_t &out_mat,
- const convParams_t &in_param, int num_threads,
- convType_t conv_type)
- : in_mat_(in_mat), weights_mat_(weights_mat), out_mat_(out_mat), in_param_(in_param),
- conv_type_(conv_type), num_threads_(num_threads)
-{
- error_ = 0; // cleared here; set below if a work-buffer allocation fails
- m_ = in_param_.kernel_h * in_param_.kernel_w * out_mat_.c;
-#ifdef NCNN
- n_ = alignSize(in_mat_.h * in_mat_.w, 16 / sizeof(float));
-#else // NCNN
- n_ = in_mat_.w * in_mat_.h;
-#endif // NCNN
- k_ = in_mat.c;
-
- param_init();
-
- int lhs_stride = (bm_ + mr_ - 1) / mr_ * mr_ * bk_;
- int rhs_stride = (bn_ + nr_ - 1) / nr_ * nr_ * bk_;
-
- // nothrow allocation: plain new would throw on failure, leaving the NULL checks below unreachable
- if (shard_type_ == shardByCol)
- {
- plhs_buffer_ = new (std::nothrow) float[lhs_stride * 1 * nm_];
- prhs_buffer_ = new (std::nothrow) float[rhs_stride * num_threads_];
- }
- else
- {
- plhs_buffer_ = new (std::nothrow) float[lhs_stride * num_threads_];
- prhs_buffer_ = new (std::nothrow) float[rhs_stride * 1 * nn_];
- }
-
- pres_buffer_ = new (std::nothrow) float[bm_ * bn_ * num_threads_];
-
- if (plhs_buffer_ == NULL || prhs_buffer_ == NULL || pres_buffer_ == NULL)
- {
- error_ = 1;
- }
-
- if (in_param_.kernel_w != 1 || in_param_.kernel_h != 1 || in_param_.stride_w != 1 ||
- in_param_.stride_h != 1 || in_param_.padding != 0)
- {
- need_col2im_ = 1;
- }
- else
- {
- need_col2im_ = 0;
- }
-
- omp_set_num_threads(num_threads_);
-}
-
-deconv_sgemm_multithreads::~deconv_sgemm_multithreads()
-{
- if (plhs_buffer_)
- delete[] plhs_buffer_;
- if (prhs_buffer_)
- delete[] prhs_buffer_;
- if (pres_buffer_)
- delete[] pres_buffer_;
-}
-
-void deconv_sgemm_multithreads::run()
-{
- if (error_)
- return;
-
- if (shard_type_ == shardByCol && conv_type_ == col_major)
- {
- compute_colmajor_colshard();
- }
- else if (shard_type_ == shardByRow && conv_type_ == col_major)
- {
- compute_colmajor_rowshard();
- }
- else if (shard_type_ == shardByCol && conv_type_ == row_major)
- {
- compute_rowmajor_colshard();
- }
- else if (shard_type_ == shardByRow && conv_type_ == row_major)
- {
- compute_rowmajor_rowshard();
- }
-}
-
-void deconv_sgemm_multithreads::compute_rowmajor_colshard()
-{
- int lhs_stride = (bm_ + mr_ - 1) / mr_ * mr_ * bk_;
- int rhs_stride = (bn_ + nr_ - 1) / nr_ * nr_ * bk_;
-
- for (int l = 0; l < nk_; l++)
- {
- const int bk = (l != nk_ - 1 || rk_ == 0) ? bk_ : rk_;
-
-#pragma omp parallel for
- for (int i = 0; i < nm_; i++)
- {
- const int bm = (i != nm_ - 1 || rm_ == 0) ? bm_ : rm_;
-
- _pack_rowmajor_trans_lhs(mr_, bm, bk, m_, &weights_mat_.data[l * bk_ * m_ + i * bm_],
- &plhs_buffer_[i * lhs_stride]);
- }
-
-#pragma omp parallel for
- for (int j = 0; j < nn_; j++)
- {
- int thread_num = omp_get_thread_num();
- float *prhs_ptr = &prhs_buffer_[rhs_stride * thread_num];
- float *pres_ptr = &pres_buffer_[bm_ * bn_ * thread_num];
-
- const int bn = (j != nn_ - 1 || rn_ == 0) ? bn_ : rn_;
- _pack_rowmajor_notrans_rhs(nr_, bn, bk, n_, &in_mat_.data[l * bk_ * n_ + j * bn_], prhs_ptr);
-
- for (int i = 0; i < nm_; i++)
- {
- const int bm = (i != nm_ - 1 || rm_ == 0) ? bm_ : rm_;
-
- _sgemm_rowmajor_macro_kernel_divnm(mr_, nr_, bm, bn, bk, &plhs_buffer_[i * lhs_stride],
- prhs_ptr, pres_ptr, 0, bn, bk);
-
- if (need_col2im_)
- _unpack_rowmajor_image_res(bm, bn, i * bm_, j * bn_, const_cast<convMat_t *>(&in_mat_),
- &out_mat_, const_cast<convParams_t *>(&in_param_), pres_ptr);
- }
- }
- }
-}
-
-void deconv_sgemm_multithreads::compute_rowmajor_rowshard()
-{
- int lhs_stride = (bm_ + mr_ - 1) / mr_ * mr_ * bk_;
- int rhs_stride = (bn_ + nr_ - 1) / nr_ * nr_ * bk_;
-
- for (int l = 0; l < nk_; l++)
- {
- const int bk = (l != nk_ - 1 || rk_ == 0) ? bk_ : rk_;
-
-#pragma omp parallel for
- for (int j = 0; j < nn_; j++)
- {
- const int bn = (j != nn_ - 1 || rn_ == 0) ? bn_ : rn_;
- _pack_rowmajor_notrans_rhs(nr_, bn, bk, n_, &in_mat_.data[l * bk_ * n_ + j * bn_],
- &prhs_buffer_[j * rhs_stride]);
- }
-
-#pragma omp parallel for
- for (int i = 0; i < nm_; i++)
- {
- int thread_num = omp_get_thread_num();
- float *plhs_ptr = &plhs_buffer_[lhs_stride * thread_num];
- float *pres_ptr = &pres_buffer_[bm_ * bn_ * thread_num];
-
- const int bm = (i != nm_ - 1 || rm_ == 0) ? bm_ : rm_;
-
- _pack_rowmajor_trans_lhs(mr_, bm, bk, m_, &weights_mat_.data[l * bk_ * m_ + i * bm_],
- plhs_ptr);
-
- for (int j = 0; j < nn_; j++)
- {
- const int bn = (j != nn_ - 1 || rn_ == 0) ? bn_ : rn_;
- _sgemm_rowmajor_macro_kernel_divmn(mr_, nr_, bm, bn, bk, plhs_ptr,
- &prhs_buffer_[j * rhs_stride], pres_ptr, 0, bn, bk);
- if (need_col2im_)
- _unpack_rowmajor_image_res(bm, bn, i * bm_, j * bn_, const_cast<convMat_t *>(&in_mat_),
- &out_mat_, const_cast<convParams_t *>(&in_param_), pres_ptr);
- }
- }
- }
-}
-
-void deconv_sgemm_multithreads::compute_colmajor_colshard()
-{
- int lhs_stride = (bm_ + mr_ - 1) / mr_ * mr_ * bk_;
- int rhs_stride = (bn_ + nr_ - 1) / nr_ * nr_ * bk_;
-
- for (int l = 0; l < nk_; l++)
- {
- const int bk = (l != nk_ - 1 || rk_ == 0) ? bk_ : rk_;
-
-#pragma omp parallel for
- for (int i = 0; i < nm_; i++)
- {
- const int bm = (i != nm_ - 1 || rm_ == 0) ? bm_ : rm_;
-
- _pack_colmajor_trans_lhs(mr_, bm, bk, k_, &weights_mat_.data[i * bm_ * k_ + l * bk_],
- &plhs_buffer_[i * lhs_stride]);
- }
-
-#pragma omp parallel for
- for (int j = 0; j < nn_; j++)
- {
- int thread_num = omp_get_thread_num();
- float *prhs_ptr = &prhs_buffer_[rhs_stride * thread_num];
- float *pres_ptr = &pres_buffer_[bm_ * bn_ * thread_num];
-
- const int bn = (j != nn_ - 1 || rn_ == 0) ? bn_ : rn_;
- _pack_colmajor_notrans_rhs(nr_, bn, bk, k_, &in_mat_.data[j * bn_ * k_ + l * bk_], prhs_ptr);
-
- for (int i = 0; i < nm_; i++)
- {
- const int bm = (i != nm_ - 1 || rm_ == 0) ? bm_ : rm_;
-
- _sgemm_colmajor_macro_kernel_divnm(mr_, nr_, bm, bn, bk, &plhs_buffer_[i * lhs_stride],
- prhs_ptr, pres_ptr, 0, bm, bk);
-
- // Need to add lock?
- if (need_col2im_)
- _unpack_colmajor_image_res(bm, bn, i * bm_, j * bn_, const_cast<convMat_t *>(&in_mat_),
- &out_mat_, const_cast<convParams_t *>(&in_param_), pres_ptr);
- }
- }
- }
-}
-
-void deconv_sgemm_multithreads::compute_colmajor_rowshard()
-{
- int lhs_stride = (bm_ + mr_ - 1) / mr_ * mr_ * bk_;
- int rhs_stride = (bn_ + nr_ - 1) / nr_ * nr_ * bk_;
-
- for (int l = 0; l < nk_; l++)
- {
- const int bk = (l != nk_ - 1 || rk_ == 0) ? bk_ : rk_;
-
-#pragma omp parallel for
- for (int j = 0; j < nn_; j++)
- {
- const int bn = (j != nn_ - 1 || rn_ == 0) ? bn_ : rn_;
- _pack_colmajor_notrans_rhs(nr_, bn, bk, k_, &in_mat_.data[j * bn_ * k_ + l * bk_],
- &prhs_buffer_[j * rhs_stride]);
- }
-
-#pragma omp parallel for
- for (int i = 0; i < nm_; i++)
- {
- int thread_num = omp_get_thread_num();
- float *plhs_ptr = &plhs_buffer_[lhs_stride * thread_num];
- float *pres_ptr = &pres_buffer_[bm_ * bn_ * thread_num];
-
- const int bm = (i != nm_ - 1 || rm_ == 0) ? bm_ : rm_;
-
- _pack_colmajor_trans_lhs(mr_, bm, bk, k_, &weights_mat_.data[i * bm_ * k_ + l * bk_],
- plhs_ptr);
-
- for (int j = 0; j < nn_; j++)
- {
- const int bn = (j != nn_ - 1 || rn_ == 0) ? bn_ : rn_;
- _sgemm_colmajor_macro_kernel_divmn(mr_, nr_, bm, bn, bk, plhs_ptr,
- &prhs_buffer_[j * rhs_stride], pres_ptr, 0, bm, bk);
-
- if (need_col2im_)
- _unpack_colmajor_image_res(bm, bn, i * bm_, j * bn_, const_cast<convMat_t *>(&in_mat_),
- &out_mat_, const_cast<convParams_t *>(&in_param_), pres_ptr);
- }
- }
- }
-}
-
-} // namespace srcn
-} // namespace nnfw