diff options
Diffstat (limited to 'compute/ncnn/src/srcn/deconv_sgemm_multithreads.cc')
-rw-r--r-- | compute/ncnn/src/srcn/deconv_sgemm_multithreads.cc | 387 |
1 files changed, 0 insertions, 387 deletions
diff --git a/compute/ncnn/src/srcn/deconv_sgemm_multithreads.cc b/compute/ncnn/src/srcn/deconv_sgemm_multithreads.cc deleted file mode 100644 index f3ccf13e5..000000000 --- a/compute/ncnn/src/srcn/deconv_sgemm_multithreads.cc +++ /dev/null @@ -1,387 +0,0 @@ -/* - * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifdef _OPENMP -#include <omp.h> -#endif - -#include "common.h" -#include "sgemm_kernel.h" -#include "sgemm_pack.h" -#include "deconv_sgemm_multithreads.h" - -namespace nnfw -{ -namespace srcn -{ - -void deconv_sgemm_multithreads::param_init() -{ -#if __aarch64__ - if (conv_type_ == row_major) - { - mr_ = 8; - nr_ = 12; - } - else if (conv_type_ == col_major) - { - - mr_ = 12; - nr_ = 8; - } -#else // __aarch64__ - if (conv_type_ == row_major) - { - mr_ = 6; - nr_ = 8; - } - else if (conv_type_ == col_major) - { - mr_ = 8; - nr_ = 6; - } -#endif // __aarch64__ - - int col = n_; - - if (m_ > n_) - { - shard_type_ = shardByRow; - col = m_; - } - else - { - shard_type_ = shardByCol; - } - - int th_base = divup(col, num_threads_); - - th_base = MIN(MAX(th_base, MIN_COL), MAX_COL); - - int k_div = (nr_ * sizeof_RhsScalar); - int k_sub = (mr_ * nr_ * sizeof_ResScalar); - - const int k_cache = MIN(divup((int)(L1_CACHE_SIZE - k_sub), (int)k_div * 2), MAX_K); - bk_ = MIN(k_cache, k_); - - if (shard_type_ == shardByCol) - { - int m_sub = (bk_ * nr_ * sizeof_RhsScalar); - int m_div = (sizeof_LhsScalar * bk_ * 2 * num_threads_); - if (L3_CACHE_SIZE) - m_div = (sizeof_LhsScalar * bk_ * 2); - int m_cache = divup((L2_CACHE_SIZE - m_sub), m_div); - bm_ = MIN(m_cache, m_); - - bn_ = MIN(th_base, n_); - if (L3_CACHE_SIZE) - { - int n_sub = (bk_ * bm_ * sizeof_RhsScalar); - int n_div = (sizeof_LhsScalar * bk_ * 2 * num_threads_); - int n_cache = divup((L3_CACHE_SIZE - n_sub), n_div); - bn_ = MIN(n_cache, bn_); - } - } - else - { - int n_sub = (bk_ * mr_ * sizeof_LhsScalar); - int n_div = (sizeof_LhsScalar * bk_ * 2 * num_threads_); - if (L3_CACHE_SIZE) - n_div = (sizeof_LhsScalar * bk_ * 2); - int n_cache = divup((L2_CACHE_SIZE - n_sub), n_div); - bn_ = MIN(n_cache, n_); - - bm_ = MIN(th_base, m_); - if (L3_CACHE_SIZE) - { - int m_sub = (bk_ * bn_ * sizeof_RhsScalar); - int m_div = (sizeof_LhsScalar * bk_ * 2 * num_threads_); - int m_cache = divup((L3_CACHE_SIZE - m_sub), m_div); - bm_ = MIN(m_cache, bm_); - } - } - - nm_ = divup(m_, bm_); - nn_ = divup(n_, bn_); - nk_ = divup(k_, bk_); - - rm_ = m_ % bm_; - rn_ = n_ % bn_; - rk_ = k_ % bk_; -} - -deconv_sgemm_multithreads::deconv_sgemm_multithreads(const convMat_t &in_mat, - const convMat_t &weights_mat, - convMat_t &out_mat, - const convParams_t &in_param, int num_threads, - convType_t conv_type) - - : in_mat_(in_mat), weights_mat_(weights_mat), out_mat_(out_mat), in_param_(in_param), - conv_type_(conv_type), num_threads_(num_threads) -{ - m_ = in_param_.kernel_h * in_param_.kernel_w * out_mat_.c; -#ifdef NCNN - n_ = alignSize(in_mat_.h * in_mat_.w, 16 / sizeof(float)); -#else // NCNN - n_ = in_mat_.w * in_mat_.h; -#endif // NCNN - k_ = in_mat.c; - - param_init(); - - int lhs_stride = (bm_ + mr_ - 1) / mr_ * mr_ * bk_; - int rhs_stride = (bn_ + nr_ - 1) / nr_ * nr_ * bk_; - - if (shard_type_ == shardByCol) - { - plhs_buffer_ = new float[lhs_stride * 1 * nm_]; - prhs_buffer_ = new float[rhs_stride * num_threads_]; - } - else - { - plhs_buffer_ = new float[lhs_stride * num_threads_]; - prhs_buffer_ = new float[rhs_stride * 1 * nn_]; - } - - pres_buffer_ = new float[bm_ * bn_ * num_threads_]; - - if (plhs_buffer_ == NULL || prhs_buffer_ == NULL || pres_buffer_ == NULL) - { - error_ = 1; - } - - if (in_param_.kernel_w != 1 || in_param_.kernel_h != 1 || in_param_.stride_w != 1 || - in_param_.stride_h != 1 || in_param_.padding != 0) - { - need_col2im_ = 1; - } - else - { - need_col2im_ = 0; - } - - omp_set_num_threads(num_threads_); - - error_ = 0; -} - -deconv_sgemm_multithreads::~deconv_sgemm_multithreads() -{ - if (plhs_buffer_) - delete[] plhs_buffer_; - if (prhs_buffer_) - delete[] prhs_buffer_; - if (pres_buffer_) - delete[] pres_buffer_; -} - -void deconv_sgemm_multithreads::run() -{ - if (error_) - return; - - if (shard_type_ == shardByCol && conv_type_ == col_major) - { - compute_colmajor_colshard(); - } - else if (shard_type_ == shardByRow && conv_type_ == col_major) - { - compute_colmajor_rowshard(); - } - else if (shard_type_ == shardByCol && conv_type_ == row_major) - { - compute_rowmajor_colshard(); - } - else if (shard_type_ == shardByRow && conv_type_ == row_major) - { - compute_rowmajor_rowshard(); - } -} - -void deconv_sgemm_multithreads::compute_rowmajor_colshard() -{ - int lhs_stride = (bm_ + mr_ - 1) / mr_ * mr_ * bk_; - int rhs_stride = (bn_ + nr_ - 1) / nr_ * nr_ * bk_; - - for (int l = 0; l < nk_; l++) - { - const int bk = (l != nk_ - 1 || rk_ == 0) ? bk_ : rk_; - -#pragma omp parallel for - for (int i = 0; i < nm_; i++) - { - const int bm = (i != nm_ - 1 || rm_ == 0) ? bm_ : rm_; - - _pack_rowmajor_trans_lhs(mr_, bm, bk, m_, &weights_mat_.data[l * bk_ * m_ + i * bm_], - &plhs_buffer_[i * lhs_stride]); - } - -#pragma omp parallel for - for (int j = 0; j < nn_; j++) - { - int thread_num = omp_get_thread_num(); - float *prhs_ptr = &prhs_buffer_[rhs_stride * thread_num]; - float *pres_ptr = &pres_buffer_[bm_ * bn_ * thread_num]; - - const int bn = (j != nn_ - 1 || rn_ == 0) ? bn_ : rn_; - _pack_rowmajor_notrans_rhs(nr_, bn, bk, n_, &in_mat_.data[l * bk_ * n_ + j * bn_], prhs_ptr); - - for (int i = 0; i < nm_; i++) - { - const int bm = (i != nm_ - 1 || rm_ == 0) ? bm_ : rm_; - - _sgemm_rowmajor_macro_kernel_divnm(mr_, nr_, bm, bn, bk, &plhs_buffer_[i * lhs_stride], - prhs_ptr, pres_ptr, 0, bn, bk); - - if (need_col2im_) - _unpack_rowmajor_image_res(bm, bn, i * bm_, j * bn_, const_cast<convMat_t *>(&in_mat_), - &out_mat_, const_cast<convParams_t *>(&in_param_), pres_ptr); - } - } - } -} - -void deconv_sgemm_multithreads::compute_rowmajor_rowshard() -{ - int lhs_stride = (bm_ + mr_ - 1) / mr_ * mr_ * bk_; - int rhs_stride = (bn_ + nr_ - 1) / nr_ * nr_ * bk_; - - for (int l = 0; l < nk_; l++) - { - const int bk = (l != nk_ - 1 || rk_ == 0) ? bk_ : rk_; - -#pragma omp parallel for - for (int j = 0; j < nn_; j++) - { - const int bn = (j != nn_ - 1 || rn_ == 0) ? bn_ : rn_; - _pack_rowmajor_notrans_rhs(nr_, bn, bk, n_, &in_mat_.data[l * bk_ * n_ + j * bn_], - &prhs_buffer_[j * rhs_stride]); - } - -#pragma omp parallel for - for (int i = 0; i < nm_; i++) - { - int thread_num = omp_get_thread_num(); - float *plhs_ptr = &plhs_buffer_[lhs_stride * thread_num]; - float *pres_ptr = &pres_buffer_[bm_ * bn_ * thread_num]; - - const int bm = (i != nm_ - 1 || rm_ == 0) ? bm_ : rm_; - - _pack_rowmajor_trans_lhs(mr_, bm, bk, m_, &weights_mat_.data[l * bk_ * m_ + i * bm_], - plhs_ptr); - - for (int j = 0; j < nn_; j++) - { - const int bn = (j != nn_ - 1 || rn_ == 0) ? bn_ : rn_; - _sgemm_rowmajor_macro_kernel_divmn(mr_, nr_, bm, bn, bk, plhs_ptr, - &prhs_buffer_[j * rhs_stride], pres_ptr, 0, bn, bk); - if (need_col2im_) - _unpack_rowmajor_image_res(bm, bn, i * bm_, j * bn_, const_cast<convMat_t *>(&in_mat_), - &out_mat_, const_cast<convParams_t *>(&in_param_), pres_ptr); - } - } - } -} - -void deconv_sgemm_multithreads::compute_colmajor_colshard() -{ - int lhs_stride = (bm_ + mr_ - 1) / mr_ * mr_ * bk_; - int rhs_stride = (bn_ + nr_ - 1) / nr_ * nr_ * bk_; - - for (int l = 0; l < nk_; l++) - { - const int bk = (l != nk_ - 1 || rk_ == 0) ? bk_ : rk_; - -#pragma omp parallel for - for (int i = 0; i < nm_; i++) - { - const int bm = (i != nm_ - 1 || rm_ == 0) ? bm_ : rm_; - - _pack_colmajor_trans_lhs(mr_, bm, bk, k_, &weights_mat_.data[i * bm_ * k_ + l * bk_], - &plhs_buffer_[i * lhs_stride]); - } - -#pragma omp parallel for - for (int j = 0; j < nn_; j++) - { - int thread_num = omp_get_thread_num(); - float *prhs_ptr = &prhs_buffer_[rhs_stride * thread_num]; - float *pres_ptr = &pres_buffer_[bm_ * bn_ * thread_num]; - - const int bn = (j != nn_ - 1 || rn_ == 0) ? bn_ : rn_; - _pack_colmajor_notrans_rhs(nr_, bn, bk, k_, &in_mat_.data[j * bn_ * k_ + l * bk_], prhs_ptr); - - for (int i = 0; i < nm_; i++) - { - const int bm = (i != nm_ - 1 || rm_ == 0) ? bm_ : rm_; - - _sgemm_colmajor_macro_kernel_divnm(mr_, nr_, bm, bn, bk, &plhs_buffer_[i * lhs_stride], - prhs_ptr, pres_ptr, 0, bm, bk); - - // Need to add lock? - if (need_col2im_) - _unpack_colmajor_image_res(bm, bn, i * bm_, j * bn_, const_cast<convMat_t *>(&in_mat_), - &out_mat_, const_cast<convParams_t *>(&in_param_), pres_ptr); - } - } - } -} - -void deconv_sgemm_multithreads::compute_colmajor_rowshard() -{ - int lhs_stride = (bm_ + mr_ - 1) / mr_ * mr_ * bk_; - int rhs_stride = (bn_ + nr_ - 1) / nr_ * nr_ * bk_; - - for (int l = 0; l < nk_; l++) - { - const int bk = (l != nk_ - 1 || rk_ == 0) ? bk_ : rk_; - -#pragma omp parallel for - for (int j = 0; j < nn_; j++) - { - const int bn = (j != nn_ - 1 || rn_ == 0) ? bn_ : rn_; - _pack_colmajor_notrans_rhs(nr_, bn, bk, k_, &in_mat_.data[j * bn_ * k_ + l * bk_], - &prhs_buffer_[j * rhs_stride]); - } - -#pragma omp parallel for - for (int i = 0; i < nm_; i++) - { - int thread_num = omp_get_thread_num(); - float *plhs_ptr = &plhs_buffer_[lhs_stride * thread_num]; - float *pres_ptr = &pres_buffer_[bm_ * bn_ * thread_num]; - - const int bm = (i != nm_ - 1 || rm_ == 0) ? bm_ : rm_; - - _pack_colmajor_trans_lhs(mr_, bm, bk, k_, &weights_mat_.data[i * bm_ * k_ + l * bk_], - plhs_ptr); - - for (int j = 0; j < nn_; j++) - { - const int bn = (j != nn_ - 1 || rn_ == 0) ? bn_ : rn_; - _sgemm_colmajor_macro_kernel_divmn(mr_, nr_, bm, bn, bk, plhs_ptr, - &prhs_buffer_[j * rhs_stride], pres_ptr, 0, bm, bk); - - if (need_col2im_) - _unpack_colmajor_image_res(bm, bn, i * bm_, j * bn_, const_cast<convMat_t *>(&in_mat_), - &out_mat_, const_cast<convParams_t *>(&in_param_), pres_ptr); - } - } - } -} - -} // namespace srcn -} // namespace nnfw |