diff options
Diffstat (limited to 'runtimes/libs/srcn/src/conv_sparse.cc')
-rw-r--r-- | runtimes/libs/srcn/src/conv_sparse.cc | 271 |
1 files changed, 271 insertions, 0 deletions
diff --git a/runtimes/libs/srcn/src/conv_sparse.cc b/runtimes/libs/srcn/src/conv_sparse.cc new file mode 100644 index 000000000..10e2a2b93 --- /dev/null +++ b/runtimes/libs/srcn/src/conv_sparse.cc @@ -0,0 +1,271 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifdef _OPENMP +#include <omp.h> +#endif + +#include <stdexcept> + +#include "common.h" +#include "sgemm_kernel.h" +#include "sgemm_pack.h" +#include "conv_sparse.h" + +namespace nnfw +{ +namespace srcn +{ + +void conv_sparse::param_init() +{ +#ifdef NCNN + n_ = alignSize(out_mat_.h * out_mat_.w, 16 / sizeof(float)); +#else + n_ = out_mat_.w * out_mat_.h; +#endif + + bch_ = BCH; + nch_ = (out_mat_.c + bch_ - 1) / bch_; + + rch_ = out_mat_.c % bch_; + + bn_ = MIN(n_, L1_CACHE_SIZE / (sizeof(float) * 2)); + bn_ = MIN(bn_, (L2_CACHE_SIZE / 2 - bch_ * sizeof(weight_data_t)) / ((bch_ + 1) * sizeof(float)) / + num_threads_); + nn_ = (n_ + bn_ - 1) / bn_; + rn_ = n_ % bn_; + + if (in_param_.kernel_w != 1 || in_param_.kernel_h != 1 || in_param_.stride_w != 1 || + in_param_.stride_h != 1 || in_param_.padding != 0) + { + need_im2col_ = 1; + } + else + { + need_im2col_ = 0; + } +} + +conv_sparse::conv_sparse(const convMat_t &in_mat, convMat_t &out_mat, const convParams_t &in_param, + const sparse_weight_t *weights, int num_threads, convType_t conv_type) + : in_mat_(in_mat), out_mat_(out_mat), in_param_(in_param), weights_(weights), + num_threads_(num_threads), conv_type_(conv_type) +{ + param_init(); +} + +conv_sparse::~conv_sparse() {} + +void conv_sparse::compute_singlethread() +{ + if (need_im2col_) + { + for (int i = 0; i < nch_; i++) + { + const sparse_weight_t *weight_ptr = weights_ + i; + const int mxk = weight_ptr->mxk; + float prhs_ptr[bn_]; + + for (int j = 0; j < nn_; j++) + { + int k = -1; + const int bn = (j != nn_ - 1 || rn_ == 0) ? bn_ : rn_; + weight_data_t *lhs_ptr = weight_ptr->wdata; + + for (int l = 0; l < mxk; l++) + { + if (k != lhs_ptr->k) + { + k = lhs_ptr->k; + _sparse_pack_rowmajor_image(bn, k, j * bn_, const_cast<convMat_t *>(&in_mat_), + &out_mat_, const_cast<convParams_t *>(&in_param_), + prhs_ptr); + } + + // Why n_ = 64 x 64 is too much slower on Tizen??? + _sparse_sgemm_kernel(bn, lhs_ptr->data, prhs_ptr, + &out_mat_.data[lhs_ptr->m * n_ + j * bn_]); + + lhs_ptr++; + } + } + } + } + else + { + for (int i = 0; i < nch_; i++) + { + const sparse_weight_t *weight_ptr = weights_ + i; + const int mxk = weight_ptr->mxk; + + for (int j = 0; j < nn_; j++) + { + const int bn = (j != nn_ - 1 || rn_ == 0) ? bn_ : rn_; + weight_data_t *lhs_ptr = weight_ptr->wdata; + float *rhs_ptr = in_mat_.data + j * bn_; + + for (int l = 0; l < mxk; l++) + { + // Why n_ = 64 x 64 is too much slower on Tizen??? + _sparse_sgemm_kernel(bn, lhs_ptr->data, rhs_ptr + lhs_ptr->k * n_, + &out_mat_.data[lhs_ptr->m * n_ + j * bn_]); + + lhs_ptr++; + } + } + } + } +} + +void conv_sparse::compute_multithreads() +{ + omp_set_num_threads(num_threads_); + + if (nch_ >= num_threads_ || nch_ >= nn_) + { + if (need_im2col_) + { +#pragma omp parallel for + for (int i = 0; i < nch_; i++) + { + const sparse_weight_t *weight_ptr = weights_ + i; + const int mxk = weight_ptr->mxk; + float prhs_ptr[bn_]; + + for (int j = 0; j < nn_; j++) + { + int k = -1; + const int bn = (j != nn_ - 1 || rn_ == 0) ? bn_ : rn_; + weight_data_t *lhs_ptr = weight_ptr->wdata; + + for (int l = 0; l < mxk; l++) + { + if (k != lhs_ptr->k) + { + k = lhs_ptr->k; + _sparse_pack_rowmajor_image(bn, k, j * bn_, const_cast<convMat_t *>(&in_mat_), + &out_mat_, const_cast<convParams_t *>(&in_param_), + prhs_ptr); + } + + _sparse_sgemm_kernel(bn, lhs_ptr->data, prhs_ptr, + &out_mat_.data[lhs_ptr->m * n_ + j * bn_]); + + lhs_ptr++; + } + } + } + } + else + { +#pragma omp parallel for + for (int i = 0; i < nch_; i++) + { + const sparse_weight_t *weight_ptr = weights_ + i; + const int mxk = weight_ptr->mxk; + + for (int j = 0; j < nn_; j++) + { + const int bn = (j != nn_ - 1 || rn_ == 0) ? bn_ : rn_; + weight_data_t *lhs_ptr = weight_ptr->wdata; + float *rhs_ptr = in_mat_.data + j * bn_; + + for (int l = 0; l < mxk; l++) + { + _sparse_sgemm_kernel(bn, lhs_ptr->data, rhs_ptr + lhs_ptr->k * n_, + &out_mat_.data[lhs_ptr->m * n_ + j * bn_]); + + lhs_ptr++; + } + } + } + } + } + else + { + if (need_im2col_) + { + for (int i = 0; i < nch_; i++) + { + const sparse_weight_t *weight_ptr = weights_ + i; + const int mxk = weight_ptr->mxk; + +#pragma omp parallel for + for (int j = 0; j < nn_; j++) + { + int k = -1; + const int bn = (j != nn_ - 1 || rn_ == 0) ? bn_ : rn_; + weight_data_t *lhs_ptr = weight_ptr->wdata; + float prhs_ptr[bn]; + + for (int l = 0; l < mxk; l++) + { + if (k != lhs_ptr->k) + { + k = lhs_ptr->k; + _sparse_pack_rowmajor_image(bn, k, j * bn_, const_cast<convMat_t *>(&in_mat_), + &out_mat_, const_cast<convParams_t *>(&in_param_), + prhs_ptr); + } + + _sparse_sgemm_kernel(bn, lhs_ptr->data, prhs_ptr, + &out_mat_.data[lhs_ptr->m * n_ + j * bn_]); + + lhs_ptr++; + } + } + } + } + else + { + for (int i = 0; i < nch_; i++) + { + const sparse_weight_t *weight_ptr = weights_ + i; + const int mxk = weight_ptr->mxk; + +#pragma omp parallel for + for (int j = 0; j < nn_; j++) + { + const int bn = (j != nn_ - 1 || rn_ == 0) ? bn_ : rn_; + weight_data_t *lhs_ptr = weight_ptr->wdata; + float *rhs_ptr = in_mat_.data + j * bn_; + + for (int l = 0; l < mxk; l++) + { + _sparse_sgemm_kernel(bn, lhs_ptr->data, rhs_ptr + lhs_ptr->k * n_, + &out_mat_.data[lhs_ptr->m * n_ + j * bn_]); + + lhs_ptr++; + } + } + } + } + } +} + +void conv_sparse::run() +{ + if (num_threads_ == 1) + compute_singlethread(); + else if (num_threads_ > 1) + compute_multithreads(); + else + throw std::runtime_error{"Invalid thread number."}; +} + +} // namespace srcn +} // namespace nnfw |