Diffstat (limited to 'runtimes/libs/srcn/src/conv_sparse.cc')
-rw-r--r-- runtimes/libs/srcn/src/conv_sparse.cc | 271
1 file changed, 271 insertions, 0 deletions
diff --git a/runtimes/libs/srcn/src/conv_sparse.cc b/runtimes/libs/srcn/src/conv_sparse.cc
new file mode 100644
index 000000000..10e2a2b93
--- /dev/null
+++ b/runtimes/libs/srcn/src/conv_sparse.cc
@@ -0,0 +1,271 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef _OPENMP
+#include <omp.h>
+#endif
+
+#include <stdexcept>
+
+#include "common.h"
+#include "sgemm_kernel.h"
+#include "sgemm_pack.h"
+#include "conv_sparse.h"
+
+namespace nnfw
+{
+namespace srcn
+{
+
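+// Compute the blocking parameters for the sparse GEMM loops:
+//  - n_   : flattened output spatial size (rounded up to four floats when
+//           built for NCNN),
+//  - nch_ : number of output-channel blocks of size bch_ (= BCH),
+//  - bn_  : n-dimension tile, bounded so a tile of floats fits in half the
+//           L1 cache and the per-thread working set fits in half the L2,
+//  - need_im2col_ : set unless this is an unpadded 1x1, stride-1 convolution.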
+void conv_sparse::param_init()
+{
+#ifdef NCNN
+ n_ = alignSize(out_mat_.h * out_mat_.w, 16 / sizeof(float));
+#else
+ n_ = out_mat_.w * out_mat_.h;
+#endif
+
+ bch_ = BCH;
+ nch_ = (out_mat_.c + bch_ - 1) / bch_;
+
+ rch_ = out_mat_.c % bch_;
+
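+  // Bound the n tile twice: it must fit in half the L1 cache as floats, and
+  // the per-thread share of half the L2 must hold one weight block plus
+  // bch_ + 1 float rows of width bn_.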
+ bn_ = MIN(n_, L1_CACHE_SIZE / (sizeof(float) * 2));
+ bn_ = MIN(bn_, (L2_CACHE_SIZE / 2 - bch_ * sizeof(weight_data_t)) / ((bch_ + 1) * sizeof(float)) /
+ num_threads_);
+ nn_ = (n_ + bn_ - 1) / bn_;
+ rn_ = n_ % bn_;
+
+ if (in_param_.kernel_w != 1 || in_param_.kernel_h != 1 || in_param_.stride_w != 1 ||
+ in_param_.stride_h != 1 || in_param_.padding != 0)
+ {
+ need_im2col_ = 1;
+ }
+ else
+ {
+ need_im2col_ = 0;
+ }
+}
+
+conv_sparse::conv_sparse(const convMat_t &in_mat, convMat_t &out_mat, const convParams_t &in_param,
+ const sparse_weight_t *weights, int num_threads, convType_t conv_type)
+ : in_mat_(in_mat), out_mat_(out_mat), in_param_(in_param), weights_(weights),
+ num_threads_(num_threads), conv_type_(conv_type)
+{
+ param_init();
+}
+
+conv_sparse::~conv_sparse() {}
+
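+// Single-threaded path. For each output-channel block, walk its sparse
+// weight entries (m, k, value): when the input channel k changes, pack the
+// matching im2col rows into prhs_ptr, then let the sparse GEMM kernel
+// accumulate the entry into output row m. The non-im2col path reads the
+// input rows in place instead of packing.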
+void conv_sparse::compute_singlethread()
+{
+ if (need_im2col_)
+ {
+ for (int i = 0; i < nch_; i++)
+ {
+ const sparse_weight_t *weight_ptr = weights_ + i;
+ const int mxk = weight_ptr->mxk;
+ float prhs_ptr[bn_];
+
+ for (int j = 0; j < nn_; j++)
+ {
+ int k = -1;
+ const int bn = (j != nn_ - 1 || rn_ == 0) ? bn_ : rn_;
+ weight_data_t *lhs_ptr = weight_ptr->wdata;
+
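+        // Entries that share an input channel k are assumed to be stored
+        // consecutively, so the image block is packed once per channel and
+        // reused for every entry on it.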
+ for (int l = 0; l < mxk; l++)
+ {
+ if (k != lhs_ptr->k)
+ {
+ k = lhs_ptr->k;
+ _sparse_pack_rowmajor_image(bn, k, j * bn_, const_cast<convMat_t *>(&in_mat_),
+ &out_mat_, const_cast<convParams_t *>(&in_param_),
+ prhs_ptr);
+ }
+
+          // TODO: investigate why n_ = 64 x 64 is so much slower on Tizen.
+ _sparse_sgemm_kernel(bn, lhs_ptr->data, prhs_ptr,
+ &out_mat_.data[lhs_ptr->m * n_ + j * bn_]);
+
+ lhs_ptr++;
+ }
+ }
+ }
+ }
+ else
+ {
+ for (int i = 0; i < nch_; i++)
+ {
+ const sparse_weight_t *weight_ptr = weights_ + i;
+ const int mxk = weight_ptr->mxk;
+
+ for (int j = 0; j < nn_; j++)
+ {
+ const int bn = (j != nn_ - 1 || rn_ == 0) ? bn_ : rn_;
+ weight_data_t *lhs_ptr = weight_ptr->wdata;
+ float *rhs_ptr = in_mat_.data + j * bn_;
+
+ for (int l = 0; l < mxk; l++)
+ {
+          // TODO: investigate why n_ = 64 x 64 is so much slower on Tizen.
+ _sparse_sgemm_kernel(bn, lhs_ptr->data, rhs_ptr + lhs_ptr->k * n_,
+ &out_mat_.data[lhs_ptr->m * n_ + j * bn_]);
+
+ lhs_ptr++;
+ }
+ }
+ }
+ }
+}
+
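+// Multi-threaded path. When there are at least as many output-channel
+// blocks as threads (or more channel blocks than n tiles), parallelize over
+// channel blocks; otherwise parallelize over n tiles so the threads still
+// have enough work to share.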
+void conv_sparse::compute_multithreads()
+{
+ omp_set_num_threads(num_threads_);
+
+ if (nch_ >= num_threads_ || nch_ >= nn_)
+ {
+ if (need_im2col_)
+ {
+#pragma omp parallel for
+ for (int i = 0; i < nch_; i++)
+ {
+ const sparse_weight_t *weight_ptr = weights_ + i;
+ const int mxk = weight_ptr->mxk;
+ float prhs_ptr[bn_];
+
+ for (int j = 0; j < nn_; j++)
+ {
+ int k = -1;
+ const int bn = (j != nn_ - 1 || rn_ == 0) ? bn_ : rn_;
+ weight_data_t *lhs_ptr = weight_ptr->wdata;
+
+ for (int l = 0; l < mxk; l++)
+ {
+ if (k != lhs_ptr->k)
+ {
+ k = lhs_ptr->k;
+ _sparse_pack_rowmajor_image(bn, k, j * bn_, const_cast<convMat_t *>(&in_mat_),
+ &out_mat_, const_cast<convParams_t *>(&in_param_),
+ prhs_ptr);
+ }
+
+ _sparse_sgemm_kernel(bn, lhs_ptr->data, prhs_ptr,
+ &out_mat_.data[lhs_ptr->m * n_ + j * bn_]);
+
+ lhs_ptr++;
+ }
+ }
+ }
+ }
+ else
+ {
+#pragma omp parallel for
+ for (int i = 0; i < nch_; i++)
+ {
+ const sparse_weight_t *weight_ptr = weights_ + i;
+ const int mxk = weight_ptr->mxk;
+
+ for (int j = 0; j < nn_; j++)
+ {
+ const int bn = (j != nn_ - 1 || rn_ == 0) ? bn_ : rn_;
+ weight_data_t *lhs_ptr = weight_ptr->wdata;
+ float *rhs_ptr = in_mat_.data + j * bn_;
+
+ for (int l = 0; l < mxk; l++)
+ {
+ _sparse_sgemm_kernel(bn, lhs_ptr->data, rhs_ptr + lhs_ptr->k * n_,
+ &out_mat_.data[lhs_ptr->m * n_ + j * bn_]);
+
+ lhs_ptr++;
+ }
+ }
+ }
+ }
+ }
+ else
+ {
+ if (need_im2col_)
+ {
+ for (int i = 0; i < nch_; i++)
+ {
+ const sparse_weight_t *weight_ptr = weights_ + i;
+ const int mxk = weight_ptr->mxk;
+
+#pragma omp parallel for
+ for (int j = 0; j < nn_; j++)
+ {
+ int k = -1;
+ const int bn = (j != nn_ - 1 || rn_ == 0) ? bn_ : rn_;
+ weight_data_t *lhs_ptr = weight_ptr->wdata;
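+          // Declared per iteration so each OpenMP thread packs into its own
+          // buffer (note: a variable-length array, a GCC/Clang extension).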
+ float prhs_ptr[bn];
+
+ for (int l = 0; l < mxk; l++)
+ {
+ if (k != lhs_ptr->k)
+ {
+ k = lhs_ptr->k;
+ _sparse_pack_rowmajor_image(bn, k, j * bn_, const_cast<convMat_t *>(&in_mat_),
+ &out_mat_, const_cast<convParams_t *>(&in_param_),
+ prhs_ptr);
+ }
+
+ _sparse_sgemm_kernel(bn, lhs_ptr->data, prhs_ptr,
+ &out_mat_.data[lhs_ptr->m * n_ + j * bn_]);
+
+ lhs_ptr++;
+ }
+ }
+ }
+ }
+ else
+ {
+ for (int i = 0; i < nch_; i++)
+ {
+ const sparse_weight_t *weight_ptr = weights_ + i;
+ const int mxk = weight_ptr->mxk;
+
+#pragma omp parallel for
+ for (int j = 0; j < nn_; j++)
+ {
+ const int bn = (j != nn_ - 1 || rn_ == 0) ? bn_ : rn_;
+ weight_data_t *lhs_ptr = weight_ptr->wdata;
+ float *rhs_ptr = in_mat_.data + j * bn_;
+
+ for (int l = 0; l < mxk; l++)
+ {
+ _sparse_sgemm_kernel(bn, lhs_ptr->data, rhs_ptr + lhs_ptr->k * n_,
+ &out_mat_.data[lhs_ptr->m * n_ + j * bn_]);
+
+ lhs_ptr++;
+ }
+ }
+ }
+ }
+ }
+}
+
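+// Dispatch to the single- or multi-threaded implementation; a non-positive
+// thread count is rejected.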
+void conv_sparse::run()
+{
+ if (num_threads_ == 1)
+ compute_singlethread();
+ else if (num_threads_ > 1)
+ compute_multithreads();
+ else
+ throw std::runtime_error{"Invalid thread number."};
+}
+
+} // namespace srcn
+} // namespace nnfw
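
To make the inner update above concrete, here is a self-contained sketch of the sparse-times-dense product the kernels implement: each nonzero weight entry (m, k, value) scales row k of the dense right-hand side and accumulates it into output row m. The Entry type and the sparse_gemm_tile function are illustrative stand-ins, not the srcn API; grouping entries by k mirrors the packing reuse in conv_sparse.

#include <cstdio>
#include <vector>

// Illustrative stand-in for the library's weight entry (not the srcn type).
struct Entry { int m; int k; float value; };

// Core update of a sparse-times-dense product on one n tile:
// out[m][0..bn) += value * rhs[k][0..bn) for every nonzero (m, k, value).
void sparse_gemm_tile(const std::vector<Entry> &entries, const float *rhs,
                      float *out, int n, int bn)
{
  for (const Entry &e : entries)
  {
    const float *rhs_row = rhs + e.k * n; // row k of the dense rhs
    float *out_row = out + e.m * n;       // row m of the output
    for (int j = 0; j < bn; j++)
      out_row[j] += e.value * rhs_row[j];
  }
}

int main()
{
  const int n = 4, bn = 4;
  // Two nonzero weights: out row 0 gets 2 * rhs row 1; out row 1 gets -1 * rhs row 0.
  std::vector<Entry> entries = {{0, 1, 2.0f}, {1, 0, -1.0f}};
  std::vector<float> rhs = {1, 2, 3, 4, 5, 6, 7, 8}; // 2 x 4, row-major
  std::vector<float> out(2 * n, 0.0f);

  sparse_gemm_tile(entries, rhs.data(), out.data(), n, bn);

  for (float v : out)
    std::printf("%g ", v); // prints: 10 12 14 16 -1 -2 -3 -4
  std::printf("\n");
  return 0;
}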