summaryrefslogtreecommitdiff
path: root/runtimes/nn/depend/external/gemmlowp/internal/unpack.h
diff options
context:
space:
mode:
Diffstat (limited to 'runtimes/nn/depend/external/gemmlowp/internal/unpack.h')
-rw-r--r--  runtimes/nn/depend/external/gemmlowp/internal/unpack.h  278
1 files changed, 278 insertions, 0 deletions
diff --git a/runtimes/nn/depend/external/gemmlowp/internal/unpack.h b/runtimes/nn/depend/external/gemmlowp/internal/unpack.h
new file mode 100644
index 000000000..33aee13b8
--- /dev/null
+++ b/runtimes/nn/depend/external/gemmlowp/internal/unpack.h
@@ -0,0 +1,278 @@
+// Copyright 2015 The Gemmlowp Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// unpack.h: unpacking the result blocks computed by compute.h,
+// storing them into the destination matrix.
+
+#ifndef GEMMLOWP_INTERNAL_UNPACK_H_
+#define GEMMLOWP_INTERNAL_UNPACK_H_
+
+#include "allocator.h"
+#include "block_params.h"
+#include "output.h"
+#include "pack.h"
+
+#include <cmath>
+
+namespace gemmlowp {
+
+// Holds the int32 accumulator matrix for one l2 block: a column-major
+// matrix of block_params.l2_rows x block_params.l2_cols entries whose
+// storage is reserved from the shared Allocator and exposed through
+// MatrixMap views.
+class PackedResult {
+ public:
+  PackedResult(Allocator* _allocator, const BlockParams& _block_params)
+      : allocator_(_allocator), block_params_(_block_params) {
+    // One int32 accumulator per entry of the l2 block.
+    const int num_entries = block_params_.l2_rows * block_params_.l2_cols;
+    matrix_handle_ = allocator_->Reserve<std::int32_t>(num_entries);
+  }
+
+  ~PackedResult() = default;
+
+  // Mutable column-major view of the block; stride equals l2_rows.
+  MatrixMap<std::int32_t, MapOrder::ColMajor> Map() {
+    std::int32_t* data = allocator_->GetPointer<std::int32_t>(matrix_handle_);
+    return MatrixMap<std::int32_t, MapOrder::ColMajor>(
+        data, block_params_.l2_rows, block_params_.l2_cols,
+        block_params_.l2_rows);
+  }
+
+  // Read-only column-major view of the block; stride equals l2_rows.
+  MatrixMap<const std::int32_t, MapOrder::ColMajor> Map() const {
+    const std::int32_t* data =
+        allocator_->GetPointer<const std::int32_t>(matrix_handle_);
+    return MatrixMap<const std::int32_t, MapOrder::ColMajor>(
+        data, block_params_.l2_rows, block_params_.l2_cols,
+        block_params_.l2_rows);
+  }
+
+ private:
+  Allocator* allocator_;  // not owned; this class never deletes it
+  Allocator::Handle matrix_handle_;
+  const BlockParams& block_params_;
+};
+
+// A rectangular region of a destination matrix: the rows
+// [start_row, start_row + rows) and the columns
+// [start_col, start_col + cols).
+struct MatrixBlockBounds {
+  int start_row;  // first row of the region
+  int start_col;  // first column of the region
+  int rows;       // region height, in rows
+  int cols;       // region width, in columns
+
+  MatrixBlockBounds(int top, int left, int height, int width)
+      : start_row(top), start_col(left), rows(height), cols(width) {}
+};
+
+// Issues prefetch hints for an upcoming Rows x Cols tile of the packed
+// result at (src_row, src_col), plus the matching slice of the lhs sums
+// vector, so the data is likely cache-resident by the time it is
+// unpacked. The step of 4 int32 entries between hints presumably
+// reflects prefetch/cache-line granularity -- see Prefetch().
+template <int Rows, int Cols, typename SrcMapType>
+void PrefetchResultBlock(const SrcMapType& src,
+                         const VectorMap<const std::int32_t, VectorShape::Col>&
+                             lhs_sums_of_each_slice,
+                         int src_row, int src_col) {
+  const std::int32_t* tile_ptr = src.data(src_row, src_col);
+  const int stride = src.stride();
+  const std::int32_t* sums_ptr = lhs_sums_of_each_slice.data(src_row);
+  // Prefetch the lhs sums covering the rows of this tile.
+  for (int i = 0; i < Rows; i += 4) {
+    Prefetch(sums_ptr + i);
+  }
+  // Prefetch the tile itself, column by column.
+  for (int col = 0; col < Cols; col++) {
+    const std::int32_t* col_ptr = tile_ptr + col * stride;
+    for (int row = 0; row < Rows; row += 4) {
+      Prefetch(col_ptr + row);
+    }
+  }
+}
+
+// Unpacks a single register-sized block of the int32 accumulator matrix
+// `src` at (src_row, src_col): adds the quantization offset-correction
+// terms, then hands the result to the output pipeline `executor`, which
+// writes into `dst` at (dst_row, dst_col).
+//
+// Writing q for the loaded accumulators, the statements below compute
+//   q + lhs_sums * rhs_offset'
+//     + lhs_offset' * rhs_sums
+//     + depth * lhs_offset' * rhs_offset'
+// where lhs_offset'/rhs_offset' are the caller-provided offsets shifted
+// by the kernel format's zero-point input values (AddConstant below).
+template <typename KernelFormat, typename RegisterBlockType,
+          typename SrcMapType, typename LhsOffset, typename RhsOffset,
+          typename OutputPipelineExecutorType, typename DstType>
+void UnpackResultBlock(const SrcMapType& src,
+                       const OutputPipelineExecutorType& executor, DstType* dst,
+                       const VectorMap<const std::int32_t, VectorShape::Col>&
+                           lhs_sums_of_each_slice,
+                       const VectorMap<const std::int32_t, VectorShape::Row>&
+                           rhs_sums_of_each_slice,
+                       const LhsOffset& lhs_offset, const RhsOffset& rhs_offset,
+                       int depth, int src_row, int src_col, int src_global_row,
+                       int src_global_col, int dst_row, int dst_col) {
+  using KernelLhsScalar = typename KernelFormat::Lhs::Scalar;
+  using KernelRhsScalar = typename KernelFormat::Rhs::Scalar;
+  // NOTE(review): ZeroPointInputValue presumably undoes a bias applied
+  // when the operands were packed for this kernel's scalar types --
+  // confirm against pack.h.
+  static constexpr int KernelLhsZeroPointInput =
+      ZeroPointInputValue<KernelLhsScalar>::kValue;
+  static constexpr int KernelRhsZeroPointInput =
+      ZeroPointInputValue<KernelRhsScalar>::kValue;
+  // Raw int32 accumulators for this block.
+  auto acc = Load<RegisterBlockType>(src, src_row, src_col);
+  const auto& lhs_sums_of_each_slice_block =
+      LoadForBroadcasting<RegisterBlockType>(lhs_sums_of_each_slice, src_row);
+  const auto& rhs_sums_of_each_slice_block =
+      LoadForBroadcasting<RegisterBlockType>(rhs_sums_of_each_slice, src_col);
+  auto lhs_offset_block =
+      LoadForBroadcasting<RegisterBlockType>(lhs_offset, src_row);
+  auto rhs_offset_block =
+      LoadForBroadcasting<RegisterBlockType>(rhs_offset, src_col);
+  // Shift both offset blocks by the kernel zero-point inputs, giving the
+  // lhs_offset'/rhs_offset' of the formula above.
+  AddConstant<KernelLhsZeroPointInput>(&lhs_offset_block);
+  AddConstant<KernelRhsZeroPointInput>(&rhs_offset_block);
+  // acc += lhs_sums * rhs_offset'
+  BroadcastMulAdd(lhs_sums_of_each_slice_block, rhs_offset_block, &acc);
+  // Scale rhs_offset' by depth in place, so that the next BroadcastMulAdd
+  // adds both remaining terms in one pass. Order matters: this must happen
+  // after the BroadcastMulAdd above and before the one below.
+  for (int i = 0; i < decltype(rhs_offset_block)::kRegisterCount; i++) {
+    rhs_offset_block.buf.reg[i] = Mul(rhs_offset_block.buf.reg[i], depth);
+  }
+  // acc += lhs_offset' * (rhs_sums + depth * rhs_offset')
+  BroadcastMulAdd(BroadcastAdd(rhs_sums_of_each_slice_block, rhs_offset_block),
+                  lhs_offset_block, &acc);
+  // Run the caller-supplied output pipeline on the corrected accumulators
+  // and store into dst.
+  executor.Execute(acc, dst, src_global_row, src_global_col, dst_row, dst_col);
+}
+
+// Unpacks the whole packed int32 result `src` into the region of `dst`
+// described by `dst_block`: each sub-block gets the quantization
+// offset-correction terms added (see UnpackResultBlock) and is run
+// through `output_pipeline` to produce the destination scalar type.
+// `depth` is the accumulation depth; the two sums pointers give the
+// per-row/per-column operand sums used by the correction terms.
+//
+// The work is peeled into progressively smaller register blocks, with a
+// dedicated 8x8-tile fast path when the destination is row-major.
+template <typename KernelFormat, typename ResultBlockType,
+          typename PackedResultType, typename LhsOffset, typename RhsOffset,
+          typename OutputPipelineType>
+void UnpackResult(ResultBlockType* dst, const MatrixBlockBounds& dst_block,
+                  const PackedResultType& src, int depth,
+                  const std::int32_t* lhs_sums_of_each_slice_ptr,
+                  const std::int32_t* rhs_sums_of_each_slice_ptr,
+                  const LhsOffset& lhs_offset, const RhsOffset& rhs_offset,
+                  const OutputPipelineType& output_pipeline) {
+  ScopedProfilingLabel label(ResultBlockType::kOrder == MapOrder::ColMajor
+                                 ? "unpack to column-major"
+                                 : "unpack to row-major");
+  // The requested region must lie entirely inside dst.
+  assert(dst_block.start_row >= 0);
+  assert(dst_block.start_row + dst_block.rows <= dst->rows());
+  assert(dst_block.start_col >= 0);
+  assert(dst_block.start_col + dst_block.cols <= dst->cols());
+  const auto src_map = src.Map();
+  const VectorMap<const std::int32_t, VectorShape::Col> lhs_sums_of_each_slice(
+      lhs_sums_of_each_slice_ptr, dst_block.rows);
+  const VectorMap<const std::int32_t, VectorShape::Row> rhs_sums_of_each_slice(
+      rhs_sums_of_each_slice_ptr, dst_block.cols);
+  // Register-block shapes used by the peeled loops below.
+  using Int32x1x1 = RegisterBlock<std::int32_t, 1, 1>;
+  using Int32x4x1 = RegisterBlock<std::int32_t, 4, 1>;
+  using Int32x8x1 = RegisterBlock<std::int32_t, 8, 1>;
+  using Int32x1x4 = RegisterBlock<std::int32_t, 1, 4>;
+  using Int32x4x4 = RegisterBlock<std::int32_t, 4, 4>;
+  using Int32x8x4 = RegisterBlock<std::int32_t, 8, 4>;
+
+  using DstScalarType = typename ResultBlockType::Scalar;
+  using DstScalarx8x8 = RegisterBlock<DstScalarType, 8, 8>;
+
+  // One output-pipeline executor per register-block shape used below.
+  OutputPipelineExecutor<OutputPipelineType, Int32x1x1>
+      output_pipeline_executor_1x1(output_pipeline);
+  OutputPipelineExecutor<OutputPipelineType, Int32x4x1>
+      output_pipeline_executor_4x1(output_pipeline);
+  OutputPipelineExecutor<OutputPipelineType, Int32x8x1>
+      output_pipeline_executor_8x1(output_pipeline);
+  OutputPipelineExecutor<OutputPipelineType, Int32x1x4>
+      output_pipeline_executor_1x4(output_pipeline);
+  OutputPipelineExecutor<OutputPipelineType, Int32x4x4>
+      output_pipeline_executor_4x4(output_pipeline);
+  OutputPipelineExecutor<OutputPipelineType, Int32x8x4>
+      output_pipeline_executor_8x4(output_pipeline);
+
+  int c8 = 0;
+  // Fast path for row-major destinations: process 8 columns at a time so
+  // 8x8 tiles can be staged and stored in bulk. For column-major
+  // destinations c8 stays 0 and the generic loops below do all the work.
+  if (ResultBlockType::kOrder == MapOrder::RowMajor) {
+    for (; c8 <= dst_block.cols - 8; c8 += 8) {
+      PrefetchResultBlock<8, 8>(src_map, lhs_sums_of_each_slice, 0, c8);
+      int r = 0;
+      for (; r <= dst_block.rows - 8; r += 8) {
+        const int global_row = r + dst_block.start_row;
+        // Prefetch the next 8x8 tile while unpacking this one. (r + 8 may
+        // point past the last row of the block; presumably benign since
+        // these are only prefetch hints -- confirm Prefetch() semantics.)
+        PrefetchResultBlock<8, 8>(src_map, lhs_sums_of_each_slice, r + 8, c8);
+        // Unpack the 8x8 tile (as two 8x4 register blocks) into a small
+        // column-major staging buffer, then store it to dst in one bulk
+        // operation.
+        DstScalarType dst_colmajor_buf[64];
+        MatrixMap<DstScalarType, MapOrder::ColMajor> dst_colmajor_map(
+            dst_colmajor_buf, 8, 8);
+        for (int cx = 0; cx < 8; cx += 4) {
+          const int c = c8 + cx;
+          const int global_col = c + dst_block.start_col;
+          UnpackResultBlock<KernelFormat, Int32x8x4>(
+              src_map, output_pipeline_executor_8x4, &dst_colmajor_map,
+              lhs_sums_of_each_slice, rhs_sums_of_each_slice, lhs_offset,
+              rhs_offset, depth, r, c, global_row, global_col, 0, cx);
+        }
+        StoreFinalOutput(LoadContiguous<DstScalarx8x8>(dst_colmajor_buf), dst,
+                         r + dst_block.start_row, c8 + dst_block.start_col);
+      }
+      // Residual rows of this 8-column strip: 4 rows at a time, ...
+      for (; r <= dst_block.rows - 4; r += 4) {
+        const int global_row = r + dst_block.start_row;
+        for (int cx = 0; cx < 8; cx += 4) {
+          const int c = c8 + cx;
+          const int global_col = c + dst_block.start_col;
+          UnpackResultBlock<KernelFormat, Int32x4x4>(
+              src_map, output_pipeline_executor_4x4, dst,
+              lhs_sums_of_each_slice, rhs_sums_of_each_slice, lhs_offset,
+              rhs_offset, depth, r, c, global_row, global_col, global_row,
+              global_col);
+        }
+      }
+      // ... then one row at a time.
+      for (; r < dst_block.rows; r++) {
+        const int global_row = r + dst_block.start_row;
+        for (int cx = 0; cx < 8; cx += 4) {
+          const int c = c8 + cx;
+          const int global_col = c + dst_block.start_col;
+          UnpackResultBlock<KernelFormat, Int32x1x4>(
+              src_map, output_pipeline_executor_1x4, dst,
+              lhs_sums_of_each_slice, rhs_sums_of_each_slice, lhs_offset,
+              rhs_offset, depth, r, c, global_row, global_col, global_row,
+              global_col);
+        }
+      }
+    }
+  }
+  // Generic path: remaining columns, 4 at a time, with 8/4/1-row peeling.
+  int c = c8;
+  for (; c <= dst_block.cols - 4; c += 4) {
+    const int global_col = c + dst_block.start_col;
+    PrefetchResultBlock<8, 4>(src_map, lhs_sums_of_each_slice, 0, c);
+    int r = 0;
+    for (; r <= dst_block.rows - 8; r += 8) {
+      const int global_row = r + dst_block.start_row;
+      PrefetchResultBlock<8, 4>(src_map, lhs_sums_of_each_slice, r + 8, c);
+      UnpackResultBlock<KernelFormat, Int32x8x4>(
+          src_map, output_pipeline_executor_8x4, dst, lhs_sums_of_each_slice,
+          rhs_sums_of_each_slice, lhs_offset, rhs_offset, depth, r, c,
+          global_row, global_col, global_row, global_col);
+    }
+    for (; r <= dst_block.rows - 4; r += 4) {
+      const int global_row = r + dst_block.start_row;
+      UnpackResultBlock<KernelFormat, Int32x4x4>(
+          src_map, output_pipeline_executor_4x4, dst, lhs_sums_of_each_slice,
+          rhs_sums_of_each_slice, lhs_offset, rhs_offset, depth, r, c,
+          global_row, global_col, global_row, global_col);
+    }
+    for (; r < dst_block.rows; r++) {
+      const int global_row = r + dst_block.start_row;
+      UnpackResultBlock<KernelFormat, Int32x1x4>(
+          src_map, output_pipeline_executor_1x4, dst, lhs_sums_of_each_slice,
+          rhs_sums_of_each_slice, lhs_offset, rhs_offset, depth, r, c,
+          global_row, global_col, global_row, global_col);
+    }
+  }
+  // Last (at most 3) columns, one at a time, again with 8/4/1-row peeling.
+  for (; c < dst_block.cols; c++) {
+    const int global_col = c + dst_block.start_col;
+    PrefetchResultBlock<8, 1>(src_map, lhs_sums_of_each_slice, 0, c);
+    int r = 0;
+    for (; r <= dst_block.rows - 8; r += 8) {
+      const int global_row = r + dst_block.start_row;
+      PrefetchResultBlock<8, 1>(src_map, lhs_sums_of_each_slice, r + 8, c);
+      UnpackResultBlock<KernelFormat, Int32x8x1>(
+          src_map, output_pipeline_executor_8x1, dst, lhs_sums_of_each_slice,
+          rhs_sums_of_each_slice, lhs_offset, rhs_offset, depth, r, c,
+          global_row, global_col, global_row, global_col);
+    }
+    for (; r <= dst_block.rows - 4; r += 4) {
+      const int global_row = r + dst_block.start_row;
+      UnpackResultBlock<KernelFormat, Int32x4x1>(
+          src_map, output_pipeline_executor_4x1, dst, lhs_sums_of_each_slice,
+          rhs_sums_of_each_slice, lhs_offset, rhs_offset, depth, r, c,
+          global_row, global_col, global_row, global_col);
+    }
+    for (; r < dst_block.rows; r++) {
+      const int global_row = r + dst_block.start_row;
+      UnpackResultBlock<KernelFormat, Int32x1x1>(
+          src_map, output_pipeline_executor_1x1, dst, lhs_sums_of_each_slice,
+          rhs_sums_of_each_slice, lhs_offset, rhs_offset, depth, r, c,
+          global_row, global_col, global_row, global_col);
+    }
+  }
+}
+
+} // end namespace gemmlowp
+
+#endif // GEMMLOWP_INTERNAL_UNPACK_H_