diff options
Diffstat (limited to 'runtimes/nn/depend/external/gemmlowp/internal/unpack.h')
-rw-r--r-- | runtimes/nn/depend/external/gemmlowp/internal/unpack.h | 278 |
1 files changed, 278 insertions, 0 deletions
diff --git a/runtimes/nn/depend/external/gemmlowp/internal/unpack.h b/runtimes/nn/depend/external/gemmlowp/internal/unpack.h new file mode 100644 index 000000000..33aee13b8 --- /dev/null +++ b/runtimes/nn/depend/external/gemmlowp/internal/unpack.h @@ -0,0 +1,278 @@ +// Copyright 2015 The Gemmlowp Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// unpack.h: unpacking the result blocks computed by compute.h, +// storing them into the destination matrix. + +#ifndef GEMMLOWP_INTERNAL_UNPACK_H_ +#define GEMMLOWP_INTERNAL_UNPACK_H_ + +#include "allocator.h" +#include "block_params.h" +#include "output.h" +#include "pack.h" + +#include <cmath> + +namespace gemmlowp { + +class PackedResult { + public: + PackedResult(Allocator* _allocator, const BlockParams& _block_params) + : allocator_(_allocator), block_params_(_block_params) { + matrix_handle_ = allocator_->Reserve<std::int32_t>(block_params_.l2_rows * + block_params_.l2_cols); + } + + ~PackedResult() {} + + MatrixMap<std::int32_t, MapOrder::ColMajor> Map() { + return MatrixMap<std::int32_t, MapOrder::ColMajor>( + allocator_->GetPointer<std::int32_t>(matrix_handle_), + block_params_.l2_rows, block_params_.l2_cols, block_params_.l2_rows); + } + + MatrixMap<const std::int32_t, MapOrder::ColMajor> Map() const { + return MatrixMap<const std::int32_t, MapOrder::ColMajor>( + allocator_->GetPointer<const std::int32_t>(matrix_handle_), + block_params_.l2_rows, block_params_.l2_cols, block_params_.l2_rows); + } + + private: + Allocator* allocator_; + Allocator::Handle matrix_handle_; + const BlockParams& block_params_; +}; + +struct MatrixBlockBounds { + int start_row; + int start_col; + int rows; + int cols; + + MatrixBlockBounds(int start_row_, int start_col_, int rows_, int cols_) + : start_row(start_row_), + start_col(start_col_), + rows(rows_), + cols(cols_) {} +}; + +template <int Rows, int Cols, typename SrcMapType> +void PrefetchResultBlock(const SrcMapType& src, + const VectorMap<const std::int32_t, VectorShape::Col>& + lhs_sums_of_each_slice, + int src_row, int src_col) { + const std::int32_t* src_data = src.data(src_row, src_col); + const int src_stride = src.stride(); + const std::int32_t* lhs_sums_data = lhs_sums_of_each_slice.data(src_row); + for (int r = 0; r < Rows; r += 4) { + Prefetch(lhs_sums_data + r); + } + for (int c = 0; c < Cols; c++) { + for (int r = 0; r < Rows; r += 4) { + Prefetch(src_data + r + c * src_stride); + } + } +} + +template <typename KernelFormat, typename RegisterBlockType, + typename SrcMapType, typename LhsOffset, typename RhsOffset, + typename OutputPipelineExecutorType, typename DstType> +void UnpackResultBlock(const SrcMapType& src, + const OutputPipelineExecutorType& executor, DstType* dst, + const VectorMap<const std::int32_t, VectorShape::Col>& + lhs_sums_of_each_slice, + const VectorMap<const std::int32_t, VectorShape::Row>& + rhs_sums_of_each_slice, + const LhsOffset& lhs_offset, const RhsOffset& rhs_offset, + int depth, int src_row, int src_col, int src_global_row, + int src_global_col, int dst_row, int dst_col) { + using KernelLhsScalar = typename KernelFormat::Lhs::Scalar; + using KernelRhsScalar = typename KernelFormat::Rhs::Scalar; + static constexpr int KernelLhsZeroPointInput = + ZeroPointInputValue<KernelLhsScalar>::kValue; + static constexpr int KernelRhsZeroPointInput = + ZeroPointInputValue<KernelRhsScalar>::kValue; + auto acc = Load<RegisterBlockType>(src, src_row, src_col); + const auto& lhs_sums_of_each_slice_block = + LoadForBroadcasting<RegisterBlockType>(lhs_sums_of_each_slice, src_row); + const auto& rhs_sums_of_each_slice_block = + LoadForBroadcasting<RegisterBlockType>(rhs_sums_of_each_slice, src_col); + auto lhs_offset_block = + LoadForBroadcasting<RegisterBlockType>(lhs_offset, src_row); + auto rhs_offset_block = + LoadForBroadcasting<RegisterBlockType>(rhs_offset, src_col); + AddConstant<KernelLhsZeroPointInput>(&lhs_offset_block); + AddConstant<KernelRhsZeroPointInput>(&rhs_offset_block); + BroadcastMulAdd(lhs_sums_of_each_slice_block, rhs_offset_block, &acc); + for (int i = 0; i < decltype(rhs_offset_block)::kRegisterCount; i++) { + rhs_offset_block.buf.reg[i] = Mul(rhs_offset_block.buf.reg[i], depth); + } + BroadcastMulAdd(BroadcastAdd(rhs_sums_of_each_slice_block, rhs_offset_block), + lhs_offset_block, &acc); + executor.Execute(acc, dst, src_global_row, src_global_col, dst_row, dst_col); +} + +template <typename KernelFormat, typename ResultBlockType, + typename PackedResultType, typename LhsOffset, typename RhsOffset, + typename OutputPipelineType> +void UnpackResult(ResultBlockType* dst, const MatrixBlockBounds& dst_block, + const PackedResultType& src, int depth, + const std::int32_t* lhs_sums_of_each_slice_ptr, + const std::int32_t* rhs_sums_of_each_slice_ptr, + const LhsOffset& lhs_offset, const RhsOffset& rhs_offset, + const OutputPipelineType& output_pipeline) { + ScopedProfilingLabel label(ResultBlockType::kOrder == MapOrder::ColMajor + ? "unpack to column-major" + : "unpack to row-major"); + assert(dst_block.start_row >= 0); + assert(dst_block.start_row + dst_block.rows <= dst->rows()); + assert(dst_block.start_col >= 0); + assert(dst_block.start_col + dst_block.cols <= dst->cols()); + const auto src_map = src.Map(); + const VectorMap<const std::int32_t, VectorShape::Col> lhs_sums_of_each_slice( + lhs_sums_of_each_slice_ptr, dst_block.rows); + const VectorMap<const std::int32_t, VectorShape::Row> rhs_sums_of_each_slice( + rhs_sums_of_each_slice_ptr, dst_block.cols); + using Int32x1x1 = RegisterBlock<std::int32_t, 1, 1>; + using Int32x4x1 = RegisterBlock<std::int32_t, 4, 1>; + using Int32x8x1 = RegisterBlock<std::int32_t, 8, 1>; + using Int32x1x4 = RegisterBlock<std::int32_t, 1, 4>; + using Int32x4x4 = RegisterBlock<std::int32_t, 4, 4>; + using Int32x8x4 = RegisterBlock<std::int32_t, 8, 4>; + + using DstScalarType = typename ResultBlockType::Scalar; + using DstScalarx8x8 = RegisterBlock<DstScalarType, 8, 8>; + + OutputPipelineExecutor<OutputPipelineType, Int32x1x1> + output_pipeline_executor_1x1(output_pipeline); + OutputPipelineExecutor<OutputPipelineType, Int32x4x1> + output_pipeline_executor_4x1(output_pipeline); + OutputPipelineExecutor<OutputPipelineType, Int32x8x1> + output_pipeline_executor_8x1(output_pipeline); + OutputPipelineExecutor<OutputPipelineType, Int32x1x4> + output_pipeline_executor_1x4(output_pipeline); + OutputPipelineExecutor<OutputPipelineType, Int32x4x4> + output_pipeline_executor_4x4(output_pipeline); + OutputPipelineExecutor<OutputPipelineType, Int32x8x4> + output_pipeline_executor_8x4(output_pipeline); + + int c8 = 0; + if (ResultBlockType::kOrder == MapOrder::RowMajor) { + for (; c8 <= dst_block.cols - 8; c8 += 8) { + PrefetchResultBlock<8, 8>(src_map, lhs_sums_of_each_slice, 0, c8); + int r = 0; + for (; r <= dst_block.rows - 8; r += 8) { + const int global_row = r + dst_block.start_row; + PrefetchResultBlock<8, 8>(src_map, lhs_sums_of_each_slice, r + 8, c8); + DstScalarType dst_colmajor_buf[64]; + MatrixMap<DstScalarType, MapOrder::ColMajor> dst_colmajor_map( + dst_colmajor_buf, 8, 8); + for (int cx = 0; cx < 8; cx += 4) { + const int c = c8 + cx; + const int global_col = c + dst_block.start_col; + UnpackResultBlock<KernelFormat, Int32x8x4>( + src_map, output_pipeline_executor_8x4, &dst_colmajor_map, + lhs_sums_of_each_slice, rhs_sums_of_each_slice, lhs_offset, + rhs_offset, depth, r, c, global_row, global_col, 0, cx); + } + StoreFinalOutput(LoadContiguous<DstScalarx8x8>(dst_colmajor_buf), dst, + r + dst_block.start_row, c8 + dst_block.start_col); + } + for (; r <= dst_block.rows - 4; r += 4) { + const int global_row = r + dst_block.start_row; + for (int cx = 0; cx < 8; cx += 4) { + const int c = c8 + cx; + const int global_col = c + dst_block.start_col; + UnpackResultBlock<KernelFormat, Int32x4x4>( + src_map, output_pipeline_executor_4x4, dst, + lhs_sums_of_each_slice, rhs_sums_of_each_slice, lhs_offset, + rhs_offset, depth, r, c, global_row, global_col, global_row, + global_col); + } + } + for (; r < dst_block.rows; r++) { + const int global_row = r + dst_block.start_row; + for (int cx = 0; cx < 8; cx += 4) { + const int c = c8 + cx; + const int global_col = c + dst_block.start_col; + UnpackResultBlock<KernelFormat, Int32x1x4>( + src_map, output_pipeline_executor_1x4, dst, + lhs_sums_of_each_slice, rhs_sums_of_each_slice, lhs_offset, + rhs_offset, depth, r, c, global_row, global_col, global_row, + global_col); + } + } + } + } + int c = c8; + for (; c <= dst_block.cols - 4; c += 4) { + const int global_col = c + dst_block.start_col; + PrefetchResultBlock<8, 4>(src_map, lhs_sums_of_each_slice, 0, c); + int r = 0; + for (; r <= dst_block.rows - 8; r += 8) { + const int global_row = r + dst_block.start_row; + PrefetchResultBlock<8, 4>(src_map, lhs_sums_of_each_slice, r + 8, c); + UnpackResultBlock<KernelFormat, Int32x8x4>( + src_map, output_pipeline_executor_8x4, dst, lhs_sums_of_each_slice, + rhs_sums_of_each_slice, lhs_offset, rhs_offset, depth, r, c, + global_row, global_col, global_row, global_col); + } + for (; r <= dst_block.rows - 4; r += 4) { + const int global_row = r + dst_block.start_row; + UnpackResultBlock<KernelFormat, Int32x4x4>( + src_map, output_pipeline_executor_4x4, dst, lhs_sums_of_each_slice, + rhs_sums_of_each_slice, lhs_offset, rhs_offset, depth, r, c, + global_row, global_col, global_row, global_col); + } + for (; r < dst_block.rows; r++) { + const int global_row = r + dst_block.start_row; + UnpackResultBlock<KernelFormat, Int32x1x4>( + src_map, output_pipeline_executor_1x4, dst, lhs_sums_of_each_slice, + rhs_sums_of_each_slice, lhs_offset, rhs_offset, depth, r, c, + global_row, global_col, global_row, global_col); + } + } + for (; c < dst_block.cols; c++) { + const int global_col = c + dst_block.start_col; + PrefetchResultBlock<8, 1>(src_map, lhs_sums_of_each_slice, 0, c); + int r = 0; + for (; r <= dst_block.rows - 8; r += 8) { + const int global_row = r + dst_block.start_row; + PrefetchResultBlock<8, 1>(src_map, lhs_sums_of_each_slice, r + 8, c); + UnpackResultBlock<KernelFormat, Int32x8x1>( + src_map, output_pipeline_executor_8x1, dst, lhs_sums_of_each_slice, + rhs_sums_of_each_slice, lhs_offset, rhs_offset, depth, r, c, + global_row, global_col, global_row, global_col); + } + for (; r <= dst_block.rows - 4; r += 4) { + const int global_row = r + dst_block.start_row; + UnpackResultBlock<KernelFormat, Int32x4x1>( + src_map, output_pipeline_executor_4x1, dst, lhs_sums_of_each_slice, + rhs_sums_of_each_slice, lhs_offset, rhs_offset, depth, r, c, + global_row, global_col, global_row, global_col); + } + for (; r < dst_block.rows; r++) { + const int global_row = r + dst_block.start_row; + UnpackResultBlock<KernelFormat, Int32x1x1>( + src_map, output_pipeline_executor_1x1, dst, lhs_sums_of_each_slice, + rhs_sums_of_each_slice, lhs_offset, rhs_offset, depth, r, c, + global_row, global_col, global_row, global_col); + } + } +} + +} // end namespace gemmlowp + +#endif // GEMMLOWP_INTERNAL_UNPACK_H_ |