Diffstat (limited to 'runtimes/nn/depend/external/gemmlowp/internal/pack.h')
-rw-r--r--  runtimes/nn/depend/external/gemmlowp/internal/pack.h  435
1 file changed, 435 insertions, 0 deletions
diff --git a/runtimes/nn/depend/external/gemmlowp/internal/pack.h b/runtimes/nn/depend/external/gemmlowp/internal/pack.h
new file mode 100644
index 000000000..339539602
--- /dev/null
+++ b/runtimes/nn/depend/external/gemmlowp/internal/pack.h
@@ -0,0 +1,435 @@
+// Copyright 2015 The Gemmlowp Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// pack.h: packing blocks of the LHS and RHS into the data layout
+// that is expected by compute.h and eventually by kernels.
+// Because this data layout depends on the kernel format, code here
+// is templated in KernelLhsFormat/KernelRhsFormat.
+//
+// Readers' note: an important theme around here is that we try hard
+// to handle both the Lhs and the Rhs with a single piece of code, and
+// we refer to either one generically as a 'Side'. Instead of addressing
+// matrices by (row, column) indices, we address them by (width, depth),
+// as explained in kernel.h. This lets one code path handle the Lhs and
+// the Rhs on an equal footing.
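+//
+// Illustrative example (the sizes are assumptions, not part of the API):
+// for a 64x32 Lhs block, width = 64 (the rows) and depth = 32 (the
+// columns); for a 32x48 Rhs block, width = 48 (the columns) and
+// depth = 32 (the rows). The shared 'depth' dimension is the one along
+// which the Lhs and the Rhs are contracted.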
+
+#ifndef GEMMLOWP_INTERNAL_PACK_H_
+#define GEMMLOWP_INTERNAL_PACK_H_
+
+#include <cstring>
+
+#include "allocator.h"
+#include "block_params.h"
+#include "common.h"
+#include "kernel.h"
+
+namespace gemmlowp {
+
+// A PackedSideBlock instance is a packed block of either the LHS or RHS
+// (whence the generic 'Side' name).
+//
+// 'Packed' means that it is laid out in the storage order that
+// is expected by the specified kernel format. From a block of the input
+// LHS or RHS matrix, one obtains a PackedSideBlock by calling PackLhs()
+// or PackRhs().
+template <typename tKernelSideFormat>
+class PackedSideBlock {
+ public:
+ typedef tKernelSideFormat KernelSideFormat;
+
+ PackedSideBlock(Side side, Allocator* allocator,
+ const BlockParams& block_params)
+ : allocator_(allocator), pos_(0) {
+ GetSideBlockParams(side, &params_, block_params);
+ data_handle_ =
+ allocator_->Reserve<std::uint8_t>(params_.l2_width * params_.l2_depth);
+ sums_of_each_slice_handle_ =
+ allocator_->Reserve<std::int32_t>(params_.l2_width);
+ }
+
+ ~PackedSideBlock() {}
+
+ void seek_run(int start_width, int start_depth) const {
+ int kernel_run_depth =
+ std::min<int>(params_.l1_depth, params_.l2_depth - start_depth);
+ pos_ = params_.l2_width * start_depth + start_width * kernel_run_depth;
+ }
+
+ void seek_next_cell() const { pos_ += KernelSideFormat::Cell::kSize; }
+
+ void seek_forward_n_cells(int n) const {
+ pos_ += n * KernelSideFormat::Cell::kSize;
+ }
+
+ const std::uint8_t* current_data() const {
+ return allocator_->GetPointer<std::uint8_t>(data_handle_) + pos_;
+ }
+
+ std::uint8_t* current_data() {
+ return allocator_->GetPointer<std::uint8_t>(data_handle_) + pos_;
+ }
+
+ std::int32_t* sums_of_each_slice() {
+ return allocator_->GetPointer<std::int32_t>(sums_of_each_slice_handle_);
+ }
+
+ const std::int32_t* sums_of_each_slice() const {
+ return allocator_->GetPointer<const std::int32_t>(
+ sums_of_each_slice_handle_);
+ }
+
+ const SideBlockParams& params() const { return params_; }
+
+ private:
+  // The block size parameters that this PackedSideBlock follows.
+ // The L2 parameters determine its overall size, while the L1 parameters,
+ // together with the kernel format template parameter, determine
+ // the fine details of the storage/traversal order.
+ SideBlockParams params_;
+
+ // Pointer to the allocator provided by the caller. Not owned.
+ // The Allocator is assumed to outlive the PackedSideBlock.
+ Allocator* const allocator_;
+
+ // Handle on the buffer backing this packed block. Owned.
+ Allocator::Handle data_handle_;
+
+ // Handle on the additional buffer backing the vector of sums of slices
+ // associated with this block. Owned.
+ Allocator::Handle sums_of_each_slice_handle_;
+
+  // pos_ is the current position in the buffer, which we access
+  // sequentially, like a file.
+  // The idea is that we pack data in the same order as it is
+  // going to be traversed during the computation. For
+  // cache-friendliness reasons that order is awkward to random-access,
+  // as the offset calculations would be intricate, so we give up
+  // random-access addressing and content ourselves with
+  // sequential access.
+ //
+ // pos_ is mutable because during the computation we will want to
+ // be able to iterate on the data in a const PackedSideBlock.
+ mutable int pos_;
+};
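+
+// Example usage (a minimal sketch; the kernel format and the
+// initialization of block_params are illustrative assumptions, not
+// prescribed by this header):
+//
+//   Allocator allocator;
+//   BlockParams block_params;
+//   // ... block_params initialized elsewhere from the matrix sizes ...
+//   typedef KernelSideFormat<CellFormat<4, 2>, 3> LhsKernelFormat;
+//   PackedSideBlock<LhsKernelFormat> packed_lhs(Side::Lhs, &allocator,
+//                                               block_params);
+//   allocator.Commit();  // Reserve() only records sizes; Commit()
+//                        // performs the actual allocation.
+//   // After packing, sums_of_each_slice()[i] holds the sum of the
+//   // entries of slice i, used later for quantization offset terms.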
+
+// WidthMajor and DepthMajor are custom phrases modelled after the
+// standard terminology 'row-major' and 'column-major'. Their meaning
+// should be transparent once one has read the explanation in kernel.h:
+// for example, in the Lhs, the 'width' dimension is the rows dimension,
+// so there WidthMajor means RowMajor, while in the Rhs it is the opposite.
+// Another way to put it: WidthMajor means that contiguous storage is used
+// for entries having the same 'width' index.
+enum class SideMapOrder { WidthMajor, DepthMajor };
+
+// Similar to MatrixMap from map.h, but in terms of width/depth instead of
+// rows/columns. Used to address blocks of the input LHS/RHS matrices when
+// packing them.
+template <typename tScalar, SideMapOrder tOrder>
+class SideMap {
+ public:
+ typedef tScalar Scalar;
+ static const SideMapOrder kOrder = tOrder;
+
+ SideMap(Scalar* data, int width, int depth, int stride)
+ : data_(data), width_(width), depth_(depth), stride_(stride) {}
+
+ SideMap(Scalar* data, int width, int depth)
+ : data_(data), width_(width), depth_(depth) {
+ stride_ = kOrder == SideMapOrder::WidthMajor ? depth_ : width_;
+ }
+
+ SideMap(const SideMap& other)
+ : data_(other.data_),
+ width_(other.width_),
+ depth_(other.depth_),
+ stride_(other.stride_) {}
+
+ int width() const { return width_; }
+ int depth() const { return depth_; }
+ int stride() const { return stride_; }
+ int width_stride() const {
+ return kOrder == SideMapOrder::DepthMajor ? 1 : stride_;
+ }
+ int depth_stride() const {
+ return kOrder == SideMapOrder::WidthMajor ? 1 : stride_;
+ }
+ Scalar* data() const { return data_; }
+ Scalar* data(int w, int d) const {
+ return data_ + w * width_stride() + d * depth_stride();
+ }
+ Scalar operator()(int w, int d) const { return *data(w, d); }
+ Scalar& operator()(int w, int d) { return *data(w, d); }
+
+ SideMap block(int start_width, int start_depth, int block_width,
+ int block_depth) const {
+ assert(start_width >= 0);
+ assert(start_width + block_width <= width_);
+ assert(start_depth >= 0);
+ assert(start_depth + block_depth <= depth_);
+
+ return SideMap(data(start_width, start_depth), block_width, block_depth,
+ stride_);
+ }
+
+ private:
+ Scalar* data_; // not owned.
+ int width_, depth_, stride_;
+};
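+
+// Illustrative sketch of SideMap addressing (the values are assumptions
+// for the example only): a WidthMajor map stores entries sharing a
+// width index contiguously, so depth_stride() == 1.
+//
+//   const std::uint8_t data[6] = {1, 2, 3, 4, 5, 6};
+//   SideMap<const std::uint8_t, SideMapOrder::WidthMajor> m(data, 2, 3);
+//   // stride_ defaults to depth_ = 3; m(w, d) reads data[w * 3 + d],
+//   // so m(1, 2) == 6.
+//   SideMap<const std::uint8_t, SideMapOrder::WidthMajor> sub =
+//       m.block(1, 1, 1, 2);  // 1 (width) x 2 (depth) sub-block at (1, 1).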
+
+// A PackingRegisterBlock is a small fixed-size block of a matrix being
+// packed. This class, PackingRegisterBlockBase, is the generic
+// non-optimized implementation; it is inherited by the generic
+// PackingRegisterBlock below, which may be overridden by template
+// specialization. Overriding it is how one provides optimized packing
+// code paths.
+//
+// The packing of a block proceeds in two steps:
+// 1. Ensuring that we have a complete block of source data, i.e. a block of
+// the compile-time prescribed size. This is where we handle unaligned
+// boundaries: if we don't have a complete block of source data, then
+// we copy and zero-extend it into a local temporary (complete_src_),
+// see MakeCompleteSrc. In the generic case, we do have a complete block,
+// so we just use it in-place, see UseCompleteSrcInPlace.
+// 2. Packing a complete block into the destination, see Pack. This is the
+// most critical part, so it's convenient that unaligned boundaries have
+// already been handled in step 1.
+template <typename SrcMapType, typename PackedSideBlock>
+class PackingRegisterBlockBase {
+ public:
+ typedef typename PackedSideBlock::KernelSideFormat KernelSideFormat;
+ typedef typename KernelSideFormat::Cell CellFormat;
+ typedef typename KernelSideFormat::Scalar KernelScalar;
+ static const int kCells = KernelSideFormat::kCells;
+ static const int kCellWidth = CellFormat::kWidth;
+ static const int kKernelWidth = CellFormat::kWidth * kCells;
+ static const int kCellDepth = CellFormat::kDepth;
+ static const int kCellSize = CellFormat::kSize;
+ static const SideMapOrder kSrcOrder = SrcMapType::kOrder;
+ static const int kZeroPointInputValue =
+ ZeroPointInputValue<KernelScalar>::kValue;
+
+ PackingRegisterBlockBase() : complete_src_(nullptr, 0, 0, 0) {}
+
+ protected:
+  // The source data that's ready for packing. May point to the actual
+  // source data in place, if it's already a complete block
+  // (see UseCompleteSrcInPlace), or to the local buf_ below, into which
+  // we copy incomplete blocks (see MakeCompleteSrc).
+ SrcMapType complete_src_;
+
+  // Temporary buffer into which we load incomplete blocks,
+  // in the source storage order.
+ std::uint8_t buf_[kKernelWidth * kRegisterSize];
+
+ public:
+  // Selects a block of in-place source data that's already a complete block.
+ void UseCompleteSrcInPlace(const SrcMapType& src) { complete_src_ = src; }
+ // Copies an incomplete block of source data into a local temporary
+ // complete block by zero-extending it.
+ void MakeCompleteSrc(const SrcMapType& src) {
+ memset(buf_, kZeroPointInputValue, kKernelWidth * kRegisterSize);
+ if (kSrcOrder == SideMapOrder::WidthMajor) {
+ for (int w = 0; w < src.width(); w++) {
+ memcpy(buf_ + w * kRegisterSize, src.data(w, 0), src.depth());
+ }
+ } else {
+ assert(kSrcOrder == SideMapOrder::DepthMajor);
+ for (int d = 0; d < src.depth(); d++) {
+ memcpy(buf_ + d * kKernelWidth, src.data(0, d), src.width());
+ }
+ }
+ complete_src_ = SrcMapType(buf_, kKernelWidth, kRegisterSize);
+ }
+ // Packs a complete block into the destination. This is the most
+ // critical part and the part that we most typically want to
+ // override in architecture-specific optimized specializations.
+ void Pack(PackedSideBlock* dst, int start_width) {
+ std::uint8_t* dst_ptr = dst->current_data();
+ for (int cell_start_depth = 0; cell_start_depth < kRegisterSize;
+ cell_start_depth += kCellDepth) {
+ for (int cell_start_width = 0; cell_start_width < kKernelWidth;
+ cell_start_width += kCellWidth) {
+ std::int32_t* cell_sums_of_each_slice_ptr =
+ dst->sums_of_each_slice() + start_width + cell_start_width;
+ const SideMap<const std::uint8_t, kSrcOrder> src_cell_map(
+ complete_src_.block(cell_start_width, cell_start_depth, kCellWidth,
+ kCellDepth));
+ for (int w = 0; w < kCellWidth; w++) {
+ std::int32_t sum = 0;
+ for (int d = 0; d < kCellDepth; d++) {
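+            // kZeroPointInputValue is nonzero only for kernel formats
+            // taking signed input (e.g. int8), where subtracting it
+            // recenters the uint8 source; the truncating store below
+            // then acts as a mod-256 bit-cast into the kernel's scalar
+            // type, while 'sum' accumulates the unwrapped values.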
+ const std::uint8_t src_val = src_cell_map(w, d);
+ const std::int16_t kernel_val_unwrapped =
+ src_val - kZeroPointInputValue;
+ const std::uint8_t kernel_val_uint8 = kernel_val_unwrapped;
+ dst_ptr[OffsetIntoCell<CellFormat>(w, d)] = kernel_val_uint8;
+ sum += kernel_val_unwrapped;
+ }
+ cell_sums_of_each_slice_ptr[w] += sum;
+ }
+ dst_ptr += kCellSize;
+ }
+ }
+ dst->seek_forward_n_cells(kCells * kRegisterSize / kCellDepth);
+ }
+};
+
+template <typename SrcMapType, typename PackedSideBlock>
+class PackingRegisterBlock
+ : public PackingRegisterBlockBase<SrcMapType, PackedSideBlock> {};
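+
+// Optimized packing paths hook in by specializing PackingRegisterBlock.
+// A sketch of the shape such a specialization takes (the real ones live
+// in pack_neon.h / pack_sse.h; the typedef here is an assumption made
+// for the example):
+//
+//   typedef SideMap<const std::uint8_t, SideMapOrder::WidthMajor>
+//       WidthMajorUint8SideMap;
+//   template <typename PackedSideBlock>
+//   class PackingRegisterBlock<WidthMajorUint8SideMap, PackedSideBlock>
+//       : public PackingRegisterBlockBase<WidthMajorUint8SideMap,
+//                                         PackedSideBlock> {
+//    public:
+//     void Pack(PackedSideBlock* dst, int start_width) {
+//       // SIMD loads/stores replacing the generic scalar loops above.
+//     }
+//   };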
+
+// Large-scale implementation of packing.
+template <typename SrcMapType, typename PackedSideBlock>
+class PackSideBlockImpl {
+ public:
+ typedef typename PackedSideBlock::KernelSideFormat KernelSideFormat;
+ typedef typename KernelSideFormat::Cell CellFormat;
+ static const int kCells = KernelSideFormat::kCells;
+ static const int kCellWidth = CellFormat::kWidth;
+ static const int kKernelWidth = CellFormat::kWidth * kCells;
+ static const int kCellDepth = CellFormat::kDepth;
+
+ typedef PackingRegisterBlock<SrcMapType, PackedSideBlock>
+ PackingRegisterBlockType;
+
+ PackSideBlockImpl(PackedSideBlock* packed_side_block,
+ const SrcMapType& src_map)
+ : packed_side_block_(packed_side_block), src_map_(src_map) {}
+
+ PackedSideBlock* packed_side_block() const { return packed_side_block_; }
+
+ const SrcMapType& src_map() const { return src_map_; }
+
+ // The public entry point to pack a block.
+ void PackL2() {
+ memset(packed_side_block_->sums_of_each_slice(), 0,
+ sizeof(std::int32_t) * packed_side_block_->params().l2_width);
+ for (int d = 0; d < src_map_.depth();
+ d += packed_side_block_->params().l1_depth) {
+ int ds = std::min<int>(packed_side_block_->params().l1_depth,
+ src_map_.depth() - d);
+
+ for (int w = 0; w < src_map_.width();
+ w += packed_side_block_->params().l1_width) {
+ int ws = std::min<int>(packed_side_block_->params().l1_width,
+ src_map_.width() - w);
+
+ PrefetchL1(w, ws, d, ds);
+ PackL1(w, ws, d, ds);
+ }
+ }
+ }
+
+ protected:
+ // The intermediate-level loops, between PackL2 and PackRun.
+ void PackL1(int start_width, int width, int start_depth, int depth) {
+ for (int w = 0; w < width; w += kKernelWidth) {
+ int ws = std::min(+kKernelWidth, width - w);
+ packed_side_block_->seek_run(start_width + w, start_depth);
+ PackRun(start_width + w, ws, start_depth, depth);
+ }
+ }
+
+ // Prefetches the data that will be read by PackL1
+ void PrefetchL1(int start_width, int width, int start_depth, int depth) {
+ if (SrcMapType::kOrder == SideMapOrder::WidthMajor) {
+ for (int d = 0; d < depth; d += kDefaultCacheLineSize) {
+ for (int w = 0; w < width; w += 1) {
+ Prefetch(src_map_.data(start_width + w, start_depth + d));
+ }
+ }
+ } else {
+ for (int d = 0; d < depth; d++) {
+ for (int w = 0; w < width; w += kDefaultCacheLineSize) {
+ Prefetch(src_map_.data(start_width + w, start_depth + d));
+ }
+ }
+ }
+ }
+
+  // PackRun packs a single run, i.e. it is the inner loop in the
+  // depth dimension.
+ void PackRun(int start_width, int width, int start_depth, int depth) {
+ PackingRegisterBlockType b;
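+    // Fast path: a full-kernel-width run. Register-aligned depth is
+    // packed in place; any remainder is zero-extended via buf_.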
+ if (width == kKernelWidth) {
+ const int register_aligned_depth = RoundDown<kRegisterSize>(depth);
+ if (register_aligned_depth) {
+ for (int d = 0; d < register_aligned_depth; d += kRegisterSize) {
+ b.UseCompleteSrcInPlace(src_map_.block(start_width, start_depth + d,
+ width, kRegisterSize));
+ b.Pack(packed_side_block_, start_width);
+ }
+ }
+ if (register_aligned_depth < depth) {
+ b.MakeCompleteSrc(
+ src_map_.block(start_width, start_depth + register_aligned_depth,
+ width, depth - register_aligned_depth));
+ b.Pack(packed_side_block_, start_width);
+ }
+ } else {
+ assert(width < kKernelWidth);
+ for (int d = 0; d < depth; d += kRegisterSize) {
+ const int ds = std::min(+kRegisterSize, depth - d);
+ b.MakeCompleteSrc(
+ src_map_.block(start_width, start_depth + d, width, ds));
+ b.Pack(packed_side_block_, start_width);
+ }
+ }
+ }
+
+ // The PackedSideBlock being packed, i.e. the 'destination'.
+ PackedSideBlock* const packed_side_block_;
+
+ // A map on the block of the original matrix block being packed,
+ // i.e. the 'source'.
+ const SrcMapType& src_map_;
+};
+
+// Packs a block of the input LHS matrix, into a PackedSideBlock
+template <typename PackedSideBlock, typename MatrixMapType>
+void PackLhs(PackedSideBlock* dst, const MatrixMapType& src) {
+ ScopedProfilingLabel label("pack LHS");
+ static const SideMapOrder kSideMapOrder =
+ MatrixMapType::kOrder == MapOrder::RowMajor ? SideMapOrder::WidthMajor
+ : SideMapOrder::DepthMajor;
+ typedef typename MatrixMapType::Scalar Scalar;
+ typedef SideMap<Scalar, kSideMapOrder> SideMapType;
+ SideMapType src_side_map(src.data(), src.rows(), src.cols(), src.stride());
+ typedef PackSideBlockImpl<SideMapType, PackedSideBlock> ImplType;
+ ImplType impl(dst, src_side_map);
+ impl.PackL2();
+}
+
+// Packs a block of the input RHS matrix, into a PackedSideBlock
+template <typename PackedSideBlock, typename MatrixMapType>
+void PackRhs(PackedSideBlock* dst, const MatrixMapType& src) {
+ ScopedProfilingLabel label("pack RHS");
+ static const SideMapOrder kSideMapOrder =
+ MatrixMapType::kOrder == MapOrder::ColMajor ? SideMapOrder::WidthMajor
+ : SideMapOrder::DepthMajor;
+ typedef typename MatrixMapType::Scalar Scalar;
+ typedef SideMap<Scalar, kSideMapOrder> SideMapType;
+ SideMapType src_side_map(src.data(), src.cols(), src.rows(), src.stride());
+ typedef PackSideBlockImpl<SideMapType, PackedSideBlock> ImplType;
+ ImplType impl(dst, src_side_map);
+ impl.PackL2();
+}
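+
+// Illustrative call site (a sketch; MatrixMap lives in map.h, and the
+// matrix sizes and block coordinates are assumptions for the example):
+//
+//   MatrixMap<const std::uint8_t, MapOrder::RowMajor> lhs(lhs_data, m, k);
+//   PackLhs(&packed_lhs,
+//           lhs.block(start_row, start_depth, block_rows, block_depth));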
+
+} // namespace gemmlowp
+
+#ifdef GEMMLOWP_NEON
+#include "pack_neon.h"
+#elif defined(GEMMLOWP_SSE4)
+#include "pack_sse.h"
+#endif
+
+#endif // GEMMLOWP_INTERNAL_PACK_H_