Diffstat (limited to 'runtimes/nn/depend/external/gemmlowp/internal/pack.h')
-rw-r--r-- | runtimes/nn/depend/external/gemmlowp/internal/pack.h | 435 |
1 files changed, 435 insertions, 0 deletions
diff --git a/runtimes/nn/depend/external/gemmlowp/internal/pack.h b/runtimes/nn/depend/external/gemmlowp/internal/pack.h
new file mode 100644
index 000000000..339539602
--- /dev/null
+++ b/runtimes/nn/depend/external/gemmlowp/internal/pack.h
@@ -0,0 +1,435 @@
+// Copyright 2015 The Gemmlowp Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// pack.h: packing blocks of the LHS and RHS into the data layout
+// that is expected by compute.h and eventually by kernels.
+// Because this data layout depends on the kernel format, code here
+// is templated in KernelLhsFormat/KernelRhsFormat.
+//
+// Readers note: an important theme around here is that we try hard
+// to handle both Lhs and Rhs with a single piece of code. We indifferently
+// refer to the Lhs and Rhs as a 'Side'. Instead of addressing matrices
+// by (row, column) indices, we address them by (width, depth), as explained
+// in kernel.h. This allows us to handle both Lhs and Rhs on an equal footing,
+// at once.
+
+#ifndef GEMMLOWP_INTERNAL_PACK_H_
+#define GEMMLOWP_INTERNAL_PACK_H_
+
+#include <cstring>
+
+#include "allocator.h"
+#include "block_params.h"
+#include "common.h"
+#include "kernel.h"
+
+namespace gemmlowp {
+
+// A PackedSideBlock instance is a packed block of either the LHS or RHS
+// (whence the generic 'Side' name).
+//
+// 'Packed' means that it is laid out in the storage order that
+// is expected by the specified kernel format. From a block of the input
+// LHS or RHS matrix, one obtains a PackedSideBlock by calling PackLhs()
+// or PackRhs().
+template <typename tKernelSideFormat>
+class PackedSideBlock {
+ public:
+  typedef tKernelSideFormat KernelSideFormat;
+
+  PackedSideBlock(Side side, Allocator* allocator,
+                  const BlockParams& block_params)
+      : allocator_(allocator), pos_(0) {
+    GetSideBlockParams(side, &params_, block_params);
+    data_handle_ =
+        allocator_->Reserve<std::uint8_t>(params_.l2_width * params_.l2_depth);
+    sums_of_each_slice_handle_ =
+        allocator_->Reserve<std::int32_t>(params_.l2_width);
+  }
+
+  ~PackedSideBlock() {}
+
+  void seek_run(int start_width, int start_depth) const {
+    int kernel_run_depth =
+        std::min<int>(params_.l1_depth, params_.l2_depth - start_depth);
+    pos_ = params_.l2_width * start_depth + start_width * kernel_run_depth;
+  }
+
+  void seek_next_cell() const { pos_ += KernelSideFormat::Cell::kSize; }
+
+  void seek_forward_n_cells(int n) const {
+    pos_ += n * KernelSideFormat::Cell::kSize;
+  }
+
+  const std::uint8_t* current_data() const {
+    return allocator_->GetPointer<std::uint8_t>(data_handle_) + pos_;
+  }
+
+  std::uint8_t* current_data() {
+    return allocator_->GetPointer<std::uint8_t>(data_handle_) + pos_;
+  }
+
+  std::int32_t* sums_of_each_slice() {
+    return allocator_->GetPointer<std::int32_t>(sums_of_each_slice_handle_);
+  }
+
+  const std::int32_t* sums_of_each_slice() const {
+    return allocator_->GetPointer<const std::int32_t>(
+        sums_of_each_slice_handle_);
+  }
+
+  const SideBlockParams& params() const { return params_; }
+
+ private:
+  // The block size parameters that this PackedSideBlock follows.
+  // The L2 parameters determine its overall size, while the L1 parameters,
+  // together with the kernel format template parameter, determine
+  // the fine details of the storage/traversal order.
+  SideBlockParams params_;
+
+  // Pointer to the allocator provided by the caller. Not owned.
+  // The Allocator is assumed to outlive the PackedSideBlock.
+  Allocator* const allocator_;
+
+  // Handle on the buffer backing this packed block. Owned.
+  Allocator::Handle data_handle_;
+
+  // Handle on the additional buffer backing the vector of sums of slices
+  // associated with this block. Owned.
+  Allocator::Handle sums_of_each_slice_handle_;
+
+  // pos_ is the current position in the buffer, which we access
+  // sequentially, like a file.
+  // The idea is that we pack data in the same order as it is
+  // going to be traversed during the computation, an order that is
+  // chosen for cache-friendliness and that would make random-access
+  // offset calculations intricate. So we give up random-access
+  // addressing, and instead content ourselves with sequential access.
+  //
+  // pos_ is mutable because during the computation we will want to
+  // be able to iterate on the data in a const PackedSideBlock.
+  mutable int pos_;
+};
+
+// WidthMajor and DepthMajor are custom phrases modelled after the
+// standard terminology 'row-major' and 'column-major'. Their meaning
+// should be transparent once one has read the explanation in kernel.h:
+// for example, in the Lhs, the 'width' dimension is the rows dimension,
+// so there WidthMajor means RowMajor, while in the Rhs it is the opposite.
+// Another way to put it: WidthMajor means that contiguous storage is used
+// for entries having the same 'width' index.
+enum class SideMapOrder { WidthMajor, DepthMajor };
+
+// Similar to MatrixMap from map.h, but in terms of width/depth instead of
+// rows/columns. Used to address blocks of the input LHS/RHS matrices when
+// packing them.
+template <typename tScalar, SideMapOrder tOrder>
+class SideMap {
+ public:
+  typedef tScalar Scalar;
+  static const SideMapOrder kOrder = tOrder;
+
+  SideMap(Scalar* data, int width, int depth, int stride)
+      : data_(data), width_(width), depth_(depth), stride_(stride) {}
+
+  SideMap(Scalar* data, int width, int depth)
+      : data_(data), width_(width), depth_(depth) {
+    stride_ = kOrder == SideMapOrder::WidthMajor ? depth_ : width_;
+  }
+
+  SideMap(const SideMap& other)
+      : data_(other.data_),
+        width_(other.width_),
+        depth_(other.depth_),
+        stride_(other.stride_) {}
+
+  int width() const { return width_; }
+  int depth() const { return depth_; }
+  int stride() const { return stride_; }
+  int width_stride() const {
+    return kOrder == SideMapOrder::DepthMajor ? 1 : stride_;
+  }
+  int depth_stride() const {
+    return kOrder == SideMapOrder::WidthMajor ? 1 : stride_;
+  }
+  Scalar* data() const { return data_; }
+  Scalar* data(int w, int d) const {
+    return data_ + w * width_stride() + d * depth_stride();
+  }
+  Scalar operator()(int w, int d) const { return *data(w, d); }
+  Scalar& operator()(int w, int d) { return *data(w, d); }
+
+  SideMap block(int start_width, int start_depth, int block_width,
+                int block_depth) const {
+    assert(start_width >= 0);
+    assert(start_width + block_width <= width_);
+    assert(start_depth >= 0);
+    assert(start_depth + block_depth <= depth_);
+
+    return SideMap(data(start_width, start_depth), block_width, block_depth,
+                   stride_);
+  }
+
+ private:
+  Scalar* data_;  // not owned.
+  int width_, depth_, stride_;
+};
+
+// A PackingRegisterBlock is a small fixed-size block of a matrix being
+// packed. This class is the generic, non-optimized implementation;
+// it is inherited by the generic implementation of PackingRegisterBlock,
+// which may be overridden by template specialization. Overriding it is how
+// one may provide optimized packing code paths.
+//
+// The packing of a block proceeds in two steps:
+//   1. Ensuring that we have a complete block of source data, i.e. a block of
+//      the compile-time prescribed size. This is where we handle unaligned
+//      boundaries: if we don't have a complete block of source data, then
+//      we copy and zero-extend it into a local temporary (complete_src_),
+//      see MakeCompleteSrc. In the generic case, we do have a complete block,
+//      so we just use it in-place, see UseCompleteSrcInPlace.
+//   2. Packing a complete block into the destination, see Pack. This is the
+//      most critical part, so it's convenient that unaligned boundaries have
+//      already been handled in step 1.
+template <typename SrcMapType, typename PackedSideBlock>
+class PackingRegisterBlockBase {
+ public:
+  typedef typename PackedSideBlock::KernelSideFormat KernelSideFormat;
+  typedef typename KernelSideFormat::Cell CellFormat;
+  typedef typename KernelSideFormat::Scalar KernelScalar;
+  static const int kCells = KernelSideFormat::kCells;
+  static const int kCellWidth = CellFormat::kWidth;
+  static const int kKernelWidth = CellFormat::kWidth * kCells;
+  static const int kCellDepth = CellFormat::kDepth;
+  static const int kCellSize = CellFormat::kSize;
+  static const SideMapOrder kSrcOrder = SrcMapType::kOrder;
+  static const int kZeroPointInputValue =
+      ZeroPointInputValue<KernelScalar>::kValue;
+
+  PackingRegisterBlockBase() : complete_src_(nullptr, 0, 0, 0) {}
+
+ protected:
+  // The source data that's ready for packing. May point to the actual
+  // in-place source data if it's already a complete block
+  // (see UseCompleteSrcInPlace), or to the local buf_ below into which
+  // we copy incomplete blocks (see MakeCompleteSrc).
+  SrcMapType complete_src_;
+
+  // Temporary buffer into which incomplete blocks are loaded,
+  // in the source storage order.
+  std::uint8_t buf_[kKernelWidth * kRegisterSize];
+
+ public:
+  // Selects a block of in-place source data that's already a complete block.
+  void UseCompleteSrcInPlace(const SrcMapType& src) { complete_src_ = src; }
+  // Copies an incomplete block of source data into a local temporary
+  // complete block by zero-extending it.
+  void MakeCompleteSrc(const SrcMapType& src) {
+    memset(buf_, kZeroPointInputValue, kKernelWidth * kRegisterSize);
+    if (kSrcOrder == SideMapOrder::WidthMajor) {
+      for (int w = 0; w < src.width(); w++) {
+        memcpy(buf_ + w * kRegisterSize, src.data(w, 0), src.depth());
+      }
+    } else {
+      assert(kSrcOrder == SideMapOrder::DepthMajor);
+      for (int d = 0; d < src.depth(); d++) {
+        memcpy(buf_ + d * kKernelWidth, src.data(0, d), src.width());
+      }
+    }
+    complete_src_ = SrcMapType(buf_, kKernelWidth, kRegisterSize);
+  }
+  // Packs a complete block into the destination. This is the most
+  // critical part and the part that we most typically want to
+  // override in architecture-specific optimized specializations.
+  void Pack(PackedSideBlock* dst, int start_width) {
+    std::uint8_t* dst_ptr = dst->current_data();
+    for (int cell_start_depth = 0; cell_start_depth < kRegisterSize;
+         cell_start_depth += kCellDepth) {
+      for (int cell_start_width = 0; cell_start_width < kKernelWidth;
+           cell_start_width += kCellWidth) {
+        std::int32_t* cell_sums_of_each_slice_ptr =
+            dst->sums_of_each_slice() + start_width + cell_start_width;
+        const SideMap<const std::uint8_t, kSrcOrder> src_cell_map(
+            complete_src_.block(cell_start_width, cell_start_depth, kCellWidth,
+                                kCellDepth));
+        for (int w = 0; w < kCellWidth; w++) {
+          std::int32_t sum = 0;
+          for (int d = 0; d < kCellDepth; d++) {
+            const std::uint8_t src_val = src_cell_map(w, d);
+            const std::int16_t kernel_val_unwrapped =
+                src_val - kZeroPointInputValue;
+            const std::uint8_t kernel_val_uint8 = kernel_val_unwrapped;
+            dst_ptr[OffsetIntoCell<CellFormat>(w, d)] = kernel_val_uint8;
+            sum += kernel_val_unwrapped;
+          }
+          cell_sums_of_each_slice_ptr[w] += sum;
+        }
+        dst_ptr += kCellSize;
+      }
+    }
+    dst->seek_forward_n_cells(kCells * kRegisterSize / kCellDepth);
+  }
+};
+
+template <typename SrcMapType, typename PackedSideBlock>
+class PackingRegisterBlock
+    : public PackingRegisterBlockBase<SrcMapType, PackedSideBlock> {};
+
+// Large-scale implementation of packing.
+template <typename SrcMapType, typename PackedSideBlock>
+class PackSideBlockImpl {
+ public:
+  typedef typename PackedSideBlock::KernelSideFormat KernelSideFormat;
+  typedef typename KernelSideFormat::Cell CellFormat;
+  static const int kCells = KernelSideFormat::kCells;
+  static const int kCellWidth = CellFormat::kWidth;
+  static const int kKernelWidth = CellFormat::kWidth * kCells;
+  static const int kCellDepth = CellFormat::kDepth;
+
+  typedef PackingRegisterBlock<SrcMapType, PackedSideBlock>
+      PackingRegisterBlockType;
+
+  PackSideBlockImpl(PackedSideBlock* packed_side_block,
+                    const SrcMapType& src_map)
+      : packed_side_block_(packed_side_block), src_map_(src_map) {}
+
+  PackedSideBlock* packed_side_block() const { return packed_side_block_; }
+
+  const SrcMapType& src_map() const { return src_map_; }
+
+  // The public entry point to pack a block.
+  void PackL2() {
+    memset(packed_side_block_->sums_of_each_slice(), 0,
+           sizeof(std::int32_t) * packed_side_block_->params().l2_width);
+    for (int d = 0; d < src_map_.depth();
+         d += packed_side_block_->params().l1_depth) {
+      int ds = std::min<int>(packed_side_block_->params().l1_depth,
+                             src_map_.depth() - d);
+
+      for (int w = 0; w < src_map_.width();
+           w += packed_side_block_->params().l1_width) {
+        int ws = std::min<int>(packed_side_block_->params().l1_width,
+                               src_map_.width() - w);
+
+        PrefetchL1(w, ws, d, ds);
+        PackL1(w, ws, d, ds);
+      }
+    }
+  }
+
+ protected:
+  // The intermediate-level loops, between PackL2 and PackRun.
+  void PackL1(int start_width, int width, int start_depth, int depth) {
+    for (int w = 0; w < width; w += kKernelWidth) {
+      int ws = std::min(+kKernelWidth, width - w);
+      packed_side_block_->seek_run(start_width + w, start_depth);
+      PackRun(start_width + w, ws, start_depth, depth);
+    }
+  }
+
+  // Prefetches the data that will be read by PackL1.
+  void PrefetchL1(int start_width, int width, int start_depth, int depth) {
+    if (SrcMapType::kOrder == SideMapOrder::WidthMajor) {
+      for (int d = 0; d < depth; d += kDefaultCacheLineSize) {
+        for (int w = 0; w < width; w += 1) {
+          Prefetch(src_map_.data(start_width + w, start_depth + d));
+        }
+      }
+    } else {
+      for (int d = 0; d < depth; d++) {
+        for (int w = 0; w < width; w += kDefaultCacheLineSize) {
+          Prefetch(src_map_.data(start_width + w, start_depth + d));
+        }
+      }
+    }
+  }
+
+  // PackRun packs only a run, i.e. it is the inner loop in the depth
+  // dimension.
+  void PackRun(int start_width, int width, int start_depth, int depth) {
+    PackingRegisterBlockType b;
+    if (width == kKernelWidth) {
+      const int register_aligned_depth = RoundDown<kRegisterSize>(depth);
+      if (register_aligned_depth) {
+        for (int d = 0; d < register_aligned_depth; d += kRegisterSize) {
+          b.UseCompleteSrcInPlace(src_map_.block(start_width, start_depth + d,
+                                                 width, kRegisterSize));
+          b.Pack(packed_side_block_, start_width);
+        }
+      }
+      if (register_aligned_depth < depth) {
+        b.MakeCompleteSrc(
+            src_map_.block(start_width, start_depth + register_aligned_depth,
+                           width, depth - register_aligned_depth));
+        b.Pack(packed_side_block_, start_width);
+      }
+    } else {
+      assert(width < kKernelWidth);
+      for (int d = 0; d < depth; d += kRegisterSize) {
+        const int ds = std::min(+kRegisterSize, depth - d);
+        b.MakeCompleteSrc(
+            src_map_.block(start_width, start_depth + d, width, ds));
+        b.Pack(packed_side_block_, start_width);
+      }
+    }
+  }
+
+  // The PackedSideBlock being packed, i.e. the 'destination'.
+  PackedSideBlock* const packed_side_block_;
+
+  // A map on the block of the original matrix being packed,
+  // i.e. the 'source'.
+  const SrcMapType& src_map_;
+};
+
+// Packs a block of the input LHS matrix into a PackedSideBlock.
+template <typename PackedSideBlock, typename MatrixMapType>
+void PackLhs(PackedSideBlock* dst, const MatrixMapType& src) {
+  ScopedProfilingLabel label("pack LHS");
+  static const SideMapOrder kSideMapOrder =
+      MatrixMapType::kOrder == MapOrder::RowMajor ? SideMapOrder::WidthMajor
+                                                  : SideMapOrder::DepthMajor;
+  typedef typename MatrixMapType::Scalar Scalar;
+  typedef SideMap<Scalar, kSideMapOrder> SideMapType;
+  SideMapType src_side_map(src.data(), src.rows(), src.cols(), src.stride());
+  typedef PackSideBlockImpl<SideMapType, PackedSideBlock> ImplType;
+  ImplType impl(dst, src_side_map);
+  impl.PackL2();
+}
+
+// Packs a block of the input RHS matrix into a PackedSideBlock.
+template <typename PackedSideBlock, typename MatrixMapType>
+void PackRhs(PackedSideBlock* dst, const MatrixMapType& src) {
+  ScopedProfilingLabel label("pack RHS");
+  static const SideMapOrder kSideMapOrder =
+      MatrixMapType::kOrder == MapOrder::ColMajor ? SideMapOrder::WidthMajor
+                                                  : SideMapOrder::DepthMajor;
+  typedef typename MatrixMapType::Scalar Scalar;
+  typedef SideMap<Scalar, kSideMapOrder> SideMapType;
+  SideMapType src_side_map(src.data(), src.cols(), src.rows(), src.stride());
+  typedef PackSideBlockImpl<SideMapType, PackedSideBlock> ImplType;
+  ImplType impl(dst, src_side_map);
+  impl.PackL2();
+}
+
+}  // namespace gemmlowp
+
+#ifdef GEMMLOWP_NEON
+#include "pack_neon.h"
+#elif defined(GEMMLOWP_SSE4)
+#include "pack_sse.h"
+#endif
+
+#endif  // GEMMLOWP_INTERNAL_PACK_H_
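
A note on the WidthMajor/DepthMajor terminology used throughout the file: for the LHS, 'width' is the rows dimension and 'depth' is the columns dimension, so a WidthMajor SideMap over a row-major LHS block resolves to plain row-major addressing. The standalone sketch below is illustrative only (width_major_addr is not a gemmlowp function); it mirrors what SideMap<const std::uint8_t, SideMapOrder::WidthMajor>::data(w, d) computes, where width_stride() == stride and depth_stride() == 1.

    #include <cassert>
    #include <cstdint>

    // Mirrors WidthMajor SideMap addressing: entry (w, d) lives at
    // data + w * stride + d, exactly like row-major (row, col) addressing.
    const std::uint8_t* width_major_addr(const std::uint8_t* data, int stride,
                                         int w, int d) {
      return data + w * stride + d;
    }

    int main() {
      // A 3 (width) x 4 (depth) block stored width-major with stride 4.
      std::uint8_t block[3 * 4];
      for (int i = 0; i < 12; i++) block[i] = static_cast<std::uint8_t>(i);
      // Entry at width index 2, depth index 1 is block[2 * 4 + 1] == 9.
      assert(*width_major_addr(block, 4, 2, 1) == 9);
      return 0;
    }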
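The zero-extension step performed by MakeCompleteSrc can also be seen in isolation. The sketch below reproduces only its WidthMajor branch outside of gemmlowp; the constants kRegisterSize, kKernelWidth and kZeroPointInputValue are placeholders standing in for the values that common.h and the kernel format supply, and make_complete_src is a hypothetical helper, not part of the header above.

    #include <cstdint>
    #include <cstring>

    // Placeholder values; in gemmlowp these come from common.h and the
    // kernel format template parameters.
    const int kRegisterSize = 16;
    const int kKernelWidth = 4;
    const std::uint8_t kZeroPointInputValue = 0;

    // An incomplete src_width x src_depth source block is copied into a
    // kKernelWidth x kRegisterSize temporary, and the remainder is filled
    // with the zero-point value, so that Pack() always consumes a full block.
    void make_complete_src(const std::uint8_t* src, int src_width,
                           int src_depth, int src_stride, std::uint8_t* buf) {
      std::memset(buf, kZeroPointInputValue, kKernelWidth * kRegisterSize);
      for (int w = 0; w < src_width; w++) {
        std::memcpy(buf + w * kRegisterSize, src + w * src_stride, src_depth);
      }
    }

    int main() {
      // A leftover 3 (width) x 5 (depth) block, stored width-major, stride 5.
      std::uint8_t src[3 * 5];
      std::memset(src, 1, sizeof(src));
      std::uint8_t buf[kKernelWidth * kRegisterSize];
      make_complete_src(src, 3, 5, 5, buf);
      // Rows 0..2 of buf now hold 5 real bytes followed by 11 zero-point
      // bytes; row 3 is entirely zero-point bytes.
      return 0;
    }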
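Finally, a hedged sketch of how the PackLhs() entry point is typically driven, modelled on gemmlowp's single-threaded GEMM. PackedSideBlock, PackLhs and the MatrixMap interface are taken from the header above; Allocator, BlockParams, Side and MatrixMap come from the other gemmlowp headers it includes, and details such as Allocator::Commit()/Decommit() and the concrete kernel format are assumptions here rather than something this diff guarantees.

    // Hedged caller sketch; PackLhsL2Block is a hypothetical helper.
    #include <cstdint>
    #include "allocator.h"
    #include "block_params.h"
    #include "map.h"
    #include "pack.h"

    namespace example {

    template <typename KernelLhsFormat>
    void PackLhsL2Block(
        const gemmlowp::MatrixMap<const std::uint8_t,
                                  gemmlowp::MapOrder::RowMajor>& lhs_block,
        gemmlowp::Allocator* allocator,
        const gemmlowp::BlockParams& block_params) {
      // The PackedSideBlock reserves its storage from the caller's Allocator,
      // sized by the L2 block params; buffers are assumed to need committing
      // before use, as in gemmlowp's single_thread_gemm.
      gemmlowp::PackedSideBlock<KernelLhsFormat> packed_lhs(
          gemmlowp::Side::Lhs, allocator, block_params);
      allocator->Commit();
      // PackLhs() wraps the row-major (rows x depth) block in a WidthMajor
      // SideMap and runs the PackL2 -> PackL1 -> PackRun loop nest above.
      gemmlowp::PackLhs(&packed_lhs, lhs_block);
      allocator->Decommit();
    }

    }  // namespace example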