diff options
Diffstat (limited to 'runtimes/nn/depend/external/gemmlowp/internal/pack.h')
-rw-r--r-- | runtimes/nn/depend/external/gemmlowp/internal/pack.h | 435 |
1 files changed, 0 insertions, 435 deletions
diff --git a/runtimes/nn/depend/external/gemmlowp/internal/pack.h b/runtimes/nn/depend/external/gemmlowp/internal/pack.h deleted file mode 100644 index 339539602..000000000 --- a/runtimes/nn/depend/external/gemmlowp/internal/pack.h +++ /dev/null @@ -1,435 +0,0 @@ -// Copyright 2015 The Gemmlowp Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// pack.h: packing blocks of the LHS and RHS into the data layout -// that is expected by compute.h and eventually by kernels. -// Because this data layout depends on the kernel format, code here -// is templated in KernelLhsFormat/KernelRhsFormat. -// -// Readers note: an important theme around here is that we try hard -// to handle both Lhs and Rhs with a single piece of code. We indifferently -// refer to the Lhs and Rhs as a 'Side'. Instead of addressing matrices -// by (row, column) indices, we address them by (width, depth), as explained -// in kernel.h. This allows us to handle both Lhs and Rhs on an equal footing, -// at once. - -#ifndef GEMMLOWP_INTERNAL_PACK_H_ -#define GEMMLOWP_INTERNAL_PACK_H_ - -#include <cstring> - -#include "allocator.h" -#include "block_params.h" -#include "common.h" -#include "kernel.h" - -namespace gemmlowp { - -// A PackedSideBlock instance is a packed block of either the LHS or RHS -// (whence the generic 'Side' name). -// -// 'Packed' means that it is laid out in the storage order that -// is expected by the specified kernel format. From a block of the input -// LHS or RHS matrix, one obtains a PackedSideBlock by calling PackLhs() -// or PackRhs(). -template <typename tKernelSideFormat> -class PackedSideBlock { - public: - typedef tKernelSideFormat KernelSideFormat; - - PackedSideBlock(Side side, Allocator* allocator, - const BlockParams& block_params) - : allocator_(allocator), pos_(0) { - GetSideBlockParams(side, ¶ms_, block_params); - data_handle_ = - allocator_->Reserve<std::uint8_t>(params_.l2_width * params_.l2_depth); - sums_of_each_slice_handle_ = - allocator_->Reserve<std::int32_t>(params_.l2_width); - } - - ~PackedSideBlock() {} - - void seek_run(int start_width, int start_depth) const { - int kernel_run_depth = - std::min<int>(params_.l1_depth, params_.l2_depth - start_depth); - pos_ = params_.l2_width * start_depth + start_width * kernel_run_depth; - } - - void seek_next_cell() const { pos_ += KernelSideFormat::Cell::kSize; } - - void seek_forward_n_cells(int n) const { - pos_ += n * KernelSideFormat::Cell::kSize; - } - - const std::uint8_t* current_data() const { - return allocator_->GetPointer<std::uint8_t>(data_handle_) + pos_; - } - - std::uint8_t* current_data() { - return allocator_->GetPointer<std::uint8_t>(data_handle_) + pos_; - } - - std::int32_t* sums_of_each_slice() { - return allocator_->GetPointer<std::int32_t>(sums_of_each_slice_handle_); - } - - const std::int32_t* sums_of_each_slice() const { - return allocator_->GetPointer<const std::int32_t>( - sums_of_each_slice_handle_); - } - - const SideBlockParams& params() const { return params_; } - - private: - // The block size parameters that this PackedSizeBlock follows. - // The L2 parameters determine its overall size, while the L1 parameters, - // together with the kernel format template parameter, determine - // the fine details of the storage/traversal order. - SideBlockParams params_; - - // Pointer to the allocator provided by the caller. Not owned. - // The Allocator is assumed to outlive the PackedSideBlock. - Allocator* const allocator_; - - // Handle on the buffer backing this packed block. Owned. - Allocator::Handle data_handle_; - - // Handle on the additional buffer backing the vector of sums of slices - // associated with this block. Owned. - Allocator::Handle sums_of_each_slice_handle_; - - // pos_ is the current position in the buffer, which we access - // sequentially, like a file. - // The idea is that we pack data in the same order as it is - // going to be traversed during the computation, which for - // cache-friendliness reasons is complicated to random-access, - // as the offsets calculations would be intricate. So we - // give up random-access addressing, and instead content ourselves - // with sequential access. - // - // pos_ is mutable because during the computation we will want to - // be able to iterate on the data in a const PackedSideBlock. - mutable int pos_; -}; - -// WidthMajor and DepthMajor are custom phrases modelled after the -// standard terminology 'row-major' and 'column-major'. Their meaning -// should be transparent once one has read the explanation in kernel.h: -// for example, in the Lhs, the 'width' dimension is the rows dimension, -// so there WidthMajor means RowMajor, while in the Rhs it is the opposite. -// Another way to put it: WidthMajor means that contiguous storage is used -// for entries having the same 'width' index. -enum class SideMapOrder { WidthMajor, DepthMajor }; - -// Similar to MatrixMap from map.h, but in terms of width/depth instead of -// rows/columns. Used to address blocks of the input LHS/RHS matrices when -// packing them. -template <typename tScalar, SideMapOrder tOrder> -class SideMap { - public: - typedef tScalar Scalar; - static const SideMapOrder kOrder = tOrder; - - SideMap(Scalar* data, int width, int depth, int stride) - : data_(data), width_(width), depth_(depth), stride_(stride) {} - - SideMap(Scalar* data, int width, int depth) - : data_(data), width_(width), depth_(depth) { - stride_ = kOrder == SideMapOrder::WidthMajor ? depth_ : width_; - } - - SideMap(const SideMap& other) - : data_(other.data_), - width_(other.width_), - depth_(other.depth_), - stride_(other.stride_) {} - - int width() const { return width_; } - int depth() const { return depth_; } - int stride() const { return stride_; } - int width_stride() const { - return kOrder == SideMapOrder::DepthMajor ? 1 : stride_; - } - int depth_stride() const { - return kOrder == SideMapOrder::WidthMajor ? 1 : stride_; - } - Scalar* data() const { return data_; } - Scalar* data(int w, int d) const { - return data_ + w * width_stride() + d * depth_stride(); - } - Scalar operator()(int w, int d) const { return *data(w, d); } - Scalar& operator()(int w, int d) { return *data(w, d); } - - SideMap block(int start_width, int start_depth, int block_width, - int block_depth) const { - assert(start_width >= 0); - assert(start_width + block_width <= width_); - assert(start_depth >= 0); - assert(start_depth + block_depth <= depth_); - - return SideMap(data(start_width, start_depth), block_width, block_depth, - stride_); - } - - private: - Scalar* data_; // not owned. - int width_, depth_, stride_; -}; - -// A PackingRegisterBlock is a small fixed-size block of a matrix being -// packed. This class is the generic non-optimized implementation, -// it is inherited by the generic implementation of PackingRegisterBlock, -// which may be overriden by template specialization. Overriding it is how -// one may provide optimized packing code paths. -// -// The packing of a block proceeds in two steps: -// 1. Ensuring that we have a complete block of source data, i.e. a block of -// the compile-time prescribed size. This is where we handle unaligned -// boundaries: if we don't have a complete block of source data, then -// we copy and zero-extend it into a local temporary (complete_src_), -// see MakeCompleteSrc. In the generic case, we do have a complete block, -// so we just use it in-place, see UseCompleteSrcInPlace. -// 2. Packing a complete block into the destination, see Pack. This is the -// most critical part, so it's convenient that unaligned boundaries have -// already been handled in step 1. -template <typename SrcMapType, typename PackedSideBlock> -class PackingRegisterBlockBase { - public: - typedef typename PackedSideBlock::KernelSideFormat KernelSideFormat; - typedef typename KernelSideFormat::Cell CellFormat; - typedef typename KernelSideFormat::Scalar KernelScalar; - static const int kCells = KernelSideFormat::kCells; - static const int kCellWidth = CellFormat::kWidth; - static const int kKernelWidth = CellFormat::kWidth * kCells; - static const int kCellDepth = CellFormat::kDepth; - static const int kCellSize = CellFormat::kSize; - static const SideMapOrder kSrcOrder = SrcMapType::kOrder; - static const int kZeroPointInputValue = - ZeroPointInputValue<KernelScalar>::kValue; - - PackingRegisterBlockBase() : complete_src_(nullptr, 0, 0, 0) {} - - protected: - // The source data that's ready for packing. May point to - // in-place actual source data if it's already a complete block, - // (see UseCompleteSrcInPlace) - // or to the local buf_ below into which we copy incomplete blocks - // (see MakeCompleteSrc) - SrcMapType complete_src_; - - // Temporary buffer for loading incomplete blocks to, - // in the source storage order - std::uint8_t buf_[kKernelWidth * kRegisterSize]; - - public: - // Selects a block if in-place source data that's already a complete block - void UseCompleteSrcInPlace(const SrcMapType& src) { complete_src_ = src; } - // Copies an incomplete block of source data into a local temporary - // complete block by zero-extending it. - void MakeCompleteSrc(const SrcMapType& src) { - memset(buf_, kZeroPointInputValue, kKernelWidth * kRegisterSize); - if (kSrcOrder == SideMapOrder::WidthMajor) { - for (int w = 0; w < src.width(); w++) { - memcpy(buf_ + w * kRegisterSize, src.data(w, 0), src.depth()); - } - } else { - assert(kSrcOrder == SideMapOrder::DepthMajor); - for (int d = 0; d < src.depth(); d++) { - memcpy(buf_ + d * kKernelWidth, src.data(0, d), src.width()); - } - } - complete_src_ = SrcMapType(buf_, kKernelWidth, kRegisterSize); - } - // Packs a complete block into the destination. This is the most - // critical part and the part that we most typically want to - // override in architecture-specific optimized specializations. - void Pack(PackedSideBlock* dst, int start_width) { - std::uint8_t* dst_ptr = dst->current_data(); - for (int cell_start_depth = 0; cell_start_depth < kRegisterSize; - cell_start_depth += kCellDepth) { - for (int cell_start_width = 0; cell_start_width < kKernelWidth; - cell_start_width += kCellWidth) { - std::int32_t* cell_sums_of_each_slice_ptr = - dst->sums_of_each_slice() + start_width + cell_start_width; - const SideMap<const std::uint8_t, kSrcOrder> src_cell_map( - complete_src_.block(cell_start_width, cell_start_depth, kCellWidth, - kCellDepth)); - for (int w = 0; w < kCellWidth; w++) { - std::int32_t sum = 0; - for (int d = 0; d < kCellDepth; d++) { - const std::uint8_t src_val = src_cell_map(w, d); - const std::int16_t kernel_val_unwrapped = - src_val - kZeroPointInputValue; - const std::uint8_t kernel_val_uint8 = kernel_val_unwrapped; - dst_ptr[OffsetIntoCell<CellFormat>(w, d)] = kernel_val_uint8; - sum += kernel_val_unwrapped; - } - cell_sums_of_each_slice_ptr[w] += sum; - } - dst_ptr += kCellSize; - } - } - dst->seek_forward_n_cells(kCells * kRegisterSize / kCellDepth); - } -}; - -template <typename SrcMapType, typename PackedSideBlock> -class PackingRegisterBlock - : public PackingRegisterBlockBase<SrcMapType, PackedSideBlock> {}; - -// Large-scale implementation of packing. -template <typename SrcMapType, typename PackedSideBlock> -class PackSideBlockImpl { - public: - typedef typename PackedSideBlock::KernelSideFormat KernelSideFormat; - typedef typename KernelSideFormat::Cell CellFormat; - static const int kCells = KernelSideFormat::kCells; - static const int kCellWidth = CellFormat::kWidth; - static const int kKernelWidth = CellFormat::kWidth * kCells; - static const int kCellDepth = CellFormat::kDepth; - - typedef PackingRegisterBlock<SrcMapType, PackedSideBlock> - PackingRegisterBlockType; - - PackSideBlockImpl(PackedSideBlock* packed_side_block, - const SrcMapType& src_map) - : packed_side_block_(packed_side_block), src_map_(src_map) {} - - PackedSideBlock* packed_side_block() const { return packed_side_block_; } - - const SrcMapType& src_map() const { return src_map_; } - - // The public entry point to pack a block. - void PackL2() { - memset(packed_side_block_->sums_of_each_slice(), 0, - sizeof(std::int32_t) * packed_side_block_->params().l2_width); - for (int d = 0; d < src_map_.depth(); - d += packed_side_block_->params().l1_depth) { - int ds = std::min<int>(packed_side_block_->params().l1_depth, - src_map_.depth() - d); - - for (int w = 0; w < src_map_.width(); - w += packed_side_block_->params().l1_width) { - int ws = std::min<int>(packed_side_block_->params().l1_width, - src_map_.width() - w); - - PrefetchL1(w, ws, d, ds); - PackL1(w, ws, d, ds); - } - } - } - - protected: - // The intermediate-level loops, between PackL2 and PackRun. - void PackL1(int start_width, int width, int start_depth, int depth) { - for (int w = 0; w < width; w += kKernelWidth) { - int ws = std::min(+kKernelWidth, width - w); - packed_side_block_->seek_run(start_width + w, start_depth); - PackRun(start_width + w, ws, start_depth, depth); - } - } - - // Prefetches the data that will be read by PackL1 - void PrefetchL1(int start_width, int width, int start_depth, int depth) { - if (SrcMapType::kOrder == SideMapOrder::WidthMajor) { - for (int d = 0; d < depth; d += kDefaultCacheLineSize) { - for (int w = 0; w < width; w += 1) { - Prefetch(src_map_.data(start_width + w, start_depth + d)); - } - } - } else { - for (int d = 0; d < depth; d++) { - for (int w = 0; w < width; w += kDefaultCacheLineSize) { - Prefetch(src_map_.data(start_width + w, start_depth + d)); - } - } - } - } - - // PackRun packs only a run i.e. is the inner loop in the depth dimension. - void PackRun(int start_width, int width, int start_depth, int depth) { - PackingRegisterBlockType b; - if (width == kKernelWidth) { - const int register_aligned_depth = RoundDown<kRegisterSize>(depth); - if (register_aligned_depth) { - for (int d = 0; d < register_aligned_depth; d += kRegisterSize) { - b.UseCompleteSrcInPlace(src_map_.block(start_width, start_depth + d, - width, kRegisterSize)); - b.Pack(packed_side_block_, start_width); - } - } - if (register_aligned_depth < depth) { - b.MakeCompleteSrc( - src_map_.block(start_width, start_depth + register_aligned_depth, - width, depth - register_aligned_depth)); - b.Pack(packed_side_block_, start_width); - } - } else { - assert(width < kKernelWidth); - for (int d = 0; d < depth; d += kRegisterSize) { - const int ds = std::min(+kRegisterSize, depth - d); - b.MakeCompleteSrc( - src_map_.block(start_width, start_depth + d, width, ds)); - b.Pack(packed_side_block_, start_width); - } - } - } - - // The PackedSideBlock being packed, i.e. the 'destination'. - PackedSideBlock* const packed_side_block_; - - // A map on the block of the original matrix block being packed, - // i.e. the 'source'. - const SrcMapType& src_map_; -}; - -// Packs a block of the input LHS matrix, into a PackedSideBlock -template <typename PackedSideBlock, typename MatrixMapType> -void PackLhs(PackedSideBlock* dst, const MatrixMapType& src) { - ScopedProfilingLabel label("pack LHS"); - static const SideMapOrder kSideMapOrder = - MatrixMapType::kOrder == MapOrder::RowMajor ? SideMapOrder::WidthMajor - : SideMapOrder::DepthMajor; - typedef typename MatrixMapType::Scalar Scalar; - typedef SideMap<Scalar, kSideMapOrder> SideMapType; - SideMapType src_side_map(src.data(), src.rows(), src.cols(), src.stride()); - typedef PackSideBlockImpl<SideMapType, PackedSideBlock> ImplType; - ImplType impl(dst, src_side_map); - impl.PackL2(); -} - -// Packs a block of the input RHS matrix, into a PackedSideBlock -template <typename PackedSideBlock, typename MatrixMapType> -void PackRhs(PackedSideBlock* dst, const MatrixMapType& src) { - ScopedProfilingLabel label("pack RHS"); - static const SideMapOrder kSideMapOrder = - MatrixMapType::kOrder == MapOrder::ColMajor ? SideMapOrder::WidthMajor - : SideMapOrder::DepthMajor; - typedef typename MatrixMapType::Scalar Scalar; - typedef SideMap<Scalar, kSideMapOrder> SideMapType; - SideMapType src_side_map(src.data(), src.cols(), src.rows(), src.stride()); - typedef PackSideBlockImpl<SideMapType, PackedSideBlock> ImplType; - ImplType impl(dst, src_side_map); - impl.PackL2(); -} - -} // namespace gemmlowp - -#ifdef GEMMLOWP_NEON -#include "pack_neon.h" -#elif defined(GEMMLOWP_SSE4) -#include "pack_sse.h" -#endif - -#endif // GEMMLOWP_INTERNAL_PACK_H_ |