diff options
Diffstat (limited to 'runtimes/nn/depend/external/gemmlowp/internal/block_params.h')
-rw-r--r-- | runtimes/nn/depend/external/gemmlowp/internal/block_params.h | 174 |
1 files changed, 174 insertions, 0 deletions
diff --git a/runtimes/nn/depend/external/gemmlowp/internal/block_params.h b/runtimes/nn/depend/external/gemmlowp/internal/block_params.h new file mode 100644 index 000000000..b2fc3ff78 --- /dev/null +++ b/runtimes/nn/depend/external/gemmlowp/internal/block_params.h @@ -0,0 +1,174 @@ +// Copyright 2015 The Gemmlowp Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// block_params.h: Logic to choose L1 and L2 block sizes +// to optimize cache-friendliness. + +#ifndef GEMMLOWP_INTERNAL_BLOCK_PARAMS_H_ +#define GEMMLOWP_INTERNAL_BLOCK_PARAMS_H_ + +#include "common.h" + +namespace gemmlowp { + +// A BlockParams instance contains a full description of all the block size +// parameters to be used by a Gemm. +// There are two nested levels of block subdivisions: first a subdivision +// into large blocks that should fit in last-level cache (what we call L2 here) +// and then another subdivision into smaller blocks that should fit in +// L1 cache. There is then actually a third level of subdivision to fit +// in registers, but we are not concerned with that here. +struct BlockParams { + // L1 block parameters determine the size of small blocks that should + // fit in L1 cache. + int l1_rows; + int l1_cols; + int l1_depth; + + // L2 block parameters determine the size of larger blocks that should + // fit in L2 cache. + int l2_rows; + int l2_cols; + int l2_depth; + + template <typename KernelFormat> + void Init(int rows, int cols, int depth, int num_threads, + int l1_bytes_to_use, int l2_bytes_to_use, float l2_rhs_factor) { + FindL2BlockSizes<KernelFormat>(rows, cols, depth, num_threads, + l2_bytes_to_use, l2_rhs_factor, + &l2_rows, &l2_cols, &l2_depth); + FindL1BlockSizes<KernelFormat>(l2_rows, l2_cols, l2_depth, + l1_bytes_to_use, + &l1_rows, &l1_cols, &l1_depth); + } + + template <typename KernelFormat> + static void FindL2BlockSizes(int rows, int cols, int depth, int num_threads, + int l2_bytes_to_use, float l2_rhs_factor, + int* out_l2_rows, int* out_l2_cols, + int* out_l2_depth) { + int l2_rows = 0; + int l2_cols = 0; + int l2_depth = 0; + // No L2 blocking in the depth dimension at the moment. + // Too much loss of accuracy due to storing intermediate results in + // low precision. + // However, we still want to round l2_depth up to the next multiple + // of register size, so as to avoid having to special-case unaligned depths. + l2_depth = RoundUp<kRegisterSize>(depth); + + { + int max_cache_friendly_l2_cols = std::max( + 1, static_cast<int>(l2_rhs_factor * (l2_bytes_to_use / l2_depth))); + int min_l2_cols_blocks = + std::max(1, CeilQuotient(cols, max_cache_friendly_l2_cols)); + l2_cols = + RoundUp<KernelFormat::kCols>(CeilQuotient(cols, min_l2_cols_blocks)); + } + + // No L2 blocking in the row dimension if l2_rhs_factor is 1.0 as the row + // dimension concerns only the LHS. Blocking only RHS matrix for L2 enhances + // the performance on x86. + if (l2_rhs_factor == 1.0f) { + l2_rows = RoundUp<KernelFormat::kRows>(rows); + } else { + int max_cache_friendly_l2_rows = + std::max(1, (l2_bytes_to_use - l2_depth * l2_cols) / + (num_threads * (l2_depth + 4 * l2_cols))); + int min_l2_rows_blocks = + std::max(1, CeilQuotient(rows, max_cache_friendly_l2_rows)); + l2_rows = + RoundUp<KernelFormat::kRows>(CeilQuotient(rows, min_l2_rows_blocks)); + } + + *out_l2_rows = l2_rows; + *out_l2_cols = l2_cols; + *out_l2_depth = l2_depth; + } + + template <typename KernelFormat> + static void FindL1BlockSizes(int rows, int cols, int depth, + int l1_bytes_to_use, int* out_l1_rows, + int* out_l1_cols, int* out_l1_depth) { + int l1_rows = 0; + int l1_cols = 0; + int l1_depth = 0; + + // L2 block sizes should already be multiples of kernel block sizes. + assert(rows % KernelFormat::kRows == 0); + assert(cols % KernelFormat::kCols == 0); + assert(depth % KernelFormat::kDepth == 0); + + // No L1 blocking in the columns dimension at the moment. + // Thought not to be needed. Similar to Eigen. + l1_cols = cols; + + { + int max_cache_friendly_l1_depth = std::max( + 1, (l1_bytes_to_use - 4 * KernelFormat::kRows * KernelFormat::kCols) / + (KernelFormat::kRows + KernelFormat::kCols)); + int min_l1_depth_blocks = + std::max(1, CeilQuotient(depth, max_cache_friendly_l1_depth)); + l1_depth = + RoundUp<kRegisterSize>(CeilQuotient(depth, min_l1_depth_blocks)); + } + + { + int max_cache_friendly_l1_rows = + std::max(1, l1_bytes_to_use / (l1_depth + 4 * l1_cols)); + int min_l1_rows_blocks = + std::max(1, CeilQuotient(rows, max_cache_friendly_l1_rows)); + l1_rows = + RoundUp<KernelFormat::kRows>(CeilQuotient(rows, min_l1_rows_blocks)); + } + + *out_l1_rows = l1_rows; + *out_l1_cols = l1_cols; + *out_l1_depth = l1_depth; + } +}; + +// A SideBlockParams instance contains only the block params relevant to +// one side (LHS or RHS), expressed in terms of 'width' instead of +// rows/colums. See the explanation in kernel.h: in the LHS, 'width' means +// the number of rows, while in the RHS, 'width' means the number of columns. +// That allows us to write generic code that applies to either LHS or RHS. +struct SideBlockParams { + // L1 block parameters determine the size of small blocks that should + // fit in L1 cache. + int l1_width; + int l1_depth; + + // L2 block parameters determine the size of larger blocks that should + // fit in L2 cache. + int l2_width; + int l2_depth; +}; + +enum class Side { Lhs, Rhs }; + +inline void GetSideBlockParams(Side side, SideBlockParams* side_block_params, + const BlockParams& block_params) { + side_block_params->l1_width = + side == Side::Lhs ? block_params.l1_rows : block_params.l1_cols; + side_block_params->l2_width = + side == Side::Lhs ? block_params.l2_rows : block_params.l2_cols; + + side_block_params->l1_depth = block_params.l1_depth; + side_block_params->l2_depth = block_params.l2_depth; +} + +} // namespace gemmlowp + +#endif // GEMMLOWP_INTERNAL_BLOCK_PARAMS_H_ |