path: root/runtimes/nn/depend/external/gemmlowp/internal/block_params.h
// Copyright 2015 The Gemmlowp Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// block_params.h: Logic to choose L1 and L2 block sizes
// to optimize cache-friendliness.

#ifndef GEMMLOWP_INTERNAL_BLOCK_PARAMS_H_
#define GEMMLOWP_INTERNAL_BLOCK_PARAMS_H_

#include "common.h"
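// Note: common.h is expected to provide the integer helpers used throughout
// this file, in particular CeilQuotient() and RoundUp<Modulus>(), as well as
// the kRegisterSize constant.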

namespace gemmlowp {

// A BlockParams instance contains a full description of all the block size
// parameters to be used by a Gemm.
// There are two nested levels of block subdivisions: first a subdivision
// into large blocks that should fit in last-level cache (what we call L2 here)
// and then another subdivision into smaller blocks that should fit in
// L1 cache. There is then actually a third level of subdivision to fit
// in registers, but we are not concerned with that here.
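//
// As a rough illustration only (the actual compute loops live in the packing
// and kernel code, not here), the blocking described above corresponds to a
// loop nest of approximately this shape:
//
//   for each (l2_rows x l2_cols x l2_depth) block:        // sized for L2
//     pack the corresponding LHS and RHS data
//     for each (l1_rows x l1_cols x l1_depth) sub-block:  // sized for L1
//       run the register-level kernel on it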
struct BlockParams {
  // L1 block parameters determine the size of small blocks that should
  // fit in L1 cache.
  int l1_rows;
  int l1_cols;
  int l1_depth;

  // L2 block parameters determine the size of larger blocks that should
  // fit in L2 cache.
  int l2_rows;
  int l2_cols;
  int l2_depth;

  template <typename KernelFormat>
  void Init(int rows, int cols, int depth, int num_threads,
            int l1_bytes_to_use, int l2_bytes_to_use, float l2_rhs_factor) {
    FindL2BlockSizes<KernelFormat>(rows, cols, depth, num_threads,
                                   l2_bytes_to_use, l2_rhs_factor,
                                   &l2_rows, &l2_cols, &l2_depth);
    FindL1BlockSizes<KernelFormat>(l2_rows, l2_cols, l2_depth,
                                   l1_bytes_to_use,
                                   &l1_rows, &l1_cols, &l1_depth);
  }
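
  // Illustrative usage (hypothetical values and a placeholder kernel format
  // type; in practice the byte budgets and l2_rhs_factor come from the
  // calling code, typically derived from the target CPU's cache sizes):
  //
  //   BlockParams block_params;
  //   block_params.Init<SomeKernelFormat>(
  //       /* rows */ 1000, /* cols */ 800, /* depth */ 500,
  //       /* num_threads */ 4,
  //       /* l1_bytes_to_use */ 16 * 1024,
  //       /* l2_bytes_to_use */ 256 * 1024,
  //       /* l2_rhs_factor */ 0.75f);
  //
  // After this call, (l2_rows, l2_cols, l2_depth) describe blocks intended to
  // fit in L2 cache and (l1_rows, l1_cols, l1_depth) describe sub-blocks
  // intended to fit in L1 cache.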

  template <typename KernelFormat>
  static void FindL2BlockSizes(int rows, int cols, int depth, int num_threads,
                               int l2_bytes_to_use, float l2_rhs_factor,
                               int* out_l2_rows, int* out_l2_cols,
                               int* out_l2_depth) {
    int l2_rows = 0;
    int l2_cols = 0;
    int l2_depth = 0;
    // No L2 blocking in the depth dimension at the moment.
    // Too much loss of accuracy due to storing intermediate results in
    // low precision.
    // However, we still want to round l2_depth up to the next multiple
    // of register size, so as to avoid having to special-case unaligned depths.
    l2_depth = RoundUp<kRegisterSize>(depth);

    {
      int max_cache_friendly_l2_cols = std::max(
          1, static_cast<int>(l2_rhs_factor * (l2_bytes_to_use / l2_depth)));
      int min_l2_cols_blocks =
          std::max(1, CeilQuotient(cols, max_cache_friendly_l2_cols));
      l2_cols =
          RoundUp<KernelFormat::kCols>(CeilQuotient(cols, min_l2_cols_blocks));
    }

    // No L2 blocking in the row dimension if l2_rhs_factor is 1.0, as the row
    // dimension concerns only the LHS. Blocking only the RHS matrix for L2
    // enhances performance on x86.
    if (l2_rhs_factor == 1.0f) {
      l2_rows = RoundUp<KernelFormat::kRows>(rows);
    } else {
      int max_cache_friendly_l2_rows =
          std::max(1, (l2_bytes_to_use - l2_depth * l2_cols) /
                          (num_threads * (l2_depth + 4 * l2_cols)));
      int min_l2_rows_blocks =
          std::max(1, CeilQuotient(rows, max_cache_friendly_l2_rows));
      l2_rows =
          RoundUp<KernelFormat::kRows>(CeilQuotient(rows, min_l2_rows_blocks));
    }

    *out_l2_rows = l2_rows;
    *out_l2_cols = l2_cols;
    *out_l2_depth = l2_depth;
  }
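
  // Worked example for the column sizing above (illustrative only; assumes
  // kRegisterSize == 16 and KernelFormat::kCols == 4, which may differ in
  // practice). With depth = 500, cols = 800, l2_bytes_to_use = 256 * 1024 and
  // l2_rhs_factor = 0.75f:
  //   l2_depth = RoundUp<16>(500) = 512
  //   max_cache_friendly_l2_cols = int(0.75f * (262144 / 512)) = 384
  //   min_l2_cols_blocks = CeilQuotient(800, 384) = 3
  //   l2_cols = RoundUp<4>(CeilQuotient(800, 3)) = RoundUp<4>(267) = 268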

  template <typename KernelFormat>
  static void FindL1BlockSizes(int rows, int cols, int depth,
                               int l1_bytes_to_use, int* out_l1_rows,
                               int* out_l1_cols, int* out_l1_depth) {
    int l1_rows = 0;
    int l1_cols = 0;
    int l1_depth = 0;

    // L2 block sizes should already be multiples of kernel block sizes.
    assert(rows % KernelFormat::kRows == 0);
    assert(cols % KernelFormat::kCols == 0);
    assert(depth % KernelFormat::kDepth == 0);

    // No L1 blocking in the column dimension at the moment; it is thought
    // not to be needed, similarly to Eigen.
    l1_cols = cols;

    {
      int max_cache_friendly_l1_depth = std::max(
          1, (l1_bytes_to_use - 4 * KernelFormat::kRows * KernelFormat::kCols) /
                 (KernelFormat::kRows + KernelFormat::kCols));
      int min_l1_depth_blocks =
          std::max(1, CeilQuotient(depth, max_cache_friendly_l1_depth));
      l1_depth =
          RoundUp<kRegisterSize>(CeilQuotient(depth, min_l1_depth_blocks));
    }

    {
      int max_cache_friendly_l1_rows =
          std::max(1, l1_bytes_to_use / (l1_depth + 4 * l1_cols));
      int min_l1_rows_blocks =
          std::max(1, CeilQuotient(rows, max_cache_friendly_l1_rows));
      l1_rows =
          RoundUp<KernelFormat::kRows>(CeilQuotient(rows, min_l1_rows_blocks));
    }

    *out_l1_rows = l1_rows;
    *out_l1_cols = l1_cols;
    *out_l1_depth = l1_depth;
  }
};

// A SideBlockParams instance contains only the block params relevant to
// one side (LHS or RHS), expressed in terms of 'width' instead of
// rows/columns. See the explanation in kernel.h: in the LHS, 'width' means
// the number of rows, while in the RHS, 'width' means the number of columns.
// That allows us to write generic code that applies to either LHS or RHS.
struct SideBlockParams {
  // L1 block parameters determine the size of small blocks that should
  // fit in L1 cache.
  int l1_width;
  int l1_depth;

  // L2 block parameters determine the size of larger blocks that should
  // fit in L2 cache.
  int l2_width;
  int l2_depth;
};

enum class Side { Lhs, Rhs };

inline void GetSideBlockParams(Side side, SideBlockParams* side_block_params,
                               const BlockParams& block_params) {
  side_block_params->l1_width =
      side == Side::Lhs ? block_params.l1_rows : block_params.l1_cols;
  side_block_params->l2_width =
      side == Side::Lhs ? block_params.l2_rows : block_params.l2_cols;

  side_block_params->l1_depth = block_params.l1_depth;
  side_block_params->l2_depth = block_params.l2_depth;
}
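
// Illustrative usage: to handle the LHS generically in terms of 'width',
//
//   SideBlockParams lhs_side;
//   GetSideBlockParams(Side::Lhs, &lhs_side, block_params);
//   // Now lhs_side.l1_width == block_params.l1_rows and
//   // lhs_side.l2_width == block_params.l2_rows.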

}  // namespace gemmlowp

#endif  // GEMMLOWP_INTERNAL_BLOCK_PARAMS_H_