runtimes/nn/depend/external/gemmlowp/fixedpoint/fixedpoint_neon.h


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175

// Copyright 2015 The Gemmlowp Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// fixedpoint_neon.h: optimized NEON specializations of the templates
// in fixedpoint.h.

#ifndef GEMMLOWP_INTERNAL_FIXEDPOINT_NEON_H_
#define GEMMLOWP_INTERNAL_FIXEDPOINT_NEON_H_

#include <arm_neon.h>

namespace gemmlowp {

// Raw-type traits for the 128-bit NEON vector of four signed 32-bit lanes:
// each lane is a std::int32_t, and there are four of them.
template <>
struct FixedPointRawTypeTraits<int32x4_t> {
  using ScalarRawType = std::int32_t;
  static const int kLanes = 4;
};

// Lane-wise bitwise AND of two int32x4_t vectors.
template <>
inline int32x4_t BitAnd(int32x4_t a, int32x4_t b) {
  const int32x4_t conjunction = vandq_s32(a, b);
  return conjunction;
}

// Lane-wise bitwise OR of two int32x4_t vectors.
template <>
inline int32x4_t BitOr(int32x4_t a, int32x4_t b) {
  const int32x4_t disjunction = vorrq_s32(a, b);
  return disjunction;
}

// Lane-wise bitwise exclusive-OR of two int32x4_t vectors.
template <>
inline int32x4_t BitXor(int32x4_t a, int32x4_t b) {
  const int32x4_t exclusive_or = veorq_s32(a, b);
  return exclusive_or;
}

// Lane-wise bitwise complement of an int32x4_t vector.
template <>
inline int32x4_t BitNot(int32x4_t a) {
  // vmvnq_s32 flips every bit, which is the same as XOR-ing each lane
  // with an all-ones pattern (-1).
  return vmvnq_s32(a);
}

// Lane-wise (non-saturating) addition of two int32x4_t vectors.
template <>
inline int32x4_t Add(int32x4_t a, int32x4_t b) {
  const int32x4_t sum = vaddq_s32(a, b);
  return sum;
}

// Lane-wise (non-saturating) subtraction a - b of two int32x4_t vectors.
template <>
inline int32x4_t Sub(int32x4_t a, int32x4_t b) {
  const int32x4_t difference = vsubq_s32(a, b);
  return difference;
}

// Lane-wise (non-saturating) negation of an int32x4_t vector.
template <>
inline int32x4_t Neg(int32x4_t a) {
  const int32x4_t negated = vnegq_s32(a);
  return negated;
}

// Shifts every lane of `a` left by the same run-time `offset`.
// NOTE(review): vshlq_s32 interprets a negative count as a right shift;
// callers presumably pass a non-negative offset -- confirm.
template <>
inline int32x4_t ShiftLeft(int32x4_t a, int offset) {
  const int32x4_t per_lane_count = vdupq_n_s32(offset);
  return vshlq_s32(a, per_lane_count);
}

// Shifts every lane of `a` right by the same run-time `offset`.
template <>
inline int32x4_t ShiftRight(int32x4_t a, int offset) {
  // NEON has no variable-count right-shift intrinsic; vshlq_s32 with a
  // negated count performs the right shift instead.
  const int32x4_t negated_count = vdupq_n_s32(-offset);
  return vshlq_s32(a, negated_count);
}

// Bitwise select: for every bit set in `if_mask` the result takes the
// corresponding bit of `then_val`, otherwise the bit of `else_val`.
template <>
inline int32x4_t SelectUsingMask(int32x4_t if_mask, int32x4_t then_val,
                                 int32x4_t else_val) {
  // vbslq_s32 expects its selector as an unsigned vector.
  const uint32x4_t selector = vreinterpretq_u32_s32(if_mask);
  return vbslq_s32(selector, then_val, else_val);
}

// Returns a mask whose lanes are all-ones where a == b and zero elsewhere.
template <>
inline int32x4_t MaskIfEqual(int32x4_t a, int32x4_t b) {
  const uint32x4_t equal_lanes = vceqq_s32(a, b);
  return vreinterpretq_s32_u32(equal_lanes);
}

// Returns a mask whose lanes are all-ones where a != b and zero elsewhere.
template <>
inline int32x4_t MaskIfNotEqual(int32x4_t a, int32x4_t b) {
  // Build the equality mask, then complement every bit of it.
  const uint32x4_t equal_lanes = vceqq_s32(a, b);
  const uint32x4_t differing_lanes = vmvnq_u32(equal_lanes);
  return vreinterpretq_s32_u32(differing_lanes);
}

// Returns a mask whose lanes are all-ones where `a` is zero and zero
// elsewhere.
template <>
inline int32x4_t MaskIfZero(int32x4_t a) {
  const int32x4_t zeros = vdupq_n_s32(0);
  const uint32x4_t zero_lanes = vceqq_s32(a, zeros);
  return vreinterpretq_s32_u32(zero_lanes);
}

// Returns a mask whose lanes are all-ones where `a` is non-zero and zero
// elsewhere.
template <>
inline int32x4_t MaskIfNonZero(int32x4_t a) {
  // vtstq_s32 sets a lane when (lhs & rhs) != 0; testing `a` against
  // itself therefore flags exactly the non-zero lanes.
  const uint32x4_t nonzero_lanes = vtstq_s32(a, a);
  return vreinterpretq_s32_u32(nonzero_lanes);
}

// Returns a mask whose lanes are all-ones where a > b (signed compare)
// and zero elsewhere.
template <>
inline int32x4_t MaskIfGreaterThan(int32x4_t a, int32x4_t b) {
  const uint32x4_t greater_lanes = vcgtq_s32(a, b);
  return vreinterpretq_s32_u32(greater_lanes);
}

// Returns a mask whose lanes are all-ones where a >= b (signed compare)
// and zero elsewhere.
template <>
inline int32x4_t MaskIfGreaterThanOrEqual(int32x4_t a, int32x4_t b) {
  const uint32x4_t greater_or_equal_lanes = vcgeq_s32(a, b);
  return vreinterpretq_s32_u32(greater_or_equal_lanes);
}

// Returns a mask whose lanes are all-ones where a < b (signed compare)
// and zero elsewhere.
template <>
inline int32x4_t MaskIfLessThan(int32x4_t a, int32x4_t b) {
  const uint32x4_t less_lanes = vcltq_s32(a, b);
  return vreinterpretq_s32_u32(less_lanes);
}

// Returns a mask whose lanes are all-ones where a <= b (signed compare)
// and zero elsewhere.
template <>
inline int32x4_t MaskIfLessThanOrEqual(int32x4_t a, int32x4_t b) {
  const uint32x4_t less_or_equal_lanes = vcleq_s32(a, b);
  return vreinterpretq_s32_u32(less_or_equal_lanes);
}

// Returns true when the bitwise AND of all four lanes of `a` is non-zero.
// NOTE(review): this matches "every lane is true" only for mask-style
// inputs (each lane all-ones or zero) -- presumably what callers pass.
template <>
inline bool All(int32x4_t a) {
  // Rotating by one lane and AND-ing leaves lane i holding a[i] & a[i+1].
  const int32x4_t adjacent_pairs = vandq_s32(a, vextq_s32(a, a, 1));
  // Rotating that by two lanes and AND-ing leaves lane 0 holding
  // a[0] & a[1] & a[2] & a[3].
  const int32x4_t all_lanes = vandq_s32(
      adjacent_pairs, vextq_s32(adjacent_pairs, adjacent_pairs, 2));
  return vgetq_lane_s32(all_lanes, 0) != 0;
}

// Returns true when at least one lane of `a` is non-zero, i.e. when the
// bitwise OR of all four lanes is non-zero.
template <>
inline bool Any(int32x4_t a) {
  // Rotating by one lane and OR-ing leaves lane i holding a[i] | a[i+1].
  const int32x4_t adjacent_pairs = vorrq_s32(a, vextq_s32(a, a, 1));
  // Rotating that by two lanes and OR-ing leaves lane 0 holding
  // a[0] | a[1] | a[2] | a[3].
  const int32x4_t any_lane = vorrq_s32(
      adjacent_pairs, vextq_s32(adjacent_pairs, adjacent_pairs, 2));
  return vgetq_lane_s32(any_lane, 0) != 0;
}

// Lane-wise rounded average of `a` and `b`, computed with the NEON
// rounding-halving-add instruction (no intermediate 32-bit overflow).
template <>
inline int32x4_t RoundingHalfSum(int32x4_t a, int32x4_t b) {
  const int32x4_t rounded_average = vrhaddq_s32(a, b);
  return rounded_average;
}

// Lane-wise saturating, rounding, doubling multiply that keeps the high
// 32 bits of each product; maps directly onto vqrdmulhq_s32.
template <>
inline int32x4_t SaturatingRoundingDoublingHighMul(int32x4_t a, int32x4_t b) {
  const int32x4_t high_product = vqrdmulhq_s32(a, b);
  return high_product;
}

// Divides every lane of `x` by 2^exponent, rounding to nearest.
// NOTE(review): assumes exponent >= 0 (a negative value would turn the
// final vrshlq_s32 into a left shift) -- confirm against callers.
template <>
inline int32x4_t RoundingDivideByPOT(int32x4_t x, int exponent) {
  // vrshlq_s32 performs a rounding right shift when given a negative count.
  const int32x4_t negated_exponent = vdupq_n_s32(-exponent);
  // The sign bit of (x & -exponent) is set only when x is negative AND
  // exponent is non-zero, so the arithmetic shift by 31 yields -1 for
  // exactly those lanes and 0 for the rest.
  const int32x4_t sign_if_shifting = vandq_s32(x, negated_exponent);
  const int32x4_t negative_nudge = vshrq_n_s32(sign_if_shifting, 31);
  // Nudging negative lanes down by one (with saturation) makes the
  // rounding shift break ties away from zero instead of upward.
  const int32x4_t nudged_x = vqaddq_s32(x, negative_nudge);
  return vrshlq_s32(nudged_x, negated_exponent);
}

// NEON partial specialization selected when the third template argument
// is 1 (presumably the sign of Exponent, i.e. a positive exponent --
// confirm against the primary template). Multiplies each lane by
// 2^Exponent using a saturating left shift by an immediate.
template <int Exponent>
struct ImplSaturatingRoundingMultiplyByPOT<Exponent, int32x4_t, 1> {
  static int32x4_t eval(int32x4_t x) {
    const int32x4_t saturated_shift = vqshlq_n_s32(x, Exponent);
    return saturated_shift;
  }
};

// NEON partial specialization selected when the third template argument
// is -1 (presumably the sign of Exponent, i.e. a negative exponent --
// confirm against the primary template). Divides each lane by
// 2^(-Exponent) with a rounding right shift by an immediate.
template <int Exponent>
struct ImplSaturatingRoundingMultiplyByPOT<Exponent, int32x4_t, -1> {
  static int32x4_t eval(int32x4_t x) {
    // Arithmetic shift by 31 yields -1 for negative lanes, 0 otherwise.
    const int32x4_t negative_nudge = vshrq_n_s32(x, 31);
    // Nudging negative lanes down by one (with saturation) makes the
    // rounding shift break ties away from zero instead of upward.
    const int32x4_t nudged_x = vqaddq_s32(x, negative_nudge);
    return vrshrq_n_s32(nudged_x, -Exponent);
  }
};

// Broadcasts the scalar `x` into all four lanes of an int32x4_t.
template <>
inline int32x4_t Dup<int32x4_t>(std::int32_t x) {
  const int32x4_t broadcast = vdupq_n_s32(x);
  return broadcast;
}

}  // end namespace gemmlowp

#endif  // GEMMLOWP_INTERNAL_FIXEDPOINT_NEON_H_