diff options
Diffstat (limited to 'compiler/nnc/backends/soft_backend/code_snippets/cpp_elementwise.def')
-rw-r--r-- | compiler/nnc/backends/soft_backend/code_snippets/cpp_elementwise.def | 273 |
1 files changed, 273 insertions, 0 deletions
diff --git a/compiler/nnc/backends/soft_backend/code_snippets/cpp_elementwise.def b/compiler/nnc/backends/soft_backend/code_snippets/cpp_elementwise.def new file mode 100644 index 000000000..422b34242 --- /dev/null +++ b/compiler/nnc/backends/soft_backend/code_snippets/cpp_elementwise.def @@ -0,0 +1,273 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// TODO(ycling): Refactoring. Remove BroadcastLogical and use the more +// generalized and efficient BroadcastBinaryFunction. +// +// Also appears to duplicte MinimumMaximum. +// +// R: Result type. T1: Input 1 type. T2: Input 2 type. +template <typename R, typename T1, typename T2> +inline void BroadcastBinaryFunction4DSlow( + const RuntimeShape& unextended_input1_shape, const T1* input1_data, + const RuntimeShape& unextended_input2_shape, const T2* input2_data, + const RuntimeShape& unextended_output_shape, R* output_data, + R (* func)(T1, T2)) { + TFLITE_DCHECK_LE(unextended_input1_shape.DimensionsCount(), 4); + TFLITE_DCHECK_LE(unextended_input2_shape.DimensionsCount(), 4); + TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), 4); + const RuntimeShape output_shape = + RuntimeShape::ExtendedShape(4, unextended_output_shape); + + NdArrayDesc<4> desc1; + NdArrayDesc<4> desc2; + NdArrayDescsForElementwiseBroadcast(unextended_input1_shape, + unextended_input2_shape, &desc1, &desc2); + + for (int b = 0; b < output_shape.Dims(0); ++b) { + for (int y = 0; y < output_shape.Dims(1); ++y) { + for (int x = 0; x < output_shape.Dims(2); ++x) { + for (int c = 0; c < output_shape.Dims(3); ++c) { + auto out_idx = Offset(output_shape, b, y, x, c); + auto in1_idx = SubscriptToIndex(desc1, b, y, x, c); + auto in2_idx = SubscriptToIndex(desc2, b, y, x, c); + auto in1_val = input1_data[in1_idx]; + auto in2_val = input2_data[in2_idx]; + output_data[out_idx] = func(in1_val, in2_val); + } + } + } + } +} + +// R: Result type. T1: Input 1 type. T2: Input 2 type. +// TODO(renjieliu): Refactor other binary functions to use this one. +template <typename R, typename T1, typename T2> +inline void BinaryFunction(const RuntimeShape& input1_shape, + const T1* input1_data, + const RuntimeShape& input2_shape, + const T2* input2_data, + const RuntimeShape& output_shape, R* output_data, + R (* func)(T1, T2)) { + const int flat_size = + MatchingFlatSize(input1_shape, input2_shape, output_shape); + for (int i = 0; i < flat_size; ++i) { + output_data[i] = func(input1_data[i], input2_data[i]); + } +} + +struct Sub { + static inline void Sub_(const float* input1_data, const float* input2_data, + float* output_data, const int size) { + int i = 0; +#ifdef USE_NEON + for (; i <= size - 16; i += 16) { + auto a10 = vld1q_f32(input1_data + i); + auto a11 = vld1q_f32(input1_data + i + 4); + auto a12 = vld1q_f32(input1_data + i + 8); + auto a13 = vld1q_f32(input1_data + i + 12); + auto a20 = vld1q_f32(input2_data + i); + auto a21 = vld1q_f32(input2_data + i + 4); + auto a22 = vld1q_f32(input2_data + i + 8); + auto a23 = vld1q_f32(input2_data + i + 12); + auto x0 = vsubq_f32(a10, a20); + auto x1 = vsubq_f32(a11, a21); + auto x2 = vsubq_f32(a12, a22); + auto x3 = vsubq_f32(a13, a23); + vst1q_f32(output_data + i, x0); + vst1q_f32(output_data + i + 4, x1); + vst1q_f32(output_data + i + 8, x2); + vst1q_f32(output_data + i + 12, x3); + } + for (; i <= size - 4; i += 4) { + auto a1 = vld1q_f32(input1_data + i); + auto a2 = vld1q_f32(input2_data + i); + auto x = vsubq_f32(a1, a2); + vst1q_f32(output_data + i, x); + } +#endif // NEON + + for (; i < size; i++) { + output_data[i] = input1_data[i] - input2_data[i]; + } + } + + static inline void Call( + const float* input1_data, const RuntimeShape& in1_shape, + const float* input2_data, const RuntimeShape& in2_shape, + float* output_data, const RuntimeShape& out_shape) { + if (in1_shape != in2_shape) { + BroadcastBinaryFunction4DSlow<float, float, float>( + in1_shape, input1_data, + in2_shape, input2_data, + out_shape, output_data, + [](float a, float b) { return a - b; } + ); + } else { + Sub_(input1_data, input2_data, output_data, out_shape.FlatSize()); + } + } +}; + +struct Add { + static inline void Add_(const float* input1_data, const float* input2_data, + float* output_data, const int size) { + int i = 0; +#ifdef USE_NEON + for (; i <= size - 16; i += 16) { + auto a10 = vld1q_f32(input1_data + i); + auto a11 = vld1q_f32(input1_data + i + 4); + auto a12 = vld1q_f32(input1_data + i + 8); + auto a13 = vld1q_f32(input1_data + i + 12); + auto a20 = vld1q_f32(input2_data + i); + auto a21 = vld1q_f32(input2_data + i + 4); + auto a22 = vld1q_f32(input2_data + i + 8); + auto a23 = vld1q_f32(input2_data + i + 12); + auto x0 = vaddq_f32(a10, a20); + auto x1 = vaddq_f32(a11, a21); + auto x2 = vaddq_f32(a12, a22); + auto x3 = vaddq_f32(a13, a23); + vst1q_f32(output_data + i, x0); + vst1q_f32(output_data + i + 4, x1); + vst1q_f32(output_data + i + 8, x2); + vst1q_f32(output_data + i + 12, x3); + } + for (; i <= size - 4; i += 4) { + auto a1 = vld1q_f32(input1_data + i); + auto a2 = vld1q_f32(input2_data + i); + auto x = vaddq_f32(a1, a2); + vst1q_f32(output_data + i, x); + } +#endif // NEON + + for (; i < size; i++) { + output_data[i] = input1_data[i] + input2_data[i]; + } + } + + static inline void Call( + const float* input1_data, const RuntimeShape& in1_shape, + const float* input2_data, const RuntimeShape& in2_shape, + float* output_data, const RuntimeShape& out_shape) { + if (in1_shape != in2_shape) { + BroadcastBinaryFunction4DSlow<float, float, float>( + in1_shape, input1_data, + in2_shape, input2_data, + out_shape, output_data, + [](float a, float b) { return a + b; } + ); + } else { + Add_(input1_data, input2_data, output_data, out_shape.FlatSize()); + } + } +}; + +struct Max { + static inline void Call( + const float* input1_data, const RuntimeShape& in1_shape, + const float* input2_data, const RuntimeShape& in2_shape, + float* output_data, const RuntimeShape& out_shape) { + if (in1_shape != in2_shape) { + BroadcastBinaryFunction4DSlow<float, float, float>( + in1_shape, input1_data, + in2_shape, input2_data, + out_shape, output_data, + [](float a, float b) { return std::max(a, b); } + ); + } else { + auto input1 = MapAsVector(input1_data, in1_shape.FlatSize()); + auto input2 = MapAsVector(input2_data, in2_shape.FlatSize()); + auto output = MapAsVector(output_data, out_shape.FlatSize()); + output = input1.cwiseMax(input2); + } + } +}; + +struct Mul { + static inline void Call(const float* input1_data, const RuntimeShape& in1_shape, + const float* input2_data, const RuntimeShape& in2_shape, + float* output_data, const RuntimeShape& out_shape) { + if (in1_shape != in2_shape) { + BroadcastBinaryFunction4DSlow<float, float, float>( + in1_shape, input1_data, + in2_shape, input2_data, + out_shape, output_data, + [](float a, float b) { return a * b; }); + } else { + Mul_(input1_data, input2_data, output_data, out_shape.FlatSize()); + } + } + + static inline void Mul_(const float* input1_data, const float* input2_data, + float* output_data, const int size) { + + int i = 0; +#ifdef USE_NEON + for (; i <= size - 16; i += 16) { + auto a10 = vld1q_f32(input1_data + i); + auto a11 = vld1q_f32(input1_data + i + 4); + auto a12 = vld1q_f32(input1_data + i + 8); + auto a13 = vld1q_f32(input1_data + i + 12); + auto a20 = vld1q_f32(input2_data + i); + auto a21 = vld1q_f32(input2_data + i + 4); + auto a22 = vld1q_f32(input2_data + i + 8); + auto a23 = vld1q_f32(input2_data + i + 12); + auto x0 = vmulq_f32(a10, a20); + auto x1 = vmulq_f32(a11, a21); + auto x2 = vmulq_f32(a12, a22); + auto x3 = vmulq_f32(a13, a23); + + vst1q_f32(output_data + i, x0); + vst1q_f32(output_data + i + 4, x1); + vst1q_f32(output_data + i + 8, x2); + vst1q_f32(output_data + i + 12, x3); + } + for (; i <= size - 4; i += 4) { + auto a1 = vld1q_f32(input1_data + i); + auto a2 = vld1q_f32(input2_data + i); + auto x = vmulq_f32(a1, a2); + + vst1q_f32(output_data + i, x); + } +#endif // NEON + + for (; i < size; i++) { + output_data[i] = input1_data[i] * input2_data[i]; + } + } +}; + +//TODO maybe move to a separate file since everything else here is extracted from TF Lite +//23.11.2018 +struct Div { + static inline void Call( + const float* input1_data, const RuntimeShape& in1_shape, + const float* input2_data, const RuntimeShape& in2_shape, + float* output_data, const RuntimeShape& out_shape) { + if (in1_shape != in2_shape) { + BroadcastBinaryFunction4DSlow<float, float, float>( + in1_shape, input1_data, + in2_shape, input2_data, + out_shape, output_data, + [](float a, float b) { return a / b; } + ); + } else { + auto input1 = MapAsVector(input1_data, in1_shape.FlatSize()); + auto input2 = MapAsVector(input2_data, in2_shape.FlatSize()); + auto output = MapAsVector(output_data, out_shape.FlatSize()); + output = input1.cwiseQuotient(input2); + } + } +}; |