Diffstat (limited to 'compiler/nnc/backends/soft_backend/code_snippets/cpp_elementwise.def')
-rw-r--r--  compiler/nnc/backends/soft_backend/code_snippets/cpp_elementwise.def  273
1 file changed, 273 insertions(+), 0 deletions(-)
diff --git a/compiler/nnc/backends/soft_backend/code_snippets/cpp_elementwise.def b/compiler/nnc/backends/soft_backend/code_snippets/cpp_elementwise.def
new file mode 100644
index 000000000..422b34242
--- /dev/null
+++ b/compiler/nnc/backends/soft_backend/code_snippets/cpp_elementwise.def
@@ -0,0 +1,273 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// TODO(ycling): Refactoring. Remove BroadcastLogical and use the more
+// generalized and efficient BroadcastBinaryFunction.
+//
+// It also appears to duplicate MinimumMaximum.
+//
+// R: Result type. T1: Input 1 type. T2: Input 2 type.
+template <typename R, typename T1, typename T2>
+inline void BroadcastBinaryFunction4DSlow(
+ const RuntimeShape& unextended_input1_shape, const T1* input1_data,
+ const RuntimeShape& unextended_input2_shape, const T2* input2_data,
+ const RuntimeShape& unextended_output_shape, R* output_data,
+ R (*func)(T1, T2)) {
+ TFLITE_DCHECK_LE(unextended_input1_shape.DimensionsCount(), 4);
+ TFLITE_DCHECK_LE(unextended_input2_shape.DimensionsCount(), 4);
+ TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), 4);
+ const RuntimeShape output_shape =
+ RuntimeShape::ExtendedShape(4, unextended_output_shape);
+
+ NdArrayDesc<4> desc1;
+ NdArrayDesc<4> desc2;
+ NdArrayDescsForElementwiseBroadcast(unextended_input1_shape,
+ unextended_input2_shape, &desc1, &desc2);
+
+ for (int b = 0; b < output_shape.Dims(0); ++b) {
+ for (int y = 0; y < output_shape.Dims(1); ++y) {
+ for (int x = 0; x < output_shape.Dims(2); ++x) {
+ for (int c = 0; c < output_shape.Dims(3); ++c) {
+ auto out_idx = Offset(output_shape, b, y, x, c);
+ auto in1_idx = SubscriptToIndex(desc1, b, y, x, c);
+ auto in2_idx = SubscriptToIndex(desc2, b, y, x, c);
+ auto in1_val = input1_data[in1_idx];
+ auto in2_val = input2_data[in2_idx];
+ output_data[out_idx] = func(in1_val, in2_val);
+ }
+ }
+ }
+ }
+}
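+
+// Usage sketch (illustrative): a captureless lambda converts to the
+// R (*)(T1, T2) function-pointer parameter, so a broadcast subtraction
+// can be written as
+//   BroadcastBinaryFunction4DSlow<float, float, float>(
+//       in1_shape, in1_data, in2_shape, in2_data, out_shape, out_data,
+//       [](float a, float b) { return a - b; });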
+
+// R: Result type. T1: Input 1 type. T2: Input 2 type.
+// TODO(renjieliu): Refactor other binary functions to use this one.
+template <typename R, typename T1, typename T2>
+inline void BinaryFunction(const RuntimeShape& input1_shape,
+ const T1* input1_data,
+ const RuntimeShape& input2_shape,
+ const T2* input2_data,
+ const RuntimeShape& output_shape, R* output_data,
+ R (*func)(T1, T2)) {
+ const int flat_size =
+ MatchingFlatSize(input1_shape, input2_shape, output_shape);
+ for (int i = 0; i < flat_size; ++i) {
+ output_data[i] = func(input1_data[i], input2_data[i]);
+ }
+}
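+
+// Usage sketch (illustrative): with three identical shapes this reduces to
+// one flat elementwise loop, e.g.
+//   BinaryFunction<float, float, float>(shape, a_data, shape, b_data, shape,
+//                                       out_data,
+//                                       [](float a, float b) { return a * b; });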
+
+struct Sub {
+ static inline void Sub_(const float* input1_data, const float* input2_data,
+ float* output_data, const int size) {
+ int i = 0;
+#ifdef USE_NEON
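+ // Main NEON loop: 16 floats per iteration across four 128-bit quad registers.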
+ for (; i <= size - 16; i += 16) {
+ auto a10 = vld1q_f32(input1_data + i);
+ auto a11 = vld1q_f32(input1_data + i + 4);
+ auto a12 = vld1q_f32(input1_data + i + 8);
+ auto a13 = vld1q_f32(input1_data + i + 12);
+ auto a20 = vld1q_f32(input2_data + i);
+ auto a21 = vld1q_f32(input2_data + i + 4);
+ auto a22 = vld1q_f32(input2_data + i + 8);
+ auto a23 = vld1q_f32(input2_data + i + 12);
+ auto x0 = vsubq_f32(a10, a20);
+ auto x1 = vsubq_f32(a11, a21);
+ auto x2 = vsubq_f32(a12, a22);
+ auto x3 = vsubq_f32(a13, a23);
+ vst1q_f32(output_data + i, x0);
+ vst1q_f32(output_data + i + 4, x1);
+ vst1q_f32(output_data + i + 8, x2);
+ vst1q_f32(output_data + i + 12, x3);
+ }
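+ // Remainder loop: leftover groups of 4 floats, one quad register at a time.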
+ for (; i <= size - 4; i += 4) {
+ auto a1 = vld1q_f32(input1_data + i);
+ auto a2 = vld1q_f32(input2_data + i);
+ auto x = vsubq_f32(a1, a2);
+ vst1q_f32(output_data + i, x);
+ }
+#endif // USE_NEON
+
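+ // Scalar tail: the last (size % 4) elements, or the whole range when
+ // USE_NEON is not defined.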
+ for (; i < size; i++) {
+ output_data[i] = input1_data[i] - input2_data[i];
+ }
+ }
+
+ static inline void Call(
+ const float* input1_data, const RuntimeShape& in1_shape,
+ const float* input2_data, const RuntimeShape& in2_shape,
+ float* output_data, const RuntimeShape& out_shape) {
+ if (in1_shape != in2_shape) {
+ BroadcastBinaryFunction4DSlow<float, float, float>(
+ in1_shape, input1_data,
+ in2_shape, input2_data,
+ out_shape, output_data,
+ [](float a, float b) { return a - b; }
+ );
+ } else {
+ Sub_(input1_data, input2_data, output_data, out_shape.FlatSize());
+ }
+ }
+};
+
+struct Add {
+ static inline void Add_(const float* input1_data, const float* input2_data,
+ float* output_data, const int size) {
+ int i = 0;
+#ifdef USE_NEON
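+ // Same three-stage structure as Sub_: 16-wide NEON loop, 4-wide remainder
+ // loop, then a scalar tail after the #endif.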
+ for (; i <= size - 16; i += 16) {
+ auto a10 = vld1q_f32(input1_data + i);
+ auto a11 = vld1q_f32(input1_data + i + 4);
+ auto a12 = vld1q_f32(input1_data + i + 8);
+ auto a13 = vld1q_f32(input1_data + i + 12);
+ auto a20 = vld1q_f32(input2_data + i);
+ auto a21 = vld1q_f32(input2_data + i + 4);
+ auto a22 = vld1q_f32(input2_data + i + 8);
+ auto a23 = vld1q_f32(input2_data + i + 12);
+ auto x0 = vaddq_f32(a10, a20);
+ auto x1 = vaddq_f32(a11, a21);
+ auto x2 = vaddq_f32(a12, a22);
+ auto x3 = vaddq_f32(a13, a23);
+ vst1q_f32(output_data + i, x0);
+ vst1q_f32(output_data + i + 4, x1);
+ vst1q_f32(output_data + i + 8, x2);
+ vst1q_f32(output_data + i + 12, x3);
+ }
+ for (; i <= size - 4; i += 4) {
+ auto a1 = vld1q_f32(input1_data + i);
+ auto a2 = vld1q_f32(input2_data + i);
+ auto x = vaddq_f32(a1, a2);
+ vst1q_f32(output_data + i, x);
+ }
+#endif // USE_NEON
+
+ for (; i < size; i++) {
+ output_data[i] = input1_data[i] + input2_data[i];
+ }
+ }
+
+ static inline void Call(
+ const float* input1_data, const RuntimeShape& in1_shape,
+ const float* input2_data, const RuntimeShape& in2_shape,
+ float* output_data, const RuntimeShape& out_shape) {
+ if (in1_shape != in2_shape) {
+ BroadcastBinaryFunction4DSlow<float, float, float>(
+ in1_shape, input1_data,
+ in2_shape, input2_data,
+ out_shape, output_data,
+ [](float a, float b) { return a + b; }
+ );
+ } else {
+ Add_(input1_data, input2_data, output_data, out_shape.FlatSize());
+ }
+ }
+};
+
+struct Max {
+ static inline void Call(
+ const float* input1_data, const RuntimeShape& in1_shape,
+ const float* input2_data, const RuntimeShape& in2_shape,
+ float* output_data, const RuntimeShape& out_shape) {
+ if (in1_shape != in2_shape) {
+ BroadcastBinaryFunction4DSlow<float, float, float>(
+ in1_shape, input1_data,
+ in2_shape, input2_data,
+ out_shape, output_data,
+ [](float a, float b) { return std::max(a, b); }
+ );
+ } else {
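+ // Same-shape fast path: map the raw buffers as Eigen vectors and take the
+ // coefficient-wise maximum.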
+ auto input1 = MapAsVector(input1_data, in1_shape.FlatSize());
+ auto input2 = MapAsVector(input2_data, in2_shape.FlatSize());
+ auto output = MapAsVector(output_data, out_shape.FlatSize());
+ output = input1.cwiseMax(input2);
+ }
+ }
+};
+
+struct Mul {
+ static inline void Call(const float* input1_data, const RuntimeShape& in1_shape,
+ const float* input2_data, const RuntimeShape& in2_shape,
+ float* output_data, const RuntimeShape& out_shape) {
+ if (in1_shape != in2_shape) {
+ BroadcastBinaryFunction4DSlow<float, float, float>(
+ in1_shape, input1_data,
+ in2_shape, input2_data,
+ out_shape, output_data,
+ [](float a, float b) { return a * b; });
+ } else {
+ Mul_(input1_data, input2_data, output_data, out_shape.FlatSize());
+ }
+ }
+
+ static inline void Mul_(const float* input1_data, const float* input2_data,
+ float* output_data, const int size) {
+ int i = 0;
+#ifdef USE_NEON
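+ // Vectorized multiply, mirroring the loop structure of Sub_ and Add_.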
+ for (; i <= size - 16; i += 16) {
+ auto a10 = vld1q_f32(input1_data + i);
+ auto a11 = vld1q_f32(input1_data + i + 4);
+ auto a12 = vld1q_f32(input1_data + i + 8);
+ auto a13 = vld1q_f32(input1_data + i + 12);
+ auto a20 = vld1q_f32(input2_data + i);
+ auto a21 = vld1q_f32(input2_data + i + 4);
+ auto a22 = vld1q_f32(input2_data + i + 8);
+ auto a23 = vld1q_f32(input2_data + i + 12);
+ auto x0 = vmulq_f32(a10, a20);
+ auto x1 = vmulq_f32(a11, a21);
+ auto x2 = vmulq_f32(a12, a22);
+ auto x3 = vmulq_f32(a13, a23);
+
+ vst1q_f32(output_data + i, x0);
+ vst1q_f32(output_data + i + 4, x1);
+ vst1q_f32(output_data + i + 8, x2);
+ vst1q_f32(output_data + i + 12, x3);
+ }
+ for (; i <= size - 4; i += 4) {
+ auto a1 = vld1q_f32(input1_data + i);
+ auto a2 = vld1q_f32(input2_data + i);
+ auto x = vmulq_f32(a1, a2);
+
+ vst1q_f32(output_data + i, x);
+ }
+#endif // USE_NEON
+
+ for (; i < size; i++) {
+ output_data[i] = input1_data[i] * input2_data[i];
+ }
+ }
+};
+
+// TODO(23.11.2018): maybe move Div to a separate file, since everything else
+// here is extracted from TF Lite.
+struct Div {
+ static inline void Call(
+ const float* input1_data, const RuntimeShape& in1_shape,
+ const float* input2_data, const RuntimeShape& in2_shape,
+ float* output_data, const RuntimeShape& out_shape) {
+ if (in1_shape != in2_shape) {
+ BroadcastBinaryFunction4DSlow<float, float, float>(
+ in1_shape, input1_data,
+ in2_shape, input2_data,
+ out_shape, output_data,
+ [](float a, float b) { return a / b; }
+ );
+ } else {
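+ // Same-shape fast path: coefficient-wise quotient on Eigen-mapped vectors
+ // (no handwritten NEON kernel is provided for division).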
+ auto input1 = MapAsVector(input1_data, in1_shape.FlatSize());
+ auto input2 = MapAsVector(input2_data, in2_shape.FlatSize());
+ auto output = MapAsVector(output_data, out_shape.FlatSize());
+ output = input1.cwiseQuotient(input2);
+ }
+ }
+};
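+
+// Usage sketch (hypothetical call site; the generated code may differ):
+// every operation exposes the same static Call interface, e.g.
+//   Add::Call(lhs_data, lhs_shape, rhs_data, rhs_shape, out_data, out_shape);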