summary refs log tree commit diff
path: root/compiler/ann-ref/src/ops/Add.float.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'compiler/ann-ref/src/ops/Add.float.cpp')
-rw-r--r-- compiler/ann-ref/src/ops/Add.float.cpp 122
1 files changed, 122 insertions, 0 deletions
diff --git a/compiler/ann-ref/src/ops/Add.float.cpp b/compiler/ann-ref/src/ops/Add.float.cpp
new file mode 100644
index 000000000..ce825d43d
--- /dev/null
+++ b/compiler/ann-ref/src/ops/Add.float.cpp
@@ -0,0 +1,122 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (C) 2017 The Android Open Source Project
+ * Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Add.float.h"
+
+#include "internal/Array.h"
+#include "internal/NDArray.h"
+#include "internal/Matrix.h"
+#include "internal/Fused.h"
+#include "internal/ActivationUtils.h"
+
+// Adds two float buffers element by element and stores Ac(sum) in output_data.
+// The checks below presumably assert that all three Dims<4> agree and are packed.
+template <FusedActivationFunctionType Ac>
+void Add(const float *input1_data, const Dims<4> &input1_dims, const float *input2_data,
+         const Dims<4> &input2_dims, float *output_data, const Dims<4> &output_dims)
+{
+  MatchingArraySize(input1_dims, 3, input2_dims, 3, output_dims, 3);
+  MatchingArraySize(input1_dims, 2, input2_dims, 2, output_dims, 2);
+  MatchingArraySize(input1_dims, 1, input2_dims, 1, output_dims, 1);
+  MatchingArraySize(input1_dims, 0, input2_dims, 0, output_dims, 0);
+  DCHECK(IsPackedWithoutStrides(input1_dims));
+  DCHECK(IsPackedWithoutStrides(input2_dims));
+  DCHECK(IsPackedWithoutStrides(output_dims));
+
+  // Outermost extent times its stride: the number of floats visited by the loop.
+  const int flat_size = input1_dims.sizes[3] * input1_dims.strides[3];
+  for (int idx = 0; idx < flat_size; ++idx)
+  {
+    output_data[idx] = ActivationFunction<Ac>(input1_data[idx] + input2_data[idx]);
+  }
+}
+
+// From optimized_ops.h in TensorFlow Lite
+//
+// Adds two float buffers with broadcasting: each output element is
+// Ac(input1 + input2), with every input read through its own NdArrayDesc.
+//
+// TODO: We can implement BroadcastAdd on buffers of arbitrary
+// dimensionality if the runtime code does a single loop over one dimension
+// that handles broadcasting as the base case. The code generator would then
+// generate max(D1, D2) nested for loops.
+// TODO: BroadcastAdd is intentionally duplicated from
+// reference_ops.h. Once an optimized version is implemented and NdArrayDesc<T>
+// is no longer referenced in this file, move NdArrayDesc<T> from types.h to
+// reference_ops.h.
+template <FusedActivationFunctionType Ac>
+void BroadcastAdd(const float *input1_data, const Dims<4> &input1_dims, const float *input2_data,
+                  const Dims<4> &input2_dims, float *output_data, const Dims<4> &output_dims)
+{
+  NdArrayDesc<4> desc1;
+  NdArrayDesc<4> desc2;
+  NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2);
+
+  // TensorFlow orders dimensions (batch, row, col, channel), channel changing
+  // fastest. Dims<4> stores them reversed (index 0 = channel, 3 = batch), so the
+  // channel loop is innermost: smallest stride, best cache behavior.
+  const int batches = ArraySize(output_dims, 3);
+  const int height = ArraySize(output_dims, 2);
+  const int width = ArraySize(output_dims, 1);
+  const int depth = ArraySize(output_dims, 0);
+
+  for (int batch = 0; batch < batches; ++batch)
+  {
+    for (int row = 0; row < height; ++row)
+    {
+      for (int col = 0; col < width; ++col)
+      {
+        for (int ch = 0; ch < depth; ++ch)
+        {
+          const float sum = input1_data[SubscriptToIndex(desc1, ch, col, row, batch)] +
+                            input2_data[SubscriptToIndex(desc2, ch, col, row, batch)];
+          output_data[Offset(output_dims, ch, col, row, batch)] = ActivationFunction<Ac>(sum);
+        }
+      }
+    }
+  }
+}
+
+// Float32 ADD entry point: uses Add when SameShape(shape1, shape2) holds and
+// BroadcastAdd otherwise. Returns true once the dispatch statement has completed.
+bool addFloat32(const float *in1, const Shape &shape1, const float *in2, const Shape &shape2,
+                int32_t activation, float *out, const Shape &shapeOut)
+{
+#define ANDROID_NN_NORMAL_ADD(fused) \
+  Add<FusedActivationFunctionType::fused>(in1, convertShapeToDims(shape1), \
+                                          in2, convertShapeToDims(shape2), \
+                                          out, convertShapeToDims(shapeOut))
+#define ANDROID_NN_BROADCAST_ADD(fused) \
+  BroadcastAdd<FusedActivationFunctionType::fused>( \
+      in1, convertShapeToDims(shape1), in2, convertShapeToDims(shape2), out, \
+      convertShapeToDims(shapeOut))
+
+  // ANDROID_NN_MACRO_DISPATCH is defined elsewhere; presumably it selects on `activation`.
+  if (SameShape(shape1, shape2))
+  {
+    ANDROID_NN_MACRO_DISPATCH(ANDROID_NN_NORMAL_ADD)
+  }
+  else
+  {
+    ANDROID_NN_MACRO_DISPATCH(ANDROID_NN_BROADCAST_ADD)
+  }
+
+#undef ANDROID_NN_NORMAL_ADD
+#undef ANDROID_NN_BROADCAST_ADD
+  return true;
+}