Diffstat (limited to 'compute/cker/include/cker')
-rw-r--r--  compute/cker/include/cker/Shape.h | 286
-rw-r--r--  compute/cker/include/cker/Types.h | 82
-rw-r--r--  compute/cker/include/cker/Utils.h | 159
-rw-r--r--  compute/cker/include/cker/eigen/Utils.h | 56
-rw-r--r--  compute/cker/include/cker/gemmlowp/FixedPoint.h | 289
-rw-r--r--  compute/cker/include/cker/operation/AveragePool.h | 101
-rw-r--r--  compute/cker/include/cker/operation/BinaryArithmeticOps.h | 172
-rw-r--r--  compute/cker/include/cker/operation/Concatenation.h | 93
-rw-r--r--  compute/cker/include/cker/operation/Conv.h | 217
-rw-r--r--  compute/cker/include/cker/operation/DepthwiseConv.h | 217
-rw-r--r--  compute/cker/include/cker/operation/FullyConnected.h | 144
-rw-r--r--  compute/cker/include/cker/operation/Gather.h | 78
-rw-r--r--  compute/cker/include/cker/operation/InstanceNorm.h | 99
-rw-r--r--  compute/cker/include/cker/operation/Logistic.h | 44
-rw-r--r--  compute/cker/include/cker/operation/MaxPool.h | 98
-rw-r--r--  compute/cker/include/cker/operation/Pad.h | 224
-rw-r--r--  compute/cker/include/cker/operation/SoftMax.h | 130
-rw-r--r--  compute/cker/include/cker/operation/TransposeConv.h | 135
-rw-r--r--  compute/cker/include/cker/operation/optimized/AveragePool.h | 105
-rw-r--r--  compute/cker/include/cker/operation/optimized/MaxPool.h | 97
-rw-r--r--  compute/cker/include/cker/operation/optimized/SoftMax.h | 59
-rw-r--r--  compute/cker/include/cker/operation/reference/AveragePool.h | 90
-rw-r--r--  compute/cker/include/cker/operation/reference/MaxPool.h | 84
-rw-r--r--  compute/cker/include/cker/operation/reference/SoftMax.h | 70
24 files changed, 3129 insertions, 0 deletions
diff --git a/compute/cker/include/cker/Shape.h b/compute/cker/include/cker/Shape.h
new file mode 100644
index 000000000..39449c68f
--- /dev/null
+++ b/compute/cker/include/cker/Shape.h
@@ -0,0 +1,286 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_SHAPE_H__
+#define __NNFW_CKER_SHAPE_H__
+
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <cstring>
+#include <initializer_list>
+#include <vector>
+
+#define UNUSED_RELEASE(a) (void)(a)
+
+namespace nnfw
+{
+namespace cker
+{
+
+class Shape
+{
+public:
+ // Shapes with dimensions up to 4 are stored directly in the structure, while
+ // larger shapes are separately allocated.
+ static constexpr int kMaxSmallSize = 4;
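+ // For example, Shape({1, 2, 3, 4}) keeps its dims in the inline _dims array,
+ // while a 5-D shape such as Shape(5, 1) allocates _dims_pointer on the heap.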
+
+ Shape &operator=(Shape const &) = delete;
+
+ Shape() : _size(0) {}
+
+ explicit Shape(int dimensions_count) : _size(dimensions_count)
+ {
+ if (dimensions_count > kMaxSmallSize)
+ {
+ _dims_pointer = new int32_t[dimensions_count];
+ }
+ }
+
+ Shape(int shape_size, int32_t value) : _size(0)
+ {
+ Resize(shape_size);
+ for (int i = 0; i < shape_size; ++i)
+ {
+ SetDim(i, value);
+ }
+ }
+
+ Shape(int dimensions_count, const int32_t *dims_data) : _size(0)
+ {
+ ReplaceWith(dimensions_count, dims_data);
+ }
+
+ Shape(const std::initializer_list<int> init_list) : _size(0) { BuildFrom(init_list); }
+
+ // Avoid using this constructor. We should be able to delete it when C++17
+ // rolls out.
+ Shape(Shape const &other) : _size(other.DimensionsCount())
+ {
+ if (_size > kMaxSmallSize)
+ {
+ _dims_pointer = new int32_t[_size];
+ }
+ std::memcpy(DimsData(), other.DimsData(), sizeof(int32_t) * _size);
+ }
+
+ bool operator==(const Shape &comp) const
+ {
+ return this->_size == comp._size &&
+ std::memcmp(DimsData(), comp.DimsData(), _size * sizeof(int32_t)) == 0;
+ }
+
+ ~Shape()
+ {
+ if (_size > kMaxSmallSize)
+ {
+ delete[] _dims_pointer;
+ }
+ }
+
+ inline int32_t DimensionsCount() const { return _size; }
+ inline int32_t Dims(int i) const
+ {
+ assert(i >= 0);
+ assert(i < _size);
+ return _size > kMaxSmallSize ? _dims_pointer[i] : _dims[i];
+ }
+ inline void SetDim(int i, int32_t val)
+ {
+ assert(i >= 0);
+ assert(i < _size);
+ if (_size > kMaxSmallSize)
+ {
+ _dims_pointer[i] = val;
+ }
+ else
+ {
+ _dims[i] = val;
+ }
+ }
+
+ inline int32_t *DimsData() { return _size > kMaxSmallSize ? _dims_pointer : _dims; }
+ inline const int32_t *DimsData() const { return _size > kMaxSmallSize ? _dims_pointer : _dims; }
+ // The caller must ensure that the shape is no bigger than 4-D.
+ inline const int32_t *DimsDataUpTo4D() const { return _dims; }
+
+ inline void Resize(int dimensions_count)
+ {
+ if (_size > kMaxSmallSize)
+ {
+ delete[] _dims_pointer;
+ }
+ _size = dimensions_count;
+ if (dimensions_count > kMaxSmallSize)
+ {
+ _dims_pointer = new int32_t[dimensions_count];
+ }
+ }
+
+ inline void ReplaceWith(int dimensions_count, const int32_t *dims_data)
+ {
+ Resize(dimensions_count);
+ int32_t *dst_dims = DimsData();
+ std::memcpy(dst_dims, dims_data, dimensions_count * sizeof(int32_t));
+ }
+
+ template <typename T> inline void BuildFrom(const T &src_iterable)
+ {
+ const int dimensions_count = std::distance(src_iterable.begin(), src_iterable.end());
+ Resize(dimensions_count);
+ int32_t *data = DimsData();
+ for (auto it : src_iterable)
+ {
+ *data = it;
+ ++data;
+ }
+ }
+
+ // This will probably be factored out. Old code made substantial use of 4-D
+ // shapes, and so this function is used to extend smaller shapes. Note that
+ // (a) as Dims<4>-dependent code is eliminated, the reliance on this should be
+ // reduced, and (b) some kernels are strictly 4-D, but then the shapes of their
+ // inputs should already be 4-D, so this function should not be needed.
+ inline static Shape ExtendedShape(int new_shape_size, const Shape &shape)
+ {
+ return Shape(new_shape_size, shape, 1);
+ }
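+ // For example, ExtendedShape(4, Shape({3, 4})) yields {1, 1, 3, 4}: the
+ // original dims are right-aligned and the leading dims are padded with 1.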
+
+ inline void BuildFrom(const std::initializer_list<int> init_list)
+ {
+ BuildFrom<const std::initializer_list<int>>(init_list);
+ }
+
+ // Returns the total count of elements, that is the size when flattened into a
+ // vector.
+ inline int FlatSize() const
+ {
+ int buffer_size = 1;
+ const int *dims_data = DimsData();
+ for (int i = 0; i < _size; i++)
+ {
+ const int dim = dims_data[i];
+ assert(dim >= 1);
+ buffer_size *= dim;
+ }
+ return buffer_size;
+ }
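+ // For example, a {2, 3, 4} shape has FlatSize() == 24.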
+
+ bool operator!=(const Shape &comp) const { return !((*this) == comp); }
+
+private:
+ // For use only by ExtendedShape(), written to guarantee (return-value) copy
+ // elision in C++17.
+ // This creates a shape padded to the desired size with the specified value.
+ Shape(int new_shape_size, const Shape &shape, int pad_value) : _size(0)
+ {
+ assert(new_shape_size >= shape.DimensionsCount());
+ assert(new_shape_size <= kMaxSmallSize);
+ Resize(new_shape_size);
+ const int size_increase = new_shape_size - shape.DimensionsCount();
+ for (int i = 0; i < size_increase; ++i)
+ {
+ SetDim(i, pad_value);
+ }
+ std::memcpy(DimsData() + size_increase, shape.DimsData(),
+ sizeof(int32_t) * shape.DimensionsCount());
+ }
+
+ int32_t _size;
+ union {
+ int32_t _dims[kMaxSmallSize];
+ int32_t *_dims_pointer;
+ };
+};
+
+inline int MatchingDim(const Shape &shape1, int index1, const Shape &shape2, int index2)
+{
+ UNUSED_RELEASE(shape2);
+ UNUSED_RELEASE(index2);
+ assert(shape1.Dims(index1) == shape2.Dims(index2));
+ return shape1.Dims(index1);
+}
+
+inline Shape GetShape(const std::vector<int32_t> &data) { return Shape(data.size(), data.data()); }
+
+inline int Offset(const Shape &shape, int i0, int i1, int i2, int i3)
+{
+ assert(shape.DimensionsCount() == 4);
+ const int *dims_data = shape.DimsDataUpTo4D();
+ assert(i0 >= 0 && i0 < dims_data[0]);
+ assert(i1 >= 0 && i1 < dims_data[1]);
+ assert(i2 >= 0 && i2 < dims_data[2]);
+ assert(i3 >= 0 && i3 < dims_data[3]);
+ return ((i0 * dims_data[1] + i1) * dims_data[2] + i2) * dims_data[3] + i3;
+}
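+// For an NHWC shape {N, H, W, C} this is the row-major index
+// ((i0 * H + i1) * W + i2) * C + i3, so the channel index i3 varies fastest.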
+
+inline int FlatSizeSkipDim(const Shape &shape, int skip_dim)
+{
+ const int dims_count = shape.DimensionsCount();
+ assert(skip_dim >= 0 && skip_dim < dims_count);
+ const auto *dims_data = shape.DimsData();
+ int flat_size = 1;
+ for (int i = 0; i < dims_count; ++i)
+ {
+ flat_size *= (i == skip_dim) ? 1 : dims_data[i];
+ }
+ return flat_size;
+}
+
+// Flat size calculation, checking that dimensions match with one or more other
+// arrays.
+inline int MatchingFlatSize(const Shape &shape, const Shape &check_shape_0)
+{
+ UNUSED_RELEASE(check_shape_0);
+ assert(shape.DimensionsCount() == check_shape_0.DimensionsCount());
+ const int dims_count = shape.DimensionsCount();
+ for (int i = 0; i < dims_count; ++i)
+ {
+ assert(shape.Dims(i) == check_shape_0.Dims(i));
+ }
+ return shape.FlatSize();
+}
+
+inline int MatchingFlatSize(const Shape &shape, const Shape &check_shape_0,
+ const Shape &check_shape_1)
+{
+ UNUSED_RELEASE(check_shape_0);
+ assert(shape.DimensionsCount() == check_shape_0.DimensionsCount());
+ const int dims_count = shape.DimensionsCount();
+ for (int i = 0; i < dims_count; ++i)
+ {
+ assert(shape.Dims(i) == check_shape_0.Dims(i));
+ }
+ return MatchingFlatSize(shape, check_shape_1);
+}
+
+inline int MatchingFlatSizeSkipDim(const Shape &shape, int skip_dim, const Shape &check_shape_0)
+{
+ UNUSED_RELEASE(check_shape_0);
+ const int dims_count = shape.DimensionsCount();
+ for (int i = 0; i < dims_count; ++i)
+ {
+ if (i != skip_dim)
+ {
+ assert(shape.Dims(i) == check_shape_0.Dims(i));
+ }
+ }
+ return FlatSizeSkipDim(shape, skip_dim);
+}
+
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_SHAPE_H__
diff --git a/compute/cker/include/cker/Types.h b/compute/cker/include/cker/Types.h
new file mode 100644
index 000000000..85654b040
--- /dev/null
+++ b/compute/cker/include/cker/Types.h
@@ -0,0 +1,82 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_TYPES_H__
+#define __NNFW_CKER_TYPES_H__
+
+#include <cstdint>
+
+namespace nnfw
+{
+namespace cker
+{
+
+enum class FusedActivationFunctionType
+{
+ kNone = 0,
+ kRelu6 = 1,
+ kRelu1 = 2,
+ kRelu = 3,
+};
+enum class PaddingType
+{
+ kNone = 0,
+ kSame = 1,
+ kValid = 2,
+};
+
+struct PaddingValues
+{
+ int16_t width;
+ int16_t height;
+};
+
+struct PoolParams
+{
+ FusedActivationFunctionType activation;
+ PaddingType padding_type;
+ PaddingValues padding_values;
+ int stride_height;
+ int stride_width;
+ int filter_height;
+ int filter_width;
+ // uint8, etc, activation params.
+ int32_t quantized_activation_min;
+ int32_t quantized_activation_max;
+ // float activation params.
+ float float_activation_min;
+ float float_activation_max;
+};
+
+struct SoftmaxParams
+{
+ // beta is not really used (not a Tensorflow parameter) and not implemented
+ // for LogSoftmax.
+ double beta;
+ // uint8 inference params. Used even when beta defaults to 1.0.
+ int32_t input_multiplier;
+ int32_t input_left_shift;
+ // Reverse scaling is only used by LogSoftmax.
+ int32_t reverse_scaling_divisor;
+ int32_t reverse_scaling_right_shift;
+ int diff_min;
+};
+
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_TYPES_H__
diff --git a/compute/cker/include/cker/Utils.h b/compute/cker/include/cker/Utils.h
new file mode 100644
index 000000000..d1f1723c4
--- /dev/null
+++ b/compute/cker/include/cker/Utils.h
@@ -0,0 +1,159 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_UTILS_H__
+#define __NNFW_CKER_UTILS_H__
+
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+
+#include "cker/gemmlowp/FixedPoint.h"
+#include "Shape.h"
+
+namespace nnfw
+{
+namespace cker
+{
+
+template <typename T>
+inline T ActivationFunctionWithMinMax(T x, T output_activation_min, T output_activation_max)
+{
+ return std::min<T>(std::max<T>(x, output_activation_min), output_activation_max);
+}
+
+inline int32_t MultiplyByQuantizedMultiplier(int32_t x, int32_t quantized_multiplier, int shift)
+{
+ int left_shift = shift > 0 ? shift : 0;
+ int right_shift = shift > 0 ? 0 : -shift;
+ return gemmlowp::RoundingDivideByPOT(
+ gemmlowp::SaturatingRoundingDoublingHighMul(x * (1 << left_shift), quantized_multiplier),
+ right_shift);
+}
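+// For example, a real multiplier of 0.5 can be represented as
+// quantized_multiplier = 1 << 30 (0.5 * 2^31) with shift = 0, so
+// MultiplyByQuantizedMultiplier(x, 1 << 30, 0) returns roughly x / 2.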
+
+inline int32_t MultiplyByQuantizedMultiplierGreaterThanOne(int32_t x, int32_t quantized_multiplier,
+ int left_shift)
+{
+ return gemmlowp::SaturatingRoundingDoublingHighMul(x * (1 << left_shift), quantized_multiplier);
+}
+
+inline int NodeOffset(int b, int h, int w, int height, int width)
+{
+ return (b * height + h) * width + w;
+}
+
+inline int CountLeadingZeros(uint32_t integer_input)
+{
+ const uint32_t one_in_leading_positive = 1U << 31;
+ int leading_zeros = 0;
+ while (integer_input < one_in_leading_positive)
+ {
+ integer_input <<= 1;
+ ++leading_zeros;
+ }
+ return leading_zeros;
+}
+
+// Comment from tensorflow lite:
+//
+// DO NOT USE THIS STRUCT FOR NEW FUNCTIONALITY BEYOND IMPLEMENTING
+// BROADCASTING.
+//
+// NdArrayDesc<N> describes the shape and memory layout of an N-dimensional
+// rectangular array of numbers.
+//
+// NdArrayDesc<N> is basically identical to Dims<N> defined in types.h.
+// However, as Dims<N> is to be deprecated, this class exists as an adaptor
+// to enable simple unoptimized implementations of element-wise broadcasting
+// operations.
+template <int N> struct NdArrayDesc
+{
+ // The "extent" of each dimension. Indices along dimension d must be in the
+ // half-open interval [0, extents[d]).
+ int extents[N];
+
+ // The number of *elements* (not bytes) between consecutive indices of each
+ // dimension.
+ int strides[N];
+};
+
+// Comment from tensorflow lite:
+//
+// DO NOT USE THIS FUNCTION FOR NEW FUNCTIONALITY BEYOND IMPLEMENTING
+// BROADCASTING.
+//
+// Same as Offset(), except takes as NdArrayDesc<N> instead of Dims<N>.
+inline int SubscriptToIndex(const NdArrayDesc<4> &desc, int i0, int i1, int i2, int i3)
+{
+ assert(i0 >= 0 && i0 < desc.extents[0]);
+ assert(i1 >= 0 && i1 < desc.extents[1]);
+ assert(i2 >= 0 && i2 < desc.extents[2]);
+ assert(i3 >= 0 && i3 < desc.extents[3]);
+ return i0 * desc.strides[0] + i1 * desc.strides[1] + i2 * desc.strides[2] + i3 * desc.strides[3];
+}
+
+template <int N>
+inline void
+NdArrayDescsForElementwiseBroadcast(const Shape &input0_shape, const Shape &input1_shape,
+ NdArrayDesc<N> *desc0_out, NdArrayDesc<N> *desc1_out)
+{
+ assert(desc0_out != nullptr);
+ assert(desc1_out != nullptr);
+
+ auto extended_input0_shape = Shape::ExtendedShape(N, input0_shape);
+ auto extended_input1_shape = Shape::ExtendedShape(N, input1_shape);
+
+ // Copy dims to desc, calculating strides.
+ int desc0_stride = 1;
+ int desc1_stride = 1;
+ for (int i = N - 1; i >= 0; --i)
+ {
+ desc0_out->extents[i] = extended_input0_shape.Dims(i);
+ desc0_out->strides[i] = desc0_stride;
+ desc0_stride *= extended_input0_shape.Dims(i);
+ desc1_out->extents[i] = extended_input1_shape.Dims(i);
+ desc1_out->strides[i] = desc1_stride;
+ desc1_stride *= extended_input1_shape.Dims(i);
+ }
+
+ // Walk over each dimension. If the extents are equal do nothing.
+ // Otherwise, set the desc with extent 1 to have extent equal to the other and
+ // stride 0.
+ for (int i = 0; i < N; ++i)
+ {
+ const int extent0 = extended_input0_shape.Dims(i);
+ const int extent1 = extended_input1_shape.Dims(i);
+ if (extent0 != extent1)
+ {
+ if (extent0 == 1)
+ {
+ desc0_out->strides[i] = 0;
+ desc0_out->extents[i] = extent1;
+ }
+ else
+ {
+ assert(extent1 == 1);
+ desc1_out->strides[i] = 0;
+ desc1_out->extents[i] = extent0;
+ }
+ }
+ }
+}
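+// For example, broadcasting a {1, 1, 1, 4} input against a {1, 2, 3, 4} input
+// leaves the second desc unchanged and gives the first desc extents
+// {1, 2, 3, 4} with stride 0 in the broadcast dimensions, so every (y, x)
+// subscript maps back to the same 4-element row of the smaller input.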
+
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_UTILS_H__
diff --git a/compute/cker/include/cker/eigen/Utils.h b/compute/cker/include/cker/eigen/Utils.h
new file mode 100644
index 000000000..645a61485
--- /dev/null
+++ b/compute/cker/include/cker/eigen/Utils.h
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_EIGEN_UTILS_H__
+#define __NNFW_CKER_EIGEN_UTILS_H__
+
+#if defined(CKER_OPTIMIZED_EIGEN)
+
+#include <Eigen/Core>
+#include <type_traits>
+#include "cker/Shape.h"
+
+namespace nnfw
+{
+namespace cker
+{
+
+// Make a local MatrixMap typedef allowing to map a scalar array as an Eigen
+// matrix expression; the std::conditional below preserves const-ness of the
+// mapped data.
+template <typename Scalar>
+using MatrixMap = typename std::conditional<
+ std::is_const<Scalar>::value,
+ Eigen::Map<const Eigen::Matrix<typename std::remove_const<Scalar>::type, Eigen::Dynamic,
+ Eigen::Dynamic>>,
+ Eigen::Map<Eigen::Matrix<Scalar, Eigen::Dynamic, Eigen::Dynamic>>>::type;
+
+template <typename Scalar>
+MatrixMap<Scalar> MapAsMatrixWithLastDimAsRows(Scalar *data, const Shape &shape)
+{
+ const int dims_count = shape.DimensionsCount();
+ const int rows = shape.Dims(dims_count - 1);
+ const int cols = FlatSizeSkipDim(shape, dims_count - 1);
+ return MatrixMap<Scalar>(data, rows, cols);
+}
+
+} // namespace cker
+} // namespace nnfw
+
+#endif // defined(CKER_OPTIMIZED_EIGEN)
+
+#endif // __NNFW_CKER_EIGEN_UTILS_H__
diff --git a/compute/cker/include/cker/gemmlowp/FixedPoint.h b/compute/cker/include/cker/gemmlowp/FixedPoint.h
new file mode 100644
index 000000000..159e01a22
--- /dev/null
+++ b/compute/cker/include/cker/gemmlowp/FixedPoint.h
@@ -0,0 +1,289 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2015 The Gemmlowp Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_GEMMLOWP_FIXED_POINT_H__
+#define __NNFW_CKER_GEMMLOWP_FIXED_POINT_H__
+
+#include <algorithm>
+#include <cassert>
+#include <limits>
+
+namespace nnfw
+{
+namespace cker
+{
+namespace gemmlowp
+{
+
+inline int32_t RoundingHalfSum(int32_t a, int32_t b)
+{
+ int64_t a64 = a;
+ int64_t b64 = b;
+ int64_t sum = a64 + b64;
+ int64_t sign = sum >= 0 ? 1 : -1;
+ return static_cast<int32_t>((sum + sign) / 2);
+}
+
+inline int32_t SaturatingRoundingDoublingHighMul(int32_t a, int32_t b)
+{
+ bool overflow = a == b && a == std::numeric_limits<int32_t>::min();
+ int64_t a_64(a);
+ int64_t b_64(b);
+ int64_t ab_64 = a_64 * b_64;
+ int32_t nudge = ab_64 >= 0 ? (1 << 30) : (1 - (1 << 30));
+ int32_t ab_x2_high32 = static_cast<int32_t>((ab_64 + nudge) / (1ll << 31));
+ return overflow ? std::numeric_limits<int32_t>::max() : ab_x2_high32;
+}
+
+// Correctly-rounded-to-nearest division by a power-of-two.
+// Also known as a rounding arithmetic right shift.
+inline int32_t RoundingDivideByPOT(int32_t x, int exponent)
+{
+ assert(exponent >= 0);
+ assert(exponent <= 31);
+ const int32_t mask = ((1ll << exponent) - 1);
+ const int32_t zero = 0;
+ const int32_t one = 1;
+ const int32_t remainder = x & mask;
+ const int32_t threshold = (mask >> 1) + ((x < zero) ? one : zero);
+ return ((x >> exponent) + ((remainder > threshold) ? one : zero));
+}
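+// For example, RoundingDivideByPOT(5, 1) == 3 and RoundingDivideByPOT(-5, 1)
+// == -3: exact halves are rounded away from zero.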
+
+// Returns the product of a run-time integer value by a compile-time power
+// of two, with either a positive exponent (equivalent to an arithmetic
+// left shift, saturating) or a negative exponent (equivalent to an arithmetic
+// right shift, rounding to nearest).
+template <int Exponent, int ExponentSign = (Exponent > 0 ? 1 : Exponent < 0 ? -1 : 0)>
+struct ImplSaturatingRoundingMultiplyByPOT
+{
+};
+
+template <int Exponent> struct ImplSaturatingRoundingMultiplyByPOT<Exponent, 0>
+{
+ static int32_t eval(int32_t x) { return x; }
+};
+
+template <int Exponent> struct ImplSaturatingRoundingMultiplyByPOT<Exponent, 1>
+{
+ static int32_t eval(int32_t x)
+ {
+ const int32_t min = (std::numeric_limits<int32_t>::min());
+ const int32_t max = (std::numeric_limits<int32_t>::max());
+ const int32_t threshold = ((1 << (31 - Exponent)) - 1);
+ const int32_t zero = 0;
+ const int32_t one = 1;
+
+ const int32_t positive_mask = ((x > threshold) ? ~zero : zero);
+ const int32_t negative_mask = ((x < -threshold) ? ~zero : zero);
+
+ int32_t result = (x * (one << Exponent));
+ result = (positive_mask ? max : result);
+ result = (negative_mask ? min : result);
+ return result;
+ }
+};
+
+template <int Exponent> struct ImplSaturatingRoundingMultiplyByPOT<Exponent, -1>
+{
+ static int32_t eval(int32_t x) { return RoundingDivideByPOT(x, -Exponent); }
+};
+
+template <int Exponent> int32_t SaturatingRoundingMultiplyByPOT(int32_t x)
+{
+ return ImplSaturatingRoundingMultiplyByPOT<Exponent>::eval(x);
+}
+
+template <int tIntegerBits> class FixedPoint
+{
+public:
+ static constexpr int kTotalBits = 8 * sizeof(int32_t);
+ static constexpr int kIntegerBits = tIntegerBits;
+ static constexpr int kFractionalBits = kTotalBits - 1 - kIntegerBits;
+ static_assert(kIntegerBits >= 0 && kIntegerBits < kTotalBits, "bad IntegerBits");
+
+ static int32_t ScalarRawMax() { return std::numeric_limits<int32_t>::max(); }
+
+ static FixedPoint FromRaw(int32_t x)
+ {
+ FixedPoint retval;
+ retval.raw() = x;
+ return retval;
+ }
+
+ static FixedPoint FromScalarRaw(int32_t x) { return FromRaw(x); }
+
+ template <int Exponent> static FixedPoint ConstantPOT()
+ {
+ static constexpr int kOffset = kFractionalBits + Exponent;
+ static_assert(kOffset < 31, "Constant not exactly representable in this fixed-point format");
+ return FromScalarRaw((int32_t)1 << kOffset);
+ }
+
+ static FixedPoint Zero() { return FromScalarRaw(0); }
+
+ static FixedPoint One()
+ {
+ return FromScalarRaw(kIntegerBits == 0 ? ScalarRawMax() : ((int32_t)1 << kFractionalBits));
+ }
+
+ int32_t raw() const { return i_; }
+ int32_t &raw() { return i_; }
+
+private:
+ int32_t i_;
+};
+
+// A FixedPoint multiplication is just a
+// SaturatingRoundingDoublingHighMul operation on the underlying
+// raw integer values. The IntegerBits simply add up, as is obvious
+// from the fact that the range is [-2^IntegerBits, 2^IntegerBits).
+template <int tIntegerBits_a, int tIntegerBits_b>
+FixedPoint<tIntegerBits_a + tIntegerBits_b> operator*(FixedPoint<tIntegerBits_a> a,
+ FixedPoint<tIntegerBits_b> b)
+{
+ FixedPoint<tIntegerBits_a + tIntegerBits_b> c;
+ c.raw() = SaturatingRoundingDoublingHighMul(a.raw(), b.raw());
+ return c;
+}
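+// For example, multiplying two FixedPoint<0> values representing 0.5
+// (raw 1 << 30) yields a FixedPoint<0> representing 0.25 (raw 1 << 29).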
+
+// Tweaking IntegerBits gives exact multiplication by a power of two.
+template <int tExponent, int tIntegerBits>
+FixedPoint<tExponent + tIntegerBits> ExactMulByPot(FixedPoint<tIntegerBits> a)
+{
+ FixedPoint<tExponent + tIntegerBits> c;
+ c.raw() = a.raw();
+ return c;
+}
+
+template <int tIntegerBits>
+FixedPoint<tIntegerBits> operator+(FixedPoint<tIntegerBits> a, FixedPoint<tIntegerBits> b)
+{
+ return FixedPoint<tIntegerBits>::FromRaw((a.raw() + b.raw()));
+}
+template <int tIntegerBits>
+FixedPoint<tIntegerBits> operator-(FixedPoint<tIntegerBits> a, FixedPoint<tIntegerBits> b)
+{
+ return FixedPoint<tIntegerBits>::FromRaw((a.raw() - b.raw()));
+}
+template <int tIntegerBits>
+FixedPoint<tIntegerBits> operator&(FixedPoint<tIntegerBits> a, FixedPoint<tIntegerBits> b)
+{
+ return FixedPoint<tIntegerBits>::FromRaw((a.raw() & b.raw()));
+}
+
+// Rescale changes the number of IntegerBits and updates the underlying
+// raw integer value accordingly.
+template <int tIntegerBitsDst, int tIntegerBitsSrc>
+FixedPoint<tIntegerBitsDst> Rescale(FixedPoint<tIntegerBitsSrc> x)
+{
+ static constexpr int kExponent = tIntegerBitsSrc - tIntegerBitsDst;
+ FixedPoint<tIntegerBitsDst> result;
+ result.raw() = SaturatingRoundingMultiplyByPOT<kExponent>(x.raw());
+ return result;
+}
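+// For example, Rescale<0>(x) for a FixedPoint<2> x multiplies the raw value by
+// 2^2 (saturating), since the destination format has two more fractional bits.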
+
+// Implementation of exponential function.
+
+// Returns exp(x) for x in [-1/4, 0).
+inline FixedPoint<0> exp_on_interval_between_negative_one_quarter_and_0_excl(FixedPoint<0> a)
+{
+ typedef FixedPoint<0> F;
+ const F constant_term = F::FromScalarRaw(RoundingDivideByPOT(1895147668, 0));
+ const F constant_1_over_3 = F::FromScalarRaw(RoundingDivideByPOT(715827883, 0));
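+ // These raw values encode exp(-1/8) ~= 0.88250 (1895147668 / 2^31) and
+ // 1/3 ~= 0.33333 (715827883 / 2^31) in the 0-integer-bit format.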
+ // We're evaluating a Taylor expansion around -1/8, so we do the change of
+ // variable: x = a + 1/8.
+ // In fixed-point with 0 integer bits, 1/8 is represented by 1 << 28.
+ F x = a + F::template ConstantPOT<-3>();
+ F x2 = x * x;
+ F x3 = x2 * x;
+ F x4 = x2 * x2;
+ F x4_over_4 = F::FromScalarRaw(SaturatingRoundingMultiplyByPOT<-2>(x4.raw()));
+ F x4_over_24_plus_x3_over_6_plus_x2_over_2 = F::FromScalarRaw(
+ SaturatingRoundingMultiplyByPOT<-1>((((x4_over_4 + x3) * constant_1_over_3) + x2).raw()));
+ return (constant_term + constant_term * (x + x4_over_24_plus_x3_over_6_plus_x2_over_2));
+}
+
+// Returns exp(x) for x < 0.
+template <int tIntegerBits> FixedPoint<0> exp_on_negative_values(FixedPoint<tIntegerBits> a)
+{
+ typedef FixedPoint<tIntegerBits> InputF;
+ typedef FixedPoint<0> ResultF;
+ static constexpr int kFractionalBits = InputF::kFractionalBits;
+ static constexpr int kIntegerBits = InputF::kIntegerBits;
+ const InputF kOneQuarter = InputF::template ConstantPOT<-2>();
+ InputF mask = kOneQuarter - InputF::FromScalarRaw(1);
+ InputF a_mod_quarter_minus_one_quarter = (a & mask) - kOneQuarter;
+ ResultF result = exp_on_interval_between_negative_one_quarter_and_0_excl(
+ Rescale<0>(a_mod_quarter_minus_one_quarter));
+ int32_t remainder = (a_mod_quarter_minus_one_quarter - a).raw();
+
+#define GEMMLOWP_EXP_BARREL_SHIFTER(Exponent, FixedPointMultiplier) \
+ if (kIntegerBits > Exponent) \
+ { \
+ const ResultF kMultiplier = \
+ ResultF::FromScalarRaw(RoundingDivideByPOT(FixedPointMultiplier, 0)); \
+ static constexpr int kShiftAmount = \
+ ((kIntegerBits > Exponent) ? (kFractionalBits + Exponent) : 0); \
+ result = ((remainder & (1 << kShiftAmount)) ? (result * kMultiplier) : result); \
+ }
+
+ GEMMLOWP_EXP_BARREL_SHIFTER(-2, 1672461947);
+ GEMMLOWP_EXP_BARREL_SHIFTER(-1, 1302514674);
+ GEMMLOWP_EXP_BARREL_SHIFTER(+0, 790015084);
+ GEMMLOWP_EXP_BARREL_SHIFTER(+1, 290630308);
+ GEMMLOWP_EXP_BARREL_SHIFTER(+2, 39332535);
+ GEMMLOWP_EXP_BARREL_SHIFTER(+3, 720401);
+ GEMMLOWP_EXP_BARREL_SHIFTER(+4, 242);
+
+#undef GEMMLOWP_EXP_BARREL_SHIFTER
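+ // Each FixedPointMultiplier above encodes exp(-2^Exponent) in the
+ // 0-integer-bit format (e.g. 1672461947 / 2^31 ~= exp(-1/4) and
+ // 242 / 2^31 ~= exp(-16)); the corresponding factor is multiplied in when
+ // the matching bit of `remainder` is set.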
+
+ static constexpr int clampB = ((kIntegerBits > 5) ? (36 - kIntegerBits) : 0);
+ if (kIntegerBits > 5)
+ {
+ const InputF clamp = InputF::FromScalarRaw(RoundingDivideByPOT(-(1 << clampB), 0));
+ result.raw() = ((a.raw() < clamp.raw()) ? ResultF::Zero().raw() : result.raw());
+ }
+
+ result.raw() = (a.raw() ? result.raw() : ResultF::One().raw());
+ return result;
+}
+
+// Returns 1 / (1 + x) for x in (0, 1).
+inline FixedPoint<0> one_over_one_plus_x_for_x_in_0_1(FixedPoint<0> a)
+{
+ typedef FixedPoint<0> F0;
+ typedef FixedPoint<2> F2;
+ F0 half_denominator = F0::FromScalarRaw(RoundingHalfSum(a.raw(), F0::One().raw()));
+ // Newton-Raphson division
+ // https://en.wikipedia.org/wiki/Division_algorithm#Newton.E2.80.93Raphson_division
+ // Refer to that page for the logic behind the 48/17 and 32/17 constants.
+ const F2 constant_48_over_17 = F2::FromScalarRaw(RoundingDivideByPOT(1515870810, 0));
+ const F2 constant_neg_32_over_17 = F2::FromScalarRaw(RoundingDivideByPOT(-1010580540, 0));
+ F2 x = constant_48_over_17 + half_denominator * constant_neg_32_over_17;
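+ // The loop below refines x with the Newton-Raphson update
+ // x <- x + x * (1 - half_denominator * x), converging to 1 / half_denominator.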
+ for (int i = 0; i < 3; i++)
+ {
+ F2 half_denominator_times_x = half_denominator * x;
+ F2 one_minus_half_denominator_times_x = F2::One() - half_denominator_times_x;
+ x = x + Rescale<2>(x * one_minus_half_denominator_times_x);
+ }
+ return Rescale<0>(ExactMulByPot<-1>(x));
+}
+
+} // namespace gemmlowp
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_GEMMLOWP_FIXED_POINT_H__
diff --git a/compute/cker/include/cker/operation/AveragePool.h b/compute/cker/include/cker/operation/AveragePool.h
new file mode 100644
index 000000000..b20919429
--- /dev/null
+++ b/compute/cker/include/cker/operation/AveragePool.h
@@ -0,0 +1,101 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_AVERAGE_POOL_H__
+#define __NNFW_CKER_AVERAGE_POOL_H__
+
+#if defined(CKER_OPTIMIZED_EIGEN)
+#include "cker/operation/optimized/AveragePool.h"
+#endif // defined(CKER_OPTIMIZED_EIGEN)
+
+#include "cker/operation/reference/AveragePool.h"
+
+namespace nnfw
+{
+namespace cker
+{
+
+inline void AveragePool(const PoolParams &params, const Shape &input_shape, const float *input_data,
+ const Shape &output_shape, float *output_data)
+{
+#if defined(CKER_OPTIMIZED_EIGEN)
+ optimized::AveragePool(params, input_shape, input_data, output_shape, output_data);
+#else // defined(CKER_OPTIMIZED_EIGEN)
+ reference::AveragePool(params, input_shape, input_data, output_shape, output_data);
+#endif // defined(CKER_OPTIMIZED_EIGEN)
+}
+
+inline void AveragePool(const PoolParams &params, const Shape &input_shape,
+ const uint8_t *input_data, const Shape &output_shape, uint8_t *output_data)
+{
+ assert(params.quantized_activation_min <= params.quantized_activation_max);
+ assert(input_shape.DimensionsCount() == 4);
+ assert(output_shape.DimensionsCount() == 4);
+ const int batches = MatchingDim(input_shape, 0, output_shape, 0);
+ const int depth = MatchingDim(input_shape, 3, output_shape, 3);
+ const int input_height = input_shape.Dims(1);
+ const int input_width = input_shape.Dims(2);
+ const int output_height = output_shape.Dims(1);
+ const int output_width = output_shape.Dims(2);
+ const int stride_height = params.stride_height;
+ const int stride_width = params.stride_width;
+ for (int batch = 0; batch < batches; ++batch)
+ {
+ for (int out_y = 0; out_y < output_height; ++out_y)
+ {
+ for (int out_x = 0; out_x < output_width; ++out_x)
+ {
+ const int in_x_origin = (out_x * stride_width) - params.padding_values.width;
+ const int in_y_origin = (out_y * stride_height) - params.padding_values.height;
+ // Compute the boundaries of the filter region clamped so as to
+ // ensure that the filter window fits in the input array.
+ const int filter_x_start = std::max(0, -in_x_origin);
+ const int filter_x_end = std::min(params.filter_width, input_width - in_x_origin);
+ const int filter_y_start = std::max(0, -in_y_origin);
+ const int filter_y_end = std::min(params.filter_height, input_height - in_y_origin);
+ int filter_count = (filter_y_end - filter_y_start) * (filter_x_end - filter_x_start);
+ if (filter_count <= 0)
+ {
+ continue;
+ }
+ for (int channel = 0; channel < depth; ++channel)
+ {
+ int32_t acc = 0;
+ for (int filter_y = filter_y_start; filter_y < filter_y_end; ++filter_y)
+ {
+ for (int filter_x = filter_x_start; filter_x < filter_x_end; ++filter_x)
+ {
+ const int in_x = in_x_origin + filter_x;
+ const int in_y = in_y_origin + filter_y;
+ acc += input_data[Offset(input_shape, batch, in_y, in_x, channel)];
+ }
+ }
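+ // Adding filter_count / 2 before dividing rounds the accumulated average
+ // to the nearest integer instead of truncating toward zero.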
+ acc = (acc + filter_count / 2) / filter_count;
+ acc = std::max(acc, params.quantized_activation_min);
+ acc = std::min(acc, params.quantized_activation_max);
+ output_data[Offset(output_shape, batch, out_y, out_x, channel)] =
+ static_cast<uint8_t>(acc);
+ }
+ }
+ }
+ }
+}
+
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_AVERAGE_POOL_H__
diff --git a/compute/cker/include/cker/operation/BinaryArithmeticOps.h b/compute/cker/include/cker/operation/BinaryArithmeticOps.h
new file mode 100644
index 000000000..60dd02651
--- /dev/null
+++ b/compute/cker/include/cker/operation/BinaryArithmeticOps.h
@@ -0,0 +1,172 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_BINARY_ARITHMETIC_OPS_H__
+#define __NNFW_CKER_BINARY_ARITHMETIC_OPS_H__
+
+#include <functional>
+#include "cker/Shape.h"
+#include "cker/Types.h"
+#include "cker/Utils.h"
+
+namespace nnfw
+{
+namespace cker
+{
+
+struct BinaryArithmeticOpParam
+{
+ // Shape dependent / common to data / op types.
+ // BroadcastableOpCategory broadcast_category;
+ // uint8 inference params.
+ int32_t input1_offset;
+ int32_t input2_offset;
+ int32_t output_offset;
+ int32_t output_multiplier;
+ int32_t output_shift;
+ // Add / Sub, not Mul, uint8 inference params.
+ int32_t left_shift;
+ int32_t input1_multiplier;
+ int32_t input1_shift;
+ int32_t input2_multiplier;
+ int32_t input2_shift;
+ // uint8, etc, activation params.
+ int32_t quantized_activation_min;
+ int32_t quantized_activation_max;
+ // float activation params.
+ float float_activation_min;
+ float float_activation_max;
+
+ // Processed output dimensions.
+ // Let input "a" be the one that broadcasts in the faster-changing dimension.
+ // Then, after coalescing, for shapes {a0, a1, a2, a3, a4} and
+ // {b0, b1, b2, b3, b4},
+ // broadcast_shape[4] = b0 = a0.
+ // broadcast_shape[3] = b1; a1 = 1.
+ // broadcast_shape[2] = b2 = a2.
+ // broadcast_shape[1] = a3; b3 = 1.
+ // broadcast_shape[0] = b4 = a4.
+ // int broadcast_shape[5];
+};
+
+template <typename T>
+inline void BinaryArithmeticOp(const BinaryArithmeticOpParam &params, const Shape &input1_shape,
+ const T *input1_data, const Shape &input2_shape,
+ const T *input2_data, const Shape &output_shape, T *output_data,
+ const std::function<T(const T &, const T &)> &fn)
+{
+ const int32_t flat_size = MatchingFlatSize(input1_shape, input2_shape, output_shape);
+ for (int i = 0; i < flat_size; ++i)
+ {
+ output_data[i] = ActivationFunctionWithMinMax(fn(input1_data[i], input2_data[i]),
+ params.quantized_activation_min,
+ params.quantized_activation_max);
+ }
+}
+
+template <>
+inline void BinaryArithmeticOp(const BinaryArithmeticOpParam &params, const Shape &input1_shape,
+ const float *input1_data, const Shape &input2_shape,
+ const float *input2_data, const Shape &output_shape,
+ float *output_data,
+ const std::function<float(const float &, const float &)> &fn)
+{
+ const int size = MatchingFlatSize(input1_shape, input2_shape, output_shape);
+ for (int i = 0; i < size; i++)
+ {
+ output_data[i] =
+ ActivationFunctionWithMinMax(fn(input1_data[i], input2_data[i]),
+ params.float_activation_min, params.float_activation_max);
+ }
+}
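+// Typical usage for an element-wise float Add (assuming params carries the
+// activation range):
+//   BinaryArithmeticOp<float>(params, lhs_shape, lhs, rhs_shape, rhs,
+//                             out_shape, out,
+//                             [](const float &a, const float &b) { return a + b; });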
+
+template <typename T>
+inline void BroadcastBinaryArithmeticOpSlow(const BinaryArithmeticOpParam &params,
+ const Shape &input1_shape, const T *input1_data,
+ const Shape &input2_shape, const T *input2_data,
+ const Shape &output_shape, T *output_data,
+ const std::function<T(const T &, const T &)> &fn)
+{
+ NdArrayDesc<4> desc1;
+ NdArrayDesc<4> desc2;
+ NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1, &desc2);
+ const Shape extended_output_shape = Shape::ExtendedShape(4, output_shape);
+
+ // Comment from tensorflow lite:
+ //
+ // In Tensorflow, the dimensions are canonically named (batch_number, row,
+ // col, channel), with extents (batches, height, width, depth), with the
+ // trailing dimension changing most rapidly (channels has the smallest stride,
+ // typically 1 element).
+ //
+ // In generated C code, we store arrays with the dimensions reversed. The
+ // first dimension has smallest stride.
+ //
+ // We name our variables by their Tensorflow convention, but generate C code
+ // nesting loops such that the innermost loop has the smallest stride for the
+ // best cache behavior.
+ for (int b = 0; b < extended_output_shape.Dims(0); ++b)
+ {
+ for (int y = 0; y < extended_output_shape.Dims(1); ++y)
+ {
+ for (int x = 0; x < extended_output_shape.Dims(2); ++x)
+ {
+ for (int c = 0; c < extended_output_shape.Dims(3); ++c)
+ {
+ output_data[Offset(extended_output_shape, b, y, x, c)] = ActivationFunctionWithMinMax(
+ fn(input1_data[SubscriptToIndex(desc1, b, y, x, c)],
+ input2_data[SubscriptToIndex(desc2, b, y, x, c)]),
+ params.quantized_activation_min, params.quantized_activation_max);
+ }
+ }
+ }
+ }
+}
+
+template <>
+inline void BroadcastBinaryArithmeticOpSlow(
+ const BinaryArithmeticOpParam &params, const Shape &input1_shape, const float *input1_data,
+ const Shape &input2_shape, const float *input2_data, const Shape &output_shape,
+ float *output_data, const std::function<float(const float &, const float &)> &fn)
+{
+ NdArrayDesc<4> desc1;
+ NdArrayDesc<4> desc2;
+ NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1, &desc2);
+ const Shape extended_output_shape = Shape::ExtendedShape(4, output_shape);
+
+ for (int b = 0; b < extended_output_shape.Dims(0); ++b)
+ {
+ for (int y = 0; y < extended_output_shape.Dims(1); ++y)
+ {
+ for (int x = 0; x < extended_output_shape.Dims(2); ++x)
+ {
+ for (int c = 0; c < extended_output_shape.Dims(3); ++c)
+ {
+ output_data[Offset(extended_output_shape, b, y, x, c)] = ActivationFunctionWithMinMax(
+ fn(input1_data[SubscriptToIndex(desc1, b, y, x, c)],
+ input2_data[SubscriptToIndex(desc2, b, y, x, c)]),
+ params.float_activation_min, params.float_activation_max);
+ }
+ }
+ }
+ }
+}
+
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_BINARY_ARITHMETIC_OPS_H__
diff --git a/compute/cker/include/cker/operation/Concatenation.h b/compute/cker/include/cker/operation/Concatenation.h
new file mode 100644
index 000000000..69a179c8c
--- /dev/null
+++ b/compute/cker/include/cker/operation/Concatenation.h
@@ -0,0 +1,93 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_CONCATENATION_H__
+#define __NNFW_CKER_CONCATENATION_H__
+
+#include <cassert>
+#include <cstdint>
+#include <cstring>
+
+#include "cker/Shape.h"
+
+namespace nnfw
+{
+namespace cker
+{
+
+struct ConcatenationParams
+{
+ int8_t axis;
+ const int32_t *input_zeropoint;
+ const float *input_scale;
+ uint16_t inputs_count;
+ int32_t output_zeropoint;
+ float output_scale;
+};
+
+template <typename Scalar>
+inline void Concatenation(const ConcatenationParams &params, const Shape *const *input_shapes,
+ const Scalar *const *input_data, const Shape &output_shape,
+ Scalar *output_data)
+{
+ int axis = params.axis;
+ int inputs_count = params.inputs_count;
+ const int concat_dimensions = output_shape.DimensionsCount();
+ assert(axis < concat_dimensions);
+
+ int64_t concat_size = 0;
+ for (int i = 0; i < inputs_count; i++)
+ {
+ assert(input_shapes[i]->DimensionsCount() == concat_dimensions);
+ for (int j = 0; j < concat_dimensions; j++)
+ {
+ if (j != axis)
+ {
+ auto dim_checked = MatchingDim(*input_shapes[i], j, output_shape, j);
+ UNUSED_RELEASE(dim_checked);
+ }
+ }
+ concat_size += input_shapes[i]->Dims(axis);
+ }
+ assert(concat_size == output_shape.Dims(axis));
+ int64_t outer_size = 1;
+ for (int i = 0; i < axis; ++i)
+ {
+ outer_size *= output_shape.Dims(i);
+ }
+ // For all input arrays,
+ // FlatSize() = outer_size * Dims(axis) * base_inner_size;
+ int64_t base_inner_size = 1;
+ for (int i = axis + 1; i < concat_dimensions; ++i)
+ {
+ base_inner_size *= output_shape.Dims(i);
+ }
+
+ Scalar *output_ptr = output_data;
+ for (int k = 0; k < outer_size; k++)
+ {
+ for (int i = 0; i < inputs_count; ++i)
+ {
+ const int copy_size = input_shapes[i]->Dims(axis) * base_inner_size;
+ memcpy(output_ptr, input_data[i] + k * copy_size, copy_size * sizeof(Scalar));
+ output_ptr += copy_size;
+ }
+ }
+}
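+// For example, concatenating two {2, 3} inputs along axis 1 gives
+// outer_size == 2, base_inner_size == 1 and copy_size == 3 per input,
+// producing a {2, 6} output row by row.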
+
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_CONCATENATION_H__
diff --git a/compute/cker/include/cker/operation/Conv.h b/compute/cker/include/cker/operation/Conv.h
new file mode 100644
index 000000000..35b0336fa
--- /dev/null
+++ b/compute/cker/include/cker/operation/Conv.h
@@ -0,0 +1,217 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_CONV_H__
+#define __NNFW_CKER_CONV_H__
+
+#include "cker/Types.h"
+#include "cker/Shape.h"
+#include "cker/Utils.h"
+
+namespace nnfw
+{
+namespace cker
+{
+
+struct ConvParams
+{
+ PaddingType padding_type;
+ PaddingValues padding_values;
+ // TODO(starka): This was just "stride", so check that width+height is OK.
+ int16_t stride_width;
+ int16_t stride_height;
+ int16_t dilation_width_factor;
+ int16_t dilation_height_factor;
+ // uint8_t inference params.
+ // TODO(b/65838351): Use smaller types if appropriate.
+ int32_t input_offset;
+ int32_t weights_offset;
+ int32_t output_offset;
+ int32_t output_multiplier;
+ int output_shift;
+ // uint8_t, etc, activation params.
+ int32_t quantized_activation_min;
+ int32_t quantized_activation_max;
+ // float activation params.
+ float float_activation_min;
+ float float_activation_max;
+};
+
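+// Both Conv kernels below assume NHWC tensors: the input is
+// {batches, height, width, input_depth} and the filter is
+// {output_depth, filter_height, filter_width, input_depth}, as implied by the
+// Offset() calls.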
+inline void Conv(const ConvParams &params, const Shape &input_shape, const float *input_data,
+ const Shape &filter_shape, const float *filter_data, const Shape &bias_shape,
+ const float *bias_data, const Shape &output_shape, float *output_data)
+{
+ const int stride_width = params.stride_width;
+ const int stride_height = params.stride_height;
+ const int dilation_width_factor = params.dilation_width_factor;
+ const int dilation_height_factor = params.dilation_height_factor;
+ const int pad_width = params.padding_values.width;
+ const int pad_height = params.padding_values.height;
+ const float output_activation_min = params.float_activation_min;
+ const float output_activation_max = params.float_activation_max;
+ assert(input_shape.DimensionsCount() == 4);
+ assert(filter_shape.DimensionsCount() == 4);
+ assert(output_shape.DimensionsCount() == 4);
+ UNUSED_RELEASE(bias_shape);
+
+ const int batches = MatchingDim(input_shape, 0, output_shape, 0);
+ const int input_depth = MatchingDim(input_shape, 3, filter_shape, 3);
+ const int output_depth = MatchingDim(filter_shape, 0, output_shape, 3);
+ if (bias_data)
+ {
+ assert(bias_shape.FlatSize() == output_depth);
+ }
+ const int input_height = input_shape.Dims(1);
+ const int input_width = input_shape.Dims(2);
+ const int filter_height = filter_shape.Dims(1);
+ const int filter_width = filter_shape.Dims(2);
+ const int output_height = output_shape.Dims(1);
+ const int output_width = output_shape.Dims(2);
+ for (int batch = 0; batch < batches; ++batch)
+ {
+ for (int out_y = 0; out_y < output_height; ++out_y)
+ {
+ for (int out_x = 0; out_x < output_width; ++out_x)
+ {
+ for (int out_channel = 0; out_channel < output_depth; ++out_channel)
+ {
+ const int in_x_origin = (out_x * stride_width) - pad_width;
+ const int in_y_origin = (out_y * stride_height) - pad_height;
+ float total = 0.f;
+ for (int filter_y = 0; filter_y < filter_height; ++filter_y)
+ {
+ for (int filter_x = 0; filter_x < filter_width; ++filter_x)
+ {
+ const int in_x = in_x_origin + dilation_width_factor * filter_x;
+ const int in_y = in_y_origin + dilation_height_factor * filter_y;
+ // If the location is outside the bounds of the input image,
+ // use zero as a default value.
+ if ((in_x >= 0) && (in_x < input_width) && (in_y >= 0) && (in_y < input_height))
+ {
+ const int in_offset = Offset(input_shape, batch, in_y, in_x, 0);
+ const int filter_offset = Offset(filter_shape, out_channel, filter_y, filter_x, 0);
+ for (int in_channel = 0; in_channel < input_depth; ++in_channel)
+ {
+ float input_value = input_data[in_offset + in_channel];
+ float filter_value = filter_data[filter_offset + in_channel];
+ total += (input_value * filter_value);
+ }
+ }
+ }
+ }
+ float bias_value = 0.0f;
+ if (bias_data)
+ {
+ bias_value = bias_data[out_channel];
+ }
+ output_data[Offset(output_shape, batch, out_y, out_x, out_channel)] =
+ ActivationFunctionWithMinMax(total + bias_value, output_activation_min,
+ output_activation_max);
+ }
+ }
+ }
+ }
+}
+
+inline void Conv(const ConvParams &params, const Shape &input_shape, const uint8_t *input_data,
+ const Shape &filter_shape, const uint8_t *filter_data, const Shape &bias_shape,
+ const int32_t *bias_data, const Shape &output_shape, uint8_t *output_data)
+{
+ const int stride_width = params.stride_width;
+ const int stride_height = params.stride_height;
+ const int dilation_width_factor = params.dilation_width_factor;
+ const int dilation_height_factor = params.dilation_height_factor;
+ const int pad_width = params.padding_values.width;
+ const int pad_height = params.padding_values.height;
+ const int32_t input_offset = params.input_offset;
+ const int32_t filter_offset = params.weights_offset;
+ const int32_t output_offset = params.output_offset;
+ const int32_t output_multiplier = params.output_multiplier;
+ const int output_shift = params.output_shift;
+ const int32_t output_activation_min = params.quantized_activation_min;
+ const int32_t output_activation_max = params.quantized_activation_max;
+ assert(output_activation_min <= output_activation_max);
+
+ assert(input_shape.DimensionsCount() == 4);
+ assert(filter_shape.DimensionsCount() == 4);
+ assert(output_shape.DimensionsCount() == 4);
+ UNUSED_RELEASE(bias_shape);
+ const int batches = MatchingDim(input_shape, 0, output_shape, 0);
+ const int input_depth = MatchingDim(input_shape, 3, filter_shape, 3);
+ const int output_depth = MatchingDim(filter_shape, 0, output_shape, 3);
+ if (bias_data)
+ {
+ assert(bias_shape.FlatSize() == output_depth);
+ }
+ const int input_height = input_shape.Dims(1);
+ const int input_width = input_shape.Dims(2);
+ const int filter_height = filter_shape.Dims(1);
+ const int filter_width = filter_shape.Dims(2);
+ const int output_height = output_shape.Dims(1);
+ const int output_width = output_shape.Dims(2);
+ for (int batch = 0; batch < batches; ++batch)
+ {
+ for (int out_y = 0; out_y < output_height; ++out_y)
+ {
+ for (int out_x = 0; out_x < output_width; ++out_x)
+ {
+ for (int out_channel = 0; out_channel < output_depth; ++out_channel)
+ {
+ const int in_x_origin = (out_x * stride_width) - pad_width;
+ const int in_y_origin = (out_y * stride_height) - pad_height;
+ int32_t acc = 0;
+ for (int filter_y = 0; filter_y < filter_height; ++filter_y)
+ {
+ for (int filter_x = 0; filter_x < filter_width; ++filter_x)
+ {
+ const int in_x = in_x_origin + dilation_width_factor * filter_x;
+ const int in_y = in_y_origin + dilation_height_factor * filter_y;
+ // If the location is outside the bounds of the input image,
+ // use zero as a default value.
+ const int in_base = Offset(input_shape, batch, in_y, in_x, 0);
+ const int filter_base = Offset(filter_shape, out_channel, filter_y, filter_x, 0);
+ if ((in_x >= 0) && (in_x < input_width) && (in_y >= 0) && (in_y < input_height))
+ {
+ for (int in_channel = 0; in_channel < input_depth; in_channel++)
+ {
+ int32_t input_val = input_data[in_channel + in_base];
+ int32_t filter_val = filter_data[in_channel + filter_base];
+ acc += (filter_val + filter_offset) * (input_val + input_offset);
+ }
+ }
+ }
+ }
+ if (bias_data)
+ {
+ acc += bias_data[out_channel];
+ }
+ acc = MultiplyByQuantizedMultiplier(acc, output_multiplier, output_shift);
+ acc += output_offset;
+ acc = std::max(acc, output_activation_min);
+ acc = std::min(acc, output_activation_max);
+ output_data[Offset(output_shape, batch, out_y, out_x, out_channel)] =
+ static_cast<uint8_t>(acc);
+ }
+ }
+ }
+ }
+}
+
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_CONV_H__
diff --git a/compute/cker/include/cker/operation/DepthwiseConv.h b/compute/cker/include/cker/operation/DepthwiseConv.h
new file mode 100644
index 000000000..7d022477d
--- /dev/null
+++ b/compute/cker/include/cker/operation/DepthwiseConv.h
@@ -0,0 +1,217 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_DEPTHWISE_CONV_H__
+#define __NNFW_CKER_DEPTHWISE_CONV_H__
+
+#include "cker/Shape.h"
+#include "cker/Types.h"
+#include "cker/Utils.h"
+
+namespace nnfw
+{
+namespace cker
+{
+
+struct DepthwiseConvParams
+{
+ PaddingType padding_type;
+ PaddingValues padding_values;
+ int16_t stride_width;
+ int16_t stride_height;
+ int16_t dilation_width_factor;
+ int16_t dilation_height_factor;
+ int16_t depth_multiplier;
+ // uint8 inference params.
+ // TODO(b/65838351): Use smaller types if appropriate.
+ int32_t input_offset;
+ int32_t weights_offset;
+ int32_t output_offset;
+ int32_t output_multiplier;
+ int output_shift;
+ // uint8, etc, activation params.
+ int32_t quantized_activation_min;
+ int32_t quantized_activation_max;
+ // float activation params.
+ float float_activation_min;
+ float float_activation_max;
+};
+
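+// The depthwise kernels below assume a {1, filter_height, filter_width,
+// output_depth} filter, where output channel oc == ic * depth_multiplier + m
+// pairs input channel ic with its m-th multiplier slice.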
+inline void DepthwiseConv(const DepthwiseConvParams &params, const Shape &input_shape,
+ const uint8_t *input_data, const Shape &filter_shape,
+ const uint8_t *filter_data, const Shape &bias_shape,
+ const int32_t *bias_data, const Shape &output_shape, uint8_t *output_data)
+{
+ const int stride_width = params.stride_width;
+ const int stride_height = params.stride_height;
+ const int dilation_width_factor = params.dilation_width_factor;
+ const int dilation_height_factor = params.dilation_height_factor;
+ const int pad_width = params.padding_values.width;
+ const int pad_height = params.padding_values.height;
+ const int depth_multiplier = params.depth_multiplier;
+ const int32_t output_activation_min = params.quantized_activation_min;
+ const int32_t output_activation_max = params.quantized_activation_max;
+ const int32_t input_offset = params.input_offset;
+ const int32_t filter_offset = params.weights_offset;
+ const int32_t output_offset = params.output_offset;
+ const int32_t output_multiplier = params.output_multiplier;
+ const int output_shift = params.output_shift;
+ assert(input_shape.DimensionsCount() == 4);
+ assert(filter_shape.DimensionsCount() == 4);
+ assert(output_shape.DimensionsCount() == 4);
+
+ assert(output_activation_min <= output_activation_max);
+ const int batches = MatchingDim(input_shape, 0, output_shape, 0);
+ const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3);
+ const int input_height = input_shape.Dims(1);
+ const int input_width = input_shape.Dims(2);
+ const int input_depth = input_shape.Dims(3);
+ const int filter_height = filter_shape.Dims(1);
+ const int filter_width = filter_shape.Dims(2);
+ const int output_height = output_shape.Dims(1);
+ const int output_width = output_shape.Dims(2);
+ assert(output_depth == input_depth * depth_multiplier);
+ assert(bias_shape.FlatSize() == output_depth);
+ UNUSED_RELEASE(output_depth);
+ UNUSED_RELEASE(bias_shape);
+
+ for (int b = 0; b < batches; ++b)
+ {
+ for (int out_y = 0; out_y < output_height; ++out_y)
+ {
+ for (int out_x = 0; out_x < output_width; ++out_x)
+ {
+ for (int ic = 0; ic < input_depth; ++ic)
+ {
+ for (int m = 0; m < depth_multiplier; m++)
+ {
+ const int oc = m + ic * depth_multiplier;
+ const int in_x_origin = (out_x * stride_width) - pad_width;
+ const int in_y_origin = (out_y * stride_height) - pad_height;
+ int32_t acc = 0;
+ for (int filter_y = 0; filter_y < filter_height; ++filter_y)
+ {
+ for (int filter_x = 0; filter_x < filter_width; ++filter_x)
+ {
+ const int in_x = in_x_origin + dilation_width_factor * filter_x;
+ const int in_y = in_y_origin + dilation_height_factor * filter_y;
+ // If the location is outside the bounds of the input image,
+ // use zero as a default value.
+ if ((in_x >= 0) && (in_x < input_width) && (in_y >= 0) && (in_y < input_height))
+ {
+ int32_t input_val = input_data[Offset(input_shape, b, in_y, in_x, ic)];
+ int32_t filter_val = filter_data[Offset(filter_shape, 0, filter_y, filter_x, oc)];
+ acc += (filter_val + filter_offset) * (input_val + input_offset);
+ }
+ }
+ }
+ if (bias_data)
+ {
+ acc += bias_data[oc];
+ }
+ acc = MultiplyByQuantizedMultiplier(acc, output_multiplier, output_shift);
+ acc += output_offset;
+ acc = std::max(acc, output_activation_min);
+ acc = std::min(acc, output_activation_max);
+ output_data[Offset(output_shape, b, out_y, out_x, oc)] = static_cast<uint8_t>(acc);
+ }
+ }
+ }
+ }
+ }
+}
+
+inline void DepthwiseConv(const DepthwiseConvParams &params, const Shape &input_shape,
+ const float *input_data, const Shape &filter_shape,
+ const float *filter_data, const Shape &bias_shape, const float *bias_data,
+ const Shape &output_shape, float *output_data)
+{
+ const int stride_width = params.stride_width;
+ const int stride_height = params.stride_height;
+ const int dilation_width_factor = params.dilation_width_factor;
+ const int dilation_height_factor = params.dilation_height_factor;
+ const int pad_width = params.padding_values.width;
+ const int pad_height = params.padding_values.height;
+ const int depth_multiplier = params.depth_multiplier;
+ const float output_activation_min = params.float_activation_min;
+ const float output_activation_max = params.float_activation_max;
+ assert(input_shape.DimensionsCount() == 4);
+ assert(filter_shape.DimensionsCount() == 4);
+ assert(output_shape.DimensionsCount() == 4);
+
+ const int batches = MatchingDim(input_shape, 0, output_shape, 0);
+ const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3);
+ const int input_height = input_shape.Dims(1);
+ const int input_width = input_shape.Dims(2);
+ const int input_depth = input_shape.Dims(3);
+ const int filter_height = filter_shape.Dims(1);
+ const int filter_width = filter_shape.Dims(2);
+ const int output_height = output_shape.Dims(1);
+ const int output_width = output_shape.Dims(2);
+ assert(output_depth == input_depth * depth_multiplier);
+ assert(bias_shape.FlatSize() == output_depth);
+ UNUSED_RELEASE(output_depth);
+ UNUSED_RELEASE(bias_shape);
+
+ for (int b = 0; b < batches; ++b)
+ {
+ for (int out_y = 0; out_y < output_height; ++out_y)
+ {
+ for (int out_x = 0; out_x < output_width; ++out_x)
+ {
+ for (int ic = 0; ic < input_depth; ++ic)
+ {
+ for (int m = 0; m < depth_multiplier; m++)
+ {
+ const int oc = m + ic * depth_multiplier;
+ const int in_x_origin = (out_x * stride_width) - pad_width;
+ const int in_y_origin = (out_y * stride_height) - pad_height;
+ float total = 0.f;
+ for (int filter_y = 0; filter_y < filter_height; ++filter_y)
+ {
+ for (int filter_x = 0; filter_x < filter_width; ++filter_x)
+ {
+ const int in_x = in_x_origin + dilation_width_factor * filter_x;
+ const int in_y = in_y_origin + dilation_height_factor * filter_y;
+ // If the location is outside the bounds of the input image,
+ // use zero as a default value.
+ if ((in_x >= 0) && (in_x < input_width) && (in_y >= 0) && (in_y < input_height))
+ {
+ float input_value = input_data[Offset(input_shape, b, in_y, in_x, ic)];
+ float filter_value = filter_data[Offset(filter_shape, 0, filter_y, filter_x, oc)];
+ total += (input_value * filter_value);
+ }
+ }
+ }
+ float bias_value = 0.0f;
+ if (bias_data)
+ {
+ bias_value = bias_data[oc];
+ }
+ output_data[Offset(output_shape, b, out_y, out_x, oc)] = ActivationFunctionWithMinMax(
+ total + bias_value, output_activation_min, output_activation_max);
+ }
+ }
+ }
+ }
+ }
+}
+
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_DEPTHWISE_CONV_H__
diff --git a/compute/cker/include/cker/operation/FullyConnected.h b/compute/cker/include/cker/operation/FullyConnected.h
new file mode 100644
index 000000000..428fb1b53
--- /dev/null
+++ b/compute/cker/include/cker/operation/FullyConnected.h
@@ -0,0 +1,144 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_FULLY_CONNECTED_H__
+#define __NNFW_CKER_FULLY_CONNECTED_H__
+
+#include "cker/Shape.h"
+#include "cker/Utils.h"
+
+namespace nnfw
+{
+namespace cker
+{
+
+struct FullyConnectedParams
+{
+ // uint8 inference params.
+ // TODO(b/65838351): Use smaller types if appropriate.
+ int32_t input_offset;
+ int32_t weights_offset;
+ int32_t output_offset;
+ int32_t output_multiplier;
+ int output_shift;
+ // uint8, etc, activation params.
+ int32_t quantized_activation_min;
+ int32_t quantized_activation_max;
+ // float activation params.
+ float float_activation_min;
+ float float_activation_max;
+ // FullyConnectedWeightsFormat weights_format;
+};
+
+inline void FullyConnected(const FullyConnectedParams &params, const Shape &input_shape,
+ const float *input_data, const Shape &weights_shape,
+ const float *weights_data, const Shape &bias_shape,
+ const float *bias_data, const Shape &output_shape, float *output_data)
+{
+ UNUSED_RELEASE(input_shape);
+ UNUSED_RELEASE(bias_shape);
+ const float output_activation_min = params.float_activation_min;
+ const float output_activation_max = params.float_activation_max;
+ // TODO(benoitjacob): This really should be:
+ // const int batches = ArraySize(output_dims, 1);
+ // but the current --variable_batch hack consists in overwriting the 3rd
+ // dimension with the runtime batch size, as we don't keep track for each
+ // array of which dimension is the batch dimension in it.
+ const int output_dims_count = output_shape.DimensionsCount();
+ const int weights_dims_count = weights_shape.DimensionsCount();
+ const int batches = FlatSizeSkipDim(output_shape, output_dims_count - 1);
+ const int output_depth =
+ MatchingDim(weights_shape, weights_dims_count - 2, output_shape, output_dims_count - 1);
+ const int accum_depth = weights_shape.Dims(weights_dims_count - 1);
+ for (int b = 0; b < batches; ++b)
+ {
+ for (int out_c = 0; out_c < output_depth; ++out_c)
+ {
+ float total = 0.f;
+ for (int d = 0; d < accum_depth; ++d)
+ {
+ total += input_data[b * accum_depth + d] * weights_data[out_c * accum_depth + d];
+ }
+ float bias_value = 0.0f;
+ if (bias_data)
+ {
+ bias_value = bias_data[out_c];
+ }
+ output_data[out_c + output_depth * b] = ActivationFunctionWithMinMax(
+ total + bias_value, output_activation_min, output_activation_max);
+ }
+ }
+}
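+
+// A minimal usage sketch for the float overload above; the shapes, buffers and the unbounded
+// activation range are illustrative, not prescribed by this header:
+//   FullyConnectedParams params{};
+//   params.float_activation_min = std::numeric_limits<float>::lowest();
+//   params.float_activation_max = std::numeric_limits<float>::max();
+//   Shape input_shape{1, 4};    // batch = 1, accum_depth = 4
+//   Shape weights_shape{3, 4};  // output_depth = 3, accum_depth = 4
+//   Shape bias_shape{3};
+//   Shape output_shape{1, 3};
+//   FullyConnected(params, input_shape, input_data, weights_shape, weights_data, bias_shape,
+//                  bias_data, output_shape, output_data);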
+
+inline void FullyConnected(const FullyConnectedParams &params, const Shape &input_shape,
+ const uint8_t *input_data, const Shape &filter_shape,
+ const uint8_t *filter_data, const Shape &bias_shape,
+ const int32_t *bias_data, const Shape &output_shape,
+ uint8_t *output_data)
+{
+ UNUSED_RELEASE(input_shape);
+ UNUSED_RELEASE(bias_shape);
+ const int32_t input_offset = params.input_offset;
+ const int32_t filter_offset = params.weights_offset;
+ const int32_t output_offset = params.output_offset;
+ const int32_t output_multiplier = params.output_multiplier;
+ const int output_shift = params.output_shift;
+ const int32_t output_activation_min = params.quantized_activation_min;
+ const int32_t output_activation_max = params.quantized_activation_max;
+ assert(filter_shape.DimensionsCount() >= 2);
+ assert(output_shape.DimensionsCount() >= 1);
+
+ assert(output_activation_min <= output_activation_max);
+ // TODO(benoitjacob): This really should be:
+ // const int batches = ArraySize(output_dims, 1);
+ // but the current --variable_batch hack consists in overwriting the 3rd
+ // dimension with the runtime batch size, as we don't keep track for each
+ // array of which dimension is the batch dimension in it.
+ const int output_dim_count = output_shape.DimensionsCount();
+ const int filter_dim_count = filter_shape.DimensionsCount();
+ const int batches = FlatSizeSkipDim(output_shape, output_dim_count - 1);
+ const int output_depth =
+ MatchingDim(filter_shape, filter_dim_count - 2, output_shape, output_dim_count - 1);
+ const int accum_depth = filter_shape.Dims(filter_dim_count - 1);
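+  // Quantized arithmetic: the int32 accumulator sums (filter + filter_offset) *
+  // (input + input_offset); the sum is then rescaled to the output scale with
+  // MultiplyByQuantizedMultiplier, shifted by output_offset and clamped to the uint8
+  // activation range.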
+ for (int b = 0; b < batches; ++b)
+ {
+ for (int out_c = 0; out_c < output_depth; ++out_c)
+ {
+ int32_t acc = 0;
+ for (int d = 0; d < accum_depth; ++d)
+ {
+ int32_t input_val = input_data[b * accum_depth + d];
+ int32_t filter_val = filter_data[out_c * accum_depth + d];
+ acc += (filter_val + filter_offset) * (input_val + input_offset);
+ }
+ if (bias_data)
+ {
+ acc += bias_data[out_c];
+ }
+ acc = MultiplyByQuantizedMultiplier(acc, output_multiplier, output_shift);
+ acc += output_offset;
+ acc = std::max(acc, output_activation_min);
+ acc = std::min(acc, output_activation_max);
+ output_data[out_c + output_depth * b] = static_cast<uint8_t>(acc);
+ }
+ }
+}
+
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_FULLY_CONNECTED_H__
diff --git a/compute/cker/include/cker/operation/Gather.h b/compute/cker/include/cker/operation/Gather.h
new file mode 100644
index 000000000..9cd96eeb7
--- /dev/null
+++ b/compute/cker/include/cker/operation/Gather.h
@@ -0,0 +1,78 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_GATHER_H__
+#define __NNFW_CKER_GATHER_H__
+
+#include "cker/Shape.h"
+#include "cker/Types.h"
+#include "cker/Utils.h"
+
+namespace nnfw
+{
+namespace cker
+{
+
+struct GatherParams
+{
+ int32_t axis;
+};
+
+template <typename T, typename CoordsT = int32_t>
+inline void Gather(const GatherParams &op_params, const Shape &input_shape, const T *input_data,
+ const Shape &coords_shape, const CoordsT *coords_data, const Shape &,
+ T *output_data)
+{
+ int axis = op_params.axis;
+ if (axis < 0)
+ {
+ axis += input_shape.DimensionsCount();
+ }
+ assert(axis >= 0);
+ assert(axis < input_shape.DimensionsCount());
+ const int axis_size = input_shape.Dims(axis);
+ const int coords_count = coords_shape.FlatSize();
+
+ int outer_size = 1;
+ for (int i = 0; i < axis; ++i)
+ {
+ outer_size *= input_shape.Dims(i);
+ }
+
+ int inner_size = 1;
+ for (int i = axis + 1; i < input_shape.DimensionsCount(); ++i)
+ {
+ inner_size *= input_shape.Dims(i);
+ }
+
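+  // The input is viewed as [outer_size, axis_size, inner_size] and, for every outer index, the
+  // slices selected by coords are copied. For example, an input of shape [2, 3, 4] with axis = 1
+  // and coords = {2, 0} produces an output of shape [2, 2, 4].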
+ for (int outer = 0; outer < outer_size; ++outer)
+ {
+ for (int i = 0; i < coords_count; ++i)
+ {
+ assert(coords_data[i] >= 0);
+ assert(coords_data[i] < axis_size);
+ std::memcpy(output_data + (outer * coords_count + i) * inner_size,
+ input_data + (outer * axis_size + coords_data[i]) * inner_size,
+ sizeof(T) * inner_size);
+ }
+ }
+}
+
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_GATHER_H__
diff --git a/compute/cker/include/cker/operation/InstanceNorm.h b/compute/cker/include/cker/operation/InstanceNorm.h
new file mode 100644
index 000000000..794dcebc8
--- /dev/null
+++ b/compute/cker/include/cker/operation/InstanceNorm.h
@@ -0,0 +1,99 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_INSTANCE_NORM_H__
+#define __NNFW_CKER_INSTANCE_NORM_H__
+
+#include "cker/Shape.h"
+#include "cker/Types.h"
+#include "cker/Utils.h"
+
+#include <cmath>
+
+namespace nnfw
+{
+namespace cker
+{
+
+struct InstanceNormParams
+{
+ float epsilon;
+ float float_activation_min;
+ float float_activation_max;
+};
+
+inline void InstanceNorm(const InstanceNormParams &params, const Shape &input_shape,
+ const float *input_data, const Shape &gamma_shape, const float *gamma_data,
+ const Shape &beta_shape, const float *beta_data, const Shape &output_shape,
+ float *output_data)
+{
+ const int32_t batches = MatchingDim(input_shape, 0, output_shape, 0);
+ const int32_t heights = MatchingDim(input_shape, 1, output_shape, 1);
+ const int32_t widths = MatchingDim(input_shape, 2, output_shape, 2);
+ const int32_t channels = MatchingDim(input_shape, 3, output_shape, 3);
+ const float output_activation_min = params.float_activation_min;
+ const float output_activation_max = params.float_activation_max;
+
+ UNUSED_RELEASE(gamma_shape);
+ UNUSED_RELEASE(beta_shape);
+ assert(output_activation_min <= output_activation_max);
+
+ for (int32_t batch = 0; batch < batches; batch++)
+ {
+ for (int32_t channel = 0; channel < channels; channel++)
+ {
+      double sum = 0.0;
+      double square_sum = 0.0;
+ int32_t size = heights * widths;
+
+ for (int32_t height = 0; height < heights; height++)
+ {
+ for (int32_t width = 0; width < widths; width++)
+ {
+ double input_val = input_data[Offset(input_shape, batch, height, width, channel)];
+ sum += input_val;
+ square_sum += (input_val * input_val);
+ }
+ }
+
+ double mean = sum / size;
+ double var = square_sum / size - mean * mean;
+
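+      // Instance normalization computes gamma * (x - mean) / sqrt(var + epsilon) + beta,
+      // rewritten below as a * x + b with a = gamma / sqrt(var + epsilon) and b = beta - mean * a.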
+ double gamma = gamma_data[channel];
+ double beta = beta_data[channel];
+
+ double a = gamma / (std::sqrt(var + params.epsilon));
+ double b = -mean * a + beta;
+
+ for (int32_t height = 0; height < heights; height++)
+ {
+ for (int32_t width = 0; width < widths; width++)
+ {
+          double input_value = input_data[Offset(input_shape, batch, height, width, channel)];
+          double output_value = input_value * a + b;
+          output_data[Offset(output_shape, batch, height, width, channel)] =
+            ActivationFunctionWithMinMax(static_cast<float>(output_value), output_activation_min,
+                                         output_activation_max);
+ }
+ }
+ }
+ }
+}
+
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_INSTANCE_NORM_H__
diff --git a/compute/cker/include/cker/operation/Logistic.h b/compute/cker/include/cker/operation/Logistic.h
new file mode 100644
index 000000000..872095531
--- /dev/null
+++ b/compute/cker/include/cker/operation/Logistic.h
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_LOGISTIC_H__
+#define __NNFW_CKER_LOGISTIC_H__
+
+#include "cker/Shape.h"
+
+#include <cmath>
+
+namespace nnfw
+{
+namespace cker
+{
+
+inline void Logistic(const Shape &input_shape, const float *input_data, const Shape &output_shape,
+ float *output_data)
+{
+  // Note: since 1 / (1 + exp(-x)) == (1 + tanh(x / 2)) / 2, this could also be computed as
+  // (1/2) + (1/2) * tanh(x/2).
+ const int size = MatchingFlatSize(input_shape, output_shape);
+ for (int i = 0; i < size; i++)
+ {
+ output_data[i] = 1.f / (1.f + std::exp(-input_data[i]));
+ }
+}
+
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_LOGISTIC_H__
diff --git a/compute/cker/include/cker/operation/MaxPool.h b/compute/cker/include/cker/operation/MaxPool.h
new file mode 100644
index 000000000..326168b99
--- /dev/null
+++ b/compute/cker/include/cker/operation/MaxPool.h
@@ -0,0 +1,98 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_MAX_POOL_H__
+#define __NNFW_CKER_MAX_POOL_H__
+
+#include "cker/Shape.h"
+#include "cker/Types.h"
+#include "cker/Utils.h"
+
+#include "cker/operation/optimized/MaxPool.h"
+#include "cker/operation/reference/MaxPool.h"
+
+namespace nnfw
+{
+namespace cker
+{
+
+inline void MaxPool(const PoolParams &params, const Shape &input_shape, const float *input_data,
+ const Shape &output_shape, float *output_data)
+{
+#if defined(CKER_OPTIMIZED_EIGEN)
+ optimized::MaxPool(params, input_shape, input_data, output_shape, output_data);
+#else // defined(CKER_OPTIMIZED_EIGEN)
+ reference::MaxPool(params, input_shape, input_data, output_shape, output_data);
+#endif // defined(CKER_OPTIMIZED_EIGEN)
+}
+
+inline void MaxPool(const PoolParams &params, const Shape &input_shape, const uint8_t *input_data,
+ const Shape &output_shape, uint8_t *output_data)
+{
+ assert(params.quantized_activation_min <= params.quantized_activation_max);
+ assert(params.quantized_activation_min >= 0);
+ assert(params.quantized_activation_max <= 255);
+ assert(input_shape.DimensionsCount() == 4);
+ assert(output_shape.DimensionsCount() == 4);
+ const int batches = MatchingDim(input_shape, 0, output_shape, 0);
+ const int depth = MatchingDim(input_shape, 3, output_shape, 3);
+ const int input_height = input_shape.Dims(1);
+ const int input_width = input_shape.Dims(2);
+ const int output_height = output_shape.Dims(1);
+ const int output_width = output_shape.Dims(2);
+ const int stride_height = params.stride_height;
+ const int stride_width = params.stride_width;
+ for (int batch = 0; batch < batches; ++batch)
+ {
+ for (int out_y = 0; out_y < output_height; ++out_y)
+ {
+ for (int out_x = 0; out_x < output_width; ++out_x)
+ {
+ for (int channel = 0; channel < depth; ++channel)
+ {
+ const int in_x_origin = (out_x * stride_width) - params.padding_values.width;
+ const int in_y_origin = (out_y * stride_height) - params.padding_values.height;
+ // Compute the boundaries of the filter region clamped so as to
+ // ensure that the filter window fits in the input array.
+ const int filter_x_start = std::max(0, -in_x_origin);
+ const int filter_x_end = std::min(params.filter_width, input_width - in_x_origin);
+ const int filter_y_start = std::max(0, -in_y_origin);
+ const int filter_y_end = std::min(params.filter_height, input_height - in_y_origin);
+ uint8_t max = 0;
+ for (int filter_y = filter_y_start; filter_y < filter_y_end; ++filter_y)
+ {
+ for (int filter_x = filter_x_start; filter_x < filter_x_end; ++filter_x)
+ {
+ const int in_x = in_x_origin + filter_x;
+ const int in_y = in_y_origin + filter_y;
+ max = std::max(max, input_data[Offset(input_shape, batch, in_y, in_x, channel)]);
+ }
+ }
+ max = std::max<uint8_t>(max, params.quantized_activation_min);
+ max = std::min<uint8_t>(max, params.quantized_activation_max);
+ output_data[Offset(output_shape, batch, out_y, out_x, channel)] =
+ static_cast<uint8_t>(max);
+ }
+ }
+ }
+ }
+}
+
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_MAX_POOL_H__
diff --git a/compute/cker/include/cker/operation/Pad.h b/compute/cker/include/cker/operation/Pad.h
new file mode 100644
index 000000000..af432f3a8
--- /dev/null
+++ b/compute/cker/include/cker/operation/Pad.h
@@ -0,0 +1,224 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_PAD_H__
+#define __NNFW_CKER_PAD_H__
+
+#include "cker/Shape.h"
+#include "cker/Types.h"
+#include "cker/Utils.h"
+#include <algorithm>
+#include <cassert>
+#include <cstring>
+#include <stdexcept>
+#include <utility>
+#include <vector>
+
+namespace nnfw
+{
+namespace cker
+{
+inline void Pad(const int32_t *padding_data, int32_t pad_rank, const Shape &input_shape,
+ const float *input_data, const Shape &output_shape, float *output_data,
+ const float *constant_value_data)
+{
+  // Note: this is Pad with mode=`CONSTANT`; the `REFLECT` and `SYMMETRIC` modes are not supported
+  // TODO: Come up with a more elaborate solution that uses subtensors, as ARM Compute does
+  // TODO: Check whether this works for all layouts
+
+ using PaddingInfo = std::pair<int32_t, int32_t>;
+ /** List of padding information */
+ using PaddingList = std::vector<PaddingInfo>;
+
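+  // padding_data holds pad_rank {before, after} pairs, one per dimension; e.g. for a rank-2 (HW)
+  // input it is {top, bottom, left, right}.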
+ auto constant_value = constant_value_data ? *constant_value_data : 0;
+ assert(output_shape.DimensionsCount() == input_shape.DimensionsCount());
+
+ PaddingList padding_list(pad_rank);
+ for (int32_t n = 0; n < pad_rank; ++n)
+ {
+ const int32_t *from = padding_data + (n * 2);
+ padding_list[n] = {from[0], from[1]};
+ }
+ for (int32_t i = 0; i < pad_rank; ++i)
+ {
+ assert(output_shape.Dims(i) ==
+ input_shape.Dims(i) + padding_list[i].first + padding_list[i].second);
+ }
+  /* Switch on pad_rank rather than the shape rank, because the given input/output shapes are
+     expanded to 4D before any cker function is called:
+     1. using pad_rank prevents out-of-bounds access on padding_list;
+     2. handling the data as 2D/3D is faster than always treating it as 4D.
+  */
+ switch (pad_rank)
+ {
+ case 0:
+ case 1:
+ {
+ const int32_t in_row_len = input_shape.Dims(0);
+ std::fill_n(output_data, padding_list[0].first, constant_value);
+ std::memcpy(output_data + padding_list[0].first, input_data, in_row_len * sizeof(float));
+ std::fill_n(output_data + padding_list[0].first + in_row_len, padding_list[0].second,
+ constant_value);
+ break;
+ }
+ case 2: // HW
+ {
+ const int32_t in_row_len = input_shape.Dims(1);
+ const int32_t out_row_size = output_shape.Dims(1);
+
+ // prepend padding rows
+ std::fill_n(output_data, padding_list[0].first * out_row_size, constant_value);
+
+ const auto r_h_inp_lim = input_shape.Dims(0) + padding_list[0].first;
+ for (auto i = padding_list[0].first, j = 0; i < r_h_inp_lim; ++i, ++j)
+ {
+ auto out_offset = i * out_row_size;
+ const auto in_offset = j * in_row_len;
+
+ // prepend padding values
+ std::fill_n(output_data + out_offset, padding_list[1].first, constant_value);
+
+ out_offset += padding_list[1].first;
+
+ // copy a row of input data
+ memcpy(output_data + out_offset, input_data + in_offset, in_row_len * sizeof(float));
+
+ out_offset += in_row_len;
+
+ // append padding values
+ std::fill_n(output_data + out_offset, padding_list[1].second, constant_value);
+ }
+
+ // append padding rows
+ std::fill_n(output_data + r_h_inp_lim * out_row_size, padding_list[0].second * out_row_size,
+ constant_value);
+ break;
+ }
+ case 3: // HWC
+ {
+ const int32_t in_row_len = input_shape.Dims(2);
+ const int32_t out_row_size = output_shape.Dims(2);
+ const auto plain_size = out_row_size * output_shape.Dims(1);
+
+ // prepend padding plains
+ std::fill_n(output_data, padding_list[0].first * plain_size, constant_value);
+
+ const auto r_h_inp_lim = input_shape.Dims(0) + padding_list[0].first;
+ for (auto i = padding_list[0].first, i_inp = 0; i < r_h_inp_lim; ++i, ++i_inp)
+ {
+ const auto out_w_offset = (i * output_shape.Dims(1) + 0) * output_shape.Dims(2);
+
+ // prepend padding rows
+ std::fill_n(output_data + out_w_offset, padding_list[1].first * out_row_size,
+ constant_value);
+
+ const auto r_w_inp_lim = input_shape.Dims(1) + padding_list[1].first;
+ for (auto j = padding_list[1].first, j_inp = 0; j < r_w_inp_lim; ++j, ++j_inp)
+ {
+ auto out_offset = (i * output_shape.Dims(1) + j) * output_shape.Dims(2);
+ const auto in_offset = (i_inp * input_shape.Dims(1) + j_inp) * input_shape.Dims(2);
+
+ // prepend padding values
+ std::fill_n(output_data + out_offset, padding_list[2].first, constant_value);
+
+ out_offset += padding_list[2].first;
+
+ // copy a row of input data
+ memcpy(output_data + out_offset, input_data + in_offset, in_row_len * sizeof(float));
+
+ out_offset += in_row_len;
+
+ // append padding values
+ std::fill_n(output_data + out_offset, padding_list[2].second, constant_value);
+ }
+
+ // append padding rows
+ std::fill_n(output_data + out_w_offset + r_w_inp_lim * out_row_size,
+ padding_list[1].second * out_row_size, constant_value);
+ }
+
+ // append padding plains
+ std::fill_n(output_data + r_h_inp_lim * plain_size, padding_list[0].second * plain_size,
+ constant_value);
+ break;
+ }
+ case 4:
+ {
+ auto get_offset = [](const Shape &shape, int32_t n, int32_t h, int32_t w) -> int32_t {
+ return ((n * shape.Dims(1) + h) * shape.Dims(2) + w) * shape.Dims(3);
+ };
+ const int32_t in_row_len = input_shape.Dims(3);
+ const int32_t out_row_size = output_shape.Dims(3);
+ const auto plain_size = out_row_size * output_shape.Dims(2);
+ const auto parallelepiped_size = plain_size * output_shape.Dims(1);
+
+ // prepend padding parallelepipeds
+ std::fill_n(output_data, padding_list[0].first * parallelepiped_size, constant_value);
+
+ const auto r_b_inp_lim = input_shape.Dims(0) + padding_list[0].first;
+ for (auto i = padding_list[0].first, i_inp = 0; i < r_b_inp_lim; ++i, ++i_inp)
+ {
+ const auto out_h_offset = get_offset(output_shape, i, 0, 0);
+ // prepend padding plains
+ std::fill_n(output_data + out_h_offset, padding_list[1].first * plain_size, constant_value);
+
+ const auto r_h_inp_lim = input_shape.Dims(1) + padding_list[1].first;
+ for (auto j = padding_list[1].first, j_inp = 0; j < r_h_inp_lim; ++j, ++j_inp)
+ {
+ const auto out_w_offset = get_offset(output_shape, i, j, 0);
+
+ // prepend padding rows
+ std::fill_n(output_data + out_w_offset, padding_list[2].first * out_row_size,
+ constant_value);
+
+ const auto r_w_inp_lim = input_shape.Dims(2) + padding_list[2].first;
+ for (auto k = padding_list[2].first, k_inp = 0; k < r_w_inp_lim; ++k, ++k_inp)
+ {
+ auto out_c_offset = get_offset(output_shape, i, j, k);
+ const auto in_offset = get_offset(input_shape, i_inp, j_inp, k_inp);
+
+ // prepend padding values
+ std::fill_n(output_data + out_c_offset, padding_list[3].first, constant_value);
+
+ out_c_offset += padding_list[3].first;
+
+ // copy a row of input data
+ memcpy(output_data + out_c_offset, input_data + in_offset, in_row_len * sizeof(float));
+
+ out_c_offset += in_row_len;
+
+ // append padding values
+ std::fill_n(output_data + out_c_offset, padding_list[3].second, constant_value);
+ }
+
+ // append padding rows
+ std::fill_n(output_data + out_w_offset + r_w_inp_lim * out_row_size,
+ padding_list[2].second * out_row_size, constant_value);
+ }
+
+ // append padding plains
+ std::fill_n(output_data + out_h_offset + r_h_inp_lim * plain_size,
+ padding_list[1].second * plain_size, constant_value);
+ }
+ // append padding parallelepipeds
+ std::fill_n(output_data + r_b_inp_lim * parallelepiped_size,
+ padding_list[0].second * parallelepiped_size, constant_value);
+ break;
+ }
+ default:
+ throw std::runtime_error("Padding for rank > 4 NYI");
+ break;
+ }
+}
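+
+// A minimal usage sketch (shapes and buffers are illustrative): pad a 2x2 HW tensor with one
+// row and one column of zeros on every side.
+//   const int32_t paddings[4] = {1, 1, 1, 1};
+//   Shape in_shape{2, 2}, out_shape{4, 4};
+//   Pad(paddings, 2 /* pad_rank */, in_shape, in_data, out_shape, out_data, nullptr);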
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_PAD_H__
diff --git a/compute/cker/include/cker/operation/SoftMax.h b/compute/cker/include/cker/operation/SoftMax.h
new file mode 100644
index 000000000..ea404a002
--- /dev/null
+++ b/compute/cker/include/cker/operation/SoftMax.h
@@ -0,0 +1,130 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_SOFTMAX_H__
+#define __NNFW_CKER_SOFTMAX_H__
+
+#include "cker/Shape.h"
+#include "cker/Utils.h"
+#include "cker/Types.h"
+#include "cker/gemmlowp/FixedPoint.h"
+#include "cker/operation/optimized/SoftMax.h"
+#include "cker/operation/reference/SoftMax.h"
+
+#include <cmath>
+
+namespace nnfw
+{
+namespace cker
+{
+
+inline void Softmax(const SoftmaxParams &params, const Shape &input_shape, const float *input_data,
+ const Shape &output_shape, float *output_data)
+{
+#if defined(CKER_OPTIMIZED_EIGEN)
+ optimized::Softmax(params, input_shape, input_data, output_shape, output_data);
+#else // defined(CKER_OPTIMIZED_EIGEN)
+ reference::Softmax(params, input_shape, input_data, output_shape, output_data);
+#endif // defined(CKER_OPTIMIZED_EIGEN)
+}
+
+inline void Softmax(const SoftmaxParams &params, const Shape &input_shape,
+ const uint8_t *input_data, const Shape &output_shape, uint8_t *output_data)
+{
+ const int32_t input_beta_multiplier = params.input_multiplier;
+ const int32_t input_beta_left_shift = params.input_left_shift;
+ const int diff_min = params.diff_min;
+ // The representation chosen for the input to the exp() function is Q5.26.
+ // We need to leave extra space since values that we skip might be as large as
+ // -32 before multiplying by input_beta_multiplier, and therefore as large as
+ // -16 afterwards. Note that exp(-8) is definitely not insignificant to
+ // accumulation, but exp(-16) definitely is.
+ static const int kScaledDiffIntegerBits = 5;
+ static const int kAccumulationIntegerBits = 12;
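+  // With 32-bit raw values these choices correspond to Q5.26 for the scaled difference (so the
+  // inputs to exp() lie in [-16, 0]) and Q12.19 for the accumulator, which can hold the sum of
+  // up to a few thousand exponentials that are each at most 1.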
+ using FixedPointScaledDiff = gemmlowp::FixedPoint<kScaledDiffIntegerBits>;
+ using FixedPointAccum = gemmlowp::FixedPoint<kAccumulationIntegerBits>;
+ using FixedPoint0 = gemmlowp::FixedPoint<0>;
+
+ const int trailing_dim = input_shape.DimensionsCount() - 1;
+ const int outer_size = MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape);
+ const int depth = MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim);
+
+ for (int i = 0; i < outer_size; ++i)
+ {
+ uint8_t max_in_row = 0;
+ for (int c = 0; c < depth; ++c)
+ {
+ max_in_row = std::max(max_in_row, input_data[i * depth + c]);
+ }
+
+ FixedPointAccum sum_of_exps = FixedPointAccum::Zero();
+ for (int c = 0; c < depth; ++c)
+ {
+ int32_t input_diff = static_cast<int32_t>(input_data[i * depth + c]) - max_in_row;
+ if (input_diff >= diff_min)
+ {
+ const int32_t input_diff_rescaled = MultiplyByQuantizedMultiplierGreaterThanOne(
+ input_diff, input_beta_multiplier, input_beta_left_shift);
+ const FixedPointScaledDiff scaled_diff_f8 =
+ FixedPointScaledDiff::FromRaw(input_diff_rescaled);
+ sum_of_exps = sum_of_exps + gemmlowp::Rescale<kAccumulationIntegerBits>(
+ exp_on_negative_values(scaled_diff_f8));
+ }
+ }
+
+ int32_t fixed_sum_of_exps = sum_of_exps.raw();
+ int headroom_plus_one = CountLeadingZeros(static_cast<uint32_t>(fixed_sum_of_exps));
+ // This is the number of bits to the left of the binary point above 1.0.
+ // Consider fixed_sum_of_exps=1.25. In that case shifted_scale=0.8 and
+ // no later adjustment will be needed.
+ int num_bits_over_unit = kAccumulationIntegerBits - headroom_plus_one;
+ int32_t shifted_sum_minus_one =
+ static_cast<int32_t>((static_cast<uint32_t>(fixed_sum_of_exps) << headroom_plus_one) -
+ (static_cast<uint32_t>(1) << 31));
+
+ FixedPoint0 shifted_scale =
+ one_over_one_plus_x_for_x_in_0_1(FixedPoint0::FromRaw(shifted_sum_minus_one));
+
+ for (int c = 0; c < depth; ++c)
+ {
+ int32_t input_diff = static_cast<int32_t>(input_data[i * depth + c]) - max_in_row;
+ if (input_diff >= diff_min)
+ {
+ const int32_t input_diff_rescaled = MultiplyByQuantizedMultiplierGreaterThanOne(
+ input_diff, input_beta_multiplier, input_beta_left_shift);
+ const FixedPointScaledDiff scaled_diff_f8 =
+ FixedPointScaledDiff::FromRaw(input_diff_rescaled);
+
+ FixedPoint0 exp_in_0 = exp_on_negative_values(scaled_diff_f8);
+ int32_t unsat_output = gemmlowp::RoundingDivideByPOT((shifted_scale * exp_in_0).raw(),
+ num_bits_over_unit + 31 - 8);
+
+ output_data[i * depth + c] = static_cast<uint8_t>(
+ std::max(std::min(unsat_output, static_cast<int32_t>(255)), static_cast<int32_t>(0)));
+ }
+ else
+ {
+ output_data[i * depth + c] = 0;
+ }
+ }
+ }
+}
+
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_SOFTMAX_H__
diff --git a/compute/cker/include/cker/operation/TransposeConv.h b/compute/cker/include/cker/operation/TransposeConv.h
new file mode 100644
index 000000000..535fe86cf
--- /dev/null
+++ b/compute/cker/include/cker/operation/TransposeConv.h
@@ -0,0 +1,135 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_TRANSPOSE_CONV_H__
+#define __NNFW_CKER_TRANSPOSE_CONV_H__
+
+#include "cker/Shape.h"
+#include "cker/Types.h"
+#include "cker/Utils.h"
+
+namespace nnfw
+{
+namespace cker
+{
+
+struct TransposeConvParams
+{
+ PaddingType padding_type;
+ PaddingValues padding_values;
+ // TODO(starka): This was just "stride", so check that width+height is OK.
+ int16_t stride_width;
+ int16_t stride_height;
+ int16_t dilation_width_factor;
+ int16_t dilation_height_factor;
+ // uint8_t inference params.
+ // TODO(b/65838351): Use smaller types if appropriate.
+ int32_t input_offset;
+ int32_t weights_offset;
+ int32_t output_offset;
+ int32_t output_multiplier;
+ int output_shift;
+ // uint8_t, etc, activation params.
+ int32_t quantized_activation_min;
+ int32_t quantized_activation_max;
+ // float activation params.
+ float float_activation_min;
+ float float_activation_max;
+};
+
+inline void TransposeConv(const TransposeConvParams &params, const Shape &input_shape,
+ const float *input_data, const Shape &filter_shape,
+ const float *filter_data, const Shape &output_shape, float *output_data)
+{
+ const int stride_width = params.stride_width;
+ const int stride_height = params.stride_height;
+ const int pad_width = params.padding_values.width;
+ const int pad_height = params.padding_values.height;
+
+ assert(input_shape.DimensionsCount() == 4);
+ assert(filter_shape.DimensionsCount() == 4);
+ assert(output_shape.DimensionsCount() == 4);
+
+ const int batches = MatchingDim(input_shape, 0, output_shape, 0);
+ const int input_depth = MatchingDim(input_shape, 3, filter_shape, 3);
+ const int output_depth = MatchingDim(filter_shape, 0, output_shape, 3);
+ const int input_height = input_shape.Dims(1);
+ const int input_width = input_shape.Dims(2);
+ const int filter_height = filter_shape.Dims(1);
+ const int filter_width = filter_shape.Dims(2);
+ const int output_height = output_shape.Dims(1);
+ const int output_width = output_shape.Dims(2);
+
+ // Although transpose convolution simplifies to convolution with transposed
+ // weights for strides of 1, non-unitary striding complicates matters. To
+ // keep this reference implementation as clear as possible, we use a
+ // "scatter" access pattern, where we loop through all the input elements,
+ // computing their influence on the output, rather than looping through the
+ // output elements in the typical "gather" access pattern of a conv. We
+ // therefore must initialize the output array to zero.
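+  // For example (1-D sketch): with stride 2, no padding and filter width 3, input element in_x
+  // scatters into outputs 2 * in_x, 2 * in_x + 1 and 2 * in_x + 2, so neighbouring input
+  // elements both contribute to every other output element.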
+ const int num_elements = output_shape.FlatSize();
+ for (int i = 0; i < num_elements; i++)
+ {
+ output_data[i] = 0.0f;
+ }
+
+ // Loop through input elements one at a time.
+ for (int batch = 0; batch < batches; ++batch)
+ {
+ for (int in_y = 0; in_y < input_height; ++in_y)
+ {
+ for (int in_x = 0; in_x < input_width; ++in_x)
+ {
+ for (int in_channel = 0; in_channel < input_depth; ++in_channel)
+ {
+ // Loop through the output elements it will influence
+ const int out_x_origin = (in_x * stride_width) - pad_width;
+ const int out_y_origin = (in_y * stride_height) - pad_height;
+ for (int filter_y = 0; filter_y < filter_height; ++filter_y)
+ {
+ for (int filter_x = 0; filter_x < filter_width; ++filter_x)
+ {
+ for (int out_channel = 0; out_channel < output_depth; ++out_channel)
+ {
+ // Compute output element location
+ const int out_x = out_x_origin + filter_x;
+ const int out_y = out_y_origin + filter_y;
+ // We cannot accumulate out of bounds
+ if ((out_x >= 0) && (out_x < output_width) && (out_y >= 0) &&
+ (out_y < output_height))
+ {
+ float input_value =
+ input_data[Offset(input_shape, batch, in_y, in_x, in_channel)];
+ float filter_value = filter_data[Offset(filter_shape, out_channel, filter_y,
+ filter_x, in_channel)];
+ output_data[Offset(output_shape, batch, out_y, out_x, out_channel)] +=
+ input_value * filter_value;
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+}
+
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_TRANSPOSE_CONV_H__
diff --git a/compute/cker/include/cker/operation/optimized/AveragePool.h b/compute/cker/include/cker/operation/optimized/AveragePool.h
new file mode 100644
index 000000000..d94a5811a
--- /dev/null
+++ b/compute/cker/include/cker/operation/optimized/AveragePool.h
@@ -0,0 +1,105 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_OPTIMIZED_AVERAGE_POOL_H__
+#define __NNFW_CKER_OPTIMIZED_AVERAGE_POOL_H__
+
+#if defined(CKER_OPTIMIZED_EIGEN)
+
+#include "cker/eigen/Utils.h"
+#include "cker/Shape.h"
+#include "cker/Types.h"
+#include "cker/Utils.h"
+#include <Eigen/Core>
+
+namespace nnfw
+{
+namespace cker
+{
+namespace optimized
+{
+
+// TODO Switch this function to a NEON implementation if that turns out to be faster
+inline void AveragePool(const PoolParams &params, const Shape &input_shape, const float *input_data,
+ const Shape &output_shape, float *output_data)
+{
+ assert(input_shape.DimensionsCount() == 4);
+ assert(output_shape.DimensionsCount() == 4);
+ const int batches = MatchingDim(input_shape, 0, output_shape, 0);
+ const int input_height = input_shape.Dims(1);
+ const int input_width = input_shape.Dims(2);
+ const int output_height = output_shape.Dims(1);
+ const int output_width = output_shape.Dims(2);
+ const int stride_height = params.stride_height;
+ const int stride_width = params.stride_width;
+
+ // TODO(benoitjacob) make this a proper reference impl without Eigen!
+ const auto in_mat = MapAsMatrixWithLastDimAsRows(input_data, input_shape);
+ auto out_mat = MapAsMatrixWithLastDimAsRows(output_data, output_shape);
+ // TODO(benoitjacob) get rid of the dynamic memory allocation here!
+ Eigen::VectorXf out_count(out_mat.cols());
+ out_count.setZero();
+ // Prefill the output to 0.
+ out_mat.setZero();
+ for (int b = 0; b < batches; ++b)
+ {
+ for (int h = 0; h < input_height; ++h)
+ {
+ for (int w = 0; w < input_width; ++w)
+ {
+ // (h_start, h_end) * (w_start, w_end) is the range that the input
+ // vector projects to.
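+        // Output row ph reads input rows [ph * stride - pad, ph * stride - pad + filter_height),
+        // so input row h (with hpad = h + pad) contributes to every ph satisfying
+        // ph * stride <= hpad < ph * stride + filter_height; h_start/h_end below solve this for
+        // ph, and w_start/w_end do the same for columns.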
+ int hpad = h + params.padding_values.height;
+ int wpad = w + params.padding_values.width;
+ int h_start =
+ (hpad < params.filter_height) ? 0 : (hpad - params.filter_height) / stride_height + 1;
+ int h_end = std::min(hpad / stride_height + 1, output_height);
+ int w_start =
+ (wpad < params.filter_width) ? 0 : (wpad - params.filter_width) / stride_width + 1;
+ int w_end = std::min(wpad / stride_width + 1, output_width);
+ // compute elementwise sum
+ for (int ph = h_start; ph < h_end; ++ph)
+ {
+ for (int pw = w_start; pw < w_end; ++pw)
+ {
+ int out_offset = NodeOffset(b, ph, pw, output_height, output_width);
+ out_mat.col(out_offset) += in_mat.col(NodeOffset(b, h, w, input_height, input_width));
+ out_count(out_offset)++;
+ }
+ }
+ }
+ }
+ }
+ // Divide the output by the actual number of elements being averaged over
+ assert(out_count.minCoeff() > 0);
+ out_mat.array().rowwise() /= out_count.transpose().array();
+
+ const int flat_size = output_shape.FlatSize();
+ for (int i = 0; i < flat_size; ++i)
+ {
+ output_data[i] = ActivationFunctionWithMinMax(output_data[i], params.float_activation_min,
+ params.float_activation_max);
+ }
+}
+
+} // namespace optimized
+} // namespace cker
+} // namespace nnfw
+
+#endif // defined(CKER_OPTIMIZED_EIGEN)
+
+#endif // __NNFW_CKER_OPTIMIZED_AVERAGE_POOL_H__
diff --git a/compute/cker/include/cker/operation/optimized/MaxPool.h b/compute/cker/include/cker/operation/optimized/MaxPool.h
new file mode 100644
index 000000000..07a14aee4
--- /dev/null
+++ b/compute/cker/include/cker/operation/optimized/MaxPool.h
@@ -0,0 +1,97 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_OPTIMIZED_MAX_POOL_H__
+#define __NNFW_CKER_OPTIMIZED_MAX_POOL_H__
+
+#if defined(CKER_OPTIMIZED_EIGEN)
+#include "cker/eigen/Utils.h"
+#include "cker/Shape.h"
+#include "cker/Types.h"
+#include "cker/Utils.h"
+#include <Eigen/Core>
+
+namespace nnfw
+{
+namespace cker
+{
+namespace optimized
+{
+
+// TODO Switch this function to a NEON implementation if that turns out to be faster
+inline void MaxPool(const PoolParams &params, const Shape &input_shape, const float *input_data,
+ const Shape &output_shape, float *output_data)
+{
+ assert(input_shape.DimensionsCount() == 4);
+ assert(output_shape.DimensionsCount() == 4);
+ const int batches = MatchingDim(input_shape, 0, output_shape, 0);
+ const int input_height = input_shape.Dims(1);
+ const int input_width = input_shape.Dims(2);
+ const int output_height = output_shape.Dims(1);
+ const int output_width = output_shape.Dims(2);
+ const int stride_height = params.stride_height;
+ const int stride_width = params.stride_width;
+
+ const auto in_mat = MapAsMatrixWithLastDimAsRows(input_data, input_shape);
+ auto out_mat = MapAsMatrixWithLastDimAsRows(output_data, output_shape);
+ // Prefill the output to minimum representable float value
+ out_mat.setConstant(std::numeric_limits<float>::lowest());
+ for (int b = 0; b < batches; ++b)
+ {
+ for (int h = 0; h < input_height; ++h)
+ {
+ for (int w = 0; w < input_width; ++w)
+ {
+ // (h_start, h_end) * (w_start, w_end) is the range that the input
+ // vector projects to.
+ int hpad = h + params.padding_values.height;
+ int wpad = w + params.padding_values.width;
+ int h_start =
+ (hpad < params.filter_height) ? 0 : (hpad - params.filter_height) / stride_height + 1;
+ int h_end = std::min(hpad / stride_height + 1, output_height);
+ int w_start =
+ (wpad < params.filter_width) ? 0 : (wpad - params.filter_width) / stride_width + 1;
+ int w_end = std::min(wpad / stride_width + 1, output_width);
+        // compute elementwise max
+ for (int ph = h_start; ph < h_end; ++ph)
+ {
+ for (int pw = w_start; pw < w_end; ++pw)
+ {
+ int out_offset = NodeOffset(b, ph, pw, output_height, output_width);
+ out_mat.col(out_offset) =
+ out_mat.col(out_offset)
+ .cwiseMax(in_mat.col(NodeOffset(b, h, w, input_height, input_width)));
+ }
+ }
+ }
+ }
+ }
+ const int flat_size = output_shape.FlatSize();
+ for (int i = 0; i < flat_size; ++i)
+ {
+ output_data[i] = ActivationFunctionWithMinMax(output_data[i], params.float_activation_min,
+ params.float_activation_max);
+ }
+}
+
+} // namespace optimized
+} // namespace cker
+} // namespace nnfw
+
+#endif // defined(CKER_OPTIMIZED_EIGEN)
+
+#endif // __NNFW_CKER_OPTIMIZED_MAX_POOL_H__
diff --git a/compute/cker/include/cker/operation/optimized/SoftMax.h b/compute/cker/include/cker/operation/optimized/SoftMax.h
new file mode 100644
index 000000000..e44f251d0
--- /dev/null
+++ b/compute/cker/include/cker/operation/optimized/SoftMax.h
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_OPTIMIZED_SOFTMAX_H__
+#define __NNFW_CKER_OPTIMIZED_SOFTMAX_H__
+
+#if defined(CKER_OPTIMIZED_EIGEN)
+
+#include "cker/eigen/Utils.h"
+#include "cker/Shape.h"
+#include "cker/Types.h"
+#include <Eigen/Core>
+
+namespace nnfw
+{
+namespace cker
+{
+namespace optimized
+{
+
+inline void Softmax(const SoftmaxParams &params, const Shape &input_shape, const float *input_data,
+ const Shape &output_shape, float *output_data)
+{
+  // Validate that the input and output shapes are the same
+ MatchingFlatSize(input_shape, output_shape);
+
+ const auto in_mat = MapAsMatrixWithLastDimAsRows(input_data, input_shape);
+ auto out_mat = MapAsMatrixWithLastDimAsRows(output_data, output_shape);
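+  // The last dimension is mapped to rows, so each column of in_mat/out_mat is one softmax
+  // instance and the reductions below are taken column-wise.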
+ // Compute the exponential first, removing the max coefficient for numerical
+ // stability.
+ out_mat = (in_mat.rowwise() - in_mat.colwise().maxCoeff()).array() * params.beta;
+ // We are separating out the exp function so that exp can be vectorized.
+ out_mat = out_mat.array().exp();
+ // Normalize to get the activations.
+ Eigen::Array<float, 1, Eigen::Dynamic> scale = out_mat.array().colwise().sum().inverse();
+ out_mat.array().rowwise() *= scale;
+}
+
+} // namespace optimized
+} // namespace cker
+} // namespace nnfw
+
+#endif // defined(CKER_OPTIMIZED_EIGEN)
+
+#endif // __NNFW_CKER_OPTIMIZED_SOFTMAX_H__
diff --git a/compute/cker/include/cker/operation/reference/AveragePool.h b/compute/cker/include/cker/operation/reference/AveragePool.h
new file mode 100644
index 000000000..3ddab4b24
--- /dev/null
+++ b/compute/cker/include/cker/operation/reference/AveragePool.h
@@ -0,0 +1,90 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_REFERENCE_AVERAGE_POOL_H__
+#define __NNFW_CKER_REFERENCE_AVERAGE_POOL_H__
+
+#include "cker/Shape.h"
+#include "cker/Types.h"
+#include "cker/Utils.h"
+
+namespace nnfw
+{
+namespace cker
+{
+namespace reference
+{
+
+inline void AveragePool(const PoolParams &params, const Shape &input_shape, const float *input_data,
+ const Shape &output_shape, float *output_data)
+{
+ assert(input_shape.DimensionsCount() == 4);
+ assert(output_shape.DimensionsCount() == 4);
+ const int batches = MatchingDim(input_shape, 0, output_shape, 0);
+ const int depth = MatchingDim(input_shape, 3, output_shape, 3);
+ const int input_height = input_shape.Dims(1);
+ const int input_width = input_shape.Dims(2);
+ const int output_height = output_shape.Dims(1);
+ const int output_width = output_shape.Dims(2);
+ const int stride_height = params.stride_height;
+ const int stride_width = params.stride_width;
+ for (int batch = 0; batch < batches; ++batch)
+ {
+ for (int out_y = 0; out_y < output_height; ++out_y)
+ {
+ for (int out_x = 0; out_x < output_width; ++out_x)
+ {
+ const int in_x_origin = (out_x * stride_width) - params.padding_values.width;
+ const int in_y_origin = (out_y * stride_height) - params.padding_values.height;
+ // Compute the boundaries of the filter region clamped so as to
+ // ensure that the filter window fits in the input array.
+ const int filter_x_start = std::max(0, -in_x_origin);
+ const int filter_x_end = std::min(params.filter_width, input_width - in_x_origin);
+ const int filter_y_start = std::max(0, -in_y_origin);
+ const int filter_y_end = std::min(params.filter_height, input_height - in_y_origin);
+ int filter_count = (filter_y_end - filter_y_start) * (filter_x_end - filter_x_start);
+ if (filter_count <= 0)
+ {
+ continue;
+ }
+ for (int channel = 0; channel < depth; ++channel)
+ {
+ float total = 0.f;
+ for (int filter_y = filter_y_start; filter_y < filter_y_end; ++filter_y)
+ {
+ for (int filter_x = filter_x_start; filter_x < filter_x_end; ++filter_x)
+ {
+ const int in_x = in_x_origin + filter_x;
+ const int in_y = in_y_origin + filter_y;
+ total += input_data[Offset(input_shape, batch, in_y, in_x, channel)];
+ }
+ }
+ const float average = total / (float)filter_count;
+ output_data[Offset(output_shape, batch, out_y, out_x, channel)] =
+ ActivationFunctionWithMinMax(average, params.float_activation_min,
+ params.float_activation_max);
+ }
+ }
+ }
+ }
+}
+
+} // namespace reference
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_REFERENCE_AVERAGE_POOL_H__
diff --git a/compute/cker/include/cker/operation/reference/MaxPool.h b/compute/cker/include/cker/operation/reference/MaxPool.h
new file mode 100644
index 000000000..a0f0263c7
--- /dev/null
+++ b/compute/cker/include/cker/operation/reference/MaxPool.h
@@ -0,0 +1,84 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_REFERENCE_MAX_POOL_H__
+#define __NNFW_CKER_REFERENCE_MAX_POOL_H__
+
+#include "cker/Shape.h"
+#include "cker/Types.h"
+#include "cker/Utils.h"
+
+namespace nnfw
+{
+namespace cker
+{
+namespace reference
+{
+
+inline void MaxPool(const PoolParams &params, const Shape &input_shape, const float *input_data,
+ const Shape &output_shape, float *output_data)
+{
+ assert(input_shape.DimensionsCount() == 4);
+ assert(output_shape.DimensionsCount() == 4);
+ const int batches = MatchingDim(input_shape, 0, output_shape, 0);
+ const int depth = MatchingDim(input_shape, 3, output_shape, 3);
+ const int input_height = input_shape.Dims(1);
+ const int input_width = input_shape.Dims(2);
+ const int output_height = output_shape.Dims(1);
+ const int output_width = output_shape.Dims(2);
+ const int stride_height = params.stride_height;
+ const int stride_width = params.stride_width;
+ for (int batch = 0; batch < batches; ++batch)
+ {
+ for (int out_y = 0; out_y < output_height; ++out_y)
+ {
+ for (int out_x = 0; out_x < output_width; ++out_x)
+ {
+ for (int channel = 0; channel < depth; ++channel)
+ {
+ const int in_x_origin = (out_x * stride_width) - params.padding_values.width;
+ const int in_y_origin = (out_y * stride_height) - params.padding_values.height;
+ // Compute the boundaries of the filter region clamped so as to
+ // ensure that the filter window fits in the input array.
+ const int filter_x_start = std::max(0, -in_x_origin);
+ const int filter_x_end = std::min(params.filter_width, input_width - in_x_origin);
+ const int filter_y_start = std::max(0, -in_y_origin);
+ const int filter_y_end = std::min(params.filter_height, input_height - in_y_origin);
+ float max = std::numeric_limits<float>::lowest();
+ for (int filter_y = filter_y_start; filter_y < filter_y_end; ++filter_y)
+ {
+ for (int filter_x = filter_x_start; filter_x < filter_x_end; ++filter_x)
+ {
+ const int in_x = in_x_origin + filter_x;
+ const int in_y = in_y_origin + filter_y;
+ max = std::max(max, input_data[Offset(input_shape, batch, in_y, in_x, channel)]);
+ }
+ }
+ output_data[Offset(output_shape, batch, out_y, out_x, channel)] =
+ ActivationFunctionWithMinMax(max, params.float_activation_min,
+ params.float_activation_max);
+ }
+ }
+ }
+ }
+}
+
+} // namespace reference
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_REFERENCE_MAX_POOL_H__
diff --git a/compute/cker/include/cker/operation/reference/SoftMax.h b/compute/cker/include/cker/operation/reference/SoftMax.h
new file mode 100644
index 000000000..420cb319b
--- /dev/null
+++ b/compute/cker/include/cker/operation/reference/SoftMax.h
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_REFERENCE_SOFTMAX_H__
+#define __NNFW_CKER_REFERENCE_SOFTMAX_H__
+
+#include "cker/Shape.h"
+#include "cker/Types.h"
+
+#include <cmath>
+#include <limits>
+
+namespace nnfw
+{
+namespace cker
+{
+namespace reference
+{
+
+inline void Softmax(const SoftmaxParams &params, const Shape &input_shape, const float *input_data,
+ const Shape &output_shape, float *output_data)
+{
+ const int trailing_dim = input_shape.DimensionsCount() - 1;
+ const int outer_size = MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape);
+ const int depth = MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim);
+
+ for (int i = 0; i < outer_size; ++i)
+ {
+ // Find max element value which we'll use to ensure numerical stability
+ // taking advantage of the following equality:
+ // exp(x[i])/sum(exp(x[i])) == exp(x[i]+C)/sum(exp(x[i]+C))
+ float max = std::numeric_limits<float>::lowest();
+ for (int c = 0; c < depth; ++c)
+ {
+ max = std::max(max, input_data[i * depth + c]);
+ }
+
+ // Compute sum.
+ float sum = 0.f;
+ for (int c = 0; c < depth; ++c)
+ {
+ sum += std::exp((input_data[i * depth + c] - max) * params.beta);
+ }
+
+ // Compute result.
+ for (int c = 0; c < depth; ++c)
+ {
+ output_data[i * depth + c] = std::exp((input_data[i * depth + c] - max) * params.beta) / sum;
+ }
+ }
+}
+
+} // namespace reference
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_REFERENCE_SOFTMAX_H__