Diffstat (limited to 'compute/cker/include/cker')
24 files changed, 3129 insertions, 0 deletions
diff --git a/compute/cker/include/cker/Shape.h b/compute/cker/include/cker/Shape.h new file mode 100644 index 000000000..39449c68f --- /dev/null +++ b/compute/cker/include/cker/Shape.h @@ -0,0 +1,286 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2018 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __NNFW_CKER_SHAPE_H__ +#define __NNFW_CKER_SHAPE_H__ + +#include <algorithm> +#include <cstring> +#include <cassert> +#include <vector> + +#define UNUSED_RELEASE(a) (void)(a) + +namespace nnfw +{ +namespace cker +{ + +class Shape +{ +public: + // Shapes with dimensions up to 4 are stored directly in the structure, while + // larger shapes are separately allocated. + static constexpr int kMaxSmallSize = 4; + + Shape &operator=(Shape const &) = delete; + + Shape() : _size(0) {} + + explicit Shape(int dimensions_count) : _size(dimensions_count) + { + if (dimensions_count > kMaxSmallSize) + { + _dims_pointer = new int32_t[dimensions_count]; + } + } + + Shape(int shape_size, int32_t value) : _size(0) + { + Resize(shape_size); + for (int i = 0; i < shape_size; ++i) + { + SetDim(i, value); + } + } + + Shape(int dimensions_count, const int32_t *dims_data) : _size(0) + { + ReplaceWith(dimensions_count, dims_data); + } + + Shape(const std::initializer_list<int> init_list) : _size(0) { BuildFrom(init_list); } + + // Avoid using this constructor. We should be able to delete it when C++17 + // rolls out. + Shape(Shape const &other) : _size(other.DimensionsCount()) + { + if (_size > kMaxSmallSize) + { + _dims_pointer = new int32_t[_size]; + } + std::memcpy(DimsData(), other.DimsData(), sizeof(int32_t) * _size); + } + + bool operator==(const Shape &comp) const + { + return this->_size == comp._size && + std::memcmp(DimsData(), comp.DimsData(), _size * sizeof(int32_t)) == 0; + } + + ~Shape() + { + if (_size > kMaxSmallSize) + { + delete[] _dims_pointer; + } + } + + inline int32_t DimensionsCount() const { return _size; } + inline int32_t Dims(int i) const + { + assert(i >= 0); + assert(i < _size); + return _size > kMaxSmallSize ? _dims_pointer[i] : _dims[i]; + } + inline void SetDim(int i, int32_t val) + { + assert(i >= 0); + assert(i < _size); + if (_size > kMaxSmallSize) + { + _dims_pointer[i] = val; + } + else + { + _dims[i] = val; + } + } + + inline int32_t *DimsData() { return _size > kMaxSmallSize ? _dims_pointer : _dims; } + inline const int32_t *DimsData() const { return _size > kMaxSmallSize ? _dims_pointer : _dims; } + // The caller must ensure that the shape is no bigger than 4-D. 
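+  // Larger shapes keep their dims in _dims_pointer, so returning _dims here
+  // would expose the bytes of the heap pointer rather than dimension values.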
+ inline const int32_t *DimsDataUpTo4D() const { return _dims; } + + inline void Resize(int dimensions_count) + { + if (_size > kMaxSmallSize) + { + delete[] _dims_pointer; + } + _size = dimensions_count; + if (dimensions_count > kMaxSmallSize) + { + _dims_pointer = new int32_t[dimensions_count]; + } + } + + inline void ReplaceWith(int dimensions_count, const int32_t *dims_data) + { + Resize(dimensions_count); + int32_t *dst_dims = DimsData(); + std::memcpy(dst_dims, dims_data, dimensions_count * sizeof(int32_t)); + } + + template <typename T> inline void BuildFrom(const T &src_iterable) + { + const int dimensions_count = std::distance(src_iterable.begin(), src_iterable.end()); + Resize(dimensions_count); + int32_t *data = DimsData(); + for (auto it : src_iterable) + { + *data = it; + ++data; + } + } + + // This will probably be factored out. Old code made substantial use of 4-D + // shapes, and so this function is used to extend smaller shapes. Note that + // (a) as Dims<4>-dependent code is eliminated, the reliance on this should be + // reduced, and (b) some kernels are strictly 4-D, but then the shapes of their + // inputs should already be 4-D, so this function should not be needed. + inline static Shape ExtendedShape(int new_shape_size, const Shape &shape) + { + return Shape(new_shape_size, shape, 1); + } + + inline void BuildFrom(const std::initializer_list<int> init_list) + { + BuildFrom<const std::initializer_list<int>>(init_list); + } + + // Returns the total count of elements, that is, the size when flattened into a + // vector. + inline int FlatSize() const + { + int buffer_size = 1; + const int *dims_data = DimsData(); + for (int i = 0; i < _size; i++) + { + const int dim = dims_data[i]; + assert(dim >= 1); + buffer_size *= dim; + } + return buffer_size; + } + + bool operator!=(const Shape &comp) const { return !((*this) == comp); } + +private: + // For use only by ExtendedShape(), written to guarantee (return-value) copy + // elision in C++17. + // This creates a shape padded to the desired size with the specified value.
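+  // For example, padding the 2-D shape {3, 4} to 4-D with pad_value 1 produces
+  // {1, 1, 3, 4}: new leading dimensions get pad_value, existing dims are copied.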
+ Shape(int new_shape_size, const Shape &shape, int pad_value) : _size(0) + { + assert(new_shape_size >= shape.DimensionsCount()); + assert(new_shape_size <= kMaxSmallSize); + Resize(new_shape_size); + const int size_increase = new_shape_size - shape.DimensionsCount(); + for (int i = 0; i < size_increase; ++i) + { + SetDim(i, pad_value); + } + std::memcpy(DimsData() + size_increase, shape.DimsData(), + sizeof(int32_t) * shape.DimensionsCount()); + } + + int32_t _size; + union { + int32_t _dims[kMaxSmallSize]; + int32_t *_dims_pointer; + }; +}; + +inline int MatchingDim(const Shape &shape1, int index1, const Shape &shape2, int index2) +{ + UNUSED_RELEASE(shape2); + UNUSED_RELEASE(index2); + assert(shape1.Dims(index1) == shape2.Dims(index2)); + return shape1.Dims(index1); +} + +inline Shape GetShape(const std::vector<int32_t> &data) { return Shape(data.size(), data.data()); } + +inline int Offset(const Shape &shape, int i0, int i1, int i2, int i3) +{ + assert(shape.DimensionsCount() == 4); + const int *dims_data = shape.DimsDataUpTo4D(); + assert(i0 >= 0 && i0 < dims_data[0]); + assert(i1 >= 0 && i1 < dims_data[1]); + assert(i2 >= 0 && i2 < dims_data[2]); + assert(i3 >= 0 && i3 < dims_data[3]); + return ((i0 * dims_data[1] + i1) * dims_data[2] + i2) * dims_data[3] + i3; +} + +inline int FlatSizeSkipDim(const Shape &shape, int skip_dim) +{ + const int dims_count = shape.DimensionsCount(); + assert(skip_dim >= 0 && skip_dim < dims_count); + const auto *dims_data = shape.DimsData(); + int flat_size = 1; + for (int i = 0; i < dims_count; ++i) + { + flat_size *= (i == skip_dim) ? 1 : dims_data[i]; + } + return flat_size; +} + +// Flat size calculation, checking that dimensions match with one or more other +// arrays. +inline int MatchingFlatSize(const Shape &shape, const Shape &check_shape_0) +{ + UNUSED_RELEASE(check_shape_0); + assert(shape.DimensionsCount() == check_shape_0.DimensionsCount()); + const int dims_count = shape.DimensionsCount(); + for (int i = 0; i < dims_count; ++i) + { + assert(shape.Dims(i) == check_shape_0.Dims(i)); + } + return shape.FlatSize(); +} + +inline int MatchingFlatSize(const Shape &shape, const Shape &check_shape_0, + const Shape &check_shape_1) +{ + UNUSED_RELEASE(check_shape_0); + assert(shape.DimensionsCount() == check_shape_0.DimensionsCount()); + const int dims_count = shape.DimensionsCount(); + for (int i = 0; i < dims_count; ++i) + { + assert(shape.Dims(i) == check_shape_0.Dims(i)); + } + return MatchingFlatSize(shape, check_shape_1); +} + +inline int MatchingFlatSizeSkipDim(const Shape &shape, int skip_dim, const Shape &check_shape_0) +{ + UNUSED_RELEASE(check_shape_0); + const int dims_count = shape.DimensionsCount(); + for (int i = 0; i < dims_count; ++i) + { + if (i != skip_dim) + { + assert(shape.Dims(i) == check_shape_0.Dims(i)); + } + } + return FlatSizeSkipDim(shape, skip_dim); +} + +} // namespace cker +} // namespace nnfw + +#endif // __NNFW_CKER_SHAPE_H__ diff --git a/compute/cker/include/cker/Types.h b/compute/cker/include/cker/Types.h new file mode 100644 index 000000000..85654b040 --- /dev/null +++ b/compute/cker/include/cker/Types.h @@ -0,0 +1,82 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2018 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __NNFW_CKER_TYPES_H__ +#define __NNFW_CKER_TYPES_H__ + +#include <cstdint> + +namespace nnfw +{ +namespace cker +{ + +enum class FusedActivationFunctionType +{ + kNone = 0, + kRelu6 = 1, + kRelu1 = 2, + kRelu = 3, +}; +enum class PaddingType +{ + kNone = 0, + kSame = 1, + kValid = 2, +}; + +struct PaddingValues +{ + int16_t width; + int16_t height; +}; + +struct PoolParams +{ + FusedActivationFunctionType activation; + PaddingType padding_type; + PaddingValues padding_values; + int stride_height; + int stride_width; + int filter_height; + int filter_width; + // uint8, etc, activation params. + int32_t quantized_activation_min; + int32_t quantized_activation_max; + // float activation params. + float float_activation_min; + float float_activation_max; +}; + +struct SoftmaxParams +{ + // beta is not really used (not a Tensorflow parameter) and not implemented + // for LogSoftmax. + double beta; + // uint8 inference params. Used even when beta defaults to 1.0. + int32_t input_multiplier; + int32_t input_left_shift; + // Reverse scaling is only used by LogSoftmax. + int32_t reverse_scaling_divisor; + int32_t reverse_scaling_right_shift; + int diff_min; +}; + +} // namespace cker +} // namespace nnfw + +#endif // __NNFW_CKER_TYPES_H__ diff --git a/compute/cker/include/cker/Utils.h b/compute/cker/include/cker/Utils.h new file mode 100644 index 000000000..d1f1723c4 --- /dev/null +++ b/compute/cker/include/cker/Utils.h @@ -0,0 +1,159 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2018 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __NNFW_CKER_UTILS_H__ +#define __NNFW_CKER_UTILS_H__ + +#include <algorithm> +#include <cstdint> + +#include "cker/gemmlowp/FixedPoint.h" +#include "Shape.h" + +namespace nnfw +{ +namespace cker +{ + +template <typename T> +inline T ActivationFunctionWithMinMax(T x, T output_activation_min, T output_activation_max) +{ + return std::min<T>(std::max<T>(x, output_activation_min), output_activation_max); +} + +inline int32_t MultiplyByQuantizedMultiplier(int32_t x, int32_t quantized_multiplier, int shift) +{ + int left_shift = shift > 0 ? shift : 0; + int right_shift = shift > 0 ? 
0 : -shift; + return gemmlowp::RoundingDivideByPOT( + gemmlowp::SaturatingRoundingDoublingHighMul(x * (1 << left_shift), quantized_multiplier), + right_shift); +} + +inline int32_t MultiplyByQuantizedMultiplierGreaterThanOne(int32_t x, int32_t quantized_multiplier, + int left_shift) +{ + return gemmlowp::SaturatingRoundingDoublingHighMul(x * (1 << left_shift), quantized_multiplier); +} + +inline int NodeOffset(int b, int h, int w, int height, int width) +{ + return (b * height + h) * width + w; +} + +// The input must be non-zero: a zero input would never set the top bit, and +// the loop below would not terminate. +inline int CountLeadingZeros(uint32_t integer_input) +{ + assert(integer_input != 0); + const uint32_t one_in_leading_positive = 1U << 31; + int leading_zeros = 0; + while (integer_input < one_in_leading_positive) + { + integer_input <<= 1; + ++leading_zeros; + } + return leading_zeros; +} + +// Comment from tensorflow lite: +// +// DO NOT USE THIS STRUCT FOR NEW FUNCTIONALITY BEYOND IMPLEMENTING +// BROADCASTING. +// +// NdArrayDesc<N> describes the shape and memory layout of an N-dimensional +// rectangular array of numbers. +// +// NdArrayDesc<N> is basically identical to Dims<N> defined in types.h. +// However, as Dims<N> is to be deprecated, this class exists as an adaptor +// to enable simple unoptimized implementations of element-wise broadcasting +// operations. +template <int N> struct NdArrayDesc +{ + // The "extent" of each dimension. Indices along dimension d must be in the + // half-open interval [0, extents[d]). + int extents[N]; + + // The number of *elements* (not bytes) between consecutive indices of each + // dimension. + int strides[N]; +}; + +// Comment from tensorflow lite: +// +// DO NOT USE THIS FUNCTION FOR NEW FUNCTIONALITY BEYOND IMPLEMENTING +// BROADCASTING. +// +// Same as Offset(), except takes an NdArrayDesc<N> instead of Dims<N>. +inline int SubscriptToIndex(const NdArrayDesc<4> &desc, int i0, int i1, int i2, int i3) +{ + assert(i0 >= 0 && i0 < desc.extents[0]); + assert(i1 >= 0 && i1 < desc.extents[1]); + assert(i2 >= 0 && i2 < desc.extents[2]); + assert(i3 >= 0 && i3 < desc.extents[3]); + return i0 * desc.strides[0] + i1 * desc.strides[1] + i2 * desc.strides[2] + i3 * desc.strides[3]; +} + +template <int N> +inline void +NdArrayDescsForElementwiseBroadcast(const Shape &input0_shape, const Shape &input1_shape, + NdArrayDesc<N> *desc0_out, NdArrayDesc<N> *desc1_out) +{ + assert(desc0_out != nullptr); + assert(desc1_out != nullptr); + + auto extended_input0_shape = Shape::ExtendedShape(N, input0_shape); + auto extended_input1_shape = Shape::ExtendedShape(N, input1_shape); + + // Copy dims to desc, calculating strides. + int desc0_stride = 1; + int desc1_stride = 1; + for (int i = N - 1; i >= 0; --i) + { + desc0_out->extents[i] = extended_input0_shape.Dims(i); + desc0_out->strides[i] = desc0_stride; + desc0_stride *= extended_input0_shape.Dims(i); + desc1_out->extents[i] = extended_input1_shape.Dims(i); + desc1_out->strides[i] = desc1_stride; + desc1_stride *= extended_input1_shape.Dims(i); + } + + // Walk over each dimension. If the extents are equal do nothing. + // Otherwise, set the desc with extent 1 to have extent equal to the other and + // stride 0.
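+  // For example, for extended shapes {1, 2, 3, 1} and {4, 2, 3, 5}, desc0 ends
+  // up with extents {4, 2, 3, 5} and strides of 0 in dims 0 and 3, so both
+  // descs can be addressed with the same output subscripts.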
+ for (int i = 0; i < N; ++i) + { + const int extent0 = extended_input0_shape.Dims(i); + const int extent1 = extended_input1_shape.Dims(i); + if (extent0 != extent1) + { + if (extent0 == 1) + { + desc0_out->strides[i] = 0; + desc0_out->extents[i] = extent1; + } + else + { + assert(extent1 == 1); + desc1_out->strides[i] = 0; + desc1_out->extents[i] = extent0; + } + } + } +} + +} // namespace cker +} // namespace nnfw + +#endif // __NNFW_CKER_UTILS_H__ diff --git a/compute/cker/include/cker/eigen/Utils.h b/compute/cker/include/cker/eigen/Utils.h new file mode 100644 index 000000000..645a61485 --- /dev/null +++ b/compute/cker/include/cker/eigen/Utils.h @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2018 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __NNFW_CKER_EIGEN_UTILS_H__ +#define __NNFW_CKER_EIGEN_UTILS_H__ + +#if defined(CKER_OPTIMIZED_EIGEN) + +#include <Eigen/Core> +#include <type_traits> +#include "cker/Shape.h" + +namespace nnfw +{ +namespace cker +{ + +// MatrixMap<Scalar> maps a plain scalar array as an Eigen matrix expression; +// a const Scalar type yields a read-only Eigen::Map. +template <typename Scalar> +using MatrixMap = typename std::conditional< + std::is_const<Scalar>::value, + Eigen::Map<const Eigen::Matrix<typename std::remove_const<Scalar>::type, Eigen::Dynamic, + Eigen::Dynamic>>, + Eigen::Map<Eigen::Matrix<Scalar, Eigen::Dynamic, Eigen::Dynamic>>>::type; + +template <typename Scalar> +MatrixMap<Scalar> MapAsMatrixWithLastDimAsRows(Scalar *data, const Shape &shape) +{ + const int dims_count = shape.DimensionsCount(); + const int rows = shape.Dims(dims_count - 1); + const int cols = FlatSizeSkipDim(shape, dims_count - 1); + return MatrixMap<Scalar>(data, rows, cols); +} + +} // namespace cker +} // namespace nnfw + +#endif // defined(CKER_OPTIMIZED_EIGEN) + +#endif // __NNFW_CKER_EIGEN_UTILS_H__ diff --git a/compute/cker/include/cker/gemmlowp/FixedPoint.h b/compute/cker/include/cker/gemmlowp/FixedPoint.h new file mode 100644 index 000000000..159e01a22 --- /dev/null +++ b/compute/cker/include/cker/gemmlowp/FixedPoint.h @@ -0,0 +1,289 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2015 The Gemmlowp Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ + +#ifndef __NNFW_CKER_GEMMLOWP_FIXED_POINT_H__ +#define __NNFW_CKER_GEMMLOWP_FIXED_POINT_H__ + +#include <algorithm> +#include <cassert> +#include <limits> + +namespace nnfw +{ +namespace cker +{ +namespace gemmlowp +{ + +inline int32_t RoundingHalfSum(int32_t a, int32_t b) +{ + int64_t a64 = a; + int64_t b64 = b; + int64_t sum = a64 + b64; + int64_t sign = sum >= 0 ? 1 : -1; + return static_cast<int32_t>((sum + sign) / 2); +} + +inline int32_t SaturatingRoundingDoublingHighMul(int32_t a, int32_t b) +{ + bool overflow = a == b && a == std::numeric_limits<int32_t>::min(); + int64_t a_64(a); + int64_t b_64(b); + int64_t ab_64 = a_64 * b_64; + int32_t nudge = ab_64 >= 0 ? (1 << 30) : (1 - (1 << 30)); + int32_t ab_x2_high32 = static_cast<int32_t>((ab_64 + nudge) / (1ll << 31)); + return overflow ? std::numeric_limits<int32_t>::max() : ab_x2_high32; +} + +// Correctly-rounded-to-nearest division by a power-of-two. +// Also known as a rounding arithmetic right shift. +inline int32_t RoundingDivideByPOT(int32_t x, int exponent) +{ + assert(exponent >= 0); + assert(exponent <= 31); + const int32_t mask = ((1ll << exponent) - 1); + const int32_t zero = 0; + const int32_t one = 1; + const int32_t remainder = x & mask; + const int32_t threshold = (mask >> 1) + ((x < zero) ? one : zero); + return ((x >> exponent) + ((remainder > threshold) ? one : zero)); +} + +// Returns the product of a run-time integer value by a compile-time power +// of two, with either a positive exponent (equivalent to an arithmetic +// left shift, saturating) or a negative exponent (equivalent to an arithmetic +// right shift, rounding to nearest). +template <int Exponent, int ExponentSign = (Exponent > 0 ? 1 : Exponent < 0 ? -1 : 0)> +struct ImplSaturatingRoundingMultiplyByPOT +{ +}; + +template <int Exponent> struct ImplSaturatingRoundingMultiplyByPOT<Exponent, 0> +{ + static int32_t eval(int32_t x) { return x; } +}; + +template <int Exponent> struct ImplSaturatingRoundingMultiplyByPOT<Exponent, 1> +{ + static int32_t eval(int32_t x) + { + const int32_t min = (std::numeric_limits<int32_t>::min()); + const int32_t max = (std::numeric_limits<int32_t>::max()); + const int32_t threshold = ((1 << (31 - Exponent)) - 1); + const int32_t zero = 0; + const int32_t one = 1; + + const int32_t positive_mask = ((x > threshold) ? ~zero : zero); + const int32_t negative_mask = ((x < -threshold) ? ~zero : zero); + + int32_t result = (x * (one << Exponent)); + result = (positive_mask ? max : result); + result = (negative_mask ?
min : result); + return result; + } +}; + +template <int Exponent> struct ImplSaturatingRoundingMultiplyByPOT<Exponent, -1> +{ + static int32_t eval(int32_t x) { return RoundingDivideByPOT(x, -Exponent); } +}; + +template <int Exponent> int32_t SaturatingRoundingMultiplyByPOT(int32_t x) +{ + return ImplSaturatingRoundingMultiplyByPOT<Exponent>::eval(x); +} + +template <int tIntegerBits> class FixedPoint +{ +public: + static constexpr int kTotalBits = 8 * sizeof(int32_t); + static constexpr int kIntegerBits = tIntegerBits; + static constexpr int kFractionalBits = kTotalBits - 1 - kIntegerBits; + static_assert(kIntegerBits >= 0 && kIntegerBits < kTotalBits, "bad IntegerBits"); + + static int32_t ScalarRawMax() { return std::numeric_limits<int32_t>::max(); } + + static FixedPoint FromRaw(int32_t x) + { + FixedPoint retval; + retval.raw() = x; + return retval; + } + + static FixedPoint FromScalarRaw(int32_t x) { return FromRaw(x); } + + template <int Exponent> static FixedPoint ConstantPOT() + { + static constexpr int kOffset = kFractionalBits + Exponent; + static_assert(kOffset < 31, "Constant not exactly representable in this fixed-point format"); + return FromScalarRaw((int32_t)1 << kOffset); + } + + static FixedPoint Zero() { return FromScalarRaw(0); } + + static FixedPoint One() + { + return FromScalarRaw(kIntegerBits == 0 ? ScalarRawMax() : ((int32_t)1 << kFractionalBits)); + } + + int32_t raw() const { return i_; } + int32_t &raw() { return i_; } + +private: + int32_t i_; +}; + +// A FixedPoint multiplication is just a +// SaturatingRoundingDoublingHighMul operation on the underlying +// raw integer values. The IntegerBits simply add up, as is obvious +// from the fact that the range is [-2^IntegerBits, 2^IntegerBits). +template <int tIntegerBits_a, int tIntegerBits_b> +FixedPoint<tIntegerBits_a + tIntegerBits_b> operator*(FixedPoint<tIntegerBits_a> a, + FixedPoint<tIntegerBits_b> b) +{ + FixedPoint<tIntegerBits_a + tIntegerBits_b> c; + c.raw() = SaturatingRoundingDoublingHighMul(a.raw(), b.raw()); + return c; +} + +// Tweaking IntegerBits gives exact multiplication by a power of two. +template <int tExponent, int tIntegerBits> +FixedPoint<tExponent + tIntegerBits> ExactMulByPot(FixedPoint<tIntegerBits> a) +{ + FixedPoint<tExponent + tIntegerBits> c; + c.raw() = a.raw(); + return c; +} + +template <int tIntegerBits> +FixedPoint<tIntegerBits> operator+(FixedPoint<tIntegerBits> a, FixedPoint<tIntegerBits> b) +{ + return FixedPoint<tIntegerBits>::FromRaw((a.raw() + b.raw())); +} +template <int tIntegerBits> +FixedPoint<tIntegerBits> operator-(FixedPoint<tIntegerBits> a, FixedPoint<tIntegerBits> b) +{ + return FixedPoint<tIntegerBits>::FromRaw((a.raw() - b.raw())); +} +template <int tIntegerBits> +FixedPoint<tIntegerBits> operator&(FixedPoint<tIntegerBits> a, FixedPoint<tIntegerBits> b) +{ + return FixedPoint<tIntegerBits>::FromRaw((a.raw() & b.raw())); +} + +// Rescale changes the number of IntegerBits and updates the underlying +// raw integer value accordingly. +template <int tIntegerBitsDst, int tIntegerBitsSrc> +FixedPoint<tIntegerBitsDst> Rescale(FixedPoint<tIntegerBitsSrc> x) +{ + static constexpr int kExponent = tIntegerBitsSrc - tIntegerBitsDst; + FixedPoint<tIntegerBitsDst> result; + result.raw() = SaturatingRoundingMultiplyByPOT<kExponent>(x.raw()); + return result; +} + +// Implementation of exponential function. + +// Returns exp(x) for x in [-1/4, 0). 
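+// The raw constants below are Q0.31 encodings: 1895147668 is approximately
+// exp(-1/8) * 2^31 (the value of the expansion at the center point) and
+// 715827883 is approximately 2^31 / 3.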
+inline FixedPoint<0> exp_on_interval_between_negative_one_quarter_and_0_excl(FixedPoint<0> a) +{ + typedef FixedPoint<0> F; + const F constant_term = F::FromScalarRaw(RoundingDivideByPOT(1895147668, 0)); + const F constant_1_over_3 = F::FromScalarRaw(RoundingDivideByPOT(715827883, 0)); + // We're evaluating a Taylor expansion around -1/8, so we do the change of + // variable: x = a + 1/8. + // In fixed-point with 0 integer bits, 1/8 is represented by 1 << 28. + F x = a + F::template ConstantPOT<-3>(); + F x2 = x * x; + F x3 = x2 * x; + F x4 = x2 * x2; + F x4_over_4 = F::FromScalarRaw(SaturatingRoundingMultiplyByPOT<-2>(x4.raw())); + F x4_over_24_plus_x3_over_6_plus_x2_over_2 = F::FromScalarRaw( + SaturatingRoundingMultiplyByPOT<-1>((((x4_over_4 + x3) * constant_1_over_3) + x2).raw())); + return (constant_term + constant_term * (x + x4_over_24_plus_x3_over_6_plus_x2_over_2)); +} + +// Returns exp(x) for x < 0. +template <int tIntegerBits> FixedPoint<0> exp_on_negative_values(FixedPoint<tIntegerBits> a) +{ + typedef FixedPoint<tIntegerBits> InputF; + typedef FixedPoint<0> ResultF; + static constexpr int kFractionalBits = InputF::kFractionalBits; + static constexpr int kIntegerBits = InputF::kIntegerBits; + const InputF kOneQuarter = InputF::template ConstantPOT<-2>(); + InputF mask = kOneQuarter - InputF::FromScalarRaw(1); + InputF a_mod_quarter_minus_one_quarter = (a & mask) - kOneQuarter; + ResultF result = exp_on_interval_between_negative_one_quarter_and_0_excl( + Rescale<0>(a_mod_quarter_minus_one_quarter)); + int32_t remainder = (a_mod_quarter_minus_one_quarter - a).raw(); + +#define GEMMLOWP_EXP_BARREL_SHIFTER(Exponent, FixedPointMultiplier) \ + if (kIntegerBits > Exponent) \ + { \ + const ResultF kMultiplier = \ + ResultF::FromScalarRaw(RoundingDivideByPOT(FixedPointMultiplier, 0)); \ + static constexpr int kShiftAmount = \ + ((kIntegerBits > Exponent) ? (kFractionalBits + Exponent) : 0); \ + result = ((remainder & (1 << kShiftAmount)) ? (result * kMultiplier) : result); \ + } + + GEMMLOWP_EXP_BARREL_SHIFTER(-2, 1672461947); + GEMMLOWP_EXP_BARREL_SHIFTER(-1, 1302514674); + GEMMLOWP_EXP_BARREL_SHIFTER(+0, 790015084); + GEMMLOWP_EXP_BARREL_SHIFTER(+1, 290630308); + GEMMLOWP_EXP_BARREL_SHIFTER(+2, 39332535); + GEMMLOWP_EXP_BARREL_SHIFTER(+3, 720401); + GEMMLOWP_EXP_BARREL_SHIFTER(+4, 242); + +#undef GEMMLOWP_EXP_BARREL_SHIFTER + + static constexpr int clampB = ((kIntegerBits > 5) ? (36 - kIntegerBits) : 0); + if (kIntegerBits > 5) + { + const InputF clamp = InputF::FromScalarRaw(RoundingDivideByPOT(-(1 << clampB), 0)); + result.raw() = ((a.raw() < clamp.raw()) ? ResultF::Zero().raw() : result.raw()); + } + + result.raw() = (a.raw() ? result.raw() : ResultF::One().raw()); + return result; +} + +// Returns 1 / (1 + x) for x in (0, 1). +inline FixedPoint<0> one_over_one_plus_x_for_x_in_0_1(FixedPoint<0> a) +{ + typedef FixedPoint<0> F0; + typedef FixedPoint<2> F2; + F0 half_denominator = F0::FromScalarRaw(RoundingHalfSum(a.raw(), F0::One().raw())); + // Newton-Raphson division + // https://en.wikipedia.org/wiki/Division_algorithm#Newton.E2.80.93Raphson_division + // Refer to that page for the logic behind the 48/17 and 32/17 constants. 
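+  // In the Q2.29 format of F2, 1515870810 encodes 48/17 and -1010580540
+  // encodes -32/17. Each Newton-Raphson step roughly doubles the number of
+  // correct bits, so three iterations suffice for a 32-bit result.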
+ const F2 constant_48_over_17 = F2::FromScalarRaw(RoundingDivideByPOT(1515870810, 0)); + const F2 constant_neg_32_over_17 = F2::FromScalarRaw(RoundingDivideByPOT(-1010580540, 0)); + F2 x = constant_48_over_17 + half_denominator * constant_neg_32_over_17; + for (int i = 0; i < 3; i++) + { + F2 half_denominator_times_x = half_denominator * x; + F2 one_minus_half_denominator_times_x = F2::One() - half_denominator_times_x; + x = x + Rescale<2>(x * one_minus_half_denominator_times_x); + } + return Rescale<0>(ExactMulByPot<-1>(x)); +} + +} // namespace gemmlowp +} // namespace cker +} // namespace nnfw + +#endif // __NNFW_CKER_GEMMLOWP_FIXED_POINT_H__ diff --git a/compute/cker/include/cker/operation/AveragePool.h b/compute/cker/include/cker/operation/AveragePool.h new file mode 100644 index 000000000..b20919429 --- /dev/null +++ b/compute/cker/include/cker/operation/AveragePool.h @@ -0,0 +1,101 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2017 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __NNFW_CKER_AVERAGE_POOL_H__ +#define __NNFW_CKER_AVERAGE_POOL_H__ + +#if defined(CKER_OPTIMIZED_EIGEN) +#include "cker/operation/optimized/AveragePool.h" +#endif // defined(CKER_OPTIMIZED_EIGEN) + +#include "cker/operation/reference/AveragePool.h" + +namespace nnfw +{ +namespace cker +{ + +inline void AveragePool(const PoolParams ¶ms, const Shape &input_shape, const float *input_data, + const Shape &output_shape, float *output_data) +{ +#if defined(CKER_OPTIMIZED_EIGEN) + optimized::AveragePool(params, input_shape, input_data, output_shape, output_data); +#else // defined(CKER_OPTIMIZED_EIGEN) + reference::AveragePool(params, input_shape, input_data, output_shape, output_data); +#endif // defined(CKER_OPTIMIZED_EIGEN) +} + +inline void AveragePool(const PoolParams ¶ms, const Shape &input_shape, + const uint8_t *input_data, const Shape &output_shape, uint8_t *output_data) +{ + assert(params.quantized_activation_min <= params.quantized_activation_max); + assert(input_shape.DimensionsCount() == 4); + assert(output_shape.DimensionsCount() == 4); + const int batches = MatchingDim(input_shape, 0, output_shape, 0); + const int depth = MatchingDim(input_shape, 3, output_shape, 3); + const int input_height = input_shape.Dims(1); + const int input_width = input_shape.Dims(2); + const int output_height = output_shape.Dims(1); + const int output_width = output_shape.Dims(2); + const int stride_height = params.stride_height; + const int stride_width = params.stride_width; + for (int batch = 0; batch < batches; ++batch) + { + for (int out_y = 0; out_y < output_height; ++out_y) + { + for (int out_x = 0; out_x < output_width; ++out_x) + { + const int in_x_origin = (out_x * stride_width) - params.padding_values.width; + const int in_y_origin = (out_y * stride_height) - params.padding_values.height; + // Compute the boundaries of the filter region clamped so as to + // ensure that the filter window fits in 
the input array. + const int filter_x_start = std::max(0, -in_x_origin); + const int filter_x_end = std::min(params.filter_width, input_width - in_x_origin); + const int filter_y_start = std::max(0, -in_y_origin); + const int filter_y_end = std::min(params.filter_height, input_height - in_y_origin); + int filter_count = (filter_y_end - filter_y_start) * (filter_x_end - filter_x_start); + if (filter_count <= 0) + { + continue; + } + for (int channel = 0; channel < depth; ++channel) + { + int32_t acc = 0; + for (int filter_y = filter_y_start; filter_y < filter_y_end; ++filter_y) + { + for (int filter_x = filter_x_start; filter_x < filter_x_end; ++filter_x) + { + const int in_x = in_x_origin + filter_x; + const int in_y = in_y_origin + filter_y; + acc += input_data[Offset(input_shape, batch, in_y, in_x, channel)]; + } + } + acc = (acc + filter_count / 2) / filter_count; + acc = std::max(acc, params.quantized_activation_min); + acc = std::min(acc, params.quantized_activation_max); + output_data[Offset(output_shape, batch, out_y, out_x, channel)] = + static_cast<uint8_t>(acc); + } + } + } + } +} + +} // namespace cker +} // namespace nnfw + +#endif // __NNFW_CKER_AVERAGE_POOL_H__ diff --git a/compute/cker/include/cker/operation/BinaryArithmeticOps.h b/compute/cker/include/cker/operation/BinaryArithmeticOps.h new file mode 100644 index 000000000..60dd02651 --- /dev/null +++ b/compute/cker/include/cker/operation/BinaryArithmeticOps.h @@ -0,0 +1,172 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2017 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __NNFW_CKER_BINARY_ARITHMETIC_OPS_H__ +#define __NNFW_CKER_BINARY_ARITHMETIC_OPS_H__ + +#include <functional> +#include "cker/Shape.h" +#include "cker/Types.h" +#include "cker/Utils.h" + +namespace nnfw +{ +namespace cker +{ + +struct BinaryArithmeticOpParam +{ + // Shape dependent / common to data / op types. + // BroadcastableOpCategory broadcast_category; + // uint8 inference params. + int32_t input1_offset; + int32_t input2_offset; + int32_t output_offset; + int32_t output_multiplier; + int32_t output_shift; + // Add / Sub, not Mul, uint8 inference params. + int32_t left_shift; + int32_t input1_multiplier; + int32_t input1_shift; + int32_t input2_multiplier; + int32_t input2_shift; + // uint8, etc, activation params. + int32_t quantized_activation_min; + int32_t quantized_activation_max; + // float activation params. + float float_activation_min; + float float_activation_max; + + // Processed output dimensions. + // Let input "a" be the one that broadcasts in the faster-changing dimension. + // Then, after coalescing, for shapes {a0, a1, a2, a3, a4} and + // {b0, b1, b2, b3, b4}, + // broadcast_shape[4] = b0 = a0. + // broadcast_shape[3] = b1; a1 = 1. + // broadcast_shape[2] = b2 = a2. + // broadcast_shape[1] = a3; b3 = 1. + // broadcast_shape[0] = b4 = a4. 
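+  // (broadcast_shape below appears to be carried over from TFLite's
+  // ArithmeticParams, where it caches this coalesced shape; the slow broadcast
+  // path here recomputes descriptors instead, so it stays commented out.)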
+ // int broadcast_shape[5]; +}; + +template <typename T> +inline void BinaryArithmeticOp(const BinaryArithmeticOpParam ¶ms, const Shape &input1_shape, + const T *input1_data, const Shape &input2_shape, + const T *input2_data, const Shape &output_shape, T *output_data, + const std::function<T(const T &, const T &)> &fn) +{ + const int32_t flat_size = MatchingFlatSize(input1_shape, input2_shape, output_shape); + for (int i = 0; i < flat_size; ++i) + { + output_data[i] = ActivationFunctionWithMinMax(fn(input1_data[i], input2_data[i]), + params.quantized_activation_min, + params.quantized_activation_max); + } +} + +template <> +inline void BinaryArithmeticOp(const BinaryArithmeticOpParam ¶ms, const Shape &input1_shape, + const float *input1_data, const Shape &input2_shape, + const float *input2_data, const Shape &output_shape, + float *output_data, + const std::function<float(const float &, const float &)> &fn) +{ + const int size = MatchingFlatSize(input1_shape, input2_shape, output_shape); + for (int i = 0; i < size; i++) + { + output_data[i] = + ActivationFunctionWithMinMax(fn(input1_data[i], input2_data[i]), + params.float_activation_min, params.float_activation_max); + } +} + +template <typename T> +inline void BroadcastBinaryArithmeticOpSlow(const BinaryArithmeticOpParam ¶ms, + const Shape &input1_shape, const T *input1_data, + const Shape &input2_shape, const T *input2_data, + const Shape &output_shape, T *output_data, + const std::function<T(const T &, const T &)> &fn) +{ + NdArrayDesc<4> desc1; + NdArrayDesc<4> desc2; + NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1, &desc2); + const Shape extended_output_shape = Shape::ExtendedShape(4, output_shape); + + // Comment from tensorflow lite: + // + // In Tensorflow, the dimensions are canonically named (batch_number, row, + // col, channel), with extents (batches, height, width, depth), with the + // trailing dimension changing most rapidly (channels has the smallest stride, + // typically 1 element). + // + // In generated C code, we store arrays with the dimensions reversed. The + // first dimension has smallest stride. + // + // We name our variables by their Tensorflow convention, but generate C code + // nesting loops such that the innermost loop has the smallest stride for the + // best cache behavior. 
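+  // With the desc strides built above, the innermost c loop advances through
+  // memory with stride 1 (stride 0 on a broadcast dimension), so reads and the
+  // output write stay sequential.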
+ for (int b = 0; b < extended_output_shape.Dims(0); ++b) + { + for (int y = 0; y < extended_output_shape.Dims(1); ++y) + { + for (int x = 0; x < extended_output_shape.Dims(2); ++x) + { + for (int c = 0; c < extended_output_shape.Dims(3); ++c) + { + output_data[Offset(extended_output_shape, b, y, x, c)] = ActivationFunctionWithMinMax( + fn(input1_data[SubscriptToIndex(desc1, b, y, x, c)], + input2_data[SubscriptToIndex(desc2, b, y, x, c)]), + params.quantized_activation_min, params.quantized_activation_max); + } + } + } + } +} + +template <> +inline void BroadcastBinaryArithmeticOpSlow( + const BinaryArithmeticOpParam ¶ms, const Shape &input1_shape, const float *input1_data, + const Shape &input2_shape, const float *input2_data, const Shape &output_shape, + float *output_data, const std::function<float(const float &, const float &)> &fn) +{ + NdArrayDesc<4> desc1; + NdArrayDesc<4> desc2; + NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1, &desc2); + const Shape extended_output_shape = Shape::ExtendedShape(4, output_shape); + + for (int b = 0; b < extended_output_shape.Dims(0); ++b) + { + for (int y = 0; y < extended_output_shape.Dims(1); ++y) + { + for (int x = 0; x < extended_output_shape.Dims(2); ++x) + { + for (int c = 0; c < extended_output_shape.Dims(3); ++c) + { + output_data[Offset(extended_output_shape, b, y, x, c)] = ActivationFunctionWithMinMax( + fn(input1_data[SubscriptToIndex(desc1, b, y, x, c)], + input2_data[SubscriptToIndex(desc2, b, y, x, c)]), + params.float_activation_min, params.float_activation_max); + } + } + } + } +} + +} // namespace cker +} // namespace nnfw + +#endif // __NNFW_CKER_BINARY_ARITHMETIC_OPS_H__ diff --git a/compute/cker/include/cker/operation/Concatenation.h b/compute/cker/include/cker/operation/Concatenation.h new file mode 100644 index 000000000..69a179c8c --- /dev/null +++ b/compute/cker/include/cker/operation/Concatenation.h @@ -0,0 +1,93 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2017 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __NNFW_CKER_CONCATENATION_H__ +#define __NNFW_CKER_CONCATENATION_H__ + +#include <cstdint> + +#include "cker/Shape.h" + +namespace nnfw +{ +namespace cker +{ + +struct ConcatenationParams +{ + int8_t axis; + const int32_t *input_zeropoint; + const float *input_scale; + uint16_t inputs_count; + int32_t output_zeropoint; + float output_scale; +}; + +template <typename Scalar> +inline void Concatenation(const ConcatenationParams ¶ms, const Shape *const *input_shapes, + const Scalar *const *input_data, const Shape &output_shape, + Scalar *output_data) +{ + int axis = params.axis; + int inputs_count = params.inputs_count; + const int concat_dimensions = output_shape.DimensionsCount(); + assert(axis < concat_dimensions); + + int64_t concat_size = 0; + for (int i = 0; i < inputs_count; i++) + { + assert(input_shapes[i]->DimensionsCount() == concat_dimensions); + for (int j = 0; j < concat_dimensions; j++) + { + if (j != axis) + { + auto dim_checked = MatchingDim(*input_shapes[i], j, output_shape, j); + UNUSED_RELEASE(dim_checked); + } + } + concat_size += input_shapes[i]->Dims(axis); + } + assert(concat_size == output_shape.Dims(axis)); + int64_t outer_size = 1; + for (int i = 0; i < axis; ++i) + { + outer_size *= output_shape.Dims(i); + } + // For all input arrays, + // FlatSize() = outer_size * Dims(axis) * base_inner_size; + int64_t base_inner_size = 1; + for (int i = axis + 1; i < concat_dimensions; ++i) + { + base_inner_size *= output_shape.Dims(i); + } + + Scalar *output_ptr = output_data; + for (int k = 0; k < outer_size; k++) + { + for (int i = 0; i < inputs_count; ++i) + { + const int copy_size = input_shapes[i]->Dims(axis) * base_inner_size; + memcpy(output_ptr, input_data[i] + k * copy_size, copy_size * sizeof(Scalar)); + output_ptr += copy_size; + } + } +} + +} // namespace cker +} // namespace nnfw + +#endif // __NNFW_CKER_CONCATENATION_H__ diff --git a/compute/cker/include/cker/operation/Conv.h b/compute/cker/include/cker/operation/Conv.h new file mode 100644 index 000000000..35b0336fa --- /dev/null +++ b/compute/cker/include/cker/operation/Conv.h @@ -0,0 +1,217 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2017 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __NNFW_CKER_CONV_H__ +#define __NNFW_CKER_CONV_H__ + +#include "cker/Types.h" +#include "cker/Shape.h" +#include "cker/Utils.h" + +namespace nnfw +{ +namespace cker +{ + +struct ConvParams +{ + PaddingType padding_type; + PaddingValues padding_values; + // TODO(starka): This was just "stride", so check that width+height is OK. + int16_t stride_width; + int16_t stride_height; + int16_t dilation_width_factor; + int16_t dilation_height_factor; + // uint8_t inference params. + // TODO(b/65838351): Use smaller types if appropriate. + int32_t input_offset; + int32_t weights_offset; + int32_t output_offset; + int32_t output_multiplier; + int output_shift; + // uint8_t, etc, activation params. 
+ int32_t quantized_activation_min; + int32_t quantized_activation_max; + // float activation params. + float float_activation_min; + float float_activation_max; +}; + +inline void Conv(const ConvParams ¶ms, const Shape &input_shape, const float *input_data, + const Shape &filter_shape, const float *filter_data, const Shape &bias_shape, + const float *bias_data, const Shape &output_shape, float *output_data) +{ + const int stride_width = params.stride_width; + const int stride_height = params.stride_height; + const int dilation_width_factor = params.dilation_width_factor; + const int dilation_height_factor = params.dilation_height_factor; + const int pad_width = params.padding_values.width; + const int pad_height = params.padding_values.height; + const float output_activation_min = params.float_activation_min; + const float output_activation_max = params.float_activation_max; + assert(input_shape.DimensionsCount() == 4); + assert(filter_shape.DimensionsCount() == 4); + assert(output_shape.DimensionsCount() == 4); + UNUSED_RELEASE(bias_shape); + + const int batches = MatchingDim(input_shape, 0, output_shape, 0); + const int input_depth = MatchingDim(input_shape, 3, filter_shape, 3); + const int output_depth = MatchingDim(filter_shape, 0, output_shape, 3); + if (bias_data) + { + assert(bias_shape.FlatSize() == output_depth); + } + const int input_height = input_shape.Dims(1); + const int input_width = input_shape.Dims(2); + const int filter_height = filter_shape.Dims(1); + const int filter_width = filter_shape.Dims(2); + const int output_height = output_shape.Dims(1); + const int output_width = output_shape.Dims(2); + for (int batch = 0; batch < batches; ++batch) + { + for (int out_y = 0; out_y < output_height; ++out_y) + { + for (int out_x = 0; out_x < output_width; ++out_x) + { + for (int out_channel = 0; out_channel < output_depth; ++out_channel) + { + const int in_x_origin = (out_x * stride_width) - pad_width; + const int in_y_origin = (out_y * stride_height) - pad_height; + float total = 0.f; + for (int filter_y = 0; filter_y < filter_height; ++filter_y) + { + for (int filter_x = 0; filter_x < filter_width; ++filter_x) + { + const int in_x = in_x_origin + dilation_width_factor * filter_x; + const int in_y = in_y_origin + dilation_height_factor * filter_y; + // If the location is outside the bounds of the input image, + // use zero as a default value. 
+ if ((in_x >= 0) && (in_x < input_width) && (in_y >= 0) && (in_y < input_height)) + { + const int in_offset = Offset(input_shape, batch, in_y, in_x, 0); + const int filter_offset = Offset(filter_shape, out_channel, filter_y, filter_x, 0); + for (int in_channel = 0; in_channel < input_depth; ++in_channel) + { + float input_value = input_data[in_offset + in_channel]; + float filter_value = filter_data[filter_offset + in_channel]; + total += (input_value * filter_value); + } + } + } + } + float bias_value = 0.0f; + if (bias_data) + { + bias_value = bias_data[out_channel]; + } + output_data[Offset(output_shape, batch, out_y, out_x, out_channel)] = + ActivationFunctionWithMinMax(total + bias_value, output_activation_min, + output_activation_max); + } + } + } + } +} + +inline void Conv(const ConvParams ¶ms, const Shape &input_shape, const uint8_t *input_data, + const Shape &filter_shape, const uint8_t *filter_data, const Shape &bias_shape, + const int32_t *bias_data, const Shape &output_shape, uint8_t *output_data) +{ + const int stride_width = params.stride_width; + const int stride_height = params.stride_height; + const int dilation_width_factor = params.dilation_width_factor; + const int dilation_height_factor = params.dilation_height_factor; + const int pad_width = params.padding_values.width; + const int pad_height = params.padding_values.height; + const int32_t input_offset = params.input_offset; + const int32_t filter_offset = params.weights_offset; + const int32_t output_offset = params.output_offset; + const int32_t output_multiplier = params.output_multiplier; + const int output_shift = params.output_shift; + const int32_t output_activation_min = params.quantized_activation_min; + const int32_t output_activation_max = params.quantized_activation_max; + assert(output_activation_min <= output_activation_max); + + assert(input_shape.DimensionsCount() == 4); + assert(filter_shape.DimensionsCount() == 4); + assert(output_shape.DimensionsCount() == 4); + UNUSED_RELEASE(bias_shape); + const int batches = MatchingDim(input_shape, 0, output_shape, 0); + const int input_depth = MatchingDim(input_shape, 3, filter_shape, 3); + const int output_depth = MatchingDim(filter_shape, 0, output_shape, 3); + if (bias_data) + { + assert(bias_shape.FlatSize() == output_depth); + } + const int input_height = input_shape.Dims(1); + const int input_width = input_shape.Dims(2); + const int filter_height = filter_shape.Dims(1); + const int filter_width = filter_shape.Dims(2); + const int output_height = output_shape.Dims(1); + const int output_width = output_shape.Dims(2); + for (int batch = 0; batch < batches; ++batch) + { + for (int out_y = 0; out_y < output_height; ++out_y) + { + for (int out_x = 0; out_x < output_width; ++out_x) + { + for (int out_channel = 0; out_channel < output_depth; ++out_channel) + { + const int in_x_origin = (out_x * stride_width) - pad_width; + const int in_y_origin = (out_y * stride_height) - pad_height; + int32_t acc = 0; + for (int filter_y = 0; filter_y < filter_height; ++filter_y) + { + for (int filter_x = 0; filter_x < filter_width; ++filter_x) + { + const int in_x = in_x_origin + dilation_width_factor * filter_x; + const int in_y = in_y_origin + dilation_height_factor * filter_y; + // If the location is outside the bounds of the input image, + // use zero as a default value. 
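+          // Skipping an out-of-bounds tap adds nothing to acc, which is the
+          // same as reading an input pinned at its zero point, since
+          // (input_val + input_offset) == 0 when input_val == -input_offset.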
+ if ((in_x >= 0) && (in_x < input_width) && (in_y >= 0) && (in_y < input_height)) + { + // (Offset() asserts in-bounds indices, so compute these only inside the check.) + const int in_base = Offset(input_shape, batch, in_y, in_x, 0); + const int filter_base = Offset(filter_shape, out_channel, filter_y, filter_x, 0); + for (int in_channel = 0; in_channel < input_depth; in_channel++) + { + int32_t input_val = input_data[in_channel + in_base]; + int32_t filter_val = filter_data[in_channel + filter_base]; + acc += (filter_val + filter_offset) * (input_val + input_offset); + } + } + } + } + if (bias_data) + { + acc += bias_data[out_channel]; + } + acc = MultiplyByQuantizedMultiplier(acc, output_multiplier, output_shift); + acc += output_offset; + acc = std::max(acc, output_activation_min); + acc = std::min(acc, output_activation_max); + output_data[Offset(output_shape, batch, out_y, out_x, out_channel)] = + static_cast<uint8_t>(acc); + } + } + } + } +} + +} // namespace cker +} // namespace nnfw + +#endif // __NNFW_CKER_CONV_H__ diff --git a/compute/cker/include/cker/operation/DepthwiseConv.h b/compute/cker/include/cker/operation/DepthwiseConv.h new file mode 100644 index 000000000..7d022477d --- /dev/null +++ b/compute/cker/include/cker/operation/DepthwiseConv.h @@ -0,0 +1,217 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2017 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __NNFW_CKER_DEPTHWISE_CONV_H__ +#define __NNFW_CKER_DEPTHWISE_CONV_H__ + +#include "cker/Shape.h" +#include "cker/Types.h" +#include "cker/Utils.h" + +namespace nnfw +{ +namespace cker +{ + +struct DepthwiseConvParams +{ + PaddingType padding_type; + PaddingValues padding_values; + int16_t stride_width; + int16_t stride_height; + int16_t dilation_width_factor; + int16_t dilation_height_factor; + int16_t depth_multiplier; + // uint8 inference params. + // TODO(b/65838351): Use smaller types if appropriate. + int32_t input_offset; + int32_t weights_offset; + int32_t output_offset; + int32_t output_multiplier; + int output_shift; + // uint8, etc, activation params. + int32_t quantized_activation_min; + int32_t quantized_activation_max; + // float activation params.
+ float float_activation_min; + float float_activation_max; +}; + +inline void DepthwiseConv(const DepthwiseConvParams ¶ms, const Shape &input_shape, + const uint8_t *input_data, const Shape &filter_shape, + const uint8_t *filter_data, const Shape &bias_shape, + const int32_t *bias_data, const Shape &output_shape, uint8_t *output_data) +{ + const int stride_width = params.stride_width; + const int stride_height = params.stride_height; + const int dilation_width_factor = params.dilation_width_factor; + const int dilation_height_factor = params.dilation_height_factor; + const int pad_width = params.padding_values.width; + const int pad_height = params.padding_values.height; + const int depth_multiplier = params.depth_multiplier; + const int32_t output_activation_min = params.quantized_activation_min; + const int32_t output_activation_max = params.quantized_activation_max; + const int32_t input_offset = params.input_offset; + const int32_t filter_offset = params.weights_offset; + const int32_t output_offset = params.output_offset; + const int32_t output_multiplier = params.output_multiplier; + const int output_shift = params.output_shift; + assert(input_shape.DimensionsCount() == 4); + assert(filter_shape.DimensionsCount() == 4); + assert(output_shape.DimensionsCount() == 4); + + assert(output_activation_min <= output_activation_max); + const int batches = MatchingDim(input_shape, 0, output_shape, 0); + const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3); + const int input_height = input_shape.Dims(1); + const int input_width = input_shape.Dims(2); + const int input_depth = input_shape.Dims(3); + const int filter_height = filter_shape.Dims(1); + const int filter_width = filter_shape.Dims(2); + const int output_height = output_shape.Dims(1); + const int output_width = output_shape.Dims(2); + assert(output_depth == input_depth * depth_multiplier); + assert(bias_shape.FlatSize() == output_depth); + UNUSED_RELEASE(output_depth); + UNUSED_RELEASE(bias_shape); + + for (int b = 0; b < batches; ++b) + { + for (int out_y = 0; out_y < output_height; ++out_y) + { + for (int out_x = 0; out_x < output_width; ++out_x) + { + for (int ic = 0; ic < input_depth; ++ic) + { + for (int m = 0; m < depth_multiplier; m++) + { + const int oc = m + ic * depth_multiplier; + const int in_x_origin = (out_x * stride_width) - pad_width; + const int in_y_origin = (out_y * stride_height) - pad_height; + int32_t acc = 0; + for (int filter_y = 0; filter_y < filter_height; ++filter_y) + { + for (int filter_x = 0; filter_x < filter_width; ++filter_x) + { + const int in_x = in_x_origin + dilation_width_factor * filter_x; + const int in_y = in_y_origin + dilation_height_factor * filter_y; + // If the location is outside the bounds of the input image, + // use zero as a default value. 
+ if ((in_x >= 0) && (in_x < input_width) && (in_y >= 0) && (in_y < input_height)) + { + int32_t input_val = input_data[Offset(input_shape, b, in_y, in_x, ic)]; + int32_t filter_val = filter_data[Offset(filter_shape, 0, filter_y, filter_x, oc)]; + acc += (filter_val + filter_offset) * (input_val + input_offset); + } + } + } + if (bias_data) + { + acc += bias_data[oc]; + } + acc = MultiplyByQuantizedMultiplier(acc, output_multiplier, output_shift); + acc += output_offset; + acc = std::max(acc, output_activation_min); + acc = std::min(acc, output_activation_max); + output_data[Offset(output_shape, b, out_y, out_x, oc)] = static_cast<uint8_t>(acc); + } + } + } + } + } +} + +inline void DepthwiseConv(const DepthwiseConvParams ¶ms, const Shape &input_shape, + const float *input_data, const Shape &filter_shape, + const float *filter_data, const Shape &bias_shape, const float *bias_data, + const Shape &output_shape, float *output_data) +{ + const int stride_width = params.stride_width; + const int stride_height = params.stride_height; + const int dilation_width_factor = params.dilation_width_factor; + const int dilation_height_factor = params.dilation_height_factor; + const int pad_width = params.padding_values.width; + const int pad_height = params.padding_values.height; + const int depth_multiplier = params.depth_multiplier; + const float output_activation_min = params.float_activation_min; + const float output_activation_max = params.float_activation_max; + assert(input_shape.DimensionsCount() == 4); + assert(filter_shape.DimensionsCount() == 4); + assert(output_shape.DimensionsCount() == 4); + + const int batches = MatchingDim(input_shape, 0, output_shape, 0); + const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3); + const int input_height = input_shape.Dims(1); + const int input_width = input_shape.Dims(2); + const int input_depth = input_shape.Dims(3); + const int filter_height = filter_shape.Dims(1); + const int filter_width = filter_shape.Dims(2); + const int output_height = output_shape.Dims(1); + const int output_width = output_shape.Dims(2); + assert(output_depth == input_depth * depth_multiplier); + assert(bias_shape.FlatSize() == output_depth); + UNUSED_RELEASE(output_depth); + UNUSED_RELEASE(bias_shape); + + for (int b = 0; b < batches; ++b) + { + for (int out_y = 0; out_y < output_height; ++out_y) + { + for (int out_x = 0; out_x < output_width; ++out_x) + { + for (int ic = 0; ic < input_depth; ++ic) + { + for (int m = 0; m < depth_multiplier; m++) + { + const int oc = m + ic * depth_multiplier; + const int in_x_origin = (out_x * stride_width) - pad_width; + const int in_y_origin = (out_y * stride_height) - pad_height; + float total = 0.f; + for (int filter_y = 0; filter_y < filter_height; ++filter_y) + { + for (int filter_x = 0; filter_x < filter_width; ++filter_x) + { + const int in_x = in_x_origin + dilation_width_factor * filter_x; + const int in_y = in_y_origin + dilation_height_factor * filter_y; + // If the location is outside the bounds of the input image, + // use zero as a default value. 
+                if ((in_x >= 0) && (in_x < input_width) && (in_y >= 0) && (in_y < input_height))
+                {
+                  float input_value = input_data[Offset(input_shape, b, in_y, in_x, ic)];
+                  float filter_value = filter_data[Offset(filter_shape, 0, filter_y, filter_x, oc)];
+                  total += (input_value * filter_value);
+                }
+              }
+            }
+            float bias_value = 0.0f;
+            if (bias_data)
+            {
+              bias_value = bias_data[oc];
+            }
+            output_data[Offset(output_shape, b, out_y, out_x, oc)] = ActivationFunctionWithMinMax(
+              total + bias_value, output_activation_min, output_activation_max);
+          }
+        }
+      }
+    }
+  }
+}
+
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_DEPTHWISE_CONV_H__
diff --git a/compute/cker/include/cker/operation/FullyConnected.h b/compute/cker/include/cker/operation/FullyConnected.h
new file mode 100644
index 000000000..428fb1b53
--- /dev/null
+++ b/compute/cker/include/cker/operation/FullyConnected.h
@@ -0,0 +1,144 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_FULLY_CONNECTED_H__
+#define __NNFW_CKER_FULLY_CONNECTED_H__
+
+#include "cker/Shape.h"
+#include "cker/Utils.h"
+
+namespace nnfw
+{
+namespace cker
+{
+
+struct FullyConnectedParams
+{
+  // uint8 inference params.
+  // TODO(b/65838351): Use smaller types if appropriate.
+  int32_t input_offset;
+  int32_t weights_offset;
+  int32_t output_offset;
+  int32_t output_multiplier;
+  int output_shift;
+  // uint8, etc, activation params.
+  int32_t quantized_activation_min;
+  int32_t quantized_activation_max;
+  // float activation params.
+  float float_activation_min;
+  float float_activation_max;
+  // FullyConnectedWeightsFormat weights_format;
+};
+
+inline void FullyConnected(const FullyConnectedParams &params, const Shape &input_shape,
+                           const float *input_data, const Shape &weights_shape,
+                           const float *weights_data, const Shape &bias_shape,
+                           const float *bias_data, const Shape &output_shape, float *output_data)
+{
+  UNUSED_RELEASE(input_shape);
+  UNUSED_RELEASE(bias_shape);
+  const float output_activation_min = params.float_activation_min;
+  const float output_activation_max = params.float_activation_max;
+  // TODO(benoitjacob): This really should be:
+  //   const int batches = ArraySize(output_dims, 1);
+  // but the current --variable_batch hack consists in overwriting the 3rd
+  // dimension with the runtime batch size, as we don't keep track for each
+  // array of which dimension is the batch dimension in it.
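+  // Shape walk-through (illustrative): input [2, 10], weights [4, 10] and
+  // output [2, 4] give batches = 2, output_depth = 4, accum_depth = 10, and
+  // output[b, out_c] = dot(input[b, :], weights[out_c, :]) + bias[out_c],
+  // clamped to the activation range.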
+  const int output_dims_count = output_shape.DimensionsCount();
+  const int weights_dims_count = weights_shape.DimensionsCount();
+  const int batches = FlatSizeSkipDim(output_shape, output_dims_count - 1);
+  const int output_depth =
+    MatchingDim(weights_shape, weights_dims_count - 2, output_shape, output_dims_count - 1);
+  const int accum_depth = weights_shape.Dims(weights_dims_count - 1);
+  for (int b = 0; b < batches; ++b)
+  {
+    for (int out_c = 0; out_c < output_depth; ++out_c)
+    {
+      float total = 0.f;
+      for (int d = 0; d < accum_depth; ++d)
+      {
+        total += input_data[b * accum_depth + d] * weights_data[out_c * accum_depth + d];
+      }
+      float bias_value = 0.0f;
+      if (bias_data)
+      {
+        bias_value = bias_data[out_c];
+      }
+      output_data[out_c + output_depth * b] = ActivationFunctionWithMinMax(
+        total + bias_value, output_activation_min, output_activation_max);
+    }
+  }
+}
+
+inline void FullyConnected(const FullyConnectedParams &params, const Shape &input_shape,
+                           const uint8_t *input_data, const Shape &filter_shape,
+                           const uint8_t *filter_data, const Shape &bias_shape,
+                           const int32_t *bias_data, const Shape &output_shape,
+                           uint8_t *output_data)
+{
+  UNUSED_RELEASE(input_shape);
+  UNUSED_RELEASE(bias_shape);
+  const int32_t input_offset = params.input_offset;
+  const int32_t filter_offset = params.weights_offset;
+  const int32_t output_offset = params.output_offset;
+  const int32_t output_multiplier = params.output_multiplier;
+  const int output_shift = params.output_shift;
+  const int32_t output_activation_min = params.quantized_activation_min;
+  const int32_t output_activation_max = params.quantized_activation_max;
+  assert(filter_shape.DimensionsCount() >= 2);
+  assert(output_shape.DimensionsCount() >= 1);
+
+  assert(output_activation_min <= output_activation_max);
+  // TODO(benoitjacob): This really should be:
+  //   const int batches = ArraySize(output_dims, 1);
+  // but the current --variable_batch hack consists in overwriting the 3rd
+  // dimension with the runtime batch size, as we don't keep track for each
+  // array of which dimension is the batch dimension in it.
+  const int output_dim_count = output_shape.DimensionsCount();
+  const int filter_dim_count = filter_shape.DimensionsCount();
+  const int batches = FlatSizeSkipDim(output_shape, output_dim_count - 1);
+  const int output_depth =
+    MatchingDim(filter_shape, filter_dim_count - 2, output_shape, output_dim_count - 1);
+  const int accum_depth = filter_shape.Dims(filter_dim_count - 1);
+  for (int b = 0; b < batches; ++b)
+  {
+    for (int out_c = 0; out_c < output_depth; ++out_c)
+    {
+      int32_t acc = 0;
+      for (int d = 0; d < accum_depth; ++d)
+      {
+        int32_t input_val = input_data[b * accum_depth + d];
+        int32_t filter_val = filter_data[out_c * accum_depth + d];
+        acc += (filter_val + filter_offset) * (input_val + input_offset);
+      }
+      if (bias_data)
+      {
+        acc += bias_data[out_c];
+      }
+      acc = MultiplyByQuantizedMultiplier(acc, output_multiplier, output_shift);
+      acc += output_offset;
+      acc = std::max(acc, output_activation_min);
+      acc = std::min(acc, output_activation_max);
+      output_data[out_c + output_depth * b] = static_cast<uint8_t>(acc);
+    }
+  }
+}
+
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_FULLY_CONNECTED_H__
diff --git a/compute/cker/include/cker/operation/Gather.h b/compute/cker/include/cker/operation/Gather.h
new file mode 100644
index 000000000..9cd96eeb7
--- /dev/null
+++ b/compute/cker/include/cker/operation/Gather.h
@@ -0,0 +1,78 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_GATHER_H__
+#define __NNFW_CKER_GATHER_H__
+
+#include "cker/Shape.h"
+#include "cker/Types.h"
+#include "cker/Utils.h"
+
+namespace nnfw
+{
+namespace cker
+{
+
+struct GatherParams
+{
+  int32_t axis;
+};
+
+template <typename T, typename CoordsT = int32_t>
+inline void Gather(const GatherParams &op_params, const Shape &input_shape, const T *input_data,
+                   const Shape &coords_shape, const CoordsT *coords_data, const Shape &,
+                   T *output_data)
+{
+  int axis = op_params.axis;
+  if (axis < 0)
+  {
+    axis += input_shape.DimensionsCount();
+  }
+  assert(axis >= 0);
+  assert(axis < input_shape.DimensionsCount());
+  const int axis_size = input_shape.Dims(axis);
+  const int coords_count = coords_shape.FlatSize();
+
+  int outer_size = 1;
+  for (int i = 0; i < axis; ++i)
+  {
+    outer_size *= input_shape.Dims(i);
+  }
+
+  int inner_size = 1;
+  for (int i = axis + 1; i < input_shape.DimensionsCount(); ++i)
+  {
+    inner_size *= input_shape.Dims(i);
+  }
+
+  for (int outer = 0; outer < outer_size; ++outer)
+  {
+    for (int i = 0; i < coords_count; ++i)
+    {
+      assert(coords_data[i] >= 0);
+      assert(coords_data[i] < axis_size);
+      std::memcpy(output_data + (outer * coords_count + i) * inner_size,
+                  input_data + (outer * axis_size + coords_data[i]) * inner_size,
+                  sizeof(T) * inner_size);
+    }
+  }
+}
+
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_GATHER_H__
diff --git a/compute/cker/include/cker/operation/InstanceNorm.h b/compute/cker/include/cker/operation/InstanceNorm.h
new file mode 100644
index 000000000..794dcebc8
--- /dev/null
+++ b/compute/cker/include/cker/operation/InstanceNorm.h
@@ -0,0 +1,99 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_INSTANCE_NORM_H__
+#define __NNFW_CKER_INSTANCE_NORM_H__
+
+#include "cker/Shape.h"
+#include "cker/Types.h"
+#include "cker/Utils.h"
+
+#include <cmath>
+
+namespace nnfw
+{
+namespace cker
+{
+
+struct InstanceNormParams
+{
+  float epsilon;
+  float float_activation_min;
+  float float_activation_max;
+};
+
+inline void InstanceNorm(const InstanceNormParams &params, const Shape &input_shape,
+                         const float *input_data, const Shape &gamma_shape, const float *gamma_data,
+                         const Shape &beta_shape, const float *beta_data, const Shape &output_shape,
+                         float *output_data)
+{
+  const int32_t batches = MatchingDim(input_shape, 0, output_shape, 0);
+  const int32_t heights = MatchingDim(input_shape, 1, output_shape, 1);
+  const int32_t widths = MatchingDim(input_shape, 2, output_shape, 2);
+  const int32_t channels = MatchingDim(input_shape, 3, output_shape, 3);
+  const float output_activation_min = params.float_activation_min;
+  const float output_activation_max = params.float_activation_max;
+
+  UNUSED_RELEASE(gamma_shape);
+  UNUSED_RELEASE(beta_shape);
+  assert(output_activation_min <= output_activation_max);
+
+  for (int32_t batch = 0; batch < batches; batch++)
+  {
+    for (int32_t channel = 0; channel < channels; channel++)
+    {
+      double sum = 0.0;
+      double square_sum = 0.0;
+      int32_t size = heights * widths;
+
+      for (int32_t height = 0; height < heights; height++)
+      {
+        for (int32_t width = 0; width < widths; width++)
+        {
+          double input_val = input_data[Offset(input_shape, batch, height, width, channel)];
+          sum += input_val;
+          square_sum += (input_val * input_val);
+        }
+      }
+
+      double mean = sum / size;
+      double var = square_sum / size - mean * mean;
+
+      double gamma = gamma_data[channel];
+      double beta = beta_data[channel];
+
+      double a = gamma / (std::sqrt(var + params.epsilon));
+      double b = -mean * a + beta;
+
+      for (int32_t height = 0; height < heights; height++)
+      {
+        for (int32_t width = 0; width < widths; width++)
+        {
+          double input_value = input_data[Offset(input_shape, batch, height, width, channel)];
+          double output_value = input_value * a + b;
+          output_data[Offset(output_shape, batch, height, width, channel)] =
+            ActivationFunctionWithMinMax(static_cast<float>(output_value), output_activation_min,
+                                         output_activation_max);
+        }
+      }
+    }
+  }
+}
+
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_INSTANCE_NORM_H__
diff --git a/compute/cker/include/cker/operation/Logistic.h b/compute/cker/include/cker/operation/Logistic.h
new file mode 100644
index 000000000..872095531
--- /dev/null
+++ b/compute/cker/include/cker/operation/Logistic.h
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_LOGISTIC_H__
+#define __NNFW_CKER_LOGISTIC_H__
+
+#include "cker/Shape.h"
+
+#include <cmath>
+
+namespace nnfw
+{
+namespace cker
+{
+
+inline void Logistic(const Shape &input_shape, const float *input_data, const Shape &output_shape,
+                     float *output_data)
+{
+  // Note, this can be done using TANH: (1/2) + (1/2) * TANH(x/2)
+  const int size = MatchingFlatSize(input_shape, output_shape);
+  for (int i = 0; i < size; i++)
+  {
+    output_data[i] = 1.f / (1.f + std::exp(-input_data[i]));
+  }
+}
+
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_LOGISTIC_H__
diff --git a/compute/cker/include/cker/operation/MaxPool.h b/compute/cker/include/cker/operation/MaxPool.h
new file mode 100644
index 000000000..326168b99
--- /dev/null
+++ b/compute/cker/include/cker/operation/MaxPool.h
@@ -0,0 +1,98 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_MAX_POOL_H__
+#define __NNFW_CKER_MAX_POOL_H__
+
+#include "cker/Shape.h"
+#include "cker/Types.h"
+#include "cker/Utils.h"
+
+#include "cker/operation/optimized/MaxPool.h"
+#include "cker/operation/reference/MaxPool.h"
+
+namespace nnfw
+{
+namespace cker
+{
+
+inline void MaxPool(const PoolParams &params, const Shape &input_shape, const float *input_data,
+                    const Shape &output_shape, float *output_data)
+{
+#if defined(CKER_OPTIMIZED_EIGEN)
+  optimized::MaxPool(params, input_shape, input_data, output_shape, output_data);
+#else // defined(CKER_OPTIMIZED_EIGEN)
+  reference::MaxPool(params, input_shape, input_data, output_shape, output_data);
+#endif // defined(CKER_OPTIMIZED_EIGEN)
+}
+
+inline void MaxPool(const PoolParams &params, const Shape &input_shape, const uint8_t *input_data,
+                    const Shape &output_shape, uint8_t *output_data)
+{
+  assert(params.quantized_activation_min <= params.quantized_activation_max);
+  assert(params.quantized_activation_min >= 0);
+  assert(params.quantized_activation_max <= 255);
+  assert(input_shape.DimensionsCount() == 4);
+  assert(output_shape.DimensionsCount() == 4);
+  const int batches = MatchingDim(input_shape, 0, output_shape, 0);
+  const int depth = MatchingDim(input_shape, 3, output_shape, 3);
+  const int input_height = input_shape.Dims(1);
+  const int input_width = input_shape.Dims(2);
+  const int output_height = output_shape.Dims(1);
+  const int output_width = output_shape.Dims(2);
+  const int stride_height = params.stride_height;
+  const int stride_width = params.stride_width;
+  for (int batch = 0; batch < batches; ++batch)
+  {
+    for (int out_y = 0; out_y < output_height; ++out_y)
+    {
+      for (int out_x = 0; out_x < output_width; ++out_x)
+      {
+        for (int channel = 0; channel < depth; ++channel)
+        {
+          const int in_x_origin = (out_x * stride_width) - params.padding_values.width;
+          const int in_y_origin = (out_y * stride_height) - params.padding_values.height;
+          // Compute the boundaries of the filter region clamped so as to
+          // ensure that the filter window fits in the input array.
+          const int filter_x_start = std::max(0, -in_x_origin);
+          const int filter_x_end = std::min(params.filter_width, input_width - in_x_origin);
+          const int filter_y_start = std::max(0, -in_y_origin);
+          const int filter_y_end = std::min(params.filter_height, input_height - in_y_origin);
+          uint8_t max = 0;
+          for (int filter_y = filter_y_start; filter_y < filter_y_end; ++filter_y)
+          {
+            for (int filter_x = filter_x_start; filter_x < filter_x_end; ++filter_x)
+            {
+              const int in_x = in_x_origin + filter_x;
+              const int in_y = in_y_origin + filter_y;
+              max = std::max(max, input_data[Offset(input_shape, batch, in_y, in_x, channel)]);
+            }
+          }
+          max = std::max<uint8_t>(max, params.quantized_activation_min);
+          max = std::min<uint8_t>(max, params.quantized_activation_max);
+          output_data[Offset(output_shape, batch, out_y, out_x, channel)] =
+            static_cast<uint8_t>(max);
+        }
+      }
+    }
+  }
+}
+
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_MAX_POOL_H__
diff --git a/compute/cker/include/cker/operation/Pad.h b/compute/cker/include/cker/operation/Pad.h
new file mode 100644
index 000000000..af432f3a8
--- /dev/null
+++ b/compute/cker/include/cker/operation/Pad.h
@@ -0,0 +1,224 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_PAD_H__
+#define __NNFW_CKER_PAD_H__
+
+#include "cker/Shape.h"
+#include "cker/Types.h"
+#include "cker/Utils.h"
+#include <stdexcept>
+namespace nnfw
+{
+namespace cker
+{
+inline void Pad(const int32_t *padding_data, int32_t pad_rank, const Shape &input_shape,
+                const float *input_data, const Shape &output_shape, float *output_data,
+                const float *constant_value_data)
+{
+  // Note, this is pad with mode=`CONSTANT`: it doesn't support `REFLECT` and `SYMMETRIC`
+  // TODO: come up with a more subtle solution that uses subtensors, as the ARM Compute Library does
+  // TODO: Check if it works for all layouts
+
+  using PaddingInfo = std::pair<int32_t, int32_t>;
+  /** List of padding information */
+  using PaddingList = std::vector<PaddingInfo>;
+
+  auto constant_value = constant_value_data ? *constant_value_data : 0;
+  assert(output_shape.DimensionsCount() == input_shape.DimensionsCount());
+
+  PaddingList padding_list(pad_rank);
+  for (int32_t n = 0; n < pad_rank; ++n)
+  {
+    const int32_t *from = padding_data + (n * 2);
+    padding_list[n] = {from[0], from[1]};
+  }
+  for (int32_t i = 0; i < pad_rank; ++i)
+  {
+    assert(output_shape.Dims(i) ==
+           input_shape.Dims(i) + padding_list[i].first + padding_list[i].second);
+  }
+  /* Use pad_rank, since the given input/output shapes are expanded to 4D before
+     all cker functions are called:
+     1. to prevent an access violation in padding_list;
+     2. handling as 4D is slower than as 2D/3D.
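+
+     Example (illustrative): pad_rank = 2 with padding_data = {1, 1, 2, 2} pads a
+     3x4 input to a (1 + 3 + 1) x (2 + 4 + 2) = 5x8 output, with constant_value
+     written everywhere outside the copied input rows.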
+  */
+  switch (pad_rank)
+  {
+    case 0:
+    case 1:
+    {
+      const int32_t in_row_len = input_shape.Dims(0);
+      std::fill_n(output_data, padding_list[0].first, constant_value);
+      std::memcpy(output_data + padding_list[0].first, input_data, in_row_len * sizeof(float));
+      std::fill_n(output_data + padding_list[0].first + in_row_len, padding_list[0].second,
+                  constant_value);
+      break;
+    }
+    case 2: // HW
+    {
+      const int32_t in_row_len = input_shape.Dims(1);
+      const int32_t out_row_size = output_shape.Dims(1);
+
+      // prepend padding rows
+      std::fill_n(output_data, padding_list[0].first * out_row_size, constant_value);
+
+      const auto r_h_inp_lim = input_shape.Dims(0) + padding_list[0].first;
+      for (auto i = padding_list[0].first, j = 0; i < r_h_inp_lim; ++i, ++j)
+      {
+        auto out_offset = i * out_row_size;
+        const auto in_offset = j * in_row_len;
+
+        // prepend padding values
+        std::fill_n(output_data + out_offset, padding_list[1].first, constant_value);
+
+        out_offset += padding_list[1].first;
+
+        // copy a row of input data
+        memcpy(output_data + out_offset, input_data + in_offset, in_row_len * sizeof(float));
+
+        out_offset += in_row_len;
+
+        // append padding values
+        std::fill_n(output_data + out_offset, padding_list[1].second, constant_value);
+      }
+
+      // append padding rows
+      std::fill_n(output_data + r_h_inp_lim * out_row_size, padding_list[0].second * out_row_size,
+                  constant_value);
+      break;
+    }
+    case 3: // HWC
+    {
+      const int32_t in_row_len = input_shape.Dims(2);
+      const int32_t out_row_size = output_shape.Dims(2);
+      const auto plain_size = out_row_size * output_shape.Dims(1);
+
+      // prepend padding planes
+      std::fill_n(output_data, padding_list[0].first * plain_size, constant_value);
+
+      const auto r_h_inp_lim = input_shape.Dims(0) + padding_list[0].first;
+      for (auto i = padding_list[0].first, i_inp = 0; i < r_h_inp_lim; ++i, ++i_inp)
+      {
+        const auto out_w_offset = (i * output_shape.Dims(1) + 0) * output_shape.Dims(2);
+
+        // prepend padding rows
+        std::fill_n(output_data + out_w_offset, padding_list[1].first * out_row_size,
+                    constant_value);
+
+        const auto r_w_inp_lim = input_shape.Dims(1) + padding_list[1].first;
+        for (auto j = padding_list[1].first, j_inp = 0; j < r_w_inp_lim; ++j, ++j_inp)
+        {
+          auto out_offset = (i * output_shape.Dims(1) + j) * output_shape.Dims(2);
+          const auto in_offset = (i_inp * input_shape.Dims(1) + j_inp) * input_shape.Dims(2);
+
+          // prepend padding values
+          std::fill_n(output_data + out_offset, padding_list[2].first, constant_value);
+
+          out_offset += padding_list[2].first;
+
+          // copy a row of input data
+          memcpy(output_data + out_offset, input_data + in_offset, in_row_len * sizeof(float));
+
+          out_offset += in_row_len;
+
+          // append padding values
+          std::fill_n(output_data + out_offset, padding_list[2].second, constant_value);
+        }
+
+        // append padding rows
+        std::fill_n(output_data + out_w_offset + r_w_inp_lim * out_row_size,
+                    padding_list[1].second * out_row_size, constant_value);
+      }
+
+      // append padding planes
+      std::fill_n(output_data + r_h_inp_lim * plain_size, padding_list[0].second * plain_size,
+                  constant_value);
+      break;
+    }
+    case 4:
+    {
+      auto get_offset = [](const Shape &shape, int32_t n, int32_t h, int32_t w) -> int32_t {
+        return ((n * shape.Dims(1) + h) * shape.Dims(2) + w) * shape.Dims(3);
+      };
+      const int32_t in_row_len = input_shape.Dims(3);
+      const int32_t out_row_size = output_shape.Dims(3);
+      const auto plain_size = out_row_size * output_shape.Dims(2);
+      const auto parallelepiped_size = plain_size * output_shape.Dims(1);
+
+      // prepend padding parallelepipeds
+      std::fill_n(output_data, padding_list[0].first * parallelepiped_size, constant_value);
+
+      const auto r_b_inp_lim = input_shape.Dims(0) + padding_list[0].first;
+      for (auto i = padding_list[0].first, i_inp = 0; i < r_b_inp_lim; ++i, ++i_inp)
+      {
+        const auto out_h_offset = get_offset(output_shape, i, 0, 0);
+        // prepend padding planes
+        std::fill_n(output_data + out_h_offset, padding_list[1].first * plain_size, constant_value);
+
+        const auto r_h_inp_lim = input_shape.Dims(1) + padding_list[1].first;
+        for (auto j = padding_list[1].first, j_inp = 0; j < r_h_inp_lim; ++j, ++j_inp)
+        {
+          const auto out_w_offset = get_offset(output_shape, i, j, 0);
+
+          // prepend padding rows
+          std::fill_n(output_data + out_w_offset, padding_list[2].first * out_row_size,
+                      constant_value);
+
+          const auto r_w_inp_lim = input_shape.Dims(2) + padding_list[2].first;
+          for (auto k = padding_list[2].first, k_inp = 0; k < r_w_inp_lim; ++k, ++k_inp)
+          {
+            auto out_c_offset = get_offset(output_shape, i, j, k);
+            const auto in_offset = get_offset(input_shape, i_inp, j_inp, k_inp);
+
+            // prepend padding values
+            std::fill_n(output_data + out_c_offset, padding_list[3].first, constant_value);
+
+            out_c_offset += padding_list[3].first;
+
+            // copy a row of input data
+            memcpy(output_data + out_c_offset, input_data + in_offset, in_row_len * sizeof(float));
+
+            out_c_offset += in_row_len;
+
+            // append padding values
+            std::fill_n(output_data + out_c_offset, padding_list[3].second, constant_value);
+          }
+
+          // append padding rows
+          std::fill_n(output_data + out_w_offset + r_w_inp_lim * out_row_size,
+                      padding_list[2].second * out_row_size, constant_value);
+        }
+
+        // append padding planes
+        std::fill_n(output_data + out_h_offset + r_h_inp_lim * plain_size,
+                    padding_list[1].second * plain_size, constant_value);
+      }
+      // append padding parallelepipeds
+      std::fill_n(output_data + r_b_inp_lim * parallelepiped_size,
+                  padding_list[0].second * parallelepiped_size, constant_value);
+      break;
+    }
+    default:
+      throw std::runtime_error("Padding for rank > 4 NYI");
+      break;
+  }
+}
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_PAD_H__
diff --git a/compute/cker/include/cker/operation/SoftMax.h b/compute/cker/include/cker/operation/SoftMax.h
new file mode 100644
index 000000000..ea404a002
--- /dev/null
+++ b/compute/cker/include/cker/operation/SoftMax.h
@@ -0,0 +1,130 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_SOFTMAX_H__
+#define __NNFW_CKER_SOFTMAX_H__
+
+#include "cker/Shape.h"
+#include "cker/Utils.h"
+#include "cker/Types.h"
+#include "cker/gemmlowp/FixedPoint.h"
+#include "cker/operation/optimized/SoftMax.h"
+#include "cker/operation/reference/SoftMax.h"
+
+#include <cmath>
+
+namespace nnfw
+{
+namespace cker
+{
+
+inline void Softmax(const SoftmaxParams &params, const Shape &input_shape, const float *input_data,
+                    const Shape &output_shape, float *output_data)
+{
+#if defined(CKER_OPTIMIZED_EIGEN)
+  optimized::Softmax(params, input_shape, input_data, output_shape, output_data);
+#else // defined(CKER_OPTIMIZED_EIGEN)
+  reference::Softmax(params, input_shape, input_data, output_shape, output_data);
+#endif // defined(CKER_OPTIMIZED_EIGEN)
+}
+
+inline void Softmax(const SoftmaxParams &params, const Shape &input_shape,
+                    const uint8_t *input_data, const Shape &output_shape, uint8_t *output_data)
+{
+  const int32_t input_beta_multiplier = params.input_multiplier;
+  const int32_t input_beta_left_shift = params.input_left_shift;
+  const int diff_min = params.diff_min;
+  // The representation chosen for the input to the exp() function is Q5.26.
+  // We need to leave extra space since values that we skip might be as large as
+  // -32 before multiplying by input_beta_multiplier, and therefore as large as
+  // -16 afterwards. Note that exp(-8) is definitely not insignificant to
+  // accumulation, but exp(-16) definitely is.
+  static const int kScaledDiffIntegerBits = 5;
+  static const int kAccumulationIntegerBits = 12;
+  using FixedPointScaledDiff = gemmlowp::FixedPoint<kScaledDiffIntegerBits>;
+  using FixedPointAccum = gemmlowp::FixedPoint<kAccumulationIntegerBits>;
+  using FixedPoint0 = gemmlowp::FixedPoint<0>;
+
+  const int trailing_dim = input_shape.DimensionsCount() - 1;
+  const int outer_size = MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape);
+  const int depth = MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim);
+
+  for (int i = 0; i < outer_size; ++i)
+  {
+    uint8_t max_in_row = 0;
+    for (int c = 0; c < depth; ++c)
+    {
+      max_in_row = std::max(max_in_row, input_data[i * depth + c]);
+    }
+
+    FixedPointAccum sum_of_exps = FixedPointAccum::Zero();
+    for (int c = 0; c < depth; ++c)
+    {
+      int32_t input_diff = static_cast<int32_t>(input_data[i * depth + c]) - max_in_row;
+      if (input_diff >= diff_min)
+      {
+        const int32_t input_diff_rescaled = MultiplyByQuantizedMultiplierGreaterThanOne(
+          input_diff, input_beta_multiplier, input_beta_left_shift);
+        const FixedPointScaledDiff scaled_diff_f8 =
+          FixedPointScaledDiff::FromRaw(input_diff_rescaled);
+        sum_of_exps = sum_of_exps + gemmlowp::Rescale<kAccumulationIntegerBits>(
+                                      exp_on_negative_values(scaled_diff_f8));
+      }
+    }
+
+    int32_t fixed_sum_of_exps = sum_of_exps.raw();
+    int headroom_plus_one = CountLeadingZeros(static_cast<uint32_t>(fixed_sum_of_exps));
+    // This is the number of bits to the left of the binary point above 1.0.
+    // Consider fixed_sum_of_exps=1.25. In that case shifted_scale=0.8 and
+    // no later adjustment will be needed.
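+    // With kAccumulationIntegerBits = 12, a raw sum of 1.25 (stored as Q12.19)
+    // has 12 leading zero bits, so num_bits_over_unit = 0 below and
+    // shifted_sum_minus_one keeps only the 0.25 above 1.0; shifted_scale then
+    // evaluates to 1 / 1.25 = 0.8.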
+    int num_bits_over_unit = kAccumulationIntegerBits - headroom_plus_one;
+    int32_t shifted_sum_minus_one =
+      static_cast<int32_t>((static_cast<uint32_t>(fixed_sum_of_exps) << headroom_plus_one) -
+                           (static_cast<uint32_t>(1) << 31));
+
+    FixedPoint0 shifted_scale =
+      one_over_one_plus_x_for_x_in_0_1(FixedPoint0::FromRaw(shifted_sum_minus_one));
+
+    for (int c = 0; c < depth; ++c)
+    {
+      int32_t input_diff = static_cast<int32_t>(input_data[i * depth + c]) - max_in_row;
+      if (input_diff >= diff_min)
+      {
+        const int32_t input_diff_rescaled = MultiplyByQuantizedMultiplierGreaterThanOne(
+          input_diff, input_beta_multiplier, input_beta_left_shift);
+        const FixedPointScaledDiff scaled_diff_f8 =
+          FixedPointScaledDiff::FromRaw(input_diff_rescaled);
+
+        FixedPoint0 exp_in_0 = exp_on_negative_values(scaled_diff_f8);
+        int32_t unsat_output = gemmlowp::RoundingDivideByPOT((shifted_scale * exp_in_0).raw(),
+                                                             num_bits_over_unit + 31 - 8);
+
+        output_data[i * depth + c] = static_cast<uint8_t>(
+          std::max(std::min(unsat_output, static_cast<int32_t>(255)), static_cast<int32_t>(0)));
+      }
+      else
+      {
+        output_data[i * depth + c] = 0;
+      }
+    }
+  }
+}
+
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_SOFTMAX_H__
diff --git a/compute/cker/include/cker/operation/TransposeConv.h b/compute/cker/include/cker/operation/TransposeConv.h
new file mode 100644
index 000000000..535fe86cf
--- /dev/null
+++ b/compute/cker/include/cker/operation/TransposeConv.h
@@ -0,0 +1,135 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_TRANSPOSE_CONV_H__
+#define __NNFW_CKER_TRANSPOSE_CONV_H__
+
+#include "cker/Shape.h"
+#include "cker/Types.h"
+#include "cker/Utils.h"
+
+namespace nnfw
+{
+namespace cker
+{
+
+struct TransposeConvParams
+{
+  PaddingType padding_type;
+  PaddingValues padding_values;
+  // TODO(starka): This was just "stride", so check that width+height is OK.
+  int16_t stride_width;
+  int16_t stride_height;
+  int16_t dilation_width_factor;
+  int16_t dilation_height_factor;
+  // uint8_t inference params.
+  // TODO(b/65838351): Use smaller types if appropriate.
+  int32_t input_offset;
+  int32_t weights_offset;
+  int32_t output_offset;
+  int32_t output_multiplier;
+  int output_shift;
+  // uint8_t, etc, activation params.
+  int32_t quantized_activation_min;
+  int32_t quantized_activation_max;
+  // float activation params.
+  float float_activation_min;
+  float float_activation_max;
+};
+
+inline void TransposeConv(const TransposeConvParams &params, const Shape &input_shape,
+                          const float *input_data, const Shape &filter_shape,
+                          const float *filter_data, const Shape &output_shape, float *output_data)
+{
+
+  const int stride_width = params.stride_width;
+  const int stride_height = params.stride_height;
+  const int pad_width = params.padding_values.width;
+  const int pad_height = params.padding_values.height;
+
+  assert(input_shape.DimensionsCount() == 4);
+  assert(filter_shape.DimensionsCount() == 4);
+  assert(output_shape.DimensionsCount() == 4);
+
+  const int batches = MatchingDim(input_shape, 0, output_shape, 0);
+  const int input_depth = MatchingDim(input_shape, 3, filter_shape, 3);
+  const int output_depth = MatchingDim(filter_shape, 0, output_shape, 3);
+  const int input_height = input_shape.Dims(1);
+  const int input_width = input_shape.Dims(2);
+  const int filter_height = filter_shape.Dims(1);
+  const int filter_width = filter_shape.Dims(2);
+  const int output_height = output_shape.Dims(1);
+  const int output_width = output_shape.Dims(2);
+
+  // Although transpose convolution simplifies to convolution with transposed
+  // weights for strides of 1, non-unitary striding complicates matters. To
+  // keep this reference implementation as clear as possible, we use a
+  // "scatter" access pattern, where we loop through all the input elements,
+  // computing their influence on the output, rather than looping through the
+  // output elements in the typical "gather" access pattern of a conv. We
+  // therefore must initialize the output array to zero.
+  const int num_elements = output_shape.FlatSize();
+  for (int i = 0; i < num_elements; i++)
+  {
+    output_data[i] = 0.0f;
+  }
+
+  // Loop through input elements one at a time.
+  for (int batch = 0; batch < batches; ++batch)
+  {
+    for (int in_y = 0; in_y < input_height; ++in_y)
+    {
+      for (int in_x = 0; in_x < input_width; ++in_x)
+      {
+        for (int in_channel = 0; in_channel < input_depth; ++in_channel)
+        {
+          // Loop through the output elements it will influence
+          const int out_x_origin = (in_x * stride_width) - pad_width;
+          const int out_y_origin = (in_y * stride_height) - pad_height;
+          for (int filter_y = 0; filter_y < filter_height; ++filter_y)
+          {
+            for (int filter_x = 0; filter_x < filter_width; ++filter_x)
+            {
+              for (int out_channel = 0; out_channel < output_depth; ++out_channel)
+              {
+                // Compute output element location
+                const int out_x = out_x_origin + filter_x;
+                const int out_y = out_y_origin + filter_y;
+                // We cannot accumulate out of bounds
+                if ((out_x >= 0) && (out_x < output_width) && (out_y >= 0) &&
+                    (out_y < output_height))
+                {
+                  float input_value =
+                    input_data[Offset(input_shape, batch, in_y, in_x, in_channel)];
+                  float filter_value =
+                    filter_data[Offset(filter_shape, out_channel, filter_y, filter_x, in_channel)];
+                  output_data[Offset(output_shape, batch, out_y, out_x, out_channel)] +=
+                    input_value * filter_value;
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_TRANSPOSE_CONV_H__
diff --git a/compute/cker/include/cker/operation/optimized/AveragePool.h b/compute/cker/include/cker/operation/optimized/AveragePool.h
new file mode 100644
index 000000000..d94a5811a
--- /dev/null
+++ b/compute/cker/include/cker/operation/optimized/AveragePool.h
@@ -0,0 +1,105 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_OPTIMIZED_AVERAGE_POOL_H__
+#define __NNFW_CKER_OPTIMIZED_AVERAGE_POOL_H__
+
+#if defined(CKER_OPTIMIZED_EIGEN)
+
+#include "cker/eigen/Utils.h"
+#include "cker/Shape.h"
+#include "cker/Types.h"
+#include "cker/Utils.h"
+#include <Eigen/Core>
+
+namespace nnfw
+{
+namespace cker
+{
+namespace optimized
+{
+
+// TODO Change to apply neon for this function if it is faster
+inline void AveragePool(const PoolParams &params, const Shape &input_shape, const float *input_data,
+                        const Shape &output_shape, float *output_data)
+{
+  assert(input_shape.DimensionsCount() == 4);
+  assert(output_shape.DimensionsCount() == 4);
+  const int batches = MatchingDim(input_shape, 0, output_shape, 0);
+  const int input_height = input_shape.Dims(1);
+  const int input_width = input_shape.Dims(2);
+  const int output_height = output_shape.Dims(1);
+  const int output_width = output_shape.Dims(2);
+  const int stride_height = params.stride_height;
+  const int stride_width = params.stride_width;
+
+  // TODO(benoitjacob) make this a proper reference impl without Eigen!
+  const auto in_mat = MapAsMatrixWithLastDimAsRows(input_data, input_shape);
+  auto out_mat = MapAsMatrixWithLastDimAsRows(output_data, output_shape);
+  // TODO(benoitjacob) get rid of the dynamic memory allocation here!
+  Eigen::VectorXf out_count(out_mat.cols());
+  out_count.setZero();
+  // Prefill the output to 0.
+  out_mat.setZero();
+  for (int b = 0; b < batches; ++b)
+  {
+    for (int h = 0; h < input_height; ++h)
+    {
+      for (int w = 0; w < input_width; ++w)
+      {
+        // (h_start, h_end) * (w_start, w_end) is the range that the input
+        // vector projects to.
+        int hpad = h + params.padding_values.height;
+        int wpad = w + params.padding_values.width;
+        int h_start =
+          (hpad < params.filter_height) ? 0 : (hpad - params.filter_height) / stride_height + 1;
+        int h_end = std::min(hpad / stride_height + 1, output_height);
+        int w_start =
+          (wpad < params.filter_width) ? 0 : (wpad - params.filter_width) / stride_width + 1;
+        int w_end = std::min(wpad / stride_width + 1, output_width);
+        // compute elementwise sum
+        for (int ph = h_start; ph < h_end; ++ph)
+        {
+          for (int pw = w_start; pw < w_end; ++pw)
+          {
+            int out_offset = NodeOffset(b, ph, pw, output_height, output_width);
+            out_mat.col(out_offset) += in_mat.col(NodeOffset(b, h, w, input_height, input_width));
+            out_count(out_offset)++;
+          }
+        }
+      }
+    }
+  }
+  // Divide the output by the actual number of elements being averaged over
+  assert(out_count.minCoeff() > 0);
+  out_mat.array().rowwise() /= out_count.transpose().array();
+
+  const int flat_size = output_shape.FlatSize();
+  for (int i = 0; i < flat_size; ++i)
+  {
+    output_data[i] = ActivationFunctionWithMinMax(output_data[i], params.float_activation_min,
+                                                  params.float_activation_max);
+  }
+}
+
+} // namespace optimized
+} // namespace cker
+} // namespace nnfw
+
+#endif // defined(CKER_OPTIMIZED_EIGEN)
+
+#endif // __NNFW_CKER_OPTIMIZED_AVERAGE_POOL_H__
diff --git a/compute/cker/include/cker/operation/optimized/MaxPool.h b/compute/cker/include/cker/operation/optimized/MaxPool.h
new file mode 100644
index 000000000..07a14aee4
--- /dev/null
+++ b/compute/cker/include/cker/operation/optimized/MaxPool.h
@@ -0,0 +1,97 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_OPTIMIZED_MAX_POOL_H__
+#define __NNFW_CKER_OPTIMIZED_MAX_POOL_H__
+
+#if defined(CKER_OPTIMIZED_EIGEN)
+#include "cker/eigen/Utils.h"
+#include "cker/Shape.h"
+#include "cker/Types.h"
+#include "cker/Utils.h"
+#include <Eigen/Core>
+
+namespace nnfw
+{
+namespace cker
+{
+namespace optimized
+{
+
+// TODO Change to apply neon for this function if it is faster
+inline void MaxPool(const PoolParams &params, const Shape &input_shape, const float *input_data,
+                    const Shape &output_shape, float *output_data)
+{
+  assert(input_shape.DimensionsCount() == 4);
+  assert(output_shape.DimensionsCount() == 4);
+  const int batches = MatchingDim(input_shape, 0, output_shape, 0);
+  const int input_height = input_shape.Dims(1);
+  const int input_width = input_shape.Dims(2);
+  const int output_height = output_shape.Dims(1);
+  const int output_width = output_shape.Dims(2);
+  const int stride_height = params.stride_height;
+  const int stride_width = params.stride_width;
+
+  const auto in_mat = MapAsMatrixWithLastDimAsRows(input_data, input_shape);
+  auto out_mat = MapAsMatrixWithLastDimAsRows(output_data, output_shape);
+  // Prefill the output to minimum representable float value
+  out_mat.setConstant(std::numeric_limits<float>::lowest());
+  for (int b = 0; b < batches; ++b)
+  {
+    for (int h = 0; h < input_height; ++h)
+    {
+      for (int w = 0; w < input_width; ++w)
+      {
+        // (h_start, h_end) * (w_start, w_end) is the range that the input
+        // vector projects to.
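+        // E.g. with filter 3, stride 2, pad 1: input row h = 4 has hpad = 5, so
+        // h_start = (5 - 3) / 2 + 1 = 2 and h_end = min(5 / 2 + 1, output_height);
+        // only output row 2 reads this input row.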
+        int hpad = h + params.padding_values.height;
+        int wpad = w + params.padding_values.width;
+        int h_start =
+          (hpad < params.filter_height) ? 0 : (hpad - params.filter_height) / stride_height + 1;
+        int h_end = std::min(hpad / stride_height + 1, output_height);
+        int w_start =
+          (wpad < params.filter_width) ? 0 : (wpad - params.filter_width) / stride_width + 1;
+        int w_end = std::min(wpad / stride_width + 1, output_width);
+        // compute elementwise max
+        for (int ph = h_start; ph < h_end; ++ph)
+        {
+          for (int pw = w_start; pw < w_end; ++pw)
+          {
+            int out_offset = NodeOffset(b, ph, pw, output_height, output_width);
+            out_mat.col(out_offset) =
+              out_mat.col(out_offset)
+                .cwiseMax(in_mat.col(NodeOffset(b, h, w, input_height, input_width)));
+          }
+        }
+      }
+    }
+  }
+  const int flat_size = output_shape.FlatSize();
+  for (int i = 0; i < flat_size; ++i)
+  {
+    output_data[i] = ActivationFunctionWithMinMax(output_data[i], params.float_activation_min,
+                                                  params.float_activation_max);
+  }
+}
+
+} // namespace optimized
+} // namespace cker
+} // namespace nnfw
+
+#endif // defined(CKER_OPTIMIZED_EIGEN)
+
+#endif // __NNFW_CKER_OPTIMIZED_MAX_POOL_H__
diff --git a/compute/cker/include/cker/operation/optimized/SoftMax.h b/compute/cker/include/cker/operation/optimized/SoftMax.h
new file mode 100644
index 000000000..e44f251d0
--- /dev/null
+++ b/compute/cker/include/cker/operation/optimized/SoftMax.h
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_OPTIMIZED_SOFTMAX_H__
+#define __NNFW_CKER_OPTIMIZED_SOFTMAX_H__
+
+#if defined(CKER_OPTIMIZED_EIGEN)
+
+#include "cker/eigen/Utils.h"
+#include "cker/Shape.h"
+#include "cker/Types.h"
+#include <Eigen/Core>
+
+namespace nnfw
+{
+namespace cker
+{
+namespace optimized
+{
+
+inline void Softmax(const SoftmaxParams &params, const Shape &input_shape, const float *input_data,
+                    const Shape &output_shape, float *output_data)
+{
+  // Validate that the shapes of the input and output are the same.
+  MatchingFlatSize(input_shape, output_shape);
+
+  const auto in_mat = MapAsMatrixWithLastDimAsRows(input_data, input_shape);
+  auto out_mat = MapAsMatrixWithLastDimAsRows(output_data, output_shape);
+  // Compute the exponential first, removing the max coefficient for numerical
+  // stability.
+  out_mat = (in_mat.rowwise() - in_mat.colwise().maxCoeff()).array() * params.beta;
+  // We are separating out the exp function so that exp can be vectorized.
+  out_mat = out_mat.array().exp();
+  // Normalize to get the activations.
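+  // 'scale' below holds the reciprocal of each column's sum (one column per
+  // outer batch element), so after the multiply every column sums to one.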
+  Eigen::Array<float, 1, Eigen::Dynamic> scale = out_mat.array().colwise().sum().inverse();
+  out_mat.array().rowwise() *= scale;
+}
+
+} // namespace optimized
+} // namespace cker
+} // namespace nnfw
+
+#endif // defined(CKER_OPTIMIZED_EIGEN)
+
+#endif // __NNFW_CKER_OPTIMIZED_SOFTMAX_H__
diff --git a/compute/cker/include/cker/operation/reference/AveragePool.h b/compute/cker/include/cker/operation/reference/AveragePool.h
new file mode 100644
index 000000000..3ddab4b24
--- /dev/null
+++ b/compute/cker/include/cker/operation/reference/AveragePool.h
@@ -0,0 +1,90 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_REFERENCE_AVERAGE_POOL_H__
+#define __NNFW_CKER_REFERENCE_AVERAGE_POOL_H__
+
+#include "cker/Shape.h"
+#include "cker/Types.h"
+#include "cker/Utils.h"
+
+namespace nnfw
+{
+namespace cker
+{
+namespace reference
+{
+
+inline void AveragePool(const PoolParams &params, const Shape &input_shape, const float *input_data,
+                        const Shape &output_shape, float *output_data)
+{
+  assert(input_shape.DimensionsCount() == 4);
+  assert(output_shape.DimensionsCount() == 4);
+  const int batches = MatchingDim(input_shape, 0, output_shape, 0);
+  const int depth = MatchingDim(input_shape, 3, output_shape, 3);
+  const int input_height = input_shape.Dims(1);
+  const int input_width = input_shape.Dims(2);
+  const int output_height = output_shape.Dims(1);
+  const int output_width = output_shape.Dims(2);
+  const int stride_height = params.stride_height;
+  const int stride_width = params.stride_width;
+  for (int batch = 0; batch < batches; ++batch)
+  {
+    for (int out_y = 0; out_y < output_height; ++out_y)
+    {
+      for (int out_x = 0; out_x < output_width; ++out_x)
+      {
+        const int in_x_origin = (out_x * stride_width) - params.padding_values.width;
+        const int in_y_origin = (out_y * stride_height) - params.padding_values.height;
+        // Compute the boundaries of the filter region clamped so as to
+        // ensure that the filter window fits in the input array.
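+        // E.g. in_x_origin = -2 with filter_width 5 clamps the x taps to [2, 5),
+        // so only in-bounds columns 0..2 enter the sum and the divisor counts
+        // three columns instead of five.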
+        const int filter_x_start = std::max(0, -in_x_origin);
+        const int filter_x_end = std::min(params.filter_width, input_width - in_x_origin);
+        const int filter_y_start = std::max(0, -in_y_origin);
+        const int filter_y_end = std::min(params.filter_height, input_height - in_y_origin);
+        int filter_count = (filter_y_end - filter_y_start) * (filter_x_end - filter_x_start);
+        if (filter_count <= 0)
+        {
+          continue;
+        }
+        for (int channel = 0; channel < depth; ++channel)
+        {
+          float total = 0.f;
+          for (int filter_y = filter_y_start; filter_y < filter_y_end; ++filter_y)
+          {
+            for (int filter_x = filter_x_start; filter_x < filter_x_end; ++filter_x)
+            {
+              const int in_x = in_x_origin + filter_x;
+              const int in_y = in_y_origin + filter_y;
+              total += input_data[Offset(input_shape, batch, in_y, in_x, channel)];
+            }
+          }
+          const float average = total / static_cast<float>(filter_count);
+          output_data[Offset(output_shape, batch, out_y, out_x, channel)] =
+            ActivationFunctionWithMinMax(average, params.float_activation_min,
+                                         params.float_activation_max);
+        }
+      }
+    }
+  }
+}
+
+} // namespace reference
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_REFERENCE_AVERAGE_POOL_H__
diff --git a/compute/cker/include/cker/operation/reference/MaxPool.h b/compute/cker/include/cker/operation/reference/MaxPool.h
new file mode 100644
index 000000000..a0f0263c7
--- /dev/null
+++ b/compute/cker/include/cker/operation/reference/MaxPool.h
@@ -0,0 +1,84 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_REFERENCE_MAX_POOL_H__
+#define __NNFW_CKER_REFERENCE_MAX_POOL_H__
+
+#include "cker/Shape.h"
+#include "cker/Types.h"
+#include "cker/Utils.h"
+
+namespace nnfw
+{
+namespace cker
+{
+namespace reference
+{
+
+inline void MaxPool(const PoolParams &params, const Shape &input_shape, const float *input_data,
+                    const Shape &output_shape, float *output_data)
+{
+  assert(input_shape.DimensionsCount() == 4);
+  assert(output_shape.DimensionsCount() == 4);
+  const int batches = MatchingDim(input_shape, 0, output_shape, 0);
+  const int depth = MatchingDim(input_shape, 3, output_shape, 3);
+  const int input_height = input_shape.Dims(1);
+  const int input_width = input_shape.Dims(2);
+  const int output_height = output_shape.Dims(1);
+  const int output_width = output_shape.Dims(2);
+  const int stride_height = params.stride_height;
+  const int stride_width = params.stride_width;
+  for (int batch = 0; batch < batches; ++batch)
+  {
+    for (int out_y = 0; out_y < output_height; ++out_y)
+    {
+      for (int out_x = 0; out_x < output_width; ++out_x)
+      {
+        for (int channel = 0; channel < depth; ++channel)
+        {
+          const int in_x_origin = (out_x * stride_width) - params.padding_values.width;
+          const int in_y_origin = (out_y * stride_height) - params.padding_values.height;
+          // Compute the boundaries of the filter region clamped so as to
+          // ensure that the filter window fits in the input array.
+          const int filter_x_start = std::max(0, -in_x_origin);
+          const int filter_x_end = std::min(params.filter_width, input_width - in_x_origin);
+          const int filter_y_start = std::max(0, -in_y_origin);
+          const int filter_y_end = std::min(params.filter_height, input_height - in_y_origin);
+          float max = std::numeric_limits<float>::lowest();
+          for (int filter_y = filter_y_start; filter_y < filter_y_end; ++filter_y)
+          {
+            for (int filter_x = filter_x_start; filter_x < filter_x_end; ++filter_x)
+            {
+              const int in_x = in_x_origin + filter_x;
+              const int in_y = in_y_origin + filter_y;
+              max = std::max(max, input_data[Offset(input_shape, batch, in_y, in_x, channel)]);
+            }
+          }
+          output_data[Offset(output_shape, batch, out_y, out_x, channel)] =
+            ActivationFunctionWithMinMax(max, params.float_activation_min,
+                                         params.float_activation_max);
+        }
+      }
+    }
+  }
+}
+
+} // namespace reference
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_REFERENCE_MAX_POOL_H__
diff --git a/compute/cker/include/cker/operation/reference/SoftMax.h b/compute/cker/include/cker/operation/reference/SoftMax.h
new file mode 100644
index 000000000..420cb319b
--- /dev/null
+++ b/compute/cker/include/cker/operation/reference/SoftMax.h
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_REFERENCE_SOFTMAX_H__
+#define __NNFW_CKER_REFERENCE_SOFTMAX_H__
+
+#include "cker/Shape.h"
+#include "cker/Types.h"
+
+#include <cmath>
+
+namespace nnfw
+{
+namespace cker
+{
+namespace reference
+{
+
+inline void Softmax(const SoftmaxParams &params, const Shape &input_shape, const float *input_data,
+                    const Shape &output_shape, float *output_data)
+{
+  const int trailing_dim = input_shape.DimensionsCount() - 1;
+  const int outer_size = MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape);
+  const int depth = MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim);
+
+  for (int i = 0; i < outer_size; ++i)
+  {
+    // Find max element value which we'll use to ensure numerical stability
+    // taking advantage of the following equality:
+    // exp(x[i])/sum(exp(x[i])) == exp(x[i]+C)/sum(exp(x[i]+C))
+    float max = std::numeric_limits<float>::lowest();
+    for (int c = 0; c < depth; ++c)
+    {
+      max = std::max(max, input_data[i * depth + c]);
+    }
+
+    // Compute sum.
+    float sum = 0.f;
+    for (int c = 0; c < depth; ++c)
+    {
+      sum += std::exp((input_data[i * depth + c] - max) * params.beta);
+    }
+
+    // Compute result.
+    for (int c = 0; c < depth; ++c)
+    {
+      output_data[i * depth + c] = std::exp((input_data[i * depth + c] - max) * params.beta) / sum;
+    }
+  }
+}
+
+} // namespace reference
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_REFERENCE_SOFTMAX_H__