Diffstat (limited to 'compute/ARMComputeEx/src/core/NEON/kernels')
13 files changed, 3644 insertions, 0 deletions
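The first diff below adds NEBinaryLogicalOperationKernel, a NEON kernel for element-wise AND/OR on U8/QASYMM8 tensors. As a hedged reference for the semantics it vectorizes (the enum name mirrors the kernel's; the function itself is hypothetical and not part of the patch):

    #include <cstddef>
    #include <cstdint>

    enum class BinaryLogicalOperation { AND, OR };

    // Scalar equivalent of the kernel's per-element work: the result is a
    // plain bitwise AND/OR on uint8_t values.
    template <BinaryLogicalOperation op>
    void binary_logical(const uint8_t *a, const uint8_t *b, uint8_t *out, size_t n)
    {
      for (size_t i = 0; i < n; ++i)
        out[i] = static_cast<uint8_t>(op == BinaryLogicalOperation::AND ? (a[i] & b[i])
                                                                        : (a[i] | b[i]));
    }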
diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEBinaryLogicalOperationKernel.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEBinaryLogicalOperationKernel.cpp new file mode 100644 index 000000000..d2f42de53 --- /dev/null +++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEBinaryLogicalOperationKernel.cpp @@ -0,0 +1,237 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2018-2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/NEON/kernels/NEBinaryLogicalOperationKernel.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/NEON/wrapper/wrapper.h" +#include "arm_compute/core/NEON/NEElementwiseOperationFuncs.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Validate.h" + +#include <algorithm> +#include <arm_neon.h> +#include <map> +#include <string> + +namespace arm_compute +{ +class Coordinates; +} // namespace arm_compute + +namespace arm_compute +{ + +template <BinaryLogicalOperation op, typename ScalarType> +inline ScalarType elementwise_logic_op_scalar(const ScalarType &a, const ScalarType &b) +{ + auto res = ScalarType(0); + + switch (op) + { + case BinaryLogicalOperation::AND: + res = a & b; + break; + case BinaryLogicalOperation::OR: + res = a | b; + break; + default: + ARM_COMPUTE_ERROR("NOT_SUPPORTED!"); + } + return res; +} + +template <BinaryLogicalOperation op, typename VectorType> +inline VectorType elementwise_logic_op(const VectorType &a, const VectorType &b) +{ + VectorType res = {0, 0, 0, 0}; + + switch (op) + { + case BinaryLogicalOperation::AND: + res = wrapper::vand(a, b); + break; + case BinaryLogicalOperation::OR: + res = wrapper::vorr(a, b); + break; + default: + ARM_COMPUTE_ERROR("NOT_SUPPORTED!"); + } + return res; +} + +template <BinaryLogicalOperation op> +inline uint8x16x4_t elementwise_logic_op(const uint8x16x4_t &a, const uint8x16x4_t &b) +{ + uint8x16x4_t out = {{ + elementwise_logic_op<op>(a.val[0], b.val[0]), elementwise_logic_op<op>(a.val[1], b.val[1]), + elementwise_logic_op<op>(a.val[2], b.val[2]), elementwise_logic_op<op>(a.val[3], b.val[3]), + }}; + return out; +} + +template <BinaryLogicalOperation op, typename ScalarType, typename VectorType> +inline VectorType elementwise_logic_op_broadcast(const VectorType &a, + const ScalarType &broadcast_value, + const bool reorder) +{ + 
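// The scalar operand is splatted across a 128-bit vector; `reorder` selects
// whether the broadcast value is passed as the first or the second operand
// (immaterial for the commutative AND/OR, but kept for generality).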
VectorType broadcast_vector = wrapper::vdup_n(broadcast_value, wrapper::traits::vector_128_tag()); + return elementwise_logic_op<op>(reorder ? broadcast_vector : a, reorder ? a : broadcast_vector); +} + +template <BinaryLogicalOperation op, typename ScalarType, typename VectorType> +inline int elementwise_logic_op_loop(int window_start_x, int window_end_x, int window_step_x, + const ScalarType *input1_ptr, const ScalarType *input2_ptr, + ScalarType *output_ptr) +{ + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto a = wrapper::vloadq(input1_ptr + x); + const auto b = wrapper::vloadq(input2_ptr + x); + wrapper::vstore(output_ptr + x, elementwise_logic_op<op>(a, b)); + } + return x; +} + +template <BinaryLogicalOperation op, typename ScalarType, typename VectorType> +inline int elementwise_logic_op_broadcast_loop(int window_start_x, int window_end_x, + int window_step_x, + const ScalarType *non_broadcast_input_ptr, + const ScalarType &broadcast_value, + ScalarType *output_ptr, const bool reorder) +{ + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto a = wrapper::vloadq((non_broadcast_input_ptr + x)); + wrapper::vstore(output_ptr + x, + elementwise_logic_op_broadcast<op>(a, broadcast_value, reorder)); + } + return x; +} + +template <BinaryLogicalOperation op, typename ScalarType, typename VectorType> +void elementwise_logic_op(const ITensor *in1, const ITensor *in2, ITensor *out, + const Window &window) +{ + elementwise_op(in1, in2, out, window, &elementwise_logic_op_scalar<op, ScalarType>, + &elementwise_logic_op_broadcast_loop<op, ScalarType, VectorType>, + &elementwise_logic_op_loop<op, ScalarType, VectorType>); +} + +std::function<void(const ITensor *, const ITensor *, ITensor *, const Window &)> configure_func( + const ITensor *input1, const ITensor *input2, ITensor *output, + std::map<std::string, NEElementwiseOperationKernel::ElementwiseFunction *> map_function) +{ + std::string function_to_call("op_"); + function_to_call += string_from_data_type(input1->info()->data_type()) + "_"; + function_to_call += string_from_data_type(input2->info()->data_type()) + "_"; + function_to_call += string_from_data_type(output->info()->data_type()); + + auto it = map_function.find(function_to_call); + + if (it != map_function.end()) + { + auto func = it->second; + return [func](const ITensor *input1, const ITensor *input2, ITensor *output, + const Window &window) { func(input1, input2, output, window); }; + } + return nullptr; +} + +template <BinaryLogicalOperation op> +std::function<void(const ITensor *, const ITensor *, ITensor *, const Window &)> +configure_logic_func(const ITensor *input1, const ITensor *input2, ITensor *output) +{ + static std::map<std::string, NEElementwiseOperationKernel::ElementwiseFunction *> map_function = { + {"op_U8_U8_U8", &elementwise_logic_op<op, uint8_t, uint8x16_t>}, + {"op_QASYMM8_QASYMM8_QASYMM8", &elementwise_logic_op<op, uint8_t, uint8x16_t>}}; + + return configure_func(input1, input2, output, map_function); +} + +void NEBinaryLogicalOperationKernel::configure(BinaryLogicalOperation op, const ITensor *input1, + const ITensor *input2, ITensor *output) +{ + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*input1->info(), *input2->info(), *output->info())); + configure_common(input1, input2, output); + switch (op) + { + case BinaryLogicalOperation::AND: + _function = configure_logic_func<BinaryLogicalOperation::AND>(input1, input2, output); + break; + case 
BinaryLogicalOperation::OR: + _function = configure_logic_func<BinaryLogicalOperation::OR>(input1, input2, output); + break; + default: + ARM_COMPUTE_ERROR("NOT_SUPPORTED!"); + } +} + +Status NEBinaryLogicalOperationKernel::validate_arguments(const ITensorInfo &input1, + const ITensorInfo &input2, + const ITensorInfo &output) +{ + // Validate in case of configured output + if (output.total_size() > 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&output, 1, DataType::U8, + DataType::QASYMM8); + } + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input1, 1, DataType::U8, DataType::QASYMM8); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input2, 1, DataType::U8, DataType::QASYMM8); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input1, &input2); + + const TensorShape out_shape = + TensorShape::broadcast_shape(input1.tensor_shape(), input2.tensor_shape()); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, + "Inputs are not broadcast compatible"); + + // Validate in case of configured output + if (output.total_size() > 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + detail::have_different_dimensions(out_shape, output.tensor_shape(), 0), + "Wrong shape for output"); + } + + return Status{}; +} + +Status NEBinaryLogicalOperationKernel::validate(BinaryLogicalOperation op, + const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output) +{ + ARM_COMPUTE_UNUSED(op); + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input1, input2, output); + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*input1, *input2, *output)); + return Status{}; +} + +} // namespace arm_compute diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NECastKernel.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NECastKernel.cpp new file mode 100644 index 000000000..7e4fc129b --- /dev/null +++ b/compute/ARMComputeEx/src/core/NEON/kernels/NECastKernel.cpp @@ -0,0 +1,653 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017-2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
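configure_func/configure_logic_func above pick an implementation by assembling a key such as "op_U8_U8_U8" from the tensors' data-type names and looking it up in a function map. A standalone sketch of the same dispatch pattern, with hypothetical stand-in types:

    #include <functional>
    #include <map>
    #include <string>

    using Fn = void (*)(int); // stand-in for ElementwiseFunction
    inline void op_u8(int) {} // hypothetical implementation

    inline std::function<void(int)> pick(const std::string &in1, const std::string &in2,
                                         const std::string &out,
                                         const std::map<std::string, Fn> &table)
    {
      const auto it = table.find("op_" + in1 + "_" + in2 + "_" + out);
      return it != table.end() ? std::function<void(int)>(it->second) : nullptr;
    }
    // pick("U8", "U8", "U8", {{"op_U8_U8_U8", &op_u8}}) yields a callable wrapper;
    // an unknown type combination yields nullptr, as in the kernel.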
+ */ +#include "arm_compute/core/NEON/kernels/NECastKernel.h" + +#include "arm_compute/core/AccessWindowStatic.h" +#include "arm_compute/core/CPP/Validate.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/NEON/NEAsymm.h" +#include "arm_compute/core/NEON/wrapper/wrapper.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +#include <arm_neon.h> + +namespace arm_compute +{ +namespace +{ +Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, + SubDataType input_subtype) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, + DataType::QASYMM8, DataType::U32, + DataType::S32, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON(input_subtype == SubDataType::BOOL && + input->data_type() != DataType::U8); + + if (output->tensor_shape().total_size() > 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(output); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S8, + DataType::QASYMM8, DataType::U32, + DataType::S32, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); + } + + return Status{}; +} + +std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output) +{ + // Configure kernel window + Window win = calculate_max_window(*input, Steps()); + + // Output tensor auto initialization if not yet initialized + auto_init_if_empty(*output, input->tensor_shape(), 1, DataType::F32); + + // NECastKernel doesn't need padding so update_window_and_padding() can be skipped + Coordinates coord; + coord.set_num_dimensions(output->num_dimensions()); + output->set_valid_region(ValidRegion(coord, output->tensor_shape())); + + return std::make_tuple(Status{}, win); +} + +typedef struct bool8x16 +{ + uint8x16_t val; +} bool8x16_t; + +static inline uint8x16_t vreinterpretq_u8_b8(bool8x16_t __a) { return (uint8x16_t)__a.val; } + +template <typename ToV, typename FromV> inline ToV vcast(const FromV &v) { return v; } +template <> inline uint8x16_t vcast(const bool8x16_t &v) +{ + const uint8x16_t vu8 = vreinterpretq_u8_b8(v); + const uint8x16_t zero_uint8x16 = vdupq_n_u8(0); + uint8x16_t mask = vcgtq_u8(vu8, zero_uint8x16); + return vshrq_n_u8(mask, 7); // true -> 1, false -> 0 +} + +template <> inline uint32x4x4_t vcast(const bool8x16_t &v) +{ + const uint8x16_t vu8 = vreinterpretq_u8_b8(v); + const uint8x16_t zero_uint8x16 = vdupq_n_u8(0); + uint8x16_t mask = vcgtq_u8(vu8, zero_uint8x16); + uint8x16_t vb = vshrq_n_u8(mask, 7); // true -> 1, false -> 0 + + const uint32x4x4_t ret = {{ + vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(vb)))), + vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(vb)))), + vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(vb)))), + vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(vb)))), + }}; + + return ret; +} + +template <> inline int32x4x4_t vcast(const bool8x16_t &v) +{ + const uint8x16_t vu8 = vreinterpretq_u8_b8(v); + const uint8x16_t zero_uint8x16 = vdupq_n_u8(0); + uint8x16_t mask = vcgtq_u8(vu8, zero_uint8x16); + uint8x16_t vb = vshrq_n_u8(mask, 7); // true -> 1, false -> 0 + + const int32x4x4_t ret = {{ + vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(vb))))), + vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(vb))))), + vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(vb))))), + 
vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(vb))))), + }}; + + return ret; +} + +template <> inline float32x4x4_t vcast(const bool8x16_t &v) +{ + const uint8x16_t vu8 = vreinterpretq_u8_b8(v); + const uint8x16_t zero_uint8x16 = vdupq_n_u8(0); + uint8x16_t mask = vcgtq_u8(vu8, zero_uint8x16); + uint8x16_t vb = vshrq_n_u8(mask, 7); // true -> 1, false -> 0 + + const float32x4x4_t ret = {{ + vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(vb))))), + vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(vb))))), + vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(vb))))), + vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(vb))))), + }}; + + return ret; +} + +template <> inline uint32x4x4_t vcast(const uint8x16_t &v) +{ + const uint32x4x4_t ret = {{ + vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(v)))), + vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(v)))), + vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(v)))), + vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(v)))), + }}; + + return ret; +} + +template <> inline int32x4x4_t vcast(const uint8x16_t &v) +{ + const int32x4x4_t ret = {{ + vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(v))))), + vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(v))))), + vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(v))))), + vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(v))))), + }}; + + return ret; +} + +template <> inline float32x4x4_t vcast(const uint8x16_t &v) +{ + const float32x4x4_t ret = {{ + vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(v))))), + vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(v))))), + vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(v))))), + vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(v))))), + }}; + + return ret; +} + +template <> inline uint8x16_t vcast(const int32x4x4_t &v) +{ + // Saturate cast + return vcombine_u8(vqmovn_u16(vcombine_u16(vqmovun_s32(v.val[0]), vqmovun_s32(v.val[1]))), + vqmovn_u16(vcombine_u16(vqmovun_s32(v.val[2]), vqmovun_s32(v.val[3])))); +} + +template <> inline uint32x4x4_t vcast(const int32x4x4_t &v) +{ + // Saturate cast + const uint32x4x4_t ret = {{ + vcombine_u32(vqmovun_s64(vmovl_s32(vget_low_s32(v.val[0]))), + vqmovun_s64(vmovl_s32(vget_high_s32(v.val[0])))), + vcombine_u32(vqmovun_s64(vmovl_s32(vget_low_s32(v.val[1]))), + vqmovun_s64(vmovl_s32(vget_high_s32(v.val[1])))), + vcombine_u32(vqmovun_s64(vmovl_s32(vget_low_s32(v.val[2]))), + vqmovun_s64(vmovl_s32(vget_high_s32(v.val[2])))), + vcombine_u32(vqmovun_s64(vmovl_s32(vget_low_s32(v.val[3]))), + vqmovun_s64(vmovl_s32(vget_high_s32(v.val[3])))), + }}; + + return ret; +} + +template <> inline float32x4x4_t vcast(const int32x4x4_t &v) +{ + const float32x4x4_t ret = {{ + vcvtq_f32_s32(v.val[0]), vcvtq_f32_s32(v.val[1]), vcvtq_f32_s32(v.val[2]), + vcvtq_f32_s32(v.val[3]), + }}; + + return ret; +} + +template <> inline uint8x16_t vcast(const uint32x4x4_t &v) +{ + return vcombine_u8(vqmovn_u16(vcombine_u16(vqmovn_u32(v.val[0]), vqmovn_u32(v.val[1]))), + vqmovn_u16(vcombine_u16(vqmovn_u32(v.val[2]), vqmovn_u32(v.val[3])))); +} + +template <> inline int32x4x4_t vcast(const uint32x4x4_t &v) +{ + const int32x4x4_t ret = {{ + vcombine_s32(vmovn_s64(vreinterpretq_s64_u64(vmovl_u32(vget_low_u32(v.val[0])))), + vmovn_s64(vreinterpretq_s64_u64(vmovl_u32(vget_high_u32(v.val[0]))))), + vcombine_s32(vmovn_s64(vreinterpretq_s64_u64(vmovl_u32(vget_low_u32(v.val[1])))), + 
vmovn_s64(vreinterpretq_s64_u64(vmovl_u32(vget_high_u32(v.val[1]))))), + vcombine_s32(vmovn_s64(vreinterpretq_s64_u64(vmovl_u32(vget_low_u32(v.val[2])))), + vmovn_s64(vreinterpretq_s64_u64(vmovl_u32(vget_high_u32(v.val[2]))))), + vcombine_s32(vmovn_s64(vreinterpretq_s64_u64(vmovl_u32(vget_low_u32(v.val[3])))), + vmovn_s64(vreinterpretq_s64_u64(vmovl_u32(vget_high_u32(v.val[3]))))), + }}; + + return ret; +} + +template <> inline float32x4x4_t vcast(const uint32x4x4_t &v) +{ + const float32x4x4_t ret = {{ + vcvtq_f32_u32(v.val[0]), vcvtq_f32_u32(v.val[1]), vcvtq_f32_u32(v.val[2]), + vcvtq_f32_u32(v.val[3]), + }}; + + return ret; +} + +template <> inline uint8x16_t vcast(const float32x4x4_t &v) +{ + // Saturate cast + return vcombine_u8(vqmovn_u16(vcombine_u16(vqmovun_s32(vcvtq_s32_f32(v.val[0])), + vqmovun_s32(vcvtq_s32_f32(v.val[1])))), + vqmovn_u16(vcombine_u16(vqmovun_s32(vcvtq_s32_f32(v.val[2])), + vqmovun_s32(vcvtq_s32_f32(v.val[3]))))); +} + +template <> inline uint32x4x4_t vcast(const float32x4x4_t &v) +{ + const uint32x4x4_t ret = {{ + vcvtq_u32_f32(v.val[0]), vcvtq_u32_f32(v.val[1]), vcvtq_u32_f32(v.val[2]), + vcvtq_u32_f32(v.val[3]), + }}; + + return ret; +} + +template <> inline int32x4x4_t vcast(const float32x4x4_t &v) +{ + const int32x4x4_t ret = {{ + vcvtq_s32_f32(v.val[0]), vcvtq_s32_f32(v.val[1]), vcvtq_s32_f32(v.val[2]), + vcvtq_s32_f32(v.val[3]), + }}; + + return ret; +} + +template <typename T> struct cast_vector; +template <> struct cast_vector<bool> +{ + using type = bool8x16_t; +}; +template <> struct cast_vector<uint8_t> +{ + using type = uint8x16_t; +}; +template <> struct cast_vector<uint32_t> +{ + using type = uint32x4x4_t; +}; +template <> struct cast_vector<int32_t> +{ + using type = int32x4x4_t; +}; +template <> struct cast_vector<float> +{ + using type = float32x4x4_t; +}; + +template <typename T> inline void store_result(T *ptr, const typename cast_vector<T>::type &v) +{ + wrapper::vstore(ptr, v.val[0]); + wrapper::vstore(ptr + 4, v.val[1]); + wrapper::vstore(ptr + 8, v.val[2]); + wrapper::vstore(ptr + 12, v.val[3]); +} + +template <> inline void store_result<uint8_t>(uint8_t *ptr, const uint8x16_t &v) +{ + wrapper::vstore(ptr, v); +} + +inline bool8x16_t vloadq(const bool *ptr) +{ + bool8x16_t ret; + ret.val = wrapper::vloadq(reinterpret_cast<const uint8_t *>(ptr)); + return ret; +} + +template <typename T> inline typename cast_vector<T>::type load_input(const T *ptr) +{ + return wrapper::vloadq(ptr); +} + +template <> inline typename cast_vector<bool>::type load_input(const bool *ptr) +{ + return vloadq(ptr); +} + +template <> inline typename cast_vector<uint32_t>::type load_input(const uint32_t *ptr) +{ + return vld4q_u32(ptr); +} + +template <> inline typename cast_vector<int32_t>::type load_input(const int32_t *ptr) +{ + return vld4q_s32(ptr); +} + +template <> inline typename cast_vector<float>::type load_input(const float *ptr) +{ + return vld4q_f32(ptr); +} + +template <typename T> inline T get_value(const T *ptr) { return *ptr; } + +template <> inline bool get_value(const bool *ptr) +{ + bool ret = (*ptr != 0); + return ret; +} + +template <typename FromT> void run_cast(const ITensor *input, ITensor *output, const Window &window) +{ + const int window_step_x = 16; + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + + // Collapse window and reset first dimension to handle tail calculations manually + Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); 
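// DimX is reset to a single step just below; the loop body then walks each
// row itself, using a 16-element vectorized main loop followed by a scalar
// tail for the leftover elements.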
+ win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); + + // Create iterators + Iterator in(input, win_collapsed); + Iterator out(output, win_collapsed); + +#ifdef __aarch64__ + constexpr RoundingPolicy rounding_policy = RoundingPolicy::TO_NEAREST_EVEN; +#else //__aarch64__ + constexpr RoundingPolicy rounding_policy = RoundingPolicy::TO_ZERO; +#endif //__aarch64__ + + execute_window_loop( + win_collapsed, + [&](const Coordinates &) { + const auto in_ptr = reinterpret_cast<const FromT *>(in.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + using from_vector = typename cast_vector<FromT>::type; + const from_vector vin = load_input(in_ptr + x); + + switch (output->info()->data_type()) + { + case DataType::U8: + { + using to_vector = typename cast_vector<uint8_t>::type; + const to_vector vout = vcast<to_vector, from_vector>(vin); + store_result<uint8_t>(reinterpret_cast<uint8_t *>(out.ptr()) + x, vout); + break; + } + case DataType::QASYMM8: + { + using to_vector = typename cast_vector<float>::type; + const QuantizationInfo &qinfo_out = output->info()->quantization_info(); + const auto vf = vcast<to_vector, from_vector>(vin); + const auto vout = vquantize(vf, qinfo_out); + store_result<qasymm8_t>(reinterpret_cast<qasymm8_t *>(out.ptr()) + x, vout); + break; + } + case DataType::U32: + { + using to_vector = typename cast_vector<uint32_t>::type; + const to_vector vout = vcast<to_vector, from_vector>(vin); + store_result<uint32_t>(reinterpret_cast<uint32_t *>(out.ptr()) + x, vout); + break; + } + case DataType::S32: + { + using to_vector = typename cast_vector<int32_t>::type; + const to_vector vout = vcast<to_vector, from_vector>(vin); + store_result<int32_t>(reinterpret_cast<int32_t *>(out.ptr()) + x, vout); + break; + } + case DataType::F32: + { + using to_vector = typename cast_vector<float>::type; + const to_vector vout = vcast<to_vector, from_vector>(vin); + store_result<float>(reinterpret_cast<float *>(out.ptr()) + x, vout); + break; + } + default: + ARM_COMPUTE_ERROR("Unsupported data type."); + } + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + FromT val = get_value(in_ptr + x); + switch (output->info()->data_type()) + { + case DataType::U8: + { + *(reinterpret_cast<uint8_t *>(out.ptr()) + x) = static_cast<uint8_t>(val); + break; + } + case DataType::QASYMM8: + { + const QuantizationInfo &qinfo_out = output->info()->quantization_info(); + const auto qval = qinfo_out.quantize(static_cast<float>(val), rounding_policy); + *(reinterpret_cast<qasymm8_t *>(out.ptr()) + x) = qval; + break; + } + case DataType::U32: + { + *(reinterpret_cast<uint32_t *>(out.ptr()) + x) = static_cast<uint32_t>(val); + break; + } + case DataType::S32: + { + *(reinterpret_cast<int32_t *>(out.ptr()) + x) = static_cast<int32_t>(val); + break; + } + case DataType::F32: + { + *(reinterpret_cast<float *>(out.ptr()) + x) = static_cast<float>(val); + break; + } + default: + ARM_COMPUTE_ERROR("Unsupported data type."); + } + } + }, + in, out); +} + +void run_cast_qasymm8(const ITensor *input, ITensor *output, const Window &window) +{ + const int window_step_x = 16; + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + + // Collapse window and reset first dimension to handle tail calculations manually + Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); + win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); + + // Create 
iterators + Iterator in(input, win_collapsed); + Iterator out(output, win_collapsed); + +#ifdef __aarch64__ + constexpr RoundingPolicy rounding_policy = RoundingPolicy::TO_NEAREST_EVEN; +#else //__aarch64__ + constexpr RoundingPolicy rounding_policy = RoundingPolicy::TO_ZERO; +#endif //__aarch64__ + const auto &qinfo_in = input->info()->quantization_info(); + const auto &qinfo_out = output->info()->quantization_info(); + + execute_window_loop( + win_collapsed, + [&](const Coordinates &) { + const auto in_ptr = reinterpret_cast<const qasymm8_t *>(in.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + using from_vector = typename cast_vector<float>::type; + const auto vf = wrapper::vloadq(in_ptr + x); + const auto vin = vdequantize(vf, qinfo_in); + switch (output->info()->data_type()) + { + case DataType::U8: + { + using to_vector = typename cast_vector<uint8_t>::type; + const to_vector vout = vcast<to_vector, from_vector>(vin); + store_result<uint8_t>(reinterpret_cast<uint8_t *>(out.ptr()) + x, vout); + break; + } + case DataType::QASYMM8: + { + using to_vector = typename cast_vector<float>::type; + const auto vf = vcast<to_vector, from_vector>(vin); + const auto vout = vquantize(vf, qinfo_out); + store_result<qasymm8_t>(reinterpret_cast<qasymm8_t *>(out.ptr()) + x, vout); + break; + } + case DataType::U32: + { + using to_vector = typename cast_vector<uint32_t>::type; + const to_vector vout = vcast<to_vector, from_vector>(vin); + store_result<uint32_t>(reinterpret_cast<uint32_t *>(out.ptr()) + x, vout); + break; + } + case DataType::S32: + { + using to_vector = typename cast_vector<int32_t>::type; + const to_vector vout = vcast<to_vector, from_vector>(vin); + store_result<int32_t>(reinterpret_cast<int32_t *>(out.ptr()) + x, vout); + break; + } + case DataType::F32: + { + using to_vector = typename cast_vector<float>::type; + const to_vector vout = vcast<to_vector, from_vector>(vin); + store_result<float>(reinterpret_cast<float *>(out.ptr()) + x, vout); + break; + } + default: + ARM_COMPUTE_ERROR("Unsupported data type."); + } + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + qasymm8_t qval_in = *(in_ptr + x); + const auto val = qinfo_in.dequantize(qval_in); + + switch (output->info()->data_type()) + { + case DataType::U8: + { + *(reinterpret_cast<uint8_t *>(out.ptr()) + x) = static_cast<uint8_t>(val); + break; + } + case DataType::QASYMM8: + { + const auto qval_out = qinfo_out.quantize(val, rounding_policy); + *(reinterpret_cast<qasymm8_t *>(out.ptr()) + x) = qval_out; + break; + } + case DataType::U32: + { + *(reinterpret_cast<uint32_t *>(out.ptr()) + x) = static_cast<uint32_t>(val); + break; + } + case DataType::S32: + { + *(reinterpret_cast<int32_t *>(out.ptr()) + x) = static_cast<int32_t>(val); + break; + } + case DataType::F32: + { + *(reinterpret_cast<float *>(out.ptr()) + x) = static_cast<float>(val); + break; + } + default: + ARM_COMPUTE_ERROR("Unsupported data type."); + } + } + }, + in, out); +} +} // namespace + +NECastKernel::NECastKernel() : _input(nullptr), _output(nullptr), _input_subtype(SubDataType::NONE) +{ +} + +void NECastKernel::configure(const ITensor *input, ITensor *output, SubDataType input_subtype) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), input_subtype)); + + _input = input; + _output = output; + _input_subtype = input_subtype; + + // Configure kernel window + auto win_config = 
validate_and_configure_window(input->info(), output->info()); + + ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config)); + + INEKernel::configure(std::get<1>(win_config)); +} + +Status NECastKernel::validate(const ITensorInfo *input, const ITensorInfo *output, + SubDataType input_subtype) +{ + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, input_subtype)); + ARM_COMPUTE_RETURN_ON_ERROR( + std::get<0>(validate_and_configure_window(input->clone().get(), output->clone().get()))); + return Status{}; +} + +void NECastKernel::run(const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); + + switch (_input->info()->data_type()) + { + case DataType::U8: + if (_input_subtype == SubDataType::BOOL) + { + run_cast<bool>(_input, _output, window); + } + else + { + run_cast<uint8_t>(_input, _output, window); + } + break; + case DataType::QASYMM8: + run_cast_qasymm8(_input, _output, window); + break; + case DataType::U32: + run_cast<uint32_t>(_input, _output, window); + break; + case DataType::S32: + run_cast<int32_t>(_input, _output, window); + break; + case DataType::F32: + run_cast<float>(_input, _output, window); + break; + default: + ARM_COMPUTE_ERROR("Unsupported data type."); + } +} +} // namespace arm_compute diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEDepthToSpaceLayerKernelEx.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEDepthToSpaceLayerKernelEx.cpp new file mode 100644 index 000000000..8a2223c26 --- /dev/null +++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEDepthToSpaceLayerKernelEx.cpp @@ -0,0 +1,165 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
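The vcast specializations above that are marked "Saturate cast" clamp instead of wrapping when narrowing (vqmovun_s32 / vqmovn_u16). A scalar sketch of the S32 to U8 case, for reference only:

    #include <algorithm>
    #include <cstdint>

    // Scalar equivalent of the saturating int32x4x4_t -> uint8x16_t narrow:
    // values below 0 become 0, values above 255 become 255.
    inline uint8_t saturate_cast_s32_to_u8(int32_t v)
    {
      return static_cast<uint8_t>(std::clamp(static_cast<int>(v), 0, 255));
    }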
+ */ +#include "arm_compute/core/NEON/kernels/NEDepthToSpaceLayerKernelEx.h" + +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/NEON/wrapper/wrapper.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/utils/misc/ShapeCalculatorEx.h" +#include <arm_neon.h> +#include <cstdint> + +using namespace arm_compute::misc::shape_calculator; + +namespace arm_compute +{ +namespace +{ +Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, int32_t block_shape) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4); + ARM_COMPUTE_RETURN_ERROR_ON(block_shape < 2); + + const DataLayout data_layout = input->data_layout(); + const int idx_channel = + get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL); + ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape()[idx_channel] % (block_shape * block_shape) != + 0); + // Validate output if initialized + if (output->total_size() != 0) + { + const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); + const int idx_height = + get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); + ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape()[idx_width] != + (block_shape * input->tensor_shape()[idx_width])); + ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape()[idx_height] != + (block_shape * input->tensor_shape()[idx_height])); + ARM_COMPUTE_RETURN_ERROR_ON(output->num_dimensions() > 4); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + } + + return Status{}; +} +} // namespace + +NEDepthToSpaceLayerKernelEx::NEDepthToSpaceLayerKernelEx() + : _input(nullptr), _output(nullptr), _block_shape() +{ +} + +void NEDepthToSpaceLayerKernelEx::configure(const ITensor *input, ITensor *output, + int32_t block_shape) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + TensorShape output_shape = compute_depth_to_space_shape_ex(input->info(), block_shape); + // Output auto inizialitation if not yet initialized + auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape)); + + // Perform validation step + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), block_shape)); + + _input = input; + _output = output; + _block_shape = block_shape; + + // Configure kernel window + Window win = calculate_max_window(*input->info(), Steps()); + ICPPKernel::configure(win); +} + +Status NEDepthToSpaceLayerKernelEx::validate(const ITensorInfo *input, const ITensorInfo *output, + int32_t block_shape) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, block_shape)); + return Status{}; +} + +void NEDepthToSpaceLayerKernelEx::run(const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICPPKernel::window(), window); + + const int idx_channel = + get_data_layout_dimension_index(_input->info()->data_layout(), DataLayoutDimension::CHANNEL); + const int depth_size = _input->info()->dimension(idx_channel); + const int r = (depth_size / (_block_shape * _block_shape)); + const int element_size = _input->info()->element_size(); + + Window slice_out = window.first_slice_window_3D(); + + // The slice_out slice does not move + slice_out.set(Window::DimX, Window::Dimension(0, 0, 0)); + 
slice_out.set(Window::DimY, Window::Dimension(0, 0, 0)); + slice_out.set(Window::DimZ, Window::Dimension(0, 0, 0)); + + // Main loop for NCHW and NHWC + if (_input->info()->data_layout() == DataLayout::NCHW) + { + Window slice_in = window.first_slice_window_2D(); + do + { + Iterator in(_input, slice_in); + execute_window_loop(slice_in, + [&](const Coordinates &id) { + const int x = id.x(); + const int y = id.y(); + + const int z = id.z() % r; + const int out_x = x * _block_shape + (id.z() / r) % _block_shape; + const int out_y = y * _block_shape + (id.z() / r) / _block_shape; + Coordinates output_coords{out_x, out_y, z, id[3]}; + memcpy(_output->ptr_to_element(output_coords), in.ptr(), element_size); + }, + in); + } while (window.slide_window_slice_2D(slice_in)); + } + else + { + Window slice_in = window.first_slice_window_3D(); + do + { + Iterator in(_input, slice_in); + execute_window_loop(slice_in, + [&](const Coordinates &id) { + const int x = id.y(); + const int y = id.z(); + + const int z = id.x() % r; + const int out_x = x * _block_shape + (id.x() / r) % _block_shape; + const int out_y = y * _block_shape + (id.x() / r) / _block_shape; + Coordinates output_coords{z, out_x, out_y, id[3]}; + memcpy(_output->ptr_to_element(output_coords), in.ptr(), element_size); + }, + in); + } while (window.slide_window_slice_3D(slice_in)); + } +} +} // namespace arm_compute diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEElementwiseUnaryKernelEx.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEElementwiseUnaryKernelEx.cpp new file mode 100644 index 000000000..cebd614df --- /dev/null +++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEElementwiseUnaryKernelEx.cpp @@ -0,0 +1,205 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2018-2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
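The NCHW branch above scatters each input element using r = depth / (block * block), out_x = x * block + (z / r) % block and out_y = y * block + (z / r) / block. A hedged scalar reference for the coordinate mapping (struct and function names are illustrative):

    struct Coord
    {
      int x, y, c;
    };

    // NCHW depth-to-space mapping for one element: input channel z lands in
    // output channel z % r, and z / r selects the sub-pixel position.
    inline Coord depth_to_space_nchw(int x, int y, int z, int block, int depth)
    {
      const int r = depth / (block * block);
      return {x * block + (z / r) % block,  // out_x
              y * block + (z / r) / block,  // out_y
              z % r};                       // out_c
    }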
+ */ +#include "arm_compute/core/NEON/kernels/NEElementwiseUnaryKernelEx.h" + +#include "arm_compute/core/CPP/Validate.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/IAccessWindow.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/NEON/NEAsymm.h" +#include "arm_compute/core/NEON/NEFixedPoint.h" +#include "arm_compute/core/NEON/wrapper/wrapper.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Validate.h" + +#include <algorithm> +#include <arm_neon.h> +#include <cstdint> +#include <map> +#include <string> + +namespace arm_compute +{ +class Coordinates; + +namespace +{ +template <ElementWiseUnaryEx op, typename ScalarType> +inline ScalarType elementwise_op_scalar(const ScalarType &a) +{ + switch (op) + { + case ElementWiseUnaryEx::NEG: + return -a; + default: + ARM_COMPUTE_ERROR("NOT_SUPPORTED!"); + } +} + +template <ElementWiseUnaryEx op, typename VectorType> +inline VectorType elementwise_op(const VectorType &a) +{ + switch (op) + { + case ElementWiseUnaryEx::NEG: + return wrapper::vneg(a); + default: + ARM_COMPUTE_ERROR("NOT_SUPPORTED!"); + } +} + +template <ElementWiseUnaryEx op, typename ScalarType> +void elementwise_op(const ITensor *in, ITensor *out, const Window &window) +{ + const int window_step_x = 16 / sizeof(ScalarType); + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + + Window win = window; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input(in, win); + Iterator output(out, win); + + execute_window_loop(win, + [&](const Coordinates &) { + auto output_ptr = reinterpret_cast<ScalarType *>(output.ptr()); + const auto input_ptr = reinterpret_cast<const ScalarType *>(input.ptr()); + + int x = window_start_x; + for (; x <= window_end_x - window_step_x; x += window_step_x) + { + wrapper::vstore(output_ptr + x, + elementwise_op<op>(wrapper::vloadq(input_ptr + x))); + } + for (; x < window_end_x; ++x) + { + *(output_ptr + x) = elementwise_op_scalar<op>(*(input_ptr + x)); + } + }, + input, output); +} + +template <ElementWiseUnaryEx op> +std::function<void(const ITensor *input, ITensor *output, const Window &window)> +configure_func(const ITensor *input, ITensor *output) +{ + std::string function_to_call("op_"); + function_to_call += string_from_data_type(input->info()->data_type()) + "_"; + function_to_call += string_from_data_type(output->info()->data_type()); + + static std::map<std::string, NEElementwiseUnaryKernelEx::ElementwiseUnaryFunction *> + map_function = { + {"op_F32_F32", &elementwise_op<op, float>}, {"op_S32_S32", &elementwise_op<op, int32_t>}, + }; +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + map_function["op_F16_F16"] = &elementwise_op<op, float16_t>; +#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ + + auto it = map_function.find(function_to_call); + + if (it != map_function.end()) + { + auto func = it->second; + return [func](const ITensor *input, ITensor *output, const Window &window) { + func(input, output, window); + }; + } + return nullptr; +} +} // namespace + +NEElementwiseUnaryKernelEx::NEElementwiseUnaryKernelEx() + : _function(nullptr), _input(nullptr), _output(nullptr) +{ +} + +void NEElementwiseUnaryKernelEx::configure(ElementWiseUnaryEx op, const ITensor *input, + ITensor *output) +{ + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*input->info(), *output->info())); + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + + // Configure kernel window + const 
std::pair<TensorShape, ValidRegion> broadcast_pair = + ITensorInfo::broadcast_shape_and_valid_region(*input->info()); + const TensorShape &out_shape = broadcast_pair.first; + const ValidRegion &valid_region = broadcast_pair.second; + + // Auto initialize output if not initialized + auto_init_if_empty(*output->info(), out_shape, 1, input->info()->data_type()); + + Window win = calculate_max_window(valid_region); + + _input = input; + _output = output; + + INEKernel::configure(win); + + switch (op) + { + case ElementWiseUnaryEx::NEG: + _function = configure_func<ElementWiseUnaryEx::NEG>(input, output); + break; + default: + ARM_COMPUTE_ERROR("NOT_SUPPORTED!"); + } +} + +Status NEElementwiseUnaryKernelEx::validate_arguments(const ITensorInfo &input, + const ITensorInfo &output) +{ + ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(&input); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input, 1, DataType::F16, DataType::F32, + DataType::S32); + + // Validate in case of configured output + if (output.total_size() > 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input, &output); + } + + return Status{}; +} + +Status NEElementwiseUnaryKernelEx::validate(ElementWiseUnaryEx op, const ITensorInfo *input, + const ITensorInfo *output) +{ + ARM_COMPUTE_UNUSED(op); + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*input, *output)); + return Status{}; +} + +void NEElementwiseUnaryKernelEx::run(const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); + ARM_COMPUTE_ERROR_ON(_function == nullptr); + _function(_input, _output, window); +} +} // namespace arm_compute diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEEmbeddingLookupKernel.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEEmbeddingLookupKernel.cpp new file mode 100644 index 000000000..5401afea0 --- /dev/null +++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEEmbeddingLookupKernel.cpp @@ -0,0 +1,118 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2018-2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
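The diff that follows adds NEEmbeddingLookupKernel, which copies whole input rows selected by an S32 lookups tensor along the last dimension. A minimal scalar sketch of that semantic for the 2-D case (names hypothetical):

    #include <cstddef>
    #include <cstdint>
    #include <cstring>

    // input is [num_rows][row_len]; output row k is input row lookups[k].
    inline void embedding_lookup(const float *input, std::size_t row_len,
                                 const int32_t *lookups, std::size_t num_lookups,
                                 float *output)
    {
      for (std::size_t k = 0; k < num_lookups; ++k)
        std::memcpy(output + k * row_len, input + lookups[k] * row_len,
                    row_len * sizeof(float));
    }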
+ */ +#include "arm_compute/core/NEON/kernels/NEEmbeddingLookupKernel.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +using namespace arm_compute; + +NEEmbeddingLookupKernel::NEEmbeddingLookupKernel() + : _input(nullptr), _lookups(nullptr), _output(nullptr) +{ +} + +void NEEmbeddingLookupKernel::configure(const ITensor *input, ITensor *output, + const ITensor *lookups) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), output->info(), lookups->info())); + + _input = input; + _output = output; + _lookups = lookups; + + // Auto initialize output if not initialized + auto out_shape = input->info()->tensor_shape(); + out_shape.set(out_shape.num_dimensions() - 1, lookups->info()->num_dimensions()); + auto_init_if_empty(*output->info(), out_shape, 1, input->info()->data_type(), + input->info()->quantization_info()); + + INEKernel::configure(calculate_max_window(*output->info())); +} + +Status NEEmbeddingLookupKernel::validate(const arm_compute::ITensorInfo *input, + const arm_compute::ITensorInfo *output, + const arm_compute::ITensorInfo *lookups) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, lookups); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN( + input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16, + DataType::U32, DataType::S32, DataType::F16, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lookups, 1, DataType::S32); + + ARM_COMPUTE_ERROR_ON(input->num_dimensions() < 2 && input->num_dimensions() > 4); + ARM_COMPUTE_ERROR_ON(lookups->num_dimensions() > 1); + + // Validate in case of configured output + if (output->total_size() > 0) + { + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + ARM_COMPUTE_ERROR_ON(input->num_dimensions() != output->num_dimensions()); + ARM_COMPUTE_ERROR_ON(output->dimension(output->num_dimensions() - 1) != lookups->dimension(0)); + for (size_t i = 0; i < output->num_dimensions() - 1; ++i) + { + ARM_COMPUTE_ERROR_ON(input->dimension(i) != output->dimension(i)); + } + } + + return Status{}; +} + +void NEEmbeddingLookupKernel::run(const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); + + const size_t lookup_dim = _output->info()->num_dimensions() - 1; + + Window output_window{window}; + output_window.set(Window::DimX, + Window::Dimension(output_window.x().start(), output_window.x().end(), + _input->info()->dimension(0))); + + Window out_slice = output_window.first_slice_window_4D(); + do + { + Iterator output_it(_output, out_slice); + + execute_window_loop(out_slice, + [&](const Coordinates &id) { + const int32_t lookup = *reinterpret_cast<int32_t *>( + _lookups->ptr_to_element(Coordinates{id[lookup_dim]})); + Coordinates input_id{id}; + input_id.set(lookup_dim, lookup); + memcpy(output_it.ptr(), _input->ptr_to_element(input_id), + _output->info()->dimension(0) * _output->info()->element_size()); + }, + output_it); + + } while (window.slide_window_slice_4D(out_slice)); +} diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEGatherKernelEx.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEGatherKernelEx.cpp new file mode 100644 index 000000000..ce2413dc1 --- /dev/null +++ 
b/compute/ARMComputeEx/src/core/NEON/kernels/NEGatherKernelEx.cpp @@ -0,0 +1,252 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/NEON/kernels/NEGatherKernelEx.h" + +#include "arm_compute/core/CPP/Validate.h" +#include "arm_compute/core/Coordinates.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/IAccessWindow.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" +#include "arm_compute/core/utils/misc/ShapeCalculatorEx.h" + +namespace arm_compute +{ +namespace +{ +/** Validate the indices + * + * Validate that indices are not negative + * + * @param[in] indices Indices tensor info. 
+ */ +template <typename U> void validate_indices(const ITensor *indices) +{ + for (size_t i = 0; i < indices->info()->tensor_shape()[0]; ++i) + { + ARM_COMPUTE_ERROR_ON(*(reinterpret_cast<U *>(indices->ptr_to_element(Coordinates(i)))) < 0); + } +} + +} // namespace + +NEGatherKernelEx::NEGatherKernelEx() : _input{}, _indices{}, _axis{}, _output{}, _func{} {} + +template <typename U> +inline void NEGatherKernelEx::gather_0_axis(const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + + // Validate that the indices are not negative + validate_indices<U>(_indices); + + Iterator output_it(_output, window); + execute_window_loop( + window, + [&](const Coordinates &id) { + Coordinates gather_id(id); + gather_id.collapse(_indices->info()->num_dimensions(), 0); + + U new_index; + switch (_indices->info()->num_dimensions()) + { + case 1: + new_index = *(reinterpret_cast<U *>(_indices->ptr_to_element(Coordinates(id[0])))); + break; + case 2: + new_index = + *(reinterpret_cast<U *>(_indices->ptr_to_element(Coordinates(id[0], id[1])))); + break; + case 3: + new_index = *( + reinterpret_cast<U *>(_indices->ptr_to_element(Coordinates(id[0], id[1], id[2])))); + break; + default: + ARM_COMPUTE_ERROR("Wrong num of dimensions"); + break; + } + + gather_id.set(0, new_index); + + std::copy_n(_input->ptr_to_element(gather_id), _output->info()->element_size(), + output_it.ptr()); + }, + output_it); +} + +template <typename U> +void NEGatherKernelEx::gather_n_axis(const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + + // Validate that the indices are not negative + validate_indices<U>(_indices); + + Window output_window{window}; + output_window.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator output_it(_output, output_window); + execute_window_loop( + output_window, + [&](const Coordinates &id) { + Coordinates gather_id(id); + gather_id.collapse(_indices->info()->num_dimensions(), _axis); + + U new_index; + switch (_indices->info()->num_dimensions()) + { + case 1: + new_index = *(reinterpret_cast<U *>(_indices->ptr_to_element(Coordinates(id[_axis])))); + break; + case 2: + new_index = *(reinterpret_cast<U *>( + _indices->ptr_to_element(Coordinates(id[_axis], id[_axis + 1])))); + break; + case 3: + new_index = *(reinterpret_cast<U *>( + _indices->ptr_to_element(Coordinates(id[_axis], id[_axis + 1], id[_axis + 2])))); + break; + default: + ARM_COMPUTE_ERROR("Wrong num of dimensions"); + break; + } + + gather_id.set(_axis, new_index); + + std::copy_n(_input->ptr_to_element(gather_id), + _input->info()->dimension(0) * _output->info()->element_size(), + output_it.ptr()); + }, + output_it); +} + +void NEGatherKernelEx::configure(const ITensor *input, const ITensor *indices, ITensor *output, + int axis) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, indices); + ARM_COMPUTE_ERROR_ON(indices->info()->num_dimensions() > 3); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(indices, 1, DataType::U32, DataType::S32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN( + input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16, + DataType::U32, DataType::S32, DataType::F16, DataType::F32); + + _input = input; + _indices = indices; + _output = output; + _axis = axis; + + if (_axis < 0) + { + _axis += input->info()->num_dimensions(); + } + ARM_COMPUTE_ERROR_ON(0 > _axis || _axis >= static_cast<int32_t>(input->info()->num_dimensions())); + + if (0 == _axis) + { + switch (_indices->info()->data_type()) + { + case DataType::U32: + _func = 
&NEGatherKernelEx::gather_0_axis<uint32_t>; + break; + case DataType::S32: + _func = &NEGatherKernelEx::gather_0_axis<int32_t>; + break; + default: + ARM_COMPUTE_ERROR("Not supported"); + break; + } + } + else + { + switch (_indices->info()->data_type()) + { + case DataType::U32: + _func = &NEGatherKernelEx::gather_n_axis<uint32_t>; + break; + case DataType::S32: + _func = &NEGatherKernelEx::gather_n_axis<int32_t>; + break; + default: + ARM_COMPUTE_ERROR("Not supported"); + break; + } + } + // Output auto initialization if not yet initialized + TensorShape output_shape = arm_compute::misc::shape_calculator::compute_gather_shape_ex( + input->info()->tensor_shape(), indices->info()->tensor_shape(), _axis); + auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type()); + + // Create window + Window win = calculate_max_window(*output->info(), Steps()); + output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape())); + + INEKernel::configure(win); +} + +Status NEGatherKernelEx::validate(const ITensorInfo *input, const ITensorInfo *indices, + const ITensorInfo *output, int axis) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, indices, output); + ARM_COMPUTE_RETURN_ERROR_ON(indices->num_dimensions() > 3); + ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4); + ARM_COMPUTE_ERROR_ON(input->num_dimensions() + indices->num_dimensions() - 1 > 4); + + if (axis < 0) + { + axis += input->num_dimensions(); + } + + ARM_COMPUTE_RETURN_ERROR_ON(0 > axis || axis >= static_cast<int32_t>(input->num_dimensions())); + ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN( + input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16, + DataType::U32, DataType::S32, DataType::F16, DataType::F32); + + if (output->total_size() != 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output); + TensorShape output_shape = arm_compute::misc::shape_calculator::compute_gather_shape_ex( + input->tensor_shape(), indices->tensor_shape(), axis); + ARM_COMPUTE_RETURN_ERROR_ON(output_shape.total_size() != output->tensor_shape().total_size()); + } + + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(indices, 1, DataType::U32, DataType::S32); + + return Status{}; +} + +void NEGatherKernelEx::run(const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON(_func == nullptr); + + (this->*_func)(window, info); +} + +} // namespace arm_compute diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEHashtableLookupKernel.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEHashtableLookupKernel.cpp new file mode 100644 index 000000000..391337bfb --- /dev/null +++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEHashtableLookupKernel.cpp @@ -0,0 +1,181 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2018-2019 ARM Limited. 
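NEGatherKernelEx above substitutes the looked-up index into the output coordinate along axis; for axis == 0 this reduces to row selection, with negative indices rejected by validate_indices(). A hedged 2-D sketch of that case:

    #include <cassert>
    #include <cstddef>
    #include <cstdint>
    #include <vector>

    // Axis-0 gather: out[k] = in[indices[k]]. Negative indices are rejected,
    // mirroring the kernel's validate_indices() check.
    inline std::vector<std::vector<float>>
    gather_axis0(const std::vector<std::vector<float>> &in, const std::vector<int32_t> &indices)
    {
      std::vector<std::vector<float>> out;
      out.reserve(indices.size());
      for (const int32_t idx : indices)
      {
        assert(idx >= 0 && static_cast<std::size_t>(idx) < in.size());
        out.push_back(in[static_cast<std::size_t>(idx)]);
      }
      return out;
    }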
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/NEON/kernels/NEHashtableLookupKernel.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +#include <unordered_map> + +using namespace arm_compute; + +namespace +{ +constexpr size_t NOT_HIT = 0xFFFFFFFF; +} // namespace + +NEHashtableLookupKernel::NEHashtableLookupKernel() + : _lookups(nullptr), _keys(nullptr), _input(nullptr), _output(nullptr), _hits{nullptr} +{ +} + +void NEHashtableLookupKernel::configure(const ITensor *lookups, const ITensor *keys, + const ITensor *input, ITensor *output, ITensor *hits) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(lookups, keys, input, output, hits); + ARM_COMPUTE_ERROR_THROW_ON( + validate(lookups->info(), keys->info(), input->info(), output->info(), hits->info())); + + _lookups = lookups; + _keys = keys; + _input = input; + _output = output; + _hits = hits; + + // Auto initialize output if not initialized + auto out_shape{input->info()->tensor_shape()}; + out_shape.set(out_shape.num_dimensions() - 1, lookups->info()->num_dimensions(), false); + auto_init_if_empty(*output->info(), out_shape, 1, input->info()->data_type(), + input->info()->quantization_info()); + + // Auto initialize hits if not initialized + auto_init_if_empty(*hits->info(), lookups->info()->tensor_shape(), 1, DataType::U8); + + INEKernel::configure(calculate_max_window(*output->info())); +} + +Status NEHashtableLookupKernel::validate(const ITensorInfo *lookups, const ITensorInfo *keys, + const ITensorInfo *input, const ITensorInfo *output, + const ITensorInfo *hits) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(lookups, keys, input, output, hits); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN( + input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16, + DataType::U32, DataType::S32, DataType::F16, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lookups, 1, DataType::S32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(keys, 1, DataType::S32); + + ARM_COMPUTE_ERROR_ON(input->num_dimensions() < 2 && input->num_dimensions() > 4); + ARM_COMPUTE_ERROR_ON(lookups->num_dimensions() > 1); + ARM_COMPUTE_ERROR_ON(keys->num_dimensions() > 1); + ARM_COMPUTE_ERROR_ON(keys->dimension(0) != input->dimension(input->num_dimensions() - 1)); + + // 
+Status NEHashtableLookupKernel::validate(const ITensorInfo *lookups, const ITensorInfo *keys,
+                                         const ITensorInfo *input, const ITensorInfo *output,
+                                         const ITensorInfo *hits)
+{
+  ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lookups, keys, input, output, hits);
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(
+      input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16,
+      DataType::U32, DataType::S32, DataType::F16, DataType::F32);
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lookups, 1, DataType::S32);
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(keys, 1, DataType::S32);
+
+  ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() < 2 || input->num_dimensions() > 4);
+  ARM_COMPUTE_RETURN_ERROR_ON(lookups->num_dimensions() > 1);
+  ARM_COMPUTE_RETURN_ERROR_ON(keys->num_dimensions() > 1);
+  ARM_COMPUTE_RETURN_ERROR_ON(keys->dimension(0) != input->dimension(input->num_dimensions() - 1));
+
+  // Validate in case of configured output
+  if (output->total_size() > 0)
+  {
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+    ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() != output->num_dimensions());
+    ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(output->num_dimensions() - 1) !=
+                                lookups->dimension(0));
+    for (size_t i = 0; i < output->num_dimensions() - 1; ++i)
+    {
+      ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(i) != output->dimension(i));
+    }
+  }
+
+  // Validate in case of configured hits
+  if (hits->total_size() > 0)
+  {
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(hits, 1, DataType::U8, DataType::QASYMM8);
+    ARM_COMPUTE_RETURN_ERROR_ON(hits->dimension(0) !=
+                                output->dimension(output->num_dimensions() - 1));
+    ARM_COMPUTE_RETURN_ERROR_ON(hits->dimension(0) != lookups->dimension(0));
+    ARM_COMPUTE_RETURN_ERROR_ON(hits->num_dimensions() > 1);
+  }
+
+  return Status{};
+}
+
+void NEHashtableLookupKernel::run(const Window &window, const ThreadInfo &info)
+{
+  ARM_COMPUTE_UNUSED(info);
+  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+  ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+
+  const size_t lookup_dim = _output->info()->num_dimensions() - 1;
+  const int const_0 = _output->info()->data_type() == DataType::QASYMM8
+                          ? _output->info()->quantization_info().offset
+                          : 0;
+
+  std::unordered_map<int32_t, size_t> key_index_map;
+  for (size_t n = 0; n < _keys->info()->dimension(0); ++n)
+  {
+    const int32_t key = *reinterpret_cast<int32_t *>(_keys->ptr_to_element({n}));
+    key_index_map[key] = n;
+  }
+  std::vector<size_t> lookup_indices;
+  for (size_t k = 0; k < _lookups->info()->dimension(0); ++k)
+  {
+    const int32_t key = *reinterpret_cast<int32_t *>(_lookups->ptr_to_element({k}));
+    const auto it = key_index_map.find(key);
+    if (it == key_index_map.end())
+    {
+      lookup_indices.emplace_back(NOT_HIT);
+      *_hits->ptr_to_element({k}) = 0;
+    }
+    else
+    {
+#if defined(ARM_COMPUTE_DEBUG_ENABLED)
+      if (it->second >= _keys->info()->dimension(0))
+        ARM_COMPUTE_ERROR("HashTable Lookup: Index out of bounds.");
+#endif // defined(ARM_COMPUTE_DEBUG_ENABLED)
+      lookup_indices.emplace_back(it->second);
+      *_hits->ptr_to_element({k}) = 1;
+    }
+  }
+
+  Window output_window{window};
+  output_window.set(Window::DimX,
+                    Window::Dimension(output_window.x().start(), output_window.x().end(),
+                                      _input->info()->dimension(0)));
+
+  Window out_slice = output_window.first_slice_window_4D();
+  do
+  {
+    Iterator output_it(_output, out_slice);
+
+    execute_window_loop(out_slice,
+                        [&](const Coordinates &id) {
+                          const auto lookup = lookup_indices.at(id[lookup_dim]);
+                          if (lookup == NOT_HIT)
+                          {
+                            memset(output_it.ptr(), const_0,
+                                   _output->info()->dimension(0) * _output->info()->element_size());
+                          }
+                          else
+                          {
+                            Coordinates input_id{id};
+                            input_id.set(lookup_dim, lookup);
+                            memcpy(output_it.ptr(), _input->ptr_to_element(input_id),
+                                   _output->info()->dimension(0) * _output->info()->element_size());
+                          }
+                        },
+                        output_it);
+
+  } while (window.slide_window_slice_4D(out_slice));
+}
diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEInstanceNormalizationLayerKernelEx.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEInstanceNormalizationLayerKernelEx.cpp
new file mode 100644
index 000000000..1ea77fb5c
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEInstanceNormalizationLayerKernelEx.cpp
@@ -0,0 +1,280 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2019 ARM Limited.
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/NEON/kernels/NEInstanceNormalizationLayerKernelEx.h" + +#include "arm_compute/core/CPP/Validate.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/NEON/NEMath.h" +#include "arm_compute/core/NEON/wrapper/wrapper.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +#include <arm_neon.h> + +namespace arm_compute +{ +namespace +{ +template <typename T> +void instance_normalization_nchw(ITensor *input, ITensor *output, ITensor *gamma, ITensor *beta, + float epsilon, const Window &window) +{ + /** NEON vector tag type. 
*/ + using ExactTagType = + typename wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>; + + // Clear X/Y dimensions on execution window as we handle the planes manually + Window win = window; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + win.set(Window::DimY, Window::Dimension(0, 1, 1)); + + constexpr int window_step_x = 16 / sizeof(T); + const unsigned int elements_plane = input->info()->dimension(0) * output->info()->dimension(1); + const auto channel_idx = + get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::CHANNEL); + + Iterator input_it(input, win); + execute_window_loop( + win, + [&](const Coordinates &id) { + Window win_plane = window; + win_plane.set(Window::DimX, Window::Dimension(0, 1, 1)); + win_plane.set(Window::DimZ, Window::Dimension(id[2], id[2] + 1, 1)); + win_plane.set(3, Window::Dimension(id[3], id[3] + 1, 1)); + + Iterator input_plane_it(input, win_plane); + Iterator output_plane_it(output, win_plane); + + auto sum_h_w = static_cast<T>(0.f); + auto sum_squares_h_w = static_cast<T>(0.f); + + execute_window_loop( + win_plane, + [&](const Coordinates &) { + const auto input_ptr = reinterpret_cast<const T *>(input_plane_it.ptr()); + + auto vec_sum_h_w = wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{}); + auto vec_sum_squares_h_w = wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{}); + + // Compute S elements per iteration + int x = window.x().start(); + for (; x <= (window.x().end() - window_step_x); x += window_step_x) + { + auto vec_input_val = wrapper::vloadq(input_ptr + x); + vec_sum_h_w = wrapper::vadd(vec_sum_h_w, vec_input_val); + vec_sum_squares_h_w = + wrapper::vadd(vec_sum_squares_h_w, wrapper::vmul(vec_input_val, vec_input_val)); + } + + auto vec2_sum_h_w = + wrapper::vpadd(wrapper::vgethigh(vec_sum_h_w), wrapper::vgetlow(vec_sum_h_w)); + auto vec2_sum_squares_h_w = wrapper::vpadd(wrapper::vgethigh(vec_sum_squares_h_w), + wrapper::vgetlow(vec_sum_squares_h_w)); + for (int i = 0; i < window_step_x / 4; ++i) + { + vec2_sum_h_w = wrapper::vpadd(vec2_sum_h_w, vec2_sum_h_w); + vec2_sum_squares_h_w = wrapper::vpadd(vec2_sum_squares_h_w, vec2_sum_squares_h_w); + } + sum_h_w += wrapper::vgetlane(vec2_sum_h_w, 0); + sum_squares_h_w += wrapper::vgetlane(vec2_sum_squares_h_w, 0); + + // Compute left-over elements + for (; x < window.x().end(); ++x) + { + const auto value = *(input_ptr + x); + sum_h_w += value; + sum_squares_h_w += value * value; + } + }, + input_plane_it, output_plane_it); + + const auto mean_h_w = sum_h_w / elements_plane; + const auto var_h_w = sum_squares_h_w / elements_plane - mean_h_w * mean_h_w; + + auto gamma_val = 1.0f; + if (gamma != nullptr) + { + gamma_val = *reinterpret_cast<T *>(gamma->ptr_to_element({id[channel_idx]})); + } + const auto multip_h_w = gamma_val / std::sqrt(var_h_w + epsilon); + const auto vec_mean_h_w = wrapper::vdup_n(static_cast<T>(mean_h_w), ExactTagType{}); + const auto vec_multip_h_w = wrapper::vdup_n(static_cast<T>(multip_h_w), ExactTagType{}); + auto beta_val = 0.0f; + if (beta != nullptr) + { + beta_val = *reinterpret_cast<T *>(beta->ptr_to_element({id[channel_idx]})); + } + const auto vec_beta = wrapper::vdup_n(static_cast<T>(beta_val), ExactTagType{}); + + execute_window_loop( + win_plane, + [&](const Coordinates &) { + auto input_ptr = reinterpret_cast<T *>(input_plane_it.ptr()); + auto output_ptr = reinterpret_cast<T *>(output_plane_it.ptr()); + + // Compute S elements per iteration + int x = window.x().start(); + auto vec_val = 
wrapper::vdup_n(static_cast<T>(0.0f), ExactTagType{}); + for (; x <= (window.x().end() - window_step_x); x += window_step_x) + { + vec_val = wrapper::vloadq(input_ptr + x); + vec_val = wrapper::vadd( + wrapper::vmul(wrapper::vsub(vec_val, vec_mean_h_w), vec_multip_h_w), vec_beta); + wrapper::vstore(output_ptr + x, vec_val); + } + + // Compute left-over elements + for (; x < window.x().end(); ++x) + { + *(output_ptr + x) = ((*(input_ptr + x)) - mean_h_w) * multip_h_w + beta_val; + } + }, + input_plane_it, output_plane_it); + }, + input_it); +} + +Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, + const ITensorInfo *gamma, const ITensorInfo *beta, float epsilon) +{ + ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(epsilon == 0.f, "Epsilon must be different than 0"); + + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(input, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_layout() == DataLayout::NHWC, + "NHWC data layout is not supported by the kernel directly"); + + if (output != nullptr && output->total_size() != 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_channels() != output->num_channels(), + "Input and output have different number of channels"); + } + + if (gamma != nullptr) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, gamma); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->dimension(get_data_layout_dimension_index( + input->data_layout(), DataLayoutDimension::CHANNEL)) != + gamma->dimension(0), + "Gamma's size must be the same as size of input's channel"); + } + + if (beta != nullptr) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, beta); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->dimension(get_data_layout_dimension_index( + input->data_layout(), DataLayoutDimension::CHANNEL)) != + beta->dimension(0), + "Beta's size must be the same as size of input's channel"); + } + + return Status{}; +} + +std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output) +{ + // We handle the planes manually + Window win = calculate_max_window(*input, Steps(1)); + + // Output auto initialization if not yet initialized + auto_init_if_empty(*output, input->tensor_shape(), 1, input->data_type()); + + // NEInstanceNormalizationLayerKernelEx doesn't need padding so update_window_and_padding() can be + // skipped + Coordinates coord; + coord.set_num_dimensions(output->num_dimensions()); + output->set_valid_region(ValidRegion(coord, output->tensor_shape())); + return std::make_pair(Status{}, win); +} +} // namespace + +NEInstanceNormalizationLayerKernelEx::NEInstanceNormalizationLayerKernelEx() + : _func(nullptr), _input(nullptr), _output(nullptr), _gamma(nullptr), _beta(nullptr), + _epsilon(1e-12) +{ +} + +void NEInstanceNormalizationLayerKernelEx::configure(ITensor *input, ITensor *output, + ITensor *gamma, ITensor *beta, float epsilon) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input); + + _input = input; + _output = output == nullptr ? 
input : output;
+  _gamma = gamma;
+  _beta = beta;
+  _epsilon = epsilon;
+
+  // gamma and beta are optional; guard the info() dereference accordingly.
+  ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(_input->info(), _output->info(),
+                                                gamma != nullptr ? gamma->info() : nullptr,
+                                                beta != nullptr ? beta->info() : nullptr, epsilon));
+
+  if (_input->info()->data_type() == DataType::F32)
+  {
+    _func = &instance_normalization_nchw<float>;
+  }
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+  else if (_input->info()->data_type() == DataType::F16)
+  {
+    _func = &instance_normalization_nchw<float16_t>;
+  }
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+  else
+  {
+    ARM_COMPUTE_ERROR("Unsupported data type");
+  }
+
+  // Configure kernel window
+  auto win_config = validate_and_configure_window(_input->info(), _output->info());
+  ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config));
+
+  INEKernel::configure(std::get<1>(win_config));
+}
+
+Status NEInstanceNormalizationLayerKernelEx::validate(const ITensorInfo *input,
+                                                      const ITensorInfo *output,
+                                                      const ITensorInfo *gamma,
+                                                      const ITensorInfo *beta, float epsilon)
+{
+  ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, gamma, beta, epsilon));
+  ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window(
+      input->clone().get(), (output == nullptr ? input->clone().get() : output->clone().get()))));
+  return Status{};
+}
+
+void NEInstanceNormalizationLayerKernelEx::run(const Window &window, const ThreadInfo &info)
+{
+  ARM_COMPUTE_UNUSED(info);
+  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+  ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+  (*_func)(_input, _output, _gamma, _beta, _epsilon, window);
+}
+} // namespace arm_compute
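For reference, the transform this kernel applies to every (channel, batch) plane is out = (in - mean) * gamma / sqrt(var + epsilon) + beta, with mean and variance taken over that plane's H x W elements. A scalar model of the same arithmetic, illustrative only (the vectorized path is instance_normalization_nchw above):

    #include <cmath>

    // One element of the NCHW instance normalization, matching the kernel's math.
    inline float instance_norm_ref(float x, float mean, float var, float gamma, float beta,
                                   float epsilon)
    {
      const float multip = gamma / std::sqrt(var + epsilon); // the kernel's "multip_h_w"
      return (x - mean) * multip + beta;
    }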
diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEMultiplyScaleFactorKernel.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEMultiplyScaleFactorKernel.cpp
new file mode 100644
index 000000000..de218d489
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEMultiplyScaleFactorKernel.cpp
@@ -0,0 +1,213 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEMuliplyScaleFactorKernel.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/NEON/NEAsymm.h"
+#include "arm_compute/core/NEON/wrapper/wrapper.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include "arm_compute/core/CPP/Validate.h"
+
+#include <arm_neon.h>
+
+using namespace arm_compute;
+
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *scale_factor,
+                          const ITensorInfo *output)
+{
+  ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+  ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 2);
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::S32);
+  ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(output);
+  ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape().total_size() == 0);
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32);
+  ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(scale_factor, 1, DataType::F16,
+                                                       DataType::F32);
+  ARM_COMPUTE_RETURN_ERROR_ON(scale_factor->tensor_shape().total_size() == 0);
+  ARM_COMPUTE_RETURN_ERROR_ON(scale_factor->num_dimensions() > 1);
+  ARM_COMPUTE_RETURN_ERROR_ON(scale_factor->dimension(0) != input->dimension(1));
+
+  return Status{};
+}
+
+inline int32x4x4_t load_value(const int32_t *input_ptr)
+{
+  return {wrapper::vloadq(input_ptr), wrapper::vloadq(input_ptr + 4),
+          wrapper::vloadq(input_ptr + 8), wrapper::vloadq(input_ptr + 12)};
+}
+
+inline float32x4x4_t load_value(const float *input_ptr)
+{
+  return {wrapper::vloadq(input_ptr), wrapper::vloadq(input_ptr + 4),
+          wrapper::vloadq(input_ptr + 8), wrapper::vloadq(input_ptr + 12)};
+}
+
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+inline float32x4x4_t load_value(const float16_t *input_ptr)
+{
+  return {vcvt_f32_f16(wrapper::vload(input_ptr)), vcvt_f32_f16(wrapper::vload(input_ptr + 4)),
+          vcvt_f32_f16(wrapper::vload(input_ptr + 8)),
+          vcvt_f32_f16(wrapper::vload(input_ptr + 12))};
+}
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+template <typename T> inline void store_result(T *ptr, const float32x4x4_t &v)
+{
+  ARM_COMPUTE_UNUSED(ptr, v);
+}
+
+template <> inline void store_result<float>(float *ptr, const float32x4x4_t &v)
+{
+  wrapper::vstore(ptr, v.val[0]);
+  wrapper::vstore(ptr + 4, v.val[1]);
+  wrapper::vstore(ptr + 8, v.val[2]);
+  wrapper::vstore(ptr + 12, v.val[3]);
+}
+
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+template <> inline void store_result<float16_t>(float16_t *ptr, const float32x4x4_t &v)
+{
+  wrapper::vstore(ptr, vcombine_f16(vcvt_f16_f32(v.val[0]), vcvt_f16_f32(v.val[1])));
+  wrapper::vstore(ptr + 8, vcombine_f16(vcvt_f16_f32(v.val[2]), vcvt_f16_f32(v.val[3])));
+}
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+
+inline float32x4x4_t multiply_scale_vec(const int32x4x4_t &iv, float scale)
+{
+  const float32x4_t vscale = vdupq_n_f32(scale);
+
+  const float32x4x4_t ret = {{
+      vmulq_f32(vcvtq_f32_s32(iv.val[0]), vscale), vmulq_f32(vcvtq_f32_s32(iv.val[1]), vscale),
+      vmulq_f32(vcvtq_f32_s32(iv.val[2]), vscale), vmulq_f32(vcvtq_f32_s32(iv.val[3]), vscale),
+  }};
+  return ret;
+}
+} // namespace
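The helpers above give the kernel a simple contract: every int32 row is converted to float and scaled by that row's factor times one global multiplier. A scalar model with illustrative names (row-major layout assumed; nothing here is from the patch):

    #include <cstdint>

    // out(x, y) = in(x, y) * scale(y) * multiplier for a rows-by-cols buffer.
    void multiply_scale_factor_ref(const int32_t *in, const float *scale, float multiplier,
                                   float *out, int rows, int cols)
    {
      for (int y = 0; y < rows; ++y)
      {
        const float row_scale = scale[y] * multiplier;
        for (int x = 0; x < cols; ++x)
        {
          out[y * cols + x] = static_cast<float>(in[y * cols + x]) * row_scale;
        }
      }
    }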
+NEMultiplyScaleFactorKernel::NEMultiplyScaleFactorKernel()
+    : _input(nullptr), _scale_factor(nullptr), _output(nullptr), _multiplier(1.f)
+{
+}
+
+void NEMultiplyScaleFactorKernel::configure(const ITensor *input, const ITensor *scale_factor,
+                                            ITensor *output, float multiplier)
+{
+  ARM_COMPUTE_ERROR_ON_NULLPTR(input, scale_factor, output);
+  ARM_COMPUTE_ERROR_THROW_ON(
+      validate_arguments(input->info(), scale_factor->info(), output->info()));
+
+  _input = input;
+  _scale_factor = scale_factor;
+  _output = output;
+  _multiplier = multiplier;
+
+  // Configure kernel window
+  Window win_config = calculate_max_window(*input->info(), Steps());
+
+  Coordinates coord;
+  coord.set_num_dimensions(output->info()->num_dimensions());
+  output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape()));
+
+  INEKernel::configure(win_config);
+}
+
+Status NEMultiplyScaleFactorKernel::validate(const ITensorInfo *input,
+                                             const ITensorInfo *scale_factor,
+                                             const ITensorInfo *output, float multiplier)
+{
+  ARM_COMPUTE_UNUSED(multiplier);
+  ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, scale_factor, output));
+
+  return Status{};
+}
+
+template <typename T> void NEMultiplyScaleFactorKernel::multiply(const Window &window)
+{
+  constexpr auto window_step = 16;
+  const auto window_start_x = static_cast<int>(window.x().start());
+  const auto window_end_x = static_cast<int>(window.x().end());
+
+  // Collapse window and reset first dimension to handle tail calculations manually
+  // Support Only 2D input
+  Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
+  Iterator input(_input, win_collapsed);
+  Iterator output(_output, win_collapsed);
+  win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
+  execute_window_loop(
+      win_collapsed,
+      [&](const Coordinates &id) {
+        auto scale = *reinterpret_cast<T *>(_scale_factor->ptr_to_element({id.y()}));
+        scale *= _multiplier;
+
+        const auto input_ptr = reinterpret_cast<const int32_t *>(input.ptr());
+        auto output_ptr = reinterpret_cast<T *>(output.ptr());
+        int x = window_start_x;
+        for (; x <= (window_end_x - window_step); x += window_step)
+        {
+          store_result<T>(&output_ptr[x], multiply_scale_vec(load_value(&input_ptr[x]), scale));
+        }
+        // Compute left-over elements
+        for (; x < window_end_x; ++x)
+        {
+          output_ptr[x] = input_ptr[x] * scale;
+        }
+      },
+      input, output);
+}
+
+void NEMultiplyScaleFactorKernel::run(const Window &window, const ThreadInfo &info)
+{
+  ARM_COMPUTE_UNUSED(info);
+  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+  ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+
+  switch (_output->info()->data_type())
+  {
+    case DataType::F32:
+      NEMultiplyScaleFactorKernel::multiply<float>(window);
+      break;
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+    case DataType::F16:
+      NEMultiplyScaleFactorKernel::multiply<float16_t>(window);
+      break;
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+    default:
+      ARM_COMPUTE_ERROR("Unsupported data type.");
+  }
+}
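A minimal standalone dispatch sketch for the kernel above; the tensors are assumed to be allocated and filled elsewhere, and their names are placeholders:

    NEMultiplyScaleFactorKernel kernel;
    kernel.configure(&input_s32, &scale_factor_f32, &output_f32, /*multiplier=*/1.f);
    // Split across threads along Y so each iteration reads a single row's scale factor.
    NEScheduler::get().schedule(&kernel, Window::DimY);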
diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEPReLUKernel.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEPReLUKernel.cpp
new file mode 100644
index 000000000..ad1bb9051
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEPReLUKernel.cpp
@@ -0,0 +1,274 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEPReLUKernel.h"
+
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/NEAsymm.h"
+#include "arm_compute/core/NEON/NEElementwiseOperationFuncs.h"
+#include "arm_compute/core/NEON/wrapper/wrapper.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Window.h"
+
+#include <arm_neon.h>
+
+using namespace arm_compute;
+namespace
+{
+
+/** Conditional element-wise operations */
+enum class ConditionalOperation
+{
+  PRELU, /**< (x * y) for x < 0, x for x >= 0 */
+};
+
+template <ConditionalOperation op, typename ScalarType>
+inline ScalarType elementwise_conditional_op_scalar(const ScalarType &a, const ScalarType &b)
+{
+  auto res = ScalarType(0);
+
+  switch (op)
+  {
+    case ConditionalOperation::PRELU:
+      res = a < 0 ? a * b : a;
+      break;
+    default:
+      ARM_COMPUTE_ERROR("NOT_SUPPORTED!");
+  }
+  return res;
+}
+
+template <ConditionalOperation op>
+inline uint8_t elementwise_conditional_op_quantized_scalar(const float &a, const float &b,
+                                                           QuantizationInfo qinfo)
+{
+  return qinfo.quantize(elementwise_conditional_op_scalar<op>(a, b), RoundingPolicy::TO_NEAREST_UP);
+}
+
+template <ConditionalOperation op, typename VectorType>
+inline VectorType elementwise_conditional_op(const VectorType &a, const VectorType &b)
+{
+  VectorType res = {0, 0, 0, 0};
+  VectorType const_0 = {0, 0, 0, 0};
+
+  switch (op)
+  {
+    case ConditionalOperation::PRELU:
+      res = wrapper::vbsl(wrapper::vcgt(a, const_0), a, wrapper::vmul(a, b));
+      break;
+    default:
+      ARM_COMPUTE_ERROR("NOT_SUPPORTED!");
+  }
+  return res;
+}
+
+template <ConditionalOperation op>
+inline float32x4x4_t elementwise_conditional_op(const float32x4x4_t &a, const float32x4x4_t &b)
+{
+  float32x4x4_t out = {{
+      elementwise_conditional_op<op>(a.val[0], b.val[0]),
+      elementwise_conditional_op<op>(a.val[1], b.val[1]),
+      elementwise_conditional_op<op>(a.val[2], b.val[2]),
+      elementwise_conditional_op<op>(a.val[3], b.val[3]),
+  }};
+  return out;
+}
+
+template <ConditionalOperation op, typename ScalarType, typename VectorType>
+inline VectorType elementwise_conditional_op_broadcast(const VectorType &a,
+                                                       const ScalarType &broadcast_value,
+                                                       const bool reorder)
+{
+  VectorType broadcast_vector = wrapper::vdup_n(broadcast_value, wrapper::traits::vector_128_tag());
+  return elementwise_conditional_op<op>(reorder ? broadcast_vector : a,
+                                        reorder ?
a : broadcast_vector); +} + +template <ConditionalOperation op, typename ScalarType, typename VectorType> +inline int elementwise_conditional_op_loop(int window_start_x, int window_end_x, int window_step_x, + const ScalarType *input1_ptr, + const ScalarType *input2_ptr, ScalarType *output_ptr) +{ + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto a = wrapper::vloadq(input1_ptr + x); + const auto b = wrapper::vloadq(input2_ptr + x); + wrapper::vstore(output_ptr + x, elementwise_conditional_op<op>(a, b)); + } + return x; +} + +template <ConditionalOperation op> +inline int elementwise_conditional_op_quantized_loop(int window_start_x, int window_end_x, + int window_step_x, const uint8_t *input1_ptr, + const uint8_t *input2_ptr, uint8_t *output_ptr, + int32x4_t voffset1, int32x4_t voffset2, + float32x4_t vscale1, float32x4_t vscale2, + float32x4_t voffseto, float32x4_t invvscaleo) +{ + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + // Get inputs and compute output + const float32x4x4_t af = load_quantized(input1_ptr + x, voffset1, vscale1); + const float32x4x4_t bf = load_quantized(input2_ptr + x, voffset2, vscale2); + const float32x4x4_t rf = elementwise_conditional_op<op>(af, bf); + store_quantized(output_ptr + x, rf, voffseto, invvscaleo); + } + return x; +} + +template <ConditionalOperation op, typename ScalarType, typename VectorType> +inline int elementwise_conditional_op_broadcast_loop(int window_start_x, int window_end_x, + int window_step_x, + const ScalarType *non_broadcast_input_ptr, + const ScalarType &broadcast_value, + ScalarType *output_ptr, const bool reorder) +{ + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto a = wrapper::vloadq((non_broadcast_input_ptr + x)); + wrapper::vstore(output_ptr + x, + elementwise_conditional_op_broadcast<op>(a, broadcast_value, reorder)); + } + return x; +} + +template <ConditionalOperation op> +inline int elementwise_conditional_op_quantized_broadcast_loop( + int window_start_x, int window_end_x, int window_step_x, const uint8_t *non_broadcast_input_ptr, + float32x4x4_t broadcast_vector, uint8_t *output_ptr, int32x4_t voffset_non_broadcast, + float32x4_t vscale_non_broadcast, float32x4_t voffseto, float32x4_t invvscaleo, bool reorder) +{ + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const float32x4x4_t af = + load_quantized(non_broadcast_input_ptr + x, voffset_non_broadcast, vscale_non_broadcast); + const float32x4x4_t rf = elementwise_conditional_op<op>(reorder ? broadcast_vector : af, + reorder ? 
af : broadcast_vector); + store_quantized(output_ptr + x, rf, voffseto, invvscaleo); + } + return x; +} + +template <ConditionalOperation op, typename ScalarType, typename VectorType> +void elementwise_conditional_op(const ITensor *in1, const ITensor *in2, ITensor *out, + const Window &window) +{ + elementwise_op(in1, in2, out, window, &elementwise_conditional_op_scalar<op, ScalarType>, + &elementwise_conditional_op_broadcast_loop<op, ScalarType, VectorType>, + &elementwise_conditional_op_loop<op, ScalarType, VectorType>); +} + +template <ConditionalOperation op> +void elementwise_conditional_op_quantized(const ITensor *in1, const ITensor *in2, ITensor *out, + const Window &window) +{ + elementwise_op_quantized(in1, in2, out, window, &elementwise_conditional_op_quantized_scalar<op>, + &elementwise_conditional_op_quantized_broadcast_loop<op>, + &elementwise_conditional_op_quantized_loop<op>); +} +} // namespace + +NEPReLUKernel::NEPReLUKernel() : _input(nullptr), _alpha(nullptr), _output(nullptr) {} + +void NEPReLUKernel::configure(const ITensor *input, const ITensor *alpha, ITensor *output) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, alpha, output); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*input->info(), *alpha->info(), *output->info())); + + // Configure kernel window + const std::pair<TensorShape, ValidRegion> broadcast_pair = + ITensorInfo::broadcast_shape_and_valid_region(*input->info(), *alpha->info()); + const TensorShape &out_shape = broadcast_pair.first; + const ValidRegion &valid_region = broadcast_pair.second; + + // Auto initialize output if not initialized + auto_init_if_empty(*output->info(), out_shape, 1, input->info()->data_type()); + + Window win = calculate_max_window(valid_region); + + _input = input; + _alpha = alpha; + _output = output; + INEKernel::configure(win); +} + +void NEPReLUKernel::run(const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); + + if (_input->info()->data_type() == DataType::F32) + { + elementwise_conditional_op<ConditionalOperation::PRELU, float, float32x4_t>(_input, _alpha, + _output, window); + } + else if (_input->info()->data_type() == DataType::QASYMM8) + { + elementwise_conditional_op_quantized<ConditionalOperation::PRELU>(_input, _alpha, _output, + window); + } + else + { + ARM_COMPUTE_ERROR("Wrong Type"); + } +} + +Status NEPReLUKernel::validate_arguments(const ITensorInfo &input, const ITensorInfo &alpha, + const ITensorInfo &output) +{ + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input, 1, DataType::QASYMM8, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input, &alpha, &output); + + const TensorShape out_shape = + TensorShape::broadcast_shape(input.tensor_shape(), alpha.tensor_shape()); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, + "Inputs are not broadcast compatible"); + + // Checks performed when output is configured + if (output.total_size() > 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + detail::have_different_dimensions(out_shape, output.tensor_shape(), 0), + "Wrong shape for output"); + } + + return Status{}; +} + +Status NEPReLUKernel::validate(const ITensorInfo *input, const ITensorInfo *alpha, + const ITensorInfo *output) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, alpha, output); + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*input, *alpha, *output)); + + return Status{}; +} diff --git 
a/compute/ARMComputeEx/src/core/NEON/kernels/NEQuantizationSymmetricKernel.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEQuantizationSymmetricKernel.cpp new file mode 100644 index 000000000..acf0092eb --- /dev/null +++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEQuantizationSymmetricKernel.cpp @@ -0,0 +1,224 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017-2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/NEON/kernels/NEQuantizationSymmetricKernel.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/NEON/NEAsymm.h" +#include "arm_compute/core/NEON/wrapper/wrapper.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +#include "arm_compute/core/CPP/Validate.h" + +#include <arm_neon.h> + +using namespace arm_compute; + +namespace +{ +Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, + const ITensorInfo *scale_factor) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input); + ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 2); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape().total_size() == 0); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S8); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(scale_factor, 1, DataType::F16, + DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON(scale_factor->tensor_shape().total_size() == 0); + ARM_COMPUTE_RETURN_ERROR_ON(scale_factor->num_dimensions() > 1); + ARM_COMPUTE_RETURN_ERROR_ON(scale_factor->dimension(0) != input->dimension(1)); + + return Status{}; +} + +inline float32x4x4_t load_value(const float *input_ptr) +{ + return {wrapper::vloadq(input_ptr), wrapper::vloadq(input_ptr + 4), + wrapper::vloadq(input_ptr + 8), wrapper::vloadq(input_ptr + 12)}; +} +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +inline const float32x4x4_t load_value(const float16_t *input_ptr) +{ + return {vcvt_f32_f16(wrapper::vload(input_ptr)), vcvt_f32_f16(wrapper::vload(input_ptr + 4)), + vcvt_f32_f16(wrapper::vload(input_ptr + 8)), + vcvt_f32_f16(wrapper::vload(input_ptr + 12))}; +} + +#endif // 
__ARM_FEATURE_FP16_VECTOR_ARITHMETIC + +inline float32x4_t round(const float32x4_t &fv) +{ + const float32x4_t point5_f32x4 = vdupq_n_f32(0.5f); + const float32x4_t zero_f32x4 = vdupq_n_f32(0.0f); + // If value < 0, mask = -1, else mask = 0 + int32x4_t mask_less_zero_ui32x4 = reinterpret_cast<int32x4_t>(vcltq_f32(fv, zero_f32x4)); + return vaddq_f32(fv, vaddq_f32(vcvtq_f32_s32(mask_less_zero_ui32x4), point5_f32x4)); +} + +inline int8x16_t vquantizeSymm(const float32x4x4_t &fv, float scale_factor_inv, int32_t max_scale) +{ + const float32x4_t vinvscale = vdupq_n_f32(scale_factor_inv); + const int32x4_t vposend = vdupq_n_s32(max_scale); + const int32x4_t vnagend = vdupq_n_s32(-max_scale); + + const int32x4x4_t rf = {{ +#ifdef __aarch64__ + vminq_s32(vposend, + vmaxq_s32(vnagend, vcvtnq_s32_f32(round(vmulq_f32(fv.val[0], vinvscale))))), + vminq_s32(vposend, + vmaxq_s32(vnagend, vcvtnq_s32_f32(round(vmulq_f32(fv.val[1], vinvscale))))), + vminq_s32(vposend, + vmaxq_s32(vnagend, vcvtnq_s32_f32(round(vmulq_f32(fv.val[2], vinvscale))))), + vminq_s32(vposend, + vmaxq_s32(vnagend, vcvtnq_s32_f32(round(vmulq_f32(fv.val[3], vinvscale))))), +#else //__aarch64__ + vminq_s32(vposend, vmaxq_s32(vnagend, vcvtq_s32_f32(round(vmulq_f32(fv.val[0], vinvscale))))), + vminq_s32(vposend, vmaxq_s32(vnagend, vcvtq_s32_f32(round(vmulq_f32(fv.val[1], vinvscale))))), + vminq_s32(vposend, vmaxq_s32(vnagend, vcvtq_s32_f32(round(vmulq_f32(fv.val[2], vinvscale))))), + vminq_s32(vposend, vmaxq_s32(vnagend, vcvtq_s32_f32(round(vmulq_f32(fv.val[3], vinvscale))))), +#endif //__aarch64__ + }}; + const int8x8_t pa = vqmovn_s16(vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1]))); + const int8x8_t pb = vqmovn_s16(vcombine_s16(vqmovn_s32(rf.val[2]), vqmovn_s32(rf.val[3]))); + return vcombine_s8(pa, pb); +} +} // namespace + +NEQuantizationSymmetricKernel::NEQuantizationSymmetricKernel() + : _input(nullptr), _output(nullptr), _scale_factor(nullptr) +{ +} + +void NEQuantizationSymmetricKernel::configure(const ITensor *input, ITensor *output, + ITensor *scale_factor) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_ERROR_THROW_ON( + validate_arguments(input->info(), output->info(), scale_factor->info())); + + _input = input; + _output = output; + _scale_factor = scale_factor; + + // Configure kernel window + Window win_config = calculate_max_window(*input->info(), Steps()); + + Coordinates coord; + coord.set_num_dimensions(output->info()->num_dimensions()); + output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape())); + + INEKernel::configure(win_config); +} + +Status NEQuantizationSymmetricKernel::validate(const ITensorInfo *input, const ITensorInfo *output, + const ITensorInfo *scale_factor) +{ + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, scale_factor)); + + return Status{}; +} + +template <typename T> void NEQuantizationSymmetricKernel::quantize(const Window &window) +{ + constexpr auto window_step = 16; + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + +#ifdef __aarch64__ + constexpr RoundingPolicy rounding_policy = RoundingPolicy::TO_NEAREST_EVEN; +#else //__aarch64__ + constexpr RoundingPolicy rounding_policy = RoundingPolicy::TO_NEAREST_UP; +#endif //__aarch64__ + + // Collapse window and reset first dimension to handle tail calculations manually + // Support Only 2D input + Window win_collapsed = window; + Iterator input(_input, win_collapsed); + Iterator output(_output, 
win_collapsed); + const auto dim_x = _input->info()->dimension(0); + win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); + execute_window_loop( + win_collapsed, + [&](const Coordinates &id) { + const auto start = reinterpret_cast<const T *>(input.ptr()); + const auto min_max = std::minmax_element(start, start + dim_x); + const auto int8_scale = 127; + auto range = std::max(std::abs(*min_max.first), std::abs(*min_max.second)); + if (range == 0) + { + *reinterpret_cast<T *>(_scale_factor->ptr_to_element({id.y()})) = 1; + range = 1; + } + else + { + *reinterpret_cast<T *>(_scale_factor->ptr_to_element({id.y()})) = range / int8_scale; + } + const auto scale_factor_inv = int8_scale / range; + + auto input_ptr = reinterpret_cast<const T *>(input.ptr()); + auto output_ptr = reinterpret_cast<int8_t *>(output.ptr()); + int x = window_start_x; + for (; x <= (window_end_x - window_step); x += window_step) + { + wrapper::vstore(&output_ptr[x], + vquantizeSymm(load_value(&input_ptr[x]), scale_factor_inv, int8_scale)); + } + // Compute left-over elements + for (; x < window_end_x; ++x) + { + int quantized = arm_compute::round(input_ptr[x] * scale_factor_inv, rounding_policy); + quantized = std::min(int8_scale, std::max(quantized, -int8_scale)); + output_ptr[x] = static_cast<int8_t>(quantized); + } + }, + input, output); +} + +void NEQuantizationSymmetricKernel::run(const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); + + switch (_input->info()->data_type()) + { + case DataType::F32: + NEQuantizationSymmetricKernel::quantize<float>(window); + break; +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + case DataType::F16: + NEQuantizationSymmetricKernel::quantize<float16_t>(window); + break; +#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + default: + ARM_COMPUTE_ERROR("Unsupported data type."); + } +} diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEReductionOperationKernelEx.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEReductionOperationKernelEx.cpp new file mode 100644 index 000000000..59e7d9beb --- /dev/null +++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEReductionOperationKernelEx.cpp @@ -0,0 +1,677 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017-2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/NEON/kernels/NEReductionOperationKernelEx.h" + +#include "arm_compute/core/CPP/Validate.h" +#include "arm_compute/core/Coordinates.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/IAccessWindow.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/NEON/INEKernel.h" +#include "arm_compute/core/NEON/NEMath.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" + +#include "arm_compute/core/NEON/wrapper/wrapper.h" +#include <arm_neon.h> + +namespace arm_compute +{ +namespace +{ +// Helper function to calculate the minimum value of the input vector. All the elements in the +// output vector contain the min value. +float32x2_t calculate_min(float32x4_t in) +{ + auto pmin = wrapper::vpmin(wrapper::vgethigh(in), wrapper::vgetlow(in)); + return wrapper::vpmin(pmin, pmin); +} + +// Helper function to calculate the maximum value of the input vector. All the elements in the +// output vector contain the max value. +float32x2_t calculate_max(float32x4_t in) +{ + auto pmax = wrapper::vpmax(wrapper::vgethigh(in), wrapper::vgetlow(in)); + return wrapper::vpmax(pmax, pmax); +} +// Helper function to calculate the minimum value of the input vector. All the elements in the +// output vector contain the min value. +int32x2_t calculate_min(int32x4_t in) +{ + auto pmin = wrapper::vpmin(wrapper::vgethigh(in), wrapper::vgetlow(in)); + return wrapper::vpmin(pmin, pmin); +} + +// Helper function to calculate the maximum value of the input vector. All the elements in the +// output vector contain the max value. +int32x2_t calculate_max(int32x4_t in) +{ + auto pmax = wrapper::vpmax(wrapper::vgethigh(in), wrapper::vgetlow(in)); + return wrapper::vpmax(pmax, pmax); +} + +// Helper function to calculate the minimum value of the input vector. All the elements in the +// output vector contain the min value. +inline uint8x8_t calculate_min(uint8x16_t in) +{ + auto pmin = wrapper::vpmin(wrapper::vgethigh(in), wrapper::vgetlow(in)); + pmin = wrapper::vpmin(pmin, pmin); + pmin = wrapper::vpmin(pmin, pmin); + return wrapper::vpmin(pmin, pmin); +} +// Helper function to calculate the maximum value of the input vector. All the elements in the +// output vector contain the max value. +inline uint8x8_t calculate_max(uint8x16_t in) +{ + auto pmax = wrapper::vpmax(wrapper::vgethigh(in), wrapper::vgetlow(in)); + pmax = wrapper::vpmax(pmax, pmax); + pmax = wrapper::vpmax(pmax, pmax); + return wrapper::vpmax(pmax, pmax); +} + +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +// Helper function to calculate the minimum value of the input vector. All the elements in the +// output vector contain the min value. +inline float16x4_t calculate_min(float16x8_t in) +{ + auto pmin = wrapper::vpmin(wrapper::vgethigh(in), wrapper::vgetlow(in)); + pmin = wrapper::vpmin(pmin, pmin); + return wrapper::vpmin(pmin, pmin); +} +// Helper function to calculate the maximum value of the input vector. All the elements in the +// output vector contain the max value. 
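// Same pairwise-fold idiom as the wider types: each vpmax halves the number of
// distinct lanes, so the eight fp16 lanes collapse over three folds into a single
// maximum broadcast across the returned float16x4_t.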
+inline float16x4_t calculate_max(float16x8_t in) +{ + auto pmax = wrapper::vpmax(wrapper::vgethigh(in), wrapper::vgetlow(in)); + pmax = wrapper::vpmax(pmax, pmax); + return wrapper::vpmax(pmax, pmax); +} +#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + +template <class F> class Reducer +{ +public: + static void reduceX(const Window &window, const ITensor *input, ITensor *output, F f, + const ReduceOperation op) + { + // Set out window + Window out_window(window); + out_window.set(Window::DimX, Window::Dimension(0, 0, 0)); + + // Get first input and output slices + Window in_slice = window.first_slice_window_1D(); + Window out_slice = out_window.first_slice_window_1D(); + + do + { + Iterator in(input, in_slice); + Iterator out(output, out_slice); + + f(in, out, in_slice, out_slice, *input->info(), op); + } while (window.slide_window_slice_1D(in_slice) && out_window.slide_window_slice_1D(out_slice)); + } + static void reduceY(const Window &window, const ITensor *input, ITensor *output, F f, + const ReduceOperation op) + { + // Set in window + Window in_window(window); + Window out_window(window); + + in_window.set(Window::DimY, Window::Dimension(0, 1, 1)); + out_window.set(Window::DimY, Window::Dimension(0, output->info()->dimension(1), + output->info()->dimension(1))); + + // Get first input and output slices + Window in_slice = in_window.first_slice_window_2D(); + Window out_slice = out_window.first_slice_window_2D(); + + do + { + Iterator in(input, in_slice); + Iterator out(output, out_slice); + + f(in, out, in_slice, out_slice, *input->info(), 1, op); + } while (in_window.slide_window_slice_2D(in_slice) && + out_window.slide_window_slice_2D(out_slice)); + } + static void reduceZ(const Window &window, const ITensor *input, ITensor *output, F f, + const ReduceOperation op) + { + // Set in window + Window in_window(window); + Window out_window(window); + + in_window.set(Window::DimZ, Window::Dimension(0, 1, 1)); + out_window.set(Window::DimZ, Window::Dimension(0, output->info()->dimension(2), + output->info()->dimension(2))); + + // Get first input and output slices + Window in_slice = in_window.first_slice_window_3D(); + Window out_slice = out_window.first_slice_window_3D(); + + do + { + Iterator in(input, in_slice); + Iterator out(output, out_slice); + + f(in, out, in_slice, out_slice, *input->info(), 2, op); + } while (in_window.slide_window_slice_3D(in_slice) && + out_window.slide_window_slice_3D(out_slice)); + } + static void reduceW(const Window &window, const ITensor *input, ITensor *output, F f, + const ReduceOperation op) + { + // Set in/out window + Window in_window(window); + Window out_window(window); + + in_window.set(3, Window::Dimension(0, 1, 1)); + out_window.set(3, Window::Dimension(0, 1, 1)); + + // Get first input and output slices + Window in_slice = in_window.first_slice_window_4D(); + Window out_slice = out_window.first_slice_window_4D(); + + do + { + Iterator in(input, in_slice); + Iterator out(output, out_slice); + + f(in, out, in_slice, out_slice, *input->info(), 3, op); + } while (in_window.slide_window_slice_4D(in_slice) && + out_window.slide_window_slice_4D(out_slice)); + } +}; + +template <typename T, int S> struct RedOpX +{ + /** NEON vector tag type. 
*/ + using ExactTagType = typename wrapper::traits::neon_vector<T, S>::tag_type; + + inline void operator()(Iterator &input, Iterator &output, Window &in_slice, Window &out_slice, + const TensorInfo &in_info, const ReduceOperation op) + { + ARM_COMPUTE_UNUSED(out_slice); + ARM_COMPUTE_UNUSED(in_info); + auto init_res_value = static_cast<T>(0.f); + switch (op) + { + case ReduceOperation::MIN: + case ReduceOperation::MAX: + { + init_res_value = *reinterpret_cast<T *>(input.ptr()); + break; + } + default: + break; + } + auto vec_res_value = wrapper::vdup_n(init_res_value, ExactTagType{}); + + execute_window_loop(in_slice, + [&](const Coordinates &) { + const auto in_ptr = reinterpret_cast<const T *>(input.ptr()); + const auto vec_elements = wrapper::vloadq(in_ptr); + + switch (op) + { + case ReduceOperation::MIN: + { + vec_res_value = wrapper::vmin(vec_elements, vec_res_value); + break; + } + case ReduceOperation::MAX: + { + vec_res_value = wrapper::vmax(vec_elements, vec_res_value); + break; + } + default: + ARM_COMPUTE_ERROR("Not supported"); + } + }, + input); + + switch (op) + { + case ReduceOperation::MIN: + { + *(reinterpret_cast<T *>(output.ptr())) = wrapper::vgetlane(calculate_min(vec_res_value), 0); + break; + } + case ReduceOperation::MAX: + { + *(reinterpret_cast<T *>(output.ptr())) = wrapper::vgetlane(calculate_max(vec_res_value), 0); + break; + } + default: + ARM_COMPUTE_ERROR("Not supported"); + } + } +}; + +struct RedOpX_qasymm8 +{ + inline void operator()(Iterator &input, Iterator &output, Window &in_slice, Window &out_slice, + const TensorInfo &in_info, const ReduceOperation op) + { + ARM_COMPUTE_UNUSED(out_slice); + ARM_COMPUTE_UNUSED(in_info); + + uint8x16_t vec_res_value = {0}; + + if (op == ReduceOperation::MIN || op == ReduceOperation::MAX) + { + vec_res_value = wrapper::vdup_n(*input.ptr(), wrapper::traits::vector_128_tag{}); + } + + execute_window_loop(in_slice, + [&](const Coordinates &) { + const auto vec_elements = wrapper::vloadq(input.ptr()); + switch (op) + { + case ReduceOperation::MIN: + { + vec_res_value = wrapper::vmin(vec_elements, vec_res_value); + break; + } + case ReduceOperation::MAX: + { + vec_res_value = wrapper::vmax(vec_elements, vec_res_value); + break; + } + default: + ARM_COMPUTE_ERROR("Not supported"); + } + }, + input); + + switch (op) + { + case ReduceOperation::MIN: + { + *(output.ptr()) = static_cast<uint8_t>(wrapper::vgetlane(calculate_min(vec_res_value), 0)); + break; + } + case ReduceOperation::MAX: + { + *(output.ptr()) = static_cast<uint8_t>(wrapper::vgetlane(calculate_max(vec_res_value), 0)); + break; + } + default: + { + ARM_COMPUTE_ERROR("Not supported"); + } + } + } +}; + +template <typename T, int S> struct RedOpYZW +{ + /** NEON vector tag type. 
*/ + using ExactTagType = typename wrapper::traits::neon_vector<T, S>::tag_type; + using neon_vector = typename wrapper::traits::neon_vector<T, S>::type; + + inline void operator()(Iterator &input, Iterator &output, Window &in_slice, Window &out_slice, + const TensorInfo &in_info, int axis, const ReduceOperation op) + { + ARM_COMPUTE_UNUSED(out_slice); + + execute_window_loop( + in_slice, + [&](const Coordinates &) { + neon_vector vec_res_value = {0}; + switch (op) + { + case ReduceOperation::MIN: + case ReduceOperation::MAX: + { + vec_res_value = wrapper::vloadq(reinterpret_cast<T *>(input.ptr())); + break; + } + default: + { + vec_res_value = wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{}); + break; + } + } + + for (unsigned int dim = 0; dim < in_info.dimension(axis); ++dim) + { + T *in_ptr; + switch (axis) + { + case 1: + in_ptr = reinterpret_cast<T *>( + input.ptr() + in_info.offset_element_in_bytes(Coordinates(0, dim))); + break; + case 2: + in_ptr = reinterpret_cast<T *>( + input.ptr() + in_info.offset_element_in_bytes(Coordinates(0, 0, dim))); + break; + case 3: + in_ptr = reinterpret_cast<T *>( + input.ptr() + in_info.offset_element_in_bytes(Coordinates(0, 0, 0, dim))); + break; + default: + ARM_COMPUTE_ERROR("Not supported"); + } + const auto vec_elements = wrapper::vloadq(in_ptr); + + switch (op) + { + case ReduceOperation::MIN: + { + vec_res_value = wrapper::vmin(vec_elements, vec_res_value); + break; + } + case ReduceOperation::MAX: + { + vec_res_value = wrapper::vmax(vec_elements, vec_res_value); + break; + } + default: + ARM_COMPUTE_ERROR("Not supported"); + } + } + wrapper::vstore(reinterpret_cast<T *>(output.ptr()), vec_res_value); + }, + input, output); + } +}; + +struct RedOpYZW_qasymm8 +{ + inline void operator()(Iterator &input, Iterator &output, Window &in_slice, Window &out_slice, + const TensorInfo &in_info, int axis, const ReduceOperation op) + { + ARM_COMPUTE_UNUSED(out_slice); + + execute_window_loop( + in_slice, + [&](const Coordinates &) { + auto vec_res_value = wrapper::vloadq(input.ptr()); + + for (unsigned int index_dim = 0; index_dim < in_info.dimension(axis); ++index_dim) + { + uint8_t *in_ptr; + switch (axis) + { + case 1: + in_ptr = input.ptr() + in_info.offset_element_in_bytes(Coordinates(0, index_dim)); + break; + case 2: + in_ptr = + input.ptr() + in_info.offset_element_in_bytes(Coordinates(0, 0, index_dim)); + break; + case 3: + in_ptr = + input.ptr() + in_info.offset_element_in_bytes(Coordinates(0, 0, 0, index_dim)); + break; + default: + ARM_COMPUTE_ERROR("Not supported"); + } + const auto vec_elements = wrapper::vloadq(in_ptr); + + switch (op) + { + case ReduceOperation::MIN: + { + vec_res_value = wrapper::vmin(vec_elements, vec_res_value); + break; + } + case ReduceOperation::MAX: + { + vec_res_value = wrapper::vmax(vec_elements, vec_res_value); + break; + } + default: + ARM_COMPUTE_ERROR("Not supported"); + } + } + wrapper::vstore(reinterpret_cast<uint8_t *>(output.ptr()), vec_res_value); + }, + input, output); + } +}; + +void reduce_op(const Window &window, const ITensor *input, ITensor *output, unsigned int axis, + const ReduceOperation op) +{ + const bool is_complex = (input->info()->num_channels() == 2); + if (is_complex) + { + ARM_COMPUTE_ERROR("Not supported"); + } + + switch (axis) + { + case 0: + switch (input->info()->data_type()) + { + case DataType::QASYMM8: + return Reducer<RedOpX_qasymm8>::reduceX(window, input, output, RedOpX_qasymm8(), op); +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + case DataType::F16: + return 
Reducer<RedOpX<float16_t, 8>>::reduceX(window, input, output, + RedOpX<float16_t, 8>(), op); +#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + case DataType::F32: + return Reducer<RedOpX<float, 4>>::reduceX(window, input, output, RedOpX<float, 4>(), op); + case DataType::S32: + return Reducer<RedOpX<int32_t, 4>>::reduceX(window, input, output, RedOpX<int32_t, 4>(), + op); + default: + ARM_COMPUTE_ERROR("Not supported"); + } + case 1: + switch (input->info()->data_type()) + { + case DataType::QASYMM8: + return Reducer<RedOpYZW_qasymm8>::reduceY(window, input, output, RedOpYZW_qasymm8(), op); +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + case DataType::F16: + return Reducer<RedOpYZW<float16_t, 8>>::reduceY(window, input, output, + RedOpYZW<float16_t, 8>(), op); +#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + case DataType::F32: + return Reducer<RedOpYZW<float, 4>>::reduceY(window, input, output, RedOpYZW<float, 4>(), + op); + case DataType::S32: + return Reducer<RedOpYZW<int32_t, 4>>::reduceY(window, input, output, + RedOpYZW<int32_t, 4>(), op); + default: + ARM_COMPUTE_ERROR("Not supported"); + } + case 2: + switch (input->info()->data_type()) + { + case DataType::QASYMM8: + return Reducer<RedOpYZW_qasymm8>::reduceZ(window, input, output, RedOpYZW_qasymm8(), op); +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + case DataType::F16: + return Reducer<RedOpYZW<float16_t, 8>>::reduceZ(window, input, output, + RedOpYZW<float16_t, 8>(), op); +#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + case DataType::F32: + return Reducer<RedOpYZW<float, 4>>::reduceZ(window, input, output, RedOpYZW<float, 4>(), + op); + case DataType::S32: + return Reducer<RedOpYZW<int32_t, 4>>::reduceZ(window, input, output, + RedOpYZW<int32_t, 4>(), op); + default: + ARM_COMPUTE_ERROR("Not supported"); + } + case 3: + switch (input->info()->data_type()) + { + case DataType::QASYMM8: + return Reducer<RedOpYZW_qasymm8>::reduceW(window, input, output, RedOpYZW_qasymm8(), op); +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + case DataType::F16: + return Reducer<RedOpYZW<float16_t, 8>>::reduceW(window, input, output, + RedOpYZW<float16_t, 8>(), op); +#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + case DataType::F32: + return Reducer<RedOpYZW<float, 4>>::reduceW(window, input, output, RedOpYZW<float, 4>(), + op); + case DataType::S32: + return Reducer<RedOpYZW<int32_t, 4>>::reduceW(window, input, output, + RedOpYZW<int32_t, 4>(), op); + default: + ARM_COMPUTE_ERROR("Not supported"); + } + default: + ARM_COMPUTE_ERROR("Unsupported reduction axis"); + } +} + +Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, + ReduceOperation op) +{ + ARM_COMPUTE_UNUSED(op); + + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input); + + if (input->num_channels() == 1) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::S32, + DataType::F16, DataType::F32); + } + else + { + ARM_COMPUTE_RETURN_ERROR_MSG("Not support complex"); + } + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions, + "Reduction axis greater than max number of dimensions"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis > 3, "Unsupported reduction axis"); + + if (output->total_size() != 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output); + ARM_COMPUTE_RETURN_ERROR_ON(input->num_channels() != output->num_channels()); + + 
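// Note: this check assumes compute_reduced_shape keeps the tensor rank and collapses
// the reduced axis to a single element, e.g. reducing (8, 4, 2) over axis 1 is
// validated against (8, 1, 2); the exact convention lives in ShapeCalculator.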
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis,
+                          ReduceOperation op)
+{
+  ARM_COMPUTE_UNUSED(op);
+
+  ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+  ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
+
+  if (input->num_channels() == 1)
+  {
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::S32,
+                                                         DataType::F16, DataType::F32);
+  }
+  else
+  {
+    ARM_COMPUTE_RETURN_ERROR_MSG("Complex (multi-channel) input is not supported");
+  }
+
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions,
+                                  "Reduction axis must be less than the maximum number of dimensions");
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis > 3, "Unsupported reduction axis");
+
+  if (output->total_size() != 0)
+  {
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
+    ARM_COMPUTE_RETURN_ERROR_ON(input->num_channels() != output->num_channels());
+
+    // The output must match the input shape with the reduced axis collapsed.
+    const TensorShape output_shape =
+      arm_compute::misc::shape_calculator::compute_reduced_shape(input->tensor_shape(), axis);
+    const TensorInfo tensor_info_reshaped = input->clone()->set_tensor_shape(output_shape);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_reshaped);
+  }
+
+  return Status{};
+}
+
+std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output,
+                                                         unsigned int axis, ReduceOperation op)
+{
+  ARM_COMPUTE_UNUSED(op);
+
+  // Calculate output shape and set if empty
+  const TensorShape output_shape =
+    arm_compute::misc::shape_calculator::compute_reduced_shape(input->tensor_shape(), axis);
+
+  // Output auto initialization if not yet initialized
+  DataType output_data_type = input->data_type();
+  auto_init_if_empty(*output, input->clone()
+                                ->set_tensor_shape(output_shape)
+                                .set_data_type(output_data_type)
+                                .reset_padding()
+                                .set_is_resizable(true));
+
+  // Process one full 128-bit NEON register per iteration.
+  unsigned int num_elems_processed_per_iteration = 16 / data_size_from_type(input->data_type());
+
+  // Configure kernel window
+  Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
+  AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
+  AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
+
+  bool window_changed = update_window_and_padding(win, input_access, output_access);
+  output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
+
+  Status err = (window_changed)
+                 ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!")
+                 : Status{};
+
+  return std::make_tuple(err, win);
+}
+} // namespace
+
+NEReductionOperationKernelEx::NEReductionOperationKernelEx()
+  : _input(nullptr), _output(nullptr), _reduction_axis(0), _op(ReduceOperation::MAX),
+    _border_size()
+{
+}
+
+BorderSize NEReductionOperationKernelEx::border_size() const { return _border_size; }
+
+void NEReductionOperationKernelEx::configure(const ITensor *input, ITensor *output,
+                                             unsigned int axis, ReduceOperation op)
+{
+  ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
+  ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), axis, op));
+
+  unsigned int num_elems_processed_per_iteration =
+    16 / data_size_from_type(input->info()->data_type());
+
+  _input = input;
+  _output = output;
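+  // A reduction along X loads whole vectors, so the row tail needs a
+  // right-hand border; e.g. F32 gives num_elems_processed_per_iteration =
+  // 16 / 4 = 4, and a row of width 10 gets a border of 4 - (10 % 4) = 2
+  // elements so the last vector load stays inside the padded buffer.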
+  _border_size =
+    (axis == 0)
+      ? BorderSize(0, num_elems_processed_per_iteration -
+                        (input->info()->dimension(0) % num_elems_processed_per_iteration),
+                   0, 0)
+      : BorderSize();
+  _op = op;
+  _reduction_axis = axis;
+
+  // Configure kernel window
+  auto win_config = validate_and_configure_window(_input->info(), _output->info(), axis, op);
+
+  ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config));
+
+  INEKernel::configure(std::get<1>(win_config));
+}
+
+Status NEReductionOperationKernelEx::validate(const ITensorInfo *input, const ITensorInfo *output,
+                                              unsigned int axis, ReduceOperation op)
+{
+  ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, axis, op));
+  ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(
+    validate_and_configure_window(input->clone().get(), output->clone().get(), axis, op)));
+
+  return Status{};
+}
+
+void NEReductionOperationKernelEx::run(const Window &window, const ThreadInfo &info)
+{
+  ARM_COMPUTE_UNUSED(info);
+  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+  ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+
+  reduce_op(window, _input, _output, _reduction_axis, _op);
+}
+} // namespace arm_compute
diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NESpaceToDepthLayerKernelEx.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NESpaceToDepthLayerKernelEx.cpp
new file mode 100644
index 000000000..36a2f55a9
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/NEON/kernels/NESpaceToDepthLayerKernelEx.cpp
@@ -0,0 +1,165 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NESpaceToDepthLayerKernelEx.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/wrapper/wrapper.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/misc/ShapeCalculatorEx.h"
+#include <arm_neon.h>
+#include <cstdint>
+#include <cstring> // memcpy
+
+using namespace arm_compute::misc::shape_calculator;
+
+namespace arm_compute
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, int32_t block_shape)
+{
+  ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+  ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4);
+
+  ARM_COMPUTE_RETURN_ERROR_ON(block_shape < 1);
+
+  // Validate output if initialized
+  if (output->total_size() != 0)
+  {
+    const DataLayout data_layout = input->data_layout();
+    const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+    const int idx_height =
+      get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+    const int idx_channel =
+      get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
+    const int idx_batch =
+      get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES);
+    // Spatial dimensions must tile evenly into block_shape x block_shape cells,
+    // and the transform only moves elements, so the total count is preserved.
+    ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape()[idx_width] % block_shape != 0);
+    ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape()[idx_height] % block_shape != 0);
+    ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape()[idx_batch] !=
+                                output->tensor_shape()[idx_batch]);
+    ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape()[idx_channel] % (block_shape * block_shape) !=
+                                0);
+    ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape().total_size() !=
+                                output->tensor_shape().total_size());
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+  }
+
+  return Status{};
+}
+} // namespace
+
+NESpaceToDepthLayerKernelEx::NESpaceToDepthLayerKernelEx()
+  : _input(nullptr), _output(nullptr), _block_shape()
+{
+}
+
+void NESpaceToDepthLayerKernelEx::configure(const ITensor *input, ITensor *output,
+                                            int32_t block_shape)
+{
+  ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
+  // Infer the output shape from the block size and initialize the output if needed.
+  TensorShape output_shape = compute_space_to_depth_shape_ex(input->info(), block_shape);
+  auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type());
+
+  ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), block_shape));
+
+  _input = input;
+  _block_shape = block_shape;
+  _output = output;
+
+  // Configure kernel window
+  Window win = calculate_max_window(*output->info(), Steps());
+  INEKernel::configure(win);
+}
+
+Status NESpaceToDepthLayerKernelEx::validate(const ITensorInfo *input, const ITensorInfo *output,
+                                             int32_t block_shape)
+{
+  ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, block_shape));
+  return Status{};
+}
+
+void NESpaceToDepthLayerKernelEx::run(const Window &window, const ThreadInfo &info)
+{
+  ARM_COMPUTE_UNUSED(info);
+  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+  ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICPPKernel::window(), window);
+
+  const DataLayout data_layout = _input->info()->data_layout();
+  const int channel_idx =
+    get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
+  const int element_size = _input->info()->element_size();
+
+  const size_t channel_size = _input->info()->dimension(channel_idx);
+
+  Window slice_out = window.first_slice_window_3D();
+
+  int batch_id = 0;
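+
+  // Gather loop: each output element pulls its value from the input element it
+  // originated from. The block-cell offset is recovered from the output channel
+  // index; e.g. with _block_shape = 2 and channel_size = 3, output channel 7
+  // reads input channel 7 % 3 = 1 at cell x-offset (7 / 3) % 2 = 0 and
+  // y-offset (7 / 3) / 2 = 1.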
+  // Main loop for NCHW and NHWC
+  if (_output->info()->data_layout() == DataLayout::NCHW)
+  {
+    // NCHW: the channel index lives on the Z window axis.
+    do
+    {
+      Iterator out(_output, slice_out);
+      execute_window_loop(slice_out,
+                          [&](const Coordinates &id) {
+                            const size_t channel_id = id.z();
+                            const size_t in_x =
+                              id.x() * _block_shape + (channel_id / channel_size) % _block_shape;
+                            const size_t in_y =
+                              id.y() * _block_shape + (channel_id / channel_size) / _block_shape;
+                            const int z = channel_id % channel_size;
+                            Coordinates input_coords{in_x, in_y, z, batch_id};
+                            memcpy(out.ptr(), _input->ptr_to_element(input_coords), element_size);
+                          },
+                          out);
+      ++batch_id;
+    } while (window.slide_window_slice_3D(slice_out));
+  }
+  else
+  {
+    // NHWC: the channel index lives on the X window axis.
+    do
+    {
+      Iterator out(_output, slice_out);
+      execute_window_loop(slice_out,
+                          [&](const Coordinates &id) {
+                            const size_t channel_id = id.x();
+                            const size_t in_x =
+                              id.y() * _block_shape + (channel_id / channel_size) % _block_shape;
+                            const size_t in_y =
+                              id.z() * _block_shape + (channel_id / channel_size) / _block_shape;
+                            const int z = channel_id % channel_size;
+                            Coordinates input_coords{z, in_x, in_y, batch_id};
+                            memcpy(out.ptr(), _input->ptr_to_element(input_coords), element_size);
+                          },
+                          out);
+      ++batch_id;
+    } while (window.slide_window_slice_3D(slice_out));
+  }
+}
+} // namespace arm_compute