diff options
author | Chunseok Lee <chunseok.lee@samsung.com> | 2020-03-05 15:10:09 +0900 |
---|---|---|
committer | Chunseok Lee <chunseok.lee@samsung.com> | 2020-03-05 15:22:53 +0900 |
commit | d91a039e0eda6fd70dcd22672b8ce1817c1ca50e (patch) | |
tree | 62668ec548cf31fadbbf4e99522999ad13434a25 /runtimes/libs/ARMComputeEx/src/core/NEON/NEElementwiseOperationFuncs.cpp | |
parent | bd11b24234d7d43dfe05a81c520aa01ffad06e42 (diff) | |
download | nnfw-d91a039e0eda6fd70dcd22672b8ce1817c1ca50e.tar.gz nnfw-d91a039e0eda6fd70dcd22672b8ce1817c1ca50e.tar.bz2 nnfw-d91a039e0eda6fd70dcd22672b8ce1817c1ca50e.zip |
catch up to tizen_5.5 and remove unnecessary dir
- update to tizen_5.5
- remove dirs
Diffstat (limited to 'runtimes/libs/ARMComputeEx/src/core/NEON/NEElementwiseOperationFuncs.cpp')
-rw-r--r-- | runtimes/libs/ARMComputeEx/src/core/NEON/NEElementwiseOperationFuncs.cpp | 346 |
1 files changed, 346 insertions, 0 deletions
diff --git a/runtimes/libs/ARMComputeEx/src/core/NEON/NEElementwiseOperationFuncs.cpp b/runtimes/libs/ARMComputeEx/src/core/NEON/NEElementwiseOperationFuncs.cpp new file mode 100644 index 000000000..4508f5800 --- /dev/null +++ b/runtimes/libs/ARMComputeEx/src/core/NEON/NEElementwiseOperationFuncs.cpp @@ -0,0 +1,346 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "arm_compute/core/NEON/NEElementwiseOperationFuncs.h" + +#include <algorithm> +#include "arm_compute/core/Types.h" +#include "arm_compute/core/NEON/NEAsymm.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/Window.h" + +namespace +{ +void store_quantized_int32(uint8_t *output_ptr, const int32x4x4_t &out) +{ + const uint8x8_t pa = vqmovun_s16(vcombine_s16(vqmovn_s32(out.val[0]), vqmovn_s32(out.val[1]))); + const uint8x8_t pb = vqmovun_s16(vcombine_s16(vqmovn_s32(out.val[2]), vqmovn_s32(out.val[3]))); + vst1q_u8(output_ptr, vcombine_u8(pa, pb)); +} + +using namespace arm_compute; +template <typename InputScalarType, typename OutputScalarType, typename InputVectorType> +void elementwise_op_templ( + const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, + OutputScalarType (*scalar_func)(const InputScalarType &, const InputScalarType &), + int (*broadcast_func)(int, int, int, const InputScalarType *, const InputScalarType &, + OutputScalarType *, const bool), + int (*neon_func)(int, int, int, const InputScalarType *, const InputScalarType *, + OutputScalarType *)) +{ + // Create input windows + Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()); + Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()); + + // Clear X Dimension on execution window as we handle manually + Window win = window; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + const int window_step_x = std::min(16 / static_cast<int>(sizeof(OutputScalarType)), 8); + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + const bool is_broadcast_across_x = (input1_win.x().step() == 0) || (input2_win.x().step() == 0); + + if (is_broadcast_across_x) + { + const bool is_broadcast_input_2 = input2_win.x().step() == 0; + Window broadcast_win = is_broadcast_input_2 ? 
input2_win : input1_win; + Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win; + const ITensor *broadcast_tensor = is_broadcast_input_2 ? in2 : in1; + const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1; + + // Clear X Dimension on execution window as we handle manually + non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator broadcast_input(broadcast_tensor, broadcast_win); + Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); + Iterator output(out, win); + + execute_window_loop(win, + [&](const Coordinates &) { + auto output_ptr = reinterpret_cast<OutputScalarType *>(output.ptr()); + const auto non_broadcast_input_ptr = + reinterpret_cast<const InputScalarType *>(non_broadcast_input.ptr()); + const InputScalarType broadcast_value = + *reinterpret_cast<const InputScalarType *>(broadcast_input.ptr()); + + int x = (*broadcast_func)(window_start_x, window_end_x, window_step_x, + non_broadcast_input_ptr, broadcast_value, + output_ptr, !is_broadcast_input_2); + for (; x < window_end_x; ++x) + { + const auto a = *(non_broadcast_input_ptr + x); + *(output_ptr + x) = + (*scalar_func)(!is_broadcast_input_2 ? broadcast_value : a, + !is_broadcast_input_2 ? 
a : broadcast_value); + } + }, + broadcast_input, non_broadcast_input, output); + } + else + { + // Clear X Dimension on execution window as we handle manually + input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input1(in1, input1_win); + Iterator input2(in2, input2_win); + Iterator output(out, win); + + execute_window_loop(win, + [&](const Coordinates &) { + auto output_ptr = reinterpret_cast<OutputScalarType *>(output.ptr()); + const auto input1_ptr = + reinterpret_cast<const InputScalarType *>(input1.ptr()); + const auto input2_ptr = + reinterpret_cast<const InputScalarType *>(input2.ptr()); + + int x = (*neon_func)(window_start_x, window_end_x, window_step_x, + input1_ptr, input2_ptr, output_ptr); + for (; x < window_end_x; ++x) + { + const auto a = *(input1_ptr + x); + const auto b = *(input2_ptr + x); + *(output_ptr + x) = (*scalar_func)(a, b); + } + }, + input1, input2, output); + } +} + +} // namespace + +namespace arm_compute +{ + +float32x4x4_t load_quantized(const uint8_t *input1_ptr, const int32x4_t &offset, + const float32x4_t &scale) +{ + qasymm8x16_t x = vld1q_u8(input1_ptr); + const float32x4x4_t out = {{ + vmulq_f32( + vcvtq_f32_s32(vsubq_s32( + vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(x))))), offset)), + scale), + vmulq_f32( + vcvtq_f32_s32(vsubq_s32( + vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(x))))), offset)), + scale), + vmulq_f32( + vcvtq_f32_s32(vsubq_s32( + vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(x))))), offset)), + scale), + vmulq_f32( + vcvtq_f32_s32(vsubq_s32( + vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(x))))), offset)), + scale), + }}; + return out; +} + +void store_quantized(uint8_t *output_ptr, const float32x4x4_t &rf, const float32x4_t &offset, + const float32x4_t &invscale) +{ + int32x4x4_t out = {{ + vcvtq_s32_f32(vmlaq_f32(offset, rf.val[0], 
invscale)), + vcvtq_s32_f32(vmlaq_f32(offset, rf.val[1], invscale)), + vcvtq_s32_f32(vmlaq_f32(offset, rf.val[2], invscale)), + vcvtq_s32_f32(vmlaq_f32(offset, rf.val[3], invscale)), + }}; + store_quantized_int32(output_ptr, out); +} + +float32x4x4_t dup_quantized(uint8_t broadcast_value, int offset, float scale) +{ + const qasymm8x16_t broadcast_value_vec = vdupq_n_u8(broadcast_value); + const int32x4_t voffset = vdupq_n_s32(offset); + const float32x4_t vscale = vdupq_n_f32(scale); + + const float32x4x4_t broadcast_vector = {{ + vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16( + vmovl_u8(vget_low_u8(broadcast_value_vec))))), + voffset)), + vscale), + vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16( + vmovl_u8(vget_low_u8(broadcast_value_vec))))), + voffset)), + vscale), + vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16( + vmovl_u8(vget_high_u8(broadcast_value_vec))))), + voffset)), + vscale), + vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16( + vmovl_u8(vget_high_u8(broadcast_value_vec))))), + voffset)), + vscale), + }}; + return broadcast_vector; +} + +void elementwise_op_quantized( + const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, + uint8_t (*scalar_func)(const float &, const float &, QuantizationInfo), + int (*broadcast_func)(int, int, int, const uint8_t *, float32x4x4_t, uint8_t *, int32x4_t, + float32x4_t, float32x4_t, float32x4_t, const bool), + int (*neon_func)(int, int, int, const uint8_t *, const uint8_t *, uint8_t *, int32x4_t, + int32x4_t, float32x4_t, float32x4_t, float32x4_t, float32x4_t)) +{ + // Create input windows + Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()); + Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()); + + // Clear X Dimension on execution window as we handle manually + Window win = window; + 
win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + const int window_step_x = 16; + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + const bool is_broadcast_across_x = (input1_win.x().step() == 0) || (input2_win.x().step() == 0); + + const float output_scale = out->info()->quantization_info().scale; + const int output_offset = out->info()->quantization_info().offset; + + // Output quantization info (add 0.5 to round toward the nearest integer - 0.5 rounds away from + // zero) + const float32x4_t voffseto = vdupq_n_f32(output_offset + 0.5f); + const float32x4_t invvscaleo = vdupq_n_f32(1.f / output_scale); + + if (is_broadcast_across_x) + { + // Select the broadcast input on the X axis + const bool is_broadcast_input_2 = input2_win.x().step() == 0; + Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; + Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win; + const ITensor *broadcast_tensor = is_broadcast_input_2 ? in2 : in1; + const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? 
in2 : in1; + + const QuantizationInfo broadcast_qinfo = broadcast_tensor->info()->quantization_info(); + const QuantizationInfo non_broadcast_qinfo = non_broadcast_tensor->info()->quantization_info(); + + const int32x4_t voffset_non_broadcast = vdupq_n_s32(non_broadcast_qinfo.offset); + const float32x4_t vscale_non_broadcast = vdupq_n_f32(non_broadcast_qinfo.scale); + + // Clear X Dimension on execution window as we handle manually + non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator broadcast_input(broadcast_tensor, broadcast_win); + Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); + Iterator output(out, win); + + execute_window_loop( + win, + [&](const Coordinates &) { + const auto non_broadcast_input_ptr = + reinterpret_cast<const uint8_t *>(non_broadcast_input.ptr()); + const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr()); + + const uint8_t broadcast_value = *reinterpret_cast<const uint8_t *>(broadcast_input.ptr()); + const float32x4x4_t broadcast_vector = + dup_quantized(broadcast_value, broadcast_qinfo.offset, broadcast_qinfo.scale); + + int x = (*broadcast_func)(window_start_x, window_end_x, window_step_x, + non_broadcast_input_ptr, broadcast_vector, output_ptr, + voffset_non_broadcast, vscale_non_broadcast, voffseto, + invvscaleo, !is_broadcast_input_2); + for (; x < window_end_x; ++x) + { + const float afs = + scvt_f32_qasymm8(*(non_broadcast_input_ptr + x), non_broadcast_qinfo.scale, + non_broadcast_qinfo.offset); + const float bfs = + scvt_f32_qasymm8(broadcast_value, broadcast_qinfo.scale, broadcast_qinfo.offset); + *(output_ptr + x) = + (*scalar_func)(!is_broadcast_input_2 ? bfs : afs, !is_broadcast_input_2 ? 
afs : bfs, + out->info()->quantization_info()); + } + }, + broadcast_input, non_broadcast_input, output); + } + else + { + // Input1 quantization info + const int32x4_t voffset1 = vdupq_n_s32(in1->info()->quantization_info().offset); + const float32x4_t vscale1 = vdupq_n_f32(in1->info()->quantization_info().scale); + + // Input2 quantization info + const int32x4_t voffset2 = vdupq_n_s32(in2->info()->quantization_info().offset); + const float32x4_t vscale2 = vdupq_n_f32(in2->info()->quantization_info().scale); + + // Clear X Dimension on execution window as we handle manually + input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + const QuantizationInfo input1_qinfo = in1->info()->quantization_info(); + const QuantizationInfo input2_qinfo = in2->info()->quantization_info(); + + Iterator input1(in1, input1_win); + Iterator input2(in2, input2_win); + Iterator output(out, win); + + execute_window_loop( + win, + [&](const Coordinates &) { + const auto input1_ptr = reinterpret_cast<const uint8_t *>(input1.ptr()); + const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr()); + const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr()); + + int x = + (*neon_func)(window_start_x, window_end_x, window_step_x, input1_ptr, input2_ptr, + output_ptr, voffset1, voffset2, vscale1, vscale2, voffseto, invvscaleo); + for (; x < window_end_x; ++x) + { + const float afs = + scvt_f32_qasymm8(*(input1_ptr + x), input1_qinfo.scale, input1_qinfo.offset); + const float bfs = + scvt_f32_qasymm8(*(input2_ptr + x), input2_qinfo.scale, input2_qinfo.offset); + *(output_ptr + x) = (*scalar_func)(afs, bfs, out->info()->quantization_info()); + } + }, + input1, input2, output); + } +} + +void elementwise_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, + float (*scalar_func)(const float &, const float &), + int (*broadcast_func)(int, int, int, const float *, const float &, float *, + 
const bool), + int (*neon_func)(int, int, int, const float *, const float *, float *)) +{ + elementwise_op_templ<float, float, float32x4_t>(in1, in2, out, window, scalar_func, + broadcast_func, neon_func); +} + +void elementwise_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, + uint8_t (*scalar_func)(const uint8_t &, const uint8_t &), + int (*broadcast_func)(int, int, int, const uint8_t *, const uint8_t &, + uint8_t *, const bool), + int (*neon_func)(int, int, int, const uint8_t *, const uint8_t *, uint8_t *)) +{ + elementwise_op_templ<uint8_t, uint8_t, uint8x16_t>(in1, in2, out, window, scalar_func, + broadcast_func, neon_func); +} +} // namespace arm_compute |