From 28b2016e8983a8fc03b4db4c631c15c3a7ac819e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EC=9E=A5=EC=A7=80=EC=84=AD/On-Device=20Lab=28SR=29/Engine?= =?UTF-8?q?er/=EC=82=BC=EC=84=B1=EC=A0=84=EC=9E=90?= Date: Tue, 17 Sep 2019 20:47:55 +0900 Subject: Make to support ReduceSum op for acl neon (#7520) This commit makes to support ReduceSum op for acl neon Signed-off-by: jiseob.jang --- .../arm_compute/runtime/NEON/NEFunctionsEx.h | 1 + .../runtime/NEON/functions/NEReduceSum.h | 82 ++++++++++ .../src/runtime/NEON/functions/NEReduceSum.cpp | 165 +++++++++++++++++++++ .../neurun/backend/acl_neon/KernelGenerator.cc | 43 ++++++ runtimes/neurun/backend/acl_neon/KernelGenerator.h | 1 + runtimes/neurun/backend/acl_neon/ShapeFixer.cc | 2 + runtimes/neurun/backend/acl_neon/ShapeFixer.h | 1 + tests/nnapi/nnapi_gtest.skip.armv7l-linux.acl_neon | 1 - 8 files changed, 295 insertions(+), 1 deletion(-) create mode 100644 runtimes/libs/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReduceSum.h create mode 100644 runtimes/libs/ARMComputeEx/src/runtime/NEON/functions/NEReduceSum.cpp diff --git a/runtimes/libs/ARMComputeEx/arm_compute/runtime/NEON/NEFunctionsEx.h b/runtimes/libs/ARMComputeEx/arm_compute/runtime/NEON/NEFunctionsEx.h index fb5323d22..760853a1b 100644 --- a/runtimes/libs/ARMComputeEx/arm_compute/runtime/NEON/NEFunctionsEx.h +++ b/runtimes/libs/ARMComputeEx/arm_compute/runtime/NEON/NEFunctionsEx.h @@ -21,6 +21,7 @@ #include #include #include +#include #include #include diff --git a/runtimes/libs/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReduceSum.h b/runtimes/libs/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReduceSum.h new file mode 100644 index 000000000..c028ea658 --- /dev/null +++ b/runtimes/libs/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReduceSum.h @@ -0,0 +1,82 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2018-2019 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_NEON_REDUCE_SUM_H__ +#define __ARM_COMPUTE_NEON_REDUCE_SUM_H__ + +#include "arm_compute/runtime/IFunction.h" + +#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/runtime/MemoryGroup.h" +#include "arm_compute/runtime/NEON/functions/NEReductionOperation.h" +#include "arm_compute/runtime/NEON/functions/NEReshapeLayer.h" + +namespace arm_compute +{ +class ITensor; + +/** Basic function to perform reduce operation */ +class NEReduceSum : public IFunction +{ +public: + /** Constructor */ + NEReduceSum(std::shared_ptr memory_manager = nullptr); + /** Configure kernel + * + * @note Supported tensor rank: up to 4 + * + * @param[in] input Source tensor. Data type supported: QASYMM8/F16/F32 + * @param[in] reduction_axis Reduction axis vector. 
+ * @param[in] keep_dims If true, retains reduced dimensions with length 1. + * @param[out] output Destination tensor. Data type supported: Same as @p input + */ + void configure(ITensor *input, const Coordinates &reduction_axis, bool keep_dims, + ITensor *output); + + /** Static function to check if given info will lead to a valid configuration of @ref NEReduceSum + * + * @param[in] input Source tensor. Data type supported: QASYMM8/F16/F32 + * @param[in] reduction_axis Reduction axis vector. + * @param[in] keep_dims If true, retains reduced dimensions with length 1. + * @param[in] output Destination tensor. Data type supported: Same as @p input + * + * @return A status + */ + static Status validate(const ITensorInfo *input, const Coordinates &reduction_axis, + bool keep_dims, const ITensorInfo *output); + + // Inherited methods overridden: + void run() override; + +private: + MemoryGroup _memory_group; + std::vector _reduction_kernels; + std::vector _reduced_outs; + NEReshapeLayer _reshape; + unsigned int _reduction_ops; + bool _keep_dims; +}; +} // namespace arm_compute +#endif /* __ARM_COMPUTE_NEON_REDUCE_SUM_H__ */ diff --git a/runtimes/libs/ARMComputeEx/src/runtime/NEON/functions/NEReduceSum.cpp b/runtimes/libs/ARMComputeEx/src/runtime/NEON/functions/NEReduceSum.cpp new file mode 100644 index 000000000..3c18217ef --- /dev/null +++ b/runtimes/libs/ARMComputeEx/src/runtime/NEON/functions/NEReduceSum.cpp @@ -0,0 +1,165 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2018-2019 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/runtime/NEON/functions/NEReduceSum.h" + +#include "arm_compute/core/CPP/Validate.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" + +using namespace arm_compute; + +NEReduceSum::NEReduceSum(std::shared_ptr memory_manager) + : _memory_group(std::move(memory_manager)), _reduction_kernels(), _reduced_outs(), _reshape(), + _reduction_ops(), _keep_dims() +{ +} + +Status NEReduceSum::validate(const ITensorInfo *input, const Coordinates &reduction_axis, + bool keep_dims, const ITensorInfo *output) +{ + ARM_COMPUTE_UNUSED(keep_dims); + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input); + ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, + DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON(reduction_axis.num_dimensions() > input->num_dimensions()); + + TensorShape out_shape = input->tensor_shape(); + const unsigned int reduction_ops = reduction_axis.num_dimensions(); + const int input_dims = input->num_dimensions(); + Coordinates axis_local = reduction_axis; + + // Convert negative axis + for (unsigned int i = 0; i < reduction_ops; ++i) + { + axis_local[i] = wrap_around(axis_local[i], input_dims); + } + + std::sort(axis_local.begin(), axis_local.begin() + reduction_ops); + for (unsigned int i = 0; i < reduction_ops; ++i) + { + ARM_COMPUTE_RETURN_ERROR_ON(axis_local[i] > 3); + ARM_COMPUTE_RETURN_ERROR_ON(static_cast(axis_local[i]) > + input->num_dimensions() - 1); + if (output->total_size() > 0 && keep_dims) + { + ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(axis_local[i]) != 1); + } + if (keep_dims) + { + out_shape.set(axis_local[i], 1); + } + else + { + out_shape.remove_dimension(axis_local[i] - i); + } + } + const TensorInfo out_info = input->clone()->set_tensor_shape(out_shape); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &out_info); + + return Status{}; +} + +void NEReduceSum::configure(ITensor 
*input, const Coordinates &reduction_axis, bool keep_dims, + ITensor *output) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input); + + _reduction_ops = reduction_axis.num_dimensions(); + _reduction_kernels.resize(_reduction_ops); + _reduced_outs.resize(_reduction_ops - (keep_dims ? 1 : 0)); + _keep_dims = keep_dims; + + Coordinates axis_local = reduction_axis; + const int input_dims = input->info()->num_dimensions(); + const unsigned int reduction_ops = reduction_axis.num_dimensions(); + + // Convert negative axis + for (unsigned int i = 0; i < reduction_ops; ++i) + { + axis_local[i] = wrap_around(axis_local[i], input_dims); + } + + // Perform reduction for every axis + for (unsigned int i = 0; i < _reduction_ops; ++i) + { + TensorShape out_shape = + i == 0 ? input->info()->tensor_shape() : (&_reduced_outs[i - 1])->info()->tensor_shape(); + out_shape.set(axis_local[i], 1); + auto in = (i == 0) ? input : (&_reduced_outs[i - 1]); + + if (i == _reduction_ops - 1 && keep_dims) + { + _reduction_kernels[i].configure(in, output, axis_local[i], ReductionOperation::SUM); + } + else + { + _reduced_outs[i].allocator()->init(TensorInfo(out_shape, input->info()->num_channels(), + input->info()->data_type(), + input->info()->quantization_info()) + .set_data_layout(input->info()->data_layout())); + _memory_group.manage(&_reduced_outs[i]); + _reduction_kernels[i].configure(in, &_reduced_outs[i], axis_local[i], + ReductionOperation::SUM); + } + } + + // Allocate intermediate tensors + for (unsigned int i = 0; i < _reduction_ops - (keep_dims ? 
1 : 0); ++i) + { + _reduced_outs[i].allocator()->allocate(); + } + + // Configure reshape layer if we want to drop the dimensions + if (!keep_dims) + { + TensorShape out_shape = input->info()->tensor_shape(); + + // We have to sort the reduction axis vectors in order for remove_dimension + // to work properly + std::sort(axis_local.begin(), axis_local.begin() + _reduction_ops); + for (unsigned int i = 0; i < _reduction_ops; ++i) + { + out_shape.remove_dimension(axis_local[i] - i); + } + auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(out_shape)); + _reshape.configure(&_reduced_outs[_reduction_ops - 1], output); + } +} + +void NEReduceSum::run() +{ + MemoryGroupResourceScope scope_mg(_memory_group); + + for (unsigned int i = 0; i < _reduction_ops; ++i) + { + _reduction_kernels[i].run(); + } + + if (!_keep_dims) + { + _reshape.run(); + } +} diff --git a/runtimes/neurun/backend/acl_neon/KernelGenerator.cc b/runtimes/neurun/backend/acl_neon/KernelGenerator.cc index 0293b8368..a05fa8b30 100644 --- a/runtimes/neurun/backend/acl_neon/KernelGenerator.cc +++ b/runtimes/neurun/backend/acl_neon/KernelGenerator.cc @@ -1030,6 +1030,49 @@ void KernelGenerator::visit(const model::operation::PReLUNode &node) _execution_builder->append(std::move(acl_fn)); } +void KernelGenerator::visit(const model::operation::ReduceSumNode &node) +{ + const auto output_index{node.getOutputs().at(0)}; + const auto input_index{node.getInputs().at(model::operation::ReduceSumNode::Input::INPUT)}; + const auto axis_index{node.param().axis_index}; + + const auto axis_base = _ctx.at(axis_index).data().base(); + const auto axis_size = _ctx.at(axis_index).shape().num_elements(); + const auto input_rank = _ctx.at(input_index).shape().rank(); + + auto output_alloc = _tensor_builder->at(output_index).get(); + auto input_alloc = _tensor_builder->at(input_index).get(); + const auto frontend_layout = _current_subg_layout; + const auto backend_layout = input_alloc->layout(); + // The 
axis's data must exist as constant values + assert(axis_base != nullptr); + std::set axes; + for (size_t n = 0; n < axis_size; ++n) + { + int32_t axis_value = *(reinterpret_cast(axis_base) + n); + if (axis_value < 0) + { + axis_value += input_rank; + } + axes.insert(::neurun::backend::acl_common::ToARMComputeAxis(input_rank, axis_value, + frontend_layout, backend_layout) + .value()); + } + arm_compute::Coordinates fixed_axes; + for (const auto &a : axes) + { + fixed_axes.set(fixed_axes.num_dimensions(), a); + } + + auto fn = nnfw::cpp14::make_unique<::arm_compute::NEReduceSum>(); + + fn->configure(input_alloc->handle(), fixed_axes, false, output_alloc->handle()); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + void KernelGenerator::visit(const model::operation::ReLUNode &node) { const auto output_index{node.getOutputs().at(0)}; diff --git a/runtimes/neurun/backend/acl_neon/KernelGenerator.h b/runtimes/neurun/backend/acl_neon/KernelGenerator.h index 28ef565d4..64e8a6df1 100644 --- a/runtimes/neurun/backend/acl_neon/KernelGenerator.h +++ b/runtimes/neurun/backend/acl_neon/KernelGenerator.h @@ -57,6 +57,7 @@ public: void visit(const model::operation::MulNode &) override; void visit(const model::operation::PadNode &) override; void visit(const model::operation::PReLUNode &) override; + void visit(const model::operation::ReduceSumNode &) override; void visit(const model::operation::ReLUNode &) override; void visit(const model::operation::ReLU1Node &) override; void visit(const model::operation::ReLU6Node &) override; diff --git a/runtimes/neurun/backend/acl_neon/ShapeFixer.cc b/runtimes/neurun/backend/acl_neon/ShapeFixer.cc index f78b56681..da5287648 100644 --- a/runtimes/neurun/backend/acl_neon/ShapeFixer.cc +++ b/runtimes/neurun/backend/acl_neon/ShapeFixer.cc @@ -183,6 +183,8 @@ void ShapeFixer::visit(const model::operation::PReLUNode &node) } } +void ShapeFixer::visit(const model::operation::ReduceSumNode &) { 
/* DO NOTHING */} + void ShapeFixer::visit(const model::operation::ReLUNode &) { /* DO NOTHING */} void ShapeFixer::visit(const model::operation::ReLU1Node &) { /* DO NOTHING */} diff --git a/runtimes/neurun/backend/acl_neon/ShapeFixer.h b/runtimes/neurun/backend/acl_neon/ShapeFixer.h index 796ea398d..1d3466334 100644 --- a/runtimes/neurun/backend/acl_neon/ShapeFixer.h +++ b/runtimes/neurun/backend/acl_neon/ShapeFixer.h @@ -59,6 +59,7 @@ public: void visit(const model::operation::MulNode &) override; void visit(const model::operation::PadNode &) override; void visit(const model::operation::PReLUNode &) override; + void visit(const model::operation::ReduceSumNode &) override; void visit(const model::operation::ReLUNode &) override; void visit(const model::operation::ReLU1Node &) override; void visit(const model::operation::ReLU6Node &) override; diff --git a/tests/nnapi/nnapi_gtest.skip.armv7l-linux.acl_neon b/tests/nnapi/nnapi_gtest.skip.armv7l-linux.acl_neon index 255ef0b0f..aa425a064 100644 --- a/tests/nnapi/nnapi_gtest.skip.armv7l-linux.acl_neon +++ b/tests/nnapi/nnapi_gtest.skip.armv7l-linux.acl_neon @@ -18,7 +18,6 @@ GeneratedTests.space_to_batch* GeneratedTests.cast_ex* GeneratedTests.gather_ex* GeneratedTests.reduce_max_ex* -GeneratedTests.reduce_sum_ex* GeneratedTests.topk_v2* # Unexpected result GeneratedTests.pack* -- cgit v1.2.3