author | Chunseok Lee <chunseok.lee@samsung.com> | 2020-03-04 18:09:24 +0900
committer | Chunseok Lee <chunseok.lee@samsung.com> | 2020-03-04 18:09:24 +0900
commit | 302e6564a7a76109e1178207e44e45a58631c477 (patch)
tree | 6cc4bd95e5e438331fc2c53234af4ed0e0f3bc20 /runtime/neurun/backend/acl_neon
parent | bd11b24234d7d43dfe05a81c520aa01ffad06e42 (diff)
download | nnfw-upstream/1.1.0.tar.gz, nnfw-upstream/1.1.0.tar.bz2, nnfw-upstream/1.1.0.zip
Imported Upstream version 1.1.0 (tags: upstream/1.1.0, submit/tizen/20200304.094649, submit/tizen/20200304.093946, submit/tizen/20200304.092919, accepted/tizen/unified/20200305.051107)
Diffstat (limited to 'runtime/neurun/backend/acl_neon')
21 files changed, 3808 insertions, 0 deletions
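A note before the file-by-file diff: ConstantInitializer.cc in this import registers one lambda per constant operand in an `_init_map`, and each lambda later copies the operand's data into the backend (NEON) tensor, reversing element order where NEON expects it (see the BatchToSpaceND and SpaceToBatchND visitors). The following self-contained sketch illustrates that registration-and-deferred-copy pattern only; the type aliases and names here are illustrative placeholders, not the runtime's actual classes.

```cpp
#include <cstdint>
#include <functional>
#include <iostream>
#include <unordered_map>
#include <vector>

// Hypothetical stand-ins for ir::Operand data and a backend tensor, for illustration only.
using OperandIndex = int;
using ConstData = std::vector<int32_t>;
using Tensor = std::vector<int32_t>;
using Initializer = std::function<void(const ConstData &, Tensor &)>;

int main()
{
  std::unordered_map<OperandIndex, Initializer> init_map;

  // Mirrors the BatchToSpaceND/SpaceToBatchND case below: a rank-1 int32 constant
  // is written into the backend tensor in reverse element order.
  const OperandIndex block_size_index = 0;
  init_map[block_size_index] = [](const ConstData &src, Tensor &dst) {
    dst.resize(src.size());
    for (std::size_t i = 0; i < src.size(); ++i)
      dst[i] = src[src.size() - i - 1];
  };

  ConstData block_size{2, 3};
  Tensor neon_tensor;
  init_map[block_size_index](block_size, neon_tensor); // runs at tensor-preparation time

  for (int32_t v : neon_tensor)
    std::cout << v << ' '; // prints: 3 2
  std::cout << '\n';
}
```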
diff --git a/runtime/neurun/backend/acl_neon/Backend.h b/runtime/neurun/backend/acl_neon/Backend.h
new file mode 100644
index 000000000..2fcf66933
--- /dev/null
+++ b/runtime/neurun/backend/acl_neon/Backend.h
@@ -0,0 +1,65 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NEURUN_BACKEND_ACL_NEON_BACKEND_H__
+#define __NEURUN_BACKEND_ACL_NEON_BACKEND_H__
+
+#include <memory>
+#include <backend/Backend.h>
+#include <ir/Operands.h>
+
+#include "Config.h"
+#include "ConstantInitializer.h"
+#include "KernelGenerator.h"
+#include "ShapeFixer.h"
+#include "TensorManager.h"
+#include "TensorRegister.h"
+
+namespace neurun
+{
+namespace backend
+{
+namespace acl_neon
+{
+
+class Backend : public ::neurun::backend::Backend
+{
+public:
+  Backend() : _config{std::make_shared<Config>()} {}
+
+  std::shared_ptr<IConfig> config() const override { return _config; }
+
+  std::unique_ptr<BackendContext>
+  newContext(const ir::Operands &operands,
+             const std::shared_ptr<custom::IKernelBuilder> &) const override
+  {
+    auto tensor_builder = std::make_shared<TensorBuilder>(createTensorManager());
+    return std::unique_ptr<BackendContext>{new BackendContext{
+        this, tensor_builder, std::make_shared<ConstantInitializer>(operands, tensor_builder),
+        std::make_shared<KernelGenerator>(operands, tensor_builder),
+        std::make_shared<ShapeFixer>(operands, tensor_builder),
+        std::make_shared<TensorRegister>(operands, tensor_builder)}};
+  }
+
+private:
+  std::shared_ptr<IConfig> _config;
+};
+
+} // namespace acl_neon
+} // namespace backend
+} // namespace neurun
+
+#endif // __NEURUN_BACKEND_ACL_NEON_BACKEND_H__
diff --git a/runtime/neurun/backend/acl_neon/CMakeLists.txt b/runtime/neurun/backend/acl_neon/CMakeLists.txt
new file mode 100644
index 000000000..061246d36
--- /dev/null
+++ b/runtime/neurun/backend/acl_neon/CMakeLists.txt
@@ -0,0 +1,21 @@
+# Unsupported architecture
+nnas_find_package(ARMCompute QUIET)
+if(NOT ARMCompute_FOUND)
+  return()
+endif(NOT ARMCompute_FOUND)
+
+set(LIB_NEURUN_BACKEND_ACL_NEON neurun_backend_acl_neon)
+
+file(GLOB_RECURSE SOURCES "*.cc")
+
+add_library(${LIB_NEURUN_BACKEND_ACL_NEON} SHARED ${SOURCES})
+
+target_include_directories(${LIB_NEURUN_BACKEND_ACL_NEON} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR})
+target_link_libraries(${LIB_NEURUN_BACKEND_ACL_NEON} PRIVATE neurun_core)
+target_link_libraries(${LIB_NEURUN_BACKEND_ACL_NEON} PRIVATE ${LIB_NEURUN_BACKEND_ACL_COMMON})
+target_link_libraries(${LIB_NEURUN_BACKEND_ACL_NEON} PRIVATE nnfw_common)
+target_link_libraries(${LIB_NEURUN_BACKEND_ACL_NEON} PRIVATE nnfw_coverage)
+
+set_target_properties(${LIB_NEURUN_BACKEND_ACL_NEON} PROPERTIES OUTPUT_NAME backend_acl_neon)
+
+install(TARGETS ${LIB_NEURUN_BACKEND_ACL_NEON} DESTINATION lib)
diff --git a/runtime/neurun/backend/acl_neon/Config.cc b/runtime/neurun/backend/acl_neon/Config.cc
new file mode 100644
index 000000000..352bc0b41
--- /dev/null
+++ b/runtime/neurun/backend/acl_neon/Config.cc
@@ -0,0 +1,30 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Config.h"
+
+namespace neurun
+{
+namespace backend
+{
+namespace acl_neon
+{
+
+bool Config::initialize() { return true; }
+
+} // namespace acl_neon
+} // namespace backend
+} // namespace neurun
diff --git a/runtime/neurun/backend/acl_neon/Config.h b/runtime/neurun/backend/acl_neon/Config.h
new file mode 100644
index 000000000..430c194ee
--- /dev/null
+++ b/runtime/neurun/backend/acl_neon/Config.h
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NEURUN_BACKEND_ACL_NEON_CONFIG_H__
+#define __NEURUN_BACKEND_ACL_NEON_CONFIG_H__
+
+#include <backend/IConfig.h>
+#include <cpp14/memory.h>
+#include <util/ITimer.h>
+
+namespace neurun
+{
+namespace backend
+{
+namespace acl_neon
+{
+
+class Config : public IConfig
+{
+public:
+  std::string id() override { return "acl_neon"; }
+  bool initialize() override;
+  bool SupportPermutation() override { return true; }
+  bool SupportSubTensorAlloc() override { return true; }
+
+  std::unique_ptr<util::ITimer> timer() override
+  {
+    return nnfw::cpp14::make_unique<util::CPUTimer>();
+  }
+};
+
+} // namespace acl_neon
+} // namespace backend
+} // namespace neurun
+
+#endif // __NEURUN_BACKEND_ACL_NEON_CONFIG_H__
diff --git a/runtime/neurun/backend/acl_neon/ConstantInitializer.cc b/runtime/neurun/backend/acl_neon/ConstantInitializer.cc
new file mode 100644
index 000000000..9a74bda29
--- /dev/null
+++ b/runtime/neurun/backend/acl_neon/ConstantInitializer.cc
@@ -0,0 +1,246 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +#include "ConstantInitializer.h" + +namespace neurun +{ +namespace backend +{ +namespace acl_neon +{ + +ConstantInitializer::ConstantInitializer(const ir::Operands &operands, + const std::shared_ptr<TensorBuilder> &tensor_builder) + : _operands{operands}, _tensor_builder{tensor_builder} +{ + // DO NOTHING +} + +void ConstantInitializer::visit(const ir::operation::BatchToSpaceND &node) +{ + const auto &block_size_index = node.getInputs().at(ir::operation::BatchToSpaceND::BLOCK_SIZE); + const auto &block_size_obj = _operands.at(block_size_index); + + if (block_size_obj.isConstant()) + { + _init_map[block_size_index] = [](const ir::Operand &model_obj, backend::operand::ITensor &obj) { + const auto &shape = model_obj.shape(); + const auto base = reinterpret_cast<const int32_t *>(model_obj.data().base()); + assert(model_obj.shape().rank() == 1); + obj.access([&](::neurun::backend::operand::ITensor &tensor) { + for (size_t i = 0; i < shape.num_elements(); ++i) + { + const int32_t value = base[shape.num_elements() - i - 1]; + int32_t *into = reinterpret_cast<int32_t *>(tensor.buffer() + + tensor.calcOffset({static_cast<int32_t>(i)})); + *into = value; + } + }); + }; + } +} + +void ConstantInitializer::visit(const ir::operation::Conv2D &node) +{ + const auto &kernel_index = node.getInputs().at(ir::operation::Conv2D::KERNEL); + const auto &kernel_obj = _operands.at(kernel_index); + registerPermuteInitializer(kernel_index, kernel_obj); + + const auto &bias_index = node.getInputs().at(ir::operation::Conv2D::BIAS); + const auto &bias_obj = _operands.at(bias_index); + registerCopyInitializer(bias_index, bias_obj); +} + +void ConstantInitializer::visit(const ir::operation::DepthwiseConv2D &node) +{ + const auto &kernel_index = node.getInputs().at(ir::operation::DepthwiseConv2D::KERNEL); + const auto &kernel_obj = _operands.at(kernel_index); + registerPermuteInitializer(kernel_index, kernel_obj); + + const auto &bias_index = node.getInputs().at(ir::operation::DepthwiseConv2D::BIAS); + const auto &bias_obj = _operands.at(bias_index); + registerCopyInitializer(bias_index, bias_obj); +} + +void ConstantInitializer::visit(const ir::operation::FullyConnected &node) +{ + const auto &weight_index = node.getInputs().at(ir::operation::FullyConnected::WEIGHT); + const auto &weight_obj = _operands.at(weight_index); + registerCopyInitializer(weight_index, weight_obj); + + const auto &bias_index = node.getInputs().at(ir::operation::FullyConnected::BIAS); + const auto &bias_obj = _operands.at(bias_index); + registerCopyInitializer(bias_index, bias_obj); +} + +void ConstantInitializer::visit(const ir::operation::LSTM &node) +{ + const auto &input_to_input_weights_index = + node.getInputs().at(ir::operation::LSTM::INPUT_TO_INPUT_WEIGHTS); + const auto &input_to_input_weights_obj = _operands.at(input_to_input_weights_index); + registerCopyInitializer(input_to_input_weights_index, input_to_input_weights_obj); + + const auto &input_to_forget_weights_index = + node.getInputs().at(ir::operation::LSTM::INPUT_TO_FORGET_WEIGHTS); + const auto &input_to_forget_weights_obj = _operands.at(input_to_forget_weights_index); + registerCopyInitializer(input_to_forget_weights_index, input_to_forget_weights_obj); + + const auto &input_to_cell_weights_index = + node.getInputs().at(ir::operation::LSTM::INPUT_TO_CELL_WEIGHTS); + const auto &input_to_cell_weights_obj = _operands.at(input_to_cell_weights_index); + registerCopyInitializer(input_to_cell_weights_index, input_to_cell_weights_obj); + + const auto 
&input_to_output_weights_index = + node.getInputs().at(ir::operation::LSTM::INPUT_TO_OUTPUT_WEIGHTS); + const auto &input_to_output_weights_obj = _operands.at(input_to_output_weights_index); + registerCopyInitializer(input_to_output_weights_index, input_to_output_weights_obj); + + const auto &recurrent_to_input_weights_index = + node.getInputs().at(ir::operation::LSTM::RECURRENT_TO_INPUT_WEIGHTS); + const auto &recurrent_to_input_weights_obj = _operands.at(recurrent_to_input_weights_index); + registerCopyInitializer(recurrent_to_input_weights_index, recurrent_to_input_weights_obj); + + const auto &recurrent_to_forget_weights_index = + node.getInputs().at(ir::operation::LSTM::RECURRENT_TO_FORGET_WEIGHTS); + const auto &recurrent_to_forget_weights_obj = _operands.at(recurrent_to_forget_weights_index); + registerCopyInitializer(recurrent_to_forget_weights_index, recurrent_to_forget_weights_obj); + + const auto &recurrent_to_cell_weights_index = + node.getInputs().at(ir::operation::LSTM::RECURRENT_TO_CELL_WEIGHTS); + const auto &recurrent_to_cell_weights_obj = _operands.at(recurrent_to_cell_weights_index); + registerCopyInitializer(recurrent_to_cell_weights_index, recurrent_to_cell_weights_obj); + + const auto &recurrent_to_output_weights_index = + node.getInputs().at(ir::operation::LSTM::RECURRENT_TO_OUTPUT_WEIGHTS); + const auto &recurrent_to_output_weights_obj = _operands.at(recurrent_to_output_weights_index); + registerCopyInitializer(recurrent_to_output_weights_index, recurrent_to_output_weights_obj); + + const auto &cell_to_input_weights_index = + node.getInputs().at(ir::operation::LSTM::CELL_TO_INPUT_WEIGHTS); + const auto &cell_to_input_weights_obj = _operands.at(cell_to_input_weights_index); + registerCopyInitializer(cell_to_input_weights_index, cell_to_input_weights_obj); + + const auto &cell_to_forget_weights_index = + node.getInputs().at(ir::operation::LSTM::CELL_TO_FORGET_WEIGHTS); + const auto &cell_to_forget_weights_obj = _operands.at(cell_to_forget_weights_index); + registerCopyInitializer(cell_to_forget_weights_index, cell_to_forget_weights_obj); + + const auto &cell_to_output_weights_index = + node.getInputs().at(ir::operation::LSTM::CELL_TO_OUTPUT_WEIGHTS); + const auto &cell_to_output_weights_obj = _operands.at(cell_to_output_weights_index); + registerCopyInitializer(cell_to_output_weights_index, cell_to_output_weights_obj); + + const auto &input_gate_bias_index = node.getInputs().at(ir::operation::LSTM::INPUT_GATE_BIAS); + const auto &input_gate_bias_obj = _operands.at(input_gate_bias_index); + registerCopyInitializer(input_gate_bias_index, input_gate_bias_obj); + + const auto &forget_gate_bias_index = node.getInputs().at(ir::operation::LSTM::FORGET_GATE_BIAS); + const auto &forget_gate_bias_obj = _operands.at(forget_gate_bias_index); + registerCopyInitializer(forget_gate_bias_index, forget_gate_bias_obj); + + const auto &output_gate_bias_index = node.getInputs().at(ir::operation::LSTM::OUTPUT_GATE_BIAS); + const auto &output_gate_bias_obj = _operands.at(output_gate_bias_index); + registerCopyInitializer(output_gate_bias_index, output_gate_bias_obj); + + const auto &projection_weights_index = + node.getInputs().at(ir::operation::LSTM::PROJECTION_WEIGHTS); + const auto &projection_weights_obj = _operands.at(projection_weights_index); + registerCopyInitializer(projection_weights_index, projection_weights_obj); + + const auto &projection_bias_index = node.getInputs().at(ir::operation::LSTM::PROJECTION_BIAS); + const auto &projection_bias_obj = 
_operands.at(projection_bias_index); + registerCopyInitializer(projection_bias_index, projection_bias_obj); +} + +void ConstantInitializer::visit(const ir::operation::RNN &node) +{ + const auto &weights_index = node.getInputs().at(ir::operation::RNN::WEIGHTS); + const auto &weights_obj = _operands.at(weights_index); + registerCopyInitializer(weights_index, weights_obj); + + const auto &recurrent_weights_index = node.getInputs().at(ir::operation::RNN::RECURRENT_WEIGHTS); + const auto &recurrent_weights_obj = _operands.at(recurrent_weights_index); + registerCopyInitializer(recurrent_weights_index, recurrent_weights_obj); + + const auto &bias_index = node.getInputs().at(ir::operation::RNN::BIAS); + const auto &bias_obj = _operands.at(bias_index); + registerCopyInitializer(bias_index, bias_obj); +} + +void ConstantInitializer::visit(const ir::operation::SpaceToBatchND &node) +{ + const auto &block_size_index = node.getInputs().at(ir::operation::SpaceToBatchND::BLOCK_SIZE); + const auto &block_size_obj = _operands.at(block_size_index); + + if (block_size_obj.isConstant()) + { + _init_map[block_size_index] = [](const ir::Operand &model_obj, backend::operand::ITensor &obj) { + const auto &shape = model_obj.shape(); + const auto base = reinterpret_cast<const int32_t *>(model_obj.data().base()); + assert(model_obj.shape().rank() == 1); + obj.access([&](::neurun::backend::operand::ITensor &tensor) { + for (size_t i = 0; i < shape.num_elements(); ++i) + { + const int32_t value = base[shape.num_elements() - i - 1]; + int32_t *into = reinterpret_cast<int32_t *>(tensor.buffer() + + tensor.calcOffset({static_cast<int32_t>(i)})); + *into = value; + } + }); + }; + } + + const auto &paddings_index = node.getInputs().at(ir::operation::SpaceToBatchND::PADDINGS); + const auto &paddings_obj = _operands.at(paddings_index); + if (paddings_obj.isConstant()) + { + _init_map[paddings_index] = [](const ir::Operand &model_obj, backend::operand::ITensor &obj) { + const auto &shape = model_obj.shape(); + const auto base = reinterpret_cast<const int32_t *>(model_obj.data().base()); + assert(model_obj.shape().rank() == 2); + assert(shape.dim(0) == 2); + assert(shape.dim(1) == 2); + obj.access([&](::neurun::backend::operand::ITensor &tensor) { + for (auto i = 0; i < shape.dim(0); ++i) + { + for (auto j = 0; j < shape.dim(1); ++j) + { + const int32_t value = base[i * 2 + j]; + int32_t *into = reinterpret_cast<int32_t *>( + // The coordinates of NETensor are different from the coordiantes of CLTensor in + // this operand. + // NEON : {j, reversed i} + // CL : {reversed i, j} + tensor.buffer() + tensor.calcOffset({j, shape.dim(0) - i - 1})); + *into = value; + } + } + }); + }; + } +} + +void ConstantInitializer::visit(const ir::operation::TransposeConv &node) +{ + const auto &kernel_index = node.getInputs().at(ir::operation::TransposeConv::KERNEL); + const auto &kernel_obj = _operands.at(kernel_index); + registerPermuteInitializer(kernel_index, kernel_obj); +} + +} // namespace acl_neon +} // namespace backend +} // namespace neurun diff --git a/runtime/neurun/backend/acl_neon/ConstantInitializer.h b/runtime/neurun/backend/acl_neon/ConstantInitializer.h new file mode 100644 index 000000000..0f2b2d05b --- /dev/null +++ b/runtime/neurun/backend/acl_neon/ConstantInitializer.h @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __NEURUN_COMPILER_ACL_NEON_CONSTANT_INITIALIZER_H__ +#define __NEURUN_COMPILER_ACL_NEON_CONSTANT_INITIALIZER_H__ + +#include <backend/IConstantInitializer.h> +#include <ir/Operands.h> +#include "TensorBuilder.h" + +namespace neurun +{ +namespace backend +{ +namespace acl_neon +{ + +class ConstantInitializer : public IConstantInitializer +{ +public: + ConstantInitializer(const ir::Operands &operands, + const std::shared_ptr<TensorBuilder> &tensor_builder); + +public: + void visit(const ir::operation::BatchToSpaceND &) override; + void visit(const ir::operation::Conv2D &) override; + void visit(const ir::operation::DepthwiseConv2D &) override; + void visit(const ir::operation::FullyConnected &) override; + void visit(const ir::operation::LSTM &) override; + void visit(const ir::operation::RNN &) override; + void visit(const ir::operation::SpaceToBatchND &) override; + void visit(const ir::operation::TransposeConv &) override; + +private: + const ir::Operands &operands() const override { return _operands; } + std::shared_ptr<ITensorBuilder> tensor_builder() const override { return _tensor_builder; } + +private: + const ir::Operands &_operands; + std::shared_ptr<TensorBuilder> _tensor_builder; +}; + +} // namespace acl_neon +} // namespace backend +} // namespace neurun + +#endif // __NEURUN_COMPILER_ACL_NEON_CONSTANT_INITIALIZER_H__ diff --git a/runtime/neurun/backend/acl_neon/KernelGenerator.cc b/runtime/neurun/backend/acl_neon/KernelGenerator.cc new file mode 100644 index 000000000..85c6a0633 --- /dev/null +++ b/runtime/neurun/backend/acl_neon/KernelGenerator.cc @@ -0,0 +1,2152 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "KernelGenerator.h" + +#include <arm_compute/runtime/NEON/NEFunctions.h> // Include all ARM Compute NEON functions +#include <arm_compute/runtime/NEON/NEFunctionsEx.h> // Include all ARM Compute EX NEON functions + +#include <Convert.h> +#include <Swizzle.h> + +#include "util/Padding.h" +#include "ir/Index.h" +#include "ir/DataType.h" +#include "ir/InternalType.h" +#include "compiler/IExecutionBuilder.h" +#include "exec/NopFunction.h" +#include "util/logging.h" +#include "util/Utils.h" + +using ::neurun::compiler::IExecutionBuilder; + +namespace neurun +{ +namespace backend +{ +namespace acl_neon +{ + +using ::neurun::backend::acl_common::asAclFunction; + +// +// ActivationBuilder +// +class ActivationBuilder +{ +public: + ActivationBuilder(IExecutionBuilder &builder) : _builder(builder) + { + // DO NOTHING + } + +private: + void appendReLU(::arm_compute::ITensor *ifm_alloc); + void appendReLU1(::arm_compute::ITensor *ifm_alloc); + void appendReLU6(::arm_compute::ITensor *ifm_alloc); + +public: + void append(ir::Activation act, ::arm_compute::ITensor *ifm_alloc); + +private: + IExecutionBuilder &_builder; +}; + +void ActivationBuilder::appendReLU(::arm_compute::ITensor *ifm_alloc) +{ + const ::arm_compute::ActivationLayerInfo act_info{ + ::arm_compute::ActivationLayerInfo::ActivationFunction::RELU}; + + auto fn = nnfw::cpp14::make_unique<::arm_compute::NEActivationLayer>(); + + fn->configure(ifm_alloc, nullptr, act_info); + + auto acl_fn = asAclFunction(std::move(fn)); + + _builder.append(std::move(acl_fn)); +} + +void ActivationBuilder::appendReLU1(::arm_compute::ITensor *ifm_alloc) +{ + const ::arm_compute::ActivationLayerInfo act_info{ + ::arm_compute::ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 1.0f, -1.0f}; + + auto fn = nnfw::cpp14::make_unique<::arm_compute::NEActivationLayer>(); + + fn->configure(ifm_alloc, nullptr, act_info); + + auto acl_fn = asAclFunction(std::move(fn)); + + _builder.append(std::move(acl_fn)); +} + +void ActivationBuilder::appendReLU6(::arm_compute::ITensor *ifm_alloc) +{ + const ::arm_compute::ActivationLayerInfo act_info{ + ::arm_compute::ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 6.0f, 0.0f}; + + auto fn = nnfw::cpp14::make_unique<::arm_compute::NEActivationLayer>(); + + fn->configure(ifm_alloc, nullptr, act_info); + + auto acl_fn = asAclFunction(std::move(fn)); + + _builder.append(std::move(acl_fn)); +} + +void ActivationBuilder::append(ir::Activation act, ::arm_compute::ITensor *ifm_alloc) +{ + switch (act) + { + case ir::Activation::NONE: + { + // DO NOTHING + break; + } + case ir::Activation::RELU: + { + appendReLU(ifm_alloc); + break; + } + case ir::Activation::RELU1: + { + appendReLU1(ifm_alloc); + break; + } + case ir::Activation::RELU6: + { + appendReLU6(ifm_alloc); + break; + } + default: + { + throw std::runtime_error("Not supported, yet"); + } + } +} + +// +// KernelGenerator +// +KernelGenerator::KernelGenerator(const ir::Operands &ctx, + const std::shared_ptr<TensorBuilder> &tensor_builder) + : _ctx(ctx), _tensor_builder(tensor_builder), _current_subg_layout(ir::Layout::UNKNOWN) +{ + // DO NOTHING +} + +void KernelGenerator::visit(const ir::OpSequence &op_seq) +{ + _current_subg_layout = op_seq.getLayout(); + for (const auto &e : op_seq.operations()) + { + const auto &node = *(e.node); + _tensor_builder->preVisit(node); + node.accept(*this); + _tensor_builder->postVisit(node); + } +} + +void KernelGenerator::visit(const ir::operation::Abs &node) +{ + const auto output_index{node.getOutputs().at(0)}; + 
const auto input_index{node.getInputs().at(ir::operation::Abs::Input::INPUT)}; + + auto output_alloc = _tensor_builder->at(output_index).get(); + auto input_alloc = _tensor_builder->at(input_index).get(); + + const ::arm_compute::ActivationLayerInfo act_info{ + ::arm_compute::ActivationLayerInfo::ActivationFunction::ABS}; + + auto fn = nnfw::cpp14::make_unique<::arm_compute::NEActivationLayer>(); + + fn->configure(input_alloc->handle(), output_alloc->handle(), act_info); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const ir::operation::ArgMax &node) +{ + const auto ofm_index{node.getOutputs().at(0)}; + const auto ifm_index{node.getInputs().at(ir::operation::ArgMax::Input::INPUT)}; + + const auto ifm_rank = node.param().rank; + + auto ofm_alloc = _tensor_builder->at(ofm_index).get(); + auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + auto frontend_layout = _current_subg_layout; + auto backend_layout = ifm_alloc->layout(); + + int axis_value = node.param().axis; + if (axis_value < 0) + { + axis_value += ifm_rank; + } + assert(axis_value >= 0 && axis_value < ifm_rank); + const auto fixed_axis = + acl_common::ToARMComputeAxis(ifm_rank, axis_value, frontend_layout, backend_layout).value(); + + // auto fn = nnfw::cpp14::make_unique<::arm_compute::NEArgMinMaxLayer>(); + auto fn = nnfw::cpp14::make_unique<::arm_compute::NEArgMax>(); + + // NOTE + // if (ofm_alloc->info()->data_type() == arm_compute::DataType::S32) + //{ + ofm_alloc->info()->set_data_type(arm_compute::DataType::U32); + //} + fn->configure(ifm_alloc->handle(), fixed_axis, ofm_alloc->handle()); + // fn->configure(ifm_alloc->handle(), fixed_axis, ofm_alloc->handle(), + // arm_compute::ReductionOperation::ARG_IDX_MAX); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const ir::operation::BatchToSpaceND &node) +{ + const auto ofm_index{node.getOutputs().at(0)}; + const auto ifm_index{node.getInputs().at(ir::operation::BatchToSpaceND::Input::INPUT)}; + const auto block_size_index{ + node.getInputs().at(ir::operation::BatchToSpaceND::Input::BLOCK_SIZE)}; + + auto ofm_alloc = _tensor_builder->at(ofm_index).get(); + auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + auto block_size_alloc = _tensor_builder->at(block_size_index).get(); + + assert(_ctx.at(block_size_index).isConstant()); + + auto fn = nnfw::cpp14::make_unique<::arm_compute::NEBatchToSpaceLayer>(); + + fn->configure(ifm_alloc->handle(), block_size_alloc->handle(), ofm_alloc->handle()); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const ir::operation::Cast &node) +{ + const auto ofm_index{node.getOutputs().at(0)}; + const auto ifm_index{node.getInputs().at(ir::operation::Cast::Input::INPUT)}; + + auto ofm_alloc = _tensor_builder->at(ofm_index).get(); + auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + + auto fn = nnfw::cpp14::make_unique<::arm_compute::NECast>(); + + auto input_sub_type = _ctx.at(ifm_index).typeInfo().type() == ir::DataType::BOOL8 + ? 
arm_compute::SubDataType::BOOL + : arm_compute::SubDataType::NONE; + fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), input_sub_type); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const ir::operation::Conv2D &node) +{ + using ir::operation::Conv2D; + + const auto ofm_index{node.getOutputs().at(0)}; + const auto ifm_index{node.getInputs().at(Conv2D::Input::INPUT)}; + const auto ker_index{node.getInputs().at(Conv2D::Input::KERNEL)}; + const auto bias_index{node.getInputs().at(Conv2D::Input::BIAS)}; + + const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature(_current_subg_layout); + const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature(_current_subg_layout); + // Kernel format is [depth_out, kernel_height, kernel_width, depth_in]. + const auto &ker_shape = _ctx.at(ker_index).shape(); + const auto ker_height = ker_shape.dim(1); + const auto ker_width = ker_shape.dim(2); + + const auto stride = node.param().stride; + const auto padding = neurun::util::calculatePadding(node.param().padding, ifm_shape, ofm_shape, + stride, ker_width, ker_height); + const auto activation = node.param().activation; + + auto ofm_alloc = _tensor_builder->at(ofm_index).get(); + auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + auto ker_alloc = _tensor_builder->at(ker_index).get(); + auto bias_alloc = _tensor_builder->at(bias_index).get(); + + const auto conv_info = acl_common::asPadStrideInfo(padding, stride); + const auto act_info = acl_common::asActivationLayerInfo(activation); + + auto fn = nnfw::cpp14::make_unique<::arm_compute::NEConvolutionLayer>( + _tensor_builder->acl_tensor_manager()->internal_buffer_manager()); + + fn->configure(ifm_alloc->handle(), ker_alloc->handle(), bias_alloc->handle(), ofm_alloc->handle(), + conv_info, ::arm_compute::WeightsInfo(), ::arm_compute::Size2D(1U, 1U), act_info); + + _execution_builder->append(asAclFunction(std::move(fn))); +} + +void KernelGenerator::visit(const ir::operation::DepthToSpace &node) +{ + const auto output_index{node.getOutputs().at(0)}; + const auto input_index{node.getInputs().at(ir::operation::DepthToSpace::Input::INPUT)}; + + auto block_size = node.param().block_size; + assert(block_size > 0); + + auto output_alloc = _tensor_builder->at(output_index).get(); + auto input_alloc = _tensor_builder->at(input_index).get(); + + auto fn = nnfw::cpp14::make_unique<::arm_compute::NEDepthToSpaceLayerEx>(); + + fn->configure(input_alloc->handle(), output_alloc->handle(), block_size); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const ir::operation::DepthwiseConv2D &node) +{ + using ir::operation::DepthwiseConv2D; + + const auto ofm_index{node.getOutputs().at(0)}; + const auto ifm_index{node.getInputs().at(DepthwiseConv2D::Input::INPUT)}; + const auto ker_index{node.getInputs().at(DepthwiseConv2D::Input::KERNEL)}; + const auto bias_index{node.getInputs().at(DepthwiseConv2D::Input::BIAS)}; + + const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature(_current_subg_layout); + const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature(_current_subg_layout); + // Kernel format is [1, kernel_height, kernel_width, depth_out]. 
+ const auto &ker_shape = _ctx.at(ker_index).shape(); + const auto ker_height = ker_shape.dim(1); + const auto ker_width = ker_shape.dim(2); + + const auto stride = node.param().stride; + const auto padding = neurun::util::calculatePadding(node.param().padding, ifm_shape, ofm_shape, + stride, ker_width, ker_height); + const auto multiplier = node.param().multiplier; + const auto activation = node.param().activation; + + auto ofm_alloc = _tensor_builder->at(ofm_index).get(); + auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + auto ker_alloc = _tensor_builder->at(ker_index).get(); + auto bias_alloc = _tensor_builder->at(bias_index).get(); + + const auto conv_info = acl_common::asPadStrideInfo(padding, stride); + const auto act_info = acl_common::asActivationLayerInfo(activation); + + if (ker_height == 3 && ker_width == 3) + { + auto fn = nnfw::cpp14::make_unique<::arm_compute::NEDepthwiseConvolutionLayer3x3>(); + + fn->configure(ifm_alloc->handle(), ker_alloc->handle(), bias_alloc->handle(), + ofm_alloc->handle(), conv_info, multiplier, act_info); + + _execution_builder->append(asAclFunction(std::move(fn))); + } + else + { + auto fn = nnfw::cpp14::make_unique<::arm_compute::NEDepthwiseConvolutionLayer>(); + + fn->configure(ifm_alloc->handle(), ker_alloc->handle(), bias_alloc->handle(), + ofm_alloc->handle(), conv_info, multiplier, act_info); + + _execution_builder->append(asAclFunction(std::move(fn))); + } +} + +void KernelGenerator::visit(const ir::operation::Dequantize &node) +{ + const auto output_index{node.getOutputs().at(0)}; + const auto input_index{node.getInputs().at(ir::operation::Dequantize::Input::INPUT)}; + + auto output_alloc = _tensor_builder->at(output_index).get(); + auto input_alloc = _tensor_builder->at(input_index).get(); + + auto fn = nnfw::cpp14::make_unique<::arm_compute::NEDequantizationLayer>(); + + fn->configure(input_alloc->handle(), output_alloc->handle()); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const ir::operation::MaxPool2D &node) +{ + const auto ofm_index{node.getOutputs().at(0)}; + const auto ifm_index{node.getInputs().at(ir::operation::MaxPool2D::Input::INPUT)}; + + const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature(_current_subg_layout); + const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature(_current_subg_layout); + + const auto kh = node.param().kh; + const auto kw = node.param().kw; + const auto stride = node.param().stride; + const auto padding = + neurun::util::calculatePadding(node.param().padding, ifm_shape, ofm_shape, stride, kw, kh); + const auto activation = node.param().activation; + + VERBOSE(MaxPool2D) << "IFM_H: " << ifm_shape.H << std::endl; + VERBOSE(MaxPool2D) << "IFM_W: " << ifm_shape.W << std::endl; + VERBOSE(MaxPool2D) << "OFM_H: " << ofm_shape.H << std::endl; + VERBOSE(MaxPool2D) << "OFM_W: " << ofm_shape.W << std::endl; + VERBOSE(MaxPool2D) << "KER_H: " << kh << std::endl; + VERBOSE(MaxPool2D) << "KER_W: " << kw << std::endl; + VERBOSE(MaxPool2D) << "STRIDE_H: " << stride.vertical << std::endl; + VERBOSE(MaxPool2D) << "STRIDE_W: " << stride.horizontal << std::endl; + VERBOSE(MaxPool2D) << "PAD(T): " << padding.top << std::endl; + VERBOSE(MaxPool2D) << "PAD(B): " << padding.bottom << std::endl; + VERBOSE(MaxPool2D) << "PAD(L): " << padding.left << std::endl; + VERBOSE(MaxPool2D) << "PAD(R): " << padding.right << std::endl; + + auto ofm_alloc = _tensor_builder->at(ofm_index).get(); + auto ifm_alloc = 
_tensor_builder->at(ifm_index).get(); + + ::arm_compute::PoolingLayerInfo info{::arm_compute::PoolingType::MAX, + ::arm_compute::Size2D{kw, kh}, + acl_common::asPadStrideInfo(padding, stride)}; + + auto fn = nnfw::cpp14::make_unique<::arm_compute::NEPoolingLayer>(); + + fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), info); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append((std::move(acl_fn))); + + ActivationBuilder{*_execution_builder}.append(activation, ofm_alloc->handle()); +} + +void KernelGenerator::visit(const ir::operation::Mean &node) +{ + const auto ofm_index{node.getOutputs().at(0)}; + const auto ifm_index{node.getInputs().at(ir::operation::Mean::Input::INPUT)}; + const auto &axes{node.param().axes}; + const auto keep_dims{node.param().keep_dims}; + + auto ofm_alloc = _tensor_builder->at(ofm_index).get(); + auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + const auto frontend_layout = _current_subg_layout; + const auto backend_layout = ifm_alloc->layout(); + + // Convert to ACL axes taking into account negative values and possible duplicates. + std::set<std::uint32_t> acl_axes; + const int ifm_rank = node.param().rank; + for (int axis : axes) + { + if (axis < 0) + axis += ifm_rank; + acl_axes.insert( + acl_common::ToARMComputeAxis(ifm_rank, axis, frontend_layout, backend_layout).value()); + } + + arm_compute::Coordinates fixed_axis; + for (const auto axis : acl_axes) + { + fixed_axis.set(fixed_axis.num_dimensions(), axis); + } + + // NOTE NEReduceMean has a bug that does not support NHWC layout + // NEReduceMean intermediate tensors are always NCHW layout + auto fn = nnfw::cpp14::make_unique<::arm_compute::NEReduceMeanEx>(); + + fn->configure(ifm_alloc->handle(), fixed_axis, keep_dims, ofm_alloc->handle()); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const ir::operation::AvgPool2D &node) +{ + const auto ofm_index{node.getOutputs().at(0)}; + const auto ifm_index{node.getInputs().at(ir::operation::AvgPool2D::Input::INPUT)}; + + const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature(_current_subg_layout); + const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature(_current_subg_layout); + + const auto kh = node.param().kh; + const auto kw = node.param().kw; + const auto stride = node.param().stride; + const auto padding = + neurun::util::calculatePadding(node.param().padding, ifm_shape, ofm_shape, stride, kw, kh); + const auto activation = node.param().activation; + + VERBOSE(AvgPool2D) << "IFM_H: " << ifm_shape.H << std::endl; + VERBOSE(AvgPool2D) << "IFM_W: " << ifm_shape.W << std::endl; + VERBOSE(AvgPool2D) << "OFM_H: " << ofm_shape.H << std::endl; + VERBOSE(AvgPool2D) << "OFM_W: " << ofm_shape.W << std::endl; + VERBOSE(AvgPool2D) << "KER_H: " << kh << std::endl; + VERBOSE(AvgPool2D) << "KER_W: " << kw << std::endl; + VERBOSE(AvgPool2D) << "STRIDE_H: " << stride.vertical << std::endl; + VERBOSE(AvgPool2D) << "STRIDE_W: " << stride.horizontal << std::endl; + VERBOSE(AvgPool2D) << "PAD(T): " << padding.top << std::endl; + VERBOSE(AvgPool2D) << "PAD(B): " << padding.bottom << std::endl; + VERBOSE(AvgPool2D) << "PAD(L): " << padding.left << std::endl; + VERBOSE(AvgPool2D) << "PAD(R): " << padding.right << std::endl; + + auto ofm_alloc = _tensor_builder->at(ofm_index).get(); + auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + + ::arm_compute::PoolingLayerInfo info{ + ::arm_compute::PoolingType::AVG, ::arm_compute::Size2D{kw, kh}, + 
acl_common::asPadStrideInfo(padding, stride), true /* exclude_padding */}; + + auto fn = nnfw::cpp14::make_unique<::arm_compute::NEPoolingLayer>(); + + fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), info); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append((std::move(acl_fn))); + + ActivationBuilder{*_execution_builder}.append(activation, ofm_alloc->handle()); +} + +void KernelGenerator::visit(const ir::operation::Concat &node) +{ + const auto ofm_index{node.getOutputs().at(0)}; + + std::vector<ir::OperandIndex> input_indexes; + for (const auto &input : node.getInputs()) + input_indexes.emplace_back(input); + + const auto axis = node.param().axis; + + // If tensor allocator allocate as subtensor + bool canEliminate = true; + for (auto ifm_ind : input_indexes) + { + if (!_tensor_builder->isSubTensorOf(ofm_index, ifm_ind)) + { + canEliminate = false; + break; + } + } + if (canEliminate) + { + // If concat eliminated, return a NOP IFunction + _execution_builder->append(nnfw::cpp14::make_unique<exec::NopFunction>()); + return; + } + + auto output_alloc = _tensor_builder->at(ofm_index).get(); + std::vector<::arm_compute::ITensor *> input_tensors; + for (const auto &ifm_ind : input_indexes) + input_tensors.emplace_back(_tensor_builder->at(ifm_ind)->handle()); + + std::unique_ptr<::arm_compute::IFunction> fn; + if (input_indexes.size() < 2) + { + auto l = nnfw::cpp14::make_unique<::arm_compute::NECopy>(); + l->configure(input_tensors.at(0), output_alloc->handle()); + fn = std::move(l); + } + else + { + auto l = nnfw::cpp14::make_unique<::arm_compute::NEConcatenateLayer>(); + const auto rank = node.param().rank; + const auto frontend_layout = _current_subg_layout; + const auto backend_layout = output_alloc->layout(); + const auto fixed_axis = + acl_common::ToARMComputeAxis(rank, axis, frontend_layout, backend_layout).value(); + l->configure(input_tensors, output_alloc->handle(), fixed_axis); + fn = std::move(l); + } + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const ir::operation::EmbeddingLookup &node) +{ + const auto output_index{node.getOutputs().at(0)}; + const auto lookups_index{node.getInputs().at(ir::operation::EmbeddingLookup::Input::LOOKUPS)}; + const auto values_index{node.getInputs().at(ir::operation::EmbeddingLookup::Input::VALUES)}; + + auto output_alloc = _tensor_builder->at(output_index).get(); + auto lookups_alloc = _tensor_builder->at(lookups_index).get(); + auto values_alloc = _tensor_builder->at(values_index).get(); + + auto fn = nnfw::cpp14::make_unique<::arm_compute::NEEmbeddingLookup>(); + + fn->configure(values_alloc->handle(), output_alloc->handle(), lookups_alloc->handle()); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const ir::operation::Floor &node) +{ + const auto ofm_index{node.getOutputs().at(0)}; + const auto ifm_index{node.getInputs().at(ir::operation::Floor::Input::INPUT)}; + + auto ofm_alloc = _tensor_builder->at(ofm_index).get(); + auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + + auto fn = nnfw::cpp14::make_unique<::arm_compute::NEFloor>(); + + fn->configure(ifm_alloc->handle(), ofm_alloc->handle()); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const ir::operation::FullyConnected &node) +{ + using ir::operation::FullyConnected; + + const auto 
output_index{node.getOutputs().at(0)}; + const auto input_index{node.getInputs().at(FullyConnected::Input::INPUT)}; + const auto weight_index{node.getInputs().at(FullyConnected::Input::WEIGHT)}; + const auto bias_index{node.getInputs().at(FullyConnected::Input::BIAS)}; + + const auto input_rank = _ctx.at(input_index).shape().rank(); + + const auto output_size = + _ctx.at(output_index).shape().dim(_ctx.at(output_index).shape().rank() - 1); + UNUSED_RELEASE(output_size); + assert(_ctx.at(bias_index).shape().dim(0) == output_size); + assert(_ctx.at(weight_index).shape().dim(0) == output_size); + const auto batch_size = + _ctx.at(output_index).shape().dim(_ctx.at(output_index).shape().rank() - 2); + const auto input_size = + _ctx.at(weight_index).shape().dim(_ctx.at(weight_index).shape().rank() - 1); + + // Check for reshaping input's shape into rank-2 + bool needs_reshape = false; + ir::Shape reshape(2); + if (input_rank == 3 || input_rank == 4) + { + const auto &ifm_shape = _ctx.at(input_index).shape(); + auto feature_size = 1; + for (int i = 0; i < ifm_shape.rank(); ++i) + { + feature_size *= ifm_shape.dim(i); + } + + UNUSED_RELEASE(feature_size); + assert(feature_size == batch_size * input_size); + + // for reshaping + needs_reshape = true; + reshape.dim(0) = batch_size; /* H */ + reshape.dim(1) = input_size; /* W */ + } + + const auto activation = node.param().activation; + + auto output_alloc = _tensor_builder->at(output_index).get(); + const auto input_alloc = _tensor_builder->at(input_index).get(); + const auto weight_alloc = _tensor_builder->at(weight_index).get(); + const auto bias_alloc = _tensor_builder->at(bias_index).get(); + const auto frontend_layout = _current_subg_layout; + const auto acl_layout = output_alloc->handle()->info()->data_layout(); + + auto fn = nnfw::cpp14::make_unique<arm_compute::NEFullyConnectedReshapingLayer>( + _tensor_builder->acl_tensor_manager()->internal_buffer_manager()); + + arm_compute::NEFullyConnectedReshapingLayer::KernelType kernel_type = + _ctx.at(weight_index).isConstant() + ? 
arm_compute::NEFullyConnectedReshapingLayer::KernelType::PREPROCESSED_WEIGHTS + : arm_compute::NEFullyConnectedReshapingLayer::KernelType::GENERAL; + + fn->configure( + input_alloc->handle(), weight_alloc->handle(), bias_alloc->handle(), output_alloc->handle(), + needs_reshape, + ::neurun::backend::acl_common::asTensorShape( + reshape, frontend_layout, ::neurun::backend::acl_common::asRuntimeLayout(acl_layout)), + kernel_type); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); + + ActivationBuilder{*_execution_builder}.append(activation, output_alloc->handle()); +} + +void KernelGenerator::visit(const ir::operation::HashtableLookup &node) +{ + const auto output_index{node.getOutputs().at(ir::operation::HashtableLookup::Output::OUTPUT)}; + const auto hits_index{node.getOutputs().at(ir::operation::HashtableLookup::Output::HITS)}; + + const auto lookups_index{node.getInputs().at(ir::operation::HashtableLookup::Input::LOOKUPS)}; + const auto keys_index{node.getInputs().at(ir::operation::HashtableLookup::Input::KEYS)}; + const auto values_index{node.getInputs().at(ir::operation::HashtableLookup::Input::VALUES)}; + + auto output_alloc = _tensor_builder->at(output_index).get(); + auto hits_alloc = _tensor_builder->at(hits_index).get(); + + auto lookups_alloc = _tensor_builder->at(lookups_index).get(); + auto keys_alloc = _tensor_builder->at(keys_index).get(); + auto values_alloc = _tensor_builder->at(values_index).get(); + + auto fn = nnfw::cpp14::make_unique<::arm_compute::NEHashtableLookup>(); + + fn->configure(lookups_alloc->handle(), keys_alloc->handle(), values_alloc->handle(), + output_alloc->handle(), hits_alloc->handle()); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const ir::operation::Gather &node) +{ + const auto ofm_index{node.getOutputs().at(0)}; + + const auto ifm_index{node.getInputs().at(ir::operation::Gather::Input::INPUT)}; + const auto indices_index{node.getInputs().at(ir::operation::Gather::Input::INDICES)}; + + const auto ifm_shape = _ctx.at(ifm_index).shape(); + + const auto ifm_rank = node.param().rank; + const auto axis_raw = node.param().axis; + const auto axis_value = (axis_raw < 0 ? (ifm_rank + axis_raw) : axis_raw); + // Converting in reverse order + const int axis = ::neurun::backend::acl_common::ToARMComputeAxis(ifm_rank, axis_value).value(); + + auto ofm_alloc = _tensor_builder->at(ofm_index).get(); + auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + auto indices_alloc = _tensor_builder->at(indices_index).get(); + const auto backend_layout = ofm_alloc->layout(); + UNUSED_RELEASE(backend_layout); + + // NOTE The frontend layout and backend layout must be the same for this operation. + // If not the same, we have to add a stage(?) to perform permutation of output tensor. It + // is not not efficient even if it works well. If so, it would be better to set the + // layout of these backend tensors to the same layout. + // There is also one thing we have to think about. This operation depends on the layout of + // a model. For example, if a model in NHWC has this operation as output rank == 4, indices + // rank == 2 and axis == 2, this operation should work as the axis W and C, but the axis W + // and C are not sequential in NCHW. So the backend in NCHW cannot handle this case. 
+ assert(backend_layout == ifm_alloc->layout()); + assert(backend_layout == indices_alloc->layout()); + assert(ifm_rank < 4 || _current_subg_layout == backend_layout); + + auto fn = nnfw::cpp14::make_unique<::arm_compute::NEGatherEx>(); + + fn->configure(ifm_alloc->handle(), indices_alloc->handle(), ofm_alloc->handle(), axis); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const ir::operation::InstanceNorm &node) +{ + const auto ofm_index{node.getOutputs().at(0)}; + const auto ifm_index{node.getInputs().at(ir::operation::InstanceNorm::Input::INPUT)}; + const auto gamma_index{node.getInputs().at(ir::operation::InstanceNorm::Input::GAMMA)}; + const auto beta_index{node.getInputs().at(ir::operation::InstanceNorm::Input::BETA)}; + + auto ofm_alloc = _tensor_builder->at(ofm_index).get(); + auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + auto gamma_alloc = _tensor_builder->at(gamma_index).get(); + auto beta_alloc = _tensor_builder->at(beta_index).get(); + auto epsilon = node.param().epsilon; + auto activation = node.param().activation; + + auto fn = nnfw::cpp14::make_unique<::arm_compute::NEInstanceNormalizationLayerEx>(); + + fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), gamma_alloc->handle(), + beta_alloc->handle(), epsilon); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); + + ActivationBuilder{*_execution_builder}.append(activation, ofm_alloc->handle()); +} + +void KernelGenerator::visit(const ir::operation::L2Normalization &node) +{ + const auto ofm_index{node.getOutputs().at(0)}; + const auto ifm_index{node.getInputs().at(ir::operation::L2Normalization::Input::INPUT)}; + + // {CL|Neon}L2Normalization performs the reduction only along dimension 0 + // L2 Normalization always performs the reduction along the depth axis + // Thus, we repurpose {CL|Neon}NormalizationLayers to act as depthwise L2 normalizations by + // choosing normalization parameters as below + + const auto &ifm_shape = _ctx.at(ifm_index).shape(); + // TODO Support optional constant dimension that normalization would be performed on + const auto normalization_axis = node.param().rank - 1; + int32_t radius = + 2 * ifm_shape.dim(normalization_axis) + 1; // normSize = depth(last dimension) * 2 + 1 + float alpha = 1.0f; // In the implementation to make alpha_ become 1 + float beta = 0.5f; // pow(reduction, -0.5) = 1 / sqrt(reduction) + float bias = 0.0f; // Don't offset the reduction. 
+ + auto ofm_alloc = _tensor_builder->at(ofm_index).get(); + auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + + const auto norm_info = ::arm_compute::NormalizationLayerInfo(::arm_compute::NormType::CROSS_MAP, + radius, alpha, beta, bias, false); + + auto fn = nnfw::cpp14::make_unique<::arm_compute::NENormalizationLayer>(); + + fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), norm_info); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const ir::operation::L2Pool2D &node) +{ + const auto ofm_index{node.getOutputs().at(0)}; + const auto ifm_index{node.getInputs().at(ir::operation::L2Pool2D::Input::INPUT)}; + + const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature(_current_subg_layout); + const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature(_current_subg_layout); + + uint32_t kw = node.param().kw; + uint32_t kh = node.param().kh; + const auto stride = node.param().stride; + const auto padding = + neurun::util::calculatePadding(node.param().padding, ifm_shape, ofm_shape, stride, kw, kh); + const auto activation = node.param().activation; + + auto ofm_alloc = _tensor_builder->at(ofm_index).get(); + auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + + ::arm_compute::PoolingLayerInfo info{ + ::arm_compute::PoolingType::L2, ::arm_compute::Size2D{kw, kh}, + ::neurun::backend::acl_common::asPadStrideInfo(padding, stride)}; + + auto fn = nnfw::cpp14::make_unique<::arm_compute::NEPoolingLayer>(); + + fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), info); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); + + ActivationBuilder{*_execution_builder}.append(activation, ofm_alloc->handle()); +} + +void KernelGenerator::visit(const ir::operation::LocalResponseNormalization &node) +{ + const auto ofm_index{node.getOutputs().at(0)}; + const auto ifm_index{ + node.getInputs().at(ir::operation::LocalResponseNormalization::Input::INPUT)}; + + auto radius = node.param().radius; + auto alpha = node.param().alpha; + auto beta = node.param().beta; + auto bias = node.param().bias; + + auto ofm_alloc = _tensor_builder->at(ofm_index).get(); + auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + + const auto norm_info = ::arm_compute::NormalizationLayerInfo( + ::arm_compute::NormType::CROSS_MAP, radius * 2 + 1, alpha, beta, bias, false); + + auto fn = nnfw::cpp14::make_unique<::arm_compute::NENormalizationLayer>(); + + fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), norm_info); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const ir::operation::LogicalAnd &node) +{ + const auto output_index{node.getOutputs().at(0)}; + const auto input0_index{node.getInputs().at(ir::operation::LogicalAnd::Input::INPUT0)}; + const auto input1_index{node.getInputs().at(ir::operation::LogicalAnd::Input::INPUT1)}; + + auto output_alloc = _tensor_builder->at(output_index).get(); + auto input0_alloc = _tensor_builder->at(input0_index).get(); + auto input1_alloc = _tensor_builder->at(input1_index).get(); + + auto fn = nnfw::cpp14::make_unique<::arm_compute::NELogicalAnd>(); + + fn->configure(input0_alloc->handle(), input1_alloc->handle(), output_alloc->handle()); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const ir::operation::LogicalNot &node) +{ + const auto 
output_index{node.getOutputs().at(0)}; + const auto input_index{node.getInputs().at(ir::operation::LogicalNot::Input::INPUT)}; + + auto output_alloc = _tensor_builder->at(output_index).get(); + auto input_alloc = _tensor_builder->at(input_index).get(); + + auto fn = nnfw::cpp14::make_unique<::arm_compute::NEBitwiseNot>(); + + fn->configure(input_alloc->handle(), output_alloc->handle()); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const ir::operation::LogicalOr &node) +{ + const auto output_index{node.getOutputs().at(0)}; + const auto input0_index{node.getInputs().at(ir::operation::LogicalOr::Input::INPUT0)}; + const auto input1_index{node.getInputs().at(ir::operation::LogicalOr::Input::INPUT1)}; + + auto output_alloc = _tensor_builder->at(output_index).get(); + auto input0_alloc = _tensor_builder->at(input0_index).get(); + auto input1_alloc = _tensor_builder->at(input1_index).get(); + + auto fn = nnfw::cpp14::make_unique<::arm_compute::NELogicalOr>(); + + fn->configure(input0_alloc->handle(), input1_alloc->handle(), output_alloc->handle()); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const ir::operation::Logistic &node) +{ + const auto ofm_index{node.getOutputs().at(0)}; + const auto ifm_index{node.getInputs().at(ir::operation::Logistic::Input::INPUT)}; + + auto ofm_alloc = _tensor_builder->at(ofm_index).get(); + auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + + const ::arm_compute::ActivationLayerInfo act_info{ + ::arm_compute::ActivationLayerInfo::ActivationFunction::LOGISTIC}; + + auto fn = nnfw::cpp14::make_unique<::arm_compute::NEActivationLayer>(); + + fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), act_info); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const ir::operation::LSTM &node) +{ + // TODO Support dynamic rnn + // TODO Fix subtle error in the case of non-CIFG, non-peephole and No Projection. 
+ const auto scratch_buffer_index{ + node.getOutputs().at(ir::operation::LSTM::Output::SCRATCH_BUFFER)}; + const auto output_state_out_index{ + node.getOutputs().at(ir::operation::LSTM::Output::OUTPUT_STATE_OUT)}; + const auto cell_state_out_index{ + node.getOutputs().at(ir::operation::LSTM::Output::CELL_STATE_OUT)}; + const auto output_index{node.getOutputs().at(ir::operation::LSTM::Output::OUTPUT)}; + + const auto input_index{node.getInputs().at(ir::operation::LSTM::Input::INPUT)}; + const auto input_to_input_weights_index{ + node.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_INPUT_WEIGHTS)}; // optional + const auto input_to_forget_weights_index{ + node.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_FORGET_WEIGHTS)}; + const auto input_to_cell_weights_index{ + node.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_CELL_WEIGHTS)}; + const auto input_to_output_weights_index{ + node.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_OUTPUT_WEIGHTS)}; + const auto recurrent_to_input_weights_index{ + node.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_INPUT_WEIGHTS)}; // optional + const auto recurrent_to_forget_weights_index{ + node.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_FORGET_WEIGHTS)}; + const auto recurrent_to_cell_weights_index{ + node.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_CELL_WEIGHTS)}; + const auto recurrent_to_output_weights_index{ + node.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_OUTPUT_WEIGHTS)}; + const auto cell_to_input_weights_index{ + node.getInputs().at(ir::operation::LSTM::Input::CELL_TO_INPUT_WEIGHTS)}; // optional + const auto cell_to_forget_weights_index{ + node.getInputs().at(ir::operation::LSTM::Input::CELL_TO_FORGET_WEIGHTS)}; // optional + const auto cell_to_output_weights_index{ + node.getInputs().at(ir::operation::LSTM::Input::CELL_TO_OUTPUT_WEIGHTS)}; // optional + const auto input_gate_bias_index{ + node.getInputs().at(ir::operation::LSTM::Input::INPUT_GATE_BIAS)}; + const auto forget_gate_bias_index{ + node.getInputs().at(ir::operation::LSTM::Input::FORGET_GATE_BIAS)}; + const auto cell_bias_index{node.getInputs().at(ir::operation::LSTM::Input::CELL_BIAS)}; + const auto output_gate_bias_index{ + node.getInputs().at(ir::operation::LSTM::Input::OUTPUT_GATE_BIAS)}; + const auto projection_weights_index{ + node.getInputs().at(ir::operation::LSTM::Input::PROJECTION_WEIGHTS)}; // optional + const auto projection_bias_index{ + node.getInputs().at(ir::operation::LSTM::Input::PROJECTION_BIAS)}; // optional + const auto output_state_in_index{ + node.getInputs().at(ir::operation::LSTM::Input::OUTPUT_STATE_IN)}; + const auto cell_state_in_index{node.getInputs().at(ir::operation::LSTM::Input::CELL_STATE_IN)}; + const auto cell_threshold = node.param().cell_threshold; + const auto projection_threshold = node.param().projection_threshold; + + bool has_input_to_input_weights = _ctx.at(input_to_input_weights_index).shape().dim(0) != 0 && + _ctx.at(input_to_input_weights_index).shape().dim(1) != 0; + bool has_recurrent_to_input_weights = + _ctx.at(recurrent_to_input_weights_index).shape().dim(0) != 0 && + _ctx.at(recurrent_to_input_weights_index).shape().dim(1) != 0; + bool has_cell_to_forget_weights = _ctx.at(cell_to_forget_weights_index).shape().dim(0) != 0; + bool has_cell_to_output_weights = _ctx.at(cell_to_output_weights_index).shape().dim(0) != 0; + bool has_projection_weights = _ctx.at(projection_weights_index).shape().dim(0) != 0 && + _ctx.at(projection_weights_index).shape().dim(1) != 0; + bool 
has_projection_bias = _ctx.at(projection_bias_index).shape().dim(0) != 0;
+
+  // NOTE The input_to_input_weights and the recurrent_to_input_weights do not exist in CIFG.
+  // true: no CIFG
+  // false: CIFG
+  // NOTE The cell_to_input_weights does not exist in the non-peephole case even for a regular
+  // (non-CIFG) LSTM.
+  bool has_cifg_param = has_input_to_input_weights && has_recurrent_to_input_weights;
+
+  // NOTE The cell_to_forget_weights and the cell_to_output_weights exist in peephole.
+  // But the cell_to_input_weights does not exist in the CIFG case even when peephole is used.
+  // true: peephole
+  // false: no peephole
+  bool has_peephole_param = has_cell_to_forget_weights && has_cell_to_output_weights;
+
+  // NOTE Even when the projection weights have data, the projection bias may not.
+  bool has_projection_param = has_projection_weights;
+
+  const auto activation = node.param().activation;
+  const auto cell_clip = cell_threshold;
+  const auto projection_clip = projection_threshold;
+  assert(cell_clip >= 0.f && projection_clip >= 0.f);
+
+  auto scratch_buffer_alloc = _tensor_builder->at(scratch_buffer_index).get();
+  auto output_state_out_alloc = _tensor_builder->at(output_state_out_index).get();
+  auto cell_state_out_alloc = _tensor_builder->at(cell_state_out_index).get();
+  auto output_alloc = _tensor_builder->at(output_index).get();
+
+  auto input_alloc = _tensor_builder->at(input_index).get();
+
+  auto input_to_forget_weights_alloc = _tensor_builder->at(input_to_forget_weights_index).get();
+  auto input_to_cell_weights_alloc = _tensor_builder->at(input_to_cell_weights_index).get();
+  auto input_to_output_weights_alloc = _tensor_builder->at(input_to_output_weights_index).get();
+  auto recurrent_to_forget_weights_alloc =
+      _tensor_builder->at(recurrent_to_forget_weights_index).get();
+  auto recurrent_to_cell_weights_alloc = _tensor_builder->at(recurrent_to_cell_weights_index).get();
+  auto recurrent_to_output_weights_alloc =
+      _tensor_builder->at(recurrent_to_output_weights_index).get();
+
+  auto forget_gate_bias_alloc = _tensor_builder->at(forget_gate_bias_index).get();
+  auto cell_bias_alloc = _tensor_builder->at(cell_bias_index).get();
+  auto output_gate_bias_alloc = _tensor_builder->at(output_gate_bias_index).get();
+  auto output_state_in_alloc = _tensor_builder->at(output_state_in_index).get();
+  auto cell_state_in_alloc = _tensor_builder->at(cell_state_in_index).get();
+
+  auto act_info = ::neurun::backend::acl_common::asActivationLayerInfo(activation);
+
+  auto fn = nnfw::cpp14::make_unique<::arm_compute::NELSTMLayer>();
+
+  ::arm_compute::LSTMParams<::arm_compute::ITensor> lstm_params{};
+  if (has_cifg_param)
+  {
+    auto input_to_input_weights_alloc =
+        _tensor_builder->at(input_to_input_weights_index).get(); // optional
+    auto recurrent_to_input_weights_alloc =
+        _tensor_builder->at(recurrent_to_input_weights_index).get(); // optional
+    auto cell_to_input_weights_handle =
+        has_peephole_param ?
_tensor_builder->at(cell_to_input_weights_index).get()->handle() + : nullptr; // optional (non-cifg && peephole) + auto input_gate_bias_alloc = _tensor_builder->at(input_gate_bias_index).get(); // optional + lstm_params.set_cifg_params(input_to_input_weights_alloc->handle(), + recurrent_to_input_weights_alloc->handle(), + cell_to_input_weights_handle, input_gate_bias_alloc->handle()); + } + if (has_peephole_param) + { + auto cell_to_forget_weights_alloc = + _tensor_builder->at(cell_to_forget_weights_index).get(); // optional + auto cell_to_output_weights_alloc = + _tensor_builder->at(cell_to_output_weights_index).get(); // optional + lstm_params.set_peephole_params(cell_to_forget_weights_alloc->handle(), + cell_to_output_weights_alloc->handle()); + } + if (has_projection_param) + { + auto projection_weights_alloc = _tensor_builder->at(projection_weights_index).get(); // optional + auto projection_bias_handle = has_projection_bias + ? _tensor_builder->at(projection_bias_index).get()->handle() + : nullptr; // optional + lstm_params.set_projection_params(projection_weights_alloc->handle(), projection_bias_handle); + } + + fn->configure( + input_alloc->handle(), input_to_forget_weights_alloc->handle(), + input_to_cell_weights_alloc->handle(), input_to_output_weights_alloc->handle(), + recurrent_to_forget_weights_alloc->handle(), recurrent_to_cell_weights_alloc->handle(), + recurrent_to_output_weights_alloc->handle(), forget_gate_bias_alloc->handle(), + cell_bias_alloc->handle(), output_gate_bias_alloc->handle(), output_state_in_alloc->handle(), + cell_state_in_alloc->handle(), scratch_buffer_alloc->handle(), + output_state_out_alloc->handle(), cell_state_out_alloc->handle(), output_alloc->handle(), + lstm_params, act_info, cell_clip, projection_clip); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const ir::operation::Mul &node) +{ + const auto ofm_index{node.getOutputs().at(0)}; + const auto lhs_index{node.getInputs().at(ir::operation::Mul::Input::LHS)}; + const auto rhs_index{node.getInputs().at(ir::operation::Mul::Input::RHS)}; + + const auto activation = node.param().activation; + + auto ofm_alloc = _tensor_builder->at(ofm_index).get(); + auto lhs_alloc = _tensor_builder->at(lhs_index).get(); + auto rhs_alloc = _tensor_builder->at(rhs_index).get(); + + auto fn = nnfw::cpp14::make_unique<::arm_compute::NEPixelWiseMultiplication>(); + + // RoundingPolicy for scale:1.0 is only allowed RoundingPolicy::TO_ZERO + fn->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle(), 1.0, // scale + arm_compute::ConvertPolicy::SATURATE, arm_compute::RoundingPolicy::TO_ZERO); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); + + ActivationBuilder{*_execution_builder}.append(activation, ofm_alloc->handle()); +} + +void KernelGenerator::visit(const ir::operation::Neg &node) +{ + const auto ofm_index{node.getOutputs().at(0)}; + const auto ifm_index{node.getInputs().at(ir::operation::Neg::Input::INPUT)}; + + auto ofm_alloc = _tensor_builder->at(ofm_index).get(); + auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + + auto fn = nnfw::cpp14::make_unique<::arm_compute::NENegLayer>(); + + fn->configure(ifm_alloc->handle(), ofm_alloc->handle()); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const ir::operation::Pack &node) +{ + const auto output_index{node.getOutputs().at(0)}; 
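+  // NEStackLayer stacks all inputs along a single axis. The axis parameter arrives in frontend
+  // coordinates, so a negative value is wrapped by the output rank and the result is converted
+  // to the matching ACL axis of the output tensor before configure() below.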
+ auto axis{node.param().axis}; + + const auto output_rank = node.param().rank; + + std::vector<ir::OperandIndex> input_indexes; + for (const auto &input_index : node.getInputs()) + input_indexes.emplace_back(input_index); + + auto output = _tensor_builder->at(output_index).get()->handle(); + std::vector<arm_compute::ITensor *> inputs; + for (const auto &input_index : input_indexes) + inputs.emplace_back(_tensor_builder->at(input_index)->handle()); + + const auto frontend_layout = _current_subg_layout; + const auto backend_layout = _tensor_builder->at(output_index).get()->layout(); + + if (axis < 0) + axis += output_rank; + axis = acl_common::ToARMComputeAxis(output_rank, axis, frontend_layout, backend_layout).value(); + + auto fn = nnfw::cpp14::make_unique<::arm_compute::NEStackLayer>(); + + fn->configure(inputs, axis, output); + + _execution_builder->append(asAclFunction(std::move(fn))); +} + +void KernelGenerator::visit(const ir::operation::Pad &node) +{ + const auto input_index{node.getInputs().at(ir::operation::Pad::Input::INPUT)}; + const auto pad_index{node.getInputs().at(ir::operation::Pad::Input::PAD)}; + const auto output_index{node.getOutputs().at(0)}; + assert(_ctx.at(pad_index).isConstant()); + + auto rank = node.param().rank; + auto pad_base = _ctx.at(pad_index).data().base(); + + auto input = _tensor_builder->at(input_index).get()->handle(); + auto output = _tensor_builder->at(output_index).get()->handle(); + + ::arm_compute::PaddingList padding_list; + padding_list.resize(rank); + for (int32_t n = 0; n < rank; ++n) + { + const int32_t *from = reinterpret_cast<const int32_t *>(pad_base) + (n * 2); + + const auto frontend_layout = _current_subg_layout; + const auto backend_layout = _tensor_builder->at(input_index).get()->layout(); + const auto axis = + acl_common::ToARMComputeAxis(rank, n, frontend_layout, backend_layout).value(); + padding_list[axis] = ::arm_compute::PaddingInfo{from[0], from[1]}; + } + + const auto input_type = _ctx.at(input_index).typeInfo(); + UNUSED_RELEASE(input_type); + assert(input->info()->data_type() == acl_common::asDataType(input_type.type())); + assert(input->info()->quantization_info() == + ::arm_compute::QuantizationInfo(input_type.scale(), input_type.offset())); + const auto pixel_value = + ::arm_compute::PixelValue(0, input->info()->data_type(), input->info()->quantization_info()); + + auto fn = nnfw::cpp14::make_unique<::arm_compute::NEPadLayer>(); + fn->configure(input, output, padding_list, pixel_value); + + _execution_builder->append(asAclFunction(std::move(fn))); +} + +void KernelGenerator::visit(const ir::operation::Permute &node) +{ + const auto ofm_idx{node.getOutputs().at(0)}; + const auto ifm_idx{node.getInputs().at(0)}; + const auto permute_type = node.getPermuteType(); + auto ofm_alloc = _tensor_builder->at(ofm_idx).get(); + auto ifm_alloc = _tensor_builder->at(ifm_idx).get(); + const auto rank = _ctx.at(ofm_idx).shape().rank(); + assert(_ctx.at(ifm_idx).shape().rank() == _ctx.at(ofm_idx).shape().rank()); + + std::unique_ptr<::arm_compute::IFunction> fn; + arm_compute::PermutationVector pv; + if (permute_type == ir::operation::Permute::Type::NCHW_TO_NHWC && rank == 4) + { + // WHCN -> CWHN + pv = arm_compute::PermutationVector{2, 0, 1}; + + auto l = nnfw::cpp14::make_unique<::arm_compute::NEPermute>(); + + l->configure(ifm_alloc->handle(), ofm_alloc->handle(), pv); + + fn = std::move(l); + } + else if (permute_type == ir::operation::Permute::Type::NHWC_TO_NCHW && rank == 4) + { + // CWHN -> WHCN + pv = 
arm_compute::PermutationVector{1, 2, 0}; + + auto l = nnfw::cpp14::make_unique<::arm_compute::NEPermute>(); + + l->configure(ifm_alloc->handle(), ofm_alloc->handle(), pv); + + fn = std::move(l); + } + else + { + auto l = nnfw::cpp14::make_unique<::arm_compute::NECopy>(); + + l->configure(ifm_alloc->handle(), ofm_alloc->handle()); + + fn = std::move(l); + } + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const ir::operation::PReLU &node) +{ + const auto ofm_index{node.getOutputs().at(0)}; + const auto ifm_index{node.getInputs().at(ir::operation::PReLU::Input::INPUT)}; + const auto alpha_index{node.getInputs().at(ir::operation::PReLU::Input::ALPHA)}; + + auto ofm_alloc = _tensor_builder->at(ofm_index).get(); + auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + auto alpha_alloc = _tensor_builder->at(alpha_index).get(); + + std::unique_ptr<::arm_compute::IFunction> fn; + + auto l = nnfw::cpp14::make_unique<::arm_compute::NEPReLU>(); + + l->configure(ifm_alloc->handle(), alpha_alloc->handle(), ofm_alloc->handle()); + + fn = std::move(l); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const ir::operation::ReduceMax &node) +{ + const auto ofm_index{node.getOutputs().at(0)}; + const auto ifm_index{node.getInputs().at(ir::operation::ReduceMax::Input::INPUT)}; + const auto &axes{node.param().axes}; + + auto ofm_alloc = _tensor_builder->at(ofm_index).get(); + auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + const auto frontend_layout = _current_subg_layout; + const auto backend_layout = ifm_alloc->layout(); + + // Convert to ACL axes taking into account negative values and possible duplicates. + std::set<std::uint32_t> acl_axes; + const int ifm_rank = node.param().rank; + for (int axis : axes) + { + if (axis < 0) + axis += ifm_rank; + acl_axes.insert( + acl_common::ToARMComputeAxis(ifm_rank, axis, frontend_layout, backend_layout).value()); + } + + arm_compute::Coordinates reduce_axes; + for (const auto axis : acl_axes) + { + reduce_axes.set(reduce_axes.num_dimensions(), axis); + } + + auto fn = nnfw::cpp14::make_unique<::arm_compute::NEReduceOperation>(); + + fn->configure(ifm_alloc->handle(), reduce_axes, false, ofm_alloc->handle(), + ::arm_compute::ReduceOperation::MAX); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const ir::operation::ReduceMin &node) +{ + const auto ofm_index{node.getOutputs().at(0)}; + const auto ifm_index{node.getInputs().at(ir::operation::ReduceMin::Input::INPUT)}; + const auto &axes{node.param().axes}; + + auto ofm_alloc = _tensor_builder->at(ofm_index).get(); + auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + const auto frontend_layout = _current_subg_layout; + const auto backend_layout = ifm_alloc->layout(); + + // Convert to ACL axes taking into account negative values and possible duplicates. 
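+  // Negative axes are wrapped by the input rank, duplicates collapse via the std::set, and each
+  // frontend axis is translated with ToARMComputeAxis, which accounts for ACL's reversed
+  // dimension ordering and for any NCHW/NHWC layout difference between frontend and backend.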
+ std::set<std::uint32_t> acl_axes; + const int ifm_rank = node.param().rank; + for (int axis : axes) + { + if (axis < 0) + axis += ifm_rank; + acl_axes.insert( + acl_common::ToARMComputeAxis(ifm_rank, axis, frontend_layout, backend_layout).value()); + } + + arm_compute::Coordinates reduce_axes; + for (const auto axis : acl_axes) + { + reduce_axes.set(reduce_axes.num_dimensions(), axis); + } + + auto fn = nnfw::cpp14::make_unique<::arm_compute::NEReduceOperation>(); + + fn->configure(ifm_alloc->handle(), reduce_axes, false, ofm_alloc->handle(), + ::arm_compute::ReduceOperation::MIN); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const ir::operation::ReduceSum &node) +{ + const auto output_index{node.getOutputs().at(0)}; + const auto input_index{node.getInputs().at(ir::operation::ReduceSum::Input::INPUT)}; + const auto &axes{node.param().axes}; + + auto output_alloc = _tensor_builder->at(output_index).get(); + auto input_alloc = _tensor_builder->at(input_index).get(); + const auto frontend_layout = _current_subg_layout; + const auto backend_layout = input_alloc->layout(); + + // Convert to ACL axes taking into account negative values and possible duplicates. + std::set<std::uint32_t> acl_axes; + const int input_rank = node.param().rank; + for (int axis : axes) + { + if (axis < 0) + axis += input_rank; + acl_axes.insert( + acl_common::ToARMComputeAxis(input_rank, axis, frontend_layout, backend_layout).value()); + } + + arm_compute::Coordinates fixed_axes; + for (const auto axis : acl_axes) + { + fixed_axes.set(fixed_axes.num_dimensions(), axis); + } + + auto fn = nnfw::cpp14::make_unique<::arm_compute::NEReduceSum>(); + + fn->configure(input_alloc->handle(), fixed_axes, false, output_alloc->handle()); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const ir::operation::ReLU &node) +{ + const auto output_index{node.getOutputs().at(0)}; + const auto input_index{node.getInputs().at(ir::operation::ReLU::Input::INPUT)}; + + auto output_alloc = _tensor_builder->at(output_index).get(); + auto input_alloc = _tensor_builder->at(input_index).get(); + + auto fn = nnfw::cpp14::make_unique<arm_compute::NEActivationLayer>(); + + const ::arm_compute::ActivationLayerInfo act_info{ + ::arm_compute::ActivationLayerInfo::ActivationFunction::RELU}; + + fn->configure(input_alloc->handle(), output_alloc->handle(), act_info); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const ir::operation::ReLU1 &node) +{ + const auto ofm_index{node.getOutputs().at(0)}; + const auto ifm_index{node.getInputs().at(ir::operation::ReLU1::Input::INPUT)}; + + auto ofm_alloc = _tensor_builder->at(ofm_index).get(); + auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + + const ::arm_compute::ActivationLayerInfo act_info{ + ::arm_compute::ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 1.0f, -1.0f}; + + auto fn = nnfw::cpp14::make_unique<::arm_compute::NEActivationLayer>(); + + fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), act_info); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const ir::operation::ReLU6 &node) +{ + const auto ofm_index{node.getOutputs().at(0)}; + const auto ifm_index{node.getInputs().at(ir::operation::ReLU6::Input::INPUT)}; + + auto ofm_alloc = 
_tensor_builder->at(ofm_index).get(); + auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + + const ::arm_compute::ActivationLayerInfo act_info{ + ::arm_compute::ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 6.0f}; + + auto fn = nnfw::cpp14::make_unique<::arm_compute::NEActivationLayer>(); + + fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), act_info); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const ir::operation::Reshape &node) +{ + const auto output_index{node.getOutputs().at(0)}; + const auto input_index{node.getInputs().at(ir::operation::Reshape::Input::INPUT)}; + + auto output_alloc = _tensor_builder->at(output_index).get(); + auto input_alloc = _tensor_builder->at(input_index).get(); + + // NOTE This operation must not be changed the layout from frontend to backend + // So, PermutationOperationPass makes layouts of frontend and backend the same. + const auto frontend_layout = _current_subg_layout; + const auto backend_layout = output_alloc->layout(); + assert((_ctx.at(input_index).shape().rank() < 4 && _ctx.at(output_index).shape().rank() < 4) || + frontend_layout == backend_layout); + UNUSED_RELEASE(frontend_layout); + UNUSED_RELEASE(backend_layout); + + auto fn = nnfw::cpp14::make_unique<arm_compute::NEReshapeLayer>(); + + fn->configure(input_alloc->handle(), output_alloc->handle()); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const ir::operation::ResizeBilinear &node) +{ + const auto ofm_index{node.getOutputs().at(0)}; + + const auto ifm_index{node.getInputs().at(ir::operation::ResizeBilinear::Input::INPUT)}; + + auto ofm_alloc = _tensor_builder->at(ofm_index).get(); + auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + + auto fn = nnfw::cpp14::make_unique<::arm_compute::NEScale>(); + + fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), + ::arm_compute::InterpolationPolicy::BILINEAR, ::arm_compute::BorderMode::REPLICATE, + ::arm_compute::PixelValue(0.f), ::arm_compute::SamplingPolicy::TOP_LEFT); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const ir::operation::RNN &node) +{ + const auto output_index{node.getOutputs().at(ir::operation::RNN::Output::OUTPUT)}; + const auto hidden_state_out_index{ + node.getOutputs().at(ir::operation::RNN::Output::HIDDEN_STATE_OUT)}; + + const auto input_index{node.getInputs().at(ir::operation::RNN::Input::INPUT)}; + const auto weights_index{node.getInputs().at(ir::operation::RNN::Input::WEIGHTS)}; + const auto recurrent_weights_index{ + node.getInputs().at(ir::operation::RNN::Input::RECURRENT_WEIGHTS)}; + const auto bias_index{node.getInputs().at(ir::operation::RNN::Input::BIAS)}; + const auto hidden_state_in_index{node.getInputs().at(ir::operation::RNN::Input::HIDDEN_STATE_IN)}; + + const auto activation = node.param().activation; + + auto output_alloc = _tensor_builder->at(output_index).get(); + auto hidden_state_out_alloc = _tensor_builder->at(hidden_state_out_index).get(); + + auto input_alloc = _tensor_builder->at(input_index).get(); + auto weights_alloc = _tensor_builder->at(weights_index).get(); + auto recurrent_weights_alloc = _tensor_builder->at(recurrent_weights_index).get(); + auto bias_alloc = _tensor_builder->at(bias_index).get(); + auto hidden_state_in_alloc = _tensor_builder->at(hidden_state_in_index).get(); + auto act_info = 
::neurun::backend::acl_common::asActivationLayerInfo(activation); + + auto copy_layer = nnfw::cpp14::make_unique<::arm_compute::NECopy>(); + copy_layer->configure(hidden_state_in_alloc->handle(), hidden_state_out_alloc->handle()); + _execution_builder->append(asAclFunction(std::move(copy_layer))); + + auto fn = nnfw::cpp14::make_unique<::arm_compute::NERNNLayerEx>( + _tensor_builder->acl_tensor_manager()->internal_buffer_manager()); + fn->configure(input_alloc->handle(), weights_alloc->handle(), recurrent_weights_alloc->handle(), + bias_alloc->handle(), hidden_state_out_alloc->handle(), output_alloc->handle(), + act_info); + _execution_builder->append(asAclFunction(std::move(fn))); +} + +void KernelGenerator::visit(const ir::operation::RSQRT &node) +{ + const auto ofm_index{node.getOutputs().at(0)}; + const auto ifm_index{node.getInputs().at(ir::operation::RSQRT::Input::INPUT)}; + + auto ofm_alloc = _tensor_builder->at(ofm_index).get(); + auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + + auto fn = nnfw::cpp14::make_unique<::arm_compute::NERsqrtLayer>(); + + fn->configure(ifm_alloc->handle(), ofm_alloc->handle()); + + _execution_builder->append(asAclFunction(std::move(fn))); +} + +void KernelGenerator::visit(const ir::operation::Squeeze &node) +{ + // Squeeze is identical to reshape except that it has an optional dimensions input. + // In addition, optional dims_index is ignored since output tensor already has squeezed shape + // by freezer and toco + const auto output_index{node.getOutputs().at(0)}; + const auto input_index{node.getInputs().at(ir::operation::Squeeze::Input::INPUT)}; + const auto dims{node.param().dims}; + const auto ndim{node.param().ndim}; + (void)dims; + (void)ndim; + + auto output_alloc = _tensor_builder->at(output_index).get(); + auto input_alloc = _tensor_builder->at(input_index).get(); + auto fn = nnfw::cpp14::make_unique<arm_compute::NEReshapeLayer>(); + fn->configure(input_alloc->handle(), output_alloc->handle()); + auto acl_fn = asAclFunction(std::move(fn)); + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const ir::operation::Tanh &node) +{ + const auto output_index{node.getOutputs().at(0)}; + const auto input_index{node.getInputs().at(ir::operation::Tanh::Input::INPUT)}; + + auto output_alloc = _tensor_builder->at(output_index).get(); + auto input_alloc = _tensor_builder->at(input_index).get(); + + auto fn = nnfw::cpp14::make_unique<arm_compute::NEActivationLayer>(); + + const ::arm_compute::ActivationLayerInfo act_info{ + ::arm_compute::ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f}; + + fn->configure(input_alloc->handle(), output_alloc->handle(), act_info); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const ir::operation::Softmax &node) +{ + const auto output_index{node.getOutputs().at(0)}; + const auto input_index{node.getInputs().at(ir::operation::Softmax::Input::INPUT)}; + const auto beta = node.param().beta; + + auto output_alloc = _tensor_builder->at(output_index).get(); + auto input_alloc = _tensor_builder->at(input_index).get(); + + auto fn = nnfw::cpp14::make_unique<::arm_compute::NESoftmaxLayer>( + _tensor_builder->acl_tensor_manager()->internal_buffer_manager()); + + fn->configure(input_alloc->handle(), output_alloc->handle(), beta); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const ir::operation::SpaceToBatchND &node) +{ 
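+  // block_size and paddings are handed to ACL as tensors rather than attributes, so both must be
+  // constant operands here (asserted below) and their backend tensors are looked up from the
+  // tensor builder like any other input.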
+ const auto ofm_index{node.getOutputs().at(0)}; + const auto ifm_index{node.getInputs().at(ir::operation::SpaceToBatchND::Input::INPUT)}; + const auto block_size_index{ + node.getInputs().at(ir::operation::SpaceToBatchND::Input::BLOCK_SIZE)}; + const auto paddings_index{node.getInputs().at(ir::operation::SpaceToBatchND::Input::PADDINGS)}; + + auto ofm_alloc = _tensor_builder->at(ofm_index).get(); + auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + auto block_size_alloc = _tensor_builder->at(block_size_index).get(); + auto paddings_alloc = _tensor_builder->at(paddings_index).get(); + + assert(_ctx.at(block_size_index).isConstant()); + assert(_ctx.at(paddings_index).isConstant()); + + // NESpaceToBatchLayer has a bug that padding's values are 0 even when zero point of QASYMM8 is + // not 0. + auto fn = nnfw::cpp14::make_unique<::arm_compute::NESpaceToBatchLayerEx>(); + + fn->configure(ifm_alloc->handle(), block_size_alloc->handle(), paddings_alloc->handle(), + ofm_alloc->handle()); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const ir::operation::SpaceToDepth &node) +{ + const auto ofm_index{node.getOutputs().at(0)}; + const auto ifm_index{node.getInputs().at(ir::operation::SpaceToDepth::Input::INPUT)}; + + auto block_size = node.param().block_size; + + auto ofm_alloc = _tensor_builder->at(ofm_index).get(); + auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + + auto fn = nnfw::cpp14::make_unique<::arm_compute::NESpaceToDepthLayerEx>(); + + fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), block_size); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const ir::operation::Split &node) +{ + // TODO Support this op by SubTensor + const auto ifm_index{node.getInputs().at(ir::operation::Split::Input::INPUT)}; + + assert(node.param().num_splits == static_cast<int>(node.getOutputs().size())); + + const auto ifm_rank = node.param().rank; + std::vector<ir::OperandIndex> output_indexes; + for (const auto &output : node.getOutputs()) + output_indexes.emplace_back(output); + + auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + std::vector<arm_compute::ITensor *> output_allocs; + for (const auto &ofm_ind : output_indexes) + output_allocs.emplace_back(_tensor_builder->at(ofm_ind).get()->handle()); + + const auto frontend_layout = _current_subg_layout; + const auto backend_layout = ifm_alloc->layout(); + auto axis = node.param().axis; + if (axis < 0) + axis += ifm_rank; + axis = acl_common::ToARMComputeAxis(ifm_rank, axis, frontend_layout, backend_layout).value(); + + auto fn = nnfw::cpp14::make_unique<::arm_compute::NESplit>(); + + fn->configure(ifm_alloc->handle(), output_allocs, axis); + + _execution_builder->append(asAclFunction(std::move(fn))); +} + +void KernelGenerator::visit(const ir::operation::SQRT &node) +{ + const auto output_index{node.getOutputs().at(0)}; + const auto input_index{node.getInputs().at(ir::operation::SQRT::Input::INPUT)}; + + auto output_alloc = _tensor_builder->at(output_index).get(); + auto input_alloc = _tensor_builder->at(input_index).get(); + + const ::arm_compute::ActivationLayerInfo act_info{ + ::arm_compute::ActivationLayerInfo::ActivationFunction::SQRT}; + + auto fn = nnfw::cpp14::make_unique<::arm_compute::NEActivationLayer>(); + + fn->configure(input_alloc->handle(), output_alloc->handle(), act_info); + + auto acl_fn = asAclFunction(std::move(fn)); + + 
_execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const ir::operation::SquaredDifference &node) +{ + const auto ofm_index{node.getOutputs().at(0)}; + const auto lhs_index{node.getInputs().at(ir::operation::SquaredDifference::Input::LHS)}; + const auto rhs_index{node.getInputs().at(ir::operation::SquaredDifference::Input::RHS)}; + + auto ofm_alloc = _tensor_builder->at(ofm_index).get(); + auto lhs_alloc = _tensor_builder->at(lhs_index).get(); + auto rhs_alloc = _tensor_builder->at(rhs_index).get(); + + auto fn = nnfw::cpp14::make_unique<::arm_compute::NEElementwiseSquaredDiff>(); + + fn->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle()); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const ir::operation::Sub &node) +{ + const auto ofm_index{node.getOutputs().at(0)}; + const auto lhs_index{node.getInputs().at(ir::operation::Sub::Input::LHS)}; + const auto rhs_index{node.getInputs().at(ir::operation::Sub::Input::RHS)}; + + const auto activation = node.param().activation; + + auto ofm_alloc = _tensor_builder->at(ofm_index).get(); + auto lhs_alloc = _tensor_builder->at(lhs_index).get(); + auto rhs_alloc = _tensor_builder->at(rhs_index).get(); + + auto fn = nnfw::cpp14::make_unique<::arm_compute::NEArithmeticSubtraction>(); + + fn->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle(), + arm_compute::ConvertPolicy::SATURATE); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); + + ActivationBuilder{*_execution_builder}.append(activation, ofm_alloc->handle()); +} + +void KernelGenerator::visit(const ir::operation::Slice &node) +{ + const auto output_index{node.getOutputs().at(0)}; + const auto input_index{node.getInputs().at(ir::operation::Slice::Input::INPUT)}; + const auto begins_index{node.getInputs().at(ir::operation::Slice::Input::BEGINS)}; + const auto sizes_index{node.getInputs().at(ir::operation::Slice::Input::SIZES)}; + + auto outputData_alloc = _tensor_builder->at(output_index).get(); + auto inputData_alloc = _tensor_builder->at(input_index).get(); + const auto frontend_layout = _current_subg_layout; + const auto backend_layout = inputData_alloc->layout(); + + // Set initializers for indices data such as order of inputData + int input_rank = node.param().rank; + std::vector<int32_t> starts; + std::vector<int32_t> ends; + starts.resize(input_rank, 0); + ends.resize(input_rank, 0); + { + auto beginData_base = _ctx.at(begins_index).data().base(); + auto sizeData_base = _ctx.at(sizes_index).data().base(); + const int beginData_size = _ctx.at(begins_index).shape().num_elements(); + const int sizeData_size = _ctx.at(sizes_index).shape().num_elements(); + + using ir::DataType; + + UNUSED_RELEASE(beginData_size); + UNUSED_RELEASE(sizeData_size); + + assert(_ctx.at(begins_index).typeInfo().type() == DataType::INT32); + assert(_ctx.at(sizes_index).typeInfo().type() == DataType::INT32); + assert(beginData_size == input_rank); + assert(sizeData_size == input_rank); + + assert(beginData_base != nullptr); + for (int n = 0; n < input_rank; ++n) + { + auto axis = ::neurun::backend::acl_common::ToARMComputeAxis(input_rank, n, frontend_layout, + backend_layout) + .value(); + + int32_t begin_value = *(reinterpret_cast<const int32_t *>(beginData_base) + n); + starts[axis] = begin_value; + + int32_t size_value = *(reinterpret_cast<const int32_t *>(sizeData_base) + n); + ends[axis] = begin_value + 
size_value; + } + } + + ::arm_compute::Coordinates starts_set; + ::arm_compute::Coordinates ends_set; + + for (size_t i = 0; i < starts.size(); ++i) + { + starts_set.set(i, starts[i]); + ends_set.set(i, ends[i]); + } + + auto fn = nnfw::cpp14::make_unique<::arm_compute::NESlice>(); + + fn->configure(inputData_alloc->handle(), outputData_alloc->handle(), starts_set, ends_set); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const ir::operation::StridedSlice &node) +{ + const auto output_index{node.getOutputs().at(0)}; + const auto input_index{node.getInputs().at(ir::operation::StridedSlice::Input::INPUT)}; + const auto starts_index{node.getInputs().at(ir::operation::StridedSlice::Input::STARTS)}; + const auto ends_index{node.getInputs().at(ir::operation::StridedSlice::Input::ENDS)}; + const auto strides_index{node.getInputs().at(ir::operation::StridedSlice::Input::STRIDES)}; + + auto outputData_alloc = _tensor_builder->at(output_index).get(); + auto inputData_alloc = _tensor_builder->at(input_index).get(); + const auto frontend_layout = _current_subg_layout; + const auto backend_layout = inputData_alloc->layout(); + + // Set initializers for indices data such as order of inputData + int input_rank = node.param().rank; + std::vector<int32_t> starts; + std::vector<int32_t> ends; + std::vector<int32_t> strides; + starts.resize(input_rank, 0); + ends.resize(input_rank, 0); + strides.resize(input_rank, 0); + { + auto startData_base = _ctx.at(starts_index).data().base(); + auto endData_base = _ctx.at(ends_index).data().base(); + auto stridesData_base = _ctx.at(strides_index).data().base(); + const int startData_size = _ctx.at(starts_index).shape().num_elements(); + const int endData_size = _ctx.at(ends_index).shape().num_elements(); + const int stridesData_size = _ctx.at(strides_index).shape().num_elements(); + + using ir::DataType; + + UNUSED_RELEASE(startData_size); + UNUSED_RELEASE(endData_size); + UNUSED_RELEASE(stridesData_size); + + assert(_ctx.at(starts_index).typeInfo().type() == DataType::INT32); + assert(_ctx.at(ends_index).typeInfo().type() == DataType::INT32); + assert(_ctx.at(strides_index).typeInfo().type() == DataType::INT32); + assert(startData_size == input_rank); + assert(endData_size == input_rank); + assert(stridesData_size == input_rank); + + assert(startData_base != nullptr); + for (int n = 0; n < input_rank; ++n) + { + auto axis = ::neurun::backend::acl_common::ToARMComputeAxis(input_rank, n, frontend_layout, + backend_layout) + .value(); + + int32_t start_value = *(reinterpret_cast<const int32_t *>(startData_base) + n); + starts[axis] = start_value; + + int32_t end_value = *(reinterpret_cast<const int32_t *>(endData_base) + n); + ends[axis] = end_value; + + int32_t strides_value = *(reinterpret_cast<const int32_t *>(stridesData_base) + n); + strides[axis] = strides_value; + } + } + + // Set mask bits such as order of inputData + // FIXME Take the layouts into account. 
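+  // ReorderBits mirrors, for the per-axis mask bits, the axis reordering applied to
+  // starts/ends/strides above (reversing bit positions over the input rank), so each bit keeps
+  // referring to the dimension it was originally set for; per the FIXME, the frontend/backend
+  // layout difference is not yet folded into this reordering.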
+ const auto begin_mask = acl_common::ReorderBits<int32_t>(node.param().begin_mask, input_rank); + const auto end_mask = acl_common::ReorderBits<int32_t>(node.param().end_mask, input_rank); + const auto shrink_axis_mask = + acl_common::ReorderBits<int32_t>(node.param().shrink_axis_mask, input_rank); + + ::arm_compute::Coordinates starts_set; + ::arm_compute::Coordinates ends_set; + ::arm_compute::BiStrides strides_set; + + for (size_t i = 0; i < starts.size(); ++i) + { + starts_set.set(i, starts[i]); + ends_set.set(i, ends[i]); + strides_set.set(i, strides[i]); + } + + auto fn = nnfw::cpp14::make_unique<::arm_compute::NEStridedSlice>(); + + fn->configure(inputData_alloc->handle(), outputData_alloc->handle(), starts_set, ends_set, + strides_set, begin_mask, end_mask, shrink_axis_mask); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const ir::operation::TransposeConv &node) +{ + const auto ofm_index{node.getOutputs().at(0)}; + const auto output_shape_index{ + node.getInputs().at(ir::operation::TransposeConv::Input::OUTPUT_SHAPE)}; + const auto ker_index{node.getInputs().at(ir::operation::TransposeConv::Input::KERNEL)}; + const auto ifm_index{node.getInputs().at(ir::operation::TransposeConv::Input::INPUT)}; + + const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature(_current_subg_layout); + const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature(_current_subg_layout); + const auto ker_shape = _ctx.at(ker_index).shape().asFeature(_current_subg_layout); + + const auto stride = node.param().stride; + + assert((node.param().padding.type == ir::PaddingType::SAME) || + (node.param().padding.type == ir::PaddingType::VALID)); + auto padding = neurun::util::calculatePadding(node.param().padding, ofm_shape, ifm_shape, stride, + ker_shape.W, ker_shape.H); + + uint32_t invalid_horizontal = 0; + uint32_t invalid_vertical = 0; + if (node.param().padding.type == ir::PaddingType::VALID) + { + invalid_horizontal = + ofm_shape.W - (1 + (ifm_shape.W - 1) * stride.horizontal) - (ker_shape.W - 1); + invalid_vertical = ofm_shape.H - (1 + (ifm_shape.H - 1) * stride.vertical) - (ker_shape.H - 1); + } + + auto ofm_alloc = _tensor_builder->at(ofm_index).get(); + auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + auto ker_alloc = _tensor_builder->at(ker_index).get(); + + const auto tconv_info = acl_common::asPadStrideInfo(padding, stride); + + auto fn = nnfw::cpp14::make_unique<::arm_compute::NETransposeConvLayer>(); + + fn->configure(ifm_alloc->handle(), ker_alloc->handle(), nullptr, ofm_alloc->handle(), tconv_info, + invalid_horizontal, invalid_vertical); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const ir::operation::Transpose &node) +{ + const auto ofm_idx{node.getOutputs().at(0)}; + const auto ifm_idx{node.getInputs().at(ir::operation::Transpose::Input::INPUT)}; + const auto &perm{node.param().perm}; + + auto ofm_alloc = _tensor_builder->at(ofm_idx).get(); + const auto ifm_alloc = _tensor_builder->at(ifm_idx).get(); + const auto frontend_layout = _current_subg_layout; + const auto backend_layout = ifm_alloc->layout(); + + const auto rank = node.param().rank; + std::vector<std::int32_t> pv(perm.cbegin(), perm.cend()); + auto backend_pv = ::neurun::backend::acl_common::getARMComputePermutationVector( + rank, pv, frontend_layout, backend_layout); + + std::unique_ptr<::arm_compute::IFunction> fn; + + if (ifm_alloc->num_dimensions() 
<= 2 && ofm_alloc->num_dimensions() <= 2) + { + auto l = nnfw::cpp14::make_unique<::arm_compute::NETranspose>(); + + l->configure(ifm_alloc->handle(), ofm_alloc->handle()); + + fn = std::move(l); + } + else + { + auto l = nnfw::cpp14::make_unique<::arm_compute::NEPermute>(); + + l->configure(ifm_alloc->handle(), ofm_alloc->handle(), backend_pv); + + fn = std::move(l); + } + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const ir::operation::Unpack &node) +{ + const auto input_index{node.getInputs().at(ir::operation::Unpack::Input::INPUT)}; + auto axis{node.param().axis}; + + const auto input_rank = node.param().rank; + + std::vector<ir::OperandIndex> output_indexes; + for (const auto &output_index : node.getOutputs()) + output_indexes.emplace_back(output_index); + + auto input = _tensor_builder->at(input_index).get()->handle(); + std::vector<arm_compute::ITensor *> outputs; + for (const auto &output_index : output_indexes) + outputs.emplace_back(_tensor_builder->at(output_index)->handle()); + + const auto frontend_layout = _current_subg_layout; + const auto backend_layout = _tensor_builder->at(input_index).get()->layout(); + if (axis < 0) + axis += input_rank; + axis = acl_common::ToARMComputeAxis(input_rank, axis, frontend_layout, backend_layout).value(); + + auto fn = nnfw::cpp14::make_unique<::arm_compute::NEUnstack>(); + + fn->configure(input, outputs, axis); + + _execution_builder->append(asAclFunction(std::move(fn))); +} + +void KernelGenerator::visit(const ir::operation::Add &node) +{ + const auto ofm_index{node.getOutputs().at(0)}; + const auto lhs_index{node.getInputs().at(ir::operation::Add::Input::LHS)}; + const auto rhs_index{node.getInputs().at(ir::operation::Add::Input::RHS)}; + + const auto activation = node.param().activation; + + auto ofm_alloc = _tensor_builder->at(ofm_index).get(); + auto lhs_alloc = _tensor_builder->at(lhs_index).get(); + auto rhs_alloc = _tensor_builder->at(rhs_index).get(); + + auto fn = nnfw::cpp14::make_unique<::arm_compute::NEArithmeticAddition>(); + + fn->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle(), + arm_compute::ConvertPolicy::SATURATE); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); + + ActivationBuilder{*_execution_builder}.append(activation, ofm_alloc->handle()); +} + +void KernelGenerator::visit(const ir::operation::Div &node) +{ + const auto ofm_index{node.getOutputs().at(0)}; + const auto lhs_index{node.getInputs().at(ir::operation::Div::Input::LHS)}; + const auto rhs_index{node.getInputs().at(ir::operation::Div::Input::RHS)}; + + const auto activation = node.param().activation; + + auto ofm_alloc = _tensor_builder->at(ofm_index).get(); + auto lhs_alloc = _tensor_builder->at(lhs_index).get(); + auto rhs_alloc = _tensor_builder->at(rhs_index).get(); + + auto fn = nnfw::cpp14::make_unique<::arm_compute::NEElementwiseDivision>(); + + fn->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle()); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); + + ActivationBuilder{*_execution_builder}.append(activation, ofm_alloc->handle()); +} + +void KernelGenerator::visit(const ir::operation::Exp &node) +{ + const auto output_index{node.getOutputs().at(0)}; + const auto input_index{node.getInputs().at(ir::operation::Exp::Input::INPUT)}; + + auto output_alloc = _tensor_builder->at(output_index).get(); + auto input_alloc = 
_tensor_builder->at(input_index).get(); + + auto fn = nnfw::cpp14::make_unique<::arm_compute::NEExpLayer>(); + + fn->configure(input_alloc->handle(), output_alloc->handle()); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const ir::operation::Comparison &node) +{ + const auto output_index{node.getOutputs().at(0)}; + const auto input0_index{node.getInputs().at(ir::operation::Comparison::Input::INPUT0)}; + const auto input1_index{node.getInputs().at(ir::operation::Comparison::Input::INPUT1)}; + + const auto comparison_type = node.param().comparison_type; + + auto output_alloc = _tensor_builder->at(output_index).get(); + auto input0_alloc = _tensor_builder->at(input0_index).get(); + auto input1_alloc = _tensor_builder->at(input1_index).get(); + + auto fn = nnfw::cpp14::make_unique<::arm_compute::NEElementwiseComparison>(); + + fn->configure(input0_alloc->handle(), input1_alloc->handle(), output_alloc->handle(), + (arm_compute::ComparisonOperation)comparison_type); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const ir::operation::Min &node) +{ + const auto ofm_index{node.getOutputs().at(0)}; + const auto lhs_index{node.getInputs().at(ir::operation::Min::Input::LHS)}; + const auto rhs_index{node.getInputs().at(ir::operation::Min::Input::RHS)}; + + auto ofm_alloc = _tensor_builder->at(ofm_index).get(); + auto lhs_alloc = _tensor_builder->at(lhs_index).get(); + auto rhs_alloc = _tensor_builder->at(rhs_index).get(); + + auto fn = nnfw::cpp14::make_unique<::arm_compute::NEElementwiseMin>(); + + fn->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle()); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const ir::operation::Max &node) +{ + const auto ofm_index{node.getOutputs().at(0)}; + const auto lhs_index{node.getInputs().at(ir::operation::Max::Input::LHS)}; + const auto rhs_index{node.getInputs().at(ir::operation::Max::Input::RHS)}; + + auto ofm_alloc = _tensor_builder->at(ofm_index).get(); + auto lhs_alloc = _tensor_builder->at(lhs_index).get(); + auto rhs_alloc = _tensor_builder->at(rhs_index).get(); + + auto fn = nnfw::cpp14::make_unique<::arm_compute::NEElementwiseMax>(); + + fn->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle()); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +} // namespace acl_neon +} // namespace backend +} // namespace neurun diff --git a/runtime/neurun/backend/acl_neon/KernelGenerator.h b/runtime/neurun/backend/acl_neon/KernelGenerator.h new file mode 100644 index 000000000..f041fb725 --- /dev/null +++ b/runtime/neurun/backend/acl_neon/KernelGenerator.h @@ -0,0 +1,111 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __NEURUN_BACKEND_ACL_NEON_KERNEL_GENERATOR_H__ +#define __NEURUN_BACKEND_ACL_NEON_KERNEL_GENERATOR_H__ + +#include <backend/IKernelGenerator.h> + +#include "ir/Operands.h" +#include "TensorBuilder.h" + +namespace neurun +{ +namespace backend +{ +namespace acl_neon +{ + +class KernelGenerator : public IKernelGenerator +{ +public: + KernelGenerator(const ir::Operands &ctx, const std::shared_ptr<TensorBuilder> &tensor_builder); + + void visit(const ir::OpSequence &) override; + void visit(const ir::operation::Abs &) override; + void visit(const ir::operation::ArgMax &) override; + void visit(const ir::operation::BatchToSpaceND &) override; + void visit(const ir::operation::Cast &) override; + void visit(const ir::operation::Conv2D &) override; + void visit(const ir::operation::DepthToSpace &) override; + void visit(const ir::operation::DepthwiseConv2D &) override; + void visit(const ir::operation::Dequantize &) override; + void visit(const ir::operation::MaxPool2D &) override; + void visit(const ir::operation::Mean &) override; + void visit(const ir::operation::AvgPool2D &) override; + void visit(const ir::operation::Concat &) override; + void visit(const ir::operation::EmbeddingLookup &) override; + void visit(const ir::operation::Floor &) override; + void visit(const ir::operation::FullyConnected &) override; + void visit(const ir::operation::Gather &) override; + void visit(const ir::operation::HashtableLookup &) override; + void visit(const ir::operation::InstanceNorm &) override; + void visit(const ir::operation::L2Normalization &) override; + void visit(const ir::operation::L2Pool2D &) override; + void visit(const ir::operation::LocalResponseNormalization &) override; + void visit(const ir::operation::LogicalAnd &) override; + void visit(const ir::operation::LogicalNot &) override; + void visit(const ir::operation::LogicalOr &) override; + void visit(const ir::operation::Logistic &) override; + void visit(const ir::operation::LSTM &) override; + void visit(const ir::operation::Mul &) override; + void visit(const ir::operation::Neg &) override; + void visit(const ir::operation::Pack &) override; + void visit(const ir::operation::Pad &) override; + void visit(const ir::operation::Permute &) override; + void visit(const ir::operation::PReLU &) override; + void visit(const ir::operation::ReduceMax &) override; + void visit(const ir::operation::ReduceMin &) override; + void visit(const ir::operation::ReduceSum &) override; + void visit(const ir::operation::ReLU &) override; + void visit(const ir::operation::ReLU1 &) override; + void visit(const ir::operation::ReLU6 &) override; + void visit(const ir::operation::Reshape &) override; + void visit(const ir::operation::ResizeBilinear &) override; + void visit(const ir::operation::RNN &) override; + void visit(const ir::operation::RSQRT &) override; + void visit(const ir::operation::Squeeze &) override; + void visit(const ir::operation::Tanh &) override; + void visit(const ir::operation::Softmax &) override; + void visit(const ir::operation::SpaceToBatchND &) override; + void visit(const ir::operation::SpaceToDepth &) override; + void visit(const ir::operation::Split &) override; + void visit(const ir::operation::SQRT &) override; + void visit(const ir::operation::SquaredDifference &) override; + void visit(const ir::operation::Sub &) override; + void visit(const ir::operation::Slice &) override; + void visit(const ir::operation::StridedSlice &) override; + void visit(const ir::operation::TransposeConv &) override; + void 
visit(const ir::operation::Transpose &) override; + void visit(const ir::operation::Unpack &) override; + void visit(const ir::operation::Add &) override; + void visit(const ir::operation::Div &) override; + void visit(const ir::operation::Exp &) override; + void visit(const ir::operation::Comparison &) override; + void visit(const ir::operation::Min &) override; + void visit(const ir::operation::Max &) override; + +private: + const ir::Operands &_ctx; + std::shared_ptr<TensorBuilder> _tensor_builder; + ir::Layout _current_subg_layout; +}; + +} // namespace acl_neon +} // namespace backend +} // namespace neurun + +#endif // __NEURUN_BACKEND_ACL_NEON_KERNEL_GENERATOR_H__ diff --git a/runtime/neurun/backend/acl_neon/PluginClassesAllocator.cc b/runtime/neurun/backend/acl_neon/PluginClassesAllocator.cc new file mode 100644 index 000000000..75f2e9797 --- /dev/null +++ b/runtime/neurun/backend/acl_neon/PluginClassesAllocator.cc @@ -0,0 +1,33 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include <util/logging.h> + +#include "Backend.h" + +extern "C" { +neurun::backend::Backend *neurun_backend_create() +{ + VERBOSE(neurun_backend_create) << "'acl_neon' loaded\n"; + return new neurun::backend::acl_neon::Backend; +} + +void neurun_backend_destroy(neurun::backend::Backend *backend) +{ + VERBOSE(neurun_backend_create) << "'acl_neon' unloaded\n"; + delete backend; +} +} diff --git a/runtime/neurun/backend/acl_neon/ShapeFixer.cc b/runtime/neurun/backend/acl_neon/ShapeFixer.cc new file mode 100644 index 000000000..1d80e57e9 --- /dev/null +++ b/runtime/neurun/backend/acl_neon/ShapeFixer.cc @@ -0,0 +1,439 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "ShapeFixer.h" + +#include <arm_compute/runtime/NEON/functions/NESoftmaxLayer.h> +#include <arm_compute/runtime/NEON/functions/NEArithmeticAddition.h> +#include <arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h> +#include <arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h> +#include <arm_compute/runtime/NEON/functions/NEPoolingLayer.h> +#include <arm_compute/runtime/NEON/functions/NEActivationLayer.h> +#include <arm_compute/runtime/NEON/functions/NEConvolutionLayer.h> +#include <arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h> +#include <arm_compute/runtime/NEON/functions/NEReshapeLayer.h> +#include <arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h> +#include <arm_compute/runtime/NEON/functions/NEFullyConnectedReshapingLayer.h> + +#include <Convert.h> +#include <Swizzle.h> + +#include "util/Padding.h" +#include "ir/Index.h" +#include "compiler/IExecutionBuilder.h" +#include "exec/NopFunction.h" +#include "util/logging.h" +#include "util/Utils.h" + +using ::neurun::compiler::IExecutionBuilder; + +namespace neurun +{ +namespace backend +{ +namespace acl_neon +{ + +using ::neurun::backend::acl_common::asAclFunction; + +ShapeFixer::ShapeFixer(const ir::Operands &ctx, + const std::shared_ptr<TensorBuilder> &tensor_builder) + : _ctx(ctx), _tensor_builder(tensor_builder) +{ + assert(tensor_builder); +} + +void ShapeFixer::visit(const ir::operation::Abs &) { /* DO NOTHING */} + +void ShapeFixer::visit(const ir::operation::ArgMax &) { /* DO NOTHING */} + +void ShapeFixer::visit(const ir::operation::BatchToSpaceND &node) +{ + const auto ofm_index{node.getOutputs().at(0)}; + const auto ifm_index{node.getInputs().at(ir::operation::BatchToSpaceND::Input::INPUT)}; + _tensor_builder->dimCorrection(ofm_index, false); + _tensor_builder->dimCorrection(ifm_index, false); +} + +void ShapeFixer::visit(const ir::operation::Cast &) { /* DO NOTHING */} + +void ShapeFixer::visit(const ir::operation::Conv2D &) { /* DO NOTHING */} + +void ShapeFixer::visit(const ir::operation::DepthToSpace &) { /* DO NOTHING */} + +void ShapeFixer::visit(const ir::operation::DepthwiseConv2D &) { /* DO NOTHING */} + +void ShapeFixer::visit(const ir::operation::Dequantize &) { /* DO NOTHING */} + +void ShapeFixer::visit(const ir::operation::MaxPool2D &) { /* DO NOTHING */} + +void ShapeFixer::visit(const ir::operation::Mean &) { /* DO NOTHING */} + +void ShapeFixer::visit(const ir::operation::AvgPool2D &) { /* DO NOTHING */} + +void ShapeFixer::visit(const ir::operation::Concat &node) +{ + const auto ofm_index{node.getOutputs().at(0)}; + _tensor_builder->dimCorrection(ofm_index, false); + for (const auto &inputs : node.getInputs()) + _tensor_builder->dimCorrection(inputs, false); +} + +void ShapeFixer::visit(const ir::operation::EmbeddingLookup &node) +{ + const auto output_index{node.getOutputs().at(0)}; + const auto values_index{node.getInputs().at(ir::operation::EmbeddingLookup::Input::VALUES)}; + _tensor_builder->dimCorrection(values_index, false); + _tensor_builder->dimCorrection(output_index, false); +} + +void ShapeFixer::visit(const ir::operation::Exp &) { /* DO NOTHING */} + +void ShapeFixer::visit(const ir::operation::Floor &) { /* DO NOTHING */} + +void ShapeFixer::visit(const ir::operation::FullyConnected &node) +{ + using ir::operation::FullyConnected; + const auto input_index{node.getInputs().at(FullyConnected::Input::INPUT)}; + const auto input_rank = _ctx.at(input_index).shape().rank(); + // Check for reshaping input's shape into rank-2 + if 
(input_rank == 3 || input_rank == 4) + _tensor_builder->dimCorrection(input_index, false); +} + +void ShapeFixer::visit(const ir::operation::HashtableLookup &node) +{ + const auto output_index{node.getOutputs().at(0)}; + const auto values_index{node.getInputs().at(ir::operation::EmbeddingLookup::Input::VALUES)}; + _tensor_builder->dimCorrection(values_index, false); + _tensor_builder->dimCorrection(output_index, false); +} + +void ShapeFixer::visit(const ir::operation::Gather &node) +{ + const auto ofm_index{node.getOutputs().at(0)}; + const auto ifm_index{node.getInputs().at(ir::operation::Gather::Input::INPUT)}; + const auto indices_index{node.getInputs().at(ir::operation::Gather::Input::INDICES)}; + _tensor_builder->dimCorrection(ofm_index, false); + _tensor_builder->dimCorrection(ifm_index, false); + _tensor_builder->dimCorrection(indices_index, false); +} + +void ShapeFixer::visit(const ir::operation::InstanceNorm &) { /* DO NOTHING */} + +void ShapeFixer::visit(const ir::operation::L2Normalization &) { /* DO NOTHING */} + +void ShapeFixer::visit(const ir::operation::L2Pool2D &) { /* DO NOTHING */} + +void ShapeFixer::visit(const ir::operation::LocalResponseNormalization &) { /* DO NOTHING */} + +void ShapeFixer::visit(const ir::operation::LogicalAnd &node) +{ + const auto input0_index{node.getInputs().at(ir::operation::LogicalAnd::Input::INPUT0)}; + const auto input1_index{node.getInputs().at(ir::operation::LogicalAnd::Input::INPUT1)}; + + if (!(_ctx.at(input0_index).shape() == _ctx.at(input1_index).shape())) + { + const auto broadcast_rank = + std::max(_ctx.at(input0_index).shape().rank(), _ctx.at(input1_index).shape().rank()); + + // TODO remove const_cast later. For example, _ctx may need to be a non const variable or + // a node to extend shape may be inserted in front of this operation + const_cast<ir::Shape &>(_ctx.at(input0_index).shape()).extendRank(broadcast_rank); + const_cast<ir::Shape &>(_ctx.at(input1_index).shape()).extendRank(broadcast_rank); + } +} + +void ShapeFixer::visit(const ir::operation::LogicalNot &) { /* DO NOTHING */} + +void ShapeFixer::visit(const ir::operation::LogicalOr &node) +{ + const auto input0_index{node.getInputs().at(ir::operation::LogicalOr::Input::INPUT0)}; + const auto input1_index{node.getInputs().at(ir::operation::LogicalOr::Input::INPUT1)}; + + if (!(_ctx.at(input0_index).shape() == _ctx.at(input1_index).shape())) + { + const auto broadcast_rank = + std::max(_ctx.at(input0_index).shape().rank(), _ctx.at(input1_index).shape().rank()); + + // TODO remove const_cast later. For example, _ctx may need to be a non const variable or + // a node to extend shape may be inserted in front of this operation + const_cast<ir::Shape &>(_ctx.at(input0_index).shape()).extendRank(broadcast_rank); + const_cast<ir::Shape &>(_ctx.at(input1_index).shape()).extendRank(broadcast_rank); + } +} + +void ShapeFixer::visit(const ir::operation::Logistic &) { /* DO NOTHING */} + +void ShapeFixer::visit(const ir::operation::LSTM &) { /* DO NOTHING */} + +void ShapeFixer::visit(const ir::operation::Pack &node) +{ + const auto ofm_index{node.getOutputs().at(0)}; + _tensor_builder->dimCorrection(ofm_index, false); + for (const auto &inputs : node.getInputs()) + { + _tensor_builder->dimCorrection(inputs, false); + const auto ofm_rank = _ctx.at(ofm_index).shape().rank(); + + // TODO remove const_cast later. 
For example, _ctx may need to be a non const variable or + // a node to extend shape may be inserted in front of this operation + const_cast<ir::Shape &>(_ctx.at(inputs).shape()).extendRank(ofm_rank); + } +} + +void ShapeFixer::visit(const ir::operation::Pad &node) +{ + const auto input_index{node.getInputs().at(ir::operation::Pad::Input::INPUT)}; + const auto output_index{node.getOutputs().at(0)}; + _tensor_builder->dimCorrection(input_index, false); + _tensor_builder->dimCorrection(output_index, false); +} + +void ShapeFixer::visit(const ir::operation::Mul &node) +{ + const auto lhs_index{node.getInputs().at(ir::operation::Mul::Input::LHS)}; + const auto rhs_index{node.getInputs().at(ir::operation::Mul::Input::RHS)}; + + if (!(_ctx.at(lhs_index).shape() == _ctx.at(rhs_index).shape())) + { + const auto broadcast_rank = + std::max(_ctx.at(lhs_index).shape().rank(), _ctx.at(rhs_index).shape().rank()); + + // TODO remove const_cast later. For example, _ctx may need to be a non const variable or + // a node to extend shape may be inserted in front of this operation + const_cast<ir::Shape &>(_ctx.at(lhs_index).shape()).extendRank(broadcast_rank); + const_cast<ir::Shape &>(_ctx.at(rhs_index).shape()).extendRank(broadcast_rank); + } +} + +void ShapeFixer::visit(const ir::operation::Neg &) { /* DO NOTHING */} + +void ShapeFixer::visit(const ir::operation::Permute &) { /* DO NOTHING */} + +void ShapeFixer::visit(const ir::operation::PReLU &node) +{ + const auto ifm_index{node.getInputs().at(ir::operation::PReLU::Input::INPUT)}; + const auto alpha_index{node.getInputs().at(ir::operation::PReLU::Input::ALPHA)}; + + if (!(_ctx.at(ifm_index).shape() == _ctx.at(alpha_index).shape())) + { + const auto broadcast_rank = + std::max(_ctx.at(ifm_index).shape().rank(), _ctx.at(alpha_index).shape().rank()); + const_cast<ir::Shape &>(_ctx.at(ifm_index).shape()).extendRank(broadcast_rank); + const_cast<ir::Shape &>(_ctx.at(alpha_index).shape()).extendRank(broadcast_rank); + } +} + +void ShapeFixer::visit(const ir::operation::ReduceMax &) { /* DO NOTHING */} + +void ShapeFixer::visit(const ir::operation::ReduceMin &) { /* DO NOTHING */} + +void ShapeFixer::visit(const ir::operation::ReduceSum &) { /* DO NOTHING */} + +void ShapeFixer::visit(const ir::operation::ReLU &) { /* DO NOTHING */} + +void ShapeFixer::visit(const ir::operation::ReLU1 &) { /* DO NOTHING */} + +void ShapeFixer::visit(const ir::operation::ReLU6 &) { /* DO NOTHING */} + +void ShapeFixer::visit(const ir::operation::Reshape &node) +{ + const auto output_index{node.getOutputs().at(0)}; + const auto input_index{node.getInputs().at(ir::operation::Reshape::Input::INPUT)}; + + _tensor_builder->dimCorrection(input_index, false); + _tensor_builder->dimCorrection(output_index, false); +} + +void ShapeFixer::visit(const ir::operation::ResizeBilinear &) { /* DO NOTHING */} + +void ShapeFixer::visit(const ir::operation::RNN &) { /* DO NOTHING */} + +void ShapeFixer::visit(const ir::operation::Comparison &node) +{ + const auto input0_index{node.getInputs().at(ir::operation::Comparison::Input::INPUT0)}; + const auto input1_index{node.getInputs().at(ir::operation::Comparison::Input::INPUT1)}; + + if (!(_ctx.at(input0_index).shape() == _ctx.at(input1_index).shape())) + { + const auto broadcast_rank = + std::max(_ctx.at(input0_index).shape().rank(), _ctx.at(input1_index).shape().rank()); + + // TODO remove const_cast later. 
For example, _ctx may need to be a non const variable or + // a node to extend shape may be inserted in front of this operation + const_cast<ir::Shape &>(_ctx.at(input0_index).shape()).extendRank(broadcast_rank); + const_cast<ir::Shape &>(_ctx.at(input1_index).shape()).extendRank(broadcast_rank); + } +} + +void ShapeFixer::visit(const ir::operation::RSQRT &) { /* DO NOTHING */} + +void ShapeFixer::visit(const ir::operation::Squeeze &node) +{ + const auto output_index{node.getOutputs().at(0)}; + const auto input_index{node.getInputs().at(ir::operation::Squeeze::Input::INPUT)}; + _tensor_builder->dimCorrection(input_index, false); + _tensor_builder->dimCorrection(output_index, false); +} + +void ShapeFixer::visit(const ir::operation::Tanh &) { /* DO NOTHING */} + +void ShapeFixer::visit(const ir::operation::Slice &) { /* DO NOTHING */} + +void ShapeFixer::visit(const ir::operation::StridedSlice &node) +{ + const auto ofm_index{node.getOutputs().at(0)}; + const auto ifm_index{node.getInputs().at(ir::operation::StridedSlice::Input::INPUT)}; + _tensor_builder->dimCorrection(ofm_index, false); + _tensor_builder->dimCorrection(ifm_index, false); +} + +void ShapeFixer::visit(const ir::operation::Softmax &) { /* DO NOTHING */} + +void ShapeFixer::visit(const ir::operation::SpaceToBatchND &node) +{ + const auto ofm_index{node.getOutputs().at(0)}; + const auto ifm_index{node.getInputs().at(ir::operation::SpaceToBatchND::Input::INPUT)}; + _tensor_builder->dimCorrection(ofm_index, false); + _tensor_builder->dimCorrection(ifm_index, false); +} + +void ShapeFixer::visit(const ir::operation::SpaceToDepth &) { /* DO NOTHING */} + +void ShapeFixer::visit(const ir::operation::Split &node) +{ + const auto input_index{node.getInputs().at(ir::operation::Split::Input::INPUT)}; + _tensor_builder->dimCorrection(input_index, false); + for (const auto &output : node.getOutputs()) + _tensor_builder->dimCorrection(output, false); +} + +void ShapeFixer::visit(const ir::operation::SQRT &) { /* DO NOTHING */} + +void ShapeFixer::visit(const ir::operation::SquaredDifference &node) +{ + const auto lhs_index{node.getInputs().at(ir::operation::SquaredDifference::Input::LHS)}; + const auto rhs_index{node.getInputs().at(ir::operation::SquaredDifference::Input::RHS)}; + + if (!(_ctx.at(lhs_index).shape() == _ctx.at(rhs_index).shape())) + { + const auto broadcast_rank = + std::max(_ctx.at(lhs_index).shape().rank(), _ctx.at(rhs_index).shape().rank()); + + // TODO remove const_cast later. For example, _ctx may need to be a non const variable or + // a node to extend shape may be inserted in front of this operation + const_cast<ir::Shape &>(_ctx.at(lhs_index).shape()).extendRank(broadcast_rank); + const_cast<ir::Shape &>(_ctx.at(rhs_index).shape()).extendRank(broadcast_rank); + } +} + +void ShapeFixer::visit(const ir::operation::Sub &node) +{ + const auto lhs_index{node.getInputs().at(ir::operation::Sub::Input::LHS)}; + const auto rhs_index{node.getInputs().at(ir::operation::Sub::Input::RHS)}; + + if (!(_ctx.at(lhs_index).shape() == _ctx.at(rhs_index).shape())) + { + const auto broadcast_rank = + std::max(_ctx.at(lhs_index).shape().rank(), _ctx.at(rhs_index).shape().rank()); + // TODO remove const_cast later. 
For example, _ctx may need to be a non const variable or + // a node to extend shape may be inserted in front of this operation + const_cast<ir::Shape &>(_ctx.at(lhs_index).shape()).extendRank(broadcast_rank); + const_cast<ir::Shape &>(_ctx.at(rhs_index).shape()).extendRank(broadcast_rank); + } +} + +void ShapeFixer::visit(const ir::operation::TransposeConv &) { /* DO NOTHING */} + +void ShapeFixer::visit(const ir::operation::Transpose &) { /* DO NOTHING */} + +void ShapeFixer::visit(const ir::operation::Unpack &node) +{ + const auto input_index{node.getInputs().at(ir::operation::Unpack::Input::INPUT)}; + _tensor_builder->dimCorrection(input_index, false); + for (const auto &output_index : node.getOutputs()) + _tensor_builder->dimCorrection(output_index, false); +} + +void ShapeFixer::visit(const ir::operation::Add &node) +{ + const auto lhs_index{node.getInputs().at(ir::operation::Add::Input::LHS)}; + const auto rhs_index{node.getInputs().at(ir::operation::Add::Input::RHS)}; + + if (!(_ctx.at(lhs_index).shape() == _ctx.at(rhs_index).shape())) + { + const auto broadcast_rank = + std::max(_ctx.at(lhs_index).shape().rank(), _ctx.at(rhs_index).shape().rank()); + const_cast<ir::Shape &>(_ctx.at(lhs_index).shape()).extendRank(broadcast_rank); + const_cast<ir::Shape &>(_ctx.at(rhs_index).shape()).extendRank(broadcast_rank); + } +} + +void ShapeFixer::visit(const ir::operation::Div &node) +{ + const auto lhs_index{node.getInputs().at(ir::operation::Div::Input::LHS)}; + const auto rhs_index{node.getInputs().at(ir::operation::Div::Input::RHS)}; + + if (!(_ctx.at(lhs_index).shape() == _ctx.at(rhs_index).shape())) + { + const auto broadcast_rank = + std::max(_ctx.at(lhs_index).shape().rank(), _ctx.at(rhs_index).shape().rank()); + + // TODO remove const_cast later. For example, _ctx may need to be a non const variable or + // a node to extend shape may be inserted in front of this operation + const_cast<ir::Shape &>(_ctx.at(lhs_index).shape()).extendRank(broadcast_rank); + const_cast<ir::Shape &>(_ctx.at(rhs_index).shape()).extendRank(broadcast_rank); + } +} + +void ShapeFixer::visit(const ir::operation::Min &node) +{ + const auto lhs_index{node.getInputs().at(ir::operation::Min::Input::LHS)}; + const auto rhs_index{node.getInputs().at(ir::operation::Min::Input::RHS)}; + + if (!(_ctx.at(lhs_index).shape() == _ctx.at(rhs_index).shape())) + { + const auto broadcast_rank = + std::max(_ctx.at(lhs_index).shape().rank(), _ctx.at(rhs_index).shape().rank()); + + // TODO remove const_cast later. For example, _ctx may need to be a non const variable or + // a node to extend shape may be inserted in front of this operation + const_cast<ir::Shape &>(_ctx.at(lhs_index).shape()).extendRank(broadcast_rank); + const_cast<ir::Shape &>(_ctx.at(rhs_index).shape()).extendRank(broadcast_rank); + } +} + +void ShapeFixer::visit(const ir::operation::Max &node) +{ + const auto lhs_index{node.getInputs().at(ir::operation::Max::Input::LHS)}; + const auto rhs_index{node.getInputs().at(ir::operation::Max::Input::RHS)}; + + if (!(_ctx.at(lhs_index).shape() == _ctx.at(rhs_index).shape())) + { + const auto broadcast_rank = + std::max(_ctx.at(lhs_index).shape().rank(), _ctx.at(rhs_index).shape().rank()); + + // TODO remove const_cast later. 
For example, _ctx may need to be a non const variable or + // a node to extend shape may be inserted in front of this operation + const_cast<ir::Shape &>(_ctx.at(lhs_index).shape()).extendRank(broadcast_rank); + const_cast<ir::Shape &>(_ctx.at(rhs_index).shape()).extendRank(broadcast_rank); + } +} + +} // namespace acl_neon +} // namespace backend +} // namespace neurun diff --git a/runtime/neurun/backend/acl_neon/ShapeFixer.h b/runtime/neurun/backend/acl_neon/ShapeFixer.h new file mode 100644 index 000000000..aa1f8f75a --- /dev/null +++ b/runtime/neurun/backend/acl_neon/ShapeFixer.h @@ -0,0 +1,109 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __NEURUN_BACKEND_ACL_NEON_SHAPE_FIXER_H__ +#define __NEURUN_BACKEND_ACL_NEON_SHAPE_FIXER_H__ + +#include <backend/IShapeFixer.h> + +#include "ir/Operands.h" +#include "TensorBuilder.h" + +namespace neurun +{ +namespace backend +{ +namespace acl_neon +{ + +class ShapeFixer : public IShapeFixer +{ +public: + ShapeFixer(const ir::Operands &ctx, const std::shared_ptr<TensorBuilder> &tensor_builder); + + void visit(const ir::operation::Abs &) override; + void visit(const ir::operation::ArgMax &) override; + void visit(const ir::operation::BatchToSpaceND &) override; + void visit(const ir::operation::Cast &) override; + void visit(const ir::operation::Conv2D &) override; + void visit(const ir::operation::DepthToSpace &) override; + void visit(const ir::operation::DepthwiseConv2D &) override; + void visit(const ir::operation::Dequantize &) override; + void visit(const ir::operation::MaxPool2D &) override; + void visit(const ir::operation::Mean &) override; + void visit(const ir::operation::AvgPool2D &) override; + void visit(const ir::operation::Concat &) override; + void visit(const ir::operation::EmbeddingLookup &) override; + void visit(const ir::operation::Exp &) override; + void visit(const ir::operation::Floor &) override; + void visit(const ir::operation::FullyConnected &) override; + void visit(const ir::operation::Gather &) override; + void visit(const ir::operation::HashtableLookup &) override; + void visit(const ir::operation::InstanceNorm &) override; + void visit(const ir::operation::L2Normalization &) override; + void visit(const ir::operation::L2Pool2D &) override; + void visit(const ir::operation::LocalResponseNormalization &) override; + void visit(const ir::operation::LogicalAnd &) override; + void visit(const ir::operation::LogicalNot &) override; + void visit(const ir::operation::LogicalOr &) override; + void visit(const ir::operation::Logistic &) override; + void visit(const ir::operation::LSTM &) override; + void visit(const ir::operation::Mul &) override; + void visit(const ir::operation::Neg &) override; + void visit(const ir::operation::Pack &) override; + void visit(const ir::operation::Pad &) override; + void visit(const ir::operation::Permute &) override; + void visit(const ir::operation::PReLU &) override; + void visit(const 
ir::operation::ReduceMax &) override; + void visit(const ir::operation::ReduceMin &) override; + void visit(const ir::operation::ReduceSum &) override; + void visit(const ir::operation::ReLU &) override; + void visit(const ir::operation::ReLU1 &) override; + void visit(const ir::operation::ReLU6 &) override; + void visit(const ir::operation::Reshape &) override; + void visit(const ir::operation::ResizeBilinear &) override; + void visit(const ir::operation::RNN &) override; + void visit(const ir::operation::RSQRT &) override; + void visit(const ir::operation::Squeeze &) override; + void visit(const ir::operation::Tanh &) override; + void visit(const ir::operation::Softmax &) override; + void visit(const ir::operation::SpaceToBatchND &) override; + void visit(const ir::operation::SpaceToDepth &) override; + void visit(const ir::operation::Split &) override; + void visit(const ir::operation::SQRT &) override; + void visit(const ir::operation::SquaredDifference &) override; + void visit(const ir::operation::Sub &) override; + void visit(const ir::operation::Slice &) override; + void visit(const ir::operation::StridedSlice &) override; + void visit(const ir::operation::TransposeConv &) override; + void visit(const ir::operation::Transpose &) override; + void visit(const ir::operation::Unpack &) override; + void visit(const ir::operation::Add &) override; + void visit(const ir::operation::Div &) override; + void visit(const ir::operation::Comparison &) override; + void visit(const ir::operation::Min &) override; + void visit(const ir::operation::Max &) override; + +private: + const ir::Operands &_ctx; + std::shared_ptr<TensorBuilder> _tensor_builder; +}; + +} // namespace acl_neon +} // namespace backend +} // namespace neurun + +#endif // __NEURUN_BACKEND_ACL_NEON_SHAPE_FIXER_H__ diff --git a/runtime/neurun/backend/acl_neon/TensorBuilder.h b/runtime/neurun/backend/acl_neon/TensorBuilder.h new file mode 100644 index 000000000..0a6b4921d --- /dev/null +++ b/runtime/neurun/backend/acl_neon/TensorBuilder.h @@ -0,0 +1,39 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __NEURUN_BACKEND_ACL_NEON_TENSOR_BUILDER_H__ +#define __NEURUN_BACKEND_ACL_NEON_TENSOR_BUILDER_H__ + +#include <TemplTensorBuilder.h> + +#include "operand/NETensor.h" +#include "operand/NESubTensor.h" + +namespace neurun +{ +namespace backend +{ +namespace acl_neon +{ + +using TensorBuilder = + acl_common::TemplTensorBuilder<operand::INETensor, operand::NETensor, operand::NESubTensor>; + +} // namespace acl_neon +} // namespace backend +} // namespace neurun + +#endif // __NEURUN_BACKEND_ACL_NEON_TENSOR_BUILDER_H__ diff --git a/runtime/neurun/backend/acl_neon/TensorManager.h b/runtime/neurun/backend/acl_neon/TensorManager.h new file mode 100644 index 000000000..725275cef --- /dev/null +++ b/runtime/neurun/backend/acl_neon/TensorManager.h @@ -0,0 +1,78 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. 
All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __NEURUN_BACKEND_ACL_NEON_TENSOR_MANAGER_H__ +#define __NEURUN_BACKEND_ACL_NEON_TENSOR_MANAGER_H__ + +#include <arm_compute/runtime/Allocator.h> +#include <arm_compute/runtime/PoolManager.h> +#include <arm_compute/runtime/OffsetLifetimeManager.h> +#include <arm_compute/runtime/MemoryManagerOnDemand.h> +#include <arm_compute/runtime/MemoryGroup.h> + +#include <AclMemoryManager.h> +#include <AclLinearMemoryManager.h> +#include <AclInternalBufferManager.h> +#include <AclTensorManager.h> + +#include "operand/NETensor.h" +#include "operand/NESubTensor.h" + +#include "util/logging.h" + +namespace neurun +{ +namespace backend +{ +namespace acl_neon +{ + +using MemoryManager = + acl_common::AclMemoryManager<operand::INETensor, operand::NETensor, operand::NESubTensor>; + +using LinearMemoryManager = acl_common::AclLinearMemoryManager< + operand::INETensor, operand::NETensor, operand::NESubTensor, + ::arm_compute::MemoryManagerOnDemand, ::arm_compute::PoolManager, + ::arm_compute::OffsetLifetimeManager, ::arm_compute::Allocator, ::arm_compute::MemoryGroup>; + +using InternalBufferManager = acl_common::AclInternalBufferManager< + ::arm_compute::MemoryManagerOnDemand, ::arm_compute::PoolManager, + ::arm_compute::OffsetLifetimeManager, ::arm_compute::Allocator>; + +using TensorManager = acl_common::AclTensorManager<acl_neon::operand::INETensor, operand::NETensor, + operand::NESubTensor>; + +TensorManager *createTensorManager() +{ + const std::string executor_str = util::getConfigString(util::config::EXECUTOR); + if (executor_str == "Linear") + { + VERBOSE(acl_neon_createTensorManager) << "AclTensorManager as Linear" << std::endl; + return new TensorManager(new MemoryManager(), new LinearMemoryManager(), + new InternalBufferManager()); + } + else + { + VERBOSE(acl_neon_createTensorManager) << "AclTensorManager" << std::endl; + return new TensorManager(new MemoryManager(), new MemoryManager(), new InternalBufferManager()); + } +} + +} // namespace acl_neon +} // namespace backend +} // namespace neurun + +#endif // __NEURUN_BACKEND_ACL_NEON_TENSOR_MANAGER_H__ diff --git a/runtime/neurun/backend/acl_neon/TensorRegister.cc b/runtime/neurun/backend/acl_neon/TensorRegister.cc new file mode 100644 index 000000000..fe766cdf9 --- /dev/null +++ b/runtime/neurun/backend/acl_neon/TensorRegister.cc @@ -0,0 +1,30 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "TensorRegister.h"
+
+namespace neurun
+{
+namespace backend
+{
+namespace acl_neon
+{
+
+// NOTHING
+
+} // namespace acl_neon
+} // namespace backend
+} // namespace neurun
diff --git a/runtime/neurun/backend/acl_neon/TensorRegister.h b/runtime/neurun/backend/acl_neon/TensorRegister.h
new file mode 100644
index 000000000..115e05dee
--- /dev/null
+++ b/runtime/neurun/backend/acl_neon/TensorRegister.h
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NEURUN_BACKEND_ACL_NEON_TENSOR_REGISTER_H__
+#define __NEURUN_BACKEND_ACL_NEON_TENSOR_REGISTER_H__
+
+#include <AclTensorRegister.h>
+#include <misc/polymorphic_downcast.h>
+#include "TensorBuilder.h"
+
+namespace neurun
+{
+namespace backend
+{
+namespace acl_neon
+{
+
+class TensorRegister : public acl_common::AclTensorRegister
+{
+public:
+ TensorRegister(const ir::Operands &operands, const std::shared_ptr<TensorBuilder> &tensor_builder)
+ : acl_common::AclTensorRegister{operands, tensor_builder}
+ {
+ // DO NOTHING
+ }
+
+ void setUsesCount(const ir::OperandIndex &ind, size_t num_uses) const override
+ {
+ nnfw::misc::polymorphic_downcast<TensorBuilder *>(tensor_builder().get())
+ ->setUsesCount(ind, num_uses);
+ }
+};
+
+} // namespace acl_neon
+} // namespace backend
+} // namespace neurun
+
+#endif // __NEURUN_BACKEND_ACL_NEON_TENSOR_REGISTER_H__
diff --git a/runtime/neurun/backend/acl_neon/operand/INETensor.cc b/runtime/neurun/backend/acl_neon/operand/INETensor.cc
new file mode 100644
index 000000000..fdb20970d
--- /dev/null
+++ b/runtime/neurun/backend/acl_neon/operand/INETensor.cc
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +#include "INETensor.h" + +namespace neurun +{ +namespace backend +{ +namespace acl_neon +{ +namespace operand +{ + +void INETensor::access(const std::function<void(ITensor &tensor)> &fn) { fn(*this); } + +} // namespace operand +} // namespace acl_neon +} // namespace backend +} // namespace neurun diff --git a/runtime/neurun/backend/acl_neon/operand/INETensor.h b/runtime/neurun/backend/acl_neon/operand/INETensor.h new file mode 100644 index 000000000..22b1140cf --- /dev/null +++ b/runtime/neurun/backend/acl_neon/operand/INETensor.h @@ -0,0 +1,46 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __NEURUN_BACKEND_ACL_NEON_OPERAND_I_NE_TENSOR_H__ +#define __NEURUN_BACKEND_ACL_NEON_OPERAND_I_NE_TENSOR_H__ + +#include <arm_compute/core/ITensor.h> + +#include <IACLTensor.h> + +namespace neurun +{ +namespace backend +{ +namespace acl_neon +{ +namespace operand +{ + +class INETensor : public acl_common::IACLTensor +{ +public: + const arm_compute::ITensor *handle() const override = 0; + arm_compute::ITensor *handle() override = 0; + void access(const std::function<void(ITensor &tensor)> &fn) final; +}; + +} // namespace operand +} // namespace acl_neon +} // namespace backend +} // namespace neurun + +#endif // __NEURUN_BACKEND_ACL_NEON_OPERAND_I_NE_TENSOR_H__ diff --git a/runtime/neurun/backend/acl_neon/operand/NESubTensor.cc b/runtime/neurun/backend/acl_neon/operand/NESubTensor.cc new file mode 100644 index 000000000..a36af609c --- /dev/null +++ b/runtime/neurun/backend/acl_neon/operand/NESubTensor.cc @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "NESubTensor.h" + +namespace neurun +{ +namespace backend +{ +namespace acl_neon +{ +namespace operand +{ + +NESubTensor::NESubTensor(INETensor *parent, const arm_compute::TensorShape &tensor_shape, + const arm_compute::Coordinates &coords, size_t rank, bool extend_parent) + : _ne_sub_tensor(std::make_shared<arm_compute::SubTensor>(parent->handle(), tensor_shape, + coords, extend_parent)), + _rank{rank} +{ + // DO NOTHING +} + +const arm_compute::SubTensor *NESubTensor::handle() const { return _ne_sub_tensor.get(); } + +arm_compute::SubTensor *NESubTensor::handle() { return _ne_sub_tensor.get(); } + +} // namespace operand +} // namespace acl_neon +} // namespace backend +} // namespace neurun diff --git a/runtime/neurun/backend/acl_neon/operand/NESubTensor.h b/runtime/neurun/backend/acl_neon/operand/NESubTensor.h new file mode 100644 index 000000000..010e4deda --- /dev/null +++ b/runtime/neurun/backend/acl_neon/operand/NESubTensor.h @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __NEURUN_BACKEND_ACL_NEON_OPERAND_NE_SUB_TENSOR_H__ +#define __NEURUN_BACKEND_ACL_NEON_OPERAND_NE_SUB_TENSOR_H__ + +#include <arm_compute/runtime/SubTensor.h> +#include "INETensor.h" +#include "compiler/SubTensorInfo.h" + +namespace neurun +{ +namespace backend +{ +namespace acl_neon +{ +namespace operand +{ + +class NESubTensor : public INETensor +{ +public: + NESubTensor() = delete; + +public: + NESubTensor(INETensor *parent, const arm_compute::TensorShape &tensor_shape, + const arm_compute::Coordinates &coords, size_t rank, bool extend_parent = false); + +public: + size_t num_dimensions() const final { return _rank; } + +public: + const arm_compute::SubTensor *handle() const override; + arm_compute::SubTensor *handle() override; + +public: + // This method is used to prevent the use of memcpy for SubTensor + bool has_padding() const override { return true; } + +private: + std::shared_ptr<arm_compute::SubTensor> _ne_sub_tensor; + size_t _rank; +}; + +} // namespace operand +} // namespace acl_neon +} // namespace backend +} // namespace neurun + +#endif // __NEURUN_BACKEND_ACL_NEON_OPERAND_NE_SUB_TENSOR_H__ diff --git a/runtime/neurun/backend/acl_neon/operand/NETensor.cc b/runtime/neurun/backend/acl_neon/operand/NETensor.cc new file mode 100644 index 000000000..8a9ece88f --- /dev/null +++ b/runtime/neurun/backend/acl_neon/operand/NETensor.cc @@ -0,0 +1,45 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include <arm_compute/runtime/Memory.h> +#include <arm_compute/runtime/MemoryRegion.h> +#include "NETensor.h" + +namespace neurun +{ +namespace backend +{ +namespace acl_neon +{ +namespace operand +{ + +NETensor::NETensor(const arm_compute::TensorInfo &info, size_t rank, size_t num_uses) + : _ne_tensor(std::make_shared<arm_compute::Tensor>()), _rank{rank}, _num_uses{num_uses} +{ + allocator()->init(info); +} + +const arm_compute::Tensor *NETensor::handle() const { return _ne_tensor.get(); } + +arm_compute::Tensor *NETensor::handle() { return _ne_tensor.get(); } + +arm_compute::TensorAllocator *NETensor::allocator() { return _ne_tensor->allocator(); } + +} // namespace operand +} // namespace acl_neon +} // namespace backend +} // namespace neurun diff --git a/runtime/neurun/backend/acl_neon/operand/NETensor.h b/runtime/neurun/backend/acl_neon/operand/NETensor.h new file mode 100644 index 000000000..3de4695e9 --- /dev/null +++ b/runtime/neurun/backend/acl_neon/operand/NETensor.h @@ -0,0 +1,64 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __NEURUN_BACKEND_ACL_NEON_OPERAND_NE_TENSOR_H__ +#define __NEURUN_BACKEND_ACL_NEON_OPERAND_NE_TENSOR_H__ + +#include <arm_compute/core/TensorInfo.h> +#include <arm_compute/runtime/Tensor.h> +#include "arm_compute/runtime/TensorAllocator.h" +#include "INETensor.h" + +namespace neurun +{ +namespace backend +{ +namespace acl_neon +{ +namespace operand +{ + +class NETensor : public INETensor +{ +public: + NETensor() = delete; + +public: + NETensor(const arm_compute::TensorInfo &info, size_t rank, size_t num_uses); + +public: + size_t num_dimensions() const final { return _rank; } + +public: + const arm_compute::Tensor *handle() const override; + arm_compute::Tensor *handle() override; + size_t num_uses() const { return _num_uses; } + +public: + arm_compute::TensorAllocator *allocator(); + +private: + std::shared_ptr<arm_compute::Tensor> _ne_tensor; + size_t _rank; + size_t _num_uses; +}; + +} // namespace operand +} // namespace acl_neon +} // namespace backend +} // namespace neurun + +#endif // __NEURUN_BACKEND_ACL_NEON_OPERAND_NE_TENSOR_H__ |
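Usage note (editorial, not part of the imported patch): the files above only declare the acl_neon tensor machinery, so a minimal sketch of how they fit together is given below. It assumes the nnfw/ARM Compute Library build environment; the standalone main(), the chosen tensor shape, and the direct allocate() call are purely illustrative, since in the runtime these steps are driven by TensorBuilder/TensorRegister during compilation rather than called by hand.

// Illustrative sketch only (not part of the imported sources).
#include <memory>

#include <arm_compute/core/TensorInfo.h>
#include <arm_compute/core/TensorShape.h>

#include "TensorManager.h"
#include "operand/NETensor.h"

int main()
{
  using namespace neurun::backend::acl_neon;

  // createTensorManager() (see TensorManager.h above) picks a linear or
  // on-demand memory manager depending on the EXECUTOR config value.
  std::unique_ptr<TensorManager> tensor_manager{createTensorManager()};

  // NETensor wraps an arm_compute::Tensor together with the operand's
  // original rank and a use count (the value TensorRegister::setUsesCount
  // feeds to the TensorBuilder).
  arm_compute::TensorInfo info{arm_compute::TensorShape{3U, 4U}, 1,
                               arm_compute::DataType::F32};
  operand::NETensor tensor{info, /*rank=*/2, /*num_uses=*/1};

  // Backing memory is owned by the tensor's allocator; normally the
  // TensorManager drives allocation, here it is done directly for brevity.
  tensor.allocator()->allocate();

  return 0;
}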