author      Chunseok Lee <chunseok.lee@samsung.com>  2020-03-04 18:09:24 +0900
committer   Chunseok Lee <chunseok.lee@samsung.com>  2020-03-04 18:09:24 +0900
commit      302e6564a7a76109e1178207e44e45a58631c477
tree        6cc4bd95e5e438331fc2c53234af4ed0e0f3bc20  /runtime/neurun/backend/acl_neon
parent      bd11b24234d7d43dfe05a81c520aa01ffad06e42
download    nnfw-upstream/1.1.0.tar.gz, nnfw-upstream/1.1.0.tar.bz2, nnfw-upstream/1.1.0.zip
Diffstat (limited to 'runtime/neurun/backend/acl_neon')
-rw-r--r--  runtime/neurun/backend/acl_neon/Backend.h                    65
-rw-r--r--  runtime/neurun/backend/acl_neon/CMakeLists.txt               21
-rw-r--r--  runtime/neurun/backend/acl_neon/Config.cc                    30
-rw-r--r--  runtime/neurun/backend/acl_neon/Config.h                     49
-rw-r--r--  runtime/neurun/backend/acl_neon/ConstantInitializer.cc      246
-rw-r--r--  runtime/neurun/backend/acl_neon/ConstantInitializer.h        60
-rw-r--r--  runtime/neurun/backend/acl_neon/KernelGenerator.cc         2152
-rw-r--r--  runtime/neurun/backend/acl_neon/KernelGenerator.h            111
-rw-r--r--  runtime/neurun/backend/acl_neon/PluginClassesAllocator.cc     33
-rw-r--r--  runtime/neurun/backend/acl_neon/ShapeFixer.cc                439
-rw-r--r--  runtime/neurun/backend/acl_neon/ShapeFixer.h                 109
-rw-r--r--  runtime/neurun/backend/acl_neon/TensorBuilder.h               39
-rw-r--r--  runtime/neurun/backend/acl_neon/TensorManager.h               78
-rw-r--r--  runtime/neurun/backend/acl_neon/TensorRegister.cc             30
-rw-r--r--  runtime/neurun/backend/acl_neon/TensorRegister.h              51
-rw-r--r--  runtime/neurun/backend/acl_neon/operand/INETensor.cc          33
-rw-r--r--  runtime/neurun/backend/acl_neon/operand/INETensor.h           46
-rw-r--r--  runtime/neurun/backend/acl_neon/operand/NESubTensor.cc        44
-rw-r--r--  runtime/neurun/backend/acl_neon/operand/NESubTensor.h         63
-rw-r--r--  runtime/neurun/backend/acl_neon/operand/NETensor.cc           45
-rw-r--r--  runtime/neurun/backend/acl_neon/operand/NETensor.h            64
21 files changed, 3808 insertions, 0 deletions
diff --git a/runtime/neurun/backend/acl_neon/Backend.h b/runtime/neurun/backend/acl_neon/Backend.h
new file mode 100644
index 000000000..2fcf66933
--- /dev/null
+++ b/runtime/neurun/backend/acl_neon/Backend.h
@@ -0,0 +1,65 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NEURUN_BACKEND_ACL_NEON_BACKEND_H__
+#define __NEURUN_BACKEND_ACL_NEON_BACKEND_H__
+
+#include <memory>
+#include <backend/Backend.h>
+#include <ir/Operands.h>
+
+#include "Config.h"
+#include "ConstantInitializer.h"
+#include "KernelGenerator.h"
+#include "ShapeFixer.h"
+#include "TensorManager.h"
+#include "TensorRegister.h"
+
+namespace neurun
+{
+namespace backend
+{
+namespace acl_neon
+{
+
+class Backend : public ::neurun::backend::Backend
+{
+public:
+ Backend() : _config{std::make_shared<Config>()} {}
+
+ std::shared_ptr<IConfig> config() const override { return _config; }
+
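+ // Builds a BackendContext that wires this backend's TensorBuilder, ConstantInitializer,
+ // KernelGenerator, ShapeFixer and TensorRegister for the given operands.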
+ std::unique_ptr<BackendContext>
+ newContext(const ir::Operands &operands,
+ const std::shared_ptr<custom::IKernelBuilder> &) const override
+ {
+ auto tensor_builder = std::make_shared<TensorBuilder>(createTensorManager());
+ return std::unique_ptr<BackendContext>{new BackendContext{
+ this, tensor_builder, std::make_shared<ConstantInitializer>(operands, tensor_builder),
+ std::make_shared<KernelGenerator>(operands, tensor_builder),
+ std::make_shared<ShapeFixer>(operands, tensor_builder),
+ std::make_shared<TensorRegister>(operands, tensor_builder)}};
+ }
+
+private:
+ std::shared_ptr<IConfig> _config;
+};
+
+} // namespace acl_neon
+} // namespace backend
+} // namespace neurun
+
+#endif // __NEURUN_BACKEND_ACL_NEON_BACKEND_H__
diff --git a/runtime/neurun/backend/acl_neon/CMakeLists.txt b/runtime/neurun/backend/acl_neon/CMakeLists.txt
new file mode 100644
index 000000000..061246d36
--- /dev/null
+++ b/runtime/neurun/backend/acl_neon/CMakeLists.txt
@@ -0,0 +1,21 @@
+# Skip building this backend when ARM Compute Library is not found (unsupported architecture)
+nnas_find_package(ARMCompute QUIET)
+if(NOT ARMCompute_FOUND)
+ return()
+endif(NOT ARMCompute_FOUND)
+
+set(LIB_NEURUN_BACKEND_ACL_NEON neurun_backend_acl_neon)
+
+file(GLOB_RECURSE SOURCES "*.cc")
+
+add_library(${LIB_NEURUN_BACKEND_ACL_NEON} SHARED ${SOURCES})
+
+target_include_directories(${LIB_NEURUN_BACKEND_ACL_NEON} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR})
+target_link_libraries(${LIB_NEURUN_BACKEND_ACL_NEON} PRIVATE neurun_core)
+target_link_libraries(${LIB_NEURUN_BACKEND_ACL_NEON} PRIVATE ${LIB_NEURUN_BACKEND_ACL_COMMON})
+target_link_libraries(${LIB_NEURUN_BACKEND_ACL_NEON} PRIVATE nnfw_common)
+target_link_libraries(${LIB_NEURUN_BACKEND_ACL_NEON} PRIVATE nnfw_coverage)
+
+set_target_properties(${LIB_NEURUN_BACKEND_ACL_NEON} PROPERTIES OUTPUT_NAME backend_acl_neon)
+
+install(TARGETS ${LIB_NEURUN_BACKEND_ACL_NEON} DESTINATION lib)
diff --git a/runtime/neurun/backend/acl_neon/Config.cc b/runtime/neurun/backend/acl_neon/Config.cc
new file mode 100644
index 000000000..352bc0b41
--- /dev/null
+++ b/runtime/neurun/backend/acl_neon/Config.cc
@@ -0,0 +1,30 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Config.h"
+
+namespace neurun
+{
+namespace backend
+{
+namespace acl_neon
+{
+
+bool Config::initialize() { return true; }
+
+} // namespace acl_neon
+} // namespace backend
+} // namespace neurun
diff --git a/runtime/neurun/backend/acl_neon/Config.h b/runtime/neurun/backend/acl_neon/Config.h
new file mode 100644
index 000000000..430c194ee
--- /dev/null
+++ b/runtime/neurun/backend/acl_neon/Config.h
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NEURUN_BACKEND_ACL_NEON_CONFIG_H__
+#define __NEURUN_BACKEND_ACL_NEON_CONFIG_H__
+
+#include <backend/IConfig.h>
+#include <cpp14/memory.h>
+#include <util/ITimer.h>
+
+namespace neurun
+{
+namespace backend
+{
+namespace acl_neon
+{
+
+class Config : public IConfig
+{
+public:
+ std::string id() override { return "acl_neon"; }
+ bool initialize() override;
+ bool SupportPermutation() override { return true; }
+ bool SupportSubTensorAlloc() override { return true; }
+
+ std::unique_ptr<util::ITimer> timer() override
+ {
+ return nnfw::cpp14::make_unique<util::CPUTimer>();
+ }
+};
+
+} // namespace acl_neon
+} // namespace backend
+} // namespace neurun
+
+#endif // __NEURUN_BACKEND_ACL_NEON_CONFIG_H__
diff --git a/runtime/neurun/backend/acl_neon/ConstantInitializer.cc b/runtime/neurun/backend/acl_neon/ConstantInitializer.cc
new file mode 100644
index 000000000..9a74bda29
--- /dev/null
+++ b/runtime/neurun/backend/acl_neon/ConstantInitializer.cc
@@ -0,0 +1,246 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "ConstantInitializer.h"
+
+namespace neurun
+{
+namespace backend
+{
+namespace acl_neon
+{
+
+ConstantInitializer::ConstantInitializer(const ir::Operands &operands,
+ const std::shared_ptr<TensorBuilder> &tensor_builder)
+ : _operands{operands}, _tensor_builder{tensor_builder}
+{
+ // DO NOTHING
+}
+
+void ConstantInitializer::visit(const ir::operation::BatchToSpaceND &node)
+{
+ const auto &block_size_index = node.getInputs().at(ir::operation::BatchToSpaceND::BLOCK_SIZE);
+ const auto &block_size_obj = _operands.at(block_size_index);
+
+ if (block_size_obj.isConstant())
+ {
+ _init_map[block_size_index] = [](const ir::Operand &model_obj, backend::operand::ITensor &obj) {
+ const auto &shape = model_obj.shape();
+ const auto base = reinterpret_cast<const int32_t *>(model_obj.data().base());
+ assert(model_obj.shape().rank() == 1);
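+ // Write the block size values in reverse order, matching ACL's reversed dimension ordering.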
+ obj.access([&](::neurun::backend::operand::ITensor &tensor) {
+ for (size_t i = 0; i < shape.num_elements(); ++i)
+ {
+ const int32_t value = base[shape.num_elements() - i - 1];
+ int32_t *into = reinterpret_cast<int32_t *>(tensor.buffer() +
+ tensor.calcOffset({static_cast<int32_t>(i)}));
+ *into = value;
+ }
+ });
+ };
+ }
+}
+
+void ConstantInitializer::visit(const ir::operation::Conv2D &node)
+{
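+ // The kernel needs a layout-aware permute initializer; the 1-D bias is copied as-is.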
+ const auto &kernel_index = node.getInputs().at(ir::operation::Conv2D::KERNEL);
+ const auto &kernel_obj = _operands.at(kernel_index);
+ registerPermuteInitializer(kernel_index, kernel_obj);
+
+ const auto &bias_index = node.getInputs().at(ir::operation::Conv2D::BIAS);
+ const auto &bias_obj = _operands.at(bias_index);
+ registerCopyInitializer(bias_index, bias_obj);
+}
+
+void ConstantInitializer::visit(const ir::operation::DepthwiseConv2D &node)
+{
+ const auto &kernel_index = node.getInputs().at(ir::operation::DepthwiseConv2D::KERNEL);
+ const auto &kernel_obj = _operands.at(kernel_index);
+ registerPermuteInitializer(kernel_index, kernel_obj);
+
+ const auto &bias_index = node.getInputs().at(ir::operation::DepthwiseConv2D::BIAS);
+ const auto &bias_obj = _operands.at(bias_index);
+ registerCopyInitializer(bias_index, bias_obj);
+}
+
+void ConstantInitializer::visit(const ir::operation::FullyConnected &node)
+{
+ const auto &weight_index = node.getInputs().at(ir::operation::FullyConnected::WEIGHT);
+ const auto &weight_obj = _operands.at(weight_index);
+ registerCopyInitializer(weight_index, weight_obj);
+
+ const auto &bias_index = node.getInputs().at(ir::operation::FullyConnected::BIAS);
+ const auto &bias_obj = _operands.at(bias_index);
+ registerCopyInitializer(bias_index, bias_obj);
+}
+
+void ConstantInitializer::visit(const ir::operation::LSTM &node)
+{
+ const auto &input_to_input_weights_index =
+ node.getInputs().at(ir::operation::LSTM::INPUT_TO_INPUT_WEIGHTS);
+ const auto &input_to_input_weights_obj = _operands.at(input_to_input_weights_index);
+ registerCopyInitializer(input_to_input_weights_index, input_to_input_weights_obj);
+
+ const auto &input_to_forget_weights_index =
+ node.getInputs().at(ir::operation::LSTM::INPUT_TO_FORGET_WEIGHTS);
+ const auto &input_to_forget_weights_obj = _operands.at(input_to_forget_weights_index);
+ registerCopyInitializer(input_to_forget_weights_index, input_to_forget_weights_obj);
+
+ const auto &input_to_cell_weights_index =
+ node.getInputs().at(ir::operation::LSTM::INPUT_TO_CELL_WEIGHTS);
+ const auto &input_to_cell_weights_obj = _operands.at(input_to_cell_weights_index);
+ registerCopyInitializer(input_to_cell_weights_index, input_to_cell_weights_obj);
+
+ const auto &input_to_output_weights_index =
+ node.getInputs().at(ir::operation::LSTM::INPUT_TO_OUTPUT_WEIGHTS);
+ const auto &input_to_output_weights_obj = _operands.at(input_to_output_weights_index);
+ registerCopyInitializer(input_to_output_weights_index, input_to_output_weights_obj);
+
+ const auto &recurrent_to_input_weights_index =
+ node.getInputs().at(ir::operation::LSTM::RECURRENT_TO_INPUT_WEIGHTS);
+ const auto &recurrent_to_input_weights_obj = _operands.at(recurrent_to_input_weights_index);
+ registerCopyInitializer(recurrent_to_input_weights_index, recurrent_to_input_weights_obj);
+
+ const auto &recurrent_to_forget_weights_index =
+ node.getInputs().at(ir::operation::LSTM::RECURRENT_TO_FORGET_WEIGHTS);
+ const auto &recurrent_to_forget_weights_obj = _operands.at(recurrent_to_forget_weights_index);
+ registerCopyInitializer(recurrent_to_forget_weights_index, recurrent_to_forget_weights_obj);
+
+ const auto &recurrent_to_cell_weights_index =
+ node.getInputs().at(ir::operation::LSTM::RECURRENT_TO_CELL_WEIGHTS);
+ const auto &recurrent_to_cell_weights_obj = _operands.at(recurrent_to_cell_weights_index);
+ registerCopyInitializer(recurrent_to_cell_weights_index, recurrent_to_cell_weights_obj);
+
+ const auto &recurrent_to_output_weights_index =
+ node.getInputs().at(ir::operation::LSTM::RECURRENT_TO_OUTPUT_WEIGHTS);
+ const auto &recurrent_to_output_weights_obj = _operands.at(recurrent_to_output_weights_index);
+ registerCopyInitializer(recurrent_to_output_weights_index, recurrent_to_output_weights_obj);
+
+ const auto &cell_to_input_weights_index =
+ node.getInputs().at(ir::operation::LSTM::CELL_TO_INPUT_WEIGHTS);
+ const auto &cell_to_input_weights_obj = _operands.at(cell_to_input_weights_index);
+ registerCopyInitializer(cell_to_input_weights_index, cell_to_input_weights_obj);
+
+ const auto &cell_to_forget_weights_index =
+ node.getInputs().at(ir::operation::LSTM::CELL_TO_FORGET_WEIGHTS);
+ const auto &cell_to_forget_weights_obj = _operands.at(cell_to_forget_weights_index);
+ registerCopyInitializer(cell_to_forget_weights_index, cell_to_forget_weights_obj);
+
+ const auto &cell_to_output_weights_index =
+ node.getInputs().at(ir::operation::LSTM::CELL_TO_OUTPUT_WEIGHTS);
+ const auto &cell_to_output_weights_obj = _operands.at(cell_to_output_weights_index);
+ registerCopyInitializer(cell_to_output_weights_index, cell_to_output_weights_obj);
+
+ const auto &input_gate_bias_index = node.getInputs().at(ir::operation::LSTM::INPUT_GATE_BIAS);
+ const auto &input_gate_bias_obj = _operands.at(input_gate_bias_index);
+ registerCopyInitializer(input_gate_bias_index, input_gate_bias_obj);
+
+ const auto &forget_gate_bias_index = node.getInputs().at(ir::operation::LSTM::FORGET_GATE_BIAS);
+ const auto &forget_gate_bias_obj = _operands.at(forget_gate_bias_index);
+ registerCopyInitializer(forget_gate_bias_index, forget_gate_bias_obj);
+
+ const auto &output_gate_bias_index = node.getInputs().at(ir::operation::LSTM::OUTPUT_GATE_BIAS);
+ const auto &output_gate_bias_obj = _operands.at(output_gate_bias_index);
+ registerCopyInitializer(output_gate_bias_index, output_gate_bias_obj);
+
+ const auto &projection_weights_index =
+ node.getInputs().at(ir::operation::LSTM::PROJECTION_WEIGHTS);
+ const auto &projection_weights_obj = _operands.at(projection_weights_index);
+ registerCopyInitializer(projection_weights_index, projection_weights_obj);
+
+ const auto &projection_bias_index = node.getInputs().at(ir::operation::LSTM::PROJECTION_BIAS);
+ const auto &projection_bias_obj = _operands.at(projection_bias_index);
+ registerCopyInitializer(projection_bias_index, projection_bias_obj);
+}
+
+void ConstantInitializer::visit(const ir::operation::RNN &node)
+{
+ const auto &weights_index = node.getInputs().at(ir::operation::RNN::WEIGHTS);
+ const auto &weights_obj = _operands.at(weights_index);
+ registerCopyInitializer(weights_index, weights_obj);
+
+ const auto &recurrent_weights_index = node.getInputs().at(ir::operation::RNN::RECURRENT_WEIGHTS);
+ const auto &recurrent_weights_obj = _operands.at(recurrent_weights_index);
+ registerCopyInitializer(recurrent_weights_index, recurrent_weights_obj);
+
+ const auto &bias_index = node.getInputs().at(ir::operation::RNN::BIAS);
+ const auto &bias_obj = _operands.at(bias_index);
+ registerCopyInitializer(bias_index, bias_obj);
+}
+
+void ConstantInitializer::visit(const ir::operation::SpaceToBatchND &node)
+{
+ const auto &block_size_index = node.getInputs().at(ir::operation::SpaceToBatchND::BLOCK_SIZE);
+ const auto &block_size_obj = _operands.at(block_size_index);
+
+ if (block_size_obj.isConstant())
+ {
+ _init_map[block_size_index] = [](const ir::Operand &model_obj, backend::operand::ITensor &obj) {
+ const auto &shape = model_obj.shape();
+ const auto base = reinterpret_cast<const int32_t *>(model_obj.data().base());
+ assert(model_obj.shape().rank() == 1);
+ obj.access([&](::neurun::backend::operand::ITensor &tensor) {
+ for (size_t i = 0; i < shape.num_elements(); ++i)
+ {
+ const int32_t value = base[shape.num_elements() - i - 1];
+ int32_t *into = reinterpret_cast<int32_t *>(tensor.buffer() +
+ tensor.calcOffset({static_cast<int32_t>(i)}));
+ *into = value;
+ }
+ });
+ };
+ }
+
+ const auto &paddings_index = node.getInputs().at(ir::operation::SpaceToBatchND::PADDINGS);
+ const auto &paddings_obj = _operands.at(paddings_index);
+ if (paddings_obj.isConstant())
+ {
+ _init_map[paddings_index] = [](const ir::Operand &model_obj, backend::operand::ITensor &obj) {
+ const auto &shape = model_obj.shape();
+ const auto base = reinterpret_cast<const int32_t *>(model_obj.data().base());
+ assert(model_obj.shape().rank() == 2);
+ assert(shape.dim(0) == 2);
+ assert(shape.dim(1) == 2);
+ obj.access([&](::neurun::backend::operand::ITensor &tensor) {
+ for (auto i = 0; i < shape.dim(0); ++i)
+ {
+ for (auto j = 0; j < shape.dim(1); ++j)
+ {
+ const int32_t value = base[i * 2 + j];
+ int32_t *into = reinterpret_cast<int32_t *>(
+ // The coordinates of NETensor are different from the coordinates of CLTensor in
+ // this operand.
+ // NEON : {j, reversed i}
+ // CL : {reversed i, j}
+ tensor.buffer() + tensor.calcOffset({j, shape.dim(0) - i - 1}));
+ *into = value;
+ }
+ }
+ });
+ };
+ }
+}
+
+void ConstantInitializer::visit(const ir::operation::TransposeConv &node)
+{
+ const auto &kernel_index = node.getInputs().at(ir::operation::TransposeConv::KERNEL);
+ const auto &kernel_obj = _operands.at(kernel_index);
+ registerPermuteInitializer(kernel_index, kernel_obj);
+}
+
+} // namespace acl_neon
+} // namespace backend
+} // namespace neurun
diff --git a/runtime/neurun/backend/acl_neon/ConstantInitializer.h b/runtime/neurun/backend/acl_neon/ConstantInitializer.h
new file mode 100644
index 000000000..0f2b2d05b
--- /dev/null
+++ b/runtime/neurun/backend/acl_neon/ConstantInitializer.h
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NEURUN_COMPILER_ACL_NEON_CONSTANT_INITIALIZER_H__
+#define __NEURUN_COMPILER_ACL_NEON_CONSTANT_INITIALIZER_H__
+
+#include <backend/IConstantInitializer.h>
+#include <ir/Operands.h>
+#include "TensorBuilder.h"
+
+namespace neurun
+{
+namespace backend
+{
+namespace acl_neon
+{
+
+class ConstantInitializer : public IConstantInitializer
+{
+public:
+ ConstantInitializer(const ir::Operands &operands,
+ const std::shared_ptr<TensorBuilder> &tensor_builder);
+
+public:
+ void visit(const ir::operation::BatchToSpaceND &) override;
+ void visit(const ir::operation::Conv2D &) override;
+ void visit(const ir::operation::DepthwiseConv2D &) override;
+ void visit(const ir::operation::FullyConnected &) override;
+ void visit(const ir::operation::LSTM &) override;
+ void visit(const ir::operation::RNN &) override;
+ void visit(const ir::operation::SpaceToBatchND &) override;
+ void visit(const ir::operation::TransposeConv &) override;
+
+private:
+ const ir::Operands &operands() const override { return _operands; }
+ std::shared_ptr<ITensorBuilder> tensor_builder() const override { return _tensor_builder; }
+
+private:
+ const ir::Operands &_operands;
+ std::shared_ptr<TensorBuilder> _tensor_builder;
+};
+
+} // namespace acl_neon
+} // namespace backend
+} // namespace neurun
+
+#endif // __NEURUN_COMPILER_ACL_NEON_CONSTANT_INITIALIZER_H__
diff --git a/runtime/neurun/backend/acl_neon/KernelGenerator.cc b/runtime/neurun/backend/acl_neon/KernelGenerator.cc
new file mode 100644
index 000000000..85c6a0633
--- /dev/null
+++ b/runtime/neurun/backend/acl_neon/KernelGenerator.cc
@@ -0,0 +1,2152 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "KernelGenerator.h"
+
+#include <arm_compute/runtime/NEON/NEFunctions.h> // Include all ARM Compute NEON functions
+#include <arm_compute/runtime/NEON/NEFunctionsEx.h> // Include all ARM Compute EX NEON functions
+
+#include <Convert.h>
+#include <Swizzle.h>
+
+#include "util/Padding.h"
+#include "ir/Index.h"
+#include "ir/DataType.h"
+#include "ir/InternalType.h"
+#include "compiler/IExecutionBuilder.h"
+#include "exec/NopFunction.h"
+#include "util/logging.h"
+#include "util/Utils.h"
+
+using ::neurun::compiler::IExecutionBuilder;
+
+namespace neurun
+{
+namespace backend
+{
+namespace acl_neon
+{
+
+using ::neurun::backend::acl_common::asAclFunction;
+
+//
+// ActivationBuilder
+//
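+// Appends a standalone NEActivationLayer for fused activations (RELU/RELU1/RELU6) after
+// operations whose ACL NEON functions do not fuse the activation themselves.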
+class ActivationBuilder
+{
+public:
+ ActivationBuilder(IExecutionBuilder &builder) : _builder(builder)
+ {
+ // DO NOTHING
+ }
+
+private:
+ void appendReLU(::arm_compute::ITensor *ifm_alloc);
+ void appendReLU1(::arm_compute::ITensor *ifm_alloc);
+ void appendReLU6(::arm_compute::ITensor *ifm_alloc);
+
+public:
+ void append(ir::Activation act, ::arm_compute::ITensor *ifm_alloc);
+
+private:
+ IExecutionBuilder &_builder;
+};
+
+void ActivationBuilder::appendReLU(::arm_compute::ITensor *ifm_alloc)
+{
+ const ::arm_compute::ActivationLayerInfo act_info{
+ ::arm_compute::ActivationLayerInfo::ActivationFunction::RELU};
+
+ auto fn = nnfw::cpp14::make_unique<::arm_compute::NEActivationLayer>();
+
+ fn->configure(ifm_alloc, nullptr, act_info);
+
+ auto acl_fn = asAclFunction(std::move(fn));
+
+ _builder.append(std::move(acl_fn));
+}
+
+void ActivationBuilder::appendReLU1(::arm_compute::ITensor *ifm_alloc)
+{
+ const ::arm_compute::ActivationLayerInfo act_info{
+ ::arm_compute::ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 1.0f, -1.0f};
+
+ auto fn = nnfw::cpp14::make_unique<::arm_compute::NEActivationLayer>();
+
+ fn->configure(ifm_alloc, nullptr, act_info);
+
+ auto acl_fn = asAclFunction(std::move(fn));
+
+ _builder.append(std::move(acl_fn));
+}
+
+void ActivationBuilder::appendReLU6(::arm_compute::ITensor *ifm_alloc)
+{
+ const ::arm_compute::ActivationLayerInfo act_info{
+ ::arm_compute::ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 6.0f, 0.0f};
+
+ auto fn = nnfw::cpp14::make_unique<::arm_compute::NEActivationLayer>();
+
+ fn->configure(ifm_alloc, nullptr, act_info);
+
+ auto acl_fn = asAclFunction(std::move(fn));
+
+ _builder.append(std::move(acl_fn));
+}
+
+void ActivationBuilder::append(ir::Activation act, ::arm_compute::ITensor *ifm_alloc)
+{
+ switch (act)
+ {
+ case ir::Activation::NONE:
+ {
+ // DO NOTHING
+ break;
+ }
+ case ir::Activation::RELU:
+ {
+ appendReLU(ifm_alloc);
+ break;
+ }
+ case ir::Activation::RELU1:
+ {
+ appendReLU1(ifm_alloc);
+ break;
+ }
+ case ir::Activation::RELU6:
+ {
+ appendReLU6(ifm_alloc);
+ break;
+ }
+ default:
+ {
+ throw std::runtime_error("Not supported, yet");
+ }
+ }
+}
+
+//
+// KernelGenerator
+//
+KernelGenerator::KernelGenerator(const ir::Operands &ctx,
+ const std::shared_ptr<TensorBuilder> &tensor_builder)
+ : _ctx(ctx), _tensor_builder(tensor_builder), _current_subg_layout(ir::Layout::UNKNOWN)
+{
+ // DO NOTHING
+}
+
+void KernelGenerator::visit(const ir::OpSequence &op_seq)
+{
+ _current_subg_layout = op_seq.getLayout();
+ for (const auto &e : op_seq.operations())
+ {
+ const auto &node = *(e.node);
+ _tensor_builder->preVisit(node);
+ node.accept(*this);
+ _tensor_builder->postVisit(node);
+ }
+}
+
+void KernelGenerator::visit(const ir::operation::Abs &node)
+{
+ const auto output_index{node.getOutputs().at(0)};
+ const auto input_index{node.getInputs().at(ir::operation::Abs::Input::INPUT)};
+
+ auto output_alloc = _tensor_builder->at(output_index).get();
+ auto input_alloc = _tensor_builder->at(input_index).get();
+
+ const ::arm_compute::ActivationLayerInfo act_info{
+ ::arm_compute::ActivationLayerInfo::ActivationFunction::ABS};
+
+ auto fn = nnfw::cpp14::make_unique<::arm_compute::NEActivationLayer>();
+
+ fn->configure(input_alloc->handle(), output_alloc->handle(), act_info);
+
+ auto acl_fn = asAclFunction(std::move(fn));
+
+ _execution_builder->append(std::move(acl_fn));
+}
+
+void KernelGenerator::visit(const ir::operation::ArgMax &node)
+{
+ const auto ofm_index{node.getOutputs().at(0)};
+ const auto ifm_index{node.getInputs().at(ir::operation::ArgMax::Input::INPUT)};
+
+ const auto ifm_rank = node.param().rank;
+
+ auto ofm_alloc = _tensor_builder->at(ofm_index).get();
+ auto ifm_alloc = _tensor_builder->at(ifm_index).get();
+ auto frontend_layout = _current_subg_layout;
+ auto backend_layout = ifm_alloc->layout();
+
+ int axis_value = node.param().axis;
+ if (axis_value < 0)
+ {
+ axis_value += ifm_rank;
+ }
+ assert(axis_value >= 0 && axis_value < ifm_rank);
+ const auto fixed_axis =
+ acl_common::ToARMComputeAxis(ifm_rank, axis_value, frontend_layout, backend_layout).value();
+
+ // auto fn = nnfw::cpp14::make_unique<::arm_compute::NEArgMinMaxLayer>();
+ auto fn = nnfw::cpp14::make_unique<::arm_compute::NEArgMax>();
+
+ // NOTE
+ // if (ofm_alloc->info()->data_type() == arm_compute::DataType::S32)
+ //{
+ ofm_alloc->info()->set_data_type(arm_compute::DataType::U32);
+ //}
+ fn->configure(ifm_alloc->handle(), fixed_axis, ofm_alloc->handle());
+ // fn->configure(ifm_alloc->handle(), fixed_axis, ofm_alloc->handle(),
+ // arm_compute::ReductionOperation::ARG_IDX_MAX);
+
+ auto acl_fn = asAclFunction(std::move(fn));
+
+ _execution_builder->append(std::move(acl_fn));
+}
+
+void KernelGenerator::visit(const ir::operation::BatchToSpaceND &node)
+{
+ const auto ofm_index{node.getOutputs().at(0)};
+ const auto ifm_index{node.getInputs().at(ir::operation::BatchToSpaceND::Input::INPUT)};
+ const auto block_size_index{
+ node.getInputs().at(ir::operation::BatchToSpaceND::Input::BLOCK_SIZE)};
+
+ auto ofm_alloc = _tensor_builder->at(ofm_index).get();
+ auto ifm_alloc = _tensor_builder->at(ifm_index).get();
+ auto block_size_alloc = _tensor_builder->at(block_size_index).get();
+
+ assert(_ctx.at(block_size_index).isConstant());
+
+ auto fn = nnfw::cpp14::make_unique<::arm_compute::NEBatchToSpaceLayer>();
+
+ fn->configure(ifm_alloc->handle(), block_size_alloc->handle(), ofm_alloc->handle());
+
+ auto acl_fn = asAclFunction(std::move(fn));
+
+ _execution_builder->append(std::move(acl_fn));
+}
+
+void KernelGenerator::visit(const ir::operation::Cast &node)
+{
+ const auto ofm_index{node.getOutputs().at(0)};
+ const auto ifm_index{node.getInputs().at(ir::operation::Cast::Input::INPUT)};
+
+ auto ofm_alloc = _tensor_builder->at(ofm_index).get();
+ auto ifm_alloc = _tensor_builder->at(ifm_index).get();
+
+ auto fn = nnfw::cpp14::make_unique<::arm_compute::NECast>();
+
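+ // BOOL8 inputs are marked with the BOOL sub-data-type so NECast interprets the 8-bit data as boolean.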
+ auto input_sub_type = _ctx.at(ifm_index).typeInfo().type() == ir::DataType::BOOL8
+ ? arm_compute::SubDataType::BOOL
+ : arm_compute::SubDataType::NONE;
+ fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), input_sub_type);
+
+ auto acl_fn = asAclFunction(std::move(fn));
+
+ _execution_builder->append(std::move(acl_fn));
+}
+
+void KernelGenerator::visit(const ir::operation::Conv2D &node)
+{
+ using ir::operation::Conv2D;
+
+ const auto ofm_index{node.getOutputs().at(0)};
+ const auto ifm_index{node.getInputs().at(Conv2D::Input::INPUT)};
+ const auto ker_index{node.getInputs().at(Conv2D::Input::KERNEL)};
+ const auto bias_index{node.getInputs().at(Conv2D::Input::BIAS)};
+
+ const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature(_current_subg_layout);
+ const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature(_current_subg_layout);
+ // Kernel format is [depth_out, kernel_height, kernel_width, depth_in].
+ const auto &ker_shape = _ctx.at(ker_index).shape();
+ const auto ker_height = ker_shape.dim(1);
+ const auto ker_width = ker_shape.dim(2);
+
+ const auto stride = node.param().stride;
+ const auto padding = neurun::util::calculatePadding(node.param().padding, ifm_shape, ofm_shape,
+ stride, ker_width, ker_height);
+ const auto activation = node.param().activation;
+
+ auto ofm_alloc = _tensor_builder->at(ofm_index).get();
+ auto ifm_alloc = _tensor_builder->at(ifm_index).get();
+ auto ker_alloc = _tensor_builder->at(ker_index).get();
+ auto bias_alloc = _tensor_builder->at(bias_index).get();
+
+ const auto conv_info = acl_common::asPadStrideInfo(padding, stride);
+ const auto act_info = acl_common::asActivationLayerInfo(activation);
+
+ auto fn = nnfw::cpp14::make_unique<::arm_compute::NEConvolutionLayer>(
+ _tensor_builder->acl_tensor_manager()->internal_buffer_manager());
+
+ fn->configure(ifm_alloc->handle(), ker_alloc->handle(), bias_alloc->handle(), ofm_alloc->handle(),
+ conv_info, ::arm_compute::WeightsInfo(), ::arm_compute::Size2D(1U, 1U), act_info);
+
+ _execution_builder->append(asAclFunction(std::move(fn)));
+}
+
+void KernelGenerator::visit(const ir::operation::DepthToSpace &node)
+{
+ const auto output_index{node.getOutputs().at(0)};
+ const auto input_index{node.getInputs().at(ir::operation::DepthToSpace::Input::INPUT)};
+
+ auto block_size = node.param().block_size;
+ assert(block_size > 0);
+
+ auto output_alloc = _tensor_builder->at(output_index).get();
+ auto input_alloc = _tensor_builder->at(input_index).get();
+
+ auto fn = nnfw::cpp14::make_unique<::arm_compute::NEDepthToSpaceLayerEx>();
+
+ fn->configure(input_alloc->handle(), output_alloc->handle(), block_size);
+
+ auto acl_fn = asAclFunction(std::move(fn));
+
+ _execution_builder->append(std::move(acl_fn));
+}
+
+void KernelGenerator::visit(const ir::operation::DepthwiseConv2D &node)
+{
+ using ir::operation::DepthwiseConv2D;
+
+ const auto ofm_index{node.getOutputs().at(0)};
+ const auto ifm_index{node.getInputs().at(DepthwiseConv2D::Input::INPUT)};
+ const auto ker_index{node.getInputs().at(DepthwiseConv2D::Input::KERNEL)};
+ const auto bias_index{node.getInputs().at(DepthwiseConv2D::Input::BIAS)};
+
+ const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature(_current_subg_layout);
+ const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature(_current_subg_layout);
+ // Kernel format is [1, kernel_height, kernel_width, depth_out].
+ const auto &ker_shape = _ctx.at(ker_index).shape();
+ const auto ker_height = ker_shape.dim(1);
+ const auto ker_width = ker_shape.dim(2);
+
+ const auto stride = node.param().stride;
+ const auto padding = neurun::util::calculatePadding(node.param().padding, ifm_shape, ofm_shape,
+ stride, ker_width, ker_height);
+ const auto multiplier = node.param().multiplier;
+ const auto activation = node.param().activation;
+
+ auto ofm_alloc = _tensor_builder->at(ofm_index).get();
+ auto ifm_alloc = _tensor_builder->at(ifm_index).get();
+ auto ker_alloc = _tensor_builder->at(ker_index).get();
+ auto bias_alloc = _tensor_builder->at(bias_index).get();
+
+ const auto conv_info = acl_common::asPadStrideInfo(padding, stride);
+ const auto act_info = acl_common::asActivationLayerInfo(activation);
+
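+ // Use the optimized 3x3 depthwise convolution kernel when the kernel size allows it.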
+ if (ker_height == 3 && ker_width == 3)
+ {
+ auto fn = nnfw::cpp14::make_unique<::arm_compute::NEDepthwiseConvolutionLayer3x3>();
+
+ fn->configure(ifm_alloc->handle(), ker_alloc->handle(), bias_alloc->handle(),
+ ofm_alloc->handle(), conv_info, multiplier, act_info);
+
+ _execution_builder->append(asAclFunction(std::move(fn)));
+ }
+ else
+ {
+ auto fn = nnfw::cpp14::make_unique<::arm_compute::NEDepthwiseConvolutionLayer>();
+
+ fn->configure(ifm_alloc->handle(), ker_alloc->handle(), bias_alloc->handle(),
+ ofm_alloc->handle(), conv_info, multiplier, act_info);
+
+ _execution_builder->append(asAclFunction(std::move(fn)));
+ }
+}
+
+void KernelGenerator::visit(const ir::operation::Dequantize &node)
+{
+ const auto output_index{node.getOutputs().at(0)};
+ const auto input_index{node.getInputs().at(ir::operation::Dequantize::Input::INPUT)};
+
+ auto output_alloc = _tensor_builder->at(output_index).get();
+ auto input_alloc = _tensor_builder->at(input_index).get();
+
+ auto fn = nnfw::cpp14::make_unique<::arm_compute::NEDequantizationLayer>();
+
+ fn->configure(input_alloc->handle(), output_alloc->handle());
+
+ auto acl_fn = asAclFunction(std::move(fn));
+
+ _execution_builder->append(std::move(acl_fn));
+}
+
+void KernelGenerator::visit(const ir::operation::MaxPool2D &node)
+{
+ const auto ofm_index{node.getOutputs().at(0)};
+ const auto ifm_index{node.getInputs().at(ir::operation::MaxPool2D::Input::INPUT)};
+
+ const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature(_current_subg_layout);
+ const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature(_current_subg_layout);
+
+ const auto kh = node.param().kh;
+ const auto kw = node.param().kw;
+ const auto stride = node.param().stride;
+ const auto padding =
+ neurun::util::calculatePadding(node.param().padding, ifm_shape, ofm_shape, stride, kw, kh);
+ const auto activation = node.param().activation;
+
+ VERBOSE(MaxPool2D) << "IFM_H: " << ifm_shape.H << std::endl;
+ VERBOSE(MaxPool2D) << "IFM_W: " << ifm_shape.W << std::endl;
+ VERBOSE(MaxPool2D) << "OFM_H: " << ofm_shape.H << std::endl;
+ VERBOSE(MaxPool2D) << "OFM_W: " << ofm_shape.W << std::endl;
+ VERBOSE(MaxPool2D) << "KER_H: " << kh << std::endl;
+ VERBOSE(MaxPool2D) << "KER_W: " << kw << std::endl;
+ VERBOSE(MaxPool2D) << "STRIDE_H: " << stride.vertical << std::endl;
+ VERBOSE(MaxPool2D) << "STRIDE_W: " << stride.horizontal << std::endl;
+ VERBOSE(MaxPool2D) << "PAD(T): " << padding.top << std::endl;
+ VERBOSE(MaxPool2D) << "PAD(B): " << padding.bottom << std::endl;
+ VERBOSE(MaxPool2D) << "PAD(L): " << padding.left << std::endl;
+ VERBOSE(MaxPool2D) << "PAD(R): " << padding.right << std::endl;
+
+ auto ofm_alloc = _tensor_builder->at(ofm_index).get();
+ auto ifm_alloc = _tensor_builder->at(ifm_index).get();
+
+ ::arm_compute::PoolingLayerInfo info{::arm_compute::PoolingType::MAX,
+ ::arm_compute::Size2D{kw, kh},
+ acl_common::asPadStrideInfo(padding, stride)};
+
+ auto fn = nnfw::cpp14::make_unique<::arm_compute::NEPoolingLayer>();
+
+ fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), info);
+
+ auto acl_fn = asAclFunction(std::move(fn));
+
+ _execution_builder->append((std::move(acl_fn)));
+
+ ActivationBuilder{*_execution_builder}.append(activation, ofm_alloc->handle());
+}
+
+void KernelGenerator::visit(const ir::operation::Mean &node)
+{
+ const auto ofm_index{node.getOutputs().at(0)};
+ const auto ifm_index{node.getInputs().at(ir::operation::Mean::Input::INPUT)};
+ const auto &axes{node.param().axes};
+ const auto keep_dims{node.param().keep_dims};
+
+ auto ofm_alloc = _tensor_builder->at(ofm_index).get();
+ auto ifm_alloc = _tensor_builder->at(ifm_index).get();
+ const auto frontend_layout = _current_subg_layout;
+ const auto backend_layout = ifm_alloc->layout();
+
+ // Convert to ACL axes taking into account negative values and possible duplicates.
+ std::set<std::uint32_t> acl_axes;
+ const int ifm_rank = node.param().rank;
+ for (int axis : axes)
+ {
+ if (axis < 0)
+ axis += ifm_rank;
+ acl_axes.insert(
+ acl_common::ToARMComputeAxis(ifm_rank, axis, frontend_layout, backend_layout).value());
+ }
+
+ arm_compute::Coordinates fixed_axis;
+ for (const auto axis : acl_axes)
+ {
+ fixed_axis.set(fixed_axis.num_dimensions(), axis);
+ }
+
+ // NOTE NEReduceMean has a bug: it does not support the NHWC layout
+ // NEReduceMean intermediate tensors are always NCHW layout
+ auto fn = nnfw::cpp14::make_unique<::arm_compute::NEReduceMeanEx>();
+
+ fn->configure(ifm_alloc->handle(), fixed_axis, keep_dims, ofm_alloc->handle());
+
+ auto acl_fn = asAclFunction(std::move(fn));
+
+ _execution_builder->append(std::move(acl_fn));
+}
+
+void KernelGenerator::visit(const ir::operation::AvgPool2D &node)
+{
+ const auto ofm_index{node.getOutputs().at(0)};
+ const auto ifm_index{node.getInputs().at(ir::operation::AvgPool2D::Input::INPUT)};
+
+ const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature(_current_subg_layout);
+ const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature(_current_subg_layout);
+
+ const auto kh = node.param().kh;
+ const auto kw = node.param().kw;
+ const auto stride = node.param().stride;
+ const auto padding =
+ neurun::util::calculatePadding(node.param().padding, ifm_shape, ofm_shape, stride, kw, kh);
+ const auto activation = node.param().activation;
+
+ VERBOSE(AvgPool2D) << "IFM_H: " << ifm_shape.H << std::endl;
+ VERBOSE(AvgPool2D) << "IFM_W: " << ifm_shape.W << std::endl;
+ VERBOSE(AvgPool2D) << "OFM_H: " << ofm_shape.H << std::endl;
+ VERBOSE(AvgPool2D) << "OFM_W: " << ofm_shape.W << std::endl;
+ VERBOSE(AvgPool2D) << "KER_H: " << kh << std::endl;
+ VERBOSE(AvgPool2D) << "KER_W: " << kw << std::endl;
+ VERBOSE(AvgPool2D) << "STRIDE_H: " << stride.vertical << std::endl;
+ VERBOSE(AvgPool2D) << "STRIDE_W: " << stride.horizontal << std::endl;
+ VERBOSE(AvgPool2D) << "PAD(T): " << padding.top << std::endl;
+ VERBOSE(AvgPool2D) << "PAD(B): " << padding.bottom << std::endl;
+ VERBOSE(AvgPool2D) << "PAD(L): " << padding.left << std::endl;
+ VERBOSE(AvgPool2D) << "PAD(R): " << padding.right << std::endl;
+
+ auto ofm_alloc = _tensor_builder->at(ofm_index).get();
+ auto ifm_alloc = _tensor_builder->at(ifm_index).get();
+
+ ::arm_compute::PoolingLayerInfo info{
+ ::arm_compute::PoolingType::AVG, ::arm_compute::Size2D{kw, kh},
+ acl_common::asPadStrideInfo(padding, stride), true /* exclude_padding */};
+
+ auto fn = nnfw::cpp14::make_unique<::arm_compute::NEPoolingLayer>();
+
+ fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), info);
+
+ auto acl_fn = asAclFunction(std::move(fn));
+
+ _execution_builder->append((std::move(acl_fn)));
+
+ ActivationBuilder{*_execution_builder}.append(activation, ofm_alloc->handle());
+}
+
+void KernelGenerator::visit(const ir::operation::Concat &node)
+{
+ const auto ofm_index{node.getOutputs().at(0)};
+
+ std::vector<ir::OperandIndex> input_indexes;
+ for (const auto &input : node.getInputs())
+ input_indexes.emplace_back(input);
+
+ const auto axis = node.param().axis;
+
+ // If the tensor allocator allocated every input as a sub-tensor of the output, the concat is already done
+ bool canEliminate = true;
+ for (auto ifm_ind : input_indexes)
+ {
+ if (!_tensor_builder->isSubTensorOf(ofm_index, ifm_ind))
+ {
+ canEliminate = false;
+ break;
+ }
+ }
+ if (canEliminate)
+ {
+ // If concat eliminated, return a NOP IFunction
+ _execution_builder->append(nnfw::cpp14::make_unique<exec::NopFunction>());
+ return;
+ }
+
+ auto output_alloc = _tensor_builder->at(ofm_index).get();
+ std::vector<::arm_compute::ITensor *> input_tensors;
+ for (const auto &ifm_ind : input_indexes)
+ input_tensors.emplace_back(_tensor_builder->at(ifm_ind)->handle());
+
+ std::unique_ptr<::arm_compute::IFunction> fn;
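+ // A concat with a single input degenerates to a plain copy.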
+ if (input_indexes.size() < 2)
+ {
+ auto l = nnfw::cpp14::make_unique<::arm_compute::NECopy>();
+ l->configure(input_tensors.at(0), output_alloc->handle());
+ fn = std::move(l);
+ }
+ else
+ {
+ auto l = nnfw::cpp14::make_unique<::arm_compute::NEConcatenateLayer>();
+ const auto rank = node.param().rank;
+ const auto frontend_layout = _current_subg_layout;
+ const auto backend_layout = output_alloc->layout();
+ const auto fixed_axis =
+ acl_common::ToARMComputeAxis(rank, axis, frontend_layout, backend_layout).value();
+ l->configure(input_tensors, output_alloc->handle(), fixed_axis);
+ fn = std::move(l);
+ }
+
+ auto acl_fn = asAclFunction(std::move(fn));
+
+ _execution_builder->append(std::move(acl_fn));
+}
+
+void KernelGenerator::visit(const ir::operation::EmbeddingLookup &node)
+{
+ const auto output_index{node.getOutputs().at(0)};
+ const auto lookups_index{node.getInputs().at(ir::operation::EmbeddingLookup::Input::LOOKUPS)};
+ const auto values_index{node.getInputs().at(ir::operation::EmbeddingLookup::Input::VALUES)};
+
+ auto output_alloc = _tensor_builder->at(output_index).get();
+ auto lookups_alloc = _tensor_builder->at(lookups_index).get();
+ auto values_alloc = _tensor_builder->at(values_index).get();
+
+ auto fn = nnfw::cpp14::make_unique<::arm_compute::NEEmbeddingLookup>();
+
+ fn->configure(values_alloc->handle(), output_alloc->handle(), lookups_alloc->handle());
+
+ auto acl_fn = asAclFunction(std::move(fn));
+
+ _execution_builder->append(std::move(acl_fn));
+}
+
+void KernelGenerator::visit(const ir::operation::Floor &node)
+{
+ const auto ofm_index{node.getOutputs().at(0)};
+ const auto ifm_index{node.getInputs().at(ir::operation::Floor::Input::INPUT)};
+
+ auto ofm_alloc = _tensor_builder->at(ofm_index).get();
+ auto ifm_alloc = _tensor_builder->at(ifm_index).get();
+
+ auto fn = nnfw::cpp14::make_unique<::arm_compute::NEFloor>();
+
+ fn->configure(ifm_alloc->handle(), ofm_alloc->handle());
+
+ auto acl_fn = asAclFunction(std::move(fn));
+
+ _execution_builder->append(std::move(acl_fn));
+}
+
+void KernelGenerator::visit(const ir::operation::FullyConnected &node)
+{
+ using ir::operation::FullyConnected;
+
+ const auto output_index{node.getOutputs().at(0)};
+ const auto input_index{node.getInputs().at(FullyConnected::Input::INPUT)};
+ const auto weight_index{node.getInputs().at(FullyConnected::Input::WEIGHT)};
+ const auto bias_index{node.getInputs().at(FullyConnected::Input::BIAS)};
+
+ const auto input_rank = _ctx.at(input_index).shape().rank();
+
+ const auto output_size =
+ _ctx.at(output_index).shape().dim(_ctx.at(output_index).shape().rank() - 1);
+ UNUSED_RELEASE(output_size);
+ assert(_ctx.at(bias_index).shape().dim(0) == output_size);
+ assert(_ctx.at(weight_index).shape().dim(0) == output_size);
+ const auto batch_size =
+ _ctx.at(output_index).shape().dim(_ctx.at(output_index).shape().rank() - 2);
+ const auto input_size =
+ _ctx.at(weight_index).shape().dim(_ctx.at(weight_index).shape().rank() - 1);
+
+ // Check whether the input needs to be reshaped into rank-2
+ bool needs_reshape = false;
+ ir::Shape reshape(2);
+ if (input_rank == 3 || input_rank == 4)
+ {
+ const auto &ifm_shape = _ctx.at(input_index).shape();
+ auto feature_size = 1;
+ for (int i = 0; i < ifm_shape.rank(); ++i)
+ {
+ feature_size *= ifm_shape.dim(i);
+ }
+
+ UNUSED_RELEASE(feature_size);
+ assert(feature_size == batch_size * input_size);
+
+ // Flatten the input to a rank-2 [batch_size, input_size] shape for the FC layer
+ needs_reshape = true;
+ reshape.dim(0) = batch_size; /* H */
+ reshape.dim(1) = input_size; /* W */
+ }
+
+ const auto activation = node.param().activation;
+
+ auto output_alloc = _tensor_builder->at(output_index).get();
+ const auto input_alloc = _tensor_builder->at(input_index).get();
+ const auto weight_alloc = _tensor_builder->at(weight_index).get();
+ const auto bias_alloc = _tensor_builder->at(bias_index).get();
+ const auto frontend_layout = _current_subg_layout;
+ const auto acl_layout = output_alloc->handle()->info()->data_layout();
+
+ auto fn = nnfw::cpp14::make_unique<arm_compute::NEFullyConnectedReshapingLayer>(
+ _tensor_builder->acl_tensor_manager()->internal_buffer_manager());
+
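+ // Constant weights can be preprocessed ahead of time; non-constant weights use the general kernel.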
+ arm_compute::NEFullyConnectedReshapingLayer::KernelType kernel_type =
+ _ctx.at(weight_index).isConstant()
+ ? arm_compute::NEFullyConnectedReshapingLayer::KernelType::PREPROCESSED_WEIGHTS
+ : arm_compute::NEFullyConnectedReshapingLayer::KernelType::GENERAL;
+
+ fn->configure(
+ input_alloc->handle(), weight_alloc->handle(), bias_alloc->handle(), output_alloc->handle(),
+ needs_reshape,
+ ::neurun::backend::acl_common::asTensorShape(
+ reshape, frontend_layout, ::neurun::backend::acl_common::asRuntimeLayout(acl_layout)),
+ kernel_type);
+
+ auto acl_fn = asAclFunction(std::move(fn));
+
+ _execution_builder->append(std::move(acl_fn));
+
+ ActivationBuilder{*_execution_builder}.append(activation, output_alloc->handle());
+}
+
+void KernelGenerator::visit(const ir::operation::HashtableLookup &node)
+{
+ const auto output_index{node.getOutputs().at(ir::operation::HashtableLookup::Output::OUTPUT)};
+ const auto hits_index{node.getOutputs().at(ir::operation::HashtableLookup::Output::HITS)};
+
+ const auto lookups_index{node.getInputs().at(ir::operation::HashtableLookup::Input::LOOKUPS)};
+ const auto keys_index{node.getInputs().at(ir::operation::HashtableLookup::Input::KEYS)};
+ const auto values_index{node.getInputs().at(ir::operation::HashtableLookup::Input::VALUES)};
+
+ auto output_alloc = _tensor_builder->at(output_index).get();
+ auto hits_alloc = _tensor_builder->at(hits_index).get();
+
+ auto lookups_alloc = _tensor_builder->at(lookups_index).get();
+ auto keys_alloc = _tensor_builder->at(keys_index).get();
+ auto values_alloc = _tensor_builder->at(values_index).get();
+
+ auto fn = nnfw::cpp14::make_unique<::arm_compute::NEHashtableLookup>();
+
+ fn->configure(lookups_alloc->handle(), keys_alloc->handle(), values_alloc->handle(),
+ output_alloc->handle(), hits_alloc->handle());
+
+ auto acl_fn = asAclFunction(std::move(fn));
+
+ _execution_builder->append(std::move(acl_fn));
+}
+
+void KernelGenerator::visit(const ir::operation::Gather &node)
+{
+ const auto ofm_index{node.getOutputs().at(0)};
+
+ const auto ifm_index{node.getInputs().at(ir::operation::Gather::Input::INPUT)};
+ const auto indices_index{node.getInputs().at(ir::operation::Gather::Input::INDICES)};
+
+ const auto ifm_shape = _ctx.at(ifm_index).shape();
+
+ const auto ifm_rank = node.param().rank;
+ const auto axis_raw = node.param().axis;
+ const auto axis_value = (axis_raw < 0 ? (ifm_rank + axis_raw) : axis_raw);
+ // Converting in reverse order
+ const int axis = ::neurun::backend::acl_common::ToARMComputeAxis(ifm_rank, axis_value).value();
+
+ auto ofm_alloc = _tensor_builder->at(ofm_index).get();
+ auto ifm_alloc = _tensor_builder->at(ifm_index).get();
+ auto indices_alloc = _tensor_builder->at(indices_index).get();
+ const auto backend_layout = ofm_alloc->layout();
+ UNUSED_RELEASE(backend_layout);
+
+ // NOTE The frontend layout and backend layout must be the same for this operation.
+ // If not the same, we have to add a stage(?) to perform permutation of output tensor. It
+ // is not efficient even if it works well. If so, it would be better to set the
+ // layout of these backend tensors to the same layout.
+ // There is also one thing we have to think about. This operation depends on the layout of
+ // a model. For example, if a model in NHWC has this operation as output rank == 4, indices
+ // rank == 2 and axis == 2, this operation should work as the axis W and C, but the axis W
+ // and C are not sequential in NCHW. So the backend in NCHW cannot handle this case.
+ assert(backend_layout == ifm_alloc->layout());
+ assert(backend_layout == indices_alloc->layout());
+ assert(ifm_rank < 4 || _current_subg_layout == backend_layout);
+
+ auto fn = nnfw::cpp14::make_unique<::arm_compute::NEGatherEx>();
+
+ fn->configure(ifm_alloc->handle(), indices_alloc->handle(), ofm_alloc->handle(), axis);
+
+ auto acl_fn = asAclFunction(std::move(fn));
+
+ _execution_builder->append(std::move(acl_fn));
+}
+
+void KernelGenerator::visit(const ir::operation::InstanceNorm &node)
+{
+ const auto ofm_index{node.getOutputs().at(0)};
+ const auto ifm_index{node.getInputs().at(ir::operation::InstanceNorm::Input::INPUT)};
+ const auto gamma_index{node.getInputs().at(ir::operation::InstanceNorm::Input::GAMMA)};
+ const auto beta_index{node.getInputs().at(ir::operation::InstanceNorm::Input::BETA)};
+
+ auto ofm_alloc = _tensor_builder->at(ofm_index).get();
+ auto ifm_alloc = _tensor_builder->at(ifm_index).get();
+ auto gamma_alloc = _tensor_builder->at(gamma_index).get();
+ auto beta_alloc = _tensor_builder->at(beta_index).get();
+ auto epsilon = node.param().epsilon;
+ auto activation = node.param().activation;
+
+ auto fn = nnfw::cpp14::make_unique<::arm_compute::NEInstanceNormalizationLayerEx>();
+
+ fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), gamma_alloc->handle(),
+ beta_alloc->handle(), epsilon);
+
+ auto acl_fn = asAclFunction(std::move(fn));
+
+ _execution_builder->append(std::move(acl_fn));
+
+ ActivationBuilder{*_execution_builder}.append(activation, ofm_alloc->handle());
+}
+
+void KernelGenerator::visit(const ir::operation::L2Normalization &node)
+{
+ const auto ofm_index{node.getOutputs().at(0)};
+ const auto ifm_index{node.getInputs().at(ir::operation::L2Normalization::Input::INPUT)};
+
+ // {CL|Neon}L2Normalization performs the reduction only along dimension 0
+ // L2 Normalization always performs the reduction along the depth axis
+ // Thus, we repurpose {CL|Neon}NormalizationLayers to act as depthwise L2 normalizations by
+ // choosing normalization parameters as below
+
+ const auto &ifm_shape = _ctx.at(ifm_index).shape();
+ // TODO Support optional constant dimension that normalization would be performed on
+ const auto normalization_axis = node.param().rank - 1;
+ int32_t radius =
+ 2 * ifm_shape.dim(normalization_axis) + 1; // normSize = depth(last dimension) * 2 + 1
+ float alpha = 1.0f; // In the implementation to make alpha_ become 1
+ float beta = 0.5f; // pow(reduction, -0.5) = 1 / sqrt(reduction)
+ float bias = 0.0f; // Don't offset the reduction.
+
+ auto ofm_alloc = _tensor_builder->at(ofm_index).get();
+ auto ifm_alloc = _tensor_builder->at(ifm_index).get();
+
+ const auto norm_info = ::arm_compute::NormalizationLayerInfo(::arm_compute::NormType::CROSS_MAP,
+ radius, alpha, beta, bias, false);
+
+ auto fn = nnfw::cpp14::make_unique<::arm_compute::NENormalizationLayer>();
+
+ fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), norm_info);
+
+ auto acl_fn = asAclFunction(std::move(fn));
+
+ _execution_builder->append(std::move(acl_fn));
+}
+
+void KernelGenerator::visit(const ir::operation::L2Pool2D &node)
+{
+ const auto ofm_index{node.getOutputs().at(0)};
+ const auto ifm_index{node.getInputs().at(ir::operation::L2Pool2D::Input::INPUT)};
+
+ const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature(_current_subg_layout);
+ const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature(_current_subg_layout);
+
+ uint32_t kw = node.param().kw;
+ uint32_t kh = node.param().kh;
+ const auto stride = node.param().stride;
+ const auto padding =
+ neurun::util::calculatePadding(node.param().padding, ifm_shape, ofm_shape, stride, kw, kh);
+ const auto activation = node.param().activation;
+
+ auto ofm_alloc = _tensor_builder->at(ofm_index).get();
+ auto ifm_alloc = _tensor_builder->at(ifm_index).get();
+
+ ::arm_compute::PoolingLayerInfo info{
+ ::arm_compute::PoolingType::L2, ::arm_compute::Size2D{kw, kh},
+ ::neurun::backend::acl_common::asPadStrideInfo(padding, stride)};
+
+ auto fn = nnfw::cpp14::make_unique<::arm_compute::NEPoolingLayer>();
+
+ fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), info);
+
+ auto acl_fn = asAclFunction(std::move(fn));
+
+ _execution_builder->append(std::move(acl_fn));
+
+ ActivationBuilder{*_execution_builder}.append(activation, ofm_alloc->handle());
+}
+
+void KernelGenerator::visit(const ir::operation::LocalResponseNormalization &node)
+{
+ const auto ofm_index{node.getOutputs().at(0)};
+ const auto ifm_index{
+ node.getInputs().at(ir::operation::LocalResponseNormalization::Input::INPUT)};
+
+ auto radius = node.param().radius;
+ auto alpha = node.param().alpha;
+ auto beta = node.param().beta;
+ auto bias = node.param().bias;
+
+ auto ofm_alloc = _tensor_builder->at(ofm_index).get();
+ auto ifm_alloc = _tensor_builder->at(ifm_index).get();
+
+ const auto norm_info = ::arm_compute::NormalizationLayerInfo(
+ ::arm_compute::NormType::CROSS_MAP, radius * 2 + 1, alpha, beta, bias, false);
+
+ auto fn = nnfw::cpp14::make_unique<::arm_compute::NENormalizationLayer>();
+
+ fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), norm_info);
+
+ auto acl_fn = asAclFunction(std::move(fn));
+
+ _execution_builder->append(std::move(acl_fn));
+}
+
+void KernelGenerator::visit(const ir::operation::LogicalAnd &node)
+{
+ const auto output_index{node.getOutputs().at(0)};
+ const auto input0_index{node.getInputs().at(ir::operation::LogicalAnd::Input::INPUT0)};
+ const auto input1_index{node.getInputs().at(ir::operation::LogicalAnd::Input::INPUT1)};
+
+ auto output_alloc = _tensor_builder->at(output_index).get();
+ auto input0_alloc = _tensor_builder->at(input0_index).get();
+ auto input1_alloc = _tensor_builder->at(input1_index).get();
+
+ auto fn = nnfw::cpp14::make_unique<::arm_compute::NELogicalAnd>();
+
+ fn->configure(input0_alloc->handle(), input1_alloc->handle(), output_alloc->handle());
+
+ auto acl_fn = asAclFunction(std::move(fn));
+
+ _execution_builder->append(std::move(acl_fn));
+}
+
+void KernelGenerator::visit(const ir::operation::LogicalNot &node)
+{
+ const auto output_index{node.getOutputs().at(0)};
+ const auto input_index{node.getInputs().at(ir::operation::LogicalNot::Input::INPUT)};
+
+ auto output_alloc = _tensor_builder->at(output_index).get();
+ auto input_alloc = _tensor_builder->at(input_index).get();
+
+ auto fn = nnfw::cpp14::make_unique<::arm_compute::NEBitwiseNot>();
+
+ fn->configure(input_alloc->handle(), output_alloc->handle());
+
+ auto acl_fn = asAclFunction(std::move(fn));
+
+ _execution_builder->append(std::move(acl_fn));
+}
+
+void KernelGenerator::visit(const ir::operation::LogicalOr &node)
+{
+ const auto output_index{node.getOutputs().at(0)};
+ const auto input0_index{node.getInputs().at(ir::operation::LogicalOr::Input::INPUT0)};
+ const auto input1_index{node.getInputs().at(ir::operation::LogicalOr::Input::INPUT1)};
+
+ auto output_alloc = _tensor_builder->at(output_index).get();
+ auto input0_alloc = _tensor_builder->at(input0_index).get();
+ auto input1_alloc = _tensor_builder->at(input1_index).get();
+
+ auto fn = nnfw::cpp14::make_unique<::arm_compute::NELogicalOr>();
+
+ fn->configure(input0_alloc->handle(), input1_alloc->handle(), output_alloc->handle());
+
+ auto acl_fn = asAclFunction(std::move(fn));
+
+ _execution_builder->append(std::move(acl_fn));
+}
+
+void KernelGenerator::visit(const ir::operation::Logistic &node)
+{
+ const auto ofm_index{node.getOutputs().at(0)};
+ const auto ifm_index{node.getInputs().at(ir::operation::Logistic::Input::INPUT)};
+
+ auto ofm_alloc = _tensor_builder->at(ofm_index).get();
+ auto ifm_alloc = _tensor_builder->at(ifm_index).get();
+
+ const ::arm_compute::ActivationLayerInfo act_info{
+ ::arm_compute::ActivationLayerInfo::ActivationFunction::LOGISTIC};
+
+ auto fn = nnfw::cpp14::make_unique<::arm_compute::NEActivationLayer>();
+
+ fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), act_info);
+
+ auto acl_fn = asAclFunction(std::move(fn));
+
+ _execution_builder->append(std::move(acl_fn));
+}
+
+void KernelGenerator::visit(const ir::operation::LSTM &node)
+{
+ // TODO Support dynamic rnn
+ // TODO Fix subtle error in the case of non-CIFG, non-peephole and No Projection.
+ const auto scratch_buffer_index{
+ node.getOutputs().at(ir::operation::LSTM::Output::SCRATCH_BUFFER)};
+ const auto output_state_out_index{
+ node.getOutputs().at(ir::operation::LSTM::Output::OUTPUT_STATE_OUT)};
+ const auto cell_state_out_index{
+ node.getOutputs().at(ir::operation::LSTM::Output::CELL_STATE_OUT)};
+ const auto output_index{node.getOutputs().at(ir::operation::LSTM::Output::OUTPUT)};
+
+ const auto input_index{node.getInputs().at(ir::operation::LSTM::Input::INPUT)};
+ const auto input_to_input_weights_index{
+ node.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_INPUT_WEIGHTS)}; // optional
+ const auto input_to_forget_weights_index{
+ node.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_FORGET_WEIGHTS)};
+ const auto input_to_cell_weights_index{
+ node.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_CELL_WEIGHTS)};
+ const auto input_to_output_weights_index{
+ node.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_OUTPUT_WEIGHTS)};
+ const auto recurrent_to_input_weights_index{
+ node.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_INPUT_WEIGHTS)}; // optional
+ const auto recurrent_to_forget_weights_index{
+ node.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_FORGET_WEIGHTS)};
+ const auto recurrent_to_cell_weights_index{
+ node.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_CELL_WEIGHTS)};
+ const auto recurrent_to_output_weights_index{
+ node.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_OUTPUT_WEIGHTS)};
+ const auto cell_to_input_weights_index{
+ node.getInputs().at(ir::operation::LSTM::Input::CELL_TO_INPUT_WEIGHTS)}; // optional
+ const auto cell_to_forget_weights_index{
+ node.getInputs().at(ir::operation::LSTM::Input::CELL_TO_FORGET_WEIGHTS)}; // optional
+ const auto cell_to_output_weights_index{
+ node.getInputs().at(ir::operation::LSTM::Input::CELL_TO_OUTPUT_WEIGHTS)}; // optional
+ const auto input_gate_bias_index{
+ node.getInputs().at(ir::operation::LSTM::Input::INPUT_GATE_BIAS)};
+ const auto forget_gate_bias_index{
+ node.getInputs().at(ir::operation::LSTM::Input::FORGET_GATE_BIAS)};
+ const auto cell_bias_index{node.getInputs().at(ir::operation::LSTM::Input::CELL_BIAS)};
+ const auto output_gate_bias_index{
+ node.getInputs().at(ir::operation::LSTM::Input::OUTPUT_GATE_BIAS)};
+ const auto projection_weights_index{
+ node.getInputs().at(ir::operation::LSTM::Input::PROJECTION_WEIGHTS)}; // optional
+ const auto projection_bias_index{
+ node.getInputs().at(ir::operation::LSTM::Input::PROJECTION_BIAS)}; // optional
+ const auto output_state_in_index{
+ node.getInputs().at(ir::operation::LSTM::Input::OUTPUT_STATE_IN)};
+ const auto cell_state_in_index{node.getInputs().at(ir::operation::LSTM::Input::CELL_STATE_IN)};
+ const auto cell_threshold = node.param().cell_threshold;
+ const auto projection_threshold = node.param().projection_threshold;
+
+ bool has_input_to_input_weights = _ctx.at(input_to_input_weights_index).shape().dim(0) != 0 &&
+ _ctx.at(input_to_input_weights_index).shape().dim(1) != 0;
+ bool has_recurrent_to_input_weights =
+ _ctx.at(recurrent_to_input_weights_index).shape().dim(0) != 0 &&
+ _ctx.at(recurrent_to_input_weights_index).shape().dim(1) != 0;
+ bool has_cell_to_forget_weights = _ctx.at(cell_to_forget_weights_index).shape().dim(0) != 0;
+ bool has_cell_to_output_weights = _ctx.at(cell_to_output_weights_index).shape().dim(0) != 0;
+ bool has_projection_weights = _ctx.at(projection_weights_index).shape().dim(0) != 0 &&
+ _ctx.at(projection_weights_index).shape().dim(1) != 0;
+ bool has_projection_bias = _ctx.at(projection_bias_index).shape().dim(0) != 0;
+
+ // NOTE input_to_input_weights and recurrent_to_input_weights do not exist in CIFG mode.
+ // true: non-CIFG
+ // false: CIFG
+ // NOTE cell_to_input_weights does not exist without peephole connections, even for a regular
+ // (non-CIFG) LSTM.
+ bool has_cifg_param = has_input_to_input_weights && has_recurrent_to_input_weights;
+
+ // NOTE cell_to_forget_weights and cell_to_output_weights exist when peephole connections are
+ // used, but cell_to_input_weights does not exist for a CIFG LSTM even with peephole connections.
+ // true: peephole
+ // false: no peephole
+ bool has_peephole_param = has_cell_to_forget_weights && has_cell_to_output_weights;
+
+ // NOTE Even if the projection weights have data, the projection bias may not.
+ bool has_projection_param = has_projection_weights;
+
+ const auto activation = node.param().activation;
+ const auto cell_clip = cell_threshold;
+ const auto projection_clip = projection_threshold;
+ assert(cell_clip >= 0.f && projection_clip >= 0.f);
+
+ auto scratch_buffer_alloc = _tensor_builder->at(scratch_buffer_index).get();
+ auto output_state_out_alloc = _tensor_builder->at(output_state_out_index).get();
+ auto cell_state_out_alloc = _tensor_builder->at(cell_state_out_index).get();
+ auto output_alloc = _tensor_builder->at(output_index).get();
+
+ auto input_alloc = _tensor_builder->at(input_index).get();
+
+ auto input_to_forget_weights_alloc = _tensor_builder->at(input_to_forget_weights_index).get();
+ auto input_to_cell_weights_alloc = _tensor_builder->at(input_to_cell_weights_index).get();
+ auto input_to_output_weights_alloc = _tensor_builder->at(input_to_output_weights_index).get();
+ auto recurrent_to_forget_weights_alloc =
+ _tensor_builder->at(recurrent_to_forget_weights_index).get();
+ auto recurrent_to_cell_weights_alloc = _tensor_builder->at(recurrent_to_cell_weights_index).get();
+ auto recurrent_to_output_weights_alloc =
+ _tensor_builder->at(recurrent_to_output_weights_index).get();
+
+ auto forget_gate_bias_alloc = _tensor_builder->at(forget_gate_bias_index).get();
+ auto cell_bias_alloc = _tensor_builder->at(cell_bias_index).get();
+ auto output_gate_bias_alloc = _tensor_builder->at(output_gate_bias_index).get();
+ auto output_state_in_alloc = _tensor_builder->at(output_state_in_index).get();
+ auto cell_state_in_alloc = _tensor_builder->at(cell_state_in_index).get();
+
+ auto act_info = ::neurun::backend::acl_common::asActivationLayerInfo(activation);
+
+ auto fn = nnfw::cpp14::make_unique<::arm_compute::NELSTMLayer>();
+
+ ::arm_compute::LSTMParams<::arm_compute::ITensor> lstm_params{};
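+ // Attach the optional CIFG, peephole, and projection parameter sets only when the
+ // corresponding operands carry data; NELSTMLayer uses the presence of each set to decide
+ // whether to run the CIFG, peephole, and projection paths.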
+ if (has_cifg_param)
+ {
+ auto input_to_input_weights_alloc =
+ _tensor_builder->at(input_to_input_weights_index).get(); // optional
+ auto recurrent_to_input_weights_alloc =
+ _tensor_builder->at(recurrent_to_input_weights_index).get(); // optional
+ auto cell_to_input_weights_handle =
+ has_peephole_param ? _tensor_builder->at(cell_to_input_weights_index).get()->handle()
+ : nullptr; // optional (non-cifg && peephole)
+ auto input_gate_bias_alloc = _tensor_builder->at(input_gate_bias_index).get(); // optional
+ lstm_params.set_cifg_params(input_to_input_weights_alloc->handle(),
+ recurrent_to_input_weights_alloc->handle(),
+ cell_to_input_weights_handle, input_gate_bias_alloc->handle());
+ }
+ if (has_peephole_param)
+ {
+ auto cell_to_forget_weights_alloc =
+ _tensor_builder->at(cell_to_forget_weights_index).get(); // optional
+ auto cell_to_output_weights_alloc =
+ _tensor_builder->at(cell_to_output_weights_index).get(); // optional
+ lstm_params.set_peephole_params(cell_to_forget_weights_alloc->handle(),
+ cell_to_output_weights_alloc->handle());
+ }
+ if (has_projection_param)
+ {
+ auto projection_weights_alloc = _tensor_builder->at(projection_weights_index).get(); // optional
+ auto projection_bias_handle = has_projection_bias
+ ? _tensor_builder->at(projection_bias_index).get()->handle()
+ : nullptr; // optional
+ lstm_params.set_projection_params(projection_weights_alloc->handle(), projection_bias_handle);
+ }
+
+ fn->configure(
+ input_alloc->handle(), input_to_forget_weights_alloc->handle(),
+ input_to_cell_weights_alloc->handle(), input_to_output_weights_alloc->handle(),
+ recurrent_to_forget_weights_alloc->handle(), recurrent_to_cell_weights_alloc->handle(),
+ recurrent_to_output_weights_alloc->handle(), forget_gate_bias_alloc->handle(),
+ cell_bias_alloc->handle(), output_gate_bias_alloc->handle(), output_state_in_alloc->handle(),
+ cell_state_in_alloc->handle(), scratch_buffer_alloc->handle(),
+ output_state_out_alloc->handle(), cell_state_out_alloc->handle(), output_alloc->handle(),
+ lstm_params, act_info, cell_clip, projection_clip);
+
+ auto acl_fn = asAclFunction(std::move(fn));
+
+ _execution_builder->append(std::move(acl_fn));
+}
+
+void KernelGenerator::visit(const ir::operation::Mul &node)
+{
+ const auto ofm_index{node.getOutputs().at(0)};
+ const auto lhs_index{node.getInputs().at(ir::operation::Mul::Input::LHS)};
+ const auto rhs_index{node.getInputs().at(ir::operation::Mul::Input::RHS)};
+
+ const auto activation = node.param().activation;
+
+ auto ofm_alloc = _tensor_builder->at(ofm_index).get();
+ auto lhs_alloc = _tensor_builder->at(lhs_index).get();
+ auto rhs_alloc = _tensor_builder->at(rhs_index).get();
+
+ auto fn = nnfw::cpp14::make_unique<::arm_compute::NEPixelWiseMultiplication>();
+
+ // For scale 1.0, RoundingPolicy::TO_ZERO is the only rounding policy allowed.
+ fn->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle(), 1.0, // scale
+ arm_compute::ConvertPolicy::SATURATE, arm_compute::RoundingPolicy::TO_ZERO);
+
+ auto acl_fn = asAclFunction(std::move(fn));
+
+ _execution_builder->append(std::move(acl_fn));
+
+ ActivationBuilder{*_execution_builder}.append(activation, ofm_alloc->handle());
+}
+
+void KernelGenerator::visit(const ir::operation::Neg &node)
+{
+ const auto ofm_index{node.getOutputs().at(0)};
+ const auto ifm_index{node.getInputs().at(ir::operation::Neg::Input::INPUT)};
+
+ auto ofm_alloc = _tensor_builder->at(ofm_index).get();
+ auto ifm_alloc = _tensor_builder->at(ifm_index).get();
+
+ auto fn = nnfw::cpp14::make_unique<::arm_compute::NENegLayer>();
+
+ fn->configure(ifm_alloc->handle(), ofm_alloc->handle());
+
+ auto acl_fn = asAclFunction(std::move(fn));
+
+ _execution_builder->append(std::move(acl_fn));
+}
+
+void KernelGenerator::visit(const ir::operation::Pack &node)
+{
+ const auto output_index{node.getOutputs().at(0)};
+ auto axis{node.param().axis};
+
+ const auto output_rank = node.param().rank;
+
+ std::vector<ir::OperandIndex> input_indexes;
+ for (const auto &input_index : node.getInputs())
+ input_indexes.emplace_back(input_index);
+
+ auto output = _tensor_builder->at(output_index).get()->handle();
+ std::vector<arm_compute::ITensor *> inputs;
+ for (const auto &input_index : input_indexes)
+ inputs.emplace_back(_tensor_builder->at(input_index)->handle());
+
+ const auto frontend_layout = _current_subg_layout;
+ const auto backend_layout = _tensor_builder->at(output_index).get()->layout();
+
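+ // Normalize a negative axis and convert it to the ACL axis for the backend layout.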
+ if (axis < 0)
+ axis += output_rank;
+ axis = acl_common::ToARMComputeAxis(output_rank, axis, frontend_layout, backend_layout).value();
+
+ auto fn = nnfw::cpp14::make_unique<::arm_compute::NEStackLayer>();
+
+ fn->configure(inputs, axis, output);
+
+ _execution_builder->append(asAclFunction(std::move(fn)));
+}
+
+void KernelGenerator::visit(const ir::operation::Pad &node)
+{
+ const auto input_index{node.getInputs().at(ir::operation::Pad::Input::INPUT)};
+ const auto pad_index{node.getInputs().at(ir::operation::Pad::Input::PAD)};
+ const auto output_index{node.getOutputs().at(0)};
+ assert(_ctx.at(pad_index).isConstant());
+
+ auto rank = node.param().rank;
+ auto pad_base = _ctx.at(pad_index).data().base();
+
+ auto input = _tensor_builder->at(input_index).get()->handle();
+ auto output = _tensor_builder->at(output_index).get()->handle();
+
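+ // The constant PAD operand holds a (front, back) padding pair per dimension; remap each
+ // pair to the corresponding ACL axis of the backend layout.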
+ ::arm_compute::PaddingList padding_list;
+ padding_list.resize(rank);
+ for (int32_t n = 0; n < rank; ++n)
+ {
+ const int32_t *from = reinterpret_cast<const int32_t *>(pad_base) + (n * 2);
+
+ const auto frontend_layout = _current_subg_layout;
+ const auto backend_layout = _tensor_builder->at(input_index).get()->layout();
+ const auto axis =
+ acl_common::ToARMComputeAxis(rank, n, frontend_layout, backend_layout).value();
+ padding_list[axis] = ::arm_compute::PaddingInfo{from[0], from[1]};
+ }
+
+ const auto input_type = _ctx.at(input_index).typeInfo();
+ UNUSED_RELEASE(input_type);
+ assert(input->info()->data_type() == acl_common::asDataType(input_type.type()));
+ assert(input->info()->quantization_info() ==
+ ::arm_compute::QuantizationInfo(input_type.scale(), input_type.offset()));
+ const auto pixel_value =
+ ::arm_compute::PixelValue(0, input->info()->data_type(), input->info()->quantization_info());
+
+ auto fn = nnfw::cpp14::make_unique<::arm_compute::NEPadLayer>();
+ fn->configure(input, output, padding_list, pixel_value);
+
+ _execution_builder->append(asAclFunction(std::move(fn)));
+}
+
+void KernelGenerator::visit(const ir::operation::Permute &node)
+{
+ const auto ofm_idx{node.getOutputs().at(0)};
+ const auto ifm_idx{node.getInputs().at(0)};
+ const auto permute_type = node.getPermuteType();
+ auto ofm_alloc = _tensor_builder->at(ofm_idx).get();
+ auto ifm_alloc = _tensor_builder->at(ifm_idx).get();
+ const auto rank = _ctx.at(ofm_idx).shape().rank();
+ assert(_ctx.at(ifm_idx).shape().rank() == _ctx.at(ofm_idx).shape().rank());
+
+ std::unique_ptr<::arm_compute::IFunction> fn;
+ arm_compute::PermutationVector pv;
+ if (permute_type == ir::operation::Permute::Type::NCHW_TO_NHWC && rank == 4)
+ {
+ // WHCN -> CWHN
+ pv = arm_compute::PermutationVector{2, 0, 1};
+
+ auto l = nnfw::cpp14::make_unique<::arm_compute::NEPermute>();
+
+ l->configure(ifm_alloc->handle(), ofm_alloc->handle(), pv);
+
+ fn = std::move(l);
+ }
+ else if (permute_type == ir::operation::Permute::Type::NHWC_TO_NCHW && rank == 4)
+ {
+ // CWHN -> WHCN
+ pv = arm_compute::PermutationVector{1, 2, 0};
+
+ auto l = nnfw::cpp14::make_unique<::arm_compute::NEPermute>();
+
+ l->configure(ifm_alloc->handle(), ofm_alloc->handle(), pv);
+
+ fn = std::move(l);
+ }
+ else
+ {
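+ // COPY permute type (or non-4D tensors): no axis reordering is required, so just copy.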
+ auto l = nnfw::cpp14::make_unique<::arm_compute::NECopy>();
+
+ l->configure(ifm_alloc->handle(), ofm_alloc->handle());
+
+ fn = std::move(l);
+ }
+
+ auto acl_fn = asAclFunction(std::move(fn));
+
+ _execution_builder->append(std::move(acl_fn));
+}
+
+void KernelGenerator::visit(const ir::operation::PReLU &node)
+{
+ const auto ofm_index{node.getOutputs().at(0)};
+ const auto ifm_index{node.getInputs().at(ir::operation::PReLU::Input::INPUT)};
+ const auto alpha_index{node.getInputs().at(ir::operation::PReLU::Input::ALPHA)};
+
+ auto ofm_alloc = _tensor_builder->at(ofm_index).get();
+ auto ifm_alloc = _tensor_builder->at(ifm_index).get();
+ auto alpha_alloc = _tensor_builder->at(alpha_index).get();
+
+ auto fn = nnfw::cpp14::make_unique<::arm_compute::NEPReLU>();
+
+ fn->configure(ifm_alloc->handle(), alpha_alloc->handle(), ofm_alloc->handle());
+
+ auto acl_fn = asAclFunction(std::move(fn));
+
+ _execution_builder->append(std::move(acl_fn));
+}
+
+void KernelGenerator::visit(const ir::operation::ReduceMax &node)
+{
+ const auto ofm_index{node.getOutputs().at(0)};
+ const auto ifm_index{node.getInputs().at(ir::operation::ReduceMax::Input::INPUT)};
+ const auto &axes{node.param().axes};
+
+ auto ofm_alloc = _tensor_builder->at(ofm_index).get();
+ auto ifm_alloc = _tensor_builder->at(ifm_index).get();
+ const auto frontend_layout = _current_subg_layout;
+ const auto backend_layout = ifm_alloc->layout();
+
+ // Convert to ACL axes taking into account negative values and possible duplicates.
+ std::set<std::uint32_t> acl_axes;
+ const int ifm_rank = node.param().rank;
+ for (int axis : axes)
+ {
+ if (axis < 0)
+ axis += ifm_rank;
+ acl_axes.insert(
+ acl_common::ToARMComputeAxis(ifm_rank, axis, frontend_layout, backend_layout).value());
+ }
+
+ arm_compute::Coordinates reduce_axes;
+ for (const auto axis : acl_axes)
+ {
+ reduce_axes.set(reduce_axes.num_dimensions(), axis);
+ }
+
+ auto fn = nnfw::cpp14::make_unique<::arm_compute::NEReduceOperation>();
+
+ fn->configure(ifm_alloc->handle(), reduce_axes, false, ofm_alloc->handle(),
+ ::arm_compute::ReduceOperation::MAX);
+
+ auto acl_fn = asAclFunction(std::move(fn));
+
+ _execution_builder->append(std::move(acl_fn));
+}
+
+void KernelGenerator::visit(const ir::operation::ReduceMin &node)
+{
+ const auto ofm_index{node.getOutputs().at(0)};
+ const auto ifm_index{node.getInputs().at(ir::operation::ReduceMin::Input::INPUT)};
+ const auto &axes{node.param().axes};
+
+ auto ofm_alloc = _tensor_builder->at(ofm_index).get();
+ auto ifm_alloc = _tensor_builder->at(ifm_index).get();
+ const auto frontend_layout = _current_subg_layout;
+ const auto backend_layout = ifm_alloc->layout();
+
+ // Convert to ACL axes taking into account negative values and possible duplicates.
+ std::set<std::uint32_t> acl_axes;
+ const int ifm_rank = node.param().rank;
+ for (int axis : axes)
+ {
+ if (axis < 0)
+ axis += ifm_rank;
+ acl_axes.insert(
+ acl_common::ToARMComputeAxis(ifm_rank, axis, frontend_layout, backend_layout).value());
+ }
+
+ arm_compute::Coordinates reduce_axes;
+ for (const auto axis : acl_axes)
+ {
+ reduce_axes.set(reduce_axes.num_dimensions(), axis);
+ }
+
+ auto fn = nnfw::cpp14::make_unique<::arm_compute::NEReduceOperation>();
+
+ fn->configure(ifm_alloc->handle(), reduce_axes, false, ofm_alloc->handle(),
+ ::arm_compute::ReduceOperation::MIN);
+
+ auto acl_fn = asAclFunction(std::move(fn));
+
+ _execution_builder->append(std::move(acl_fn));
+}
+
+void KernelGenerator::visit(const ir::operation::ReduceSum &node)
+{
+ const auto output_index{node.getOutputs().at(0)};
+ const auto input_index{node.getInputs().at(ir::operation::ReduceSum::Input::INPUT)};
+ const auto &axes{node.param().axes};
+
+ auto output_alloc = _tensor_builder->at(output_index).get();
+ auto input_alloc = _tensor_builder->at(input_index).get();
+ const auto frontend_layout = _current_subg_layout;
+ const auto backend_layout = input_alloc->layout();
+
+ // Convert to ACL axes taking into account negative values and possible duplicates.
+ std::set<std::uint32_t> acl_axes;
+ const int input_rank = node.param().rank;
+ for (int axis : axes)
+ {
+ if (axis < 0)
+ axis += input_rank;
+ acl_axes.insert(
+ acl_common::ToARMComputeAxis(input_rank, axis, frontend_layout, backend_layout).value());
+ }
+
+ arm_compute::Coordinates fixed_axes;
+ for (const auto axis : acl_axes)
+ {
+ fixed_axes.set(fixed_axes.num_dimensions(), axis);
+ }
+
+ auto fn = nnfw::cpp14::make_unique<::arm_compute::NEReduceSum>();
+
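+ // The third configure argument (false) is keep_dims, so reduced axes are dropped from the
+ // output shape.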
+ fn->configure(input_alloc->handle(), fixed_axes, false, output_alloc->handle());
+
+ auto acl_fn = asAclFunction(std::move(fn));
+
+ _execution_builder->append(std::move(acl_fn));
+}
+
+void KernelGenerator::visit(const ir::operation::ReLU &node)
+{
+ const auto output_index{node.getOutputs().at(0)};
+ const auto input_index{node.getInputs().at(ir::operation::ReLU::Input::INPUT)};
+
+ auto output_alloc = _tensor_builder->at(output_index).get();
+ auto input_alloc = _tensor_builder->at(input_index).get();
+
+ auto fn = nnfw::cpp14::make_unique<arm_compute::NEActivationLayer>();
+
+ const ::arm_compute::ActivationLayerInfo act_info{
+ ::arm_compute::ActivationLayerInfo::ActivationFunction::RELU};
+
+ fn->configure(input_alloc->handle(), output_alloc->handle(), act_info);
+
+ auto acl_fn = asAclFunction(std::move(fn));
+
+ _execution_builder->append(std::move(acl_fn));
+}
+
+void KernelGenerator::visit(const ir::operation::ReLU1 &node)
+{
+ const auto ofm_index{node.getOutputs().at(0)};
+ const auto ifm_index{node.getInputs().at(ir::operation::ReLU1::Input::INPUT)};
+
+ auto ofm_alloc = _tensor_builder->at(ofm_index).get();
+ auto ifm_alloc = _tensor_builder->at(ifm_index).get();
+
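+ // LU_BOUNDED_RELU with a = 1.0 and b = -1.0 clamps the activation to [-1, 1].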
+ const ::arm_compute::ActivationLayerInfo act_info{
+ ::arm_compute::ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 1.0f, -1.0f};
+
+ auto fn = nnfw::cpp14::make_unique<::arm_compute::NEActivationLayer>();
+
+ fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), act_info);
+
+ auto acl_fn = asAclFunction(std::move(fn));
+
+ _execution_builder->append(std::move(acl_fn));
+}
+
+void KernelGenerator::visit(const ir::operation::ReLU6 &node)
+{
+ const auto ofm_index{node.getOutputs().at(0)};
+ const auto ifm_index{node.getInputs().at(ir::operation::ReLU6::Input::INPUT)};
+
+ auto ofm_alloc = _tensor_builder->at(ofm_index).get();
+ auto ifm_alloc = _tensor_builder->at(ifm_index).get();
+
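+ // BOUNDED_RELU with a = 6.0 clamps the activation to [0, 6].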
+ const ::arm_compute::ActivationLayerInfo act_info{
+ ::arm_compute::ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 6.0f};
+
+ auto fn = nnfw::cpp14::make_unique<::arm_compute::NEActivationLayer>();
+
+ fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), act_info);
+
+ auto acl_fn = asAclFunction(std::move(fn));
+
+ _execution_builder->append(std::move(acl_fn));
+}
+
+void KernelGenerator::visit(const ir::operation::Reshape &node)
+{
+ const auto output_index{node.getOutputs().at(0)};
+ const auto input_index{node.getInputs().at(ir::operation::Reshape::Input::INPUT)};
+
+ auto output_alloc = _tensor_builder->at(output_index).get();
+ auto input_alloc = _tensor_builder->at(input_index).get();
+
+ // NOTE This operation must not change the layout from frontend to backend,
+ // so PermutationOperationPass makes the frontend and backend layouts the same.
+ const auto frontend_layout = _current_subg_layout;
+ const auto backend_layout = output_alloc->layout();
+ assert((_ctx.at(input_index).shape().rank() < 4 && _ctx.at(output_index).shape().rank() < 4) ||
+ frontend_layout == backend_layout);
+ UNUSED_RELEASE(frontend_layout);
+ UNUSED_RELEASE(backend_layout);
+
+ auto fn = nnfw::cpp14::make_unique<arm_compute::NEReshapeLayer>();
+
+ fn->configure(input_alloc->handle(), output_alloc->handle());
+
+ auto acl_fn = asAclFunction(std::move(fn));
+
+ _execution_builder->append(std::move(acl_fn));
+}
+
+void KernelGenerator::visit(const ir::operation::ResizeBilinear &node)
+{
+ const auto ofm_index{node.getOutputs().at(0)};
+
+ const auto ifm_index{node.getInputs().at(ir::operation::ResizeBilinear::Input::INPUT)};
+
+ auto ofm_alloc = _tensor_builder->at(ofm_index).get();
+ auto ifm_alloc = _tensor_builder->at(ifm_index).get();
+
+ auto fn = nnfw::cpp14::make_unique<::arm_compute::NEScale>();
+
+ fn->configure(ifm_alloc->handle(), ofm_alloc->handle(),
+ ::arm_compute::InterpolationPolicy::BILINEAR, ::arm_compute::BorderMode::REPLICATE,
+ ::arm_compute::PixelValue(0.f), ::arm_compute::SamplingPolicy::TOP_LEFT);
+
+ auto acl_fn = asAclFunction(std::move(fn));
+
+ _execution_builder->append(std::move(acl_fn));
+}
+
+void KernelGenerator::visit(const ir::operation::RNN &node)
+{
+ const auto output_index{node.getOutputs().at(ir::operation::RNN::Output::OUTPUT)};
+ const auto hidden_state_out_index{
+ node.getOutputs().at(ir::operation::RNN::Output::HIDDEN_STATE_OUT)};
+
+ const auto input_index{node.getInputs().at(ir::operation::RNN::Input::INPUT)};
+ const auto weights_index{node.getInputs().at(ir::operation::RNN::Input::WEIGHTS)};
+ const auto recurrent_weights_index{
+ node.getInputs().at(ir::operation::RNN::Input::RECURRENT_WEIGHTS)};
+ const auto bias_index{node.getInputs().at(ir::operation::RNN::Input::BIAS)};
+ const auto hidden_state_in_index{node.getInputs().at(ir::operation::RNN::Input::HIDDEN_STATE_IN)};
+
+ const auto activation = node.param().activation;
+
+ auto output_alloc = _tensor_builder->at(output_index).get();
+ auto hidden_state_out_alloc = _tensor_builder->at(hidden_state_out_index).get();
+
+ auto input_alloc = _tensor_builder->at(input_index).get();
+ auto weights_alloc = _tensor_builder->at(weights_index).get();
+ auto recurrent_weights_alloc = _tensor_builder->at(recurrent_weights_index).get();
+ auto bias_alloc = _tensor_builder->at(bias_index).get();
+ auto hidden_state_in_alloc = _tensor_builder->at(hidden_state_in_index).get();
+ auto act_info = ::neurun::backend::acl_common::asActivationLayerInfo(activation);
+
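+ // Copy hidden_state_in into hidden_state_out first; the RNN layer below uses
+ // hidden_state_out as its recurrent state and writes the new state back into it.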
+ auto copy_layer = nnfw::cpp14::make_unique<::arm_compute::NECopy>();
+ copy_layer->configure(hidden_state_in_alloc->handle(), hidden_state_out_alloc->handle());
+ _execution_builder->append(asAclFunction(std::move(copy_layer)));
+
+ auto fn = nnfw::cpp14::make_unique<::arm_compute::NERNNLayerEx>(
+ _tensor_builder->acl_tensor_manager()->internal_buffer_manager());
+ fn->configure(input_alloc->handle(), weights_alloc->handle(), recurrent_weights_alloc->handle(),
+ bias_alloc->handle(), hidden_state_out_alloc->handle(), output_alloc->handle(),
+ act_info);
+ _execution_builder->append(asAclFunction(std::move(fn)));
+}
+
+void KernelGenerator::visit(const ir::operation::RSQRT &node)
+{
+ const auto ofm_index{node.getOutputs().at(0)};
+ const auto ifm_index{node.getInputs().at(ir::operation::RSQRT::Input::INPUT)};
+
+ auto ofm_alloc = _tensor_builder->at(ofm_index).get();
+ auto ifm_alloc = _tensor_builder->at(ifm_index).get();
+
+ auto fn = nnfw::cpp14::make_unique<::arm_compute::NERsqrtLayer>();
+
+ fn->configure(ifm_alloc->handle(), ofm_alloc->handle());
+
+ _execution_builder->append(asAclFunction(std::move(fn)));
+}
+
+void KernelGenerator::visit(const ir::operation::Squeeze &node)
+{
+ // Squeeze is identical to Reshape except that it has an optional dimensions input.
+ // The optional dims input is ignored here because the output tensor already has the squeezed
+ // shape, produced by the freezer and toco.
+ const auto output_index{node.getOutputs().at(0)};
+ const auto input_index{node.getInputs().at(ir::operation::Squeeze::Input::INPUT)};
+ const auto dims{node.param().dims};
+ const auto ndim{node.param().ndim};
+ (void)dims;
+ (void)ndim;
+
+ auto output_alloc = _tensor_builder->at(output_index).get();
+ auto input_alloc = _tensor_builder->at(input_index).get();
+ auto fn = nnfw::cpp14::make_unique<arm_compute::NEReshapeLayer>();
+ fn->configure(input_alloc->handle(), output_alloc->handle());
+ auto acl_fn = asAclFunction(std::move(fn));
+ _execution_builder->append(std::move(acl_fn));
+}
+
+void KernelGenerator::visit(const ir::operation::Tanh &node)
+{
+ const auto output_index{node.getOutputs().at(0)};
+ const auto input_index{node.getInputs().at(ir::operation::Tanh::Input::INPUT)};
+
+ auto output_alloc = _tensor_builder->at(output_index).get();
+ auto input_alloc = _tensor_builder->at(input_index).get();
+
+ auto fn = nnfw::cpp14::make_unique<arm_compute::NEActivationLayer>();
+
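+ // TANH with a = 1.0 and b = 1.0 computes a * tanh(b * x), i.e. a plain tanh.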
+ const ::arm_compute::ActivationLayerInfo act_info{
+ ::arm_compute::ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f};
+
+ fn->configure(input_alloc->handle(), output_alloc->handle(), act_info);
+
+ auto acl_fn = asAclFunction(std::move(fn));
+
+ _execution_builder->append(std::move(acl_fn));
+}
+
+void KernelGenerator::visit(const ir::operation::Softmax &node)
+{
+ const auto output_index{node.getOutputs().at(0)};
+ const auto input_index{node.getInputs().at(ir::operation::Softmax::Input::INPUT)};
+ const auto beta = node.param().beta;
+
+ auto output_alloc = _tensor_builder->at(output_index).get();
+ auto input_alloc = _tensor_builder->at(input_index).get();
+
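+ // The softmax layer's intermediate tensors are allocated through the backend's internal
+ // buffer manager.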
+ auto fn = nnfw::cpp14::make_unique<::arm_compute::NESoftmaxLayer>(
+ _tensor_builder->acl_tensor_manager()->internal_buffer_manager());
+
+ fn->configure(input_alloc->handle(), output_alloc->handle(), beta);
+
+ auto acl_fn = asAclFunction(std::move(fn));
+
+ _execution_builder->append(std::move(acl_fn));
+}
+
+void KernelGenerator::visit(const ir::operation::SpaceToBatchND &node)
+{
+ const auto ofm_index{node.getOutputs().at(0)};
+ const auto ifm_index{node.getInputs().at(ir::operation::SpaceToBatchND::Input::INPUT)};
+ const auto block_size_index{
+ node.getInputs().at(ir::operation::SpaceToBatchND::Input::BLOCK_SIZE)};
+ const auto paddings_index{node.getInputs().at(ir::operation::SpaceToBatchND::Input::PADDINGS)};
+
+ auto ofm_alloc = _tensor_builder->at(ofm_index).get();
+ auto ifm_alloc = _tensor_builder->at(ifm_index).get();
+ auto block_size_alloc = _tensor_builder->at(block_size_index).get();
+ auto paddings_alloc = _tensor_builder->at(paddings_index).get();
+
+ assert(_ctx.at(block_size_index).isConstant());
+ assert(_ctx.at(paddings_index).isConstant());
+
+ // NESpaceToBatchLayer has a bug: it pads with 0 even when the zero point of a QASYMM8 tensor
+ // is not 0, so the Ex variant is used instead.
+ auto fn = nnfw::cpp14::make_unique<::arm_compute::NESpaceToBatchLayerEx>();
+
+ fn->configure(ifm_alloc->handle(), block_size_alloc->handle(), paddings_alloc->handle(),
+ ofm_alloc->handle());
+
+ auto acl_fn = asAclFunction(std::move(fn));
+
+ _execution_builder->append(std::move(acl_fn));
+}
+
+void KernelGenerator::visit(const ir::operation::SpaceToDepth &node)
+{
+ const auto ofm_index{node.getOutputs().at(0)};
+ const auto ifm_index{node.getInputs().at(ir::operation::SpaceToDepth::Input::INPUT)};
+
+ auto block_size = node.param().block_size;
+
+ auto ofm_alloc = _tensor_builder->at(ofm_index).get();
+ auto ifm_alloc = _tensor_builder->at(ifm_index).get();
+
+ auto fn = nnfw::cpp14::make_unique<::arm_compute::NESpaceToDepthLayerEx>();
+
+ fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), block_size);
+
+ auto acl_fn = asAclFunction(std::move(fn));
+
+ _execution_builder->append(std::move(acl_fn));
+}
+
+void KernelGenerator::visit(const ir::operation::Split &node)
+{
+ // TODO Support this op by SubTensor
+ const auto ifm_index{node.getInputs().at(ir::operation::Split::Input::INPUT)};
+
+ assert(node.param().num_splits == static_cast<int>(node.getOutputs().size()));
+
+ const auto ifm_rank = node.param().rank;
+ std::vector<ir::OperandIndex> output_indexes;
+ for (const auto &output : node.getOutputs())
+ output_indexes.emplace_back(output);
+
+ auto ifm_alloc = _tensor_builder->at(ifm_index).get();
+ std::vector<arm_compute::ITensor *> output_allocs;
+ for (const auto &ofm_ind : output_indexes)
+ output_allocs.emplace_back(_tensor_builder->at(ofm_ind).get()->handle());
+
+ const auto frontend_layout = _current_subg_layout;
+ const auto backend_layout = ifm_alloc->layout();
+ auto axis = node.param().axis;
+ if (axis < 0)
+ axis += ifm_rank;
+ axis = acl_common::ToARMComputeAxis(ifm_rank, axis, frontend_layout, backend_layout).value();
+
+ auto fn = nnfw::cpp14::make_unique<::arm_compute::NESplit>();
+
+ fn->configure(ifm_alloc->handle(), output_allocs, axis);
+
+ _execution_builder->append(asAclFunction(std::move(fn)));
+}
+
+void KernelGenerator::visit(const ir::operation::SQRT &node)
+{
+ const auto output_index{node.getOutputs().at(0)};
+ const auto input_index{node.getInputs().at(ir::operation::SQRT::Input::INPUT)};
+
+ auto output_alloc = _tensor_builder->at(output_index).get();
+ auto input_alloc = _tensor_builder->at(input_index).get();
+
+ const ::arm_compute::ActivationLayerInfo act_info{
+ ::arm_compute::ActivationLayerInfo::ActivationFunction::SQRT};
+
+ auto fn = nnfw::cpp14::make_unique<::arm_compute::NEActivationLayer>();
+
+ fn->configure(input_alloc->handle(), output_alloc->handle(), act_info);
+
+ auto acl_fn = asAclFunction(std::move(fn));
+
+ _execution_builder->append(std::move(acl_fn));
+}
+
+void KernelGenerator::visit(const ir::operation::SquaredDifference &node)
+{
+ const auto ofm_index{node.getOutputs().at(0)};
+ const auto lhs_index{node.getInputs().at(ir::operation::SquaredDifference::Input::LHS)};
+ const auto rhs_index{node.getInputs().at(ir::operation::SquaredDifference::Input::RHS)};
+
+ auto ofm_alloc = _tensor_builder->at(ofm_index).get();
+ auto lhs_alloc = _tensor_builder->at(lhs_index).get();
+ auto rhs_alloc = _tensor_builder->at(rhs_index).get();
+
+ auto fn = nnfw::cpp14::make_unique<::arm_compute::NEElementwiseSquaredDiff>();
+
+ fn->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle());
+
+ auto acl_fn = asAclFunction(std::move(fn));
+
+ _execution_builder->append(std::move(acl_fn));
+}
+
+void KernelGenerator::visit(const ir::operation::Sub &node)
+{
+ const auto ofm_index{node.getOutputs().at(0)};
+ const auto lhs_index{node.getInputs().at(ir::operation::Sub::Input::LHS)};
+ const auto rhs_index{node.getInputs().at(ir::operation::Sub::Input::RHS)};
+
+ const auto activation = node.param().activation;
+
+ auto ofm_alloc = _tensor_builder->at(ofm_index).get();
+ auto lhs_alloc = _tensor_builder->at(lhs_index).get();
+ auto rhs_alloc = _tensor_builder->at(rhs_index).get();
+
+ auto fn = nnfw::cpp14::make_unique<::arm_compute::NEArithmeticSubtraction>();
+
+ fn->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle(),
+ arm_compute::ConvertPolicy::SATURATE);
+
+ auto acl_fn = asAclFunction(std::move(fn));
+
+ _execution_builder->append(std::move(acl_fn));
+
+ ActivationBuilder{*_execution_builder}.append(activation, ofm_alloc->handle());
+}
+
+void KernelGenerator::visit(const ir::operation::Slice &node)
+{
+ const auto output_index{node.getOutputs().at(0)};
+ const auto input_index{node.getInputs().at(ir::operation::Slice::Input::INPUT)};
+ const auto begins_index{node.getInputs().at(ir::operation::Slice::Input::BEGINS)};
+ const auto sizes_index{node.getInputs().at(ir::operation::Slice::Input::SIZES)};
+
+ auto outputData_alloc = _tensor_builder->at(output_index).get();
+ auto inputData_alloc = _tensor_builder->at(input_index).get();
+ const auto frontend_layout = _current_subg_layout;
+ const auto backend_layout = inputData_alloc->layout();
+
+ // Initialize the start/end coordinates, reordered to match the backend layout of inputData
+ int input_rank = node.param().rank;
+ std::vector<int32_t> starts;
+ std::vector<int32_t> ends;
+ starts.resize(input_rank, 0);
+ ends.resize(input_rank, 0);
+ {
+ auto beginData_base = _ctx.at(begins_index).data().base();
+ auto sizeData_base = _ctx.at(sizes_index).data().base();
+ const int beginData_size = _ctx.at(begins_index).shape().num_elements();
+ const int sizeData_size = _ctx.at(sizes_index).shape().num_elements();
+
+ using ir::DataType;
+
+ UNUSED_RELEASE(beginData_size);
+ UNUSED_RELEASE(sizeData_size);
+
+ assert(_ctx.at(begins_index).typeInfo().type() == DataType::INT32);
+ assert(_ctx.at(sizes_index).typeInfo().type() == DataType::INT32);
+ assert(beginData_size == input_rank);
+ assert(sizeData_size == input_rank);
+
+ assert(beginData_base != nullptr);
+ for (int n = 0; n < input_rank; ++n)
+ {
+ auto axis = ::neurun::backend::acl_common::ToARMComputeAxis(input_rank, n, frontend_layout,
+ backend_layout)
+ .value();
+
+ int32_t begin_value = *(reinterpret_cast<const int32_t *>(beginData_base) + n);
+ starts[axis] = begin_value;
+
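+ // NESlice expects end coordinates rather than sizes, so convert (begin, size) to an end.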
+ int32_t size_value = *(reinterpret_cast<const int32_t *>(sizeData_base) + n);
+ ends[axis] = begin_value + size_value;
+ }
+ }
+
+ ::arm_compute::Coordinates starts_set;
+ ::arm_compute::Coordinates ends_set;
+
+ for (size_t i = 0; i < starts.size(); ++i)
+ {
+ starts_set.set(i, starts[i]);
+ ends_set.set(i, ends[i]);
+ }
+
+ auto fn = nnfw::cpp14::make_unique<::arm_compute::NESlice>();
+
+ fn->configure(inputData_alloc->handle(), outputData_alloc->handle(), starts_set, ends_set);
+
+ auto acl_fn = asAclFunction(std::move(fn));
+
+ _execution_builder->append(std::move(acl_fn));
+}
+
+void KernelGenerator::visit(const ir::operation::StridedSlice &node)
+{
+ const auto output_index{node.getOutputs().at(0)};
+ const auto input_index{node.getInputs().at(ir::operation::StridedSlice::Input::INPUT)};
+ const auto starts_index{node.getInputs().at(ir::operation::StridedSlice::Input::STARTS)};
+ const auto ends_index{node.getInputs().at(ir::operation::StridedSlice::Input::ENDS)};
+ const auto strides_index{node.getInputs().at(ir::operation::StridedSlice::Input::STRIDES)};
+
+ auto outputData_alloc = _tensor_builder->at(output_index).get();
+ auto inputData_alloc = _tensor_builder->at(input_index).get();
+ const auto frontend_layout = _current_subg_layout;
+ const auto backend_layout = inputData_alloc->layout();
+
+ // Initialize the start/end/stride coordinates, reordered to match the backend layout of inputData
+ int input_rank = node.param().rank;
+ std::vector<int32_t> starts;
+ std::vector<int32_t> ends;
+ std::vector<int32_t> strides;
+ starts.resize(input_rank, 0);
+ ends.resize(input_rank, 0);
+ strides.resize(input_rank, 0);
+ {
+ auto startData_base = _ctx.at(starts_index).data().base();
+ auto endData_base = _ctx.at(ends_index).data().base();
+ auto stridesData_base = _ctx.at(strides_index).data().base();
+ const int startData_size = _ctx.at(starts_index).shape().num_elements();
+ const int endData_size = _ctx.at(ends_index).shape().num_elements();
+ const int stridesData_size = _ctx.at(strides_index).shape().num_elements();
+
+ using ir::DataType;
+
+ UNUSED_RELEASE(startData_size);
+ UNUSED_RELEASE(endData_size);
+ UNUSED_RELEASE(stridesData_size);
+
+ assert(_ctx.at(starts_index).typeInfo().type() == DataType::INT32);
+ assert(_ctx.at(ends_index).typeInfo().type() == DataType::INT32);
+ assert(_ctx.at(strides_index).typeInfo().type() == DataType::INT32);
+ assert(startData_size == input_rank);
+ assert(endData_size == input_rank);
+ assert(stridesData_size == input_rank);
+
+ assert(startData_base != nullptr);
+ for (int n = 0; n < input_rank; ++n)
+ {
+ auto axis = ::neurun::backend::acl_common::ToARMComputeAxis(input_rank, n, frontend_layout,
+ backend_layout)
+ .value();
+
+ int32_t start_value = *(reinterpret_cast<const int32_t *>(startData_base) + n);
+ starts[axis] = start_value;
+
+ int32_t end_value = *(reinterpret_cast<const int32_t *>(endData_base) + n);
+ ends[axis] = end_value;
+
+ int32_t strides_value = *(reinterpret_cast<const int32_t *>(stridesData_base) + n);
+ strides[axis] = strides_value;
+ }
+ }
+
+ // Set mask bits such as order of inputData
+ // FIXME Take the layouts into account.
+ const auto begin_mask = acl_common::ReorderBits<int32_t>(node.param().begin_mask, input_rank);
+ const auto end_mask = acl_common::ReorderBits<int32_t>(node.param().end_mask, input_rank);
+ const auto shrink_axis_mask =
+ acl_common::ReorderBits<int32_t>(node.param().shrink_axis_mask, input_rank);
+
+ ::arm_compute::Coordinates starts_set;
+ ::arm_compute::Coordinates ends_set;
+ ::arm_compute::BiStrides strides_set;
+
+ for (size_t i = 0; i < starts.size(); ++i)
+ {
+ starts_set.set(i, starts[i]);
+ ends_set.set(i, ends[i]);
+ strides_set.set(i, strides[i]);
+ }
+
+ auto fn = nnfw::cpp14::make_unique<::arm_compute::NEStridedSlice>();
+
+ fn->configure(inputData_alloc->handle(), outputData_alloc->handle(), starts_set, ends_set,
+ strides_set, begin_mask, end_mask, shrink_axis_mask);
+
+ auto acl_fn = asAclFunction(std::move(fn));
+
+ _execution_builder->append(std::move(acl_fn));
+}
+
+void KernelGenerator::visit(const ir::operation::TransposeConv &node)
+{
+ const auto ofm_index{node.getOutputs().at(0)};
+ const auto output_shape_index{
+ node.getInputs().at(ir::operation::TransposeConv::Input::OUTPUT_SHAPE)};
+ const auto ker_index{node.getInputs().at(ir::operation::TransposeConv::Input::KERNEL)};
+ const auto ifm_index{node.getInputs().at(ir::operation::TransposeConv::Input::INPUT)};
+
+ const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature(_current_subg_layout);
+ const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature(_current_subg_layout);
+ const auto ker_shape = _ctx.at(ker_index).shape().asFeature(_current_subg_layout);
+
+ const auto stride = node.param().stride;
+
+ assert((node.param().padding.type == ir::PaddingType::SAME) ||
+ (node.param().padding.type == ir::PaddingType::VALID));
+ auto padding = neurun::util::calculatePadding(node.param().padding, ofm_shape, ifm_shape, stride,
+ ker_shape.W, ker_shape.H);
+
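+ // For VALID padding, the requested output may extend beyond what the plain transposed
+ // convolution produces; the extra right/bottom columns/rows are reported to ACL as
+ // invalid borders.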
+ uint32_t invalid_horizontal = 0;
+ uint32_t invalid_vertical = 0;
+ if (node.param().padding.type == ir::PaddingType::VALID)
+ {
+ invalid_horizontal =
+ ofm_shape.W - (1 + (ifm_shape.W - 1) * stride.horizontal) - (ker_shape.W - 1);
+ invalid_vertical = ofm_shape.H - (1 + (ifm_shape.H - 1) * stride.vertical) - (ker_shape.H - 1);
+ }
+
+ auto ofm_alloc = _tensor_builder->at(ofm_index).get();
+ auto ifm_alloc = _tensor_builder->at(ifm_index).get();
+ auto ker_alloc = _tensor_builder->at(ker_index).get();
+
+ const auto tconv_info = acl_common::asPadStrideInfo(padding, stride);
+
+ auto fn = nnfw::cpp14::make_unique<::arm_compute::NETransposeConvLayer>();
+
+ fn->configure(ifm_alloc->handle(), ker_alloc->handle(), nullptr, ofm_alloc->handle(), tconv_info,
+ invalid_horizontal, invalid_vertical);
+
+ auto acl_fn = asAclFunction(std::move(fn));
+
+ _execution_builder->append(std::move(acl_fn));
+}
+
+void KernelGenerator::visit(const ir::operation::Transpose &node)
+{
+ const auto ofm_idx{node.getOutputs().at(0)};
+ const auto ifm_idx{node.getInputs().at(ir::operation::Transpose::Input::INPUT)};
+ const auto &perm{node.param().perm};
+
+ auto ofm_alloc = _tensor_builder->at(ofm_idx).get();
+ const auto ifm_alloc = _tensor_builder->at(ifm_idx).get();
+ const auto frontend_layout = _current_subg_layout;
+ const auto backend_layout = ifm_alloc->layout();
+
+ const auto rank = node.param().rank;
+ std::vector<std::int32_t> pv(perm.cbegin(), perm.cend());
+ auto backend_pv = ::neurun::backend::acl_common::getARMComputePermutationVector(
+ rank, pv, frontend_layout, backend_layout);
+
+ std::unique_ptr<::arm_compute::IFunction> fn;
+
+ if (ifm_alloc->num_dimensions() <= 2 && ofm_alloc->num_dimensions() <= 2)
+ {
+ auto l = nnfw::cpp14::make_unique<::arm_compute::NETranspose>();
+
+ l->configure(ifm_alloc->handle(), ofm_alloc->handle());
+
+ fn = std::move(l);
+ }
+ else
+ {
+ auto l = nnfw::cpp14::make_unique<::arm_compute::NEPermute>();
+
+ l->configure(ifm_alloc->handle(), ofm_alloc->handle(), backend_pv);
+
+ fn = std::move(l);
+ }
+
+ auto acl_fn = asAclFunction(std::move(fn));
+
+ _execution_builder->append(std::move(acl_fn));
+}
+
+void KernelGenerator::visit(const ir::operation::Unpack &node)
+{
+ const auto input_index{node.getInputs().at(ir::operation::Unpack::Input::INPUT)};
+ auto axis{node.param().axis};
+
+ const auto input_rank = node.param().rank;
+
+ std::vector<ir::OperandIndex> output_indexes;
+ for (const auto &output_index : node.getOutputs())
+ output_indexes.emplace_back(output_index);
+
+ auto input = _tensor_builder->at(input_index).get()->handle();
+ std::vector<arm_compute::ITensor *> outputs;
+ for (const auto &output_index : output_indexes)
+ outputs.emplace_back(_tensor_builder->at(output_index)->handle());
+
+ const auto frontend_layout = _current_subg_layout;
+ const auto backend_layout = _tensor_builder->at(input_index).get()->layout();
+ if (axis < 0)
+ axis += input_rank;
+ axis = acl_common::ToARMComputeAxis(input_rank, axis, frontend_layout, backend_layout).value();
+
+ auto fn = nnfw::cpp14::make_unique<::arm_compute::NEUnstack>();
+
+ fn->configure(input, outputs, axis);
+
+ _execution_builder->append(asAclFunction(std::move(fn)));
+}
+
+void KernelGenerator::visit(const ir::operation::Add &node)
+{
+ const auto ofm_index{node.getOutputs().at(0)};
+ const auto lhs_index{node.getInputs().at(ir::operation::Add::Input::LHS)};
+ const auto rhs_index{node.getInputs().at(ir::operation::Add::Input::RHS)};
+
+ const auto activation = node.param().activation;
+
+ auto ofm_alloc = _tensor_builder->at(ofm_index).get();
+ auto lhs_alloc = _tensor_builder->at(lhs_index).get();
+ auto rhs_alloc = _tensor_builder->at(rhs_index).get();
+
+ auto fn = nnfw::cpp14::make_unique<::arm_compute::NEArithmeticAddition>();
+
+ fn->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle(),
+ arm_compute::ConvertPolicy::SATURATE);
+
+ auto acl_fn = asAclFunction(std::move(fn));
+
+ _execution_builder->append(std::move(acl_fn));
+
+ ActivationBuilder{*_execution_builder}.append(activation, ofm_alloc->handle());
+}
+
+void KernelGenerator::visit(const ir::operation::Div &node)
+{
+ const auto ofm_index{node.getOutputs().at(0)};
+ const auto lhs_index{node.getInputs().at(ir::operation::Div::Input::LHS)};
+ const auto rhs_index{node.getInputs().at(ir::operation::Div::Input::RHS)};
+
+ const auto activation = node.param().activation;
+
+ auto ofm_alloc = _tensor_builder->at(ofm_index).get();
+ auto lhs_alloc = _tensor_builder->at(lhs_index).get();
+ auto rhs_alloc = _tensor_builder->at(rhs_index).get();
+
+ auto fn = nnfw::cpp14::make_unique<::arm_compute::NEElementwiseDivision>();
+
+ fn->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle());
+
+ auto acl_fn = asAclFunction(std::move(fn));
+
+ _execution_builder->append(std::move(acl_fn));
+
+ ActivationBuilder{*_execution_builder}.append(activation, ofm_alloc->handle());
+}
+
+void KernelGenerator::visit(const ir::operation::Exp &node)
+{
+ const auto output_index{node.getOutputs().at(0)};
+ const auto input_index{node.getInputs().at(ir::operation::Exp::Input::INPUT)};
+
+ auto output_alloc = _tensor_builder->at(output_index).get();
+ auto input_alloc = _tensor_builder->at(input_index).get();
+
+ auto fn = nnfw::cpp14::make_unique<::arm_compute::NEExpLayer>();
+
+ fn->configure(input_alloc->handle(), output_alloc->handle());
+
+ auto acl_fn = asAclFunction(std::move(fn));
+
+ _execution_builder->append(std::move(acl_fn));
+}
+
+void KernelGenerator::visit(const ir::operation::Comparison &node)
+{
+ const auto output_index{node.getOutputs().at(0)};
+ const auto input0_index{node.getInputs().at(ir::operation::Comparison::Input::INPUT0)};
+ const auto input1_index{node.getInputs().at(ir::operation::Comparison::Input::INPUT1)};
+
+ const auto comparison_type = node.param().comparison_type;
+
+ auto output_alloc = _tensor_builder->at(output_index).get();
+ auto input0_alloc = _tensor_builder->at(input0_index).get();
+ auto input1_alloc = _tensor_builder->at(input1_index).get();
+
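+ // The IR comparison type is assumed to map one-to-one onto arm_compute::ComparisonOperation,
+ // so a plain cast is enough here.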
+ auto fn = nnfw::cpp14::make_unique<::arm_compute::NEElementwiseComparison>();
+
+ fn->configure(input0_alloc->handle(), input1_alloc->handle(), output_alloc->handle(),
+ (arm_compute::ComparisonOperation)comparison_type);
+
+ auto acl_fn = asAclFunction(std::move(fn));
+
+ _execution_builder->append(std::move(acl_fn));
+}
+
+void KernelGenerator::visit(const ir::operation::Min &node)
+{
+ const auto ofm_index{node.getOutputs().at(0)};
+ const auto lhs_index{node.getInputs().at(ir::operation::Min::Input::LHS)};
+ const auto rhs_index{node.getInputs().at(ir::operation::Min::Input::RHS)};
+
+ auto ofm_alloc = _tensor_builder->at(ofm_index).get();
+ auto lhs_alloc = _tensor_builder->at(lhs_index).get();
+ auto rhs_alloc = _tensor_builder->at(rhs_index).get();
+
+ auto fn = nnfw::cpp14::make_unique<::arm_compute::NEElementwiseMin>();
+
+ fn->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle());
+
+ auto acl_fn = asAclFunction(std::move(fn));
+
+ _execution_builder->append(std::move(acl_fn));
+}
+
+void KernelGenerator::visit(const ir::operation::Max &node)
+{
+ const auto ofm_index{node.getOutputs().at(0)};
+ const auto lhs_index{node.getInputs().at(ir::operation::Max::Input::LHS)};
+ const auto rhs_index{node.getInputs().at(ir::operation::Max::Input::RHS)};
+
+ auto ofm_alloc = _tensor_builder->at(ofm_index).get();
+ auto lhs_alloc = _tensor_builder->at(lhs_index).get();
+ auto rhs_alloc = _tensor_builder->at(rhs_index).get();
+
+ auto fn = nnfw::cpp14::make_unique<::arm_compute::NEElementwiseMax>();
+
+ fn->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle());
+
+ auto acl_fn = asAclFunction(std::move(fn));
+
+ _execution_builder->append(std::move(acl_fn));
+}
+
+} // namespace acl_neon
+} // namespace backend
+} // namespace neurun
diff --git a/runtime/neurun/backend/acl_neon/KernelGenerator.h b/runtime/neurun/backend/acl_neon/KernelGenerator.h
new file mode 100644
index 000000000..f041fb725
--- /dev/null
+++ b/runtime/neurun/backend/acl_neon/KernelGenerator.h
@@ -0,0 +1,111 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NEURUN_BACKEND_ACL_NEON_KERNEL_GENERATOR_H__
+#define __NEURUN_BACKEND_ACL_NEON_KERNEL_GENERATOR_H__
+
+#include <backend/IKernelGenerator.h>
+
+#include "ir/Operands.h"
+#include "TensorBuilder.h"
+
+namespace neurun
+{
+namespace backend
+{
+namespace acl_neon
+{
+
+class KernelGenerator : public IKernelGenerator
+{
+public:
+ KernelGenerator(const ir::Operands &ctx, const std::shared_ptr<TensorBuilder> &tensor_builder);
+
+ void visit(const ir::OpSequence &) override;
+ void visit(const ir::operation::Abs &) override;
+ void visit(const ir::operation::ArgMax &) override;
+ void visit(const ir::operation::BatchToSpaceND &) override;
+ void visit(const ir::operation::Cast &) override;
+ void visit(const ir::operation::Conv2D &) override;
+ void visit(const ir::operation::DepthToSpace &) override;
+ void visit(const ir::operation::DepthwiseConv2D &) override;
+ void visit(const ir::operation::Dequantize &) override;
+ void visit(const ir::operation::MaxPool2D &) override;
+ void visit(const ir::operation::Mean &) override;
+ void visit(const ir::operation::AvgPool2D &) override;
+ void visit(const ir::operation::Concat &) override;
+ void visit(const ir::operation::EmbeddingLookup &) override;
+ void visit(const ir::operation::Floor &) override;
+ void visit(const ir::operation::FullyConnected &) override;
+ void visit(const ir::operation::Gather &) override;
+ void visit(const ir::operation::HashtableLookup &) override;
+ void visit(const ir::operation::InstanceNorm &) override;
+ void visit(const ir::operation::L2Normalization &) override;
+ void visit(const ir::operation::L2Pool2D &) override;
+ void visit(const ir::operation::LocalResponseNormalization &) override;
+ void visit(const ir::operation::LogicalAnd &) override;
+ void visit(const ir::operation::LogicalNot &) override;
+ void visit(const ir::operation::LogicalOr &) override;
+ void visit(const ir::operation::Logistic &) override;
+ void visit(const ir::operation::LSTM &) override;
+ void visit(const ir::operation::Mul &) override;
+ void visit(const ir::operation::Neg &) override;
+ void visit(const ir::operation::Pack &) override;
+ void visit(const ir::operation::Pad &) override;
+ void visit(const ir::operation::Permute &) override;
+ void visit(const ir::operation::PReLU &) override;
+ void visit(const ir::operation::ReduceMax &) override;
+ void visit(const ir::operation::ReduceMin &) override;
+ void visit(const ir::operation::ReduceSum &) override;
+ void visit(const ir::operation::ReLU &) override;
+ void visit(const ir::operation::ReLU1 &) override;
+ void visit(const ir::operation::ReLU6 &) override;
+ void visit(const ir::operation::Reshape &) override;
+ void visit(const ir::operation::ResizeBilinear &) override;
+ void visit(const ir::operation::RNN &) override;
+ void visit(const ir::operation::RSQRT &) override;
+ void visit(const ir::operation::Squeeze &) override;
+ void visit(const ir::operation::Tanh &) override;
+ void visit(const ir::operation::Softmax &) override;
+ void visit(const ir::operation::SpaceToBatchND &) override;
+ void visit(const ir::operation::SpaceToDepth &) override;
+ void visit(const ir::operation::Split &) override;
+ void visit(const ir::operation::SQRT &) override;
+ void visit(const ir::operation::SquaredDifference &) override;
+ void visit(const ir::operation::Sub &) override;
+ void visit(const ir::operation::Slice &) override;
+ void visit(const ir::operation::StridedSlice &) override;
+ void visit(const ir::operation::TransposeConv &) override;
+ void visit(const ir::operation::Transpose &) override;
+ void visit(const ir::operation::Unpack &) override;
+ void visit(const ir::operation::Add &) override;
+ void visit(const ir::operation::Div &) override;
+ void visit(const ir::operation::Exp &) override;
+ void visit(const ir::operation::Comparison &) override;
+ void visit(const ir::operation::Min &) override;
+ void visit(const ir::operation::Max &) override;
+
+private:
+ const ir::Operands &_ctx;
+ std::shared_ptr<TensorBuilder> _tensor_builder;
+ ir::Layout _current_subg_layout;
+};
+
+} // namespace acl_neon
+} // namespace backend
+} // namespace neurun
+
+#endif // __NEURUN_BACKEND_ACL_NEON_KERNEL_GENERATOR_H__
diff --git a/runtime/neurun/backend/acl_neon/PluginClassesAllocator.cc b/runtime/neurun/backend/acl_neon/PluginClassesAllocator.cc
new file mode 100644
index 000000000..75f2e9797
--- /dev/null
+++ b/runtime/neurun/backend/acl_neon/PluginClassesAllocator.cc
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <util/logging.h>
+
+#include "Backend.h"
+
+extern "C" {
+neurun::backend::Backend *neurun_backend_create()
+{
+ VERBOSE(neurun_backend_create) << "'acl_neon' loaded\n";
+ return new neurun::backend::acl_neon::Backend;
+}
+
+void neurun_backend_destroy(neurun::backend::Backend *backend)
+{
+ VERBOSE(neurun_backend_create) << "'acl_neon' unloaded\n";
+ delete backend;
+}
+}
diff --git a/runtime/neurun/backend/acl_neon/ShapeFixer.cc b/runtime/neurun/backend/acl_neon/ShapeFixer.cc
new file mode 100644
index 000000000..1d80e57e9
--- /dev/null
+++ b/runtime/neurun/backend/acl_neon/ShapeFixer.cc
@@ -0,0 +1,439 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "ShapeFixer.h"
+
+#include <arm_compute/runtime/NEON/functions/NESoftmaxLayer.h>
+#include <arm_compute/runtime/NEON/functions/NEArithmeticAddition.h>
+#include <arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h>
+#include <arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h>
+#include <arm_compute/runtime/NEON/functions/NEPoolingLayer.h>
+#include <arm_compute/runtime/NEON/functions/NEActivationLayer.h>
+#include <arm_compute/runtime/NEON/functions/NEConvolutionLayer.h>
+#include <arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h>
+#include <arm_compute/runtime/NEON/functions/NEReshapeLayer.h>
+#include <arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h>
+#include <arm_compute/runtime/NEON/functions/NEFullyConnectedReshapingLayer.h>
+
+#include <Convert.h>
+#include <Swizzle.h>
+
+#include "util/Padding.h"
+#include "ir/Index.h"
+#include "compiler/IExecutionBuilder.h"
+#include "exec/NopFunction.h"
+#include "util/logging.h"
+#include "util/Utils.h"
+
+using ::neurun::compiler::IExecutionBuilder;
+
+namespace neurun
+{
+namespace backend
+{
+namespace acl_neon
+{
+
+using ::neurun::backend::acl_common::asAclFunction;
+
+ShapeFixer::ShapeFixer(const ir::Operands &ctx,
+ const std::shared_ptr<TensorBuilder> &tensor_builder)
+ : _ctx(ctx), _tensor_builder(tensor_builder)
+{
+ assert(tensor_builder);
+}
+
+void ShapeFixer::visit(const ir::operation::Abs &) { /* DO NOTHING */}
+
+void ShapeFixer::visit(const ir::operation::ArgMax &) { /* DO NOTHING */}
+
+void ShapeFixer::visit(const ir::operation::BatchToSpaceND &node)
+{
+ const auto ofm_index{node.getOutputs().at(0)};
+ const auto ifm_index{node.getInputs().at(ir::operation::BatchToSpaceND::Input::INPUT)};
+ _tensor_builder->dimCorrection(ofm_index, false);
+ _tensor_builder->dimCorrection(ifm_index, false);
+}
+
+void ShapeFixer::visit(const ir::operation::Cast &) { /* DO NOTHING */}
+
+void ShapeFixer::visit(const ir::operation::Conv2D &) { /* DO NOTHING */}
+
+void ShapeFixer::visit(const ir::operation::DepthToSpace &) { /* DO NOTHING */}
+
+void ShapeFixer::visit(const ir::operation::DepthwiseConv2D &) { /* DO NOTHING */}
+
+void ShapeFixer::visit(const ir::operation::Dequantize &) { /* DO NOTHING */}
+
+void ShapeFixer::visit(const ir::operation::MaxPool2D &) { /* DO NOTHING */}
+
+void ShapeFixer::visit(const ir::operation::Mean &) { /* DO NOTHING */}
+
+void ShapeFixer::visit(const ir::operation::AvgPool2D &) { /* DO NOTHING */}
+
+void ShapeFixer::visit(const ir::operation::Concat &node)
+{
+ const auto ofm_index{node.getOutputs().at(0)};
+ _tensor_builder->dimCorrection(ofm_index, false);
+ for (const auto &input_index : node.getInputs())
+ _tensor_builder->dimCorrection(input_index, false);
+}
+
+void ShapeFixer::visit(const ir::operation::EmbeddingLookup &node)
+{
+ const auto output_index{node.getOutputs().at(0)};
+ const auto values_index{node.getInputs().at(ir::operation::EmbeddingLookup::Input::VALUES)};
+ _tensor_builder->dimCorrection(values_index, false);
+ _tensor_builder->dimCorrection(output_index, false);
+}
+
+void ShapeFixer::visit(const ir::operation::Exp &) { /* DO NOTHING */}
+
+void ShapeFixer::visit(const ir::operation::Floor &) { /* DO NOTHING */}
+
+void ShapeFixer::visit(const ir::operation::FullyConnected &node)
+{
+ using ir::operation::FullyConnected;
+ const auto input_index{node.getInputs().at(FullyConnected::Input::INPUT)};
+ const auto input_rank = _ctx.at(input_index).shape().rank();
+  // Keep the original shape when the input will be reshaped to rank 2 (rank-3 or rank-4 inputs)
+ if (input_rank == 3 || input_rank == 4)
+ _tensor_builder->dimCorrection(input_index, false);
+}
+
+void ShapeFixer::visit(const ir::operation::HashtableLookup &node)
+{
+ const auto output_index{node.getOutputs().at(0)};
+  const auto values_index{node.getInputs().at(ir::operation::HashtableLookup::Input::VALUES)};
+ _tensor_builder->dimCorrection(values_index, false);
+ _tensor_builder->dimCorrection(output_index, false);
+}
+
+void ShapeFixer::visit(const ir::operation::Gather &node)
+{
+ const auto ofm_index{node.getOutputs().at(0)};
+ const auto ifm_index{node.getInputs().at(ir::operation::Gather::Input::INPUT)};
+ const auto indices_index{node.getInputs().at(ir::operation::Gather::Input::INDICES)};
+ _tensor_builder->dimCorrection(ofm_index, false);
+ _tensor_builder->dimCorrection(ifm_index, false);
+ _tensor_builder->dimCorrection(indices_index, false);
+}
+
+void ShapeFixer::visit(const ir::operation::InstanceNorm &) { /* DO NOTHING */}
+
+void ShapeFixer::visit(const ir::operation::L2Normalization &) { /* DO NOTHING */}
+
+void ShapeFixer::visit(const ir::operation::L2Pool2D &) { /* DO NOTHING */}
+
+void ShapeFixer::visit(const ir::operation::LocalResponseNormalization &) { /* DO NOTHING */}
+
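+// The binary element-wise operations below require both inputs to have the same rank; when
+// the shapes differ, the lower-rank operand is extended to the broadcast rank.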
+void ShapeFixer::visit(const ir::operation::LogicalAnd &node)
+{
+ const auto input0_index{node.getInputs().at(ir::operation::LogicalAnd::Input::INPUT0)};
+ const auto input1_index{node.getInputs().at(ir::operation::LogicalAnd::Input::INPUT1)};
+
+ if (!(_ctx.at(input0_index).shape() == _ctx.at(input1_index).shape()))
+ {
+ const auto broadcast_rank =
+ std::max(_ctx.at(input0_index).shape().rank(), _ctx.at(input1_index).shape().rank());
+
+ // TODO remove const_cast later. For example, _ctx may need to be a non const variable or
+ // a node to extend shape may be inserted in front of this operation
+ const_cast<ir::Shape &>(_ctx.at(input0_index).shape()).extendRank(broadcast_rank);
+ const_cast<ir::Shape &>(_ctx.at(input1_index).shape()).extendRank(broadcast_rank);
+ }
+}
+
+void ShapeFixer::visit(const ir::operation::LogicalNot &) { /* DO NOTHING */}
+
+void ShapeFixer::visit(const ir::operation::LogicalOr &node)
+{
+ const auto input0_index{node.getInputs().at(ir::operation::LogicalOr::Input::INPUT0)};
+ const auto input1_index{node.getInputs().at(ir::operation::LogicalOr::Input::INPUT1)};
+
+ if (!(_ctx.at(input0_index).shape() == _ctx.at(input1_index).shape()))
+ {
+ const auto broadcast_rank =
+ std::max(_ctx.at(input0_index).shape().rank(), _ctx.at(input1_index).shape().rank());
+
+ // TODO remove const_cast later. For example, _ctx may need to be a non const variable or
+ // a node to extend shape may be inserted in front of this operation
+ const_cast<ir::Shape &>(_ctx.at(input0_index).shape()).extendRank(broadcast_rank);
+ const_cast<ir::Shape &>(_ctx.at(input1_index).shape()).extendRank(broadcast_rank);
+ }
+}
+
+void ShapeFixer::visit(const ir::operation::Logistic &) { /* DO NOTHING */}
+
+void ShapeFixer::visit(const ir::operation::LSTM &) { /* DO NOTHING */}
+
+void ShapeFixer::visit(const ir::operation::Pack &node)
+{
+ const auto ofm_index{node.getOutputs().at(0)};
+ _tensor_builder->dimCorrection(ofm_index, false);
+  const auto ofm_rank = _ctx.at(ofm_index).shape().rank();
+  for (const auto &input_index : node.getInputs())
+  {
+    _tensor_builder->dimCorrection(input_index, false);
+
+    // TODO remove const_cast later. For example, _ctx may need to be a non const variable or
+    // a node to extend shape may be inserted in front of this operation
+    const_cast<ir::Shape &>(_ctx.at(input_index).shape()).extendRank(ofm_rank);
+  }
+}
+
+void ShapeFixer::visit(const ir::operation::Pad &node)
+{
+ const auto input_index{node.getInputs().at(ir::operation::Pad::Input::INPUT)};
+ const auto output_index{node.getOutputs().at(0)};
+ _tensor_builder->dimCorrection(input_index, false);
+ _tensor_builder->dimCorrection(output_index, false);
+}
+
+void ShapeFixer::visit(const ir::operation::Mul &node)
+{
+ const auto lhs_index{node.getInputs().at(ir::operation::Mul::Input::LHS)};
+ const auto rhs_index{node.getInputs().at(ir::operation::Mul::Input::RHS)};
+
+ if (!(_ctx.at(lhs_index).shape() == _ctx.at(rhs_index).shape()))
+ {
+ const auto broadcast_rank =
+ std::max(_ctx.at(lhs_index).shape().rank(), _ctx.at(rhs_index).shape().rank());
+
+ // TODO remove const_cast later. For example, _ctx may need to be a non const variable or
+ // a node to extend shape may be inserted in front of this operation
+ const_cast<ir::Shape &>(_ctx.at(lhs_index).shape()).extendRank(broadcast_rank);
+ const_cast<ir::Shape &>(_ctx.at(rhs_index).shape()).extendRank(broadcast_rank);
+ }
+}
+
+void ShapeFixer::visit(const ir::operation::Neg &) { /* DO NOTHING */}
+
+void ShapeFixer::visit(const ir::operation::Permute &) { /* DO NOTHING */}
+
+void ShapeFixer::visit(const ir::operation::PReLU &node)
+{
+ const auto ifm_index{node.getInputs().at(ir::operation::PReLU::Input::INPUT)};
+ const auto alpha_index{node.getInputs().at(ir::operation::PReLU::Input::ALPHA)};
+
+ if (!(_ctx.at(ifm_index).shape() == _ctx.at(alpha_index).shape()))
+ {
+ const auto broadcast_rank =
+ std::max(_ctx.at(ifm_index).shape().rank(), _ctx.at(alpha_index).shape().rank());
+ const_cast<ir::Shape &>(_ctx.at(ifm_index).shape()).extendRank(broadcast_rank);
+ const_cast<ir::Shape &>(_ctx.at(alpha_index).shape()).extendRank(broadcast_rank);
+ }
+}
+
+void ShapeFixer::visit(const ir::operation::ReduceMax &) { /* DO NOTHING */}
+
+void ShapeFixer::visit(const ir::operation::ReduceMin &) { /* DO NOTHING */}
+
+void ShapeFixer::visit(const ir::operation::ReduceSum &) { /* DO NOTHING */}
+
+void ShapeFixer::visit(const ir::operation::ReLU &) { /* DO NOTHING */}
+
+void ShapeFixer::visit(const ir::operation::ReLU1 &) { /* DO NOTHING */}
+
+void ShapeFixer::visit(const ir::operation::ReLU6 &) { /* DO NOTHING */}
+
+void ShapeFixer::visit(const ir::operation::Reshape &node)
+{
+ const auto output_index{node.getOutputs().at(0)};
+ const auto input_index{node.getInputs().at(ir::operation::Reshape::Input::INPUT)};
+
+ _tensor_builder->dimCorrection(input_index, false);
+ _tensor_builder->dimCorrection(output_index, false);
+}
+
+void ShapeFixer::visit(const ir::operation::ResizeBilinear &) { /* DO NOTHING */}
+
+void ShapeFixer::visit(const ir::operation::RNN &) { /* DO NOTHING */}
+
+void ShapeFixer::visit(const ir::operation::Comparison &node)
+{
+ const auto input0_index{node.getInputs().at(ir::operation::Comparison::Input::INPUT0)};
+ const auto input1_index{node.getInputs().at(ir::operation::Comparison::Input::INPUT1)};
+
+ if (!(_ctx.at(input0_index).shape() == _ctx.at(input1_index).shape()))
+ {
+ const auto broadcast_rank =
+ std::max(_ctx.at(input0_index).shape().rank(), _ctx.at(input1_index).shape().rank());
+
+ // TODO remove const_cast later. For example, _ctx may need to be a non const variable or
+ // a node to extend shape may be inserted in front of this operation
+ const_cast<ir::Shape &>(_ctx.at(input0_index).shape()).extendRank(broadcast_rank);
+ const_cast<ir::Shape &>(_ctx.at(input1_index).shape()).extendRank(broadcast_rank);
+ }
+}
+
+void ShapeFixer::visit(const ir::operation::RSQRT &) { /* DO NOTHING */}
+
+void ShapeFixer::visit(const ir::operation::Squeeze &node)
+{
+ const auto output_index{node.getOutputs().at(0)};
+ const auto input_index{node.getInputs().at(ir::operation::Squeeze::Input::INPUT)};
+ _tensor_builder->dimCorrection(input_index, false);
+ _tensor_builder->dimCorrection(output_index, false);
+}
+
+void ShapeFixer::visit(const ir::operation::Tanh &) { /* DO NOTHING */}
+
+void ShapeFixer::visit(const ir::operation::Slice &) { /* DO NOTHING */}
+
+void ShapeFixer::visit(const ir::operation::StridedSlice &node)
+{
+ const auto ofm_index{node.getOutputs().at(0)};
+ const auto ifm_index{node.getInputs().at(ir::operation::StridedSlice::Input::INPUT)};
+ _tensor_builder->dimCorrection(ofm_index, false);
+ _tensor_builder->dimCorrection(ifm_index, false);
+}
+
+void ShapeFixer::visit(const ir::operation::Softmax &) { /* DO NOTHING */}
+
+void ShapeFixer::visit(const ir::operation::SpaceToBatchND &node)
+{
+ const auto ofm_index{node.getOutputs().at(0)};
+ const auto ifm_index{node.getInputs().at(ir::operation::SpaceToBatchND::Input::INPUT)};
+ _tensor_builder->dimCorrection(ofm_index, false);
+ _tensor_builder->dimCorrection(ifm_index, false);
+}
+
+void ShapeFixer::visit(const ir::operation::SpaceToDepth &) { /* DO NOTHING */}
+
+void ShapeFixer::visit(const ir::operation::Split &node)
+{
+ const auto input_index{node.getInputs().at(ir::operation::Split::Input::INPUT)};
+ _tensor_builder->dimCorrection(input_index, false);
+ for (const auto &output : node.getOutputs())
+ _tensor_builder->dimCorrection(output, false);
+}
+
+void ShapeFixer::visit(const ir::operation::SQRT &) { /* DO NOTHING */}
+
+void ShapeFixer::visit(const ir::operation::SquaredDifference &node)
+{
+ const auto lhs_index{node.getInputs().at(ir::operation::SquaredDifference::Input::LHS)};
+ const auto rhs_index{node.getInputs().at(ir::operation::SquaredDifference::Input::RHS)};
+
+ if (!(_ctx.at(lhs_index).shape() == _ctx.at(rhs_index).shape()))
+ {
+ const auto broadcast_rank =
+ std::max(_ctx.at(lhs_index).shape().rank(), _ctx.at(rhs_index).shape().rank());
+
+ // TODO remove const_cast later. For example, _ctx may need to be a non const variable or
+ // a node to extend shape may be inserted in front of this operation
+ const_cast<ir::Shape &>(_ctx.at(lhs_index).shape()).extendRank(broadcast_rank);
+ const_cast<ir::Shape &>(_ctx.at(rhs_index).shape()).extendRank(broadcast_rank);
+ }
+}
+
+void ShapeFixer::visit(const ir::operation::Sub &node)
+{
+ const auto lhs_index{node.getInputs().at(ir::operation::Sub::Input::LHS)};
+ const auto rhs_index{node.getInputs().at(ir::operation::Sub::Input::RHS)};
+
+ if (!(_ctx.at(lhs_index).shape() == _ctx.at(rhs_index).shape()))
+ {
+ const auto broadcast_rank =
+ std::max(_ctx.at(lhs_index).shape().rank(), _ctx.at(rhs_index).shape().rank());
+ // TODO remove const_cast later. For example, _ctx may need to be a non const variable or
+ // a node to extend shape may be inserted in front of this operation
+ const_cast<ir::Shape &>(_ctx.at(lhs_index).shape()).extendRank(broadcast_rank);
+ const_cast<ir::Shape &>(_ctx.at(rhs_index).shape()).extendRank(broadcast_rank);
+ }
+}
+
+void ShapeFixer::visit(const ir::operation::TransposeConv &) { /* DO NOTHING */}
+
+void ShapeFixer::visit(const ir::operation::Transpose &) { /* DO NOTHING */}
+
+void ShapeFixer::visit(const ir::operation::Unpack &node)
+{
+ const auto input_index{node.getInputs().at(ir::operation::Unpack::Input::INPUT)};
+ _tensor_builder->dimCorrection(input_index, false);
+ for (const auto &output_index : node.getOutputs())
+ _tensor_builder->dimCorrection(output_index, false);
+}
+
+void ShapeFixer::visit(const ir::operation::Add &node)
+{
+ const auto lhs_index{node.getInputs().at(ir::operation::Add::Input::LHS)};
+ const auto rhs_index{node.getInputs().at(ir::operation::Add::Input::RHS)};
+
+ if (!(_ctx.at(lhs_index).shape() == _ctx.at(rhs_index).shape()))
+ {
+ const auto broadcast_rank =
+ std::max(_ctx.at(lhs_index).shape().rank(), _ctx.at(rhs_index).shape().rank());
+ const_cast<ir::Shape &>(_ctx.at(lhs_index).shape()).extendRank(broadcast_rank);
+ const_cast<ir::Shape &>(_ctx.at(rhs_index).shape()).extendRank(broadcast_rank);
+ }
+}
+
+void ShapeFixer::visit(const ir::operation::Div &node)
+{
+ const auto lhs_index{node.getInputs().at(ir::operation::Div::Input::LHS)};
+ const auto rhs_index{node.getInputs().at(ir::operation::Div::Input::RHS)};
+
+ if (!(_ctx.at(lhs_index).shape() == _ctx.at(rhs_index).shape()))
+ {
+ const auto broadcast_rank =
+ std::max(_ctx.at(lhs_index).shape().rank(), _ctx.at(rhs_index).shape().rank());
+
+ // TODO remove const_cast later. For example, _ctx may need to be a non const variable or
+ // a node to extend shape may be inserted in front of this operation
+ const_cast<ir::Shape &>(_ctx.at(lhs_index).shape()).extendRank(broadcast_rank);
+ const_cast<ir::Shape &>(_ctx.at(rhs_index).shape()).extendRank(broadcast_rank);
+ }
+}
+
+void ShapeFixer::visit(const ir::operation::Min &node)
+{
+ const auto lhs_index{node.getInputs().at(ir::operation::Min::Input::LHS)};
+ const auto rhs_index{node.getInputs().at(ir::operation::Min::Input::RHS)};
+
+ if (!(_ctx.at(lhs_index).shape() == _ctx.at(rhs_index).shape()))
+ {
+ const auto broadcast_rank =
+ std::max(_ctx.at(lhs_index).shape().rank(), _ctx.at(rhs_index).shape().rank());
+
+ // TODO remove const_cast later. For example, _ctx may need to be a non const variable or
+ // a node to extend shape may be inserted in front of this operation
+ const_cast<ir::Shape &>(_ctx.at(lhs_index).shape()).extendRank(broadcast_rank);
+ const_cast<ir::Shape &>(_ctx.at(rhs_index).shape()).extendRank(broadcast_rank);
+ }
+}
+
+void ShapeFixer::visit(const ir::operation::Max &node)
+{
+ const auto lhs_index{node.getInputs().at(ir::operation::Max::Input::LHS)};
+ const auto rhs_index{node.getInputs().at(ir::operation::Max::Input::RHS)};
+
+ if (!(_ctx.at(lhs_index).shape() == _ctx.at(rhs_index).shape()))
+ {
+ const auto broadcast_rank =
+ std::max(_ctx.at(lhs_index).shape().rank(), _ctx.at(rhs_index).shape().rank());
+
+ // TODO remove const_cast later. For example, _ctx may need to be a non const variable or
+ // a node to extend shape may be inserted in front of this operation
+ const_cast<ir::Shape &>(_ctx.at(lhs_index).shape()).extendRank(broadcast_rank);
+ const_cast<ir::Shape &>(_ctx.at(rhs_index).shape()).extendRank(broadcast_rank);
+ }
+}
+
+} // namespace acl_neon
+} // namespace backend
+} // namespace neurun
diff --git a/runtime/neurun/backend/acl_neon/ShapeFixer.h b/runtime/neurun/backend/acl_neon/ShapeFixer.h
new file mode 100644
index 000000000..aa1f8f75a
--- /dev/null
+++ b/runtime/neurun/backend/acl_neon/ShapeFixer.h
@@ -0,0 +1,109 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NEURUN_BACKEND_ACL_NEON_SHAPE_FIXER_H__
+#define __NEURUN_BACKEND_ACL_NEON_SHAPE_FIXER_H__
+
+#include <backend/IShapeFixer.h>
+
+#include "ir/Operands.h"
+#include "TensorBuilder.h"
+
+namespace neurun
+{
+namespace backend
+{
+namespace acl_neon
+{
+
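+// Pre-pass over the graph that adjusts operand shapes for this backend: it extends the rank
+// of broadcastable inputs and flags operands that must skip dimension correction.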
+class ShapeFixer : public IShapeFixer
+{
+public:
+ ShapeFixer(const ir::Operands &ctx, const std::shared_ptr<TensorBuilder> &tensor_builder);
+
+ void visit(const ir::operation::Abs &) override;
+ void visit(const ir::operation::ArgMax &) override;
+ void visit(const ir::operation::BatchToSpaceND &) override;
+ void visit(const ir::operation::Cast &) override;
+ void visit(const ir::operation::Conv2D &) override;
+ void visit(const ir::operation::DepthToSpace &) override;
+ void visit(const ir::operation::DepthwiseConv2D &) override;
+ void visit(const ir::operation::Dequantize &) override;
+ void visit(const ir::operation::MaxPool2D &) override;
+ void visit(const ir::operation::Mean &) override;
+ void visit(const ir::operation::AvgPool2D &) override;
+ void visit(const ir::operation::Concat &) override;
+ void visit(const ir::operation::EmbeddingLookup &) override;
+ void visit(const ir::operation::Exp &) override;
+ void visit(const ir::operation::Floor &) override;
+ void visit(const ir::operation::FullyConnected &) override;
+ void visit(const ir::operation::Gather &) override;
+ void visit(const ir::operation::HashtableLookup &) override;
+ void visit(const ir::operation::InstanceNorm &) override;
+ void visit(const ir::operation::L2Normalization &) override;
+ void visit(const ir::operation::L2Pool2D &) override;
+ void visit(const ir::operation::LocalResponseNormalization &) override;
+ void visit(const ir::operation::LogicalAnd &) override;
+ void visit(const ir::operation::LogicalNot &) override;
+ void visit(const ir::operation::LogicalOr &) override;
+ void visit(const ir::operation::Logistic &) override;
+ void visit(const ir::operation::LSTM &) override;
+ void visit(const ir::operation::Mul &) override;
+ void visit(const ir::operation::Neg &) override;
+ void visit(const ir::operation::Pack &) override;
+ void visit(const ir::operation::Pad &) override;
+ void visit(const ir::operation::Permute &) override;
+ void visit(const ir::operation::PReLU &) override;
+ void visit(const ir::operation::ReduceMax &) override;
+ void visit(const ir::operation::ReduceMin &) override;
+ void visit(const ir::operation::ReduceSum &) override;
+ void visit(const ir::operation::ReLU &) override;
+ void visit(const ir::operation::ReLU1 &) override;
+ void visit(const ir::operation::ReLU6 &) override;
+ void visit(const ir::operation::Reshape &) override;
+ void visit(const ir::operation::ResizeBilinear &) override;
+ void visit(const ir::operation::RNN &) override;
+ void visit(const ir::operation::RSQRT &) override;
+ void visit(const ir::operation::Squeeze &) override;
+ void visit(const ir::operation::Tanh &) override;
+ void visit(const ir::operation::Softmax &) override;
+ void visit(const ir::operation::SpaceToBatchND &) override;
+ void visit(const ir::operation::SpaceToDepth &) override;
+ void visit(const ir::operation::Split &) override;
+ void visit(const ir::operation::SQRT &) override;
+ void visit(const ir::operation::SquaredDifference &) override;
+ void visit(const ir::operation::Sub &) override;
+ void visit(const ir::operation::Slice &) override;
+ void visit(const ir::operation::StridedSlice &) override;
+ void visit(const ir::operation::TransposeConv &) override;
+ void visit(const ir::operation::Transpose &) override;
+ void visit(const ir::operation::Unpack &) override;
+ void visit(const ir::operation::Add &) override;
+ void visit(const ir::operation::Div &) override;
+ void visit(const ir::operation::Comparison &) override;
+ void visit(const ir::operation::Min &) override;
+ void visit(const ir::operation::Max &) override;
+
+private:
+ const ir::Operands &_ctx;
+ std::shared_ptr<TensorBuilder> _tensor_builder;
+};
+
+} // namespace acl_neon
+} // namespace backend
+} // namespace neurun
+
+#endif // __NEURUN_BACKEND_ACL_NEON_SHAPE_FIXER_H__
diff --git a/runtime/neurun/backend/acl_neon/TensorBuilder.h b/runtime/neurun/backend/acl_neon/TensorBuilder.h
new file mode 100644
index 000000000..0a6b4921d
--- /dev/null
+++ b/runtime/neurun/backend/acl_neon/TensorBuilder.h
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NEURUN_BACKEND_ACL_NEON_TENSOR_BUILDER_H__
+#define __NEURUN_BACKEND_ACL_NEON_TENSOR_BUILDER_H__
+
+#include <TemplTensorBuilder.h>
+
+#include "operand/NETensor.h"
+#include "operand/NESubTensor.h"
+
+namespace neurun
+{
+namespace backend
+{
+namespace acl_neon
+{
+
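+// The NEON backend reuses the ACL-common tensor builder template, instantiated with the
+// NEON-specific tensor interface, tensor, and sub-tensor types.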
+using TensorBuilder =
+ acl_common::TemplTensorBuilder<operand::INETensor, operand::NETensor, operand::NESubTensor>;
+
+} // namespace acl_neon
+} // namespace backend
+} // namespace neurun
+
+#endif // __NEURUN_BACKEND_ACL_NEON_TENSOR_BUILDER_H__
diff --git a/runtime/neurun/backend/acl_neon/TensorManager.h b/runtime/neurun/backend/acl_neon/TensorManager.h
new file mode 100644
index 000000000..725275cef
--- /dev/null
+++ b/runtime/neurun/backend/acl_neon/TensorManager.h
@@ -0,0 +1,78 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NEURUN_BACKEND_ACL_NEON_TENSOR_MANAGER_H__
+#define __NEURUN_BACKEND_ACL_NEON_TENSOR_MANAGER_H__
+
+#include <arm_compute/runtime/Allocator.h>
+#include <arm_compute/runtime/PoolManager.h>
+#include <arm_compute/runtime/OffsetLifetimeManager.h>
+#include <arm_compute/runtime/MemoryManagerOnDemand.h>
+#include <arm_compute/runtime/MemoryGroup.h>
+
+#include <AclMemoryManager.h>
+#include <AclLinearMemoryManager.h>
+#include <AclInternalBufferManager.h>
+#include <AclTensorManager.h>
+
+#include "operand/NETensor.h"
+#include "operand/NESubTensor.h"
+
+#include "util/logging.h"
+
+namespace neurun
+{
+namespace backend
+{
+namespace acl_neon
+{
+
+using MemoryManager =
+ acl_common::AclMemoryManager<operand::INETensor, operand::NETensor, operand::NESubTensor>;
+
+using LinearMemoryManager = acl_common::AclLinearMemoryManager<
+ operand::INETensor, operand::NETensor, operand::NESubTensor,
+ ::arm_compute::MemoryManagerOnDemand, ::arm_compute::PoolManager,
+ ::arm_compute::OffsetLifetimeManager, ::arm_compute::Allocator, ::arm_compute::MemoryGroup>;
+
+using InternalBufferManager = acl_common::AclInternalBufferManager<
+ ::arm_compute::MemoryManagerOnDemand, ::arm_compute::PoolManager,
+ ::arm_compute::OffsetLifetimeManager, ::arm_compute::Allocator>;
+
+using TensorManager = acl_common::AclTensorManager<acl_neon::operand::INETensor, operand::NETensor,
+ operand::NESubTensor>;
+
+inline TensorManager *createTensorManager()
+{
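+  // When the Linear executor is selected, tensor lifetimes are known ahead of time, so a
+  // lifetime-based linear memory manager can be used; otherwise plain memory managers are used.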
+ const std::string executor_str = util::getConfigString(util::config::EXECUTOR);
+ if (executor_str == "Linear")
+ {
+ VERBOSE(acl_neon_createTensorManager) << "AclTensorManager as Linear" << std::endl;
+ return new TensorManager(new MemoryManager(), new LinearMemoryManager(),
+ new InternalBufferManager());
+ }
+ else
+ {
+ VERBOSE(acl_neon_createTensorManager) << "AclTensorManager" << std::endl;
+ return new TensorManager(new MemoryManager(), new MemoryManager(), new InternalBufferManager());
+ }
+}
+
+} // namespace acl_neon
+} // namespace backend
+} // namespace neurun
+
+#endif // __NEURUN_BACKEND_ACL_NEON_TENSOR_MANAGER_H__
diff --git a/runtime/neurun/backend/acl_neon/TensorRegister.cc b/runtime/neurun/backend/acl_neon/TensorRegister.cc
new file mode 100644
index 000000000..fe766cdf9
--- /dev/null
+++ b/runtime/neurun/backend/acl_neon/TensorRegister.cc
@@ -0,0 +1,30 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "TensorRegister.h"
+
+namespace neurun
+{
+namespace backend
+{
+namespace acl_neon
+{
+
+// NOTHING
+
+} // namespace acl_neon
+} // namespace backend
+} // namespace neurun
diff --git a/runtime/neurun/backend/acl_neon/TensorRegister.h b/runtime/neurun/backend/acl_neon/TensorRegister.h
new file mode 100644
index 000000000..115e05dee
--- /dev/null
+++ b/runtime/neurun/backend/acl_neon/TensorRegister.h
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NEURUN_BACKEND_ACL_NEON_TENSOR_REGISTER_H__
+#define __NEURUN_BACKEND_ACL_NEON_TENSOR_REGISTER_H__
+
+#include <AclTensorRegister.h>
+#include <misc/polymorphic_downcast.h>
+#include "TensorBuilder.h"
+
+namespace neurun
+{
+namespace backend
+{
+namespace acl_neon
+{
+
+class TensorRegister : public acl_common::AclTensorRegister
+{
+public:
+ TensorRegister(const ir::Operands &operands, const std::shared_ptr<TensorBuilder> &tensor_builder)
+ : acl_common::AclTensorRegister{operands, tensor_builder}
+ {
+ // DO NOTHING
+ }
+
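+  // Propagate the operand's use count to this backend's TensorBuilder.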
+ void setUsesCount(const ir::OperandIndex &ind, size_t num_uses) const override
+ {
+ nnfw::misc::polymorphic_downcast<TensorBuilder *>(tensor_builder().get())
+ ->setUsesCount(ind, num_uses);
+ }
+};
+
+} // namespace acl_neon
+} // namespace backend
+} // namespace neurun
+
+#endif // __NEURUN_BACKEND_ACL_NEON_TENSOR_REGISTER_H__
diff --git a/runtime/neurun/backend/acl_neon/operand/INETensor.cc b/runtime/neurun/backend/acl_neon/operand/INETensor.cc
new file mode 100644
index 000000000..fdb20970d
--- /dev/null
+++ b/runtime/neurun/backend/acl_neon/operand/INETensor.cc
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "INETensor.h"
+
+namespace neurun
+{
+namespace backend
+{
+namespace acl_neon
+{
+namespace operand
+{
+
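+// NEON tensors live in host-accessible memory, so no map/unmap step is needed; the callback
+// can operate on the tensor directly.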
+void INETensor::access(const std::function<void(ITensor &tensor)> &fn) { fn(*this); }
+
+} // namespace operand
+} // namespace acl_neon
+} // namespace backend
+} // namespace neurun
diff --git a/runtime/neurun/backend/acl_neon/operand/INETensor.h b/runtime/neurun/backend/acl_neon/operand/INETensor.h
new file mode 100644
index 000000000..22b1140cf
--- /dev/null
+++ b/runtime/neurun/backend/acl_neon/operand/INETensor.h
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NEURUN_BACKEND_ACL_NEON_OPERAND_I_NE_TENSOR_H__
+#define __NEURUN_BACKEND_ACL_NEON_OPERAND_I_NE_TENSOR_H__
+
+#include <arm_compute/core/ITensor.h>
+
+#include <IACLTensor.h>
+
+namespace neurun
+{
+namespace backend
+{
+namespace acl_neon
+{
+namespace operand
+{
+
+class INETensor : public acl_common::IACLTensor
+{
+public:
+ const arm_compute::ITensor *handle() const override = 0;
+ arm_compute::ITensor *handle() override = 0;
+ void access(const std::function<void(ITensor &tensor)> &fn) final;
+};
+
+} // namespace operand
+} // namespace acl_neon
+} // namespace backend
+} // namespace neurun
+
+#endif // __NEURUN_BACKEND_ACL_NEON_OPERAND_I_NE_TENSOR_H__
diff --git a/runtime/neurun/backend/acl_neon/operand/NESubTensor.cc b/runtime/neurun/backend/acl_neon/operand/NESubTensor.cc
new file mode 100644
index 000000000..a36af609c
--- /dev/null
+++ b/runtime/neurun/backend/acl_neon/operand/NESubTensor.cc
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "NESubTensor.h"
+
+namespace neurun
+{
+namespace backend
+{
+namespace acl_neon
+{
+namespace operand
+{
+
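+// A sub-tensor is a view into its parent's buffer; when `extend_parent` is true, ACL may
+// grow the parent shape so that the sub-tensor fits inside it.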
+NESubTensor::NESubTensor(INETensor *parent, const arm_compute::TensorShape &tensor_shape,
+ const arm_compute::Coordinates &coords, size_t rank, bool extend_parent)
+ : _ne_sub_tensor(std::make_shared<arm_compute::SubTensor>(parent->handle(), tensor_shape,
+ coords, extend_parent)),
+ _rank{rank}
+{
+ // DO NOTHING
+}
+
+const arm_compute::SubTensor *NESubTensor::handle() const { return _ne_sub_tensor.get(); }
+
+arm_compute::SubTensor *NESubTensor::handle() { return _ne_sub_tensor.get(); }
+
+} // namespace operand
+} // namespace acl_neon
+} // namespace backend
+} // namespace neurun
diff --git a/runtime/neurun/backend/acl_neon/operand/NESubTensor.h b/runtime/neurun/backend/acl_neon/operand/NESubTensor.h
new file mode 100644
index 000000000..010e4deda
--- /dev/null
+++ b/runtime/neurun/backend/acl_neon/operand/NESubTensor.h
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NEURUN_BACKEND_ACL_NEON_OPERAND_NE_SUB_TENSOR_H__
+#define __NEURUN_BACKEND_ACL_NEON_OPERAND_NE_SUB_TENSOR_H__
+
+#include <arm_compute/runtime/SubTensor.h>
+#include "INETensor.h"
+#include "compiler/SubTensorInfo.h"
+
+namespace neurun
+{
+namespace backend
+{
+namespace acl_neon
+{
+namespace operand
+{
+
+class NESubTensor : public INETensor
+{
+public:
+ NESubTensor() = delete;
+
+public:
+ NESubTensor(INETensor *parent, const arm_compute::TensorShape &tensor_shape,
+ const arm_compute::Coordinates &coords, size_t rank, bool extend_parent = false);
+
+public:
+ size_t num_dimensions() const final { return _rank; }
+
+public:
+ const arm_compute::SubTensor *handle() const override;
+ arm_compute::SubTensor *handle() override;
+
+public:
+ // This method is used to prevent the use of memcpy for SubTensor
+ bool has_padding() const override { return true; }
+
+private:
+ std::shared_ptr<arm_compute::SubTensor> _ne_sub_tensor;
+ size_t _rank;
+};
+
+} // namespace operand
+} // namespace acl_neon
+} // namespace backend
+} // namespace neurun
+
+#endif // __NEURUN_BACKEND_ACL_NEON_OPERAND_NE_SUB_TENSOR_H__
diff --git a/runtime/neurun/backend/acl_neon/operand/NETensor.cc b/runtime/neurun/backend/acl_neon/operand/NETensor.cc
new file mode 100644
index 000000000..8a9ece88f
--- /dev/null
+++ b/runtime/neurun/backend/acl_neon/operand/NETensor.cc
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <arm_compute/runtime/Memory.h>
+#include <arm_compute/runtime/MemoryRegion.h>
+#include "NETensor.h"
+
+namespace neurun
+{
+namespace backend
+{
+namespace acl_neon
+{
+namespace operand
+{
+
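+// init() only registers the tensor metadata; the backing memory is allocated later by the
+// backend's memory manager.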
+NETensor::NETensor(const arm_compute::TensorInfo &info, size_t rank, size_t num_uses)
+ : _ne_tensor(std::make_shared<arm_compute::Tensor>()), _rank{rank}, _num_uses{num_uses}
+{
+ allocator()->init(info);
+}
+
+const arm_compute::Tensor *NETensor::handle() const { return _ne_tensor.get(); }
+
+arm_compute::Tensor *NETensor::handle() { return _ne_tensor.get(); }
+
+arm_compute::TensorAllocator *NETensor::allocator() { return _ne_tensor->allocator(); }
+
+} // namespace operand
+} // namespace acl_neon
+} // namespace backend
+} // namespace neurun
diff --git a/runtime/neurun/backend/acl_neon/operand/NETensor.h b/runtime/neurun/backend/acl_neon/operand/NETensor.h
new file mode 100644
index 000000000..3de4695e9
--- /dev/null
+++ b/runtime/neurun/backend/acl_neon/operand/NETensor.h
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NEURUN_BACKEND_ACL_NEON_OPERAND_NE_TENSOR_H__
+#define __NEURUN_BACKEND_ACL_NEON_OPERAND_NE_TENSOR_H__
+
+#include <arm_compute/core/TensorInfo.h>
+#include <arm_compute/runtime/Tensor.h>
+#include "arm_compute/runtime/TensorAllocator.h"
+#include "INETensor.h"
+
+namespace neurun
+{
+namespace backend
+{
+namespace acl_neon
+{
+namespace operand
+{
+
+class NETensor : public INETensor
+{
+public:
+ NETensor() = delete;
+
+public:
+ NETensor(const arm_compute::TensorInfo &info, size_t rank, size_t num_uses);
+
+public:
+ size_t num_dimensions() const final { return _rank; }
+
+public:
+ const arm_compute::Tensor *handle() const override;
+ arm_compute::Tensor *handle() override;
+ size_t num_uses() const { return _num_uses; }
+
+public:
+ arm_compute::TensorAllocator *allocator();
+
+private:
+ std::shared_ptr<arm_compute::Tensor> _ne_tensor;
+ size_t _rank;
+ size_t _num_uses;
+};
+
+} // namespace operand
+} // namespace acl_neon
+} // namespace backend
+} // namespace neurun
+
+#endif // __NEURUN_BACKEND_ACL_NEON_OPERAND_NE_TENSOR_H__