Diffstat (limited to 'runtime/neurun/backend/acl_cl')
21 files changed, 3967 insertions, 0 deletions
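Before the per-file diffs, a brief usage sketch (an editor's addition, not part of the patch) shows how a caller might drive the classes introduced below. The driver function profile_run is hypothetical, and the sketch assumes custom::IKernelBuilder resolves to neurun::backend::custom::IKernelBuilder, as its unqualified use in Backend.h suggests:

#include "Backend.h" // runtime/neurun/backend/acl_cl/Backend.h (added by this patch)

// Hedged sketch: create the backend, initialize OpenCL, build a per-graph
// context, and time the enqueued CL kernels around an execution.
void profile_run(const neurun::ir::Operands &operands,
                 const std::shared_ptr<neurun::backend::custom::IKernelBuilder> &kb)
{
  neurun::backend::acl_cl::Backend backend;
  if (!backend.config()->initialize()) // sets up CLScheduler and CLKernelLibraryEx
    return;

  // The context bundles the tensor builder, constant initializer, kernel
  // generator, shape fixer and tensor register (see Backend.h below).
  auto context = backend.newContext(operands, kb);
  (void)context; // the core runtime would hand this to the compiler/executor

  auto timer = backend.config()->timer(); // a CLTimer (see Config.h below)
  timer->handleBegin(); // hooks clEnqueueNDRangeKernel and enables queue profiling
  // ... execute the generated kernels here ...
  timer->handleEnd(); // restores the hook and sums the kernel durations
}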
diff --git a/runtime/neurun/backend/acl_cl/Backend.h b/runtime/neurun/backend/acl_cl/Backend.h new file mode 100644 index 000000000..2033b42e7 --- /dev/null +++ b/runtime/neurun/backend/acl_cl/Backend.h @@ -0,0 +1,65 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __NEURUN_BACKEND_ACL_CL_BACKEND_H__ +#define __NEURUN_BACKEND_ACL_CL_BACKEND_H__ + +#include <memory> +#include <backend/Backend.h> +#include <ir/Operands.h> + +#include "Config.h" +#include "ConstantInitializer.h" +#include "KernelGenerator.h" +#include "ShapeFixer.h" +#include "TensorManager.h" +#include "TensorRegister.h" + +namespace neurun +{ +namespace backend +{ +namespace acl_cl +{ + +class Backend : public ::neurun::backend::Backend +{ +public: + Backend() : _config{std::make_shared<Config>()} {} + + std::shared_ptr<IConfig> config() const override { return _config; } + + std::unique_ptr<BackendContext> + newContext(const ir::Operands &operands, + const std::shared_ptr<custom::IKernelBuilder> &) const override + { + auto tensor_builder = std::make_shared<TensorBuilder>(createTensorManager()); + return std::unique_ptr<BackendContext>{new BackendContext{ + this, tensor_builder, std::make_shared<ConstantInitializer>(operands, tensor_builder), + std::make_shared<KernelGenerator>(operands, tensor_builder), + std::make_shared<ShapeFixer>(operands, tensor_builder), + std::make_shared<TensorRegister>(operands, tensor_builder)}}; + } + +private: + std::shared_ptr<IConfig> _config; +}; + +} // namespace acl_cl +} // namespace backend +} // namespace neurun + +#endif // __NEURUN_BACKEND_ACL_CL_BACKEND_H__ diff --git a/runtime/neurun/backend/acl_cl/CLTimer.h b/runtime/neurun/backend/acl_cl/CLTimer.h new file mode 100644 index 000000000..3939ee722 --- /dev/null +++ b/runtime/neurun/backend/acl_cl/CLTimer.h @@ -0,0 +1,108 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __NEURUN_BACKEND_ACL_CL_CLTIMER_H__ +#define __NEURUN_BACKEND_ACL_CL_CLTIMER_H__ + +#include <util/ITimer.h> +#include <arm_compute/core/CL/OpenCL.h> +#include <arm_compute/runtime/CL/CLScheduler.h> +#include <chrono> +#include <list> +#include <sstream> + +namespace neurun +{ +namespace backend +{ +namespace acl_cl +{ + +/** + * @brief Class to measure the execution time of CL kernels + */ +class CLTimer : public util::ITimer +{ +public: + /** + * @brief Replaces the CL function that enqueues a command to execute a kernel + * with a wrapper that remembers the enqueued kernels + */ + void handleBegin() override + { + _measured_events.clear(); + + _origin_enqueue_function = arm_compute::CLSymbols::get().clEnqueueNDRangeKernel_ptr; + + auto _timer_enqueue_function = [this](cl_command_queue command_queue, cl_kernel kernel, + cl_uint work_dim, const size_t *gwo, const size_t *gws, + const size_t *lws, cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, cl_event *usr_event) { + cl_event event; + cl_int enqueue_res = + this->_origin_enqueue_function(command_queue, kernel, work_dim, gwo, gws, lws, + num_events_in_wait_list, event_wait_list, &event); + this->_measured_events.emplace_back(event); + + // According to the spec, if NULL was passed as usr_event, no event should be returned + if (usr_event != nullptr) + { + clRetainEvent(event); + *usr_event = event; + } + return enqueue_res; + }; + arm_compute::CLSymbols::get().clEnqueueNDRangeKernel_ptr = _timer_enqueue_function; + + // Set CL_QUEUE_PROFILING_ENABLE flag for the CL command-queue, if it isn't already set + auto &cl_scheduler = arm_compute::CLScheduler::get(); + auto props = cl_scheduler.queue().getInfo<CL_QUEUE_PROPERTIES>(); + if ((props & CL_QUEUE_PROFILING_ENABLE) == 0) + { + cl_scheduler.set_queue( + cl::CommandQueue(cl_scheduler.context(), props | CL_QUEUE_PROFILING_ENABLE)); + } + }; + + /** + * @brief Computes the timer result by summing the durations of the executed CL kernels + */ + void handleEnd() override + { + _timer_res = 0; + for (auto const &event : _measured_events) + { + cl_ulong start; + cl_ulong end; + event.getProfilingInfo(CL_PROFILING_COMMAND_START, &start); + event.getProfilingInfo(CL_PROFILING_COMMAND_END, &end); + _timer_res += (end - start) / 1000.f; // nanoseconds -> microseconds + } + + // Restore the original CL enqueue function + arm_compute::CLSymbols::get().clEnqueueNDRangeKernel_ptr = _origin_enqueue_function; + }; + +private: + std::function<decltype(clEnqueueNDRangeKernel)> _origin_enqueue_function; + std::list<::cl::Event> _measured_events; +}; + +} // namespace acl_cl +} // namespace backend +} // namespace neurun + +#endif // __NEURUN_BACKEND_ACL_CL_CLTIMER_H__ diff --git a/runtime/neurun/backend/acl_cl/CMakeLists.txt b/runtime/neurun/backend/acl_cl/CMakeLists.txt new file mode 100644 index 000000000..aaf6a4d62 --- /dev/null +++ b/runtime/neurun/backend/acl_cl/CMakeLists.txt @@ -0,0 +1,21 @@ +# Skip this backend on unsupported architectures (ARMCompute not found) +nnas_find_package(ARMCompute QUIET) +if(NOT ARMCompute_FOUND) + return() +endif(NOT ARMCompute_FOUND) + +set(LIB_NEURUN_BACKEND_ACL_CL neurun_backend_acl_cl) + +file(GLOB_RECURSE SOURCES "*.cc") + +add_library(${LIB_NEURUN_BACKEND_ACL_CL} SHARED ${SOURCES}) + +target_include_directories(${LIB_NEURUN_BACKEND_ACL_CL} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}) +target_link_libraries(${LIB_NEURUN_BACKEND_ACL_CL} PRIVATE neurun_core) +target_link_libraries(${LIB_NEURUN_BACKEND_ACL_CL} PRIVATE ${LIB_NEURUN_BACKEND_ACL_COMMON}) +target_link_libraries(${LIB_NEURUN_BACKEND_ACL_CL} PRIVATE
nnfw_common) +target_link_libraries(${LIB_NEURUN_BACKEND_ACL_CL} PRIVATE nnfw_coverage) + +set_target_properties(${LIB_NEURUN_BACKEND_ACL_CL} PROPERTIES OUTPUT_NAME backend_acl_cl) + +install(TARGETS ${LIB_NEURUN_BACKEND_ACL_CL} DESTINATION lib) diff --git a/runtime/neurun/backend/acl_cl/Config.cc b/runtime/neurun/backend/acl_cl/Config.cc new file mode 100644 index 000000000..36bf83686 --- /dev/null +++ b/runtime/neurun/backend/acl_cl/Config.cc @@ -0,0 +1,50 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// For CLKernelLibraryEx initialization +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/CLKernelLibraryEx.h" + +#include <arm_compute/runtime/CL/CLScheduler.h> + +#include "Config.h" + +namespace neurun +{ +namespace backend +{ +namespace acl_cl +{ + +bool Config::initialize() +{ + if (!arm_compute::opencl_is_available()) + { + return false; + } + arm_compute::CLScheduler::get().default_init(); + // NOTE CLKernelLibraryEx must use the same context as CLScheduler + // It does not check whether another device is available. + arm_compute::CLKernelLibraryEx::get().init( + "./cl_kernels/", arm_compute::CLScheduler::get().context(), cl::Device::getDefault()); + + return true; +} + +} // namespace acl_cl +} // namespace backend +} // namespace neurun diff --git a/runtime/neurun/backend/acl_cl/Config.h b/runtime/neurun/backend/acl_cl/Config.h new file mode 100644 index 000000000..a7ceaac26 --- /dev/null +++ b/runtime/neurun/backend/acl_cl/Config.h @@ -0,0 +1,45 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ + +#ifndef __NEURUN_BACKEND_ACL_CL_CONFIG_H__ +#define __NEURUN_BACKEND_ACL_CL_CONFIG_H__ + +#include "CLTimer.h" +#include <cpp14/memory.h> +#include <backend/IConfig.h> + +namespace neurun +{ +namespace backend +{ +namespace acl_cl +{ + +class Config : public IConfig +{ +public: + std::string id() override { return "acl_cl"; } + bool initialize() override; + bool SupportPermutation() override { return true; } + bool SupportSubTensorAlloc() override { return true; } + std::unique_ptr<util::ITimer> timer() override { return nnfw::cpp14::make_unique<CLTimer>(); } +}; + +} // namespace acl_cl +} // namespace backend +} // namespace neurun + +#endif // __NEURUN_BACKEND_ACL_CL_CONFIG_H__ diff --git a/runtime/neurun/backend/acl_cl/ConstantInitializer.cc b/runtime/neurun/backend/acl_cl/ConstantInitializer.cc new file mode 100644 index 000000000..165b17cd1 --- /dev/null +++ b/runtime/neurun/backend/acl_cl/ConstantInitializer.cc @@ -0,0 +1,266 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "ConstantInitializer.h" + +namespace neurun +{ +namespace backend +{ +namespace acl_cl +{ + +ConstantInitializer::ConstantInitializer(const ir::Operands &operands, + const std::shared_ptr<TensorBuilder> &tensor_builder) + : _operands{operands}, _tensor_builder{tensor_builder} +{ + // DO NOTHING +} + +void ConstantInitializer::visit(const ir::operation::BatchToSpaceND &node) +{ + const auto &block_size_index = node.getInputs().at(ir::operation::BatchToSpaceND::BLOCK_SIZE); + const auto &block_size_obj = _operands.at(block_size_index); + + if (block_size_obj.isConstant()) + { + _init_map[block_size_index] = [](const ir::Operand &model_obj, backend::operand::ITensor &obj) { + const auto &shape = model_obj.shape(); + const auto base = reinterpret_cast<const int32_t *>(model_obj.data().base()); + assert(model_obj.shape().rank() == 1); + obj.access([&](::neurun::backend::operand::ITensor &tensor) { + for (size_t i = 0; i < shape.num_elements(); ++i) + { + const int32_t value = base[shape.num_elements() - i - 1]; + int32_t *into = reinterpret_cast<int32_t *>(tensor.buffer() + + tensor.calcOffset({static_cast<int32_t>(i)})); + *into = value; + } + }); + }; + } +} + +void ConstantInitializer::visit(const ir::operation::Conv2D &node) +{ + const auto &kernel_index = node.getInputs().at(ir::operation::Conv2D::KERNEL); + const auto &kernel_obj = _operands.at(kernel_index); + registerPermuteInitializer(kernel_index, kernel_obj); + + const auto &bias_index = node.getInputs().at(ir::operation::Conv2D::BIAS); + const auto &bias_obj = _operands.at(bias_index); + registerCopyInitializer(bias_index, bias_obj); +} + +void ConstantInitializer::visit(const ir::operation::DepthwiseConv2D &node) +{ + const auto &kernel_index = node.getInputs().at(ir::operation::DepthwiseConv2D::KERNEL); + const auto &kernel_obj = _operands.at(kernel_index); + registerPermuteInitializer(kernel_index, kernel_obj); + + const auto &bias_index = 
node.getInputs().at(ir::operation::DepthwiseConv2D::BIAS); + const auto &bias_obj = _operands.at(bias_index); + registerCopyInitializer(bias_index, bias_obj); +} + +void ConstantInitializer::visit(const ir::operation::EmbeddingLookup &node) +{ + const auto &lookups_index = node.getInputs().at(ir::operation::EmbeddingLookup::LOOKUPS); + const auto &lookups_obj = _operands.at(lookups_index); + registerCopyInitializer(lookups_index, lookups_obj); +} + +void ConstantInitializer::visit(const ir::operation::FullyConnected &node) +{ + const auto &weight_index = node.getInputs().at(ir::operation::FullyConnected::WEIGHT); + const auto &weight_obj = _operands.at(weight_index); + registerCopyInitializer(weight_index, weight_obj); + + const auto &bias_index = node.getInputs().at(ir::operation::FullyConnected::BIAS); + const auto &bias_obj = _operands.at(bias_index); + registerCopyInitializer(bias_index, bias_obj); +} + +void ConstantInitializer::visit(const ir::operation::Gather &node) +{ + const auto &indices_index = node.getInputs().at(ir::operation::Gather::INDICES); + const auto &indices_obj = _operands.at(indices_index); + registerCopyInitializer(indices_index, indices_obj); +} + +void ConstantInitializer::visit(const ir::operation::HashtableLookup &node) +{ + const auto &lookups_index = node.getInputs().at(ir::operation::HashtableLookup::LOOKUPS); + const auto &lookups_obj = _operands.at(lookups_index); + registerCopyInitializer(lookups_index, lookups_obj); + + const auto &keys_index = node.getInputs().at(ir::operation::HashtableLookup::KEYS); + const auto &keys_obj = _operands.at(keys_index); + registerCopyInitializer(keys_index, keys_obj); +} + +void ConstantInitializer::visit(const ir::operation::LSTM &node) +{ + const auto &input_to_input_weights_index = + node.getInputs().at(ir::operation::LSTM::INPUT_TO_INPUT_WEIGHTS); + const auto &input_to_input_weights_obj = _operands.at(input_to_input_weights_index); + registerCopyInitializer(input_to_input_weights_index, input_to_input_weights_obj); + + const auto &input_to_forget_weights_index = + node.getInputs().at(ir::operation::LSTM::INPUT_TO_FORGET_WEIGHTS); + const auto &input_to_forget_weights_obj = _operands.at(input_to_forget_weights_index); + registerCopyInitializer(input_to_forget_weights_index, input_to_forget_weights_obj); + + const auto &input_to_cell_weights_index = + node.getInputs().at(ir::operation::LSTM::INPUT_TO_CELL_WEIGHTS); + const auto &input_to_cell_weights_obj = _operands.at(input_to_cell_weights_index); + registerCopyInitializer(input_to_cell_weights_index, input_to_cell_weights_obj); + + const auto &input_to_output_weights_index = + node.getInputs().at(ir::operation::LSTM::INPUT_TO_OUTPUT_WEIGHTS); + const auto &input_to_output_weights_obj = _operands.at(input_to_output_weights_index); + registerCopyInitializer(input_to_output_weights_index, input_to_output_weights_obj); + + const auto &recurrent_to_input_weights_index = + node.getInputs().at(ir::operation::LSTM::RECURRENT_TO_INPUT_WEIGHTS); + const auto &recurrent_to_input_weights_obj = _operands.at(recurrent_to_input_weights_index); + registerCopyInitializer(recurrent_to_input_weights_index, recurrent_to_input_weights_obj); + + const auto &recurrent_to_forget_weights_index = + node.getInputs().at(ir::operation::LSTM::RECURRENT_TO_FORGET_WEIGHTS); + const auto &recurrent_to_forget_weights_obj = _operands.at(recurrent_to_forget_weights_index); + registerCopyInitializer(recurrent_to_forget_weights_index, recurrent_to_forget_weights_obj); + + const auto 
&recurrent_to_cell_weights_index = + node.getInputs().at(ir::operation::LSTM::RECURRENT_TO_CELL_WEIGHTS); + const auto &recurrent_to_cell_weights_obj = _operands.at(recurrent_to_cell_weights_index); + registerCopyInitializer(recurrent_to_cell_weights_index, recurrent_to_cell_weights_obj); + + const auto &recurrent_to_output_weights_index = + node.getInputs().at(ir::operation::LSTM::RECURRENT_TO_OUTPUT_WEIGHTS); + const auto &recurrent_to_output_weights_obj = _operands.at(recurrent_to_output_weights_index); + registerCopyInitializer(recurrent_to_output_weights_index, recurrent_to_output_weights_obj); + + const auto &cell_to_input_weights_index = + node.getInputs().at(ir::operation::LSTM::CELL_TO_INPUT_WEIGHTS); + const auto &cell_to_input_weights_obj = _operands.at(cell_to_input_weights_index); + registerCopyInitializer(cell_to_input_weights_index, cell_to_input_weights_obj); + + const auto &cell_to_forget_weights_index = + node.getInputs().at(ir::operation::LSTM::CELL_TO_FORGET_WEIGHTS); + const auto &cell_to_forget_weights_obj = _operands.at(cell_to_forget_weights_index); + registerCopyInitializer(cell_to_forget_weights_index, cell_to_forget_weights_obj); + + const auto &cell_to_output_weights_index = + node.getInputs().at(ir::operation::LSTM::CELL_TO_OUTPUT_WEIGHTS); + const auto &cell_to_output_weights_obj = _operands.at(cell_to_output_weights_index); + registerCopyInitializer(cell_to_output_weights_index, cell_to_output_weights_obj); + + const auto &input_gate_bias_index = node.getInputs().at(ir::operation::LSTM::INPUT_GATE_BIAS); + const auto &input_gate_bias_obj = _operands.at(input_gate_bias_index); + registerCopyInitializer(input_gate_bias_index, input_gate_bias_obj); + + const auto &forget_gate_bias_index = node.getInputs().at(ir::operation::LSTM::FORGET_GATE_BIAS); + const auto &forget_gate_bias_obj = _operands.at(forget_gate_bias_index); + registerCopyInitializer(forget_gate_bias_index, forget_gate_bias_obj); + + const auto &output_gate_bias_index = node.getInputs().at(ir::operation::LSTM::OUTPUT_GATE_BIAS); + const auto &output_gate_bias_obj = _operands.at(output_gate_bias_index); + registerCopyInitializer(output_gate_bias_index, output_gate_bias_obj); + + const auto &projection_weights_index = + node.getInputs().at(ir::operation::LSTM::PROJECTION_WEIGHTS); + const auto &projection_weights_obj = _operands.at(projection_weights_index); + registerCopyInitializer(projection_weights_index, projection_weights_obj); + + const auto &projection_bias_index = node.getInputs().at(ir::operation::LSTM::PROJECTION_BIAS); + const auto &projection_bias_obj = _operands.at(projection_bias_index); + registerCopyInitializer(projection_bias_index, projection_bias_obj); +} + +void ConstantInitializer::visit(const ir::operation::RNN &node) +{ + const auto &weights_index = node.getInputs().at(ir::operation::RNN::WEIGHTS); + const auto &weights_obj = _operands.at(weights_index); + registerCopyInitializer(weights_index, weights_obj); + + const auto &recurrent_weights_index = node.getInputs().at(ir::operation::RNN::RECURRENT_WEIGHTS); + const auto &recurrent_weights_obj = _operands.at(recurrent_weights_index); + registerCopyInitializer(recurrent_weights_index, recurrent_weights_obj); + + const auto &bias_index = node.getInputs().at(ir::operation::RNN::BIAS); + const auto &bias_obj = _operands.at(bias_index); + registerCopyInitializer(bias_index, bias_obj); +} + +void ConstantInitializer::visit(const ir::operation::SpaceToBatchND &node) +{ + const auto &block_size_index = 
node.getInputs().at(ir::operation::SpaceToBatchND::BLOCK_SIZE); + const auto &block_size_obj = _operands.at(block_size_index); + + if (block_size_obj.isConstant()) + { + _init_map[block_size_index] = [](const ir::Operand &model_obj, backend::operand::ITensor &obj) { + const auto &shape = model_obj.shape(); + const auto base = reinterpret_cast<const int32_t *>(model_obj.data().base()); + assert(model_obj.shape().rank() == 1); + obj.access([&](::neurun::backend::operand::ITensor &tensor) { + for (size_t i = 0; i < shape.num_elements(); ++i) + { + const int32_t value = base[shape.num_elements() - i - 1]; + int32_t *into = reinterpret_cast<int32_t *>(tensor.buffer() + + tensor.calcOffset({static_cast<int32_t>(i)})); + *into = value; + } + }); + }; + } + + const auto &paddings_index = node.getInputs().at(ir::operation::SpaceToBatchND::PADDINGS); + const auto &paddings_obj = _operands.at(paddings_index); + if (paddings_obj.isConstant()) + { + _init_map[paddings_index] = [](const ir::Operand &model_obj, backend::operand::ITensor &obj) { + const auto &shape = model_obj.shape(); + const auto base = reinterpret_cast<const int32_t *>(model_obj.data().base()); + assert(model_obj.shape().rank() == 2); + assert(obj.dimension(0) == 2); + obj.access([&](::neurun::backend::operand::ITensor &tensor) { + for (auto i = 0; i < shape.dim(0); ++i) + { + for (auto j = 0; j < shape.dim(1); ++j) + { + const int32_t value = base[i * 2 + j]; + int32_t *into = reinterpret_cast<int32_t *>( + tensor.buffer() + tensor.calcOffset({shape.dim(0) - i - 1, j})); + *into = value; + } + } + }); + }; + } +} + +void ConstantInitializer::visit(const ir::operation::TransposeConv &node) +{ + const auto &kernel_index = node.getInputs().at(ir::operation::TransposeConv::KERNEL); + const auto &kernel_obj = _operands.at(kernel_index); + registerPermuteInitializer(kernel_index, kernel_obj); +} + +} // namespace acl_cl +} // namespace backend +} // namespace neurun diff --git a/runtime/neurun/backend/acl_cl/ConstantInitializer.h b/runtime/neurun/backend/acl_cl/ConstantInitializer.h new file mode 100644 index 000000000..5965d2046 --- /dev/null +++ b/runtime/neurun/backend/acl_cl/ConstantInitializer.h @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __NEURUN_BACKEND_ACL_CL_CONSTANT_INITIALIZER_H__ +#define __NEURUN_BACKEND_ACL_CL_CONSTANT_INITIALIZER_H__ + +#include <backend/IConstantInitializer.h> +#include <ir/Operands.h> +#include "TensorBuilder.h" + +namespace neurun +{ +namespace backend +{ +namespace acl_cl +{ + +class ConstantInitializer : public IConstantInitializer +{ +public: + ConstantInitializer(const ir::Operands &operands, + const std::shared_ptr<TensorBuilder> &tensor_builder); + +public: + void visit(const ir::operation::BatchToSpaceND &) override; + void visit(const ir::operation::Conv2D &) override; + void visit(const ir::operation::DepthwiseConv2D &) override; + void visit(const ir::operation::EmbeddingLookup &) override; + void visit(const ir::operation::FullyConnected &) override; + void visit(const ir::operation::Gather &) override; + void visit(const ir::operation::HashtableLookup &) override; + void visit(const ir::operation::LSTM &) override; + void visit(const ir::operation::RNN &) override; + void visit(const ir::operation::SpaceToBatchND &) override; + void visit(const ir::operation::TransposeConv &) override; + +private: + const ir::Operands &operands() const override { return _operands; } + std::shared_ptr<ITensorBuilder> tensor_builder() const override { return _tensor_builder; } + +private: + const ir::Operands &_operands; + std::shared_ptr<TensorBuilder> _tensor_builder; +}; + +} // namespace acl_cl +} // namespace backend +} // namespace neurun + +#endif // __NEURUN_BACKEND_ACL_CL_CONSTANT_INITIALIZER_H__ diff --git a/runtime/neurun/backend/acl_cl/KernelGenerator.cc b/runtime/neurun/backend/acl_cl/KernelGenerator.cc new file mode 100644 index 000000000..bffb60b61 --- /dev/null +++ b/runtime/neurun/backend/acl_cl/KernelGenerator.cc @@ -0,0 +1,2151 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ + +#include "KernelGenerator.h" + +#include <arm_compute/runtime/CL/CLFunctions.h> // Include all ARM Compute CL functions +#include <arm_compute/runtime/CL/CLFunctionsEx.h> // Include all ARM Compute EX CL functions + +#include <AclFunction.h> +#include <Convert.h> +#include <Swizzle.h> + +#include "ir/Index.h" +#include "ir/DataType.h" +#include "ir/InternalType.h" +#include "compiler/IExecutionBuilder.h" +#include "exec/NopFunction.h" +#include "util/logging.h" +#include "util/Utils.h" +#include "util/Padding.h" + +using ::neurun::compiler::IExecutionBuilder; + +namespace neurun +{ +namespace backend +{ +namespace acl_cl +{ + +using ::neurun::backend::acl_common::asAclFunction; + +// +// ActivationBuilder +// +class ActivationBuilder +{ +public: + explicit ActivationBuilder(IExecutionBuilder &builder) : _builder(builder) + { + // DO NOTHING + } + +private: + void appendReLU(::arm_compute::ICLTensor *ifm_alloc); + void appendReLU1(::arm_compute::ICLTensor *ifm_alloc); + void appendReLU6(::arm_compute::ICLTensor *ifm_alloc); + +public: + void append(ir::Activation code, ::arm_compute::ICLTensor *ifm_alloc); + +private: + IExecutionBuilder &_builder; +}; + +void ActivationBuilder::appendReLU(::arm_compute::ICLTensor *ifm_alloc) +{ + const ::arm_compute::ActivationLayerInfo act_info{ + ::arm_compute::ActivationLayerInfo::ActivationFunction::RELU}; + + auto fn = nnfw::cpp14::make_unique<::arm_compute::CLActivationLayer>(); + + fn->configure(ifm_alloc, nullptr, act_info); + + auto acl_fn = asAclFunction(std::move(fn)); + + _builder.append(std::move(acl_fn)); +} + +void ActivationBuilder::appendReLU1(::arm_compute::ICLTensor *ifm_alloc) +{ + const ::arm_compute::ActivationLayerInfo act_info{ + ::arm_compute::ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 1.0f, -1.0f}; + + auto fn = nnfw::cpp14::make_unique<::arm_compute::CLActivationLayer>(); + + fn->configure(ifm_alloc, nullptr, act_info); + + auto acl_fn = asAclFunction(std::move(fn)); + + _builder.append(std::move(acl_fn)); +} + +void ActivationBuilder::appendReLU6(::arm_compute::ICLTensor *ifm_alloc) +{ + const ::arm_compute::ActivationLayerInfo act_info{ + ::arm_compute::ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 6.0f, 0.0f}; + + auto fn = nnfw::cpp14::make_unique<::arm_compute::CLActivationLayer>(); + + fn->configure(ifm_alloc, nullptr, act_info); + + auto acl_fn = asAclFunction(std::move(fn)); + + _builder.append(std::move(acl_fn)); +} + +void ActivationBuilder::append(ir::Activation code, ::arm_compute::ICLTensor *ifm_alloc) +{ + switch (code) + { + case ir::Activation::NONE: + { + // DO NOTHING + break; + } + case ir::Activation::RELU: + { + appendReLU(ifm_alloc); + break; + } + case ir::Activation::RELU1: + { + appendReLU1(ifm_alloc); + break; + } + case ir::Activation::RELU6: + { + appendReLU6(ifm_alloc); + break; + } + default: + { + throw std::runtime_error("Not supported, yet"); + } + } +} + +// +// KernelGenerator +// +KernelGenerator::KernelGenerator(const ir::Operands &ctx, + const std::shared_ptr<TensorBuilder> &tensor_builder) + : _ctx(ctx), _tensor_builder(tensor_builder), _current_subg_layout(ir::Layout::UNKNOWN) +{ + // DO NOTHING +} + +void KernelGenerator::visit(const ir::OpSequence &op_seq) +{ + _current_subg_layout = op_seq.getLayout(); + for (const auto &e : op_seq.operations()) + { + const auto &node = *(e.node); + _tensor_builder->preVisit(node); + node.accept(*this); + _tensor_builder->postVisit(node); + } +} + +void KernelGenerator::visit(const ir::operation::BatchToSpaceND &node) 
+{ + const auto ofm_index{node.getOutputs().at(0)}; + const auto ifm_index{node.getInputs().at(ir::operation::BatchToSpaceND::Input::INPUT)}; + const auto block_size_index{ + node.getInputs().at(ir::operation::BatchToSpaceND::Input::BLOCK_SIZE)}; + + auto ofm_alloc = _tensor_builder->at(ofm_index).get(); + auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + auto block_size_alloc = _tensor_builder->at(block_size_index).get(); + + assert(_ctx.at(block_size_index).isConstant()); + + auto fn = nnfw::cpp14::make_unique<::arm_compute::CLBatchToSpaceLayer>(); + + fn->configure(ifm_alloc->handle(), block_size_alloc->handle(), ofm_alloc->handle()); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const ir::operation::Cast &node) +{ + const auto ofm_index{node.getOutputs().at(0)}; + const auto ifm_index{node.getInputs().at(ir::operation::Cast::Input::INPUT)}; + + auto ofm_alloc = _tensor_builder->at(ofm_index).get(); + auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + const auto input_sub_type = _ctx.at(ifm_index).typeInfo().type() == ir::DataType::BOOL8 + ? arm_compute::SubDataType::BOOL + : arm_compute::SubDataType::NONE; + + auto fn = nnfw::cpp14::make_unique<::arm_compute::CLCast>(); + + fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), input_sub_type); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const ir::operation::Conv2D &node) +{ + using ir::operation::Conv2D; + + const auto ofm_index{node.getOutputs().at(0)}; + const auto ifm_index{node.getInputs().at(Conv2D::Input::INPUT)}; + const auto ker_index{node.getInputs().at(Conv2D::Input::KERNEL)}; + const auto bias_index{node.getInputs().at(Conv2D::Input::BIAS)}; + + const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature(_current_subg_layout); + const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature(_current_subg_layout); + // Kernel format is [depth_out, kernel_height, kernel_width, depth_in]. 
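+ // Hence dim(1) and dim(2) below select kernel_height and kernel_width.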
+ const auto &ker_shape = _ctx.at(ker_index).shape(); + const auto ker_height = ker_shape.dim(1); + const auto ker_width = ker_shape.dim(2); + + const auto stride = node.param().stride; + const auto padding = neurun::util::calculatePadding(node.param().padding, ifm_shape, ofm_shape, + stride, ker_width, ker_height); + const auto activation = node.param().activation; + + auto ofm_alloc = _tensor_builder->at(ofm_index).get(); + auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + auto ker_alloc = _tensor_builder->at(ker_index).get(); + auto bias_alloc = _tensor_builder->at(bias_index).get(); + + const auto conv_info = acl_common::asPadStrideInfo(padding, stride); + const auto act_info = acl_common::asActivationLayerInfo(activation); + + auto fn = nnfw::cpp14::make_unique<::arm_compute::CLConvolutionLayer>( + _tensor_builder->acl_tensor_manager()->internal_buffer_manager()); + + fn->configure(ifm_alloc->handle(), ker_alloc->handle(), bias_alloc->handle(), ofm_alloc->handle(), + conv_info, ::arm_compute::WeightsInfo(), ::arm_compute::Size2D(1U, 1U), act_info); + + _execution_builder->append(asAclFunction(std::move(fn))); +} + +void KernelGenerator::visit(const ir::operation::DepthwiseConv2D &node) +{ + using ir::operation::DepthwiseConv2D; + + const auto ofm_index{node.getOutputs().at(0)}; + const auto ifm_index{node.getInputs().at(DepthwiseConv2D::Input::INPUT)}; + const auto ker_index{node.getInputs().at(DepthwiseConv2D::Input::KERNEL)}; + const auto bias_index{node.getInputs().at(DepthwiseConv2D::Input::BIAS)}; + + const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature(_current_subg_layout); + const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature(_current_subg_layout); + // Kernel format is [1, kernel_height, kernel_width, depth_out]. 
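+ // As in Conv2D, dim(1) and dim(2) give the spatial kernel size; dim(3) is depth_out (depth_in * multiplier).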
+ const auto &ker_shape = _ctx.at(ker_index).shape(); + const auto ker_height = ker_shape.dim(1); + const auto ker_width = ker_shape.dim(2); + + const auto stride = node.param().stride; + const auto padding = neurun::util::calculatePadding(node.param().padding, ifm_shape, ofm_shape, + stride, ker_width, ker_height); + const auto multiplier = node.param().multiplier; + const auto activation = node.param().activation; + + auto ofm_alloc = _tensor_builder->at(ofm_index).get(); + auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + auto ker_alloc = _tensor_builder->at(ker_index).get(); + auto bias_alloc = _tensor_builder->at(bias_index).get(); + + const auto conv_info = acl_common::asPadStrideInfo(padding, stride); + const auto act_info = acl_common::asActivationLayerInfo(activation); + + if (ker_height == 3 && ker_width == 3) + { + auto fn = nnfw::cpp14::make_unique<::arm_compute::CLDepthwiseConvolutionLayer3x3>( + _tensor_builder->acl_tensor_manager()->internal_buffer_manager()); + + fn->configure(ifm_alloc->handle(), ker_alloc->handle(), bias_alloc->handle(), + ofm_alloc->handle(), conv_info, multiplier, act_info); + + _execution_builder->append(asAclFunction(std::move(fn))); + } + else + { + auto fn = nnfw::cpp14::make_unique<::arm_compute::CLDepthwiseConvolutionLayer>(); + + fn->configure(ifm_alloc->handle(), ker_alloc->handle(), bias_alloc->handle(), + ofm_alloc->handle(), conv_info, multiplier, act_info); + + _execution_builder->append(asAclFunction(std::move(fn))); + } +} + +void KernelGenerator::visit(const ir::operation::MaxPool2D &node) +{ + const auto ofm_index{node.getOutputs().at(0)}; + const auto ifm_index{node.getInputs().at(ir::operation::MaxPool2D::Input::INPUT)}; + + const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature(_current_subg_layout); + const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature(_current_subg_layout); + + const auto kh = node.param().kh; + const auto kw = node.param().kw; + const auto stride = node.param().stride; + const auto padding = + neurun::util::calculatePadding(node.param().padding, ifm_shape, ofm_shape, stride, kw, kh); + const auto activation = node.param().activation; + + VERBOSE(MaxPool2D) << "IFM_H: " << ifm_shape.H << std::endl; + VERBOSE(MaxPool2D) << "IFM_W: " << ifm_shape.W << std::endl; + VERBOSE(MaxPool2D) << "OFM_H: " << ofm_shape.H << std::endl; + VERBOSE(MaxPool2D) << "OFM_W: " << ofm_shape.W << std::endl; + VERBOSE(MaxPool2D) << "KER_H: " << kh << std::endl; + VERBOSE(MaxPool2D) << "KER_W: " << kw << std::endl; + VERBOSE(MaxPool2D) << "STRIDE_H: " << stride.vertical << std::endl; + VERBOSE(MaxPool2D) << "STRIDE_W: " << stride.horizontal << std::endl; + VERBOSE(MaxPool2D) << "PAD(T): " << padding.top << std::endl; + VERBOSE(MaxPool2D) << "PAD(B): " << padding.bottom << std::endl; + VERBOSE(MaxPool2D) << "PAD(L): " << padding.left << std::endl; + VERBOSE(MaxPool2D) << "PAD(R): " << padding.right << std::endl; + + auto ofm_alloc = _tensor_builder->at(ofm_index).get(); + auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + + ::arm_compute::PoolingLayerInfo info{::arm_compute::PoolingType::MAX, + ::arm_compute::Size2D{kw, kh}, + acl_common::asPadStrideInfo(padding, stride)}; + + auto fn = nnfw::cpp14::make_unique<::arm_compute::CLPoolingLayer>(); + + fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), info); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append((std::move(acl_fn))); + + ActivationBuilder{*_execution_builder}.append(activation, ofm_alloc->handle()); +} + +void 
KernelGenerator::visit(const ir::operation::AvgPool2D &node) +{ + const auto ofm_index{node.getOutputs().at(0)}; + const auto ifm_index{node.getInputs().at(ir::operation::AvgPool2D::Input::INPUT)}; + + const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature(_current_subg_layout); + const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature(_current_subg_layout); + + const auto kh = node.param().kh; + const auto kw = node.param().kw; + const auto stride = node.param().stride; + const auto padding = + neurun::util::calculatePadding(node.param().padding, ifm_shape, ofm_shape, stride, kw, kh); + const auto activation = node.param().activation; + + VERBOSE(AvgPool2D) << "IFM_H: " << ifm_shape.H << std::endl; + VERBOSE(AvgPool2D) << "IFM_W: " << ifm_shape.W << std::endl; + VERBOSE(AvgPool2D) << "OFM_H: " << ofm_shape.H << std::endl; + VERBOSE(AvgPool2D) << "OFM_W: " << ofm_shape.W << std::endl; + VERBOSE(AvgPool2D) << "KER_H: " << kh << std::endl; + VERBOSE(AvgPool2D) << "KER_W: " << kw << std::endl; + VERBOSE(AvgPool2D) << "STRIDE_H: " << stride.vertical << std::endl; + VERBOSE(AvgPool2D) << "STRIDE_W: " << stride.horizontal << std::endl; + VERBOSE(AvgPool2D) << "PAD(T): " << padding.top << std::endl; + VERBOSE(AvgPool2D) << "PAD(B): " << padding.bottom << std::endl; + VERBOSE(AvgPool2D) << "PAD(L): " << padding.left << std::endl; + VERBOSE(AvgPool2D) << "PAD(R): " << padding.right << std::endl; + + auto ofm_alloc = _tensor_builder->at(ofm_index).get(); + auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + + ::arm_compute::PoolingLayerInfo info{ + ::arm_compute::PoolingType::AVG, ::arm_compute::Size2D{kw, kh}, + acl_common::asPadStrideInfo(padding, stride), true /* exclude_padding */}; + + auto fn = nnfw::cpp14::make_unique<::arm_compute::CLPoolingLayer>(); + + fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), info); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append((std::move(acl_fn))); + + ActivationBuilder{*_execution_builder}.append(activation, ofm_alloc->handle()); +} + +void KernelGenerator::visit(const ir::operation::Concat &node) +{ + const auto ofm_index{node.getOutputs().at(0)}; + + std::vector<ir::OperandIndex> input_indexes; + + for (const auto &input : node.getInputs()) + input_indexes.emplace_back(input); + + const auto axis = node.param().axis; + + // The concat can be eliminated if the tensor allocator allocated every input as a subtensor of the output + bool canEliminate = true; + for (auto &ifm_ind : input_indexes) + { + if (!_tensor_builder->isSubTensorOf(ofm_index, ifm_ind)) + { + canEliminate = false; + break; + } + } + if (canEliminate) + { + // If the concat is eliminated, append a NOP IFunction + _execution_builder->append(nnfw::cpp14::make_unique<exec::NopFunction>()); + return; + } + + auto output_alloc = _tensor_builder->at(ofm_index).get(); + std::vector<::arm_compute::ICLTensor *> input_tensors; + for (auto &ifm_ind : input_indexes) + input_tensors.emplace_back(_tensor_builder->at(ifm_ind)->handle()); + + std::unique_ptr<::arm_compute::IFunction> fn; + if (input_indexes.size() < 2) + { + auto l = nnfw::cpp14::make_unique<::arm_compute::CLCopy>(); + l->configure(input_tensors.at(0), output_alloc->handle()); + fn = std::move(l); + } + else + { + auto l = nnfw::cpp14::make_unique<::arm_compute::CLConcatenateLayer>(); + const auto rank = node.param().rank; + const auto frontend_layout = _current_subg_layout; + const auto backend_layout = output_alloc->layout(); + const auto fixed_axis = + acl_common::ToARMComputeAxis(rank, axis, frontend_layout, backend_layout).value(); +
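// The concat axis was remapped above from the frontend layout to the backend layout (ACL's coordinate order is reversed), so CLConcatenateLayer concatenates along the intended dimension. +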
l->configure(input_tensors, output_alloc->handle(), fixed_axis); + fn = std::move(l); + } + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const ir::operation::FullyConnected &node) +{ + using ir::operation::FullyConnected; + + const auto output_index{node.getOutputs().at(0)}; + const auto input_index{node.getInputs().at(FullyConnected::Input::INPUT)}; + const auto weight_index{node.getInputs().at(FullyConnected::Input::WEIGHT)}; + const auto bias_index{node.getInputs().at(FullyConnected::Input::BIAS)}; + + const auto input_rank = _ctx.at(input_index).shape().rank(); + // TODO Currently we are not handling the case where the input's rank is 3. + // The handling should be added in the future. + assert(input_rank != 3); + + const auto output_size = + _ctx.at(output_index).shape().dim(_ctx.at(output_index).shape().rank() - 1); + UNUSED_RELEASE(output_size); + assert(_ctx.at(bias_index).shape().dim(0) == output_size); + assert(_ctx.at(weight_index).shape().dim(0) == output_size); + const auto batch_size = + _ctx.at(output_index).shape().dim(_ctx.at(output_index).shape().rank() - 2); + const auto input_size = + _ctx.at(weight_index).shape().dim(_ctx.at(weight_index).shape().rank() - 1); + + // Check whether the input's shape needs reshaping into rank 2 + bool needs_reshape = false; + ir::Shape reshape(2); + if (input_rank == 4) + { + const auto feature_size = _ctx.at(input_index).shape().num_elements(); + + UNUSED_RELEASE(feature_size); + assert(batch_size >= 0 && input_size >= 0); + assert(feature_size == static_cast<uint64_t>(batch_size) * static_cast<uint64_t>(input_size)); + + // for reshaping + needs_reshape = true; + reshape.dim(0) = batch_size; /* H */ + reshape.dim(1) = input_size; /* W */ + } + + const auto activation = node.param().activation; + + auto output_alloc = _tensor_builder->at(output_index).get(); + const auto input_alloc = _tensor_builder->at(input_index).get(); + const auto weight_alloc = _tensor_builder->at(weight_index).get(); + const auto bias_alloc = _tensor_builder->at(bias_index).get(); + const auto frontend_layout = _current_subg_layout; + const auto acl_layout = output_alloc->handle()->info()->data_layout(); + + auto fn = nnfw::cpp14::make_unique<arm_compute::CLFullyConnectedReshapingLayer>( + _tensor_builder->acl_tensor_manager()->internal_buffer_manager()); + + fn->configure( + input_alloc->handle(), weight_alloc->handle(), bias_alloc->handle(), output_alloc->handle(), + needs_reshape, + ::neurun::backend::acl_common::asTensorShape( + reshape, frontend_layout, ::neurun::backend::acl_common::asRuntimeLayout(acl_layout))); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); + + ActivationBuilder{*_execution_builder}.append(activation, output_alloc->handle()); +} + +void KernelGenerator::visit(const ir::operation::Mul &node) +{ + const auto ofm_index{node.getOutputs().at(0)}; + const auto lhs_index{node.getInputs().at(ir::operation::Mul::Input::LHS)}; + const auto rhs_index{node.getInputs().at(ir::operation::Mul::Input::RHS)}; + + const auto activation = node.param().activation; + + auto ofm_alloc = _tensor_builder->at(ofm_index).get(); + auto lhs_alloc = _tensor_builder->at(lhs_index).get(); + auto rhs_alloc = _tensor_builder->at(rhs_index).get(); + + auto fn = nnfw::cpp14::make_unique<::arm_compute::CLPixelWiseMultiplication>(); + + fn->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle(), 1.0, // scale +
arm_compute::ConvertPolicy::SATURATE, arm_compute::RoundingPolicy::TO_NEAREST_EVEN); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); + + ActivationBuilder{*_execution_builder}.append(activation, ofm_alloc->handle()); +} + +void KernelGenerator::visit(const ir::operation::ReduceSum &node) +{ + const auto output_index{node.getOutputs().at(0)}; + const auto input_index{node.getInputs().at(ir::operation::ReduceSum::Input::INPUT)}; + const auto &axes{node.param().axes}; + const auto keep_dims{node.param().keep_dims}; + + auto output_alloc = _tensor_builder->at(output_index).get(); + auto input_alloc = _tensor_builder->at(input_index).get(); + const auto frontend_layout = _current_subg_layout; + const auto backend_layout = input_alloc->layout(); + + // Convert to ACL axes taking into account negative values and possible duplicates. + std::set<std::uint32_t> acl_axes; + const int input_rank = node.param().rank; + for (int axis : axes) + { + if (axis < 0) + axis += input_rank; + acl_axes.insert( + acl_common::ToARMComputeAxis(input_rank, axis, frontend_layout, backend_layout).value()); + } + + auto fn = nnfw::cpp14::make_unique<::arm_compute::CLReduceOperation>( + _tensor_builder->acl_tensor_manager()->internal_buffer_manager()); + + fn->configure(input_alloc->handle(), output_alloc->handle(), acl_axes, keep_dims, + ::arm_compute::ReduceOperation::SUM); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const ir::operation::Reshape &node) +{ + const auto output_index{node.getOutputs().at(0)}; + const auto input_index{node.getInputs().at(ir::operation::Reshape::Input::INPUT)}; + + auto output_alloc = _tensor_builder->at(output_index).get(); + auto input_alloc = _tensor_builder->at(input_index).get(); + + // NOTE This operation must not change the layout from frontend to backend, + // so PermutationOperationPass makes the frontend and backend layouts the same. + const auto frontend_layout = _current_subg_layout; + const auto backend_layout = output_alloc->layout(); + assert((_ctx.at(input_index).shape().rank() < 4 && _ctx.at(output_index).shape().rank() < 4) || + frontend_layout == backend_layout); + UNUSED_RELEASE(frontend_layout); + UNUSED_RELEASE(backend_layout); + + auto fn = nnfw::cpp14::make_unique<::arm_compute::CLReshapeLayer>(); + + fn->configure(input_alloc->handle(), output_alloc->handle()); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const ir::operation::Squeeze &node) +{ + // Squeeze is identical to reshape except that it has an optional dimensions input.
+ // In addition, the optional dims input is ignored since the output tensor already has the + // squeezed shape produced by freezer and toco + // TODO Support multi-layout for frontend and backend + const auto output_index{node.getOutputs().at(0)}; + const auto input_index{node.getInputs().at(ir::operation::Squeeze::Input::INPUT)}; + const auto dims{node.param().dims}; + const auto ndim{node.param().ndim}; + (void)dims; + (void)ndim; + + auto output_alloc = _tensor_builder->at(output_index).get(); + auto input_alloc = _tensor_builder->at(input_index).get(); + auto fn = nnfw::cpp14::make_unique<arm_compute::CLReshapeLayer>(); + fn->configure(input_alloc->handle(), output_alloc->handle()); + auto acl_fn = asAclFunction(std::move(fn)); + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const ir::operation::Tanh &node) +{ + const auto output_index{node.getOutputs().at(0)}; + const auto input_index{node.getInputs().at(ir::operation::Tanh::Input::INPUT)}; + + auto output_alloc = _tensor_builder->at(output_index).get(); + auto input_alloc = _tensor_builder->at(input_index).get(); + + auto fn = nnfw::cpp14::make_unique<arm_compute::CLActivationLayer>(); + + const ::arm_compute::ActivationLayerInfo act_info{ + ::arm_compute::ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f}; + + fn->configure(input_alloc->handle(), output_alloc->handle(), act_info); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const ir::operation::Softmax &node) +{ + const auto output_index{node.getOutputs().at(0)}; + const auto input_index{node.getInputs().at(ir::operation::Softmax::Input::INPUT)}; + + const auto beta = node.param().beta; + + auto output_alloc = _tensor_builder->at(output_index).get(); + auto input_alloc = _tensor_builder->at(input_index).get(); + + auto fn = nnfw::cpp14::make_unique<::arm_compute::CLSoftmaxLayer>( + _tensor_builder->acl_tensor_manager()->internal_buffer_manager()); + + fn->configure(input_alloc->handle(), output_alloc->handle(), beta); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const ir::operation::Slice &node) +{ + const auto output_index{node.getOutputs().at(0)}; + const auto input_index{node.getInputs().at(ir::operation::Slice::Input::INPUT)}; + const auto begins_index{node.getInputs().at(ir::operation::Slice::Input::BEGINS)}; + const auto sizes_index{node.getInputs().at(ir::operation::Slice::Input::SIZES)}; + + auto outputData_alloc = _tensor_builder->at(output_index).get(); + auto inputData_alloc = _tensor_builder->at(input_index).get(); + const auto frontend_layout = _current_subg_layout; + const auto backend_layout = inputData_alloc->layout(); + + // Fill starts/ends from the begins/sizes data, reordering the indices to match the layout of inputData + int input_rank = node.param().rank; + std::vector<int32_t> starts; + std::vector<int32_t> ends; + starts.resize(input_rank, 0); + ends.resize(input_rank, 0); + { + auto beginData_base = _ctx.at(begins_index).data().base(); + auto sizeData_base = _ctx.at(sizes_index).data().base(); + const int beginData_size = _ctx.at(begins_index).shape().num_elements(); + const int sizeData_size = _ctx.at(sizes_index).shape().num_elements(); + + using ir::DataType; + + UNUSED_RELEASE(beginData_size); + UNUSED_RELEASE(sizeData_size); + + assert(_ctx.at(begins_index).typeInfo().type() == DataType::INT32); + assert(_ctx.at(sizes_index).typeInfo().type() == DataType::INT32); + assert(beginData_size ==
input_rank); + assert(sizeData_size == input_rank); + + assert(beginData_base != nullptr); + for (int n = 0; n < input_rank; ++n) + { + auto axis = ::neurun::backend::acl_common::ToARMComputeAxis(input_rank, n, frontend_layout, + backend_layout) + .value(); + + int32_t begin_value = *(reinterpret_cast<const int32_t *>(beginData_base) + n); + starts[axis] = begin_value; + + int32_t size_value = *(reinterpret_cast<const int32_t *>(sizeData_base) + n); + ends[axis] = begin_value + size_value; + } + } + + ::arm_compute::Coordinates starts_set; + ::arm_compute::Coordinates ends_set; + + for (size_t i = 0; i < starts.size(); ++i) + { + starts_set.set(i, starts[i]); + ends_set.set(i, ends[i]); + } + + auto fn = nnfw::cpp14::make_unique<::arm_compute::CLSlice>(); + + fn->configure(inputData_alloc->handle(), outputData_alloc->handle(), starts_set, ends_set); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const ir::operation::StridedSlice &node) +{ + const auto output_index{node.getOutputs().at(0)}; + const auto input_index{node.getInputs().at(ir::operation::StridedSlice::Input::INPUT)}; + const auto starts_index{node.getInputs().at(ir::operation::StridedSlice::Input::STARTS)}; + const auto ends_index{node.getInputs().at(ir::operation::StridedSlice::Input::ENDS)}; + const auto strides_index{node.getInputs().at(ir::operation::StridedSlice::Input::STRIDES)}; + + auto outputData_alloc = _tensor_builder->at(output_index).get(); + auto inputData_alloc = _tensor_builder->at(input_index).get(); + const auto frontend_layout = _current_subg_layout; + const auto backend_layout = inputData_alloc->layout(); + + // Fill starts/ends/strides from the input data, reordering the indices to match the layout of inputData + int input_rank = node.param().rank; + std::vector<int32_t> starts; + std::vector<int32_t> ends; + std::vector<int32_t> strides; + starts.resize(input_rank, 0); + ends.resize(input_rank, 0); + strides.resize(input_rank, 0); + { + auto startData_base = _ctx.at(starts_index).data().base(); + auto endData_base = _ctx.at(ends_index).data().base(); + auto stridesData_base = _ctx.at(strides_index).data().base(); + const int startData_size = _ctx.at(starts_index).shape().num_elements(); + const int endData_size = _ctx.at(ends_index).shape().num_elements(); + const int stridesData_size = _ctx.at(strides_index).shape().num_elements(); + + using ir::DataType; + + UNUSED_RELEASE(startData_size); + UNUSED_RELEASE(endData_size); + UNUSED_RELEASE(stridesData_size); + + assert(_ctx.at(starts_index).typeInfo().type() == DataType::INT32); + assert(_ctx.at(ends_index).typeInfo().type() == DataType::INT32); + assert(_ctx.at(strides_index).typeInfo().type() == DataType::INT32); + assert(startData_size == input_rank); + assert(endData_size == input_rank); + assert(stridesData_size == input_rank); + + assert(startData_base != nullptr); + for (int n = 0; n < input_rank; ++n) + { + auto axis = ::neurun::backend::acl_common::ToARMComputeAxis(input_rank, n, frontend_layout, + backend_layout) + .value(); + + int32_t start_value = *(reinterpret_cast<const int32_t *>(startData_base) + n); + starts[axis] = start_value; + + int32_t end_value = *(reinterpret_cast<const int32_t *>(endData_base) + n); + ends[axis] = end_value; + + int32_t strides_value = *(reinterpret_cast<const int32_t *>(stridesData_base) + n); + strides[axis] = strides_value; + } + } + + // Reorder the mask bits to match the layout of inputData + const auto begin_mask = acl_common::ReorderBits<int32_t>(node.param().begin_mask,
input_rank, + frontend_layout, backend_layout); + const auto end_mask = acl_common::ReorderBits<int32_t>(node.param().end_mask, input_rank, + frontend_layout, backend_layout); + const auto shrink_axis_mask = acl_common::ReorderBits<int32_t>( + node.param().shrink_axis_mask, input_rank, frontend_layout, backend_layout); + + ::arm_compute::Coordinates starts_set; + ::arm_compute::Coordinates ends_set; + ::arm_compute::BiStrides strides_set; + + for (size_t i = 0; i < starts.size(); ++i) + { + starts_set.set(i, starts[i]); + ends_set.set(i, ends[i]); + strides_set.set(i, strides[i]); + } + + auto fn = nnfw::cpp14::make_unique<::arm_compute::CLStridedSlice>(); + + fn->configure(inputData_alloc->handle(), outputData_alloc->handle(), starts_set, ends_set, + strides_set, begin_mask, end_mask, shrink_axis_mask); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const ir::operation::Transpose &node) +{ + const auto ofm_idx{node.getOutputs().at(0)}; + const auto ifm_idx{node.getInputs().at(ir::operation::Transpose::Input::INPUT)}; + const auto &perm{node.param().perm}; + + const auto rank = node.param().rank; + + auto ofm_alloc = _tensor_builder->at(ofm_idx).get(); + auto ifm_alloc = _tensor_builder->at(ifm_idx).get(); + const auto frontend_layout = _current_subg_layout; + const auto backend_layout = ifm_alloc->layout(); + + std::vector<std::int32_t> pv(perm.cbegin(), perm.cend()); + // Reversed + auto backend_pv = ::neurun::backend::acl_common::getARMComputePermutationVector( + rank, pv, frontend_layout, backend_layout); + + auto fn = nnfw::cpp14::make_unique<::arm_compute::CLPermute>(); + + fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), backend_pv); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const ir::operation::Add &node) +{ + const auto ofm_index{node.getOutputs().at(0)}; + const auto lhs_index{node.getInputs().at(ir::operation::Add::Input::LHS)}; + const auto rhs_index{node.getInputs().at(ir::operation::Add::Input::RHS)}; + + const auto activation = node.param().activation; + + auto ofm_alloc = _tensor_builder->at(ofm_index).get(); + auto lhs_alloc = _tensor_builder->at(lhs_index).get(); + auto rhs_alloc = _tensor_builder->at(rhs_index).get(); + + auto fn = nnfw::cpp14::make_unique<::arm_compute::CLArithmeticAddition>(); + + fn->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle(), + arm_compute::ConvertPolicy::SATURATE); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); + + ActivationBuilder{*_execution_builder}.append(activation, ofm_alloc->handle()); +} + +void KernelGenerator::visit(const ir::operation::Sub &node) +{ + const auto ofm_index{node.getOutputs().at(0)}; + const auto lhs_index{node.getInputs().at(ir::operation::Sub::Input::LHS)}; + const auto rhs_index{node.getInputs().at(ir::operation::Sub::Input::RHS)}; + + const auto activation = node.param().activation; + + auto ofm_alloc = _tensor_builder->at(ofm_index).get(); + auto lhs_alloc = _tensor_builder->at(lhs_index).get(); + auto rhs_alloc = _tensor_builder->at(rhs_index).get(); + + auto fn = nnfw::cpp14::make_unique<::arm_compute::CLArithmeticSubtraction>(); + + fn->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle(), + arm_compute::ConvertPolicy::SATURATE); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); + + 
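// The fused activation from the IR is appended below as a separate CLActivationLayer via ActivationBuilder. +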
ActivationBuilder{*_execution_builder}.append(activation, ofm_alloc->handle()); +} + +void KernelGenerator::visit(const ir::operation::Div &node) +{ + const auto ofm_index{node.getOutputs().at(0)}; + const auto lhs_index{node.getInputs().at(ir::operation::Div::Input::LHS)}; + const auto rhs_index{node.getInputs().at(ir::operation::Div::Input::RHS)}; + + const auto activation = node.param().activation; + + auto ofm_alloc = _tensor_builder->at(ofm_index).get(); + auto lhs_alloc = _tensor_builder->at(lhs_index).get(); + auto rhs_alloc = _tensor_builder->at(rhs_index).get(); + + auto fn = nnfw::cpp14::make_unique<::arm_compute::CLArithmeticDivision>(); + + fn->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle()); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); + + ActivationBuilder{*_execution_builder}.append(activation, ofm_alloc->handle()); +} + +void KernelGenerator::visit(const ir::operation::Exp &node) +{ + const auto output_index{node.getOutputs().at(0)}; + const auto input_index{node.getInputs().at(ir::operation::Exp::Input::INPUT)}; + + auto output_alloc = _tensor_builder->at(output_index).get(); + auto input_alloc = _tensor_builder->at(input_index).get(); + + auto fn = nnfw::cpp14::make_unique<::arm_compute::CLExpLayer>(); + + fn->configure(input_alloc->handle(), output_alloc->handle()); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const ir::operation::InstanceNorm &node) +{ + const auto ofm_index{node.getOutputs().at(0)}; + const auto ifm_index{node.getInputs().at(ir::operation::InstanceNorm::Input::INPUT)}; + const auto gamma_index{node.getInputs().at(ir::operation::InstanceNorm::Input::GAMMA)}; + const auto beta_index{node.getInputs().at(ir::operation::InstanceNorm::Input::BETA)}; + + auto ofm_alloc = _tensor_builder->at(ofm_index).get(); + auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + auto gamma_alloc = _tensor_builder->at(gamma_index).get(); + auto beta_alloc = _tensor_builder->at(beta_index).get(); + auto epsilon = node.param().epsilon; + auto activation = node.param().activation; + + auto fn = nnfw::cpp14::make_unique<::arm_compute::CLInstanceNormalizationLayerEx>(); + + fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), gamma_alloc->handle(), + beta_alloc->handle(), epsilon); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); + + ActivationBuilder{*_execution_builder}.append(activation, ofm_alloc->handle()); +} + +void KernelGenerator::visit(const ir::operation::Logistic &node) +{ + const auto ofm_index{node.getOutputs().at(0)}; + const auto ifm_index{node.getInputs().at(ir::operation::Logistic::Input::INPUT)}; + + auto ofm_alloc = _tensor_builder->at(ofm_index).get(); + auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + + const ::arm_compute::ActivationLayerInfo act_info{ + ::arm_compute::ActivationLayerInfo::ActivationFunction::LOGISTIC}; + + auto fn = nnfw::cpp14::make_unique<::arm_compute::CLActivationLayer>(); + + fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), act_info); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const ir::operation::LogicalAnd &node) +{ + const auto output_index{node.getOutputs().at(0)}; + const auto input0_index{node.getInputs().at(ir::operation::LogicalAnd::Input::INPUT0)}; + const auto 
input1_index{node.getInputs().at(ir::operation::LogicalAnd::Input::INPUT1)}; + + auto output_alloc = _tensor_builder->at(output_index).get(); + auto input0_alloc = _tensor_builder->at(input0_index).get(); + auto input1_alloc = _tensor_builder->at(input1_index).get(); + + auto fn = nnfw::cpp14::make_unique<::arm_compute::CLBinaryLogicalOp>(); + + fn->configure(input0_alloc->handle(), input1_alloc->handle(), output_alloc->handle(), + ::arm_compute::BinaryLogicalOperation::AND); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const ir::operation::LSTM &node) +{ + // TODO Support dynamic rnn + // TODO Fix subtle error in the case of non-CIFG, non-peephole and No Projection. + const auto scratch_buffer_index{ + node.getOutputs().at(ir::operation::LSTM::Output::SCRATCH_BUFFER)}; + const auto output_state_out_index{ + node.getOutputs().at(ir::operation::LSTM::Output::OUTPUT_STATE_OUT)}; + const auto cell_state_out_index{ + node.getOutputs().at(ir::operation::LSTM::Output::CELL_STATE_OUT)}; + const auto output_index{node.getOutputs().at(ir::operation::LSTM::Output::OUTPUT)}; + + const auto input_index{node.getInputs().at(ir::operation::LSTM::Input::INPUT)}; + const auto input_to_input_weights_index{ + node.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_INPUT_WEIGHTS)}; // optional + const auto input_to_forget_weights_index{ + node.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_FORGET_WEIGHTS)}; + const auto input_to_cell_weights_index{ + node.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_CELL_WEIGHTS)}; + const auto input_to_output_weights_index{ + node.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_OUTPUT_WEIGHTS)}; + const auto recurrent_to_input_weights_index{ + node.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_INPUT_WEIGHTS)}; // optional + const auto recurrent_to_forget_weights_index{ + node.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_FORGET_WEIGHTS)}; + const auto recurrent_to_cell_weights_index{ + node.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_CELL_WEIGHTS)}; + const auto recurrent_to_output_weights_index{ + node.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_OUTPUT_WEIGHTS)}; + const auto cell_to_input_weights_index{ + node.getInputs().at(ir::operation::LSTM::Input::CELL_TO_INPUT_WEIGHTS)}; // optional + const auto cell_to_forget_weights_index{ + node.getInputs().at(ir::operation::LSTM::Input::CELL_TO_FORGET_WEIGHTS)}; // optional + const auto cell_to_output_weights_index{ + node.getInputs().at(ir::operation::LSTM::Input::CELL_TO_OUTPUT_WEIGHTS)}; // optional + const auto input_gate_bias_index{ + node.getInputs().at(ir::operation::LSTM::Input::INPUT_GATE_BIAS)}; + const auto forget_gate_bias_index{ + node.getInputs().at(ir::operation::LSTM::Input::FORGET_GATE_BIAS)}; + const auto cell_bias_index{node.getInputs().at(ir::operation::LSTM::Input::CELL_BIAS)}; + const auto output_gate_bias_index{ + node.getInputs().at(ir::operation::LSTM::Input::OUTPUT_GATE_BIAS)}; + const auto projection_weights_index{ + node.getInputs().at(ir::operation::LSTM::Input::PROJECTION_WEIGHTS)}; // optional + const auto projection_bias_index{ + node.getInputs().at(ir::operation::LSTM::Input::PROJECTION_BIAS)}; // optional + const auto output_state_in_index{ + node.getInputs().at(ir::operation::LSTM::Input::OUTPUT_STATE_IN)}; + const auto cell_state_in_index{node.getInputs().at(ir::operation::LSTM::Input::CELL_STATE_IN)}; + const auto cell_threshold = 
node.param().cell_threshold; + const auto projection_threshold = node.param().projection_threshold; + + bool has_input_to_input_weights = _ctx.at(input_to_input_weights_index).shape().dim(0) != 0 && + _ctx.at(input_to_input_weights_index).shape().dim(1) != 0; + bool has_recurrent_to_input_weights = + _ctx.at(recurrent_to_input_weights_index).shape().dim(0) != 0 && + _ctx.at(recurrent_to_input_weights_index).shape().dim(1) != 0; + bool has_cell_to_forget_weights = _ctx.at(cell_to_forget_weights_index).shape().dim(0) != 0; + bool has_cell_to_output_weights = _ctx.at(cell_to_output_weights_index).shape().dim(0) != 0; + bool has_projection_weights = _ctx.at(projection_weights_index).shape().dim(0) != 0 && + _ctx.at(projection_weights_index).shape().dim(1) != 0; + bool has_projection_bias = _ctx.at(projection_bias_index).shape().dim(0) != 0; + + // NOTE The input_to_input_weights and the recurrent_to_input_weights do not exist in CIFG. + // true: no CIFG + // false: CIFG + // NOTE The cell_to_input_weights does not exist in non-peephole LSTM even in the regular + // (non-CIFG) case. + bool has_cifg_param = has_input_to_input_weights && has_recurrent_to_input_weights; + + // NOTE The cell_to_forget_weights and the cell_to_output_weights exist in peephole. + // But the cell_to_input_weights does not exist in CIFG LSTM even when peephole is used. + // true: peephole + // false: no peephole + bool has_peephole_param = has_cell_to_forget_weights && has_cell_to_output_weights; + + // NOTE Even when the projection weights have data, the projection bias may not. + bool has_projection_param = has_projection_weights; + + const auto activation = node.param().activation; + const auto cell_clip = cell_threshold; + const auto projection_clip = projection_threshold; + assert(cell_clip >= 0.f && projection_clip >= 0.f); + + auto scratch_buffer_alloc = _tensor_builder->at(scratch_buffer_index).get(); + auto output_state_out_alloc = _tensor_builder->at(output_state_out_index).get(); + auto cell_state_out_alloc = _tensor_builder->at(cell_state_out_index).get(); + auto output_alloc = _tensor_builder->at(output_index).get(); + + auto input_alloc = _tensor_builder->at(input_index).get(); + + auto input_to_forget_weights_alloc = _tensor_builder->at(input_to_forget_weights_index).get(); + auto input_to_cell_weights_alloc = _tensor_builder->at(input_to_cell_weights_index).get(); + auto input_to_output_weights_alloc = _tensor_builder->at(input_to_output_weights_index).get(); + auto recurrent_to_forget_weights_alloc = + _tensor_builder->at(recurrent_to_forget_weights_index).get(); + auto recurrent_to_cell_weights_alloc = _tensor_builder->at(recurrent_to_cell_weights_index).get(); + auto recurrent_to_output_weights_alloc = + _tensor_builder->at(recurrent_to_output_weights_index).get(); + + auto forget_gate_bias_alloc = _tensor_builder->at(forget_gate_bias_index).get(); + auto cell_bias_alloc = _tensor_builder->at(cell_bias_index).get(); + auto output_gate_bias_alloc = _tensor_builder->at(output_gate_bias_index).get(); + auto output_state_in_alloc = _tensor_builder->at(output_state_in_index).get(); + auto cell_state_in_alloc = _tensor_builder->at(cell_state_in_index).get(); + + auto act_info = ::neurun::backend::acl_common::asActivationLayerInfo(activation); + + auto fn = nnfw::cpp14::make_unique<::arm_compute::CLLSTMLayer>(); + + ::arm_compute::LSTMParams<::arm_compute::ICLTensor> lstm_params{}; + if (has_cifg_param) + { + auto input_to_input_weights_alloc = + _tensor_builder->at(input_to_input_weights_index).get(); // optional + auto
recurrent_to_input_weights_alloc = + _tensor_builder->at(recurrent_to_input_weights_index).get(); // optional + auto cell_to_input_weights_handle = + has_peephole_param ? _tensor_builder->at(cell_to_input_weights_index).get()->handle() + : nullptr; // optional (non-cifg && peephole) + auto input_gate_bias_alloc = _tensor_builder->at(input_gate_bias_index).get(); // optional + lstm_params.set_cifg_params(input_to_input_weights_alloc->handle(), + recurrent_to_input_weights_alloc->handle(), + cell_to_input_weights_handle, input_gate_bias_alloc->handle()); + } + if (has_peephole_param) + { + auto cell_to_forget_weights_alloc = + _tensor_builder->at(cell_to_forget_weights_index).get(); // optional + auto cell_to_output_weights_alloc = + _tensor_builder->at(cell_to_output_weights_index).get(); // optional + lstm_params.set_peephole_params(cell_to_forget_weights_alloc->handle(), + cell_to_output_weights_alloc->handle()); + } + if (has_projection_param) + { + auto projection_weights_alloc = _tensor_builder->at(projection_weights_index).get(); // optional + auto projection_bias_handle = has_projection_bias + ? _tensor_builder->at(projection_bias_index).get()->handle() + : nullptr; // optional + lstm_params.set_projection_params(projection_weights_alloc->handle(), projection_bias_handle); + } + + fn->configure( + input_alloc->handle(), input_to_forget_weights_alloc->handle(), + input_to_cell_weights_alloc->handle(), input_to_output_weights_alloc->handle(), + recurrent_to_forget_weights_alloc->handle(), recurrent_to_cell_weights_alloc->handle(), + recurrent_to_output_weights_alloc->handle(), forget_gate_bias_alloc->handle(), + cell_bias_alloc->handle(), output_gate_bias_alloc->handle(), output_state_in_alloc->handle(), + cell_state_in_alloc->handle(), scratch_buffer_alloc->handle(), + output_state_out_alloc->handle(), cell_state_out_alloc->handle(), output_alloc->handle(), + lstm_params, act_info, cell_clip, projection_clip); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const ir::operation::ReduceMax &node) +{ + const auto output_index{node.getOutputs().at(0)}; + const auto input_index{node.getInputs().at(ir::operation::ReduceMax::Input::INPUT)}; + const auto &axes{node.param().axes}; + const auto keep_dims{node.param().keep_dims}; + + auto ofm_alloc = _tensor_builder->at(output_index).get(); + auto ifm_alloc = _tensor_builder->at(input_index).get(); + const auto frontend_layout = _current_subg_layout; + const auto backend_layout = ifm_alloc->layout(); + + // Convert to ACL axes taking into account negative values and possible duplicates. 
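+ // e.g. for a rank-4 NHWC frontend tensor on an NCHW backend, frontend axis 3 (C) maps to
+ // ACL axis 2, because ACL orders coordinates from the fastest-varying dimension upward.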
+ std::set<std::uint32_t> acl_axes; + const int ifm_rank = node.param().rank; + for (int axis : axes) + { + if (axis < 0) + axis += ifm_rank; + acl_axes.insert( + acl_common::ToARMComputeAxis(ifm_rank, axis, frontend_layout, backend_layout).value()); + } + + auto fn = nnfw::cpp14::make_unique<::arm_compute::CLReduceOperation>( + _tensor_builder->acl_tensor_manager()->internal_buffer_manager()); + + fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), acl_axes, keep_dims, + arm_compute::ReduceOperation::MAX); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const ir::operation::Comparison &node) +{ + const auto output_index{node.getOutputs().at(0)}; + const auto input0_index{node.getInputs().at(ir::operation::Comparison::Input::INPUT0)}; + const auto input1_index{node.getInputs().at(ir::operation::Comparison::Input::INPUT1)}; + + const auto comparison_type = node.param().comparison_type; + + auto output_alloc = _tensor_builder->at(output_index).get(); + auto input0_alloc = _tensor_builder->at(input0_index).get(); + auto input1_alloc = _tensor_builder->at(input1_index).get(); + + auto fn = nnfw::cpp14::make_unique<::arm_compute::CLComparison>(); + + fn->configure(input0_alloc->handle(), input1_alloc->handle(), output_alloc->handle(), + (arm_compute::ComparisonOperation)comparison_type); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const ir::operation::Pack &node) +{ + const auto output_index{node.getOutputs().at(0)}; + auto axis{node.param().axis}; + + const auto output_rank = node.param().rank; + + std::vector<ir::OperandIndex> input_indexes; + for (const auto &input_index : node.getInputs()) + input_indexes.emplace_back(input_index); + + auto output = _tensor_builder->at(output_index).get()->handle(); + std::vector<arm_compute::ICLTensor *> inputs; + for (const auto &input_index : input_indexes) + inputs.emplace_back(_tensor_builder->at(input_index)->handle()); + + const auto frontend_layout = _current_subg_layout; + const auto backend_layout = _tensor_builder->at(output_index).get()->layout(); + + if (axis < 0) + axis += output_rank; + axis = acl_common::ToARMComputeAxis(output_rank, axis, frontend_layout, backend_layout).value(); + + auto fn = nnfw::cpp14::make_unique<::arm_compute::CLStackLayer>(); + + fn->configure(inputs, axis, output); + + _execution_builder->append(asAclFunction(std::move(fn))); +} + +void KernelGenerator::visit(const ir::operation::Permute &node) +{ + const auto ofm_idx{node.getOutputs().at(0)}; + const auto ifm_idx{node.getInputs().at(0)}; + const auto permute_type = node.getPermuteType(); + auto ofm_alloc = _tensor_builder->at(ofm_idx).get(); + auto ifm_alloc = _tensor_builder->at(ifm_idx).get(); + const auto rank = _ctx.at(ofm_idx).shape().rank(); + assert(_ctx.at(ifm_idx).shape().rank() == _ctx.at(ofm_idx).shape().rank()); + + std::unique_ptr<::arm_compute::IFunction> fn; + arm_compute::PermutationVector pv; + if (permute_type == ir::operation::Permute::Type::NCHW_TO_NHWC && rank == 4) + { + // WHCN -> CWHN + pv = arm_compute::PermutationVector{2, 0, 1}; + + auto l = nnfw::cpp14::make_unique<::arm_compute::CLPermute>(); + + l->configure(ifm_alloc->handle(), ofm_alloc->handle(), pv); + + fn = std::move(l); + } + else if (permute_type == ir::operation::Permute::Type::NHWC_TO_NCHW && rank == 4) + { + // CWHN -> WHCN + pv = arm_compute::PermutationVector{1, 2, 0}; + + auto l = 
nnfw::cpp14::make_unique<::arm_compute::CLPermute>(); + + l->configure(ifm_alloc->handle(), ofm_alloc->handle(), pv); + + fn = std::move(l); + } + else + { + auto l = nnfw::cpp14::make_unique<::arm_compute::CLCopy>(); + + l->configure(ifm_alloc->handle(), ofm_alloc->handle()); + + fn = std::move(l); + } + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const ir::operation::RSQRT &node) +{ + const auto ofm_index{node.getOutputs().at(0)}; + const auto ifm_index{node.getInputs().at(ir::operation::RSQRT::Input::INPUT)}; + + auto ofm_alloc = _tensor_builder->at(ofm_index).get(); + auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + + auto fn = nnfw::cpp14::make_unique<::arm_compute::CLRsqrtLayer>(); + + fn->configure(ifm_alloc->handle(), ofm_alloc->handle()); + + _execution_builder->append(asAclFunction(std::move(fn))); +} + +void KernelGenerator::visit(const ir::operation::ReLU &node) +{ + const auto output_index{node.getOutputs().at(0)}; + const auto input_index{node.getInputs().at(ir::operation::ReLU::Input::INPUT)}; + + auto output_alloc = _tensor_builder->at(output_index).get(); + auto input_alloc = _tensor_builder->at(input_index).get(); + + auto fn = nnfw::cpp14::make_unique<arm_compute::CLActivationLayer>(); + + const ::arm_compute::ActivationLayerInfo act_info{ + ::arm_compute::ActivationLayerInfo::ActivationFunction::RELU}; + + fn->configure(input_alloc->handle(), output_alloc->handle(), act_info); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const ir::operation::ResizeBilinear &node) +{ + const auto ofm_index{node.getOutputs().at(0)}; + + const auto ifm_index{node.getInputs().at(ir::operation::ResizeBilinear::Input::INPUT)}; + + auto ofm_alloc = _tensor_builder->at(ofm_index).get(); + auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + + auto fn = nnfw::cpp14::make_unique<::arm_compute::CLScale>(); + + fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), + ::arm_compute::InterpolationPolicy::BILINEAR, ::arm_compute::BorderMode::REPLICATE, + ::arm_compute::PixelValue(0.f), ::arm_compute::SamplingPolicy::TOP_LEFT); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const ir::operation::ReLU1 &node) +{ + const auto ofm_index{node.getOutputs().at(0)}; + const auto ifm_index{node.getInputs().at(ir::operation::ReLU1::Input::INPUT)}; + + auto ofm_alloc = _tensor_builder->at(ofm_index).get(); + auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + + const ::arm_compute::ActivationLayerInfo act_info{ + ::arm_compute::ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 1.0f, -1.0f}; + + auto fn = nnfw::cpp14::make_unique<::arm_compute::CLActivationLayer>(); + + fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), act_info); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const ir::operation::ReLU6 &node) +{ + const auto ofm_index{node.getOutputs().at(0)}; + const auto ifm_index{node.getInputs().at(ir::operation::ReLU6::Input::INPUT)}; + + auto ofm_alloc = _tensor_builder->at(ofm_index).get(); + auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + + const ::arm_compute::ActivationLayerInfo act_info{ + ::arm_compute::ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 6.0f}; + + auto fn = 
nnfw::cpp14::make_unique<::arm_compute::CLActivationLayer>(); + + fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), act_info); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const ir::operation::RNN &node) +{ + const auto output_index{node.getOutputs().at(ir::operation::RNN::Output::OUTPUT)}; + const auto hidden_state_out_index{ + node.getOutputs().at(ir::operation::RNN::Output::HIDDEN_STATE_OUT)}; + + const auto input_index{node.getInputs().at(ir::operation::RNN::Input::INPUT)}; + const auto weights_index{node.getInputs().at(ir::operation::RNN::Input::WEIGHTS)}; + const auto recurrent_weights_index{ + node.getInputs().at(ir::operation::RNN::Input::RECURRENT_WEIGHTS)}; + const auto bias_index{node.getInputs().at(ir::operation::RNN::Input::BIAS)}; + const auto hidden_state_in_index{node.getInputs().at(ir::operation::RNN::Input::HIDDEN_STATE_IN)}; + + const auto activation = node.param().activation; + + auto output_alloc = _tensor_builder->at(output_index).get(); + auto hidden_state_out_alloc = _tensor_builder->at(hidden_state_out_index).get(); + + auto input_alloc = _tensor_builder->at(input_index).get(); + auto weights_alloc = _tensor_builder->at(weights_index).get(); + auto recurrent_weights_alloc = _tensor_builder->at(recurrent_weights_index).get(); + auto bias_alloc = _tensor_builder->at(bias_index).get(); + auto hidden_state_in_alloc = _tensor_builder->at(hidden_state_in_index).get(); + auto act_info = ::neurun::backend::acl_common::asActivationLayerInfo(activation); + + auto copy_layer = nnfw::cpp14::make_unique<::arm_compute::CLCopy>(); + copy_layer->configure(hidden_state_in_alloc->handle(), hidden_state_out_alloc->handle()); + _execution_builder->append(asAclFunction(std::move(copy_layer))); + + auto fn = nnfw::cpp14::make_unique<::arm_compute::CLRNNLayerEx>( + _tensor_builder->acl_tensor_manager()->internal_buffer_manager()); + fn->configure(input_alloc->handle(), weights_alloc->handle(), recurrent_weights_alloc->handle(), + bias_alloc->handle(), hidden_state_out_alloc->handle(), output_alloc->handle(), + act_info); + _execution_builder->append(asAclFunction(std::move(fn))); +} + +void KernelGenerator::visit(const ir::operation::Floor &node) +{ + const auto ofm_index{node.getOutputs().at(0)}; + const auto ifm_index{node.getInputs().at(ir::operation::Floor::Input::INPUT)}; + + auto ofm_alloc = _tensor_builder->at(ofm_index).get(); + auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + + auto fn = nnfw::cpp14::make_unique<::arm_compute::CLFloor>(); + + fn->configure(ifm_alloc->handle(), ofm_alloc->handle()); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const ir::operation::SpaceToBatchND &node) +{ + const auto ofm_index{node.getOutputs().at(0)}; + const auto ifm_index{node.getInputs().at(ir::operation::SpaceToBatchND::Input::INPUT)}; + const auto block_size_index{ + node.getInputs().at(ir::operation::SpaceToBatchND::Input::BLOCK_SIZE)}; + const auto paddings_index{node.getInputs().at(ir::operation::SpaceToBatchND::Input::PADDINGS)}; + + auto ofm_alloc = _tensor_builder->at(ofm_index).get(); + auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + auto block_size_alloc = _tensor_builder->at(block_size_index).get(); + auto paddings_alloc = _tensor_builder->at(paddings_index).get(); + + assert(_ctx.at(block_size_index).isConstant()); + assert(_ctx.at(paddings_index).isConstant()); + + 
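+ // Both the block size and the paddings must be compile-time constants here (asserted
+ // above); the kernel below is then chosen by the output data type.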
std::unique_ptr<::arm_compute::IFunction> fn; + if (_ctx.at(ofm_index).typeInfo().type() == ir::DataType::QUANT8_ASYMM) + { + // NOTE CLSpaceToBatchLayer has a bug: it pads with the value 0 even when the zero point + // of QASYMM8 is not 0, so the EX kernel CLSpaceToBatchND is used for quantized tensors. + auto l = nnfw::cpp14::make_unique<::arm_compute::CLSpaceToBatchND>(); + l->configure(ifm_alloc->handle(), block_size_alloc->handle(), paddings_alloc->handle(), + ofm_alloc->handle()); + fn = std::move(l); + } + else + { + auto l = nnfw::cpp14::make_unique<::arm_compute::CLSpaceToBatchLayer>(); + l->configure(ifm_alloc->handle(), block_size_alloc->handle(), paddings_alloc->handle(), + ofm_alloc->handle()); + fn = std::move(l); + } + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const ir::operation::SpaceToDepth &node) +{ + const auto ofm_index{node.getOutputs().at(0)}; + const auto ifm_index{node.getInputs().at(ir::operation::SpaceToDepth::Input::INPUT)}; + + auto block_size = node.param().block_size; + + auto ofm_alloc = _tensor_builder->at(ofm_index).get(); + auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + + auto fn = nnfw::cpp14::make_unique<::arm_compute::CLSpaceToDepth>(); + + fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), block_size); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const ir::operation::L2Pool2D &node) +{ + const auto ofm_index{node.getOutputs().at(0)}; + const auto ifm_index{node.getInputs().at(ir::operation::L2Pool2D::Input::INPUT)}; + + const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature(_current_subg_layout); + const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature(_current_subg_layout); + + uint32_t kw = node.param().kw; + uint32_t kh = node.param().kh; + const auto stride = node.param().stride; + const auto padding = + neurun::util::calculatePadding(node.param().padding, ifm_shape, ofm_shape, stride, kw, kh); + const auto activation = node.param().activation; + + auto ofm_alloc = _tensor_builder->at(ofm_index).get(); + auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + + ::arm_compute::PoolingLayerInfo info{ + ::arm_compute::PoolingType::L2, ::arm_compute::Size2D{kw, kh}, + ::neurun::backend::acl_common::asPadStrideInfo(padding, stride)}; + + auto fn = nnfw::cpp14::make_unique<::arm_compute::CLPoolingLayer>(); + + fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), info); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); + + ActivationBuilder{*_execution_builder}.append(activation, ofm_alloc->handle()); +} + +void KernelGenerator::visit(const ir::operation::EmbeddingLookup &node) +{ + const auto output_index{node.getOutputs().at(0)}; + const auto lookups_index{node.getInputs().at(ir::operation::EmbeddingLookup::Input::LOOKUPS)}; + const auto values_index{node.getInputs().at(ir::operation::EmbeddingLookup::Input::VALUES)}; + + auto output_alloc = _tensor_builder->at(output_index).get(); + auto lookups_alloc = _tensor_builder->at(lookups_index).get(); + auto values_alloc = _tensor_builder->at(values_index).get(); + + auto fn = nnfw::cpp14::make_unique<::arm_compute::CLEmbeddingLookup>(); + + fn->configure(values_alloc->handle(), output_alloc->handle(), lookups_alloc->handle()); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const ir::operation::L2Normalization &node) +{ + const auto
ofm_index{node.getOutputs().at(0)}; + const auto ifm_index{node.getInputs().at(ir::operation::L2Normalization::Input::INPUT)}; + + // {CL|Neon}L2Normalization performs the reduction only along dimension 0 + // L2 Normalization always performs the reduction along the depth axis + // Thus, we repurpose {CL|Neon}NormalizationLayers to act as depthwise L2 normalizations by + // choosing normalization parameters as below + + const auto &ifm_shape = _ctx.at(ifm_index).shape(); + // TODO Support optional constant dimension that normalization would be performed on + const auto normalization_axis = node.param().rank - 1; + int32_t radius = + 2 * ifm_shape.dim(normalization_axis) + 1; // normSize = depth(last dimension) * 2 + 1 + float alpha = 1.0f; // In the implementation to make alpha_ become 1 + float beta = 0.5f; // pow(reduction, -0.5) = 1 / sqrt(reduction) + float bias = 0.0f; // Don't offset the reduction. + + auto ofm_alloc = _tensor_builder->at(ofm_index).get(); + auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + + const auto norm_info = ::arm_compute::NormalizationLayerInfo(::arm_compute::NormType::CROSS_MAP, + radius, alpha, beta, bias, false); + + auto fn = nnfw::cpp14::make_unique<::arm_compute::CLNormalizationLayer>(); + + fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), norm_info); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const ir::operation::HashtableLookup &node) +{ + const auto output_index{node.getOutputs().at(ir::operation::HashtableLookup::Output::OUTPUT)}; + const auto hits_index{node.getOutputs().at(ir::operation::HashtableLookup::Output::HITS)}; + + const auto lookups_index{node.getInputs().at(ir::operation::HashtableLookup::Input::LOOKUPS)}; + const auto keys_index{node.getInputs().at(ir::operation::HashtableLookup::Input::KEYS)}; + const auto values_index{node.getInputs().at(ir::operation::HashtableLookup::Input::VALUES)}; + + auto output_alloc = _tensor_builder->at(output_index).get(); + auto hits_alloc = _tensor_builder->at(hits_index).get(); + + auto lookups_alloc = _tensor_builder->at(lookups_index).get(); + auto keys_alloc = _tensor_builder->at(keys_index).get(); + auto values_alloc = _tensor_builder->at(values_index).get(); + + auto fn = nnfw::cpp14::make_unique<::arm_compute::CLHashtableLookup>(); + + fn->configure(lookups_alloc->handle(), keys_alloc->handle(), values_alloc->handle(), + output_alloc->handle(), hits_alloc->handle()); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const ir::operation::PReLU &node) +{ + const auto ofm_index{node.getOutputs().at(0)}; + const auto ifm_index{node.getInputs().at(ir::operation::PReLU::Input::INPUT)}; + const auto alpha_index{node.getInputs().at(ir::operation::PReLU::Input::ALPHA)}; + + auto ofm_alloc = _tensor_builder->at(ofm_index).get(); + auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + auto alpha_alloc = _tensor_builder->at(alpha_index).get(); + + auto fn = nnfw::cpp14::make_unique<::arm_compute::CLPReLU>(); + + fn->configure(ifm_alloc->handle(), alpha_alloc->handle(), ofm_alloc->handle()); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const ir::operation::TransposeConv &node) +{ + const auto ofm_index{node.getOutputs().at(0)}; + const auto output_shape_index{ + node.getInputs().at(ir::operation::TransposeConv::Input::OUTPUT_SHAPE)}; + 
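+ // Note: the OUTPUT_SHAPE operand is not read in this function; the output size is taken
+ // from the ofm operand's shape below.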
const auto ker_index{node.getInputs().at(ir::operation::TransposeConv::Input::KERNEL)}; + const auto ifm_index{node.getInputs().at(ir::operation::TransposeConv::Input::INPUT)}; + + const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature(_current_subg_layout); + const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature(_current_subg_layout); + const auto ker_shape = _ctx.at(ker_index).shape().asFeature(_current_subg_layout); + + const auto stride = node.param().stride; + + assert((node.param().padding.type == ir::PaddingType::SAME) || + (node.param().padding.type == ir::PaddingType::VALID)); + auto padding = neurun::util::calculatePadding(node.param().padding, ofm_shape, ifm_shape, stride, + ker_shape.W, ker_shape.H); + + uint32_t invalid_horizontal = 0; + uint32_t invalid_vertical = 0; + if (node.param().padding.type == ir::PaddingType::VALID) + { + invalid_horizontal = + ofm_shape.W - (1 + (ifm_shape.W - 1) * stride.horizontal) - (ker_shape.W - 1); + invalid_vertical = ofm_shape.H - (1 + (ifm_shape.H - 1) * stride.vertical) - (ker_shape.H - 1); + } + + auto ofm_alloc = _tensor_builder->at(ofm_index).get(); + auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + auto ker_alloc = _tensor_builder->at(ker_index).get(); + + const auto tconv_info = acl_common::asPadStrideInfo(padding, stride); + + auto fn = nnfw::cpp14::make_unique<::arm_compute::CLTransposeConvLayer>( + _tensor_builder->acl_tensor_manager()->internal_buffer_manager()); + + fn->configure(ifm_alloc->handle(), ker_alloc->handle(), nullptr, ofm_alloc->handle(), tconv_info, + invalid_horizontal, invalid_vertical); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const ir::operation::SQRT &node) +{ + const auto output_index{node.getOutputs().at(0)}; + const auto input_index{node.getInputs().at(ir::operation::SQRT::Input::INPUT)}; + + auto output_alloc = _tensor_builder->at(output_index).get(); + auto input_alloc = _tensor_builder->at(input_index).get(); + + const ::arm_compute::ActivationLayerInfo act_info{ + ::arm_compute::ActivationLayerInfo::ActivationFunction::SQRT}; + + auto fn = nnfw::cpp14::make_unique<::arm_compute::CLActivationLayer>(); + + fn->configure(input_alloc->handle(), output_alloc->handle(), act_info); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const ir::operation::LogicalOr &node) +{ + const auto output_index{node.getOutputs().at(0)}; + const auto input0_index{node.getInputs().at(ir::operation::LogicalOr::Input::INPUT0)}; + const auto input1_index{node.getInputs().at(ir::operation::LogicalOr::Input::INPUT1)}; + + auto output_alloc = _tensor_builder->at(output_index).get(); + auto input0_alloc = _tensor_builder->at(input0_index).get(); + auto input1_alloc = _tensor_builder->at(input1_index).get(); + + auto fn = nnfw::cpp14::make_unique<::arm_compute::CLBitwiseOr>(); + + fn->configure(input0_alloc->handle(), input1_alloc->handle(), output_alloc->handle()); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const ir::operation::LogicalNot &node) +{ + const auto output_index{node.getOutputs().at(0)}; + const auto input_index{node.getInputs().at(ir::operation::LogicalNot::Input::INPUT)}; + + auto output_alloc = _tensor_builder->at(output_index).get(); + auto input_alloc = _tensor_builder->at(input_index).get(); + + auto fn = 
nnfw::cpp14::make_unique<::arm_compute::CLBitwiseNot>(); + + fn->configure(input_alloc->handle(), output_alloc->handle()); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const ir::operation::SquaredDifference &node) +{ + const auto ofm_index{node.getOutputs().at(0)}; + const auto lhs_index{node.getInputs().at(ir::operation::SquaredDifference::Input::LHS)}; + const auto rhs_index{node.getInputs().at(ir::operation::SquaredDifference::Input::RHS)}; + + auto ofm_alloc = _tensor_builder->at(ofm_index).get(); + auto lhs_alloc = _tensor_builder->at(lhs_index).get(); + auto rhs_alloc = _tensor_builder->at(rhs_index).get(); + + auto fn = nnfw::cpp14::make_unique<::arm_compute::CLElementwiseSquaredDiff>(); + + fn->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle()); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const ir::operation::TopKV2 &node) +{ + const auto outputValues_index{node.getOutputs().at(ir::operation::TopKV2::Output::OUTPUT_VALUES)}; + const auto outputIndices_index{ + node.getOutputs().at(ir::operation::TopKV2::Output::OUTPUT_INDICES)}; + + const auto inputData_index{node.getInputs().at(ir::operation::TopKV2::Input::INPUT)}; + + // Currently, only rank-1 (vector) or rank-2 input is supported. + assert(_ctx.at(inputData_index).shape().rank() == 1 || + _ctx.at(inputData_index).shape().rank() == 2); + + const auto k = node.param().k; + + auto values_alloc = _tensor_builder->at(outputValues_index).get(); + auto indices_alloc = _tensor_builder->at(outputIndices_index).get(); + auto input_alloc = _tensor_builder->at(inputData_index).get(); + + auto fn = nnfw::cpp14::make_unique<::arm_compute::CLTopKV2>(); + + fn->configure(input_alloc->handle(), k, values_alloc->handle(), indices_alloc->handle()); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const ir::operation::Gather &node) +{ + const auto ofm_index{node.getOutputs().at(0)}; + + const auto ifm_index{node.getInputs().at(ir::operation::Gather::Input::INPUT)}; + const auto indices_index{node.getInputs().at(ir::operation::Gather::Input::INDICES)}; + + const auto ifm_shape = _ctx.at(ifm_index).shape(); + + const auto ifm_rank = node.param().rank; + const auto axis_raw = node.param().axis; + const auto axis_value = (axis_raw < 0 ? (ifm_rank + axis_raw) : axis_raw); + const int axis = ::neurun::backend::acl_common::ToARMComputeAxis(ifm_rank, axis_value).value(); + + auto ofm_alloc = _tensor_builder->at(ofm_index).get(); + auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + auto indices_alloc = _tensor_builder->at(indices_index).get(); + + // NOTE The frontend layout and backend layout must be the same for this operation. + // If they differ, we would have to add a stage that permutes the output tensor, which + // is not efficient even if it works. It would be better to give these backend tensors + // the same layout instead. + // One more thing to consider: this operation depends on the layout of the model. For + // example, if an NHWC model uses this operation with output rank == 4, indices rank == 2 + // and axis == 2, the gather spans the W and C axes; but W and C are not adjacent in + // NCHW, so an NCHW backend cannot handle this case.
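+ // The asserts below enforce that precondition: all three backend tensors share one
+ // layout, and for rank >= 4 it must also match the frontend layout.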
+ const auto backend_layout = ofm_alloc->layout(); + UNUSED_RELEASE(backend_layout); + assert(backend_layout == ifm_alloc->layout()); + assert(backend_layout == indices_alloc->layout()); + assert(ifm_rank < 4 || _current_subg_layout == backend_layout); + + auto fn = nnfw::cpp14::make_unique<::arm_compute::CLGatherEx>(); + + fn->configure(ifm_alloc->handle(), indices_alloc->handle(), ofm_alloc->handle(), axis); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const ir::operation::Neg &node) +{ + const auto ofm_index{node.getOutputs().at(0)}; + const auto ifm_index{node.getInputs().at(ir::operation::Neg::Input::INPUT)}; + + auto ofm_alloc = _tensor_builder->at(ofm_index).get(); + auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + + auto fn = nnfw::cpp14::make_unique<::arm_compute::CLNeg>(); + + fn->configure(ifm_alloc->handle(), ofm_alloc->handle()); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const ir::operation::Abs &node) +{ + const auto output_index{node.getOutputs().at(0)}; + const auto input_index{node.getInputs().at(ir::operation::Abs::Input::INPUT)}; + + auto output_alloc = _tensor_builder->at(output_index).get(); + auto input_alloc = _tensor_builder->at(input_index).get(); + + const ::arm_compute::ActivationLayerInfo act_info{ + ::arm_compute::ActivationLayerInfo::ActivationFunction::ABS}; + + auto fn = nnfw::cpp14::make_unique<::arm_compute::CLActivationLayer>(); + + fn->configure(input_alloc->handle(), output_alloc->handle(), act_info); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const ir::operation::ArgMax &node) +{ + const auto ofm_index{node.getOutputs().at(0)}; + const auto ifm_index{node.getInputs().at(ir::operation::ArgMax::Input::INPUT)}; + + auto ifm_shape = _ctx.at(ifm_index).shape(); + auto ofm_shape = _ctx.at(ofm_index).shape(); + + assert((ifm_shape.rank() - 1) == ofm_shape.rank()); + + auto ofm_alloc = _tensor_builder->at(ofm_index).get(); + auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + const auto ifm_rank = node.param().rank; + auto frontend_layout = _current_subg_layout; + auto backend_layout = ifm_alloc->layout(); + + int axis_value = node.param().axis; + if (axis_value < 0) + { + axis_value += ifm_rank; + } + + auto acl_axis = + acl_common::ToARMComputeAxis(ifm_rank, axis_value, frontend_layout, backend_layout).value(); + + auto fn = nnfw::cpp14::make_unique<::arm_compute::CLArgOperation>(); + + fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), {acl_axis}, + ::arm_compute::ArgOperation::MAX); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const ir::operation::Dequantize &node) +{ + const auto output_index{node.getOutputs().at(0)}; + const auto input_index{node.getInputs().at(ir::operation::Dequantize::Input::INPUT)}; + + auto output_alloc = _tensor_builder->at(output_index).get(); + auto input_alloc = _tensor_builder->at(input_index).get(); + + auto fn = nnfw::cpp14::make_unique<::arm_compute::CLCast>(); + + fn->configure(input_alloc->handle(), output_alloc->handle(), arm_compute::SubDataType::NONE); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const ir::operation::Mean &node) +{ + const auto 
ofm_index{node.getOutputs().at(0)}; + const auto ifm_index{node.getInputs().at(ir::operation::Mean::Input::INPUT)}; + const auto &axes{node.param().axes}; + const auto keep_dims{node.param().keep_dims}; + + auto ofm_alloc = _tensor_builder->at(ofm_index).get(); + auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + const auto frontend_layout = _current_subg_layout; + const auto backend_layout = ifm_alloc->layout(); + + // Convert to ACL axes taking into account negative values and possible duplicates. + std::set<std::uint32_t> acl_axes; + const int ifm_rank = node.param().rank; + for (int axis : axes) + { + if (axis < 0) + axis += ifm_rank; + acl_axes.insert( + acl_common::ToARMComputeAxis(ifm_rank, axis, frontend_layout, backend_layout).value()); + } + + arm_compute::Coordinates reduce_axes; + for (const auto axis : acl_axes) + { + reduce_axes.set(reduce_axes.num_dimensions(), axis); + } + + auto fn = nnfw::cpp14::make_unique<::arm_compute::CLReduceMean>(); + + fn->configure(ifm_alloc->handle(), reduce_axes, keep_dims, ofm_alloc->handle()); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const ir::operation::LocalResponseNormalization &node) +{ + const auto ofm_index{node.getOutputs().at(0)}; + const auto ifm_index{ + node.getInputs().at(ir::operation::LocalResponseNormalization::Input::INPUT)}; + + auto radius = node.param().radius; + auto alpha = node.param().alpha; + auto beta = node.param().beta; + auto bias = node.param().bias; + + auto ofm_alloc = _tensor_builder->at(ofm_index).get(); + auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + + const auto norm_info = ::arm_compute::NormalizationLayerInfo( + ::arm_compute::NormType::CROSS_MAP, radius * 2 + 1, alpha, beta, bias, false); + + auto fn = nnfw::cpp14::make_unique<::arm_compute::CLNormalizationLayer>(); + + fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), norm_info); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const ir::operation::DepthToSpace &node) +{ + const auto output_index{node.getOutputs().at(0)}; + const auto input_index{node.getInputs().at(ir::operation::DepthToSpace::Input::INPUT)}; + + auto block_size = node.param().block_size; + assert(block_size > 0); + + auto output_alloc = _tensor_builder->at(output_index).get(); + auto input_alloc = _tensor_builder->at(input_index).get(); + + auto fn = nnfw::cpp14::make_unique<::arm_compute::CLDepthToSpace>(); + + fn->configure(input_alloc->handle(), output_alloc->handle(), block_size); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const ir::operation::ReduceMin &node) +{ + const auto ofm_index{node.getOutputs().at(0)}; + const auto ifm_index{node.getInputs().at(ir::operation::ReduceMin::Input::INPUT)}; + const auto &axes{node.param().axes}; + const auto keep_dims{node.param().keep_dims}; + + auto ofm_alloc = _tensor_builder->at(ofm_index).get(); + auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + const auto frontend_layout = _current_subg_layout; + const auto backend_layout = ifm_alloc->layout(); + + // Convert to ACL axes taking into account negative values and possible duplicates. 
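+ // A std::set is used so that axes that alias each other after the negative-axis
+ // wrap-around are de-duplicated before being handed to ACL.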
+ std::set<std::uint32_t> acl_axes; + const int ifm_rank = node.param().rank; + for (int axis : axes) + { + if (axis < 0) + axis += ifm_rank; + acl_axes.insert( + acl_common::ToARMComputeAxis(ifm_rank, axis, frontend_layout, backend_layout).value()); + } + + auto fn = nnfw::cpp14::make_unique<::arm_compute::CLReduceOperation>( + _tensor_builder->acl_tensor_manager()->internal_buffer_manager()); + + fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), acl_axes, keep_dims, + ::arm_compute::ReduceOperation::MIN); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const ir::operation::Split &node) +{ + const auto ifm_index{node.getInputs().at(ir::operation::Split::Input::INPUT)}; + + assert(node.param().num_splits == static_cast<int>(node.getOutputs().size())); + + const auto ifm_rank = node.param().rank; + std::vector<ir::OperandIndex> output_indexes; + for (const auto &output : node.getOutputs()) + output_indexes.emplace_back(output); + + auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + std::vector<arm_compute::ICLTensor *> output_allocs; + for (const auto &ofm_ind : output_indexes) + output_allocs.emplace_back(_tensor_builder->at(ofm_ind).get()->handle()); + + const auto frontend_layout = _current_subg_layout; + const auto backend_layout = ifm_alloc->layout(); + auto axis = node.param().axis; + if (axis < 0) + axis += ifm_rank; + axis = acl_common::ToARMComputeAxis(ifm_rank, axis, frontend_layout, backend_layout).value(); + + auto fn = nnfw::cpp14::make_unique<::arm_compute::CLSplit>(); + + fn->configure(ifm_alloc->handle(), output_allocs, axis); + + _execution_builder->append(asAclFunction(std::move(fn))); +} + +void KernelGenerator::visit(const ir::operation::Unpack &node) +{ + const auto input_index{node.getInputs().at(ir::operation::Unpack::Input::INPUT)}; + auto axis{node.param().axis}; + + const auto input_rank = node.param().rank; + + std::vector<ir::OperandIndex> output_indexes; + for (const auto &output_index : node.getOutputs()) + output_indexes.emplace_back(output_index); + + auto input = _tensor_builder->at(input_index).get()->handle(); + std::vector<arm_compute::ICLTensor *> outputs; + for (const auto &output_index : output_indexes) + outputs.emplace_back(_tensor_builder->at(output_index)->handle()); + + const auto frontend_layout = _current_subg_layout; + const auto backend_layout = _tensor_builder->at(input_index).get()->layout(); + if (axis < 0) + axis += input_rank; + axis = acl_common::ToARMComputeAxis(input_rank, axis, frontend_layout, backend_layout).value(); + + auto fn = nnfw::cpp14::make_unique<::arm_compute::CLUnstack>(); + + fn->configure(input, outputs, axis); + + _execution_builder->append(asAclFunction(std::move(fn))); +} + +void KernelGenerator::visit(const ir::operation::Pad &node) +{ + const auto input_index{node.getInputs().at(ir::operation::Pad::Input::INPUT)}; + const auto pad_index{node.getInputs().at(ir::operation::Pad::Input::PAD)}; + const auto output_index{node.getOutputs().at(0)}; + assert(_ctx.at(pad_index).isConstant()); + + auto rank = node.param().rank; + auto pad_base = _ctx.at(pad_index).data().base(); + + auto input_type = _ctx.at(input_index).typeInfo(); + auto data_type = acl_common::asDataType(input_type.type()); + auto quant_info = ::arm_compute::QuantizationInfo(input_type.scale(), input_type.offset()); + const auto pixel_value = ::arm_compute::PixelValue(0, data_type, quant_info); + + auto input = _tensor_builder->at(input_index).get()->handle(); 
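+ // The PAD operand holds one {before, after} pair of int32 padding amounts per dimension;
+ // the loop below maps each pair onto the corresponding backend axis.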
+ auto output = _tensor_builder->at(output_index).get()->handle(); + + ::arm_compute::PaddingList padding_list; + padding_list.resize(rank); + for (int32_t n = 0; n < rank; ++n) + { + const int32_t *from = reinterpret_cast<const int32_t *>(pad_base) + (n * 2); + + const auto frontend_layout = _current_subg_layout; + const auto backend_layout = _tensor_builder->at(input_index).get()->layout(); + const auto axis = + acl_common::ToARMComputeAxis(rank, n, frontend_layout, backend_layout).value(); + padding_list[axis] = ::arm_compute::PaddingInfo{from[0], from[1]}; + } + auto fn = nnfw::cpp14::make_unique<::arm_compute::CLPadLayer>(); + fn->configure(input, output, padding_list, pixel_value); + + _execution_builder->append(asAclFunction(std::move(fn))); +} + +void KernelGenerator::visit(const ir::operation::Min &node) +{ + const auto ofm_index{node.getOutputs().at(0)}; + const auto lhs_index{node.getInputs().at(ir::operation::Min::Input::LHS)}; + const auto rhs_index{node.getInputs().at(ir::operation::Min::Input::RHS)}; + + auto ofm_alloc = _tensor_builder->at(ofm_index).get(); + auto lhs_alloc = _tensor_builder->at(lhs_index).get(); + auto rhs_alloc = _tensor_builder->at(rhs_index).get(); + + auto fn = nnfw::cpp14::make_unique<::arm_compute::CLElementwiseMin>(); + + fn->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle()); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const ir::operation::Max &node) +{ + const auto ofm_index{node.getOutputs().at(0)}; + const auto lhs_index{node.getInputs().at(ir::operation::Max::Input::LHS)}; + const auto rhs_index{node.getInputs().at(ir::operation::Max::Input::RHS)}; + + auto ofm_alloc = _tensor_builder->at(ofm_index).get(); + auto lhs_alloc = _tensor_builder->at(lhs_index).get(); + auto rhs_alloc = _tensor_builder->at(rhs_index).get(); + + auto fn = nnfw::cpp14::make_unique<::arm_compute::CLElementwiseMax>(); + + fn->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle()); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +} // namespace acl_cl +} // namespace backend +} // namespace neurun diff --git a/runtime/neurun/backend/acl_cl/KernelGenerator.h b/runtime/neurun/backend/acl_cl/KernelGenerator.h new file mode 100644 index 000000000..a577f1ebc --- /dev/null +++ b/runtime/neurun/backend/acl_cl/KernelGenerator.h @@ -0,0 +1,112 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __NEURUN_BACKEND_ACL_CL_KERNEL_GENERATOR_H__ +#define __NEURUN_BACKEND_ACL_CL_KERNEL_GENERATOR_H__ + +#include <backend/IKernelGenerator.h> + +#include "ir/Operands.h" +#include "TensorBuilder.h" + +namespace neurun +{ +namespace backend +{ +namespace acl_cl +{ + +class KernelGenerator : public IKernelGenerator +{ +public: + KernelGenerator(const ir::Operands &ctx, const std::shared_ptr<TensorBuilder> &tensor_builder); + + void visit(const ir::OpSequence &) override; + void visit(const ir::operation::BatchToSpaceND &) override; + void visit(const ir::operation::Conv2D &) override; + void visit(const ir::operation::DepthwiseConv2D &) override; + void visit(const ir::operation::MaxPool2D &) override; + void visit(const ir::operation::AvgPool2D &) override; + void visit(const ir::operation::Concat &) override; + void visit(const ir::operation::FullyConnected &) override; + void visit(const ir::operation::Mul &) override; + void visit(const ir::operation::ReduceSum &) override; + void visit(const ir::operation::Reshape &) override; + void visit(const ir::operation::Squeeze &) override; + void visit(const ir::operation::Tanh &) override; + void visit(const ir::operation::Softmax &) override; + void visit(const ir::operation::Slice &) override; + void visit(const ir::operation::StridedSlice &) override; + void visit(const ir::operation::Transpose &) override; + void visit(const ir::operation::Add &) override; + void visit(const ir::operation::Sub &) override; + void visit(const ir::operation::Cast &) override; + void visit(const ir::operation::Div &) override; + void visit(const ir::operation::Exp &) override; + void visit(const ir::operation::InstanceNorm &) override; + void visit(const ir::operation::Logistic &) override; + void visit(const ir::operation::ReduceMax &) override; + void visit(const ir::operation::Comparison &) override; + void visit(const ir::operation::LogicalAnd &) override; + void visit(const ir::operation::LSTM &) override; + void visit(const ir::operation::Pack &) override; + void visit(const ir::operation::Permute &) override; + void visit(const ir::operation::RSQRT &) override; + void visit(const ir::operation::ReLU &) override; + void visit(const ir::operation::ResizeBilinear &) override; + void visit(const ir::operation::ReLU1 &) override; + void visit(const ir::operation::ReLU6 &) override; + void visit(const ir::operation::RNN &) override; + void visit(const ir::operation::Floor &) override; + void visit(const ir::operation::SpaceToBatchND &) override; + void visit(const ir::operation::SpaceToDepth &) override; + void visit(const ir::operation::L2Pool2D &) override; + void visit(const ir::operation::EmbeddingLookup &) override; + void visit(const ir::operation::L2Normalization &) override; + void visit(const ir::operation::HashtableLookup &) override; + void visit(const ir::operation::PReLU &) override; + void visit(const ir::operation::TransposeConv &) override; + void visit(const ir::operation::SQRT &) override; + void visit(const ir::operation::LogicalOr &) override; + void visit(const ir::operation::LogicalNot &) override; + void visit(const ir::operation::SquaredDifference &) override; + void visit(const ir::operation::TopKV2 &) override; + void visit(const ir::operation::Gather &) override; + void visit(const ir::operation::Neg &) override; + void visit(const ir::operation::Abs &) override; + void visit(const ir::operation::ArgMax &) override; + void visit(const ir::operation::Dequantize &) override; + void visit(const ir::operation::Mean &) 
override; + void visit(const ir::operation::LocalResponseNormalization &) override; + void visit(const ir::operation::DepthToSpace &) override; + void visit(const ir::operation::ReduceMin &) override; + void visit(const ir::operation::Split &) override; + void visit(const ir::operation::Unpack &) override; + void visit(const ir::operation::Pad &) override; + void visit(const ir::operation::Min &) override; + void visit(const ir::operation::Max &) override; + +private: + const ir::Operands &_ctx; + std::shared_ptr<TensorBuilder> _tensor_builder; + ir::Layout _current_subg_layout; +}; + +} // namespace acl_cl +} // namespace backend +} // namespace neurun + +#endif // __NEURUN_BACKEND_ACL_CL_KERNEL_GENERATOR_H__ diff --git a/runtime/neurun/backend/acl_cl/PluginClassesAllocator.cc b/runtime/neurun/backend/acl_cl/PluginClassesAllocator.cc new file mode 100644 index 000000000..ac3f0acff --- /dev/null +++ b/runtime/neurun/backend/acl_cl/PluginClassesAllocator.cc @@ -0,0 +1,33 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include <util/logging.h> + +#include "Backend.h" + +extern "C" { +neurun::backend::Backend *neurun_backend_create() +{ + VERBOSE(neurun_backend_create) << "'acl_cl' loaded\n"; + return new neurun::backend::acl_cl::Backend; +} + +void neurun_backend_destroy(neurun::backend::Backend *backend) +{ + VERBOSE(neurun_backend_create) << "'acl_cl' unloaded\n"; + delete backend; +} +} diff --git a/runtime/neurun/backend/acl_cl/ShapeFixer.cc b/runtime/neurun/backend/acl_cl/ShapeFixer.cc new file mode 100644 index 000000000..e1cbeab6c --- /dev/null +++ b/runtime/neurun/backend/acl_cl/ShapeFixer.cc @@ -0,0 +1,434 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "ShapeFixer.h" + +#include <arm_compute/runtime/CL/CLFunctions.h> // Include all ARM Compute CL functions +#include <arm_compute/runtime/CL/CLFunctionsEx.h> // Include all ARM Compute EX CL functions + +#include <AclFunction.h> +#include <Convert.h> +#include <Swizzle.h> + +#include "ir/Index.h" +#include "compiler/IExecutionBuilder.h" +#include "exec/NopFunction.h" +#include "util/logging.h" +#include "util/Utils.h" +#include "util/Padding.h" + +using ::neurun::compiler::IExecutionBuilder; + +namespace neurun +{ +namespace backend +{ +namespace acl_cl +{ + +using ::neurun::backend::acl_common::asAclFunction; + +ShapeFixer::ShapeFixer(const ir::Operands &ctx, + const std::shared_ptr<TensorBuilder> &tensor_builder) + : _ctx(ctx), _tensor_builder(tensor_builder) +{ + assert(tensor_builder); +} + +void ShapeFixer::visit(const ir::operation::BatchToSpaceND &node) +{ + const auto ofm_index{node.getOutputs().at(0)}; + const auto ifm_index{node.getInputs().at(ir::operation::BatchToSpaceND::Input::INPUT)}; + _tensor_builder->dimCorrection(ofm_index, false); + _tensor_builder->dimCorrection(ifm_index, false); +} + +void ShapeFixer::visit(const ir::operation::Cast &) { /* DO NOTHING */} + +void ShapeFixer::visit(const ir::operation::Conv2D &) { /* DO NOTHING */} + +void ShapeFixer::visit(const ir::operation::DepthwiseConv2D &) { /* DO NOTHING */} + +void ShapeFixer::visit(const ir::operation::MaxPool2D &) { /* DO NOTHING */} + +void ShapeFixer::visit(const ir::operation::AvgPool2D &) { /* DO NOTHING */} + +void ShapeFixer::visit(const ir::operation::Concat &node) +{ + const auto ofm_index{node.getOutputs().at(0)}; + _tensor_builder->dimCorrection(ofm_index, false); + for (const auto &input : node.getInputs()) + _tensor_builder->dimCorrection(input, false); +} + +void ShapeFixer::visit(const ir::operation::FullyConnected &node) +{ + using ir::operation::FullyConnected; + const auto input_index{node.getInputs().at(FullyConnected::Input::INPUT)}; + const auto input_rank = _ctx.at(input_index).shape().rank(); + if (input_rank == 4) + _tensor_builder->dimCorrection(input_index, false); +} + +void ShapeFixer::visit(const ir::operation::Mul &node) +{ + const auto lhs_index{node.getInputs().at(ir::operation::Mul::Input::LHS)}; + const auto rhs_index{node.getInputs().at(ir::operation::Mul::Input::RHS)}; + + if (!(_ctx.at(lhs_index).shape() == _ctx.at(rhs_index).shape())) + { + const auto broadcast_rank = + std::max(_ctx.at(lhs_index).shape().rank(), _ctx.at(rhs_index).shape().rank()); + + // TODO remove const_cast later. 
For example, _ctx may need to be a non const variable or + // a node to extend shape may be inserted in front of this operation + const_cast<ir::Shape &>(_ctx.at(lhs_index).shape()).extendRank(broadcast_rank); + const_cast<ir::Shape &>(_ctx.at(rhs_index).shape()).extendRank(broadcast_rank); + } +} + +void ShapeFixer::visit(const ir::operation::ReduceSum &) { /* DO NOTHING */} + +void ShapeFixer::visit(const ir::operation::Reshape &node) +{ + const auto output_index{node.getOutputs().at(0)}; + const auto input_index{node.getInputs().at(ir::operation::Reshape::Input::INPUT)}; + _tensor_builder->dimCorrection(input_index, false); + _tensor_builder->dimCorrection(output_index, false); +} + +void ShapeFixer::visit(const ir::operation::Squeeze &node) +{ + const auto output_index{node.getOutputs().at(0)}; + if (_ctx.at(output_index).shape().rank() == 0) + const_cast<ir::Shape &>(_ctx.at(output_index).shape()).extendRank(1); + const auto input_index{node.getInputs().at(ir::operation::Squeeze::Input::INPUT)}; + _tensor_builder->dimCorrection(input_index, false); + _tensor_builder->dimCorrection(output_index, false); +} + +void ShapeFixer::visit(const ir::operation::Tanh &) { /* DO NOTHING */} + +void ShapeFixer::visit(const ir::operation::Softmax &) { /* DO NOTHING */} + +void ShapeFixer::visit(const ir::operation::Slice &) { /* DO NOTHING */} + +void ShapeFixer::visit(const ir::operation::StridedSlice &node) +{ + const auto ofm_index{node.getOutputs().at(0)}; + const auto ifm_index{node.getInputs().at(ir::operation::StridedSlice::Input::INPUT)}; + _tensor_builder->dimCorrection(ofm_index, false); + _tensor_builder->dimCorrection(ifm_index, false); +} + +void ShapeFixer::visit(const ir::operation::Transpose &) { /* DO NOTHING */} + +void ShapeFixer::visit(const ir::operation::Add &node) +{ + const auto lhs_index{node.getInputs().at(ir::operation::Add::Input::LHS)}; + const auto rhs_index{node.getInputs().at(ir::operation::Add::Input::RHS)}; + + if (!(_ctx.at(lhs_index).shape() == _ctx.at(rhs_index).shape())) + { + const auto broadcast_rank = + std::max(_ctx.at(lhs_index).shape().rank(), _ctx.at(rhs_index).shape().rank()); + const_cast<ir::Shape &>(_ctx.at(lhs_index).shape()).extendRank(broadcast_rank); + const_cast<ir::Shape &>(_ctx.at(rhs_index).shape()).extendRank(broadcast_rank); + } +} + +void ShapeFixer::visit(const ir::operation::Sub &node) +{ + const auto lhs_index{node.getInputs().at(ir::operation::Sub::Input::LHS)}; + const auto rhs_index{node.getInputs().at(ir::operation::Sub::Input::RHS)}; + + if (!(_ctx.at(lhs_index).shape() == _ctx.at(rhs_index).shape())) + { + const auto broadcast_rank = + std::max(_ctx.at(lhs_index).shape().rank(), _ctx.at(rhs_index).shape().rank()); + + // TODO remove const_cast later. For example, _ctx may need to be a non const variable or + // a node to extend shape may be inserted in front of this operation + const_cast<ir::Shape &>(_ctx.at(lhs_index).shape()).extendRank(broadcast_rank); + const_cast<ir::Shape &>(_ctx.at(rhs_index).shape()).extendRank(broadcast_rank); + } +} + +void ShapeFixer::visit(const ir::operation::Div &node) +{ + const auto lhs_index{node.getInputs().at(ir::operation::Div::Input::LHS)}; + const auto rhs_index{node.getInputs().at(ir::operation::Div::Input::RHS)}; + + if (!(_ctx.at(lhs_index).shape() == _ctx.at(rhs_index).shape())) + { + const auto broadcast_rank = + std::max(_ctx.at(lhs_index).shape().rank(), _ctx.at(rhs_index).shape().rank()); + + // TODO remove const_cast later. 
For example, _ctx may need to be a non const variable or + // a node to extend shape may be inserted in front of this operation + const_cast<ir::Shape &>(_ctx.at(lhs_index).shape()).extendRank(broadcast_rank); + const_cast<ir::Shape &>(_ctx.at(rhs_index).shape()).extendRank(broadcast_rank); + } +} + +void ShapeFixer::visit(const ir::operation::Exp &) { /* DO NOTHING */} + +void ShapeFixer::visit(const ir::operation::InstanceNorm &) { /* DO NOTHING */} + +void ShapeFixer::visit(const ir::operation::Logistic &) { /* DO NOTHING */} + +void ShapeFixer::visit(const ir::operation::LogicalAnd &node) +{ + const auto input0_index{node.getInputs().at(ir::operation::LogicalAnd::Input::INPUT0)}; + const auto input1_index{node.getInputs().at(ir::operation::LogicalAnd::Input::INPUT1)}; + + if (!(_ctx.at(input0_index).shape() == _ctx.at(input1_index).shape())) + { + const auto broadcast_rank = + std::max(_ctx.at(input0_index).shape().rank(), _ctx.at(input1_index).shape().rank()); + + // TODO remove const_cast later. For example, _ctx may need to be a non const variable or + // a node to extend shape may be inserted in front of this operation + const_cast<ir::Shape &>(_ctx.at(input0_index).shape()).extendRank(broadcast_rank); + const_cast<ir::Shape &>(_ctx.at(input1_index).shape()).extendRank(broadcast_rank); + } +} + +void ShapeFixer::visit(const ir::operation::LSTM &) { /* DO NOTHING */} + +void ShapeFixer::visit(const ir::operation::ReduceMax &) { /* DO NOTHING */} + +void ShapeFixer::visit(const ir::operation::Comparison &node) +{ + const auto input0_index{node.getInputs().at(ir::operation::Comparison::Input::INPUT0)}; + const auto input1_index{node.getInputs().at(ir::operation::Comparison::Input::INPUT1)}; + + if (!(_ctx.at(input0_index).shape() == _ctx.at(input1_index).shape())) + { + const auto broadcast_rank = + std::max(_ctx.at(input0_index).shape().rank(), _ctx.at(input1_index).shape().rank()); + + // TODO remove const_cast later. For example, _ctx may need to be a non const variable or + // a node to extend shape may be inserted in front of this operation + const_cast<ir::Shape &>(_ctx.at(input0_index).shape()).extendRank(broadcast_rank); + const_cast<ir::Shape &>(_ctx.at(input1_index).shape()).extendRank(broadcast_rank); + } +} + +void ShapeFixer::visit(const ir::operation::Pack &node) +{ + const auto ofm_index{node.getOutputs().at(0)}; + _tensor_builder->dimCorrection(ofm_index, false); + for (const auto &inputs : node.getInputs()) + { + _tensor_builder->dimCorrection(inputs, false); + const auto ofm_rank = _ctx.at(ofm_index).shape().rank(); + + // TODO remove const_cast later. 
For example, _ctx may need to be a non const variable or + // a node to extend shape may be inserted in front of this operation + const_cast<ir::Shape &>(_ctx.at(inputs).shape()).extendRank(ofm_rank); + } +} + +void ShapeFixer::visit(const ir::operation::Permute &) { /* DO NOTHING */} + +void ShapeFixer::visit(const ir::operation::RSQRT &) { /* DO NOTHING */} + +void ShapeFixer::visit(const ir::operation::ReLU &) { /* DO NOTHING */} + +void ShapeFixer::visit(const ir::operation::ResizeBilinear &) { /* DO NOTHING */} + +void ShapeFixer::visit(const ir::operation::ReLU1 &) { /* DO NOTHING */} + +void ShapeFixer::visit(const ir::operation::ReLU6 &) { /* DO NOTHING */} + +void ShapeFixer::visit(const ir::operation::RNN &) { /* DO NOTHING */} + +void ShapeFixer::visit(const ir::operation::Floor &) { /* DO NOTHING */} + +void ShapeFixer::visit(const ir::operation::SpaceToBatchND &node) +{ + const auto ofm_index{node.getOutputs().at(0)}; + const auto ifm_index{node.getInputs().at(ir::operation::SpaceToBatchND::Input::INPUT)}; + _tensor_builder->dimCorrection(ofm_index, false); + _tensor_builder->dimCorrection(ifm_index, false); +} + +void ShapeFixer::visit(const ir::operation::SpaceToDepth &node) +{ + const auto ofm_index{node.getOutputs().at(0)}; + const auto ifm_index{node.getInputs().at(ir::operation::SpaceToDepth::Input::INPUT)}; + _tensor_builder->dimCorrection(ofm_index, false); + _tensor_builder->dimCorrection(ifm_index, false); +} + +void ShapeFixer::visit(const ir::operation::L2Pool2D &) { /* DO NOTHING */} + +void ShapeFixer::visit(const ir::operation::EmbeddingLookup &node) +{ + const auto output_index{node.getOutputs().at(0)}; + const auto values_index{node.getInputs().at(ir::operation::EmbeddingLookup::Input::VALUES)}; + _tensor_builder->dimCorrection(values_index, false); + _tensor_builder->dimCorrection(output_index, false); +} + +void ShapeFixer::visit(const ir::operation::L2Normalization &) { /* DO NOTHING */} + +void ShapeFixer::visit(const ir::operation::HashtableLookup &) { /* DO NOTHING */} + +void ShapeFixer::visit(const ir::operation::PReLU &node) +{ + const auto ifm_index{node.getInputs().at(ir::operation::PReLU::Input::INPUT)}; + const auto alpha_index{node.getInputs().at(ir::operation::PReLU::Input::ALPHA)}; + + if (!(_ctx.at(ifm_index).shape() == _ctx.at(alpha_index).shape())) + { + const auto broadcast_rank = + std::max(_ctx.at(ifm_index).shape().rank(), _ctx.at(alpha_index).shape().rank()); + const_cast<ir::Shape &>(_ctx.at(ifm_index).shape()).extendRank(broadcast_rank); + const_cast<ir::Shape &>(_ctx.at(alpha_index).shape()).extendRank(broadcast_rank); + } +} + +void ShapeFixer::visit(const ir::operation::TransposeConv &) { /* DO NOTHING */} + +void ShapeFixer::visit(const ir::operation::SQRT &) { /* DO NOTHING */} + +void ShapeFixer::visit(const ir::operation::LogicalOr &node) +{ + const auto input0_index{node.getInputs().at(ir::operation::LogicalOr::Input::INPUT0)}; + const auto input1_index{node.getInputs().at(ir::operation::LogicalOr::Input::INPUT1)}; + + if (!(_ctx.at(input0_index).shape() == _ctx.at(input1_index).shape())) + { + const auto broadcast_rank = + std::max(_ctx.at(input0_index).shape().rank(), _ctx.at(input1_index).shape().rank()); + const_cast<ir::Shape &>(_ctx.at(input0_index).shape()).extendRank(broadcast_rank); + const_cast<ir::Shape &>(_ctx.at(input1_index).shape()).extendRank(broadcast_rank); + } +} + +void ShapeFixer::visit(const ir::operation::LogicalNot &) { /* DO NOTHING */} + +void ShapeFixer::visit(const ir::operation::SquaredDifference 
&node) +{ + const auto lhs_index{node.getInputs().at(ir::operation::SquaredDifference::Input::LHS)}; + const auto rhs_index{node.getInputs().at(ir::operation::SquaredDifference::Input::RHS)}; + + if (!(_ctx.at(lhs_index).shape() == _ctx.at(rhs_index).shape())) + { + const auto broadcast_rank = + std::max(_ctx.at(lhs_index).shape().rank(), _ctx.at(rhs_index).shape().rank()); + const_cast<ir::Shape &>(_ctx.at(lhs_index).shape()).extendRank(broadcast_rank); + const_cast<ir::Shape &>(_ctx.at(rhs_index).shape()).extendRank(broadcast_rank); + } +} + +void ShapeFixer::visit(const ir::operation::TopKV2 &) { /* DO NOTHING */} + +void ShapeFixer::visit(const ir::operation::Gather &node) +{ + const auto ofm_index{node.getOutputs().at(0)}; + const auto ifm_index{node.getInputs().at(ir::operation::Gather::Input::INPUT)}; + const auto indices_index{node.getInputs().at(ir::operation::Gather::Input::INDICES)}; + _tensor_builder->dimCorrection(ofm_index, false); + _tensor_builder->dimCorrection(ifm_index, false); + _tensor_builder->dimCorrection(indices_index, false); +} + +void ShapeFixer::visit(const ir::operation::Neg &) { /* DO NOTHING */} + +void ShapeFixer::visit(const ir::operation::Abs &) { /* DO NOTHING */} + +void ShapeFixer::visit(const ir::operation::ArgMax &node) +{ + const auto ofm_index{node.getOutputs().at(0)}; + const auto ifm_index{node.getInputs().at(ir::operation::ArgMax::Input::INPUT)}; + _tensor_builder->dimCorrection(ofm_index, false); + _tensor_builder->dimCorrection(ifm_index, false); +} + +void ShapeFixer::visit(const ir::operation::Dequantize &) { /* DO NOTHING */} + +void ShapeFixer::visit(const ir::operation::Mean &) { /* DO NOTHING */} + +void ShapeFixer::visit(const ir::operation::LocalResponseNormalization &) { /* DO NOTHING */} + +void ShapeFixer::visit(const ir::operation::DepthToSpace &) { /* DO NOTHING */} + +void ShapeFixer::visit(const ir::operation::ReduceMin &) { /* DO NOTHING */} + +void ShapeFixer::visit(const ir::operation::Split &node) +{ + const auto input_index{node.getInputs().at(ir::operation::Split::Input::INPUT)}; + _tensor_builder->dimCorrection(input_index, false); + for (const auto &output : node.getOutputs()) + _tensor_builder->dimCorrection(output, false); +} + +void ShapeFixer::visit(const ir::operation::Unpack &node) +{ + const auto input_index{node.getInputs().at(ir::operation::Unpack::Input::INPUT)}; + _tensor_builder->dimCorrection(input_index, false); + for (const auto &output_index : node.getOutputs()) + _tensor_builder->dimCorrection(output_index, false); +} + +void ShapeFixer::visit(const ir::operation::Pad &node) +{ + const auto input_index{node.getInputs().at(ir::operation::Pad::Input::INPUT)}; + const auto output_index{node.getOutputs().at(0)}; + _tensor_builder->dimCorrection(input_index, false); + _tensor_builder->dimCorrection(output_index, false); +} + +void ShapeFixer::visit(const ir::operation::Min &node) +{ + const auto lhs_index{node.getInputs().at(ir::operation::Min::Input::LHS)}; + const auto rhs_index{node.getInputs().at(ir::operation::Min::Input::RHS)}; + + if (!(_ctx.at(lhs_index).shape() == _ctx.at(rhs_index).shape())) + { + const auto broadcast_rank = + std::max(_ctx.at(lhs_index).shape().rank(), _ctx.at(rhs_index).shape().rank()); + + // TODO remove const_cast later. 
For example, _ctx may need to be a non const variable or
+    // a node to extend shape may be inserted in front of this operation
+    const_cast<ir::Shape &>(_ctx.at(lhs_index).shape()).extendRank(broadcast_rank);
+    const_cast<ir::Shape &>(_ctx.at(rhs_index).shape()).extendRank(broadcast_rank);
+  }
+}
+
+void ShapeFixer::visit(const ir::operation::Max &node)
+{
+  const auto lhs_index{node.getInputs().at(ir::operation::Max::Input::LHS)};
+  const auto rhs_index{node.getInputs().at(ir::operation::Max::Input::RHS)};
+
+  if (!(_ctx.at(lhs_index).shape() == _ctx.at(rhs_index).shape()))
+  {
+    const auto broadcast_rank =
+        std::max(_ctx.at(lhs_index).shape().rank(), _ctx.at(rhs_index).shape().rank());
+
+    // TODO remove const_cast later. For example, _ctx may need to be a non const variable or
+    // a node to extend shape may be inserted in front of this operation
+    const_cast<ir::Shape &>(_ctx.at(lhs_index).shape()).extendRank(broadcast_rank);
+    const_cast<ir::Shape &>(_ctx.at(rhs_index).shape()).extendRank(broadcast_rank);
+  }
+}
+
+} // namespace acl_cl
+} // namespace backend
+} // namespace neurun
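Every broadcast fix-up in this file follows the same pattern: when the two inputs of an elementwise operation disagree in rank, the lower-rank shape is extended in place to the higher rank so ACL sees compatible tensors. A minimal sketch of that idea follows; "Shape" is a simplified stand-in for ir::Shape, and the assumption that extendRank() prepends size-1 dimensions is ours, not something this diff states.

// Hedged sketch of the rank-extension idea used by the broadcast visitors above.
#include <algorithm>
#include <cassert>
#include <vector>

struct Shape // simplified stand-in for ir::Shape
{
  std::vector<int> dims;
  int rank() const { return static_cast<int>(dims.size()); }
  void extendRank(int to_rank)
  {
    assert(to_rank >= rank());
    // Prepend size-1 dimensions: {3} extended to rank 3 becomes {1, 1, 3},
    // which broadcasts cleanly against e.g. {2, 4, 3}.
    dims.insert(dims.begin(), to_rank - rank(), 1);
  }
};

void fixBroadcast(Shape &lhs, Shape &rhs)
{
  if (lhs.dims == rhs.dims)
    return; // identical shapes need no fix-up
  const int broadcast_rank = std::max(lhs.rank(), rhs.rank());
  lhs.extendRank(broadcast_rank);
  rhs.extendRank(broadcast_rank);
}

diff --git a/runtime/neurun/backend/acl_cl/ShapeFixer.h b/runtime/neurun/backend/acl_cl/ShapeFixer.h
new file mode 100644
index 000000000..ec5f5c896
--- /dev/null
+++ b/runtime/neurun/backend/acl_cl/ShapeFixer.h
@@ -0,0 +1,110 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.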
+ */ + +#ifndef __NEURUN_BACKEND_ACL_CL_SHAPE_FIXER_H__ +#define __NEURUN_BACKEND_ACL_CL_SHAPE_FIXER_H__ + +#include <backend/IShapeFixer.h> + +#include "ir/Operands.h" +#include "TensorBuilder.h" + +namespace neurun +{ +namespace backend +{ +namespace acl_cl +{ + +class ShapeFixer : public IShapeFixer +{ +public: + ShapeFixer(const ir::Operands &ctx, const std::shared_ptr<TensorBuilder> &tensor_builder); + + void visit(const ir::operation::BatchToSpaceND &) override; + void visit(const ir::operation::Conv2D &) override; + void visit(const ir::operation::DepthwiseConv2D &) override; + void visit(const ir::operation::MaxPool2D &) override; + void visit(const ir::operation::AvgPool2D &) override; + void visit(const ir::operation::Concat &) override; + void visit(const ir::operation::FullyConnected &) override; + void visit(const ir::operation::Mul &) override; + void visit(const ir::operation::ReduceSum &) override; + void visit(const ir::operation::Reshape &) override; + void visit(const ir::operation::Squeeze &) override; + void visit(const ir::operation::Tanh &) override; + void visit(const ir::operation::Softmax &) override; + void visit(const ir::operation::Slice &) override; + void visit(const ir::operation::StridedSlice &) override; + void visit(const ir::operation::Transpose &) override; + void visit(const ir::operation::Add &) override; + void visit(const ir::operation::Sub &) override; + void visit(const ir::operation::Cast &) override; + void visit(const ir::operation::Div &) override; + void visit(const ir::operation::Exp &) override; + void visit(const ir::operation::InstanceNorm &) override; + void visit(const ir::operation::Logistic &) override; + void visit(const ir::operation::ReduceMax &) override; + void visit(const ir::operation::Comparison &) override; + void visit(const ir::operation::LogicalAnd &) override; + void visit(const ir::operation::LSTM &) override; + void visit(const ir::operation::Pack &) override; + void visit(const ir::operation::Permute &) override; + void visit(const ir::operation::RSQRT &) override; + void visit(const ir::operation::ReLU &) override; + void visit(const ir::operation::ResizeBilinear &) override; + void visit(const ir::operation::ReLU1 &) override; + void visit(const ir::operation::ReLU6 &) override; + void visit(const ir::operation::RNN &) override; + void visit(const ir::operation::Floor &) override; + void visit(const ir::operation::SpaceToBatchND &) override; + void visit(const ir::operation::SpaceToDepth &) override; + void visit(const ir::operation::L2Pool2D &) override; + void visit(const ir::operation::EmbeddingLookup &) override; + void visit(const ir::operation::L2Normalization &) override; + void visit(const ir::operation::HashtableLookup &) override; + void visit(const ir::operation::PReLU &) override; + void visit(const ir::operation::TransposeConv &) override; + void visit(const ir::operation::SQRT &) override; + void visit(const ir::operation::LogicalOr &) override; + void visit(const ir::operation::LogicalNot &) override; + void visit(const ir::operation::SquaredDifference &) override; + void visit(const ir::operation::TopKV2 &) override; + void visit(const ir::operation::Gather &) override; + void visit(const ir::operation::Neg &) override; + void visit(const ir::operation::Abs &) override; + void visit(const ir::operation::ArgMax &) override; + void visit(const ir::operation::Dequantize &) override; + void visit(const ir::operation::Mean &) override; + void visit(const ir::operation::LocalResponseNormalization &) 
override; + void visit(const ir::operation::DepthToSpace &) override; + void visit(const ir::operation::ReduceMin &) override; + void visit(const ir::operation::Split &) override; + void visit(const ir::operation::Unpack &) override; + void visit(const ir::operation::Pad &) override; + void visit(const ir::operation::Min &) override; + void visit(const ir::operation::Max &) override; + +private: + const ir::Operands &_ctx; + std::shared_ptr<TensorBuilder> _tensor_builder; +}; + +} // namespace acl_cl +} // namespace backend +} // namespace neurun + +#endif // __NEURUN_BACKEND_ACL_CL_SHAPE_FIXER_H__ diff --git a/runtime/neurun/backend/acl_cl/TensorBuilder.h b/runtime/neurun/backend/acl_cl/TensorBuilder.h new file mode 100644 index 000000000..b9a0dd4a6 --- /dev/null +++ b/runtime/neurun/backend/acl_cl/TensorBuilder.h @@ -0,0 +1,39 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __NEURUN_BACKEND_ACL_CL_TENSOR_BUILDER_H__ +#define __NEURUN_BACKEND_ACL_CL_TENSOR_BUILDER_H__ + +#include <TemplTensorBuilder.h> + +#include "operand/CLTensor.h" +#include "operand/CLSubTensor.h" + +namespace neurun +{ +namespace backend +{ +namespace acl_cl +{ + +using TensorBuilder = + acl_common::TemplTensorBuilder<operand::ICLTensor, operand::CLTensor, operand::CLSubTensor>; + +} // namespace acl_cl +} // namespace backend +} // namespace neurun + +#endif // __NEURUN_BACKEND_ACL_CL_TENSOR_BUILDER_H__ diff --git a/runtime/neurun/backend/acl_cl/TensorManager.h b/runtime/neurun/backend/acl_cl/TensorManager.h new file mode 100644 index 000000000..fd2a9059c --- /dev/null +++ b/runtime/neurun/backend/acl_cl/TensorManager.h @@ -0,0 +1,80 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#ifndef __NEURUN_BACKEND_ACL_CL_TENSOR_MANAGER_H__
+#define __NEURUN_BACKEND_ACL_CL_TENSOR_MANAGER_H__
+
+#include <arm_compute/runtime/CL/CLBufferAllocator.h>
+#include <arm_compute/runtime/PoolManager.h>
+#include <arm_compute/runtime/BlobLifetimeManager.h>
+#include <arm_compute/runtime/MemoryManagerOnDemand.h>
+#include <arm_compute/runtime/CL/CLMemoryGroup.h>
+
+#include <AclMemoryManager.h>
+#include <AclLinearMemoryManager.h>
+#include <AclInternalBufferManager.h>
+#include <AclTensorManager.h>
+
+#include "operand/CLTensor.h"
+#include "operand/CLSubTensor.h"
+
+#include "util/logging.h"
+
+namespace neurun
+{
+namespace backend
+{
+namespace acl_cl
+{
+
+using MemoryManager =
+    acl_common::AclMemoryManager<operand::ICLTensor, operand::CLTensor, operand::CLSubTensor>;
+
+using LinearMemoryManager = acl_common::AclLinearMemoryManager<
+    operand::ICLTensor, operand::CLTensor, operand::CLSubTensor,
+    ::arm_compute::MemoryManagerOnDemand, ::arm_compute::PoolManager,
+    ::arm_compute::BlobLifetimeManager, ::arm_compute::CLBufferAllocator,
+    ::arm_compute::CLMemoryGroup>;
+
+using InternalBufferManager = acl_common::AclInternalBufferManager<
+    ::arm_compute::MemoryManagerOnDemand, ::arm_compute::PoolManager,
+    ::arm_compute::BlobLifetimeManager, ::arm_compute::CLBufferAllocator>;
+
+using TensorManager =
+    acl_common::AclTensorManager<operand::ICLTensor, operand::CLTensor, operand::CLSubTensor>;
+
+inline TensorManager *createTensorManager()
+{
+  const std::string executor_str = util::getConfigString(util::config::EXECUTOR);
+
+  if (executor_str == "Linear")
+  {
+    VERBOSE(acl_cl_createTensorManager) << "AclTensorManager as Linear" << std::endl;
+    return new TensorManager(new MemoryManager(), new LinearMemoryManager(),
+                             new InternalBufferManager());
+  }
+  else
+  {
+    VERBOSE(acl_cl_createTensorManager) << "AclTensorManager" << std::endl;
+    return new TensorManager(new MemoryManager(), new MemoryManager(), new InternalBufferManager());
+  }
+}
+
+} // namespace acl_cl
+} // namespace backend
+} // namespace neurun
+
+#endif // __NEURUN_BACKEND_ACL_CL_TENSOR_MANAGER_H__
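createTensorManager() picks between the lifetime-planned LinearMemoryManager and the plain on-demand pair based on the runtime's EXECUTOR configuration, and hands the caller an owning raw pointer; being defined in a header, it needs the inline above to avoid duplicate definitions across translation units. A short hedged usage sketch follows; the example() wrapper is illustrative, and only the types and function declared in the header above are real.

// Hedged usage sketch, not part of the patch: take ownership of the raw
// pointer immediately so neither TensorManager variant can leak.
#include <memory>

void example() // hypothetical caller
{
  using neurun::backend::acl_cl::TensorManager;
  std::unique_ptr<TensorManager> tensor_manager{neurun::backend::acl_cl::createTensorManager()};
  // ... hand tensor_manager off to the backend's TensorBuilder, which drives
  // CL tensor allocation through it from here on.
}

diff --git a/runtime/neurun/backend/acl_cl/TensorRegister.h b/runtime/neurun/backend/acl_cl/TensorRegister.h
new file mode 100644
index 000000000..02de45580
--- /dev/null
+++ b/runtime/neurun/backend/acl_cl/TensorRegister.h
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.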
+ */ + +#ifndef __NEURUN_BACKEND_ACL_CL_TENSOR_REGISTER_H__ +#define __NEURUN_BACKEND_ACL_CL_TENSOR_REGISTER_H__ + +#include <AclTensorRegister.h> +#include <misc/polymorphic_downcast.h> +#include "TensorBuilder.h" + +namespace neurun +{ +namespace backend +{ +namespace acl_cl +{ + +class TensorRegister : public acl_common::AclTensorRegister +{ +public: + TensorRegister(const ir::Operands &operands, const std::shared_ptr<TensorBuilder> &tensor_builder) + : acl_common::AclTensorRegister{operands, tensor_builder} + { + // DO NOTHING + } + + void setUsesCount(const ir::OperandIndex &ind, size_t num_uses) const override + { + nnfw::misc::polymorphic_downcast<TensorBuilder *>(tensor_builder().get()) + ->setUsesCount(ind, num_uses); + } +}; + +} // namespace acl_cl +} // namespace backend +} // namespace neurun + +#endif // __NEURUN_BACKEND_ACL_CL_TENSOR_REGISTER_H__ diff --git a/runtime/neurun/backend/acl_cl/operand/CLSubTensor.cc b/runtime/neurun/backend/acl_cl/operand/CLSubTensor.cc new file mode 100644 index 000000000..70c8829d9 --- /dev/null +++ b/runtime/neurun/backend/acl_cl/operand/CLSubTensor.cc @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "CLSubTensor.h" + +namespace neurun +{ +namespace backend +{ +namespace acl_cl +{ +namespace operand +{ + +CLSubTensor::CLSubTensor(ICLTensor *parent, const arm_compute::TensorShape &tensor_shape, + const arm_compute::Coordinates &coords, size_t rank, bool extend_parent) + : _cl_sub_tensor(std::make_shared<arm_compute::CLSubTensor>(parent->handle(), tensor_shape, + coords, extend_parent)), + _rank{rank} +{ + // DO NOTHING +} + +const arm_compute::CLSubTensor *CLSubTensor::handle() const { return _cl_sub_tensor.get(); } + +arm_compute::CLSubTensor *CLSubTensor::handle() { return _cl_sub_tensor.get(); } + +} // namespace operand +} // namespace acl_cl +} // namespace backend +} // namespace neurun diff --git a/runtime/neurun/backend/acl_cl/operand/CLSubTensor.h b/runtime/neurun/backend/acl_cl/operand/CLSubTensor.h new file mode 100644 index 000000000..8eba3760f --- /dev/null +++ b/runtime/neurun/backend/acl_cl/operand/CLSubTensor.h @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __NEURUN_BACKEND_ACL_CL_OPERAND_CL_SUB_TENSOR_H__ +#define __NEURUN_BACKEND_ACL_CL_OPERAND_CL_SUB_TENSOR_H__ + +#include <arm_compute/runtime/CL/CLSubTensor.h> +#include "ICLTensor.h" +#include "compiler/SubTensorInfo.h" + +namespace neurun +{ +namespace backend +{ +namespace acl_cl +{ +namespace operand +{ + +class CLSubTensor : public ICLTensor +{ +public: + CLSubTensor() = delete; + +public: + CLSubTensor(ICLTensor *parent, const arm_compute::TensorShape &tensor_shape, + const arm_compute::Coordinates &coords, size_t rank, bool extend_parent = false); + +public: + size_t num_dimensions() const final { return _rank; } + +public: + const arm_compute::CLSubTensor *handle() const override; + arm_compute::CLSubTensor *handle() override; + +public: + // This method is used to prevent the use of memcpy for SubTensor + bool has_padding() const override { return true; } + +private: + std::shared_ptr<arm_compute::CLSubTensor> _cl_sub_tensor; + size_t _rank; +}; + +} // namespace operand +} // namespace acl_cl +} // namespace backend +} // namespace neurun + +#endif // __NEURUN_BACKEND_ACL_CL_OPERAND_CL_SUB_TENSOR_H__ diff --git a/runtime/neurun/backend/acl_cl/operand/CLTensor.cc b/runtime/neurun/backend/acl_cl/operand/CLTensor.cc new file mode 100644 index 000000000..dab74e65f --- /dev/null +++ b/runtime/neurun/backend/acl_cl/operand/CLTensor.cc @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "CLTensor.h" + +#include <arm_compute/runtime/CL/CLScheduler.h> +#include <arm_compute/runtime/CL/CLMemory.h> +#include <arm_compute/runtime/CL/CLMemoryRegion.h> + +#include <Convert.h> + +namespace neurun +{ +namespace backend +{ +namespace acl_cl +{ +namespace operand +{ + +CLTensor::CLTensor(const arm_compute::TensorInfo &info, size_t rank, size_t num_uses) + : _cl_tensor(std::make_shared<arm_compute::CLTensor>()), _rank{rank}, _num_uses{num_uses} +{ + allocator()->init(info); +} + +const arm_compute::CLTensor *CLTensor::handle() const { return _cl_tensor.get(); } + +arm_compute::CLTensor *CLTensor::handle() { return _cl_tensor.get(); } + +arm_compute::CLTensorAllocator *CLTensor::allocator() { return _cl_tensor->allocator(); } + +void CLTensor::map(bool blocking) { _cl_tensor->map(blocking); } + +void CLTensor::unmap() { _cl_tensor->unmap(); } + +void CLTensor::setBuffer(void *host_ptr) +{ + // Constructs a Buffer on a user-supplied memory + auto buffer = cl::Buffer(arm_compute::CLScheduler::get().context(), + CL_MEM_USE_HOST_PTR | CL_MEM_READ_WRITE, info()->total_size(), host_ptr); + // import memory + allocator()->import_memory(buffer); +} + +} // namespace operand +} // namespace acl_cl +} // namespace backend +} // namespace neurun diff --git a/runtime/neurun/backend/acl_cl/operand/CLTensor.h b/runtime/neurun/backend/acl_cl/operand/CLTensor.h new file mode 100644 index 000000000..8518bf0c3 --- /dev/null +++ b/runtime/neurun/backend/acl_cl/operand/CLTensor.h @@ -0,0 +1,75 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __NEURUN_BACKEND_ACL_CL_OPERAND_CL_TENSOR_H__ +#define __NEURUN_BACKEND_ACL_CL_OPERAND_CL_TENSOR_H__ + +#include <arm_compute/core/TensorInfo.h> +#include <arm_compute/runtime/CL/CLTensor.h> +#include <arm_compute/runtime/CL/CLScheduler.h> +#include "arm_compute/runtime/CL/CLTensorAllocator.h" +#include "ICLTensor.h" + +namespace neurun +{ +namespace backend +{ +namespace acl_cl +{ +namespace operand +{ + +class CLTensor : public ICLTensor +{ +public: + CLTensor() = delete; + +public: + CLTensor(const arm_compute::TensorInfo &info, size_t rank, size_t num_uses); + +public: + size_t num_dimensions() const final { return _rank; } + +public: + const arm_compute::CLTensor *handle() const override; + arm_compute::CLTensor *handle() override; + size_t num_uses() const { return _num_uses; } + +public: + arm_compute::CLTensorAllocator *allocator(); + void map(bool blocking = true); + void unmap(); + /** Set given buffer as the buffer of the tensor + * + * @note Ownership of the memory is not transferred to this object. + * Thus management (allocate/free) should be done by the client. + * + * @param[in] host_ptr Storage to be used. 
+ */
+  void setBuffer(void *host_ptr);
+
+private:
+  std::shared_ptr<arm_compute::CLTensor> _cl_tensor;
+  size_t _rank;
+  size_t _num_uses;
+};
+
+} // namespace operand
+} // namespace acl_cl
+} // namespace backend
+} // namespace neurun
+
+#endif // __NEURUN_BACKEND_ACL_CL_OPERAND_CL_TENSOR_H__
diff --git a/runtime/neurun/backend/acl_cl/operand/ICLTensor.cc b/runtime/neurun/backend/acl_cl/operand/ICLTensor.cc
new file mode 100644
index 000000000..6b14584e0
--- /dev/null
+++ b/runtime/neurun/backend/acl_cl/operand/ICLTensor.cc
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "ICLTensor.h"
+
+#include <arm_compute/runtime/CL/CLScheduler.h>
+
+namespace neurun
+{
+namespace backend
+{
+namespace acl_cl
+{
+namespace operand
+{
+
+void ICLTensor::access(const std::function<void(ITensor &tensor)> &fn)
+{
+  auto &queue = ::arm_compute::CLScheduler::get().queue();
+
+  // An optional input may be empty (total_size() == 0); there is nothing to map
+  if (total_size() == 0)
+    return;
+
+  map(queue);
+  fn(*this);
+  unmap(queue);
+}
+} // namespace operand
+} // namespace acl_cl
+} // namespace backend
+} // namespace neurun
diff --git a/runtime/neurun/backend/acl_cl/operand/ICLTensor.h b/runtime/neurun/backend/acl_cl/operand/ICLTensor.h
new file mode 100644
index 000000000..68e4e7fc5
--- /dev/null
+++ b/runtime/neurun/backend/acl_cl/operand/ICLTensor.h
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NEURUN_BACKEND_ACL_CL_OPERAND_I_CL_TENSOR_H__
+#define __NEURUN_BACKEND_ACL_CL_OPERAND_I_CL_TENSOR_H__
+
+#include <arm_compute/core/CL/ICLTensor.h>
+
+#include <IACLTensor.h>
+
+namespace neurun
+{
+namespace backend
+{
+namespace acl_cl
+{
+namespace operand
+{
+
+class ICLTensor : public acl_common::IACLTensor
+{
+public:
+  const arm_compute::ICLTensor *handle() const override = 0;
+  arm_compute::ICLTensor *handle() override = 0;
+
+public:
+  void map(cl::CommandQueue &q, bool blocking = true) { return handle()->map(q, blocking); }
+  void unmap(cl::CommandQueue &q) { return handle()->unmap(q); }
+  void access(const std::function<void(ITensor &tensor)> &fn) final;
+};
+
+} // namespace operand
+} // namespace acl_cl
+} // namespace backend
+} // namespace neurun
+
+#endif // __NEURUN_BACKEND_ACL_CL_OPERAND_I_CL_TENSOR_H__
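access() is the one safe way for host code to touch CL-backed memory: it maps the tensor on the scheduler's queue, runs the callback while the buffer is host-visible, then unmaps, and it silently skips empty optional inputs. A hedged caller sketch follows; the fillWithZeros wrapper is illustrative, and the callback assumes the ITensor interface exposes buffer() and total_size(), as the access() implementation above suggests.

// Hedged sketch, not part of the patch: zero-filling an acl_cl tensor from
// the host through access(), which handles map()/unmap() on the CL queue.
#include <cstring>

void fillWithZeros(neurun::backend::acl_cl::operand::ICLTensor &tensor)
{
  tensor.access([](auto &t) {
    // Inside the callback the buffer is mapped, so plain host writes are safe.
    std::memset(t.buffer(), 0, t.total_size());
  });
}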