diff options
Diffstat (limited to 'runtimes/neurun/backend/acl_cl')
23 files changed, 3947 insertions, 0 deletions
diff --git a/runtimes/neurun/backend/acl_cl/Backend.h b/runtimes/neurun/backend/acl_cl/Backend.h new file mode 100644 index 000000000..7c69d7b40 --- /dev/null +++ b/runtimes/neurun/backend/acl_cl/Backend.h @@ -0,0 +1,64 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __NEURUN_BACKEND_ACL_CL_BACKEND_H__ +#define __NEURUN_BACKEND_ACL_CL_BACKEND_H__ + +#include <memory> +#include <backend/Backend.h> +#include <model/Operands.h> + +#include "Config.h" +#include "ConstantInitializer.h" +#include "KernelGenerator.h" +#include "ShapeFixer.h" +#include "TensorManager.h" +#include "backend/CustomKernelRegistry.h" + +namespace neurun +{ +namespace backend +{ +namespace acl_cl +{ + +class Backend : public ::neurun::backend::Backend +{ +public: + Backend() : _config{std::make_shared<Config>()} {} + + std::shared_ptr<IConfig> config() const override { return _config; } + + std::unique_ptr<BackendContext> + newContext(const model::Operands &operands, + const std::shared_ptr<custom::KernelRegistry> &) const override + { + auto tensor_builder = std::make_shared<TensorBuilder>(createTensorManager()); + return std::unique_ptr<BackendContext>{new BackendContext{ + this, tensor_builder, std::make_shared<ConstantInitializer>(operands, tensor_builder), + std::make_shared<KernelGenerator>(operands, tensor_builder), + std::make_shared<ShapeFixer>(operands, tensor_builder)}}; + } + 
+private: + std::shared_ptr<IConfig> _config; +}; + +} // namespace acl_cl +} // namespace backend +} // namespace neurun + +#endif // __NEURUN_BACKEND_ACL_CL_BACKEND_H__ diff --git a/runtimes/neurun/backend/acl_cl/CLTimer.h b/runtimes/neurun/backend/acl_cl/CLTimer.h new file mode 100644 index 000000000..3939ee722 --- /dev/null +++ b/runtimes/neurun/backend/acl_cl/CLTimer.h @@ -0,0 +1,108 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __NEURUN_BACKEND_ACL_CL_CLTIMER_H__ +#define __NEURUN_BACKEND_ACL_CL_CLTIMER_H__ + +#include <util/ITimer.h> +#include <arm_compute/core/CL/OpenCL.h> +#include <arm_compute/runtime/CL/CLScheduler.h> +#include <chrono> +#include <list> +#include <sstream> + +namespace neurun +{ +namespace backend +{ +namespace acl_cl +{ + +/** + * @brief Class to measure CL kernels execution time + */ +class CLTimer : public util::ITimer +{ +public: + /** + * @brief This function replaces CL function, which enqueues a command to execute a kernel + * with a wrapper which remembers enqueued kernels + */ + void handleBegin() override + { + _measured_events.clear(); + + _origin_enqueue_function = arm_compute::CLSymbols::get().clEnqueueNDRangeKernel_ptr; + + auto _timer_enqueue_function = [this](cl_command_queue command_queue, cl_kernel kernel, + cl_uint work_dim, const size_t *gwo, const size_t *gws, + const size_t *lws, cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, cl_event *usr_event) { + cl_event event; + cl_int enqueue_res = + this->_origin_enqueue_function(command_queue, kernel, work_dim, gwo, gws, lws, + num_events_in_wait_list, event_wait_list, &event); + this->_measured_events.emplace_back(event); + + // According to spec, if NULL was provided in usr_event - event shouldn't be returned + if (usr_event != nullptr) + { + clRetainEvent(event); + *usr_event = event; + } + return enqueue_res; + }; + arm_compute::CLSymbols::get().clEnqueueNDRangeKernel_ptr = _timer_enqueue_function; + + // Set CL_QUEUE_PROFILING_ENABLE flag for the CL command-queue, if it isn't already set + auto &cl_scheduler = arm_compute::CLScheduler::get(); + auto props = cl_scheduler.queue().getInfo<CL_QUEUE_PROPERTIES>(); + if ((props & CL_QUEUE_PROFILING_ENABLE) == 0) + { + cl_scheduler.set_queue( + cl::CommandQueue(cl_scheduler.context(), props | CL_QUEUE_PROFILING_ENABLE)); + } + }; + + /** + * @brief Get timer result by addition executed CL kernels durations + */ + void 
handleEnd() override + { + _timer_res = 0; + for (auto const &event : _measured_events) + { + cl_ulong start; + cl_ulong end; + event.getProfilingInfo(CL_PROFILING_COMMAND_START, &start); + event.getProfilingInfo(CL_PROFILING_COMMAND_END, &end); + _timer_res += (end - start) / 1000.f; // nanoseconds -> microseconds + } + + // Restore origin CL enqueue function + arm_compute::CLSymbols::get().clEnqueueNDRangeKernel_ptr = _origin_enqueue_function; + }; + +private: + std::function<decltype(clEnqueueNDRangeKernel)> _origin_enqueue_function; + std::list<::cl::Event> _measured_events; +}; + +} // namespace acl_cl +} // namespace backend +} // namespace neurun + +#endif // __NEURUN_BACKEND_ACL_CL_CLTIMER_H__ diff --git a/runtimes/neurun/backend/acl_cl/CMakeLists.txt b/runtimes/neurun/backend/acl_cl/CMakeLists.txt new file mode 100644 index 000000000..7d7b50cf0 --- /dev/null +++ b/runtimes/neurun/backend/acl_cl/CMakeLists.txt @@ -0,0 +1,21 @@ +# Unsupported architecture +nnfw_find_package(ARMCompute QUIET) +if(NOT ARMCompute_FOUND) + return() +endif(NOT ARMCompute_FOUND) + +set(LIB_NEURUN_BACKEND_ACL_CL neurun_backend_acl_cl) + +file(GLOB_RECURSE SOURCES "*.cc") + +add_library(${LIB_NEURUN_BACKEND_ACL_CL} SHARED ${SOURCES}) + +target_include_directories(${LIB_NEURUN_BACKEND_ACL_CL} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}) +target_link_libraries(${LIB_NEURUN_BACKEND_ACL_CL} PRIVATE neurun_core) +target_link_libraries(${LIB_NEURUN_BACKEND_ACL_CL} PRIVATE ${LIB_NEURUN_BACKEND_ACL_COMMON}) +target_link_libraries(${LIB_NEURUN_BACKEND_ACL_CL} PRIVATE nnfw_common) +target_link_libraries(${LIB_NEURUN_BACKEND_ACL_CL} PRIVATE nnfw_coverage) + +set_target_properties(${LIB_NEURUN_BACKEND_ACL_CL} PROPERTIES OUTPUT_NAME backend_acl_cl) + +install(TARGETS ${LIB_NEURUN_BACKEND_ACL_CL} DESTINATION lib) diff --git a/runtimes/neurun/backend/acl_cl/Config.cc b/runtimes/neurun/backend/acl_cl/Config.cc new file mode 100644 index 000000000..0c0769184 --- /dev/null +++ 
b/runtimes/neurun/backend/acl_cl/Config.cc @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// For CLKernelLibraryEx initialization +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/CLKernelLibraryEx.h" + +#include <arm_compute/runtime/CL/CLScheduler.h> + +#include "Config.h" + +namespace neurun +{ +namespace backend +{ +namespace acl_cl +{ + +void Config::initialize() +{ + arm_compute::CLScheduler::get().default_init(); + // NOTE CLKernelLibraryEx must use the same context as CLScheduler + // It did not check whether another device is available. + arm_compute::CLKernelLibraryEx::get().init( + "./cl_kernels/", arm_compute::CLScheduler::get().context(), cl::Device::getDefault()); +} + +} // namespace acl_cl +} // namespace backend +} // namespace neurun diff --git a/runtimes/neurun/backend/acl_cl/Config.h b/runtimes/neurun/backend/acl_cl/Config.h new file mode 100644 index 000000000..185765161 --- /dev/null +++ b/runtimes/neurun/backend/acl_cl/Config.h @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __NEURUN_BACKEND_ACL_CL_CONFIG_H__ +#define __NEURUN_BACKEND_ACL_CL_CONFIG_H__ + +#include "CLTimer.h" +#include <cpp14/memory.h> +#include <backend/IConfig.h> + +namespace neurun +{ +namespace backend +{ +namespace acl_cl +{ + +class Config : public IConfig +{ +public: + std::string id() override { return "acl_cl"; } + void initialize() override; + bool SupportSubTensorAlloc() override { return true; } + std::unique_ptr<util::ITimer> timer() override { return nnfw::cpp14::make_unique<CLTimer>(); } +}; + +} // namespace acl_cl +} // namespace backend +} // namespace neurun + +#endif // __NEURUN_BACKEND_ACL_CL_CONFIG_H__ diff --git a/runtimes/neurun/backend/acl_cl/ConstantInitializer.cc b/runtimes/neurun/backend/acl_cl/ConstantInitializer.cc new file mode 100644 index 000000000..0a8f536ec --- /dev/null +++ b/runtimes/neurun/backend/acl_cl/ConstantInitializer.cc @@ -0,0 +1,214 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "ConstantInitializer.h" + +namespace neurun +{ +namespace backend +{ +namespace acl_cl +{ + +ConstantInitializer::ConstantInitializer(const model::Operands &operands, + const std::shared_ptr<TensorBuilder> &tensor_builder) + : _operands{operands}, _tensor_builder{tensor_builder} +{ + // DO NOTHING +} + +void ConstantInitializer::run() +{ + for (const auto &it : _init_map) + { + const auto &ind = it.first; + const auto &fn = it.second; + + const auto &model_obj = _operands.at(ind); + auto tensor_obj = _tensor_builder->wrapTensor(ind); + fn(model_obj, *tensor_obj); + } + + _init_map.clear(); +} + +void ConstantInitializer::visit(const model::operation::Conv2DNode &node) +{ + const auto &kernel_index = node.getInputs().at(model::operation::Conv2DNode::KERNEL); + const auto &kernel_obj = _operands.at(kernel_index); + registerPermuteInitializer(kernel_index, kernel_obj); + + const auto &bias_index = node.getInputs().at(model::operation::Conv2DNode::BIAS); + const auto &bias_obj = _operands.at(bias_index); + registerCopyInitializer(bias_index, bias_obj); +} + +void ConstantInitializer::visit(const model::operation::DepthwiseConv2DNode &node) +{ + const auto &kernel_index = node.getInputs().at(model::operation::DepthwiseConv2DNode::KERNEL); + const auto &kernel_obj = _operands.at(kernel_index); + registerPermuteInitializer(kernel_index, kernel_obj); + + const auto &bias_index = node.getInputs().at(model::operation::DepthwiseConv2DNode::BIAS); + const auto &bias_obj = _operands.at(bias_index); + registerCopyInitializer(bias_index, bias_obj); +} + +void ConstantInitializer::visit(const model::operation::EmbeddingLookupNode &node) +{ + const auto &lookups_index = node.getInputs().at(model::operation::EmbeddingLookupNode::LOOKUPS); + const auto &lookups_obj = _operands.at(lookups_index); + registerCopyInitializer(lookups_index, lookups_obj); +} + +void ConstantInitializer::visit(const model::operation::FullyConnectedNode &node) +{ + const auto &weight_index = 
node.getInputs().at(model::operation::FullyConnectedNode::WEIGHT); + const auto &weight_obj = _operands.at(weight_index); + registerCopyInitializer(weight_index, weight_obj); + + const auto &bias_index = node.getInputs().at(model::operation::FullyConnectedNode::BIAS); + const auto &bias_obj = _operands.at(bias_index); + registerCopyInitializer(bias_index, bias_obj); +} + +void ConstantInitializer::visit(const model::operation::GatherNode &node) +{ + const auto &indices_index = node.getInputs().at(model::operation::GatherNode::INDICES); + const auto &indices_obj = _operands.at(indices_index); + registerCopyInitializer(indices_index, indices_obj); +} + +void ConstantInitializer::visit(const model::operation::HashtableLookupNode &node) +{ + const auto &lookups_index = node.getInputs().at(model::operation::HashtableLookupNode::LOOKUPS); + const auto &lookups_obj = _operands.at(lookups_index); + registerCopyInitializer(lookups_index, lookups_obj); + + const auto &keys_index = node.getInputs().at(model::operation::HashtableLookupNode::KEYS); + const auto &keys_obj = _operands.at(keys_index); + registerCopyInitializer(keys_index, keys_obj); +} + +void ConstantInitializer::visit(const model::operation::LSTMNode &node) +{ + const auto &input_to_input_weights_index = + node.getInputs().at(model::operation::LSTMNode::INPUT_TO_INPUT_WEIGHTS); + const auto &input_to_input_weights_obj = _operands.at(input_to_input_weights_index); + registerCopyInitializer(input_to_input_weights_index, input_to_input_weights_obj); + + const auto &input_to_forget_weights_index = + node.getInputs().at(model::operation::LSTMNode::INPUT_TO_FORGET_WEIGHTS); + const auto &input_to_forget_weights_obj = _operands.at(input_to_forget_weights_index); + registerCopyInitializer(input_to_forget_weights_index, input_to_forget_weights_obj); + + const auto &input_to_cell_weights_index = + node.getInputs().at(model::operation::LSTMNode::INPUT_TO_CELL_WEIGHTS); + const auto &input_to_cell_weights_obj = 
_operands.at(input_to_cell_weights_index); + registerCopyInitializer(input_to_cell_weights_index, input_to_cell_weights_obj); + + const auto &input_to_output_weights_index = + node.getInputs().at(model::operation::LSTMNode::INPUT_TO_OUTPUT_WEIGHTS); + const auto &input_to_output_weights_obj = _operands.at(input_to_output_weights_index); + registerCopyInitializer(input_to_output_weights_index, input_to_output_weights_obj); + + const auto &recurrent_to_input_weights_index = + node.getInputs().at(model::operation::LSTMNode::RECURRENT_TO_INPUT_WEIGHTS); + const auto &recurrent_to_input_weights_obj = _operands.at(recurrent_to_input_weights_index); + registerCopyInitializer(recurrent_to_input_weights_index, recurrent_to_input_weights_obj); + + const auto &recurrent_to_forget_weights_index = + node.getInputs().at(model::operation::LSTMNode::RECURRENT_TO_FORGET_WEIGHTS); + const auto &recurrent_to_forget_weights_obj = _operands.at(recurrent_to_forget_weights_index); + registerCopyInitializer(recurrent_to_forget_weights_index, recurrent_to_forget_weights_obj); + + const auto &recurrent_to_cell_weights_index = + node.getInputs().at(model::operation::LSTMNode::RECURRENT_TO_CELL_WEIGHTS); + const auto &recurrent_to_cell_weights_obj = _operands.at(recurrent_to_cell_weights_index); + registerCopyInitializer(recurrent_to_cell_weights_index, recurrent_to_cell_weights_obj); + + const auto &recurrent_to_output_weights_index = + node.getInputs().at(model::operation::LSTMNode::RECURRENT_TO_OUTPUT_WEIGHTS); + const auto &recurrent_to_output_weights_obj = _operands.at(recurrent_to_output_weights_index); + registerCopyInitializer(recurrent_to_output_weights_index, recurrent_to_output_weights_obj); + + const auto &cell_to_input_weights_index = + node.getInputs().at(model::operation::LSTMNode::CELL_TO_INPUT_WEIGHTS); + const auto &cell_to_input_weights_obj = _operands.at(cell_to_input_weights_index); + registerCopyInitializer(cell_to_input_weights_index, cell_to_input_weights_obj); + + 
const auto &cell_to_forget_weights_index = + node.getInputs().at(model::operation::LSTMNode::CELL_TO_FORGET_WEIGHTS); + const auto &cell_to_forget_weights_obj = _operands.at(cell_to_forget_weights_index); + registerCopyInitializer(cell_to_forget_weights_index, cell_to_forget_weights_obj); + + const auto &cell_to_output_weights_index = + node.getInputs().at(model::operation::LSTMNode::CELL_TO_OUTPUT_WEIGHTS); + const auto &cell_to_output_weights_obj = _operands.at(cell_to_output_weights_index); + registerCopyInitializer(cell_to_output_weights_index, cell_to_output_weights_obj); + + const auto &input_gate_bias_index = + node.getInputs().at(model::operation::LSTMNode::INPUT_GATE_BIAS); + const auto &input_gate_bias_obj = _operands.at(input_gate_bias_index); + registerCopyInitializer(input_gate_bias_index, input_gate_bias_obj); + + const auto &forget_gate_bias_index = + node.getInputs().at(model::operation::LSTMNode::FORGET_GATE_BIAS); + const auto &forget_gate_bias_obj = _operands.at(forget_gate_bias_index); + registerCopyInitializer(forget_gate_bias_index, forget_gate_bias_obj); + + const auto &output_gate_bias_index = + node.getInputs().at(model::operation::LSTMNode::OUTPUT_GATE_BIAS); + const auto &output_gate_bias_obj = _operands.at(output_gate_bias_index); + registerCopyInitializer(output_gate_bias_index, output_gate_bias_obj); + + const auto &projection_weights_index = + node.getInputs().at(model::operation::LSTMNode::PROJECTION_WEIGHTS); + const auto &projection_weights_obj = _operands.at(projection_weights_index); + registerCopyInitializer(projection_weights_index, projection_weights_obj); + + const auto &projection_bias_index = + node.getInputs().at(model::operation::LSTMNode::PROJECTION_BIAS); + const auto &projection_bias_obj = _operands.at(projection_bias_index); + registerCopyInitializer(projection_bias_index, projection_bias_obj); +} + +void ConstantInitializer::visit(const model::operation::RNNNode &node) +{ + const auto &weights_index = 
node.getInputs().at(model::operation::RNNNode::WEIGHTS); + const auto &weights_obj = _operands.at(weights_index); + registerCopyInitializer(weights_index, weights_obj); + + const auto &recurrent_weights_index = + node.getInputs().at(model::operation::RNNNode::RECURRENT_WEIGHTS); + const auto &recurrent_weights_obj = _operands.at(recurrent_weights_index); + registerCopyInitializer(recurrent_weights_index, recurrent_weights_obj); + + const auto &bias_index = node.getInputs().at(model::operation::RNNNode::BIAS); + const auto &bias_obj = _operands.at(bias_index); + registerCopyInitializer(bias_index, bias_obj); +} + +void ConstantInitializer::visit(const model::operation::TransposeConvNode &node) +{ + const auto &kernel_index = node.getInputs().at(model::operation::TransposeConvNode::KERNEL); + const auto &kernel_obj = _operands.at(kernel_index); + registerPermuteInitializer(kernel_index, kernel_obj); +} + +} // namespace acl_cl +} // namespace backend +} // namespace neurun diff --git a/runtimes/neurun/backend/acl_cl/ConstantInitializer.h b/runtimes/neurun/backend/acl_cl/ConstantInitializer.h new file mode 100644 index 000000000..59772e0f7 --- /dev/null +++ b/runtimes/neurun/backend/acl_cl/ConstantInitializer.h @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __NEURUN_COMPILER_ACL_CL_CONSTANT_INITIALIZER_H__ +#define __NEURUN_COMPILER_ACL_CL_CONSTANT_INITIALIZER_H__ + +#include <backend/IConstantInitializer.h> +#include <model/Operands.h> +#include "TensorBuilder.h" + +namespace neurun +{ +namespace backend +{ +namespace acl_cl +{ + +class ConstantInitializer : public IConstantInitializer +{ +public: + ConstantInitializer(const model::Operands &operands, + const std::shared_ptr<TensorBuilder> &tensor_builder); + +public: + void run() override; + +public: + void visit(const model::operation::Conv2DNode &) override; + void visit(const model::operation::DepthwiseConv2DNode &) override; + void visit(const model::operation::EmbeddingLookupNode &) override; + void visit(const model::operation::FullyConnectedNode &) override; + void visit(const model::operation::GatherNode &) override; + void visit(const model::operation::HashtableLookupNode &) override; + void visit(const model::operation::LSTMNode &) override; + void visit(const model::operation::RNNNode &) override; + void visit(const model::operation::TransposeConvNode &) override; + +private: + const model::Operands &_operands; + std::shared_ptr<TensorBuilder> _tensor_builder; +}; + +} // namespace acl_cl +} // namespace backend +} // namespace neurun + +#endif // __NEURUN_COMPILER_ACL_CL_CONSTANT_INITIALIZER_H__ diff --git a/runtimes/neurun/backend/acl_cl/KernelGenerator.cc b/runtimes/neurun/backend/acl_cl/KernelGenerator.cc new file mode 100644 index 000000000..8b019a45a --- /dev/null +++ b/runtimes/neurun/backend/acl_cl/KernelGenerator.cc @@ -0,0 +1,2034 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "KernelGenerator.h" + +#include <arm_compute/runtime/CL/CLFunctions.h> // Include all ARM Compute CL functions +#include <arm_compute/runtime/CL/CLFunctionsEx.h> // Include all ARM Compute EX CL functions +#include <arm_compute/runtime/misc/functions/GenericGather.h> +#include <arm_compute/runtime/misc/functions/GenericReshapeLayer.h> + +#include <AclFunction.h> +#include <Convert.h> +#include <Swizzle.h> + +#include "kernel/ConcatLayer.h" +#include "model/Index.h" +#include "model/DataType.h" +#include "model/InternalType.h" +#include "compiler/IExecutionBuilder.h" +#include "exec/NopFunction.h" +#include "util/logging.h" +#include "util/Utils.h" +#include "util/Padding.h" + +using ::neurun::compiler::IExecutionBuilder; + +namespace neurun +{ +namespace backend +{ +namespace acl_cl +{ + +using ::neurun::backend::acl_common::asAclFunction; + +// +// ActivationBuilder +// +class ActivationBuilder +{ +public: + explicit ActivationBuilder(IExecutionBuilder &builder) : _builder(builder) + { + // DO NOTHING + } + +private: + void appendReLU(::arm_compute::ICLTensor *ifm_alloc); + void appendReLU1(::arm_compute::ICLTensor *ifm_alloc); + void appendReLU6(::arm_compute::ICLTensor *ifm_alloc); + +public: + void append(model::Activation code, ::arm_compute::ICLTensor *ifm_alloc); + +private: + IExecutionBuilder &_builder; +}; + +void ActivationBuilder::appendReLU(::arm_compute::ICLTensor *ifm_alloc) +{ + const ::arm_compute::ActivationLayerInfo act_info{ + ::arm_compute::ActivationLayerInfo::ActivationFunction::RELU}; + + auto fn = 
nnfw::cpp14::make_unique<::arm_compute::CLActivationLayer>(); + + fn->configure(ifm_alloc, nullptr, act_info); + + auto acl_fn = asAclFunction(std::move(fn)); + + _builder.append(std::move(acl_fn)); +} + +void ActivationBuilder::appendReLU1(::arm_compute::ICLTensor *ifm_alloc) +{ + const ::arm_compute::ActivationLayerInfo act_info{ + ::arm_compute::ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 1.0f, -1.0f}; + + auto fn = nnfw::cpp14::make_unique<::arm_compute::CLActivationLayer>(); + + fn->configure(ifm_alloc, nullptr, act_info); + + auto acl_fn = asAclFunction(std::move(fn)); + + _builder.append(std::move(acl_fn)); +} + +void ActivationBuilder::appendReLU6(::arm_compute::ICLTensor *ifm_alloc) +{ + const ::arm_compute::ActivationLayerInfo act_info{ + ::arm_compute::ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 6.0f, 0.0f}; + + auto fn = nnfw::cpp14::make_unique<::arm_compute::CLActivationLayer>(); + + fn->configure(ifm_alloc, nullptr, act_info); + + auto acl_fn = asAclFunction(std::move(fn)); + + _builder.append(std::move(acl_fn)); +} + +void ActivationBuilder::append(model::Activation code, ::arm_compute::ICLTensor *ifm_alloc) +{ + switch (code) + { + case model::Activation::NONE: + { + // DO NOTHING + break; + } + case model::Activation::RELU: + { + appendReLU(ifm_alloc); + break; + } + case model::Activation::RELU1: + { + appendReLU1(ifm_alloc); + break; + } + case model::Activation::RELU6: + { + appendReLU6(ifm_alloc); + break; + } + default: + { + throw std::runtime_error("Not supported, yet"); + } + } +} + +// +// KernelGenerator +// +KernelGenerator::KernelGenerator(const neurun::model::Operands &ctx, + const std::shared_ptr<TensorBuilder> &tensor_builder) + : _ctx(ctx), _tensor_builder(tensor_builder), _current_subg_layout(model::Layout::UNKNOWN) +{ + // DO NOTHING +} + +void KernelGenerator::visit(const model::Subgraph &subgraph) +{ + _current_subg_layout = subgraph.getLayout(); + for (const auto &e : subgraph.operations()) + { + 
const auto &node = *(e.node); + _tensor_builder->preVisit(node); + node.accept(*this); + _tensor_builder->postVisit(node); + } +} + +void KernelGenerator::visit(const model::operation::CastNode &node) +{ + const auto ofm_index{node.getOutputs().at(0)}; + const auto ifm_index{node.getInputs().at(model::operation::CastNode::Input::INPUT)}; + + auto ofm_alloc = _tensor_builder->at(ofm_index).get(); + auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + + auto fn = nnfw::cpp14::make_unique<::arm_compute::CLCast>(); + + fn->configure(ifm_alloc->handle(), ofm_alloc->handle()); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const model::operation::Conv2DNode &node) +{ + using model::operation::Conv2DNode; + + const auto ofm_index{node.getOutputs().at(0)}; + const auto ifm_index{node.getInputs().at(Conv2DNode::Input::INPUT)}; + const auto ker_index{node.getInputs().at(Conv2DNode::Input::KERNEL)}; + const auto bias_index{node.getInputs().at(Conv2DNode::Input::BIAS)}; + + const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature(_current_subg_layout); + const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature(_current_subg_layout); + // Kernel format is [depth_out, kernel_height, kernel_width, depth_in]. 
+ const auto &ker_shape = _ctx.at(ker_index).shape(); + const auto ker_height = ker_shape.dim(1); + const auto ker_width = ker_shape.dim(2); + + const auto stride = node.param().stride; + const auto padding = neurun::util::calculatePadding(node.param().padding, ifm_shape, ofm_shape, + stride, ker_width, ker_height); + const auto activation = node.param().activation; + + auto ofm_alloc = _tensor_builder->at(ofm_index).get(); + auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + auto ker_alloc = _tensor_builder->at(ker_index).get(); + auto bias_alloc = _tensor_builder->at(bias_index).get(); + + const auto conv_info = acl_common::asPadStrideInfo(padding, stride); + const auto act_info = acl_common::asActivationLayerInfo(activation); + + auto fn = nnfw::cpp14::make_unique<::arm_compute::CLConvolutionLayer>( + _tensor_builder->acl_tensor_manager()->internal_buffer_manager()); + + fn->configure(ifm_alloc->handle(), ker_alloc->handle(), bias_alloc->handle(), ofm_alloc->handle(), + conv_info, ::arm_compute::WeightsInfo(), ::arm_compute::Size2D(1U, 1U), act_info); + + _execution_builder->append(asAclFunction(std::move(fn))); +} + +void KernelGenerator::visit(const model::operation::DepthwiseConv2DNode &node) +{ + using model::operation::DepthwiseConv2DNode; + + const auto ofm_index{node.getOutputs().at(0)}; + const auto ifm_index{node.getInputs().at(DepthwiseConv2DNode::Input::INPUT)}; + const auto ker_index{node.getInputs().at(DepthwiseConv2DNode::Input::KERNEL)}; + const auto bias_index{node.getInputs().at(DepthwiseConv2DNode::Input::BIAS)}; + + const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature(_current_subg_layout); + const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature(_current_subg_layout); + // Kernel format is [1, kernel_height, kernel_width, depth_out]. 
+ const auto &ker_shape = _ctx.at(ker_index).shape(); + const auto ker_height = ker_shape.dim(1); + const auto ker_width = ker_shape.dim(2); + + const auto stride = node.param().stride; + const auto padding = neurun::util::calculatePadding(node.param().padding, ifm_shape, ofm_shape, + stride, ker_width, ker_height); + const auto multiplier = node.param().multiplier; + const auto activation = node.param().activation; + + auto ofm_alloc = _tensor_builder->at(ofm_index).get(); + auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + auto ker_alloc = _tensor_builder->at(ker_index).get(); + auto bias_alloc = _tensor_builder->at(bias_index).get(); + + const auto conv_info = acl_common::asPadStrideInfo(padding, stride); + // TODO Use `activation` instead of `model::Activation::NONE`. See below. + const auto act_info = acl_common::asActivationLayerInfo(model::Activation::NONE); + + if (ker_height == 3 && ker_width == 3) + { + auto fn = nnfw::cpp14::make_unique<::arm_compute::CLDepthwiseConvolutionLayer3x3>( + _tensor_builder->acl_tensor_manager()->internal_buffer_manager()); + + fn->configure(ifm_alloc->handle(), ker_alloc->handle(), bias_alloc->handle(), + ofm_alloc->handle(), conv_info, multiplier, act_info); + + _execution_builder->append(asAclFunction(std::move(fn))); + } + else + { + auto fn = nnfw::cpp14::make_unique<::arm_compute::CLDepthwiseConvolutionLayer>(); + + fn->configure(ifm_alloc->handle(), ker_alloc->handle(), bias_alloc->handle(), + ofm_alloc->handle(), conv_info, multiplier, act_info); + + _execution_builder->append(asAclFunction(std::move(fn))); + } + + // TODO Use fused activation instead of separate layer after switching to ACL version >= v19.05. + // Prior versions had a bug due to which the fused activation did not apply in some cases. 
  // The same ACL-version TODO as above applies here: the activation is appended
  // as a separate function instead of being fused into the convolution.
  ActivationBuilder{*_execution_builder}.append(activation, ofm_alloc->handle());
}

// Lowers a max-pooling node to an ACL CLPoolingLayer (PoolingType::MAX),
// followed by a separately appended activation function.
void KernelGenerator::visit(const model::operation::MaxPool2DNode &node)
{
  const auto ofm_index{node.getOutputs().at(0)};
  const auto ifm_index{node.getInputs().at(model::operation::MaxPool2DNode::Input::INPUT)};

  const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature(_current_subg_layout);
  const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature(_current_subg_layout);

  const auto kh = node.param().kh;
  const auto kw = node.param().kw;
  const auto stride = node.param().stride;
  // Explicit padding is derived from the frontend padding scheme plus the
  // input/output feature shapes.
  const auto padding =
      neurun::util::calculatePadding(node.param().padding, ifm_shape, ofm_shape, stride, kw, kh);
  const auto activation = node.param().activation;

  VERBOSE(MaxPool2D) << "IFM_H: " << ifm_shape.H << std::endl;
  VERBOSE(MaxPool2D) << "IFM_W: " << ifm_shape.W << std::endl;
  VERBOSE(MaxPool2D) << "OFM_H: " << ofm_shape.H << std::endl;
  VERBOSE(MaxPool2D) << "OFM_W: " << ofm_shape.W << std::endl;
  VERBOSE(MaxPool2D) << "KER_H: " << kh << std::endl;
  VERBOSE(MaxPool2D) << "KER_W: " << kw << std::endl;
  VERBOSE(MaxPool2D) << "STRIDE_H: " << stride.vertical << std::endl;
  VERBOSE(MaxPool2D) << "STRIDE_W: " << stride.horizontal << std::endl;
  VERBOSE(MaxPool2D) << "PAD(T): " << padding.top << std::endl;
  VERBOSE(MaxPool2D) << "PAD(B): " << padding.bottom << std::endl;
  VERBOSE(MaxPool2D) << "PAD(L): " << padding.left << std::endl;
  VERBOSE(MaxPool2D) << "PAD(R): " << padding.right << std::endl;

  auto ofm_alloc = _tensor_builder->at(ofm_index).get();
  auto ifm_alloc = _tensor_builder->at(ifm_index).get();

  ::arm_compute::PoolingLayerInfo info{::arm_compute::PoolingType::MAX,
                                       ::arm_compute::Size2D{kw, kh},
                                       acl_common::asPadStrideInfo(padding, stride)};

  auto fn = nnfw::cpp14::make_unique<::arm_compute::CLPoolingLayer>();

  fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), info);

  auto acl_fn = asAclFunction(std::move(fn));

  _execution_builder->append((std::move(acl_fn)));

  // Pooling has no fused activation here; append it as its own function.
  ActivationBuilder{*_execution_builder}.append(activation, ofm_alloc->handle());
}

// Lowers an average-pooling node to an ACL CLPoolingLayer (PoolingType::AVG).
void KernelGenerator::visit(const model::operation::AvgPool2DNode &node)
{
  const auto ofm_index{node.getOutputs().at(0)};
  const auto ifm_index{node.getInputs().at(model::operation::AvgPool2DNode::Input::INPUT)};

  const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature(_current_subg_layout);
  const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature(_current_subg_layout);

  const auto kh = node.param().kh;
  const auto kw = node.param().kw;
  const auto stride = node.param().stride;
  const auto padding =
      neurun::util::calculatePadding(node.param().padding, ifm_shape, ofm_shape, stride, kw, kh);
  const auto activation = node.param().activation;

  VERBOSE(AvgPool2D) << "IFM_H: " << ifm_shape.H << std::endl;
  VERBOSE(AvgPool2D) << "IFM_W: " << ifm_shape.W << std::endl;
  VERBOSE(AvgPool2D) << "OFM_H: " << ofm_shape.H << std::endl;
  VERBOSE(AvgPool2D) << "OFM_W: " << ofm_shape.W << std::endl;
  VERBOSE(AvgPool2D) << "KER_H: " << kh << std::endl;
  VERBOSE(AvgPool2D) << "KER_W: " << kw << std::endl;
  VERBOSE(AvgPool2D) << "STRIDE_H: " << stride.vertical << std::endl;
  VERBOSE(AvgPool2D) << "STRIDE_W: " << stride.horizontal << std::endl;
  VERBOSE(AvgPool2D) << "PAD(T): " << padding.top << std::endl;
  VERBOSE(AvgPool2D) << "PAD(B): " << padding.bottom << std::endl;
  VERBOSE(AvgPool2D) << "PAD(L): " << padding.left << std::endl;
  VERBOSE(AvgPool2D) << "PAD(R): " << padding.right << std::endl;

  auto ofm_alloc = _tensor_builder->at(ofm_index).get();
  auto ifm_alloc = _tensor_builder->at(ifm_index).get();

  // exclude_padding = true: padded cells do not contribute to the average.
  ::arm_compute::PoolingLayerInfo info{
      ::arm_compute::PoolingType::AVG, ::arm_compute::Size2D{kw, kh},
      acl_common::asPadStrideInfo(padding, stride), true /* exclude_padding */};

  auto fn = nnfw::cpp14::make_unique<::arm_compute::CLPoolingLayer>();

  fn->configure(ifm_alloc->handle(),
                ofm_alloc->handle(), info);

  auto acl_fn = asAclFunction(std::move(fn));

  _execution_builder->append((std::move(acl_fn)));

  // Pooling has no fused activation; append it as a separate function.
  ActivationBuilder{*_execution_builder}.append(activation, ofm_alloc->handle());
}

// Lowers a concatenation node. If every input has been allocated as a
// sub-tensor of the output, the copy is unnecessary and a NOP is emitted.
void KernelGenerator::visit(const model::operation::ConcatNode &node)
{
  const auto ofm_index{node.getOutputs().at(0)};

  std::vector<model::OperandIndex> input_indexes;

  for (const auto &input : node.getInputs())
    input_indexes.emplace_back(input);

  const auto axis = node.param().axis;

  // If tensor allocator allocate as subtensor
  bool canEliminate = true;
  for (auto &ifm_ind : input_indexes)
  {
    if (!_tensor_builder->isSubTensorOf(ofm_index, ifm_ind))
    {
      canEliminate = false;
      break;
    }
  }
  if (canEliminate)
  {
    // If concat eliminated, return a NOP IFunction
    _execution_builder->append(nnfw::cpp14::make_unique<exec::NopFunction>());
    return;
  }

  auto output_alloc = static_cast<::neurun::backend::acl_cl::operand::Object *>(
      _tensor_builder->wrapTensor(ofm_index).get());

  std::vector<::neurun::backend::acl_cl::operand::Object *> input_allocs;
  for (auto &ifm_ind : input_indexes)
    input_allocs.emplace_back(static_cast<::neurun::backend::acl_cl::operand::Object *>(
        _tensor_builder->wrapTensor(ifm_ind).get()));

  auto fn = nnfw::cpp14::make_unique<::neurun::backend::acl_cl::kernel::ConcatLayer>();

  // The frontend concat axis is translated into the backend tensor layout.
  const auto rank = _ctx.at(ofm_index).shape().rank();
  const auto frontend_layout = _current_subg_layout;
  const auto backend_layout = output_alloc->ptr()->layout();
  const auto fixed_axis =
      acl_common::ToARMComputeAxis(rank, axis, frontend_layout, backend_layout).value();

  fn->configure(input_allocs, fixed_axis, output_alloc);

  auto acl_fn = asAclFunction(std::move(fn));

  _execution_builder->append(std::move(acl_fn));
}

void KernelGenerator::visit(const model::operation::FullyConnectedNode &node)
{
  using model::operation::FullyConnectedNode;

  const auto
output_index{node.getOutputs().at(0)}; + const auto input_index{node.getInputs().at(FullyConnectedNode::Input::INPUT)}; + const auto weight_index{node.getInputs().at(FullyConnectedNode::Input::WEIGHT)}; + const auto bias_index{node.getInputs().at(FullyConnectedNode::Input::BIAS)}; + + const auto input_rank = _ctx.at(input_index).shape().rank(); + // TODO Currently we are not handling where the case is that the input's rank is 3. + // The handling should be added in the future. + assert(input_rank != 3); + + const auto output_size = _ctx.at(output_index).shape().dim(1); + UNUSED_RELEASE(output_size); + assert(_ctx.at(bias_index).shape().dim(0) == output_size); + assert(_ctx.at(weight_index).shape().dim(0) == output_size); + const auto batch_size = _ctx.at(output_index).shape().dim(0); + const auto input_size = _ctx.at(weight_index).shape().dim(1); + + // Check for reshaping input's shape into rank-2 + bool needs_reshape = false; + neurun::model::Shape reshape(2); + if (input_rank == 4) + { + const auto feature_size = _ctx.at(input_index).shape().num_elements(); + + UNUSED_RELEASE(feature_size); + assert((batch_size * input_size) >= 0); + assert(feature_size == static_cast<uint64_t>(batch_size * input_size)); + + // for reshaping + needs_reshape = true; + reshape.dim(0) = batch_size; /* H */ + reshape.dim(1) = input_size; /* W */ + } + + const auto activation = node.param().activation; + + auto output_alloc = _tensor_builder->at(output_index).get(); + auto input_alloc = _tensor_builder->at(input_index).get(); + auto weight_alloc = _tensor_builder->at(weight_index).get(); + auto bias_alloc = _tensor_builder->at(bias_index).get(); + auto acl_layout = output_alloc->handle()->info()->data_layout(); + + auto fn = nnfw::cpp14::make_unique<arm_compute::CLFullyConnectedReshapingLayer>( + _tensor_builder->acl_tensor_manager()->internal_buffer_manager()); + + fn->configure( + input_alloc->handle(), weight_alloc->handle(), bias_alloc->handle(), output_alloc->handle(), + 
needs_reshape, + ::neurun::backend::acl_common::asTensorShape(/* TODO Support NCHW frontend */ + reshape, model::Layout::NHWC, + ::neurun::backend::acl_common::asRuntimeLayout( + acl_layout))); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); + + ActivationBuilder{*_execution_builder}.append(activation, output_alloc->handle()); +} + +void KernelGenerator::visit(const model::operation::MulNode &node) +{ + const auto ofm_index{node.getOutputs().at(0)}; + const auto lhs_index{node.getInputs().at(model::operation::MulNode::Input::LHS)}; + const auto rhs_index{node.getInputs().at(model::operation::MulNode::Input::RHS)}; + + const auto activation = node.param().activation; + + auto ofm_alloc = _tensor_builder->at(ofm_index).get(); + auto lhs_alloc = _tensor_builder->at(lhs_index).get(); + auto rhs_alloc = _tensor_builder->at(rhs_index).get(); + + auto fn = nnfw::cpp14::make_unique<::arm_compute::CLPixelWiseMultiplication>(); + + fn->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle(), 1.0, // scale + arm_compute::ConvertPolicy::SATURATE, arm_compute::RoundingPolicy::TO_NEAREST_EVEN); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); + + ActivationBuilder{*_execution_builder}.append(activation, ofm_alloc->handle()); +} + +void KernelGenerator::visit(const model::operation::ReduceSumNode &node) +{ + const auto output_index{node.getOutputs().at(0)}; + const auto input_index{node.getInputs().at(model::operation::ReduceSumNode::Input::INPUT)}; + const auto axis_index{node.param().axis_index}; + + const auto axis_base = _ctx.at(axis_index).data().base(); + const auto axis_size = _ctx.at(axis_index).shape().num_elements(); + const auto input_rank = _ctx.at(input_index).shape().rank(); + + auto output_alloc = _tensor_builder->at(output_index).get(); + auto input_alloc = _tensor_builder->at(input_index).get(); + const auto frontend_layout = _current_subg_layout; 
  const auto backend_layout = input_alloc->layout();
  // The axis's data must exist as constant values
  assert(axis_base != nullptr);
  std::set<uint32_t> axes;
  for (size_t n = 0; n < axis_size; ++n)
  {
    int32_t axis_value = *(reinterpret_cast<const int32_t *>(axis_base) + n);
    // Negative axes count from the back, as in the NN API / TensorFlow.
    if (axis_value < 0)
    {
      axis_value += input_rank;
    }
    // Translate each frontend axis into the backend layout before reducing.
    axes.insert(::neurun::backend::acl_common::ToARMComputeAxis(input_rank, axis_value,
                                                                frontend_layout, backend_layout)
                    .value());
  }

  auto fn = nnfw::cpp14::make_unique<::arm_compute::CLReduceOperation>();

  fn->configure(input_alloc->handle(), output_alloc->handle(), axes,
                ::arm_compute::ReduceOperation::SUM);

  auto acl_fn = asAclFunction(std::move(fn));

  _execution_builder->append(std::move(acl_fn));
}

// Lowers Reshape via GenericReshapeLayer (handles layout differences).
void KernelGenerator::visit(const model::operation::ReshapeNode &node)
{
  const auto output_index{node.getOutputs().at(0)};
  const auto input_index{node.getInputs().at(model::operation::ReshapeNode::Input::INPUT)};

  auto output_alloc = _tensor_builder->at(output_index).get();
  auto input_alloc = _tensor_builder->at(input_index).get();

  // NOTE This operation must not be changed the layout from frontend to backend
  // However, this runtime can be change the layout of this operation from NHWC to NCHW now
  // TODO Change the layout of frontend and backend to be the same and layer to CLReshapeLayer
  auto fn = nnfw::cpp14::make_unique<::arm_compute::misc::GenericReshapeLayer>();

  fn->configure(input_alloc->handle(), output_alloc->handle());

  auto acl_fn = asAclFunction(std::move(fn));

  _execution_builder->append(std::move(acl_fn));
}

void KernelGenerator::visit(const model::operation::SqueezeNode &node)
{
  // Squeeze is identical to reshape except that it has an optional dimensions input.
  // In addition, optional dims_index is ignored since output tensor already has squeezed shape
  // by freezer and toco
  // TODO Support multi-layout for frontend and backend
  const auto output_index{node.getOutputs().at(0)};
  const auto input_index{node.getInputs().at(model::operation::SqueezeNode::Input::INPUT)};
  const auto dims_index{node.param().dims};
  (void)dims_index;

  auto output_alloc = _tensor_builder->at(output_index).get();
  auto input_alloc = _tensor_builder->at(input_index).get();
  // A plain CLReshapeLayer suffices: the output operand already carries the
  // squeezed shape.
  auto fn = nnfw::cpp14::make_unique<arm_compute::CLReshapeLayer>();
  fn->configure(input_alloc->handle(), output_alloc->handle());
  auto acl_fn = asAclFunction(std::move(fn));
  _execution_builder->append(std::move(acl_fn));
}

// Lowers Tanh to CLActivationLayer (TANH with a = 1.0, b = 1.0).
void KernelGenerator::visit(const model::operation::TanhNode &node)
{
  const auto output_index{node.getOutputs().at(0)};
  const auto input_index{node.getInputs().at(model::operation::TanhNode::Input::INPUT)};

  auto output_alloc = _tensor_builder->at(output_index).get();
  auto input_alloc = _tensor_builder->at(input_index).get();

  auto fn = nnfw::cpp14::make_unique<arm_compute::CLActivationLayer>();

  const ::arm_compute::ActivationLayerInfo act_info{
      ::arm_compute::ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f};

  fn->configure(input_alloc->handle(), output_alloc->handle(), act_info);

  auto acl_fn = asAclFunction(std::move(fn));

  _execution_builder->append(std::move(acl_fn));
}

// Lowers Softmax to CLSoftmaxLayer; `beta` scales the logits.
void KernelGenerator::visit(const model::operation::SoftmaxNode &node)
{
  const auto output_index{node.getOutputs().at(0)};
  const auto input_index{node.getInputs().at(model::operation::SoftmaxNode::Input::INPUT)};

  const auto beta = node.param().beta;

  auto output_alloc = _tensor_builder->at(output_index).get();
  auto input_alloc = _tensor_builder->at(input_index).get();

  auto fn = nnfw::cpp14::make_unique<::arm_compute::CLSoftmaxLayer>(
      _tensor_builder->acl_tensor_manager()->internal_buffer_manager());

  fn->configure(input_alloc->handle(), output_alloc->handle(), beta);

  auto acl_fn = asAclFunction(std::move(fn));

  _execution_builder->append(std::move(acl_fn));
}

// Lowers StridedSlice to CLStridedSlice. The constant start/end/stride vectors
// and the mask bits are reordered from the frontend layout into the backend
// tensor layout before being handed to ACL.
void KernelGenerator::visit(const model::operation::StridedSliceNode &node)
{
  const auto output_index{node.getOutputs().at(0)};
  const auto input_index{node.getInputs().at(model::operation::StridedSliceNode::Input::INPUT)};
  const auto startData_index{node.param().startData_index};
  const auto endData_index{node.param().endData_index};
  const auto stridesData_index{node.param().stridesData_index};
  const auto beginMask_index{node.param().beginMask_index};
  const auto endMask_index{node.param().endMask_index};
  const auto shrinkAxisMask_index{node.param().shrinkAxisMask_index};

  auto outputData_alloc = _tensor_builder->at(output_index).get();
  auto inputData_alloc = _tensor_builder->at(input_index).get();
  const auto frontend_layout = _current_subg_layout;
  const auto backend_layout = inputData_alloc->layout();

  // Set initializers for indices data such as order of inputData
  int input_rank = _ctx.at(input_index).shape().rank();
  std::vector<int32_t> starts;
  std::vector<int32_t> ends;
  std::vector<int32_t> strides;
  starts.resize(input_rank, 0);
  ends.resize(input_rank, 0);
  strides.resize(input_rank, 0);
  {
    auto input_shape = _ctx.at(input_index).shape();
    auto startData_base = _ctx.at(startData_index).data().base();
    auto endData_base = _ctx.at(endData_index).data().base();
    auto stridesData_base = _ctx.at(stridesData_index).data().base();
    const int startData_size = _ctx.at(startData_index).shape().num_elements();
    const int endData_size = _ctx.at(endData_index).shape().num_elements();
    const int stridesData_size = _ctx.at(stridesData_index).shape().num_elements();

    using neurun::model::DataType;

    UNUSED_RELEASE(startData_size);
    UNUSED_RELEASE(endData_size);
    UNUSED_RELEASE(stridesData_size);

    // start/end/strides must be INT32 constant vectors of length `input_rank`.
    assert(_ctx.at(startData_index).typeInfo().type() == DataType::INT32);
    assert(_ctx.at(endData_index).typeInfo().type() == DataType::INT32);
    assert(_ctx.at(stridesData_index).typeInfo().type() == DataType::INT32);
    assert(startData_size == input_rank);
    assert(endData_size == input_rank);
    assert(stridesData_size == input_rank);

    assert(startData_base != nullptr);
    for (int n = 0; n < input_rank; ++n)
    {
      // Place each frontend-axis entry at its backend-layout position.
      auto axis = ::neurun::backend::acl_common::ToARMComputeAxis(input_rank, n, frontend_layout,
                                                                  backend_layout)
                      .value();

      int32_t start_value = *(reinterpret_cast<const int32_t *>(startData_base) + n);
      starts[axis] = start_value;

      int32_t end_value = *(reinterpret_cast<const int32_t *>(endData_base) + n);
      ends[axis] = end_value;

      int32_t strides_value = *(reinterpret_cast<const int32_t *>(stridesData_base) + n);
      strides[axis] = strides_value;
    }
  }

  // Set mask bits such as order of inputData
  const auto beginMask = ::neurun::backend::acl_common::ReorderBits<int32_t>(
      _ctx.at(beginMask_index).asScalar<int32_t>(), input_rank, frontend_layout, backend_layout);
  const auto endMask = ::neurun::backend::acl_common::ReorderBits<int32_t>(
      _ctx.at(endMask_index).asScalar<int32_t>(), input_rank, frontend_layout, backend_layout);
  const auto shrinkAxisMask = ::neurun::backend::acl_common::ReorderBits<int32_t>(
      _ctx.at(shrinkAxisMask_index).asScalar<int32_t>(), input_rank, frontend_layout,
      backend_layout);

  ::arm_compute::Coordinates starts_set;
  ::arm_compute::Coordinates ends_set;
  ::arm_compute::BiStrides strides_set;

  for (size_t i = 0; i < starts.size(); ++i)
  {
    starts_set.set(i, starts[i]);
    ends_set.set(i, ends[i]);
    strides_set.set(i, strides[i]);
  }

  auto fn = nnfw::cpp14::make_unique<::arm_compute::CLStridedSlice>();

  fn->configure(inputData_alloc->handle(), outputData_alloc->handle(), starts_set, ends_set,
                strides_set, beginMask, endMask, shrinkAxisMask);

  auto acl_fn = asAclFunction(std::move(fn));

_execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const model::operation::TransposeNode &node) +{ + const auto ofm_idx{node.getOutputs().at(0)}; + const auto ifm_idx{node.getInputs().at(model::operation::TransposeNode::Input::INPUT)}; + const auto perm{node.param().perm}; + + const auto rank = _ctx.at(ifm_idx).shape().rank(); + std::vector<int32_t> pv; + const auto perm_base = _ctx.at(perm).data().base(); + const int perm_size = _ctx.at(perm).shape().num_elements(); + + assert(perm_base != nullptr); + for (int32_t n = 0; n < perm_size; ++n) + { + int32_t perm_value = *(reinterpret_cast<const int32_t *>(perm_base) + n); + assert(perm_value < rank); + pv.emplace_back(perm_value); + } + + auto ofm_alloc = _tensor_builder->at(ofm_idx).get(); + auto ifm_alloc = _tensor_builder->at(ifm_idx).get(); + const auto frontend_layout = _current_subg_layout; + const auto backend_layout = ifm_alloc->layout(); + // Reversed + auto backend_pv = ::neurun::backend::acl_common::getARMComputePermutationVector( + rank, pv, frontend_layout, backend_layout); + + auto fn = nnfw::cpp14::make_unique<::arm_compute::CLPermute>(); + + fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), backend_pv); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const model::operation::AddNode &node) +{ + const auto ofm_index{node.getOutputs().at(0)}; + const auto lhs_index{node.getInputs().at(model::operation::AddNode::Input::LHS)}; + const auto rhs_index{node.getInputs().at(model::operation::AddNode::Input::RHS)}; + + const auto activation = node.param().activation; + + auto ofm_alloc = _tensor_builder->at(ofm_index).get(); + auto lhs_alloc = _tensor_builder->at(lhs_index).get(); + auto rhs_alloc = _tensor_builder->at(rhs_index).get(); + + auto fn = nnfw::cpp14::make_unique<::arm_compute::CLArithmeticAddition>(); + + fn->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle(), 
+ arm_compute::ConvertPolicy::SATURATE); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); + + ActivationBuilder{*_execution_builder}.append(activation, ofm_alloc->handle()); +} + +void KernelGenerator::visit(const model::operation::SubNode &node) +{ + const auto ofm_index{node.getOutputs().at(0)}; + const auto lhs_index{node.getInputs().at(model::operation::SubNode::Input::LHS)}; + const auto rhs_index{node.getInputs().at(model::operation::SubNode::Input::RHS)}; + + const auto activation = node.param().activation; + + auto ofm_alloc = _tensor_builder->at(ofm_index).get(); + auto lhs_alloc = _tensor_builder->at(lhs_index).get(); + auto rhs_alloc = _tensor_builder->at(rhs_index).get(); + + auto fn = nnfw::cpp14::make_unique<::arm_compute::CLArithmeticSubtraction>(); + + fn->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle(), + arm_compute::ConvertPolicy::SATURATE); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); + + ActivationBuilder{*_execution_builder}.append(activation, ofm_alloc->handle()); +} + +void KernelGenerator::visit(const model::operation::DivNode &node) +{ + const auto ofm_index{node.getOutputs().at(0)}; + const auto lhs_index{node.getInputs().at(model::operation::DivNode::Input::LHS)}; + const auto rhs_index{node.getInputs().at(model::operation::DivNode::Input::RHS)}; + + const auto activation = node.param().activation; + + auto ofm_alloc = _tensor_builder->at(ofm_index).get(); + auto lhs_alloc = _tensor_builder->at(lhs_index).get(); + auto rhs_alloc = _tensor_builder->at(rhs_index).get(); + + auto fn = nnfw::cpp14::make_unique<::arm_compute::CLArithmeticDivision>(); + + fn->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle()); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); + + ActivationBuilder{*_execution_builder}.append(activation, ofm_alloc->handle()); +} + 
+void KernelGenerator::visit(const model::operation::ExpNode &node) +{ + const auto output_index{node.getOutputs().at(0)}; + const auto input_index{node.getInputs().at(model::operation::ExpNode::Input::INPUT)}; + + auto output_alloc = _tensor_builder->at(output_index).get(); + auto input_alloc = _tensor_builder->at(input_index).get(); + + auto fn = nnfw::cpp14::make_unique<::arm_compute::CLExpLayer>(); + + fn->configure(input_alloc->handle(), output_alloc->handle()); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const model::operation::LogisticNode &node) +{ + const auto ofm_index{node.getOutputs().at(0)}; + const auto ifm_index{node.getInputs().at(model::operation::LogisticNode::Input::INPUT)}; + + auto ofm_alloc = _tensor_builder->at(ofm_index).get(); + auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + + const ::arm_compute::ActivationLayerInfo act_info{ + ::arm_compute::ActivationLayerInfo::ActivationFunction::LOGISTIC}; + + auto fn = nnfw::cpp14::make_unique<::arm_compute::CLActivationLayer>(); + + fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), act_info); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const model::operation::LogicalAndNode &node) +{ + const auto output_index{node.getOutputs().at(0)}; + const auto input0_index{node.getInputs().at(model::operation::LogicalAndNode::Input::INPUT0)}; + const auto input1_index{node.getInputs().at(model::operation::LogicalAndNode::Input::INPUT1)}; + + auto output_alloc = _tensor_builder->at(output_index).get(); + auto input0_alloc = _tensor_builder->at(input0_index).get(); + auto input1_alloc = _tensor_builder->at(input1_index).get(); + + auto fn = nnfw::cpp14::make_unique<::arm_compute::CLBinaryLogicalOp>(); + + fn->configure(input0_alloc->handle(), input1_alloc->handle(), output_alloc->handle(), + 
::arm_compute::BinaryLogicalOperation::AND); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const model::operation::LSTMNode &node) +{ + // TODO Support dynamic rnn + // TODO Fix subtle error in the case of non-CIFG, non-peephole and No Projection. + const auto scratch_buffer_index{ + node.getOutputs().at(model::operation::LSTMNode::Output::SCRATCH_BUFFER)}; + const auto output_state_out_index{ + node.getOutputs().at(model::operation::LSTMNode::Output::OUTPUT_STATE_OUT)}; + const auto cell_state_out_index{ + node.getOutputs().at(model::operation::LSTMNode::Output::CELL_STATE_OUT)}; + const auto output_index{node.getOutputs().at(model::operation::LSTMNode::Output::OUTPUT)}; + + const auto input_index{node.getInputs().at(model::operation::LSTMNode::Input::INPUT)}; + const auto input_to_input_weights_index{ + node.getInputs().at(model::operation::LSTMNode::Input::INPUT_TO_INPUT_WEIGHTS)}; // optional + const auto input_to_forget_weights_index{ + node.getInputs().at(model::operation::LSTMNode::Input::INPUT_TO_FORGET_WEIGHTS)}; + const auto input_to_cell_weights_index{ + node.getInputs().at(model::operation::LSTMNode::Input::INPUT_TO_CELL_WEIGHTS)}; + const auto input_to_output_weights_index{ + node.getInputs().at(model::operation::LSTMNode::Input::INPUT_TO_OUTPUT_WEIGHTS)}; + const auto recurrent_to_input_weights_index{node.getInputs().at( + model::operation::LSTMNode::Input::RECURRENT_TO_INPUT_WEIGHTS)}; // optional + const auto recurrent_to_forget_weights_index{ + node.getInputs().at(model::operation::LSTMNode::Input::RECURRENT_TO_FORGET_WEIGHTS)}; + const auto recurrent_to_cell_weights_index{ + node.getInputs().at(model::operation::LSTMNode::Input::RECURRENT_TO_CELL_WEIGHTS)}; + const auto recurrent_to_output_weights_index{ + node.getInputs().at(model::operation::LSTMNode::Input::RECURRENT_TO_OUTPUT_WEIGHTS)}; + const auto cell_to_input_weights_index{ + 
node.getInputs().at(model::operation::LSTMNode::Input::CELL_TO_INPUT_WEIGHTS)}; // optional + const auto cell_to_forget_weights_index{ + node.getInputs().at(model::operation::LSTMNode::Input::CELL_TO_FORGET_WEIGHTS)}; // optional + const auto cell_to_output_weights_index{ + node.getInputs().at(model::operation::LSTMNode::Input::CELL_TO_OUTPUT_WEIGHTS)}; // optional + const auto input_gate_bias_index{ + node.getInputs().at(model::operation::LSTMNode::Input::INPUT_GATE_BIAS)}; + const auto forget_gate_bias_index{ + node.getInputs().at(model::operation::LSTMNode::Input::FORGET_GATE_BIAS)}; + const auto cell_bias_index{node.getInputs().at(model::operation::LSTMNode::Input::CELL_BIAS)}; + const auto output_gate_bias_index{ + node.getInputs().at(model::operation::LSTMNode::Input::OUTPUT_GATE_BIAS)}; + const auto projection_weights_index{ + node.getInputs().at(model::operation::LSTMNode::Input::PROJECTION_WEIGHTS)}; // optional + const auto projection_bias_index{ + node.getInputs().at(model::operation::LSTMNode::Input::PROJECTION_BIAS)}; // optional + const auto output_state_in_index{ + node.getInputs().at(model::operation::LSTMNode::Input::OUTPUT_STATE_IN)}; + const auto cell_state_in_index{ + node.getInputs().at(model::operation::LSTMNode::Input::CELL_STATE_IN)}; + const auto cell_threshold = node.param().cell_threshold; + const auto projection_threshold = node.param().projection_threshold; + + bool has_input_to_input_weights = _ctx.at(input_to_input_weights_index).shape().dim(0) != 0 && + _ctx.at(input_to_input_weights_index).shape().dim(1) != 0; + bool has_recurrent_to_input_weights = + _ctx.at(recurrent_to_input_weights_index).shape().dim(0) != 0 && + _ctx.at(recurrent_to_input_weights_index).shape().dim(1) != 0; + bool has_cell_to_forget_weights = _ctx.at(cell_to_forget_weights_index).shape().dim(0) != 0; + bool has_cell_to_output_weights = _ctx.at(cell_to_output_weights_index).shape().dim(0) != 0; + bool has_projection_weights = 
_ctx.at(projection_weights_index).shape().dim(0) != 0 && + _ctx.at(projection_weights_index).shape().dim(1) != 0; + bool has_projection_bias = _ctx.at(projection_bias_index).shape().dim(0); + + // NOTE The input_to_input_weights and the recurrent_to_input_weights do not exist in CIFG. + // true: no CIFG + // false: CIFG + // NOTE The cell_to_input_weights does not exist in non-peephole although regular LSTM(non-CIFG). + bool has_cifg_param = has_input_to_input_weights && has_recurrent_to_input_weights; + + // NOTE The cell_to_forget_weights and the cell_to_output_weights exist in peephole. + // But the cell_to_input_weights does not exist in regular CIFG although peephole. + // true: peephole + // false: no peephole + bool has_peephole_param = has_cell_to_forget_weights && has_cell_to_output_weights; + + // NOTE Although the projection weights has data the projection bias may not have data. + bool has_projection_param = has_projection_weights; + + const auto activation = node.param().activation; + const auto cell_clip = cell_threshold; + const auto projection_clip = projection_threshold; + assert(cell_clip >= 0.f && projection_clip >= 0.f); + + auto scratch_buffer_alloc = _tensor_builder->at(scratch_buffer_index).get(); + auto output_state_out_alloc = _tensor_builder->at(output_state_out_index).get(); + auto cell_state_out_alloc = _tensor_builder->at(cell_state_out_index).get(); + auto output_alloc = _tensor_builder->at(output_index).get(); + + auto input_alloc = _tensor_builder->at(input_index).get(); + + auto input_to_forget_weights_alloc = _tensor_builder->at(input_to_forget_weights_index).get(); + auto input_to_cell_weights_alloc = _tensor_builder->at(input_to_cell_weights_index).get(); + auto input_to_output_weights_alloc = _tensor_builder->at(input_to_output_weights_index).get(); + auto recurrent_to_forget_weights_alloc = + _tensor_builder->at(recurrent_to_forget_weights_index).get(); + auto recurrent_to_cell_weights_alloc = 
_tensor_builder->at(recurrent_to_cell_weights_index).get(); + auto recurrent_to_output_weights_alloc = + _tensor_builder->at(recurrent_to_output_weights_index).get(); + + auto forget_gate_bias_alloc = _tensor_builder->at(forget_gate_bias_index).get(); + auto cell_bias_alloc = _tensor_builder->at(cell_bias_index).get(); + auto output_gate_bias_alloc = _tensor_builder->at(output_gate_bias_index).get(); + auto output_state_in_alloc = _tensor_builder->at(output_state_in_index).get(); + auto cell_state_in_alloc = _tensor_builder->at(cell_state_in_index).get(); + + auto act_info = ::neurun::backend::acl_common::asActivationLayerInfo(activation); + + auto fn = nnfw::cpp14::make_unique<::arm_compute::CLLSTMLayer>(); + + ::arm_compute::LSTMParams<::arm_compute::ICLTensor> lstm_params{}; + if (has_cifg_param) + { + auto input_to_input_weights_alloc = + _tensor_builder->at(input_to_input_weights_index).get(); // optional + auto recurrent_to_input_weights_alloc = + _tensor_builder->at(recurrent_to_input_weights_index).get(); // optional + auto cell_to_input_weights_handle = + has_peephole_param ? 
_tensor_builder->at(cell_to_input_weights_index).get()->handle() + : nullptr; // optional (non-cifg && peephole) + auto input_gate_bias_alloc = _tensor_builder->at(input_gate_bias_index).get(); // optional + lstm_params.set_cifg_params(input_to_input_weights_alloc->handle(), + recurrent_to_input_weights_alloc->handle(), + cell_to_input_weights_handle, input_gate_bias_alloc->handle()); + } + if (has_peephole_param) + { + auto cell_to_forget_weights_alloc = + _tensor_builder->at(cell_to_forget_weights_index).get(); // optional + auto cell_to_output_weights_alloc = + _tensor_builder->at(cell_to_output_weights_index).get(); // optional + lstm_params.set_peephole_params(cell_to_forget_weights_alloc->handle(), + cell_to_output_weights_alloc->handle()); + } + if (has_projection_param) + { + auto projection_weights_alloc = _tensor_builder->at(projection_weights_index).get(); // optional + auto projection_bias_handle = has_projection_bias + ? _tensor_builder->at(projection_bias_index).get()->handle() + : nullptr; // optional + lstm_params.set_projection_params(projection_weights_alloc->handle(), projection_bias_handle); + } + + fn->configure( + input_alloc->handle(), input_to_forget_weights_alloc->handle(), + input_to_cell_weights_alloc->handle(), input_to_output_weights_alloc->handle(), + recurrent_to_forget_weights_alloc->handle(), recurrent_to_cell_weights_alloc->handle(), + recurrent_to_output_weights_alloc->handle(), forget_gate_bias_alloc->handle(), + cell_bias_alloc->handle(), output_gate_bias_alloc->handle(), output_state_in_alloc->handle(), + cell_state_in_alloc->handle(), scratch_buffer_alloc->handle(), + output_state_out_alloc->handle(), cell_state_out_alloc->handle(), output_alloc->handle(), + lstm_params, act_info, cell_clip, projection_clip); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const model::operation::ReduceMaxNode &node) +{ + const auto 
  output_index{node.getOutputs().at(0)};
  const auto input_index{node.getInputs().at(model::operation::ReduceMaxNode::Input::INPUT)};
  const auto axis_index{node.param().axis_index};

  auto input_shape = _ctx.at(input_index).shape();
  auto axis_shape = _ctx.at(axis_index).shape();

  auto ofm_alloc = _tensor_builder->at(output_index).get();
  auto ifm_alloc = _tensor_builder->at(input_index).get();
  std::set<uint32_t> axes;
  const auto frontend_layout = _current_subg_layout;
  const auto backend_layout = ifm_alloc->layout();
  {
    // The axis operand may be either a scalar or a vector of constants;
    // anything else is unsupported.
    const auto ifm_rank = input_shape.rank();
    switch (axis_shape.rank())
    {
      case 0: // scalar
      {
        int32_t axis_value = _ctx.at(axis_index).asScalar<int32_t>();
        // Negative axes count from the back.
        if (axis_value < 0)
        {
          axis_value += ifm_rank;
        }
        axes.insert(::neurun::backend::acl_common::ToARMComputeAxis(ifm_rank, axis_value,
                                                                    frontend_layout, backend_layout)
                        .value());
        break;
      }
      case 1: // vector
      {
        const auto axis_base = _ctx.at(axis_index).data().base();
        const int axis_size = axis_shape.num_elements();

        // If axis's data does not exist as constant values and can be gotten as input data, we have
        // to find a way to infer output shape when sinking output.
        assert(axis_base != nullptr);
        for (int32_t n = 0; n < axis_size; ++n)
        {
          int32_t axis_value = *(reinterpret_cast<const int32_t *>(axis_base) + n);
          if (axis_value < 0)
          {
            axis_value += ifm_rank;
          }
          axes.insert(::neurun::backend::acl_common::ToARMComputeAxis(
                          ifm_rank, axis_value, frontend_layout, backend_layout)
                          .value());
        }
        break;
      }
      default:
        throw std::runtime_error("Not supported");
        break;
    }
  }

  auto fn = nnfw::cpp14::make_unique<::arm_compute::CLReduceOperation>();

  fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), axes, arm_compute::ReduceOperation::MAX);

  auto acl_fn = asAclFunction(std::move(fn));

  _execution_builder->append(std::move(acl_fn));
}

// Lowers a comparison node to CLComparison; the frontend comparison type is
// cast directly to the ACL ComparisonOperation enum.
void KernelGenerator::visit(const model::operation::ComparisonNode &node)
{
  const auto output_index{node.getOutputs().at(0)};
  const auto input0_index{node.getInputs().at(model::operation::ComparisonNode::Input::INPUT0)};
  const auto input1_index{node.getInputs().at(model::operation::ComparisonNode::Input::INPUT1)};

  const auto comparison_type = node.param().comparison_type;

  auto output_alloc = _tensor_builder->at(output_index).get();
  auto input0_alloc = _tensor_builder->at(input0_index).get();
  auto input1_alloc = _tensor_builder->at(input1_index).get();

  auto fn = nnfw::cpp14::make_unique<::arm_compute::CLComparison>();

  fn->configure(input0_alloc->handle(), input1_alloc->handle(), output_alloc->handle(),
                (arm_compute::ComparisonOperation)comparison_type);

  auto acl_fn = asAclFunction(std::move(fn));

  _execution_builder->append(std::move(acl_fn));
}

// Lowers RSQRT (reciprocal square root) to CLRsqrtLayer.
void KernelGenerator::visit(const model::operation::RSQRTNode &node)
{
  const auto ofm_index{node.getOutputs().at(0)};
  const auto ifm_index{node.getInputs().at(model::operation::RSQRTNode::Input::INPUT)};

  auto ofm_alloc = _tensor_builder->at(ofm_index).get();
  auto ifm_alloc = _tensor_builder->at(ifm_index).get();

  auto fn =
nnfw::cpp14::make_unique<::arm_compute::CLRsqrtLayer>(); + + fn->configure(ifm_alloc->handle(), ofm_alloc->handle()); + + _execution_builder->append(asAclFunction(std::move(fn))); +} + +void KernelGenerator::visit(const model::operation::ReLUNode &node) +{ + const auto output_index{node.getOutputs().at(0)}; + const auto input_index{node.getInputs().at(model::operation::ReLUNode::Input::INPUT)}; + + auto output_alloc = _tensor_builder->at(output_index).get(); + auto input_alloc = _tensor_builder->at(input_index).get(); + + auto fn = nnfw::cpp14::make_unique<arm_compute::CLActivationLayer>(); + + const ::arm_compute::ActivationLayerInfo act_info{ + ::arm_compute::ActivationLayerInfo::ActivationFunction::RELU}; + + fn->configure(input_alloc->handle(), output_alloc->handle(), act_info); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const model::operation::ResizeBilinearNode &node) +{ + const auto ofm_index{node.getOutputs().at(0)}; + + const auto ifm_index{node.getInputs().at(model::operation::ResizeBilinearNode::Input::INPUT)}; + const auto height_index{node.param().height_index}; + const auto width_index{node.param().width_index}; + (void)height_index; + (void)width_index; + + auto ofm_alloc = _tensor_builder->at(ofm_index).get(); + auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + + auto fn = nnfw::cpp14::make_unique<::arm_compute::CLScale>(); + + fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), + ::arm_compute::InterpolationPolicy::BILINEAR, ::arm_compute::BorderMode::REPLICATE, + ::arm_compute::PixelValue(0.f), ::arm_compute::SamplingPolicy::TOP_LEFT); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const model::operation::ReLU1Node &node) +{ + const auto ofm_index{node.getOutputs().at(0)}; + const auto ifm_index{node.getInputs().at(model::operation::ReLU1Node::Input::INPUT)}; + + 
auto ofm_alloc = _tensor_builder->at(ofm_index).get(); + auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + + const ::arm_compute::ActivationLayerInfo act_info{ + ::arm_compute::ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 1.0f, -1.0f}; + + auto fn = nnfw::cpp14::make_unique<::arm_compute::CLActivationLayer>(); + + fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), act_info); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const model::operation::ReLU6Node &node) +{ + const auto ofm_index{node.getOutputs().at(0)}; + const auto ifm_index{node.getInputs().at(model::operation::ReLU6Node::Input::INPUT)}; + + auto ofm_alloc = _tensor_builder->at(ofm_index).get(); + auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + + const ::arm_compute::ActivationLayerInfo act_info{ + ::arm_compute::ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 6.0f}; + + auto fn = nnfw::cpp14::make_unique<::arm_compute::CLActivationLayer>(); + + fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), act_info); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const model::operation::RNNNode &node) +{ + const auto output_index{node.getOutputs().at(model::operation::RNNNode::Output::OUTPUT)}; + const auto hidden_state_out_index{ + node.getOutputs().at(model::operation::RNNNode::Output::HIDDEN_STATE_OUT)}; + + const auto input_index{node.getInputs().at(model::operation::RNNNode::Input::INPUT)}; + const auto weights_index{node.getInputs().at(model::operation::RNNNode::Input::WEIGHTS)}; + const auto recurrent_weights_index{ + node.getInputs().at(model::operation::RNNNode::Input::RECURRENT_WEIGHTS)}; + const auto bias_index{node.getInputs().at(model::operation::RNNNode::Input::BIAS)}; + const auto hidden_state_in_index{ + node.getInputs().at(model::operation::RNNNode::Input::HIDDEN_STATE_IN)}; + + const 
auto activation = node.param().activation; + + auto output_alloc = _tensor_builder->at(output_index).get(); + auto hidden_state_out_alloc = _tensor_builder->at(hidden_state_out_index).get(); + + auto input_alloc = _tensor_builder->at(input_index).get(); + auto weights_alloc = _tensor_builder->at(weights_index).get(); + auto recurrent_weights_alloc = _tensor_builder->at(recurrent_weights_index).get(); + auto bias_alloc = _tensor_builder->at(bias_index).get(); + auto hidden_state_in_alloc = _tensor_builder->at(hidden_state_in_index).get(); + auto act_info = ::neurun::backend::acl_common::asActivationLayerInfo(activation); + + auto copy_layer = nnfw::cpp14::make_unique<::arm_compute::CLCopy>(); + copy_layer->configure(hidden_state_in_alloc->handle(), hidden_state_out_alloc->handle()); + _execution_builder->append(asAclFunction(std::move(copy_layer))); + + auto fn = nnfw::cpp14::make_unique<::arm_compute::CLRNNLayerEx>( + _tensor_builder->acl_tensor_manager()->internal_buffer_manager()); + fn->configure(input_alloc->handle(), weights_alloc->handle(), recurrent_weights_alloc->handle(), + bias_alloc->handle(), hidden_state_out_alloc->handle(), output_alloc->handle(), + act_info); + _execution_builder->append(asAclFunction(std::move(fn))); +} + +void KernelGenerator::visit(const model::operation::FloorNode &node) +{ + const auto ofm_index{node.getOutputs().at(0)}; + const auto ifm_index{node.getInputs().at(model::operation::FloorNode::Input::INPUT)}; + + auto ofm_alloc = _tensor_builder->at(ofm_index).get(); + auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + + auto fn = nnfw::cpp14::make_unique<::arm_compute::CLFloor>(); + + fn->configure(ifm_alloc->handle(), ofm_alloc->handle()); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const model::operation::SpaceToDepthNode &node) +{ + const auto ofm_index{node.getOutputs().at(0)}; + const auto 
ifm_index{node.getInputs().at(model::operation::SpaceToDepthNode::Input::INPUT)}; + const auto block_size_index{node.param().block_size_index}; + + auto block_size = _ctx.at(block_size_index).asScalar<int32_t>(); + + auto ofm_alloc = _tensor_builder->at(ofm_index).get(); + auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + + auto fn = nnfw::cpp14::make_unique<::arm_compute::CLSpaceToDepth>(); + + fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), block_size); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const model::operation::L2Pool2DNode &node) +{ + const auto ofm_index{node.getOutputs().at(0)}; + const auto ifm_index{node.getInputs().at(model::operation::L2Pool2DNode::Input::INPUT)}; + + const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature(_current_subg_layout); + const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature(_current_subg_layout); + + uint32_t kw = node.param().kw; + uint32_t kh = node.param().kh; + const auto stride = node.param().stride; + const auto padding = + neurun::util::calculatePadding(node.param().padding, ifm_shape, ofm_shape, stride, kw, kh); + const auto activation = node.param().activation; + + auto ofm_alloc = _tensor_builder->at(ofm_index).get(); + auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + + ::arm_compute::PoolingLayerInfo info{ + ::arm_compute::PoolingType::L2, ::arm_compute::Size2D{kw, kh}, + ::neurun::backend::acl_common::asPadStrideInfo(padding, stride)}; + + auto fn = nnfw::cpp14::make_unique<::arm_compute::CLPoolingLayer>(); + + fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), info); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); + + ActivationBuilder{*_execution_builder}.append(activation, ofm_alloc->handle()); +} + +void KernelGenerator::visit(const model::operation::EmbeddingLookupNode &node) +{ + const auto output_index{node.getOutputs().at(0)}; 
+ const auto lookups_index{ + node.getInputs().at(model::operation::EmbeddingLookupNode::Input::LOOKUPS)}; + const auto values_index{ + node.getInputs().at(model::operation::EmbeddingLookupNode::Input::VALUES)}; + + auto output_alloc = _tensor_builder->at(output_index).get(); + auto lookups_alloc = _tensor_builder->at(lookups_index).get(); + auto values_alloc = _tensor_builder->at(values_index).get(); + + auto fn = nnfw::cpp14::make_unique<::arm_compute::CLEmbeddingLookup>(); + + fn->configure(values_alloc->handle(), output_alloc->handle(), lookups_alloc->handle()); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const model::operation::L2NormalizationNode &node) +{ + const auto ofm_index{node.getOutputs().at(0)}; + const auto ifm_index{node.getInputs().at(model::operation::L2NormalizationNode::Input::INPUT)}; + + // {CL|Neon}L2Normalization performs the reduction only along dimension 0 + // L2 Normalization always performs the reduction along the depth axis + // Thus, we repurpose {CL|Neon}NormalizationLayers to act as depthwise L2 normalizations by + // choosing normalization parameters as below + + const auto &ifm_shape = _ctx.at(ifm_index).shape(); + // TODO Support optional constant dimension that normalization would be performed on + const auto normalization_axis = ifm_shape.rank() - 1; + int32_t radius = + 2 * ifm_shape.dim(normalization_axis) + 1; // normSize = depth(last dimension) * 2 + 1 + float alpha = 1.0f; // In the implementation to make alpha_ become 1 + float beta = 0.5f; // pow(reduction, -0.5) = 1 / sqrt(reduction) + float bias = 0.0f; // Don't offset the reduction. 
+ + auto ofm_alloc = _tensor_builder->at(ofm_index).get(); + auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + + const auto norm_info = ::arm_compute::NormalizationLayerInfo(::arm_compute::NormType::CROSS_MAP, + radius, alpha, beta, bias, false); + + auto fn = nnfw::cpp14::make_unique<::arm_compute::CLNormalizationLayer>(); + + fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), norm_info); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const model::operation::HashtableLookupNode &node) +{ + const auto output_index{ + node.getOutputs().at(model::operation::HashtableLookupNode::Output::OUTPUT)}; + const auto hits_index{node.getOutputs().at(model::operation::HashtableLookupNode::Output::HITS)}; + + const auto lookups_index{ + node.getInputs().at(model::operation::HashtableLookupNode::Input::LOOKUPS)}; + const auto keys_index{node.getInputs().at(model::operation::HashtableLookupNode::Input::KEYS)}; + const auto values_index{ + node.getInputs().at(model::operation::HashtableLookupNode::Input::VALUES)}; + + auto output_alloc = _tensor_builder->at(output_index).get(); + auto hits_alloc = _tensor_builder->at(hits_index).get(); + + auto lookups_alloc = _tensor_builder->at(lookups_index).get(); + auto keys_alloc = _tensor_builder->at(keys_index).get(); + auto values_alloc = _tensor_builder->at(values_index).get(); + + auto fn = nnfw::cpp14::make_unique<::arm_compute::CLHashtableLookup>(); + + fn->configure(lookups_alloc->handle(), keys_alloc->handle(), values_alloc->handle(), + output_alloc->handle(), hits_alloc->handle()); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const model::operation::PReLUNode &node) +{ + const auto ofm_index{node.getOutputs().at(0)}; + const auto ifm_index{node.getInputs().at(model::operation::PReLUNode::Input::INPUT)}; + const auto 
alpha_index{node.getInputs().at(model::operation::PReLUNode::Input::ALPHA)}; + + auto ofm_alloc = _tensor_builder->at(ofm_index).get(); + auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + auto alpha_alloc = _tensor_builder->at(alpha_index).get(); + + auto fn = nnfw::cpp14::make_unique<::arm_compute::CLPReLU>(); + + fn->configure(ifm_alloc->handle(), alpha_alloc->handle(), ofm_alloc->handle()); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const model::operation::TransposeConvNode &node) +{ + const auto ofm_index{node.getOutputs().at(0)}; + const auto output_shape_index{ + node.getInputs().at(model::operation::TransposeConvNode::Input::OUTPUT_SHAPE)}; + const auto ker_index{node.getInputs().at(model::operation::TransposeConvNode::Input::KERNEL)}; + const auto ifm_index{node.getInputs().at(model::operation::TransposeConvNode::Input::INPUT)}; + + const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature(_current_subg_layout); + const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature(_current_subg_layout); + const auto ker_shape = _ctx.at(ker_index).shape().asFeature(_current_subg_layout); + + const auto stride = node.param().stride; + + assert((node.param().padding.type == model::PaddingType::SAME) || + (node.param().padding.type == model::PaddingType::VALID)); + auto padding = neurun::util::calculatePadding(node.param().padding, ofm_shape, ifm_shape, stride, + ker_shape.W, ker_shape.H); + + uint32_t invalid_horizontal = 0; + uint32_t invalid_vertical = 0; + if (node.param().padding.type == model::PaddingType::VALID) + { + invalid_horizontal = + ofm_shape.W - (1 + (ifm_shape.W - 1) * stride.horizontal) - (ker_shape.W - 1); + invalid_vertical = ofm_shape.H - (1 + (ifm_shape.H - 1) * stride.vertical) - (ker_shape.H - 1); + } + + auto ofm_alloc = _tensor_builder->at(ofm_index).get(); + auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + auto ker_alloc = 
_tensor_builder->at(ker_index).get(); + + const auto tconv_info = acl_common::asPadStrideInfo(padding, stride); + + auto fn = nnfw::cpp14::make_unique<::arm_compute::CLTransposeConvLayer>( + _tensor_builder->acl_tensor_manager()->internal_buffer_manager()); + + fn->configure(ifm_alloc->handle(), ker_alloc->handle(), nullptr, ofm_alloc->handle(), tconv_info, + invalid_horizontal, invalid_vertical); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const model::operation::SQRTNode &node) +{ + const auto output_index{node.getOutputs().at(0)}; + const auto input_index{node.getInputs().at(model::operation::SQRTNode::Input::INPUT)}; + + auto output_alloc = _tensor_builder->at(output_index).get(); + auto input_alloc = _tensor_builder->at(input_index).get(); + + const ::arm_compute::ActivationLayerInfo act_info{ + ::arm_compute::ActivationLayerInfo::ActivationFunction::SQRT}; + + auto fn = nnfw::cpp14::make_unique<::arm_compute::CLActivationLayer>(); + + fn->configure(input_alloc->handle(), output_alloc->handle(), act_info); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const model::operation::LogicalOrNode &node) +{ + const auto output_index{node.getOutputs().at(0)}; + const auto input0_index{node.getInputs().at(model::operation::LogicalOrNode::Input::INPUT0)}; + const auto input1_index{node.getInputs().at(model::operation::LogicalOrNode::Input::INPUT1)}; + + auto output_alloc = _tensor_builder->at(output_index).get(); + auto input0_alloc = _tensor_builder->at(input0_index).get(); + auto input1_alloc = _tensor_builder->at(input1_index).get(); + + auto fn = nnfw::cpp14::make_unique<::arm_compute::CLBitwiseOr>(); + + fn->configure(input0_alloc->handle(), input1_alloc->handle(), output_alloc->handle()); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + 
+void KernelGenerator::visit(const model::operation::LogicalNotNode &node) +{ + const auto output_index{node.getOutputs().at(0)}; + const auto input_index{node.getInputs().at(model::operation::LogicalNotNode::Input::INPUT)}; + + auto output_alloc = _tensor_builder->at(output_index).get(); + auto input_alloc = _tensor_builder->at(input_index).get(); + + auto fn = nnfw::cpp14::make_unique<::arm_compute::CLBitwiseNot>(); + + fn->configure(input_alloc->handle(), output_alloc->handle()); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const model::operation::SquaredDifferenceNode &node) +{ + const auto ofm_index{node.getOutputs().at(0)}; + const auto lhs_index{node.getInputs().at(model::operation::SquaredDifferenceNode::Input::LHS)}; + const auto rhs_index{node.getInputs().at(model::operation::SquaredDifferenceNode::Input::RHS)}; + + auto ofm_alloc = _tensor_builder->at(ofm_index).get(); + auto lhs_alloc = _tensor_builder->at(lhs_index).get(); + auto rhs_alloc = _tensor_builder->at(rhs_index).get(); + + auto fn = nnfw::cpp14::make_unique<::arm_compute::CLElementwiseSquaredDiff>(); + + fn->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle()); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const model::operation::TopKV2Node &node) +{ + const auto outputValues_index{ + node.getOutputs().at(model::operation::TopKV2Node::Output::OUTPUT_VALUES)}; + const auto outputIndices_index{ + node.getOutputs().at(model::operation::TopKV2Node::Output::OUTPUT_INDICES)}; + + const auto inputData_index{node.getInputs().at(model::operation::TopKV2Node::Input::INPUT)}; + const auto k_index{node.param().k_index}; + + // Currently, we only support the vector input. 
+ assert(_ctx.at(inputData_index).shape().rank() == 1 || + _ctx.at(inputData_index).shape().rank() == 2); + + const auto k = _ctx.at(k_index).asScalar<int32_t>(); + + auto values_alloc = _tensor_builder->at(outputValues_index).get(); + auto indices_alloc = _tensor_builder->at(outputIndices_index).get(); + auto input_alloc = _tensor_builder->at(inputData_index).get(); + + auto fn = nnfw::cpp14::make_unique<::arm_compute::CLTopKV2>(); + + fn->configure(input_alloc->handle(), k, values_alloc->handle(), indices_alloc->handle()); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const model::operation::GatherNode &node) +{ + const auto ofm_index{node.getOutputs().at(0)}; + + const auto ifm_index{node.getInputs().at(model::operation::GatherNode::Input::INPUT)}; + const auto indices_index{node.getInputs().at(model::operation::GatherNode::Input::INDICES)}; + + const auto axis_index{node.param().axis_index}; + + const auto ifm_shape = _ctx.at(ifm_index).shape(); + + const auto axis_value = static_cast<int>(_ctx.at(axis_index).asScalar<int32_t>()); + // Converting in reverse order + const int axis = + ::neurun::backend::acl_common::ToARMComputeAxis(ifm_shape.rank(), axis_value).value(); + + auto ofm_alloc = _tensor_builder->at(ofm_index).get(); + auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + auto indices_alloc = _tensor_builder->at(indices_index).get(); + auto acl_layout = ofm_alloc->handle()->info()->data_layout(); + UNUSED_RELEASE(acl_layout); + + // NOTE The frontend layout and backend layout must be the same for this operation. + // If not the same, we have to add a stage(?) to perform permutation of output tensor. It + // is not efficient even if it works well. If so, it would be better to set the + // layout of these backend tensors to the same layout. + // There is also one thing we have to think about. This operation depends on the layout of + // a model. 
For example, if a model in NHWC has this operation as output rank == 4, indices + // rank == 2 and axis == 2, this operation should work as the axis W and C, but the axis W + // and C are not sequential in NCHW. So the backend in NCHW cannot handle this case. + // TODO Remove this workaround + // It is a workaround how to set the layout of these backend tensors to the layout of the + // frontend when creating them + // TODO Supports front-end in NCHW + // TODO Change the layout of frontend and backend to be the same + // assert(::arm_compute::DataLayout::NHWC == acl_layout); + assert(acl_layout == ifm_alloc->handle()->info()->data_layout()); + assert(acl_layout == indices_alloc->handle()->info()->data_layout()); + + // TODO Change to CLGather + auto fn = nnfw::cpp14::make_unique<::arm_compute::misc::GenericGather>(); + + fn->configure(ifm_alloc->handle(), indices_alloc->handle(), ofm_alloc->handle(), axis); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const model::operation::NegNode &node) +{ + const auto ofm_index{node.getOutputs().at(0)}; + const auto ifm_index{node.getInputs().at(model::operation::NegNode::Input::INPUT)}; + + auto ofm_alloc = _tensor_builder->at(ofm_index).get(); + auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + + auto fn = nnfw::cpp14::make_unique<::arm_compute::CLNeg>(); + + fn->configure(ifm_alloc->handle(), ofm_alloc->handle()); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const model::operation::AbsNode &node) +{ + const auto output_index{node.getOutputs().at(0)}; + const auto input_index{node.getInputs().at(model::operation::AbsNode::Input::INPUT)}; + + auto output_alloc = _tensor_builder->at(output_index).get(); + auto input_alloc = _tensor_builder->at(input_index).get(); + + const ::arm_compute::ActivationLayerInfo act_info{ + 
::arm_compute::ActivationLayerInfo::ActivationFunction::ABS}; + + auto fn = nnfw::cpp14::make_unique<::arm_compute::CLActivationLayer>(); + + fn->configure(input_alloc->handle(), output_alloc->handle(), act_info); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const model::operation::ArgMaxNode &node) +{ + const auto ofm_index{node.getOutputs().at(0)}; + const auto ifm_index{node.getInputs().at(model::operation::ArgMaxNode::Input::INPUT)}; + const auto axis_index{node.param().axis_index}; + + auto ifm_shape = _ctx.at(ifm_index).shape(); + auto ofm_shape = _ctx.at(ofm_index).shape(); + auto axis_shape = _ctx.at(axis_index).shape(); + + assert(_ctx.at(axis_index).isConstant()); + // Axis dimension is always 1. + assert(axis_shape.rank() == 1); + assert((ifm_shape.rank() - 1) == ofm_shape.rank()); + + const int axis_size = axis_shape.num_elements(); + auto axis_base = _ctx.at(axis_index).data().base(); + // TODO Should support axis size > 1. + assert(axis_size == 1); + // axis is tensor with 1 dimension - always a vector. 
+ assert(axis_base != nullptr); + + auto ofm_alloc = _tensor_builder->at(ofm_index).get(); + auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + const auto ifm_rank = ifm_shape.rank(); + auto frontend_layout = _current_subg_layout; + auto backend_layout = ifm_alloc->layout(); + std::set<uint32_t> axes; + for (int32_t n = 0; n < axis_size; ++n) + { + int32_t axis_value = *(reinterpret_cast<const int32_t *>(axis_base) + n); + if (axis_value < 0) + { + axis_value += ifm_rank; + } + axes.insert(acl_common::ToARMComputeAxis(ifm_rank, axis_value, frontend_layout, backend_layout) + .value()); + } + std::vector<uint32_t> fixed_axes(axes.begin(), axes.end()); + + auto fn = nnfw::cpp14::make_unique<::arm_compute::CLArgOperation>(); + + fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), fixed_axes, + ::arm_compute::ArgOperation::MAX); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const model::operation::DequantizeNode &node) +{ + const auto output_index{node.getOutputs().at(0)}; + const auto input_index{node.getInputs().at(model::operation::DequantizeNode::Input::INPUT)}; + + auto output_alloc = _tensor_builder->at(output_index).get(); + auto input_alloc = _tensor_builder->at(input_index).get(); + + auto fn = nnfw::cpp14::make_unique<::arm_compute::CLCast>(); + + fn->configure(input_alloc->handle(), output_alloc->handle()); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const model::operation::MeanNode &node) +{ + const auto ofm_index{node.getOutputs().at(0)}; + const auto ifm_index{node.getInputs().at(model::operation::MeanNode::Input::INPUT)}; + + const auto axis_index{node.param().axis_index}; + const auto keep_dims{node.param().keep_dims}; + (void)keep_dims; + + const auto ifm_shape = _ctx.at(ifm_index).shape(); + + auto ofm_alloc = _tensor_builder->at(ofm_index).get(); + auto ifm_alloc = 
_tensor_builder->at(ifm_index).get(); + std::set<uint32_t> axes; + { + const auto ifm_rank = ifm_shape.rank(); + const auto frontend_layout = _current_subg_layout; + const auto backend_layout = ifm_alloc->layout(); + const auto axis_shape = _ctx.at(axis_index).shape(); + switch (axis_shape.rank()) + { + case 0: // scalar + { + auto axis_value = _ctx.at(axis_index).asScalar<int32_t>(); + if (axis_value < 0) + { + axis_value += ifm_rank; + } + axes.insert(::neurun::backend::acl_common::ToARMComputeAxis(ifm_rank, axis_value, + frontend_layout, backend_layout) + .value()); + break; + } + case 1: // vector + { + const auto axis_base = _ctx.at(axis_index).data().base(); + const int axis_size = axis_shape.num_elements(); + + // If axis's data does not exist as constant values and can be gotten as input data, we have + // to find a way to infer output shape when sinking output. + assert(axis_base != nullptr); + for (int32_t n = 0; n < axis_size; ++n) + { + int32_t axis_value = *(reinterpret_cast<const int32_t *>(axis_base) + n); + if (axis_value < 0) + { + axis_value += ifm_rank; + } + axes.insert(::neurun::backend::acl_common::ToARMComputeAxis( + ifm_rank, axis_value, frontend_layout, backend_layout) + .value()); + } + break; + } + default: + throw std::runtime_error("Not supported"); + } + } + + // NOTE CLReduceMean has a bug that does not support NHWC layout + // CLReduceMean intermediate tensors are always NCHW layout + auto fn = nnfw::cpp14::make_unique<::arm_compute::CLReduceOperation>(); + + fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), axes, + ::arm_compute::ReduceOperation::MEAN); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const model::operation::LocalResponseNormalizationNode &node) +{ + const auto ofm_index{node.getOutputs().at(0)}; + const auto ifm_index{ + node.getInputs().at(model::operation::LocalResponseNormalizationNode::Input::INPUT)}; + const auto 
radius_index{node.param().radius_index}; + const auto bias_index{node.param().bias_index}; + const auto alpha_index{node.param().alpha_index}; + const auto beta_index{node.param().beta_index}; + + auto radius = _ctx.at(radius_index).asScalar<int32_t>(); + auto alpha = _ctx.at(alpha_index).asScalar<float>(); + auto beta = _ctx.at(beta_index).asScalar<float>(); + auto bias = _ctx.at(bias_index).asScalar<float>(); + + auto ofm_alloc = _tensor_builder->at(ofm_index).get(); + auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + + const auto norm_info = ::arm_compute::NormalizationLayerInfo( + ::arm_compute::NormType::CROSS_MAP, radius * 2 + 1, alpha, beta, bias, false); + + auto fn = nnfw::cpp14::make_unique<::arm_compute::CLNormalizationLayer>(); + + fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), norm_info); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const model::operation::DepthToSpaceNode &node) +{ + const auto output_index{node.getOutputs().at(0)}; + const auto input_index{node.getInputs().at(model::operation::DepthToSpaceNode::Input::INPUT)}; + const auto block_size_index{node.param().block_size_index}; + + auto block_size = _ctx.at(block_size_index).asScalar<int32_t>(); + assert(block_size > 0); + + auto output_alloc = _tensor_builder->at(output_index).get(); + auto input_alloc = _tensor_builder->at(input_index).get(); + + auto fn = nnfw::cpp14::make_unique<::arm_compute::CLDepthToSpace>(); + + fn->configure(input_alloc->handle(), output_alloc->handle(), block_size); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const model::operation::ReduceMinNode &node) +{ + const auto ofm_index{node.getOutputs().at(0)}; + const auto ifm_index{node.getInputs().at(model::operation::ReduceMinNode::Input::INPUT)}; + const auto axis_index{node.param().axis_index}; + + auto ifm_shape = 
_ctx.at(ifm_index).shape(); + auto ofm_shape = _ctx.at(ofm_index).shape(); + auto axis_shape = _ctx.at(axis_index).shape(); + + const auto ifm_rank = ifm_shape.rank(); + auto ofm_alloc = _tensor_builder->at(ofm_index).get(); + auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + std::set<uint32_t> axes; + { + const auto frontend_layout = _current_subg_layout; + const auto backend_layout = ifm_alloc->layout(); + switch (axis_shape.rank()) + { + case 0: // scalar + { + auto axis_value = _ctx.at(axis_index).asScalar<int32_t>(); + if (axis_value < 0) + { + axis_value += ifm_rank; + } + axes.insert(::neurun::backend::acl_common::ToARMComputeAxis(ifm_rank, axis_value, + frontend_layout, backend_layout) + .value()); + break; + } + case 1: // vector + { + const auto axis_base = _ctx.at(axis_index).data().base(); + const int axis_size = axis_shape.num_elements(); + + // If axis's data does not exist as constant values and can be gotten as input data, we have + // to find a way to infer output shape when sinking output. 
+ assert(axis_base != nullptr); + for (int32_t n = 0; n < axis_size; ++n) + { + int32_t axis_value = *(reinterpret_cast<const int32_t *>(axis_base) + n); + if (axis_value < 0) + { + axis_value += ifm_rank; + } + axes.insert(::neurun::backend::acl_common::ToARMComputeAxis( + ifm_rank, axis_value, frontend_layout, backend_layout) + .value()); + } + break; + } + default: + throw std::runtime_error("Not supported"); + break; + } + } + + auto fn = nnfw::cpp14::make_unique<::arm_compute::CLReduceOperation>(); + + fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), axes, + ::arm_compute::ReduceOperation::MIN); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const model::operation::SplitNode &node) +{ + const auto ifm_index{node.getInputs().at(model::operation::SplitNode::Input::INPUT)}; + const auto axis_index{node.param().axis_index}; + const auto num_of_splits_index{node.param().num_of_splits_index}; + + assert(_ctx.at(num_of_splits_index).asScalar<unsigned int>() == node.getOutputs().size()); + + const auto ifm_rank = _ctx.at(ifm_index).shape().rank(); + std::vector<model::OperandIndex> output_indexes; + for (const auto &output : node.getOutputs()) + output_indexes.emplace_back(output); + + auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + std::vector<arm_compute::ICLTensor *> output_allocs; + for (const auto &ofm_ind : output_indexes) + output_allocs.emplace_back(_tensor_builder->at(ofm_ind).get()->handle()); + + const auto frontend_layout = _current_subg_layout; + const auto backend_layout = ifm_alloc->layout(); + auto axis = _ctx.at(axis_index).asScalar<int32_t>(); + if (axis < 0) + axis += ifm_rank; + axis = acl_common::ToARMComputeAxis(ifm_rank, axis, frontend_layout, backend_layout).value(); + + auto fn = nnfw::cpp14::make_unique<::arm_compute::CLSplit>(); + + fn->configure(ifm_alloc->handle(), output_allocs, axis); + + auto acl_fn = asAclFunction(std::move(fn)); + + 
_execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const model::operation::UnpackNode &node) +{ + const auto input_index{node.getInputs().at(model::operation::UnpackNode::Input::INPUT)}; + auto axis{node.param().axis}; + + const auto input_rank = _ctx.at(input_index).shape().rank(); + + std::vector<model::OperandIndex> output_indexes; + for (const auto &output_index : node.getOutputs()) + output_indexes.emplace_back(output_index); + + auto input = _tensor_builder->at(input_index).get()->handle(); + std::vector<arm_compute::ICLTensor *> outputs; + for (const auto &output_index : output_indexes) + outputs.emplace_back(_tensor_builder->at(output_index)->handle()); + + const auto frontend_layout = _current_subg_layout; + const auto backend_layout = _tensor_builder->at(input_index).get()->layout(); + if (axis < 0) + axis += input_rank; + axis = acl_common::ToARMComputeAxis(input_rank, axis, frontend_layout, backend_layout).value(); + + auto fn = nnfw::cpp14::make_unique<::arm_compute::CLUnstack>(); + + fn->configure(input, outputs, axis); + + _execution_builder->append(asAclFunction(std::move(fn))); +} + +void KernelGenerator::visit(const model::operation::PadNode &node) +{ + const auto input_index{node.getInputs().at(model::operation::PadNode::Input::INPUT)}; + const auto pad_index{node.getInputs().at(model::operation::PadNode::Input::PAD)}; + const auto output_index{node.getOutputs().at(0)}; + assert(_ctx.at(pad_index).isConstant()); + + auto rank = _ctx.at(pad_index).shape().dim(0); + auto pad_base = _ctx.at(pad_index).data().base(); + + auto input_type = _ctx.at(input_index).typeInfo(); + auto data_type = acl_common::asDataType(input_type.type()); + auto quant_info = ::arm_compute::QuantizationInfo(input_type.scale(), input_type.offset()); + const auto pixel_value = ::arm_compute::PixelValue(0, data_type, quant_info); + + auto input = _tensor_builder->at(input_index).get()->handle(); + auto output = 
_tensor_builder->at(output_index).get()->handle(); + + ::arm_compute::PaddingList padding_list; + padding_list.resize(rank); + for (int32_t n = 0; n < rank; ++n) + { + const int32_t *from = reinterpret_cast<const int32_t *>(pad_base) + (n * 2); + + const auto frontend_layout = _current_subg_layout; + const auto backend_layout = _tensor_builder->at(input_index).get()->layout(); + const auto axis = + acl_common::ToARMComputeAxis(rank, n, frontend_layout, backend_layout).value(); + padding_list[axis] = ::arm_compute::PaddingInfo{from[0], from[1]}; + } + auto fn = nnfw::cpp14::make_unique<::arm_compute::CLPadLayer>(); + fn->configure(input, output, padding_list, pixel_value); + + _execution_builder->append(asAclFunction(std::move(fn))); +} + +} // namespace acl_cl +} // namespace backend +} // namespace neurun diff --git a/runtimes/neurun/backend/acl_cl/KernelGenerator.h b/runtimes/neurun/backend/acl_cl/KernelGenerator.h new file mode 100644 index 000000000..db9bf4199 --- /dev/null +++ b/runtimes/neurun/backend/acl_cl/KernelGenerator.h @@ -0,0 +1,105 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __NEURUN_BACKEND_ACL_CL_KERNEL_GENERATOR_H__ +#define __NEURUN_BACKEND_ACL_CL_KERNEL_GENERATOR_H__ + +#include <backend/IKernelGenerator.h> + +#include "model/Operands.h" +#include "TensorBuilder.h" + +namespace neurun +{ +namespace backend +{ +namespace acl_cl +{ + +class KernelGenerator : public IKernelGenerator +{ +public: + KernelGenerator(const neurun::model::Operands &ctx, + const std::shared_ptr<TensorBuilder> &tensor_builder); + + void visit(const model::Subgraph &) override; + void visit(const model::operation::Conv2DNode &) override; + void visit(const model::operation::DepthwiseConv2DNode &) override; + void visit(const model::operation::MaxPool2DNode &) override; + void visit(const model::operation::AvgPool2DNode &) override; + void visit(const model::operation::ConcatNode &) override; + void visit(const model::operation::FullyConnectedNode &) override; + void visit(const model::operation::MulNode &) override; + void visit(const model::operation::ReduceSumNode &) override; + void visit(const model::operation::ReshapeNode &) override; + void visit(const model::operation::SqueezeNode &) override; + void visit(const model::operation::TanhNode &) override; + void visit(const model::operation::SoftmaxNode &) override; + void visit(const model::operation::StridedSliceNode &) override; + void visit(const model::operation::TransposeNode &) override; + void visit(const model::operation::AddNode &) override; + void visit(const model::operation::SubNode &) override; + void visit(const model::operation::CastNode &) override; + void visit(const model::operation::DivNode &) override; + void visit(const model::operation::ExpNode &) override; + void visit(const model::operation::LogisticNode &) override; + void visit(const model::operation::ReduceMaxNode &) override; + void visit(const model::operation::ComparisonNode &) override; + void visit(const model::operation::LogicalAndNode &) override; + void visit(const model::operation::LSTMNode &) override; 
+ void visit(const model::operation::RSQRTNode &) override; + void visit(const model::operation::ReLUNode &) override; + void visit(const model::operation::ResizeBilinearNode &) override; + void visit(const model::operation::ReLU1Node &) override; + void visit(const model::operation::ReLU6Node &) override; + void visit(const model::operation::RNNNode &) override; + void visit(const model::operation::FloorNode &) override; + void visit(const model::operation::SpaceToDepthNode &) override; + void visit(const model::operation::L2Pool2DNode &) override; + void visit(const model::operation::EmbeddingLookupNode &) override; + void visit(const model::operation::L2NormalizationNode &) override; + void visit(const model::operation::HashtableLookupNode &) override; + void visit(const model::operation::PReLUNode &) override; + void visit(const model::operation::TransposeConvNode &) override; + void visit(const model::operation::SQRTNode &) override; + void visit(const model::operation::LogicalOrNode &) override; + void visit(const model::operation::LogicalNotNode &) override; + void visit(const model::operation::SquaredDifferenceNode &) override; + void visit(const model::operation::TopKV2Node &) override; + void visit(const model::operation::GatherNode &) override; + void visit(const model::operation::NegNode &) override; + void visit(const model::operation::AbsNode &) override; + void visit(const model::operation::ArgMaxNode &) override; + void visit(const model::operation::DequantizeNode &) override; + void visit(const model::operation::MeanNode &) override; + void visit(const model::operation::LocalResponseNormalizationNode &) override; + void visit(const model::operation::DepthToSpaceNode &) override; + void visit(const model::operation::ReduceMinNode &) override; + void visit(const model::operation::SplitNode &) override; + void visit(const model::operation::UnpackNode &) override; + void visit(const model::operation::PadNode &) override; + +private: + const 
neurun::model::Operands &_ctx; + std::shared_ptr<TensorBuilder> _tensor_builder; + model::Layout _current_subg_layout; +}; + +} // namespace acl_cl +} // namespace backend +} // namespace neurun + +#endif // __NEURUN_BACKEND_ACL_CL_KERNEL_GENERATOR_H__ diff --git a/runtimes/neurun/backend/acl_cl/PluginClassesAllocator.cc b/runtimes/neurun/backend/acl_cl/PluginClassesAllocator.cc new file mode 100644 index 000000000..ac3f0acff --- /dev/null +++ b/runtimes/neurun/backend/acl_cl/PluginClassesAllocator.cc @@ -0,0 +1,33 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include <util/logging.h> + +#include "Backend.h" + +extern "C" { +neurun::backend::Backend *neurun_backend_create() +{ + VERBOSE(neurun_backend_create) << "'acl_cl' loaded\n"; + return new neurun::backend::acl_cl::Backend; +} + +void neurun_backend_destroy(neurun::backend::Backend *backend) +{ + VERBOSE(neurun_backend_destroy) << "'acl_cl' unloaded\n"; + delete backend; +} +} diff --git a/runtimes/neurun/backend/acl_cl/ShapeFixer.cc b/runtimes/neurun/backend/acl_cl/ShapeFixer.cc new file mode 100644 index 000000000..e6744cc24 --- /dev/null +++ b/runtimes/neurun/backend/acl_cl/ShapeFixer.cc @@ -0,0 +1,361 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd.
All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "ShapeFixer.h" + +#include <arm_compute/runtime/CL/CLFunctions.h> // Include all ARM Compute CL functions +#include <arm_compute/runtime/CL/CLFunctionsEx.h> // Include all ARM Compute EX CL functions +#include <arm_compute/runtime/misc/functions/GenericGather.h> +#include <arm_compute/runtime/misc/functions/GenericReshapeLayer.h> + +#include <AclFunction.h> +#include <Convert.h> +#include <Swizzle.h> + +#include "kernel/ConcatLayer.h" +#include "model/Index.h" +#include "model/DataType.h" +#include "model/InternalType.h" +#include "compiler/IExecutionBuilder.h" +#include "exec/NopFunction.h" +#include "util/logging.h" +#include "util/Utils.h" +#include "util/Padding.h" + +using ::neurun::compiler::IExecutionBuilder; + +namespace neurun +{ +namespace backend +{ +namespace acl_cl +{ + +using ::neurun::backend::acl_common::asAclFunction; + +ShapeFixer::ShapeFixer(const neurun::model::Operands &ctx, + const std::shared_ptr<TensorBuilder> &tensor_builder) + : _ctx(ctx), _tensor_builder(tensor_builder) +{ + assert(tensor_builder); +} + +void ShapeFixer::visit(const model::operation::CastNode &) { /* DO NOTHING */} + +void ShapeFixer::visit(const model::operation::Conv2DNode &) { /* DO NOTHING */} + +void ShapeFixer::visit(const model::operation::DepthwiseConv2DNode &) { /* DO NOTHING */} + +void ShapeFixer::visit(const model::operation::MaxPool2DNode &) { /* DO NOTHING */} + 
+void ShapeFixer::visit(const model::operation::AvgPool2DNode &) { /* DO NOTHING */} + +void ShapeFixer::visit(const model::operation::ConcatNode &node) +{ + const auto ofm_index{node.getOutputs().at(0)}; + _tensor_builder->dimCorrection(ofm_index, false); + for (const auto &input : node.getInputs()) + _tensor_builder->dimCorrection(input, false); +} + +void ShapeFixer::visit(const model::operation::FullyConnectedNode &node) +{ + using model::operation::FullyConnectedNode; + const auto input_index{node.getInputs().at(FullyConnectedNode::Input::INPUT)}; + const auto input_rank = _ctx.at(input_index).shape().rank(); + if (input_rank == 4) + _tensor_builder->dimCorrection(input_index, false); +} + +void ShapeFixer::visit(const model::operation::MulNode &node) +{ + const auto lhs_index{node.getInputs().at(model::operation::MulNode::Input::LHS)}; + const auto rhs_index{node.getInputs().at(model::operation::MulNode::Input::RHS)}; + + if (!(_ctx.at(lhs_index).shape() == _ctx.at(rhs_index).shape())) + { + const auto broadcast_rank = + std::max(_ctx.at(lhs_index).shape().rank(), _ctx.at(rhs_index).shape().rank()); + + // TODO remove const_cast later. 
For example, _ctx may need to be a non const variable or + // a node to extend shape may be inserted in front of this operation + const_cast<::neurun::model::Shape &>(_ctx.at(lhs_index).shape()).extendRank(broadcast_rank); + const_cast<::neurun::model::Shape &>(_ctx.at(rhs_index).shape()).extendRank(broadcast_rank); + } +} + +void ShapeFixer::visit(const model::operation::ReduceSumNode &) { /* DO NOTHING */} + +void ShapeFixer::visit(const model::operation::ReshapeNode &node) +{ + const auto output_index{node.getOutputs().at(0)}; + const auto input_index{node.getInputs().at(model::operation::ReshapeNode::Input::INPUT)}; + _tensor_builder->dimCorrection(input_index, false); + _tensor_builder->dimCorrection(output_index, false); +} + +void ShapeFixer::visit(const model::operation::SqueezeNode &node) +{ + const auto output_index{node.getOutputs().at(0)}; + const auto input_index{node.getInputs().at(model::operation::SqueezeNode::Input::INPUT)}; + _tensor_builder->dimCorrection(input_index, false); + _tensor_builder->dimCorrection(output_index, false); +} + +void ShapeFixer::visit(const model::operation::TanhNode &) { /* DO NOTHING */} + +void ShapeFixer::visit(const model::operation::SoftmaxNode &) { /* DO NOTHING */} + +void ShapeFixer::visit(const model::operation::StridedSliceNode &) { /* DO NOTHING */} + +void ShapeFixer::visit(const model::operation::TransposeNode &) { /* DO NOTHING */} + +void ShapeFixer::visit(const model::operation::AddNode &node) +{ + const auto lhs_index{node.getInputs().at(model::operation::AddNode::Input::LHS)}; + const auto rhs_index{node.getInputs().at(model::operation::AddNode::Input::RHS)}; + + if (!(_ctx.at(lhs_index).shape() == _ctx.at(rhs_index).shape())) + { + const auto broadcast_rank = + std::max(_ctx.at(lhs_index).shape().rank(), _ctx.at(rhs_index).shape().rank()); + const_cast<::neurun::model::Shape &>(_ctx.at(lhs_index).shape()).extendRank(broadcast_rank); + const_cast<::neurun::model::Shape 
&>(_ctx.at(rhs_index).shape()).extendRank(broadcast_rank); + } +} + +void ShapeFixer::visit(const model::operation::SubNode &node) +{ + const auto lhs_index{node.getInputs().at(model::operation::SubNode::Input::LHS)}; + const auto rhs_index{node.getInputs().at(model::operation::SubNode::Input::RHS)}; + + if (!(_ctx.at(lhs_index).shape() == _ctx.at(rhs_index).shape())) + { + const auto broadcast_rank = + std::max(_ctx.at(lhs_index).shape().rank(), _ctx.at(rhs_index).shape().rank()); + + // TODO remove const_cast later. For example, _ctx may need to be a non const variable or + // a node to extend shape may be inserted in front of this operation + const_cast<::neurun::model::Shape &>(_ctx.at(lhs_index).shape()).extendRank(broadcast_rank); + const_cast<::neurun::model::Shape &>(_ctx.at(rhs_index).shape()).extendRank(broadcast_rank); + } +} + +void ShapeFixer::visit(const model::operation::DivNode &node) +{ + const auto lhs_index{node.getInputs().at(model::operation::DivNode::Input::LHS)}; + const auto rhs_index{node.getInputs().at(model::operation::DivNode::Input::RHS)}; + + if (!(_ctx.at(lhs_index).shape() == _ctx.at(rhs_index).shape())) + { + const auto broadcast_rank = + std::max(_ctx.at(lhs_index).shape().rank(), _ctx.at(rhs_index).shape().rank()); + + // TODO remove const_cast later. 
For example, _ctx may need to be a non const variable or + // a node to extend shape may be inserted in front of this operation + const_cast<::neurun::model::Shape &>(_ctx.at(lhs_index).shape()).extendRank(broadcast_rank); + const_cast<::neurun::model::Shape &>(_ctx.at(rhs_index).shape()).extendRank(broadcast_rank); + } +} + +void ShapeFixer::visit(const model::operation::ExpNode &) { /* DO NOTHING */} + +void ShapeFixer::visit(const model::operation::LogisticNode &) { /* DO NOTHING */} + +void ShapeFixer::visit(const model::operation::LogicalAndNode &node) +{ + const auto input0_index{node.getInputs().at(model::operation::LogicalAndNode::Input::INPUT0)}; + const auto input1_index{node.getInputs().at(model::operation::LogicalAndNode::Input::INPUT1)}; + + if (!(_ctx.at(input0_index).shape() == _ctx.at(input1_index).shape())) + { + const auto broadcast_rank = + std::max(_ctx.at(input0_index).shape().rank(), _ctx.at(input1_index).shape().rank()); + + // TODO remove const_cast later. For example, _ctx may need to be a non const variable or + // a node to extend shape may be inserted in front of this operation + const_cast<::neurun::model::Shape &>(_ctx.at(input0_index).shape()).extendRank(broadcast_rank); + const_cast<::neurun::model::Shape &>(_ctx.at(input1_index).shape()).extendRank(broadcast_rank); + } +} + +void ShapeFixer::visit(const model::operation::LSTMNode &) { /* DO NOTHING */} + +void ShapeFixer::visit(const model::operation::ReduceMaxNode &) { /* DO NOTHING */} + +void ShapeFixer::visit(const model::operation::ComparisonNode &node) +{ + const auto input0_index{node.getInputs().at(model::operation::ComparisonNode::Input::INPUT0)}; + const auto input1_index{node.getInputs().at(model::operation::ComparisonNode::Input::INPUT1)}; + + if (!(_ctx.at(input0_index).shape() == _ctx.at(input1_index).shape())) + { + const auto broadcast_rank = + std::max(_ctx.at(input0_index).shape().rank(), _ctx.at(input1_index).shape().rank()); + + // TODO remove const_cast later. 
For example, _ctx may need to be a non const variable or + // a node to extend shape may be inserted in front of this operation + const_cast<::neurun::model::Shape &>(_ctx.at(input0_index).shape()).extendRank(broadcast_rank); + const_cast<::neurun::model::Shape &>(_ctx.at(input1_index).shape()).extendRank(broadcast_rank); + } +} + +void ShapeFixer::visit(const model::operation::RSQRTNode &) { /* DO NOTHING */} + +void ShapeFixer::visit(const model::operation::ReLUNode &) { /* DO NOTHING */} + +void ShapeFixer::visit(const model::operation::ResizeBilinearNode &) { /* DO NOTHING */} + +void ShapeFixer::visit(const model::operation::ReLU1Node &) { /* DO NOTHING */} + +void ShapeFixer::visit(const model::operation::ReLU6Node &) { /* DO NOTHING */} + +void ShapeFixer::visit(const model::operation::RNNNode &) { /* DO NOTHING */} + +void ShapeFixer::visit(const model::operation::FloorNode &) { /* DO NOTHING */} + +void ShapeFixer::visit(const model::operation::SpaceToDepthNode &node) +{ + const auto ofm_index{node.getOutputs().at(0)}; + const auto ifm_index{node.getInputs().at(model::operation::SpaceToDepthNode::Input::INPUT)}; + _tensor_builder->dimCorrection(ofm_index, false); + _tensor_builder->dimCorrection(ifm_index, false); +} + +void ShapeFixer::visit(const model::operation::L2Pool2DNode &) { /* DO NOTHING */} + +void ShapeFixer::visit(const model::operation::EmbeddingLookupNode &node) +{ + const auto output_index{node.getOutputs().at(0)}; + const auto values_index{ + node.getInputs().at(model::operation::EmbeddingLookupNode::Input::VALUES)}; + _tensor_builder->dimCorrection(values_index, false); + _tensor_builder->dimCorrection(output_index, false); +} + +void ShapeFixer::visit(const model::operation::L2NormalizationNode &) { /* DO NOTHING */} + +void ShapeFixer::visit(const model::operation::HashtableLookupNode &) { /* DO NOTHING */} + +void ShapeFixer::visit(const model::operation::PReLUNode &node) +{ + const auto 
ifm_index{node.getInputs().at(model::operation::PReLUNode::Input::INPUT)}; + const auto alpha_index{node.getInputs().at(model::operation::PReLUNode::Input::ALPHA)}; + + if (!(_ctx.at(ifm_index).shape() == _ctx.at(alpha_index).shape())) + { + const auto broadcast_rank = + std::max(_ctx.at(ifm_index).shape().rank(), _ctx.at(alpha_index).shape().rank()); + const_cast<::neurun::model::Shape &>(_ctx.at(ifm_index).shape()).extendRank(broadcast_rank); + const_cast<::neurun::model::Shape &>(_ctx.at(alpha_index).shape()).extendRank(broadcast_rank); + } +} + +void ShapeFixer::visit(const model::operation::TransposeConvNode &) { /* DO NOTHING */} + +void ShapeFixer::visit(const model::operation::SQRTNode &) { /* DO NOTHING */} + +void ShapeFixer::visit(const model::operation::LogicalOrNode &node) +{ + const auto input0_index{node.getInputs().at(model::operation::LogicalOrNode::Input::INPUT0)}; + const auto input1_index{node.getInputs().at(model::operation::LogicalOrNode::Input::INPUT1)}; + + if (!(_ctx.at(input0_index).shape() == _ctx.at(input1_index).shape())) + { + const auto broadcast_rank = + std::max(_ctx.at(input0_index).shape().rank(), _ctx.at(input1_index).shape().rank()); + const_cast<::neurun::model::Shape &>(_ctx.at(input0_index).shape()).extendRank(broadcast_rank); + const_cast<::neurun::model::Shape &>(_ctx.at(input1_index).shape()).extendRank(broadcast_rank); + } +} + +void ShapeFixer::visit(const model::operation::LogicalNotNode &) { /* DO NOTHING */} + +void ShapeFixer::visit(const model::operation::SquaredDifferenceNode &node) +{ + const auto lhs_index{node.getInputs().at(model::operation::SquaredDifferenceNode::Input::LHS)}; + const auto rhs_index{node.getInputs().at(model::operation::SquaredDifferenceNode::Input::RHS)}; + + if (!(_ctx.at(lhs_index).shape() == _ctx.at(rhs_index).shape())) + { + const auto broadcast_rank = + std::max(_ctx.at(lhs_index).shape().rank(), _ctx.at(rhs_index).shape().rank()); + const_cast<neurun::model::Shape 
&>(_ctx.at(lhs_index).shape()).extendRank(broadcast_rank); + const_cast<neurun::model::Shape &>(_ctx.at(rhs_index).shape()).extendRank(broadcast_rank); + } +} + +void ShapeFixer::visit(const model::operation::TopKV2Node &) { /* DO NOTHING */} + +void ShapeFixer::visit(const model::operation::GatherNode &node) +{ + const auto ofm_index{node.getOutputs().at(0)}; + const auto ifm_index{node.getInputs().at(model::operation::GatherNode::Input::INPUT)}; + const auto indices_index{node.getInputs().at(model::operation::GatherNode::Input::INDICES)}; + _tensor_builder->dimCorrection(ofm_index, false); + _tensor_builder->dimCorrection(ifm_index, false); + _tensor_builder->dimCorrection(indices_index, false); +} + +void ShapeFixer::visit(const model::operation::NegNode &) { /* DO NOTHING */} + +void ShapeFixer::visit(const model::operation::AbsNode &) { /* DO NOTHING */} + +void ShapeFixer::visit(const model::operation::ArgMaxNode &node) +{ + const auto ofm_index{node.getOutputs().at(0)}; + const auto ifm_index{node.getInputs().at(model::operation::ArgMaxNode::Input::INPUT)}; + _tensor_builder->dimCorrection(ofm_index, false); + _tensor_builder->dimCorrection(ifm_index, false); +} + +void ShapeFixer::visit(const model::operation::DequantizeNode &) { /* DO NOTHING */} + +void ShapeFixer::visit(const model::operation::MeanNode &) { /* DO NOTHING */} + +void ShapeFixer::visit(const model::operation::LocalResponseNormalizationNode &) { /* DO NOTHING */} + +void ShapeFixer::visit(const model::operation::DepthToSpaceNode &) { /* DO NOTHING */} + +void ShapeFixer::visit(const model::operation::ReduceMinNode &) { /* DO NOTHING */} + +void ShapeFixer::visit(const model::operation::SplitNode &node) +{ + const auto input_index{node.getInputs().at(model::operation::SplitNode::Input::INPUT)}; + _tensor_builder->dimCorrection(input_index, false); + for (const auto &output : node.getOutputs()) + _tensor_builder->dimCorrection(output, false); +} + +void ShapeFixer::visit(const 
model::operation::UnpackNode &node) +{ + const auto input_index{node.getInputs().at(model::operation::UnpackNode::Input::INPUT)}; + _tensor_builder->dimCorrection(input_index, false); + for (const auto &output_index : node.getOutputs()) + _tensor_builder->dimCorrection(output_index, false); +} + +void ShapeFixer::visit(const model::operation::PadNode &node) +{ + const auto input_index{node.getInputs().at(model::operation::PadNode::Input::INPUT)}; + const auto output_index{node.getOutputs().at(0)}; + _tensor_builder->dimCorrection(input_index, false); + _tensor_builder->dimCorrection(output_index, false); +} + +} // namespace acl_cl +} // namespace backend +} // namespace neurun diff --git a/runtimes/neurun/backend/acl_cl/ShapeFixer.h b/runtimes/neurun/backend/acl_cl/ShapeFixer.h new file mode 100644 index 000000000..519d1bafb --- /dev/null +++ b/runtimes/neurun/backend/acl_cl/ShapeFixer.h @@ -0,0 +1,105 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __NEURUN_BACKEND_ACL_CL_SHAPE_FIXER_H__ +#define __NEURUN_BACKEND_ACL_CL_SHAPE_FIXER_H__ + +#include <backend/IShapeFixer.h> + +#include "model/Operands.h" +#include "TensorBuilder.h" + +namespace neurun +{ +namespace backend +{ +namespace acl_cl +{ + +class ShapeFixer : public IShapeFixer +{ +public: + ShapeFixer(const neurun::model::Operands &ctx, + const std::shared_ptr<TensorBuilder> &tensor_builder); + + std::shared_ptr<ITensorBuilder> tensor_builder() override { return _tensor_builder; } + + void visit(const model::operation::Conv2DNode &) override; + void visit(const model::operation::DepthwiseConv2DNode &) override; + void visit(const model::operation::MaxPool2DNode &) override; + void visit(const model::operation::AvgPool2DNode &) override; + void visit(const model::operation::ConcatNode &) override; + void visit(const model::operation::FullyConnectedNode &) override; + void visit(const model::operation::MulNode &) override; + void visit(const model::operation::ReduceSumNode &) override; + void visit(const model::operation::ReshapeNode &) override; + void visit(const model::operation::SqueezeNode &) override; + void visit(const model::operation::TanhNode &) override; + void visit(const model::operation::SoftmaxNode &) override; + void visit(const model::operation::StridedSliceNode &) override; + void visit(const model::operation::TransposeNode &) override; + void visit(const model::operation::AddNode &) override; + void visit(const model::operation::SubNode &) override; + void visit(const model::operation::CastNode &) override; + void visit(const model::operation::DivNode &) override; + void visit(const model::operation::ExpNode &) override; + void visit(const model::operation::LogisticNode &) override; + void visit(const model::operation::ReduceMaxNode &) override; + void visit(const model::operation::ComparisonNode &) override; + void visit(const model::operation::LogicalAndNode &) override; + void visit(const model::operation::LSTMNode 
&) override; + void visit(const model::operation::RSQRTNode &) override; + void visit(const model::operation::ReLUNode &) override; + void visit(const model::operation::ResizeBilinearNode &) override; + void visit(const model::operation::ReLU1Node &) override; + void visit(const model::operation::ReLU6Node &) override; + void visit(const model::operation::RNNNode &) override; + void visit(const model::operation::FloorNode &) override; + void visit(const model::operation::SpaceToDepthNode &) override; + void visit(const model::operation::L2Pool2DNode &) override; + void visit(const model::operation::EmbeddingLookupNode &) override; + void visit(const model::operation::L2NormalizationNode &) override; + void visit(const model::operation::HashtableLookupNode &) override; + void visit(const model::operation::PReLUNode &) override; + void visit(const model::operation::TransposeConvNode &) override; + void visit(const model::operation::SQRTNode &) override; + void visit(const model::operation::LogicalOrNode &) override; + void visit(const model::operation::LogicalNotNode &) override; + void visit(const model::operation::SquaredDifferenceNode &) override; + void visit(const model::operation::TopKV2Node &) override; + void visit(const model::operation::GatherNode &) override; + void visit(const model::operation::NegNode &) override; + void visit(const model::operation::AbsNode &) override; + void visit(const model::operation::ArgMaxNode &) override; + void visit(const model::operation::DequantizeNode &) override; + void visit(const model::operation::MeanNode &) override; + void visit(const model::operation::LocalResponseNormalizationNode &) override; + void visit(const model::operation::DepthToSpaceNode &) override; + void visit(const model::operation::ReduceMinNode &) override; + void visit(const model::operation::SplitNode &) override; + void visit(const model::operation::UnpackNode &) override; + void visit(const model::operation::PadNode &) override; + +private: + 
const neurun::model::Operands &_ctx; + std::shared_ptr<TensorBuilder> _tensor_builder; +}; + +} // namespace acl_cl +} // namespace backend +} // namespace neurun + +#endif // __NEURUN_BACKEND_ACL_CL_SHAPE_FIXER_H__ diff --git a/runtimes/neurun/backend/acl_cl/TensorBuilder.h b/runtimes/neurun/backend/acl_cl/TensorBuilder.h new file mode 100644 index 000000000..8ce69a6c2 --- /dev/null +++ b/runtimes/neurun/backend/acl_cl/TensorBuilder.h @@ -0,0 +1,42 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __NEURUN_BACKEND_ACL_CL_TENSOR_BUILDER_H__ +#define __NEURUN_BACKEND_ACL_CL_TENSOR_BUILDER_H__ + +#include <TemplTensorBuilder.h> + +#include "operand/CLTensor.h" +#include "operand/CLSubTensor.h" +#include "operand/Object.h" + +namespace neurun +{ +namespace backend +{ +namespace acl_cl +{ + +using TensorBuilder = + ::neurun::backend::acl_common::TemplTensorBuilder<::neurun::backend::acl_cl::operand::ICLTensor, + operand::CLTensor, operand::CLSubTensor, + operand::Object>; + +} // namespace acl_cl +} // namespace backend +} // namespace neurun + +#endif // __NEURUN_BACKEND_ACL_CL_TENSOR_BUILDER_H__ diff --git a/runtimes/neurun/backend/acl_cl/TensorManager.h b/runtimes/neurun/backend/acl_cl/TensorManager.h new file mode 100644 index 000000000..10145bb9a --- /dev/null +++ b/runtimes/neurun/backend/acl_cl/TensorManager.h @@ -0,0 +1,84 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __NEURUN_BACKEND_ACL_CL_TENSOR_MANAGER_H__ +#define __NEURUN_BACKEND_ACL_CL_TENSOR_MANAGER_H__ + +#include <arm_compute/runtime/CL/CLBufferAllocator.h> +#include <arm_compute/runtime/PoolManager.h> +#include <arm_compute/runtime/BlobLifetimeManager.h> +#include <arm_compute/runtime/MemoryManagerOnDemand.h> +#include <arm_compute/runtime/CL/CLMemoryGroup.h> + +#include <AclMemoryManager.h> +#include <AclLinearMemoryManager.h> +#include <AclInternalBufferManager.h> +#include <AclTensorManager.h> + +#include "operand/CLTensor.h" +#include "operand/CLSubTensor.h" +#include "operand/Object.h" + +#include "util/logging.h" + +namespace neurun +{ +namespace backend +{ +namespace acl_cl +{ + +using MemoryManager = + ::neurun::backend::acl_common::AclMemoryManager<operand::ICLTensor, operand::CLTensor, + operand::CLSubTensor, operand::Object>; + +using LinearMemoryManager = ::neurun::backend::acl_common::AclLinearMemoryManager< + operand::ICLTensor, operand::CLTensor, operand::CLSubTensor, operand::Object, + ::arm_compute::MemoryManagerOnDemand, ::arm_compute::PoolManager, + ::arm_compute::BlobLifetimeManager, ::arm_compute::CLBufferAllocator, + ::arm_compute::CLMemoryGroup>; + +using InternalBufferManager = ::neurun::backend::acl_common::AclInternalBufferManager< + ::arm_compute::MemoryManagerOnDemand, ::arm_compute::PoolManager, + ::arm_compute::BlobLifetimeManager, ::arm_compute::CLBufferAllocator>; + +using TensorManager = + ::neurun::backend::acl_common::AclTensorManager<::neurun::backend::acl_cl::operand::ICLTensor, + operand::CLTensor, operand::CLSubTensor, + operand::Object>; + +inline TensorManager *createTensorManager() +{ + const std::string executor_str = util::getConfigString(util::config::EXECUTOR); + + if (executor_str == "Linear") + { + VERBOSE(acl_cl_createTensorManager) << "AclTensorManager as Linear" << std::endl; + return new TensorManager(new MemoryManager(), new LinearMemoryManager(), + new InternalBufferManager()); + } + else + {
VERBOSE(acl_cl_createTensorManager) << "AclTensorManager" << std::endl; + return new TensorManager(new MemoryManager(), new MemoryManager(), new InternalBufferManager()); + } +} + +} // namespace acl_cl +} // namespace backend +} // namespace neurun + +#endif // __NEURUN_BACKEND_ACL_CL_TENSOR_MANAGER_H__ diff --git a/runtimes/neurun/backend/acl_cl/kernel/ConcatLayer.cc b/runtimes/neurun/backend/acl_cl/kernel/ConcatLayer.cc new file mode 100644 index 000000000..aa1fd9aed --- /dev/null +++ b/runtimes/neurun/backend/acl_cl/kernel/ConcatLayer.cc @@ -0,0 +1,165 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "ConcatLayer.h" + +#include <arm_compute/runtime/CL/CLScheduler.h> + +#include "util/feature/nchw/View.h" +#include "util/logging.h" + +namespace +{ + +inline bool matchSizeExceptAxis(const ::neurun::backend::acl_cl::operand::ICLTensor *t1, + const ::neurun::backend::acl_cl::operand::ICLTensor *t2, + uint32_t axis) +{ + assert(t1->num_dimensions() <= 4); + assert(t2->num_dimensions() <= 4); + + for (uint32_t i = 0; i < 4; i++) + { + if (axis == i) + continue; + if (t1->info()->dimension(i) != t2->info()->dimension(i)) + return false; + } + return true; +} + +} // namespace {anonymous} + +namespace neurun +{ +namespace backend +{ +namespace acl_cl +{ +namespace kernel +{ + +ConcatLayer::ConcatLayer() + : _input_allocs(), _output_alloc(nullptr), _axis(0), _input_type(arm_compute::DataType::F32) +{ + // DO NOTHING +} + +template <typename T> bool ConcatLayer::concatenate() +{ + // Input and output size check + { + // NOTE Support only tensor with dimension 4 or less + + uint32_t axis_sum = 0; + + for (auto input : _input_allocs) + { + assert(_output_alloc->ptr()->layout() == input->ptr()->layout()); + assert(matchSizeExceptAxis(_output_alloc->ptr(), input->ptr(), _axis)); + axis_sum += input->ptr()->info()->dimension(_axis); + } + + assert(_output_alloc->ptr()->info()->dimension(_axis) == axis_sum); + } + + VERBOSE(Concat_RUN) << "START Concat" << std::endl; + + // Perform operation + { + uint32_t axis_offset = 0; + + auto outout_fn = [&](::neurun::backend::operand::ITensor &out_tensor) { + for (auto input : _input_allocs) + { + auto &out_cl_tensor = + static_cast<::neurun::backend::acl_cl::operand::ICLTensor &>(out_tensor); + auto input_fn = [&](::neurun::backend::operand::ITensor &in_tensor) { + auto &in_cl_tensor = + static_cast<::neurun::backend::acl_cl::operand::ICLTensor &>(in_tensor); + for (uint32_t i = 0; i < in_cl_tensor.info()->dimension(0); i++) + { + for (uint32_t j = 0; j < in_cl_tensor.info()->dimension(1); j++) + { + for (uint32_t k = 
0; k < in_cl_tensor.info()->dimension(2); k++) + { + for (uint32_t l = 0; l < in_cl_tensor.info()->dimension(3); l++) + { + int32_t io = (_axis == 0) ? axis_offset : 0; + int32_t jo = (_axis == 1) ? axis_offset : 0; + int32_t ko = (_axis == 2) ? axis_offset : 0; + int32_t lo = (_axis == 3) ? axis_offset : 0; + T value = + *reinterpret_cast<T *>(in_cl_tensor.handle()->ptr_to_element({i, j, k, l})); + *reinterpret_cast<T *>(out_cl_tensor.handle()->ptr_to_element( + {i + io, j + jo, k + ko, l + lo})) = value; + } + } + } + } + if (_axis == 0) + axis_offset += in_cl_tensor.info()->dimension(0); + if (_axis == 1) + axis_offset += in_cl_tensor.info()->dimension(1); + if (_axis == 2) + axis_offset += in_cl_tensor.info()->dimension(2); + if (_axis == 3) + axis_offset += in_cl_tensor.info()->dimension(3); + }; + input->access(input_fn); + } + }; + _output_alloc->access(outout_fn); + } + + VERBOSE(Concat_RUN) << "End Concat" << std::endl; + + return true; +} + +void ConcatLayer::configure( + const std::vector<::neurun::backend::acl_cl::operand::Object *> &input_allocs, int32_t axis, + ::neurun::backend::acl_cl::operand::Object *output_alloc) +{ + _input_allocs = input_allocs; + _output_alloc = output_alloc; + + assert(axis < 4); + + // TODO Handle when axis is negative + assert(axis >= 0); + + _axis = axis; + + _input_type = input_allocs[0]->ptr()->data_type(); +} + +void ConcatLayer::run() +{ + if (_input_type == arm_compute::DataType::F32) + { + concatenate<float>(); + } + else if (_input_type == arm_compute::DataType::QASYMM8) + { + concatenate<uint8_t>(); + } +} + +} // namespace kernel +} // namespace acl_cl +} // namespace backend +} // namespace neurun diff --git a/runtimes/neurun/backend/acl_cl/kernel/ConcatLayer.h b/runtimes/neurun/backend/acl_cl/kernel/ConcatLayer.h new file mode 100644 index 000000000..ed273e297 --- /dev/null +++ b/runtimes/neurun/backend/acl_cl/kernel/ConcatLayer.h @@ -0,0 +1,66 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. 
All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __NEURUN_BACKEND_ACL_CL_KERNEL_CONCAT_LAYER_H__ +#define __NEURUN_BACKEND_ACL_CL_KERNEL_CONCAT_LAYER_H__ + +#include <arm_compute/runtime/IFunction.h> +#include <arm_compute/core/Types.h> + +#include "operand/Object.h" + +namespace neurun +{ +namespace backend +{ +namespace acl_cl +{ +namespace kernel +{ + +// +// neurun::backend::acl_cl::kernel::ConcatLayer +// A naive implementation of ConcatLayer for ACL +// + +class ConcatLayer : public ::arm_compute::IFunction +{ +public: + ConcatLayer(); + +public: + void configure(const std::vector<::neurun::backend::acl_cl::operand::Object *> &input_allocs, + int32_t axis /* NNAPI tensor axis from NHWC order */, + ::neurun::backend::acl_cl::operand::Object *output_alloc); + + void run(); + +private: + template <typename T> bool concatenate(); + +private: + std::vector<::neurun::backend::acl_cl::operand::Object *> _input_allocs; + ::neurun::backend::acl_cl::operand::Object *_output_alloc; + int32_t _axis; + arm_compute::DataType _input_type; +}; + +} // namespace kernel +} // namespace acl_cl +} // namespace backend +} // namespace neurun + +#endif // __NEURUN_BACKEND_ACL_CL_KERNEL_CONCAT_LAYER_H__ diff --git a/runtimes/neurun/backend/acl_cl/operand/CLSubTensor.cc b/runtimes/neurun/backend/acl_cl/operand/CLSubTensor.cc new file mode 100644 index 000000000..70c8829d9 --- /dev/null +++ 
b/runtimes/neurun/backend/acl_cl/operand/CLSubTensor.cc @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "CLSubTensor.h" + +namespace neurun +{ +namespace backend +{ +namespace acl_cl +{ +namespace operand +{ + +CLSubTensor::CLSubTensor(ICLTensor *parent, const arm_compute::TensorShape &tensor_shape, + const arm_compute::Coordinates &coords, size_t rank, bool extend_parent) + : _cl_sub_tensor(std::make_shared<arm_compute::CLSubTensor>(parent->handle(), tensor_shape, + coords, extend_parent)), + _rank{rank} +{ + // DO NOTHING +} + +const arm_compute::CLSubTensor *CLSubTensor::handle() const { return _cl_sub_tensor.get(); } + +arm_compute::CLSubTensor *CLSubTensor::handle() { return _cl_sub_tensor.get(); } + +} // namespace operand +} // namespace acl_cl +} // namespace backend +} // namespace neurun diff --git a/runtimes/neurun/backend/acl_cl/operand/CLSubTensor.h b/runtimes/neurun/backend/acl_cl/operand/CLSubTensor.h new file mode 100644 index 000000000..8eba3760f --- /dev/null +++ b/runtimes/neurun/backend/acl_cl/operand/CLSubTensor.h @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __NEURUN_BACKEND_ACL_CL_OPERAND_CL_SUB_TENSOR_H__ +#define __NEURUN_BACKEND_ACL_CL_OPERAND_CL_SUB_TENSOR_H__ + +#include <arm_compute/runtime/CL/CLSubTensor.h> +#include "ICLTensor.h" +#include "compiler/SubTensorInfo.h" + +namespace neurun +{ +namespace backend +{ +namespace acl_cl +{ +namespace operand +{ + +class CLSubTensor : public ICLTensor +{ +public: + CLSubTensor() = delete; + +public: + CLSubTensor(ICLTensor *parent, const arm_compute::TensorShape &tensor_shape, + const arm_compute::Coordinates &coords, size_t rank, bool extend_parent = false); + +public: + size_t num_dimensions() const final { return _rank; } + +public: + const arm_compute::CLSubTensor *handle() const override; + arm_compute::CLSubTensor *handle() override; + +public: + // This method is used to prevent the use of memcpy for SubTensor + bool has_padding() const override { return true; } + +private: + std::shared_ptr<arm_compute::CLSubTensor> _cl_sub_tensor; + size_t _rank; +}; + +} // namespace operand +} // namespace acl_cl +} // namespace backend +} // namespace neurun + +#endif // __NEURUN_BACKEND_ACL_CL_OPERAND_CL_SUB_TENSOR_H__ diff --git a/runtimes/neurun/backend/acl_cl/operand/CLTensor.cc b/runtimes/neurun/backend/acl_cl/operand/CLTensor.cc new file mode 100644 index 000000000..6153fc2e4 --- /dev/null +++ b/runtimes/neurun/backend/acl_cl/operand/CLTensor.cc @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. 
All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "CLTensor.h" + +#include <arm_compute/runtime/CL/CLScheduler.h> +#include <arm_compute/runtime/CL/CLMemory.h> +#include <arm_compute/runtime/CL/CLMemoryRegion.h> + +#include <Convert.h> + +namespace neurun +{ +namespace backend +{ +namespace acl_cl +{ +namespace operand +{ + +CLTensor::CLTensor(const arm_compute::TensorInfo &info, size_t rank) + : _cl_tensor(std::make_shared<arm_compute::CLTensor>()), _rank{rank} +{ + allocator()->init(info); +} + +const arm_compute::CLTensor *CLTensor::handle() const { return _cl_tensor.get(); } + +arm_compute::CLTensor *CLTensor::handle() { return _cl_tensor.get(); } + +arm_compute::CLTensorAllocator *CLTensor::allocator() { return _cl_tensor->allocator(); } + +void CLTensor::map(bool blocking) { _cl_tensor->map(blocking); } + +void CLTensor::unmap() { _cl_tensor->unmap(); } + +void CLTensor::setBuffer(void *host_ptr) +{ + // Constructs a Buffer on a user-supplied memory + auto buffer = cl::Buffer(arm_compute::CLScheduler::get().context(), + CL_MEM_USE_HOST_PTR | CL_MEM_READ_WRITE, info()->total_size(), host_ptr); + // import memory + allocator()->import_memory(buffer); +} + +} // namespace operand +} // namespace acl_cl +} // namespace backend +} // namespace neurun diff --git a/runtimes/neurun/backend/acl_cl/operand/CLTensor.h b/runtimes/neurun/backend/acl_cl/operand/CLTensor.h new file mode 100644 index 000000000..952851623 --- 
/*
 * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef __NEURUN_BACKEND_ACL_CL_OPERAND_CL_TENSOR_H__
#define __NEURUN_BACKEND_ACL_CL_OPERAND_CL_TENSOR_H__

#include <arm_compute/core/TensorInfo.h>
#include <arm_compute/runtime/CL/CLTensor.h>
#include <arm_compute/runtime/CL/CLScheduler.h>
#include "arm_compute/runtime/CL/CLTensorAllocator.h"
#include "ICLTensor.h"

namespace neurun
{
namespace backend
{
namespace acl_cl
{
namespace operand
{

// An ICLTensor backed by an owning arm_compute::CLTensor (as opposed to
// CLSubTensor, which only views a parent's buffer).
class CLTensor : public ICLTensor
{
public:
  CLTensor() = delete;

public:
  // Initializes the allocator with `info`; `rank` is recorded separately
  // and reported by num_dimensions().
  CLTensor(const arm_compute::TensorInfo &info, size_t rank);

public:
  // Reports the rank supplied at construction rather than asking ACL.
  size_t num_dimensions() const final { return _rank; }

public:
  const arm_compute::CLTensor *handle() const override;
  arm_compute::CLTensor *handle() override;

public:
  arm_compute::CLTensorAllocator *allocator();
  // Map/unmap the tensor into host-visible memory via the default queue.
  void map(bool blocking = true);
  void unmap();
  /** Set given buffer as the buffer of the tensor
   *
   * @note Ownership of the memory is not transferred to this object.
   *       Thus management (allocate/free) should be done by the client.
   *
   * @param[in] host_ptr Storage to be used.
   */
  void setBuffer(void *host_ptr);

private:
  std::shared_ptr<arm_compute::CLTensor> _cl_tensor;
  size_t _rank;
};

} // namespace operand
} // namespace acl_cl
} // namespace backend
} // namespace neurun

#endif // __NEURUN_BACKEND_ACL_CL_OPERAND_CL_TENSOR_H__
+ */ + +#ifndef __NEURUN_BACKEND_ACL_CL_OPERAND_I_CL_TENSOR_H__ +#define __NEURUN_BACKEND_ACL_CL_OPERAND_I_CL_TENSOR_H__ + +#include <arm_compute/core/CL/ICLTensor.h> + +#include <IACLTensor.h> + +namespace neurun +{ +namespace backend +{ +namespace acl_cl +{ +namespace operand +{ + +class ICLTensor : public acl_common::IACLTensor +{ +public: + const arm_compute::ICLTensor *handle() const override = 0; + arm_compute::ICLTensor *handle() override = 0; + +public: + void map(cl::CommandQueue &q, bool blocking = true) { return handle()->map(q, blocking); } + void unmap(cl::CommandQueue &q) { return handle()->unmap(q); } +}; + +} // namespace operand +} // namespace acl_cl +} // namespace backend +} // namespace neurun + +#endif // __NEURUN_BACKEND_ACL_CL_OPERAND_I_CL_TENSOR_H__ diff --git a/runtimes/neurun/backend/acl_cl/operand/Object.cc b/runtimes/neurun/backend/acl_cl/operand/Object.cc new file mode 100644 index 000000000..8f9b2a181 --- /dev/null +++ b/runtimes/neurun/backend/acl_cl/operand/Object.cc @@ -0,0 +1,46 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "Object.h" + +#include <arm_compute/runtime/CL/CLScheduler.h> + +namespace neurun +{ +namespace backend +{ +namespace acl_cl +{ +namespace operand +{ + +void Object::access(const std::function<void(backend::operand::ITensor &tensor)> &fn) const +{ + auto &queue = ::arm_compute::CLScheduler::get().queue(); + + // This is an optional input + if (_tensor->total_size() == 0) + return; + + _tensor->map(queue); + fn(*_tensor); + _tensor->unmap(queue); +} + +} // namespace operand +} // namespace acl_cl +} // namespace backend +} // namespace neurun diff --git a/runtimes/neurun/backend/acl_cl/operand/Object.h b/runtimes/neurun/backend/acl_cl/operand/Object.h new file mode 100644 index 000000000..a4308feed --- /dev/null +++ b/runtimes/neurun/backend/acl_cl/operand/Object.h @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __NEURUN_BACKEND_ACL_CL_OPERAND_OBJECT_H__ +#define __NEURUN_BACKEND_ACL_CL_OPERAND_OBJECT_H__ + +#include <memory> + +#include <backend/operand/IObject.h> +#include "operand/ICLTensor.h" + +namespace neurun +{ +namespace backend +{ +namespace acl_cl +{ +namespace operand +{ + +class Object : public backend::operand::IObject +{ +public: + Object() = default; + +public: + Object(const std::shared_ptr<acl_cl::operand::ICLTensor> &tensor) : _tensor{tensor} + { + // DO NOTHING + } + +public: + acl_cl::operand::ICLTensor *ptr(void) const override { return _tensor.get(); } + +private: + std::shared_ptr<acl_cl::operand::ICLTensor> _tensor; + +public: + void access(const std::function<void(backend::operand::ITensor &tensor)> &fn) const override; +}; + +} // namespace operand +} // namespace acl_cl +} // namespace backend +} // namespace neurun + +#endif // __NEURUN_BACKEND_ACL_CL_OPERAND_OBJECT_H__ |