From 323663bb115ef625642391a5a8e9b35fee8b2ae3 Mon Sep 17 00:00:00 2001
From: Hyeongseok Oh
Date: Wed, 12 Apr 2023 15:42:02 +0900
Subject: Imported Upstream version 1.22.0

---
 runtime/onert/api/include/nnfw_version.h | 2 +-
 runtime/onert/api/src/nnfw_api_internal.cc | 370 +++------
 runtime/onert/api/src/nnfw_api_internal.h | 9 +-
 runtime/onert/backend/acl_cl/KernelGenerator.cc | 2 +-
 .../onert/backend/acl_common/AclTensorBuilder.h | 20 +-
 .../cl_common/include/cl_common/BackendContext.h | 2 +-
 runtime/onert/backend/cpu/BackendContext.cc | 2 +-
 runtime/onert/backend/cpu/KernelGenerator.cc | 18 +-
 runtime/onert/backend/gpu_cl/Backend.h | 16 +-
 runtime/onert/backend/gpu_cl/BackendContext.cc | 26 +
 runtime/onert/backend/gpu_cl/BackendContext.h | 2 +
 runtime/onert/backend/gpu_cl/CMakeLists.txt | 43 +-
 runtime/onert/backend/gpu_cl/ClFunction.h | 32 +-
 runtime/onert/backend/gpu_cl/Config.h | 3 -
 runtime/onert/backend/gpu_cl/KernelGenerator.cc | 612 ++++++++------
 runtime/onert/backend/gpu_cl/KernelGenerator.h | 15 +-
 runtime/onert/backend/gpu_cl/MemoryManager.h | 115 +--
 runtime/onert/backend/gpu_cl/TensorBuilder.cc | 24 +-
 runtime/onert/backend/gpu_cl/TensorBuilder.h | 7 +-
 runtime/onert/backend/gpu_cl/TensorBuilderHelper.h | 44 -
 runtime/onert/backend/gpu_cl/TensorManager.cc | 38 +-
 runtime/onert/backend/gpu_cl/TensorManager.h | 22 +-
 runtime/onert/backend/gpu_cl/TensorRegistry.h | 2 +-
 runtime/onert/backend/gpu_cl/Utils.h | 155 ++++
 .../onert/backend/gpu_cl/ex/InferenceContextEx.h | 108 ---
 runtime/onert/backend/gpu_cl/operand/CLTensor.cc | 8 +-
 runtime/onert/backend/gpu_cl/operand/CLTensor.h | 4 +-
 runtime/onert/backend/gpu_cl/operand/ICLTensor.cc | 40 +-
 runtime/onert/backend/gpu_cl/operand/ICLTensor.h | 49 +-
 runtime/onert/backend/ruy/BackendContext.cc | 2 +-
 runtime/onert/backend/trix/BackendContext.cc | 2 +-
 runtime/onert/backend/trix/BatchThreadPool.cc | 69 ++
 runtime/onert/backend/trix/BatchThreadPool.h | 130 +++
 runtime/onert/backend/trix/Convert.cc | 54 ++
 runtime/onert/backend/trix/Convert.h | 93 +++
 runtime/onert/backend/trix/DevContext.cc | 307 +++++++
 runtime/onert/backend/trix/DevContext.h | 197 +++--
 runtime/onert/backend/trix/KernelGenerator.cc | 4 +-
 runtime/onert/backend/trix/ops/BulkLayer.cc | 137 +---
 runtime/onert/backend/trix/ops/BulkLayer.h | 3 +-
 runtime/onert/backend/xnnpack/BackendContext.cc | 2 +-
 runtime/onert/core/CMakeLists.txt | 2 +-
 .../include/backend/basic/BackendContextHelpers.h | 18 +-
 runtime/onert/core/include/compiler/Compiler.h | 107 +--
 .../onert/core/include/compiler/CompilerFactory.h | 47 ++
 .../onert/core/include/compiler/CompilerOptions.h | 91 +++
 runtime/onert/core/include/compiler/ICompiler.h | 63 ++
 runtime/onert/core/include/compiler/LoweredGraph.h | 5 -
 .../core/include/compiler/StaticShapeInferer.h | 9 +
 runtime/onert/core/include/exec/Execution.h | 119 +--
 runtime/onert/core/include/exec/Executors.h | 71 --
 runtime/onert/core/include/exec/FunctionSequence.h | 2 +-
 runtime/onert/core/include/exec/IExecutor.h | 17 +-
 runtime/onert/core/include/exec/IExecutors.h | 98 +++
 runtime/onert/core/include/ir/Graph.h | 36 -
 runtime/onert/core/include/ir/Index.h | 4 +-
 runtime/onert/core/include/ir/NNPkg.h | 102 ++-
 .../onert/core/include/ir/OperandIndexSequence.h | 7 -
 runtime/onert/core/include/ir/Shape.h | 6 +-
 runtime/onert/core/include/util/Config.lst | 1 -
 runtime/onert/core/include/util/Index.h | 7 +
 runtime/onert/core/include/util/ObjectManager.h | 4 +-
 runtime/onert/core/include/util/Utils.h | 37 +-
 .../onert/core/src/backend/basic/MemoryManager.cc | 2 +-
 .../onert/core/src/backend/basic/MemoryPlanner.cc | 2 +-
 .../core/src/backend/basic/StaticTensorManager.cc | 2 +-
 .../core/src/backend/builtin/BackendContext.cc | 2 +-
 runtime/onert/core/src/backend/builtin/IOTensor.h | 2 +-
 .../core/src/backend/builtin/KernelGenerator.cc | 8 +-
 .../core/src/backend/builtin/KernelGenerator.h | 9 +-
 .../core/src/backend/builtin/kernel/IfLayer.cc | 8 +-
 .../core/src/backend/builtin/kernel/IfLayer.h | 8 +-
 .../src/backend/builtin/kernel/PermuteLayer.cc | 8 +-
 .../core/src/backend/builtin/kernel/WhileLayer.cc | 10 +-
 .../core/src/backend/builtin/kernel/WhileLayer.h | 8 +-
 runtime/onert/core/src/compiler/Compiler.cc | 772 ++----------------
 runtime/onert/core/src/compiler/CompilerFactory.cc | 45 +
 runtime/onert/core/src/compiler/CompilerOptions.cc | 145 ++++
 runtime/onert/core/src/compiler/ExecutorFactory.cc | 51 +-
 runtime/onert/core/src/compiler/ExecutorFactory.h | 25 +-
 .../onert/core/src/compiler/Fp32ToFp16Converter.cc | 24 +-
 runtime/onert/core/src/compiler/HEScheduler.cc | 2 +-
 .../onert/core/src/compiler/HEScheduler.test.cc | 4 +-
 runtime/onert/core/src/compiler/LoweredGraph.cc | 8 -
 runtime/onert/core/src/compiler/ManualScheduler.cc | 4 +-
 .../onert/core/src/compiler/MultiModelCompiler.cc | 214 +++++
 .../onert/core/src/compiler/MultiModelCompiler.h | 75 ++
 .../onert/core/src/compiler/StaticShapeInferer.cc | 98 ++-
 runtime/onert/core/src/compiler/TensorRegistries.h | 2 +-
 .../onert/core/src/compiler/pass/OddOutputPass.cc | 4 +-
 runtime/onert/core/src/compiler/pass/PassRunner.cc | 2 +-
 .../src/compiler/pass/PermutationInsertionPass.cc | 4 +-
 runtime/onert/core/src/exec/Execution.cc | 247 +----
 runtime/onert/core/src/exec/Execution.test.cc | 337 +++++++-
 runtime/onert/core/src/exec/ExecutionObservee.cc | 8 +-
 runtime/onert/core/src/exec/ExecutionObservers.h | 2 +-
 runtime/onert/core/src/exec/ExecutorBase.cc | 57 +-
 runtime/onert/core/src/exec/ExecutorBase.h | 13 +-
 runtime/onert/core/src/exec/Executors.cc | 672 ++++++++++++---
 runtime/onert/core/src/exec/Executors.h | 169 ++++
 runtime/onert/core/src/exec/IPermuteFunction.cc | 320 ++++++++
 runtime/onert/core/src/exec/IPermuteFunction.h | 99 +--
 .../onert/core/src/exec/IPermuteFunction.test.cc | 902 +++++++++++++++++++++
 runtime/onert/core/src/exec/ParallelScheduler.cc | 2 +-
 .../onert/core/src/exec/SingleModelExecutors.cc | 61 ++
 runtime/onert/core/src/exec/SingleModelExecutors.h | 70 ++
 runtime/onert/core/src/exec/ThreadPool.cc | 2 +-
 runtime/onert/core/src/interp/Buffer.h | 91 ---
 runtime/onert/core/src/interp/ExecEnv.h | 212 -----
 runtime/onert/core/src/interp/InterpExecutor.cc | 127 ---
 runtime/onert/core/src/interp/InterpExecutor.h | 89 --
 .../onert/core/src/interp/InterpExecutor.test.cc | 355 --------
 runtime/onert/core/src/interp/InterpOps.lst | 73 --
 runtime/onert/core/src/interp/Interpreter.cc | 184 -----
 runtime/onert/core/src/interp/Interpreter.h | 64 --
 runtime/onert/core/src/interp/Registration.h | 43 -
 runtime/onert/core/src/interp/Tensor.cc | 57 --
 runtime/onert/core/src/interp/Tensor.h | 189 -----
 .../src/interp/operations/BinaryArithmeticOps.cc | 204 -----
 runtime/onert/core/src/interp/operations/Concat.cc | 147 ----
 runtime/onert/core/src/interp/operations/Conv2D.cc | 151 ----
 .../core/src/interp/operations/DepthwiseConv2D.cc | 156 ----
 .../interp/operations/ElementwiseActivations.cc | 160 ----
 .../core/src/interp/operations/FullyConnected.cc | 134 ---
 runtime/onert/core/src/interp/operations/Gather.cc | 138 ----
 .../core/src/interp/operations/InstanceNorm.cc | 121 ---
 .../core/src/interp/operations/OperationUtil.h | 203 -----
 runtime/onert/core/src/interp/operations/Pad.cc | 106 ---
 runtime/onert/core/src/interp/operations/Pool2D.cc | 140 ----
 .../onert/core/src/interp/operations/Reshape.cc | 63 --
 .../onert/core/src/interp/operations/Softmax.cc | 123 ---
 .../core/src/interp/operations/TransposeConv.cc | 141 ----
 runtime/onert/core/src/ir/Shape.cc | 8 +-
 runtime/onert/core/src/ir/Shape.test.cc | 2 +-
 .../core/src/util/ChromeTracingEventWriter.cc | 6 +-
 runtime/onert/core/src/util/MDTableEventWriter.cc | 8 +-
 runtime/onert/core/src/util/SNPEEventWriter.cc | 22 +-
 runtime/onert/core/src/util/ShapeInference.cc | 6 +-
 .../frontend/base_loader/include/base_loader.h | 30 +-
 runtime/onert/frontend/circle/src/circle_loader.cc | 4 +-
 runtime/onert/frontend/nnapi/CMakeLists.txt | 2 +-
 runtime/onert/frontend/nnapi/compilation.cc | 4 +-
 runtime/onert/frontend/nnapi/execution.cc | 2 +-
 .../nnapi/wrapper/ANeuralNetworksCompilation.cc | 5 +-
 .../nnapi/wrapper/ANeuralNetworksCompilation.h | 6 +-
 .../nnapi/wrapper/ANeuralNetworksExecution.h | 2 +-
 runtime/onert/frontend/tflite/src/tflite_loader.cc | 5 +-
 147 files changed, 5448 insertions(+), 6287 deletions(-)
 delete mode 100644 runtime/onert/backend/gpu_cl/TensorBuilderHelper.h
 create mode 100644 runtime/onert/backend/gpu_cl/Utils.h
 delete mode 100644 runtime/onert/backend/gpu_cl/ex/InferenceContextEx.h
 create mode 100644 runtime/onert/backend/trix/BatchThreadPool.cc
 create mode 100644 runtime/onert/backend/trix/BatchThreadPool.h
 create mode 100644 runtime/onert/backend/trix/Convert.cc
 create mode 100644 runtime/onert/backend/trix/Convert.h
 create mode 100644 runtime/onert/backend/trix/DevContext.cc
 create mode 100644 runtime/onert/core/include/compiler/CompilerFactory.h
 create mode 100644 runtime/onert/core/include/compiler/CompilerOptions.h
 create mode 100644 runtime/onert/core/include/compiler/ICompiler.h
 delete mode 100644 runtime/onert/core/include/exec/Executors.h
 create mode 100644 runtime/onert/core/include/exec/IExecutors.h
 create mode 100644 runtime/onert/core/src/compiler/CompilerFactory.cc
 create mode 100644 runtime/onert/core/src/compiler/CompilerOptions.cc
 create mode 100644 runtime/onert/core/src/compiler/MultiModelCompiler.cc
 create mode 100644 runtime/onert/core/src/compiler/MultiModelCompiler.h
 create mode 100644 runtime/onert/core/src/exec/Executors.h
 create mode 100644 runtime/onert/core/src/exec/IPermuteFunction.cc
 create mode 100644 runtime/onert/core/src/exec/IPermuteFunction.test.cc
 create mode 100644 runtime/onert/core/src/exec/SingleModelExecutors.cc
 create mode 100644 runtime/onert/core/src/exec/SingleModelExecutors.h
 delete mode 100644 runtime/onert/core/src/interp/Buffer.h
 delete mode 100644 runtime/onert/core/src/interp/ExecEnv.h
 delete mode 100644 runtime/onert/core/src/interp/InterpExecutor.cc
 delete mode 100644 runtime/onert/core/src/interp/InterpExecutor.h
 delete mode 100644 runtime/onert/core/src/interp/InterpExecutor.test.cc
 delete mode 100644 runtime/onert/core/src/interp/InterpOps.lst
 delete mode 100644 runtime/onert/core/src/interp/Interpreter.cc
 delete mode 100644 runtime/onert/core/src/interp/Interpreter.h
 delete mode 100644 runtime/onert/core/src/interp/Registration.h
 delete mode 100644 runtime/onert/core/src/interp/Tensor.cc
 delete mode 100644 runtime/onert/core/src/interp/Tensor.h
 delete mode 100644 runtime/onert/core/src/interp/operations/BinaryArithmeticOps.cc
 delete mode 100644
runtime/onert/core/src/interp/operations/Concat.cc delete mode 100644 runtime/onert/core/src/interp/operations/Conv2D.cc delete mode 100644 runtime/onert/core/src/interp/operations/DepthwiseConv2D.cc delete mode 100644 runtime/onert/core/src/interp/operations/ElementwiseActivations.cc delete mode 100644 runtime/onert/core/src/interp/operations/FullyConnected.cc delete mode 100644 runtime/onert/core/src/interp/operations/Gather.cc delete mode 100644 runtime/onert/core/src/interp/operations/InstanceNorm.cc delete mode 100644 runtime/onert/core/src/interp/operations/OperationUtil.h delete mode 100644 runtime/onert/core/src/interp/operations/Pad.cc delete mode 100644 runtime/onert/core/src/interp/operations/Pool2D.cc delete mode 100644 runtime/onert/core/src/interp/operations/Reshape.cc delete mode 100644 runtime/onert/core/src/interp/operations/Softmax.cc delete mode 100644 runtime/onert/core/src/interp/operations/TransposeConv.cc (limited to 'runtime/onert') diff --git a/runtime/onert/api/include/nnfw_version.h b/runtime/onert/api/include/nnfw_version.h index 2fbb96f31..be30ee296 100644 --- a/runtime/onert/api/include/nnfw_version.h +++ b/runtime/onert/api/include/nnfw_version.h @@ -21,6 +21,6 @@ * NNFW_VERSION is a uint32 value representing nnfw runtime version * in 0xMMmmmmPP, where MM = major, mmmm = minor, PP = patch */ -#define NNFW_VERSION 0x01001500 +#define NNFW_VERSION 0x01001600 #endif // __NNFW_VERSION_H__ diff --git a/runtime/onert/api/src/nnfw_api_internal.cc b/runtime/onert/api/src/nnfw_api_internal.cc index 9b43dd381..8eedb5314 100644 --- a/runtime/onert/api/src/nnfw_api_internal.cc +++ b/runtime/onert/api/src/nnfw_api_internal.cc @@ -16,7 +16,7 @@ #include "nnfw_api_internal.h" #include "CustomKernelRegistry.h" -#include "compiler/Compiler.h" +#include "compiler/CompilerFactory.h" #include "util/ConfigSource.h" #include "util/Exceptions.h" #include "util/logging.h" @@ -208,29 +208,24 @@ NNFW_STATUS nnfw_session::create(nnfw_session **session) { if (session == nullptr) return NNFW_STATUS_UNEXPECTED_NULL; - - // Create session - *session = new (std::nothrow) nnfw_session(); - if (*session == nullptr) + try { - std::cerr << "Error during session creation" << std::endl; - return NNFW_STATUS_OUT_OF_MEMORY; + auto new_session = std::unique_ptr(new nnfw_session()); + new_session->_kernel_registry = std::make_shared(); + *session = new_session.release(); } - - // Initialize fields - try + catch (const std::bad_alloc &e) { - (*session)->_kernel_registry = std::make_shared(); + std::cerr << "Error during session creation" << std::endl; + *session = nullptr; // Set nullptr on error to keep the old behavior + return NNFW_STATUS_OUT_OF_MEMORY; } catch (const std::exception &e) { std::cerr << "Error during session initialization : " << e.what() << std::endl; - delete *session; - *session = nullptr; - + *session = nullptr; // Set nullptr on error to keep the old behavior return NNFW_STATUS_ERROR; } - return NNFW_STATUS_NO_ERROR; } @@ -331,7 +326,6 @@ NNFW_STATUS nnfw_session::load_model_from_nnpackage(const char *package_dir) std::string manifest_file_name = package_path + "/metadata/MANIFEST"; std::ifstream mfs(manifest_file_name); - _package_file_path = package_path; // extract the filename of the first(index 0) model // e.g. 
In MANIFEST file, { "models" : [ "firstmodel.tflite", "2nd.tflite" ] } Json::Value root; @@ -351,7 +345,14 @@ NNFW_STATUS nnfw_session::load_model_from_nnpackage(const char *package_dir) } } _nnpkg = std::make_shared(); - for (uint32_t i = 0; i < models.size(); ++i) + auto num_models = models.size(); + if (num_models == 0 || (num_models - 1) > onert::ir::ModelIndex::max()) + { + std::cerr << "Invalid model size - " << std::to_string(num_models) << std::endl; + return NNFW_STATUS_ERROR; + } + + for (uint16_t i = 0; i < num_models; ++i) { auto model_file_path = package_path + std::string("/") + models[i].asString(); auto model_type = model_types[i].asString(); @@ -390,6 +391,8 @@ NNFW_STATUS nnfw_session::load_model_from_nnpackage(const char *package_dir) for (uint32_t j = 0; j < tos.size(); ++j) _nnpkg->addEdge(toIODesc(fromtos[i]["from"].asString()), toIODesc(tos[j].asString())); } + + _nnpkg->verify(); _state = State::MODEL_LOADED; } catch (const std::exception &e) @@ -420,14 +423,7 @@ NNFW_STATUS nnfw_session::prepare() try { - // TODO: Compile all models in case of multiple models - if (_nnpkg->model_count() > 2) - { - std::cerr << "Error during model prepare : more than 3 multiple models are not supported yet." - << std::endl; - return NNFW_STATUS_ERROR; - } - auto compiler = std::make_unique(_nnpkg, _coptions); + auto compiler = onert::compiler::CompilerFactory::get().create(_nnpkg, _coptions); _nnpkg.reset(); _compiler_artifact = compiler->compile(); _execution = std::make_unique(_compiler_artifact->_executors); @@ -442,50 +438,10 @@ NNFW_STATUS nnfw_session::prepare() return NNFW_STATUS_NO_ERROR; } -NNFW_STATUS nnfw_session::prepare_pipeline(const char *map_file_path) +NNFW_STATUS nnfw_session::prepare_pipeline(const char *) { - // NOTE. If users want to run prepare_pipeline() more than one time, this could be removed. 
- if (!isStateModelLoaded()) - { - std::cerr << "Error during model prepare pipeline : "; - if (isStateInitialized()) - { - std::cerr << "prepare_pipeline should be run once"; - } - else - { - std::cerr << "invalid state"; - } - std::cerr << std::endl; - return NNFW_STATUS_INVALID_STATE; - } - - try - { - auto model = _nnpkg->primary_model(); - auto compiler = std::make_unique(model, *_coptions[0]); - _nnpkg.reset(); - auto artifacts = compiler->compile(_package_file_path.c_str(), map_file_path); - - for (auto it = artifacts.begin(); it != artifacts.end(); ++it) - { - _executions.push_back(std::make_shared(it->get()->_executors)); - } - make_dependency(); - _threads.resize(_executions.size()); - for (uint32_t i = 0; i < _threads.size(); i++) - { - _threads[i] = std::thread(&onert::exec::Execution::runInference, _executions[i].get()); - } - } - catch (const std::exception &e) - { - std::cerr << "Error during model prepare : " << e.what() << std::endl; - return NNFW_STATUS_ERROR; - } - - _state = State::PREPARED; - return NNFW_STATUS_NO_ERROR; + std::cerr << "Pipeline prepare_pipeline: deprecated feature " << std::endl; + return NNFW_STATUS_ERROR; } NNFW_STATUS nnfw_session::run() @@ -497,12 +453,6 @@ NNFW_STATUS nnfw_session::run() return NNFW_STATUS_INVALID_STATE; } - if (!_executions.empty()) - { - std::cerr << "Error during nnfw_session::run : not supported for pipeline run" << std::endl; - return NNFW_STATUS_ERROR; - } - try { _execution->execute(); @@ -532,13 +482,6 @@ NNFW_STATUS nnfw_session::run_async() return NNFW_STATUS_INVALID_STATE; } - if (!_executions.empty()) - { - std::cerr << "Error during nnfw_session::run_async : not supported for pipeline run" - << std::endl; - return NNFW_STATUS_ERROR; - } - _execution->startExecute(); _state = State::RUNNING; @@ -554,12 +497,6 @@ NNFW_STATUS nnfw_session::await() return NNFW_STATUS_ERROR; } - if (!_executions.empty()) - { - std::cerr << "Error during nnfw_session::await : not supported for pipeline run" << std::endl; - return NNFW_STATUS_ERROR; - } - _execution->waitFinish(); _state = State::FINISHED_RUN; @@ -583,13 +520,6 @@ NNFW_STATUS nnfw_session::set_input(uint32_t index, NNFW_TYPE /*type*/, const vo return NNFW_STATUS_ERROR; } - if (!_executions.empty()) - { - std::cerr << "Error during nnfw_session::set_input : not supported for pipeline run" - << std::endl; - return NNFW_STATUS_ERROR; - } - try { _execution->setInput(onert::ir::IOIndex(index), buffer, length); @@ -619,13 +549,6 @@ NNFW_STATUS nnfw_session::set_output(uint32_t index, NNFW_TYPE /*type*/, void *b return NNFW_STATUS_ERROR; } - if (!_executions.empty()) - { - std::cerr << "Error during nnfw_session::set_output : not supported for pipeline run" - << std::endl; - return NNFW_STATUS_ERROR; - } - try { _execution->setOutput(onert::ir::IOIndex(index), buffer, length); @@ -650,7 +573,7 @@ NNFW_STATUS nnfw_session::input_size(uint32_t *number) std::cerr << "Error during nnfw_session::input_size, number is null pointer." << std::endl; return NNFW_STATUS_UNEXPECTED_NULL; } - *number = primary_subgraph()->getInputs().size(); + *number = getInputSize(); } catch (const std::exception &e) { @@ -672,7 +595,7 @@ NNFW_STATUS nnfw_session::output_size(uint32_t *number) std::cerr << "Error during nnfw_session::output_size, number is null pointer." 
<< std::endl; return NNFW_STATUS_UNEXPECTED_NULL; } - *number = primary_subgraph()->getOutputs().size(); + *number = getOutputSize(); } catch (const std::exception &e) { @@ -684,6 +607,13 @@ NNFW_STATUS nnfw_session::output_size(uint32_t *number) NNFW_STATUS nnfw_session::set_input_layout(uint32_t index, NNFW_LAYOUT layout) { + if (!isStatePreparedOrFinishedRun()) + { + std::cerr << "Error during nnfw_session::set_input_layout : " + << "run should be run after prepare" << std::endl; + return NNFW_STATUS_INVALID_STATE; + } + try { if (layout != NNFW_LAYOUT_NONE && layout != NNFW_LAYOUT_CHANNELS_FIRST && @@ -692,14 +622,8 @@ NNFW_STATUS nnfw_session::set_input_layout(uint32_t index, NNFW_LAYOUT layout) std::cerr << "Error during nnfw_session::set_input_layout, not supported layout" << std::endl; return NNFW_STATUS_ERROR; } - if (_execution) - { - _execution->setInputLayout(onert::ir::IOIndex(index), convertLayout(layout)); - } - else - { - _executions.at(0)->setInputLayout(onert::ir::IOIndex(index), convertLayout(layout)); - } + + _execution->setInputLayout(onert::ir::IOIndex(index), convertLayout(layout)); } catch (const std::exception &e) { @@ -711,6 +635,13 @@ NNFW_STATUS nnfw_session::set_input_layout(uint32_t index, NNFW_LAYOUT layout) NNFW_STATUS nnfw_session::set_output_layout(uint32_t index, NNFW_LAYOUT layout) { + if (!isStatePreparedOrFinishedRun()) + { + std::cerr << "Error during nnfw_session::set_output_layout : " + << "run should be run after prepare" << std::endl; + return NNFW_STATUS_INVALID_STATE; + } + try { if (layout != NNFW_LAYOUT_NONE && layout != NNFW_LAYOUT_CHANNELS_FIRST && @@ -720,15 +651,8 @@ NNFW_STATUS nnfw_session::set_output_layout(uint32_t index, NNFW_LAYOUT layout) << std::endl; return NNFW_STATUS_ERROR; } - if (_execution) - { - _execution->setOutputLayout(onert::ir::IOIndex(index), convertLayout(layout)); - } - else - { - _executions.at(_executions.size() - 1) - ->setOutputLayout(onert::ir::IOIndex(index), convertLayout(layout)); - } + + _execution->setOutputLayout(onert::ir::IOIndex(index), convertLayout(layout)); } catch (const std::exception &e) { @@ -771,27 +695,13 @@ NNFW_STATUS nnfw_session::apply_tensorinfo(uint32_t index, nnfw_tensorinfo ti) if (!isStatePreparedOrFinishedRun()) { - // In this case, if we apply input shape in primary_subgraph, it will propagate after - // compilation and excution - auto model = _nnpkg->primary_model(); - auto primary_subgraph = model->primary_subgraph(); - auto ind = primary_subgraph->getInputs().at(index); - auto &input = primary_subgraph->operands().at(ind); - // overwrite input shape with the shape from ti - input.info().shape(new_shape); + // In this case, if we apply input shape, it will propagate after compilation and excution + auto &info = _nnpkg->inputInfo(index); + info.shape(new_shape); } else // when called after nnfw_session::prepare() - { - if (_execution) - { - _execution->changeInputShape(onert::ir::IOIndex(index), new_shape); - } - else - { - _executions.at(0)->changeInputShape(onert::ir::IOIndex(index), new_shape); - } - } + _execution->changeInputShape(onert::ir::IOIndex(index), new_shape); return NNFW_STATUS_NO_ERROR; } @@ -815,21 +725,26 @@ NNFW_STATUS nnfw_session::input_tensorinfo(uint32_t index, nnfw_tensorinfo *ti) << std::endl; return NNFW_STATUS_UNEXPECTED_NULL; } - if (index >= primary_subgraph()->getInputs().size()) + + if (index >= getInputSize()) { std::cerr << "Error during nnfw_session::input_tensorinfo, index is out of range." 
<< std::endl; return NNFW_STATUS_ERROR; } - auto opidx = primary_subgraph()->getInputs().at(index); - auto shape = primary_subgraph()->operands().at(opidx).shape(); - if (isStatePreparedOrFinishedRun()) + + if (isStateModelLoaded()) + { + auto info = _nnpkg->inputInfo(index); + fillTensorInfo(ti, info.shape(), info.typeInfo().type()); + } + else { - shape = _execution ? _execution->getInputShape(onert::ir::IOIndex{index}) - : _executions.at(0)->getInputShape(onert::ir::IOIndex{index}); + auto io_index = onert::ir::IOIndex{index}; + auto shape = _execution->getInputShape(io_index); + auto dtype = _compiler_artifact->_executors->inputInfo(io_index).typeInfo().type(); + fillTensorInfo(ti, shape, dtype); } - auto dtype = primary_subgraph()->operands().at(opidx).typeInfo().type(); - fillTensorInfo(ti, shape, dtype); } catch (const std::exception &e) { @@ -851,26 +766,27 @@ NNFW_STATUS nnfw_session::output_tensorinfo(uint32_t index, nnfw_tensorinfo *ti) return NNFW_STATUS_UNEXPECTED_NULL; } - if (index >= primary_subgraph()->getOutputs().size()) - { - std::cerr << "Error during nnfw_session::output_tensorinfo, index is out of range." - << std::endl; - return NNFW_STATUS_ERROR; - } - try { - auto opidx = primary_subgraph()->getOutputs().at(index); - auto shape = primary_subgraph()->operands().at(opidx).shape(); - // If it is called after `nnfw_run` then get the shape from Execution, not from the graph - if (isStateFinishedRun()) + if (index >= getOutputSize()) + { + std::cerr << "Error during nnfw_session::output_tensorinfo, index is out of range." + << std::endl; + return NNFW_STATUS_ERROR; + } + + if (isStateModelLoaded()) { - shape = _execution - ? _execution->getOutputShape(onert::ir::IOIndex{index}) - : _executions.at(_executions.size() - 1)->getOutputShape(onert::ir::IOIndex{index}); + auto info = _nnpkg->outputInfo(index); + fillTensorInfo(ti, info.shape(), info.typeInfo().type()); + } + else + { + auto io_index = onert::ir::IOIndex{index}; + auto shape = _execution->getOutputShape(io_index); + auto dtype = _compiler_artifact->_executors->outputInfo(io_index).typeInfo().type(); + fillTensorInfo(ti, shape, dtype); } - auto dtype = primary_subgraph()->operands().at(opidx).typeInfo().type(); - fillTensorInfo(ti, shape, dtype); } catch (const std::exception &e) { @@ -881,86 +797,16 @@ NNFW_STATUS nnfw_session::output_tensorinfo(uint32_t index, nnfw_tensorinfo *ti) return NNFW_STATUS_NO_ERROR; } -void nnfw_session::make_dependency() +NNFW_STATUS nnfw_session::push_pipeline_input(std::vector *, std::vector *) { - for (uint32_t out_exe = 0; out_exe < _executions.size(); out_exe++) - { - auto &out_graph = _executions[out_exe]->primary_subgraph(); - for (uint32_t in_exe = 0; in_exe < _executions.size(); in_exe++) - { - if (out_exe == in_exe) - continue; - auto &in_graph = _executions[in_exe]->primary_subgraph(); - for (auto out = out_graph._name_to_output_begin(); out != out_graph._name_to_output_end(); - out++) - { - auto out_opidx = out_graph.getOutputs().at(out->second); - auto out_shape = out_graph.operands().at(out_opidx).shape(); - for (auto in = in_graph._name_to_input_begin(); in != in_graph._name_to_input_end(); in++) - { - if (out->first != in->first) - continue; - - auto in_opidx = in_graph.getInputs().at(in->second); - auto in_shape = in_graph.operands().at(in_opidx).shape(); - if (out_shape.rank() != in_shape.rank()) - continue; - - bool is_same = true; - for (int32_t i = 0; i < out_shape.rank(); i++) - { - if (out_shape.dim(i) != in_shape.dim(i)) - { - is_same = false; - break; - } - 
} - - if (is_same) - _executions[out_exe]->pushNextExe(_executions[in_exe], out->second, in->second); - } - } - } - } -} - -NNFW_STATUS nnfw_session::push_pipeline_input(std::vector *inputs, - std::vector *lengths) -{ - static uint32_t count = 0; - if (inputs->empty()) - { - _executions[0]->setFinish(); - for (uint32_t i = 0; i < _threads.size(); i++) - { - _threads[i].join(); - } - return NNFW_STATUS_NO_ERROR; - } - _executions[0]->asyncIoDescSemWait(); - _executions[0]->createNewAsyncDesc(count++); - for (uint32_t i = 0; i < inputs->size(); i++) - { - _executions[0]->executeAsyncInput(onert::ir::IOIndex(i), inputs->at(i), lengths->at(i)); - } - _executions[0]->asyncIoDescSemPost(); - return NNFW_STATUS_NO_ERROR; + std::cerr << "Pipeline push_pipeline_input: deprecated feature " << std::endl; + return NNFW_STATUS_ERROR; } -NNFW_STATUS nnfw_session::pop_pipeline_output(std::vector *outputs) +NNFW_STATUS nnfw_session::pop_pipeline_output(std::vector *) { - auto results = _executions[_executions.size() - 1]->getAsyncResults(); - while (results->empty()) - { - if (_executions[_executions.size() - 1]->stopWait()) - return NNFW_STATUS_ERROR; - } - - auto result = results->front(); - results->pop_front(); - for (uint32_t i = 0; i < result.size(); i++) - outputs->push_back(result[i]); - return NNFW_STATUS_NO_ERROR; + std::cerr << "Pipeline pop_pipeline_output: deprecated feature " << std::endl; + return NNFW_STATUS_ERROR; } NNFW_STATUS nnfw_session::register_custom_operation(const std::string &id, @@ -1088,10 +934,6 @@ NNFW_STATUS nnfw_session::set_config(const char *key, const char *value) { options.he_profiling_mode = toBool(value); } - else if (skey == config::DISABLE_COMPILE) - { - options.disable_compile = toBool(value); - } else { return NNFW_STATUS_ERROR; @@ -1103,23 +945,41 @@ const onert::ir::Graph *nnfw_session::primary_subgraph() { if (_nnpkg != nullptr) { - assert(_execution == nullptr && _executions.empty()); + assert(_execution == nullptr); return _nnpkg->primary_model()->primary_subgraph().get(); } else { - assert(_execution != nullptr || !_executions.empty()); - // TODO Remove const_cast + assert(_execution != nullptr); // We assumed the graph will not change after compilation, but shape could change - if (!_executions.empty()) - { - return &_executions[0]->primary_parentgraph(); - } - return &_execution->primary_subgraph(); } } +uint32_t nnfw_session::getInputSize() +{ + if (isStateInitialized()) + throw std::runtime_error{"Model is not loaded yet"}; + + if (isStateModelLoaded()) + return _nnpkg->inputSize(); + + // Session is prepared (general inference) + return _compiler_artifact->_executors->inputSize(); +} + +uint32_t nnfw_session::getOutputSize() +{ + if (isStateInitialized()) + throw std::runtime_error{"Model is not loaded yet"}; + + if (isStateModelLoaded()) + return _nnpkg->outputSize(); + + // Session is prepared (general inference) + return _compiler_artifact->_executors->outputSize(); +} + NNFW_STATUS nnfw_session::get_config(const char *key, char *value, size_t value_size) { if (!isStateModelLoaded()) @@ -1174,7 +1034,7 @@ bool nnfw_session::isStateInitialized() { assert(_nnpkg == nullptr); assert(_coptions.empty()); - assert(_execution == nullptr && _executions.empty()); + assert(_execution == nullptr); return true; } else @@ -1189,7 +1049,7 @@ bool nnfw_session::isStateModelLoaded() { assert(_nnpkg != nullptr); assert(!_coptions.empty()); - assert(_execution == nullptr && _executions.empty()); + assert(_execution == nullptr); return true; } else @@ -1204,7 +1064,7 
@@ bool nnfw_session::isStatePrepared() { assert(_nnpkg == nullptr); assert(!_coptions.empty()); - assert(_execution != nullptr || !_executions.empty()); + assert(_execution != nullptr); return true; } else @@ -1219,7 +1079,7 @@ bool nnfw_session::isStateRunning() { assert(_nnpkg == nullptr); assert(!_coptions.empty()); - assert(_execution != nullptr || !_executions.empty()); + assert(_execution != nullptr); return true; } return false; @@ -1231,7 +1091,7 @@ bool nnfw_session::isStateFinishedRun() { assert(_nnpkg == nullptr); assert(!_coptions.empty()); - assert(_execution != nullptr || !_executions.empty()); + assert(_execution != nullptr); return true; } else diff --git a/runtime/onert/api/src/nnfw_api_internal.h b/runtime/onert/api/src/nnfw_api_internal.h index 9b729fd5f..8e2c2fba6 100644 --- a/runtime/onert/api/src/nnfw_api_internal.h +++ b/runtime/onert/api/src/nnfw_api_internal.h @@ -136,9 +136,6 @@ public: NNFW_STATUS set_available_backends(const char *backends); NNFW_STATUS set_op_backend(const char *op, const char *backend); - // accessor - std::vector> *get_executions() { return &_executions; } - // // Internal-only API // @@ -151,7 +148,6 @@ public: // // Experimental API // - void make_dependency(); NNFW_STATUS push_pipeline_input(std::vector *inputs, std::vector *lengths); NNFW_STATUS pop_pipeline_output(std::vector *outputs); @@ -166,6 +162,9 @@ public: private: const onert::ir::Graph *primary_subgraph(); + uint32_t getInputSize(); + uint32_t getOutputSize(); + bool isStateInitialized(); bool isStateModelLoaded(); bool isStatePrepared(); @@ -181,8 +180,6 @@ private: std::unique_ptr _execution; std::shared_ptr _kernel_registry; std::vector _threads; - std::vector> _executions; - std::string _package_file_path; }; #endif // __API_NNFW_API_INTERNAL_H__ diff --git a/runtime/onert/backend/acl_cl/KernelGenerator.cc b/runtime/onert/backend/acl_cl/KernelGenerator.cc index e709286df..5b0ec92b7 100644 --- a/runtime/onert/backend/acl_cl/KernelGenerator.cc +++ b/runtime/onert/backend/acl_cl/KernelGenerator.cc @@ -256,7 +256,7 @@ void KernelGenerator::visit(const ir::operation::Concat &node) auto output_tensor = _tensor_reg->getAclTensor(ofm_index); std::vector input_tensors; - for (auto &ifm_ind : input_indexes) + for (const auto &ifm_ind : input_indexes) input_tensors.emplace_back(_tensor_reg->getAclTensor(ifm_ind)->handle()); std::unique_ptr<::arm_compute::IFunction> fn; diff --git a/runtime/onert/backend/acl_common/AclTensorBuilder.h b/runtime/onert/backend/acl_common/AclTensorBuilder.h index e008fd6f5..b0b5ca612 100644 --- a/runtime/onert/backend/acl_common/AclTensorBuilder.h +++ b/runtime/onert/backend/acl_common/AclTensorBuilder.h @@ -162,7 +162,7 @@ void AclTensorBuilder::registerTensorInfo( auto &offset = parent_info.coordinates; auto frontend_layout = parent_info.frontend_layout; - assert(obj.shape().rank() <= ir::Shape::MAX_RANK); + assert(obj.shape().rank() <= ir::Shape::kMaxRank); auto shape = obj.shape(); if (_operands.at(parent_index).shape().rank() >= 4 && frontend_layout == ir::Layout::NHWC && backend_layout == ir::Layout::NCHW) @@ -218,11 +218,11 @@ void AclTensorBuilder::allocate(void) { auto lifetime_map = cl_common::createLifetimeMap(_lifetime_seq, _parent_map); - for (auto &entry : lifetime_map) + for (const auto &entry : lifetime_map) { - auto &use = entry.second; - auto use_type = use.first; - auto use_index = use.second; + const auto &use = entry.second; + const auto &use_type = use.first; + const auto &use_index = use.second; assert(use_index.valid()); if 
(use_type == UsesType::FIRST) _tensor_mgr->startLifetime(use_index); @@ -255,9 +255,9 @@ void AclTensorBuilder::buildTensors(void) assert(_tensor_mgr->nonconstTensors().size() == 0); // Normal tensors - for (auto &entry : _tensor_info_map) + for (const auto &entry : _tensor_info_map) { - auto ind = entry.first; + const auto &ind = entry.first; if (_parent_map.count(ind) > 0) continue; @@ -273,9 +273,9 @@ void AclTensorBuilder::buildTensors(void) assert(_tensor_mgr->nonconstSubtensors().size() == 0); // TODO Iterate `_parent_map` instead, once the optimizer bug is fixed // `Optimizer` iterates the entire Operations, so there is a bug if iterating _parent_map - for (auto &entry : _tensor_info_map) + for (const auto &entry : _tensor_info_map) { - auto ind = entry.first; + const auto &ind = entry.first; if (_parent_map.count(ind) == 0) continue; @@ -343,7 +343,7 @@ template bool AclTensorBuilder::areSubTensorsOf( const ir::OperandIndex &parent, const ir::OperandIndexSequence &seq) { - for (auto &cand : seq) + for (const auto &cand : seq) { if (!isSubTensorOf(parent, cand)) { diff --git a/runtime/onert/backend/cl_common/include/cl_common/BackendContext.h b/runtime/onert/backend/cl_common/include/cl_common/BackendContext.h index 7bb72d74e..5536d2780 100644 --- a/runtime/onert/backend/cl_common/include/cl_common/BackendContext.h +++ b/runtime/onert/backend/cl_common/include/cl_common/BackendContext.h @@ -65,7 +65,7 @@ public: .operands() .iterate([&](const ir::OperandIndex &, ir::Operand &obj) { obj.releaseData(); }); - for (auto &it : ret) + for (auto &&it : ret) { auto &fn_seq = it.second; fn_seq->iterate([&](exec::IFunction &ifunc) { diff --git a/runtime/onert/backend/cpu/BackendContext.cc b/runtime/onert/backend/cpu/BackendContext.cc index e6f7b8470..da48a785d 100644 --- a/runtime/onert/backend/cpu/BackendContext.cc +++ b/runtime/onert/backend/cpu/BackendContext.cc @@ -50,7 +50,7 @@ FunctionMap BackendContext::genKernels() .operands() .iterate([&](const ir::OperandIndex &, ir::Operand &obj) { obj.releaseData(); }); - for (auto &it : ret) + for (auto &&it : ret) { auto &fn_seq = it.second; fn_seq->iterate([&](exec::IFunction &ifunc) { ifunc.prepare(); }); diff --git a/runtime/onert/backend/cpu/KernelGenerator.cc b/runtime/onert/backend/cpu/KernelGenerator.cc index 762ee7392..896883bc3 100644 --- a/runtime/onert/backend/cpu/KernelGenerator.cc +++ b/runtime/onert/backend/cpu/KernelGenerator.cc @@ -279,7 +279,7 @@ void KernelGenerator::visit(const ir::operation::AddN &node) const auto output_index{node.getOutputs().at(0)}; std::vector input_tensors; - for (auto &input_idx : node.getInputs()) + for (const auto &input_idx : node.getInputs()) input_tensors.emplace_back(_tensor_reg->getPortableTensor(input_idx)); auto output_tensor = _tensor_reg->getPortableTensor(output_index); @@ -386,7 +386,7 @@ void KernelGenerator::visit(const ir::operation::Concat &node) auto output_tensor = _tensor_reg->getPortableTensor(ofm_index); std::vector input_tensors; - for (auto &ifm_idx : node.getInputs()) + for (const auto &ifm_idx : node.getInputs()) input_tensors.emplace_back(_tensor_reg->getPortableTensor(ifm_idx)); auto fn = std::make_unique(); @@ -626,7 +626,7 @@ void KernelGenerator::visit(const ir::operation::Einsum &node) auto output_tensor = _tensor_reg->getPortableTensor(ofm_index); std::vector input_tensors; - for (auto &ifm_idx : node.getInputs()) + for (const auto &ifm_idx : node.getInputs()) input_tensors.emplace_back(_tensor_reg->getPortableTensor(ifm_idx)); const auto equation = 
node.param().equation; @@ -643,7 +643,7 @@ void KernelGenerator::visit(const ir::operation::Custom &node) auto fill_op_info = [&](const ir::OperandIndexSequence &opSeq, std::vector &types, std::vector &tensors) { - for (auto &idx : opSeq) + for (const auto &idx : opSeq) { const auto &operand = _ctx.at(idx); // TODO make sure using `_current_layout` is correct for custom operations @@ -750,7 +750,7 @@ void KernelGenerator::visit(const ir::operation::Pack &node) auto output_tensor = _tensor_reg->getPortableTensor(ofm_index); std::vector input_tensors; - for (auto &ifm_idx : node.getInputs()) + for (const auto &ifm_idx : node.getInputs()) input_tensors.emplace_back(_tensor_reg->getPortableTensor(ifm_idx)); auto fn = std::make_unique(); @@ -772,7 +772,7 @@ void KernelGenerator::visit(const ir::operation::Unpack &node) auto input_tensor = _tensor_reg->getPortableTensor(input_index); std::vector output_tensors; - for (auto &output_idx : node.getOutputs()) + for (const auto &output_idx : node.getOutputs()) output_tensors.emplace_back(_tensor_reg->getPortableTensor(output_idx)); auto fn = std::make_unique(); @@ -934,7 +934,7 @@ void KernelGenerator::visit(const ir::operation::Split &node) auto axis_tensor = _tensor_reg->getPortableTensor(axis_idx); std::vector out_tensors; - for (auto &output_idx : node.getOutputs()) + for (const auto &output_idx : node.getOutputs()) out_tensors.emplace_back(_tensor_reg->getPortableTensor(output_idx)); auto fn = std::make_unique(); @@ -1261,7 +1261,7 @@ void KernelGenerator::visit(const ir::operation::FusedBatchNorm &node) auto output_tensor = _tensor_reg->getPortableTensor(ofm_index); std::vector input_tensors; - for (auto &ifm_idx : node.getInputs()) + for (const auto &ifm_idx : node.getInputs()) input_tensors.emplace_back(_tensor_reg->getPortableTensor(ifm_idx)); const auto epsilon = node.param().epsilon; @@ -1372,7 +1372,7 @@ void KernelGenerator::visit(const ir::operation::SplitV &node) auto in_split_dim = _tensor_reg->getPortableTensor(split_dim); std::vector out_tensors; - for (auto &output_idx : node.getOutputs()) + for (const auto &output_idx : node.getOutputs()) out_tensors.emplace_back(_tensor_reg->getPortableTensor(output_idx)); auto fn = std::make_unique(); diff --git a/runtime/onert/backend/gpu_cl/Backend.h b/runtime/onert/backend/gpu_cl/Backend.h index d67ba1602..cdf965557 100644 --- a/runtime/onert/backend/gpu_cl/Backend.h +++ b/runtime/onert/backend/gpu_cl/Backend.h @@ -28,6 +28,7 @@ #include "TensorBuilder.h" #include "tensorflow/lite/delegates/gpu/cl/environment.h" +#include "tensorflow/lite/delegates/gpu/common/precision.h" #include "tensorflow/lite/delegates/gpu/common/status.h" namespace onert @@ -55,15 +56,16 @@ public: { return nullptr; } - auto tm = createTensorManager(&environment->context()); - auto tr = std::make_shared(tm); - - tflite::gpu::cl::InferenceContext::CreateInferenceInfo create_info; - create_info.precision = tflite::gpu::cl::CalculationsPrecision::F32; + tflite::gpu::CreateGpuModelInfo create_info; + create_info.precision = tflite::gpu::CalculationsPrecision::F32; create_info.storage_type = tflite::gpu::cl::GetStorageTypeWithMinimalMemoryConsumption(environment->device().GetInfo()); - create_info.hints.Add(tflite::gpu::cl::ModelHints::kFastestInference); + create_info.hints.Add(tflite::gpu::ModelHints::kFastestInference); + + auto tm = createTensorManager(&environment->context(), create_info, environment); + + auto tr = std::make_shared(tm); auto cc = std::make_shared(); cc->device = environment->GetDevicePtr(); @@ -71,7 
+73,7 @@ public: cc->queue = environment->queue(); cc->cache = environment->program_cache(); - auto tb = std::make_shared(operands, tm, create_info, environment); + auto tb = std::make_shared(operands, tm); context->tensor_registry = tr; context->tensor_builder = tb; diff --git a/runtime/onert/backend/gpu_cl/BackendContext.cc b/runtime/onert/backend/gpu_cl/BackendContext.cc index ec9442155..b09319d98 100644 --- a/runtime/onert/backend/gpu_cl/BackendContext.cc +++ b/runtime/onert/backend/gpu_cl/BackendContext.cc @@ -86,6 +86,32 @@ ITensorRegistry *BackendContext::genTensors() return tensor_registry.get(); } +FunctionMap BackendContext::genKernels() +{ + FunctionMap fn_map; + + for (auto op_ind : _data.op_order) + { + auto fn_seq = kernel_gen->generate(op_ind); + fn_map.emplace_back(op_ind, std::move(fn_seq)); + } + + kernel_gen->get_operation(fn_map); + tensor_builder->allocate(); + // NOTE For memory optimization, we want to free some operand data + const_cast(*_data.graph) + .operands() + .iterate([&](const ir::OperandIndex &, ir::Operand &obj) { obj.releaseData(); }); + + for (auto &&it : fn_map) + { + auto &fn_seq = it.second; + fn_seq->iterate([&](exec::IFunction &ifunc) { ifunc.prepare(); }); + } + + return fn_map; +} + } // namespace gpu_cl } // namespace backend } // namespace onert diff --git a/runtime/onert/backend/gpu_cl/BackendContext.h b/runtime/onert/backend/gpu_cl/BackendContext.h index 7412d2bce..da5daae02 100644 --- a/runtime/onert/backend/gpu_cl/BackendContext.h +++ b/runtime/onert/backend/gpu_cl/BackendContext.h @@ -25,6 +25,7 @@ #include "ConstantInitializer.h" #include "KernelGenerator.h" #include "TensorBuilder.h" + #include "tensorflow/lite/delegates/gpu/cl/inference_context.h" namespace onert @@ -52,6 +53,7 @@ public: } ITensorRegistry *genTensors() override; + FunctionMap genKernels() override; protected: void registerTensorInfo(const ir::OperandIndex &ind, const ir::OperandInfo &info, diff --git a/runtime/onert/backend/gpu_cl/CMakeLists.txt b/runtime/onert/backend/gpu_cl/CMakeLists.txt index eb1964214..d62dbd84c 100644 --- a/runtime/onert/backend/gpu_cl/CMakeLists.txt +++ b/runtime/onert/backend/gpu_cl/CMakeLists.txt @@ -24,7 +24,26 @@ if(NOT Fp16_FOUND) return() endif(NOT Fp16_FOUND) -nnas_find_package(TensorFlowGpu QUIET) +nnas_find_package(VulkanSource QUIET) +if(NOT VulkanSource_FOUND) + return() +endif(NOT VulkanSource_FOUND) + +nnas_find_package(Opengl_HeadersSource QUIET) +if(NOT Opengl_HeadersSource_FOUND) + return() +endif(NOT Opengl_HeadersSource_FOUND) + +nnas_find_package(Egl_HeadersSource QUIET) +if(NOT Egl_HeadersSource_FOUND) + return() +endif(NOT Egl_HeadersSource_FOUND) + +if (NOT ${TARGET_OS} MATCHES "tizen") + nnas_find_package(FlatBuffers REQUIRED) +endif () + +nnfw_find_package(TensorFlowGpu QUIET) if(NOT TensorFlowGpu_FOUND) message(FATAL_ERROR 'TensorFlowGpu lib not found') return() @@ -35,18 +54,32 @@ file(GLOB_RECURSE SOURCES "*.cc") add_library(${LIB_ONERT_BACKEND_GPU_CL} SHARED ${SOURCES}) target_include_directories(${LIB_ONERT_BACKEND_GPU_CL} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}) -target_include_directories(${LIB_ONERT_BACKEND_GPU_CL} PRIVATE ${TENSORFLOWGPU_SOURCE_DIR}) +target_include_directories(${LIB_ONERT_BACKEND_GPU_CL} PRIVATE ${TensorFlowSource_DIR}) +target_include_directories(${LIB_ONERT_BACKEND_GPU_CL} PRIVATE ${VulkanSource_DIR}/include) +target_include_directories(${LIB_ONERT_BACKEND_GPU_CL} PRIVATE ${Opengl_HeadersSource_DIR}/api) +target_include_directories(${LIB_ONERT_BACKEND_GPU_CL} PRIVATE ${Egl_HeadersSource_DIR}/api) + 
+if (${TARGET_OS} MATCHES "tizen") + target_compile_options(${LIB_ONERT_BACKEND_GPU_CL} PRIVATE "-Wno-error=deprecated-copy") +endif () + +target_compile_options(${LIB_ONERT_BACKEND_GPU_CL} PRIVATE "-DCL_TARGET_OPENCL_VERSION=220" "-DEGL_NO_X11") target_link_libraries(${LIB_ONERT_BACKEND_GPU_CL} PRIVATE abseil) target_link_libraries(${LIB_ONERT_BACKEND_GPU_CL} PRIVATE dl) target_link_libraries(${LIB_ONERT_BACKEND_GPU_CL} PRIVATE farmhash) -target_link_libraries(${LIB_ONERT_BACKEND_GPU_CL} INTERFACE Open_CL_Headers) +target_link_libraries(${LIB_ONERT_BACKEND_GPU_CL} PRIVATE OpenCL_Headers) target_link_libraries(${LIB_ONERT_BACKEND_GPU_CL} PRIVATE fp16) target_link_libraries(${LIB_ONERT_BACKEND_GPU_CL} PRIVATE TensorFlowGpu) target_link_libraries(${LIB_ONERT_BACKEND_GPU_CL} PRIVATE onert_core) target_link_libraries(${LIB_ONERT_BACKEND_GPU_CL} PRIVATE ${LIB_ONERT_BACKEND_CL_COMMON}) target_link_libraries(${LIB_ONERT_BACKEND_GPU_CL} PRIVATE nnfw_common) target_link_libraries(${LIB_ONERT_BACKEND_GPU_CL} PRIVATE nnfw_coverage) +if (${TARGET_OS} MATCHES "tizen") + target_link_libraries(${LIB_ONERT_BACKEND_GPU_CL} PRIVATE flatbuffers) +else() + target_link_libraries(${LIB_ONERT_BACKEND_GPU_CL} PRIVATE flatbuffers::flatbuffers) +endif () set_target_properties(${LIB_ONERT_BACKEND_GPU_CL} PROPERTIES OUTPUT_NAME backend_gpu_cl) @@ -55,4 +88,8 @@ if(CMAKE_BUILD_TYPE_LC STREQUAL "release") COMMAND ${CMAKE_STRIP} "--strip-unneeded" $) endif() +add_library(tflite_ignore_warnings INTERFACE) +target_compile_options(tflite_ignore_warnings INTERFACE -Wno-unused-parameter -Wno-sign-compare) +target_link_libraries(${LIB_ONERT_BACKEND_GPU_CL} PRIVATE tflite_ignore_warnings) + install(TARGETS ${LIB_ONERT_BACKEND_GPU_CL} DESTINATION lib) diff --git a/runtime/onert/backend/gpu_cl/ClFunction.h b/runtime/onert/backend/gpu_cl/ClFunction.h index 5e8a11a84..6afbd4910 100644 --- a/runtime/onert/backend/gpu_cl/ClFunction.h +++ b/runtime/onert/backend/gpu_cl/ClFunction.h @@ -22,9 +22,9 @@ #include #include -#include "tensorflow/lite/delegates/gpu/cl/kernels/gpu_operation.h" -#include "tensorflow/lite/delegates/gpu/cl/cl_command_queue.h" #include "tensorflow/lite/delegates/gpu/common/status.h" +#include "tensorflow/lite/delegates/gpu/cl/cl_command_queue.h" +#include "tensorflow/lite/delegates/gpu/cl/cl_operation.h" namespace onert { @@ -35,53 +35,51 @@ namespace gpu_cl class ClFunction : public ::onert::exec::IFunction { public: - ClFunction() : _gpu_operations(), _creation_context() {} - -public: - void configure(std::shared_ptr creation_context) + ClFunction(std::shared_ptr creation_context) + : _creation_context(creation_context), _gpu_operations() { - _creation_context = creation_context; } - void add_operation(std::unique_ptr gpu_operation) +public: + void add_operation(tflite::gpu::cl::ClOperation *gpu_operation) { - _gpu_operations.push_back(std::move(gpu_operation)); + _gpu_operations.push_back(gpu_operation); } void run() override { - for (const auto &gpu_operation : _gpu_operations) + for (const auto gpu_operation : _gpu_operations) { if (!gpu_operation->AddToQueue(_creation_context->queue).ok()) { throw std::runtime_error("Failed to AddToQueue."); } - if (!_creation_context->queue->WaitForCompletion().ok()) - { - throw std::runtime_error("Failed to WaitForCompletion."); - } } } void prepare() override { - for (const auto &gpu_operation : _gpu_operations) + for (const auto gpu_operation : _gpu_operations) { + if (!gpu_operation->GetGpuOperation().AssembleCode(_creation_context->GetGpuInfo()).ok()) + { + throw 
std::runtime_error("Failed to AssembleCode."); + } if (!gpu_operation->Compile(*_creation_context).ok()) { throw std::runtime_error("Failed to Compile."); } - if (!gpu_operation->UpdateParams().ok()) { throw std::runtime_error("Failed to UpdateParams."); } + gpu_operation->GetGpuOperation().args_.ReleaseCPURepresentation(); } } private: - std::vector> _gpu_operations; std::shared_ptr _creation_context; + std::vector _gpu_operations; }; } // namespace gpu_cl diff --git a/runtime/onert/backend/gpu_cl/Config.h b/runtime/onert/backend/gpu_cl/Config.h index 6a455bbb5..f8f94aaf4 100644 --- a/runtime/onert/backend/gpu_cl/Config.h +++ b/runtime/onert/backend/gpu_cl/Config.h @@ -41,9 +41,6 @@ public: bool supportDynamicTensor() override { return false; } bool supportFP16() override { return true; } std::unique_ptr timer() override { return std::make_unique(); } - -private: - void *_handle = nullptr; }; } // namespace gpu_cl diff --git a/runtime/onert/backend/gpu_cl/KernelGenerator.cc b/runtime/onert/backend/gpu_cl/KernelGenerator.cc index 04edc3928..a24c4f59c 100644 --- a/runtime/onert/backend/gpu_cl/KernelGenerator.cc +++ b/runtime/onert/backend/gpu_cl/KernelGenerator.cc @@ -23,10 +23,11 @@ #include "TensorManager.h" #include "tensorflow/lite/delegates/gpu/common/shape.h" -#include "tensorflow/lite/delegates/gpu/cl/tensor.h" -#include "tensorflow/lite/delegates/gpu/cl/selectors/convolution_selector.h" -#include "tensorflow/lite/delegates/gpu/cl/selectors/dw_convolution_selector.h" -#include "tensorflow/lite/delegates/gpu/cl/selectors/simple_selectors.h" +#include "tensorflow/lite/delegates/gpu/common/tensor.h" +#include "tensorflow/lite/delegates/gpu/common/tasks/elementwise.h" +#include "tensorflow/lite/delegates/gpu/common/selectors/convolution_selector.h" +#include "tensorflow/lite/delegates/gpu/common/selectors/dw_convolution_selector.h" +#include "tensorflow/lite/delegates/gpu/common/selectors/simple_selectors.h" #include "ir/Operations.h" #include "ir/Operations.Include.h" @@ -38,9 +39,6 @@ #include "util/logging.h" #include "util/Utils.h" -using namespace tflite::gpu; -using namespace tflite::gpu::cl; - namespace onert { namespace backend @@ -48,39 +46,170 @@ namespace backend namespace gpu_cl { -HW ToHW(int32_t h, int32_t w) { return HW(h > 0 ? h : 1, w > 0 ? 
w : 1); } +void KernelGenerator::addClNode(const std::vector &inputs, + const std::vector &outputs, + std::unique_ptr gpu_op) +{ + tflite::gpu::cl::CLNode cl_node; + cl_node.cl_operation.Init(std::move(gpu_op)); + cl_node.inputs.resize(inputs.size()); + for (size_t i = 0; i < inputs.size(); ++i) + { + cl_node.inputs[i] = inputs[i].value(); + } + cl_node.outputs.resize(outputs.size()); + for (size_t i = 0; i < outputs.size(); ++i) + { + cl_node.outputs[i] = outputs[i].value(); + } + _nodes.push_back(std::move(cl_node)); + _operation_indexes.push_back(_operation_index); + return; +} + +void KernelGenerator::get_operation(FunctionMap &Functions) +{ + size_t size = _nodes.size(); + size_t i = 0; + for (auto &&it : Functions) + { + auto index = it.first; + auto node_index = _operation_indexes[i]; + while (index == node_index) + { + auto &fn_seq = it.second; + auto &node = _nodes[i++]; + for (size_t j = 0; j < node.inputs.size(); ++j) + { + uint32_t idx = node.inputs[j]; + node.cl_operation.GetGpuOperation().SetSrc( + _tensor_reg->getClTensor(ir::OperandIndex{idx})->handle(), j); + } + for (size_t j = 0; j < node.outputs.size(); ++j) + { + uint32_t idx = node.outputs[j]; + node.cl_operation.GetGpuOperation().SetDst( + _tensor_reg->getClTensor(ir::OperandIndex{idx})->handle(), j); + } + fn_seq->iterate([&](exec::IFunction &ifunc) { + static_cast(ifunc).add_operation(&node.cl_operation); + }); + if (i == size) + { + break; + } + node_index = _operation_indexes[i]; + } + if (i == size) + { + break; + } + } +} -template -void UpdatePadding(const ir::PaddingType type, const BHWC &input_shape, AttrT *attr) +absl::Status KernelGenerator::readConstTensor(const ir::OperandIndex &index, + tflite::gpu::TensorOrScalar *param) { - if (type == ir::PaddingType::SAME) + const auto shape = _ctx.at(index).shape(); + if (shape.rank() == 0 && shape.num_elements() == 1) { - attr->padding = CalculateSamePadding(input_shape, *attr); + tflite::gpu::Tensor tensor; + tensor.shape.v = 1; + tensor.data.resize(1); + std::memcpy(&tensor.data[0], _ctx.at(index).data()->base(), _ctx.at(index).operandSize()); + *param = tensor.data[0]; } else { - attr->padding.prepended = HW(0, 0); - attr->padding.appended = HW(0, 0); + if (CheckIfLinearConvertible(&shape)) + { + tflite::gpu::Tensor tensor; + tensor.shape.v = shape.dim(shape.rank() - 1); + tensor.data.resize(shape.num_elements()); + std::memcpy(&tensor.data[0], _ctx.at(index).data()->base(), _ctx.at(index).operandSize()); + *param = std::move(tensor); + } + else + { + tflite::gpu::Tensor tensor; + if (shape.rank() == 3) + { + tensor.shape.h = shape.dim(0); + tensor.shape.w = shape.dim(1); + tensor.shape.c = shape.dim(2); + } + else if (shape.rank() == 4) + { + if (shape.dim(0) != 1) + { + return absl::UnimplementedError("Batch size is not equal to 1."); + } + tensor.shape.h = shape.dim(1); + tensor.shape.w = shape.dim(2); + tensor.shape.c = shape.dim(3); + } + else + { + return absl::InvalidArgumentError( + "Expected a 3D tensor of shape HxWxC or a 4D tensor of shape 1xHxWxC."); + } + tensor.data.resize(shape.num_elements()); + std::memcpy(&tensor.data[0], _ctx.at(index).data()->base(), _ctx.at(index).operandSize()); + *param = std::move(tensor); + } } + return absl::OkStatus(); } -PoolingType convertPoolType(ir::operation::Pool2D::PoolType type_ir) +absl::Status KernelGenerator::readConstTensor( + const ir::OperandIndex &index, + absl::variant, + tflite::gpu::Tensor> *alpha) { - switch (type_ir) + const auto shape = _ctx.at(index).shape(); + if 
(CheckIfLinearConvertible(&shape)) { - case ir::operation::Pool2D::PoolType::AVG: - return PoolingType::AVERAGE; - case ir::operation::Pool2D::PoolType::MAX: - return PoolingType::MAX; - default: - throw std::runtime_error("gpu_Cl KernelGenerator : Not supported operation yet"); + tflite::gpu::Tensor tensor; + tensor.shape.v = shape.dim(shape.rank() - 1); + tensor.data.resize(shape.num_elements()); + std::memcpy(&tensor.data[0], _ctx.at(index).data()->base(), _ctx.at(index).operandSize()); + *alpha = std::move(tensor); } + else + { + tflite::gpu::Tensor tensor; + if (shape.rank() == 3) + { + tensor.shape.h = shape.dim(0); + tensor.shape.w = shape.dim(1); + tensor.shape.c = shape.dim(2); + } + else if (shape.rank() == 4) + { + if (shape.dim(0) != 1) + { + return absl::UnimplementedError("Batch size is not equal to 1."); + } + tensor.shape.h = shape.dim(1); + tensor.shape.w = shape.dim(2); + tensor.shape.c = shape.dim(3); + } + else + { + return absl::InvalidArgumentError( + "Expected a 3D tensor of shape HxWxC or a 4D tensor of shape 1xHxWxC."); + } + tensor.data.resize(shape.num_elements()); + std::memcpy(&tensor.data[0], _ctx.at(index).data()->base(), _ctx.at(index).operandSize()); + *alpha = std::move(tensor); + } + return absl::OkStatus(); } -KernelGenerator::KernelGenerator(const ir::Graph &graph, - const std::shared_ptr &tensor_builder, - const std::shared_ptr &tensor_reg, - const std::shared_ptr &creation_context) +KernelGenerator::KernelGenerator( + const ir::Graph &graph, const std::shared_ptr &tensor_builder, + const std::shared_ptr &tensor_reg, + const std::shared_ptr &creation_context) : basic::KernelGeneratorBase{graph}, _ctx(graph.operands()), _operations_ctx(graph.operations()), _current_layout{graph.layout()}, _tensor_builder(tensor_builder), _tensor_reg(tensor_reg), _creation_context(creation_context) @@ -89,13 +218,13 @@ KernelGenerator::KernelGenerator(const ir::Graph &graph, std::unique_ptr KernelGenerator::generate(ir::OperationIndex ind) { - auto ret = std::make_unique(); - ret->enableDynamicShapeInferer(false); - + auto fn_seq = std::make_unique(); + fn_seq->enableDynamicShapeInferer(false); + _operation_index = ind; const auto &op = _graph.operations().at(ind); op.accept(*this); - ret->append(releaseFunction()); - return ret; + fn_seq->append(releaseFunction()); + return fn_seq; } void KernelGenerator::visit(const ir::operation::BinaryArithmetic &node) @@ -104,63 +233,66 @@ void KernelGenerator::visit(const ir::operation::BinaryArithmetic &node) const auto lhs_index{node.getInputs().at(ir::operation::BinaryArithmetic::Input::LHS)}; const auto rhs_index{node.getInputs().at(ir::operation::BinaryArithmetic::Input::RHS)}; - // const auto activation = node.param().activation; + tflite::gpu::OperationDef op_def; + op_def.precision = tflite::gpu::CalculationsPrecision::F32; + + const bool lhs_const = _ctx.at(lhs_index).isConstant(); + const bool rhs_const = _ctx.at(rhs_index).isConstant(); + + if (lhs_const && rhs_const) + { + throw std::runtime_error("No runtime input tensors for " + node.name()); + } + + auto fn = std::make_unique(_creation_context); + std::unique_ptr gpu_op; - OperationDef op_def; - op_def.precision = CalculationsPrecision::F32; + tflite::gpu::OperationType op_type = convertArithmeticType(node.param().arithmetic_type); - op_def.src_tensors.push_back(_tensor_reg->getClTensorReserver(lhs_index)->descriptor); - auto lhs_shape = _tensor_reg->getClTensorReserver(lhs_index)->shape; + if (!lhs_const && !rhs_const) + { + auto lhs_shape = 
_tensor_reg->getClTensor(lhs_index)->get_info()._shape; + auto rhs_shape = _tensor_reg->getClTensor(rhs_index)->get_info()._shape; + + bool swap = + (op_type == tflite::gpu::OperationType::MUL) && + (lhs_shape.h <= rhs_shape.h && lhs_shape.w <= rhs_shape.w && lhs_shape.c <= rhs_shape.c); - op_def.src_tensors.push_back(_tensor_reg->getClTensorReserver(rhs_index)->descriptor); - auto rhs_shape = _tensor_reg->getClTensorReserver(rhs_index)->shape; + auto first_index = swap ? rhs_index : lhs_index; + auto second_index = swap ? lhs_index : rhs_index; - op_def.dst_tensors.push_back(_tensor_reg->getClTensorReserver(ofm_index)->descriptor); - auto out_shape = _tensor_reg->getClTensorReserver(ofm_index)->shape; + op_def.src_tensors.push_back(_tensor_reg->getClTensor(first_index)->get_info()._desc); + op_def.src_tensors.push_back(_tensor_reg->getClTensor(second_index)->get_info()._desc); + op_def.dst_tensors.push_back(_tensor_reg->getClTensor(ofm_index)->get_info()._desc); - auto fn = std::make_unique(); + auto second_shape = _tensor_reg->getClTensor(second_index)->get_info()._shape; - std::unique_ptr gpu_op; - switch (node.param().arithmetic_type) + tflite::gpu::GPUOperation operation = CreateElementwiseTwoInput(op_def, op_type, second_shape); + gpu_op = std::make_unique(std::move(operation)); + + addClNode({first_index, second_index}, {ofm_index}, std::move(gpu_op)); + } + else { - case ir::operation::BinaryArithmetic::ArithmeticType::ADD: - { - std::vector channels(2); - channels[0] = lhs_shape.c; - channels[1] = rhs_shape.c; - SelectAdd(op_def, channels, out_shape.c, &gpu_op); - - auto ofm_tensor = _tensor_reg->getClTensor(ofm_index); - auto lhs_tensor = _tensor_reg->getClTensor(lhs_index); - auto rhs_tensor = _tensor_reg->getClTensor(rhs_index); - gpu_op->SetSrc(lhs_tensor->handle(), ir::operation::BinaryArithmetic::Input::LHS); - gpu_op->SetSrc(rhs_tensor->handle(), ir::operation::BinaryArithmetic::Input::RHS); - gpu_op->SetDst(ofm_tensor->handle(), 0); - - fn->configure(_creation_context); - fn->add_operation(std::move(gpu_op)); - break; - } - case ir::operation::BinaryArithmetic::ArithmeticType::SUB: - { - // NYI - break; - } - case ir::operation::BinaryArithmetic::ArithmeticType::MUL: - { - // NYI - break; - } - case ir::operation::BinaryArithmetic::ArithmeticType::DIV: + auto non_const_index = rhs_const ? lhs_index : rhs_index; + auto const_index = rhs_const ? 
rhs_index : lhs_index; + + op_def.dst_tensors.push_back(_tensor_reg->getClTensor(ofm_index)->get_info()._desc); + op_def.src_tensors.push_back(_tensor_reg->getClTensor(non_const_index)->get_info()._desc); + + tflite::gpu::ElementwiseAttributes attr; + + if (!readConstTensor(const_index, &attr.param).ok()) { - // NYI - break; + throw std::runtime_error("BinaryArithmetic unsupported constant tensor"); } - default: - assert(false && "The BinaryArithmetic operation supports only binary arithmetic operations"); - break; - } + tflite::gpu::GPUOperation operation = + CreateElementwise(_creation_context->GetGpuInfo(), op_def, op_type, attr); + gpu_op = absl::make_unique(std::move(operation)); + + addClNode({non_const_index}, {ofm_index}, std::move(gpu_op)); + } _return_fn = std::move(fn); } @@ -174,30 +306,30 @@ void KernelGenerator::visit(const ir::operation::Conv2D &node) const auto param = node.param(); - OperationDef op_def; - op_def.precision = CalculationsPrecision::F32; + tflite::gpu::OperationDef op_def; + op_def.precision = tflite::gpu::CalculationsPrecision::F32; - op_def.src_tensors.push_back(_tensor_reg->getClTensorReserver(input)->descriptor); + op_def.src_tensors.push_back(_tensor_reg->getClTensor(input)->get_info()._desc); - auto input_shape = _tensor_reg->getClTensorReserver(input)->shape; - auto kernel_shape = _tensor_reg->getClTensorReserver(kernel)->shape; - auto output_shape = _tensor_reg->getClTensorReserver(output)->shape; - auto bias_shape = _tensor_reg->getClTensorReserver(bias)->shape; + auto input_shape = _tensor_reg->getClTensor(input)->get_info()._shape; + auto kernel_shape = _tensor_reg->getClTensor(kernel)->get_info()._shape; + auto output_shape = _tensor_reg->getClTensor(output)->get_info()._shape; + auto bias_shape = _tensor_reg->getClTensor(bias)->get_info()._shape; - op_def.dst_tensors.push_back(_tensor_reg->getClTensorReserver(output)->descriptor); + op_def.dst_tensors.push_back(_tensor_reg->getClTensor(output)->get_info()._desc); - ModelHints hints; - std::unique_ptr gpu_op; // = InitSingleOpSubgraph(inputs, outputs, gpu_subgraph); + tflite::gpu::ModelHints hints; + std::unique_ptr + gpu_op; // = InitSingleOpSubgraph(inputs, outputs, gpu_subgraph); - auto input_tensor = _tensor_reg->getClTensor(input); auto kernel_tensor = _tensor_reg->getClTensor(kernel); auto bias_tensor = _tensor_reg->getClTensor(bias); - auto output_tensor = _tensor_reg->getClTensor(output); - Convolution2DAttributes attr; + tflite::gpu::Convolution2DAttributes attr; attr.strides = ToHW(param.stride.vertical, param.stride.horizontal); - attr.dilations = HW(std::max(static_cast(1), param.dilation.height_factor), - std::max(static_cast(1), param.dilation.width_factor)); + attr.dilations = + tflite::gpu::HW(std::max(static_cast(1), param.dilation.height_factor), + std::max(static_cast(1), param.dilation.width_factor)); bool is_weight = (_ctx.at(kernel).isConstant() ? 
true : false); @@ -220,12 +352,14 @@ void KernelGenerator::visit(const ir::operation::Conv2D &node) UpdatePadding(param.padding.type, input_shape, &attr); - gpu_op = SelectConvolution(attr, output_shape, _creation_context->GetDeviceInfo(), op_def, hints); - gpu_op->SetSrc(input_tensor->handle(), ir::operation::Conv2D::INPUT); + gpu_op = SelectConvolution(attr, output_shape, _creation_context->GetGpuInfo(), op_def, hints); - auto fn = std::make_unique(); + tflite::gpu::cl::CLNode cl_node; + cl_node.inputs.resize(1); + cl_node.inputs[0] = input.value(); + cl_node.outputs.resize(1); - fn->configure(_creation_context); + auto fn = std::make_unique(_creation_context); const auto activation = node.param().activation; @@ -233,47 +367,43 @@ void KernelGenerator::visit(const ir::operation::Conv2D &node) { case ir::Activation::NONE: { - gpu_op->SetDst(output_tensor->handle(), 0); - fn->add_operation(std::move(gpu_op)); + addClNode({input}, {output}, std::move(gpu_op)); break; } + case ir::Activation::RELU: case ir::Activation::RELU6: { - std::unique_ptr gpu_op_1; - OperationDef op_def_1; - std::shared_ptr new_tensor = std::make_shared(); - - _new_tensors[output] = new_tensor; - if (!CreateTensor(*_creation_context->context, output_shape, - _tensor_reg->getClTensorReserver(output)->descriptor, new_tensor.get()) - .ok()) - { - throw std::runtime_error("Error CreateTensor."); - } + std::unique_ptr gpu_op_1; + tflite::gpu::OperationDef op_def_1; + const auto shape = _ctx.at(output).shape(); + auto new_ind = _tensor_reg->addNewClTensor(shape); + + addClNode({input}, {new_ind}, std::move(gpu_op)); - gpu_op->SetDst(new_tensor.get(), 0); - fn->add_operation(std::move(gpu_op)); - op_def_1.precision = CalculationsPrecision::F32; - op_def_1.src_tensors.push_back(_tensor_reg->getClTensorReserver(output)->descriptor); - op_def_1.dst_tensors.push_back(_tensor_reg->getClTensorReserver(output)->descriptor); + op_def_1.precision = tflite::gpu::CalculationsPrecision::F32; + op_def_1.src_tensors.push_back(_tensor_reg->getClTensor(output)->get_info()._desc); + op_def_1.dst_tensors.push_back(_tensor_reg->getClTensor(output)->get_info()._desc); - // - ReLU6: clip = 6, alpha = 0 - ReLUAttributes attr_1; - attr_1.clip = 6; + tflite::gpu::ReLUAttributes attr_1; + if (activation == ir::Activation::RELU6) + { + attr_1.clip = 6; + } + else + { + attr_1.clip = 0; + } attr_1.alpha = 0; gpu_op_1 = SelectReLU(attr_1, op_def_1); - gpu_op_1->SetSrc(new_tensor.get(), 0); - gpu_op_1->SetDst(output_tensor->handle(), 0); - fn->add_operation(std::move(gpu_op_1)); + addClNode({new_ind}, {output}, std::move(gpu_op_1)); break; } default: { - throw std::runtime_error("gpu_cl KernelGenerator : Not supported operation yet"); + throw std::runtime_error("gpu_cl KernelGenerator : Not supported Conv2D activiation"); } } - _return_fn = std::move(fn); } @@ -292,28 +422,23 @@ void KernelGenerator::visit(const ir::operation::DepthwiseConv2D &node) const auto multiplier = node.param().multiplier; - auto ofm_tensor = _tensor_reg->getClTensor(ofm_index); - auto ifm_tensor = _tensor_reg->getClTensor(ifm_index); - auto ker_tensor = _tensor_reg->getClTensor(ker_index); - auto bias_tensor = _tensor_reg->getClTensor(bias_index); - bool is_weight = (_ctx.at(ker_index).isConstant() ? 
true : false); - OperationDef op_def; - op_def.precision = CalculationsPrecision::F32; + tflite::gpu::OperationDef op_def; + op_def.precision = tflite::gpu::CalculationsPrecision::F32; - op_def.src_tensors.push_back(_tensor_reg->getClTensorReserver(ifm_index)->descriptor); - auto input_shape = _tensor_reg->getClTensorReserver(ifm_index)->shape; + op_def.src_tensors.push_back(_tensor_reg->getClTensor(ifm_index)->get_info()._desc); + auto input_shape = _tensor_reg->getClTensor(ifm_index)->get_info()._shape; - auto ker_shape = _tensor_reg->getClTensorReserver(ker_index)->shape; + auto ker_shape = _tensor_reg->getClTensor(ker_index)->get_info()._shape; - op_def.dst_tensors.push_back(_tensor_reg->getClTensorReserver(ofm_index)->descriptor); - auto out_shape = _tensor_reg->getClTensorReserver(ofm_index)->shape; - auto bias_shape = _tensor_reg->getClTensorReserver(bias_index)->shape; + op_def.dst_tensors.push_back(_tensor_reg->getClTensor(ofm_index)->get_info()._desc); + auto out_shape = _tensor_reg->getClTensor(ofm_index)->get_info()._shape; + auto bias_shape = _tensor_reg->getClTensor(bias_index)->get_info()._shape; - DepthwiseConvolution2DAttributes attr; + tflite::gpu::DepthwiseConvolution2DAttributes attr; attr.strides = ToHW(stride.vertical, stride.horizontal); - attr.dilations = HW(std::max(static_cast(1), dilation.height_factor), - std::max(static_cast(1), dilation.width_factor)); + attr.dilations = tflite::gpu::HW(std::max(static_cast(1), dilation.height_factor), + std::max(static_cast(1), dilation.width_factor)); if (is_weight) { @@ -323,12 +448,14 @@ void KernelGenerator::visit(const ir::operation::DepthwiseConv2D &node) attr.weights.shape.w = ker_shape.w; attr.weights.shape.i = ker_shape.c; attr.weights.data.resize(ker_shape.DimensionsProduct()); - memcpy(attr.weights.data.data(), _ctx.at(ker_index).data()->base(), ker_tensor->total_size()); + memcpy(attr.weights.data.data(), _ctx.at(ker_index).data()->base(), + _ctx.at(ker_index).operandSize()); } attr.bias.id = bias_index.value(); attr.bias.shape.v = bias_shape.b != 1 ? 
bias_shape.b : bias_shape.c; attr.bias.data.resize(bias_shape.DimensionsProduct()); - memcpy(attr.bias.data.data(), _ctx.at(bias_index).data()->base(), bias_tensor->total_size()); + memcpy(attr.bias.data.data(), _ctx.at(bias_index).data()->base(), + _ctx.at(bias_index).operandSize()); UpdatePadding(padding.type, input_shape, &attr); if (multiplier != 1) @@ -338,7 +465,7 @@ void KernelGenerator::visit(const ir::operation::DepthwiseConv2D &node) const int filter_width = ker_shape.w; const int output_depth = out_shape.c; - tflite::gpu::Tensor weights; + tflite::gpu::Tensor weights; weights.id = attr.weights.id; weights.shape = tflite::gpu::OHWI(output_depth, filter_height, filter_width, input_depth); weights.data.resize(weights.shape.DimensionsProduct()); @@ -356,12 +483,12 @@ void KernelGenerator::visit(const ir::operation::DepthwiseConv2D &node) attr.weights = std::move(weights); } - auto fn = std::make_unique(); - std::unique_ptr gpu_op; + auto fn = std::make_unique(_creation_context); + std::unique_ptr gpu_op; if (is_weight) { - gpu_op = SelectDWConvolution(attr, _creation_context->GetDeviceInfo(), op_def); + gpu_op = SelectDWConvolution(attr, _creation_context->GetGpuInfo(), op_def); } else { @@ -370,57 +497,51 @@ void KernelGenerator::visit(const ir::operation::DepthwiseConv2D &node) throw std::runtime_error( "No support of depthwise runtime weights with channel multiplier != 1"); } - gpu_op = SelectDWConvolutionDynamicWeights(attr, _creation_context->GetDeviceInfo(), op_def); + gpu_op = SelectDWConvolutionDynamicWeights(attr, _creation_context->GetGpuInfo(), op_def); } - gpu_op->SetSrc(ifm_tensor->handle(), ir::operation::DepthwiseConv2D::Input::INPUT); - - fn->configure(_creation_context); - const auto activation = node.param().activation; switch (activation) { case ir::Activation::NONE: { - gpu_op->SetDst(ofm_tensor->handle(), 0); - fn->add_operation(std::move(gpu_op)); + addClNode({ifm_index}, {ofm_index}, std::move(gpu_op)); break; } + case ir::Activation::RELU: case ir::Activation::RELU6: { - std::unique_ptr gpu_op_1; - OperationDef op_def_1; - std::shared_ptr new_tensor = std::make_shared(); - - _new_tensors[ofm_index] = new_tensor; - if (!CreateTensor(*_creation_context->context, out_shape, - _tensor_reg->getClTensorReserver(ofm_index)->descriptor, new_tensor.get()) - .ok()) - { - throw std::runtime_error("Error CreateTensor."); - } + std::unique_ptr gpu_op_1; + tflite::gpu::OperationDef op_def_1; + const auto shape = _ctx.at(ofm_index).shape(); + auto new_ind = _tensor_reg->addNewClTensor(shape); + + addClNode({ifm_index}, {new_ind}, std::move(gpu_op)); - gpu_op->SetDst(new_tensor.get(), 0); - fn->add_operation(std::move(gpu_op)); - op_def_1.precision = CalculationsPrecision::F32; - op_def_1.src_tensors.push_back(_tensor_reg->getClTensorReserver(ofm_index)->descriptor); - op_def_1.dst_tensors.push_back(_tensor_reg->getClTensorReserver(ofm_index)->descriptor); + op_def_1.precision = tflite::gpu::CalculationsPrecision::F32; - // - ReLU6: clip = 6, alpha = 0 - ReLUAttributes attr_1; - attr_1.clip = 6; + op_def_1.src_tensors.push_back(_tensor_reg->getClTensor(ofm_index)->get_info()._desc); + op_def_1.dst_tensors.push_back(_tensor_reg->getClTensor(ofm_index)->get_info()._desc); + + tflite::gpu::ReLUAttributes attr_1; + if (activation == ir::Activation::RELU6) + { + attr_1.clip = 6; + } + else + { + attr_1.clip = 0; + } attr_1.alpha = 0; gpu_op_1 = SelectReLU(attr_1, op_def_1); - gpu_op_1->SetSrc(new_tensor.get(), 0); - gpu_op_1->SetDst(ofm_tensor->handle(), 0); - 
fn->add_operation(std::move(gpu_op_1)); + addClNode({new_ind}, {ofm_index}, std::move(gpu_op_1)); break; } default: { - throw std::runtime_error("gpu_cl KernelGenerator : Not supported operation yet"); + throw std::runtime_error("gpu_cl KernelGenerator : Not supported DepthwiseConv2D acvivation"); } } @@ -429,26 +550,23 @@ void KernelGenerator::visit(const ir::operation::DepthwiseConv2D &node) void KernelGenerator::visit(const ir::operation::ElementwiseActivation &node) { - std::unique_ptr gpu_op; - auto fn = std::make_unique(); + const auto output_index{node.getOutputs().at(0)}; + const auto input_index{node.getInputs().at(ir::operation::ElementwiseActivation::Input::INPUT)}; + + tflite::gpu::OperationDef op_def; + op_def.precision = tflite::gpu::CalculationsPrecision::F32; + op_def.dst_tensors.push_back(_tensor_reg->getClTensor(output_index)->get_info()._desc); + op_def.src_tensors.push_back(_tensor_reg->getClTensor(input_index)->get_info()._desc); + + std::unique_ptr gpu_op; + auto fn = std::make_unique(_creation_context); switch (node.param().op_type) { case ir::operation::ElementwiseActivation::Type::LEAKY_RELU: case ir::operation::ElementwiseActivation::Type::RELU: { - const auto output_index{node.getOutputs().at(0)}; - const auto input_index{ - node.getInputs().at(ir::operation::ElementwiseActivation::Input::INPUT)}; - - OperationDef op_def; - op_def.precision = CalculationsPrecision::F32; - auto output_tensor = _tensor_reg->getClTensor(output_index); - auto input_tensor = _tensor_reg->getClTensor(input_index); - op_def.dst_tensors.push_back(_tensor_reg->getClTensorReserver(output_index)->descriptor); - op_def.src_tensors.push_back(_tensor_reg->getClTensorReserver(input_index)->descriptor); - - ReLUAttributes attr; + tflite::gpu::ReLUAttributes attr; if (ir::operation::ElementwiseActivation::Type::LEAKY_RELU == node.param().op_type) { attr.alpha = node.param().alpha; @@ -460,17 +578,33 @@ void KernelGenerator::visit(const ir::operation::ElementwiseActivation &node) attr.clip = node.param().alpha; } gpu_op = SelectReLU(attr, op_def); - gpu_op->SetSrc(input_tensor->handle(), ir::operation::ElementwiseActivation::Input::INPUT); - gpu_op->SetDst(output_tensor->handle(), 0); - fn->configure(_creation_context); - fn->add_operation(std::move(gpu_op)); - - _return_fn = std::move(fn); + break; + } + case ir::operation::ElementwiseActivation::Type::LOGISTIC: + { + if (_ctx.at(input_index).typeInfo().type() != ir::DataType::FLOAT32) + { + throw std::runtime_error{"Unsupported data type of LOGISTIC"}; + } + tflite::gpu::GPUOperation operation = + CreateElementwiseOneInput(_creation_context->GetGpuInfo(), op_def, + convertElementwiseActivationType(node.param().op_type)); + gpu_op = std::make_unique(std::move(operation)); + break; + } + case ir::operation::ElementwiseActivation::Type::TANH: + { + tflite::gpu::GPUOperation operation = CreateElementwiseOneInput( + _creation_context->GetGpuInfo(), op_def, tflite::gpu::OperationType::TANH); + gpu_op = std::make_unique(std::move(operation)); break; } default: - throw std::runtime_error("gpu_cl KernelGenerator : Not supported operation yet"); + throw std::runtime_error( + "gpu_cl KernelGenerator : Not supported operation on ElementwiseActivation"); } + addClNode({input_index}, {output_index}, std::move(gpu_op)); + _return_fn = std::move(fn); } void KernelGenerator::visit(const ir::operation::Pool2D &node) @@ -478,24 +612,24 @@ void KernelGenerator::visit(const ir::operation::Pool2D &node) const auto output_index{node.getOutputs().at(0)}; const auto 
input_index{node.getInputs().at(ir::operation::Pool2D::Input::INPUT)}; - OperationDef op_def; - op_def.precision = CalculationsPrecision::F32; + tflite::gpu::OperationDef op_def; + op_def.precision = tflite::gpu::CalculationsPrecision::F32; - op_def.src_tensors.push_back(_tensor_reg->getClTensorReserver(input_index)->descriptor); - auto input_shape = _tensor_reg->getClTensorReserver(input_index)->shape; + op_def.src_tensors.push_back(_tensor_reg->getClTensor(input_index)->get_info()._desc); + auto input_shape = _tensor_reg->getClTensor(input_index)->get_info()._shape; - op_def.dst_tensors.push_back(_tensor_reg->getClTensorReserver(output_index)->descriptor); + op_def.dst_tensors.push_back(_tensor_reg->getClTensor(output_index)->get_info()._desc); const auto kh = node.param().kh; const auto kw = node.param().kw; const auto stride = node.param().stride; const auto op_type = convertPoolType(node.param().op_type); - Pooling2DAttributes attributes; + tflite::gpu::Pooling2DAttributes attributes; attributes.type = op_type; - attributes.kernel = HW(kh > 0 ? kh : 1, kw > 0 ? kw : 1); - attributes.strides = - HW(stride.vertical > 0 ? stride.vertical : 1, stride.horizontal > 0 ? stride.horizontal : 1); + attributes.kernel = tflite::gpu::HW(kh > 0 ? kh : 1, kw > 0 ? kw : 1); + attributes.strides = tflite::gpu::HW(stride.vertical > 0 ? stride.vertical : 1, + stride.horizontal > 0 ? stride.horizontal : 1); if (node.param().padding.type == ir::PaddingType::SAME) { @@ -503,23 +637,15 @@ void KernelGenerator::visit(const ir::operation::Pool2D &node) } else { - attributes.padding.prepended = HW(0, 0); - attributes.padding.appended = HW(0, 0); + attributes.padding.prepended = tflite::gpu::HW(0, 0); + attributes.padding.appended = tflite::gpu::HW(0, 0); } - auto fn = std::make_unique(); - std::unique_ptr gpu_op; + auto fn = std::make_unique(_creation_context); + std::unique_ptr gpu_op; gpu_op = SelectPooling(attributes, op_def); - auto input_tensor = _tensor_reg->getClTensor(input_index); - auto output_tensor = _tensor_reg->getClTensor(output_index); - - gpu_op->SetSrc(input_tensor->handle(), ir::operation::Pool2D::Input::INPUT); - gpu_op->SetDst(output_tensor->handle(), 0); - - fn->configure(_creation_context); - fn->add_operation(std::move(gpu_op)); - + addClNode({input_index}, {output_index}, std::move(gpu_op)); _return_fn = std::move(fn); } @@ -528,31 +654,24 @@ void KernelGenerator::visit(const ir::operation::Reshape &node) const auto output_index{node.getOutputs().at(0)}; const auto input_index{node.getInputs().at(ir::operation::Reshape::Input::INPUT)}; - OperationDef op_def; - op_def.precision = CalculationsPrecision::F32; + tflite::gpu::OperationDef op_def; + op_def.precision = tflite::gpu::CalculationsPrecision::F32; - op_def.src_tensors.push_back(_tensor_reg->getClTensorReserver(input_index)->descriptor); - auto input_shape = _tensor_reg->getClTensorReserver(input_index)->shape; + op_def.src_tensors.push_back(_tensor_reg->getClTensor(input_index)->get_info()._desc); + auto input_shape = _tensor_reg->getClTensor(input_index)->get_info()._shape; - op_def.dst_tensors.push_back(_tensor_reg->getClTensorReserver(output_index)->descriptor); - auto output_shape = _tensor_reg->getClTensorReserver(output_index)->shape; + op_def.dst_tensors.push_back(_tensor_reg->getClTensor(output_index)->get_info()._desc); + auto output_shape = _tensor_reg->getClTensor(output_index)->get_info()._shape; - ReshapeAttributes attr; + tflite::gpu::ReshapeAttributes attr; attr.new_shape = output_shape; - auto fn = 
std::make_unique(); - std::unique_ptr gpu_op; + auto fn = std::make_unique(_creation_context); + std::unique_ptr gpu_op; const int src_channels = input_shape.c; SelectReshape(src_channels, attr.new_shape.c, op_def, &gpu_op); - auto input_tensor = _tensor_reg->getClTensor(input_index); - auto output_tensor = _tensor_reg->getClTensor(output_index); - gpu_op->SetSrc(input_tensor->handle(), ir::operation::Reshape::Input::INPUT); - gpu_op->SetDst(output_tensor->handle(), 0); - - fn->configure(_creation_context); - fn->add_operation(std::move(gpu_op)); - + addClNode({input_index}, {output_index}, std::move(gpu_op)); _return_fn = std::move(fn); } @@ -568,27 +687,20 @@ void KernelGenerator::visit(const ir::operation::Softmax &node) throw std::runtime_error("Softmax.beta != 1 is not supported in gpu_cl"); } - OperationDef op_def; - op_def.precision = CalculationsPrecision::F32; + tflite::gpu::OperationDef op_def; + op_def.precision = tflite::gpu::CalculationsPrecision::F32; - op_def.dst_tensors.push_back(_tensor_reg->getClTensorReserver(output_index)->descriptor); + op_def.dst_tensors.push_back(_tensor_reg->getClTensor(output_index)->get_info()._desc); - op_def.src_tensors.push_back(_tensor_reg->getClTensorReserver(input_index)->descriptor); - auto input_shape = _tensor_reg->getClTensorReserver(input_index)->shape; + op_def.src_tensors.push_back(_tensor_reg->getClTensor(input_index)->get_info()._desc); + auto input_shape = _tensor_reg->getClTensor(input_index)->get_info()._shape; - auto fn = std::make_unique(); + auto fn = std::make_unique(_creation_context); - std::unique_ptr gpu_op; + std::unique_ptr gpu_op; SelectSoftmax(input_shape, op_def, &gpu_op); - auto output_tensor = _tensor_reg->getClTensor(output_index); - auto input_tensor = _tensor_reg->getClTensor(input_index); - - gpu_op->SetSrc(input_tensor->handle(), ir::operation::Softmax::Input::INPUT); - gpu_op->SetDst(output_tensor->handle(), 0); - - fn->configure(_creation_context); - fn->add_operation(std::move(gpu_op)); + addClNode({input_index}, {output_index}, std::move(gpu_op)); _return_fn = std::move(fn); } diff --git a/runtime/onert/backend/gpu_cl/KernelGenerator.h b/runtime/onert/backend/gpu_cl/KernelGenerator.h index 91fd3cd9d..5e8c2621f 100644 --- a/runtime/onert/backend/gpu_cl/KernelGenerator.h +++ b/runtime/onert/backend/gpu_cl/KernelGenerator.h @@ -26,6 +26,7 @@ #include #include +#include #include #include #include @@ -46,6 +47,8 @@ public: std::unique_ptr generate(ir::OperationIndex ind) override; + void get_operation(FunctionMap &Functions); + private: void visit(const ir::operation::BinaryArithmetic &) override; void visit(const ir::operation::Conv2D &) override; @@ -54,6 +57,14 @@ private: void visit(const ir::operation::Pool2D &) override; void visit(const ir::operation::Reshape &) override; void visit(const ir::operation::Softmax &) override; + absl::Status readConstTensor(const ir::OperandIndex &index, tflite::gpu::TensorOrScalar *param); + absl::Status readConstTensor( + const ir::OperandIndex &index, + absl::variant, + tflite::gpu::Tensor> *alpha); + void addClNode(const std::vector &inputs, + const std::vector &outputs, + std::unique_ptr gpu_op); private: const ir::Operands &_ctx; @@ -62,7 +73,9 @@ private: std::shared_ptr _tensor_builder; std::shared_ptr _tensor_reg; std::shared_ptr _creation_context; - ir::OperandIndexMap> _new_tensors; + std::vector _nodes; + ir::OperationIndex _operation_index; + std::vector _operation_indexes; }; } // namespace gpu_cl diff --git a/runtime/onert/backend/gpu_cl/MemoryManager.h 
b/runtime/onert/backend/gpu_cl/MemoryManager.h index a3b9b39de..4b34c39b9 100644 --- a/runtime/onert/backend/gpu_cl/MemoryManager.h +++ b/runtime/onert/backend/gpu_cl/MemoryManager.h @@ -17,17 +17,18 @@ #ifndef __ONERT_BACKEND_GPU_CL_MEMORY_MANAGER_H__ #define __ONERT_BACKEND_GPU_CL_MEMORY_MANAGER_H__ -#include "ex/InferenceContextEx.h" #include "operand/CLTensor.h" #include "ir/OperandIndexMap.h" #include "ir/OperandInfo.h" #include "util/logging.h" +#include "tensorflow/lite/delegates/gpu/spi.h" #include "tensorflow/lite/delegates/gpu/cl/cl_context.h" +#include "tensorflow/lite/delegates/gpu/cl/inference_context.h" +#include "tensorflow/lite/delegates/gpu/cl/tensor_type_util.h" #include "tensorflow/lite/delegates/gpu/common/status.h" -#include "tensorflow/lite/delegates/gpu/cl/storage_type_util.h" -#include "tensorflow/lite/delegates/gpu/cl/tensor_type.h" +#include "tensorflow/lite/delegates/gpu/common/task/storage_type_util.h" #include @@ -41,24 +42,31 @@ namespace gpu_cl class MemoryManager { public: - MemoryManager(tflite::gpu::cl::CLContext *context) : _context{context} {} + MemoryManager(tflite::gpu::cl::CLContext *context, tflite::gpu::CreateGpuModelInfo create_info, + const std::shared_ptr &environment) + : _context{context}, _create_info{create_info}, _environment{environment} + { + } ~MemoryManager() = default; void allocate(void) { + std::unique_ptr converter_builder = + NewConverterBuilder(_environment.get()); for (const auto &tensor_entry : _tensors) { auto tensor = tensor_entry.second; auto type = tensor->get_type(); - // if (type == TensorType::TENSOR_TYPE_DELETE) { - // continue; - // } + if (type == TensorType::TENSOR_TYPE_DELETE) + { + continue; + } + + const auto &shape = tensor->get_info()._shape; + const auto &descriptor = tensor->get_info()._desc; - const auto &t = tensor_reserver_.Get(tensor_entry.first.value()); - const auto &shape = t->shape; - const auto &descriptor = t->descriptor; if (!CreateTensor(*_context, shape, descriptor, tensor->handle()).ok()) { std::runtime_error("Failed to CreateTensor"); @@ -66,10 +74,10 @@ public: switch (type) { case TensorType::TENSOR_TYPE_INPUT: - tensor->writeConvertInit(); + tensor->writeConvertInit(converter_builder.get(), _environment); break; case TensorType::TENSOR_TYPE_OUTPUT: - tensor->readConvertInit(); + tensor->readConvertInit(converter_builder.get(), _environment); break; default: break; @@ -89,65 +97,60 @@ public: { /* DO NOTHING */ } - void buildTensor(const ir::OperandIndex &ind, const ir::OperandInfo &info, - tflite::gpu::cl::InferenceContext::CreateInferenceInfo create_info, - std::shared_ptr environment, - tflite::gpu::cl::DeviceInfo &device_info, TensorType type) + void buildTensor(const ir::OperandIndex &ind, const ir::OperandInfo &info, TensorType type) { - tflite::gpu::ValueId max_id = 0; - auto data_type = DeduceDataTypeFromPrecision(create_info.precision); - const auto shape = info.shape(); + auto data_type = DeduceDataTypeFromPrecision(_create_info.precision); - auto tensor = std::make_shared(shape.rank(), shape, environment, type); - _tensors[ind] = tensor; - tflite::gpu::BHWC t_shape; - switch (shape.rank()) + tflite::gpu::BHWC BHWC_shape = ToBHWC(info.shape()); + + tflite::gpu::TensorStorageType storage_type = _create_info.storage_type; + tflite::gpu::Layout layout = + BHWC_shape.b == 1 ? 
tflite::gpu::Layout::HWC : tflite::gpu::Layout::BHWC; + + if (!SelectBestStorageType(_environment->device().GetInfo(), BHWC_shape, storage_type, + data_type, layout, &storage_type) + .ok()) { - case 1: - // B layout - t_shape = tflite::gpu::BHWC(shape.dim(0), 1, 1, 1); - break; - case 2: - // BC layout - t_shape = tflite::gpu::BHWC(shape.dim(0), 1, 1, shape.dim(1)); - break; - case 3: - // BWC layout - t_shape = tflite::gpu::BHWC(shape.dim(0), 1, shape.dim(1), shape.dim(2)); - break; - case 4: - // BHWC layout - t_shape = tflite::gpu::BHWC(shape.dim(0), shape.dim(1), shape.dim(2), shape.dim(3)); - break; - default: - break; + throw std::runtime_error("Failed to SelectBestStorageType"); } + auto tensor = std::make_shared( + info.shape().rank(), type, BHWC_shape, + tflite::gpu::TensorDescriptor{data_type, storage_type, layout}); + _tensors[ind] = tensor; + } - tflite::gpu::cl::TensorStorageType storage_type = create_info.storage_type; - tflite::gpu::Layout layout = - t_shape.b == 1 ? tflite::gpu::Layout::HWC : tflite::gpu::Layout::BHWC; + ir::OperandIndex addTensor(const ir::Shape &shape) + { + auto data_type = DeduceDataTypeFromPrecision(_create_info.precision); - tflite::gpu::ValueId id = ind.value(); - storage_type = - tflite::gpu::cl::SelectBestStorageType(device_info, t_shape, storage_type, data_type, layout); - auto dummy = std::make_shared(); - dummy->shape = t_shape; - dummy->descriptor = tflite::gpu::cl::TensorDescriptor{data_type, storage_type, layout}; - tensor_reserver_.Add(id, dummy); + tflite::gpu::BHWC BHWC_shape = ToBHWC(shape); - max_id = std::max(max_id, id); + tflite::gpu::TensorStorageType storage_type = _create_info.storage_type; + tflite::gpu::Layout layout = + BHWC_shape.b == 1 ? tflite::gpu::Layout::HWC : tflite::gpu::Layout::BHWC; - tensor_reserver_.SetNext(max_id + 1); + if (!SelectBestStorageType(_environment->device().GetInfo(), BHWC_shape, storage_type, + data_type, layout, &storage_type) + .ok()) + { + throw std::runtime_error("Failed to SelectBestStorageType"); + } + auto ind = ir::OperandIndex(_new_id--); + auto tensor = std::make_shared( + shape.rank(), TensorType::TENSOR_TYPE_VALID, BHWC_shape, + tflite::gpu::TensorDescriptor{data_type, storage_type, layout}); + _tensors[ind] = tensor; + return ind; } ir::OperandIndexMap> &tensors(void) { return _tensors; } - InferenceContextEx::TensorReserverEx &tensorReservers(void) { return tensor_reserver_; } - private: ir::OperandIndexMap> _tensors; - InferenceContextEx::TensorReserverEx tensor_reserver_; tflite::gpu::cl::CLContext *_context; + tflite::gpu::CreateGpuModelInfo _create_info; + std::shared_ptr _environment; + uint32_t _new_id = UINT32_MAX; }; } // namespace gpu_cl diff --git a/runtime/onert/backend/gpu_cl/TensorBuilder.cc b/runtime/onert/backend/gpu_cl/TensorBuilder.cc index e71733427..318335471 100644 --- a/runtime/onert/backend/gpu_cl/TensorBuilder.cc +++ b/runtime/onert/backend/gpu_cl/TensorBuilder.cc @@ -21,7 +21,6 @@ #include "TensorManager.h" -#include "tensorflow/lite/delegates/gpu/cl/tensor_type.h" #include "tensorflow/lite/delegates/gpu/cl/tensor_type_util.h" #include "tensorflow/lite/delegates/gpu/cl/cl_device.h" #include "tensorflow/lite/delegates/gpu/cl/inference_context.h" @@ -45,11 +44,8 @@ namespace gpu_cl using UsesType = cl_common::UsesType; -TensorBuilder::TensorBuilder(const ir::Operands &operands, TensorManager *tensor_mgr, - tflite::gpu::cl::InferenceContext::CreateInferenceInfo create_info, - const std::shared_ptr &environment) - : _operands{operands}, _tensor_mgr{tensor_mgr}, 
_create_info{create_info}, _environment{ - environment} +TensorBuilder::TensorBuilder(const ir::Operands &operands, TensorManager *tensor_mgr) + : _operands{operands}, _tensor_mgr{tensor_mgr} { assert(_tensor_mgr); } @@ -89,9 +85,9 @@ void TensorBuilder::allocate(void) { auto lifetime_map = cl_common::createLifetimeMap(_lifetime_seq, _parent_map); - for (auto &entry : lifetime_map) + for (const auto &entry : lifetime_map) { - auto &use = entry.second; + const auto &use = entry.second; auto use_type = use.first; auto use_index = use.second; assert(use_index.valid()); @@ -118,18 +114,22 @@ void TensorBuilder::buildTensors(void) assert(_tensor_mgr->constTensors().size() == 0); assert(_tensor_mgr->nonconstTensors().size() == 0); // Normal tensors - for (auto &entry : _tensor_info_map) + for (const auto &entry : _tensor_info_map) { - auto ind = entry.first; + const auto &ind = entry.first; if (_parent_map.count(ind) > 0) continue; auto type = _tensor_type_map.at(ind); const auto &info = entry.second; - _tensor_mgr->buildTensor(ind, info, _create_info, _environment, _environment->device().info_, - type); + _tensor_mgr->buildTensor(ind, info, type); } } +ir::OperandIndex TensorBuilder::addTensor(const ir::Shape &shape) +{ + return _tensor_mgr->addTensor(shape); +} + } // namespace gpu_cl } // namespace backend } // namespace onert diff --git a/runtime/onert/backend/gpu_cl/TensorBuilder.h b/runtime/onert/backend/gpu_cl/TensorBuilder.h index 2a5cb8b5e..e0333fef5 100644 --- a/runtime/onert/backend/gpu_cl/TensorBuilder.h +++ b/runtime/onert/backend/gpu_cl/TensorBuilder.h @@ -34,9 +34,7 @@ namespace gpu_cl class TensorBuilder { public: - TensorBuilder(const ir::Operands &operands, TensorManager *tensor_mgr, - tflite::gpu::cl::InferenceContext::CreateInferenceInfo create_info, - const std::shared_ptr &environment); + TensorBuilder(const ir::Operands &operands, TensorManager *tensor_mgr); /** * @brief Register tensor information to allocate on ACL-CL backend @@ -83,6 +81,7 @@ public: private: void buildTensors(void); ir::OperandIndex findRootParent(ir::OperandIndex index); + ir::OperandIndex addTensor(const ir::Shape &shape); private: const ir::Operands &_operands; @@ -92,8 +91,6 @@ private: ir::OperandIndexMap _uses_count_map; std::unique_ptr _tensor_mgr; - tflite::gpu::cl::InferenceContext::CreateInferenceInfo _create_info; - std::shared_ptr _environment; // for linear executor cl_common::LifetimeSeq _lifetime_seq; diff --git a/runtime/onert/backend/gpu_cl/TensorBuilderHelper.h b/runtime/onert/backend/gpu_cl/TensorBuilderHelper.h deleted file mode 100644 index 7290ff5da..000000000 --- a/runtime/onert/backend/gpu_cl/TensorBuilderHelper.h +++ /dev/null @@ -1,44 +0,0 @@ -/* - * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef __ONERT_BACKEND_GPU_CL_TENSOR_BUILDER_HELPER_H__ -#define __ONERT_BACKEND_GPU_CL_TENSOR_BUILDER_HELPER_H__ - -#include "absl/status/status.h" -#include "tensorflow/lite/delegates/gpu/common/shape.h" - -namespace onert -{ -namespace backend -{ -namespace gpu_cl -{ - -enum TensorType -{ - TENSOR_TYPE_VALID = 0, - TENSOR_TYPE_INPUT = 1, - TENSOR_TYPE_OUTPUT = 2, - TENSOR_TYPE_DELETE = 3 -}; - -absl::Status ExtractAxisFromIndex(int dims, int index, tflite::gpu::Axis *axis); - -} // namespace gpu_cl -} // namespace backend -} // namespace onert - -#endif // __ONERT_BACKEND_GPU_CL_TENSOR_BUILDER_HELPER_H__ diff --git a/runtime/onert/backend/gpu_cl/TensorManager.cc b/runtime/onert/backend/gpu_cl/TensorManager.cc index 9fe0605ac..02e26ed91 100644 --- a/runtime/onert/backend/gpu_cl/TensorManager.cc +++ b/runtime/onert/backend/gpu_cl/TensorManager.cc @@ -42,23 +42,28 @@ void TensorManager::deallocateConsts(void) { _const_mgr->deallocate(); } void TensorManager::deallocateNonconsts(void) { _nonconst_mgr->deallocate(); } void TensorManager::buildTensor(const ir::OperandIndex &ind, const ir::OperandInfo &info, - tflite::gpu::cl::InferenceContext::CreateInferenceInfo create_info, - std::shared_ptr environment, - tflite::gpu::cl::DeviceInfo &device_info, TensorType type) + TensorType type) { assert(_ind_to_mgr.find(ind) == _ind_to_mgr.end()); if (info.isConstant()) { - _const_mgr->buildTensor(ind, info, create_info, environment, device_info, type); + _const_mgr->buildTensor(ind, info, type); _ind_to_mgr.insert({ind, *_const_mgr}); } else { - _nonconst_mgr->buildTensor(ind, info, create_info, environment, device_info, type); + _nonconst_mgr->buildTensor(ind, info, type); _ind_to_mgr.insert({ind, *_nonconst_mgr}); } } +ir::OperandIndex TensorManager::addTensor(const ir::Shape &shape) +{ + auto ind = _nonconst_mgr->addTensor(shape); + _ind_to_mgr.insert({ind, *_nonconst_mgr}); + + return ind; +} void TensorManager::startLifetime(const ir::OperandIndex &ind) { @@ -96,29 +101,6 @@ ir::OperandIndexMap> &TensorManager::nonconst return _nonconst_mgr->tensors(); } -std::shared_ptr TensorManager::atR(const ir::OperandIndex &ind) -{ - if (_nonconst_mgr->tensorReservers().HaveTensor(ind.value())) - { - return _nonconst_mgr->tensorReservers().Get(ind.value()); - } - else if (_const_mgr->tensorReservers().HaveTensor(ind.value())) - { - return _const_mgr->tensorReservers().Get(ind.value()); - } - return nullptr; -} - -InferenceContextEx::TensorReserverEx &TensorManager::constTensorReservers(void) -{ - return _const_mgr->tensorReservers(); -} - -InferenceContextEx::TensorReserverEx &TensorManager::nonconstTensorReservers(void) -{ - return _nonconst_mgr->tensorReservers(); -} - void TensorManager::iterate(const std::function &fn) { for (auto it : _nonconst_mgr->tensors()) diff --git a/runtime/onert/backend/gpu_cl/TensorManager.h b/runtime/onert/backend/gpu_cl/TensorManager.h index 52abc579a..5b09ac130 100644 --- a/runtime/onert/backend/gpu_cl/TensorManager.h +++ b/runtime/onert/backend/gpu_cl/TensorManager.h @@ -19,8 +19,10 @@ #include "MemoryManager.h" +#include "Utils.h" + #include "tensorflow/lite/delegates/gpu/cl/inference_context.h" -#include "tensorflow/lite/delegates/gpu/cl/tensor_type.h" +#include "tensorflow/lite/delegates/gpu/cl/tensor_type_util.h" #include "ir/OperandInfo.h" #include "ir/OperandIndexMap.h" @@ -44,10 +46,8 @@ public: void deallocateConsts(void); void deallocateNonconsts(void); - void buildTensor(const ir::OperandIndex &ind, const ir::OperandInfo &info, - 
tflite::gpu::cl::InferenceContext::CreateInferenceInfo create_info, - std::shared_ptr environment, - tflite::gpu::cl::DeviceInfo &device_info, TensorType type); + void buildTensor(const ir::OperandIndex &ind, const ir::OperandInfo &info, TensorType type); + ir::OperandIndex addTensor(const ir::Shape &shape); std::shared_ptr findTensorAsParent(const ir::OperandIndex &ind); @@ -55,10 +55,6 @@ public: void finishLifetime(const ir::OperandIndex &ind); std::shared_ptr at(const ir::OperandIndex &ind); - std::shared_ptr atR(const ir::OperandIndex &ind); - - InferenceContextEx::TensorReserverEx &constTensorReservers(void); - InferenceContextEx::TensorReserverEx &nonconstTensorReservers(void); ir::OperandIndexMap> &constTensors(void); ir::OperandIndexMap> &nonconstTensors(void); @@ -73,10 +69,14 @@ private: ir::OperandIndexMap _ind_to_mgr; }; -inline TensorManager *createTensorManager(tflite::gpu::cl::CLContext *context) +inline TensorManager * +createTensorManager(tflite::gpu::cl::CLContext *context, + tflite::gpu::CreateGpuModelInfo create_info, + const std::shared_ptr &environment) { VERBOSE(createTensorManager) << "GPU-CL TensorManager" << std::endl; - return new TensorManager(new MemoryManager(context), new MemoryManager(context)); + return new TensorManager(new MemoryManager(context, create_info, environment), + new MemoryManager(context, create_info, environment)); } } // namespace gpu_cl diff --git a/runtime/onert/backend/gpu_cl/TensorRegistry.h b/runtime/onert/backend/gpu_cl/TensorRegistry.h index 6f17aff54..be342e9cb 100644 --- a/runtime/onert/backend/gpu_cl/TensorRegistry.h +++ b/runtime/onert/backend/gpu_cl/TensorRegistry.h @@ -44,7 +44,7 @@ public: auto getClTensor(const ir::OperandIndex &ind) { return _tensor_mgr->at(ind).get(); } - auto getClTensorReserver(const ir::OperandIndex &ind) { return _tensor_mgr->atR(ind); } + ir::OperandIndex addNewClTensor(const ir::Shape &shape) { return _tensor_mgr->addTensor(shape); } private: TensorManager *_tensor_mgr; diff --git a/runtime/onert/backend/gpu_cl/Utils.h b/runtime/onert/backend/gpu_cl/Utils.h new file mode 100644 index 000000000..1953c0e43 --- /dev/null +++ b/runtime/onert/backend/gpu_cl/Utils.h @@ -0,0 +1,155 @@ +/* + * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __ONERT_BACKEND_GPU_CL_TENSOR_BUILDER_HELPER_H__ +#define __ONERT_BACKEND_GPU_CL_TENSOR_BUILDER_HELPER_H__ + +#include "absl/status/status.h" + +#include "tensorflow/lite/delegates/gpu/common/shape.h" +#include "tensorflow/lite/delegates/gpu/common/operations.h" + +#include "ir/operation/BinaryArithmetic.h" +#include "ir/operation/ElementwiseActivation.h" +#include "ir/operation/ElementwiseBinary.h" +#include "ir/operation/ElementwiseUnary.h" +#include "ir/operation/Pool2D.h" + +namespace onert +{ +namespace backend +{ +namespace gpu_cl +{ + +inline tflite::gpu::HW ToHW(int32_t h, int32_t w) +{ + return tflite::gpu::HW(h > 0 ? h : 1, w > 0 ? 
w : 1); +} + +template +inline void UpdatePadding(const ir::PaddingType type, const tflite::gpu::BHWC &input_shape, + AttrT *attr) +{ + if (type == ir::PaddingType::SAME) + { + attr->padding = CalculateSamePadding(input_shape, *attr); + } + else + { + attr->padding.prepended = tflite::gpu::HW(0, 0); + attr->padding.appended = tflite::gpu::HW(0, 0); + } +} + +inline tflite::gpu::PoolingType convertPoolType(ir::operation::Pool2D::PoolType type_ir) +{ + switch (type_ir) + { + case ir::operation::Pool2D::PoolType::AVG: + return tflite::gpu::PoolingType::AVERAGE; + case ir::operation::Pool2D::PoolType::MAX: + return tflite::gpu::PoolingType::MAX; + default: + throw std::runtime_error("gpu_Cl KernelGenerator : Not supported operation yet"); + } +} + +inline tflite::gpu::BHWC ToBHWC(ir::Shape shape) +{ + switch (shape.rank()) + { + case 1: + // B layout + return tflite::gpu::BHWC(shape.dim(0), 1, 1, 1); + break; + case 2: + // BC layout + return tflite::gpu::BHWC(shape.dim(0), 1, 1, shape.dim(1)); + break; + case 3: + // BWC layout + return tflite::gpu::BHWC(shape.dim(0), 1, shape.dim(1), shape.dim(2)); + break; + case 4: + // BHWC layout + return tflite::gpu::BHWC(shape.dim(0), shape.dim(1), shape.dim(2), shape.dim(3)); + break; + default: + break; + } + return tflite::gpu::BHWC(); +} + +inline bool CheckIfLinearConvertible(const ir::Shape *shape) +{ + if (shape->num_elements() <= 0) + { + return false; + } + for (int i = 0; i < shape->rank() - 1; ++i) + { + if (shape->dim(i) != 1) + { + return false; + } + } + return true; +} + +inline tflite::gpu::OperationType +convertArithmeticType(ir::operation::BinaryArithmetic::ArithmeticType arithmetic_type_ir) +{ + switch (arithmetic_type_ir) + { + case ir::operation::BinaryArithmetic::ArithmeticType::ADD: + return tflite::gpu::OperationType::ADD; + case ir::operation::BinaryArithmetic::ArithmeticType::SUB: + return tflite::gpu::OperationType::SUB; + case ir::operation::BinaryArithmetic::ArithmeticType::MUL: + return tflite::gpu::OperationType::MUL; + case ir::operation::BinaryArithmetic::ArithmeticType::DIV: + return tflite::gpu::OperationType::DIV; + default: + throw std::runtime_error("Unsupported ArithmeticType"); + } +} + +inline tflite::gpu::OperationType +convertElementwiseActivationType(ir::operation::ElementwiseActivation::Type type_ir) +{ + switch (type_ir) + { + case ir::operation::ElementwiseActivation::Type::LOGISTIC: + return tflite::gpu::OperationType::SIGMOID; + default: + throw std::runtime_error("Unsupported ElementwiseActivationType"); + } +} + +enum TensorType +{ + TENSOR_TYPE_VALID = 0, + TENSOR_TYPE_INPUT = 1, + TENSOR_TYPE_OUTPUT = 2, + TENSOR_TYPE_DELETE = 3 +}; + +} // namespace gpu_cl +} // namespace backend +} // namespace onert + +#endif // __ONERT_BACKEND_GPU_CL_TENSOR_BUILDER_HELPER_H__ diff --git a/runtime/onert/backend/gpu_cl/ex/InferenceContextEx.h b/runtime/onert/backend/gpu_cl/ex/InferenceContextEx.h deleted file mode 100644 index f67387904..000000000 --- a/runtime/onert/backend/gpu_cl/ex/InferenceContextEx.h +++ /dev/null @@ -1,108 +0,0 @@ -/* - * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright 2019 The TensorFlow Authors. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef __ONERT_BACKEND_GPU_CL_INFERENCE_CONTEXT_EX_H__ -#define __ONERT_BACKEND_GPU_CL_INFERENCE_CONTEXT_EX_H__ - -#include "tensorflow/lite/delegates/gpu/cl/inference_context.h" -#include "tensorflow/lite/delegates/gpu/common/model.h" -#include "absl/strings/str_cat.h" - -namespace onert -{ -namespace backend -{ -namespace gpu_cl -{ - -class InferenceContextEx : public tflite::gpu::cl::InferenceContext -{ -public: - struct DummyTensor - { - tflite::gpu::BHWC shape; - tflite::gpu::cl::TensorDescriptor descriptor; - - bool operator==(const DummyTensor &b) const - { - return shape == b.shape && descriptor == b.descriptor; - } - }; - - class TensorReserverEx - { - public: - tflite::gpu::ValueId Add(const std::shared_ptr &dummy) - { - reservations_[next_] = dummy; - return next_++; - } - void Add(tflite::gpu::ValueId id, const std::shared_ptr &dummy) - { - reservations_[id] = dummy; - } - void SetNext(tflite::gpu::ValueId id) { next_ = id; } - bool HaveTensor(tflite::gpu::ValueId id) - { - return reservations_.find(id) != reservations_.end(); - } - std::shared_ptr Get(tflite::gpu::ValueId id) { return reservations_[id]; } - - std::vector> - GetTensorDescs() const - { - std::vector> result; - for (auto &v : reservations_) - { - tflite::gpu::cl::TensorDescriptor desc = v.second->descriptor; - desc.shape.b = v.second->shape.b; - desc.shape.h = v.second->shape.h; - desc.shape.w = v.second->shape.w; - desc.shape.d = 1; - desc.shape.c = v.second->shape.c; - result.push_back({v.first, desc}); - } - return result; - } - - void Add(const std::vector> - &tensors) - { - for (auto &v : tensors) - { - auto dummy = std::make_shared(); - dummy->descriptor = v.second; - dummy->shape.b = v.second.shape.b; - dummy->shape.h = v.second.shape.h; - dummy->shape.w = v.second.shape.w; - dummy->shape.c = v.second.shape.c; - Add(v.first, dummy); - } - } - - private: - // absl::flat_hash_map reservations_; - std::unordered_map> reservations_; - tflite::gpu::ValueId next_ = 0; - }; -}; - -} // namespace gpu_cl -} // namespace backend -} // namespace onert - -#endif // __ONERT_BACKEND_GPU_CL_INFERENCE_CONTEXT_EX_H__ diff --git a/runtime/onert/backend/gpu_cl/operand/CLTensor.cc b/runtime/onert/backend/gpu_cl/operand/CLTensor.cc index d3ed102a1..1b19b10f8 100644 --- a/runtime/onert/backend/gpu_cl/operand/CLTensor.cc +++ b/runtime/onert/backend/gpu_cl/operand/CLTensor.cc @@ -19,7 +19,7 @@ #include "tensorflow/lite/delegates/gpu/cl/buffer.h" #include "tensorflow/lite/delegates/gpu/cl/cl_context.h" #include "tensorflow/lite/delegates/gpu/cl/tensor.h" -#include "tensorflow/lite/delegates/gpu/cl/tensor_type.h" +#include "tensorflow/lite/delegates/gpu/cl/tensor_type_util.h" using namespace tflite::gpu::cl; @@ -32,9 +32,9 @@ namespace gpu_cl namespace operand { -CLTensor::CLTensor(size_t rank, ir::Shape shape, - std::shared_ptr environment, TensorType type) - : ICLTensor{rank, shape, environment, type}, _tensor(std::make_shared()) +CLTensor::CLTensor(size_t rank, TensorType type, tflite::gpu::BHWC shape, + tflite::gpu::TensorDescriptor desc) + : ICLTensor{rank, type, shape, desc}, 
_tensor(std::make_shared()) { } diff --git a/runtime/onert/backend/gpu_cl/operand/CLTensor.h b/runtime/onert/backend/gpu_cl/operand/CLTensor.h index f2153f430..269551d0c 100644 --- a/runtime/onert/backend/gpu_cl/operand/CLTensor.h +++ b/runtime/onert/backend/gpu_cl/operand/CLTensor.h @@ -38,8 +38,8 @@ public: CLTensor() = delete; public: - CLTensor(size_t rank, ir::Shape shape, std::shared_ptr environment, - TensorType type); + CLTensor(size_t rank, TensorType type, tflite::gpu::BHWC shape, + tflite::gpu::TensorDescriptor desc); public: const tflite::gpu::cl::Tensor *handle() const override; diff --git a/runtime/onert/backend/gpu_cl/operand/ICLTensor.cc b/runtime/onert/backend/gpu_cl/operand/ICLTensor.cc index a95f78056..ef71bbc13 100644 --- a/runtime/onert/backend/gpu_cl/operand/ICLTensor.cc +++ b/runtime/onert/backend/gpu_cl/operand/ICLTensor.cc @@ -43,8 +43,10 @@ void ICLTensor::access(const std::function &fn) fn(*this); } -void ICLTensor::writeConvertInit() +void ICLTensor::writeConvertInit(tflite::gpu::TensorObjectConverterBuilder *converter_builder, + std::shared_ptr environment) { + _environment = environment; TensorObjectDef input_def; input_def.dimensions.b = handle()->Batch(); input_def.dimensions.h = handle()->Height(); @@ -74,21 +76,20 @@ void ICLTensor::writeConvertInit() output_def.object_def.data_type = handle()->GetDataType(); input_def.object_def.user_provided = false; - _converter_builder = NewConverterBuilder(_environment.get()); - if (!_converter_builder->MakeConverter(input_def, permute_def, &_converter_to).ok()) + if (!converter_builder->MakeConverter(input_def, permute_def, &_converter_to).ok()) { throw std::runtime_error("Failed to make converter_to"); } - if (!_converter_builder->MakeConverter(permute_def, output_def, &_converter_from).ok()) + if (!converter_builder->MakeConverter(permute_def, output_def, &_converter_from).ok()) { throw std::runtime_error("Failed to make converter_from"); } } -void ICLTensor::readConvertInit() +void ICLTensor::readConvertInit(tflite::gpu::TensorObjectConverterBuilder *converter_builder, + std::shared_ptr environment) { - _converter_builder = NewConverterBuilder(_environment.get()); - + _environment = environment; TensorObjectDef input_def; input_def.dimensions.b = handle()->Batch(); input_def.dimensions.h = handle()->Height(); @@ -118,20 +119,20 @@ void ICLTensor::readConvertInit() TensorObjectDef output_def = permute_def; output_def.object_def.object_type = ObjectType::CPU_MEMORY; - if (!_converter_builder->MakeConverter(input_def, permute_def, &_converter_from).ok()) + if (!converter_builder->MakeConverter(input_def, permute_def, &_converter_from).ok()) { throw std::runtime_error("Failed to make converter_from"); } - if (!_converter_builder->MakeConverter(permute_def, output_def, &_converter_to).ok()) + if (!converter_builder->MakeConverter(permute_def, output_def, &_converter_to).ok()) { throw std::runtime_error("Failed to make converter_to"); } } -void ICLTensor::enqueueWriteBuffer(const void *ptr, bool) +void ICLTensor::enqueueWriteBuffer(const void *ptr, bool blocking) { - TensorObject input_obj = - MakeReadableCpuMemory(absl::MakeSpan(static_cast(ptr), _shape.num_elements())); + TensorObject input_obj = MakeReadableCpuMemory( + absl::MakeSpan(static_cast(ptr), _info._shape.DimensionsProduct())); TensorObject output_obj; @@ -162,13 +163,19 @@ void ICLTensor::enqueueWriteBuffer(const void *ptr, bool) { throw std::runtime_error("Failed to write cl buffer from cpu memory"); } + + if (blocking && 
!_environment->queue()->WaitForCompletion().ok()) + { + throw std::runtime_error("Failed to WaitForCompletion"); + } + if (!_converter_from->Convert(permute_obj, output_obj).ok()) { throw std::runtime_error("Failed to change layout"); } } -void ICLTensor::enqueueReadBuffer(void *ptr, bool) +void ICLTensor::enqueueReadBuffer(void *ptr, bool blocking) { TensorObject input_obj; @@ -196,7 +203,7 @@ void ICLTensor::enqueueReadBuffer(void *ptr, bool) } TensorObject output_obj = - MakeCpuMemory(absl::MakeSpan(static_cast(ptr), _shape.num_elements())); + MakeCpuMemory(absl::MakeSpan(static_cast(ptr), _info._shape.DimensionsProduct())); if (!_converter_from->Convert(input_obj, permute_obj).ok()) { @@ -206,6 +213,11 @@ void ICLTensor::enqueueReadBuffer(void *ptr, bool) { throw std::runtime_error("Failed to read cl buffer"); } + + if (blocking && !_environment->queue()->WaitForCompletion().ok()) + { + throw std::runtime_error("Failed to WaitForCompletion"); + } } } // namespace operand diff --git a/runtime/onert/backend/gpu_cl/operand/ICLTensor.h b/runtime/onert/backend/gpu_cl/operand/ICLTensor.h index b8ad4469f..47420a1c2 100644 --- a/runtime/onert/backend/gpu_cl/operand/ICLTensor.h +++ b/runtime/onert/backend/gpu_cl/operand/ICLTensor.h @@ -26,7 +26,7 @@ #include "tensorflow/lite/delegates/gpu/cl/tensor.h" #include "tensorflow/lite/delegates/gpu/cl/environment.h" -#include "TensorBuilderHelper.h" +#include "Utils.h" namespace onert { @@ -37,6 +37,12 @@ namespace gpu_cl namespace operand { +struct TensorInfo +{ + tflite::gpu::BHWC _shape; + tflite::gpu::TensorDescriptor _desc; +}; + class ICLTensor : public ITensor { public: @@ -46,15 +52,15 @@ public: ICLTensor(ICLTensor &&) = default; ICLTensor &operator=(ICLTensor &&) = default; - ICLTensor(size_t rank, ir::Shape shape, std::shared_ptr environment, - TensorType type) - : _rank{rank}, _shape{shape}, _environment(environment), _type(type) + ICLTensor(size_t rank, TensorType type, tflite::gpu::BHWC shape, + tflite::gpu::TensorDescriptor desc) + : _rank{rank}, _type(type), _info{shape, desc} { } public: uint8_t *buffer() const final { return reinterpret_cast(handle()->GetMemoryPtr()); } - size_t total_size() const final { return _shape.num_elements() * sizeof(float); } + size_t total_size() const final { return _info._shape.DimensionsProduct() * sizeof(float); } size_t calcOffset(const ir::Coordinates &) const final { throw std::runtime_error("ICLTensor::calcOffset() is not supported."); @@ -78,16 +84,38 @@ public: throw std::runtime_error("ICLTensor::data_zero_points() is not supported."); } bool is_dynamic() const override { return false; } - ir::Shape getShape() const override { return _shape; } + ir::Shape getShape() const override + { + tflite::gpu::BHWC shape = _info._shape; + switch (_rank) + { + case 1: + return ir::Shape{shape.b}; + case 2: + return ir::Shape{shape.b, shape.c}; + case 3: + return ir::Shape{shape.b, shape.w, shape.c}; + case 4: + return ir::Shape{shape.b, shape.h, shape.w, shape.c}; + default: + break; + } + return ir::Shape{}; + } bool has_padding() const override { return false; } void access(const std::function &fn) final; bool needMemoryMap() const final { return true; } void enqueueWriteBuffer(const void *ptr, bool blocking = true) final; void enqueueReadBuffer(void *ptr, bool blocking = true) final; - void writeConvertInit(); - void readConvertInit(); + void writeConvertInit(tflite::gpu::TensorObjectConverterBuilder *converter_builder, + std::shared_ptr environment); + void 
readConvertInit(tflite::gpu::TensorObjectConverterBuilder *converter_builder, + std::shared_ptr environment); + TensorType get_type() { return _type; } + TensorType set_type(TensorType type) { return _type = type; } + const TensorInfo get_info() { return _info; } public: virtual const tflite::gpu::cl::Tensor *handle() const = 0; @@ -96,11 +124,10 @@ public: private: protected: size_t _rank; // Actual rank (reflects extended rank) - ir::Shape _shape; - std::shared_ptr _environment; TensorType _type; - std::unique_ptr _converter_builder; + TensorInfo _info; tflite::gpu::cl::CLMemory _cl_memory; + std::shared_ptr _environment; std::unique_ptr _converter_to; std::unique_ptr _converter_from; }; diff --git a/runtime/onert/backend/ruy/BackendContext.cc b/runtime/onert/backend/ruy/BackendContext.cc index 877772619..48da91b50 100644 --- a/runtime/onert/backend/ruy/BackendContext.cc +++ b/runtime/onert/backend/ruy/BackendContext.cc @@ -50,7 +50,7 @@ FunctionMap BackendContext::genKernels() .operands() .iterate([&](const ir::OperandIndex &, ir::Operand &obj) { obj.releaseData(); }); - for (auto &it : ret) + for (auto &&it : ret) { auto &fn_seq = it.second; fn_seq->iterate([&](exec::IFunction &ifunc) { ifunc.prepare(); }); diff --git a/runtime/onert/backend/trix/BackendContext.cc b/runtime/onert/backend/trix/BackendContext.cc index e46b11d20..39048f2be 100644 --- a/runtime/onert/backend/trix/BackendContext.cc +++ b/runtime/onert/backend/trix/BackendContext.cc @@ -50,7 +50,7 @@ FunctionMap BackendContext::genKernels() .operands() .iterate([&](const ir::OperandIndex &, ir::Operand &obj) { obj.releaseData(); }); - for (auto &it : ret) + for (auto &&it : ret) { auto &fn_seq = it.second; fn_seq->iterate([&](exec::IFunction &ifunc) { ifunc.prepare(); }); diff --git a/runtime/onert/backend/trix/BatchThreadPool.cc b/runtime/onert/backend/trix/BatchThreadPool.cc new file mode 100644 index 000000000..3c2001d75 --- /dev/null +++ b/runtime/onert/backend/trix/BatchThreadPool.cc @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "BatchThreadPool.h" + +namespace onert +{ +namespace backend +{ +namespace trix +{ + +BatchThreadPool::BatchThreadPool(size_t num_threads) : _num_threads(num_threads), _stop_all(false) +{ + _worker_threads.reserve(_num_threads); + for (uint32_t thread_num = 0; thread_num < _num_threads; ++thread_num) + { + _worker_threads.emplace_back([this, thread_num]() { this->worker(thread_num); }); + } +} + +void BatchThreadPool::worker(uint32_t thread_num) +{ + while (true) + { + std::unique_lock lock(_m_job_queue); + _cv_job_queue.wait(lock, [this]() { return !this->_job_queue.empty() || _stop_all; }); + if (_stop_all && this->_job_queue.empty()) + { + return; + } + + // Pop a job in front of queue + auto job = std::move(_job_queue.front()); + _job_queue.pop(); + lock.unlock(); + + // Run the job + job(thread_num); + } +} + +BatchThreadPool::~BatchThreadPool() +{ + _stop_all = true; + _cv_job_queue.notify_all(); + + for (auto &&t : _worker_threads) + { + t.join(); + } +} + +} // namespace trix +} // namespace backend +} // namespace onert diff --git a/runtime/onert/backend/trix/BatchThreadPool.h b/runtime/onert/backend/trix/BatchThreadPool.h new file mode 100644 index 000000000..bc2936fb4 --- /dev/null +++ b/runtime/onert/backend/trix/BatchThreadPool.h @@ -0,0 +1,130 @@ +/* + * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __ONERT_BACKEND_TRIX_BATCH_THREAD_POOL_H__ +#define __ONERT_BACKEND_TRIX_BATCH_THREAD_POOL_H__ + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace onert +{ +namespace backend +{ +namespace trix +{ + +/** + * @brief Class that has a threadpool for batch-by-batch multi-threading + * + */ +class BatchThreadPool +{ +public: + BatchThreadPool(size_t num_threads); + ~BatchThreadPool(); + + /** + * @brief + * + * @tparam F Type of the function for job + * @tparam Args Type of arguments of job + * @param f Function for job + * @param args Arguments of job + * @return std::future::type> + */ + template + std::future::type> enqueueJob(F &&f, + Args &&... 
args) + { + if (_stop_all) + { + throw std::runtime_error("Stop all threads in BatchThreadPool"); + } + + using return_type = typename std::result_of::type; + auto job = std::make_shared>( + std::bind(std::forward(f), std::placeholders::_1, std::forward(args)...)); + std::future job_result_future = job->get_future(); + { + // Push job in the assigned queue + std::lock_guard lock(_m_job_queue); + + // Push job + _job_queue.push([job](uint32_t thread_num) { (*job)(thread_num); }); + } + _cv_job_queue.notify_one(); + + return job_result_future; + } + +private: + /** + * @brief Worker to run jobs + * + * @param thread_num Thread number on which worker is running + */ + void worker(uint32_t thread_num); + +private: + /** + * @brief The number of threads + * + */ + size_t _num_threads; + + /** + * @brief Threads worked for jobs + * + */ + std::vector _worker_threads; + + /** + * @brief Queue for jobs + * + */ + std::queue> _job_queue; + + /** + * @brief condition_variables for _job_queue and _worker_threads + * + */ + std::condition_variable _cv_job_queue; + + /** + * @brief Mutex for the queue _job_queue + * + */ + std::mutex _m_job_queue; + + /** + * @brief Whether all threads are stopped + * + */ + bool _stop_all; +}; + +} // namespace trix +} // namespace backend +} // namespace onert + +#endif // __ONERT_BACKEND_TRIX_BATCH_THREAD_POOL_H__ diff --git a/runtime/onert/backend/trix/Convert.cc b/runtime/onert/backend/trix/Convert.cc new file mode 100644 index 000000000..fe003e7ea --- /dev/null +++ b/runtime/onert/backend/trix/Convert.cc @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "Convert.h" + +namespace onert +{ +namespace backend +{ +namespace trix +{ + +data_layout convertDataLayout(const ir::Layout layout) +{ + switch (layout) + { + case ir::Layout::NCHW: + return DATA_LAYOUT_NCHW; + case ir::Layout::NHWC: + return DATA_LAYOUT_NHWC; + default: + throw std::runtime_error("Unknown Layout"); + } +} + +data_type convertDataType(const ir::DataType type) +{ + switch (type) + { + case ir::DataType::QUANT_UINT8_ASYMM: + return DATA_TYPE_QASYMM8; + case ir::DataType::QUANT_INT16_SYMM: + return DATA_TYPE_QSYMM16; + default: + throw std::runtime_error("Unsupported data type"); + } +} + +} // namespace trix +} // namespace backend +} // namespace onert diff --git a/runtime/onert/backend/trix/Convert.h b/runtime/onert/backend/trix/Convert.h new file mode 100644 index 000000000..662ed44b6 --- /dev/null +++ b/runtime/onert/backend/trix/Convert.h @@ -0,0 +1,93 @@ +/* + * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __ONERT_BACKEND_TRIX_CONVERT_H__ +#define __ONERT_BACKEND_TRIX_CONVERT_H__ + +#include +#include +#include + +#include +#include + +namespace onert +{ +namespace backend +{ +namespace trix +{ + +/** + * @brief Convert type of layout from onert type to npu type + * + * @param layout Layout type in onert + * @return data_layout Layout type in npu + */ +data_layout convertDataLayout(const ir::Layout layout); + +/** + * @brief Convert type of data from onert type to npu type + * + * @param type Data type in onert + * @return data_type Data type in npu + */ +data_type convertDataType(const ir::DataType type); + +/** + * @brief Set the tensors_data_info object + * + * @tparam T Type of tensor based of IPortableTensor + * @param tensors Tensors that have data information + * @param info tensors_data_info to be set + */ +template ::value, bool> = true> +void setDataInfo(const std::vector &tensors, tensors_data_info *info) +{ + info->num_info = static_cast(tensors.size()); + + for (uint32_t idx = 0; idx < info->num_info; ++idx) + { + info->info[idx].layout = convertDataLayout(tensors[idx]->layout()); + info->info[idx].type = convertDataType(tensors[idx]->data_type()); + } +} + +/** + * @brief Set the generic_buffers object + * + * @tparam T Type of tensor based of IPortableTensor + * @param tensors Tensors that have buffer information + * @param buf generic_buffers to be set + */ +template ::value, bool> = true> +void setBuffers(const std::vector &tensors, generic_buffers *buf) +{ + buf->num_buffers = static_cast(tensors.size()); + + for (uint32_t idx = 0; idx < buf->num_buffers; ++idx) + { + buf->bufs[idx].addr = tensors[idx]->buffer(); + buf->bufs[idx].size = static_cast(tensors[idx]->total_size()); + buf->bufs[idx].type = BUFFER_MAPPED; + } +} + +} // namespace trix +} // namespace backend +} // namespace onert + +#endif // __ONERT_BACKEND_TRIX_CONVERT_H__ diff --git a/runtime/onert/backend/trix/DevContext.cc b/runtime/onert/backend/trix/DevContext.cc new file mode 100644 index 000000000..059514878 --- /dev/null +++ b/runtime/onert/backend/trix/DevContext.cc @@ -0,0 +1,307 @@ +/* + * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "DevContext.h" + +#include "Convert.h" + +#include + +namespace onert +{ +namespace backend +{ +namespace trix +{ + +// All things related to npu device handle are gathered this Class, but when implementing npu +// deamon, others except the context roles should be seperated. 
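One contract from the BatchThreadPool introduced above is worth making explicit before the implementation that follows: every job bound through enqueueJob receives the worker's thread number as its first argument (via std::placeholders::_1), and DevContext reuses that number as the index of the device the job should talk to. A minimal, self-contained usage sketch, with an illustrative job and assuming the result_of-based signature shown in BatchThreadPool.h:

#include "BatchThreadPool.h"

#include <cstdint>
#include <iostream>

int main()
{
  onert::backend::trix::BatchThreadPool pool(2); // e.g. one worker per NPU device

  // The worker supplies its own thread number as the first parameter;
  // everything passed after the callable is forwarded as the remaining arguments.
  auto result = pool.enqueueJob(
    [](uint32_t thread_num, int batch_num) -> int32_t {
      std::cout << "batch " << batch_num << " runs on worker " << thread_num << '\n';
      return batch_num;
    },
    /*batch_num=*/0);

  return result.get(); // the returned std::future blocks here until the job has run
}

Because each worker is pinned to one device handle and the submit API called inside the jobs is synchronous, sizing the pool to the device count (as the constructor below does) keeps requests to the same device serialized.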
+DevContext::DevContext() : _dev_handles{}, _model_ids{}, _meta_map{} +{ + auto dev_count = getnumNPUdeviceByType(NPUCOND_TRIV2_CONN_SOCIP); + if (dev_count <= 0) + { + throw std::runtime_error("Unable to find TRIX NPU device"); + } + + // Get NPU device handles + for (int i = 0; i < dev_count; ++i) + { + npudev_h handle; + if (getNPUdeviceByType(&handle, NPUCOND_TRIV2_CONN_SOCIP, i) < 0) + { + throw std::runtime_error("Failed to get TRIX NPU device handle"); + } + _dev_handles.emplace_back(handle); + } + + // NOTE Do not change the number of threads as long as jobs in thread call + // the synchronous APIs such as submitNPU_request() + _batch_thread_pool = std::make_unique(_dev_handles.size()); + // We need to careful not to create multiple `BatchThreadPool`. In case of multiple models, there + // may be a problem having multiple `BatchThreadPool` in current implementation. But if this + // creating thread pool is moved to npu deamon, I think this problem will be solved smoothly. +} + +DevContext::~DevContext() +{ + // NOTE Must release _batch_thread_pool before releasing _dev_handles to wait for all threads to + // be terminated + _batch_thread_pool.reset(nullptr); + + for (const auto &dev_handle : _dev_handles) + { + unregisterNPUmodel_all(dev_handle); + putNPUdevice(dev_handle); + } +} + +ModelID DevContext::registerModel(const std::string &model_file_path) +{ + auto meta = getNPUmodel_metadata(model_file_path.c_str(), false); + + if (meta == nullptr) + { + throw std::runtime_error("Unable to extract the model metadata"); + } + + generic_buffer file_info; + file_info.type = BUFFER_FILE; + file_info.filepath = model_file_path.c_str(); + file_info.size = meta->size; + + ModelID model_id; + + for (uint32_t dev_num = 0; dev_num < _dev_handles.size(); ++dev_num) + { + // Register model for each device + uint32_t model_id_at_device; + if (registerNPUmodel(_dev_handles.at(dev_num), &file_info, &model_id_at_device) < 0) + { + throw std::runtime_error("Failed to register npu model"); + } + + if (dev_num == 0) + { + model_id = model_id_at_device; + _meta_map[model_id_at_device] = std::shared_ptr(meta); + } + else + { + _meta_map[model_id_at_device] = _meta_map[model_id]; + } + + _model_ids[model_id].resize(dev_num + 1); + _model_ids[model_id].at(dev_num) = model_id_at_device; + } + + // Return the model id for device 0 only + return model_id; +} + +void DevContext::unRegisterModel(ModelID model_id) +{ + for (uint32_t dev_num = 0; dev_num < _dev_handles.size(); ++dev_num) + { + const auto model_id_at_device = _model_ids.at(model_id).at(dev_num); + const auto &dev_handle = _dev_handles.at(dev_num); + + // Remove meta data + _meta_map.erase(model_id_at_device); + + // Unregister Model for each device + unregisterNPUmodel(dev_handle, model_id_at_device); + } + // Remove model IDs + _model_ids.erase(model_id); +} + +void DevContext::requestRun(ModelID model_id, input_buffers *input_bufs, tensors_data_info *in_info, + output_buffers *output_bufs, tensors_data_info *out_info, + size_t batch_size) +{ + if (batch_size > 1) + { + if (in_info->num_info != 1) + { + throw std::runtime_error("Supported only an input that has batch now"); + } + if (out_info->num_info != 1) + { + throw std::runtime_error("Supported only one output now"); + } + + if (input_bufs->bufs[0].size % batch_size != 0) + { + throw std::runtime_error("Invalid batch size. 
batch size :" + std::to_string(batch_size) + + ", input buffer size : " + std::to_string(input_bufs->bufs[0].size)); + } + + if (output_bufs->bufs[0].size % batch_size != 0) + { + throw std::runtime_error( + "Invalid batch size. batch size :" + std::to_string(batch_size) + + ", output tensor size : " + std::to_string(output_bufs->bufs[0].size)); + } + + // inputs/outputs for each batch + std::vector in_buffers_vec(batch_size); + std::vector out_buffers_vec(batch_size); + + // Run on thread pool + std::vector> batch_futures; + for (uint32_t batch_num = 0; batch_num < batch_size; ++batch_num) + { + // Enqueue jobs + // The in_info and out_info are always the same even if they are divided by batch, so they are + // used as they are. + auto future = _batch_thread_pool->enqueueJob( + [batch_size, in_info, out_info, + this](uint32_t dev_num, ModelID model_id, const input_buffers *input_bufs, + const output_buffers *output_bufs, uint32_t batch_num) -> int32_t { + // Set buffers of inputs/outputs for each batch + // TODO Support multiple inputs/outputs + input_buffers in_batch_buffers; + in_batch_buffers.num_buffers = input_bufs->num_buffers; + const uint64_t in_batch_offset = input_bufs->bufs[0].size / batch_size; + setBufferByBatch(input_bufs->bufs[0], batch_num, in_batch_offset, + &in_batch_buffers.bufs[0]); + + output_buffers out_batch_buffers; + out_batch_buffers.num_buffers = output_bufs->num_buffers; + const uint64_t out_batch_offset = output_bufs->bufs[0].size / batch_size; + setBufferByBatch(output_bufs->bufs[0], batch_num, out_batch_offset, + &out_batch_buffers.bufs[0]); + + try + { + // dev_num is the same as the thread number in _batch_thread_pool + this->runOneBatch(dev_num, model_id, &in_batch_buffers, in_info, &out_batch_buffers, + out_info); + } + catch (...) 
+ { + _eptr = std::current_exception(); + } + + return batch_num; + }, + model_id, input_bufs, output_bufs, batch_num); + batch_futures.emplace_back(std::move(future)); + } + + for (auto &&future : batch_futures) + { + future.get(); + } + + if (_eptr) + { + std::exception_ptr eptr(nullptr); + _eptr.swap(eptr); + std::rethrow_exception(eptr); + } + } + else + { + runOneBatch(0, model_id, input_bufs, in_info, output_bufs, out_info); + } +} + +void DevContext::runOneBatch(uint32_t dev_num, ModelID model_id, input_buffers *input_bufs, + tensors_data_info *in_info, output_buffers *output_bufs, + tensors_data_info *out_info) +{ + const auto &model_id_at_device = _model_ids.at(model_id).at(dev_num); + + const auto meta = _meta_map.at(model_id_at_device); + if (meta->input_seg_num != in_info->num_info) + { + throw std::runtime_error("The number of inputs does not match to model input seg num"); + } + + if (meta->output_seg_num != out_info->num_info) + { + throw std::runtime_error("The number of outputs does not match to model output seg num"); + } + + const auto &dev_handle = _dev_handles.at(dev_num); + int req_id; + + if (auto error_code = createNPU_request(dev_handle, model_id_at_device, &req_id)) + { + throw std::runtime_error("Unable to create NPU request with model id (" + + std::to_string(model_id_at_device) + ")" + + " error code : " + std::to_string(error_code)); + } + + if (auto error_code = + setNPU_requestData(dev_handle, req_id, input_bufs, in_info, output_bufs, out_info)) + { + removeNPU_request(dev_handle, req_id); + throw std::runtime_error("Unable to create NPU request for model id (" + + std::to_string(model_id_at_device) + ")" + + " error code : " + std::to_string(error_code)); + } + + // NOTE submitNPU_request is not thread-safe(?). It is rarely hanging(unresponsive). + // Ultimately, to solve this problem, we have to either use other thread-safe API or + // change submitNPU_request to be thread-safe, but both works take time. + // As a workaround, let's allow hanging thread. + // TODO Change submitNPU_request to be thread-safe or replaced with other thread-safe API + std::packaged_task task(submitNPU_request); + auto f = task.get_future(); + std::thread thread_submit_request(std::move(task), dev_handle, req_id); + auto status = f.wait_until(std::chrono::system_clock::now() + std::chrono::seconds(60)); + if (status == std::future_status::timeout) + { + // There is no way to terminate hanging submitNPU_request from the outside. + // If a hanging thread is detached, it will remain as a hanging thread. Even so, it's better + // than having the main thread hanging. + thread_submit_request.detach(); + + // TODO Enable removeNPU_request after resolving hanging. 
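The NOTE above is the crux of this function: because submitNPU_request can hang, it is wrapped in a std::packaged_task, its future is waited on with a deadline, and the wrapper thread is detached when the deadline passes. The same guard pattern reduced to a self-contained sketch, with a stand-in blocking function (blockingSubmit and its int parameters are illustrative, not the real NPU API):

#include <chrono>
#include <future>
#include <stdexcept>
#include <thread>

// Stand-in for a blocking C API; sleeps briefly instead of talking to hardware.
int blockingSubmit(int /*handle*/, int /*req_id*/)
{
  std::this_thread::sleep_for(std::chrono::milliseconds(10));
  return 0;
}

int submitWithTimeout(int handle, int req_id, std::chrono::seconds timeout)
{
  std::packaged_task<int(int, int)> task(blockingSubmit);
  auto result = task.get_future();
  std::thread submitter(std::move(task), handle, req_id);

  if (result.wait_for(timeout) == std::future_status::timeout)
  {
    // There is no portable way to cancel the blocked call, so the thread is
    // left detached; the caller at least stops waiting for it.
    submitter.detach();
    throw std::runtime_error("blocking submit timed out");
  }

  submitter.join();
  return result.get(); // propagate the call's return code
}

The detached thread is the accepted cost of the workaround: it lingers until the blocking call eventually returns, which the NOTE judges preferable to hanging the main thread.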
+ // removeNPU_request(dev_handle, req_id); + throw std::runtime_error("The npu API \"submitNPU_request\" timeout"); + } + + auto error_code = f.get(); + thread_submit_request.join(); + if (error_code != 0) + { + removeNPU_request(dev_handle, req_id); + throw std::runtime_error("Unable to submit NPU request with req id (" + std::to_string(req_id) + + ")" + " error code : " + std::to_string(error_code)); + } + + if (auto error_code = removeNPU_request(dev_handle, req_id)) + { + throw std::runtime_error("Unable to remove NPU request with req id (" + std::to_string(req_id) + + ")" + " error code : " + std::to_string(error_code)); + } +} + +void DevContext::setBufferByBatch(const generic_buffer &origin_buf, uint32_t batch_num, + uint64_t batch_offset, generic_buffer *batch_buf) +{ + batch_buf->addr = reinterpret_cast(origin_buf.addr) + batch_num * batch_offset; + batch_buf->size = batch_offset; + batch_buf->type = BUFFER_MAPPED; +} + +} // namespace trix +} // namespace backend +} // namespace onert diff --git a/runtime/onert/backend/trix/DevContext.h b/runtime/onert/backend/trix/DevContext.h index a7dbd7a59..cd8de97e6 100644 --- a/runtime/onert/backend/trix/DevContext.h +++ b/runtime/onert/backend/trix/DevContext.h @@ -17,7 +17,12 @@ #ifndef __ONERT_BACKEND_TRIX_DEV_CONTEXT_H__ #define __ONERT_BACKEND_TRIX_DEV_CONTEXT_H__ +#include "BatchThreadPool.h" + #include +#include +#include +#include namespace onert { @@ -26,103 +31,117 @@ namespace backend namespace trix { +using ModelID = uint32_t; + +/** + * @brief NPU device context of trix backend + * + */ class DevContext { public: - DevContext() - { - auto device_count = getnumNPUdeviceByType(NPUCOND_TRIV2_CONN_SOCIP); - // TODO: x64 platform has 3 cores. We do not support more that 2 cores for now. - if (device_count > 2) - { - device_count = 2; - } - - if (device_count <= 0) - { - throw std::runtime_error("Unable to find TRIX NPU device"); - } - - for (int i = 0; i < device_count; i++) - { - npudev_h h; - if (getNPUdeviceByType(&h, NPUCOND_TRIV2_CONN_SOCIP, i) < 0) - { - throw std::runtime_error("Failed to get TRIX NPU device handle"); - } - _dev_handles.push_back(h); - } - } - - ~DevContext() - { - for (auto h : _dev_handles) - { - if (h != nullptr) - { - unregisterNPUmodel_all(h); - putNPUdevice(h); - } - } - } - - npudev_h getDev(int i) { return _dev_handles[i]; } - int getDevSize() { return _dev_handles.size(); } - - template void setDataInfo(tensors_data_info *info, std::vector &tensors) - { - info->num_info = static_cast(tensors.size()); - - for (uint32_t idx = 0; idx < info->num_info; ++idx) - { - info->info[idx].layout = convertDataLayout(tensors[idx]->layout()); - info->info[idx].type = convertDataType(tensors[idx]->data_type()); - } - } - - template - void setBuffer(generic_buffers *buf, std::vector &tensors, int batch_size, int batch_index) - { - buf->num_buffers = static_cast(tensors.size()); - - for (uint32_t idx = 0; idx < buf->num_buffers; ++idx) - { - buf->bufs[idx].size = static_cast(tensors[idx]->total_size() / batch_size); - buf->bufs[idx].addr = tensors[idx]->buffer() + (batch_index * buf->bufs[idx].size); - buf->bufs[idx].type = BUFFER_MAPPED; - } - } + /** + * @brief Construct a new device Context object + * + */ + DevContext(); + + /** + * @brief Destroy the device Context object + * + */ + ~DevContext(); + + DevContext(const DevContext &) = delete; + DevContext &operator=(const DevContext &) = delete; + + /** + * @brief Register a trix model for all NPU devices + * + * @param model_file_path File path of a trix model + 
* @return ModelID Internal ID of the trix model + */ + ModelID registerModel(const std::string &model_file_path); + + /** + * @brief Unregister a trix model + * + * @param model_id Internal ID of the trix model to be unregistered + */ + void unRegisterModel(ModelID model_id); + + /** + * @brief Request a trix model to be run on NPU + * + * @param model_id Internal ID of a trix model + * @param input_bufs Buffer data of inputs + * @param in_info Data info of inputs + * @param output_bufs Buffer data of outputs + * @param out_info data info of outputs + * @param batch_size Batch size + */ + void requestRun(ModelID model_id, input_buffers *input_bufs, tensors_data_info *in_info, + output_buffers *output_bufs, tensors_data_info *out_info, size_t batch_size); private: - data_layout convertDataLayout(const ir::Layout layout) - { - switch (layout) - { - case ir::Layout::NCHW: - return DATA_LAYOUT_NCHW; - case ir::Layout::NHWC: - return DATA_LAYOUT_NHWC; - default: - throw std::runtime_error("Unknown Layout"); - } - } - - data_type convertDataType(const ir::DataType type) - { - switch (type) - { - case ir::DataType::QUANT_UINT8_ASYMM: - return DATA_TYPE_QASYMM8; - case ir::DataType::QUANT_INT16_SYMM: - return DATA_TYPE_QSYMM16; - default: - throw std::runtime_error("Unsupported data type"); - } - } + /** + * @brief Rquest one batch of a trix model to be run on a device of NPU + * + * @param dev_num Device number + * @param model_id Internal ID of a trix model + * @param input_bufs Buffer data of inputs + * @param in_info Data info of inputs + * @param output_bufs Buffer data of outputs + * @param out_info data info of outputs + */ + void runOneBatch(uint32_t dev_num, ModelID model_id, input_buffers *input_bufs, + tensors_data_info *in_info, output_buffers *output_bufs, + tensors_data_info *out_info); + + /** + * @brief Set the buffer object by batch + * + * @param origin_buf Buffer object that has all batches + * @param batch_num Batch number + * @param batch_offset Size of a batch + * @param batch_buf One batch buffer object to be set + */ + void setBufferByBatch(const generic_buffer &origin_buf, uint32_t batch_num, uint64_t batch_offset, + generic_buffer *batch_buf); private: - // NPU device handles + /** + * @brief NPU device handles + * + */ std::vector _dev_handles; + + /** + * @brief Threadpool for batch-by-batch multi-threading + * + */ + std::unique_ptr _batch_thread_pool; + + // TODO Change key to internal trix model context(?) 
if it is needed + /** + * @brief Map for ID of models + * Internal Model ID : Model ID array for each device + * + */ + std::unordered_map> _model_ids; + + /** + * @brief Map for meta data + * Model ID at each device : meta data + * + */ + std::unordered_map> _meta_map; + + /** + * @brief Exception pointer captured whthin threads + * + */ + std::exception_ptr _eptr; }; } // namespace trix diff --git a/runtime/onert/backend/trix/KernelGenerator.cc b/runtime/onert/backend/trix/KernelGenerator.cc index 68e6840dd..2783bd75b 100644 --- a/runtime/onert/backend/trix/KernelGenerator.cc +++ b/runtime/onert/backend/trix/KernelGenerator.cc @@ -61,11 +61,11 @@ void KernelGenerator::visit(const ir::operation::Bulk &node) using ir::operation::Bulk; std::vector output_tensors; - for (auto &ofm_idx : node.getOutputs()) + for (const auto &ofm_idx : node.getOutputs()) output_tensors.emplace_back(_tensor_reg->getPortableTensor(ofm_idx)); std::vector input_tensors; - for (auto &ifm_idx : node.getInputs()) + for (const auto &ifm_idx : node.getInputs()) input_tensors.emplace_back(_tensor_reg->getPortableTensor(ifm_idx)); // parameters diff --git a/runtime/onert/backend/trix/ops/BulkLayer.cc b/runtime/onert/backend/trix/ops/BulkLayer.cc index 3c49da9a3..db5c81ba7 100644 --- a/runtime/onert/backend/trix/ops/BulkLayer.cc +++ b/runtime/onert/backend/trix/ops/BulkLayer.cc @@ -15,10 +15,8 @@ */ #include "BulkLayer.h" -#include -#include -#include +#include "../Convert.h" namespace onert { @@ -29,12 +27,12 @@ namespace trix namespace ops { -BulkLayer::BulkLayer() : _inputs(), _outputs(), _model_id(0), _meta(nullptr), _dev_context(nullptr) +BulkLayer::BulkLayer() : _inputs(), _outputs(), _model_id(0), _dev_context(nullptr) { // DO NOTHING } -BulkLayer::~BulkLayer() { free(_meta); } +BulkLayer::~BulkLayer() { _dev_context->unRegisterModel(_model_id); } void BulkLayer::configure(const std::vector &inputs, std::vector &outputs, std::string binary_path, @@ -43,133 +41,28 @@ void BulkLayer::configure(const std::vector &inputs, _inputs = inputs; _outputs = outputs; _dev_context = dev_context; - - _meta = getNPUmodel_metadata(binary_path.c_str(), false); - if (_meta == nullptr) - { - throw std::runtime_error("Unable to extract the model metadata"); - } - - _model_id.resize(_dev_context->getDevSize()); - - generic_buffer model_file; - model_file.type = BUFFER_FILE; - model_file.filepath = binary_path.c_str(); - model_file.size = _meta->size; - - for (int i = 0; i < _dev_context->getDevSize(); i++) - { - if (registerNPUmodel(dev_context->getDev(i), &model_file, &_model_id[i]) < 0) - { - throw std::runtime_error("Failed to register npu model"); - } - } -} - -void single_job(npudev_h dev, int req_id, input_buffers *input_buf, tensors_data_info *in_info, - output_buffers *output_buf, tensors_data_info *out_info) -{ - if (setNPU_requestData(dev, req_id, input_buf, in_info, output_buf, out_info)) - { - throw std::runtime_error("Unable to create NPU request for red_id (" + std::to_string(req_id) + - ")"); - } - - if (submitNPU_request(dev, req_id)) - { - throw std::runtime_error("Unable to submit NPU request with req id (" + std::to_string(req_id) + - ")"); - } + _model_id = _dev_context->registerModel(binary_path); } void BulkLayer::run() { - // TODO: Remove too many assumption - // We assume user wants batch execution if user's input size is multiples of model's input size - int user_input_batch = (_inputs[0]->get_info().shape()).dim(0); - int model_input_batch = _meta->input_seg_dims[0][0]; - int batch_size = user_input_batch / 
model_input_batch; - bool is_batch_execution = (batch_size != 1 ? true : false); - - std::vector req_id(_dev_context->getDevSize()); - - for (int i = 0; i < _dev_context->getDevSize(); i++) - { - if (createNPU_request(_dev_context->getDev(i), _model_id[i], &req_id[i])) - { - throw std::runtime_error("Unable to create NPU request with model id (" + - std::to_string(_model_id[i]) + ")"); - } - } - - if (_meta->input_seg_num != _inputs.size()) - { - throw std::runtime_error("input size does not match to model input seg num"); - } - - if (_meta->output_seg_num != _outputs.size()) - { - throw std::runtime_error("output size does not match to model output seg num"); - } - tensors_data_info in_info; tensors_data_info out_info; - _dev_context->setDataInfo(&in_info, _inputs); - _dev_context->setDataInfo(&out_info, _outputs); + setDataInfo(_inputs, &in_info); + setDataInfo(_outputs, &out_info); - std::vector input_buf; - std::vector output_buf; - input_buf.resize(_dev_context->getDevSize()); - output_buf.resize(_dev_context->getDevSize()); - - std::vector> f(_dev_context->getDevSize()); - - const int num_cores = _dev_context->getDevSize(); - if (is_batch_execution) - { - // TODO: Support for general number of cores(>2) - // Here we assume that 2 trix cores - for (int i = 0; i < (batch_size); i = i + num_cores) - { - for (int core = 0; core < num_cores; core++) - { - _dev_context->setBuffer(&input_buf[core], _inputs, batch_size, - i + core); - _dev_context->setBuffer(&output_buf[core], _outputs, batch_size, i + core); - } - for (int core = 0; core < num_cores; core++) - { - - if (i + core < batch_size) - { - f[core] = - std::async(std::launch::async, &single_job, _dev_context->getDev(core), req_id[core], - &input_buf[core], &in_info, &output_buf[core], &out_info); - } - } - for (int core = 0; core < num_cores; core++) - { - f[core].wait(); - } - } - } - else - { - _dev_context->setBuffer(&input_buf[0], _inputs, batch_size, 0); - _dev_context->setBuffer(&output_buf[0], _outputs, batch_size, 0); - - single_job(_dev_context->getDev(0), req_id[0], &input_buf[0], &in_info, &output_buf[0], - &out_info); - } + input_buffers input_bufs; + output_buffers output_bufs; + setBuffers(_inputs, &input_bufs); + setBuffers(_outputs, &output_bufs); - for (int i = 0; i < _dev_context->getDevSize(); i++) + size_t batch_size = 1; + // TODO Remove this assumption + if (_inputs.size() == 1 && _outputs.size() == 1 && _inputs.at(0)->getShape().dim(0) > 1) { - if (removeNPU_request(_dev_context->getDev(i), req_id[i])) - { - throw std::runtime_error("Unable to remove NPU request with req id (" + - std::to_string(req_id[i]) + ")"); - } + batch_size = _inputs.at(0)->getShape().dim(0); } + _dev_context->requestRun(_model_id, &input_bufs, &in_info, &output_bufs, &out_info, batch_size); } void BulkLayer::prepare() diff --git a/runtime/onert/backend/trix/ops/BulkLayer.h b/runtime/onert/backend/trix/ops/BulkLayer.h index 614c0f728..6590b6989 100644 --- a/runtime/onert/backend/trix/ops/BulkLayer.h +++ b/runtime/onert/backend/trix/ops/BulkLayer.h @@ -50,8 +50,7 @@ private: std::vector _inputs; std::vector _outputs; - std::vector _model_id; - npubin_meta *_meta; + ModelID _model_id; std::shared_ptr _dev_context; }; diff --git a/runtime/onert/backend/xnnpack/BackendContext.cc b/runtime/onert/backend/xnnpack/BackendContext.cc index 42fffb608..c52e275aa 100644 --- a/runtime/onert/backend/xnnpack/BackendContext.cc +++ b/runtime/onert/backend/xnnpack/BackendContext.cc @@ -50,7 +50,7 @@ FunctionMap BackendContext::genKernels() .operands() 
.iterate([&](const ir::OperandIndex &, ir::Operand &obj) { obj.releaseData(); }); - for (auto &it : ret) + for (auto &&it : ret) { auto &fn_seq = it.second; fn_seq->iterate([&](exec::IFunction &ifunc) { ifunc.prepare(); }); diff --git a/runtime/onert/core/CMakeLists.txt b/runtime/onert/core/CMakeLists.txt index 87c7a13e4..8041ab5bc 100644 --- a/runtime/onert/core/CMakeLists.txt +++ b/runtime/onert/core/CMakeLists.txt @@ -57,4 +57,4 @@ target_link_libraries(${TEST_ONERT_CORE} nnfw_coverage) target_link_libraries(${TEST_ONERT_CORE} gtest gtest_main dl ${LIB_PTHREAD}) add_test(${TEST_ONERT_CORE} ${TEST_ONERT_CORE}) -install(TARGETS ${TEST_ONERT_CORE} DESTINATION unittest_standalone) +install(TARGETS ${TEST_ONERT_CORE} DESTINATION unittest) diff --git a/runtime/onert/core/include/backend/basic/BackendContextHelpers.h b/runtime/onert/core/include/backend/basic/BackendContextHelpers.h index cf2da4c34..970a9f71c 100644 --- a/runtime/onert/core/include/backend/basic/BackendContextHelpers.h +++ b/runtime/onert/core/include/backend/basic/BackendContextHelpers.h @@ -84,19 +84,23 @@ template void planTensors(const T_BackendContext &ct tensor_builder->notifyFirstUse(ind); } - for (auto &pair : def_map) + for (const auto &pair : def_map) { - if (pair.second == 0) - tensor_builder->notifyFirstUse(pair.first); + const auto &ind = pair.first; + const auto def_count = pair.second; + if (def_count == 0) + tensor_builder->notifyFirstUse(ind); } // This is a workaround to keep the operands over the execution // (the operands look like they are unused) std::vector operands_last_until_end; - for (auto &pair : uses_map) + for (const auto &pair : uses_map) { - if (pair.second == 0) - operands_last_until_end.push_back(pair.first); + const auto &ind = pair.first; + const auto use_count = pair.second; + if (use_count == 0) + operands_last_until_end.push_back(ind); } // At each operation, @@ -161,7 +165,7 @@ template void planTensors(const T_BackendContext &ct } } - for (auto &ind : operands_last_until_end) + for (const auto &ind : operands_last_until_end) { tensor_builder->notifyLastUse(ind); } diff --git a/runtime/onert/core/include/compiler/Compiler.h b/runtime/onert/core/include/compiler/Compiler.h index f05d63c66..9a86f407e 100644 --- a/runtime/onert/core/include/compiler/Compiler.h +++ b/runtime/onert/core/include/compiler/Compiler.h @@ -22,76 +22,19 @@ #ifndef __ONERT_COMPILER_COMPILE_H_ #define __ONERT_COMPILER_COMPILE_H_ +#include "CompilerOptions.h" +#include "ICompiler.h" #include "ir/NNPkg.h" -#include "exec/Executors.h" -#include "util/TracingCtx.h" namespace onert { - namespace compiler { -enum class State -{ - CREATED, // Before compilation - COMPILED // Success compilation -}; - -struct ManualSchedulerOptions -{ -public: - void setBackendMap(const std::string &str); - -public: - std::string backend_for_all; - std::unordered_map opcode_to_backend; - std::unordered_map index_to_backend; -}; - -struct PartialGraphOptions -{ - std::unordered_map index_to_graph; -}; - -class CompilerOptions -{ -public: - // Set default values for CompilerOptions - // All these default values should not be fetched from Env, when we stop supporting Android NNAPI. 
- static std::unique_ptr fromGlobalConfig(); - -public: - // GENERAL OPTIONS - std::vector backend_list; - - // OPTIONS ONLY FOR DEBUGGING/PROFILING - std::string trace_filepath; //< File path to save trace records - int graph_dump_level; //< Graph dump level, values between 0 and 2 are valid - std::string executor; //< Executor name to use - ManualSchedulerOptions manual_scheduler_options; //< Options for ManualScheduler - bool he_scheduler; //< HEScheduler if true, ManualScheduler otherwise - bool he_profiling_mode; //< Whether HEScheduler profiling mode ON/OFF - bool disable_compile; //< Run with Interpreter if true, try compilation otherwise - bool fp16_enable; //< Whether fp16 mode ON/OFF - PartialGraphOptions partial_graph_options; -}; - -struct CompilerArtifact -{ - CompilerArtifact(void) = delete; - CompilerArtifact(std::shared_ptr executors, - std::unique_ptr tracing_ctx) - : _executors{executors}, _tracing_ctx{std::move(tracing_ctx)} {}; - - std::shared_ptr _executors; - std::unique_ptr _tracing_ctx; -}; - /** * @brief Class to compile NN package */ -class Compiler +class Compiler : public ICompiler { public: /** @@ -109,55 +52,25 @@ public: Compiler(const std::shared_ptr &nnpkg, std::vector> &copts); -public: /** - * @brief Do compilation with the options - * - * @return std::shared_ptr Executors as a result of compilation + * @brief Destroy the Compiler object */ - std::shared_ptr compile(void); + ~Compiler() = default; +public: /** * @brief Do compilation with the options * - * @return std::vector> Executors as a result of compilation - * for pipeline - */ - std::vector> compile(const char *package_file_path, - const char *map_file_path); - - State state(void) const { return _state; } - - /** - * @brief Allow to compute float32 using float16 data type - */ - void enableToFp16(); - - /** - * @brief Build the partial graphs to compile with original graph + * @return std::shared_ptr Executors as a result of compilation */ - bool buildPartialGraph(uint32_t num_graphs); - -private: - void checkProfilerConditions(); - std::shared_ptr &primary_subgraph() - { - return _nnpkg->primary_model()->at(ir::SubgraphIndex{0}); - } + std::shared_ptr compile(void); private: - std::shared_ptr _nnpkg; - // NOTE These executors does not have duplicated subgraph. This mean they do not allow support - // subgraphs being called recursively because data of non-constant tensor of parent executor will - // be updated by child executor. If you want to support subgraphs being called recursively, you - // have to add allocate non-constant tensor memory of executors in execution time when each - // subgraph is called. - State _state; - std::vector _voptions; + std::shared_ptr _model; + CompilerOptions *_options; }; } // namespace compiler - } // namespace onert #endif // __ONERT_COMPILER_COMPILE_H_ diff --git a/runtime/onert/core/include/compiler/CompilerFactory.h b/runtime/onert/core/include/compiler/CompilerFactory.h new file mode 100644 index 000000000..4894366a2 --- /dev/null +++ b/runtime/onert/core/include/compiler/CompilerFactory.h @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __ONERT_COMPILER_COMPILER_FACTORY_H__ +#define __ONERT_COMPILER_COMPILER_FACTORY_H__ + +#include "ICompiler.h" +#include "CompilerOptions.h" +#include "ir/NNPkg.h" + +namespace onert +{ +namespace compiler +{ + +// TODO Support register and use compiler plugin +class CompilerFactory +{ +public: + static CompilerFactory &get(); + +public: + std::unique_ptr create(const std::shared_ptr &nnpkg, + std::vector> &copts); + +private: + // It is not allowed to use CompilerFactory without get() + CompilerFactory() = default; +}; + +} // namespace compiler +} // namespace onert + +#endif // __ONERT_COMPILER_COMPILER_FACTORY_H__ diff --git a/runtime/onert/core/include/compiler/CompilerOptions.h b/runtime/onert/core/include/compiler/CompilerOptions.h new file mode 100644 index 000000000..bbe15fc06 --- /dev/null +++ b/runtime/onert/core/include/compiler/CompilerOptions.h @@ -0,0 +1,91 @@ +/* + * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __ONERT_COMPILER_COMPILER_OPTIONS_H_ +#define __ONERT_COMPILER_COMPILER_OPTIONS_H_ + +#include "ir/OpCode.h" +#include "ir/Index.h" + +#include +#include +#include +#include + +namespace onert +{ +namespace compiler +{ + +struct ManualSchedulerOptions +{ +public: + void setBackendMap(const std::string &str); + +public: + std::string backend_for_all; + std::unordered_map opcode_to_backend; + std::unordered_map index_to_backend; +}; + +class CompilerOptions +{ +public: + /** + * @brief Set default values for CompilerOptions + * @return Generated CompileOption + * + * @note All these default values should not be fetched from Env + * when we stop supporting Android NNAPI. 
+ */ + static std::unique_ptr fromGlobalConfig(); + + /** + * @brief Allow to compute float32 using float16 data type + */ + void enableToFp16() { fp16_enable = true; } + + /** + * @brief Force default values of CompilerOptions for correct compilations + * + * @note This should be called after CompilerOptions setting is finished + * to prevent value overwriting + */ + void forceInternalOptions(); + + /** + * @brief Print option value + */ + void verboseOptions(); + +public: + // GENERAL OPTIONS + std::vector backend_list; + + // OPTIONS ONLY FOR DEBUGGING/PROFILING + std::string trace_filepath; //< File path to save trace records + int graph_dump_level; //< Graph dump level, values between 0 and 2 are valid + std::string executor; //< Executor name to use + ManualSchedulerOptions manual_scheduler_options; //< Options for ManualScheduler + bool he_scheduler; //< HEScheduler if true, ManualScheduler otherwise + bool he_profiling_mode; //< Whether HEScheduler profiling mode ON/OFF + bool fp16_enable; //< Whether fp16 mode ON/OFF +}; + +} // namespace compiler +} // namespace onert + +#endif // __ONERT_COMPILER_COMPILER_OPTIONS_H_ diff --git a/runtime/onert/core/include/compiler/ICompiler.h b/runtime/onert/core/include/compiler/ICompiler.h new file mode 100644 index 000000000..255e0509d --- /dev/null +++ b/runtime/onert/core/include/compiler/ICompiler.h @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/** + * @file ICompiler.h + * @brief This file contains ICompiler class to define and run compilation phase + */ + +#ifndef __ONERT_COMPILER_I_COMPILER_H_ +#define __ONERT_COMPILER_I_COMPILER_H_ + +#include "exec/IExecutors.h" +#include "util/TracingCtx.h" + +namespace onert +{ +namespace compiler +{ + +struct CompilerArtifact +{ + CompilerArtifact(void) = delete; + CompilerArtifact(std::shared_ptr executors, + std::unique_ptr tracing_ctx) + : _executors{executors}, _tracing_ctx{std::move(tracing_ctx)} {}; + + std::shared_ptr _executors; + std::unique_ptr _tracing_ctx; +}; + +class ICompiler +{ +public: + /** + * @brief Virtual ICompiler destructor + * @note Require derived class destructor + */ + virtual ~ICompiler() = default; + + /** + * @brief Do compilation + * @return std::shared_ptr Executors as a result of compilation + */ + virtual std::shared_ptr compile(void) = 0; +}; + +} // namespace compiler +} // namespace onert + +#endif // __ONERT_COMPILER_I_COMPILER_H_ diff --git a/runtime/onert/core/include/compiler/LoweredGraph.h b/runtime/onert/core/include/compiler/LoweredGraph.h index 7264f2a10..e9f0ae0de 100644 --- a/runtime/onert/core/include/compiler/LoweredGraph.h +++ b/runtime/onert/core/include/compiler/LoweredGraph.h @@ -36,13 +36,9 @@ class LoweredGraph { public: LoweredGraph(const ir::Graph &graph, const compiler::CompilerOptions &options); - LoweredGraph(const ir::Graph &parent_graph, const ir::Graph &graph, - const compiler::CompilerOptions &options); ir::Graph &graph() { return _graph; } const ir::Graph &graph() const { return _graph; } - ir::Graph &parent_graph() { return _parent_graph; } - const ir::Graph &parent_graph() const { return _parent_graph; } const compiler::GraphLowerInfo &lower_info() const { return _lower_info_map; } compiler::GraphLowerInfo &lower_info() { return _lower_info_map; } std::shared_ptr> indexed_ranks() { return _indexed_ranks; } @@ -69,7 +65,6 @@ private: * It allows the original graph can be compiled multiple times. 
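Taken together, ICompiler, CompilerFactory and CompilerOptions replace the old monolithic Compiler entry point. A hedged sketch of the intended call sequence for an already-loaded package, assuming create() takes the package plus a vector of per-model CompilerOptions and that compile() returns the CompilerArtifact declared above (the factory presumably picks a concrete compiler such as Compiler or MultiModelCompiler):

#include <memory>
#include <vector>

#include "compiler/CompilerFactory.h"
#include "compiler/CompilerOptions.h"
#include "compiler/ICompiler.h"
#include "ir/NNPkg.h"

// Hedged sketch: drive the refactored compile path for a loaded NN package.
std::shared_ptr<onert::compiler::CompilerArtifact>
compilePackage(const std::shared_ptr<onert::ir::NNPkg> &nnpkg)
{
  using namespace onert::compiler;

  // One option set; fromGlobalConfig() fills in the defaults documented above.
  std::vector<std::unique_ptr<CompilerOptions>> copts;
  copts.emplace_back(CompilerOptions::fromGlobalConfig());

  // The factory returns some concrete ICompiler behind the interface (signature assumed).
  auto compiler = CompilerFactory::get().create(nnpkg, copts);

  // compile() hands back the executor set plus tracing context.
  return compiler->compile();
}

Per the note in CompilerOptions.h, forceInternalOptions() should be called only after all user-visible settings are in place, so a caller doing more than the defaults would invoke it last.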
*/ ir::Graph _graph; - ir::Graph _parent_graph; std::shared_ptr> _indexed_ranks; compiler::GraphLowerInfo _lower_info_map; ir::OperationIndexMap _has_dynamic_tensor_map; diff --git a/runtime/onert/core/include/compiler/StaticShapeInferer.h b/runtime/onert/core/include/compiler/StaticShapeInferer.h index f701dc207..94d6ba1a7 100644 --- a/runtime/onert/core/include/compiler/StaticShapeInferer.h +++ b/runtime/onert/core/include/compiler/StaticShapeInferer.h @@ -101,6 +101,15 @@ public: void dump(); + /** + * @brief Create a lowered model shape inferer map + * @param[in] lowered_subgs lowered model subgraph map + * @return Shape inferer map + */ + static std::unordered_map> + createStaticShapeInferers( + const std::unordered_map> &lowered_subgs); + private: bool checkDynamicInput(const ir::Operation &op); bool checkDynamicOutput(const ir::Operation &op); diff --git a/runtime/onert/core/include/exec/Execution.h b/runtime/onert/core/include/exec/Execution.h index 1e8083c4c..ba3edcdd6 100644 --- a/runtime/onert/core/include/exec/Execution.h +++ b/runtime/onert/core/include/exec/Execution.h @@ -22,7 +22,7 @@ #define __ONERT_EXEC_EXECUTION_H__ #include "ir/Layout.h" -#include "exec/Executors.h" +#include "exec/IExecutors.h" #include "IODescription.h" #include @@ -46,16 +46,15 @@ public: * @brief Construct a new Execution object * @param[in] executor Model executor */ - Execution(const std::shared_ptr &executors); + Execution(const std::shared_ptr &executors); public: /** * @brief Returns primary graph object * @return Graph object */ - const ir::Graph &primary_subgraph() const { return primary_executor()->graph(); } + const ir::Graph &primary_subgraph() const { return entryExecutor()->graph(); } - const ir::Graph &primary_parentgraph() const { return primary_executor()->parent_graph(); } /** * @brief Change input shape * @param[in] index Input index @@ -146,121 +145,15 @@ public: ir::Shape getInputShape(ir::IOIndex ind) const; ir::Shape getOutputShape(ir::IOIndex ind) const; - // - // Experimental API - // - - // accessor - std::vector< - std::tuple, onert::ir::IOIndex, onert::ir::IOIndex>> - getNextExes() - { - return next_exes; - } - std::deque> *getAsyncIoDescs() { return &_async_io_descs; } - std::deque> *getAsyncResults() { return &_async_results; } - - /** - * @brief Push IO information between related executions into next_exes - * @param[in] next address of next execution - * @param[in] o_index Output index of current execution (it will be the input of next execution) - * @param[in] i_index Input index of next execution - */ - void pushNextExe(std::shared_ptr next, onert::ir::IOIndex o_index, - onert::ir::IOIndex i_index) - { - next_exes.push_back({next, o_index, i_index}); - } - - /** - * @brief Create New IODescription instance for new inputs outputs - * @param[in] index instance count number - */ - void createNewAsyncDesc(uint32_t count = 0); - - /** - * @brief Set async input data's information - * @param[in] index Input index - * @param[in] buffer Input data's buffer pointer - * @param[in] length Input data's length - * @param[in] layout Input data's data format - */ - void executeAsyncInput(const ir::IOIndex &index, const void *buffer, size_t length, - ir::Layout layout = ir::Layout::NHWC); - - /** - * @brief Set async output data's information - * @param[in] index Output index - * @param[in] buffer Output data's buffer pointer - * @param[in] length Output data's length - * @param[in] layout Output data's data format - */ - void executeAsyncOutput(const ir::IOIndex &index, void 
*buffer, size_t length, - ir::Layout layout = ir::Layout::NHWC); - - /** - * @brief Async execution - * @note It should be called after setting input and output buffer - */ - void AsyncExecute(); - - /** - * @brief Set finish - */ - void setFinish(); - - /** - * @brief Check if input queue is empty - * @return @c true if queue is empty, otherwise @c false - */ - bool isEmptyQueue(); - - /** - * @brief Wait semaphore to prevent race condition - */ - void asyncIoDescSemWait(); - - /** - * @brief Post semaphore to prevent race condition - */ - void asyncIoDescSemPost(); - - /** - * @brief Inference - * @note this function provided to the thread for pipelining - */ - void runInference(); - - /** - * @brief Check if stop_wait is true - * @return @c true if stop_wait is true, otherwise @c false - */ - bool stopWait(void) const; - - /** - * @brief Set stop_wait to terminate consumer thread - */ - void sholudStop(); - private: - const std::unique_ptr &primary_executor() const - { - return _executors->at(ir::SubgraphIndex{0}); - }; - std::unique_ptr &primary_executor() { return _executors->at(ir::SubgraphIndex{0}); }; + const IExecutor *entryExecutor() const { return _executors->entryExecutor(); }; + IExecutor *entryExecutor() { return _executors->entryExecutor(); }; private: - const std::shared_ptr _executors; + const std::shared_ptr _executors; IODescription _io_desc; - std::deque> _async_io_descs; - sem_t _async_io_descs_sem; - std::deque> _async_results; - std::vector< - std::tuple, onert::ir::IOIndex, onert::ir::IOIndex>> - next_exes; std::unique_ptr _exec_thread; bool finished{false}; - bool stop_wait{false}; }; } // namespace exec diff --git a/runtime/onert/core/include/exec/Executors.h b/runtime/onert/core/include/exec/Executors.h deleted file mode 100644 index 5adb0eda4..000000000 --- a/runtime/onert/core/include/exec/Executors.h +++ /dev/null @@ -1,71 +0,0 @@ -/* - * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef __ONERT_EXEC_EXECUTORS_H__ -#define __ONERT_EXEC_EXECUTORS_H__ - -#include "IExecutor.h" -#include "ir/NNPkg.h" - -namespace onert -{ -namespace exec -{ - -/** - * @brief Class to gather executors - */ -class Executors -{ -public: - Executors(void) = default; - Executors(std::unique_ptr model_edges) { _model_edges = std::move(model_edges); } - Executors(const Executors &) = delete; - Executors(Executors &&) = default; - - // TODO Use Executor index - void emplace(ir::SubgraphIndex idx, std::unique_ptr exec) - { - _executors.emplace(idx, std::move(exec)); - } - - std::unique_ptr &at(ir::SubgraphIndex idx) { return _executors.at(idx); } - - uint32_t inputSize() const; - - uint32_t outputSize() const; - - const ir::OperandInfo inputInfo(const ir::IOIndex &index); - - const ir::OperandInfo outputInfo(const ir::IOIndex &index); - - void execute(const IODescription &desc); - -private: - void executeEntries(const IODescription &desc); - -private: - // TODO Use Executor index - // Changing index will effect if/while compile and kernel implementation - std::unordered_map> _executors; - // NOTE _model_edges may use different struct type for executor implementation - std::unique_ptr _model_edges; -}; - -} // namespace exec -} // namespace onert - -#endif // __ONERT_EXEC_EXECUTORS_H__ diff --git a/runtime/onert/core/include/exec/FunctionSequence.h b/runtime/onert/core/include/exec/FunctionSequence.h index 7ff6d8b8c..a7020d425 100644 --- a/runtime/onert/core/include/exec/FunctionSequence.h +++ b/runtime/onert/core/include/exec/FunctionSequence.h @@ -66,7 +66,7 @@ public: template void wrap(Args &&... args) { - for (auto &function : _functions) + for (auto &&function : _functions) { function = std::make_unique(std::move(function), args...); } diff --git a/runtime/onert/core/include/exec/IExecutor.h b/runtime/onert/core/include/exec/IExecutor.h index bb5b5af98..46dbcd033 100644 --- a/runtime/onert/core/include/exec/IExecutor.h +++ b/runtime/onert/core/include/exec/IExecutor.h @@ -46,7 +46,6 @@ namespace onert { namespace exec { -class IExecutionObserver; /** * @brief Struct to define interface of Executor */ @@ -66,14 +65,7 @@ struct IExecutor * * @return Graph object */ - virtual const ir::Graph &graph() = 0; - - /** - * @brief Returns parent graph object - * - * @return Graph object - */ - virtual const ir::Graph &parent_graph() = 0; + virtual const ir::Graph &graph() const = 0; /** * @brief Set an ordering on operations @@ -99,6 +91,13 @@ struct IExecutor virtual void execute(const std::vector &inputs, const std::vector &outputs) = 0; + /** + * @brief Get input tensor objects + * + * @return Vector of @c IOTensor + */ + virtual const std::vector &getInputTensors() const = 0; + /** * @brief Get output tensor objects * diff --git a/runtime/onert/core/include/exec/IExecutors.h b/runtime/onert/core/include/exec/IExecutors.h new file mode 100644 index 000000000..013da716b --- /dev/null +++ b/runtime/onert/core/include/exec/IExecutors.h @@ -0,0 +1,98 @@ +/* + * Copyright (c) 2023 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __ONERT_EXEC_I_EXECUTORS_H__ +#define __ONERT_EXEC_I_EXECUTORS_H__ + +#include "IExecutor.h" + +namespace onert +{ +namespace exec +{ + +/** + * @brief Class to gather NN package's executor set + */ +class IExecutors +{ +public: + /** + * @brief Virtual IExecutors destructor + * @note Require derived class destructor + */ + virtual ~IExecutors() = default; + +public: + /** + * @brief Insert executor in executor set + * @param[in] model_index Model index + * @param[in] subg_index Subgraph index + * @param[in] exec Executor to insert + * + * @todo Use Executor index + */ + virtual void emplace(const ir::ModelIndex &model_index, const ir::SubgraphIndex &subg_index, + std::unique_ptr exec) = 0; + + /** + * @brief Return executor of index + * @param[in] model_index Model index + * @param[in] subg_index Subgraph index + * @return Executor + */ + virtual IExecutor *at(const ir::ModelIndex &model_index, + const ir::SubgraphIndex &subg_index) const = 0; + + IExecutor *entryExecutor() const { return at(ir::ModelIndex{0}, ir::SubgraphIndex{0}); } + + /** + * @brief Return executor set's number of input + * @return Number of input + */ + virtual uint32_t inputSize() const = 0; + + /** + * @brief Return executor set's number of output + * @return Number of output + */ + virtual uint32_t outputSize() const = 0; + + /** + * @brief Return NN package input tensor info + * @param[in] index Input index + * @return Tensor info + */ + virtual const ir::OperandInfo &inputInfo(const ir::IOIndex &index) const = 0; + + /** + * @brief Return NN package output tensor info + * @param[in] index Output index + * @return Tensor info + */ + virtual const ir::OperandInfo &outputInfo(const ir::IOIndex &index) const = 0; + + /** + * @brief Execute NN package executor set + * @param[in] desc Input and output buffer description + */ + virtual void execute(const IODescription &desc) = 0; +}; + +} // namespace exec +} // namespace onert + +#endif // __ONERT_EXEC_I_EXECUTORS_H__ diff --git a/runtime/onert/core/include/ir/Graph.h b/runtime/onert/core/include/ir/Graph.h index 286caf72f..1783cdca0 100644 --- a/runtime/onert/core/include/ir/Graph.h +++ b/runtime/onert/core/include/ir/Graph.h @@ -89,15 +89,6 @@ public: void verify(void); void removeOperand(const OperandIndex &ind) { _operands.remove(ind); } void setLayout(Layout layout) { _layout = layout; } - void setPartialModel(const std::shared_ptr &partial_model) - { - _partialgraphs = partial_model; - } - void - setTensorName(std::shared_ptr> &tensor_names) - { - _tensor_names = tensor_names; - } private: bool checkOperandsForOperation(const Operation &operation); @@ -136,29 +127,6 @@ public: const Operations &operations() const { return _operations; } Operations &operations() { return _operations; } Layout layout() const { return _layout; } - std::shared_ptr &partialgraphs() { return _partialgraphs; } - std::shared_ptr> &tensor_names() - { - return _tensor_names; - } - std::unordered_map::const_iterator _name_to_input_begin() const - { - return _name_to_input.begin(); - } - std::unordered_map::const_iterator _name_to_input_end() const - { - return _name_to_input.end(); - } - std::unordered_map::const_iterator _name_to_output_begin() const - { - return _name_to_output.begin(); - } - std::unordered_map::const_iterator _name_to_output_end() const - { - return _name_to_output.end(); - } - void input_sort() { _inputs.sort(); } - void output_sort() { 
_outputs.sort(); } // Topological sort public: @@ -173,10 +141,6 @@ private: std::unordered_map _name_to_output; // TFLite and circle's default layout is NHWC; Layout _layout{Layout::NHWC}; - - // model for partial graphs - std::shared_ptr _partialgraphs; - std::shared_ptr> _tensor_names; }; } // namespace ir diff --git a/runtime/onert/core/include/ir/Index.h b/runtime/onert/core/include/ir/Index.h index f01a4c84d..1864c3bdb 100644 --- a/runtime/onert/core/include/ir/Index.h +++ b/runtime/onert/core/include/ir/Index.h @@ -36,10 +36,10 @@ struct IOIndexTag; using IOIndex = ::onert::util::Index; struct SubgraphIndexTag; -using SubgraphIndex = ::onert::util::Index; +using SubgraphIndex = ::onert::util::Index; struct ModelIndexTag; -using ModelIndex = ::onert::util::Index; +using ModelIndex = ::onert::util::Index; template std::ostream &_index_print_impl(std::ostream &o, const std::string &prefix, IndexType index) diff --git a/runtime/onert/core/include/ir/NNPkg.h b/runtime/onert/core/include/ir/NNPkg.h index d9f825e85..b23745d55 100644 --- a/runtime/onert/core/include/ir/NNPkg.h +++ b/runtime/onert/core/include/ir/NNPkg.h @@ -21,6 +21,7 @@ #include #include +#include "ir/Graph.h" #include "ir/Index.h" #include "ir/Model.h" @@ -89,7 +90,7 @@ public: ~NNPkg() = default; NNPkg(std::shared_ptr model) { _models[ModelIndex{0}] = model; } - std::shared_ptr primary_model() { return _models.at(onert::ir::ModelIndex{0}); } + std::shared_ptr primary_model() const { return _models.at(onert::ir::ModelIndex{0}); } /** * @brief Put model at index @@ -180,6 +181,91 @@ public: */ const ModelEdges &model_edges() { return _edges; } + /** + * @brief Verify NNPkg + * + */ + void verify(void) + { + // Verify edges information + // + // Only duplicates of nnpkg output and Edge `from` are possible. + // | Whether duplicates are possible | Edge `to` | Edge `from` | + // | nnpkg input (input of subgraph) | X (*1) | X (*2) | + // | nnpkg output (output of subgraph) | X (*2) | O | + // *1. The subjects who determine values of each buffer are different. + // - nnpkg input : user input + // - Edge `to` : output of another subgraph + // *2. `IOIndex` of inputs and outputs of subgraph is distinct. + // + for (const auto &edge : _edges.edges) + { + if (std::find(_edges.pkg_inputs.begin(), _edges.pkg_inputs.end(), edge.to) != + _edges.pkg_inputs.end()) + { + throw std::runtime_error{ + "Invalid edge information. NNPkg inputs and Edge `to` cannot be duplicated"}; + } + } + } + + // TODO Find better way to handle single model NNPackage and multi model NNPackage on inputSize(), + // outputSize(), inputInfo(), outputInfo() + + /** + * @brief Get model input size + */ + uint32_t inputSize() const + { + return _models.size() == 1 ? primary_model()->primary_subgraph()->getInputs().size() + : _edges.pkg_inputs.size(); + } + + /** + * @brief Get model output size + */ + uint32_t outputSize() const + { + return _models.size() == 1 ? 
primary_model()->primary_subgraph()->getOutputs().size() + : _edges.pkg_outputs.size(); + } + + /** + * @brief Get model input info + */ + OperandInfo &inputInfo(uint32_t index) const + { + if (_models.size() == 1) + { + auto const graph = primary_model()->primary_subgraph(); + auto const operand_index = graph->getInputs().at(index); + return graph->operands().at(operand_index).info(); + } + + auto const &desc = input(index); + auto const graph = model(std::get(desc))->primary_subgraph(); + auto const operand_index = graph->getInputs().at(std::get(desc).value()); + return graph->operands().at(operand_index).info(); + } + + /** + * @brief Get model output info + */ + OperandInfo &outputInfo(uint32_t index) const + { + if (_models.size() == 1) + { + auto const graph = primary_model()->primary_subgraph(); + auto const operand_index = graph->getOutputs().at(index); + return graph->operands().at(operand_index).info(); + } + + auto const &desc = output(index); + auto const graph = model(std::get(desc))->primary_subgraph(); + auto const operand_index = graph->getOutputs().at(std::get(desc).value()); + return graph->operands().at(operand_index).info(); + } + // TODO: Add iterate() or getter for edges private: @@ -190,4 +276,18 @@ private: } // namespace ir } // namespace onert +namespace std +{ + +template <> struct hash +{ + size_t operator()(const ::onert::ir::IODesc &iodesc) const noexcept + { + return (std::get<0>(iodesc).value() << 24) | (std::get<1>(iodesc).value() << 16) | + std::get<2>(iodesc).value(); + } +}; + +} // namespace std + #endif // __ONERT_IR_NNPKG_H__ diff --git a/runtime/onert/core/include/ir/OperandIndexSequence.h b/runtime/onert/core/include/ir/OperandIndexSequence.h index dd390748b..846c3f950 100644 --- a/runtime/onert/core/include/ir/OperandIndexSequence.h +++ b/runtime/onert/core/include/ir/OperandIndexSequence.h @@ -19,7 +19,6 @@ #include #include -#include #include "ir/Index.h" @@ -46,12 +45,6 @@ public: void append(const OperandIndex &index) { _vec.emplace_back(index); } void append(const OperandIndexSequence &l) { _vec.insert(_vec.end(), l.begin(), l.end()); } - void sort() - { - std::sort(_vec.begin(), _vec.end(), - [](const auto &lhs, const auto &rhs) { return lhs.value() < rhs.value(); }); - } - public: uint32_t size() const { return static_cast(_vec.size()); } const OperandIndex &at(IOIndex set_index) const { return _vec.at(set_index.value()); } diff --git a/runtime/onert/core/include/ir/Shape.h b/runtime/onert/core/include/ir/Shape.h index ec6dd07af..cf84e2626 100644 --- a/runtime/onert/core/include/ir/Shape.h +++ b/runtime/onert/core/include/ir/Shape.h @@ -70,8 +70,8 @@ struct FeatureShape struct Shape { public: - static int32_t const UNSPECIFIED_DIM; - static int32_t const MAX_RANK; + static int32_t const kUnspecifiedDim; + static int32_t const kMaxRank; Shape() = default; @@ -126,7 +126,7 @@ public: */ bool hasUnspecifiedDims() const { - return (std::find(_dimensions.begin(), _dimensions.end(), UNSPECIFIED_DIM) != + return (std::find(_dimensions.begin(), _dimensions.end(), kUnspecifiedDim) != _dimensions.end()); } diff --git a/runtime/onert/core/include/util/Config.lst b/runtime/onert/core/include/util/Config.lst index 4bbc02ac3..b9bad1b59 100644 --- a/runtime/onert/core/include/util/Config.lst +++ b/runtime/onert/core/include/util/Config.lst @@ -23,7 +23,6 @@ CONFIG(GRAPH_DOT_DUMP , int , "0") CONFIG(BACKENDS , std::string , "cpu;acl_cl;acl_neon;ruy;xnnpack;gpu_cl;trix;bcq") // FIXME Remove bcq CONFIG(OP_BACKEND_ALLOPS , std::string , "") 
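The std::hash specialization for ir::IODesc added in NNPkg.h above packs the three index values of an IO descriptor into a single size_t. A standalone sketch of that bit layout, using plain integers rather than onert's index types (the function name and parameter types here are illustrative only):

#include <cstddef>
#include <cstdint>

// Same layout as the hash above: the first index lands in bits 24 and up,
// the second in bits 16..23, and the third in the low bits.
inline std::size_t packIODesc(std::uint32_t model, std::uint32_t subg, std::uint32_t io)
{
  return (static_cast<std::size_t>(model) << 24) | (static_cast<std::size_t>(subg) << 16) | io;
}

// Caveat: a middle index above 255 or a low index of 65536 or more spills into the
// neighbouring field, so such descriptors can produce hash collisions.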
CONFIG(OP_BACKEND_MAP , std::string , "") -CONFIG(DISABLE_COMPILE , bool , "0") CONFIG(ONERT_LOG_ENABLE , bool , "0") CONFIG(CPU_MEMORY_PLANNER , std::string , "WIC") CONFIG(EXECUTOR , std::string , "Linear") diff --git a/runtime/onert/core/include/util/Index.h b/runtime/onert/core/include/util/Index.h index d3f3dcb46..49c5f4c6d 100644 --- a/runtime/onert/core/include/util/Index.h +++ b/runtime/onert/core/include/util/Index.h @@ -138,6 +138,13 @@ public: */ T value() const { return _index; } + /** + * @brief Return max index value + * + * @return Maximum valid index value + */ + static T max() { return UNDEFINED - 1; } + private: T _index; }; diff --git a/runtime/onert/core/include/util/ObjectManager.h b/runtime/onert/core/include/util/ObjectManager.h index 36b6c85c8..077a4c2ef 100644 --- a/runtime/onert/core/include/util/ObjectManager.h +++ b/runtime/onert/core/include/util/ObjectManager.h @@ -202,12 +202,12 @@ public: // This implementation is a workaround in case of adding operands while iteration std::list l; - for (auto &e : _objects) + for (const auto &e : _objects) { l.push_back(e.first); } - for (auto &index : l) + for (const auto &index : l) { fn(index, *_objects[index]); } diff --git a/runtime/onert/core/include/util/Utils.h b/runtime/onert/core/include/util/Utils.h index 8a4eea32b..505f5a9b3 100644 --- a/runtime/onert/core/include/util/Utils.h +++ b/runtime/onert/core/include/util/Utils.h @@ -29,9 +29,9 @@ template struct ForEachDimension { - template + template static void unroll(const onert::ir::Shape &shape, onert::ir::Coordinates &coords, - L &&lambda_function, Args &&... args) + L lambda_function) { static_assert(from < to, "from must not be less than to"); assert(static_cast(to) <= shape.rank()); @@ -40,8 +40,7 @@ template struct ForEachDimensio for (auto v = 0; v < d; v++) { coords.set(from, v); - ForEachDimension::unroll(shape, coords, std::forward(lambda_function), - std::forward(args)...); + ForEachDimension::unroll(shape, coords, lambda_function); } } }; @@ -49,18 +48,17 @@ template struct ForEachDimensio template struct ForEachDimension::type> { - template + template static void unroll(const onert::ir::Shape &shape, onert::ir::Coordinates &coords, - L &&lambda_function, Args &&... args) + L lambda_function) { UNUSED_RELEASE(shape); assert(static_cast(to) <= shape.rank()); - lambda_function(coords, std::forward(args)...); + lambda_function(coords); } }; -template -inline void ShapeLoop(const onert::ir::Shape &shape, L &&lambda_function, Args &&... 
args) +template inline void ShapeLoop(const onert::ir::Shape &shape, L lambda_function) { assert(shape.rank() > 0); for (auto i = 0; i < shape.rank(); ++i) @@ -73,32 +71,25 @@ inline void ShapeLoop(const onert::ir::Shape &shape, L &&lambda_function, Args & { case 0: coords.set(0, 0); - ForEachDimension<0, 0>::unroll(shape, coords, std::forward(lambda_function), - std::forward(args)...); + ForEachDimension<0, 0>::unroll(shape, coords, lambda_function); break; case 1: - ForEachDimension<0, 1>::unroll(shape, coords, std::forward(lambda_function), - std::forward(args)...); + ForEachDimension<0, 1>::unroll(shape, coords, lambda_function); break; case 2: - ForEachDimension<0, 2>::unroll(shape, coords, std::forward(lambda_function), - std::forward(args)...); + ForEachDimension<0, 2>::unroll(shape, coords, lambda_function); break; case 3: - ForEachDimension<0, 3>::unroll(shape, coords, std::forward(lambda_function), - std::forward(args)...); + ForEachDimension<0, 3>::unroll(shape, coords, lambda_function); break; case 4: - ForEachDimension<0, 4>::unroll(shape, coords, std::forward(lambda_function), - std::forward(args)...); + ForEachDimension<0, 4>::unroll(shape, coords, lambda_function); break; case 5: - ForEachDimension<0, 5>::unroll(shape, coords, std::forward(lambda_function), - std::forward(args)...); + ForEachDimension<0, 5>::unroll(shape, coords, lambda_function); break; case 6: - ForEachDimension<0, 6>::unroll(shape, coords, std::forward(lambda_function), - std::forward(args)...); + ForEachDimension<0, 6>::unroll(shape, coords, lambda_function); break; default: assert(false && "ShapeLoop, 1 <= Shape'rank <= 6"); diff --git a/runtime/onert/core/src/backend/basic/MemoryManager.cc b/runtime/onert/core/src/backend/basic/MemoryManager.cc index c468ee458..05fd9cc77 100644 --- a/runtime/onert/core/src/backend/basic/MemoryManager.cc +++ b/runtime/onert/core/src/backend/basic/MemoryManager.cc @@ -94,7 +94,7 @@ void DynamicMemoryManager::deallocate(const ITensor *tensor) void DynamicMemoryManager::deallocate(void) { - for (auto &mem_alloc : _mem_alloc_map) + for (auto &&mem_alloc : _mem_alloc_map) { // Release memory buffer of mem_alloc mem_alloc.second->release(); diff --git a/runtime/onert/core/src/backend/basic/MemoryPlanner.cc b/runtime/onert/core/src/backend/basic/MemoryPlanner.cc index 1fda57b3d..1c048043c 100644 --- a/runtime/onert/core/src/backend/basic/MemoryPlanner.cc +++ b/runtime/onert/core/src/backend/basic/MemoryPlanner.cc @@ -58,7 +58,7 @@ void FirstFitPlanner::claim(const ir::OperandIndex &ind, size_t size) { // Find the right position for claiming uint32_t next_offset = 0; - for (auto &mem_claim : _claim_table) + for (const auto &mem_claim : _claim_table) { auto claimed_base_offset = mem_claim.first; auto claimed_size = _mem_plans[mem_claim.second].size; diff --git a/runtime/onert/core/src/backend/basic/StaticTensorManager.cc b/runtime/onert/core/src/backend/basic/StaticTensorManager.cc index d891814fa..b03eb607c 100644 --- a/runtime/onert/core/src/backend/basic/StaticTensorManager.cc +++ b/runtime/onert/core/src/backend/basic/StaticTensorManager.cc @@ -39,7 +39,7 @@ void StaticTensorManager::allocateNonconsts(void) { _nonconst_mgr->allocate(); - for (auto &pair : _tensors->native_tensors()) + for (auto &&pair : _tensors->native_tensors()) { const auto &ind = pair.first; auto tensor = pair.second.get(); diff --git a/runtime/onert/core/src/backend/builtin/BackendContext.cc b/runtime/onert/core/src/backend/builtin/BackendContext.cc index 8a6cddcfb..c1a2ed537 100644 --- 
a/runtime/onert/core/src/backend/builtin/BackendContext.cc +++ b/runtime/onert/core/src/backend/builtin/BackendContext.cc @@ -44,7 +44,7 @@ FunctionMap BackendContext::genKernels() const_cast(graph())->operands().iterate( [&](const ir::OperandIndex &, ir::Operand &obj) { obj.releaseData(); }); - for (auto &it : ret) + for (auto &&it : ret) { auto &fn_seq = it.second; fn_seq->iterate([&](exec::IFunction &ifunc) { ifunc.prepare(); }); diff --git a/runtime/onert/core/src/backend/builtin/IOTensor.h b/runtime/onert/core/src/backend/builtin/IOTensor.h index a1b2064a1..d94ed0bca 100644 --- a/runtime/onert/core/src/backend/builtin/IOTensor.h +++ b/runtime/onert/core/src/backend/builtin/IOTensor.h @@ -47,7 +47,7 @@ public: public: void setTensor(IPortableTensor *tensor); void setUserTensor(uint8_t *buffer, size_t size); - ir::OperandInfo orig_info() const { return _orig_info; } + const ir::OperandInfo &orig_info() const { return _orig_info; } ir::Layout orig_layout() const { return _orig_layout; } public: diff --git a/runtime/onert/core/src/backend/builtin/KernelGenerator.cc b/runtime/onert/core/src/backend/builtin/KernelGenerator.cc index fa2fc0b94..4533703a6 100644 --- a/runtime/onert/core/src/backend/builtin/KernelGenerator.cc +++ b/runtime/onert/core/src/backend/builtin/KernelGenerator.cc @@ -33,8 +33,8 @@ KernelGenerator::KernelGenerator(const ir::Graph &graph, DynamicTensorManager *d const std::shared_ptr &tensor_reg, const std::shared_ptr &external_context) : basic::KernelGeneratorBase{graph}, _dyn_tensor_manager{dyn_tensor_manager}, - _tensor_reg{tensor_reg}, _tensor_registries{}, _executors{nullptr}, _external_context{ - external_context} + _tensor_reg{tensor_reg}, _tensor_registries{}, _executors{nullptr}, _model_index{}, + _external_context{external_context} { UNUSED_RELEASE(_graph); UNUSED_RELEASE(_tensor_registries); @@ -90,7 +90,7 @@ void KernelGenerator::visit(const ir::operation::If &node) input_tensors.erase(input_tensors.begin()); auto fn = std::make_unique<::onert::backend::builtin::kernel::IfLayer>( cond_tensor, input_tensors, output_tensors, then_subg_index, else_subg_index, _executors, - _external_context); + _model_index, _external_context); _return_fn = std::move(fn); } @@ -133,7 +133,7 @@ void KernelGenerator::visit(const ir::operation::While &node) // WhileLayer just set Executors instead of cond and body executor to avoid complexity of // creating executor recusively auto fn = std::make_unique<::onert::backend::builtin::kernel::WhileLayer>( - input_tensors, output_tensors, cond_subg_index, body_subg_index, _executors, + input_tensors, output_tensors, cond_subg_index, body_subg_index, _executors, _model_index, _dyn_tensor_manager->dynamic_mem_mgr().get(), _external_context); _return_fn = std::move(fn); diff --git a/runtime/onert/core/src/backend/builtin/KernelGenerator.h b/runtime/onert/core/src/backend/builtin/KernelGenerator.h index d5931ca26..3c86fe306 100644 --- a/runtime/onert/core/src/backend/builtin/KernelGenerator.h +++ b/runtime/onert/core/src/backend/builtin/KernelGenerator.h @@ -23,7 +23,7 @@ #include "../../compiler/TensorRegistries.h" #include "backend/basic/KernelGeneratorBase.h" -#include "exec/Executors.h" +#include "exec/IExecutors.h" #include "ir/Graph.h" namespace onert @@ -44,12 +44,14 @@ public: { _tensor_registries = tensor_registries; } - void setExecutors(const std::shared_ptr &executors) + void setExecutors(const std::shared_ptr &executors) { // FIXME Using shared_ptr's raw pointer! 
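With this change, the builtin kernel generator and the If/While kernels look up a child subgraph executor by both model index and subgraph index instead of by subgraph index alone. A minimal sketch of that lookup pattern, using the include paths that appear in this diff:

#include "exec/IExecutors.h"
#include "ir/Index.h"

// Resolve the executor of a child subgraph inside a possibly multi-model package.
inline onert::exec::IExecutor *lookupSubgraphExecutor(onert::exec::IExecutors *executors,
                                                      const onert::ir::ModelIndex &model_index,
                                                      const onert::ir::SubgraphIndex &subg_index)
{
  // IExecutors::at() is keyed by (model, subgraph); IfLayer and WhileLayer use the same call.
  return executors->at(model_index, subg_index);
}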
_executors = executors.get(); } + void setModelIndex(const ir::ModelIndex &index) { _model_index = index; } + std::unique_ptr generate(ir::OperationIndex ind) override; private: @@ -65,7 +67,8 @@ private: DynamicTensorManager *_dyn_tensor_manager; std::shared_ptr _tensor_reg; compiler::TensorRegistries _tensor_registries; - exec::Executors *_executors; + exec::IExecutors *_executors; + ir::ModelIndex _model_index; const std::shared_ptr _external_context; }; diff --git a/runtime/onert/core/src/backend/builtin/kernel/IfLayer.cc b/runtime/onert/core/src/backend/builtin/kernel/IfLayer.cc index cdb41960a..51bc5a8f2 100644 --- a/runtime/onert/core/src/backend/builtin/kernel/IfLayer.cc +++ b/runtime/onert/core/src/backend/builtin/kernel/IfLayer.cc @@ -29,11 +29,11 @@ IfLayer::IfLayer(backend::IPortableTensor *cond_tensor, const std::vector input_tensors, const std::vector output_tensors, const ir::SubgraphIndex &then_subg_index, const ir::SubgraphIndex &else_subg_index, - exec::Executors *executors, + exec::IExecutors *executors, const ir::ModelIndex &model_index, const std::shared_ptr &external_context) : _cond_tensor{cond_tensor}, _input_tensors{input_tensors}, _output_tensors{output_tensors}, _then_subg_index{then_subg_index}, _else_subg_index{else_subg_index}, _executors{executors}, - _external_context{external_context} + _model_index{model_index}, _external_context{external_context} { // At this point, executors may not have executors of then subg and else subg } @@ -61,12 +61,12 @@ void IfLayer::run() if (cond_result) { VERBOSE(If) << "Call to $" << _then_subg_index << " (then)" << std::endl; - subg_exec = _executors->at(_then_subg_index).get(); + subg_exec = _executors->at(_model_index, _then_subg_index); } else { VERBOSE(If) << "Call to $" << _else_subg_index << " (else)" << std::endl; - subg_exec = _executors->at(_else_subg_index).get(); + subg_exec = _executors->at(_model_index, _else_subg_index); } subg_exec->execute(_input_tensors, _output_tensors); diff --git a/runtime/onert/core/src/backend/builtin/kernel/IfLayer.h b/runtime/onert/core/src/backend/builtin/kernel/IfLayer.h index fa5537a67..8f639ced9 100644 --- a/runtime/onert/core/src/backend/builtin/kernel/IfLayer.h +++ b/runtime/onert/core/src/backend/builtin/kernel/IfLayer.h @@ -18,7 +18,7 @@ #define __ONERT_BACKEND_BUILTIN_KERNEL_IF_LAYER_H__ #include -#include +#include #include "../ExternalContext.h" namespace onert @@ -37,7 +37,8 @@ public: const std::vector input_tensors, const std::vector output_tensors, const ir::SubgraphIndex &then_subg_index, const ir::SubgraphIndex &else_subg_index, - exec::Executors *executors, const std::shared_ptr &external_context); + exec::IExecutors *executors, const ir::ModelIndex &model_index, + const std::shared_ptr &external_context); public: void run() override; @@ -48,7 +49,8 @@ private: const std::vector _output_tensors; const ir::SubgraphIndex _then_subg_index; const ir::SubgraphIndex _else_subg_index; - exec::Executors *_executors; + exec::IExecutors *_executors; + ir::ModelIndex _model_index; const std::shared_ptr _external_context; }; diff --git a/runtime/onert/core/src/backend/builtin/kernel/PermuteLayer.cc b/runtime/onert/core/src/backend/builtin/kernel/PermuteLayer.cc index ddaecdf57..600180077 100644 --- a/runtime/onert/core/src/backend/builtin/kernel/PermuteLayer.cc +++ b/runtime/onert/core/src/backend/builtin/kernel/PermuteLayer.cc @@ -64,7 +64,7 @@ void PermuteLayer::optimize() src_offsets_it->resize(0); dst_offsets_it->resize(0); if (underlying_type(src->data_type()) != 
underlying_type(dst->data_type())) - throw std::runtime_error("data type does not match"); + continue; const auto permute_type = [&]() -> PermuteType { if (src->getShape().rank() == 4 && src->layout() == ir::Layout::NHWC && dst->layout() == ir::Layout::NCHW) @@ -81,6 +81,8 @@ void PermuteLayer::optimize() return PermuteType::COPY; } }(); + + // TODO Support different types auto fn = [&](backend::ITensor &src_tensor) { dst->access([&](backend::ITensor &dst_tensor) { // NOTE The buffer of both tensor can be nullptr in this step @@ -260,8 +262,10 @@ void PermuteLayer::run() // 1. The tasks for multithreathing was created // 2. The tasks's size > 1 // 3. Both tensors are not dynamic + // 4. Data types of both tensors are different if (_tasks_map.find(src) == _tasks_map.end() || _tasks_map.at(src).size() == 1 || - src->is_dynamic() || dst->is_dynamic()) + src->is_dynamic() || dst->is_dynamic() || + underlying_type(src->data_type()) != underlying_type(dst->data_type())) { permute(src, dst, src->getShape().rank(), src_offsets, dst_offsets); } diff --git a/runtime/onert/core/src/backend/builtin/kernel/WhileLayer.cc b/runtime/onert/core/src/backend/builtin/kernel/WhileLayer.cc index 8e006c5ea..c0ca4046c 100644 --- a/runtime/onert/core/src/backend/builtin/kernel/WhileLayer.cc +++ b/runtime/onert/core/src/backend/builtin/kernel/WhileLayer.cc @@ -35,12 +35,14 @@ namespace kernel WhileLayer::WhileLayer(const std::vector input_tensors, const std::vector output_tensors, const ir::SubgraphIndex &cond_subg_index, - const ir::SubgraphIndex &body_subg_index, exec::Executors *executors, + const ir::SubgraphIndex &body_subg_index, exec::IExecutors *executors, + const ir::ModelIndex &model_index, basic::DynamicMemoryManager *dyn_memory_manager, const std::shared_ptr &external_context) : _cond_subg_index{cond_subg_index}, _body_subg_index{body_subg_index}, _input_tensors{input_tensors}, _output_tensors{output_tensors}, _executors{executors}, - _dyn_memory_manager{dyn_memory_manager}, _external_context{external_context} + _model_index{model_index}, _dyn_memory_manager{dyn_memory_manager}, _external_context{ + external_context} { // At this point, executors may not have executors of cond subg and body subg } @@ -57,8 +59,8 @@ void WhileLayer::run() // // Run cond subg // If there is no loop copy "_input_tensors" -> "_dst_tensors", else copy "cond subg inputs" -> // "_dst_tensors" - auto cond_exec = _executors->at(_cond_subg_index).get(); - auto body_exec = _executors->at(_body_subg_index).get(); + auto cond_exec = _executors->at(_model_index, _cond_subg_index); + auto body_exec = _executors->at(_model_index, _body_subg_index); // Need a temp tensor to hold the cond subgraph output assert(cond_exec->getOutputTensors().size() == 1); diff --git a/runtime/onert/core/src/backend/builtin/kernel/WhileLayer.h b/runtime/onert/core/src/backend/builtin/kernel/WhileLayer.h index 8551b3d09..40ca4fe23 100644 --- a/runtime/onert/core/src/backend/builtin/kernel/WhileLayer.h +++ b/runtime/onert/core/src/backend/builtin/kernel/WhileLayer.h @@ -18,7 +18,7 @@ #define __ONERT_BACKEND_BUILTIN_KERNEL_WHILE_LAYER_H__ #include -#include +#include #include #include #include @@ -41,7 +41,8 @@ public: WhileLayer(const std::vector input_tensors, const std::vector output_tensors, const ir::SubgraphIndex &cond_subg_index, const ir::SubgraphIndex &body_subg_index, - exec::Executors *executors, basic::DynamicMemoryManager *dyn_memory_manager, + exec::IExecutors *executors, const ir::ModelIndex &model_index, + basic::DynamicMemoryManager 
*dyn_memory_manager, const std::shared_ptr &external_context); public: @@ -52,7 +53,8 @@ private: const ir::SubgraphIndex _body_subg_index; const std::vector _input_tensors; const std::vector _output_tensors; - exec::Executors *_executors; + exec::IExecutors *_executors; + const ir::ModelIndex _model_index; basic::DynamicMemoryManager *_dyn_memory_manager; // For generating temp tensors const std::shared_ptr _external_context; }; diff --git a/runtime/onert/core/src/compiler/Compiler.cc b/runtime/onert/core/src/compiler/Compiler.cc index 7be9c1e3b..45124556b 100644 --- a/runtime/onert/core/src/compiler/Compiler.cc +++ b/runtime/onert/core/src/compiler/Compiler.cc @@ -22,543 +22,96 @@ #include "pass/OddOutputPass.h" #include "pass/PassRunner.h" #include "pass/UnusedOperandEliminationPass.h" -#include "../backend/builtin/Config.h" #include "../dumper/dot/DotDumper.h" -#include "../interp/InterpExecutor.h" -#include "../ir/OperationCloner.h" +#include "../exec/SingleModelExecutors.h" #include "../ir/OperationDumper.h" #include "../ir/verifier/Verifier.h" #include "compiler/StaticShapeInferer.h" -#include "util/ConfigSource.h" -#include "util/logging.h" -#include #include -#include - -// TODO Remove using fstream header -#include - -namespace -{ - -using namespace onert; - -std::string getOpBackends(std::unordered_map &opcode_to_backend) -{ - std::unordered_map::iterator it; - std::string opbackends; - - for (it = opcode_to_backend.begin(); it != opcode_to_backend.end(); ++it) - { - if (!opbackends.empty()) - opbackends = opbackends + ", "; - - auto opcode = it->first; - const std::string opname = ir::toString(opcode); - opbackends += opname + "=" + it->second; - } - return opbackends; -} - -void verboseOptions(compiler::CompilerOptions &options) -{ - VERBOSE(Compiler) << std::boolalpha << "==== Compiler Options ====" << std::endl; - VERBOSE(Compiler) << "backend_list : " - << nnfw::misc::join(options.backend_list.begin(), options.backend_list.end(), - "/") - << std::endl; - VERBOSE(Compiler) << "trace_filepath : " << options.trace_filepath << std::endl; - VERBOSE(Compiler) << "graph_dump_level : " << options.graph_dump_level << std::endl; - VERBOSE(Compiler) << "executor : " << options.executor << std::endl; - VERBOSE(Compiler) << "manual backend_for_all : " - << options.manual_scheduler_options.backend_for_all << std::endl; - VERBOSE(Compiler) << "manual_scheduler_options : " - << getOpBackends(options.manual_scheduler_options.opcode_to_backend) - << std::endl; - VERBOSE(Compiler) << "he_scheduler : " << options.he_scheduler << std::endl; - VERBOSE(Compiler) << "he_profiling_mode : " << options.he_profiling_mode << std::endl; - VERBOSE(Compiler) << "disable_compile : " << options.disable_compile << std::endl; - VERBOSE(Compiler) << "fp16_enable : " << options.fp16_enable << std::endl - << std::noboolalpha; -} - -std::unordered_map> -createStaticShapeInferers( - const std::unordered_map> - &lowered_subgs) -{ - // Allocate StaticShapeInferer per each subgraph - std::unordered_map> inferers; - for (auto &pair : lowered_subgs) - { - const auto &subg_index = pair.first; - auto &lowered_subg = pair.second; - inferers[subg_index] = std::make_unique(lowered_subg.get()); - } - - // Append observers in all StaticShapeInferers - for (auto &pair : lowered_subgs) - { - const auto &subg_index = pair.first; - auto &lowered_subg = pair.second; - - // TODO: Change this iteration for all to controlflow iteration - lowered_subg->graph().operations().iterate([&](const ir::OperationIndex &, - const ir::Operation 
&op) { - // A Function to append child inferers. These make it possible for a StaticShapeInferer to - // call StaticShapeInferes of child subgraphs recursively - auto appendChildInferer = [&](const ir::SubgraphIndex &child_subg_idx) { - auto *child_inferer = inferers.at(child_subg_idx).get(); - inferers.at(subg_index)->appendChildInferer(child_subg_idx, child_inferer); - }; - - // A Function to appaend subg input observers. This makes it possible for a StaticShapeInferer - // to update inputs of child subgraphs - auto appendSubgraphInputObserver = [&](const ir::SubgraphIndex &child_subg_idx) { - std::vector child_subg_inputs; - auto &child_subg = lowered_subgs.at(child_subg_idx)->graph(); - for (const auto &input_idx : child_subg.getInputs()) - { - auto operand_ptr = child_subg.operands().getRawPtr(input_idx); - child_subg_inputs.emplace_back(operand_ptr); - } - inferers.at(subg_index) - ->appendSubgInputObserver(child_subg_idx, - std::make_unique(child_subg_inputs)); - }; - - // A Function to set controlflow output observers. This makes it possible for a - // StaticShapeInferer to update outputs of parent controlflow opeerations - auto setControlFlowOutputObserver = [&](const ir::SubgraphIndex &child_subg_idx) { - std::vector cf_outputs; - auto &subg = lowered_subg->graph(); - for (const auto &output_idx : op.getOutputs()) - { - auto operand_ptr = subg.operands().getRawPtr(output_idx); - cf_outputs.emplace_back(operand_ptr); - } - inferers.at(child_subg_idx) - ->setControlflowOutputObserver(std::make_unique(cf_outputs)); - }; - - // Append Observers in a StaticShapeInferer - if (op.opcode() == ir::OpCode::If) - { - const auto &if_op = nnfw::misc::polymorphic_downcast(op); - - appendChildInferer(if_op.param().then_subg_index); - appendChildInferer(if_op.param().else_subg_index); - - appendSubgraphInputObserver(if_op.param().then_subg_index); - appendSubgraphInputObserver(if_op.param().else_subg_index); - - setControlFlowOutputObserver(if_op.param().then_subg_index); - } - else if (op.opcode() == ir::OpCode::While) - { - const auto &while_op = nnfw::misc::polymorphic_downcast(op); - - appendChildInferer(while_op.param().cond_subg_index); - appendChildInferer(while_op.param().body_subg_index); - - appendSubgraphInputObserver(while_op.param().cond_subg_index); - appendSubgraphInputObserver(while_op.param().body_subg_index); - - setControlFlowOutputObserver(while_op.param().body_subg_index); - } - }); - } - - return inferers; -} - -} // namespace namespace onert { - namespace compiler { -void ManualSchedulerOptions::setBackendMap(const std::string &str) -{ - // TODO Support multiple subgraphs for manual scheduling - auto key_val_list = nnfw::misc::split(str, ';'); - for (const auto &key_val_str : key_val_list) - { - if (key_val_str.empty()) - { - continue; - } - - auto key_val = nnfw::misc::split(key_val_str, '='); - const auto &key_str = key_val.at(0); - const auto &val = key_val.at(1); - auto key = static_cast(std::stoi(key_str)); - this->index_to_backend.emplace(ir::OperationIndex{key}, val); - } -} - -std::unique_ptr CompilerOptions::fromGlobalConfig() -{ - auto o = std::make_unique(); - o->backend_list = nnfw::misc::split(util::getConfigString(util::config::BACKENDS), ';'); - o->trace_filepath = util::getConfigString(util::config::TRACE_FILEPATH); - o->graph_dump_level = util::getConfigInt(util::config::GRAPH_DOT_DUMP); - o->executor = util::getConfigString(util::config::EXECUTOR); - o->he_scheduler = util::getConfigBool(util::config::USE_SCHEDULER); - o->he_profiling_mode = 
util::getConfigBool(util::config::PROFILING_MODE); - o->disable_compile = util::getConfigBool(util::config::DISABLE_COMPILE); - o->fp16_enable = util::getConfigBool(util::config::FP16_ENABLE); - { - // Backend for all - auto &ms_options = o->manual_scheduler_options; - - // Default value for op_backend_all is first element in the backend list - ms_options.backend_for_all = util::getConfigString(util::config::OP_BACKEND_ALLOPS); - -// Opcode to Backend -#define OP(OpName) \ - { \ - const auto &backend_str = util::getConfigString(util::config::OP_BACKEND_##OpName); \ - if (!backend_str.empty()) \ - { \ - ms_options.opcode_to_backend[ir::OpCode::OpName] = backend_str; \ - } \ - } -#include "ir/Operations.lst" -#undef OP - - // Index to Backend - auto map_str = util::getConfigString(util::config::OP_BACKEND_MAP); - ms_options.setBackendMap(map_str); - } - return o; -} Compiler::Compiler(const std::shared_ptr &model, CompilerOptions &copt) - : _nnpkg{std::make_shared(model)}, _state{State::CREATED}, _voptions{&copt} + : _model{model}, _options{&copt} { // DO NOTHING } Compiler::Compiler(const std::shared_ptr &nnpkg, std::vector> &copts) - : _nnpkg{nnpkg}, _state{State::CREATED}, _voptions{} + : _model{nnpkg->primary_model()}, _options{copts[0].get()} { - for (uint32_t i = 0; i < copts.size(); i++) - { - _voptions.push_back(copts[i].get()); - } -} - -void Compiler::enableToFp16() -{ - for (auto options : _voptions) - options->fp16_enable = true; -} - -void Compiler::checkProfilerConditions() -{ - if (_nnpkg->model_count() != 1) - throw std::runtime_error("NYI: Profiling mode for multiple model is not supported yet"); - - auto &options = *_voptions[0]; - - if (options.he_scheduler) - throw std::runtime_error("Heterogeneous scheduler must be enabled during profiling."); - - if (options.executor != "Dataflow") - throw std::runtime_error("Profiling mode works only with 'Dataflow' executor"); -} - -bool Compiler::buildPartialGraph(uint32_t num_graphs) -{ - // Use 1st model and options only on partial graph (pipeline) compile - assert(_nnpkg->model_count() == 1); - assert(_voptions.size() == 1); - - auto model = _nnpkg->primary_model(); - auto &options = *_voptions[0]; - - if (model->subgraphs_count() > 1) - return false; - - auto partialgraphs = std::make_shared(); - - for (uint32_t idx = 0; idx < num_graphs; idx++) - { - auto partialgraph = std::make_unique(); - partialgraphs->push(ir::SubgraphIndex{idx}, std::move(partialgraph)); - } - model->primary_subgraph()->setPartialModel(partialgraphs); - - auto partial_graph = primary_subgraph()->partialgraphs(); - - primary_subgraph()->operands().iterate( - [&](const ir::OperandIndex &operand_index, const ir::Operand &operand) { - auto use_operations = operand.getUses(); - - for (auto use_operation : use_operations) - { - auto graph_index = options.partial_graph_options.index_to_graph.find(use_operation); - if (graph_index == options.partial_graph_options.index_to_graph.end()) - { - throw std::runtime_error("Invalid Partition Map"); - } - auto partition = partial_graph->at(graph_index->second); - - if (partition->operands().exist(operand_index)) - { - continue; - } - - auto new_operand = std::make_unique(operand); - new_operand->clearDefUse(); - auto new_operand_ind = partition->addOperand(operand_index, std::move(new_operand)); - UNUSED_RELEASE(new_operand_ind); - assert(new_operand_ind == operand_index); - } - }); - - primary_subgraph()->operations().iterate( - [&](const ir::OperationIndex &operation_index, const ir::Operation &operation) { - auto 
graph_index = options.partial_graph_options.index_to_graph.find(operation_index); - if (graph_index == options.partial_graph_options.index_to_graph.end()) - { - throw std::runtime_error("Invalid Partition Map"); - } - auto partition = partial_graph->at(graph_index->second); - - auto operand_io = (operation.getInputs() + operation.getOutputs()) | ir::Remove::DUPLICATED | - ir::Remove::UNDEFINED; - for (auto operand_index : operand_io) - { - if (partition->operands().exist(operand_index)) - continue; - - const auto &operand = primary_subgraph()->operands().at(operand_index); - - auto new_operand = std::make_unique(operand); - new_operand->clearDefUse(); - - auto new_operand_index = partition->addOperand(operand_index, std::move(new_operand)); - UNUSED_RELEASE(new_operand_index); - assert(new_operand_index == operand_index); - } - - auto new_operation_index = partition->addOperation(operation_index, clone(operation)); - UNUSED_RELEASE(new_operation_index); - assert(new_operation_index == operation_index); - }); - - for (uint32_t idx = 0; idx < partial_graph->subgraphs_count(); idx++) - { - auto partition = partial_graph->at(ir::SubgraphIndex{idx}); - - partition->operands().iterate([&](const ir::OperandIndex &operand_index, - const ir::Operand &operand) { - if (primary_subgraph()->getInputs().contains(operand_index) || - (!operand.getDef().valid() && !operand.isConstant())) - { - partition->addInput(operand_index, primary_subgraph()->tensor_names()->at(operand_index)); - } - if (primary_subgraph()->getOutputs().contains(operand_index) || operand.getUses().size() == 0) - { - partition->addOutput(operand_index, primary_subgraph()->tensor_names()->at(operand_index)); - } - - if (primary_subgraph()->operands().at(operand_index).getUses().size() > 1 && - !primary_subgraph()->operands().at(operand_index).isConstant() && - !partition->getInputs().contains(operand_index)) - { - auto use_operations = primary_subgraph()->operands().at(operand_index).getUses(); - auto iter = use_operations.begin(); - ir::SubgraphIndex graph_index = - options.partial_graph_options.index_to_graph.find(*iter++)->second; - while (iter != use_operations.end()) - { - if (graph_index != options.partial_graph_options.index_to_graph.find(*iter)->second && - !partition->getOutputs().contains(operand_index)) - { - partition->addOutput(operand_index, - primary_subgraph()->tensor_names()->at(operand_index)); - } - iter++; - } - } - }); - - partition->verify(); - - bool same = true; - if (partition->getInputs().size() == primary_subgraph()->getInputs().size()) - { - for (auto iter = partition->getInputs().begin(); iter != partition->getInputs().end(); ++iter) - { - if (!primary_subgraph()->getInputs().contains(*iter)) - { - same = false; - break; - } - } - if (same == true) - { - partition->getInputs() = primary_subgraph()->getInputs(); - } - else - { - partition->input_sort(); - } - } - - same = true; - if (partition->getOutputs().size() == primary_subgraph()->getOutputs().size()) - { - for (auto iter = partition->getOutputs().begin(); iter != partition->getOutputs().end(); - ++iter) - { - if (!primary_subgraph()->getOutputs().contains(*iter)) - { - same = false; - break; - } - } - if (same == true) - { - partition->getOutputs() = primary_subgraph()->getOutputs(); - } - else - { - partition->output_sort(); - } - } - } - return true; + // Use for single model only + assert(nnpkg->model_count() == 1); } std::shared_ptr Compiler::compile(void) { - for (auto options : _voptions) - { - // Set control flow backend for control flow 
operators - auto &builtin_id = backend::builtin::Config::ID; - options->manual_scheduler_options.opcode_to_backend[ir::OpCode::If] = builtin_id; - options->manual_scheduler_options.opcode_to_backend[ir::OpCode::While] = builtin_id; - options->manual_scheduler_options.opcode_to_backend[ir::OpCode::Permute] = builtin_id; - - // FIXME This is a workaround for bcq operations, should remove it - options->manual_scheduler_options.opcode_to_backend[ir::OpCode::BCQFullyConnected] = "bcq"; - options->manual_scheduler_options.opcode_to_backend[ir::OpCode::BCQGather] = "bcq"; - - // FIXME This is a workaround for bulk operations, should remove it - options->manual_scheduler_options.opcode_to_backend[ir::OpCode::Bulk] = "trix"; - - verboseOptions(*options); - } - - // NYI: allow one model compilation - auto const model_count = _nnpkg->model_count(); - if (model_count != _voptions.size()) - throw std::runtime_error{"Model count and option vector size mismatch"}; - - for (uint32_t i = 0; i < model_count; i++) - { - _nnpkg->model(ir::ModelIndex{i})->iterate([&](const ir::SubgraphIndex &, ir::Graph &subg) { - // Mandatory passes - pass::PassRunner{} - .append(std::make_unique(subg)) - .append(std::make_unique(subg)) - .run(); - - // Optimizations - pass::PassRunner{}.append(std::make_unique(subg)).run(); - }); - } - /*************************************************** * Prepare compilation phase ***************************************************/ - // Compilable check - // TODO: Support hybrid execution - - // execution between interpreter and compiled executor (including control flow) - if (_voptions[0]->disable_compile) - { - if (model_count > 1) - throw std::runtime_error{"NYI: Disable compilation for multi model is not supported yet"}; + if (!_options) + throw std::runtime_error{"Empty compile option"}; - auto executors = std::make_shared(); + // Mode check + // TODO handle option for each model + if (_options->he_profiling_mode) + { + if (!_options->he_scheduler) + throw std::runtime_error("Heterogeneous scheduler must be enabled during profiling."); - _nnpkg->primary_model()->iterate([&](const ir::SubgraphIndex &index, ir::Graph &subg) { - executors->emplace(index, std::make_unique(subg)); - }); - _state = State::COMPILED; - return std::make_shared(executors, nullptr); + if (_options->executor != "Dataflow") + throw std::runtime_error("Profiling mode works only with 'Dataflow' executor"); } - // Mode check - // TODO handle option for each model - if (_voptions[0]->he_profiling_mode) - checkProfilerConditions(); + _options->forceInternalOptions(); + _options->verboseOptions(); + + _model->iterate([&](const ir::SubgraphIndex &, ir::Graph &subg) { + // Mandatory passes + pass::PassRunner{} + .append(std::make_unique(subg)) + .append(std::make_unique(subg)) + .run(); + + // Optimizations + pass::PassRunner{}.append(std::make_unique(subg)).run(); + }); /*************************************************** * Backend independent analysis & optimization phase ***************************************************/ // TODO Handle dump level for each model - auto dump_level = static_cast(_voptions[0]->graph_dump_level); + auto dump_level = static_cast(_options->graph_dump_level); onert::dumper::dot::DotDumper dot_dumper(dump_level); // Tracing context auto tracing_ctx = std::make_unique(); - // Model edge context - std::unique_ptr model_edges = nullptr; - // Lower: Assign backend std::unordered_map> lowered_subgs; - - if (model_count == 1) { - _nnpkg->primary_model()->iterate([&](const ir::SubgraphIndex &index, 
ir::Graph &subg) { - dot_dumper.dump(subg, nnfw::misc::str("before_lower_subg-", index.value())); + _model->iterate([&](const ir::SubgraphIndex &subg_index, ir::Graph &subg) { // Lower: Assign backend - lowered_subgs[index] = std::make_unique(subg, *_voptions[0]); + lowered_subgs[subg_index] = std::make_unique(subg, *_options); // Set tracing_ctx for copied graph - tracing_ctx->setSubgraphIndex(&(lowered_subgs[index]->graph()), index.value()); + if (tracing_ctx != nullptr) + tracing_ctx->setSubgraphIndex(&(lowered_subgs[subg_index]->graph()), subg_index.value()); }); } - else - { - // TODO Support tracing_ctx for multiple model - tracing_ctx = nullptr; - // Copy model edge context - model_edges = std::make_unique(_nnpkg->model_edges()); + _model.reset(); - for (uint32_t i = 0; i < model_count; i++) - { - auto model = _nnpkg->model(ir::ModelIndex{i}); - if (model->subgraphs_count() != 1) - throw std::runtime_error{"NYI: Lowering subgraphs for multiple model is not supported yet"}; - auto subg = model->primary_subgraph(); - dot_dumper.dump(*subg, nnfw::misc::str("before_lower_model-", i)); - - // For multimodel, model index is used for lowered graph index in lowered graph map - // and index type is SubgraphIndex - // TODO Find better way to represent lowered graph index for multimodel's subgraph - lowered_subgs[ir::SubgraphIndex{i}] = - std::make_unique(*model->primary_subgraph(), *_voptions[i]); - } - } - - _nnpkg.reset(); - - for (auto &pair : lowered_subgs) + for (const auto &pair : lowered_subgs) { const auto &subg_index = pair.first; - auto &lowered_subg = pair.second; - dot_dumper.dump(*lowered_subg, "after_lower_subg-" + std::to_string(subg_index.value())); + const auto &lowered_subg = pair.second; + dot_dumper.dump(*lowered_subg, nnfw::misc::str("after_lower_subg-", subg_index.value())); } // Shape inference. @@ -566,28 +119,15 @@ std::shared_ptr Compiler::compile(void) // Run the StaticShapeInfer of primary subg. 
All child StaticShapeInferers are called // recursively std::unordered_map> inferers = - createStaticShapeInferers(lowered_subgs); + StaticShapeInferer::createStaticShapeInferers(lowered_subgs); - if (model_count == 1) - { - const auto primary_subg_idx = ir::SubgraphIndex{0}; - inferers.at(primary_subg_idx)->infer(); + const auto primary_subg_idx = ir::SubgraphIndex{0}; + inferers.at(primary_subg_idx)->infer(); - for (const auto &pair : inferers) - { - const auto inferer = pair.second.get(); - inferer->dump(); - } - } - else + for (const auto &pair_inferer : inferers) { - // Assume multi model has only one subgraph on each model - for (const auto &pair : inferers) - { - const auto inferer = pair.second.get(); - inferer->infer(); - inferer->dump(); - } + const auto inferer = pair_inferer.second.get(); + inferer->dump(); } } @@ -598,7 +138,7 @@ std::shared_ptr Compiler::compile(void) // - Check parameter value validation which valid value is depend on input tensor shape // - Output tensor shape validation check is needless because // static/dynamic shape inferer will make valid output shape - for (auto &pair : lowered_subgs) + for (const auto &pair : lowered_subgs) { auto &lowered_subg = pair.second; compiler::ShapeValidator{lowered_subg->graph()}(); @@ -607,240 +147,30 @@ std::shared_ptr Compiler::compile(void) /************************************************************* * Backend independent analysis & optimization phase finished *************************************************************/ - auto executors = std::make_shared(std::move(model_edges)); - for (auto &pair : lowered_subgs) + auto executors = std::make_shared(); + for (auto &&pair : lowered_subgs) { - const auto &subg_index = pair.first; + auto const model_index = ir::ModelIndex{0}; + auto const subg_index = pair.first; auto &lowered_subg = pair.second; - auto indexed_ranks = lowered_subg->indexed_ranks(); + auto const indexed_ranks = lowered_subg->indexed_ranks(); ir::OperationDumper dumper("Executor generation of Subgraph " + std::to_string(subg_index.value())); lowered_subg->graph().operations().iterate( [&](const ir::OperationIndex &, const ir::Operation &op) { op.accept(dumper); }); - auto &options = (model_count > 1) ? 
*_voptions[subg_index.value()] : *_voptions[0]; auto executor = std::unique_ptr{ExecutorFactory::get().create( - std::move(lowered_subg), tracing_ctx.get(), options, executors)}; + std::move(lowered_subg), tracing_ctx.get(), *_options, executors, model_index)}; executor->setIndexedRanks(indexed_ranks); - executors->emplace(subg_index, std::move(executor)); + executors->emplace(model_index, subg_index, std::move(executor)); } /******************************** * Code generation phase finished ********************************/ - _state = State::COMPILED; return std::make_shared(executors, std::move(tracing_ctx)); } -std::vector> Compiler::compile(const char *package_file_path, - const char *map_file_path) -{ - // Allow one model compilation for pipeline - if (_nnpkg->model_count() != 1) - throw std::runtime_error{"Multiple models compilation for pipeline is not supported yet."}; - assert(_voptions.size() == 1); - - auto model = _nnpkg->primary_model(); - auto &options = *_voptions[0]; - - std::string package_path(package_file_path); - std::string partition_map_file; - - if (map_file_path) - { - partition_map_file = map_file_path; - } - else - { - partition_map_file = package_path + "/partition_map.json"; - } - - std::ifstream pmfs(partition_map_file); - Json::Value root; - pmfs >> root; - const Json::Value &map = root["partition_map"]; - const Json::Value &np = root["num_partitions"]; - - uint32_t num_graphs = 1; - - if (pmfs.is_open()) - { - num_graphs = np.asUInt(); - for (uint32_t i = 0; i < (uint32_t)map.size(); ++i) - { - options.partial_graph_options.index_to_graph[ir::OperationIndex{i}] = - ir::SubgraphIndex{map[i].asUInt()}; - } - } - else - { - throw std::runtime_error("There is no partition map file"); - } - - if (!buildPartialGraph(num_graphs)) - { - throw std::runtime_error("It doesn't support in case there are subgraphs"); - } - - // Set control flow backend for control flow operators - { - auto &builtin_id = backend::builtin::Config::ID; - options.manual_scheduler_options.opcode_to_backend[ir::OpCode::If] = builtin_id; - options.manual_scheduler_options.opcode_to_backend[ir::OpCode::While] = builtin_id; - options.manual_scheduler_options.opcode_to_backend[ir::OpCode::Permute] = builtin_id; - } - - // FIXME This is a workaround for bcq operations, should remove it - { - options.manual_scheduler_options.opcode_to_backend[ir::OpCode::BCQFullyConnected] = "bcq"; - options.manual_scheduler_options.opcode_to_backend[ir::OpCode::BCQGather] = "bcq"; - } - - // FIXME This is a workaround for bulk operations, should remove it - { - options.manual_scheduler_options.opcode_to_backend[ir::OpCode::Bulk] = "trix"; - } - - verboseOptions(options); - - model->iterate([&](const ir::SubgraphIndex &, ir::Graph &subg) { - // Mandatory passes - auto part = subg.partialgraphs(); - part->iterate([&](const ir::SubgraphIndex &, ir::Graph &partialgraph) { - pass::PassRunner{} - .append(std::make_unique(partialgraph)) - .append(std::make_unique(partialgraph)) - .run(); - - // Optimizations - pass::PassRunner{} - .append(std::make_unique(partialgraph)) - .run(); - }); - }); - - /*************************************************** - * Prepare compilation phase - ***************************************************/ - - // Compilable check - // TODO: Support hybrid execution - - // execution between interpreter and compiled executor (including control flow) - if (options.disable_compile) - { - std::vector> results; - auto executors = std::make_shared(); - - model->iterate([&](const ir::SubgraphIndex &index, 
ir::Graph &subg) { - executors->emplace(index, std::make_unique(subg)); - }); - results.push_back(std::make_shared(executors, nullptr)); - _state = State::COMPILED; - return results; - } - - // Mode check - if (options.he_profiling_mode) - checkProfilerConditions(); - - /*************************************************** - * Backend independent analysis & optimization phase - ***************************************************/ - auto dump_level = static_cast(options.graph_dump_level); - onert::dumper::dot::DotDumper dot_dumper_part(dump_level); - - // Lower: Assign backend - std::unordered_map> - lowered_partialgraphs; - model->iterate([&](const ir::SubgraphIndex &, ir::Graph &subg) { - auto part = subg.partialgraphs(); - part->iterate([&](const ir::SubgraphIndex &pindex, ir::Graph &partialgraph) { - dot_dumper_part.dump(partialgraph, - nnfw::misc::str("before_lower_subg_partialgraph-", pindex.value())); - - // // Lower: Assign backend - lowered_partialgraphs[pindex] = - std::make_unique(subg, partialgraph, options); - }); - }); - - for (auto &pair : lowered_partialgraphs) - { - - const auto &partialgraph_index = pair.first; - auto &lowered_partialgraph = pair.second; - dot_dumper_part.dump(*lowered_partialgraph, "after_lower_subg_partialgraph-" + - std::to_string(partialgraph_index.value())); - } - - // Partial Graph shape inference - std::unordered_map> inferers = - createStaticShapeInferers(lowered_partialgraphs); - // NOTE If partialgraph has subgraphs StaticShapeInferer may be called multiple times - for (auto &pair : lowered_partialgraphs) - { - const auto &partialgraph_index = pair.first; - const auto partial_inferer = inferers.at(partialgraph_index).get(); - partial_inferer->infer(); - partial_inferer->dump(); - } - - // Shape validation - // TODO Move shape independent feature check from ShapeValidator to OperationValidator - // TODO Move ShapeValidator into shape inference - // - Check input tensor shape validation - // - Check parameter value validation which valid value is depend on input tensor shape - // - Output tensor shape validation check is needless because - // static/dynamic shape inferer will make valid output shape - for (auto &pair : lowered_partialgraphs) - { - auto &lowered_partialgraph = pair.second; - compiler::ShapeValidator{lowered_partialgraph->graph()}(); - } - - /************************************************************* - * Backend independent analysis & optimization phase finished - *************************************************************/ - std::map> ordered; - for (auto &pair : lowered_partialgraphs) - { - // const auto &partialgraph_index = pair.first; - auto &lowered_partialgraph = pair.second; - - ordered.insert(make_pair(pair.first.value(), std::move(lowered_partialgraph))); - } - - std::vector> results; - for (auto &pair : ordered) - { - auto executors = std::make_shared(); - - const auto &partialgraph_index = ir::SubgraphIndex(pair.first); - auto &lowered_partialgraph = pair.second; - auto indexed_ranks = lowered_partialgraph->indexed_ranks(); - ir::OperationDumper dumper("Executor generation of Subgraph " + - std::to_string(partialgraph_index.value())); - lowered_partialgraph->graph().operations().iterate( - [&](const ir::OperationIndex &, const ir::Operation &op) { op.accept(dumper); }); - auto executor = std::unique_ptr{ - ExecutorFactory::get().create(std::move(lowered_partialgraph), nullptr, options, executors)}; - executor->setIndexedRanks(indexed_ranks); - executors->emplace(ir::SubgraphIndex{0}, std::move(executor)); - - // It 
doesn't support tracing in case of partial graph - results.push_back(std::make_shared(executors, nullptr)); - } - - _nnpkg.reset(); - /******************************** - * Code generation phase finished - ********************************/ - _state = State::COMPILED; - - return results; -} - } // namespace compiler - } // namespace onert diff --git a/runtime/onert/core/src/compiler/CompilerFactory.cc b/runtime/onert/core/src/compiler/CompilerFactory.cc new file mode 100644 index 000000000..d8d4bb277 --- /dev/null +++ b/runtime/onert/core/src/compiler/CompilerFactory.cc @@ -0,0 +1,45 @@ +/* + * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "compiler/CompilerFactory.h" + +#include "MultiModelCompiler.h" + +#include "compiler/Compiler.h" + +namespace onert +{ +namespace compiler +{ + +CompilerFactory &CompilerFactory::get() +{ + static CompilerFactory singleton; + return singleton; +} + +std::unique_ptr +CompilerFactory::create(const std::shared_ptr &nnpkg, + std::vector> &copts) +{ + if (nnpkg->model_count() == 1) + return std::make_unique(nnpkg, copts); + + return std::make_unique(nnpkg, copts); +} + +} // namespace compiler +} // namespace onert diff --git a/runtime/onert/core/src/compiler/CompilerOptions.cc b/runtime/onert/core/src/compiler/CompilerOptions.cc new file mode 100644 index 000000000..b5fd392e0 --- /dev/null +++ b/runtime/onert/core/src/compiler/CompilerOptions.cc @@ -0,0 +1,145 @@ +/* + * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
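CompilerFactory::create() above returns the single-model Compiler when the package holds exactly one model and MultiModelCompiler otherwise, so callers only deal with the common compiler interface. A caller-side sketch under stated assumptions: the element type of the options vector, the NNPkg parameter type, and the CompilerArtifact result type are inferred from surrounding code in this patch, not spelled out here.

#include <memory>
#include <vector>

#include "compiler/CompilerFactory.h"
#include "compiler/CompilerOptions.h"
#include "ir/NNPkg.h"

// Hypothetical helper: compile a loaded NN package with per-model compiler options.
std::shared_ptr<onert::compiler::CompilerArtifact>
compilePackage(const std::shared_ptr<onert::ir::NNPkg> &nnpkg,
               std::vector<std::unique_ptr<onert::compiler::CompilerOptions>> &copts)
{
  // The factory hides the single-model vs. multi-model distinction from the caller.
  auto compiler = onert::compiler::CompilerFactory::get().create(nnpkg, copts);
  return compiler->compile(); // artifact type name assumed; it bundles executors and tracing context
}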
+ */ + +#include "compiler/CompilerOptions.h" + +#include "../backend/builtin/Backend.h" + +#include "util/ConfigSource.h" +#include "util/logging.h" + +#include + +namespace +{ + +using namespace onert; + +std::string getOpBackends(std::unordered_map &opcode_to_backend) +{ + std::unordered_map::iterator it; + std::string opbackends; + + for (it = opcode_to_backend.begin(); it != opcode_to_backend.end(); ++it) + { + if (!opbackends.empty()) + opbackends = opbackends + ", "; + + auto opcode = it->first; + const std::string opname = ir::toString(opcode); + opbackends += opname + "=" + it->second; + } + return opbackends; +} + +} // namespace + +namespace onert +{ +namespace compiler +{ + +void ManualSchedulerOptions::setBackendMap(const std::string &str) +{ + // TODO Support multiple subgraphs for manual scheduling + auto key_val_list = nnfw::misc::split(str, ';'); + for (const auto &key_val_str : key_val_list) + { + if (key_val_str.empty()) + { + continue; + } + + auto key_val = nnfw::misc::split(key_val_str, '='); + const auto &key_str = key_val.at(0); + const auto &val = key_val.at(1); + auto key = static_cast(std::stoi(key_str)); + this->index_to_backend.emplace(ir::OperationIndex{key}, val); + } +} + +std::unique_ptr CompilerOptions::fromGlobalConfig() +{ + auto o = std::make_unique(); + o->backend_list = nnfw::misc::split(util::getConfigString(util::config::BACKENDS), ';'); + o->trace_filepath = util::getConfigString(util::config::TRACE_FILEPATH); + o->graph_dump_level = util::getConfigInt(util::config::GRAPH_DOT_DUMP); + o->executor = util::getConfigString(util::config::EXECUTOR); + o->he_scheduler = util::getConfigBool(util::config::USE_SCHEDULER); + o->he_profiling_mode = util::getConfigBool(util::config::PROFILING_MODE); + o->fp16_enable = util::getConfigBool(util::config::FP16_ENABLE); + { + // Backend for all + auto &ms_options = o->manual_scheduler_options; + + // Default value for op_backend_all is first element in the backend list + ms_options.backend_for_all = util::getConfigString(util::config::OP_BACKEND_ALLOPS); + +// Opcode to Backend +#define OP(OpName) \ + { \ + const auto &backend_str = util::getConfigString(util::config::OP_BACKEND_##OpName); \ + if (!backend_str.empty()) \ + { \ + ms_options.opcode_to_backend[ir::OpCode::OpName] = backend_str; \ + } \ + } +#include "ir/Operations.lst" +#undef OP + + // Index to Backend + auto map_str = util::getConfigString(util::config::OP_BACKEND_MAP); + ms_options.setBackendMap(map_str); + } + return o; +} + +void CompilerOptions::forceInternalOptions() +{ + // Set control flow backend for control flow operators + auto &builtin_id = backend::builtin::Config::ID; + manual_scheduler_options.opcode_to_backend[ir::OpCode::If] = builtin_id; + manual_scheduler_options.opcode_to_backend[ir::OpCode::While] = builtin_id; + manual_scheduler_options.opcode_to_backend[ir::OpCode::Permute] = builtin_id; + + // FIXME This is a workaround for bcq operations, should remove it + manual_scheduler_options.opcode_to_backend[ir::OpCode::BCQFullyConnected] = "bcq"; + manual_scheduler_options.opcode_to_backend[ir::OpCode::BCQGather] = "bcq"; + + // FIXME This is a workaround for bulk operations, should remove it + manual_scheduler_options.opcode_to_backend[ir::OpCode::Bulk] = "trix"; +} + +void CompilerOptions::verboseOptions() +{ + VERBOSE(Compiler) << std::boolalpha << "==== Compiler Options ====" << std::endl; + VERBOSE(Compiler) << "backend_list : " + << nnfw::misc::join(backend_list.begin(), backend_list.end(), "/") << std::endl; + 
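Compiler::compile() earlier in this patch drives the helpers defined in this file in a fixed order: build the options from the global config, force the internal backend assignments, then dump the effective settings. A small sketch of that sequence in isolation (the wrapper function name is illustrative):

#include "compiler/CompilerOptions.h"

void prepareCompilerOptions()
{
  // Read BACKENDS, EXECUTOR, PROFILING_MODE and friends from the global config source.
  auto options = onert::compiler::CompilerOptions::fromGlobalConfig();

  // Pin builtin/bcq/trix backends for If/While/Permute, BCQ* and Bulk operations.
  options->forceInternalOptions();

  // Log the effective configuration through VERBOSE(Compiler).
  options->verboseOptions();

  // A real caller would now hand the options to a compiler instance.
}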
VERBOSE(Compiler) << "trace_filepath : " << trace_filepath << std::endl; + VERBOSE(Compiler) << "graph_dump_level : " << graph_dump_level << std::endl; + VERBOSE(Compiler) << "executor : " << executor << std::endl; + VERBOSE(Compiler) << "manual backend_for_all : " << manual_scheduler_options.backend_for_all + << std::endl; + VERBOSE(Compiler) << "manual_scheduler_options : " + << getOpBackends(manual_scheduler_options.opcode_to_backend) << std::endl; + VERBOSE(Compiler) << "he_scheduler : " << he_scheduler << std::endl; + VERBOSE(Compiler) << "he_profiling_mode : " << he_profiling_mode << std::endl; + VERBOSE(Compiler) << "fp16_enable : " << fp16_enable << std::endl + << std::noboolalpha; +} + +} // namespace compiler +} // namespace onert diff --git a/runtime/onert/core/src/compiler/ExecutorFactory.cc b/runtime/onert/core/src/compiler/ExecutorFactory.cc index 024556e7e..b09d6b021 100644 --- a/runtime/onert/core/src/compiler/ExecutorFactory.cc +++ b/runtime/onert/core/src/compiler/ExecutorFactory.cc @@ -196,7 +196,7 @@ backend::BackendContexts createBackendContexts(compiler::LoweredGraph &lgraph, b // Create contexts auto whole_op_order = lgraph.graph().topolSortOperations(); - for (auto &pair : context_data_map) + for (auto &&pair : context_data_map) { auto backend = pair.first; auto &data = pair.second; @@ -240,18 +240,22 @@ ExecutorFactory &ExecutorFactory::get() ExecutorFactory::ExecutorFactory() { _map["Linear"] = createLinearExecutor; - _map["Dataflow"] = std::bind(createDataflowExecutor, std::placeholders::_1, std::placeholders::_2, - std::placeholders::_3, std::placeholders::_4, false); - _map["Parallel"] = std::bind(createDataflowExecutor, std::placeholders::_1, std::placeholders::_2, - std::placeholders::_3, std::placeholders::_4, true); + _map["Dataflow"] = + std::bind(createDataflowExecutor, std::placeholders::_1, std::placeholders::_2, + std::placeholders::_3, std::placeholders::_4, std::placeholders::_5, false); + _map["Parallel"] = + std::bind(createDataflowExecutor, std::placeholders::_1, std::placeholders::_2, + std::placeholders::_3, std::placeholders::_4, std::placeholders::_5, true); } exec::IExecutor *ExecutorFactory::create(std::unique_ptr lowered_graph, const util::TracingCtx *tracing_ctx, const compiler::CompilerOptions &options, - const std::shared_ptr &executors) + const std::shared_ptr &executors, + const ir::ModelIndex &index) { - return _map.at(options.executor)(std::move(lowered_graph), tracing_ctx, options, executors); + return _map.at(options.executor)(std::move(lowered_graph), tracing_ctx, options, executors, + index); } void ExecutorFactory::prepareMigrantTensors(compiler::LoweredGraph &lowered_graph, @@ -282,10 +286,11 @@ void ExecutorFactory::prepareMigrantTensors(compiler::LoweredGraph &lowered_grap } void ExecutorFactory::prepareBuiltinBackend(const TensorRegistries &tensor_regs, - const std::shared_ptr &executors, - const backend::BackendContexts &backend_contexts) + const std::shared_ptr &executors, + const backend::BackendContexts &backend_contexts, + const ir::ModelIndex &index) { - for (auto &pair : backend_contexts) + for (auto &&pair : backend_contexts) { auto builtin_context = dynamic_cast(pair.second.get()); if (builtin_context != nullptr) @@ -293,6 +298,7 @@ void ExecutorFactory::prepareBuiltinBackend(const TensorRegistries &tensor_regs, auto builtin_kernel_gen = builtin_context->kernel_gen; builtin_kernel_gen->setTensorRegistries(tensor_regs); builtin_kernel_gen->setExecutors(executors); + builtin_kernel_gen->setModelIndex(index); } } } @@ 
-302,7 +308,7 @@ ExecutorFactory::orderBackendContext(const backend::BackendContexts &backend_con { std::deque> ordered_contexts; - for (auto &pair : backend_contexts) + for (auto &&pair : backend_contexts) { // NOTE builtin backend must be processed lastly. // This is because of Permute layer's specialty which is the only operation that could have @@ -319,7 +325,8 @@ ExecutorFactory::orderBackendContext(const backend::BackendContexts &backend_con exec::IExecutor *ExecutorFactory::createLinearExecutor( std::unique_ptr lowered_graph, const util::TracingCtx *tracing_ctx, - const compiler::CompilerOptions &options, const std::shared_ptr &executors) + const compiler::CompilerOptions &options, const std::shared_ptr &executors, + const ir::ModelIndex &index) { auto &graph = lowered_graph->graph(); @@ -337,7 +344,7 @@ exec::IExecutor *ExecutorFactory::createLinearExecutor( auto order = Linear::linearize(*lowered_graph); Linear::dump(*lowered_graph, order); - for (auto &pair : backend_contexts) + for (auto &&pair : backend_contexts) { pair.second->genTensors(); } @@ -345,7 +352,7 @@ exec::IExecutor *ExecutorFactory::createLinearExecutor( prepareMigrantTensors(*lowered_graph, backend_contexts); // Give some runtime objects to builtin KernelGenerator - prepareBuiltinBackend(tensor_regs, executors, backend_contexts); + prepareBuiltinBackend(tensor_regs, executors, backend_contexts, index); ExecutionBuilder builder; @@ -406,10 +413,10 @@ exec::IExecutor *ExecutorFactory::createLinearExecutor( } // Generate kernels - for (auto &pair : ordered_contexts) + for (auto &&pair : ordered_contexts) { auto codes = pair.second->genKernels(); - for (auto &pair : codes) + for (auto &&pair : codes) { auto &op_ind = pair.first; auto &fn_seq = pair.second; @@ -444,8 +451,8 @@ exec::IExecutor *ExecutorFactory::createLinearExecutor( exec::IExecutor *ExecutorFactory::createDataflowExecutor( std::unique_ptr lowered_graph, const util::TracingCtx *tracing_ctx, - const compiler::CompilerOptions &options, const std::shared_ptr &executors, - bool parallel) + const compiler::CompilerOptions &options, const std::shared_ptr &executors, + const ir::ModelIndex &index, bool parallel) { backend::BackendContexts backend_contexts = createBackendContexts(*lowered_graph, options.executor == "Linear"); @@ -457,7 +464,7 @@ exec::IExecutor *ExecutorFactory::createDataflowExecutor( (lowered_graph->graph().getInputs() + lowered_graph->graph().getOutputs()) | ir::Remove::DUPLICATED | ir::Remove::UNDEFINED); - for (auto &pair : backend_contexts) + for (auto &&pair : backend_contexts) { pair.second->genTensors(); } @@ -465,7 +472,7 @@ exec::IExecutor *ExecutorFactory::createDataflowExecutor( prepareMigrantTensors(*lowered_graph, backend_contexts); // Give some runtime objects to builtin KernelGenerator - prepareBuiltinBackend(tensor_regs, executors, backend_contexts); + prepareBuiltinBackend(tensor_regs, executors, backend_contexts, index); ExecutionBuilder builder; @@ -473,10 +480,10 @@ exec::IExecutor *ExecutorFactory::createDataflowExecutor( auto ordered_contexts = orderBackendContext(backend_contexts); // Generate kernels - for (auto &pair : ordered_contexts) + for (auto &&pair : ordered_contexts) { auto codes = pair.second->genKernels(); - for (auto &pair : codes) + for (auto &&pair : codes) { auto &op_ind = pair.first; auto &fn_seq = pair.second; diff --git a/runtime/onert/core/src/compiler/ExecutorFactory.h b/runtime/onert/core/src/compiler/ExecutorFactory.h index 70c089f8c..f8f989043 100644 --- 
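// Illustrative sketch, not part of the upstream patch: the ExecutorFactory changes above extend
// the registration map so that every creator also receives the model index, while "Dataflow" and
// "Parallel" keep sharing one creator whose trailing `parallel` flag is bound at registration
// time. The types below are simplified stand-ins, not the runtime's real signatures.
#include <functional>
#include <iostream>
#include <string>
#include <unordered_map>

struct ModelIndex
{
  int value;
};

int createDataflowExecutor(const std::string &name, ModelIndex index, bool parallel)
{
  std::cout << "create " << name << " for model " << index.value << ", parallel=" << parallel
            << std::endl;
  return parallel ? 2 : 1;
}

int main()
{
  std::unordered_map<std::string, std::function<int(const std::string &, ModelIndex)>> map;
  map["Dataflow"] =
    std::bind(createDataflowExecutor, std::placeholders::_1, std::placeholders::_2, false);
  map["Parallel"] =
    std::bind(createDataflowExecutor, std::placeholders::_1, std::placeholders::_2, true);
  // An equivalent lambda registration, shown for comparison:
  // map["Parallel"] = [](const std::string &n, ModelIndex i) { return createDataflowExecutor(n, i, true); };
  std::cout << map.at("Parallel")("Parallel", ModelIndex{0}) << std::endl;
  return 0;
}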
a/runtime/onert/core/src/compiler/ExecutorFactory.h +++ b/runtime/onert/core/src/compiler/ExecutorFactory.h @@ -21,7 +21,7 @@ #include "backend/ITensor.h" #include "compiler/LoweredGraph.h" -#include "exec/Executors.h" +#include "exec/IExecutors.h" #include #include @@ -40,7 +40,8 @@ public: exec::IExecutor *create(std::unique_ptr lowered_graph, const util::TracingCtx *tracing_ctx, const compiler::CompilerOptions &options, - const std::shared_ptr &executors); + const std::shared_ptr &executors, + const ir::ModelIndex &index); private: ExecutorFactory(); @@ -49,26 +50,28 @@ private: static void prepareMigrantTensors(compiler::LoweredGraph &lowered_graph, const backend::BackendContexts &backend_contexts); static void prepareBuiltinBackend(const TensorRegistries &tensor_regs, - const std::shared_ptr &executors, - const backend::BackendContexts &backend_contexts); + const std::shared_ptr &executors, + const backend::BackendContexts &backend_contexts, + const ir::ModelIndex &index); static std::deque> orderBackendContext(const backend::BackendContexts &backend_contexts); static exec::IExecutor *createLinearExecutor( std::unique_ptr lowered_graph, const util::TracingCtx *tracing_ctx, - const compiler::CompilerOptions &options, const std::shared_ptr &executors); - static exec::IExecutor * - createDataflowExecutor(std::unique_ptr lowered_graph, - const util::TracingCtx *tracing_ctx, - const compiler::CompilerOptions &options, - const std::shared_ptr &executors, bool parallel); + const compiler::CompilerOptions &options, const std::shared_ptr &executors, + const ir::ModelIndex &index); + static exec::IExecutor *createDataflowExecutor( + std::unique_ptr lowered_graph, const util::TracingCtx *tracing_ctx, + const compiler::CompilerOptions &options, const std::shared_ptr &executors, + const ir::ModelIndex &index, bool parallel); private: std::unordered_map< std::string, std::function, const util::TracingCtx *tracing_ctx, - const compiler::CompilerOptions &options, const std::shared_ptr &executors)>> + const compiler::CompilerOptions &options, const std::shared_ptr &executors, + const ir::ModelIndex &index)>> _map; }; diff --git a/runtime/onert/core/src/compiler/Fp32ToFp16Converter.cc b/runtime/onert/core/src/compiler/Fp32ToFp16Converter.cc index 98dc906e4..fdf4e24f0 100644 --- a/runtime/onert/core/src/compiler/Fp32ToFp16Converter.cc +++ b/runtime/onert/core/src/compiler/Fp32ToFp16Converter.cc @@ -393,10 +393,10 @@ void Fp32ToFp16Converter::convertOperandsOfOpSequence(ir::OpSequence &op_seq) const auto &op_seq_inputs = _lowered_graph.graph().getInputs(); const auto &op_seq_outputs = _lowered_graph.graph().getOutputs(); - for (auto &op_idx : op_seq) + for (const auto &op_idx : op_seq) { const auto &node = operations.at(op_idx); - for (auto &ind : node.getInputs() | ir::Remove::UNDEFINED) + for (const auto &ind : node.getInputs() | ir::Remove::UNDEFINED) { if (node.opcode() == ir::OpCode::ConvertFp32ToFp16 || op_seq_inputs.contains(ind)) continue; @@ -410,7 +410,7 @@ void Fp32ToFp16Converter::convertOperandsOfOpSequence(ir::OpSequence &op_seq) VERBOSE(Fp32ToFp16Converter) << "Input Operand " << ind << ": fp16" << std::endl; } - for (auto &ind : node.getOutputs()) + for (const auto &ind : node.getOutputs()) { if (node.opcode() == ir::OpCode::ConvertFp16ToFp32 || op_seq_outputs.contains(ind)) continue; @@ -747,7 +747,7 @@ Fp32ToFp16Converter::findOpSequencesContiguous(const InputToOpSeqs &input_to_op_ // | | // [OPERATION] [OPERATION] // - for (auto &op_seq_ind : found_input_in_op_seqs->second) + for 
(const auto &op_seq_ind : found_input_in_op_seqs->second) { auto found_in_fp32_to_fp16 = _list_fp32_to_fp16.find(op_seq_ind); if (found_in_fp32_to_fp16 != _list_fp32_to_fp16.end()) @@ -799,13 +799,13 @@ Fp32ToFp16Converter::getListOpSequences(const OpSeqIndexToOpSeqIndexList &opseq_ OpSeqIndexList list; for (const auto &it : opseq_map_to_delete) { - auto &opseq_ind_fp16_to_fp32 = it.first; + const auto &opseq_ind_fp16_to_fp32 = it.first; if (list.find(opseq_ind_fp16_to_fp32) == list.end()) { list.emplace(opseq_ind_fp16_to_fp32); } - for (auto &opseq_ind_fp32_to_fp16 : it.second) + for (const auto &opseq_ind_fp32_to_fp16 : it.second) { if (list.find(opseq_ind_fp32_to_fp16) == list.end()) { @@ -869,7 +869,7 @@ void Fp32ToFp16Converter::manipulateContiguousOpSequences( auto &op_seq_fp16_to_fp32 = op_seqs.at(op_seq_ind_fp16_to_fp32); auto &input_ind_fp16_to_fp32 = op_seq_fp16_to_fp32.getInputs().at(0); - for (auto &op_seq_ind_fp32_to_fp16 : it.second) + for (const auto &op_seq_ind_fp32_to_fp16 : it.second) { auto &op_seq_fp32_to_fp16 = op_seqs.at(op_seq_ind_fp32_to_fp16); assert(op_seq_fp32_to_fp16.size() == 1); @@ -879,7 +879,7 @@ void Fp32ToFp16Converter::manipulateContiguousOpSequences( auto found_next_to_fp16 = input_to_op_seqs.find(output_ind_fp32_to_fp16); assert(found_next_to_fp16 != input_to_op_seqs.end()); - for (auto &op_seq_ind_next_to_fp16 : found_next_to_fp16->second) + for (const auto &op_seq_ind_next_to_fp16 : found_next_to_fp16->second) { manipulateInput(op_seq_ind_next_to_fp16, output_ind_fp32_to_fp16, input_ind_fp16_to_fp32); } @@ -901,7 +901,7 @@ void Fp32ToFp16Converter::deleteContiguousOpSequences( auto &operations = _lowered_graph.graph().operations(); auto &op_seqs = _lowered_graph.op_seqs(); - for (auto &op_seq_ind : list_to_delete_op_seqs) + for (const auto &op_seq_ind : list_to_delete_op_seqs) { auto &op_seq = op_seqs.at(op_seq_ind); assert(op_seq.size() == 1); @@ -914,7 +914,7 @@ void Fp32ToFp16Converter::deleteContiguousOpSequences( VERBOSE(Fp32ToFp16Converter) << "Delete Node " << first_node_ind << std::endl; // Uses - for (auto &ind : first_node.getInputs() | ir::Remove::DUPLICATED | ir::Remove::UNDEFINED) + for (const auto &ind : first_node.getInputs() | ir::Remove::DUPLICATED | ir::Remove::UNDEFINED) { auto &obj = operands.at(ind); obj.removeUse(first_node_ind); @@ -923,7 +923,7 @@ void Fp32ToFp16Converter::deleteContiguousOpSequences( } // Def - for (auto &ind : first_node.getOutputs() | ir::Remove::DUPLICATED | ir::Remove::UNDEFINED) + for (const auto &ind : first_node.getOutputs() | ir::Remove::DUPLICATED | ir::Remove::UNDEFINED) { auto &obj = operands.at(ind); assert(obj.getDef() == first_node_ind); @@ -942,7 +942,7 @@ void Fp32ToFp16Converter::deleteContiguousOpSequences( } // Operand - for (auto &ind : list_to_delete_ops) + for (const auto &ind : list_to_delete_ops) { operands.remove(ind); VERBOSE(Fp32ToFp16Converter) << "Operand " << ind << " is removed" << std::endl; diff --git a/runtime/onert/core/src/compiler/HEScheduler.cc b/runtime/onert/core/src/compiler/HEScheduler.cc index c4bfddb8f..65fd4cd77 100644 --- a/runtime/onert/core/src/compiler/HEScheduler.cc +++ b/runtime/onert/core/src/compiler/HEScheduler.cc @@ -512,7 +512,7 @@ HEScheduler::ESTAndExecTime(const backend::Backend *backend, const ir::Operation // Find free time for data transferring and insert it into backend taskset. This is needed: // 1. Time for multiple permutations for this node's input is found correctly // 2. 
If backend==cpu, then free time for this node must come after permutations - for (auto &it : transfer_st_exec_time) + for (auto &&it : transfer_st_exec_time) { if (_is_parallel_exec) { diff --git a/runtime/onert/core/src/compiler/HEScheduler.test.cc b/runtime/onert/core/src/compiler/HEScheduler.test.cc index c4a2df025..589331b49 100644 --- a/runtime/onert/core/src/compiler/HEScheduler.test.cc +++ b/runtime/onert/core/src/compiler/HEScheduler.test.cc @@ -163,7 +163,7 @@ void setOperationsExecutionTime(const std::vector &backends, ExecTime et(backends); for (int i = 0; i < op_names.size(); ++i) { - for (auto &backend : backends) + for (const auto backend : backends) setOperationExecTime(et, backend, op_names[i], false, op_sizes[i], exec_time); } et.storeOperationsExecTime(); @@ -189,7 +189,7 @@ void setPermutationsExecutionTime(const std::vector &backends, ExecTime et(backends); for (const auto &backend : backends) { - for (auto &other_backend : backends) + for (const auto other_backend : backends) { if (backend == other_backend) continue; diff --git a/runtime/onert/core/src/compiler/LoweredGraph.cc b/runtime/onert/core/src/compiler/LoweredGraph.cc index 9e84753a7..d53d0ed00 100644 --- a/runtime/onert/core/src/compiler/LoweredGraph.cc +++ b/runtime/onert/core/src/compiler/LoweredGraph.cc @@ -44,14 +44,6 @@ LoweredGraph::LoweredGraph(const ir::Graph &graph, const CompilerOptions &option lowerGraph(options); } -// TODO Design better class and constructor to represent parent_graph -LoweredGraph::LoweredGraph(const ir::Graph &parent_graph, const ir::Graph &graph, - const CompilerOptions &options) - : _graph{graph}, _parent_graph{parent_graph} -{ - lowerGraph(options); -} - void LoweredGraph::lowerGraph(const CompilerOptions &options) { // Build backend contexts diff --git a/runtime/onert/core/src/compiler/ManualScheduler.cc b/runtime/onert/core/src/compiler/ManualScheduler.cc index af2d84cd9..621f0c7b7 100644 --- a/runtime/onert/core/src/compiler/ManualScheduler.cc +++ b/runtime/onert/core/src/compiler/ManualScheduler.cc @@ -64,7 +64,7 @@ std::unique_ptr ManualScheduler::schedule(const ir::Graph &grap // 2. Backend per operation type std::unordered_map op_type_map; - for (auto &pair : manual_options.opcode_to_backend) + for (const auto &pair : manual_options.opcode_to_backend) { op_type_map.emplace(pair.first, BackendManager::get().get(pair.second)); } @@ -80,7 +80,7 @@ std::unique_ptr ManualScheduler::schedule(const ir::Graph &grap }); // 3. Backend per operation - for (auto &pair : manual_options.index_to_backend) + for (const auto &pair : manual_options.index_to_backend) { const auto &key = pair.first; const auto &val = pair.second; diff --git a/runtime/onert/core/src/compiler/MultiModelCompiler.cc b/runtime/onert/core/src/compiler/MultiModelCompiler.cc new file mode 100644 index 000000000..fea6a7f25 --- /dev/null +++ b/runtime/onert/core/src/compiler/MultiModelCompiler.cc @@ -0,0 +1,214 @@ +/* + * Copyright (c) 2023 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "MultiModelCompiler.h" + +#include "ExecutorFactory.h" +#include "ShapeValidator.h" +#include "pass/ConstantOutputPass.h" +#include "pass/OddOutputPass.h" +#include "pass/PassRunner.h" +#include "pass/UnusedOperandEliminationPass.h" +#include "../dumper/dot/DotDumper.h" +#include "../exec/Executors.h" +#include "../ir/OperationDumper.h" +#include "../ir/verifier/Verifier.h" + +#include "compiler/StaticShapeInferer.h" + +#include + +namespace onert +{ +namespace compiler +{ + +MultiModelCompiler::MultiModelCompiler(const std::shared_ptr &nnpkg, + std::vector> &copts) + : _nnpkg{nnpkg}, _voptions{} +{ + assert(nnpkg->model_count() != 1); + + for (uint32_t i = 0; i < copts.size(); i++) + { + _voptions.push_back(copts[i].get()); + } +} + +std::shared_ptr MultiModelCompiler::compile(void) +{ + /*************************************************** + * Prepare compilation phase + ***************************************************/ + for (auto options : _voptions) + { + if (!options) + throw std::runtime_error{"Empty compile option"}; + + // Mode check + // TODO handle option for each model + if (options->he_profiling_mode) + throw std::runtime_error("NYI: Profiling mode for multiple model is not supported yet"); + + options->forceInternalOptions(); + options->verboseOptions(); + } + + // NYI: allow one model compilation + auto const model_count = _nnpkg->model_count(); + if (model_count != _voptions.size()) + throw std::runtime_error{"Model count and option vector size mismatch"}; + + for (uint16_t i = 0; i < model_count; i++) + { + _nnpkg->model(ir::ModelIndex{i})->iterate([&](const ir::SubgraphIndex &, ir::Graph &subg) { + // Mandatory passes + pass::PassRunner{} + .append(std::make_unique(subg)) + .append(std::make_unique(subg)) + .run(); + + // Optimizations + pass::PassRunner{}.append(std::make_unique(subg)).run(); + }); + } + + /*************************************************** + * Backend independent analysis & optimization phase + ***************************************************/ + // TODO Handle dump level for each model + auto dump_level = static_cast(_voptions[0]->graph_dump_level); + onert::dumper::dot::DotDumper dot_dumper(dump_level); + + // Tracing context + // TODO Support tracing_ctx for multiple model + std::unique_ptr tracing_ctx = nullptr; + + // Model edge context: copy model edge context + auto model_edges = std::make_unique(_nnpkg->model_edges()); + + // Lower: Assign backend + std::unordered_map>> + lowered_subgs; + + for (uint16_t i = 0; i < model_count; i++) + { + auto const model_index = ir::ModelIndex{i}; + auto model = _nnpkg->model(model_index); + + model->iterate([&](const ir::SubgraphIndex &subg_index, ir::Graph &subg) { + dot_dumper.dump(subg, + nnfw::misc::str("before_lower_model-", i, "-subg-", subg_index.value())); + // Lower: Assign backend + lowered_subgs[model_index][subg_index] = + std::make_unique(subg, *_voptions[i]); + // Set tracing_ctx for copied graph + if (tracing_ctx != nullptr) + tracing_ctx->setSubgraphIndex(&(lowered_subgs[model_index][subg_index]->graph()), + subg_index.value()); + }); + } + + _nnpkg.reset(); + + for (const auto &pair : lowered_subgs) + { + const auto &model_index = pair.first; + const auto &model_lsubg = pair.second; + + for (const auto &pair_inner : model_lsubg) + { + const auto &subg_index = pair_inner.first; + const auto &lowered_subg = pair_inner.second; + dot_dumper.dump(*lowered_subg, 
nnfw::misc::str("after_lower_model-", model_index.value(), + "-subg-", subg_index.value())); + } + } + + // Shape inference. + for (auto &&pair : lowered_subgs) + { + auto &model_lsubgs = pair.second; + // Run the StaticShapeInfer of primary subg. All child StaticShapeInferers are called + // recursively + std::unordered_map> inferers = + StaticShapeInferer::createStaticShapeInferers(model_lsubgs); + + const auto primary_subg_idx = ir::SubgraphIndex{0}; + inferers.at(primary_subg_idx)->infer(); + + for (const auto &pair_inferer : inferers) + { + const auto inferer = pair_inferer.second.get(); + inferer->dump(); + } + } + + // Shape validation + // TODO Move shape independent feature check from ShapeValidator to OperationValidator + // TODO Move ShapeValidator into shape inference + // - Check input tensor shape validation + // - Check parameter value validation which valid value is depend on input tensor shape + // - Output tensor shape validation check is needless because + // static/dynamic shape inferer will make valid output shape + for (const auto &pair : lowered_subgs) + { + const auto &model_lsubgs = pair.second; + + for (const auto &pair_inner : model_lsubgs) + { + const auto &lowered_subg = pair_inner.second; + compiler::ShapeValidator{lowered_subg->graph()}(); + } + } + + /************************************************************* + * Backend independent analysis & optimization phase finished + *************************************************************/ + auto executors = std::make_shared(std::move(model_edges)); + for (auto &&pair : lowered_subgs) + { + auto const &model_index = pair.first; + auto &model_lsubgs = pair.second; + + for (auto &&pair_inner : model_lsubgs) + { + auto const subg_index = pair_inner.first; + auto &lowered_subg = pair_inner.second; + auto const indexed_ranks = lowered_subg->indexed_ranks(); + + ir::OperationDumper dumper("Executor generation of Subgraph " + + std::to_string(subg_index.value())); + lowered_subg->graph().operations().iterate( + [&](const ir::OperationIndex &, const ir::Operation &op) { op.accept(dumper); }); + + auto &options = *_voptions[model_index.value()]; + auto executor = std::unique_ptr{ExecutorFactory::get().create( + std::move(lowered_subg), tracing_ctx.get(), options, executors, model_index)}; + executor->setIndexedRanks(indexed_ranks); + executors->emplace(model_index, subg_index, std::move(executor)); + } + } + + /******************************** + * Code generation phase finished + ********************************/ + return std::make_shared(executors, std::move(tracing_ctx)); +} + +} // namespace compiler +} // namespace onert diff --git a/runtime/onert/core/src/compiler/MultiModelCompiler.h b/runtime/onert/core/src/compiler/MultiModelCompiler.h new file mode 100644 index 000000000..89af664f8 --- /dev/null +++ b/runtime/onert/core/src/compiler/MultiModelCompiler.h @@ -0,0 +1,75 @@ +/* + * Copyright (c) 2023 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/** + * @file MultiModelCompiler.h + * @brief This file contains MultiModelCompiler class to define and run compilation phase + */ + +#ifndef __ONERT_COMPILER_MULTI_MODEL_COMPILER_H__ +#define __ONERT_COMPILER_MULTI_MODEL_COMPILER_H__ + +#include "compiler/CompilerOptions.h" +#include "compiler/ICompiler.h" +#include "ir/NNPkg.h" + +namespace onert +{ +namespace compiler +{ + +/** + * @brief Class to compile NN package + */ +class MultiModelCompiler final : public ICompiler +{ +public: + /** + * @brief Construct a new Compiler object for NN package + * @param[in] nnpkg NN package to compile + * @param[in] coptions Compiler option vector for each model in package + */ + MultiModelCompiler(const std::shared_ptr &nnpkg, + std::vector> &copts); + + /** + * @brief Destroy the MultiModelCompiler object + */ + ~MultiModelCompiler() = default; + +public: + /** + * @brief Do compilation with the options + * + * @return std::shared_ptr Executors as a result of compilation + */ + std::shared_ptr compile(void); + +private: + std::shared_ptr &primary_subgraph() + { + return _nnpkg->primary_model()->at(ir::SubgraphIndex{0}); + } + +private: + std::shared_ptr _nnpkg; + std::vector _voptions; +}; + +} // namespace compiler +} // namespace onert + +#endif // __ONERT_COMPILER_MULTI_MODEL_COMPILER_H__ diff --git a/runtime/onert/core/src/compiler/StaticShapeInferer.cc b/runtime/onert/core/src/compiler/StaticShapeInferer.cc index 485450560..25747d950 100644 --- a/runtime/onert/core/src/compiler/StaticShapeInferer.cc +++ b/runtime/onert/core/src/compiler/StaticShapeInferer.cc @@ -18,6 +18,8 @@ #include "util/ShapeInference.h" #include "util/logging.h" +#include + #include #include @@ -188,6 +190,95 @@ void StaticShapeInferer::dump() }); } +std::unordered_map> +StaticShapeInferer::createStaticShapeInferers( + const std::unordered_map> &lowered_subgs) +{ + // Allocate StaticShapeInferer per each subgraph + std::unordered_map> inferers; + for (auto &&pair : lowered_subgs) + { + const auto &subg_index = pair.first; + auto &lowered_subg = pair.second; + inferers[subg_index] = std::make_unique(lowered_subg.get()); + } + + // Append observers in all StaticShapeInferers + for (auto &&pair : lowered_subgs) + { + const auto &subg_index = pair.first; + auto &lowered_subg = pair.second; + + // TODO: Change this iteration for all to controlflow iteration + lowered_subg->graph().operations().iterate( + [&](const ir::OperationIndex &, const ir::Operation &op) { + // A Function to append child inferers. These make it possible for a StaticShapeInferer to + // call StaticShapeInferes of child subgraphs recursively + auto appendChildInferer = [&](const ir::SubgraphIndex &child_subg_idx) { + auto *child_inferer = inferers.at(child_subg_idx).get(); + inferers.at(subg_index)->appendChildInferer(child_subg_idx, child_inferer); + }; + + // A Function to appaend subg input observers. This makes it possible for a + // StaticShapeInferer to update inputs of child subgraphs + auto appendSubgraphInputObserver = [&](const ir::SubgraphIndex &child_subg_idx) { + std::vector child_subg_inputs; + auto &child_subg = lowered_subgs.at(child_subg_idx)->graph(); + for (const auto &input_idx : child_subg.getInputs()) + { + auto operand_ptr = child_subg.operands().getRawPtr(input_idx); + child_subg_inputs.emplace_back(operand_ptr); + } + inferers.at(subg_index) + ->appendSubgInputObserver(child_subg_idx, + std::make_unique(child_subg_inputs)); + }; + + // A Function to set controlflow output observers. 
This makes it possible for a + // StaticShapeInferer to update outputs of parent controlflow opeerations + auto setControlFlowOutputObserver = [&](const ir::SubgraphIndex &child_subg_idx) { + std::vector cf_outputs; + auto &subg = lowered_subg->graph(); + for (const auto &output_idx : op.getOutputs()) + { + auto operand_ptr = subg.operands().getRawPtr(output_idx); + cf_outputs.emplace_back(operand_ptr); + } + inferers.at(child_subg_idx) + ->setControlflowOutputObserver(std::make_unique(cf_outputs)); + }; + + // Append Observers in a StaticShapeInferer + if (op.opcode() == ir::OpCode::If) + { + const auto &if_op = nnfw::misc::polymorphic_downcast(op); + + appendChildInferer(if_op.param().then_subg_index); + appendChildInferer(if_op.param().else_subg_index); + + appendSubgraphInputObserver(if_op.param().then_subg_index); + appendSubgraphInputObserver(if_op.param().else_subg_index); + + setControlFlowOutputObserver(if_op.param().then_subg_index); + } + else if (op.opcode() == ir::OpCode::While) + { + const auto &while_op = nnfw::misc::polymorphic_downcast(op); + + appendChildInferer(while_op.param().cond_subg_index); + appendChildInferer(while_op.param().body_subg_index); + + appendSubgraphInputObserver(while_op.param().cond_subg_index); + appendSubgraphInputObserver(while_op.param().body_subg_index); + + setControlFlowOutputObserver(while_op.param().body_subg_index); + } + }); + } + + return inferers; +} + void StaticShapeInferer::visit(const ir::operation::ArgMinMax &op) { auto &operands = _lowered_subg->graph().operands(); @@ -1306,8 +1397,11 @@ void StaticShapeInferer::visit(const ir::operation::Bulk &op) auto origin_output_shape = op.param().origin_output_shapes[0]; // TODO: more check for valid batch request - assert(cur_input_shape.dim(0) >= origin_output_shape.dim(0)); - assert(cur_input_shape.dim(0) % origin_output_shape.dim(0) == 0); + if ((cur_input_shape.dim(0) < origin_output_shape.dim(0)) || + (cur_input_shape.dim(0) % origin_output_shape.dim(0) != 0)) + { + throw std::runtime_error("StaticShapeInferer " + op.name() + ": Not supported batch size"); + } size_t batch_multiplier = cur_input_shape.dim(0) / origin_output_shape.dim(0); ir::Shape new_shape; diff --git a/runtime/onert/core/src/compiler/TensorRegistries.h b/runtime/onert/core/src/compiler/TensorRegistries.h index b3cc0bbe3..c7e06e84c 100644 --- a/runtime/onert/core/src/compiler/TensorRegistries.h +++ b/runtime/onert/core/src/compiler/TensorRegistries.h @@ -71,7 +71,7 @@ public: backend::ITensor *getITensor(ir::OperandIndex ind) const { - for (auto &tensor_reg : _tensor_regs) + for (auto &&tensor_reg : _tensor_regs) { auto tensor = tensor_reg->getITensor(ind); if (tensor) diff --git a/runtime/onert/core/src/compiler/pass/OddOutputPass.cc b/runtime/onert/core/src/compiler/pass/OddOutputPass.cc index f50fae0d3..e2b3f6111 100644 --- a/runtime/onert/core/src/compiler/pass/OddOutputPass.cc +++ b/runtime/onert/core/src/compiler/pass/OddOutputPass.cc @@ -34,7 +34,7 @@ void OddOutputPass::run() VERBOSE(OddOutputPass) << "Case 1 : An operand which is a model output and a model input" << std::endl; - for (auto &ind : outputs) + for (const auto &ind : outputs) { if (_graph.getInputs().contains(ind)) { @@ -46,7 +46,7 @@ void OddOutputPass::run() VERBOSE(OddOutputPass) << "Case 2 : Two or more duplicated outputs" << std::endl; std::unordered_set occurence; - for (auto &ind : outputs) + for (auto &&ind : outputs) { auto &obj = _graph.operands().at(ind); if (occurence.count(ind) == 0) diff --git 
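// Illustrative sketch, not part of the upstream patch: createStaticShapeInferers() above builds
// one StaticShapeInferer per subgraph and registers the inferers of If/While callee subgraphs as
// children of the caller's inferer, so calling infer() on the primary subgraph propagates shapes
// across control-flow boundaries. The class below is a simplified stand-in, not the real API.
#include <iostream>
#include <map>

struct ShapeInfererSketch
{
  int subg_index;
  std::map<int, ShapeInfererSketch *> children; // keyed by callee SubgraphIndex

  void appendChild(int child_idx, ShapeInfererSketch *child) { children[child_idx] = child; }

  void infer()
  {
    std::cout << "infer shapes of subgraph " << subg_index << std::endl;
    // The real inferer re-infers a child only when it visits the corresponding If/While operation;
    // this sketch simply recurses over all registered children.
    for (auto &&pair : children)
      pair.second->infer();
  }
};

int main()
{
  ShapeInfererSketch primary{0, {}}, then_subg{1, {}}, else_subg{2, {}};
  primary.appendChild(1, &then_subg); // If: then_subg_index
  primary.appendChild(2, &else_subg); // If: else_subg_index
  primary.infer();                    // only the primary inferer is invoked directly
  return 0;
}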
a/runtime/onert/core/src/compiler/pass/PassRunner.cc b/runtime/onert/core/src/compiler/pass/PassRunner.cc index 1be6d7794..2d11be201 100644 --- a/runtime/onert/core/src/compiler/pass/PassRunner.cc +++ b/runtime/onert/core/src/compiler/pass/PassRunner.cc @@ -31,7 +31,7 @@ PassRunner &PassRunner::append(std::unique_ptr pass) void PassRunner::run() { - for (auto &pass : _passes) + for (auto &&pass : _passes) { VERBOSE(PassRunner) << "Start running '" << pass->id() << "'" << std::endl; pass->run(); diff --git a/runtime/onert/core/src/compiler/pass/PermutationInsertionPass.cc b/runtime/onert/core/src/compiler/pass/PermutationInsertionPass.cc index 71efa1bb5..0da1e54df 100644 --- a/runtime/onert/core/src/compiler/pass/PermutationInsertionPass.cc +++ b/runtime/onert/core/src/compiler/pass/PermutationInsertionPass.cc @@ -105,9 +105,9 @@ void PermutationInsertionPass::callback(const ir::OperandIndex &index, ir::Opera } } - for (auto &operation : remove_list) + for (const auto &operation_index : remove_list) { - object.removeUse(operation); + object.removeUse(operation_index); } } } diff --git a/runtime/onert/core/src/exec/Execution.cc b/runtime/onert/core/src/exec/Execution.cc index 9d1e06d6c..7d5b406ef 100644 --- a/runtime/onert/core/src/exec/Execution.cc +++ b/runtime/onert/core/src/exec/Execution.cc @@ -23,13 +23,12 @@ namespace onert namespace exec { -Execution::Execution(const std::shared_ptr &executors) : _executors{executors} +Execution::Execution(const std::shared_ptr &executors) : _executors{executors} { assert(executors != nullptr); - assert(executors->at(ir::SubgraphIndex{0}) != nullptr); + assert(executors->entryExecutor() != nullptr); _io_desc.inputs.resize(_executors->inputSize()); _io_desc.outputs.resize(_executors->outputSize()); - sem_init(&_async_io_descs_sem, 0, 1); } void Execution::changeInputShape(const ir::IOIndex &index, const ir::Shape &new_shape) @@ -70,80 +69,6 @@ void Execution::setInput(const ir::IOIndex &index, const void *buffer, size_t le _io_desc.inputs.at(index.value()) = std::make_unique(info, buffer, length, layout); } -void Execution::createNewAsyncDesc(uint32_t count) -{ - IODescription *_async_io_desc = new IODescription; - _async_io_desc->inputs.resize(primary_subgraph().getInputs().size()); - _async_io_desc->outputs.resize(primary_subgraph().getOutputs().size()); - - _async_io_descs.push_back({_async_io_desc, count}); -} - -void Execution::setFinish() { finished = true; } - -bool Execution::isEmptyQueue() -{ - asyncIoDescSemWait(); - bool ret = _async_io_descs.empty(); - if (!ret) - { - for (uint32_t idx = 0; idx < _async_io_descs.front().first->inputs.size(); idx++) - { - if (_async_io_descs.front().first->inputs.at(idx).get() == nullptr) - { - ret = true; - break; - } - } - } - asyncIoDescSemPost(); - return ret; -} - -void Execution::executeAsyncInput(const ir::IOIndex &index, const void *buffer, size_t length, - ir::Layout layout) -{ - const auto info = _executors->inputInfo(index); - IODescription *_async_io_desc = _async_io_descs.back().first; - - { - auto input_shape_sig = _async_io_desc->dynamic_input_shapes.find(index); - auto size_required = - (input_shape_sig != _async_io_desc->dynamic_input_shapes.end()) - ? 
input_shape_sig->second.num_elements() * onert::ir::sizeOfDataType(info.typeInfo().type()) - : info.total_size(); - - if (length < size_required) - { - throw std::runtime_error{"Too small length"}; - } - } - void *_buffer = (void *)malloc(length); - if (_buffer == NULL) - { - throw std::runtime_error{"malloc failed"}; - } - memcpy(_buffer, buffer, length); - - _async_io_desc->inputs.at(index.value()) = - std::make_unique(info, _buffer, length, layout); -} - -void Execution::executeAsyncOutput(const ir::IOIndex &index, void *buffer, size_t length, - ir::Layout layout) -{ - const auto info = _executors->outputInfo(index); - IODescription *_async_io_desc = _async_io_descs.front().first; - - if (length < info.total_size()) - { - throw std::runtime_error{"Too small length"}; - } - - _async_io_desc->outputs.at(index.value()) = - std::make_unique(info, buffer, length, layout); -} - // TODO Remove default parameter void Execution::setInput(const ir::IOIndex &index, const ir::TypeInfo &type, const ir::Shape &shape, const void *buffer, size_t length, ir::Layout layout) @@ -209,18 +134,6 @@ void Execution::execute() VERBOSE(Execution) << "Execution finished" << std::endl; } -void Execution::AsyncExecute() -{ - VERBOSE(Execution) << "Start Async execution" << std::endl; - if (_async_io_descs.empty()) - { - VERBOSE(Execution) << "The input is not ready" << std::endl; - return; - } - - primary_executor()->execute(*_async_io_descs.front().first); -} - void Execution::startExecute() { VERBOSE(Execution) << "Create asynchronous execution thread" << std::endl; @@ -251,163 +164,21 @@ ir::Shape Execution::getInputShape(ir::IOIndex ind) const } } +// NNAPI return fail if ANeuralNetworksExecution_getOutputOperandRank or +// ANeuralNetworksExecution_getOutputOperandDimensions is called before execution. +// On the other hand, NNFW API return static shape inference result if nnfw_output_tensorinfo is +// called before execution. +// To handle both case, this method retun static shape inference result and fail will be handled on +// NNAPI frontend. 
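// Illustrative usage sketch of the behaviour described above (not part of the upstream patch; the
// definition that follows is authoritative):
//
//   onert::exec::Execution execution{executors};
//   auto s0 = execution.getOutputShape(onert::ir::IOIndex{0}); // before execute(): static shape
//                                                              // inference result via outputInfo()
//   execution.execute();
//   auto s1 = execution.getOutputShape(onert::ir::IOIndex{0}); // after execute(): shape recorded
//                                                              // in the output IODescription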
ir::Shape Execution::getOutputShape(ir::IOIndex ind) const { if (!isFinished()) - throw std::runtime_error("Cannot get output shape before execution is finished"); + return _executors->outputInfo(ind).shape(); const auto &output_desc = _io_desc.outputs.at(ind.value()); return output_desc->info.shape(); } -void Execution::asyncIoDescSemWait() { sem_wait(&_async_io_descs_sem); } - -void Execution::asyncIoDescSemPost() { sem_post(&_async_io_descs_sem); } - -void Execution::runInference() -{ - uint32_t inference_cnt; - uint32_t output_sz = primary_subgraph().getOutputs().size(); - while (true) - { - if (isEmptyQueue()) - { - if (isFinished()) - { - if (!next_exes.empty()) - { - for (uint32_t i = 0; i < next_exes.size(); i++) - { - std::get<0>(next_exes[i])->setFinish(); - } - } - else - { - sholudStop(); - } - break; - } - } - else - { - for (uint32_t i = 0; i < output_sz; i++) - { - auto opidx = primary_subgraph().getOutputs().at(i); - auto shape = primary_subgraph().operands().at(opidx).shape(); - auto dtype = primary_subgraph().operands().at(opidx).typeInfo().type(); - auto rank = shape.rank(); - uint32_t tensor_size = 1; - for (int32_t j = 0; j < rank; j++) - { - tensor_size *= shape.dim(j); - } - if (dtype == onert::ir::DataType::FLOAT32 || dtype == onert::ir::DataType::INT32 || - dtype == onert::ir::DataType::UINT32) - tensor_size *= 4; - else if (dtype == onert::ir::DataType::INT64) - tensor_size *= 8; - void *_buffer = (void *)malloc(tensor_size); - if (_buffer == NULL) - { - throw std::runtime_error{"malloc failed"}; - } - executeAsyncOutput(onert::ir::IOIndex(i), _buffer, tensor_size); - } - AsyncExecute(); - - // set inputs of next execution - auto _io_desc = getAsyncIoDescs()->front().first; - inference_cnt = getAsyncIoDescs()->front().second; - getAsyncIoDescs()->pop_front(); - - for (uint32_t i = 0; i < next_exes.size(); i++) - { - auto next_exe = std::get<0>(next_exes[i]); - auto o_index = std::get<1>(next_exes[i]); - auto i_index = std::get<2>(next_exes[i]); - - next_exe->asyncIoDescSemWait(); - auto next_io_descs = next_exe->getAsyncIoDescs(); - bool exist = false; - for (auto iter = next_io_descs->begin(); iter != next_io_descs->end(); iter++) - { - if (inference_cnt == iter->second) - { - exist = true; - } - } - - if (!exist) - { - next_exe->createNewAsyncDesc(inference_cnt); - } - for (auto iter = next_io_descs->begin(); iter != next_io_descs->end(); iter++) - { - if (inference_cnt == iter->second) - { - const auto input_index = next_exe->primary_subgraph().getInputs().at(i_index.value()); - const auto info = next_exe->primary_subgraph().operands().at(input_index).info(); - - size_t length = _io_desc->outputs[o_index.value()]->size; - void *_buffer = (void *)malloc(length); - if (_buffer == NULL) - { - throw std::runtime_error{"malloc failed"}; - } - memcpy(_buffer, _io_desc->outputs[o_index.value()]->buffer, length); - - iter->first->inputs.at(i_index.value()) = std::make_unique( - info, _buffer, length, onert::ir::Layout::NHWC); - break; - } - } - next_exe->asyncIoDescSemPost(); - } - - if (next_exes.empty()) - { - std::vector results; - for (uint32_t i = 0; i < _io_desc->outputs.size(); i++) - { - size_t length = _io_desc->outputs[i]->size; - void *_buffer = (void *)malloc(length); - if (_buffer == NULL) - { - throw std::runtime_error{"malloc failed"}; - } - memcpy(_buffer, _io_desc->outputs[i]->buffer, length); - results.push_back(_buffer); - } - _async_results.push_back(results); - } - - for (uint32_t i = 0; i < _io_desc->inputs.size(); i++) - { - auto p = 
_io_desc->inputs.at(i).release(); - if (p) - { - free((void *)p->buffer); - delete p; - } - } - for (uint32_t i = 0; i < _io_desc->outputs.size(); i++) - { - auto p = _io_desc->outputs.at(i).release(); - if (p) - { - free(p->buffer); - delete p; - } - } - delete _io_desc; - } - } -} - -bool Execution::stopWait(void) const { return stop_wait; } - -void Execution::sholudStop() { stop_wait = true; } - } // namespace exec } // namespace onert diff --git a/runtime/onert/core/src/exec/Execution.test.cc b/runtime/onert/core/src/exec/Execution.test.cc index e3ea49470..fefe8a332 100644 --- a/runtime/onert/core/src/exec/Execution.test.cc +++ b/runtime/onert/core/src/exec/Execution.test.cc @@ -17,6 +17,7 @@ #include "exec/Execution.h" #include "compiler/Compiler.h" +#include "compiler/CompilerFactory.h" #include "ir/Graph.h" #include "ir/operation/BinaryArithmetic.h" #include "util/TracingCtx.h" @@ -90,6 +91,161 @@ public: std::shared_ptr artifact; }; +class CompiledMockUpMultiModel +{ +public: + CompiledMockUpMultiModel() + { + // Model0: a float elementwise add operation + // Model0 input: lhs0, rhs0 + // Model0 output: add result (result0) + + // Model1: a qasymm8 elementwise add operation + // Model1 input: result0, rhs1 + // Model1 output: add result (result1) + + // Model2: a float elementwise add operation + // Model2 input: result0, result1 + // Model2 output: add result (result2) + + // constant: rhs2 + // result0 <= (lhs0 + rhs0) + // result1 <= (result0 + rhs1) + // result2 <= (result0 + result1) + // lhs0, rhs0, rh1, result0, result1, result2 shape: {1, 2, 2, 1} + // activation: none (constant) + + // Update edge information + edges.pkg_inputs.emplace_back(ModelIndex{0}, SubgraphIndex{0}, IOIndex{0}); + edges.pkg_inputs.emplace_back(ModelIndex{0}, SubgraphIndex{0}, IOIndex{1}); + edges.pkg_outputs.emplace_back(ModelIndex{2}, SubgraphIndex{0}, IOIndex{0}); + // From + const auto result0 = IODesc{ModelIndex{0}, SubgraphIndex{0}, IOIndex{0}}; + const auto result1 = IODesc{ModelIndex{1}, SubgraphIndex{0}, IOIndex{0}}; + // To + const auto lhs1 = IODesc{ModelIndex{1}, SubgraphIndex{0}, IOIndex{0}}; + const auto lhs2 = IODesc{ModelIndex{2}, SubgraphIndex{0}, IOIndex{0}}; + const auto rhs2 = IODesc{ModelIndex{2}, SubgraphIndex{0}, IOIndex{1}}; + edges.edges.insert({result0, lhs1}); + edges.edges.insert({result0, lhs2}); + edges.edges.insert({result1, rhs2}); + + for (size_t i = 0; i < 3; ++i) + { + graphs.emplace_back(std::make_shared()); + } + Shape shape{1, 2, 2, 1}; + + // Model0's add operands (result1 <= lhs0 + rhs0) + DataType types[3] = {DataType::FLOAT32, DataType::QUANT_UINT8_ASYMM, DataType::FLOAT32}; + auto operand_lhs0 = graphs[0]->addOperand(shape, TypeInfo{types[0]}); + auto operand_rhs0 = graphs[0]->addOperand(shape, TypeInfo{types[0]}); + auto operand_result0 = graphs[0]->addOperand(shape, TypeInfo{types[0]}); + + // Model0's add operation + operation::BinaryArithmetic::Param param0; + param0.arithmetic_type = operation::BinaryArithmetic::ArithmeticType::ADD; + param0.activation = Activation::NONE; + auto input_set0 = OperandIndexSequence{operand_lhs0, operand_rhs0}; + auto output_set0 = OperandIndexSequence{operand_result0}; + graphs[0]->addOperation( + std::make_unique(input_set0, output_set0, param0)); + + // Model0's inputs/outputs + graphs[0]->addInput(operand_lhs0); + graphs[0]->addInput(operand_rhs0); + graphs[0]->addOutput(operand_result0); + graphs[0]->verify(); + + // Model1's add operands (result2 <= Model0 result + rhs1) + // static float rhs1_data[4] = {3, 1, -1, 5}; 
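// Note (added for clarity, not part of the upstream patch): the uint8 values below encode the
// float constants in the comment above using the asymmetric quantization mapping
// real = scale * (quantized - zero_point), with scale = 1 and zero_point = 128 declared just
// below: 131 -> 3, 129 -> 1, 127 -> -1, 133 -> 5.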
+ static uint8_t rhs1_data[4] = {131, 129, 127, 133}; + const float scale = 1; + const int32_t zero_point = 128; + auto operand_lhs1 = graphs[1]->addOperand(shape, TypeInfo{types[1], scale, zero_point}); + auto operand_rhs1 = graphs[1]->addOperand(shape, TypeInfo{types[1], scale, zero_point}); + auto operand_result1 = graphs[1]->addOperand(shape, TypeInfo{types[1], scale, zero_point}); + graphs[1] + ->operands() + .at(operand_rhs1) + .data(std::make_unique(reinterpret_cast(&rhs1_data), 4)); + + // Model1's add operation + operation::BinaryArithmetic::Param param1; + param1.arithmetic_type = operation::BinaryArithmetic::ArithmeticType::ADD; + param1.activation = Activation::NONE; + auto input_set1 = OperandIndexSequence{operand_lhs1, operand_rhs1}; + auto output_set1 = OperandIndexSequence{operand_result1}; + graphs[1]->addOperation( + std::make_unique(input_set1, output_set1, param1)); + + // Model1's inputs/outputs + graphs[1]->addInput(operand_lhs1); + graphs[1]->addOutput(operand_result1); + graphs[1]->verify(); + + // Model2's additional operands (result3 <= Model0 result + Model1 result) + auto operand_lhs2 = graphs[2]->addOperand(shape, TypeInfo{types[2]}); + auto operand_rhs2 = graphs[2]->addOperand(shape, TypeInfo{types[2]}); + auto operand_result2 = graphs[2]->addOperand(shape, TypeInfo{types[2]}); + + // Model2's add operation + operation::BinaryArithmetic::Param param2; + param2.arithmetic_type = operation::BinaryArithmetic::ArithmeticType::ADD; + param2.activation = Activation::NONE; + auto input_set2 = OperandIndexSequence{operand_lhs2, operand_rhs2}; + auto output_set2 = OperandIndexSequence{operand_result2}; + graphs[2]->addOperation( + std::make_unique(input_set2, output_set2, param2)); + + // Model1's inputs/outputs + graphs[2]->addInput(operand_lhs2); + graphs[2]->addInput(operand_rhs2); + graphs[2]->addOutput(operand_result2); + graphs[2]->verify(); + + // Compile + compile(); + } + +public: + void compile() + { + auto nnpkg = std::make_shared(); + coptions.clear(); + for (uint16_t i = 0; i < graphs.size(); ++i) + { + coptions.emplace_back(onert::compiler::CompilerOptions::fromGlobalConfig()); + + auto model = std::make_shared(); + model->push(SubgraphIndex{0}, graphs[i]); + + nnpkg->push(onert::ir::ModelIndex{i}, std::move(model)); + } + for (const auto &pkg_input : edges.pkg_inputs) + { + nnpkg->addInput(pkg_input); + } + for (const auto &pkg_output : edges.pkg_outputs) + { + nnpkg->addOutput(pkg_output); + } + for (const auto &edge : edges.edges) + { + nnpkg->addEdge(edge.from, edge.to); + } + auto compiler = onert::compiler::CompilerFactory::get().create(nnpkg, coptions); + nnpkg.reset(); + artifact = compiler->compile(); + } + +public: + std::vector> graphs; + std::vector> coptions; + std::shared_ptr artifact; + ModelEdges edges; +}; + TEST(ExecInstance, simple) { auto mockup = CompiledMockUpModel(); @@ -209,7 +365,7 @@ class Inference { public: Inference(const float (&input1)[4], const float (&input2)[4], float (&output)[4], - std::shared_ptr &executors) + std::shared_ptr &executors) : _input1{input1}, _input2{input2}, _output{output}, _executors{executors} { // DO NOTHING @@ -233,7 +389,7 @@ private: const float (&_input1)[4]; const float (&_input2)[4]; float (&_output)[4]; - std::shared_ptr &_executors; + std::shared_ptr &_executors; }; // Support multi-thread execution @@ -299,4 +455,181 @@ TEST(ExecInstance, async) } } +TEST(ExecInstance, multi_model_simple) +{ + auto mockup = CompiledMockUpMultiModel(); + auto executors = mockup.artifact->_executors; + + auto 
input1 = IOIndex{0}; + auto input2 = IOIndex{1}; + auto output = IOIndex{0}; + + const float input1_buffer[4] = {1, 0, -1, -2}; + const float input2_buffer[4] = {1, -3, 2, -4}; + float output_buffer[4] = {}; + const float output_expected[4] = {7, -5, 1, -7}; + + onert::exec::Execution execution{executors}; + + execution.setInput(input1, reinterpret_cast(input1_buffer), 16); + execution.setInput(input2, reinterpret_cast(input2_buffer), 16); + execution.setOutput(output, reinterpret_cast(output_buffer), 16); + execution.execute(); + + for (auto i = 0; i < 4; i++) + { + EXPECT_EQ(output_buffer[i], output_expected[i]); + } +} + +TEST(ExecInstance, multi_model_twoCompile) +{ + auto mockup = CompiledMockUpMultiModel(); + auto executors1 = mockup.artifact->_executors; + onert::exec::Execution execution1{executors1}; + + auto input1 = IOIndex{0}; + auto input2 = IOIndex{1}; + auto output = IOIndex{0}; + + const float exe1_input1_buffer[4] = {1, 0, -1, -2}; + const float exe1_input2_buffer[4] = {1, -3, 2, -4}; + float exe1_output_buffer[4] = {}; + const float exe1_output_expected[4] = {7, -5, 1, -7}; + + execution1.setInput(input1, reinterpret_cast(exe1_input1_buffer), 16); + execution1.setInput(input2, reinterpret_cast(exe1_input2_buffer), 16); + execution1.setOutput(output, reinterpret_cast(exe1_output_buffer), 16); + + // Make new executor: compile again + mockup.compile(); + onert::exec::Execution execution2{mockup.artifact->_executors}; + + const float exe2_input1_buffer[4] = {2, 1, -2, 0}; + const float exe2_input2_buffer[4] = {-3, 3, 1, 2}; + float exe2_output_buffer[4] = {}; + const float exe2_output_expected[4] = {1, 9, -3, 9}; + + execution2.setInput(input1, reinterpret_cast(exe2_input1_buffer), 16); + execution2.setInput(input2, reinterpret_cast(exe2_input2_buffer), 16); + execution2.setOutput(output, reinterpret_cast(exe2_output_buffer), 16); + + execution1.execute(); + execution2.execute(); + + for (auto i = 0; i < 4; i++) + { + EXPECT_EQ(exe1_output_buffer[i], exe1_output_expected[i]); + EXPECT_EQ(exe2_output_buffer[i], exe2_output_expected[i]); + } +} + +// Support two initialized execution instance then ordered execution +TEST(ExecInstance, multi_model_twoExecution) +{ + auto mockup = CompiledMockUpMultiModel(); + auto executors = mockup.artifact->_executors; + auto input1 = IOIndex{0}; + auto input2 = IOIndex{1}; + auto output1 = IOIndex{0}; + + const float exe1_input1_buffer[4] = {1, 0, -1, -2}; + const float exe1_input2_buffer[4] = {1, -3, 2, -4}; + float exe1_output_buffer[4] = {}; + const float exe1_output_expected[4] = {7, -5, 1, -7}; + const float exe2_output_expected[4] = {1, 9, -3, 9}; + + onert::exec::Execution execution1{executors}; + execution1.setInput(input1, reinterpret_cast(exe1_input1_buffer), 16); + execution1.setInput(input2, reinterpret_cast(exe1_input2_buffer), 16); + execution1.setOutput(output1, reinterpret_cast(exe1_output_buffer), 16); + + const float exe2_input1_buffer[4] = {2, 1, -2, 0}; + const float exe2_input2_buffer[4] = {-3, 3, 1, 2}; + float exe2_output_buffer[4] = {}; + + // Make new execution + onert::exec::Execution execution2{executors}; + execution2.setInput(input1, reinterpret_cast(exe2_input1_buffer), 16); + execution2.setInput(input2, reinterpret_cast(exe2_input2_buffer), 16); + execution2.setOutput(output1, reinterpret_cast(exe2_output_buffer), 16); + + execution1.execute(); + execution1.execute(); + execution2.execute(); + execution2.execute(); + + for (auto i = 0; i < 4; i++) + { + EXPECT_EQ(exe1_output_buffer[i], exe1_output_expected[i]); 
+ EXPECT_EQ(exe2_output_buffer[i], exe2_output_expected[i]); + } +} + +// Multi-model is not thread-safe yet + +// Support asynchronous execution +TEST(ExecInstance, multi_model_async) +{ + auto mockup = CompiledMockUpMultiModel(); + auto executors = mockup.artifact->_executors; + + auto input1 = IOIndex{0}; + auto input2 = IOIndex{1}; + auto output = IOIndex{0}; + + const float input1_buffer[4] = {1, 0, -1, -2}; + const float input2_buffer[4] = {1, -3, 2, -4}; + float output_buffer[4] = {}; + const float output_expected[4] = {7, -5, 1, -7}; + + onert::exec::Execution execution{executors}; + + execution.setInput(input1, reinterpret_cast(input1_buffer), 16); + execution.setInput(input2, reinterpret_cast(input2_buffer), 16); + execution.setOutput(output, reinterpret_cast(output_buffer), 16); + execution.startExecute(); + execution.waitFinish(); + + for (auto i = 0; i < 4; i++) + { + EXPECT_EQ(output_buffer[i], output_expected[i]); + } +} + +TEST(ExecInstance, multi_model_dequant_input_quant_output) +{ + auto mockup = CompiledMockUpMultiModel(); + auto executors = mockup.artifact->_executors; + + auto input1 = IOIndex{0}; + auto input2 = IOIndex{1}; + auto output = IOIndex{0}; + + const uint8_t input1_buffer[4] = {138, 128, 118, 108}; // {1, 0, -1, -2} + const uint8_t input2_buffer[4] = {138, 98, 148, 88}; // {1, -3, 2, -4} + uint8_t output_buffer[4] = {}; + const uint8_t output_expected[4] = {198, 78, 138, 58}; // {7, -5, 1, -7} + float scale = 0.1; + int32_t zero_point = 128; + + onert::exec::Execution execution{executors}; + + onert::ir::TypeInfo type_info{onert::ir::DataType::QUANT_UINT8_ASYMM, scale, zero_point}; + execution.setInput(input1, type_info, execution.getInputShape(input1), + reinterpret_cast(input1_buffer), 4, onert::ir::Layout::NHWC); + execution.setInput(input2, type_info, execution.getInputShape(input2), + reinterpret_cast(input2_buffer), 4, onert::ir::Layout::NHWC); + execution.setOutput(output, type_info, execution.getOutputShape(output), + reinterpret_cast(output_buffer), 4, onert::ir::Layout::NHWC); + execution.execute(); + + for (auto i = 0; i < 4; i++) + { + EXPECT_EQ(output_buffer[i], output_expected[i]); + } +} + +// TODO Add an unittest multi_model_quant_input_dequant_output + } // namespace diff --git a/runtime/onert/core/src/exec/ExecutionObservee.cc b/runtime/onert/core/src/exec/ExecutionObservee.cc index d6a2bfd17..66610f0e0 100644 --- a/runtime/onert/core/src/exec/ExecutionObservee.cc +++ b/runtime/onert/core/src/exec/ExecutionObservee.cc @@ -28,7 +28,7 @@ void ExecutionObservee::add(std::unique_ptr observer) void ExecutionObservee::notifySubgraphBegin(ir::SubgraphIndex ind) { - for (auto &o : _observers) + for (auto &&o : _observers) { o->handleSubgraphBegin(ind); } @@ -36,7 +36,7 @@ void ExecutionObservee::notifySubgraphBegin(ir::SubgraphIndex ind) void ExecutionObservee::notifySubgraphEnd(ir::SubgraphIndex ind) { - for (auto &o : _observers) + for (auto &&o : _observers) { o->handleSubgraphEnd(ind); } @@ -45,7 +45,7 @@ void ExecutionObservee::notifySubgraphEnd(ir::SubgraphIndex ind) void ExecutionObservee::notifyJobBegin(IExecutor *executor, ir::SubgraphIndex subg_ind, ir::OperationIndex op_ind, const backend::Backend *backend) { - for (auto &o : _observers) + for (auto &&o : _observers) { o->handleJobBegin(executor, subg_ind, op_ind, backend); } @@ -54,7 +54,7 @@ void ExecutionObservee::notifyJobBegin(IExecutor *executor, ir::SubgraphIndex su void ExecutionObservee::notifyJobEnd(IExecutor *executor, ir::SubgraphIndex subg_ind, ir::OperationIndex op_ind, 
const backend::Backend *backend) { - for (auto &o : _observers) + for (auto &&o : _observers) { o->handleJobEnd(executor, subg_ind, op_ind, backend); } diff --git a/runtime/onert/core/src/exec/ExecutionObservers.h b/runtime/onert/core/src/exec/ExecutionObservers.h index 1aadac2f5..91fbac323 100644 --- a/runtime/onert/core/src/exec/ExecutionObservers.h +++ b/runtime/onert/core/src/exec/ExecutionObservers.h @@ -22,7 +22,7 @@ #include "../util/EventRecorder.h" #include "../util/EventWriter.h" -#include "exec/Executors.h" +#include "exec/IExecutor.h" #include "ir/Index.h" #include "ir/Operation.h" #include "util/ITimer.h" diff --git a/runtime/onert/core/src/exec/ExecutorBase.cc b/runtime/onert/core/src/exec/ExecutorBase.cc index d2d204a0b..515cf8e48 100644 --- a/runtime/onert/core/src/exec/ExecutorBase.cc +++ b/runtime/onert/core/src/exec/ExecutorBase.cc @@ -29,8 +29,8 @@ ExecutorBase::ExecutorBase(std::unique_ptr &&lowered_gra backend::BackendContexts &&backend_contexts, const compiler::TensorRegistries &tensor_regs, const util::TracingCtx *tracing_ctx) - : _lowered_graph{std::move(lowered_graph)}, _backend_contexts{std::move(backend_contexts)}, - _graph{_lowered_graph->graph()}, _parent_graph{_lowered_graph->parent_graph()}, _mutex(), + : _lowered_graph{std::move(lowered_graph)}, + _backend_contexts{std::move(backend_contexts)}, _graph{_lowered_graph->graph()}, _mutex(), _tracing_ctx(tracing_ctx) { auto build_tensor_list = [&](const auto &ind_seq, auto &tensors) { @@ -120,9 +120,27 @@ void ExecutorBase::execute(const IODescription &desc) { tensor->set_dynamic(); tensor->setShape(input_shape->second); + /* + * Changes tensor shape and allocate memory since its shape was changed + * perhaps by nnfw_set_input_tensorinfo() + * + * Cases are: + * 1) static operand -> nnfw_set_input_tensorinfo() -> execute() -> execute() + * (a) (b) + * + * at (a), operand is static, tensor is static - memory dealloc is not needed + * (DynamicTensorManager cannot dealloc memory allocated by StaticTensorManager) + * at (b), operand is static, tensor is dynamic - memory dealloc is needed + * + * 2) dynamic operand -> nnfw_set_input_tensorinfo() -> execute() -> execute() + * (a) (b) + * + * at (a), operand is dynamic, tensor is dynamic - memory dealloc is not needed + * since it has not been allocated yet + * at (b), operand is dynamic, tensor is dynamic - memory dealloc is needed + */ + tensor->applyShape(input_shape->second); } - - handleDynamicInputTensor(ir::IOIndex{i}, desc); } assert(_output_tensors.size() == desc.outputs.size()); @@ -156,38 +174,9 @@ void ExecutorBase::execute(const IODescription &desc) } } -/** - * @brief Changes tensor shape and allocate memory - * if input shape was changed by nnfw_set_input_tensorinfo() - * - * @note Cases are: - * 1) static operand -> nnfw_set_input_tensorinfo() -> execute() -> execute() - * (a) (b) - * - * at (a), operand is static, tensor is static - memory dealloc is not needed - * (DynamicTensorManager cannot dealloc memory allocated by StaticTensorManager) - * at (b), operand is static, tensor is dynamic - memory dealloc is needed - * - * 2) dynamic operand -> nnfw_set_input_tensorinfo() -> execute() -> execute() - * (a) (b) - * - * at (a), operand is dynamic, tensor is dynamic - memory dealloc is not needed - * since it has not been allocated yet - * at (b), operand is dynamic, tensor is dynamic - memory dealloc is needed - */ -void ExecutorBase::handleDynamicInputTensor(ir::IOIndex io_ind, const IODescription &desc) -{ - auto shape_sig_found = 
desc.dynamic_input_shapes.find(io_ind); - if (shape_sig_found != desc.dynamic_input_shapes.end()) - { - auto changed_input_shape = shape_sig_found->second; - _input_tensors[io_ind.value()]->applyShape(changed_input_shape); - } -} - bool ExecutorBase::hasDynamicInput() { - for (auto &tensor : _input_tensors) + for (auto &&tensor : _input_tensors) { if (tensor->is_dynamic()) return true; diff --git a/runtime/onert/core/src/exec/ExecutorBase.h b/runtime/onert/core/src/exec/ExecutorBase.h index e4f914546..7aee3d9ee 100644 --- a/runtime/onert/core/src/exec/ExecutorBase.h +++ b/runtime/onert/core/src/exec/ExecutorBase.h @@ -51,9 +51,7 @@ public: virtual ~ExecutorBase() = default; - const ir::Graph &graph() final { return _graph; } - - const ir::Graph &parent_graph() final { return _parent_graph; } + const ir::Graph &graph() const final { return _graph; } void execute(const IODescription &desc) final; @@ -70,6 +68,11 @@ public: void addObserver(std::unique_ptr ref) { _subject.add(std::move(ref)); }; + const std::vector &getInputTensors() const override + { + return _input_tensors; + } + const std::vector &getOutputTensors() const override { return _output_tensors; @@ -87,14 +90,10 @@ protected: std::unique_ptr _lowered_graph; backend::BackendContexts _backend_contexts; const ir::Graph &_graph; - const ir::Graph &_parent_graph; std::vector _input_tensors; std::vector _output_tensors; std::mutex _mutex; const util::TracingCtx *_tracing_ctx; - -private: - void handleDynamicInputTensor(ir::IOIndex input_index, const IODescription &desc); }; } // namespace exec diff --git a/runtime/onert/core/src/exec/Executors.cc b/runtime/onert/core/src/exec/Executors.cc index e0ee24fea..3f4b3cc7f 100644 --- a/runtime/onert/core/src/exec/Executors.cc +++ b/runtime/onert/core/src/exec/Executors.cc @@ -14,170 +14,628 @@ * limitations under the License. */ -#include "exec/Executors.h" +#include "Executors.h" -namespace onert -{ -namespace exec +#include "../backend/builtin/IOTensor.h" + +namespace { -uint32_t Executors::inputSize() const +using namespace onert; + +int32_t find_input_index(const std::vector &pkg_inputs, + const ir::ModelIndex &model_index, const ir::SubgraphIndex &subg_index, + const ir::IOIndex &io_index) { - return _model_edges ? _model_edges->pkg_inputs.size() - : _executors.at(ir::SubgraphIndex{0})->graph().getInputs().size(); + for (size_t i = 0; i < pkg_inputs.size(); i++) + { + auto &input_desc = pkg_inputs[i]; + if ((std::get(input_desc) == model_index) && + (std::get(input_desc) == subg_index) && + (std::get(input_desc) == io_index)) + return static_cast(i); + } + return -1; } -uint32_t Executors::outputSize() const +int32_t find_output_index(const std::vector &pkg_outputs, + const ir::ModelIndex &model_index, const ir::SubgraphIndex &subg_index, + const ir::IOIndex &io_index) { - return _model_edges ? 
_model_edges->pkg_outputs.size() - : _executors.at(ir::SubgraphIndex{0})->graph().getOutputs().size(); + for (size_t i = 0; i < pkg_outputs.size(); i++) + { + auto &input_desc = pkg_outputs[i]; + if ((std::get(input_desc) == model_index) && + (std::get(input_desc) == subg_index) && + (std::get(input_desc) == io_index)) + return static_cast(i); + } + return -1; } -const ir::OperandInfo Executors::inputInfo(const ir::IOIndex &index) +} // namespace + +namespace onert +{ +namespace exec +{ + +class Executors::EdgeTensor : public backend::builtin::IOTensor { - if (_model_edges) +public: + EdgeTensor(const ir::OperandInfo &info, ir::Layout layout) + : backend::builtin::IOTensor(info, layout), _buffer{nullptr}, _ref_count{0} { - // Assume that each model may have only one subgraph - // TODO handle general case - const auto desc = _model_edges->pkg_inputs[index.value()]; - const auto model_idx = std::get<0>(desc); - const auto executor_idx = ir::SubgraphIndex{model_idx.value()}; - const auto input_index = _executors.at(executor_idx)->graph().getInputs().at(std::get<2>(desc)); - return _executors.at(executor_idx)->graph().operands().at(input_index).info(); } + ~EdgeTensor() = default; - const auto input_index = _executors.at(ir::SubgraphIndex{0})->graph().getInputs().at(index); - return _executors.at(ir::SubgraphIndex{0})->graph().operands().at(input_index).info(); -} + void allocate_buffer() + { + const auto total_size = orig_info().total_size(); + _buffer = std::make_unique(total_size); + _ref_count = 1; -const ir::OperandInfo Executors::outputInfo(const ir::IOIndex &index) -{ - if (_model_edges) + // NOTE Executor's inputs/outputs are always IPortableTensor. If backend of inputs/outputs + // is using tensor that does not inherit IPortableTensor, Permute operation is added + // and all inputs/outputs become IPortableTensor at compile stage. + // This allows user's buffers to be set to inputs/outputs of executors. 
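+ // [Editor's note] Illustrative sketch only, not part of this patch: the lifetime
+ // contract that EdgeTensor implements here can be reduced to a small stand-alone
+ // analogue (names such as RefCountedBuffer are hypothetical; the real class also
+ // publishes the buffer through setUserTensor() so consuming executors can read it):
+ //
+ //   #include <cassert>
+ //   #include <cstddef>
+ //   #include <cstdint>
+ //   #include <memory>
+ //
+ //   struct RefCountedBuffer
+ //   {
+ //     std::unique_ptr<uint8_t[]> buf;
+ //     int32_t refs = 0;
+ //
+ //     void allocate(size_t size)
+ //     {
+ //       buf = std::make_unique<uint8_t[]>(size); // producer allocates, refs == 1
+ //       refs = 1;
+ //     }
+ //     void increase_ref() { ++refs; } // one extra consumer of the edge
+ //     void decrease_ref()
+ //     {
+ //       assert(refs > 0);
+ //       if (--refs == 0)
+ //         buf.reset(); // last consumer done -> release the edge buffer
+ //     }
+ //   };
+ //
+ // The producing executor allocates the buffer (reference count 1), every additional
+ // consumer of the edge calls increase_ref(), and each consumer calls decrease_ref()
+ // once it has read the data, so the memory is released exactly after the last use,
+ // which is what allocate_buffer()/increase_ref()/decrease_ref() below implement.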
+ setUserTensor(_buffer.get(), total_size); + } + + void increase_ref() { _ref_count++; } + + void decrease_ref() { - // Assume that each model may have only one subgraph - // TODO handle general case - auto desc = _model_edges->pkg_outputs[index.value()]; - auto model_idx = std::get<0>(desc); - auto executor_idx = ir::SubgraphIndex{model_idx.value()}; - auto output_index = _executors.at(executor_idx)->graph().getOutputs().at(std::get<2>(desc)); - return _executors.at(executor_idx)->graph().operands().at(output_index).info(); + assert(_ref_count > 0); + _ref_count--; + if (_ref_count == 0) + { + _buffer.reset(); + setUserTensor(nullptr, orig_info().total_size()); + } } - auto output_index = _executors.at(ir::SubgraphIndex{0})->graph().getOutputs().at(index); - return _executors.at(ir::SubgraphIndex{0})->graph().operands().at(output_index).info(); +private: + std::unique_ptr _buffer; + int32_t _ref_count; +}; + +void Executors::emplace(const ir::ModelIndex &model_index, const ir::SubgraphIndex &subg_index, + std::unique_ptr exec) +{ + _executors.emplace(std::make_pair(model_index, subg_index), std::move(exec)); } -void Executors::execute(const IODescription &desc) +IExecutor *Executors::at(const ir::ModelIndex &model_index, + const ir::SubgraphIndex &subg_index) const +{ + return _executors.at(std::make_pair(model_index, subg_index)).get(); +} + +uint32_t Executors::inputSize() const { return _model_edges->pkg_inputs.size(); } + +uint32_t Executors::outputSize() const { return _model_edges->pkg_outputs.size(); } + +const ir::OperandInfo &Executors::inputInfo(const ir::IOIndex &index) const { - if (_model_edges) - return executeEntries(desc); + auto const desc = _model_edges->pkg_inputs[index.value()]; + auto const model_index = std::get<0>(desc); + auto const subg_index = std::get<1>(desc); + auto const io_index = std::get<2>(desc); + auto const executor = at(model_index, subg_index); + return executor->getInputTensors().at(io_index.value())->orig_info(); +} - _executors.at(ir::SubgraphIndex{0})->execute(desc); +const ir::OperandInfo &Executors::outputInfo(const ir::IOIndex &index) const +{ + auto const desc = _model_edges->pkg_outputs[index.value()]; + auto const model_index = std::get<0>(desc); + auto const subg_index = std::get<1>(desc); + auto const io_index = std::get<2>(desc); + auto const executor = at(model_index, subg_index); + return executor->getOutputTensors().at(io_index.value())->orig_info(); } -void Executors::executeEntries(const IODescription &desc) +// Allow below edges only +// m1 < m2, s1 == 0 and s2 == 0 if m1:s1:o1 -> m2:s2:o2' +void Executors::checkSupportedMultimodel() const { - // Assume 2 executors only - // Assume that each model may have only one subgraph - // TODO Support general case - if (_executors.size() != 2) - throw std::runtime_error{"NYI: Multi model execution for this package is not supported yet"}; + // If package includes no-connection model, model_count is less than real model count in package. 
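+ // [Editor's note] Illustrative sketch only, not part of this patch: the edge rule
+ // enforced by this method can be summarized as a predicate. For an edge
+ // m1:s1:o1 -> m2:s2:o2 only "forward" edges between primary subgraphs are accepted:
+ //
+ //   #include <cstdint>
+ //
+ //   // hypothetical helper mirroring the checks in the loop below
+ //   bool isSupportedEdge(uint16_t m_from, uint16_t s_from, uint16_t m_to, uint16_t s_to)
+ //   {
+ //     if (m_from == m_to)
+ //       return false; // edge inside a single model: reported as an invalid edge set
+ //     // backward edges or edges touching a non-primary subgraph: reported as NYI
+ //     return (m_from < m_to) && (s_from == 0) && (s_to == 0);
+ //   }
+ //
+ // e.g. 0:0:1 -> 1:0:0 passes, while 1:0:0 -> 0:0:0 (backward) or any edge whose
+ // subgraph index is not 0 is rejected by the checks that follow.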
+ // Then this method will throw exception based on model index + // 1st model: input assumption + // Otherwise: edges assumption - // Assume all edges are 0:0:x -> 1:0:x + // Assumption: edges + // m1 < m2, s1 == 0 and s2 == 0 if edge 'm1:s1:o1 -> m2:s2:o2' for (auto edge : _model_edges->edges) { - if ((std::get(edge.from) != ir::ModelIndex{0}) || - (std::get(edge.to) != ir::ModelIndex{1}) || - (std::get(edge.from) != ir::SubgraphIndex{0}) || - (std::get(edge.to) != ir::SubgraphIndex{0}) || - (std::get(edge.from) != std::get(edge.to))) - throw std::runtime_error{"NYI: Multi model execution for this edge is not supported yet"}; + auto const model_from = std::get(edge.from); + auto const model_to = std::get(edge.to); + auto const subg_from = std::get(edge.from); + auto const subg_to = std::get(edge.to); + + if (model_from.value() == model_to.value()) + { + throw std::runtime_error{"Multi model's edge set has invalid edge"}; + } + + if ((model_from.value() > model_to.value()) || (subg_from != ir::SubgraphIndex{0}) || + (subg_to != ir::SubgraphIndex{0})) + throw std::runtime_error{"NYI: Multi model execution for this edge set is not supported yet"}; } - // Assume all package inputs are 0:0:x - for (uint32_t i = 0; i < _model_edges->pkg_inputs.size(); i++) + // Assumption: package inputs + // All 1st model inputs come from package input if always m1 < m2 { - auto input = _model_edges->pkg_inputs[i]; - if ((std::get(input) != ir::ModelIndex{0}) || - (std::get(input) != ir::SubgraphIndex{0}) || - (std::get(input) != ir::IOIndex{i})) + auto first_executor = at(ir::ModelIndex{0}, ir::SubgraphIndex{0}); + auto search_first_model = [&](const ir::IOIndex &input_index) { + for (const auto &input : _model_edges->pkg_inputs) + { + if ((std::get(input) == ir::ModelIndex{0}) || + (std::get(input) == ir::SubgraphIndex{0}) || + (std::get(input) == input_index)) + return true; + } + + return false; + }; + + for (uint32_t i = 0; i < first_executor->getInputTensors().size(); i++) { - throw std::runtime_error{"NYI: Support package input to 1st model with same order"}; + if (!search_first_model(ir::IOIndex{i})) + throw std::runtime_error{"Cannot find 1st model's input buffer"}; } } - // Assume all package outputs are 1:0:x - for (uint32_t i = 0; i < _model_edges->pkg_outputs.size(); i++) + // Check whether nnpkg outputs and Edge `from` are duplicated + for (const auto &edge : _model_edges->edges) { - auto output = _model_edges->pkg_outputs[i]; - if ((std::get(output) != ir::ModelIndex{1}) || - (std::get(output) != ir::SubgraphIndex{0}) || - (std::get(output) != ir::IOIndex{i})) + if (std::find(_model_edges->pkg_outputs.begin(), _model_edges->pkg_outputs.end(), edge.from) != + _model_edges->pkg_outputs.end()) { - throw std::runtime_error{"NYI: Support package output from 2nd model with same order"}; + throw std::runtime_error{"Multi model execution does not support duplicating nnpkg outputs " + "with `from` of edges yet"}; } } +} + +void Executors::createEdgeQuantLayers() +{ + if (_is_created_edge_quant_layers) + { + return; + } - const auto &executor1 = _executors.at(ir::SubgraphIndex{0}); - const auto &graph1 = executor1->graph(); - const auto &executor2 = _executors.at(ir::SubgraphIndex{1}); - const auto &graph2 = executor2->graph(); + // Create EdgeTensor for edges between executors + for (const auto &pair : _edge_map) + { + const auto &from_iodesc = pair.first; + const auto &from_model_index = std::get(from_iodesc); + const auto &from_subg_index = std::get(from_iodesc); + const auto &from_io_index = 
std::get(from_iodesc); + + const auto from_executor = _executors.at({from_model_index, from_subg_index}).get(); + const auto from_tensor = from_executor->getOutputTensors().at(from_io_index.value()); + + const auto &from_info = from_tensor->orig_info(); + const auto from_layout = from_tensor->orig_layout(); + _edge_tensors[from_iodesc] = std::make_unique(from_info, from_layout); + } - if ((graph1.getInputs().size() != _model_edges->pkg_inputs.size()) || - (graph2.getOutputs().size() != _model_edges->pkg_outputs.size()) || - (graph1.getOutputs().size() != graph2.getInputs().size()) || - (graph1.getOutputs().size() != _model_edges->edges.size())) + // Append type-aware quantization layer for edges between executors + for (const auto &executor_pair : _executors) { - throw std::runtime_error{"NYI: Unsupported model edge pattern"}; + const auto &executor_index = executor_pair.first; + const auto &model_index = executor_index.first; + const auto &subg_index = executor_index.second; + + std::vector inputs; + std::vector outputs; + for (const auto &pair : _edge_map) + { + const auto &from_iodesc = pair.first; + if (std::get(from_iodesc) == model_index && + std::get(from_iodesc) == subg_index) + { + const auto from_tensor = _edge_tensors[from_iodesc].get(); + const auto &to_list = pair.second; + + for (const auto &to_iodesc : to_list) + { + const auto &to_model_index = std::get(to_iodesc); + const auto &to_subg_index = std::get(to_iodesc); + const auto &to_io_index = std::get(to_iodesc); + + const auto to_executor = _executors.at({to_model_index, to_subg_index}).get(); + const auto to_tensor = to_executor->getInputTensors().at(to_io_index.value()); + + // TODO Unify tensors with the same `from` tensor and same type + if (from_tensor->data_type() != to_tensor->data_type()) + { + assert(inputs.size() == outputs.size()); + const auto &to_info = + to_executor->getInputTensors().at(to_io_index.value())->orig_info(); + const auto to_layout = to_tensor->orig_layout(); + inputs.emplace_back(from_tensor); + + auto type_aware_quant_tensor = std::make_unique(to_info, to_layout); + outputs.emplace_back(type_aware_quant_tensor.get()); + + _edge_quant_tensors[to_iodesc] = std::move(type_aware_quant_tensor); + } + } + } + } + + auto layer = std::make_unique(inputs, outputs); + layer->prepare(); + _edge_quant_layers[{model_index, subg_index}] = std::move(layer); } - // Prepare buffer - // Assume buffer layout is NHWC - std::vector> bufs(_model_edges->edges.size()); - std::vector buf_infos(_model_edges->edges.size()); - const auto layout = ir::Layout::NHWC; + _is_created_edge_quant_layers = true; +} - for (uint32_t i = 0; i < graph1.getOutputs().size(); i++) +void Executors::CreatePkgIOTensors(const IODescription &desc) +{ + for (const auto &pkg_input : _model_edges->pkg_inputs) { - const auto buf_index = - _executors.at(ir::SubgraphIndex{0})->graph().getOutputs().at(ir::IOIndex{i}); - buf_infos[i] = &_executors.at(ir::SubgraphIndex{0})->graph().operands().at(buf_index).info(); - const auto buf_size = buf_infos[i]->total_size(); - bufs[i] = std::make_unique(buf_size); + // Create IOTensor for nnpkg inputs + const auto &model_index = std::get(pkg_input); + const auto &subg_index = std::get(pkg_input); + const auto &io_index = std::get(pkg_input); + const auto input_pkg_index = + find_input_index(_model_edges->pkg_inputs, model_index, subg_index, io_index); + auto input_desc = desc.inputs[input_pkg_index].get(); + _pkg_input_tensors[pkg_input] = + std::make_unique(input_desc->info, input_desc->layout); } - // 1st 
executor + for (const auto &pkg_output : _model_edges->pkg_outputs) { - IODescription desc1; - const auto input_size = graph1.getInputs().size(); - const auto output_size = graph1.getOutputs().size(); - desc1.inputs.resize(input_size); - desc1.outputs.resize(output_size); - for (uint32_t i = 0; i < input_size; i++) - desc1.inputs[i] = std::make_unique(*desc.inputs[i].get()); - for (uint32_t i = 0; i < output_size; i++) - desc1.outputs[i] = std::make_unique(*buf_infos[i], bufs[i].get(), - buf_infos[i]->total_size(), layout); + // Create IOTensor for nnpkg outputs + const auto &model_index = std::get(pkg_output); + const auto &subg_index = std::get(pkg_output); + const auto &io_index = std::get(pkg_output); + const auto output_pkg_index = + find_output_index(_model_edges->pkg_outputs, model_index, subg_index, io_index); + auto output_desc = desc.outputs[output_pkg_index].get(); + _pkg_output_tensors[pkg_output] = + std::make_unique(output_desc->info, output_desc->layout); + } +} - executor1->execute(desc1); +void Executors::createPkgIOQuantLayers(const IODescription &desc) +{ + // Append type-aware quantization layer for nnpkg inputs/outputs between executors + for (const auto &pair : _executors) + { + const auto &executor_index = pair.first; + const auto &model_index = executor_index.first; + const auto &subg_index = executor_index.second; + const auto executor = pair.second.get(); + + // Find pkg inputs of current executor + std::vector pkg_inputs; + for (const auto &pkg_input : _model_edges->pkg_inputs) + { + if (std::get(pkg_input) == model_index && + std::get(pkg_input) == subg_index) + { + pkg_inputs.emplace_back(pkg_input); + } + } + std::vector src_tensors; + std::vector dst_tensors; + for (const auto &pkg_input : pkg_inputs) + { + const auto &io_index = std::get(pkg_input); + const auto input_pkg_index = + find_input_index(_model_edges->pkg_inputs, model_index, subg_index, io_index); + auto input_desc = desc.inputs[input_pkg_index].get(); + + // Create EdgeTensor for nnpkg input if type is different + const auto input_tensor = + executor->getInputTensors().at(std::get(pkg_input).value()); + const auto &orig_info = input_tensor->orig_info(); + if (input_desc->info.typeInfo().type() != input_tensor->orig_info().typeInfo().type()) + { + const auto orig_layout = input_tensor->orig_layout(); + auto pkg_input_edge_tensor = std::make_unique(orig_info, orig_layout); + _pkg_input_quant_tensors[pkg_input] = std::move(pkg_input_edge_tensor); + + // Append type-aware quantization layer's inputs/outputs + src_tensors.emplace_back(_pkg_input_tensors[pkg_input].get()); + dst_tensors.emplace_back(_pkg_input_quant_tensors[pkg_input].get()); + } + } + + // Create type-aware quantization layer for nnpkg inputs + auto pkg_input_layer = std::make_unique(src_tensors, dst_tensors); + pkg_input_layer->prepare(); + _pkg_input_quant_layers[{model_index, subg_index}] = std::move(pkg_input_layer); + + // Find pkg outputs of current executor + std::vector pkg_outputs; + for (const auto &pkg_output : _model_edges->pkg_outputs) + { + if (std::get(pkg_output) == model_index && + std::get(pkg_output) == subg_index) + { + pkg_outputs.emplace_back(pkg_output); + } + } + src_tensors.clear(); + dst_tensors.clear(); + // Create Tensors of nnpkg outputs for type-aware quantization + for (const auto &pkg_output : pkg_outputs) + { + const auto &io_index = std::get(pkg_output); + const auto output_pkg_index = + find_output_index(_model_edges->pkg_outputs, model_index, subg_index, io_index); + auto output_desc = 
desc.outputs[output_pkg_index].get(); + + // Create EdgeTensor for nnpkg output if type is different + const auto output_tensor = + executor->getOutputTensors().at(std::get(pkg_output).value()); + const auto &orig_info = output_tensor->orig_info(); + if (output_desc->info.typeInfo().type() != output_tensor->orig_info().typeInfo().type()) + { + const auto orig_layout = output_tensor->orig_layout(); + auto pkg_output_edge_tensor = std::make_unique(orig_info, orig_layout); + _pkg_output_quant_tensors[pkg_output] = std::move(pkg_output_edge_tensor); + + // Append type-aware quantization layer's inputs/outputs + src_tensors.emplace_back(_pkg_output_quant_tensors[pkg_output].get()); + dst_tensors.emplace_back(_pkg_output_tensors[pkg_output].get()); + } + } + + // Create type-aware quantization layer for nnpkg outputs + auto pkg_output_layer = std::make_unique(src_tensors, dst_tensors); + pkg_output_layer->prepare(); + _pkg_output_quant_layers[{model_index, subg_index}] = std::move(pkg_output_layer); } +} + +void Executors::execute(const IODescription &desc) +{ + // Check supported multi model package + checkSupportedMultimodel(); + + // TODO Move creating type-aware quantization layers for edges in compilation stage + createEdgeQuantLayers(); + + // TODO Create IOTensors only once and recreate them only if nnpkg info changes + CreatePkgIOTensors(desc); + + // TODO Create type-aware quantization layers only once and recreate them only if type changes + createPkgIOQuantLayers(desc); - // 2nd executor + // TODO Find better way to schedule order of executors + auto const model_count = modelCount(); + + auto find_from = [&](const ir::ModelIndex &model_index, const ir::SubgraphIndex &subg_index, + const ir::IOIndex &io_index) { + for (const auto &edge : _model_edges->edges) + { + if ((std::get(edge.to) == model_index) && + (std::get(edge.to) == subg_index) && + (std::get(edge.to) == io_index)) + return edge.from; + } + + throw std::runtime_error{"Cannot find edge for model input"}; + }; + + // Execute each model + // NOTE May be better to use vector instead of unordered_map for _executors + for (auto model_index = ir::ModelIndex{0}; model_index.value() < model_count; model_index++) { - IODescription desc2; - const auto input_size = graph2.getInputs().size(); - const auto output_size = graph2.getOutputs().size(); - desc2.inputs.resize(input_size); - desc2.outputs.resize(output_size); + // Find executor + auto executor = at(model_index, ir::SubgraphIndex{0}); + + // Set IOTensors + // TODO Set internal IOTensors only once + std::vector inputs_inter; + std::vector outputs_inter; + const auto &input_tensors = executor->getInputTensors(); + const auto &output_tensors = executor->getOutputTensors(); + auto const input_size = input_tensors.size(); + auto const output_size = output_tensors.size(); + inputs_inter.resize(input_size); + outputs_inter.resize(output_size); + + // Set inputs of executor + // TODO Create layer to allocate/deallocate buffers of EdgeTensor for each executor for (uint32_t i = 0; i < input_size; i++) - desc2.inputs[i] = std::make_unique(*buf_infos[i], bufs[i].get(), - buf_infos[i]->total_size(), layout); + { + const auto input_pkg_index = find_input_index(_model_edges->pkg_inputs, model_index, + ir::SubgraphIndex{0}, ir::IOIndex{i}); + const auto input_io_desc = ir::IODesc{model_index, ir::SubgraphIndex{0}, ir::IOIndex{i}}; + if (input_pkg_index != -1) + { + // Allocate type-aware quantization tensors for nnpkg inputs and set internal tensors + if 
(_pkg_input_quant_tensors.find(input_io_desc) != _pkg_input_quant_tensors.end()) + { + _pkg_input_quant_tensors[input_io_desc]->allocate_buffer(); + + inputs_inter[i] = _pkg_input_quant_tensors[input_io_desc].get(); + } + else + { + inputs_inter[i] = _pkg_input_tensors[input_io_desc].get(); + } + + // Set buffer of IOTensor + auto input_desc = desc.inputs[input_pkg_index].get(); + // TODO Remove const_cast (we need const_cast as ITensor is writable) + _pkg_input_tensors[input_io_desc]->setUserTensor( + reinterpret_cast(const_cast(input_desc->buffer)), input_desc->size); + } + else + { + auto from_iodesc = find_from(model_index, ir::SubgraphIndex{0}, ir::IOIndex{i}); + const auto &from_model_index = std::get(from_iodesc); + const auto &from_subg_index = std::get(from_iodesc); + const auto &from_ioindex = std::get(from_iodesc).value(); + + // Supported only sequantial execution of models + assert(from_model_index.value() < model_index.value()); + assert(from_subg_index.value() == 0); + const auto from_executor = _executors.at({from_model_index, from_subg_index}).get(); + const auto to_iodesc = ir::IODesc{model_index, ir::SubgraphIndex{0}, ir::IOIndex{i}}; + if (_edge_quant_tensors.find(to_iodesc) == _edge_quant_tensors.end()) + { + inputs_inter[i] = from_executor->getOutputTensors().at(from_ioindex); + } + else + { + inputs_inter[i] = _edge_quant_tensors.at(to_iodesc).get(); + } + assert(inputs_inter[i]->buffer() != nullptr); + } + } + + // Set outputs of executor for (uint32_t i = 0; i < output_size; i++) - desc2.outputs[i] = std::make_unique(*desc.outputs[i].get()); + { + const auto output_pkg_index = find_output_index(_model_edges->pkg_outputs, model_index, + ir::SubgraphIndex{0}, ir::IOIndex{i}); + const auto output_io_desc = ir::IODesc{model_index, ir::SubgraphIndex{0}, ir::IOIndex{i}}; + if (output_pkg_index != -1) + { + // Allocate type-aware quantization tensors for nnpkg outputs and set internal tensors + if (_pkg_output_quant_tensors.find(output_io_desc) != _pkg_output_quant_tensors.end()) + { + _pkg_output_quant_tensors[output_io_desc]->allocate_buffer(); + + outputs_inter[i] = _pkg_output_quant_tensors[output_io_desc].get(); + } + else + { + outputs_inter[i] = _pkg_output_tensors[output_io_desc].get(); + } + + // Set buffer of IOTensor + auto output_desc = desc.outputs[output_pkg_index].get(); + _pkg_output_tensors[output_io_desc]->setUserTensor( + reinterpret_cast(output_desc->buffer), output_desc->size); + } + else + { + // Allocate buffer of `from` tensors + const auto from_iodesc = ir::IODesc{model_index, ir::SubgraphIndex{0}, ir::IOIndex{i}}; + _edge_tensors[from_iodesc]->allocate_buffer(); + outputs_inter[i] = _edge_tensors[from_iodesc].get(); - executor2->execute(desc2); + // Allocate buffer of tensors for type-aware quantization + for (const auto &to_iodesc : _edge_map[from_iodesc]) + { + _edge_tensors[from_iodesc]->increase_ref(); + if (_edge_quant_tensors.find(to_iodesc) != _edge_quant_tensors.end()) + { + auto type_aware_quant_tensor = _edge_quant_tensors.at(to_iodesc).get(); + type_aware_quant_tensor->allocate_buffer(); + + _edge_tensors[from_iodesc]->decrease_ref(); + } + } + } + } + + _pkg_input_quant_layers[{model_index, ir::SubgraphIndex{0}}]->run(); + + executor->execute(inputs_inter, outputs_inter); + + _edge_quant_layers[{model_index, ir::SubgraphIndex{0}}]->run(); + _pkg_output_quant_layers[{model_index, ir::SubgraphIndex{0}}]->run(); + + // Release input buffers that are no longer needed + for (uint32_t i = 0; i < input_size; i++) + { + const auto 
input_pkg_index = find_input_index(_model_edges->pkg_inputs, model_index, + ir::SubgraphIndex{0}, ir::IOIndex{i}); + + const auto to_iodesc = ir::IODesc{model_index, ir::SubgraphIndex{0}, ir::IOIndex{i}}; + if (input_pkg_index == -1) + { + if (_edge_quant_tensors.find(to_iodesc) != _edge_quant_tensors.end()) + { + // Decrease reference count of tensor for type-aware quantization if input tensor is the + // tensor + const auto to_iodesc = ir::IODesc{model_index, ir::SubgraphIndex{0}, ir::IOIndex{i}}; + if (_edge_quant_tensors.find(to_iodesc) != _edge_quant_tensors.end()) + { + _edge_quant_tensors[to_iodesc]->decrease_ref(); + } + } + else + { + // Decrease reference count of `from` tensor if input tensor is the `from` tensor + const auto from_iodesc = find_from(model_index, ir::SubgraphIndex{0}, ir::IOIndex{i}); + _edge_tensors[from_iodesc]->decrease_ref(); + + // Decrease reference count of nnpkg inputs + if (_pkg_input_quant_tensors.find(to_iodesc) != _pkg_input_quant_tensors.end()) + { + _pkg_input_quant_tensors[to_iodesc]->decrease_ref(); + } + } + } + } + + // Release output buffers if those buffers are no longer used other executors because of + // type-aware quantization + // FIXME if tensors for type-aware quantization unified for the same `from` tensor and same type + for (uint32_t i = 0; i < output_size; i++) + { + auto from_iodesc = ir::IODesc{model_index, ir::SubgraphIndex{0}, ir::IOIndex{i}}; + + // Check if other executors will use the buffer of edge tensor + const auto &to_list = _edge_map[from_iodesc]; + if (to_list.size() == 0) + { + // This condition means `from_iodesc` tensor is an output of nnpkg + continue; + } + + bool to_be_release = + !std::any_of(to_list.begin(), to_list.end(), [&](const ir::IODesc &to_iodesc) { + // This condition means another executor uses the buffer of edge tensor + return _edge_quant_tensors.find(to_iodesc) == _edge_quant_tensors.end(); + }); + + if (to_be_release) + { + // This edge tensor's buffer won't be used in other executors + // Tensors for type-aware quantization take over the role of this edge tensor instead + _edge_tensors[from_iodesc]->decrease_ref(); + } + + // Decrease reference count of nnpkg outputs + if (_pkg_output_quant_tensors.find(from_iodesc) != _pkg_output_quant_tensors.end()) + { + _pkg_output_quant_tensors[from_iodesc]->decrease_ref(); + } + } } } +// modelCount() iterates _executors. +// It assumes that Compiler will generate Executor for all models and _executors includes all +// generated Executor. +// If nnpackage includes model(s) which has no connection and Compiler does not +// generate Executor for them, modelCount() return less value than real model count. +uint16_t Executors::modelCount() const +{ + uint16_t model_count = 0; + for (; _executors.find(std::make_pair(ir::ModelIndex{model_count}, ir::SubgraphIndex{0})) != + _executors.end(); + model_count++) + ; + + return model_count; +} + } // namespace exec } // namespace onert diff --git a/runtime/onert/core/src/exec/Executors.h b/runtime/onert/core/src/exec/Executors.h new file mode 100644 index 000000000..ac7489186 --- /dev/null +++ b/runtime/onert/core/src/exec/Executors.h @@ -0,0 +1,169 @@ +/* + * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __ONERT_EXEC_EXECUTORS_H__ +#define __ONERT_EXEC_EXECUTORS_H__ + +#include "exec/IExecutors.h" +#include "ir/NNPkg.h" +#include "IPermuteFunction.h" + +namespace std +{ + +template <> struct hash> +{ + size_t + operator()(const std::pair<::onert::ir::ModelIndex, ::onert::ir::SubgraphIndex> &pair) const + noexcept + { + return (hash()(pair.first.value()) << 16) ^ hash()(pair.second.value()); + } +}; + +} // namespace std + +namespace onert +{ +namespace exec +{ + +/** + * @brief Class to gather executors + */ +class Executors : public IExecutors +{ +public: + Executors(void) = delete; + Executors(std::unique_ptr model_edges) + : _executors{}, _model_edges{std::move(model_edges)}, _edge_quant_layers{}, + _edge_quant_tensors{}, _edge_tensors{}, _is_created_edge_quant_layers{false}, + _pkg_input_quant_layers{}, _pkg_output_quant_layers{}, _pkg_input_quant_tensors{}, + _pkg_output_quant_tensors{}, _pkg_input_tensors{}, _pkg_output_tensors{} + { + for (const auto &edge : _model_edges->edges) + { + _edge_map[edge.from].emplace_back(edge.to); + } + } + Executors(const Executors &) = delete; + Executors(Executors &&) = default; + ~Executors() = default; + + // TODO Use Executor index + void emplace(const ir::ModelIndex &model_index, const ir::SubgraphIndex &subg_index, + std::unique_ptr exec) override; + + IExecutor *at(const ir::ModelIndex &model_index, + const ir::SubgraphIndex &subg_index) const override; + + uint32_t inputSize() const override; + + uint32_t outputSize() const override; + + const ir::OperandInfo &inputInfo(const ir::IOIndex &index) const override; + + const ir::OperandInfo &outputInfo(const ir::IOIndex &index) const override; + + void execute(const IODescription &desc) override; + +private: + void checkSupportedMultimodel() const; + void createEdgeQuantLayers(); + void CreatePkgIOTensors(const IODescription &desc); + void createPkgIOQuantLayers(const IODescription &desc); + uint16_t modelCount() const; + +private: + // TODO Remove this class + class PermuteLayer : public exec::IPermuteFunction + { + public: + PermuteLayer(const std::vector &inputs, + const std::vector &outputs) + { + assert(inputs.size() == outputs.size()); + _src_tensors = inputs; + _dst_tensors = outputs; + } + virtual ~PermuteLayer() {} + void optimize() override {} + }; + + class EdgeTensor; + +private: + std::unordered_map, std::unique_ptr> + _executors; + + // NOTE _model_edges may use different struct type for executor implementation + std::unique_ptr _model_edges; + std::unordered_map> _edge_map; + + /** + * @brief Type-aware quantization layers for edges between executors + * + */ + // TODO Move variables related to type-aware quantization for edges into compilation stage + // TODO Replace PermuteLayer with backend::builtin::kernel::PermuteLayer + std::unordered_map, std::unique_ptr> + _edge_quant_layers; + + /** + * @brief Tensors for type-aware quantization of edges + * Key: `to` IODesc, Value: EdgeTensor + */ + // + // Q: Why is Key `to` IODesc + // A: these tensors are currently created depending on the type of `to` + // TODO Unify tensors with the same `from` tensor 
and same type + // NOTE The incomplete type 'EdgeTensor' cannot be declared as unique_ptr. + std::unordered_map> _edge_quant_tensors; + + /** + * @brief Tensors for edges between executors that are not related to type-aware quantization + * Key: `from` IODesc, Value: EdgeTensor + */ + // Q: Why is Key `from` IODesc + // A: `from` can be connected to multiple `to` + // NOTE The incomplete type 'EdgeTensor' cannot be declared as unique_ptr. + std::unordered_map> _edge_tensors; + /** + * @brief Whether type-aware quantization layers for edges between executors are created + * + */ + // TODO Remove this member after the creation of type-aware quantization layers for edges + // is moved into compilation stage + bool _is_created_edge_quant_layers; + + // TODO Replace PermuteLayer with backend::builtin::kernel::PermuteLayer + std::unordered_map, std::unique_ptr> + _pkg_input_quant_layers; + // TODO Replace PermuteLayer with backend::builtin::kernel::PermuteLayer + std::unordered_map, std::unique_ptr> + _pkg_output_quant_layers; + // Edge tensors of nnpkg inputs/outputs for type-aware quantization + std::unordered_map> _pkg_input_quant_tensors; + std::unordered_map> _pkg_output_quant_tensors; + // IOTensors for user buffer + std::unordered_map> _pkg_input_tensors; + std::unordered_map> _pkg_output_tensors; +}; + +} // namespace exec +} // namespace onert + +#endif // __ONERT_EXEC_EXECUTORS_H__ diff --git a/runtime/onert/core/src/exec/IPermuteFunction.cc b/runtime/onert/core/src/exec/IPermuteFunction.cc new file mode 100644 index 000000000..9d548e6dc --- /dev/null +++ b/runtime/onert/core/src/exec/IPermuteFunction.cc @@ -0,0 +1,320 @@ +/* + * Copyright (c) 2023 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "IPermuteFunction.h" + +#include +#include +#include "backend/IPortableTensor.h" +#include "exec/IFunction.h" +#include "ir/Index.h" +#include "ir/Shape.h" +#include +#include +#include +#include "util/Utils.h" +#include +#include + +namespace +{ +using namespace onert; + +inline nnfw::cker::Shape getShape(const backend::ITensor *tensor) +{ + const ir::Shape shape = tensor->getShape(); + + assert(tensor->layout() == ir::Layout::NHWC); + + auto rank = shape.rank(); + nnfw::cker::Shape ret(rank); + auto data = ret.DimsData(); + for (int i = 0; i < rank; ++i) + { + data[i] = shape.dim(i); + } + return ret; +} + +// Quantize per element +template +void elementwiseQuantize(const backend::ITensor *src_tensor, backend::ITensor *dst_tensor) +{ + const auto scale = dst_tensor->data_scale(); + const auto zero_point = dst_tensor->data_zero_point(); + + int min_val = std::numeric_limits::min(); + int max_val = std::numeric_limits::max(); + + auto loop_shape = src_tensor->getShape(); + const auto src_layout = src_tensor->layout(); + const auto dst_layout = dst_tensor->layout(); + const bool is_permutation = src_layout != dst_layout && loop_shape.rank() == 4; + ShapeLoop(loop_shape, [&](const onert::ir::Coordinates &coords) { + const InputT *input_data = + reinterpret_cast(src_tensor->buffer() + src_tensor->calcOffset(coords)); + int32_t unclamped = static_cast(round(*input_data / scale)) + zero_point; + int32_t clamped = std::min(std::max(unclamped, min_val), max_val); + + ir::Coordinates dst_coords = + is_permutation ? ir::convertCoordinates(coords, src_layout, dst_layout) : coords; + OutputT *output_data = + reinterpret_cast(dst_tensor->buffer() + dst_tensor->calcOffset(dst_coords)); + *output_data = clamped; + }); +} + +// TODO Optimize the case where tensors has the same layout +template +void quantize(const backend::ITensor *src_tensor, backend::ITensor *dst_tensor) +{ + if (!src_tensor->has_padding() && !dst_tensor->has_padding() && + src_tensor->layout() == dst_tensor->layout() && !src_tensor->is_dynamic()) + { + assert(!dst_tensor->is_dynamic()); + + // Call optimized neon kernel + nnfw::cker::Quantize(getShape(src_tensor), + reinterpret_cast(src_tensor->buffer()), + getShape(dst_tensor), reinterpret_cast(dst_tensor->buffer()), + dst_tensor->data_scale(), dst_tensor->data_zero_point()); + } + else + { + elementwiseQuantize(src_tensor, dst_tensor); + } +} + +// Dequantize per element +template +void elementwiseDequantize(const backend::ITensor *src_tensor, backend::ITensor *dst_tensor) +{ + const auto scale = src_tensor->data_scale(); + const auto zero_point = src_tensor->data_zero_point(); + + auto loop_shape = src_tensor->getShape(); + const auto src_layout = src_tensor->layout(); + const auto dst_layout = dst_tensor->layout(); + const bool is_permutation = src_layout != dst_layout && loop_shape.rank() == 4; + ShapeLoop(loop_shape, [&](const onert::ir::Coordinates &coords) { + const InputT *input_data = + reinterpret_cast(src_tensor->buffer() + src_tensor->calcOffset(coords)); + const OutputT result = static_cast(scale * (*input_data - zero_point)); + + ir::Coordinates dst_coords = + is_permutation ? 
ir::convertCoordinates(coords, src_layout, dst_layout) : coords; + OutputT *output_data = + reinterpret_cast(dst_tensor->buffer() + dst_tensor->calcOffset(dst_coords)); + *output_data = result; + }); +} + +// TODO Optimize the case where tensors has the same layout +template +void dequantize(const backend::ITensor *src_tensor, backend::ITensor *dst_tensor) +{ + if (!src_tensor->has_padding() && !dst_tensor->has_padding() && + src_tensor->layout() == dst_tensor->layout() && !src_tensor->is_dynamic()) + { + assert(!dst_tensor->is_dynamic()); + + // Call optimized neon kernel + nnfw::cker::Dequantize(getShape(src_tensor), + reinterpret_cast(src_tensor->buffer()), + getShape(dst_tensor), reinterpret_cast(dst_tensor->buffer()), + src_tensor->data_scale(), src_tensor->data_zero_point()); + } + else + { + elementwiseDequantize(src_tensor, dst_tensor); + } +} + +template ::value && + std::is_base_of::value, + bool> = true> +void typeAwareQuantize(const SRC_T *src_tensor, DST_T *dst_tensor) +{ + // TODO Support other types + if (src_tensor->data_type() == ir::DataType::FLOAT32) + { + switch (dst_tensor->data_type()) + { + case ir::DataType::QUANT_UINT8_ASYMM: + { + quantize(src_tensor, dst_tensor); + break; + } + case ir::DataType::QUANT_INT8_SYMM: + { + quantize(src_tensor, dst_tensor); + break; + } + case ir::DataType::QUANT_INT16_SYMM: + { + quantize(src_tensor, dst_tensor); + break; + } + default: + { + throw std::runtime_error("IPermuteFunction: Unsupported quantization type"); + break; + } + } + } + else if (dst_tensor->data_type() == ir::DataType::FLOAT32) + { + switch (src_tensor->data_type()) + { + case ir::DataType::QUANT_UINT8_ASYMM: + { + dequantize(src_tensor, dst_tensor); + break; + } + case ir::DataType::QUANT_INT8_SYMM: + { + dequantize(src_tensor, dst_tensor); + break; + } + case ir::DataType::QUANT_INT16_SYMM: + { + dequantize(src_tensor, dst_tensor); + break; + } + default: + { + throw std::runtime_error("IPermuteFunction: Unsupported dequantization type"); + break; + } + } + } + else + { + throw std::runtime_error("IPermuteFunction: Unsupported type for type-aware quantization yet"); + } +} + +} // namespace + +namespace onert +{ +namespace exec +{ + +void IPermuteFunction::IPermuteFunction::run() +{ + // TODO Optimization : Make control does not reach here? 
when (_src_tensors.size() == 0) + assert(_src_tensors.size() == _dst_tensors.size()); + if (_src_tensors_offsets.size() == 0) + { + _src_tensors_offsets.resize(_src_tensors.size()); + _dst_tensors_offsets.resize(_dst_tensors.size()); + } + assert(_src_tensors.size() == _src_tensors_offsets.size()); + assert(_src_tensors_offsets.size() == _dst_tensors_offsets.size()); + + for (size_t i = 0; i < _src_tensors.size(); ++i) + { + auto src_tensor = _src_tensors.at(i); + auto dst_tensor = _dst_tensors.at(i); + auto &src_offsets = _src_tensors_offsets.at(i); + auto &dst_offsets = _dst_tensors_offsets.at(i); + if (src_tensor != dst_tensor) + { + const auto rank = src_tensor->getShape().rank(); + permute(src_tensor, dst_tensor, rank, src_offsets, dst_offsets); + } + } +} + +void IPermuteFunction::permute(backend::ITensor *src_tensor, backend::ITensor *dst_tensor, + size_t rank, std::vector &src_offsets, + std::vector &dst_offsets) +{ + if (src_tensor->total_size() == 0) + { + assert(dst_tensor->total_size() == 0); + return; + } + + assert(src_tensor != dst_tensor); + if (underlying_type(src_tensor->data_type()) != underlying_type(dst_tensor->data_type())) + { + typeAwareQuantize(src_tensor, dst_tensor); + return; + } + + switch (src_tensor->data_type()) + { + case ir::DataType::FLOAT32: + permute(src_tensor, dst_tensor, rank, src_offsets, dst_offsets); + break; + case ir::DataType::INT32: + permute(src_tensor, dst_tensor, rank, src_offsets, dst_offsets); + break; + case ir::DataType::UINT32: + permute(src_tensor, dst_tensor, rank, src_offsets, dst_offsets); + break; + case ir::DataType::BOOL8: + case ir::DataType::QUANT_UINT8_ASYMM: + case ir::DataType::UINT8: + permute(src_tensor, dst_tensor, rank, src_offsets, dst_offsets); + break; + case ir::DataType::QUANT_INT8_ASYMM: + case ir::DataType::QUANT_INT8_SYMM: + permute(src_tensor, dst_tensor, rank, src_offsets, dst_offsets); + break; + case ir::DataType::INT64: + permute(src_tensor, dst_tensor, rank, src_offsets, dst_offsets); + break; + case ir::DataType::QUANT_INT16_SYMM: + permute(src_tensor, dst_tensor, rank, src_offsets, dst_offsets); + break; + default: + throw std::runtime_error("IPermuteFunction: Not supported data type"); + break; + } +} + +const std::type_info &IPermuteFunction::underlying_type(ir::DataType type) const +{ + switch (type) + { + case ir::DataType::FLOAT32: + return typeid(float); + case ir::DataType::INT32: + return typeid(int32_t); + case ir::DataType::UINT32: + return typeid(uint32_t); + case ir::DataType::INT64: + return typeid(int64_t); + case ir::DataType::BOOL8: + case ir::DataType::QUANT_UINT8_ASYMM: + case ir::DataType::UINT8: + return typeid(uint8_t); + case ir::DataType::QUANT_INT8_ASYMM: + case ir::DataType::QUANT_INT8_SYMM: + return typeid(int8_t); + case ir::DataType::QUANT_INT16_SYMM: + return typeid(int16_t); + default: + throw std::runtime_error("IPermuteFunction: Not supported data type"); + } +} + +} // namespace exec +} // namespace onert diff --git a/runtime/onert/core/src/exec/IPermuteFunction.h b/runtime/onert/core/src/exec/IPermuteFunction.h index eb54b67ae..e790f3290 100644 --- a/runtime/onert/core/src/exec/IPermuteFunction.h +++ b/runtime/onert/core/src/exec/IPermuteFunction.h @@ -25,11 +25,7 @@ #include "backend/ITensor.h" #include "exec/IFunction.h" -#include "ir/Index.h" -#include "ir/Shape.h" #include -#include -#include "util/Utils.h" #include #include @@ -79,31 +75,7 @@ protected: }; public: - virtual void run() override - { - // TODO Optimization : Make control does not reach here? 
when (_src_tensors.size() == 0) - assert(_src_tensors.size() == _dst_tensors.size()); - if (_src_tensors_offsets.size() == 0) - { - _src_tensors_offsets.resize(_src_tensors.size()); - _dst_tensors_offsets.resize(_dst_tensors.size()); - } - assert(_src_tensors.size() == _src_tensors_offsets.size()); - assert(_src_tensors_offsets.size() == _dst_tensors_offsets.size()); - - for (size_t i = 0; i < _src_tensors.size(); ++i) - { - auto src_tensor = _src_tensors.at(i); - auto dst_tensor = _dst_tensors.at(i); - auto &src_offsets = _src_tensors_offsets.at(i); - auto &dst_offsets = _dst_tensors_offsets.at(i); - if (src_tensor != dst_tensor) - { - const auto rank = src_tensor->getShape().rank(); - permute(src_tensor, dst_tensor, rank, src_offsets, dst_offsets); - } - } - } + virtual void run() override; virtual void prepare() override { optimize(); } @@ -111,48 +83,7 @@ public: protected: void permute(backend::ITensor *src_tensor, backend::ITensor *dst_tensor, size_t rank, - std::vector &src_offsets, std::vector &dst_offsets) - { - if (src_tensor->total_size() == 0) - { - assert(dst_tensor->total_size() == 0); - return; - } - - assert(src_tensor != dst_tensor); - if (underlying_type(src_tensor->data_type()) != underlying_type(dst_tensor->data_type())) - throw std::runtime_error("data type does not match"); - switch (src_tensor->data_type()) - { - case ir::DataType::FLOAT32: - permute(src_tensor, dst_tensor, rank, src_offsets, dst_offsets); - break; - case ir::DataType::INT32: - permute(src_tensor, dst_tensor, rank, src_offsets, dst_offsets); - break; - case ir::DataType::UINT32: - permute(src_tensor, dst_tensor, rank, src_offsets, dst_offsets); - break; - case ir::DataType::BOOL8: - case ir::DataType::QUANT_UINT8_ASYMM: - case ir::DataType::UINT8: - permute(src_tensor, dst_tensor, rank, src_offsets, dst_offsets); - break; - case ir::DataType::QUANT_INT8_ASYMM: - case ir::DataType::QUANT_INT8_SYMM: - permute(src_tensor, dst_tensor, rank, src_offsets, dst_offsets); - break; - case ir::DataType::INT64: - permute(src_tensor, dst_tensor, rank, src_offsets, dst_offsets); - break; - case ir::DataType::QUANT_INT16_SYMM: - permute(src_tensor, dst_tensor, rank, src_offsets, dst_offsets); - break; - default: - throw std::runtime_error("IPermuteFunction: Not supported data type"); - break; - } - } + std::vector &src_offsets, std::vector &dst_offsets); private: // TODO make src const by proving const access() @@ -322,31 +253,7 @@ protected: // NOTE The typeid expression is lvalue expression which refers to an object with static storage // duration, of the polymorphic type const std::type_info or of some type derived from it. 
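// [Editor's note] Illustrative sketch only, not part of this patch: underlying_type()
// collapses every ir::DataType onto the C++ storage type used to move its elements, so
// one permute<T> instantiation per storage width is enough and a source/destination
// mismatch can be detected with a single comparison (the .cc side then falls back to
// typeAwareQuantize() instead of a plain element copy). A minimal stand-alone analogue,
// with a hypothetical DT enum standing in for ir::DataType:
//
//   #include <cstdint>
//   #include <typeinfo>
//
//   enum class DT { FLOAT32, UINT8, QUANT_UINT8_ASYMM, BOOL8 };
//
//   const std::type_info &storage_type(DT t)
//   {
//     switch (t)
//     {
//       case DT::FLOAT32:
//         return typeid(float);
//       default:
//         return typeid(uint8_t); // BOOL8, UINT8 and asymmetric uint8 share one byte layout
//     }
//   }
//
//   // storage_type(DT::UINT8) == storage_type(DT::QUANT_UINT8_ASYMM): permuted as raw bytes.
//   // storage_type(DT::FLOAT32) != storage_type(DT::QUANT_UINT8_ASYMM): handled by the
//   // type-aware quantization path rather than permute<T>.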
// So std::type_info is non-copyable - const std::type_info &underlying_type(ir::DataType type) const - { - switch (type) - { - case ir::DataType::FLOAT32: - return typeid(float); - case ir::DataType::INT32: - return typeid(int32_t); - case ir::DataType::UINT32: - return typeid(uint32_t); - case ir::DataType::INT64: - return typeid(int64_t); - case ir::DataType::BOOL8: - case ir::DataType::QUANT_UINT8_ASYMM: - case ir::DataType::UINT8: - return typeid(uint8_t); - case ir::DataType::QUANT_INT8_ASYMM: - case ir::DataType::QUANT_INT8_SYMM: - return typeid(int8_t); - case ir::DataType::QUANT_INT16_SYMM: - return typeid(int16_t); - default: - throw std::runtime_error("IPermuteFunction: Not supported data type"); - } - } + const std::type_info &underlying_type(ir::DataType type) const; protected: std::vector _src_tensors; diff --git a/runtime/onert/core/src/exec/IPermuteFunction.test.cc b/runtime/onert/core/src/exec/IPermuteFunction.test.cc new file mode 100644 index 000000000..1009f194d --- /dev/null +++ b/runtime/onert/core/src/exec/IPermuteFunction.test.cc @@ -0,0 +1,902 @@ +/* + * Copyright (c) 2023 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "IPermuteFunction.h" + +#include +#include +#include + +#include +#include + +namespace +{ +using namespace onert; +using namespace ir; +using namespace backend; +using namespace exec; + +class MockUpTensor : public ITensor +{ +public: + MockUpTensor(const Shape &shape, const TypeInfo &type_info, Layout layout, size_t pad) + : _shape(shape), _type_info(type_info), _data(nullptr), _layout(layout) + { + _strides.resize(shape.rank()); + + std::vector pads(shape.rank(), 0); + pads[shape.rank() - 1] = pad; + size_t stride = 1; + for (int32_t i = _shape.rank() - 1; i >= 0; --i) + { + _strides.at(i) = stride; + stride = stride * (_shape.dim(i) + pads.at(i)); + } + } + virtual ~MockUpTensor() {} + + void setBuffer(uint8_t *data) { _data = data; } + + size_t total_size() const override + { + size_t total_size = _strides[0] * _shape.dim(0); + total_size *= sizeOfDataType(data_type()); + return total_size; + } + + size_t calcOffset(const ir::Coordinates &coords) const override + { + size_t offset = 0; + for (size_t i = 0; i < _shape.rank(); ++i) + { + offset += (_strides[i] * coords[i]); + } + offset *= sizeOfDataType(data_type()); + return offset; + } + + uint8_t *buffer() const override { return _data; } + + ir::Layout layout() const override { return _layout; } + ir::DataType data_type() const override { return _type_info.type(); } + float data_scale() const override { return _type_info.scale(); } + int32_t data_zero_point() const override { return _type_info.zero_point(); } + const std::vector &data_scales() const override { return _type_info.scales(); } + const std::vector &data_zero_points() const override { return _type_info.zero_points(); } + bool has_padding() const override + { + return total_size() / sizeOfDataType(data_type()) != _shape.num_elements(); + } + void 
access(const std::function &fn) final { fn(*this); } + + bool is_dynamic() const override { return false; } + Shape getShape() const override { return _shape; } + +private: + Shape _shape; + TypeInfo _type_info; + Layout _layout; + uint8_t *_data; + std::vector _strides; +}; + +class MockUpLayer : public IPermuteFunction +{ +public: + MockUpLayer(const std::vector &inputs, const std::vector &outputs) + { + assert(inputs.size() == outputs.size()); + _src_tensors = inputs; + _dst_tensors = outputs; + } + virtual ~MockUpLayer() {} + void optimize() override {} +}; + +TEST(IPermuteFunction, float_rank1) +{ + const size_t input_pads[4] = {0, 1, 0, 2}; + const size_t output_pads[4] = {0, 0, 2, 1}; + const std::vector shapes{{1}, {4}, {5}, {2}}; + float expected_buffer[] = {1, 0, -1, -2, 3}; + const auto type_info = TypeInfo(DataType::FLOAT32); + + std::vector> inputs(4); + std::vector> outputs(4); + + std::vector> output_buffers(4); + for (size_t i = 0; i < 4; ++i) + { + inputs[i] = std::make_unique(shapes[i], type_info, Layout::NHWC, input_pads[i]); + inputs[i]->setBuffer(reinterpret_cast(expected_buffer)); + + outputs[i] = std::make_unique(shapes[i], type_info, Layout::NHWC, output_pads[i]); + output_buffers[i] = std::make_unique(outputs[i]->total_size()); + outputs[i]->setBuffer(output_buffers[i].get()); + } + + auto mockup_layer = std::make_unique( + std::vector{inputs[0].get(), inputs[1].get(), inputs[2].get(), inputs[3].get()}, + std::vector{outputs[0].get(), outputs[1].get(), outputs[2].get(), outputs[3].get()}); + mockup_layer->run(); + + for (size_t i = 0; i < 4; ++i) + { + for (int32_t j = 0; j < shapes[i].dim(0); ++j) + { + Coordinates coords{j}; + float result = + *reinterpret_cast(outputs[i]->buffer() + outputs[i]->calcOffset(coords)); + float expected = + *reinterpret_cast(inputs[i]->buffer() + inputs[i]->calcOffset(coords)); + EXPECT_EQ(result, expected); + } + } +} + +TEST(IPermuteFunction, float_rank2) +{ + const size_t input_pads[4] = {0, 1, 0, 2}; + const size_t output_pads[4] = {0, 0, 2, 1}; + const std::vector shapes{{1, 4}, {2, 2}, {1, 5}, {2, 3}}; + float expected_buffer[] = {1, 0, -1, -2, 3, -4, 5, -6, 7, -8}; + const auto type_info = TypeInfo(DataType::FLOAT32); + + std::vector> inputs(4); + std::vector> outputs(4); + std::vector> output_buffers(4); + for (size_t i = 0; i < 4; ++i) + { + inputs[i] = std::make_unique(shapes[i], type_info, Layout::NHWC, input_pads[i]); + inputs[i]->setBuffer(reinterpret_cast(expected_buffer)); + + outputs[i] = std::make_unique(shapes[i], type_info, Layout::NHWC, output_pads[i]); + output_buffers[i] = std::make_unique(outputs[i]->total_size()); + outputs[i]->setBuffer(output_buffers[i].get()); + } + + auto mockup_layer = std::make_unique( + std::vector{inputs[0].get(), inputs[1].get(), inputs[2].get(), inputs[3].get()}, + std::vector{outputs[0].get(), outputs[1].get(), outputs[2].get(), outputs[3].get()}); + mockup_layer->run(); + + for (size_t i = 0; i < 4; ++i) + { + for (int32_t j = 0; j < shapes[i].dim(0); ++j) + { + for (int32_t k = 0; k < shapes[i].dim(1); ++k) + { + Coordinates coords{j, k}; + float result = + *reinterpret_cast(outputs[i]->buffer() + outputs[i]->calcOffset(coords)); + float expected = + *reinterpret_cast(inputs[i]->buffer() + inputs[i]->calcOffset(coords)); + EXPECT_EQ(result, expected); + } + } + } +} + +TEST(IPermuteFunction, float_rank3) +{ + const size_t input_pads[4] = {0, 5, 0, 2}; + const size_t output_pads[4] = {0, 3, 2, 1}; + const std::vector shapes{{1, 4, 1}, {1, 2, 1}, {2, 1, 5}, {1, 2, 3}}; + float 
expected_buffer[] = {1, 0, -1, -2, 3, -4, 5, -6, 7, -8, 9, -10}; + const auto type_info = TypeInfo(DataType::FLOAT32); + + std::vector> inputs(4); + std::vector> outputs(4); + std::vector> output_buffers(4); + for (size_t i = 0; i < 4; ++i) + { + inputs[i] = std::make_unique(shapes[i], type_info, Layout::NHWC, input_pads[i]); + inputs[i]->setBuffer(reinterpret_cast(expected_buffer)); + + outputs[i] = std::make_unique(shapes[i], type_info, Layout::NHWC, output_pads[i]); + output_buffers[i] = std::make_unique(outputs[i]->total_size()); + outputs[i]->setBuffer(output_buffers[i].get()); + } + + auto mockup_layer = std::make_unique( + std::vector{inputs[0].get(), inputs[1].get(), inputs[2].get(), inputs[3].get()}, + std::vector{outputs[0].get(), outputs[1].get(), outputs[2].get(), outputs[3].get()}); + mockup_layer->run(); + + for (size_t i = 0; i < 4; ++i) + { + for (int32_t j = 0; j < shapes[i].dim(0); ++j) + { + for (int32_t k = 0; k < shapes[i].dim(1); ++k) + { + for (int32_t l = 0; l < shapes[i].dim(2); ++l) + { + Coordinates coords{j, k, l}; + float result = + *reinterpret_cast(outputs[i]->buffer() + outputs[i]->calcOffset(coords)); + float expected = + *reinterpret_cast(inputs[i]->buffer() + inputs[i]->calcOffset(coords)); + EXPECT_EQ(result, expected); + } + } + } + } +} + +TEST(IPermuteFunction, float_rank4) +{ + const size_t input_pads[4] = {0, 0, 1, 2}; + const size_t output_pads[4] = {0, 3, 2, 1}; + const std::vector shapes{{1, 1, 4, 1}, {2, 1, 2, 3}, {1, 2, 1, 2}, {1, 1, 2, 3}}; + float expected_buffer[] = {1, 0, -1, -2, 3, -4, 5, -6, 7, -8, 9, -10}; + const auto type_info = TypeInfo(DataType::FLOAT32); + + std::vector> inputs(4); + std::vector> outputs(4); + std::vector> output_buffers(4); + for (size_t i = 0; i < 4; ++i) + { + inputs[i] = std::make_unique(shapes[i], type_info, Layout::NHWC, input_pads[i]); + inputs[i]->setBuffer(reinterpret_cast(expected_buffer)); + + outputs[i] = std::make_unique(shapes[i], type_info, Layout::NHWC, output_pads[i]); + output_buffers[i] = std::make_unique(outputs[i]->total_size()); + outputs[i]->setBuffer(output_buffers[i].get()); + } + + auto mockup_layer = std::make_unique( + std::vector{inputs[0].get(), inputs[1].get(), inputs[2].get(), inputs[3].get()}, + std::vector{outputs[0].get(), outputs[1].get(), outputs[2].get(), outputs[3].get()}); + mockup_layer->run(); + + for (size_t i = 0; i < 4; ++i) + { + for (int32_t j = 0; j < shapes[i].dim(0); ++j) + { + for (int32_t k = 0; k < shapes[i].dim(1); ++k) + { + for (int32_t l = 0; l < shapes[i].dim(2); ++l) + { + for (int32_t m = 0; m < shapes[i].dim(3); ++m) + { + Coordinates coords{j, k, l, m}; + float result = + *reinterpret_cast(outputs[i]->buffer() + outputs[i]->calcOffset(coords)); + float expected = + *reinterpret_cast(inputs[i]->buffer() + inputs[i]->calcOffset(coords)); + EXPECT_EQ(result, expected); + } + } + } + } + } +} + +TEST(IPermuteFunction, float_rank4_layout) +{ + const size_t input_pads[4] = {0, 0, 1, 2}; + const size_t output_pads[4] = {0, 3, 2, 1}; + const std::vector shapes{{1, 1, 4, 1}, {2, 1, 2, 3}, {1, 2, 1, 2}, {1, 1, 2, 3}}; + float expected_buffer[] = {1, 0, -1, -2, 3, -4, 5, -6, 7, -8, 9, -10, 11, -12, 13, -14, 15, -16}; + const auto type_info = TypeInfo(DataType::FLOAT32); + + std::vector> inputs(4); + std::vector> outputs(4); + std::vector> output_buffers(4); + for (size_t i = 0; i < 4; ++i) + { + Layout layout = Layout::NHWC; + Shape shape = shapes[i]; + if (i % 2 == 1) + { + layout = Layout::NCHW; + shape = Shape{shapes[i].dim(0), shapes[i].dim(3), shapes[i].dim(1), 
shapes[i].dim(2)}; + } + inputs[i] = std::make_unique(shape, type_info, layout, input_pads[i]); + inputs[i]->setBuffer(reinterpret_cast(expected_buffer)); + + if (layout == Layout::NHWC) + { + layout = Layout::NCHW; + shape = Shape{shapes[i].dim(0), shapes[i].dim(3), shapes[i].dim(1), shapes[i].dim(2)}; + } + else + { + layout = Layout::NHWC; + shape = shapes[i]; + } + outputs[i] = std::make_unique(shape, type_info, layout, output_pads[i]); + output_buffers[i] = std::make_unique(outputs[i]->total_size()); + outputs[i]->setBuffer(output_buffers[i].get()); + } + + auto mockup_layer = std::make_unique( + std::vector{inputs[0].get(), inputs[1].get(), inputs[2].get(), inputs[3].get()}, + std::vector{outputs[0].get(), outputs[1].get(), outputs[2].get(), outputs[3].get()}); + mockup_layer->run(); + + for (size_t i = 0; i < 4; ++i) + { + for (int32_t j = 0; j < shapes[i].dim(0); ++j) + { + for (int32_t k = 0; k < shapes[i].dim(1); ++k) + { + for (int32_t l = 0; l < shapes[i].dim(2); ++l) + { + for (int32_t m = 0; m < shapes[i].dim(3); ++m) + { + Coordinates input_coords; + Coordinates output_coords; + if (inputs[i]->layout() == Layout::NHWC) + { + input_coords = Coordinates{j, k, l, m}; + } + else + { + input_coords = Coordinates{j, m, k, l}; + } + if (outputs[i]->layout() == Layout::NHWC) + { + output_coords = Coordinates{j, k, l, m}; + } + else + { + output_coords = Coordinates{j, m, k, l}; + } + float result = *reinterpret_cast(outputs[i]->buffer() + + outputs[i]->calcOffset(output_coords)); + float expected = + *reinterpret_cast(inputs[i]->buffer() + inputs[i]->calcOffset(input_coords)); + EXPECT_EQ(result, expected); + } + } + } + } + } +} + +TEST(IPermuteFunction, float_to_qasymm8) +{ + const size_t input_pads[4] = {0, 0, 1, 2}; + const size_t output_pads[4] = {0, 3, 2, 1}; + const std::vector shapes{{1, 1, 4, 1}, {2, 1, 2, 3}, {1, 2, 1, 2}, {1, 1, 2, 3}}; + float expected_buffer[] = {10, 0, -10, -20, 30, -40, 50, -60, 70, -80, 90, -100}; + float scale = 10; + int32_t zero_point = 128; + + std::vector> inputs(4); + std::vector> outputs(4); + std::vector> output_buffers(4); + for (size_t i = 0; i < 4; ++i) + { + inputs[i] = std::make_unique(shapes[i], TypeInfo(DataType::FLOAT32), Layout::NHWC, + input_pads[i]); + inputs[i]->setBuffer(reinterpret_cast(expected_buffer)); + + TypeInfo type_info{DataType::QUANT_UINT8_ASYMM, scale, zero_point}; + outputs[i] = std::make_unique(shapes[i], type_info, Layout::NHWC, output_pads[i]); + output_buffers[i] = std::make_unique(outputs[i]->total_size()); + outputs[i]->setBuffer(output_buffers[i].get()); + } + + auto mockup_layer = std::make_unique( + std::vector{inputs[0].get(), inputs[1].get(), inputs[2].get(), inputs[3].get()}, + std::vector{outputs[0].get(), outputs[1].get(), outputs[2].get(), outputs[3].get()}); + mockup_layer->run(); + + for (size_t i = 0; i < 4; ++i) + { + for (int32_t j = 0; j < shapes[i].dim(0); ++j) + { + for (int32_t k = 0; k < shapes[i].dim(1); ++k) + { + for (int32_t l = 0; l < shapes[i].dim(2); ++l) + { + for (int32_t m = 0; m < shapes[i].dim(3); ++m) + { + Coordinates coords{j, k, l, m}; + uint8_t qasymm8 = + *reinterpret_cast(outputs[i]->buffer() + outputs[i]->calcOffset(coords)); + float result = (qasymm8 - zero_point) * scale; + float expected = + *reinterpret_cast(inputs[i]->buffer() + inputs[i]->calcOffset(coords)); + EXPECT_EQ(result, expected); + } + } + } + } + } +} + +TEST(IPermuteFunction, float_to_qsymm8) +{ + const size_t input_pads[4] = {0, 0, 1, 2}; + const size_t output_pads[4] = {0, 3, 2, 1}; + const std::vector 
shapes{{1, 1, 4, 1}, {2, 1, 2, 3}, {1, 2, 1, 2}, {1, 1, 2, 3}}; + float expected_buffer[] = {10, 0, -10, -20, 30, -40, 50, -60, 70, -80, 90, -100}; + float scale = 10; + int32_t zero_point = 0; + + std::vector> inputs(4); + std::vector> outputs(4); + std::vector> output_buffers(4); + for (size_t i = 0; i < 4; ++i) + { + inputs[i] = std::make_unique(shapes[i], TypeInfo(DataType::FLOAT32), Layout::NHWC, + input_pads[i]); + inputs[i]->setBuffer(reinterpret_cast(expected_buffer)); + + TypeInfo type_info{DataType::QUANT_INT8_SYMM, scale, zero_point}; + outputs[i] = std::make_unique(shapes[i], type_info, Layout::NHWC, output_pads[i]); + output_buffers[i] = std::make_unique(outputs[i]->total_size()); + outputs[i]->setBuffer(output_buffers[i].get()); + } + + auto mockup_layer = std::make_unique( + std::vector{inputs[0].get(), inputs[1].get(), inputs[2].get(), inputs[3].get()}, + std::vector{outputs[0].get(), outputs[1].get(), outputs[2].get(), outputs[3].get()}); + mockup_layer->run(); + + for (size_t i = 0; i < 4; ++i) + { + for (int32_t j = 0; j < shapes[i].dim(0); ++j) + { + for (int32_t k = 0; k < shapes[i].dim(1); ++k) + { + for (int32_t l = 0; l < shapes[i].dim(2); ++l) + { + for (int32_t m = 0; m < shapes[i].dim(3); ++m) + { + Coordinates coords{j, k, l, m}; + int8_t qsymm8 = + *reinterpret_cast(outputs[i]->buffer() + outputs[i]->calcOffset(coords)); + float result = (qsymm8 - zero_point) * scale; + float expected = + *reinterpret_cast(inputs[i]->buffer() + inputs[i]->calcOffset(coords)); + EXPECT_EQ(result, expected); + } + } + } + } + } +} + +TEST(IPermuteFunction, float_to_qsymm16) +{ + const size_t input_pads[4] = {0, 0, 1, 2}; + const size_t output_pads[4] = {0, 3, 2, 1}; + const std::vector shapes{{1, 1, 4, 1}, {2, 1, 2, 3}, {1, 2, 1, 2}, {1, 1, 2, 3}}; + float expected_buffer[] = {10, 0, -10, -20, 30, -40, 50, -60, 70, -80, 90, -100}; + float scale = 10; + int32_t zero_point = 0; + + std::vector> inputs(4); + std::vector> outputs(4); + std::vector> output_buffers(4); + for (size_t i = 0; i < 4; ++i) + { + inputs[i] = std::make_unique(shapes[i], TypeInfo(DataType::FLOAT32), Layout::NHWC, + input_pads[i]); + inputs[i]->setBuffer(reinterpret_cast(expected_buffer)); + + TypeInfo type_info{DataType::QUANT_INT16_SYMM, scale, zero_point}; + outputs[i] = std::make_unique(shapes[i], type_info, Layout::NHWC, output_pads[i]); + output_buffers[i] = std::make_unique(outputs[i]->total_size()); + outputs[i]->setBuffer(output_buffers[i].get()); + } + + auto mockup_layer = std::make_unique( + std::vector{inputs[0].get(), inputs[1].get(), inputs[2].get(), inputs[3].get()}, + std::vector{outputs[0].get(), outputs[1].get(), outputs[2].get(), outputs[3].get()}); + mockup_layer->run(); + + for (size_t i = 0; i < 4; ++i) + { + for (int32_t j = 0; j < shapes[i].dim(0); ++j) + { + for (int32_t k = 0; k < shapes[i].dim(1); ++k) + { + for (int32_t l = 0; l < shapes[i].dim(2); ++l) + { + for (int32_t m = 0; m < shapes[i].dim(3); ++m) + { + Coordinates coords{j, k, l, m}; + int16_t qsymm16 = + *reinterpret_cast(outputs[i]->buffer() + outputs[i]->calcOffset(coords)); + float result = (qsymm16 - zero_point) * scale; + float expected = + *reinterpret_cast(inputs[i]->buffer() + inputs[i]->calcOffset(coords)); + EXPECT_EQ(result, expected); + } + } + } + } + } +} + +TEST(IPermuteFunction, qasymm8_to_float) +{ + const size_t input_pads[4] = {0, 0, 1, 2}; + const size_t output_pads[4] = {0, 3, 2, 1}; + const std::vector shapes{{1, 1, 4, 1}, {2, 1, 2, 3}, {1, 2, 1, 2}, {1, 1, 2, 3}}; + float expected_buffer[] = {10, 0, 
-10, -20, 30, -40, 50, -60, 70, -80, 90, -100}; + float scale = 10; + int32_t zero_point = 128; + uint8_t input_buffer[12]; + + int32_t min_val = std::numeric_limits::min(); + int32_t max_val = std::numeric_limits::max(); + for (int32_t i = 0; i < sizeof(expected_buffer) / sizeof(float); ++i) + { + int32_t unclamped = static_cast(std::round(expected_buffer[i] / scale)) + zero_point; + input_buffer[i] = std::min(std::max(unclamped, min_val), max_val); + } + + std::vector> inputs(4); + std::vector> outputs(4); + std::vector> output_buffers(4); + for (size_t i = 0; i < 4; ++i) + { + TypeInfo type_info{DataType::QUANT_UINT8_ASYMM, scale, zero_point}; + inputs[i] = std::make_unique(shapes[i], type_info, Layout::NHWC, input_pads[i]); + inputs[i]->setBuffer(reinterpret_cast(input_buffer)); + + outputs[i] = std::make_unique(shapes[i], TypeInfo(DataType::FLOAT32), + Layout::NHWC, output_pads[i]); + output_buffers[i] = std::make_unique(outputs[i]->total_size()); + outputs[i]->setBuffer(output_buffers[i].get()); + } + + auto mockup_layer = std::make_unique( + std::vector{inputs[0].get(), inputs[1].get(), inputs[2].get(), inputs[3].get()}, + std::vector{outputs[0].get(), outputs[1].get(), outputs[2].get(), outputs[3].get()}); + mockup_layer->run(); + + for (size_t i = 0; i < 4; ++i) + { + for (int32_t j = 0; j < shapes[i].dim(0); ++j) + { + for (int32_t k = 0; k < shapes[i].dim(1); ++k) + { + for (int32_t l = 0; l < shapes[i].dim(2); ++l) + { + for (int32_t m = 0; m < shapes[i].dim(3); ++m) + { + Coordinates coords{j, k, l, m}; + float result = + *reinterpret_cast(outputs[i]->buffer() + outputs[i]->calcOffset(coords)); + uint8_t qasymm8 = + *reinterpret_cast(inputs[i]->buffer() + inputs[i]->calcOffset(coords)); + float expected = (qasymm8 - zero_point) * scale; + EXPECT_EQ(result, expected); + } + } + } + } + } +} + +TEST(IPermuteFunction, qsymm8_to_float) +{ + const size_t input_pads[4] = {0, 0, 1, 2}; + const size_t output_pads[4] = {0, 3, 2, 1}; + const std::vector shapes{{1, 1, 4, 1}, {2, 1, 2, 3}, {1, 2, 1, 2}, {1, 1, 2, 3}}; + float expected_buffer[] = {10, 0, -10, -20, 30, -40, 50, -60, 70, -80, 90, -100}; + float scale = 10; + int32_t zero_point = 0; + uint8_t input_buffer[12]; + + int32_t min_val = std::numeric_limits::min(); + int32_t max_val = std::numeric_limits::max(); + for (int32_t i = 0; i < sizeof(expected_buffer) / sizeof(float); ++i) + { + int32_t unclamped = static_cast(std::round(expected_buffer[i] / scale)) + zero_point; + input_buffer[i] = std::min(std::max(unclamped, min_val), max_val); + } + + std::vector> inputs(4); + std::vector> outputs(4); + std::vector> output_buffers(4); + for (size_t i = 0; i < 4; ++i) + { + TypeInfo type_info{DataType::QUANT_INT8_SYMM, scale, zero_point}; + inputs[i] = std::make_unique(shapes[i], type_info, Layout::NHWC, input_pads[i]); + inputs[i]->setBuffer(reinterpret_cast(input_buffer)); + + outputs[i] = std::make_unique(shapes[i], TypeInfo(DataType::FLOAT32), + Layout::NHWC, output_pads[i]); + output_buffers[i] = std::make_unique(outputs[i]->total_size()); + outputs[i]->setBuffer(output_buffers[i].get()); + } + + auto mockup_layer = std::make_unique( + std::vector{inputs[0].get(), inputs[1].get(), inputs[2].get(), inputs[3].get()}, + std::vector{outputs[0].get(), outputs[1].get(), outputs[2].get(), outputs[3].get()}); + mockup_layer->run(); + + for (size_t i = 0; i < 4; ++i) + { + for (int32_t j = 0; j < shapes[i].dim(0); ++j) + { + for (int32_t k = 0; k < shapes[i].dim(1); ++k) + { + for (int32_t l = 0; l < shapes[i].dim(2); ++l) + { + for 
(int32_t m = 0; m < shapes[i].dim(3); ++m) + { + Coordinates coords{j, k, l, m}; + float result = + *reinterpret_cast(outputs[i]->buffer() + outputs[i]->calcOffset(coords)); + int8_t qasymm8 = + *reinterpret_cast(inputs[i]->buffer() + inputs[i]->calcOffset(coords)); + float expected = (qasymm8 - zero_point) * scale; + EXPECT_EQ(result, expected); + } + } + } + } + } +} + +TEST(IPermuteFunction, qsymm16_to_float) +{ + const size_t input_pads[4] = {0, 0, 1, 2}; + const size_t output_pads[4] = {0, 3, 2, 1}; + const std::vector shapes{{1, 1, 4, 1}, {2, 1, 2, 3}, {1, 2, 1, 2}, {1, 1, 2, 3}}; + float expected_buffer[] = {10, 0, -10, -20, 30, -40, 50, -60, 70, -80, 90, -100}; + float scale = 10; + int32_t zero_point = 0; + uint8_t input_buffer[12]; + + int32_t min_val = std::numeric_limits::min(); + int32_t max_val = std::numeric_limits::max(); + for (int32_t i = 0; i < sizeof(expected_buffer) / sizeof(float); ++i) + { + int32_t unclamped = static_cast(std::round(expected_buffer[i] / scale)) + zero_point; + input_buffer[i] = std::min(std::max(unclamped, min_val), max_val); + } + + std::vector> inputs(4); + std::vector> outputs(4); + std::vector> output_buffers(4); + for (size_t i = 0; i < 4; ++i) + { + TypeInfo type_info{DataType::QUANT_INT16_SYMM, scale, zero_point}; + inputs[i] = std::make_unique(shapes[i], type_info, Layout::NHWC, input_pads[i]); + inputs[i]->setBuffer(reinterpret_cast(input_buffer)); + + outputs[i] = std::make_unique(shapes[i], TypeInfo(DataType::FLOAT32), + Layout::NHWC, output_pads[i]); + output_buffers[i] = std::make_unique(outputs[i]->total_size()); + outputs[i]->setBuffer(output_buffers[i].get()); + } + + auto mockup_layer = std::make_unique( + std::vector{inputs[0].get(), inputs[1].get(), inputs[2].get(), inputs[3].get()}, + std::vector{outputs[0].get(), outputs[1].get(), outputs[2].get(), outputs[3].get()}); + mockup_layer->run(); + + for (size_t i = 0; i < 4; ++i) + { + for (int32_t j = 0; j < shapes[i].dim(0); ++j) + { + for (int32_t k = 0; k < shapes[i].dim(1); ++k) + { + for (int32_t l = 0; l < shapes[i].dim(2); ++l) + { + for (int32_t m = 0; m < shapes[i].dim(3); ++m) + { + Coordinates coords{j, k, l, m}; + float result = + *reinterpret_cast(outputs[i]->buffer() + outputs[i]->calcOffset(coords)); + int16_t qasymm8 = + *reinterpret_cast(inputs[i]->buffer() + inputs[i]->calcOffset(coords)); + float expected = (qasymm8 - zero_point) * scale; + EXPECT_EQ(result, expected); + } + } + } + } + } +} + +TEST(IPermuteFunction, float_to_qasymm8_layout) +{ + const size_t input_pads[4] = {0, 0, 1, 2}; + const size_t output_pads[4] = {0, 3, 2, 1}; + const std::vector shapes{{1, 1, 4, 1}, {2, 1, 2, 3}, {1, 2, 1, 2}, {1, 1, 2, 3}}; + float expected_buffer[] = {10, 0, -10, -20, 30, -40, 50, -60, 70, + -80, 90, -100, 110, -120, 130, -140, 150, -160}; + float scale = 10; + int32_t zero_point = 128; + + std::vector> inputs(4); + std::vector> outputs(4); + std::vector> output_buffers(4); + for (size_t i = 0; i < 4; ++i) + { + Layout layout = Layout::NHWC; + Shape shape = shapes[i]; + if (i % 2 == 1) + { + layout = Layout::NCHW; + shape = Shape{shapes[i].dim(0), shapes[i].dim(3), shapes[i].dim(1), shapes[i].dim(2)}; + } + inputs[i] = + std::make_unique(shape, TypeInfo(DataType::FLOAT32), layout, input_pads[i]); + inputs[i]->setBuffer(reinterpret_cast(expected_buffer)); + + if (layout == Layout::NHWC) + { + layout = Layout::NCHW; + shape = Shape{shapes[i].dim(0), shapes[i].dim(3), shapes[i].dim(1), shapes[i].dim(2)}; + } + else + { + layout = Layout::NHWC; + shape = shapes[i]; + } + 
TypeInfo type_info{DataType::QUANT_UINT8_ASYMM, scale, zero_point}; + outputs[i] = std::make_unique(shape, type_info, layout, output_pads[i]); + output_buffers[i] = std::make_unique(outputs[i]->total_size()); + outputs[i]->setBuffer(output_buffers[i].get()); + } + + auto mockup_layer = std::make_unique( + std::vector{inputs[0].get(), inputs[1].get(), inputs[2].get(), inputs[3].get()}, + std::vector{outputs[0].get(), outputs[1].get(), outputs[2].get(), outputs[3].get()}); + mockup_layer->run(); + + for (size_t i = 0; i < 4; ++i) + { + for (int32_t j = 0; j < shapes[i].dim(0); ++j) + { + for (int32_t k = 0; k < shapes[i].dim(1); ++k) + { + for (int32_t l = 0; l < shapes[i].dim(2); ++l) + { + for (int32_t m = 0; m < shapes[i].dim(3); ++m) + { + Coordinates input_coords; + Coordinates output_coords; + if (inputs[i]->layout() == Layout::NHWC) + { + input_coords = Coordinates{j, k, l, m}; + } + else + { + input_coords = Coordinates{j, m, k, l}; + } + if (outputs[i]->layout() == Layout::NHWC) + { + output_coords = Coordinates{j, k, l, m}; + } + else + { + output_coords = Coordinates{j, m, k, l}; + } + uint8_t qasymm8 = *reinterpret_cast(outputs[i]->buffer() + + outputs[i]->calcOffset(output_coords)); + float result = (qasymm8 - zero_point) * scale; + float expected = + *reinterpret_cast(inputs[i]->buffer() + inputs[i]->calcOffset(input_coords)); + EXPECT_EQ(result, expected); + } + } + } + } + } +} + +TEST(IPermuteFunction, asymm8_to_float_layout) +{ + const size_t input_pads[4] = {0, 0, 1, 2}; + const size_t output_pads[4] = {0, 3, 2, 1}; + const std::vector shapes{{1, 1, 4, 1}, {2, 1, 2, 3}, {1, 2, 1, 2}, {1, 1, 2, 3}}; + float expected_buffer[] = {10, 0, -10, -20, 30, -40, 50, -60, 70, + -80, 90, -100, 110, -120, 130, -140, 150, -160}; + float scale = 10; + int32_t zero_point = 128; + uint8_t input_buffer[18]; + + int32_t min_val = std::numeric_limits::min(); + int32_t max_val = std::numeric_limits::max(); + for (int32_t i = 0; i < sizeof(expected_buffer) / sizeof(float); ++i) + { + int32_t unclamped = static_cast(std::round(expected_buffer[i] / scale)) + zero_point; + input_buffer[i] = std::min(std::max(unclamped, min_val), max_val); + } + + std::vector> inputs(4); + std::vector> outputs(4); + std::vector> output_buffers(4); + for (size_t i = 0; i < 4; ++i) + { + Layout layout = Layout::NHWC; + Shape shape = shapes[i]; + if (i % 2 == 1) + { + layout = Layout::NCHW; + shape = Shape{shapes[i].dim(0), shapes[i].dim(3), shapes[i].dim(1), shapes[i].dim(2)}; + } + TypeInfo type_info{DataType::QUANT_UINT8_ASYMM, scale, zero_point}; + inputs[i] = std::make_unique(shape, type_info, layout, input_pads[i]); + inputs[i]->setBuffer(reinterpret_cast(expected_buffer)); + + if (layout == Layout::NHWC) + { + layout = Layout::NCHW; + shape = Shape{shapes[i].dim(0), shapes[i].dim(3), shapes[i].dim(1), shapes[i].dim(2)}; + } + else + { + layout = Layout::NHWC; + shape = shapes[i]; + } + outputs[i] = + std::make_unique(shape, TypeInfo(DataType::FLOAT32), layout, output_pads[i]); + output_buffers[i] = std::make_unique(outputs[i]->total_size()); + outputs[i]->setBuffer(output_buffers[i].get()); + } + + auto mockup_layer = std::make_unique( + std::vector{inputs[0].get(), inputs[1].get(), inputs[2].get(), inputs[3].get()}, + std::vector{outputs[0].get(), outputs[1].get(), outputs[2].get(), outputs[3].get()}); + mockup_layer->run(); + + for (size_t i = 0; i < 4; ++i) + { + for (int32_t j = 0; j < shapes[i].dim(0); ++j) + { + for (int32_t k = 0; k < shapes[i].dim(1); ++k) + { + for (int32_t l = 0; l < shapes[i].dim(2); 
++l) + { + for (int32_t m = 0; m < shapes[i].dim(3); ++m) + { + Coordinates input_coords; + Coordinates output_coords; + if (inputs[i]->layout() == Layout::NHWC) + { + input_coords = Coordinates{j, k, l, m}; + } + else + { + input_coords = Coordinates{j, m, k, l}; + } + if (outputs[i]->layout() == Layout::NHWC) + { + output_coords = Coordinates{j, k, l, m}; + } + else + { + output_coords = Coordinates{j, m, k, l}; + } + float result = *reinterpret_cast(outputs[i]->buffer() + + outputs[i]->calcOffset(output_coords)); + uint8_t qasymm8 = *reinterpret_cast(inputs[i]->buffer() + + inputs[i]->calcOffset(input_coords)); + float expected = (qasymm8 - zero_point) * scale; + EXPECT_EQ(result, expected); + } + } + } + } + } +} + +} // namespace diff --git a/runtime/onert/core/src/exec/ParallelScheduler.cc b/runtime/onert/core/src/exec/ParallelScheduler.cc index 70c9c3dd6..456663f91 100644 --- a/runtime/onert/core/src/exec/ParallelScheduler.cc +++ b/runtime/onert/core/src/exec/ParallelScheduler.cc @@ -45,7 +45,7 @@ void ParallelScheduler::assign(std::unique_ptr &&fn, const backend::B void ParallelScheduler::finish() { - for (auto &itr : _thread_pools) + for (auto &&itr : _thread_pools) { itr.second->finish(); } diff --git a/runtime/onert/core/src/exec/SingleModelExecutors.cc b/runtime/onert/core/src/exec/SingleModelExecutors.cc new file mode 100644 index 000000000..4b954bab2 --- /dev/null +++ b/runtime/onert/core/src/exec/SingleModelExecutors.cc @@ -0,0 +1,61 @@ +/* + * Copyright (c) 2023 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
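
Editor's note on the IPermuteFunction tests above: every float-to-quantized and quantized-to-float case uses the same affine convention, q = clamp(round(v / scale) + zero_point, qmin, qmax) on the way in and v = (q - zero_point) * scale on the way out, with the expected buffers chosen as exact multiples of scale so the tests can use EXPECT_EQ without a tolerance; the *_layout variants additionally remap NHWC coordinates {n, h, w, c} to NCHW {n, c, h, w}, which is why their NCHW branches index with Coordinates{j, m, k, l}. The stand-alone sketch below is not part of the patch (plain C++ with made-up helper names); it reproduces just the quantization round trip with the tests' scale of 10 and QUANT_UINT8_ASYMM zero point of 128.

// Editor's sketch: affine quantization round trip as exercised by the tests above.
#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstdint>
#include <limits>

template <typename Q> Q quantize(float v, float scale, int32_t zero_point)
{
  const int32_t min_val = std::numeric_limits<Q>::min();
  const int32_t max_val = std::numeric_limits<Q>::max();
  int32_t unclamped = static_cast<int32_t>(std::round(v / scale)) + zero_point;
  return static_cast<Q>(std::min(std::max(unclamped, min_val), max_val));
}

template <typename Q> float dequantize(Q q, float scale, int32_t zero_point)
{
  return (static_cast<int32_t>(q) - zero_point) * scale;
}

int main()
{
  const float scale = 10.0f;
  const int32_t zero_point = 128; // QUANT_UINT8_ASYMM in the tests; the symmetric types use 0
  const float values[] = {10, 0, -10, -20, 30, -40};
  for (float v : values)
  {
    uint8_t q = quantize<uint8_t>(v, scale, zero_point);
    assert(dequantize(q, scale, zero_point) == v); // exact because v is a multiple of scale
  }
  return 0;
}
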
+ */ + +#include "SingleModelExecutors.h" + +#include "../backend/builtin/IOTensor.h" + +namespace onert +{ +namespace exec +{ + +void SingleModelExecutors::emplace(const ir::ModelIndex &, const ir::SubgraphIndex &subg_index, + std::unique_ptr exec) +{ + _executors.emplace(subg_index, std::move(exec)); +} + +IExecutor *SingleModelExecutors::at(const ir::ModelIndex &, + const ir::SubgraphIndex &subg_index) const +{ + return _executors.at(subg_index).get(); +} + +uint32_t SingleModelExecutors::inputSize() const +{ + return entryExecutor()->getInputTensors().size(); +} + +uint32_t SingleModelExecutors::outputSize() const +{ + return entryExecutor()->getOutputTensors().size(); +} + +const ir::OperandInfo &SingleModelExecutors::inputInfo(const ir::IOIndex &index) const +{ + return entryExecutor()->getInputTensors().at(index.value())->orig_info(); +} + +const ir::OperandInfo &SingleModelExecutors::outputInfo(const ir::IOIndex &index) const +{ + return entryExecutor()->getOutputTensors().at(index.value())->orig_info(); +} + +void SingleModelExecutors::execute(const IODescription &desc) { entryExecutor()->execute(desc); } + +} // namespace exec +} // namespace onert diff --git a/runtime/onert/core/src/exec/SingleModelExecutors.h b/runtime/onert/core/src/exec/SingleModelExecutors.h new file mode 100644 index 000000000..98d629eae --- /dev/null +++ b/runtime/onert/core/src/exec/SingleModelExecutors.h @@ -0,0 +1,70 @@ +/* + * Copyright (c) 2023 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#ifndef __ONERT_EXEC_SINGLE_MODEL_EXECUTORS_H__
+#define __ONERT_EXEC_SINGLE_MODEL_EXECUTORS_H__
+
+#include "exec/IExecutors.h"
+#include "ir/NNPkg.h"
+
+namespace onert
+{
+namespace exec
+{
+
+/**
+ * @brief Class to gather executor set for single model NN package
+ */
+class SingleModelExecutors : public IExecutors
+{
+public:
+  /**
+   * @brief Construct a new SingleModelExecutors object
+   */
+  SingleModelExecutors(void) = default;
+  SingleModelExecutors(const SingleModelExecutors &) = delete;
+  SingleModelExecutors(SingleModelExecutors &&) = default;
+
+  /**
+   * @brief Destroy the SingleModelExecutors object
+   */
+  ~SingleModelExecutors() = default;
+
+public:
+  void emplace(const ir::ModelIndex &model_index, const ir::SubgraphIndex &subg_index,
+               std::unique_ptr<IExecutor> exec) override;
+
+  IExecutor *at(const ir::ModelIndex &model_index,
+                const ir::SubgraphIndex &subg_index) const override;
+
+  uint32_t inputSize() const override;
+
+  uint32_t outputSize() const override;
+
+  const ir::OperandInfo &inputInfo(const ir::IOIndex &index) const override;
+
+  const ir::OperandInfo &outputInfo(const ir::IOIndex &index) const override;
+
+  void execute(const IODescription &desc) override;
+
+private:
+  std::unordered_map<ir::SubgraphIndex, std::unique_ptr<IExecutor>> _executors;
+};
+
+} // namespace exec
+} // namespace onert
+
+#endif // __ONERT_EXEC_SINGLE_MODEL_EXECUTORS_H__
diff --git a/runtime/onert/core/src/exec/ThreadPool.cc b/runtime/onert/core/src/exec/ThreadPool.cc
index c8e0e3265..bf85e59f6 100644
--- a/runtime/onert/core/src/exec/ThreadPool.cc
+++ b/runtime/onert/core/src/exec/ThreadPool.cc
@@ -48,7 +48,7 @@ uint32_t ThreadPool::numJobsInQueue() { return _worker.numJobsInQueue(); }
 void ThreadPool::join()
 {
-  for (auto &thread : _threads)
+  for (auto &&thread : _threads)
   {
     thread.join();
   }
diff --git a/runtime/onert/core/src/interp/Buffer.h b/runtime/onert/core/src/interp/Buffer.h
deleted file mode 100644
index 24938f74f..000000000
--- a/runtime/onert/core/src/interp/Buffer.h
+++ /dev/null
@@ -1,91 +0,0 @@
-/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
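
Editor's note: SingleModelExecutors above is a thin registry that ignores the ModelIndex, owns one IExecutor per SubgraphIndex, and forwards all I/O queries and execute() to the entry executor. The toy sketch below is not the onert API (simplified index and executor types); it only illustrates that ownership and lookup pattern in isolation.

// Editor's sketch: the ownership/lookup pattern used by SingleModelExecutors.
#include <cstdint>
#include <iostream>
#include <memory>
#include <unordered_map>

struct ToyExecutor
{
  virtual ~ToyExecutor() = default;
  virtual void execute() = 0;
};

struct AddExecutor : ToyExecutor
{
  void execute() override { std::cout << "running subgraph kernel\n"; }
};

class ToySingleModelExecutors
{
public:
  // The model index is ignored: a single-model package has exactly one model.
  void emplace(uint32_t /*model*/, uint32_t subg, std::unique_ptr<ToyExecutor> exec)
  {
    _executors.emplace(subg, std::move(exec));
  }
  ToyExecutor *at(uint32_t /*model*/, uint32_t subg) const { return _executors.at(subg).get(); }
  ToyExecutor *entryExecutor() const { return at(0, 0); } // subgraph 0 is the entry point
  void execute() { entryExecutor()->execute(); }

private:
  std::unordered_map<uint32_t, std::unique_ptr<ToyExecutor>> _executors;
};

int main()
{
  ToySingleModelExecutors execs;
  execs.emplace(0, 0, std::make_unique<AddExecutor>());
  execs.execute(); // delegates to the entry executor, as the patch's execute() does
  return 0;
}
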
- */ - -/** - * @file Buffer.h - * @brief This file contains Buffer interface and InternalBuffer, ExternalBuffer class - */ -#ifndef __ONERT_INTERP_BUFFER_H__ -#define __ONERT_INTERP_BUFFER_H__ - -#include - -#include "ir/Data.h" - -namespace onert -{ -namespace interp -{ - -/** - * @brief Interface for writable data area - */ -class Buffer : public ir::Data -{ -public: - /** - * @brief Return writable pointer for data area - * @return Writable pointer - */ - virtual uint8_t *baseWritable(void) const = 0; -}; - -/** - * @brief Class for internally allocated data area - */ -class InternalBuffer final : public Buffer -{ -public: - InternalBuffer(size_t size) : _base{std::make_unique(size)}, _size{size} - { - // DO NOTHING - } - -public: - size_t size(void) const override { return _size; } - const uint8_t *base(void) const override { return _base.get(); } - uint8_t *baseWritable(void) const override { return _base.get(); } - -private: - std::unique_ptr _base; - size_t _size; -}; - -/** - * @brief Class for data area from outside - */ -class ExternalBuffer final : public Buffer -{ -public: - ExternalBuffer(uint8_t *base, size_t size) : _base{base}, _size{size} - { - // DO NOTHING - } - -public: - size_t size(void) const override { return _size; } - const uint8_t *base(void) const override { return _base; } - uint8_t *baseWritable(void) const override { return _base; } - -private: - uint8_t *_base; - size_t _size; -}; - -} // namespace interp -} // namespace onert - -#endif // __ONERT_INTERP_BUFFER_H__ diff --git a/runtime/onert/core/src/interp/ExecEnv.h b/runtime/onert/core/src/interp/ExecEnv.h deleted file mode 100644 index 7f577ea6e..000000000 --- a/runtime/onert/core/src/interp/ExecEnv.h +++ /dev/null @@ -1,212 +0,0 @@ -/* - * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
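
Editor's note: the removed interp/Buffer.h separates storage the interpreter allocates itself (InternalBuffer, owned through a unique_ptr) from storage lent by the caller (ExternalBuffer, a non-owning view that must never be freed). A minimal stand-alone rendering of that split, under toy names rather than the onert types, is:

// Editor's sketch: owning vs. non-owning writable buffers, mirroring
// InternalBuffer/ExternalBuffer in the removed interp/Buffer.h.
#include <cstddef>
#include <cstdint>
#include <memory>

struct WritableBuffer
{
  virtual ~WritableBuffer() = default;
  virtual size_t size() const = 0;
  virtual uint8_t *base() const = 0;
};

// Allocates and owns its storage; freed automatically with the buffer object.
class OwningBuffer final : public WritableBuffer
{
public:
  explicit OwningBuffer(size_t size) : _base{std::make_unique<uint8_t[]>(size)}, _size{size} {}
  size_t size() const override { return _size; }
  uint8_t *base() const override { return _base.get(); }

private:
  std::unique_ptr<uint8_t[]> _base;
  size_t _size;
};

// Wraps memory provided by the caller (e.g. a model output buffer); never frees it.
class BorrowedBuffer final : public WritableBuffer
{
public:
  BorrowedBuffer(uint8_t *base, size_t size) : _base{base}, _size{size} {}
  size_t size() const override { return _size; }
  uint8_t *base() const override { return _base; }

private:
  uint8_t *_base;
  size_t _size;
};
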
- */ - -/** - * @file ExecEnv.h - * @brief This file contains ExecEnv to access interpreter tensor and execution status - */ -#ifndef __ONERT_INTERP_EXEC_ENV_H_ -#define __ONERT_INTERP_EXEC_ENV_H_ - -#include - -#include "ir/Graph.h" -#include "Tensor.h" - -namespace onert -{ -namespace interp -{ - -/** - * @brief Class to gather interpreter execution environment - * Each interpreter instance own execution environment - */ -class ExecEnv -{ -public: - /** - * @brief Construct a new Exec Env object (deleted) - */ - ExecEnv(void) = delete; - /** - * @brief Construct a new ExecEnv object - * @param[in] graph Graph to execute by interpreter - */ - explicit ExecEnv(const ir::Graph &graph) : _graph(graph) - { - // DO NOTHING - } - -public: - /** - * @brief Return graph to execute - * @return Graph - */ - const ir::Graph &graph(void) const { return _graph; } - /** - * @brief Assign tensor to environment which have allocated or assigned buffer - * @param[in] index Tensor index - * @param[in] tensor Tensor - */ - void assignTensor(const ir::OperandIndex index, std::shared_ptr tensor) - { - assert(tensor->bufferRO() != nullptr); - _tensors.emplace(index, tensor); - } - - /** - * @brief Return tensor pointer in environment - * @param[in] index Tensor index - * can_optional @c True if tensor can be optional input, otherwise @c false - * @return Tensor pointer - */ - const ITensor *tensorAt(const ir::OperandIndex index, bool can_optional = false) const - { - if (_tensors.find(index) == _tensors.end()) - { - // It may optional input, - // otherwise input is not set by runtime user - if (can_optional) - { - return nullptr; - } - - throw std::runtime_error{"ExecEnv: Input is not set"}; - } - - return _tensors.at(index).get(); - } - - /** - * @brief Check environment contains tensor - * @param[in] index Tensor index - * @return @c true if environment contain tensor, otherwise @c false - */ - bool contains(const ir::OperandIndex index) const - { - return (_tensors.find(index) != _tensors.end()); - } - - /** - * @brief Allocate tensor using operand info - * @param[in] index Tensor index - * @param[in] info Operand info - * @note If already allocated, just return - * @TODO More smart allocation policy - */ - void allocateIfNeeded(const ir::OperandIndex index, const ir::OperandInfo &info) - { - // already allocated, or constant - if (contains(index)) - { - return; - } - - // Buffer from external (ex. 
model output) - auto tensor = std::make_shared(info); - if (isExtBuffer(index)) - { - tensor->setBuffer(_external_buffers.at(index)); - assignTensor(index, tensor); - - return; - } - - tensor->setBuffer(std::make_shared(tensor->total_size())); - assignTensor(index, tensor); - _buffers.insert(index); - } - - /** - * @brief Allocate read-only tensor and share data with other tensor - * @param[in] index Tensor index - * @param[in] info Operand info - * @param[in] index_to_share Tensor index that have data to share - */ - void allocateAndShareIfNeeded(const ir::OperandIndex index, const ir::OperandInfo &info, - const ir::OperandIndex index_to_share) - { - if (!contains(index_to_share)) - { - throw std::runtime_error{"Cannot find tensor to share data"}; - } - - // already allocated - if (contains(index)) - { - return; - } - - if (isExtBuffer(index)) - { - auto tensor = std::make_shared(info); - tensor->setBuffer(_external_buffers.at(index)); - assignTensor(index, tensor); - } - else - { - auto tensor = std::make_shared(info); - tensor->setData(tensorAt(index_to_share)->shareData()); - assignTensor(index, tensor); - _buffers.insert(index); - } - } - - /** - * @brief Free buffer if allocated by allocateIfNeed - * @param[in] index Tensor index - * @note If allocated by outside, just return - */ - void freeIfAllocated(const ir::OperandIndex index) - { - if (_buffers.find(index) != _buffers.end()) - { - _tensors.at(index)->releaseData(); - } - } - - /** - * @brief Assign ExternalBuffer into external buffer map - * @param[in] index Tensor index - * @param[in] buffer External buffer - */ - void assignExternalBuffer(const ir::OperandIndex index, std::shared_ptr buffer) - { - _external_buffers.emplace(index, buffer); - } - -private: - bool isExtBuffer(const ir::OperandIndex index) - { - return (_external_buffers.find(index) != _external_buffers.end()); - } - -private: - const ir::Graph &_graph; - // Tensor map to use in interpreter - // It should map tensors that have allocated or assigned buffer pointer - std::unordered_map> _tensors; - // Tensors allocated by allocateIfNeed (buffer) - std::unordered_set _buffers; - // Tensor buffer from external - std::unordered_map> _external_buffers; -}; - -} // namespace interp -} // namespace onert - -#endif // __ONERT_INTERP_EXEC_ENV_H_ diff --git a/runtime/onert/core/src/interp/InterpExecutor.cc b/runtime/onert/core/src/interp/InterpExecutor.cc deleted file mode 100644 index f04777174..000000000 --- a/runtime/onert/core/src/interp/InterpExecutor.cc +++ /dev/null @@ -1,127 +0,0 @@ -/* - * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
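
Editor's note: ExecEnv::allocateIfNeeded and freeIfAllocated above implement an "only free what you allocated" policy; operands backed by externally supplied buffers are bound but never owned, while internally allocated operands are tracked so they can be released once their last use has executed. A compact sketch of that bookkeeping, using toy index and storage types instead of the onert classes, is:

// Editor's sketch: lazy allocation with external-buffer pass-through.
#include <cstddef>
#include <cstdint>
#include <unordered_map>
#include <utility>
#include <vector>

using Index = uint32_t;

class ToyEnv
{
public:
  // Bind a caller-provided buffer; the environment must never free it.
  void bindExternal(Index idx, uint8_t *buf, size_t size) { _external[idx] = {buf, size}; }

  // Get (or lazily allocate) storage for an operand.
  uint8_t *allocateIfNeeded(Index idx, size_t size)
  {
    auto ext = _external.find(idx);
    if (ext != _external.end())
      return ext->second.first; // external buffer: reuse, do not own
    auto it = _owned.find(idx);
    if (it == _owned.end())
      it = _owned.emplace(idx, std::vector<uint8_t>(size)).first; // internal: allocate and own
    return it->second.data();
  }

  // Release storage only if this environment allocated it.
  void freeIfAllocated(Index idx) { _owned.erase(idx); }

private:
  std::unordered_map<Index, std::pair<uint8_t *, size_t>> _external;
  std::unordered_map<Index, std::vector<uint8_t>> _owned;
};
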
- */ - -#include "InterpExecutor.h" - -#include "ExecEnv.h" -#include "Interpreter.h" - -#include "util/logging.h" - -#include - -namespace onert -{ -namespace interp -{ - -void InterpExecutor::execute(const exec::IODescription &desc) -{ - /************************************************************************ - * Prepare execution model (submodel) - It may execute divided model - but now consider model inference is done at interpreter - ***********************************************************************/ - ir::OperandIndexMap> tensor_map; - - for (uint32_t n = 0; n < _graph.getInputs().size(); n++) - { - ir::IOIndex index{n}; - const auto input_index = _graph.getInputs().at(index); - - const auto input = desc.inputs.at(n).get(); - if (input == nullptr) - { - // Optional input - continue; - } - - auto input_tensor = std::make_shared(input->info); - input_tensor->setData(std::make_shared( - reinterpret_cast(input->buffer), input->size)); - tensor_map[input_index] = input_tensor; - } - - /************************************************************************ - * Prepare execution environment - Execution environment will be assigned to invoked interpreter instance - ***********************************************************************/ - - std::unique_ptr interp_env = std::make_unique(_graph); - - // Assign input/output tensor into interpreter execution environment - for (auto index : _graph.getInputs()) - { - if (tensor_map.find(index) != tensor_map.end()) - { - VERBOSE(INTERPRETER) << "Assign input tensor. operand index:" << index << std::endl; - interp_env->assignTensor(index, tensor_map.at(index)); - } - } - - for (uint32_t n = 0; n < _graph.getOutputs().size(); n++) - { - ir::IOIndex index{n}; - const auto output_index = _graph.getOutputs().at(index); - const auto output = desc.outputs.at(n).get(); - if (output == nullptr) - { - // Optional output - continue; - } - - VERBOSE(INTERPRETER) << "Set out buffer to ExecEnv. operand index:" << output_index.value() - << std::endl; - - interp_env->assignExternalBuffer( - output_index, - std::make_shared(reinterpret_cast(output->buffer), output->size)); - } - - // Allocate constant tensor - _graph.operands().iterate([&](const ir::OperandIndex &ind, const ir::Operand &obj) { - if (obj.isConstant()) - { - VERBOSE(INTERPRETER) << "Allocate and assign constant tensor. operand index:" << ind - << std::endl; - - assert(obj.data()); - auto const_tensor = std::make_shared(obj.info()); - // Assume that interpreter's tensor layout is same with model (NHWC) - const_tensor->setData( - std::make_shared(obj.data()->base(), obj.info().total_size())); - interp_env->assignTensor(ind, const_tensor); - } - }); - - /***************************************************************************** - * Invoke interpreter - ****************************************************************************/ - - interp::Interpreter interp(std::move(interp_env)); - interp.run(); - - /***************************************************************************** - * Invoked interpreter run is finished - ****************************************************************************/ - - // If interpreter execute submodel - // 1. Get tensor output of submodel into tensor_map to save result - // 2. 
Generate new ExecEnv for next interpretation -} - -} // namespace interp -} // namespace onert diff --git a/runtime/onert/core/src/interp/InterpExecutor.h b/runtime/onert/core/src/interp/InterpExecutor.h deleted file mode 100644 index d6d5dd0a3..000000000 --- a/runtime/onert/core/src/interp/InterpExecutor.h +++ /dev/null @@ -1,89 +0,0 @@ -/* - * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/** - * @file InterpExecutor.h - * @brief This file contains InterpExecutor class\n - * to manage interpreter execution and environment - */ -#ifndef __ONERT_INTERP_INTERP_EXECUTOR_H__ -#define __ONERT_INTERP_INTERP_EXECUTOR_H__ - -#include "ir/OperandIndexMap.h" -#include "ir/Graph.h" -#include "exec/IExecutor.h" - -namespace onert -{ -namespace interp -{ - -class ITensor; - -/** - * @brief Class to execute model using interpreter - */ -class InterpExecutor final : public exec::IExecutor -{ -public: - explicit InterpExecutor(const ir::Graph &graph) : _graph(graph) - { - // DO NOTHING - } - -public: - /** - * @brief Return graph object - * @return Graph object - */ - const ir::Graph &graph() final { return _graph; } - - const ir::Graph &parent_graph() final - { - throw new std::runtime_error{"Interpreter does not support this function."}; - } - void setIndexedRanks(std::shared_ptr>) override{ - // Not implemented - }; - /** - * @brief Start execution - * @note It should be called after setting input and output buffer - */ - void execute(const exec::IODescription &desc) final; - void execute(const std::vector &, - const std::vector &) final - { - throw new std::runtime_error{"Interpreter does not support subgraph calls(control flow ops)"}; - } - const std::vector &getOutputTensors() const final - { - throw new std::runtime_error{"Interpreter does not support this function."}; - } - -private: - /** - * @brief Copy of target graph for lowering - * @note It uses copy of graph, not reference. - * Original graph may be deallocated by frontend. - */ - const ir::Graph _graph; - ir::OperandIndexMap> _tensor_map; -}; - -} // namespace interp -} // namespace onert - -#endif // __ONERT_INTERP_INTERP_EXECUTOR_H__ diff --git a/runtime/onert/core/src/interp/InterpExecutor.test.cc b/runtime/onert/core/src/interp/InterpExecutor.test.cc deleted file mode 100644 index 9f95ffee0..000000000 --- a/runtime/onert/core/src/interp/InterpExecutor.test.cc +++ /dev/null @@ -1,355 +0,0 @@ -/* - * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "InterpExecutor.h" - -#include "exec/Execution.h" -#include "ir/Graph.h" -#include "ir/operation/BinaryArithmetic.h" - -#include - -#include - -namespace -{ - -using namespace onert::ir; -using InterpExecutor = onert::interp::InterpExecutor; -using Execution = onert::exec::Execution; -using Executors = onert::exec::Executors; - -class InterpExecutorTest : public ::testing::Test -{ -protected: - virtual void SetUp() {} - void CreateSimpleModel() - { - // Model: one elementwise add operation - // model input: lhs, rhs - // model output: add result - // lhs, rhs, result shape: {1, 2, 2, 1} - // activation: none (constant) - _graph = std::make_unique(); - - // Add operands - - Shape shape{1, 2, 2, 1}; - TypeInfo type{DataType::INT32}; - Shape shape_scalar(0); - TypeInfo type_scalar{DataType::INT32}; - - auto operand_lhs = _graph->addOperand(shape, type); - auto operand_rhs = _graph->addOperand(shape, type); - auto operand_result = _graph->addOperand(shape, type); - - // Add operations - - operation::BinaryArithmetic::Param param; - param.arithmetic_type = operation::BinaryArithmetic::ArithmeticType::ADD; - param.activation = Activation::NONE; - auto input_set = OperandIndexSequence{operand_lhs, operand_rhs}; - auto output_set = OperandIndexSequence{operand_result}; - _graph->addOperation( - std::make_unique(input_set, output_set, param)); - - // Identify model inputs and outputs - - _graph->getInputs().append(operand_lhs); - _graph->getInputs().append(operand_rhs); - _graph->getOutputs().append(operand_result); - - _graph->verify(); - - auto model = std::make_shared(); - model->push(onert::ir::SubgraphIndex{0}, _graph); - - _executors = std::make_shared(); - _executors->emplace(onert::ir::SubgraphIndex{0}, std::make_unique(*_graph)); - } - - void CreateTwoStepModel() - { - // Model: two elementwise add operation - // model input: lhs, rhs1 - // model output: second add result (result2) - // constant: rhs2 - // result1 <= (lhs + rhs) - // result2 <= (result1 + rhs2) - // lhs, rhs1, rh2, result1, result2 shape: {1, 2, 2, 1} - // activation: none (constant) - _graph = std::make_unique(); - - // 1st add operands (result1 <= lhs + rhs1) - - Shape shape{1, 2, 2, 1}; - TypeInfo type{DataType::INT32}; - Shape shape_scalar(0); - TypeInfo type_scalar{DataType::INT32}; - - static int32_t rhs2_data[4] = {3, 1, -1, 5}; - - auto operand_lhs = _graph->addOperand(shape, type); - auto operand_rhs1 = _graph->addOperand(shape, type); - auto operand_result1 = _graph->addOperand(shape, type); - auto operand_rhs2 = _graph->addOperand(shape, type); - auto operand_result2 = _graph->addOperand(shape, type); - _graph->operands() - .at(operand_rhs2) - .data(std::make_unique(reinterpret_cast(&rhs2_data), 16)); - - // 2nd add operations (result2 <= result1 + rhs2) - - operation::BinaryArithmetic::Param param1; - param1.arithmetic_type = operation::BinaryArithmetic::ArithmeticType::ADD; - param1.activation = Activation::NONE; - auto input_set1 = OperandIndexSequence{operand_lhs, operand_rhs1}; - auto output_set1 = OperandIndexSequence{operand_result1}; - _graph->addOperation( - std::make_unique(input_set1, output_set1, param1)); - - operation::BinaryArithmetic::Param param2; - param2.arithmetic_type = operation::BinaryArithmetic::ArithmeticType::ADD; - param2.activation = Activation::NONE; - auto input_set2 = OperandIndexSequence{operand_result1, operand_rhs2}; - auto output_set2 = 
OperandIndexSequence{operand_result2}; - _graph->addOperation( - std::make_unique(input_set2, output_set2, param2)); - - // Identify model inputs and outputs - - _graph->getInputs().append(operand_lhs); - _graph->getInputs().append(operand_rhs1); - _graph->getOutputs().append(operand_result2); - - _graph->verify(); - - auto model = std::make_shared(); - model->push(onert::ir::SubgraphIndex{0}, _graph); - - _executors = std::make_shared(); - _executors->emplace(onert::ir::SubgraphIndex{0}, std::make_unique(*_graph)); - } - - void CreateUnspecifiedDimensionsModel() - { - // Model: one elementwise add operation - // model input: lhs, rhs - // model output: add result - // lhs, rhs, result shape: {1, unknown, 2, 1} - // activation: none (constant) - _graph = std::make_unique(); - - // Add operands - - Shape shape{1, 0, 2, 1}; - TypeInfo type{DataType::INT32}; - Shape shape_scalar(0); - TypeInfo type_scalar{DataType::INT32}; - - auto operand_lhs = _graph->addOperand(shape, type); - auto operand_rhs = _graph->addOperand(shape, type); - - auto operand_activation = _graph->addOperand(shape_scalar, type_scalar); - _graph->operands() - .at(operand_activation) - .data(std::make_unique(reinterpret_cast(&_activation_value), 4)); - - auto operand_result = _graph->addOperand(shape, type); - - // Add operations - - operation::BinaryArithmetic::Param param; - param.arithmetic_type = operation::BinaryArithmetic::ArithmeticType::ADD; - param.activation = Activation::NONE; - auto input_set = OperandIndexSequence{operand_lhs, operand_rhs}; - auto output_set = OperandIndexSequence{operand_result}; - _graph->addOperation( - std::make_unique(input_set, output_set, param)); - - // Identify model inputs and outputs - - _graph->getInputs().append(operand_lhs); - _graph->getInputs().append(operand_rhs); - _graph->getOutputs().append(operand_result); - - _graph->verify(); - - auto model = std::make_shared(); - model->push(onert::ir::SubgraphIndex{0}, _graph); - - _executors = std::make_shared(); - _executors->emplace(onert::ir::SubgraphIndex{0}, std::make_unique(*_graph)); - } - - void createExecution() { _execution = std::make_unique(_executors); } - - virtual void TearDown() { _executors = nullptr; } - - std::shared_ptr _graph{nullptr}; - std::shared_ptr _executors{nullptr}; - std::unique_ptr _execution{nullptr}; - const int32_t _activation_value{0}; -}; - -TEST_F(InterpExecutorTest, create_empty) -{ - Graph graph; - graph.verify(); - auto executor = std::make_unique(graph); - ASSERT_NE(executor, nullptr); -} - -TEST_F(InterpExecutorTest, create_simple) -{ - CreateSimpleModel(); - ASSERT_NE(_executors, nullptr); - ASSERT_NE(_executors->at(onert::ir::SubgraphIndex{0}), nullptr); -} - -TEST_F(InterpExecutorTest, neg_setInput) -{ - CreateSimpleModel(); - createExecution(); - - auto input1 = IOIndex{0}; - const int32_t input1_buffer[4] = {1, 0, -1, -2}; - - EXPECT_THROW(_execution->setInput(input1, reinterpret_cast(input1_buffer), 4), - std::runtime_error); - EXPECT_THROW(_execution->setInput(input1, reinterpret_cast(input1_buffer), 12), - std::runtime_error); - EXPECT_NO_THROW(_execution->setInput(input1, reinterpret_cast(input1_buffer), 16)); -} - -TEST_F(InterpExecutorTest, neg_setOutput) -{ - CreateSimpleModel(); - createExecution(); - - auto output = IOIndex{0}; - auto output_idx = _graph->getOutputs().at(output); - - int32_t output_buffer[4] = {}; - - EXPECT_THROW(_execution->setOutput(output, reinterpret_cast(output_buffer), 4), - std::runtime_error); - EXPECT_THROW(_execution->setOutput(output, 
reinterpret_cast(output_buffer), 12), - std::runtime_error); - EXPECT_NO_THROW(_execution->setOutput(output, reinterpret_cast(output_buffer), 16)); -} - -TEST_F(InterpExecutorTest, neg_setInputForUnspecifiedDimensions) -{ - CreateUnspecifiedDimensionsModel(); - createExecution(); - - auto input1 = IOIndex{0}; - const int32_t input1_buffer[4] = {1, 0, -1, -2}; - - TypeInfo operand_type{DataType::INT32}; - Shape operand_shape{1, 2, 2, 1}; - - EXPECT_THROW(_execution->setInput(input1, operand_type, operand_shape, - reinterpret_cast(input1_buffer), 4), - std::runtime_error); - EXPECT_THROW(_execution->setInput(input1, operand_type, operand_shape, - reinterpret_cast(input1_buffer), 12), - std::runtime_error); - EXPECT_NO_THROW(_execution->setInput(input1, operand_type, operand_shape, - reinterpret_cast(input1_buffer), 16)); -} - -TEST_F(InterpExecutorTest, neg_setOutputForUnspecifiedDimensions) -{ - CreateUnspecifiedDimensionsModel(); - createExecution(); - - auto output = IOIndex{0}; - auto output_idx = _graph->getOutputs().at(output); - - TypeInfo operand_type{DataType::INT32}; - Shape operand_shape{1, 2, 2, 1}; - - int32_t output_buffer[4] = {}; - - EXPECT_THROW(_execution->setOutput(output, operand_type, operand_shape, - reinterpret_cast(output_buffer), 4), - std::runtime_error); - EXPECT_THROW(_execution->setOutput(output, operand_type, operand_shape, - reinterpret_cast(output_buffer), 12), - std::runtime_error); - EXPECT_NO_THROW(_execution->setOutput(output, operand_type, operand_shape, - reinterpret_cast(output_buffer), 16)); -} - -TEST_F(InterpExecutorTest, execute) -{ - CreateSimpleModel(); - createExecution(); - - auto input1 = IOIndex{0}; - auto input2 = IOIndex{1}; - auto input1_idx = _graph->getInputs().at(input1); - auto input2_idx = _graph->getInputs().at(input2); - - const int32_t input1_buffer[4] = {1, 0, -1, -2}; - const int32_t input2_buffer[4] = {1, -3, 2, -4}; - - auto output = IOIndex{0}; - auto output_idx = _graph->getOutputs().at(output); - - int32_t output_buffer[4] = {}; - - EXPECT_NO_THROW(_execution->setInput(input1, reinterpret_cast(input1_buffer), 16)); - EXPECT_NO_THROW(_execution->setInput(input2, reinterpret_cast(input2_buffer), 16)); - EXPECT_NO_THROW(_execution->setOutput(output, reinterpret_cast(output_buffer), 16)); - EXPECT_NO_THROW(_execution->execute()); - EXPECT_EQ(output_buffer[0], 2); - EXPECT_EQ(output_buffer[1], -3); - EXPECT_EQ(output_buffer[2], 1); - EXPECT_EQ(output_buffer[3], -6); -} - -TEST_F(InterpExecutorTest, executeTwoStep) -{ - CreateTwoStepModel(); - createExecution(); - - auto input1 = IOIndex{0}; - auto input2 = IOIndex{1}; - auto input1_idx = _graph->getInputs().at(input1); - auto input2_idx = _graph->getInputs().at(input2); - - const int32_t input1_buffer[4] = {1, 0, -1, -2}; - const int32_t input2_buffer[4] = {1, -3, 2, -4}; - - auto output = IOIndex{0}; - auto output_idx = _graph->getOutputs().at(output); - - int32_t output_buffer[4] = {}; - - EXPECT_NO_THROW(_execution->setInput(input1, reinterpret_cast(input1_buffer), 16)); - EXPECT_NO_THROW(_execution->setInput(input2, reinterpret_cast(input2_buffer), 16)); - EXPECT_NO_THROW(_execution->setOutput(output, reinterpret_cast(output_buffer), 16)); - EXPECT_NO_THROW(_execution->execute()); - EXPECT_EQ(output_buffer[0], 5); - EXPECT_EQ(output_buffer[1], -2); - EXPECT_EQ(output_buffer[2], 0); - EXPECT_EQ(output_buffer[3], -1); -} - -} // namespace diff --git a/runtime/onert/core/src/interp/InterpOps.lst b/runtime/onert/core/src/interp/InterpOps.lst deleted file mode 100644 index 
0714df38a..000000000 --- a/runtime/onert/core/src/interp/InterpOps.lst +++ /dev/null @@ -1,73 +0,0 @@ -/* - * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef INTERP_OP -#error Define INTERP_OP before including this file -#endif - -// Supported operation name in interpreter -// -// Same list with Operations.lst -// Make comment out if operation is not supported in interpreter -INTERP_OP(BinaryArithmetic) -//INTERP_OP(BatchToSpaceND) -//INTERP_OP(Cast) -INTERP_OP(Conv2D) -INTERP_OP(DepthwiseConv2D) -INTERP_OP(Pool2D) -INTERP_OP(Concat) -INTERP_OP(FullyConnected) -//INTERP_OP(Reduce) -INTERP_OP(Reshape) -INTERP_OP(Softmax) -//INTERP_OP(Squeeze) -//INTERP_OP(Slice) -//INTERP_OP(StridedSlice) -INTERP_OP(ElementwiseActivation) -//INTERP_OP(Transpose) -//INTERP_OP(Exp) -//INTERP_OP(Comparison) -//INTERP_OP(LogicalNot) -//INTERP_OP(LSTM) -//INTERP_OP(RSQRT) -//INTERP_OP(ResizeBilinear) -//INTERP_OP(RNN) -//INTERP_OP(Floor) -//INTERP_OP(SpaceToBatchND) -//INTERP_OP(SpaceToDepth) -//INTERP_OP(EmbeddingLookup) -//INTERP_OP(L2Normalization) -//INTERP_OP(HashtableLookup) -INTERP_OP(InstanceNorm) -//INTERP_OP(PReLU) -INTERP_OP(TransposeConv) -//INTERP_OP(SQRT) -//INTERP_OP(SquaredDifference) -//INTERP_OP(TopKV2) -INTERP_OP(Gather) -//INTERP_OP(Neg) -//INTERP_OP(Abs) -//INTERP_OP(ArgMax) -//INTERP_OP(Dequantize) -//INTERP_OP(LocalResponseNormalization) -//INTERP_OP(DepthToSpace) -//INTERP_OP(Pack) -//INTERP_OP(Split) -//INTERP_OP(Unpack) -INTERP_OP(Pad) -//INTERP_OP(Custom) -//INTERP_OP(Permute) -//INTERP_OP(OneHot) diff --git a/runtime/onert/core/src/interp/Interpreter.cc b/runtime/onert/core/src/interp/Interpreter.cc deleted file mode 100644 index e01afb8a6..000000000 --- a/runtime/onert/core/src/interp/Interpreter.cc +++ /dev/null @@ -1,184 +0,0 @@ -/* - * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
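
Editor's note: InterpOps.lst above is consumed through the X-macro pattern; each includer defines INTERP_OP to expand every listed operation into whatever it needs (kernel-getter declarations in Registration.h, kernel-map population in the OperationExecutor constructor below). The self-contained sketch below shows the same mechanism with an inline list macro and invented op names, not the onert macros.

// Editor's sketch: the X-macro pattern behind InterpOps.lst, kept in one file
// by replacing the #include with a list macro.
#include <iostream>
#include <string>
#include <utility>
#include <vector>

#define TOY_OP_LIST(OP)                                                                            \
  OP(Add)                                                                                          \
  OP(Conv2D)                                                                                       \
  OP(Softmax)

// Expansion 1: declare and define one handler per op (Registration.h declares get##Name()).
#define DECLARE_OP(Name) void run##Name();
TOY_OP_LIST(DECLARE_OP)
#undef DECLARE_OP

#define DEFINE_OP(Name)                                                                            \
  void run##Name() { std::cout << "running " #Name "\n"; }
TOY_OP_LIST(DEFINE_OP)
#undef DEFINE_OP

// Expansion 2: build a dispatch table from the same list (Interpreter.cc fills _kernels this way).
int main()
{
  std::vector<std::pair<std::string, void (*)()>> kernels;
#define REGISTER_OP(Name) kernels.emplace_back(#Name, &run##Name);
  TOY_OP_LIST(REGISTER_OP)
#undef REGISTER_OP
  for (auto &entry : kernels)
    entry.second(); // dispatch every registered kernel once
  return 0;
}
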
- */ - -#include "Interpreter.h" - -#include -#include - -#include "Registration.h" - -#include "ir/OperandIndexMap.h" -#include "util/logging.h" -#include "ir/OperationVisitor.h" - -namespace onert -{ -namespace interp -{ - -// TODO more structured execution kernel implementation -// TODO use cker for execution -// TODO divide tensor prepare and execution -// TODO introduce memory manager (buffer allocate and free) -class OperationExecutor -{ -public: - OperationExecutor(ExecEnv *env) : _env{env} - { -#define INTERP_OP(InternalName) _kernels[ir::OpCode::InternalName] = get##InternalName(); -#include "InterpOps.lst" -#undef INTERP_OP - } - - void execute(const ir::OperationIndex &idx) - { - const ir::Operation &node = _env->graph().operations().at(idx); - const auto nodeName = node.name(); - VERBOSE(INTERPRETER) << "Prepare output operands and execute " << nodeName - << " operation (id: " << idx << ")" << std::endl; - - const auto nodeOpCode = node.opcode(); - if (_kernels.find(nodeOpCode) == _kernels.end()) - { - throw std::runtime_error{"Interpreter: Operation " + nodeName + " is not yet implemented"}; - } - - if (_kernels[nodeOpCode]->prepare != nullptr) - { - _kernels[nodeOpCode]->prepare(_env, node); - } - _kernels[nodeOpCode]->invoke(_env, node); - } - -private: - ExecEnv *_env; - std::unordered_map _kernels; -}; - -void Interpreter::run() -{ - VERBOSE(INTERPRETER) << "Interpreter is invoked " << std::endl; - - // operand_stack: save operands prepared to use - std::stack operand_stack; - - // Note: We should push input first, then constant. - // We use use-def for find operators ready to execution, - // but Use-Def cannot handle parameters (maybe constant, but not always) - // Note: If all model inputs are constant, it may not work (depend on tensors' order). 
- // But that scenario may not exist - for (auto ind : _env->graph().getInputs()) - { - VERBOSE(INTERPRETER) << "Input: Push to operand stack " << ind << std::endl; - - operand_stack.push(ind); - } - - _env->graph().operands().iterate([&](const ir::OperandIndex &ind, const ir::Operand &obj) { - if (obj.isConstant()) - { - VERBOSE(INTERPRETER) << "Constant: Push to operand stack " << ind << std::endl; - - operand_stack.push(ind); - } - }); - - // Execution - std::unordered_set ready_check; - std::unordered_set executed; - OperationExecutor executor{_env.get()}; - while (!operand_stack.empty()) - { - const auto current_operand_index = operand_stack.top(); - operand_stack.pop(); - VERBOSE(INTERPRETER) << "Poped operand " << current_operand_index.value() - << " is checked ready to use" << std::endl; - - assert(ready_check.find(current_operand_index) == ready_check.end()); - ready_check.insert(current_operand_index); - - // Find prepared operations by scan use of current operand - std::stack operation_stack; - const auto use_operators = _env->graph().operands().at(current_operand_index).getUses(); - for (const auto &use_operator : use_operators) - { - // Assumption: all parameters are ready to use - bool operator_ready = true; - for (auto input_index : _env->graph().operations().at(use_operator).getInputs()) - { - if (ready_check.find(input_index) == ready_check.end()) - { - operator_ready = false; - break; - } - } - - if (operator_ready) - { - VERBOSE(INTERPRETER) << "Ready to execute operation " << use_operator << std::endl; - operation_stack.push(use_operator); - } - } - - while (!operation_stack.empty()) - { - const auto current_operation_index = operation_stack.top(); - operation_stack.pop(); - VERBOSE(INTERPRETER) << "Poped operation: " << current_operation_index << "(" - << _env->graph().operations().at(current_operation_index).name() << ")" - << std::endl; - - // execution - // 1. Prepare output tensor - // 2. Call operation kernel - executor.execute(current_operation_index); - executed.insert(current_operation_index); - - // 3. Push each output into operand stack - const auto def_operands = _env->graph().operations().at(current_operation_index).getOutputs(); - for (auto def_operand : def_operands) - { - VERBOSE(INTERPRETER) << "Buffer: Push to operand stack " << def_operand.value() - << std::endl; - operand_stack.push(def_operand); - } - - // 4. Free if lifetime of buffer operands used by input is finished - for (auto input_index : _env->graph().operations().at(current_operation_index).getInputs()) - { - const auto use_operators = _env->graph().operands().at(input_index).getUses(); - bool dead_buffer = true; - for (const auto &use_operator : use_operators) - { - if (executed.find(use_operator) == executed.end()) - { - dead_buffer = false; - break; - } - } - - if (dead_buffer) - { - _env->freeIfAllocated(input_index); - } - } - } - } -} - -} // namespace interp -} // namespace onert diff --git a/runtime/onert/core/src/interp/Interpreter.h b/runtime/onert/core/src/interp/Interpreter.h deleted file mode 100644 index d2165f538..000000000 --- a/runtime/onert/core/src/interp/Interpreter.h +++ /dev/null @@ -1,64 +0,0 @@ -/* - * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
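
Editor's note: Interpreter::run above schedules work purely from operand readiness; model inputs and constants seed an operand stack, an operation fires once every one of its inputs has been marked ready, and its outputs are pushed in turn (with dead buffers freed along the way). The stand-alone sketch below replays that loop over a two-operation toy graph, omitting the buffer-freeing step; it is an editor's illustration, not the onert data structures.

// Editor's sketch: readiness-driven scheduling as in the removed Interpreter::run.
#include <cstddef>
#include <iostream>
#include <stack>
#include <unordered_set>
#include <vector>

struct ToyOp
{
  const char *name;
  std::vector<int> inputs;  // operand ids consumed
  std::vector<int> outputs; // operand ids produced
};

int main()
{
  // Graph: op0: (0,1)->2, op1: (2,3)->4; operands 0,1 are model inputs, 3 is a constant.
  std::vector<ToyOp> ops = {{"add#0", {0, 1}, {2}}, {"add#1", {2, 3}, {4}}};
  std::stack<int> operand_stack;
  for (int seed : {0, 1, 3}) // push model inputs first, then constants
    operand_stack.push(seed);

  std::unordered_set<int> ready;
  std::unordered_set<size_t> executed;
  while (!operand_stack.empty())
  {
    int ind = operand_stack.top();
    operand_stack.pop();
    ready.insert(ind);

    // Fire every not-yet-executed op whose inputs are all ready, pushing its outputs.
    for (size_t i = 0; i < ops.size(); ++i)
    {
      if (executed.count(i))
        continue;
      bool all_ready = true;
      for (int in : ops[i].inputs)
        if (!ready.count(in))
          all_ready = false;
      if (!all_ready)
        continue;
      std::cout << "execute " << ops[i].name << "\n";
      executed.insert(i);
      for (int out : ops[i].outputs)
        operand_stack.push(out);
    }
  }
  return 0;
}
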
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/** - * @file Interpreter.h - * @brief This file contains Interpreter class for interpretation - */ -#ifndef __ONERT_INTERP_INTERPRETER_H__ -#define __ONERT_INTERP_INTERPRETER_H__ - -#include "ExecEnv.h" - -namespace onert -{ -namespace interp -{ - -/** - * @brief Class for interpretation - */ -class Interpreter -{ - -public: - /** - * @brief Construct a new Interpreter object (deleted) - */ - Interpreter() = delete; - /** - * @brief Construct a new Interpreter object - * @param[in] env Execution environment variable for interpreter object - */ - Interpreter(std::unique_ptr env) : _env{std::move(env)} - { - // DO NOTHING - } - -public: - /** - * @brief Run interpreter until there is no operation to execute - */ - void run(); - -private: - std::unique_ptr _env; -}; - -} // namespace interp -} // namespace onert - -#endif // __ONERT_INTERP_INTERPRETER_H__ diff --git a/runtime/onert/core/src/interp/Registration.h b/runtime/onert/core/src/interp/Registration.h deleted file mode 100644 index 956b92a53..000000000 --- a/runtime/onert/core/src/interp/Registration.h +++ /dev/null @@ -1,43 +0,0 @@ -/* - * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef __ONERT_INTERP_REGISTRATION_H__ -#define __ONERT_INTERP_REGISTRATION_H__ - -#include "ExecEnv.h" - -#include "ir/Operation.h" - -namespace onert -{ -namespace interp -{ - -struct OpKernel -{ - std::function prepare; - std::function invoke; -}; - -// Defined in operations/ directory -#define INTERP_OP(InternalName) OpKernel *get##InternalName(); -#include "InterpOps.lst" -#undef INTERP_OP - -} // namespace interp -} // namespace onert - -#endif // __ONERT_INTERP_REGISTRATION_H__ diff --git a/runtime/onert/core/src/interp/Tensor.cc b/runtime/onert/core/src/interp/Tensor.cc deleted file mode 100644 index de095c9e4..000000000 --- a/runtime/onert/core/src/interp/Tensor.cc +++ /dev/null @@ -1,57 +0,0 @@ -/* - * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "Tensor.h" - -#define NO_USE(a) (void)(a) - -namespace onert -{ -namespace interp -{ - -void ITensor::access(const std::function &fn) { fn(*this); } - -size_t ROTensor::calcOffset(const ir::Coordinates &coords) const -{ - NO_USE(coords); - throw std::runtime_error("offset_element_in_bytes is not supported for cpu::Tensor now."); -} - -size_t Tensor::calcOffset(const ir::Coordinates &coords) const -{ - NO_USE(coords); - throw std::runtime_error("offset_element_in_bytes is not supported for cpu::Tensor now."); -} - -ir::Layout ROTensor::layout() const -{ - // TODO Changes to return frontend layout - return ir::Layout::NHWC; -} - -ir::Layout Tensor::layout() const -{ - // TODO Changes to return frontend layout - return ir::Layout::NHWC; -} - -ir::Shape Tensor::getShape() const { return _info.shape(); } - -ir::Shape ROTensor::getShape() const { return _info.shape(); } - -} // namespace interp -} // namespace onert diff --git a/runtime/onert/core/src/interp/Tensor.h b/runtime/onert/core/src/interp/Tensor.h deleted file mode 100644 index 642fdc164..000000000 --- a/runtime/onert/core/src/interp/Tensor.h +++ /dev/null @@ -1,189 +0,0 @@ -/* - * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/** - * @file Tensor.h - * @brief This file contains ITensor interface, ROTensor class, and Tensor class - */ -#ifndef __ONERT_INTERP_TENSOR_H__ -#define __ONERT_INTERP_TENSOR_H__ - -#include "Buffer.h" - -#include "ir/OperandInfo.h" -#include "backend/ITensor.h" -#include "ir/Layout.h" - -namespace onert -{ -namespace interp -{ - -/** - * @brief Interface to handle Tensor in interpreter - */ -class ITensor : public backend::ITensor -{ -public: - virtual ~ITensor() = default; - -public: - virtual uint8_t *buffer() const = 0; - /** - * @brief Return shared pointer for buffer - * @return Buffer shared pointer - */ - virtual std::shared_ptr shareBuffer() const = 0; - /** - * @brief Return read-only buffer pointer - * @return Read-only buffer pointer - */ - virtual const uint8_t *bufferRO() const = 0; - /** - * @brief Return shared pointer for data - * @return Data shared pointer - */ - virtual std::shared_ptr shareData() const = 0; - /** - * @brief Set internal/external buffer - * @param[in] buffer Buffer pointer - */ - virtual void setBuffer(std::shared_ptr buffer) = 0; - /** - * @brief Set data reference (including constant, input) - * @param[in] data Data pointer - */ - virtual void setData(std::shared_ptr data) = 0; - virtual void releaseData() = 0; - - virtual size_t total_size() const = 0; - virtual size_t calcOffset(const ir::Coordinates &coords) const = 0; - - virtual bool has_padding() const = 0; - /** - * @brief Return data type of tensor - * @return Data type of tensor - */ - virtual ir::DataType data_type() const = 0; - /** - * @brief Return TensorInfo - * @return TensorInfo - */ - virtual const ir::OperandInfo &tensorInfo() const = 0; - /** - * @brief Return number of elements - * @return Number of elements - */ - virtual uint64_t num_elements() const = 0; - void access(const std::function &fn) final; -}; - -/** - * @brief Class to handle tensor in interpreter as read-only - */ -class ROTensor final : public ITensor -{ -public: - ROTensor() = delete; - ROTensor(const ir::OperandInfo &info) : _info(info) - { - // DO NOTHING - } - -public: - uint8_t *buffer() const override { throw std::runtime_error{"Read only tensor"}; } - std::shared_ptr shareBuffer() const override - { - throw std::runtime_error{"Read only tensor"}; - } - const uint8_t *bufferRO() const override { return _data->base(); } - std::shared_ptr shareData() const override { return _data; } - void setBuffer(std::shared_ptr buffer) override { _data = buffer; } - void setData(std::shared_ptr data) override { _data = data; } - void releaseData() override { _data = nullptr; } - - size_t total_size() const override { return _info.total_size(); } - size_t calcOffset(const ir::Coordinates &coords) const override; - ir::Layout layout() const override; - bool is_dynamic() const override { return false; } - bool has_padding() const override { return false; } - ir::DataType data_type() const override { return _info.typeInfo().type(); } - float data_scale() const override { return _info.typeInfo().scale(); } - int32_t data_zero_point() const override { return _info.typeInfo().zero_point(); } - const std::vector &data_scales() const override { return _info.typeInfo().scales(); } - const std::vector &data_zero_points() const override - { - return _info.typeInfo().zero_points(); - } - const ir::OperandInfo &tensorInfo() const override { return _info; } - uint64_t num_elements() const override { return _info.shape().num_elements(); }; - ir::Shape getShape() const override; - -private: - const ir::OperandInfo _info; - 
std::shared_ptr _data{nullptr}; -}; - -/** - * @brief Class to handle tensor in interpreter as writable - */ -class Tensor final : public ITensor -{ -public: - Tensor() = delete; - Tensor(const ir::OperandInfo &info) : _info(info) - { - // DO NOTHING - } - -public: - uint8_t *buffer() const override { return _buffer->baseWritable(); } - std::shared_ptr shareBuffer() const override { return _buffer; }; - const uint8_t *bufferRO() const override { return _buffer->base(); } - std::shared_ptr shareData() const override { return _buffer; } - void setBuffer(std::shared_ptr buffer) override { _buffer = buffer; } - void setData(std::shared_ptr) override - { - throw std::runtime_error{"Passed data may read-only"}; - } - void releaseData() override { _buffer = nullptr; } - - size_t total_size() const override { return _info.total_size(); } - size_t calcOffset(const ir::Coordinates &coords) const override; - ir::Layout layout() const override; - bool is_dynamic() const override { return false; } - bool has_padding() const override { return false; } - ir::DataType data_type() const override { return _info.typeInfo().type(); } - float data_scale() const override { return _info.typeInfo().scale(); } - int32_t data_zero_point() const override { return _info.typeInfo().zero_point(); } - const std::vector &data_scales() const override { return _info.typeInfo().scales(); } - const std::vector &data_zero_points() const override - { - return _info.typeInfo().zero_points(); - } - const ir::OperandInfo &tensorInfo() const override { return _info; } - uint64_t num_elements() const override { return _info.shape().num_elements(); }; - ir::Shape getShape() const override; - -private: - const ir::OperandInfo _info; - std::shared_ptr _buffer{nullptr}; -}; - -} // namespace interp -} // namespace onert - -#endif // __ONERT_INTERP_TENSOR_H__ diff --git a/runtime/onert/core/src/interp/operations/BinaryArithmeticOps.cc b/runtime/onert/core/src/interp/operations/BinaryArithmeticOps.cc deleted file mode 100644 index fe4acd309..000000000 --- a/runtime/onert/core/src/interp/operations/BinaryArithmeticOps.cc +++ /dev/null @@ -1,204 +0,0 @@ -/* - * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "OperationUtil.h" -#include "../Registration.h" - -#include "ir/operation/BinaryArithmetic.h" - -#include -#include -#include - -namespace onert -{ -namespace interp -{ -namespace -{ - -enum class OpType -{ - ADD, - SUB, - MUL -}; - -void prepare(ExecEnv *env, const ir::Operation &node) -{ - const auto &arithmetic_node = - nnfw::misc::polymorphic_downcast(node); - - const auto lhs_index = node.getInputs().at(arithmetic_node.LHS); - const auto rhs_index = node.getInputs().at(arithmetic_node.RHS); - const auto out_index = node.getOutputs().at(0); - - const auto lhs_tensor = env->tensorAt(lhs_index); - const auto rhs_tensor = env->tensorAt(rhs_index); - - // Check shape and type lhs is same with rhs - // TODO Util function to compare TensorInfo - if (lhs_tensor->data_type() != rhs_tensor->data_type()) - { - throw std::runtime_error{"Interp(" + arithmetic_node.name() + "): Different input types"}; - } - - bool try_broadcast = (lhs_tensor->tensorInfo().shape() != rhs_tensor->tensorInfo().shape()); - if (try_broadcast) - { - bool success = true; - auto out_shape = calcBroadcastShape(lhs_tensor->tensorInfo().shape(), - rhs_tensor->tensorInfo().shape(), success); - if (!success) - { - throw std::runtime_error{"Interp(" + arithmetic_node.name() + "): Fail to brodcasting"}; - } - - auto output_info = - ir::OperandInfo::createStaticInfo(out_shape, lhs_tensor->tensorInfo().typeInfo()); - // We can handle already allocated (ex. model output) - env->allocateIfNeeded(out_index, output_info); - } - else - { - // Output's shape and type is same with input - auto output_info = lhs_tensor->tensorInfo(); - // We can handle already allocated (ex. model output) - env->allocateIfNeeded(out_index, output_info); - } - - auto out_tensor = env->tensorAt(out_index); - // Check shape and type lhs is same with output - // TODO Util function to compare TensorInfo - if (lhs_tensor->data_type() != out_tensor->data_type()) - { - throw std::runtime_error{"Interp(" + arithmetic_node.name() + "): Invalid output type"}; - } -} - -inline void setActivationParams(float min, float max, nnfw::cker::BinaryArithmeticOpParam *params) -{ - params->float_activation_min = min; - params->float_activation_max = max; -} - -inline void setActivationParams(int32_t min, int32_t max, - nnfw::cker::BinaryArithmeticOpParam *params) -{ - params->quantized_activation_min = min; - params->quantized_activation_max = max; -} - -template -void invoke(const ITensor *lhs_tensor, const ITensor *rhs_tensor, const ITensor *out_tensor, - const ir::operation::BinaryArithmetic::Param ¶m) -{ - const auto lhs_buffer = lhs_tensor->bufferRO(); - const auto rhs_buffer = rhs_tensor->bufferRO(); - auto out_buffer = out_tensor->buffer(); - - nnfw::cker::BinaryArithmeticOpParam cker_param; - raw_type activation_min, activation_max; - calculateActivationRange(param.activation, &activation_min, &activation_max); - setActivationParams(activation_min, activation_max, &cker_param); - const raw_type *lhs_ptr = reinterpret_cast(lhs_buffer); - const raw_type *rhs_ptr = reinterpret_cast(rhs_buffer); - raw_type *out_ptr = reinterpret_cast(out_buffer); - - const auto cker_op_type = - (op_type == OpType::ADD) ? nnfw::cker::BinaryArithmeticOpType::ADD - : ((op_type == OpType::SUB) ? 
nnfw::cker::BinaryArithmeticOpType::SUB - : nnfw::cker::BinaryArithmeticOpType::MUL); - - const bool need_broadcast = - nnfw::cker::ProcessBroadcastShapes(convertShape(lhs_tensor->tensorInfo().shape()), - convertShape(rhs_tensor->tensorInfo().shape()), &cker_param); - - if (need_broadcast) - { - const auto lhs_shape = convertShape(lhs_tensor->tensorInfo().shape()); - const auto rhs_shape = convertShape(rhs_tensor->tensorInfo().shape()); - const auto out_shape = convertShape(out_tensor->tensorInfo().shape()); - nnfw::cker::BroadcastBinaryArithmeticOp(cker_param, lhs_shape, lhs_ptr, rhs_shape, - rhs_ptr, out_shape, out_ptr); - return; - } - - const auto lhs_shape = convertShape(lhs_tensor->tensorInfo().shape()); - const auto rhs_shape = convertShape(rhs_tensor->tensorInfo().shape()); - const auto out_shape = convertShape(out_tensor->tensorInfo().shape()); - nnfw::cker::BinaryArithmeticOp(cker_param, lhs_shape, lhs_ptr, rhs_shape, rhs_ptr, - out_shape, out_ptr); -} - -template -void invokeBinaryArithmetic(const ExecEnv *env, const ir::operation::BinaryArithmetic &node) -{ - const auto lhs_index = node.getInputs().at(node.LHS); - const auto rhs_index = node.getInputs().at(node.RHS); - const auto out_index = node.getOutputs().at(0); - const auto lhs_tensor = env->tensorAt(lhs_index); - const auto rhs_tensor = env->tensorAt(rhs_index); - const auto out_tensor = env->tensorAt(out_index); - const auto data_type = lhs_tensor->data_type(); - - if (data_type == ir::DataType::INT32) - { - invoke(lhs_tensor, rhs_tensor, out_tensor, node.param()); - } - else if (data_type == ir::DataType::FLOAT32) - { - invoke(lhs_tensor, rhs_tensor, out_tensor, node.param()); - } - else - { - throw std::runtime_error{"NYI: Unsupported data type"}; - } -} - -void invokeBinaryArithmeticOps(const ExecEnv *env, const ir::Operation &node) -{ - const auto &arithmetic_node = - nnfw::misc::polymorphic_downcast(node); - - switch (arithmetic_node.param().arithmetic_type) - { - case ir::operation::BinaryArithmetic::ArithmeticType::ADD: - invokeBinaryArithmetic(env, arithmetic_node); - break; - case ir::operation::BinaryArithmetic::ArithmeticType::SUB: - invokeBinaryArithmetic(env, arithmetic_node); - break; - case ir::operation::BinaryArithmetic::ArithmeticType::MUL: - invokeBinaryArithmetic(env, arithmetic_node); - break; - default: - throw std::runtime_error{"Interp(BinaryArithmetic): NYI unsupported operation " + - arithmetic_node.name()}; - break; - } -} - -} // namespace - -OpKernel *getBinaryArithmetic() -{ - static OpKernel kernel = {prepare, invokeBinaryArithmeticOps}; - return &kernel; -} - -} // namespace interp -} // namespace onert diff --git a/runtime/onert/core/src/interp/operations/Concat.cc b/runtime/onert/core/src/interp/operations/Concat.cc deleted file mode 100644 index 103604631..000000000 --- a/runtime/onert/core/src/interp/operations/Concat.cc +++ /dev/null @@ -1,147 +0,0 @@ -/* - * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "OperationUtil.h" -#include "../Registration.h" - -#include "ir/operation/Concat.h" - -#include -#include - -namespace onert -{ -namespace interp -{ -namespace concat -{ - -void prepareConcat(ExecEnv *env, const ir::Operation &node) -{ - const auto &concat_node = nnfw::misc::polymorphic_downcast(node); - - const auto first_index = node.getInputs().at(0); - const auto out_index = node.getOutputs().at(0); - - const auto first_tensor = env->tensorAt(first_index); - uint32_t out_axis_dimension = 0; - const int32_t axis_raw = concat_node.param().axis; - const int32_t axis = (axis_raw < 0) ? (axis_raw + first_tensor->getShape().rank()) : axis_raw; - - // All inputs shape should be same except axis dimension - // All inputs type should be same - for (auto input : node.getInputs()) - { - assert(first_tensor->getShape().rank() == env->tensorAt(input)->getShape().rank()); - assert(first_tensor->data_type() == env->tensorAt(input)->data_type()); - for (int i = 0; i < first_tensor->getShape().rank(); i++) - { - if (i == axis) - { - out_axis_dimension += env->tensorAt(input)->getShape().dim(i); - continue; - } - assert(first_tensor->getShape().dim(i) == env->tensorAt(input)->getShape().dim(i)); - } - } - - // Make output tensor info using first input tensor info, and accumulated axis dimension value - auto out_shape = first_tensor->tensorInfo().shape(); - out_shape.dim(axis) = out_axis_dimension; - env->allocateIfNeeded( - out_index, ir::OperandInfo::createStaticInfo(out_shape, first_tensor->tensorInfo().typeInfo())); - - auto out_tensor = env->tensorAt(out_index); - UNUSED_RELEASE(out_tensor); - - // Output shape should be same with input except axis getShape().dim - // Output type should be same with input - assert(first_tensor->data_type() == out_tensor->data_type()); - for (int i = 0; i < first_tensor->getShape().rank(); i++) - { - if (i == axis) - { - continue; - } - assert(first_tensor->getShape().dim(i) == out_tensor->getShape().dim(i)); - } -} - -void invoke(const std::vector in_tensors, const ITensor *out_tensor, uint32_t axis) -{ - const uint32_t count = in_tensors.size(); - - // Calculate - nnfw::cker::ConcatenationParams cker_param; - cker_param.axis = (int8_t)axis; - cker_param.inputs_count = count; - - const auto out_shape = convertShape(out_tensor->tensorInfo().shape()); - - std::vector in_shapes; - std::vector in_shape_ptrs; - in_shapes.reserve(count); - in_shape_ptrs.reserve(count); - std::vector in_ptrs; - for (uint32_t i = 0; i < count; i++) - { - in_shapes.push_back(convertShape(in_tensors[i]->tensorInfo().shape())); - in_shape_ptrs.push_back(&in_shapes[i]); - in_ptrs.push_back(reinterpret_cast(in_tensors[i]->bufferRO())); - } - - auto out_buffer = out_tensor->buffer(); - float *out_ptr = reinterpret_cast(out_buffer); - - nnfw::cker::Concatenation(cker_param, in_shape_ptrs.data(), in_ptrs.data(), out_shape, - out_ptr); -} - -void invokeConcat(const ExecEnv *env, const ir::Operation &node) -{ - const auto &concat_node = nnfw::misc::polymorphic_downcast(node); - const int32_t axis_raw = concat_node.param().axis; - - std::vector in_tensors; - for (const auto &e : concat_node.getInputs()) - { - in_tensors.emplace_back(env->tensorAt(e)); - } - - const auto out_index = node.getOutputs().at(0); - const auto out_tensor = env->tensorAt(out_index); - const uint32_t axis = (axis_raw < 0) ? 
(axis_raw + out_tensor->getShape().rank()) : axis_raw; - - const auto data_type = in_tensors[0]->data_type(); - if (data_type == ir::DataType::FLOAT32) - { - invoke(in_tensors, out_tensor, axis); - } - else - { - throw std::runtime_error{"NYI: Support float32 only"}; - } -} -} // namespace concat - -OpKernel *getConcat() -{ - static OpKernel kernel = {concat::prepareConcat, concat::invokeConcat}; - return &kernel; -} - -} // namespace interp -} // namespace onert diff --git a/runtime/onert/core/src/interp/operations/Conv2D.cc b/runtime/onert/core/src/interp/operations/Conv2D.cc deleted file mode 100644 index 72c2057c2..000000000 --- a/runtime/onert/core/src/interp/operations/Conv2D.cc +++ /dev/null @@ -1,151 +0,0 @@ -/* - * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "OperationUtil.h" -#include "../Registration.h" - -#include "ir/operation/Conv2D.h" -#include "util/ShapeInference.h" -#include "util/Utils.h" - -#include -#include - -namespace onert -{ -namespace interp -{ -namespace conv2d -{ - -void prepareConv2D(ExecEnv *env, const ir::Operation &node) -{ - const auto in_index = node.getInputs().at(ir::operation::Conv2D::INPUT); - const auto kernel_index = node.getInputs().at(ir::operation::Conv2D::KERNEL); - const auto bias_index = node.getInputs().at(ir::operation::Conv2D::BIAS); - const auto out_index = node.getOutputs().at(0); - - const auto in_tensor = env->tensorAt(in_index); - const auto kernel_tensor = env->tensorAt(kernel_index); - const auto bias_tensor = env->tensorAt(bias_index); - - assert(in_tensor->getShape().rank() == 4); - assert(kernel_tensor->getShape().rank() == 4); - assert(bias_tensor->getShape().rank() == 1); - - UNUSED_RELEASE(in_tensor); - UNUSED_RELEASE(kernel_tensor); - UNUSED_RELEASE(bias_tensor); - - const auto output_info = env->graph().operands().at(out_index).info(); - if (output_info.total_size() == 0) - { - // Handle unspecified output shape - const auto &conv_node = nnfw::misc::polymorphic_downcast(node); - const auto infered_output_shape = shape_inference::inferConv2DShape( - in_tensor->tensorInfo().shape(), kernel_tensor->tensorInfo().shape(), conv_node.param()); - env->allocateIfNeeded( - out_index, ir::OperandInfo::createStaticInfo(infered_output_shape, output_info.typeInfo())); - } - else - { - env->allocateIfNeeded(out_index, output_info); - } - - auto out_tensor = env->tensorAt(out_index); - UNUSED_RELEASE(out_tensor); - - // Handle same ifm & ofm data type only - assert(in_tensor->data_type() == out_tensor->data_type()); - assert(out_tensor->getShape().rank() == 4); -} - -void invoke(const ITensor *ifm_tensor, const ITensor *ker_tensor, const ITensor *bias_tensor, - const ITensor *ofm_tensor, const ir::operation::Conv2D::Param ¶m) -{ - // TODO Support NCHW frontned - const auto ifm_shape = ifm_tensor->tensorInfo().shape().asFeature(ir::Layout::NHWC); - const auto ofm_shape = ofm_tensor->tensorInfo().shape().asFeature(ir::Layout::NHWC); - // Kernel 
format is [depth_out, kernel_height, kernel_width, depth_in]. - const auto &ker_shape = ker_tensor->tensorInfo().shape(); - const auto ker_height = ker_shape.dim(1); - const auto ker_width = ker_shape.dim(2); - const auto padding = - ir::calculatePadding(param.padding, ifm_shape, ofm_shape, param.stride, ker_width, ker_height); - - // Calculate - float activation_min, activation_max; - calculateActivationRange(param.activation, &activation_min, &activation_max); - - nnfw::cker::ConvParams cker_param; - cker_param.padding_type = convertPaddingType(param.padding.type); - cker_param.padding_values.width = padding.left; - cker_param.padding_values.height = padding.top; - cker_param.stride_width = param.stride.horizontal; - cker_param.stride_height = param.stride.vertical; - cker_param.dilation_width_factor = 1; - cker_param.dilation_height_factor = 1; - cker_param.float_activation_min = activation_min; - cker_param.float_activation_max = activation_max; - - const auto cker_ifm_shape = convertShape(ifm_tensor->tensorInfo().shape()); - const auto cker_ker_shape = convertShape(ker_tensor->tensorInfo().shape()); - const auto cker_bias_shape = convertShape(bias_tensor->tensorInfo().shape()); - const auto cker_ofm_shape = convertShape(ofm_tensor->tensorInfo().shape()); - const float *ifm_ptr = reinterpret_cast(ifm_tensor->bufferRO()); - const float *ker_ptr = reinterpret_cast(ker_tensor->bufferRO()); - const float *bias_ptr = reinterpret_cast(bias_tensor->bufferRO()); - float *ofm_ptr = reinterpret_cast(ofm_tensor->buffer()); - - nnfw::cker::Conv conv_kernel; - conv_kernel(cker_param, cker_ifm_shape, ifm_ptr, cker_ker_shape, ker_ptr, cker_bias_shape, - bias_ptr, cker_ofm_shape, ofm_ptr); -} - -void invokeConv2D(const ExecEnv *env, const ir::Operation &node) -{ - const auto &conv_node = nnfw::misc::polymorphic_downcast(node); - - const auto ifm_index = node.getInputs().at(ir::operation::Conv2D::INPUT); - const auto ker_index = node.getInputs().at(ir::operation::Conv2D::KERNEL); - const auto bias_index = node.getInputs().at(ir::operation::Conv2D::BIAS); - const auto ofm_index = node.getOutputs().at(0); - - const auto ifm_tensor = env->tensorAt(ifm_index); - const auto ker_tensor = env->tensorAt(ker_index); - const auto bias_tensor = env->tensorAt(bias_index); - const auto ofm_tensor = env->tensorAt(ofm_index); - - const auto data_type = ifm_tensor->data_type(); - if (data_type == ir::DataType::FLOAT32) - { - invoke(ifm_tensor, ker_tensor, bias_tensor, ofm_tensor, conv_node.param()); - } - else - { - throw std::runtime_error{"NYI: Support float32 only"}; - } -} -} // namespace conv2d - -OpKernel *getConv2D() -{ - static OpKernel kernel = {conv2d::prepareConv2D, conv2d::invokeConv2D}; - return &kernel; -} - -} // namespace interp -} // namespace onert diff --git a/runtime/onert/core/src/interp/operations/DepthwiseConv2D.cc b/runtime/onert/core/src/interp/operations/DepthwiseConv2D.cc deleted file mode 100644 index 9f527440e..000000000 --- a/runtime/onert/core/src/interp/operations/DepthwiseConv2D.cc +++ /dev/null @@ -1,156 +0,0 @@ -/* - * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
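The Conv2D kernel above obtains its padding from ir::calculatePadding(). As an illustration only, assuming the common TensorFlow-style convention rather than quoting the onert helper, SAME and VALID padding for one spatial dimension can be derived like this:

#include <algorithm>
#include <cstdio>

struct Pad1D
{
  int output_size;
  int before; // top or left
  int after;  // bottom or right
};

Pad1D computeSamePadding(int input, int kernel, int stride)
{
  const int output = (input + stride - 1) / stride; // ceil(input / stride)
  const int total = std::max(0, (output - 1) * stride + kernel - input);
  return {output, total / 2, total - total / 2}; // any odd pixel goes after
}

Pad1D computeValidPadding(int input, int kernel, int stride)
{
  const int output = (input - kernel + stride) / stride; // ceil((input - kernel + 1) / stride)
  return {output, 0, 0};
}

int main()
{
  const Pad1D same = computeSamePadding(/*input=*/224, /*kernel=*/3, /*stride=*/2);
  std::printf("SAME : out=%d pad=(%d,%d)\n", same.output_size, same.before, same.after); // out=112 pad=(0,1)
  const Pad1D valid = computeValidPadding(224, 3, 2);
  std::printf("VALID: out=%d pad=(%d,%d)\n", valid.output_size, valid.before, valid.after); // out=111 pad=(0,0)
  return 0;
}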
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "OperationUtil.h" -#include "../Registration.h" - -#include "ir/operation/DepthwiseConv2D.h" -#include "util/ShapeInference.h" -#include "util/Utils.h" - -#include -#include - -namespace onert -{ -namespace interp -{ - -namespace -{ - -void prepareDepthwiseConv(ExecEnv *env, const ir::Operation &node) -{ - const auto in_index = node.getInputs().at(ir::operation::DepthwiseConv2D::INPUT); - const auto kernel_index = node.getInputs().at(ir::operation::DepthwiseConv2D::KERNEL); - const auto bias_index = node.getInputs().at(ir::operation::DepthwiseConv2D::BIAS); - const auto out_index = node.getOutputs().at(0); - - const auto in_tensor = env->tensorAt(in_index); - const auto kernel_tensor = env->tensorAt(kernel_index); - const auto bias_tensor = env->tensorAt(bias_index); - - assert(in_tensor->getShape().rank() == 4); - assert(kernel_tensor->getShape().rank() == 4); - assert(bias_tensor->getShape().rank() == 1); - - UNUSED_RELEASE(in_tensor); - UNUSED_RELEASE(kernel_tensor); - UNUSED_RELEASE(bias_tensor); - - // TODO handle unspecified output shape: - // calculate output shape using ifm shape, kernel shape, padding, stride - const auto output_info = env->graph().operands().at(out_index).info(); - if (output_info.total_size() == 0) - { - // Handle unspecified output shape - const auto &depth_conv_node = - nnfw::misc::polymorphic_downcast(node); - const auto infered_output_shape = shape_inference::inferDepthwiseConv2DShape( - in_tensor->tensorInfo().shape(), kernel_tensor->tensorInfo().shape(), - depth_conv_node.param()); - env->allocateIfNeeded( - out_index, ir::OperandInfo::createStaticInfo(infered_output_shape, output_info.typeInfo())); - } - else - { - env->allocateIfNeeded(out_index, output_info); - } - - auto out_tensor = env->tensorAt(out_index); - UNUSED_RELEASE(out_tensor); - - // Handle same ifm & ofm data type only - assert(in_tensor->data_type() == out_tensor->data_type()); - assert(out_tensor->getShape().rank() == 4); -} - -void invoke(const ITensor *ifm_tensor, const ITensor *ker_tensor, const ITensor *bias_tensor, - const ITensor *ofm_tensor, const ir::operation::DepthwiseConv2D::Param ¶m) -{ - // TODO Support NCHW frontend - const auto ifm_shape = ifm_tensor->tensorInfo().shape().asFeature(ir::Layout::NHWC); - const auto ofm_shape = ofm_tensor->tensorInfo().shape().asFeature(ir::Layout::NHWC); - // Kernel format is [1, kernel_height, kernel_width, depth_out]. 
- const auto &ker_shape = ker_tensor->tensorInfo().shape(); - const auto ker_height = ker_shape.dim(1); - const auto ker_width = ker_shape.dim(2); - const auto padding = - ir::calculatePadding(param.padding, ifm_shape, ofm_shape, param.stride, ker_width, ker_height); - - // Calculate - float activation_min, activation_max; - calculateActivationRange(param.activation, &activation_min, &activation_max); - - nnfw::cker::DepthwiseConvParams cker_param; - cker_param.padding_values.width = padding.left; - cker_param.padding_values.height = padding.top; - cker_param.depth_multiplier = param.multiplier; - cker_param.stride_width = param.stride.horizontal; - cker_param.stride_height = param.stride.vertical; - cker_param.dilation_width_factor = 1; - cker_param.dilation_height_factor = 1; - cker_param.float_activation_min = activation_min; - cker_param.float_activation_max = activation_max; - - const auto cker_ifm_shape = convertShape(ifm_tensor->tensorInfo().shape()); - const auto cker_ker_shape = convertShape(ker_tensor->tensorInfo().shape()); - const auto cker_bias_shape = convertShape(bias_tensor->tensorInfo().shape()); - const auto cker_ofm_shape = convertShape(ofm_tensor->tensorInfo().shape()); - const float *ifm_ptr = reinterpret_cast(ifm_tensor->bufferRO()); - const float *ker_ptr = reinterpret_cast(ker_tensor->bufferRO()); - const float *bias_ptr = reinterpret_cast(bias_tensor->bufferRO()); - float *ofm_ptr = reinterpret_cast(ofm_tensor->buffer()); - - nnfw::cker::DepthwiseConv(cker_param, cker_ifm_shape, ifm_ptr, cker_ker_shape, ker_ptr, - cker_bias_shape, bias_ptr, cker_ofm_shape, ofm_ptr, nullptr); -} - -void invokeDepthwiseConv(const ExecEnv *env, const ir::Operation &node) -{ - const auto &conv_node = static_cast(node); - - const auto ifm_index = node.getInputs().at(ir::operation::DepthwiseConv2D::INPUT); - const auto ker_index = node.getInputs().at(ir::operation::DepthwiseConv2D::KERNEL); - const auto bias_index = node.getInputs().at(ir::operation::DepthwiseConv2D::BIAS); - const auto ofm_index = node.getOutputs().at(0); - - const auto ifm_tensor = env->tensorAt(ifm_index); - const auto ker_tensor = env->tensorAt(ker_index); - const auto bias_tensor = env->tensorAt(bias_index); - const auto ofm_tensor = env->tensorAt(ofm_index); - - const auto data_type = ifm_tensor->data_type(); - if (data_type == ir::DataType::FLOAT32) - { - invoke(ifm_tensor, ker_tensor, bias_tensor, ofm_tensor, conv_node.param()); - } - else - { - throw std::runtime_error{"NYI: Support float32 only"}; - } -} - -} // namespace - -OpKernel *getDepthwiseConv2D() -{ - static OpKernel kernel = {prepareDepthwiseConv, invokeDepthwiseConv}; - return &kernel; -} - -} // namespace interp -} // namespace onert diff --git a/runtime/onert/core/src/interp/operations/ElementwiseActivations.cc b/runtime/onert/core/src/interp/operations/ElementwiseActivations.cc deleted file mode 100644 index e13080e76..000000000 --- a/runtime/onert/core/src/interp/operations/ElementwiseActivations.cc +++ /dev/null @@ -1,160 +0,0 @@ -/* - * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "OperationUtil.h" -#include "../Registration.h" - -#include "ir/operation/ElementwiseActivation.h" - -#include -#include -#include - -#include - -namespace onert -{ -namespace interp -{ -namespace -{ - -enum class ActivationType -{ - Logistic, - ReLU, - Tanh -}; - -void prepare(ExecEnv *env, const ir::Operation &node) -{ - const auto input_index = node.getInputs().at(0); - const auto output_index = node.getOutputs().at(0); - - const auto input_tensor = env->tensorAt(input_index); - - const auto output_info = env->graph().operands().at(output_index).info(); - if (output_info.total_size() == 0) - { - // Output's shape and type is same with input - auto input_info = input_tensor->tensorInfo(); - // We can handle already allocated (ex. model output) - env->allocateIfNeeded(output_index, input_info); - } - else - { - env->allocateIfNeeded(output_index, output_info); - } - - const auto output_tensor = env->tensorAt(output_index); - // Check shape and type lhs is same with output - // TODO Util function to compare TensorInfo - if (input_tensor->data_type() != output_tensor->data_type()) - { - throw std::runtime_error{"Interp(ElementwiseActivation): Invalid output type"}; - } -} - -template -void evalFloat(const float *input_ptr, float *output_ptr, uint64_t num_elements, float alpha, - float beta) -{ - std::function fn = [](const float &) { return std::nanf(""); }; - switch (act_type) - { - case ActivationType::ReLU: - fn = [alpha, beta](const float &in) { return std::min(std::max(beta, in), alpha); }; - break; - case ActivationType::Tanh: - fn = [](const float &in) { return std::tanh(in); }; - break; - default: - throw std::runtime_error{"Interp(ElementwiseActivation): NYI - Unsupported activation"}; - break; - } - - const float *input_end = input_ptr + num_elements; - for (; input_ptr < input_end; input_ptr++, output_ptr++) - { - *output_ptr = fn(*input_ptr); - } -} - -template void invoke(const ExecEnv *env, const ir::Operation &node) -{ - const auto input_index = node.getInputs().at(0); - const auto output_index = node.getOutputs().at(0); - - // Check lhs shape is same with rhs (with broadcast) - const auto input_tensor = env->tensorAt(input_index); - const auto output_tensor = env->tensorAt(output_index); - - const auto data_type = input_tensor->data_type(); - if (data_type == ir::DataType::FLOAT32) - { - uint64_t elements = input_tensor->num_elements(); - const float *input_start = reinterpret_cast(input_tensor->bufferRO()); - float *out = reinterpret_cast(output_tensor->buffer()); - if (act_type == ActivationType::Logistic) - { - const auto cker_input_shape = convertShape(input_tensor->tensorInfo().shape()); - const auto cker_output_shape = convertShape(output_tensor->tensorInfo().shape()); - nnfw::cker::Logistic(cker_input_shape, input_start, cker_output_shape, out); - } - else - { - const auto &act_node = - nnfw::misc::polymorphic_downcast(node); - evalFloat(input_start, out, elements, act_node.param().alpha, - act_node.param().beta); - } - } - else - { - throw std::runtime_error{"Interp(" + node.name() + "): NYI - Support float only"}; - } -} - -void 
invokeElementwiseActivation(const ExecEnv *env, const ir::Operation &node) -{ - const auto &act_node = - nnfw::misc::polymorphic_downcast(node); - switch (act_node.param().op_type) - { - case ir::operation::ElementwiseActivation::Type::LOGISTIC: - invoke(env, node); - break; - case ir::operation::ElementwiseActivation::Type::RELU: - invoke(env, node); - break; - case ir::operation::ElementwiseActivation::Type::TANH: - invoke(env, node); - break; - default: - throw std::runtime_error("Interp(" + node.name() + "): NYI - Unsupported activation"); - } -} - -} // namespace - -OpKernel *getElementwiseActivation() -{ - static OpKernel kernel = {prepare, invokeElementwiseActivation}; - return &kernel; -} - -} // namespace interp -} // namespace onert diff --git a/runtime/onert/core/src/interp/operations/FullyConnected.cc b/runtime/onert/core/src/interp/operations/FullyConnected.cc deleted file mode 100644 index 2bc9f517f..000000000 --- a/runtime/onert/core/src/interp/operations/FullyConnected.cc +++ /dev/null @@ -1,134 +0,0 @@ -/* - * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "OperationUtil.h" -#include "../Registration.h" - -#include "ir/operation/FullyConnected.h" - -#include -#include - -namespace onert -{ -namespace interp -{ -namespace fc -{ - -void prepareFC(ExecEnv *env, const ir::Operation &node) -{ - const auto in_index = node.getInputs().at(ir::operation::FullyConnected::INPUT); - const auto kernel_index = node.getInputs().at(ir::operation::FullyConnected::WEIGHT); - const auto bias_index = node.getInputs().at(ir::operation::FullyConnected::BIAS); - const auto out_index = node.getOutputs().at(0); - - const auto in_tensor = env->tensorAt(in_index); - const auto kernel_tensor = env->tensorAt(kernel_index); - const auto bias_tensor = env->tensorAt(bias_index); - - UNUSED_RELEASE(in_tensor); - UNUSED_RELEASE(kernel_tensor); - UNUSED_RELEASE(bias_tensor); - - assert(in_tensor->getShape().rank() >= 2); - assert(kernel_tensor->getShape().rank() == 2); - assert(bias_tensor->getShape().rank() == 1); - - const auto input_size_with_batch = in_tensor->num_elements(); - const auto num_units = kernel_tensor->getShape().dim(0); - const auto input_size = kernel_tensor->getShape().dim(1); - const int32_t batch_size = input_size_with_batch / input_size; - assert(input_size_with_batch % input_size == 0); - assert(num_units == bias_tensor->getShape().dim(0)); - - // Make output tensor info - ir::Shape output_shape(2); - output_shape.dim(0) = batch_size; - output_shape.dim(1) = num_units; - const auto out_info = - ir::OperandInfo::createStaticInfo(output_shape, in_tensor->tensorInfo().typeInfo()); - env->allocateIfNeeded(out_index, out_info); - - auto out_tensor = env->tensorAt(out_index); - UNUSED_RELEASE(out_tensor); - - // Handle same ifm & ofm data type only - assert(in_tensor->data_type() == out_tensor->data_type()); - assert(out_tensor->getShape().rank() == 2); - assert(out_tensor->getShape().dim(0) == 
batch_size); - assert(out_tensor->getShape().dim(1) == num_units); -} - -void invoke(const ITensor *ifm_tensor, const ITensor *ker_tensor, const ITensor *bias_tensor, - const ITensor *ofm_tensor, const ir::operation::FullyConnected::Param ¶m) -{ - const auto ifm_buffer = ifm_tensor->bufferRO(); - const auto ker_buffer = ker_tensor->bufferRO(); - const auto bias_buffer = bias_tensor->bufferRO(); - auto ofm_buffer = ofm_tensor->buffer(); - - // Calculate - nnfw::cker::FullyConnectedParams cker_param; - cker_param.activation = convertActivationType(param.activation); - const auto cker_ifm_shape = convertShape(ifm_tensor->tensorInfo().shape()); - const auto cker_ker_shape = convertShape(ker_tensor->tensorInfo().shape()); - const auto cker_bias_shape = convertShape(bias_tensor->tensorInfo().shape()); - const auto cker_ofm_shape = convertShape(ofm_tensor->tensorInfo().shape()); - const float *ifm_ptr = reinterpret_cast(ifm_buffer); - const float *ker_ptr = reinterpret_cast(ker_buffer); - const float *bias_ptr = reinterpret_cast(bias_buffer); - float *ofm_ptr = reinterpret_cast(ofm_buffer); - - nnfw::cker::FullyConnected(cker_param, cker_ifm_shape, ifm_ptr, cker_ker_shape, ker_ptr, - cker_bias_shape, bias_ptr, cker_ofm_shape, ofm_ptr); -} - -void invokeFC(const ExecEnv *env, const ir::Operation &node) -{ - const auto &conv_node = - nnfw::misc::polymorphic_downcast(node); - - const auto ifm_index = node.getInputs().at(ir::operation::FullyConnected::INPUT); - const auto ker_index = node.getInputs().at(ir::operation::FullyConnected::WEIGHT); - const auto bias_index = node.getInputs().at(ir::operation::FullyConnected::BIAS); - const auto ofm_index = node.getOutputs().at(0); - - const auto ifm_tensor = env->tensorAt(ifm_index); - const auto ker_tensor = env->tensorAt(ker_index); - const auto bias_tensor = env->tensorAt(bias_index); - const auto ofm_tensor = env->tensorAt(ofm_index); - - const auto data_type = ifm_tensor->data_type(); - if (data_type == ir::DataType::FLOAT32) - { - invoke(ifm_tensor, ker_tensor, bias_tensor, ofm_tensor, conv_node.param()); - } - else - { - throw std::runtime_error{"NYI: Support float only"}; - } -} -} // namespace fc - -OpKernel *getFullyConnected() -{ - static OpKernel kernel = {fc::prepareFC, fc::invokeFC}; - return &kernel; -} - -} // namespace interp -} // namespace onert diff --git a/runtime/onert/core/src/interp/operations/Gather.cc b/runtime/onert/core/src/interp/operations/Gather.cc deleted file mode 100644 index d686cfcf6..000000000 --- a/runtime/onert/core/src/interp/operations/Gather.cc +++ /dev/null @@ -1,138 +0,0 @@ -/* - * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
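A tiny worked example of the output-shape rule used by prepareFC() above, assuming a weight tensor laid out as [num_units, input_size]: the input is flattened so that batch = num_elements(input) / input_size, giving an output of shape [batch, num_units]. Plain integers stand in for onert shapes.

#include <cstdio>

int main()
{
  const int input_elements = 2 * 8; // e.g. an input of shape [2, 8]
  const int num_units = 4;          // weight shape [num_units, input_size] = [4, 8]
  const int input_size = 8;
  const int batch = input_elements / input_size;              // 16 / 8 = 2
  std::printf("output shape: [%d, %d]\n", batch, num_units);  // [2, 4]
  return 0;
}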
- */ - -#include "OperationUtil.h" -#include "../Registration.h" - -#include "ir/operation/Gather.h" - -#include -#include - -namespace onert -{ -namespace interp -{ -namespace -{ - -void prepareGather(ExecEnv *env, const ir::Operation &node) -{ - const auto input_index = node.getInputs().at(ir::operation::Gather::INPUT); - const auto indices_index = node.getInputs().at(ir::operation::Gather::INDICES); - const auto output_index = node.getOutputs().at(0); - - const auto input_tensor = env->tensorAt(input_index); - const auto indices_tensor = env->tensorAt(indices_index); - - // TODO handle unspecified output shape: - // calculate output shape using ifm shape, kernel shape, padding, stride - const auto output_info = env->graph().operands().at(output_index).info(); - if (output_info.total_size() == 0) - { - throw std::runtime_error{"Interp(Gather): NYI for unspecified output shape"}; - } - else - { - env->allocateIfNeeded(output_index, output_info); - } - - if (indices_tensor->data_type() != ir::DataType::INT32) - { - throw std::runtime_error{"Interp(Gather): Invalid indices data type"}; - } - - auto output_tensor = env->tensorAt(output_index); - auto output_rank = input_tensor->getShape().rank() + indices_tensor->getShape().rank() - 1; - - if (output_rank != output_tensor->getShape().rank()) - { - throw std::runtime_error{"Interp(Gather): Invalid output rank"}; - } - if (output_tensor->data_type() != input_tensor->data_type()) - { - throw std::runtime_error{"Interp(Gather): Invalid output data type"}; - } - - if (input_tensor->data_type() == ir::DataType::QUANT_UINT8_ASYMM && - input_tensor->tensorInfo().typeInfo() != output_tensor->tensorInfo().typeInfo()) - { - throw std::runtime_error{ - "Interp(Gather): Cannot handle different I/O QUANT_UINT8_ASYMM scale/offset"}; - } -} - -template -void invoke(const ITensor *input_tensors, const ITensor *indices_tensors, - const ITensor *output_tensor, uint32_t axis) -{ - // Calculate - nnfw::cker::GatherParams cker_param; - cker_param.axis = (int8_t)axis; - - const auto cker_input_shapes = convertShape(input_tensors->tensorInfo().shape()); - const auto cker_indices_shape = convertShape(indices_tensors->tensorInfo().shape()); - const auto cker_output_shape = convertShape(output_tensor->tensorInfo().shape()); - const raw_type *input_ptr = reinterpret_cast(input_tensors->bufferRO()); - const int32_t *indices_ptr = reinterpret_cast(indices_tensors->bufferRO()); - raw_type *output_ptr = reinterpret_cast(output_tensor->buffer()); - - nnfw::cker::Gather(cker_param, cker_input_shapes, input_ptr, cker_indices_shape, - indices_ptr, cker_output_shape, output_ptr); -} - -void invokeGather(const ExecEnv *env, const ir::Operation &node) -{ - const auto &gather_node = nnfw::misc::polymorphic_downcast(node); - const int32_t axis_raw = gather_node.param().axis; - - const auto input_index = node.getInputs().at(ir::operation::Gather::INPUT); - const auto indices_index = node.getInputs().at(ir::operation::Gather::INDICES); - const auto output_index = node.getOutputs().at(0); - - const auto input_tensor = env->tensorAt(input_index); - const auto indices_tensor = env->tensorAt(indices_index); - const auto output_tensor = env->tensorAt(output_index); - const uint32_t axis = (axis_raw < 0) ? 
(axis_raw + input_tensor->getShape().rank()) : axis_raw; - - const auto data_type = input_tensor->data_type(); - - switch (data_type) - { - case ir::DataType::FLOAT32: - invoke(input_tensor, indices_tensor, output_tensor, axis); - break; - case ir::DataType::INT32: - invoke(input_tensor, indices_tensor, output_tensor, axis); - break; - case ir::DataType::QUANT_UINT8_ASYMM: - invoke(input_tensor, indices_tensor, output_tensor, axis); - break; - default: - throw std::runtime_error{"Interp(Gather): NYI - Not supported type"}; - } -} - -} // namespace - -OpKernel *getGather() -{ - static OpKernel kernel = {prepareGather, invokeGather}; - return &kernel; -} - -} // namespace interp -} // namespace onert diff --git a/runtime/onert/core/src/interp/operations/InstanceNorm.cc b/runtime/onert/core/src/interp/operations/InstanceNorm.cc deleted file mode 100644 index 318088457..000000000 --- a/runtime/onert/core/src/interp/operations/InstanceNorm.cc +++ /dev/null @@ -1,121 +0,0 @@ -/* - * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "OperationUtil.h" -#include "../Registration.h" - -#include "ir/operation/InstanceNorm.h" - -#include -#include - -namespace onert -{ -namespace interp -{ -namespace instancenorm -{ - -void prepareInstanceNorm(ExecEnv *env, const ir::Operation &node) -{ - const auto &instancenorm_node = - nnfw::misc::polymorphic_downcast(node); - - const auto input_index = node.getInputs().at(instancenorm_node.INPUT); - const auto output_index = node.getOutputs().at(0); - const auto input_tensor = env->tensorAt(input_index); - - if (input_tensor->getShape().rank() != 4) - { - throw std::runtime_error{"Interp(InstanceNorm): Input should be 4D-tensor"}; - } - - // Output shape should be same with input - env->allocateIfNeeded(output_index, input_tensor->tensorInfo()); - - auto output_tensor = env->tensorAt(output_index); - UNUSED_RELEASE(output_tensor); - - // Handle same ifm & ofm data type only - assert(input_tensor->data_type() == output_tensor->data_type()); - assert(input_tensor->tensorInfo().shape() == output_tensor->tensorInfo().shape()); -} - -inline void setActivationParams(float min, float max, nnfw::cker::InstanceNormParams *params) -{ - params->float_activation_min = min; - params->float_activation_max = max; -} - -void invoke(const ITensor *input_tensor, const ITensor *gamma_tensor, const ITensor *beta_tensor, - const ITensor *output_tensor, const ir::operation::InstanceNorm::Param ¶m) -{ - // Calculate - float activation_min, activation_max; - calculateActivationRange(param.activation, &activation_min, &activation_max); - - nnfw::cker::InstanceNormParams cker_param; - cker_param.epsilon = param.epsilon; - cker_param.float_activation_min = activation_min; - cker_param.float_activation_max = activation_max; - - const auto cker_input_shape = convertShape(input_tensor->tensorInfo().shape()); - const auto cker_gamma_shape = convertShape(gamma_tensor->tensorInfo().shape()); - const auto 
cker_beta_shape = convertShape(beta_tensor->tensorInfo().shape()); - const auto cker_output_shape = convertShape(output_tensor->tensorInfo().shape()); - const float *input_ptr = reinterpret_cast(input_tensor->bufferRO()); - const float *gamma_ptr = reinterpret_cast(gamma_tensor->bufferRO()); - const float *beta_ptr = reinterpret_cast(beta_tensor->bufferRO()); - float *output_ptr = reinterpret_cast(output_tensor->buffer()); - - nnfw::cker::InstanceNorm(cker_param, cker_input_shape, input_ptr, cker_gamma_shape, gamma_ptr, - cker_beta_shape, beta_ptr, cker_output_shape, output_ptr); -} - -void invokeInstanceNorm(const ExecEnv *env, const ir::Operation &node) -{ - const auto &instancenorm_node = - nnfw::misc::polymorphic_downcast(node); - - const auto input_index = node.getInputs().at(instancenorm_node.INPUT); - const auto gamma_index = node.getInputs().at(instancenorm_node.GAMMA); - const auto beta_index = node.getInputs().at(instancenorm_node.BETA); - const auto out_index = node.getOutputs().at(0); - const auto input_tensor = env->tensorAt(input_index); - const auto gamma_tensor = env->tensorAt(gamma_index); - const auto beta_tensor = env->tensorAt(beta_index); - const auto out_tensor = env->tensorAt(out_index); - const auto data_type = input_tensor->data_type(); - - if (data_type == ir::DataType::FLOAT32) - { - invoke(input_tensor, gamma_tensor, beta_tensor, out_tensor, instancenorm_node.param()); - } - else - { - throw std::runtime_error{"NYI: Unsupported data type"}; - } -} -} // namespace instancenorm - -OpKernel *getInstanceNorm() -{ - static OpKernel kernel = {instancenorm::prepareInstanceNorm, instancenorm::invokeInstanceNorm}; - return &kernel; -} - -} // namespace interp -} // namespace onert diff --git a/runtime/onert/core/src/interp/operations/OperationUtil.h b/runtime/onert/core/src/interp/operations/OperationUtil.h deleted file mode 100644 index 2fdf098f0..000000000 --- a/runtime/onert/core/src/interp/operations/OperationUtil.h +++ /dev/null @@ -1,203 +0,0 @@ -/* - * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
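A compact restatement of the arithmetic that the InstanceNorm kernel above delegates to nnfw::cker::InstanceNorm, shown for a single channel only: y = gamma * (x - mean) / sqrt(var + eps) + beta. The real kernel iterates over batches and channels and applies the activation clamp afterwards; this is an illustrative sketch, not the cker implementation.

#include <cmath>
#include <cstdio>
#include <vector>

void instanceNormOneChannel(const std::vector<float> &x, float gamma, float beta, float eps,
                            std::vector<float> &y)
{
  float mean = 0.0f;
  for (float v : x)
    mean += v;
  mean /= static_cast<float>(x.size());

  float var = 0.0f;
  for (float v : x)
    var += (v - mean) * (v - mean);
  var /= static_cast<float>(x.size());

  y.resize(x.size());
  for (size_t i = 0; i < x.size(); ++i)
    y[i] = gamma * (x[i] - mean) / std::sqrt(var + eps) + beta;
}

int main()
{
  std::vector<float> y;
  instanceNormOneChannel({1.0f, 2.0f, 3.0f, 4.0f}, /*gamma=*/1.0f, /*beta=*/0.0f, /*eps=*/1e-5f, y);
  for (float v : y)
    std::printf("%f ", v); // roughly -1.34 -0.45 0.45 1.34
  std::printf("\n");
  return 0;
}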
- */ - -#ifndef __ONERT_INTERP_OPERATIONS_OPERATION_UTILS_H_ -#define __ONERT_INTERP_OPERATIONS_OPERATION_UTILS_H_ - -#include "ir/Shape.h" -#include "ir/InternalType.h" -#include "ir/Padding.h" - -#include -#include - -namespace onert -{ -namespace interp -{ - -inline nnfw::cker::Shape convertShape(const ir::Shape &shape) -{ - auto dimensions = std::vector(shape.dims().begin(), shape.dims().end()); - - std::vector raw_shape; - raw_shape.resize(dimensions.size()); - - for (uint32_t i = 0; i < dimensions.size(); ++i) - { - raw_shape[i] = dimensions[i]; - } - - return nnfw::cker::GetShape(raw_shape); -} - -inline nnfw::cker::Shape convertExtendShape(const ir::Shape &shape) -{ - auto dimensions = std::vector(shape.dims().begin(), shape.dims().end()); - - const int32_t extended_rank = 4; - int32_t raw_shape[extended_rank]; - uint32_t start = extended_rank - dimensions.size(); - - for (uint32_t i = 0; i < extended_rank; ++i) - { - if (i < start) - { - raw_shape[i] = 1; - } - else - { - raw_shape[i] = dimensions[i - start]; - } - } - - return nnfw::cker::Shape(extended_rank, raw_shape); -} - -inline nnfw::cker::FusedActivationFunctionType -convertActivationType(const ir::Activation activation) -{ - switch (activation) - { - case ir::Activation::NONE: - return nnfw::cker::FusedActivationFunctionType::kNone; - case ir::Activation::RELU: - return nnfw::cker::FusedActivationFunctionType::kRelu; - case ir::Activation::RELU1: - return nnfw::cker::FusedActivationFunctionType::kRelu1; - case ir::Activation::RELU6: - return nnfw::cker::FusedActivationFunctionType::kRelu6; - default: - throw std::runtime_error{"CPU backend: Cannot convert activation type"}; - } -} - -template -void calculateActivationRange(ir::Activation activation, T *activation_min, T *activation_max) -{ - if (activation == ir::Activation::RELU) - { - *activation_min = 0; - *activation_max = std::numeric_limits::max(); - } - else if (activation == ir::Activation::RELU6) - { - *activation_min = 0; - *activation_max = 6; - } - else if (activation == ir::Activation::RELU1) - { - *activation_min = -1; - *activation_max = 1; - } - else if (activation == ir::Activation::NONE) - { - *activation_min = std::numeric_limits::lowest(); - *activation_max = std::numeric_limits::max(); - } - else - { - throw std::runtime_error{"Unsupported activation type"}; - } -} - -inline ir::Shape calcBroadcastShape(const ir::Shape &lhs, const ir::Shape &rhs, bool &success) -{ - int lhs_rank = lhs.rank(); - int rhs_rank = rhs.rank(); - - int out_rank = (lhs_rank > rhs_rank ? 
lhs_rank : rhs_rank); - ir::Shape out_shape(out_rank); - - int lhs_idim = lhs_rank - 1; - int rhs_idim = rhs_rank - 1; - success = true; - for (int out_idim = out_rank - 1; out_idim >= 0; out_idim--) - { - if (lhs_idim == -1 && rhs_idim == -1) - { - // invalid result - success = false; - break; - } - - if (lhs_idim == -1) - { - out_shape.dim(out_idim) = rhs.dim(rhs_idim); - rhs_idim--; - } - else if (rhs_idim == -1) - { - out_shape.dim(out_idim) = lhs.dim(lhs_idim); - lhs_idim--; - } - else - { - if (lhs.dim(lhs_idim) == rhs.dim(rhs_idim)) - { - out_shape.dim(out_idim) = lhs.dim(lhs_idim); - lhs_idim--; - rhs_idim--; - } - else if (lhs.dim(lhs_idim) == 1) - { - out_shape.dim(out_idim) = rhs.dim(rhs_idim); - lhs_idim--; - rhs_idim--; - } - else if (rhs.dim(rhs_idim) == 1) - { - out_shape.dim(out_idim) = lhs.dim(lhs_idim); - lhs_idim--; - rhs_idim--; - } - else - { - // invalid result - success = false; - break; - } - } - } - - if (lhs_idim != -1 || rhs_idim != -1) - { - // invalid result - success = false; - } - return out_shape; -} - -inline nnfw::cker::PaddingType convertPaddingType(ir::PaddingType ir_padding_type) -{ - switch (ir_padding_type) - { - case ir::PaddingType::EXPLICIT: - return nnfw::cker::PaddingType::kNone; - case ir::PaddingType::SAME: - return nnfw::cker::PaddingType::kSame; - case ir::PaddingType::VALID: - return nnfw::cker::PaddingType::kValid; - default: - throw std::runtime_error("Wrong padding type."); - break; - } -} - -} // namespace interp -} // namespace onert - -#endif // __ONERT_INTERP_OPERATIONS_OPERATION_UTILS_H_ diff --git a/runtime/onert/core/src/interp/operations/Pad.cc b/runtime/onert/core/src/interp/operations/Pad.cc deleted file mode 100644 index 3db0828eb..000000000 --- a/runtime/onert/core/src/interp/operations/Pad.cc +++ /dev/null @@ -1,106 +0,0 @@ -/* - * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
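An illustrative restatement of the broadcast rule implemented by calcBroadcastShape() in OperationUtil.h above: shapes are aligned from the trailing dimension, and each dimension pair must either match or contain a 1. Plain std::vector<int> shapes stand in for ir::Shape; this is a sketch, not the onert implementation.

#include <algorithm>
#include <cstdio>
#include <vector>

bool broadcastShape(const std::vector<int> &lhs, const std::vector<int> &rhs,
                    std::vector<int> &out)
{
  const int rank = static_cast<int>(std::max(lhs.size(), rhs.size()));
  out.assign(rank, 1);
  for (int i = 0; i < rank; ++i) // i counts from the trailing dimension
  {
    const int l = (i < static_cast<int>(lhs.size())) ? lhs[lhs.size() - 1 - i] : 1;
    const int r = (i < static_cast<int>(rhs.size())) ? rhs[rhs.size() - 1 - i] : 1;
    if (l != r && l != 1 && r != 1)
      return false; // incompatible, mirrors the "success = false" path above
    out[rank - 1 - i] = std::max(l, r);
  }
  return true;
}

int main()
{
  std::vector<int> out;
  if (broadcastShape({2, 3, 1}, {3, 4}, out)) // -> {2, 3, 4}
    std::printf("broadcast shape: %d x %d x %d\n", out[0], out[1], out[2]);
  return 0;
}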
- */ - -#include "OperationUtil.h" -#include "../Registration.h" - -#include "ir/operation/Pad.h" - -#include - -namespace onert -{ -namespace interp -{ -namespace -{ - -void preparePad(ExecEnv *env, const ir::Operation &node) -{ - const auto input_index = node.getInputs().at(ir::operation::Pad::INPUT); - const auto output_index = node.getOutputs().at(0); - - const auto input_tensor = env->tensorAt(input_index); - - const auto output_info = env->graph().operands().at(output_index).info(); - - // Check shape and type lhs is same with rhs - // TODO Util function to compare TensorInfo - if (output_info.total_size() == 0) - { - throw std::runtime_error{"Interp(Pad): NYI unspecified output shape"}; - } - else - { - env->allocateIfNeeded(output_index, output_info); - } - - const auto output_tensor = env->tensorAt(output_index); - if (input_tensor->data_type() != output_tensor->data_type()) - { - throw std::runtime_error{"Interp(Pad): Invalid output type"}; - } -} - -void invoke(const ITensor *input_tensor, const ITensor *pad_tensor, const ITensor *output_tensor) -{ - const auto input_buffer = input_tensor->bufferRO(); - const auto pad_buffer = pad_tensor->bufferRO(); - auto output_buffer = output_tensor->buffer(); - - int32_t pad_rank = pad_tensor->getShape().dim(0); - - const auto cker_input_shape = convertShape(input_tensor->tensorInfo().shape()); - const auto cker_output_shape = convertShape(output_tensor->tensorInfo().shape()); - const float *input_ptr = reinterpret_cast(input_buffer); - const int32_t *pad_ptr = reinterpret_cast(pad_buffer); - float *output_ptr = reinterpret_cast(output_buffer); - - nnfw::cker::Pad(pad_ptr, pad_rank, cker_input_shape, input_ptr, cker_output_shape, - output_ptr, nullptr); -} - -void invokePad(const ExecEnv *env, const ir::Operation &node) -{ - const auto input_index = node.getInputs().at(ir::operation::Pad::INPUT); - const auto pad_index = node.getInputs().at(ir::operation::Pad::PAD); - const auto output_index = node.getOutputs().at(0); - - const auto input_tensor = env->tensorAt(input_index); - const auto pad_tensor = env->tensorAt(pad_index); - const auto output_tensor = env->tensorAt(output_index); - - const auto data_type = input_tensor->data_type(); - - if (data_type == ir::DataType::FLOAT32) - { - invoke(input_tensor, pad_tensor, output_tensor); - } - else - { - throw std::runtime_error{"Interp(Pad): NYI - Unsupported data type"}; - } -} -} // namespace - -OpKernel *getPad() -{ - static OpKernel kernel = {preparePad, invokePad}; - return &kernel; -} - -} // namespace interp -} // namespace onert diff --git a/runtime/onert/core/src/interp/operations/Pool2D.cc b/runtime/onert/core/src/interp/operations/Pool2D.cc deleted file mode 100644 index 3935d4756..000000000 --- a/runtime/onert/core/src/interp/operations/Pool2D.cc +++ /dev/null @@ -1,140 +0,0 @@ -/* - * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "OperationUtil.h" -#include "../Registration.h" - -#include "ir/operation/Pool2D.h" -#include "util/ShapeInference.h" -#include "util/Utils.h" - -#include -#include -#include - -namespace onert -{ -namespace interp -{ -namespace pool2d -{ - -void preparePool2D(ExecEnv *env, const ir::Operation &node) -{ - const auto &pool_node = nnfw::misc::polymorphic_downcast(node); - const auto in_index = node.getInputs().at(pool_node.INPUT); - const auto out_index = node.getOutputs().at(0); - - const auto in_tensor = env->tensorAt(in_index); - UNUSED_RELEASE(in_tensor); - - assert(in_tensor->getShape().rank() == 4); - - const auto output_info = env->graph().operands().at(out_index).info(); - if (output_info.total_size() == 0) - { - // Handle unspecified output shape - const auto infered_output_shape = - shape_inference::inferPoolShape(in_tensor->tensorInfo().shape(), pool_node.param()); - env->allocateIfNeeded( - out_index, ir::OperandInfo::createStaticInfo(infered_output_shape, output_info.typeInfo())); - } - else - { - env->allocateIfNeeded(out_index, output_info); - } - - auto out_tensor = env->tensorAt(out_index); - UNUSED_RELEASE(out_tensor); - - // Handle same ifm & ofm data type only - assert(in_tensor->data_type() == out_tensor->data_type()); - assert(out_tensor->getShape().rank() == 4); -} - -template -void invoke(const nnfw::cker::PoolParams ¶ms, const nnfw::cker::Shape &in_shape, - const T *in_ptr, const nnfw::cker::Shape &out_shape, T *out_ptr, - ir::operation::Pool2D::PoolType op_type) -{ - switch (op_type) - { - case ir::operation::Pool2D::PoolType::AVG: - nnfw::cker::AveragePool(params, in_shape, in_ptr, out_shape, out_ptr); - break; - case ir::operation::Pool2D::PoolType::MAX: - nnfw::cker::MaxPool(params, in_shape, in_ptr, out_shape, out_ptr); - break; - default: - throw std::runtime_error{"Interp(Pool2D): NYI unsupported operation"}; - break; - } -} - -void invokePool2DOps(const ExecEnv *env, const ir::Operation &node) -{ - const auto &pool_node = nnfw::misc::polymorphic_downcast(node); - - const auto in_index = node.getInputs().at(0); - const auto out_index = node.getOutputs().at(0); - - // Check lhs shape is same with rhs (with broadcast) - const auto in_tensor = env->tensorAt(in_index); - const auto out_tensor = env->tensorAt(out_index); - - // TODO support NCHW frontend - const auto ifm_shape = in_tensor->tensorInfo().shape().asFeature(ir::Layout::NHWC); - const auto ofm_shape = out_tensor->tensorInfo().shape().asFeature(ir::Layout::NHWC); - const auto param = pool_node.param(); - const auto padding = - ir::calculatePadding(param.padding, ifm_shape, ofm_shape, param.stride, param.kw, param.kh); - // Calculate - nnfw::cker::PoolParams cker_param; - cker_param.filter_width = param.kw; - cker_param.filter_height = param.kh; - cker_param.padding_values.width = padding.left; - cker_param.padding_values.height = padding.top; - cker_param.stride_width = param.stride.horizontal; - cker_param.stride_height = param.stride.vertical; - - const auto data_type = in_tensor->data_type(); - if (data_type == ir::DataType::FLOAT32) - { - calculateActivationRange(param.activation, &cker_param.float_activation_min, - &cker_param.float_activation_max); - - const auto in_shape = convertShape(in_tensor->tensorInfo().shape()); - const auto out_shape = convertShape(out_tensor->tensorInfo().shape()); - const float *in_ptr = reinterpret_cast(in_tensor->bufferRO()); - float *out_ptr = reinterpret_cast(out_tensor->buffer()); - // Now, invoke() supports only Pool2D in float - invoke(cker_param, 
in_shape, in_ptr, out_shape, out_ptr, param.op_type); - } - else - { - throw std::runtime_error{"NYI: Support float only"}; - } -} -} // namespace pool2d - -OpKernel *getPool2D() -{ - static OpKernel kernel = {pool2d::preparePool2D, pool2d::invokePool2DOps}; - return &kernel; -} - -} // namespace interp -} // namespace onert diff --git a/runtime/onert/core/src/interp/operations/Reshape.cc b/runtime/onert/core/src/interp/operations/Reshape.cc deleted file mode 100644 index 1de5a5762..000000000 --- a/runtime/onert/core/src/interp/operations/Reshape.cc +++ /dev/null @@ -1,63 +0,0 @@ -/* - * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "../Registration.h" - -namespace onert -{ -namespace interp -{ -namespace -{ - -void prepare(ExecEnv *env, const ir::Operation &node) -{ - const auto in_index = node.getInputs().at(0); - const auto out_index = node.getOutputs().at(0); - - // Unspecified shape is not supported in operation node spec now - const auto output_info = env->graph().operands().at(out_index).info(); - env->allocateAndShareIfNeeded(out_index, output_info, in_index); - - assert(output_info.total_size() == env->graph().operands().at(in_index).info().total_size()); -} - -void invoke(const ExecEnv *env, const ir::Operation &node) -{ - const auto in_index = node.getInputs().at(0); - const auto out_index = node.getOutputs().at(0); - - if (env->tensorAt(in_index)->bufferRO() == env->tensorAt(out_index)->bufferRO()) - { - // Same data - return; - } - - const auto output_info = env->graph().operands().at(out_index).info(); - memcpy(env->tensorAt(out_index)->buffer(), env->tensorAt(in_index)->bufferRO(), - output_info.total_size()); -} - -} // namespace - -OpKernel *getReshape() -{ - static OpKernel kernel = {prepare, invoke}; - return &kernel; -} - -} // namespace interp -} // namespace onert diff --git a/runtime/onert/core/src/interp/operations/Softmax.cc b/runtime/onert/core/src/interp/operations/Softmax.cc deleted file mode 100644 index 8be2f2210..000000000 --- a/runtime/onert/core/src/interp/operations/Softmax.cc +++ /dev/null @@ -1,123 +0,0 @@ -/* - * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "OperationUtil.h" -#include "../Registration.h" - -#include "ir/operation/Softmax.h" - -#include -#include - -namespace onert -{ -namespace interp -{ -namespace -{ - -void prepareSoftMax(ExecEnv *env, const ir::Operation &node) -{ - const auto in_index = node.getInputs().at(0); - const auto out_index = node.getOutputs().at(0); - - const auto in_tensor = env->tensorAt(in_index); - UNUSED_RELEASE(in_tensor); - - assert((in_tensor->getShape().rank() == 4) || (in_tensor->getShape().rank() == 2)); - - // Output shape should be the same as input - // Output type is pre-defined in model - const auto output_shape = env->graph().operands().at(in_index).info().shape(); - const auto output_type = env->graph().operands().at(out_index).info().typeInfo(); - - const auto output_info = ir::OperandInfo::createStaticInfo(output_shape, output_type); - env->allocateIfNeeded(out_index, output_info); - - auto out_tensor = env->tensorAt(out_index); - UNUSED_RELEASE(out_tensor); - - // Check output shape is the same as input - assert(in_tensor->getShape().rank() == out_tensor->getShape().rank()); - for (int32_t i = 0; i < in_tensor->getShape().rank(); i++) - { - assert(in_tensor->getShape().dim(i) == out_tensor->getShape().dim(i)); - } -} - -void invoke(const ITensor *in_tensor, const ITensor *out_tensor, - const ir::operation::Softmax::Param &param) -{ - const float *in_ptr = reinterpret_cast<const float *>(in_tensor->bufferRO()); - float *out_ptr = reinterpret_cast<float *>(out_tensor->buffer()); - - float beta = param.beta; - - if (in_tensor->getShape().rank() == 2) - { - uint32_t batch_size = in_tensor->getShape().dim(0); - uint32_t input_size = in_tensor->getShape().dim(1); - - nnfw::cker::Softmax(in_ptr, input_size, batch_size, beta, out_ptr); - } - else if (in_tensor->getShape().rank() == 4) - { - const auto in_shape = convertShape(in_tensor->tensorInfo().shape()); - const auto out_shape = convertShape(out_tensor->tensorInfo().shape()); - - nnfw::cker::SoftmaxParams cker_param; - cker_param.beta = beta; - - nnfw::cker::Softmax(cker_param, in_shape, in_ptr, out_shape, out_ptr); - } - else - { - throw std::runtime_error{"Unsupported input dimension: support 2D or 4D"}; - } -} - -void invokeSoftMax(const ExecEnv *env, const ir::Operation &node) -{ - const auto &softmax_node = nnfw::misc::polymorphic_downcast<const ir::operation::Softmax &>(node); - - const auto in_index = node.getInputs().at(0); - const auto out_index = node.getOutputs().at(0); - - const auto in_tensor = env->tensorAt(in_index); - const auto out_tensor = env->tensorAt(out_index); - - const auto in_data_type = in_tensor->data_type(); - const auto out_data_type = out_tensor->data_type(); - if ((in_data_type == ir::DataType::FLOAT32) && (out_data_type == ir::DataType::FLOAT32)) - { - invoke(in_tensor, out_tensor, softmax_node.param()); - } - else - { - throw std::runtime_error{"NYI: Support float32 only"}; - } -} - -} // namespace - -OpKernel *getSoftmax() -{ - static OpKernel kernel = {prepareSoftMax, invokeSoftMax}; - return &kernel; -} - -} // namespace interp -} // namespace onert diff --git a/runtime/onert/core/src/interp/operations/TransposeConv.cc b/runtime/onert/core/src/interp/operations/TransposeConv.cc deleted file mode 100644 index 59c8e8cdf..000000000 --- a/runtime/onert/core/src/interp/operations/TransposeConv.cc +++ /dev/null @@ -1,141 +0,0 @@ -/* - * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "OperationUtil.h" -#include "../Registration.h" - -#include "ir/operation/TransposeConv.h" - -#include -#include - -namespace onert -{ -namespace interp -{ -namespace -{ - -void prepareTransposeConv(ExecEnv *env, const ir::Operation &node) -{ - const auto ifm_index = node.getInputs().at(ir::operation::TransposeConv::INPUT); - const auto ker_index = node.getInputs().at(ir::operation::TransposeConv::KERNEL); - const auto ofm_shape_index = node.getInputs().at(ir::operation::TransposeConv::OUTPUT_SHAPE); - const auto ofm_index = node.getOutputs().at(0); - - const auto ifm_tensor = env->tensorAt(ifm_index); - const auto ker_tensor = env->tensorAt(ker_index); - const auto ofm_shape_tensor = env->tensorAt(ofm_shape_index); - - assert(ifm_tensor->getShape().rank() == 4); - assert(ker_tensor->getShape().rank() == 4); - assert(ofm_shape_tensor->getShape().rank() == 1); - - UNUSED_RELEASE(ifm_tensor); - UNUSED_RELEASE(ker_tensor); - UNUSED_RELEASE(ofm_shape_tensor); - - const auto output_info = env->graph().operands().at(ofm_index).info(); - if (output_info.total_size() == 0) - { - // TODO: Handle unspecified output shape - throw std::runtime_error{"Interp(TConv): NYI unspecified output shape"}; - } - else - { - env->allocateIfNeeded(ofm_index, output_info); - } - - auto ofm_tensor = env->tensorAt(ofm_index); - UNUSED_RELEASE(ofm_tensor); - - // Handle same ifm & ofm data type only - if (ifm_tensor->data_type() != ofm_tensor->data_type()) - { - throw std::runtime_error{"Interp(TConv): Different I/O data type"}; - } - - if (ofm_tensor->getShape().rank() != 4) - { - throw std::runtime_error{"Interp(TConv): Invalid output rank"}; - } -} - -void invoke(const ITensor *ifm_tensor, const ITensor *ker_tensor, const ITensor *ofm_tensor, - const ir::operation::TransposeConv::Param &param) -{ - const auto ifm_shape = ifm_tensor->tensorInfo().shape().asFeature(ir::Layout::NHWC); - const auto ofm_shape = ofm_tensor->tensorInfo().shape().asFeature(ir::Layout::NHWC); - // Kernel format is [depth_out, kernel_height, kernel_width, depth_in].
- const auto ker_shape = ker_tensor->tensorInfo().shape(); - const auto ker_height = ker_shape.dim(1); - const auto ker_width = ker_shape.dim(2); - const auto padding = - ir::calculatePadding(param.padding, ofm_shape, ifm_shape, param.stride, ker_width, ker_height); - - nnfw::cker::TransposeConvParams cker_param; - cker_param.padding_values.width = padding.left; - cker_param.padding_values.height = padding.top; - cker_param.stride_width = param.stride.horizontal; - cker_param.stride_height = param.stride.vertical; - cker_param.dilation_width_factor = 1; - cker_param.dilation_height_factor = 1; - - const auto cker_ifm_shape = convertShape(ifm_tensor->tensorInfo().shape()); - const auto cker_ker_shape = convertShape(ker_tensor->tensorInfo().shape()); - const auto cker_ofm_shape = convertShape(ofm_tensor->tensorInfo().shape()); - const float *ifm_ptr = reinterpret_cast(ifm_tensor->bufferRO()); - const float *ker_ptr = reinterpret_cast(ker_tensor->bufferRO()); - float *ofm_ptr = reinterpret_cast(ofm_tensor->buffer()); - - nnfw::cker::TransposeConv(cker_param, cker_ifm_shape, ifm_ptr, cker_ker_shape, ker_ptr, - cker_ofm_shape, ofm_ptr); -} - -void invokeTransposeConv(const ExecEnv *env, const ir::Operation &node) -{ - const auto &tconv_node = - nnfw::misc::polymorphic_downcast(node); - - const auto ifm_index = node.getInputs().at(ir::operation::TransposeConv::INPUT); - const auto ker_index = node.getInputs().at(ir::operation::TransposeConv::KERNEL); - const auto ofm_index = node.getOutputs().at(0); - - const auto ifm_tensor = env->tensorAt(ifm_index); - const auto ker_tensor = env->tensorAt(ker_index); - const auto ofm_tensor = env->tensorAt(ofm_index); - - const auto data_type = ifm_tensor->data_type(); - if (data_type == ir::DataType::FLOAT32) - { - invoke(ifm_tensor, ker_tensor, ofm_tensor, tconv_node.param()); - } - else - { - throw std::runtime_error{"Interp(TConv): Support float32 only"}; - } -} - -} // namespace - -OpKernel *getTransposeConv() -{ - static OpKernel kernel = {prepareTransposeConv, invokeTransposeConv}; - return &kernel; -} - -} // namespace interp -} // namespace onert diff --git a/runtime/onert/core/src/ir/Shape.cc b/runtime/onert/core/src/ir/Shape.cc index a7c50a266..e4e4c154b 100644 --- a/runtime/onert/core/src/ir/Shape.cc +++ b/runtime/onert/core/src/ir/Shape.cc @@ -26,10 +26,10 @@ namespace onert namespace ir { -int32_t const Shape::UNSPECIFIED_DIM = -1; +int32_t const Shape::kUnspecifiedDim = -1; // NNFW_MAX_RANK is 6 -int32_t const Shape::MAX_RANK = 6; +int32_t const Shape::kMaxRank = 6; FeatureShape Shape::asFeature(Layout layout) const { @@ -80,7 +80,7 @@ uint64_t Shape::num_elements() const { // if dimension is 0, it means unspecified and cannot calculate the total number of elements if (std::any_of(_dimensions.begin(), _dimensions.end(), - [](const int32_t &v) { return v == UNSPECIFIED_DIM; })) + [](const int32_t &v) { return v == kUnspecifiedDim; })) throw std::runtime_error("num_elements() cannot calculate when any dimension is unspecified"); return std::accumulate(_dimensions.cbegin(), _dimensions.cend(), UINT64_C(1), @@ -89,7 +89,7 @@ uint64_t Shape::num_elements() const Shape permuteShape(const Shape &shape, Layout from, Layout to) { - assert(shape.rank() <= Shape::MAX_RANK); + assert(shape.rank() <= Shape::kMaxRank); Shape ret{shape}; if (from == to) return ret; diff --git a/runtime/onert/core/src/ir/Shape.test.cc b/runtime/onert/core/src/ir/Shape.test.cc index afdb29254..4788522d3 100644 --- a/runtime/onert/core/src/ir/Shape.test.cc +++ 
b/runtime/onert/core/src/ir/Shape.test.cc @@ -48,7 +48,7 @@ TEST(ShapeTest, neg_basic_test) onert::ir::Shape shape(2); shape.dim(0) = 1; - shape.dim(1) = onert::ir::Shape::UNSPECIFIED_DIM; + shape.dim(1) = onert::ir::Shape::kUnspecifiedDim; ASSERT_EQ(shape.rank(), 2); ASSERT_EQ(onert::ir::rankMaybeUnspecified(shape), false); diff --git a/runtime/onert/core/src/util/ChromeTracingEventWriter.cc b/runtime/onert/core/src/util/ChromeTracingEventWriter.cc index d868efedf..c3f5179df 100644 --- a/runtime/onert/core/src/util/ChromeTracingEventWriter.cc +++ b/runtime/onert/core/src/util/ChromeTracingEventWriter.cc @@ -168,7 +168,7 @@ void ChromeTracingWriter::flush(const std::vector _os << "{\n"; _os << " " << quote("traceEvents") << ": [\n"; - for (auto &recorder : recorders) + for (const auto &recorder : recorders) { flushOneRecord(*recorder); } @@ -180,7 +180,7 @@ void ChromeTracingWriter::flush(const std::vector void ChromeTracingWriter::flushOneRecord(const EventRecorder &recorder) { - for (auto &evt : recorder.duration_events()) + for (const auto &evt : recorder.duration_events()) { const std::string name = getLabel(*evt); const std::string tid = getTid(*evt); @@ -188,7 +188,7 @@ void ChromeTracingWriter::flushOneRecord(const EventRecorder &recorder) _os << " " << object(*evt, name, tid) << ",\n"; } - for (auto &evt : recorder.counter_events()) + for (const auto &evt : recorder.counter_events()) { _os << " " << object(evt) << ",\n"; } diff --git a/runtime/onert/core/src/util/MDTableEventWriter.cc b/runtime/onert/core/src/util/MDTableEventWriter.cc index 7a8b9f234..13dab5b77 100644 --- a/runtime/onert/core/src/util/MDTableEventWriter.cc +++ b/runtime/onert/core/src/util/MDTableEventWriter.cc @@ -32,7 +32,7 @@ namespace void writeMDTableRow(std::ostream &os, const std::vector &list) { os << "| "; - for (auto &key : list) + for (const auto &key : list) { os << key << " | "; } @@ -227,7 +227,7 @@ struct MDTableBuilder MDTableBuilder &build() { - for (auto &it : divideGraph()) + for (const auto &it : divideGraph()) { size_t begin_idx = it.first; size_t end_idx = it.second; @@ -314,7 +314,7 @@ struct MDTableBuilder graph.end_ts = std::stoull(_duration_events[end_idx]->ts); graph.setOperations(name_to_op); - for (auto &arg : _duration_events[end_idx]->args) + for (const auto &arg : _duration_events[end_idx]->args) { if (arg.first == "session") graph.session_index = arg.second; @@ -358,7 +358,7 @@ struct MDTableBuilder void MDTableWriter::flush(const std::vector> &records) { - for (auto &recorder : records) + for (const auto &recorder : records) { MDTableBuilder(recorder->duration_events(), recorder->counter_events()).build().write(_os); } diff --git a/runtime/onert/core/src/util/SNPEEventWriter.cc b/runtime/onert/core/src/util/SNPEEventWriter.cc index 4dea6d16c..87bbfc662 100644 --- a/runtime/onert/core/src/util/SNPEEventWriter.cc +++ b/runtime/onert/core/src/util/SNPEEventWriter.cc @@ -103,9 +103,9 @@ void SNPEWriter::flush(const std::vector> &record // Memory { std::unordered_map mem_stats; - for (auto &recorder : recorders) + for (const auto &recorder : recorders) { - for (auto &evt : recorder->counter_events()) + for (const auto &evt : recorder->counter_events()) { auto &mem_stat = mem_stats[evt.name]; uint64_t val = std::stoull(evt.values.at("value")); @@ -114,7 +114,7 @@ void SNPEWriter::flush(const std::vector> &record } auto &mem = exec_data["memory"] = Json::Value{Json::objectValue}; - for (auto &kv : mem_stats) + for (const auto &kv : mem_stats) { auto &key = kv.first; auto &val = 
kv.second; @@ -132,9 +132,9 @@ void SNPEWriter::flush(const std::vector> &record // 2D keys : stats[tid][name] std::unordered_map> stats; std::unordered_map> begin_timestamps; - for (auto &recorder : recorders) + for (const auto &recorder : recorders) { - for (auto &evt : recorder->duration_events()) + for (const auto &evt : recorder->duration_events()) { std::string evt_name = getLabel(*evt); std::string evt_tid = getBackend(*evt); @@ -160,17 +160,17 @@ void SNPEWriter::flush(const std::vector> &record } } - for (auto &kv : begin_timestamps) - for (auto &kv2 : kv.second) + for (const auto &kv : begin_timestamps) + for (const auto &kv2 : kv.second) if (kv2.second != 0) throw std::runtime_error{"Invalid Data - B and E pair does not match."}; - for (auto &kv : stats) + for (const auto &kv : stats) { - auto &tid = kv.first; - auto &map = kv.second; + const auto &tid = kv.first; + const auto &map = kv.second; auto &json_tid = exec_data[tid] = Json::Value{Json::objectValue}; - for (auto &kv : map) + for (const auto &kv : map) { auto &name = kv.first; auto &val = kv.second; diff --git a/runtime/onert/core/src/util/ShapeInference.cc b/runtime/onert/core/src/util/ShapeInference.cc index 173de29c7..862d6f725 100644 --- a/runtime/onert/core/src/util/ShapeInference.cc +++ b/runtime/onert/core/src/util/ShapeInference.cc @@ -608,12 +608,12 @@ ir::Shape inferReshapeShape(const int32_t *shape_buf, const int32_t shape_num_el const size_t total_num_elements) { ir::Shape ret(shape_num_elements); - int32_t flatten_dim = ir::Shape::UNSPECIFIED_DIM; + int32_t flatten_dim = ir::Shape::kUnspecifiedDim; for (int32_t i = 0; i < shape_num_elements; ++i) { if (shape_buf[i] < 0) { - if (flatten_dim != ir::Shape::UNSPECIFIED_DIM) + if (flatten_dim != ir::Shape::kUnspecifiedDim) throw std::runtime_error("Reshape: 2nd param has special dim(for flatten) more than twice"); flatten_dim = i; ret.dim(i) = 1; @@ -623,7 +623,7 @@ ir::Shape inferReshapeShape(const int32_t *shape_buf, const int32_t shape_num_el ret.dim(i) = shape_buf[i]; } } - if (flatten_dim != ir::Shape::UNSPECIFIED_DIM) + if (flatten_dim != ir::Shape::kUnspecifiedDim) ret.dim(flatten_dim) = total_num_elements / ret.num_elements(); // Check reshapable diff --git a/runtime/onert/frontend/base_loader/include/base_loader.h b/runtime/onert/frontend/base_loader/include/base_loader.h index cf080abbc..878a594cc 100644 --- a/runtime/onert/frontend/base_loader/include/base_loader.h +++ b/runtime/onert/frontend/base_loader/include/base_loader.h @@ -68,8 +68,7 @@ public: * @param model reference to model */ explicit BaseLoader(std::unique_ptr &model) - : _base{nullptr}, _pagesize(getpagesize()), _fd(-1), _model(model), _domain_model{nullptr}, - _tensor_names(std::make_shared>()) + : _base{nullptr}, _pagesize(getpagesize()), _fd(-1), _model(model), _domain_model{nullptr} { _use_mmaped_data = util::getConfigBool(util::config::USE_MMAPED_DATA); } @@ -194,7 +193,7 @@ protected: const Model *_domain_model; // Maps Tensor indices to onert Operands. 
std::vector _tensor_to_operand; - std::shared_ptr> _tensor_names; + std::unordered_map _tensor_names; // Verifier std::unique_ptr _verifier; // Boolean flag to use MMAPED_DATA @@ -411,7 +410,7 @@ ir::OperandIndex BaseLoader::loadOperand(const Tensor *tensor, ir: subg.setOperandValue(operand_index, std::move(data_obj)); } - _tensor_names->emplace(operand_index, tensor->name()->str()); + _tensor_names.emplace(operand_index, tensor->name()->str()); // Variable if (tensor->is_variable()) @@ -1297,8 +1296,8 @@ void BaseLoader::loadIf(const Operator *op, ir::Graph &subg) verifySubgraphIndex(else_index); ir::operation::If::Param param; - param.then_subg_index = ir::SubgraphIndex{static_cast(then_index)}; - param.else_subg_index = ir::SubgraphIndex{static_cast(else_index)}; + param.then_subg_index = ir::SubgraphIndex{static_cast(then_index)}; + param.else_subg_index = ir::SubgraphIndex{static_cast(else_index)}; loadOperationTo(op, subg, param); } @@ -1314,8 +1313,8 @@ void BaseLoader::loadWhile(const Operator *op, ir::Graph &subg) verifySubgraphIndex(body_index); ir::operation::While::Param param; - param.cond_subg_index = ir::SubgraphIndex{static_cast(cond_index)}; - param.body_subg_index = ir::SubgraphIndex{static_cast(body_index)}; + param.cond_subg_index = ir::SubgraphIndex{static_cast(cond_index)}; + param.body_subg_index = ir::SubgraphIndex{static_cast(body_index)}; loadOperationTo(op, subg, param); } @@ -1663,6 +1662,12 @@ void BaseLoader::loadOperation(const Operator *op, ir::Graph &subg case BuiltinOperator::BuiltinOperator_DEPTH_TO_SPACE: loadDepthToSpace(op, subg); return; + case BuiltinOperator::BuiltinOperator_EMBEDDING_LOOKUP: + loadOperationTo(op, subg); + return; + case BuiltinOperator::BuiltinOperator_HASHTABLE_LOOKUP: + loadOperationTo(op, subg); + return; default: throw std::runtime_error( std::string("Unsupported operation: ").append(EnumNameBuiltinOperator(builtin_op))); @@ -1682,10 +1687,15 @@ template void BaseLoader::loadModel() // Load subgraphs and map operations on subgraph const auto subgraphs = _domain_model->subgraphs(); auto model = std::make_unique(); - for (uint32_t subgraph_index = 0; subgraph_index < subgraphs->size(); ++subgraph_index) + if (subgraphs->size() - 1 > ir::SubgraphIndex::max()) + throw std::runtime_error{"The number of subgraphs cannot exceed " + + std::to_string(ir::SubgraphIndex::max() + 1)}; + for (uint16_t subgraph_index = 0; subgraph_index < subgraphs->size(); ++subgraph_index) { auto subg = loadSubgraph((*_domain_model->subgraphs())[subgraph_index]); - model->push(ir::SubgraphIndex{subgraph_index}, std::move(subg)); + // NOTE: Used () instead of {}, which does not check narrowing. + // It is okay since overflow is checked the above if-statement. 
+ model->push(ir::SubgraphIndex(subgraph_index), std::move(subg)); } _model = std::move(model); } diff --git a/runtime/onert/frontend/circle/src/circle_loader.cc b/runtime/onert/frontend/circle/src/circle_loader.cc index 5abcc9cd0..5bf626d6c 100644 --- a/runtime/onert/frontend/circle/src/circle_loader.cc +++ b/runtime/onert/frontend/circle/src/circle_loader.cc @@ -112,13 +112,13 @@ private: for (const std::int32_t input_ind : *circle_subg->inputs()) { subg->addInput(tensorIdxToOperandIdx(input_ind), - _tensor_names->at(_tensor_to_operand[input_ind])); + _tensor_names.at(_tensor_to_operand[input_ind])); } // Set outputs for (const std::int32_t output_ind : *circle_subg->outputs()) { subg->addOutput(tensorIdxToOperandIdx(output_ind), - _tensor_names->at(_tensor_to_operand[output_ind])); + _tensor_names.at(_tensor_to_operand[output_ind])); } // Create operations for (const auto *op : *circle_subg->operators()) diff --git a/runtime/onert/frontend/nnapi/CMakeLists.txt b/runtime/onert/frontend/nnapi/CMakeLists.txt index dafd84ccf..b66b32e89 100644 --- a/runtime/onert/frontend/nnapi/CMakeLists.txt +++ b/runtime/onert/frontend/nnapi/CMakeLists.txt @@ -24,4 +24,4 @@ target_link_libraries(test_onert_frontend_nnapi PRIVATE ${LIB_ONERT} dl) target_link_libraries(test_onert_frontend_nnapi PRIVATE gtest) target_link_libraries(test_onert_frontend_nnapi PRIVATE gtest_main) -install(TARGETS test_onert_frontend_nnapi DESTINATION unittest_standalone) +install(TARGETS test_onert_frontend_nnapi DESTINATION unittest) diff --git a/runtime/onert/frontend/nnapi/compilation.cc b/runtime/onert/frontend/nnapi/compilation.cc index 871c040ef..2c56f061a 100644 --- a/runtime/onert/frontend/nnapi/compilation.cc +++ b/runtime/onert/frontend/nnapi/compilation.cc @@ -58,7 +58,7 @@ int ANeuralNetworksCompilation_finish(ANeuralNetworksCompilation *compilation) return ANEURALNETWORKS_UNEXPECTED_NULL; } - if (compilation->state() != ::onert::compiler::State::CREATED) + if (compilation->isFinished()) { VERBOSE(NNAPI::Compilation) << "finish: Already finished" << std::endl; return ANEURALNETWORKS_BAD_STATE; @@ -87,7 +87,7 @@ int ANeuralNetworksCompilation_setPreference(ANeuralNetworksCompilation *compila return ANEURALNETWORKS_UNEXPECTED_NULL; } - if (compilation->state() != ::onert::compiler::State::CREATED) + if (compilation->isFinished()) { VERBOSE(NNAPI::Compilation) << "setPreference: Already finished" << std::endl; return ANEURALNETWORKS_BAD_STATE; diff --git a/runtime/onert/frontend/nnapi/execution.cc b/runtime/onert/frontend/nnapi/execution.cc index 19636a84d..4e1a985f3 100644 --- a/runtime/onert/frontend/nnapi/execution.cc +++ b/runtime/onert/frontend/nnapi/execution.cc @@ -37,7 +37,7 @@ int ANeuralNetworksExecution_create(ANeuralNetworksCompilation *compilation, return ANEURALNETWORKS_UNEXPECTED_NULL; } - std::shared_ptr executors; + std::shared_ptr executors; compilation->publish(executors); diff --git a/runtime/onert/frontend/nnapi/wrapper/ANeuralNetworksCompilation.cc b/runtime/onert/frontend/nnapi/wrapper/ANeuralNetworksCompilation.cc index bb247b97f..3b5edc180 100644 --- a/runtime/onert/frontend/nnapi/wrapper/ANeuralNetworksCompilation.cc +++ b/runtime/onert/frontend/nnapi/wrapper/ANeuralNetworksCompilation.cc @@ -26,9 +26,7 @@ ANeuralNetworksCompilation::ANeuralNetworksCompilation(const ANeuralNetworksMode _compiler{std::make_shared(_model, *_coptions)} { if (model->allowedToFp16()) - { - _compiler->enableToFp16(); - } + _coptions->enableToFp16(); } bool ANeuralNetworksCompilation::finish() noexcept @@ -36,6 +34,7 
@@ bool ANeuralNetworksCompilation::finish() noexcept try { _artifact = _compiler->compile(); + _compiler = nullptr; } catch (const std::exception &e) { diff --git a/runtime/onert/frontend/nnapi/wrapper/ANeuralNetworksCompilation.h b/runtime/onert/frontend/nnapi/wrapper/ANeuralNetworksCompilation.h index dff5c6dc6..3898f1d5e 100644 --- a/runtime/onert/frontend/nnapi/wrapper/ANeuralNetworksCompilation.h +++ b/runtime/onert/frontend/nnapi/wrapper/ANeuralNetworksCompilation.h @@ -22,7 +22,7 @@ #include "compiler/Compiler.h" #include "ir/Graph.h" #include "ir/Model.h" -#include "exec/Executors.h" +#include "exec/IExecutors.h" #include "util/TracingCtx.h" struct ANeuralNetworksCompilation @@ -32,9 +32,9 @@ public: public: bool finish() noexcept; + bool isFinished() noexcept { return _compiler == nullptr; } - onert::compiler::State state(void) noexcept { return _compiler->state(); } - void publish(std::shared_ptr &executors) noexcept + void publish(std::shared_ptr &executors) noexcept { executors = _artifact ? _artifact->_executors : nullptr; } diff --git a/runtime/onert/frontend/nnapi/wrapper/ANeuralNetworksExecution.h b/runtime/onert/frontend/nnapi/wrapper/ANeuralNetworksExecution.h index 110c7cd55..6fbc4c2e0 100644 --- a/runtime/onert/frontend/nnapi/wrapper/ANeuralNetworksExecution.h +++ b/runtime/onert/frontend/nnapi/wrapper/ANeuralNetworksExecution.h @@ -26,7 +26,7 @@ struct ANeuralNetworksExecution { public: - ANeuralNetworksExecution(const std::shared_ptr &executors) + ANeuralNetworksExecution(const std::shared_ptr &executors) : _execution{std::make_shared(executors)} { // DO NOTHING diff --git a/runtime/onert/frontend/tflite/src/tflite_loader.cc b/runtime/onert/frontend/tflite/src/tflite_loader.cc index fe69e4e2a..dc8564632 100644 --- a/runtime/onert/frontend/tflite/src/tflite_loader.cc +++ b/runtime/onert/frontend/tflite/src/tflite_loader.cc @@ -99,13 +99,13 @@ private: for (const std::int32_t input_ind : *tflite_subg->inputs()) { subg->addInput(tensorIdxToOperandIdx(input_ind), - _tensor_names->at(_tensor_to_operand[input_ind])); + _tensor_names.at(_tensor_to_operand[input_ind])); } // Set outputs for (const std::int32_t output_ind : *tflite_subg->outputs()) { subg->addOutput(tensorIdxToOperandIdx(output_ind), - _tensor_names->at(_tensor_to_operand[output_ind])); + _tensor_names.at(_tensor_to_operand[output_ind])); } // Create operations for (const auto *op : *tflite_subg->operators()) @@ -113,7 +113,6 @@ private: loadOperation(op, *subg); } - subg->setTensorName(_tensor_names); subg->verify(); return subg; -- cgit v1.2.3
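Note on the narrowing comment added to BaseLoader::loadModel above: brace-initialization rejects implicit narrowing conversions at compile time, while parenthesized construction does not, so the loader range-checks the 32-bit subgraph count before constructing 16-bit subgraph indices with (). Below is a minimal standalone sketch of that guard-then-convert pattern; Index16 is a hypothetical stand-in, not onert's real ir::SubgraphIndex.

#include <cstdint>
#include <stdexcept>
#include <string>

// Hypothetical 16-bit index wrapper; a stand-in for ir::SubgraphIndex, not the real class.
struct Index16
{
  explicit Index16(uint16_t v) : _value{v} {}
  static constexpr uint32_t max() { return UINT16_MAX; }
  uint16_t value() const { return _value; }

private:
  uint16_t _value;
};

// Guard first, then convert: the explicit range check is what makes the
// unchecked ()-style construction safe.
Index16 makeIndex(uint32_t raw)
{
  if (raw > Index16::max())
    throw std::runtime_error{"index out of range: " + std::to_string(raw)};
  // Index16{raw} would not compile: list-initialization rejects the narrowing
  // uint32_t -> uint16_t conversion. Index16(raw) does not check narrowing,
  // which is acceptable only because of the range check above.
  return Index16(raw);
}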
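The interpreter's OperationUtil.h removed earlier in this patch computed elementwise output shapes with the usual right-aligned broadcasting rule (paired dimensions must match or one of them must be 1). A minimal sketch of that rule over plain std::vector<int32_t>, with illustrative names only rather than the removed onert helpers:

#include <algorithm>
#include <cstdint>
#include <stdexcept>
#include <vector>

// Right-aligned broadcasting: walk both shapes from the last dimension;
// each pair must match or contain a 1, and missing leading dimensions act as 1.
std::vector<int32_t> broadcastShapes(const std::vector<int32_t> &lhs,
                                     const std::vector<int32_t> &rhs)
{
  const size_t out_rank = std::max(lhs.size(), rhs.size());
  std::vector<int32_t> out(out_rank, 1);
  for (size_t i = 0; i < out_rank; ++i)
  {
    const int32_t l = i < lhs.size() ? lhs[lhs.size() - 1 - i] : 1;
    const int32_t r = i < rhs.size() ? rhs[rhs.size() - 1 - i] : 1;
    if (l != r && l != 1 && r != 1)
      throw std::runtime_error{"broadcast: incompatible dimensions"};
    out[out_rank - 1 - i] = std::max(l, r);
  }
  return out;
}

// Example: broadcastShapes({2, 3, 1, 4}, {3, 5, 4}) yields {2, 3, 5, 4}.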