Diffstat (limited to 'runtime/onert/core/src/compiler/ExecutorFactory.cc')
-rw-r--r-- | runtime/onert/core/src/compiler/ExecutorFactory.cc | 501 |
1 file changed, 501 insertions, 0 deletions
diff --git a/runtime/onert/core/src/compiler/ExecutorFactory.cc b/runtime/onert/core/src/compiler/ExecutorFactory.cc
new file mode 100644
index 000000000..bb325ffbc
--- /dev/null
+++ b/runtime/onert/core/src/compiler/ExecutorFactory.cc
@@ -0,0 +1,501 @@
/*
 * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "ExecutorFactory.h"

#include <functional>
#include "exec/ExecutionObservers.h"
#include "exec/LinearExecutor.h"
#include "exec/DataflowExecutor.h"
#include "exec/ParallelExecutor.h"
#include "compiler/BackendManager.h"
#include "compiler/ExecutionBuilder.h"
#include "exec/ExecTime.h"
#include "compiler/Linear.h"
#include "compiler/TensorBuilders.h"
#include "backend/IConstantInitializer.h"
#include "backend/IKernelGenerator.h"
#include "backend/IOptimizer.h"
#include "backend/IPortableTensor.h"
#include "backend/ITensorRegister.h"
#include "backend/controlflow/Config.h"
#include "backend/controlflow/KernelGenerator.h"
#include "backend/controlflow/UserTensor.h"
#include "backend/controlflow/TensorBuilder.h"
#include <memory>

namespace onert
{
namespace
{

class SyncFunction final : public exec::IFunction
{
public:
  virtual ~SyncFunction() = default;
  SyncFunction(std::unique_ptr<exec::IFunction> fn, const std::shared_ptr<backend::IConfig> config)
      : _fn{std::move(fn)}, _config{config}
  {
    assert(_fn);
    assert(_config);
  }

  void run() override
  {
    _fn->run();
    _config->sync();
  }

  void prepare() override { _fn->prepare(); }

private:
  std::unique_ptr<exec::IFunction> _fn;
  std::shared_ptr<backend::IConfig> _config;
};

} // namespace
} // namespace onert

namespace onert
{
namespace compiler
{

ExecutorFactory &ExecutorFactory::get()
{
  static ExecutorFactory singleton;
  return singleton;
}

ExecutorFactory::ExecutorFactory()
{
  _map["Linear"] = createLinearExecutor;
  _map["Dataflow"] = std::bind(createDataflowExecutor, std::placeholders::_1, std::placeholders::_2,
                               std::placeholders::_3, false);
  _map["Parallel"] = std::bind(createDataflowExecutor, std::placeholders::_1, std::placeholders::_2,
                               std::placeholders::_3, true);
}

exec::IExecutor *ExecutorFactory::create(std::unique_ptr<compiler::LoweredGraph> lowered_graph,
                                         const compiler::CompilerOptions &options,
                                         const std::shared_ptr<exec::ExecutorMap> &executor_map)
{
  return _map.at(options.executor)(std::move(lowered_graph), options, executor_map);
}

void ExecutorFactory::initializeBackendContext(compiler::LoweredGraph *lowered_graph)
{
  struct Entry
  {
    std::vector<backend::BackendContext::OperationInfo> operation_list;
    std::vector<ir::OperandIndex> operand_list;
  };
  std::unordered_map<const backend::Backend *, Entry> backend_assets;

  // Build lists for operations
  lowered_graph->op_seqs().iterate(
      [&](const ir::OpSequenceIndex &op_seq_index, const ir::OpSequence &op_seq) {
        auto &op_seq_li = lowered_graph->getLowerInfo()->op_seq;
        auto backend = op_seq_li.at(op_seq_index)->backend();
        for (auto &operation_idx : op_seq.operations())
        {
          backend_assets[backend].operation_list.emplace_back(operation_idx, op_seq.getLayout());
        }
      });

  // Build lists for operands
  lowered_graph->graph().operands().iterate([&](const ir::OperandIndex &ind, const ir::Operand &) {
    const auto lower_info = lowered_graph->getLowerInfo(ind);
    for (auto factor : lower_info->def_factors())
    {
      auto backend = factor.backend();
      backend_assets[backend].operand_list.emplace_back(ind);
    }
  });

  for (auto &pair : backend_assets)
  {
    auto backend = pair.first;
    auto &arg = pair.second;
    lowered_graph->backend_contexts().at(backend)->initialize(arg.operation_list, arg.operand_list);
  }
}

void ExecutorFactory::runTensorRegistration(compiler::LoweredGraph *lowered_graph,
                                            const std::vector<ir::OpSequenceIndex> &order)
{
  for (const auto index : order)
  {
    const auto &op_seq = lowered_graph->op_seqs().at(index);
    const auto backend = lowered_graph->getLowerInfo(index)->backend();
    const auto tensor_register = lowered_graph->backend_contexts().at(backend)->tensor_register;
    auto tensor_builder = lowered_graph->backend_contexts().at(backend)->tensor_builder;
    auto model_io = lowered_graph->graph().getInputs() + lowered_graph->graph().getOutputs();

    if (tensor_register)
    {
      // Custom registration
      tensor_register->registerTensors(op_seq, lowered_graph->getLowerInfo());
    }
    else
    {
      // Default registration
      for (const auto op_idx : op_seq)
      {
        const auto &op = lowered_graph->graph().operations().at(op_idx);
        for (const auto &index :
             (op.getInputs() | ir::Remove::UNDEFINED) + (op.getOutputs() | ir::Remove::UNDEFINED))
        {
          if (!tensor_builder->isRegistered(index) && !model_io.contains(index))
          {
            const auto &operand_lower_info =
                lowered_graph->getLowerInfo(index)->def_factors().getOnlyElement();

            // E.g., permute (CPU) -> tensor A -> MaxPool2D(acl_cl)
            // op.getOutputs() of permute (CPU) returns tensor A
            // but tensor A belongs to the backend of acl_cl.
            // So, we have to make this tensor NOT registered for CPU.
            if (operand_lower_info.backend() != backend)
              continue;

            const auto &obj = lowered_graph->graph().operands().at(index);
            const auto frontend_layout = op_seq.getLayout();
            const auto backend_layout = operand_lower_info.layout();
            ir::OperandInfo backend_info{permuteShape(obj.shape(), frontend_layout, backend_layout),
                                         obj.typeInfo(), obj.info().memAllocType(),
                                         obj.isConstant()};
            tensor_builder->registerTensorInfo(index, backend_info, backend_layout);
          }
        }
      }
    }
  }
}

std::vector<backend::ITensor *>
ExecutorFactory::initializeModelIOTensors(compiler::LoweredGraph &lowered_graph,
                                          const ir::OperandIndexSequence &indices)
{
  std::vector<backend::ITensor *> ret;

  // TODO Store controlflow backend in BackendContext
  std::shared_ptr<backend::controlflow::TensorBuilder> cf_tensor_builder;
  std::shared_ptr<backend::controlflow::TensorRegistry> cf_tensor_reg;
  for (const auto &e : lowered_graph.backend_contexts())
  {
    auto backend = e.first;
    auto &context = e.second;
    if (backend->config()->id() == backend::controlflow::Config::ID)
    {
      cf_tensor_builder =
          std::dynamic_pointer_cast<backend::controlflow::TensorBuilder>(context->tensor_builder);
      cf_tensor_reg =
          std::dynamic_pointer_cast<backend::controlflow::TensorRegistry>(context->tensor_registry);
    }
  }
  assert(cf_tensor_builder);
  assert(cf_tensor_reg);

  for (auto ind : indices)
  {
    const auto &operand = lowered_graph.graph().operands().at(ind);
    auto tensor = std::make_unique<backend::controlflow::UserTensor>(
        operand.info(),
        ir::Layout::NHWC /* FIXME find op_seq for this operand and use frontend_layout */
    );

    // Add tensor to controlflow TensorRegistry.
    cf_tensor_reg->setNativeUserTensor(ind, std::move(tensor));
    auto *itensor = cf_tensor_reg->getITensor(ind);
    ret.push_back(itensor);
  }
  return ret;
}

void ExecutorFactory::prepareMigrantTensors(compiler::LoweredGraph &lowered_graph)
{
  TensorRegistries tensor_regs{lowered_graph.backend_contexts(), true};

  lowered_graph.op_seqs().iterate(
      [&](const ir::OpSequenceIndex &op_seq_index, const ir::OpSequence &op_seq) {
        auto lower_info = lowered_graph.getLowerInfo(op_seq_index);
        auto &backend_ctx = lowered_graph.backend_contexts().at(lower_info->backend());
        for (auto ind : (op_seq.getInputs() + op_seq.getOutputs()) | ir::Remove::DUPLICATED |
                            ir::Remove::UNDEFINED)
        {
          // If an OpSequence input/output tensor does not have its own tensor object,
          // it must be using migrant tensors, so find the tensor from other tensor builders and
          // set the tensor to this tensor builder if portable
          if (!backend_ctx->tensor_registry->getITensor(ind))
          {
            auto tensor = tensor_regs.getITensor(ind);
            assert(tensor); // The tensor must have been registered
            auto ptensor = dynamic_cast<backend::IPortableTensor *>(tensor);
            if (ptensor)
              backend_ctx->tensor_registry->setMigrantTensor(ind, ptensor);
          }
        }
      });
}

exec::IExecutor *
ExecutorFactory::createLinearExecutor(std::unique_ptr<compiler::LoweredGraph> lowered_graph,
                                      const compiler::CompilerOptions &options,
                                      const std::shared_ptr<exec::ExecutorMap> &executor_map)
{
  const auto &backend_contexts = lowered_graph->backend_contexts();

  initializeBackendContext(lowered_graph.get());

  // linearize
  assert(!lowered_graph->graph().isBuildingPhase());

  /*************************************************
   * Backend dependent analysis & optimization phase
   *************************************************/

  for (auto &pair : backend_contexts)
  {
    auto &optimizer = pair.second->optimizer;
    if (optimizer)
      optimizer->optimize();
  }

  /**********************************************************
   * Backend dependent analysis & optimization phase finished
   **********************************************************/

  /***********************
   * Code generation phase
   ***********************/

  auto order = Linear::linearize(*lowered_graph);
  runTensorRegistration(lowered_graph.get(), order);

  std::vector<backend::ITensor *> input_tensors;
  std::vector<backend::ITensor *> output_tensors;
  if (options.is_primary_subgraph)
  {
    input_tensors = initializeModelIOTensors(*lowered_graph, lowered_graph->graph().getInputs());
    output_tensors = initializeModelIOTensors(*lowered_graph, lowered_graph->graph().getOutputs());
  }

  Linear::dump(*lowered_graph, order);
  Linear::planTensors(*lowered_graph, order);

  TensorBuilders tensor_builders{lowered_graph->backend_contexts(), true};
  TensorRegistries tensor_regs{lowered_graph->backend_contexts(), true};

  for (auto &tensor_builder : tensor_builders)
  {
    tensor_builder->prepare();
  }

  prepareMigrantTensors(*lowered_graph);

  ExecutionBuilder builder;

  // Generate kernels
  lowered_graph->iterateTopolOpSeqs([&](const ir::OpSequenceIndex &op_seq_index,
                                        const ir::OpSequence &op_seq) {
    auto lower_info = lowered_graph->getLowerInfo(op_seq_index);
    auto kernel_gen = lowered_graph->backend_contexts().at(lower_info->backend())->kernel_gen;
    // Pass the tensor registries and ExecutorMap to the control flow backend's kernel generator
    auto cf_kernel_gen = dynamic_cast<backend::controlflow::KernelGenerator *>(kernel_gen.get());
    if (cf_kernel_gen != nullptr)
    {
      cf_kernel_gen->setTensorRegistries(tensor_regs);
      cf_kernel_gen->setExecutorMap(executor_map);
    }
    auto fn_seq = kernel_gen->generate(op_seq);
    if (options.he_profiling_mode)
    {
      fn_seq->wrap<SyncFunction>(lower_info->backend()->config());
    }
    builder.append(op_seq_index, {&op_seq, lower_info, std::move(fn_seq)});
  });

  for (auto &tensor_builder : tensor_builders)
  {
    tensor_builder->allocate();
  }

  for (auto &pair : backend_contexts)
  {
    pair.second->initConsts();
  }

  lowered_graph->graph().operands().iterate(
      [](const ir::OperandIndex &, ir::Operand &obj) { obj.releaseData(); });

  auto code_map = builder.releaseCodeMap();

  for (auto &it : code_map)
  {
    auto op_seq_index = it.first;
    auto &fn_seq = it.second.fn_seq;

    fn_seq->iterate([&](exec::IFunction &ifunc) {
      ifunc.prepare();
      auto backend = lowered_graph->getLowerInfo(op_seq_index)->backend();
      auto tensor_builder = lowered_graph->backend_contexts().at(backend)->tensor_builder;
      tensor_builder->postFunctionPrepare();
    });
  }

  auto exec =
      new exec::LinearExecutor{std::move(lowered_graph), input_tensors, output_tensors, tensor_regs,
                               std::move(code_map), order};

  if (!options.trace_filepath.empty())
  {
    std::unique_ptr<exec::IExecutionObserver> ctp =
        std::make_unique<exec::ChromeTracingObserver>(options.trace_filepath, exec->graph());
    exec->addObserver(std::move(ctp));
  }

  return exec;
}

exec::IExecutor *ExecutorFactory::createDataflowExecutor(
    std::unique_ptr<compiler::LoweredGraph> lowered_graph, const compiler::CompilerOptions &options,
    const std::shared_ptr<exec::ExecutorMap> &executor_map, bool parallel)
{
  const auto &backend_contexts = lowered_graph->backend_contexts();

  initializeBackendContext(lowered_graph.get());

  auto order = Linear::linearize(*lowered_graph);
  runTensorRegistration(lowered_graph.get(), order);

  std::vector<backend::ITensor *> input_tensors;
  std::vector<backend::ITensor *> output_tensors;
  if (options.is_primary_subgraph)
  {
    input_tensors = initializeModelIOTensors(*lowered_graph, lowered_graph->graph().getInputs());
    output_tensors = initializeModelIOTensors(*lowered_graph, lowered_graph->graph().getOutputs());
  }

  TensorBuilders tensor_builders{lowered_graph->backend_contexts(), true};
  TensorRegistries tensor_regs{lowered_graph->backend_contexts(), true};

  // Workaround to use the static memory planner: mark every registered tensor as
  // used from the start so that it is never deallocated
  for (auto &tensor_builder : tensor_builders)
  {
    lowered_graph->graph().operands().iterate(
        [&](const ir::OperandIndex &ind, const ir::Operand &) {
          if (tensor_builder->isRegistered(ind))
          {
            tensor_builder->notifyFirstUse(ind);
          }
        });
  }

  for (auto &tensor_builder : tensor_builders)
  {
    tensor_builder->prepare();
  }

  prepareMigrantTensors(*lowered_graph);

  ExecutionBuilder builder;

  // Generate kernels
  lowered_graph->iterateTopolOpSeqs([&](const ir::OpSequenceIndex &op_seq_index,
                                        const ir::OpSequence &op_seq) {
    auto lower_info = lowered_graph->getLowerInfo(op_seq_index);
    auto kernel_gen = lowered_graph->backend_contexts().at(lower_info->backend())->kernel_gen;
    // Pass the tensor registries and ExecutorMap to the control flow backend's kernel generator
    auto cf_kernel_gen = dynamic_cast<backend::controlflow::KernelGenerator *>(kernel_gen.get());
    if (cf_kernel_gen != nullptr)
    {
      cf_kernel_gen->setTensorRegistries(tensor_regs);
      cf_kernel_gen->setExecutorMap(executor_map);
    }
    auto fn_seq = kernel_gen->generate(op_seq);
    if (options.he_profiling_mode)
    {
      fn_seq->wrap<SyncFunction>(lower_info->backend()->config());
    }
    builder.append(op_seq_index, {&op_seq, lower_info, std::move(fn_seq)});
  });

  for (const auto &tensor_builder : tensor_builders)
  {
    tensor_builder->allocate();
  }

  for (auto &pair : backend_contexts)
  {
    pair.second->initConsts();
  }

  lowered_graph->graph().operands().iterate(
      [](const ir::OperandIndex &, ir::Operand &obj) { obj.releaseData(); });

  auto code_map = builder.releaseCodeMap();

  for (auto &it : code_map)
  {
    auto op_seq_index = it.first;
    auto &fn_seq = it.second.fn_seq;

    fn_seq->iterate([&](exec::IFunction &ifunc) {
      ifunc.prepare();
      auto backend = lowered_graph->getLowerInfo(op_seq_index)->backend();
      auto tensor_builder = lowered_graph->backend_contexts().at(backend)->tensor_builder;
      tensor_builder->postFunctionPrepare();
    });
  }

  exec::ExecutorBase *exec = nullptr;
  if (parallel)
  {
    exec = new exec::ParallelExecutor{std::move(lowered_graph), input_tensors, output_tensors,
                                      tensor_regs, std::move(code_map)};
  }
  else
  {
    auto dataflow_exec = new exec::DataflowExecutor{
        std::move(lowered_graph), input_tensors, output_tensors, tensor_regs, std::move(code_map)};
    if (options.he_profiling_mode)
    {
      std::vector<const backend::Backend *> backends;
      for (const auto &pair : backend_contexts)
      {
        backends.push_back(pair.first);
      }
      auto et = std::make_shared<exec::ExecTime>(backends);
      std::unique_ptr<exec::IExecutionObserver> obs =
          std::make_unique<exec::ProfileObserver>(et, dataflow_exec->graph());
      dataflow_exec->addObserver(std::move(obs));
    }
    exec = dataflow_exec;
  }

  if (!options.trace_filepath.empty())
  {
    std::unique_ptr<exec::IExecutionObserver> ctp =
        std::make_unique<exec::ChromeTracingObserver>(options.trace_filepath, exec->graph());
    exec->addObserver(std::move(ctp));
  }

  return exec;
}

} // namespace compiler
} // namespace onert
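
The anonymous-namespace SyncFunction above is a decorator: when options.he_profiling_mode is set, each generated function sequence is wrapped via fn_seq->wrap<SyncFunction>(...) so that every kernel's run() is followed by config->sync(), which makes per-kernel wall-clock timing meaningful on backends that enqueue work asynchronously. The following is a minimal self-contained sketch of that pattern; IFunction, IConfig, AddOp, and QueueConfig here are hypothetical stand-ins, not onert's actual headers.

#include <cassert>
#include <iostream>
#include <memory>

// Hypothetical stand-ins for onert's exec::IFunction and backend::IConfig.
struct IFunction
{
  virtual ~IFunction() = default;
  virtual void run() = 0;
  virtual void prepare() {}
};

struct IConfig
{
  virtual ~IConfig() = default;
  virtual void sync() = 0; // e.g. block until the device command queue drains
};

// The decorator: run the wrapped function, then force the backend to finish,
// so time measured around run() covers the kernel itself.
class SyncFunction final : public IFunction
{
public:
  SyncFunction(std::unique_ptr<IFunction> fn, std::shared_ptr<IConfig> config)
      : _fn{std::move(fn)}, _config{std::move(config)}
  {
    assert(_fn && _config);
  }

  void run() override
  {
    _fn->run();
    _config->sync();
  }

  void prepare() override { _fn->prepare(); }

private:
  std::unique_ptr<IFunction> _fn;
  std::shared_ptr<IConfig> _config;
};

struct AddOp final : IFunction
{
  void run() override { std::cout << "kernel\n"; }
};

struct QueueConfig final : IConfig
{
  void sync() override { std::cout << "sync\n"; }
};

int main()
{
  std::unique_ptr<IFunction> fn =
      std::make_unique<SyncFunction>(std::make_unique<AddOp>(), std::make_shared<QueueConfig>());
  fn->run(); // prints "kernel" then "sync"
}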
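
ExecutorFactory's constructor registers all three executor kinds in _map, keyed by the CompilerOptions::executor string, and reuses createDataflowExecutor for both "Dataflow" and "Parallel" by binding its trailing parallel flag with std::bind. Below is a self-contained sketch of that registration idiom with toy Options and Executor types standing in for the real signatures; a lambda alternative to std::bind is shown in a comment.

#include <functional>
#include <iostream>
#include <string>
#include <unordered_map>

// Toy stand-ins for the factory's real parameter and result types.
struct Options
{
  std::string executor;
};

struct Executor
{
  std::string kind;
};

Executor *createLinear(const Options &) { return new Executor{"Linear"}; }
Executor *createDataflow(const Options &, bool parallel)
{
  return new Executor{parallel ? "Parallel" : "Dataflow"};
}

int main()
{
  std::unordered_map<std::string, std::function<Executor *(const Options &)>> map;

  map["Linear"] = createLinear;
  // std::bind fixes the trailing bool, as the constructor above does:
  map["Dataflow"] = std::bind(createDataflow, std::placeholders::_1, false);
  map["Parallel"] = std::bind(createDataflow, std::placeholders::_1, true);
  // A lambda expresses the same binding more directly:
  // map["Parallel"] = [](const Options &o) { return createDataflow(o, true); };

  Options options{"Parallel"};
  Executor *exec = map.at(options.executor)(options); // at() throws on an unknown name
  std::cout << exec->kind << "\n";
  delete exec;
}

As in the real factory, looking the name up with map.at() makes an unsupported executor string fail fast with std::out_of_range rather than silently falling back to a default.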
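
The dataflow path calls notifyFirstUse for every registered operand but never notifyLastUse, so a static memory planner sees every tensor as live until the end of execution and never releases or reuses its buffer; that is what the "never deallocated" workaround comment relies on. A toy liveness sketch of the effect follows; ToyPlanner is hypothetical and far simpler than onert's actual planners.

#include <cstddef>
#include <iostream>
#include <map>

// Hypothetical planner: a tensor's buffer is reusable only after its reported
// last use; with no last use reported, the tensor stays live forever.
class ToyPlanner
{
public:
  void notifyFirstUse(int tensor) { _live[tensor] = true; }
  void notifyLastUse(int tensor) { _live[tensor] = false; } // buffer may now be reused

  std::size_t liveCount() const
  {
    std::size_t n = 0;
    for (const auto &p : _live)
      n += p.second ? 1 : 0;
    return n;
  }

private:
  std::map<int, bool> _live;
};

int main()
{
  ToyPlanner planner;
  for (int t = 0; t < 4; ++t)
    planner.notifyFirstUse(t); // as the dataflow path does for all registered operands

  // No notifyLastUse calls: every tensor stays live, so the planner can never
  // deallocate or alias a buffer while kernels execute out of order.
  std::cout << planner.liveCount() << " tensors permanently live\n";
}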