Diffstat (limited to 'runtime/onert/core/src/compiler/ExecutorFactory.cc')
-rw-r--r-- | runtime/onert/core/src/compiler/ExecutorFactory.cc | 1037 |
1 file changed, 709 insertions, 328 deletions
diff --git a/runtime/onert/core/src/compiler/ExecutorFactory.cc b/runtime/onert/core/src/compiler/ExecutorFactory.cc
index 062c6c9c3..6a08524cc 100644
--- a/runtime/onert/core/src/compiler/ExecutorFactory.cc
+++ b/runtime/onert/core/src/compiler/ExecutorFactory.cc
@@ -16,26 +16,37 @@
 #include "ExecutorFactory.h"
 
+#include "Linear.h"
+#include "../backend/builtin/BackendContext.h"
+#include "../backend/builtin/Config.h"
+#include "../backend/builtin/UserTensor.h"
+#include "../dumper/text/GraphDumper.h"
+#include "../exec/DataflowExecutor.h"
+#include "../exec/ExecTime.h"
+#include "../exec/ExecutionObservers.h"
+#include "../exec/LinearExecutor.h"
+#ifdef MINMAX_H5DUMPER
+#include "../exec/MinMaxRecorder.h"
+#endif
+#include "../exec/ParallelExecutor.h"
+#include "../ir/OperationCloner.h"
+
+#include <backend/IPortableTensor.h>
+#include <compiler/BackendManager.h>
+#include <compiler/ExecutionBuilder.h>
+#include <util/TracingCtx.h>
+
 #include <functional>
-#include "exec/ExecutionObservers.h"
-#include "exec/LinearExecutor.h"
-#include "exec/DataflowExecutor.h"
-#include "exec/ParallelExecutor.h"
-#include "compiler/BackendManager.h"
-#include "compiler/ExecutionBuilder.h"
-#include "exec/ExecTime.h"
-#include "compiler/Linear.h"
-#include "compiler/TensorBuilders.h"
-#include "backend/IConstantInitializer.h"
-#include "backend/IKernelGenerator.h"
-#include "backend/IOptimizer.h"
-#include "backend/ITensorRegister.h"
-#include "backend/controlflow/Config.h"
-#include "backend/controlflow/KernelGenerator.h"
-#include "backend/controlflow/UserTensor.h"
-#include "backend/controlflow/TensorBuilder.h"
 #include <memory>
+#ifdef ONERT_TRAIN
+#include "../backend/builtin/train/BackendContext.h"
+#include "../exec/train/TrainableExecutor.h"
+
+#include <backend/train/TrainableBackendContext.h>
+#include <backend/train/ITrainableBackend.h>
+#endif // ONERT_TRAIN
+
 namespace onert
 {
 namespace
@@ -46,7 +57,7 @@ class SyncFunction final : public exec::IFunction
 public:
   virtual ~SyncFunction() = default;
   SyncFunction(std::unique_ptr<exec::IFunction> fn, const std::shared_ptr<backend::IConfig> config)
-  : _fn{std::move(fn)}, _config{config}
+    : _fn{std::move(fn)}, _config{config}
   {
     assert(_fn);
     assert(_config);
@@ -65,21 +76,218 @@ private:
   std::shared_ptr<backend::IConfig> _config;
 };
 
-// TODO Think of a better way to manage TensorManagers
-backend::TensorManagerSet createTensorManagerSet(const compiler::TensorBuilders &tensor_builders)
+using DeallocList = std::vector<backend::ITensor *>;
+// Deallocation after execution of an operation used by Linear Executor
+class DeallocFunction final : public exec::IFunction
+{
+public:
+  DeallocFunction(const DeallocList &tensors) : _dealloc_list{tensors} {}
+
+  void run() override
+  {
+    for (auto &&tensor : _dealloc_list)
+    {
+      if (!tensor->is_dynamic())
+        continue;
+      tensor->deallocBuffer();
+    }
+  }
+
+private:
+  DeallocList _dealloc_list;
+};
+
+// TODO Unify initializeSubgraphIOTensors
+void initializeSubgraphIOTensors(compiler::ILoweredGraph &lowered_graph,
+                                 const backend::BackendContexts &backend_contexts,
+                                 const ir::OperandIndexSequence &indices)
+{
+  // TODO Store builtin backend in BackendContext
+  std::shared_ptr<backend::builtin::TensorRegistry> builtin_tensor_reg;
+  for (const auto &e : backend_contexts)
+  {
+    auto backend = e.first;
+    auto &context = e.second;
+    if (backend->config()->id() == backend::builtin::Config::ID)
+    {
+      builtin_tensor_reg =
+        std::dynamic_pointer_cast<backend::builtin::TensorRegistry>(context->tensor_registry);
+    }
+  }
+  assert(builtin_tensor_reg);
+
+  for (auto &&ind : indices)
+  {
+    const auto &operand = lowered_graph.graph().operands().at(ind);
+    auto tensor = std::make_unique<backend::builtin::IOTensor>(
+      operand.info(),
+      ir::Layout::NHWC /* FIXME find operation for this operand and use frontend_layout */
+    );
+
+    // Add tensor to builtin TensorRegistry.
+    builtin_tensor_reg->setNativeIOTensor(ind, std::move(tensor));
+  }
+}
+
+#ifdef ONERT_TRAIN
+void initializeSubgraphIOTensors(compiler::ILoweredGraph &lowered_graph,
+                                 const backend::train::TrainableBackendContexts &backend_contexts,
+                                 const ir::OperandIndexSequence &indices)
 {
-  backend::TensorManagerSet tensor_mgrs;
-  for (auto &tensor_builder : tensor_builders)
+  std::shared_ptr<backend::builtin::train::TensorRegistry> builtin_tensor_reg;
+  for (const auto &e : backend_contexts)
   {
-    auto s_tensor_manager = tensor_builder->releaseStaticTensorManager();
-    if (s_tensor_manager != nullptr)
-      tensor_mgrs.insert(std::move(s_tensor_manager));
+    auto backend = e.first;
+    auto &context = e.second;
+    if (backend->config()->id() == backend::builtin::Config::ID)
+    {
+      builtin_tensor_reg = std::dynamic_pointer_cast<backend::builtin::train::TensorRegistry>(
+        context->tensor_registry());
+    }
+  }
+  assert(builtin_tensor_reg);
+
+  for (auto &&ind : indices)
+  {
+    const auto &operand = lowered_graph.graph().operands().at(ind);
+    auto tensor = std::make_unique<backend::builtin::IOTensor>(
+      operand.info(),
+      ir::Layout::NHWC /* FIXME find operation for this operand and use frontend_layout */
+    );
+
+    // Add tensor to builtin TensorRegistry.
+    builtin_tensor_reg->setNativeIOTensor(ind, std::move(tensor));
+  }
+}
+#endif // ONERT_TRAIN
+
+backend::BackendContexts
+createBackendContexts(compiler::ILoweredGraph &lgraph, bool linear_executor,
+                      std::shared_ptr<backend::custom::IKernelBuilder> custom_kernel_builder)
+{
+  backend::BackendContexts contexts;
+  auto &backend_manager = compiler::BackendManager::get();
+
+  std::unordered_map<const backend::Backend *, backend::ContextData> context_data_map;
+
+  // Generate partial graphs for each backend
+  for (auto &&backend : backend_manager.getAll())
+  {
+    auto &data = context_data_map[backend];
+    auto graph = std::make_unique<ir::Graph>();
+    graph->setLayout(lgraph.graph().layout());
+    data.graph = std::move(graph);
+  }
+
+  auto &whole_graph = lgraph.graph();
+  // Separate operands into partial graphs
+  whole_graph.operands().iterate([&](const ir::OperandIndex &operand_ind, ir::Operand &operand) {
+    auto &operand_li = lgraph.lower_info().operand;
+    const auto &def_factors = operand_li.at(operand_ind).def_factors();
+    if (def_factors.size() == 0) // Ignore unused tensor
+      return;
+    const auto &def_factor = def_factors.getOnlyElement();
+    const auto backend = def_factor.backend();
+    auto &partial_graph = *context_data_map[backend].graph;
+    auto &operand_layouts = context_data_map[backend].operand_layouts;
+    assert(operand_layouts.find(operand_ind) == operand_layouts.end());
+    operand_layouts[operand_ind] = def_factor.layout();
+
+    // Copy the operand and insert it to the partial graph
+    auto new_operand = std::make_unique<ir::Operand>(operand);
+    new_operand->clearDefUse();
+    operand.releaseData(); // Deref data of LoweredGraph
+    auto new_operand_ind = partial_graph.addOperand(operand_ind, std::move(new_operand));
+    UNUSED_RELEASE(new_operand_ind);
+    assert(new_operand_ind == operand_ind);
+  });
+  // Separate operations into partial graphs
+  whole_graph.operations().iterate(
+    [&](const ir::OperationIndex &op_ind, const ir::IOperation &operation) {
+      auto &op_li = lgraph.lower_info().operation;
+      auto backend = op_li.at(op_ind).backend();
+      auto &partial_graph = *context_data_map[backend].graph;
+      auto &external_operands = context_data_map[backend].external_operands;
+      auto &operand_layouts = context_data_map[backend].operand_layouts;
+
+      {
+        // Add missing operands (externals)
+        auto io_list = (operation.getInputs() + operation.getOutputs()) | ir::Remove::DUPLICATED |
+                       ir::Remove::UNDEFINED;
+        for (auto &&operand_ind : io_list)
+        {
+          if (partial_graph.operands().exist(operand_ind))
+            continue;
+
+          // Copy the operand and insert it to the partial graph
+          const auto &operand = whole_graph.operands().at(operand_ind);
+          auto new_operand = std::make_unique<ir::Operand>(operand);
+          new_operand->clearDefUse();
+          auto new_operand_ind = partial_graph.addOperand(operand_ind, std::move(new_operand));
+          UNUSED_RELEASE(new_operand_ind);
+          assert(new_operand_ind == operand_ind);
+
+          auto layout =
+            lgraph.lower_info().operand.at(operand_ind).def_factors().getOnlyElement().layout();
+          assert(operand_layouts.find(operand_ind) == operand_layouts.end());
+          operand_layouts[operand_ind] = layout;
+          external_operands.add(operand_ind);
+        }
+
+        auto new_op_ind = partial_graph.addOperation(op_ind, clone(operation));
+        UNUSED_RELEASE(new_op_ind);
+        assert(new_op_ind == op_ind);
+      }
+    });
+
+  // Create contexts
+  auto whole_op_order = lgraph.graph().topolSortOperations();
+  for (auto &&pair : context_data_map)
+  {
+    auto backend = pair.first;
+    auto &data = pair.second;
+    // Handle graph input/outputs or external tensors
+    data.graph->operands().iterate([&](const ir::OperandIndex &ind, const ir::Operand &operand) {
+      if (whole_graph.getInputs().contains(ind) || whole_graph.getOutputs().contains(ind))
+        data.external_operands.add(ind);
+      // Inputs are either "graph input" or "no def op and non-constant"
+      if (whole_graph.getInputs().contains(ind) ||
+          (!operand.getDef().valid() && !operand.isConstant()))
+        data.graph->addInput(ind);
+      // Outputs are either "graph output" or "no uses"
+      if (whole_graph.getOutputs().contains(ind) || operand.getUses().size() == 0)
+        data.graph->addOutput(ind);
+    });
+    dumper::text::dumpGraph(*data.graph);
+
+    std::copy_if(whole_op_order.begin(), whole_op_order.end(), std::back_inserter(data.op_order),
+                 [&](const auto &ind) { return data.graph->operations().exist(ind); });
+    data.is_linear_executor = linear_executor;
+    data.custom_kernel_builder = custom_kernel_builder;
+    contexts.emplace(backend, backend->newContext(std::move(data)));
+  }
+  return contexts;
+}
+
+template <typename Context>
+std::deque<std::pair<const backend::Backend *, Context *>> orderBackendContext(
+  const std::unordered_map<const backend::Backend *, std::unique_ptr<Context>> &tbackend_contexts)
+{
+  std::deque<std::pair<const backend::Backend *, Context *>> ordered_contexts;
 
-    auto d_tensor_manager = tensor_builder->releaseDynamicTensorManager();
-    if (d_tensor_manager != nullptr)
-      tensor_mgrs.insert(std::move(d_tensor_manager));
+  for (auto &&pair : tbackend_contexts)
+  {
+    // NOTE builtin backend must be processed last.
+    // This is because of Permute layer's specialty: it is the only operation that could have
+    // different ITensor objects for the input and the output, and it requires that all other
+    // backends' tensors be ready to use.
+    if (pair.first->config()->id() == "builtin")
+      ordered_contexts.emplace_back(pair.first, pair.second.get());
+    else
+      ordered_contexts.emplace_front(pair.first, pair.second.get());
   }
-  return tensor_mgrs;
+
+  return ordered_contexts;
 }
 
 } // namespace
@@ -106,415 +314,588 @@ ExecutorFactory::ExecutorFactory()
 }
 
 exec::IExecutor *ExecutorFactory::create(std::unique_ptr<compiler::LoweredGraph> lowered_graph,
-                                         const compiler::CompilerOptions &options,
-                                         const std::shared_ptr<exec::ExecutorMap> &executor_map)
+                                         const std::shared_ptr<exec::IExecutors> &executors,
+                                         const ExecutorFactoryArgs &args)
 {
-  return _map.at(options.executor)(std::move(lowered_graph), options, executor_map);
+  assert(args.options != nullptr);
+  return _map.at(args.options->executor)(std::move(lowered_graph), executors, args);
 }
 
-void ExecutorFactory::initializeBackendContext(compiler::LoweredGraph *lowered_graph)
+void ExecutorFactory::prepareMigrantTensors(compiler::ILoweredGraph &lowered_graph,
+                                            const backend::BackendContexts &backend_contexts)
 {
-  struct Entry
-  {
-    std::vector<backend::BackendContext::OperationInfo> operation_list;
-    std::vector<ir::OperandIndex> operand_list;
-  };
-  std::unordered_map<const backend::Backend *, Entry> backend_assets;
-
-  // Build lists for operations
-  lowered_graph->op_seqs().iterate(
-    [&](const ir::OpSequenceIndex &op_seq_index, const ir::OpSequence &op_seq) {
-      auto &op_seq_li = lowered_graph->getLowerInfo()->op_seq;
-      auto backend = op_seq_li.at(op_seq_index)->backend();
-      for (auto &operation_idx : op_seq.operations())
+  TensorRegistries tensor_regs{backend_contexts, true};
+
+  lowered_graph.graph().operations().iterate(
+    [&](const ir::OperationIndex &op_ind, const ir::IOperation &op) {
+      auto lower_info = lowered_graph.lower_info().operation.getRawPtr(op_ind);
+      auto &backend_ctx = backend_contexts.at(lower_info->backend());
+      for (auto &&ind :
+           (op.getInputs() + op.getOutputs()) | ir::Remove::DUPLICATED | ir::Remove::UNDEFINED)
+      {
+        // If an Operation's input/output tensor does not have its own tensor object,
+        // it must be using migrant tensors, so find the tensor from other tensor registries and
+        // register it to the current tensor registry if it is portable
+        if (!backend_ctx->tensor_registry->getITensor(ind))
         {
-          backend_assets[backend].operation_list.emplace_back(operation_idx, op_seq.getLayout());
+          auto tensor = tensor_regs.getITensor(ind);
+          assert(tensor); // The tensor must have been registered
+          auto ptensor = dynamic_cast<backend::IPortableTensor *>(tensor);
+          if (ptensor)
+            backend_ctx->tensor_registry->setMigrantTensor(ind, ptensor);
         }
-      });
+      }
+    });
+}
 
-  // Build lists for operands
-  lowered_graph->graph().operands().iterate([&](const ir::OperandIndex &ind, const ir::Operand &) {
-    const auto lower_info = lowered_graph->getLowerInfo(ind);
-    for (auto factor : lower_info->def_factors())
+void ExecutorFactory::prepareBuiltinBackend(const TensorRegistries &tensor_regs,
+                                            const std::shared_ptr<exec::IExecutors> &executors,
+                                            const backend::BackendContexts &backend_contexts,
+                                            const ir::ModelIndex &index)
+{
+  for (auto &&pair : backend_contexts)
+  {
+    auto builtin_context = dynamic_cast<backend::builtin::BackendContext *>(pair.second.get());
+    if (builtin_context != nullptr)
     {
-      auto backend = factor.backend();
-      backend_assets[backend].operand_list.emplace_back(ind);
+      auto builtin_kernel_gen = builtin_context->kernel_gen;
+      builtin_kernel_gen->setTensorRegistries(tensor_regs);
+      builtin_kernel_gen->setExecutors(executors);
+      builtin_kernel_gen->setModelIndex(index);
     }
-  });
+  }
+}
 
-  for (auto &pair : backend_assets)
+std::deque<std::pair<const backend::Backend *, backend::BackendContext *>>
+ExecutorFactory::orderBackendContext(const backend::BackendContexts &backend_contexts)
+{
+  std::deque<std::pair<const backend::Backend *, backend::BackendContext *>> ordered_contexts;
+  for (auto &&pair : backend_contexts)
   {
-    auto backend = pair.first;
-    auto &arg = pair.second;
-    lowered_graph->backend_contexts().at(backend)->initialize(arg.operation_list, arg.operand_list);
+    // NOTE builtin backend must be processed last.
+    // This is because of Permute layer's specialty: it is the only operation that could have
+    // different ITensor objects for the input and the output, and it requires that all other
+    // backends' tensors be ready to use.
+    if (pair.first->config()->id() == "builtin")
+      ordered_contexts.emplace_back(pair.first, pair.second.get());
+    else
+      ordered_contexts.emplace_front(pair.first, pair.second.get());
   }
+  return ordered_contexts;
 }
 
-void ExecutorFactory::runTensorRegistration(compiler::LoweredGraph *lowered_graph,
-                                            const std::vector<ir::OpSequenceIndex> &order)
+exec::IExecutor *
+ExecutorFactory::createLinearExecutor(std::unique_ptr<compiler::LoweredGraph> lowered_graph,
+                                      const std::shared_ptr<exec::IExecutors> &executors,
+                                      const ExecutorFactoryArgs &args)
 {
-  for (const auto index : order)
+  const auto options = args.options;
+  const auto &model_index = args.model_index;
+  const auto tracing_ctx = args.tracing_ctx;
+  auto custom_kernel_builder = args.custom_kernel_builder;
+  auto &graph = lowered_graph->graph();
+
+  backend::BackendContexts backend_contexts =
+    createBackendContexts(*lowered_graph, options->executor == "Linear", custom_kernel_builder);
+
+  TensorRegistries tensor_regs{backend_contexts, true};
+
+  initializeSubgraphIOTensors(
+    *lowered_graph, backend_contexts,
+    (lowered_graph->graph().getInputs() + lowered_graph->graph().getOutputs()) |
+      ir::Remove::DUPLICATED | ir::Remove::UNDEFINED);
+
+  // linearize
+  auto order = Linear::linearize(*lowered_graph);
+  Linear::dump(*lowered_graph, order);
+
+  for (auto &&pair : backend_contexts)
   {
-    const auto &op_seq = lowered_graph->op_seqs().at(index);
-    const auto backend = lowered_graph->getLowerInfo(index)->backend();
-    const auto tensor_register = lowered_graph->backend_contexts().at(backend)->tensor_register;
-    auto tensor_builder = lowered_graph->backend_contexts().at(backend)->tensor_builder;
-    auto model_io = lowered_graph->graph().getInputs() + lowered_graph->graph().getOutputs();
+    pair.second->genTensors();
+  }
+
+  prepareMigrantTensors(*lowered_graph, backend_contexts);
 
-    if (tensor_register)
+  // Give some runtime objects to builtin KernelGenerator
+  prepareBuiltinBackend(tensor_regs, executors, backend_contexts, model_index);
+
+  ExecutionBuilder builder;
+
+  // Adjust the order of backends for the upcoming iteration
+  auto ordered_contexts = orderBackendContext(backend_contexts);
+
+  // Simulate the execution for deallocation of tensors
+  std::unordered_map<ir::OperationIndex, DeallocList> dealloc_list_map;
+  {
+    ir::OperandIndexMap<uint32_t> uses_map;
+    ir::OperandIndexSequence constants;
+
+    auto model_io =
+      (graph.getInputs() + graph.getOutputs()) | ir::Remove::UNDEFINED | ir::Remove::DUPLICATED;
+
+    // Prepare scanning
+    graph.operands().iterate([&](const ir::OperandIndex &ind, const ir::Operand &obj) {
+      uses_map[ind] = obj.getUses().size();
+
+      if (obj.isConstant())
+        constants.append(ind);
+    });
+
+    // A trick to consider constants as an exception
+    for (const auto &ind : constants)
     {
-      // Custom registration
-      tensor_register->registerTensors(op_seq, lowered_graph->getLowerInfo());
+      uses_map[ind]++;
     }
-    else
+
+    for (const auto &op_ind : order)
     {
-      // Default registration
-      for (const auto op_idx : op_seq)
+      const auto &op = graph.operations().at(op_ind);
+      auto op_inputs = op.getInputs() | ir::Remove::DUPLICATED | ir::Remove::UNDEFINED;
+      auto op_outputs = op.getOutputs() | ir::Remove::DUPLICATED | ir::Remove::UNDEFINED;
+
+      for (const auto &ind : op_inputs)
       {
-        const auto &op = lowered_graph->graph().operations().at(op_idx);
-        for (const auto &index : (op.getInputs() | ir::Remove::UNDEFINED) + op.getOutputs())
+        const auto &operand = graph.operands().at(ind);
+        assert(uses_map.find(ind) != uses_map.end());
+        assert(uses_map[ind] > 0);
+        uses_map[ind]--;
+        if (uses_map[ind] == 0 && !operand.info().isVariable() && !model_io.contains(ind))
         {
-          if (!tensor_builder->isRegistered(index) && !model_io.contains(index))
-          {
-            const auto &operand_lower_info =
-              lowered_graph->getLowerInfo(index)->def_factors().getOnlyElement();
-
-            // E.g., permute (CPU) -> tensor A -> MaxPool2D(acl_cl)
-            // op.getOutputs() of permute (CPU) returns tensor A
-            // but tensor A belongs to the backend of acl_cl.
-            // So, we have to make this tensor NOT registered for CPU.
-            if (operand_lower_info.backend() != backend)
-              continue;
-
-            const auto &obj = lowered_graph->graph().operands().at(index);
-            const auto frontend_layout = op_seq.getLayout();
-            const auto backend_layout = operand_lower_info.layout();
-            ir::OperandInfo backend_info{permuteShape(obj.shape(), frontend_layout, backend_layout),
-                                         obj.typeInfo(), obj.info().memAllocType(),
-                                         obj.isConstant()};
-            tensor_builder->registerTensorInfo(index, backend_info, backend_layout);
-          }
+          dealloc_list_map[op_ind].emplace_back(tensor_regs.getITensor(ind));
         }
       }
     }
-  }
-}
 
-std::vector<std::shared_ptr<backend::ITensor>>
-ExecutorFactory::initializeModelIOTensors(compiler::LoweredGraph &lowered_graph,
-                                          const ir::OperandIndexSequence &indices)
-{
-  std::vector<std::shared_ptr<backend::ITensor>> ret;
+    // Dispose and validate
+    for (const auto &ind : constants)
+    {
+      --uses_map[ind];
+    }
 
-  // TODO Store controlflow backend in BackendContext
-  std::shared_ptr<backend::controlflow::TensorBuilder> cf_tensor_builder;
-  std::shared_ptr<backend::controlflow::TensorRegistry> cf_tensor_reg;
-  for (const auto &e : lowered_graph.backend_contexts())
+    assert(
+      std::all_of(uses_map.begin(), uses_map.end(),
+                  [](std::pair<const ir::OperandIndex, uint32_t> it) { return it.second == 0; }));
+  }
+
+  // Generate kernels
+  for (auto &&pair : ordered_contexts)
   {
-    auto backend = e.first;
-    auto &context = e.second;
-    if (backend->config()->id() == backend::controlflow::Config::ID)
+    auto codes = pair.second->genKernels();
+    for (auto &&pair : codes)
     {
-      cf_tensor_builder =
-        std::dynamic_pointer_cast<backend::controlflow::TensorBuilder>(context->tensor_builder);
-      cf_tensor_reg =
-        std::dynamic_pointer_cast<backend::controlflow::TensorRegistry>(context->tensor_registry);
+      auto &op_ind = pair.first;
+      auto &fn_seq = pair.second;
+      auto &op = lowered_graph->graph().operations().at(op_ind);
+      auto lower_info = lowered_graph->lower_info().operation.getRawPtr(op_ind);
+      if (options->he_profiling_mode)
+        fn_seq->wrap<SyncFunction>(lower_info->backend()->config());
+      if (!dealloc_list_map[op_ind].empty())
+        fn_seq->append(std::make_unique<DeallocFunction>(dealloc_list_map[op_ind]));
+      builder.append(op_ind, {op_ind, &op, lower_info, std::move(fn_seq)});
+    }
+  }
-  assert(cf_tensor_builder);
-  assert(cf_tensor_reg);
 
-  for (auto ind : indices)
+  auto code_map = builder.releaseCodeMap();
+
+  auto exec = new exec::LinearExecutor{std::move(lowered_graph),
+                                       std::move(backend_contexts),
+                                       tensor_regs,
+                                       std::move(code_map),
+                                       order,
+                                       tracing_ctx};
+
+  if (!options->trace_filepath.empty())
   {
-    const auto &operand = lowered_graph.graph().operands().at(ind);
-    auto tensor = std::make_shared<backend::controlflow::UserTensor>(
-      operand.info(),
-      ir::Layout::NHWC, /* FIXME find op_seq for this operand and use frontend_layout */
-      cf_tensor_builder->dynamicTensorManager());
-
-    // Add tensor to controlflow TensorRegistry.
-    cf_tensor_reg->setNativeUserTensor(ind, tensor);
-    ret.push_back(tensor);
+    std::unique_ptr<exec::IExecutionObserver> ctp =
+      std::make_unique<exec::TracingObserver>(options->trace_filepath, exec->graph(), tracing_ctx);
+    exec->addObserver(std::move(ctp));
   }
-  return ret;
-}
+#ifdef MINMAX_H5DUMPER
+  if (!options->minmax_filepath.empty())
+    exec->addObserver(std::make_unique<exec::MinMaxRecorder>(
+      options->minmax_filepath, exec->graph(), exec->getBackendContexts()));
+#endif
 
-void ExecutorFactory::prepareExternalTensors(compiler::LoweredGraph &lowered_graph)
-{
-  TensorRegistries tensor_regs{lowered_graph.backend_contexts(), true};
-
-  lowered_graph.op_seqs().iterate(
-    [&](const ir::OpSequenceIndex &op_seq_index, const ir::OpSequence &op_seq) {
-      auto lower_info = lowered_graph.getLowerInfo(op_seq_index);
-      auto &backend_ctx = lowered_graph.backend_contexts().at(lower_info->backend());
-      for (auto ind : (op_seq.getInputs() + op_seq.getOutputs()) | ir::Remove::DUPLICATED |
-                      ir::Remove::UNDEFINED)
-      {
-        // If an OpSequence input/output tensor does not have a own tensor object,
-        // it must be using external tensors, so find the tensor from other tensor builders and
-        // set the tensor to this tensor builder if portable
-        if (!backend_ctx->tensor_registry->getITensor(ind))
-        {
-          auto tensor = tensor_regs.getITensor(ind);
-          assert(tensor); // The tensor must have been registered
-          auto ptensor = std::dynamic_pointer_cast<backend::IPortableTensor>(tensor);
-          if (ptensor)
-            backend_ctx->tensor_registry->setMigrantTensor(ind, ptensor);
-        }
-      }
-    });
+  return exec;
 }
 
 exec::IExecutor *
-ExecutorFactory::createLinearExecutor(std::unique_ptr<compiler::LoweredGraph> lowered_graph,
-                                      const compiler::CompilerOptions &options,
-                                      const std::shared_ptr<exec::ExecutorMap> &executor_map)
+ExecutorFactory::createDataflowExecutor(std::unique_ptr<compiler::LoweredGraph> lowered_graph,
+                                        const std::shared_ptr<exec::IExecutors> &executors,
+                                        const ExecutorFactoryArgs &args, bool parallel)
 {
-  const auto &backend_contexts = lowered_graph->backend_contexts();
+  const auto options = args.options;
+  const auto &model_index = args.model_index;
+  const auto tracing_ctx = args.tracing_ctx;
+  auto custom_kernel_builder = args.custom_kernel_builder;
 
-  initializeBackendContext(lowered_graph.get());
+  backend::BackendContexts backend_contexts =
+    createBackendContexts(*lowered_graph, options->executor == "Linear", custom_kernel_builder);
 
-  // linearize
-  assert(!lowered_graph->graph().isBuildingPhase());
+  TensorRegistries tensor_regs{backend_contexts, true};
 
-  /*************************************************
-   * Backend dependent analysis & optimization phase
-   *************************************************/
+  initializeSubgraphIOTensors(
+    *lowered_graph, backend_contexts,
+    (lowered_graph->graph().getInputs() + lowered_graph->graph().getOutputs()) |
+      ir::Remove::DUPLICATED | ir::Remove::UNDEFINED);
 
-  for (auto &pair : backend_contexts)
+  for (auto &&pair : backend_contexts)
   {
-    auto &optimizer = pair.second->optimizer;
-    if (optimizer)
-      optimizer->optimize();
+    pair.second->genTensors();
   }
 
-  /**********************************************************
-   * Backend dependent analysis & optimization phase finished
-   **********************************************************/
+  prepareMigrantTensors(*lowered_graph, backend_contexts);
 
-  /***********************
-   * Code generation phase
-   ***********************/
+  // Give some runtime objects to builtin KernelGenerator
+  prepareBuiltinBackend(tensor_regs, executors, backend_contexts, model_index);
 
-  auto order = Linear::linearize(*lowered_graph);
-  runTensorRegistration(lowered_graph.get(), order);
+  ExecutionBuilder builder;
 
-  std::vector<std::shared_ptr<backend::ITensor>> input_tensors;
-  std::vector<std::shared_ptr<backend::ITensor>> output_tensors;
-  if (options.is_primary_subgraph)
+  // Adjust the order of backends for the upcoming iteration
+  auto ordered_contexts = orderBackendContext(backend_contexts);
+
+  // Generate kernels
+  for (auto &&pair : ordered_contexts)
   {
-    input_tensors = initializeModelIOTensors(*lowered_graph, lowered_graph->graph().getInputs());
-    output_tensors = initializeModelIOTensors(*lowered_graph, lowered_graph->graph().getOutputs());
+    auto codes = pair.second->genKernels();
+    for (auto &&pair : codes)
+    {
+      auto &op_ind = pair.first;
+      auto &fn_seq = pair.second;
+      auto &op = lowered_graph->graph().operations().at(op_ind);
+      auto lower_info = lowered_graph->lower_info().operation.getRawPtr(op_ind);
+      if (options->he_profiling_mode)
+        fn_seq->wrap<SyncFunction>(lower_info->backend()->config());
+      builder.append(op_ind, {op_ind, &op, lower_info, std::move(fn_seq)});
+    }
   }
 
-  Linear::dump(*lowered_graph, order);
-  Linear::planTensors(*lowered_graph, order);
+  auto code_map = builder.releaseCodeMap();
 
-  TensorBuilders tensor_builders{lowered_graph->backend_contexts(), true};
-  TensorRegistries tensor_regs{lowered_graph->backend_contexts(), true};
+  exec::ExecutorBase *exec = nullptr;
+  if (parallel)
+  {
+    exec = new exec::ParallelExecutor{std::move(lowered_graph), std::move(backend_contexts),
                                      tensor_regs, std::move(code_map), tracing_ctx};
+  }
+  else
+  {
+    auto dataflow_exec =
+      new exec::DataflowExecutor{std::move(lowered_graph), std::move(backend_contexts), tensor_regs,
                                 std::move(code_map), tracing_ctx};
+    if (options->he_profiling_mode)
+    {
+      std::vector<const backend::Backend *> backends;
+      for (const auto &pair : backend_contexts)
+      {
+        backends.push_back(pair.first);
+      }
+      auto et = std::make_shared<exec::ExecTime>(backends);
+      std::unique_ptr<exec::IExecutionObserver> obs =
+        std::make_unique<exec::ProfileObserver>(et, dataflow_exec->graph());
+      dataflow_exec->addObserver(std::move(obs));
+    }
+    exec = dataflow_exec;
+  }
 
-  for (auto &tensor_builder : tensor_builders)
+  if (!options->trace_filepath.empty())
   {
-    tensor_builder->prepare();
+    std::unique_ptr<exec::IExecutionObserver> ctp =
+      std::make_unique<exec::TracingObserver>(options->trace_filepath, exec->graph(), tracing_ctx);
+    exec->addObserver(std::move(ctp));
   }
 
-  prepareExternalTensors(*lowered_graph);
+  return exec;
+}
 
-  ExecutionBuilder builder;
+#ifdef ONERT_TRAIN
+exec::IExecutor *
+ExecutorFactory::create(std::unique_ptr<compiler::train::LoweredTrainableGraph> lowered_graph,
+                        const std::shared_ptr<exec::IExecutors> &executors,
+                        const ExecutorFactoryArgs &args,
+                        const std::shared_ptr<exec::train::optimizer::Optimizer> &optimizer)
+{
+  assert(args.options != nullptr);
 
-  // Generate kernels
-  lowered_graph->iterateTopolOpSeqs([&](const ir::OpSequenceIndex &op_seq_index,
-                                        const ir::OpSequence &op_seq) {
-    auto lower_info = lowered_graph->getLowerInfo(op_seq_index);
-    auto kernel_gen = lowered_graph->backend_contexts().at(lower_info->backend())->kernel_gen;
-    // Set TensorBuilderSet and ExecutorMap to kernel_gen of control flow
-    auto cf_kernel_gen = dynamic_cast<backend::controlflow::KernelGenerator *>(kernel_gen.get());
-    if (cf_kernel_gen != nullptr)
+  if (args.options->executor != "Linear")
+    throw std::runtime_error("ExecutorFactory: TrainableExecutor supports only 'Linear' now");
+
+  return createTrainableExecutor(std::move(lowered_graph), executors, args, optimizer);
+}
+
+void ExecutorFactory::prepareMigrantTensors(
+  compiler::ILoweredGraph &lowered_graph,
+  const backend::train::TrainableBackendContexts &backend_contexts)
+{
+  train::TensorRegistries tensor_regs{backend_contexts, true};
+
+  lowered_graph.graph().operations().iterate(
+    [&](const ir::OperationIndex &op_ind, const ir::IOperation &op) {
+      auto lower_info = lowered_graph.lower_info().operation.getRawPtr(op_ind);
+      auto &backend_ctx = backend_contexts.at(lower_info->backend());
+      for (auto &&ind :
+           (op.getInputs() + op.getOutputs()) | ir::Remove::DUPLICATED | ir::Remove::UNDEFINED)
+      {
+        // If an Operation's input/output tensor does not have its own tensor object,
+        // it must be using migrant tensors, so find the tensor from other tensor registries and
+        // register it to the current tensor registry if it is portable
+        if (!backend_ctx->tensor_registry()->getITensor(ind))
+        {
+          auto tensor = tensor_regs.getITensor(ind);
+          assert(tensor); // The tensor must have been registered
+          auto ptensor = dynamic_cast<backend::IPortableTensor *>(tensor);
+          if (ptensor)
+            backend_ctx->tensor_registry()->setMigrantTensor(ind, ptensor);
+        }
+      }
+    });
+}
+
+exec::IExecutor *ExecutorFactory::createTrainableExecutor(
+  std::unique_ptr<compiler::train::LoweredTrainableGraph> lowered_graph,
+  const std::shared_ptr<exec::IExecutors> &, const ExecutorFactoryArgs &args,
+  const std::shared_ptr<exec::train::optimizer::Optimizer> &optimizer)
+{
+  const auto options = args.options;
+  const auto tracing_ctx = args.tracing_ctx;
+  auto custom_kernel_builder = args.custom_kernel_builder;
+
+  auto &graph = lowered_graph->graph();
+
+  lowered_graph->trainable_graph().operations().iterate([](const onert::ir::OperationIndex &,
+                                                           const onert::ir::IOperation &op) {
+    try
     {
-      cf_kernel_gen->setTensorRegistries(tensor_regs);
-      cf_kernel_gen->setExecutorMap(executor_map);
+      UNUSED_RELEASE(dynamic_cast<const ir::train::ITrainableOperation &>(op));
     }
-    auto fn_seq = kernel_gen->generate(op_seq);
-    if (options.he_profiling_mode)
+    catch (std::bad_cast &)
     {
-      fn_seq->wrap<SyncFunction>(lower_info->backend()->config());
+      throw std::runtime_error("ExecutorFactory: " + op.name() + " is not a trainable operation yet");
     }
-    builder.append(op_seq_index, {&op_seq, lower_info, std::move(fn_seq)});
   });
 
-  for (auto &tensor_builder : tensor_builders)
-  {
-    tensor_builder->allocate();
-  }
+  // TODO Create context only once instead of replacing
+  backend::train::TrainableBackendContexts tbackend_contexts;
+  backend::BackendContexts base_backend_contexts =
+    createBackendContexts(*lowered_graph, true, custom_kernel_builder);
 
-  for (auto &pair : backend_contexts)
+  // Replace BackendContext with TrainableBackendContext
+  for (auto &&pair : base_backend_contexts)
   {
-    pair.second->initConsts();
-  }
-
-  lowered_graph->graph().operands().iterate(
-    [](const ir::OperandIndex &, ir::Operand &obj) { obj.releaseData(); });
-
-  auto code_map = builder.releaseCodeMap();
-
-  for (auto &it : code_map)
-  {
-    auto op_seq_index = it.first;
-    auto &fn_seq = it.second.fn_seq;
-
-    fn_seq->iterate([&](exec::IFunction &ifunc) {
-      ifunc.prepare();
-      auto backend = lowered_graph->getLowerInfo(op_seq_index)->backend();
-      auto tensor_builder = lowered_graph->backend_contexts().at(backend)->tensor_builder;
-      tensor_builder->postFunctionPrepare();
+    auto ctx = pair.second.get();
+    const auto &data = ctx->data();
+
+    // Create partial and trainable graphs
+    auto tgraph = std::make_unique<ir::train::TrainableGraph>(*data.graph);
+    data.graph->operations().iterate(
+      [&](const onert::ir::OperationIndex &op_index, const onert::ir::IOperation &) {
+        const auto &orig_tgraph = lowered_graph->trainable_graph();
+        const auto &trainable_op = orig_tgraph.operation(op_index);
+        auto gen_index = tgraph->replaceOperation(op_index, trainable_op.clone());
+        UNUSED_RELEASE(gen_index);
+        assert(gen_index == op_index);
+      });
+    data.graph->operands().iterate([&](const ir::OperandIndex &index, const ir::Operand &) {
+      const auto &orig_tgraph = lowered_graph->trainable_graph();
+      if (orig_tgraph.derivatives().exist(index))
+      {
+        const auto &deriv = orig_tgraph.derivatives().at(index);
+        auto new_deriv = std::make_unique<ir::Operand>(deriv);
+        auto gen_index = tgraph->addDerivative(index, std::move(new_deriv));
+        UNUSED_RELEASE(gen_index);
+        assert(gen_index == index);
+      }
     });
-  }
 
-  backend::TensorManagerSet tensor_mgrs = createTensorManagerSet(tensor_builders);
-  auto exec = new exec::LinearExecutor{
-    std::move(lowered_graph), input_tensors, output_tensors, tensor_regs,
-    std::move(tensor_mgrs),   std::move(code_map), order};
+    // Remove outputs of whole graph from external_operands
+    auto external_operands = data.external_operands;
+    for (const auto &index : lowered_graph->trainable_graph().getOutputs())
+    {
+      if (external_operands.contains(index))
+        external_operands.remove(index);
+    }
 
-  if (!options.trace_filepath.empty())
-  {
-    std::unique_ptr<exec::IExecutionObserver> ctp =
-      std::make_unique<exec::ChromeTracingObserver>(options.trace_filepath, exec->graph());
-    exec->addObserver(std::move(ctp));
+    // Set trainable context data
+    backend::train::TrainableContextData tdata;
+    tdata.tgraph = std::move(tgraph);
+    tdata.op_order = std::move(data.op_order);
+    tdata.external_operands = std::move(external_operands);
+    tdata.operand_layouts = std::move(data.operand_layouts);
+    tdata.custom_kernel_builder = std::move(data.custom_kernel_builder);
+    tdata.is_linear_executor = data.is_linear_executor;
+    tdata.optimizer = optimizer;
+
+    // TODO Remove dynamic_cast
+    try
+    {
+      const auto backend = pair.first;
+      const auto tbackend = dynamic_cast<const backend::train::ITrainableBackend *>(backend);
+      tbackend_contexts.emplace(backend, tbackend->newContext(std::move(tdata)));
+    }
+    catch (const std::bad_cast &)
+    {
+      throw std::runtime_error("ExecutorFactory: Invalid backend - TrainableExecutor does not "
+                               "support non-trainable backends");
+    }
   }
+  base_backend_contexts.clear();
 
-  return exec;
-}
-
-exec::IExecutor *ExecutorFactory::createDataflowExecutor(
-  std::unique_ptr<compiler::LoweredGraph> lowered_graph, const compiler::CompilerOptions &options,
-  const std::shared_ptr<exec::ExecutorMap> &executor_map, bool parallel)
-{
-  const auto &backend_contexts = lowered_graph->backend_contexts();
+  train::TensorRegistries tensor_regs{tbackend_contexts, true};
 
-  initializeBackendContext(lowered_graph.get());
+  initializeSubgraphIOTensors(
+    *lowered_graph, tbackend_contexts,
+    (lowered_graph->graph().getInputs() + lowered_graph->graph().getOutputs()) |
+      ir::Remove::DUPLICATED | ir::Remove::UNDEFINED);
 
+  // linearize
   auto order = Linear::linearize(*lowered_graph);
-  runTensorRegistration(lowered_graph.get(), order);
+  Linear::dump(*lowered_graph, order);
 
-  std::vector<std::shared_ptr<backend::ITensor>> input_tensors;
-  std::vector<std::shared_ptr<backend::ITensor>> output_tensors;
-  if (options.is_primary_subgraph)
+  for (auto &&pair : tbackend_contexts)
   {
-    input_tensors = initializeModelIOTensors(*lowered_graph, lowered_graph->graph().getInputs());
-    output_tensors = initializeModelIOTensors(*lowered_graph, lowered_graph->graph().getOutputs());
+    pair.second->genTensors();
   }
 
-  TensorBuilders tensor_builders{lowered_graph->backend_contexts(), true};
-  TensorRegistries tensor_regs{lowered_graph->backend_contexts(), true};
-
-  // To make tensors never be deallocated, this is a workaround to use static memory planner
-  for (auto &tensor_builder : tensor_builders)
+  for (auto &&pair : tbackend_contexts)
   {
-    lowered_graph->graph().operands().iterate(
-      [&](const ir::OperandIndex &ind, const ir::Operand &) {
-        if (tensor_builder->isRegistered(ind))
-        {
-          tensor_builder->notifyFirstUse(ind);
-        }
-      });
+    auto tctx = pair.second.get();
+    tctx->genTrainingTensors();
   }
 
-  for (auto &tensor_builder : tensor_builders)
+  prepareMigrantTensors(*lowered_graph, tbackend_contexts);
+
+  // Give some runtime objects to builtin KernelGenerator
+  for (auto &&pair : tbackend_contexts)
   {
-    tensor_builder->prepare();
+    auto builtin_context =
+      dynamic_cast<backend::builtin::train::BackendContext *>(pair.second.get());
+    if (builtin_context != nullptr)
+    {
+      auto builtin_kernel_gen = builtin_context->kernel_gen;
+      builtin_kernel_gen->setTensorRegistries(tensor_regs);
+      builtin_kernel_gen->setWholeGraphOutputs(lowered_graph->trainable_graph().getOutputs());
+    }
   }
 
-  prepareExternalTensors(*lowered_graph);
+  // Adjust the order of backends for the upcoming iteration
+  auto ordered_contexts =
+    onert::orderBackendContext<backend::train::TrainableBackendContext>(tbackend_contexts);
 
-  ExecutionBuilder builder;
+  // TODO Remove this simulation
+  // Simulate the execution for deallocation of tensors
+  std::unordered_map<ir::OperationIndex, DeallocList> dealloc_list_map;
+  {
+    ir::OperandIndexMap<uint32_t> uses_map;
+    ir::OperandIndexSequence constants;
 
-  // Generate kernels
-  lowered_graph->iterateTopolOpSeqs([&](const ir::OpSequenceIndex &op_seq_index,
-                                        const ir::OpSequence &op_seq) {
-    auto lower_info = lowered_graph->getLowerInfo(op_seq_index);
-    auto kernel_gen = lowered_graph->backend_contexts().at(lower_info->backend())->kernel_gen;
-    // Set TensorBuilderSet and ExecutorMap to kernel_gen of control flow
-    auto cf_kernel_gen = dynamic_cast<backend::controlflow::KernelGenerator *>(kernel_gen.get());
-    if (cf_kernel_gen != nullptr)
+    auto model_io =
+      (graph.getInputs() + graph.getOutputs()) | ir::Remove::UNDEFINED | ir::Remove::DUPLICATED;
+
+    // Prepare scanning
+    graph.operands().iterate([&](const ir::OperandIndex &ind, const ir::Operand &obj) {
+      uses_map[ind] = obj.getUses().size();
+
+      if (obj.isConstant())
+        constants.append(ind);
+    });
+
+    // A trick to consider constants as an exception
+    for (const auto &ind : constants)
     {
-      assert(cf_kernel_gen != nullptr);
-      cf_kernel_gen->setTensorRegistries(tensor_regs);
-      cf_kernel_gen->setExecutorMap(executor_map);
+      uses_map[ind]++;
     }
-    auto fn_seq = kernel_gen->generate(op_seq);
-    if (options.he_profiling_mode)
+
+    for (const auto op_ind : order)
     {
-      fn_seq->wrap<SyncFunction>(lower_info->backend()->config());
+      const auto &op = graph.operations().at(op_ind);
+      auto op_inputs = op.getInputs() | ir::Remove::DUPLICATED | ir::Remove::UNDEFINED;
+      auto op_outputs = op.getOutputs() | ir::Remove::DUPLICATED | ir::Remove::UNDEFINED;
+
+      for (const auto &ind : op_inputs)
+      {
+        const auto &operand = graph.operands().at(ind);
+        assert(uses_map.find(ind) != uses_map.end());
+        assert(uses_map[ind] > 0);
+        uses_map[ind]--;
+        if (uses_map[ind] == 0 && !operand.info().isVariable() && !model_io.contains(ind))
+        {
+          dealloc_list_map[op_ind].emplace_back(tensor_regs.getITensor(ind));
+        }
+      }
     }
-    builder.append(op_seq_index, {&op_seq, lower_info, std::move(fn_seq)});
-  });
 
-  for (const auto &tensor_builder : tensor_builders)
-  {
-    tensor_builder->allocate();
-  }
+    // Dispose and validate
+    for (const auto &ind : constants)
+    {
+      --uses_map[ind];
+    }
 
-  for (auto &pair : backend_contexts)
-  {
-    pair.second->initConsts();
+    assert(
+      std::all_of(uses_map.begin(), uses_map.end(),
+                  [](std::pair<const ir::OperandIndex, uint32_t> it) { return it.second == 0; }));
   }
 
-  lowered_graph->graph().operands().iterate(
-    [](const ir::OperandIndex &, ir::Operand &obj) { obj.releaseData(); });
-
-  auto code_map = builder.releaseCodeMap();
-
-  for (auto &it : code_map)
+  // Check derivative tensors
   {
-    auto op_seq_index = it.first;
-    auto &fn_seq = it.second.fn_seq;
-
-    fn_seq->iterate([&](exec::IFunction &ifunc) {
-      ifunc.prepare();
-      auto backend = lowered_graph->getLowerInfo(op_seq_index)->backend();
-      auto tensor_builder = lowered_graph->backend_contexts().at(backend)->tensor_builder;
-      tensor_builder->postFunctionPrepare();
-    });
+    // TODO Support multiple subgraphs
+    // Check if the derivative tensors corresponding to inputs of model are nullptr
+    // NOTE The derivative tensors corresponding to inputs of model are for inputs of PermuteLayers
+    //      and they are nullptr because they are meaningless.
+    assert(std::all_of(lowered_graph->trainable_graph().getInputs().begin(),
+                       lowered_graph->trainable_graph().getInputs().end(),
+                       [&](const auto &input_idx) {
+                         return tensor_regs.getDerivativeITensor(input_idx) == nullptr;
+                       }));
+
+    // Check if the derivative tensors corresponding to outputs of model exist
+    assert(std::all_of(lowered_graph->trainable_graph().getOutputs().begin(),
+                       lowered_graph->trainable_graph().getOutputs().end(),
+                       [&](const auto &output_idx) {
+                         return tensor_regs.getDerivativeITensor(output_idx) != nullptr;
+                       }));
   }
 
-  backend::TensorManagerSet tensor_mgrs = createTensorManagerSet(tensor_builders);
-
-  exec::ExecutorBase *exec = nullptr;
-  if (parallel)
-  {
-    exec = new exec::ParallelExecutor{std::move(lowered_graph), input_tensors,
-                                      output_tensors, tensor_regs,
-                                      std::move(tensor_mgrs), std::move(code_map)};
-  }
-  else
+  train::TrainableCodeMap code_map;
+  // Generate kernels
+  for (auto &&pair : ordered_contexts)
   {
-    auto dataflow_exec = new exec::DataflowExecutor{std::move(lowered_graph), input_tensors,
-                                                    output_tensors, tensor_regs,
-                                                    std::move(tensor_mgrs), std::move(code_map)};
-    if (options.he_profiling_mode)
+    auto codes = pair.second->genKernels();
+    for (auto &&pair : codes)
     {
-      std::vector<const backend::Backend *> backends;
-      for (const auto &pair : backend_contexts)
-      {
-        backends.push_back(pair.first);
-      }
-      auto et = std::make_shared<exec::ExecTime>(backends);
-      std::unique_ptr<exec::IExecutionObserver> obs =
-        std::make_unique<exec::ProfileObserver>(et, dataflow_exec->graph());
-      dataflow_exec->addObserver(std::move(obs));
+      auto &op_ind = pair.first;
+      auto &tn_seq = pair.second;
+      auto &op = lowered_graph->trainable_graph().operation(op_ind);
+      auto lower_info = lowered_graph->lower_info().operation.getRawPtr(op_ind);
+
+      assert(code_map.find(op_ind) == code_map.end());
+      code_map.insert(
+        {op_ind, train::TrainableCodeAndInfo{op_ind, &op, lower_info, std::move(tn_seq)}});
     }
-    exec = dataflow_exec;
   }
 
-  if (!options.trace_filepath.empty())
+  if (order.size() != code_map.size())
+  {
+    throw std::runtime_error("ExecutorFactory: Some kernels are not generated");
+  }
+
+  auto exec = new exec::train::TrainableExecutor{std::move(lowered_graph),
+                                                 std::move(tbackend_contexts),
+                                                 tensor_regs,
+                                                 std::move(code_map),
+                                                 order,
+                                                 tracing_ctx};
+
+  if (!options->trace_filepath.empty())
   {
     std::unique_ptr<exec::IExecutionObserver> ctp =
-      std::make_unique<exec::ChromeTracingObserver>(options.trace_filepath, exec->graph());
+      std::make_unique<exec::TracingObserver>(options->trace_filepath, exec->graph(), tracing_ctx);
     exec->addObserver(std::move(ctp));
   }
+  // TODO Support MINMAX_H5DUMPER
 
   return exec;
 }
+#endif // ONERT_TRAIN
 
 } // namespace compiler
 } // namespace onert
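
The new createBackendContexts helper replaces the old per-OpSequence registration: it splits the lowered whole graph into one partial ir::Graph per backend, assigns each operand to the backend that defines it, copies each operation into its assigned backend's partial graph, and records any operand that is referenced but defined elsewhere as an external operand. The self-contained sketch below reproduces only that bookkeeping on a toy two-backend graph; all names and types here (ToyOp, def_backend, owned, external) are simplified stand-ins for illustration, not onert's real API.

#include <iostream>
#include <map>
#include <set>
#include <string>
#include <vector>

// Simplified stand-in for a lowered operation (illustrative only).
struct ToyOp
{
  std::string backend;     // backend assigned by lowering
  std::vector<int> inputs; // operand indices read by the op
};

int main()
{
  // op0 runs on "cpu" (reads operand 0); op1 runs on "acl_cl" (reads operand 1).
  std::vector<ToyOp> ops = {{"cpu", {0}}, {"acl_cl", {1}}};
  // Each operand belongs to the backend whose operation defines it.
  std::map<int, std::string> def_backend = {{0, "cpu"}, {1, "cpu"}, {2, "acl_cl"}};

  std::map<std::string, std::set<int>> owned;    // operands placed in each partial graph
  std::map<std::string, std::set<int>> external; // operands used but defined elsewhere

  for (auto &&[ind, backend] : def_backend)
    owned[backend].insert(ind);

  for (auto &&op : ops)
    for (int ind : op.inputs)
      if (owned[op.backend].count(ind) == 0)
        external[op.backend].insert(ind); // copied into the partial graph and marked external

  for (auto &&[backend, ext] : external)
    for (int ind : ext)
      std::cout << backend << " imports operand " << ind << "\n";
  // Prints: acl_cl imports operand 1
}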
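Both orderBackendContext helpers (the free template and the ExecutorFactory member) rely on the same deque trick: iterate the context map once, emplace_front for every ordinary backend and emplace_back for "builtin", so builtin's Permute kernels are generated only after every other backend's tensors exist. A minimal illustration of that ordering, where backend names stand in for the real context objects:

#include <deque>
#include <iostream>
#include <string>
#include <unordered_map>

int main()
{
  // Backend name -> context (an int stands in for the real context object).
  std::unordered_map<std::string, int> contexts = {{"cpu", 0}, {"builtin", 1}, {"acl_cl", 2}};

  std::deque<std::string> ordered;
  for (auto &&pair : contexts)
  {
    if (pair.first == "builtin")
      ordered.emplace_back(pair.first);  // builtin is appended, so it ends up last
    else
      ordered.emplace_front(pair.first); // every other backend is pushed to the front
  }

  for (auto &&name : ordered)
    std::cout << name << "\n"; // "builtin" is always printed last
}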
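The "simulate the execution" blocks in createLinearExecutor and createTrainableExecutor precompute, per operation, which non-I/O, non-variable tensors have no remaining uses once that operation has run; DeallocFunction is then appended to the operation's function sequence to free those buffers if they are dynamic. The sketch below shows the same use-count walk in isolation; the types (Op, the index aliases, the maps) are simplified stand-ins, not onert's real classes.

#include <cassert>
#include <cstdio>
#include <unordered_map>
#include <vector>

using OperandIndex = int;   // stands in for ir::OperandIndex
using OperationIndex = int; // stands in for ir::OperationIndex

struct Op
{
  std::vector<OperandIndex> inputs;
};

int main()
{
  // Operation 0 reads operand 0; operation 1 reads operands 0 and 1.
  std::vector<Op> order = {{{0}}, {{0, 1}}};
  // Use counts gathered up front, like uses_map in the diff.
  std::unordered_map<OperandIndex, int> uses = {{0, 2}, {1, 1}};

  // Per-operation list of operands that die right after the operation runs.
  std::unordered_map<OperationIndex, std::vector<OperandIndex>> dealloc_list_map;
  for (OperationIndex op_ind = 0; op_ind < static_cast<OperationIndex>(order.size()); ++op_ind)
  {
    for (auto ind : order[op_ind].inputs)
    {
      assert(uses[ind] > 0);
      if (--uses[ind] == 0)
        dealloc_list_map[op_ind].push_back(ind); // schedule deallocation here
    }
  }

  for (auto &&[op_ind, dead] : dealloc_list_map)
    for (auto ind : dead)
      std::printf("after operation %d: deallocate operand %d\n", op_ind, ind);
  // Both operands are freed after operation 1, their last reader.
}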
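ExecutorFactory::create still dispatches through a name-keyed table (_map.at(args.options->executor)), now passing IExecutors plus the consolidated ExecutorFactoryArgs instead of bare CompilerOptions and an ExecutorMap. A generic sketch of that dispatch-table pattern, with illustrative names (factory_map and the returned strings are placeholders, not onert's signatures):

#include <functional>
#include <iostream>
#include <stdexcept>
#include <string>
#include <unordered_map>

int main()
{
  // Executor name -> creation routine, mirroring ExecutorFactory's _map.
  std::unordered_map<std::string, std::function<std::string()>> factory_map = {
    {"Linear", [] { return std::string("LinearExecutor"); }},
    {"Dataflow", [] { return std::string("DataflowExecutor"); }},
    {"Parallel", [] { return std::string("ParallelExecutor"); }},
  };

  // at(...) throws std::out_of_range for an unknown executor name.
  std::cout << factory_map.at("Linear")() << "\n";
}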