Diffstat (limited to 'runtime/onert/core/src/compiler/ExecutorFactory.cc')
-rw-r--r-- | runtime/onert/core/src/compiler/ExecutorFactory.cc | 1037 |
1 file changed, 709 insertions, 328 deletions
diff --git a/runtime/onert/core/src/compiler/ExecutorFactory.cc b/runtime/onert/core/src/compiler/ExecutorFactory.cc
index 062c6c9c3..6a08524cc 100644
--- a/runtime/onert/core/src/compiler/ExecutorFactory.cc
+++ b/runtime/onert/core/src/compiler/ExecutorFactory.cc
@@ -16,26 +16,37 @@
 #include "ExecutorFactory.h"
 
+#include "Linear.h"
+#include "../backend/builtin/BackendContext.h"
+#include "../backend/builtin/Config.h"
+#include "../backend/builtin/UserTensor.h"
+#include "../dumper/text/GraphDumper.h"
+#include "../exec/DataflowExecutor.h"
+#include "../exec/ExecTime.h"
+#include "../exec/ExecutionObservers.h"
+#include "../exec/LinearExecutor.h"
+#ifdef MINMAX_H5DUMPER
+#include "../exec/MinMaxRecorder.h"
+#endif
+#include "../exec/ParallelExecutor.h"
+#include "../ir/OperationCloner.h"
+
+#include <backend/IPortableTensor.h>
+#include <compiler/BackendManager.h>
+#include <compiler/ExecutionBuilder.h>
+#include <util/TracingCtx.h>
+
 #include <functional>
-#include "exec/ExecutionObservers.h"
-#include "exec/LinearExecutor.h"
-#include "exec/DataflowExecutor.h"
-#include "exec/ParallelExecutor.h"
-#include "compiler/BackendManager.h"
-#include "compiler/ExecutionBuilder.h"
-#include "exec/ExecTime.h"
-#include "compiler/Linear.h"
-#include "compiler/TensorBuilders.h"
-#include "backend/IConstantInitializer.h"
-#include "backend/IKernelGenerator.h"
-#include "backend/IOptimizer.h"
-#include "backend/ITensorRegister.h"
-#include "backend/controlflow/Config.h"
-#include "backend/controlflow/KernelGenerator.h"
-#include "backend/controlflow/UserTensor.h"
-#include "backend/controlflow/TensorBuilder.h"
 #include <memory>
+#ifdef ONERT_TRAIN
+#include "../backend/builtin/train/BackendContext.h"
+#include "../exec/train/TrainableExecutor.h"
+
+#include <backend/train/TrainableBackendContext.h>
+#include <backend/train/ITrainableBackend.h>
+#endif // ONERT_TRAIN
+
 namespace onert
 {
 namespace
@@ -46,7 +57,7 @@ class SyncFunction final : public exec::IFunction
 public:
   virtual ~SyncFunction() = default;
   SyncFunction(std::unique_ptr<exec::IFunction> fn, const std::shared_ptr<backend::IConfig> config)
-  : _fn{std::move(fn)}, _config{config}
+    : _fn{std::move(fn)}, _config{config}
   {
     assert(_fn);
     assert(_config);
@@ -65,21 +76,218 @@ private:
   std::shared_ptr<backend::IConfig> _config;
 };
 
-// TODO Think of a better way to manage TensorManagers
-backend::TensorManagerSet createTensorManagerSet(const compiler::TensorBuilders &tensor_builders)
+using DeallocList = std::vector<backend::ITensor *>;
+// Deallocation after execution of an operation used by Linear Executor
+class DeallocFunction final : public exec::IFunction
+{
+public:
+  DeallocFunction(const DeallocList &tensors) : _dealloc_list{tensors} {}
+
+  void run() override
+  {
+    for (auto &&tensor : _dealloc_list)
+    {
+      if (!tensor->is_dynamic())
+        continue;
+      tensor->deallocBuffer();
+    }
+  }
+
+private:
+  DeallocList _dealloc_list;
+};
+
+// TODO Unify initializeSubgraphIOTensors
+void initializeSubgraphIOTensors(compiler::ILoweredGraph &lowered_graph,
+                                 const backend::BackendContexts &backend_contexts,
+                                 const ir::OperandIndexSequence &indices)
+{
+  // TODO Store builtin backend in BackendContext
+  std::shared_ptr<backend::builtin::TensorRegistry> builtin_tensor_reg;
+  for (const auto &e : backend_contexts)
+  {
+    auto backend = e.first;
+    auto &context = e.second;
+    if (backend->config()->id() == backend::builtin::Config::ID)
+    {
+      builtin_tensor_reg =
+        std::dynamic_pointer_cast<backend::builtin::TensorRegistry>(context->tensor_registry);
+    }
+  }
+  assert(builtin_tensor_reg);
+
+  for (auto &&ind : indices)
+  {
+    const auto &operand = lowered_graph.graph().operands().at(ind);
+    auto tensor = std::make_unique<backend::builtin::IOTensor>(
+      operand.info(),
+      ir::Layout::NHWC /* FIXME find operation for this operand and use frontend_layout */
+    );
+
+    // Add tensor to builtin TensorRegistry.
+    builtin_tensor_reg->setNativeIOTensor(ind, std::move(tensor));
+  }
+}
+
+#ifdef ONERT_TRAIN
+void initializeSubgraphIOTensors(compiler::ILoweredGraph &lowered_graph,
+                                 const backend::train::TrainableBackendContexts &backend_contexts,
+                                 const ir::OperandIndexSequence &indices)
 {
-  backend::TensorManagerSet tensor_mgrs;
-  for (auto &tensor_builder : tensor_builders)
+  std::shared_ptr<backend::builtin::train::TensorRegistry> builtin_tensor_reg;
+  for (const auto &e : backend_contexts)
   {
-    auto s_tensor_manager = tensor_builder->releaseStaticTensorManager();
-    if (s_tensor_manager != nullptr)
-      tensor_mgrs.insert(std::move(s_tensor_manager));
+    auto backend = e.first;
+    auto &context = e.second;
+    if (backend->config()->id() == backend::builtin::Config::ID)
+    {
+      builtin_tensor_reg = std::dynamic_pointer_cast<backend::builtin::train::TensorRegistry>(
+        context->tensor_registry());
+    }
+  }
+  assert(builtin_tensor_reg);
+
+  for (auto &&ind : indices)
+  {
+    const auto &operand = lowered_graph.graph().operands().at(ind);
+    auto tensor = std::make_unique<backend::builtin::IOTensor>(
+      operand.info(),
+      ir::Layout::NHWC /* FIXME find operation for this operand and use frontend_layout */
+    );
+
+    // Add tensor to builtin TensorRegistry.
+    builtin_tensor_reg->setNativeIOTensor(ind, std::move(tensor));
+  }
+}
+#endif // ONERT_TRAIN
+
+backend::BackendContexts
+createBackendContexts(compiler::ILoweredGraph &lgraph, bool linear_executor,
+                      std::shared_ptr<backend::custom::IKernelBuilder> custom_kernel_builder)
+{
+  backend::BackendContexts contexts;
+  auto &backend_manager = compiler::BackendManager::get();
+
+  std::unordered_map<const backend::Backend *, backend::ContextData> context_data_map;
+
+  // Generate partial graphs for each backend
+  for (auto &&backend : backend_manager.getAll())
+  {
+    auto &data = context_data_map[backend];
+    auto graph = std::make_unique<ir::Graph>();
+    graph->setLayout(lgraph.graph().layout());
+    data.graph = std::move(graph);
+  }
+
+  auto &whole_graph = lgraph.graph();
+  // Separate operands into partial graphs
+  whole_graph.operands().iterate([&](const ir::OperandIndex &operand_ind, ir::Operand &operand) {
+    auto &operand_li = lgraph.lower_info().operand;
+    const auto &def_factors = operand_li.at(operand_ind).def_factors();
+    if (def_factors.size() == 0) // Ignore unused tensor
+      return;
+    const auto &def_factor = def_factors.getOnlyElement();
+    const auto backend = def_factor.backend();
+    auto &partial_graph = *context_data_map[backend].graph;
+    auto &operand_layouts = context_data_map[backend].operand_layouts;
+    assert(operand_layouts.find(operand_ind) == operand_layouts.end());
+    operand_layouts[operand_ind] = def_factor.layout();
+
+    // Copy the operand and insert it to the partial graph
+    auto new_operand = std::make_unique<ir::Operand>(operand);
+    new_operand->clearDefUse();
+    operand.releaseData(); // Deref data of LoweredGraph
+    auto new_operand_ind = partial_graph.addOperand(operand_ind, std::move(new_operand));
+    UNUSED_RELEASE(new_operand_ind);
+    assert(new_operand_ind == operand_ind);
+  });
+  // Separate operations into partial graphs
+  whole_graph.operations().iterate(
+    [&](const ir::OperationIndex &op_ind, const ir::IOperation &operation) {
+      auto &op_li = lgraph.lower_info().operation;
+      auto backend = op_li.at(op_ind).backend();
+      auto &partial_graph = *context_data_map[backend].graph;
+      auto &external_operands = context_data_map[backend].external_operands;
+      auto &operand_layouts = context_data_map[backend].operand_layouts;
+
+      {
+        // Add missing operands (externals)
+        auto io_list = (operation.getInputs() + operation.getOutputs()) | ir::Remove::DUPLICATED |
+                       ir::Remove::UNDEFINED;
+        for (auto &&operand_ind : io_list)
+        {
+          if (partial_graph.operands().exist(operand_ind))
+            continue;
+
+          // Copy the operand and insert it to the partial graph
+          const auto &operand = whole_graph.operands().at(operand_ind);
+          auto new_operand = std::make_unique<ir::Operand>(operand);
+          new_operand->clearDefUse();
+          auto new_operand_ind = partial_graph.addOperand(operand_ind, std::move(new_operand));
+          UNUSED_RELEASE(new_operand_ind);
+          assert(new_operand_ind == operand_ind);
+
+          auto layout =
+            lgraph.lower_info().operand.at(operand_ind).def_factors().getOnlyElement().layout();
+          assert(operand_layouts.find(operand_ind) == operand_layouts.end());
+          operand_layouts[operand_ind] = layout;
+          external_operands.add(operand_ind);
+        }
+
+        auto new_op_ind = partial_graph.addOperation(op_ind, clone(operation));
+        UNUSED_RELEASE(new_op_ind);
+        assert(new_op_ind == op_ind);
+      }
+    });
+
+  // Create contexts
+  auto whole_op_order = lgraph.graph().topolSortOperations();
+  for (auto &&pair : context_data_map)
+  {
+    auto backend = pair.first;
+    auto &data = pair.second;
+    // Handle graph input/outputs or external tensors
+    data.graph->operands().iterate([&](const ir::OperandIndex &ind, const ir::Operand &operand) {
+      if (whole_graph.getInputs().contains(ind) || whole_graph.getOutputs().contains(ind))
+        data.external_operands.add(ind);
+      // Inputs are either "graph input" or "no def op and non-constant"
+      if (whole_graph.getInputs().contains(ind) ||
+          (!operand.getDef().valid() && !operand.isConstant()))
+        data.graph->addInput(ind);
+      // Outputs are either "graph output" or "no uses"
+      if (whole_graph.getOutputs().contains(ind) || operand.getUses().size() == 0)
+        data.graph->addOutput(ind);
+    });
+    dumper::text::dumpGraph(*data.graph);
+
+    std::copy_if(whole_op_order.begin(), whole_op_order.end(), std::back_inserter(data.op_order),
+                 [&](const auto &ind) { return data.graph->operations().exist(ind); });
+    data.is_linear_executor = linear_executor;
+    data.custom_kernel_builder = custom_kernel_builder;
+    contexts.emplace(backend, backend->newContext(std::move(data)));
+  }
+  return contexts;
+}
+
+template <typename Context>
+std::deque<std::pair<const backend::Backend *, Context *>> orderBackendContext(
+  const std::unordered_map<const backend::Backend *, std::unique_ptr<Context>> &tbackend_contexts)
+{
+  std::deque<std::pair<const backend::Backend *, Context *>> ordered_contexts;
 
-    auto d_tensor_manager = tensor_builder->releaseDynamicTensorManager();
-    if (d_tensor_manager != nullptr)
-      tensor_mgrs.insert(std::move(d_tensor_manager));
+  for (auto &&pair : tbackend_contexts)
+  {
+    // NOTE builtin backend must be processed last.
+    // This is because of Permute layer's specialty: it is the only operation that could have
+    // different ITensor objects for the input and the output, and it requires that all other
+    // backends' tensors be ready to use.
+    if (pair.first->config()->id() == "builtin")
+      ordered_contexts.emplace_back(pair.first, pair.second.get());
+    else
+      ordered_contexts.emplace_front(pair.first, pair.second.get());
   }
-  return tensor_mgrs;
+
+  return ordered_contexts;
 }
 
 } // namespace
@@ -106,415 +314,588 @@ ExecutorFactory::ExecutorFactory()
 }
 
 exec::IExecutor *ExecutorFactory::create(std::unique_ptr<compiler::LoweredGraph> lowered_graph,
-                                         const compiler::CompilerOptions &options,
-                                         const std::shared_ptr<exec::ExecutorMap> &executor_map)
+                                         const std::shared_ptr<exec::IExecutors> &executors,
+                                         const ExecutorFactoryArgs &args)
 {
-  return _map.at(options.executor)(std::move(lowered_graph), options, executor_map);
+  assert(args.options != nullptr);
+  return _map.at(args.options->executor)(std::move(lowered_graph), executors, args);
 }
 
-void ExecutorFactory::initializeBackendContext(compiler::LoweredGraph *lowered_graph)
+void ExecutorFactory::prepareMigrantTensors(compiler::ILoweredGraph &lowered_graph,
+                                            const backend::BackendContexts &backend_contexts)
 {
-  struct Entry
-  {
-    std::vector<backend::BackendContext::OperationInfo> operation_list;
-    std::vector<ir::OperandIndex> operand_list;
-  };
-  std::unordered_map<const backend::Backend *, Entry> backend_assets;
-
-  // Build lists for operations
-  lowered_graph->op_seqs().iterate(
-    [&](const ir::OpSequenceIndex &op_seq_index, const ir::OpSequence &op_seq) {
-      auto &op_seq_li = lowered_graph->getLowerInfo()->op_seq;
-      auto backend = op_seq_li.at(op_seq_index)->backend();
-      for (auto &operation_idx : op_seq.operations())
+  TensorRegistries tensor_regs{backend_contexts, true};
+
+  lowered_graph.graph().operations().iterate(
+    [&](const ir::OperationIndex &op_ind, const ir::IOperation &op) {
+      auto lower_info = lowered_graph.lower_info().operation.getRawPtr(op_ind);
+      auto &backend_ctx = backend_contexts.at(lower_info->backend());
+      for (auto &&ind :
+           (op.getInputs() + op.getOutputs()) | ir::Remove::DUPLICATED | ir::Remove::UNDEFINED)
+      {
+        // If an Operation's input/output tensor does not have its own tensor object,
+        // it must be using migrant tensors, so find the tensor from other tensor registries and
+        // register it to the current tensor registry if it is portable
+        if (!backend_ctx->tensor_registry->getITensor(ind))
         {
-          backend_assets[backend].operation_list.emplace_back(operation_idx, op_seq.getLayout());
+          auto tensor = tensor_regs.getITensor(ind);
+          assert(tensor); // The tensor must have been registered
+          auto ptensor = dynamic_cast<backend::IPortableTensor *>(tensor);
+          if (ptensor)
+            backend_ctx->tensor_registry->setMigrantTensor(ind, ptensor);
         }
-      });
+      }
+    });
+}
 
-  // Build lists for operands
-  lowered_graph->graph().operands().iterate([&](const ir::OperandIndex &ind, const ir::Operand &) {
-    const auto lower_info = lowered_graph->getLowerInfo(ind);
-    for (auto factor : lower_info->def_factors())
+void ExecutorFactory::prepareBuiltinBackend(const TensorRegistries &tensor_regs,
+                                            const std::shared_ptr<exec::IExecutors> &executors,
+                                            const backend::BackendContexts &backend_contexts,
+                                            const ir::ModelIndex &index)
+{
+  for (auto &&pair : backend_contexts)
+  {
+    auto builtin_context = dynamic_cast<backend::builtin::BackendContext *>(pair.second.get());
+    if (builtin_context != nullptr)
     {
-      auto backend = factor.backend();
-      backend_assets[backend].operand_list.emplace_back(ind);
+      auto builtin_kernel_gen = builtin_context->kernel_gen;
+      builtin_kernel_gen->setTensorRegistries(tensor_regs);
+      builtin_kernel_gen->setExecutors(executors);
+      builtin_kernel_gen->setModelIndex(index);
     }
-  });
+  }
+}
 
-  for (auto &pair : backend_assets)
+std::deque<std::pair<const backend::Backend *, backend::BackendContext *>>
+ExecutorFactory::orderBackendContext(const backend::BackendContexts &backend_contexts)
+{
+  std::deque<std::pair<const backend::Backend *, backend::BackendContext *>> ordered_contexts;
+  for (auto &&pair : backend_contexts)
   {
-    auto backend = pair.first;
-    auto &arg = pair.second;
-    lowered_graph->backend_contexts().at(backend)->initialize(arg.operation_list, arg.operand_list);
+    // NOTE builtin backend must be processed last.
+    // This is because of Permute layer's specialty: it is the only operation that could have
+    // different ITensor objects for the input and the output, and it requires that all other
+    // backends' tensors be ready to use.
+    if (pair.first->config()->id() == "builtin")
+      ordered_contexts.emplace_back(pair.first, pair.second.get());
+    else
+      ordered_contexts.emplace_front(pair.first, pair.second.get());
   }
+  return ordered_contexts;
 }
 
-void ExecutorFactory::runTensorRegistration(compiler::LoweredGraph *lowered_graph,
-                                            const std::vector<ir::OpSequenceIndex> &order)
+exec::IExecutor *
+ExecutorFactory::createLinearExecutor(std::unique_ptr<compiler::LoweredGraph> lowered_graph,
+                                      const std::shared_ptr<exec::IExecutors> &executors,
+                                      const ExecutorFactoryArgs &args)
 {
-  for (const auto index : order)
+  const auto options = args.options;
+  const auto &model_index = args.model_index;
+  const auto tracing_ctx = args.tracing_ctx;
+  auto custom_kernel_builder = args.custom_kernel_builder;
+  auto &graph = lowered_graph->graph();
+
+  backend::BackendContexts backend_contexts =
+    createBackendContexts(*lowered_graph, options->executor == "Linear", custom_kernel_builder);
+
+  TensorRegistries tensor_regs{backend_contexts, true};
+
+  initializeSubgraphIOTensors(
+    *lowered_graph, backend_contexts,
+    (lowered_graph->graph().getInputs() + lowered_graph->graph().getOutputs()) |
+      ir::Remove::DUPLICATED | ir::Remove::UNDEFINED);
+
+  // linearize
+  auto order = Linear::linearize(*lowered_graph);
+  Linear::dump(*lowered_graph, order);
+
+  for (auto &&pair : backend_contexts)
   {
-    const auto &op_seq = lowered_graph->op_seqs().at(index);
-    const auto backend = lowered_graph->getLowerInfo(index)->backend();
-    const auto tensor_register = lowered_graph->backend_contexts().at(backend)->tensor_register;
-    auto tensor_builder = lowered_graph->backend_contexts().at(backend)->tensor_builder;
-    auto model_io = lowered_graph->graph().getInputs() + lowered_graph->graph().getOutputs();
+    pair.second->genTensors();
+  }
+
+  prepareMigrantTensors(*lowered_graph, backend_contexts);
 
-    if (tensor_register)
+  // Give some runtime objects to builtin KernelGenerator
+  prepareBuiltinBackend(tensor_regs, executors, backend_contexts, model_index);
+
+  ExecutionBuilder builder;
+
+  // Adjust the order of backends for the upcoming iteration
+  auto ordered_contexts = orderBackendContext(backend_contexts);
+
+  // Simulate the execution for deallocation of tensors
+  std::unordered_map<ir::OperationIndex, DeallocList> dealloc_list_map;
+  {
+    ir::OperandIndexMap<uint32_t> uses_map;
+    ir::OperandIndexSequence constants;
+
+    auto model_io =
+      (graph.getInputs() + graph.getOutputs()) | ir::Remove::UNDEFINED | ir::Remove::DUPLICATED;
+
+    // Prepare scanning
+    graph.operands().iterate([&](const ir::OperandIndex &ind, const ir::Operand &obj) {
+      uses_map[ind] = obj.getUses().size();
+
+      if (obj.isConstant())
+        constants.append(ind);
+    });
+
+    // A trick to consider constants as an exception
+    for (const auto &ind : constants)
     {
-      // Custom registration
-      tensor_register->registerTensors(op_seq, lowered_graph->getLowerInfo());
+      uses_map[ind]++;
     }
-    else
+
+    for (const auto &op_ind : order)
     {
-      // Default registration
-      for (const auto op_idx : op_seq)
+      const auto &op = graph.operations().at(op_ind);
+      auto op_inputs = op.getInputs() | ir::Remove::DUPLICATED | ir::Remove::UNDEFINED;
+      auto op_outputs = op.getOutputs() | ir::Remove::DUPLICATED | ir::Remove::UNDEFINED;
+
+      for (const auto &ind : op_inputs)
       {
-        const auto &op = lowered_graph->graph().operations().at(op_idx);
-        for (const auto &index : (op.getInputs() | ir::Remove::UNDEFINED) + op.getOutputs())
+        const auto &operand = graph.operands().at(ind);
+        assert(uses_map.find(ind) != uses_map.end());
+        assert(uses_map[ind] > 0);
+        uses_map[ind]--;
+        if (uses_map[ind] == 0 && !operand.info().isVariable() && !model_io.contains(ind))
         {
-          if (!tensor_builder->isRegistered(index) && !model_io.contains(index))
-          {
-            const auto &operand_lower_info =
-              lowered_graph->getLowerInfo(index)->def_factors().getOnlyElement();
-
-            // E.g., permute (CPU) -> tensor A -> MaxPool2D(acl_cl)
-            // op.getOutputs() of permute (CPU) returns tensor A
-            // but tensor A belongs to the backend of acl_cl.
-            // So, we have to make this tensor NOT registered for CPU.
-            if (operand_lower_info.backend() != backend)
-              continue;
-
-            const auto &obj = lowered_graph->graph().operands().at(index);
-            const auto frontend_layout = op_seq.getLayout();
-            const auto backend_layout = operand_lower_info.layout();
-            ir::OperandInfo backend_info{permuteShape(obj.shape(), frontend_layout, backend_layout),
-                                         obj.typeInfo(), obj.info().memAllocType(),
-                                         obj.isConstant()};
-            tensor_builder->registerTensorInfo(index, backend_info, backend_layout);
-          }
+          dealloc_list_map[op_ind].emplace_back(tensor_regs.getITensor(ind));
         }
       }
     }
-  }
-}
 
-std::vector<std::shared_ptr<backend::ITensor>>
-ExecutorFactory::initializeModelIOTensors(compiler::LoweredGraph &lowered_graph,
-                                          const ir::OperandIndexSequence &indices)
-{
-  std::vector<std::shared_ptr<backend::ITensor>> ret;
+    // Dispose and validate
+    for (const auto &ind : constants)
+    {
+      --uses_map[ind];
+    }
 
-  // TODO Store controlflow backend in BackendContext
-  std::shared_ptr<backend::controlflow::TensorBuilder> cf_tensor_builder;
-  std::shared_ptr<backend::controlflow::TensorRegistry> cf_tensor_reg;
-  for (const auto &e : lowered_graph.backend_contexts())
+    assert(
+      std::all_of(uses_map.begin(), uses_map.end(),
+                  [](std::pair<const ir::OperandIndex, uint32_t> it) { return it.second == 0; }));
+  }
+
+  // Generate kernels
+  for (auto &&pair : ordered_contexts)
   {
-    auto backend = e.first;
-    auto &context = e.second;
-    if (backend->config()->id() == backend::controlflow::Config::ID)
+    auto codes = pair.second->genKernels();
+    for (auto &&pair : codes)
     {
-      cf_tensor_builder =
-        std::dynamic_pointer_cast<backend::controlflow::TensorBuilder>(context->tensor_builder);
-      cf_tensor_reg =
-        std::dynamic_pointer_cast<backend::controlflow::TensorRegistry>(context->tensor_registry);
+      auto &op_ind = pair.first;
+      auto &fn_seq = pair.second;
+      auto &op = lowered_graph->graph().operations().at(op_ind);
+      auto lower_info = lowered_graph->lower_info().operation.getRawPtr(op_ind);
+      if (options->he_profiling_mode)
+        fn_seq->wrap<SyncFunction>(lower_info->backend()->config());
+      if (!dealloc_list_map[op_ind].empty())
+        fn_seq->append(std::make_unique<DeallocFunction>(dealloc_list_map[op_ind]));
+      builder.append(op_ind, {op_ind, &op, lower_info, std::move(fn_seq)});
+    }
+  }
-  assert(cf_tensor_builder);
-  assert(cf_tensor_reg);
 
-  for (auto ind : indices)
+  auto code_map = builder.releaseCodeMap();
+
+  auto exec = new exec::LinearExecutor{std::move(lowered_graph),
+                                       std::move(backend_contexts),
+                                       tensor_regs,
+                                       std::move(code_map),
+                                       order,
+                                       tracing_ctx};
+
+  if (!options->trace_filepath.empty())
   {
-    const auto &operand = lowered_graph.graph().operands().at(ind);
-    auto tensor = std::make_shared<backend::controlflow::UserTensor>(
-      operand.info(),
-      ir::Layout::NHWC, /* FIXME find op_seq for this operand and use frontend_layout */
-      cf_tensor_builder->dynamicTensorManager());
-
-    // Add tensor to controlflow TensorRegistry.
-    cf_tensor_reg->setNativeUserTensor(ind, tensor);
-    ret.push_back(tensor);
+    std::unique_ptr<exec::IExecutionObserver> ctp =
+      std::make_unique<exec::TracingObserver>(options->trace_filepath, exec->graph(), tracing_ctx);
+    exec->addObserver(std::move(ctp));
   }
-  return ret;
-}
+#ifdef MINMAX_H5DUMPER
+  if (!options->minmax_filepath.empty())
+    exec->addObserver(std::make_unique<exec::MinMaxRecorder>(
+      options->minmax_filepath, exec->graph(), exec->getBackendContexts()));
+#endif
 
-void ExecutorFactory::prepareExternalTensors(compiler::LoweredGraph &lowered_graph)
-{
-  TensorRegistries tensor_regs{lowered_graph.backend_contexts(), true};
-
-  lowered_graph.op_seqs().iterate(
-    [&](const ir::OpSequenceIndex &op_seq_index, const ir::OpSequence &op_seq) {
-      auto lower_info = lowered_graph.getLowerInfo(op_seq_index);
-      auto &backend_ctx = lowered_graph.backend_contexts().at(lower_info->backend());
-      for (auto ind : (op_seq.getInputs() + op_seq.getOutputs()) | ir::Remove::DUPLICATED |
-                      ir::Remove::UNDEFINED)
-      {
-        // If an OpSequence input/output tensor does not have a own tensor object,
-        // it must be using external tensors, so find the tensor from other tensor builders and
-        // set the tensor to this tensor builder if portable
-        if (!backend_ctx->tensor_registry->getITensor(ind))
-        {
-          auto tensor = tensor_regs.getITensor(ind);
-          assert(tensor); // The tensor must have been registered
-          auto ptensor = std::dynamic_pointer_cast<backend::IPortableTensor>(tensor);
-          if (ptensor)
-            backend_ctx->tensor_registry->setMigrantTensor(ind, ptensor);
-        }
-      }
-    });
+  return exec;
 }
 
 exec::IExecutor *
-ExecutorFactory::createLinearExecutor(std::unique_ptr<compiler::LoweredGraph> lowered_graph,
-                                      const compiler::CompilerOptions &options,
-                                      const std::shared_ptr<exec::ExecutorMap> &executor_map)
+ExecutorFactory::createDataflowExecutor(std::unique_ptr<compiler::LoweredGraph> lowered_graph,
+                                        const std::shared_ptr<exec::IExecutors> &executors,
+                                        const ExecutorFactoryArgs &args, bool parallel)
 {
-  const auto &backend_contexts = lowered_graph->backend_contexts();
+  const auto options = args.options;
+  const auto &model_index = args.model_index;
+  const auto tracing_ctx = args.tracing_ctx;
+  auto custom_kernel_builder = args.custom_kernel_builder;
 
-  initializeBackendContext(lowered_graph.get());
+  backend::BackendContexts backend_contexts =
+    createBackendContexts(*lowered_graph, options->executor == "Linear", custom_kernel_builder);
 
-  // linearize
-  assert(!lowered_graph->graph().isBuildingPhase());
+  TensorRegistries tensor_regs{backend_contexts, true};
 
-  /*************************************************
-   * Backend dependent analysis & optimization phase
-   *************************************************/
+  initializeSubgraphIOTensors(
+    *lowered_graph, backend_contexts,
+    (lowered_graph->graph().getInputs() + lowered_graph->graph().getOutputs()) |
+      ir::Remove::DUPLICATED | ir::Remove::UNDEFINED);
 
-  for (auto &pair : backend_contexts)
+  for (auto &&pair : backend_contexts)
   {
-    auto &optimizer = pair.second->optimizer;
-    if (optimizer)
-      optimizer->optimize();
+    pair.second->genTensors();
   }
 
-  /**********************************************************
-   * Backend dependent analysis & optimization phase finished
-   **********************************************************/
+  prepareMigrantTensors(*lowered_graph, backend_contexts);
 
-  /***********************
-   * Code generation phase
-   ***********************/
+  // Give some runtime objects to builtin KernelGenerator
+  prepareBuiltinBackend(tensor_regs, executors, backend_contexts, model_index);
 
-  auto order = Linear::linearize(*lowered_graph);
-  runTensorRegistration(lowered_graph.get(), order);
+  ExecutionBuilder builder;
 
-  std::vector<std::shared_ptr<backend::ITensor>> input_tensors;
-  std::vector<std::shared_ptr<backend::ITensor>> output_tensors;
-  if (options.is_primary_subgraph)
+  // Adjust the order of backends for the upcoming iteration
+  auto ordered_contexts = orderBackendContext(backend_contexts);
+
+  // Generate kernels
+  for (auto &&pair : ordered_contexts)
   {
-    input_tensors = initializeModelIOTensors(*lowered_graph, lowered_graph->graph().getInputs());
-    output_tensors = initializeModelIOTensors(*lowered_graph, lowered_graph->graph().getOutputs());
+    auto codes = pair.second->genKernels();
+    for (auto &&pair : codes)
+    {
+      auto &op_ind = pair.first;
+      auto &fn_seq = pair.second;
+      auto &op = lowered_graph->graph().operations().at(op_ind);
+      auto lower_info = lowered_graph->lower_info().operation.getRawPtr(op_ind);
+      if (options->he_profiling_mode)
+        fn_seq->wrap<SyncFunction>(lower_info->backend()->config());
+      builder.append(op_ind, {op_ind, &op, lower_info, std::move(fn_seq)});
+    }
   }
 
-  Linear::dump(*lowered_graph, order);
-  Linear::planTensors(*lowered_graph, order);
+  auto code_map = builder.releaseCodeMap();
 
-  TensorBuilders tensor_builders{lowered_graph->backend_contexts(), true};
-  TensorRegistries tensor_regs{lowered_graph->backend_contexts(), true};
+  exec::ExecutorBase *exec = nullptr;
+  if (parallel)
+  {
+    exec = new exec::ParallelExecutor{std::move(lowered_graph), std::move(backend_contexts),
                                      tensor_regs, std::move(code_map), tracing_ctx};
+  }
+  else
+  {
+    auto dataflow_exec =
+      new exec::DataflowExecutor{std::move(lowered_graph), std::move(backend_contexts), tensor_regs,
                                 std::move(code_map), tracing_ctx};
+    if (options->he_profiling_mode)
+    {
+      std::vector<const backend::Backend *> backends;
+      for (const auto &pair : backend_contexts)
+      {
+        backends.push_back(pair.first);
+      }
+      auto et = std::make_shared<exec::ExecTime>(backends);
+      std::unique_ptr<exec::IExecutionObserver> obs =
+        std::make_unique<exec::ProfileObserver>(et, dataflow_exec->graph());
+      dataflow_exec->addObserver(std::move(obs));
+    }
+    exec = dataflow_exec;
+  }
 
-  for (auto &tensor_builder : tensor_builders)
+  if (!options->trace_filepath.empty())
   {
-    tensor_builder->prepare();
+    std::unique_ptr<exec::IExecutionObserver> ctp =
+      std::make_unique<exec::TracingObserver>(options->trace_filepath, exec->graph(), tracing_ctx);
+    exec->addObserver(std::move(ctp));
   }
 
-  prepareExternalTensors(*lowered_graph);
+  return exec;
+}
 
-  ExecutionBuilder builder;
+#ifdef ONERT_TRAIN
+exec::IExecutor *
+ExecutorFactory::create(std::unique_ptr<compiler::train::LoweredTrainableGraph> lowered_graph,
+                        const std::shared_ptr<exec::IExecutors> &executors,
+                        const ExecutorFactoryArgs &args,
+                        const std::shared_ptr<exec::train::optimizer::Optimizer> &optimizer)
+{
+  assert(args.options != nullptr);
 
-  // Generate kernels
-  lowered_graph->iterateTopolOpSeqs([&](const ir::OpSequenceIndex &op_seq_index,
-                                        const ir::OpSequence &op_seq) {
-    auto lower_info = lowered_graph->getLowerInfo(op_seq_index);
-    auto kernel_gen = lowered_graph->backend_contexts().at(lower_info->backend())->kernel_gen;
-    // Set TensorBuilderSet and ExecutorMap to kernel_gen of control flow
-    auto cf_kernel_gen = dynamic_cast<backend::controlflow::KernelGenerator *>(kernel_gen.get());
-    if (cf_kernel_gen != nullptr)
+  if (args.options->executor != "Linear")
+    throw std::runtime_error("ExecutorFactory: TrainableExecutor supports only 'Linear' now");
+
+  return createTrainableExecutor(std::move(lowered_graph), executors, args, optimizer);
+}
+
+void ExecutorFactory::prepareMigrantTensors(
+  compiler::ILoweredGraph &lowered_graph,
+  const backend::train::TrainableBackendContexts &backend_contexts)
+{
+  train::TensorRegistries tensor_regs{backend_contexts, true};
+
+  lowered_graph.graph().operations().iterate(
+    [&](const ir::OperationIndex &op_ind, const ir::IOperation &op) {
+      auto lower_info = lowered_graph.lower_info().operation.getRawPtr(op_ind);
+      auto &backend_ctx = backend_contexts.at(lower_info->backend());
+      for (auto &&ind :
+           (op.getInputs() + op.getOutputs()) | ir::Remove::DUPLICATED | ir::Remove::UNDEFINED)
+      {
+        // If an Operation's input/output tensor does not have its own tensor object,
+        // it must be using migrant tensors, so find the tensor from other tensor registries and
+        // register it to the current tensor registry if it is portable
+        if (!backend_ctx->tensor_registry()->getITensor(ind))
+        {
+          auto tensor = tensor_regs.getITensor(ind);
+          assert(tensor); // The tensor must have been registered
+          auto ptensor = dynamic_cast<backend::IPortableTensor *>(tensor);
+          if (ptensor)
+            backend_ctx->tensor_registry()->setMigrantTensor(ind, ptensor);
+        }
+      }
+    });
+}
+
+exec::IExecutor *ExecutorFactory::createTrainableExecutor(
+  std::unique_ptr<compiler::train::LoweredTrainableGraph> lowered_graph,
+  const std::shared_ptr<exec::IExecutors> &, const ExecutorFactoryArgs &args,
+  const std::shared_ptr<exec::train::optimizer::Optimizer> &optimizer)
+{
+  const auto options = args.options;
+  const auto tracing_ctx = args.tracing_ctx;
+  auto custom_kernel_builder = args.custom_kernel_builder;
+
+  auto &graph = lowered_graph->graph();
+
+  lowered_graph->trainable_graph().operations().iterate([](const onert::ir::OperationIndex &,
+                                                           const onert::ir::IOperation &op) {
+    try
     {
-      cf_kernel_gen->setTensorRegistries(tensor_regs);
-      cf_kernel_gen->setExecutorMap(executor_map);
+      UNUSED_RELEASE(dynamic_cast<const ir::train::ITrainableOperation &>(op));
     }
-    auto fn_seq = kernel_gen->generate(op_seq);
-    if (options.he_profiling_mode)
+    catch (std::bad_cast &)
     {
-      fn_seq->wrap<SyncFunction>(lower_info->backend()->config());
+      throw std::runtime_error("ExecutorFactory: " + op.name() + " is not a trainable operation yet");
     }
-    builder.append(op_seq_index, {&op_seq, lower_info, std::move(fn_seq)});
   });
 
-  for (auto &tensor_builder : tensor_builders)
-  {
-    tensor_builder->allocate();
-  }
+  // TODO Create context only once instead of replacing
+  backend::train::TrainableBackendContexts tbackend_contexts;
+  backend::BackendContexts base_backend_contexts =
+    createBackendContexts(*lowered_graph, true, custom_kernel_builder);
 
-  for (auto &pair : backend_contexts)
+  // Replace BackendContext with TrainableBackendContext
+  for (auto &&pair : base_backend_contexts)
   {
-    pair.second->initConsts();
-  }
-
-  lowered_graph->graph().operands().iterate(
-    [](const ir::OperandIndex &, ir::Operand &obj) { obj.releaseData(); });
-
-  auto code_map = builder.releaseCodeMap();
-
-  for (auto &it : code_map)
-  {
-    auto op_seq_index = it.first;
-    auto &fn_seq = it.second.fn_seq;
-
-    fn_seq->iterate([&](exec::IFunction &ifunc) {
-      ifunc.prepare();
-      auto backend = lowered_graph->getLowerInfo(op_seq_index)->backend();
-      auto tensor_builder = lowered_graph->backend_contexts().at(backend)->tensor_builder;
-      tensor_builder->postFunctionPrepare();
+    auto ctx = pair.second.get();
+    const auto &data = ctx->data();
+
+    // Create partial and trainable graphs
+    auto tgraph = std::make_unique<ir::train::TrainableGraph>(*data.graph);
+    data.graph->operations().iterate(
+      [&](const onert::ir::OperationIndex &op_index, const onert::ir::IOperation &) {
+        const auto &orig_tgraph = lowered_graph->trainable_graph();
+        const auto &trainable_op = orig_tgraph.operation(op_index);
+        auto gen_index = tgraph->replaceOperation(op_index, trainable_op.clone());
+        UNUSED_RELEASE(gen_index);
+        assert(gen_index == op_index);
+      });
+    data.graph->operands().iterate([&](const ir::OperandIndex &index, const ir::Operand &) {
+      const auto &orig_tgraph = lowered_graph->trainable_graph();
+      if (orig_tgraph.derivatives().exist(index))
+      {
+        const auto &deriv = orig_tgraph.derivatives().at(index);
+        auto new_deriv = std::make_unique<ir::Operand>(deriv);
+        auto gen_index = tgraph->addDerivative(index, std::move(new_deriv));
+        UNUSED_RELEASE(gen_index);
+        assert(gen_index == index);
+      }
     });
-  }
 
-  backend::TensorManagerSet tensor_mgrs = createTensorManagerSet(tensor_builders);
-  auto exec = new exec::LinearExecutor{
-    std::move(lowered_graph), input_tensors, output_tensors, tensor_regs,
-    std::move(tensor_mgrs),   std::move(code_map), order};
+    // Remove outputs of whole graph from external_operands
+    auto external_operands = data.external_operands;
+    for (const auto &index : lowered_graph->trainable_graph().getOutputs())
+    {
+      if (external_operands.contains(index))
+        external_operands.remove(index);
+    }
 
-  if (!options.trace_filepath.empty())
-  {
-    std::unique_ptr<exec::IExecutionObserver> ctp =
-      std::make_unique<exec::ChromeTracingObserver>(options.trace_filepath, exec->graph());
-    exec->addObserver(std::move(ctp));
+    // Set trainable context data
+    backend::train::TrainableContextData tdata;
+    tdata.tgraph = std::move(tgraph);
+    tdata.op_order = std::move(data.op_order);
+    tdata.external_operands = std::move(external_operands);
+    tdata.operand_layouts = std::move(data.operand_layouts);
+    tdata.custom_kernel_builder = std::move(data.custom_kernel_builder);
+    tdata.is_linear_executor = data.is_linear_executor;
+    tdata.optimizer = optimizer;
+
+    // TODO Remove dynamic_cast
+    try
+    {
+      const auto backend = pair.first;
+      const auto tbackend = dynamic_cast<const backend::train::ITrainableBackend *>(backend);
+      tbackend_contexts.emplace(backend, tbackend->newContext(std::move(tdata)));
+    }
+    catch (const std::bad_cast &)
+    {
+      throw std::runtime_error("ExecutorFactory: Invalid backend - TrainableExecutor does not "
+                               "support non-trainable backends");
+    }
   }
+  base_backend_contexts.clear();
 
-  return exec;
-}
-
-exec::IExecutor *ExecutorFactory::createDataflowExecutor(
-  std::unique_ptr<compiler::LoweredGraph> lowered_graph, const compiler::CompilerOptions &options,
-  const std::shared_ptr<exec::ExecutorMap> &executor_map, bool parallel)
-{
-  const auto &backend_contexts = lowered_graph->backend_contexts();
+  train::TensorRegistries tensor_regs{tbackend_contexts, true};
 
-  initializeBackendContext(lowered_graph.get());
+  initializeSubgraphIOTensors(
+    *lowered_graph, tbackend_contexts,
+    (lowered_graph->graph().getInputs() + lowered_graph->graph().getOutputs()) |
+      ir::Remove::DUPLICATED | ir::Remove::UNDEFINED);
 
+  // linearize
   auto order = Linear::linearize(*lowered_graph);
-  runTensorRegistration(lowered_graph.get(), order);
+  Linear::dump(*lowered_graph, order);
 
-  std::vector<std::shared_ptr<backend::ITensor>> input_tensors;
-  std::vector<std::shared_ptr<backend::ITensor>> output_tensors;
-  if (options.is_primary_subgraph)
+  for (auto &&pair : tbackend_contexts)
   {
-    input_tensors = initializeModelIOTensors(*lowered_graph, lowered_graph->graph().getInputs());
-    output_tensors = initializeModelIOTensors(*lowered_graph, lowered_graph->graph().getOutputs());
+    pair.second->genTensors();
   }
 
-  TensorBuilders tensor_builders{lowered_graph->backend_contexts(), true};
-  TensorRegistries tensor_regs{lowered_graph->backend_contexts(), true};
-
-  // To make tensors never be deallocated, this is a workaround to use static memory planner
-  for (auto &tensor_builder : tensor_builders)
+  for (auto &&pair : tbackend_contexts)
   {
-    lowered_graph->graph().operands().iterate(
-      [&](const ir::OperandIndex &ind, const ir::Operand &) {
-        if (tensor_builder->isRegistered(ind))
-        {
-          tensor_builder->notifyFirstUse(ind);
-        }
-      });
+    auto tctx = pair.second.get();
+    tctx->genTrainingTensors();
   }
 
-  for (auto &tensor_builder : tensor_builders)
+  prepareMigrantTensors(*lowered_graph, tbackend_contexts);
+
+  // Give some runtime objects to builtin KernelGenerator
+  for (auto &&pair : tbackend_contexts)
   {
-    tensor_builder->prepare();
+    auto builtin_context =
+      dynamic_cast<backend::builtin::train::BackendContext *>(pair.second.get());
+    if (builtin_context != nullptr)
+    {
+      auto builtin_kernel_gen = builtin_context->kernel_gen;
+      builtin_kernel_gen->setTensorRegistries(tensor_regs);
+      builtin_kernel_gen->setWholeGraphOutputs(lowered_graph->trainable_graph().getOutputs());
+    }
   }
 
-  prepareExternalTensors(*lowered_graph);
+  // Adjust the order of backends for the upcoming iteration
+  auto ordered_contexts =
+    onert::orderBackendContext<backend::train::TrainableBackendContext>(tbackend_contexts);
 
-  ExecutionBuilder builder;
+  // TODO Remove this simulation
+  // Simulate the execution for deallocation of tensors
+  std::unordered_map<ir::OperationIndex, DeallocList> dealloc_list_map;
+  {
+    ir::OperandIndexMap<uint32_t> uses_map;
+    ir::OperandIndexSequence constants;
 
-  // Generate kernels
-  lowered_graph->iterateTopolOpSeqs([&](const ir::OpSequenceIndex &op_seq_index,
-                                        const ir::OpSequence &op_seq) {
-    auto lower_info = lowered_graph->getLowerInfo(op_seq_index);
-    auto kernel_gen = lowered_graph->backend_contexts().at(lower_info->backend())->kernel_gen;
-    // Set TensorBuilderSet and ExecutorMap to kernel_gen of control flow
-    auto cf_kernel_gen = dynamic_cast<backend::controlflow::KernelGenerator *>(kernel_gen.get());
-    if (cf_kernel_gen != nullptr)
+    auto model_io =
+      (graph.getInputs() + graph.getOutputs()) | ir::Remove::UNDEFINED | ir::Remove::DUPLICATED;
+
+    // Prepare scanning
+    graph.operands().iterate([&](const ir::OperandIndex &ind, const ir::Operand &obj) {
+      uses_map[ind] = obj.getUses().size();
+
+      if (obj.isConstant())
+        constants.append(ind);
+    });
+
+    // A trick to consider constants as an exception
+    for (const auto &ind : constants)
     {
-      assert(cf_kernel_gen != nullptr);
-      cf_kernel_gen->setTensorRegistries(tensor_regs);
-      cf_kernel_gen->setExecutorMap(executor_map);
+      uses_map[ind]++;
     }
-    auto fn_seq = kernel_gen->generate(op_seq);
-    if (options.he_profiling_mode)
+
+    for (const auto op_ind : order)
     {
-      fn_seq->wrap<SyncFunction>(lower_info->backend()->config());
+      const auto &op = graph.operations().at(op_ind);
+      auto op_inputs = op.getInputs() | ir::Remove::DUPLICATED | ir::Remove::UNDEFINED;
+      auto op_outputs = op.getOutputs() | ir::Remove::DUPLICATED | ir::Remove::UNDEFINED;
+
+      for (const auto &ind : op_inputs)
+      {
+        const auto &operand = graph.operands().at(ind);
+        assert(uses_map.find(ind) != uses_map.end());
+        assert(uses_map[ind] > 0);
+        uses_map[ind]--;
+        if (uses_map[ind] == 0 && !operand.info().isVariable() && !model_io.contains(ind))
+        {
+          dealloc_list_map[op_ind].emplace_back(tensor_regs.getITensor(ind));
+        }
+      }
     }
-    builder.append(op_seq_index, {&op_seq, lower_info, std::move(fn_seq)});
-  });
 
-  for (const auto &tensor_builder : tensor_builders)
-  {
-    tensor_builder->allocate();
-  }
+    // Dispose and validate
+    for (const auto &ind : constants)
+    {
+      --uses_map[ind];
+    }
 
-  for (auto &pair : backend_contexts)
-  {
-    pair.second->initConsts();
+    assert(
+      std::all_of(uses_map.begin(), uses_map.end(),
+                  [](std::pair<const ir::OperandIndex, uint32_t> it) { return it.second == 0; }));
   }
 
-  lowered_graph->graph().operands().iterate(
-    [](const ir::OperandIndex &, ir::Operand &obj) { obj.releaseData(); });
-
-  auto code_map = builder.releaseCodeMap();
-
-  for (auto &it : code_map)
+  // Check derivative tensors
   {
-    auto op_seq_index = it.first;
-    auto &fn_seq = it.second.fn_seq;
-
-    fn_seq->iterate([&](exec::IFunction &ifunc) {
-      ifunc.prepare();
-      auto backend = lowered_graph->getLowerInfo(op_seq_index)->backend();
-      auto tensor_builder = lowered_graph->backend_contexts().at(backend)->tensor_builder;
-      tensor_builder->postFunctionPrepare();
-    });
+    // TODO Support multiple subgraphs
+    // Check if the derivative tensors corresponding to inputs of model are nullptr
+    // NOTE The derivative tensors corresponding to inputs of model are for inputs of PermuteLayers
+    //      and they are nullptr because they are meaningless.
+    assert(std::all_of(lowered_graph->trainable_graph().getInputs().begin(),
+                       lowered_graph->trainable_graph().getInputs().end(),
+                       [&](const auto &input_idx) {
+                         return tensor_regs.getDerivativeITensor(input_idx) == nullptr;
+                       }));
+
+    // Check if the derivative tensors corresponding to outputs of model exist
+    assert(std::all_of(lowered_graph->trainable_graph().getOutputs().begin(),
+                       lowered_graph->trainable_graph().getOutputs().end(),
+                       [&](const auto &output_idx) {
+                         return tensor_regs.getDerivativeITensor(output_idx) != nullptr;
+                       }));
   }
 
-  backend::TensorManagerSet tensor_mgrs = createTensorManagerSet(tensor_builders);
-
-  exec::ExecutorBase *exec = nullptr;
-  if (parallel)
-  {
-    exec = new exec::ParallelExecutor{std::move(lowered_graph), input_tensors,
-                                      output_tensors, tensor_regs,
-                                      std::move(tensor_mgrs), std::move(code_map)};
-  }
-  else
+  train::TrainableCodeMap code_map;
+  // Generate kernels
+  for (auto &&pair : ordered_contexts)
   {
-    auto dataflow_exec = new exec::DataflowExecutor{std::move(lowered_graph), input_tensors,
-                                                    output_tensors, tensor_regs,
-                                                    std::move(tensor_mgrs), std::move(code_map)};
-    if (options.he_profiling_mode)
+    auto codes = pair.second->genKernels();
+    for (auto &&pair : codes)
     {
-      std::vector<const backend::Backend *> backends;
-      for (const auto &pair : backend_contexts)
-      {
-        backends.push_back(pair.first);
-      }
-      auto et = std::make_shared<exec::ExecTime>(backends);
-      std::unique_ptr<exec::IExecutionObserver> obs =
-        std::make_unique<exec::ProfileObserver>(et, dataflow_exec->graph());
-      dataflow_exec->addObserver(std::move(obs));
+      auto &op_ind = pair.first;
+      auto &tn_seq = pair.second;
+      auto &op = lowered_graph->trainable_graph().operation(op_ind);
+      auto lower_info = lowered_graph->lower_info().operation.getRawPtr(op_ind);
+
+      assert(code_map.find(op_ind) == code_map.end());
+      code_map.insert(
+        {op_ind, train::TrainableCodeAndInfo{op_ind, &op, lower_info, std::move(tn_seq)}});
     }
-    exec = dataflow_exec;
   }
 
-  if (!options.trace_filepath.empty())
+  if (order.size() != code_map.size())
+  {
+    throw std::runtime_error("ExecutorFactory: Some kernels are not generated");
+  }
+
+  auto exec = new exec::train::TrainableExecutor{std::move(lowered_graph),
+                                                 std::move(tbackend_contexts),
+                                                 tensor_regs,
+                                                 std::move(code_map),
+                                                 order,
+                                                 tracing_ctx};
+
+  if (!options->trace_filepath.empty())
   {
     std::unique_ptr<exec::IExecutionObserver> ctp =
-      std::make_unique<exec::ChromeTracingObserver>(options.trace_filepath, exec->graph());
+      std::make_unique<exec::TracingObserver>(options->trace_filepath, exec->graph(), tracing_ctx);
     exec->addObserver(std::move(ctp));
   }
+  // TODO Support MINMAX_H5DUMPER
 
   return exec;
 }
+#endif // ONERT_TRAIN
 
 } // namespace compiler
 } // namespace onert
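
The new createBackendContexts helper replaces the old per-OpSequence registration: it splits the lowered whole graph into one partial ir::Graph per backend, assigns each operand to the backend that defines it, copies each operation into its assigned backend's partial graph, and records any operand that is referenced but defined elsewhere as an external operand. The self-contained sketch below reproduces only that bookkeeping on a toy two-backend graph; all names and types here (ToyOp, def_backend, owned, external) are simplified stand-ins for illustration, not onert's real API.

#include <iostream>
#include <map>
#include <set>
#include <string>
#include <vector>

// Simplified stand-in for a lowered operation (illustrative only).
struct ToyOp
{
  std::string backend;     // backend assigned by lowering
  std::vector<int> inputs; // operand indices read by the op
};

int main()
{
  // op0 runs on "cpu" (reads operand 0); op1 runs on "acl_cl" (reads operand 1).
  std::vector<ToyOp> ops = {{"cpu", {0}}, {"acl_cl", {1}}};
  // Each operand belongs to the backend whose operation defines it.
  std::map<int, std::string> def_backend = {{0, "cpu"}, {1, "cpu"}, {2, "acl_cl"}};

  std::map<std::string, std::set<int>> owned;    // operands placed in each partial graph
  std::map<std::string, std::set<int>> external; // operands used but defined elsewhere

  for (auto &&[ind, backend] : def_backend)
    owned[backend].insert(ind);

  for (auto &&op : ops)
    for (int ind : op.inputs)
      if (owned[op.backend].count(ind) == 0)
        external[op.backend].insert(ind); // copied into the partial graph and marked external

  for (auto &&[backend, ext] : external)
    for (int ind : ext)
      std::cout << backend << " imports operand " << ind << "\n";
  // Prints: acl_cl imports operand 1
}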
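Both orderBackendContext helpers (the free template and the ExecutorFactory member) rely on the same deque trick: iterate the context map once, emplace_front for every ordinary backend and emplace_back for "builtin", so builtin's Permute kernels are generated only after every other backend's tensors exist. A minimal illustration of that ordering, where backend names stand in for the real context objects:

#include <deque>
#include <iostream>
#include <string>
#include <unordered_map>

int main()
{
  // Backend name -> context (an int stands in for the real context object).
  std::unordered_map<std::string, int> contexts = {{"cpu", 0}, {"builtin", 1}, {"acl_cl", 2}};

  std::deque<std::string> ordered;
  for (auto &&pair : contexts)
  {
    if (pair.first == "builtin")
      ordered.emplace_back(pair.first);  // builtin is appended, so it ends up last
    else
      ordered.emplace_front(pair.first); // every other backend is pushed to the front
  }

  for (auto &&name : ordered)
    std::cout << name << "\n"; // "builtin" is always printed last
}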
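The "simulate the execution" blocks in createLinearExecutor and createTrainableExecutor precompute, per operation, which non-I/O, non-variable tensors have no remaining uses once that operation has run; DeallocFunction is then appended to the operation's function sequence to free those buffers if they are dynamic. The sketch below shows the same use-count walk in isolation; the types (Op, the index aliases, the maps) are simplified stand-ins, not onert's real classes.

#include <cassert>
#include <cstdio>
#include <unordered_map>
#include <vector>

using OperandIndex = int;   // stands in for ir::OperandIndex
using OperationIndex = int; // stands in for ir::OperationIndex

struct Op
{
  std::vector<OperandIndex> inputs;
};

int main()
{
  // Operation 0 reads operand 0; operation 1 reads operands 0 and 1.
  std::vector<Op> order = {{{0}}, {{0, 1}}};
  // Use counts gathered up front, like uses_map in the diff.
  std::unordered_map<OperandIndex, int> uses = {{0, 2}, {1, 1}};

  // Per-operation list of operands that die right after the operation runs.
  std::unordered_map<OperationIndex, std::vector<OperandIndex>> dealloc_list_map;
  for (OperationIndex op_ind = 0; op_ind < static_cast<OperationIndex>(order.size()); ++op_ind)
  {
    for (auto ind : order[op_ind].inputs)
    {
      assert(uses[ind] > 0);
      if (--uses[ind] == 0)
        dealloc_list_map[op_ind].push_back(ind); // schedule deallocation here
    }
  }

  for (auto &&[op_ind, dead] : dealloc_list_map)
    for (auto ind : dead)
      std::printf("after operation %d: deallocate operand %d\n", op_ind, ind);
  // Both operands are freed after operation 1, their last reader.
}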
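ExecutorFactory::create still dispatches through a name-keyed table (_map.at(args.options->executor)), now passing IExecutors plus the consolidated ExecutorFactoryArgs instead of bare CompilerOptions and an ExecutorMap. A generic sketch of that dispatch-table pattern, with illustrative names (factory_map and the returned strings are placeholders, not onert's signatures):

#include <functional>
#include <iostream>
#include <stdexcept>
#include <string>
#include <unordered_map>

int main()
{
  // Executor name -> creation routine, mirroring ExecutorFactory's _map.
  std::unordered_map<std::string, std::function<std::string()>> factory_map = {
    {"Linear", [] { return std::string("LinearExecutor"); }},
    {"Dataflow", [] { return std::string("DataflowExecutor"); }},
    {"Parallel", [] { return std::string("ParallelExecutor"); }},
  };

  // at(...) throws std::out_of_range for an unknown executor name.
  std::cout << factory_map.at("Linear")() << "\n";
}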